Exemple #1
0
def getHTML():
    """
    Fetch and store the HTML of every URL listed in query_duplications.

    For each document in the ``query_duplications`` collection whose ``url``
    is not already present in ``query_articles``, the page is downloaded,
    decoded as UTF-8, parsed with BeautifulSoup (html5lib) and inserted into
    ``query_articles`` with the keys ``url``, ``raw_html``, ``head`` and
    ``body``.

    A failure while fetching, parsing or storing one URL is reported on
    stdout and does not stop the remaining URLs from being processed.
    """
    db = sets.getDB()

    for item in db.query_duplications.find():
        url = item['url']

        # find_one returns None when no document matches, so anything else
        # means the article is already stored and can be skipped.
        if db.query_articles.find_one({"url": url}) is not None:
            continue

        try:
            # Get raw html; the pool is closed as soon as the page is read.
            with urllib3.connection_from_url(url) as http_pool:
                response = http_pool.urlopen('GET', url)
                raw_html = response.data.decode('utf-8')

            # Get soup
            soup = BeautifulSoup(raw_html, 'html5lib')
            head = soup.find('head')
            body = soup.find('body')

            db.query_articles.insert_one({
                "url": url,
                "raw_html": raw_html,
                # BeautifulSoup Tag objects cannot be BSON-encoded, so the
                # markup of each section is stored as text instead.
                "head": None if head is None else str(head),
                "body": None if body is None else str(body),
            })
        except Exception:
            # Best effort: report the exception type and move to the next url.
            print("ERROR: ", sys.exc_info()[0])

    print("Finished retrieving HTML")
Exemple #2
0
def optionUpdateIndicators():
    """
    Menu option 9: reload the indicators from the indicator files.

    Prints the instructions, waits for the user to hit enter, and then calls
    ``sets.loadIndicators`` once per indicator file (topic, reasoning, blog,
    experience) using the database handle returned by ``sets.getDB()``.
    """
    separator = "######################################"

    # (name, code, file path) passed to sets.loadIndicators, in load order.
    indicator_sources = (
        ("Topic", "TOP", "./config/Indicators/topic.txt"),
        ("Reasoning", "RES", "./config/Indicators/reasoning.txt"),
        ("Blog", "BLG", "./config/Indicators/blog.txt"),
        ("Experience", "EXP", "./config/Indicators/experience.txt"),
    )

    print("OPTION 9: Update the indicators")
    print(separator)
    print(
        "If you are unsure of any setting, consult the application documentation"
    )
    print()
    print(separator)
    print()
    print(
        "Indicators will be updated with the values of the indicator files in /config/indicators. Update these files with desired indicators before continuing."
    )
    input("Hit enter to continue once you have updated the indicator files")

    print("Updating indicators...")
    database = sets.getDB()
    for name, code, path in indicator_sources:
        sets.loadIndicators(database, name, code, path)

    print("Indicators updated")
Exemple #3
0
def calculateDuplications():
    """
    Drop and repopulate the query_duplications collection.

    Every document in ``query_archive`` is read; for each entry of its
    ``results`` list the url is reduced to the part before the first ``?``
    and tallied. One document per distinct base url is then inserted into
    ``query_duplications`` with the keys:

    - ``url``: the base url (query parameters removed)
    - ``frequency``: how many archived results had that base url
    - ``query_list``: distinct query strings that returned it, first-seen order
    - ``date_list``: distinct timestamp prefixes (first 13 characters)
      of the archive entries that returned it, first-seen order
    """
    db = sets.getDB()

    db.query_duplications.drop()

    # Keyed by base url so each result is matched with a single lookup
    # instead of scanning every record seen so far. Dicts keep insertion
    # order, so documents are still written in first-seen order.
    duplications = {}

    for res in db.query_archive.find():
        query_string = res['query_string']
        # e.g. "2017-08-21 16:34:31.622082" -> "2017-08-21 16"
        date = res['timestamp'][0:13]

        for r in res['results']:
            # Ignore query parameters
            base_url = r['url'].split('?')[0]

            record = duplications.get(base_url)
            if record is None:
                duplications[base_url] = {
                    "url": base_url,
                    "frequency": 1,
                    "query_list": [query_string],
                    "date_list": [date],
                }
                continue

            record['frequency'] += 1
            if query_string not in record['query_list']:
                record['query_list'].append(query_string)
            if date not in record['date_list']:
                record['date_list'].append(date)

    for record in duplications.values():
        db.query_duplications.insert_one(record)
Exemple #4
0
# Day, month and year (19xx or 20xx) separated by '-', ' ', '/' or '.'.
_DATE_REGEX = r'(0[1-9]|[12][0-9]|3[01])[- /.](0[1-9]|1[012])[- /.](19|20)\d\d'


def _isValidDate(value):
    """Return True if the whole of value matches _DATE_REGEX."""
    # fullmatch, unlike match with a trailing '$', also rejects a value
    # that ends with a newline.
    return re.fullmatch(_DATE_REGEX, value) is not None


def validateOptionalArgs(indicators, start_date, end_date, number_of_results):
    """
    Return True if all optional arguments are valid, otherwise False.

    Checks, in order:

    - indicators: either "off" or an iterable of codes, each of which must
      equal the ``code`` of a document in the ``indicators`` collection
    - start_date / end_date: either "off" or a day-month-year date such as
      "21-08-2017" (separators '-', ' ', '/' or '.', years 19xx/20xx)
    - number_of_results: must be an int and a multiple of ten
    """
    if indicators != "off":
        # check indicators contains valid codes
        db = sets.getDB()
        db_codes = {ind_type['code'] for ind_type in db.indicators.find()}

        if any(code not in db_codes for code in indicators):
            return False

    # check start date is valid
    if start_date != "off" and not _isValidDate(start_date):
        return False

    # check end date is valid
    if end_date != "off" and not _isValidDate(end_date):
        return False

    # check number of results is a valid int; the exact type comparison
    # also rejects bool and other int subclasses, as well as numeric strings
    if type(number_of_results) is not int:
        return False

    # check number of results is multiple of ten
    return number_of_results % 10 == 0
Exemple #5
0
def query(query_string, indicators, start_date, end_date, number_of_results):
    """
    Query the api, store the results and export them.

    Steps:

    1. Normalise and validate the optional arguments
       (``ql.validateOptionalArgs``); on invalid input an error is printed
       and the process exits with status 1.
    2. Cap ``number_of_results`` at 100 (with a warning).
    3. Build the query or queries with ``ql.generateQuery`` and show them
       to the user for confirmation.
    4. Call ``ql.queryAPI`` for every query, ``sets.getNumberOfRuns()`` times.
    5. Replace the contents of ``query_results`` with the new results and
       append them to ``query_archive``.
    6. Recalculate duplications and export the results as CSV.

    Parameters:
        query_string: the base search string
        indicators: 'off', or a comma separated string of indicator codes
        start_date, end_date: 'off' or a date string (see
            ``ql.validateOptionalArgs`` for the accepted format)
        number_of_results: int, or a value ``int()`` can convert
    """
    if indicators != 'off':
        # convert indicators to a list
        indicators = indicators.split(',')

    try:
        number_of_results = int(number_of_results)
    except (TypeError, ValueError, OverflowError):
        # ignore, it will be picked up by validateOptionalArgs
        pass

    # check optional args are valid
    validArgs = ql.validateOptionalArgs(indicators, start_date, end_date,
                                        number_of_results)

    if not validArgs:
        print(
            "ERROR: one or more of the optional arguments are invalid, please review the documentation and try again"
        )
        # non-zero status so callers and shells can detect the failure
        sys.exit(1)

    # warn about number of results
    if number_of_results > 100:
        number_of_results = 100
        print(
            "WARNING: number of results can't be greater than 100, it has been set to 100 for you."
        )

    # get config
    db = sets.getDB()
    query_mode = sets.getQueryMode()
    number_of_runs = sets.getNumberOfRuns()

    # generate queries
    # SINGLE query mode returns a string
    # MULTI query mode returns a list of strings
    generated_query = ql.generateQuery(query_string, indicators, query_mode)

    # NOTE(review): this confirmation prompt looks like a debugging aid;
    # confirm with the owner before removing it.
    print(generated_query)
    print(len(generated_query))
    input("All good?")

    # Treat the single-string case as a one-element list so that both query
    # modes share the same API loop.
    if query_mode == 'single' or indicators == 'off':
        queries = [generated_query]
    else:
        queries = generated_query

    # query the API: every query is sent once per run
    results = [
        ql.queryAPI(single_query, start_date, end_date, number_of_results)
        for _ in range(number_of_runs)
        for single_query in queries
    ]

    # store the results in the database
    db.query_results.drop()

    # results is a nested list with one entry per API call;
    # each nested list is a list of json results from 0 to num_of_results
    for result_portion in results:
        for r in result_portion:
            db.query_results.insert_one(r)
            db.query_archive.insert_one(r)

    # populate duplications
    ql.calculateDuplications()

    print("Query complete, exporting db.results...")
    sets.exportResultsCSV()

    print("End of query")
Exemple #6
0
def _indicatorWordGroups(db, indicators):
    """Return the ``words`` list of each indicator code, in the given order."""
    return [
        db.indicators.find_one({"code": ind_code})['words']
        for ind_code in indicators
    ]


def generateQuery(query_string, indicators, query_mode):
    """
    Generate query(s) from base string and indicators.

    - indicators == "off": returns ``query_string`` unchanged.
    - SINGLE query mode returns a string: the base string followed by one
      `` AND ('w1' OR 'w2' ...)`` group per indicator code.
    - MULTI query mode (any other mode) returns a list of strings: the base
      string on its own, then one query per combination that takes one word
      from each indicator, each word appended as `` AND "word"``.

    Parameters:
        query_string: the base search string
        indicators: "off", or a list of indicator codes whose documents in
            the ``indicators`` collection carry a ``words`` list of strings
        query_mode: 'single' for SINGLE mode; anything else is MULTI mode
    """
    # if indicators off, just return the query (no database access needed)
    if indicators == "off":
        return query_string

    db = sets.getDB()

    # nested list of indicator words, grouped by indicator type
    word_groups = _indicatorWordGroups(db, indicators)

    if query_mode == 'single':
        # query mode is SINGLE, create a single string
        # NOTE(review): MULTI mode quotes phrases with double quotes while
        # this mode uses single quotes — confirm which the API expects.
        generated_query_string = query_string
        for words in word_groups:
            alternatives = " OR ".join("'" + word + "'" for word in words)
            generated_query_string += " AND (" + alternatives + ")"

        return generated_query_string

    # query mode is MULTI, create a list of queries

    # have one query that is just the query string
    query_list = [query_string]

    # one query per combination of one word from every indicator type
    for combo in itertools.product(*word_groups):
        suffix = "".join(' AND "' + key_phrase + '"' for key_phrase in combo)
        query_list.append(query_string + suffix)

    return query_list