def getHTML():
    """
    For each of the urls in query_duplications, get the HTML and store it in
    the query_articles collection.

    URLs that already have a document in query_articles are skipped. A
    failure on one URL (network error, decode error, database error) is
    reported on stdout and does not stop the remaining URLs from being
    processed.
    """
    db = sets.getDB()
    for item in db.query_duplications.find():
        url = item['url']

        # find_one returns the matching document or None, so a non-None
        # result means the article has already been stored.
        if db.query_articles.find_one({"url": url}) is not None:
            continue

        try:
            # Get raw html
            http_pool = urllib3.connection_from_url(url)
            r = http_pool.urlopen('GET', url)
            raw_html = r.data.decode('utf-8')

            # Get soup
            soup = BeautifulSoup(raw_html, 'html5lib')
            head = soup.find('head')
            body = soup.find('body')

            db.query_articles.insert_one({
                "url": url,
                "raw_html": raw_html,
                # BeautifulSoup Tag objects cannot be BSON-encoded, so the
                # markup of each section is stored as text instead.
                "head": None if head is None else str(head),
                "body": None if body is None else str(body),
            })
        except Exception:
            # Best effort per URL: report the error type and carry on.
            # Exception (not a bare except) lets Ctrl-C still abort the run.
            print("ERROR: ", sys.exc_info()[0])

    print("Finished retrieving HTML")
def optionUpdateIndicators():
    """
    Menu option 9: ask the user to edit the indicator files, wait for
    confirmation, then reload every indicator file into the database.
    """
    banner = "######################################"

    print("OPTION 9: Update the indicators")
    print(banner)
    print(
        "If you are unsure of any setting, consult the application documentation"
    )
    print()
    print(banner)
    print()
    print(
        "Indicators will be updated with the values of the indicator files in /config/indicators. Update these files with desired indicators before continuing."
    )

    # Block until the user confirms the files are ready.
    input("Hit enter to continue once you have updated the indicator files")
    print("Updating indicators...")

    # (name, code, file path) for each indicator type, loaded in this order.
    indicator_files = (
        ("Topic", "TOP", "./config/Indicators/topic.txt"),
        ("Reasoning", "RES", "./config/Indicators/reasoning.txt"),
        ("Blog", "BLG", "./config/Indicators/blog.txt"),
        ("Experience", "EXP", "./config/Indicators/experience.txt"),
    )
    db = sets.getDB()
    for name, code, path in indicator_files:
        sets.loadIndicators(db, name, code, path)

    print("Indicators updated")
def calculateDuplications():
    """
    Drops and populates the query_duplications table with calculated
    duplications.

    Every result url found in query_archive is reduced to its base url
    (everything before the first '?'). For each base url one document is
    written to query_duplications containing:
      - url:        the base url
      - frequency:  how many archived results pointed at it
      - query_list: the distinct query strings that returned it
      - date_list:  the distinct timestamp prefixes on which it was returned
    """
    db = sets.getDB()
    db.query_duplications.drop()

    # base url -> running record. A dict gives constant-time lookup per
    # result (instead of rescanning a list) and keeps first-seen order.
    duplication_count = {}

    # walk all results (query history)
    for res in db.query_archive.find():
        query_string = res['query_string']
        # first 13 characters of the timestamp,
        # e.g. "2017-08-21 16" from "2017-08-21 16:34:31.622082"
        date = res['timestamp'][0:13]

        for r in res['results']:
            # Ignore query parameters
            base_url = r['url'].split('?')[0]

            record = duplication_count.get(base_url)
            if record is None:
                duplication_count[base_url] = {
                    "url": base_url,
                    "count": 1,
                    "query_list": [query_string],
                    "date_list": [date],
                }
                continue

            record['count'] += 1
            if query_string not in record['query_list']:
                record['query_list'].append(query_string)
            if date not in record['date_list']:
                record['date_list'].append(date)

    for item in duplication_count.values():
        db.query_duplications.insert_one({
            "url": item['url'],
            "frequency": item['count'],
            "query_list": item['query_list'],
            "date_list": item['date_list'],
        })
def _isValidDate(value):
    """
    Return True if value is a real calendar date written as DD<sep>MM<sep>YYYY,
    where each <sep> is one of '-', ' ', '/' or '.' and the year is 19xx/20xx.
    """
    import datetime  # local import: only needed for the calendar check

    looks_like_date = re.match(
        r'^(0[1-9]|[12][0-9]|3[01])[- /.](0[1-9]|1[012])[- /.](19|20)\d\d$',
        value)
    if not looks_like_date:
        return False

    # The pattern fixes the layout to two digits, separator, two digits,
    # separator, four digits, so the parts can be sliced by position.
    day, month, year = int(value[0:2]), int(value[3:5]), int(value[6:10])
    try:
        # Rejects dates the pattern alone lets through, e.g. 31-02-2017.
        datetime.date(year, month, day)
    except ValueError:
        return False
    return True


def validateOptionalArgs(indicators, start_date, end_date, number_of_results):
    """
    Returns True if all optional arguments are valid, False otherwise.

    indicators:        "off", or an iterable of indicator codes; every code
                       must match the 'code' of a document in db.indicators.
    start_date:        "off", or a calendar date in DD-MM-YYYY form (the
                       separators may be '-', ' ', '/' or '.').
    end_date:          same rules as start_date.
    number_of_results: must be an int that is a positive multiple of ten.
    """
    if indicators != "off":
        # check indicators contains valid codes
        db = sets.getDB()
        db_codes = {ind_type['code'] for ind_type in db.indicators.find()}
        if any(code not in db_codes for code in indicators):
            return False

    # check start date is valid
    if start_date != "off" and not _isValidDate(start_date):
        return False

    # check end date is valid
    if end_date != "off" and not _isValidDate(end_date):
        return False

    # check number of results is a valid int; the exact type test is kept on
    # purpose so that bool values are rejected as well
    if type(number_of_results) is not int:
        return False

    # check number of results is a positive multiple of ten
    if number_of_results <= 0 or number_of_results % 10 != 0:
        return False

    return True
def query(query_string, indicators, start_date, end_date, number_of_results):
    """
    Query the api, store the results and export them.

    query_string:      the base search string.
    indicators:        'off', or a comma separated string of indicator codes.
    start_date:        passed to validation and on to the API query.
    end_date:          passed to validation and on to the API query.
    number_of_results: converted to int when possible; values above 100 are
                       reduced to 100.

    Exits the program (sys.exit) when validateOptionalArgs rejects the
    optional arguments. Otherwise the results replace the contents of
    query_results, are appended to query_archive, the duplications are
    recalculated and the results are exported to CSV.
    """
    if indicators != 'off':
        # convert indicators to a list
        indicators = indicators.split(',')

    try:
        number_of_results = int(number_of_results)
    except (TypeError, ValueError):
        # ignore, it will be picked up by validateOptionalArgs
        pass

    # check optional args are valid
    validArgs = ql.validateOptionalArgs(indicators, start_date, end_date,
                                        number_of_results)
    if not validArgs:
        print(
            "ERROR: one or more of the optional arguments are invalid, please review the documentation and try again"
        )
        sys.exit()

    # warn about number of results
    if number_of_results > 100:
        number_of_results = 100
        print(
            "WARNING: number of results can't be greater than 100, it has been set to 100 for you."
        )

    # get config
    db = sets.getDB()
    query_mode = sets.getQueryMode()
    number_of_runs = sets.getNumberOfRuns()

    # generate queries
    # SINGLE query mode returns a string
    # MULTI query mode returns a list of strings
    generated_query = ql.generateQuery(query_string, indicators, query_mode)

    # show what was generated and wait for the user before calling the API
    print(generated_query)
    print(len(generated_query))
    input("All good?")

    # query the API
    results = []
    if query_mode == 'single' or indicators == 'off':
        # generated_query is a string
        for _ in range(number_of_runs):
            results.append(
                ql.queryAPI(generated_query, start_date, end_date,
                            number_of_results))
    else:
        # generated query is a list; the loop variable is deliberately not
        # called "query" so it does not shadow this function
        for _ in range(number_of_runs):
            for single_query in generated_query:
                results.append(
                    ql.queryAPI(single_query, start_date, end_date,
                                number_of_results))

    # store the results in the database
    db.query_results.drop()

    # results is a nested list: one inner list per API call, each inner
    # list holding the records returned by that call
    for result_portion in results:
        for r in result_portion:
            db.query_results.insert_one(r)
            db.query_archive.insert_one(r)

    # populate duplications
    ql.calculateDuplications()

    print("Query complete, exporting db.results...")
    sets.exportResultsCSV()
    print("End of query")
def generateQuery(query_string, indicators, query_mode):
    """
    Generate query(s) from base string and indicators.

    - indicators "off": returns query_string unchanged (no database access).
    - SINGLE query mode returns a string: the base query followed by one
      " AND (...)" group per indicator code, each group OR-ing together the
      words of that indicator.
    - MULTI query mode returns a list of strings: the base query on its own,
      followed by one query per combination that takes exactly one word from
      each indicator.

    query_string: the base search string.
    indicators:   "off", or a list of indicator codes; each code is looked
                  up by its 'code' field in db.indicators and the 'words'
                  field of the matching document is used.
    query_mode:   'single' selects SINGLE mode, anything else MULTI mode.
    """
    # if indicators off, just return the query
    if indicators == "off":
        return query_string

    # indicator list provided: get the indicators from the database as a
    # nested list of words, one inner list per requested indicator code
    db = sets.getDB()
    ind_words = [
        db.indicators.find_one({"code": ind_code})['words']
        for ind_code in indicators
    ]

    if query_mode == 'single':
        # query mode is SINGLE, create a single string
        # NOTE(review): this branch quotes words with ' while MULTI mode
        # uses " - confirm which one the search API expects.
        generated_query_string = query_string
        for words in ind_words:
            # join() puts " OR " only between words, so the group never ends
            # with a dangling operator
            alternatives = " OR ".join("'" + word + "'" for word in words)
            generated_query_string += " AND (" + alternatives + ")"
        return generated_query_string

    # query mode is MULTI, create a list of queries
    # have one query that is just the query string
    query_list = [query_string]

    # get all combinations: one word from each indicator type
    for combo in itertools.product(*ind_words):
        suffix = "".join(' AND "' + key_phrase + '"' for key_phrase in combo)
        query_list.append(query_string + suffix)

    return query_list