def __init__(self, query_string, exact_matches, replace_parentheses):
    """
    Build a DuckDuckGo HTML search query.

    :param query_string: Raw query text.
    :param exact_matches: Wrap each (sub-)query in double quotes so the
        engine searches for exact phrases.
    :param replace_parentheses: Split the query on Query.parentheses_regex
        and recombine the non-empty fragments.
    """
    if replace_parentheses:
        # keep only non-empty fragments produced by the parentheses split
        fragments = [f for f in Query.parentheses_regex.split(query_string) if len(f) > 0]
        if exact_matches:
            self.query_string = '"' + '" "'.join(fragments) + '"'
        else:
            self.query_string = ' '.join(fragments)
    else:
        base = str(query_string)
        self.query_string = '"' + base + '"' if exact_matches else base

    self.uri = 'https://duckduckgo.com/html/?q=' + urllib.parse.quote(
        self.query_string)
    self.headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:68.0) Gecko/20100101 Firefox/68.0",
        "Accept-Language": "en"
    }
    self.search_results = SearchResultList()
    # session for data retrieval
    self.session = requests.Session()
def handle_error(self, max_results, min_wait, max_wait, wait_on_error,
                 check_for_empty_snippets, depth=0, e=None):
    """
    React to a failed result retrieval: retry up to three times for
    network-style errors, terminate the process for other OSErrors, and
    otherwise skip this query.

    :param max_results: Forwarded to retrieve_search_results on retry.
    :param min_wait: Forwarded to retrieve_search_results on retry.
    :param max_wait: Forwarded to retrieve_search_results on retry.
    :param wait_on_error: Milliseconds to sleep before retrying.
    :param check_for_empty_snippets: Forwarded on retry; forced to False
        on the final retry attempt.
    :param depth: Current retry depth (0..2 allow a retry).
    :param e: The exception that triggered the error handling, if any.
    """
    logger.error('An error occurred while retrieving result list for query: ' + str(self))
    logger.error('Resetting result list for query: ' + str(self))
    self.search_results = SearchResultList()

    # NOTE(review): exact type comparison (not isinstance) kept from the
    # original so OSError subclasses are not matched here.
    retryable = (e is None
                 or isinstance(e, requests.exceptions.RequestException)
                 or (type(e) == OSError and e.errno == errno.ENETDOWN))
    if depth <= 2 and retryable:
        # ignore empty snippets in last iteration
        if depth == 2:
            check_for_empty_snippets = False
        logger.info('Retrying in ' + str(wait_on_error) + ' milliseconds... ')
        time.sleep(wait_on_error / 1000)
        self.retrieve_search_results(max_results, min_wait, max_wait, wait_on_error,
                                     check_for_empty_snippets, depth + 1)
        return

    if type(e) == OSError:
        # non-retryable OS-level failure: abort the whole run
        logger.error('Terminating.')
        sys.exit(1)

    if not self.has_failed:
        self.has_failed = True
        logger.info('Stopping this query, continuing with next query...')
        return

    logger.info('Unable to retrieve search results for query: ' + str(self))
    return
def __init__(self, query_string, exact_matches, remove_special_characters):
    """
    Build a DuckDuckGo HTML search query from a (possibly non-ASCII) string.

    :param query_string: Raw query text.
    :param exact_matches: Wrap the query (or each sub-query) in double
        quotes so the engine searches for exact phrases.
    :param remove_special_characters: Split the query on
        Query.special_character_regex and recombine the non-empty parts.
    """
    # transliterate unicode string into closest possible ASCII representation
    # not doing this caused issues with queries such as "L'Hôpital's rule"
    self.query_string = unidecode.unidecode(query_string)
    self.is_empty = False
    self.has_failed = False

    if remove_special_characters:
        tokens = [t for t in Query.special_character_regex.split(self.query_string)
                  if len(t) > 0]
        if not tokens:
            self.is_empty = True
        elif exact_matches:
            self.query_string = '"' + '" "'.join(tokens) + '"'
        else:
            self.query_string = ' '.join(tokens)
    else:
        stripped = str(self.query_string).strip()
        self.query_string = stripped
        if not stripped:
            self.is_empty = True
        elif exact_matches:
            self.query_string = '"' + stripped + '"'

    self.uri = 'https://duckduckgo.com/html/?q=' + urllib.parse.quote(self.query_string)
    # see https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept
    self.headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:68.0) Gecko/20100101 Firefox/68.0",
        "Accept": "text/html, application/xhtml+xml, application/xml;q=0.9, */*;q=0.8",
        "Accept-Charset": "utf-8",
        "Accept-Language": "en"
    }
    self.search_results = SearchResultList()
    # session for data retrieval
    self.session = requests.Session()
def main():
    """
    Command-line entry point: read the configuration, then either retrieve
    search results for a list of queries or detect the language of snippets
    in an existing result file.
    """
    # parse command line arguments
    parser = get_argument_parser()
    args = parser.parse_args()

    # parse config file
    config = configparser.ConfigParser()
    config.read(args.config_file)

    # read configuration
    if 'DEFAULT' not in config:
        logger.error("DEFAULT configuration missing.\nTerminating.")
        sys.exit()

    # i/o
    # BUG FIX: the original wrapped each .get(...) in str(), which turned a
    # missing value (None) into the string 'None' — the required-config
    # check below could therefore never trigger. ConfigParser.get already
    # returns a str when the option is present.
    input_file = config['DEFAULT'].get('InputFile', None)
    output_dir = config['DEFAULT'].get('OutputDirectory', None)
    delimiter = config['DEFAULT'].get('Delimiter', None)
    if input_file is None or output_dir is None or delimiter is None:
        logger.error("Required configuration missing.\nTerminating.")
        sys.exit()

    # requests
    exact_matches = config['DEFAULT'].getboolean('ExactMatches', True)
    replace_parentheses = config['DEFAULT'].getboolean('ReplaceParentheses', True)
    max_results = config['DEFAULT'].getint('MaxResults', 25)
    min_wait = config['DEFAULT'].getint('MinWait', 500)
    max_wait = config['DEFAULT'].getint('MaxWait', 2000)

    # detecting languages of snippets
    detect_languages = config['DEFAULT'].getboolean('DetectLanguages', True)

    queries_only = False
    # read CSV as UTF-8 encoded file (see also http://stackoverflow.com/a/844443)
    with codecs.open(input_file, encoding='utf8') as fp:
        logger.info("Checking input format in " + input_file + "...")
        reader = csv.reader(fp, delimiter=delimiter)
        # read header
        header = next(reader, None)
        if not header:
            raise IllegalArgumentError("Missing header in CSV file.")
        # a single-column file is interpreted as a plain list of queries
        queries_only = len(header) == 1

    if queries_only:
        logger.info(
            "Input file contains only queries, retrieving search results...")
        query_list = QueryList()
        query_list.read_from_csv(input_file, exact_matches, replace_parentheses,
                                 delimiter)
        query_list.retrieve_search_results(max_results, min_wait, max_wait,
                                           detect_languages)
        query_list.write_search_results_to_csv(output_dir, delimiter,
                                               detect_languages)
    elif detect_languages:
        logger.info(
            "Input file contains search results, detecting language of snippets..."
        )
        search_result_list = SearchResultList()
        search_result_list.read_from_csv(input_file, delimiter)
        search_result_list.detect_languages()
        search_result_list.write_to_csv(output_dir, delimiter, detect_languages)
    else:
        logger.info("No action configured, terminating...")
def __init__(self):
    """Initialize an empty list with no associated input file."""
    # basename of the imported CSV file (set on import)
    self.filename = ""
    # imported entries
    self.values = []
    # aggregated search results
    self.search_results = SearchResultList()
class QueryList(object):
    """ List of search queries. """

    def __init__(self):
        # basename of the imported CSV file (set by read_from_csv)
        self.filename = ""
        # Query objects imported from the CSV file
        self.values = []
        # search results aggregated across all queries
        self.search_results = SearchResultList()

    def read_from_csv(self, input_file, exact_matches, replace_parentheses,
                      delimiter):
        """
        Read search queries from a CSV file (header required).

        :param input_file: Path to the CSV file.
        :param exact_matches: Only search for exact matches of query strings
        :param replace_parentheses: Replace Wikipedia parentheses in query strings
        :param delimiter: Column delimiter in CSV file (typically ',').
        """
        # read CSV as UTF-8 encoded file (see also http://stackoverflow.com/a/844443)
        with codecs.open(input_file, encoding='utf8') as fp:
            logger.info("Reading search queries from " + input_file + "...")
            reader = csv.reader(fp, delimiter=delimiter)

            header = next(reader, None)
            if not header:
                raise IllegalArgumentError("Missing header in CSV file.")
            query_column = header.index("query")

            for row in reader:
                if not row:
                    raise IllegalArgumentError("Wrong CSV format.")
                self.values.append(
                    Query(row[query_column], exact_matches, replace_parentheses))

        self.filename = os.path.basename(input_file)
        logger.info(
            str(len(self.values)) + " search queries have been imported.")

    def retrieve_search_results(self, max_results, min_wait, max_wait,
                                detect_languages):
        # fetch results per query, optionally tag snippet languages, and
        # collect everything into this list's aggregated result set
        for query in self.values:
            query.retrieve_search_results(max_results, min_wait, max_wait)
            if detect_languages:
                query.search_results.detect_languages()
            self.search_results.values.extend(query.search_results.values)

    def write_search_results_to_csv(self, output_dir, delimiter,
                                    include_language):
        """
        Export search results to a CSV file.

        :param output_dir: Target directory for generated CSV file.
        :param delimiter: Column delimiter in CSV file (typically ',').
        :param include_language: Add column "language" if tool was configured
            to detect languages of snippets
        """
        self.search_results.write_to_csv(output_dir, delimiter,
                                         include_language, self.filename)