コード例 #1
0
ファイル: query.py プロジェクト: yu-andy/ddg-retriever
    def __init__(self, query_string, exact_matches, replace_parentheses):
        """
        Build the query text, the request URI/headers and an empty result list.
        :param query_string: Raw query text.
        :param exact_matches: Wrap the query (or every sub-query) in double quotes.
        :param replace_parentheses: Split the query with Query.parentheses_regex
            and re-join the non-empty fragments.
        """
        if replace_parentheses:
            # keep only the non-empty fragments produced by the split
            fragments = [
                fragment
                for fragment in Query.parentheses_regex.split(query_string)
                if len(fragment) > 0
            ]
            # with exact matches each fragment ends up in its own pair of quotes
            separator = '" "' if exact_matches else ' '
            text = separator.join(fragments)
        else:
            text = str(query_string)

        if exact_matches:
            text = '"' + text + '"'
        self.query_string = text

        # request settings for the DuckDuckGo HTML endpoint
        quoted_query = urllib.parse.quote(self.query_string)
        self.uri = 'https://duckduckgo.com/html/?q=' + quoted_query
        self.headers = {
            "User-Agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:68.0) Gecko/20100101 Firefox/68.0",
            "Accept-Language": "en"
        }

        # results retrieved for this query
        self.search_results = SearchResultList()

        # session for data retrieval
        self.session = requests.Session()
コード例 #2
0
ファイル: query.py プロジェクト: sbaltes/ddg-retriever
 def handle_error(self, max_results, min_wait, max_wait, wait_on_error, check_for_empty_snippets, depth=0, e=None):
     """
     React to a failed retrieval: reset the result list, then retry, terminate or give up.
     :param max_results: Passed through to retrieve_search_results on retry.
     :param min_wait: Passed through to retrieve_search_results on retry.
     :param max_wait: Passed through to retrieve_search_results on retry.
     :param wait_on_error: Pause before a retry, in milliseconds.
     :param check_for_empty_snippets: Passed through on retry; forced to False when depth is 2.
     :param depth: Number of retries already performed; retries happen only while depth <= 2.
     :param e: Exception that triggered the error handling, or None.
     """
     logger.error('An error occurred while retrieving result list for query: ' + str(self))
     logger.error('Resetting result list for query: ' + str(self))
     self.search_results = SearchResultList()

     # retry only for "no exception", request errors, or a plain OSError signalling ENETDOWN
     should_retry = depth <= 2 and (
         e is None
         or isinstance(e, requests.exceptions.RequestException)
         or (type(e) == OSError and e.errno == errno.ENETDOWN)
     )

     if should_retry:
         # ignore empty snippets in last iteration
         check_on_retry = False if depth == 2 else check_for_empty_snippets
         logger.info('Retrying in ' + str(wait_on_error) + ' milliseconds... ')
         time.sleep(wait_on_error / 1000)
         self.retrieve_search_results(max_results, min_wait, max_wait, wait_on_error,
                                      check_on_retry, depth + 1)
         return

     # exact type comparison on purpose: subclasses of OSError must not terminate the run
     if type(e) == OSError:
         logger.error('Terminating.')
         sys.exit(1)

     if self.has_failed:
         logger.info('Unable to retrieve search results for query: ' + str(self))
         return

     # first failure of this query: remember it and move on
     self.has_failed = True
     logger.info('Stopping this query, continuing with next query...')
     return
コード例 #3
0
ファイル: query.py プロジェクト: sbaltes/ddg-retriever
    def __init__(self, query_string, exact_matches, remove_special_characters):
        """
        Normalise the query text and prepare URI, headers, result list and session.
        :param query_string: Raw query text.
        :param exact_matches: Wrap the query (or every sub-query) in double quotes.
        :param remove_special_characters: Split the query with
            Query.special_character_regex and re-join the non-empty fragments.
        """
        # transliterate unicode string into closest possible ASCII representation
        # not doing this caused issues with queries such as "L'Hôpital's rule"
        self.query_string = unidecode.unidecode(query_string)
        self.is_empty = False
        self.has_failed = False

        if remove_special_characters:
            fragments = [
                fragment
                for fragment in Query.special_character_regex.split(self.query_string)
                if len(fragment) > 0
            ]

            # nothing left after splitting: flag the query as empty, keep the text as is
            if not fragments:
                self.is_empty = True
            elif exact_matches:
                self.query_string = '"' + '" "'.join(fragments) + '"'
            else:
                self.query_string = ' '.join(fragments)
        else:
            stripped = str(self.query_string).strip()
            self.query_string = stripped

            if not stripped:
                self.is_empty = True
            elif exact_matches:
                self.query_string = '"' + stripped + '"'

        quoted_query = urllib.parse.quote(self.query_string)
        self.uri = 'https://duckduckgo.com/html/?q=' + quoted_query
        # see https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:68.0) Gecko/20100101 Firefox/68.0",
            "Accept": "text/html, application/xhtml+xml, application/xml;q=0.9, */*;q=0.8",
            "Accept-Charset": "utf-8",
            "Accept-Language": "en"
        }

        # results retrieved for this query
        self.search_results = SearchResultList()

        # session for data retrieval
        self.session = requests.Session()
コード例 #4
0
def main():
    """
    Entry point: read the configuration, inspect the input CSV and run the configured action.

    If the CSV header has a single column, the file is treated as a list of
    queries whose search results are retrieved and exported. Otherwise, when
    language detection is enabled, the file is treated as a list of search
    results whose snippet languages are detected and exported.

    Exits with status 1 if the configuration is incomplete; raises
    IllegalArgumentError if the input CSV has no header.
    """
    # parse command line arguments
    parser = get_argument_parser()
    args = parser.parse_args()

    # parse config file
    config = configparser.ConfigParser()
    config.read(args.config_file)

    # read configuration
    if 'DEFAULT' not in config:
        logger.error("DEFAULT configuration missing.\nTerminating.")
        sys.exit(1)

    defaults = config['DEFAULT']

    # i/o
    # Do not wrap these in str(): str(None) is the truthy string 'None', which
    # would hide a missing option from the check below.
    input_file = defaults.get('InputFile')
    output_dir = defaults.get('OutputDirectory')
    delimiter = defaults.get('Delimiter')

    # a missing option yields None, an option without a value yields ''
    if not input_file or not output_dir or not delimiter:
        logger.error("Required configuration missing.\nTerminating.")
        sys.exit(1)

    # requests
    exact_matches = defaults.getboolean('ExactMatches', True)
    replace_parentheses = defaults.getboolean('ReplaceParentheses', True)
    max_results = defaults.getint('MaxResults', 25)
    min_wait = defaults.getint('MinWait', 500)
    max_wait = defaults.getint('MaxWait', 2000)

    # detecting languages of snippets
    detect_languages = defaults.getboolean('DetectLanguages', True)

    # read CSV as UTF-8 encoded file (see also http://stackoverflow.com/a/844443)
    with codecs.open(input_file, encoding='utf8') as fp:
        logger.info("Checking input format in " + input_file + "...")
        reader = csv.reader(fp, delimiter=delimiter)
        # read header
        header = next(reader, None)
        if not header:
            raise IllegalArgumentError("Missing header in CSV file.")
        # a single-column header means the file only lists queries
        queries_only = len(header) == 1

    if queries_only:
        logger.info(
            "Input file contains only queries, retrieving search results...")
        query_list = QueryList()
        query_list.read_from_csv(input_file, exact_matches,
                                 replace_parentheses, delimiter)
        query_list.retrieve_search_results(max_results, min_wait, max_wait,
                                           detect_languages)
        query_list.write_search_results_to_csv(output_dir, delimiter,
                                               detect_languages)
    elif detect_languages:
        logger.info(
            "Input file contains search results, detecting language of snippets..."
        )
        search_result_list = SearchResultList()
        search_result_list.read_from_csv(input_file, delimiter)
        search_result_list.detect_languages()
        search_result_list.write_to_csv(output_dir, delimiter,
                                        detect_languages)
    else:
        logger.info("No action configured, terminating...")
コード例 #5
0
 def __init__(self):
     """Start empty: no filename, no values and a fresh SearchResultList."""
     self.search_results = SearchResultList()
     self.values = []
     self.filename = ""
コード例 #6
0
class QueryList(object):
    """ Collection of search queries together with their accumulated search results. """

    def __init__(self):
        """ Start empty: no filename, no queries and a fresh SearchResultList. """
        self.search_results = SearchResultList()
        self.values = []
        self.filename = ""

    def read_from_csv(self, input_file, exact_matches, replace_parentheses,
                      delimiter):
        """
        Import search queries from the "query" column of a CSV file (header required).
        :param input_file: Path to the CSV file.
        :param exact_matches: Only search for exact matches of query strings
        :param replace_parentheses: Replace Wikipedia parentheses in query strings
        :param delimiter: Column delimiter in CSV file (typically ',').
        """

        # read CSV as UTF-8 encoded file (see also http://stackoverflow.com/a/844443)
        with codecs.open(input_file, encoding='utf8') as csv_file:
            logger.info("Reading search queries from " + input_file + "...")

            rows = csv.reader(csv_file, delimiter=delimiter)

            # the first row must be the header
            header = next(rows, None)
            if not header:
                raise IllegalArgumentError("Missing header in CSV file.")

            # position of the column holding the query strings
            query_column = header.index("query")

            # every remaining row yields one query; empty rows are rejected
            for row in rows:
                if not row:
                    raise IllegalArgumentError("Wrong CSV format.")
                self.values.append(
                    Query(row[query_column], exact_matches,
                          replace_parentheses))

        self.filename = os.path.basename(input_file)
        logger.info("{} search queries have been imported.".format(
            len(self.values)))

    def retrieve_search_results(self, max_results, min_wait, max_wait,
                                detect_languages):
        """
        Retrieve the results of every query and collect them in self.search_results.
        :param max_results: Passed on to each query's retrieve_search_results.
        :param min_wait: Passed on to each query's retrieve_search_results.
        :param max_wait: Passed on to each query's retrieve_search_results.
        :param detect_languages: Run language detection on each query's results.
        """
        collected = self.search_results.values
        for current in self.values:
            current.retrieve_search_results(max_results, min_wait, max_wait)
            results = current.search_results
            if detect_languages:
                results.detect_languages()
            collected.extend(results.values)

    def write_search_results_to_csv(self, output_dir, delimiter,
                                    include_language):
        """
        Write the collected search results to a CSV file.
        :param output_dir: Target directory for generated CSV file.
        :param delimiter: Column delimiter in CSV file (typically ',').
        :param include_language: Add column "language" if tool was configured to detect languages of snippets
        """
        results = self.search_results
        results.write_to_csv(output_dir, delimiter, include_language,
                             self.filename)