Example 1
    def store(self):
        """Store the parsed data in the sqlalchemy scoped session."""
        assert self.session, 'No database session. Turning down.'

        with self.db_lock:
            num_results = 0

            serp = SearchEngineResultsPage(
                search_engine_name=self.search_engine,
                scrapemethod=self.scrapemethod,
                page_number=self.current_page,
                requested_at=self.current_request_time,
                requested_by=self.ip,
                query=self.current_keyword,
                num_results_for_keyword=self.parser.search_results['num_results'],
            )
            self.scraper_search.serps.append(serp)

            parse_serp(serp=serp, parser=self.parser)
            self.session.add(serp)
            self.session.commit()

        output_format = Config['GLOBAL'].get('output_format', 'stdout')
        output_file = Config['GLOBAL'].get('output_filename', 'google_scraper')

        if output_format == 'stdout':
            out(self.parser, lvl=2)
        elif output_format == 'json':
            if not hasattr(self, 'json_outfile'):
                self.json_outfile = open(output_file + '.json', 'a')

            obj = self._get_serp_obj()
            obj['requested_at'] = obj['requested_at'].isoformat()
            json.dump(obj, self.json_outfile, indent=2, sort_keys=True)

        elif output_format == 'csv':
            if not hasattr(self, 'csv_outfile'):
                self.csv_outfile = csv.DictWriter(open(output_file + '.csv', 'a'),
                        fieldnames=('link', 'title', 'snippet', 'visible_link', 'num_results',
                                    'query', 'search_engine_name', 'requested_by',
                                    'scrapemethod', 'page_number', 'requested_at'))
                self.csv_outfile.writeheader()

            rows = []
            for result_type, value in self.parser.search_results.items():
                if isinstance(value, list):
                    for link in value:
                        rows.append(link)

            obj = self._get_serp_obj()
            obj['num_results'] = self.parser.search_results['num_results']
            for row in rows:
                row.update(obj)
                self.csv_outfile.writerow(row)
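
A minimal standalone sketch of the csv.DictWriter pattern used in the 'csv' branch above; the field names match the example, while the result rows and SERP metadata are made up for illustration:

import csv

fieldnames = ('link', 'title', 'snippet', 'visible_link', 'num_results',
              'query', 'search_engine_name', 'requested_by',
              'scrapemethod', 'page_number', 'requested_at')

# hypothetical parsed results; in the example they come from self.parser.search_results
rows = [
    {'link': 'http://example.com/', 'title': 'Example Domain',
     'snippet': 'An example snippet', 'visible_link': 'example.com'},
]

# hypothetical SERP-level metadata; in the example it comes from self._get_serp_obj()
serp_meta = {
    'num_results': 1, 'query': 'example query', 'search_engine_name': 'google',
    'requested_by': '127.0.0.1', 'scrapemethod': 'http', 'page_number': 1,
    'requested_at': '2014-01-01T00:00:00',
}

with open('google_scraper.csv', 'a', newline='') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    writer.writeheader()
    for row in rows:
        row.update(serp_meta)  # merge SERP metadata into each result row
        writer.writerow(row)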
Example 2
    def run(self):

        while True:
            self.get_requests()

            if not self.requests:
                break

            self.results = self.loop.run_until_complete(asyncio.wait([r()() for r in self.requests]))

            for task in self.results[0]:
                scrape = task.result()

                if scrape:

                    if self.cache_manager:
                        self.cache_manager.cache_results(
                            scrape.parser, scrape.query, scrape.search_engine_name,
                            scrape.scrape_method, scrape.page_number)

                    if scrape.parser:
                        serp = parse_serp(self.config, parser=scrape.parser, scraper=scrape, query=scrape.query)

                        if self.scraper_search:
                            self.scraper_search.serps.append(serp)

                        if self.session:
                            self.session.add(serp)
                            self.session.commit()

                        store_serp_result(serp, self.config)
Example 3
    def run(self):
        print("----------------------------start def run(self):-----------------------------")
        while True:
            self.get_requests()

            if not self.requests:
                break

            self.results = self.loop.run_until_complete(asyncio.wait([r() for r in self.requests]))

            for task in self.results[0]:
                scrape = task.result()

                if scrape:

                    if self.cache_manager:
                        self.cache_manager.cache_results(
                            scrape.parser, scrape.query, scrape.search_engine_name,
                            scrape.scrape_method, scrape.page_number)

                    if scrape.parser:
                        serp = parse_serp(self.config, parser=scrape.parser, scraper=scrape, query=scrape.query)

                        if self.scraper_search:
                            self.scraper_search.serps.append(serp)

                        if self.session:
                            self.session.add(serp)
                            self.session.commit()

                        store_serp_result(serp, self.config)
        print("----------------------------end def run(self):-----------------------------")
Example 4
    def store(self):
        """Store the parsed data in the sqlalchemy scoped session."""
        assert self.session, 'No database session.'

        if self.html:
            self.parser.parse(self.html)
        else:
            print("Nothing to parse for {keyword}! (page len = {pagelen})".
                  format(keyword=self.query, pagelen=len(self.html)))
            self.parser = None

        with self.db_lock:

            serp = parse_serp(parser=self.parser,
                              scraper=self,
                              query=self.query)

            self.scraper_search.serps.append(serp)
            self.session.add(serp)
            self.session.commit()

            store_serp_result(serp)

            if serp.num_results:
                return True
            else:
                return False
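
The store() variants share one persistence pattern: a lock around the shared scoped session, add, commit, then report success via serp.num_results. A minimal self-contained sketch of that pattern with a hypothetical Serp model (not GoogleScraper's actual schema):

import threading

from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import scoped_session, sessionmaker

Base = declarative_base()

class Serp(Base):
    __tablename__ = 'serp'  # hypothetical table, for illustration only
    id = Column(Integer, primary_key=True)
    query = Column(String)
    num_results = Column(Integer)

engine = create_engine('sqlite:///:memory:')
Base.metadata.create_all(engine)
session = scoped_session(sessionmaker(bind=engine))
db_lock = threading.Lock()

def store(query, num_results):
    # mirror the examples: guard the shared session with a lock, persist the
    # SERP row, then report whether any results were found
    with db_lock:
        serp = Serp(query=query, num_results=num_results)
        session.add(serp)
        session.commit()
    return bool(serp.num_results)

print(store('example query', 10))  # -> True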
Example 5
    def run(self):

        while True:
            self.get_requests()

            if not self.requests:
                break

            self.results = self.loop.run_until_complete(asyncio.wait([r()() for r in self.requests]))

            for task in self.results[0]:
                scrape = task.result()

                if scrape:

                    cache_results(scrape.parser, scrape.query, scrape.search_engine_name, scrape.scrape_method,
                                  scrape.page_number)

                    if scrape.parser:
                        serp = parse_serp(parser=scrape.parser, scraper=scrape, query=scrape.query)

                        self.scraper_search.serps.append(serp)
                        self.session.add(serp)
                        self.session.commit()

                        store_serp_result(serp)
Example 6
    def store(self):
        """Store the parsed data in the sqlalchemy scoped session."""
        assert self.session, 'No database session.'

        if self.html:
            self.parser.parse(self.html)
        else:
            self.parser = None

        with self.db_lock:

            serp = parse_serp(self.config,
                              parser=self.parser,
                              scraper=self,
                              query=self.query)

            self.scraper_search.serps.append(serp)
            self.session.add(serp)
            self.session.commit()

            store_serp_result(serp, self.config)

            if serp.num_results:
                return True
            else:
                return False
Example 7
def parse_again(fname, search_engine, scrapemethod, query):
    html = read_cached_file(get_path(fname))
    return parse_serp(html=html,
                      search_engine=search_engine,
                      scrapemethod=scrapemethod,
                      current_page=0,
                      current_keyword=query)
Example 8
    def store(self):
        """Store the parsed data in the sqlalchemy scoped session."""
        assert self.session, 'No database session. Turning down.'

        with self.db_lock:
            num_results = 0

            serp = SearchEngineResultsPage(
                search_engine_name=self.search_engine,
                scrapemethod=self.scrapemethod,
                page_number=self.current_page,
                requested_at=self.current_request_time,
                requested_by=self.ip,
                query=self.current_keyword,
                num_results_for_keyword=self.parser.search_results['num_results'],
            )
            self.scraper_search.serps.append(serp)

            parse_serp(serp=serp, parser=self.parser)
            self.session.add(serp)
            self.session.commit()

        output_format = Config['GLOBAL'].get('output_format', 'stdout')
        output_file = Config['GLOBAL'].get('output_filename', 'google_scraper')

        def results():
            rows = []
            for result_type, value in self.parser.search_results.items():
                if isinstance(value, list):
                    for link in value:
                        rows.append(link)
            return rows

        if output_format == 'stdout':
            out(self.parser, lvl=2)
        elif output_format == 'json':
            obj = self._get_serp_obj()
            obj['results'] = results()
            json.dump(obj, self.json_outfile, indent=2, sort_keys=True)
            self.json_outfile.write(',')

        elif output_format == 'csv':
            obj = self._get_serp_obj()
            for row in results():
                row.update(obj)
                self.csv_outfile.writerow(row)
Example 9
    def store(self):
        """Store the parsed data in the sqlalchemy scoped session."""
        assert self.session, 'No database session. Turning down.'

        with self.db_lock:
            num_results = 0

            serp = SearchEngineResultsPage(
                search_engine_name=self.search_engine,
                scrapemethod=self.scrapemethod,
                page_number=self.current_page,
                requested_at=self.current_request_time,
                requested_by=self.ip,
                query=self.current_keyword,
                num_results_for_keyword=self.parser.search_results['num_results'],
            )
            self.scraper_search.serps.append(serp)

            parse_serp(serp=serp, parser=self.parser)
            self.session.add(serp)
            self.session.commit()

        output_format = Config['GLOBAL'].get('output_format', 'stdout')
        output_file = Config['GLOBAL'].get('output_filename', 'google_scraper')

        def results():
            rows = []
            for result_type, value in self.parser.search_results.items():
                if isinstance(value, list):
                    for link in value:
                        rows.append(link)
            return rows

        if output_format == 'stdout':
            out(self.parser, lvl=2)
        elif output_format == 'json':
            obj = self._get_serp_obj()
            obj['results'] = results()
            json.dump(obj, self.json_outfile, indent=2, sort_keys=True)
            self.json_outfile.write(',')

        elif output_format == 'csv':
            obj = self._get_serp_obj()
            for row in results():
                row.update(obj)
                self.csv_outfile.writerow(row)
Example 10
def parse_again(fname, search_engine, scrapemethod, query):
    html = read_cached_file(get_path(fname))
    return parse_serp(
        html=html,
        search_engine=search_engine,
        scrapemethod=scrapemethod,
        current_page=0,
        current_keyword=query
    )
Example 11
def parse_again(fname, search_engine, scrape_method, query):
    """
    @todo: `scrape_method` is not used here -> check if scrape_method is passed to this function and remove it
    """
    html = read_cached_file(get_path(fname))
    return parse_serp(
        html=html,
        search_engine=search_engine,
        query=query
    )
Example 12
    def parse_again(self, fname, search_engine, scrape_method, query):
        """
        @todo: `scrape_method` is not used here -> check if scrape_method is passed to this function and remove it
        """
        path = os.path.join(self.config.get('cachedir', '.scrapecache'), fname)
        html = self.read_cached_file(path)
        return parse_serp(self.config,
                          html=html,
                          search_engine=search_engine,
                          query=query)
Example 13
    def store(self):
        """Store the parsed data in the sqlalchemy scoped session."""
        assert self.session, 'No database session. Turning down.'

        with self.db_lock:
            num_results = 0

            serp = SearchEngineResultsPage(
                search_engine_name=self.search_engine,
                scrapemethod=self.scrapemethod,
                page_number=self.current_page,
                requested_at=datetime.datetime.utcnow(),
                requested_by=self.ip,
                query=self.current_keyword,
                num_results_for_keyword=self.parser.search_results['num_results'],
            )
            self.scraper_search.serps.append(serp)

            parse_serp(serp=serp, parser=self.parser)
            self.session.add(serp)
            self.session.commit()
Example 14
    def parse_again(self, fname, search_engine, scrape_method, query):
        """
        @todo: `scrape_method` is not used here -> check if scrape_method is passed to this function and remove it
        """
        path = os.path.join(self.config.get('cachedir', '.scrapecache'), fname)
        html = self.read_cached_file(path)
        return parse_serp(
            self.config,
            html=html,
            search_engine=search_engine,
            query=query
        )
Example 15
    def store(self):
        """Store the parsed data in the sqlalchemy scoped session."""
        assert self.session, 'No database session. Turning down.'

        with self.db_lock:
            serp = SearchEngineResultsPage(
                search_engine_name=self.search_engine,
                scrapemethod=self.scrapemethod,
                page_number=self.current_page,
                requested_at=self.current_request_time,
                requested_by=self.ip,
                query=self.current_keyword,
                num_results_for_keyword=self.parser.search_results['num_results'],
            )
            self.scraper_search.serps.append(serp)

            serp, parser = parse_serp(serp=serp, parser=self.parser)
            self.session.add(serp)
            self.session.commit()

            store_serp_result(dict_from_scraping_object(self), self.parser)
Example 16
    def store(self):
        """Store the parsed data in the sqlalchemy scoped session."""
        assert self.session, 'No database session.'

        if self.html:
            self.parser.parse(self.html)
        else:
            self.parser = None

        with self.db_lock:

            serp = parse_serp(parser=self.parser, scraper=self, query=self.query)

            self.scraper_search.serps.append(serp)
            self.session.add(serp)
            self.session.commit()

            store_serp_result(serp)

            if serp.num_results:
                return True
            else:
                return False
Example 17
def parse_again(fname, search_engine, scrape_method, query):
    """
    @todo: `scrape_method` is not used here -> check if scrape_method is passed to this function and remove it
    """
    html = read_cached_file(get_path(fname))
    return parse_serp(html=html, search_engine=search_engine, query=query)
Example 18
def parse_all_cached_files(keywords, search_engines, session, scraper_search, try_harder=False):
    """Walk recursively through the cachedir (as given by the Config) and parse all cached files.

    Args:
        keywords: The list of keywords that, together with the search engine and scrape method,
                  identify the cached results.
        search_engines: The list of search engines that were scraped.
        session: A sqlalchemy session used to add the entities.
        scraper_search: The ScraperSearch object that the parsed SERPs are appended to.
        try_harder: If there is a cache file that cannot be mapped to a keyword, read it again and try to
                    extract the search query from the html.

    Returns:
        A list of keywords that couldn't be parsed and which need to be scraped anew.
    """
    google_query_needle = re.compile(r'<title>(?P<kw>.*?) - Google Search</title>')

    files = _get_all_cache_files()
    mapping = {}
    scrapemethod = Config['SCRAPING'].get('scrapemethod')
    num_cached = 0

    for kw in keywords:
        for search_engine in search_engines:
            key = cached_file_name(kw, search_engine, scrapemethod)

            out('Params(keyword="{kw}", search_engine="{se}", scrapemethod="{sm}") yields {hash}'.format(
                    kw=kw,
                    se=search_engine,
                    sm=scrapemethod,
                    hash=key
                ), lvl=5)

            mapping[key] = (kw, search_engine)

    for path in files:
        # strip off the extension of the path if it has any
        fname = os.path.split(path)[1]
        clean_filename = fname
        for ext in ALLOWED_COMPRESSION_ALGORITHMS:
            if fname.endswith(ext):
                # slice off '.' + ext (rstrip would strip characters, not a suffix)
                clean_filename = fname[:-(len(ext) + 1)]

        query = search_engine = None
        val = mapping.get(clean_filename, None)
        if val:
            query, search_engine = val

        if query and search_engine:
            # We found a file that contains the keyword, search engine name and
            # searchmode that fits our description. Let's see if there is already
            # a record in the database and link it to our new ScraperSearch object.
            try:
                serp = session.query(SearchEngineResultsPage).filter(
                        SearchEngineResultsPage.query == query,
                        SearchEngineResultsPage.search_engine_name == search_engine,
                        SearchEngineResultsPage.scrapemethod == scrapemethod).one()
            except NoResultFound as e:
                # that shouldn't happen
                # we have a cache file that matches the above identifying information
                # but it was never stored to the database.
                logger.error('No entry for file {} found in database. Will parse again.'.format(clean_filename))
                html = read_cached_file(get_path(fname))
                serp = parse_serp(
                    html=html,
                    search_engine=search_engine,
                    scrapemethod=scrapemethod,
                    current_page=0,
                    current_keyword=query
                )
            except MultipleResultsFound as e:
                raise e

            if serp:
                scraper_search.serps.append(serp)

            mapping.pop(clean_filename)
            num_cached += 1

    out('{} cache files found in {}'.format(len(files), Config['GLOBAL'].get('cachedir')), lvl=1)
    out('{}/{} keywords have been cached and are ready to get parsed. {} remain to get scraped.'.format(
        num_cached, len(keywords), len(keywords) - num_cached), lvl=1)

    session.add(scraper_search)
    session.commit()
    # return the remaining keywords to scrape
    return [e[0] for e in mapping.values()]
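
cached_file_name() is what maps a (keyword, search engine, scrape method) triple to a cache file name; its exact scheme is not shown above. A plausible sketch, assuming a sha256-based name as suggested by the "yields {hash}" log message (an assumption, not the project's actual implementation):

import hashlib

def cache_key(keyword, search_engine, scrapemethod):
    # hypothetical stand-in for cached_file_name(): derive a stable name
    # from the identifying triple
    unique = ' '.join([keyword, search_engine, scrapemethod])
    return hashlib.sha256(unique.encode()).hexdigest()

print(cache_key('example query', 'google', 'http'))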
Example 19
def parse_all_cached_files(keywords,
                           session,
                           scraper_search,
                           try_harder=False):
    """Walk recursively through the cachedir (as given by the Config) and parse all cached files.

    Args:
        keywords: The list of keywords that, together with the search engine and scrape method
                  from the Config, identify the cached results.
        session: A sqlalchemy session used to add the entities.
        scraper_search: The ScraperSearch object that the parsed SERPs are appended to.
        try_harder: If there is a cache file that cannot be mapped to a keyword, read it again and try to
                    extract the search query from the html.

    Returns:
        A list of keywords that couldn't be parsed and which need to be scraped anew.
    """
    google_query_needle = re.compile(
        r'<title>(?P<kw>.*?) - Google Search</title>')
    files = _get_all_cache_files()
    mapping = {}
    search_engine = Config['SCRAPING'].get('search_engine')
    scrapemethod = Config['SCRAPING'].get('scrapemethod')
    for kw in keywords:
        key = cached_file_name(kw, search_engine, scrapemethod)

        out('Params(keyword="{kw}", search_engine="{se}", scrapemethod="{sm}") yields {hash}'
            .format(kw=kw, se=search_engine, sm=scrapemethod, hash=key),
            lvl=5)

        mapping[key] = kw

    for path in files:
        # strip off the extension of the path if it has any
        fname = os.path.split(path)[1]
        clean_filename = fname
        for ext in ALLOWED_COMPRESSION_ALGORITHMS:
            if fname.endswith(ext):
                # slice off '.' + ext (rstrip would strip characters, not a suffix)
                clean_filename = fname[:-(len(ext) + 1)]

        query = mapping.get(clean_filename, None)

        if query:
            # We found a file that contains the keyword, search engine name and
            # searchmode that fits our description. Let's see if there is already
            # a record in the database and link it to our new ScraperSearch object.
            try:
                serp = session.query(SearchEngineResultsPage).filter(
                    SearchEngineResultsPage.query == query,
                    SearchEngineResultsPage.search_engine_name == search_engine,
                    SearchEngineResultsPage.scrapemethod == scrapemethod).one()
            except NoResultFound as e:
                # that shouldn't happen
                # we have a cache file that matches the above identifying information
                # but it was never stored to the database.
                logger.error(
                    'No entry for file {} found in database. Will parse again.'
                    .format(clean_filename))
                serp = parse_serp(html=read_cached_file(get_path(fname)),
                                  search_engine=search_engine,
                                  scrapemethod=scrapemethod,
                                  current_page=0,
                                  current_keyword=query)
            except MultipleResultsFound as e:
                raise e
            scraper_search.serps.append(serp)

            mapping.pop(clean_filename)

        # TODO: support query detection for all supported search engines
        # by parsing the keyword, search engine from the raw html

    out('{} cache files found in {}'.format(len(files),
                                            Config['GLOBAL'].get('cachedir')),
        lvl=1)
    out('{}/{} keywords have been cached and are ready to get parsed. {} remain to get scraped.'
        .format(len(keywords) - len(mapping), len(keywords), len(mapping)),
        lvl=1)

    session.add(scraper_search)
    session.commit()
    # return the remaining keywords to scrape
    return mapping.values()
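
google_query_needle is compiled in both variants but never used in the code shown; presumably it backs the try_harder / query-detection path mentioned in the TODO. A small sketch of how it could recover the query from a cached Google page:

import re

google_query_needle = re.compile(r'<title>(?P<kw>.*?) - Google Search</title>')

# hypothetical cached page content
html = '<html><head><title>example query - Google Search</title></head></html>'

match = google_query_needle.search(html)
if match:
    print(match.group('kw'))  # -> 'example query'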
Example 20
def parse_again(fname, search_engine, scrape_method, query):
    html = read_cached_file(get_path(fname))
    return parse_serp(html=html, search_engine=search_engine, query=query)