    def run(self):
        print("----------------------------start def run(self):-----------------------------")
        while True:
            self.get_requests()

            if not self.requests:
                break

            self.results = self.loop.run_until_complete(asyncio.wait([r() for r in self.requests]))

            for task in self.results[0]:
                scrape = task.result()

                if scrape:

                    if self.cache_manager:
                        self.cache_manager.cache_results(
                            scrape.parser, scrape.query, scrape.search_engine_name,
                            scrape.scrape_method, scrape.page_number)

                    if scrape.parser:
                        serp = parse_serp(self.config, parser=scrape.parser, scraper=scrape, query=scrape.query)

                        if self.scraper_search:
                            self.scraper_search.serps.append(serp)

                        if self.session:
                            self.session.add(serp)
                            self.session.commit()

                        store_serp_result(serp, self.config)
        print("----------------------------end def run(self):-----------------------------")
Example 2
    def store(self):
        """Store the parsed data in the sqlalchemy scoped session."""
        assert self.session, 'No database session.'

        if self.html:
            self.parser.parse(self.html)
        else:
            self.parser = None

        with self.db_lock:

            serp = parse_serp(self.config,
                              parser=self.parser,
                              scraper=self,
                              query=self.query)

            self.scraper_search.serps.append(serp)
            self.session.add(serp)
            self.session.commit()

            store_serp_result(serp, self.config)

            if serp.num_results:
                return True
            else:
                return False
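The method holds db_lock for the whole parse-and-commit step and reports success through the truthiness of serp.num_results. A rough stand-alone sketch of that shape, with the parser, session and SERP object replaced by simple stand-ins (every name below is hypothetical):

import threading
from types import SimpleNamespace

db_lock = threading.Lock()
fake_session = []                 # stands in for the sqlalchemy scoped session

def store(html, query):
    # parse only when there is something to parse, mirroring the guard above
    parsed = {'num_results': html.count('<a ')} if html else None

    with db_lock:
        serp = SimpleNamespace(query=query,
                               num_results=parsed['num_results'] if parsed else 0)
        fake_session.append(serp)         # session.add(serp); session.commit()
        return bool(serp.num_results)

print(store('<a href="#">hit</a>', 'python'))   # True
print(store('', 'empty page'))                  # False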
Example 3
    def store(self):
        """Store the parsed data in the sqlalchemy scoped session."""
        assert self.session, 'No database session.'

        if self.html:
            self.parser.parse(self.html)
        else:
            print("Nothing to parse for {keyword}! (page len = {pagelen})".
                  format(keyword=self.query, pagelen=len(self.html)))
            self.parser = None

        with self.db_lock:

            serp = parse_serp(parser=self.parser,
                              scraper=self,
                              query=self.query)

            self.scraper_search.serps.append(serp)
            self.session.add(serp)
            self.session.commit()

            store_serp_result(serp)

            if serp.num_results:
                return True
            else:
                return False
Example 4
    def run(self):

        while True:
            self.get_requests()

            if not self.requests:
                break

            self.results = self.loop.run_until_complete(asyncio.wait([r()() for r in self.requests]))

            for task in self.results[0]:
                scrape = task.result()

                if scrape:

                    if self.cache_manager:
                        self.cache_manager.cache_results(
                            scrape.parser, scrape.query, scrape.search_engine_name,
                            scrape.scrape_method, scrape.page_number)

                    if scrape.parser:
                        serp = parse_serp(self.config, parser=scrape.parser, scraper=scrape, query=scrape.query)

                        if self.scraper_search:
                            self.scraper_search.serps.append(serp)

                        if self.session:
                            self.session.add(serp)
                            self.session.commit()

                        store_serp_result(serp, self.config)
Example 5
    def parse_all_cached_files(self, scrape_jobs, session, scraper_search):
        """Walk recursively through the cachedir (as given by the Config) and parse all cached files.

        Args:
            scrape_jobs: The scrape jobs to check against the cache.
            session: An SQLAlchemy session to add the entities to.
            scraper_search: Abstract object representing the current search.

        Returns:
            The scrape jobs that couldn't be parsed from the cache directory.
        """
        files = self._get_all_cache_files()
        num_cached = num_total = 0
        mapping = {}
        for job in scrape_jobs:
            cache_name = self.cached_file_name(
                job['query'],
                job['search_engine'],
                job['scrape_method'],
                job['page_number']
            )
            mapping[cache_name] = job
            num_total += 1

        for path in files:
            # strip off the extension of the path if it has any
            fname = os.path.split(path)[1]
            clean_filename = fname
            for ext in ALLOWED_COMPRESSION_ALGORITHMS:
                if fname.endswith('.' + ext):
                    # cut the '.<ext>' suffix; rstrip() would strip characters, not a suffix
                    clean_filename = fname[:-len('.' + ext)]

            job = mapping.get(clean_filename, None)

            if job:
                # We found a file that contains the keyword, search engine name and
                # search mode that fits our description. Let's see if there is already
                # a record in the database and link it to our new ScraperSearch object.
                serp = self.get_serp_from_database(session, job['query'], job['search_engine'], job['scrape_method'], job['page_number'])

                if not serp:
                    serp = self.parse_again(fname, job['search_engine'], job['scrape_method'], job['query'])

                serp.scraper_searches.append(scraper_search)
                session.add(serp)

                if num_cached % 200 == 0:
                    session.commit()

                store_serp_result(serp, self.config)
                num_cached += 1
                scrape_jobs.remove(job)

        logger.info('{} cache files found in {}'.format(len(files), self.config.get('cachedir')))
        logger.info('{}/{} objects have been read from the cache. {} remain to get scraped.'.format(
            num_cached, num_total, num_total - num_cached))

        session.add(scraper_search)
        session.commit()

        return scrape_jobs
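The cache hit test relies on two things: a mapping from a deterministic cache file name back to its scrape job, and stripping a possible compression suffix before the lookup. Here is a small sketch of just that matching step, with a hypothetical hash-based cached_file_name standing in for the real one:

import hashlib
import os

ALLOWED_COMPRESSION_ALGORITHMS = ('gz', 'bz2')

def cached_file_name(query, search_engine, scrape_method, page_number):
    # hypothetical stand-in: a stable hash over the job parameters
    key = '{}{}{}{}'.format(query, search_engine, scrape_method, page_number)
    return hashlib.sha256(key.encode()).hexdigest()

def match_cached_files(scrape_jobs, files):
    mapping = {cached_file_name(j['query'], j['search_engine'],
                                j['scrape_method'], j['page_number']): j
               for j in scrape_jobs}
    hits = []
    for path in files:
        fname = os.path.split(path)[1]
        clean = fname
        for ext in ALLOWED_COMPRESSION_ALGORITHMS:
            if fname.endswith('.' + ext):
                clean = fname[:-len('.' + ext)]   # drop the '.gz' / '.bz2' suffix
        job = mapping.get(clean)
        if job:
            hits.append((path, job))
    return hits

job = {'query': 'python', 'search_engine': 'google',
       'scrape_method': 'http', 'page_number': 1}
name = cached_file_name('python', 'google', 'http', 1)
print(match_cached_files([job], ['/tmp/cache/' + name + '.gz']))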
Example 6
    def run(self):

        while True:
            self.get_requests()

            if not self.requests:
                break

            self.results = self.loop.run_until_complete(asyncio.wait([r()() for r in self.requests]))

            for task in self.results[0]:
                scrape = task.result()

                if scrape:

                    cache_results(scrape.parser, scrape.query, scrape.search_engine_name, scrape.scrape_method,
                                  scrape.page_number)

                    if scrape.parser:
                        serp = parse_serp(parser=scrape.parser, scraper=scrape, query=scrape.query)

                        self.scraper_search.serps.append(serp)
                        self.session.add(serp)
                        self.session.commit()

                        store_serp_result(serp)
Example 7
    def store(self):
        """Store the parsed data in the sqlalchemy scoped session."""
        assert self.session, 'No database session. Turning down.'

        with self.db_lock:
            serp = SearchEngineResultsPage(
                search_engine_name=self.search_engine,
                scrapemethod=self.scrapemethod,
                page_number=self.current_page,
                requested_at=self.current_request_time,
                requested_by=self.ip,
                query=self.current_keyword,
                num_results_for_keyword=self.parser.search_results['num_results'],
            )
            self.scraper_search.serps.append(serp)

            serp, parser = parse_serp(serp=serp, parser=self.parser)
            self.session.add(serp)
            self.session.commit()

            store_serp_result(dict_from_scraping_object(self), self.parser)
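The keyword arguments passed to SearchEngineResultsPage hint at the underlying table. The following is an assumed minimal SQLAlchemy model with those columns, purely to illustrate how the session.add/commit calls land in the database; it is not the library's actual schema:

from sqlalchemy import Column, DateTime, Integer, String, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()

class SearchEngineResultsPage(Base):
    # assumed minimal schema, derived from the constructor call above
    __tablename__ = 'serp'
    id = Column(Integer, primary_key=True)
    search_engine_name = Column(String)
    scrapemethod = Column(String)
    page_number = Column(Integer)
    requested_at = Column(DateTime)
    requested_by = Column(String)
    query = Column(String)
    num_results_for_keyword = Column(Integer)

engine = create_engine('sqlite://')          # in-memory database for the sketch
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()
session.add(SearchEngineResultsPage(query='python', page_number=1))
session.commit()
print(session.query(SearchEngineResultsPage).count())   # 1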
Example 8
    def store(self):
        """Store the parsed data in the sqlalchemy scoped session."""
        assert self.session, 'No database session.'

        if self.html:
            self.parser.parse(self.html)
        else:
            self.parser = None

        with self.db_lock:

            serp = parse_serp(parser=self.parser, scraper=self, query=self.query)

            self.scraper_search.serps.append(serp)
            self.session.add(serp)
            self.session.commit()

            store_serp_result(serp)

            if serp.num_results:
                return True
            else:
                return False
Example 9
    def store(self):
        """Store the parsed data in the sqlalchemy scoped session."""
        assert self.session, 'No database session. Turning down.'

        with self.db_lock:
            num_results = 0

            serp = SearchEngineResultsPage(
                search_engine_name=self.search_engine,
                scrapemethod=self.scrapemethod,
                page_number=self.current_page,
                requested_at=self.current_request_time,
                requested_by=self.ip,
                query=self.current_keyword,
                num_results_for_keyword=self.parser.search_results['num_results'],
            )
            self.scraper_search.serps.append(serp)

            serp, parser = parse_serp(serp=serp, parser=self.parser)
            self.session.add(serp)
            self.session.commit()

            store_serp_result(dict_from_scraping_object(self), self.parser)
Example 10
    def parse_all_cached_files(self, scrape_jobs, session, scraper_search):
        """Walk recursively through the cachedir (as given by the Config) and parse all cached files.

        Args:
            scrape_jobs: The scrape jobs to check against the cache.
            session: An SQLAlchemy session to add the entities to.
            scraper_search: Abstract object representing the current search.

        Returns:
            The scrape jobs that couldn't be parsed from the cache directory.
        """
        files = self._get_all_cache_files()
        num_cached = num_total = 0
        mapping = {}
        for job in scrape_jobs:
            cache_name = self.cached_file_name(job['query'],
                                               job['search_engine'],
                                               job['scrape_method'],
                                               job['page_number'])
            mapping[cache_name] = job
            num_total += 1

        for path in files:
            # strip off the extension of the path if it has any
            fname = os.path.split(path)[1]
            clean_filename = fname
            for ext in ALLOWED_COMPRESSION_ALGORITHMS:
                if fname.endswith('.' + ext):
                    # cut the '.<ext>' suffix; rstrip() would strip characters, not a suffix
                    clean_filename = fname[:-len('.' + ext)]

            job = mapping.get(clean_filename, None)

            if job:
                # We found a file that contains the keyword, search engine name and
                # search mode that fits our description. Let's see if there is already
                # a record in the database and link it to our new ScraperSearch object.
                serp = self.get_serp_from_database(session, job['query'],
                                                   job['search_engine'],
                                                   job['scrape_method'],
                                                   job['page_number'])

                if not serp:
                    serp = self.parse_again(fname, job['search_engine'],
                                            job['scrape_method'], job['query'])

                serp.scraper_searches.append(scraper_search)
                session.add(serp)

                if num_cached % 200 == 0:
                    session.commit()

                store_serp_result(serp, self.config)
                num_cached += 1
                scrape_jobs.remove(job)

        logger.info('{} cache files found in {}'.format(
            len(files), self.config.get('cachedir')))
        logger.info(
            '{}/{} objects have been read from the cache. {} remain to get scraped.'
            .format(num_cached, num_total, num_total - num_cached))

        session.add(scraper_search)
        session.commit()

        return scrape_jobs
Example 11
def parse_all_cached_files(keywords, search_engines, session, scraper_search):
    """Walk recursively through the cachedir (as given by the Config) and parse all cached files.

    Args:
        session: An SQLAlchemy session to add the entities to.

    Returns:
        A list of keywords that couldn't be parsed and which need to be scraped anew.
    """
    google_query_needle = re.compile(
        r'<title>(?P<kw>.*?) - Google Search</title>')

    files = _get_all_cache_files()
    mapping = {}
    scrapemethod = Config['SCRAPING'].get('scrapemethod')
    num_cached = 0
    # a keyword is requested once for each search engine
    num_total_keywords = len(keywords) * len(search_engines)

    for kw in keywords:
        for search_engine in search_engines:
            key = cached_file_name(kw, search_engine, scrapemethod)

            out('Params(keyword="{kw}", search_engine="{se}", scrapemethod="{sm}") yields {hash}'
                .format(kw=kw, se=search_engine, sm=scrapemethod, hash=key),
                lvl=5)

            mapping[key] = (kw, search_engine)

    for path in files:
        # strip off the extension of the path if it has any
        fname = os.path.split(path)[1]
        clean_filename = fname
        for ext in ALLOWED_COMPRESSION_ALGORITHMS:
            if fname.endswith('.' + ext):
                # cut the '.<ext>' suffix; rstrip() would strip characters, not a suffix
                clean_filename = fname[:-len('.' + ext)]

        query = search_engine = None
        val = mapping.get(clean_filename, None)
        if val:
            query, search_engine = val

        if query and search_engine:
            # We found a file that contains the keyword, search engine name and
            # search mode that fits our description. Let's see if there is already
            # a record in the database and link it to our new ScraperSearch object.
            serp = None  #get_serp_from_database(session, query, search_engine, scrapemethod)

            if not serp:
                serp, parser = parse_again(fname, search_engine, scrapemethod,
                                           query)

            serp.scraper_searches.append(scraper_search)
            session.add(serp)
            session.commit()

            store_serp_result(dict_from_serp_object(serp), parser)

            mapping.pop(clean_filename)
            num_cached += 1

    out('{} cache files found in {}'.format(len(files),
                                            Config['GLOBAL'].get('cachedir')),
        lvl=1)
    out('{}/{} keywords have been cached and are ready to get parsed. {} remain to get scraped.'
        .format(num_cached, num_total_keywords,
                num_total_keywords - num_cached),
        lvl=1)

    session.add(scraper_search)
    session.commit()
    # return the remaining keywords to scrape
    return [e[0] for e in mapping.values()]
Example 12
def parse_all_cached_files(keywords, search_engines, session, scraper_search):
    """Walk recursively through the cachedir (as given by the Config) and parse all cached files.

    Args:
        session: An SQLAlchemy session to add the entities to.

    Returns:
        A list of keywords that couldn't be parsed and which need to be scraped anew.
    """
    google_query_needle = re.compile(r'<title>(?P<kw>.*?) - Google Search</title>')

    files = _get_all_cache_files()
    mapping = {}
    scrapemethod = Config['SCRAPING'].get('scrapemethod')
    num_cached = 0
    # a keyword is requested once for each search engine
    num_total_keywords = len(keywords) * len(search_engines)

    for kw in keywords:
        for search_engine in search_engines:
            key = cached_file_name(kw, search_engine, scrapemethod)

            out('Params(keyword="{kw}", search_engine="{se}", scrapemethod="{sm}") yields {hash}'.format(
                    kw=kw,
                    se=search_engine,
                    sm=scrapemethod,
                    hash=key
                ), lvl=5)

            mapping[key] = (kw, search_engine)

    for path in files:
        # strip off the extension of the path if it has any
        fname = os.path.split(path)[1]
        clean_filename = fname
        for ext in ALLOWED_COMPRESSION_ALGORITHMS:
            if fname.endswith('.' + ext):
                # cut the '.<ext>' suffix; rstrip() would strip characters, not a suffix
                clean_filename = fname[:-len('.' + ext)]

        query = search_engine = None
        val = mapping.get(clean_filename, None)
        if val:
            query, search_engine = val

        if query and search_engine:
            # We found a file that contains the keyword, search engine name and
            # search mode that fits our description. Let's see if there is already
            # a record in the database and link it to our new ScraperSearch object.
            serp = None #get_serp_from_database(session, query, search_engine, scrapemethod)

            if not serp:
                serp, parser = parse_again(fname, search_engine, scrapemethod, query)

            serp.scraper_searches.append(scraper_search)
            session.add(serp)
            session.commit()

            store_serp_result(dict_from_serp_object(serp), parser)

            mapping.pop(clean_filename)
            num_cached += 1

    out('{} cache files found in {}'.format(len(files), Config['GLOBAL'].get('cachedir')), lvl=1)
    out('{}/{} keywords have been cached and are ready to get parsed. {} remain to get scraped.'.format(
        num_cached, num_total_keywords, num_total_keywords - num_cached), lvl=1)

    session.add(scraper_search)
    session.commit()
    # return the remaining keywords to scrape
    return [e[0] for e in mapping.values()]
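The return value works because every matched cache file pops its entry from mapping, so whatever remains in mapping.values() are the (keyword, search_engine) pairs that still need a live scrape. In miniature (the file names below are hypothetical):

# mapping: cache-file name -> (keyword, search_engine) for every requested pair
mapping = {
    'hash-aa': ('python', 'google'),
    'hash-bb': ('python', 'bing'),
    'hash-cc': ('asyncio', 'google'),
}

# pretend only 'hash-aa' and 'hash-cc' were found in the cache directory
for found in ('hash-aa', 'hash-cc'):
    mapping.pop(found)

# remaining keywords to scrape, exactly as in the return statement above
remaining = [e[0] for e in mapping.values()]
print(remaining)   # ['python']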