Code Example #1
    def run(self):

        while True:
            self.get_requests()

            if not self.requests:
                break

            self.results = self.loop.run_until_complete(asyncio.wait([r()() for r in self.requests]))

            for task in self.results[0]:
                scrape = task.result()

                if scrape:

                    cache_results(scrape.parser, scrape.query, scrape.search_engine_name, scrape.scrape_method,
                                  scrape.page_number)

                    if scrape.parser:
                        serp = parse_serp(parser=scrape.parser, scraper=scrape, query=scrape.query)

                        self.scraper_search.serps.append(serp)
                        self.session.add(serp)
                        self.session.commit()

                        store_serp_result(serp)
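
The run() loop above drives every pending request coroutine through a single run_until_complete() call and then reads each finished task's result. Below is a minimal, self-contained sketch of that same pattern; the fetch() coroutine and run_batch() helper are hypothetical stand-ins, not GoogleScraper's actual request objects.

import asyncio

async def fetch(query):
    # Hypothetical stand-in for a scrape request; sleeps instead of doing network I/O.
    await asyncio.sleep(0.1)
    return 'html for {}'.format(query)

def run_batch(queries):
    loop = asyncio.new_event_loop()
    tasks = [loop.create_task(fetch(q)) for q in queries]
    # asyncio.wait() returns the pair (done, pending); indexing the result with
    # [0], as in the example above, selects the set of completed tasks.
    done, pending = loop.run_until_complete(asyncio.wait(tasks))
    loop.close()
    return [task.result() for task in done]

print(run_batch(['python', 'asyncio']))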
Code Example #2
 def cache_results(self):
     """Caches the html for the current request."""
     cache_results(self.parser,
                   self.query,
                   self.search_engine_name,
                   self.scrape_method,
                   self.page_number,
                   db_lock=self.db_lock)
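
Example #2 (like Examples #11-#13 further down) is a thin wrapper that delegates to a module-level cache_results() function whose signature differs between forks. As a rough, hypothetical sketch of what such a keyword-keyed HTML cache can look like (the file-naming scheme and cache directory are assumptions, not GoogleScraper's real implementation):

import hashlib
import os

CACHEDIR = '.scrapecache'  # assumed cache directory

def cached_file_name(keyword, search_engine, scrapemethod, page_number):
    # Derive a stable file name from everything that identifies one request.
    unique = '{} {} {} {}'.format(keyword, search_engine, scrapemethod, page_number)
    return hashlib.sha256(unique.encode()).hexdigest() + '.html'

def cache_results(html, keyword, search_engine='google', scrapemethod='http', page_number=1):
    """Illustrative sketch only: write the raw html of one SERP page to the cache directory."""
    os.makedirs(CACHEDIR, exist_ok=True)
    path = os.path.join(CACHEDIR,
                        cached_file_name(keyword, search_engine, scrapemethod, page_number))
    with open(path, 'w', encoding='utf-8') as fh:
        fh.write(html)
    return path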
Code Example #3
File: scraping.py Project: Julienh/GoogleScraper
    def _search(self, searchtype='normal'):
        """The actual search and parsing of the results.

        Private, internal method.
        Parsing is done with lxml and cssselect. The html structure of the Google Search
        results may change over time. Effective: February 2014

        There are several parts of a SERP results page that the average user is most likely interested in:

        (Probably in this order)
        - Non-advertisement links, as well as their little snippet and title
        - The message that indicates how many results were found. For example: "About 834,000,000 results (0.39 seconds)"
        - Advertisement search results (links, titles, snippets like above)

        Problem: This data comes in a wide range of different formats, depending on the parameters set in the search.
        Investigations into the different formats are done in the directory tests/serp_formats.

        """
        self._build_query(searchtype)

        # After building the query, all parameters are set, so we know what we're requesting.
        logger.debug("Created new GoogleScrape object with searchparams={}".format(pprint.pformat(self.search_params)))

        html = get_cached(self.search_query, Config['GLOBAL'].get('base_search_url'), params=self.search_params)
        self.search_results['cache_file'] = os.path.join(Config['GLOBAL'].get('cachedir'), cached_file_name(self.search_query, Config['GLOBAL'].get('base_search_url'), self.search_params))

        if not html:
            try:
                r = self.requests.get(Config['GLOBAL'].get('base_search_url'), headers=self._HEADERS,
                                 params=self.search_params, timeout=3.0)

                logger.debug("Scraped with url: {} and User-Agent: {}".format(r.url, self._HEADERS['User-Agent']))

            except self.requests.ConnectionError as ce:
                logger.error('Network problem occurred {}'.format(ce))
                raise ce
            except self.requests.Timeout as te:
                logger.error('Connection timeout {}'.format(te))
                raise te

            if not r.ok:
                logger.error('HTTP Error: {}'.format(r.status_code))
                if str(r.status_code)[0] == '5':
                    print('Maybe google recognizes you as sneaky spammer after'
                          ' you requested their services too inexhaustibly :D')
                return False

            html = r.text

            if Config['HTTP'].getboolean('view', False):
                self.browserview(html)

            # cache fresh results
            cache_results(html, self.search_query, url=Config['GLOBAL'].get('base_search_url'), params=self.search_params)
            self.search_results['cache_file'] = os.path.join(Config['GLOBAL'].get('cachedir'), cached_file_name(self.search_query, Config['GLOBAL'].get('base_search_url'), self.search_params))

        self.parser = GoogleParser(html, searchtype=self.searchtype)
        self.search_results.update(self.parser.all_results)
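
Example #3 implements a read-through cache: get_cached() is tried first, a live requests.get() only runs on a miss, and the fresh html is written back with cache_results() so the next run is served from disk. A compact, self-contained sketch of that control flow follows; fetch_serp(), the hashing scheme, and the cache directory are placeholders, and the original's logging and error handling are omitted.

import hashlib
import os
import requests

CACHEDIR = '.scrapecache'  # assumed cache directory

def fetch_serp(query, base_url, params, headers):
    """Return the SERP html for `query`, consulting a local file cache first."""
    key = hashlib.sha256('{} {} {}'.format(query, base_url, sorted(params.items()))
                         .encode()).hexdigest()
    path = os.path.join(CACHEDIR, key + '.html')

    # 1. Cache hit: skip the network entirely.
    if os.path.exists(path):
        with open(path, encoding='utf-8') as fh:
            return fh.read()

    # 2. Cache miss: perform the real request.
    r = requests.get(base_url, headers=headers, params=params, timeout=3.0)
    if not r.ok:
        return None

    # 3. Write the fresh html back so the next identical query hits the cache.
    os.makedirs(CACHEDIR, exist_ok=True)
    with open(path, 'w', encoding='utf-8') as fh:
        fh.write(r.text)
    return r.text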
Code Example #4
File: scraping.py Project: bowlofstew/GoogleScraper
    def search(self):
        """Search with webdriver.

        Fills out the search form of the search engine for each keyword.
        Clicks the next link while num_pages_per_keyword is not reached.
        """
        n = 0

        for self.current_keyword in self.keywords:

            super().next_keyword_info(n)

            self.search_input = self._wait_until_search_input_field_appears()

            if self.search_input is False:
                self.search_input = self.handle_request_denied()

            if self.search_input:
                self.search_input.clear()
                time.sleep(.25)
                self.search_input.send_keys(self.current_keyword + Keys.ENTER)
            else:
                raise GoogleSearchError('Cannot get handle to the input form!')

            for self.current_page in range(1, self.num_pages_per_keyword + 1):
                # Waiting until the keyword appears in the title may
                # not be enough. The content may still be from the old page.
                try:
                    WebDriverWait(self.webdriver, 5).until(EC.title_contains(self.current_keyword))
                except TimeoutException as e:
                    logger.error(SeleniumSearchError('Keyword "{}" not found in title: {}'.format(self.current_keyword, self.webdriver.title)))
                    break

                # match the largest sleep range
                sleep_time = random.randrange(*self._largest_sleep_range(self.search_number))
                time.sleep(sleep_time)

                html = self.webdriver.page_source

                self.parser.parse(html)
                self.store()
                out(str(self.parser), lvl=2)

                # Lock in case two threads write to the same file (unlikely)
                with self.cache_lock:
                    cache_results(html, self.current_keyword, self.search_engine, self.scrapemethod)

                self.search_number += 1

                # Click the next page link, but not when leaving the loop
                if self.current_page < self.num_pages_per_keyword:
                    self.next_url = self._goto_next_page()

                    if not self.next_url:
                        break

            n += 1
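
The selenium-based examples on this page rely on explicit waits: EC.presence_of_element_located() to get a handle on the search box and EC.title_contains() to make sure page_source already belongs to the new keyword. A stand-alone illustration of that pattern follows; the browser, start URL and keyword are placeholders.

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Firefox()          # any webdriver will do
driver.get('https://www.google.com')  # placeholder start page

try:
    # Block for up to 5 seconds until the search box is present in the DOM.
    search_input = WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((By.NAME, 'q')))
    search_input.clear()
    search_input.send_keys('selenium explicit waits' + Keys.ENTER)

    # Wait again until the page title reflects the keyword; otherwise
    # page_source may still belong to the previous page.
    WebDriverWait(driver, 5).until(EC.title_contains('selenium'))
    html = driver.page_source
except TimeoutException:
    html = None
finally:
    driver.quit()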
Code Example #5
File: scraping.py Project: vgoklani/GoogleScraper
    def search(self):
        """Search with webdriver."""

            # NOTE: this excerpt starts inside the per-keyword loop of the original
            # method, so i, kw, page_num, write_kw and next_url are loop state; the
            # argument of the webdriver.get() call below is truncated in the excerpt.
            self.webdriver.get(self.)
            # match the largest sleep range
            j = random.randrange(*self._largest_sleep_range(i))
            if self.proxy:
                logger.info('[i] Page number={}, ScraperThread({url}) ({ip}:{port} {} is sleeping for {} seconds...Next keyword: ["{kw}"]'.format(page_num, self._ident, j, url= next_url, ip=self.proxy.host, port=self.proxy.port, kw=kw))
            else:
                logger.info('[i] Page number={}, ScraperThread({url}) ({} is sleeping for {} seconds...Next keyword: ["{}"]'.format(page_num, self._ident, j, kw, url=next_url))
            time.sleep(j)
            try:
                self.element = WebDriverWait(self.webdriver, 10).until(EC.presence_of_element_located((By.NAME, "q")))
            except TimeoutException as e:
                if not self.handle_request_denied():
                    open('/tmp/out.png', 'wb').write(self.webdriver.get_screenshot_as_png())
                    raise GoogleSearchError('`q` search input cannot be found.')

            if write_kw:
                self.element.clear()
                time.sleep(.25)
                self.element.send_keys(kw + Keys.ENTER)
                write_kw = False
            # Waiting until the keyword appears in the title may
            # not be enough. The content may still be from the old page.
            try:
                WebDriverWait(self.webdriver, 10).until(EC.title_contains(kw))
            except TimeoutException as e:
                logger.debug('Keyword not found in title: {}'.format(e))

            try:
                # wait until the next page link emerges
                WebDriverWait(self.webdriver, 5).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#pnnext')))
                next_url = self.webdriver.find_element_by_css_selector('#pnnext').get_attribute('href')
            except TimeoutException as te:
                logger.debug('Cannot locate next page html id #pnnext')
            except WebDriverException as e:
                # leave if no next results page is available
                pass

            # Sleep explicitly so the site and any JavaScript that loads content
            # dynamically has time to update the DOM.
            time.sleep(1.5)

            html = self._maybe_crop(self.webdriver.page_source)

            if self.rlock or self.queue:
                # Lock in case two threads write to the same file (unlikely)
                self.rlock.acquire()
                cache_results(html, kw, self.url)
                self.rlock.release()
                # commit in intervals specified in the config
                self.queue.put(self._get_parse_links(html, kw, page_num=page_num+1, ip=self.ip))

            self._results.append(self._get_parse_links(html, kw, only_results=True).all_results)
Code Example #6
File: scraping.py Project: purejade/GoogleScraper
    def search(self):
        """Search with webdriver.

        Called within the blocking_search search loop.

        """
        for self.current_keyword in self.keywords:

            for self.current_page in range(1, self.num_pages_per_keyword + 1):
                # match the largest sleep range
                sleep_time = random.randrange(*self._largest_sleep_range(self.search_number))

                time.sleep(sleep_time)

                # Waiting until the keyword appears in the title may
                # not be enough. The content may still be from the old page.
                try:
                    WebDriverWait(self.webdriver, 5).until(EC.title_contains(self.current_keyword))
                except TimeoutException as e:
                    logger.error(SeleniumSearchError('Keyword "{}" not found in title: {}'.format(self.current_keyword, self.webdriver.title)))


                html = self.webdriver.page_source

                self.parser.parse(html)
                self.store()
                out(str(self.parser), lvl=2)

                # Lock in case two threads write to the same file (unlikely)
                with self.cache_lock:
                    cache_results(html, self.current_keyword, self.search_engine, self.scrapemethod)

                self.search_number += 1

                if self.current_page > 1:
                    self.next_url = self._goto_next_page()

            try:
                self.search_input = WebDriverWait(self.webdriver, 5).until(
                    EC.presence_of_element_located(self._get_search_input_field()))
            except TimeoutException as e:
                logger.error(e)
                if not self.handle_request_denied():
                    open('/tmp/out.png', 'wb').write(self.webdriver.get_screenshot_as_png())
                    raise GoogleSearchError('search input field cannot be found.')

            if self.search_input:
                self.search_input.clear()
                time.sleep(.25)
                self.search_input.send_keys(self.current_keyword + Keys.ENTER)
Code Example #7
    def search(self):
        """Search with webdriver."""

        next_url = None

        # match the largest sleep range
        sleep_time = random.randrange(*self._largest_sleep_range(self.search_number))

        # log stuff if verbosity is set accordingly
        if Config['GLOBAL'].getint('verbosity', 1) > 1:
            if self.proxy:
                logger.info('[i] Page number={}, ScraperThread({url}) ({ip}:{port} {} is sleeping for {} seconds...Next keyword: ["{kw}"]'.format(self.current_page, self._ident, sleep_time, url=next_url, ip=self.proxy.host, port=self.proxy.port, kw=self.current_keyword))
            else:
                logger.info('[i] Page number={}, ScraperThread({url}) ({} is sleeping for {} seconds...Next keyword: ["{}"]'.format(self.current_page, self._ident, sleep_time, self.current_keyword, url=next_url))

        time.sleep(sleep_time)

        try:
            self.search_input = WebDriverWait(self.webdriver, 5).until(EC.presence_of_element_located(self._get_search_input_field()))
        except TimeoutException as e:
            logger.error(e)
            if not self.handle_request_denied():
                open('/tmp/out.png', 'wb').write(self.webdriver.get_screenshot_as_png())
                raise GoogleSearchError('search input field cannot be found.')

        # Waiting until the keyword appears in the title may
        # not be enough. The content may still be from the old page.
        try:
            WebDriverWait(self.webdriver, 5).until(EC.title_contains(self.current_keyword))
        except TimeoutException as e:
            raise SeleniumSearchError('Keyword not found in title: {}'.format(e))

        next_url = self._get_next_page_url()

        # Sleep explicitly so the site and any JavaScript that loads content
        # dynamically has time to update the DOM.
        time.sleep(1.5)

        html = self.webdriver.page_source

        self.parser.parse(html)
        self.store()
        print(self.parser)

        # Lock in case two threads write to the same file (unlikely)
        with (yield from self.cache_lock):
            cache_results(html, self.current_keyword, self.search_engine, self.scrapemethod)

        self.search_number += 1
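
The `with (yield from self.cache_lock)` line above is the pre-Python-3.5, generator-based way of holding an asyncio lock while the cache file is written. With current syntax the same guard is written with `async with`; a minimal sketch (the cache-writing body is a hypothetical stand-in):

import asyncio

async def write_to_cache(lock, html, keyword):
    # Only one coroutine at a time may write a cache file for this run.
    async with lock:
        with open('cache_{}.html'.format(keyword), 'w', encoding='utf-8') as fh:
            fh.write(html)

async def main():
    cache_lock = asyncio.Lock()
    await asyncio.gather(
        write_to_cache(cache_lock, '<html>a</html>', 'python'),
        write_to_cache(cache_lock, '<html>b</html>', 'asyncio'),
    )

asyncio.run(main())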
Code Example #8
File: scraping.py Project: manugarri/GoogleScraper
    def search(self, *args, rand=False, **kwargs):
        """The actual search for the search engine."""

        self.build_search()

        if rand:
            self.headers['User-Agent'] = random.choice(self.USER_AGENTS)

        html = get_cached(self.current_keyword,
                          self.base_search_url,
                          params=self.search_params)

        if not html:
            try:
                if Config['GLOBAL'].getint('verbosity', 0) > 1:
                    logger.info(
                        '[HTTP] Base_url: {base_url}, headers={headers}, params={params}'
                        .format(base_url=self.base_search_url,
                                headers=self.headers,
                                params=self.search_params))

                r = self.requests.get(self.base_search_url,
                                      headers=self.headers,
                                      params=self.search_params,
                                      timeout=3.0)

            except self.requests.ConnectionError as ce:
                logger.error('Network problem occurred {}'.format(ce))
                raise ce
            except self.requests.Timeout as te:
                logger.error('Connection timeout {}'.format(te))
                raise te

            if not r.ok:
                logger.error('HTTP Error: {}'.format(r.status_code))
                self.handle_request_denied(r.status_code)
                return False

            html = r.text

            # cache fresh results
            cache_results(html,
                          self.current_keyword,
                          url=self.base_search_url,
                          params=self.search_params)

        self.parser.parse(html)
        self.store()
        print(self.parser)
Code Example #9
File: scraping.py Project: bowlofstew/GoogleScraper
    def search(self, *args, rand=False, **kwargs):
        """The actual search for the search engine."""

        self.build_search()

        if rand:
            self.headers['User-Agent'] = random.choice(self.USER_AGENTS)

        try:
            out('[HTTP - {proxy}] Base_url: {base_url}, headers={headers}, params={params}'.format(
                proxy=self.proxy,
                base_url=self.base_search_url,
                headers=self.headers,
                params=self.search_params),
            lvl=3)

            super().next_keyword_info(self.n)

            request = self.requests.get(self.base_search_url, headers=self.headers,
                             params=self.search_params, timeout=3.0)

        except self.requests.ConnectionError as ce:
            logger.error('Network problem occurred {}'.format(ce))
            raise ce
        except self.requests.Timeout as te:
            logger.error('Connection timeout {}'.format(te))
            raise te

        if not request.ok:
            logger.error('HTTP Error: {}'.format(request.status_code))
            self.handle_request_denied(request.status_code)
            return False

        html = request.text

        # cache fresh results
        with self.cache_lock:
            cache_results(html, self.current_keyword, self.search_engine, self.scrapemethod)

        self.parser.parse(html)
        self.store()
        out(str(self.parser), lvl=2)

        self.n += 1
Code Example #10
File: scraping.py Project: vgoklani/GoogleScraper
    def search(self, *args, rand=False, **kwargs):
        """The actual search for the search engine."""

        self.build_search()

        if rand:
            self.headers['User-Agent'] = random.choice(self.USER_AGENTS)

        html = get_cached(self.current_keyword, self.base_search_url, params=self.search_params)

        if not html:
            try:
                if Config['GLOBAL'].getint('verbosity', 0) > 1:
                    logger.info('[HTTP] Base_url: {base_url}, headers={headers}, params={params}'.format(
                        base_url=self.base_search_url,
                        headers=self.headers,
                        params=self.search_params)
                    )

                r = self.requests.get(self.base_search_url, headers=self.headers,
                                 params=self.search_params, timeout=3.0)

            except self.requests.ConnectionError as ce:
                logger.error('Network problem occurred {}'.format(ce))
                raise ce
            except self.requests.Timeout as te:
                logger.error('Connection timeout {}'.format(te))
                raise te

            if not r.ok:
                logger.error('HTTP Error: {}'.format(r.status_code))
                self.handle_request_denied(r.status_code)
                return False

            html = r.text

            # cache fresh results
            cache_results(html, self.current_keyword, url=self.base_search_url, params=self.search_params)

        self.parser.parse(html)

        # TODO: remove it and save it to a data storage
        print(self.parser)
Code Example #11
File: scraping.py Project: Annie201/GoogleScraper
 def cache_results(self):
     """Caches the html for the current request."""
     cache_results(self.parser, self.query, self.search_engine_name, self.scrape_method, self.page_number,
                   db_lock=self.db_lock)
Code Example #12
File: scraping.py Project: sisteamnik/GoogleScraper
 def cache_results(self):
     """Caches the html for the current request."""
     if Config['GLOBAL'].getboolean('do_caching', False):
         with self.cache_lock:
             cache_results(self.parser.cleaned_html, self.current_keyword,
                           self.search_engine, self.scrapemethod)
Code Example #13
File: scraping.py Project: Wkryst/GoogleScraper
 def cache_results(self):
     """Caches the html for the current request."""
     if Config['GLOBAL'].getboolean('do_caching', False):
         with self.cache_lock:
             cache_results(self.parser.cleaned_html, self.current_keyword, self.search_engine, self.scrapemethod)
Code Example #14
File: scraping.py Project: csrgxtu/gps
    def _search(self, searchtype='normal'):
        """The actual search and parsing of the results.

        Private, internal method.
        Parsing is done with lxml and cssselect. The html structure of the Google Search
        results may change over time. Effective: February 2014

        There are several parts of a SERP results page that the average user is most likely interested in:

        (Probably in this order)
        - Non-advertisement links, as well as their little snippet and title
        - The message that indicates how many results were found. For example: "About 834,000,000 results (0.39 seconds)"
        - Advertisement search results (links, titles, snippets like above)

        Problem: This data comes in a wide range of different formats, depending on the parameters set in the search.
        Investigations into the different formats are done in the directory tests/serp_formats.

        """
        self._build_query(searchtype)

        # After building the query, all parameters are set, so we know what we're requesting.
        logger.debug(
            "Created new GoogleScrape object with searchparams={}".format(
                pprint.pformat(self.search_params)))

        html = get_cached(self.search_query,
                          Config['GLOBAL'].get('base_search_url'),
                          params=self.search_params)
        self.search_results['cache_file'] = os.path.join(
            Config['GLOBAL'].get('cachedir'),
            cached_file_name(self.search_query,
                             Config['GLOBAL'].get('base_search_url'),
                             self.search_params))

        if not html:
            try:
                r = self.requests.get(Config['GLOBAL'].get('base_search_url'),
                                      headers=self._HEADERS,
                                      params=self.search_params,
                                      timeout=3.0)

                logger.debug("Scraped with url: {} and User-Agent: {}".format(
                    r.url, self._HEADERS['User-Agent']))

            except self.requests.ConnectionError as ce:
                logger.error('Network problem occurred {}'.format(ce))
                raise ce
            except self.requests.Timeout as te:
                logger.error('Connection timeout {}'.format(te))
                raise te

            if not r.ok:
                logger.error('HTTP Error: {}'.format(r.status_code))
                if str(r.status_code)[0] == '5':
                    print('Maybe google recognizes you as sneaky spammer after'
                          ' you requested their services too inexhaustibly :D')
                return False

            html = r.text

            if Config['HTTP'].getboolean('view', False):
                self.browserview(html)

            # cache fresh results
            cache_results(html,
                          self.search_query,
                          url=Config['GLOBAL'].get('base_search_url'),
                          params=self.search_params)
            self.search_results['cache_file'] = os.path.join(
                Config['GLOBAL'].get('cachedir'),
                cached_file_name(self.search_query,
                                 Config['GLOBAL'].get('base_search_url'),
                                 self.search_params))

        self.parser = GoogleParser(html, searchtype=self.searchtype)
        self.search_results.update(self.parser.all_results)
Code Example #15
File: scraping.py Project: manugarri/GoogleScraper
    def search(self):
        """Search with webdriver."""

        next_url = None

        # match the largest sleep range
        sleep_time = random.randrange(
            *self._largest_sleep_range(self.search_number))

        # log stuff if verbosity is set accordingly
        if Config['GLOBAL'].getint('verbosity', 1) > 1:
            if self.proxy:
                logger.info(
                    '[i] Page number={}, ScraperThread({url}) ({ip}:{port} {} is sleeping for {} seconds...Next keyword: ["{kw}"]'
                    .format(self.current_page,
                            self._ident,
                            sleep_time,
                            url=next_url,
                            ip=self.proxy.host,
                            port=self.proxy.port,
                            kw=self.current_keyword))
            else:
                logger.info(
                    '[i] Page number={}, ScraperThread({url}) ({} is sleeping for {} seconds...Next keyword: ["{}"]'
                    .format(self.current_page,
                            self._ident,
                            sleep_time,
                            self.current_keyword,
                            url=next_url))

        time.sleep(sleep_time)

        try:
            self.search_input = WebDriverWait(self.webdriver, 5).until(
                EC.presence_of_element_located(self._get_search_input_field()))
        except TimeoutException as e:
            logger.error(e)
            if not self.handle_request_denied():
                open('/tmp/out.png',
                     'wb').write(self.webdriver.get_screenshot_as_png())
                raise GoogleSearchError('search input field cannot be found.')

        # Waiting until the keyword appears in the title may
        # not be enough. The content may still be from the old page.
        try:
            WebDriverWait(self.webdriver,
                          5).until(EC.title_contains(self.current_keyword))
        except TimeoutException as e:
            raise SeleniumSearchError(
                'Keyword not found in title: {}'.format(e))

        next_url = self._get_next_page_url()

        # Sleep explicitly so the site and any JavaScript that loads content
        # dynamically has time to update the DOM.
        time.sleep(1.5)

        html = self.webdriver.page_source

        self.parser.parse(html)
        self.store()
        print(self.parser)

        if self.rlock:
            # Lock in case two threads write to the same file (unlikely)
            self.rlock.acquire()
            cache_results(html, self.current_keyword,
                          next_url if next_url else self.starting_point)
            self.rlock.release()

        self.search_number += 1
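
Example #15 brackets the cache write with explicit rlock.acquire()/release() calls; most of the other examples use the equivalent but exception-safe `with` statement instead. A small sketch of that idiom with a hypothetical write_cache() helper:

import threading

cache_lock = threading.RLock()

def write_cache(html, keyword):
    # The lock is released on exit from the `with` block even if the write raises,
    # which a bare acquire()/release() pair does not guarantee.
    with cache_lock:
        with open('cache_{}.html'.format(keyword), 'w', encoding='utf-8') as fh:
            fh.write(html)

threads = [threading.Thread(target=write_cache, args=('<html></html>', kw))
           for kw in ('python', 'threading')]
for t in threads:
    t.start()
for t in threads:
    t.join()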