Example #1
    def _search(self, searchtype='normal'):
        """The actual search and parsing of the results.

        Private, internal method.
        Parsing is done with lxml and cssselect. The html structure of the Google Search
        results may change over time. Effective: February 2014

        There are several parts of a SERP results page that the average user is most
        likely interested in (probably in this order):

        - Non-advertisement links, along with their snippets and titles
        - The message that indicates how many results were found. For example: "About 834,000,000 results (0.39 seconds)"
        - Advertisement search results (links, titles and snippets as above)

        Problem: This data comes in a wide range of different formats, depending on the parameters set in the search.
        Investigations of the different formats are collected in the directory tests/serp_formats.

        """
        self._build_query(searchtype)

        # After building the query, all parameters are set, so we know what we're requesting.
        logger.debug("Created new GoogleScrape object with searchparams={}".format(pprint.pformat(self.search_params)))

        html = get_cached(self.search_query, Config['GLOBAL'].get('base_search_url'), params=self.search_params)
        self.search_results['cache_file'] = os.path.join(
            Config['GLOBAL'].get('cachedir'),
            cached_file_name(self.search_query,
                             Config['GLOBAL'].get('base_search_url'),
                             self.search_params))

        if not html:
            try:
                r = self.requests.get(Config['GLOBAL'].get('base_search_url'),
                                      headers=self._HEADERS,
                                      params=self.search_params,
                                      timeout=3.0)

                logger.debug("Scraped with url: {} and User-Agent: {}".format(r.url, self._HEADERS['User-Agent']))

            except self.requests.ConnectionError as ce:
                logger.error('Network problem occurred: {}'.format(ce))
                raise
            except self.requests.Timeout as te:
                logger.error('Connection timeout: {}'.format(te))
                raise

            if not r.ok:
                logger.error('HTTP Error: {}'.format(r.status_code))
                if 500 <= r.status_code < 600:
                    print('Maybe Google recognizes you as a sneaky spammer after'
                          ' you requested their services too zealously :D')
                return False

            html = r.text

            if Config['HTTP'].getboolean('view', False):
                self.browserview(html)

            # cache fresh results
            cache_results(html,
                          self.search_query,
                          url=Config['GLOBAL'].get('base_search_url'),
                          params=self.search_params)
            self.search_results['cache_file'] = os.path.join(
                Config['GLOBAL'].get('cachedir'),
                cached_file_name(self.search_query,
                                 Config['GLOBAL'].get('base_search_url'),
                                 self.search_params))

        self.parser = GoogleParser(html, searchtype=self.searchtype)
        self.search_results.update(self.parser.all_results)
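
The docstring above names lxml and cssselect as the parsing stack, but the GoogleParser internals are not shown in these excerpts. Below is a minimal sketch of what such a parser might do; the CSS selectors ('li.g', 'h3 a', 'span.st', '#resultStats') are assumptions for illustration, since Google's real markup changes over time:

import lxml.html  # requires the cssselect package for .cssselect()

def parse_serp(html):
    """Extract organic links, titles and snippets from a SERP page.

    Illustrative sketch only: the selectors below are assumed,
    not guaranteed markup.
    """
    dom = lxml.html.fromstring(html)
    results = []
    for container in dom.cssselect('li.g'):   # assumed result container
        links = container.cssselect('h3 a')   # assumed title link
        snippets = container.cssselect('span.st')  # assumed snippet
        if links:
            results.append({
                'link': links[0].get('href'),
                'title': links[0].text_content(),
                'snippet': snippets[0].text_content() if snippets else '',
            })
    # The "About 834,000,000 results (0.39 seconds)" message is a
    # separate element on the page.
    stats = dom.cssselect('#resultStats')  # assumed selector
    num_results = stats[0].text_content() if stats else ''
    return {'results': results, 'num_results_for_kw': num_results}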
Example #2
    def search(self, *args, rand=False, **kwargs):
        """The actual search for the search engine."""

        self.build_search()

        if rand:
            self.headers['User-Agent'] = random.choice(self.USER_AGENTS)

        html = get_cached(self.current_keyword,
                          self.base_search_url,
                          params=self.search_params)

        if not html:
            try:
                if Config['GLOBAL'].getint('verbosity', 0) > 1:
                    logger.info(
                        '[HTTP] Base_url: {base_url}, headers={headers}, params={params}'
                        .format(base_url=self.base_search_url,
                                headers=self.headers,
                                params=self.search_params))

                r = self.requests.get(self.base_search_url,
                                      headers=self.headers,
                                      params=self.search_params,
                                      timeout=3.0)

            except self.requests.ConnectionError as ce:
                logger.error('Network problem occurred: {}'.format(ce))
                raise
            except self.requests.Timeout as te:
                logger.error('Connection timeout: {}'.format(te))
                raise

            if not r.ok:
                logger.error('HTTP Error: {}'.format(r.status_code))
                self.handle_request_denied(r.status_code)
                return False

            html = r.text

            # cache fresh results
            cache_results(html,
                          self.current_keyword,
                          url=self.base_search_url,
                          params=self.search_params)

        self.parser.parse(html)
        self.store()
        print(self.parser)
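
The helpers get_cached, cache_results and cached_file_name are not defined in these excerpts. The sketch below shows one way a file cache keyed on the query, URL and parameters could satisfy the calls in Examples #1, #2 and #4 (Example #3 passes the search engine and scrape method instead); the md5 naming scheme and the '.scrapecache' default directory are assumptions:

import hashlib
import os

CACHEDIR = '.scrapecache'  # assumed default; normally read from Config

def cached_file_name(query, url, params):
    # Derive a stable file name from everything that shapes the request.
    key = query + url + str(sorted((params or {}).items()))
    return hashlib.md5(key.encode('utf-8')).hexdigest() + '.cache'

def get_cached(query, url, params=None):
    """Return cached html for this request, or None on a cache miss."""
    path = os.path.join(CACHEDIR, cached_file_name(query, url, params))
    if os.path.isfile(path):
        with open(path, encoding='utf-8') as f:
            return f.read()
    return None

def cache_results(html, query, url, params=None):
    """Store freshly scraped html so the next run can skip the request."""
    os.makedirs(CACHEDIR, exist_ok=True)
    path = os.path.join(CACHEDIR, cached_file_name(query, url, params))
    with open(path, 'w', encoding='utf-8') as f:
        f.write(html)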
Example #3
    def search(self, *args, rand=False, **kwargs):
        """The actual search for the search engine."""

        self.build_search()

        if rand:
            self.headers['User-Agent'] = random.choice(self.USER_AGENTS)

        html = get_cached(self.current_keyword, self.search_engine, 'http')

        if not html:
            try:
                if Config['GLOBAL'].getint('verbosity', 0) > 1:
                    logger.info('[HTTP] Base_url: {base_url}, headers={headers}, params={params}'.format(
                        base_url=self.base_search_url,
                        headers=self.headers,
                        params=self.search_params)
                    )

                request = self.requests.get(self.base_search_url,
                                            headers=self.headers,
                                            params=self.search_params,
                                            timeout=3.0)

            except self.requests.ConnectionError as ce:
                logger.error('Network problem occurred: {}'.format(ce))
                raise
            except self.requests.Timeout as te:
                logger.error('Connection timeout: {}'.format(te))
                raise

            if not request.ok:
                logger.error('HTTP Error: {}'.format(request.status_code))
                self.handle_request_denied(request.status_code)
                return False

            html = request.text

            # cache fresh results
            with self.cache_lock:
                cache_results(html, self.current_keyword, self.search_engine, self.scrapemethod)

        self.parser.parse(html)
        self.store()
        out(str(self.parser), lvl=2)
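
Example #3 is the only variant that wraps cache_results in self.cache_lock, which matters once several scraper threads share one cache directory: without the lock, two threads could write the same cache file at the same time. A minimal sketch of how such a lock might be shared, assuming a hypothetical HttpScraper class rather than the project's actual one:

import threading

cache_lock = threading.Lock()  # one lock shared by every scraper thread

class HttpScraper(threading.Thread):  # hypothetical class name
    def __init__(self, keyword):
        super().__init__()
        self.current_keyword = keyword
        self.cache_lock = cache_lock  # all instances share the same lock

    def run(self):
        html = '<html>...</html>'  # stands in for the scraped page
        with self.cache_lock:
            # Only one thread at a time may write cache files.
            print('caching results for', self.current_keyword)

threads = [HttpScraper(kw) for kw in ('apple', 'banana')]
for t in threads:
    t.start()
for t in threads:
    t.join()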
Example #4
    def _search(self, searchtype='normal'):
        """The actual search and parsing of the results.

        Private, internal method.
        Parsing is done with lxml and cssselect. The html structure of the Google Search
        results may change over time. Effective: February 2014

        There are several parts of a SERP results page that the average user is most
        likely interested in (probably in this order):

        - Non-advertisement links, along with their snippets and titles
        - The message that indicates how many results were found. For example: "About 834,000,000 results (0.39 seconds)"
        - Advertisement search results (links, titles and snippets as above)

        Problem: This data comes in a wide range of different formats, depending on the parameters set in the search.
        Investigations of the different formats are collected in the directory tests/serp_formats.

        """
        self._build_query(searchtype)

        # After building the query, all parameters are set, so we know what we're requesting.
        logger.debug(
            "Created new GoogleScrape object with searchparams={}".format(
                pprint.pformat(self.search_params)))

        html = get_cached(self.search_query,
                          Config['GLOBAL'].get('base_search_url'),
                          params=self.search_params)
        self.search_results['cache_file'] = os.path.join(
            Config['GLOBAL'].get('cachedir'),
            cached_file_name(self.search_query,
                             Config['GLOBAL'].get('base_search_url'),
                             self.search_params))

        if not html:
            try:
                r = self.requests.get(Config['GLOBAL'].get('base_search_url'),
                                      headers=self._HEADERS,
                                      params=self.search_params,
                                      timeout=3.0)

                logger.debug("Scraped with url: {} and User-Agent: {}".format(
                    r.url, self._HEADERS['User-Agent']))

            except self.requests.ConnectionError as ce:
                logger.error('Network problem occurred: {}'.format(ce))
                raise
            except self.requests.Timeout as te:
                logger.error('Connection timeout: {}'.format(te))
                raise

            if not r.ok:
                logger.error('HTTP Error: {}'.format(r.status_code))
                if 500 <= r.status_code < 600:
                    print('Maybe Google recognizes you as a sneaky spammer after'
                          ' you requested their services too zealously :D')
                return False

            html = r.text

            if Config['HTTP'].getboolean('view', False):
                self.browserview(html)

            # cache fresh results
            cache_results(html,
                          self.search_query,
                          url=Config['GLOBAL'].get('base_search_url'),
                          params=self.search_params)
            self.search_results['cache_file'] = os.path.join(
                Config['GLOBAL'].get('cachedir'),
                cached_file_name(self.search_query,
                                 Config['GLOBAL'].get('base_search_url'),
                                 self.search_params))

        self.parser = GoogleParser(html, searchtype=self.searchtype)
        self.search_results.update(self.parser.all_results)
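
Examples #2 and #3 delegate blocked requests to self.handle_request_denied, while Examples #1 and #4 only print a joke on 5xx responses. The hook itself is not shown in these excerpts; below is a minimal sketch of what it might do, with the back-off policy being an assumption (the real project may detect captchas, rotate proxies, etc.):

import logging
import time

logger = logging.getLogger(__name__)

def handle_request_denied(status_code):
    """React to a blocked or failed search request (sketch only)."""
    if status_code == 429 or 500 <= status_code < 600:
        # The engine is likely throttling us; back off before retrying.
        logger.warning('Request denied (%s), backing off', status_code)
        time.sleep(30)
    else:
        logger.error('Unexpected HTTP status: %s', status_code)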