Example #1
    def _get_parse_links(self, data, kw, only_results=False, page_num=1, ip='127.0.0.1'):
        """Act the same as _parse_links, but just return the db data instead of inserting data into a connection or
        or building actual queries.

        [[lastrowid]] needs to be replaced with the last rowid from the database when inserting.

        Not secure against sql injections from google ~_~
        """

        parser = GoogleParser(data)
        if only_results:
            return parser

        results = parser.links
        first = (page_num,
                 time.asctime(),
                 len(results),
                 parser.num_results() or '',
                 kw,
                 ip)

        second = []
        for result in results:
            second.append([
                result.link_title,
                result.link_url.geturl(),
                result.link_snippet,
                result.link_position,
                result.link_url.hostname
            ])

        return (first, second)
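
A hypothetical caller sketch (not part of the project) showing how the returned
tuples line up with the INSERT statements in the parse_links example below;
cur.lastrowid fills the [[lastrowid]] placeholder mentioned in the docstring:

    def insert_serp(conn, first, second):
        """Persist the (first, second) tuples returned by _get_parse_links."""
        cur = conn.execute(
            'INSERT INTO serp_page (page_number, requested_at, num_results, '
            'num_results_for_kw_google, search_query, requested_by) '
            'VALUES (?, ?, ?, ?, ?, ?)', first)
        serp_id = cur.lastrowid  # replaces the [[lastrowid]] placeholder
        conn.executemany(
            'INSERT INTO link (title, url, snippet, rank, domain, serp_id) '
            'VALUES (?, ?, ?, ?, ?, ?)',
            [row + [serp_id] for row in second])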
Example #2
def parse_links(data, conn, kw, page_num=1, ip='127.0.0.1'):
    """Insert parsed data into the database. High level parsing function.

    Args:
    conn -- Either a sqlite3 cursor or connection object. If called in threads, make sure
    to wrap this function in some kind of synchronization functionality.
    """
    parser = GoogleParser(data)
    results = parser.links
    conn.execute('''
        INSERT INTO serp_page
        (page_number, requested_at,
         num_results, num_results_for_kw_google,
         search_query, requested_by)
        VALUES(?, ?, ?, ?, ?, ?)''',
        (page_num, time.asctime(), len(results), parser.num_results() or '', kw, ip))
    lastrowid = conn.lastrowid
    #logger.debug('Inserting in link: search_query={}, title={}, url={}'.format(kw, ))
    conn.executemany('''
        INSERT INTO link
        (title, url, snippet, rank, domain, serp_id)
        VALUES(?, ?, ?, ?, ?, ?)''',
        [(result.link_title,
          result.link_url.geturl(),
          result.link_snippet,
          result.link_position,
          result.link_url.hostname,
          lastrowid) for result in results])
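
For reference, a minimal sqlite setup that these statements assume might look
like this (column names are taken from the INSERTs above; the types and the
foreign key are assumptions, not the project's actual schema):

    import sqlite3

    conn = sqlite3.connect('serp.db')
    conn.executescript('''
        CREATE TABLE IF NOT EXISTS serp_page (
            id INTEGER PRIMARY KEY,
            page_number INTEGER,
            requested_at TEXT,
            num_results INTEGER,
            num_results_for_kw_google TEXT,
            search_query TEXT,
            requested_by TEXT
        );
        CREATE TABLE IF NOT EXISTS link (
            id INTEGER PRIMARY KEY,
            title TEXT,
            url TEXT,
            snippet TEXT,
            rank INTEGER,
            domain TEXT,
            serp_id INTEGER REFERENCES serp_page(id)
        );
    ''')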
Example #3
    def build_search(self):
        """Build the headers and params for the search request for the search engine."""

        self.search_params = {}

        # Don't set the offset parameter explicitly if the default search (no offset) is correct.
        start_search_position = None if self.current_page == 1 else str(
            int(self.num_results_per_page) * int(self.current_page))

        if self.search_engine == 'google':
            self.parser = GoogleParser()
            self.search_params['q'] = self.current_keyword
            self.search_params['num'] = str(self.num_results_per_page)
            self.search_params['start'] = start_search_position

            if self.search_type == 'image':
                self.search_params.update({
                    'oq': self.current_keyword,
                    'site': 'imghp',
                    'tbm': 'isch',
                    'source': 'hp',
                    #'sa': 'X',
                    'biw': 1920,
                    'bih': 881
                })
            elif self.search_type == 'video':
                self.search_params.update({
                    'tbm': 'vid',
                    'source': 'lnms',
                    'sa': 'X',
                    'biw': 1920,
                    'bih': 881
                })
            elif self.search_type == 'news':
                self.search_params.update({
                    'tbm': 'nws',
                    'source': 'lnms',
                    'sa': 'X'
                })

        elif self.search_engine == 'yandex':
            self.parser = YandexParser()
            self.search_params['text'] = self.current_keyword
            self.search_params['p'] = start_search_position

            if self.search_type == 'image':
                self.base_search_url = 'http://yandex.ru/images/search?'

        elif self.search_engine == 'bing':
            self.parser = BingParser()
            self.search_params['q'] = self.current_keyword
            self.search_params['first'] = start_search_position

        elif self.search_engine == 'yahoo':
            self.parser = YahooParser()
            self.search_params['p'] = self.current_keyword
            self.search_params['b'] = start_search_position
            self.search_params['ei'] = 'UTF-8'

        elif self.search_engine == 'baidu':
            self.parser = BaiduParser()
            self.search_params['wd'] = self.current_keyword
            self.search_params['pn'] = start_search_position
            self.search_params['ie'] = 'utf-8'

        elif self.search_engine == 'duckduckgo':
            self.parser = DuckduckgoParser()
            self.search_params['q'] = self.current_keyword
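
As a rough illustration: with search_engine='google', current_keyword='apple',
num_results_per_page=10 and current_page=2, the method above leaves
search_params as follows (a sketch of the resulting dict; requests drops
parameters whose value is None, which is why page 1 can leave the offset unset):

    self.search_params == {
        'q': 'apple',
        'num': '10',
        'start': '20',  # str(10 * 2); None on page 1, so no offset is sent
    }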
Example #4
    def _search(self, searchtype='normal'):
        """The actual search and parsing of the results.

        Private, internal method.
        Parsing is done with lxml and cssselect. The html structure of the Google Search
        results may change over time. Effective: February 2014

        There are several parts of a SERP results page the average user is most likely interested in:

        (Probably in this order)
        - Non-advertisement links, as well as their little snippet and title
        - The message that indicates how many results were found. For example: "About 834,000,000 results (0.39 seconds)"
        - Advertisement search results (links, titles, snippets like above)

        Problem: This data comes in a wide range of different formats, depending on the parameters set in the search.
        Investigations over the different formats are done in the directory tests/serp_formats.

        """
        self._build_query(searchtype)

        # After building the query, all parameters are set, so we know what we're requesting.
        logger.debug(
            "Created new GoogleScrape object with searchparams={}".format(
                pprint.pformat(self.search_params)))

        html = get_cached(self.search_query,
                          Config['GLOBAL'].get('base_search_url'),
                          params=self.search_params)
        self.search_results['cache_file'] = os.path.join(
            Config['GLOBAL'].get('cachedir'),
            cached_file_name(self.search_query,
                             Config['GLOBAL'].get('base_search_url'),
                             self.search_params))

        if not html:
            try:
                r = self.requests.get(Config['GLOBAL'].get('base_search_url'),
                                      headers=self._HEADERS,
                                      params=self.search_params,
                                      timeout=3.0)

                logger.debug("Scraped with url: {} and User-Agent: {}".format(
                    r.url, self._HEADERS['User-Agent']))

            except self.requests.ConnectionError as ce:
                logger.error('Network problem occurred {}'.format(ce))
                raise ce
            except self.requests.Timeout as te:
                logger.error('Connection timeout {}'.format(te))
                raise te

            if not r.ok:
                logger.error('HTTP Error: {}'.format(r.status_code))
                if str(r.status_code)[0] == '5':
                    print('Maybe google recognizes you as sneaky spammer after'
                          ' you requested their services too inexhaustibly :D')
                return False

            html = r.text

            if Config['HTTP'].getboolean('view', False):
                self.browserview(html)

            # cache fresh results
            cache_results(html,
                          self.search_query,
                          url=Config['GLOBAL'].get('base_search_url'),
                          params=self.search_params)
            self.search_results['cache_file'] = os.path.join(
                Config['GLOBAL'].get('cachedir'),
                cached_file_name(self.search_query,
                                 Config['GLOBAL'].get('base_search_url'),
                                 self.search_params))

        self.parser = GoogleParser(html, searchtype=self.searchtype)
        self.search_results.update(self.parser.all_results)
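
The caching helpers used above (get_cached, cached_file_name, cache_results)
are defined elsewhere in the project. A minimal sketch of the contract they
appear to follow, where the hashing scheme is an assumption and Config is the
module-level configuration mapping seen in the example:

    import hashlib
    import os

    def cached_file_name(search_query, url, params):
        # Derive a stable file name from everything that identifies a request.
        # Sketch only: the project's actual naming scheme may differ.
        key = '{}{}{}'.format(search_query, url, sorted((params or {}).items()))
        return hashlib.sha256(key.encode('utf-8')).hexdigest()

    def get_cached(search_query, url, params=None):
        """Return the cached html for this request, or None on a cache miss."""
        path = os.path.join(Config['GLOBAL'].get('cachedir'),
                            cached_file_name(search_query, url, params))
        if os.path.isfile(path):
            with open(path, encoding='utf-8') as f:
                return f.read()
        return None  # cache miss: _search() falls back to a live request

    def cache_results(html, search_query, url, params):
        """Store freshly scraped html so the next run hits the cache."""
        path = os.path.join(Config['GLOBAL'].get('cachedir'),
                            cached_file_name(search_query, url, params))
        with open(path, 'w', encoding='utf-8') as f:
            f.write(html)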