Example #1
def parse_serp(html=None, search_engine=None,
                    scrapemethod=None, current_page=None, requested_at=None,
                    requested_by='127.0.0.1', current_keyword=None, parser=None, serp=None):
        """Store the parsed data in the sqlalchemy session.

        Args:
            TODO: A whole lot

        Returns:
            The parsed SERP object.
        """

        if not parser:
            parser = get_parser_by_search_engine(search_engine)
            parser = parser()
            parser.parse(html)

        out(parser, lvl=2)
        num_results = 0

        if not serp:
            serp = SearchEngineResultsPage(
                search_engine_name=search_engine,
                scrapemethod=scrapemethod,
                page_number=current_page,
                requested_at=requested_at,
                requested_by=requested_by,
                query=current_keyword,
                num_results_for_keyword=parser.search_results['num_results'],
            )

        for key, value in parser.search_results.items():
            if isinstance(value, list):
                rank = 1
                for link in value:
                    parsed = urlparse(link['link'])

                    # fill missing fields with None to prevent KeyErrors
                    for field in ('snippet', 'title', 'visible_link'):
                        link.setdefault(field, None)

                    l = Link(
                        link=link['link'],
                        snippet=link['snippet'],
                        title=link['title'],
                        visible_link=link['visible_link'],
                        domain=parsed.netloc,
                        rank=rank,
                        serp=serp
                    )
                    num_results += 1
                    rank += 1

        serp.num_results = num_results

        return (serp, parser)
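A minimal usage sketch for this variant, assuming the GoogleScraper internals it calls (get_parser_by_search_engine, SearchEngineResultsPage, Link) are importable and that html holds a fetched results page; the keyword and page values are placeholders:

import datetime

html = '<html>...</html>'  # hypothetical: normally a scraped results page
serp, parser = parse_serp(
    html=html,
    search_engine='google',
    scrapemethod='http',
    current_page=1,
    requested_at=datetime.datetime.utcnow(),
    current_keyword='example query',
)
print(serp.num_results)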
Example #2
def parse_serp(html=None, search_engine=None,
                    scrapemethod=None, current_page=None, requested_at=None,
                    requested_by='127.0.0.1', current_keyword=None, parser=None, serp=None):
        """Store the parsed data in the sqlalchemy session.

        Args:
            TODO: A whole lot

        Returns:
            The parsed SERP object.
        """

        if not parser:
            parser = get_parser_by_search_engine(search_engine)
            parser = parser()
            parser.parse(html)

        out(parser, lvl=2)
        num_results = 0

        if not serp:
            serp = SearchEngineResultsPage(
                search_engine_name=search_engine,
                scrapemethod=scrapemethod,
                page_number=current_page,
                requested_at=requested_at,
                requested_by=requested_by,
                query=current_keyword,
                num_results_for_keyword=parser.search_results['num_results'],
            )

        for key, value in parser.search_results.items():
            if isinstance(value, list):
                rank = 1
                for link in value:
                    parsed = urlparse(link['link'])

                    # fill missing fields with None to prevent KeyErrors
                    for field in ('snippet', 'title', 'visible_link'):
                        link.setdefault(field, None)

                    l = Link(
                        link=link['link'],
                        snippet=link['snippet'],
                        title=link['title'],
                        visible_link=link['visible_link'],
                        domain=parsed.netloc,
                        rank=rank,
                        serp=serp
                    )
                    num_results += 1
                    rank += 1

        serp.num_results = num_results

        return serp
Example #3
    def store(self):
        """Store the parsed data in the sqlalchemy scoped session."""
        assert self.session, 'You need to pass a sqlalchemy scoped session to SearchEngineScrape instances'

        serp = SearchEngineResultsPage(
            search_engine_name=self.search_engine,
            page_number=self.current_page,
            requested_at=datetime.datetime.utcnow(),
            requested_by='127.0.0.1',
            query=self.current_keyword,
            num_results_for_keyword=self.parser.search_results['num_results'],
            search=self.scraper_search)
        self.session.add(serp)

        for key, value in self.parser.search_results.items():
            if isinstance(value, list):
                for link in value:
                    l = Link(url=link['link'],
                             snippet=link['snippet'],
                             title=link['title'],
                             visible_link=link['visible_link'],
                             serp=serp)
                    self.session.add(l)

        self.session.commit()
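The assertion above requires a SQLAlchemy scoped session on self.session. A minimal sketch of how one could be constructed (the engine URL is an assumption; the real project configures this elsewhere):

from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker

engine = create_engine('sqlite:///google_scraper.db')  # assumed URL
session = scoped_session(sessionmaker(bind=engine))  # what self.session is expected to be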
Example #4
def parse_serp(html=None, search_engine=None,
                    scrapemethod=None, current_page=None, requested_at=None,
                    requested_by='127.0.0.1', current_keyword=None):
        """Store the parsed data in the sqlalchemy session.

        Args:
            TODO: A whole lot

        Returns:
            The parsed SERP object.
        """

        parser = get_parser_by_search_engine(search_engine)
        parser = parser()
        parser.parse(html)

        num_results = 0

        serp = SearchEngineResultsPage(
            search_engine_name=search_engine,
            scrapemethod=scrapemethod,
            page_number=current_page,
            requested_at=requested_at,
            requested_by=requested_by,
            query=current_keyword,
            num_results_for_keyword=parser.search_results['num_results'],
        )

        for key, value in parser.search_results.items():
            if isinstance(value, list):
                rank = 1
                for link in value:
                    l = Link(
                        url=link['link'],
                        snippet=link['snippet'],
                        title=link['title'],
                        visible_link=link['visible_link'],
                        rank=rank,
                        serp=serp
                    )
                    num_results += 1
                    rank += 1

        serp.num_results = num_results

        return serp
Example #5
def parse_serp(config, html=None, parser=None, scraper=None, search_engine=None, query=''):
    """Store the parsed data in the sqlalchemy session.

    If no parser is supplied then we are expected to parse again with
    the provided html.

    This function may be called from scraping and caching.
    When called from caching, some info is lost (like current page number).

    Args:
        TODO: A whole lot

    Returns:
        The parsed SERP object.
    """

    if not parser and html:
        parser = get_parser_by_search_engine(search_engine)
        parser = parser(config, query=query)
        parser.parse(html)

    serp = SearchEngineResultsPage()

    if query:
        serp.query = query

    if parser:
        serp.set_values_from_parser(parser)
    if scraper:
        serp.set_values_from_scraper(scraper)

    return serp
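A hedged usage sketch for this config-based variant; the config dict contents and the cached-html source are assumptions:

config = {}  # hypothetical: GoogleScraper's parsed configuration
with open('cached_serp.html') as f:  # e.g. a page restored from the cache
    html = f.read()
serp = parse_serp(config, html=html, search_engine='google', query='example query')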
Example #6
    def store(self):
        """Store the parsed data in the sqlalchemy scoped session."""
        assert self.session, 'You need to pass a sqlalchemy scoped session to SearchEngineScrape instances'

        num_results = 0

        ip = '127.0.0.1'
        if self.proxy:
            ip = self.proxy.ip

        serp = SearchEngineResultsPage(
            search_engine_name=self.search_engine,
            scrapemethod=self.scrapemethod,
            page_number=self.current_page,
            requested_at=datetime.datetime.utcnow(),
            requested_by=ip,
            query=self.current_keyword,
            num_results_for_keyword=self.parser.search_results['num_results'],
        )

        with (yield from self.db_lock):
            self.scraper_search.serps.append(serp)
            self.session.add(serp)
            self.session.add(self.scraper_search)
            self.session.commit()

            for key, value in self.parser.search_results.items():
                if isinstance(value, list):
                    rank = 1
                    for link in value:
                        l = Link(
                            url=link['link'],
                            snippet=link['snippet'],
                            title=link['title'],
                            visible_link=link['visible_link'],
                            rank=rank,
                            serp=serp
                        )
                        self.session.add(l)
                        num_results += 1
                        rank += 1

            serp.num_results = num_results
            self.session.add(serp)
            self.session.commit()
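The "with (yield from self.db_lock):" line is the pre-Python-3.5 idiom for acquiring an asyncio.Lock inside a generator-based coroutine. A standalone sketch of the same pattern (it runs on the older interpreters this code targets; @asyncio.coroutine was removed in Python 3.11, where you would write async def with async with instead):

import asyncio

@asyncio.coroutine
def guarded(lock):
    # acquire the lock; it is released when the with-block exits
    with (yield from lock):
        yield from asyncio.sleep(0)  # stand-in for the database writes

loop = asyncio.get_event_loop()
loop.run_until_complete(guarded(asyncio.Lock()))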
Example #7
def parse_serp(config, html=None, parser=None, scraper=None, search_engine=None, query=''):
    """Store the parsed data in the sqlalchemy session.

    If no parser is supplied then we are expected to parse again with
    the provided html.

    This function may be called from scraping and caching.
    When called from caching, some info is lost (like current page number).

    Args:
        TODO: A whole lot

    Returns:
        The parsed SERP object.
    """

    if not parser and html:
        parser = get_parser_by_search_engine(search_engine)
        parser = parser(config, query=query)
        parser.parse(html)

    serp = SearchEngineResultsPage()

    if query:
        serp.query = query

    if parser:
        serp.set_values_from_parser(parser)
    if scraper:
        serp.set_values_from_scraper(scraper)

    return serp
Example #8
    def store(self):
        """Store the parsed data in the sqlalchemy scoped session."""
        assert self.session, 'No database session. Turning down.'

        with self.db_lock:
            num_results = 0

            serp = SearchEngineResultsPage(
                search_engine_name=self.search_engine,
                scrapemethod=self.scrapemethod,
                page_number=self.current_page,
                requested_at=datetime.datetime.utcnow(),
                requested_by=self.ip,
                query=self.current_keyword,
                num_results_for_keyword=self.parser.search_results['num_results'],
            )
            self.scraper_search.serps.append(serp)

            for key, value in self.parser.search_results.items():
                if isinstance(value, list):
                    rank = 1
                    for link in value:
                        l = Link(
                            url=link['link'],
                            snippet=link['snippet'],
                            title=link['title'],
                            visible_link=link['visible_link'],
                            rank=rank,
                            serp=serp
                        )
                        self.session.add(l)
                        num_results += 1
                        rank += 1

            serp.num_results = num_results
            self.session.add(serp)
            self.session.commit()
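Here db_lock is entered as a plain context manager, which suggests an ordinary threading lock shared between scraper threads (an assumption; the attribute is set up elsewhere in the class):

import threading

db_lock = threading.Lock()  # only one thread writes to the session at a time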
Example #9
    def store(self):
        """Store the parsed data in the sqlalchemy scoped session."""
        assert self.session, 'No database session. Turning down.'

        with self.db_lock:
            num_results = 0

            serp = SearchEngineResultsPage(
                search_engine_name=self.search_engine,
                scrapemethod=self.scrapemethod,
                page_number=self.current_page,
                requested_at=self.current_request_time,
                requested_by=self.ip,
                query=self.current_keyword,
                num_results_for_keyword=self.parser.search_results['num_results'],
            )
            self.scraper_search.serps.append(serp)

            parse_serp(serp=serp, parser=self.parser)  # fills serp with the parser's results in place
            self.session.add(serp)
            self.session.commit()

        output_format = Config['GLOBAL'].get('output_format', 'stdout')
        output_file = Config['GLOBAL'].get('output_filename', 'google_scraper')

        def results():
            rows = []
            for result_type, value in self.parser.search_results.items():
                if isinstance(value, list):
                    for link in value:
                        rows.append(link)
            return rows

        if output_format == 'stdout':
            out(self.parser, lvl=2)
        elif output_format == 'json':
            obj = self._get_serp_obj()
            obj['results'] = results()
            json.dump(obj, self.json_outfile, indent=2, sort_keys=True)
            self.json_outfile.write(',')

        elif output_format == 'csv':
            obj = self._get_serp_obj()
            for row in results():
                row.update(obj)
                self.csv_outfile.writerow(row)
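The json and csv branches assume writers that were opened elsewhere on self. A sketch of what that setup might look like; the file names and fieldnames here are assumptions:

import csv

fieldnames = ['link', 'title', 'snippet', 'visible_link', 'rank']
json_outfile = open('google_scraper.json', 'w')
csv_outfile = csv.DictWriter(
    open('google_scraper.csv', 'w', newline=''),
    fieldnames=fieldnames,
    extrasaction='ignore',  # rows also carry the serp columns merged in by store()
)
csv_outfile.writeheader()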
Example #10
    def store(self):
        """Store the parsed data in the sqlalchemy scoped session."""
        assert self.session, 'No database session. Turning down.'

        with self.db_lock:
            serp = SearchEngineResultsPage(
                search_engine_name=self.search_engine,
                scrapemethod=self.scrapemethod,
                page_number=self.current_page,
                requested_at=self.current_request_time,
                requested_by=self.ip,
                query=self.current_keyword,
                num_results_for_keyword=self.parser.search_results['num_results'],
            )
            self.scraper_search.serps.append(serp)

            serp, parser = parse_serp(serp=serp, parser=self.parser)
            self.session.add(serp)
            self.session.commit()

            store_serp_result(dict_from_scraping_object(self), self.parser)