Code Example #1
    def init(self, config=None, keywords=None):
        """init config and serp_query
        Args:
            config (None|dict): override default config
            keywords (str|list): string or list of strings, keywords to scrape
        Raises:
            ValueError: if no keywords are given
        """
        if config is not None:
            self.config = config
        else:
            self.config = Config().get()

        if self.config['executable_path'] == '':
            logger.info('preparing phantomjs')
            firstrun = PhantomInstall()
            phantomjs = firstrun.detect_phantomjs()
            if phantomjs is False:
                firstrun.download()
                phantomjs = firstrun.detect_phantomjs()
                if phantomjs is False:
                    raise Exception('''
                        phantomjs binary not found,
                        provide custom path in config''')
            self.config['executable_path'] = phantomjs
            logger.info('using ' + phantomjs)

        if isinstance(keywords, str):
            self.serp_query = [keywords]
        elif isinstance(keywords, list) and len(keywords) > 0:
            self.serp_query = keywords
        else:
            raise ValueError('no keywords given')
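If a phantomjs binary is already installed, the download step above can be skipped by pointing the config at it before calling init(). A minimal sketch, following the setter shown (commented out) in Code Example #5; the binary path here is hypothetical:

from serpscrap.config import Config

config = Config()
config.set('executable_path', '/usr/local/bin/phantomjs')  # hypothetical local path
phantom_config = config.get()  # plain dict handed to SerpScrap.init()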
Code Example #2
    def init(self, config=None, keywords=None):
        """init config and serp_query
        Args:
            config (None|dict): override default config
            keywords (str|list): string or list of strings, keywords to scrape
        Raises:
            ValueError: if no keywords are given
        """
        if config is not None:
            self.config = config
        else:
            self.config = Config().get()

        if self.config['executable_path'] == '' \
                and self.config['sel_browser'] == 'phantomjs':
            logger.info('preparing phantomjs')
            firstrun = PhantomInstall()
            phantomjs = firstrun.detect_phantomjs()
            if phantomjs is None:
                firstrun.download()
                phantomjs = firstrun.detect_phantomjs()
                if phantomjs is None:
                    raise Exception('''
                        phantomjs binary not found,
                        provide custom path in config''')
            self.config['executable_path'] = phantomjs
            logger.info('using ' + str(phantomjs))
        elif self.config['executable_path'] == '' \
                and self.config['sel_browser'] == 'chrome':
            logger.info('preparing chromedriver')
            firstrun = ChromeInstall()
            chromedriver = firstrun.detect_chromedriver()
            if chromedriver is None:
                firstrun.download()
                chromedriver = firstrun.detect_chromedriver()
                if chromedriver is None:
                    raise Exception('''
                        chromedriver binary not found,
                        provide custom path in config''')
            self.config['executable_path'] = chromedriver
            logger.info('using ' + str(chromedriver))

        # cleanup screenshot dir on init
        if os.path.exists(self.config['dir_screenshot']):
            shutil.rmtree(self.config['dir_screenshot'], ignore_errors=True)
        # create screenshot dir current date
        screendir = '{}/{}'.format(self.config['dir_screenshot'],
                                   self.config['today'])

        if not os.path.exists(screendir):
            os.makedirs(screendir)

        if isinstance(keywords, str):
            self.serp_query = [keywords]
        elif isinstance(keywords, list) and len(keywords) > 0:
            self.serp_query = keywords
        else:
            raise ValueError('no keywords given')
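The chromedriver branch in this variant is only taken when 'sel_browser' is 'chrome' and 'executable_path' is empty, in which case ChromeInstall downloads and detects the driver. A minimal sketch of selecting that branch, mirroring the commented-out settings in Code Example #5 (the setter calls are taken from the tests, not from this snippet):

from serpscrap.config import Config

config = Config()
config.set('sel_browser', 'chrome')  # route init() into the ChromeInstall branch
config.set('executable_path', '')    # empty path lets init() auto-detect chromedriver
chrome_config = config.get()         # plain dict handed to SerpScrap.init()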
Code Example #3
    def test_config_default(self):
        config = Config()
        assert len(config.get()) == 31
        assert config.use_own_ip is True
        assert config.screenshot is True
        assert config.scrape_urls is False

        today = datetime.datetime.strftime(datetime.datetime.utcnow(),
                                           '%Y-%m-%d')
        assert config.today == today
Code Example #4
File: test_basic.py  Project: tinduong94/SerpScrap
    def test_simple(self):

        keywords = random.choice(self.keyword_list)

        config = Config()
        scrap = SerpScrap()
        scrap.init(config=config.get(), keywords=keywords)
        results = scrap.run()

        assert len(results) > 0
        assert len(results[0]) > 0
Code Example #5
    def test_simple(self):

        keywords = random.choice(self.keyword_list)

        config = Config()
#         config.set('sel_browser', 'chrome')
#         config.set('chrome_headless', True)
#         config.set('executable_path', '/usr/local/bin/chromedriver')
        scrap = SerpScrap()
        scrap.init(config=config.get(), keywords=keywords)
        results = scrap.run()

        assert len(results) > 0
        assert len(results[0]) > 0
Code Example #6
    def test_screenshot(self):
        keywords = random.choice(self.keyword_list)
        config = Config()
        config.set('screenshot', True)
        scrap = SerpScrap()
        scrap.init(config=config.get(), keywords=keywords)
        # results = scrap.run()

        screendir = '{}/{}'.format(config.get()['dir_screenshot'],
                                   config.today)

        assert config.get()['screenshot'] is True
        assert os.path.exists(screendir) is True
Code Example #7
File: test_basic.py  Project: DrSn2/SerpScrap
    def test_simple(self):

        keyword_list = [
            'computer news',
            'science topics',
            'python tutorial',
            'pythons',
            'machine learning',
            'artificial intelligence',
        ]
        keywords = random.choice(keyword_list)

        config = Config()
        config.set('scrape_urls', False)
        scrap = SerpScrap()
        scrap.init(config=config.get(), keywords=keywords)
        results = scrap.run()

        assert len(config.get()) == 28
        assert len(results) > 0
        assert len(results[0]) > 0
Code Example #8
class SerpScrap():
    """main module to execute the serp and url scrape tasks
    Attributes:
        args: list for cli args
        serp_query: list holds the keywords to query the search engine
        cli (list): for cli attributes
        init (dict, str|list): init SerpScrap
        run (): main method
        scrap_serps (): scrape serps
        scrap (): calls GoogleScraper
        scrap_url(string): calls UrlScrape
        as_csv(string): scrape serps save as csv
    """
    args = []

    serp_query = None

    results = []

    related = []

    def cli(self, args=None):
        """method called if executed on command line
        Args:
            args (mixed): args via commandline
        Returns:
            list: dicts of results
        """
        parser = argparse.ArgumentParser(prog='serpscrap')
        parser.add_argument('-k',
                            '--keyword',
                            help='keyword for scraping',
                            nargs='*')
        self.args = parser.parse_args()
        # guard against a missing -k/--keyword so init() can raise a clear ValueError
        keywords = None
        if self.args.keyword is not None and len(self.args.keyword) > 0:
            keywords = ' '.join(self.args.keyword)

        self.init(config=None, keywords=keywords)
        return self.run()

    def init(self, config=None, keywords=None):
        """init config and serp_query
        Args:
            config (None|dict): override default config
            keywords (str|list): string or list of strings, keywords to scrape
        Raises:
            ValueError: if no keywords are given
        """
        if config is not None:
            self.config = config
        else:
            self.config = Config().get()

        if self.config['executable_path'] == '' \
                and self.config['sel_browser'] == 'phantomjs':
            logger.info('preparing phantomjs')
            firstrun = PhantomInstall()
            phantomjs = firstrun.detect_phantomjs()
            if phantomjs is None:
                firstrun.download()
                phantomjs = firstrun.detect_phantomjs()
                if phantomjs is None:
                    raise Exception('''
                        phantomjs binary not found,
                        provide custom path in config''')
            self.config['executable_path'] = phantomjs
            logger.info('using ' + str(phantomjs))
        elif self.config['executable_path'] == '' \
                and self.config['sel_browser'] == 'chrome':
            logger.info('preparing chromedriver')
            firstrun = ChromeInstall()
            chromedriver = firstrun.detect_chromedriver()
            if chromedriver is None:
                firstrun.download()
                chromedriver = firstrun.detect_chromedriver()
                if chromedriver is None:
                    raise Exception('''
                        chromedriver binary not found,
                        provide custom path in config''')
            self.config['executable_path'] = chromedriver
            logger.info('using ' + str(chromedriver))

        # cleanup screenshot dir on init
        if os.path.exists(self.config['dir_screenshot']):
            shutil.rmtree(self.config['dir_screenshot'], ignore_errors=True)
        # create screenshot dir current date
        screendir = '{}/{}'.format(self.config['dir_screenshot'],
                                   self.config['today'])

        if not os.path.exists(screendir):
            os.makedirs(screendir)

        if isinstance(keywords, str):
            self.serp_query = [keywords]
        elif isinstance(keywords, list) and len(keywords) > 0:
            self.serp_query = keywords
        else:
            raise ValueError('no keywords given')

    def run(self):
        """main method to run scrap_serps and scrap_url
        Returns:
            list: dicts with all results
        """
        self.results = []
        if self.serp_query is not None:
            self.results = self.scrap_serps()

        if self.config['scrape_urls']:
            for index, result in enumerate(self.results):
                if 'serp_type' in result and \
                   'serp_url' in result:
                    doscrap = True
                    if 'exclude' in self.config.keys():
                        if len(self.config['exclude']) > 0:
                            for exl in self.config['exclude']:
                                if exl in result['serp_url']:
                                    doscrap = False
                    if doscrap:
                        logger.info('Scraping URL: ' + result['serp_url'])
                        result_url = self.scrap_url(result['serp_url'])
                        if 'status' in result_url:
                            self.results[index].update(result_url)
        return self.results if isinstance(self.results,
                                          list) else [self.results]

    def as_csv(self, file_path):
        writer = CsvWriter()
        self.results = self.run()
        writer.write(file_path + '.csv', self.results)

    def scrap_serps(self):
        """call scrap method and append serp results to list
        Returns:
            list: dicts of scrape results
        """
        search = self.scrap()
        self.results = []
        if search is not None:
            for serp in search.serps:
                self.related = []
                for related_keyword in serp.related_keywords:
                    self.related.append({
                        'keyword': related_keyword.keyword,
                        'rank': related_keyword.rank
                    })
                for link in serp.links:
                    self.results.append({
                        'query_num_results_total': serp.num_results_for_query,
                        'query_num_results_page': serp.num_results,
                        'query_page_number': serp.page_number,
                        'query': serp.query,
                        'serp_rank': link.rank,
                        'serp_type': link.link_type,
                        'serp_url': link.link,
                        'serp_rating': link.rating,
                        'serp_title': link.title,
                        'serp_domain': link.domain,
                        'serp_visible_link': link.visible_link,
                        'serp_snippet': link.snippet,
                        'serp_sitelinks': link.sitelinks,
                        'screenshot': os.path.join('{}/{}/{}_{}-p{}.png'.format(
                            self.config['dir_screenshot'],
                            self.config['today'],
                            'google',
                            serp.query,
                            str(serp.page_number),
                        ))
                    })
            return self.results
        else:
            raise Exception('No Results')

    def scrap(self):
        """scrap, method calls GoogleScraper method
        Returns:
            dict: scrape result
        """
        # See in the config.cfg file for possible values
        self.config['keywords'] = self.serp_query \
            if isinstance(self.serp_query, list) else [self.serp_query]

        return Core().run(self.config)

    def scrap_url(self, url):
        """method calls UrlScrape
        Args:
            url (string): url to scrape
        Returns:
            dict: result of url scrape
        """
        urlscrape = UrlScrape(self.config)
        return urlscrape.scrap_url(url)

    def get_related(self):
        return self.related
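
Driving the class end to end follows the same pattern as the test cases earlier on this page. A minimal sketch, assuming the serpscrap.serpscrap module path (not shown in these snippets) and a hypothetical output path:

from serpscrap.config import Config
from serpscrap.serpscrap import SerpScrap  # assumed import path

config = Config()
scrap = SerpScrap()
scrap.init(config=config.get(), keywords=['python tutorial'])

results = scrap.run()  # list of dicts, one per SERP link
for result in results:
    print(result['serp_rank'], result['serp_url'])

# alternatively, scrap.as_csv('/tmp/serpscrap_results') runs the scrape again
# and writes the results to '/tmp/serpscrap_results.csv' (path is hypothetical)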
Code Example #9
class SerpScrap():
    """main module to execute the serp and url scrape tasks
    Attributes:
        args: list for cli args
        serp_query: list holds the keywords to query the search engine
        cli (list): for cli attributes
        init (dict, str|list): init SerpScrap
        run (): main method
        scrap_serps (): scrape serps
        scrap (): calls GoogleScraper
        scrap_url(string): calls UrlScrape
        adjust_encoding(string): for encoding
    """
    args = []

    serp_query = None

    def cli(self, args=None):
        """method called if executed on command line
        Args:
            args (mixed): args via commandline
        Returns:
            list: dicts of results
        """
        parser = argparse.ArgumentParser(prog='serpscrap')
        parser.add_argument('-k',
                            '--keyword',
                            help='keyword for scraping',
                            nargs='*')
        self.args = parser.parse_args()
        # guard against a missing -k/--keyword so init() can raise a clear ValueError
        keywords = None
        if self.args.keyword is not None and len(self.args.keyword) > 0:
            keywords = ' '.join(self.args.keyword)

        self.init(config=None, keywords=keywords)
        return self.run()

    def init(self, config=None, keywords=None):
        """init config and serp_query
        Args:
            config (None|dict): override default config
            keywords (str|list): string or list of strings, keywords to scrape
        Raises:
            ValueError: if no keywords are given
        """
        if config is not None:
            self.config = config
        else:
            self.config = Config().get()

        if self.config['executable_path'] == '':
            logger.info('preparing phantomjs')
            firstrun = PhantomInstall()
            phantomjs = firstrun.detect_phantomjs()
            if phantomjs is False:
                firstrun.download()
                phantomjs = firstrun.detect_phantomjs()
                if phantomjs is False:
                    raise Exception('''
                        phantomjs binary not found,
                        provide custom path in config''')
            self.config['executable_path'] = phantomjs
            logger.info('using ' + phantomjs)

        if isinstance(keywords, str):
            self.serp_query = [keywords]
        elif isinstance(keywords, list) and len(keywords) > 0:
            self.serp_query = keywords
        else:
            raise ValueError('no keywords given')

    def run(self):
        """main method to run scrap_serps and scrap_url
        Returns:
            list: dicts with all results
        """
        results = None
        if self.serp_query is not None:
            results = self.scrap_serps()

        if self.config['scrape_urls']:
            for index, result in enumerate(results):
                if 'serp_type' in result and \
                   'ads_main' not in result['serp_type'] and \
                   'serp_url' in result:
                    result_url = self.scrap_url(result['serp_url'])[0]
                    if 'status' in result_url:
                        results[index].update(result_url)
        return results if isinstance(results, list) else [results]

    def as_csv(self, file_path):
        writer = CsvWriter()
        result = self.run()
        writer.write(file_path + '.csv', result)

    def scrap_serps(self):
        """call scrap method and append serp results to list
        Returns:
            list: dicts of scrape results
        """
        search = self.scrap()
        result = []
        if search is not None:
            for serp in search.serps:
                related = []
                for related_keyword in serp.related_keywords:
                    related.append({
                        'keyword': related_keyword.keyword,
                        'rank': related_keyword.rank
                    })
                for link in serp.links:
                    result.append({
                        'query_num_results_total': serp.num_results_for_query,
                        'query_num_results_page': serp.num_results,
                        'query_page_number': serp.page_number,
                        'query': serp.query,
                        'serp_rank': link.rank,
                        'serp_type': link.link_type,
                        'serp_url': link.link,
                        'serp_rating': link.rating,
                        'serp_title': link.title,
                        'serp_domain': link.domain,
                        'serp_visible_link': link.visible_link,
                        'serp_snippet': link.snippet,
                        'serp_sitelinks': link.sitelinks,
                        'related_keywords': related
                    })
            return result
        else:
            raise Exception('No Results')

    def scrap(self):
        """scrap, method calls GoogleScraper method
        Returns:
            dict: scrape result
        """
        # See in the config.cfg file for possible values
        self.config['keywords'] = self.serp_query \
            if isinstance(self.serp_query, list) else [self.serp_query]

        return Core().run(self.config)

    def scrap_url(self, url):
        """method calls UrlScrape
        Args:
            url (string): url to scrape
        Returns:
            dict: result of url scrape
        """
        urlscrape = UrlScrape(self.config)
        return urlscrape.scrap_url(url)

    def adjust_encoding(self, data):
        """detect and adjust encoding of data return data decoded to utf-8
        TODO:
            move to tools
        Args:
            data (string): data to encode
        Returns:
            dict: encoding and data
        """
        if data is None:
            return {'encoding': None, 'data': data}

        data = data.encode('utf-8')
        check_encoding = chardet.detect(data)

        if check_encoding['encoding'] is not None \
           and 'utf-8' not in check_encoding['encoding']:
            try:
                data = data.decode(check_encoding['encoding']).encode('utf-8')
            except Exception:
                pass
        try:
            data = data.decode('utf-8')
        except Exception:
            data = data.decode('utf-8', 'ignore')

        return {'encoding': check_encoding['encoding'], 'data': data}
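
adjust_encoding returns the detected source encoding together with the text normalised to a UTF-8 str, so callers can unpack both. A minimal sketch, assuming a bare instance of the class above (init() is not needed for this helper):

scrap = SerpScrap()
checked = scrap.adjust_encoding('Grüße aus Köln')
print(checked['encoding'])  # whatever chardet reported for the input bytes
print(checked['data'])      # the same text, decoded back to a utf-8 str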
Code Example #10
File: example_markovi.py  Project: DrSn2/SerpScrap
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import pprint

from serpscrap.config import Config
from serpscrap.markovi import Markovi
from serpscrap.urlscrape import UrlScrape

url = 'http://gutenberg.spiegel.de/buch/johann-wolfgang-goethe-gedichte-3670/231'
config = Config().get()

urlscrape = UrlScrape(config)
contents = urlscrape.scrap_url(url)

markovi = Markovi(config)
texts = []
for content in contents:
    for _ in range(5):
        texts.append(markovi.generate(content['text_raw'], 1))

for text in texts:
    pprint.pprint(text, width=120)