Example #1
def _crawl(path=None):
    crawl = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    crawl.crawl(ProvinceSpider)
    crawl.start()
    crawl.stop()
Example #2
def runSpiderProcess(spider_cls, *args, **kwargs):
    """
    Helper method that starts a spider with the given init arguments, waits for it to complete, and returns the
    items it yielded in a list.
    :param spider_cls: the spider class to run
    :param args: the indexed arguments to the spider
    :param kwargs: the keyword arguments to the spider
    :return: a list of items yielded by the spider
    """
    process = CrawlerProcess()
    process.crawl(spider_cls, *args, **kwargs)

    final_result = []

    def _nab_item(item):
        # FIXME: this silly dance of encoding and decoding is to prevent scrapy items from being returned to celery
        # FIXME: celery can't serialize them, so it throws a rather opaque error, but it's fine with lists and dicts
        final_result.append(json.loads(scrapy_encoder.encode(item)))

    for crawler in process.crawlers:
        crawler.signals.connect(_nab_item, item_scraped)

    process.start()
    process.stop()

    return final_result
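
For comparison, the same signal-collection pattern can be written as a self-contained script. This is only a sketch: QuotesSpider, its URL, and collect_items are placeholders invented for illustration, while the encode/decode round-trip mirrors the JSON workaround in the helper above.

import json

from scrapy import Spider, signals
from scrapy.crawler import CrawlerProcess
from scrapy.utils.serialize import ScrapyJSONEncoder


class QuotesSpider(Spider):
    # Placeholder spider used only to demonstrate the pattern.
    name = 'quotes'
    start_urls = ['https://quotes.toscrape.com']

    def parse(self, response):
        for text in response.css('span.text::text').getall():
            yield {'text': text}


def collect_items(spider_cls, **kwargs):
    """Run a single spider and return its items as plain dicts."""
    encoder = ScrapyJSONEncoder()
    items = []

    def _on_item(item):
        # Encode and decode so only JSON-serializable objects leave the helper.
        items.append(json.loads(encoder.encode(item)))

    process = CrawlerProcess()
    process.crawl(spider_cls, **kwargs)
    for crawler in process.crawlers:
        crawler.signals.connect(_on_item, signals.item_scraped)
    process.start()  # blocks until the crawl is finished
    return items


if __name__ == '__main__':
    print(collect_items(QuotesSpider))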
Example #3
class MySpiderProcess1(scrapy.Spider):
    def __init__(self, name, urls):
        self.name = name
        self.start_urls = urls
        scrapy.Spider.__init__(self)

    def parse(self, response):
        print('parse response')

    def _crawl(self):
        settings = Settings()
        settings.set('ITEM_PIPELINES', {
            'app.pipelines.JsonWriterPipeline': 300
        })
        self.process = CrawlerProcess(settings)
        self.process.crawl(self, self.name, self.start_urls)
        self.process.start()
        # self.process.stop()
        # self.process.join()

    def start(self):
        p = Process(target=self._crawl)
        p.start()
        p.join()

    #
    # def start(self):
    #     self._crawl()

    def stop(self):
        self.process.stop()
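
The Process wrapper in start() exists because Twisted's reactor cannot be restarted within a single Python process, so CrawlerProcess.start() can only be called once per process. A minimal, self-contained sketch of the same workaround follows; DemoSpider and its URL are placeholders.

from multiprocessing import Process

from scrapy import Spider
from scrapy.crawler import CrawlerProcess


class DemoSpider(Spider):
    # Placeholder spider for illustration.
    name = 'demo'
    start_urls = ['https://example.com']

    def parse(self, response):
        yield {'url': response.url}


def _run(spider_cls, **kwargs):
    process = CrawlerProcess()
    process.crawl(spider_cls, **kwargs)
    process.start()  # the reactor lives and dies inside this child process


def crawl_once(spider_cls, **kwargs):
    """Run one crawl in a child process so it can be repeated from the parent."""
    p = Process(target=_run, args=(spider_cls,), kwargs=kwargs)
    p.start()
    p.join()


if __name__ == '__main__':
    crawl_once(DemoSpider)
    crawl_once(DemoSpider)  # a second run works because each crawl gets a fresh reactor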
Example #4
def main():
    """Index alexa demographics
    """

    engine = db_connect()
    Session = sessionmaker(bind=engine)
    session = Session()

    settings = get_project_settings()
    settings.set('ITEM_PIPELINES',
                 {'demographic_scraper.demographic_scraper.pipelines.WebsiteDemographicPipeline': 300})
    settings.set('EXTENSIONS',
                 {'scrapy.telnet.TelnetConsole': None,})


    process = CrawlerProcess(settings)
    for website in session.query(WebsitesContent).all():
        demographic = list(session.query(Websites).filter_by(link=website.link))
        if len(demographic) == 0:
            url = website.link
            print(website.link)
            AlexaSpider.name = url
            process.crawl(AlexaSpider, url=url, db_session=session)
    process.start()
    process.stop()

    session.close()
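
The example above relies on the fact that several crawls can be queued on one CrawlerProcess before a single start() call. A stripped-down, self-contained sketch of that pattern (PageSpider and the URLs are placeholders):

from scrapy import Spider
from scrapy.crawler import CrawlerProcess


class PageSpider(Spider):
    # Placeholder spider; a real project would supply its own spider class.
    name = 'page'

    def __init__(self, url=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.start_urls = [url]

    def parse(self, response):
        yield {'url': response.url, 'title': response.css('title::text').get()}


process = CrawlerProcess()
for url in ['https://example.com', 'https://example.org']:
    process.crawl(PageSpider, url=url)  # queue one crawl per URL
process.start()                         # a single start() runs all queued crawls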
Example #5
class CrawlerScript():

    def __init__(self):
        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()
        self.items = []
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def _crawl(self, queue, spider_name):
        spider = self.crawler.spiders.create(spider_name)
        if spider:
            self.crawler.queue.append_spider(spider)
        self.crawler.start()
        self.crawler.stop()
        queue.put(self.items)

    def crawl(self, spider):
        queue = Queue()
        p = Process(target=self._crawl, args=(queue, spider,))
        p.start()
        p.join()
        return queue.get(True)
def startSpiderTest(group_type,spider_type,spider_group_name,spider_name):
    # Use Scrapy's internal API directly
    settings = get_project_settings()
    # Instantiate a crawler process
    crawlerProcess = CrawlerProcess(settings)

    # Create a crawler; a single crawler process can run several crawls.
    crawler = crawlerProcess.create_crawler(spider_name)

    # Connect the spider's state signals: when the spider emits a signal, the corresponding handler is called.
    crawler.signals.connect(spiderSignal.startSingnal, signals.spider_opened)
    crawler.signals.connect(spiderSignal.errorSingnal, signals.spider_error)
    crawler.signals.connect(spiderSignal.stopSingnal, signals.spider_closed)

    # Look up the spider class and its configuration
    spiderConf = Spider_Dict[group_type][spider_type]
    spiderArgs = spiderConf[1].copy()
    spiderArgs["name"] = spider_name
    spiderArgs["redis_key"] = spider_name
    spiderArgs["spider_type"] = spider_type
    spiderArgs["spider_group_name"] = spider_group_name
    spiderArgs["task_id"] = "-1"

    spider = spiderConf[0](**spiderArgs)

    # Assign the spider to the crawler
    crawler.configure()
    crawler.crawl(spider)

    # Start the crawl.
    crawlerProcess.start()
    crawlerProcess.stop()
Example #7
class CrawlerScript():

    def __init__(self):
        settings = get_project_settings()
        settings.set('LOG_ENABLED', False, priority='cmdline')
        #settings.overrides['LOG_ENABLED'] = False
        self.crawler = CrawlerProcess(settings)
        self.items = []
        SignalManager(dispatcher.Any).connect(self._item_passed, signal=signals.item_scraped)

    def _item_passed(self,item,response,spider):
        self.items.append(item)

    def _crawl(self, q, queue):
        self.crawler.crawl(BingSpider, q=q)
        self.crawler.start()
        self.crawler.stop()
        queue.put(self.items)

    def crawl(self, q):
        queue = Queue()
        p = Process(target=self._crawl, args=[q, queue])
        p.start()
        p.join()
        return queue.get(True)
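
The queue-based variants above can be condensed into a self-contained sketch that runs the crawl in a child process and hands the scraped items back through a multiprocessing.Queue. DemoSpider and its URL are placeholders.

from multiprocessing import Process, Queue

from scrapy import Spider, signals
from scrapy.crawler import CrawlerProcess


class DemoSpider(Spider):
    # Placeholder spider for illustration.
    name = 'demo'
    start_urls = ['https://example.com']

    def parse(self, response):
        yield {'url': response.url}


def _crawl(queue, spider_cls):
    items = []
    process = CrawlerProcess({'LOG_ENABLED': False})
    crawler = process.create_crawler(spider_cls)
    crawler.signals.connect(lambda item, response, spider: items.append(item),
                            signals.item_scraped)
    process.crawl(crawler)
    process.start()
    queue.put(items)


def crawl(spider_cls):
    """Run the crawl in a child process and return the scraped items."""
    queue = Queue()
    p = Process(target=_crawl, args=(queue, spider_cls))
    p.start()
    items = queue.get()  # read before join() so a full pipe cannot deadlock
    p.join()
    return items


if __name__ == '__main__':
    print(crawl(DemoSpider))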
Example #8
def main():

    repos = read_yaml(CONFIG_PATH)
    # FIXME: obtain last commit?

    rules = read_yaml(RULES_PATH)
    # FIXME: obtain last commit?

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())

    for rule in rules:
        policies_path = create_policy_file_path(rule)
        process.crawl(
            'policies', policies_path=policies_path, url=rule['url'],
            xpath=rule['xpath'])
    logger.debug('starting crawler')
    # the script will block here until the crawling is finished
    process.start()
    process.stop()

    for repo in repos:
        commit(DATA_REPO_PATH, repo['url'], repo['name'])
Example #9
def scrapeando():
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        """Rellenamos la BD"""
        for i in enumerate(item.items()):
            x = i[0]
            query = "INSERT INTO book (Nombre ,Autor, Editorial ,Fecha, Precio, Link) VALUES ("+decodifica(item['Nombre'][x])+","+decodifica(item['Autor'][x])+","+decodifica(item['Editorial'][x])+","+decodifica(item['Fecha'][x])+","+decodifica(item['Precio'][x])+","+decodifica("http://www.casadellibro.com"+item['Link'][x])+");"
            db.micursor.execute(query)
            db.conexion.commit()
        print item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    book = BookSpider()
    book.busqueda=unicode(search.getbusqueda())
    crawler.crawl(book)
    print "Start scraping to la Casa del Libro"
    crawler.start()
    print "End scraping to la Casa del Libro"
    crawler.stop()
class CrawlerWorker(multiprocessing.Process):
 
    def __init__(self, spider, result_queue):
        multiprocessing.Process.__init__(self)
        self.result_queue = result_queue
 
        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()
 
        self.items = []
        self.spider = spider
        dispatcher.connect(self._item_passed, signals.item_passed)
    #__init__
    
    def _item_passed(self, item):
        self.items.append(item)
    # _item_passed
    
    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        self.crawler.stop()
        self.result_queue.put(self.items)
    #run
Example #11
def _crawl(path):
    from scrapy.crawler import CrawlerProcess
    crawl = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'LOG_FILE': 'text.log',
        'LOG_LEVEL': 'INFO'
    })
    crawl.crawl(ProvinceSpider)
    crawl.start()
    crawl.stop()
Example #12
def main():

    repos = []
    repos_conf = obtain_yaml(CONFIG_REPO_PATH, CONFIG_REPO_URL,
                             CONFIG_REPO_BRANCH, CONFIG_PATH)

    rules = obtain_yaml(RULES_REPO_PATH, RULES_REPO_URL,
                        RULES_REPO_BRANCH)

    write_ssh_keys(SSH_DIR, MORPH_SSH_PRIV_KEY_ENV, MORPH_SSH_PUB_KEY_ENV,
                   SSH_PRIV_KEY_PATH, SSH_PUB_KEY_PATH)

    if ismorpio():
        write_ssh_command(GIT_SSH_COMMAND_PATH, GIT_SSH_COMMAND_MORPHIO)
    else:
        write_ssh_command(GIT_SSH_COMMAND_PATH, GIT_SSH_COMMAND)

    write_ssh_key_server(GITHUB_SSH_PUB_KEY, SSH_PUB_KEY_SERVER_PATH)

    for repo_conf in repos_conf:
        logger.debug('repo name %s' % repo_conf.get('name'))
        repo = pull_or_clone(DATA_REPO_PATH, repo_conf.get('url'),
                             DATA_REPO_BRANCH, repo_conf.get('name'),
                             GIT_SSH_COMMAND_PATH, False)
        repos.append(repo)

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())

    for rule in rules:
        policies_path = create_data_file_path(rule, DATA_REPO_PATH)
        process.crawl(
            'policies', policies_path=policies_path, url=rule['url'],
            xpath=rule['xpath'])
    logger.debug('starting crawler')
    # the script will block here until the crawling is finished
    process.start()
    process.stop()

    for repo in repos:
        commit_push_if_changes(repo, GIT_AUTHOR_NAME, GIT_AUTHOR_EMAIL,
                               GIT_SSH_COMMAND_PATH, DATA_REPO_BRANCH,
                               METADATA_PATH)

    # logger.debug('CHECKING SSH KEYS')
    # logger.debug('===================')
    # for repo in repos:
    #     check_ssh_keys(repo, GIT_SSH_COMMAND_PATH, SSH_PRIV_KEY_PATH,
    #                SSH_PUB_KEY_PATH, SSH_PUB_KEY_SERVER_PATH)

    sys.exit()
Example #13
def run_water_spider(startDate, endDate, **kwargs):
    water_crawlerProcess = CrawlerProcess(settings)
    water_crawlerProcess.install()
    water_crawlerProcess.configure()
    
    spider = WaterSpider("8735180", sys.argv[1], sys.argv[2])
    water_crawlerProcess.crawl(spider)
    try:
        water_crawlerProcess.start()
        water_crawlerProcess.stop()
        water_crawlerProcess.uninstall()
    except Exception as e:
        print e
Example #14
def run_wind_spider(startDate, endDate, **kwargs):
    wind_crawlerProcess = CrawlerProcess(settings)
    wind_crawlerProcess.install()
    wind_crawlerProcess.configure()
    spider2 = WindSpider("dpia1", sys.argv[1], sys.argv[2])

    wind_crawlerProcess.crawl(spider2)
    try:
        wind_crawlerProcess.start()
        wind_crawlerProcess.stop()
        wind_crawlerProcess.uninstall()
    except Exception as e:
        print e
Example #15
def runscrapy(stationID, startDate, endDate, **kwargs):
    crawlerProcess = CrawlerProcess(settings)
    crawlerProcess.install()
    crawlerProcess.configure()

    spider = Spider(stationID, startDate, endDate)
    crawlerProcess.crawl(spider)
    try:
        crawlerProcess.start()
        crawlerProcess.stop()
        crawlerProcess.uninstall()
    except Exception as e:
        print e
Example #16
def start_my_crawl(builder):
    settings = Settings()
    # settings.set('DEPTH_LIMIT', 1)
    settings.set("ITEM_PIPELINES", {
        # 'app.pipelines.JsonWriterPipeline': 200,
        'app.pipelines.DataBasePipeline': 300,
    })

    process = CrawlerProcess(settings)
    process.crawl(MyCrawlSpider, builder=builder)
    process.start()
    print('process.stop()')
    process.stop()
Example #17
def _runCrawler(spider, results):
        settings_module = importlib.import_module('Extractors.HTMLScraper.settings')
        settings = CrawlerSettings(settings_module)
        crawlerProcess = CrawlerProcess(settings)
        items = []

        def _item_passed(item, response, spider):
                items.append(item)

        dispatcher.connect(_item_passed, signals.item_scraped)

        crawler = crawlerProcess.create_crawler("currentCrawler")
        crawler.crawl(spider)
        crawlerProcess.start()
        crawlerProcess.stop()
        results.put(items)
Example #18
class CrawlerWorker(Process):
    def __init__(self, spider, results, **kwargs):
        Process.__init__(self)
        self.results = results
        self.crawler = CrawlerProcess(get_project_settings())
        self.items = []
        self.spider = spider
        self.kwargs = kwargs
        dispatcher.connect(self._item_scraped, signals.item_scraped)

    def _item_scraped(self, item, spider):
        self.items.append(item)

    def run(self):
        self.crawler.crawl(self.spider, **self.kwargs)
        self.crawler.start()
        self.crawler.stop()
        self.results.put(self.items)
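
A possible way to drive the CrawlerWorker above (illustrative only: MySpider stands in for any spider class, and the extra keyword arguments are simply forwarded to the spider):

from multiprocessing import Queue

results = Queue()
worker = CrawlerWorker(MySpider, results, category='books')  # MySpider is a placeholder
worker.start()
items = results.get()  # read before join() so a large result cannot block the child
worker.join()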
Example #19
class CrawlerWorker(Process):
    def __init__(self, spider, query, results):
        Process.__init__(self)
        self.results = results
        self.items = []
        self.query = query
        self.spider = spider
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def run(self):
        #self.crawler = CrawlerProcess(get_project_settings())
        self.crawler = CrawlerProcess(Settings())
        self.crawler.crawl(self.spider, self.query)
        self.crawler.start()
        self.crawler.stop()
        self.results.put(self.items)
class CrawlerScript():

    def __init__(self, spider, results):
        self.results = results
        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()
        self.items = []
        self.spider = spider
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        self.crawler.stop()
        self.results.put(self.items)
Example #21
class CrawlerScript():
    def __init__( self ):
        self.crawler = CrawlerProcess( Settings() )
        self.crawler.install()
        self.crawler.configure()
    def _crawl( self, queue, search ):
        log.start( loglevel = log.DEBUG )
        current_spider = CraigslistSpider()
        if search:
            current_spider.set_search_url( search )
        self.crawler.crawl( current_spider )
        self.crawler.start()
        self.crawler.stop()
        queue.put( current_spider.get_object_list() )
    def crawl( self, search = "" ):
        q = Queue()
        p = Process( target = self._crawl, args = ( q, search ) )
        p.start()
        p.join()
        return q.get()
class RunCrawler():
    """RunCrawler runs a crawler in a separate process.

    Useful sources:
    https://groups.google.com/forum/?fromgroups#!topic/scrapy-users/8zL8W3SdQBo
    http://stackoverflow.com/questions/13437402/how-to-run-scrapy-from-within-a-python-script
    """
    def __init__(self, settings):
        self.crawler = CrawlerProcess(settings)
        self.crawler.configure()

    def _crawl(self, spider):
        self.crawler.crawl(spider)
        self.crawler.start()
        self.crawler.stop()

    def crawl(self, spider):
        p = Process(target=self._crawl, args=(spider,))
        p.start()
        p.join()
Example #23
class CrawlerWorker():
    def __init__(self):
        self.process = CrawlerProcess(Settings({
            'ITEM_PIPELINES': {
                'pipelines.MongoDBPipeline': 1
            },
            'MONGO_URI': 'mongodb://localhost:27017/',
            'MONGO_DATABASE': 'realto'
        }))

        self.items = []
        dispatcher.connect(self.item_passed, signals.item_passed)

    def item_passed(self, item):
        self.items.append(item)

    def run(self, config):
        self.process.crawl(IrrSpider, **config)
        self.process.start()
        self.process.stop()
        return self.items
class CrawlerScript():

    def __init__(self):
        self.crawler = CrawlerProcess(settings)
        self.items = []
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def _crawl(self, queue, spider_name):
        spider = self.crawler.spiders.create(spider_name)
        if spider:
            self.crawler.queue.append_spider(spider)
        self.crawler.start()
        self.crawler.stop()
        queue.put(self.items)
    
    def crawl(self, spider):
        queue = Queue()
        p = Process(target=self._crawl, args=(queue, spider,))
        p.start()
        p.join()
        return queue.get(True)
Example #25
class CrawlerWorker(Process):
    def __init__(self, spider, results):
        Process.__init__(self)

        self.results = results     
        settings_module = importlib.import_module('Extractors.HTMLScraper.settings')
        settings = CrawlerSettings(settings_module)
        self.crawlerProcess = CrawlerProcess(settings)

        self.items = []
        self.spider = spider
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def run(self):
        crawler = self.crawlerProcess.create_crawler("currentCrawler")
        crawler.crawl(self.spider)
        self.crawlerProcess.start()
        self.crawlerProcess.stop()
        self.results.put(self.items)
class DomainCrawlerScript():

    def __init__(self):
        self.crawler = CrawlerProcess(settings)
        self.crawler.install()
        self.crawler.configure()

    def _crawl(self, domain_pk):
        domain = Domain.objects.get(
            pk = domain_pk,
        )
        urls = []
        for page in domain.pages.all():
            urls.append(page.url())
        self.crawler.crawl(DomainSpider(urls))
        self.crawler.start()
        self.crawler.stop()

    def crawl(self, domain_pk):
        p = Process(target=self._crawl, args=[domain_pk])
        p.start()
        p.join()
Example #27
class AccountCrawler():

    def __init__(self):
        self.crawler = CrawlerProcess(settings)

    def _crawl(self, origin_oj, username):
        self.crawler.crawl(
            origin_oj + '_user',
            username=username,
        )
        self.crawler.start()
        self.crawler.stop()

    def crawl(self, origin_oj, username):
        p = Process(
            target=self._crawl,
            args=[
                origin_oj,
                username,
            ]
        )
        p.start()
        p.join()
Example #28
class CrawlerWorker(Process):

    def __init__(self, spider, result_list, settings=None):
        Process.__init__(self)
        self.result_queue = result_list

        if settings is None:
            settings = Settings()

        self.crawler = CrawlerProcess(settings)
        self.crawler.create_crawler(spider.__class__.__name__)
        self.crawler.crawlers['spider'] = spider
        self.spider = spider
        self.items = []
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)
        print "here"

    def run(self):
        self.crawler.start()
        self.crawler.stop()
        self.result_queue.put(self.items)
Example #29
class CrawlerWorker(multiprocessing.Process):
 
    def __init__(self, result_queue, url):
        multiprocessing.Process.__init__(self)
        self.result_queue = result_queue
        self.url=url
 
        self.crawler = CrawlerProcess(Settings())
        #if not hasattr(project, 'crawler'):
         #   self.crawler.install()
        self.crawler.configure()
 
        self.items = []
        self.spider = RecipeSpider(url)
        dispatcher.connect(self._item_passed, signals.item_passed)
 
    def _item_passed(self, item):
        self.items.append(item)
  
    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        self.crawler.stop()
        self.result_queue.put(self.items)
def _run_crawl_process(**kwargs):
  #log.start must be explicitly called
  log.start(loglevel=getattr(django_settings, 'SCRAPY_LOG_LEVEL', 'INFO'))

  # region How to run a crawler in-process
  # examples on how to get this stuff:
  # http://stackoverflow.com/questions/14777910/scrapy-crawl-from-script-always-blocks-script-execution-after-scraping?lq=1
  # http://stackoverflow.com/questions/13437402/how-to-run-scrapy-from-within-a-python-script
  # http://stackoverflow.com/questions/7993680/running-scrapy-tasks-in-python
  # http://stackoverflow.com/questions/15564844/locally-run-all-of-the-spiders-in-scrapy
  # https://groups.google.com/forum/#!topic/scrapy-users/d4axj6nPVDw
  # endregion

  crawler = CrawlerProcess(settings)
  crawler.install()
  crawler.configure()
  spider = crawler.spiders.create(kwargs['spider'], **kwargs)
  crawler.crawl(spider)


  log.msg('Spider started...')
  crawler.start()
  log.msg('Spider stopped.')
  crawler.stop()
Example #31
class Processor(Process):
    """Start a twisted reactor and run the provided scrapy spiders.
    Blocks until all have finished.
    """
    def __init__(self, settings=None):
        """
        Parms:
          settings (scrapy.settings.Settings) - settings to apply.  Defaults
        to Scrapy default settings.
        """
        kwargs = {"ctx": __import__("billiard.synchronize")}

        self.results = Queue(**kwargs)
        self.items = []
        self.settings = settings or Settings()
        dispatcher.connect(self._item_scraped, signals.item_scraped)

    def _item_scraped(self, item):
        self.items.append(item)

    def _crawl(self, requests):
        """
        Parameters:
            requests (Request) - One or more Jobs. All will
                                 be loaded into a single invocation of the reactor.
        """
        self.crawler = CrawlerProcess(self.settings)

        # crawl can be called multiple times to queue several requests
        for req in requests:
            self.crawler.crawl(req.spider, *req.args, **req.kwargs)

        self.crawler.start()
        self.crawler.stop()
        self.results.put(self.items)

    def run(self, jobs):
        """Start the Scrapy engine, and execute all jobs.  Return consolidated results
        in a single list.

        Parms:
          jobs ([Job]) - one or more Job objects to be processed.

        Returns:
          List of objects yielded by the spiders after all jobs have run.
        """
        if not isinstance(jobs, collections.abc.Iterable):
            jobs = [jobs]
        self.validate(jobs)

        p = Process(target=self._crawl, args=[jobs])
        p.start()
        results = self.results.get()
        p.join()
        p.terminate()

        return results

    def validate(self, jobs):
        if not all([isinstance(x, Job) for x in jobs]):
            raise ScrapyScriptException("scrapyscript requires Job objects.")
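
A possible invocation of the Processor above, assuming the companion Job class that validate() checks for (in the scrapyscript package a Job wraps a spider class plus its constructor arguments) and a placeholder spider:

from scrapy import Spider


class DemoSpider(Spider):
    # Placeholder spider used only to illustrate the call.
    name = 'demo'
    start_urls = ['https://example.com']

    def parse(self, response):
        yield {'url': response.url}


job = Job(DemoSpider)                        # Job is assumed to be provided alongside Processor
items = Processor(settings=None).run([job])  # blocks until the crawl has finished
print(items)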
Example #32
def f(self, urls, _video=None):
    process = CrawlerProcess(get_project_settings())
    process.crawl(Moocspider, urls=urls, video=_video)
    process.start(stop_after_crawl=True)
    process.stop()
Example #33
class CrawlerProcessScript(object):
    """
        Creates multiple crawlers and call them sequentially
        Crawler names should follow this naming convention:
            spider_name + _ + city + _ + category
        crawlers : keeps track of all crawlers run, so to get their stats after they are finished.
    """
    def __init__(self, dsite_name='', updating=False):
        self.updating = str(updating)
        self.dsite = DSite.objects.get(name=dsite_name)
        self.crawler_process = CrawlerProcess(get_project_settings())
        self.crawlers = {}

    def _add_crawler(self,
                     crawler_name,
                     city_mapping_pk=None,
                     category_mapping_pk=None):
        crawler = self.crawler_process.create_crawler(crawler_name)
        spider = crawler.spiders.create(
            self.dsite.name,
            dsite_pk=self.dsite.pk,
            city_mapping_pk=city_mapping_pk,
            category_mapping_pk=category_mapping_pk,
            updating=self.updating)
        crawler.crawl(spider)
        self.crawlers[crawler_name] = crawler

    def _create_crawlers(self):
        if self.dsite.has_both_mappings:
            for city_mapping in CityMapping.objects.filter(dsite=self.dsite):
                for category_mapping in CategoryMapping.objects.filter(
                        dsite=self.dsite, all_cities=False):
                    crawler_name = self.dsite.name + '_' + city_mapping.site_city + '_' + category_mapping.site_category
                    self._add_crawler(crawler_name=crawler_name,
                                      category_mapping_pk=category_mapping.pk,
                                      city_mapping_pk=city_mapping.pk)
            if self.dsite.has_category_mapping:
                for category_mapping in CategoryMapping.objects.filter(
                        dsite=self.dsite, all_cities=True):
                    crawler_name = self.dsite.name + '_' + category_mapping.site_category
                    self._add_crawler(crawler_name=crawler_name,
                                      category_mapping_pk=category_mapping.pk)
        elif self.dsite.has_city_mapping:
            for city_mapping in CityMapping.objects.filter(dsite=self.dsite):
                crawler_name = self.dsite.name + '_' + city_mapping.site_city
                self._add_crawler(crawler_name=crawler_name,
                                  city_mapping_pk=city_mapping.pk)
            if self.dsite.has_category_mapping:
                for category_mapping in CategoryMapping.objects.filter(
                        dsite=self.dsite, all_cities=True):
                    crawler_name = self.dsite.name + '_' + category_mapping.site_category
                    self._add_crawler(crawler_name=crawler_name,
                                      category_mapping_pk=category_mapping.pk)
        elif self.dsite.has_category_mapping:
            for category_mapping in CategoryMapping.objects.filter(
                    dsite=self.dsite):
                crawler_name = self.dsite.name + '_' + category_mapping.site_category
                self._add_crawler(crawler_name=crawler_name,
                                  category_mapping_pk=category_mapping.pk)

    def start(self):
        self._create_crawlers()
        self.crawler_process.start()
        self.crawler_process.stop()
        self.crawler_process.stop_reactor()

    def dump_stats(self):
        for crawler_name, crawler in self.crawlers.iteritems():
            print crawler_name
            print crawler.stats.get_stats()
Example #34
def run_config(config):
    config = ConfigLoader(config)
    CustomMiddleware.driver = config.driver
    DocumentationSpider.NB_INDEXED = 0

    if config.use_anchors:
        from . import scrapy_patch

    strategy = DefaultStrategy(config)

    algolia_helper = AlgoliaHelper(
        config.app_id, config.api_key, config.index_name,
        AlgoliaSettings.get(config, strategy.levels))

    DOWNLOADER_MIDDLEWARES_PATH = 'scraper.src.custom_middleware.CustomMiddleware'
    DOWNLOADER_CLIENTCONTEXTFACTORY = 'scraper.src.scrapy_patch.CustomContextFactory'

    if __name__ == '__main__':
        DOWNLOADER_MIDDLEWARES_PATH = 'src.custom_middleware.CustomMiddleware'
        DOWNLOADER_CLIENTCONTEXTFACTORY = 'src.scrapy_patch.CustomContextFactory'

    process = CrawlerProcess({
        'LOG_ENABLED': '1',
        'LOG_LEVEL': 'ERROR',
        # 'LOG_LEVEL': 'DEBUG',
        'USER_AGENT': config.user_agent,
        'DOWNLOADER_MIDDLEWARES': {
            DOWNLOADER_MIDDLEWARES_PATH: 900
        },
        # Need to be > 600 to be after the redirectMiddleware
        'DOWNLOADER_CLIENTCONTEXTFACTORY': DOWNLOADER_CLIENTCONTEXTFACTORY
    })

    process.crawl(DocumentationSpider,
                  config=config,
                  algolia_helper=algolia_helper,
                  strategy=strategy)

    process.start()
    process.stop()

    # Kill browser if needed
    BrowserHandler.destroy(config.driver)

    if len(config.extra_records) > 0:
        algolia_helper.add_records(config.extra_records, "Extra records")

    if len(Camelizer.synonyms) > 0:
        algolia_helper.add_synonyms(Camelizer.synonyms)

    print("")

    if DocumentationSpider.NB_INDEXED > 0:
        algolia_helper.commit_tmp_index()
        print('Nb hits: ' + str(DocumentationSpider.NB_INDEXED))
        config.update_nb_hits(DocumentationSpider.NB_INDEXED)
    else:
        print('Crawling issue: nbHits 0 for ' + config.index_name)
        algolia_helper.report_crawling_issue()

    print("")
Example #35
def runScrape(
    page="",
    jumps=0
):  #this procedure is responsible for instantiating the scraper and scraping the required webpage
    # tries to connect to the database first so the scraper doesn't run without the database being ready.
    try:
        mydb = mysql.connector.connect(  # connects to database
            host="localhost",
            user=username,
            password=password,
            database='websites',
            auth_plugin='mysql_native_password')
    except mysql.connector.InterfaceError:
        print("""The connection to the database has been unsuccessful.
Please make sure the SQL server is running, and the database has been initialised.
To initialise database please type \"CREATE DATABASE websites;\" in a suitable SQL terminal and make sure the admin username and password have been entered into credentials.txt"""
              )
        sys.exit()
    mydb.close()
    if jumps <= 0:  #checks if the max number of jumps has been modified from the default value
        validInput = False
        while validInput == False:
            jumps = input(
                "Please input the max number of jumps to be performed by the scraper \n"
            )
            try:  # we only validate the input here; the terminal-only system checks this automatically
                # int() raises ValueError if the input is not an integer,
                # in which case it has to be requested from the user again.
                int(jumps)
                if int(jumps) > 0 and float(jumps) % 1 == 0:  # using modulus to check for a whole number
                    validInput = True
                else:
                    print("Please input a valid positive integer.")
            except ValueError:
                validInput = False
                print("Please input a valid positive integer.")

    if page == "":  #checks if the page has been passed as a parameter and if it hasn't then executes the following code
        website = input("Please input the website you wish to scrape: ")
    else:
        website = page

    # trims away anything that trails after the first / and all references to http or https making it into a domain
    domain = website.replace("https://", "").replace("http://",
                                                     "").split("/", 1)[0]

    #this part starts up the scraper with all the required parameters
    #region scraper start
    ScraperWithLimit.allowed_domains = [domain]
    ScraperWithLimit.start_urls = [website]
    ScraperWithLimit.custom_settings = {
        'DEPTH_LIMIT': jumps,
        'DEPTH_PRIORITY': 0,
    }
    process = CrawlerProcess()
    process.crawl(ScraperWithLimit)
    process.start()
    process.stop()
    #endregion
    #region database setup
    try:
        mydb = mysql.connector.connect(  # connects to database
            host="localhost",
            user=username,
            password=password,
            database='websites',
            auth_plugin='mysql_native_password')
    except mysql.connector.InterfaceError:
        print("""The connection to the database has been unsuccessful.
Please make sure the SQL server is running, and the database has been initialised.
To initialise database please type \"CREATE DATABASE websites;\" in a suitable SQL terminal and make sure the admin username and password have been entered into credentials.txt"""
              )
        sys.exit()
    mycursor = mydb.cursor()  #initialises cursor so that commands can be sent

    # drops the table if it already exists
    mycursor.execute("DROP TABLE IF EXISTS `%s`;" % domain)
    time.sleep(.25)  # sleeps the thread as the deletion actually overlaps with the creation

    # creates a table with the name of the domain being scraped.
    mycursor.execute(
        "CREATE TABLE `%s`(AutoID INT NOT NULL AUTO_INCREMENT PRIMARY KEY, OriginURL VARCHAR(2000) NOT NULL, Hyperlink VARCHAR(2000) NOT NULL);"
        % (domain))
    time.sleep(.25)  # as statements execute asynchronously, this pause keeps the SQL statements in the correct order
    mydb.commit()
    #endregion

    # backticks are used so that any character can be accepted aka the . in the URL. The surrounding '' are used so that MySQL doesn't mistake them for table references
    query = "INSERT INTO `" + domain + "` VALUES (NULL, %s, %s);"

    if domain[-1] == "/":  #this removes a trailing slash at the end of URLs
        domain = domain[:-1]

    #this is so that any values that have the domain appended later will also contain the domains transfer protocol
    if "https://" in website:
        domain = "https://" + domain
    elif "http://" in website:
        domain = "http://" + domain
    print("Writing to database. This may take a while.")

    for originURL, hyperlinks in dictOfUrl.items():
        #removes the trailing slash to make sure all links are identical. (http://blah.com/ is the same as http://blah.com for example, but when string comparisons are done, they are different.)
        if originURL[-1] == "/":
            originURL = originURL[:-1]
        for item in hyperlinks:
            if item != "":
                if len(item) > 1 or "http" in item:  #ignores all anchor links
                    if item[-1] == "/":  #removes any trailing slashes.
                        item = item[:-1]

                    if "http" in item:
                        queryParameters = (
                            originURL,
                            item,
                        )
                        mycursor.execute(query, queryParameters)

                    elif item[0] != '#' and item[0] == "/":  # no "http" in the item and not an anchor, so assume a relative link
                        queryParameters = (
                            originURL,
                            domain + item,
                        )  #appends the domain name to relative paths
                        # actually executes the SQL statement with the parameters place of %s. This method also removes any SQL injection attempts
                        mycursor.execute(query, queryParameters)
                        # testFile.write(originURL + ", " + domain + item + "\n") #test statement to see if any of this even works
                    elif item != '#':
                        queryParameters = (
                            originURL,
                            domain + '/' + item,
                        )  #appends the domain name and a slash to relative paths that are using interactive link. Seems to be rare but some websites do have it
                        mycursor.execute(query, queryParameters)
                    mydb.commit()  #commits the changes to the database
                elif item == "/":  #this is to make sure that a / redirects to the home page.
                    queryParameters = (originURL, domain)
                    mycursor.execute(query, queryParameters)
                    mydb.commit()

    mycursor.close()
    mydb.close()
    return True  #this is done so that there can be confirmation that the program has stopped running as problems arose due to seeming asynchronous execution of select statements while values were being inserted
Example #36
class VineyardSpider:
    def __init__(self, driver: webdriver, destination: str, log=True):
        """Start an instance of the VineyardSpider, which can then be used
        to download data from http://www.biodynamicfood.org/"""
        self.driver = driver
        self.destination = destination
        self.log = log
        self._create_csv()
        self.time = None
        self.process = None

    def _create_csv(self):
        if not os.path.isfile(self.destination):
            with open(self.destination, 'w') as output:
                writer = csv.writer(output)
                writer.writerow([
                    'Name', 'Date', 'Category', 'Address', 'Phone', 'Email',
                    'Website', 'Short description', 'Description', 'Crops',
                    'Processed products', 'Cropped_acreage', 'Total_acreage'
                ])
        else:
            pass

    def load_vineyards(self, link: str, time: date):
        self.time = time

        self.driver.get(link)

        # Select category 'Crops'
        product_selector = self.driver.find_element_by_id('filter_3_primary')
        product_selector.click()
        product_selector.send_keys(Keys.ARROW_DOWN)
        product_selector.send_keys(Keys.ENTER)
        sleep(0.3)
        if self.log:
            print('Loading category "Crops" successful.')

        # Select subcategory 'Fruit'
        crop_selector = self.driver.find_element_by_id('filter_3_secondary')
        crop_selector.click()
        crop_selector.send_keys(Keys.ARROW_DOWN)
        crop_selector.send_keys(Keys.ARROW_DOWN)
        crop_selector.send_keys(Keys.ENTER)
        sleep(0.3)
        if self.log:
            print('Loading subcategory "Fruit" successful.')

        # Select 'Grapes For Wine'
        fruit_selector = self.driver.find_element_by_id('filter_3_tertiary')
        fruit_selector.click()
        for i in range(19):
            sleep(0.1)
            fruit_selector.send_keys(Keys.ARROW_DOWN)
        fruit_selector.send_keys(Keys.ENTER)
        if self.log:
            print('Loading "Grapes For Wine" successful.')

        # Load all producers
        load_button = self.driver.find_element_by_id('scrollDown')
        n_elements = len(
            self.driver.find_elements_by_class_name('results_list_item'))
        while True:
            if self.log:
                print('Loading more vineyards.')
            for i in range(20):
                load_button.click()
                sleep(0.1)
            n_elements_new = len(
                self.driver.find_elements_by_class_name('results_list_item'))
            if n_elements < n_elements_new:
                n_elements = n_elements_new
            else:
                if self.log:
                    print('Loaded all vineyards.')
                break

    def get_vineyard_links(self):
        selector = Selector(text=self.driver.page_source)
        links = [
            'http://www.biodynamicfood.org' + link for link in selector.xpath(
                '//*[@class="results_list_item"]//a/@href').extract()
        ]
        return links

    def prepare_vineyard_parsing(self, links):
        destination = self.destination
        time = self.time

        class GenericSpider(Spider):
            name = 'vineyards'
            allowed_domains = []
            start_urls = links

            def parse(self, response):
                sel = Selector(response)
                name = sel.xpath('//h1/text()').extract_first()
                category = sel.xpath(
                    '//h2[@class="business-type"]/text()').extract_first()

                address_field_1 = sel.xpath(
                    '//div[@class="member-address"]/p/text()[1]'
                ).extract_first().strip()
                address_field_2 = sel.xpath(
                    '//div[@class="member-address"]/p/text()[2]'
                ).extract_first().strip()
                address = address_field_1 + '\n' + address_field_2

                contact_info = sel.xpath(
                    '//div[@class="member-address"]/p/text()').extract()
                contact_info = [line.strip() for line in contact_info]
                phone = [
                    line for line in contact_info if line.startswith('Phone: ')
                ][0]
                phone = phone.replace('Phone: ', '')
                email = sel.xpath('//div[@class="member-address"]//a[1]/text()'
                                  ).extract_first()
                website = sel.xpath(
                    '//div[@class="member-address"]//a[2]/text()'
                ).extract_first()
                short_description = sel.xpath(
                    '//p[@class="quote"]/text()').extract_first()

                profile = sel.xpath(
                    '//div[@class="member-profile"]/div/p/text()').extract()
                profile = [element.strip() for element in profile]
                len_description = max([len(element) for element in profile])
                description = [
                    element for element in profile
                    if len(element) == len_description
                ][0]
                crops = sel.xpath(
                    '//div[p/*/text()="Crops"]//li//text()').extract()
                crops = ', '.join(crops)
                processed_products = sel.xpath(
                    '//div[p/*/text()="Processed Product"]//li//text()'
                ).extract()
                processed_products = ', '.join(processed_products)

                all_text = sel.xpath('//p/text()').extract()
                all_text = [text.strip() for text in all_text]
                acreage = [text for text in all_text if 'Acres' in text]
                try:
                    cropped_acreage = acreage[0]
                    total_acreage = acreage[1]

                except IndexError:
                    print('Acreage not specified for one organization.')
                    cropped_acreage = ''
                    total_acreage = ''

                with open(destination, 'a', newline='') as output:
                    writer = csv.writer(output)
                    writer.writerow([
                        name, time, category, address, phone, email, website,
                        short_description, description, crops,
                        processed_products, cropped_acreage, total_acreage
                    ])

        # Run spider
        self.process = CrawlerProcess({
            'USER_AGENT':
            'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
        })
        self.process.crawl(GenericSpider)

    def parse_organizations(self):
        self.process.start()

    def close_parser(self):
        self.process.stop()
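
A possible end-to-end call sequence for the class above; the Chrome driver, output path, and date are assumptions, while the URL comes from the class docstring:

from datetime import date

from selenium import webdriver

driver = webdriver.Chrome()
spider = VineyardSpider(driver, destination='vineyards.csv')
spider.load_vineyards('http://www.biodynamicfood.org/', date.today())
links = spider.get_vineyard_links()
spider.prepare_vineyard_parsing(links)
spider.parse_organizations()  # blocks while Scrapy crawls the collected links
spider.close_parser()
driver.quit()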
Example #37
def execute(cat_urls):
    process = CrawlerProcess(get_project_settings())
    process.crawl(ZoomSpider, cat_urls=cat_urls)
    process.start()
    process.stop()
Example #38
def main():
    settings = get_project_settings()

    parser = argparse.ArgumentParser()
    parser.add_argument('-d',
                        '--debug',
                        help='Set logging level to debug',
                        action='store_true')
    parser.add_argument('-v',
                        '--version',
                        action='version',
                        help='version',
                        version='%(prog)s ' + version)
    parser.add_argument('-o',
                        '--outputdir',
                        help='output local path',
                        default=conf.DATA_LOCAL_REPO_PATH)
    parser.add_argument('-c',
                        '--crawl',
                        help='Crawl AFAQ.',
                        action='store_true')
    parser.add_argument('-g',
                        '--pull',
                        help='Pull git repo before obtaining AFAQ.',
                        action='store_true')
    parser.add_argument('-p',
                        '--push',
                        help='Push to git repo AFAQ changes.',
                        action='store_true')
    parser.add_argument('-r',
                        '--rm',
                        help='Remove content outputdir before crawling.',
                        action='store_true')
    parser.add_argument('-m',
                        '--convert',
                        help='Convert obtained AFAQ to other formats.',
                        action='store_true')
    parser.add_argument('-a',
                        '--all',
                        help='Equivalent to -g, -c, -p, -r, -m.',
                        action='store_true',
                        default=True)

    args = parser.parse_args()
    if args.all is True:
        args.pull = args.crawl = args.push = args.convert = args.rm = \
            args.debug = True

#    configure_logging()
    logging.basicConfig(format=conf.LOG_FORMAT)
    logging.getLogger('scrapy').propagate = False
    logger = logging.getLogger('root')
    logger.propagate = True

    if args.debug is True:
        logger.setLevel(logging.DEBUG)


#        settings.set('DEBUG', True)
#        conf.DEBUG = True

    if args.pull is True:
        # Write ssh keys and command needed for git_utils
        if not os.path.isdir(conf.SSH_PATH):
            os.makedirs(conf.SSH_PATH)
            logger.debug('Created ssh dir: %s.', conf.SSH_PATH)
        if system.ismorpio():
            git_utils.write_ssh_keys(conf.SSH_DIR, conf.MORPH_SSH_PRIV_KEY_ENV,
                                     conf.MORPH_SSH_PUB_KEY_ENV,
                                     conf.SSH_PRIV_KEY_PATH,
                                     conf.SSH_PUB_KEY_PATH)
            git_utils.write_ssh_command(conf.GIT_SSH_COMMAND_PATH,
                                        conf.GIT_SSH_COMMAND_MORPHIO)
        else:
            git_utils.write_ssh_command(conf.GIT_SSH_COMMAND_PATH,
                                        conf.GIT_SSH_COMMAND)
        git_utils.write_ssh_key_server(conf.GITLAB_SSH_PUB_KEY,
                                       conf.SSH_PUB_KEY_SERVER_PATH)

        # Pull or clone the data repos
        logger.debug('Remote repo name %s' % conf.DATA_REMOTE_REPO.get('name'))
        local_repo, remote_repo = git_utils.obtain_repo(
            conf.DATA_LOCAL_REPO_PATH, conf.DATA_REMOTE_REPO,
            conf.GIT_SSH_COMMAND_PATH, False)
    if args.rm is True:
        # rm files in case they are deleted in the sources
        # TODO: if this is removed then the files removed should be
        # detected on git commit
        system.rm_data(conf.DATA_LOCAL_REPO_PATH)

    if args.crawl is True:
        # Run the scraper
        process = CrawlerProcess(settings)
        process.crawl('afaq')
        process.start()
        process.stop()

    if args.convert is True:
        # Conversions
        convert.convert_dir(conf.HTML_PATH, conf.MD_PATH, convert.html2md,
                            '.md')
        # NOTE: since there is already md, txt is not needed
        #convert.html2txt(conf.HTML2TXT_COMMAND, conf.HTML_PATH)

    if args.push is True:
        # Push the scraped data in the repos
        git_utils.commit_push_if_changes(local_repo, conf.GIT_AUTHOR_NAME,
                                         conf.GIT_AUTHOR_EMAIL,
                                         conf.GIT_SSH_COMMAND_PATH,
                                         conf.DATA_REMOTE_REPO,
                                         conf.METADATA_PATH)
Example #39
if __name__ == '__main__':
    load_dotenv(find_dotenv())
    trademarks_list_file = os.getenv("TRADEMARKS")
    timeframe = os.getenv("TIMEFRAME")
    configure_logging(install_root_handler=False)
    session_dir = os.path.abspath(os.getenv("SESSIONDIR"))
    mkdir_p(session_dir)
    log_dir = os.path.abspath(os.getenv("LOGDIR", os.path.join(SCRIPT_DIR, 'Log')))
    log_file_name = 'feefo_' + str(datetime.utcnow())[:19].replace('-', '').replace(':', '').replace(' ', '_') + '.log'
    mkdir_p(log_dir)
    logging.basicConfig(
        filename=os.path.join(log_dir, log_file_name),
        format='%(levelname)s: %(message)s',
        level=logging.INFO
    )
    if trademarks_list_file and timeframe:
        try:
            with open(os.path.abspath(trademarks_list_file), 'r') as f:
                trademarks = f.readlines()
        except Exception as exc:
            print(f'File {trademarks_list_file} open error: {exc}')
            raise SystemExit
        else:
            start_urls = [FeefoSpider.base_url.format(trademark=trademark.strip(), timeframe=timeframe)
                          for trademark in trademarks]
            process = CrawlerProcess(get_project_settings())
            process.crawl(FeefoSpider, start_urls=start_urls, session_dir=session_dir, connection=Connection())
            process.start()
            process.stop()

Example #40
def run(self, in_theater_ids, out_theater_ids):
    process = CrawlerProcess(self.scrapy_settings)
    process.crawl('get_movies', in_theater_ids, out_theater_ids)
    process.start()
    process.stop()
Example #41
def execute(urls):
    process = CrawlerProcess(get_project_settings())
    process.crawl(KabumSpider, categories=urls)
    process.start()
    process.stop()
Example #42
def _crawl(path=None):
    crawl = CrawlerProcess(
        {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})
    crawl.crawl(ProvinceSpider)
    crawl.start()
    crawl.stop()
Example #43
def execute(tags_urls):
    process = CrawlerProcess({'LOG_LEVEL': LOG_LEVEL})

    process.crawl(TagSpider, tags=';'.join(tags_urls))
    process.start()
    process.stop()
Example #44
def run_config(config):
    config = ConfigLoader(config)
    CustomDownloaderMiddleware.driver = config.driver
    DocumentationSpider.NB_INDEXED = 0

    strategy = DefaultStrategy(config)

    meilisearch_helper = MeiliSearchHelper(
        config.app_id,
        config.api_key,
        config.index_uid,
        config.custom_settings
    )

    root_module = 'src.' if __name__ == '__main__' else 'scraper.src.'
    DOWNLOADER_MIDDLEWARES_PATH = root_module + 'custom_downloader_middleware.' + CustomDownloaderMiddleware.__name__
    DOWNLOADER_CLIENTCONTEXTFACTORY = root_module + 'scrapy_patch.' + CustomContextFactory.__name__
    DUPEFILTER_CLASS_PATH = root_module + 'custom_dupefilter.' + CustomDupeFilter.__name__

    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en",
    }  # Defaults for scrapy https://docs.scrapy.org/en/latest/topics/settings.html#default-request-headers

    if os.getenv("CF_ACCESS_CLIENT_ID") and os.getenv("CF_ACCESS_CLIENT_SECRET"):
        headers.update(
            {
                "CF-Access-Client-Id": os.getenv("CF_ACCESS_CLIENT_ID"),
                "CF-Access-Client-Secret": os.getenv("CF_ACCESS_CLIENT_SECRET"),
            }
        )
    elif os.getenv("IAP_AUTH_CLIENT_ID") and os.getenv("IAP_AUTH_SERVICE_ACCOUNT_JSON"):
        iap_token = IAPAuth(
            client_id=os.getenv("IAP_AUTH_CLIENT_ID"),
            service_account_secret_dict=json.loads(
                os.getenv("IAP_AUTH_SERVICE_ACCOUNT_JSON")
            ),
        )(requests.Request()).headers["Authorization"]
        headers.update({"Authorization": iap_token})

    DEFAULT_REQUEST_HEADERS = headers

    process = CrawlerProcess({
        'LOG_ENABLED': '1',
        'LOG_LEVEL': 'ERROR',
        'USER_AGENT': config.user_agent,
        'DOWNLOADER_MIDDLEWARES': {DOWNLOADER_MIDDLEWARES_PATH: 900},
        # Need to be > 600 to be after the redirectMiddleware
        'DOWNLOADER_CLIENTCONTEXTFACTORY': DOWNLOADER_CLIENTCONTEXTFACTORY,
        'DUPEFILTER_USE_ANCHORS': config.use_anchors,
        # Use our custom dupefilter in order to be scheme agnostic regarding link provided
        'DUPEFILTER_CLASS': DUPEFILTER_CLASS_PATH,
        'DEFAULT_REQUEST_HEADERS': DEFAULT_REQUEST_HEADERS,
    })

    process.crawl(
        DocumentationSpider,
        config=config,
        meilisearch_helper=meilisearch_helper,
        strategy=strategy
    )

    process.start()
    process.stop()

    # Kill browser if needed
    BrowserHandler.destroy(config.driver)

    if len(config.extra_records) > 0:
        meilisearch_helper.add_records(config.extra_records, "Extra records", False)

    print("")

    if DocumentationSpider.NB_INDEXED > 0:
        # meilisearch_helper.commit_tmp_index()
        print('Nb hits: {}'.format(DocumentationSpider.NB_INDEXED))
        config.update_nb_hits_value(DocumentationSpider.NB_INDEXED)
    else:
        print('Crawling issue: nbHits 0 for ' + config.index_uid)
        # meilisearch_helper.report_crawling_issue()
        exit(EXIT_CODE_NO_RECORD)
    print("")
Example #45
import sched
import time
from scrapy.crawler import CrawlerProcess
from finan.spiders.yobit_spider import YobitSpider
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(
    {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})

process.crawl(YobitSpider)
process.start()
del process

scheduler = sched.scheduler(time.time, time.sleep)
waiting = 10
while True:
    repeated = CrawlerProcess(
        {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})
    repeated.crawl(YobitSpider)
    scheduler.enter(waiting, 2, repeated.start)
    scheduler.run()

    print(
        "\n waiting for %d seconds before more action. Pres CTRL+Z to cancel\n"
        % (waiting))
    repeated.stop()
    del repeated
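
Because a Twisted reactor cannot be restarted inside one Python process, repeated runs are usually pushed into a child process instead of being rescheduled in place. A sketch of that variant, reusing the spider and settings from the example above (the 10-second interval is an assumption):

import time
from multiprocessing import Process

from scrapy.crawler import CrawlerProcess
from finan.spiders.yobit_spider import YobitSpider


def _run_once():
    process = CrawlerProcess(
        {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})
    process.crawl(YobitSpider)
    process.start()


while True:
    # Each iteration gets a fresh process, and with it a fresh reactor.
    p = Process(target=_run_once)
    p.start()
    p.join()
    time.sleep(10)  # wait 10 seconds before the next run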
Example #46
class Processor(Process):
    ''' Start a twisted reactor and run the provided scrapy spiders.
    Blocks until all have finished.
    '''
    def __init__(self, settings=None, item_scraped=True):
        '''
        Parms:
          settings (scrapy.settings.Settings) - settings to apply.  Defaults
        to Scrapy default settings.
        '''
        kwargs = {'ctx': __import__('billiard.synchronize')}

        self.results = Queue(**kwargs)
        self.counts = Queue(**kwargs)
        self.items = {}
        self.items_count = {}
        self.settings = settings or Settings()
        self.item_scraped = item_scraped
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item, response, spider):
        if spider.name not in self.items.keys():
            self.items[spider.name] = []
            self.items_count[spider.name] = 0
        if self.item_scraped is True:
            self.items[spider.name].append(dict(item))

        self.items_count[spider.name] += 1

    def _crawl(self, requests):
        '''
        Parameters:
            requests (Request) - One or more Jobs. All will
                                 be loaded into a single invocation of the reactor.
        '''
        self.crawler = CrawlerProcess(self.settings)

        # crawl can be called multiple times to queue several requests
        for req in requests:
            self.crawler.crawl(req.spider, *req.args, **req.kwargs)

        self.crawler.start()
        self.crawler.stop()
        self.results.put(self.items)
        self.counts.put(self.items_count)

    def run(self, jobs):
        '''Start the Scrapy engine, and execute all jobs.  Return consolidated results
        in a single list.

        Parms:
          jobs ([Job]) - one or more Job objects to be processed.

        Returns:
          List of objects yielded by the spiders after all jobs have run.
        '''
        if not isinstance(jobs, collections.abc.Iterable):
            jobs = [jobs]
        self.validate(jobs)

        p = Process(target=self._crawl, args=[jobs])
        p.start()
        p.join()
        p.terminate()

    def data(self):
        return self.results.get()

    def count(self):
        return self.counts.get()

    def validate(self, jobs):
        if not all([isinstance(x, Job) for x in jobs]):
            raise ScrapyScriptException('scrapy-script requires Job objects.')
Example #47
def run_config(config):
    config = ConfigLoader(config)
    CustomDownloaderMiddleware.driver = config.driver
    DocumentationSpider.NB_INDEXED = 0

    if config.use_anchors:
        from . import scrapy_patch

    strategy = DefaultStrategy(config)

    algolia_helper = AlgoliaHelper(
        config.app_id, config.api_key, config.index_name,
        AlgoliaSettings.get(config, strategy.levels), config.query_rules)

    DOWNLOADER_MIDDLEWARES_PATH = 'scraper.src.custom_downloader_middleware.CustomDownloaderMiddleware'
    DOWNLOADER_CLIENTCONTEXTFACTORY = 'scraper.src.scrapy_patch.CustomContextFactory'
    DUPEFILTER_CLASS_PATH = 'scraper.src.custom_dupefilter.CustomDupeFilter'

    if __name__ == '__main__':
        DOWNLOADER_MIDDLEWARES_PATH = 'src.custom_downloader_middleware.CustomDownloaderMiddleware'
        DOWNLOADER_CLIENTCONTEXTFACTORY = 'src.scrapy_patch.CustomContextFactory'
        DUPEFILTER_CLASS_PATH = 'src.custom_dupefilter.CustomDupeFilter'

    process = CrawlerProcess({
        'LOG_ENABLED': '1',
        'LOG_LEVEL': 'ERROR',
        'USER_AGENT': config.user_agent,
        'DOWNLOADER_MIDDLEWARES': {
            DOWNLOADER_MIDDLEWARES_PATH: 900
        },
        # Need to be > 600 to be after the redirectMiddleware
        'DOWNLOADER_CLIENTCONTEXTFACTORY': DOWNLOADER_CLIENTCONTEXTFACTORY,
        'DUPEFILTER_USE_ANCHORS': config.use_anchors,
        # Use our custom dupefilter in order to be scheme agnostic regarding link provided
        'DUPEFILTER_CLASS': DUPEFILTER_CLASS_PATH
    })

    process.crawl(DocumentationSpider,
                  config=config,
                  algolia_helper=algolia_helper,
                  strategy=strategy)

    process.start()
    process.stop()

    # Kill browser if needed
    BrowserHandler.destroy(config.driver)

    if len(config.extra_records) > 0:
        algolia_helper.add_records(config.extra_records, "Extra records")

    print("")

    if DocumentationSpider.NB_INDEXED > 0:
        algolia_helper.commit_tmp_index()
        print('Nb hits: ' + str(DocumentationSpider.NB_INDEXED))
        config.update_nb_hits_value(DocumentationSpider.NB_INDEXED)
    else:
        print('Crawling issue: nbHits 0 for ' + config.index_name)
        algolia_helper.report_crawling_issue()
        exit(EXIT_CODE_NO_RECORD)
    print("")