Example #1
def start_crawler(
        data, **kwargs):  # todo base spider logic, modularity (time-consuming)

    idx = kwargs.get('idx')
    process = CrawlerProcess(get_project_settings())

    items = []
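    # Scraped items are collected in-process via the item_passed signal (handler below)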

    def item_scraped(item, response, spider):
        items.append(item)

    if 'source' in kwargs:  # the crawler needs a source site and a keyword to search with
        crawler = process.create_crawler('Knipex')
        crawler.signals.connect(item_scraped, signal=signals.item_passed)
        source = kwargs.get('source')
        process.crawl(crawler,
                      url=data[source].tolist(),
                      idx=data[idx].tolist())
    else:
        crawler = process.create_crawler('Astro')
        crawler.signals.connect(item_scraped, signal=signals.item_passed)
        process.crawl(
            crawler,
            url=data[idx].tolist())  # hits the page directly without searching

    process.start()

    return items
class taskService(service.MultiService):
    def __init__(self, taskId, taskName, setting):
        service.MultiService.__init__(self)
        self.taskId = taskId
        self.name = taskName
        self.setting = setting
        self._crawlerProcess = CrawlerProcess(project_settings)
        self._crawlerProcess.create_crawler('start_page_crawler')
#         self._crawlerProcess.create_crawler('list_page_crawler')
#         self._crawlerProcess.create_crawler('content_page_crawler')
#         self._crawlerProcess.create_crawler('extra_page_crawler')
        
        # 
        # self._listPageService = listPageService()
        # self._contentPageService = contentPageService()
        # self._extraPageService = extraPageService()



    def startService(self):
        _spider_start_page_setting = self.setting.get(SPIDER_TYPE_START_PAGE)
        if _spider_start_page_setting is not None:
            _spider_start_page_setting['szStartUrl'] = self.setting.get('szStartUrl')
            _spider_start_page_setting['szRegStartUrl'] = self.setting.get('szRegStartUrl')
#             self._startStartPageSpider(_spider_start_page_setting)
            
        
        self._crawlerProcess.start()
#     sService = startPageService(self)
        lService = listPageService(self)
        cService = contentPageService(self)
        # eService = extraPageService(self)
#         self.addService(sService)
        self.addService(lService)
        self.addService(cService)
        # self.addService(eService)
        service.MultiService.startService(self)
        log.msg('taskService->startService')

    def _startStartPageSpider(self, config):
        startPageCrawler = self._crawlerProcess.crawlers.get('start_page_crawler')
        
        print('======>')
        print(startPageCrawler)
        
#         startPageSpider = load_object(config.get('szSnameSpace'))
#         startPageCrawler.crawl(startPageSpider)
        

    def stopService(self):
        service.MultiService.stopService(self)
        log.msg('taskService->stopService')
Example #3
def lambda_handler(event, context):
    tz = pytz.timezone(TIMEZONE)
    now_str = datetime.now(tz).strftime("%d-%m-%Y_%H:%M")

    scrapy_settings = SCRAPY_SETTINGS
    scrapy_settings["FEED_URI"] = scrapy_settings["FEED_URI"].format(now_str)

    process = CrawlerProcess(scrapy_settings)
    crawler = process.create_crawler(VillaSpider)
    process.crawl(crawler)
    process.start()
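    # The crawler created with create_crawler keeps its stats collector, so it can be inspected after the run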

    stats = crawler.stats.get_stats()

    mean_response_time = statistics.mean(crawler.spider.response_times)

    result = f"{stats['downloader/request_count']} requests " \
             f"with an average response time of {round(mean_response_time, 2)} seconds"

    return {
        "statusCode": 200,
        "body": json.dumps({
            "message": result,
        }),
    }
Example #4
 def f(queue):
     process = CrawlerProcess(get_project_settings())
     crawl = process.create_crawler(spider)
     process.crawl(crawl, **kwargs)
     process.start()
     data = crawl.spider.get_data()
     queue.put(data)
Example #5
def startSpider(group_type, spider_type, spider_group_name, spider_name):
    # Load the project's Scrapy settings
    settings = get_project_settings()
    # Instantiate a crawler process
    crawlerProcess = CrawlerProcess(settings)

    # Create a crawler; one crawler process can run several crawls.
    crawler = crawlerProcess.create_crawler(spider_name)

    # Hook up the spider's lifecycle signals: when the spider emits a signal, the matching handler is called.
    crawler.signals.connect(spiderSignal.startSingnal, signals.spider_opened)
    crawler.signals.connect(spiderSignal.idleSingnal, signals.spider_idle)
    crawler.signals.connect(spiderSignal.errorSingnal, signals.spider_error)
    crawler.signals.connect(spiderSignal.stopSingnal, signals.spider_closed)

    # Look up the spider class and build its arguments
    spiderConf = Spider_Dict[group_type][spider_type]
    spiderArgs = spiderConf[1].copy()
    spiderArgs["name"] = spider_name
    spiderArgs["redis_key"] = spider_name
    spiderArgs["spider_type"] = spider_type
    spiderArgs["spider_group_name"] = spider_group_name
    spiderArgs["task_id"] = "-1"

    spider = spiderConf[0](**spiderArgs)

    # Configure the crawler and assign the spider to it
    crawler.configure()
    crawler.crawl(spider)

    # Start the crawl.
    crawlerProcess.start()
    crawlerProcess.stop()
def startSpiderTest(group_type, spider_type, spider_group_name, spider_name):
    # Load the project's Scrapy settings
    settings = get_project_settings()
    # Instantiate a crawler process
    crawlerProcess = CrawlerProcess(settings)

    # Create a crawler; one crawler process can run several crawls.
    crawler = crawlerProcess.create_crawler(spider_name)

    # Hook up the spider's lifecycle signals: when the spider emits a signal, the matching handler is called.
    crawler.signals.connect(spiderSignal.startSingnal, signals.spider_opened)
    crawler.signals.connect(spiderSignal.errorSingnal, signals.spider_error)
    crawler.signals.connect(spiderSignal.stopSingnal, signals.spider_closed)

    # Look up the spider class and build its arguments
    spiderConf = Spider_Dict[group_type][spider_type]
    spiderArgs = spiderConf[1].copy()
    spiderArgs["name"] = spider_name
    spiderArgs["redis_key"] = spider_name
    spiderArgs["spider_type"] = spider_type
    spiderArgs["spider_group_name"] = spider_group_name
    spiderArgs["task_id"] = "-1"

    spider = spiderConf[0](**spiderArgs)

    # Configure the crawler and assign the spider to it
    crawler.configure()
    crawler.crawl(spider)

    # Start the crawl.
    crawlerProcess.start()
    crawlerProcess.stop()
Example #7
 def run(cls, dependencies):
     process = CrawlerProcess(dependencies.scrapy_settings)
     crawler = process.create_crawler(cls)
     process.crawl(crawler, dependencies)
     process.start()  # the script will block here until the crawling is finished
     return crawler
Example #8
def get_fetch(log=False):
    settings = Settings()
    settings.set('LOG_ENABLED', log)

    crawler_process = CrawlerProcess(settings)
    crawler = crawler_process.create_crawler()
    crawler_process.start_crawling()

    t = Thread(target=crawler_process.start_reactor)
    t.daemon = True
    t.start()

    shell = Shell(crawler)
    shell.code = 'adsf'

    import threading
    lock = threading.Lock()

    def fetch(url_or_request):
        lock.acquire()
        try:
            shell.fetch(url_or_request)
            response = shell.vars.get('response')
            return response
        finally:
            lock.release()

    return fetch
Example #9
class ScrapyPuppeteerTestCase(TestCase):
    """Test case for the ``scrapy-puppeteer`` package"""
    class PuppeteerSpider(scrapy.Spider):
        name = 'puppeteer_crawl_spider'
        allowed_domains = ['ufmg.br']
        items = []

        def start_requests(self):
            yield scrapy_puppeteer.PuppeteerRequest('https://ufmg.br',
                                                    wait_until='networkidle2')

        def parse(self, response):
            for selector_item in response.selector.xpath(
                    '//*[@id="rodape"]/section[1]/div/div[1]/div/ol/li'):
                self.items.append(selector_item)

    def setUp(self):
        """Store the Scrapy runner to use in the tests"""
        self.settings = {
            'DOWNLOADER_MIDDLEWARES': {
                'scrapy_puppeteer.PuppeteerMiddleware': 800
            }
        }
        self.process = CrawlerProcess(settings=self.settings)

    def test_items_number(self):
        crawler = self.process.create_crawler(self.PuppeteerSpider)
        self.process.crawl(crawler)
        self.process.start()
        self.assertEqual(len(crawler.spider.items), 12)
Example #10
def main():
    print(
        "Enter an ISBN and press Enter to collect the data, or type 'stop' to quit"
    )
    mySpider = "bookspider"
    process = CrawlerProcess(get_project_settings())
    crawler = process.create_crawler(mySpider)
    # Connect the close_spider function to the spider_closed signal
    crawler.signals.connect(close_spider, signals.spider_closed)
    # Read the ISBN
    global ISBN_RECEIVED
    if ISBN_RECEIVED is not None:  # If started with an argument
        # Run the process assigned to the spider
        process.crawl(crawler, isbn=ISBN_RECEIVED)
        process.start()
    else:  # Otherwise, started without an argument
        while ISBN_RECEIVED != "stop":
            ISBN_RECEIVED = input("\n[ISBN] > ")
            if ISBN_RECEIVED != "stop":
                # Run the process assigned to the spider
                process.crawl(crawler, isbn=ISBN_RECEIVED)
                process.start()
                ISBN_RECEIVED = None
                # Allow the process to run again, from https://stackoverflow.com/a/47127561
                time.sleep(0.5)
                os.execl(sys.executable, sys.executable, *sys.argv)
Example #11
def execute_spiders(urls, run_name):

    process = CrawlerProcess(get_project_settings())

    spiders = []
    export_headers = True
    for url in urls:
        if "nepremicnine.net" in url:
            spider_name = "nepremicnine"
        elif "bolha.com" in url:
            spider_name = "bolha"
        else:
            print("No spdider for url: " + url + ", skipping ...")
            continue
        spider = process.create_crawler(spider_name)
        spiders.append(spider)
        process.crawl(spider,
                      url=url,
                      run_name=run_name,
                      export_headers=export_headers)
        export_headers = False  # so only the first spider will export headers

    process.start()  # the script will block here until the crawling is finished

    for spider in spiders:
        stats = spider.stats.get_stats()
        print("Spider " + spider.spider.name + " executed in " +
              str(stats.get("elapsed_time_seconds")))
        print("  Scraped " + str(stats.get("item_scraped_count", 0)) +
              " items")
        if "log_count/ERROR" in stats:
            print("  Errors in spider " + spider.name + "!!!")
        print()
Example #12
def check(urls, auth=None, crawl=True, robotstxt=True, verbosity=0):
    """Crawl a list of url"""
    # we somehow need to append an empty string for bold to work
    puts(colored.white('Checking forms:', bold=True) + '')
    settings = get_project_settings()

    if verbosity > 0:
        settings.set('LOG_ENABLED', True)
        log_levels = ['CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG']
        if verbosity >= len(log_levels):
            verbosity = len(log_levels) - 1
        settings.set('LOG_LEVEL', log_levels[verbosity])

    if not robotstxt:
        settings.set('ROBOTSTXT_OBEY', False)

    process = CrawlerProcess(settings)
    crawler = process.create_crawler('form')
    result = Result()

    crawler.signals.connect(result.add_item, scrapy.signals.item_scraped)

    process.crawl(crawler, urls=urls, crawl=crawl, auth=auth)
    process.start()

    result.print(verbosity)
Example #13
def start_userscrapers(list_dicts):
    reset_file("user_gathering_data.txt")
    reset_file("usernames.txt")
    process = CrawlerProcess()
    for dictionary in list_dicts:
        argument = dictionary
        process.crawl(process.create_crawler(UserScraper), argument)
    process.start(True)
Example #14
    def execute(self):
        # Initialise settings for a limited scraping
        os.environ.setdefault(
            'SCRAPY_SETTINGS_MODULE',
            'wsf_scraping.settings'
        )

        if not self.dst_s3_dir.startswith('s3://'):
            raise ValueError('Invalid S3 url: %s' % self.dst_s3_dir)

        # This monkey-patching only works because Airflow shells out to
        # a new Python interpreter for every task it runs. It thus *must*
        # remain inside execute(), so other code paths don't touch it.
        wsf_scraping.settings.MAX_ARTICLE = self.item_max
        wsf_scraping.settings.WHO_IRIS_YEARS = \
            self.item_years

        wsf_scraping.settings.FEED_URI = \
            'manifest' + self.dst_s3_dir

        settings = get_project_settings()
        self.log.info(
            "scrapy settings: %s",
            json.dumps(
                {k: v for k, v in settings.items()
                 if isinstance(v, (str, int, float, bool))}
            )
        )

        process = CrawlerProcess(settings, install_root_handler=False)
        spider = SPIDERS[self.organisation]
        crawler = process.create_crawler(spider)

        self.item_count = None
        self.scraper_errors = []
        crawler.signals.connect(
            self.on_item_error,
            signal=scrapy.signals.item_error)
        crawler.signals.connect(
            self.on_manifest_storage_error,
            signal=feed_storage.manifest_storage_error)

        process.crawl(crawler)  # schedule the crawl
        process.start()  # start the reactor and block until the crawl finishes

        if self.scraper_errors:
            scraper_errors = self.scraper_errors  # put into local for sentry
            self.log.error(
                'SpiderOperator: scrapy signaled %d errors:',
                len(scraper_errors)
            )
            for tup in self.scraper_errors:
                self.log.error('DummySpiderOperator: %r', tup)
            raise Exception(
                "%d errors occurred during scrape" %
                len(scraper_errors)
            )
Example #15
    def populate(cls, season, *args, **kwargs):
        print(f'Scraping {season} covers')
        settings = get_project_settings()
        process = CrawlerProcess(settings)
        crawler = process.create_crawler(GameSpider)
        process.crawl(crawler, season=season, *args, **kwargs)
        process.start()

        games = crawler.stats.get_value('games', 0)
        print(f'Saved {games} rows to {cls.__tablename__}')
def run_spiders(name, **kwargs):
    """
    @param name: spider name
    """
    prs = CrawlerProcess(CrawlerSettings(scrapy_settings))
    crawler = prs.create_crawler()
    for spdname, spd in crawler.spiders._spiders.iteritems():
        if name == spdname:
            spidercls = spd
    crawler.crawl(spidercls(**kwargs))
    prs.start()
Example #17
def main():
    process = CrawlerProcess(
        {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})

    crawler = process.create_crawler(MySpider)
    crawler.signals.connect(response_received,
                            signal=signals.response_received)
    crawler.signals.connect(engine_started, signal=signals.engine_started)
    crawler.signals.connect(spider_opened, signal=signals.spider_opened)
    crawler.signals.connect(spider_error, signal=signals.spider_error)
    #crawler.crawl()
    #import pdb; pdb.set_trace()
    process.crawl(crawler)
    process.start()
Example #18
    def crawling_start(
            self,
            scrapy_settings: Settings, 
            spider: object, 
            board_code: str,
            return_dic: Dict) -> Dict:
        process = CrawlerProcess(scrapy_settings)
        crawler = process.create_crawler(spider)
        process.crawl(crawler, args={'callback': self._yield_output})
        process.start()
        return_dic[board_code] = self.output

        # stats = crawler.stats   # <class 'scrapy.statscollectors.MemoryStatsCollector'>
        stats = crawler.stats.get_stats()   # <class 'dict'>
        return stats
Example #19
class CrawlerWorker(Process):

    def __init__(self, spider, result_list, settings=None):
        Process.__init__(self)
        self.result_queue = result_list

        if settings is None:
            settings = Settings()

        self.crawler = CrawlerProcess(settings)
        self.crawler.create_crawler(spider.__class__.__name__)
        self.crawler.crawlers['spider'] = spider
        self.spider = spider
        self.items = []
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)
        print "here"

    def run(self):
        self.crawler.start()
        self.crawler.stop()
        self.result_queue.put(self.items)
Example #20
def shell(argv):
    """ Open a url in the scrapy shell """
    parser = argparse.ArgumentParser('ozzy shell',
                                     description=shell.__doc__)
    parser.add_argument('url', help="URL to open in a shell")
    args = parser.parse_args(argv)

    crawler_process = CrawlerProcess(load_settings())
    crawler = crawler_process.create_crawler()
    crawler_process.start_crawling()
    thread = Thread(target=crawler_process.start_reactor)
    thread.daemon = True
    thread.start()
    sh = Shell(crawler)
    sh.start(url=args.url)
Example #21
class MainCrawler:
    def __init__(self):
        self.settings = get_project_settings()
        self.session = Session()

    def run(self, url=TARGET_URL):
        self.process = CrawlerProcess(self.settings)
        self.url = url
        self.flag = "top_urls"
        configure_logging()
        self.crawl_url()
        self.process.start()

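    # Each spider_closed callback schedules the next crawl, so the spiders run back-to-back inside one reactor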
    # Used repeatedly
    def _crawl(self, spider, callback, urls=None):
        crawler = self.process.create_crawler(spider)
        crawler.signals.connect(callback, signal=signals.spider_closed)
        self.process.crawl(crawler, urls)

    def crawl_url(self, url=None):
        if not url:
            url = self.url
        spider = UrlSpider(url=url)
        self._crawl(spider, self.callbacks)

    def crawl_page(self, urls=None):
        logging.info(urls[0])
        spider = WebpageSpider(url=urls)
        self._crawl(spider, self.callbacks, urls)

    # Functions for each crawl stage
    def crawl_top_pages(self):
        logging.info("START CRAWLING TOP PAGES")
        self.flag = "top_pages"
        pages_q = self.session.query(Webpage.original_url).filter(
            and_(Webpage.html == None, Webpage.path_label == self.url))
        page_list = []
        for p in pages_q:
            page_list.append(p[0])
        self.crawl_page(page_list)

    # Callbacks
    def callbacks(self, spider=None, urls=None):
        logging.info("START CALLBACKS")
        if self.flag == "top_urls":
            self.crawl_top_pages()
        elif self.flag == "top_pages":
            reactor.stop()
Example #22
def _runCrawler(spider, results):
        settings_module = importlib.import_module('Extractors.HTMLScraper.settings')
        settings = CrawlerSettings(settings_module)
        crawlerProcess = CrawlerProcess(settings)
        items = []

        def _item_passed(item, response, spider):
                items.append(item)

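        # Register the collector on the global pydispatch dispatcher so every scraped item is captured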
        dispatcher.connect(_item_passed, signals.item_scraped)

        crawler = crawlerProcess.create_crawler("currentCrawler")
        crawler.crawl(spider)
        crawlerProcess.start()
        crawlerProcess.stop()
        results.put(items)
Example #23
def run_spider(spider, settings, loglevel='INFO'):
    """
    Run a spider with given settings
    """
    if 'SENTRY_DSN' in os.environ:
        import scrapy_sentry
        settings.setdict({
            'SENTRY_DSN': os.environ['SENTRY_DSN'],
            'EXTENSIONS': {
                "scrapy_sentry.extensions.Errors": 10,
            },
        })

    crawler_process = CrawlerProcess(settings)
    crawler = crawler_process.create_crawler()
    crawler.crawl(spider)
    crawler_process.start()
Example #24
def _runCrawler(spider, results):
    settings_module = importlib.import_module(
        'Extractors.HTMLScraper.settings')
    settings = CrawlerSettings(settings_module)
    crawlerProcess = CrawlerProcess(settings)
    items = []

    def _item_passed(item, response, spider):
        items.append(item)

    dispatcher.connect(_item_passed, signals.item_scraped)

    crawler = crawlerProcess.create_crawler("currentCrawler")
    crawler.crawl(spider)
    crawlerProcess.start()
    crawlerProcess.stop()
    results.put(items)
Example #25
def main(args):
    settings = Settings()
    settings.setmodule(iw_settings)
    spider = ThedySpider()

    process = CrawlerProcess(settings)
    crawler = process.create_crawler(spider)
    crawler.signals.connect(item_scraped, signal=signals.item_scraped)
    process.crawl(crawler)
    process.start(stop_after_crawl=True)
    process.join()

    result["scraping_time"] = result["scraping_time"].isoformat()

    doc = {"doc": dict(result)}

    return doc
def run(event, context):
    items = []

    def add_item(item):
        items.append(item)

    # Create and run the crawler, scrapy stuff
    process = CrawlerProcess(get_project_settings())
    crawler = process.create_crawler('broken_link_spider')
    crawler.signals.connect(add_item, signals.item_passed) # Intercept the results
    process.crawl(crawler)
    process.start()

    # Convert results to json and send email
    json_string = json.dumps([ob.__dict__ for ob in items])
    print("Found broken links:", json_string)
    send_simple_message(EMAIL, json_string)
Example #27
def start_malscrapers(args_list):
    # Value check
    if len(args_list) < 1:
        raise ValueError(
            "args_list length less than 1. THIS SHOULD NEVER HAPPEN")

    # Process that holds spiders
    process = CrawlerProcess()
    for args in args_list:
        # Add spiders and arguments to each spider
        arguments = list(args)
        process.crawl(process.create_crawler(Malscraper), arguments)
    process.start(True)
Example #28
def run_spiders_concurrently(spiders: dict):
    default_settings = get_project_settings()
    default_settings["LOG_LEVEL"] = "ERROR"

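    # One CrawlerProcess can schedule several crawlers; they all run concurrently in the same reactor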
    process = CrawlerProcess(default_settings)
    crawlers = dict()
    for name, spider_class in spiders.items():
        logging.info(f"running {name}")
        crawler = process.create_crawler(spider_class)
        crawlers[name] = crawler
        try:
            process.crawl(crawler)
        except (AttributeError, TypeError, KeyError, ValueError,
                ImportError) as e:
            logging.error(e)
            continue

    process.start()
class GetResultCrawler(object):
    def __init__(self):
        self.crawled_items = []
        settings = get_project_settings()
        settings["LOG_LEVEL"] = logging.WARNING
        self.process = CrawlerProcess(settings)

    def crawl(self, spider_dict):
        def _add_crawled_item(item):
            if item:
                self.crawled_items.append(item)
        for spider_name, spider_kwargs in spider_dict.items():
            crawler = self.process.create_crawler(spider_name)
            crawler.signals.connect(_add_crawled_item, signals.item_scraped)
            self.process.crawl(crawler, **spider_kwargs)

        self.process.start(stop_after_crawl=True)
        return self.crawled_items
Example #30
def main(args):
    settings = Settings()
    settings.setmodule(iw_settings)
    spider = ThedySpider()

    process = CrawlerProcess(settings)
    crawler = process.create_crawler(spider)
    crawler.signals.connect(item_scraped,
                            signal=signals.item_scraped)
    process.crawl(crawler)
    process.start(stop_after_crawl=True)
    process.join()

    result["scraping_time"] = result["scraping_time"].isoformat()

    doc = {
        "doc": dict(result)
    }

    return doc
Example #31
def run_one_spider(spider_name):
    try:
        settings = get_project_settings()

        _, output_log = tempfile.mkstemp('.log')
        _, output_results = tempfile.mkstemp('.geojson')

        settings.set('LOG_FILE', output_log)
        settings.set('LOG_LEVEL', 'INFO')
        settings.set('TELNETCONSOLE_ENABLED', False)
        settings.set('FEED_URI', output_results)
        settings.set('FEED_FORMAT', 'ndgeojson')

        def spider_opened(spider):
            logger.info("Spider %s opened, saving to %s", spider.name,
                        output_results)

        def spider_closed(spider):
            logger.info(
                "Spider %s closed (%s) after %0.1f sec, %d items",
                spider.name,
                spider.crawler.stats.get_value('finish_reason'),
                (spider.crawler.stats.get_value('finish_time') -
                 spider.crawler.stats.get_value('start_time')).total_seconds(),
                spider.crawler.stats.get_value('item_scraped_count') or 0,
            )

        process = CrawlerProcess(settings)
        crawler = process.create_crawler(spider_name)
        crawler.signals.connect(spider_closed, signals.spider_closed)
        crawler.signals.connect(spider_opened, signals.spider_opened)
        process.crawl(crawler)
        process.start()

        results = crawler.stats.spider_stats.get(spider_name)
        results['output_filename'] = output_results
        results['log_filename'] = output_log
        results['spider'] = spider_name
        return results
    except Exception as e:
        logger.exception("Exception in scraper process")
Example #32
def run_spider(spider, bail=False, debug=False, **kwargs):
    def process_spider_error(failure, response, spider):
        nonlocal had_error
        had_error = True

    had_error = False
    spider_middlewares = {}
    if bail:
        spider_middlewares['kpopnet.spiders.HttpErrorMiddleware'] = 1
    process = CrawlerProcess({
        'LOG_LEVEL': 'DEBUG' if debug else 'WARNING',
        'USER_AGENT': USER_AGENT,
        'CLOSESPIDER_ERRORCOUNT': 1 if bail else 0,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 3,
        'SPIDER_MIDDLEWARES': spider_middlewares,
    })
    crawler = process.create_crawler(spider)
    crawler.signals.connect(process_spider_error, signals.spider_error)
    process.crawl(crawler, **kwargs)
    process.start()
    return 1 if had_error else 0
Example #33
class CrawlerWorker(Process):
    def __init__(self, spider, results):
        Process.__init__(self)

        self.results = results     
        settings_module = importlib.import_module('Extractors.HTMLScraper.settings')
        settings = CrawlerSettings(settings_module)
        self.crawlerProcess = CrawlerProcess(settings)

        self.items = []
        self.spider = spider
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def run(self):
        crawler = self.crawlerProcess.create_crawler("currentCrawler")
        crawler.crawl(self.spider)
        self.crawlerProcess.start()
        self.crawlerProcess.stop()
        self.results.put(self.items)
Example #34
def task_spider(self, type_id, time_from, time_to):
    settings = Settings()
    settings_module_path = os.environ.get('SCRAPY_ENV',
                                          'BilibiliTagSpider.settings')
    settings.setmodule(settings_module_path, priority='project')
    process = CrawlerProcess(settings=settings)
    crawler = process.create_crawler(TagSpider)

    thread_spider = Thread(target=spider_crawl,
                           args=(process, crawler, type_id, time_from,
                                 time_to))
    thread_spider.start()

    video_cur = 0
    video_total = 0
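    # Poll the crawler's stats collector while the spider runs in its own thread, reporting progress via Celery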
    while True:
        if (crawler.stats.get_value(ScrapyField.VideoCur.value) is None or \
                crawler.stats.get_value(ScrapyField.VideoTotal.value) is None):
            continue
        video_cur = crawler.stats.get_value(ScrapyField.VideoCur.value)
        video_total = crawler.stats.get_value(ScrapyField.VideoTotal.value)
        current_task.update_state(state='PROGRESS',
                                  meta={
                                      ScrapyField.VideoCur.value: video_cur,
                                      ScrapyField.VideoTotal.value:
                                      video_total,
                                  })
        time.sleep(1)
        # The spider thread sometimes does not get released, so break out of the loop from the business logic instead
        if video_cur == video_total:
            break

    task_ended(self.request.id)
    return {
        ScrapyField.VideoCur.value:
        crawler.stats.get_value(ScrapyField.VideoCur.value),
        ScrapyField.VideoTotal.value:
        crawler.stats.get_value(ScrapyField.VideoTotal.value),
    }
Example #35
def start_idscrapers(start_ends):
    process = CrawlerProcess()
    for values in start_ends:
        arguments = list(values)
        process.crawl(process.create_crawler(Idscraper), arguments)
    process.start(True)
Example #36
import scrapy
from scrapy.crawler import CrawlerProcess

from bd.spiders.sc import SCSpider

crawler_process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

crawler = crawler_process.create_crawler()
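# crawler.spiders.create() is the old (pre-1.0) Scrapy API for instantiating a spider by name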
spider = crawler.spiders.create('sc')
crawler.crawl(spider)
crawler_process.start()
Example #37
class CrawlerProcessScript(object):
    """
        Creates multiple crawlers and calls them sequentially.
        Crawler names should follow this naming convention:
            spider_name + _ + city + _ + category
        crawlers: keeps track of all crawlers run, so their stats can be retrieved after they finish.
    """
    def __init__(self, dsite_name='', updating=False):
        self.updating = str(updating)
        self.dsite = DSite.objects.get(name=dsite_name)
        self.crawler_process = CrawlerProcess(get_project_settings())
        self.crawlers = {}

    def _add_crawler(self,
                     crawler_name,
                     city_mapping_pk=None,
                     category_mapping_pk=None):
        crawler = self.crawler_process.create_crawler(crawler_name)
        spider = crawler.spiders.create(
            self.dsite.name,
            dsite_pk=self.dsite.pk,
            city_mapping_pk=city_mapping_pk,
            category_mapping_pk=category_mapping_pk,
            updating=self.updating)
        crawler.crawl(spider)
        self.crawlers[crawler_name] = crawler

    def _create_crawlers(self):
        if self.dsite.has_both_mappings:
            for city_mapping in CityMapping.objects.filter(dsite=self.dsite):
                for category_mapping in CategoryMapping.objects.filter(
                        dsite=self.dsite, all_cities=False):
                    crawler_name = self.dsite.name + '_' + city_mapping.site_city + '_' + category_mapping.site_category
                    self._add_crawler(crawler_name=crawler_name,
                                      category_mapping_pk=category_mapping.pk,
                                      city_mapping_pk=city_mapping.pk)
            if self.dsite.has_category_mapping:
                for category_mapping in CategoryMapping.objects.filter(
                        dsite=self.dsite, all_cities=True):
                    crawler_name = self.dsite.name + '_' + category_mapping.site_category
                    self._add_crawler(crawler_name=crawler_name,
                                      category_mapping_pk=category_mapping.pk)
        elif self.dsite.has_city_mapping:
            for city_mapping in CityMapping.objects.filter(dsite=self.dsite):
                crawler_name = self.dsite.name + '_' + city_mapping.site_city
                self._add_crawler(crawler_name=crawler_name,
                                  city_mapping_pk=city_mapping.pk)
            if self.dsite.has_category_mapping:
                for category_mapping in CategoryMapping.objects.filter(
                        dsite=self.dsite, all_cities=True):
                    crawler_name = self.dsite.name + '_' + category_mapping.site_category
                    self._add_crawler(crawler_name=crawler_name,
                                      category_mapping_pk=category_mapping.pk)
        elif self.dsite.has_category_mapping:
            for category_mapping in CategoryMapping.objects.filter(
                    dsite=self.dsite):
                crawler_name = self.dsite.name + '_' + category_mapping.site_category
                self._add_crawler(crawler_name=crawler_name,
                                  category_mapping_pk=category_mapping.pk)

    def start(self):
        self._create_crawlers()
        self.crawler_process.start()
        self.crawler_process.stop()
        self.crawler_process.stop_reactor()

    def dump_stats(self):
        for crawler_name, crawler in self.crawlers.items():
            print(crawler_name)
            print(crawler.stats.get_stats())
def engine_started():
    print('Engine started.')


def spider_opened(spider):
    print('Spider opened.')


def spider_error(failure, response, spider):
    print('Spider error.')


process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

crawler = process.create_crawler(ScannerSpider)
crawler.signals.connect(response_received,
                        signal=signals.response_received)
crawler.signals.connect(engine_started,
                        signal=signals.engine_started)
crawler.signals.connect(spider_opened,
                        signal=signals.spider_opened)
crawler.signals.connect(spider_error,
                        signal=signals.spider_error)
#crawler.crawl()
#import pdb; pdb.set_trace()
process.crawl(crawler)
process.start()
Example #39
    def __init__(self, artist='', *args, **kwargs):
        """Takes artist as an argument when called"""
        super(EthnicScraperSpider, self).__init__(*args, **kwargs)
        # Replaces spaces with "-" for website
        artist = artist.replace(" ", "-")
        self.start_urls = [f'https://ethnicelebs.com/{artist}']

    def parse(self, response):
        """Get artist race"""
        next_page = response.xpath(
            '/html/body/div/div/div/div/div/section/div[2]/article/div/div[2]/div[1]/p[4]/strong/text()'
        ).get()
        # Find word and exclude
        string_start = next_page.find("Ethnicity: ") + len("Ethnicity: ")
        race = next_page[string_start:len(next_page)]

        yield {'race': race}


# Setup scraper
process = CrawlerProcess({
    'USER_AGENT':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    'FEED_FORMAT': 'json',
    'FEED_URI': '123.json',
    'CONCURRENT_ITEMS': 1
})

crawler = process.create_crawler(EthnicScraperSpider)
process.crawl(crawler, artist='21 savage')
process.start()
Example #40
import random

SEARCH_INTERVAL = 1
MAX_SPIDERS = 3

if __name__ == '__main__':
    spiders = []
    sites = list(SiteData.objects.filter(category__symbol='approved'))
    random.shuffle(sites)
    for site in sites:
        if ProductInfo.objects.filter(product__page__site=site, date__gt = now()-datetime.timedelta(SEARCH_INTERVAL)).exists():
            continue
        if not ScraperDescriptor.objects.filter(site=site).exists():
            continue

        Spider = get_spiders(site.id)
        if Spider is not None:
            spiders.append(Spider())

        if len(spiders) > MAX_SPIDERS:
            break
    if spiders:
        settings = get_project_settings()
        crawler_process = CrawlerProcess(settings)
        for spider in spiders:
            crawler = crawler_process.create_crawler(name=spider.name)
            crawler.crawl(spider)
        crawler_process.start()

            'item_scraped_count':
            spider.crawler.stats.get_value('item_scraped_count'),
        }

        print("Spider %s closed (%s) after %0.1f sec, %d items" % (
            spider.name,
            spider.crawler.stats.get_value('finish_reason'),
            (spider.crawler.stats.get_value('finish_time') -
             spider.crawler.stats.get_value('start_time')).total_seconds(),
            spider.crawler.stats.get_value('item_scraped_count') or 0,
        ))

    print("Starting to crawl")
    process = CrawlerProcess(settings)
    for spider_name in process.spider_loader.list():
        crawler = process.create_crawler(spider_name)
        crawler.signals.connect(spider_closed, signals.spider_closed)
        crawler.signals.connect(spider_opened, signals.spider_opened)
        process.crawl(crawler)
    process.start()
    print("Done crawling")

    client = boto3.client('s3')
    s3_key_prefix = "runs/{}".format(tstamp)

    # Gzip the output geojson
    with open(output_results, 'rb') as f_in:
        with gzip.open(output_results + '.gz', 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

    s3_output_size = os.path.getsize(output_results + '.gz')
Example #42
class StartScan(object):
    """A scanner application which can be run."""

    def __init__(self, configuration):
        """
        Initialize the scanner application.
        Takes the JSON descriptor of this scan as its argument.
        """
        self.configuration = configuration

        scan_id = configuration['id']
        logfile = configuration['logfile']
        last_started = configuration['last_started']

        self.scan_id = scan_id
        self.logfile = logfile
        self.last_started = \
            parse_datetime(last_started) if last_started else None
        self.sitemap_crawler = None
        self.scanner_crawler = None

        self.settings = get_project_settings()
        self.crawler_process = None

    def run(self):
        """Updates the scan status and sets the pid.
        Run the scanner, blocking until finished."""

        # Each scanner process should set up logging separately, writing to
        # both the log file and to the scanner manager's standard error stream
        logging.basicConfig(
                level=logging.DEBUG,
                format="""\
%(levelname)s %(asctime)s %(module)s %(process)d %(thread)d %(message)s""",
                handlers=[
                    logging.FileHandler(self.logfile),
                    logging.StreamHandler(stderr)
                ])

        # Scrapy expects to be able to log things, so this call should always
        # happen after we've initialised the root logging handler
        self.crawler_process = \
            CrawlerProcess(self.settings, install_root_handler=False)

        # Django setup needs to be re-run for the scan process,
        # so the Django DB connection is not shared between processes.
        from utils import run_django_setup
        run_django_setup()

    def handle_killed(self):
        """Handle being killed by updating the scan status."""
        from os2webscanner.models.scans.scan_model import Scan
        self.scanner.scan_object = Scan.objects.get(pk=self.scan_id)
        self.scanner.scan_object.set_scan_status_failed()
        self.scan.logging_occurrence("SCANNER FAILED: Killed")
        logging.error("Killed")

    def make_scanner_crawler(self, spider_type):
        """Setup the scanner spider and crawler."""
        self.scanner_crawler = \
            self.crawler_process.create_crawler(spider_type)
        csigs = self.scanner_crawler.signals
        csigs.connect(self.handle_closed, signal=signals.spider_closed)
        csigs.connect(self.handle_error, signal=signals.spider_error)
        csigs.connect(self.handle_idle, signal=signals.spider_idle)
        return self.scanner_crawler

    def handle_closed(self, spider, reason):
        """Handle the spider being finished."""
        # TODO: Check reason for if it was finished, cancelled, or shutdown
        logging.debug('Spider is closing. Reason {0}'.format(reason))
        self.store_stats()
        reactor.stop()

    def store_stats(self):
        """Stores scrapy scanning stats when scan is completed."""
        from os2webscanner.models.statistic_model import Statistic
        from django.core.exceptions import MultipleObjectsReturned
        logging.info('Stats: {0}'.format(self.scanner_crawler.stats.get_stats()))

        try:
            statistics, created = Statistic.objects.get_or_create(scan=self.scanner.scan_object)
        except MultipleObjectsReturned:
            logging.error('Multiple statistics objects found for scan job {}'.format(
                self.scan_id)
            )

        if self.scanner_crawler.stats.get_value(
                'last_modified_check/pages_skipped'):
            statistics.files_skipped_count += self.scanner_crawler.stats.get_value(
                'last_modified_check/pages_skipped'
            )
        if self.scanner_crawler.stats.get_value(
                'downloader/request_count'):
            statistics.files_scraped_count += self.scanner_crawler.stats.get_value(
                'downloader/request_count'
            )
        if self.scanner_crawler.stats.get_value(
                'downloader/exception_type_count/builtins.IsADirectoryError'):
            statistics.files_is_dir_count += self.scanner_crawler.stats.get_value(
                'downloader/exception_type_count/builtins.IsADirectoryError'
            )

        statistics.save()
        logging.debug('Statistic saved.')

    def handle_error(self, failure, response, spider):
        """Printing spider errors.

        When an exception occurs in a spider callback we do not need to stop the scan.
        The scan is only stopped when the spider signals it has stopped.

        So we only print the error to the log."""
        logging.error("An error occured: %s" % failure.getErrorMessage())

    def handle_idle(self, spider):
        """Handle when the spider is idle.

        Keep it open if there are still queue items to be processed.
        """
        from os2webscanner.models.conversionqueueitem_model import ConversionQueueItem
        logging.debug("Spider Idle...")
        # Keep spider alive if there are still queue items to be processed
        remaining_queue_items = ConversionQueueItem.objects.filter(
            status__in=[ConversionQueueItem.NEW,
                        ConversionQueueItem.PROCESSING],
            url__scan=self.scanner.scan_object
        ).count()

        if remaining_queue_items > 0:
            logging.info(
                "Keeping spider alive: %d remaining queue items to process" %
                remaining_queue_items
            )
            raise DontCloseSpider
        else:
            logging.info("No more active processors, closing spider...")
Example #43
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from XinLang_news.spiders.fudan import FudanSpider



# gundongnews = FudanSpider()
settings = get_project_settings()

# crawlerprocess = CrawlerProcess(settings)
# crawler = crawlerprocess.create_crawler()
# crawler.crawl(gundongnews)
# crawlerprocess.start()
##############################################################################
spname_list = ['jyb','tju']
# spname = 'jyb'
crawlerprocess = CrawlerProcess(settings)
for spname in spname_list:
    crawler = crawlerprocess.create_crawler(spname)
    spider = crawler.spiders.create(spname)
    crawler.crawl(spider)
crawlerprocess.start()
# crawlerprocess.start_reactor()

# log.start()
# reactor.run()