Example #1
    def crawl(cls):

        def close(*args, **kwargs):
            print(f'{multiprocessing.current_process().name}: *!!CLOSE')
            print(f'args: {args}')
            print(f'kwargs: {kwargs}')


        def open(*args, **kwargs):
            print(f'{multiprocessing.current_process().name}: *!!OPEN')
            print(f'args: {args}')
            print(f'kwargs: {kwargs}')
            cls.q.put_nowait(0)  # seed the shared counter; put_nowait() needs a value
            print()

        def idle(*args, **kwargs):
            print(f'{multiprocessing.current_process().name}: *!!IDLE')
            print(f'args: {args}')
            print(f'kwargs: {kwargs}')
            print()

        def scraped(*args, **kwargs):
            print(f'{multiprocessing.current_process().name}: *!!SCRAPED')
            print(f'args: {args}')
            print(f'kwargs: {kwargs}')
            print()
            try:
                count = cls.q.get_nowait()
                cls.q.put_nowait(count + 1)
            except Exception:  # queue.Empty before the counter has been seeded
                cls.q.put_nowait(1)


        process = CrawlerProcess(get_project_settings())

        # Alternative: connect the handlers through the global dispatcher and
        # crawl the spider class directly.
        """
        dispatcher.connect(open, signal=signals.spider_opened)
        dispatcher.connect(close, signal=signals.spider_closed)
        dispatcher.connect(scraped, signal=signals.item_scraped)
        process.crawl(InfoempleoSpider)
        """
        # Crawler() expects the spider class, not an instance.
        crawler = Crawler(InfoempleoSpider, get_project_settings())
        crawler.signals.connect(open, signal=signals.spider_opened)
        crawler.signals.connect(scraped, signal=signals.item_scraped)
        crawler.signals.connect(close, signal=signals.spider_closed)
        crawler.signals.connect(idle, signal=signals.spider_idle)
        process.crawl(crawler)

        process.start()
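
Example #1 wires spider_opened / item_scraped / spider_closed handlers onto a Crawler before handing it to CrawlerProcess. The following is a minimal, self-contained sketch of the same wiring, with a hypothetical DemoSpider standing in for InfoempleoSpider and a plain closure variable doing the counting:

import scrapy
from scrapy import signals
from scrapy.crawler import CrawlerProcess


class DemoSpider(scrapy.Spider):
    # Hypothetical stand-in; InfoempleoSpider from the examples is assumed to be similar.
    name = 'demo'
    start_urls = ['https://quotes.toscrape.com/']

    def parse(self, response):
        for text in response.css('span.text::text').getall():
            yield {'text': text}


def crawl():
    scraped_count = 0

    # Scrapy delivers signal arguments by keyword, so the handler parameters
    # only need to match the documented argument names.
    def opened(spider):
        print(f'opened: {spider.name}')

    def scraped(item, response, spider):
        nonlocal scraped_count
        scraped_count += 1               # item_scraped fires once per item that passes the pipelines

    def closed(spider, reason):
        print(f'closed: {spider.name} ({reason}), items: {scraped_count}')

    process = CrawlerProcess()
    crawler = process.create_crawler(DemoSpider)     # pass the spider class, not an instance
    crawler.signals.connect(opened, signal=signals.spider_opened)
    crawler.signals.connect(scraped, signal=signals.item_scraped)
    crawler.signals.connect(closed, signal=signals.spider_closed)
    process.crawl(crawler)
    process.start()                                  # blocks until the crawl finishes


if __name__ == '__main__':
    crawl()

process.start() runs the Twisted reactor, which cannot be restarted once stopped; that is why the later examples push the crawl into a separate multiprocessing.Process.
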
Example #2
    def run_crawler(cls):
        print('run_crawler')

        def spider_opened(**kwargs):
            print(f'{multiprocessing.current_process().name}: *qOPENED')
            #cls.q.put(datetime.today())
            cls.q.put(0)
            cls.init_date = datetime.today()
            #reactor.stop()

        def spider_closed(**kwargs):
            print(f'{multiprocessing.current_process().name}: *qCLOSED')
            #while q.size > 2:
            #    cls.q.get_nowait()
            cls.finish_date = datetime.today()  # assignment, not a call
            #reactor.stop()

        def item_scraped(**kwargs):
            print(f'{multiprocessing.current_process().name}: *qSCRAPED')
            try:
                items_scraped = cls.q.get_nowait()
                cls.q.put_nowait(items_scraped + 1)
            except Exception:  # queue.Empty before the counter has been seeded
                cls.q.put_nowait(1)

        try:
            print(f'{multiprocessing.current_process().name}: run_crawler')
            crawler_settings = get_project_settings()
            runner = CrawlerRunner(crawler_settings)
            """
            dispatcher.connect(lambda _: spider_opened(_), signal=signals.spider_opened)  # 'item_scraped'
            dispatcher.connect(lambda _: spider_closed(_), signal=signals.spider_closed)  # 'item_scraped'
            dispatcher.connect(lambda _: item_scraped(_), signal=signals.item_scraped)  # 'item_scraped'
            deferred = runner.crawl(InfoempleoSpider())
            """
            # Pass the spider class and connect the handlers directly: Scrapy delivers
            # signal arguments by keyword, so the wrapping lambdas would fail when the signal fires.
            crawler = Crawler(InfoempleoSpider, crawler_settings)
            crawler.signals.connect(spider_opened, signal=signals.spider_opened)
            crawler.signals.connect(spider_closed, signal=signals.spider_closed)
            crawler.signals.connect(item_scraped, signal=signals.item_scraped)
            deferred = runner.crawl(crawler)
            deferred.addBoth(lambda _: reactor.stop())
            print('reactor...')
            reactor.run()
            print('run!!!!!')


        except Exception as e:
            print(e)
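
Example #2 drives the crawl through CrawlerRunner, which only schedules work and leaves running the Twisted reactor to the caller. Below is a minimal sketch of that pattern, with a hypothetical DemoSpider standing in for InfoempleoSpider; inside the real project you would pass get_project_settings() to the runner instead of the defaults:

import scrapy
from scrapy import signals
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from twisted.internet import reactor


class DemoSpider(scrapy.Spider):
    # Hypothetical stand-in for InfoempleoSpider.
    name = 'demo'
    start_urls = ['https://quotes.toscrape.com/']

    def parse(self, response):
        yield {'url': response.url}


def run_crawler():
    def spider_opened(spider):
        print(f'opened: {spider.name}')

    def item_scraped(item, response, spider):
        print(f'scraped an item from {response.url}')

    def spider_closed(spider, reason):
        print(f'closed: {spider.name} ({reason})')

    configure_logging()                              # CrawlerRunner does not set up logging itself
    runner = CrawlerRunner()
    crawler = runner.create_crawler(DemoSpider)
    crawler.signals.connect(spider_opened, signal=signals.spider_opened)
    crawler.signals.connect(item_scraped, signal=signals.item_scraped)
    crawler.signals.connect(spider_closed, signal=signals.spider_closed)

    deferred = runner.crawl(crawler)                 # returns a Deferred; nothing runs yet
    deferred.addBoth(lambda _: reactor.stop())       # stop the reactor when the crawl ends
    reactor.run()                                    # blocks here until reactor.stop()


if __name__ == '__main__':
    run_crawler()

runner.crawl() only schedules the crawl; the addBoth(lambda _: reactor.stop()) callback is what lets reactor.run() return once the crawl has finished, matching the deferred handling in the example above.
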
Example #3
    def crawl_():

        def close(*args, **kwargs):
            print(f'{multiprocessing.current_process().name}: *!!CLOSE')
            print(f'args: {args}')
            print(f'kwargs: {kwargs}')
            print()

        def open(*args, **kwargs):
            print(f'{multiprocessing.current_process().name}: *!!OPEN')
            print(f'args: {args}')
            print(f'kwargs: {kwargs}')
            print()

        def scraped(*args, **kwargs):
            print(f'{multiprocessing.current_process().name}: *!!SCRAPED')
            print(f'args: {args}')
            print(f'kwargs: {kwargs}')
            print()

        process = CrawlerProcess(get_project_settings())


        dispatcher.connect(open, signal=signals.spider_opened)
        dispatcher.connect(close, signal=signals.spider_closed)
        dispatcher.connect(scraped, signal=signals.item_scraped)
        process.crawl(InfoempleoSpider)  # the spider class, not an instance

        """
        crawler = Crawler(InfoempleoSpider, get_project_settings())
        crawler.signals.connect(open, signal=signals.spider_opened)
        crawler.signals.connect(scraped, signal=signals.item_scraped)
        crawler.signals.connect(close, signal=signals.spider_closed)
        process.crawl(crawler)
        """
        process.start()
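
Example #3 connects the handlers through the global dispatcher rather than a per-crawler SignalManager, so they fire for every spider running in the process. A minimal sketch of that style, assuming the pydispatch import (the PyDispatcher package Scrapy uses internally) and a hypothetical DemoSpider:

import scrapy
from pydispatch import dispatcher        # PyDispatcher; assumed to be the `dispatcher` in the snippets
from scrapy import signals
from scrapy.crawler import CrawlerProcess


class DemoSpider(scrapy.Spider):
    # Hypothetical stand-in for InfoempleoSpider.
    name = 'demo'
    start_urls = ['https://quotes.toscrape.com/']

    def parse(self, response):
        yield {'url': response.url}


def crawl_():
    def scraped(item, response, spider):
        print(f'scraped from {spider.name}: {item}')

    def closed(spider, reason):
        print(f'{spider.name} closed: {reason}')

    # Global connection: these handlers fire for every spider in the process,
    # unlike crawler.signals.connect(), which is scoped to one crawler.
    dispatcher.connect(scraped, signal=signals.item_scraped)
    dispatcher.connect(closed, signal=signals.spider_closed)

    process = CrawlerProcess()
    process.crawl(DemoSpider)            # the spider class, not an instance
    process.start()


if __name__ == '__main__':
    crawl_()

Because the dispatcher connection is process-global, it is convenient for quick scripts, but the per-crawler crawler.signals.connect() variant used in the other examples is easier to reason about when several crawls run in the same process.
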
Example #4
    def crawl(cls, q, conn):
        print()
        print()
        print('***************************************************************************************')
        print('crawl')

        def close(spider, reason):
            print(f'{multiprocessing.current_process().name}: *!!CLOSE')
            write_in_a_file('CrawlerProcess.signal.close', {'reason': reason}, 'task.txt')
            t = Task.objects.get_latest_crawler_task()
            d = datetime.today()
            t.description = f'spider closed with count: {CrawlProcess.count} at {str(d)}'
            t.result = CrawlProcess.count
            t.save()

        def open(spider):
            print(f'{multiprocessing.current_process().name}: *!!OPEN')
            try:
                name = spider.name
            except AttributeError:
                name = str(spider)
            write_in_a_file('CrawlerProcess.signal.open', {'spider': name}, 'task.txt')
            CrawlProcess.count = 0
            try:
                t = Task.objects.get_latest_crawler_task()
                t.name = str(multiprocessing.current_process().pid)
                t.save()
            except Exception as e:
                # The Task lookup itself may have failed, so do not touch `t` here.
                write_in_a_file('CrawlerProcess.signal.open: error updating task', {'error': e}, 'task.txt')
            #q.put_nowait()
            print()


        def scraped(item, response, spider):
            print(f'{multiprocessing.current_process().name}: *!!SCRAPED')

            print()
            CrawlProcess.count = CrawlProcess.count + 1
            n = CrawlProcess.count
            write_in_a_file('CrawlerProcess.signal.scraped_item', {'response': response, 'count': n}, 'task.txt')
            try:
                q.get_nowait()
                q.put_nowait(n)
            except Exception:  # queue.Empty: nothing to replace, just publish the new count
                q.put_nowait(n)

        def stopped(*args, **kwargs):
            write_in_a_file('CrawlerProcess.signal.stopped', {'args': args, 'kwargs': kwargs}, 'task.txt')

        def error(*args, **kwargs):
            write_in_a_file('CrawlerProcess.signal.error', {'args': args, 'kwargs': kwargs}, 'task.txt')

        def send_by_pipe(item):
            try:
                conn.send(item)
                #conn.close()
            except Exception as e:
                write_in_a_file('CrawlProcess._crawl: error conn.send', {'conn error': e}, 'debug.txt')

        process = CrawlerProcess(get_project_settings())
        write_in_a_file('CrawlProcess.crawl: first', {'crawler_process': str(process), 'dir process': dir(process)},
                        'debug.txt')
        send_by_pipe(process)
        write_in_a_file('CrawlProcess.crawl: second', {'crawler_process': str(process), 'dir process': dir(process)},'debug.txt')
        # Scheduling InfoempleoSpider here as well would start a second, unmonitored crawl;
        # only the signal-connected crawler below is scheduled.
        write_in_a_file('CrawlProcess.crawl: third', {'crawler_process': str(process), 'dir process': dir(process)},'debug.txt')
        crawler = Crawler(InfoempleoSpider, get_project_settings())  # the spider class, not an instance
        crawler.signals.connect(open, signal=signals.spider_opened)
        crawler.signals.connect(scraped, signal=signals.item_scraped)
        crawler.signals.connect(close, signal=signals.spider_closed)
        crawler.signals.connect(stopped, signal=signals.engine_stopped)
        crawler.signals.connect(error, signal=signals.spider_error)

        write_in_a_file('CrawlProcess.crawl: before', {'crawler_process': str(process),'dir process': dir(process)},'debug.txt')

        process.crawl(crawler)
        write_in_a_file('CrawlProcess.crawl: after', {'crawler_process': str(process), 'dir process': dir(process)}, 'debug.txt')

        process.start()
        write_in_a_file('CrawlProcess._crawl: process started', {'crawler_process': str(process), 'dir process': dir(process)}, 'debug.txt')

        print('***************************************************************************************')
        print(f'CrawlerProcess: {process}')
        print(dir(process))
        print('***************************************************************************************')
        print()
        print()
        write_in_a_file('CrawlProcess.crawl', {'CrawlerProcess': str(process), 'dir(CrawlerProcess)': dir(process)}, 'task.txt')
        process.join()
        write_in_a_file('CrawlProcess.crawl: process.join', {}, 'task.txt')
        write_in_a_file('CrawlProcess.crawl: process.join', {}, 'spider.txt')

        print('Crawler Process has Finished!!!!!')
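
Example #4 runs the crawl inside a worker process and publishes progress through a multiprocessing.Queue and a Pipe. Below is a stripped-down sketch of the queue half of that design, with a hypothetical DemoSpider and an illustrative ('scraped', n) / ('closed', n) message format; the Task bookkeeping, the write_in_a_file logging, and the Pipe are left out:

import multiprocessing

import scrapy
from scrapy import signals
from scrapy.crawler import CrawlerProcess


def crawl(q):
    # Runs in the child process, so the crawl gets its own Twisted reactor.
    class DemoSpider(scrapy.Spider):
        # Hypothetical stand-in for InfoempleoSpider.
        name = 'demo'
        start_urls = ['https://quotes.toscrape.com/']

        def parse(self, response):
            for text in response.css('span.text::text').getall():
                yield {'text': text}

    count = 0

    def scraped(item, response, spider):
        nonlocal count
        count += 1
        q.put(('scraped', count))        # only plain, picklable data crosses the process boundary

    def closed(spider, reason):
        q.put(('closed', count))

    process = CrawlerProcess()
    crawler = process.create_crawler(DemoSpider)
    crawler.signals.connect(scraped, signal=signals.item_scraped)
    crawler.signals.connect(closed, signal=signals.spider_closed)
    process.crawl(crawler)
    process.start()


if __name__ == '__main__':
    q = multiprocessing.Queue()
    worker = multiprocessing.Process(target=crawl, args=(q,))
    worker.start()
    while True:
        event, n = q.get()               # live progress reported by the child process
        print(event, n)
        if event == 'closed':
            break
    worker.join()

Example #4 wraps conn.send(process) in a try/except, presumably because a complex object like a CrawlerProcess may not survive pickling; the sketch sends only plain tuples, which always do.
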
Example #5
def run_crawler_script():
    Job.objects.filter(area=Job.AREA_BANKING_AND_INSURANCE).delete()
    crawler = CrawlerScript()
    spider = InfoempleoSpider()
    p, q = crawler.crawl(spider)
Example #6
def run_cp():
    print('run_cp')
    crawler = Crawler(InfoempleoSpider)  # pass the spider class, not an instance
    cp = CP(crawler)
    cp.start()
Example #7
def crawl_async():
    print('crawl_async')
    spider = InfoempleoSpider()
    crawler = CrawlerScript(spider)
    crawler.start()