Example 1
 def close(spider, reason):
     print(f'{multiprocessing.current_process().name}: *!!CLOSE')
     write_in_a_file('CrawlerProcess.signal.close', {'reason': reason}, 'task.txt')
     t = Task.objects.get_latest_crawler_task()
     d = datetime.today()
     t.description = f'spider closed with count: {CrawlProcess.count} at {str(d)}'
     t.result = CrawlProcess.count
     t.save()
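All of these snippets log through a write_in_a_file helper that is never shown in the listing. A minimal sketch of such a helper, assuming it simply appends a label plus a dict of values to the given file (the real implementation may differ):

from datetime import datetime

def write_in_a_file(label, data, filename):
    # Hypothetical logging helper; appends one labelled block per call.
    with open(filename, 'a') as f:
        f.write(f'{datetime.now()} - {label}\n')
        for key, value in data.items():
            f.write(f'    {key}: {value}\n')
        f.write('\n')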
Example 2
 def crawl(self, spider):
     queue = Queue()
     self.queue = queue  # keep a reference to the same queue that is passed to the child process
     self.process = Process(target=self._crawl, args=(queue, spider))
     self.process.start()
     write_in_a_file('.crawl 1', {'process': self.process, 'process-pid': self.process and self.process.pid, 'queue': self.queue.qsize()}, "t.txt")
     self.process.join()
     write_in_a_file('.crawl 2', {'process': self.process, 'process-pid': self.process and self.process.pid, 'queue': self.queue.qsize()}, "t.txt")
Example 3
        def scraped(item, response, spider):
            print(f'{multiprocessing.current_process().name}: *!!SCRAPED')

            print()
            CrawlProcess.count = CrawlProcess.count + 1
            n = CrawlProcess.count
            write_in_a_file('CrawlerProcess.signal.scraped_item', {'response': response, 'count': n}, 'task.txt')
            try:
                # keep only the most recent count in the queue
                q.get_nowait()
                q.put_nowait(n)
            except Exception:
                q.put_nowait(n)
Example 4
 def _crawl(self, queue, spider):
     self.crawler = CrawlerProcess(get_project_settings())
     self.crawler.crawl(spider)
     dispatcher.connect(self._item_scraped, signals.item_scraped)
     dispatcher.connect(self._so, signals.spider_opened)
     dispatcher.connect(self._so2, signals.spider_opened)
     dispatcher.connect(self._sc, signals.spider_closed)
     write_in_a_file('crawler start', {'db': dir(db), 'db.connection': dir(db.connection)}, "t.txt")
     print(dir(db.connection))
     # close the DB connection inherited from the parent process so Django opens a fresh one
     db.connection.close()
     self.crawler.start()
     self.crawler.stop()
     write_in_a_file('crawler ended', {'qsize': queue.qsize() }, "t.txt")
     queue.put(self.items)
Example 5
 def open(spider):
     print(f'{multiprocessing.current_process().name}: *!!OPEN')
     try:
         name = spider.name
     except Exception:
         name = str(spider)
     write_in_a_file('CrawlerProcess.signal.open', {'spider': name}, 'task.txt')
     CrawlProcess.count = 0
     t = Task.objects.get_latest_crawler_task()
     try:
         t.name = str(process.pid)
     except Exception as e:
         t.name = str(e)  # record the error text instead of the pid
     t.save()
     #q.put_nowait()
     print()
Example 6
 def _reset_process(self, state=Task.STATE_FINISHED):
     print(f'CrawlerProcess._reset_process({state})')
     try:
         self.process.terminate()
         write_in_a_file('_reset_process terminated (from stop)', {'is_running': self.process.is_alive()}, 'debug.txt')
         self.task.result = CrawlProcess.count
         self.task.state = state
         self.task.save()
         self.process.join()  # ! IMPORTANT after .terminate -> .join
     write_in_a_file('_reset_process joined (from stop)', {'is_running': self.process.is_alive()}, 'debug.txt')
     except Exception:
         pass
     try:
         self.result = self.q.get_nowait()
     except Exception as e:
         pass
     self._clear_queue()
Example 7
def run_crawler():
    write_in_a_file('run crawler start', {}, 'celery.txt')
    Job.objects.filter(area=Job.AREA_BANKING_AND_INSURANCE).delete()
    try:
        crawler_settings = get_project_settings()
        runner = CrawlerRunner(crawler_settings)
        dispatcher.connect(lambda _: print('finish'), signal=signals.spider_closed)
        dispatcher.connect(lambda _: print('item scraped'), signal=signals.item_scraped)
        deferred = runner.crawl(InfoempleoSpider)
        deferred.addBoth(lambda _: reactor.stop())
        print('reactor...')
        print('run!!!!!')
        reactor.run()
        write_in_a_file('run crawler end', {}, 'celery.txt')
        print('end!!!!!')
    except Exception as e:
        print(e)
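The 'celery.txt' log file suggests run_crawler is meant to be executed from a Celery worker. A hypothetical task wrapper (the task name and decorator placement are assumptions, not part of the listing):

from celery import shared_task

@shared_task
def run_crawler_task():
    # Delegates to run_crawler() above. Note that Twisted's reactor.run()
    # can only be started once per process, so repeated crawls are safer
    # in a fresh child process, as the multiprocessing-based examples do.
    run_crawler()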
Example 8
 def _crawl(self, queue, spider):
     crawler = CrawlerProcess(get_project_settings())
     crawler.crawl(spider)
     write_in_a_file('signals', {'signals': dir(signals)}, 'task.txt')
     write_in_a_file('._crawl start', {'process': self.process, 'process-pid': self.process and self.process.pid, 'db': dir(db), 'db.connection': dir(db.connection)}, "t.txt")
     print(dir(db.connection))
     # close the DB connection inherited from the parent process so Django opens a fresh one
     db.connection.close()
     crawler.start()
     crawler.stop()
     write_in_a_file('._crawl ended 1', {'qsize': self.queue.qsize() }, "t.txt")
     queue.put_nowait(self.items)
     write_in_a_file('._crawl ended after q 2', {'qsize': queue.qsize()}, "t.txt")
Example 9
 def _start_process(self):
     print('CrawlProcess._start_process')
     self.init_datetime = timezone.now()  # set before the task is created
     self.process.start()
     self.task.pid = self.process.pid
     write_in_a_file('CrawlProcess._start_process: process started', {'pid': self.process.pid}, 'debug.txt')
     self.task.state = Task.STATE_RUNNING
     self.task.save()
     self.crawler_process = self.parent_conn.recv()
     write_in_a_file('CrawlProcess._start_process: conn.recv', {'crawler_process':str(self.crawler_process), 'dir crawler_process':dir(self.crawler_process)}, 'debug.txt')
     write_in_a_file('CrawlProcess._start_process', {'CrawlerProcess': str(self.crawler_process), 'dir(CrawlerProcess)': dir(self.crawler_process)},'task.txt')
Example 10
    def crawl(cls, q, conn):
        print()
        print()
        print('***************************************************************************************')
        print('crawl')

        def close(spider, reason):
            print(f'{multiprocessing.current_process().name}: *!!CLOSE')
            write_in_a_file('CrawlerProcess.signal.close', {'reason': reason}, 'task.txt')
            t = Task.objects.get_latest_crawler_task()
            d = datetime.today()
            t.description = f'spider closed with count: {CrawlProcess.count} at {str(d)}'
            t.result = CrawlProcess.count
            t.save()

        def open(spider):
            print(f'{multiprocessing.current_process().name}: *!!OPEN')
            try:
                name = spider.name
            except Exception:
                name = str(spider)
            write_in_a_file('CrawlerProcess.signal.open', {'spider': name}, 'task.txt')
            CrawlProcess.count = 0
            t = Task.objects.get_latest_crawler_task()
            try:
                t.name = str(process.pid)
            except Exception as e:
                t.name = str(e)  # record the error text instead of the pid
            t.save()
            #q.put_nowait()
            print()


        def scraped(item, response, spider):
            print(f'{multiprocessing.current_process().name}: *!!SCRAPED')

            print()
            CrawlProcess.count = CrawlProcess.count + 1
            n = CrawlProcess.count
            write_in_a_file('CrawlerProcess.signal.scraped_item', {'response': response, 'count': n}, 'task.txt')
            try:
                # keep only the most recent count in the queue
                q.get_nowait()
                q.put_nowait(n)
            except Exception:
                q.put_nowait(n)

        def stopped(*args, **kwargs):
            write_in_a_file('CrawlerProcess.signal.stopped', {'args': args, 'kwargs': kwargs}, 'task.txt')

        def error(*args, **kwargs):
            write_in_a_file('CrawlerProcess.signal.error', {'args': args, 'kwargs': kwargs}, 'task.txt')

        def send_by_pipe(item):
            try:
                conn.send(item)
                #conn.close()
            except Exception as e:
                write_in_a_file('CrawlProcess._crawl: error conn.send', {'conn error': e}, 'debug.txt')

        process = CrawlerProcess(get_project_settings())
        write_in_a_file('CrawlProcess.crawl: first', {'crawler_process': str(process), 'dir process': dir(process)},
                        'debug.txt')
        send_by_pipe(process)
        write_in_a_file('CrawlProcess.crawl: second', {'crawler_process': str(process), 'dir process': dir(process)},'debug.txt')
        process.crawl(InfoempleoSpider())
        write_in_a_file('CrawlProcess.crawl: third', {'crawler_process': str(process), 'dir process': dir(process)},'debug.txt')
        crawler = Crawler(InfoempleoSpider())
        crawler.signals.connect(open, signal=signals.spider_opened)
        crawler.signals.connect(scraped, signal=signals.item_scraped)
        crawler.signals.connect(close, signal=signals.spider_closed)
        crawler.signals.connect(stopped, signal=signals.engine_stopped)
        crawler.signals.connect(error, signal=signals.spider_error)

        write_in_a_file('CrawlProcess.crawl: before', {'crawler_process': str(process),'dir process': dir(process)},'debug.txt')

        process.crawl(crawler)
        write_in_a_file('CrawlProcess.crawl: after', {'crawler_process': str(process), 'dir process': dir(process)}, 'debug.txt')

        process.start()
        write_in_a_file('CrawlProcess._crawl: process started', {'crawler_process': str(process), 'dir process': dir(process)}, 'debug.txt')

        print('***************************************************************************************')
        print(f'CrawlerProcess: {process}')
        print(dir(process))
        print('***************************************************************************************')
        print()
        print()
        write_in_a_file('CrawlProcess.crawl', {'CrawlerProcess': str(process), 'dir(CrawlerProcess)': dir(process)}, 'task.txt')
        process.join()
        write_in_a_file('CrawlProcess.crawl: process.join', {}, 'task.txt')
        write_in_a_file('CrawlProcess.crawl: process.join', {}, 'spider.txt')

        print('Crawler Process has Finished!!!!!')
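Example 9 shows the parent side of this setup. A condensed sketch of how crawl(q, conn) might be driven from a parent process, assuming it is exposed as a classmethod of CrawlProcess (names here are illustrative, not from the listing):

from multiprocessing import Process, Queue, Pipe

q = Queue()
parent_conn, child_conn = Pipe()
process = Process(target=CrawlProcess.crawl, args=(q, child_conn))
process.start()
crawler_process = parent_conn.recv()  # handle sent back by send_by_pipe()
process.join()
try:
    print('last scraped count:', q.get_nowait())
except Exception:
    pass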
Example 11
 def send_by_pipe(item):
     try:
         conn.send(item)
         #conn.close()
     except Exception as e:
         write_in_a_file('CrawlProcess._crawl: error conn.send', {'conn error': e}, 'debug.txt')
Example 12
 def _item_scraped(self, item, **kwargs):
     self._count = self._count + 1
     write_in_a_file('item scraped', {'count':self._count, 'item': item, 'kwargs':kwargs, 'process': self.process, 'process-pid': self.process and self.process.pid}, "t.txt")
     self.items.append(item)
     self.queue.put_nowait(item)
Example 13
 def _sc(self):
     write_in_a_file('spider_closed', {'scraped items': len(self.items)}, "t.txt")
Example 14
 def _so(self):
     write_in_a_file('spider_opened 1', {'open': 'open!', 'x': self.x, 'process': self.process, 'process-pid': self.process and self.process.pid}, "t.txt")
Example 15
 def error(*args, **kwargs):
     write_in_a_file('CrawlerProcess.signal.error', {'args': args, 'kwargs': kwargs}, 'task.txt')
Example 16
 def crawl(self, spider):
     queue = Queue()
     self.process = Process(target=self._crawl, args=(queue, spider,))
     self.process.start()
     write_in_a_file('crawler started', {'crawler': dir(self.crawler)}, "t.txt")
     return self.process, queue
Example 17
 def _item_scraped(self, item):
     write_in_a_file('item scraped', {'item': item}, "t.txt")
     self.items.append(item)
Example 18
 def _so2(self):
     write_in_a_file('spider_opened 2', {}, "t.txt")
Example 19
def print_something5():
    x = "print_something5"
    write_in_a_file('print_something5',{},'celery.txt')
    print(x)
    return x
Example 20
 def stopped(*args, **kwargs):
     write_in_a_file('CrawlerProcess.signal.stopped', {'args': args, 'kwargs': kwargs}, 'task.txt')