コード例 #1
0
ファイル: ingestor.py プロジェクト: CodeForAfrica/aleph
 def dispatch(cls, collection_id, meta):
     """Ingest the file described by ``meta`` and record the outcome.

     The file is loaded through the archive, ``cls.auction_file`` selects
     the ingestor class to use, and that class is instantiated with
     ``collection_id`` and asked to ingest the file. On success an "ok"
     crawler state is stored and the session is committed.

     Any ``Exception`` raised after the file has been loaded is passed to
     ``cls.handle_exception`` rather than propagated. An error raised by
     the initial ``load_file`` call is not caught here.
     """
     local_path = get_archive().load_file(meta)
     try:
         ingestor_cls = cls.auction_file(meta, local_path)
         log.debug("Dispatching %r to %r", meta.file_name, ingestor_cls)
         ingestor = ingestor_cls(collection_id)
         ingestor.ingest(meta, local_path)
         CrawlerState.store_ok(meta, collection_id)
         db.session.commit()
     except Exception as error:
         # Failures are delegated to the handler instead of being re-raised.
         cls.handle_exception(meta, collection_id, error)
     finally:
         # Runs whether ingestion succeeded or failed.
         get_archive().cleanup_file(meta)
コード例 #2
0
ファイル: ingestor.py プロジェクト: OpenOil-UG/aleph
 def dispatch(cls, source_id, meta):
     """Ingest the file described by ``meta`` and record the outcome.

     The file is loaded through the archive, ``cls.auction_file`` selects
     the ingestor class to use, and that class is instantiated with
     ``source_id`` and asked to ingest the file. On success an "ok"
     crawler state is stored and the session is committed.

     Any ``Exception`` raised after the file has been loaded is passed to
     ``cls.handle_exception`` rather than propagated. An error raised by
     the initial ``load_file`` call is not caught here.
     """
     local_path = get_archive().load_file(meta)
     try:
         winner = cls.auction_file(meta, local_path)
         log.debug("Dispatching %r to %r", meta.file_name, winner)
         ingestor = winner(source_id)
         ingestor.ingest(meta, local_path)
         CrawlerState.store_ok(meta, source_id)
         db.session.commit()
     except Exception as error:
         # Failures are delegated to the handler instead of being re-raised.
         cls.handle_exception(meta, source_id, error)
     finally:
         # Runs whether ingestion succeeded or failed.
         get_archive().cleanup_file(meta)
コード例 #3
0
ファイル: test_crawler.py プロジェクト: CodeForAfrica/aleph
    def test_crawler_execute(self):
        """Run the test crawler once and check the states and documents.

        Expects no crawler states before the run, exactly two afterwards
        (the second carrying the 'kitty' / 'demo.pdf' metadata), and a
        'test' collection holding exactly one document.
        """
        crawler = TDocumentCrawler()
        initial_count = CrawlerState.all().count()
        assert initial_count == 0, initial_count

        crawler.execute()

        recorded = CrawlerState.all().all()
        assert len(recorded) == 2, len(recorded)
        second_state = recorded[1]
        assert 'kitty' in second_state.meta['title'], second_state.meta
        assert 'demo.pdf' in second_state.meta['source_path'], \
            second_state.meta

        collection = Collection.by_foreign_id('test')
        assert collection is not None, collection
        documents = list(collection.documents)
        assert len(documents) == 1, documents
コード例 #4
0
    def test_crawler_execute(self):
        """Check a single crawler run against an initially empty state table.

        After the run there must be two crawler states, the second of
        which carries the 'kitty' title and the 'demo.pdf' source path,
        and the 'test' collection must contain one document.
        """
        # Precondition: nothing has been recorded yet.
        before = CrawlerState.all().count()
        assert before == 0, before

        TDocumentCrawler().execute()

        all_states = CrawlerState.all().all()
        assert len(all_states) == 2, len(all_states)
        last = all_states[1]
        assert 'kitty' in last.meta['title'], last.meta
        assert 'demo.pdf' in last.meta['source_path'], last.meta

        test_collection = Collection.by_foreign_id('test')
        assert test_collection is not None, test_collection
        docs = list(test_collection.documents)
        assert len(docs) == 1, docs
コード例 #5
0
 def handle_exception(cls, meta, source_id, exception):
     """Roll back the session and store a failure state for ``meta``.

     The session is rolled back and closed first. The error message and
     details come from ``sys.exc_info()`` when an exception is currently
     being handled, otherwise from the ``exception`` argument. The stored
     error type is always the class name of ``exception``. The failure
     record is then committed.
     """
     db.session.rollback()
     db.session.close()

     active_type, active_value, active_tb = sys.exc_info()
     if active_type is None:
         # No exception is being handled: use the object passed in.
         # The details keep the third item of sys.exc_info() unchanged.
         message = unicode(exception)
         details = active_tb
     else:
         message = unicode(active_value)
         details = traceback.format_exc()

     # The type name comes from the argument, not from sys.exc_info().
     type_name = exception.__class__.__name__
     log.warning(message)
     CrawlerState.store_fail(meta, source_id,
                             error_type=type_name,
                             error_message=message,
                             error_details=details)
     db.session.commit()
コード例 #6
0
ファイル: ingestor.py プロジェクト: adamchainz/aleph
 def handle_exception(cls, meta, collection_id, exception):
     """Roll back the session and store a failure state for ``meta``.

     The session is rolled back and closed first. A ``SQLAlchemyError``
     is only logged; no failure state is stored for it. For any other
     exception the message and details come from ``sys.exc_info()`` when
     an exception is currently being handled, otherwise from the
     ``exception`` argument. The stored error type is always the class
     name of ``exception``. The failure record is then committed.
     """
     db.session.rollback()
     db.session.close()

     if isinstance(exception, SQLAlchemyError):
         log.exception(exception)
         return

     active_type, active_value, active_tb = sys.exc_info()
     if active_type is None:
         # No exception is being handled: use the object passed in.
         # The details keep the third item of sys.exc_info() unchanged.
         message = unicode(exception)
         details = active_tb
     else:
         message = unicode(active_value)
         details = traceback.format_exc()

     # The type name comes from the argument, not from sys.exc_info().
     type_name = exception.__class__.__name__
     log.warning(message)
     CrawlerState.store_fail(meta, collection_id,
                             error_type=type_name,
                             error_message=message,
                             error_details=details)
     db.session.commit()
コード例 #7
0
 def handle_exception(cls, meta, collection_id, exception):
     """Reset the session and record why processing ``meta`` failed.

     After rolling back and closing the session, a ``SQLAlchemyError``
     is logged and the method returns without storing anything. For
     other exceptions a failure state is stored and committed, holding
     the class name of ``exception``, a message, and details.
     """
     db.session.rollback()
     db.session.close()

     if isinstance(exception, SQLAlchemyError):
         log.exception(exception)
         return

     info_type, info_value, info_tb = sys.exc_info()
     handling_active = info_type is not None
     if handling_active:
         text = unicode(info_value)
         trace = traceback.format_exc()
     else:
         # Nothing is being handled right now, so describe the argument.
         # The details keep the third item of sys.exc_info() unchanged.
         text = unicode(exception)
         trace = info_tb

     kind = exception.__class__.__name__
     log.warning(text)
     CrawlerState.store_fail(meta,
                             collection_id,
                             error_type=kind,
                             error_message=text,
                             error_details=trace)
     db.session.commit()
コード例 #8
0
ファイル: crawler.py プロジェクト: adamchainz/aleph
 def to_dict(self):
     """Return the crawler's stats merged with its descriptive fields.

     Starts from ``CrawlerState.crawler_stats`` for this crawler's id and
     adds the collection, collection id and label, name, schedule and id.
     The label falls back to the collection id when it is falsy.
     """
     data = CrawlerState.crawler_stats(self.get_id())
     collection = self.collection
     collection_id = self.COLLECTION_ID
     label = self.COLLECTION_LABEL or self.COLLECTION_ID
     description = {
         'collection': collection,
         'collection_id': collection_id,
         'collection_label': label,
         'name': self.CRAWLER_NAME,
         'schedule': self.SCHEDULE,
         'id': self.get_id()
     }
     data.update(description)
     return data
コード例 #9
0
 def to_dict(self):
     """Return the crawler's stats merged with its descriptive fields.

     Starts from ``CrawlerState.crawler_stats`` for this crawler's id and
     adds the source, source id and label, name, schedule and id. The
     label falls back to the source id when it is falsy.
     """
     data = CrawlerState.crawler_stats(self.get_id())
     source = self.source
     source_id = self.SOURCE_ID
     label = self.SOURCE_LABEL or self.SOURCE_ID
     description = {
         'source': source,
         'source_id': source_id,
         'source_label': label,
         'name': self.CRAWLER_NAME,
         'schedule': self.SCHEDULE,
         'id': self.get_id()
     }
     data.update(description)
     return data
コード例 #10
0
ファイル: crawler.py プロジェクト: adamchainz/aleph
 def to_dict(self):
     """Describe this crawler as a dictionary.

     The result is whatever ``CrawlerState.crawler_stats`` returns for
     this crawler's id, updated with the collection fields, the crawler
     name, the schedule and the id.
     """
     stats = CrawlerState.crawler_stats(self.get_id())
     fields = {
         'collection': self.collection,
         'collection_id': self.COLLECTION_ID,
         # An unset (falsy) label falls back to the collection id.
         'collection_label': self.COLLECTION_LABEL or self.COLLECTION_ID,
         'name': self.CRAWLER_NAME,
         'schedule': self.SCHEDULE,
         'id': self.get_id()
     }
     stats.update(fields)
     return stats
コード例 #11
0
 def check_due(self, crawler_id):
     """Return True when the crawler ``crawler_id`` should be run now.

     A crawler with no recorded last run is always due. A crawler whose
     last run falls within ``CrawlerState.TIMEOUT`` of the current time
     is skipped. Otherwise it is due once more than ``self.delta`` has
     passed since the last run.
     """
     _unused, previous = CrawlerState.crawler_last_run(crawler_id)
     if previous is None:
         return True

     # NOTE(review): this is naive local time; the original author asked
     # whether it should be utcnow() - confirm how last_run is stored.
     current = datetime.now()
     if previous > (current - CrawlerState.TIMEOUT):
         log.info("Crawler was active very recently. Skip due.")
         return False
     return current > previous + self.delta
コード例 #12
0
ファイル: schedule.py プロジェクト: CodeForAfrica/aleph
 def check_due(self, crawler_id):
     """Decide whether the crawler ``crawler_id`` is due for another run.

     Returns True if there is no recorded last run, False if the last run
     is more recent than ``CrawlerState.TIMEOUT`` ago, and otherwise
     whether the current time is past the last run plus ``self.delta``.
     """
     _unused, last_seen = CrawlerState.crawler_last_run(crawler_id)
     never_ran = last_seen is None
     if never_ran:
         return True

     # NOTE(review): this is naive local time; the original author asked
     # whether it should be utcnow() - confirm how last_run is stored.
     moment = datetime.now()
     if last_seen > (moment - CrawlerState.TIMEOUT):
         log.info("Crawler was active very recently. Skip due.")
         return False
     return moment > last_seen + self.delta
コード例 #13
0
ファイル: crawler.py プロジェクト: rlugojr/aleph
 def to_dict(self):
     """Return the crawler's stats plus its name, schedule and id.

     The collection and collection id are added only when
     ``COLLECTION_ID`` is truthy.
     """
     data = CrawlerState.crawler_stats(self.get_id())
     basics = {
         "name": self.CRAWLER_NAME,
         "schedule": self.SCHEDULE,
         "id": self.get_id(),
     }
     data.update(basics)
     if not self.COLLECTION_ID:
         return data
     collection_fields = {
         "collection": self.collection,
         "collection_id": self.COLLECTION_ID,
     }
     data.update(collection_fields)
     return data
コード例 #14
0
ファイル: crawler.py プロジェクト: adamchainz/aleph
 def execute(self, **kwargs):
     """Store a stub crawler state, commit it, then run the parent execute.

     The stub is stored with the collection id, this crawler's id and the
     current ``crawler_run`` value. All keyword arguments are forwarded
     unchanged to the parent implementation; nothing is returned.
     """
     collection_id = self.collection.id
     crawler_id = self.get_id()
     run = self.crawler_run
     CrawlerState.store_stub(collection_id, crawler_id, run)
     db.session.commit()
     super(DocumentCrawler, self).execute(**kwargs)
コード例 #15
0
 def execute(self, **kwargs):
     """Store a stub crawler state, commit it, then run the parent execute.

     The stub is stored with the source id, this crawler's id and the
     current ``crawler_run`` value. All keyword arguments are forwarded
     unchanged to the parent implementation; nothing is returned.
     """
     source_id = self.source.id
     crawler_id = self.get_id()
     run = self.crawler_run
     CrawlerState.store_stub(source_id, crawler_id, run)
     db.session.commit()
     super(DocumentCrawler, self).execute(**kwargs)
コード例 #16
0
ファイル: test_crawler.py プロジェクト: CodeForAfrica/aleph
 def test_incremental(self):
     """Run the crawler fully, then incrementally, and expect three states."""
     crawler = TDocumentCrawler()
     crawler.execute()
     crawler.execute(incremental=True)
     state_count = len(CrawlerState.all().all())
     assert state_count == 3, state_count
コード例 #17
0
 def test_incremental(self):
     """Check that a full run followed by an incremental run leaves 3 states."""
     crawler = TDocumentCrawler()
     # First a regular run, then a second one in incremental mode.
     for options in ({}, {'incremental': True}):
         crawler.execute(**options)
     recorded = CrawlerState.all().all()
     total = len(recorded)
     assert total == 3, total