def test_crawler_execute(self):
    """Run the document crawler once and verify what it recorded.

    Starts from zero ``CrawlerState`` rows, executes the crawler, then
    checks that two states exist, that the second state's metadata
    mentions the demo PDF, and that the collection with foreign id
    ``'test'`` holds exactly one document.
    """
    # Precondition: no crawler state has been recorded yet.
    initial_count = CrawlerState.all().count()
    assert initial_count == 0, initial_count

    crawler = TDocumentCrawler()
    crawler.execute()

    recorded = CrawlerState.all().all()
    recorded_count = len(recorded)
    assert recorded_count == 2, recorded_count

    # The second recorded state is expected to describe the demo PDF.
    demo_state = recorded[1]
    assert 'kitty' in demo_state.meta['title'], demo_state.meta
    assert 'demo.pdf' in demo_state.meta['source_path'], demo_state.meta

    collection = Collection.by_foreign_id('test')
    assert collection is not None, collection
    assert len(list(collection.documents)) == 1, list(collection.documents)
def test_incremental(self):
    """Run the crawler fully, then incrementally, and count the states.

    After one regular ``execute()`` followed by one
    ``execute(incremental=True)``, three ``CrawlerState`` rows are
    expected in total.
    """
    crawler = TDocumentCrawler()
    crawler.execute()
    crawler.execute(incremental=True)

    recorded = CrawlerState.all().all()
    recorded_count = len(recorded)
    assert recorded_count == 3, recorded_count