def test_insert_documents(models, mock_osp):

    """
    Document.insert_documents() should create exactly one row for every file
    that has been added to the mock corpus.
    """

    segment_count = 10
    files_per_segment = 10

    # Populate the mock corpus: each segment gets the same number of files,
    # named [segment]-[index].
    for segment in segment_range(segment_count):
        for index in range(files_per_segment):
            file_name = '-'.join((segment, str(index)))
            mock_osp.add_file(segment=segment, name=file_name)

    Document.insert_documents()

    # One row per file added above.
    row_total = Document.select().count()
    assert row_total == 100

    # Every individual file must be matched by exactly one row.
    for segment in segment_range(segment_count):
        for index in range(files_per_segment):

            # Stored paths take the form [segment]/[file].
            file_name = '-'.join((segment, str(index)))
            expected_path = '/'.join((segment, file_name))

            matches = Document.select().where(Document.path == expected_path)
            assert matches.count() == 1
def queue_text():

    """
    Enqueue one `ext_text` (text extraction) job per document on `config.rq`.
    Each job receives the document's id.
    """

    documents = Document.select()

    for row in query_bar(documents):
        # Look the queue up per iteration, exactly as the job is submitted.
        queue = config.rq
        queue.enqueue(ext_text, row.id)
def queue_match_doc():

    """
    Enqueue one `match_doc` (institution matching) job per document on
    `config.rq`. Each job receives the document's id.
    """

    # NOTE(review): unlike the sibling queue helpers, this query is iterated
    # directly rather than through `query_bar` - confirm that is intended.
    documents = Document.select()

    for row in documents:
        queue = config.rq
        queue.enqueue(match_doc, row.id)
def queue_file_metadata():

    """
    Enqueue one `ext_file_metadata` (file metadata extraction) job per
    document on `config.rq`. Each job receives the document's id.
    """

    documents = Document.select()

    for row in query_bar(documents):
        doc_id = row.id
        config.rq.enqueue(ext_file_metadata, doc_id)
def queue_semester():

    """
    Enqueue one `ext_semester` (semester regex extraction) job per document
    on `config.rq`. Each job receives the document's id.
    """

    documents = Document.select()

    for row in query_bar(documents):
        doc_id = row.id
        config.rq.enqueue(ext_semester, doc_id)
def queue_archive_url():

    """
    Enqueue one `ext_archive_url` (Internet Archive timestamp extraction) job
    per document on `config.rq`. Each job receives the document's id.
    """

    documents = Document.select()

    for row in query_bar(documents):
        queue = config.rq
        queue.enqueue(ext_archive_url, row.id)
def test_insert_new_documents(models, mock_osp):

    """
    Calling Document.insert_documents() again after more files are added to
    the corpus should register only the new files; existing rows must not be
    duplicated.
    """

    expected_total = 0

    # Add 10 files to `000`, insert, then add 10 files to `001` and insert
    # again. The row count should grow by exactly 10 after each insert.
    for segment in ('000', '001'):

        for index in range(10):
            file_name = '-'.join((segment, str(index)))
            mock_osp.add_file(segment=segment, name=file_name)

        expected_total += 10

        Document.insert_documents()
        assert Document.select().count() == expected_total