Ejemplo n.º 1
0
def test_insert_documents(models, mock_osp):

    """
    Document.insert_documents() should register exactly one row for every
    file in the corpus, with the row's path set to `[segment]/[file name]`.
    """

    files_per_segment = 10

    # Populate the mock corpus: each segment receives 10 files, each named
    # `[segment]-[index]`.
    for segment in segment_range(10):
        for index in range(files_per_segment):
            file_name = '-'.join((segment, str(index)))
            mock_osp.add_file(segment=segment, name=file_name)

    # Register the corpus files as document rows.
    Document.insert_documents()

    # 10 segments x 10 files -> 100 rows overall.
    row_count = Document.select().count()
    assert row_count == 100

    # Every individual file must be matched by exactly one row.
    for segment in segment_range(10):
        for index in range(files_per_segment):

            # Expected path: [segment]/[segment]-[index]
            file_name = '-'.join((segment, str(index)))
            expected_path = '/'.join((segment, file_name))

            # Look the document up by its path.
            matches = Document.select().where(Document.path == expected_path)
            assert matches.count() == 1
Ejemplo n.º 2
0
def queue_text():

    """
    Enqueue one text extraction job (`ext_text`) per document.

    The rows of `Document.select()` are iterated via `query_bar`, and each
    row's id is handed to `config.rq.enqueue` as the job's only argument.
    """

    documents = Document.select()

    for document in query_bar(documents):
        queue = config.rq
        queue.enqueue(ext_text, document.id)
Ejemplo n.º 3
0
def queue_match_doc():

    """
    Enqueue one institution matching job (`match_doc`) per document.

    Each row of `Document.select()` contributes a single job; the row's id
    is handed to `config.rq.enqueue` as the job's only argument.
    """

    documents = Document.select()

    for document in documents:
        queue = config.rq
        queue.enqueue(match_doc, document.id)
Ejemplo n.º 4
0
def queue_file_metadata():

    """
    Enqueue one file metadata extraction job (`ext_file_metadata`) per
    document.

    The rows of `Document.select()` are iterated via `query_bar`, and each
    row's id is handed to `config.rq.enqueue` as the job's only argument.
    """

    documents = Document.select()

    for document in query_bar(documents):
        queue = config.rq
        queue.enqueue(ext_file_metadata, document.id)
Ejemplo n.º 5
0
def queue_semester():

    """
    Enqueue one semester regex extraction job (`ext_semester`) per document.

    The rows of `Document.select()` are iterated via `query_bar`, and each
    row's id is handed to `config.rq.enqueue` as the job's only argument.
    """

    documents = Document.select()

    for document in query_bar(documents):
        queue = config.rq
        queue.enqueue(ext_semester, document.id)
Ejemplo n.º 6
0
def queue_archive_url():

    """
    Enqueue one Internet Archive timestamp extraction job
    (`ext_archive_url`) per document.

    The rows of `Document.select()` are iterated via `query_bar`, and each
    row's id is handed to `config.rq.enqueue` as the job's only argument.
    """

    documents = Document.select()

    for document in query_bar(documents):
        queue = config.rq
        queue.enqueue(ext_archive_url, document.id)
Ejemplo n.º 7
0
def test_insert_new_documents(models, mock_osp):

    """
    Calling Document.insert_documents() again after more files have been
    added to the corpus should only register the new files: the row count
    grows by the number of added files, not by the size of the corpus.
    """

    # Each step adds 10 files to one segment, re-runs the insert, and checks
    # the cumulative row count: (segment, expected total rows afterwards).
    steps = (
        ('000', 10),
        ('001', 20),
    )

    for segment, expected_total in steps:

        # 10 new files named `[segment]-[index]`.
        for index in range(10):
            file_name = '-'.join((segment, str(index)))
            mock_osp.add_file(segment=segment, name=file_name)

        # Only the 10 files just added should produce new rows.
        Document.insert_documents()
        total_rows = Document.select().count()
        assert total_rows == expected_total