Ejemplo n.º 1
0
def test_insert_documents(models, mock_osp):

    """
    Document.insert_documents() should register exactly one row for every
    file in the mock corpus.
    """

    expected_paths = []

    # Populate 10 segments with 10 files each, remembering each file's
    # [segment]/[file] path as we go.
    for segment in segment_range(10):
        for index in range(10):
            name = segment+'-'+str(index)
            mock_osp.add_file(segment=segment, name=name)
            expected_paths.append(segment+'/'+name)

    # Register the corpus files.
    Document.insert_documents()

    # 10 x 10 files -> 100 rows.
    assert Document.select().count() == 100

    # Every file path should map to exactly one row.
    for path in expected_paths:
        matches = Document.select().where(Document.path==path)
        assert matches.count() == 1
Ejemplo n.º 2
0
def insert_documents():

    """
    Insert document rows into the database.

    Thin entry point that delegates to `Document.insert_documents()`. The
    delegate's return value is discarded, so this always returns None.
    """

    Document.insert_documents()
Ejemplo n.º 3
0
def match_doc(id):

    """
    Link a document to an institution whose web address matches the
    document's registered domain.

    If the syllabus has no registered domain, or no institution matches,
    nothing is written.

    Args:
        id (int): A document id.
    """

    document = Document.get(Document.id==id)

    # Nothing to match against without a registered domain.
    if not document.syllabus.registered_domain:
        return

    # Wrap the domain in wildcards so it can match anywhere in the address.
    pattern = '%'+document.syllabus.registered_domain+'%'

    # NOTE(review): `**` is presumably the ORM's case-insensitive LIKE
    # operator -- confirm against the ORM documentation.
    web_address = Institution.metadata['Institution_Web_Address']
    candidates = Institution.select().where(web_address ** (pattern))

    # Take the matching institution with the lowest id.
    institution = candidates.order_by(Institution.id).first()

    if not institution:
        return

    Document_Institution.create(
        document=document.id,
        institution=institution
    )
Ejemplo n.º 4
0
def test_es_insert(models, config, corpus_index):

    """
    Document_Text.es_insert() should index every text row in Elasticsearch.
    """

    # Create 10 documents, each with one text row. The text is deliberately
    # different from the path so the `body` assertion below cannot pass by
    # coincidence.
    for i in range(10):
        doc = Document.create(path=str(i))
        Document_Text.create(document=doc, text='text-'+str(i))

    Document_Text.es_insert()

    # Should insert 10 docs.
    assert Document_Text.es_count() == 10

    # For each text row:
    for t in Document_Text.select():

        # A document should exist, looked up by the document path.
        doc = config.es.get('osp', t.document.path)

        # Should index the document id and the row's own text.
        assert doc['_source']['doc_id'] == t.document.id
        assert doc['_source']['body'] == t.text
Ejemplo n.º 5
0
def ext_archive_url(doc_id):

    """
    Try to extract an Internet Archive timestamp from the URL.

    Args:
        doc_id (int): The document id.

    Returns:
        The new `Document_Date_Archive_Url` row, or None when the URL has no
        archive timestamp or the parsed date is not in the past.

    NOTE(review): `datetime.strptime` raises ValueError when the matched
    digits do not fit `date_format` (defined elsewhere) -- confirm whether
    callers rely on that propagating.
    """

    doc = Document.get(Document.id==doc_id)

    # Raw string: the pattern value is unchanged, but the backslash escapes
    # are no longer invalid string-literal escape sequences.
    match = re.search(
        r'web\.archive\.org\/web\/(?P<timestamp>\d+)',
        doc.syllabus.url
    )

    if match:

        date = datetime.strptime(
            match.group('timestamp'),
            date_format
        )

        # Only keep dates that are earlier than the current time.
        if date < datetime.now():

            return Document_Date_Archive_Url.create(
                document=doc,
                date=date
            )
Ejemplo n.º 6
0
def queue_file_metadata():

    """
    Enqueue one `ext_file_metadata` job per document row.
    """

    documents = Document.select()

    # Each job receives only the document id.
    for row in query_bar(documents):
        config.rq.enqueue(ext_file_metadata, row.id)
Ejemplo n.º 7
0
def test_link_with_document(models, mock_osp):

    """
    The row returned by ext_semester() should reference the document whose
    id was passed to the job.
    """

    # Two documents; only the second one is used below.
    Document.create(path='path1')
    target = Document.create(path='path2')

    # A single text row, attached to the second document. Its own id must
    # differ from the document id for the final check to be meaningful.
    text_row = Document_Text.create(document=target, text='Fall 2012')
    assert text_row.id != text_row.document.id

    semester_row = ext_semester(target.id)
    assert semester_row.document == target
Ejemplo n.º 8
0
def queue_archive_url():

    """
    Enqueue one `ext_archive_url` job per document row.
    """

    documents = Document.select()

    # Each job receives only the document id.
    for row in query_bar(documents):
        config.rq.enqueue(ext_archive_url, row.id)
Ejemplo n.º 9
0
def queue_semester():

    """
    Enqueue one `ext_semester` job per document row.
    """

    documents = Document.select()

    # Each job receives only the document id.
    for row in query_bar(documents):
        config.rq.enqueue(ext_semester, row.id)
Ejemplo n.º 10
0
def queue_match_doc():

    """
    Enqueue one `match_doc` job per document row.
    """

    documents = Document.select()

    # Each job receives only the document id.
    for row in documents:
        config.rq.enqueue(match_doc, row.id)
Ejemplo n.º 11
0
def queue_text():

    """
    Enqueue one `ext_text` job per document row.
    """

    documents = Document.select()

    # Each job receives only the document id.
    for row in query_bar(documents):
        config.rq.enqueue(ext_text, row.id)
Ejemplo n.º 12
0
def test_es_doc(models):

    """
    Document_Text.es_doc should expose the document path as `_id`, the
    document id as `doc_id` and the row text as `body`.
    """

    document = Document.create(path='000/abc')
    text_row = Document_Text.create(document=document, text='text')

    expected = {
        '_id': '000/abc',
        'doc_id': document.id,
        'body': 'text',
    }

    # Check each field of the Elasticsearch document in turn.
    for key, value in expected.items():
        assert text_row.es_doc[key] == value
Ejemplo n.º 13
0
def test_institution_counts(models):

    """
    Document_Institution.institution_counts() should provide syllabus counts
    for each institution id.
    """

    i1 = Institution.create()
    i2 = Institution.create()
    i3 = Institution.create()

    d1 = Document.create(path='d1')
    d2 = Document.create(path='d2')
    d3 = Document.create(path='d3')
    d4 = Document.create(path='d4')
    d5 = Document.create(path='d5')
    d6 = Document.create(path='d6')

    # 1 document for institution 1.
    Document_Institution.create(institution=i1, document=d1)

    # 2 documents for institution 2.
    Document_Institution.create(institution=i2, document=d2)
    Document_Institution.create(institution=i2, document=d3)

    # 3 documents for institution 3.
    Document_Institution.create(institution=i3, document=d4)
    Document_Institution.create(institution=i3, document=d5)
    Document_Institution.create(institution=i3, document=d6)

    # Counts are keyed by institution id, not document id.
    assert Document_Institution.institution_counts() == {
        i1.id: 1,
        i2.id: 2,
        i3.id: 3,
    }
Ejemplo n.º 14
0
def test_format_counts(models):

    """
    Document_Format.format_counts() should return (format, count) pairs,
    with the most frequent format first.
    """

    d1 = Document.create(path='1')
    d2 = Document.create(path='2')
    d3 = Document.create(path='3')
    d4 = Document.create(path='4')
    d5 = Document.create(path='5')
    d6 = Document.create(path='6')

    # 1 doc with 'format1'.
    Document_Format.create(document=d1, format='format1')

    # 2 docs with 'format2'.
    Document_Format.create(document=d2, format='format2')
    Document_Format.create(document=d3, format='format2')

    # 3 docs with 'format3'.
    Document_Format.create(document=d4, format='format3')
    Document_Format.create(document=d5, format='format3')
    Document_Format.create(document=d6, format='format3')

    assert Document_Format.format_counts() == [
        ('format3', 3),
        ('format2', 2),
        ('format1', 1)
    ]
Ejemplo n.º 15
0
    def _doc(content='content'):

        """
        Create a document row for a new mock corpus file and run text
        extraction on it.

        Args:
            content (str): Passed to `mock_osp.add_file` as the file content.

        Returns:
            The created `Document` row.
        """

        # Write a file.
        path = mock_osp.add_file(content=content)
        syllabus = Syllabus(path)

        # Insert the document row.
        document = Document.create(path=syllabus.relative_path)

        # Extract text; the returned row is not needed here.
        ext_text(document.id)

        return document
Ejemplo n.º 16
0
def test_text_extraction_fails(models, mock_osp):

    """
    ext_text() should not write a text row for an empty file.
    """

    # Register an empty file as a document.
    empty_path = mock_osp.add_file(content="")
    row = Document.create(path=empty_path)

    ext_text(row.id)

    # The text table should still be empty.
    text_rows = Document_Text.select()
    assert text_rows.count() == 0
Ejemplo n.º 17
0
def ext_format(doc_id):

    """
    Record the libmagic file type of a document.

    Args:
        doc_id (int): The document id.

    Returns:
        The newly created `Document_Format` row.
    """

    document = Document.get(Document.id==doc_id)

    # The file type is read from the document's syllabus wrapper.
    file_type = document.syllabus.libmagic_file_type

    return Document_Format.create(format=file_type, document=document)
Ejemplo n.º 18
0
def test_read_format(models, mock_osp):

    """
    ext_format() should write the file format to the `document_format` table.
    """

    # Register a default mock file as a document.
    row = Document.create(path=mock_osp.add_file())

    ext_format(row.id)

    # The format row for this document should report plain text.
    format_row = Document_Format.get(Document_Format.document==row)
    assert format_row.format == 'text/plain'
Ejemplo n.º 19
0
    def _ext(ftype):

        # Register a file of the requested type as a document.
        file_path = mock_osp.add_file(ftype=ftype)
        row = Document.create(path=file_path)

        # Run the file-metadata date extraction.
        ext_file_metadata(row.id)

        # Return the first date row linked to the document, if any.
        dates = Document_Date_File_Metadata.select()
        linked = dates.where(Document_Date_File_Metadata.document==row)
        return linked.first()
Ejemplo n.º 20
0
def test_text_extraction_succeeds(models, mock_osp):

    """
    ext_text() should extract the text of a document and store it in the
    `document_text` table.
    """

    # Register a file with known content as a document.
    row = Document.create(path=mock_osp.add_file(content="text"))

    ext_text(row.id)

    # The stored text should match the file content.
    text_row = Document_Text.get(Document_Text.document == row)
    assert text_row.text == "text"
Ejemplo n.º 21
0
    def _ext(url):

        # Register a file whose log carries the given URL.
        file_path = mock_osp.add_file(log={'url': url})
        row = Document.create(path=file_path)

        # Run the archive-URL date extraction.
        ext_archive_url(row.id)

        # Return the first date row linked to the document, if any.
        dates = Document_Date_Archive_Url.select()
        linked = dates.where(Document_Date_Archive_Url.document==row)
        return linked.first()
Ejemplo n.º 22
0
    def _ext(content):

        # Register a file with the given content as a document.
        file_path = mock_osp.add_file(content=content)
        row = Document.create(path=file_path)

        # Text extraction runs first, then the semester extraction.
        for job in (ext_text, ext_semester):
            job(row.id)

        # Return the first semester row linked to the document, if any.
        dates = Document_Date_Semester.select()
        linked = dates.where(Document_Date_Semester.document==row)
        return linked.first()
Ejemplo n.º 23
0
def ext_text(doc_id):

    """
    Write the document as plain text.

    Args:
        doc_id (int): The document id.

    Returns:
        The new `Document_Text` row, or None when the syllabus text is
        empty or missing (no row is written in that case).
    """

    doc = Document.get(Document.id==doc_id)

    # Read the text once, so the value that is checked is the value that is
    # stored, and the text is not produced a second time.
    text = doc.syllabus.text

    if text:

        return Document_Text.create(
            text=text,
            document=doc
        )
Ejemplo n.º 24
0
def ext_file_metadata(doc_id):

    """
    Store the created date reported by the document's syllabus, if any.

    Args:
        doc_id (int): The document id.

    Returns:
        The new `Document_Date_File_Metadata` row, or None when the
        syllabus reports no created date.
    """

    document = Document.get(Document.id==doc_id)
    created = document.syllabus.created_date

    # No date available: nothing to write.
    if not created:
        return

    return Document_Date_File_Metadata.create(
        document=document,
        date=created
    )
Ejemplo n.º 25
0
def test_insert_new_documents(models, mock_osp):

    """
    Running Document.insert_documents() again after new files are added
    should register only the new files.
    """

    # Add 10 files to `000`, insert, then 10 files to `001`, insert again.
    # The running total should grow by 10 each time.
    for segment, expected_total in (('000', 10), ('001', 20)):

        for i in range(10):
            mock_osp.add_file(segment=segment, name=segment+'-'+str(i))

        Document.insert_documents()
        assert Document.select().count() == expected_total