Ejemplo n.º 1
0
def test_es_insert(models, config, corpus_index):

    """
    Document_Text.es_insert() should index all text rows in Elasticsearch.
    """

    # Index 10 documents (path and text are both the stringified counter).
    for i in range(10):
        doc = Document.create(path=str(i))
        Document_Text.create(document=doc, text=str(i))

    Document_Text.es_insert()

    # Should insert 10 docs.
    assert Document_Text.es_count() == 10

    # For each text row:
    for t in Document_Text.select():

        # A document should exist, looked up by the document path.
        doc = config.es.get('osp', t.document.path)

        # Should index the doc ID and the row's text. Compare the body
        # against the text column itself rather than the path, which only
        # happens to hold the same value in this fixture data.
        assert doc['_source']['doc_id'] == t.document.id
        assert doc['_source']['body'] == t.text
Ejemplo n.º 2
0
def test_link_with_document(models, mock_osp):

    """
    The row returned by ext_semester() should point at the document whose id
    was handed to the job.
    """

    # Two document rows; the first one only exists so that the text row id
    # and the target document id end up different.
    Document.create(path='path1')
    target = Document.create(path='path2')

    # A single text row, attached to the second document.
    text_row = Document_Text.create(document=target, text='Fall 2012')
    assert text_row.id != text_row.document.id

    semester_row = ext_semester(target.id)
    assert semester_row.document == target
Ejemplo n.º 3
0
def test_es_doc(models):

    """
    Document_Text.es_doc should expose the path as `_id`, the document id as
    `doc_id`, and the text as `body`.
    """

    document = Document.create(path='000/abc')
    row = Document_Text.create(document=document, text='text')

    # Expected key -> value pairs, checked in the original order.
    expected = [
        ('_id', '000/abc'),
        ('doc_id', document.id),
        ('body', 'text'),
    ]

    for key, value in expected:
        assert row.es_doc[key] == value
Ejemplo n.º 4
0
def test_format_counts(models):

    """
    Document_Format.format_counts() should return (format, count) pairs,
    listed from the most common format to the least common.
    """

    d1 = Document.create(path='1')
    d2 = Document.create(path='2')
    d3 = Document.create(path='3')
    d4 = Document.create(path='4')
    d5 = Document.create(path='5')
    d6 = Document.create(path='6')

    # 1 doc with 'format1'.
    Document_Format.create(document=d1, format='format1')

    # 2 docs with 'format2'.
    Document_Format.create(document=d2, format='format2')
    Document_Format.create(document=d3, format='format2')

    # 3 docs with 'format3'.
    Document_Format.create(document=d4, format='format3')
    Document_Format.create(document=d5, format='format3')
    Document_Format.create(document=d6, format='format3')

    assert Document_Format.format_counts() == [
        ('format3', 3),
        ('format2', 2),
        ('format1', 1),
    ]
Ejemplo n.º 5
0
def test_institution_counts(models):

    """
    Document_Institution.institution_counts() should provide syllabus counts
    for each institution id.
    """

    i1 = Institution.create()
    i2 = Institution.create()
    i3 = Institution.create()

    d1 = Document.create(path='d1')
    d2 = Document.create(path='d2')
    d3 = Document.create(path='d3')
    d4 = Document.create(path='d4')
    d5 = Document.create(path='d5')
    d6 = Document.create(path='d6')

    # 1 document for institution 1.
    Document_Institution.create(institution=i1, document=d1)

    # 2 documents for institution 2.
    Document_Institution.create(institution=i2, document=d2)
    Document_Institution.create(institution=i2, document=d3)

    # 3 documents for institution 3.
    Document_Institution.create(institution=i3, document=d4)
    Document_Institution.create(institution=i3, document=d5)
    Document_Institution.create(institution=i3, document=d6)

    # The counts are keyed by institution id. Keying the expectation on
    # document ids would only pass while both tables hand out the same
    # autoincrement values.
    assert Document_Institution.institution_counts() == {
        i1.id: 1,
        i2.id: 2,
        i3.id: 3,
    }
Ejemplo n.º 6
0
    def _doc(content='content'):

        """
        Write a file with `content`, insert a document row for it, and run
        text extraction on that document.

        Returns the new Document row.
        """

        # Write a file.
        path = mock_osp.add_file(content=content)
        syllabus = Syllabus(path)

        # Insert the document row.
        document = Document.create(path=syllabus.relative_path)

        # Extract text; the job's return value is not needed here.
        ext_text(document.id)

        return document
Ejemplo n.º 7
0
def test_text_extraction_fails(models, mock_osp):

    """
    ext_text() should leave `document_text` empty when the file has no
    content.
    """

    # Register a document backed by an empty file.
    empty_path = mock_osp.add_file(content="")
    doc = Document.create(path=empty_path)

    ext_text(doc.id)

    # No text row should have been written.
    row_count = Document_Text.select().count()
    assert row_count == 0
Ejemplo n.º 8
0
def test_read_format(models, mock_osp):

    """
    ext_format() should store the file format in a Document_Format row for
    the document.
    """

    # Register a document backed by a default mock file.
    doc = Document.create(path=mock_osp.add_file())

    ext_format(doc.id)

    # Fetch the format row that belongs to the document.
    format_row = Document_Format.get(Document_Format.document == doc)
    assert format_row.format == 'text/plain'
Ejemplo n.º 9
0
    def _ext(ftype):

        """
        Add a file of type `ftype`, run ext_file_metadata() on its document,
        and return the first matching Document_Date_File_Metadata row.
        """

        # Register a document backed by a file of the requested type.
        doc = Document.create(path=mock_osp.add_file(ftype=ftype))

        # Run the file-metadata date extraction.
        ext_file_metadata(doc.id)

        # Look up the row written for this document.
        model = Document_Date_File_Metadata
        query = model.select().where(model.document == doc)
        return query.first()
Ejemplo n.º 10
0
    def _ext(url):

        """
        Add a file whose log carries `url`, run ext_archive_url() on its
        document, and return the first matching Document_Date_Archive_Url row.
        """

        # Register a document whose log records the given URL.
        doc = Document.create(path=mock_osp.add_file(log={'url': url}))

        # Run the archive-URL date extraction.
        ext_archive_url(doc.id)

        # Look up the row written for this document.
        model = Document_Date_Archive_Url
        query = model.select().where(model.document == doc)
        return query.first()
Ejemplo n.º 11
0
def test_text_extraction_succeeds(models, mock_osp):

    """
    ext_text() should write the file's text into a Document_Text row for the
    document.
    """

    # Register a document backed by a file containing "text".
    file_path = mock_osp.add_file(content="text")
    doc = Document.create(path=file_path)

    ext_text(doc.id)

    # Fetch the text row that belongs to the document.
    text_row = Document_Text.get(Document_Text.document == doc)
    assert text_row.text == "text"
Ejemplo n.º 12
0
    def _ext(content):

        """
        Add a file with `content`, run ext_text() and then ext_semester() on
        its document, and return the first matching Document_Date_Semester
        row.
        """

        # Register a document backed by a file with the given content.
        doc = Document.create(path=mock_osp.add_file(content=content))

        # Text extraction runs first, then the semester extraction.
        ext_text(doc.id)
        ext_semester(doc.id)

        # Look up the row written for this document.
        model = Document_Date_Semester
        query = model.select().where(model.document == doc)
        return query.first()