def test_invalid_index(mock_osp):
    """
    Syllabus#metadata() should give back None when the requested offset is
    out of bounds.
    """

    doc = Syllabus(mock_osp.add_file())
    value = doc.metadata(10)

    assert value is None
def test_valid_index(mock_osp):
    """
    Syllabus#metadata() should produce a value (anything other than None)
    when the requested offset is in bounds.
    """

    doc = Syllabus(mock_osp.add_file())
    value = doc.metadata(1)

    assert value is not None
def test_text(mock_osp):
    """
    Syllabus#created_date should be None when the file isn't a PDF or DOCX.
    Plain-text and HTML files are both checked.
    """

    path = mock_osp.add_file(ftype='plain')
    plain = Syllabus(path)

    # Identity comparison: None is a singleton, and `==` could be satisfied
    # by any object with a permissive __eq__.
    assert plain.created_date is None

    path = mock_osp.add_file(ftype='html')
    html = Syllabus(path)

    assert html.created_date is None
# Example no. 4
# 0
    def syllabus(self):
        """
        Build an OSP Syllabus instance from this row's path.
        """

        wrapped = Syllabus.from_env(self.path)
        return wrapped
# Example no. 5
# 0
def test_domain(mock_osp):
    """
    Syllabus#domain should expose the domain parsed out of the logged URL.
    """

    log = {'url': 'http://test.edu'}
    doc = Syllabus(mock_osp.add_file(log=log))

    assert doc.domain == 'test.edu'
def test_html(mock_osp):
    """
    Syllabus#text should hold the text content of an HTML file, without the
    surrounding markup.
    """

    markup = '<p>text</p>'
    doc = Syllabus(mock_osp.add_file(content=markup, ftype='html'))

    assert doc.text == 'text'
def test_checksum(mock_osp):
    """
    Syllabus#checksum should echo the checksum recorded in the log.
    """

    expected = '123'
    doc = Syllabus(mock_osp.add_file(log={'checksum': expected}))

    assert doc.checksum == expected
def test_file_type(mock_osp):
    """
    Syllabus#file_type should echo the format recorded in the log.
    """

    expected = 'text/plain'
    doc = Syllabus(mock_osp.add_file(log={'format': expected}))

    assert doc.file_type == expected
def test_log_exists(mock_osp):
    """
    Syllabus#log_exists should return True when a log is present.
    """

    path = mock_osp.add_file()
    syllabus = Syllabus(path)

    # Compare by identity so that only the boolean True satisfies the check;
    # `== True` would also accept values such as 1.
    assert syllabus.log_exists is True
def test_retrieved_date(mock_osp):
    """
    Syllabus#retrieved_date should echo the date recorded in the log.
    """

    expected = 'now'
    doc = Syllabus(mock_osp.add_file(log={'date': expected}))

    assert doc.retrieved_date == expected
def test_provenance(mock_osp):
    """
    Syllabus#provenance should echo the provenance recorded in the log.
    """

    expected = 'pytest'
    doc = Syllabus(mock_osp.add_file(log={'provenance': expected}))

    assert doc.provenance == expected
def test_plaintext(mock_osp):
    """
    Syllabus#text should hand back the contents of a plain text file.
    """

    body = 'text'
    doc = Syllabus(mock_osp.add_file(content=body, ftype='plain'))

    assert doc.text == 'text'
def test_pdf(mock_osp):
    """
    Syllabus#text should hold the text extracted from a PDF file
    (compared after stripping surrounding whitespace).
    """

    doc = Syllabus(mock_osp.add_file(content='text', ftype='pdf'))
    extracted = doc.text.strip()

    assert extracted == 'text'
def test_empty(mock_osp):
    """
    Syllabus#text should return None if the file is empty.
    """

    path = mock_osp.add_file(content='', ftype='plain')
    syllabus = Syllabus(path)

    # Identity comparison: an empty string or other falsy value must not be
    # mistaken for None.
    assert syllabus.text is None
def test_office(mock_osp):
    """
    Syllabus#text should hold the text extracted from an office (DOCX)
    file, compared after stripping surrounding whitespace.
    """

    doc = Syllabus(mock_osp.add_file(content='text', ftype='docx'))
    extracted = doc.text.strip()

    assert extracted == 'text'
# Example no. 16
# 0
def test_log_path(mock_osp):
    """
    Syllabus#log_path should be the file's own path with '.log' appended.
    """

    file_path = mock_osp.add_file()
    expected = file_path + '.log'

    assert Syllabus(file_path).log_path == expected
def test_url(mock_osp):
    """
    Syllabus#url should echo the URL recorded in the log.
    """

    expected = 'osp.org'
    doc = Syllabus(mock_osp.add_file(log={'url': expected}))

    assert doc.url == expected
# Example no. 18
# 0
def test_log_missing(mock_osp):
    """
    Syllabus#log should be an empty list once the .log file has been
    deleted.
    """

    file_path = mock_osp.add_file()
    doc = Syllabus(file_path)

    # Delete the companion log before reading it.
    log_file = file_path + '.log'
    os.remove(log_file)

    assert doc.log == []
# Example no. 19
# 0
def test_relative_path(mock_osp):
    """
    Syllabus#relative_path should be the corpus-relative path, i.e. the
    segment and the file name joined by a slash.
    """

    file_path = mock_osp.add_file(segment='segment', name='name')
    doc = Syllabus(file_path)

    assert doc.relative_path == 'segment/name'
# Example no. 20
# 0
def test_file_name(mock_osp):
    """
    Syllabus#file_name should be the base name of the file.
    """

    expected = 'name'
    doc = Syllabus(mock_osp.add_file(name=expected))

    assert doc.file_name == expected
def test_log_missing(mock_osp):
    """
    Syllabus#log_exists should return False when the log is absent.
    """

    path = mock_osp.add_file()
    syllabus = Syllabus(path)

    # Remove the companion .log file that was written alongside the mock
    # file.
    os.remove(path + '.log')

    # Compare by identity so that only the boolean False satisfies the
    # check; `== False` would also accept values such as 0.
    assert syllabus.log_exists is False
def test_segment_name(mock_osp):
    """
    Syllabus#segment_name should be the name of the segment directory.
    """

    expected = '001'
    doc = Syllabus(mock_osp.add_file(segment=expected))

    assert doc.segment_name == expected
# Example no. 23
# 0
    def syllabi(self):
        """
        Lazily wrap every path from file_paths() in a Syllabus.

        Yields:
            Syllabus: One instance per file path, in the order that
                file_paths() produces them.
        """

        for file_path in self.file_paths():
            wrapped = Syllabus(file_path)
            yield wrapped
def test_collapse_whitespace(mock_osp):
    """
    Syllabus#unbroken_text should reduce each multi-character run of
    whitespace to a single space.
    """

    raw = 'w1 \n\t w2 \n\t w3'
    doc = Syllabus(mock_osp.add_file(content=raw, ftype='plain'))

    assert doc.unbroken_text == 'w1 w2 w3'
def test_trim(mock_osp):
    """
    Syllabus#unbroken_text should drop whitespace at the beginning and end
    of the text.
    """

    raw = ' \n\t word \n\t '
    doc = Syllabus(mock_osp.add_file(content=raw, ftype='plain'))

    assert doc.unbroken_text == 'word'
def test_pdf(mock_osp):
    """
    Syllabus#created_date should extract the created date from PDFs.
    """

    # Reference timestamp, taken just before the mock PDF is written.
    now = datetime.now()

    path = mock_osp.add_file(ftype='pdf')
    syllabus = Syllabus(path)

    # Created within a second or so of now. total_seconds() measures the
    # whole interval; the `.seconds` attribute ignores the days component,
    # so a date off by a whole number of days would have passed. `< 2`
    # keeps the tolerance the old `.seconds <= 1` check allowed.
    assert abs(syllabus.created_date - now).total_seconds() < 2
    def _doc(*args, **kwargs):
        """
        Write a mock file, insert a Document row for it, and run text
        extraction on that row.

        Args:
            *args: Positional arguments forwarded to mock_osp.add_file().
            **kwargs: Keyword arguments forwarded to mock_osp.add_file().

        Returns:
            Document: The newly created row.
        """

        # Write a file.
        path = mock_osp.add_file(*args, **kwargs)
        syllabus = Syllabus(path)

        # Insert the document row.
        document = Document.create(path=syllabus.relative_path)

        # Extract text. The return value was never used, so it is no longer
        # bound to a local; presumably the call matters for its side
        # effects -- confirm against ext_text.
        ext_text(document.id)

        return document
# Example no. 28
# 0
def test_log_exists(mock_osp):
    """
    Syllabus#log should expose the log file's contents as a list of lines.
    """

    entries = dict(
        url='url',
        provenance='provenance',
        date='date',
        checksum='checksum',
        format='format'
    )

    doc = Syllabus(mock_osp.add_file(log=entries))
    expected = ['url', 'provenance', 'date', 'checksum', 'format']

    assert doc.log == expected
# Example no. 29
# 0
    def syllabus(self):
        """
        Return the OSP Syllabus instance that corresponds to this row,
        built from the row's path.
        """

        row_path = self.path
        return Syllabus.from_env(row_path)