def test_invalid_index(mock_osp):
    """
    Syllabus#metadata() should give back None when the requested offset is
    out of bounds.
    """
    syllabus = Syllabus(mock_osp.add_file())

    result = syllabus.metadata(10)
    assert result is None
def test_valid_index(mock_osp):
    """
    Syllabus#metadata() should give back a value when the requested offset
    is in bounds.
    """
    syllabus = Syllabus(mock_osp.add_file())

    value = syllabus.metadata(1)
    assert value is not None
def test_text(mock_osp):
    """
    Syllabus#created_date should be None when the file isn't a PDF or DOCX.

    Exercised here with the 'plain' and 'html' file types.
    """
    for ftype in ('plain', 'html'):
        path = mock_osp.add_file(ftype=ftype)
        syllabus = Syllabus(path)

        # Identity check against the None singleton: an object with a
        # permissive __eq__ must not be able to satisfy this assertion.
        assert syllabus.created_date is None
def syllabus(self):
    """
    Wrap this row as an OSP Syllabus instance.

    Returns:
        The result of Syllabus.from_env() called with this row's path.
    """
    wrapped = Syllabus.from_env(self.path)
    return wrapped
def test_domain(mock_osp):
    """
    Syllabus#domain should expose the domain parsed out of the logged URL.
    """
    path = mock_osp.add_file(log={'url': 'http://test.edu'})

    domain = Syllabus(path).domain
    assert domain == 'test.edu'
def test_html(mock_osp):
    """
    Syllabus#text should contain the text extracted from an HTML file.
    """
    markup = '<p>text</p>'
    path = mock_osp.add_file(content=markup, ftype='html')

    extracted = Syllabus(path).text
    assert extracted == 'text'
def test_checksum(mock_osp):
    """
    Syllabus#checksum should give back the checksum recorded in the log.
    """
    path = mock_osp.add_file(log=dict(checksum='123'))

    checksum = Syllabus(path).checksum
    assert checksum == '123'
def test_file_type(mock_osp):
    """
    Syllabus#file_type should give back the file format recorded in the log.
    """
    path = mock_osp.add_file(log=dict(format='text/plain'))

    file_type = Syllabus(path).file_type
    assert file_type == 'text/plain'
def test_log_exists(mock_osp):
    """
    Syllabus#log_exists should return True when a log is present.
    """
    path = mock_osp.add_file()
    syllabus = Syllabus(path)

    # Compare by identity rather than `== True`, so that a merely truthy
    # value (e.g. 1) is not accepted in place of the boolean True.
    assert syllabus.log_exists is True
def test_retrieved_date(mock_osp):
    """
    Syllabus#retrieved_date should give back the date recorded in the log.
    """
    path = mock_osp.add_file(log=dict(date='now'))

    retrieved = Syllabus(path).retrieved_date
    assert retrieved == 'now'
def test_provenance(mock_osp):
    """
    Syllabus#provenance should give back the origin recorded in the log.
    """
    path = mock_osp.add_file(log=dict(provenance='pytest'))

    origin = Syllabus(path).provenance
    assert origin == 'pytest'
def test_plaintext(mock_osp):
    """
    Syllabus#text should contain the content of a vanilla text file.
    """
    path = mock_osp.add_file(content='text', ftype='plain')

    extracted = Syllabus(path).text
    assert extracted == 'text'
def test_pdf(mock_osp):
    """
    Syllabus#text should contain the text extracted from a PDF file.
    """
    path = mock_osp.add_file(content='text', ftype='pdf')

    # Surrounding whitespace in the extracted text is ignored.
    extracted = Syllabus(path).text.strip()
    assert extracted == 'text'
def test_empty(mock_osp):
    """
    Syllabus#text should return None if the file is empty.
    """
    path = mock_osp.add_file(content='', ftype='plain')
    syllabus = Syllabus(path)

    # Identity check against the None singleton: an empty string or any
    # object with a permissive __eq__ must not satisfy this assertion.
    assert syllabus.text is None
def test_office(mock_osp):
    """
    Syllabus#text should contain the text extracted from an office (DOCX)
    file.
    """
    path = mock_osp.add_file(content='text', ftype='docx')

    # Surrounding whitespace in the extracted text is ignored.
    extracted = Syllabus(path).text.strip()
    assert extracted == 'text'
def test_log_path(mock_osp):
    """
    Syllabus#log_path should be the file path with a `.log` suffix appended.
    """
    path = mock_osp.add_file()
    expected = path + '.log'

    log_path = Syllabus(path).log_path
    assert log_path == expected
def test_url(mock_osp):
    """
    Syllabus#url should give back the URL recorded in the log.
    """
    path = mock_osp.add_file(log=dict(url='osp.org'))

    url = Syllabus(path).url
    assert url == 'osp.org'
def test_log_missing(mock_osp):
    """
    Syllabus#log should be an empty list once the log file has been removed.
    """
    path = mock_osp.add_file()
    syllabus = Syllabus(path)

    # Delete the companion log after the instance has been created.
    log_file = path + '.log'
    os.remove(log_file)

    lines = syllabus.log
    assert lines == []
def test_relative_path(mock_osp):
    """
    Syllabus#relative_path should give back the corpus-relative path, made
    of the segment and the file name.
    """
    path = mock_osp.add_file(segment='segment', name='name')

    relative = Syllabus(path).relative_path
    assert relative == 'segment/name'
def test_file_name(mock_osp):
    """
    Syllabus#file_name should give back the base file name.
    """
    path = mock_osp.add_file(name='name')

    base_name = Syllabus(path).file_name
    assert base_name == 'name'
def test_log_missing(mock_osp):
    """
    Syllabus#log_exists should return False when the log is absent.
    """
    path = mock_osp.add_file()
    syllabus = Syllabus(path)

    # Delete the companion log after the instance has been created.
    os.remove(path + '.log')

    # Compare by identity rather than `== False`, so that a merely falsy
    # value (e.g. 0 or None) is not accepted in place of the boolean False.
    assert syllabus.log_exists is False
def test_segment_name(mock_osp):
    """
    Syllabus#segment_name should give back the name of the segment
    directory.
    """
    path = mock_osp.add_file(segment='001')

    segment = Syllabus(path).segment_name
    assert segment == '001'
def syllabi(self):
    """
    Lazily produce a Syllabus instance for every file path.

    Yields:
        Syllabus: An instance built from the next path returned by
        self.file_paths().
    """
    paths = self.file_paths()

    for file_path in paths:
        instance = Syllabus(file_path)
        yield instance
def test_collapse_whitespace(mock_osp):
    """
    Syllabus#unbroken_text should collapse every run of 2+ whitespace
    characters.
    """
    raw = 'w1 \n\t w2 \n\t w3'
    path = mock_osp.add_file(content=raw, ftype='plain')

    unbroken = Syllabus(path).unbroken_text
    assert unbroken == 'w1 w2 w3'
def test_trim(mock_osp):
    """
    Syllabus#unbroken_text should strip whitespace from the beginning and
    the end.
    """
    raw = ' \n\t word \n\t '
    path = mock_osp.add_file(content=raw, ftype='plain')

    unbroken = Syllabus(path).unbroken_text
    assert unbroken == 'word'
def test_pdf(mock_osp):
    """
    Syllabus#created_date should extract the created date from PDFs.

    The mock PDF is generated during the test, so its created date is
    expected to be close to the time captured at the start of the test.
    """
    now = datetime.now()

    path = mock_osp.add_file(ftype='pdf')
    syllabus = Syllabus(path)

    delta = abs(syllabus.created_date - now)

    # `timedelta.seconds` only holds the seconds *component* (0..86399) and
    # silently drops whole days, so a date that is off by N days would have
    # passed. `total_seconds()` covers the full duration; `< 2` keeps the
    # tolerance the old `.seconds <= 1` check had for same-day values.
    assert delta.total_seconds() < 2
def _doc(*args, **kwargs):
    """
    Write a mock file, insert its Document row, and run text extraction.

    `mock_osp` is not a parameter; it is resolved from the surrounding
    scope.

    Args:
        *args: Positional arguments forwarded to mock_osp.add_file().
        **kwargs: Keyword arguments forwarded to mock_osp.add_file().

    Returns:
        Document: The newly created document row.
    """
    # Write a file.
    path = mock_osp.add_file(*args, **kwargs)
    syllabus = Syllabus(path)

    # Insert the document row.
    document = Document.create(path=syllabus.relative_path)

    # Extract text. The return value was never used, so it is no longer
    # bound to a local; the call itself is kept.
    # NOTE(review): presumably ext_text stores its result somewhere as a
    # side effect - confirm against its definition.
    ext_text(document.id)

    return document
def test_log_exists(mock_osp):
    """
    Syllabus#log should give back the log file split into its lines.
    """
    expected = ['url', 'provenance', 'date', 'checksum', 'format']

    # Each log field is written with its own name as the value.
    log = {field: field for field in expected}

    path = mock_osp.add_file(log=log)

    lines = Syllabus(path).log
    assert lines == expected