def test_parseSimplePdf(self):
    """Check that the simple-text sample PDF yields the expected document.

    Runs ``PdfProcessor`` on ``samples/pdf/01_simple_text.pdf`` and compares
    the content and the formatting of the resulting document with the stored
    expected document. Every path is resolved against the current working
    directory, so the test must be started from the directory that contains
    ``samples``.
    """
    samplesDir = os.path.join(os.getcwd(), 'samples')
    samplePdfFile = os.path.join(samplesDir, 'pdf', '01_simple_text.pdf')
    # NOTE(review): the expected outcome is read from the 'docx' folder even
    # though the input is a PDF -- presumably a shared fixture; confirm.
    expectedPath = os.path.join(
        samplesDir, 'expected outcome', 'docx', 'test_01')

    document = PdfProcessor(samplePdfFile).document()
    expectedDocument = Document().initWithFile(expectedPath)

    # assertEqual replaces the deprecated assertEquals alias, which no longer
    # exists on TestCase from Python 3.12 onwards.
    self.assertEqual(expectedDocument.content(), document.content())
    self.assertEqual(expectedDocument.formatting(), document.formatting())
def test_textWithLineBlocks(self):
    """Check that the line-blocks sample DOCX yields the expected document.

    Runs ``DocxProcessor`` on ``samples/docx/08_line_blocks.docx`` and compares
    the content and the formatting of the resulting document with the stored
    expected document ``test_08``. Every path is resolved against the current
    working directory, so the test must be started from the directory that
    contains ``samples``.
    """
    samplesDir = os.path.join(os.getcwd(), 'samples')
    sampleDocxFile = os.path.join(samplesDir, 'docx', '08_line_blocks.docx')
    expectedPath = os.path.join(
        samplesDir, 'expected outcome', 'docx', 'test_08')

    document = DocxProcessor(sampleDocxFile).document()
    expectedDocument = Document().initWithFile(expectedPath)

    # assertEqual replaces the deprecated assertEquals alias, which no longer
    # exists on TestCase from Python 3.12 onwards.
    self.assertEqual(expectedDocument.content(), document.content())
    self.assertEqual(expectedDocument.formatting(), document.formatting())
def test_textWithWeirdFormatting(self):
    """Check that the weird-formatting sample DOCX yields the expected document.

    Runs ``DocxProcessor`` on ``samples/docx/11_weird_formatting.docx`` and
    compares the content and the formatting of the resulting document with the
    stored expected document ``test_11``. Every path is resolved against the
    current working directory, so the test must be started from the directory
    that contains ``samples``.
    """
    samplesDir = os.path.join(os.getcwd(), 'samples')
    sampleDocxFile = os.path.join(
        samplesDir, 'docx', '11_weird_formatting.docx')
    expectedPath = os.path.join(
        samplesDir, 'expected outcome', 'docx', 'test_11')

    document = DocxProcessor(sampleDocxFile).document()
    expectedDocument = Document().initWithFile(expectedPath)

    # assertEqual replaces the deprecated assertEquals alias, which no longer
    # exists on TestCase from Python 3.12 onwards.
    self.assertEqual(expectedDocument.content(), document.content())
    self.assertEqual(expectedDocument.formatting(), document.formatting())
def test_sampleDocumentProcessing(self):
    """Check that the sample Substance JSON yields the expected document.

    Loads ``samples/substance/01_sample.json`` through ``SubstanceProcessor``,
    prints the metadata of the produced document, and compares its content and
    metadata with the stored expected document ``substance_01``. Paths are
    resolved against the current working directory.
    """
    samplesDir = os.path.join(os.getcwd(), 'samples')
    sampleSubstanceFile = os.path.join(
        samplesDir, 'substance', '01_sample.json')
    expectedPath = os.path.join(
        samplesDir, 'expected outcome', 'substance_01')

    processor = SubstanceProcessor().initWithFile(sampleSubstanceFile)
    document = processor.document()
    expectedDocument = Document().initWithFile(expectedPath)

    # Diagnostic output emitted before the comparisons run.
    print("DOCUMENT METADATA: " + str(document.metadata()))

    self.assertEqual(expectedDocument.content(), document.content())
    self.assertEqual(expectedDocument.metadata(), document.metadata())
def test_parseSimplePdf(self):
    """Check that the simple-text sample PDF yields the expected document.

    Runs ``PdfProcessor`` on ``samples/pdf/01_simple_text.pdf`` and compares
    the content and the formatting of the resulting document with the stored
    expected document. Every path is resolved against the current working
    directory, so the test must be started from the directory that contains
    ``samples``.
    """
    samplesDir = os.path.join(os.getcwd(), 'samples')
    samplePdfFile = os.path.join(samplesDir, 'pdf', '01_simple_text.pdf')
    # NOTE(review): the expected outcome is read from the 'docx' folder even
    # though the input is a PDF -- presumably a shared fixture; confirm.
    expectedPath = os.path.join(
        samplesDir, 'expected outcome', 'docx', 'test_01')

    document = PdfProcessor(samplePdfFile).document()
    expectedDocument = Document().initWithFile(expectedPath)

    # assertEqual replaces the deprecated assertEquals alias, which no longer
    # exists on TestCase from Python 3.12 onwards.
    self.assertEqual(expectedDocument.content(), document.content())
    self.assertEqual(expectedDocument.formatting(), document.formatting())
def test_sampleDocumentProcessing(self):
    """Check that the sample Substance JSON yields the expected document.

    Loads ``samples/substance/01_sample.json`` through ``SubstanceProcessor``,
    prints the metadata of the produced document, and compares its content and
    metadata with the stored expected document ``substance_01``. Paths are
    resolved against the current working directory.
    """
    samplesDir = os.path.join(os.getcwd(), 'samples')
    sampleSubstanceFile = os.path.join(
        samplesDir, 'substance', '01_sample.json')
    expectedPath = os.path.join(
        samplesDir, 'expected outcome', 'substance_01')

    processor = SubstanceProcessor().initWithFile(sampleSubstanceFile)
    document = processor.document()
    expectedDocument = Document().initWithFile(expectedPath)

    # Diagnostic output emitted before the comparisons run.
    print("DOCUMENT METADATA: " + str(document.metadata()))

    self.assertEqual(expectedDocument.content(), document.content())
    self.assertEqual(expectedDocument.metadata(), document.metadata())
def corpus_line(file_path: str, encoding='utf-8', zip_type=None):
    """Yield ``Document`` objects built from a corpus file and its companions.

    The content is read with ``parse_file`` from ``file_path`` (plus
    ``'.' + zip_type`` when ``zip_type`` is truthy). Class labels and document
    names come from the optional companion files ``.labels`` and ``.names``,
    looked up via ``os.path.join(file_path, ...)`` with the same suffix; when
    a companion file does not exist, ``'[none]'`` is used for every document.

    Args:
        file_path: Path of the corpus, without the compression suffix.
        encoding: Passed through to ``parse_file``.
        zip_type: Compression suffix without the leading dot, or a falsy
            value for none; also passed through to ``parse_file``.

    Yields:
        ``Document(name, doc_id, class_label)`` with ``content`` assigned,
        where ``doc_id`` counts up from 0. Iteration ends as soon as any of
        the label, name, or content streams is exhausted.
    """
    extension = '.' + zip_type if zip_type else ''

    def companion_stream(basename):
        # Optional side file: fall back to an endless '[none]' stream if absent.
        # NOTE(review): the side file is joined *under* file_path, while the
        # content file is file_path itself plus the suffix -- confirm layout.
        candidate = os.path.join(file_path, basename + extension)
        if not os.path.isfile(candidate):
            return itertools.repeat('[none]')
        return parse_file(candidate, encoding=encoding, zip_type=zip_type)

    labels = companion_stream('.labels')
    names = companion_stream('.names')
    contents = parse_file(
        file_path + extension, encoding=encoding, zip_type=zip_type)

    rows = zip(labels, names, contents)
    for doc_id, (class_label, name, content) in enumerate(rows):
        doc = Document(name, doc_id, class_label)
        doc.content = content
        yield doc