def test_percentage_match(self, mock_match, mock_get): """ From the ratio docstring: Where T is the total number of elements in both sequences, and M is the number of matches, this is 2.0*M / T. --- text = '012345' len(6) line = '0123456789' len(10) 2.0 * 6 / 16 = 0.75 """ text = "012345" text_data = b'' with open(self.pdf_1500_words, 'rb') as fp: text_data = fp.read(-1) self.mock_response.content = text_data mock_get.return_value = self.mock_response mock_match.return_value = False ratio = textmatcher.match('http://someurl.com', text) mock_get.assert_called() self.assertEqual(ratio, 0.75)
def test_exact_match(self, mock_get): text = "This is a class for comparing sequences of lines of text, and producing human-readable differences" \ " or deltas. Differ uses SequenceMatcher both to compare sequences of lines, and to compare sequences" \ " of characters within similar (near-matching) lines." mock_get.return_value = self.mock_response ratio = textmatcher.match('http://someurl.com', text) mock_get.assert_called() self.assertEqual(ratio, 1.0)
def test_content_type_with_charset(self, mock_get): """ BUG: Some Content responses also have a charset with it that is separated by ';' FIX: splitting the Content-type on ';' and return the first item """ text = """hyperbolic""" self.mock_response.headers['Content-Type'] = 'text/html; charset=utf-8' mock_get.return_value = self.mock_response ratio = textmatcher.match('http://someurl.com', 'text') self.assertEqual(ratio, 1.0)
def test_exact_match_weird_format(self, mock_get): """ Same paragraph as above but with unnecessary enters, tabs and spaces added """ text = """ This is a class \t\t for comparing sequences of lines of text, and producing human-readable differences or deltas. Differ\n uses SequenceMatcher both to compare sequences of lines, and to compare sequences of characters within similar (near-matching) lines. """ mock_get.return_value = self.mock_response ratio = textmatcher.match('http://someurl.com', text) mock_get.assert_called() self.assertEqual(ratio, 1.0)
def test_no_match(self, mock_get): text = """ Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed commodo ex eget nibh posuere, non condimentum lectus scelerisque. Etiam ac viverra justo, sit amet placerat justo. Sed sed neque vitae velit egestas gravida eu vitae tellus. Aenean id gravida ligula. Interdum et malesuada fames ac ante ipsum primis in faucibus. Maecenas porttitor sit amet nibh a tincidunt. In dignissim turpis posuere, tincidunt ligula non, pellentesque magna. Nulla et tincidunt est. Etiam in lacus id magna laoreet suscipit sit amet fermentum eros. """ mock_get.return_value = self.mock_response ratio = textmatcher.match('http://someurl.com', text) mock_get.assert_called() self.assertEqual(ratio, 0.0)
def test_text_list_item_match(self, mock_get): """ This line is part of a list within the PDF(pdfminer-docs) """ text = "• exact : preserve the exact location of each individual character (a large and messy HTML)." text_data = b'' with open(self.pdf_pdfminder_doc, 'rb') as fp: text_data = fp.read(-1) self.mock_response.content = text_data mock_get.return_value = self.mock_response ratio = textmatcher.match('http://someurl.com', text) mock_get.assert_called() self.assertEqual(ratio, 1.0)
def test_text_different_format_match(self, mock_get): """ Within the PDF(pdfminer-docs) this text has a different styling format """ text = "-o filename" text_data = b'' with open(self.pdf_pdfminder_doc, 'rb') as fp: text_data = fp.read(-1) self.mock_response.content = text_data mock_get.return_value = self.mock_response ratio = textmatcher.match('http://someurl.com', text) mock_get.assert_called() self.assertEqual(ratio, 1.0)
def test_no_match(self, mock_get): text = """This text should not be in the PDF. The result will 0.0, unless.. this text is within the PDF-Document. """ text_data = b'' with open(self.pdf_1500_words, 'rb') as fp: text_data = fp.read(-1) self.mock_response.content = text_data mock_get.return_value = self.mock_response ratio = textmatcher.match('http://someurl.com', text) mock_get.assert_called() self.assertEqual(ratio, 0.0)
def test_text_with_block_match(self, mock_get): """ This part of text resides within a 'text' block""" text = """mkdir pdfminer\cmap python tools\conv_cmap.py -c B5=cp950 -c UniCNS-UTF8=utf-8 pdfminer\cmap ˓ → Adobe-CNS1 cmaprsrc\cid2code_Adobe_CNS1.txt""" text_data = b'' with open(self.pdf_pdfminder_doc, 'rb') as fp: text_data = fp.read(-1) self.mock_response.content = text_data mock_get.return_value = self.mock_response ratio = textmatcher.match('http://someurl.com', text) mock_get.assert_called() self.assertGreaterEqual(ratio, .89)
def test_perfect_match(self, mock_get): text = """Lorem ipsum dolor sit amet, consectetur adipiscing elit. Cras mollis luctus lacus, id tristique magna vehicula in. Fusce vel neque a metus malesuada scelerisque sit amet auctor nibh. In luctus viverra libero, malesuada cursus enim rhoncus a. Vivamus eu dictum augue, in dignissim elit. Phasellus rhoncus rhoncus cursus. Quisque elementum erat in tempus placerat. Morbi arcu tortor, sodales eget commodo eget, dictum in diam.""" text_data = b'' with open(self.pdf_1500_words, 'rb') as fp: text_data = fp.read(-1) self.mock_response.content = text_data mock_get.return_value = self.mock_response ratio = textmatcher.match('http://someurl.com', text) mock_get.assert_called() self.assertEqual(ratio, 1.0)
def test_content_type_with_charset(self, mock_get): """ BUG: Some Content responses also have a charset with it that is separated by ';' FIX: splitting the Content-type on ';' and return the first item """ text = """Lorem""" text_data = b'' with open(self.pdf_1500_words, 'rb') as fp: text_data = fp.read(-1) self.mock_response.content = text_data self.mock_response.headers['Content-Type'] = 'application/pdf; charset=utf-8' mock_get.return_value = self.mock_response ratio = textmatcher.match('http://someurl.com', text) self.assertEqual(ratio, 1.0)
def test_percentage_match(self, mock_exact, mock_get): """ From the ratio docstring: Where T is the total number of elements in both sequences, and M is the number of matches, this is 2.0*M / T. --- text = '012345' len(6) line = '0123456789' len(10) 2.0 * 6 / 16 = 0.75 """ text = "012345" mock_get.return_value = self.mock_response mock_exact.return_value = 0.0 ratio = textmatcher.match('http://someurl.com', text) mock_get.assert_called() mock_exact.assert_called() self.assertEqual(ratio, 0.75)
def test_first_line_of_multiple_paragraphs_match(self, mock_get): text = """ Donec consectetur sit amet turpis id suscipit. Cras nulla metus, egestas ut viverra sed, tempor vel neque. Proin suscipit, nunc in feugiat dignissim, lectus eros fringilla velit, sed semper ex purus id purus. Vestibulum accumsan dui sed sem convallis maximus. Phasellus et ante justo. Donec sapien urna, condimentum vel congue ut, finibus in leo. """ text_data = b'' with open(self.pdf_1500_words, 'rb') as fp: text_data = fp.read(-1) self.mock_response.content = text_data mock_get.return_value = self.mock_response ratio = textmatcher.match('http://someurl.com', text) mock_get.assert_called() self.assertGreaterEqual(ratio, 0.95) self.assertLessEqual(ratio, 1.00)