def test_get_duplicate(testdb):
    """Check that get_duplicate matches a near-identical paper at another URL.

    A first document is built and update_db() is called on it.  A second
    document with a different URL, the same text behind a short junk prefix,
    slightly different word/page counts and a title differing only in
    capitalisation is then handed to scraper.get_duplicate(); the result must
    carry the first document's doc_id.

    The ``testdb`` fixture is defined elsewhere (presumably it provides the
    test database -- confirm against conftest).
    """
    driver_url = 'http://umsu.de/papers/driver-2011.pdf'
    attitudes_path = os.path.join(testdir, 'attitudes.txt')

    # Reference document that goes through update_db().
    stored = Doc(url=driver_url)
    stored.link = Link(url=driver_url)
    stored.content = readfile(attitudes_path)
    stored.numwords = 13940
    stored.numpages = 26
    stored.authors = 'Wolfang Schwarz'
    stored.title = 'Lost memories and useless coins: Revisiting the absentminded driver'
    stored.update_db()

    # Candidate: same paper found at a publisher URL, with small differences.
    candidate = Doc(url='http://download.springer.com/static/pdf/307/art%253A10.1007%252Fs11229-015-0699-z.pdf')
    candidate.link = Link(url=candidate.url)
    candidate.content = 'abcdefghjik' + readfile(attitudes_path)
    candidate.numwords = 14130
    candidate.numpages = 29
    candidate.authors = 'Wolfang Schwarz'
    candidate.title = 'Lost memories and useless coins: revisiting the absentminded driver'

    match = scraper.get_duplicate(candidate)
    assert match.doc_id == stored.doc_id
def test_process_file():
    """Check that process_file sets the expected title for simple.pdf.

    The link context, the link anchor text and the source HTML all contain
    the same phrase; after scraper.process_file() the document's title must
    equal that phrase.
    """
    lorem = 'Lorem ipsum dolor sit amet'

    pdf_doc = Doc(filetype='pdf')
    pdf_doc.link = Link(url='foo')
    pdf_doc.link.context = lorem
    pdf_doc.link.anchortext = lorem
    pdf_doc.source = Source(url='foo', html='<b>Lorem ipsum dolor sit amet</b>')
    pdf_doc.tempfile = os.path.join(testdir, 'simple.pdf')

    scraper.process_file(pdf_doc)

    assert pdf_doc.title == lorem
def test_cv():
    """Check that a CV gets a paperfilter score below 0.4.

    The document uses the text of cv.txt and a link whose anchor text and
    context are both "CV".
    """
    cv_url = "http://umsu.de/papers/cv.pdf"

    cv_doc = Doc(url=cv_url)
    cv_doc.link = Link(url=cv_url)
    cv_doc.link.anchortext = "CV"
    cv_doc.link.context = "CV"
    cv_doc.content = readfile(os.path.join(testdir, "cv.txt"))
    cv_doc.numwords = 10200
    cv_doc.numpages = 22
    cv_doc.meta_confidence = 92

    score = paperfilter.evaluate(cv_doc)
    assert score < 0.4
def test_gooddoc_badlink():
    """Check that slide-like link text keeps the paperfilter score below 0.8.

    The document content comes from attitudes.txt, but the link's anchor
    text is "slides" and its context is "The slides for my talk".
    """
    paper_url = "http://umsu.de/papers/variations.pdf"

    paper = Doc(url=paper_url)
    paper.link = Link(url=paper_url)
    paper.link.anchortext = "slides"
    paper.link.context = "The slides for my talk"
    paper.content = readfile(os.path.join(testdir, "attitudes.txt"))
    paper.numwords = 10200
    paper.numpages = 22
    paper.meta_confidence = 92

    score = paperfilter.evaluate(paper)
    assert score < 0.8
def test_gooddoc():
    """Check that the attitudes.txt document scores above 0.98.

    The link's anchor text is "Download" and its context is "Foo bar";
    paperfilter.evaluate() must return more than 0.98 for this document.
    """
    paper_url = "http://umsu.de/papers/variations.pdf"

    paper = Doc(url=paper_url)
    paper.link = Link(url=paper_url)
    paper.link.anchortext = "Download"
    paper.link.context = "Foo bar"
    paper.content = readfile(os.path.join(testdir, "attitudes.txt"))
    paper.numwords = 10200
    paper.numpages = 22
    paper.meta_confidence = 92

    score = paperfilter.evaluate(paper)
    assert score > 0.98