Exemple #1
0
def test_get_duplicate(testdb):
    """scraper.get_duplicate() must match a near-copy to the earlier document.

    The first document is populated and then passed through update_db().
    The second one lives at a different URL, has slightly different word and
    page counts, a title that differs only in capitalisation, and the same
    text with a short junk prefix. The lookup is expected to return an
    object carrying the first document's doc_id.
    """
    text_path = os.path.join(testdir, 'attitudes.txt')
    first_url = 'http://umsu.de/papers/driver-2011.pdf'

    stored = Doc(url=first_url)
    stored.link = Link(url=first_url)
    stored.content = readfile(text_path)
    stored.numwords = 13940
    stored.numpages = 26
    stored.authors = 'Wolfang Schwarz'
    stored.title = 'Lost memories and useless coins: Revisiting the absentminded driver'
    stored.update_db()

    candidate = Doc(url='http://download.springer.com/static/pdf/307/art%253A10.1007%252Fs11229-015-0699-z.pdf')
    candidate.link = Link(url=candidate.url)
    # Same text, shifted by a few leading characters.
    candidate.content = 'abcdefghjik' + readfile(text_path)
    candidate.numwords = 14130
    candidate.numpages = 29
    candidate.authors = 'Wolfang Schwarz'
    candidate.title = 'Lost memories and useless coins: revisiting the absentminded driver'

    match = scraper.get_duplicate(candidate)
    assert match.doc_id == stored.doc_id
Exemple #2
0
def test_process_file():
    """scraper.process_file() should set the title of the sample PDF document.

    The link's anchor text and context, and the bold text in the source
    page's HTML, all carry the same phrase; after processing 'simple.pdf'
    the document's title must equal that phrase.
    """
    expected_title = 'Lorem ipsum dolor sit amet'
    pdf_path = os.path.join(testdir, 'simple.pdf')

    pdf_doc = Doc(filetype='pdf')
    pdf_doc.link = Link(url='foo')
    pdf_doc.link.context = expected_title
    pdf_doc.link.anchortext = expected_title
    pdf_doc.source = Source(url='foo', html='<b>Lorem ipsum dolor sit amet</b>')
    pdf_doc.tempfile = pdf_path

    scraper.process_file(pdf_doc)

    assert pdf_doc.title == expected_title
Exemple #3
0
def test_cv():
    """A document built from 'cv.txt' and linked as "CV" must score below 0.4.

    The score is whatever paperfilter.evaluate() returns for the document.
    """
    cv_url = "http://umsu.de/papers/cv.pdf"
    cv_text_path = os.path.join(testdir, "cv.txt")

    cv_doc = Doc(url=cv_url)
    cv_doc.link = Link(url=cv_url)
    cv_doc.link.anchortext = "CV"
    cv_doc.link.context = "CV"
    cv_doc.content = readfile(cv_text_path)
    cv_doc.numwords = 10200
    cv_doc.numpages = 22
    cv_doc.meta_confidence = 92

    score = paperfilter.evaluate(cv_doc)
    assert score < 0.4
Exemple #4
0
def test_gooddoc_badlink():
    """Paper text reached through a "slides" link must score below 0.8.

    The content comes from 'attitudes.txt', but the link's anchor text and
    context describe talk slides; paperfilter.evaluate() is expected to
    return less than 0.8 for this combination.
    """
    paper_url = "http://umsu.de/papers/variations.pdf"
    text_path = os.path.join(testdir, "attitudes.txt")

    paper = Doc(url=paper_url)
    paper.link = Link(url=paper_url)
    paper.link.anchortext = "slides"
    paper.link.context = "The slides for my talk"
    paper.content = readfile(text_path)
    paper.numwords = 10200
    paper.numpages = 22
    paper.meta_confidence = 92

    score = paperfilter.evaluate(paper)
    assert score < 0.8
Exemple #5
0
def test_gooddoc():
    """Paper text behind a neutral "Download" link must score above 0.98.

    The content comes from 'attitudes.txt' and the link's anchor text and
    context are uninformative; paperfilter.evaluate() is expected to return
    more than 0.98.
    """
    paper_url = "http://umsu.de/papers/variations.pdf"
    text_path = os.path.join(testdir, "attitudes.txt")

    paper = Doc(url=paper_url)
    paper.link = Link(url=paper_url)
    paper.link.anchortext = "Download"
    paper.link.context = "Foo bar"
    paper.content = readfile(text_path)
    paper.numwords = 10200
    paper.numpages = 22
    paper.meta_confidence = 92

    score = paperfilter.evaluate(paper)
    assert score > 0.98