def test_match(add_doc, add_institution):
    """
    A document whose URL matches an institution's domain should get a
    link row pointing at that institution.
    """
    document = add_doc(log=dict(url='http://yale.edu/syllabus.pdf'))

    yale = add_institution(name='Yale University', domain='yale.edu')

    # Second institution on a different domain than the document.
    add_institution(name='Harvard University', domain='harvard.edu')

    doc_to_inst(document.id)

    # Exactly one link row should have been written.
    link_count = Institution_Document.select().count()
    assert link_count == 1

    # The row should join the document to Yale.
    yale_link = Institution_Document.select().where(
        Institution_Document.institution == yale,
        Institution_Document.document == document,
    )
    assert yale_link
def test_match_paths(add_doc, add_institution):
    """
    Documents with a URL path are matched "greedily": each should be linked
    to the institution with which it shares the longest path.
    """
    inst_urls = [
        'http://yale.edu',
        'http://yale.edu/p1',
        'http://yale.edu/p1/p2',
        'http://yale.edu/p1/p2/p3',
    ]
    doc_urls = [
        'http://yale.edu/syllabus.pdf',
        'http://yale.edu/p1/syllabus.pdf',
        'http://yale.edu/p1/p2/syllabus.pdf',
        'http://yale.edu/p1/p2/p3/syllabus.pdf',
    ]

    # Institutions are created first, then documents, in list order.
    institutions = [add_institution(url=url) for url in inst_urls]
    documents = [add_doc(log={'url': url}) for url in doc_urls]

    Institution_Document.link()

    # The Nth document should be linked to the Nth institution.
    for inst, doc in zip(institutions, documents):
        link = Institution_Document.select().where(
            Institution_Document.institution == inst,
            Institution_Document.document == doc,
        )
        assert link
def test_match_subdomains(add_doc, add_institution):
    """
    Documents with subdomain(s) are matched "greedily": each should be
    linked to the institution with which it shares the most subdomains.
    """
    inst_urls = [
        'http://yale.edu',
        'http://sub1.yale.edu',
        'http://sub2.sub1.yale.edu',
        'http://sub3.sub2.sub1.yale.edu',
    ]
    doc_urls = [
        'http://yale.edu/syllabus.pdf',
        'http://sub1.yale.edu/syllabus.pdf',
        'http://sub2.sub1.yale.edu/syllabus.pdf',
        'http://sub3.sub2.sub1.yale.edu/syllabus.pdf',
    ]

    # Institutions are created first, then documents, in list order.
    institutions = [add_institution(url=url) for url in inst_urls]
    documents = [add_doc(log={'url': url}) for url in doc_urls]

    Institution_Document.link()

    # The Nth document should be linked to the Nth institution.
    for inst, doc in zip(institutions, documents):
        link = Institution_Document.select().where(
            Institution_Document.institution == inst,
            Institution_Document.document == doc,
        )
        assert link
def test_link(add_doc, add_institution):
    """
    Institution_Document.link() should link documents -> institutions.
    """
    inst_urls = [
        'http://d1.edu',
        'http://d2.edu',
        'http://d3.edu',
    ]
    doc_urls = [
        'http://d1.edu/syllabus.pdf',
        'http://d2.edu/syllabus.pdf',
        'http://d3.edu/syllabus.pdf',
    ]

    # Institutions are created first, then documents, in list order.
    institutions = [add_institution(url=url) for url in inst_urls]
    documents = [add_doc(log={'url': url}) for url in doc_urls]

    Institution_Document.link()

    # Each document should be paired with the institution at the same index.
    for inst, doc in zip(institutions, documents):
        link = Institution_Document.select().where(
            Institution_Document.institution == inst,
            Institution_Document.document == doc,
        )
        assert link
def test_no_match(add_doc, add_institution):
    """
    A document whose URL matches no institution should not get a link row.
    """
    document = add_doc(log=dict(url='http://yale.edu/syllabus.pdf'))

    # The only institution is on a different domain than the document.
    add_institution(name='Harvard University', domain='harvard.edu')

    doc_to_inst(document.id)

    # No link rows should have been written.
    link_count = Institution_Document.select().count()
    assert link_count == 0
def doc_to_inst(out_file, n):
    """
    Write N institution -> document matches to a CSV.

    Args:
        out_file: File object handed to csv.DictWriter.
        n: Row limit applied to the Institution_Document query.
    """
    fieldnames = ['inst_url', 'doc_url']

    csv_writer = csv.DictWriter(out_file, fieldnames)
    csv_writer.writeheader()

    # One CSV row per matched institution / document pair.
    for match in Institution_Document.select().limit(n):
        csv_writer.writerow({
            'inst_url': match.institution.url,
            'doc_url': match.document.syllabus.url,
        })