def test_empty_documents(self):
    """All mementos are empty HTML documents: every measure should record a
    measurement error (no usable content to compare against the first
    memento) rather than a comparison score.
    """

    working_directory = "/tmp/test_empty_documents"

    # start from a clean slate so leftovers from a previous run
    # cannot affect this test
    if os.path.exists(working_directory):
        shutil.rmtree(working_directory)

    cm = collectionmodel.CollectionModel(working_directory=working_directory)

    # arbitrary header values; the measures under test ignore them
    headers = {
        "key1": "value1",
        "key2": "value2"
    }

    timemap_content ="""<original1>; rel="original",
<timemap1>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT",
<timegate1>; rel="timegate",
<memento11>; rel="first memento"; datetime="Tue, 21 Jan 2016 15:45:06 GMT",
<memento12>; rel="memento"; datetime="Tue, 21 Jan 2017 15:45:06 GMT",
<memento13>; rel="last memento"; datetime="Tue, 21 Jan 2018 15:45:12 GMT"
"""

    empty_html_document = b"<html><body></body></html>"

    # if the first document is empty and all subsequent docs are empty,
    # then we are still on-topic, but this is to be debated
    cm.addTimeMap("timemap1", timemap_content, headers)
    cm.addMemento("memento11", empty_html_document, headers)
    cm.addMemento("memento12", empty_html_document, headers)
    cm.addMemento("memento13", empty_html_document, headers)

    mm = MeasureModel()

    mm = compute_jaccard_across_TimeMap(
        cm, mm, tokenize=None, stemming=True)

    mm = compute_cosine_across_TimeMap(
        cm, mm, tokenize=None, stemming=True)

    mm = compute_gensim_lsi_across_TimeMap(
        cm, mm, tokenize=True, stemming=True
    )

    mm = compute_gensim_lda_across_TimeMap(
        cm, mm, tokenize=True, stemming=True
    )

    for urit in mm.get_TimeMap_URIs():
        for urim in mm.get_Memento_URIs_in_TimeMap(urit):
            for measurename in ["cosine", "jaccard", "gensim_lda", "gensim_lsi"]:

                # NOTE: assertEquals is a deprecated alias removed in
                # Python 3.12; assertEqual is the supported spelling
                self.assertEqual(
                    mm.get_Memento_measurement_error_message(urim, "timemap measures", measurename),
                    "After processing content, the first memento in TimeMap is now empty, cannot effectively compare memento content"
                )

    shutil.rmtree(working_directory)
def test_all_mementos_different(self):
    """Build two TimeMaps whose mementos each receive different content and
    verify that every measure scores the first memento as identical to
    itself and each later memento as different, and that all computed
    scores match the recorded regression values in ``expected_scores``.
    """

    working_directory = "/tmp/test_all_mementos_different"

    # start from a clean slate so leftovers from a previous run
    # cannot affect this test
    if os.path.exists(working_directory):
        shutil.rmtree(working_directory)

    cm = collectionmodel.CollectionModel(working_directory=working_directory)

    # arbitrary header values; the measures under test ignore them
    headers = {
        "key1": "value1",
        "key2": "value2"
    }

    timemap1_content ="""<original1>; rel="original",
<timemap1>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT",
<timegate1>; rel="timegate",
<memento11>; rel="first memento"; datetime="Tue, 21 Jan 2016 15:45:06 GMT",
<memento12>; rel="memento"; datetime="Tue, 21 Jan 2017 15:45:06 GMT",
<memento13>; rel="last memento"; datetime="Tue, 21 Jan 2018 15:45:12 GMT"
"""

    timemap2_content ="""<original1>; rel="original",
<timemap2>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT",
<timegate1>; rel="timegate",
<memento21>; rel="first memento"; datetime="Tue, 21 Mar 2016 15:45:06 GMT",
<memento22>; rel="memento"; datetime="Tue, 21 Mar 2017 15:45:06 GMT",
<memento23>; rel="last memento"; datetime="Tue, 21 Mar 2018 15:45:12 GMT"
"""

    cm.addTimeMap("timemap1", timemap1_content, headers)
    cm.addTimeMap("timemap2", timemap2_content, headers)

    urits = cm.getTimeMapURIList()

    # see: https://en.wikipedia.org/wiki/Pangram
    full_sentence = ['The', 'quick', 'brown', 'fox', 'jumps', 'over',
        'the', 'lazy', 'dog', 'etaoin', 'shrdlu', 'Now', 'is', 'the',
        'time', 'for', 'all', 'good', 'men', 'to', 'come', 'to', 'the',
        'aid', 'of', 'their', 'country', 'Jived', 'fox', 'nymph', 'grabs',
        'quick', 'waltz', 'Glib', 'jocks', 'quiz', 'nymph', 'to', 'vex',
        'dwarf', 'Sphinx', 'of', 'black', 'quartz,', 'judge', 'my', 'vow',
        'How', 'vexingly', 'quick', 'daft', 'zebras', 'jump', 'The',
        'five', 'boxing', 'wizards', 'jump', 'quickly', 'Pack', 'my',
        'box', 'with', 'five', 'dozen', 'liquor', 'jugs'
    ]

    for i in range(0, 2):

        timemap = cm.getTimeMap(urits[i])

        index = i + 1

        for memento in timemap["mementos"]["list"]:

            index += 1

            urim = memento["uri"]
            mdt = memento["datetime"]

            # give each memento a different number of pangram lines,
            # each starting at a different word offset, so that no two
            # generated documents have the same content
            innercontent = urim

            for j in range(0, index):
                innercontent += "\n" + " ".join(full_sentence[(i + j + index):]) + " "

            innercontent += "\n" + str(mdt)

            content = "<html><body>{}</body></html>".format(innercontent)

            cm.addMemento(urim, bytes(content, "utf8"), headers)

    mm = MeasureModel()

    mm = compute_bytecount_across_TimeMap(
        cm, mm, tokenize=False, stemming=False
    )

    mm = compute_wordcount_across_TimeMap(
        cm, mm, tokenize=True, stemming=True
    )

    mm = compute_jaccard_across_TimeMap(
        cm, mm, tokenize=True, stemming=True
    )

    # mm = compute_cosine_across_TimeMap(
    #     cm, scores=scores, stemming=True
    # )

    mm = compute_sorensen_across_TimeMap(
        cm, mm, tokenize=True, stemming=True
    )

    mm = compute_levenshtein_across_TimeMap(
        cm, mm, tokenize=True, stemming=True
    )

    mm = compute_nlevenshtein_across_TimeMap(
        cm, mm, tokenize=True, stemming=True
    )

    # mm = compute_tfintersection_across_TimeMap(
    #     cm, scores=scores, tokenize=True, stemming=True
    # )

    # mm = compute_rawsimhash_across_TimeMap(
    #     cm, mm, tokenize=False, stemming=False
    # )

    self.assertTrue( "timemap1" in mm.get_TimeMap_URIs() )
    self.assertTrue( "timemap2" in mm.get_TimeMap_URIs() )

    self.assertTrue( "memento11" in mm.get_Memento_URIs_in_TimeMap("timemap1") )
    self.assertTrue( "memento12" in mm.get_Memento_URIs_in_TimeMap("timemap1") )
    self.assertTrue( "memento13" in mm.get_Memento_URIs_in_TimeMap("timemap1") )

    self.assertTrue( "memento21" in mm.get_Memento_URIs_in_TimeMap("timemap2") )
    self.assertTrue( "memento22" in mm.get_Memento_URIs_in_TimeMap("timemap2") )
    self.assertTrue( "memento23" in mm.get_Memento_URIs_in_TimeMap("timemap2") )

    # regression values recorded from a known-good run; used with
    # assertAlmostEqual below to detect accidental scoring changes
    expected_scores = {
        'timemaps': {
            'timemap1': {
                'memento11': {
                    'timemap measures': {
                        'bytecount': {
                            'comparison score': 0.0,
                            'individual score': 723},
                        'jaccard': {
                            'comparison score': 0.0},
                        'levenshtein': {
                            'comparison score': 0},
                        'nlevenshtein': {
                            'comparison score': 0.0},
                        'sorensen': {
                            'comparison score': 0.0},
                        'wordcount': {
                            'comparison score': 0.0,
                            'individual score': 94}}},
                'memento12': {
                    'timemap measures': {
                        'bytecount': {
                            'comparison score': 0.43015214384508993,
                            'individual score': 1034},
                        'jaccard': {
                            'comparison score': 0.11363636363636365},
                        'levenshtein': {
                            'comparison score': 45},
                        'nlevenshtein': {
                            'comparison score': 0.3333333333333333},
                        'sorensen': {
                            'comparison score': 0.06024096385542166},
                        'wordcount': {
                            'comparison score': 0.43617021276595747,
                            'individual score': 135}}},
                'memento13': {
                    'timemap measures': {
                        'bytecount': {
                            'comparison score': 0.8409405255878284,
                            'individual score': 1331},
                        'jaccard': {
                            'comparison score': 0.15555555555555556},
                        'levenshtein': {
                            'comparison score': 86},
                        'nlevenshtein': {
                            'comparison score': 0.48863636363636365},
                        'sorensen': {
                            'comparison score': 0.08433734939759041},
                        'wordcount': {
                            'comparison score': 0.8723404255319149,
                            'individual score': 176}}}},
            'timemap2': {
                'memento21': {
                    'timemap measures': {
                        'bytecount': {
                            'comparison score': 0.0,
                            'individual score': 1019},
                        'jaccard': {
                            'comparison score': 0.0},
                        'levenshtein': {
                            'comparison score': 0},
                        'nlevenshtein': {
                            'comparison score': 0.0},
                        'sorensen': {
                            'comparison score': 0.0},
                        'wordcount': {
                            'comparison score': 0.0,
                            'individual score': 133}}},
                'memento22': {
                    'timemap measures': {
                        'bytecount': {
                            'comparison score': 0.28655544651619236,
                            'individual score': 1311},
                        'jaccard': {
                            'comparison score': 0.09302325581395354},
                        'levenshtein': {
                            'comparison score': 45},
                        'nlevenshtein': {
                            'comparison score': 0.25862068965517243},
                        'sorensen': {
                            'comparison score': 0.04878048780487809},
                        'wordcount': {
                            'comparison score': 0.30827067669172936,
                            'individual score': 174}}},
                'memento23': {
                    'timemap measures': {
                        'bytecount': {
                            'comparison score': 0.5593719332679097,
                            'individual score': 1589},
                        'jaccard': {
                            'comparison score': 0.13636363636363635},
                        'levenshtein': {
                            'comparison score': 86},
                        'nlevenshtein': {
                            'comparison score': 0.4056603773584906},
                        'sorensen': {
                            'comparison score': 0.07317073170731703},
                        'wordcount': {
                            'comparison score': 0.593984962406015,
                            'individual score': 212}}}}}}

    for measure in same_scores:

        # we'll have to test TF intersection separately,
        # the way that I build the sentences does not
        # have enough different words
        if measure == "tfintersection" or measure == "cosine" or \
            measure == "raw_simhash" or measure == "tf_simhash" or \
            measure == "gensim_lda" or measure == "gensim_lsi":
            continue

        for urit in mm.get_TimeMap_URIs():
            for urim in mm.get_Memento_URIs_in_TimeMap(urit):

                # comparisons with themselves should match
                if urim == "memento11" or urim == "memento21":
                    self.assertEqual(
                        mm.get_score(urit, urim, "timemap measures", measure),
                        same_scores[measure],
                        "measure {} does not compute the correct score "
                        "for document sameness".format(measure)
                    )
                else:
                    self.assertNotEqual(
                        mm.get_score(urit, urim, "timemap measures", measure),
                        same_scores[measure],
                        "measure {} does not compute the correct score "
                        "for document differentness for URI-M {}".format(
                            measure, urim)
                    )

                # for regression
                self.assertAlmostEqual(
                    mm.get_score(urit, urim, "timemap measures", measure),
                    expected_scores["timemaps"][urit][urim]["timemap measures"][measure]["comparison score"],
                    msg="measure {} does not compute the expected score "
                    "for URI-M {}".format(measure, urim)
                )

    shutil.rmtree(working_directory)
def test_all_mementos_same(self):
    """Give every memento within a TimeMap identical content and confirm
    each measure reports its "documents are the same" score (from the
    shared ``same_scores`` table) for every memento.
    """

    working_directory = "/tmp/test_all_mementos_same"

    # remove any residue from an earlier run before building the collection
    if os.path.exists(working_directory):
        shutil.rmtree(working_directory)

    cm = collectionmodel.CollectionModel(working_directory=working_directory)

    # header values are arbitrary; nothing under test reads them
    headers = {
        "key1": "value1",
        "key2": "value2"
    }

    # one content blob per TimeMap; all mementos in a TimeMap share it
    contents = [
        b"<html><body>Content1 is wonderful</body></html>",
        b"<html><body>Content2 is great</body></html>",
    ]

    timemap1_content ="""<original1>; rel="original",
<timemap1>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT",
<timegate1>; rel="timegate",
<memento11>; rel="first memento"; datetime="Tue, 21 Jan 2016 15:45:06 GMT",
<memento12>; rel="memento"; datetime="Tue, 21 Jan 2017 15:45:06 GMT",
<memento13>; rel="last memento"; datetime="Tue, 21 Jan 2018 15:45:12 GMT"
"""

    timemap2_content ="""<original1>; rel="original",
<timemap2>; rel="self"; type="application/link-format"; from="Tue, 21 Mar 2016 15:45:06 GMT"; until="Tue, 21 Mar 2018 15:45:12 GMT",
<timegate1>; rel="timegate",
<memento21>; rel="first memento"; datetime="Tue, 21 Mar 2016 15:45:06 GMT",
<memento22>; rel="memento"; datetime="Tue, 21 Mar 2017 15:45:06 GMT",
<memento23>; rel="last memento"; datetime="Tue, 21 Mar 2018 15:45:12 GMT"
"""

    cm.addTimeMap("timemap1", timemap1_content, headers)
    cm.addTimeMap("timemap2", timemap2_content, headers)

    urits = cm.getTimeMapURIList()

    # store the same content for every memento of a given TimeMap
    for position, urit_uri in enumerate(urits[:2]):
        stored_timemap = cm.getTimeMap(urit_uri)
        for memento in stored_timemap["mementos"]["list"]:
            cm.addMemento(memento["uri"], contents[position], headers)

    mm = MeasureModel()

    # run every measure over the collection; each call folds its scores
    # into the shared MeasureModel
    measure_runs = [
        (compute_bytecount_across_TimeMap, False, False),
        (compute_wordcount_across_TimeMap, True, True),
        (compute_jaccard_across_TimeMap, True, True),
        (compute_cosine_across_TimeMap, True, True),
        (compute_sorensen_across_TimeMap, True, True),
        (compute_levenshtein_across_TimeMap, True, True),
        (compute_nlevenshtein_across_TimeMap, True, True),
        (compute_tfintersection_across_TimeMap, True, True),
        (compute_rawsimhash_across_TimeMap, False, False),
        (compute_tfsimhash_across_TimeMap, True, True),
        (compute_gensim_lsi_across_TimeMap, True, True),
        (compute_gensim_lda_across_TimeMap, True, True),
    ]

    for measure_func, tokenize_flag, stemming_flag in measure_runs:
        mm = measure_func(cm, mm, tokenize=tokenize_flag, stemming=stemming_flag)

    # both TimeMaps and all six mementos must be present in the model
    discovered_timemaps = mm.get_TimeMap_URIs()
    for expected_urit in ("timemap1", "timemap2"):
        self.assertTrue(expected_urit in discovered_timemaps)

    expected_membership = (
        ("timemap1", ("memento11", "memento12", "memento13")),
        ("timemap2", ("memento21", "memento22", "memento23")),
    )
    for urit_key, expected_urims in expected_membership:
        recorded_urims = mm.get_Memento_URIs_in_TimeMap(urit_key)
        for expected_urim in expected_urims:
            self.assertTrue(expected_urim in recorded_urims)

    for measure in same_scores:

        print("evaluating measure {}".format(measure))

        for urit in mm.get_TimeMap_URIs():
            for urim in mm.get_Memento_URIs_in_TimeMap(urit):

                observed = mm.get_score(urit, urim, "timemap measures", measure)
                failure_note = (
                    "measure {} does not compute the correct score "
                    "for document sameness with URI-M {}".format(measure, urim)
                )

                # LDA does not appear to be deterministic
                if measure == "gensim_lda":
                    self.assertGreaterEqual(
                        observed, same_scores[measure], msg=failure_note)
                else:
                    self.assertAlmostEqual(
                        observed, same_scores[measure], msg=failure_note)

    shutil.rmtree(working_directory)
def test_measuremodel_storage_happy_path(self):
    """Exercise MeasureModel end to end: score storage and retrieval,
    TimeMap/memento error recording, the stemmed/tokenized/boilerplate
    flags, off-topic calculation, and serialization to JSON, CSV, and
    gold-standard TSV formats.
    """

    working_directory = "/tmp/test_measuremodel_storage_happy_path"

    if not os.path.exists(working_directory):
        os.makedirs(working_directory)

    mm = MeasureModel()

    # a score stored for (TimeMap, memento, measure type, measure)
    # must be readable back unchanged
    mm.set_score(
        "timemap1",
        "http://examplearchive.org/19700101000000/http://memento1",
        "measuretype1",
        "measure1",
        539)

    self.assertEqual(
        mm.get_score(
            "timemap1",
            "http://examplearchive.org/19700101000000/http://memento1",
            "measuretype1",
            "measure1"),
        539)

    mm.set_TimeMap_access_error("timemap2", "this is an error message for timemap2")

    self.assertEqual(
        mm.get_TimeMap_access_error_message("timemap2"),
        "this is an error message for timemap2")

    # mm.set_TimeMap_measurement_error("timemap3", "this is an error message for timemap3")
    # self.assertEqual(
    #     mm.get_TimeMap_measurement_error_message("timemap3"),
    #     "this is an error message for timemap3"
    # )

    mm.set_Memento_access_error(
        "timemap4",
        "http://examplearchive.org/19700101000000/http://memento2",
        "this is a memento error message for http://examplearchive.org/19700101000000/http://memento2"
    )

    self.assertEqual(
        mm.get_Memento_access_error_message(
            "http://examplearchive.org/19700101000000/http://memento2"),
        "this is a memento error message for http://examplearchive.org/19700101000000/http://memento2"
    )

    mm.set_Memento_measurement_error(
        "timemap5",
        "http://examplearchive.org/19700101000000/http://memento3",
        "measuretype1",
        "measure1",
        "this is a memento error message for http://examplearchive.org/19700101000000/http://memento3"
    )

    self.assertEqual(
        mm.get_Memento_measurement_error_message(
            "http://examplearchive.org/19700101000000/http://memento3",
            "measuretype1",
            "measure1"),
        "this is a memento error message for http://examplearchive.org/19700101000000/http://memento3"
    )

    # the stemmed flag defaults to None until explicitly set
    self.assertEqual(
        mm.get_stemmed(
            "timemap1",
            "http://examplearchive.org/19700101000000/http://memento1",
            "measuretype1",
            "measure1"),
        None)

    mm.set_stemmed(
        "timemap1",
        "http://examplearchive.org/19700101000000/http://memento1",
        "measuretype1",
        "measure1",
        True)

    self.assertEqual(
        mm.get_stemmed(
            "timemap1",
            "http://examplearchive.org/19700101000000/http://memento1",
            "measuretype1",
            "measure1"),
        True)

    # the tokenized flag behaves the same way
    self.assertEqual(
        mm.get_tokenized(
            "timemap1",
            "http://examplearchive.org/19700101000000/http://memento1",
            "measuretype1",
            "measure1"),
        None)

    mm.set_tokenized(
        "timemap1",
        "http://examplearchive.org/19700101000000/http://memento1",
        "measuretype1",
        "measure1",
        True)

    self.assertEqual(
        mm.get_tokenized(
            "timemap1",
            "http://examplearchive.org/19700101000000/http://memento1",
            "measuretype1",
            "measure1"),
        True)

    # as does the removed-boilerplate flag
    self.assertEqual(
        mm.get_removed_boilerplate(
            "timemap1",
            "http://examplearchive.org/19700101000000/http://memento1",
            "measuretype1",
            "measure1"),
        None)

    mm.set_removed_boilerplate(
        "timemap1",
        "http://examplearchive.org/19700101000000/http://memento1",
        "measuretype1",
        "measure1",
        True)

    self.assertEqual(
        mm.get_removed_boilerplate(
            "timemap1",
            "http://examplearchive.org/19700101000000/http://memento1",
            "measuretype1",
            "measure1"),
        True)

    # every TimeMap touched above, including error-only ones, is listed
    self.assertEqual(mm.get_TimeMap_URIs(), ["timemap1", "timemap2", "timemap4", "timemap5"])

    self.assertEqual(
        mm.get_Memento_URIs_in_TimeMap("timemap1"),
        ["http://examplearchive.org/19700101000000/http://memento1"])

    self.assertEqual(mm.get_Measures(), [("measuretype1", "measure1")])

    mm.set_score(
        "timemap1",
        "http://examplearchive.org/19700101000000/http://memento4",
        "measuretype1",
        "measure1",
        550)

    # threshold 540 with ">": scores above 540 are flagged off-topic,
    # so memento1 (539) stays on-topic and memento4 (550) does not
    mm.calculate_offtopic_by_measure("measuretype1", "measure1", 540, ">")

    mm.calculate_overall_offtopic_status()

    self.assertEqual(
        mm.get_off_topic_status_by_measure(
            "http://examplearchive.org/19700101000000/http://memento1",
            "measuretype1",
            "measure1"),
        "on-topic")

    self.assertEqual(
        mm.get_off_topic_status_by_measure(
            "http://examplearchive.org/19700101000000/http://memento4",
            "measuretype1",
            "measure1"),
        "off-topic")

    jsonfilename = "{}/test_measuremodel_storage_happy_path.json".format(
        working_directory)

    mm.save_as_JSON(jsonfilename)

    with open(jsonfilename) as jsonfile:
        jsondata = json.load(jsonfile)

    pp.pprint(jsondata)

    expectedjsondata = {
        "timemap1": {
            "http://examplearchive.org/19700101000000/http://memento1": {
                "measuretype1": {
                    "measure1": {
                        "stemmed": True,
                        "tokenized": True,
                        "removed boilerplate": True,
                        "comparison score": 539,
                        "topic status": "on-topic"
                    }
                },
                "overall topic status": "on-topic"
            },
            "http://examplearchive.org/19700101000000/http://memento4": {
                "measuretype1": {
                    "measure1": {
                        "stemmed": None,
                        "tokenized": None,
                        "removed boilerplate": None,
                        "comparison score": 550,
                        "topic status": "off-topic"
                    }
                },
                "overall topic status": "off-topic"
            }
        },
        "timemap2": {
            "access error": "this is an error message for timemap2"
        },
        "timemap4": {
            "http://examplearchive.org/19700101000000/http://memento2": {
                "access error": "this is a memento error message for http://examplearchive.org/19700101000000/http://memento2"
            }
        },
        "timemap5": {
            "http://examplearchive.org/19700101000000/http://memento3": {
                "measuretype1": {
                    "measure1": {
                        "measurement error": "this is a memento error message for http://examplearchive.org/19700101000000/http://memento3"
                    }
                }
            }
        }
    }

    self.maxDiff = None
    self.assertEqual(expectedjsondata, jsondata)

    csvfilename = "{}/test_measuremodel_storage_happy_path.csv".format(
        working_directory)

    mm.save_as_CSV(csvfilename)

    with open(csvfilename) as csvfile:
        csvdata = csvfile.read()

    # NOTE(review): line breaks inside this literal were reconstructed at
    # CSV-row boundaries (the source was whitespace-mangled) — confirm
    # against save_as_CSV output
    expectedcsvdata = """URI-T,URI-M,Error,Error Message,Content Length,Simhash,Measurement Type,Measurement Name,Comparison Score,Stemmed,Tokenized,Removed Boilerplate,Topic Status,Overall Topic Status
timemap1,http://examplearchive.org/19700101000000/http://memento1,,,,,measuretype1,measure1,539,True,True,True,on-topic,on-topic
timemap1,http://examplearchive.org/19700101000000/http://memento4,,,,,measuretype1,measure1,550,,,,off-topic,off-topic
timemap2,,TimeMap Access Error,this is an error message for timemap2,,,,,,,,,,
timemap4,http://examplearchive.org/19700101000000/http://memento2,Memento Access Error,this is a memento error message for http://examplearchive.org/19700101000000/http://memento2,,,,,,,,,,
timemap5,http://examplearchive.org/19700101000000/http://memento3,Memento Measurement Error,this is a memento error message for http://examplearchive.org/19700101000000/http://memento3,,,measuretype1,measure1,,,,,,
"""

    self.assertEqual(expectedcsvdata, csvdata)

    gsfilename = "{}/test_measuremodel_storage_happy_path.tsv".format(
        working_directory)

    mm.save_as_goldstandard(gsfilename)

    # NOTE(review): field separators reconstructed as tabs (the file is
    # .tsv and the source was whitespace-mangled) — verify against the
    # actual save_as_goldstandard output, especially the error-only
    # timemap2 row
    expectedgsdata = """id	date	URI	label
1	19700101000000	http://examplearchive.org/19700101000000/http://memento1	1
1	19700101000000	http://examplearchive.org/19700101000000/http://memento4	0
2			ERROR
3	19700101000000	http://examplearchive.org/19700101000000/http://memento2	ERROR
4	19700101000000	http://examplearchive.org/19700101000000/http://memento3	ERROR
"""

    with open(gsfilename) as gsfile:
        gsdata = gsfile.read()

    self.assertEqual(expectedgsdata, gsdata)