def book_to_book_corr():
    """Compare every listed book against every listed book and save the result.

    Reads the book list from ``res/filenames.txt`` (the first comma-separated
    field of each line is taken as the file name), loads each book from the
    ``res/`` directory, builds its word 1-gram with
    ``analysis.compute_n_gram_words`` and then calls ``analysis.compute_diff``
    for every ordered pair of books, including a book paired with itself.

    The nested result (one entry per book, each holding the diffs to all
    books) is printed as JSON and written to ``res/book-to-book``.
    """
    # Context managers make sure the listing and every book file are closed.
    with open("res/filenames.txt") as listing:
        file_list = listing.readlines()

    books = []
    for line in file_list:
        # Strip surrounding whitespace so a line without a comma does not
        # carry its trailing newline into the file name; skip blank lines.
        filename = line.split(",")[0].strip()
        if not filename:
            continue

        # Single-argument parenthesised print works identically as a
        # Python 2 statement and as the Python 3 function.
        print("Loading " + filename)
        with open("res/" + filename) as book_file:
            raw_text = book_file.read()

        # Flatten real line breaks into spaces. The escapes must be
        # backslash sequences; "/r/n" style literals never match a newline.
        text = raw_text.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")
        n_gram = analysis.compute_n_gram_words(1, text)
        books.append({
            "title": filename.split(".")[0],
            "n-gram": n_gram,
        })

    diffs = []
    for from_book in books:
        print("Comparing " + from_book["title"] + " to ...")
        current_diffs = []
        for to_book in books:
            print("... " + to_book["title"])
            current_diffs.append({
                "title": to_book["title"],
                "diff": analysis.compute_diff(from_book, to_book),
            })
        diffs.append({
            "title": from_book["title"],
            "diffs": current_diffs,
        })

    print("Correlations: " + dumps(diffs, sort_keys=True, indent=4, separators=(",", ": ")))
    with open("res/book-to-book", "w") as outfile:
        dump(diffs, outfile, sort_keys=True, indent=4, separators=(",", ": "))
def test_compute_diff():
    """Check the relative ordering of ``analysis.compute_diff`` results.

    A reference book is compared with itself (expected diff of 0) and with
    three variants: one with slightly shifted frequencies, one with more
    strongly shifted frequencies, and one where a sequence is replaced.
    The assertions require: slight shift < strong shift < replaced sequence.

    NOTE(review): these fixtures carry a "name" key while
    ``book_to_book_corr`` builds books with a "title" key; presumably
    ``compute_diff`` only reads "n-gram" -- confirm.
    """

    def make_book(label, sequences, frequencies):
        # Build a book fixture in the shape handed to compute_diff.
        return {
            "name": label,
            "n-gram": {
                "sequences": sequences,
                "frequencies": frequencies,
            },
        }

    from_book = make_book("from_book", ["seq1", "seq2"], [0.8, 0.2])
    slightly_shifted = make_book("to_book1", ["seq1", "seq2"], [0.7, 0.3])
    other_sequence = make_book("to_book2", ["seq1", "seq3"], [0.7, 0.3])
    strongly_shifted = make_book("to_book3", ["seq1", "seq2"], [0.6, 0.4])

    def diff_to(other):
        # Difference measured from the reference book to ``other``.
        return analysis.compute_diff(from_book, other)

    # A book compared with itself has no difference.
    assert diff_to(from_book) == 0
    # Same sequences with a small shift beats a replaced sequence.
    assert diff_to(slightly_shifted) < diff_to(other_sequence)
    # A small frequency shift beats a larger one.
    assert diff_to(slightly_shifted) < diff_to(strongly_shifted)
    # Even the larger shift beats a replaced sequence.
    assert diff_to(strongly_shifted) < diff_to(other_sequence)