Esempio n. 1
0
def book_to_book_corr():
    """Compare the word 1-grams of every listed book against every other.

    The book list is read from ``res/filenames.txt``; on each line the text
    before the first comma is taken as a file name inside ``res/``.  Each
    book is loaded, its line breaks are turned into spaces and its 1-gram is
    computed with ``analysis.compute_n_gram_words``.  Every book is then
    compared to every book (itself included) with ``analysis.compute_diff``.

    The nested result (one entry per source book, each holding a list of
    ``{"title", "diff"}`` entries) is printed as JSON and written to
    ``res/book-to-book``.  Nothing is returned.
    """
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    with open("res/filenames.txt") as list_file:
        file_list = list_file.readlines()

    books = []
    for line in file_list:
        # Strip so that a line without a comma does not keep its trailing
        # newline inside the file name; skip blank lines entirely.
        filename = line.split(",")[0].strip()
        if not filename:
            continue
        print("Loading " + filename)
        with open("res/" + filename) as book_file:
            text = book_file.read()
        # Normalise real line-break characters (the escapes must use
        # backslashes; forward slashes would match nothing in a normal text).
        text = text.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")
        n_gram = analysis.compute_n_gram_words(1, text)
        books.append({"title": filename.split(".")[0], "n-gram": n_gram})

    diffs = []
    for from_book in books:
        print("Comparing " + from_book["title"] + " to ...")
        current_diffs = []
        for to_book in books:
            print("... " + to_book["title"])
            current_diffs.append({
                "title": to_book["title"],
                "diff": analysis.compute_diff(from_book, to_book),
            })
        diffs.append({"title": from_book["title"], "diffs": current_diffs})

    print("Correlations: " + dumps(diffs, sort_keys=True, indent=4, separators=(",", ": ")))

    with open("res/book-to-book", "w") as outfile:
        dump(diffs, outfile, sort_keys=True, indent=4, separators=(",", ": "))
Esempio n. 2
0
def test_compute_diff():
    """Check ``analysis.compute_diff`` on small hand-built books.

    A book compared with itself must give 0, and the three target books
    must be ranked to_book1 < to_book3 < to_book2 relative to ``from_book``.
    """

    def make_book(name, sequences, frequencies):
        # Build a book dict in the shape passed to compute_diff below.
        return {
            "name": name,
            "n-gram": {
                "sequences": sequences,
                "frequencies": frequencies,
            },
        }

    from_book = make_book("from_book", ["seq1", "seq2"], [0.8, 0.2])
    to_book1 = make_book("to_book1", ["seq1", "seq2"], [0.7, 0.3])
    to_book2 = make_book("to_book2", ["seq1", "seq3"], [0.7, 0.3])
    to_book3 = make_book("to_book3", ["seq1", "seq2"], [0.6, 0.4])

    diff = analysis.compute_diff

    assert diff(from_book, from_book) == 0
    assert diff(from_book, to_book1) < diff(from_book, to_book2)
    assert diff(from_book, to_book1) < diff(from_book, to_book3)
    assert diff(from_book, to_book3) < diff(from_book, to_book2)