def book_to_book_corr():
    """Compare every book against every other book via 1-gram differences."""
    file_list = open("res/filenames.txt").readlines()
    books = []
    diffs = []

    # Load each book and compute its 1-gram (word frequency) table.
    for filename in file_list:
        filename = filename.split(",")[0].strip()
        print "Loading " + filename
        text = open("res/" + filename).read().replace("\r\n", " ").replace("\n", " ").replace("\r", " ")
        n_gram = analysis.compute_n_gram_words(1, text)
        books.append({"title": filename.split(".")[0], "n-gram": n_gram})

    # Pairwise comparison of every book against every book (including itself).
    for from_book in books:
        print "Comparing " + from_book["title"] + " to ..."
        current_diffs = []
        for to_book in books:
            print "... " + to_book["title"]
            current_diffs.append({"title": to_book["title"],
                                  "diff": analysis.compute_diff(from_book, to_book)})
        diffs.append({"title": from_book["title"], "diffs": current_diffs})

    print "Correlations: " + dumps(diffs, sort_keys=True, indent=4, separators=(",", ": "))
    with open("res/book-to-book", "w") as outfile:
        dump(diffs, outfile, sort_keys=True, indent=4, separators=(",", ": "))
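
# The helpers used above (analysis.compute_n_gram_words, analysis.compute_diff)
# live in the analysis module, which is not part of this section. The sketch
# below is only an illustration of what compute_diff *might* do for the book
# dicts built in book_to_book_corr(); it assumes the "n-gram" entry is a dict
# mapping a word to its count, and it is not the actual implementation.
def _sketch_compute_diff(from_book, to_book):
    """Hypothetical stand-in for analysis.compute_diff (illustration only).

    Sums the absolute differences of relative 1-gram frequencies over the
    union of words seen in either book; identical books yield 0.0.
    """
    from_counts = from_book["n-gram"]
    to_counts = to_book["n-gram"]
    from_total = float(sum(from_counts.values())) or 1.0
    to_total = float(sum(to_counts.values())) or 1.0
    diff = 0.0
    for word in set(from_counts) | set(to_counts):
        from_freq = from_counts.get(word, 0) / from_total
        to_freq = to_counts.get(word, 0) / to_total
        diff += abs(from_freq - to_freq)
    return diff
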
def set_up_genre_data():
    """Group books by genre, compute per-genre 1-grams and store them."""
    # Load all books and collect them under their genres.
    genres = []
    file_list = open("res/filenames.txt", "r")
    line = file_list.readline().rstrip("\n")
    while line != "":
        splits = line.split(",")
        filename = splits[0]
        # Genre labels start at the third comma-separated field.
        for i in range(2, len(splits)):
            text_genre = splits[i].strip().lower()
            found_genre = False
            for genre in genres:
                if genre["name"] == text_genre:
                    genre["books"].append(filename)
                    found_genre = True
            if not found_genre:
                genres.append({"name": text_genre, "books": [filename]})
        line = file_list.readline().rstrip("\n")

    # Filter out genres which only have one book.
    genres = [genre for genre in genres if len(genre["books"]) > 1]

    # Compute an accumulated 1-gram for each genre.
    for i in range(0, len(genres)):
        print "\nComputing top 500 1-gram for: " + genres[i]["name"]
        n_gram = None
        for j in range(0, len(genres[i]["books"])):
            print "Analyzing text from " + genres[i]["books"][j]
            text = open("res/" + genres[i]["books"][j]).read().replace("\r\n", " ").replace("\r", " ").replace("\n", " ")
            n_gram = analysis.compute_n_gram_words(1, text, n_gram)
        genres[i]["1-gram"] = analysis.get_top_m_n_grams(500, n_gram)
        print "Computed " + dumps(genres[i]["1-gram"], sort_keys=True, indent=4, separators=(',', ': '))

    # Filter out sequences which occur in all genres.
    genres = analysis.filter_sequences(genres)

    # Store the n-grams for each genre.
    for genre in genres:
        with open("res/genres/" + genre["name"].replace(" ", "_"), "w") as outfile:
            dump(genre["1-gram"], outfile)
        with open("res/genres/" + genre["name"].replace(" ", "_") + "-unique", "w") as outfile:
            dump(genre["1-gram-unique"], outfile)

    print "The catalog: " + dumps(genres, sort_keys=True, indent=4, separators=(',', ': '))
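
# analysis.filter_sequences() is also defined outside this section. Judging
# from how set_up_genre_data() uses its result (each genre later carries a
# "1-gram-unique" entry), a plausible sketch is: drop the 1-grams shared by
# every genre and keep the remainder as that genre's unique list. This is an
# assumption for illustration, not the real implementation.
def _sketch_filter_sequences(genres):
    """Hypothetical stand-in for analysis.filter_sequences (illustration only).

    Assumes each genre["1-gram"] is a dict mapping a word to its count.
    """
    if not genres:
        return genres
    # Words that appear in the top list of *every* genre.
    common = set(genres[0]["1-gram"])
    for genre in genres[1:]:
        common &= set(genre["1-gram"])
    for genre in genres:
        genre["1-gram-unique"] = {
            word: count
            for word, count in genre["1-gram"].items()
            if word not in common
        }
    return genres
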