def compareLists(query, relevant_positional_index, possible_document_matches, db):
    """Group candidate documents by how much of ``query`` they contain in order.

    For every candidate document, each occurrence of each query word is used
    as a starting point and ``detectSample`` is asked how many consecutive
    query words follow from there. The longest such run decides whether the
    document is reported.

    Args:
        query: Sequence of query words, in order.
        relevant_positional_index: Mapping indexed as
            ``index[word]['document_dict'][document]`` yielding the positions
            of ``word`` inside ``document``.
        possible_document_matches: Iterable of candidate document ids.
            Presumably each one contains at least one query word, which is
            why the run length starts at 1 -- confirm against the caller.
        db: Handle forwarded to ``dBDelegate.getSongTitle`` when the title of
            a fully matching document is printed.

    Returns:
        dict mapping a score (longest run length * 0.5) to the list of
        documents with that score, in the order they were examined. Only
        documents whose longest run exceeds 25% of the query length appear.
        An empty query yields an empty dict.
    """
    global positional_index
    # Published at module level; presumably detectSample (which is not handed
    # the index as an argument) reads it from there -- confirm.
    positional_index = relevant_positional_index

    sampled_songs = {}
    query_length = len(query)
    if query_length == 0:
        # Nothing can be matched against an empty query; without this guard
        # every candidate would be reported because the run length starts at 1.
        return sampled_songs

    for document in possible_document_matches:
        longest_run = 1
        for index, word in enumerate(query):
            # A run starting at this word can cover at most the remaining
            # words, so stop once that cannot beat the current best.
            if longest_run >= query_length - index:
                break
            try:
                positions = positional_index[word]['document_dict'][document]
            except KeyError:
                # The word is not indexed, or does not occur in this
                # document: no run can start here.
                continue
            remaining_words = query[index + 1:]
            for position in positions:
                run_length = detectSample(position, remaining_words, document, 1)
                if run_length > longest_run:
                    longest_run = run_length
                    if longest_run == query_length:
                        # Whole query found; announce the song once.
                        print(dBDelegate.getSongTitle(db, document))

        # Keep the document only when more than a quarter of the query was
        # matched consecutively. The score stored as the key is half the run
        # length, kept exactly as the original scoring defined it.
        if longest_run > query_length * .25:
            score = longest_run * .5
            sampled_songs.setdefault(score, []).append(document)

    return sampled_songs
def sortTfidfValues(tfidf_values, limit=20):
    """Print the highest TF-IDF scores together with their song titles.

    Entries are ordered by descending score; ties are broken by ascending
    song key. Nothing is returned -- the ranking is only printed.

    Args:
        tfidf_values: Mapping of song key to TF-IDF score.
        limit: Maximum number of rows to print (non-negative). Defaults to
            20, matching the previously hard-coded value; ``None`` prints
            every entry.
    """
    print("****************************************************************")
    if limit == 20:
        print("Top 20 TFIDF scores")
    else:
        print("Top %s TFIDF scores" % (limit,))

    ranked = sorted(tfidf_values.items(), key=lambda item: (-item[1], item[0]))
    # NOTE(review): ``db`` is not a parameter of this function; it resolves to
    # a module-level name. Confirm it is defined before this is called.
    for song, tfidf in ranked[:limit]:
        print("%s %s" % (dBDelegate.getSongTitle(db, song), tfidf))