# NOTE(review): the commented statements below are the tail of a function
# whose header lies before this chunk (presumably invert_index -- TODO
# confirm). They are preserved as a fragment; the indentation shown is a
# guess, because the enclosing `def`/loop is not visible from here.
# NOTE(review): the print statements in this file imply Python 2, where `/`
# between two ints truncates -- confirm n_reviews is a float, otherwise the
# ratio is floored before the logarithm is taken.
#
#         # Compute the term's IDF
#         idf = math.log (n_reviews / len(l['occurrences']))
#         if idf > 0:
#             l['idf'] = idf
#         else:
#             # If IDF == 0, remove term from inverted index
#             del inverted_index[t]
#     return inverted_index


def IDF_threshold(inverted_index, term_fraction=.99):
    """Return the IDF level that a given fraction of terms reaches.

    The 'idf' value of every entry in `inverted_index` is collected and
    sorted in ascending order; the value located `1 - term_fraction` of the
    way through that sorted list is returned, so roughly `term_fraction` of
    the terms have an IDF at or above the result.

    Args:
        inverted_index: mapping whose values are dicts holding an 'idf' key.
        term_fraction: fraction of terms that should sit at or above the
            returned level. Values <= 0 yield the largest IDF and values
            >= 1 yield the smallest IDF.

    Returns:
        One of the 'idf' values stored in `inverted_index`.

    Raises:
        IndexError: if `inverted_index` contains no terms.
    """
    # Sort all IDFs by ascending value
    idf_list = sorted(entry['idf'] for entry in inverted_index.values())
    if not idf_list:
        raise IndexError('cannot compute an IDF threshold for an empty index')

    # Find the appropriate position in the list
    threshold_index = int(len(idf_list) * (1 - term_fraction))
    # Clamp into the valid range: term_fraction == 0 would otherwise index
    # one past the end, and term_fraction > 1 would produce a negative index
    # that silently wraps around to the end of the list.
    threshold_index = max(0, min(threshold_index, len(idf_list) - 1))

    # Return the value in the sorted list
    return idf_list[threshold_index]


if __name__ == '__main__':
    # Prepare a list of documents with the test dataset
    # (named `reviews` so the builtin `list` is not shadowed).
    reviews = parser.extract_reviews("test_reviews.json.gz")
    direct_index = get_index(reviews)
    # Single-argument print(...) emits the same text under the Python 2
    # print statement and also parses as a Python 3 function call.
    print('Gotcha direct index: ' + str(direct_index))
    print('Now computing the inverted index...')
    inverted_index = invert_index(direct_index)
    print('%d terms indexed.' % len(inverted_index))
    print('99%% IDF threshold is %s' % IDF_threshold(inverted_index, .99))
# NOTE(review): the commented statements below are the tail of a function
# whose header lies before this chunk (presumably invert_index -- TODO
# confirm). They are preserved as a fragment; the indentation shown is a
# guess, because the enclosing `def`/loop is not visible from here.
# NOTE(review): the print statements in this file imply Python 2, where `/`
# between two ints truncates -- confirm n_reviews is a float, otherwise the
# ratio is floored before the logarithm is taken.
#
#         idf = math.log(n_reviews / len(l['occurrences']))
#         if idf > 0:
#             l['idf'] = idf
#         else:
#             # If IDF == 0, remove term from inverted index
#             del inverted_index[t]
#     return inverted_index


def IDF_threshold(inverted_index, term_fraction=.99):
    """Return the IDF level that a given fraction of terms reaches.

    The 'idf' value of every entry in `inverted_index` is collected and
    sorted in ascending order; the value located `1 - term_fraction` of the
    way through that sorted list is returned, so roughly `term_fraction` of
    the terms have an IDF at or above the result.

    Args:
        inverted_index: mapping whose values are dicts holding an 'idf' key.
        term_fraction: fraction of terms that should sit at or above the
            returned level. Values <= 0 yield the largest IDF and values
            >= 1 yield the smallest IDF.

    Returns:
        One of the 'idf' values stored in `inverted_index`.

    Raises:
        IndexError: if `inverted_index` contains no terms.
    """
    # Sort all IDFs by ascending value
    idf_list = sorted(entry['idf'] for entry in inverted_index.values())
    if not idf_list:
        raise IndexError('cannot compute an IDF threshold for an empty index')

    # Find the appropriate position in the list
    threshold_index = int(len(idf_list) * (1 - term_fraction))
    # Clamp into the valid range: term_fraction == 0 would otherwise index
    # one past the end, and term_fraction > 1 would produce a negative index
    # that silently wraps around to the end of the list.
    threshold_index = max(0, min(threshold_index, len(idf_list) - 1))

    # Return the value in the sorted list
    return idf_list[threshold_index]


if __name__ == '__main__':
    # Prepare a list of documents with the test dataset
    # (named `reviews` so the builtin `list` is not shadowed).
    reviews = parser.extract_reviews("test_reviews.json.gz")
    direct_index = get_index(reviews)
    # Single-argument print(...) emits the same text under the Python 2
    # print statement and also parses as a Python 3 function call.
    print('Gotcha direct index: ' + str(direct_index))
    print('Now computing the inverted index...')
    inverted_index = invert_index(direct_index)
    print('%d terms indexed.' % len(inverted_index))
    print('99%% IDF threshold is %s' % IDF_threshold(inverted_index, .99))
############################################################# # INSERT HERE THE PATH OF THE DATASET path = "../../data/1000_reviews.json.gz" # Get the IDF threshold if it is present term_fraction = None if len(sys.argv) < 2 else float(sys.argv[1]) # Get search text if it is present search_text = None if len(sys.argv) < 3 else sys.argv[2] # Parse the list of reviews and count the review's direct index print "Take the list of reviews..." start = time.time() # start the timer reviews = parser.extract_reviews(path) direct_index, items = indexing.get_index(reviews) print len(direct_index), 'review\'s direct index.\n' # Compute the inverted index print 'Now computing the inverted index...' inverted_index = indexing.invert_index(direct_index) print len(inverted_index), 'terms indexed.\n' # For convenience, remember the number of terms and reviews n_reviews = len(direct_index) n_terms = max(t['termid'] for t in inverted_index.values()) + 1 # Compute the (optional) IDF threshold and print it if isn't null idf_threshold = None if not term_fraction else indexing.IDF_threshold(inverted_index, term_fraction) if idf_threshold: