Ejemplo n.º 1
0
		# Compute the term's IDF
		idf = math.log (n_reviews / len(l['occurrences']))
		if idf > 0:
			l['idf'] = idf
		else:
			# If IDF == 0, remove term from inverted index
			del inverted_index[t]

	return inverted_index

def IDF_threshold (inverted_index, term_fraction = .99):
	"""Return the IDF value that a given fraction of terms lies at or above.

	The IDFs stored under the 'idf' key of every entry of ``inverted_index``
	are sorted in ascending order, and the value found at position
	``int(len * (1 - term_fraction))`` of that sorted list is returned.

	Parameters:
		inverted_index: mapping whose values are mappings with an 'idf' key.
		term_fraction: fraction of terms that must sit at or above the
			returned threshold (default .99).

	Returns:
		The IDF value at the selected position of the sorted list.

	Raises:
		IndexError: if ``inverted_index`` contains no entries.
	"""
	# Sort all IDFs by ascending value
	idf_list = sorted(entry['idf'] for entry in inverted_index.values())
	if not idf_list:
		raise IndexError('cannot compute an IDF threshold from an empty inverted index')
	# Find the appropriate position in the list
	threshold_index = int(len(idf_list) * (1 - term_fraction))
	# Clamp into the valid range: a fraction <= 0 would otherwise point one
	# past the end (IndexError), and a fraction > 1 would produce a negative
	# index that silently wraps around to the largest IDFs.
	threshold_index = max(0, min(threshold_index, len(idf_list) - 1))
	# Return the value in the sorted list
	return idf_list[threshold_index]


if __name__ == '__main__':
    #prepare a list of documents with the test dataset
    list = parser.extract_reviews("test_reviews.json.gz")
    direct_index = get_index(list)
    print 'Gotcha direct index: ' + str(direct_index)
    print 'Now computing the inverted index...'
    inverted_index = invert_index(direct_index)
    print len(inverted_index), 'terms indexed.'
    print '99% IDF threshold is', IDF_threshold(inverted_index, .99)
Ejemplo n.º 2
0
        idf = math.log(n_reviews / len(l['occurrences']))
        if idf > 0:
            l['idf'] = idf
        else:
            # If IDF == 0, remove term from inverted index
            del inverted_index[t]

    return inverted_index


def IDF_threshold(inverted_index, term_fraction=.99):
    """Return the IDF value that a given fraction of terms lies at or above.

    The IDFs stored under the 'idf' key of every entry of ``inverted_index``
    are sorted in ascending order, and the value found at position
    ``int(len * (1 - term_fraction))`` of that sorted list is returned.

    Parameters:
        inverted_index: mapping whose values are mappings with an 'idf' key.
        term_fraction: fraction of terms that must sit at or above the
            returned threshold (default .99).

    Returns:
        The IDF value at the selected position of the sorted list.

    Raises:
        IndexError: if ``inverted_index`` contains no entries.
    """
    # Sort all IDFs by ascending value
    idf_list = sorted(entry['idf'] for entry in inverted_index.values())
    if not idf_list:
        raise IndexError('cannot compute an IDF threshold from an empty inverted index')
    # Find the appropriate position in the list
    threshold_index = int(len(idf_list) * (1 - term_fraction))
    # Clamp into the valid range: a fraction <= 0 would otherwise point one
    # past the end (IndexError), and a fraction > 1 would produce a negative
    # index that silently wraps around to the largest IDFs.
    threshold_index = max(0, min(threshold_index, len(idf_list) - 1))
    # Return the value in the sorted list
    return idf_list[threshold_index]


if __name__ == '__main__':
    #prepare a list of documents with the test dataset
    list = parser.extract_reviews("test_reviews.json.gz")
    direct_index = get_index(list)
    print 'Gotcha direct index: ' + str(direct_index)
    print 'Now computing the inverted index...'
    inverted_index = invert_index(direct_index)
    print len(inverted_index), 'terms indexed.'
    print '99% IDF threshold is', IDF_threshold(inverted_index, .99)
Ejemplo n.º 3
0
#############################################################


# INSERT HERE THE PATH OF THE DATASET
path = "../../data/1000_reviews.json.gz"

# Get the IDF threshold if it is present
term_fraction = None if len(sys.argv) < 2 else float(sys.argv[1])

# Get search text if it is present
search_text = None if len(sys.argv) < 3 else sys.argv[2]

# Parse the list of reviews and count the review's direct index
print "Take the list of reviews..."
start = time.time() # start the timer
reviews = parser.extract_reviews(path)
direct_index, items = indexing.get_index(reviews)
print len(direct_index), 'review\'s direct index.\n'

# Compute the inverted index
print 'Now computing the inverted index...'
inverted_index = indexing.invert_index(direct_index)
print len(inverted_index), 'terms indexed.\n'

# For convenience, remember the number of terms and reviews
n_reviews = len(direct_index)
n_terms = max(t['termid'] for t in inverted_index.values()) + 1

# Compute the (optional) IDF threshold and print it if isn't null
idf_threshold = None if not term_fraction else indexing.IDF_threshold(inverted_index, term_fraction)
if idf_threshold: