def BKing(file_name, query_string, return_count=10):
    """Rank corpus documents against a query and print a verbose report.

    The inverted index is read from ``file_name + ".index"``.  The query is
    stemmed with ``stem_query``, weighted with ``query_weight`` /
    ``docs_weight`` and scored with ``cos_vector_space_model`` (per the
    original comments: cosine similarity over tf-idf weights).  The best
    ``return_count`` documents are printed, followed by the index entries of
    the query terms and one summary line per corpus document.

    Args:
        file_name: Path of the corpus file; the index is expected beside it
            with a ``.index`` suffix.
        query_string: Raw query text, handed to ``stem_query``.
        return_count: Maximum number of ranked documents to print.

    Returns:
        None.  All results are written to standard output.

    Raises:
        SystemExit: With status 1 when the index file does not exist.
    """
    # NOTE: every print below uses the single-argument parenthesised form so
    # the block behaves the same as a Python 2 statement and a Python 3 call.
    idx_file = file_name + ".index"

    if not os.path.isfile(idx_file):
        # The original message carried a stray backslash line-continuation
        # inside the literal; it is removed here.
        print("Error: index dictionary file inverted index file (.index) "
              "not found. Please Run: python corpusParser.py -f %s -s %s"
              % (file_name, 'stopword'))
        # Raise SystemExit directly instead of calling the site-provided
        # exit() helper, which is not guaranteed to exist (e.g. python -S).
        raise SystemExit(1)

    dicts = decompress_dict(idx_file)
    # Only the document count and the by-name table are used below.
    (N, _docs_by_id, docsByName) = getCorpusDoc(file_name)

    query_string = stem_query(query_string)
    (query_table, docs_set, term_index) = query_weight(
        query_string, dicts, idx_file, N)
    docs_table = docs_weight(docs_set, term_index, query_string, query_table)
    docs_score = cos_vector_space_model(query_table, docs_table)

    print("Query terms: %s" % (query_string,))
    print("Top %s results:" % (return_count,))
    print("doc#\tscore")

    # max(..., 0) keeps a negative return_count meaning "print nothing"
    # rather than turning into a from-the-end slice.
    ranked = sorted(docs_score, key=docs_score.get, reverse=True)
    shown = ranked[:max(return_count, 0)]
    for doc_id in shown:
        print("%d\t%.6f" % (int(doc_id), docs_score[doc_id]))

    # Report a shortfall only when fewer documents than requested were
    # found; the original also printed this when exactly return_count
    # documents matched.
    if len(shown) < return_count:
        print("Only have found %d relevant documents." % len(shown))

    print('\n*****************Some information about corpus*************************')
    print("The query string in corups information:")
    for term in term_index:
        entry = term_index[term]
        # Skip query terms that occur in no document.
        if entry.df > 0:
            entry.output()

    print('\n*****************Some information about corpus*************************')
    for doc in docsByName.values():
        # The literals keep their leading space so the spacing matches what
        # the comma-separated Python 2 print statement produced.
        print("%s  docID: %s  docLength: %s" % (doc[1], doc[0], doc[2]))
def query(file_name, query_string, return_count=10):
    """Rank indexed documents against a query and print the top matches.

    The inverted index is read from ``file_name + ".index"``.  The query is
    stemmed with ``stem_query``, weighted with ``query_weight`` /
    ``docs_weight`` and scored with ``cos_vector_space_model`` (per the
    original comments: cosine similarity over tf-idf weights).

    Args:
        file_name: Path of the corpus file; the index is expected beside it
            with a ``.index`` suffix.
        query_string: Raw query text, handed to ``stem_query``.
        return_count: Maximum number of ranked documents to print.

    Returns:
        None.  Results are written to standard output as ``doc#<TAB>score``.

    Raises:
        SystemExit: With status 1 when the index file does not exist.
    """
    # NOTE(review): this part of the file shows two identical definitions
    # of ``query``; if both live in one module the later one silently wins.
    # NOTE: every print below uses the single-argument parenthesised form so
    # the block behaves the same as a Python 2 statement and a Python 3 call.
    idx_file = file_name + ".index"

    if not os.path.isfile(idx_file):
        # The old message named .index.dict/.index.idx (with an unbalanced
        # parenthesis) although the file actually checked is "<name>.index".
        print("Error: inverted index file (%s) not found." % (idx_file,))
        # Raise SystemExit directly instead of calling the site-provided
        # exit() helper, which is not guaranteed to exist (e.g. python -S).
        raise SystemExit(1)

    dicts = decompress_dict(idx_file)

    query_string = stem_query(query_string)
    (query_table, docs_set, term_index) = query_weight(
        query_string, dicts, idx_file)
    docs_table = docs_weight(docs_set, term_index, query_string, query_table)
    docs_score = cos_vector_space_model(query_table, docs_table)

    print("Query terms: %s" % (query_string,))
    print("Top %s results:" % (return_count,))
    print("doc#\tscore")

    # Slice the ranking instead of counting down on the parameter itself;
    # max(..., 0) keeps a negative return_count meaning "print nothing".
    ranked = sorted(docs_score, key=docs_score.get, reverse=True)
    for doc_id in ranked[:max(return_count, 0)]:
        print("%d\t%.3f" % (int(doc_id), docs_score[doc_id]))
def query(file_name, query_string, return_count=10):
    """Print the highest-scoring indexed documents for a query.

    Reads the inverted index from ``file_name + ".index"``, stems the query
    with ``stem_query``, builds the query and document weight tables with
    ``query_weight`` / ``docs_weight`` and scores them with
    ``cos_vector_space_model`` (per the original comments: cosine similarity
    over tf-idf weights).

    Args:
        file_name: Path of the corpus file; the index is expected beside it
            with a ``.index`` suffix.
        query_string: Raw query text, handed to ``stem_query``.
        return_count: Maximum number of ranked documents to print.

    Returns:
        None.  Results are written to standard output as ``doc#<TAB>score``.

    Raises:
        SystemExit: With status 1 when the index file does not exist.
    """
    # NOTE(review): this part of the file shows two identical definitions
    # of ``query``; if both live in one module the later one silently wins.
    # NOTE: every print below uses the single-argument parenthesised form so
    # the block behaves the same as a Python 2 statement and a Python 3 call.
    idx_file = file_name + ".index"

    if not os.path.isfile(idx_file):
        # The old message named .index.dict/.index.idx (with an unbalanced
        # parenthesis) although the file actually checked is "<name>.index".
        print("Error: inverted index file (%s) not found." % (idx_file,))
        # Raise SystemExit directly instead of calling the site-provided
        # exit() helper, which is not guaranteed to exist (e.g. python -S).
        raise SystemExit(1)

    dicts = decompress_dict(idx_file)

    query_string = stem_query(query_string)
    (query_table, docs_set, term_index) = query_weight(
        query_string, dicts, idx_file)
    docs_table = docs_weight(docs_set, term_index, query_string, query_table)
    docs_score = cos_vector_space_model(query_table, docs_table)

    print("Query terms: %s" % (query_string,))
    print("Top %s results:" % (return_count,))
    print("doc#\tscore")

    # Take the leading slice of the ranking rather than decrementing the
    # caller-visible parameter; max(..., 0) keeps a negative return_count
    # meaning "print nothing".
    limit = max(return_count, 0)
    best_first = sorted(docs_score, key=docs_score.get, reverse=True)
    for doc_id in best_first[:limit]:
        print("%d\t%.3f" % (int(doc_id), docs_score[doc_id]))