def run(args):
    """The main function that does all the processing.

    Takes as argument the Namespace object obtained from _get_args().

    For every (query_id, query_tfidf) pair read from ``args.query_tfidf``,
    the similarity of the query against the documents of its source text is
    computed, the best-scoring document is located, and that document plus
    selected neighbors are written as one line to ``args.relevant_docs`` in
    the form ``<query_id> <doc>,<frac>,<frac> ...``.

    Attributes read from ``args``: query_id2source_text_id,
    source_text_id2doc_ids, source_text_id2tfidf, query_tfidf, verbose,
    num_neighbors_to_search, neighbor_tfidf_threshold, partial_doc_fraction
    and relevant_docs.

    Raises:
        RuntimeError: if no query was read from ``args.query_tfidf``.
    """
    query_id2source_text_id = read_map(args.query_id2source_text_id,
                                       num_values_per_key=1)
    source_text_id2doc_ids = read_map(args.source_text_id2doc_ids,
                                      min_num_values_per_key=1)
    source_text_id2tfidf = read_map(args.source_text_id2tfidf,
                                    num_values_per_key=1)

    num_queries = 0
    # None (rather than "") can never compare equal to a real source text
    # id, so the first query always triggers a load of the source TF-IDF.
    prev_source_text_id = None
    for query_id, query_tfidf in tf_idf.read_tfidf_ark(args.query_tfidf):
        num_queries += 1

        # The source text from which a document is to be retrieved for the
        # input query
        source_text_id = query_id2source_text_id[query_id]

        # Reload the source TF-IDF only when the source text changes between
        # consecutive queries.
        if prev_source_text_id != source_text_id:
            source_tfidf = tf_idf.TFIDF()
            # Close the handle as soon as the stats have been loaded.
            # NOTE(review): assumes TFIDF.read() consumes the handle before
            # returning -- confirm against tf_idf.
            with open(source_text_id2tfidf[source_text_id]) as tfidf_file:
                source_tfidf.read(tfidf_file)
            prev_source_text_id = source_text_id

        # The source documents corresponding to the source text.
        # This is set of documents which will be searched over for the query.
        source_doc_ids = source_text_id2doc_ids[source_text_id]

        scores = query_tfidf.compute_similarity_scores(
            source_tfidf, source_docs=source_doc_ids, query_id=query_id)

        assert len(scores) > 0, (
            "Did not get scores for query {0}".format(query_id))

        if args.verbose > 2:
            # .items() works on both Python 2 and Python 3 dicts, whereas
            # .iteritems() exists only on Python 2.
            for tup, score in scores.items():
                logger.debug("Score, {num}: {0} {1} {2}".format(
                    tup[0], tup[1], score, num=num_queries))

        # scores is keyed by (query_id, doc_id); pick the position and id of
        # the highest-scoring source document.
        best_index, best_doc_id = max(
            enumerate(source_doc_ids),
            key=lambda x: scores[(query_id, x[1])])
        best_score = scores[(query_id, best_doc_id)]

        assert source_doc_ids[best_index] == best_doc_id
        assert best_score == max(
            [scores[(query_id, x)] for x in source_doc_ids])

        # Maps an index into source_doc_ids to a pair of fractions that is
        # handed to get_document_ids(); presumably (start fraction,
        # end fraction) of the document to keep -- verify against
        # get_document_ids.
        best_indexes = {}

        if args.num_neighbors_to_search == 0:
            # No neighbor search: the best document in full, plus a partial
            # piece of each immediately adjacent document.
            best_indexes[best_index] = (1, 1)
            if best_index > 0:
                best_indexes[best_index - 1] = (0, args.partial_doc_fraction)
            if best_index < len(source_doc_ids) - 1:
                best_indexes[best_index + 1] = (args.partial_doc_fraction, 0)
        else:
            # Indexes in the search window whose score fell below the
            # threshold.
            excluded_indexes = set()
            for index in range(
                    max(best_index - args.num_neighbors_to_search, 0),
                    min(best_index + args.num_neighbors_to_search + 1,
                        len(source_doc_ids))):
                if (scores[(query_id, source_doc_ids[index])]
                        >= args.neighbor_tfidf_threshold * best_score):
                    best_indexes[index] = (1, 1)    # Type 2
                    if index > 0 and index - 1 in excluded_indexes:
                        # The previous document was excluded; give it the
                        # second fraction now that it borders a selected
                        # document.
                        try:
                            # Type 1 and 3
                            start_frac, end_frac = best_indexes[index - 1]
                            assert end_frac == 0
                            best_indexes[index - 1] = (
                                start_frac, args.partial_doc_fraction)
                        except KeyError:
                            # Type 1
                            best_indexes[index - 1] = (
                                0, args.partial_doc_fraction)
                else:
                    excluded_indexes.add(index)
                    if index > 0 and index - 1 not in excluded_indexes:
                        # Type 3
                        best_indexes[index] = (args.partial_doc_fraction, 0)

        best_docs = get_document_ids(source_doc_ids, best_indexes)

        assert len(best_docs) > 0, (
            "Did not get best docs for query {0}\n"
            "Scores: {1}\n"
            "Source docs: {2}\n"
            "Best index: {best_index}, score: {best_score}\n".format(
                query_id, scores, source_doc_ids,
                best_index=best_index, best_score=best_score))
        assert (best_doc_id, 1.0, 1.0) in best_docs

        print("{0} {1}".format(
            query_id, " ".join(["%s,%.2f,%.2f" % x for x in best_docs])),
            file=args.relevant_docs)

    if num_queries == 0:
        raise RuntimeError("Failed to retrieve any document.")

    logger.info("Retrieved similar documents for "
                "%d queries", num_queries)
def run(args):
    """The main function that does all the processing.

    Takes as argument the Namespace object obtained from _get_args().

    Reads (query_id, query_tfidf) pairs from ``args.query_tfidf``. For each
    query it scores the documents of the query's source text, finds the
    best-scoring document, selects it together with qualifying neighbors,
    and prints ``<query_id> <doc>,<frac>,<frac> ...`` to
    ``args.relevant_docs``.

    Attributes read from ``args``: query_id2source_text_id,
    source_text_id2doc_ids, source_text_id2tfidf, query_tfidf, verbose,
    num_neighbors_to_search, neighbor_tfidf_threshold, partial_doc_fraction
    and relevant_docs.

    Raises:
        RuntimeError: if no query was read from ``args.query_tfidf``.
    """
    query_id2source_text_id = read_map(args.query_id2source_text_id,
                                       num_values_per_key=1)
    source_text_id2doc_ids = read_map(args.source_text_id2doc_ids,
                                      min_num_values_per_key=1)
    source_text_id2tfidf = read_map(args.source_text_id2tfidf,
                                    num_values_per_key=1)

    num_queries = 0
    # A None sentinel never equals a real source text id, which guarantees
    # that source_tfidf is loaded before its first use.
    prev_source_text_id = None
    for query_id, query_tfidf in tf_idf.read_tfidf_ark(args.query_tfidf):
        num_queries += 1

        # The source text from which a document is to be retrieved for the
        # input query
        source_text_id = query_id2source_text_id[query_id]

        # The source TF-IDF is reused across consecutive queries that share
        # a source text and reloaded only on a change.
        if prev_source_text_id != source_text_id:
            source_tfidf = tf_idf.TFIDF()
            # Use a context manager so the file handle is not leaked.
            # NOTE(review): assumes TFIDF.read() finishes reading before it
            # returns -- confirm against tf_idf.
            with open(source_text_id2tfidf[source_text_id]) as tfidf_file:
                source_tfidf.read(tfidf_file)
            prev_source_text_id = source_text_id

        # The source documents corresponding to the source text.
        # This is set of documents which will be searched over for the query.
        source_doc_ids = source_text_id2doc_ids[source_text_id]

        scores = query_tfidf.compute_similarity_scores(
            source_tfidf, source_docs=source_doc_ids, query_id=query_id)

        assert len(scores) > 0, (
            "Did not get scores for query {0}".format(query_id))

        if args.verbose > 2:
            for tup, score in scores.items():
                logger.debug("Score, {num}: {0} {1} {2}".format(
                    tup[0], tup[1], score, num=num_queries))

        # scores is indexed by (query_id, doc_id) tuples; find the position
        # and id of the document with the highest score.
        best_index, best_doc_id = max(
            enumerate(source_doc_ids),
            key=lambda x: scores[(query_id, x[1])])
        best_score = scores[(query_id, best_doc_id)]

        assert source_doc_ids[best_index] == best_doc_id
        assert best_score == max([scores[(query_id, x)]
                                  for x in source_doc_ids])

        # Index into source_doc_ids -> pair of fractions passed on to
        # get_document_ids(); presumably (start fraction, end fraction) of
        # the document to keep -- verify against get_document_ids.
        best_indexes = {}

        if args.num_neighbors_to_search == 0:
            # Without a neighbor search, take the whole best document and a
            # partial piece of the documents right before and after it.
            best_indexes[best_index] = (1, 1)
            if best_index > 0:
                best_indexes[best_index - 1] = (0, args.partial_doc_fraction)
            if best_index < len(source_doc_ids) - 1:
                best_indexes[best_index + 1] = (args.partial_doc_fraction, 0)
        else:
            # Window indexes whose score is below the threshold.
            excluded_indexes = set()
            for index in range(
                    max(best_index - args.num_neighbors_to_search, 0),
                    min(best_index + args.num_neighbors_to_search + 1,
                        len(source_doc_ids))):
                if (scores[(query_id, source_doc_ids[index])]
                        >= args.neighbor_tfidf_threshold * best_score):
                    best_indexes[index] = (1, 1)    # Type 2
                    if index > 0 and index - 1 in excluded_indexes:
                        # The preceding document was excluded; set its
                        # second fraction since it precedes a selected one.
                        try:
                            # Type 1 and 3
                            start_frac, end_frac = best_indexes[index - 1]
                            assert end_frac == 0
                            best_indexes[index - 1] = (
                                start_frac, args.partial_doc_fraction)
                        except KeyError:
                            # Type 1
                            best_indexes[index - 1] = (
                                0, args.partial_doc_fraction)
                else:
                    excluded_indexes.add(index)
                    if index > 0 and index - 1 not in excluded_indexes:
                        # Type 3
                        best_indexes[index] = (args.partial_doc_fraction, 0)

        best_docs = get_document_ids(source_doc_ids, best_indexes)

        assert len(best_docs) > 0, (
            "Did not get best docs for query {0}\n"
            "Scores: {1}\n"
            "Source docs: {2}\n"
            "Best index: {best_index}, score: {best_score}\n".format(
                query_id, scores, source_doc_ids,
                best_index=best_index, best_score=best_score))
        assert (best_doc_id, 1.0, 1.0) in best_docs

        print("{0} {1}".format(query_id, " ".join(
            ["%s,%.2f,%.2f" % x for x in best_docs])),
            file=args.relevant_docs)

    if num_queries == 0:
        raise RuntimeError("Failed to retrieve any document.")

    logger.info("Retrieved similar documents for "
                "%d queries", num_queries)