# Imports required by this section; the module header is not shown here, and
# helper is assumed to be a local module. InvertedIndex, get_meta_data,
# execute_query, get_top_classes, get_top_members, write_to_output and
# TOP_X_PERCENT_RESULTS are assumed to be defined elsewhere in this module.
import os
import xml.etree.ElementTree as ET

from nltk.tokenize import word_tokenize

import helper


def search(dictionary_file, postings_file, query_file, output_file):
    try:
        # Remove the previous output file, if any
        os.remove(output_file)
    except OSError:
        pass

    inverted_index = InvertedIndex(dictionary_file, postings_file)
    meta_data = get_meta_data()

    tree = ET.parse(query_file)
    root = tree.getroot()
    title_tokens = []
    description_tokens = []
    raw_tokens = []
    for child in root:
        if child.tag == 'title':
            title_tokens = build_tokens(child.text)
            raw_tokens.extend(word_tokenize(child.text))
        elif child.tag == 'description':
            description_tokens = build_tokens(child.text)
            raw_tokens.extend(word_tokenize(child.text))

    raw_tokens = helper.remove_stop_words_without_normalize(
        helper.filter_invalid_characters(raw_tokens))
    additional_tokens = []
    for token in set(raw_tokens):
        additional_tokens.extend(helper.get_similar_words(token))

    title_tokens = helper.remove_stop_words(
        helper.filter_invalid_characters(title_tokens))
    description_tokens = helper.remove_stop_words(
        helper.filter_invalid_characters(description_tokens))

    # "Tight" results favour high precision; we use them as a proxy for
    # true positives.
    tight_results = execute_query(title_tokens, description_tokens, [],
                                  inverted_index, meta_data)

    global top_UPC_classes
    global top_IPC_classes
    global top_family_members
    global top_cited_by

    # Get the top UPC classes, IPC classes, family members and cited-by
    # documents from our true-positive proxy results. These help us determine
    # which documents are most similar to the original top results once the
    # additional similar words are added to the query.
    top_UPC_classes = get_top_classes(tight_results, meta_data['UPC_class'], 6)
    top_IPC_classes = get_top_classes(tight_results, meta_data['IPC_class'], 4)
    top_family_members = get_top_members(tight_results, meta_data['family_members'], 20)
    top_cited_by = get_top_members(tight_results, meta_data['cited_by'], 20)

    # Query expansion (currently disabled):
    # supplementary_results = expand_query(tight_results, meta_data['doc_top_terms'], inverted_index, meta_data)

    # Synonyms and hypernyms collected from the raw tokens
    additional_tokens = helper.normalize_tokens(list(set(additional_tokens)))
    results = execute_query(title_tokens, description_tokens, additional_tokens,
                            inverted_index, meta_data)

    k = int(TOP_X_PERCENT_RESULTS * len(results))
    # j = int(TOP_X_PERCENT_RESULTS * len(supplementary_results))
    # results = list(set(results[:k] + supplementary_results[:j]))
    write_to_output(output_file, results[:k])
def build_tokens(text):
    tokens = word_tokenize(text)
    return helper.normalize_tokens(tokens)
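
# A minimal sketch of how search() might be wired up as a script entry point.
# The -d/-p/-q/-o flag names are an assumption for illustration; they do not
# appear in this section.
if __name__ == '__main__':
    import getopt
    import sys

    dictionary_file = postings_file = query_file = output_file = None
    opts, _ = getopt.getopt(sys.argv[1:], 'd:p:q:o:')
    for flag, value in opts:
        if flag == '-d':
            dictionary_file = value
        elif flag == '-p':
            postings_file = value
        elif flag == '-q':
            query_file = value
        elif flag == '-o':
            output_file = value

    if None in (dictionary_file, postings_file, query_file, output_file):
        sys.exit('usage: search.py -d dictionary -p postings -q query -o output')

    search(dictionary_file, postings_file, query_file, output_file)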