from elasticsearch import Elasticsearch

import constants
from content_handler import ContentHandler  # project-local module (assumed path)


class QueryHandler:
    # TODO: Query interface for deep doc search.

    def __init__(self):
        self.es = Elasticsearch()
        self.ch = ContentHandler()

    def lite_search(self, query):
        if not isinstance(query, str):
            return []
        processed_query = self.ch.transform(query)
        if processed_query == '':
            return []
        # TODO: Proper tf-idf frequency thresholds.
        res = self.es.search(
            body={
                "query": {
                    "more_like_this": {
                        "fields": ["title_index", "abstract_index"],
                        "like": processed_query,
                        "min_term_freq": 1,
                        "max_query_terms": 15,
                        "min_doc_freq": 1
                    }
                }
            },
            index=constants.DEFAULT_LITE_INDEX)
        return res["hits"]["hits"]
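# A minimal usage sketch for QueryHandler.lite_search, assuming a reachable
# local Elasticsearch node and a populated DEFAULT_LITE_INDEX. The query
# string and the printed fields are illustrative; the fields mirror the
# document body built by the indexing script below.
if __name__ == '__main__':
    handler = QueryHandler()
    for hit in handler.lite_search('neural machine translation'):
        print(hit['_score'], hit['_source']['title'])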
from elasticsearch import Elasticsearch
import pandas as pd

import constants
import dataset_config
from content_handler import ContentHandler  # project-local module (assumed path)

es = Elasticsearch()
ch = ContentHandler()

es.indices.create(constants.DEFAULT_LITE_INDEX,
                  ignore=[constants.ErrorConstants.INDEX_EXISTS])
print('[INFO] Index created')

print('[INFO] Reading abstracts')
docs_to_be_indexed = []
df = pd.read_csv(constants.ABSTRACTS_CSV_PATH)
for i in range(df.shape[0]):
    print('[INFO] Preparing data: ' + str(i) + '/' + str(df.shape[0]))
    row = list(df.iloc[i])
    abstracts_cfg = dataset_config.GLOBAL_INDEX[dataset_config.ABSTRACTS]
    document_id = row[abstracts_cfg["documentIdIndex"]]
    title = row[abstracts_cfg["titleIndex"]]
    abstract = row[abstracts_cfg["abstractIndex"]]
    paper_link = row[abstracts_cfg["researchPaperUrlIndex"]]
    title_index = ch.transform(title)
    abstract_index = ch.transform(abstract)
    max_ranking_text = ch.extract_high_ranked_phrase(abstract)
    # Index operation meta information
    docs_to_be_indexed.append({'index': {'_id': document_id,
                                         '_index': constants.DEFAULT_LITE_INDEX}})
    # Body
    docs_to_be_indexed.append({
        'document_id': document_id,
        'title': title,
        'abstract': abstract,
        'research_paper_url': paper_link,
        'title_index': title_index,
        'abstract_index': abstract_index,
        'max_ranking_phrase': max_ranking_text
    })
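# docs_to_be_indexed alternates action metadata and document bodies, which is
# the shape Elasticsearch's bulk API expects. The excerpt above cuts off
# before the list is sent; a minimal sketch of that flush step, assuming a
# single bulk request is sufficient for the dataset:
es.bulk(body=docs_to_be_indexed)
print('[INFO] Indexed ' + str(len(docs_to_be_indexed) // 2) + ' documents')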
import pandas as pd

import dataset_config
from content_handler import ContentHandler  # project-local module (assumed path)

ch = ContentHandler()


def lev2_clean(wd):
    # NOTE: the start of this function is truncated in the source; only the
    # final encoding check survives. `w2` appears to be a cleaned form of `wd`.
    if encoding_check(w2):
        return wd
    return None


# `file_path` is defined earlier in the source (not shown in this excerpt).
df = pd.read_csv(file_path)
assert df is not None
word_set = set()
for i in range(df.shape[0]):
    print('[INFO] Processing ' + str(i) + '/' + str(df.shape[0]))
    row = list(df.iloc[i])
    abstracts_cfg = dataset_config.GLOBAL_INDEX[dataset_config.ABSTRACTS]
    title = row[abstracts_cfg["titleIndex"]]
    abstract = row[abstracts_cfg["abstractIndex"]]
    title_transformed = ch.transform(title)
    abstract_transformed = ch.transform(abstract)
    for title_token in ch._get_tokens(title_transformed):
        w = lev2_clean(title_token)
        if w is not None:
            word_set.add(w)  # reuse the cleaned token instead of cleaning it twice
    for abstract_token in ch._get_tokens(abstract_transformed):
        w = lev2_clean(abstract_token)
        if w is not None:
            word_set.add(w)

print('[INFO] word_set generated: ' + str(len(word_set)))
with open('unique_words.txt', 'w+') as out_file:
    for word in word_set:
        out_file.write(word + '\n')
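# `encoding_check` is referenced above but not defined in this excerpt. A
# minimal sketch, assuming from the name and usage that it filters out tokens
# which are not plainly encodable; the ASCII criterion here is an assumption,
# not the original implementation:
def encoding_check(word):
    try:
        word.encode('ascii')  # hypothetical criterion: keep ASCII-only tokens
        return True
    except UnicodeEncodeError:
        return False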