Example no. 1
# NOTE: module paths for the project-local imports below are assumptions.
from elasticsearch import Elasticsearch

import constants                             # project-local settings (index names, paths)
from content_handler import ContentHandler  # project-local text pre-processing helper


class QueryHandler:
    """Query interface for deep document search."""

    def __init__(self):
        self.es = Elasticsearch()
        self.ch = ContentHandler()

    def lite_search(self, query):
        """Lightweight more_like_this search over the title and abstract fields."""
        if not isinstance(query, str):
            return []
        processed_query = self.ch.transform(query)
        if processed_query == '':
            return []
        # TODO: tune the tf-idf term/document frequency thresholds below.
        # The query dict must be passed as `body`; passing it positionally
        # would collide with the `index` argument of elasticsearch-py's search().
        res = self.es.search(
            index=constants.DEFAULT_LITE_INDEX,
            body={
                "query": {
                    "more_like_this": {
                        "fields": ["title_index", "abstract_index"],
                        "like": processed_query,
                        "min_term_freq": 1,
                        "max_query_terms": 15,
                        "min_doc_freq": 1
                    }
                }
            })
        return res["hits"]["hits"]
Example no. 2

# Bulk-index the abstracts CSV into Elasticsearch.
# NOTE: project-local module paths below are assumptions.
import pandas as pd
from elasticsearch import Elasticsearch

import constants
import dataset_config
from content_handler import ContentHandler

ch = ContentHandler()
es = Elasticsearch()  # `es` was used below but never defined in the original snippet

# Create the index, ignoring the "index already exists" error status.
es.indices.create(index=constants.DEFAULT_LITE_INDEX,
                  ignore=[constants.ErrorConstants.INDEX_EXISTS])
print('[INFO] Index created')
print('[INFO] Reading abstracts')
docs_to_be_indexed = []
df = pd.read_csv(constants.ABSTRACTS_CSV_PATH)

col = dataset_config.GLOBAL_INDEX[dataset_config.ABSTRACTS]  # column-index map
for i in range(df.shape[0]):
    print('[INFO] Preparing data: ' + str(i) + '/' + str(df.shape[0]))
    row = list(df.iloc[i])
    document_id = row[col["documentIdIndex"]]
    title = row[col["titleIndex"]]
    abstract = row[col["abstractIndex"]]
    paper_link = row[col["researchPaperUrlIndex"]]
    title_index = ch.transform(title)
    abstract_index = ch.transform(abstract)
    max_ranking_text = ch.extract_high_ranked_phrase(abstract)
    # Index operation meta information
    docs_to_be_indexed.append({'index': {'_id': document_id, '_index': constants.DEFAULT_LITE_INDEX}})
    # Body
    docs_to_be_indexed.append({
        'document_id': document_id,
        'title': title,
        'abstract': abstract,
        'research_paper_url': paper_link,
        'title_index': title_index,
        'abstract_index': abstract_index,
        'max_ranking_phrase': max_ranking_text
    })
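
# --- Added sketch (not in the original snippet): the list above is built in
# the bulk API's alternating action/source format but is never sent. A minimal
# flush, assuming the elasticsearch-py client defined above; for large CSVs
# you would likely chunk the list instead of sending it in one request:
if docs_to_be_indexed:
    resp = es.bulk(body=docs_to_be_indexed)
    print('[INFO] Bulk indexing finished, errors: ' + str(resp['errors']))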
Example no. 3
# NOTE: the top of this example was truncated. The imports, the def line of
# lev2_clean, and the encoding_check helper below are reconstructed
# assumptions based on how they are used further down.
import pandas as pd

import constants
import dataset_config
from content_handler import ContentHandler

ch = ContentHandler()


def encoding_check(word):
    # Assumed stand-in: keep only tokens that encode cleanly to ASCII.
    try:
        word.encode('ascii')
        return True
    except UnicodeEncodeError:
        return False


def lev2_clean(wd):
    # Return the token if it passes the encoding check, otherwise None.
    if encoding_check(wd):
        return wd
    return None


file_path = constants.ABSTRACTS_CSV_PATH  # assumed: same CSV as Example no. 2
df = pd.read_csv(file_path)
assert df is not None
word_set = set()
col = dataset_config.GLOBAL_INDEX[dataset_config.ABSTRACTS]  # column-index map
for i in range(df.shape[0]):
    print('[INFO] Processing ' + str(i) + '/' + str(df.shape[0]))
    row = list(df.iloc[i])
    title = row[col["titleIndex"]]
    abstract = row[col["abstractIndex"]]
    title_transformed = ch.transform(title)
    abstract_transformed = ch.transform(abstract)
    for title_token in ch._get_tokens(title_transformed):
        w = lev2_clean(title_token)
        if w is not None:
            word_set.add(w)  # reuse w rather than calling lev2_clean twice
    for abstract_token in ch._get_tokens(abstract_transformed):
        w = lev2_clean(abstract_token)
        if w is not None:
            word_set.add(w)

print('[INFO] word_set generated: ' + str(len(word_set)))
with open('unique_words.txt', 'w') as out_file:
    for word in word_set:
        out_file.write(word + '\n')
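
Downstream consumers can load the vocabulary back into memory; a short sketch (the file name comes from the script above, the membership-lookup use case is an assumption):

def load_vocabulary(path='unique_words.txt'):
    # Read the word list back into a set for O(1) membership tests.
    with open(path) as f:
        return {line.strip() for line in f if line.strip()}

vocab = load_vocabulary()
print('[INFO] vocabulary size: ' + str(len(vocab)))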