def index_ngram_word():
    """Build the Whoosh index stored in the ``index_ngram_word`` directory.

    Each record read from ``PATH_QUESTION_ANSWER`` that passes
    ``convenion.is_valid_qa`` is added with its ``id_cmt`` as the stored
    ``id``; its question and answer are first passed through
    ``convenion.customize_and_remove_stopword``.  Both text fields are
    ``NGRAMWORDS`` fields (n-gram sizes 2 to 7) using a
    ``SpaceSeparatedTokenizer``.

    NOTE: ``create_in`` is called unconditionally; per the Whoosh
    documentation this starts a new index in the directory, replacing any
    index that was already there.

    Raises:
        Any exception raised while reading or indexing is re-raised after
        the pending Whoosh write has been cancelled.
    """
    print('Indexing ngram word...')

    schema = Schema(
        id=ID(stored=True),
        question=NGRAMWORDS(minsize=2, maxsize=7,
                            tokenizer=SpaceSeparatedTokenizer()),
        answer=NGRAMWORDS(minsize=2, maxsize=7,
                          tokenizer=SpaceSeparatedTokenizer()),
    )

    if not os.path.exists('index_ngram_word'):
        os.mkdir('index_ngram_word')
    ix = create_in('index_ngram_word', schema)

    writer = ix.writer()
    try:
        with open(PATH_QUESTION_ANSWER, 'r') as f:
            for qa in json_lines.reader(f):
                if not convenion.is_valid_qa(qa):
                    continue
                question = convenion.customize_and_remove_stopword(
                    qa['question'])
                answer = convenion.customize_and_remove_stopword(
                    qa['answer'])
                writer.add_document(id=qa['id_cmt'],
                                    question=question,
                                    answer=answer)
    except BaseException:
        # The writer holds the index write lock; without cancel() a failed
        # run would leave the lock held and block any later writer.
        writer.cancel()
        raise

    print('Commit ngram word...')
    writer.commit()
def raw_index_file():
    """Convert ``PATH_QUESTION_ANSWER`` into ``PATH_QUESTION_ANSWER_INDEXER``.

    For every record accepted by ``convenion.is_valid_qa`` two JSON lines
    are written: an action line ``{"index": {"_id": <id_cmt>}}`` followed
    by the document itself.  The document carries the raw question and
    answer plus their ``convenion.customize_string`` and
    ``convenion.customize_and_remove_stopword`` variants.

    NOTE(review): the action/document pairing looks like the Elasticsearch
    bulk format - confirm against whatever consumes the output file.
    """
    # The output file is opened (and truncated) before the input is opened.
    with jsonlines.open(PATH_QUESTION_ANSWER_INDEXER, mode='w') as writer, \
            jsonlines.open(PATH_QUESTION_ANSWER) as reader:
        for record in filter(convenion.is_valid_qa, reader):
            action = {"index": {"_id": record['id_cmt']}}
            raw_question = record['question']
            raw_answer = record['answer']

            source = {
                "question": raw_question,
                "answer": raw_answer,
                "question_custom":
                    convenion.customize_string(raw_question),
                "answer_custom":
                    convenion.customize_string(raw_answer),
                "question_removed_stopword":
                    convenion.customize_and_remove_stopword(raw_question),
                "answer_removed_stopword":
                    convenion.customize_and_remove_stopword(raw_answer),
            }

            writer.write(action)
            writer.write(source)
def index_basic():
    """Build the Whoosh index stored in the ``index_basic`` directory.

    Each record read from ``PATH_QUESTION_ANSWER`` that passes
    ``convenion.is_valid_qa`` is added with:

    * ``id``: the record's ``id_cmt`` (stored);
    * ``question`` / ``answer``: the raw strings (``STORED`` only);
    * ``question_custom`` / ``answer_custom``: the output of
      ``convenion.customize_and_remove_stopword`` (``TEXT``, stored).

    The processed question and answer are printed for every document.

    The original author's note says this index is meant to be used with
    the BM25F scoring method; nothing in this function configures scoring.

    NOTE: ``create_in`` is called unconditionally; per the Whoosh
    documentation this starts a new index in the directory, replacing any
    index that was already there.

    Raises:
        Any exception raised while reading or indexing is re-raised after
        the pending Whoosh write has been cancelled.
    """
    print('Indexing basic...')

    schema = Schema(
        id=ID(stored=True),
        question=STORED,
        answer=STORED,
        question_custom=TEXT(stored=True),
        answer_custom=TEXT(stored=True),
    )

    if not os.path.exists('index_basic'):
        os.mkdir('index_basic')
    ix = create_in('index_basic', schema)

    writer = ix.writer()
    try:
        with open(PATH_QUESTION_ANSWER, 'r') as f:
            for qa in json_lines.reader(f):
                if not convenion.is_valid_qa(qa):
                    continue

                question = qa['question']
                answer = qa['answer']
                question_custom = convenion.customize_and_remove_stopword(
                    qa['question'])
                answer_custom = convenion.customize_and_remove_stopword(
                    qa['answer'])

                print(question_custom)
                print(answer_custom)

                writer.add_document(id=qa['id_cmt'],
                                    question=question,
                                    answer=answer,
                                    question_custom=question_custom,
                                    answer_custom=answer_custom)
    except BaseException:
        # The writer holds the index write lock; without cancel() a failed
        # run would leave the lock held and block any later writer.
        writer.cancel()
        raise

    print('Commit basic...')
    writer.commit()
def raw_query_pool(target_size=250):
    """Interactively extend the query pool in ``elastic/query_pool.json``.

    Valid records (per ``convenion.is_valid_qa``) are loaded from
    ``PATH_QUESTION_ANSWER`` and shown to the user in random order, one
    question per prompt.  Records whose ``id_cmt`` is already in the pool,
    or was already shown during this run, are skipped.

    Answers at the prompt:

    * ``'1'`` - add the question to the pool as
      ``{'id', 'question', 'searched': 0}``;
    * ``'0'`` - reject the question and stop prompting;
    * anything else - reject the question and move on.

    Prompting stops when the pool holds ``target_size`` entries, when the
    user answers ``'0'``, or when every candidate has been shown.  The pool
    is written back to ``elastic/query_pool.json`` in all cases, including
    when the session is interrupted by an exception.

    Args:
        target_size: Pool size at which prompting stops (default 250).
    """
    with open('elastic/query_pool.json') as f:
        queries = json.load(f)

    print("Current queries len: ", len(queries))
    print("\n")

    # A set keeps the "already seen" test O(1) however large the pool is.
    seen_ids = {query['id'] for query in queries}

    with jsonlines.open(PATH_QUESTION_ANSWER) as reader:
        arr_question_source = [
            qa for qa in reader if convenion.is_valid_qa(qa)
        ]

    # Sample record for a quick sanity check; skipped when there are no
    # valid records, because random.choice raises on an empty list.
    if arr_question_source:
        print(random.choice(arr_question_source))

    # Shuffle once and walk the list instead of drawing random records
    # until an unseen one turns up: the walk is guaranteed to end once
    # every candidate has been shown.
    candidates = list(arr_question_source)
    random.shuffle(candidates)

    user_judge = ''
    try:
        for qa_checking in candidates:
            # ">=" (rather than "!=") also stops when the stored pool is
            # already larger than the target.
            if len(queries) >= target_size or user_judge == '0':
                break

            qa_id = qa_checking['id_cmt']
            if qa_id in seen_ids:
                continue
            seen_ids.add(qa_id)

            user_judge = input(qa_checking['question'] + '\n')
            if user_judge != '1':
                print("Collecting next question...\n")
                continue

            print("Add to query...\n")
            queries.append({
                'id': qa_id,
                'question': qa_checking['question'],
                'searched': 0
            })

            print("Current queries len: ", len(queries))
            print("\n")
    finally:
        # Persist whatever has been accepted so far, even if the session
        # ends with Ctrl-C / EOF or another error.
        with open('elastic/query_pool.json', 'w') as outfile:
            json.dump(queries, outfile)