def get_question_answer_pair_wikiqa(wikiqa):
    from tqdm import tqdm
    from IR import infoRX
    # glove = load_glove(200)
    from nltk.stem.wordnet import WordNetLemmatizer

    lmtzr = WordNetLemmatizer()
    wikiqa_data = []

    for sample in tqdm(wikiqa, total=len(wikiqa), ncols=75, unit='Sample'):
        q, context, label_list = sample
        tfidf, imp_tokens = infoRX.tf_idf(context, q)

        for i, c in enumerate(context):
            # q_vector, a_vector = qa_vectorize(q, c, glove)
            # WARNING: THIS WILL INCREASE THE FILE SIZE ON THE ORDER OF GBs

            # Count how many important (high tf-idf) tokens from the question
            # appear in this candidate sentence, comparing lemmatized forms.
            word_cnt = 0
            for imp in imp_tokens:
                if lmtzr.lemmatize(imp) in [lmtzr.lemmatize(w) for w in c.split()]:
                    word_cnt += 1

            wikiqa_data.append((q, c, label_list[i], tfidf[i], word_cnt))

    return wikiqa_data
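# Usage sketch (illustrative only; the toy sample below is hypothetical).
# WikiQA samples are assumed to be (question, candidate_sentences, labels)
# triples, matching the unpacking in get_question_answer_pair_wikiqa above.
def _demo_wikiqa():
    wikiqa = [(
        'what is the capital of france',
        ['Paris is the capital of France .', 'France borders Spain .'],
        [1, 0],
    )]
    # Each output row: (question, candidate, label, tf-idf score, overlap count).
    return get_question_answer_pair_wikiqa(wikiqa)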
def get_question_answer_pair_squad(squad):
    from IR import infoRX
    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk import sent_tokenize

    lmtzr = WordNetLemmatizer()
    squad_qa = []

    for s in squad:
        context, qas_list = s
        context = sent_tokenize(context)
        # context = context[:-1]  # To remove empty item after splitting the last '.'
        # context = [c + '. ' for c in context]

        for qas in qas_list:
            a_start, a_text, a_sentence, q, _ = qas
            tfidf, imp_tokens = infoRX.tf_idf(context, q)

            for i, c in enumerate(context):
                # Reset per candidate so only the sentence containing the
                # answer is labeled 1.
                label = 1 if c == a_sentence else 0

                # Count lemmatized important-token overlap with the question.
                word_cnt = 0
                for imp in imp_tokens:
                    if lmtzr.lemmatize(imp) in [lmtzr.lemmatize(w) for w in c.split()]:
                        word_cnt += 1

                squad_qa.append((q, c, label, tfidf[i], word_cnt))

    return squad_qa
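# Usage sketch (illustrative; hypothetical toy data). SQuAD samples are
# assumed to be (context_paragraph, qas_list) pairs, where each qas entry
# unpacks as (answer_start, answer_text, answer_sentence, question, id),
# matching the unpacking in get_question_answer_pair_squad above.
def _demo_squad():
    context = 'Paris is the capital of France. France borders Spain.'
    qas = [(0, 'Paris', 'Paris is the capital of France.',
            'what is the capital of france', None)]
    return get_question_answer_pair_squad([(context, qas)])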
def get_question_answer_pair_babi(babi):
    from tqdm import tqdm
    from IR import infoRX
    # glove = load_glove(200)

    babi_data = []

    for sample in tqdm(babi, total=len(babi), ncols=75, unit='Sample'):
        line_numbers, context, question, _, support = sample
        tfidf, imp_tokens = infoRX.tf_idf(context, question)

        for i, c in enumerate(context):
            # q_vector, a_vector = qa_vectorize(question, c, glove)
            # WARNING: THIS WILL INCREASE THE FILE SIZE ON THE ORDER OF GBs

            # A sentence is a positive sample if it is the supporting fact.
            label = 0
            line_number = line_numbers[i]
            if int(support) == int(line_number):
                label = 1

            # Count important-token overlap (substring match, no lemmatizing).
            word_cnt = 0
            for imp in imp_tokens:
                if imp in c:
                    word_cnt += 1

            babi_data.append((question, c, label, tfidf[i], word_cnt))

    return babi_data
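# Usage sketch (illustrative; hypothetical toy data). bAbI samples are
# assumed to unpack as (line_numbers, context_sentences, question, answer,
# supporting_line_number), matching get_question_answer_pair_babi above.
def _demo_babi():
    sample = (
        ['1', '2'],
        ['Mary moved to the bathroom .', 'John went to the hallway .'],
        'Where is Mary ?',
        'bathroom',
        '1',
    )
    return get_question_answer_pair_babi([sample])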
def extract_features(self, q, a_list):
    from IR import infoRX

    a_list = [str(a) for a in a_list]
    tfidf, imp_tokens = infoRX.tf_idf(a_list, q)

    # Per-candidate count of important tokens appearing in the answer.
    word_cnt = []
    for a in a_list:
        w_cnt = 0
        for imp in imp_tokens:
            if imp in a:
                w_cnt += 1
        word_cnt.append(w_cnt)

    return tfidf, word_cnt
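# Usage sketch (hypothetical; the enclosing class is not shown here).
# Given a question and a list of candidate answers, extract_features
# returns two parallel lists: per-candidate tf-idf scores from
# infoRX.tf_idf and per-candidate important-token overlap counts, e.g.:
#
#   tfidf, word_cnt = ranker.extract_features(
#       'what is the capital of france',
#       ['Paris is the capital of France.', 'France borders Spain.'])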