Example #1
def get_question_answer_pair_wikiqa(wikiqa):
    from tqdm import tqdm
    from IR import infoRX
    # glove = load_glove(200)

    from nltk.stem.wordnet import WordNetLemmatizer
    lmtzr = WordNetLemmatizer()

    wikiqa_data = []

    for sample in tqdm(wikiqa, total=len(wikiqa), ncols=75, unit='Sample'):
        q, context, label_list = sample

        tfidf, imp_tokens = infoRX.tf_idf(context, q)

        for i, c in enumerate(context):
            # q_vector, a_vector = qa_vectorize(q, c, glove)  # WARNING: THIS WILL INCREASE THE FILE SIZE ON THE ORDER OF GBs

            word_cnt = 0
            for imp in imp_tokens:

                if lmtzr.lemmatize(imp) in [
                        lmtzr.lemmatize(w) for w in c.split()
                ]:
                    word_cnt += 1

            wikiqa_data.append((q, c, label_list[i], tfidf[i], word_cnt))

    return wikiqa_data
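
A minimal usage sketch (the WikiQA sample layout is inferred from the unpacking above and the data is made up; infoRX is the project-local IR module, and WordNetLemmatizer needs the nltk wordnet corpus downloaded):

wikiqa = [(
    'what is the capital of france',
    ['Paris is the capital of France.', 'France is in Europe.'],
    [1, 0],  # one label per candidate sentence
)]
pairs = get_question_answer_pair_wikiqa(wikiqa)
# Each entry: (question, sentence, label, tfidf_score, imp_token_count)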
Example #2
def get_question_answer_pair_squad(squad):

    from IR import infoRX
    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk import sent_tokenize
    lmtzr = WordNetLemmatizer()

    squad_qa = []
    for s in squad:
        context, qas_list = s
        context = sent_tokenize(context)
        # context = context[: -1] # To remove empty item after splitting the last '.'
        # context = [c + '. ' for c in context]
        for qas in qas_list:
            a_start, a_text, a_sentence, q, _ = qas
            tfidf, imp_tokens = infoRX.tf_idf(context, q)
            for i, c in enumerate(context):
                # Reset per sentence: only the sentence containing the answer is labeled 1.
                label = 1 if c == a_sentence else 0
                word_cnt = 0
                for imp in imp_tokens:
                    if lmtzr.lemmatize(imp) in [
                            lmtzr.lemmatize(w) for w in c.split()
                    ]:
                        word_cnt += 1
                squad_qa.append((q, c, label, tfidf[i], word_cnt))
    return squad_qa
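
A sketch of one possible call, assuming each SQuAD entry is a (context, qas_list) pair with qas tuples shaped as unpacked above (made-up data; sent_tokenize also needs the nltk punkt data downloaded):

context = 'Paris is the capital of France. The city lies on the Seine.'
qas_list = [(
    0,                                  # answer start offset
    'Paris',                            # answer text
    'Paris is the capital of France.',  # sentence containing the answer
    'What is the capital of France?',   # question
    None,                               # unused field
)]
squad_qa = get_question_answer_pair_squad([(context, qas_list)])
# -> one (question, sentence, label, tfidf_score, imp_token_count) tuple per sentence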
Example #3
def get_question_answer_pair_babi(babi):
    from tqdm import tqdm
    from IR import infoRX
    # glove = load_glove(200)

    babi_data = []

    for sample in tqdm(babi, total=len(babi), ncols=75, unit='Sample'):
        line_numbers, context, question, _, support = sample

        tfidf, imp_tokens = infoRX.tf_idf(context, question)

        for i, c in enumerate(context):
            # q_vector, a_vector = qa_vectorize(question, c, glove)  # WARNING: THIS WILL INCREASE THE FILE SIZE ON THE ORDER OF GBs

            label = 0
            line_number = line_numbers[i]
            if int(support) == int(line_number):
                label = 1

            word_cnt = 0
            for imp in imp_tokens:
                if imp in c:
                    word_cnt += 1

            babi_data.append((question, c, label, tfidf[i], word_cnt))

    return babi_data
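
A usage sketch under the same caveats; the bAbI sample layout (line numbers, context sentences, question, answer, supporting line) is inferred from the unpacking above:

babi = [(
    [1, 2],                      # line numbers of the context sentences
    ['Mary moved to the bathroom.', 'John went to the hallway.'],
    'Where is Mary?',
    None,                        # answer (unused here)
    1,                           # line number of the supporting sentence
)]
babi_data = get_question_answer_pair_babi(babi)
# The sentence whose line number matches `support` gets label 1.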
Example #4
	def extract_features(self, q, a_list):
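		"""Return per-candidate tf-idf scores and important-token match counts for q."""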

		a_list = [str(a) for a in a_list]

		tfidf, imp_tokens = infoRX.tf_idf(a_list, q)
		word_cnt = []

		for a in a_list:
			w_cnt = 0
			for imp in imp_tokens:
				if imp in a:
					w_cnt += 1
			word_cnt.append(w_cnt)

		return tfidf, word_cnt
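
Since this is a method, it needs a host object and a module-level from IR import infoRX; a hypothetical sketch (the class name FeatureExtractor is made up):

extractor = FeatureExtractor()  # hypothetical host class exposing extract_features
tfidf, word_cnt = extractor.extract_features(
    'What is the capital of France?',
    ['Paris is the capital of France.', 'France borders Spain.'],
)
# tfidf[i] and word_cnt[i] are the features for a_list[i]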