Esempio n. 1
0
def read_all_GQA_questions(inpath, isLower=True, switch=False):
    """Read passage/question/answer triples from a nested JSON dataset file.

    The file must contain a JSON object whose ``'data'`` entry is a list of
    articles.  Each article has a ``'paragraphs'`` list; each paragraph has
    ``'context'``, ``'annotations'`` and ``'qas'``.  Questions without an
    ``'annotations'`` entry are skipped, and only the first entry of a
    question's ``'answers'`` list is used.

    Args:
        inpath: Path of the JSON file; it is read as UTF-8.
        isLower: Forwarded as ``isLower`` to every ``QASentence``.
        switch: Selects which sentence is the "target".  When falsy the
            answer is the target and tuples are
            ``(passage, answer, question)``; when truthy the question is the
            target and tuples are ``(passage, question, answer)``.  The
            target sentence is built with ``end_sym='</s>'``.

    Returns:
        A pair ``(all_questions, max_answer_len)`` where ``all_questions`` is
        the list of tuples described above and ``max_answer_len`` is the
        largest whitespace-token count of any target's ``tokText`` (0 when
        nothing was read).
    """
    # Local import: io.open accepts ``encoding`` on both Python 2 and 3.
    import io

    # json.load() no longer accepts ``encoding`` (removed in Python 3.9), so
    # the decoding is done by the file object instead.
    with io.open(inpath, encoding='utf-8') as dataset_file:
        dataset = json.load(dataset_file)['data']

    # Use plain truthiness so these flags always agree with the ``if switch``
    # branch below: exactly one of the two sentences gets the end symbol.
    end_sym_q = '</s>' if switch else None
    end_sym_a = None if switch else '</s>'

    all_questions = []
    max_answer_len = 0
    for article in dataset:
        for paragraph in article['paragraphs']:
            passageSent = QASentence(paragraph['context'], paragraph['annotations'],
                                     ID_num=None, isLower=isLower)
            for question in paragraph['qas']:
                # dict.has_key() does not exist on Python 3; ``in`` works everywhere.
                if 'annotations' not in question:
                    continue
                questionSent = QASentence(question['question'], question['annotations'],
                                          ID_num=question['id'], isLower=isLower,
                                          end_sym=end_sym_q)
                first_answer = question['answers'][0]
                answerSent = QASentence(first_answer['text'], first_answer['annotations'],
                                        isLower=isLower, end_sym=end_sym_a)
                if switch:
                    target, source = questionSent, answerSent
                else:
                    target, source = answerSent, questionSent
                max_answer_len = max(max_answer_len, len(target.tokText.split()))
                all_questions.append((passageSent, target, source))
    return all_questions, max_answer_len
Esempio n. 2
0
def read_all_GenerationDatasets(inpath, isLower=True):
    """Read generation instances from a file whose first line is a JSON list.

    Only the first line of the file is parsed, so the whole JSON list has to
    be stored on a single line.  Each instance is a dict supplying its texts
    either through ``'annotationN'`` (whose ``'toks'`` entry is used as the
    text) or, when that key is absent, through ``'textN'``.  Instances whose
    first or second text is the empty string are skipped.

    Args:
        inpath: Path of the dataset file; it is read as UTF-8.
        isLower: Forwarded as ``isLower`` to every ``QASentence``.

    Returns:
        A pair ``(all_instances, max_answer_len)``.  ``all_instances`` is a
        list of ``(sent1, sent2, sent3)`` tuples, where ``sent3`` is ``None``
        unless the instance has a ``'text3'`` key.  ``max_answer_len`` is the
        largest ``get_length()`` of any ``sent2`` (0 when nothing was read).
    """
    # Local import: io.open accepts ``encoding`` on both Python 2 and 3.
    import io

    with io.open(inpath, encoding='utf-8') as dataset_file:
        # json.loads() no longer accepts ``encoding`` (removed in Python 3.9);
        # the file object already yields decoded text.
        dataset = json.loads(dataset_file.readline())

    all_instances = []
    max_answer_len = 0
    for instance in dataset:
        # dict.has_key() does not exist on Python 3; ``in`` works everywhere.
        ID_num = instance['id'] if 'id' in instance else None

        if 'annotation1' in instance:
            annotation1 = instance['annotation1']
            text1 = annotation1['toks']
        else:
            annotation1 = None
            text1 = instance['text1']
        if text1 == "":
            continue
        sent1 = QASentence(text1, annotation1, ID_num=ID_num, isLower=isLower)

        if 'annotation2' in instance:
            annotation2 = instance['annotation2']
            text2 = annotation2['toks']
        else:
            annotation2 = None
            text2 = instance['text2']
        if text2 == "":
            continue
        # text2 is the sequence to be generated, hence the end symbol and
        # the length bookkeeping.
        sent2 = QASentence(text2, annotation2, ID_num=ID_num, isLower=isLower, end_sym='</s>')
        max_answer_len = max(max_answer_len, sent2.get_length())

        sent3 = None
        if 'text3' in instance:
            if 'annotation3' in instance:
                annotation3 = instance['annotation3']
                text3 = annotation3['toks']
            else:
                annotation3 = None
                text3 = instance['text3']
            sent3 = QASentence(text3, annotation3, ID_num=ID_num, isLower=isLower)
        all_instances.append((sent1, sent2, sent3))
    return all_instances, max_answer_len
Esempio n. 3
0
def read_all_GenerationDatasets(inpath, isLower=True):
    """Read generation instances from a JSON file holding a list of dicts.

    Every instance must provide ``'text1'`` and ``'text2'``; instances where
    either one is the empty string are skipped.  An optional ``'text3'`` (with
    an optional ``'annotation3'``) yields a third sentence.

    Args:
        inpath: Path of the JSON file; it is read as UTF-8.
        isLower: Forwarded as ``isLower`` to every ``QASentence``.

    Returns:
        A pair ``(all_instances, max_answer_len)``.  ``all_instances`` is a
        list of ``(sent1, sent2, sent3)`` tuples, where ``sent3`` is ``None``
        unless the instance has a ``'text3'`` key.  ``max_answer_len`` is the
        largest ``get_length()`` of any ``sent2`` (0 when nothing was read).
    """
    # Local import: io.open accepts ``encoding`` on both Python 2 and 3.
    import io

    # json.load() no longer accepts ``encoding`` (removed in Python 3.9), so
    # the decoding is done by the file object instead.
    with io.open(inpath, encoding='utf-8') as dataset_file:
        dataset = json.load(dataset_file)

    all_instances = []
    max_answer_len = 0
    for position, instance in enumerate(dataset):
        # Fall back to the list position when the instance carries no id.
        # dict.has_key() does not exist on Python 3; ``in`` works everywhere.
        ID_num = instance['id'] if 'id' in instance else position

        text1 = instance['text1']
        if text1 == "":
            continue
        # Annotations of text1/text2 are deliberately not used here.
        sent1 = QASentence(text1, None, ID_num=ID_num, isLower=isLower)

        text2 = instance['text2']
        if text2 == "":
            continue
        # text2 is the sequence to be generated, hence the end symbol and
        # the length bookkeeping.
        sent2 = QASentence(text2, None, ID_num=ID_num, isLower=isLower, end_sym='</s>')
        max_answer_len = max(max_answer_len, sent2.get_length())

        sent3 = None
        if 'text3' in instance:
            # A missing 'annotation3' becomes None instead of raising
            # KeyError, matching the None annotations passed for text1/text2.
            sent3 = QASentence(instance['text3'], instance.get('annotation3'),
                               ID_num=ID_num, isLower=isLower)
        all_instances.append((sent1, sent2, sent3))
    return all_instances, max_answer_len