def read_all_GQA_questions(inpath, isLower=True, switch=False):
    """Read passage/question/answer triples from a nested JSON dataset file.

    The file must hold a JSON object whose ``'data'`` entry is a list of
    articles.  Each article has ``'paragraphs'``; each paragraph has
    ``'context'``, ``'annotations'`` and ``'qas'``; each entry of ``'qas'`` has
    ``'question'``, ``'id'``, ``'answers'`` and (optionally) ``'annotations'``.
    Questions without an ``'annotations'`` entry are skipped.  Only the first
    answer of every question is used.

    Args:
        inpath: Path of the JSON file (UTF-8 encoded).
        isLower: Forwarded unchanged to every ``QASentence`` as ``isLower``.
        switch: When true, each tuple is ``(passage, question, answer)``, the
            question carries the ``'</s>'`` end symbol and the question length
            is tracked.  Otherwise each tuple is
            ``(passage, answer, question)``, the answer carries the end symbol
            and the answer length is tracked.

    Returns:
        A pair ``(all_questions, max_answer_len)``: the list of 3-tuples of
        ``QASentence`` objects and the largest whitespace-token count of the
        tracked (second) sentence, ``0`` when nothing was read.

    Raises:
        KeyError: If a required key is missing from the JSON structure.
        ValueError: If the file content is not valid JSON.
    """
    # Binary mode hands raw bytes to ``json``, which decodes UTF-8 itself on
    # both Python 2 and Python 3.6+.  The former ``encoding='utf-8'`` keyword
    # was ignored by Python 3 and is rejected outright since Python 3.9.
    with open(inpath, 'rb') as dataset_file:
        dataset_json = json.load(dataset_file)
    dataset = dataset_json['data']

    all_questions = []
    max_answer_len = 0
    # Identity comparisons are intentional: only the literal booleans select
    # an end symbol, exactly as before.
    end_sym_q = '</s>' if switch is True else None
    end_sym_a = '</s>' if switch is False else None

    for article in dataset:
        for paragraph in article['paragraphs']:
            # One passage sentence is shared by every question of a paragraph.
            passageSent = QASentence(paragraph['context'], paragraph['annotations'],
                                     ID_num=None, isLower=isLower)
            for question in paragraph['qas']:
                question_text = question['question']
                question_id = question['id']
                # ``dict.has_key`` no longer exists on Python 3; ``in`` works
                # on every Python version.
                if 'annotations' not in question:
                    continue
                questionSent = QASentence(question_text, question['annotations'],
                                          ID_num=question_id, isLower=isLower,
                                          end_sym=end_sym_q)

                first_answer = question['answers'][0]
                answerSent = QASentence(first_answer['text'], first_answer['annotations'],
                                        isLower=isLower, end_sym=end_sym_a)

                if switch:
                    max_answer_len = max(max_answer_len, len(questionSent.tokText.split()))
                    all_questions.append((passageSent, questionSent, answerSent))
                else:
                    max_answer_len = max(max_answer_len, len(answerSent.tokText.split()))
                    all_questions.append((passageSent, answerSent, questionSent))
    return all_questions, max_answer_len
def read_all_GenerationDatasets(inpath, isLower=True):
    """Read generation instances from the FIRST LINE of a JSON file.

    Only the first line of the file is parsed; it must contain a complete JSON
    list of instance objects.  For each of the slots 1, 2 and 3 the text is
    taken from ``instance['annotationN']['toks']`` when ``'annotationN'`` is
    present and from ``instance['textN']`` otherwise.  Instances whose first
    or second text is the empty string are skipped.  The third sentence is
    built only when the instance has a ``'text3'`` key.

    NOTE: this module defines another function with this same name further
    down; at import time that later definition replaces this one.

    Args:
        inpath: Path of the JSON file (UTF-8 encoded).
        isLower: Forwarded unchanged to every ``QASentence`` as ``isLower``.

    Returns:
        A pair ``(all_instances, max_answer_len)``: a list of
        ``(sent1, sent2, sent3)`` tuples, where ``sent3`` is ``None`` when the
        instance has no ``'text3'``, and the largest ``sent2.get_length()``
        seen (``0`` when nothing was kept).

    Raises:
        KeyError: If a required text key is missing from an instance.
        ValueError: If the first line is not valid JSON.
    """
    # Binary mode hands raw bytes to ``json``, which decodes UTF-8 itself on
    # both Python 2 and Python 3.6+.  The former ``encoding='utf-8'`` keyword
    # was ignored by Python 3 and is rejected outright since Python 3.9.
    with open(inpath, 'rb') as dataset_file:
        content = dataset_file.readline()
    dataset = json.loads(content)

    all_instances = []
    max_answer_len = 0
    for instance in dataset:
        # ``dict.has_key`` no longer exists on Python 3; ``get`` yields the
        # same value, with ``None`` when no id is given.
        ID_num = instance.get('id')

        annotation1 = instance['annotation1'] if 'annotation1' in instance else None
        text1 = annotation1['toks'] if 'annotation1' in instance else instance['text1']
        if text1 == "":
            continue
        sent1 = QASentence(text1, annotation1, ID_num=ID_num, isLower=isLower)

        annotation2 = instance['annotation2'] if 'annotation2' in instance else None
        text2 = annotation2['toks'] if 'annotation2' in instance else instance['text2']
        if text2 == "":
            continue
        # text2 is the sequence to be generated, hence the end symbol and the
        # length tracking.
        sent2 = QASentence(text2, annotation2, ID_num=ID_num, isLower=isLower, end_sym='</s>')
        max_answer_len = max(max_answer_len, sent2.get_length())

        sent3 = None
        if 'text3' in instance:
            annotation3 = instance['annotation3'] if 'annotation3' in instance else None
            text3 = annotation3['toks'] if 'annotation3' in instance else instance['text3']
            sent3 = QASentence(text3, annotation3, ID_num=ID_num, isLower=isLower)

        all_instances.append((sent1, sent2, sent3))
    return all_instances, max_answer_len
def read_all_GenerationDatasets(inpath, isLower=True):
    """Read generation instances from a JSON file holding a list of objects.

    Every instance must provide ``'text1'`` and ``'text2'``; instances where
    either is the empty string are skipped.  ``'text3'`` is optional.  The
    first two sentences are always built without annotations; the third one
    uses ``instance['annotation3']`` when that key is present.

    The sentence ID is ``instance['id']`` when the instance has an ``'id'``
    key, otherwise the instance's position in the list.

    Args:
        inpath: Path of the JSON file (UTF-8 encoded).
        isLower: Forwarded unchanged to every ``QASentence`` as ``isLower``.

    Returns:
        A pair ``(all_instances, max_answer_len)``: a list of
        ``(sent1, sent2, sent3)`` tuples, where ``sent3`` is ``None`` when the
        instance has no ``'text3'``, and the largest ``sent2.get_length()``
        seen (``0`` when nothing was kept).

    Raises:
        KeyError: If ``'text1'`` or ``'text2'`` is missing from an instance.
        ValueError: If the file content is not valid JSON.
    """
    # Binary mode hands raw bytes to ``json``, which decodes UTF-8 itself on
    # both Python 2 and Python 3.6+.  The former ``encoding='utf-8'`` keyword
    # was ignored by Python 3 and is rejected outright since Python 3.9.
    with open(inpath, 'rb') as dataset_file:
        dataset = json.load(dataset_file)

    all_instances = []
    max_answer_len = 0
    for position, instance in enumerate(dataset):
        # ``dict.has_key`` no longer exists on Python 3; ``get`` with the list
        # position as fallback reproduces the previous ID selection.
        ID_num = instance.get('id', position)

        text1 = instance['text1']
        if text1 == "":
            continue
        sent1 = QASentence(text1, None, ID_num=ID_num, isLower=isLower)

        text2 = instance['text2']
        if text2 == "":
            continue
        # text2 is the sequence to be generated, hence the end symbol and the
        # length tracking.
        sent2 = QASentence(text2, None, ID_num=ID_num, isLower=isLower, end_sym='</s>')
        max_answer_len = max(max_answer_len, sent2.get_length())

        sent3 = None
        if 'text3' in instance:
            # A missing 'annotation3' used to raise KeyError although the
            # first two sentences are built with no annotation at all; fall
            # back to None instead.
            sent3 = QASentence(instance['text3'], instance.get('annotation3'),
                               ID_num=ID_num, isLower=isLower)

        all_instances.append((sent1, sent2, sent3))
    return all_instances, max_answer_len