def file_to_dicts(self, file: str) -> [dict]:
    """Read ``file`` through ``read_squad_file`` and return the result as-is.

    :param file: Path handed to ``read_squad_file`` as its ``filename`` argument.
    :return: Whatever ``read_squad_file`` produces for that file; no
        post-processing is applied here.
    """
    # Deliberately not named ``dict``: that would shadow the builtin for the
    # rest of the function body.
    dicts = read_squad_file(filename=file)
    return dicts
def get_all_questions(self):
    """Return the question strings held in ``self.df`` as a list.

    Rows are de-duplicated on the (title, context, question) triple, not on
    the question text alone, so a question that occurs under several
    different paragraphs is returned once per paragraph.
    """
    unique_rows = self.df[["title", "context", "question"]].drop_duplicates()
    return unique_rows["question"].tolist()


def get_all_document_titles(self):
    """Return the distinct values of the ``title`` column as a list."""
    distinct_titles = self.df["title"].unique()
    return distinct_titles.tolist()


if __name__ == "__main__":
    filename1 = "../data/squad20/train-v2.0.json"
    filename2 = "../data/squad20/dev-v2.0.json"

    # Download the SQuAD dataset if it isn't at target directory
    read_squad_file(filename1)

    # Load file1, draw a sample of 10000 questions and make that sample
    # the data held by sd
    sd = SquadData.from_file(filename1)
    sd.set_data(sd.sample_questions(n=10000))

    # Merge file2 into sd, then narrow the data down to a sample of
    # 100 questions
    sd.merge_from_file(filename2)
    sd.set_data(sd.sample_questions(n=100))
def file_to_dicts(self, file: str) -> [dict]:
    """Read ``file`` with ``read_squad_file`` and flatten it by one level.

    Each top-level entry returned by ``read_squad_file`` is expected to carry
    a ``"paragraphs"`` key; the items under that key are concatenated, in
    order, into a single flat list.

    :param file: Path handed to ``read_squad_file`` as its ``filename`` argument.
    :return: The items of every entry's ``"paragraphs"`` value, in order.
    """
    flattened = []
    for entry in read_squad_file(filename=file):
        # Append the entry's paragraphs individually, not as a nested list.
        flattened.extend(entry["paragraphs"])
    return flattened
def file_to_dicts(self, file: str) -> [dict]:
    """Read ``file`` through ``read_squad_file`` and return the result as-is.

    The instance's ``proxies`` attribute is forwarded to ``read_squad_file``
    unchanged.

    :param file: Path handed to ``read_squad_file`` as its ``filename`` argument.
    :return: Whatever ``read_squad_file`` produces for that file; no
        post-processing is applied here.
    """
    # Deliberately not named ``dict``: that would shadow the builtin for the
    # rest of the function body.
    dicts = read_squad_file(filename=file, proxies=self.proxies)
    return dicts