Esempio n. 1
0
 def file_to_dicts(self, file: str) -> [dict]:
     """Read ``file`` through ``read_squad_file`` and return its result as-is.

     :param file: Path forwarded to ``read_squad_file`` as ``filename``.
     :return: Whatever ``read_squad_file`` produces for that file, unmodified.
     """
     return read_squad_file(filename=file)
Esempio n. 2
0
    def get_all_questions(self):
        """List the question strings held in ``self.df``.

        Rows are de-duplicated on the (title, context, question) triple, so a
        question that is asked about several different paragraphs appears once
        per paragraph in the returned list.
        """
        unique_rows = self.df[["title", "context", "question"]].drop_duplicates()
        return unique_rows["question"].tolist()

    def get_all_document_titles(self):
        """List each distinct value of the ``title`` column of ``self.df``."""
        distinct_titles = self.df["title"].unique()
        return distinct_titles.tolist()


if __name__ == "__main__":
    train_path = "../data/squad20/train-v2.0.json"
    dev_path = "../data/squad20/dev-v2.0.json"

    # Called only for its side effect; the return value is discarded.
    # Presumably this downloads the SQuAD file when it is missing from the
    # target directory -- confirm against read_squad_file.
    read_squad_file(train_path)

    # Start from the training file and shrink it to a 10000-question sample.
    squad = SquadData.from_file(train_path)
    squad.set_data(squad.sample_questions(n=10000))

    # Fold in the dev file, then keep only a 100-question sample of the merged data.
    squad.merge_from_file(dev_path)
    squad.set_data(squad.sample_questions(n=100))
Esempio n. 3
0
 def file_to_dicts(self, file: str) -> [dict]:
     """Read ``file`` through ``read_squad_file`` and flatten its paragraphs.

     Every entry returned by ``read_squad_file`` is expected to carry a
     ``"paragraphs"`` key; the items under that key are concatenated, in
     order, into one flat list.

     :param file: Path forwarded to ``read_squad_file`` as ``filename``.
     :return: Flat list of all items found under each entry's ``"paragraphs"``.
     """
     paragraphs = []
     for entry in read_squad_file(filename=file):
         paragraphs.extend(entry["paragraphs"])
     return paragraphs
Esempio n. 4
0
 def file_to_dicts(self, file: str) -> [dict]:
     """Read ``file`` through ``read_squad_file`` and return its result as-is.

     :param file: Path forwarded to ``read_squad_file`` as ``filename``.
     :return: Whatever ``read_squad_file`` produces, unmodified. The call also
         receives ``self.proxies`` as its ``proxies`` argument.
     """
     return read_squad_file(filename=file, proxies=self.proxies)