Example #1
    def load_with_spacy(self):
        """
        This function will convert the CoNLL02/03 format to json format for spaCy.
        As the function will return a spacy.gold.GoldCorpus which needs a dev set
        this function also splits the dataset into a 70/30 split as is done by
        Pan et al. (2017).

        - Pan et al. (2017): https://aclweb.org/anthology/P17-1178
        :return:
        """
        import os

        import srsly
        from sklearn.model_selection import train_test_split
        from spacy.cli.converters import conll_ner2json
        from spacy.gold import GoldCorpus
        from spacy.gold import Path

        conll_path = os.path.join(self.dataset_dir,
                                  self.dataset_name + self.file_extension)
        dev_json_path = os.path.join(self.dataset_dir,
                                     self.dataset_name + "dev.json")
        train_json_path = os.path.join(self.dataset_dir,
                                       self.dataset_name + "train.json")

        if not os.path.isfile(dev_json_path) or not os.path.isfile(
                train_json_path):
            # Convert the conll ner files to json
            with open(conll_path, 'r') as file:
                file_as_string = file.read()
                file_as_json = conll_ner2json(file_as_string)

                all_sents = file_as_json[0]['paragraphs'][0]['sentences']
                train_sents, dev_sents = train_test_split(all_sents,
                                                          test_size=0.3,
                                                          random_state=42)

                train_json = [{
                    'id': 0,
                    'paragraphs': [{
                        'sentences': train_sents
                    }]
                }]
                dev_json = [{
                    'id': 0,
                    'paragraphs': [{
                        'sentences': dev_sents
                    }]
                }]

                srsly.write_json(train_json_path, train_json)
                srsly.write_json(dev_json_path, dev_json)

        assert os.path.isfile(train_json_path) and os.path.isfile(
            dev_json_path)

        return GoldCorpus(Path(train_json_path), Path(dev_json_path))
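
A minimal usage sketch for the corpus this method returns, assuming a spaCy
2.x install, where GoldCorpus.train_docs/dev_docs yield (Doc, GoldParse)
pairs; `dataset` stands in for a hypothetical instance of the class that
defines load_with_spacy:

import spacy

# `dataset` is a hypothetical instance of the dataset class shown above
nlp = spacy.blank("en")
corpus = dataset.load_with_spacy()

# iterate the train and dev portions as (Doc, GoldParse) pairs
train_docs = list(corpus.train_docs(nlp))
dev_docs = list(corpus.dev_docs(nlp))
print(len(train_docs), len(dev_docs))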
Example #2
    def load_with_spacy(self):
        """
        Loads the dataset with spaCy. 

        This function will convert the CoNLL02/03 format to json format for spaCy.
        As the function will return a spacy.gold.GoldCorpus which needs a dev set
        this function also splits the dataset into a 70/30 split as is done by
        Pan et al. (2017).

        - Pan et al. (2017): https://aclweb.org/anthology/P17-1178
        
        :return: GoldCorpus
        """
        import os
        import random

        import srsly
        from spacy.cli.converters import conll_ner2json
        from spacy.gold import GoldCorpus
        from spacy.gold import Path

        conll_path = os.path.join(self.dataset_dir,
                                  self.dataset_name + self.file_extension)
        dev_json_path = os.path.join(self.dataset_dir,
                                     self.dataset_name + "dev.json")
        train_json_path = os.path.join(self.dataset_dir,
                                       self.dataset_name + "train.json")

        if not os.path.isfile(dev_json_path) or not os.path.isfile(
                train_json_path):
            # Convert the conll ner files to json
            with open(conll_path, 'r') as file:
                file_as_string = file.read()
                # n_sents=0 means we do not group the sentences into documents
                file_as_json = conll_ner2json(file_as_string,
                                              n_sents=0,
                                              no_print=True)

                all_sents = file_as_json[0]['paragraphs'][0]['sentences']

                random.seed(42)
                random.shuffle(all_sents)

                train_size = round(len(all_sents) * 0.7)
                train_sents = all_sents[:train_size]
                dev_sents = all_sents[train_size:]

                train_json = [{
                    'id': 0,
                    'paragraphs': [{
                        'sentences': train_sents
                    }]
                }]
                dev_json = [{
                    'id': 0,
                    'paragraphs': [{
                        'sentences': dev_sents
                    }]
                }]

                srsly.write_json(train_json_path, train_json)
                srsly.write_json(dev_json_path, dev_json)

        assert os.path.isfile(train_json_path) and os.path.isfile(
            dev_json_path)

        return GoldCorpus(Path(train_json_path), Path(dev_json_path))
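
Note the design difference from Example #1: the 70/30 split here uses
random.shuffle with a fixed seed plus list slicing, avoiding the
scikit-learn dependency. For reference, the JSON this code writes out via
srsly.write_json has roughly the following shape; the field names follow
the converter's output (compare the asserts in Example #3 below), while
the concrete tokens here are illustrative only:

# illustrative shape of the spaCy training JSON produced above
illustrative_json = [{
    "id": 0,
    "paragraphs": [{
        "sentences": [{
            "tokens": [
                {"orth": "London", "tag": "NNP", "ner": "U-GPE"},
                {"orth": ".", "tag": ".", "ner": "O"},
            ],
        }],
    }],
}]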
Example #3
from spacy.cli.converters import conll_ner2json


def test_cli_converters_conll_ner2json():
    lines = [
        "-DOCSTART- -X- O O",
        "",
        "I\tO",
        "like\tO",
        "London\tB-GPE",
        "and\tO",
        "New\tB-GPE",
        "York\tI-GPE",
        "City\tI-GPE",
        ".\tO",
        "",
        "I O",
        "like O",
        "London B-GPE",
        "and O",
        "New B-GPE",
        "York I-GPE",
        "City I-GPE",
        ". O",
        "",
        "I PRP O",
        "like VBP O",
        "London NNP B-GPE",
        "and CC O",
        "New NNP B-GPE",
        "York NNP I-GPE",
        "City NNP I-GPE",
        ". . O",
        "",
        "I PRP _ O",
        "like VBP _ O",
        "London NNP _ B-GPE",
        "and CC _ O",
        "New NNP _ B-GPE",
        "York NNP _ I-GPE",
        "City NNP _ I-GPE",
        ". . _ O",
        "",
        "I\tPRP\t_\tO",
        "like\tVBP\t_\tO",
        "London\tNNP\t_\tB-GPE",
        "and\tCC\t_\tO",
        "New\tNNP\t_\tB-GPE",
        "York\tNNP\t_\tI-GPE",
        "City\tNNP\t_\tI-GPE",
        ".\t.\t_\tO",
    ]
    input_data = "\n".join(lines)
    converted = conll_ner2json(input_data, n_sents=10)
    print(converted)
    assert len(converted) == 1
    assert converted[0]["id"] == 0
    assert len(converted[0]["paragraphs"]) == 1
    assert len(converted[0]["paragraphs"][0]["sentences"]) == 5
    for i in range(0, 5):
        sent = converted[0]["paragraphs"][0]["sentences"][i]
        tokens = sent["tokens"]
        assert len(tokens) == 8
        # fmt: off
        assert [t["orth"] for t in tokens] == [
            "I", "like", "London", "and", "New", "York", "City", "."
        ]
        assert [t["ner"] for t in tokens] == [
            "O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"
        ]
        # fmt: on