def load_with_spacy(self):
    """
    Loads the dataset with spaCy.

    This function converts the CoNLL02/03 format to the JSON format used
    by spaCy. As the function returns a spacy.gold.GoldCorpus, which needs
    a dev set, it also splits the dataset into a 70/30 split, as is done
    by Pan et al. (2017).

    - Pan et al. (2017): https://aclweb.org/anthology/P17-1178

    :return: GoldCorpus
    """
    import os
    import random

    import srsly
    from spacy.cli.converters import conll_ner2json
    from spacy.gold import GoldCorpus
    from spacy.gold import Path

    conll_path = os.path.join(self.dataset_dir,
                              self.dataset_name + self.file_extension)
    dev_json_path = os.path.join(self.dataset_dir,
                                 self.dataset_name + "dev.json")
    train_json_path = os.path.join(self.dataset_dir,
                                   self.dataset_name + "train.json")

    if not os.path.isfile(dev_json_path) or not os.path.isfile(train_json_path):
        # Convert the CoNLL NER file to the spaCy JSON format
        with open(conll_path, 'r') as file:
            file_as_string = file.read()

        # n_sents=0 means we do not group the sentences into documents
        file_as_json = conll_ner2json(file_as_string, n_sents=0,
                                      no_print=True)
        all_sents = file_as_json[0]['paragraphs'][0]['sentences']

        # Deterministic 70/30 train/dev split
        random.seed(42)
        random.shuffle(all_sents)
        train_size = round(len(all_sents) * 0.7)
        train_sents = all_sents[:train_size]
        dev_sents = all_sents[train_size:]

        train_json = [{'id': 0, 'paragraphs': [{'sentences': train_sents}]}]
        dev_json = [{'id': 0, 'paragraphs': [{'sentences': dev_sents}]}]

        srsly.write_json(train_json_path, train_json)
        srsly.write_json(dev_json_path, dev_json)

    assert os.path.isfile(train_json_path) and os.path.isfile(dev_json_path)

    return GoldCorpus(Path(train_json_path), Path(dev_json_path))
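# A minimal usage sketch (not part of the original source): it assumes spaCy
# 2.x with the legacy spacy.gold API, and a hypothetical `dataset` instance of
# the class that defines load_with_spacy above. GoldCorpus.train_docs and
# dev_docs yield (Doc, GoldParse) pairs that can be fed to nlp.update when
# training an NER model:
#
#     import spacy
#
#     nlp = spacy.blank("xx")                     # language-neutral pipeline
#     corpus = dataset.load_with_spacy()          # GoldCorpus, 70/30 split
#     train_pairs = list(corpus.train_docs(nlp))  # (Doc, GoldParse) tuples
#     dev_pairs = list(corpus.dev_docs(nlp))
#     print(len(train_pairs), len(dev_pairs))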
from spacy.cli.converters import conll_ner2json


def test_cli_converters_conll_ner2json():
    # The same sentence in several CoNLL column layouts: token + NER tag
    # (tab- and space-separated), with a POS column, and with POS and
    # chunk columns (tab- and space-separated).
    lines = [
        "-DOCSTART- -X- O O",
        "",
        "I\tO",
        "like\tO",
        "London\tB-GPE",
        "and\tO",
        "New\tB-GPE",
        "York\tI-GPE",
        "City\tI-GPE",
        ".\tO",
        "",
        "I O",
        "like O",
        "London B-GPE",
        "and O",
        "New B-GPE",
        "York I-GPE",
        "City I-GPE",
        ". O",
        "",
        "I PRP O",
        "like VBP O",
        "London NNP B-GPE",
        "and CC O",
        "New NNP B-GPE",
        "York NNP I-GPE",
        "City NNP I-GPE",
        ". . O",
        "",
        "I PRP _ O",
        "like VBP _ O",
        "London NNP _ B-GPE",
        "and CC _ O",
        "New NNP _ B-GPE",
        "York NNP _ I-GPE",
        "City NNP _ I-GPE",
        ". . _ O",
        "",
        "I\tPRP\t_\tO",
        "like\tVBP\t_\tO",
        "London\tNNP\t_\tB-GPE",
        "and\tCC\t_\tO",
        "New\tNNP\t_\tB-GPE",
        "York\tNNP\t_\tI-GPE",
        "City\tNNP\t_\tI-GPE",
        ".\t.\t_\tO",
    ]
    input_data = "\n".join(lines)
    converted = conll_ner2json(input_data, n_sents=10)
    assert len(converted) == 1
    assert converted[0]["id"] == 0
    assert len(converted[0]["paragraphs"]) == 1
    assert len(converted[0]["paragraphs"][0]["sentences"]) == 5
    for i in range(0, 5):
        sent = converted[0]["paragraphs"][0]["sentences"][i]
        assert len(sent["tokens"]) == 8
        tokens = sent["tokens"]
        # fmt: off
        assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
        assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]
        # fmt: on
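# For reference, conll_ner2json returns a list of document dicts shaped as
# below (shown for the single-document case this test exercises). Each token
# dict contains at least the "orth" and "ner" keys asserted above, and the
# input IOB tags are converted to the BILUO scheme: the single-token entity
# "London" becomes U-GPE, while "New York City" becomes B-GPE / I-GPE / L-GPE.
#
#     [{"id": 0,
#       "paragraphs": [{
#           "sentences": [
#               {"tokens": [
#                   {"orth": "I", "ner": "O"},
#                   ...
#                   {"orth": "London", "ner": "U-GPE"},
#                   ...
#               ]},
#               ...
#           ]}]}]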