def fetch_dane_as_conllu():
    """Write the DaNE train/dev/test splits to CoNLL-U files."""
    from danlp.datasets import DDT

    ddt = DDT()
    train, dev, test = ddt.load_as_conllu(predefined_splits=True)
    with open("assets/dane/dane_train.conllu", "w") as f:
        train.write(f)
    with open("assets/dane/dane_dev.conllu", "w") as f:
        dev.write(f)
    with open("assets/dane/dane_test.conllu", "w") as f:
        test.write(f)
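A minimal usage sketch for the helper above. Note that `open(..., "w")` does not create missing directories, so `assets/dane/` must already exist; the `os.makedirs` call here is an illustrative assumption about how a caller would prepare that directory, not part of the original script.

import os

# Hypothetical caller: ensure the output directory exists before writing the splits.
os.makedirs("assets/dane", exist_ok=True)
fetch_dane_as_conllu()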
def test_ddt_dataset_with_spacy(self):
    ddt = DDT()  # Load dataset
    corpus = ddt.load_with_spacy()

    num_sents_train = 0
    for paragraph in [paragraph[1] for paragraph in list(corpus.train_tuples)]:
        num_sents_train += len(paragraph)

    self.assertIsInstance(corpus, GoldCorpus)
    self.assertEqual(self.train_len, num_sents_train)
def load(self, **_):  # Toss out kwargs
    # Get all three splits from DaNE and divide them into source texts and annotations
    if not danlp_available:
        raise ModuleNotFoundError(
            "DaNE dataset requires installation of the optional requirement `danlp`"
        )
    datasets = DDT().load_as_simple_ner(predefined_splits=True)
    for (texts, annotations), split in zip(datasets, Split):
        self.data[split] = Sequences(
            texts=texts,
            annotations=annotations,
            # Sadly, we do not have access to where the DaNE sentences are divided
            # into articles, so we let each sentence be an entire text.
            sentence_boundaries=[[len(s)] for s in texts],
        )
    self.loaded = True
download_dane_data()


def is_misc(ent: str):
    if len(ent) < 4:
        return False
    return ent[-4:] == 'MISC'


def remove_miscs(se: list):
    return [
        [entity if not is_misc(entity) else 'O' for entity in entities]
        for entities in se
    ]


# Load the DaNE data
_, _, test = DDT().load_as_simple_ner(predefined_splits=True)
sentences_tokens, sentences_entities = test

# Replace MISC with O for fair comparisons
sentences_entities = remove_miscs(sentences_entities)

num_sentences = len(sentences_tokens)
num_tokens = sum([len(s) for s in sentences_tokens])

# def benchmark_polyglot_mdl():
#     """
#     Running polyglot requires these packages:
#     # Morfessor==2.0.6
#     # PyICU==2.4.2
#     # pycld2==0.41
#     # polyglot
def setUp(self):
    self.train_len = 4383
    self.dev_len = 564
    self.test_len = 565

    self.ddt = DDT()  # Load dataset
from danlp.datasets import DDT
from danlp.models import load_bert_ner_model
import torch

# utils
#from flair.data import Sentence, Token

# load models
bert = load_bert_ner_model()
'''flair = load_flair_ner_model()'''

# CUDA for PyTorch
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
torch.backends.cudnn.benchmark = True

# get data (split into a training set, a validation set, and a test set)
ddt = DDT()
train, valid, test = ddt.load_as_simple_ner(True)

# divide the observations and the targets of the test set into new variables
sentences, categories = test

# slice the test sentences into batches of `batch_size`
batch_size = 64
batch_sentences = []
num_sentences = len(sentences)
iterator = 0
while len(batch_sentences) < num_sentences / batch_size:
    stop = batch_size * (iterator + 1)
    if stop > num_sentences:
        stop = num_sentences
    # collect this batch of sentences and advance to the next slice
    batch_sentences.append(sentences[batch_size * iterator:stop])
    iterator += 1
def setup(self, split="test"): # Third element is test data self.data = DDT().load_as_simple_ner(predefined_splits=True)[("train", "dev", "test").index(split)]
def test_ddt_dataset_with_spacy(self):
    ddt = DDT()  # Load dataset
    corpus = ddt.load_with_spacy()

    self.assertIsInstance(corpus, GoldCorpus)