def fetch_dane_as_conllu():
    from danlp.datasets import DDT

    ddt = DDT()
    train, dev, test = ddt.load_as_conllu(predefined_splits=True)

    with open("assets/dane/dane_train.conllu", "w") as f:
        train.write(f)
    with open("assets/dane/dane_dev.conllu", "w") as f:
        dev.write(f)
    with open("assets/dane/dane_test.conllu", "w") as f:
        test.write(f)
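# Usage sketch (added; not part of the original source). It assumes the output
# directory "assets/dane" may not exist yet, since open() will not create it.
import os

os.makedirs("assets/dane", exist_ok=True)
fetch_dane_as_conllu()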
def test_ddt_dataset_with_spacy(self):
    ddt = DDT()  # Load dataset
    corpus = ddt.load_with_spacy()

    num_sents_train = 0
    for paragraph in [paragraph[1] for paragraph in list(corpus.train_tuples)]:
        num_sents_train += len(paragraph)

    self.assertIsInstance(corpus, GoldCorpus)
    self.assertEqual(self.train_len, num_sents_train)
def load(self, **_):  # Toss out kwargs
    # Get all three splits from DaNE and divide them into source texts and annotations
    if not danlp_available:
        raise ModuleNotFoundError(
            "DaNE dataset requires installation of the optional requirement `danlp`"
        )
    datasets = DDT().load_as_simple_ner(predefined_splits=True)
    for (texts, annotations), split in zip(datasets, Split):
        self.data[split] = Sequences(
            texts=texts,
            annotations=annotations,
            # Sadly, we do not have access to how the DaNE sentences are grouped
            # into articles, so we let each sentence be an entire text.
            sentence_boundaries=[[len(s)] for s in texts],
        )
    self.loaded = True
download_dane_data()


def is_misc(ent: str):
    if len(ent) < 4:
        return False
    return ent[-4:] == 'MISC'


def remove_miscs(se: list):
    return [
        [entity if not is_misc(entity) else 'O' for entity in entities]
        for entities in se
    ]


# Load the DaNE data
_, _, test = DDT().load_as_simple_ner(predefined_splits=True)
sentences_tokens, sentences_entities = test

# Replace MISC with O for fair comparisons
sentences_entities = remove_miscs(sentences_entities)

num_sentences = len(sentences_tokens)
num_tokens = sum([len(s) for s in sentences_tokens])

# def benchmark_polyglot_mdl():
#     """
#     Running polyglot requires these packages:
#     # Morfessor==2.0.6
#     # PyICU==2.4.2
#     # pycld2==0.41
#     # polyglot
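# Toy illustration (added, hypothetical data) of the MISC-stripping helpers above:
# every *-MISC tag is mapped to 'O'; all other tags are left untouched.
example_entities = [["B-PER", "I-PER", "O", "B-MISC", "I-MISC"]]
assert remove_miscs(example_entities) == [["B-PER", "I-PER", "O", "O", "O"]]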
def setUp(self):
    self.train_len = 4383
    self.dev_len = 564
    self.test_len = 565

    self.ddt = DDT()  # Load dataset
class TestNerDatasets(unittest.TestCase):

    def setUp(self):
        self.train_len = 4383
        self.dev_len = 564
        self.test_len = 565

        self.ddt = DDT()  # Load dataset

    def test_ddt_dataset(self):
        train, dev, test = self.ddt.load_as_conllu(predefined_splits=True)

        self.assertIsInstance(train, Conll)
        self.assertIsInstance(dev, Conll)
        self.assertIsInstance(test, Conll)

        self.assertEqual([len(train), len(dev), len(test)],
                         [self.train_len, self.dev_len, self.test_len])

        full_dataset = self.ddt.load_as_conllu(predefined_splits=False)
        self.assertEqual(len(full_dataset),
                         self.train_len + self.dev_len + self.test_len)

    def test_ddt_simple_ner(self):
        train, dev, test = self.ddt.load_as_simple_ner(predefined_splits=True)
        self.assertEqual([len(train[0]), len(dev[0]), len(test[0])],
                         [self.train_len, self.dev_len, self.test_len])

        all_sentences, all_entities = self.ddt.load_as_simple_ner(
            predefined_splits=False)
        self.assertEqual(len(all_sentences),
                         self.train_len + self.dev_len + self.test_len)

        data = defaultdict(int)
        for entities in train[1]:
            for entity in entities:
                if "B" in entity:
                    data[entity[2:]] += 1
        self.assertDictEqual(data, {
            'ORG': 802,
            'LOC': 945,
            'PER': 1249,
            'MISC': 1007
        })

    def test_ddt_dataset_with_flair(self):
        flair_corpus = self.ddt.load_with_flair()

        self.assertIsInstance(flair_corpus, ColumnCorpus)

        flair_lens = [
            len(flair_corpus.train),
            len(flair_corpus.dev),
            len(flair_corpus.test)
        ]
        self.assertEqual(flair_lens,
                         [self.train_len, self.dev_len, self.test_len])

        ner_tags = flair_corpus.make_tag_dictionary('ner').idx2item
        asserted_ner_tags = [
            b'B-ORG', b'B-PER', b'B-LOC', b'B-MISC', b'I-ORG', b'I-PER',
            b'I-LOC', b'I-MISC', b'O', b'<START>', b'<STOP>', b'<unk>'
        ]
        self.assertCountEqual(ner_tags, asserted_ner_tags)

    def test_ddt_dataset_with_spacy(self):
        ddt = DDT()  # Load dataset
        corpus = ddt.load_with_spacy()

        num_sents_train = 0
        for paragraph in [
                paragraph[1] for paragraph in list(corpus.train_tuples)
        ]:
            num_sents_train += len(paragraph)

        self.assertIsInstance(corpus, GoldCorpus)
        self.assertEqual(self.train_len, num_sents_train)

    def test_wikiann_dataset(self):
        # Change to a sample of the full wikiann to ease test computation
        DATASETS['wikiann'][
            'url'] = "https://danlp.s3.eu-central-1.amazonaws.com/test-datasets/da.tar.gz"
        DATASETS['wikiann']['size'] = 2502
        DATASETS['wikiann'][
            'md5_checksum'] = 'd0271de38ae23f215b5117450efb9ace'

        wikiann = WikiAnn()

        corpus = wikiann.load_with_flair()

        self.assertEqual(
            [len(corpus.train), len(corpus.dev), len(corpus.test)],
            [21, 2, 3])

        ner_tags = corpus.make_tag_dictionary('ner').idx2item
        asserted_ner_tags = [
            b'B-ORG', b'B-PER', b'B-LOC', b'I-ORG', b'I-PER', b'I-LOC', b'O',
            b'<START>', b'<STOP>', b'<unk>'
        ]
        self.assertCountEqual(ner_tags, asserted_ner_tags)

        spacy_gold = wikiann.load_with_spacy()
        self.assertIsInstance(spacy_gold, GoldCorpus)

        num_train_sents = len(list(spacy_gold.train_tuples)[0][1])
        num_dev_sents = len(list(spacy_gold.dev_tuples)[0][1])
        self.assertEqual(num_dev_sents + num_train_sents, 26)

        shutil.rmtree(wikiann.dataset_dir)

    def test_wordsim353(self):
        ws353 = WordSim353Da()
        df = ws353.load_with_pandas()

        self.assertEqual(len(df), 353)
        self.assertListEqual(list(df.columns), ['da1', 'da2', 'Human (mean)'])
        self.assertEqual(len(ws353.words()), 424)

    def test_dsd(self):
        dsd = DSD()
        df = dsd.load_with_pandas()

        self.assertEqual(len(df), 99)
        self.assertListEqual(list(df.columns),
                             ['word1', 'word2', 'similarity'])
        self.assertEqual(len(dsd.words()), 197)

    def test_europarlsentiment(self):
        eusent = EuroparlSentiment()
        df = eusent.load_with_pandas()

        self.assertEqual(len(df), 184)

    def test_lccsentiment(self):
        sent = LccSentiment()
        df = sent.load_with_pandas()

        self.assertEqual(len(df), 499)
class TestNerDatasets(unittest.TestCase):

    def setUp(self):
        self.train_len = 4383
        self.dev_len = 564
        self.test_len = 565

        self.ddt = DDT()  # Load dataset

    def test_write_and_read_simple_ner_dataset(self):
        sentences = [["Jeg", "gik", "en", "tur", "i", "København"],
                     ["Alexandra", "Instituttet", "arbejder", "med", "NLP"]]
        entities = [["O", "O", "O", "O", "O", "B-LOC"],
                    ["B-ORG", "I-ORG", "O", "O", "O"]]

        tmp_file = NamedTemporaryFile().name
        write_simple_ner_dataset(sentences, entities, tmp_file)
        loaded_sents, loaded_ents = read_simple_ner_dataset(tmp_file)

        self.assertEqual(sentences, loaded_sents)
        self.assertEqual(entities, loaded_ents)

    def test_ddt_dataset(self):
        train, dev, test = self.ddt.load_as_conllu(predefined_splits=True)

        self.assertIsInstance(train, Conll)
        self.assertIsInstance(dev, Conll)
        self.assertIsInstance(test, Conll)

        self.assertEqual([len(train), len(dev), len(test)],
                         [self.train_len, self.dev_len, self.test_len])

        full_dataset = self.ddt.load_as_conllu(predefined_splits=False)
        self.assertEqual(len(full_dataset),
                         self.train_len + self.dev_len + self.test_len)

    def test_ddt_simple_ner(self):
        train, dev, test = self.ddt.load_as_simple_ner(predefined_splits=True)
        self.assertEqual([len(train[0]), len(dev[0]), len(test[0])],
                         [self.train_len, self.dev_len, self.test_len])

        all_sentences, all_entities = self.ddt.load_as_simple_ner(
            predefined_splits=False)
        self.assertEqual(len(all_sentences),
                         self.train_len + self.dev_len + self.test_len)

        data = defaultdict(int)
        for entities in train[1]:
            for entity in entities:
                if "B" in entity:
                    data[entity[2:]] += 1
        self.assertDictEqual(data, {
            'ORG': 802,
            'LOC': 945,
            'PER': 1249,
            'MISC': 1007
        })

    def test_ddt_dataset_with_flair(self):
        flair_corpus = self.ddt.load_with_flair()

        self.assertIsInstance(flair_corpus, ColumnCorpus)

        flair_lens = [
            len(flair_corpus.train),
            len(flair_corpus.dev),
            len(flair_corpus.test)
        ]
        self.assertEqual(flair_lens,
                         [self.train_len, self.dev_len, self.test_len])

        ner_tags = flair_corpus.make_tag_dictionary('ner').idx2item
        asserted_ner_tags = [
            b'B-ORG', b'B-PER', b'B-LOC', b'B-MISC', b'I-ORG', b'I-PER',
            b'I-LOC', b'I-MISC', b'O', b'<START>', b'<STOP>', b'<unk>'
        ]
        self.assertCountEqual(ner_tags, asserted_ner_tags)

    def test_ddt_dataset_with_spacy(self):
        ddt = DDT()  # Load dataset
        corpus = ddt.load_with_spacy()

        num_sents_train = 0
        for paragraph in [
                paragraph[1] for paragraph in list(corpus.train_tuples)
        ]:
            num_sents_train += len(paragraph)

        self.assertIsInstance(corpus, GoldCorpus)
        self.assertEqual(self.train_len, num_sents_train)

    def test_wikiann_dataset(self):
        # Change to a sample of the full wikiann to ease test computation
        DATASETS['wikiann']['url'] = DANLP_STORAGE_URL + "/tests/da.tar.gz"
        DATASETS['wikiann']['size'] = 2502
        DATASETS['wikiann'][
            'md5_checksum'] = 'd0271de38ae23f215b5117450efb9ace'

        wikiann = WikiAnn()

        corpus = wikiann.load_with_flair()

        self.assertEqual(
            [len(corpus.train), len(corpus.dev), len(corpus.test)],
            [21, 2, 3])

        ner_tags = corpus.make_tag_dictionary('ner').idx2item
        asserted_ner_tags = [
            b'B-ORG', b'B-PER', b'B-LOC', b'I-ORG', b'I-PER', b'I-LOC', b'O',
            b'<START>', b'<STOP>', b'<unk>'
        ]
        self.assertCountEqual(ner_tags, asserted_ner_tags)

        spacy_gold = wikiann.load_with_spacy()
        self.assertIsInstance(spacy_gold, GoldCorpus)

        num_train_sents = len(list(spacy_gold.train_tuples)[0][1])
        num_dev_sents = len(list(spacy_gold.dev_tuples)[0][1])
        self.assertEqual(num_dev_sents + num_train_sents, 26)

        shutil.rmtree(wikiann.dataset_dir)
import torch

from danlp.datasets import DDT
from danlp.models import load_bert_ner_model

# utils
#from flair.data import Sentence, Token

# load models
bert = load_bert_ner_model()
'''flair = load_flair_ner_model()'''

# CUDA for PyTorch
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
torch.backends.cudnn.benchmark = True

# get data (split into a training set, a validation set, and a test set)
ddt = DDT()
train, valid, test = ddt.load_as_simple_ner(True)

# divide the observations and the targets of the test set into new variables
sentences, categories = test

batch_size = 64
batch_sentences = []
num_sentences = len(sentences)
iterator = 0
while len(batch_sentences) < num_sentences / batch_size:
    stop = batch_size * (iterator + 1)
    if stop > num_sentences:
        stop = num_sentences
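# Sketch (added) of the batch-slicing pattern the truncated loop above appears to
# implement, shown on toy data; the exact continuation of the original loop is not
# part of this snippet.
toy_sentences = [["Hej", "verden"]] * 10
toy_batch_size = 4
toy_batches = [toy_sentences[i:i + toy_batch_size]
               for i in range(0, len(toy_sentences), toy_batch_size)]
assert [len(b) for b in toy_batches] == [4, 4, 2]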
from utils import print_speed_performance, accuracy_report

from flair.data import Sentence, Token

from danlp.datasets import DDT
from danlp.models import load_spacy_model, load_flair_pos_model

# benchmarking the polyglot model requires
from polyglot.tag import POSTagger
from polyglot.text import WordList

import os
import spacy

# load the data
ddt = DDT()

corpus_flair = ddt.load_with_flair()
tags_true = [[tok.tags['pos'].value for tok in fs] for fs in corpus_flair.test]
num_sentences = len(tags_true)
num_tokens = sum([len(s) for s in tags_true])

ccorpus_conll = ddt.load_as_conllu(predefined_splits=True)
# the test set
sentences_tokens = []
for sent in ccorpus_conll[2]:
    sentences_tokens.append([token.form for token in sent._tokens])


def benchmark_flair_mdl():
    tagger = load_flair_pos_model()
import time
import os

import spacy

from danlp.datasets import DDT
from danlp.models import load_spacy_model
from utils import print_speed_performance, dependency_report

import stanza

stanza.download('Danish')  # Download the model (this line can be commented out after the first download)

# load the data
ddt = DDT()
ccorpus_conll = ddt.load_as_conllu(predefined_splits=True)

deps_true = []
# the test set
sentences_tokens = []
for sent in ccorpus_conll[2]:
    sentences_tokens.append([token.form for token in sent._tokens])
    deps_true.append([(token.deprel.lower(), int(token.head))
                      for token in sent._tokens])

num_sentences = len(sentences_tokens)
num_tokens = sum([len(s) for s in sentences_tokens])


def benchmark_spacy_mdl():

    def normalize_spacy_head(i, hd):
def setup(self, split="test"):
    # load_as_simple_ner returns (train, dev, test); pick the requested split by index
    self.data = DDT().load_as_simple_ner(
        predefined_splits=True)[("train", "dev", "test").index(split)]
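# Small illustration (added) of the split selection above: the split name maps
# directly to an index into the (train, dev, test) tuple.
assert ("train", "dev", "test").index("dev") == 1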
def test_ddt_dataset_with_spacy(self):
    ddt = DDT()  # Load dataset
    corpus = ddt.load_with_spacy()

    self.assertIsInstance(corpus, GoldCorpus)
class TestNerDatasets(unittest.TestCase):

    def setUp(self):
        self.train_len = 4383
        self.dev_len = 564
        self.test_len = 565

        self.ddt = DDT()  # Load dataset

    def test_ddt_dataset(self):
        train, dev, test = self.ddt.load_as_conllu(predefined_splits=True)

        self.assertIsInstance(train, Conll)
        self.assertIsInstance(dev, Conll)
        self.assertIsInstance(test, Conll)

        self.assertEqual([len(train), len(dev), len(test)],
                         [self.train_len, self.dev_len, self.test_len])

        full_dataset = self.ddt.load_as_conllu(predefined_splits=False)
        self.assertEqual(len(full_dataset),
                         self.train_len + self.dev_len + self.test_len)

    def test_ddt_dataset_with_flair(self):
        flair_corpus = self.ddt.load_with_flair()

        self.assertIsInstance(flair_corpus, ColumnCorpus)

        flair_lens = [len(flair_corpus.train),
                      len(flair_corpus.dev),
                      len(flair_corpus.test)]
        self.assertEqual(flair_lens,
                         [self.train_len, self.dev_len, self.test_len])

        ner_tags = flair_corpus.make_tag_dictionary('ner').idx2item
        asserted_ner_tags = [
            b'B-ORG', b'B-PER', b'B-LOC', b'I-ORG', b'I-PER', b'I-LOC', b'O',
            b'<START>', b'<STOP>', b'<unk>'
        ]
        self.assertCountEqual(ner_tags, asserted_ner_tags)

    def test_ddt_dataset_with_spacy(self):
        ddt = DDT()  # Load dataset
        corpus = ddt.load_with_spacy()

        self.assertIsInstance(corpus, GoldCorpus)

    def test_wikiann_dataset(self):
        # Change to a sample of the full wikiann to ease test computation
        DATASETS['wikiann']['url'] = "https://danlp.s3.eu-central-1.amazonaws.com/test-datasets/da.tar.gz"
        DATASETS['wikiann']['size'] = 2502
        DATASETS['wikiann']['md5_checksum'] = 'd0271de38ae23f215b5117450efb9ace'

        wikiann = WikiAnn()

        corpus = wikiann.load_with_flair()

        self.assertEqual([len(corpus.train), len(corpus.dev), len(corpus.test)],
                         [21, 2, 3])

        ner_tags = corpus.make_tag_dictionary('ner').idx2item
        asserted_ner_tags = [
            b'B-ORG', b'B-PER', b'B-LOC', b'I-ORG', b'I-PER', b'I-LOC', b'O',
            b'<START>', b'<STOP>', b'<unk>'
        ]
        self.assertCountEqual(ner_tags, asserted_ner_tags)

        spacy_gold = wikiann.load_with_spacy()
        self.assertIsInstance(spacy_gold, GoldCorpus)

        num_train_sents = len(list(spacy_gold.train_tuples)[0][1])
        num_dev_sents = len(list(spacy_gold.dev_tuples)[0][1])
        self.assertEqual(num_dev_sents + num_train_sents, 26)

        shutil.rmtree(wikiann.dataset_dir)

    def test_wordsim353(self):
        ws353 = WordSim353Da()
        df = ws353.load_with_pandas()

        self.assertEqual(len(df), 353)
        self.assertListEqual(list(df.columns), ['da1', 'da2', 'Human (mean)'])
        self.assertEqual(len(ws353.words()), 424)

    def test_dsd(self):
        dsd = DSD()
        df = dsd.load_with_pandas()

        self.assertEqual(len(df), 99)
        self.assertListEqual(list(df.columns), ['word1', 'word2', 'similarity'])
        self.assertEqual(len(dsd.words()), 197)
class TestNerDatasets(unittest.TestCase):

    def setUp(self):
        self.train_len = 4383
        self.dev_len = 564
        self.test_len = 565

        self.ddt = DDT()  # Load dataset

    def test_write_and_read_simple_ner_dataset(self):
        sentences = [["Jeg", "gik", "en", "tur", "i", "København"],
                     ["Alexandra", "Instituttet", "arbejder", "med", "NLP"]]
        entities = [["O", "O", "O", "O", "O", "B-LOC"],
                    ["B-ORG", "I-ORG", "O", "O", "O"]]

        tmp_file = NamedTemporaryFile().name
        write_simple_ner_dataset(sentences, entities, tmp_file)
        loaded_sents, loaded_ents = read_simple_ner_dataset(tmp_file)

        self.assertEqual(sentences, loaded_sents)
        self.assertEqual(entities, loaded_ents)

    def test_ddt_dataset(self):
        train, dev, test = self.ddt.load_as_conllu(predefined_splits=True)

        self.assertIsInstance(train, Conll)
        self.assertIsInstance(dev, Conll)
        self.assertIsInstance(test, Conll)

        self.assertEqual([len(train), len(dev), len(test)],
                         [self.train_len, self.dev_len, self.test_len])

        full_dataset = self.ddt.load_as_conllu(predefined_splits=False)
        self.assertEqual(len(full_dataset),
                         self.train_len + self.dev_len + self.test_len)

    def test_ddt_simple_ner(self):
        train, dev, test = self.ddt.load_as_simple_ner(predefined_splits=True)
        self.assertEqual([len(train[0]), len(dev[0]), len(test[0])],
                         [self.train_len, self.dev_len, self.test_len])

        all_sentences, all_entities = self.ddt.load_as_simple_ner(
            predefined_splits=False)
        self.assertEqual(len(all_sentences),
                         self.train_len + self.dev_len + self.test_len)

        data = defaultdict(int)
        for entities in train[1]:
            for entity in entities:
                if "B" in entity:
                    data[entity[2:]] += 1
        self.assertDictEqual(data, {
            'ORG': 802,
            'LOC': 945,
            'PER': 1249,
            'MISC': 1007
        })

    def test_ddt_dataset_with_flair(self):
        flair_corpus = self.ddt.load_with_flair()

        self.assertIsInstance(flair_corpus, ColumnCorpus)

        flair_lens = [
            len(flair_corpus.train),
            len(flair_corpus.dev),
            len(flair_corpus.test)
        ]
        self.assertEqual(flair_lens,
                         [self.train_len, self.dev_len, self.test_len])

        ner_tags = flair_corpus.make_tag_dictionary('ner').idx2item
        asserted_ner_tags = [
            b'B-ORG', b'B-PER', b'B-LOC', b'B-MISC', b'I-ORG', b'I-PER',
            b'I-LOC', b'I-MISC', b'O', b'<START>', b'<STOP>', b'<unk>'
        ]
        self.assertCountEqual(ner_tags, asserted_ner_tags)

    def test_ddt_dataset_with_spacy(self):
        ddt = DDT()  # Load dataset
        corpus = ddt.load_with_spacy()

        num_sents_train = 0
        for paragraph in [
                paragraph[1] for paragraph in list(corpus.train_tuples)
        ]:
            num_sents_train += len(paragraph)

        self.assertIsInstance(corpus, GoldCorpus)
        self.assertEqual(self.train_len, num_sents_train)
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
from danlp.datasets import DDT
import pyconll
import re
import unicodedata

# BertTokenizer is used below but was never imported in the original snippet;
# assumption: it comes from the `transformers` package (or the older
# `pytorch_pretrained_bert`).
from transformers import BertTokenizer

######################################################
# Loading Data
######################################################

ddt = DDT()
conllu_format = ddt.load_as_conllu()

L = [(i, token.form, token.misc.get("name").pop())
     for i, sent in enumerate(conllu_format) for token in sent]

df = pd.DataFrame(L, columns=['sentence_id', 'words', 'labels'])

######################################################
# to bert tokens
######################################################

sent_str = [sent.text for sent in conllu_format]

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sent_str]

# Convert tokens to indexes
with open("/home/au554730/Desktop/BERT_test/danish_bert_uncased/vocab.txt") as f:
    vocab = f.read()
from spacy.cli.converters import conllu2json


# Only the body of this function was present in the snippet; the header is
# inferred from the call `load_test_with_spacy(ddt)` below.
def load_test_with_spacy(ddt):
    conll_path = os.path.join(
        ddt.dataset_dir,
        '{}.{}{}'.format(ddt.dataset_name, 'test', ddt.file_extension))
    file_as_json = {}
    with open(conll_path, 'r') as file:
        file_as_string = file.read()
        file_as_string = file_as_string.replace("name=", "").replace(
            "|SpaceAfter=No", "")
        file_as_json = conllu2json(file_as_string)
    return read_json_object(file_as_json)


# load the data:
# * convert to spaCy Docs format
# * convert dependencies to (BIO) noun chunks
ddt = DDT()
corpus = load_test_with_spacy(ddt)

nlp = chunker.model

sentences_tokens = []
chks_true = []

for jobj in corpus:
    for sentence in jobj[1]:
        sentence = sentence[0]
        tokens = sentence[1]
        sentences_tokens.append(tokens)

        doc = Doc(nlp.vocab, words=tokens)
        for i, t in enumerate(doc):
            t.head = doc[sentence[3][i]]
            t.pos = nlp.vocab.strings.add(sentence[2][i])
            t.dep = nlp.vocab.strings.add(sentence[4][i])
        bio_chks = get_noun_chunks(doc, bio=True)