Ejemplo n.º 1
0
def fetch_dane_as_conllu():
    from danlp.datasets import DDT
    ddt = DDT()
    train, dev, test = ddt.load_as_conllu(predefined_splits=True)
    with open("assets/dane/dane_train.conllu", "w") as f:
        train.write(f)
    with open("assets/dane/dane_dev.conllu", "w") as f:
        dev.write(f)
    with open("assets/dane/dane_test.conllu", "w") as f:
        test.write(f)
Ejemplo n.º 2
0
    def test_ddt_dataset_with_spacy(self):
        ddt = DDT()  # Load dataset
        corpus = ddt.load_with_spacy()

        num_sents_train = 0
        for paragraph in [paragraph[1] for paragraph in list(corpus.train_tuples)]:
            num_sents_train += len(paragraph)

        self.assertIsInstance(corpus, GoldCorpus)
        self.assertEqual(self.train_len, num_sents_train)
Ejemplo n.º 3
0
 def load(self, **_):  # Toss out kwargs
     # Get all three splits from DaNE and divide them in source texts and annotations
     if not danlp_available:
         raise ModuleNotFoundError(
             "DaNE dataset requires installation of the optional requirement `danlp`"
         )
     datasets = DDT().load_as_simple_ner(predefined_splits=True)
     for (texts, annotations), split in zip(datasets, Split):
         self.data[split] = Sequences(
             texts=texts,
             annotations=annotations,
             # Sadly, we do not have access to where the DaNE sentences are divided into articles, so we let each sentence be an entire text.
             sentence_boundaries=[[len(s)] for s in texts])
     self.loaded = True
Ejemplo n.º 4
0
download_dane_data()


def is_misc(ent: str):
    if len(ent) < 4:
        return False
    return ent[-4:] == 'MISC'


def remove_miscs(se: list):
    return [[entity if not is_misc(entity) else 'O' for entity in entities]
            for entities in se]


# Load the DaNE data
_, _, test = DDT().load_as_simple_ner(predefined_splits=True)
sentences_tokens, sentences_entities = test

# Replace MISC with O for fair comparisons
sentences_entities = remove_miscs(sentences_entities)

num_sentences = len(sentences_tokens)
num_tokens = sum([len(s) for s in sentences_tokens])

# def benchmark_polyglot_mdl():
#     """
#     Running ployglot requires these packages:
#     # Morfessor==2.0.6
#     # PyICU==2.4.2
#     # pycld2==0.41
#     # polyglot
Ejemplo n.º 5
0
    def setUp(self):
        self.train_len = 4383
        self.dev_len = 564
        self.test_len = 565

        self.ddt = DDT()  # Load dataset
Ejemplo n.º 6
0
class TestNerDatasets(unittest.TestCase):
    def setUp(self):
        self.train_len = 4383
        self.dev_len = 564
        self.test_len = 565

        self.ddt = DDT()  # Load dataset

    def test_ddt_dataset(self):
        train, dev, test = self.ddt.load_as_conllu(predefined_splits=True)

        self.assertIsInstance(train, Conll)
        self.assertIsInstance(dev, Conll)
        self.assertIsInstance(test, Conll)

        self.assertEqual(
            [len(train), len(dev), len(test)],
            [self.train_len, self.dev_len, self.test_len])

        full_dataset = self.ddt.load_as_conllu(predefined_splits=False)
        self.assertEqual(len(full_dataset),
                         self.train_len + self.dev_len + self.test_len)

    def test_ddt_simple_ner(self):
        train, dev, test = self.ddt.load_as_simple_ner(predefined_splits=True)

        self.assertEqual([len(
            train[0]), len(dev[0]), len(test[0])],
                         [self.train_len, self.dev_len, self.test_len])

        all_sentences, all_entities = self.ddt.load_as_simple_ner(
            predefined_splits=False)
        self.assertEqual(len(all_sentences),
                         self.train_len + self.dev_len + self.test_len)

        data = defaultdict(int)
        for entities in train[1]:
            for entity in entities:
                if "B" in entity:
                    data[entity[2:]] += 1
        self.assertDictEqual(data, {
            'ORG': 802,
            'LOC': 945,
            'PER': 1249,
            'MISC': 1007
        })

    def test_ddt_dataset_with_flair(self):
        flair_corpus = self.ddt.load_with_flair()

        self.assertIsInstance(flair_corpus, ColumnCorpus)

        flair_lens = [
            len(flair_corpus.train),
            len(flair_corpus.dev),
            len(flair_corpus.test)
        ]
        self.assertEqual(flair_lens,
                         [self.train_len, self.dev_len, self.test_len])

        ner_tags = flair_corpus.make_tag_dictionary('ner').idx2item
        asserted_ner_tags = [
            b'B-ORG', b'B-PER', b'B-LOC', b'B-MISC', b'I-ORG', b'I-PER',
            b'I-LOC', b'I-MISC', b'O', b'<START>', b'<STOP>', b'<unk>'
        ]
        self.assertCountEqual(ner_tags, asserted_ner_tags)

    def test_ddt_dataset_with_spacy(self):
        ddt = DDT()  # Load dataset
        corpus = ddt.load_with_spacy()

        num_sents_train = 0
        for paragraph in [
                paragraph[1] for paragraph in list(corpus.train_tuples)
        ]:
            num_sents_train += len(paragraph)

        self.assertIsInstance(corpus, GoldCorpus)
        self.assertEqual(self.train_len, num_sents_train)

    def test_wikiann_dataset(self):
        # Change to a sample of the full wikiann to ease test computation
        DATASETS['wikiann'][
            'url'] = "https://danlp.s3.eu-central-1.amazonaws.com/test-datasets/da.tar.gz"
        DATASETS['wikiann']['size'] = 2502
        DATASETS['wikiann'][
            'md5_checksum'] = 'd0271de38ae23f215b5117450efb9ace'

        wikiann = WikiAnn()

        corpus = wikiann.load_with_flair()

        self.assertEqual(
            [len(corpus.train),
             len(corpus.dev),
             len(corpus.test)], [21, 2, 3])

        ner_tags = corpus.make_tag_dictionary('ner').idx2item
        asserted_ner_tags = [
            b'B-ORG', b'B-PER', b'B-LOC', b'I-ORG', b'I-PER', b'I-LOC', b'O',
            b'<START>', b'<STOP>', b'<unk>'
        ]
        self.assertCountEqual(ner_tags, asserted_ner_tags)

        spacy_gold = wikiann.load_with_spacy()
        self.assertIsInstance(spacy_gold, GoldCorpus)

        num_train_sents = len(list(spacy_gold.train_tuples)[0][1])
        num_dev_sents = len(list(spacy_gold.dev_tuples)[0][1])
        self.assertEqual(num_dev_sents + num_train_sents, 26)

        shutil.rmtree(wikiann.dataset_dir)

    def test_wordsim353(self):
        ws353 = WordSim353Da()
        df = ws353.load_with_pandas()

        self.assertEqual(len(df), 353)
        self.assertListEqual(list(df.columns), ['da1', 'da2', 'Human (mean)'])
        self.assertEqual(len(ws353.words()), 424)

    def test_dsd(self):
        dsd = DSD()
        df = dsd.load_with_pandas()

        self.assertEqual(len(df), 99)
        self.assertListEqual(list(df.columns),
                             ['word1', 'word2', 'similarity'])
        self.assertEqual(len(dsd.words()), 197)

    def test_europarlsentiment(self):
        eusent = EuroparlSentiment()
        df = eusent.load_with_pandas()
        self.assertEqual(len(df), 184)

    def test_lccsentiment(self):
        sent = LccSentiment()
        df = sent.load_with_pandas()
        self.assertEqual(len(df), 499)
Ejemplo n.º 7
0
class TestNerDatasets(unittest.TestCase):
    def setUp(self):
        self.train_len = 4383
        self.dev_len = 564
        self.test_len = 565

        self.ddt = DDT()  # Load dataset

    def test_write_and_read_simple_ner_dataset(self):
        sentences = [["Jeg", "gik", "en", "tur", "i", "København"],
                     ["Alexandra", "Instituttet", "arbejder", "med", "NLP"]]

        entities = [["O", "O", "O", "O", "O", "B-LOC"],
                    ["B-ORG", "I-ORG", "O", "O", "O"]]
        tmp_file = NamedTemporaryFile().name
        write_simple_ner_dataset(sentences, entities, tmp_file)

        loaded_sents, loaded_ents = read_simple_ner_dataset(tmp_file)

        self.assertEqual(sentences, loaded_sents)
        self.assertEqual(entities, loaded_ents)

    def test_ddt_dataset(self):
        train, dev, test = self.ddt.load_as_conllu(predefined_splits=True)

        self.assertIsInstance(train, Conll)
        self.assertIsInstance(dev, Conll)
        self.assertIsInstance(test, Conll)

        self.assertEqual(
            [len(train), len(dev), len(test)],
            [self.train_len, self.dev_len, self.test_len])

        full_dataset = self.ddt.load_as_conllu(predefined_splits=False)
        self.assertEqual(len(full_dataset),
                         self.train_len + self.dev_len + self.test_len)

    def test_ddt_simple_ner(self):
        train, dev, test = self.ddt.load_as_simple_ner(predefined_splits=True)

        self.assertEqual([len(
            train[0]), len(dev[0]), len(test[0])],
                         [self.train_len, self.dev_len, self.test_len])

        all_sentences, all_entities = self.ddt.load_as_simple_ner(
            predefined_splits=False)
        self.assertEqual(len(all_sentences),
                         self.train_len + self.dev_len + self.test_len)

        data = defaultdict(int)
        for entities in train[1]:
            for entity in entities:
                if "B" in entity:
                    data[entity[2:]] += 1
        self.assertDictEqual(data, {
            'ORG': 802,
            'LOC': 945,
            'PER': 1249,
            'MISC': 1007
        })

    def test_ddt_dataset_with_flair(self):
        flair_corpus = self.ddt.load_with_flair()

        self.assertIsInstance(flair_corpus, ColumnCorpus)

        flair_lens = [
            len(flair_corpus.train),
            len(flair_corpus.dev),
            len(flair_corpus.test)
        ]
        self.assertEqual(flair_lens,
                         [self.train_len, self.dev_len, self.test_len])

        ner_tags = flair_corpus.make_tag_dictionary('ner').idx2item
        asserted_ner_tags = [
            b'B-ORG', b'B-PER', b'B-LOC', b'B-MISC', b'I-ORG', b'I-PER',
            b'I-LOC', b'I-MISC', b'O', b'<START>', b'<STOP>', b'<unk>'
        ]
        self.assertCountEqual(ner_tags, asserted_ner_tags)

    def test_ddt_dataset_with_spacy(self):
        ddt = DDT()  # Load dataset
        corpus = ddt.load_with_spacy()

        num_sents_train = 0
        for paragraph in [
                paragraph[1] for paragraph in list(corpus.train_tuples)
        ]:
            num_sents_train += len(paragraph)

        self.assertIsInstance(corpus, GoldCorpus)
        self.assertEqual(self.train_len, num_sents_train)

    def test_wikiann_dataset(self):
        # Change to a sample of the full wikiann to ease test computation
        DATASETS['wikiann']['url'] = DANLP_STORAGE_URL + "/tests/da.tar.gz"
        DATASETS['wikiann']['size'] = 2502
        DATASETS['wikiann'][
            'md5_checksum'] = 'd0271de38ae23f215b5117450efb9ace'

        wikiann = WikiAnn()

        corpus = wikiann.load_with_flair()

        self.assertEqual(
            [len(corpus.train),
             len(corpus.dev),
             len(corpus.test)], [21, 2, 3])

        ner_tags = corpus.make_tag_dictionary('ner').idx2item
        asserted_ner_tags = [
            b'B-ORG', b'B-PER', b'B-LOC', b'I-ORG', b'I-PER', b'I-LOC', b'O',
            b'<START>', b'<STOP>', b'<unk>'
        ]
        self.assertCountEqual(ner_tags, asserted_ner_tags)

        spacy_gold = wikiann.load_with_spacy()
        self.assertIsInstance(spacy_gold, GoldCorpus)

        num_train_sents = len(list(spacy_gold.train_tuples)[0][1])
        num_dev_sents = len(list(spacy_gold.dev_tuples)[0][1])
        self.assertEqual(num_dev_sents + num_train_sents, 26)

        shutil.rmtree(wikiann.dataset_dir)
Ejemplo n.º 8
0
from danlp.datasets import DDT

# utils
#from flair.data import Sentence, Token

# load models
bert = load_bert_ner_model()
'''flair = load_flair_ner_model()'''

# CUDA for PyTorch
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
torch.backends.cudnn.benchmark = True

# get data (splitted into a training set, a validation set, and a test set)
ddt = DDT()
train, valid, test = ddt.load_as_simple_ner(True)

# divide the observations and the targets of the testset into new variables
sentences, categories = test

batch_size = 64

batch_sentences = []

num_sentences = len(sentences)
iterator = 0
while len(batch_sentences) < num_sentences / batch_size:
    stop = batch_size * (iterator + 1)
    if stop > num_sentences:
        stop = num_sentences
Ejemplo n.º 9
0
from utils import print_speed_performance, accuracy_report

from flair.data import Sentence, Token

from danlp.datasets import DDT
from danlp.models import load_spacy_model, load_flair_pos_model

# benchmarking polyglotmodel requires
from polyglot.tag import POSTagger
from polyglot.text import WordList

import os
import spacy

# load the data
ddt = DDT()

corpus_flair = ddt.load_with_flair()
tags_true = [[tok.tags['pos'].value for tok in fs] for fs in corpus_flair.test]
num_sentences = len(tags_true)
num_tokens = sum([len(s) for s in tags_true])

ccorpus_conll = ddt.load_as_conllu(predefined_splits=True)
# the test set
sentences_tokens = []
for sent in ccorpus_conll[2]:
    sentences_tokens.append([token.form for token in sent._tokens])


def benchmark_flair_mdl():
    tagger = load_flair_pos_model()
Ejemplo n.º 10
0
import time
import os

import spacy

from danlp.datasets import DDT
from danlp.models import load_spacy_model

from utils import print_speed_performance, dependency_report

import stanza
stanza.download(
    'Danish')  # Download model (you can comment this line after download)

# load the data
ddt = DDT()

ccorpus_conll = ddt.load_as_conllu(predefined_splits=True)
deps_true = []
# the test set
sentences_tokens = []
for sent in ccorpus_conll[2]:
    sentences_tokens.append([token.form for token in sent._tokens])
    deps_true.append([(token.deprel.lower(), int(token.head))
                      for token in sent._tokens])
num_sentences = len(sentences_tokens)
num_tokens = sum([len(s) for s in sentences_tokens])


def benchmark_spacy_mdl():
    def normalize_spacy_head(i, hd):
Ejemplo n.º 11
0
 def setup(self, split="test"):
     # Third element is test data
     self.data = DDT().load_as_simple_ner(predefined_splits=True)[("train", "dev", "test").index(split)]
Ejemplo n.º 12
0
 def test_ddt_dataset_with_spacy(self):
     ddt = DDT()  # Load dataset
     corpus = ddt.load_with_spacy()
     self.assertIsInstance(corpus, GoldCorpus)
Ejemplo n.º 13
0
class TestNerDatasets(unittest.TestCase):

    def setUp(self):
        self.train_len = 4383
        self.dev_len = 564
        self.test_len = 565

        self.ddt = DDT()  # Load dataset

    def test_ddt_dataset(self):
        train, dev, test = self.ddt.load_as_conllu(predefined_splits=True)

        self.assertIsInstance(train, Conll)
        self.assertIsInstance(dev, Conll)
        self.assertIsInstance(test, Conll)

        self.assertEqual([len(train), len(dev), len(test)], [self.train_len, self.dev_len, self.test_len])

        full_dataset = self.ddt.load_as_conllu(predefined_splits=False)
        self.assertEqual(len(full_dataset), self.train_len + self.dev_len + self.test_len)

    def test_ddt_dataset_with_flair(self):
        flair_corpus = self.ddt.load_with_flair()

        self.assertIsInstance(flair_corpus, ColumnCorpus)

        flair_lens = [len(flair_corpus.train), len(flair_corpus.dev), len(flair_corpus.test)]
        self.assertEqual(flair_lens, [self.train_len, self.dev_len, self.test_len])

        ner_tags = flair_corpus.make_tag_dictionary('ner').idx2item
        asserted_ner_tags = [
            b'B-ORG', b'B-PER', b'B-LOC',
            b'I-ORG', b'I-PER', b'I-LOC',
            b'O', b'<START>', b'<STOP>', b'<unk>'
        ]
        self.assertCountEqual(ner_tags, asserted_ner_tags)

    def test_ddt_dataset_with_spacy(self):
        ddt = DDT()  # Load dataset
        corpus = ddt.load_with_spacy()
        self.assertIsInstance(corpus, GoldCorpus)

    def test_wikiann_dataset(self):
        # Change to a sample of the full wikiann to ease test computation
        DATASETS['wikiann']['url'] = "https://danlp.s3.eu-central-1.amazonaws.com/test-datasets/da.tar.gz"
        DATASETS['wikiann']['size'] = 2502
        DATASETS['wikiann']['md5_checksum'] = 'd0271de38ae23f215b5117450efb9ace'

        wikiann = WikiAnn()

        corpus = wikiann.load_with_flair()

        self.assertEqual([len(corpus.train), len(corpus.dev), len(corpus.test)], [21, 2, 3])

        ner_tags = corpus.make_tag_dictionary('ner').idx2item
        asserted_ner_tags = [
            b'B-ORG', b'B-PER', b'B-LOC',
            b'I-ORG', b'I-PER', b'I-LOC',
            b'O', b'<START>', b'<STOP>', b'<unk>'
        ]
        self.assertCountEqual(ner_tags, asserted_ner_tags)

        spacy_gold = wikiann.load_with_spacy()
        self.assertIsInstance(spacy_gold, GoldCorpus)

        num_train_sents = len(list(spacy_gold.train_tuples)[0][1])
        num_dev_sents = len(list(spacy_gold.dev_tuples)[0][1])
        self.assertEqual(num_dev_sents + num_train_sents, 26)

        shutil.rmtree(wikiann.dataset_dir)

    def test_wordsim353(self):
        ws353 = WordSim353Da()
        df = ws353.load_with_pandas()

        self.assertEqual(len(df), 353)
        self.assertListEqual(list(df.columns), ['da1', 'da2', 'Human (mean)'])
        self.assertEqual(len(ws353.words()), 424)

    def test_dsd(self):
        dsd = DSD()
        df = dsd.load_with_pandas()

        self.assertEqual(len(df), 99)
        self.assertListEqual(list(df.columns), ['word1', 'word2', 'similarity'])
        self.assertEqual(len(dsd.words()), 197)
Ejemplo n.º 14
0
class TestNerDatasets(unittest.TestCase):
    def setUp(self):
        self.train_len = 4383
        self.dev_len = 564
        self.test_len = 565

        self.ddt = DDT()  # Load dataset

    def test_write_and_read_simple_ner_dataset(self):
        sentences = [["Jeg", "gik", "en", "tur", "i", "København"],
                     ["Alexandra", "Instituttet", "arbejder", "med", "NLP"]]

        entities = [["O", "O", "O", "O", "O", "B-LOC"],
                    ["B-ORG", "I-ORG", "O", "O", "O"]]
        tmp_file = NamedTemporaryFile().name
        write_simple_ner_dataset(sentences, entities, tmp_file)

        loaded_sents, loaded_ents = read_simple_ner_dataset(tmp_file)

        self.assertEqual(sentences, loaded_sents)
        self.assertEqual(entities, loaded_ents)

    def test_ddt_dataset(self):
        train, dev, test = self.ddt.load_as_conllu(predefined_splits=True)

        self.assertIsInstance(train, Conll)
        self.assertIsInstance(dev, Conll)
        self.assertIsInstance(test, Conll)

        self.assertEqual(
            [len(train), len(dev), len(test)],
            [self.train_len, self.dev_len, self.test_len])

        full_dataset = self.ddt.load_as_conllu(predefined_splits=False)
        self.assertEqual(len(full_dataset),
                         self.train_len + self.dev_len + self.test_len)

    def test_ddt_simple_ner(self):
        train, dev, test = self.ddt.load_as_simple_ner(predefined_splits=True)

        self.assertEqual([len(
            train[0]), len(dev[0]), len(test[0])],
                         [self.train_len, self.dev_len, self.test_len])

        all_sentences, all_entities = self.ddt.load_as_simple_ner(
            predefined_splits=False)
        self.assertEqual(len(all_sentences),
                         self.train_len + self.dev_len + self.test_len)

        data = defaultdict(int)
        for entities in train[1]:
            for entity in entities:
                if "B" in entity:
                    data[entity[2:]] += 1
        self.assertDictEqual(data, {
            'ORG': 802,
            'LOC': 945,
            'PER': 1249,
            'MISC': 1007
        })

    def test_ddt_dataset_with_flair(self):
        flair_corpus = self.ddt.load_with_flair()

        self.assertIsInstance(flair_corpus, ColumnCorpus)

        flair_lens = [
            len(flair_corpus.train),
            len(flair_corpus.dev),
            len(flair_corpus.test)
        ]
        self.assertEqual(flair_lens,
                         [self.train_len, self.dev_len, self.test_len])

        ner_tags = flair_corpus.make_tag_dictionary('ner').idx2item
        asserted_ner_tags = [
            b'B-ORG', b'B-PER', b'B-LOC', b'B-MISC', b'I-ORG', b'I-PER',
            b'I-LOC', b'I-MISC', b'O', b'<START>', b'<STOP>', b'<unk>'
        ]
        self.assertCountEqual(ner_tags, asserted_ner_tags)

    def test_ddt_dataset_with_spacy(self):
        ddt = DDT()  # Load dataset
        corpus = ddt.load_with_spacy()

        num_sents_train = 0
        for paragraph in [
                paragraph[1] for paragraph in list(corpus.train_tuples)
        ]:
            num_sents_train += len(paragraph)

        self.assertIsInstance(corpus, GoldCorpus)
        self.assertEqual(self.train_len, num_sents_train)
Ejemplo n.º 15
0
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
from danlp.datasets import DDT
import pyconll
import re
import unicodedata

######################################################
# Loading Data
######################################################


ddt = DDT()

conllu_format = ddt.load_as_conllu()
L = [(i, token.form, token.misc.get("name").pop()) for i, sent in enumerate(conllu_format) for token in sent]
df = pd.DataFrame(L, columns=['sentence_id', 'words', 'labels'])

######################################################
# to bert tokens 
######################################################
sent_str = [sent.text for sent in conllu_format]
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sent_str]

# Convert tokens to indexes
with open("/home/au554730/Desktop/BERT_test/danish_bert_uncased/vocab.txt") as f:
    vocab = f.read()
Ejemplo n.º 16
0
    from spacy.cli.converters import conllu2json

    conll_path = os.path.join(ddt.dataset_dir, '{}.{}{}'.format(ddt.dataset_name, 'test', ddt.file_extension))

    file_as_json = {}
    with open(conll_path, 'r') as file:
        file_as_string = file.read()
        file_as_string = file_as_string.replace("name=", "").replace("|SpaceAfter=No", "")
        file_as_json = conllu2json(file_as_string)
    return read_json_object(file_as_json)


# load the data :
#      * convert to spaCy Docs format
#      * convert dependencies to (BIO) noun chunks
ddt = DDT()
corpus = load_test_with_spacy(ddt)
nlp = chunker.model
sentences_tokens = []
chks_true = []
for jobj in corpus:
    for sentence in jobj[1]:
        sentence = sentence[0]
        tokens = sentence[1]
        sentences_tokens.append(tokens)
        doc = Doc(nlp.vocab, words=tokens)
        for i, t in enumerate(doc):
            t.head =  doc[sentence[3][i]]
            t.pos = nlp.vocab.strings.add(sentence[2][i])
            t.dep = nlp.vocab.strings.add(sentence[4][i])
        bio_chks = get_noun_chunks(doc, bio=True)