Example 1
def fetch_dane_as_conllu():
    # Download the DaNE splits via danlp and write each one to a CoNLL-U file.
    from danlp.datasets import DDT
    ddt = DDT()
    train, dev, test = ddt.load_as_conllu(predefined_splits=True)
    with open("assets/dane/dane_train.conllu", "w") as f:
        train.write(f)
    with open("assets/dane/dane_dev.conllu", "w") as f:
        dev.write(f)
    with open("assets/dane/dane_test.conllu", "w") as f:
        test.write(f)
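
The function above writes the three DaNE splits to disk but never reads them back. As a quick sanity check, here is a minimal sketch of how the written files could be loaded again; it assumes the pyconll package is available and uses the same path as the function above.

import pyconll


def check_dane_conllu():
    # Read one of the files written by fetch_dane_as_conllu() back in.
    train = pyconll.load_from_file("assets/dane/dane_train.conllu")
    print(len(train))                          # number of sentences
    print([token.form for token in train[0]])  # tokens of the first sentence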
Example 2
    def test_ddt_dataset_with_spacy(self):
        ddt = DDT()  # Load dataset
        corpus = ddt.load_with_spacy()

        num_sents_train = 0
        for paragraph in [paragraph[1] for paragraph in list(corpus.train_tuples)]:
            num_sents_train += len(paragraph)

        self.assertIsInstance(corpus, GoldCorpus)
        self.assertEqual(self.train_len, num_sents_train)
Example 3
    def load(self, **_):  # Toss out kwargs
        # Get all three splits from DaNE and divide them into source texts and annotations
        if not danlp_available:
            raise ModuleNotFoundError(
                "DaNE dataset requires installation of the optional requirement `danlp`"
            )
        datasets = DDT().load_as_simple_ner(predefined_splits=True)
        for (texts, annotations), split in zip(datasets, Split):
            self.data[split] = Sequences(
                texts=texts,
                annotations=annotations,
                # DaNE does not expose how sentences are grouped into articles,
                # so each sentence is treated as an entire text.
                sentence_boundaries=[[len(s)] for s in texts],
            )
        self.loaded = True
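
The load method above relies on two project-specific names, Split and Sequences, that are not part of danlp and are not shown in the snippet. Purely to make the example readable, here is a hypothetical sketch of what they might look like; the real project may define them differently.

import enum
from dataclasses import dataclass


class Split(enum.Enum):
    # Iteration order must match the (train, dev, test) order returned by
    # load_as_simple_ner(predefined_splits=True) for the zip above to pair correctly.
    TRAIN = "train"
    DEV = "dev"
    TEST = "test"


@dataclass
class Sequences:
    texts: list                 # token lists, one per sentence
    annotations: list           # tag lists aligned with texts
    sentence_boundaries: list   # here: one single-sentence "text" per sentence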
Example 4
download_dane_data()


def is_misc(ent: str):
    # True for tags ending in 'MISC' (e.g. 'B-MISC', 'I-MISC').
    return ent.endswith('MISC')


def remove_miscs(se: list):
    return [[entity if not is_misc(entity) else 'O' for entity in entities]
            for entities in se]


# Load the DaNE data
_, _, test = DDT().load_as_simple_ner(predefined_splits=True)
sentences_tokens, sentences_entities = test

# Replace MISC with O for fair comparisons
sentences_entities = remove_miscs(sentences_entities)

num_sentences = len(sentences_tokens)
num_tokens = sum([len(s) for s in sentences_tokens])

# def benchmark_polyglot_mdl():
#     """
#     Running polyglot requires these packages:
#     # Morfessor==2.0.6
#     # PyICU==2.4.2
#     # pycld2==0.41
#     # polyglot
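
The commented-out benchmark_polyglot_mdl hints at the benchmark functions that follow in the original script. Below is a hedged sketch of the evaluation step such a benchmark would typically end with; predictions is a hypothetical list of per-sentence tag sequences aligned with sentences_entities, and seqeval is an assumed extra dependency, not something the snippet itself imports.

from seqeval.metrics import classification_report, f1_score


def score_predictions(predictions):
    # predictions: hypothetical per-sentence tag lists aligned with the gold
    # sentences_entities (MISC already mapped to 'O' above).
    print(classification_report(sentences_entities, predictions))
    return f1_score(sentences_entities, predictions)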
Example 5
    def setUp(self):
        self.train_len = 4383
        self.dev_len = 564
        self.test_len = 565

        self.ddt = DDT()  # Load dataset
Example 6
import torch

from danlp.datasets import DDT
from danlp.models import load_bert_ner_model

# utils
#from flair.data import Sentence, Token

# load models
bert = load_bert_ner_model()
# flair = load_flair_ner_model()

# CUDA for PyTorch
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
torch.backends.cudnn.benchmark = True

# get data (split into a training set, a validation set, and a test set)
ddt = DDT()
train, valid, test = ddt.load_as_simple_ner(predefined_splits=True)

# divide the observations and the targets of the test set into new variables
sentences, categories = test

batch_size = 64

batch_sentences = []

num_sentences = len(sentences)
iterator = 0
while len(batch_sentences) < num_sentences / batch_size:
    stop = batch_size * (iterator + 1)
    if stop > num_sentences:
        stop = num_sentences
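
The batching loop above is cut off in the source. For orientation, a minimal equivalent sketch that builds the batches with plain list slicing (not the original code) could look like this:

batch_sentences = [sentences[start:start + batch_size]
                   for start in range(0, num_sentences, batch_size)]
batch_categories = [categories[start:start + batch_size]
                    for start in range(0, num_sentences, batch_size)]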
Example 7
    def setup(self, split="test"):
        # Third element is test data
        self.data = DDT().load_as_simple_ner(predefined_splits=True)[
            ("train", "dev", "test").index(split)
        ]
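
As a small aside, the index lookup above simply maps a split name onto its position in the (train, dev, test) triple returned by load_as_simple_ner, e.g.:

("train", "dev", "test").index("test")  # -> 2, i.e. the third element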
Example 8
    def test_ddt_dataset_with_spacy(self):
        ddt = DDT()  # Load dataset
        corpus = ddt.load_with_spacy()
        self.assertIsInstance(corpus, GoldCorpus)