Example #1
import os

import conll


def loadConll(conllFile):
    pathToConllFile = conllFile

    if os.sep not in conllFile:  # A bare filename: resolve it under ./data/conll2003
        pathToConllFile = os.getcwd() + os.sep + 'data' + os.sep \
            + 'conll2003' + os.sep + conllFile

    corpus = conll.read_corpus_conll(pathToConllFile)
    corpus = corpus[1:]  # Removes DOCSTART line
    print('Elements in corpus: {}'.format(len(corpus)))
    sentence = {'text': [], 'POS_tag': [], 'SynChunkTag': [], 'NE_tag': []}
    for _list in corpus:
        tempSentence = ''
        tempPOSTags = ''
        tempSyncChunksTags = ''
        tempNETags = ''
        for vec in _list:
            vec = vec[0].split(' ')
            if '-DOCSTART-' not in vec:
                tempSentence = tempSentence + '{} '.format(vec[0])
                tempPOSTags = tempPOSTags + '{} '.format(vec[1])
                tempSyncChunksTags = tempSyncChunksTags + '{} '.format(vec[2])
                tempNETags = tempNETags + '{} '.format(vec[3])

        sentence['text'].append(tempSentence.strip())
        sentence['POS_tag'].append(tempPOSTags.strip())
        sentence['SynChunkTag'].append(tempSyncChunksTags.strip())
        sentence['NE_tag'].append(tempNETags.strip())

    return sentence
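
A minimal usage sketch (assumption: the CoNLL-2003 files sit under ./data/conll2003/, as the path logic above expects). loadConll returns a dict of four parallel lists, one entry per sentence:

corpus = loadConll('train.txt')
print(corpus['text'][0])      # first sentence as a plain string
print(corpus['NE_tag'][0])    # its space-separated named-entity tags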
Example #2
from conll import read_corpus_conll


def reconstructSentences(corpus_file):
    test_sents = read_corpus_conll(corpus_file)
    sentences_list = []
    for line in test_sents:
        string = ""
        for word in line:
            # The token is the first whitespace-separated field of the CoNLL line
            w = word[0].partition(' ')[0]
            string += " " + w
        if string != ' -DOCSTART-':  # Skip the document-boundary marker
            sentences_list.append(string)
    return sentences_list
Example #3
import conll


def import_dataset(path):
    data = conll.read_corpus_conll(path)
    text_dataset = []
    dataset = []
    for t in data:
        sentence = []
        txt = ""
        for t2 in t:
            fields = t2[0].split()
            if fields[0] != "-DOCSTART-":
                # Keep (token, NE tag) pairs and rebuild the plain-text sentence
                sentence.append((fields[0], fields[3]))
                txt += fields[0] + " "
        dataset.append(sentence)
        text_dataset.append([txt])
    return text_dataset, dataset
Example #4
from typing import Tuple

import conll
import spacy

# WhitespaceTokenizer, build_tuple_data_list and build_sentence_string are
# helper definitions from the same project (not shown in this snippet).


def extract_data(file_path: str) -> Tuple[list, list, list]:
    dataset = list()
    sentences = list()
    corpus = conll.read_corpus_conll(file_path)

    for sent in corpus:
        dataset.append(build_tuple_data_list(sent))

    # Filter into a new list instead of calling remove() while iterating,
    # which would skip the element right after each removed sentence
    dataset = [sent_tuples for sent_tuples in dataset
               if sent_tuples[0][0] != '-DOCSTART-']

    for sent_tuples in dataset:
        sentences.append(build_sentence_string(sent_tuples))

    nlp = spacy.load("en_core_web_sm")
    # I need this WhitespaceTokenizer, otherwise the spaCy and CoNLL tokenizations are not in sync
    nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)

    docs = list()
    for doc in nlp.pipe(sentences):
        docs.append(doc)

    ent_tag_converter = dict()
    ent_tag_converter["PERSON"] = "PER"
    ent_tag_converter["ORG"] = "ORG"
    ent_tag_converter["NORP"] = "ORG"
    ent_tag_converter["FAC"] = "LOC"
    ent_tag_converter["GPE"] = "LOC"
    ent_tag_converter["EVENT"] = "MISC"
    ent_tag_converter["WORK_OF_ART"] = "MISC"
    ent_tag_converter["LANGUAGE"] = "MISC"

    spacy_data = list()
    for i in range(len(dataset)):
        sentence = list()
        for j in range(len(dataset[i])):
            text = docs[i][j].text
            pos = docs[i][j].tag_
            chunk = ""  # TO DO

            if docs[i][j].ent_type_ in ent_tag_converter.keys():
                iob_tag = docs[i][j].ent_iob_ + "-" + ent_tag_converter[
                    docs[i][j].ent_type_]
            else:
                iob_tag = "O"
            sentence.append((text, pos, chunk, iob_tag))
        spacy_data.append(sentence)

    return docs, spacy_data, dataset
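
A minimal usage sketch (the path is only an example): unpack the three returned lists and flatten the spaCy-derived IOB tags, which sit at index 3 of each (text, pos, chunk, iob_tag) tuple:

docs, spacy_data, dataset = extract_data('./conll2003/test.txt')
predicted_tags = [token[3] for sent in spacy_data for token in sent]
print(len(docs), len(spacy_data), len(dataset))
print(spacy_data[0][:5])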
Example #5
from conll import read_corpus_conll


def groundTruthList(corpus_file):
    file = read_corpus_conll(corpus_file)
    gt_list1 = []  # Flat list of gold NE tags, one per token
    gt_list2 = []  # One list of (token, NE tag) pairs per sentence
    for sent in file:
        list1 = []
        for token_row in sent:
            frase = token_row[0]
            tupl1 = (frase.split()[0], frase.split()[3])
            list1.append(tupl1)
        if ('-DOCSTART-', 'O') not in list1:
            gt_list2.append(list1)

    for sent in file:
        for token_row in sent:
            frase = token_row[0]
            if frase.split()[0] != '-DOCSTART-':
                gt_list1.append(frase.split()[3])
    return gt_list1, gt_list2
Example #6

import spacy
import conll
import numpy as np
from spacy.training import Alignment
from spacy.tokens import Doc
from sklearn.metrics import classification_report
from pprint import pprint

# Loading the language model
nlp = spacy.load('en_core_web_sm')

## Reading training data
train = conll.read_corpus_conll('./conll2003/train.txt', ' ')
## Reading test data
test = conll.read_corpus_conll('./conll2003/test.txt', ' ')

## Dictionary for label conversion
## Keys: spaCy labels
## Values: CoNLL labels
labels = {
    'PERSON': 'PER',
    'NORP': 'MISC',
    'FAC': 'LOC',
    'ORG': 'ORG',
    'GPE': 'LOC',
    'LOC': 'LOC',
    'PRODUCT': 'MISC',
    'EVENT': 'MISC',
    'WORK_OF_ART': 'MISC',
    'LAW': 'MISC',
    'LANGUAGE': 'MISC',
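
Assuming the labels dictionary above is completed with the remaining spaCy entity types (the snippet stops mid-definition), a minimal sketch of applying the conversion to a parsed sentence; unmapped types fall back to 'O' and the example sentence is illustrative only:

doc = nlp("Germany beat Argentina in the World Cup final")
conll_iob = []
for token in doc:
    if token.ent_iob_ == 'O' or token.ent_type_ not in labels:
        conll_iob.append('O')
    else:
        conll_iob.append(token.ent_iob_ + '-' + labels[token.ent_type_])
print(list(zip([t.text for t in doc], conll_iob)))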
Example #7
Here I was curious about using the already tokenized text from the dataset (overriding spaCy's tokenizer).
spaCy's documentation reports that performance should decrease, since the tokenization may differ from the one the model expects; in this case the performance does decrease slightly, so spaCy's documentation is right.
Here are the results:
"""

from spacy.tokens import Doc


# Function that replaces spaCy's tokenizer: the input is already a list of
# tokens, so it is wrapped in a Doc without re-tokenizing
def get_tokens(sentence):
    return Doc(nlp.vocab, words=sentence)


nlp.tokenizer = get_tokens

data = conll.read_corpus_conll(test_path)
pred = []

for s in data:
    sentence = []
    for token in s:
        if (token[0].split()[0] != "-DOCSTART-"):
            sentence.append(token[0].split()[0])
    doc = nlp(sentence)  # The custom tokenizer accepts the list of tokens directly
    pred.append(reconstruct_output(doc))  # reconstruct_output is defined elsewhere (not shown here)

predicted = []
for sentence in pred:
    for token in sentence:
        predicted.append(token[1])
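
A minimal evaluation sketch (assumption: reconstruct_output yields one (token, tag) pair per input token, so the gold tags read back from the same file line up with predicted):

from sklearn.metrics import classification_report

gold = []
for s in data:
    for token in s:
        fields = token[0].split()
        if fields[0] != "-DOCSTART-":
            gold.append(fields[3])

print(classification_report(gold, predicted))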