import os

import conll


def loadConll(conllFile):
    """Read a CoNLL-2003 file and return parallel per-sentence strings of tokens and tags."""
    pathToConllFile = conllFile
    if os.sep not in conllFile:  # Bare file name: resolve it inside data/conll2003
        pathToConllFile = os.getcwd() + os.sep + 'data' + os.sep \
            + 'conll2003' + os.sep + conllFile
    corpus = conll.read_corpus_conll(pathToConllFile)
    corpus = corpus[1:]  # Removes the leading -DOCSTART- line
    print('Elements in corpus: {}'.format(len(corpus)))

    sentence = {'text': [], 'POS_tag': [], 'SynChunkTag': [], 'NE_tag': []}
    for _list in corpus:
        tempSentence = ''
        tempPOSTags = ''
        tempSyncChunksTags = ''
        tempNETags = ''
        for vec in _list:
            vec = vec[0].split(' ')
            if '-DOCSTART-' not in vec:
                tempSentence = tempSentence + '{} '.format(vec[0])
                tempPOSTags = tempPOSTags + '{} '.format(vec[1])
                tempSyncChunksTags = tempSyncChunksTags + '{} '.format(vec[2])
                tempNETags = tempNETags + '{} '.format(vec[3])
        sentence['text'].append(tempSentence.strip())
        sentence['POS_tag'].append(tempPOSTags.strip())
        sentence['SynChunkTag'].append(tempSyncChunksTags.strip())
        sentence['NE_tag'].append(tempNETags.strip())
    return sentence
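# Minimal usage sketch (the file name 'train.txt' and the data/conll2003 layout
# are assumptions): the four lists returned are parallel, one entry per sentence,
# so they can be zipped together for inspection.
trainData = loadConll('train.txt')
print(len(trainData['text']))    # number of sentences
print(trainData['text'][0])      # first sentence as a whitespace-joined string
print(trainData['NE_tag'][0])    # its aligned NE tags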
from conll import read_corpus_conll


def reconstructSentences(corpus_file):
    """Rebuild whitespace-joined sentence strings from a CoNLL corpus file."""
    test_sents = read_corpus_conll(corpus_file)
    sentences_list = []
    for line in test_sents:
        string = ""
        for word in line:
            w = word[0].partition(' ')[0]  # the token is the first field of the row
            string += " "
            string = string + w
        if string != ' -DOCSTART-':  # skip document-boundary markers
            sentences_list.append(string)
    return sentences_list
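# Sketch of how the reconstructed strings might be processed in batch; it assumes
# an en_core_web_sm pipeline is already loaded as nlp and that the path below
# points to one of the CoNLL-2003 splits.
sentences = reconstructSentences('data/conll2003/test.txt')
spacy_ents = []
for doc in nlp.pipe(sentences):
    spacy_ents.append([(ent.text, ent.label_) for ent in doc.ents])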
def import_dataset(path):
    """Return plain-text sentences and (token, NE tag) tuples from a CoNLL file."""
    data = conll.read_corpus_conll(path)
    text_dataset = []
    dataset = []
    for t in data:
        sentence = []
        txt = ""
        for t2 in t:
            fields = t2[0].split()
            if fields[0] != "-DOCSTART-":
                sentence.append((fields[0], fields[3]))  # (token, NE tag)
                txt += fields[0] + " "
        dataset.append(sentence)
        text_dataset.append([txt])
    return text_dataset, dataset
def extract_data(file_path: str) -> (list, list, list):
    dataset = list()
    sentences = list()
    corpus = conll.read_corpus_conll(file_path)
    for sent in corpus:
        dataset.append(build_tuple_data_list(sent))

    # Drop the -DOCSTART- pseudo-sentences (filtering instead of removing while
    # iterating, which would skip elements)
    dataset = [sent_tuples for sent_tuples in dataset
               if sent_tuples[0][0] != '-DOCSTART-']

    for sent_tuples in dataset:
        sentences.append(build_sentence_string(sent_tuples))

    nlp = spacy.load("en_core_web_sm")
    # The WhitespaceTokenizer keeps spaCy's tokens aligned with the CoNLL ones
    nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)

    docs = list()
    for doc in nlp.pipe(sentences):
        docs.append(doc)

    # Mapping from spaCy entity labels to the CoNLL-2003 tag set
    ent_tag_converter = dict()
    ent_tag_converter["PERSON"] = "PER"
    ent_tag_converter["ORG"] = "ORG"
    ent_tag_converter["NORP"] = "ORG"
    ent_tag_converter["FAC"] = "LOC"
    ent_tag_converter["GPE"] = "LOC"
    ent_tag_converter["EVENT"] = "MISC"
    ent_tag_converter["WORK_OF_ART"] = "MISC"
    ent_tag_converter["LANGUAGE"] = "MISC"

    spacy_data = list()
    for i in range(len(dataset)):
        sentence = list()
        for j in range(len(dataset[i])):
            text = docs[i][j].text
            pos = docs[i][j].tag_
            chunk = ""  # TODO: syntactic chunk tag not extracted
            if docs[i][j].ent_type_ in ent_tag_converter:
                iob_tag = docs[i][j].ent_iob_ + "-" + ent_tag_converter[docs[i][j].ent_type_]
            else:
                iob_tag = "O"
            sentence.append((text, pos, chunk, iob_tag))
        spacy_data.append(sentence)
    return docs, spacy_data, dataset
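# Hedged evaluation sketch: it assumes build_tuple_data_list returns (token,
# iob_tag) pairs per sentence, that the test split sits at the path below, and
# that the accompanying conll module exposes an evaluate(ref, hyp) helper giving
# chunk-level precision/recall/F1 (adjust if its name or signature differs; a
# token-level sklearn classification_report is an alternative).
import pandas as pd

docs, spacy_data, dataset = extract_data('data/conll2003/test.txt')
hyp = [[(text, iob) for text, pos, chunk, iob in sent] for sent in spacy_data]
results = conll.evaluate(dataset, hyp)
print(pd.DataFrame.from_dict(results, orient='index').round(3))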
def groundTruthList(corpus_file):
    """Return the CoNLL ground truth as a flat tag list and as per-sentence (token, tag) lists."""
    file = read_corpus_conll(corpus_file)
    gt_list1 = []  # flat list of NE tags, one per token
    gt_list2 = []  # per-sentence lists of (token, NE tag) tuples
    for sent in file:
        list1 = []
        for row in sent:
            frase = row[0]
            tupl1 = (frase.split()[0], frase.split()[3])
            list1.append(tupl1)
        if ('-DOCSTART-', 'O') not in list1:
            gt_list2.append(list1)
    for sent in file:
        for row in sent:
            frase = row[0]
            if frase.split()[0] != '-DOCSTART-':
                gt_list1.append(frase.split()[3])
    return gt_list1, gt_list2
import spacy
import conll
import numpy as np
from spacy.training import Alignment
from spacy.tokens import Doc
from sklearn.metrics import classification_report
from pprint import pprint

# Loading the language model
nlp = spacy.load('en_core_web_sm')

## Reading training data
train = conll.read_corpus_conll('./conll2003/train.txt', ' ')
## Reading test data
test = conll.read_corpus_conll('./conll2003/test.txt', ' ')

## Dictionary for labels conversion
## Keys: spaCy labels
## Values: CoNLL labels
labels = {
    'PERSON': 'PER',
    'NORP': 'MISC',
    'FAC': 'LOC',
    'ORG': 'ORG',
    'GPE': 'LOC',
    'LOC': 'LOC',
    'PRODUCT': 'MISC',
    'EVENT': 'MISC',
    'WORK_OF_ART': 'MISC',
    'LAW': 'MISC',
    'LANGUAGE': 'MISC',
Here I was curious about using the already tokenized text from the dataset (overriding spaCy's tokenizer). spaCy's documentation reports that performance should decrease when the tokenization differs from the one the model expects; in this case the performance does decrease slightly, so the documentation is confirmed. Here are the results:
"""

from spacy.tokens import Doc


# Function that replaces spaCy's tokenizer: it wraps an already tokenized
# sentence (a list of strings) in a Doc
def get_tokens(sentence):
    return Doc(nlp.vocab, sentence)


nlp.tokenizer = get_tokens

data = conll.read_corpus_conll(test_path)
pred = []
for s in data:
    sentence = []
    for token in s:
        if token[0].split()[0] != "-DOCSTART-":
            sentence.append(token[0].split()[0])
    doc = nlp(sentence)  # the custom tokenizer receives the token list directly
    pred.append(reconstruct_output(doc))

predicted = []
for sentence in pred:
    for token in sentence:
        predicted.append(token[1])
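# Token-level scoring sketch: it assumes reconstruct_output returns (token,
# iob_tag) pairs (so `predicted` is a flat list of tags) and that groundTruthList
# defined above is called on the same test file, which keeps the two lists
# aligned one tag per token.
from sklearn.metrics import classification_report

gt_flat, _ = groundTruthList(test_path)
print(classification_report(gt_flat, predicted, zero_division=0))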