import os

import conll


def loadConll(conllFile):
    """Read a CoNLL-2003 file and return parallel per-sentence strings of tokens and tags."""
    pathToConllFile = conllFile
    if os.sep not in conllFile:  # Bare file name: resolve it inside data/conll2003
        pathToConllFile = os.getcwd() + os.sep + 'data' + os.sep \
            + 'conll2003' + os.sep + conllFile
    corpus = conll.read_corpus_conll(pathToConllFile)
    corpus = corpus[1:]  # Removes the leading -DOCSTART- line
    print('Elements in corpus: {}'.format(len(corpus)))

    sentence = {'text': [], 'POS_tag': [], 'SynChunkTag': [], 'NE_tag': []}
    for _list in corpus:
        tempSentence = ''
        tempPOSTags = ''
        tempSyncChunksTags = ''
        tempNETags = ''
        for vec in _list:
            vec = vec[0].split(' ')
            if '-DOCSTART-' not in vec:
                tempSentence = tempSentence + '{} '.format(vec[0])
                tempPOSTags = tempPOSTags + '{} '.format(vec[1])
                tempSyncChunksTags = tempSyncChunksTags + '{} '.format(vec[2])
                tempNETags = tempNETags + '{} '.format(vec[3])
        sentence['text'].append(tempSentence.strip())
        sentence['POS_tag'].append(tempPOSTags.strip())
        sentence['SynChunkTag'].append(tempSyncChunksTags.strip())
        sentence['NE_tag'].append(tempNETags.strip())
    return sentence
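# Minimal usage sketch (the file name 'train.txt' and the data/conll2003 layout
# are assumptions): the four lists returned are parallel, one entry per sentence,
# so they can be zipped together for inspection.
trainData = loadConll('train.txt')
print(len(trainData['text']))    # number of sentences
print(trainData['text'][0])      # first sentence as a whitespace-joined string
print(trainData['NE_tag'][0])    # its aligned NE tags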
from conll import read_corpus_conll


def reconstructSentences(corpus_file):
    """Rebuild whitespace-joined sentence strings from a CoNLL corpus file."""
    test_sents = read_corpus_conll(corpus_file)
    sentences_list = []
    for line in test_sents:
        string = ""
        for word in line:
            w = word[0].partition(' ')[0]  # the token is the first field of the row
            string += " "
            string = string + w
        if string != ' -DOCSTART-':  # skip document-boundary markers
            sentences_list.append(string)
    return sentences_list
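# Sketch of how the reconstructed strings might be processed in batch; it assumes
# an en_core_web_sm pipeline is already loaded as nlp and that the path below
# points to one of the CoNLL-2003 splits.
sentences = reconstructSentences('data/conll2003/test.txt')
spacy_ents = []
for doc in nlp.pipe(sentences):
    spacy_ents.append([(ent.text, ent.label_) for ent in doc.ents])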
def import_dataset(path):
    """Return plain-text sentences and (token, NE tag) tuples from a CoNLL file."""
    data = conll.read_corpus_conll(path)
    text_dataset = []
    dataset = []
    for t in data:
        sentence = []
        txt = ""
        for t2 in t:
            fields = t2[0].split()
            if fields[0] != "-DOCSTART-":
                sentence.append((fields[0], fields[3]))  # (token, NE tag)
                txt += fields[0] + " "
        dataset.append(sentence)
        text_dataset.append([txt])
    return text_dataset, dataset
def extract_data(file_path: str) -> (list, list, list):
    dataset = list()
    sentences = list()
    corpus = conll.read_corpus_conll(file_path)
    for sent in corpus:
        dataset.append(build_tuple_data_list(sent))

    # Drop the -DOCSTART- pseudo-sentences (filtering instead of removing while
    # iterating, which would skip elements)
    dataset = [sent_tuples for sent_tuples in dataset
               if sent_tuples[0][0] != '-DOCSTART-']

    for sent_tuples in dataset:
        sentences.append(build_sentence_string(sent_tuples))

    nlp = spacy.load("en_core_web_sm")
    # The WhitespaceTokenizer keeps spaCy's tokens aligned with the CoNLL ones
    nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)

    docs = list()
    for doc in nlp.pipe(sentences):
        docs.append(doc)

    # Mapping from spaCy entity labels to the CoNLL-2003 tag set
    ent_tag_converter = dict()
    ent_tag_converter["PERSON"] = "PER"
    ent_tag_converter["ORG"] = "ORG"
    ent_tag_converter["NORP"] = "ORG"
    ent_tag_converter["FAC"] = "LOC"
    ent_tag_converter["GPE"] = "LOC"
    ent_tag_converter["EVENT"] = "MISC"
    ent_tag_converter["WORK_OF_ART"] = "MISC"
    ent_tag_converter["LANGUAGE"] = "MISC"

    spacy_data = list()
    for i in range(len(dataset)):
        sentence = list()
        for j in range(len(dataset[i])):
            text = docs[i][j].text
            pos = docs[i][j].tag_
            chunk = ""  # TODO: syntactic chunk tag not extracted
            if docs[i][j].ent_type_ in ent_tag_converter:
                iob_tag = docs[i][j].ent_iob_ + "-" + ent_tag_converter[docs[i][j].ent_type_]
            else:
                iob_tag = "O"
            sentence.append((text, pos, chunk, iob_tag))
        spacy_data.append(sentence)
    return docs, spacy_data, dataset
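# Hedged evaluation sketch: it assumes build_tuple_data_list returns (token,
# iob_tag) pairs per sentence, that the test split sits at the path below, and
# that the accompanying conll module exposes an evaluate(ref, hyp) helper giving
# chunk-level precision/recall/F1 (adjust if its name or signature differs; a
# token-level sklearn classification_report is an alternative).
import pandas as pd

docs, spacy_data, dataset = extract_data('data/conll2003/test.txt')
hyp = [[(text, iob) for text, pos, chunk, iob in sent] for sent in spacy_data]
results = conll.evaluate(dataset, hyp)
print(pd.DataFrame.from_dict(results, orient='index').round(3))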
def groundTruthList(corpus_file):
    """Return the CoNLL ground truth as a flat tag list and as per-sentence (token, tag) lists."""
    file = read_corpus_conll(corpus_file)
    gt_list1 = []  # flat list of NE tags, one per token
    gt_list2 = []  # per-sentence lists of (token, NE tag) tuples
    for sent in file:
        list1 = []
        for row in sent:
            frase = row[0]
            tupl1 = (frase.split()[0], frase.split()[3])
            list1.append(tupl1)
        if ('-DOCSTART-', 'O') not in list1:
            gt_list2.append(list1)
    for sent in file:
        for row in sent:
            frase = row[0]
            if frase.split()[0] != '-DOCSTART-':
                gt_list1.append(frase.split()[3])
    return gt_list1, gt_list2
import spacy
import conll
import numpy as np
from spacy.training import Alignment
from spacy.tokens import Doc
from sklearn.metrics import classification_report
from pprint import pprint

# Loading the language model
nlp = spacy.load('en_core_web_sm')

## Reading training data
train = conll.read_corpus_conll('./conll2003/train.txt', ' ')
## Reading test data
test = conll.read_corpus_conll('./conll2003/test.txt', ' ')

## Dictionary for labels conversion
## Keys: spaCy labels
## Values: CoNLL labels
labels = {
    'PERSON': 'PER',
    'NORP': 'MISC',
    'FAC': 'LOC',
    'ORG': 'ORG',
    'GPE': 'LOC',
    'LOC': 'LOC',
    'PRODUCT': 'MISC',
    'EVENT': 'MISC',
    'WORK_OF_ART': 'MISC',
    'LAW': 'MISC',
    'LANGUAGE': 'MISC',
Here I was curious about using the already tokenized text from the dataset (overriding spaCy's tokenizer). spaCy's documentation reports that performance should decrease when the tokenization differs from the one the model expects; in this case the performance does decrease slightly, so the documentation is confirmed. Here are the results:
"""

from spacy.tokens import Doc


# Function that replaces spaCy's tokenizer: it wraps an already tokenized
# sentence (a list of strings) in a Doc
def get_tokens(sentence):
    return Doc(nlp.vocab, sentence)


nlp.tokenizer = get_tokens

data = conll.read_corpus_conll(test_path)
pred = []
for s in data:
    sentence = []
    for token in s:
        if token[0].split()[0] != "-DOCSTART-":
            sentence.append(token[0].split()[0])
    doc = nlp(sentence)  # the custom tokenizer receives the token list directly
    pred.append(reconstruct_output(doc))

predicted = []
for sentence in pred:
    for token in sentence:
        predicted.append(token[1])
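# Token-level scoring sketch: it assumes reconstruct_output returns (token,
# iob_tag) pairs (so `predicted` is a flat list of tags) and that groundTruthList
# defined above is called on the same test file, which keeps the two lists
# aligned one tag per token.
from sklearn.metrics import classification_report

gt_flat, _ = groundTruthList(test_path)
print(classification_report(gt_flat, predicted, zero_division=0))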