Example #1
def train_identifier():
    # 1. prepare data
    print("-- Prepare Data")
    train_sentences = conll2002.iob_sents('esp.train')
    test_sentences = conll2002.iob_sents('esp.testa')
    # 2. extract features
    train_df, test_df = extract_ident_features(train_sentences, test_sentences)
    # 3. train
    clf = identifier.train(train_df, test_df)
    return clf
def print_clf_score_table():
    clf = pickle.load(open("..//models//clf1.p", "rb"))
    v = pickle.load(open("..//models//v1.p", "rb"))

    dev_sents = list(conll2002.iob_sents('esp.testb'))
    y_pred = predict(clf, v, dev_sents)
    y_true = corpus2labels(dev_sents)
    assert len(y_pred) == len(y_true)

    # classification_report expects (y_true, y_pred) in that order
    print(metrics.classification_report(y_true, y_pred))
def load_and_test_model():
    clf = pickle.load(open("..//models//clf1.p", "rb"))
    v = pickle.load(open("..//models//v1.p", "rb"))

    dev_sents = list(conll2002.iob_sents('ned.testa'))[0:20]

    y_pred = predict(clf, v, dev_sents)

    test_tokens = corpus2tokens(dev_sents)
    assert len(test_tokens) == len(y_pred)
    result = list(zip(test_tokens, y_pred))
    result = clean_result(result)
    print(result)
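The helpers `predict`, `corpus2labels`, `corpus2tokens`, and `clean_result` come from elsewhere in this project and are not shown. A minimal sketch of the two corpus helpers, assuming the (token, POS, IOB-label) triples that `conll2002.iob_sents` yields:

def corpus2labels(sents):
    # flatten the IOB label column across all sentences
    return [label for sent in sents for (token, pos, label) in sent]

def corpus2tokens(sents):
    # flatten the token column across all sentences
    return [token for sent in sents for (token, pos, label) in sent]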
Example #4
    """ The function generates all features
    for the word at position i in the
    sentence."""
    features = []
    # the window around the token
    for o in [-1,0,1]:
        if i+o >= 0 and i+o < len(sent):
            word = sent[i+o][0]
            featlist = getfeats(word, o)
            features.extend(featlist)
    
    return dict(features)

if __name__ == "__main__":
    # Load the training data
    train_sents = list(conll2002.iob_sents('esp.train'))
    dev_sents = list(conll2002.iob_sents('esp.testa'))
    test_sents = list(conll2002.iob_sents('esp.testb'))
    
    train_feats = []
    train_labels = []

    for sent in train_sents:
        for i in range(len(sent)):
            feats = word2features(sent, i)
            train_feats.append(feats)
            train_labels.append(sent[i][-1])

    vectorizer = DictVectorizer()
    X_train = vectorizer.fit_transform(train_feats)
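`getfeats` is not included in this snippet; because its result is passed to `features.extend` and then to `dict(features)`, it has to return a list of (name, value) pairs. A minimal sketch, with the offset-prefixed names and the particular features as assumptions:

def getfeats(word, o):
    # prefix every feature name with the window offset so the same
    # property at different positions maps to distinct dict keys
    o = str(o)
    return [
        (o + 'word', word),
        (o + 'is_upper', word.isupper()),
        (o + 'is_title', word.istitle()),
        (o + 'is_digit', word.isdigit()),
    ]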
Example #5
import pickle

from nltk.corpus import conll2002
from sklearn import metrics
from sklearn.feature_extraction import DictVectorizer


def word2features(sent, i):
    """The function generates all features
    for the word at position i in the
    sentence."""
    features = []
    # the window around the token
    for o in [-1, 0, 1]:
        if i + o >= 0 and i + o < len(sent):
            word = sent[i + o][0]
            featlist = getfeats(word, o)
            features.extend(featlist)

    return dict(features)


if __name__ == "__main__":
    # Load the training data
    train_sents = list(conll2002.iob_sents("esp.train"))
    dev_sents = list(conll2002.iob_sents("esp.testa"))
    test_sents = list(conll2002.iob_sents("esp.testb"))

    train_feats = []
    train_labels = []

    for sent in train_sents:
        for i in range(len(sent)):
            feats = word2features(sent, i)
            train_feats.append(feats)
            train_labels.append(sent[i][-1])

    vectorizer = DictVectorizer()
    X_train = vectorizer.fit_transform(train_feats)
def train_model():
    train_sents = list(conll2002.iob_sents('ned.train'))
    clf, v = train(train_sents)
    pickle.dump(clf, open("..//models//clf1.p", "wb"))
    pickle.dump(v, open("..//models//v1.p", "wb"))



#train_model()
#load_and_test_model()
#dev_sents = list(conll2002.iob_sents('esp.testa'))
#y_pred = load_and_predict(dev_sents)
#y_true = corpus2labels(dev_sents)
#print('len pred == len true: {}'.format(len(y_pred) == len(y_true)))
#print('B-PER:', get_3_stats2('B-PER', y_pred, y_true))
#print('I-PER:',  get_3_stats2('I-PER', y_pred, y_true))
#print('B-ORG:',  get_3_stats2('B-ORG', y_pred, y_true))
#print('I-ORG:',  get_3_stats2('I-ORG', y_pred, y_true))
#print('B-LOC:',  get_3_stats2('B-LOC', y_pred, y_true))
#print('I-LOC:',  get_3_stats2('I-LOC', y_pred, y_true))
#print('B-MISC:',  get_3_stats2('B-MISC', y_pred, y_true))
#print('I-MISC:',  get_3_stats2('I-MISC', y_pred, y_true))
#print('O:',  get_3_stats2('O', y_pred, y_true))
#print_clf_score_table()

clf = pickle.load(open("..//models//clf1.p", "rb"))
v = pickle.load(open("..//models//v1.p", "rb"))
test_sents = list(conll2002.iob_sents('esp.testb'))
y_pred = predict(clf, v, test_sents)
bad_tags_count, bad_seqs_count = q3_1_3.bad_seqs(y_pred)
print(bad_tags_count)
print(bad_seqs_count)
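`train`, `predict`, and `q3_1_3` are project-local modules and are not shown. A minimal sketch of `train`, consistent with the vectorize-then-fit pattern used elsewhere on this page (the choice of LogisticRegression is an assumption):

from sklearn.linear_model import LogisticRegression

def train(train_sents):
    feats, labels = [], []
    for sent in train_sents:
        for i in range(len(sent)):
            feats.append(word2features(sent, i))
            labels.append(sent[i][-1])
    v = DictVectorizer(sparse=True)
    X_train = v.fit_transform(feats)
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, labels)
    return clf, v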

Example #8
def sent2features(sent, model):
    return [word2features(sent, i, model) for i in range(len(sent))]


def sent2features2(sent):
    return [sent[i][0] for i in range(len(sent))]


def sent2labels(sent):
    return [label for token, postag, label in sent]


def sent2tokens(sent):
    return [token for token, postag, label in sent]


etr = conll2002.iob_sents('esp.train')  # In Spanish
eta = conll2002.iob_sents('esp.testa')  # In Spanish
etb = conll2002.iob_sents('esp.testb')  # In Spanish

dtr = conll2002.iob_sents('ned.train')  # In Dutch
dta = conll2002.iob_sents('ned.testa')  # In Dutch
dtb = conll2002.iob_sents('ned.testb')  # In Dutch

train_sents = etr
test_sents = etb

data = [sent2features2(s) for s in train_sents]
data = [item for sublist in data for item in sublist]

with open('file_of_text.txt', 'w') as f:
    for item in data:
        f.write(item + '\n')  # write one token per line
Example #9
from nltk.corpus import conll2002
sent_a = conll2002.iob_sents('esp.testa')
sent_b = conll2002.iob_sents('esp.testb')
for sent in sent_a:
    for word_tuple in sent:
        if len(word_tuple) != 3:
            print("Weird Tuple: " + str(word_tuple))

for sent in sent_b:
    for word_tuple in sent:
        if len(word_tuple) != 3:
            print("Weird Tuple: " + str(word_tuple))

print("done tuple check")
Example #10
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]


def sent2labels(sent):
    return [label for token, postag, label in sent]


def sent2tokens(sent):
    return [token for token, postag, label in sent]


if __name__ == "__main__":
    # Load the training data
    train_sents = list(conll2002.iob_sents('esp.train'))  # 8323
    dev_sents = list(conll2002.iob_sents('esp.testa'))  # 1915
    test_sents = list(conll2002.iob_sents('esp.testb'))

    ### Use the following code when experimenting with sklearn models ###
    '''
    train_feats = []
    train_labels = []
    for sent in train_sents:
        for i in range(len(sent)):
            feats = word2features(sent,i)
            train_feats.append(feats)
            train_labels.append(sent[i][-1])

    vectorizer = DictVectorizer()
    X_train = vectorizer.fit_transform(train_feats)
    '''
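A hypothetical continuation of the commented-out experiment, fitting a classifier on the vectorized features (LogisticRegression is an assumption, not taken from the snippet):

    from sklearn.linear_model import LogisticRegression

    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, train_labels)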
Example #11
from nltk.corpus import conll2002
import string, nltk
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics
from sklearn.feature_selection import RFE

train_sents = list(conll2002.iob_sents('esp.train'))
test_sents = list(conll2002.iob_sents('esp.testa'))
d_train_sents = list(conll2002.iob_sents('ned.train'))
d_test_sents = list(conll2002.iob_sents('ned.testa'))
v = DictVectorizer(sparse=True)
    
def hasNumbers(s):
    return any(c.isdigit() for c in s)

def get_word_features (word):
    w = word[0]
    features = {
     "form": w,
     "pos": word[1],
     "is_number": w.isdigit(),
     "contains_number": hasNumbers(w),
     "beginCapital": w[0].isupper(),
     "allCaps": w.isupper(),
     "isPunc": w in string.punctuation,
     "firstLetter": w[0],
     "first2Letters": w[0:2],
     "first3Letters": w[0:3],
     "lastLetter": w[-1],
     "last2Letters": w[-2:],
Example #12
def corpus_reader(corpus_path, tag='bio'):
    """ corpus relative path (str) -> list of iob sents """
    if tag == 'bio':  # training dataset
        return reader.iob_sents(os.path.abspath(corpus_path))
    if tag == 'pos':  # test dataset
        return reader.tagged_sents(os.path.abspath(corpus_path))
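`reader` and `os` are defined outside this snippet. One plausible setup, mirroring how nltk configures its own conll2002 corpus; the root directory, file pattern, and column types here are assumptions:

import os
from nltk.corpus.reader.conll import ConllCorpusReader

# hypothetical reader over CoNLL-style files with three columns per row:
# word, POS tag, IOB named-entity tag
reader = ConllCorpusReader('data/', r'.*\.txt', ('words', 'pos', 'ne'))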
Example #13
from nltk.corpus import conll2002
from ner import generate_features_and_labels
dev_sents = list(conll2002.iob_sents('esp.testa'))

feats, labels = generate_features_and_labels(dev_sents)

with open("results-mlp-lbfgs-200-0pos-prop.txt", "r") as f:
    line_count = 0
    wrong_count = 0
    for line in f:
        word, gold, pred = line.split()
        if gold != pred:
            print(line + str(feats[line_count]))
            print()
            wrong_count += 1
        line_count += 1

print("\nTotal number wrong: %d" % wrong_count)

Example #15
from collections import Counter
from nltk.corpus import conll2002

# used by the top Spanish model

train_sents = list(conll2002.iob_sents('esp.train'))
occurrences = Counter()
for sent in train_sents:
    for i in range(len(sent)):
        word = sent[i][0]
        if len(word) == 4:
            occurrences.update([word])
        if len(word) > 4:
            occurrences.update([word[-4:]])
            occurrences.update([word[:4]])


final_counts = {x: occurrences[x] for x in occurrences if occurrences[x] >= 100}
with open('affixes.txt', 'w') as affixes:
    for k in final_counts.keys():
        affixes.write(k + "\n")
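A hypothetical follow-up showing how the written affix list could be consumed by a feature extractor (the feature wiring is an assumption):

with open('affixes.txt') as f:
    common_affixes = set(line.strip() for line in f)

# e.g., inside a per-word feature function:
# features['prefix4_common'] = word[:4] in common_affixes
# features['suffix4_common'] = word[-4:] in common_affixes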
Example #16
from nltk.corpus import conll2002
from collections import Counter


def sent2labels(sent):
    return [label for token, postag, label in sent]


def sent2tokens(sent):
    return [token for token, postag, label in sent]


train_sents = list(conll2002.iob_sents('ned.train'))

PER_count = Counter()
for sent in train_sents:
    labels = sent2labels(sent)
    tokens = sent2tokens(sent)
    if 'B-PER' in labels:
        PER_count.update(tokens)

PER_words = [word for word, _ in PER_count.most_common(1000)]
#print('PER implying words')
#print(PER_count.most_common(100))

LOC_count = Counter()
for sent in train_sents:
    labels = sent2labels(sent)
    tokens = sent2tokens(sent)
    if 'B-LOC' in labels:
        LOC_count.update(tokens)
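The snippet cuts off here; by symmetry with PER_words above, the presumable next line is:

LOC_words = [word for word, _ in LOC_count.most_common(1000)]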