def train_identifier():
    """Prepare the CoNLL-2002 Spanish splits, extract identification
    features, and train the entity identifier.

    Fix: the progress message used Python 2 ``print`` statement syntax,
    which is a SyntaxError under Python 3 (every other block in this file
    uses the ``print()`` function).
    """
    # 1. prepare data
    print("-- Prepare Data")
    train_sentences = conll2002.iob_sents('esp.train')
    test_sentences = conll2002.iob_sents('esp.testa')
    # 2. extract features
    train_df, test_df = extract_ident_features(train_sentences, test_sentences)
    # 3. train
    clf = identifier.train(train_df, test_df)
def print_clf_score_table():
    """Load the pickled classifier and vectorizer, tag the Spanish test-b
    split, and print a per-label precision/recall/F1 table.

    Fixes:
    * sklearn's ``classification_report`` signature is ``(y_true, y_pred)``;
      the original call passed them reversed, which swaps the precision and
      recall columns in the printed table.
    * the pickle files are now opened with context managers so the handles
      are reliably closed.
    """
    with open("..//models//clf1.p", "rb") as clf_file:
        clf = pickle.load(clf_file)
    with open("..//models//v1.p", "rb") as v_file:
        v = pickle.load(v_file)
    dev_sents = list(conll2002.iob_sents('esp.testb'))
    y_pred = predict(clf, v, dev_sents)
    y_true = corpus2labels(dev_sents)
    assert len(y_pred) == len(y_true)
    print(metrics.classification_report(y_true, y_pred))
def load_and_test_model():
    """Reload the pickled classifier/vectorizer, predict labels for the
    first 20 Dutch dev-a sentences, and print the cleaned
    (token, prediction) pairs."""
    classifier = pickle.load(open("..//models//clf1.p", "rb"))
    vectorizer = pickle.load(open("..//models//v1.p", "rb"))
    sample = list(conll2002.iob_sents('ned.testa'))[0:20]
    predictions = predict(classifier, vectorizer, sample)
    tokens = corpus2tokens(sample)
    assert len(tokens) == len(predictions)
    paired = clean_result(list(zip(tokens, predictions)))
    print(paired)
""" The function generates all features for the word at position i in the sentence.""" features = [] # the window around the token for o in [-1,0,1]: if i+o >= 0 and i+o < len(sent): word = sent[i+o][0] featlist = getfeats(word, o) features.extend(featlist) return dict(features) if __name__ == "__main__": # Load the training data train_sents = list(conll2002.iob_sents('esp.train')) dev_sents = list(conll2002.iob_sents('esp.testa')) test_sents = list(conll2002.iob_sents('esp.testb')) train_feats = [] train_labels = [] for sent in train_sents: for i in range(len(sent)): feats = word2features(sent,i) train_feats.append(feats) train_labels.append(sent[i][-1]) vectorizer = DictVectorizer() X_train = vectorizer.fit_transform(train_feats)
# NOTE(review): this chunk starts mid-docstring -- the `def` header and the
# opening quotes of the docstring are cut off; the body duplicates the
# feature extractor seen elsewhere in this file.
for the word at position i in the sentence."""
features = []
# the window around the token
for o in [-1, 0, 1]:
    if i + o >= 0 and i + o < len(sent):
        word = sent[i + o][0]
        featlist = getfeats(word, o)
        features.extend(featlist)
return dict(features)

if __name__ == "__main__":
    # Load the training data
    train_sents = list(conll2002.iob_sents("esp.train"))
    dev_sents = list(conll2002.iob_sents("esp.testa"))
    test_sents = list(conll2002.iob_sents("esp.testb"))

    train_feats = []
    train_labels = []

    # One feature dict + gold label (last tuple element) per training token.
    for sent in train_sents:
        for i in range(len(sent)):
            feats = word2features(sent, i)
            train_feats.append(feats)
            train_labels.append(sent[i][-1])

    # Vectorize the feature dicts.
    # NOTE(review): the chunk appears to continue past the end of the
    # visible source.
    vectorizer = DictVectorizer()
    X_train = vectorizer.fit_transform(train_feats)
def train_model():
    """Train a NER model on the Dutch CoNLL-2002 training set and persist
    the classifier and vectorizer with pickle.

    Fix: the original passed bare ``open(...)`` handles directly to
    ``pickle.dump``, leaking the file objects; context managers guarantee
    the files are flushed and closed.
    """
    train_sents = list(conll2002.iob_sents('ned.train'))
    clf, v = train(train_sents)
    with open("..//models//clf1.p", "wb") as clf_out:
        pickle.dump(clf, clf_out)
    with open("..//models//v1.p", "wb") as v_out:
        pickle.dump(v, v_out)
print(metrics.classification_report(y_pred, y_true)) #train_model() #load_and_test_model() #dev_sents = list(conll2002.iob_sents('esp.testa')) #y_pred = load_and_predict(dev_sents) #y_true = corpus2labels(dev_sents) #print('len pred == len true: {}'.format(len(y_pred) == len(y_true))) #print('B-PER:', get_3_stats2('B-PER', y_pred, y_true)) #print('I-PER:', get_3_stats2('I-PER', y_pred, y_true)) #print('B-ORG:', get_3_stats2('B-ORG', y_pred, y_true)) #print('I-ORG:', get_3_stats2('I-ORG', y_pred, y_true)) #print('B-LOC:', get_3_stats2('B-LOC', y_pred, y_true)) #print('I-LOC:', get_3_stats2('I-LOC', y_pred, y_true)) #print('B-MISC:', get_3_stats2('B-MISC', y_pred, y_true)) #print('I-MISC:', get_3_stats2('I-MISC', y_pred, y_true)) #print('O:', get_3_stats2('O', y_pred, y_true)) #print_clf_score_table() clf = pickle.load( open( "..//models//clf1.p", "rb" ) ) v = pickle.load( open( "..//models//v1.p", "rb" ) ) test_sents = list(conll2002.iob_sents('esp.testb')) y_pred = predict(clf, v, test_sents) bad_tags_count, bad_seqs_count = q3_1_3.bad_seqs(y_pred) print(bad_tags_count) print(bad_seqs_count)
# NOTE(review): this chunk starts mid-function -- the `def` header of the
# sent2features variant that takes a `model` argument is cut off above.
    return [word2features(sent, i, model) for i in range(len(sent))]

def sent2features2(sent):
    # Surface tokens only (first element of each IOB triple).
    return [sent[i][0] for i in range(len(sent))]

def sent2labels(sent):
    # IOB label is the last element of each (token, postag, label) triple.
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

# CoNLL-2002 splits: esp.* are Spanish, ned.* are Dutch.
etr = conll2002.iob_sents('esp.train') # In Spanish
eta = conll2002.iob_sents('esp.testa') # In Spanish
etb = conll2002.iob_sents('esp.testb') # In Spanish
dtr = conll2002.iob_sents('ned.train') # In Dutch
dta = conll2002.iob_sents('ned.testa') # In Dutch
dtb = conll2002.iob_sents('ned.testb') # In Dutch

train_sents = etr
test_sents = etb

# Flatten the per-sentence token lists into one token list.
data = [sent2features2(s) for s in train_sents]
data = [item for sublist in data for item in sublist]

# NOTE(review): the chunk ends mid-statement -- the body of this `for`
# loop is cut off below the visible source.
with open('file_of_text.txt', 'w') as f:
    for item in data:
from nltk.corpus import conll2002


def _report_weird_tuples(sentences):
    """Print every IOB entry that is not a (token, pos, label) triple.

    Extracted to remove the verbatim loop duplicated for both splits in
    the original script; output is unchanged.
    """
    for sent in sentences:
        for word_tuple in sent:
            if len(word_tuple) != 3:
                print("Weird Tuple: " + str(word_tuple))


# Sanity-check both Spanish evaluation splits for malformed IOB tuples.
_report_weird_tuples(conll2002.iob_sents('esp.testa'))
_report_weird_tuples(conll2002.iob_sents('esp.testb'))
print("done tuple check")
def sent2features(sent):
    # One feature dict per token position in the sentence.
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    # IOB label is the last element of each (token, postag, label) triple.
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

if __name__ == "__main__":
    # Load the training data
    train_sents = list(conll2002.iob_sents('esp.train')) # 8323
    dev_sents = list(conll2002.iob_sents('esp.testa')) # 1915
    test_sents = list(conll2002.iob_sents('esp.testb'))

    ### Use the following code when experimenting with sklearn models ###
    # NOTE(review): the triple-quoted block below is opened but never closed
    # within this chunk -- it continues past the end of the visible source.
    '''
    train_feats = []
    train_labels = []

    for sent in train_sents:
        for i in range(len(sent)):
            feats = word2features(sent,i)
            train_feats.append(feats)
            train_labels.append(sent[i][-1])

    vectorizer = DictVectorizer()
    X_train = vectorizer.fit_transform(train_feats)
from nltk.corpus import conll2002
import string, nltk
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics
from sklearn.feature_selection import RFE

# CoNLL-2002 NER splits: Spanish train/dev-a and Dutch train/dev-a.
train_sents = list(conll2002.iob_sents('esp.train'))
test_sents = list(conll2002.iob_sents('esp.testa'))
d_train_sents = list(conll2002.iob_sents('ned.train'))
d_test_sents = list(conll2002.iob_sents('ned.testa'))

v = DictVectorizer(sparse=True)

def hasNumbers(str):
    # True if any character of the argument is a decimal digit.
    # NOTE(review): the parameter name shadows the builtin `str`.
    return any(c.isdigit() for c in str)

def get_word_features (word):
    # Build orthographic features for one corpus entry; word[0] is the
    # surface form, word[1] the POS tag.
    w = word[0]
    features = {
        "form": w,
        "pos": word[1],
        "is_number": w.isdigit(),
        "contains_number": hasNumbers(w),
        "beginCapital": w[0].isupper(),
        "allCaps": w.isupper(),
        "isPunc": w in string.punctuation,
        "firstLetter": w[0],
        "first2Letters": w[0:2],
        "first3Letters": w[0:3],
        "lastLetter": w[-1],
        # NOTE(review): the dict literal is cut off here -- the remaining
        # entries and the closing brace continue past the visible source.
        "last2Letters": w[-2:],
def corpus_reader(corpus_path, tag='bio'):
    """Read a corpus file by relative path.

    'bio' selects the IOB-sentence reader (training data), 'pos' the
    POS-tagged-sentence reader (test data); any other tag yields None.
    """
    loaders = {
        'bio': reader.iob_sents,      # training dataset
        'pos': reader.tagged_sents,   # test dataset
    }
    load = loaders.get(tag)
    if load is not None:
        return load(os.path.abspath(corpus_path))
from nltk.corpus import conll2002
from ner import generate_features_and_labels

# Feature dicts for the Spanish dev-a split, aligned line-for-line with
# the results file below.
dev_sents = list(conll2002.iob_sents('esp.testa'))
feats, labels = generate_features_and_labels(dev_sents)

mistakes = 0
with open("results-mlp-lbfgs-200-0pos-prop.txt", "r") as results_file:
    # Each row is "token gold predicted"; print the feature dict for
    # every misclassified token.
    for idx, row in enumerate(results_file):
        token, gold_tag, predicted_tag = row.split()
        if gold_tag != predicted_tag:
            print(row + str(feats[idx]))
            print()
            mistakes += 1

print("\nTotal number wrong: %d" % mistakes)
def corpus_reader(corpus_path, tag='bio'):
    """Load a corpus file given its relative path.

    tag == 'bio' -> IOB sentences (training data);
    tag == 'pos' -> POS-tagged sentences (test data);
    anything else -> None.
    """
    if tag == 'bio':
        # training dataset
        return reader.iob_sents(os.path.abspath(corpus_path))
    elif tag == 'pos':
        # test dataset
        return reader.tagged_sents(os.path.abspath(corpus_path))
from collections import Counter
from nltk.corpus import conll2002

# used by the top spanish model
train_sents = list(conll2002.iob_sents('esp.train'))

# Count 4-character words, plus the 4-character prefix and suffix of every
# longer word, across the Spanish training corpus.
occurrences = Counter()
for sent in train_sents:
    for entry in sent:
        word = entry[0]
        if len(word) == 4:
            occurrences[word] += 1
        elif len(word) > 4:
            occurrences[word[-4:]] += 1
            occurrences[word[:4]] += 1

# Keep only affixes seen at least 100 times.
final_counts = {affix: count for affix, count in occurrences.items() if count >= 100}

# One affix per line.
with open('affixes.txt', 'w') as affixes:
    for affix in final_counts:
        affixes.write(affix + "\n")
from nltk.corpus import conll2002
from collections import Counter


def sent2labels(sent):
    """Return the IOB label (last element) of every triple in the sentence."""
    return [lab for _tok, _pos, lab in sent]


def sent2tokens(sent):
    """Return the surface token (first element) of every triple in the sentence."""
    return [tok for tok, _pos, _lab in sent]


train_sents = list(conll2002.iob_sents('ned.train'))

# Count tokens from every sentence that contains at least one person entity.
PER_count = Counter()
for sentence in train_sents:
    if 'B-PER' in sent2labels(sentence):
        PER_count.update(sent2tokens(sentence))

# The 1000 tokens most associated with person-bearing sentences.
PER_words = [word for word, _ in PER_count.most_common(1000)]
#print('PER implying words')
#print(PER_count.most_common(100))

# Same counting for sentences containing a location entity.
LOC_count = Counter()
for sentence in train_sents:
    if 'B-LOC' in sent2labels(sentence):
        LOC_count.update(sent2tokens(sentence))