def test(self, file_name, labels_to_remove=None):
    """Tag the sentences in ``file_name`` and evaluate the predictions."""
    data = self.data_class(file_name)
    tagged_sents = self.tag_sents(data.sentences)
    y_pred = [[fields[-1] for fields in tagged_sent]
              for tagged_sent in tagged_sents]
    self.logger.debug('%d %d', len(data.y), len(y_pred))
    self.logger.debug(str(data.y[:5]))
    self.logger.debug(str(y_pred[:5]))

    # flatten the per-sentence label lists into flat token-level lists
    y_true_flat = flatten(data.y)
    y_pred_flat = flatten(y_pred)

    # mainly for removing the 'O' tag in NER; can also be used for other tags
    # (a mutable default argument is avoided by defaulting to None above)
    labels = list(self.tagger.classes_)
    for label in (labels_to_remove or []):
        labels.remove(label)

    precision = precision_score(y_true_flat, y_pred_flat,
                                average='micro', labels=labels)
    recall = recall_score(y_true_flat, y_pred_flat,
                          average='micro', labels=labels)
    f1 = f1_score(y_true_flat, y_pred_flat, average='micro', labels=labels)
    accuracy = accuracy_score(y_true_flat, y_pred_flat)
    confusion = confusion_matrix(y_true_flat, y_pred_flat)
    return [precision, recall, f1, accuracy, confusion, tagged_sents]
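A toy illustration of why the snippet drops the 'O' tag from ``labels`` before micro-averaged scoring: the dominant non-entity tag inflates the score. Self-contained sketch; the label names are made up.

from itertools import chain

from sklearn.metrics import f1_score


def flatten(list_of_lists):
    return list(chain.from_iterable(list_of_lists))


y_true = flatten([['B-PER', 'O', 'O'], ['O', 'B-LOC', 'O']])
y_pred = flatten([['O', 'O', 'O'], ['O', 'B-LOC', 'O']])

# with 'O' included, the many correct 'O' tokens dominate the micro average
print(f1_score(y_true, y_pred, average='micro',
               labels=['B-PER', 'B-LOC', 'O']))  # ~0.83
# excluding 'O' scores only the entity tags
print(f1_score(y_true, y_pred, average='micro',
               labels=['B-PER', 'B-LOC']))       # ~0.67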
def print_classification_report(annotations, n_splits=10, model=None):
    """ Evaluate model, print classification report """
    if model is None:
        # FIXME: we're overfitting on hyperparameters - they should be chosen
        # using inner cross-validation, not set to fixed values beforehand.
        model = get_model(use_precise_form_types=True)

    annotations = [a for a in annotations if a.fields_annotated]
    form_types = formtype_model.get_realistic_form_labels(
        annotations=annotations,
        n_splits=n_splits,
        full_type_names=False
    )

    X, y = get_Xy(
        annotations=annotations,
        form_types=form_types,
        full_type_names=True,
    )
    group_kfold = GroupKFold(n_splits=n_splits)
    groups = [get_domain(ann.url) for ann in annotations]
    y_pred = cross_val_predict(model, X, y, cv=group_kfold, groups=groups,
                               n_jobs=-1)

    all_labels = list(annotations[0].field_schema.types.keys())
    labels = sorted(set(flatten(y_pred)), key=lambda k: all_labels.index(k))
    print(flat_classification_report(y, y_pred, digits=2,
                                     labels=labels, target_names=labels))
    print("{:0.1f}% fields are classified correctly.".format(
        flat_accuracy_score(y, y_pred) * 100))
    print("All fields are classified correctly in {:0.1f}% forms.".format(
        sequence_accuracy_score(y, y_pred) * 100))
def print_classification_report(annotations, n_folds=10, model=None):
    """ Evaluate model, print classification report """
    if model is None:
        # FIXME: we're overfitting on hyperparameters - they should be chosen
        # using inner cross-validation, not set to fixed values beforehand.
        model = get_model(use_precise_form_types=True)

    annotations = [a for a in annotations if a.fields_annotated]
    form_types = formtype_model.get_realistic_form_labels(
        annotations=annotations,
        n_folds=n_folds,
        full_type_names=False
    )

    X, y = get_Xy(annotations=annotations, form_types=form_types,
                  full_type_names=True)
    cv = get_annotation_folds(annotations, n_folds=n_folds)
    y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)

    all_labels = list(annotations[0].field_schema.types.keys())
    labels = sorted(set(flatten(y_pred)), key=lambda k: all_labels.index(k))
    print(flat_classification_report(y, y_pred, digits=2,
                                     labels=labels, target_names=labels))
    print("{:0.1f}% fields are classified correctly.".format(
        flat_accuracy_score(y, y_pred) * 100))
    print("All fields are classified correctly in {:0.1f}% forms.".format(
        sequence_accuracy_score(y, y_pred) * 100))
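Both variants appear to cross-validate with whole-site folds so pages from one domain never land in both train and test. A minimal sketch of that grouping idea with scikit-learn's GroupKFold on toy data (the classifier and sample values are illustrative):

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupKFold, cross_val_predict

X = [[0.0], [0.1], [0.9], [1.0], [0.5], [0.6]]
y = [0, 0, 1, 1, 1, 1]
# one group per site: samples sharing a domain never span train and test
groups = ['a.com', 'a.com', 'b.com', 'b.com', 'c.com', 'c.com']

y_pred = cross_val_predict(RandomForestClassifier(random_state=0),
                           X, y, cv=GroupKFold(n_splits=3), groups=groups)
print(y_pred)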
def wrapper(y_true, y_pred, *args, **kwargs):
    # flatten the per-sequence label lists before delegating to ``func``,
    # so a plain token-level sklearn metric can score sequence data
    y_true_flat = flatten(y_true)
    y_pred_flat = flatten(y_pred)
    return func(y_true_flat, y_pred_flat, *args, **kwargs)
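For context, this ``wrapper`` is the inner function of a flattening decorator. A self-contained sketch of how such a decorator (called ``flattens_y`` here, an assumed name) adapts token-level sklearn metrics to lists of label sequences:

from functools import wraps
from itertools import chain

from sklearn.metrics import f1_score


def flatten(list_of_lists):
    # collapse a list of per-sentence label lists into one flat token list
    return list(chain.from_iterable(list_of_lists))


def flattens_y(func):
    # decorator: flatten y_true and y_pred before calling a token-level metric
    @wraps(func)
    def wrapper(y_true, y_pred, *args, **kwargs):
        y_true_flat = flatten(y_true)
        y_pred_flat = flatten(y_pred)
        return func(y_true_flat, y_pred_flat, *args, **kwargs)
    return wrapper


flat_f1_score = flattens_y(f1_score)

# usage: per-sentence tag sequences are scored as one flat token stream
y_true = [['B-PER', 'O'], ['O', 'B-LOC']]
y_pred = [['B-PER', 'O'], ['O', 'O']]
print(flat_f1_score(y_true, y_pred, average='micro'))  # 0.75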
data_manager = DataManager(5)
print("Processing dataset")
x, y = data_manager.process_dataset('/data/morpho/FLP_segmented.csv')

# shuffle x and y in unison
temp = list(zip(x, y))
random.shuffle(temp)
x[:], y[:] = zip(*temp)

# split into 5 folds
fold_size = len(x) // 5
x_folds = [x[i:i + fold_size] for i in range(0, len(x), fold_size)]
y_folds = [y[i:i + fold_size] for i in range(0, len(y), fold_size)]

crf = sklearn_crfsuite.CRF()
scores = []
for i in range(5):
    # note: this trains on a single fold and tests on the remaining four,
    # the reverse of conventional k-fold cross-validation
    x_test = flatten(x_folds[:i] + x_folds[i + 1:])
    x_train = x_folds[i]
    y_test = flatten(y_folds[:i] + y_folds[i + 1:])
    y_train = y_folds[i]
    print(len(x_train[0]), len(y_train[0]))

    crf.fit(x_train, y_train)
    y_pred = crf.predict(x_test)
    print(len(y_pred))

    # flatten per-sentence predictions to token level for reporting
    y_pred = flatten(y_pred)
    y_test = flatten(y_test)
    score = metrics.classification_report(
        y_test, y_pred,
        labels=['B_PREF', 'M_PREF', 'B_ROOT', 'M_ROOT',
                # ... (label list truncated in this snippet)
                ])
def tokenLevel_measures(predictedY, trueY, tokenList, label_dic):
    dic_tokenmeasure = {}
    predictedYflat1 = []
    trueYflat1 = []
    predictedYflat = flatten(predictedY)
    trueYflat = flatten(trueY)
    tokenListFlat = flatten(tokenList)

    # strip the BIO prefix ('B-'/'I-') and skip punctuation tokens
    for i in range(len(predictedYflat)):
        if tokenListFlat[i] not in string.punctuation:
            predictedYflat1.append(predictedYflat[i].split('-')[-1])
            trueYflat1.append(trueYflat[i].split('-')[-1])

    labels1 = list(np.unique(trueYflat1))
    labels2 = list(np.unique(trueYflat1))
    measuresprs = precision_recall_fscore_support(trueYflat1, predictedYflat1,
                                                  labels=labels1)

    # accumulate per-label F1 scores and support counts across calls
    count = len(label_dic)
    for i in range(len(labels2)):
        if labels2[i] not in label_dic:
            label_dic[labels2[i]] = [[], [0], [count]]
            count += 1
        label_dic[labels2[i]][0].append(measuresprs[2][i])
        label_dic[labels2[i]][1] = label_dic[labels2[i]][1] + measuresprs[3][i]

    F1score_micro = f1_score(trueYflat1, predictedYflat1,
                             labels=labels1, average='micro')
    print("F1 measure:", F1score_micro)
    precision_micro = precision_score(trueYflat1, predictedYflat1,
                                      labels=labels1, average='micro')
    print("precision measure:", precision_micro)
    recallScore_micro = recall_score(trueYflat1, predictedYflat1,
                                     labels=labels1, average='micro')
    print("recall measure:", recallScore_micro)

    classificationReport = classification_report(trueYflat1, predictedYflat1,
                                                 labels=labels1, digits=3)
    print(classificationReport)
    conf_mat = confusion_matrix(trueYflat1, predictedYflat1, labels=labels1)

    # per-label (precision, recall, f1, support)
    for i in range(len(labels1)):
        dic_tokenmeasure[labels1[i]] = [measuresprs[0][i], measuresprs[1][i],
                                        measuresprs[2][i], measuresprs[3][i]]

    accuracy = accuracy_score(trueYflat1, predictedYflat1)
    print("Accuracy:", accuracy)
    return F1score_micro, dic_tokenmeasure
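The ``split('-')[-1]`` step above reduces BIO tags to bare entity types before scoring; a quick illustration of what it does:

tags = ['B-title', 'I-title', 'O', 'B-authors']
print([t.split('-')[-1] for t in tags])  # ['title', 'title', 'O', 'authors']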
def train(self):
    model_path = os.path.abspath('server/nlp/data/model.joblib')
    if os.path.exists(model_path):
        model = load(model_path)
        self.crf = model
        pred = self.predict_single('Dijual Handphone Samsung Galaxy Note 7')
        h = WorParser(
            'Promo <ENAMEX TYPE="TYPE">Laptop</ENAMEX> '
            '<ENAMEX TYPE="BRAND">Asus</ENAMEX> '
            '<ENAMEX TYPE="NAME">A411UF</ENAMEX>')
        tt = sent2features(h)
        return model

    print('CREATE MODEL')
    crf = self.crf

    # rebuild per-sentence training data from the flat CSV:
    # rows sharing a Word_row index belong to the same sentence
    train_data = []
    dd = pd.read_csv(os.path.abspath('server/nlp/data/data_train_transform.csv'))
    for i in range(len(dd)):
        _da = (dd['Word'][i], dd['POS'][i], dd['Label'][i])
        if i > 0 and len(train_data) - 1 >= dd['Word_row'][i]:
            train_data[dd['Word_row'][i]].append(_da)
        else:
            train_data.append([_da])

    # only the first sentence is featurized here
    X = [sent2features(s) for s in train_data[:1]]
    Y = [sent2labels(s) for s in train_data[:1]]
    self.revisian_feature(X, Y)
    # ``writer`` is assumed to be a csv.writer opened earlier
    # (not shown in this snippet)
    for i, l in zip(X, Y):
        for val, label in zip(i, l):
            data_t = val
            data_t['label'] = label
            value_data = [v for k, v in data_t.items()]
            writer.writerow(value_data)
            print(data_t, value_data)

    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.20, random_state=10)
    crf.fit(X_train, y_train)

    labels = list(crf.classes_)
    new_classes = labels.copy()
    new_classes.remove('O')

    y_pred = crf.predict(X_test)
    score = metrics.flat_classification_report(
        y_test, y_pred, labels=new_classes, digits=3)
    print(score)

    # flatten to token level for the confusion matrix
    y_true = flatten(y_test)
    y_pred = flatten(y_pred)
    self.conffusion_matrix_to_csv(y_true, y_pred, new_classes)

    dump(crf, os.path.abspath('server/nlp/data/model.joblib'))
    self.crf = crf
    pprint(self.crf.training_log_.iterations)
    return self.crf
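For reference, a minimal, self-contained version of the evaluation step above: sklearn-crfsuite's flat classification report on toy per-sentence labels, with 'O' removed from the label set (the label names are illustrative):

from sklearn_crfsuite import metrics

y_test = [['B-BRAND', 'O'], ['B-TYPE', 'I-TYPE', 'O']]
y_pred = [['B-BRAND', 'O'], ['B-TYPE', 'O', 'O']]

labels = ['B-BRAND', 'B-TYPE', 'I-TYPE']  # 'O' excluded, as above
print(metrics.flat_classification_report(y_test, y_pred,
                                         labels=labels, digits=3))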
def flat_recall(y_true, y_pred):
    """Flat recall metric: token-level recall with FOOD as the positive label."""
    ytr_flat = flatten(y_true)
    ypr_flat = flatten(y_pred)
    return recall_score(ytr_flat, ypr_flat, pos_label="FOOD", average='binary')
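A toy usage sketch; the ``flatten`` helper is inlined so the example runs standalone:

from itertools import chain

from sklearn.metrics import recall_score


def flatten(list_of_lists):
    return list(chain.from_iterable(list_of_lists))


def flat_recall(y_true, y_pred):
    ytr_flat = flatten(y_true)
    ypr_flat = flatten(y_pred)
    return recall_score(ytr_flat, ypr_flat, pos_label="FOOD", average='binary')


# two sentences; 3 of the 4 true FOOD tokens are recovered
y_true = [['FOOD', 'O', 'FOOD'], ['FOOD', 'FOOD', 'O']]
y_pred = [['FOOD', 'O', 'O'],    ['FOOD', 'FOOD', 'O']]
print(flat_recall(y_true, y_pred))  # 0.75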
def gather_validation_metrics(X_text, y, tokenizer, model, preprocessor,
                              batch_size=128, check_lengths=False,
                              verbose=False, fine_label_report=False,
                              dataset='UNKNOWN DATASET',
                              label_ignore_set=set(['O'])):
    gold_labels = []
    pred_labels = []
    gold_coarse_labels = []
    pred_coarse_labels = []

    fine_labels_set = set()
    coarse_labels_set = set()

    MAX_PREDICTIONS = len(X_text)

    # do all predictions at once (rather than sentence-by-sentence) and then
    # collect the results
    sentence_preds = model.predict(
        preprocessor.transform(X_text[:MAX_PREDICTIONS]),
        batch_size=batch_size)

    # then go back and collect the predictions one sentence at a time
    for i in range(len(X_text[:MAX_PREDICTIONS])):
        sentence_length = len(y[i])
        sentence_pred = sentence_preds[i, :sentence_length]

        # map the gold labels to their coarse equivalents
        sentence_coarse_labels = get_coarse_labels(y[i])

        # decode the predictions
        pred_sentence_labels = preprocessor.inverse_transform(
            np.argmax(sentence_pred, -1))
        pred_sentence_coarse_labels = get_coarse_labels(pred_sentence_labels)

        if check_lengths:
            if len(y[i]) == len(pred_sentence_labels):
                print('MATCH of GOLD and PRED')
            else:
                print('NO MATCH ON GOLD and PRED')
                print('GOLD : {0}, PRED : {1}'.format(
                    len(y[i]), len(pred_sentence_labels)))
                print('SENTENCE PRED LENGTH : {}'.format(len(sentence_pred)))
                print(X_text[i])
                print(y[i])
                print(pred_sentence_labels)

        # keep lists of lists: the sklearn-crfsuite metrics call flatten()
        # themselves to convert them into one flat list
        gold_labels.append(y[i])
        gold_coarse_labels.append(sentence_coarse_labels)
        pred_labels.append(pred_sentence_labels)
        pred_coarse_labels.append(pred_sentence_coarse_labels)

        fine_labels_set |= set(y[i])
        coarse_labels_set |= set(sentence_coarse_labels)

    print('Reporting metrics for dataset : [{0}]'.format(dataset))
    if verbose:
        print('Total gold FINE : {}'.format(len(gold_labels)))
        print('Total pred FINE : {}'.format(len(pred_labels)))
        print('Total gold COARSE : {}'.format(len(gold_coarse_labels)))
        print('Total pred COARSE : {}'.format(len(pred_coarse_labels)))
        print('Total FLATTENED gold FINE : {}'.format(
            len(flatten(gold_labels))))
        print('Total FLATTENED pred FINE : {}'.format(
            len(flatten(pred_labels))))
        print('Total FLATTENED gold COARSE : {}'.format(
            len(flatten(gold_coarse_labels))))
        print('Total FLATTENED pred COARSE : {}'.format(
            len(flatten(pred_coarse_labels))))

    # now let's do some evaluation
    fine_labels_sorted_list = sorted(fine_labels_set)
    coarse_labels_sorted_list = sorted(coarse_labels_set)

    # pull out any labels that we want to ignore
    fine_labels_sorted_list = [
        x for x in fine_labels_sorted_list if x not in label_ignore_set
    ]
    coarse_labels_sorted_list = [
        x for x in coarse_labels_sorted_list if x not in label_ignore_set
    ]

    if fine_label_report:
        print('FINE label report : ')
        print(flat_classification_report(gold_labels, pred_labels,
                                         labels=fine_labels_sorted_list,
                                         digits=3))

    print('COARSE label report : ')
    print(flat_classification_report(gold_coarse_labels, pred_coarse_labels,
                                     labels=coarse_labels_sorted_list,
                                     digits=3))

    flat_gold_coarse_labels = flatten(gold_coarse_labels)
    flat_pred_coarse_labels = flatten(pred_coarse_labels)

    coarse_labels_sorted_list_nofilter = sorted(coarse_labels_set)
    confusion_coarse = confusion_matrix(
        flat_gold_coarse_labels, flat_pred_coarse_labels,
        labels=coarse_labels_sorted_list_nofilter)

    # plot the non-normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(
        confusion_coarse,
        classes=coarse_labels_sorted_list_nofilter,
        title='Coarse Label Confusion for : [{0}]'.format(dataset))
    plt.show()
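``plot_confusion_matrix`` above is a user-defined helper; for a runnable equivalent, modern scikit-learn's ``ConfusionMatrixDisplay`` produces the same kind of plot (a stand-in, not the snippet's helper; the labels are toy data):

import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

gold = ['PER', 'LOC', 'O', 'PER']
pred = ['PER', 'O', 'O', 'LOC']
labels = ['LOC', 'O', 'PER']

cm = confusion_matrix(gold, pred, labels=labels)
ConfusionMatrixDisplay(cm, display_labels=labels).plot()
plt.show()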
def count_labels(labels):
    """Print a frequency count of all labels across the sequences."""
    labels_flat = flatten(labels)
    counter = collections.Counter(labels_flat)
    print(counter)
def get_labels(y):
    """Return all labels seen in ``y``, except 'O', in NER-friendly order."""
    return sorted_for_ner(set(flatten(y)) - {'O'})
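A self-contained sketch of the same idea; ``sorted_for_ner`` is not shown in the snippet, so the version below is an illustrative stand-in that groups labels by entity type with 'B-' before 'I-':

from itertools import chain


def flatten(list_of_lists):
    return list(chain.from_iterable(list_of_lists))


def sorted_for_ner(labels):
    # illustrative stand-in: group by entity type, with B- before I-
    return sorted(labels, key=lambda name: (name[1:], name[0]))


def get_labels(y):
    return sorted_for_ner(set(flatten(y)) - {'O'})


y = [['B-PER', 'I-PER', 'O'], ['B-LOC', 'O']]
print(get_labels(y))  # ['B-LOC', 'B-PER', 'I-PER']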