Example No. 1
    def test(self, file_name, labels_to_remove=None):
        '''
        Tag the sentences in the given file and evaluate the predictions
        against the gold labels.
        '''
        data = self.data_class(file_name)

        tagged_sents = self.tag_sents(data.sentences)
        y_pred = [[fields[-1] for fields in tagged_sent]
                  for tagged_sent in tagged_sents]

        self.logger.debug(str(len(data.y)) + ' ' + str(len(y_pred)))
        self.logger.debug(str(data.y[:5]))
        self.logger.debug(str(y_pred[:5]))

        # flatten list of lists
        # from itertools import chain
        # list(chain.from_iterable(y))
        y_true_flat = flatten(data.y)
        y_pred_flat = flatten(y_pred)

        # mainly for removing O tag in NER, can also be used for other tags
        labels = list(self.tagger.classes_)
        if labels_to_remove:
            for l in labels_to_remove:
                labels.remove(l)

        precision = precision_score(y_true_flat,
                                    y_pred_flat,
                                    average='micro',
                                    labels=labels)
        recall = recall_score(y_true_flat,
                              y_pred_flat,
                              average='micro',
                              labels=labels)
        f1 = f1_score(y_true_flat, y_pred_flat, average='micro', labels=labels)
        accuracy = accuracy_score(y_true_flat, y_pred_flat)
        confusion = confusion_matrix(y_true_flat, y_pred_flat)

        return [precision, recall, f1, accuracy, confusion, tagged_sents]
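
Every snippet on this page assumes a flatten helper that collapses per-sentence label lists into one flat list before calling scikit-learn's metrics. A minimal sketch, following the itertools.chain recipe mentioned in the comment above (the actual helper may come from sklearn_crfsuite or a project utility module):

from itertools import chain

def flatten(list_of_lists):
    # collapse one level of nesting: [['B', 'O'], ['O']] -> ['B', 'O', 'O']
    return list(chain.from_iterable(list_of_lists))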
Example No. 2
def print_classification_report(annotations, n_splits=10, model=None):
    """ Evaluate model, print classification report """
    if model is None:
        # FIXME: we're overfitting on hyperparameters - they should be chosen
        # using inner cross-validation, not set to fixed values beforehand.
        model = get_model(use_precise_form_types=True)

    annotations = [a for a in annotations if a.fields_annotated]
    form_types = formtype_model.get_realistic_form_labels(
        annotations=annotations,
        n_splits=n_splits,
        full_type_names=False
    )

    X, y = get_Xy(
        annotations=annotations,
        form_types=form_types,
        full_type_names=True,
    )
    group_kfold = GroupKFold(n_splits=n_splits)
    groups = [get_domain(ann.url) for ann in annotations]
    y_pred = cross_val_predict(model, X, y, cv=group_kfold, groups=groups,
                               n_jobs=-1)

    all_labels = list(annotations[0].field_schema.types.keys())
    labels = sorted(set(flatten(y_pred)), key=lambda k: all_labels.index(k))
    print(flat_classification_report(y, y_pred, digits=2,
                                     labels=labels, target_names=labels))

    print("{:0.1f}% fields are classified correctly.".format(
        flat_accuracy_score(y, y_pred) * 100
    ))
    print("All fields are classified correctly in {:0.1f}% forms.".format(
        sequence_accuracy_score(y, y_pred) * 100
    ))
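
Example No. 2 keeps pages from the same site out of both train and test folds by passing per-annotation groups to GroupKFold. get_domain is not shown here; a rough sketch under the assumption that the grouping key is simply the hostname of the annotated page's URL:

from urllib.parse import urlparse

def get_domain(url):
    # assumption: group annotations by the hostname of their page URL
    return urlparse(url).netloc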
Example No. 3
def print_classification_report(annotations, n_folds=10, model=None):
    """ Evaluate model, print classification report """
    if model is None:
        # FIXME: we're overfitting on hyperparameters - they should be chosen
        # using inner cross-validation, not set to fixed values beforehand.
        model = get_model(use_precise_form_types=True)

    annotations = [a for a in annotations if a.fields_annotated]
    form_types = formtype_model.get_realistic_form_labels(
        annotations=annotations, n_folds=n_folds, full_type_names=False
    )

    X, y = get_Xy(annotations=annotations, form_types=form_types, full_type_names=True)
    cv = get_annotation_folds(annotations, n_folds=n_folds)
    y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)

    all_labels = list(annotations[0].field_schema.types.keys())
    labels = sorted(set(flatten(y_pred)), key=lambda k: all_labels.index(k))
    print(flat_classification_report(y, y_pred, digits=2, labels=labels, target_names=labels))

    print("{:0.1f}% fields are classified correctly.".format(flat_accuracy_score(y, y_pred) * 100))
    print("All fields are classified correctly in {:0.1f}% forms.".format(sequence_accuracy_score(y, y_pred) * 100))
Example No. 4
def wrapper(y_true, y_pred, *args, **kwargs):
    y_true_flat = flatten(y_true)
    y_pred_flat = flatten(y_pred)
    return func(y_true_flat, y_pred_flat, *args, **kwargs)
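
This wrapper is the inner function of a decorator that lets flat, token-level metrics accept per-sentence label lists. A hedged sketch of how such a decorator could be assembled and applied; flattens_y is an illustrative name, not necessarily the one used in the original project:

from functools import wraps
from sklearn.metrics import accuracy_score

def flattens_y(func):
    @wraps(func)
    def wrapper(y_true, y_pred, *args, **kwargs):
        # flatten the nested label sequences, then delegate to the flat metric
        y_true_flat = flatten(y_true)
        y_pred_flat = flatten(y_pred)
        return func(y_true_flat, y_pred_flat, *args, **kwargs)
    return wrapper

flat_accuracy_score = flattens_y(accuracy_score)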
Example No. 5
    data_manager = DataManager(5)
    print("Processing dataset")
    x, y = data_manager.process_dataset('/data/morpho/FLP_segmented.csv')
    temp = list(zip(x, y))
    random.shuffle(temp)
    x[:], y[:] = zip(*temp)
    x_folds = [x[i:i+len(x)//5] for i in range(0, len(x), len(x)//5)]
    y_folds = [y[i:i+len(y)//5] for i in range(0, len(y), len(y)//5)]

    crf = sklearn_crfsuite.CRF()
    # scores = cross_validation.cross_val_score(crf, x, y, cv=10, scoring='f1')
    scores = []
    # crf.fit(x[:100], y[:100])
    # print(crf.predict_marginals(x))
    for i in range(5):
        # train on the other four folds, hold out fold i for evaluation
        x_train = flatten(x_folds[:i] + x_folds[i+1:])
        x_test = x_folds[i]
        y_train = flatten(y_folds[:i] + y_folds[i+1:])
        y_test = y_folds[i]
        print(len(x_train[0]), len(y_train[0]))

        crf.fit(x_train, y_train)
        y_pred = crf.predict(x_test)
        print(len(y_pred))
        y_pred = flatten(y_pred)
        # y_marg = crf.predict_marginals(x_test)
        y_test = flatten(y_test)
        score = metrics.classification_report(y_test, y_pred,
                                              labels=['B_PREF', 'M_PREF', 
                                                      'B_ROOT', 'M_ROOT', 
Example No. 6
    data_manager = DataManager(5)
    print("Processing dataset")
    x, y = data_manager.process_dataset('/data/morpho/FLP_segmented.csv')
    temp = list(zip(x, y))
    random.shuffle(temp)
    x[:], y[:] = zip(*temp)
    x_folds = [x[i:i + len(x) // 5] for i in range(0, len(x), len(x) // 5)]
    y_folds = [y[i:i + len(y) // 5] for i in range(0, len(y), len(y) // 5)]

    crf = sklearn_crfsuite.CRF()
    # scores = cross_validation.cross_val_score(crf, x, y, cv=10, scoring='f1')
    scores = []
    # crf.fit(x[:100], y[:100])
    # print(crf.predict_marginals(x))
    for i in range(5):
        # train on the other four folds, hold out fold i for evaluation
        x_train = flatten(x_folds[:i] + x_folds[i + 1:])
        x_test = x_folds[i]
        y_train = flatten(y_folds[:i] + y_folds[i + 1:])
        y_test = y_folds[i]
        print(len(x_train[0]), len(y_train[0]))

        crf.fit(x_train, y_train)
        y_pred = crf.predict(x_test)
        print(len(y_pred))
        y_pred = flatten(y_pred)
        # y_marg = crf.predict_marginals(x_test)
        y_test = flatten(y_test)
        score = metrics.classification_report(y_test,
                                              y_pred,
                                              labels=[
                                                  'B_PREF', 'M_PREF', 'B_ROOT',
Example No. 7
def tokenLevel_measures(predictedY, trueY, tokenList, label_dic):
    dic_tokenmeasure = {}
    predictedYflat1 = []
    trueYflat1 = []
    predictedYflat = flatten(predictedY)
    trueYflat = flatten(trueY)
    tokenListFlat = flatten(tokenList)
    #out=open("confusion matrix.txt",'w')
    for i in range(len(predictedYflat)):
        if tokenListFlat[i] not in string.punctuation:
            labelSplitPre = predictedYflat[i].split('-')
            predictedYflat1.append(labelSplitPre[len(labelSplitPre) - 1])
            labelSplitTrue = trueYflat[i].split('-')
            trueYflat1.append(labelSplitTrue[len(labelSplitTrue) - 1])

    labels1 = list(np.unique(trueYflat1))
    #labels1.remove('O')
    labels2 = list(np.unique(trueYflat1))
    #labels2.remove('O')
    measuresprs = precision_recall_fscore_support(trueYflat1,
                                                  predictedYflat1,
                                                  labels=labels1)
    #print measuresprs[0]
    # continue numbering from any labels already present in label_dic
    count = len(label_dic)
    for i in range(len(labels2)):
        #if measuresprs[3][i]<=10:
        #    labels1.remove(labels2[i])
        if labels2[i] not in label_dic:
            label_dic[labels2[i]] = [[], [0], [count]]
            count = count + 1
        label_dic[labels2[i]][0].append(measuresprs[2][i])
        label_dic[labels2[i]][1] = label_dic[labels2[i]][1] + measuresprs[3][i]
        #measuresprs1=zip(labels1,measuresprs[0],measuresprs[1],measuresprs[2],measuresprs[3])

    #print trueYflat
    #print predictedYflat
    #print labels1
    #labels1 = ['B-address', 'B-authors', 'B-booktitle', 'B-journal', 'B-pages', 'B-publisher', 'B-ref', 'B-title', 'B-volume', 'B-year', 'I-address', 'I-authors', 'I-booktitle', 'I-journal', 'I-pages', 'I-publisher', 'I-ref', 'I-title', 'I-volume', 'I-year']
    #sorted_labels = sorted(labels1,key=lambda name: (name[1:], name[0]))
    #F1score_micro=metrics.flat_f1_score(trueY, predictedY, labels=labels1, average='micro')
    F1score_micro = f1_score(trueYflat1,
                             predictedYflat1,
                             labels=labels1,
                             average='micro')
    print "F1 measure:", F1score_micro
    precision_micro = precision_score(trueYflat1,
                                      predictedYflat1,
                                      labels=labels1,
                                      average='micro')
    print "precision measure:", precision_micro
    recallScore_micro = recall_score(trueYflat1,
                                     predictedYflat1,
                                     labels=labels1,
                                     average='micro')
    print "recall measure:", recallScore_micro
    #precisionScore_micro=metrics.flat_precision_score(trueY, predictedY, average='micro')
    #print precisionScore_micro
    #recallScore_micro=metrics.flat_recall_score(trueY, predictedY, average='micro')
    #print recallScore_micro
    classificationReport = classification_report(trueYflat1,
                                                 predictedYflat1,
                                                 labels=labels1,
                                                 digits=3)
    print(classificationReport)
    conf_mat = confusion_matrix(trueYflat1, predictedYflat1, labels=labels1)
    '''for i in range(len(labels2)):
        for j in range(len(labels2)):
            conf_mat_agg[label_dic[labels2[i]][2][0],label_dic[labels2[j]][2][0]]=conf_mat_agg[label_dic[labels2[i]][2][0],label_dic[labels2[j]][2][0]]+conf_mat[i][j]'''
    for i in range(len(labels1)):
        dic_tokenmeasure[labels1[i]] = [
            measuresprs[0][i], measuresprs[1][i], measuresprs[2][i],
            measuresprs[3][i]
        ]
    #out.write(str(labels1)+"\n")
    #for i in range(len(conf_mat)):
    #    out.write(str(conf_mat[i])+"\n")
    #print np.sum(conf_mat[1,:])
    accuracy = accuracy_score(trueYflat1, predictedYflat1)
    #print measuresprs1
    print "Accuracy:", accuracy
    return F1score_micro, dic_tokenmeasure
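
tokenLevel_measures scores tokens after stripping the BIO prefix with split('-')[-1], so B-title and I-title both count as title while the bare O tag is left unchanged. A quick illustration of that normalization:

labels = ['B-title', 'I-title', 'O', 'B-authors']
print([lab.split('-')[-1] for lab in labels])
# ['title', 'title', 'O', 'authors']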
Example No. 8
    def train(self):
        model = os.path.abspath(
            'server/nlp/data/model.joblib')
    
        if os.path.exists(model):
            model = load(model)
            self.crf = model
            pred=self.predict_single(
                'Dijual Handphone Samsung Galaxy Note 7')
            # print(pred)
            # h = WorParser(
            #     '<ENAMEX TYPE="TYPE">Handphone</ENAMEX> <ENAMEX TYPE="BRAND">Samsung</ENAMEX> <ENAMEX TYPE="NAME">Galaxy 7</ENAMEX> murah')
            h = WorParser(
                'Promo <ENAMEX TYPE="TYPE">Laptop</ENAMEX> <ENAMEX TYPE="BRAND">Asus</ENAMEX> <ENAMEX TYPE="NAME">A411UF</ENAMEX>')
            tt = sent2features(h)
        
            return model
        else:
            print('CREATE MODEL')
            crf = self.crf
            # f = open(self.file_path)
            # lines = [line for line in f.read().split("\n")]
            # f.close()
            # train_test = []
            train_data = []
            # for row in lines:
            #     h = WorParser(row)
            #     # h = parseEntity(row)
            #     if h:
            #         train_data.append(h)

            dd = pd.read_csv(os.path.abspath(
                'server/nlp/data/data_train_transform.csv'))
          
            # group rows into sentences: rows that share a 'Word_row' id
            # belong to the same sentence
            for i in range(len(dd)):
                _da = (dd['Word'][i], dd['POS'][i], dd['Label'][i])
                if i > 0 and len(train_data) - 1 >= dd['Word_row'][i]:
                    train_data[dd['Word_row'][i]].append(_da)
                else:
                    train_data.append([_da])

            X = [sent2features(s) for s in train_data]
            Y = [sent2labels(s) for s in train_data]
            self.revisian_feature(X, Y)
            # `writer` (a csv.writer) is never created in this scope, so the
            # per-token feature dump below is disabled.
            # for i, l in zip(X, Y):
            #     for val, label in zip(i, l):
            #         data_t = val
            #         data_t['label'] = label
            #         value_data = [v for k, v in data_t.items()]
            #         writer.writerow(value_data)
            #         print(data_t, value_data)
            X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=10)
            
            # X_test = [sent2features(s) for s in train_test]
            # y_test = [sent2labels(s) for s in train_test]
            # X_train = X
            # y_train = Y
            # print(train_data[0])
            crf.fit(X_train, y_train)
            labels = list(crf.classes_)
            new_classes = labels.copy()
            new_classes.remove('O')
            # print(rows)
            # train_test = [parseEntity(row) for row in rows]
            y_pred = crf.predict(X_test)
            score = metrics.flat_classification_report(
                y_test, y_pred, labels=new_classes, digits=3
            )

            print(score)
            # print("Top likely transitions:")
            # self.print_transitions(Counter(crf.transition_features_).most_common(20))

            # print("\nTop unlikely transitions:")
            # self.print_transitions(Counter(crf.transition_features_).most_common()[-20:])


            y_true = flatten(y_test)
            y_pred = flatten(y_pred)
            self.conffusion_matrix_to_csv(y_true, y_pred, new_classes)
            # re_format_labels = self.re_format_class(new_classes)
            # re_format_iob_y_true = self.re_format(y_true)
            # re_format_iob_y_pred = self.re_format(y_pred)

            # pprint(tp / (tp + fp))
            # pprint(tp / (tp + fn))
            

            # print("Top likely transitions:")
            # self.print_transitions(Counter(crf.transition_features_).most_common(20))
            # state_features = crf.state_features_
            # out = zip(state_features.keys(), state_features.values())
            # with open(os.path.abspath(
            #         'server/nlp/data/data_features.csv'), 'w', encoding="utf8",newline="") as csv_feature:
            #     writer = csv.writer(csv_feature)
            #     writer.writerow(['key','value'])
            #     for i in out:
            #         writer.writerow(i)
            # csv_feature.close()

            
            dump(crf, os.path.abspath(
                'server/nlp/data/model.joblib'))
            self.crf = crf

            # weight = eli5.show_weights(crf, top=30)
            # print(dir(eli5))
          
            # for i in data_frame:
            #     print(data_frame[i])
            # pd = df.DataFrame(data_frame, index=True)
            # print(data_frame['targets'].to_html())
            # data_frame['targets'].to_csv(os.path.abspath(
            #     'server/nlp/data/data_feature_targets.csv'))
            # data_frame['transition_features'].to_csv(os.path.abspath(
            #     'server/nlp/data/data_transition_features.csv'))
            # pprint(dir(self.crf))
            # pprint(self.crf.state_features_)
            # pprint(self.crf.training_log_.iterations)

            return self.crf
Example No. 9
def wrapper(y_true, y_pred, *args, **kwargs):
    y_true_flat = flatten(y_true)
    y_pred_flat = flatten(y_pred)
    return func(y_true_flat, y_pred_flat, *args, **kwargs)
Example No. 10
def flat_recall(y_true, y_pred):
    """Define flat recall metric."""
    ytr_flat = flatten(y_true)
    ypr_flat = flatten(y_pred)
    return recall_score(ytr_flat, ypr_flat, pos_label="FOOD", average='binary')
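
A small usage sketch with made-up sequences: flat_recall flattens both inputs and then reports recall for the FOOD class only.

y_true = [['FOOD', 'O'], ['O', 'FOOD']]
y_pred = [['FOOD', 'O'], ['O', 'O']]
print(flat_recall(y_true, y_pred))  # 0.5 - one of the two FOOD tokens was found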
Example No. 11
def gather_validation_metrics(X_text,
                              y,
                              tokenizer,
                              model,
                              preprocessor,
                              batch_size=128,
                              check_lengths=False,
                              verbose=False,
                              fine_label_report=False,
                              dataset='UNKNOWN DATASET',
                              label_ignore_set=set(['O'])):
    gold_labels = []
    pred_labels = []
    gold_coarse_labels = []
    pred_coarse_labels = []

    # now put all the predictions together
    fine_labels_set = set()
    coarse_labels_set = set()
    MAX_PREDICTIONS = len(X_text)
    # do all predictions at once (rather than sentence-by-sentence) and then collect the results
    sentence_preds = model.predict(preprocessor.transform(
        X_text[:MAX_PREDICTIONS]),
                                   batch_size=batch_size)

    # then go back and get predictions one at a time
    for i in range(len(X_text[:MAX_PREDICTIONS])):
        # take row i of the batched predictions and trim it to the true
        # (unpadded) sentence length
        sentence_length = len(y[i])
        sentence_pred = sentence_preds[i, :sentence_length]
        #print(sentence_pred)
        #break

        # prep the labels
        sentence_coarse_labels = get_coarse_labels(y[i])

        # get the predictions
        pred_sentence_labels = preprocessor.inverse_transform(
            np.argmax(sentence_pred, -1))

        pred_sentence_coarse_labels = get_coarse_labels(pred_sentence_labels)

        if check_lengths:
            if len(y[i]) == len(pred_sentence_labels):
                print('MATCH of GOLD and PRED')
            else:
                print('NO MATCH ON GOLD and PRED')
                print('GOLD : {0}, PRED : {1}'.format(
                    len(y[i]), len(pred_sentence_labels)))
                print('SENTENCE PRED LENGTH : {}'.format(len(sentence_pred)))
                print(X_text[i])
                print(y[i])
                print(pred_sentence_labels)

        # the sklearn-crfsuite metrics call its flatten() to convert a list of lists to one flat list
        gold_labels.append(y[i])
        gold_coarse_labels.append(sentence_coarse_labels)
        pred_labels.append(pred_sentence_labels)
        pred_coarse_labels.append(pred_sentence_coarse_labels)

        fine_labels_set |= set(y[i])
        coarse_labels_set |= set(sentence_coarse_labels)

        #print(pred_sentence_labels)
        #print(y[i])

    print('Reporting metrics for dataset : [{0}]'.format(dataset))
    if verbose:
        print('Total gold FINE : {}'.format(len(gold_labels)))
        print('Total pred FINE : {}'.format(len(pred_labels)))
        print('Total gold COARSE : {}'.format(len(gold_coarse_labels)))
        print('Total pred COARSE : {}'.format(len(pred_coarse_labels)))

        print('Total FLATTENED gold FINE : {}'.format(len(
            flatten(gold_labels))))
        print('Total FLATTENED pred FINE : {}'.format(len(
            flatten(pred_labels))))
        print('Total FLATTENED gold COARSE : {}'.format(
            len(flatten(gold_coarse_labels))))
        print('Total FLATTENED pred COARSE : {}'.format(
            len(flatten(pred_coarse_labels))))

    # now let's do some evaluation
    fine_labels_sorted_list = sorted(list(fine_labels_set))
    coarse_labels_sorted_list = sorted(list(coarse_labels_set))

    # pull out any labels that we want to ignore
    fine_labels_sorted_list = [
        x for x in fine_labels_sorted_list if x not in label_ignore_set
    ]
    coarse_labels_sorted_list = [
        x for x in coarse_labels_sorted_list if x not in label_ignore_set
    ]

    if fine_label_report:
        print('FINE label report : ')
        print(
            flat_classification_report(gold_labels,
                                       pred_labels,
                                       labels=fine_labels_sorted_list,
                                       digits=3))

    print('COARSE label report : ')
    print(
        flat_classification_report(gold_coarse_labels,
                                   pred_coarse_labels,
                                   labels=coarse_labels_sorted_list,
                                   digits=3))

    flat_gold_coarse_labels = flatten(gold_coarse_labels)
    flat_pred_coarse_labels = flatten(pred_coarse_labels)

    coarse_labels_sorted_list_nofilter = sorted(list(coarse_labels_set))
    confusion_coarse = confusion_matrix(
        flat_gold_coarse_labels,
        flat_pred_coarse_labels,
        labels=coarse_labels_sorted_list_nofilter)
    # Plot non-normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(
        confusion_coarse,
        classes=coarse_labels_sorted_list_nofilter,
        title='Coarse Label Confusion for : [{0}]'.format(dataset))
    plt.show()
Example No. 12
def count_labels(labels):
    labels_flat = flatten(labels)
    counter = collections.Counter(labels_flat)
    print(counter)
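
For example, called on two short made-up sentences:

count_labels([['B-PER', 'O', 'O'], ['O', 'B-LOC']])
# Counter({'O': 3, 'B-PER': 1, 'B-LOC': 1})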
Example No. 13
def get_labels(y):
    return sorted_for_ner(set(flatten(y)) - {'O'})
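
get_labels drops the O tag and delegates ordering to a sorted_for_ner helper that is not shown. A hedged sketch of one plausible implementation, reusing the sort key from the commented-out line in Example No. 7 so that B- and I- variants of the same entity type sit next to each other:

def sorted_for_ner(labels):
    # assumed ordering: group labels by entity type, then by BIO prefix
    return sorted(labels, key=lambda name: (name[1:], name[0]))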