Example #1
# Imports assumed from the sibling examples (module paths per Example #2):
import pickle

import sklearn_crfsuite
from sklearn_crfsuite import metrics

from DataExtraction import convertCONLLFormJustExtractionSemEval
from FeatureExtraction import sent2labels, sent2features


def crf(test_loc, train_loc):
    test_sents = convertCONLLFormJustExtractionSemEval(test_loc)
    train_sents = convertCONLLFormJustExtractionSemEval(train_loc)

    #pprint(train_sents[0])
    #pprint(test_sents[0])

    X_train = [sent2features(s) for s in train_sents]
    y_train = [sent2labels(s) for s in train_sents]

    X_test = [sent2features(s) for s in test_sents]
    y_test = [sent2labels(s) for s in test_sents]

    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True,
    )
    crf.fit(X_train, y_train)

    labels = list(crf.classes_)
    labels.remove('O')  # drop the 'O' (outside) tag so scores reflect entity labels only
    #print(labels)
    pickle.dump(crf,
                open("/data/xwang/models_origin/linear-chain-crf.model.pickle",
                     "wb"),
                protocol=0,
                fix_imports=True)
    y_pred = crf.predict(X_test)

    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))  # group B-/I- tags of the same type
    f1_score = metrics.flat_f1_score(y_test,
                                     y_pred,
                                     average='weighted',
                                     labels=sorted_labels)
    recall = metrics.flat_recall_score(y_test,
                                       y_pred,
                                       average='weighted',
                                       labels=sorted_labels)
    precision = metrics.flat_precision_score(y_test,
                                             y_pred,
                                             average='weighted',
                                             labels=sorted_labels)
    #print(metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))
    return (f1_score, recall, precision)
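
A minimal usage sketch; the file paths are hypothetical, and the function expects CoNLL-formatted SemEval files (note it takes the test file first):

f1, recall, precision = crf("data/test.txt", "data/train.txt")
print("F1 = %.3f, recall = %.3f, precision = %.3f" % (f1, recall, precision))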
Example #2
import bs4 as bs
import json
import re
import nltk
import heapq
import pickle
import sys
from pprint import pprint
import os
import codecs
import string
from sklearn_crfsuite import metrics
from DataExtraction import convertCONLLFormJustExtractionSemEvalPerfile
from FeatureExtraction import sent2labels, sent2features
from PhraseEval import phrasesFromTestSenJustExtractionWithIndex
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk import conlltags2tree, tree2conlltags
# Swapped the hardcoded link for a system argument.
file_inLoc = sys.argv[1]
file_outLoc = sys.argv[1].split(".")[0] + "-DKE.txt"
file_outLoc = file_outLoc[15:]  # strip a fixed-length leading directory prefix
with open(file_inLoc, 'r', encoding='utf-8-sig') as f:
    article_text = json.load(f)
pprint(article_text)
# Commented-out alternative: scrape the article from a URL instead of reading JSON.
# (`article` below is undefined; the original presumably meant scraped_data.read().)
# scraped_data = urllib.request.Request(file_inLoc, headers={'User-Agent': "Magic Browser"})
# scraped_data = urllib.request.urlopen(scraped_data)
# parsed_article = bs.BeautifulSoup(article, 'lxml')
# paragraphs = parsed_article.find_all('p')
# article_text = ""
Example #3

import sys
import pickle
from pprint import pprint

from sklearn_crfsuite import metrics

from DataExtraction import convertCONLLFormJustExtractionSemEvalPerfile
from FeatureExtraction import sent2labels,sent2features
from PhraseEval import phrasesFromTestSenJustExtractionWithIndex

fileinLoc = sys.argv[1]
fileoutLoc = sys.argv[1].split("-")[0]+"-predicted.ann"

crf = pickle.load(open("linear-chain-crf.model.pickle", "rb"))  # pickles must be opened in binary mode
(test_sents,test_sents_indices) = convertCONLLFormJustExtractionSemEvalPerfile(fileinLoc)

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

y_pred = crf.predict(X_test)

labels = list(crf.classes_)
labels.remove('O')

print(labels)
sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
print(metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))

test_sents_pls = []  # test sentences with predicted labels
for index, testsent in enumerate(test_sents):
    sent = []
    pls = y_pred[index]
    # The example is truncated here; a plausible completion, mirroring Example #4:
    for (token, pl) in zip(testsent, pls):
        sent.append((token[0], token[1], pl))
    test_sents_pls.append(sent)

Example #4

# The opening of this example was cut off; a minimal reconstruction. The outer
# loop and `iob_tags_sents` (a hypothetical list of sentences, each a list of
# (token, POS, IOB-tag) triples) are assumptions inferred from the surviving
# inner loop.
import pickle

from FeatureExtraction import sent2features

iob_tags4 = []
for iob_tags2 in iob_tags_sents:
    iob_tags3 = []
    for i in range(len(iob_tags2)):
        c = iob_tags2[i][2]
        if c[0] == 'B':
            iob_tags1 = (iob_tags2[i][0], iob_tags2[i][1], 'B')
        elif c[0] == 'I':
            iob_tags1 = (iob_tags2[i][0], iob_tags2[i][1], 'I')
        else:
            iob_tags1 = (iob_tags2[i][0], iob_tags2[i][1], 'O')  # standard IOB uses uppercase 'O'
        iob_tags3.append(iob_tags1)
    iob_tags4.append(iob_tags3)

#print(' **** Preprocessed Text given as input to model *******')
#print(iob_tags4)

X_test = [sent2features(s) for s in iob_tags4]

crf = pickle.load(open("linear-chain-crf.model.pickle", "rb"))  # binary mode, as above
y_pred = crf.predict(X_test)

labels = list(crf.classes_)
labels.remove('O')

#print labels
#print y_pred

test_sents_pls = []  #test sentences with predicted labels
for index, testsent in enumerate(iob_tags4):
    sent = []
    pls = y_pred[index]
    for (token, pl) in zip(testsent, pls):
        # the example is truncated mid-loop; a plausible completion:
        sent.append((token[0], token[1], pl))
    test_sents_pls.append(sent)
Example #5
# Imports assumed, following the sibling examples (module paths per Example #2):
import pickle
from pprint import pprint

import sklearn_crfsuite
from sklearn_crfsuite import metrics

from DataExtraction import convertCONLLFormJustExtractionSemEval
from FeatureExtraction import sent2labels, sent2features


def main():
    train_sents = convertCONLLFormJustExtractionSemEval(
        "medicalData/convertedBIO/combinedTrain.txt")
    test_sents = convertCONLLFormJustExtractionSemEval(
        "medicalData/convertedBIO/combinedTest.txt")

    pprint(train_sents[0])
    pprint(test_sents[0])

    X_train = [sent2features(s) for s in train_sents]
    y_train = [sent2labels(s) for s in train_sents]

    X_test = [sent2features(s) for s in test_sents]
    y_test = [sent2labels(s) for s in test_sents]

    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True,
    )
    crf.fit(X_train, y_train)

    labels = list(crf.classes_)
    labels.remove('O')
    print(labels)
    pickle.dump(crf,
                open("medicalData/linear-chain-crf.model.pickle", "wb"),
                protocol=0,
                fix_imports=True)
    y_pred = crf.predict(X_test)

    # Use this if you need to do grid search on training data for parameter optimization.
    '''
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        max_iterations=100,
       all_possible_transitions=True
    )
    params_space = {
        'c1': scipy.stats.expon(scale=0.5),
        'c2': scipy.stats.expon(scale=0.05),
    }

    # use the same metric for evaluation
    f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

    # search
    rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
    rs.fit(X_train, y_train)
    print("classification done")
    crf = rs.best_estimator_
    y_pred = crf.predict(X_test)
    '''
    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
    print(
        metrics.flat_classification_report(y_test,
                                           y_pred,
                                           labels=sorted_labels,
                                           digits=3))

    # Use this if you want to see how the phrase extraction works. This does NOT
    # produce the ann files. (The demo block that followed was cut off in this example.)
Example #6

# Imports assumed, following the sibling examples:
import pickle

import numpy as np
import matplotlib.pyplot as plt
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

from DataExtraction import convertCONLLFormJustExtractionSemEval
from FeatureExtraction import sent2labels, sent2features


def main(trainingCorpus):
    # trainingCorpus = 'larger'
    # trainingCorpus = 'original'
    trainFile = "medicalData/convertedBIO/" + trainingCorpus + "/combinedTrain.txt"
    testFile = "medicalData/convertedBIO/" + trainingCorpus + "/combinedTest.txt"

    train_sents = convertCONLLFormJustExtractionSemEval(trainFile)
    test_sents = convertCONLLFormJustExtractionSemEval(testFile)

    # pprint(train_sents[0])
    # print('\n')
    # pprint(test_sents[0])
    # print('\n')

    X_train = [sent2features(s) for s in train_sents]
    y_train = [sent2labels(s) for s in train_sents]

    X_test = [sent2features(s) for s in test_sents]
    y_test = [sent2labels(s) for s in test_sents]
    #%%
    c1 = 0.1
    c2 = 0.1

    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=c1,
        c2=c2,
        max_iterations=100,
        all_possible_transitions=True,
    )
    crf.fit(X_train, y_train)

    if trainingCorpus == 'larger':
        labels = list(crf.classes_)
        labels.remove('O')
        labels.remove('1')
        labels.remove('3')
        labels.remove('8')
        labels.remove('9')
        pickle.dump(
            crf,
            open(
                "medicalData/larger/unoptimized/linear-chain-crf.model.pickle",
                "wb"),
            protocol=0,
            fix_imports=True)
    elif trainingCorpus == 'original':
        labels = list(crf.classes_)
        labels.remove('O')
        pickle.dump(
            crf,
            open(
                "medicalData/original/unoptimized/linear-chain-crf.model.pickle",
                "wb"),
            protocol=0,
            fix_imports=True)
    # print(labels)
    print('\n')

    y_pred = crf.predict(X_test)

    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))

    print('\nTest Results (Original Model, Training Corpus: ' +
          trainingCorpus + ')')
    print('c1 = %s, c2 = %s' % (c1, c2))
    print(
        metrics.flat_classification_report(y_test,
                                           y_pred,
                                           labels=sorted_labels,
                                           digits=3))

    # y_predT = crf.predict(X_train)
    # print('\nTrain Results (Original Model, Training Corpus: ' + trainingCorpus + ')')
    # print('c1 = %s, c2 = %s' %(c1, c2))
    # print(metrics.flat_classification_report(y_train, y_predT, labels=sorted_labels, digits=3))

    #%%
    # '''
    # define fixed parameters and parameters to search

    def report(results, n_top=15):
        print("")
        for i in range(1, n_top + 1):
            candidates = np.flatnonzero(results['rank_test_score'] == i)
            print(candidates)
            for candidate in candidates:
                print("Model with rank: {0}".format(i))
                print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                    results['mean_test_score'][candidate],
                    results['std_test_score'][candidate]))
                print("Parameters: {0}".format(results['params'][candidate]))
                print("")

    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               max_iterations=100,
                               all_possible_transitions=True)

    # use the same metric for evaluation
    f1_scorer = make_scorer(metrics.flat_f1_score,
                            average='weighted',
                            labels=labels)

    # params_space = {
    #     'c1': scipy.stats.expon(scale=0.5),
    #     'c2': scipy.stats.expon(scale=0.5),
    # }

    # search
    # rs = RandomizedSearchCV(crf, params_space,
    #                         cv=3,
    #                         verbose=1,
    #                         n_jobs=-1,
    #                         n_iter=10,
    #                         return_train_score=True,
    #                         scoring=f1_scorer)

    param_grid = {
        'c1': [0.10, 0.20, 0.30, 0.40],
        'c2': [0.10, 0.20, 0.30, 0.40],
    }

    numSplits = 5
    rs = GridSearchCV(crf,
                      param_grid,
                      cv=numSplits,
                      verbose=-1,
                      n_jobs=-1,
                      return_train_score=True,
                      scoring=f1_scorer)

    rs.fit(X_train, y_train)

    report(rs.cv_results_)

    # print(rs.cv_results_['params'])
    # print(rs.cv_results_['split0_test_score'])
    # print(rs.cv_results_['split1_test_score'])
    # print(rs.cv_results_['split2_test_score'])
    # print(rs.cv_results_['split3_test_score'])
    # print(rs.cv_results_['split4_test_score'])

    # Each split must read its own key; the original read split0 five times.
    split1 = (np.array([rs.cv_results_['split0_test_score']])).T
    split2 = (np.array([rs.cv_results_['split1_test_score']])).T
    split3 = (np.array([rs.cv_results_['split2_test_score']])).T
    split4 = (np.array([rs.cv_results_['split3_test_score']])).T
    split5 = (np.array([rs.cv_results_['split4_test_score']])).T
    splitCompiled = np.hstack((split1, split2, split3, split4, split5))

    _x = [s['c1'] for s in rs.cv_results_['params']]
    _y = [s['c2'] for s in rs.cv_results_['params']]
    _c = [s for s in rs.cv_results_['mean_test_score']]

    fig = plt.figure()
    fig.set_size_inches(3, 3)
    ax = plt.gca()
    # ax.set_yscale('log')
    # ax.set_xscale('log')
    ax.set_xlabel('C1')
    ax.set_ylabel('C2')
    # ax.set_title("Randomized Hyperparameter Search CV Results (min={:0.3}, max={:0.3})".format(min(_c), max(_c)))
    ax.set_title(
        "Hyperparameter Gridsearch CV Results \n (min={:0.3}, max={:0.3})".
        format(min(_c), max(_c)))

    sc = ax.scatter(_x, _y, c=_c, s=60, alpha=0.9)

    # plt.text(0.1, 0.11,'Unoptimized Model', fontsize=10, horizontalalignment='left',verticalalignment='bottom')
    # if trainingCorpus == 'larger':
    #     plt.text(0.1, 0.39,'Optimized Model', fontsize=10, horizontalalignment='left',verticalalignment='top')
    # elif trainingCorpus == 'original':
    #     plt.text(0.1, 0.2,'Optimized Model', fontsize=10, horizontalalignment='left',verticalalignment='top')

    plt.colorbar(sc)
    if trainingCorpus == 'larger':
        plt.savefig('medicalData/larger/optimized/OptimizedLarger.png')
    elif trainingCorpus == 'original':
        plt.savefig('medicalData/original/optimized/OptimizedOriginal.png')

    # '''

    crf_best = rs.best_estimator_
    if trainingCorpus == 'larger':
        pickle.dump(
            crf_best,
            open("medicalData/larger/optimized/linear-chain-crf.model.pickle",
                 "wb"),
            protocol=0,
            fix_imports=True)
    elif trainingCorpus == 'original':
        pickle.dump(
            crf_best,
            open(
                "medicalData/original/optimized/linear-chain-crf.model.pickle",
                "wb"),
            protocol=0,
            fix_imports=True)

    # print('best params:', rs.best_params_)
    # print('best CV score:', rs.best_score_)
    # print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

    y_pred = crf_best.predict(X_test)
    print('\nTest Results (Optimized Model, Training Corpus: ' +
          trainingCorpus + ')')
    # print('c1 = %s, c2 = %s' %(c1, c2))
    print(
        metrics.flat_classification_report(y_test,
                                           y_pred,
                                           labels=sorted_labels,
                                           digits=3))

    # y_predT = crf_best.predict(X_train)
    # print('\nTrain Results (Optimized Model, Training Corpus: ' + trainingCorpus + ')')
    # # print('c1 = %s, c2 = %s' %(c1, c2))
    # print(metrics.flat_classification_report(y_train, y_predT, labels=sorted_labels, digits=3))

    #%%
    '''
    x = np.linspace(1,16,num=16)
    fig = plt.figure()
    fig.set_size_inches(10,6)
    ax = plt.gca()
    # ax.set_yscale('log')
    # ax.set_xscale('log')
    ax.set_xlabel('Regularization Coefficients')
    ax.set_ylabel('Split Test Score')
    ax.set_title("Randomized Hyperparameter Search CV Results (min={:0.3}, max={:0.3})".format(min(_c), max(_c)))
    ax.set_title("Hyperparameter Gridsearch CV Results (min={:0.3}, max={:0.3})".format(min(_c), max(_c)))
    # plt.xticks([1, 2])
    ax.set_xticklabels(['{0.1, 0.1}','{0.1, 0.2}','{0.1, 0.3}','{0.1, 0.4}',
                        '{0.2, 0.1}','{0.2, 0.2}','{0.2, 0.3}','{0.2, 0.4}',
                        '{0.3, 0.1}','{0.3, 0.2}','{0.3, 0.3}','{0.3, 0.4}',
                        '{0.4, 0.1}','{0.4, 0.2}','{0.4, 0.3}','{0.4, 0.4}'],rotation = 45)
    
    # ax.scatter(_x, _y, c=_c, s=60, alpha=0.9, edgecolors=[0,0,0])
    ax.scatter(x, split1, c='midnightblue', s=60, alpha=0.9, marker = 'o', label = 'Split 1')
    ax.scatter(x, split2, c='blue',         s=60, alpha=0.9, marker = '*', label = 'Split 2')
    ax.scatter(x, split3, c='steelblue',    s=60, alpha=0.9, marker = 's', label = 'Split 3')
    ax.scatter(x, split4, c='skyblue',  s=60, alpha=0.9, marker = '+', label = 'Split 4')
    ax.scatter(x, split5, c='turquoise',    s=60, alpha=0.9, marker = 'D', label = 'Split 5')
    
    # ax.scatter(x, split1, c= split1, s=60, alpha=0.9, marker = 'o', label = 'Split 1')
    # ax.scatter(x, split2, c= split2, s=60, alpha=0.9, marker = '*', label = 'Split 2')
    # ax.scatter(x, split3, c= split3, s=60, alpha=0.9, marker = 's', label = 'Split 3')
    # ax.scatter(x, split4, c= split4, s=60, alpha=0.9, marker = '+', label = 'Split 4')
    # ax.scatter(x, split5, c= split5, s=60, alpha=0.9, marker = 'D', label = 'Split 5')
    ax.legend()
    ax.grid(True)
    # sc = plt.scatter(xy, xy, c=z, vmin=0, vmax=20, s=35, cmap=cm)
    # plt.colorbar(sc)
    '''
    return splitCompiled
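
A minimal driver sketch; the corpus names follow the branches inside main above:

# Each call trains, grid-searches, and returns the per-split CV scores
# stacked into an (n_candidates, n_splits) array.
for corpus in ('original', 'larger'):
    splits = main(corpus)
    print(corpus, splits.shape)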