Example #1
class CRFTrainer(object):
    # Initialize the C value and the classifier type.
    def __init__(self, c_value, classifier_name='ChainCRF'):
        self.c_value = c_value
        self.classifier_name = classifier_name
        # Only the ChainCRF model is supported here, so validate the classifier name:
        if self.classifier_name == 'ChainCRF':
            model = ChainCRF()
            # Define the SSVM learner used to train the CRF model.
            self.clf = FrankWolfeSSVM(model=model,
                                      C=self.c_value,
                                      max_iter=100)
        else:
            raise TypeError('Invalid classifier type')

    def load_clean_data(self):
        '''
        Load the data into X and y, where X is a numpy array of samples
        and each sample has the shape (n_letters, n_features).
        '''
        df = featurize.get_data()
        featurize.split_words(df)
        featurize.first_letter_uppercase(df)
        featurize.has_number(df)
        featurize.has_slash(df)
        featurize.spacy_pos_tagger(df)
        featurize.pos_ngrams(df)
        featurize.encoding_labels(df)
        X, y = featurize.get_X_and_y(df)
        return df, X, y

    def cross_val(self, X_train, y_train):
        '''
        method to conduct 5-fold cross validation
        '''
        kf = KFold(n_splits=5, shuffle=False)
        for train_idx, test_idx in kf.split(X_train):
            xtrain, xval = X_train[train_idx], X_train[test_idx]
            ytrain, yval = y_train[train_idx], y_train[test_idx]
            model = ChainCRF()
            ssvm = FrankWolfeSSVM(model=model, C=0.5, max_iter=15)
            ssvm.fit(xtrain, ytrain)
            print(ssvm.score(xval, yval))

    def train(self, X_train, y_train):
        '''
        training method
        '''
        self.clf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        '''
        method to evaluate the performance of the model
        '''
        return self.clf.score(X_test, y_test)

    def classify(self, input_data):
        '''
        method to run the classifier on input data
        '''
        return self.clf.predict(input_data)[0]
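A minimal usage sketch for the class above (hedged: it assumes the featurize module referenced in load_clean_data is importable and that scikit-learn is installed; the split ratio is illustrative):

from sklearn.model_selection import train_test_split

crf_trainer = CRFTrainer(c_value=0.5)
df, X, y = crf_trainer.load_clean_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
crf_trainer.train(X_train, y_train)
print('Test accuracy: %f' % crf_trainer.evaluate(X_test, y_test))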
Example #2
def test_multinomial_blocks_frankwolfe_batch():
    X, Y = generate_blocks_multinomial(n_samples=10, noise=0.3, seed=0)
    crf = GridCRF(inference_method='qpbo')
    clf = FrankWolfeSSVM(model=crf, C=1, max_iter=500, batch_mode=True)
    clf.fit(X, Y)
    Y_pred = clf.predict(X)
    assert_array_equal(Y, Y_pred)
class CRFTrainer(object):
    def __init__(self, c_value, classifier_name='ChainCRF'):
        self.c_value = c_value
        self.classifier_name = classifier_name

        if self.classifier_name == 'ChainCRF':
            model = ChainCRF()
            self.clf = FrankWolfeSSVM(model=model, C=self.c_value, max_iter=50) 
        else:
            raise TypeError('Invalid classifier type')

    def load_data(self):
        letters = load_letters()
        X, y, folds = letters['data'], letters['labels'], letters['folds']
        X, y = np.array(X), np.array(y)
        return X, y, folds

    # X is a numpy array of samples where each sample
    # has the shape (n_letters, n_features) 
    def train(self, X_train, y_train):
        self.clf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        return self.clf.score(X_test, y_test)

    # Run the classifier on input data
    def classify(self, input_data):
        return self.clf.predict(input_data)[0]
Example #4
class CRFModel(object):
    def __init__(self, c_val=1.0):
        self.clf = FrankWolfeSSVM(model=ChainCRF(), C=c_val, max_iter=50)

    def load_data(self):
        alphabets = load_letters()
        X = np.array(alphabets['data'])
        y = np.array(alphabets['labels'])
        folds = alphabets['folds']
        return X, y, folds

    def train(self, X_train, y_train):
        self.clf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        return self.clf.score(X_test, y_test)

    def classify(self, input_data):
        return self.clf.predict(input_data)[0]

    def convert_to_letters(self, indices):
        alphabets = np.array(list(string.ascii_lowercase))
        output = np.take(alphabets, indices)
        output = ''.join(output)
        return output
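A hedged usage sketch for CRFModel, following the common pystruct letters convention of training on fold 1 and testing on the rest (it assumes numpy, string, and pystruct's load_letters are imported as the class requires):

crf = CRFModel(c_val=1.0)
X, y, folds = crf.load_data()
X_train, X_test = X[folds == 1], X[folds != 1]
y_train, y_test = y[folds == 1], y[folds != 1]
crf.train(X_train, y_train)
print('Accuracy score: %f' % crf.evaluate(X_test, y_test))
predicted = crf.classify([X_test[0]])
print('Original  =', crf.convert_to_letters(y_test[0]))
print('Predicted =', crf.convert_to_letters(predicted))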
Example #5
def test_multinomial_blocks_frankwolfe():
    X, Y = generate_blocks_multinomial(n_samples=10, noise=0.5, seed=0)
    crf = GridCRF(inference_method='qpbo')
    clf = FrankWolfeSSVM(model=crf, C=1, max_iter=50, verbose=3)
    clf.fit(X, Y)
    Y_pred = clf.predict(X)
    assert_array_equal(Y, Y_pred)
def n_cross_valid_crf(X, Y, K, command):
    # cross validation for crf

    if command == 'write_results':
        list_write = list()

    cv = KFold(n_splits=K, shuffle=True, random_state=0)
    for traincv, testcv in cv.split(X):
        x_train, x_test = X[traincv], X[testcv]
        y_train, y_test = Y[traincv], Y[testcv]

        crf = ChainCRF(inference_method='max-product', directed=False, class_weight=None)
        ssvm = FrankWolfeSSVM(model=crf, C=1.0, max_iter=100)
        ssvm.fit(x_train, y_train)
        y_pred = ssvm.predict(x_test)

        print('Accuracy of linear-crf: %f' % ssvm.score(x_test, y_test))
        if command == 'metrics_F1':
            metrics_crf(y_test, y_pred)
        elif command == 'confusion_matrix':
            confusion_matrix_CRF(y_test, y_pred)
        elif command == 'write_results':
            list_write += write_results_CRF(testcv, y_test, y_pred)

        print('------------------------------------------------------')
        print('------------------------------------------------------')

    if command == 'write_results':
        list_write = sorted(list_write, key=itemgetter(0))  # sorted list based on index
        for value in list_write:
            pred_list = value[1]
            test_list = value[2]

            for i in range(0, len(pred_list)):
                print(str(pred_list[i]) + '\t' + str(test_list[i]))
Example #7
class CRFTrainer(object):

    def __init__(self, c_value, classifier_name='ChainCRF'):
        self.c_value = c_value
        self.classifier_name = classifier_name

        if self.classifier_name == 'ChainCRF':
            model = ChainCRF()
            self.clf = FrankWolfeSSVM(model=model, C=self.c_value, max_iter=50)
        else:
            raise TypeError('Invalid classifier type')

    def load_data(self):
        letters = load_letters()

        X, y, folds = letters['data'], letters['labels'], letters['folds']
        X, y = np.array(X), np.array(y)
        return X, y, folds

    # X is a numpy array of samples where each sample has the shape (n_letters, n_features)
    def train(self, X_train, y_train):
        self.clf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        return self.clf.score(X_test, y_test)

    # Run the classifier on input data
    def classify(self, input_data):
        return self.clf.predict(input_data)[0]
class CRFTrainer(object):
    def __init__(self, c_value, classifier_name='ChainCRF'):
        self.c_value = c_value
        self.classifier_name = classifier_name

        if self.classifier_name == 'ChainCRF':
            model = ChainCRF()
            self.clf = FrankWolfeSSVM(model=model, C=self.c_value, max_iter=50) 
        else:
            raise TypeError('Invalid classifier type')

    def load_data(self):
        letters = load_letters()
        X, y, folds = letters['data'], letters['labels'], letters['folds']
        X, y = np.array(X), np.array(y)
        return X, y, folds

    # X is a numpy array of samples where each sample
    # has the shape (n_letters, n_features) 
    def train(self, X_train, y_train):
        self.clf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        return self.clf.score(X_test, y_test)

    # Run the classifier on input data
    def classify(self, input_data):
        return self.clf.predict(input_data)[0]
Example #9
def pick_best_C_value(train_sentences, sentence_labels, test_SF,
                      test_sentences, test_sentence_labels):

    i = 0.10
    best_C = i
    f_old = 0
    for z in range(1, 20):
        print "----------------- Training on C-value %f" % i
        modelCRF = ChainCRF()
        ssvm = FrankWolfeSSVM(model=modelCRF, C=i, max_iter=20, random_state=5)
        ssvm.fit(train_sentences, sentence_labels)
        print "\n"
        print "-------- Training complete --------"

        predictions = ssvm.predict(test_sentences)
        test_SF['predicted_labels'] = predictions

        #Saving model
        print "Saving model...."
        pickle.dump(ssvm, open('models/ote/otemodel.sav', 'wb'))

        #Evaluating Trained CRF model

        p, r, f1, common, retrieved, relevant = evaluating_ote(test_SF)
        if (f1 >= f_old):
            #save value of 'C'
            f_old = f1
            best_C = i

        i = i + 0.05
    return best_C
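A short sketch of how the returned value might then be used to fit a final model, under the same assumptions as the function above (the train/test structures come from the caller's feature-extraction step):

best_C = pick_best_C_value(train_sentences, sentence_labels, test_SF,
                           test_sentences, test_sentence_labels)
final_model = FrankWolfeSSVM(model=ChainCRF(), C=best_C, max_iter=20, random_state=5)
final_model.fit(train_sentences, sentence_labels)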
Example #10
class CRFTrainer(object):
    def __init__(self, c_value, classifier_name='ChainCRF'):
        self.c_value = c_value
        self.classifier_name = classifier_name

        if self.classifier_name == 'ChainCRF':
            model = ChainCRF()
            self.clf = FrankWolfeSSVM(model=model, C=self.c_value, max_iter=50)
        else:
            raise TypeError('Invalid classifier type')

    def load_data(self):
        letters = load_letters()

        X, y, folds = letters['data'], letters['labels'], letters['folds']
        X, y = np.array(X), np.array(y)
        return X, y, folds

    # X is a numpy array of samples where each sample has the shape (n_letters, n_features)
    def train(self, X_train, y_train):
        self.clf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        return self.clf.score(X_test, y_test)

    # Run the classifier on input data
    def classify(self, input_data):
        return self.clf.predict(input_data)[0]
Example #11
def fit_predict(train_docs,
                test_docs,
                dataset,
                C,
                class_weight,
                constraints,
                compat_features,
                second_order,
                coparents,
                grandparents,
                siblings,
                exact_test=False):
    stats = stats_train(train_docs)
    prop_vect, _ = prop_vectorizer(train_docs,
                                   which=dataset,
                                   stats=stats,
                                   n_most_common_tok=None,
                                   n_most_common_dep=2000,
                                   return_transf=True)
    link_vect = link_vectorizer(train_docs, stats, n_most_common=500)

    sec_ord_vect = (second_order_vectorizer(train_docs)
                    if second_order else None)

    _, _, _, pmi_in, pmi_out = stats

    def _transform_x_y(docs):
        X = [
            _vectorize(doc, pmi_in, pmi_out, prop_vect, link_vect,
                       sec_ord_vect) for doc in docs
        ]
        Y = [doc.label for doc in docs]
        return X, Y

    X_tr, Y_tr = _transform_x_y(train_docs)
    X_te, Y_te = _transform_x_y(test_docs)

    model = ArgumentGraphCRF(class_weight=class_weight,
                             constraints=constraints,
                             compat_features=compat_features,
                             coparents=coparents,
                             grandparents=grandparents,
                             siblings=siblings)

    clf = FrankWolfeSSVM(model,
                         C=C,
                         random_state=0,
                         verbose=1,
                         check_dual_every=25,
                         show_loss_every=25,
                         max_iter=100,
                         tol=0)

    clf.fit(X_tr, Y_tr)

    if exact_test:
        clf.model.exact = True
    Y_pred = clf.predict(X_te)

    return clf, Y_te, Y_pred
def test_multinomial_blocks_frankwolfe():
    X, Y = generate_blocks_multinomial(n_samples=50, noise=0.5,
                                       seed=0)
    crf = GridCRF(inference_method='qpbo')
    clf = FrankWolfeSSVM(model=crf, C=1, line_search=True,
                         batch_mode=False, check_dual_every=500)
    clf.fit(X, Y)
    Y_pred = clf.predict(X)
    assert_array_equal(Y, Y_pred)
Example #13
def test_multinomial_blocks_frankwolfe():
    X, Y = generate_blocks_multinomial(n_samples=50, noise=0.5, seed=0)
    crf = GridCRF(inference_method='qpbo')
    clf = FrankWolfeSSVM(model=crf,
                         C=1,
                         line_search=True,
                         batch_mode=False,
                         check_dual_every=500)
    clf.fit(X, Y)
    Y_pred = clf.predict(X)
    assert_array_equal(Y, Y_pred)
Example #14
def graph_crf():

    crf = GraphCRF()
    # X_train

    # creating features
    # maximum number of attributes = 2
    # variables have only one attribute (the assigned value), so the second attribute is set to zero
    feature_1 = [1, 0]  # var_1
    feature_2 = [2, 0]  # var_2
    # function has two attributes, so an indicator variable is used to show those two
    feature_3 = [1, 1]  # function
    # the if node has only one condition, which checks for value 1
    feature_4 = [1, 0]  # if
    features = np.array([feature_1, feature_2, feature_3, feature_4])

    # creating edges
    # there are four edges: (v1, v2), (v1, func), (v2, func), (v1, if)
    edge_1 = [0, 1]  # (v1,v2)
    edge_2 = [0, 2]  # (v1, func)
    edge_3 = [1, 2]  # (v2, func)
    edge_4 = [0, 3]  # (v1, if)
    edges = np.array([edge_1, edge_2, edge_3, edge_4])

    X_train_sample = (features, edges)

    # y_train
    # These are enumerated values for actions
    # We assume there should be an action for each node (variable, function, if, etc.)
    y_train_sample = np.array([0, 0, 1, 2])

    # create a full training set by re-sampling the example above
    n_samples = 100
    X_train = []
    y_train = []
    for i in range(n_samples):
        X_train.append(X_train_sample)
        y_train.append(y_train_sample)

    model = GraphCRF(directed=True, inference_method="max-product")
    ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=10)
    ssvm.fit(X_train, y_train)

    # predict something
    output = ssvm.predict(X_train[0:3])
    print(output)
def n_cross_valid_crf_candidate(list_line, X, Y, K):
    list_text = []
    for i in range(0, len(list_line), 3):
        split_first = 0
        split_second = 0

        if i % 3 == 0:
            split_first = list_line[i].strip().split('\t')
        list_text.append(split_first)

    list_text = np.array(list_text)

    cv = KFold(n_splits=K, shuffle=True, random_state=0)
    list_write = []
    for traincv, testcv in cv.split(X):
        x_train, x_test = X[traincv], X[testcv]
        y_train, y_test = Y[traincv], Y[testcv]
        list_text_train, list_text_test = list_text[traincv], list_text[testcv]

        crf = ChainCRF(inference_method='max-product', directed=False, class_weight=None)
        ssvm = FrankWolfeSSVM(model=crf, C=1.0, max_iter=10)
        ssvm.fit(x_train, y_train)
        y_pred = ssvm.predict(x_test)
        list_wrong = metrics_crf_candidate(list_text_test, y_test, y_pred)
        if len(list_write) == 0:
            list_write = list_wrong
        else:
            # accumulate this fold's per-category lists (svc, road, busstop)
            svc = list_wrong[0]
            road = list_wrong[1]
            busstop = list_wrong[2]

            list_write[0] = list_write[0] + svc
            list_write[1] = list_write[1] + road
            list_write[2] = list_write[2] + busstop

    # write_file('d:/', 'wrong_svc', list_write[0])
    # write_file('d:/', 'wrong_road', list_write[1])
    # write_file('d:/', 'wrong_busstop', list_write[2])

    write_file('d:/', 'good_svc', list_write[0])
    write_file('d:/', 'good_road', list_write[1])
    write_file('d:/', 'good_busstop', list_write[2])
def results_CRFs(X_training, Y_training, X_testing, Y_testing, command):
    crf = ChainCRF(inference_method='max-product', directed=False, class_weight=None)
    ssvm = FrankWolfeSSVM(model=crf, C=1.0, max_iter=100)
    ssvm.fit(X_training, Y_training)
    y_pred = ssvm.predict(X_testing)

    list_write = list()
    print('Accuracy of linear-crf: %f' % ssvm.score(X_testing, Y_testing))
    if command == 'metrics_F1':
        metrics_crf(Y_testing, y_pred)
    elif command == 'confusion_matrix':
        confusion_matrix_CRF(Y_testing, y_pred)
    elif command == 'write_results':
        list_write = write_CRFs_compare(Y_testing, y_pred)
        for value in list_write:
            pred_list = value[0]
            test_list = value[1]

            for i in range(0, len(pred_list)):
                print(str(pred_list[i]) + '\t' + str(test_list[i]))
Example #17
def Chain_CRF(x, y, x_test, model_args):
    # Reshape for CRF
    #svc = SVC(class_weight='balanced', kernel='rbf', decision_function_shape='ovr')
    #svc.fit(x, y)
    #x = svc.decision_function(x)
    #x_test = svc.decision_function(x_test)
    #scaler = StandardScaler().fit(x)
    #x = scaler.transform(x)
    #x_test = scaler.transform(x_test)
    x = x[:, :11]
    x_test = x_test[:, :11]
    x = x.reshape(-1, 21600, x.shape[-1])
    x_test = x_test.reshape(-1, 21600, x.shape[-1])
    y = y.reshape(-1, 21600)
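    # Note on the reshape above: the slicing keeps the first 11 feature columns,
    # and the reshapes group the flat rows into fixed-length chains of 21600 steps,
    # i.e. x becomes (n_chains, 21600, n_features) and y becomes (n_chains, 21600),
    # which is the per-sequence layout ChainCRF expects.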
    crf = ChainCRF(directed=False)
    ssvm = FrankWolfeSSVM(model=crf,
                          C=model_args['C'],
                          max_iter=model_args['max_iter'])
    ssvm.fit(x, y)
    y_pred = np.array(ssvm.predict(x_test))
    return y_pred.flatten()
Example #18
class CRFModel(object):
    def __init__(self, c_val=1.0):
        self.clf = FrankWolfeSSVM(model=ChainCRF(), C=c_val, max_iter=100)

    #Load the training data
    def load_data(self):
        alphabets = load_letters()
        X = np.array(alphabets['data'])
        y = np.array(alphabets['labels'])
        folds = alphabets['folds']
        return X, y, folds

    #Train the CRF
    def train(self, X_train, y_train):
        self.clf.fit(X_train, y_train)

    #Evaluate the accuracy of the CRF
    def evaluate(self, X_test, y_test):
        return self.clf.score(X_test, y_test)

    #Run the CRF on unknown data
    def classify(self, input_data):
        return self.clf.predict(input_data)[0]
def CRF_pred_label(X, Y, command):
    texts = load_demo_text(command)
    if command == 'twitter':
        convert_texts = filterText_demo(texts, 'removeLink', command)
        X_ftr = load_demo_ftr(command)
        print(len(convert_texts), len(X_ftr))
        path_write = 'D:/Project/Transportation_SMU-NEC_collaboration/Data_demo_Dec_2015/twitter'
        name_write = 'pred_label_' + command

    elif command == 'sgforums':
        convert_texts = filterText_demo(texts, 'removePunc', command)
        X_ftr = load_demo_ftr(command)
        print(len(convert_texts), len(X_ftr))
        path_write = 'D:/Project/Transportation_SMU-NEC_collaboration/Data_demo_Dec_2015/sgforums'
        name_write = 'pred_label_' + command

    elif command == 'facebook':
        convert_texts = filterText_demo(texts, 'removeLink', command)
        X_ftr = load_demo_ftr(command)
        print(len(convert_texts), len(X_ftr))
        path_write = 'D:/Project/Transportation_SMU-NEC_collaboration/Data_demo_Dec_2015/facebook'
        name_write = 'pred_label_' + command

    crf = ChainCRF(inference_method='max-product', directed=False, class_weight=None)
    ssvm = FrankWolfeSSVM(model=crf, C=1.0, max_iter=100)
    ssvm.fit(X, Y)
    y_pred = ssvm.predict(X_ftr)

    list_write = list()
    for line in y_pred:
        labels = ''
        for label in line:
            labels += str(label) + '\t'
        list_write.append(labels.strip())

    write_file(path_write, name_write, list_write)
Example #20
rnd = np.random.RandomState(1)
selected = rnd.randint(len(y_test), size=n_words)
max_word_len = max([len(y_) for y_ in y_test[selected]])
fig, axes = plt.subplots(n_words, max_word_len, figsize=(10, 10))
fig.subplots_adjust(wspace=0)
fig.text(0.2, 0.05, 'GT', color="#00AA00", size=25)
fig.text(0.4, 0.05, 'SVM', color="#5555FF", size=25)
fig.text(0.6, 0.05, 'HALF-LCCRF', color="#FF5555", size=25)
fig.text(0.8, 0.05, 'FULL-LCCRF', color="#FFD700", size=25)

fig.text(0.05, 0.5, 'Word', color="#000000", size=25)
fig.text(0.5, 0.95, 'Letters', color="#000000", size=25)

for ind, axes_row in zip(selected, axes):
    y_pred_svm = svm.predict(X_test[ind])
    y_pred_half = half_ssvm.predict([X_test[ind]])[0]
    y_pred_crf = ssvm.predict([X_test[ind]])[0]

    for i, (a, image, y_true, y_svm, y_half, y_crf) in enumerate(
            zip(axes_row, X_test[ind], y_test[ind], y_pred_svm, y_pred_half, y_pred_crf)):
        a.matshow(image.reshape(16, 8), cmap=plt.cm.Greys)
        a.text(0, 3, abc[y_true], color="#00AA00", size=25)    # Green
        a.text(0, 14, abc[y_svm], color="#5555FF", size=25)    # Blue
        a.text(5, 14, abc[y_half], color="#FF5555", size=25)  # Red
        a.text(5, 3, abc[y_crf], color="#FFD700", size=25)     # Yellow
        a.set_xticks(())
        a.set_yticks(())
    for ii in range(i + 1, max_word_len):
        axes_row[ii].set_visible(False)

w = ssvm.w[26 * 8 * 16:].reshape(26, 26)
Example #21
kf = KFold(n_splits=n_folds)
fold = 0
for train_index, test_index in kf.split(X):
    print(' ')
    print('train index {}'.format(train_index))
    print('test index {}'.format(test_index))
    print('{} jackets for training, {} for testing'.format(
        len(train_index), len(test_index)))
    X_train = X[train_index]
    Y_train = Y[train_index]
    X_test = X[test_index]
    Y_test = Y[test_index]
    """ YOUR S-SVM TRAINING CODE HERE """
    ssvm.fit(X_train, Y_train)
    """ LABEL THE TESTING SET AND PRINT RESULTS """
    Y_pred = ssvm.predict(X_test)
    wrong_segments_crf.append(np.sum(Y_pred != Y_test))
    score = ssvm.score(X_test, Y_test)
    scores_crf[fold] = score
    """ figure showing the result of classification of segments for
    each jacket in the testing part of present fold """
    if plot_labeling:
        for ti, pred in zip(test_index, Y_pred):
            print(ti)
            print(pred)
            s = segments[ti]
            plot_segments(s,
                          caption='SSVM predictions for jacket ' + str(ti + 1),
                          labels_segments=pred)
    """ YOUR LINEAR SVM TRAINING CODE HERE """
    svm.fit(X_train.reshape((-1, num_features)), Y_train.reshape((-1)))
Example #22
print("Test score with chain CRF: %f" % ssvm.score(X_test, y_test))

print("Test score with linear SVM: %f" %
      svm.score(np.vstack(X_test), np.hstack(y_test)))

# plot some word sequences
n_words = 4
rnd = np.random.RandomState(1)
selected = rnd.randint(len(y_test), size=n_words)
max_word_len = max([len(y_) for y_ in y_test[selected]])
fig, axes = plt.subplots(n_words, max_word_len, figsize=(10, 10))
fig.subplots_adjust(wspace=0)
for ind, axes_row in zip(selected, axes):
    y_pred_svm = svm.predict(X_test[ind])
    y_pred_chain = ssvm.predict([X_test[ind]])[0]
    for i, (a, image, y_true, y_svm, y_chain) in enumerate(
            zip(axes_row, X_test[ind], y_test[ind], y_pred_svm, y_pred_chain)):
        a.matshow(image.reshape(16, 8), cmap=plt.cm.Greys)
        a.text(0, 3, abc[y_true], color="#00AA00", size=25)
        a.text(0, 14, abc[y_svm], color="#5555FF", size=25)
        a.text(5, 14, abc[y_chain], color="#FF5555", size=25)
        a.set_xticks(())
        a.set_yticks(())
    for ii in range(i + 1, max_word_len):
        axes_row[ii].set_visible(False)

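# The chain CRF weight vector stores the 26 * (16 * 8) unary weights first,
# followed by the 26 * 26 pairwise (transition) weights that are plotted below.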
plt.matshow(ssvm.w[26 * 8 * 16:].reshape(26, 26))
plt.title("Transition parameters of the chain CRF.")
plt.xticks(np.arange(26), abc)
plt.yticks(np.arange(26), abc)
Example #23
class CRFClassifierText(object):

    IGNORE_IF = re.compile(r'(in press|submitted|to appear)',
                           flags=re.IGNORECASE)

    QUOTES_AROUND_ETAL_REMOVE = re.compile(r'(.*)(")(et al\.?)(")(.*)',
                                           re.IGNORECASE)
    TO_ADD_DOT_AFTER_INITIALS = re.compile(
        r'\b([A-Z]{1}(?!\.))([\s,]+)([A-Z12(]|and)')
    TO_ADD_SEPARATE_INITIALS = re.compile(r'\b([A-Z]{1})([A-Z]{1})([,\s]{1})')
    SEPARATE_AUTHOR = re.compile(r'^((.*?)([\d\":]+))(.*)$')
    TO_REMOVE_HYPEN_NEAR_INITIAL = [
        re.compile(r'([A-Z]\.)(\-)([A-Z]\.)'),
        re.compile(r'([A-Z])(\-)(\.)'),
        re.compile(r'([A-Z])(\-)([A-Z])\b')
    ]

    URL_EXTRACTOR = re.compile(r'((url\s*)?(http)s?://[A-z0-9\-\.\/\={}?&%]+)',
                               re.IGNORECASE)
    MONTH_NAME_EXTRACTOR = re.compile(
        r'\b([Jj]an(?:uary)?|[Ff]eb(?:ruary)?|[Mm]ar(?:ch)?|[Aa]pr(?:il)?|[Mm]ay|[Jj]un(?:e)?|[Jj]ul(?:y)?|[Aa]ug(?:ust)?|[Ss]ep(?:tember)?|[Oo]ct(?:ober)?|([Nn]ov|[Dd]ec)(?:ember)?)\b'
    )

    URL_TO_DOI = re.compile(
        r'((url\s*)?(https\s*:\s*//\s*|http\s*:\s*//\s*)((.*?)doi(.*?)org/))|(DOI:https\s*://\s*)',
        flags=re.IGNORECASE)
    URL_TO_ARXIV = re.compile(
        r'((url\s*)?(https://|http://)(arxiv.org/(abs|pdf)/))',
        flags=re.IGNORECASE)
    URL_TO_ASCL = re.compile(r'((url\s*)?(https://|http://)(ascl.net/))',
                             flags=re.IGNORECASE)
    ADD_COLON_TO_IDENTIFIER = re.compile(r'(\s+(DOI|arXiv|ascl))(:?\s*)',
                                         flags=re.IGNORECASE)

    IS_START_WITH_YEAR = re.compile(r'(^[12][089]\d\d)')
    START_WITH_AUTHOR = re.compile(r'([A-Za-z].*$)')

    WORD_BREAKER_REMOVE = [re.compile(r'([A-Za-z]+)([\-]+\s+)([A-Za-z]+)')]

    TOKENS_NOT_IDENTIFIED = re.compile(r'\w+\b(?!\|)')

    REFERENCE_TOKENIZER = re.compile(r'([\s.,():;\[\]\'\"#\/])')
    TAGGED_MULTI_WORD_TOKENIZER = re.compile(r'([\s.,])')

    # is all capital
    IS_ALL_CAPITAL = re.compile(r'^([A-Z]+)$')
    # is only the first character capital
    IS_FIRST_CAPITAL = re.compile(r'^([A-Z][a-z]+)$')
    # is alphabet only, consider hyphenated words also
    IS_ALPHABET = re.compile(r'^(?=.*[a-zA-Z])([a-zA-Z\-]+)$')
    # is numeric only, consider the page range with - being also numeric
    # also include arxiv id with a dot to be numeric
    # note that this differs from function is_numeric in the
    # sense that this recognizes numeric even if it was not identified/tagged
    IS_NUMERIC = re.compile(r'^(?=.*[0-9])([0-9\-\.]+)$')
    # is alphanumeric, must have at least one digit and one alphabet character
    IS_ALPHANUMERIC = re.compile(r'^(?=.*[0-9])(?=.*[a-zA-Z])([a-zA-Z0-9]+)$')

    ADD_SPACE_BETWEEN_TWO_IDENTIFIED_TOKENS = re.compile(
        r'(\|[a-z\_]+\|)(\|[a-z\_]+\|)')
    REGEX_PATTERN_WHOLE_WORD_ONLY = r'(?:\b|\B)%s(?:\b|\B)'

    nltk_tagger = None
    crf = None
    X = y = label_code = folds = None

    def __init__(self):
        """

        """
        self.originator_token = OriginatorToken(self.REFERENCE_TOKENIZER)
        self.numeric_token = NumericToken()
        self.pub_token = PubToken()
        self.unknown_tokens = []
        self.filename = os.path.dirname(
            __file__) + '/serialized_files/crfModelText.pkl'

    def create_crf(self):
        """

        :return:
        """
        # load the nltk tagger, a time-consuming operation that is needed only once
        self.nltk_tagger = nltk.tag._get_tagger()
        self.crf = FrankWolfeSSVM(model=ChainCRF(), C=1.0, max_iter=50)
        self.X, self.y, self.label_code, self.folds, generate_fold = self.load_training_data(
        )

        score = 0
        # only need to iterate through if fold was generated
        num_tries = 10 if generate_fold else 1
        while (score <= 0.90) and (num_tries > 0):
            try:
                X_train, y_train = self.get_train_data()
                self.train(X_train, y_train)

                X_test, y_test = self.get_test_data()
                score = self.evaluate(X_test, y_test)
            except Exception as e:
                current_app.logger.error('Exception: %s' % (str(e)))
                current_app.logger.error(traceback.format_exc())
                pass
            num_tries -= 1
        return (score > 0)

    def format_training_data(self, the_data):
        """

        :param the_data:
        :return:
        """
        # get label, word in the original presentation
        labels = [[elem[0] for elem in ref] for ref in the_data]
        words = [[elem[1] for elem in ref] for ref in the_data]

        # count how many unique labels there are, and build a dict that maps each label to a numeric code
        label_code = self.encoder(labels)

        numeric_labels = []
        features = []
        for label, word in zip(labels, words):
            # replace each label with its numeric code, keeping the original order
            numeric_label = []
            for l in label:
                numeric_label.append(label_code[l])
            numeric_labels.append(np.array(numeric_label))

            # compute the numeric features for each word, aligned with the label at the same index
            feature = []
            for idx in range(len(word)):
                feature.append(self.get_data_features(word, idx, label))
            features.append(np.array(feature))
        return features, numeric_labels, label_code

    def get_num_states(self):
        """

        :return:
        """
        num_states = len(
            np.unique(np.hstack([y for y in self.y[self.folds != 0]])))
        current_app.logger.debug("number of states = %s" % num_states)
        return num_states

    def get_folds_array(self, filename):
        """
        read the distribution of train and test indices from file
        :param filename:
        :return:
        """
        with open(filename, 'r') as f:
            reader = f.readlines()
            for line in reader:
                if line.startswith("STATIC_FOLD"):
                    try:
                        return eval(line.split(" = ")[1])
                    except:
                        return None

    def get_train_data(self):
        """

        :return:
        """
        return self.X[self.folds != 0], self.y[self.folds != 0]

    def get_test_data(self):
        """

        :return:
        """
        return self.X[self.folds == 0], self.y[self.folds == 0]

    def train(self, X_train, y_train):
        """
        :param X_train: is a numpy array of samples where each sample
                        has the shape (n_labels, n_features)
        :param y_train: is numpy array of labels
        :return:
        """
        self.crf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        """

        :param X_test:
        :param y_test:
        :return:
        """
        return self.crf.score(X_test, y_test)

    def decoder(self, numeric_label):
        """

        :param numeric_label:
        :return:
        """
        labels = []
        for nl in numeric_label:
            key = next(key for key, value in self.label_code.items()
                       if value == nl)
            labels.append(key)
        return labels

    def encoder(self, labels):
        """

        :param labels:
        :return: dict of labels as key and numeric value is its value
        """
        # assign a numeric value to each label
        label_code = {}
        numeric = -1
        for label in labels:
            for l in label:
                if (numeric >= 0 and l in label_code):
                    continue
                else:
                    numeric = numeric + 1
                    label_code[l] = numeric
        return label_code

    def load_training_data(self):
        """
        load training/test data
        :return:
        """
        training_files_path = os.path.dirname(__file__) + '/training_files/'
        arXiv_text_ref_filenames = [
            training_files_path + 'arxiv.raw',
        ]
        references = []
        for f in arXiv_text_ref_filenames:
            references = references + get_arxiv_tagged_data(f)

        X, y, label_code = self.format_training_data(references)

        # for now use static division. see comments in foldModelText.dat
        generate_fold = False
        if generate_fold:
            folds = list(np.random.choice(range(0, 9), len(y)))
        else:
            folds = self.get_folds_array(training_files_path +
                                         'foldModelText.dat')

        return np.array(X, dtype=object), np.array(
            y, dtype=object), label_code, np.array(folds), generate_fold

    def save(self):
        """
        save object to a pickle file
        :return:
        """
        try:
            with open(self.filename, "wb") as f:
                pickler = pickle.Pickler(f, -1)
                pickler.dump(self.crf)
                pickler.dump(self.label_code)
                pickler.dump(self.nltk_tagger)
            current_app.logger.info("saved crf in %s." % self.filename)
            return True
        except Exception as e:
            current_app.logger.error('Exception: %s' % (str(e)))
            current_app.logger.error(traceback.format_exc())
            return False

    def load(self):
        """

        :return:
        """
        try:
            with open(self.filename, "rb") as f:
                unpickler = pickle.Unpickler(f)
                self.crf = unpickler.load()
                self.label_code = unpickler.load()
                self.nltk_tagger = unpickler.load()
            current_app.logger.info("loaded crf from %s." % self.filename)
            return self.crf
        except Exception as e:
            current_app.logger.error('Exception: %s' % (str(e)))
            current_app.logger.error(traceback.format_exc())

    def search(self, pattern, text):
        """
        search whole word only in the text
        :param pattern:
        :param text:
        :return: True/False depending on whether it is found
        """
        try:
            return re.search(self.REGEX_PATTERN_WHOLE_WORD_ONLY % pattern,
                             text) is not None
        except:
            return False

    def reference(self, refstr, words, labels):
        """
        put identified words into a dict to be passed out

        :param words:
        :param labels:
        :return:
        """
        ref_dict = {}
        ref_dict['authors'] = self.originator_token.collect_tagged_tokens(
            words, labels)
        if 'DOI' in labels or 'ARXIV' in labels or 'ASCL' in labels:
            ref_dict.update(
                self.numeric_token.collect_id_tagged_tokens(words, labels))
        if 'YEAR' in labels:
            ref_dict['year'] = words[labels.index('YEAR')]
        if 'VOLUME' in labels:
            volume = self.numeric_token.collect_tagged_numerals_token(
                words, labels, 'VOLUME')
            if volume:
                ref_dict['volume'] = volume
        if 'PAGE' in labels:
            page = self.numeric_token.collect_tagged_numerals_token(
                words, labels, 'PAGE')
            if page:
                ref_dict['page'] = page
        if 'ISSUE' in labels:
            ref_dict['issue'] = words[labels.index('ISSUE')]
        if 'ISSN' in labels:
            ref_dict['ISSN'] = words[labels.index('ISSN')]
        if 'JOURNAL' in labels:
            ref_dict['journal'] = self.pub_token.collect_tagged_journal_tokens(
                words, labels)
        if 'TITLE' in labels:
            title = self.pub_token.collect_tagged_title_tokens(words, labels)
            if title:
                ref_dict['title'] = title
        ref_dict['refstr'] = refstr
        return ref_dict

    def punctuation_features(self, ref_word, ref_label):
        """
        return a feature vector that has 1 in the first cell if ref_word is a punctuation
        followed by 1 in the position corresponding to which one

        :param ref_word:
        :param ref_label:
        :return:
        """
        which = which_punctuation(ref_word, ref_label)
        return [
            1 if which == 0 else 0,  # 0 if punctuation,
            1 if which == 1 else 0,  # 1 if brackets,
            1 if which == 2 else 0,  # 2 if colon,
            1 if which == 3 else 0,  # 3 if comma,
            1 if which == 4 else 0,  # 4 if dot,
            1 if which == 5 else 0,  # 5 if parenthesis,
            1 if which == 6 else 0,  # 6 if quotes (both single and double),
            1 if which == 7 else 0,  # 7 if num signs,
            1 if which == 8 else 0,  # 8 if hyphen,
            1 if which == 9 else 0,  # 9 if forward slash,
            1 if which == 10 else 0,  # 10 if semicolon,
        ]

    def is_token_unknown(self, ref_word, ref_label):
        """

        :param ref_word:
        :param ref_label:
        :return:
        """
        if ref_label:
            return 1 if ref_label == 'NA' else 0

        if ref_word is None:
            return 0
        return int(any(ref_word == token for token in self.unknown_tokens))

    def length_features(self, ref_word):
        """
        distinguish between tokens of length 1 and longer

        :param ref_word:
        :return:
        """
        return [1 if len(ref_word) == 1 else 0, 1 if len(ref_word) > 1 else 0]

    def get_data_features(self, ref_word_list, index, ref_label_list=None):
        """

        :param ref_word_list: has the form [e1,e2,e3,..]
        :param index: the position of the word in the set, assume it is valid
        :param ref_label_list: labels for ref_word_list available during training only
        :return:
        """
        ref_word = ref_word_list[index]
        ref_label = ref_label_list[index] if ref_label_list else None
        return \
              self.length_features(ref_word)                                                \
            + self.originator_token.author_features(ref_word_list, ref_label_list, index)   \
            + self.pub_token.title_features(ref_word_list, ref_label_list, index)           \
            + self.pub_token.journal_features(ref_word_list, ref_label_list, index)         \
            + self.numeric_token.numeric_features(ref_word, ref_label)                      \
            + self.numeric_token.identifying_word_features(ref_word, ref_label)             \
            + self.punctuation_features(ref_word, ref_label)                                \
            + self.pub_token.publisher_features(ref_word, ref_label)                        \
            + self.originator_token.editor_features(ref_word_list, ref_label_list, index)   \
            + [
                int(self.IS_ALL_CAPITAL.match(ref_word) is not None),                       # is element all capital
                int(self.IS_FIRST_CAPITAL.match(ref_word) is not None),                     # is first character capital
                int(self.IS_ALPHABET.match(ref_word) is not None),                          # is alphabet only, consider hyphenated words also
                int(self.IS_NUMERIC.match(ref_word) is not None),                           # is numeric only, consider the page range with - being also numeric
                int(self.IS_ALPHANUMERIC.match(ref_word) is not None),                      # is alphanumeric, must have at least one digit and one alphabet character
                self.is_token_unknown(ref_word, ref_label),                                 # is it one of the words unable to guess
                self.pub_token.is_token_stopword(ref_word, ref_label),                      # is it one of tagged stopwords
              ]

    def segment(self, reference_str):
        """
        Attempt to segment the reference string.
        Each token that is identified is removed from reference_str;
        in reverse order, the identified tokens are inserted back into
        reference_str before feature extraction.

        :param reference_str:
        :return:
        """
        if isinstance(reference_str, list):
            return []

        # start fresh
        self.numeric_token.clear()
        self.originator_token.clear()
        self.pub_token.clear()
        na_url = None
        na_month = None

        # step 1: remove any non essential tokens (ie, urls, months, etc)
        matches = self.URL_EXTRACTOR.findall(reference_str)
        if len(matches) > 0:
            na_url = []
            for i, url in enumerate(matches, start=1):
                na_url.append(url[0])
                reference_str = reference_str.replace(url[0],
                                                      '|na_url_%d|' % i)
        extractor = self.MONTH_NAME_EXTRACTOR.search(reference_str)
        if extractor:
            na_month = extractor.group().strip()
            reference_str = reference_str.replace(na_month, '|na_month|')

        # step 2: identify doi/arxiv/ascl
        reference_str = self.numeric_token.segment_ids(reference_str)

        # step 3: identify list of authors and editors
        reference_str = self.originator_token.identify(reference_str)

        # step 4: identify title and journal substrings
        # but first remove any numerical identifying words
        reference_str = self.pub_token.identify(
            self.numeric_token.remove_identifying_words(reference_str).strip(),
            self.nltk_tagger, self.originator_token.indices(),
            self.originator_token.have_editor())

        # step 5: identify year, volume, page, issue
        reference_str = self.numeric_token.segment_numerals(reference_str)

        # collect all tokens that have not been identified
        self.unknown_tokens = self.TOKENS_NOT_IDENTIFIED.findall(reference_str)
        if na_url:
            self.unknown_tokens.append(' '.join(na_url))
        if na_month:
            self.unknown_tokens.append(na_month)

        # now put the identified tokens back into the string before tokenizing and sending to the crf

        # step 5 reverse
        reference_str = self.numeric_token.assemble_stage1(reference_str)

        # step 4 reverse
        reference_str = self.pub_token.assemble(reference_str)

        # step 3 reverse
        reference_str = self.originator_token.assemble(reference_str)

        # tokenize
        ref_words = list(
            filter(None, [
                w.strip() for w in self.REFERENCE_TOKENIZER.split(
                    self.ADD_SPACE_BETWEEN_TWO_IDENTIFIED_TOKENS.sub(
                        r'\1 \2', reference_str))
            ]))

        # step 2 reverse
        ref_words = self.numeric_token.assemble_stage2(ref_words)

        # step 1 reverse
        if na_month:
            ref_words[ref_words.index('|na_month|')] = na_month
        if na_url:
            for i, url in enumerate(na_url, start=1):
                ref_words[ref_words.index('|na_url_%d|' % i)] = url

        return ref_words

    def dots_after_initials(self, reference_str):
        """

        :param reference_str:
        :return:
        """
        try:
            author_part = self.SEPARATE_AUTHOR.search(reference_str).group(1)
            # separate first and middle initials if there are any attached, add dot after each
            # make sure there is a dot after single character, repeat to capture middle name
            reference_str = reference_str.replace(
                author_part,
                self.TO_ADD_SEPARATE_INITIALS.sub(
                    r"\1. \2. \3",
                    self.TO_ADD_DOT_AFTER_INITIALS.sub(
                        r"\1.\2\3",
                        self.TO_ADD_DOT_AFTER_INITIALS.sub(
                            r"\1.\2\3", author_part))))
        except:
            pass

        return reference_str

    def pre_processing(self, reference_str):
        """
        
        :param reference_str: 
        :return: 
        """
        # remove any numbering that appears before the reference to start with authors
        # exception is the year
        if self.IS_START_WITH_YEAR.search(reference_str) is None:
            reference_str = self.START_WITH_AUTHOR.search(
                reference_str).group()

        # also if for some reason et al. has been put in double quoted! remove them
        reference_str = self.QUOTES_AROUND_ETAL_REMOVE.sub(
            r"\1\3\5", reference_str)
        # if there is a hyphen either between initials, or after initials and before a dot, remove it
        for rhni, replace in zip(self.TO_REMOVE_HYPEN_NEAR_INITIAL,
                                 [r"\1 \3", r"\1\3", r"\1. \3"]):
            reference_str = rhni.sub(replace, reference_str)
        # add dots after initials, separate first and middle if needed
        reference_str = self.dots_after_initials(reference_str)
        # if no colon after the identifier, add it in
        reference_str = self.ADD_COLON_TO_IDENTIFIER.sub(r"\1:", reference_str)
        # if there is a url for DOI, turn it into a recognizable DOI
        reference_str = self.URL_TO_DOI.sub(r"DOI:", reference_str)
        # if there is a url for arxiv, turn it into a recognizable arXiv id
        reference_str = self.URL_TO_ARXIV.sub(r"arXiv:", reference_str)
        # if there is a url for ascl, turn it into a recognizable ascl id
        reference_str = self.URL_TO_ASCL.sub(r"ascl:", reference_str)

        for rwb in self.WORD_BREAKER_REMOVE:
            reference_str = rwb.sub(r'\1\3', reference_str)

        return reference_str

    def classify(self, reference_str):
        """
        Run the classifier on input data
        
        :param reference_str:
        :return: list of words and the corresponding list of labels
        """
        reference_str = self.pre_processing(reference_str)
        ref_words = self.segment(reference_str)

        features = []
        for i in range(len(ref_words)):
            features.append(self.get_data_features(ref_words, i, []))

        ref_labels = self.decoder(self.crf.predict([np.array(features)])[0])
        return ref_words, ref_labels

    def parse(self, reference_str):
        """

        :param reference_str:
        :return:
        """
        if self.IGNORE_IF.search(reference_str):
            return None
        words, labels = self.classify(reference_str)
        return self.reference(reference_str, words, labels)

    def tokenize(self, reference_str):
        """
        used for unittest only

        :param reference_str:
        :return:
        """
        if self.IGNORE_IF.search(reference_str):
            return None
        words, _ = self.classify(reference_str)
        return words
def classify(traincorpus, testcorpus):

    model = ChainCRF()
    ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=10)
	
    pos_lexicon = load_lexicon("lexica/restaurants/ote/pos")
    term_lexicon = load_lexicon("lexica/restaurants/ote/term")
    pre1_lexicon = load_lexicon("lexica/restaurants/ote/prefix1")
    pre2_lexicon = load_lexicon("lexica/restaurants/ote/prefix2")
    pre3_lexicon = load_lexicon("lexica/restaurants/ote/prefix3")
    suf1_lexicon = load_lexicon("lexica/restaurants/ote/suffix1")
    suf2_lexicon = load_lexicon("lexica/restaurants/ote/suffix2")
    suf3_lexicon = load_lexicon("lexica/restaurants/ote/suffix3")
    
    train_sentences = [] #the list to be used to store our features for the words    
    sentence_labels = [] #the list to be used for labeling if a word is an aspect term

    print('Creating train feature vectors...')

    #extract the sentences and append their labels
    for instance in traincorpus.corpus:
        words = nltk.word_tokenize(instance.text)
        
        tags = nltk.pos_tag(words)
        tags_list = [] #the pos list
        for _, t in tags:
                tags_list.append(t)

        last_prediction = ""

        train_words = []
        word_labels = []
        for i, w in enumerate(words):
            word_found = False
            if words[i] == w:
                word_found = True
                
                pos_feats = []
                previous_pos_feats = []
                second_previous_pos_feats = []
                next_pos_feats = []
                second_next_pos_feats = []
                morph_feats = []
                term_feats = []
                pre1_feats = []
                pre2_feats = []
                pre3_feats = []
                suf1_feats = []
                suf2_feats = []
                suf3_feats = []

                target_labels = []
                train_word_features = []

                #prefix of lengths 1,2,3 lexicon features
                for p1 in pre1_lexicon:
                    if p1 == w[0]:
                        pre1_feats.append(1)
                    else:
                        pre1_feats.append(0)

                for p2 in pre2_lexicon:
                    if len(w) > 1:
                        if p2 == w[0]+w[1]:
                            pre2_feats.append(1)
                        else:
                            pre2_feats.append(0)
                    else:
                        pre2_feats.append(0)

                for p3 in pre3_lexicon:
                    if len(w) > 2:
                        if p3 == w[0]+w[1]+w[2]:
                            pre3_feats.append(1)
                        else:
                            pre3_feats.append(0)
                    else:
                        pre3_feats.append(0)

                #suffix of lengths 1,2,3 lexicon features
                for s1 in suf1_lexicon:
                    if s1 == w[-1]:
                        suf1_feats.append(1)
                    else:
                        suf1_feats.append(0)

                for s2 in suf2_lexicon:
                    if len(w) > 1:
                        if s2 == w[-2]+w[-1]:
                            suf2_feats.append(1)
                        else:
                            suf2_feats.append(0)
                    else:
                        suf2_feats.append(0)

                for s3 in suf3_lexicon:
                    if len(w) > 2:
                        if s3 == w[-3]+w[-2]+w[-1]:
                            suf3_feats.append(1)
                        else:
                            suf3_feats.append(0)
                    else:
                        suf3_feats.append(0)

                #frequent term lexicon features
                for t in term_lexicon:
                    if t == w.lower():
                        term_feats.append(1)
                    else:
                        term_feats.append(0)

                #morphological features
                if w[0].isupper(): #is first letter capital
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                capitals = 0
                lowers = 0
                for letter in w:
                    if letter.isupper():
                        capitals = capitals + 1
                    if letter.islower():
                        lowers = lowers + 1

                if w[0].islower() and capitals > 0: #contains capitals, except 1st letter
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                if capitals == len(w): #is all letters capitals
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                if lowers == len(w): #is all letters lower
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                if len(re.findall(r"\d", w)) == len(w): #is all letters digits
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                if len(re.findall(r"[a-zA-Z]", w)) == len(w): #is all letters words
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                if len(re.findall(r"[.]", w)) > 0: #is there a '.'
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                if len(re.findall(r"[-]", w)) > 0: #is there a '-'
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                if len(re.findall(r'''[][,;"'?():_`]''', w)) > 0: #is there a punctuation mark, except '.', '-'
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)
                
                for p in pos_lexicon:
                    #check the POS tag of the current word
                    if tags_list[i] == p:
                        pos_feats.append(1)
                    else:
                        pos_feats.append(0)
                            
                    #check the POS tag of the previous word (if the index is IN list's bounds)
                    if (i-1) >= 0:
                        if tags_list[i-1] == p:
                            previous_pos_feats.append(1)
                        else:
                            previous_pos_feats.append(0)
                    else:
                        previous_pos_feats.append(0)
                            
                    #check the POS tag of the 2nd previous word (if the index is IN list's bounds)
                    if (i-2) >= 0:
                        if tags_list[i-2] == p:
                            second_previous_pos_feats.append(1)
                        else:
                            second_previous_pos_feats.append(0)
                    else:
                        second_previous_pos_feats.append(0)
                            
                    #check the POS tag of the next word (if the index is IN list's bounds)
                    if (i+1) < len(words):
                        if tags_list[i+1] == p:
                            next_pos_feats.append(1)
                        else:
                            next_pos_feats.append(0)
                    else:
                        next_pos_feats.append(0)
                            
                    #check the POS tag of the 2nd next word (if the index is IN list's bounds)
                    if (i+2) < len(words):
                        if tags_list[i+2] == p:
                            second_next_pos_feats.append(1)
                        else:
                            second_next_pos_feats.append(0)
                    else:
                        second_next_pos_feats.append(0)
                            
                #label the word using the IOB scheme:
                #B: start of aspect term, I: continuation of aspect term, O: not an aspect term
                term_found = False                
                for aspect_term in set(instance.get_aspect_terms()):
                    term_words = aspect_term.split()
                    for term_index, term in enumerate(term_words):
                        if (w.lower() == term) and (term_found is False):
                            if term_index == 0:
                                target_labels = [1] #1 is "B"
                                last_prediction = "1"
                                term_found = True                            
                            else:
                                if (last_prediction == "1") or (last_prediction == "2"):
                                    target_labels = [2] #2 is "I"
                                    last_prediction = "2"
                                    term_found = True                            
                                else:
                                    target_labels = [0]
                                    last_prediction = "0"

                if term_found is False:
                    target_labels = [0] #0 is "O"
                    last_prediction = "0"
            
                train_word_features = [pos_feats + previous_pos_feats + second_previous_pos_feats +
                                next_pos_feats + second_next_pos_feats + morph_feats + term_feats +
                                pre1_feats + pre2_feats + pre3_feats + suf1_feats + suf2_feats + suf3_feats]
            if word_found is True:
                train_words.append(train_word_features)
                word_labels.append(target_labels)

        train_sentences_array = np.zeros((len(train_words), len(train_words[0][0])))
        index_i = 0
        for word in train_words:
            index_j = 0
            for features in word:
                for f in features:
                    train_sentences_array[index_i, index_j] = f
                    index_j = index_j + 1
            index_i = index_i + 1
        train_sentences.append(train_sentences_array)        

        sentence_labels_array = np.zeros((len(word_labels)))
        index_i = 0
        for label in word_labels:
            sentence_labels_array[index_i] = label[0]
            index_i = index_i + 1
        sentence_labels.append(sentence_labels_array.astype(np.int64))

    # the chain CRF needs a list (one entry per sentence) of 2d arrays of
    # shape (n_words, n_features) holding the features extracted from each
    # word; the sentence labels must be arrays of type int
    ssvm.fit(train_sentences, sentence_labels)

    print('Done!')
    print('Creating test feature vectors...')
    
    test_sentences = []
    for instance in testcorpus.corpus:
        words = nltk.word_tokenize(instance.text)
        
        tags = nltk.pos_tag(words)
        tags_list = [] #the pos list
        for _, t in tags:
            tags_list.append(t)

        test_words = []
        for i, w in enumerate(words):
            word_found = False
            if words[i] == w:
                word_found = True
                
                pos_feats = []
                previous_pos_feats = []
                second_previous_pos_feats = []
                next_pos_feats = []
                second_next_pos_feats = []
                morph_feats = []
                term_feats = []
                pre1_feats = []
                pre2_feats = []
                pre3_feats = []
                suf1_feats = []
                suf2_feats = []
                suf3_feats = []

                test_word_features = []

                #prefix 1,2,3 lexicon features
                for p1 in pre1_lexicon:
                    if p1 == w[0]:
                        pre1_feats.append(1)
                    else:
                        pre1_feats.append(0)

                for p2 in pre2_lexicon:
                    if len(w) > 1:
                        if p2 == w[0]+w[1]:
                            pre2_feats.append(1)
                        else:
                            pre2_feats.append(0)
                    else:
                        pre2_feats.append(0)

                for p3 in pre3_lexicon:
                    if len(w) > 2:
                        if p3 == w[0]+w[1]+w[2]:
                            pre3_feats.append(1)
                        else:
                            pre3_feats.append(0)
                    else:
                        pre3_feats.append(0)

                #suffix 1,2,3 lexicon features
                for s1 in suf1_lexicon:
                    if s1 == w[-1]:
                        suf1_feats.append(1)
                    else:
                        suf1_feats.append(0)

                for s2 in suf2_lexicon:
                    if len(w) > 1:
                        if s2 == w[-2]+w[-1]:
                            suf2_feats.append(1)
                        else:
                            suf2_feats.append(0)
                    else:
                        suf2_feats.append(0)

                for s3 in suf3_lexicon:
                    if len(w) > 2:
                        if s3 == w[-3]+w[-2]+w[-1]:
                            suf3_feats.append(1)
                        else:
                            suf3_feats.append(0)
                    else:
                        suf3_feats.append(0)

                #term lexicon features
                for t in term_lexicon:
                    if t == w.lower():
                        term_feats.append(1)
                    else:
                        term_feats.append(0)

                #morphological features
                if w[0].isupper(): #is first letter capital
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                capitals = 0
                lowers = 0
                for letter in w:
                    if letter.isupper():
                        capitals = capitals + 1
                    if letter.islower():
                        lowers = lowers + 1

                if w[0].islower() and capitals > 0: #contains capitals, except 1st letter
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                if capitals == len(w): #is all letters capitals
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                if lowers == len(w): #is all letters lower
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                if len(re.findall(r"\d", w)) == len(w): #is all letters digits
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                if len(re.findall(r"[a-zA-Z]", w)) == len(w): #is all letters words
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                if len(re.findall(r"[.]", w)) > 0: #is there a '.'
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                if len(re.findall(r"[-]", w)) > 0: #is there a '-'
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                if len(re.findall(r'''[][,;"'?():_`]''', w)) > 0: #is there a punctuation mark, except '.', '-'
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)
                
                for p in pos_lexicon:
                    #check the POS tag of the current word
                    if tags_list[i] == p:
                        pos_feats.append(1)
                    else:
                        pos_feats.append(0)
                            
                    #check the POS tag of the previous word (if the index is IN list's bounds)
                    if (i-1) >= 0:
                        if tags_list[i-1] == p:
                            previous_pos_feats.append(1)
                        else:
                            previous_pos_feats.append(0)
                    else:
                        previous_pos_feats.append(0)
                            
                    #check the POS tag of the 2nd previous word (if the index is IN list's bounds)
                    if (i-2) >= 0:
                        if tags_list[i-2] == p:
                            second_previous_pos_feats.append(1)
                        else:
                            second_previous_pos_feats.append(0)
                    else:
                        second_previous_pos_feats.append(0)
                            
                    #check the POS tag of the next word (if the index is IN list's bounds)
                    if (i+1) < len(words):
                        if tags_list[i+1] == p:
                            next_pos_feats.append(1)
                        else:
                            next_pos_feats.append(0)
                    else:
                        next_pos_feats.append(0)
                            
                    #check the POS tag of the 2nd next word (if the index is IN list's bounds)
                    if (i+2) < len(words):
                        if tags_list[i+2] == p:
                            second_next_pos_feats.append(1)
                        else:
                            second_next_pos_feats.append(0)
                    else:
                        second_next_pos_feats.append(0)
            
                test_word_features = [pos_feats + previous_pos_feats + second_previous_pos_feats +
                                next_pos_feats + second_next_pos_feats + morph_feats + term_feats +
                                pre1_feats + pre2_feats + pre3_feats + suf1_feats + suf2_feats + suf3_feats]
            if word_found is True:
                test_words.append(test_word_features)

        test_sentences_array = np.zeros((len(test_words), len(test_words[0][0])))
        index_i = 0
        for word in test_words:
            index_j = 0
            for features in word:
                for f in features:
                    test_sentences_array[index_i, index_j] = f
                    index_j = index_j + 1
            index_i = index_i + 1
        test_sentences.append(test_sentences_array)

    print('Done!')
    print('Predicting aspect terms...')

    predictions = ssvm.predict(test_sentences)
    #the predict function returns a list (symbolizing the sentences),
    #which contains a list that contains the predicted label for each word
    for sentence_index, sentence_predictions in enumerate(predictions):
            testcorpus.corpus[sentence_index].aspect_terms = []

            predicted_term = ""
            last_prediction = ""
            for word_index, word_prediction in enumerate(sentence_predictions):
                if word_prediction == 1:
                    if last_prediction == 1 or last_prediction == 2:
                        start, end = find_offsets(testcorpus.corpus[sentence_index].text.lower(), predicted_term)
                        testcorpus.corpus[sentence_index].add_aspect_term(term=predicted_term, offsets={'from': str(start), 'to': str(end)})
                        
                    c = find_term(testcorpus.corpus[sentence_index].text.lower(), word_index)
                    predicted_term = c
                    last_prediction = 1
                    
                elif word_prediction == 2:
                    if last_prediction == 1 or last_prediction == 2:
                        c = find_term(testcorpus.corpus[sentence_index].text.lower(), word_index)
                        if len(predicted_term) > 0:
                            predicted_term = predicted_term + " " + c
                        else:
                            predicted_term = c
                    last_prediction = 2

                elif word_prediction == 0:
                    if last_prediction == 1 or last_prediction == 2:
                        start, end = find_offsets(testcorpus.corpus[sentence_index].text.lower(), predicted_term)
                        testcorpus.corpus[sentence_index].add_aspect_term(term=predicted_term, offsets={'from': str(start), 'to': str(end)})
                    last_prediction = 0
                            
    print('Done!')
    return testcorpus.corpus
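#A self-contained sketch (not part of the classify() example above) of the input
#layout that ChainCRF + FrankWolfeSSVM expect, as described in the comments before
#ssvm.fit(): X is a list with one (n_words, n_features) array per sentence and y is
#a list with one integer label array per sentence. Sizes and values are illustrative.
import numpy as np
from pystruct.models import ChainCRF
from pystruct.learners import FrankWolfeSSVM

toy_X = [np.random.rand(5, 12), np.random.rand(3, 12)]        #2 sentences, 12 features per word
toy_y = [np.array([1, 2, 0, 0, 0]), np.array([0, 1, 0])]      #per-word labels: 0=O, 1=B, 2=I
toy_ssvm = FrankWolfeSSVM(model=ChainCRF(), C=0.1, max_iter=10)
toy_ssvm.fit(toy_X, toy_y)
print(toy_ssvm.predict(toy_X))                                 #list of per-sentence label arrays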
rnd = np.random.RandomState(1)
selected = rnd.randint(len(y_test), size=n_words)
max_word_len = max([len(y_) for y_ in y_test[selected]])
fig, axes = plt.subplots(n_words, max_word_len, figsize=(10, 10))
fig.subplots_adjust(wspace=0)
fig.text(0.2, 0.05, 'GT', color="#00AA00", size=25)
fig.text(0.4, 0.05, 'SVM', color="#5555FF", size=25)
fig.text(0.6, 0.05, 'UD-LCCRF', color="#FF5555", size=25)
fig.text(0.8, 0.05, 'D-LCCRF', color="#FFD700", size=25)

fig.text(0.05, 0.5, 'Word', color="#000000", size=25)
fig.text(0.5, 0.95, 'Letters', color="#000000", size=25)

for ind, axes_row in zip(selected, axes):
    y_pred_svm = svm.predict(X_test[ind])
    y_pred_undirected = undirected_ssvm.predict([X_test[ind]])[0]
    y_pred_crf = ssvm.predict([X_test[ind]])[0]

    for i, (a, image, y_true, y_svm, y_undirected, y_crf) in enumerate(
            zip(axes_row, X_test[ind], y_test[ind], y_pred_svm, y_pred_undirected, y_pred_crf)):
        a.matshow(image.reshape(16, 8), cmap=plt.cm.Greys)
        a.text(0, 3, abc[y_true], color="#00AA00", size=25)    # Green
        a.text(0, 14, abc[y_svm], color="#5555FF", size=25)    # Blue
        a.text(5, 14, abc[y_undirected], color="#FF5555", size=25)  # Red
        a.text(5, 3, abc[y_crf], color="#FFD700", size=25)     # Yellow
        a.set_xticks(())
        a.set_yticks(())
    for ii in range(i + 1, max_word_len):
        axes_row[ii].set_visible(False)

w = ssvm.w[26 * 8 * 16:].reshape(26, 26)
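#In the OCR-letters setup assumed above (26 letter states, 16x8 = 128 pixel features
#per letter), the first 26 * 128 entries of ssvm.w are the unary weights, so the
#remaining 26 * 26 entries sliced into `w` are the pairwise (letter-to-letter
#transition) weights; one quick way to inspect them:
plt.matshow(w, cmap=plt.cm.Greys)
plt.title("Transition parameters of the chain CRF")
plt.colorbar()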
Example #26
0
rnd = np.random.RandomState(1)
selected = rnd.randint(len(y_test), size=n_words)
max_word_len = max([len(y_) for y_ in y_test[selected]])
fig, axes = plt.subplots(n_words, max_word_len, figsize=(10, 10))
fig.subplots_adjust(wspace=0)
fig.text(0.2, 0.05, 'GT', color="#00AA00", size=25)
fig.text(0.4, 0.05, 'SVM', color="#5555FF", size=25)
fig.text(0.6, 0.05, 'LCCRF', color="#FF5555", size=25)
fig.text(0.8, 0.05, 'CRF', color="#FFD700", size=25)

fig.text(0.05, 0.5, 'Word', color="#000000", size=25)
fig.text(0.5, 0.95, 'Letters', color="#000000", size=25)

for ind, axes_row in zip(selected, axes):
    y_pred_svm = svm.predict(X_test[ind])
    y_pred_chain = chain_ssvm.predict([X_test[ind]])[0]
    y_pred_crf = ssvm.predict([X_test[ind]])[0]
    for i, (a, image, y_true, y_svm, y_chain, y_crf) in enumerate(
            zip(axes_row, X_test[ind], y_test[ind], y_pred_svm, y_pred_chain, y_pred_crf)):
        a.matshow(image.reshape(16, 8), cmap=plt.cm.Greys)
        a.text(0, 3, abc[y_true], color="#00AA00", size=25)    # Green
        a.text(0, 14, abc[y_svm], color="#5555FF", size=25)    # Blue
        a.text(5, 14, abc[y_chain], color="#FF5555", size=25)  # Red
        a.text(5, 3, abc[y_crf], color="#FFD700", size=25)     # Yellow
        a.set_xticks(())
        a.set_yticks(())
    for ii in range(i + 1, max_word_len):
        axes_row[ii].set_visible(False)
Example #27
0
 
X = X[:100]
y = y[:100]

#Add edges
for i in range(X.shape[0]):
    X[i] = [X[i], np.vstack([(0, 1), (2, 2)])]

model = GraphCRF(directed=True, inference_method="max-product")

X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(X, y, test_size=0.5, random_state=0)
ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=10)
ssvm.fit(X_train, y_train)
print ssvm.score(X_test, y_test)
print ssvm.predict(X_test)
print y_test

'''
for i in range(X.shape[0]):

    X_train, X_test = X[]
    X_test = X[i]
    y_test = y[i]
    X_train = np.delete(X, i)
    y_train = np.delete(y, i)

    ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=10)
    ssvm.fit(X_train, y_train)
    print ssvm.model
'''
Example #28
0
start = time()
subgradient_svm.fit(X_train_bias, y_train)
time_subgradient_svm = time() - start
y_pred = np.hstack(subgradient_svm.predict(X_test_bias))

print("Score with pystruct subgradient ssvm: %f (took %f seconds)"
      % (np.mean(y_pred == y_test), time_subgradient_svm))

# the standard one-vs-rest multi-class would probably be as good and faster
# but solving a different model
libsvm = LinearSVC(multi_class='crammer_singer', C=.1)
start = time()
libsvm.fit(X_train, y_train)
time_libsvm = time() - start
print("Score with sklearn and libsvm: %f (took %f seconds)"
      % (libsvm.score(X_test, y_test), time_libsvm))


start = time()
fw_bc_svm.fit(X_train_bias, y_train)
y_pred = np.hstack(fw_bc_svm.predict(X_test_bias))
time_fw_bc_svm = time() - start
print("Score with pystruct frankwolfe block coordinate ssvm: %f (took %f seconds)" %
      (np.mean(y_pred == y_test), time_fw_bc_svm))

start = time()
fw_batch_svm.fit(X_train_bias, y_train)
y_pred = np.hstack(fw_batch_svm.predict(X_test_bias))
time_fw_batch_svm = time() - start
print("Score with pystruct frankwolfe batch ssvm: %f (took %f seconds)" %
      (np.mean(y_pred == y_test), time_fw_batch_svm))
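
#A hedged reconstruction of how the learners timed above may have been set up (their
#constructors are cut off in this snippet; the model, C and max_iter values below are
#illustrative, not taken from the original):
#
#    from pystruct.models import MultiClassClf
#    from pystruct.learners import SubgradientSSVM, FrankWolfeSSVM
#
#    model = MultiClassClf(n_features=X_train_bias.shape[1], n_classes=len(np.unique(y_train)))
#    subgradient_svm = SubgradientSSVM(model, C=.1, max_iter=50)
#    fw_bc_svm = FrankWolfeSSVM(model, C=.1, max_iter=50)                      #block-coordinate (default)
#    fw_batch_svm = FrankWolfeSSVM(model, C=.1, max_iter=50, batch_mode=True)  #batch Frank-Wolfe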
Example #29
0
y_test = preprocess_label(y_test)

### CS : best c =0.01
### Phy: best c= 0.005
### stat: best c = 0.005
'''
C= [0.005,0.01,0.02,0.05,0.1,0.2]
score = {}

for i in C:
	model = ChainCRF()
	ssvm = FrankWolfeSSVM(model=model, C=i, max_iter=100)
	ssvm.fit(x_train, y_train) 
	score[i] = ssvm.score(x_dev, y_dev)

print score
'''
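#if the grid search above is enabled, the best regularization value can be read off
#the score dict with e.g. best_C = max(score, key=score.get); the hard-coded C below
#reflects the values noted in the comments above.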
model = ChainCRF()
ssvm = FrankWolfeSSVM(model=model, C=0.005, max_iter=100)
ssvm.fit(x_train, y_train)
score = ssvm.score(x_test, y_test)
y_pred = ssvm.predict(x_test)

print 'Micro-averaged F1 score:', f1_score(get_one_list(y_test),
                                           get_one_list(y_pred),
                                           average='micro')

experiment_util.sequential_error_analysis(
    restore_label(y_test), restore_label(y_pred),
    './chaincrf_sequential_error_analysis')
def trainModel_Basic(num_iter=5,
                     inference="qpbo",
                     trainer="NSlack",
                     num_train=2,
                     num_test=1,
                     C=0.1,
                     edges="180x180_dist1_diag0",
                     inputs=[1, 1, 1, 1, 1, 1],
                     features="all",
                     directed=False,
                     savePred=False):

    padding = (30, 30, 30, 30)

    if directed == True:
        features += '+directed'

    resultsDir = os.getcwd() + '/CRFResults'
    nameLen = len(os.listdir(resultsDir))
    edgeFeature = edges
    filename = str(nameLen) + '_CRF_iter_' + str(
        num_iter
    ) + "_" + inference + "_" + trainer + "_" + features + "_" + str(
        num_train) + "_" + str(num_test) + "_" + edgeFeature

    print "Loading training slices"

    start = time.clock()
    train = extractSlices2(train_path, num_train, padding, inputs=inputs)
    end = time.clock()
    train_load_time = (end - start) / 60.0

    [trainLayers, trainTruth, sliceShape] = train
    print "Training slices loaded in %f" % (train_load_time)

    n_features = len(trainLayers[0][0, 0])
    print "Layer shape is : "
    print trainLayers[0].shape

    print "Training the model"
    edges = np.load("/home/bmi/CRF/edges/" + edges + ".npy")

    G = [edges for x in trainLayers]

    print trainLayers[0].shape

    trainLayers = np.array([
        x.reshape((sliceShape[0] * sliceShape[1], n_features))
        for x in trainLayers
    ])
    trainTruth = np.array([
        x.reshape((sliceShape[0] * sliceShape[1], )).astype(int)
        for x in trainTruth
    ])

    if inference == 'ogm':
        crf = GraphCRF(inference_method=('ogm', {
            'alg': 'fm'
        }),
                       directed=directed)
    else:
        crf = GraphCRF(inference_method=inference, directed=directed)

    if trainer == "Frank":
        svm = FrankWolfeSSVM(model=crf,
                             max_iter=num_iter,
                             C=C,
                             n_jobs=6,
                             verbose=1)
    elif trainer == "NSlack":
        svm = NSlackSSVM(model=crf,
                         max_iter=num_iter,
                         C=C,
                         n_jobs=-1,
                         verbose=1)
    else:
        svm = OneSlackSSVM(model=crf,
                           max_iter=num_iter,
                           C=C,
                           n_jobs=-1,
                           verbose=1)

    start = time.clock()
    asdf = zip(trainLayers, G)
    svm.fit(asdf, trainTruth)
    end = time.clock()
    train_time = (end - start) / 60.0
    print "The training took %f" % (train_time)
    print "Model parameter size :"
    print svm.w.shape

    print "making predictions on train data"
    predTrain = svm.predict(asdf)
    trainDice = []
    for i in range(len(trainLayers)):
        diceScore = accuracy(predTrain[i], trainTruth[i])
        trainDice.append(diceScore)
    meanTrainDice = sum(trainDice) / len(trainLayers)

    del trainLayers, trainTruth

    ################################################################################################
    overallDicePerPatient = []  # For overall test Dice
    extDicePerPatient = []
    PatientTruthLayers = []
    PatientPredLayers = []
    PREC = []
    RECALL = []
    F1 = []
    LayerwiseDiceTotal = []

    testResultFile = open(os.getcwd() + "/CRFResults/" + filename + ".csv",
                          'a')
    testResultFile.write(
        "folderName,numLayers, Overall Dice, precision , recall, F1" + "\n")

    counter = 0
    print "Loading the test slices"
    for folder in os.listdir(test_path):
        path = test_path + "/" + folder
        layerDiceScores = ''
        #        print path

        data = extractTestSlices2(path, padding, inputs=inputs)
        if data != 0:
            [testLayers, testTruth, sliceShape, startSlice, endSlice] = data

#        trueTestLayers=testLayers
        GTest = [edges for x in testLayers]
        testLayers = np.array([
            x.reshape((sliceShape[0] * sliceShape[1], n_features))
            for x in testLayers
        ])
        testTruth = np.array([
            x.reshape((sliceShape[0] * sliceShape[1], )).astype(int)
            for x in testTruth
        ])

        asdfTest = zip(testLayers, GTest)
        predTest = svm.predict(asdfTest)

        LayerwiseDice = []

        for i in range(len(testLayers)):
            diceScore = accuracy(predTest[i], testTruth[i])
            layerDiceScores += "," + str(diceScore)
            if math.isnan(diceScore):
                if sum(predTest[i]) == 0 and sum(testTruth[i]) == 0:
                    LayerwiseDice.append(1.0)
                continue
            LayerwiseDice.append(diceScore)

        LayerwiseDiceTotal.append(LayerwiseDice)

        overallTestDice = accuracy(np.hstack(predTest), np.hstack(testTruth))
        extDice = np.mean(
            np.array(LayerwiseDice)
            [range(10) + range(len(LayerwiseDice) - 10, len(LayerwiseDice))])
        prec, recall, f1 = precision_score(np.hstack(testTruth),
                                           np.hstack(predTest)), recall_score(
                                               np.hstack(testTruth),
                                               np.hstack(predTest)), f1_score(
                                                   np.hstack(testTruth),
                                                   np.hstack(predTest))
        print "Patient %d : Overall test DICE for %s is : %f and extDice is %f" % (
            counter, folder, overallTestDice, extDice)
        print "Precision : %f  Recall : %f  F1 : %f " % (prec, recall, f1)
        print "__________________________________________"

        #        testResultFile.write(folder+","+str(len(testLayers))+","+str(meanTestDice)+","+str(overallTestDice) ","+str(np.max(testDice)) +","+ str(np.min(testDice))+"\n" )
        testResultFile.write(folder + "," + str(len(testLayers)) + "," +
                             str(overallTestDice) + "," + str(prec) + "," +
                             str(recall) + "," + str(extDice) +
                             layerDiceScores + "\n")
        overallDicePerPatient.append(overallTestDice)
        extDicePerPatient.append(extDice)
        PREC.append(prec), RECALL.append(recall), F1.append(f1)

        PatientTruthLayers.append(testTruth)
        PatientPredLayers.append(predTest)

        counter += 1
        if counter == num_test and num_test != -1:
            break
######################################################################################################
    print "Done testing slices"
    overallDice = sum(overallDicePerPatient) / len(PatientTruthLayers)
    overallPrec = sum(PREC) / len(PatientTruthLayers)
    overallRecall = sum(RECALL) / len(PatientTruthLayers)
    overallExtDice = np.mean(extDicePerPatient)
    print "Overall DICE : %f Precision : %f Recall : %f extDice : %f  " % (
        overallDice, overallPrec, overallRecall, overallExtDice)
    print "############################################"

    #    testOutput=np.array([PatientPredLayers,PatientTruthLayers,trueTestLayers])
    testOutput = np.array([PatientPredLayers, PatientTruthLayers])

    ########### Saving the models ######################################################################

    #    print "Saving the model"
    #    modelDir = os.getcwd()+"/CRFModel/"
    #    svmModel = open(modelDir+filename+"_model"+".pkl",'wb')
    #    cPickle.dump(svm,svmModel,protocol=cPickle.HIGHEST_PROTOCOL)
    #    svmModel.close()
    #
    #    print "saving the predictions"
    #    predFileTest = open(os.getcwd()+"/CRFPred/"+filename+"_pred.pkl",'wb')
    #    cPickle.dump(testOutput,predFileTest,protocol=cPickle.HIGHEST_PROTOCOL)
    #    predFileTest.close()

    layerDataLog = open(os.getcwd() + "/CRFModel/" + filename + "_layer.pkl",
                        'wb')
    cPickle.dump(LayerwiseDiceTotal,
                 layerDataLog,
                 protocol=cPickle.HIGHEST_PROTOCOL)
    layerDataLog.close()

    resultLog = os.getcwd() + "/CRFResults/TestResultFinal.csv"
    resultFile = open(resultLog, 'a')
    resultFile.write(time.ctime() + "," + str(num_iter) + "," +
                     str(num_train) + "," + str(num_test) + "," + inference +
                     "," + trainer + "," + str(C) + "," + str(train_time) +
                     "," + str(meanTrainDice) + "," + str(overallDice) + "," +
                     str(np.std(overallDicePerPatient)) + "," + edgeFeature +
                     "," + "None" + "," + features + "," + filename + "," +
                     str(overallPrec) + "," + str(overallRecall) + "," +
                     str(overallExtDice) + "," +
                     "Flair(5)+T2(9)-Without last 4 train Layers" + "\n")

    resultFile.close()
    testResultFile.close()

    return
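
#A self-contained sketch (separate from trainModel_Basic above) of the
#(node_features, edges) pairs that GraphCRF expects: each sample is a tuple of a
#(n_nodes, n_features) array and an (n_edges, 2) integer array of node index pairs.
#Sizes and values are illustrative.
import numpy as np
from pystruct.models import GraphCRF
from pystruct.learners import FrankWolfeSSVM

nodes = np.random.rand(4, 3)                    #4 nodes with 3 features each
edges = np.array([[0, 1], [1, 2], [2, 3]])      #a chain over the 4 nodes
labels = np.array([0, 0, 1, 1])                 #one integer label per node
toy_crf = GraphCRF(inference_method="max-product")
toy_ssvm = FrankWolfeSSVM(model=toy_crf, C=0.1, max_iter=10)
toy_ssvm.fit([(nodes, edges)], [labels])
print(toy_ssvm.predict([(nodes, edges)]))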
Example #31
0
def run_crf(w2v, words_before, words_after, shallow_parse):

    pmids_dict, pmids, abstracts, lbls, vectorizer, groups_map, one_hot, dicts = \
        parse_summerscales.get_tokens_and_lbls(
                make_pmids_dict=True, sen=True)


    """
        Create model
    """
    model = ChainCRF(directed=False)
    ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=30)

    all_pmids = pmids_dict.keys()
    n = len(all_pmids)
    n_folds = 5
    kf = KFold(n, random_state=1337, shuffle=True, n_folds=n_folds)
    fold_gi = []


    for fold_idx, (train, test) in enumerate(kf):
        print("on fold %s" % fold_idx)
        train_pmids = [all_pmids[pmid_idx] for pmid_idx in train]
        test_pmids  = [all_pmids[pmid_idx] for pmid_idx in test]
        print('loading data...')
        train_x, train_y = abstract2features(pmids_dict, words_before, w2v, shallow_parse)
        test_x, test_y = abstract2features(pmids_dict, words_after, w2v, shallow_parse)

        print('loaded data...')
        print 'training...'
        ssvm.fit(train_x, train_y)

        print ssvm.score(test_x, test_y)

        for i, (pmid, x, y) in enumerate(zip(test_pmids, test_x, test_y)):
            abstract_words, _, _= pmids_dict[pmid]

            print(pmid)

            # predict() takes in a list returns another list
            prediction = ssvm.predict([x]).pop(0)

            predicted = ''
            output = ''

            if len(prediction) > 0:

                #p is the predicted label at this position, so index the
                #abstract words by position rather than by label value
                for word_idx, p in enumerate(prediction):
                    if p == 1:
                        print "word: {}".format(abstract_words[word_idx])
                        if predicted == '':
                            predicted += abstract_words[word_idx]
                        else:
                            predicted += ' ' + abstract_words[word_idx]

                if not predicted == '':
                    output = 'predicted: {}'.format(predicted)
                else:
                    output = 'Predicted nothing!'
            else:
                output = 'Predicted nothing!'
            print output
Example #32
0
    print _js_function
    # processed_function = process_function(_js_function)
    line_f = _js_function.replace('\n', " ")  #flatten the function body onto one line
    raw_tokens = tokenizer.init_processing_function(line_f)
    tr_sets = crfpredictor.generate_type1_prediction(raw_tokens)
    r_assert = ssvm.predict(tr_sets[0][0:1])
    unit_test = assert_pre.unit_test_assembler(r_assert, raw_tokens, 2)

    response = Response(str(unit_test))
    response.headers["content-type"] = "text/plain"
    return response


if __name__ == '__main__':

    result = tokenizer.read_process_file()
    train_sets = crfpredictor.generate_type1_prediction(result)
    ssvm.fit(train_sets[0], train_sets[1])
    result_assert = ssvm.predict(train_sets[0][0:1])
    test = assert_pre.unit_test_assembler(result_assert, result, 2)
    for f in test:
        print f

    print result_assert

    app.run()


def process_function(_js_function):
    pass
Example #33
0
def classify(traincorpus, testcorpus):

    model = ChainCRF()
    ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=10)

    pos_lexicon = load_lexicon("lexica/restaurants/ote/pos")
    term_lexicon = load_lexicon("lexica/restaurants/ote/term")
    pre1_lexicon = load_lexicon("lexica/restaurants/ote/prefix1")
    pre2_lexicon = load_lexicon("lexica/restaurants/ote/prefix2")
    pre3_lexicon = load_lexicon("lexica/restaurants/ote/prefix3")
    suf1_lexicon = load_lexicon("lexica/restaurants/ote/suffix1")
    suf2_lexicon = load_lexicon("lexica/restaurants/ote/suffix2")
    suf3_lexicon = load_lexicon("lexica/restaurants/ote/suffix3")

    train_sentences = [
    ]  #the list to be used to store our features for the words
    sentence_labels = [
    ]  #the list to be used for labeling if a word is an aspect term

    print('Creating train feature vectors...')

    #extracting sentences and appending them labels
    for instance in traincorpus.corpus:
        words = nltk.word_tokenize(instance.text)

        tags = nltk.pos_tag(words)
        tags_list = []  #the pos list
        for _, t in tags:
            tags_list.append(t)

        last_prediction = ""

        train_words = []
        word_labels = []
        for i, w in enumerate(words):
            word_found = False
            if words[i] == w:
                word_found = True

                pos_feats = []
                previous_pos_feats = []
                second_previous_pos_feats = []
                next_pos_feats = []
                second_next_pos_feats = []
                morph_feats = []
                term_feats = []
                pre1_feats = []
                pre2_feats = []
                pre3_feats = []
                suf1_feats = []
                suf2_feats = []
                suf3_feats = []

                target_labels = []
                train_word_features = []

                #prefix of lengths 1,2,3 lexicon features
                for p1 in pre1_lexicon:
                    if p1 == w[0]:
                        pre1_feats.append(1)
                    else:
                        pre1_feats.append(0)

                for p2 in pre2_lexicon:
                    if len(w) > 1:
                        if p2 == w[0] + w[1]:
                            pre2_feats.append(1)
                        else:
                            pre2_feats.append(0)
                    else:
                        pre2_feats.append(0)

                for p3 in pre3_lexicon:
                    if len(w) > 2:
                        if p3 == w[0] + w[1] + w[2]:
                            pre3_feats.append(1)
                        else:
                            pre3_feats.append(0)
                    else:
                        pre3_feats.append(0)

                #suffix of lengths 1,2,3 lexicon features
                for s1 in suf1_lexicon:
                    if s1 == w[-1]:
                        suf1_feats.append(1)
                    else:
                        suf1_feats.append(0)

                for s2 in suf2_lexicon:
                    if len(w) > 1:
                        if s2 == w[-2] + w[-1]:
                            suf2_feats.append(1)
                        else:
                            suf2_feats.append(0)
                    else:
                        suf2_feats.append(0)

                for s3 in suf3_lexicon:
                    if len(w) > 2:
                        if s3 == w[-3] + w[-2] + w[-1]:
                            suf3_feats.append(1)
                        else:
                            suf3_feats.append(0)
                    else:
                        suf3_feats.append(0)

                #frequent term lexicon features
                for t in term_lexicon:
                    if t == w.lower():
                        term_feats.append(1)
                    else:
                        term_feats.append(0)

                #morphological features
                if w[0].isupper():  #is first letter capital
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                capitals = 0
                lowers = 0
                for letter in w:
                    if letter.isupper():
                        capitals = capitals + 1
                    if letter.islower():
                        lowers = lowers + 1

                if w[0].islower(
                ) and capitals > 0:  #contains capitals, except 1st letter
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                if capitals == len(w):  #is all letters capitals
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                if lowers == len(w):  #is all letters lower
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                if len(re.findall(r"\d", w)) == len(w):  #is all letters digits
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                if len(re.findall(r"[a-zA-Z]",
                                  w)) == len(w):  #is all letters words
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                if len(re.findall(r"[.]", w)) > 0:  #is there a '.'
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                if len(re.findall(r"[-]", w)) > 0:  #is there a '-'
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                if len(re.findall(
                        r'''[][,;"'?():_`]''',
                        w)) > 0:  #is there a punctuation mark, except '.', '-'
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                for p in pos_lexicon:
                    #check the POS tag of the current word
                    if tags_list[i] == p:
                        pos_feats.append(1)
                    else:
                        pos_feats.append(0)

                    #check the POS tag of the previous word (if the index is IN list's bounds)
                    if (i - 1) >= 0:
                        if tags_list[i - 1] == p:
                            previous_pos_feats.append(1)
                        else:
                            previous_pos_feats.append(0)
                    else:
                        previous_pos_feats.append(0)

                    #check the POS tag of the 2nd previous word (if the index is IN list's bounds)
                    if (i - 2) >= 0:
                        if tags_list[i - 2] == p:
                            second_previous_pos_feats.append(1)
                        else:
                            second_previous_pos_feats.append(0)
                    else:
                        second_previous_pos_feats.append(0)

                    #check the POS tag of the next word (if the index is IN list's bounds)
                    if (i + 1) < len(words):
                        if tags_list[i + 1] == p:
                            next_pos_feats.append(1)
                        else:
                            next_pos_feats.append(0)
                    else:
                        next_pos_feats.append(0)

                    #check the POS tag of the 2nd next word (if the index is IN list's bounds)
                    if (i + 2) < len(words):
                        if tags_list[i + 2] == p:
                            second_next_pos_feats.append(1)
                        else:
                            second_next_pos_feats.append(0)
                    else:
                        second_next_pos_feats.append(0)

                #label the word, using the IOB scheme:
                #B: start of aspect term, I: continuation of aspect term, O: not an aspect term
                term_found = False
                for aspect_term in set(instance.get_aspect_terms()):
                    term_words = aspect_term.split()
                    for term_index, term in enumerate(term_words):
                        if (w.lower() == term) and (term_found is False):
                            if term_index == 0:
                                target_labels = [1]  #1 is "B"
                                last_prediction = "1"
                                term_found = True
                            else:
                                if (last_prediction == "1") or (last_prediction
                                                                == "2"):
                                    target_labels = [2]  #2 is "I"
                                    last_prediction = "2"
                                    term_found = True
                                else:
                                    target_labels = [0]
                                    last_prediction = "0"

                if term_found is False:
                    target_labels = [0]  #0 is "O"
                    last_prediction = "0"

                train_word_features = [
                    pos_feats + previous_pos_feats +
                    second_previous_pos_feats + next_pos_feats +
                    second_next_pos_feats + morph_feats + term_feats +
                    pre1_feats + pre2_feats + pre3_feats + suf1_feats +
                    suf2_feats + suf3_feats
                ]
            if word_found is True:
                train_words.append(train_word_features)
                word_labels.append(target_labels)

        train_sentences_array = np.zeros(
            (len(train_words), len(train_words[0][0])))
        index_i = 0
        for word in train_words:
            index_j = 0
            for features in word:
                for f in features:
                    train_sentences_array[index_i, index_j] = f
                    index_j = index_j + 1
            index_i = index_i + 1
        train_sentences.append(train_sentences_array)

        sentence_labels_array = np.zeros((len(word_labels)))
        index_i = 0
        for label in word_labels:
            sentence_labels_array[index_i] = label[0]
            index_i = index_i + 1
        sentence_labels.append(sentence_labels_array.astype(np.int64))

    #the chain-crf needs a list (representing the sentences), that
    #contains a 2d-array(n_words, n_features), which in turn contains the
    #features extracted from each word. the sentence labels must be
    #an array of type int
    ssvm.fit(train_sentences, sentence_labels)

    print('Done!')
    print('Creating test feature vectors...')

    test_sentences = []
    for instance in testcorpus.corpus:
        words = nltk.word_tokenize(instance.text)

        tags = nltk.pos_tag(words)
        tags_list = []  #the pos list
        for _, t in tags:
            tags_list.append(t)

        test_words = []
        for i, w in enumerate(words):
            word_found = False
            if words[i] == w:
                word_found = True

                pos_feats = []
                previous_pos_feats = []
                second_previous_pos_feats = []
                next_pos_feats = []
                second_next_pos_feats = []
                morph_feats = []
                term_feats = []
                pre1_feats = []
                pre2_feats = []
                pre3_feats = []
                suf1_feats = []
                suf2_feats = []
                suf3_feats = []

                test_word_features = []

                #prefix 1,2,3 lexicon features
                for p1 in pre1_lexicon:
                    if p1 == w[0]:
                        pre1_feats.append(1)
                    else:
                        pre1_feats.append(0)

                for p2 in pre2_lexicon:
                    if len(w) > 1:
                        if p2 == w[0] + w[1]:
                            pre2_feats.append(1)
                        else:
                            pre2_feats.append(0)
                    else:
                        pre2_feats.append(0)

                for p3 in pre3_lexicon:
                    if len(w) > 2:
                        if p3 == w[0] + w[1] + w[2]:
                            pre3_feats.append(1)
                        else:
                            pre3_feats.append(0)
                    else:
                        pre3_feats.append(0)

                #suffix 1,2,3 lexicon features
                for s1 in suf1_lexicon:
                    if s1 == w[-1]:
                        suf1_feats.append(1)
                    else:
                        suf1_feats.append(0)

                for s2 in suf2_lexicon:
                    if len(w) > 1:
                        if s2 == w[-2] + w[-1]:
                            suf2_feats.append(1)
                        else:
                            suf2_feats.append(0)
                    else:
                        suf2_feats.append(0)

                for s3 in suf3_lexicon:
                    if len(w) > 2:
                        if s3 == w[-3] + w[-2] + w[-1]:
                            suf3_feats.append(1)
                        else:
                            suf3_feats.append(0)
                    else:
                        suf3_feats.append(0)

                #term lexicon features
                for t in term_lexicon:
                    if t == w.lower():
                        term_feats.append(1)
                    else:
                        term_feats.append(0)

                #morphological features
                if w[0].isupper():  #is first letter capital
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                capitals = 0
                lowers = 0
                for letter in w:
                    if letter.isupper():
                        capitals = capitals + 1
                    if letter.islower():
                        lowers = lowers + 1

                if w[0].islower(
                ) and capitals > 0:  #contains capitals, except 1st letter
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                if capitals == len(w):  #is all letters capitals
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                if lowers == len(w):  #is all letters lower
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                if len(re.findall(r"\d", w)) == len(w):  #is all letters digits
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                if len(re.findall(r"[a-zA-Z]",
                                  w)) == len(w):  #is all letters words
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                if len(re.findall(r"[.]", w)) > 0:  #is there a '.'
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                if len(re.findall(r"[-]", w)) > 0:  #is there a '-'
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                if len(re.findall(
                        r'''[][,;"'?():_`]''',
                        w)) > 0:  #is there a punctuation mark, except '.', '-'
                    morph_feats.append(1)
                else:
                    morph_feats.append(0)

                for p in pos_lexicon:
                    #check the POS tag of the current word
                    if tags_list[i] == p:
                        pos_feats.append(1)
                    else:
                        pos_feats.append(0)

                    #check the POS tag of the previous word (if the index is IN list's bounds)
                    if (i - 1) >= 0:
                        if tags_list[i - 1] == p:
                            previous_pos_feats.append(1)
                        else:
                            previous_pos_feats.append(0)
                    else:
                        previous_pos_feats.append(0)

                    #check the POS tag of the 2nd previous word (if the index is IN list's bounds)
                    if (i - 2) >= 0:
                        if tags_list[i - 2] == p:
                            second_previous_pos_feats.append(1)
                        else:
                            second_previous_pos_feats.append(0)
                    else:
                        second_previous_pos_feats.append(0)

                    #check the POS tag of the next word (if the index is IN list's bounds)
                    if (i + 1) < len(words):
                        if tags_list[i + 1] == p:
                            next_pos_feats.append(1)
                        else:
                            next_pos_feats.append(0)
                    else:
                        next_pos_feats.append(0)

                    #check the POS tag of the 2nd next word (if the index is IN list's bounds)
                    if (i + 2) < len(words):
                        if tags_list[i + 2] == p:
                            second_next_pos_feats.append(1)
                        else:
                            second_next_pos_feats.append(0)
                    else:
                        second_next_pos_feats.append(0)

                test_word_features = [
                    pos_feats + previous_pos_feats +
                    second_previous_pos_feats + next_pos_feats +
                    second_next_pos_feats + morph_feats + term_feats +
                    pre1_feats + pre2_feats + pre3_feats + suf1_feats +
                    suf2_feats + suf3_feats
                ]
            if word_found is True:
                test_words.append(test_word_features)

        test_sentences_array = np.zeros(
            (len(test_words), len(test_words[0][0])))
        index_i = 0
        for word in test_words:
            index_j = 0
            for features in word:
                for f in features:
                    test_sentences_array[index_i, index_j] = f
                    index_j = index_j + 1
            index_i = index_i + 1
        test_sentences.append(test_sentences_array)

    print('Done!')
    print('Predicting aspect terms...')

    predictions = ssvm.predict(test_sentences)
    #the predict function returns a list (symbolizing the sentences),
    #which contains a list that contains the predicted label for each word
    for sentence_index, sentence_predictions in enumerate(predictions):
        testcorpus.corpus[sentence_index].aspect_terms = []

        predicted_term = ""
        last_prediction = ""
        for word_index, word_prediction in enumerate(sentence_predictions):
            if word_prediction == 1:
                if last_prediction == 1 or last_prediction == 2:
                    start, end = find_offsets(
                        testcorpus.corpus[sentence_index].text.lower(),
                        predicted_term)
                    testcorpus.corpus[sentence_index].add_aspect_term(
                        term=predicted_term,
                        offsets={
                            'from': str(start),
                            'to': str(end)
                        })

                c = find_term(testcorpus.corpus[sentence_index].text.lower(),
                              word_index)
                predicted_term = c
                last_prediction = 1

            elif word_prediction == 2:
                if last_prediction == 1 or last_prediction == 2:
                    c = find_term(
                        testcorpus.corpus[sentence_index].text.lower(),
                        word_index)
                    if len(predicted_term) > 0:
                        predicted_term = predicted_term + " " + c
                    else:
                        predicted_term = c
                last_prediction = 2

            elif word_prediction == 0:
                if last_prediction == 1 or last_prediction == 2:
                    start, end = find_offsets(
                        testcorpus.corpus[sentence_index].text.lower(),
                        predicted_term)
                    testcorpus.corpus[sentence_index].add_aspect_term(
                        term=predicted_term,
                        offsets={
                            'from': str(start),
                            'to': str(end)
                        })
                last_prediction = 0

    print('Done!')
    return testcorpus.corpus
list_x.append(np.array(x_1))
list_y.append(y)
list_y.append(y_1)

# crf = ChainCRF(inference_method='max-product')
crf = ChainCRF(inference_method="max-product", directed=False)
ssvm = FrankWolfeSSVM(model=crf, C=1.0, max_iter=100)
ssvm.fit(np.array(list_x), np.array(list_y))

test_x = np.array(list_x)
test_y = np.array(list_y)
# print np.array(list_x)[0].shape[1]

x_test = [[1, 0, 0, 0], [1, 0, 1, 0]]
list_x_test = list()
list_x_test.append(x_test)

pred = ssvm.predict(np.array(list_x_test))
# for value in pred:
#     print value


# file_model = pickle.dumps(ssvm)
# load_model = pickle.loads(file_model)
joblib.dump(ssvm, "d:/filename.pkl")
load_model = joblib.load("d:/filename.pkl")
output = load_model.predict(np.array(list_x_test))

for value in output:
    print value
Example #35
0
def main():

    tweets_data_train = []
    with open('Train\dataset_train.pkl', 'rb') as r:
        tweets_set = pickle.load(r)

    for i in range(0, len(tweets_set)):
        for j in range(0, len(tweets_set[i])):
            t = tweets_set[i][j][1].encode('ascii', 'ignore')
            tweets_data_train.append(t)

    features_train_transformed = get_extra_features(tweets_data_train)
    print(features_train_transformed.shape)
    features_train_transformed.dump("Train\extra_features_train.pkl")

    extra_features_train = numpy.load("Train\extra_features_train.pkl")
    print "EXTRA FEATURES FOR TRAIN DATA IS SUCCESSFULLY EXTRACTED"

    tweets_data_test = []
    with open('Test\dataset_test.pkl', 'rb') as r:
        tweets_set = pickle.load(r)
    for i in range(0, len(tweets_set)):
        for j in range(0, len(tweets_set[i])):
            t = tweets_set[i][j][1].encode('ascii', 'ignore')
            tweets_data_test.append(t)

    features_test_transformed = get_extra_features(tweets_data_test)
    features_test_transformed.dump("Test\extra_features_test.pkl")

    extra_features_test = numpy.load("Test\extra_features_test.pkl")
    print "EXTRA FEATURES FOR TEST DATA IS SUCCESSFULLY EXTRACTED"

    #TFIDF VECTORIZER
    features_train_tfidf, features_test_tfidf = get_main_features(
        tweets_data_train, tweets_data_test)

    with open('Train\edges_train.pkl', 'rb') as e:
        edges_train = pickle.load(e)
    with open('Train\labels_train.pkl', 'rb') as l:
        labels_tr = pickle.load(l)
    with open('Test\edges_test.pkl', 'rb') as e:
        edges_test = pickle.load(e)
    with open('Test\labels_test.pkl', 'rb') as l:
        labels_te = pickle.load(l)

    #edges=numpy.array(edges)
    labels_tr = numpy.array(labels_tr)
    labels_te = numpy.array(labels_te)
    #labels_1D=numpy.zeros(1)

    labels_train = array_to_list(labels_tr)
    labels_test = array_to_list(labels_te)
    labels_test = numpy.array(labels_test)

    #labels_1D=numpy.delete(labels_1D,(0),0)
    """

	selector=SelectPercentile(f_classif,percentile=70)
	selector.fit(features_train_tfidf,labels_1D)
	features_train_transformed=selector.transform(features_train_tfidf).toarray()
	features_test_transformed=selector.transform(features_test_tfidf).toarray()
	print "Features Selection is done successfully """

    print features_test_tfidf.shape, extra_features_test.shape

    features_train_transformed = numpy.concatenate(
        (features_train_tfidf, extra_features_train), axis=1)
    features_test_transformed = numpy.concatenate(
        (features_test_tfidf, extra_features_test), axis=1)
    print "TFIDF FEATURES ARE SUCCESSFULLY CREATED"

    features_train = get_features_and_edges(features_train_transformed,
                                            edges_train)
    features_test = get_features_and_edges(features_test_transformed,
                                           edges_test)

    labels_train = numpy.array(labels_train)
    print labels_train.shape
    model_name = "GraphCRF_model"
    model = GraphCRF(directed=True)
    ssvm = FrankWolfeSSVM(model=model,
                          C=1.0,
                          max_iter=100,
                          logger=SaveLogger(model_name + ".pickle",
                                            save_every=100))
    start_time = time.time()
    final_model = ssvm.fit(features_train, labels_train)
    print("--- Time taken to train the classifier is %s seconds " %
          (time.time() - start_time))
    print "YAAY ! A GRAPH CRF MODEL IS SUCCESSFULLY CREATED AND TRAINED"

    print("Charliehedbo event is the Test Data")
    pickle.dump(final_model, open('Saved_Model/sdqc_final_model.pkl', 'wb'))
    ssvm = pickle.load(open('Saved_Model/sdqc_final_model.pkl', 'rb'))
    #ssvm = SaveLogger(model_name+".pickle").load()
    X_test = []
    y_test = []
    for i in range(0, len(features_test)):
        if features_test[i][0].shape[0] >= 3:
            X_test.append(features_test[i])
            y_test.append(labels_test[i])
    #print X_test

    #print ("Accuracy score with Graph CRF : %f" % ssvm.score(X_test,y_test))

    predictions = ssvm.predict(X_test)
    #PREDICTIONS AND y_TEST ARE LIST OF ARRAYS
    true = numpy.zeros(1)
    prediction = numpy.zeros(1)
    for i in range(0, len(predictions)):
        true = numpy.hstack((true, y_test[i]))
        prediction = numpy.hstack((prediction, predictions[i]))

    true = numpy.delete(true, (0), axis=0)
    prediction = numpy.delete(prediction, (0), axis=0)
    print "TOTAL", true.shape[0]
    print accuracy_score(true, prediction)
    with open('SDQC_Result.pkl', 'wb') as w:
        pickle.dump(prediction, w)
    print(
        classification_report(
            true,
            prediction,
            target_names=["support", "deny", "query", "comment"]))
    print confusion_matrix(true, prediction, labels=[0, 1, 2, 3])
    plot_cmat(true, prediction)
ssvm.fit(X_train, y_train)

print("Test score with chain CRF: %f" % ssvm.score(X_test, y_test))

print("Test score with linear SVM: %f" % svm.score(np.vstack(X_test), np.hstack(y_test)))

# plot some word sequenced
n_words = 4
rnd = np.random.RandomState(1)
selected = rnd.randint(len(y_test), size=n_words)
max_word_len = max([len(y_) for y_ in y_test[selected]])
fig, axes = plt.subplots(n_words, max_word_len, figsize=(10, 10))
fig.subplots_adjust(wspace=0)
for ind, axes_row in zip(selected, axes):
    y_pred_svm = svm.predict(X_test[ind])
    y_pred_chain = ssvm.predict([X_test[ind]])[0]
    for i, (a, image, y_true, y_svm, y_chain) in enumerate(
        zip(axes_row, X_test[ind], y_test[ind], y_pred_svm, y_pred_chain)
    ):
        a.matshow(image.reshape(16, 8), cmap=plt.cm.Greys)
        a.text(0, 3, abc[y_true], color="#00AA00", size=25)
        a.text(0, 14, abc[y_svm], color="#5555FF", size=25)
        a.text(5, 14, abc[y_chain], color="#FF5555", size=25)
        a.set_xticks(())
        a.set_yticks(())
    for ii in range(i + 1, max_word_len):
        axes_row[ii].set_visible(False)

plt.matshow(ssvm.w[26 * 8 * 16 :].reshape(26, 26))
plt.colorbar()
plt.title("Transition parameters of the chain CRF.")
Example #37
0
def main(train_SF, test_SF, model):
    #read files
    #print("Reading training and testing files")
    #train_SF = pd.read_csv('data/restaurants/train.csv',sep = '\t')
    #test_SF = pd.read_csv('data/restaurants/test.csv',sep = '\t')
    #vectors_filename = "vectors_yelp_200.txt"
    #filename = 'otemodel.sav' #for OTE model

    #for laptops
    #'data/laptops/laptop_train_ote.csv'
    #'data/laptops/laptop_test_ote.csv'
    #"gloveVec200.txt"

    #train_SF = pd.read_csv(trainF,sep = '\t')
    #test_SF = pd.read_csv(testF,sep = '\t')
    #vectors_filename =  vecF
    filename = 'otemodel.sav'  #for OTE model

    pos_lexicon = load_lexicon("lexica/pos")

    #Load word2vec text files
    #print "Loading word2vec file"
    #model = gensim.models.Word2Vec.load_word2vec_format(vectors_filename,binary=False)
    ndim = model.vector_size
    index2word_set = set(model.index2word)

    #Cleaning text
    print "Cleaning text"
    train_SF['cleanText'] = train_SF['text'].apply(review_to_words)
    test_SF['cleanText'] = test_SF['text'].apply(review_to_words)
    test_SF = test_SF[test_SF['cleanText'] != ''].reset_index(drop=True)
    train_SF = train_SF[train_SF['cleanText'] != ''].reset_index(drop=True)
    #Extracting vector features
    print "Extracting vector features"

    train_vec = []
    test_vec = []
    for i in range(0, len(train_SF)):
        train_vec.append(
            create_vector_features(train_SF['cleanText'][i], model))

    for i in range(0, len(test_SF)):
        test_vec.append(create_vector_features(test_SF['cleanText'][i], model))
    train_SF['vector_feats'] = train_vec
    test_SF['vector_feats'] = test_vec

    #Extracting morphological features
    print "Extracting morphological features"

    train_SF['morph_feats'] = train_SF['cleanText'].apply(create_morph_feats)
    test_SF['morph_feats'] = test_SF['cleanText'].apply(create_morph_feats)

    #Extracting POS features
    print "Extracting POS features"
    train_pos = []
    test_pos = []
    for index, row in train_SF.iterrows():
        if (index % 1000 == 0):
            print "Train data - POS features extraction Progress :%d sentences done" % index
        train_pos.append(create_pos_feats(row['cleanText'], pos_lexicon))

    for index, row in test_SF.iterrows():
        if (index % 1000 == 0):
            print "Test data - POS features extraction Progress :%d sentences done" % index

        test_pos.append(create_pos_feats(row['cleanText'], pos_lexicon))

    train_SF['pos_feats'] = train_pos
    test_SF['pos_feats'] = test_pos

    #Extracting previous,next Vector Features
    print "Extracting previous,next Vector features"

    previous_vector_feats_array = []
    next_vector_feats_array = []
    second_next_vector_feats_array = []
    second_previous_vector_feats_array = []
    for i in range(0, len(train_SF)):
        previous_vector_feats, next_vector_feats, second_next_vector_feats, second_previous_vector_feats = create_next_prev_vector_features(
            train_SF['cleanText'][i], model)
        previous_vector_feats_array.append(previous_vector_feats)
        next_vector_feats_array.append(next_vector_feats)
        second_next_vector_feats_array.append(second_next_vector_feats)
        second_previous_vector_feats_array.append(second_previous_vector_feats)

    train_SF['previous_vector_feats'] = previous_vector_feats_array
    train_SF['next_vector_feats'] = next_vector_feats_array
    train_SF[
        'second_previous_vector_feats'] = second_previous_vector_feats_array
    train_SF['second_next_vector_feats'] = second_next_vector_feats_array

    #create next prev vector features
    previous_vector_feats_array = []
    next_vector_feats_array = []
    second_next_vector_feats_array = []
    second_previous_vector_feats_array = []
    for i in range(0, len(test_SF)):
        previous_vector_feats, next_vector_feats, second_next_vector_feats, second_previous_vector_feats = create_next_prev_vector_features(
            test_SF['cleanText'][i], model)
        previous_vector_feats_array.append(previous_vector_feats)
        next_vector_feats_array.append(next_vector_feats)
        second_next_vector_feats_array.append(second_next_vector_feats)
        second_previous_vector_feats_array.append(second_previous_vector_feats)

    test_SF['previous_vector_feats'] = previous_vector_feats_array
    test_SF['next_vector_feats'] = next_vector_feats_array
    test_SF[
        'second_previous_vector_feats'] = second_previous_vector_feats_array
    test_SF['second_next_vector_feats'] = second_next_vector_feats_array

    #Extracting previous,next POS features
    print "Extracting previous,next POS features"

    pos_sent_prev_feats_array = []
    pos_sent_next_feats_array = []
    pos_sent_second_prev_feats_array = []
    pos_sent_second_next_feats_array = []

    for i in range(0, len(train_SF)):
        pos_sent_prev_feats, pos_sent_next_feats, pos_sent_second_prev_feats, pos_sent_second_next_feats = create_prev_pos_feats(
            train_SF['cleanText'][i], pos_lexicon)
        pos_sent_prev_feats_array.append(pos_sent_prev_feats)
        pos_sent_next_feats_array.append(pos_sent_next_feats)
        pos_sent_second_next_feats_array.append(pos_sent_second_next_feats)
        pos_sent_second_prev_feats_array.append(pos_sent_second_prev_feats)

    train_SF['pos_sent_prev_feats'] = pos_sent_prev_feats_array
    train_SF['pos_sent_next_feats'] = pos_sent_next_feats_array
    train_SF['pos_sent_second_prev_feats'] = pos_sent_second_prev_feats_array
    train_SF['pos_sent_second_next_feats'] = pos_sent_second_next_feats_array

    #for test file
    pos_sent_prev_feats_array = []
    pos_sent_next_feats_array = []
    pos_sent_second_prev_feats_array = []
    pos_sent_second_next_feats_array = []

    for i in range(0, len(test_SF)):
        pos_sent_prev_feats, pos_sent_next_feats, pos_sent_second_prev_feats, pos_sent_second_next_feats = create_prev_pos_feats(
            test_SF['cleanText'][i], pos_lexicon)
        pos_sent_prev_feats_array.append(pos_sent_prev_feats)
        pos_sent_next_feats_array.append(pos_sent_next_feats)
        pos_sent_second_next_feats_array.append(pos_sent_second_next_feats)
        pos_sent_second_prev_feats_array.append(pos_sent_second_prev_feats)

    test_SF['pos_sent_prev_feats'] = pos_sent_prev_feats_array
    test_SF['pos_sent_next_feats'] = pos_sent_next_feats_array
    test_SF['pos_sent_second_prev_feats'] = pos_sent_second_prev_feats_array
    test_SF['pos_sent_second_next_feats'] = pos_sent_second_next_feats_array

    print "Features extraction complete............"

    # Creating labels
    print "Creating labels.."
    labels_train = []
    labels_test = []
    for index, rev in train_SF.iterrows():
        labels_train.append(create_labels(rev['cleanText'],
                                          rev['aspect term']))

    for index, rev in test_SF.iterrows():
        labels_test.append(create_labels(rev['cleanText'], rev['aspect term']))

    train_SF['labels'] = labels_train
    test_SF['labels'] = labels_test

    test_SF = test_SF[test_SF['cleanText'] != '']

    # Training CRF model...
    print "Training CRF model...."
    train_sentences, sentence_labels = create_features_array(train_SF)
    test_sentences, test_sentence_labels = create_features_array(test_SF)

    print "Parameter 'C' value selection...."
    best_C_val = pick_best_C_value(train_sentences, sentence_labels, test_SF,
                                   test_sentences, test_sentence_labels)
    print "C-value found : %f" % best_C_val
    modelCRF = ChainCRF()
    ssvm = FrankWolfeSSVM(model=modelCRF,
                          C=best_C_val,
                          max_iter=10,
                          random_state=5)
    ssvm.fit(train_sentences, sentence_labels)

    print "Training complete...."

    predictions = ssvm.predict(test_sentences)
    test_SF['predicted_labels'] = predictions

    #Saving model
    print "Saving model...."
    pickle.dump(ssvm, open(filename, 'wb'))

    #Evaluating Trained CRF model
    print ""
    print " -------------- Evaluation Results --------------"
    predictions = ssvm.predict(train_sentences)
    train_SF['predicted_labels'] = predictions
    p, r, f1, common, retrieved, relevant = evaluating_ote(train_SF)
    print "--------- Train Set Results ---------"
    print "Precision : %f" % p
    print "Recall : %f" % r
    print "F1 measure : %f" % f1
    print ""
    p, r, f1, common, retrieved, relevant = evaluating_ote(test_SF)

    print "--------- Test Set Results ---------"
    print "Precision : %f" % p
    print "Recall : %f" % r
    print "F1 measure : %f" % f1
    print ""

    return f1
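pick_best_C_value is not shown in this example; a plausible, simplified reconstruction is a small grid search that refits FrankWolfeSSVM for each candidate C and keeps the best-scoring value (the helper name matches the call above, but the candidate grid and scoring are assumptions, not the original implementation):

def pick_best_C_value(train_sentences, sentence_labels, test_SF,
                      test_sentences, test_sentence_labels):
    best_C, best_score = None, -1.0
    for C in [0.01, 0.05, 0.1, 0.5, 1.0]:  # assumed candidate grid
        ssvm = FrankWolfeSSVM(model=ChainCRF(), C=C, max_iter=10, random_state=5)
        ssvm.fit(train_sentences, sentence_labels)
        score = ssvm.score(test_sentences, test_sentence_labels)  # mean per-token accuracy
        if score > best_score:
            best_C, best_score = C, score
    return best_C  # test_SF is unused in this sketch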
Example #38
0
def main():
    default_train = \
        scriptdir+'/../../../data/compression/googlecomp100.train.lbl'
    default_test = \
        scriptdir+'/../../../data/compression/googlecomp.dev.lbl'
    parser = argparse.ArgumentParser()
    parser.add_argument('--threshold',
                        '-t',
                        type=float,
                        help='Threshold for predicting 0/1. ')
    parser.add_argument('--iterations',
                        '-i',
                        type=int,
                        default=50,
                        help='Training iterations.')
    parser.add_argument('--data',
                        '-d',
                        default=default_train,
                        help='Features and labels')
    parser.add_argument('--testdata',
                        default=default_test,
                        help='Test data (not needed for crossval).')
    parser.add_argument('--verbose',
                        '-v',
                        dest='verbose',
                        action='store_true',
                        help='Print avg. loss at every iter.')
    parser.add_argument('--output', '-o', help="Output file")
    parser.add_argument('--features',
                        '-f',
                        dest='features',
                        default=[],
                        type=str,
                        nargs='+',
                        help='Used feature types')
    parser.add_argument('--train',
                        action='store_true',
                        help='If set, will train the model')

    args = parser.parse_args()

    featurizer = edge_featurize.Featurizer(args.features)
    X, y = featurizer.fit_transform(args.data)

    crf = EdgeFeatureGraphCRF(inference_method="max-product")
    model = FrankWolfeSSVM(model=crf, C=.1, max_iter=args.iterations)
    model.fit(X, y)
    if args.testdata:
        X_te, y_te = featurizer.transform(args.testdata)
        pred = model.predict(X_te)
        pred_flat = [item for sublist in pred for item in sublist]
        y_te_flat = [item for sublist in y_te for item in sublist]
        if args.output:
            with open(args.output, 'w') as of:
                for sent_pred in pred:
                    for lid in sent_pred:
                        # print(lid)
                        of.write('%s\n' % featurizer.mapper.id2label[lid])
                    of.write('\n')
        res = evaluate(pred_flat, y_te_flat)
        resout = "F1: %f, R: %f, A: %f, P: %f\n" % res
        print(resout)
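For context, EdgeFeatureGraphCRF expects each sample in X to be a (node_features, edges, edge_features) triple, which is the format the Featurizer above presumably produces; a minimal hand-built sample (shapes chosen purely for illustration):

import numpy as np

node_features = np.random.rand(3, 5)     # 3 tokens, 5 features each
edges = np.array([[0, 1], [1, 2]])       # chain-shaped connectivity
edge_features = np.random.rand(2, 2)     # one feature row per edge
x_sample = (node_features, edges, edge_features)
y_sample = np.array([0, 1, 0])           # one label per node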
Example #39
0
rnd = np.random.RandomState(1)
selected = rnd.randint(len(y_test), size=n_words)
max_word_len = max([len(y_) for y_ in y_test[selected]])
fig, axes = plt.subplots(n_words, max_word_len, figsize=(10, 10))
fig.subplots_adjust(wspace=0)
fig.text(0.2, 0.05, 'GT', color="#00AA00", size=25)
fig.text(0.4, 0.05, 'NN', color="#5555FF", size=25)
fig.text(0.6, 0.05, 'LCCRF', color="#FF5555", size=25)
fig.text(0.8, 0.05, 'LCCRF+NN', color="#FFD700", size=25)

fig.text(0.05, 0.5, 'Word', color="#000000", size=25)
fig.text(0.5, 0.95, 'Letters', color="#000000", size=25)

for ind, axes_row in zip(selected, axes):
    y_pred_nn = nn_predictions_test[ind].argmax(axis=1)
    y_pred_chain = chain_ssvm.predict([X_test[ind]])[0]
    y_pred_chain_nn = chain_ssvm_nn.predict([nn_predictions_test[ind]])[0]

    for i, (a, image, y_true, y_nn, y_chain, y_chain_nn) in enumerate(
            zip(axes_row, X_test[ind], y_test[ind], y_pred_nn, y_pred_chain, y_pred_chain_nn)):
        a.matshow(image.reshape(16, 8), cmap=plt.cm.Greys)
        a.text(0, 3, abc[y_true], color="#00AA00", size=25)    # Green
        a.text(0, 14, abc[y_nn], color="#5555FF", size=25)    # Blue
        a.text(5, 14, abc[y_chain], color="#FF5555", size=25)  # Red
        a.text(5, 3, abc[y_chain_nn], color="#FFD700", size=25)     # Yellow
        a.set_xticks(())
        a.set_yticks(())
    for ii in range(i + 1, max_word_len):
        axes_row[ii].set_visible(False)

w = chain_ssvm_nn.w[26 * 26:].reshape(26, 26)
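Because this second chain CRF is trained on the network's 26 class scores rather than raw pixels, its unary block is 26 * 26 long, so the transition matrix starts at offset 26 * 26 (the analogue of the 26 * 8 * 16 offset in the pixel-feature model); a short sketch under that assumption:

n_states, n_nn_feats = 26, 26
unary_nn = chain_ssvm_nn.w[:n_states * n_nn_feats].reshape(n_states, n_nn_feats)
transitions_nn = chain_ssvm_nn.w[n_states * n_nn_feats:].reshape(n_states, n_states)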
Example #40
0
def run_crf(w2v, words_before, words_after, shallow_parse):

    pmids_dict, pmids, abstracts, lbls, vectorizer, groups_map, one_hot, dicts = \
        parse_summerscales.get_tokens_and_lbls(
                make_pmids_dict=True, sen=True)
    """
        Create model
    """
    model = ChainCRF(directed=False)
    ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=30)

    all_pmids = pmids_dict.keys()
    n = len(all_pmids)
    n_folds = 5
    kf = KFold(n, random_state=1337, shuffle=True, n_folds=n_folds)
    fold_gi = []

    for fold_idx, (train, test) in enumerate(kf):
        print("on fold %s" % fold_idx)
        train_pmids = [all_pmids[pmid_idx] for pmid_idx in train]
        test_pmids = [all_pmids[pmid_idx] for pmid_idx in test]
        print('loading data...')
        train_x, train_y = abstract2features(pmids_dict, words_before, w2v,
                                             shallow_parse)
        test_x, test_y = abstract2features(pmids_dict, words_after, w2v,
                                           shallow_parse)

        print('loaded data...')
        print('training...')
        ssvm.fit(train_x, train_y)

        print(ssvm.score(test_x, test_y))

        for i, (pmid, x, y) in enumerate(zip(test_pmids, test_x, test_y)):
            abstract_words, _, _ = pmids_dict[pmid]

            print(pmid)

            # predict() takes in a list returns another list
            prediction = ssvm.predict([x]).pop(0)

            predicted = ''
            output = ''

            if len(prediction) > 0:

                # collect the words whose predicted label is 1
                for word_idx, p in enumerate(prediction):
                    if p == 1:
                        print("word: {}".format(abstract_words[word_idx]))
                        if predicted == '':
                            predicted += abstract_words[word_idx]
                        else:
                            predicted += ' ' + abstract_words[word_idx]

                if not predicted == '':
                    output = 'predicted: {}'.format(predicted)
                else:
                    output = 'Predicted nothing!'
            else:
                output = 'Predicted nothing!'
            print(output)
Example #41
0
def trainModel_Basic(num_iter=5,inference="qpbo",trainer="NSlack",num_train=2,num_test=1,C=0.1,edges="180x180_dist1_diag0",inputs=[1,1,1,1,1,1],features="all",directed=False,savePred=False):
    
    
    padding=(30,30,30,30)
    
    
    if directed==True:
        features +='+directed'
        
    resultsDir = os.getcwd()+'/CRFResults'
    nameLen = len(os.listdir(resultsDir))
    edgeFeature = edges
    filename=str(nameLen)+'_CRF_iter_'+str(num_iter)+"_"+inference+"_"+trainer+"_"+features+"_"+str(num_train)+"_"+str(num_test)+"_"+edgeFeature
        
    
    print "Loading training slices"
    
    
    start = time.clock()
    train =extractSlices2(train_path,num_train,padding,inputs=inputs)
    end= time.clock()
    train_load_time = (end-start)/60.0
    
    [trainLayers,trainTruth,sliceShape] = train
    print "Training slices loaded in %f" % (train_load_time)
    
    n_features= len(trainLayers[0][0,0])
    print "Layer shape is : "
    print trainLayers[0].shape
    
    print "Training the model"
    edges= np.load("/home/bmi/CRF/edges/"+edges+".npy")
    
    G = [edges for x in trainLayers]
   
    print trainLayers[0].shape
    
    trainLayers = np.array( [x.reshape((sliceShape[0]*sliceShape[1],n_features)) for x in trainLayers] )
    trainTruth = np.array( [x.reshape((sliceShape[0]*sliceShape[1],)).astype(int) for x in trainTruth] )
    
    if inference=='ogm':
        crf = GraphCRF(inference_method=('ogm',{'alg':'fm'}),directed=directed)
    else:
        crf = GraphCRF(inference_method=inference,directed=directed)
    
    if trainer=="Frank":
        svm = FrankWolfeSSVM(model = crf,max_iter=num_iter,C=C,n_jobs=6,verbose=1)
    elif trainer=="NSlack":
        svm = NSlackSSVM(model = crf,max_iter=num_iter,C=C,n_jobs=-1,verbose=1)
    else:
        svm = OneSlackSSVM(model = crf,max_iter=num_iter,C=C,n_jobs=-1,verbose=1)
    
    
    start = time.clock()
    asdf = zip(trainLayers,G)
    svm.fit(asdf,trainTruth)
    end = time.clock()
    train_time = (end-start)/60.0
    print "The training took %f" % (train_time)
    print "Model parameter size :"
    print svm.w.shape
    
    print "making predictions on train data"
    predTrain = svm.predict(asdf)
    trainDice=[]
    for i in range(len(trainLayers)):
        diceScore = accuracy(predTrain[i],trainTruth[i])
        trainDice.append(diceScore)
    meanTrainDice =  sum(trainDice)/len(trainLayers)
    
    del trainLayers,trainTruth
    
################################################################################################    
    overallDicePerPatient=[]           # For overall test Dice 
    extDicePerPatient=[]
    PatientTruthLayers=[]
    PatientPredLayers=[]
    PREC=[]
    RECALL=[]
    F1=[]
    LayerwiseDiceTotal=[]
    
    
    
    
    testResultFile = open(os.getcwd()+"/CRFResults/"+filename+".csv",'a')
    testResultFile.write("folderName,numLayers, Overall Dice, precision , recall, F1"+"\n")
    
    
    counter=0
    print "Loading the test slices"
    for folder in os.listdir(test_path):
        path = test_path + "/" + folder
        layerDiceScores=''
#        print path
        
        data = extractTestSlices2(path,padding,inputs=inputs)
        if data == 0:
            continue
        [testLayers,testTruth,sliceShape,startSlice,endSlice] = data
        
#        trueTestLayers=testLayers
        GTest = [edges for x in testLayers]
        testLayers = np.array( [x.reshape((sliceShape[0]*sliceShape[1],n_features)) for x in testLayers] )
        testTruth = np.array( [x.reshape((sliceShape[0]*sliceShape[1],)).astype(int) for x in testTruth] )
        
        asdfTest = zip(testLayers,GTest)
        predTest = svm.predict(asdfTest)  
        
        LayerwiseDice=[]
        
        for i in range(len(testLayers)):
            diceScore = accuracy(predTest[i],testTruth[i])
            layerDiceScores+=","+str(diceScore)
            if math.isnan(diceScore):
                if sum(predTest[i])==0 and sum(testTruth[i])==0:
                    LayerwiseDice.append(1.0)
                continue
            LayerwiseDice.append(diceScore)
        
        LayerwiseDiceTotal.append(LayerwiseDice)
        
        
        overallTestDice = accuracy(np.hstack(predTest),np.hstack(testTruth))
        extDice = np.mean ( np.array(LayerwiseDice)[ range(10) + range(len(LayerwiseDice)-10, len(LayerwiseDice)) ] )
        prec,recall,f1 = precision_score(np.hstack(testTruth),np.hstack(predTest)) , recall_score(np.hstack(testTruth),np.hstack(predTest)) , f1_score(np.hstack(testTruth),np.hstack(predTest))
        print "Patient %d : Overall test DICE for %s is : %f and extDice is %f"%(counter,folder,overallTestDice,extDice)
        print "Precision : %f  Recall : %f  F1 : %f " %(prec,recall,f1)
        print "__________________________________________"

        
        
#        testResultFile.write(folder+","+str(len(testLayers))+","+str(meanTestDice)+","+str(overallTestDice) ","+str(np.max(testDice)) +","+ str(np.min(testDice))+"\n" )
        testResultFile.write(folder+","+str(len(testLayers)) + ","+ str(overallTestDice) + ","+str(prec)+","+str(recall)+","+str(extDice)+layerDiceScores+"\n" )
        overallDicePerPatient.append(overallTestDice)
        extDicePerPatient.append(extDice)
        PREC.append(prec), RECALL.append(recall) , F1.append(f1)
        
        PatientTruthLayers.append(testTruth)
        PatientPredLayers.append(predTest)
        
        counter+=1
        if counter==num_test and num_test!=-1:
            break
######################################################################################################       
    print "Done testing slices"
    overallDice = sum(overallDicePerPatient)/len(PatientTruthLayers)
    overallPrec = sum(PREC)/len(PatientTruthLayers)
    overallRecall = sum(RECALL)/len(PatientTruthLayers)
    overallExtDice = np.mean(extDicePerPatient)
    print "Overall DICE : %f Precision : %f Recall : %f extDice : %f  "%(overallDice,overallPrec,overallRecall,overallExtDice)
    print "############################################"    
    
#    testOutput=np.array([PatientPredLayers,PatientTruthLayers,trueTestLayers])
    testOutput=np.array([PatientPredLayers,PatientTruthLayers])
    
    ########### Saving the models ######################################################################
    
    
#    print "Saving the model"
#    modelDir = os.getcwd()+"/CRFModel/"
#    svmModel = open(modelDir+filename+"_model"+".pkl",'wb')
#    cPickle.dump(svm,svmModel,protocol=cPickle.HIGHEST_PROTOCOL)
#    svmModel.close()    
#    
#    print "saving the predictions"
#    predFileTest = open(os.getcwd()+"/CRFPred/"+filename+"_pred.pkl",'wb')
#    cPickle.dump(testOutput,predFileTest,protocol=cPickle.HIGHEST_PROTOCOL)
#    predFileTest.close()   
    
    
    layerDataLog = open(os.getcwd()+"/CRFModel/"+filename+"_layer.pkl",'wb')
    cPickle.dump(LayerwiseDiceTotal,layerDataLog,protocol = cPickle.HIGHEST_PROTOCOL)
    layerDataLog.close()
    
    resultLog = os.getcwd()+"/CRFResults/TestResultFinal.csv"
    resultFile = open(resultLog,'a')
    resultFile.write(time.ctime()+","+str(num_iter)+","+str(num_train)+","+str(num_test)+","+inference+","+
    trainer+","+str(C)+","+str(train_time)+","+str(meanTrainDice)+","+str(overallDice)+","+
    str(np.std(overallDicePerPatient))+","+edgeFeature+","+"None"+","+features+","+filename +","+ str(overallPrec) +","+ str(overallRecall) +","+ str(overallExtDice)+","+"Flair(5)+T2(9)-Without last 4 train Layers"+"\n")
    
    
    resultFile.close()
    testResultFile.close()
    
    return
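The zip(trainLayers, G) pairing works because pystruct's GraphCRF takes each sample as a (node_features, edges) tuple; a minimal hand-built sample in that format (sizes are illustrative, not the 180x180 slices used above):

import numpy as np

node_features = np.random.rand(4, 3)                 # 4 nodes (pixels), 3 features each
edges = np.array([[0, 1], [1, 2], [2, 3]])           # pairs of connected node indices
x_sample = (node_features, edges)
y_sample = np.array([0, 0, 1, 1])                    # one integer label per node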
Example #42
0
        format(len(train_index), len(test_index)))
    X_train = X[train_index]
    Y_train = Y[train_index]
    X_test = X[test_index]
    Y_test = Y[test_index]

    X_train_vector = np.reshape(
        X_train, (X_train.shape[0] * X_train.shape[1], X_train.shape[2]))
    X_test_vector = np.reshape(
        X_test, (X_test.shape[0] * X_test.shape[1], X_test.shape[2]))
    Y_train_vector = np.reshape(Y_train, (Y_train.shape[0] * Y_train.shape[1]))
    Y_test_vector = np.reshape(Y_test, (Y_test.shape[0] * Y_test.shape[1]))
    """ YOUR S-SVM TRAINING CODE HERE """
    ssvm.fit(X_train, Y_train)
    """ LABEL THE TESTING SET AND PRINT RESULTS """
    y_pred_ssvm = ssvm.predict(X_test)
    ssvm_score = ssvm.score(X_test, Y_test)
    scores_crf[fold] = ssvm_score
    wrong_segments_crf.append(np.size(Y_test) - np.sum(y_pred_ssvm == Y_test))
    """ figure showing the result of classification of segments for
    each jacket in the testing part of present fold """
    if plot_labeling:
        for ti, pred in zip(test_index, y_pred_ssvm):
            print(ti)
            print(pred)
            s = segments[ti]
            plot_segments(s,
                          caption='SSVM predictions for jacket ' + str(ti + 1),
                          labels_segments=pred)
    """ YOUR LINEAR SVM TRAINING CODE HERE """
Example #43
0
                               C=C,
                               max_iter=300,
                               check_dual_every=50,
                               line_search=False,
                               verbose=True)
    # fw_batch_svm = FrankWolfeSSVM(model, C=.1, max_iter=50, batch_mode=True)
    gfw_bc_svm = GeneralizedFrankWolfeSSVM(gmodel,
                                           C=C,
                                           max_iter=300,
                                           check_dual_every=50,
                                           line_search=False,
                                           verbose=True)

    if method == 'generalized':
        start = time()
        gfw_bc_svm.fit(X_train_bias, y_train)
        y_pred = np.hstack(gfw_bc_svm.predict(X_test_bias))
        time_fw_bc_svm = time() - start
        print("Score with maxminsvm: %f , C=%f (took %f seconds)" %
              (np.mean(y_pred == y_test), C, time_fw_bc_svm))
        pdb.set_trace()
    elif method == 'vanilla':
        start = time()
        fw_bc_svm.fit(X_train_bias, y_train)
        y_pred = np.hstack(fw_bc_svm.predict(X_test_bias))
        time_fw_bc_svm = time() - start
        print("Score with cssvm: %f , C=%f (took %f seconds)" %
              (np.mean(y_pred == y_test), C, time_fw_bc_svm))
        pdb.set_trace()
        # compute error
Example #44
0
            print('find training data')
            train_datas, train_labels, _ = self.get_datas(c_idxs, labels, mentions, retweets, bags)
            test_datas, test_labels, node_ids = self.get_datas(test_ids, labels, mentions, retweets, bags)
            if i == 0:
                x_test_ori, y_test_ori = test_datas, test_labels
            print(len(train_datas))
            print(len(test_datas))
            X_train, y_train = train_datas, train_labels

            model = GraphCRF(inference_method="max-product")
            ssvm = FrankWolfeSSVM(model=model, C=0.1, max_iter=10)
            ssvm.fit(X_train, y_train)
            y_preds = ssvm.predict(test_datas)
            result = ssvm.score(x_test_ori, y_test_ori)
            print('iter {} result = {}'.format(i, result))
            count = 0
            for clique_idx, clique in enumerate(y_preds):
                for node_idx, node in enumerate(clique):
                    node_id = node_ids[clique_idx][node_idx]
                    if node == central_propagation_df.iloc[node_id].values:
                        clabels[int(node_id)] = node
                        if not int(node_id) in c_idxs:
                            c_idxs = np.append(c_idxs, int(node_id))
                            count += 1
            print('iter {} update {} new labels'.format(i, count))