Beispiel #1
0
def chain_crf():
    """Train a chain CRF on the letters dataset and print its test score.

    The dataset comes from ``load_letters()``.  Sequences whose fold id equals
    1 form the training set and all remaining sequences form the test set.
    Diagnostic output is printed along the way; nothing is returned.
    """
    letters = load_letters()
    x, y, folds = letters['data'], letters['labels'], letters['folds']
    # Single-argument print(...) emits the same text on Python 2 and Python 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    print("Letters : ")
    print(letters)

    x, y = np.array(x), np.array(y)
    x_train, x_test = x[folds == 1], x[folds != 1]
    y_train, y_test = y[folds == 1], y[folds != 1]
    print(len(x_train))
    print(len(x_test))
    print("Done")

    # Shapes of two sample sequences, as a sanity check of the data layout.
    print(x_train[0].shape)
    print(y_train[0].shape)
    print(x_train[10].shape)
    print(y_train[10].shape)

    model = ChainCRF()
    ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=10)
    # fit() output is printed as-is, followed by the score on the test split.
    print(ssvm.fit(x_train, y_train))
    print(ssvm.score(x_test, y_test))
Beispiel #2
0
 def cross_val(self, X_train, y_train):
     '''
     method to conduct 5-fold cross validation

     A fresh ChainCRF / FrankWolfeSSVM pair (C=0.5, max_iter=15) is fitted on
     each training part and its score on the matching validation part is
     printed.  The folds are taken in order, without shuffling.  Nothing is
     returned.
     '''
     kf = KFold(len(X_train), n_folds=5, random_state=None, shuffle=False)
     for train_idx, test_idx in kf:
         xtrain, xval = X_train[train_idx], X_train[test_idx]
         ytrain, yval = y_train[train_idx], y_train[test_idx]
         model = ChainCRF()
         ssvm = FrankWolfeSSVM(model=model, C=0.5, max_iter=15)
         ssvm.fit(xtrain, ytrain)
         # print(...) with one argument behaves the same on Python 2 and 3
         print(ssvm.score(xval, yval))
Beispiel #3
0
class CRFTrainer(object):
    """Small wrapper that fits and applies a ChainCRF via FrankWolfeSSVM."""

    def __init__(self, c_value, classifier_name='ChainCRF'):
        """Remember the settings and build the classifier.

        Raises TypeError when classifier_name is not 'ChainCRF'.
        """
        self.c_value = c_value
        self.classifier_name = classifier_name

        # Guard clause: ChainCRF is the only classifier this class can build.
        if self.classifier_name != 'ChainCRF':
            raise TypeError('Invalid classifier type')

        self.clf = FrankWolfeSSVM(model=ChainCRF(), C=self.c_value, max_iter=50)

    def load_data(self):
        """Return (X, y, folds) from load_letters(); X and y as numpy arrays."""
        dataset = load_letters()
        samples, targets, fold_ids = (dataset['data'], dataset['labels'],
                                      dataset['folds'])
        return np.array(samples), np.array(targets), fold_ids

    # X is a numpy array of samples; each sample is (letters, values)
    def train(self, X_train, y_train):
        """Fit the wrapped classifier on the training data."""
        self.clf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        """Return the wrapped classifier's score on the test data."""
        test_score = self.clf.score(X_test, y_test)
        return test_score

    # Run the classifier on the input data
    def classify(self, input_data):
        """Return the first prediction produced for input_data."""
        predictions = self.clf.predict(input_data)
        return predictions[0]
Beispiel #4
0
class CRFTrainer(object):
    '''
    Builds, cross-validates, trains and applies a ChainCRF model that is
    fitted with FrankWolfeSSVM.
    '''

    def __init__(self, c_value, classifier_name='ChainCRF'):
        '''
        store the settings and build the classifier

        c_value: value handed to FrankWolfeSSVM as C
        classifier_name: must be 'ChainCRF'; any other name raises TypeError
        '''
        self.c_value = c_value
        self.classifier_name = classifier_name
        # only the chain CRF is supported, so every other name is rejected
        if self.classifier_name == 'ChainCRF':
            model = ChainCRF()
            self.clf = FrankWolfeSSVM(model=model,
                                      C=self.c_value,
                                      max_iter=100)
        else:
            raise TypeError('Invalid classifier type')

    def load_clean_data(self):
        '''
        load the data into X and y, where X is a numpy array of samples where each sample has the shape (n_letters, n_features)

        Runs the featurize pipeline step by step on the data frame returned by
        featurize.get_data() and returns (df, X, y).
        '''
        df = featurize.get_data()
        # each call below is applied to the same data frame, in this order
        featurize.split_words(df)
        featurize.first_letter_uppercase(df)
        featurize.has_number(df)
        featurize.has_slash(df)
        featurize.spacy_pos_tagger(df)
        featurize.pos_ngrams(df)
        featurize.encoding_labels(df)
        X, y = featurize.get_X_and_y(df)
        return df, X, y

    def cross_val(self, X_train, y_train):
        '''
        method to conduct 5-fold cross validation

        Every fold trains its own ChainCRF / FrankWolfeSSVM pair with C=0.5
        and max_iter=15 (self.clf is not touched) and prints the score on the
        validation part.
        '''
        kf = KFold(len(X_train), n_folds=5, random_state=None, shuffle=False)
        for train_idx, test_idx in kf:
            xtrain, xval = X_train[train_idx], X_train[test_idx]
            ytrain, yval = y_train[train_idx], y_train[test_idx]
            model = ChainCRF()
            ssvm = FrankWolfeSSVM(model=model, C=0.5, max_iter=15)
            ssvm.fit(xtrain, ytrain)
            # print(...) with one argument is valid on Python 2 and Python 3;
            # the print statement is a SyntaxError on Python 3
            print(ssvm.score(xval, yval))

    def train(self, X_train, y_train):
        '''
        training method
        '''
        self.clf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        '''
        method to evaluate the performance of the model
        '''
        return self.clf.score(X_test, y_test)

    def classify(self, input_data):
        '''
        method to run the classifier on input data

        Only the first prediction is returned.
        '''
        return self.clf.predict(input_data)[0]
Beispiel #5
0
class CRFModel(object):
    """ChainCRF model fitted with FrankWolfeSSVM, plus a letter-decoding helper."""

    def __init__(self, c_val=1.0):
        """Create the classifier; c_val is handed to FrankWolfeSSVM as C."""
        self.clf = FrankWolfeSSVM(model=ChainCRF(), C=c_val, max_iter=50)

    def load_data(self):
        """Return (X, y, folds) from load_letters(); X and y as numpy arrays."""
        alphabets = load_letters()
        X = np.array(alphabets['data'])
        y = np.array(alphabets['labels'])
        folds = alphabets['folds']
        return X, y, folds

    def train(self, X_train, y_train):
        """Fit the classifier on the training data."""
        self.clf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        """Return the classifier's score on the test data."""
        return self.clf.score(X_test, y_test)

    def classify(self, input_data):
        """Return the first prediction produced for input_data."""
        return self.clf.predict(input_data)[0]

    @staticmethod
    def convert_to_letters(indices):
        """Map integer indices (0 -> 'a', ..., 25 -> 'z') to a lowercase string.

        Declared as a static method because the function takes no ``self``;
        without the decorator, calling it on an instance bound the instance
        itself to ``indices``.
        """
        alphabets = np.array(list(string.ascii_lowercase))
        output = np.take(alphabets, indices)
        return ''.join(output)
class CRFTrainer(object):
    """Trains and applies a ChainCRF model through FrankWolfeSSVM."""

    def __init__(self, c_value, classifier_name='ChainCRF'):
        """Keep the settings and create the classifier.

        A classifier_name other than 'ChainCRF' raises TypeError.
        """
        self.c_value = c_value
        self.classifier_name = classifier_name

        # Reject unsupported classifiers before building anything.
        if self.classifier_name != 'ChainCRF':
            raise TypeError('Invalid classifier type')

        crf_model = ChainCRF()
        self.clf = FrankWolfeSSVM(model=crf_model, C=self.c_value, max_iter=50)

    def load_data(self):
        """Load the letters dataset and return (X, y, folds)."""
        letters = load_letters()
        features = np.array(letters['data'])
        targets = np.array(letters['labels'])
        return features, targets, letters['folds']

    # X is a numpy array of samples where each sample
    # has the shape (n_letters, n_features)
    def train(self, X_train, y_train):
        """Fit the classifier on the given samples and labels."""
        self.clf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        """Return the classifier's score on the given samples and labels."""
        result = self.clf.score(X_test, y_test)
        return result

    # Run the classifier on input data
    def classify(self, input_data):
        """Predict for input_data and return the first prediction."""
        first_prediction = self.clf.predict(input_data)[0]
        return first_prediction
Beispiel #7
0
class CRFTrainer(object):
    """Convenience class around a FrankWolfeSSVM-trained ChainCRF."""

    def __init__(self, c_value, classifier_name='ChainCRF'):
        """Record the configuration and instantiate the classifier.

        TypeError is raised for any classifier_name except 'ChainCRF'.
        """
        self.c_value = c_value
        self.classifier_name = classifier_name

        supported = self.classifier_name == 'ChainCRF'
        if not supported:
            raise TypeError('Invalid classifier type')

        self.clf = FrankWolfeSSVM(model=ChainCRF(), C=self.c_value, max_iter=50)

    def load_data(self):
        """Return the letters dataset as (X, y, folds)."""
        raw = load_letters()
        data, labels, folds = raw['data'], raw['labels'], raw['folds']
        return np.array(data), np.array(labels), folds

    # X is a numpy array of samples; each sample is (letters, values)
    def train(self, X_train, y_train):
        """Fit the classifier."""
        self.clf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        """Score the classifier on held-out data and return the score."""
        outcome = self.clf.score(X_test, y_test)
        return outcome

    # Run the classifier on the input data
    def classify(self, input_data):
        """Return the first element of the classifier's predictions."""
        predicted = self.clf.predict(input_data)
        return predicted[0]
class CRFTrainer(object):
    """Wraps a ChainCRF trained with the Frank-Wolfe structured SVM learner."""

    def __init__(self, c_value, classifier_name='ChainCRF'):
        """Save the arguments and construct the classifier.

        Only 'ChainCRF' is accepted; other names raise TypeError.
        """
        self.c_value = c_value
        self.classifier_name = classifier_name

        # Fail fast for anything that is not the chain CRF.
        if self.classifier_name != 'ChainCRF':
            raise TypeError('Invalid classifier type')

        chain = ChainCRF()
        self.clf = FrankWolfeSSVM(model=chain, C=self.c_value, max_iter=50)

    def load_data(self):
        """Fetch the letters dataset and return (X, y, folds)."""
        bundle = load_letters()
        X = np.array(bundle['data'])
        y = np.array(bundle['labels'])
        return X, y, bundle['folds']

    # X is a numpy array of samples where each sample
    # has the shape (n_letters, n_features)
    def train(self, X_train, y_train):
        """Train the classifier on X_train / y_train."""
        self.clf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        """Return the score of the classifier on X_test / y_test."""
        measured = self.clf.score(X_test, y_test)
        return measured

    # Run the classifier on input data
    def classify(self, input_data):
        """Return the first prediction for input_data."""
        all_predictions = self.clf.predict(input_data)
        return all_predictions[0]
def n_cross_valid_crf(X, Y, K, command):
    """Run K-fold cross validation of a linear-chain CRF and report results.

    For every fold a ChainCRF is trained with FrankWolfeSSVM and its accuracy
    on the held-out part is printed.  ``command`` selects the extra report:

    * ``'metrics_F1'``       -- call ``metrics_crf`` for the fold
    * ``'confusion_matrix'`` -- call ``confusion_matrix_CRF`` for the fold
    * ``'write_results'``    -- collect the fold results and, after all folds,
      print them sorted by their first element as ``prediction<TAB>truth``

    Any other value prints the accuracies only.  Nothing is returned.
    """
    list_write = []

    cv = KFold(len(X), K, shuffle=True, random_state=0)
    for traincv, testcv in cv:
        x_train, x_test = X[traincv], X[testcv]
        y_train, y_test = Y[traincv], Y[testcv]

        crf = ChainCRF(inference_method='max-product', directed=False, class_weight=None)
        ssvm = FrankWolfeSSVM(model=crf, C=1.0, max_iter=100)
        ssvm.fit(x_train, y_train)
        y_pred = ssvm.predict(x_test)

        # Single-argument print(...) works on both Python 2 and Python 3.
        print('Accuracy of linear-crf %f:' % ssvm.score(x_test, y_test))
        if command == 'metrics_F1':
            metrics_crf(y_test, y_pred)
        elif command == 'confusion_matrix':
            confusion_matrix_CRF(y_test, y_pred)
        elif command == 'write_results':
            list_write += write_results_CRF(testcv, y_test, y_pred)

        print('------------------------------------------------------')
        print('------------------------------------------------------')

    if command == 'write_results':
        # Folds were shuffled, so restore the order given by the first element
        # of each collected entry before printing.
        list_write = sorted(list_write, key=itemgetter(0))
        for value in list_write:
            pred_list = value[1]
            test_list = value[2]

            for pred, truth in zip(pred_list, test_list):
                print(str(pred) + '\t' + str(truth))
Beispiel #10
0
    def structraining(self, bags, mentions, retweets, labels):
        """Build one star-shaped graph per labelled user, then train and score a CRF.

        For every user in ``bags`` that has a label, node 0 carries the user's
        own bag.  Each labelled user they mention or retweet adds one node and
        one edge from node 0 to that node; the edge feature is 0 for a mention
        and 1 for a retweet.  Neighbours without a bag get an all-zero feature
        row of length ``self.top_seq``.  The first 70% of the graphs are used
        to fit an EdgeFeatureGraphCRF with FrankWolfeSSVM; the score on the
        remaining graphs is printed.  Nothing is returned.

        :param bags: dict mapping user id to that user's feature bag
        :param mentions: mapping from user id to the ids that user mentioned
        :param retweets: mapping from user id to the ids that user retweeted
        :param labels: dict mapping user id to its label
        """
        total_datas = []
        total_labels = []
        print('num_user', len(bags.keys()))
        for user_id, bag in bags.items():
            if user_id not in labels:
                continue
            features = np.empty((0, self.top_seq))
            edge_nodes = np.empty((0, 2))
            edge_features = np.empty((0, 1))
            clique_labels = np.array([labels[user_id]])
            features = np.vstack([features, bag])
            mentioned_ids = mentions[user_id]
            cnt = 0
            # Iterate over the ids themselves.  Wrapping the iterable in
            # enumerate() yields (index, id) tuples, which never match a key
            # of `labels`, so no mention edge would ever be created.
            for mentioned_id in mentioned_ids:
                if mentioned_id not in labels:
                    continue
                clique_labels = np.append(clique_labels,
                                          np.array([labels[mentioned_id]]))
                if mentioned_id in bags:
                    features = np.vstack([features, bags[mentioned_id]])
                else:
                    features = np.vstack([features, np.zeros(self.top_seq)])
                edge_nodes = np.vstack([edge_nodes, np.array([0, cnt + 1])])
                edge_features = np.vstack([edge_features, np.array([[0]])])
                cnt += 1

            # Retweet nodes are numbered after all mention nodes.
            num_mentioned = edge_nodes.shape[0]
            retweet_ids = retweets[user_id]
            cnt = 0
            for retweet_id in retweet_ids:
                if retweet_id not in labels:
                    continue
                clique_labels = np.append(clique_labels,
                                          np.array([labels[retweet_id]]))
                if retweet_id in bags:
                    features = np.vstack([features, bags[retweet_id]])
                else:
                    features = np.vstack([features, np.zeros(self.top_seq)])
                edge_nodes = np.vstack(
                    [edge_nodes,
                     np.array([0, cnt + 1 + num_mentioned])])
                edge_features = np.vstack([edge_features, np.array([[1]])])
                cnt += 1

            total_datas.append(
                (features, edge_nodes.astype(int), edge_features))
            total_labels.append(clique_labels)

        # Ordered 70/30 split of the collected graphs.
        ratio = int(len(total_datas) * 0.7)
        print(ratio)
        X_train, y_train = total_datas[:ratio], total_labels[:ratio]
        X_test, y_test = total_datas[ratio:], total_labels[ratio:]

        model = EdgeFeatureGraphCRF(inference_method="max-product")
        ssvm = FrankWolfeSSVM(model=model, C=0.1, max_iter=10)
        ssvm.fit(X_train, y_train)
        result = ssvm.score(X_test, y_test)
        print(result)
def test_svm_as_crf_pickling_batch():
    """Check that a FrankWolfeSSVM saved by SaveLogger scores like the live one.

    Runs with ``batch_mode=False`` on the iris data, where every sample is a
    one-node graph without edges.
    """
    import os  # local import: only needed for temp-file housekeeping

    iris = load_iris()
    X, y = iris.data, iris.target

    # The builtin ``int`` replaces the ``np.int`` alias, which was removed in
    # NumPy 1.24 and only ever was another name for the builtin.
    X_ = [(np.atleast_2d(x), np.empty((0, 2), dtype=int)) for x in X]
    Y = y.reshape(-1, 1)

    X_train, X_test, y_train, y_test = train_test_split(X_, Y, random_state=1)
    # mkstemp() returns an open descriptor; close it so it is not leaked.
    fd, file_name = mkstemp()
    os.close(fd)

    try:
        pbl = GraphCRF(n_features=4, n_states=3, inference_method='unary')
        logger = SaveLogger(file_name)
        svm = FrankWolfeSSVM(pbl, C=10, logger=logger, max_iter=50, batch_mode=False)
        svm.fit(X_train, y_train)

        assert_less(.97, svm.score(X_test, y_test))
        assert_less(.97, logger.load().score(X_test, y_test))
    finally:
        # Do not leave the temporary model file behind.
        os.remove(file_name)
def test_svm_as_crf_pickling_bcfw():
    """Check that a FrankWolfeSSVM saved by SaveLogger scores like the live one.

    Same scenario as the batch variant but with the learner's default
    ``batch_mode``; every iris sample is a one-node graph without edges.
    """
    import os  # local import: only needed for temp-file housekeeping

    iris = load_iris()
    X, y = iris.data, iris.target

    # The builtin ``int`` replaces the ``np.int`` alias, which was removed in
    # NumPy 1.24 and only ever was another name for the builtin.
    X_ = [(np.atleast_2d(x), np.empty((0, 2), dtype=int)) for x in X]
    Y = y.reshape(-1, 1)

    X_train, X_test, y_train, y_test = train_test_split(X_, Y, random_state=1)
    # mkstemp() returns an open descriptor; close it so it is not leaked.
    fd, file_name = mkstemp()
    os.close(fd)

    try:
        pbl = GraphCRF(n_features=4, n_states=3, inference_method='unary')
        logger = SaveLogger(file_name)
        svm = FrankWolfeSSVM(pbl, C=10, logger=logger, max_iter=50)
        svm.fit(X_train, y_train)

        assert_less(.97, svm.score(X_test, y_test))
        assert_less(.97, logger.load().score(X_test, y_test))
    finally:
        # Do not leave the temporary model file behind.
        os.remove(file_name)
Beispiel #13
0
def MLfitCRF(data_train, data_test, records, folds):
    """Fit a ChainCRF on the training data and print its score on the test data.

    ``data_train`` and ``data_test`` are indexed as ``[0]`` (feature vectors)
    and ``[1]`` (labels).  ``records`` and ``folds`` are accepted but not used
    by this function.  Nothing is returned.
    """
    # The extra list level turns the training data into a batch that holds
    # exactly one sequence.
    fvector = np.array([data_train[0]])
    labels = np.array([data_train[1]])

    # create CRF model
    CRFmodel = ChainCRF()
    # create ML classifier
    ssvm = FrankWolfeSSVM(model=CRFmodel, C=0.1)
    # training
    ssvm.fit(fvector, labels)

    # model testing
    # NOTE(review): unlike the training data, the test data is not wrapped in
    # an extra list -- confirm data_test[0] already is a batch of sequences.
    fvector_test = np.array(data_test[0])
    labels_test = np.array(data_test[1])
    # Score the test features; `fvector_train` does not exist in this scope
    # and raised NameError.
    score = ssvm.score(fvector_test, labels_test)

    # Single-argument print(...) works on both Python 2 and Python 3.
    print(score)

    return
def results_CRFs(X_training, Y_training, X_testing, Y_testing, command):
    """Train a linear-chain CRF once and report on the given test set.

    The accuracy is always printed.  ``command`` selects the extra report:

    * ``'metrics_F1'``       -- call ``metrics_crf``
    * ``'confusion_matrix'`` -- call ``confusion_matrix_CRF``
    * ``'write_results'``    -- print the pairs returned by
      ``write_CRFs_compare`` as ``first<TAB>second``, element by element

    Nothing is returned.
    """
    crf = ChainCRF(inference_method='max-product', directed=False, class_weight=None)
    ssvm = FrankWolfeSSVM(model=crf, C=1.0, max_iter=100)
    ssvm.fit(X_training, Y_training)
    y_pred = ssvm.predict(X_testing)

    # Single-argument print(...) works on both Python 2 and Python 3.
    print('Accuracy of linear-crf %f:' % ssvm.score(X_testing, Y_testing))
    if command == 'metrics_F1':
        metrics_crf(Y_testing, y_pred)
    elif command == 'confusion_matrix':
        confusion_matrix_CRF(Y_testing, y_pred)
    elif command == 'write_results':
        list_write = write_CRFs_compare(Y_testing, y_pred)
        for value in list_write:
            pred_list = value[0]
            test_list = value[1]

            for pred, truth in zip(pred_list, test_list):
                print(str(pred) + '\t' + str(truth))
def chaincrf_test():
    """Train a ChainCRF on the picture data and print its test score.

    Loads 3000 pictures, rearranges them according to ``mode`` (0: pixel,
    1: row, 2: picture), trains on the first 66% and scores on the rest.
    """
    num_pics = 3000
    X, Y = load_pictures(num_pics)
    X = np.array(X)
    Y = np.array(Y)

    # Single-argument print(...) works on both Python 2 and Python 3.
    print(X.shape)
    print(Y.shape)

    # 0: pixel, 1: row, 2: picture
    mode = 0
    outstr = "Test score with data arranged by "

    if mode == 0:
        X, Y = arrange_by_pixel(X, Y)
        outstr += "pixel:"
    elif mode == 1:
        X, Y = arrange_by_row(X, Y)
        outstr += "row:"
    elif mode == 2:
        X, Y = arrange_by_picture(X, Y)
        outstr += "picture:"

    print(X.shape)
    print(Y.shape)

    train_pct = 0.66
    # One integer boundary serves both slices.  Starting the test slice at
    # floor((1 - train_pct) * num_pics) made it overlap the training slice, and
    # math.floor() returns a float on Python 2, which is not a valid index.
    # NOTE(review): the boundary is derived from num_pics, not len(X) -- confirm
    # the arrange_* helpers keep the number of samples unchanged.
    split = int(math.floor(train_pct * num_pics))
    X_train, X_test = X[:split], X[split:]
    Y_train, Y_test = Y[:split], Y[split:]

    model = ChainCRF()
    ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=10)
    ssvm.fit(X_train, Y_train)
    results = ssvm.score(X_test, Y_test)
    print(outstr)
    print(results)
Beispiel #16
0
def chaincrf_test():
    """Train a ChainCRF on the picture data and print its test score.

    Loads 3000 pictures, rearranges them according to ``mode`` (0: pixel,
    1: row, 2: picture), trains on the first 66% and scores on the rest.
    """
    num_pics = 3000
    X, Y = load_pictures(num_pics)
    X = np.array(X)
    Y = np.array(Y)

    # Single-argument print(...) works on both Python 2 and Python 3.
    print(X.shape)
    print(Y.shape)

    # 0: pixel, 1: row, 2: picture
    mode = 0
    outstr = "Test score with data arranged by "

    if mode == 0:
        X, Y = arrange_by_pixel(X, Y)
        outstr += "pixel:"
    elif mode == 1:
        X, Y = arrange_by_row(X, Y)
        outstr += "row:"
    elif mode == 2:
        X, Y = arrange_by_picture(X, Y)
        outstr += "picture:"

    print(X.shape)
    print(Y.shape)

    train_pct = 0.66
    # One integer boundary serves both slices.  Starting the test slice at
    # floor((1 - train_pct) * num_pics) made it overlap the training slice, and
    # math.floor() returns a float on Python 2, which is not a valid index.
    # NOTE(review): the boundary is derived from num_pics, not len(X) -- confirm
    # the arrange_* helpers keep the number of samples unchanged.
    split = int(math.floor(train_pct * num_pics))
    X_train, X_test = X[:split], X[split:]
    Y_train, Y_test = Y[:split], Y[split:]

    model = ChainCRF()
    ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=10)
    ssvm.fit(X_train, Y_train)
    results = ssvm.score(X_test, Y_test)
    print(outstr)
    print(results)
Beispiel #17
0
class CRFModel(object):
    """ChainCRF model that is trained with FrankWolfeSSVM."""

    def __init__(self, c_val=1.0):
        """Create the classifier; c_val is handed to FrankWolfeSSVM as C."""
        chain = ChainCRF()
        self.clf = FrankWolfeSSVM(model=chain, C=c_val, max_iter=100)

    # Load the training data
    def load_data(self):
        """Return (X, y, folds) from load_letters(); X and y as numpy arrays."""
        dataset = load_letters()
        samples = np.array(dataset['data'])
        targets = np.array(dataset['labels'])
        return samples, targets, dataset['folds']

    # Train the CRF
    def train(self, X_train, y_train):
        """Fit the classifier on the training data."""
        self.clf.fit(X_train, y_train)

    # Evaluate the accuracy of the CRF
    def evaluate(self, X_test, y_test):
        """Return the classifier's score on the test data."""
        test_score = self.clf.score(X_test, y_test)
        return test_score

    # Run the CRF on unknown data
    def classify(self, input_data):
        """Return the first prediction produced for input_data."""
        predictions = self.clf.predict(input_data)
        return predictions[0]
Beispiel #18
0
# One iteration per split produced by kf.split(X): fit the structured SVM and
# then the linear SVM on the training part, and evaluate on the test part.
for train_index, test_index in kf.split(X):
    print(' ')
    print('train index {}'.format(train_index))
    print('test index {}'.format(test_index))
    print('{} jackets for training, {} for testing'.format(
        len(train_index), len(test_index)))
    # Select this split's samples by index.
    X_train = X[train_index]
    Y_train = Y[train_index]
    X_test = X[test_index]
    Y_test = Y[test_index]
    """ YOUR S-SVM TRAINING CODE HERE """
    ssvm.fit(X_train, Y_train)
    """ LABEL THE TESTING SET AND PRINT RESULTS """
    Y_pred = ssvm.predict(X_test)
    # Count the elements where prediction and ground truth disagree.
    wrong_segments_crf.append(np.sum(Y_pred != Y_test))
    score = ssvm.score(X_test, Y_test)
    # NOTE(review): `fold` is not assigned anywhere in the visible loop --
    # confirm it is advanced once per iteration elsewhere in the script.
    scores_crf[fold] = score
    """ figure showing the result of classification of segments for
    each jacket in the testing part of present fold """
    if plot_labeling:
        for ti, pred in zip(test_index, Y_pred):
            print(ti)
            print(pred)
            s = segments[ti]
            # The caption uses ti + 1, i.e. a 1-based jacket number.
            plot_segments(s,
                          caption='SSVM predictions for jacket ' + str(ti + 1),
                          labels_segments=pred)
    """ YOUR LINEAR SVM TRAINING CODE HERE """
    # The linear SVM gets flattened data: rows of num_features values and a
    # 1-D label vector.
    svm.fit(X_train.reshape((-1, num_features)), Y_train.reshape((-1)))
    """ LABEL THE TESTING SET AND PRINT RESULTS """
    Y_pred = svm.predict(X_test.reshape((-1, num_features))).reshape(
Beispiel #19
0
# print("Shuffle results")
# features, labels = util.shuffle(features, labels)


# Ordered 70/30 split.  The two slices are contiguous ([:trsize] and
# [trsize:]) so that every sample lands in exactly one of them; slicing with
# [1:trsize] and [trsize+1:] silently dropped sample 0 and sample trsize.
trsize = int(0.7*len(labels))
X_train = features[:trsize]
y_train = labels[:trsize]

X_test = features[trsize:]
y_test = labels[trsize:]

# X_train = X_test = features
# y_train = y_test = labels
# trsize = len(labels)

# Evaluate the chain
model = ChainCRF()
C=0.0001
max_iter=50
ssvm = FrankWolfeSSVM(model=model, C=C, max_iter=max_iter, verbose=True)
print(ssvm)
print(ssvm.fit(X_train, y_train))
# Weight vector of the fitted learner.
print(ssvm.w)
trscore = ssvm.score(X_train,y_train)
# testscore = ssvm.score(X_test,y_test)
print("Training score: {0}".format(trscore))
# print("Test score: {0}".format(testscore))

# Save the result
# util.saveToSQL(featureset, C, max_iter, trsize, trscore, 2)
Beispiel #20
0
# Rearrange the network predictions with arrange_letters_in_pred_like so they
# can be passed to the CRF in place of X_train / X_test.
nn_predictions_train = arrange_letters_in_pred_like(X_train, train_net_pred, size_of_pred=26)
nn_predictions_test = arrange_letters_in_pred_like(X_test, test_net_pred, size_of_pred=26)

# Train LCCRF
chain_model = ChainCRF(directed=True)
chain_ssvm = FrankWolfeSSVM(model=chain_model, C=.1, max_iter=11)
chain_ssvm.fit(X_train, y_train)

# Train LCCRF+NN
chain_model = ChainCRF(directed=True)
chain_ssvm_nn = FrankWolfeSSVM(model=chain_model, C=.1, max_iter=11)
chain_ssvm_nn.fit(nn_predictions_train, y_train)

# NOTE(review): this figure is a hard-coded string, not computed here --
# confirm it still matches the network that produced the predictions.
print("Test score with linear NN: 84.15%")

print("Test score with LCCRF: %f" % chain_ssvm.score(X_test, y_test))

print("Test score with LCCRF+NN: %f" % chain_ssvm_nn.score(nn_predictions_test, y_test))

# plot some word sequenced
n_words = 4
# Fixed seed, so the same test words are selected on every run.
rnd = np.random.RandomState(1)
selected = rnd.randint(len(y_test), size=n_words)
# The longest selected word determines the number of subplot columns.
max_word_len = max([len(y_) for y_ in y_test[selected]])
fig, axes = plt.subplots(n_words, max_word_len, figsize=(10, 10))
fig.subplots_adjust(wspace=0)
# Colour legend along the bottom of the figure.
fig.text(0.2, 0.05, 'GT', color="#00AA00", size=25)
fig.text(0.4, 0.05, 'NN', color="#5555FF", size=25)
fig.text(0.6, 0.05, 'LCCRF', color="#FF5555", size=25)
fig.text(0.8, 0.05, 'LCCRF+NN', color="#FFD700", size=25)
Beispiel #21
0
class CRFClassifierText(object):

    # Class-level regular expressions: compiled once when the class body is
    # executed and shared by all instances.

    # matches the phrases "in press", "submitted" or "to appear" in any case
    IGNORE_IF = re.compile(r'(in press|submitted|to appear)',
                           flags=re.IGNORECASE)

    # groups 2 and 4 capture the double quotes around "et al" / "et al."
    QUOTES_AROUND_ETAL_REMOVE = re.compile(r'(.*)(")(et al\.?)(")(.*)',
                                           re.IGNORECASE)
    TO_ADD_DOT_AFTER_INITIALS = re.compile(
        r'\b([A-Z]{1}(?!\.))([\s,]+)([A-Z12(]|and)')
    TO_ADD_SEPARATE_INITIALS = re.compile(r'\b([A-Z]{1})([A-Z]{1})([,\s]{1})')
    SEPARATE_AUTHOR = re.compile(r'^((.*?)([\d\":]+))(.*)$')
    TO_REMOVE_HYPEN_NEAR_INITIAL = [
        re.compile(r'([A-Z]\.)(\-)([A-Z]\.)'),
        re.compile(r'([A-Z])(\-)(\.)'),
        re.compile(r'([A-Z])(\-)([A-Z])\b')
    ]

    # NOTE(review): the range A-z also covers the ASCII characters between
    # 'Z' and 'a' ([ \ ] ^ _ `) -- confirm that this is intended.
    URL_EXTRACTOR = re.compile(r'((url\s*)?(http)s?://[A-z0-9\-\.\/\={}?&%]+)',
                               re.IGNORECASE)
    # month names, abbreviated or in full, with upper or lower case first letter
    MONTH_NAME_EXTRACTOR = re.compile(
        r'\b([Jj]an(?:uary)?|[Ff]eb(?:ruary)?|[Mm]ar(?:ch)?|[Aa]pr(?:il)?|[Mm]ay|[Jj]un(?:e)?|[Jj]ul(?:y)?|[Aa]ug(?:ust)?|[Ss]ep(?:tember)?|[Oo]ct(?:ober)?|([Nn]ov|[Dd]ec)(?:ember)?)\b'
    )

    URL_TO_DOI = re.compile(
        r'((url\s*)?(https\s*:\s*//\s*|http\s*:\s*//\s*)((.*?)doi(.*?)org/))|(DOI:https\s*://\s*)',
        flags=re.IGNORECASE)
    # NOTE(review): the dots in "arxiv.org" and "ascl.net" below are not
    # escaped, so they match any character -- confirm that this is acceptable.
    URL_TO_ARXIV = re.compile(
        r'((url\s*)?(https://|http://)(arxiv.org/(abs|pdf)/))',
        flags=re.IGNORECASE)
    URL_TO_ASCL = re.compile(r'((url\s*)?(https://|http://)(ascl.net/))',
                             flags=re.IGNORECASE)
    ADD_COLON_TO_IDENTIFIER = re.compile(r'(\s+(DOI|arXiv|ascl))(:?\s*)',
                                         flags=re.IGNORECASE)

    # four digits at the start: 1 or 2, then 0, 8 or 9, then two more digits
    IS_START_WITH_YEAR = re.compile(r'(^[12][089]\d\d)')
    # everything from the first ASCII letter to the end of the string
    START_WITH_AUTHOR = re.compile(r'([A-Za-z].*$)')

    WORD_BREAKER_REMOVE = [re.compile(r'([A-Za-z]+)([\-]+\s+)([A-Za-z]+)')]

    TOKENS_NOT_IDENTIFIED = re.compile(r'\w+\b(?!\|)')

    # REFERENCE_TOKENIZER is handed to OriginatorToken in __init__
    REFERENCE_TOKENIZER = re.compile(r'([\s.,():;\[\]\'\"#\/])')
    TAGGED_MULTI_WORD_TOKENIZER = re.compile(r'([\s.,])')

    # is all capital
    IS_ALL_CAPITAL = re.compile(r'^([A-Z]+)$')
    # is only the first character capital
    IS_FIRST_CAPITAL = re.compile(r'^([A-Z][a-z]+)$')
    # is alphabet only, consider hyphenated words also
    IS_ALPHABET = re.compile(r'^(?=.*[a-zA-Z])([a-zA-Z\-]+)$')
    # is numeric only, consider the page range with - being also numeric
    # also include arxiv id with a dot to be numeric
    # note that this differs from function is_numeric in the
    # sense that this recognizes numeric even if it was not identified/tagged
    IS_NUMERIC = re.compile(r'^(?=.*[0-9])([0-9\-\.]+)$')
    # is alphanumeric, must have at least one digit and one alphabet character
    IS_ALPHANUMERIC = re.compile(r'^(?=.*[0-9])(?=.*[a-zA-Z])([a-zA-Z0-9]+)$')

    ADD_SPACE_BETWEEN_TWO_IDENTIFIED_TOKENS = re.compile(
        r'(\|[a-z\_]+\|)(\|[a-z\_]+\|)')
    # Template used by search(); %s is replaced by the caller's pattern.
    # NOTE(review): (?:\b|\B) matches at every position, so the wrapper does
    # not actually restrict matches to whole words.
    REGEX_PATTERN_WHOLE_WORD_ONLY = r'(?:\b|\B)%s(?:\b|\B)'

    # Class-level defaults; create_crf() and load() replace nltk_tagger, crf
    # and label_code on the instance, create_crf() also X, y and folds.
    nltk_tagger = None
    crf = None
    X = y = label_code = folds = None

    def __init__(self):
        """
        Create the token helpers and remember where the pickled model lives.

        The pickle path is ``serialized_files/crfModelText.pkl`` next to this
        module.
        """
        self.originator_token = OriginatorToken(self.REFERENCE_TOKENIZER)
        self.numeric_token = NumericToken()
        self.pub_token = PubToken()
        self.unknown_tokens = list()
        module_dir = os.path.dirname(__file__)
        self.filename = module_dir + '/serialized_files/crfModelText.pkl'

    def create_crf(self):
        """
        Build the CRF, load the training data, then train and evaluate it.

        One attempt is made when the static fold assignment is used and up to
        ten when the folds were generated; the loop stops as soon as the test
        score is above 0.90.  An exception during training or scoring is
        logged and uses up one attempt.

        :return: True if the final score is greater than zero
        """
        # to load nltk tagger, a time consuming, one time needed operation
        self.nltk_tagger = nltk.tag._get_tagger()
        self.crf = FrankWolfeSSVM(model=ChainCRF(), C=1.0, max_iter=50)
        loaded = self.load_training_data()
        self.X, self.y, self.label_code, self.folds, generate_fold = loaded

        score = 0
        # only need to iterate through if fold was generated
        attempts_left = 10 if generate_fold else 1
        while attempts_left > 0 and score <= 0.90:
            try:
                train_X, train_y = self.get_train_data()
                self.train(train_X, train_y)

                test_X, test_y = self.get_test_data()
                score = self.evaluate(test_X, test_y)
            except Exception as e:
                current_app.logger.error('Exception: %s' % (str(e)))
                current_app.logger.error(traceback.format_exc())
            attempts_left -= 1
        return score > 0

    def format_training_data(self, the_data):
        """

        :param the_data:
        :return:
        """
        # get label, word in the original presentation
        labels = [[elem[0] for elem in ref] for ref in the_data]
        words = [[elem[1] for elem in ref] for ref in the_data]

        # count how many unique labels there are, return a dict to convert from words to numeric words
        label_code = self.encoder(labels)

        numeric_labels = []
        features = []
        for label, word in zip(labels, words):
            # replace of numeric words for the original presentation of label
            numeric_label = []
            for l in label:
                numeric_label.append(label_code[l])
            numeric_labels.append(np.array(numeric_label))

            # get the numeric features for the original presentation of word and insert at index of label
            feature = []
            for idx in range(len(word)):
                feature.append(self.get_data_features(word, idx, label))
            features.append(np.array(feature))
        return features, numeric_labels, label_code

    def get_num_states(self):
        """
        Count the distinct label values in the training part of the data.

        :return: number of unique labels among the samples whose fold is not 0
        """
        training_labels = self.y[self.folds != 0]
        flattened = np.hstack(list(training_labels))
        num_states = len(np.unique(flattened))
        current_app.logger.debug("number of states = %s" % num_states)
        return num_states

    def get_folds_array(self, filename):
        """
        read the distribution of train and test indices from file

        Only the first line that starts with ``STATIC_FOLD`` is considered;
        the text to the right of `` = `` on that line is evaluated.

        :param filename: path of the fold definition file
        :return: the evaluated value, or None if that line cannot be evaluated
                 or the file has no such line
        """
        with open(filename, 'r') as f:
            # iterate lazily instead of reading the whole file into a list
            for line in f:
                if not line.startswith("STATIC_FOLD"):
                    continue
                try:
                    # NOTE(review): eval() executes whatever the file
                    # contains; this is only safe while the fold file is a
                    # trusted project asset.  Consider ast.literal_eval if
                    # the value is always a plain literal.
                    return eval(line.split(" = ")[1])
                except Exception:
                    # malformed definition; unlike a bare ``except`` this
                    # lets KeyboardInterrupt and SystemExit propagate
                    return None
        return None

    def get_train_data(self):
        """

        :return:
        """
        return self.X[self.folds != 0], self.y[self.folds != 0]

    def get_test_data(self):
        """

        :return:
        """
        return self.X[self.folds == 0], self.y[self.folds == 0]

    def train(self, X_train, y_train):
        """
        :param X_train: is a numpy array of samples where each sample
                        has the shape (n_labels, n_features)
        :param y_train: is numpy array of labels
        :return:
        """
        self.crf.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        """

        :param X_test:
        :param y_test:
        :return:
        """
        return self.crf.score(X_test, y_test)

    def decoder(self, numeric_label):
        """

        :param numeric_label:
        :return:
        """
        labels = []
        for nl in numeric_label:
            key = next(key for key, value in self.label_code.items()
                       if value == nl)
            labels.append(key)
        return labels

    def encoder(self, labels):
        """
        Assign consecutive integers, starting at 0, to the distinct labels in
        order of first appearance.

        :param labels: list of label sequences
        :return: dict of labels as key and numeric value is its value
        """
        label_code = {}
        for sequence in labels:
            for name in sequence:
                # a new label gets the next free number, which always equals
                # the number of labels registered so far
                if name not in label_code:
                    label_code[name] = len(label_code)
        return label_code

    def load_training_data(self):
        """
        load training/test data

        Reads the tagged arXiv references from ``training_files/arxiv.raw``
        next to this module, converts them with format_training_data() and
        attaches a fold number to every reference.

        :return: tuple (X, y, label_code, folds, generate_fold)
        """
        training_files_path = os.path.dirname(__file__) + '/training_files/'
        source_files = [
            training_files_path + 'arxiv.raw',
        ]
        references = []
        for source_file in source_files:
            references = references + get_arxiv_tagged_data(source_file)

        X, y, label_code = self.format_training_data(references)

        # for now use static division. see comments in foldModelText.dat
        generate_fold = False
        folds = (list(np.random.choice(range(0, 9), len(y)))
                 if generate_fold else self.get_folds_array(
                     training_files_path + 'foldModelText.dat'))

        samples = np.array(X, dtype=object)
        targets = np.array(y, dtype=object)
        return samples, targets, label_code, np.array(folds), generate_fold

    def save(self):
        """
        Pickle the crf, the label codes and the nltk tagger, in that order,
        into self.filename.

        :return: True on success, False if an exception was raised (it is
                 logged)
        """
        try:
            with open(self.filename, "wb") as handle:
                pickler = pickle.Pickler(handle, -1)
                for payload in (self.crf, self.label_code, self.nltk_tagger):
                    pickler.dump(payload)
            current_app.logger.info("saved crf in %s." % self.filename)
            return True
        except Exception as e:
            current_app.logger.error('Exception: %s' % (str(e)))
            current_app.logger.error(traceback.format_exc())
            return False

    def load(self):
        """
        Restore the crf, the label codes and the nltk tagger, in that order,
        from the pickle file self.filename.

        :return: the loaded crf, or None if an exception was raised (it is
                 logged)
        """
        try:
            with open(self.filename, "rb") as handle:
                unpickler = pickle.Unpickler(handle)
                # same order as the objects were dumped by save()
                self.crf = unpickler.load()
                self.label_code = unpickler.load()
                self.nltk_tagger = unpickler.load()
            current_app.logger.info("loaded crf from %s." % self.filename)
            return self.crf
        except Exception as e:
            current_app.logger.error('Exception: %s' % (str(e)))
            current_app.logger.error(traceback.format_exc())
        return None

    def search(self, pattern, text):
        """
        search whole word only in the text
        :param pattern:
        :param text:
        :return: Ture/False depending if found
        """
        try:
            return re.search(self.REGEX_PATTERN_WHOLE_WORD_ONLY % pattern,
                             text) is not None
        except:
            return False

    def reference(self, refstr, words, labels):
        """
        Build the dict describing a parsed reference from its tagged tokens.

        Keys are only added for tags that occur in `labels`; volume, page and
        title are additionally skipped when the collected value is empty.
        `authors` and `refstr` are always present.

        :param refstr: the reference string, stored unchanged under 'refstr'
        :param words: list of tokens
        :param labels: list of tags, parallel to `words`
        :return: dict of the identified fields
        """

        def first_tagged(tag):
            # word sitting at the first position carrying `tag`
            return words[labels.index(tag)]

        parsed = {
            'authors':
            self.originator_token.collect_tagged_tokens(words, labels)
        }

        if any(tag in labels for tag in ('DOI', 'ARXIV', 'ASCL')):
            parsed.update(
                self.numeric_token.collect_id_tagged_tokens(words, labels))

        if 'YEAR' in labels:
            parsed['year'] = first_tagged('YEAR')

        # numerals that may span several tokens; kept only when non-empty
        for tag, key in (('VOLUME', 'volume'), ('PAGE', 'page')):
            if tag not in labels:
                continue
            numeral = self.numeric_token.collect_tagged_numerals_token(
                words, labels, tag)
            if numeral:
                parsed[key] = numeral

        # single-token fields
        for tag, key in (('ISSUE', 'issue'), ('ISSN', 'ISSN')):
            if tag in labels:
                parsed[key] = first_tagged(tag)

        if 'JOURNAL' in labels:
            parsed['journal'] = self.pub_token.collect_tagged_journal_tokens(
                words, labels)

        if 'TITLE' in labels:
            collected_title = self.pub_token.collect_tagged_title_tokens(
                words, labels)
            if collected_title:
                parsed['title'] = collected_title

        parsed['refstr'] = refstr
        return parsed

    def punctuation_features(self, ref_word, ref_label):
        """
        One-hot encode the punctuation class reported by which_punctuation.

        The vector has 11 cells; cell i is 1 when which_punctuation returned
        i, every other cell is 0 (all cells are 0 when the returned value is
        none of 0..10).

        :param ref_word:
        :param ref_label:
        :return: list of eleven 0/1 flags
        """
        # cell meaning:
        #   0 punctuation, 1 brackets, 2 colon, 3 comma, 4 dot,
        #   5 parenthesis, 6 quotes (both single and double), 7 num signs,
        #   8 hyphen, 9 forward slash, 10 semicolon
        kind = which_punctuation(ref_word, ref_label)
        return [1 if kind == cell else 0 for cell in range(11)]

    def is_token_unknown(self, ref_word, ref_label):
        """
        Tell whether a token is one that could not be identified.

        When a (non-empty) label is given it decides alone: only 'NA' counts
        as unknown. Without a label the word is looked up in
        `self.unknown_tokens`.

        :param ref_word: the token, may be None
        :param ref_label: the token's label, or a falsy value if not available
        :return: 1 if the token is unknown, 0 otherwise
        """
        if ref_label:
            return int(ref_label == 'NA')

        if ref_word is None:
            return 0
        return 1 if ref_word in self.unknown_tokens else 0

    def length_features(self, ref_word):
        """
        Flag whether the token is exactly one character long, or longer.

        :param ref_word: the token
        :return: [1, 0] for length 1, [0, 1] for length > 1, [0, 0] if empty
        """
        size = len(ref_word)
        return [int(size == 1), int(size > 1)]

    def get_data_features(self, ref_word_list, index, ref_label_list=None):
        """
        Build the feature vector of one token by concatenating, in a fixed
        order, the feature lists of all the individual extractors.

        :param ref_word_list: has the form [e1,e2,e3,..]
        :param index: the position of the word in the set, assume it is valid
        :param ref_label_list: labels for ref_word_list available during training only
        :return: flat list of features for the token at `index`
        """
        ref_word = ref_word_list[index]
        ref_label = None
        if ref_label_list:
            ref_label = ref_label_list[index]

        # extractors that look at the token in its context vs. the token alone
        in_context = (ref_word_list, ref_label_list, index)
        alone = (ref_word, ref_label)

        features = self.length_features(ref_word)
        features = features + self.originator_token.author_features(*in_context)
        features = features + self.pub_token.title_features(*in_context)
        features = features + self.pub_token.journal_features(*in_context)
        features = features + self.numeric_token.numeric_features(*alone)
        features = features + self.numeric_token.identifying_word_features(*alone)
        features = features + self.punctuation_features(*alone)
        features = features + self.pub_token.publisher_features(*alone)
        features = features + self.originator_token.editor_features(*in_context)

        shape_patterns = (
            self.IS_ALL_CAPITAL,    # is element all capital
            self.IS_FIRST_CAPITAL,  # is first character capital
            self.IS_ALPHABET,       # is alphabet only, consider hyphenated words also
            self.IS_NUMERIC,        # is numeric only, consider the page range with - being also numeric
            self.IS_ALPHANUMERIC,   # is alphanumeric, must at least one digit and one alphabet character
        )
        lexical = [
            int(pattern.match(ref_word) is not None)
            for pattern in shape_patterns
        ]
        # is it one of the words unable to guess
        lexical.append(self.is_token_unknown(ref_word, ref_label))
        # is it one of tagged stopwords
        lexical.append(self.pub_token.is_token_stopword(ref_word, ref_label))

        return features + lexical

    def segment(self, reference_str):
        """
        Segment the reference string into a list of tokens.

        Each kind of token that is identified is taken out of reference_str
        (steps 1-5); the identified tokens are then inserted back in the
        reverse order, around the tokenization, so that the token list is
        ready for feature extraction.

        Side effects: the numeric, originator and publication token helpers
        are cleared and refilled, and `self.unknown_tokens` is replaced.

        :param reference_str: the reference string; a list is not processed
        :return: list of tokens, or an empty list if given a list
        """
        if isinstance(reference_str, list):
            return []

        # start fresh
        self.numeric_token.clear()
        self.originator_token.clear()
        self.pub_token.clear()
        na_url = None
        na_month = None

        # step 1: remove any non essential tokens (ie, urls, months, etc)
        # every url found is swapped for a numbered placeholder |na_url_<i>|
        matches = self.URL_EXTRACTOR.findall(reference_str)
        if len(matches) > 0:
            na_url = []
            for i, url in enumerate(matches, start=1):
                # url[0]: first element of each findall result
                na_url.append(url[0])
                reference_str = reference_str.replace(url[0],
                                                      '|na_url_%d|' % i)
        # only the first month match is extracted
        extractor = self.MONTH_NAME_EXTRACTOR.search(reference_str)
        if extractor:
            na_month = extractor.group().strip()
            # NOTE(review): str.replace substitutes every occurrence of the
            # month text, while the restore at the end only fills the first
            # placeholder -- confirm a month cannot appear twice
            reference_str = reference_str.replace(na_month, '|na_month|')

        # step 2: identify doi/arxiv/ascl
        reference_str = self.numeric_token.segment_ids(reference_str)

        # step 3: identify list of authors and editors
        reference_str = self.originator_token.identify(reference_str)

        # step 4: identify title and journal substrings
        # but first remove any numerical identifying words
        reference_str = self.pub_token.identify(
            self.numeric_token.remove_identifying_words(reference_str).strip(),
            self.nltk_tagger, self.originator_token.indices(),
            self.originator_token.have_editor())

        # step 5: identify year, volume, page, issue
        reference_str = self.numeric_token.segment_numerals(reference_str)

        # collect all tokens that has not been identified
        # (the urls, joined into one entry, and the month count as unknown too)
        self.unknown_tokens = self.TOKENS_NOT_IDENTIFIED.findall(reference_str)
        if na_url:
            self.unknown_tokens.append(' '.join(na_url))
        if na_month:
            self.unknown_tokens.append(na_month)

        # now put the identified tokens back into the string, and before tokenizing and sending to crf

        # step 5 reverse
        reference_str = self.numeric_token.assemble_stage1(reference_str)

        # step 4 reverse
        reference_str = self.pub_token.assemble(reference_str)

        # step 3 reverse
        reference_str = self.originator_token.assemble(reference_str)

        # tokenize: put a space between two adjacent identified tokens, split,
        # strip each piece and drop the empty ones
        ref_words = list(
            filter(None, [
                w.strip() for w in self.REFERENCE_TOKENIZER.split(
                    self.ADD_SPACE_BETWEEN_TWO_IDENTIFIED_TOKENS.sub(
                        r'\1 \2', reference_str))
            ]))

        # step 2 reverse
        ref_words = self.numeric_token.assemble_stage2(ref_words)

        # step 1 reverse
        # NOTE(review): list.index raises ValueError if a placeholder did not
        # come out of the tokenizer as a token of its own -- confirm that
        # cannot happen
        if na_month:
            ref_words[ref_words.index('|na_month|')] = na_month
        if na_url:
            for i, url in enumerate(na_url, start=1):
                ref_words[ref_words.index('|na_url_%d|' % i)] = url

        return ref_words

    def dots_after_initials(self, reference_str):
        """
        Make sure author initials are followed by a dot and separated.

        Only the author part of the reference (first group of
        `self.SEPARATE_AUTHOR`) is rewritten; the rest is left untouched.
        This is best effort: if the author part cannot be located or
        rewritten, the reference is returned unchanged.

        :param reference_str: the reference string
        :return: the reference string with the author part normalized
        """
        try:
            match = self.SEPARATE_AUTHOR.search(reference_str)
            author_part = match.group(1) if match is not None else None
            if not author_part:
                # no author part identified, nothing to rewrite
                return reference_str

            # make sure there is a dot after single character, repeat to capture middle name
            dotted = self.TO_ADD_DOT_AFTER_INITIALS.sub(r"\1.\2\3", author_part)
            dotted = self.TO_ADD_DOT_AFTER_INITIALS.sub(r"\1.\2\3", dotted)
            # separate first and middle initials if there are any attached, add dot after each
            separated = self.TO_ADD_SEPARATE_INITIALS.sub(r"\1. \2. \3", dotted)
            return reference_str.replace(author_part, separated)
        except Exception:
            # deliberately tolerant (e.g. input that is not a string), but
            # KeyboardInterrupt/SystemExit are no longer swallowed
            return reference_str

    def pre_processing(self, reference_str):
        """
        Normalize a raw reference string before it is segmented.

        Strips leading numbering, removes quotes around et al., cleans up
        hyphens and dots around initials, adds the colon after identifiers,
        turns DOI/arXiv/ascl urls into their identifier form, and removes
        word breakers.

        :param reference_str: the raw reference string
        :return: the normalized reference string
        """
        # remove any numbering that appears before the reference to start with authors
        # exception is the year
        if self.IS_START_WITH_YEAR.search(reference_str) is None:
            author_start = self.START_WITH_AUTHOR.search(reference_str)
            # when no author-like start is found keep the string as is,
            # instead of failing on the missing match
            if author_start is not None:
                reference_str = author_start.group()

        # also if for some reason et al. has been put in double quoted! remove them
        reference_str = self.QUOTES_AROUND_ETAL_REMOVE.sub(
            r"\1\3\5", reference_str)
        # if there is a hyphen either between initials, or after initials and before dot, remove it
        # (each pattern is paired, in order, with its own replacement)
        for rhni, replace in zip(self.TO_REMOVE_HYPEN_NEAR_INITIAL,
                                 [r"\1 \3", r"\1\3", r"\1. \3"]):
            reference_str = rhni.sub(replace, reference_str)
        # add dots after initials, separate first and middle if needed
        reference_str = self.dots_after_initials(reference_str)
        # if no colon after the identifier, add it in
        reference_str = self.ADD_COLON_TO_IDENTIFIER.sub(r"\1:", reference_str)
        # if there is a url for DOI turned it to recognizable DOI
        reference_str = self.URL_TO_DOI.sub(r"DOI:", reference_str)
        # if there is a url for arxiv turned it to recognizable arxiv
        reference_str = self.URL_TO_ARXIV.sub(r"arXiv:", reference_str)
        # if there is a url for ascl turned it to recognizable ascl
        reference_str = self.URL_TO_ASCL.sub(r"ascl:", reference_str)

        # drop the middle group of every word-breaker match
        for rwb in self.WORD_BREAKER_REMOVE:
            reference_str = rwb.sub(r'\1\3', reference_str)

        return reference_str

    def classify(self, reference_str):
        """
        Run the classifier on input data

        The string is pre-processed and segmented into tokens, a feature
        vector is built for every token, and the crf prediction for that one
        sequence is decoded into labels.

        :param reference_str:
        :return: list of words and the corresponding list of labels
        """
        cleaned = self.pre_processing(reference_str)
        ref_words = self.segment(cleaned)

        # no labels exist at prediction time, hence the empty label list
        features = [
            self.get_data_features(ref_words, position, [])
            for position in range(len(ref_words))
        ]

        # predict works on a batch of sequences; wrap the single one and unwrap
        batch = [np.array(features)]
        ref_labels = self.decoder(self.crf.predict(batch)[0])
        return ref_words, ref_labels

    def parse(self, reference_str):
        """
        Classify a reference string and return its identified fields.

        :param reference_str: the reference string
        :return: dict built by `reference`, or None when the string matches
            `IGNORE_IF`
        """
        if self.IGNORE_IF.search(reference_str) is not None:
            return None
        tokens, tags = self.classify(reference_str)
        return self.reference(reference_str, tokens, tags)

    def tokenize(self, reference_str):
        """
        used for unittest only

        Same pipeline as `parse`, but only the tokens are returned and the
        labels are discarded.

        :param reference_str: the reference string
        :return: list of words, or None when the string matches `IGNORE_IF`
        """
        if self.IGNORE_IF.search(reference_str) is not None:
            return None
        return self.classify(reference_str)[0]
Beispiel #22
0
def run_crf(w2v, words_before, words_after, shallow_parse):
    """
    Train and score an undirected chain CRF over 5 shuffled folds of the
    abstracts, printing the fold score and, per test abstract, the words
    that were predicted with label 1.

    :param w2v: passed through to abstract2features
    :param words_before: passed to abstract2features for the training features
    :param words_after: passed to abstract2features for the test features
    :param shallow_parse: passed through to abstract2features
    :return: None, everything is printed
    """
    pmids_dict, pmids, abstracts, lbls, vectorizer, groups_map, one_hot, dicts = \
        parse_summerscales.get_tokens_and_lbls(
                make_pmids_dict=True, sen=True)

    # Create model
    model = ChainCRF(directed=False)
    ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=30)

    # list() so the keys can be indexed by fold position on Python 3 as well
    all_pmids = list(pmids_dict.keys())
    n = len(all_pmids)
    n_folds = 5
    kf = KFold(n, random_state=1337, shuffle=True, n_folds=n_folds)

    for fold_idx, (train, test) in enumerate(kf):
        print("on fold %s" % fold_idx)
        train_pmids = [all_pmids[pmid_idx] for pmid_idx in train]
        test_pmids = [all_pmids[pmid_idx] for pmid_idx in test]
        print('loading data...')
        # NOTE(review): train_pmids is never used and both calls receive the
        # whole pmids_dict; the two calls differ only in words_before vs
        # words_after -- confirm against abstract2features whether the folds
        # were meant to restrict the data
        train_x, train_y = abstract2features(pmids_dict, words_before, w2v,
                                             shallow_parse)
        test_x, test_y = abstract2features(pmids_dict, words_after, w2v,
                                           shallow_parse)

        print('loaded data...')
        print('training...')
        ssvm.fit(train_x, train_y)

        print(ssvm.score(test_x, test_y))

        for pmid, x, _ in zip(test_pmids, test_x, test_y):
            abstract_words, _, _ = pmids_dict[pmid]

            print(pmid)

            # predict() takes in a list returns another list
            prediction = ssvm.predict([x]).pop(0)

            # a word is selected by the POSITION at which label 1 was
            # predicted (indexing by the label itself always gave word 1)
            # assumes prediction is aligned with abstract_words -- TODO confirm
            predicted_words = [
                abstract_words[position]
                for position, label in enumerate(prediction) if label == 1
            ]
            for word in predicted_words:
                print("word: {}".format(word))

            if predicted_words:
                output = 'predicted: {}'.format(' '.join(predicted_words))
            else:
                output = 'Predicted nothing!'
            print(output)
Beispiel #23
0
# Final evaluation of a chain CRF on the test split (Python 2 script).
# The code that loads x_train / y_train / x_test / y_test is not part of
# this excerpt.
y_test = preprocess_label(y_test)

# Best C per dataset, presumably found with the grid search that is kept
# (disabled) in the string literal below:
### CS : best c =0.01
### Phy: best c= 0.005
### stat: best c = 0.005
'''
C= [0.005,0.01,0.02,0.05,0.1,0.2]
score = {}

for i in C:
	model = ChainCRF()
	ssvm = FrankWolfeSSVM(model=model, C=i, max_iter=100)
	ssvm.fit(x_train, y_train) 
	score[i] = ssvm.score(x_dev, y_dev)

print score
'''
# Train with the selected C, then score and predict on the test split.
model = ChainCRF()
ssvm = FrankWolfeSSVM(model=model, C=0.005, max_iter=100)
ssvm.fit(x_train, y_train)
# score is not used in the visible lines
score = ssvm.score(x_test, y_test)
y_pred = ssvm.predict(x_test)

# get_one_list is applied to both label collections before the
# micro-averaged F1 is computed
print 'Micro-averaged F1 score:', f1_score(get_one_list(y_test),
                                           get_one_list(y_pred),
                                           average='micro')

# restore_label is applied to truth and prediction before the error
# analysis, which is given the path './chaincrf_sequential_error_analysis'
experiment_util.sequential_error_analysis(
    restore_label(y_test), restore_label(y_pred),
    './chaincrf_sequential_error_analysis')
Beispiel #24
0
<<<<<<< HEAD
            test_datas, test_labels, node_ids = self.get_datas(test_ids, labels, mentions, retweets, bags)
            if i == 0:
                x_test_ori, y_test_ori = test_datas, test_labels
=======
>>>>>>> 93309e3207d37152eefafa6b563c72777a863935
            print(len(train_datas))
            print(len(test_datas))
            X_train, y_train = train_datas, train_labels

            model = GraphCRF(inference_method="max-product")
            ssvm = FrankWolfeSSVM(model=model, C=0.1, max_iter=10)
            ssvm.fit(X_train, y_train)
            y_preds = ssvm.predict(test_datas)
<<<<<<< HEAD
            result = ssvm.score(x_test_ori, y_test_ori)
            print('iter {} result = {}'.format(i, result))
            count = 0
            for clique_idx, clique in enumerate(y_preds):
                for node_idx, node in enumerate(clique):
                    node_id = node_ids[clique_idx][node_idx]
                    if node == central_propagation_df.iloc[node_id].values:
                        clabels[int(node_id)] = node
                        if not int(node_id) in c_idxs:
                            c_idxs = np.append(c_idxs, int(node_id))
                            count += 1
            print('iter {} update {} new labels'.format(i, count))
=======
            # result = ssvm.score(test_datas, test_labels)
            # print('iter {} result = {}'.format(i, result))
            count = 0
Beispiel #25
0
 
  
  X = X[:100]
  y = y[:100]
  
  
  #Add edges
  for i in range(X.shape[0]):
      X[i] = [X[i], np.vstack([(0,1),(2,2)])]
      
  model = GraphCRF(directed=True, inference_method="max-product")
  
  X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(X,y, test_size =0.5, random_state=0)
  ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=10)
  ssvm.fit(X_train,y_train)
  print ssvm.score(X_test, y_test)
  print ssvm.predict(X_test)
  print y_test
  
  '''
  for i in range(X.shape[0]):
      
      X_train, X_test = X[] 
      X_test = X[i]
      y_test = y[i]
      X_train = np.delete(X,i)
      y_train = np.delete(y,i)
      
  
      ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=10)
      ssvm.fit(X_train,y_train)
Beispiel #26
0
# Train a GraphCRF on chain-shaped graphs (Python 2 script). The code that
# defines y, folds and features_train is not part of this excerpt.
# Samples with folds == 1 are the training set, the rest the test set.
y_train, y_test = y[folds == 1], y[folds != 1]

"""
features_0 = features_train[0]
n_nodes = features_0.shape[0]
edges_0 = np.vstack([np.arange(n_nodes - 1), np.arange(1, n_nodes)])
x = (features_0, edges_0)
"""

f_t = features_train
# Each sample becomes (node features, edges); the edges link every node to
# its successor.
# NOTE(review): the vstack yields an array of shape (2, n_edges); pystruct
# documents edges as (n_edges, 2) -- confirm whether a transpose is needed
X_train = [(features_i, np.vstack([np.arange(features_i.shape[0] - 1), np.arange(1, features_i.shape[0])])) for features_i in f_t]



# debug output: types and shapes of the first sample
print type(X_train)
print type(X_train[0][1])
print X_train[0][1].shape

print type(y_train)
print type(y_train[0])
print y_train[0]
print y_train[0].shape

from pystruct.models import GraphCRF
from pystruct.learners import FrankWolfeSSVM
model = GraphCRF(directed=True, inference_method="max-product")
ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=10)
ssvm.fit(X_train, y_train)
print "OM SRI SAIRAM"
# NOTE(review): score() is given y_train as the inputs and y_test as the
# labels; presumably the test features (built like X_train) and y_test were
# intended -- the test features are not defined in the visible lines
print ("Accuracy score with Graph CRF : %f" % ssvm.score(y_train,y_test))
Beispiel #27
0
    gssvm.fit(X_train, y_train)
    train_score = gssvm.score(X_train, y_train)
    test_score = gssvm.score(X_test, y_test)
    print("Train / Test score with gchain CRF: %f %f" %
          (train_score, test_score))
else:
    # Train linear chain CRF
    model = ChainCRF()
    # pdb.set_trace()
    ssvm = FrankWolfeSSVM(model=model,
                          C=C,
                          check_dual_every=10,
                          max_iter=100,
                          verbose=True)
    ssvm.fit(X_train, y_train)
    train_score = ssvm.score(X_train, y_train)
    test_score = ssvm.score(X_test, y_test)
    print("Train / Test score with chain CRF: %f %f" %
          (train_score, test_score))

# plot some word sequenced
n_words = 4
rnd = np.random.RandomState(1)
selected = rnd.randint(len(y_test), size=n_words)
max_word_len = max([len(y_) for y_ in y_test[selected]])
fig, axes = plt.subplots(n_words, max_word_len, figsize=(10, 10))
fig.subplots_adjust(wspace=0)
for ind, axes_row in zip(selected, axes):
    y_pred_svm = svm.predict(X_test[ind])
    y_pred_chain = ssvm.predict([X_test[ind]])[0]
    for i, (a, image, y_true, y_svm, y_chain) in enumerate(
Beispiel #28
0
def run_crf(w2v, words_before, words_after, shallow_parse):
    """
    Train and score an undirected chain CRF over 5 shuffled folds of the
    abstracts, printing the fold score and, per test abstract, the words
    that were predicted with label 1.

    :param w2v: passed through to abstract2features
    :param words_before: passed to abstract2features for the training features
    :param words_after: passed to abstract2features for the test features
    :param shallow_parse: passed through to abstract2features
    :return: None, everything is printed
    """
    pmids_dict, pmids, abstracts, lbls, vectorizer, groups_map, one_hot, dicts = \
        parse_summerscales.get_tokens_and_lbls(
                make_pmids_dict=True, sen=True)

    # Create model
    model = ChainCRF(directed=False)
    ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=30)

    # list() so the keys can be indexed by fold position on Python 3 as well
    all_pmids = list(pmids_dict.keys())
    n = len(all_pmids)
    n_folds = 5
    kf = KFold(n, random_state=1337, shuffle=True, n_folds=n_folds)

    for fold_idx, (train, test) in enumerate(kf):
        print("on fold %s" % fold_idx)
        train_pmids = [all_pmids[pmid_idx] for pmid_idx in train]
        test_pmids = [all_pmids[pmid_idx] for pmid_idx in test]
        print('loading data...')
        # NOTE(review): train_pmids is never used and both calls receive the
        # whole pmids_dict; the two calls differ only in words_before vs
        # words_after -- confirm against abstract2features whether the folds
        # were meant to restrict the data
        train_x, train_y = abstract2features(pmids_dict, words_before, w2v, shallow_parse)
        test_x, test_y = abstract2features(pmids_dict, words_after, w2v, shallow_parse)

        print('loaded data...')
        print('training...')
        ssvm.fit(train_x, train_y)

        print(ssvm.score(test_x, test_y))

        for pmid, x, _ in zip(test_pmids, test_x, test_y):
            abstract_words, _, _ = pmids_dict[pmid]

            print(pmid)

            # predict() takes in a list returns another list
            prediction = ssvm.predict([x]).pop(0)

            # a word is selected by the POSITION at which label 1 was
            # predicted (indexing by the label itself always gave word 1)
            # assumes prediction is aligned with abstract_words -- TODO confirm
            predicted_words = [
                abstract_words[position]
                for position, label in enumerate(prediction) if label == 1
            ]
            for word in predicted_words:
                print("word: {}".format(word))

            if predicted_words:
                output = 'predicted: {}'.format(' '.join(predicted_words))
            else:
                output = 'Predicted nothing!'
            print(output)
Beispiel #29
0
# Train CRF
# Every letter is fed as its own sequence of length 1: the stacked letters
# get a sequence axis of size 1 and every label becomes a 1-element row.
# The sizes are taken from the data instead of being hard-coded.
model = ChainCRF(directed=True)
ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=11)
ssvm.fit(np.vstack(X_train)[:, np.newaxis, :],
         np.hstack(y_train)[:, np.newaxis])


# Train linear chain CRF
# Same model and learner settings, but fitted on the whole word sequences.
chain_model = ChainCRF(directed=True)
chain_ssvm = FrankWolfeSSVM(model=chain_model, C=.1, max_iter=11, verbose=0)
chain_ssvm.fit(X_train, y_train)


# svm is fitted outside of this excerpt; it is scored on the flattened letters
print("Test score with linear SVM: %f" % svm.score(np.vstack(X_test),
                                                   np.hstack(y_test)))
print("Test score with CRF: %f" % ssvm.score(X_test, y_test))

print("Test score with Linear Chain CRF: %f" % chain_ssvm.score(X_test, y_test))

# plot some word sequenced
# n_words test words are drawn with a fixed seed; one subplot row per word,
# one column per letter of the longest drawn word
n_words = 4
rnd = np.random.RandomState(1)
selected = rnd.randint(len(y_test), size=n_words)
max_word_len = max([len(y_) for y_ in y_test[selected]])
fig, axes = plt.subplots(n_words, max_word_len, figsize=(10, 10))
fig.subplots_adjust(wspace=0)
# colour legend along the bottom of the figure
fig.text(0.2, 0.05, 'GT', color="#00AA00", size=25)
fig.text(0.4, 0.05, 'SVM', color="#5555FF", size=25)
fig.text(0.6, 0.05, 'LCCRF', color="#FF5555", size=25)
fig.text(0.8, 0.05, 'CRF', color="#FFD700", size=25)
Beispiel #30
0
# Fit the chain CRF on nn_predictions_train and report test scores.
# The code that defines chain_ssvm, ssvm, svm and the nn_predictions_*
# inputs is not part of this excerpt.
chain_ssvm.fit(nn_predictions_train, y_train)


# # Create linear regression object
# regr = LinearRegression()
# # Train the model using the training sets
# regr.fit(np.vstack(nn_predictions_train), np.hstack(y_train))

# print("Test score with linear regression: %f" % regr.score(np.vstack(nn_predictions_test),
#                                                    np.hstack(y_test)))

# NOTE: this figure is a literal in the message, it is not computed here
print("Test score with linear NN: 84.15%")

# the SVM is scored on the flattened X_test, the two CRFs on
# nn_predictions_test
print("Test score with linear SVM: %f" % svm.score(np.vstack(X_test),
                                                   np.hstack(y_test)))
print("Test score with CRF: %f" % ssvm.score(nn_predictions_test, y_test))

print("Test score with Linear Chain CRF: %f" % chain_ssvm.score(nn_predictions_test, y_test))

# # plot some word sequenced
# n_words = 4
# rnd = np.random.RandomState(1)
# selected = rnd.randint(len(y_test), size=n_words)
# max_word_len = max([len(y_) for y_ in y_test[selected]])
# fig, axes = plt.subplots(n_words, max_word_len, figsize=(10, 10))
# fig.subplots_adjust(wspace=0)
# fig.text(0.2, 0.05, 'GT', color="#00AA00", size=25)
# fig.text(0.4, 0.05, 'SVM', color="#5555FF", size=25)
# fig.text(0.6, 0.05, 'LCCRF', color="#FF5555", size=25)
# fig.text(0.8, 0.05, 'CRF', color="#FFD700", size=25)
Beispiel #31
0
# Split features/labels 70/30 in their given order and fit a chain CRF.
# The code that loads features and labels is not part of this excerpt.
# print("Shuffle results")
# features, labels = util.shuffle(features, labels)

trsize = int(0.7 * len(labels))
# NOTE(review): the training slice starts at index 1 and the test slice at
# trsize + 1, so samples 0 and trsize end up in neither set -- confirm this
# is intended
X_train = features[1:trsize]
y_train = labels[1:trsize]

X_test = features[trsize + 1:]
y_test = labels[trsize + 1:]

# X_train = X_test = features
# y_train = y_test = labels
# trsize = len(labels)

# Evaluate the chain
model = ChainCRF()
C = 0.0001
max_iter = 50
ssvm = FrankWolfeSSVM(model=model, C=C, max_iter=max_iter, verbose=True)
print(ssvm)
print(ssvm.fit(X_train, y_train))
# learned weights
print(ssvm.w)
# only the training score is reported; the test score is disabled below
trscore = ssvm.score(X_train, y_train)
# testscore = ssvm.score(X_test,y_test)
print("Training score: {0}".format(trscore))
# print("Test score: {0}".format(testscore))

# Save the result
# util.saveToSQL(featureset, C, max_iter, trsize, trscore, 2)
Beispiel #32
0
# Train a chain CRF on half words and compare it with the full-word model.
# The code that defines X_half_train, half_ssvm, ssvm and svm is not part of
# this excerpt.
# label container with the same shape as X_half_train, filled in below
y_half_train = np.ones_like(X_half_train)

# every training word is cut in two: entry 2*ind takes the first
# floor(len / 2) letters, entry 2*ind + 1 the remaining ones
for ind in range(0, X_train.shape[0]):
    # n_letters = 2 #fixed len of word
    n_letters = int(np.floor(X_train[ind].shape[0] / 2))
    X_half_train[2*ind] = X_train[ind][0:n_letters]
    X_half_train[2*ind+1] = X_train[ind][n_letters:]
    y_half_train[2*ind] = y_train[ind][0:n_letters]
    y_half_train[2*ind+1] = y_train[ind][n_letters:]
# Train the model
half_ssvm.fit(X_half_train, y_half_train)


# all three models are scored on the same test set; the SVM gets it flattened
print("Test score with linear SVM: %f" % svm.score(np.vstack(X_test),
                                                   np.hstack(y_test)))
print("Test score with FULL LCCRF: %f" % ssvm.score(X_test, y_test))

print("Test score with HALF LCCRF: %f" % half_ssvm.score(X_test, y_test))

# plot some word sequenced
# n_words test words are drawn with a fixed seed; one subplot row per word,
# one column per letter of the longest drawn word
n_words = 4
rnd = np.random.RandomState(1)
selected = rnd.randint(len(y_test), size=n_words)
max_word_len = max([len(y_) for y_ in y_test[selected]])
fig, axes = plt.subplots(n_words, max_word_len, figsize=(10, 10))
fig.subplots_adjust(wspace=0)
# colour legend along the bottom of the figure
fig.text(0.2, 0.05, 'GT', color="#00AA00", size=25)
fig.text(0.4, 0.05, 'SVM', color="#5555FF", size=25)
fig.text(0.6, 0.05, 'HALF-LCCRF', color="#FF5555", size=25)
fig.text(0.8, 0.05, 'FULL-LCCRF', color="#FFD700", size=25)
# for value in X:
#     print value.shape
#
# print X_train.shape
# print y_train.shape
#
# print type(X_train)

# for value in y_train:
#     print value
#
# for i in range(0, len(X_train)):
#     if i == 15:
#         print X_train[i], len(X_train[i])
#         for f in X_train[i]:
#             print len(f)
#             break
#         print y_train[i], len(X_train[i])
#     # break
#

# Time the training and scoring of a directed max-product chain CRF
# (Python 2 script; X_train / y_train / X_test / y_test come from code that
# is not part of this excerpt).
start = time()

model = ChainCRF(inference_method='max-product', directed=True)
ssvm = FrankWolfeSSVM(model=model, C=1.0, max_iter=10)

ssvm.fit(X_train, y_train)

# the elapsed time covers model construction, fit and the scoring itself
print 'accuracy of linear-crf %f:' % ssvm.score(X_test, y_test), ' time spend: %f' %(time()-start)
# Compare a directed and an undirected chain CRF trained with identical
# learner settings (svm is fitted outside of this excerpt).
# Train directed chain CRF
model = ChainCRF(directed=True)
ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=11)
ssvm.fit(X_train, y_train)


# Train undirected chain CRF
undirected_model = ChainCRF(directed=False)
undirected_ssvm = FrankWolfeSSVM(model=undirected_model, C=.1, max_iter=11, verbose=0)
undirected_ssvm.fit(X_train, y_train)


# the SVM is scored on the flattened letters, the CRFs on whole words
print("Test score with linear SVM: %f" % svm.score(np.vstack(X_test),
                                                   np.hstack(y_test)))
print("Test score with directed LCCRF: %f" % ssvm.score(X_test, y_test))

print("Test score with undirected LCCRF: %f" % undirected_ssvm.score(X_test, y_test))

# plot some word sequenced
# n_words test words are drawn with a fixed seed; one subplot row per word,
# one column per letter of the longest drawn word
n_words = 4
rnd = np.random.RandomState(1)
selected = rnd.randint(len(y_test), size=n_words)
max_word_len = max([len(y_) for y_ in y_test[selected]])
fig, axes = plt.subplots(n_words, max_word_len, figsize=(10, 10))
fig.subplots_adjust(wspace=0)
# colour legend along the bottom of the figure
fig.text(0.2, 0.05, 'GT', color="#00AA00", size=25)
fig.text(0.4, 0.05, 'SVM', color="#5555FF", size=25)
fig.text(0.6, 0.05, 'UD-LCCRF', color="#FF5555", size=25)
fig.text(0.8, 0.05, 'D-LCCRF', color="#FFD700", size=25)
# Compare a per-letter linear SVM with a linear chain CRF.
# X, y and folds come from code that is not part of this excerpt.
# convenient
X, y = np.array(X), np.array(y)
# samples with folds == 1 form the training set, all others the test set
X_train, X_test = X[folds == 1], X[folds != 1]
y_train, y_test = y[folds == 1], y[folds != 1]

# Train linear SVM
svm = LinearSVC(dual=False, C=0.1)
# flatten input
# (the sequences are stacked so that the SVM is fitted on all their rows at once)
svm.fit(np.vstack(X_train), np.hstack(y_train))

# Train linear chain CRF
model = ChainCRF()
ssvm = FrankWolfeSSVM(model=model, C=0.1, max_iter=11)
ssvm.fit(X_train, y_train)

print("Test score with chain CRF: %f" % ssvm.score(X_test, y_test))

print("Test score with linear SVM: %f" % svm.score(np.vstack(X_test), np.hstack(y_test)))

# plot some word sequenced
# n_words test words are drawn with a fixed seed; one subplot row per word,
# one column per letter of the longest drawn word
n_words = 4
rnd = np.random.RandomState(1)
selected = rnd.randint(len(y_test), size=n_words)
max_word_len = max([len(y_) for y_ in y_test[selected]])
fig, axes = plt.subplots(n_words, max_word_len, figsize=(10, 10))
fig.subplots_adjust(wspace=0)
for ind, axes_row in zip(selected, axes):
    y_pred_svm = svm.predict(X_test[ind])
    y_pred_chain = ssvm.predict([X_test[ind]])[0]
    for i, (a, image, y_true, y_svm, y_chain) in enumerate(
        zip(axes_row, X_test[ind], y_test[ind], y_pred_svm, y_pred_chain)
Beispiel #36
0
# Fit a pystruct BinaryClf with the Frank-Wolfe learner on a binary dataset
# loaded from csv files.
import loader
import util
from sklearn import preprocessing


# NOTE: absolute, machine-specific dataset location
directory = "/Users/thijs/dev/boilerplate/src/main/resources/dataset/"
featureset = "features10"

print("Load files")
# features come from '<featureset>.csv', labels from 'labels.csv'
features, labels = \
  loader.loadBinary(featureset+'.csv', 'labels.csv', directory)

# print("Shuffle results")
# features, labels = util.shuffle(features, labels)

print("Loaded")
# print(labels)

# features = preprocessing.scale(features)


from pystruct.models import BinaryClf
from pystruct.learners import (NSlackSSVM, OneSlackSSVM,
                               SubgradientSSVM, FrankWolfeSSVM)
clf = FrankWolfeSSVM(BinaryClf(),verbose=True)
# print(clf)
clf.fit(features,labels)
# scored on the very data the model was fitted on; the print of the score
# is disabled below
trscore = clf.score(features,labels)

# print("Training score: {0}".format(trscore))
# "Klaar" is Dutch for "done"
print("Klaar")
Beispiel #37
0
# For every C, fit the plain Frank-Wolfe learner and the generalized one on
# the same data and collect their test scores.
# model, gmodel and the *_bias data come from code that is not part of this
# excerpt.
Cs = [.5]
test_cs = []  # test score of the plain learner, one entry per C
test_g = []   # test score of the generalized learner, one entry per C

for C in Cs:
    fw_bc_svm = FrankWolfeSSVM(model, C=C, max_iter=1000, check_dual_every=20, line_search=False, verbose=True)
    # fw_batch_svm = FrankWolfeSSVM(model, C=.1, max_iter=50, batch_mode=True)
    gfw_bc_svm = GeneralizedFrankWolfeSSVM(gmodel, C=C, max_iter=1000, check_dual_every=5, line_search=False, verbose=True, X_test=X_test_bias, Y_test=y_test)

    # VANILLA

    print("CRAMMER-SINGER RUNNING")

    start = time()
    fw_bc_svm.fit(X_train_bias, y_train)
    # score each split once and reuse the value, instead of running the
    # test-set scoring a second time for the append
    fw_train_score = fw_bc_svm.score(X_train_bias, y_train)
    fw_test_score = fw_bc_svm.score(X_test_bias, y_test)
    # NOTE(review): the message says "error" but the value printed is what
    # score() returns -- confirm which one is meant
    print("error train %f and test %f" % (fw_train_score, fw_test_score))
    test_cs.append(fw_test_score)
    # y_pred = np.hstack(fw_bc_svm.predict(X_test_bias))
    # time_fw_bc_svm = time() - start
    # print("Score with cssvm: %f , C=%f (took %f seconds)" %
    #     (np.mean(y_pred == y_test), C, time_fw_bc_svm))
    # pdb.set_trace()

    # GENERALIZED

    print("GENERALIZED METHOD RUNNING")

    start = time()
    gfw_bc_svm.fit(X_train_bias, y_train)
    gfw_train_score = gfw_bc_svm.score(X_train_bias, y_train)
    gfw_test_score = gfw_bc_svm.score(X_test_bias, y_test)
    print("error train %f and test %f" % (gfw_train_score, gfw_test_score))
    test_g.append(gfw_test_score)
Beispiel #38
0
# Compare a per-letter linear SVM with a linear chain CRF.
# X, y and folds come from code that is not part of this excerpt.
# convenient
X, y = np.array(X), np.array(y)
# samples with folds == 1 form the training set, all others the test set
X_train, X_test = X[folds == 1], X[folds != 1]
y_train, y_test = y[folds == 1], y[folds != 1]

# Train linear SVM
svm = LinearSVC(dual=False, C=.1)
# flatten input
# (the sequences are stacked so that the SVM is fitted on all their rows at once)
svm.fit(np.vstack(X_train), np.hstack(y_train))

# Train linear chain CRF
model = ChainCRF()
ssvm = FrankWolfeSSVM(model=model, C=.1, max_iter=11)
ssvm.fit(X_train, y_train)

print("Test score with chain CRF: %f" % ssvm.score(X_test, y_test))

print("Test score with linear SVM: %f" %
      svm.score(np.vstack(X_test), np.hstack(y_test)))

# plot some word sequenced
# n_words test words are drawn with a fixed seed; one subplot row per word,
# one column per letter of the longest drawn word
n_words = 4
rnd = np.random.RandomState(1)
selected = rnd.randint(len(y_test), size=n_words)
max_word_len = max([len(y_) for y_ in y_test[selected]])
fig, axes = plt.subplots(n_words, max_word_len, figsize=(10, 10))
fig.subplots_adjust(wspace=0)
for ind, axes_row in zip(selected, axes):
    y_pred_svm = svm.predict(X_test[ind])
    y_pred_chain = ssvm.predict([X_test[ind]])[0]
    for i, (a, image, y_true, y_svm, y_chain) in enumerate(
Beispiel #39
0
# Fit a pystruct BinaryClf with the Frank-Wolfe learner on a binary dataset
# loaded from csv files.
import numpy as np
import loader
import util
from sklearn import preprocessing

# NOTE: absolute, machine-specific dataset location
directory = "/Users/thijs/dev/boilerplate/src/main/resources/dataset/"
featureset = "features10"

print("Load files")
# features come from '<featureset>.csv', labels from 'labels.csv'
features, labels = \
  loader.loadBinary(featureset+'.csv', 'labels.csv', directory)

# print("Shuffle results")
# features, labels = util.shuffle(features, labels)

print("Loaded")
# print(labels)

# features = preprocessing.scale(features)

from pystruct.models import BinaryClf
from pystruct.learners import (NSlackSSVM, OneSlackSSVM, SubgradientSSVM,
                               FrankWolfeSSVM)
clf = FrankWolfeSSVM(BinaryClf(), verbose=True)
# print(clf)
clf.fit(features, labels)
# scored on the very data the model was fitted on; the print of the score
# is disabled below
trscore = clf.score(features, labels)

# print("Training score: {0}".format(trscore))
# "Klaar" is Dutch for "done"
print("Klaar")