Example #1
import numpy as np

from nutrition.structure.data_set import DataSet


def leave_one_out_score(data_set_name):
    # open data set
    data_set = DataSet(data_set_name)
    x, y = data_set.load_training_data()

    correct_count = 0

    for i in range(len(y)):
        x_train = np.delete(x, i, axis=0)
        y_train = np.delete(y, i)
        x_test = [x[i]]
        y_test = [y[i]]

        # train model
        model = train_model(x_train, y_train)
        if model.predict(x_test)[0] == y_test[0]:
            correct_count += 1
            print(i, 'correct,', correct_count / (i + 1))
        else:
            print(i, 'wrong,', correct_count / (i + 1))

    print('accuracy:', correct_count / len(y))
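
These examples call a shared train_model helper that is not included in the listing. A minimal sketch, assuming the same RandomForestClassifier settings that Example #6 uses explicitly:

from sklearn.ensemble import RandomForestClassifier


def train_model(x_train, y_train):
    # hypothetical sketch: assumes the classifier settings from Example #6
    model = RandomForestClassifier(n_estimators=1000,
                                   oob_score=True,
                                   warm_start=True)
    model.fit(x_train, y_train)
    return model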
Example #2
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from nutrition.structure.data_set import DataSet


def cross_validation(data_set_name):
    # open data set
    data_set = DataSet(data_set_name)

    # get train and test set
    x, y = data_set.load_training_data()
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.1,
                                                        shuffle=True)

    # train and save model
    model = train_model(x_train, y_train)

    # get cross validation performance
    predict = model.predict(x_test)
    accuracy = accuracy_score(y_test, predict)
    print('Cross validation accuracy:', accuracy)

    # print_classification_matrix(predict, y_test)

    return accuracy, y_test, predict
Example #3
from nutrition.structure.data_set import DataSet


def cross_corpus(train_set_name, test_set_name):
    # open data set and train model
    train_set = DataSet(train_set_name)
    x_train, y_train = train_set.load_training_data()
    model = train_model(x_train, y_train)

    # open test set and predict labels
    test_set = DataSet(test_set_name)
    x_test, y_test = test_set.load_training_data()
    predict = model.predict(x_test)

    print_classification_matrix(predict, y_test)

    return model
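
print_classification_matrix is another project helper that is not shown. A plausible sketch built on sklearn's confusion_matrix and classification_report; the exact output format is an assumption:

from sklearn.metrics import classification_report, confusion_matrix


def print_classification_matrix(predict, y_test):
    # hypothetical sketch: show the confusion matrix and per-class scores
    print(confusion_matrix(y_test, predict))
    print(classification_report(y_test, predict))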
Example #4
            doc_start = time.time()

            # read raw text and parse tree
            text = data_set.get_text(counter.count)
            label = data_set.data['labels'][counter.count]
            annotation = data_set.load_stanford_annotation(counter.count)

            # insert row to matrix
            # also, initialize feature matrix if it is None
            row = stanford_feature.get_features(annotation)
            row.append(label)

            if self.feature_matrix is None:
                self.feature_matrix = np.zeros(
                    [data_set.data['count'], len(row)])
            self.feature_matrix[counter.count, :] = row

            # count and print
            counter.increment()
            print('%i, %i%% %.2f seconds (%.0f total)' %
                  (counter.count - 1,
                   100 * counter.count / data_set.data['count'],
                   time.time() - doc_start, time.time() - start))

        counter.commit()


if __name__ == '__main__':
    data_set = DataSet('cepp')
    FeatureExtractor(data_set)
Example #5
    start = time.time()
    while counter.count < data_set.data['count']:
        doc_start = time.time()

        # read raw text and parse tree
        text = data_set.get_text(counter.count)
        label = data_set.data['labels'][counter.count]
        annotation = data_set.load_stanford_annotation(counter.count)

        # extract features into a row array
        row = extract_features(text, annotation)
        row.append(label)

        # initialize feature matrix if it is None
        if feature_matrix is None:
            feature_matrix = np.zeros([data_set.data['count'], len(row)])

        # insert row array to matrix
        feature_matrix[counter.count, :] = row

        # count and print
        counter.increment()
        print('%i, %i%% %.2f seconds (%.0f total)' %
              (counter.count - 1,
               100 * counter.count / data_set.data['count'],
               time.time() - doc_start, time.time() - start))

    counter.commit()


if __name__ == '__main__':
    process_feature(DataSet('core-standard'), restart=True)
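
The counter object in Examples #4 and #5 lets a long extraction run resume where it stopped; its implementation is not shown. A minimal sketch, assuming the count is persisted as a text file (the file location, constructor signature, and restart handling are all assumptions):

import os


class Counter:
    # hypothetical sketch of the resumable progress counter used above
    def __init__(self, path, restart=False):
        self.path = path  # assumed location, e.g. a counter file in the data set folder
        self.count = 0
        if not restart and os.path.exists(path):
            # resume from the last committed position
            with open(path) as file:
                self.count = int(file.read())

    def increment(self):
        self.count += 1

    def commit(self):
        # persist the current count so an interrupted run can resume
        with open(self.path, 'w') as file:
            file.write(str(self.count))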
Example #6

import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

from nutrition.structure.data_set import DataSet

if __name__ == '__main__':
    # which data set to use
    train_on = 'cepp'
    test_on = 'core-standard'

    # Leave one out validation (very slow)
    # leave_one_out_score('cepp')

    # Cross validation
    accuracy, y_test, predict = cross_validation(train_on)
    print(confusion_matrix(y_test, predict))

    # 10-Fold Cross validation
    x, y = DataSet(train_on).load_training_data()
    model = RandomForestClassifier(n_estimators=1000,
                                   oob_score=True,
                                   warm_start=True)
    print('10-Fold Cross Validation Score:',
          np.mean(cross_val_score(model, x, y, cv=10)))

    # save fully trained model
    model.fit(x, y)
    DataSet(train_on).save_model(model, 'random-forest')

    # load and evaluate on train (just to see if it is correct)
    loaded_model = DataSet(train_on).load_model('random-forest')
    predict = loaded_model.predict(x)
    print_classification_matrix(predict, y)
Example #7
    sentences = annotation['sentences']
    num_sentences = len(sentences)
    num_tokens = sum(
        len(sentence['tokens']) for sentence in annotation['sentences'])
    num_stopwords = count_stopwords(annotation)

    # features
    complex_token_ratio = count_complex_tokens(sentences) / num_tokens
    number_meaningful_bigrams_percorpus = float(
        count_meaningful_bigrams(annotation)) / num_tokens
    stopword_ratio = num_stopwords / num_tokens
    stopword_per_sentence = num_stopwords / num_sentences

    # summarize features into an array
    scores = ttr_pos(annotation)
    scores.extend([
        complex_token_ratio, number_meaningful_bigrams_percorpus,
        stopword_ratio, stopword_per_sentence
    ])

    return scores


if __name__ == '__main__':
    cepp = DataSet('cepp')
    annotation = cepp.load_stanford_annotation(0)

    print(count_meaningful_bigrams(annotation))

    #test_data = """Eleven states working in conjunction with the U.S Department of Transportation (D.O.T) have agreed to implement an ordinance banning the use of electronic cigarettes in vehicles  meaning if you are a resident of one of the impacted states, you will be prohibited from utilizing an electronic cigarette while inside your vehicle."""
    #print(extract_lexical_features([test_data, test_data]))
    #print(list(find_ngrams(word_tokenize(test_data), 2)))
    #print(meaningful_bigrams(find_ngrams(word_tokenize(test_data), 2)))
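
find_ngrams, used in the commented-out test code above, is commonly implemented with zip over shifted slices; a minimal sketch (the signature is an assumption):

def find_ngrams(tokens, n):
    # hypothetical sketch: yield consecutive n-grams as tuples
    return zip(*[tokens[i:] for i in range(n)])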
Example #8
from nutrition.structure.data_set import DataSet

if __name__ == '__main__':
    
    data_set = DataSet('cepp')
    
    levels = ['KET', 'PET', 'FCE', 'CAE', 'CPE']
    num_articles = [64, 60, 71, 67, 69]
    
    labels = []
    
    text_id = 0
    for l in range(0, 5):
        print('working on level', l)
        for i in range(1, num_articles[l] + 1):
            print('working on text', i)
            path = '{}/_origin/{}/{}.txt'.format(data_set.path, levels[l], i)
            data_set.import_raw_text(path, text_id)
            labels.append(l)
            text_id += 1
            
    data_set.set_labels(labels)
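
DataSet from nutrition.structure.data_set is the storage wrapper behind all of these scripts; its implementation is not part of the listing. The interface below is inferred from the calls in these examples; the signatures are assumptions:

class DataSet:
    # hypothetical interface inferred from usage in these examples
    def __init__(self, name): ...                        # open a named data set; exposes .path and .data
    def import_raw_text(self, path, text_id): ...        # copy one raw text file into the data set
    def set_labels(self, labels): ...                    # store the label list for all texts
    def load_training_data(self): ...                    # return (x, y) feature matrix and labels
    def get_text(self, i): ...                           # return the raw text of document i
    def save_stanford_annotation(self, i, annotation): ...  # pickle a CoreNLP annotation
    def load_stanford_annotation(self, i): ...
    def save_model(self, model, name): ...               # persist a trained model under a name
    def load_model(self, name): ...
    def delete_row(self, i): ...                         # remove document i from the data set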
    
Example #9
from nutrition.structure.data_set import DataSet
import numpy as np

if __name__ == '__main__':
    data_set = DataSet('newsela')

    labels = np.genfromtxt('D:/master project/data/newsela/average_level.csv',
                           delimiter=',')
    data_set.set_labels(labels[:, 1].tolist())

    for i in range(0, 17027):
        path = 'D:/master project/data/newsela/text/{}.txt'.format(i + 1)
        data_set.import_raw_text(path, i)

        print(i)
Example #10

            sys.exit()

        # pickle the result
        data_set.save_stanford_annotation(counter.count, annotation)

        # save the new count
        counter.increment()

        # print time information
        print('%i, %i%% %.2f seconds (%.0f total)' %
              (counter.count - 1, 100 * counter.count / data_set.data['count'],
               time.time() - doc_start, time.time() - start))


if __name__ == '__main__':
    process_stanford(DataSet('learning-corpus'), restart=False)

    # sw = Stopwatch()
    #
    # data_set = DataSet('cepp')
    # sw.lap('test')
    # nlp = StanfordCoreNLP(STANFORD_SERVER)
    # sw.lap('test')
    # text = fix(data_set.get_text(0))
    # print(text)
    # sw.lap('test')
    # annotation = nlp.annotate(text, properties={
    #     'annotators': 'lemma,parse',
    #     'outputFormat': 'json',
    #     'coref.algorithm': 'statistical'
    # })
Example #11

    print(sum(abs(y_test - cv_predict) < 0.5) / len(y_test))

    # train set performance
    model.fit(x, y)
    predict_train = model.predict(x)
    #plt.figure()
    #plt.scatter(y, predict_train)
    #plt.title('Train set')

    # cross corpus
    x_cc, y_cc = cc_set.load_training_data()
    x_cc = x_cc[:, features]
    cc_predict = model.predict(x_cc)
    plt.figure()
    plt.scatter(y_cc, cc_predict)
    plt.title('CEPP model on Newsela')

    plt.show()


if __name__ == '__main__':
    model = SVC(kernel='linear')
    data_set = DataSet('cepp')
    cc_set = DataSet('core-standard')

    # features = [6,66,69,86,112,115,118,121]  # RFE
    # features = [120, 121, 66, 68, 115, 45]  # Greedy
    # features = [15, 31]  # NP/VP per sentence
    features = list(range(0, 122))

    eval_plot(model, data_set, cc_set, features)
Example #12
from nutrition.structure.data_set import DataSet
import os

if __name__ == '__main__':

    data_set = DataSet('nil')

    text_id = 0
    labels = []
    for level in range(1, 4):
        folder = 'D:/master project/data/news_in_levels/News_in_levels_level{}/articles/'.format(
            level)
        for filename in os.listdir(folder):
            # ignore files that are very small (< n bytes)
            if os.stat(folder + filename).st_size < 10:
                print('ignored {} because its size is only {} bytes'.format(
                    folder + filename,
                    os.stat(folder + filename).st_size))
                continue

            data_set.import_raw_text(folder + filename, text_id)
            labels.append(level)
            text_id += 1
            print(text_id)

    data_set.set_labels(labels)
Example #13
        text = data_set.get_text(counter.count)
        
        # call stanford annotate api
        annotation = nlp.annotate(text, properties={
            'annotators': 'tokenize,ssplit,pos,depparse,parse',
            'outputFormat': 'json'
        })
        
        if type(annotation) is str:
            print('Error returned by stanford parser:', annotation)
            sys.exit()
        
        # pickle the result
        data_set.save_stanford_annotation(counter.count, annotation)
        
        # save the new count
        counter.increment()
        
        # print time information
        print('%i, %i%% %.2f seconds (%.0f total)' %
              (counter.count - 1,
               100 * counter.count / data_set.data['count'],
               time.time() - doc_start, time.time() - start))

if __name__ == '__main__':
    data_set = DataSet('newsela')
    parse(data_set)

Example #14

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from nutrition.structure.data_set import DataSet

import matplotlib.pyplot as plt

if __name__ == '__main__':
    x, y = DataSet('cepp').load_training_data()
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.5,
                                                        shuffle=True,
                                                        random_state=42)

    model = RandomForestRegressor(n_estimators=1000,
                                  oob_score=True,
                                  warm_start=True)

    model.fit(x_train, y_train)
    predict = model.predict(x_test)
    # print(accuracy_score(y_test, predict))

    plt.figure()
    plt.scatter(y_test, predict)
    plt.show()
Example #15

import os

from nutrition.structure.data_set import DataSet

if __name__ == '__main__':

    data_set = DataSet('core-standard')
    raw_folder = 'D:/master project/dataold/core-standards-readability'
    grades = ['1', '2-3', '4-5', '6-8', '9-10', '11-CCR']
    bans = ['poetry', 'drama']

    labels = []

    text_id = 0
    for l in range(0, 6):
        print('working on level', l)
        path_level = raw_folder + '/grade ' + grades[l]
        for cat in os.listdir(path_level):
            path_cat = path_level + '/' + cat
            if any(ban in cat for ban in bans):
                print('ignoring ' + path_cat)
            else:
                for file in os.listdir(path_cat):
                    path_text = path_cat + '/' + file
                    print('reading ' + path_text)
                    data_set.import_raw_text(path_text, text_id)
                    labels.append(l)
                    text_id += 1

    data_set.set_labels(labels)
Example #16

        # average node depth
        count_sum_node_depth(sentence_trees) / token_count,

        # average word length
        count_sum_word_length(sentence_trees) / token_count,

        # function words (stopwords which are not DT/determiners)
        count_function_words(tokens)
    ])

    return features


if __name__ == '__main__':
    annotation = DataSet('cepp').load_stanford_annotation(0)

    #print(count_difficult_words(annotation['sentences']))

    sentences = annotation['sentences']
    num_sentences = len(sentences)
    tokens = [token for sentence in sentences for token in sentence['tokens']]
    print(count_function_words(tokens))
    #
    # # pprint.pprint(annotation)
    # # sentence_trees = [Tree.fromstring(sentence['parse']) for sentence in annotation['sentences']]
    # # sentence_trees[0].pretty_print()
    # for sentence in annotation['sentences']:
    #     for token in sentence['tokens']:
    #         print(token['lemma'], token['word'], token['originalText'], token['pos'])
    #
Example #17

from nltk.corpus import stopwords

from nutrition.structure.data_set import DataSet

if __name__ == '__main__':
    DataSet('learning-corpus').delete_row(1437)
Example #18

import os

from nutrition.structure.data_set import DataSet

if __name__ == '__main__':
    levels = [
        'elementry', 'pre_inter', 'intermediate', 'upper_inter', 'advanced'
    ]

    data_set = DataSet('learning-corpus')
    labels = []

    text_id = 0
    root_path = 'D:/master project/dataold/learning_corpus'
    for level in range(len(levels)):
        level_path = root_path + '/' + levels[level]

        for file_name in os.listdir(level_path):
            text_path = level_path + '/' + file_name

            with open(text_path, 'r', encoding='utf8') as file:
                text_length = len(file.read())

            if 10 < text_length < 100000:
                labels.append(level)
                data_set.import_raw_text(text_path, text_id)
                text_id += 1

    data_set.set_labels(labels)
Example #19
    # train model on train data set
    model = train_linear(train_data_set)

    # evaluate model on test data set (cross corpus)
    x, y = test_data_set.load_training_data()
    score = cross_val_score(model, x, y, scoring='neg_mean_squared_error')
    print('Mean squared error: {}'.format(-score))

    # plot
    predict = model.predict(x)
    plt.scatter(y, predict)
    plt.show()


if __name__ == '__main__':
    data_set = DataSet('cepp')

    x, y = data_set.load_training_data()

    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.4,
                                                        shuffle=True,
                                                        random_state=0)

    regr = linear_model.LinearRegression(normalize=True)
    #regr = linear_model.Ridge(alpha=0.001, normalize=True)

    regr.fit(x_train, y_train)
    predict = regr.predict(x_test)
Example #20

        # c = current column id
        for c in range(0, len(x_train[0])):
            c_x_train = np.delete(x_train, c, 1)
            c_x_test = np.delete(x_test, c, 1)
            c_error = get_error(c_x_train, y_train)

            if b_error == -1 or b_error > c_error:
                b_id = c
                b_error = c_error
                b_cv_error = get_cv_error(c_x_train, c_x_test, y_train, y_test)

        x_train = np.delete(x_train, b_id, 1)
        x_test = np.delete(x_test, b_id, 1)
        features = np.delete(features, b_id, 0)
        feature_ids = np.delete(feature_ids, b_id, 0)
        print(features, feature_ids, b_error, b_cv_error, len(x_train[0]))
        plot_x_num_features.append(len(x_test[0]))
        plot_y_error.append(b_error)
        plot_y_cv_error.append(min(10, b_cv_error))

    #print(features, b_error)
    plt.scatter(plot_x_num_features, plot_y_error)
    plt.scatter(plot_x_num_features, plot_y_cv_error)
    plt.show()


if __name__ == '__main__':
    #backward_feature_selection(DataSet('cepp'), DataSet('mini-newsela'))
    forward_feature_selection(DataSet('cepp'), DataSet('mini-newsela'))
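
get_error and get_cv_error are not shown in the listing. A minimal sketch, assuming they report the mean squared error of a linear regression on the training split and the held-out split respectively (the model choice and signatures are assumptions):

from sklearn import linear_model
from sklearn.metrics import mean_squared_error


def get_error(x_train, y_train):
    # hypothetical sketch: training-set MSE for the current feature subset
    regr = linear_model.LinearRegression()
    regr.fit(x_train, y_train)
    return mean_squared_error(y_train, regr.predict(x_train))


def get_cv_error(x_train, x_test, y_train, y_test):
    # hypothetical sketch: held-out MSE for the same feature subset
    regr = linear_model.LinearRegression()
    regr.fit(x_train, y_train)
    return mean_squared_error(y_test, regr.predict(x_test))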
Example #21
    predict_train = model.predict(x_train)
    plt.figure()
    plt.title('train')
    plt.scatter(y_train, predict_train)

    # plot test performance
    predict = model.predict(x_test)
    plt.figure()
    plt.title('test')
    plt.scatter(y_test, predict)
    plt.show()


def eval_cc_linear(train_data_set, test_data_set):
    # train model on train data set
    model = train_linear(train_data_set)

    # evaluate model on test data set (cross corpus)
    x, y = test_data_set.load_training_data()
    score = cross_val_score(model, x, y, scoring='neg_mean_squared_error')
    print('Mean squared error: {}'.format(-score))

    # plot
    predict = model.predict(x)
    plt.scatter(y, predict)
    plt.show()


if __name__ == '__main__':
    eval_linear(DataSet('cepp'))
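
train_linear is also project code that the listing omits; a minimal sketch consistent with the LinearRegression usage in Example #19:

from sklearn import linear_model


def train_linear(data_set):
    # hypothetical sketch: fit a plain linear regression on the full data set
    x, y = data_set.load_training_data()
    regr = linear_model.LinearRegression()
    regr.fit(x, y)
    return regr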