Ejemplo n.º 1
0
def kfold_average_score(learner, files, dirs, k=5, min_word_len=2, min_freq=10, feature_size=160, weight='tfidf', sw=True):
    """Return the mean F1 score of ``learner`` across ``k`` folds.

    For every fold index, ``KFold.kth`` provides a train/test split. A fresh
    ``Features`` object is built from the training portion only, the same
    ``learner`` object is re-fitted on the training vectors, and its
    predictions on the test vectors are scored with ``f1_score``.

    Args:
        learner: Object exposing ``fit(X, y)`` and ``predict(X)``; the value
            returned by ``predict`` must provide ``tolist()``.
        files: Forwarded to ``KFold`` as its first argument.
        dirs: Forwarded to ``KFold`` as its second argument.
            (presumably the labels/directories matching ``files`` -- confirm
            against ``KFold``).
        k: Number of folds to evaluate.
        min_word_len: Forwarded positionally to ``Features``.
        min_freq: Forwarded positionally to ``Features``.
        feature_size: Forwarded positionally to ``Features``.
        weight: Weighting scheme name handed to ``Features.get_x_vector``.
        sw: Forwarded positionally to ``Features`` as its last argument.

    Returns:
        The result of ``mean`` applied to the list of per-fold F1 scores.
    """
    fold_scores = [0] * k
    splitter = KFold(files, dirs, k)

    for fold in range(k):
        train_files, train_labels, test_files, test_labels = splitter.kth(fold)

        # Features are derived from the training split only, so the test
        # split of this fold never influences the feature set.
        extractor = Features(train_files, train_labels, min_word_len, min_freq, feature_size, sw)

        train_vectors = [extractor.get_x_vector(name, weight) for name in train_files]
        learner.fit(train_vectors, extractor.y_transform(train_labels))

        test_vectors = [extractor.get_x_vector(name, weight) for name in test_files]
        expected = extractor.y_transform(test_labels)
        predicted = learner.predict(test_vectors).tolist()
        fold_scores[fold] = f1_score(expected, predicted)

    return mean(fold_scores)
Ejemplo n.º 2
0
    return mean(scores)


if __name__ == '__main__':
    # Script entry point: read a data directory from the command line, split
    # it into train/test sets, fit an RBF-kernel SVC on TF-IDF vectors of the
    # training files and print the F1 score obtained on the test files.
    parser = argparse.ArgumentParser("Read in data directory")
    parser.add_argument('data_dir')

    print('Reading Data Path')
    args = parser.parse_args()
    files, dirs = fe.get_file_name_and_path(args.data_dir)

    print('Spliting Train-Test Set ')
    train_x, train_y, test_x, test_y = train_test_split(files, dirs, 0.25)

    learner = svm.SVC(kernel='rbf', C=1)
    # Positional arguments after the training data follow the Features
    # signature defined elsewhere in the project.
    features = Features(train_x, train_y, 3, 0, 160)

    # Fit on the vectorised training files.
    x = [features.get_x_vector(f_name, 'tfidf') for f_name in train_x]
    learner.fit(x, features.y_transform(train_y))

    # Vectorise the held-out files with the same Features object and score.
    x = [features.get_x_vector(f_name, 'tfidf') for f_name in test_x]
    expected = features.y_transform(test_y)
    predicted = learner.predict(x).tolist()
    print('Score:', f1_score(expected, predicted))


    # print('Test if "TFIDF" is better than "TF"')
    # print('TF:', kfold_average_score(learner, train_x, train_y, weight='tf',feature_size=2))
    # print('TFIDF:', kfold_average_score(learner, train_x, train_y, weight='tfidf',feature_size=2))
    # #
    # #
    # print('Test which min length of word is best')
    # lst = []