Exemple #1
0
def draw_plot(metric):
    """Plot the stored SVM results of one feature-selection metric.

    Three objects are loaded from ``<metric>/svm/``: the x-axis values
    (``coefs.dump``), the F1 values (``f1.dump``) and the feature counts
    (``feat.dump``). Two figures are drawn: F1 against the coefficients,
    with the module-level ``score`` drawn as a constant red line, and the
    number of features against the same coefficients.

    :param metric: directory prefix of the metric; also used in the titles.
    """
    coefs, f1_values, feature_counts = [
        dump.load_object(metric + suffix)
        for suffix in ('/svm/coefs.dump', '/svm/f1.dump', '/svm/feat.dump')
    ]

    # First figure: F1 curve plus the constant reference line.
    pt.title(metric + ': F1')
    pt.plot(coefs, f1_values)
    reference_line = [score] * len(coefs)
    pt.plot(coefs, reference_line, color='red')

    # Second figure: how many features were kept for each coefficient.
    pt.figure()
    pt.title(metric + ': N Features')
    pt.plot(coefs, feature_counts)

    pt.show()
Exemple #2
0
def information_gain(data, labels, dumped=False):
    """Compute the information gain of every feature with respect to the labels.

    The gain of feature ``X`` is ``H(Y) - H(Y | X)``, where ``H`` is the
    Shannon entropy in bits. Feature values and labels are counted with
    ``numpy.bincount`` and therefore must be non-negative integers.

    :param data: sequence of samples, each a sequence of feature values.
    :param labels: one label per sample.
    :param dumped: when true, the conditional entropies are read from
        ``ig/hyx.dump`` instead of being recomputed; otherwise they are
        computed and written to that path.
    :return: list of ``(information_gain, feature_index)`` tuples, in
        feature order.
    """
    import numpy as np
    import util.dump as dump

    def get_features(data_set):
        # Transpose rows-of-samples into one list of values per feature.
        if len(data_set) == 0:
            return []
        n = len(data_set[0])
        return [[row[j] for row in data_set] for j in range(n)]

    def possibilities(feature):
        # Relative frequencies of the values that actually occur.
        counts = np.bincount(feature)
        return np.asarray(counts[np.nonzero(counts)] / float(len(feature)))

    def entropy(feature):
        p = possibilities(feature)
        return -np.sum(p * np.log2(p))

    def spec_cond_entropy(x, y, xi):
        # Entropy of the labels restricted to the samples where x == xi.
        return entropy([label for value, label in zip(x, y) if value == xi])

    def cond_entropy(x, y):
        # Weight each *observed value* of x by its own probability. Indexing
        # the compacted probability vector by position would pair the wrong
        # probability with the wrong value whenever the values of x are not
        # exactly 0..k-1 (e.g. a feature that is constantly 1).
        counts = np.bincount(x)
        total = float(len(x))
        return sum(
            counts[value] / total * spec_cond_entropy(x, y, value)
            for value in np.nonzero(counts)[0]
        )

    def cond_entropy_full(x, y):
        from util.frame import progress
        print('Information gain: computing conditional entropy:')
        feat_len = len(x)
        result = []
        for i in range(feat_len):
            result.append(cond_entropy(x[i], y))
            # Report progress only every tenth feature.
            if i % 10 == 0:
                progress((i + 1) / feat_len)
        progress(1)
        return np.asarray(result)

    if dumped:
        h_y_x = dump.load_object('ig/hyx.dump')
    else:
        # The per-feature transpose is only needed when recomputing.
        h_y_x = cond_entropy_full(get_features(data), labels)
        dump.dump_object(h_y_x, 'ig/hyx.dump')
    info_gain = entropy(labels) - h_y_x
    return [(gain, index) for index, gain in enumerate(info_gain)]
def pearson(data, labels, dumped=False):
    """Compute a two-sided p-value for the Pearson correlation of each feature.

    For every feature the Pearson coefficient ``r`` against the labels is
    computed, converted to the statistic ``t = r * sqrt(v) / sqrt(1 - r^2)``
    with ``v = len(labels) - 2`` degrees of freedom, and turned into a
    two-sided p-value through the Student t survival function.

    A feature (or label vector) with zero variance has an undefined
    correlation; its coefficient and p-value are NaN.

    :param data: sequence of samples, each a sequence of numeric features.
    :param labels: one numeric label per sample.
    :param dumped: when true, the coefficients are read from
        ``pearson/ro.dump`` instead of being recomputed; otherwise they are
        computed and written to that path.
    :return: list of ``(p_value, feature_index)`` tuples, in feature order.
    """
    import numpy as np
    import util.dump as dump
    import math
    import warnings
    import scipy.stats as stats
    # NOTE: this silences warnings for the whole process, not only for this
    # call; kept because callers may rely on the quiet output.
    warnings.filterwarnings('ignore')

    def get_features(data_set):
        # Transpose rows-of-samples into one list of values per feature.
        if len(data_set) == 0:
            return []
        n = len(data_set[0])
        return [[row[j] for row in data_set] for j in range(n)]

    def feature_correlation(x, y):
        n = len(x)
        if n == 0:
            return float('nan')
        x_avg = sum(x) / n
        y_avg = sum(y) / len(y)
        x_centered = [value - x_avg for value in x]
        y_centered = [y[i] - y_avg for i in range(n)]
        cov = sum(dx * dy for dx, dy in zip(x_centered, y_centered))
        x_dev = math.sqrt(sum(dx ** 2 for dx in x_centered))
        y_dev = math.sqrt(sum(dy ** 2 for dy in y_centered))
        denominator = x_dev * y_dev
        if denominator == 0:
            # Constant input: undefined correlation. Returning NaN avoids a
            # ZeroDivisionError when the inputs are plain Python numbers.
            return float('nan')
        return cov / denominator

    def correlation(x, y):
        from util.frame import progress
        print('Pearson: computing corellation coefficients:')
        feat_len = len(x)
        result = []
        for i in range(feat_len):
            result.append(feature_correlation(x[i], y))
            # Report progress only every tenth feature.
            if i % 10 == 0:
                progress((i + 1) / feat_len)
        progress(1)
        return np.asarray(result)

    if dumped:
        ro = dump.load_object('pearson/ro.dump')
    else:
        # The per-feature transpose is only needed when recomputing.
        ro = correlation(get_features(data), labels)
        dump.dump_object(ro, 'pearson/ro.dump')

    ro = np.asarray(ro, dtype=float)
    if ro.size == 0:
        return []
    v = len(labels) - 2
    # Rounding can push |r| marginally above 1; clamping keeps the square
    # root argument non-negative. np.maximum propagates NaN coefficients.
    t = ro * math.sqrt(v) / np.sqrt(np.maximum(1 - ro ** 2, 0.0))
    p_values = stats.t.sf(np.abs(t), v) * 2
    return [(p_value, index) for index, p_value in enumerate(p_values)]
Exemple #4
0
def spearman(data, labels, dumped=False):
    """Compute a two-sided p-value for the Spearman correlation of each feature.

    Spearman's rho is computed as the Pearson correlation of the rank
    vectors, with tied values receiving their average rank. This is the
    definition that stays correct in the presence of ties; for tie-free
    data it equals ``1 - 6 * sum(d^2) / (n * (n^2 - 1))``.

    Each rho is converted to ``t = rho * sqrt(v) / sqrt(1 - rho^2)`` with
    ``v = len(labels) - 2`` degrees of freedom and turned into a two-sided
    p-value through the Student t survival function. A constant feature
    (or constant labels) has an undefined rho; its p-value is NaN.

    :param data: sequence of samples, each a sequence of feature values.
    :param labels: one label per sample.
    :param dumped: when true, the coefficients are read from
        ``spearman/ro.dump`` instead of being recomputed; otherwise they
        are computed and written to that path.
    :return: list of ``(p_value, feature_index)`` tuples, in feature order.
    """
    import numpy as np
    import util.dump as dump
    import math
    import warnings
    import scipy.stats as stats
    # NOTE: this silences warnings for the whole process, not only for this
    # call; kept because callers may rely on the quiet output.
    warnings.filterwarnings('ignore')

    def get_features(data_set):
        # Transpose rows-of-samples into one list of values per feature.
        if len(data_set) == 0:
            return []
        n = len(data_set[0])
        return [[row[j] for row in data_set] for j in range(n)]

    def centered_ranks(values):
        # Average ranks (rankdata's default tie handling), mean removed.
        ranks = np.asarray(stats.rankdata(values), dtype=float)
        if ranks.size == 0:
            return ranks
        return ranks - ranks.mean()

    def rank_correlation(dx, dy):
        # Pearson correlation of two already-centred rank vectors.
        denominator = math.sqrt(np.sum(dx ** 2) * np.sum(dy ** 2))
        if denominator == 0:
            # All values tied (or no data): rho is undefined.
            return float('nan')
        return np.sum(dx * dy) / denominator

    def correlation(x, y):
        from util.frame import progress
        print('Spearman: computing corellation coefficients:')
        feat_len = len(x)
        # The label ranks do not depend on the feature; rank them once.
        label_ranks = centered_ranks(y)
        result = []
        for i in range(feat_len):
            result.append(rank_correlation(centered_ranks(x[i]), label_ranks))
            # Report progress only every tenth feature.
            if i % 10 == 0:
                progress((i + 1) / feat_len)
        progress(1)
        return np.asarray(result)

    if dumped:
        ro = dump.load_object('spearman/ro.dump')
    else:
        # The per-feature transpose is only needed when recomputing.
        ro = correlation(get_features(data), labels)
        dump.dump_object(ro, 'spearman/ro.dump')

    ro = np.asarray(ro, dtype=float)
    if ro.size == 0:
        return []
    v = len(labels) - 2
    # Rounding can push |rho| marginally above 1; clamping keeps the square
    # root argument non-negative. np.maximum propagates NaN coefficients.
    t = ro * math.sqrt(v) / np.sqrt(np.maximum(1 - ro ** 2, 0.0))
    p_values = stats.t.sf(np.abs(t), v) * 2
    return [(p_value, index) for index, p_value in enumerate(p_values)]
Exemple #5
0
import util.dump as dump
import matplotlib.pyplot as pt

# Per-metric switches. NOTE(review): presumably each one enables the
# matching section of this script; the consumers are outside this view.
INFO_GAIN = True
PEARSON = True
SPEARMAN = True

# "N best" switches, one per metric. NOTE(review): meaning not visible here.
IG_NBEST = True
PEARSON_NBEST = True
SPEARMAN_NBEST = True

# Venn-diagram switches. NOTE(review): meaning not visible here.
VENN = True
VENN_NBEST = True

# Loaded once at import time; draw_plot() draws it as the red reference line.
score = dump.load_object('score.dump')


def draw_plot(metric):
    """Plot the stored SVM results of one feature-selection metric.

    Three objects are loaded from ``<metric>/svm/``: the x-axis values
    (``coefs.dump``), the F1 values (``f1.dump``) and the feature counts
    (``feat.dump``). Two figures are drawn: F1 against the coefficients,
    with the module-level ``score`` drawn as a constant red line, and the
    number of features against the same coefficients.

    :param metric: directory prefix of the metric; also used in the titles.
    """
    coefs, f1_values, feature_counts = [
        dump.load_object(metric + suffix)
        for suffix in ('/svm/coefs.dump', '/svm/f1.dump', '/svm/feat.dump')
    ]

    # First figure: F1 curve plus the constant reference line.
    pt.title(metric + ': F1')
    pt.plot(coefs, f1_values)
    reference_line = [score] * len(coefs)
    pt.plot(coefs, reference_line, color='red')

    # Second figure: how many features were kept for each coefficient.
    pt.figure()
    pt.title(metric + ': N Features')
    pt.plot(coefs, feature_counts)

    pt.show()

Exemple #6
0

def run_classifier(train_data, train_labels, test_data, classifier):
    """Fit ``classifier`` on the training set and predict ``test_data``.

    When the training matrix has no feature columns there is nothing to
    fit, so an all-zero prediction is returned instead. That fallback has
    one entry per *test* sample, so its length matches what
    ``classifier.predict(test_data)`` would have produced.

    :param train_data: 2-D array-like with a ``shape`` attribute.
    :param train_labels: labels passed to ``classifier.fit``.
    :param test_data: 2-D array-like with a ``shape`` attribute.
    :param classifier: object exposing ``fit(X, y)`` and ``predict(X)``.
    :return: predictions for ``test_data``.
    """
    if train_data.shape[1] == 0:
        # Size the fallback by the test set, not the training set.
        return np.zeros(test_data.shape[0], dtype=int)
    classifier.fit(train_data, train_labels)
    return classifier.predict(test_data)


def classify(x, x_val, y):
    """Train a fresh ``LinearSVC`` on ``(x, y)`` and predict ``x_val``.

    :param x: training samples.
    :param x_val: samples to predict.
    :param y: training labels.
    :return: whatever ``run_classifier`` returns for ``x_val``.
    """
    from sklearn.svm import LinearSVC
    return run_classifier(x, y, x_val, LinearSVC())


# Load the training and validation sets from previously dumped objects.
data = dump.load_object('data.dump')
data_valid = dump.load_object('data_valid.dump')
labels = dump.load_object('labels.dump')
labels_valid = dump.load_object('labels_valid.dump')

# F1 on the validation set with the full data as passed to classify();
# stored so it can be reloaded later from 'score.dump'.
score = metrics.f1_score(labels_valid, classify(data, data_valid, labels))
print(score)
print()
dump.dump_object(score, 'score.dump')

# INFO GAIN
if INFO_GAIN:
    ig = dump.load_object('ig/ig.dump')
    # Candidate coefficients from 0.1 in steps of 0.01, stopping before 0.91.
    ig_coefs = np.arange(0.1, 0.91, 0.01)
    # Accumulators, empty at this point; NOTE(review): presumably filled per
    # coefficient further down, outside this view.
    ig_f1 = []
    ig_n_feat = []
Exemple #7
0

def labels_to_np_array(labels_data, n_classes=10):
    """One-hot encode integer class labels.

    :param labels_data: sequence of integer labels, each usable as a
        column index into a row of width ``n_classes``.
    :param n_classes: width of the encoding; defaults to 10, the value
        that used to be hard-coded.
    :return: float array of shape ``(len(labels_data), n_classes)`` with a
        single 1 per row at the label's column.
    :raises IndexError: if a label is out of range for ``n_classes``.
    """
    x = np.zeros((len(labels_data), n_classes))
    # Rows of ``x`` are views, so writing through ``row`` updates ``x``.
    for row, label in zip(x, labels_data):
        row[label] = 1
    return x


def get_predicted(predict_data):
    """Return, for each row, the index of its largest element.

    On ties the lowest index wins, because ``max`` keeps the first maximum
    it meets. An empty row raises ``ValueError``.
    """
    winners = []
    for scores in predict_data:
        # ``scores.__getitem__`` maps an index to the value at that index.
        winners.append(max(range(len(scores)), key=scores.__getitem__))
    return winners


# Statistic series; they start empty unless a previous run is being reused.
stats_x, stats_y, stats_y2, stats_y3 = [], [], [], []
if CONTINUE or DUMPED:
    # Only the first two series are restored from the dump.
    stats_x, stats_y = load_object('stoch-n-images-stat.dump')
# Logically the same as `not DUMPED or CONTINUE`: the data is loaded unless
# the dumped statistics are used without continuing.
if not DUMPED or (DUMPED and CONTINUE):
    # Placeholder values; all three are overwritten below once the files load.
    train_labels = []
    train_images = []
    image_size = (28, 28)
    timer = Timer()
    stdout.write('Loading Train data...')
    timer.set_new()
    train_labels = reader.read_labels('mnist/train-labels-idx1-ubyte')
    train_images = reader.read_images('mnist/train-images-idx3-ubyte')
    print('DONE in ' + timer.get_diff_str())
    # NOTE(review): presumably element 1 of the reader result is the image
    # size - confirm against reader.read_images.
    image_size = train_images[1]

    stdout.write('Loading Test data...')
    timer.set_new()
    test_labels = reader.read_labels('mnist/t10k-labels-idx1-ubyte')
Exemple #8
0
    # Report how long the preceding (not visible here) loading step took.
    print('DONE in ' + timer.get_diff_str())

    stdout.write('Loading Test data...')
    timer.set_new()
    test_labels_file = reader.read_labels('mnist/t10k-labels-idx1-ubyte')
    test_images_file = reader.read_images('mnist/t10k-images-idx3-ubyte')
    # NOTE(review): presumably element 2 of the image result and element 1 of
    # the label result hold the payload - confirm against the reader module.
    test_data = images_to_np_array(test_images_file[2])
    test_labels = np.asarray(test_labels_file[1])
    print('DONE in ' + timer.get_diff_str())
    # timer.set_new()
    # coef = information_gain(train_data, train_labels)
    # print(' DONE in ' + timer.get_diff_str())
    # dump_object(coef, 'spearman.dump')
    import pylab as pt

    # Sort the dumped items in ascending order and keep element 1 of each.
    # NOTE(review): presumably these are (score, feature_index) pairs.
    ig = [x[1] for x in sorted(load_object('ig.dump'))]

    # 28x28 RGB canvas, black by default.
    y = np.zeros((28, 28, 3))
    n = 100
    # The last n entries of the ascending sort.
    features = ig[-n:]
    for i in features:
        # Flat index -> (row, column) on the 28-wide grid; paint it white.
        y[i // 28][i % 28] = [1, 1, 1]
    pt.imshow(y)
    pt.show()

    # Keep only the selected columns: transpose, pick rows, transpose back.
    fs_data = train_data.T[features].T
    fs_labels = train_labels

    fs_test_data = test_data.T[features].T
    fs_test_labels = test_labels
Exemple #9
0

def labels_to_np_array(labels_data, n_classes=10):
    """One-hot encode integer class labels.

    :param labels_data: sequence of integer labels, each usable as a
        column index into a row of width ``n_classes``.
    :param n_classes: width of the encoding; defaults to 10, the value
        that used to be hard-coded.
    :return: float array of shape ``(len(labels_data), n_classes)`` with a
        single 1 per row at the label's column.
    :raises IndexError: if a label is out of range for ``n_classes``.
    """
    x = np.zeros((len(labels_data), n_classes))
    # Rows of ``x`` are views, so writing through ``row`` updates ``x``.
    for row, label in zip(x, labels_data):
        row[label] = 1
    return x


def get_predicted(predict_data):
    """Return, for each row, the index of its largest element.

    On ties the lowest index wins, because ``max`` keeps the first maximum
    it meets. An empty row raises ``ValueError``.
    """
    winners = []
    for scores in predict_data:
        # ``scores.__getitem__`` maps an index to the value at that index.
        winners.append(max(range(len(scores)), key=scores.__getitem__))
    return winners


# Statistic series; they start empty unless a previous run is being reused.
stats_x, stats_y, stats_y2, stats_y3 = [], [], [], []
if CONTINUE or DUMPED:
    # Only the first two series are restored from the dump.
    stats_x, stats_y = load_object('stoch-hidden-stat.dump')
# Logically the same as `not DUMPED or CONTINUE`: the data is loaded unless
# the dumped statistics are used without continuing.
if not DUMPED or (DUMPED and CONTINUE):
    # Placeholder values; all three are overwritten below once the files load.
    train_labels = []
    train_images = []
    image_size = (28, 28)
    timer = Timer()
    stdout.write('Loading Train data...')
    timer.set_new()
    train_labels = reader.read_labels('mnist/train-labels-idx1-ubyte')
    train_images = reader.read_images('mnist/train-images-idx3-ubyte')
    print('DONE in ' + timer.get_diff_str())
    # NOTE(review): presumably element 1 of the reader result is the image
    # size - confirm against reader.read_images.
    image_size = train_images[1]

    stdout.write('Loading Test data...')
    timer.set_new()
    test_labels = reader.read_labels('mnist/t10k-labels-idx1-ubyte')
Exemple #10
0
# Report how long the preceding (not visible here) loading step took.
print('DONE in ' + timer.get_diff_str())
# NOTE(review): presumably element 1 of the reader result is the image size
# and element 2 the pixel data - confirm against the reader module.
image_size = test_images[1]

# Test set converted to arrays; the labels are one-hot encoded.
images_test = images_to_np_array(test_images[2])
labels_test = labels_to_np_array(test_labels[1])
rang_test = len(images_test)


def classify():
    """Score the module-level ``network`` on the test images.

    The network output for ``images_test`` is reduced to one class index
    per sample with ``get_predicted`` and compared with ``test_labels[1]``.

    :return: the value returned by ``f1_score``.
    """
    raw_output = network.predict(images_test)
    return f1_score(test_labels[1], get_predicted(raw_output))

# Placeholder instance; every branch below rebinds `network`.
network = NeuralNetwork(1, 1, 1)
if NETWORK_DUMPED:
    # Reuse a previously saved network and just report its score.
    network = load_object('network.dump')
    print(classify())
else:
    images_train = images_to_np_array(train_images[2])
    labels_train = labels_to_np_array(train_labels[1])
    stats = []
    if NETWORK_CONTINUE:
        # Resume from the saved network and its saved statistics.
        network = load_object('network.dump')
        stats = load_object('stats.dump')
    else:
        # First argument is the pixel count of one image.
        # NOTE(review): meaning of the two 10s is not visible here - confirm
        # against the NeuralNetwork constructor.
        network = NeuralNetwork(image_size[0] * image_size[1], 10, 10)
    rang_train = len(images_train)
    print('Training...')
    cycles = 0
    timer = Timer()
    progress(0)
Exemple #11
0

def classify():
    """Score the module-level ``network`` on the test images.

    The network output for ``images_test`` is reduced to one class index
    per sample with ``get_predicted`` and compared with ``test_labels[1]``.

    :return: the value returned by ``f1_score``.
    """
    raw_output = network.predict(images_test)
    return f1_score(test_labels[1], get_predicted(raw_output))


def classify_print():
    """Print a classification report for the module-level ``network``.

    The network output for ``images_test`` is reduced to one class index
    per sample with ``get_predicted``; the report compares those indices
    with ``test_labels[1]``. Nothing is returned.
    """
    class_indices = get_predicted(network.predict(images_test))
    report = classification_report(test_labels[1], class_indices)
    print(report)

# Placeholder instance; every branch below rebinds `network`.
network = NeuralNetwork(1, 1, 1)
if NETWORK_DUMPED:
    # Reuse a previously saved network and just report on it.
    network = load_object('stoch-network.dump')
    # classify_print() prints the report itself and returns None, so it is
    # called directly; wrapping it in print() emitted a stray "None" line.
    classify_print()
else:
    images_train = images_to_np_array(train_images[2])
    labels_train = labels_to_np_array(train_labels[1])
    stats = []
    if NETWORK_CONTINUE:
        # Resume from the saved network and its saved statistics.
        network = load_object('stoch-network.dump')
        stats = load_object('stoch-stats.dump')
    else:
        # First argument is the pixel count of one image.
        # NOTE(review): meaning of 300, 10 and layers=1 is not visible here -
        # confirm against the NeuralNetwork constructor.
        network = NeuralNetwork(image_size[0] * image_size[1], 300, 10, layers=1)
    rang_train = len(images_train)

    print('Training...')
    cycles = 100
    num = 240