Code example #1
File: pruning.py Project: cjuliani/tf-cnn-pruning
    def __init__(self):
        self.img_path = './data/images'
        self.anno_path = './data/annotations'
        self.ft_path = './feature_maps/'
        self.model_path = './checkpoint/'
        self.model_name = 'segmentation.ckpt-285'
        self.model = os.path.join(self.model_path, self.model_name)

        # Parameters
        self.depth = 7
        self.classes = 1
        self.img_size = 32

        # Placeholders
        self.x = tf.placeholder(tf.float32,
                                shape=[None, None, None, self.depth],
                                name='input')
        self.y_true = tf.placeholder(tf.float32,
                                     shape=[None, None, None, self.classes],
                                     name='y_true')
        self.rate = tf.placeholder(tf.float32, name='dropout_rate')
        self.is_training = tf.placeholder(tf.bool, shape=())

        # Build network
        self.y01 = cvmodel.build_model(input=self.x,
                                       drop_rate=0,
                                       is_training=False)

        # Calculate loss + f1
        self.cost_reg, self.f1_vec, self.recall, \
        self.precision, self.specificity, self.accuracy = utils.loss(logits=[self.y01],
                                                      labels=self.y_true,
                                                      classes_weights=[2.])
        # Open session and restore model
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()
        self.saver.restore(self.sess, self.model)

        # Load data
        self.img_names = utils.load_train(path=self.img_path)
        self.anno_names = utils.load_train(path=self.anno_path)
        self.imgs_ = utils.get_image_array(self.img_names, self.img_size)
        self.annos_ = utils.get_annotation_array(self.anno_names,
                                                 self.img_size)
        n = self.imgs_.shape[0]

        print('\nNumber of images:', n)
        # Get number of trainable variables
        v_nb = np.sum([
            np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()
        ])
        print('Number of trainable variables:', v_nb)
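A hypothetical usage sketch for the constructor above (the class name `PruningModel` is an assumption, since the snippet only shows `__init__`; it simply feeds the loaded arrays through the placeholders defined there):

# Hypothetical usage, assuming the __init__ above belongs to a class named PruningModel.
model = PruningModel()
feed = {model.x: model.imgs_,
        model.y_true: model.annos_,
        model.rate: 0.0,
        model.is_training: False}
loss_value, f1_value = model.sess.run([model.cost_reg, model.f1_vec], feed_dict=feed)
print('Loss:', loss_value, 'F1:', f1_value)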
Code example #2
File: knn.py Project: immadina/Logistic-Regression
def knn():
    train_data, train_labels = load_train()

    #for validation
    valid_data, valid_labels = load_valid()

    #for test
    #valid_data, valid_labels = load_test()

    values = [1, 3, 5, 7, 9]
    ratio = []
    for k in values:
        c = 0
        prediction_labels = run_knn(k, train_data, train_labels, valid_data)

        for i in range(len(valid_labels)):
            if valid_labels[i] == prediction_labels[i]:
                c += 1
        ratio.append(float(c) / len(prediction_labels))

    plt.plot(values, ratio)

    #for validation
    plt.axis([1, 9, 0.81, 0.87])

    #for test
    #plt.axis([1, 9, 0.87, 0.95])

    plt.show()
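For reference, the per-k accuracy computed by the counting loop above can be expressed in a single NumPy comparison (a minimal sketch, assuming the label arrays have equal length):

import numpy as np

def classification_rate(predicted_labels, true_labels):
    """Fraction of examples where the predicted label matches the true label."""
    predicted_labels = np.asarray(predicted_labels).ravel()
    true_labels = np.asarray(true_labels).ravel()
    return float(np.mean(predicted_labels == true_labels))

# e.g. ratio.append(classification_rate(prediction_labels, valid_labels))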
Code example #4
def train():
    tr_X, tr_y = load_train(size='_t')
    tr_X = norm4d_per_sample(tr_X)
    te_X, te_y = load_test(size='_t')
    te_X = norm4d_per_sample(te_X)
    model = PlainCNN(istrained=False, args=(0.01, 0.1, 0.9))
    model.train(tr_X, tr_y, te_X, te_y)
Code example #5
File: preprocessor.py Project: thexdesk/smile
def load_train(is_gabor):
    tr_identity, tr_labels, tr_images = utils.load_train()

    pc_tr_identity = reshape_labels(tr_identity)
    pc_tr_labels = reshape_labels(tr_labels)
    pc_tr_images = reshape_images(tr_images, is_gabor)

    return pc_tr_identity, pc_tr_labels, pc_tr_images
Code example #6
def run_logistic_regression():
    train_inputs, train_targets = load_train()
    valid_inputs, valid_targets = load_valid()

    # TODO: initialize parameters
    parameters = {
                    'learning_rate': 0.01 ,          
                    'weight_regularization': 0 ,
                    'num_iterations': 10
                 }

    # logistic regression weights
    dimension = 28*28
    z = np.ones([dimension+1, 1], int)
    z = z/100.0
    #weight = np.matrix('1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1')
    for i in xrange(0,28*28):
      if i%2 == 1:
        z[i] = 0
        
    weights = z


    #weights = 1,1,2,1

    # Verify that your logistic function produces the right gradient.
    # diff should be very close to 0.
    #run_check_grad(parameters)

    # Begin learning with gradient descent
    for t in xrange(parameters['num_iterations']):

        # TODO: you will need to modify this loop to create plots, etc.

        # find the negative log likelihood and derivatives w.r.t. weights
        f, df, frac_correct_train = logistic(weights,
                                             train_inputs,
                                             train_targets,
                                             parameters)

        _, _, frac_correct_valid = logistic(weights,
                                            valid_inputs,
                                            valid_targets,
                                            parameters)
        
        if np.isnan(f) or np.isinf(f):
            raise ValueError("nan/inf error")

        # update parameters
        for i in range(weights.shape[0]):
          weights[i] = weights[i] + parameters['learning_rate'] * (df[i] - 0.001*(weights[i]))

        # print some stats
        print ("ITERATION:{:4d}   LOGL:{:4.2f}   "
               "TRAIN FRAC:{:2.2f}   VALID FRAC:{:2.2f}").format(t+1,
                                                                 f,
                                                                 frac_correct_train*100,
                                                                 frac_correct_valid*100)
Code example #7
def train():
    tr_X, tr_y = load_train(size='_t')
    tr_X = norm4d_per_sample(tr_X)
    tr_y = one_hot(tr_y, 2)
    te_X, te_y = load_test(size='_t')
    te_y = one_hot(te_y, 2)
    te_X = norm4d_per_sample(te_X)
    model = DeepCNN('vgg')
    model.train(tr_X, tr_y, te_X, te_y)
Code example #8
def train():
    tr_X, tr_y = load_train(size='_t')
    tr_X = norm4d_per_sample(tr_X)
    tr_y = one_hot(tr_y, 2)
    te_X, te_y = load_test(size='_t')
    te_y = one_hot(te_y, 2)
    te_X = norm4d_per_sample(te_X)
    model = RL(istrained=False, name='rl_noun')
    model.train(tr_X, tr_y, te_X, te_y)
Code example #9
def main(args):
    srcnn = SRCNN(
        image_size=args.image_size,
        c_dim=args.c_dim,
        is_training=True,
        learning_rate=args.learning_rate,
        batch_size=args.batch_size,
        epochs=args.epochs)
    X_train, Y_train = load_train(image_size=args.image_size, stride=args.stride, scale=args.scale)
    srcnn.train(X_train, Y_train)
Code example #10
def run_logistic_regression(hyperparameters):

    # TODO specify training data
    train_inputs, train_targets = load_train()

    valid_inputs, valid_targets = load_valid()

    # N is number of examples; M is the number of features per example.
    N, M = train_inputs.shape

    # Logistic regression weights
    # TODO:Initialize to random weights here.
    #weights = np.random.normal(0, 0.2, (train_inputs.shape[1]+1,1))
    weights = np.zeros(785).reshape((785, 1))

    # Verify that your logistic function produces the right gradient.
    # diff should be very close to 0.
    run_check_grad(hyperparameters)

    # Begin learning with gradient descent
    logging = np.zeros((hyperparameters['num_iterations'], 5))
    for t in xrange(hyperparameters['num_iterations']):

        # Find the negative log likelihood and its derivatives w.r.t. the weights.
        f, df, predictions = logistic(weights, train_inputs, train_targets,
                                      hyperparameters)

        # Evaluate the prediction.
        cross_entropy_train, frac_correct_train = evaluate(
            train_targets, predictions)

        if np.isnan(f) or np.isinf(f):
            raise ValueError("nan/inf error")

        # update parameters
        weights = weights - hyperparameters['learning_rate'] * df / N

        # Make a prediction on the valid_inputs.
        predictions_valid = logistic_predict(weights, valid_inputs)

        # Evaluate the prediction.
        cross_entropy_valid, frac_correct_valid = evaluate(
            valid_targets, predictions_valid)

        # print some stats
        print(
            "ITERATION:{:4d}  TRAIN NLOGL:{:4.2f}  TRAIN CE:{:.6f} "
            "TRAIN FRAC:{:2.2f}  VALID CE:{:.6f}  VALID FRAC:{:2.2f}").format(
                t + 1, f / N, cross_entropy_train, frac_correct_train * 100,
                cross_entropy_valid, frac_correct_valid * 100)
        logging[t] = [
            f / N, cross_entropy_train, frac_correct_train * 100,
            cross_entropy_valid, frac_correct_valid * 100
        ]
    return logging
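Several snippets on this page call a `logistic(weights, inputs, targets, hyperparameters)` helper that is not shown here. The following is a minimal NumPy sketch of what such a helper typically computes for logistic regression with a bias weight appended (an assumption for illustration; the real helpers differ between projects, e.g. in what they return as the third value):

import numpy as np

def logistic(weights, inputs, targets, hyperparameters):
    """Negative log-likelihood, its gradient, and the predicted probabilities.

    weights: (M+1, 1) array, last entry is the bias.
    inputs:  (N, M) array; targets: (N, 1) array of 0/1 labels.
    """
    N = inputs.shape[0]
    X = np.hstack([inputs, np.ones((N, 1))])      # append bias column
    z = X.dot(weights)                            # (N, 1) logits
    y = 1.0 / (1.0 + np.exp(-z))                  # sigmoid probabilities
    eps = 1e-12                                   # avoid log(0)
    f = -np.sum(targets * np.log(y + eps) + (1 - targets) * np.log(1 - y + eps))
    df = X.T.dot(y - targets)                     # gradient of the NLL w.r.t. the weights
    return f, df, y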
Code example #11
File: kfold.py Project: VeLKerr/lazy-learning-fca
def kfold(classification_algorithm, k):
    res = {"accuracy": 0, "precision": 0, "recall": 0, "f1": 0}
    for i in range(1, k + 1):
        validation = utils.load_train(i)
        validation = validation["plus"] + validation["minus"]
        train = {"plus": [], "minus": []}
        for j in range(1, k + 1):
            if j != i:
                extension = utils.load_train(j)
                train["plus"].extend(extension["plus"])
                train["minus"].extend(extension["minus"])
        classification = classification_algorithm(train, validation)
        res["accuracy"] += utils.accuracy(classification)
        res["precision"] += utils.precision(classification)
        res["recall"] += utils.recall(classification)
        res["f1"] += utils.F1_score(classification)
    # average the metrics over the k folds (use a separate name so the fold count k is not shadowed)
    for metric in res:
        res[metric] /= k
    print res
    return res
Code example #12
File: plot_saliency.py Project: aaa29022/ML2017
def main():
    x_train, y_train = load_train('train.csv')
    x_train = x_train[:20, :]
    x_train = x_train.reshape(x_train.shape[0], 1, 48, 48, 1)

    emotion_classifier = load_model('./model_cnn.h5')
    input_img = emotion_classifier.input

    for idx in tqdm(range(20)):
        val_proba = emotion_classifier.predict(x_train[idx])
        pred = val_proba.argmax(axis=-1)
        target = K.mean(emotion_classifier.output[:, pred])
        grads = K.gradients(target, input_img)[0]
        fn = K.function([input_img, K.learning_phase()], [grads])

        grads_value = fn([x_train[idx], 0])
        heatmap = np.array(grads_value).reshape(48, 48)
        s = np.sort(heatmap, axis = None)
        clip_rate = 0.1
        clip_size = int(len(s) * clip_rate)
        heatmap = np.clip(heatmap, s[clip_size], s[len(s) - clip_size])
        heatmap = abs(heatmap - np.mean(heatmap))
        heatmap = (heatmap - np.mean(heatmap))/np.std(heatmap)
        heatmap = (heatmap - heatmap.min())/ heatmap.ptp()

        thres = 0.5
        origin = x_train[idx].reshape(48, 48)*255
        see = x_train[idx].reshape(48, 48)
        see[np.where(heatmap <= thres)] = np.mean(see)
        see *= 255

        plt.figure()
        plt.imshow(heatmap, cmap=plt.cm.jet)
        plt.colorbar()
        plt.tight_layout()
        fig = plt.gcf()
        plt.draw()
        fig.savefig(os.path.join(cmap_dir, '{}.png'.format(idx)), dpi=100)

        plt.figure()
        plt.imshow(see, cmap='gray')
        plt.colorbar()
        plt.tight_layout()
        fig = plt.gcf()
        plt.draw()
        fig.savefig(os.path.join(partial_see_dir, '{}.png'.format(idx)), dpi=100)

        plt.figure()
        plt.imshow(origin, cmap='gray')
        plt.tight_layout()
        fig = plt.gcf()
        plt.draw()
        fig.savefig(os.path.join(origin_dir, '{}.png'.format(idx)), dpi=100)
Code example #13
def main():
    filter_dir = './img/'
    if not os.path.isdir(filter_dir):
        os.mkdir(filter_dir)

    filter_dir = './img/filter/'
    if not os.path.isdir(filter_dir):
        os.mkdir(filter_dir)

    emotion_classifier = load_model('./model_cnn.h5')
    layer_dict = dict([layer.name, layer]
                      for layer in emotion_classifier.layers[1:])

    input_img = emotion_classifier.input
    name_ls = ['activation_1']
    collect_layers = [
        K.function([input_img, K.learning_phase()], [layer_dict[name].output])
        for name in name_ls
    ]

    x_train, y_train = load_train('train.csv')
    x_train = x_train.reshape(x_train.shape[0], 1, 48, 48, 1)

    choose_id = 2044
    photo = x_train[choose_id]
    for cnt, fn in enumerate(collect_layers):
        im = fn([photo, 0])  #get the output of that layer
        fig = plt.figure(figsize=(14, 8))
        nb_filter = im[0].shape[3]
        for i in range(nb_filter):
            ax = fig.add_subplot(nb_filter // 16, 16, i + 1)  # integer division for the subplot grid
            ax.imshow(im[0][0, :, :, i], cmap='YlGnBu')
            plt.xticks(np.array([]))
            plt.yticks(np.array([]))
            plt.tight_layout()
        fig.suptitle('Output of layer{} (Given image{})'.format(
            cnt, choose_id))
        img_path = filter_dir
        if not os.path.isdir(img_path):
            os.mkdir(img_path)
        fig.savefig(os.path.join(img_path, 'layer{}'.format(cnt)))
Code example #14
def data_generator(dataset, shuffle=True, augment=True, batch_size=32):
    b = 0
    image_index = -1
    image_ids = np.copy(dataset.image_ids())
    #print(image_ids)
    error_count = 0
    while True:
        try:
            #print(b)
            image_index = (image_index + 1) % len(image_ids)
            if shuffle and image_index == 0:
                np.random.shuffle(image_ids)
            image_id = image_ids[image_index]
            image_concat = utils.load_train(dataset, image_id, augment=augment)
            label = dataset.load_label(image_id)
            if b == 0:
                batch_images = np.zeros((batch_size, 448, 224, 3),
                                        dtype=np.float32)
                batch_actions = np.zeros((batch_size, 1), dtype=np.int32)

            batch_images[b] = image_concat
            batch_actions[b] = label

            b += 1
            if b >= batch_size:
                #inputs=[batch_images, batch_actions]
                inputs = batch_images
                outputs = batch_actions
                yield inputs, outputs
                b = 0
        except (GeneratorExit, KeyboardInterrupt):
            raise
        except:
            # Log it and skip the image
            logging.exception("Error processing image {}".format(
                dataset.image_info[image_id]))
            error_count += 1
            if error_count > 5:
                raise
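A hypothetical smoke test for a generator like the one above (`dataset` stands in for whatever dataset object the project uses; the expected shapes follow from the `batch_images`/`batch_actions` allocation inside the loop):

# Hypothetical usage: pull two batches and check their shapes.
gen = data_generator(dataset, shuffle=True, augment=False, batch_size=8)
for _ in range(2):
    images, actions = next(gen)
    print(images.shape, actions.shape)  # expected: (8, 448, 224, 3) (8, 1)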
Code example #15
import numpy as np
import pandas as pd
import sys
import os
from sklearn.externals import joblib

scriptpath = os.path.dirname(os.path.realpath(sys.argv[0])) + '/../'
sys.path.append(os.path.abspath(scriptpath))
import utils


train = utils.load_train('group_by')


train_2013 = train[train.date_time < '2014-01-01 00:00:00']
train_2014 = train[train.date_time >= '2014-01-01 00:00:00']


def top_k_relevence(group, topk = utils.k):
    """
    Order and get the topk hotel cluters by the relevance score in desc order
    :param group: the aggregate group with hotel cluster relevance scores
    :param topk: the top k value
    :return: the topk hotel clusters for the aggregate group
    """
    idx = group.relevance.nlargest(topk).index
    top_k_relevence = group.hotel_cluster[idx].values
    return np.array_str(top_k_relevence)[1:-1]


def gen_top_k_group_by_model(group_by_field, click_weight = utils.click_weight, year = 'all'):
Code example #16
def run_logistic_regression(hyperparameters):
    # specify training data
    xIn = False
    while xIn == False:
        x = raw_input('Training Set LARGE or SMALL? ')
        print(x)
        if x == 'LARGE':
            print("HELLO")
            train_inputs, train_targets = load_train()
            xIn = True
        elif x == 'SMALL':
            print("hello")
            train_inputs, train_targets = load_train_small()
            xIn = True
        else:
            print("Please input LARGE or SMALL")

    valid_inputs, valid_targets = load_valid()
    test_inputs, test_targets = load_test()

    # N is number of examples; M is the number of features per example.
    N, M = train_inputs.shape
    print("N:", N, "  M:", M)

    # Logistic regression weights
    # Initialize to random weights here.
    weights = np.random.normal(0, 0.001, (M+1, 1))

    # Verify that your logistic function produces the right gradient.
    # diff should be very close to 0.
    run_check_grad(hyperparameters)

    # Begin learning with gradient descent
    logging = np.zeros((hyperparameters['num_iterations'], 5))
    for t in xrange(hyperparameters['num_iterations']):

        # Find the negative log likelihood and its derivatives w.r.t. the weights.
        f, df, predictions = logistic(weights, train_inputs, train_targets, hyperparameters)

        # Evaluate the prediction.
        cross_entropy_train, frac_correct_train = evaluate(train_targets, predictions)

        if np.isnan(f) or np.isinf(f):
            raise ValueError("nan/inf error")

        # update parameters
        weights = weights - hyperparameters['learning_rate'] * df / N

        # Make a prediction on the valid_inputs.
        predictions_valid = logistic_predict(weights, valid_inputs)

        # Evaluate the prediction.
        cross_entropy_valid, frac_correct_valid = evaluate(valid_targets, predictions_valid)

        # print some stats
        print ("ITERATION:{:4d}  TRAIN NLOGL:{:4.2f}  TRAIN CE:{:.6f} "
               "TRAIN FRAC:{:2.2f}  VALID CE:{:.6f}  VALID FRAC:{:2.2f}").format(
                   t+1, f / N, cross_entropy_train, frac_correct_train*100,
                   cross_entropy_valid, frac_correct_valid*100)
        logging[t] = [f / N, cross_entropy_train, frac_correct_train*100, cross_entropy_valid, frac_correct_valid*100]
    return logging
Code example #17
import pandas as pd
import numpy as np
import utils

path_to_data = ''
data, meta = utils.load_train(path_to_data)

objects = meta['object_id'].values

for obj in objects:
    df = data.loc[data['object_id'] == obj]
    arr =  utils.conv_preprocess_data(df, 355feature)
    print pd.DataFrame(arr[0][0])
    break
Code example #18
File: knn_new_script.py Project: ShiyuanLin/csc411
# -*- coding: utf-8 -*-

from utils import load_train, load_valid

from run_knn import run_knn

(train_inputs, train_targets) = load_train()
(valid_inputs, valid_targets) = load_valid()

for k in [1, 3, 5, 7, 9]:
    print run_knn(k, train_inputs, train_targets, valid_inputs)
Code example #19
                         'NAME_HOUSING_TYPE',
                         'OCCUPATION_TYPE',
                         'WEEKDAY_APPR_PROCESS_START',
                         'ORGANIZATION_TYPE',
                         'FONDKAPREMONT_MODE',
                         'HOUSETYPE_MODE',
                         'WALLSMATERIAL_MODE',
                         ]


# =============================================================================
# 
# =============================================================================
skf = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=SEED)

train = utils.load_train(categorical_features+['TARGET']).fillna('na dayo')
test  = utils.load_test(categorical_features).fillna('na dayo')


col = []
cat_comb = list(combinations(categorical_features, 2))
for c1,c2 in cat_comb:
    train[f'{c1}-{c2}'] = train[c1] + train[c2]
    test[f'{c1}-{c2}'] = test[c1] + test[c2]
    col.append( f'{c1}-{c2}' )

# =============================================================================
# cardinality check
# =============================================================================
train['fold'] = 0
for i,(train_index, test_index) in enumerate(skf.split(train, train.TARGET)):
Code example #20
#
# =============================================================================
prev = utils.read_pickles('../data/previous_application')
base = prev[[KEY]].drop_duplicates().set_index(KEY)

gr = prev.groupby(KEY)
gr_app = prev[prev['NAME_CONTRACT_STATUS'] == 'Approved'].groupby(KEY)
gr_ref = prev[prev['NAME_CONTRACT_STATUS'] == 'Refused'].groupby(KEY)
gr_act = prev[prev['active'] == 1].groupby(KEY)
gr_cmp = prev[prev['completed'] == 1].groupby(KEY)

col = [
    'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY',
    'AMT_CREDIT-d-AMT_ANNUITY', 'DAYS_BIRTH'
]
train = utils.load_train([KEY] + col)
test = utils.load_test([KEY] + col)

train.AMT_ANNUITY.fillna(0, inplace=True)
test.AMT_ANNUITY.fillna(0, inplace=True)

train.columns = [KEY] + ['app_' + c for c in train.columns[1:]]
test.columns = [KEY] + ['app_' + c for c in test.columns[1:]]

col_init = train.columns.tolist()

# =============================================================================
# feature
# =============================================================================

# size
Code example #21
"""
Created on Feb 26 2017
Author: Weiping Song
"""
import os, sys
import tensorflow as tf
import numpy as np
import argparse, random

from model import GRU4Rec
from utils import load_train, load_valid

unfold_max = 20
error_during_training = False

train_x, train_y, n_items = load_train(unfold_max)
valid_x, valid_y, _ = load_valid(unfold_max)


class Args():
    is_training = True
    layers = 1
    rnn_size = 100
    n_epochs = 10
    batch_size = 50
    keep_prob = 1
    learning_rate = 0.001
    decay = 0.98
    decay_steps = 2 * 1e3
    sigma = 0.0001
    init_as_normal = False
Code example #22
import numpy as np  # needed below for np.array
import theano
import lasagne
import utils
from lasagne.layers import *
from nolearn.lasagne import NeuralNet
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import cross_validation
from nolearn.lasagne import TrainSplit
from nolearn.lasagne import objective
from lasagne.nonlinearities import softmax
from lasagne.updates import momentum



X, y=utils.load_train()
X_test=utils.load_test()
X=X.reshape([X.shape[0],3,32,32])
y=np.array(y,dtype="int32")
X_test=X_test.reshape([X_test.shape[0],3,32,32])

layers = [
    # layer dealing with the input data
    (InputLayer, {'shape': (None, 3, 32, 32)}),

    # first stage of our convolutional layers

    # second stage of our convolutional layers
    (Conv2DLayer, {'pad':2,'num_filters': 32, 'filter_size': 5,'W':lasagne.init.Normal(std=0.01)}),
    (ParametricRectifierLayer, {'alpha':lasagne.init.Constant(0)}),
    (Pool2DLayer, {'pool_size': 2,'stride':2,'mode':'max'}),
Code example #23
File: main_nn.py Project: Sasha-P/Whats-cooking
        np.reshape(Theta[:hidden_layer_size * (input_layer_size + 1)], (hidden_layer_size, input_layer_size + 1), order='F'))
    Theta2 = np.matrix(
        np.reshape(Theta[hidden_layer_size * (input_layer_size + 1):], (num_labels, hidden_layer_size + 1), order='F'))

    p = fnx.predict(Theta1, Theta2, X)

    precision = 0
    for i in range(len(y)):
        if y[i] == p[i]:
            precision += 1

    print('Training Set Accuracy:', (1.0 * precision) / len(y))

    return Theta1, Theta2


if __name__ == '__main__':
    cuisine_list, ingredients_list, X, y = utl.load_train('number')

    ingredients_count = len(ingredients_list)
    cuisines_count = len(cuisine_list)

    Theta1, Theta2 = train_nn(ingredients_count, ingredients_count//16, cuisines_count, X, y)

    T, ids = utl.load_test(ingredients_list)

    p = fnx.predict(Theta1, Theta2, T)

    utl.save_result('nn', cuisine_list, p, ids)
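The `fnx.predict(Theta1, Theta2, X)` call above is not shown on this page. A minimal NumPy sketch of the usual one-hidden-layer feed-forward prediction such a helper performs (an assumption for illustration; the label indexing convention of the real project may differ):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def predict(Theta1, Theta2, X):
    """Feed-forward pass through one hidden layer; returns the most likely class index per example."""
    m = X.shape[0]
    a1 = np.hstack([np.ones((m, 1)), X])      # input layer plus bias unit
    a2 = sigmoid(a1.dot(Theta1.T))            # hidden layer activations
    a2 = np.hstack([np.ones((m, 1)), a2])     # hidden layer plus bias unit
    a3 = sigmoid(a2.dot(Theta2.T))            # output layer
    return np.argmax(a3, axis=1)              # predicted class index per example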

Code example #24
	merged_df.reset_index(inplace = True)

	if type == 'test':
		result = merged_df[['index', 'hotel_cluster']]
		result.columns =  ['id', 'hotel_cluster']
	elif type == 'train':
		result = merged_df[['index', 'hotel_cluster_y']]
		result.columns =  ['id', 'hotel_cluster']

	return result


#############################################################
####################   train dataset     ####################
#############################################################
train = utils.load_train('group_by')
train_is_booking = train[train.is_booking == 1]
train_is_booking.reset_index(inplace = True)
del train

print 'generate top k hotel clusters with orig_destination_distance model...'
result = gen_top_k_hotel_cluster(train_is_booking, 'orig_destination_distance', 'train')
print 'generate top k hotel clusters with srch_destination_id model...'
result = utils.fill_all_top_5(train_is_booking, result, 'srch_destination_id', 'train')
print 'generate top k hotel clusters with user_id model...'
result = utils.fill_all_top_5(train_is_booking, result, 'user_id', 'train')
print 'generate top k hotel clusters with hotel_market model...'
result = utils.fill_all_top_5(train_is_booking, result, 'hotel_market', 'train')
print 'hotel clusters to ranking features...'
new_result = result.apply(lambda row: hotel_clusters_to_ranking_features(row), axis=1)
new_result.columns = ['_'.join(['hotel_cluster', str(hotel_cluster_id), 'rank']) for hotel_cluster_id in range(100)]
Code example #25
def main():

    # ================
    # time managment #
    # ================

    program_st = time.time()

    # =====================================
    # bert classification logging handler #
    # =====================================
    logging_filename = f"../logs/bertclf_{args.corpus_name}.log"
    logging.basicConfig(level=logging.INFO,
                        filename=logging_filename,
                        filemode="w")
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    formatter = logging.Formatter("%(levelname)s: %(message)s")
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)

    # =======================
    # predefined parameters #
    # =======================

    cv = args.cross_validation
    num_labels = 3

    batch_size = args.batch_size
    epochs = args.epochs
    learning_rate = args.learning_rate
    max_length = args.max_length

    if args.domain_adaption:
        if args.model == "german":
            if args.domain_adaption_alternative_path:
                model_name = '../corpora/domain-adaption/german-alternative/'
            else:
                model_name = '../corpora/domain-adaption/german/'
        elif args.model == "rede":
            if args.domain_adaption_alternative_path:
                model_name = '../corpora/domain-adaption/redewiedergabe-alternative/'
            else:
                model_name = '../corpora/domain-adaption/redewiedergabe/'
        elif args.model == "test":
            model_name = '../corpora/domain-adaption/test/'
        else:
            logging.warning(
                f"Couldn't find a model with the name '{args.model}'.")
    else:
        if args.model == "german":
            model_name = 'bert-base-german-dbmdz-cased'
        elif args.model == "rede":
            model_name = 'redewiedergabe/bert-base-historical-german-rw-cased'
        else:
            logging.warning(
                f"Couldn't find a model with the name '{args.model}'.")

    cv_acc_dict = defaultdict(list)
    year_cv_dict = {}
    poet_cv_dict = {}

    class_name1 = "epoch_year"
    class_name2 = "epoch_poet"
    text_name = "poem"

    false_clf_dict = {class_name1: {}, class_name2: {}}

    # ================
    # classification #
    # ================

    # =======================
    # use GPU, if available #
    # =======================

    if torch.cuda.is_available():
        device = torch.device("cuda")
        logging.info(
            f'There are {torch.cuda.device_count()} GPU(s) available.')
        logging.info(f'Used GPU: {torch.cuda.get_device_name(0)}')
    else:
        logging.info('No GPU available, using the CPU instead.')
        device = torch.device("cpu")

    for i in range(1, cv + 1):
        if args.corpus_name == "poet":
            train_data = utils.load_train("../corpora/train_epochpoet", cv, i,
                                          "epochpoet")
            test_data = pd.read_csv(
                f"../corpora/train_epochpoet/epochpoet{i}.csv")
        elif args.corpus_name == "year":
            train_data = utils.load_train("../corpora/train_epochyear", cv, i,
                                          "epochyear")
            test_data = pd.read_csv(
                f"../corpora/train_epochyear/epochyear{i}.csv")
        elif args.corpus_name == "poeta":
            train_data = utils.load_train(
                "../corpora/train_epochpoetalternative", cv, i,
                "epochpoetalternative")
            test_data = pd.read_csv(
                f"../corpora/train_epochpoetalternative/epochpoetalternative{i}.csv"
            )
        else:
            logging.warning(
                f"Couldn't find a corpus with the name '{args.corpus_name}'.")

        for class_name in [class_name1, class_name2]:

            # tmp lists and result dicts #
            input_ids = []
            attention_masks = []

            texts = train_data[text_name].values
            encoder = LabelEncoder()
            labels = encoder.fit_transform(train_data[class_name].values)

            encoder_mapping = dict(
                zip(encoder.transform(encoder.classes_), encoder.classes_))

            # ==============
            # tokenization #
            # ==============

            tokenizer = BertTokenizer.from_pretrained(model_name,
                                                      do_lower_case=False)

            for sent in texts:
                encoded_dict = tokenizer.encode_plus(
                    sent,
                    add_special_tokens=True,
                    max_length=args.max_length,
                    pad_to_max_length=True,
                    return_attention_mask=True,
                    return_tensors='pt')

                input_ids.append(encoded_dict['input_ids'])
                attention_masks.append(encoded_dict['attention_mask'])

            input_ids = torch.cat(input_ids, dim=0)
            attention_masks = torch.cat(attention_masks, dim=0)
            labels = torch.tensor(labels)

            # =================
            # train val split #
            # =================

            dataset = TensorDataset(input_ids, attention_masks, labels)

            train_size = int(0.9 * len(dataset))
            val_size = len(dataset) - train_size

            train_dataset, val_dataset = random_split(dataset,
                                                      [train_size, val_size])

            # ============
            # DataLoader #
            # ============

            train_dataloader = DataLoader(train_dataset,
                                          sampler=RandomSampler(train_dataset),
                                          batch_size=batch_size)

            val_dataloader = DataLoader(val_dataset,
                                        sampler=SequentialSampler(val_dataset),
                                        batch_size=batch_size)

            # ======== #
            # Training #
            # ======== #
            model = BertForSequenceClassification.from_pretrained(
                model_name,
                num_labels=num_labels,
                output_attentions=False,
                output_hidden_states=False).to(device)  # use the device selected above (CPU fallback when no GPU)

            optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8)

            total_steps = len(train_dataloader) * epochs

            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=0, num_training_steps=total_steps)

            training_stats = []
            total_t0 = time.time()

            validation_losses = {}

            for epoch_i in range(0, epochs):
                print("")
                print('======== Epoch {:} / {:} ========'.format(
                    epoch_i + 1, epochs))
                print('Now Training.')
                t0 = time.time()
                total_train_loss = 0
                model.train()
                for step, batch in enumerate(train_dataloader):
                    if step % 50 == 0 and not step == 0:
                        elapsed = utils.format_time(time.time() - t0)
                        print('Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.
                              format(step, len(train_dataloader), elapsed))

                    b_input_ids = batch[0].to(device)
                    b_input_mask = batch[1].to(device)
                    b_labels = batch[2].to(device)

                    model.zero_grad()

                    loss, logits = model(b_input_ids,
                                         token_type_ids=None,
                                         attention_mask=b_input_mask,
                                         labels=b_labels)

                    total_train_loss += loss.item()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    optimizer.step()
                    scheduler.step()

                # average loss (all batches)
                avg_train_loss = total_train_loss / len(train_dataloader)
                training_time = utils.format_time(time.time() - t0)

                print("")
                print(
                    "  Average training loss: {0:.2f}".format(avg_train_loss))
                print("  Training epoch took: {:}".format(training_time))

                # ========== #
                # Validation #
                # ========== #

                print("")
                print("Now Validating.")

                t0 = time.time()
                model.eval()

                total_eval_accuracy = 0
                total_eval_loss = 0
                nb_eval_steps = 0

                for batch in val_dataloader:

                    b_input_ids = batch[0].to(device)
                    b_input_mask = batch[1].to(device)
                    b_labels = batch[2].to(device)

                    with torch.no_grad():

                        (loss, logits) = model(b_input_ids,
                                               token_type_ids=None,
                                               attention_mask=b_input_mask,
                                               labels=b_labels)

                    # validation loss.
                    total_eval_loss += loss.item()

                    # Move logits and labels to CPU
                    logits = logits.detach().cpu().numpy()
                    label_ids = b_labels.to('cpu').numpy()

                    total_eval_accuracy += utils.flat_f1(label_ids, logits)

                # final validation accuracy / loss
                avg_val_accuracy = total_eval_accuracy / len(val_dataloader)
                print(
                    "  Validation Accuracy: {0:.2f}".format(avg_val_accuracy))

                avg_val_loss = total_eval_loss / len(val_dataloader)
                validation_time = utils.format_time(time.time() - t0)
                print("  Validation Loss: {0:.2f}".format(avg_val_loss))
                print("  Validation took: {:}".format(validation_time))

                training_stats.append({
                    'epoch': epoch_i + 1,
                    'train_loss': avg_train_loss,
                    'val_loss': avg_val_loss,
                    'val_acc': avg_val_accuracy,
                    'train_time': training_time,
                    'val_time': validation_time
                })

                current_epoch = f"epoch{epoch_i + 1}"
                validation_losses[current_epoch] = avg_val_loss

                # ================
                # Early Stopping #
                # ================

                if utils.early_stopping(validation_losses, patience=2):
                    logging.info(
                        f"Stopping epoch run early (Epoch {epoch_i}).")
                    break

            logging.info(f"Training for {class_name} done.")
            logging.info("Training took {:} (h:mm:ss) \n".format(
                utils.format_time(time.time() - total_t0)))
            print("--------------------------------\n")

            # =========
            # Testing #
            # =========

            test_input_ids = []
            test_attention_masks = []

            X_test = test_data[text_name].values
            y_test = LabelEncoder().fit_transform(test_data[class_name].values)

            for sent in X_test:
                encoded_dict = tokenizer.encode_plus(
                    sent,
                    add_special_tokens=True,
                    max_length=args.max_length,
                    pad_to_max_length=True,
                    return_attention_mask=True,
                    return_tensors='pt')

                test_input_ids.append(encoded_dict['input_ids'])

                test_attention_masks.append(encoded_dict['attention_mask'])

            test_input_ids = torch.cat(test_input_ids, dim=0)
            test_attention_masks = torch.cat(test_attention_masks, dim=0)
            labels = torch.tensor(y_test)

            prediction_data = TensorDataset(test_input_ids,
                                            test_attention_masks, labels)
            prediction_sampler = SequentialSampler(prediction_data)
            prediction_dataloader = DataLoader(prediction_data,
                                               sampler=prediction_sampler,
                                               batch_size=batch_size)

            model.eval()

            predictions, true_labels = [], []

            for batch in prediction_dataloader:
                # Add batch to GPU
                batch = tuple(t.to(device) for t in batch)

                # Unpack the inputs from our dataloader
                b_input_ids, b_input_mask, b_labels = batch

                with torch.no_grad():
                    outputs = model(b_input_ids,
                                    token_type_ids=None,
                                    attention_mask=b_input_mask)

                logits = outputs[0]

                # Move logits and labels to CPU
                logits = logits.detach().cpu().numpy()
                label_ids = b_labels.to('cpu').numpy()

                # Store predictions and true labels
                predictions.append(logits)
                true_labels.append(label_ids)

            flat_predictions = np.concatenate(predictions, axis=0)
            flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
            flat_true_labels = np.concatenate(true_labels, axis=0)

            if args.save_misclassification:
                logging.info("Saving misclassifications.")
                test_pid = test_data["pid"].values
                false_classifications = {
                    "Jahrhundertwende": {
                        "Naturalismus": [],
                        "Expressionismus": []
                    },
                    "Naturalismus": {
                        "Jahrhundertwende": [],
                        "Expressionismus": []
                    },
                    "Expressionismus": {
                        "Naturalismus": [],
                        "Jahrhundertwende": []
                    }
                }

                for idx, (t, p) in enumerate(
                        zip(flat_true_labels, flat_predictions)):
                    if t != p:
                        false_classifications[encoder_mapping[t]][
                            encoder_mapping[p]].append(int(test_pid[idx]))

                false_clf_dict[class_name][i] = false_classifications

            test_score = f1_score(flat_true_labels,
                                  flat_predictions,
                                  average="macro")
            classes = test_data[class_name].drop_duplicates().tolist()

            if args.save_confusion_matrices:
                logging.info("Saving confusion matrices.")
                cm = confusion_matrix(flat_true_labels, flat_predictions)
                cm_df = pd.DataFrame(cm, index=classes, columns=classes)

                if args.domain_adaption:
                    cm_name = f"{args.corpus_name}c_{class_name}_da_{args.model}"
                else:
                    cm_name = f"{args.corpus_name}c_{class_name}_{args.model}"

                if args.save_date:
                    cm_name += f"({datetime.now():%d.%m.%y}_{datetime.now():%H:%M})"

                cm_df.to_csv(
                    f"../results/bert/confusion_matrices/cm{i}_{cm_name}.csv")

            stats = pd.DataFrame(data=training_stats)
            cv_acc_dict[class_name].append(test_score)

            if class_name == "epoch_year":
                year_cv_dict[f"cv{i}"] = training_stats
            elif class_name == "epoch_poet":
                poet_cv_dict[f"cv{i}"] = training_stats
            else:
                logging.info(f"The class {class_name} does not exist.")

            logging.info(f"Testing for {class_name} done.")
            logging.info(f"CV Test F1-Score: {test_score} (run: {i}/{cv}).")
            logging.info("Testing took {:} (h:mm:ss) \n".format(
                utils.format_time(time.time() - total_t0)))
            print("--------------------------------\n")

        logging.info(f"Training for run {i}/{cv} completed.")
        logging.info("Training run took {:} (h:mm:ss)".format(
            utils.format_time(time.time() - total_t0)))
        print("________________________________")
        print("________________________________\n")

    # ================
    # saving results #
    # ================

    result_path = "../results/bert/"
    logging.info(f"Writing results to '{result_path}'.")

    if args.domain_adaption:
        output_name = f"{args.corpus_name}c_da_{args.model}"
    else:
        output_name = f"{args.corpus_name}c_{args.model}"

    if args.save_date:
        output_name += f"({datetime.now():%d.%m.%y}_{datetime.now():%H:%M})"

    with open(f'{result_path}cv_{output_name}.json', 'w') as f:
        json.dump(cv_acc_dict, f)

    with open(f'{result_path}eyear_{output_name}.json', 'w') as f:
        json.dump(year_cv_dict, f)

    with open(f'{result_path}epoet_{output_name}.json', 'w') as f:
        json.dump(poet_cv_dict, f)

    if args.save_misclassification:
        mis_output_path = f'{result_path}/misclassifications/pid_{output_name}'
        with open(f'{mis_output_path}.json', 'w') as f:
            json.dump(false_clf_dict, f)

    program_duration = float(time.time() - program_st)
    logging.info(f"Total duration: {int(program_duration)/60} minute(s).")
Code example #26
        p = (10**i)
        for c in tqdm(df.columns):
            s = (df[c] * p * 2 + 1) // 2 / p  # round
            di = s.value_counts().to_dict()
            feature[f'{PREF}_{c}_r{i}'] = s.map(di)

    tr_ = feature.iloc[:200000]
    output(tr_, 'train')

    te_ = feature.iloc[200000:].reset_index(drop=True)
    output(te_, 'test')

    return


# =============================================================================
# main
# =============================================================================
if __name__ == "__main__":
    utils.start(__file__)

    tr = utils.load_train().drop(['ID_code', 'target'], axis=1)
    te = utils.load_test().drop(['ID_code'], axis=1)
    te = te.drop(np.load('../data/fake_index.npy'))

    trte = pd.concat([tr, te], ignore_index=True)[tr.columns]

    fe(trte)

    utils.end(__file__)
Code example #27
File: sample.py Project: KazukiOnodera/PLAsTiCC-2018
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 16 19:55:54 2018

@author: kazuki.onodera
"""

import numpy as np
import pandas as pd
import utils, os

os.system('rm -rf ../sample')
os.system('mkdir ../sample')

tr = utils.load_train()
log = pd.read_feather('../data/train_log.f')

oids = tr.sample(999).object_id.tolist()

tr_ = tr[tr.object_id.isin(oids)]
log_ = log[log.object_id.isin(oids)]

tr_.to_csv('../sample/tr.csv', index=False)
log_.to_csv('../sample/tr_log.csv', index=False)
Code example #28
os.system(f'rm ../data/t*_{PREF}*')
os.system(f'rm ../feature/t*_{PREF}*')

#def mk_feats(df):
#    df['hostgal_specz-m-hostgal_photoz'] = df['hostgal_specz'] - df['hostgal_photoz']
#    df['hostgal_specz-d-hostgal_photoz'] = df['hostgal_specz'] / df['hostgal_photoz']
#    df['hostgal_photoz-d-hostgal_photoz_err'] = df['hostgal_photoz'] / df['hostgal_photoz_err']
#    df['hostgal_specz-d-hostgal_photoz_err'] = df['hostgal_specz'] / df['hostgal_photoz_err']
#    return

# =============================================================================
# main
# =============================================================================
if __name__ == "__main__":
    utils.start(__file__)
    
    train = utils.load_train().drop(['object_id', 'target'], axis=1)
    train.add_prefix(PREF+'_').to_pickle(f'../data/train_{PREF}.pkl')
    train_aug = pd.read_pickle('../data/train_aug.pkl').drop(['object_id', 'object_id_bk', 'target'], axis=1)
    train_aug.add_prefix(PREF+'_').to_pickle(f'../data/train_aug_{PREF}.pkl')
    
    
    test  = utils.load_test().drop(['object_id'], axis=1)
    test.loc[test.hostgal_photoz==0, 'hostgal_specz'] = 0
    test = test.add_prefix(PREF+'_')
    test.to_pickle(f'../data/test_{PREF}.pkl')
    utils.save_test_features(test)
    
    utils.end(__file__)

Code example #29
File: main_tf_gd.py Project: Sasha-P/Whats-cooking
import utils as utl
import tensorflow as tf
import numpy as np


cuisine_list, ingredients_list, xs, ys = utl.load_train('vector')
ts, ids = utl.load_test(ingredients_list)

cuisine_count = len(cuisine_list)
ingredients_count = len(ingredients_list)

x = tf.placeholder(tf.float32, [None, ingredients_count])
W = tf.Variable(tf.zeros([ingredients_count, cuisine_count]))
b = tf.Variable(tf.zeros([cuisine_count]))

y = tf.nn.softmax(tf.matmul(x, W) + b)

y_ = tf.placeholder(tf.float32, [None, cuisine_count])

t = tf.placeholder(tf.float32, [None, ingredients_count])

p = tf.nn.softmax(tf.matmul(t, W) + b)

cross_entropy = -tf.reduce_sum(y_*tf.log(y))
train_step = tf.train.GradientDescentOptimizer(0.001).minimize(cross_entropy)
# train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

init = tf.initialize_all_variables()

sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
sess.run(init)
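As a side note, `-tf.reduce_sum(y_ * tf.log(y))` can produce NaNs once a softmax output underflows to 0. A sketch of the numerically safer TF1 formulation, reusing the `x`, `W`, `b` and `y_` tensors defined above (an alternative, not the original code):

logits = tf.matmul(x, W) + b
cross_entropy = tf.reduce_sum(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=logits))
train_step = tf.train.GradientDescentOptimizer(0.001).minimize(cross_entropy)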
Code example #30
File: stack.py Project: glouppe/kaggle-higgs
import glob  # needed by load_predictions below

import numpy as np

from functools import partial

from sklearn.cross_validation import train_test_split
from sklearn.grid_search import ParameterGrid
from sklearn.externals.joblib import Parallel, delayed

from utils import load_train, load_test
from utils import find_threshold
from utils import rescale, rebalance
from utils import make_submission

def load_predictions(pattern):
    return np.column_stack([np.load(f) for f in sorted(glob.glob(pattern))])

# Load training data
X, y, w, _ = load_train()

# Tune stacker
print "Optimize parameters in 5-CV..."

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
#Classifier = partial(BaggingClassifier, base_estimator=GradientBoostingClassifier(n_estimators=500, learning_rate=0.025, max_depth=4, max_features=None, min_samples_leaf=250))
#grid = ParameterGrid({"n_estimators": [24], "max_features": [1.0, 0.9, 0.8], "n_jobs": [24]})

Classifier = GradientBoostingClassifier
grid = ParameterGrid({"n_estimators": [500], "max_features": [None, 0.95, 0.9], "learning_rate": [0.0225, 0.025, 0.0275], "max_depth": [4], "min_samples_leaf": [250]})

n_jobs = 24

def _parallel_eval(Classifier, params, X, y, w, n_repeat=5, verbose=1):
Code example #31
    for suf in suffix_list:
        col = [c for c in col_init if c.endswith(suf)]
        df[f'{suf}_min'] = df[col].min(1)
        df[f'{suf}_mean'] = df[col].mean(1)
        df[f'{suf}_max'] = df[col].max(1)
        df[f'{suf}_std'] = df[col].std(1)
    return

# =============================================================================
# main
# =============================================================================
if __name__ == "__main__":
    utils.start(__file__)
    
    # train
    tr = utils.load_train(['object_id'])
    
    df = pd.read_pickle('../FROM_MYTEAM/LCfit_feature_allSN_r_train_v3_20181215.pkl.gz')
    df = pd.merge(tr, df, on='object_id', how='left')
    df.reset_index(drop=True, inplace=True)
    get_feature(df)
    
    del df['object_id']
    df.add_prefix(PREF+'_').to_pickle(f'../data/train_{PREF}.pkl')
    
    # test
    te = utils.load_test(['object_id'])
    df = pd.read_pickle('../FROM_MYTEAM/LCfit_feature_allSN_r_test_v3_20181215.pkl.gz')
    df = pd.merge(te, df, on='object_id', how='left')
    df.reset_index(drop=True, inplace=True)
    get_feature(df)
Code example #32
import utils

#path_to_data = '/courses/cs342/Assignment2/'
path_to_data = ''

train, train_meta = utils.load_train(path_to_data)

g_train, eg_train, g_meta, eg_meta, g_target, eg_target = utils.gal_split_data(
    train, train_meta, True)

g_features = utils.feature_engineering(g_train, g_meta)
g_wtable, g_labels, g_classes, g_target_map = utils.preprocess_target(g_target)
g_features = utils.standardize_data(g_features)
utils.train_mlp(g_features, g_wtable, g_labels, g_classes, g_target_map, True)

eg_features = utils.feature_engineering(eg_train, eg_meta)
eg_wtable, eg_labels, eg_classes, eg_target_map = utils.preprocess_target(
    eg_target)
eg_features = utils.standardize_data(eg_features)
utils.train_mlp(eg_features, eg_wtable, eg_labels, eg_classes, eg_target_map,
                False)
Code example #33
def classify(train, examples):
    cv_res = {
     "PP": 0,
     "PN": 0,
     "NP": 0,
     "NN": 0,
     "contradictory": 0,
    }
    plus = train["plus"]
    minus = train["minus"]
    l = len(examples)
    i = 0
    for elem in examples:
        i += 1
        print "%i/%i" % (i, l)
        result = check_hypothesis(plus, minus, elem)
        cv_res[result] += 1
    return cv_res


if __name__ == "__main__":

    index = int(sys.argv[1])

    train = utils.load_train(index)
    test = utils.load_test(index)

    res = classify(train, test)
    print res
    print utils.summary(res)
Code example #34
	return site_name_encoding, posa_continent_encoding, user_location_country_encoding, user_location_region_encoding, \
	channel_encoding, srch_destination_type_id_encoding, hotel_continent_encoding, hotel_country_encoding

def fill_na_features(dataset):
	"""
    Fill the remaining missing values
    :param dataset: train/test dataset
	"""
	dataset.fillna(-1, inplace=True)


#############################################################
####################   train dataset     ####################
#############################################################
train = utils.load_train('baseline')

train_is_booking = train[train.is_booking == 1]
train_is_booking.reset_index(inplace = True)
train_is_booking.is_copy = False
del train

print 'generate train time features...'
time_features_enricher(train_is_booking)

print 'generate train one hot encoding features...'
site_name_encoding, posa_continent_encoding, user_location_country_encoding, user_location_region_encoding, \
	channel_encoding, srch_destination_type_id_encoding, hotel_continent_encoding, hotel_country_encoding = \
	gen_all_top_one_hot_encoding_columns(train_is_booking)

print 'fill train na features...'
Code example #35
# =============================================================================
# NAME_CONTRACT_STATUS
# =============================================================================

ct1 = pd.crosstab(pos[KEY], pos['NAME_CONTRACT_STATUS']).add_suffix('_cnt')
ct2 = pd.crosstab(pos[KEY], pos['NAME_CONTRACT_STATUS'],
                  normalize='index').add_suffix('_nrm')

base = pd.concat([base, ct1, ct2], axis=1)

# TODO: DPD

# =============================================================================
# merge
# =============================================================================
base.reset_index(inplace=True)

train = utils.load_train([KEY])

test = utils.load_test([KEY])

train_ = pd.merge(train, base, on=KEY, how='left').drop(KEY, axis=1)
utils.to_feature(train_.add_prefix(PREF), '../feature/train')

test_ = pd.merge(test, base, on=KEY, how='left').drop(KEY, axis=1)
utils.to_feature(test_.add_prefix(PREF), '../feature/test')

#==============================================================================
utils.end(__file__)
Code example #37
    """

    dist = l2_distance(valid_data.T, train_data.T)
    nearest = np.argsort(dist, axis=1)[:,:k]

    train_labels = train_labels.reshape(-1)
    valid_labels = train_labels[nearest]

    # note this only works for binary labels
    valid_labels = (np.mean(valid_labels, axis=1) >= 0.5).astype(np.int)
    valid_labels = valid_labels.reshape(-1,1)

    return valid_labels

if __name__ == '__main__':
    train_inputs, train_targets = utils.load_train()
    valid_inputs, valid_targets = utils.load_valid()
    test_inputs, test_targets = utils.load_test()
    
    set_k = [1,3,5,7,9]
    
    accuracy_valid_output = {}
    accuracy_test_output = {}
   
    length_valid = len(valid_inputs)
    length_test = len(test_inputs)
    
    for k in set_k:
        valid_outputs = run_knn(k, train_inputs, train_targets, valid_inputs)
        test_outputs =  run_knn(k, train_inputs, train_targets, test_inputs)
Code example #38
"""
Created on Sun Jun  3 05:56:27 2018

@author: Kazuki
"""

import numpy as np
import pandas as pd
#from sklearn.preprocessing import LabelEncoder
import utils
utils.start(__file__)
#==============================================================================

PREF = 'app_002_'

train = utils.load_train().drop(['SK_ID_CURR', 'TARGET'], axis=1)
test = utils.load_test().drop(['SK_ID_CURR'], axis=1)

col_init = train.columns

df = pd.concat([train, test], ignore_index=True)
# =============================================================================
# features
# =============================================================================

df['AMT_CREDIT-by-AMT_INCOME_TOTAL'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']
df['AMT_INCOME_TOTAL-AMT_CREDIT'] = df['AMT_INCOME_TOTAL'] - df['AMT_CREDIT']

df['AMT_ANNUITY-by-AMT_INCOME_TOTAL'] = df['AMT_ANNUITY'] / df[
    'AMT_INCOME_TOTAL']
df['AMT_INCOME_TOTAL-AMT_ANNUITY'] = df['AMT_INCOME_TOTAL'] - df['AMT_ANNUITY']
Code example #39
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
import utils as utl

cuisine_list, ingredients_list, x, y = utl.load_train('number')
classifier = OneVsRestClassifier(LogisticRegression(C=1e6)).fit(x, y)
p = classifier.predict(x)

precision = 0
for i in range(len(y)):
    if y[i] == p[i]:
        precision += 1
accuracy = (1.0 * precision) / len(y)

print('Training Set Accuracy:', accuracy)

t, ids = utl.load_test(ingredients_list)
p = classifier.predict(t)
utl.save_result('sk_lr', cuisine_list, p, ids, 'number')
Code example #40
#hyper_parameter  #
#######################################################################

import utils
from run_knn import run_knn 
import plot_digits
import numpy as np
import matplotlib.pyplot as plt



if __name__ == "__main__":
    

    #loading the dataset
    train_data, train_labels = utils.load_train()

    #loading the validation set
    valid_data,valid_labels = utils.load_valid()


    # vector of each k
    K = np.array([1,3,5,7,9])
    
    #dictionnay result
    results={}

    for k in K:
        
        #prediction 
        prediction = run_knn(k,train_data,train_labels,valid_data)