Example #1
def run_random(data_file, rs, outfile1, outfile2):
    print('running RANDOM model...')
    day_batcher = DayBatcher(data_file, skiprow=1, delimiter=' ')
    mat = day_batcher.next_batch()
    random.seed(rs)
    rst = []
    while mat is not None:
        if mat.shape[1] == 13:
            # use compact10d
            datadict = {
                'features': mat[:, 3:],
                'red': mat[:, 2],
                'user': mat[:, 1],
                'day': mat[:, 0]
            }
        else:
            # use all_fixed
            datadict = {
                'features': mat[:, 14:],
                'red': mat[:, 13],
                'user': mat[:, 1],
                'day': mat[:, 0]
            }
        anomaly_scores = [random.random() for x in datadict['features']]
        for day, user, red, score in zip(datadict['day'], datadict['user'],
                                         datadict['red'], anomaly_scores):
            rst.append((user, day, score, red))
        mat = day_batcher.next_batch()
    train_rst, test_rst = split_train_test(rst)
    save_rst(train_rst, outfile1)
    save_rst(test_rst, outfile2)
    eval_cr(test_rst, 'random')
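The helpers called at the end of this example (`split_train_test(rst)` and `save_rst`) are not shown on this page. Below is a minimal sketch of stand-ins consistent with how they are used here, assuming a plain ratio split over the (user, day, score, red) tuples and a space-separated output format (both are assumptions, not the repository's actual code):

def split_train_test(rst, train_fraction=0.8):
    """Hypothetical stand-in: split the (user, day, score, red) tuples by position."""
    cut = int(len(rst) * train_fraction)
    return rst[:cut], rst[cut:]

def save_rst(rst, outfile):
    """Hypothetical stand-in: write one 'user day score red' row per tuple."""
    with open(outfile, 'w') as f:
        for user, day, score, red in rst:
            f.write('{} {} {} {}\n'.format(user, day, score, red))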
Example #2
    def run_label(self, label):
        start_time = time.time()
        print(start_time, 'Working on label', label, file=sys.stderr)
        print('Working on label', label)
        train_fn = os.path.join(self.top_dir, 'train',
                                'data_%d_train.npy' % (label, ))
        test_fn = os.path.join(self.top_dir, 'test',
                               'data_%d_test.npy' % (label, ))
        model_fn = os.path.join(self.top_dir, 'model',
                                'model_%d.pkl' % (label, ))
        result_fn = os.path.join(self.top_dir, 'results',
                                 'result_%d_%d.npy' % (label, label))
        _mkdir(train_fn)
        _mkdir(test_fn)
        _mkdir(model_fn)

        train, test, mask = util.split_train_test(
            self.data[self.dl.get_mask_for_label(label)], 0.80)
        np.save(train_fn, train)
        np.save(test_fn, test)

        hyper_params_aa = self.base_hyper_params_aa.copy()
        hyper_params_aa['input_fn'] = train_fn
        hyper_params_aa['output_fn'] = model_fn
        #hyper_params_aa['max_epochs'] = train.shape[0]/580000+40
        hyper_params_aa['max_epochs'] = 300
        config = self.aa_yaml % hyper_params_aa

        train = yaml_parse.load(config)
        train.main_loop()
        run_ae(model_fn, test_fn, result_fn)
        print('time', time.time() - start_time, 'model', model_fn,
              'result', result_fn, 'train', train)
Example #3
    def run_label(self, label):
        start_time = time.time()
        print(start_time, 'Working on label', label, file=sys.stderr)
        print('Working on label', label)
        train_fn = os.path.join(self.top_dir, 'train', 'data_%d_train.npy' % (label, ))
        test_fn = os.path.join(self.top_dir, 'test', 'data_%d_test.npy' % (label, ))
        model_fn = os.path.join(self.top_dir, 'model', 'model_%d.pkl' % (label, ))
        result_fn = os.path.join(self.top_dir, 'results', 'result_%d_%d.npy' % (label, label))
        _mkdir(train_fn)
        _mkdir(test_fn)
        _mkdir(model_fn)

        train, test, mask = util.split_train_test(self.data[self.dl.get_mask_for_label(label)], 0.80)
        np.save(train_fn, train)
        np.save(test_fn, test)

        hyper_params_aa = self.base_hyper_params_aa.copy()
        hyper_params_aa['input_fn'] = train_fn
        hyper_params_aa['output_fn'] = model_fn
        #hyper_params_aa['max_epochs'] = train.shape[0]/580000+40
        hyper_params_aa['max_epochs'] = 300
        config = self.aa_yaml % hyper_params_aa

        train = yaml_parse.load(config)
        train.main_loop()
        run_ae(model_fn, test_fn, result_fn)
        print('time', time.time() - start_time, 'model', model_fn,
              'result', result_fn, 'train', train)
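Both `run_label` variants build their training configuration with `self.aa_yaml % hyper_params_aa`, i.e. %-style named substitution into a pylearn2 YAML template string that is not shown on this page. A minimal illustration of just that substitution step (the template body and placeholder names are assumptions):

# Hypothetical stand-in for self.aa_yaml: a template with %(name)s placeholders.
aa_yaml = (
    'input_fn: %(input_fn)s\n'
    'output_fn: %(output_fn)s\n'
    'max_epochs: %(max_epochs)d\n'
)
hyper_params_aa = {
    'input_fn': 'data_0_train.npy',
    'output_fn': 'model_0.pkl',
    'max_epochs': 300,
}
config = aa_yaml % hyper_params_aa  # this string is what yaml_parse.load() receives
print(config)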
Example #4
def run_pca(data_file, rs, n_components, outfile1, outfile2):
    """Wrapper to run PCA model.

    Parameters
    ----------
    data_file : str
        filepath of data file
    rs : int
        random seed
    n_components : int
        PCA parameter
    outfile1 : str
        filepath of train output file to be generated
    outfile2 : str
        filepath of test output file to be generated
    """
    print('running PCA with n_components={}'.format(n_components))
    day_batcher = DayBatcher(data_file, skiprow=1, delimiter=' ')
    mat = day_batcher.next_batch()
    rst = []
    while mat is not None:
        if mat.shape[1] == 13:
            # use compact10d
            datadict = {
                'features': mat[:, 3:],
                'red': mat[:, 2],
                'user': mat[:, 1],
                'day': mat[:, 0]
            }
        else:
            # use all_fixed
            datadict = {
                'features': mat[:, 14:],
                'red': mat[:, 13],
                'user': mat[:, 1],
                'day': mat[:, 0]
            }
        batch = scale(datadict['features'])
        pca = PCA(n_components=n_components, random_state=rs)
        pca.fit(batch)
        data_reduced = np.dot(batch, pca.components_.T)  # pca transform
        data_original = np.dot(data_reduced,
                               pca.components_)  # inverse_transform
        pointloss = np.mean(np.square(batch - data_original), axis=1)
        loss = np.mean(pointloss)
        for d, u, t, l in zip(datadict['day'].tolist(),
                               datadict['user'].tolist(),
                               datadict['red'].tolist(),
                               pointloss.flatten().tolist()):
            rst.append((u, d, l, t))
        mat = day_batcher.next_batch()
    train_rst, test_rst = split_train_test(rst)
    save_rst(train_rst, outfile1)
    save_rst(test_rst, outfile2)
    eval_cr(test_rst, 'pca')
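Because `scale` centers every column to zero mean, the manual projection `np.dot(batch, pca.components_.T)` and its reverse match scikit-learn's own `transform`/`inverse_transform`. A quick self-contained check on synthetic data (not data from the original repository):

import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

rng = np.random.RandomState(0)
batch = scale(rng.rand(100, 10))              # zero-mean, unit-variance columns

pca = PCA(n_components=3, random_state=0).fit(batch)

manual = np.dot(batch, pca.components_.T)     # projection as in run_pca
builtin = pca.transform(batch)                # library equivalent
assert np.allclose(manual, builtin)

recon_manual = np.dot(manual, pca.components_)    # manual inverse transform
recon_builtin = pca.inverse_transform(builtin)    # adds back pca.mean_ (~0 after scale)
assert np.allclose(recon_manual, recon_builtin)

pointloss = np.mean(np.square(batch - recon_manual), axis=1)  # per-row reconstruction error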
Example #5
def seperate_train_test(data, labels, output_dir):
    train_fn = os.path.join(output_dir, 'train.npy')
    train_labels_fn = os.path.join(output_dir, 'train_labels.npy')
    test_fn = os.path.join(output_dir, 'test.npy')
    test_labels_fn = os.path.join(output_dir, 'test_labels.npy')

    train, test, mask = util.split_train_test(data)

    np.save(train_fn, train)
    np.save(train_labels_fn, labels[mask])
    np.save(test_fn, test)
    np.save(test_labels_fn, labels[~mask])

    return train_fn
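This example expects a `split_train_test` variant that also returns the boolean mask used for the split, so the labels can be partitioned with `labels[mask]` and `labels[~mask]`. A hypothetical sketch consistent with that usage (the random-mask rule and the default fraction are assumptions):

import numpy as np

def split_train_test(data, train_fraction=0.8, seed=0):
    """Hypothetical stand-in returning (train, test, mask); mask is True for
    rows assigned to the training split."""
    rng = np.random.RandomState(seed)
    mask = rng.rand(data.shape[0]) < train_fraction
    return data[mask], data[~mask], mask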
Example #6
    def find_best_k(self):
        folds = self.trainset.n_folds(10)
        all_results = {}
        for o in range(0, 10):
            test, train = split_train_test(folds, o)
            all_results = self.addDictionaries(
                all_results,
                self.calculate_best_precision(train, test, self.k_max))
        all_results = sort_dict(all_results, 1)
        best_result = all_results[0][1]
        all_results.reverse()
        for k in all_results:
            if k[1] == best_result:
                return k[0]
        return -1
Example #7
    def find_global_accuracies(self):
        scores = {}
        ten_folds = self.dataset.n_folds(10)
        k_min = self.k_min
        k_max = self.k_max
        for i in range(k_min, k_max):
            global_accuracy = 0
            for fold in range(0, len(ten_folds)):
                test, train = util.split_train_test(ten_folds, fold)
                temp_knn = KNeighborsClassifier(n_neighbors=i, algorithm='brute')
                temp_knn.fit(train.get_X(), train.get_y())
                score = temp_knn.score(test.get_X(), test.get_y())
                global_accuracy += float(score)
            global_accuracy /= 10
            scores[i] = global_accuracy
        self.scores = scores
Example #8
def run_svm(data_file, rs, nu, kernel, gamma, shrink, outfile1, outfile2):
    # load (or build) the per-event features, then fit and score an SVM on each day's batch in parallel with Spark
    print('running SVM with nu={}, kernel={}, shrink={}'.format(
        nu, kernel, shrink))
    try:
        # load cached binary features if available
        feat = load_features_bin(data_file)
    except Exception:
        # otherwise parse the text file and cache it as .npy for next time
        feat = load_features_txt(data_file)
        npyfile = data_file[:-4] + '.npy'
        np.save(npyfile, feat)
    feat = group_by_day(feat)
    rdd_feat = sc.parallelize(feat, len(feat))
    rst = rdd_feat.flatMap(partial(rdd_svm, nu, kernel, gamma, shrink,
                                   rs)).collect()
    train_rst, test_rst = split_train_test(rst)
    save_rst(train_rst, outfile1)
    save_rst(test_rst, outfile2)
    eval_cr(test_rst, 'svm')
Example #9
    def find_global_accuracies(self):
        scores = {}
        ten_folds = self.samples.n_folds(10)
        if self.k_max - self.k_min == 0:
            return 1
        for i in range(self.k_min, self.k_max):
            global_accuracy = 0
            for fold in range(0, len(ten_folds)):
                test, train = util.split_train_test(ten_folds, fold)
                temp_knn = KNeighborsClassifier(n_neighbors=i,
                                                algorithm='brute')
                temp_knn.fit(train.get_X(), train.get_y())
                score = temp_knn.score(test.get_X(), test.get_y())
                global_accuracy += float(score)
            global_accuracy /= 10
            scores[i] = global_accuracy
        result = util.sort_dict(scores, 1)

        return result[0][0]
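The explicit 10-fold loop over candidate k values in the two examples above mirrors what scikit-learn's built-in cross-validation utilities do. A compact equivalent on synthetic data (the data generator and the k range are illustration-only assumptions):

from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Synthetic stand-in for the samples wrapped by self.samples.
X, y = make_classification(n_samples=500, n_features=10, random_state=0)

k_min, k_max = 1, 15  # assumed search range
search = GridSearchCV(KNeighborsClassifier(algorithm='brute'),
                      {'n_neighbors': list(range(k_min, k_max))},
                      cv=10, scoring='accuracy')
search.fit(X, y)
print('best k:', search.best_params_['n_neighbors'],
      'mean accuracy:', search.best_score_)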
Example #10
def run_iso_forest(data_file, rs, n_estimators, max_samples, contamination,
                   max_features, bootstrap, outfile1, outfile2):
    # load (or build) the per-event features, then fit and score an Isolation Forest on each day's batch in parallel with Spark
    print(
        'running Isolation Forest with n_estimators={}, max_samples={}, contamination={}, max_features={}, bootstrap={}'
        .format(n_estimators, max_samples, contamination, max_features,
                bootstrap))
    try:
        # load cached binary features if available
        feat = load_features_bin(data_file)
    except Exception:
        # otherwise parse the text file and cache it as .npy for next time
        feat = load_features_txt(data_file)
        npyfile = data_file[:-4] + '.npy'
        np.save(npyfile, feat)
    feat = group_by_day(feat)
    rdd_feat = sc.parallelize(feat, len(feat))
    rst = rdd_feat.flatMap(
        partial(rdd_iso_forest, n_estimators, max_samples, contamination,
                max_features, bootstrap)).collect()
    train_rst, test_rst = split_train_test(rst)
    save_rst(train_rst, outfile1)
    save_rst(test_rst, outfile2)
    eval_cr(test_rst, 'iso-forest')
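`rdd_iso_forest` itself is not shown on this page. A hypothetical sketch of such a per-day worker, assuming each RDD element is one day's matrix in the compact10d layout used by the earlier examples (day, user, red flag, then features):

from sklearn.ensemble import IsolationForest

def rdd_iso_forest(n_estimators, max_samples, contamination,
                   max_features, bootstrap, day_mat):
    """Hypothetical per-day worker: fit an Isolation Forest on one day's
    features and return (user, day, anomaly_score, red) tuples."""
    day, user, red = day_mat[:, 0], day_mat[:, 1], day_mat[:, 2]
    features = day_mat[:, 3:]
    clf = IsolationForest(n_estimators=n_estimators,
                          max_samples=max_samples,
                          contamination=contamination,
                          max_features=max_features,
                          bootstrap=bootstrap)
    clf.fit(features)
    scores = -clf.score_samples(features)  # higher means more anomalous
    return list(zip(user, day, scores, red))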
Example #11
    mm2csr(A, '/tmp/train.mat')
    mm2csr(useritem_featureitem, '/tmp/train_feature.mat')
    C = tsv_to_matrix(test_file)
    mm2csr(C, '/tmp/test.mat')
    """

    W = sslim_train(A, B)

    recommendations = slim_recommender(A, W)

    compute_precision(recommendations, test_file)


if __name__ == "__main__":
    train_file, test_file = split_train_test(
        "data/cidades_categorias/100_cidades_minimas/categorias_usuarios_cidades.tsv"
    )
    import pdb

    pdb.set_trace()
    main(
        "data/cidades/100_without_stemming_less_outliers/usuarios_cidades_train.tsv",
        "data/cidades/100_without_stemming_less_outliers/palavras_cidades.tsv",
        "data/cidades/100_without_stemming_less_outliers/usuarios_cidades_test.tsv",
    )
    """
    main('data/atracoes/10/usuarios_atracoes_train.tsv',
         'data/atracoes/10/palavras_atracoes.tsv',
         'data/atracoes/10/usuarios_atracoes_test.tsv')
    """
Example #12
    10: {
        'learning_rate': 0.03,
        'n_estimators': 700,
        'max_depth': 2,
        'min_child_weight': 19,
        'gamma': 0.67,
        'subsample': 0.7,
        'colsample_bytree': 1.0,
        'colsample_bylevel': 0.7,
        'colsample_bynode': 1.0,
    },
}

para_train, attr_train, qual_train, para_test, attr_test, qual_test \
    = util.split_train_test(train, test_rate=0,
                            para_range=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                            attr_range=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

attr_pred = pd.DataFrame(data={
    f'Attribute{i}': np.empty(shape=(6000, ))
    for i in [4, 5, 6, 7, 8, 9, 10]
})

attr_train_pred = pd.DataFrame(data={
    f'Attribute{i}': np.empty(shape=(6000, ))
    for i in [4, 5, 6, 7, 8, 9, 10]
})

for i in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]:
    param_set = xgb_params_set[i]
    model = xgb.XGBRegressor(
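The fragment above is cut off in the middle of the `xgb.XGBRegressor(` call. In the usual xgboost idiom, a tuned per-attribute parameter set like those in `xgb_params_set` is simply keyword-unpacked into the constructor; the sketch below is illustrative only, not the original call:

import xgboost as xgb

# One of the per-attribute parameter sets shown above.
param_set = {
    'learning_rate': 0.03,
    'n_estimators': 700,
    'max_depth': 2,
    'min_child_weight': 19,
    'gamma': 0.67,
    'subsample': 0.7,
    'colsample_bytree': 1.0,
    'colsample_bylevel': 0.7,
    'colsample_bynode': 1.0,
}

model = xgb.XGBRegressor(**param_set)  # unpack the tuned hyperparameters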
Example #13
    from util import mm2csr
    mm2csr(A, '/tmp/train.mat')
    mm2csr(useritem_featureitem, '/tmp/train_feature.mat')
    C = tsv_to_matrix(test_file)
    mm2csr(C, '/tmp/test.mat')
    """

    W = sslim_train(A, B)

    recommendations = slim_recommender(A, W)

    compute_precision(recommendations, test_file)


if __name__ == '__main__':
    train_file, test_file = split_train_test(
        'data/cidades_categorias/100_cidades_minimas/categorias_usuarios_cidades.tsv'
    )
    import pdb
    pdb.set_trace()
    main(
        'data/cidades/100_without_stemming_less_outliers/usuarios_cidades_train.tsv',
        'data/cidades/100_without_stemming_less_outliers/palavras_cidades.tsv',
        'data/cidades/100_without_stemming_less_outliers/usuarios_cidades_test.tsv'
    )
    """
    main('data/atracoes/10/usuarios_atracoes_train.tsv',
         'data/atracoes/10/palavras_atracoes.tsv',
         'data/atracoes/10/usuarios_atracoes_test.tsv')
    """
Example #14
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.datasets import cifar10
import data_augmentor as da
import numpy as np
from util import split_train_test, plot_hist

target_model = keras.models.load_model('saved_models/cifar10_target_model')
(_, _), (x_data, y_data) = cifar10.load_data()
img_rows, img_cols, channels = 32, 32, 3
num_classes = 10

# 1. initial data collection
x_data = x_data.astype("float32") / 255.0
x_train, x_test, y_train, y_test = split_train_test(x_data,
                                                    y_data,
                                                    test_ratio=0.8)

x_train = x_train.reshape((-1, img_rows, img_cols, channels))
x_test = x_test.reshape((-1, img_rows, img_cols, channels))

y_train_prob = target_model.predict(x_train)
y_train = y_train_prob.argmax(axis=-1)


# 2. substitute model architecture
def create_model():
    model = keras.Sequential([
        keras.Input(shape=(img_rows, img_cols, channels)),
        layers.Conv2D(32, (3, 3),
                      activation='relu',
Example #15
from util import split_train_test
import sys
split_train_test(sys.argv[1])
Example #16
def run_dnn_test(args):
    """Run DNN model on data file given parameters."""
    data_file, data_spec_file, outdir = args
    nl = config.dnn.num_layers
    hs = config.dnn.hidden_size
    # io and state
    print('running DNN with nl={}, hs={}'.format(nl, hs))
    start = time.time()
    dataspecs = json.load(open(data_spec_file, 'r'))
    feature_spec = make_feature_spec(dataspecs)
    datastart_index = dataspecs['counts']['index'][0]
    normalizers = {'none': None, 'layer': layer_norm, 'batch': batch_normalize}
    tf.set_random_seed(config.state.random_seed)
    data = OnlineBatcher(data_file,
                         config.dnn.batch_size,
                         skipheader=True,
                         delimiter=' ')
    # activation
    if config.dnn.activation == 'tanh':
        activation = tf.tanh
    elif config.dnn.activation == 'relu':
        activation = tf.nn.relu
    else:
        raise ValueError('Activation must be "relu", or "tanh"')
    # mvn
    if config.dnn.dist == "ident":
        mvn = eyed_mvn_loss
    elif config.dnn.dist == "diag":
        mvn = diag_mvn_loss
    elif config.dnn.dist == "full":
        mvn = full_mvn_loss
        raise ValueError('dnn.dist must be "ident", "diag", or "full"')
    # setup tf model
    x, ph_dict = join_multivariate_inputs(feature_spec, dataspecs, 0.75, 1000,
                                          2)
    h = dnn(x,
            layers=[hs for i in range(nl)],
            act=activation,
            keep_prob=None,
            norm=normalizers[config.dnn.normalizer],
            scale_range=1.0)
    loss_spec = make_loss_spec(dataspecs, mvn)
    loss_matrix = multivariate_loss(h, loss_spec, ph_dict, variance_floor=0.01)
    loss_vector = tf.reduce_sum(loss_matrix, reduction_indices=1)  # is MB x 1
    loss = tf.reduce_mean(loss_vector)  # is scalar
    loss_names = get_multivariate_loss_names(loss_spec)
    eval_tensors = [loss, loss_vector, loss_matrix]
    model = ModelRunner(loss,
                        ph_dict,
                        learnrate=config.dnn.lr,
                        opt='adam',
                        debug=config.dnn.debug,
                        decay_rate=1.0,
                        decay_steps=20)
    raw_batch = data.next_batch()
    current_loss = sys.float_info.max
    not_early_stop = EarlyStop(20)
    loss_feats = [triple[0] for triple in loss_spec]
    # start training
    start_time = time.time()
    continue_training = not_early_stop(raw_batch, current_loss)
    # keep training while the batcher still returns data, EarlyStop's bad-batch
    # count stays under its limit, and the loss is neither inf nor nan
    rst = []
    while continue_training:
        datadict = split_batch(raw_batch, dataspecs)
        targets = {'target_' + name: datadict[name] for name in loss_feats}
        datadict.update(targets)
        current_loss, pointloss, contrib = model.eval(datadict, eval_tensors)
        model.train_step(datadict)
        for user, day, score, red in zip(
                datadict['user'].flatten().tolist(),
                datadict['time'].flatten().tolist(),
                pointloss.flatten().tolist(),
                datadict['redteam'].flatten().tolist()):
            rst.append((user, int(day), score, red))
        if data.index % 10000 == 1:
            print('index: %s loss: %.4f' % (data.index, current_loss))
            sys.stdout.flush()
        raw_batch = data.next_batch()
        continue_training = not_early_stop(raw_batch, current_loss)
        if continue_training < 0:
            break
    # save the (user, day, score, red).
    train_rst, test_rst = split_train_test(rst)
    outfile1, outfile2 = FileName.get_dnn_rst_name()
    save_rst(train_rst, outfile1)
    save_rst(test_rst, outfile2)
    print('')
    eval_cr(test_rst, 'dnn')
    dt = time.time() - start
    print("run_dnn_test Done. Elapsed time is %.2f seconds." % dt)