def run_random(data_file, rs, outfile1, outfile2):
    print('running RANDOM model...')
    day_batcher = DayBatcher(data_file, skiprow=1, delimiter=' ')
    mat = day_batcher.next_batch()
    random.seed(rs)
    rst = []
    while mat is not None:
        if mat.shape[1] == 13:  # use compact10d
            datadict = {'features': mat[:, 3:],
                        'red': mat[:, 2],
                        'user': mat[:, 1],
                        'day': mat[:, 0]}
        else:  # use all_fixed
            datadict = {'features': mat[:, 14:],
                        'red': mat[:, 13],
                        'user': mat[:, 1],
                        'day': mat[:, 0]}
        # Baseline: assign a uniform random anomaly score to every event.
        anomaly_scores = [random.random() for _ in datadict['features']]
        for day, user, red, score in zip(datadict['day'], datadict['user'],
                                         datadict['red'], anomaly_scores):
            rst.append((user, day, score, red))
        mat = day_batcher.next_batch()
    train_rst, test_rst = split_train_test(rst)
    save_rst(train_rst, outfile1)
    save_rst(test_rst, outfile2)
    eval_cr(test_rst, 'random')
def run_label(self, label):
    start_time = time.time()
    print(start_time, 'Working on label', label, file=sys.stderr)
    print('Working on label', label)
    train_fn = os.path.join(self.top_dir, 'train', 'data_%d_train.npy' % (label, ))
    test_fn = os.path.join(self.top_dir, 'test', 'data_%d_test.npy' % (label, ))
    model_fn = os.path.join(self.top_dir, 'model', 'model_%d.pkl' % (label, ))
    result_fn = os.path.join(self.top_dir, 'results', 'result_%d_%d.npy' % (label, label))
    _mkdir(train_fn)
    _mkdir(test_fn)
    _mkdir(model_fn)
    train, test, mask = util.split_train_test(
        self.data[self.dl.get_mask_for_label(label)], 0.80)
    np.save(train_fn, train)
    np.save(test_fn, test)
    hyper_params_aa = self.base_hyper_params_aa.copy()
    hyper_params_aa['input_fn'] = train_fn
    hyper_params_aa['output_fn'] = model_fn
    # hyper_params_aa['max_epochs'] = train.shape[0] / 580000 + 40
    hyper_params_aa['max_epochs'] = 300
    config = self.aa_yaml % hyper_params_aa
    train = yaml_parse.load(config)  # rebinds `train` from the data array to the training object
    train.main_loop()
    run_ae(model_fn, test_fn, result_fn)
    print('time', time.time() - start_time, 'model', model_fn,
          'result', result_fn, 'train', train)
def run_pca(data_file, rs, n_components, outfile1, outfile2):
    """Wrapper to run PCA model.

    Parameters
    ----------
    data_file : str
        filepath of data file
    rs : int
        random seed
    n_components : int
        PCA parameter
    outfile1, outfile2 : str
        filepaths of the train and test result files to be generated
    """
    print('running PCA with n_components={}'.format(n_components))
    day_batcher = DayBatcher(data_file, skiprow=1, delimiter=' ')
    mat = day_batcher.next_batch()
    rst = []
    while mat is not None:
        if mat.shape[1] == 13:  # use compact10d
            datadict = {'features': mat[:, 3:],
                        'red': mat[:, 2],
                        'user': mat[:, 1],
                        'day': mat[:, 0]}
        else:  # use all_fixed
            datadict = {'features': mat[:, 14:],
                        'red': mat[:, 13],
                        'user': mat[:, 1],
                        'day': mat[:, 0]}
        batch = scale(datadict['features'])
        pca = PCA(n_components=n_components, random_state=rs)
        pca.fit(batch)
        data_reduced = np.dot(batch, pca.components_.T)  # pca transform
        data_original = np.dot(data_reduced, pca.components_)  # inverse_transform
        # Reconstruction error per event; a higher error means more anomalous.
        pointloss = np.mean(np.square(batch - data_original), axis=1)
        loss = np.mean(pointloss)
        for d, u, t, l in zip(datadict['day'].tolist(),
                              datadict['user'].tolist(),
                              datadict['red'].tolist(),
                              pointloss.flatten().tolist()):
            rst.append((u, d, l, t))
        mat = day_batcher.next_batch()
    train_rst, test_rst = split_train_test(rst)
    save_rst(train_rst, outfile1)
    save_rst(test_rst, outfile2)
    eval_cr(test_rst, 'pca')
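# Side note (a sketch, not part of the pipeline above): the manual matmuls in
# run_pca match scikit-learn's transform/inverse_transform round trip only
# because `scale` centers each batch, making `pca.mean_` numerically zero.
# The synthetic array X below is an illustrative assumption.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

X = scale(np.random.RandomState(0).rand(100, 10))  # zero-mean columns
pca = PCA(n_components=3, random_state=0).fit(X)
manual = np.dot(np.dot(X, pca.components_.T), pca.components_)
roundtrip = pca.inverse_transform(pca.transform(X))
assert np.allclose(manual, roundtrip)  # holds because X is centered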
def seperate_train_test(data, labels, output_dir):
    train_fn = os.path.join(output_dir, 'train.npy')
    train_labels_fn = os.path.join(output_dir, 'train_labels.npy')
    test_fn = os.path.join(output_dir, 'test.npy')
    test_labels_fn = os.path.join(output_dir, 'test_labels.npy')
    train, test, mask = util.split_train_test(data)
    np.save(train_fn, train)
    np.save(train_labels_fn, labels[mask])
    np.save(test_fn, test)
    np.save(test_labels_fn, labels[~mask])
    return train_fn
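# Hypothetical sketch of the util.split_train_test variant used above: it is
# assumed to return (train, test, mask), where the boolean mask marks training
# rows so labels can be partitioned identically. Not the repo's actual code.
import numpy as np

def split_train_test_sketch(data, train_fraction=0.8, seed=0):
    rng = np.random.RandomState(seed)
    mask = rng.rand(data.shape[0]) < train_fraction  # True = training row
    return data[mask], data[~mask], mask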
def find_best_k(self):
    folds = self.trainset.n_folds(10)
    all_results = {}
    for o in range(0, 10):
        test, train = split_train_test(folds, o)
        all_results = self.addDictionaries(
            all_results,
            self.calculate_best_precision(train, test, self.k_max))
    all_results = sort_dict(all_results, 1)
    best_result = all_results[0][1]
    all_results.reverse()
    for k in all_results:
        if k[1] == best_result:
            return k[0]
    return -1
def find_global_accuracies(self):
    scores = {}
    ten_folds = self.dataset.n_folds(10)
    k_min = self.k_min
    k_max = self.k_max
    for i in range(k_min, k_max):
        global_accuracy = 0
        for fold in range(0, len(ten_folds)):
            test, train = util.split_train_test(ten_folds, fold)
            temp_knn = KNeighborsClassifier(n_neighbors=i, algorithm='brute')
            temp_knn.fit(train.get_X(), train.get_y())
            score = temp_knn.score(test.get_X(), test.get_y())
            global_accuracy += float(score)
        global_accuracy /= 10
        scores[i] = global_accuracy
    self.scores = scores
def run_svm(data_file, rs, nu, kernel, gamma, shrink, outfile1, outfile2):
    # print('running SVM with nu={}, kernel={}, shrink={}'.format(
    #     nu, kernel, shrink))
    try:
        feat = load_features_bin(data_file)
    except Exception:
        # Fall back to the text format and cache a binary copy.
        feat = load_features_txt(data_file)
        npyfile = data_file[:-4] + '.npy'
        np.save(npyfile, feat)
    feat = group_by_day(feat)
    rdd_feat = sc.parallelize(feat, len(feat))
    rst = rdd_feat.flatMap(partial(rdd_svm, nu, kernel, gamma, shrink, rs)).collect()
    train_rst, test_rst = split_train_test(rst)
    save_rst(train_rst, outfile1)
    save_rst(test_rst, outfile2)
    eval_cr(test_rst, 'svm')
def find_global_accuracies(self):
    scores = {}
    ten_folds = self.samples.n_folds(10)
    if self.k_max - self.k_min == 0:
        return 1
    for i in range(self.k_min, self.k_max):
        global_accuracy = 0
        for fold in range(0, len(ten_folds)):
            test, train = util.split_train_test(ten_folds, fold)
            temp_knn = KNeighborsClassifier(n_neighbors=i, algorithm='brute')
            temp_knn.fit(train.get_X(), train.get_y())
            score = temp_knn.score(test.get_X(), test.get_y())
            global_accuracy += float(score)
        global_accuracy /= 10
        scores[i] = global_accuracy
    result = util.sort_dict(scores, 1)
    return result[0][0]
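# Hypothetical sketch of the fold-based split_train_test used by the k-NN
# snippets above: given the list of folds and a hold-out index, it appears to
# return (test, train) with the remaining folds merged. Sketched here over
# plain lists; the real datasets expose get_X()/get_y().
def split_train_test_folds(folds, holdout):
    test = folds[holdout]
    train = [sample for i, fold in enumerate(folds) if i != holdout
             for sample in fold]
    return test, train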
def run_iso_forest(data_file, rs, n_estimators, max_samples, contamination,
                   max_features, bootstrap, outfile1, outfile2):
    # print('running Isolation Forest with n_estimators={}, max_samples={}, '
    #       'contamination={}, max_features={}, bootstrap={}'.format(
    #           n_estimators, max_samples, contamination, max_features, bootstrap))
    try:
        feat = load_features_bin(data_file)
    except Exception:
        # Fall back to the text format and cache a binary copy.
        feat = load_features_txt(data_file)
        npyfile = data_file[:-4] + '.npy'
        np.save(npyfile, feat)
    feat = group_by_day(feat)
    rdd_feat = sc.parallelize(feat, len(feat))
    rst = rdd_feat.flatMap(
        partial(rdd_iso_forest, n_estimators, max_samples, contamination,
                max_features, bootstrap)).collect()
    train_rst, test_rst = split_train_test(rst)
    save_rst(train_rst, outfile1)
    save_rst(test_rst, outfile2)
    eval_cr(test_rst, 'iso-forest')
    10: {
        'learning_rate': 0.03,
        'n_estimators': 700,
        'max_depth': 2,
        'min_child_weight': 19,
        'gamma': 0.67,
        'subsample': 0.7,
        'colsample_bytree': 1.0,
        'colsample_bylevel': 0.7,
        'colsample_bynode': 1.0,
    },
}

para_train, attr_train, qual_train, para_test, attr_test, qual_test \
    = util.split_train_test(train, test_rate=0,
                            para_range=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                            attr_range=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

attr_pred = pd.DataFrame(data={
    f'Attribute{i}': np.empty(shape=(6000, ))
    for i in [4, 5, 6, 7, 8, 9, 10]
})
attr_train_pred = pd.DataFrame(data={
    f'Attribute{i}': np.empty(shape=(6000, ))
    for i in [4, 5, 6, 7, 8, 9, 10]
})

for i in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]:
    param_set = xgb_params_set[i]
    model = xgb.XGBRegressor(
from util import mm2csr

mm2csr(A, '/tmp/train.mat')
mm2csr(useritem_featureitem, '/tmp/train_feature.mat')
C = tsv_to_matrix(test_file)
mm2csr(C, '/tmp/test.mat')
# The stray triple quotes below appear to close a disabled block whose opening
# quotes sit above this excerpt.
"""
W = sslim_train(A, B)
recommendations = slim_recommender(A, W)
compute_precision(recommendations, test_file)

if __name__ == '__main__':
    train_file, test_file = split_train_test(
        'data/cidades_categorias/100_cidades_minimas/categorias_usuarios_cidades.tsv')
    import pdb
    pdb.set_trace()
    main(
        'data/cidades/100_without_stemming_less_outliers/usuarios_cidades_train.tsv',
        'data/cidades/100_without_stemming_less_outliers/palavras_cidades.tsv',
        'data/cidades/100_without_stemming_less_outliers/usuarios_cidades_test.tsv')
    # Alternate entry point, disabled via a triple-quoted string:
    """
    main('data/atracoes/10/usuarios_atracoes_train.tsv',
         'data/atracoes/10/palavras_atracoes.tsv',
         'data/atracoes/10/usuarios_atracoes_test.tsv')
    """
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.datasets import cifar10  # needed for cifar10.load_data() below
import data_augmentor as da
import numpy as np
from util import split_train_test, plot_hist

target_model = keras.models.load_model('saved_models/cifar10_target_model')
(_, _), (x_data, y_data) = cifar10.load_data()

img_rows, img_cols, channels = 32, 32, 3
num_classes = 10

# 1. initial data collection
x_data = x_data.astype("float32") / 255.0
x_train, x_test, y_train, y_test = split_train_test(x_data, y_data, test_ratio=0.8)
x_train = x_train.reshape((-1, img_rows, img_cols, channels))
x_test = x_test.reshape((-1, img_rows, img_cols, channels))
# Label the substitute training set with the target model's predictions.
y_train_prob = target_model.predict(x_train)
y_train = y_train_prob.argmax(axis=-1)

# 2. substitute model architecture
def create_model():
    model = keras.Sequential([
        keras.Input(shape=(img_rows, img_cols, channels)),
        layers.Conv2D(32, (3, 3), activation='relu',
from util import split_train_test
import sys

split_train_test(sys.argv[1])
def run_dnn_test(args):
    """Run DNN model on data file given parameters."""
    data_file, data_spec_file, outdir = args
    nl = config.dnn.num_layers
    hs = config.dnn.hidden_size

    # io and state
    print('running DNN with nl={}, hs={}'.format(nl, hs))
    start = time.time()
    dataspecs = json.load(open(data_spec_file, 'r'))
    feature_spec = make_feature_spec(dataspecs)
    datastart_index = dataspecs['counts']['index'][0]
    normalizers = {'none': None, 'layer': layer_norm, 'batch': batch_normalize}
    tf.set_random_seed(config.state.random_seed)
    data = OnlineBatcher(data_file, config.dnn.batch_size,
                         skipheader=True, delimiter=' ')

    # activation
    if config.dnn.activation == 'tanh':
        activation = tf.tanh
    elif config.dnn.activation == 'relu':
        activation = tf.nn.relu
    else:
        raise ValueError('Activation must be "relu" or "tanh"')

    # mvn
    if config.dnn.dist == 'ident':
        mvn = eyed_mvn_loss
    elif config.dnn.dist == 'diag':
        mvn = diag_mvn_loss
    elif config.dnn.dist == 'full':
        mvn = full_mvn_loss
    else:
        raise ValueError('dnn.dist must be "ident", "diag", or "full"')

    # setup tf model
    x, ph_dict = join_multivariate_inputs(feature_spec, dataspecs, 0.75, 1000, 2)
    h = dnn(x, layers=[hs for i in range(nl)], act=activation, keep_prob=None,
            norm=normalizers[config.dnn.normalizer], scale_range=1.0)
    loss_spec = make_loss_spec(dataspecs, mvn)
    loss_matrix = multivariate_loss(h, loss_spec, ph_dict, variance_floor=0.01)
    loss_vector = tf.reduce_sum(loss_matrix, reduction_indices=1)  # is MB x 1
    loss = tf.reduce_mean(loss_vector)  # is scalar
    loss_names = get_multivariate_loss_names(loss_spec)
    eval_tensors = [loss, loss_vector, loss_matrix]
    model = ModelRunner(loss, ph_dict, learnrate=config.dnn.lr, opt='adam',
                        debug=config.dnn.debug, decay_rate=1.0, decay_steps=20)
    raw_batch = data.next_batch()
    current_loss = sys.float_info.max
    not_early_stop = EarlyStop(20)
    loss_feats = [triple[0] for triple in loss_spec]

    # start training
    start_time = time.time()
    # continue while mat is not None, badcount < badlimit, and loss is not inf/nan
    continue_training = not_early_stop(raw_batch, current_loss)
    rst = []
    while continue_training:
        datadict = split_batch(raw_batch, dataspecs)
        targets = {'target_' + name: datadict[name] for name in loss_feats}
        datadict.update(targets)
        current_loss, pointloss, contrib = model.eval(datadict, eval_tensors)
        model.train_step(datadict)
        for user, day, score, red in zip(datadict['user'].flatten().tolist(),
                                         datadict['time'].flatten().tolist(),
                                         pointloss.flatten().tolist(),
                                         datadict['redteam'].flatten().tolist()):
            rst.append((user, int(day), score, red))
        if data.index % 10000 == 1:
            print('index: %s loss: %.4f' % (data.index, current_loss))
            sys.stdout.flush()
        raw_batch = data.next_batch()
        continue_training = not_early_stop(raw_batch, current_loss)
        if continue_training < 0:
            break

    # save the (user, day, score, red)
    train_rst, test_rst = split_train_test(rst)
    outfile1, outfile2 = FileName.get_dnn_rst_name()
    save_rst(train_rst, outfile1)
    save_rst(test_rst, outfile2)
    print('')
    eval_cr(test_rst, 'dnn')
    dt = time.time() - start
    print('run_dnn_test Done. Elapsed time is %.2f seconds.' % dt)