def get_data_for_learner():
    """Load, preprocess and tensorize the data for model training.

    Returns:
        train_dl: shuffled DataLoader over the training set.
        x_valid: float tensor of validation inputs.
        y_valid: long tensor of validation targets.
        test: float tensor for the final test pass.
    """
    raw_train, raw_test = u.get_data()
    raw_train = u.preprocess_data(raw_train, True)
    raw_test = u.preprocess_data(raw_test, False)
    X_train, X_valid, y_train, y_valid = u.train_validation_split(
        raw_train, VALID_PERCENTAGE)
    x_train, y_train, x_valid, y_valid = (
        torch.tensor(a) for a in (X_train, y_train, X_valid, y_valid))
    x_train, x_valid = x_train.float(), x_valid.float()
    y_train, y_valid = y_train.long(), y_valid.long()
    test = torch.tensor(raw_test.to_numpy()).float()
    train_dl = DataLoader(TensorDataset(x_train, y_train),
                          batch_size=bs, shuffle=True)
    return train_dl, x_valid, y_valid, test
def get_rank(self, arXiv_df, field_df):
    """Rank papers by vector-space similarity to the field description.

    Returns a copy of *arXiv_df* with a 'score' column, sorted so the
    most similar papers come first.
    """
    field_vecs = self._model.transform(
        preprocess_data(field_df)).toarray()
    paper_vecs = self._model.transform(
        preprocess_data(arXiv_df[['title', 'abstract']])).toarray()
    ranked = arXiv_df.copy()
    ranked['score'] = self.get_similarity(paper_vecs, field_vecs)
    ranked.sort_values(by='score', inplace=True, ascending=False)
    return ranked
def main():
    """Train the toxicity model end to end and write a submission.

    Reads the module-level ``args`` (``args.debug`` toggles a small run),
    creates a timestamped result directory, tokenizes with BERT, trains the
    PyTorch model, reports the bias-aware validation metric and submits.
    """
    MAX_LEN = 220
    BATCH_SIZE = 256
    NUM_EPOCHS = 1
    LSTM_UNITS = 64
    timestamp = datetime.strftime(datetime.now(), '%Y%m%d%H%M%S')
    if args.debug:
        print('running in debug mode')
        result_dir = os.path.join(utils.RESULT_DIR, 'debug-' + timestamp)
    else:
        result_dir = os.path.join(utils.RESULT_DIR, timestamp)
    os.mkdir(result_dir)
    print(f'created: {result_dir}')
    train_data = ToxicDataset(mode='train', debug=args.debug)
    test_data = ToxicDataset(mode='test')
    train, test = train_data.data, test_data.data
    train = utils.preprocess_data(train, mode='train')
    test = utils.preprocess_data(test)
    tokenizer = BertTokenizer.from_pretrained(utils.BERT_MODEL_PATH,
                                              do_lower_case=True)
    X_train, X_test, y_train = utils.run_bert_tokenizer(tokenizer, train, test,
                                                        seq_len=MAX_LEN)
    # No Keras word index / pretrained embedding matrix when using BERT.
    word_index = None
    embedding_matrix = None
    sub_preds, oof_df = utils.run_model_pytorch(
        result_dir, X_train, X_test, y_train, embedding_matrix, word_index,
        batch_size=BATCH_SIZE, epochs=NUM_EPOCHS, max_len=MAX_LEN,
        lstm_units=LSTM_UNITS, oof_df=train)
    bias_metrics_df = utils.compute_bias_metrics_for_model(
        dataset=oof_df, subgroups=utils.IDENTITY_COLS,
        model=utils.PREDICT_COL, label_col=utils.TOXICITY_COLUMN)
    # Fixed typo: was `validation_final_socre`.
    validation_final_score = utils.get_final_metric(
        bias_metrics_df,
        utils.calculate_overall_auc(oof_df, utils.TOXICITY_COLUMN))
    print(f'validation final score: {validation_final_score}')
    utils.submit(result_dir, sub_preds)
    print('finish!!!')
def render_graph(graph_type, dict_data, bind_css=None, css_file_names=None):
    """Render *dict_data* as an HTML graph of *graph_type*.

    Args:
        graph_type: graph identifier understood by the utils/_render_graph helpers.
        dict_data: data mapping; preprocessed by utils.preprocess_data first.
        bind_css: optional CSS binding; defaults to the graph type's default.
        css_file_names: optional extra CSS files forwarded to the renderer.

    Returns:
        IPython ``display.HTML`` object containing the rendered graph.
    """
    # Pre-process data
    utils.preprocess_data(graph_type, dict_data)
    if bind_css is None:
        # Look the default binding up once instead of calling the helper twice.
        default_css = utils.get_default_css_binding(graph_type)
        if default_css is not None:
            bind_css = default_css
    return display.HTML(
        _render_graph(graph_type, dict_data, bind_css, css_file_names))
def main(train=True):
    """Entry point: train the cross-validated NN or run inference.

    Args:
        train: when True, run train_cross_val; otherwise load the saved
            fold models from disk and evaluate them.

    Returns:
        The list of trained (or loaded) models.
    """
    hyperparams = {
        'batch_size': 4986,
        'dim_1': 248,
        'dim_2': 487,
        'dim_3': 269,
        'dim_4': 218,
        'dim_5': 113,
        'activation': nn.ReLU,
        'dropout': 0.01563457578202565,
        'lr': 0.00026372556533974916,
        'label_smoothing': 0.06834918091900156,
        'weight_decay': 0.005270589494631074,
        'amsgrad': False,
    }
    if train:
        models, features = train_cross_val(hyperparams)
    else:
        data_ = load_data(root_dir='./data/', mode='train')
        data_, target_, features, date = preprocess_data(data_, nn=True)
        model_path = '/kaggle/input/model-files'
        # NOTE(review): f_mean is computed but not used below — confirm intent.
        f_mean = calc_data_mean(data_, 'cache')
        models = load_model(model_path, data_.shape[-1], 1, hyperparams, False)
    test_model(models, features)
    return models
def train_autoencoder():
    """Fit the AutoEncoder on the training data with early stopping.

    Trains for up to 500 epochs at fp16 on one GPU; the checkpoint with
    the lowest ``t_loss`` is kept under logs/.
    """
    data = utils.load_data(root_dir='./data/', mode='train')
    data, target, features, date = utils.preprocess_data(data, nn=True)
    dataset = utils.FinData(data=data, target=target, date=date)
    hyperparams = {
        'batch_size': 4597, 'dim_1': 231, 'dim_2': 851, 'dim_3': 777,
        'dim_4': 192, 'hidden': 50, 'dropout': 0.017122456592972537,
        'lr': 0.0013131268366473552, 'activation': nn.GELU,
        'label_smoothing': 0.09401544509474698,
        'weight_decay': 0.005078413740277699, 'amsgrad': True,
    }
    # Validation reuses the first 10k rows of the full training index set.
    train_idx = list(range(len(data)))
    val_idx = list(range(10000))
    dataloaders = utils.create_dataloaders(
        dataset=dataset,
        indexes={'train': train_idx, 'val': val_idx},
        batch_size=hyperparams['batch_size'])
    checkpoint_callback = ModelCheckpoint(dirpath='logs', monitor='t_loss',
                                          mode='min', save_top_k=1, period=10)
    model = AutoEncoder(input_size=data.shape[-1], output_size=1,
                        params=hyperparams)
    es = EarlyStopping(monitor='t_loss', patience=10,
                       min_delta=0.0005, mode='min')
    trainer = pl.Trainer(max_epochs=500, gpus=1,
                         callbacks=[checkpoint_callback, es], precision=16)
    trainer.fit(model, train_dataloader=dataloaders['train'])
def main():
    """Run Optuna HPO for XGBoost and LightGBM, logging each study to Neptune.

    Each study maximizes its objective over 100 trials and is pickled under
    HPO/ with the current date in the filename.
    """
    neptune.init(api_token=read_api_token(),
                 project_qualified_name='jamesmccarthy65/JSMP')
    data = load_data('data/', mode='train', overide='filtered_train.csv')
    data, target, features, date = preprocess_data(data)
    data_dict = {'data': data, 'target': target,
                 'features': features, 'date': date}
    print('creating XGBoost Trials')
    xgb_exp = neptune.create_experiment('XGBoost_HPO')
    xgb_neptune_callback = opt_utils.NeptuneCallback(experiment=xgb_exp)
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: optimize(trial, data_dict), n_trials=100,
                   callbacks=[xgb_neptune_callback])
    joblib.dump(study,
                f'HPO/xgb_hpo_{str(datetime.datetime.now().date())}.pkl')
    print('Creating LightGBM Trials')
    lgb_exp = neptune.create_experiment('LGBM_HPO')
    lgbm_neptune_callback = opt_utils.NeptuneCallback(experiment=lgb_exp)
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: loptimize(trial, data_dict), n_trials=100,
                   callbacks=[lgbm_neptune_callback])
    joblib.dump(study,
                f'HPO/lgb_hpo_{str(datetime.datetime.now().date())}.pkl')
def __init__(self, args, rank):
    """Set up model, data loaders and optimizer for one distributed worker.

    Args:
        args: namespace with model_name, image_path, batch_size, num_workers.
        rank: this worker's process rank, used to shard the data loaders.
    """
    self.rank = rank
    self.epoch_loss = 0
    self.epoch_acc = 0
    self.args = args
    self.model, input_size, self.quant_model = initialize_model(
        args.model_name, get_num_classes(args.image_path))
    self.dataloaders_dict = preprocess_data(args.image_path, args.batch_size,
                                            input_size, args.num_workers,
                                            rank)
    self.train_iterator = iter(self.dataloaders_dict['train'])
    print("Params to learn:")
    params_to_update = []
    # Only fine-tune the parameters left trainable by initialize_model;
    # idiomatic truthiness instead of `== True`.
    for name, param in self.model.named_parameters():
        if param.requires_grad:
            params_to_update.append(param)
            print("\t", name)
    self.optimizer = optim.Adam(params_to_update, lr=0.001)
    self.criterion = nn.CrossEntropyLoss()
def predict_api():
    """Flask endpoint: extract entities from an uploaded resume file."""
    if request.method == 'POST':
        payload = io.BytesIO(request.files.get('resume').read())
        text = preprocess_data(payload)
        found = predict(model, TOKENIZER, idx2tag, DEVICE, text, MAX_LEN)
        return jsonify({'entities': found})
def load_for_jupyter():
    """Load raw graph data plus preprocessed vertex/edge sets for notebooks."""
    v_data, e_data, core_targets, ext_targets, core_testing = utils.load_data(
        from_jup=True)
    v_sets, e_sets = utils.preprocess_data(
        v_data, e_data, core_targets, ext_targets, core_testing)
    return (v_data, e_data, v_sets, e_sets,
            core_targets, ext_targets, core_testing)
def test_model(obs_file=OBS_FILE, length_file=LENGTH_FILE):
    """Load a saved interval model and validate it on the first column."""
    obs, length = utils.read_files(obs_file, length_file)
    obs = utils.preprocess_data(obs)
    # model = joblib.load(MODEL_NAME+str(0)+".pkl")
    model = joblib.load("../interval_model/model_mbs.pkl")
    model._print_info()
    first_col = obs[:, 0].reshape(-1, 1)
    validate_model(model, first_col, length)
def get_markov_chain_for_each(model, obs, lengths, patients, col_index):
    """Decode HMM states per patient segment and record state proportions.

    Args:
        model: fitted model exposing ``predict`` over observation rows.
        obs: observation matrix; thresholded via utils.preprocess_data.
        lengths: rows of (patient_id, segment_length) — TODO confirm layout.
        patients: dict patient_id -> Patient, updated in place.
        col_index: which observation column / threshold set is processed.
    """
    print("obs.shape: ", obs.shape)
    obs = utils.preprocess_data(obs, THREADHOLD[col_index])
    start_index = 0
    for i in range(0, len(lengths)):
        patient_id = lengths[i, 0]
        end_index = start_index + lengths[i, 1]
        # Decode the hidden-state sequence for this patient's slice of rows.
        states = model.predict(obs[start_index:end_index, :])
        # print("states: ", states)
        # Histogram of states, zero-padded so every vector has N_COMPONENTS bins.
        state_frequency = np.bincount(states)
        state_frequency = np.pad(state_frequency,
                                 (0, N_COMPONENTS - state_frequency.shape[0]),
                                 'constant', constant_values=0)
        if USE_PROPORTION:
            state_proportion = state_frequency / np.sum(state_frequency)
        else:
            state_proportion = state_frequency
        # print("state_frequency: ", state_frequency)
        if patient_id in patients:
            patients[patient_id].add_state_proportion(state_proportion, col_index)
        else:
            patients[patient_id] = Patient.Patient(patient_id)
            patients[patient_id].add_state_proportion(state_proportion, col_index)
        # Next patient's block starts where this one ended.
        start_index = end_index
def entity_train(logger, tokenizer, model, to_be_trained_entities, yanbao_texts):
    """Train the entity model on report texts with a 90/10 train/dev split.

    Checkpoints the model whenever the dev F1 improves.
    """
    entities_json = to_be_trained_entities
    train_proportion = 0.9
    text_num = int(len(yanbao_texts))
    random.shuffle(yanbao_texts)  # NOTE: shuffles the caller's list in place
    yanbao_texts_train = yanbao_texts[:int(text_num * train_proportion)]
    yanbao_texts_dev = yanbao_texts[int(text_num * train_proportion):]
    train_preprocessed_datas = preprocess_data(entities_json, yanbao_texts_train, tokenizer)
    train_dataloader = build_dataloader(train_preprocessed_datas, tokenizer, batch_size=BATCH_SIZE)
    dev_preprocessed_datas = preprocess_data(entities_json, yanbao_texts_dev, tokenizer)
    dev_dataloader = build_dataloader(dev_preprocessed_datas, tokenizer, batch_size=BATCH_SIZE)
    best_evaluate_score = 0
    for epoch in range(TOTAL_EPOCH_NUMS):
        epoch_start_time = time.time()
        train(model, train_dataloader, logger=logger, epoch_id=epoch, device=DEVICE)
        # model.eval()
        evaluate_score = evaluate(model, dev_dataloader, logger=logger, tokenizer=tokenizer, device=DEVICE)
        f1 = evaluate_score['f']
        p = evaluate_score['p']
        r = evaluate_score['r']
        duration = time.time() - epoch_start_time
        print('f1:', f1, 'p:', p, 'r:', r, 'time:', duration)
        # Checkpoint only on dev-F1 improvement.
        if f1 > best_evaluate_score:
            best_evaluate_score = f1
            save_model_path = os.path.join(SAVE_MODEL_DIR, 'best_en_model.pth')
            logger.info('saving model to {}'.format(save_model_path))
            model.save(save_model_path, epoch)
def train_cross_val(p):
    """Train one Classifier per purged time-series cross-validation fold.

    Args:
        p: hyper-parameter dict (batch_size, dims, activation, lr, ...).

    Returns:
        (models, features): trained fold models and the feature names.
    """
    data_ = load_data(root_dir='./data/', mode='train')
    data_, target_, features, date = preprocess_data(data_, nn=True)
    gts = PurgedGroupTimeSeriesSplit(n_splits=5, group_gap=5)
    input_size = data_.shape[-1]
    output_size = 1
    tb_logger = pl_loggers.TensorBoardLogger('logs/')
    models = []
    for i, (train_idx, val_idx) in enumerate(gts.split(data_, groups=date)):
        # Work on private copies of just this fold's rows.
        idx = np.concatenate([train_idx, val_idx])
        data = copy.deepcopy(data_[idx])
        target = copy.deepcopy(target_[idx])
        checkpoint_callback = pl.callbacks.ModelCheckpoint(os.path.join(
            'models/', "fold_{}".format(i)), monitor="val_auc", mode='max',
            save_top_k=1, period=10)
        model = Classifier(input_size=input_size, output_size=output_size,
                           params=p)
        if p['activation'] == nn.ReLU:
            model.apply(lambda m: init_weights(m, 'relu'))
        elif p['activation'] == nn.LeakyReLU:
            model.apply(lambda m: init_weights(m, 'leaky_relu'))
        # Re-index into the concatenated fold array: training rows first,
        # then validation rows (assumes train_idx is a contiguous prefix —
        # TODO confirm against PurgedGroupTimeSeriesSplit's output).
        train_idx = [i for i in range(0, max(train_idx) + 1)]
        val_idx = [i for i in range(len(train_idx), len(idx))]
        data[train_idx] = calc_data_mean(data[train_idx], './cache',
                                         train=True, mode='mean')
        data[val_idx] = calc_data_mean(data[val_idx], './cache',
                                       train=False, mode='mean')
        dataset = FinData(data=data, target=target, date=date)
        dataloaders = create_dataloaders(dataset, indexes={
            'train': train_idx,
            'val': val_idx
        }, batch_size=p['batch_size'])
        es = EarlyStopping(monitor='val_auc', patience=10, min_delta=0.0005,
                           mode='max')
        trainer = pl.Trainer(logger=tb_logger, max_epochs=500, gpus=1,
                             callbacks=[checkpoint_callback, es],
                             precision=16)
        trainer.fit(model, train_dataloader=dataloaders['train'],
                    val_dataloaders=dataloaders['val'])
        torch.save(model.state_dict(), f'models/fold_{i}_state_dict.pth')
        models.append(model)
    return models, features
def get_top_k(self, arXiv_df, field_df, k, addition_df=None):
    """Return the *k* papers most similar to the field description.

    Args:
        arXiv_df: papers frame with 'title' and 'abstract' columns.
        field_df: field/topic text frame to compare against.
        k: number of top papers to return.
        addition_df: optional extra text frame folded into the similarity.

    Returns:
        The top-k rows of arXiv_df, most similar first.
    """
    field_matrix = self._model.transform(
        preprocess_data(field_df)).toarray()
    paper_matrix = self._model.transform(
        preprocess_data(arXiv_df[['title', 'abstract']])).toarray()
    # PEP 8 idiom: `is not None` instead of `not ... is None`.
    if addition_df is not None:
        addition_matrix = self._model.transform(
            preprocess_data(addition_df)).toarray()
        result = np.argsort(
            self.get_similarity(paper_matrix, field_matrix,
                                addition_matrix))[::-1]
    else:
        result = np.argsort(self.get_similarity(paper_matrix,
                                                field_matrix))[::-1]
    return arXiv_df.loc[result[:k]]
def get_train_data():
    """Build batch managers for the normal and transfer NER datasets."""
    normal_train, normal_test = get_sentence(args.train_data, args.test_data)
    transfer_train, transfer_test = get_sentence(args.transfer_train_data,
                                                 args.transfer_test_data)
    (char2id, id2char, tag2id, id2tag,
     transfer_tag2id, transfer_id2tag) = get_transform(
        normal_train + transfer_train, args.map_path,
        args.tag2label_path, args.transfer_tag2label_path)

    def make_manager(sentences, tag_map):
        # Encode the sentences and wrap them in a batch iterator.
        return BatchManager(preprocess_data(sentences, char2id, tag_map),
                            args.batch_size)

    train_manager = make_manager(normal_train, tag2id)
    test_manager = make_manager(normal_test, tag2id)
    transfer_train_manager = make_manager(transfer_train, transfer_tag2id)
    transfer_test_manager = make_manager(transfer_test, transfer_tag2id)
    return (train_manager, test_manager, transfer_train_manager,
            transfer_test_manager, id2char, id2tag, transfer_id2tag)
def transform_and_save(source_path, target_path):
    """Read a CSV, preprocess it and persist the result as a .npy file."""
    frame = pd.read_csv(source_path)       # read data
    processed = preprocess_data(frame)     # transform
    np.save(str(target_path.absolute()), processed)  # output
def get_top_k_with_kw(self, arXiv_df, field_df, k, addition_df=None):
    """Return the top-*k* papers, boosting scores by matched keywords.

    Without *addition_df*, each paper's similarity is boosted by
    ``0.01 * 2**len(keywords)`` from its 'contain_keywords' column.
    """
    field_matrix = self._model.transform(
        preprocess_data(field_df)).toarray()
    paper_matrix = self._model.transform(
        preprocess_data(arXiv_df[['title', 'abstract']])).toarray()
    # PEP 8 idiom: `is not None` instead of `not ... is None`.
    if addition_df is not None:
        addition_matrix = self._model.transform(
            preprocess_data(addition_df)).toarray()
        result = np.argsort(
            self.get_similarity(paper_matrix, field_matrix,
                                addition_matrix))[::-1]
    else:
        score = self.get_similarity(paper_matrix, field_matrix)
        # Exponential bonus in the number of ';'-separated keywords
        # (item[:-1] drops the empty element after the trailing ';').
        addition_score = arXiv_df['contain_keywords'].str.strip(
        ).str.split(';').apply(
            lambda item: 0.01 * 2**len(item[:-1])).values
        score += addition_score
        result = np.argsort(score)[::-1]
    return arXiv_df.loc[result[:k]]
def __getitem__(self, idx):
    """Return (series_tensor, labels_tensor) for the case at *idx*."""
    path = self.case_paths[idx]
    series = preprocess_data(path, self.transform)
    # Case id is the file's base name without its extension.
    case_id = int(os.path.splitext(os.path.basename(path))[0])
    row = self.labels_df[self.labels_df.case == case_id]
    # All columns after the first are the diagnosis targets.
    labels = torch.tensor(row.values[0, 1:].astype(np.float32))
    return (series, labels)
def get_search_results(cls, s):
    """Return the 10 slugs whose vectors are closest to query *s*."""
    slugs, model, bigrams = cls.get_artifacts()
    tokens = list(bigrams[preprocess_data(s, cls.regex, True, True)])
    query_vec = model.infer([(tokens, 0)])
    distances = cdist(query_vec, model.sv.vectors, metric='cosine').squeeze()
    top = sorted(enumerate(distances), key=lambda pair: pair[1])[:10]
    return [slugs[i] for i, _ in top]
def init_dm():
    """Build the DatasetsManager from a 12k-sample MNIST subset."""
    (X_TRAIN, Y_TRAIN), (X_TEST, Y_TEST) = load_mnist10()
    # !TODO just for tests: restrict to the first 12k examples.
    X_TRAIN, Y_TRAIN = X_TRAIN[:12000], Y_TRAIN[:12000]
    x_train = preprocess_data(X_TRAIN, data_type='x')
    y_train = preprocess_data(Y_TRAIN, data_type='y')
    (x_eval, y_eval), (x_train, y_train) = split(x_train, y_train, N_EVAL)
    x_test = preprocess_data(X_TEST, data_type='x')
    y_test = preprocess_data(Y_TEST, data_type='y')
    print(""" Shapes: x_train: {} x_eval: {} y_eval: {} x_test: {} y_test: {} """.format(
        x_train.shape, x_eval.shape, y_eval.shape, x_test.shape, y_test.shape))
    return DatasetsManager(x_train, x_eval, y_eval, x_test, y_test), y_train
def run_experiment(data_train: pd.DataFrame, data_test: pd.DataFrame,
                   config: Dict) -> pd.DataFrame:
    """Run the grid-search experiment for increasing training-set sizes.

    For N = 1000, 2000, ..., MAX_N a sample of the training data is drawn,
    under-represented classes are filtered out, and one grid-search
    iteration is evaluated; each N contributes one row to the result.
    """
    data_train = preprocess_data(data_train)
    data_test = preprocess_data(data_test)
    for data in [data_train, data_test]:
        data[TEXT_COLUMN] = join_text_columns(data, config["text_columns"])
    Ns = np.arange(1000, MAX_N, 1000)
    if Ns[-1] != MAX_N:
        # np.arange excludes the stop value; make sure MAX_N itself is run.
        Ns = np.append(Ns, MAX_N)
    grid_search = get_grid_search(config)
    results = []
    for N in Ns:
        logger.info(f"running for N = {N}")
        t0 = time.time()
        data_train_sample = data_train.sample(N)
        data_train_sample, data_test_sample = select_popular_classes(
            data_train_sample, data_test, LABEL_COLUMN,
            config["min_samples_per_class"])
        result = run_experiment_iteration(grid_search, data_train_sample,
                                          data_test_sample)
        result["min_samples_per_class"] = config["min_samples_per_class"]
        representation_score_threshold = config.get(
            "representation_score_threshold", DEFAULT_REPRESENTATION_THRESHOLD)
        result["representation_score"] = representation_score(
            data_train_sample[LABEL_COLUMN], representation_score_threshold)
        result[
            "representation_score_threshold"] = representation_score_threshold
        logger.info(f"completed in {int(time.time() - t0)} seconds")
        results.append(result)
    results = pd.DataFrame(results)
    return results
def start(self):
    """Run k-fold cross-validation over every configured dataset."""
    for ds in self.__datasets:
        print(ds)
        self.current_dataset_name = utils.get_filename(ds)
        X, y = utils.preprocess_data(ds)
        self.classes_names = utils.get_classes_names(y)
        self.num_of_classes = len(self.classes_names)
        # One-hot encode labels for the multi-class evaluation downstream.
        y = label_binarize(y, classes=self.classes_names)
        self.k_folds_cross_validation(X, y)
        print(f'model won in {self.model_wins} \ {self.counter}')
def sparsify(learn_cfs, test_cfs, descriptizers, max_iter=50, startPoints=5, stepPoints=1):
    # Greedy sparsification (Python 2): repeatedly add the configurations on
    # which the current GAP model is most uncertain (highest variance).
    sink = lambda s: ''  # swallow log output from the utils helpers
    l = utils.random_cfs(learn_cfs, startPoints)  # initial random seed set
    l_desc, l_lables, t_desc, t_lables, descriptors_scaler, labels_scaler = utils.preprocess_data(
        learn_cfs, test_cfs, descriptizers, log=sink)
    iteration = 0
    spars_info = {}
    cur_gap_instance = utils.GAP_predict(l, test_cfs, descriptizers, log=sink)[1]
    while iteration < max_iter and len(l) <= len(l_desc):
        # Rank every candidate configuration by model variance, descending.
        var = [(cf, cur_gap_instance.compute_variance(desc))
               for cf, desc in zip(learn_cfs, l_desc)]
        var = sorted(var, key=itemgetter(1), reverse=True)
        # Take the stepPoints most uncertain candidates.
        joined_cfs, joined_vars = zip(*var[0:stepPoints])
        mean_joined_variance = np.mean(joined_vars)
        l += joined_cfs
        # Refit and re-evaluate with the enlarged learning set.
        s, cur_gap_instance = utils.GAP_predict(l, test_cfs, descriptizers, log=sink)
        mse = s['diff_mse']
        print 'Join %d cfs with mean variance = %e, mse = %e ' % (
            len(joined_cfs), mean_joined_variance, mse)
        # Note:
        #   size_db = old + n_joined
        #   mse = mse(size_db)
        #   mean_joined_variance computed on n_joined cfs
        spars_info[iteration] = {
            'size_db': len(l),
            'mse': mse,
            'n_joined_cfs': stepPoints,
            'mean_joined_variance': mean_joined_variance
        }
        iteration += 1
    return spars_info
def main(argv=None):
    """Record this worker's start time, load the training data and train.

    Fixes: the log handle was previously bound to the name ``file`` (shadows
    the builtin) and was not closed if data loading raised; a ``with`` block
    now guarantees cleanup.
    """
    log_path = (log_dir() + "/" + str(FLAGS.graph_id) +
                "_start_time_" + FLAGS.train_worker + ".txt")
    currentDT = datetime.datetime.now()
    with open(log_path, "w") as log_file:
        print(str(currentDT))
        log_file.write(str(currentDT))
        print("Loading training data..")
        processed_data = preprocess_data(FLAGS.train_prefix,
                                         FLAGS.train_attr_prefix,
                                         FLAGS.train_worker,
                                         FLAGS.isLabel, FLAGS.isFeatures)
        # Last element is the local graph; everything before it is train data.
        train_data = processed_data[0:-1]
        G_local = processed_data[-1]
        print("Done loading training data..")
        log_file.write("Done loading training data..")
    train(train_data, G_local)
def mnist_classification():
    """Train Lenet on MNIST, or evaluate a previously saved model."""
    # Train only when no saved model directory exists yet.
    training_phase = "saved_model" not in os.listdir()
    load_dataset_mnist()
    mndata = MNIST('data_mnist')
    lenet = Lenet(20, 64, tf.train.AdamOptimizer(),
                  tf.losses.softmax_cross_entropy)
    if training_phase:
        images, labels = mndata.load_training()
        images, labels = preprocess_data(images, labels, True)
        lenet.train(images, labels)
    else:
        images_test, labels_test = mndata.load_testing()
        images_test, labels_test = preprocess_data(images_test, labels_test,
                                                   True, True)
        lenet.load_model()
        pred = lenet.predict(images_test)
        # Fraction of matching labels; historically ~98% accuracy.
        print("Accuracy:",
              len(labels_test[pred == labels_test]) / len(labels_test))
        from sklearn.metrics.classification import confusion_matrix
        print("Confusion matrix: ")
        print(confusion_matrix(labels_test, pred))
class Grid:
    """Static model of a distribution feeder built at class-definition time.

    Builds incidence/impedance matrices from the MATPOWER-style case
    returned by ``mpc`` and loads measured load/solar profiles into one
    data matrix. NOTE(review): all of this executes on import — confirm
    the .mat files are present wherever this module is imported.
    """
    nm = 41            # number of modelled branches / non-slack buses
    no_pv = 5          # number of PV (generator) buses
    total_iteration = 100
    # load mpc
    pf = 0.8           # power factor
    alpha = 0.8
    beta = 0.2
    bus, branch = mpc(pf, beta)
    from_to = branch[:, 0:2]
    pv_bus = np.array([bus[1, 11], bus[14, 11], bus[15, 11], bus[17, 11], bus[18, 11]])
    pv_set = np.array([1, 14, 15, 17, 18])
    # Reactive generation limits for the PV buses (columns 12/11 of `bus`).
    qg_min, qg_max = np.float32(bus[pv_set, 12]), np.float32(bus[pv_set, 11])
    r = np.zeros((nm, 1))
    x = np.zeros((nm, 1))
    # Incidence-style matrix: row i gets -1 at bus i+1 and +1 at its parent,
    # picking up that branch's resistance/reactance along the way.
    A_tilde = np.zeros((nm, nm+1))
    for i in range(nm):
        A_tilde[i, i+1] = -1
        for k in range(nm):
            if branch[k, 1] == i + 1:
                A_tilde[i, int(from_to[k, 0])] = 1
                r[i] = branch[k, 2]
                x[i] = branch[k, 3]
    a0 = A_tilde[:, 0]       # slack-bus column
    A = A_tilde[:, 1:]       # square part, invertible for a radial feeder
    A_inv = np.linalg.inv(A)
    R = np.diagflat(r)       # diagonal branch-resistance matrix
    X = np.diagflat(x)       # diagonal branch-reactance matrix
    v0 = np.ones(1)          # slack voltage (p.u.)
    # load data
    n_load = sio.loadmat("bus_47_load_data.mat")
    n_solar = sio.loadmat("bus_47_solar_data.mat")
    load_data = n_load['bus47loaddata']
    solar_data = n_solar['bus47solardata']
    pc, pg, qc = preprocess_data(load_data, solar_data, bus, alpha)
    p = pg - pc              # net active injection
    data_set_temp = np.vstack((p, qc))
    data_set = data_set_temp.T
def train_and_test_one_model(obs, lengths, col_index):
    """Hold-one-segment-out train/test of a single model.

    NOTE(review): the loop ``break``s after its first iteration, so only
    the i=0 split is actually trained and validated — confirm this is
    intentional.
    """
    obs = utils.preprocess_data(obs, THREADHOLD[col_index])
    lamdas_ = init_lamdas(obs, SUGGEST[col_index])
    for i in range(0, len(lengths)):
        start_index = 0
        print("the ", i, "th round.")
        # Calculate the mask
        # Rows before segment i form the offset into obs.
        for j in range(0, i):
            start_index += lengths[j].sum()
        end_index = start_index + lengths[i].sum()
        # Segment i becomes the test slice; everything else trains.
        test_data, train_data = utils.split_data(obs, start_index, end_index)
        test_length, train_length = utils.split_length(lengths, i)
        print(lamdas_)
        model = train_model(train_data, train_length, lamdas_)
        model_name = MODEL_NAME + TYPE[col_index] + ".pkl"
        save_model(model, model_name)
        validate_model(model, test_data, test_length)
        break
    print("end")
def classify_documents():
    """Classify the loaded documents and print each one's category."""
    model, features_model = load_models()   # load models
    docs = load_documents()                 # load data
    docs = preprocess_data(docs)            # preprocess data
    # Vectorize the text column, then score every document.
    feats = features_model.transform(docs['text_4']).toarray()
    docs['scores'] = model.predict(feats)
    print('\nDocument classification:\n')
    for filename, topic in zip(docs['filename'], docs['scores']):
        print(' - Document {} belongs to category {}'.format(filename, topic))
def run_training():
    """Train several classifiers on TF-IDF features and evaluate them."""
    df = read_data_as_df(DATA_PATH)
    tfidf_df = get_tfidf(get_feature_df(df))
    X, y = preprocess_data(tfidf_df)
    # The split is encoded in the index: TEST vs TRAIN/VALIDATION rows.
    X_test = X.loc[X.index == 'TEST']
    y_test = y.loc[y.index == 'TEST'].values
    X_train = X.loc[(X.index == 'TRAIN') | (X.index == 'VALIDATION')]
    y_train = y.loc[(y.index == 'TRAIN') | (y.index == 'VALIDATION')].values
    LOG.info(f"Training set: {X_train.shape}, Testing set: {X_test.shape}")
    LOG.info(
        f"Training set positive examples: {y_train.sum()}, Testing set positive examples: {y_test.sum()}"
    )
    clf_d = get_trained_models(["RF", "SGD", "LR", "SVM"], X_train, y_train)
    evaluate_models(clf_d, X_train, X_test, y_train, y_test)
def pseudo_random_sparsify(learn_cfs, test_cfs, descriptizers, max_iter=150, prob=0.1, limit=250):
    """ Naive sparsification with Hilbert-Schmidt indepedance criteria """
    # (Python 2.) Shrinks the selection: each outer step samples random
    # sub-masks, keeps the one with the highest HSIC score, and intersects
    # it with the running selection until at most `limit` configs remain.
    spars_info = {}
    #print descs.shape[0], lbl.shape[0]
    var = float('inf')
    # NOTE(review): 2000 hard-codes the dataset size — confirm.
    selection = np.ones(2000, dtype = bool)
    subselection = np.ones(2000, dtype = bool)
    kernel = lambda xi, xj : np.exp(-np.dot(xj-xi, xj-xi))  # RBF-style kernel
    iterations = 0
    desc, lbl, t_desc, t_lables, descriptors_scaler, labels_scaler = utils.preprocess_data(learn_cfs, test_cfs, descriptizers)
    while(selection.sum() > limit):
        hs = 0.0
        for i in range(max_iter):
            # Bernoulli mask restricted to the currently selected configs.
            b = np.logical_and(np.random.binomial(1, prob, 2000), selection)
            var = HSCI(desc[b,:], lbl[b], kernel, kernel)
            print var
            if hs < var :
                hs = var
                subselection = b
        selection = np.logical_and(selection, subselection)
        lcfs = list( learn_cfs[i] for i in selection.nonzero()[0] )
        mse = utils.GAP_predict(lcfs, test_cfs, descriptizers, log=utils.empty_printer)[0]['diff_mse']
        print 'Iterations %d : Taille de la selection : %d ; HSCI = %e ; mse = %e' % (iterations, selection.sum(), hs, mse)
        spars_info[iterations] = { 'size_db' : selection.sum(), 'mse' : mse }
        iterations += 1
    return spars_info
dropout2_p=0.5, hidden3_num_units=200, dropout3_p=0.2, output_num_units=1, output_nonlinearity=None, update=nesterov_momentum, update_learning_rate=0.05, update_momentum=0.9, eval_size=0.1, verbose=1, regression=True, max_epochs=35) return net0 train = read_csv('data/1.5/train.csv') data = train.ix[:, train.columns != 'Hazard'][:1000] #Quitamos la columna Hazard X = data.ix[:, data.columns != 'Id'][:1000] #Quitamos la columna Id y = train['Hazard'][:1000] new_X = preprocess_data(X) X_train, X_test, y_train, y_test = train_test_split(new_X, y) net0 = NeuralNetConstructor(32) net0.train(X_train, y_train) predicted = net0.predict(X_test) print r2_score(y_test, predicted) # R2 > 0
def classify_with_network2(
        # alignment files
        group_1, group_2, group_3,
        # which data to use
        strand, motif_start_positions, preprocess, events_per_pos, feature_set,
        title,
        # training params
        learning_algorithm, train_test_split, iterations, epochs, max_samples,
        batch_size,
        # model params
        learning_rate, L1_reg, L2_reg, hidden_dim, model_type, model_dir=None,
        extra_args=None,
        # output params
        out_path="./"):
    """Two-way classification with a small neural net (Python 2 code).

    For each iteration: collect leveled train/cross-train/test vectors for
    groups 1 and 2, train with mini-batch SGD (optionally with annealing),
    evaluate on the held-out set, and append the accuracy to ``title``.tsv.
    NOTE(review): ``group_3`` is accepted but never used in this function.
    """
    print("2 way classification")
    assert(len(motif_start_positions) >= 2)
    out_file = open(out_path + title + ".tsv", 'wa')
    if model_dir is not None:
        print("looking for model in {}".format(model_dir))
        model_file = find_model_path(model_dir, title)
    else:
        model_file = None
    # bin to hold accuracies for each iteration
    scores = []
    collect_data_vectors_args = {
        "events_per_pos": events_per_pos,
        "portion": train_test_split,
        "strand": strand,
        "max_samples": max_samples,
        "feature_set": feature_set,
        "kmer_length": 6
    }
    for i in xrange(iterations):
        # [((g1, g1l), (xg1, xg1l), (tg1, tg1l)), ... ]
        list_of_datasets = []
        add_to_list = list_of_datasets.append
        for n, group in enumerate((group_1, group_2)):
            train_set, xtrain_set, test_set = collect_data_vectors2(
                label=n, files=group,
                motif_starts=motif_start_positions[n],
                dataset_title=title + "_group{}".format(n),
                **collect_data_vectors_args)
            add_to_list((train_set, xtrain_set, test_set))
        # unpack list
        g1_train, g1_tr_labels = list_of_datasets[0][0][0], list_of_datasets[0][0][1]
        g1_xtr, g1_xtr_targets = list_of_datasets[0][1][0], list_of_datasets[0][1][1]
        g1_test, g1_test_targets = list_of_datasets[0][2][0], list_of_datasets[0][2][1]
        g2_train, g2_tr_labels = list_of_datasets[1][0][0], list_of_datasets[1][0][1]
        g2_xtr, g2_xtr_targets = list_of_datasets[1][1][0], list_of_datasets[1][1][1]
        g2_test, g2_test_targets = list_of_datasets[1][2][0], list_of_datasets[1][2][1]
        nb_g1_train, nb_g1_xtr, nb_g1_test = len(g1_train), len(g1_xtr), len(g1_test)
        nb_g2_train, nb_g2_xtr, nb_g2_test = len(g2_train), len(g2_xtr), len(g2_test)
        assert(nb_g1_train > 0 and nb_g2_train > 0), "got {0} group 1 training and " \
            "{1} group 2 training vectors".format(nb_g1_train, nb_g2_train)
        # level training and cross-training events so that the model gets equal exposure
        tr_level = np.min([nb_g1_train, nb_g2_train])
        xtr_level = np.min([nb_g1_xtr, nb_g2_xtr])
        test_level = np.min([nb_g1_test, nb_g2_test])
        print("{motif}: got {g1} group 1 and {g2} group 2 training vectors, leveled to {level}"
              .format(motif=title, g1=nb_g1_train, g2=nb_g2_train, level=tr_level))
        print("{motif}: got {g1} group 1 and {g2} group 2 cross-training vectors, leveled to {level}"
              .format(motif=title, g1=nb_g1_xtr, g2=nb_g2_xtr, level=xtr_level))
        print("{motif}: got {g1} group 1 and {g2} group 2 test vectors, leveled to {level}"
              .format(motif=title, g1=nb_g1_test, g2=nb_g2_test, level=test_level))
        training_data = stack_and_level_datasets2(g1_train, g2_train, tr_level)
        training_labels = append_and_level_labels2(g1_tr_labels, g2_tr_labels, tr_level)
        xtrain_data = stack_and_level_datasets2(g1_xtr, g2_xtr, xtr_level)
        xtrain_targets = append_and_level_labels2(g1_xtr_targets, g2_xtr_targets, xtr_level)
        test_data = stack_and_level_datasets2(g1_test, g2_test, test_level)
        test_targets = append_and_level_labels2(g1_test_targets, g2_test_targets, test_level)
        prc_train, prc_xtrain, prc_test = preprocess_data(training_vectors=training_data,
                                                          xtrain_vectors=xtrain_data,
                                                          test_vectors=test_data,
                                                          preprocess=preprocess)
        # evaluate
        X, y = shuffle_and_maintain_labels(prc_train, training_labels)
        trained_model_dir = "{0}{1}_Models/".format(out_path, title)
        training_routine_args = {
            "motif": title,
            "train_data": X,
            "labels": y,
            "xTrain_data": prc_xtrain,
            "xTrain_targets": xtrain_targets,
            "learning_rate": learning_rate,
            "L1_reg": L1_reg,
            "L2_reg": L2_reg,
            "epochs": epochs,
            "batch_size": batch_size,
            "hidden_dim": hidden_dim,
            "model_type": model_type,
            "model_file": model_file,
            "trained_model_dir": trained_model_dir,
            "extra_args": extra_args
        }
        if learning_algorithm == "annealing":
            net, summary = mini_batch_sgd_with_annealing(**training_routine_args)
        else:
            net, summary = mini_batch_sgd(**training_routine_args)
        errors, probs = predict(prc_test, test_targets,
                                training_routine_args['batch_size'], net,
                                model_file=summary['best_model'])
        # Convert mean error into accuracy.
        errors = 1 - np.mean(errors)
        print("{0}: {1} test accuracy.".format(title, (errors * 100)))
        out_file.write("{}\n".format(errors))
        scores.append(errors)
        with open("{}test_probs.pkl".format(trained_model_dir), 'w') as probs_file:
            cPickle.dump(probs, probs_file)
    print(">{motif}\t{accuracy}".format(motif=title, accuracy=np.mean(scores), end="\n"),
          file=out_file)
    return net
# Build the colon-cancer dataset layout: preprocess train/dev/test annotations
# into data/, then derive the vocabulary and GloVe-based word vectors.
# NOTE(review): window_size and num_feats must be defined elsewhere in this file.
base_dir = os.path.dirname(os.path.realpath(__file__))
data_dir = os.path.join(base_dir, 'data')
ann_dir = os.path.join(base_dir, 'annotation/coloncancer')
plain_dir = os.path.join(base_dir, 'original')
train_dir = os.path.join(data_dir, 'train')
dev_dir = os.path.join(data_dir, 'dev')
test_dir = os.path.join(data_dir, 'test')
make_dirs([train_dir, dev_dir, test_dir])
preprocess_data(os.path.join(ann_dir, "Train"), os.path.join(plain_dir, "train"),
                train_dir, window_size, num_feats)
preprocess_data(os.path.join(ann_dir, "Dev"), os.path.join(plain_dir, "dev"),
                dev_dir, window_size, num_feats)
# The phase-2 test annotations ship in a separate release directory.
ann_dir_2 = os.path.join(base_dir, 'thymedata-1.2.0-coloncancer-test-event-time/coloncancer')
preprocess_test_data_phase2(os.path.join(plain_dir, "test"), os.path.join(ann_dir_2, "Test"),
                            test_dir, window_size, num_feats)
build_vocab(
    glob.glob(os.path.join(data_dir, '*/*.toks')),
    os.path.join(data_dir, 'vocab-cased.txt'),
    lowercase=False)
build_word2Vector(os.path.join('../NLP-Tools', 'glove.840B.300d.txt'),
                  data_dir, 'vocab-cased.txt')
labels = ut.load(labelsname) else: X = np.concatenate((X, ut.load(objname))) labels = np.concatenate((labels, ut.load(labelsname))) else: print("Loading data...") X, labels = ut.load_data('data/train.csv', train=True, selected=sort_idx) dims = X.shape print(dims, 'dims') ######################################################################## print("Preprocessing data") X, scaler = ut.preprocess_data(X) print("Preprocessing labels") y, encoder = ut.preprocess_labels(labels) X_test, ids = ut.load_data('data/test.csv', train=False, selected=sort_idx) X_test, _ = ut.preprocess_data(X_test, scaler) nb_classes = y.shape[1] print(nb_classes, 'classes') dims = X.shape[1] print(dims, 'dims') ## check if model exists and resume otherwise rebuild if os.path.isfile("./tmp/keras-nn"): print ("Loading existing neural network...")
def sparsifyFOHSIC(learn_cfs, test_cfs, descriptizers, limit=250, max_iter=[2400, 1200, 600, 300, 200, 100, 100, 100, 100, 80, 80, 60, 50, 40, 40]):
    """Greedy sparsification of the learning set via the Hilbert-Schmidt
    Independence Criterion (HSIC).

    Each outer iteration draws a number of random candidate subsets (the
    current selection plus ~1% Bernoulli additions over the 2000 samples),
    keeps the candidate with the highest HSIC score between descriptors and
    labels, then evaluates the grown selection with a GAP prediction on the
    test configurations.  Stops when the selection reaches `limit` samples.

    Args:
        learn_cfs: learning configurations; indexed by selection — assumed
            to contain 2000 entries (hard-coded below). TODO confirm.
        test_cfs: test configurations forwarded to utils.GAP_predict.
        descriptizers: descriptor generators forwarded to utils.preprocess_data.
        limit: target size of the sparse selection.
        max_iter: number of candidate draws per outer iteration; falls back
            to 30 once the list is exhausted.
            NOTE(review): mutable default argument — benign here since it is
            only ever indexed, never mutated.

    Returns:
        dict: iteration index -> {'size_db': selection size, 'mse': GAP mse}.
    """
    spars_info = {}
    #print descs.shape[0], lbl.shape[0]
    var = float('inf')
    #selection = np.random.binomial(1, 0.005, 2000)
    # Boolean masks over the (assumed) 2000 learning samples.
    selection = np.zeros(2000, dtype = bool)
    subselection = np.zeros(2000, dtype = bool)
    # Gaussian (RBF) kernel with unit bandwidth, used for both views.
    kernel = lambda xi, xj : np.exp(-np.dot(xj-xi, xj-xi))
    iterations = 0
    # Scaled descriptors/labels for the learning set; test-side outputs and
    # the fitted scalers are unused in this function.
    desc, lbl, t_desc, t_lables, descriptors_scaler, labels_scaler = utils.preprocess_data(learn_cfs, test_cfs, descriptizers)
    # Dead code kept by the author (precomputed-kernel variant of the HSIC
    # estimator); left untouched.
    """
    K = np.array([[kernel(desc[i,:], desc[j,:]) if i != j else 0. for i in range(2000)] for j in range(2000)])
    L = np.array([[kernel(lbl[i,:], lbl[j,:]) if i != j else 0. for i in range(2000)] for j in range(2000)])
    KL = K*L
    Cub = np.array([[[K[i,k]*L[k,j] for i in range(2000)] for j in range(2000)] for k in range(2000)])
    lcfs = list( learn_cfs[i] for i in selection.nonzero()[0] )
    mse = utils.GAP_predict(lcfs, test_cfs, descriptizers, log=utils.empty_printer)[0]['diff_mse']
    #print 'Iterations %d : Taille de la selection : %d ; HSCI = %e ; mse = %e' % (iterations, selection.sum(), hs, mse)
    spars_info[iterations] = { 'size_db' : selection.sum(), 'mse' : mse }
    iterations += 1
    """
    while(selection.sum() < limit):
        hs = 0.0
        #for i in range(2000):
        # Number of random candidates to try this iteration.
        if iterations >= len(max_iter):
            repet = 30
        else:
            repet = max_iter[iterations]
        for i in range(repet):
            # Candidate = current selection plus ~1% random additions.
            b = np.logical_or(np.random.binomial(1, 0.01, 2000), selection)
            #b[i] = True
            var = HSCI(desc[b,:], lbl[b], kernel, kernel)
            print var
            #m = selection.sum()
            #print 'HSCI 2 :'
            #var2 = (KL[b,:][:,b].sum() + K[b,:][:,b].sum()*L[b,:][:,b].sum()/(m-1)/(m-2) - 2*Cub[b,:,:][:,b,:][:,:,b].sum()/(m-2))/(m-3)/m
            #print var2
            # Keep the candidate with the best HSIC score so far.
            if hs < var :
                hs = var
                subselection = b
        selection = subselection
        # Evaluate the current selection with a GAP prediction.
        lcfs = list( learn_cfs[i] for i in selection.nonzero()[0] )
        mse = utils.GAP_predict(lcfs, test_cfs, descriptizers, log=utils.empty_printer)[0]['diff_mse']
        print 'Iterations %d : Taille de la selection : %d ; HSIC = %e ; mse = %e' % (iterations, selection.sum(), hs, mse)
        spars_info[iterations] = { 'size_db' : selection.sum(), 'mse' : mse }
        iterations += 1
    return spars_info
def classify_with_network3(
        # alignment files
        group_1, group_2, group_3,  # these arguments should be strings that are used as the file suffix
        # which data to use
        strand, motif_start_positions, preprocess, events_per_pos, feature_set, title,
        # training params
        learning_algorithm, train_test_split, iterations, epochs, max_samples, batch_size,
        # model params
        learning_rate, L1_reg, L2_reg, hidden_dim, model_type, model_dir=None, extra_args=None,
        # output params
        out_path="./"):
    """Train and evaluate a 3-class network (C / mC / hmC groups).

    For each of `iterations` rounds: collect train/cross-train/test vectors
    for the three groups, level each split to the smallest group so classes
    are balanced, preprocess, train with mini-batch SGD (optionally with
    learning-rate annealing), then score on the leveled test split.  Per-round
    accuracies are appended to <out_path><title>.tsv and test probabilities
    are pickled under each round's model directory.

    Returns:
        The network from the final iteration.

    NOTE(review): file mode 'wa' and pickling to a text-mode ('w') file are
    Python-2-era idioms; `xrange` and `cPickle` likewise — this function is
    Python 2 only.
    """
    # checks and file IO
    assert(len(motif_start_positions) >= 3)
    out_file = open(out_path + title + ".tsv", 'wa')
    if model_dir is not None:
        # Resume from a previously trained model for this motif, if found.
        print("looking for model in {}".format(os.path.abspath(model_dir)))
        model_file = find_model_path(os.path.abspath(model_dir), title)
    else:
        model_file = None

    # bin to hold accuracies for each iteration
    scores = []

    collect_data_vectors_args = {
        "events_per_pos": events_per_pos,
        "portion": train_test_split,
        "strand": strand,
        "max_samples": max_samples,
        "feature_set": feature_set,
        "kmer_length": 6
    }

    for i in xrange(iterations):
        list_of_datasets = []  # [((g1, g1l), (xg1, xg1l), (tg1, tg1l)), ... ]
        add_to_list = list_of_datasets.append
        # Group index n doubles as the class label.
        for n, group in enumerate((group_1, group_2, group_3)):
            train_set, xtrain_set, test_set = collect_data_vectors2(label=n,
                                                                    files=group,
                                                                    motif_starts=motif_start_positions[n],
                                                                    dataset_title=title + "_group{}".format(n),
                                                                    **collect_data_vectors_args)
            add_to_list((train_set, xtrain_set, test_set))

        # unpack to make things easier, list_of_datasets[group][set_idx][vector/labels]
        c_train, c_tr_labels = list_of_datasets[0][0][0], list_of_datasets[0][0][1]
        c_xtr, c_xtr_targets = list_of_datasets[0][1][0], list_of_datasets[0][1][1]
        c_test, c_test_targets = list_of_datasets[0][2][0], list_of_datasets[0][2][1]
        mc_train, mc_tr_labels = list_of_datasets[1][0][0], list_of_datasets[1][0][1]
        mc_xtr, mc_xtr_targets = list_of_datasets[1][1][0], list_of_datasets[1][1][1]
        mc_test, mc_test_targets = list_of_datasets[1][2][0], list_of_datasets[1][2][1]
        hmc_train, hmc_tr_labels = list_of_datasets[2][0][0], list_of_datasets[2][0][1]
        hmc_xtr, hmc_xtr_targets = list_of_datasets[2][1][0], list_of_datasets[2][1][1]
        hmc_test, hmc_test_targets = list_of_datasets[2][2][0], list_of_datasets[2][2][1]

        nb_c_train, nb_c_xtr, nb_c_test = len(c_train), len(c_xtr), len(c_test)
        nb_mc_train, nb_mc_xtr, nb_mc_test = len(mc_train), len(mc_xtr), len(mc_test)
        nb_hmc_train, nb_hmc_xtr, nb_hmc_test = len(hmc_train), len(hmc_xtr), len(hmc_test)

        assert(nb_c_train > 0 and nb_mc_train > 0 and nb_hmc_train > 0), "got zero training vectors"

        # level training events so that the model gets equal exposure
        tr_level = np.min([nb_c_train, nb_mc_train, nb_hmc_train])
        xtr_level = np.min([nb_c_xtr, nb_mc_xtr, nb_hmc_xtr])
        test_level = np.min([nb_c_test, nb_mc_test, nb_hmc_test])

        # log how many vectors we got
        print("{motif}: got {C} C, {mC} mC, and {hmC} hmC, training vectors, leveled to {level}"
              .format(motif=title, C=nb_c_train, mC=nb_mc_train, hmC=nb_hmc_train, level=tr_level), file=sys.stderr)
        print("{motif}: got {xC} C, {xmC} mC, and {xhmC} hmC, cross-training vectors, leveled to {xlevel}"
              .format(motif=title, xC=nb_c_xtr, xmC=nb_mc_xtr, xhmC=nb_hmc_xtr, xlevel=xtr_level), file=sys.stderr)
        print("{motif}: got {xC} C, {xmC} mC, and {xhmC} hmC, test vectors, leveled to {tstLevel}"
              .format(motif=title, xC=len(c_test), xmC=len(mc_test), xhmC=len(hmc_test), tstLevel=test_level), file=sys.stderr)

        # stack the data into one object
        # training data
        training_data = stack_and_level_datasets3(c_train, mc_train, hmc_train, tr_level)
        training_labels = append_and_level_labels3(c_tr_labels, mc_tr_labels, hmc_tr_labels, tr_level)
        # cross training
        xtrain_data = stack_and_level_datasets3(c_xtr, mc_xtr, hmc_xtr, xtr_level)
        xtrain_targets = append_and_level_labels3(c_xtr_targets, mc_xtr_targets, hmc_xtr_targets, xtr_level)
        # test
        test_data = stack_and_level_datasets3(c_test, mc_test, hmc_test, test_level)
        test_targets = append_and_level_labels3(c_test_targets, mc_test_targets, hmc_test_targets, test_level)

        prc_train, prc_xtrain, prc_test = preprocess_data(training_vectors=training_data,
                                                          xtrain_vectors=xtrain_data,
                                                          test_vectors=test_data,
                                                          preprocess=preprocess)

        #if evaluate is True:
        #    all_test_data = np.vstack((xtrain_data, test_data))
        #    all_test_targets = np.append(xtrain_targets, test_targets)
        #    errors, probs = evaluate_network(all_test_data, all_test_targets, model_dir, model_type, batch_size, extra_args)
        #    return

        # shuffle data
        X, y = shuffle_and_maintain_labels(prc_train, training_labels)

        # Each iteration gets its own model directory under <title>_Models/.
        working_directory_path = "{outpath}/{title}_Models/".format(outpath=out_path, title=title)
        if not os.path.exists(working_directory_path):
            os.makedirs(working_directory_path)
        trained_model_dir = "{workingdirpath}{iteration}/".format(workingdirpath=working_directory_path, iteration=i)

        training_routine_args = {
            "motif": title,
            "train_data": X,
            "labels": y,
            "xTrain_data": prc_xtrain,
            "xTrain_targets": xtrain_targets,
            "learning_rate": learning_rate,
            "L1_reg": L1_reg,
            "L2_reg": L2_reg,
            "epochs": epochs,
            "batch_size": batch_size,
            "hidden_dim": hidden_dim,
            "model_type": model_type,
            "model_file": model_file,
            "trained_model_dir": trained_model_dir,
            "extra_args": extra_args
        }

        if learning_algorithm == "annealing":
            net, summary = mini_batch_sgd_with_annealing(**training_routine_args)
        else:
            net, summary = mini_batch_sgd(**training_routine_args)

        # Score on the held-out leveled test split using the best checkpoint.
        errors, probs = predict(prc_test, test_targets, training_routine_args['batch_size'], net,
                                model_file=summary['best_model'])
        errors = 1 - np.mean(errors)  # error rate -> accuracy
        probs = zip(probs, test_targets)
        print("{0}:{1}:{2} test accuracy.".format(title, i, (errors * 100)))
        out_file.write("{}\n".format(errors))
        scores.append(errors)
        with open("{}test_probs.pkl".format(trained_model_dir), 'w') as probs_file:
            cPickle.dump(probs, probs_file)

    # Summary line over all iterations.
    # NOTE(review): end="\n" is passed to str.format (where it is ignored),
    # not to print — almost certainly a misplaced argument, left as-is here.
    print(">{motif}\t{accuracy}".format(motif=title, accuracy=np.mean(scores), end="\n"), file=out_file)

    return net
"""Train a linear SVR on the first 1000 rows of the Hazard training data,
print the predictions in a table, and report the R2 score."""

from sklearn.metrics import adjusted_rand_score, r2_score, mean_squared_error
from sklearn import svm
from pandas import read_csv
from prettytable import PrettyTable
from utils import preprocess_data

# BUG FIX: train_test_split was used below but never imported (NameError at
# runtime).  Import it in a version-tolerant way.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

# Load the training data
train = read_csv('data/1.5/train.csv')
# Drop the Hazard (target) column and keep only the first 1000 rows.
# (.ix was removed from pandas; .loc is the equivalent for boolean masks.)
data = train.loc[:, train.columns != 'Hazard'][:1000]
# Drop the Id column (the repeated [:1000] is a no-op, kept for parity).
X = data.loc[:, data.columns != 'Id'][:1000]
y = train['Hazard'][:1000]

X_train, X_test, y_train, y_test = train_test_split(X, y)

# Fit a linear support-vector regressor on the preprocessed features.
df_train = preprocess_data(X_train)
svr = svm.SVR(kernel='linear')
svr.fit(df_train.values, y_train)

# Predict on the held-out split (preprocessed the same way).
df_test = preprocess_data(X_test)
predicted_values = svr.predict(df_test.values)

pt = PrettyTable()
pt.add_column("Predicted hazard", predicted_values)
print(pt)

# Regression score
print("R2 Score")
print(r2_score(y_test, predicted_values))
def _usage(argv):
    """Print the command-line usage message and exit with status 1."""
    print("Usage: python %s <action>" % argv[0])
    print("\tWhere action is one of: %s" % repr(_ACTIONS))
    exit(1)


if __name__ == '__main__':
    # Validate the single <action> argument against the known actions.
    if len(sys.argv) != 2:
        _usage(sys.argv)
    action = sys.argv[1]
    if action not in _ACTIONS:
        _usage(sys.argv)
    # Dispatch on action; independent ifs (not elif) as in the original.
    if action == 'preprocess':
        preprocess_data(_DATA_FILE, _ENCODING)
    if action == 'preparetfidf':
        create_tfidf()
    if action == 'preparelsa':
        calculate_lsi()
    if action == 'preparelda':
        calculate_lda()
    if action == 'notes':
        print('Reading notes...')
        # NOTE(review): pickle.loads on a local data file — safe only if
        # data/notes.dat is trusted.
        with open('data/notes.dat', 'rb') as f:
            data = pickle.loads(f.read())
        # Interactive note browser; loop continues past this excerpt.
        while True:
            try:
                index = int(input('Enter note number (ctrl+d to end program): '))
                print(data[index])
                print()