def compress(name, k=8, force_update=False):
    result = []
    block_list = []
    elements_file_name = name + '.pickle'
    compressed_elements_file_name = name + '_compressed.pickle'
    if not force_update and os.path.exists(compressed_elements_file_name):
        return load_data(compressed_elements_file_name)
    else:
        elements = load_data(elements_file_name)
        for i in xrange(0, len(elements), k):
            block_list.append((0,
                               elements[i].term,
                               elements[i].count,
                               elements[i].posting_lists))
            last_index = len(elements) - i
            for bi in xrange(1, min(k, last_index)):
                c = compare(elements[i + bi - 1].term, elements[i + bi].term)
                block_list.append((c,
                                   elements[i + bi].term[c:],
                                   elements[i + bi].count,
                                   # elements[i + bi].term,
                                   elements[i + bi].posting_lists))
            result.append(block_list)
            block_list = []
        save_data(result, compressed_elements_file_name)
        return result
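Since `compress()` front-codes each block of `k` dictionary entries as `(prefix_len, suffix, count, posting_lists)` tuples, a decompression counterpart is a useful sanity check. The sketch below is illustrative only: it assumes `compare()` returns the length of the common prefix of two terms (consistent with how `term[c:]` is stored above), and `decompress` is a hypothetical name, not part of the original code.

```python
# A minimal sketch of expanding the blocks produced by compress().
# Each block stores its first term in full (prefix_len == 0); later entries
# store only the suffix after the prefix shared with the previous term.
def decompress(blocks):
    elements = []
    for block in blocks:
        prev_term = ''
        for prefix_len, suffix, count, posting_lists in block:
            term = prev_term[:prefix_len] + suffix
            elements.append((term, count, posting_lists))
            prev_term = term
    return elements
```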
def hand_generator(self):
    for root, directories, files in walk(self.color_folder):
        for c_filename in files:
            if self.c_file_ext in c_filename:
                try:
                    c_filepath = path.join(root, c_filename)
                    d_filepath = c_filepath.replace('color', 'depth').replace(self.c_file_ext, self.d_file_ext)
                    _, c_data = utils.load_data(c_filepath, self.data_type)
                    d_data, d_data_norm = utils.load_data(d_filepath, 'depth')
                    c_fgmask = self.c_fgbg.apply(c_data)
                    c_fgmask = cv2.morphologyEx(c_fgmask, cv2.MORPH_OPEN, self.kernel)
                    img_bs = d_data_norm.copy()
                    img_bs[c_fgmask == 0] = 0
                    img_bs = ip.smooth_image(img_bs)
                    boxes = ip.parse_hands(img_bs, display=False)
                    hands = Hands(c_fgmask, d_data, boxes, d_filepath, 'depth')
                    yield hands
                except Exception as e:
                    print e
def initialize_detector(self, background_folder, video_folder, file_ext):
    self.c_fgbg = cv2.BackgroundSubtractorMOG2(300, 20, True)
    self.d_fgbg = cv2.BackgroundSubtractorMOG2(300, 10, False)
    color_bg_folder = background_folder + "/color"
    self.color_folder = video_folder + "/color"
    self.depth_folder = video_folder + "/depth"
    self.c_file_ext = file_ext
    self.d_file_ext = 'bin'
    count = 0
    thresh_total = 0
    for root, directories, files in walk(color_bg_folder):
        for c_filename in files:
            if self.c_file_ext in c_filename:
                try:
                    c_filepath = path.join(root, c_filename)
                    # d_filepath = c_filepath.replace('color', 'depth').replace(self.c_file_ext, self.d_file_ext)
                    _, c_data = utils.load_data(c_filepath, self.data_type)
                    # d_data, d_data_norm = utils.load_data(d_filepath, 'depth')
                    c_fgmask = self.c_fgbg.apply(c_data)
                    # d_fgmask = self.d_fgbg.apply(d_data)
                except Exception as e:
                    print e
    self.initialized = True
def hand_generator(self):
    for root, directories, files in walk(self.folder):
        for filename in files:
            if self.file_ext in filename:
                filepath = path.join(root, filename)
                data, data_norm = utils.load_data(filepath, self.data_type)
                fgmask = self.fgbg.apply(data_norm)
                fgmask = cv2.morphologyEx(fgmask, cv2.MORPH_OPEN, self.kernel)
                img_bs = data.copy()
                img_bs[fgmask == 0] = 0
                img_bs = ip.smooth_image(img_bs)
                orig_mask = fgmask.copy()
                fgmask, thresh_val = self.remove_legs(img_bs)
                img_fg = data_norm.copy()
                img_fg[fgmask == 0] = 0
                boxes = ip.parse_hands(fgmask, display=False)
                hands = Hands(fgmask, data, boxes, filepath, 'depth')
                yield hands
def get_index(folder_name, force_update=False):
    index_file_name = folder_name + '.pickle'
    if not force_update and os.path.exists(index_file_name):
        return load_data(index_file_name)
    else:
        elements = []
        documents = get_file_list(folder_name)
        for doc_id in xrange(len(documents)):
            elements += map(lambda x: Element(x, doc_id), get_tokens(documents[doc_id]))
        elements.sort()
        result = []
        for el in elements:
            if result and result[-1] == el:
                result[-1].update(el)
            else:
                result.append(el)
        save_data(result, name=index_file_name)
        return result  # return the freshly built index, matching the cached branch above
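Both `get_index()` and `compress()` rely on an `Element` type that is not shown in these snippets: it needs `term`, `count`, and `posting_lists` attributes, ordering and equality by term, and an `update()` that merges another occurrence. A purely hypothetical sketch of that interface, for readability only:

```python
# Hypothetical Element interface assumed by get_index() and compress();
# the real class in the original project is not shown here.
class Element(object):
    def __init__(self, term, doc_id):
        self.term = term
        self.count = 1
        self.posting_lists = [doc_id]

    def __lt__(self, other):
        return self.term < other.term

    def __eq__(self, other):
        return self.term == other.term

    def update(self, other):
        # merge another occurrence of the same term into this entry
        self.count += other.count
        self.posting_lists.extend(other.posting_lists)
```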
def img_generator(self):
    for root, directories, files in walk(self.folder):
        for filename in files:
            if self.file_ext in filename:
                filepath = path.join(root, filename)
                try:
                    data, data_norm = utils.load_data(filepath, self.data_type)
                    fgmask = self.fgbg.apply(data_norm)
                    fgmask = cv2.morphologyEx(fgmask, cv2.MORPH_OPEN, self.kernel)
                    img_bs = data_norm.copy()
                    img_bs[fgmask == 0] = 0
                    img_bs = ip.smooth_image(img_bs)
                    yield img_bs, filepath
                except Exception as e:
                    print e
def initialize_detector(self, background_folder, video_folder, file_ext):
    self.fgbg = cv2.BackgroundSubtractorMOG2(300, .2, False)
    self.folder = video_folder
    self.file_ext = file_ext
    count = 0
    thresh_total = 0
    for root, directories, files in walk(background_folder):
        for filename in files:
            if self.file_ext in filename:
                try:
                    filepath = path.join(root, filename)
                    data, data_norm = utils.load_data(filepath, self.data_type)
                    fgmask = self.fgbg.apply(data_norm)
                except Exception as e:
                    print e
    self.initialized = True
if __name__ == "__main__":
    args = parse_args()

    # fix random seed
    set_random_seed(args.seed)

    # use cuda or not
    if args.use_cuda:
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    # load data
    data, n_fields, n_features = load_data(args.dataset,
                                           device=device,
                                           use_content=args.use_content,
                                           use_rating=False,
                                           print_info=True)

    # create model
    model = AttentionalFM(n_features=n_features,
                          n_fields=n_fields,
                          embedding_dim=args.embedding_dim)
    model.to(device=device)

    # output dir
    output_dir = "./results/{:%Y%m%d_%H%M%S}/".format(datetime.now())
    output_path = output_dir + "model.weights"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
############### PREPROCESSING ###############
classes = results.classes.replace(" ", "").split(',')

preprocess_dir(TRAIN_DIR, PREPROCESSED_DIR, REORIENT_SCRIPT_PATH,
               ROBUSTFOV_SCRIPT_PATH, classes, results.numcores, verbose=0)

############### DATA IMPORT ###############
X, y, filenames, num_classes, img_shape = load_data(PREPROCESSED_DIR, classes)
print("Finished data processing")

############### MODEL SELECTION ###############
LR = 1e-3
LOAD_WEIGHTS = False
MODEL_NAME = "phinet_model_" + "-".join(results.classes.split(","))
MODEL_PATH = os.path.join(WEIGHT_DIR, MODEL_NAME + ".json")

if not os.path.exists(WEIGHT_DIR):
    os.makedirs(WEIGHT_DIR)

if LOAD_WEIGHTS:
    weight_files = os.listdir(WEIGHT_DIR)
cont_channels = [
    i for (i, x) in enumerate(discretizer_header) if x.find("->") == -1
]

normalizer = Normalizer(fields=cont_channels)  # choose here which columns to standardize
normalizer_state = 'ihm_normalizer'
normalizer_state = os.path.join(os.path.dirname(data_path), normalizer_state)
normalizer.load_params(normalizer_state)

# %%
n_trained_chunks = 0
train_raw = utils.load_data(train_reader, discretizer, normalizer,
                            small_part, return_names=True)
val_raw = utils.load_data(val_reader, discretizer, normalizer,
                          small_part, return_names=True)

# %%
demographic_data = []
diagnosis_data = []
idx_list = []

demo_path = data_path + 'demographic/'
import os  # needed for os.path.join below
import sys

from utils.utils import load_data, datset_gen, savepickle
from utils.trainer import train_with_checkpoin_tensorboard
from Models import svhn_model_simple

import tensorflow as tf
import numpy as np
import pandas as pd

root_path = "I:\\Files_ML\\Coursera\\Dl_ON_ud\\dataset"
test_path = os.path.join(root_path, "test")
train_path = os.path.join(root_path, "train")
extra_path = os.path.join(root_path, "extra")

X_test, y_test = load_data(test_path, num_only=True)
X_train, y_train = load_data(train_path, num_only=True)
X_extra, y_extra = load_data(extra_path, num_only=True)
#X_train = np.concatenate([X_train,X_extra])
#y_train = np.concatenate([y_train,y_extra])

y_train = y_train.reshape((-1, 5, 1))
y_test = y_test.reshape((-1, 5, 1))

# %%
batch_size = 32
ds_train = datset_gen(X_train, y_train, batch_size=batch_size, buffer_size=100)
ds_test = datset_gen(X_test, y_test, batch_size=batch_size, buffer_size=100)

# %%
def initial_load():
    kwargs = request.args.to_dict()
    print kwargs
    return load_data(**kwargs)
def main():
    logger = logMaster.get_logger('main')
    logger.info('loading data...')
    att_feats, train_data, val_data, test_data, test_s_data, classes = load_data(
        att_path=att_path, res_path=res_path)

    logger.info('building model...')
    gen = Generator(x_dim=args.x_dim, s_dim=args.s_dim, z_dim=args.z_dim, layers=args.dec)
    # gen.train()
    # states = torch.load(args.vae_ckpt)
    # gen.load_state_dict(states['model'])
    dis = Discriminator(x_dim=args.x_dim, s_dim=args.s_dim, layers=args.dis)
    reg = Regressor(x_dim=args.x_dim, s_dim=args.s_dim, layers=args.reg)
    gen.cuda()
    dis.cuda()
    reg.cuda()

    mse_loss = nn.MSELoss()
    l1_loss = nn.L1Loss()
    adam_betas = (0.8, 0.999)
    gen_opt = optim.Adam(gen.parameters(), lr=args.learning_rate, weight_decay=0.01, betas=adam_betas)
    dis_opt = optim.Adam(dis.parameters(), lr=args.learning_rate, weight_decay=0.01, betas=adam_betas)
    reg_opt = optim.Adam(reg.parameters(), lr=args.learning_rate, weight_decay=0.01, betas=adam_betas)

    train_manager = DataManager(train_data, args.epoch, args.batch, infinite=True)

    ones = Variable(torch.ones([args.batch, 1]), requires_grad=False).float().cuda()
    zeros = Variable(torch.zeros([args.batch, 1]), requires_grad=False).float().cuda()

    loss_history = []
    logger.info('start training...')
    for epoch in range(args.epoch):
        running_loss = 0
        t1 = time.time()
        d_total_loss = 0.0
        g_total_loss = 0.0
        # cyc_total_loss = 0.0
        r_total_loss = 0.0
        # rd_total_loss = 0.0
        # vae_total_loss = 0.0
        g_scores = 0.0
        if args.steps == -1:
            steps = train_manager.num_batch
        else:
            steps = args.steps
        for batch in tqdm(range(steps), leave=False, ncols=70, unit='b'):
            for i in range(args.d_iter):
                dis.zero_grad()
                # get true data
                data = train_manager.get_batch()
                X = Variable(torch.from_numpy(np.asarray([item[0] for item in data]))).float().cuda()
                Y = [item[1] for item in data]
                S = Variable(torch.from_numpy(att_feats[Y])).float().cuda()
                Yc = get_negative_samples(Y, classes['train'])
                Sc = Variable(torch.from_numpy(att_feats[Yc])).float().cuda()

                # get fake data
                # Xp = gen.forward(X, S)
                # Xp = Xp.detach()  # fix the generator
                Xpp = gen.sample(S).detach()
                Sp = reg.forward(X).detach()  # fix the regressor

                # get scores
                true_scores = dis.forward(X, S)
                # fake_scores = dis.forward(Xp, S)
                fake_scores2 = dis.forward(Xpp, S)
                # reg_scores = dis.forward(X, Sp)
                # ctrl_scores = dis.forward(X, Sc)

                # calculate loss
                d_loss = mse_loss(true_scores, ones) + mse_loss(fake_scores2, zeros)
                # + args.theta3 * mse_loss(reg_scores, zeros) \
                # + mse_loss(ctrl_scores, zeros)
                d_loss.backward()
                dis_opt.step()
                d_total_loss += d_loss.cpu().data.numpy()

            for i in range(args.g_iter):
                gen.zero_grad()
                reg.zero_grad()
                # get true data
                data = train_manager.get_batch()
                X = Variable(torch.from_numpy(np.asarray([item[0] for item in data]))).float().cuda()
                Y = [item[1] for item in data]
                S = Variable(torch.from_numpy(att_feats[Y])).float().cuda()

                # get fake data
                # Xp, mu, log_sigma = gen.forward(X, S)
                Xp2 = gen.sample(S)
                Sp = reg.forward(X)
                # Spp = reg.forward(Xp)
                # Xpp, _, _ = gen.forward(X, Sp)

                # get scores
                # fake_scores = dis.forward(Xp, S)
                fake_scores2 = dis.forward(Xp2, S)
                # reg_scores = dis.forward(X, Sp)

                # calculate loss
                # vae_loss = gen.vae_loss(X=X, Xp=Xp, mu=mu, log_sigma=log_sigma)
                # cyc_loss = mse_loss(Spp, S) + mse_loss(Xpp, X)
                g_loss = mse_loss(fake_scores2, ones)
                r_loss = mse_loss(Sp, S)
                # rd_loss = mse_loss(reg_scores, ones)
                # total_loss = vae_loss + g_loss + args.theta1 * cyc_loss + args.theta2 * r_loss + args.theta3 * rd_loss
                total_loss = g_loss + args.theta2 * r_loss
                total_loss.backward()
                gen_opt.step()
                reg_opt.step()
                # vae_total_loss += vae_loss.cpu().data.numpy()
                g_total_loss += g_loss.cpu().data.numpy()
                # cyc_total_loss += cyc_loss.cpu().data.numpy()
                r_total_loss += r_loss.cpu().data.numpy()
                # rd_total_loss += rd_loss.cpu().data.numpy()
                g_scores += np.mean(fake_scores2.cpu().data.numpy())

        g_total_steps = steps * args.g_iter
        d_total_steps = steps * args.d_iter
        # vae_avg_loss = vae_total_loss / g_total_steps
        g_avg_loss = g_total_loss / g_total_steps
        # cyc_avg_loss = cyc_total_loss / g_total_steps
        r_avg_loss = r_total_loss / g_total_steps
        # rd_avg_loss = rd_total_loss / g_total_steps
        d_avg_loss = d_total_loss / d_total_steps
        g_avg_score = g_scores / g_total_steps
        loss_history.append(
            f'{g_avg_loss:.4}\t{d_avg_loss:.4}\t{r_avg_loss:.4}\t'
            f'{g_avg_score:.4}\n')
        elapsed = (time.time() - t1) / 60.0

        if (epoch + 1) % 10 == 0 or epoch == 0:
            filename = 'gdan_' + str(epoch + 1) + '.pkl'
            save_path = save_dir / Path(filename)
            states = dict()
            states['epoch'] = epoch + 1
            states['gen'] = gen.state_dict()
            states['dis'] = dis.state_dict()
            states['reg'] = reg.state_dict()
            # states['enc_layers'] = args.enc
            states['gen_layers'] = args.dec
            states['reg_layers'] = args.reg
            states['dis_layers'] = args.dis
            states['z_dim'] = args.z_dim
            states['x_dim'] = args.x_dim
            states['s_dim'] = args.s_dim
            states['gen_opt'] = gen_opt.state_dict()
            states['dis_opt'] = dis_opt.state_dict()
            states['reg_opt'] = reg_opt.state_dict()
            states['theta1'] = args.theta1
            states['theta2'] = args.theta2
            states['theta3'] = args.theta3
            torch.save(states, str(save_path))

        logger.info(
            f'epoch: {epoch+1:4}, g_loss: {g_avg_loss: .4}, d_loss: {d_avg_loss: .4}, \n'
            f'r_loss: {r_avg_loss: .4}, '
            f'g_score: {g_avg_score:.4}')

    with result_path.open('w') as fout:
        for s in loss_history:
            fout.write(s)

    logger.info('program finished')
def partOne(data):
    valid = 0
    for n in data:
        l, u, k, p = processItem(n)
        count = utils.count_characters_in_string(p, k)
        if utils.int_in_range(count, l, u):
            valid += 1
    return valid


def partTwo(data):
    valid = 0
    for n in data:
        p1, p2, k, p = processItem(n)
        # compare characters by value (==) rather than identity (is)
        if (p[int(p1) - 1] == k) ^ (p[int(p2) - 1] == k):
            valid += 1
    return valid


if __name__ == "__main__":
    # Load Data
    data = utils.load_data("day2.txt")

    # Do puzzle
    print("---- Day 2 ----")
    print("Part 1: " + str(partOne(data)))
    print("Part 2: " + str(partTwo(data)))
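The `processItem` helper is not shown in this snippet. A hypothetical sketch is given below, assuming the usual Advent of Code 2020 Day 2 line format (`"1-3 a: abcde"`); it returns the bounds as strings, since `partTwo` casts them with `int()`, and `utils.int_in_range` is assumed to cope with string bounds.

```python
# Hypothetical sketch of processItem: split "1-3 a: abcde" into
# (low, high, key, password). The real helper is not part of this snippet.
def processItem(line):
    rule, password = line.strip().split(": ")
    bounds, key = rule.split(" ")
    low, high = bounds.split("-")
    return low, high, key, password
```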
def plot_power_sector(s=None, c=None, order=None):
    data = load_data()
    data = data[data.cost != 'none']
    if s is None:
        all_cost = sorted(list(set(data.cost)))
        s = [min(all_cost), all_cost[int((len(all_cost) - 1) / 2)], max(all_cost)]
    if c is None:
        all_cost = sorted(list(set(data.tax)))
        c = [min(all_cost), all_cost[int((len(all_cost) - 1) / 2)], max(all_cost)]
    data = data[data.cost.isin(s)]
    data = data[data.tax.isin(c)]

    synonyms = {
        'Solar': 'Renewable',
        'Hydro': 'Renewable',
        'Wind': 'Renewable',
        'Biomass': 'Renewable',
        'Import': 'Others',
        'Oil': 'Others'
    }
    col_dic = {
        'Coal|w/o CCS': '#000000',
        'Coal|w/ CCS': '#918F88',
        'Gas|w/o CCS': '#A3CFD6',
        'Gas|w/ CCS': '#D3EAED',
        'Renewable': '#4EB378',
        'Nuclear': '#724ac1',
        'Others': '#b2b2b2'
    }
    years = [2020, 2030, 2040, 2050]

    activity = get_plot_data(data, keyword='Secondary Energy|Electricity',
                             synonyms=synonyms, col_dic=col_dic)
    activity[years] = activity[years] * 8.760
    plot_facet_grids(activity, y_title='PPL Activity [TWh]',
                     figure_title='Power_Activity_TWh', col_dic=col_dic,
                     y_max=880, order=order)

    capacity = get_plot_data(data, keyword='Capacity|Electricity',
                             synonyms=synonyms, col_dic=col_dic)
    plot_facet_grids(capacity, y_title='PPL Capacity [GW]',
                     figure_title='Power_Capacity_GW', col_dic=col_dic,
                     y_max=280, order=order)
parser = argparse.ArgumentParser()
parser.add_argument('--dataset', type=str, default='MUTAG')
parser.add_argument('--hidden', type=int, default=32)
parser.add_argument('--idx', type=int, default=1)
parser.add_argument('--batch_size', type=int, default=128)
parser.add_argument('--epochs', type=int, default=300)
parser.add_argument('--lr', type=float, default=0.01)
args = parser.parse_args()

np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed_all(0)

# writer = SummaryWriter('runs/la_PROTEINS')
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
dataset = load_data(args.dataset)

index_train = []
index_test = []
with open(osp.join(osp.dirname(osp.realpath(__file__)), 'datasets',
                   '%s' % args.dataset, '10fold_idx',
                   'train_idx-%d.txt' % args.idx), 'r') as f_train:
    for line in f_train:
        index_train.append(int(line.split('\n')[0]))
with open(osp.join(osp.dirname(osp.realpath(__file__)), 'datasets',
                   '%s' % args.dataset, '10fold_idx',
                   'test_idx-%d.txt' % args.idx), 'r') as f_test:
    for line in f_test:
        index_test.append(int(line.split('\n')[0]))
import os

from S_and_R import main
from utils.utils import load_data

if __name__ == '__main__':
    input_parameters = dict(number_of_candles=60,
                            minimum_window_size=5,
                            maximum_window_size=20,
                            tolerance=0.001)
    path = os.path.join("data", "EURUSD1440.csv")
    dataframe = load_data(path)
    unique_resistances_supports_list, scaled_power_resistances_supports_list = main(dataframe, input_parameters)
def load_data_from_json(self):
    title, description = load_data()
    self.set_title(title)
    self.set_description(description)
import numpy as np  # needed for np.NaN below
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import StratifiedKFold
from keras.models import load_model
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback, CSVLogger
from keras.wrappers.scikit_learn import KerasClassifier

MODEL_NAME = 'keras_1'

combined = utils.load_data()
combined = utils.cat_transform(combined, 'onehot')
train, test = utils.recover_train_test_na(combined, fillna=True)

# Fillna for minmax scaler
train = train.replace(np.NaN, -1)
test = test.replace(np.NaN, -1)

X_train = train.drop('target', axis=1)
y_train = train.target
X_test = test

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
args = get_args()

if (args.model_type != const.GMVAE and args.model_type != const.GMVAECNN):
    print('Choose a valid model_type!')
    sys.exit()

# Model checkpoint: experiments/checkpoint/GMV_FREY_0001_8_2_8_64_2/-1029 ..
config, flags = get_config_and_flags(args)

# create the experiments dirs
utils.create_dirs([config.summary_dir, config.checkpoint_dir, config.results_dir])
utils.save_args(args, config.summary_dir)
'''
------------------------------------------------------------------------------
                                    GET DATA
------------------------------------------------------------------------------
'''
print('\n Loading data...')
data_train, data_valid, data_test = utils.load_data(config.dataset_name)
'''
------------------------------------------------------------------------------
                               GET NETWORK PARAMS
------------------------------------------------------------------------------
'''
network_params = Bunch()
network_params.input_height = data_train.height
network_params.input_width = data_train.width
network_params.input_nchannels = data_train.num_channels
network_params.hidden_dim = config.hidden_dim
network_params.z_dim = config.z_dim
network_params.w_dim = config.w_dim
network_params.K = config.K_clusters
network_params.num_layers = config.num_layers
'''
-----------------------------------------------------------------------------
                        COMPUTATION GRAPH (Build the model)
def run(args):
    print("\nInput args:")
    pprint(vars(args))

    t0 = time()
    te_size = verify_size(args.te_size)
    datapath = Path(args.datapath).resolve()

    # Hard split
    # split_on = None if args.split_on is None else args.split_on.upper()
    cv_method = args.cv_method
    te_method = cv_method

    # Specify ML task (regression or classification)
    if cv_method == "strat":
        mltask = "cls"  # cast mltask to cls in case of stratification
    else:
        mltask = args.ml_task

    # Target column name
    trg_name = str(args.trg_name)
    # assert args.trg_name in data.columns, f'The prediction target ({args.name}) \
    #     was not found in the dataset.'

    # import ipdb; ipdb.set_trace()

    # -----------------------------------------------
    #       Create outdir
    # -----------------------------------------------
    if args.gout is not None:
        gout = Path(args.gout).resolve()
        sufx = "none" if args.split_on is None else args.split_on
        gout = gout / datapath.with_suffix(".splits")
        if args.split_on is not None:
            gout = gout / f"split_on_{sufx}"
        else:
            gout = gout / f"split_on_none"
    else:
        # Note! useful for drug response
        sufx = "none" if args.split_on is None else args.split_on
        gout = datapath.with_suffix(".splits")

    outfigs = gout / "outfigs"
    os.makedirs(gout, exist_ok=True)
    os.makedirs(outfigs, exist_ok=True)

    # -----------------------------------------------
    #       Create logger
    # -----------------------------------------------
    lg = Logger(gout / "data.splitter.log")
    print_fn = get_print_func(lg.logger)
    print_fn(f"File path: {fdir}")
    print_fn(f"\n{pformat(vars(args))}")
    dump_dict(vars(args), outpath=gout / "data.splitter.args.txt")

    # -----------------------------------------------
    #       Load data
    # -----------------------------------------------
    print_fn("\nLoad master dataset.")
    data = load_data(datapath)
    print_fn("data.shape {}".format(data.shape))

    # ydata = data[trg_name] if trg_name in data.columns else None
    # if (cv_method == "strat") and (ydata is None):
    #     raise ValueError("Prediction target column must be available if splits need to be stratified.")
    if (cv_method == "strat") and (trg_name not in data.columns):
        raise ValueError(
            "Prediction target column must be available if splits need to be stratified."
        )

    # if ydata is not None:
    #     plot_hist(ydata, title=f"{trg_name}", fit=None, bins=100,
    #               path=outfigs/f"{trg_name}_hist_all.png")
    if trg_name in data.columns:
        plot_hist(data[trg_name], title=f"{trg_name}", fit=None, bins=100,
                  path=outfigs / f"{trg_name}_hist_all.png")

    # -----------------------------------------------
    #       Generate splits (train/val/test)
    # -----------------------------------------------
    print_fn("\n{}".format("-" * 50))
    print_fn("Split data into hold-out train/val/test")
    print_fn("{}".format("-" * 50))

    kwargs = {
        "cv_method": cv_method,
        "te_method": te_method,
        "te_size": te_size,
        "mltask": mltask,
        "split_on": args.split_on
    }

    data_splitter(
        data=data,
        n_splits=args.n_splits,
        gout=gout,
        outfigs=outfigs,
        # ydata = ydata,
        target_name=trg_name,
        print_fn=print_fn,
        seed=seed,
        **kwargs)

    print_fn("Runtime: {:.1f} min".format((time() - t0) / 60))
    print_fn("Done.")
    lg.close_logger()
def gen_ml_data(
        fpath,
        common_samples,
        # fea_type,
        # drg_set,
        dd_fea=None,
        fps_fea=None,
        img_fea=None,
        ID='TITLE',
        fea_sep='_',
        score_name='reg',
        n_samples=None,
        n_top=None,
        sampling=None,
        q_cls=0.025,
        binner=False,
        bin_th=2.0,
        baseline=False,
        print_fn=print,
        outdir=Path('out'),
        outfigs=Path('outfigs')):
    """
    Generate a single set of ML data for the loaded target from fpath.
    This func was specifically created to process the new LARGE DOE-MD datasets
    with ZINC drugs that contain >6M molecules.

    Args:
        fpath : path to load docking scores file
        common_samples : list of drug names that are common to all feature
            types, including dd_fea, fps_fea, and img_fea
        dd_fea : df of Mordred descriptors
        fps_fea : df of ecfp2 fingerprints
        img_fea : image data (TODO: this is not supported yet!)
        fea_sep : separator between feature prefix string and feature name
        score_name : rename the docking score col with score_name
        n_samples : total number of samples in the final ml_df
        n_top : keep this number of top-most dockers
        sampling : specify the method to use when sampling samples from df
        q_cls : quantile value to compute along the docking scores to generate the 'cls' col
        bin_th : threshold value of docking score to generate the 'binner' col
        binner : add binner column
        baseline : whether to compute ML baseline scores

    Returns:
        res : results summary
    """
    print_fn(f'\nProcess {fpath.name} ...')
    res = {}
    trg_name = fpath.with_suffix('').name  # note! depends on dock file names
    res['target'] = trg_name

    # Load docking
    dock = load_data(fpath)
    if dock.empty:
        print_fn('Empty file')
        return None

    if (n_samples is not None) and (dock.shape[0] <= n_samples):
        print_fn("n_samples is larger than len(dock), skip this receptor")
        return res

    # Pre-proc the dock file
    ## ID = 'TITLE'
    scoring_func = 'Chemgauss4'
    dock = proc_dock_score(dock, ID=ID, score_name=score_name,
                           scoring_func=scoring_func)

    # Plot histogram of all (raw) scores
    plot_hist_dock_scores(dock, outfigs=outfigs, subdir_name='all.raw',
                          trg_name=trg_name, scoring_func=scoring_func)

    # Convert and bound scores to >=0
    dock[score_name] = abs(np.clip(dock[score_name], a_min=None, a_max=0))
    print_fn('dock: {}'.format(dock.shape))

    # Plot histogram of all (transformed) scores
    plot_hist_dock_scores(dock, outfigs=outfigs, subdir_name='all.transformed',
                          trg_name=trg_name, scoring_func=scoring_func)

    # -----------------------------------------
    # Sample a subset of scores
    # -------------------------
    # Extract samples that are common to all feature types
    aa = dock[dock[ID].isin(common_samples)].reset_index(drop=True)

    # Extract subset of samples
    if (n_samples is not None) and (n_top is not None):
        n_bot = n_samples - n_top
        aa = aa.sort_values('reg', ascending=False).reset_index(drop=True)
        df_top = aa[:n_top].reset_index(drop=True)  # e.g. 100K
        df_rest = aa[n_top:].reset_index(drop=True)

        # if flatten:
        #     df_bot = flatten_dist(df=df_rest, n=n_bot, score_name=score_name)
        # else:
        #     df_bot = df_rest.sample(n=n_bot, replace=False)
        if sampling == 'flatten':
            df_bot = flatten_dist(df=df_rest, n=n_bot, score_name=score_name)
        elif sampling == 'random':
            df_bot = df_rest.sample(n=n_bot, replace=False)
        else:
            raise ValueError("'sampling' arg must be specified.")

        assert df_top.shape[1] == df_bot.shape[1], 'Num cols must be the same when concat.'
        aa = pd.concat([df_top, df_bot], axis=0).reset_index(drop=True)

        # Plot histogram of sampled scores
        outfigs_dir = outfigs / 'sampled.transformed'
        os.makedirs(outfigs_dir, exist_ok=True)
        fig, ax = plt.subplots()
        ax.hist(df_top[score_name], bins=100, facecolor='r', alpha=0.7,
                label='Top 10K Docking Ligands')
        ax.hist(df_bot[score_name], bins=100, facecolor='b', alpha=0.7,
                label='Other Ligands (balanced)')
        ax.set_xlabel(f'Docking Score ({scoring_func})')
        ax.set_ylabel('Count')
        plt.grid(True)
        plt.legend(loc='best', framealpha=0.5)
        plt.title(f'sampled.transformed; Samples {n_samples}; n_top {n_top}')
        plt.savefig(outfigs_dir / f'dock.dist.{trg_name}.png', dpi=150)

        del df_top, df_bot, df_rest

    elif (n_samples is not None):
        # if flatten:
        #     aa = flatten_dist(df=aa, n=n_samples, score_name=score_name)
        # else:
        #     aa = aa.sample(n=n_samples, replace=False)
        if sampling == 'flatten':
            aa = flatten_dist(df=aa, n=n_samples, score_name=score_name)
        elif sampling == 'random':
            aa = aa.sample(n=n_samples, replace=False)
        else:
            raise ValueError("'sampling' arg must be specified.")

        plot_hist_dock_scores(dock, outfigs=outfigs,
                              subdir_name='sampled.transformed',
                              trg_name=trg_name, scoring_func=scoring_func)

    dock = aa
    del aa

    # -----------------------------------------
    # Create cls col
    # --------------
    # Find quantile value
    if dock[score_name].min() >= 0:  # if scores were transformed to >=0
        q_cls = 1.0 - q_cls
    cls_th = dock[score_name].quantile(q=q_cls)
    res['cls_th'] = cls_th
    print_fn('Quantile score (q_cls={:.3f}): {:.3f}'.format(q_cls, cls_th))

    # Generate a classification target col
    if dock[score_name].min() >= 0:  # if scores were transformed to >=0
        value = (dock[score_name] >= cls_th).astype(int)
    else:
        value = (dock[score_name] <= cls_th).astype(int)
    dock.insert(loc=1, column='cls', value=value)
    # print_fn('Ratio {:.2f}'.format( dd['dock_bin'].sum() / dd.shape[0] ))

    # Plot
    hist, bin_edges = np.histogram(dock[score_name], bins=100)
    x = np.ones((10, )) * cls_th
    y = np.linspace(0, hist.max(), len(x))

    fig, ax = plt.subplots()
    plt.hist(dock[score_name], bins=200, density=False, facecolor='b', alpha=0.7)
    plt.title(f'Scores clipped to 0: {trg_name}')
    plt.xlabel(f'Docking Score ({scoring_func})')
    plt.ylabel('Count')
    plt.plot(x, y, 'm--', alpha=0.7, label=f'{q_cls}-th quantile')
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(outfigs / f'dock.dist.cls.{trg_name}.png')

    # -----------------------------------------
    # Save dock scores
    cols = ['Inchi-key', 'SMILES', 'TITLE', 'reg', 'cls']
    dock = dock[[c for c in cols if c in dock.columns]]
    trg_outdir = outdir / f'DIR.ml.{trg_name}'
    outpath = trg_outdir / f'docks.df.{trg_name}.csv'
    os.makedirs(trg_outdir, exist_ok=True)
    dock.to_csv(outpath, index=False)

    # Add binner (note! may not be necessary since we get good dock scores)
    # if binner:
    #     dock = add_binner(dock, score_name=score_name, bin_th=bin_th)

    # Merge only on TITLE (when including also SMILES, there is a mismatch on
    # certain samples; maybe smiles that come with features are canonicalized)
    merger = ID

    def merge_dock_and_fea(dock, fea_df, fea_prfx, fea_sep, merger='TITLE',
                           fea_name=None, baseline=False):
        """ ...
""" # drug_names = set(common_samples).intersection(set(dock[ID].values)) ml_df = pd.merge(dock, fea_df, how='inner', on=merger).reset_index(drop=True) del fea_df # bb = fea_df[ fea_df[merger].isin(dock[merger].tolist()) ].reset_index(drop=True) # xdata = extract_subset_fea(bb, fea_list=[fea_prfx], fea_sep=fea_sep) # bb = pd.concat([bb[merger], xdata], axis=1) # keep only the merger meta col from fea_df # xdata = extract_subset_fea(fea_df, fea_list=[fea_prfx], fea_sep=fea_sep) # fea_df = pd.concat([fea_df[merger], xdata], axis=1) # keep only the merger meta col from fea_df # ml_df = pd.merge(dock, fea_df, how='inner', on=merger).reset_index(drop=True) # del fea_df, xdata # Re-org cols fea_cols = extract_subset_fea_col_names(ml_df, fea_list=[fea_prfx], fea_sep=fea_sep) meta_cols = ['Inchi-key', 'SMILES', 'TITLE', 'CAT', 'reg', 'cls'] cols = meta_cols + fea_cols # ml_df = ml_df[cols] ml_df = ml_df[[c for c in cols if c in ml_df.columns]] print_fn('{}: {}'.format(fea_name, ml_df.shape)) # Save outpath = trg_outdir / f'ml.{trg_name}.{fea_name}' ml_df.to_parquet(str(outpath) + '.parquet') # Compute baseline if specified if baseline: te_scr = trn_baseline(ml_df, fea_list=[fea_prfx], fea_sep=fea_sep) res[f'{fea_prfx}_r2'] = te_scr['r2'] res[f'{fea_prfx}_mae'] = te_scr['median_absolute_error'] del te_scr del ml_df if dd_fea is not None: merge_dock_and_fea(dock, fea_df=dd_fea, fea_prfx='dd', fea_sep=fea_sep, merger=ID, fea_name='descriptors', baseline=baseline) if fps_fea is not None: merge_dock_and_fea(dock, fea_df=fps_fea, fea_prfx='ecfp2', fea_sep=fea_sep, merger=ID, fea_name='ecfp2', baseline=baseline) if img_fea is not None: pass # if n_samples is not None: # assert n_samples == ml_df.shape[0], 'Final ml_df size must match n_samples {}'.format(fpath) return res
from time import time
from optparse import OptionParser
from multiprocessing import Process
from mne import Epochs, find_events
from time import time, strftime, gmtime
import os
from stimulus_presentation import auditory_p300
from utils import utils
from collections import OrderedDict
import numpy as np
from pandas import DataFrame
from psychopy import visual, core, event, sound, monitors
from pylsl import StreamInfo, StreamOutlet, resolve_byprop, StreamInlet

raw = utils.load_data('auditory/P300', sfreq=256.,
                      subject_nb=subject, session_nb=session)

raw.plot_psd()

raw.filter(1, 30, method='iir')

events = find_events(raw)
event_id = {'Non-Target': 1, 'Target': 2}

epochs = Epochs(raw, events=events, event_id=event_id,
                tmin=-0.1, tmax=0.8, baseline=None,
def train(network='rnn'):
    word2id, id2word = load_data(TOKEN_DATA)
    tag2id, id2tag = load_data(TAG_DATA)
    x_train, y_train, seq_lens, _, _ = generate_data(TRAIN_DATA, word2id, tag2id,
                                                     max_len=hp.max_len)
    x_dev, y_dev, dev_seq_lens, _, source_tag = generate_data(
        DEV_DATA, word2id, tag2id, max_len=hp.max_len)
    vocab_size = len(word2id)
    num_tags = len(tag2id)
    if network == "transformer":
        model = TransformerCRFModel(vocab_size, num_tags, is_training=True)
    elif network == 'rnn':
        model = BiRnnCRF(vocab_size, num_tags)
    elif network == 'cnn':
        model = CnnCRF(vocab_size, num_tags)
    elif network == 'match-pyramid':
        model = CnnCRF(vocab_size, num_tags)
    else:
        return
    sv = tf.train.Supervisor(graph=model.graph, logdir=logdir, save_model_secs=0)
    with sv.managed_session() as sess:
        for epoch in range(1, hp.num_epochs + 1):
            if sv.should_stop():
                break
            train_loss = []
            for x_batch, y_batch, len_batch in batch_data(x_train, y_train,
                                                          seq_lens, hp.batch_size):
                feed_dict = {
                    model.x: x_batch,
                    model.y: y_batch,
                    model.seq_lens: len_batch
                }
                loss, _ = sess.run([model.loss, model.train_op], feed_dict=feed_dict)
                train_loss.append(loss)

            dev_loss = []
            predict_lists = []
            for x_batch, y_batch, len_batch in batch_data(x_dev, y_dev,
                                                          dev_seq_lens, hp.batch_size):
                feed_dict = {
                    model.x: x_batch,
                    model.y: y_batch,
                    model.seq_lens: len_batch
                }
                loss, logits = sess.run([model.loss, model.logits], feed_dict)
                dev_loss.append(loss)
                transition = model.transition.eval(session=sess)
                pre_seq = model.predict(logits, transition, len_batch)
                pre_label = recover_label(pre_seq, len_batch, id2tag)
                predict_lists.extend(pre_label)

            train_loss_v = np.round(float(np.mean(train_loss)), 4)
            dev_loss_v = np.round(float(np.mean(dev_loss)), 4)
            print('****************************************************')
            acc, p, r, f = get_ner_fmeasure(source_tag, predict_lists)
            print('epoch:\t{}\ttrain loss:\t{}\tdev loss:\t{}'.format(
                epoch, train_loss_v, dev_loss_v))
            print('acc:\t{}\tp:\t{}\tr:\t{}\tf:\t{}'.format(acc, p, r, f))
            print('****************************************************\n\n')
def main(
        data_dir: str = '/project/cq-training-1/project2/teams/team12/data/',
        model_name: str = 'seq2seqgru',
        epochs: int = 20,
        optimizer: str = 'adam',
        lr: float = 1e-3,
        batch_size: int = 32,
        vocab_size: int = None,  # If None, all tokens will be in vocab
        seq_len: int = None,  # If None the seq len is dynamic (might not work with all models)
        seed: bool = True,
        model_config: dict = None,
        embedding: str = None,
        embedding_dim: int = 128,
        back_translation_model: str = 'saved_model/<model_folder_name>',
        back_translation: bool = False,
        back_translation_ratio: float = 1.0,
        fr_to_en: bool = False):

    # Call to remove tensorflow warning about casting float64 to float32
    tf.keras.backend.set_floatx('float32')

    # Set random seed
    if seed:
        tf.random.set_seed(SEED)
        np.random.seed(SEED)

    # Data paths
    path_en = os.path.join(data_dir, 'train.lang1')
    path_fr = os.path.join(data_dir, 'train.lang2')
    path_unaligned_en = os.path.join(data_dir, 'unaligned-tok.en')
    path_unaligned_fr = os.path.join(data_dir, 'unaligned-tok.fr')

    if fr_to_en:  # Switch paths
        tmp = path_en
        path_en = path_fr
        path_fr = tmp

    # Create vocabs
    logger.info('Creating vocab...')
    word2idx_en, idx2word_en = utils.create_vocab(path_en, vocab_size)
    word2idx_fr, idx2word_fr = utils.create_vocab(path_fr, vocab_size)
    logger.info(
        f'Size of english vocab : {len(word2idx_en)}, size of french vocab : {len(word2idx_fr)}'
    )

    # Back translation
    prediction_file = None
    if back_translation:
        prediction_file = os.path.join(utils.SHARED_PATH, 'translated_unaligned.en')
        if os.path.exists(prediction_file):
            logger.info(
                f'Using translation from {prediction_file} for back-translation.'
            )
        else:
            logger.info(
                f'Translating {path_unaligned_fr} for back-translation...')

            # Load data
            data = utils.load_data(path_unaligned_fr, word2idx_fr)
            dataset = tf.data.Dataset.from_generator(
                lambda: [ex for ex in data],
                tf.int64,
                output_shapes=tf.TensorShape([None])).padded_batch(
                    128, padded_shapes=[None])

            # Load model
            model_config = {
                'num_layers': 2,
                'd_model': 128,
                'dff': 512,
                'num_heads': 8
            }
            model = Transformer(model_config, len(word2idx_fr), word2idx_en)
            model.load_weights(os.path.join(back_translation_model, "model"))

            # Write prediction to file
            with open(prediction_file, 'w') as f:
                print('opening file and writing predictions...')
                for batch in tqdm(dataset, desc='Translating...',
                                  total=len(data) // 128 + 1):
                    preds = model({
                        'inputs': batch,
                        'labels': tf.zeros_like(batch)
                    })
                    for pred in preds:
                        sentence = utils.generate_sentence(
                            np.argmax(pred.numpy(), axis=1).astype('int'),
                            idx2word_en)
                        f.writelines([sentence, '\n'])

    # Load datasets
    logger.info('Loading datasets...')
    train_dataset, valid_dataset, nb_train_ex, nb_valid_ex = utils.load_training_data(
        path_en, path_fr, word2idx_en, word2idx_fr, seq_len, batch_size,
        en_back_translated_path=prediction_file,
        fr_unaligned_path=path_unaligned_fr,
        back_translation_ratio=back_translation_ratio)
    logger.info(
        f'Number of training examples : {nb_train_ex}, number of valid examples : {nb_valid_ex}'
    )

    # Load embeddings
    embedding_matrix = None
    if embedding:
        logger.info(f'Loading embedding {embedding} ...')
        if embedding == 'fasttext':
            embedding_matrix = utils.create_fasttext_embedding_matrix(
                path_unaligned_en, word2idx_en, embedding_dim)
        elif embedding == 'word2vec':
            raise Exception(f'Embedding "{embedding}" not implemented yet')
        elif embedding == 'glove':
            raise Exception(f'Embedding "{embedding}" not implemented yet')
        else:
            raise Exception(f'Embedding "{embedding}" not recognized.')

    # Create model
    if model_name == 'gru':
        model = baselines.GRU(len(word2idx_fr), batch_size)
    elif model_name == 'seq2seqgru':
        if model_config is None:
            model_config = {
                'embedding_dim': 256,
                'encoder_units': 512,
                'decoder_units': 512,
                'n_layers': 1
            }
        model = Seq2SeqGRU(len(word2idx_en), word2idx_fr, batch_size,
                           model_config, embedding_matrix=embedding_matrix)
    elif model_name == 'transformer':
        if model_config is None:
            model_config = {
                'num_layers': 2,
                'd_model': 128,
                'dff': 512,
                'num_heads': 8
            }
        model = Transformer(model_config, len(word2idx_en), word2idx_fr,
                            embedding_matrix=embedding_matrix)
    else:
        raise Exception(f'Model "{model}" not recognized.')

    # Optimizer
    if optimizer == 'adam':
        if model_name == 'transformer':
            # Use adam according to transformer paper
            optimizer = tf.keras.optimizers.Adam(
                utils.CustomSchedule(model_config['d_model']),
                beta_1=0.9, beta_2=0.98, epsilon=1e-9)
            logger.info(
                'Using custom scheduler for learning rate, --lr argument ignored.'
            )
        else:
            optimizer = tf.keras.optimizers.Adam(lr)
    elif optimizer == 'sgd':
        optimizer = tf.keras.optimizers.SGD(lr)
    else:
        raise Exception(f'Optimizer "{optimizer}" not recognized.')

    # Training loop
    logger.info(f'Training with model {model.get_name()} ...')
    metrics = {
        'train_accuracy': [],
        'valid_accuracy': [],
        'train_loss': [],
        'valid_loss': [],
        'train_bleu': [],
        'valid_bleu': []
    }
    model_path = model.get_name() + f'_fr_to_en_{fr_to_en}_embedding_{embedding}_embedding_dim_{embedding_dim}'\
                 f'_back_translation_{back_translation}_ratio_{back_translation_ratio}'
    best_valid_bleu = 0
    for epoch in range(epochs):
        train_epoch(model, train_dataset, optimizer,
                    np.ceil(nb_train_ex / batch_size), idx2word_fr)
        test_epoch(model, valid_dataset, np.ceil(nb_valid_ex / batch_size),
                   idx2word_fr, idx2word_en)

        train_accuracy = train_accuracy_metric.result().numpy()
        valid_accuracy = valid_accuracy_metric.result().numpy()
        train_loss = train_loss_metric.result().numpy()
        valid_loss = valid_loss_metric.result().numpy()
        train_bleu = train_bleu_metric.result()
        valid_bleu = valid_bleu_metric.result()

        if valid_bleu > best_valid_bleu:
            best_valid_bleu = valid_bleu
            utils.save_model(model, model_path)

        # Logs
        logger.info(f'Epoch {epoch}\n'\
                    f' Train BLEU : {train_bleu:.4f} - Valid BLEU : {valid_bleu:.4f}\n'\
                    f' Train Accuracy : {train_accuracy:.4f} - Valid Accuracy : {valid_accuracy:.4f}\n'\
                    f' Train Loss : {train_loss:.4f} - Valid Loss : {valid_loss:.4f}')
        metrics['train_accuracy'].append(train_accuracy)
        metrics['valid_accuracy'].append(valid_accuracy)
        metrics['train_loss'].append(train_loss)
        metrics['valid_loss'].append(valid_loss)
        metrics['train_bleu'].append(train_bleu)
        metrics['valid_bleu'].append(valid_bleu)

        # If using back translation, sample new generated examples for next epoch
        if back_translation:
            train_dataset, _, _, _ = utils.load_training_data(
                path_en, path_fr, word2idx_en, word2idx_fr, seq_len, batch_size,
                en_back_translated_path=prediction_file,
                fr_unaligned_path=path_unaligned_fr,
                back_translation_ratio=back_translation_ratio)

        # If training with embeddings, unfreeze embedding layer at 50th epoch
        if epoch == 48 and embedding and model_name == 'transformer':
            model.unfreeze_embedding_layer()

    # save metrics
    utils.save_metrics(metrics, model_path)

    # Plot accuracy
    plots.plot_accuracy(metrics['train_accuracy'], metrics['valid_accuracy'])
def ensemble_methods_regressor_forest_dataset():
    data = utils.load_data('forestfires.csv')
    new_data = utils.convert_data_to_numeric(data, [2, 3])

    feature_vector = new_data[:, 0:-1]
    targets = new_data[:, -1]

    # Data normalization
    data_features_normalized = normalization.z_score_normalization(feature_vector)

    data_features_train, data_features_test, data_targets_train, data_targets_test = \
        train_test_split(data_features_normalized, targets, test_size=0.25)

    # Model declaration
    """
    Parameters to select:
        n_estimators: The number of base estimators in the ensemble.
            Values: Random Forest and Bagging. Default 10
                    AdaBoost. Default: 50

        ### Only for Bagging and Boosting: ###
        base_estimator: Base algorithm of the ensemble. Default: DecisionTree

        ### Only for Random Forest: ###
        criterion: "entropy" or "gini": default: gini
        max_depth: maximum depth of tree, default: None
    """
    names = ["Bagging Regressor", "AdaBoost Regressor", "Random Forest Regressor"]
    models = [
        BaggingRegressor(
            base_estimator=tree.DecisionTreeRegressor(criterion='mse', max_depth=10)),
        AdaBoostRegressor(
            base_estimator=tree.DecisionTreeRegressor(criterion='mse', max_depth=10)),
        RandomForestRegressor(criterion='mse', max_depth=10)
    ]

    for name, em_reg in zip(names, models):
        logger.info("###################---" + name + "---###################")
        em_reg.fit(data_features_train, data_targets_train)

        # Model evaluation
        test_data_predicted = em_reg.predict(data_features_test)
        error = metrics.mean_absolute_error(data_targets_test, test_data_predicted)
        logger.debug('Total Error: %s', error)
def incremental_load():
    kwargs = request.args.to_dict()
    return load_data("incremental", **kwargs)
from utils import utils
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
import numpy as np
from sklearn import svm
from sklearn.metrics import roc_curve, auc, accuracy_score, recall_score, precision_score
from sklearn.decomposition import TruncatedSVD

features, labels, valid_adj, valid_graph_labels, train_adj, train_graph_labels = utils.load_data(cuda=False)

svd = TruncatedSVD(300)
tr = [[v for row in graph for v in row] for graph in train_adj]
#tr = svd.fit_transform(tr)
vr = [[v for row in graph for v in row] for graph in valid_adj]
#vr = svd.fit_transform(vr)
tl = np.argmax(train_graph_labels, axis=1)
vl = np.argmax(valid_graph_labels, axis=1)

print('Decision Tree:')
cl = DecisionTreeClassifier()
cl = cl.fit(tr, tl)
pl = cl.predict(vr)
print(accuracy_score(vl, pl), precision_score(vl, pl), recall_score(vl, pl))

print('Random Forest:')
cl = RandomForestClassifier(oob_score=True, random_state=10)
cl = cl.fit(tr, tl)
pl = cl.predict(vr)
print(accuracy_score(vl, pl), precision_score(vl, pl), recall_score(vl, pl))
])
device = torch.device('cuda', args.cuda) if torch.cuda.is_available() else torch.device('cpu')

# Reset random state for reproducibility
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)

if args.multinet:
    args.sym = 0
    args.embedding = 0

# Load dataset
graph, adj, features, labels, idx_train, idx_val, idx_test = load_data(
    path=args.data_dir, percent=args.train_percent, sym=args.sym)
embedding = node2vec(graph, args.data_dir, args.sym)
if args.multinet:
    embedding1 = node2vec(graph, args.data_dir, 1)
del graph
gc.collect()

if args.pca > 0:
    aff_features = PCA(args.pca, whiten=True).fit_transform(features.numpy())
    features = torch.FloatTensor(aff_features)

if args.embedding == 0:
    del features
    features = embedding.to(device)
    msg = 'Uses only the embedding features (extracting from Node2Vec model).'
    if args.multinet:
        features1 = embedding1.to(device)
        msg = '`multinet` specified, load embeddings for both adj and adj.T.'