model = tf.keras.models.load_model(path)

l_start = time.time()
log_every = 200

for n in range(n_data):
    print('current data ', n)
    if n % log_every == 0 and n != 0:
        l_end = time.time()
        l_time = l_end - l_start
        print(f'current data: {n}')
        print(f'{log_every} data done in: {l_time:.05} seconds')
        l_start = time.time()

    s = s_data[n]
    s = one_hot_encode(list(s), s_n_chars)
    s = np.expand_dims(s, axis=0)

    logit = model.predict([p, s])
    pred = tf.nn.sigmoid(logit)
    fpred = float(pred.numpy())

    if all_smi[n] in all_y_preds:
        all_y_preds[all_smi[n]].append(fpred)
    else:
        all_y_preds[all_smi[n]] = [fpred]

hp.save_pkl(f'{savepath}chunk_{chunk_id}_model_{i}_all_y_preds.pkl', all_y_preds)

end = time.time()
print(f'PREDICTIONS DONE in {end - start:.05} seconds')
####################################
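# Hedged sketch of the one_hot_encode helper assumed by the loop above; the
# character-to-index mapping (char_to_int) is not part of this excerpt, so the
# version below builds an illustrative one and is not the repo's implementation.
import numpy as np

def one_hot_encode(tokens, n_chars, char_to_int=None):
    """Return a (len(tokens), n_chars) one-hot matrix for a tokenized sequence."""
    if char_to_int is None:
        # assumed vocabulary built from the tokens themselves (illustration only)
        char_to_int = {c: i for i, c in enumerate(sorted(set(tokens)))}
    encoded = np.zeros((len(tokens), n_chars), dtype=np.float32)
    for pos, tok in enumerate(tokens):
        encoded[pos, char_to_int[tok]] = 1.0
    return encoded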
warnings.warn(
    'It seems that you are using a very low amount of your data for training')
####################################

####################################
# start processing
data = hp.read_with_pd(filename)

# random split, keeping a fraction t_split of the data for training
all_idx = np.arange(len(data))
np.random.shuffle(all_idx)
cut = int(len(all_idx) * t_split)
idx_train = all_idx[:cut]
idx_val = all_idx[cut:]

if verbose:
    print(f'N data used for training: {len(idx_train)}')
    print(f'N data used for validation: {len(idx_val)}')

# create the partitions for the data generator
partition = {}
partition['train'] = idx_train
partition['val'] = idx_val
hp.save_pkl(f'{savepath}partitions.pkl', partition)

end = time.time()
print(f'Train-val split for Swiss-Prot DONE in {end - start:.04} seconds')
####################################
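# Hedged sketch of how the saved partition indices are typically consumed by a
# tf.keras Sequence-based generator. The repo's actual data generator is not
# part of this excerpt, so the class name, constructor arguments and
# __getitem__ logic below are illustrative assumptions.
import numpy as np
import tensorflow as tf

class IndexGenerator(tf.keras.utils.Sequence):
    def __init__(self, indices, data, labels, batch_size=32, shuffle=True):
        self.indices = np.asarray(indices)
        self.data, self.labels = data, labels
        self.batch_size, self.shuffle = batch_size, shuffle
        self.on_epoch_end()

    def __len__(self):
        # number of batches per epoch
        return int(np.ceil(len(self.indices) / self.batch_size))

    def __getitem__(self, i):
        batch = self.indices[i * self.batch_size:(i + 1) * self.batch_size]
        return self.data[batch], self.labels[batch]

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)

# e.g. tr_generator = IndexGenerator(partition['train'], X, y)  # names assumed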
        temp_all_clean[pair].append(score)  # end of the first pass over the raw BindingDB rows

print('Starting the second pass')
all_clean = []
n_removed = 0
n_same_duplicate = 0

for k, v in temp_all_clean.items():
    if len(v) == 1:
        datapoint = (k[0], k[1], v[0])
        all_clean.append(datapoint)
    # case with duplicates, but same binary activity
    elif len(set(v)) == 1:
        datapoint = (k[0], k[1], v[0])
        all_clean.append(datapoint)
        n_same_duplicate += 1
    else:
        n_removed += 1

print(f'N remaining data points: {len(all_clean)}')
print(f'Duplicates removed because of inconsistent binary activity: {n_removed}')
print(f'Entries with multiple identical binary activity values, kept once: {n_same_duplicate}')

os.makedirs(savepath, exist_ok=True)
hp.save_pkl(f'{savepath}all_clean.pkl', all_clean)
####################################

end = time.time()
print(f'BindingDB extraction DONE in {end - start:.04} seconds')
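# Hedged usage sketch: downstream scripts are assumed to load all_clean.pkl and
# unpack it into SMILES, protein sequences and binary labels; the variable
# names below are illustrative, not taken from this repo.
import pickle

with open(f'{savepath}all_clean.pkl', 'rb') as f:
    all_clean = pickle.load(f)
all_smi = [smi for smi, prot, y in all_clean]
all_protein = [prot for smi, prot, y in all_clean]
all_labels = [y for smi, prot, y in all_clean]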
if ngpu > 1:
    with strategy.scope():
        seqmodel = SeqModel(vocab_size, max_len_model,
                            layers, dropouts, trainables,
                            lr, batchnorm)
else:
    seqmodel = SeqModel(vocab_size, max_len_model,
                        layers, dropouts, trainables,
                        lr, batchnorm)

if config.getboolean('RESTART', 'restart'):
    # Load the pretrained model
    path_model = config['RESTART']['path_model']
    if path_model is None:
        raise ValueError(
            'You did not provide a path to a model to be loaded for the restart')
    seqmodel.model = tf.keras.models.load_model(path_model)

history = seqmodel.model.fit_generator(
    generator=tr_generator,
    validation_data=val_generator,
    use_multiprocessing=True,
    epochs=epochs,
    callbacks=[checkpointer, lr_reduction, early_stopper],
    workers=num_workers,
    verbose=2)

hp.save_pkl(f'{save_path}history', history.history)

end = time.time()
print(f'TRAINING DONE in {end - start:.05} seconds')
####################################
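# Hedged sketch of the three callbacks referenced above (checkpointer,
# lr_reduction, early_stopper); the monitored metric, patience values and
# checkpoint path are assumptions made for illustration, not the repo's values.
checkpointer = tf.keras.callbacks.ModelCheckpoint(
    filepath=f'{save_path}best_model.h5',  # assumed checkpoint location
    monitor='val_loss', save_best_only=True)
lr_reduction = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)
early_stopper = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=10, restore_best_weights=True)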
    # init protein pretrained model
    model = ProteinBertModel.from_pretrained('bert-base')
    tokenizer = TAPETokenizer(vocab='iupac')

    results = Parallel(n_jobs=nworkers)(
        delayed(get_PROTrepr)(i, x, model, tokenizer, f_savepath)
        for i, x in enumerate(unique_protein))

    all_indices = {}
    for x in results:
        i = x[0]
        prot = x[1]
        _all_idx = unique_prot_to_idx[prot]
        for _idx in _all_idx:
            all_indices[_idx] = i
    hp.save_pkl(f'{savepath}all_indices_{rpr}.pkl', all_indices)

    z_norma = True
    if z_norma:
        rep_dim = 768
        mean, std = do_z_norma(f'{f_savepath}', rep_dim)
        hp.save_pkl(f'{savepath}z_norma_param_{rpr}.pkl', {'mean': mean, 'std': std})

elif rpr == 'clm':
    from src import helper_clm as hp_clm
    from keras.models import load_model

    max_len_model = 100 + 2
    pad_char = 'A'
    start_char = 'G'
    end_char = 'E'
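# Hedged sketch of the get_PROTrepr worker used above: it is assumed to encode
# one protein with the TAPE BERT model, mean-pool the per-residue embeddings
# into a 768-d vector and save it under f_savepath; the pooling choice and the
# .npy file layout are assumptions for illustration only.
import numpy as np
import torch

def get_PROTrepr(i, sequence, model, tokenizer, f_savepath):
    token_ids = torch.tensor([tokenizer.encode(sequence)])
    with torch.no_grad():
        sequence_output, pooled_output = model(token_ids)
    # mean over residues -> one 768-d representation per protein (assumed pooling)
    rep = sequence_output.squeeze(0).mean(dim=0).numpy()
    np.save(f'{f_savepath}{i}.npy', rep)
    return (i, sequence)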
    tr_all_id.extend(all_clusters_id[str(id_)])

val_all_id = []
for id_ in val_cluster_idx:
    val_all_id.extend(all_clusters_id[str(id_)])

assert len(tr_all_id) + len(val_all_id) == len(all_ids)

# get all the data id from the protein index
for id_ in tr_all_id:
    seq = data_id_to_seq[id_]
    indices = [idx for idx, x in enumerate(all_protein) if x == seq]
    data_id_tr.extend(indices)

for id_ in val_all_id:
    seq = data_id_to_seq[id_]
    indices = [idx for idx, x in enumerate(all_protein) if x == seq]
    data_id_val.extend(indices)

assert len(data_id_tr) + len(data_id_val) == len(all_protein)

partition = {}
partition['train'] = data_id_tr
partition['validation'] = data_id_val
hp.save_pkl(f'{savepath}/CV_partition_{i}.pkl', partition)
i += 1
####################################

end = time.time()
print(f'CV folds DONE in {end - start:.04} seconds')
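# Hedged sketch of how the cluster-level fold indices (tr_cluster_idx /
# val_cluster_idx) used above could be generated; the real fold construction is
# outside this excerpt, so splitting the cluster ids with sklearn's KFold, the
# number of folds and the random seed are assumptions made for illustration.
from sklearn.model_selection import KFold
import numpy as np

cluster_ids = np.array(list(all_clusters_id.keys()))
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for tr_pos, val_pos in kf.split(cluster_ids):
    tr_cluster_idx = cluster_ids[tr_pos]
    val_cluster_idx = cluster_ids[val_pos]
    # ... build data_id_tr / data_id_val for this fold as in the loop above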
start = time.time()

####################################
# get back parameters
args = vars(parser.parse_args())
datapath = args['datapath']
savepath = args['savepath']
verbose = args['verbose']
####################################

####################################
# start processing
with open(f'{datapath}/clusterRes_cluster.tsv') as tsvfile:
    tsvreader = csv.reader(tsvfile, delimiter="\t")
    for i, line in enumerate(tsvreader):
        cluster_repr = line[0]
        in_cluster = line[1]
        if cluster_repr in all_clusters_id:
            all_clusters_id[cluster_repr].append(in_cluster)
        else:
            all_clusters_id[cluster_repr] = [in_cluster]

hp.save_pkl(f'{savepath}/data_id_to_seq.pkl', data_id_to_seq)
hp.save_pkl(f'{savepath}/all_clusters_id.pkl', all_clusters_id)

end = time.time()
print(f'Data process from MMseqs2 DONE in {end - start:.04} seconds')
####################################
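# Hedged sketch of producing the clusterRes_cluster.tsv parsed above: it is the
# two-column (representative, member) output of MMseqs2 clustering. The FASTA
# filename and the sequence-identity threshold below are assumptions for
# illustration, not values taken from this repo.
import subprocess

subprocess.run(
    ['mmseqs', 'easy-cluster', 'proteins.fasta', 'clusterRes', 'tmp',
     '--min-seq-id', '0.3'],
    check=True)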