def cadd_generate_batched_lmdb_from_many_csv(lmdb_batch_dir, csv_folder,
                                             variant_ids_file, batch_size,
                                             num_batches=-1):
    it = dir_batch_generator(csv_folder, batch_size)
    test_batch = next(it)
    variant_ids = load_pickle(variant_ids_file)
    nrows = len(variant_ids)
    # Build one example row to estimate the required LMDB map size
    row_example = {
        "batch_id": np.int32(0),
        "inputs": test_batch[0].values.astype(np.float16),
        "targets": test_batch[1].values.astype(np.float16),
        "metadata": {
            "row_num": np.array(test_batch[0].index, dtype=np.int32),
            "variant_id": np.array(variant_ids.loc[test_batch[0].index],
                                   dtype='<U20')
        }
    }
    ms = calculate_map_size(row_example, nrows)
    # Re-create the generator so the first batch is not lost to the size probe
    it = dir_batch_generator(csv_folder, batch_size)
    create_batched_lmdb_from_iterator(it, lmdb_batch_dir, variant_ids_file,
                                      num_batches=num_batches, map_size=ms)
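# Usage sketch (the paths and batch size below are illustrative assumptions,
# not values taken from this code): build the batched LMDB from a folder of
# CSV chunks and a pickled Series of variant ids.
cadd_generate_batched_lmdb_from_many_csv(
    lmdb_batch_dir="data/lmdb_batched",
    csv_folder="data/csv_chunks",
    variant_ids_file="data/variant_ids/all.pkl",
    batch_size=256,
    num_batches=-1,  # -1 means: write every batch produced by the iterator
)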
def __init__(self, lmbd_dir, batch_idx_file, version="1.3",
             hg_assembly="GRCh37"):
    self.version = version
    # indexed by location
    self.batch_idxs = load_pickle(batch_idx_file)
    self.lmdb_cadd_path = lmbd_dir
    self.lmdb_cadd = None
    self.txn = None
def reorder_sparse_matrix(input_npz, row_ids, output_npz):
    """Re-order the rows of a sparse matrix.

    Args:
        input_npz: path to a .npz file
        row_ids List[int]: a list of integer row positions or a path to a
            pickle file containing the shuffled row order
        output_npz: output .npz file path
    """
    from scipy.sparse import load_npz, save_npz
    if isinstance(row_ids, str):
        row_ids = load_pickle(row_ids)
    npz = load_npz(input_npz)
    npz = npz[row_ids]
    save_npz(output_npz, npz)
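# Usage sketch (file names are assumptions): shuffle the rows of the sparse
# matrix with the same pickled row order that is used to re-order the matching
# VCF, so matrix rows and VCF records stay aligned.
reorder_sparse_matrix(
    input_npz="data/sparse_matrices/all.npz",
    row_ids="data/shuffled_row_ids.pkl",  # pickled list/array of integer row positions
    output_npz="data/sparse_matrices/all_shuffled.npz",
)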
def __init__(self, lmbd_dir, variant_id_file, version="1.3",
             hg_assembly="GRCh37"):
    self.version = version
    self.lmdb_cadd_path = lmbd_dir
    self.lmdb_cadd = None
    self.txn = None
    # indexed by location
    self.variant_ids_file = variant_id_file
    self.variant_ids = load_pickle(self.variant_ids_file)
    self.variant_ids = self.variant_ids.values
def sparse_cadd_dataset(sparse_matrix, variant_ids_file, targets_col=0,
                        split=0.3, random_state=42, output_npz=None,
                        output_ids=None, separate_x_y=False):
    """Splits a sparse matrix into train and test set.

    Args:
        sparse_matrix: path-like or csr_matrix instance.
        variant_ids_file: path to a pickled pd.Series of variant ids.
        targets_col: index of the column holding the targets.
        split: fraction of rows used for the validation set.
        random_state: seed for the shuffle split.
        output_npz: if given, directory to which train.npz / valid.npz are written.
        output_ids: if given, directory to which train.pkl / valid.pkl are written.
        separate_x_y: if True, each split is returned as an (inputs, targets) tuple.
    """
    from scipy.sparse import load_npz, save_npz, csr_matrix
    from sklearn.model_selection import ShuffleSplit
    import os
    if isinstance(sparse_matrix, str):
        sparse_matrix = load_npz(sparse_matrix)
    elif not isinstance(sparse_matrix, csr_matrix):
        raise ValueError(
            "Input must be either a path to a sparse matrix or an object of "
            "csr_matrix type.")
    keep_cols = list(range(sparse_matrix.shape[1]))
    keep_cols.remove(targets_col)
    assert targets_col not in keep_cols
    variant_ids = load_pickle(variant_ids_file)
    rs = ShuffleSplit(n_splits=1, test_size=split, random_state=random_state)
    train_index, valid_index = next(rs.split(variant_ids))
    train, valid = sparse_matrix[train_index], sparse_matrix[valid_index]
    train_ids, valid_ids = variant_ids[train_index], variant_ids[valid_index]
    if separate_x_y:
        train = train[:, keep_cols], train[:, targets_col]
        valid = valid[:, keep_cols], valid[:, targets_col]
    del sparse_matrix
    if output_npz is not None:
        save_npz(os.path.join(output_npz, "train.npz"), train)
        save_npz(os.path.join(output_npz, "valid.npz"), valid)
    if output_ids is not None:
        dump_to_pickle(os.path.join(output_ids, "train.pkl"), train_ids)
        dump_to_pickle(os.path.join(output_ids, "valid.pkl"), valid_ids)
    return (train, train_ids), (valid, valid_ids)
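# Usage sketch (paths are assumptions): split a sparse CADD matrix into train
# and validation parts, keeping column 0 as the target, and persist the pieces.
(train, train_ids), (valid, valid_ids) = sparse_cadd_dataset(
    sparse_matrix="data/sparse_matrices/all.npz",
    variant_ids_file="data/variant_ids/all.pkl",
    targets_col=0,
    split=0.3,
    output_npz="data/splits",  # writes train.npz / valid.npz into this directory
    output_ids="data/splits",  # writes train.pkl / valid.pkl into this directory
    separate_x_y=False,
)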
def create_batched_lmdb_from_iterator(it, lmdb_batch_dir, variant_ids_file,
                                      num_batches=-1, map_size=23399354270):
    start = time.time()
    batch_num = 0
    variant_ids = load_pickle(variant_ids_file)
    env = lmdb.Environment(lmdb_batch_dir, map_size=map_size, max_dbs=0,
                           lock=False)
    with env.begin(write=True, buffers=True) as txn:
        for batch in tqdm(it):
            b = {
                "batch_id": np.int32(batch_num),
                "inputs": batch[0].values.astype(np.float16),
                "targets": batch[1].values.astype(np.float16),
                "metadata": {
                    "row_num": np.array(batch[0].index, dtype=np.int32),
                    "variant_id": np.array(variant_ids.loc[batch[0].index],
                                           dtype='<U20')
                }
            }
            # Serialize and compress
            buff = pa.serialize(b).to_buffer()
            blzpacked = blosc.compress(buff, typesize=8, cname='blosclz')
            try:
                txn.put(str(batch_num).encode('ascii'), blzpacked)
            except lmdb.MapFullError as err:
                print(str(err) + ". Exiting the program.")
                break
            batch_num += 1
            # Stop early when a positive num_batches limit is reached
            if 0 < num_batches <= batch_num:
                break
    print("Finished putting " + str(batch_num) + " batches to lmdb.")
    end = time.time()
    print("Total elapsed time: {:.2f} minutes.".format((end - start) / 60))
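# Read-back sketch (the directory and the key "0" are assumptions): batches are
# stored as blosc-compressed pyarrow buffers under their batch number, so
# reading one back simply reverses those two steps.
import lmdb
import blosc
import pyarrow as pa

env = lmdb.Environment("data/lmdb_batched", readonly=True, lock=False)
with env.begin() as txn:
    raw = txn.get("0".encode("ascii"))  # first batch was stored under key "0"
    batch = pa.deserialize(blosc.decompress(raw))
print(batch["inputs"].shape, batch["metadata"]["variant_id"][:5])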
def __init__(self, lmdb_dir, variant_id_file, version="1.3",
             hg_assembly="GRCh37"):
    """Reads LMDB database and obtains all predictions available for each
    variant.
    """
    self.version = version
    self.lmdb_dir = lmdb_dir
    self.lmdb_kipoi = None
    self.txn = None
    self._column_names = None
    self.variant_ids_file = variant_id_file
    self.variant_ids = load_pickle(self.variant_ids_file)
    self.variant_ids = self.variant_ids.values
def __init__(self, sparse_npz, variant_ids, version="1.3",
             hg_assembly="GRCh37"):
    if isinstance(sparse_npz, str) and isinstance(variant_ids, str):
        self.data = load_npz(sparse_npz)
        self.variant_ids = load_pickle(variant_ids)
    elif isinstance(sparse_npz, csr_matrix) and isinstance(variant_ids,
                                                           pd.Series):
        self.data = sparse_npz
        self.variant_ids = variant_ids
    else:
        raise ValueError(
            "Inputs must be either paths or objects of csr_matrix and "
            "pd.Series types.")
    self.variant_ids = self.variant_ids.values
def __init__(self, lmdb_dirs_list, variant_ids_file, version="1.3",
             hg_assembly="GRCh37"):
    self.version = version
    self.lmdb_dirs_list = lmdb_dirs_list
    self.variant_ids_file = variant_ids_file
    self.variant_ids = load_pickle(self.variant_ids_file)
    self.variant_ids = self.variant_ids.values
    self._column_names = None
    self.datasets = [
        KipoiLmdbDataset(db, variant_ids_file, version)
        for db in lmdb_dirs_list
    ]
def reorder_vcf(input_vcf, row_ids, output_vcf, discard_metadata=False):
    """Re-order the vcf file.

    Note: when discard_metadata is True, the output vcf keeps only a minimal
    ##fileformat header line.

    Args:
        input_vcf: path to a vcf file
        row_ids List[int]: a list of integer row positions or a path to a
            pickle file containing the shuffled row order
        output_vcf: output vcf file path
        discard_metadata: if True, the ID, QUAL, FILTER and INFO fields of the
            vcf are dropped
    """
    if isinstance(row_ids, str):
        row_ids = load_pickle(row_ids)
    # Recover the column names from the #CHROM header line
    colnames = ""
    with open(input_vcf, 'r') as f:
        for l in f:
            if "#CHROM" in l:
                colnames = l.replace("\n", "").split("\t")
                break
    vcf_df = pd.read_csv(input_vcf, sep="\t", header=None, names=colnames,
                         comment="#")
    vcf_df = vcf_df.loc[row_ids]
    if discard_metadata:
        vcf_df.drop(columns=['ID', 'QUAL', 'FILTER', 'INFO'], inplace=True)
        header = "##fileformat=VCFv4.0\n"
        with open(output_vcf, 'w') as f:
            f.write(header)
        vcf_df.to_csv(output_vcf, sep="\t", mode='a', index=None)
    else:
        # Copy the original header lines verbatim
        header_lines = ""
        with open(input_vcf, 'r') as f:
            for l in f:
                if l.startswith("#"):
                    header_lines += l
        with open(output_vcf, 'w') as f:
            f.write(header_lines)
        vcf_df.to_csv(output_vcf, sep="\t", mode='a', header=None, index=None)
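# Usage sketch (file names are assumptions): apply the same pickled row order
# to the VCF so it matches the shuffled sparse matrix; discarding metadata
# keeps only the minimal ##fileformat header.
reorder_vcf(
    input_vcf="data/variants.vcf",
    row_ids="data/shuffled_row_ids.pkl",
    output_vcf="data/variants_shuffled.vcf",
    discard_metadata=True,
)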
def train(self, sample_weight=None, scaler_path=None,
          training_type=np.float32):
    """Train the model

    Args:
        sample_weight: optional per-sample weights passed to model.fit
        scaler_path: path to a pickled scaler used to transform X before fitting
        training_type: dtype used for training; anything other than np.float32
            downcasts the scaled inputs to float16
    """
    from sklearn.externals import joblib
    print("Started loading training dataset")
    X_train, y_train = self.train_dataset.load_all()
    if len(self.valid_dataset) == 0:
        raise ValueError("len(self.valid_dataset) == 0")
    if scaler_path:
        scaler = load_pickle(scaler_path)
        print("Started scaling X.")
        X_infl = X_train.astype(np.float32)
        X_infl = scaler.transform(X_infl)
        if training_type is not np.float32:
            X_train = X_infl.astype(np.float16)
            # Clip to the float16 range to avoid overflow to inf
            if isinstance(X_train, csr_matrix):
                X_train.data = np.minimum(X_train.data, 65500)
            else:
                X_train = np.minimum(X_train, 65500)
            del X_infl
            print("The dataset was downscaled.")
        else:
            # Keep the scaled float32 inputs
            X_train = X_infl
        print("Finished scaling X.")
    print("Finished loading training dataset. Shape: ", X_train.shape,
          "True values:", y_train.sum() / y_train.shape[0])
    self.model.fit(X_train, y_train, sample_weight=sample_weight)
    print("Calculating training accuracy:")
    acc = self.model.score(X_train, y_train)
    print("Obtained training accuracy: ", acc)
    joblib.dump(self.model, self.ckp_file)
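# Usage sketch (the trainer object and scaler path are assumptions; the class
# that owns this method is not shown here): fit the model on the full training
# set, scaling with a pre-fitted scaler and downcasting to float16.
trainer.train(scaler_path="data/scaler.pkl", training_type=np.float16)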
def evaluate(self, metric, scaler_path=None, eval_type=np.float32, save=True):
    """Evaluate the model on the validation set

    Args:
        metric: a callable invoked as metric(y_true, y_pred)
        scaler_path: path to a pickled scaler used to transform X before
            prediction
        eval_type: dtype used for evaluation; anything other than np.float32
            downcasts the scaled inputs to float16
        save: if True, write the metric result to self.evaluation_path
    """
    print("Started loading validation dataset")
    X_valid, y_valid = self.valid_dataset.load_all()
    if scaler_path:
        scaler = load_pickle(scaler_path)
        print("Started scaling X.")
        X_infl = X_valid.astype(np.float32)
        X_infl = scaler.transform(X_infl)
        if eval_type is not np.float32:
            X_valid = X_infl.astype(np.float16)
            # Clip to the float16 range to avoid overflow to inf
            if isinstance(X_valid, csr_matrix):
                X_valid.data = np.minimum(X_valid.data, 65500)
            else:
                X_valid = np.minimum(X_valid, 65500)
            del X_infl
        else:
            # Keep the scaled float32 inputs
            X_valid = X_infl
        print("Finished scaling X.")
    print("Finished loading validation dataset. Shape: ", X_valid.shape,
          "True values:", y_valid.sum() / y_valid.shape[0])
    y_pred = self.model.predict(X_valid)
    metric_res = metric(y_valid, y_pred)
    print("metric_res", metric_res, np.amax(X_valid))
    if save:
        write_json(metric_res, self.evaluation_path, indent=2)
    if self.cometml_experiment:
        self.cometml_experiment.log_multiple_metrics(flatten(metric_res),
                                                     prefix="best/")
    return metric_res
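# Usage sketch (the trainer object, scaler path and the metric bundle below are
# assumptions): evaluate the fitted model with a callable that returns a dict
# of metrics, which is what flatten() and the comet.ml logging expect.
from sklearn.metrics import accuracy_score, roc_auc_score

def classification_metrics(y_true, y_pred):
    # hypothetical metric bundle returning a dict keyed by metric name
    return {"accuracy": accuracy_score(y_true, y_pred),
            "auROC": roc_auc_score(y_true, y_pred)}

metric_res = trainer.evaluate(metric=classification_metrics,
                              scaler_path="data/scaler.pkl",
                              save=True)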
for f in TRAINING_DATA_FILES:
    inputfile = os.path.join(training_dir, f + ".tsv.gz")
    out = os.path.join(variant_ids_dir, f + ".pkl")
    if not os.path.isfile(out):  # skip if file exists
        generate_variant_ids(inputfile, out, variant_cols=variant_cols,
                             dtype=dtype)

print("Generating sparse matrices...")
# Generate sparse matrices
for f in TRAINING_DATA_FILES:
    # Get the base of the name
    inputfile = os.path.join(training_dir, f + ".csv.gz")
    # Num lines is necessary to set the total in tqdm, important feedback in a
    # lengthy function
    num_lines = len(load_pickle(os.path.join(variant_ids_dir, f + ".pkl")))
    output = os.path.join(sparse_matrices_dir, f + ".npz")
    if not os.path.isfile(output):
        load_csv_chunks_tosparse(inputfile, 10000, np.float32,
                                 num_lines=num_lines, output=output,
                                 header=None)

# Merge variant ids
output = os.path.join(variant_ids_dir, "all.pkl")
if not os.path.isfile(output):
    print("Merging variant ids...")
    all_ids = None
    for f in tqdm(TRAINING_DATA_FILES):
def train_test_split_indexes(variant_id_file, test_size, random_state=1):
    variants = load_pickle(variant_id_file)
    train_vars, test_vars = train_test_split(variants, test_size=test_size,
                                             random_state=random_state)
    return train_vars, test_vars
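# Usage sketch (the path is an assumption): split the pickled variant ids into
# train and test subsets with a fixed random_state for reproducibility.
train_vars, test_vars = train_test_split_indexes(
    variant_id_file="data/variant_ids/all.pkl",
    test_size=0.2,
    random_state=1,
)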