def __call__(self):
    """Collect cross-evaluation metrics into one tab-separated table.

    Every sub-directory of ``metrics/cross/<experiment_type>,<data_name>`` is
    visited; its name is split on ``','`` into the model experiment type and
    the model data name.  For each ``*.h5`` file whose parsed ``region``
    equals ``self.region``, the values listed in ``self.metrics`` are read
    from the file's ``metrics`` group and appended as one row.  The table is
    written to ``self.outfile``.
    """
    import glob
    import h5py

    name_dict = {'d': 'data_name', 'w': 'window_size', 'p': 'percentile',
                 'm': 'model_name', 'r': 'region'}
    header = ['experiment_type', 'data_name', 'model_experiment_type',
              'model_data_name', 'percentile', 'window_size',
              'model_name'] + self.metrics
    base_dir = 'metrics/cross/{},{}'.format(self.experiment_type, self.data_name)

    records = []
    for dirname in os.listdir(base_dir):
        model_experiment_type, model_data_name = dirname.split(',')
        for filename in glob.glob('{}/{}/*.h5'.format(base_dir, dirname)):
            d = parse_filename(filename, name_dict)
            if d['region'] != self.region:
                continue
            record = [self.experiment_type, self.data_name,
                      model_experiment_type, model_data_name,
                      d['percentile'], d['window_size'], d['model_name']]
            # Close each HDF5 file as soon as its metrics are read; the loop
            # may visit many files and open handles would otherwise pile up.
            with h5py.File(filename, 'r') as f:
                grp = f['metrics']
                for metric in self.metrics:
                    record.append(str(grp[metric][()]))
            records.append(record)

    self.logger.info('save file: {}'.format(self.outfile))
    prepare_output_file(self.outfile)
    with open(self.outfile, 'w') as fout:
        fout.write('\t'.join(header) + '\n')
        for record in records:
            fout.write('\t'.join(record) + '\n')
def __call__(self):
    """Plot the weights of the ``dense_1`` kernel, one line per alphabet symbol.

    The kernel stored at ``/model_weights/dense_1/dense_1/kernel:0`` in
    ``self.infile`` is reshaped to ``(window_size, len(self.alphabet))`` and
    each column is drawn against the window position.  The figure is saved to
    ``self.outfile``.
    """
    import h5py
    import numpy as np
    import_matplotlib()

    n_symbols = len(self.alphabet)
    with h5py.File(self.infile, 'r') as f:
        model_weights = f['/model_weights/dense_1/dense_1/kernel:0'][:]

    # Floor division keeps these values integral under true division as well;
    # they are used as array dimensions and tick positions.
    window_size = model_weights.shape[0] // n_symbols
    offset = (window_size + 1) // 2
    # The column count follows the alphabet instead of a hard-coded 4.
    model_weights = model_weights.reshape((window_size, n_symbols))

    fig, ax = plt.subplots(figsize=(20, 4))
    positions = np.arange(window_size)
    for i in range(n_symbols):
        ax.plot(positions, model_weights[:, i], '-', label=self.alphabet[i])
    ax.set_xlim(0, window_size)
    ax.set_ylabel('Weight')
    ax.set_xlabel('Position')
    ax.set_xticks(positions, minor=True)
    # Major ticks every 5 positions, labelled relative to half the window.
    major_ticks = np.arange(offset % 5, window_size + 1, step=5)
    ax.set_xticks(major_ticks)
    ax.set_xticklabels(major_ticks - window_size // 2)
    ax.set_ylim(-2, 2)
    ax.legend()
    plt.tight_layout()

    self.logger.info('save figure: {}'.format(self.outfile))
    prepare_output_file(self.outfile)
    plt.savefig(self.outfile, dpi=150, bbox_inches='tight')
def __call__(self):
    """Generate a synthetic classification dataset and save a train/test split.

    The samples come from ``sklearn.datasets.make_classification`` configured
    by the instance attributes, are split with ``train_test_split`` using
    ``self.test_ratio`` as the test size, and are stored in ``self.outfile``
    as the datasets ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    """
    import h5py
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split

    generator_options = dict(
        n_informative=self.n_informative,
        n_redundant=self.n_redundant,
        n_repeated=self.n_repeated,
        n_classes=self.n_classes,
        n_clusters_per_class=self.n_clusters_per_class,
        flip_y=self.flip_y,
        class_sep=self.class_sep,
        shift=self.shift,
        scale=self.scale,
    )
    X, y = make_classification(self.n_samples, self.n_features,
                               **generator_options)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=self.test_ratio)

    self.logger.info('save file: ' + self.outfile)
    prepare_output_file(self.outfile)
    f = h5py.File(self.outfile, 'w')
    named_arrays = (('X_train', X_train), ('y_train', y_train),
                    ('X_test', X_test), ('y_test', y_test))
    for dataset_name, array in named_arrays:
        f.create_dataset(dataset_name, data=array)
    f.close()
def train(args):
    """Train a Keras model on HDF5 data and save it to ``args.model_file``.

    Training arrays are read from ``args.input_file`` (datasets
    ``args.xname`` / ``args.yname``); validation arrays are read from
    ``args.valid_file`` when it is set.  The model is obtained from
    ``get_model(args.model)`` and built with the window size taken from
    ``X_train.shape[1]``.  ``args.regression`` selects mean squared error
    instead of binary cross-entropy.  TensorBoard and CSV logging callbacks
    are attached when the corresponding arguments are given.
    """
    import numpy as np
    import keras
    import h5py
    from models import get_model
    from ioutils import prepare_output_file, make_dir

    def read_arrays(path, x_key, y_key):
        # Load two datasets completely into memory and close the file.
        handle = h5py.File(path, 'r')
        features = handle[x_key][:]
        targets = handle[y_key][:]
        handle.close()
        return features, targets

    logger.info('load training data: ' + args.input_file)
    X_train, y_train = read_arrays(args.input_file, args.xname, args.yname)

    valid_data = None
    if args.valid_file:
        logger.info('load validation data: ' + args.valid_file)
        valid_data = read_arrays(args.valid_file,
                                 args.valid_xname, args.valid_yname)

    if args.n_threads >= 1:
        logger.info('set number of threads to {} for TensorFlow'.format(
            args.n_threads))
        set_keras_num_threads(args.n_threads)

    window_size = X_train.shape[1]
    model = get_model(args.model)(window_size)
    if args.regression:
        loss, metrics = 'mean_squared_error', ['mean_squared_error']
    else:
        loss, metrics = 'binary_crossentropy', ['accuracy']
    model.compile(optimizer='Adam', loss=loss, metrics=metrics)
    model.summary()

    callbacks = []
    if args.tensorboard_log_dir:
        from keras.callbacks import TensorBoard
        callbacks.append(TensorBoard(log_dir=args.tensorboard_log_dir))
    if args.keras_log is not None:
        logger.info('open CSV log file: {}'.format(args.keras_log))
        make_dir(os.path.dirname(args.keras_log))
        callbacks.append(keras.callbacks.CSVLogger(args.keras_log))

    logger.info('train model')
    model.fit(X_train, y_train,
              batch_size=args.batch_size,
              epochs=args.epochs,
              callbacks=callbacks,
              verbose=args.keras_verbose,
              validation_data=valid_data)

    logger.info('save model: {}'.format(args.model_file))
    prepare_output_file(args.model_file)
    model.save(args.model_file)
def __call__(self):
    """Plot BUMHMM posteriors for 50 randomly sampled windows into a PDF.

    Transcripts are drawn at random (with replacement).  A draw is skipped
    when the transcript is shorter than the 300-position window or when no
    window has a NaN fraction below 0.75.  For an accepted transcript the
    first acceptable window is drawn as a bar chart, with NaN positions shown
    in grey at a height of -0.1.

    Raises:
        ValueError: if no transcript is at least one window long, because
            the sampling loop could never finish in that case.

    NOTE(review): the loop can still spin indefinitely when every long
    transcript is almost entirely NaN; confirm whether an attempt limit is
    wanted.
    """
    import h5py
    import numpy as np
    import_matplotlib()
    from scipy import signal

    self.logger.info('read BUMHMM file: ' + self.posterior_file)
    with h5py.File(self.posterior_file, 'r') as f:
        posteriors = f['posteriors'][:]
    self.logger.info('read BUMHMM input file: ' + self.bumhmm_input_file)
    with h5py.File(self.bumhmm_input_file, 'r') as f:
        start = f['start'][:]
        end = f['end'][:]
        name = f['name'][:]

    window_size = 300
    window = np.ones(window_size)
    if not np.any((end - start) >= window_size):
        raise ValueError(
            'no transcript is at least {} positions long'.format(window_size))

    self.logger.info('open pdf file: ' + self.outfile)
    prepare_output_file(self.outfile)
    plt.rcParams['axes.labelsize'] = 'small'
    plt.rcParams['xtick.labelsize'] = 'x-small'
    plt.rcParams['ytick.labelsize'] = 'x-small'
    plt.rcParams['axes.titlesize'] = 'small'

    with PdfPages(self.outfile) as pdf:
        n_plots = 0
        while n_plots < 50:
            i = np.random.choice(len(name))
            if end[i] - start[i] < window_size:
                continue
            # Fraction of NaN posteriors in every full window of the transcript.
            is_na = np.isnan(posteriors[start[i]:end[i]]).astype('float')
            na_fraction = signal.convolve(is_na, window, mode='valid') / window_size
            valid_windows = np.nonzero(na_fraction < 0.75)[0]
            if len(valid_windows) == 0:
                continue
            self.logger.info('plot %s' % name[i])
            offset = valid_windows[0]
            length = min(window_size, end[i] - start[i] - offset)
            index = np.arange(start[i] + offset, start[i] + length + offset)
            # Fancy indexing returns a copy, so `posteriors` stays untouched.
            posteriors_fillna = posteriors[index]
            na_index = np.nonzero(np.isnan(posteriors_fillna))[0]
            posteriors_fillna[na_index] = -0.1
            color = np.full(length, '#0000ff', dtype='S7')
            color[na_index] = '#999999'
            x = np.arange(offset, offset + length)
            fig, ax = plt.subplots(figsize=(15, 1.5))
            ax.bar(x, posteriors_fillna, color=color, edgecolor='none')
            ax.set_xlim(0, length)
            ax.set_ylim(-0.1, 1)
            ax.set_title('BUMHMM posteriors (%s)' % name[i])
            plt.tight_layout()
            pdf.savefig(fig)
            plt.clf()
            plt.close(fig)
            n_plots += 1
def __call__(self):
    """Train a Keras model that is defined by an external Python script.

    Training (and optionally validation) arrays are loaded from HDF5 files.
    The script ``self.model_script`` is then executed inside this function;
    it is expected to bind a variable named ``model``.  The locals defined
    before the ``exec`` (``X_train``, ``window_size``, ``optimizer``,
    ``regression``, ``loss``, ``metrics`` and the others) are visible to the
    script and may be rebound by it, which is why they keep their names.

    NOTE: the script is executed with full privileges; only run trusted
    model scripts.
    """
    import h5py

    self.logger.info('load training data: ' + self.infile)
    fin = h5py.File(self.infile, 'r')
    X_train, y_train = fin[self.xname][:], fin[self.yname][:]
    fin.close()

    valid_data = None
    if self.valid_file:
        self.logger.info('load validation data: ' + self.valid_file)
        fin = h5py.File(self.valid_file, 'r')
        X_valid, y_valid = fin[self.valid_xname][:], fin[self.valid_yname][:]
        fin.close()
        valid_data = (X_valid, y_valid)

    window_size = X_train.shape[1]
    from keras.optimizers import RMSprop
    optimizer = RMSprop(lr=self.learning_rate)

    # The variables optimizer, loss and metrics may be overridden by the script.
    regression = self.regression
    if self.regression:
        loss, metrics = 'mean_squared_error', ['mean_squared_error']
    else:
        loss, metrics = 'binary_crossentropy', ['accuracy']

    with open(self.model_script, 'r') as f:
        # Parenthesised form of the exec statement: same semantics as
        # ``exec compile(...)``, the script runs in this function's namespace.
        exec(compile(f.read(), self.model_script, 'exec'))
    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
    model.summary()

    callbacks = []
    if self.tensorboard_log_dir:
        from keras.callbacks import TensorBoard
        callbacks.append(TensorBoard(log_dir=self.tensorboard_log_dir))
    if self.keras_log is not None:
        self.logger.info('open CSV log file: {}'.format(self.keras_log))
        make_dir(os.path.dirname(self.keras_log))
        callbacks.append(keras.callbacks.CSVLogger(self.keras_log))

    self.logger.info('train model')
    model.fit(X_train, y_train,
              batch_size=self.batch_size,
              epochs=self.epochs,
              callbacks=callbacks,
              verbose=self.keras_verbose,
              validation_data=valid_data)

    self.logger.info('save model: {}'.format(self.model_file))
    prepare_output_file(self.model_file)
    model.save(self.model_file)
def __call__(self):
    """Plot BUMHMM posteriors with drop-off rate and coverage tracks.

    Ten transcripts are drawn at random (with replacement).  For each one, up
    to 300 positions starting 50 positions into the transcript are plotted:
    the posteriors on the first axes (NaN values in grey at -0.05), followed
    by a drop-off rate panel and a coverage panel for every row of the
    ``coverage`` dataset.  All pages go to the PDF ``self.outfile``.
    """
    import h5py
    import numpy as np
    import_matplotlib()

    self.logger.info('read BUMHMM file: ' + self.posterior_file)
    with h5py.File(self.posterior_file, 'r') as f:
        posteriors = f['posteriors'][:]
    self.logger.info('read BUMHMM input file: ' + self.bumhmm_input_file)
    with h5py.File(self.bumhmm_input_file, 'r') as f:
        start = f['start'][:]
        end = f['end'][:]
        name = f['name'][:]
        coverage = f['coverage'][:]
        sample_name = f['sample_name'][:]
        replicate = f['replicate'][:]
        dropoff_count = f['dropoff_count'][:]

    self.logger.info('open pdf file: ' + self.outfile)
    prepare_output_file(self.outfile)
    plt.rcParams['axes.labelsize'] = 'small'
    plt.rcParams['xtick.labelsize'] = 'x-small'
    plt.rcParams['ytick.labelsize'] = 'x-small'
    plt.rcParams['axes.titlesize'] = 'small'

    n_tracks = coverage.shape[0]
    with PdfPages(self.outfile) as pdf:
        for i in np.random.choice(len(name), size=10):
            self.logger.info('plot %s' % name[i])
            length = min(300, end[i] - start[i] - 50)
            index = np.arange(start[i] + 50, start[i] + length + 50)
            x = np.arange(50, 50 + length)
            fig, axes = plt.subplots(1 + 2*n_tracks,
                                     figsize=(15, 2 + 2*n_tracks),
                                     sharex=True)

            # Fancy indexing copies, so filling NaNs leaves `posteriors` intact.
            posteriors_fillna = posteriors[index]
            is_na = np.isnan(posteriors_fillna)
            color = np.asarray(['#999999' if na else '#0000ff' for na in is_na])
            posteriors_fillna[is_na] = -0.05
            axes[0].bar(x, posteriors_fillna, color=color, edgecolor='none')
            axes[0].set_xlim(0, length)
            axes[0].set_ylim(-0.1, 1)
            axes[0].set_title('BUMHMM posteriors (%s)' % name[i])

            for j in range(n_tracks):
                label = (replicate[j], sample_name[j])
                rate_ax = axes[2*j + 1]
                coverage_ax = axes[2*j + 2]
                rate_ax.bar(x,
                            dropoff_count[j, index].astype('float')/coverage[j, index],
                            edgecolor='none')
                rate_ax.set_title('Dropoff rate of %s (%s)' % label)
                rate_ax.set_ylim(0, 0.5)
                coverage_ax.bar(x, coverage[j, index], edgecolor='none')
                coverage_ax.set_title('Coverage of %s (%s)' % label)
                # The repeated set_ylim(0, 0.5) on the rate axes was dropped:
                # it duplicated the call above and changed nothing.
            plt.tight_layout()
            pdf.savefig(fig)
            plt.clf()
            plt.close(fig)
def __call__(self):
    """Evaluate a saved Keras or scikit-learn model on a test set.

    ``X_test`` / ``y_test`` are read from ``self.test_file`` and optionally
    flattened to two dimensions.  Predictions are computed according to
    ``self.model_type``:

    * ``'keras'``: ``model.predict`` gives the scores, thresholded at 0.5
      for the labels;
    * ``'sklearn'``: the model is unpickled from the ``model`` member of the
      zip archive ``self.model_file``; scores come from ``decision_function``
      (``SVC``) or ``predict_proba`` (``RandomForestClassifier``).

    True values, predictions and every metric named in ``self.metrics`` are
    written to ``self.metric_file``; ``roc_auc`` is scored on the continuous
    predictions and all other metrics on the labels.

    Raises:
        ValueError: for an unknown ``self.model_type`` or an unsupported
            scikit-learn model class.

    NOTE: unpickling executes arbitrary code; only load trusted model files.
    """
    import h5py
    import zipfile

    self.logger.info('load test dataset: ' + self.test_file)
    with h5py.File(self.test_file, 'r') as f:
        X_test = f['X_test'][:]
        y_test = f['y_test'][:]
    if self.flatten:
        X_test = X_test.reshape((X_test.shape[0], -1))
        # Logged after the reshape so the message shows the flattened shape.
        self.logger.info('flatten the test data to dimension: (%d, %d)' % X_test.shape)

    if self.model_type == 'keras':
        import_keras()
        self.logger.info('load keras model: ' + self.model_file)
        model = keras.models.load_model(self.model_file)
        y_pred = model.predict(X_test)
        y_pred_labels = (y_pred >= 0.5).astype('int32')
    elif self.model_type == 'sklearn':
        import cPickle
        self.logger.info('load sklearn model: ' + self.model_file)
        zipf = zipfile.ZipFile(self.model_file, 'r')
        try:
            model = cPickle.loads(zipf.read('model'))
        finally:
            zipf.close()
        y_pred_labels = model.predict(X_test)
        model_name = model.__class__.__name__
        if model_name == 'SVC':
            y_pred = model.decision_function(X_test)
        elif model_name == 'RandomForestClassifier':
            y_pred = model.predict_proba(X_test)[:, 1]
        else:
            raise ValueError('unknown sklearn model ' + model_name)
    else:
        # This used to surface later as a NameError on `y_pred`.
        raise ValueError('unknown model type: {}'.format(self.model_type))

    self.logger.info('save metrics: ' + self.metric_file)
    prepare_output_file(self.metric_file)
    with h5py.File(self.metric_file, 'w') as f:
        f.create_dataset('y_true', data=y_test)
        f.create_dataset('y_pred', data=y_pred)
        f.create_dataset('y_pred_labels', data=y_pred_labels)
        g = f.create_group('metrics')
        for metric in self.metrics:
            scorer = get_scorer(metric)
            if metric == 'roc_auc':
                score = scorer(y_test, y_pred)
            else:
                score = scorer(y_test, y_pred_labels)
            self.logger.info('calculate metric {}: {}'.format(metric, score))
            g.create_dataset(metric, data=score)
def __call__(self):
    """Build a regression dataset from sequence windows and base density.

    For every name in the base-density table, the density is divided by its
    mean (and optionally smoothed with a Gaussian window).  Windows of
    ``self.window_size`` are then cut from the sequence every
    ``self.stride`` positions; each window is encoded with
    ``sequence_to_array`` and paired with the value ``self.offset``
    positions into the window.  Sampling stops once ``self.max_samples``
    samples exist.  The samples are split into train and test sets and
    written to ``self.outfile`` together with ``offset`` and ``window_size``.

    Side effect: ``self.offset`` is set to ``(window_size + 1) // 2`` when it
    was ``None``.

    Raises:
        ValueError: if no sample could be created.
    """
    import numpy as np
    from sklearn.model_selection import train_test_split
    import h5py
    from common import sequence_to_array
    from scipy import signal

    self.logger.info('read input file: ' + self.infile)
    _, base_density, _, _ = read_background_rt(self.infile)
    names = base_density.keys()
    self.logger.info('read sequence file: ' + self.sequence_file)
    sequences = dict(read_fasta(self.sequence_file))

    if self.offset is None:
        # Floor division: the offset is used as an array index below.
        self.offset = (self.window_size + 1) // 2

    X = []
    y = []
    if self.smooth:
        self.logger.info(
            'smooth the values using Gaussian window of width %.1f' % self.smooth_width)
        window = signal.gaussian(100, std=self.smooth_width)
    for name in names:
        seq = sequences[name]
        values = base_density[name] / base_density[name].mean()
        if self.smooth:
            values = signal.convolve(values, window, mode='same')
        for i in range(0, len(seq) - self.window_size, self.stride):
            X.append(sequence_to_array(seq[i:(i + self.window_size)]))
            y.append(values[i + self.offset])
            if len(X) >= self.max_samples:
                break
        # Stop scanning further sequences too, so the cap is a hard limit.
        if len(X) >= self.max_samples:
            break

    n_samples = len(X)
    self.logger.info('created {} samples'.format(n_samples))
    if n_samples == 0:
        raise ValueError('no samples could be created from ' + self.infile)
    X = np.concatenate(X).reshape((n_samples, self.window_size, 4))
    y = np.asarray(y, dtype='float32')
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=self.test_ratio)

    self.logger.info('save file: ' + self.outfile)
    prepare_output_file(self.outfile)
    with h5py.File(self.outfile, 'w') as f:
        f.create_dataset('offset', data=int(self.offset))
        f.create_dataset('window_size', data=int(self.window_size))
        f.create_dataset('X_train', data=X_train)
        f.create_dataset('y_train', data=y_train)
        f.create_dataset('X_test', data=X_test)
        f.create_dataset('y_test', data=y_test)
def __call__(self):
    """Write the row with the highest ``roc_auc`` for every model name.

    The tab-separated table ``self.infile`` is grouped by ``model_name``; the
    row with the largest ``roc_auc`` in each group is kept and the resulting
    table is written, tab-separated and without the index, to
    ``self.outfile``.
    """
    import_matplotlib()
    import pandas as pd

    df = pd.read_table(self.infile)
    summary = []
    for name, subdf in df.groupby(['model_name']):
        i = subdf['roc_auc'].idxmax()
        # `idxmax` returns an index label, so `.loc` is the label-based
        # replacement for the removed `.ix` indexer.
        summary.append(subdf.loc[i, :])
    summary = pd.concat(summary, axis=1).T

    self.logger.info('save summary table: {}'.format(self.outfile))
    prepare_output_file(self.outfile)
    summary.to_csv(self.outfile, index=False, sep='\t')
def __call__(self):
    """Train a model, pickle it, and optionally validate it on a CV fold.

    The model is built with ``get_model(self.model_name, hyperparam)`` where
    ``hyperparam`` is parsed from the JSON string ``self.hyperparam``.  When
    ``self.cv_index_file`` is given, the train/test indices of fold
    ``self.cv_fold`` split the training data; the held-out part is predicted
    after training and the predictions plus accuracy are stored in
    ``self.valid_metric_file``.

    Raises:
        ValueError: if ``cv_index_file`` is given without ``cv_fold`` or
            without ``valid_metric_file``.
    """
    import json
    import h5py
    import cPickle
    from sklearn.metrics import accuracy_score

    hyperparam = json.loads(self.hyperparam)
    model = get_model(self.model_name, hyperparam)

    self.logger.info('load data: {}'.format(self.infile))
    with h5py.File(self.infile, 'r') as fin:
        X_train = fin['X_train'][:]
        y_train = fin['y_train'][:]

    X_valid = None
    y_valid = None
    if self.cv_index_file is not None:
        if self.cv_fold is None:
            raise ValueError('argument --cv-fold is required if --cv-index-file is specified')
        if self.valid_metric_file is None:
            raise ValueError('argument --valid-metric-file is required if --cv-index-file is specified')
        self.logger.info('load CV index: ' + self.cv_index_file)
        with h5py.File(self.cv_index_file, 'r') as f:
            train_index = f[str(self.cv_fold)]['train'][:]
            test_index = f[str(self.cv_fold)]['test'][:]
        X_valid = X_train[test_index]
        y_valid = y_train[test_index]
        X_train = X_train[train_index]
        y_train = y_train[train_index]

    self.logger.info('train the model')
    model.fit(X_train, y_train)

    self.logger.info('save model: {}'.format(self.outfile))
    prepare_output_file(self.outfile)
    # Pickle into the open file object (binary mode), not the path string.
    with open(self.outfile, 'wb') as f:
        cPickle.dump(model, f)

    # `if X_valid:` is ambiguous for arrays; test against None explicitly.
    if X_valid is not None:
        self.logger.info('validate the model')
        y_pred_labels = model.predict(X_valid)
        self.logger.info('save the metrics: ' + self.valid_metric_file)
        prepare_output_file(self.valid_metric_file)
        with h5py.File(self.valid_metric_file, 'w') as f:
            f.create_dataset('model_name', dtype='S', data=self.model_name)
            # Store the parsed hyperparameters once; dumping the raw JSON
            # string again would double-encode it.
            f.create_dataset('hyperparam', dtype='S', data=json.dumps(hyperparam))
            f.create_dataset('y_pred_labels', data=y_pred_labels)
            f.create_dataset('y_true', data=y_valid)
            g = f.create_group('metrics')
            g.create_dataset('accuracy', data=accuracy_score(y_valid, y_pred_labels))
def __call__(self):
    """Plot a histogram of the per-name difference of one metric.

    The two tab-separated tables ``self.infile1`` and ``self.infile2`` are
    merged on ``name``; the histogram shows the first table's metric minus
    the second table's metric.  ``self.title`` is a format string that
    receives ``metric``, ``compare_method``, ``mean`` and ``median``.
    """
    import pandas as pd
    import_matplotlib()

    left = pd.read_table(self.infile1)
    right = pd.read_table(self.infile2)
    joined = pd.merge(left, right, on='name')
    # pandas suffixes the clashing metric columns with _x (left) and _y (right).
    column_left = '{}_x'.format(self.metric)
    column_right = '{}_y'.format(self.metric)
    diff = joined[column_left] - joined[column_right]

    figure, axis = plt.subplots(figsize=(10, 6))
    axis.hist(diff, bins=50)
    title = self.title.format(metric=self.metric,
                              compare_method=self.compare_method,
                              mean=diff.mean(),
                              median=diff.median())
    axis.set_title(title)
    axis.set_xlim(-1, 1)

    self.logger.info('save plot file: {}'.format(self.outfile))
    prepare_output_file(self.outfile)
    plt.savefig(self.outfile)
def __call__(self):
    """Write shuffled K-fold train/test indices to an HDF5 file.

    ``self.n_samples`` indices are split into ``self.n_folds`` folds.  Each
    fold becomes a group named after its zero-based number, containing the
    datasets ``train`` and ``test``.
    """
    import h5py
    from sklearn.model_selection import KFold
    import numpy as np

    self.logger.info('save file: ' + self.outfile)
    prepare_output_file(self.outfile)
    fout = h5py.File(self.outfile, 'w')
    splitter = KFold(self.n_folds, shuffle=True)
    sample_ids = np.arange(self.n_samples)
    for fold, (train_index, test_index) in enumerate(splitter.split(sample_ids)):
        group = fout.create_group('%d' % fold)
        group.create_dataset('train', data=train_index)
        group.create_dataset('test', data=test_index)
    fout.close()
def __call__(self):
    """Plot a 20-bin histogram of the non-NaN values of one feature.

    Every value contributes ``self.weight`` to its bin.  Axis labels and the
    optional title come from the instance; the figure is saved to
    ``self.outfile``.
    """
    import_matplotlib()
    import numpy as np

    data = GenomicData(self.infile, feature_names=[self.feature])
    fig, ax = plt.subplots(figsize=(4, 4))
    values = data.features[self.feature]
    valid_data = values[np.logical_not(np.isnan(values))]
    weights = np.full(len(valid_data), self.weight)
    ax.hist(valid_data, weights=weights, bins=20, color='#808080')
    ax.set_xlabel(self.xlabel)
    ax.set_ylabel(self.ylabel)
    plt.tight_layout()
    if self.title:
        ax.set_title(self.title)

    self.logger.info('save figure: {}'.format(self.outfile))
    prepare_output_file(self.outfile)
    plt.savefig(self.outfile)
def __call__(self):
    """Select the rows with the highest accuracy from a metric table.

    ``self.num`` is clamped to the number of rows (a value <= 0 selects all
    rows) and the clamped value is stored back on the instance.  The
    selected rows are printed; when ``self.outfile`` is set they are also
    written there as a JSON list of dicts.
    """
    import pandas as pd
    import json

    metric_table = pd.read_table(self.metric_file)
    n_rows = metric_table.shape[0]
    self.num = n_rows if self.num <= 0 else min(self.num, n_rows)

    ranked = metric_table.sort_values('accuracy', ascending=False)
    selected = ranked.iloc[:self.num, :]
    if self.outfile is not None:
        paramlist = [row.to_dict() for _, row in selected.iterrows()]
        prepare_output_file(self.outfile)
        with open(self.outfile, 'w') as f:
            json.dump(paramlist, f, indent=2)
    print(selected)
def __call__(self):
    """Export the RT-stop profiles of the 10 highest-RPKM entries as WAV files.

    For each selected name the ``rt_stop`` values are log-transformed, held
    for ``sample_rate / x_freq`` audio samples each, and used to
    amplitude-modulate an 880 Hz sine wave.  One file ``<name>.wav`` per
    entry is written to ``self.outdir``.
    """
    import wave
    import numpy as np

    rt = icshape_raw_rt_to_genomic_data(self.rt_file, self.logger)

    def modulate(values, wav_file, sample_rate=44100, n_channels=2,
                 max_amp=32767, x_freq=20):
        # Write `values` as the amplitude envelope of an 880 Hz tone.
        upsample_rate = float(sample_rate) / x_freq
        T = float(len(values)) / x_freq
        n_samples = int(sample_rate * T)
        # Zeros rather than `np.empty`: rounding of the slice bounds may
        # leave a trailing sample unassigned, which must not be garbage.
        x = np.zeros(n_samples, dtype='float32')
        for i in range(len(values)):
            x[int(upsample_rate * i):int(upsample_rate * (i + 1))] = np.log(values[i] + 1)
        t = np.linspace(0, T, n_samples)
        y = np.sin(2 * 880 * np.pi * t) * x
        # Normalise by the largest magnitude.  Using the largest signed value
        # let negative peaks exceed the int16 range and wrap around.
        peak = np.abs(y).max() if n_samples > 0 else 0.0
        if peak > 0:
            y *= float(max_amp) / peak
        # Interleave identical channels: [s0, s0, s1, s1, ...].
        data = np.repeat(y.astype('int16'), n_channels)

        wav = wave.open(wav_file, 'wb')
        try:
            wav.setnchannels(n_channels)
            wav.setsampwidth(2)
            wav.setframerate(sample_rate)
            wav.setnframes(n_samples)
            wav.setcomptype('NONE', 'no compression')
            wav.writeframes(data.tobytes())
        finally:
            wav.close()

    for i in np.argsort(-rt.meta['rpkm'])[:10]:
        name = rt.names[i]
        values = rt.feature('rt_stop', name)
        wav_file = os.path.join(self.outdir, '%s.wav' % name)
        self.logger.info('create wav file: ' + wav_file)
        prepare_output_file(wav_file)
        modulate(values, wav_file)
def __call__(self):
    """Convert BUMHMM posteriors into a ``GenomicData`` file.

    The flat ``posteriors`` array is cut into one slice per entry using the
    ``start`` / ``end`` offsets from the BUMHMM input file.  The slices are
    stored under the feature name ``bumhmm`` in ``self.outfile``.
    """
    from genomic_data import GenomicData
    import numpy as np
    import h5py

    self.logger.info('read BUMHMM file: ' + self.posterior_file)
    with h5py.File(self.posterior_file, 'r') as f:
        posteriors = f['posteriors'][:]
    self.logger.info('read BUMHMM input file: ' + self.bumhmm_input_file)
    with h5py.File(self.bumhmm_input_file, 'r') as f:
        start = f['start'][:]
        end = f['end'][:]
        name = f['name'][:]

    # A real list (not a lazy `map` object) so it can be measured and indexed.
    values = [posteriors[start[i]:end[i]] for i in range(len(name))]

    self.logger.info('save file: ' + self.outfile)
    prepare_output_file(self.outfile)
    GenomicData.from_data(name, features={'bumhmm': values}).save(self.outfile)
def number_of_samples(self):
    """Tabulate the train/test sample counts of every deepfold dataset.

    Each ``*.h5`` file in ``data/<experiment_type>/<data_name>/deepfold`` is
    opened, the lengths of its ``y_train`` and ``y_test`` datasets are
    recorded together with the region, window size and percentile parsed
    from the file name, and the table is written tab-separated to
    ``self.outfile``.
    """
    import h5py
    import glob

    name_dict = {'d': 'data_name', 'w': 'window_size', 'p': 'percentile',
                 'm': 'model_name', 'r': 'region'}
    header = ['experiment_type', 'data_name', 'region', 'window_size',
              'percentile', 'n_train', 'n_test']
    pattern = 'data/{}/{}/deepfold/*.h5'.format(self.experiment_type, self.data_name)

    records = []
    for filename in glob.glob(pattern):
        d = parse_filename(filename, name_dict)
        # Only the shapes are needed; close the file right away instead of
        # leaving one open handle per dataset.
        with h5py.File(filename, 'r') as f:
            n_train = f['y_train'].shape[0]
            n_test = f['y_test'].shape[0]
        records.append((self.experiment_type, self.data_name, d['region'],
                        d['window_size'], d['percentile'], n_train, n_test))

    self.logger.info('save file: {}'.format(self.outfile))
    prepare_output_file(self.outfile)
    with open(self.outfile, 'w') as fout:
        fout.write('\t'.join(header))
        fout.write('\n')
        for record in records:
            fout.write('\t'.join(map(str, record)))
            fout.write('\n')
def __call__(self):
    """Render a CT structure with VARNA, coloured by per-position values.

    The first value set returned by ``read_rme(self.value_file)`` supplies
    the colour of each position; positions not present in it get 0.5.  The
    values are passed to ``VARNAcmd`` as a ``;``-separated colour map and the
    drawing is written to ``self.outfile``.  A non-zero exit status of the
    Java process is logged as an error.
    """
    import subprocess
    from formats import read_rme, read_ct

    values = list(read_rme(self.value_file).values())[0]
    title, seq, pairs = read_ct(self.ct_file)
    values_fillna = [values[i] if i in values else 0.5 for i in range(len(seq))]
    colormap = ';'.join(['%.3f' % x for x in values_fillna])

    prepare_output_file(self.outfile)
    cmdline = ['java', '-cp', self.varna_path,
               'fr.orsay.lri.varna.applications.VARNAcmd',
               '-i', self.ct_file,
               '-resolution', '5.0',
               '-colorMapStyle', 'rocknroll',
               '-colorMap', colormap,
               '-o', self.outfile]
    self.logger.info('execute: {}'.format(' '.join(cmdline)))
    returncode = subprocess.call(cmdline)
    if returncode != 0:
        # The exit status used to be discarded, hiding failed renderings.
        self.logger.error('VARNA exited with status {}'.format(returncode))
def auc_lines(self, auc, params):
    """Draw one AUROC line per key of ``params`` and save the figure.

    Args:
        auc: mapping from key to a sequence of AUROC values.
        params: mapping from key to a sequence of parameter dicts that is
            parallel to ``auc[key]``.

    When ``self.xkey`` is ``'window_size'`` the entries of ``auc[key]`` and
    ``params[key]`` are reordered in place by ascending window size and the
    x ticks are labelled with the window sizes.  For any other ``xkey`` the
    values are plotted in their given order.
    """
    import numpy as np

    fig, ax = plt.subplots(figsize=(5, 4))
    # Iterate over a snapshot of the keys because values are reassigned below.
    for keyval in list(params.keys()):
        if self.xkey == 'window_size':
            window_sizes = [int(d['window_size']) for d in params[keyval]]
            sorted_index = np.argsort(window_sizes)
            # List comprehensions give real lists that can be plotted and
            # indexed again later.
            auc[keyval] = [auc[keyval][i] for i in sorted_index]
            params[keyval] = [params[keyval][i] for i in sorted_index]
            ax.set_xticks(np.arange(len(window_sizes)))
            ax.set_xticklabels(sorted(window_sizes))
            ax.set_xlabel('Window Size')
        # Positions come from the data itself, so the plot no longer depends
        # on `window_sizes` being defined by the branch above.
        ax.plot(np.arange(len(auc[keyval])), auc[keyval], lw=1.5, label=keyval)
    ax.set_title(self.title)
    ax.set_ylabel('AUROC')
    ax.set_ylim(0.75, 1)
    ax.legend(loc='lower right')
    plt.tight_layout()

    self.logger.info('save figure: {}'.format(self.outfile))
    prepare_output_file(self.outfile)
    plt.savefig(self.outfile)
def __call__(self):
    """Score predicted structures against true structures.

    When ``self.true_file`` and ``self.pred_file`` are both directories,
    every file name in the prediction directory (extension stripped) is
    scored against ``<true_file>/<name>.ct``.  Otherwise the two paths are
    scored as a single pair and the row is named after the prediction path
    without its extension.  Rows are tab-separated and go to
    ``self.outfile`` or, when that is ``None``, to standard output.
    """
    keys = ['length', 'sensitivity', 'ppv', 'tp_in_true', 'true_pairs',
            'tp_in_pred', 'pred_pairs']
    header = '\t'.join(['name'] + keys) + '\n'

    fout = sys.stdout
    if self.outfile is not None:
        self.logger.info('save file: {}'.format(self.outfile))
        prepare_output_file(self.outfile)
        fout = open(self.outfile, 'w')
    try:
        if os.path.isdir(self.true_file) and os.path.isdir(self.pred_file):
            names = [os.path.splitext(a)[0] for a in os.listdir(self.pred_file)]
            fout.write(header)
            for name in names:
                scores = self.score_ct('{}/{}.ct'.format(self.true_file, name),
                                       '{}/{}.ct'.format(self.pred_file, name))
                fout.write('\t'.join([name] + [str(scores[k]) for k in keys]) + '\n')
        else:
            scores = self.score_ct(self.true_file, self.pred_file)
            name = os.path.splitext(self.pred_file)[0]
            fout.write(header)
            # Use the computed `name`; the row previously referenced an
            # undefined variable (`ct_true`) and raised NameError.
            fout.write('\t'.join([name] + [str(scores[k]) for k in keys]) + '\n')
    finally:
        # Close only a file opened here, never sys.stdout.
        if self.outfile is not None:
            fout.close()
def __call__(self):
    """Convert a tab-separated RT text file into a ``GenomicData`` file.

    The first line is skipped.  The remaining lines are read in pairs: the
    first line of a pair supplies name, length, RPKM (the part before the
    first comma) and the RT-stop counts from column 4 on; the second line
    supplies the base-density counts from column 4 on.  The result is saved
    with the features ``rt_stop`` / ``base_density`` and the meta fields
    ``rpkm`` / ``length``.
    """
    from genomic_data import GenomicData
    import numpy as np

    self.logger.info('read input rt file: ' + self.infile)
    name, length, rpkm = [], [], []
    rt_stop, base_density = [], []
    n_records = 0
    with open(self.infile, 'r') as f:
        f.readline()
        for lineno, line in enumerate(f):
            fields = line.strip().split('\t')
            if lineno % 2 != 0:
                base_density.append(
                    np.asarray(fields[3:], dtype='float').astype('int32'))
                # A record is complete once its second line has been read.
                n_records += 1
                continue
            name.append(fields[0])
            length.append(int(fields[1]))
            rpkm.append(float(fields[2].split(',')[0]))
            rt_stop.append(
                np.asarray(fields[3:], dtype='float').astype('int32'))
    self.logger.info('successfully read %d records' % n_records)

    self.logger.info('create output file: ' + self.outfile)
    prepare_output_file(self.outfile)
    features = {'rt_stop': rt_stop, 'base_density': base_density}
    meta = {'rpkm': np.asarray(rpkm, dtype='float64'),
            'length': np.asarray(length, dtype='int64')}
    GenomicData.from_data(name, features=features, meta=meta).save(self.outfile)
def __call__(self):
    """Write shuffled K-fold train/test indices to an HDF5 file.

    The number of samples is ``self.n_samples`` or, when ``self.data_file``
    is given, the length of the ``y_train`` dataset in that file (which then
    overwrites ``self.n_samples``).  Each fold becomes a group named after
    its zero-based number, containing the datasets ``train`` and ``test``.

    Raises:
        ValueError: if neither ``n_samples`` nor ``data_file`` is set.
    """
    import h5py
    from sklearn.model_selection import KFold
    import numpy as np

    if (self.n_samples is None) and (self.data_file is None):
        raise ValueError('either --n-samples/--data-file should be specified')
    if self.data_file:
        # The message is formatted; concatenating onto the '{}' placeholder
        # left a literal '{}' in the log line.
        self.logger.info('determine number of samples from data file: {}'.format(
            self.data_file))
        with h5py.File(self.data_file, 'r') as fin:
            self.n_samples = fin['y_train'].shape[0]
        self.logger.info('number of training samples: {}'.format(self.n_samples))

    self.logger.info('save file: ' + self.outfile)
    prepare_output_file(self.outfile)
    kfold = KFold(self.n_folds, shuffle=True)
    with h5py.File(self.outfile, 'w') as fout:
        splits = kfold.split(np.arange(self.n_samples))
        for fold, (train_index, test_index) in enumerate(splits):
            g = fout.create_group('%d' % fold)
            g.create_dataset('train', data=train_index)
            g.create_dataset('test', data=test_index)
def __call__(self):
    """Generate a synthetic regression dataset and save a train/test split.

    The samples come from ``sklearn.datasets.make_regression``.  When
    ``self.scale_targets`` is set, the targets are standardised with
    ``StandardScaler`` before splitting.  The split is stored in
    ``self.outfile`` as ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    """
    import h5py
    from sklearn.datasets import make_regression
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    X, y = make_regression(self.n_samples, self.n_features,
                           n_informative=self.n_informative,
                           bias=self.bias,
                           noise=self.noise)
    if self.scale_targets:
        self.logger.info('scale target values using StandardScaler')
        scaler = StandardScaler()
        # The scaler works on 2-D input; go to a column and back to 1-D.
        column = y.reshape(-1, 1)
        y = scaler.fit_transform(column).reshape((-1,))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=self.test_ratio)

    self.logger.info('save file: ' + self.outfile)
    prepare_output_file(self.outfile)
    f = h5py.File(self.outfile, 'w')
    named_arrays = (('X_train', X_train), ('y_train', y_train),
                    ('X_test', X_test), ('y_test', y_test))
    for dataset_name, array in named_arrays:
        f.create_dataset(dataset_name, data=array)
    f.close()
def roc_curve(self, fpr, tpr, params, auc=None):
    """Plot the best ROC curve of every key in ``params``.

    Args:
        fpr: mapping from key to a sequence of false-positive-rate arrays.
        tpr: mapping from key to a sequence of true-positive-rate arrays,
            parallel to ``fpr``.
        params: mapping whose keys select the curves to draw.
        auc: optional mapping from key to a sequence of AUC values parallel
            to ``fpr[key]``.  When omitted, the area under each curve is
            computed from ``fpr`` / ``tpr`` with the trapezoidal rule.

    For each key, the curve with the largest AUC is drawn and its AUC is
    shown in the legend.  The figure is saved to ``self.outfile``.
    """
    import numpy as np

    if auc is None:
        # The function used `auc` without receiving or defining it; derive
        # it from the curves so a three-argument call works.
        integrate = getattr(np, 'trapezoid', None) or np.trapz
        auc = {}
        for keyval in params:
            auc[keyval] = [integrate(y, x)
                           for x, y in zip(fpr[keyval], tpr[keyval])]

    fig, ax = plt.subplots(figsize=(5, 5))
    plt.rcParams['font.size'] = 11
    plt.rcParams['legend.fontsize'] = 11
    ax.plot([0, 1], [0, 1], 'k--')
    for keyval in params.keys():
        best_index = np.argmax(auc[keyval])
        ax.plot(fpr[keyval][best_index], tpr[keyval][best_index],
                label='{} (AUC = {:.3f})'.format(keyval, auc[keyval][best_index]))
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1.1)
    ax.set_xticks(np.linspace(0, 1, 6))
    ax.set_yticks(np.linspace(0, 1, 6))
    ax.legend(loc='lower right')
    ax.set_title(self.title)
    plt.tight_layout()

    self.logger.info('save figure: {}'.format(self.outfile))
    prepare_output_file(self.outfile)
    plt.savefig(self.outfile)
def __call__(self):
    """Plot predicted versus known values per name into a multi-page PDF.

    ``self.infile`` is either one tab-separated table or a directory of such
    tables; directory reading stops once more than ``self.max_plots`` files
    have been loaded.  For each ``name`` group the ``pred`` and ``true``
    columns are drawn against ``position``.  Groups longer than
    ``self.max_length`` rows are cropped to the central ``self.max_length``
    rows.  At most ``self.max_plots`` pages are written.
    """
    import pandas as pd
    import_matplotlib()

    if os.path.isdir(self.infile):
        tables = []
        for n, filename in enumerate(os.listdir(self.infile), 1):
            tables.append(pd.read_table(os.path.join(self.infile, filename)))
            if n > self.max_plots:
                break
        df = pd.concat(tables)
    else:
        df = pd.read_table(self.infile)

    prepare_output_file(self.outfile)
    self.logger.info('save file: {}'.format(self.outfile))
    with PdfPages(self.outfile) as pdf:
        n_plots = 0
        for name, sub_df in df.groupby('name'):
            fig, ax = plt.subplots(figsize=(15, 1.5))
            length = sub_df.shape[0]
            if length > self.max_length:
                # Floor division: `iloc` needs integer positions.
                start = length // 2 - self.max_length // 2
                sub_df = sub_df.iloc[start:(start + self.max_length), :]
            ax.plot(sub_df['position'], sub_df['pred'], 'b-', label='prediction')
            ax.plot(sub_df['position'], sub_df['true'], 'k-', label='known')
            ax.legend(loc='upper right')
            ax.set_ylim(-0.1, 1.1)
            ax.set_title(name)
            plt.tight_layout()
            pdf.savefig(fig)
            plt.close(fig)
            n_plots += 1
            if n_plots >= self.max_plots:
                break
def __call__(self):
    """Compare paired/unpaired profiles of two sets of CT files.

    For every name that has a ``.ct`` file in both ``self.indir1`` and
    ``self.indir2``, the pair arrays are turned into 0/1 indicators (any
    value above 1 becomes 1) and drawn as two stacked bar charts.
    Structures longer than ``self.max_length`` are cropped to the central
    ``self.max_length`` positions.  At most ``self.max_plots`` pages are
    written to the PDF ``self.outfile``.
    """
    import_matplotlib()
    import numpy as np

    names1 = [os.path.splitext(x)[0] for x in os.listdir(self.indir1)]
    # A set gives constant-time membership tests inside the loop.
    names2 = set(os.path.splitext(x)[0] for x in os.listdir(self.indir2))

    prepare_output_file(self.outfile)
    self.logger.info('save file: {}'.format(self.outfile))
    with PdfPages(self.outfile) as pdf:
        n_plots = 0
        for name in names1:
            if name not in names2:
                continue
            name1, seq1, pairs1 = read_ct('{}/{}.ct'.format(self.indir1, name))
            name2, seq2, pairs2 = read_ct('{}/{}.ct'.format(self.indir2, name))
            fig, axes = plt.subplots(nrows=2, figsize=(12, 4), sharex=True)
            length = len(seq1)
            x = np.arange(length)
            pairs1 = np.asarray(pairs1, dtype='int64')
            pairs1[pairs1 > 1] = 1
            pairs2 = np.asarray(pairs2, dtype='int64')
            pairs2[pairs2 > 1] = 1
            if length > self.max_length:
                # Floor division: the value is used as a slice bound.
                start = length // 2 - self.max_length // 2
                stop = start + self.max_length
                pairs1 = pairs1[start:stop]
                pairs2 = pairs2[start:stop]
                x = x[start:stop]
            axes[0].bar(x, pairs1, label=self.group_name1, color='b', edgecolor='w')
            axes[0].set_title('{}({})'.format(name, self.group_name1))
            # The second panel shows group 2 and is labelled accordingly.
            axes[1].bar(x, pairs2, label=self.group_name2, color='b', edgecolor='w')
            axes[1].set_title('{}({})'.format(name, self.group_name2))
            pdf.savefig(fig)
            # Release the figure; open figures otherwise accumulate in memory.
            plt.close(fig)
            n_plots += 1
            if n_plots >= self.max_plots:
                break
def __call__(self):
    """Tabulate sample counts per input directory and region.

    For each directory in ``self.indirs`` and each of the regions ``all``,
    ``3UTR``, ``5UTR``, ``lncRNA`` and ``CDS``, the number of non-NaN values
    of the chosen feature in ``<indir>/<region>.h5`` is recorded together
    with the sizes of ``X_train`` / ``X_test`` in
    ``<indir>/deepfold/r=<region>,p=5,w=100.h5``.  When ``self.feature`` is
    not set, the first feature of the data file is used.  The table is
    written tab-separated to ``self.outfile``.
    """
    import pandas as pd
    import numpy as np
    import h5py

    columns = ('dataset', 'deepfold_dataset', 'region',
               'n_samples_total', 'n_samples_train', 'n_samples_test')
    records = []
    for indir in self.indirs:
        for region in ('all', '3UTR', '5UTR', 'lncRNA', 'CDS'):
            deepfold_dataset = 'r={},p=5,w=100.h5'.format(region)
            data = GenomicData(os.path.join(indir, '{}.h5'.format(region)))
            feature = self.feature if self.feature else data.features.keys()[0]
            values = data.features[feature]
            n_samples_total = len(values) - np.isnan(values).sum()

            f = h5py.File(os.path.join(indir, 'deepfold', deepfold_dataset), 'r')
            n_samples_train = f['X_train'].shape[0]
            n_samples_test = f['X_test'].shape[0]
            f.close()

            records.append((indir, deepfold_dataset, region,
                            n_samples_total, n_samples_train, n_samples_test))

    df = pd.DataFrame.from_records(records, columns=columns)
    self.logger.info('save file: {}'.format(self.outfile))
    prepare_output_file(self.outfile)
    df.to_csv(self.outfile, sep='\t', index=False)
def __call__(self):
    """Train a Keras or scikit-learn model and optionally validate it.

    Steps:

    1. Hyperparameters are read from ``self.hyperparam_file`` or parsed from
       the JSON string ``self.hyperparam``.
    2. ``X_train`` / ``y_train`` are loaded from ``self.train_file``.  When
       ``self.cv_index_file`` is given, fold ``self.cv_fold`` splits them
       into training and validation parts.
    3. Optional preprocessing: flatten ``X`` to two dimensions
       (``self.flatten``) and standardise ``y`` (``self.scale_targets``).
    4. The model comes either from executing ``self.model_script`` inside
       this function (the script must bind ``model``; the locals defined
       before the ``exec`` are visible to it, so they keep their names) or
       from ``get_model(self.model_name, hyperparam)``.
    5. The model is fitted and, when ``self.model_file`` is set, saved: Keras
       models natively with the hyperparameters added as a dataset, other
       models as a zip archive with the members ``model`` and ``hyperparam``.
    6. With a validation part, predictions and the metrics named in
       ``self.metrics`` are written to ``self.valid_metric_file``.

    Raises:
        ValueError: if ``cv_index_file`` is given without ``cv_fold`` or
            without ``valid_metric_file``.

    NOTE: the model script is executed with full privileges; only run
    trusted scripts.  No nested function, lambda or generator expression may
    be added to this function: Python 2 rejects an unqualified ``exec`` in a
    function that contains nested scopes with free variables.

    NOTE(review): ``self.model_name`` is written to the metrics file as is;
    it looks like it may be unset when a model script is used. Confirm.
    """
    import json
    import h5py
    import cPickle
    import zipfile

    if self.hyperparam_file:
        with open(self.hyperparam_file, 'r') as f:
            hyperparam = json.load(f)
    else:
        hyperparam = json.loads(self.hyperparam)

    self.logger.info('load data: {}'.format(self.train_file))
    fin = h5py.File(self.train_file, 'r')
    X_train = fin['X_train'][:]
    y_train = fin['y_train'][:]
    fin.close()

    X_valid = None
    y_valid = None
    if self.cv_index_file is not None:
        if self.cv_fold is None:
            raise ValueError('argument --cv-fold is required if --cv-index-file is specified')
        if self.valid_metric_file is None:
            raise ValueError('argument --valid-metric-file is required if --cv-index-file is specified')
        self.logger.info('load CV index: ' + self.cv_index_file)
        f = h5py.File(self.cv_index_file, 'r')
        train_index = f[str(self.cv_fold)]['train'][:]
        test_index = f[str(self.cv_fold)]['test'][:]
        f.close()
        X_valid = X_train[test_index]
        y_valid = y_train[test_index]
        X_train = X_train[train_index]
        y_train = y_train[train_index]

    if self.flatten:
        X_train = X_train.reshape((X_train.shape[0], -1))
        self.logger.info('flatten the training data to dimension: (%d, %d)' % X_train.shape)
        if X_valid is not None:
            X_valid = X_valid.reshape((X_valid.shape[0], -1))
            # Report the validation shape; X_train.shape was logged here.
            self.logger.info('flatten the validation data to dimension: (%d, %d)' % X_valid.shape)

    if self.scale_targets:
        self.logger.info('scale the target values using StandardScaler')
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        y_train = scaler.fit_transform(y_train.reshape(-1, 1)).reshape((-1,))
        if y_valid is not None:
            # Reuse the statistics fitted on the training targets.
            y_valid = scaler.transform(y_valid.reshape(-1, 1)).reshape((-1,))

    if self.model_script:
        self.logger.info('create model from script: ' + self.model_script)
        if self.model_type == 'keras':
            self.logger.info('use the keras model')
            import_keras()
            with open(self.model_script, 'r') as f:
                # Parenthesised exec statement: the script runs in this
                # function's namespace, exactly like ``exec compile(...)``.
                exec(compile(f.read(), self.model_script, 'exec'))
            from keras.optimizers import SGD
            optimizer = SGD()
            if self.regress:
                loss = 'mean_squared_error'
                metrics = ['mae']
            else:
                loss = 'binary_crossentropy'
                metrics = ['acc']
            model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
            model.summary()
        else:
            with open(self.model_script, 'r') as f:
                exec(compile(f.read(), self.model_script, 'exec'))
    else:
        self.logger.info('create model by name: ' + self.model_name)
        model = get_model(self.model_name, hyperparam)

    self.logger.info('train the model')
    if self.model_type == 'keras':
        model.fit(X_train, y_train, batch_size=100, epochs=20)
    else:
        self.logger.info('model parameters: ' + json.dumps(model.get_params()))
        model.fit(X_train, y_train)

    if self.model_file:
        self.logger.info('save model: {}'.format(self.model_file))
        prepare_output_file(self.model_file)
        if self.model_type == 'keras':
            model.save(self.model_file)
            # Reopen the saved file to attach the hyperparameters.
            f = h5py.File(self.model_file, 'r+')
            f.create_dataset('hyperparam', data=json.dumps(hyperparam))
            f.close()
        else:
            zipf = zipfile.ZipFile(self.model_file, 'w', zipfile.ZIP_DEFLATED)
            zipf.writestr('model', cPickle.dumps(model))
            zipf.writestr('hyperparam', json.dumps(hyperparam))
            zipf.close()

    if X_valid is not None:
        if self.metrics is None:
            if self.regress:
                self.metrics = ['mean_squared_error', 'r2']
            else:
                self.metrics = ['accuracy']
        self.logger.info('validate the model')
        if self.regress:
            y_pred = model.predict(X_valid)
        else:
            y_pred_labels = model.predict(X_valid)
            if self.model_type == 'keras':
                # Turn the Keras output into class labels with a 0.5
                # threshold, as the evaluation code in this file does.
                # NOTE(review): assumes one probability per sample. Confirm.
                y_pred_labels = (y_pred_labels >= 0.5).astype('int32')

        self.logger.info('save the metrics: ' + self.valid_metric_file)
        prepare_output_file(self.valid_metric_file)
        f = h5py.File(self.valid_metric_file, 'w')
        try:
            f.create_dataset('model_name', data=self.model_name)
            # Store the hyperparameters actually used; `self.hyperparam` is
            # the raw JSON string and is not what a hyperparameter file
            # supplies.
            f.create_dataset('hyperparam', data=json.dumps(hyperparam))
            f.create_dataset('y_true', data=y_valid)
            if self.regress:
                f.create_dataset('y_pred', data=y_pred)
            else:
                f.create_dataset('y_pred_labels', data=y_pred_labels)
            g = f.create_group('metrics')
            for metric in self.metrics:
                scorer = get_scorer(metric)
                if self.regress:
                    score = scorer(y_valid, y_pred)
                else:
                    score = scorer(y_valid, y_pred_labels)
                self.logger.info('calculate metric {}: {}'.format(metric, score))
                g.create_dataset(metric, data=score)
            if self.scale_targets:
                g.create_dataset('scale_y_mean', data=scaler.mean_)
                g.create_dataset('scale_y_std', data=scaler.scale_)
        finally:
            f.close()
def __call__(self):
    """Tabulate and plot base fractions as a function of binned feature scores.

    The non-NaN scores of ``self.feature`` are split into ``self.bins`` bins,
    either at equally spaced percentiles (``bin_method == 'percentile'``) or
    at equal-width value intervals (``bin_method == 'value'``). For every
    non-NaN score, the base found ``self.offset`` positions away in the
    sequence of the same name is counted in the bin the score falls into.
    Per-bin counts are normalised to fractions, written as a tab-separated
    table to ``<prefix>.txt`` and plotted to ``<prefix>.pdf``.

    Side effects: sets ``self.feature`` when it is None and the input holds
    exactly one feature; selects the matplotlib 'Agg' backend and modifies
    the global ``plt.rcParams``.

    Raises:
        ValueError: if the feature cannot be chosen unambiguously, if there
            are no non-NaN scores to bin, or if ``self.bin_method`` is
            neither 'percentile' nor 'value'.
    """
    from formats import read_fasta
    from tqdm import tqdm
    import numpy as np
    import pandas as pd
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    self.logger.info('read sequence file: ' + self.sequence_file)
    # presumably read_fasta yields (name, sequence) pairs -- confirm in formats
    sequences = dict(read_fasta(self.sequence_file))
    self.logger.info('read input file: ' + self.infile)
    data = GenomicData(self.infile)

    if self.feature is None:
        # list() keeps this working when keys() returns a view, not a list
        feature_names = list(data.features.keys())
        if len(feature_names) == 1:
            self.feature = feature_names[0]
        elif not feature_names:
            raise ValueError('no features found in the input file')
        else:
            raise ValueError('multiple features found in the input file and the feature is not specified')

    scores_all = data.features[self.feature]
    scores_avail = scores_all[np.logical_not(np.isnan(scores_all))]
    if scores_avail.size == 0:
        # min()/max()/histogram below are undefined on an empty array
        raise ValueError('no available (non-NaN) scores found for feature: %s'%self.feature)

    # percentiles holds the bins + 1 bin edges; bin i is the half-open
    # interval (percentiles[i], percentiles[i + 1]].
    # qs[i] is the cumulative percentage of scores at the upper edge of bin i.
    self.logger.info('use bin method: %s'%self.bin_method)
    if self.bin_method == 'percentile':
        qs = np.arange(1, self.bins + 1, dtype='float')*100.0/self.bins
        percentiles = np.zeros(self.bins + 1, dtype='float')
        # widen the outer edges slightly so the extreme scores fall inside
        percentiles[0] = scores_avail.min() - 1e-6
        for i in range(1, self.bins):
            percentiles[i] = np.percentile(scores_avail, qs[i - 1])
        percentiles[self.bins] = scores_avail.max() + 1e-6
    elif self.bin_method == 'value':
        # Use raw counts rather than density=True: a density only sums to 1
        # after multiplying by the bin width, so its plain cumsum is not a
        # cumulative percentage.
        counts, percentiles = np.histogram(scores_avail, bins=self.bins)
        qs = np.cumsum(counts)*100.0/counts.sum()
        percentiles[0] -= 1e-6
        percentiles[-1] += 1e-6
    else:
        raise ValueError('unknown bin method: %s'%self.bin_method)

    # freqs[i]['A']: frequency of A in bin i
    freqs = [{a: 0 for a in self.alphabet} for _ in range(self.bins)]

    self.logger.info('count base frequencies with offset %d'%self.offset)
    for name in tqdm(data.names):
        scores_ts = data.feature(self.feature, name)
        avail_ind = np.nonzero(np.logical_not(np.isnan(scores_ts)))[0]
        # assumes sequences are byte strings (Python 2 str) -- TODO confirm
        seq_ts = np.frombuffer(sequences[name], dtype='S1')
        # shift score positions to the sequence positions to be counted and
        # drop the ones that the shift pushed off the sequence
        avail_ind = avail_ind + self.offset
        if self.offset > 0:
            avail_ind = avail_ind[avail_ind < len(seq_ts)]
        elif self.offset < 0:
            avail_ind = avail_ind[avail_ind >= 0]
        scores_avail_ts = scores_ts[avail_ind - self.offset]
        seq_avail_ts = seq_ts[avail_ind]
        for i in range(self.bins):
            in_bin = np.logical_and(scores_avail_ts <= percentiles[i + 1],
                                    scores_avail_ts > percentiles[i])
            seq_bin = seq_avail_ts[in_bin]
            for a in self.alphabet:
                freqs[i][a] += np.count_nonzero(seq_bin == a)

    # normalize base frequencies for each percentile
    freq_total = []
    for i in range(self.bins):
        total = sum(freqs[i].values())
        freq_total.append(total)
        for a in self.alphabet:
            if total == 0:
                # empty bin: report a uniform distribution over the alphabet
                freqs[i][a] = 1.0/len(self.alphabet)
            else:
                freqs[i][a] = float(freqs[i][a])/total

    table_file = self.prefix + '.txt'
    self.logger.info('save results to file: ' + table_file)
    prepare_output_file(table_file)
    # NOTE(review): the 'percentile' column is the LOWER edge of bin i while
    # 'q' refers to its upper edge -- confirm whether percentiles[i + 1] was intended.
    records = []
    for i in range(self.bins):
        for a in self.alphabet:
            records.append((i, qs[i], percentiles[i], a, freq_total[i], freqs[i][a]))
    df = pd.DataFrame.from_records(
        records,
        columns=['bin', 'q', 'percentile', 'base', 'total_freq', 'fraction'])
    df.to_csv(table_file, sep='\t', index=False)

    # plot the distribution
    self.logger.info('create plot')
    plt.rcParams['font.family'] = 'Arial'
    plt.rcParams['axes.labelsize'] = 'medium'
    plt.rcParams['xtick.labelsize'] = 'x-small'
    plt.rcParams['ytick.labelsize'] = 'x-small'
    plt.rcParams['axes.titlesize'] = 'medium'
    fig, ax = plt.subplots(figsize=(7, 5))
    x = np.arange(self.bins)
    # tick labels show the upper edge of each bin
    xticklabels = ['%.2f'%a for a in percentiles[1:]]
    for base in self.alphabet:
        sub_df = df[df['base'] == base]
        ax.plot(x, sub_df['fraction'], label=base)
    ax.set_xticks(x)
    ax.set_xticklabels(xticklabels)
    ax.set_ylim(0, 1)
    ax.set_xlabel('Values')
    ax.set_ylabel('Base fraction')
    ax.legend()
    plt.tight_layout()
    plot_file = self.prefix + '.pdf'
    self.logger.info('save plot to file: ' + plot_file)
    plt.savefig(plot_file)
    # release the figure so repeated invocations do not accumulate open figures
    plt.close(fig)