class SelectBestModel(CommandLineTool):
    # Selects the best model from a directory of prediction metric files.
    # NOTE(review): the body of __call__ appears truncated in this revision --
    # it only imports h5py and does no work. A second, complete class named
    # SelectBestModel is defined later in this file and shadows this one;
    # confirm whether this definition should be removed.
    arguments = [Argument('metric_dir', short_opt='-i', type=str, required=True,
                          help='directory containing prediction metric files in HDF5 format with datasets: y_true, y_pred'),
                 Argument('metric', type=str, default='accuracy'),
                 Argument('outfile', short_opt='-o', type=str, required=True)]

    def __call__(self):
        import h5py
class DeepfoldDatasetStatistics(CommandLineTool):
    description = 'Basic statistics (e.g. number of samples) for a deepfold dataset'
    arguments = [Argument('experiment_type', type=str, required=True),
                 Argument('data_name', type=str, required=True),
                 Argument('outfile', short_opt='-o', required=True)]

    def number_of_samples(self):
        """Scan every deepfold HDF5 dataset for this experiment/data name and
        write a TSV table of train/test sample counts to self.outfile."""
        import h5py
        import glob
        name_dict = {'d': 'data_name', 'w': 'window_size', 'p': 'percentile',
                     'm': 'model_name', 'r': 'region'}
        header = ['experiment_type', 'data_name', 'region', 'window_size',
                  'percentile', 'n_train', 'n_test']
        records = []
        for filename in glob.glob('data/{}/{}/deepfold/*.h5'.format(self.experiment_type, self.data_name)):
            d = parse_filename(filename, name_dict)
            # Fix: close each HDF5 file; the original leaked one open handle
            # per dataset scanned.
            f = h5py.File(filename, 'r')
            try:
                records.append((self.experiment_type, self.data_name,
                                d['region'], d['window_size'], d['percentile'],
                                f['y_train'].shape[0], f['y_test'].shape[0]))
            finally:
                f.close()
        self.logger.info('save file: {}'.format(self.outfile))
        prepare_output_file(self.outfile)
        with open(self.outfile, 'w') as f:
            f.write('\t'.join(header))
            f.write('\n')
            for record in records:
                f.write('\t'.join(map(str, record)))
                f.write('\n')

    def __call__(self):
        self.number_of_samples()
class DrawRnaStructureWithValues(CommandLineTool):
    description = 'Draw RNA secondary structure with color mapping from a list of values'
    arguments = [Argument('varna_path', type=str, required=True, help='path to VARNAvX-Y.jar'),
                 Argument('ct_file', type=str, required=True),
                 Argument('value_file', type=str, required=True),
                 Argument('outfile', type=str, required=True),
                 Argument('value_format', type=str, default='rme', choices=('rme',))]

    def __call__(self):
        """Render the CT structure via the VARNA command-line applet, coloring
        each base by its value from the RME file."""
        import subprocess
        from formats import read_rme, read_ct
        # Use the value track of the first sequence in the RME file.
        values = read_rme(self.value_file).values()[0]
        title, seq, pairs = read_ct(self.ct_file)
        # Positions without a value fall back to a neutral 0.5.
        values_fillna = [values[i] if i in values else 0.5
                         for i in range(len(seq))]
        colormap = ';'.join('%.3f' % v for v in values_fillna)
        prepare_output_file(self.outfile)
        cmdline = ['java', '-cp', self.varna_path,
                   'fr.orsay.lri.varna.applications.VARNAcmd',
                   '-i', self.ct_file,
                   '-resolution', '5.0',
                   '-colorMapStyle', 'rocknroll',
                   '-colorMap', colormap,
                   '-o', self.outfile]
        self.logger.info('execute: {}'.format(' '.join(cmdline)))
        p = subprocess.Popen(cmdline)
        p.wait()
class LogRegWeights(CommandLineTool):
    description = 'Plot weights of a Logistic regression model as lines'
    arguments = [Argument('infile', short_opt='-i', type=str, required=True, help='model file in HDF5 format'),
                 Argument('outfile', short_opt='-o', type=str, required=True),
                 Argument('alphabet', type=str, default='ATCG')]

    def __call__(self):
        """Plot one weight curve per alphabet letter across window positions."""
        import h5py
        import numpy as np
        import_matplotlib()
        model_weights = h5py.File(self.infile, 'r')['/model_weights/dense_1/dense_1/kernel:0'][:]
        n_letters = len(self.alphabet)
        # Weights are stored as a flat (window * alphabet) vector; use explicit
        # floor division so the shape stays integral.
        window_size = model_weights.shape[0] // n_letters
        offset = (window_size + 1) // 2
        # Fix: the reshape hard-coded 4 columns; use the actual alphabet size
        # so alphabets other than 'ATCG' work.
        model_weights = model_weights.reshape((window_size, n_letters))
        fig, ax = plt.subplots(figsize=(20, 4))
        for i in range(n_letters):
            ax.plot(np.arange(window_size), model_weights[:, i], '-', label=self.alphabet[i])
        ax.set_xticks(np.arange(window_size, step=5))
        ax.set_xlim(0, window_size)
        ax.set_ylabel('Weight')
        ax.set_xlabel('Position')
        ax.set_xticks(np.arange(window_size), minor=True)
        # Center the position labels around the window midpoint.
        ax.set_xticks(np.arange(offset % 5, window_size + 1, step=5))
        ax.set_xticklabels(np.arange(offset % 5, window_size + 1, step=5) - window_size // 2)
        ax.set_ylim(-2, 2)
        ax.legend()
        plt.tight_layout()
        self.logger.info('save figure: {}'.format(self.outfile))
        prepare_output_file(self.outfile)
        plt.savefig(self.outfile, dpi=150, bbox_inches='tight')
class CreateCvIndex(CommandLineTool):
    arguments = [Argument('n_samples', short_opt='-n', type=int, help='number of samples'),
                 Argument('data_file', short_opt='-i', type=str, help='determine the number of samples from input file (from y_train)'),
                 Argument('n_folds', short_opt='-k', type=int, required=True, help='number of folds'),
                 Argument('outfile', short_opt='-o', type=str, required=True,
                          help='output file in HDF5 format with dataset names: /<fold>/train, /<fold>/test')]

    def __call__(self):
        """Write shuffled K-fold train/test index splits to an HDF5 file.

        The sample count comes either from --n-samples or from the y_train
        dataset of --data-file; one of the two must be given.
        """
        import h5py
        from sklearn.model_selection import KFold
        import numpy as np
        if (self.n_samples is None) and (self.data_file is None):
            raise ValueError('either --n-samples/--data-file should be specified')
        if self.data_file:
            # Fix: the original concatenated the path onto the literal '{}'
            # instead of formatting it, logging a garbled message.
            self.logger.info('determine number of samples from data file: {}'.format(self.data_file))
            fin = h5py.File(self.data_file, 'r')
            self.n_samples = fin['y_train'].shape[0]
            fin.close()
        self.logger.info('number of training samples: {}'.format(self.n_samples))
        self.logger.info('save file: ' + self.outfile)
        prepare_output_file(self.outfile)
        fout = h5py.File(self.outfile, 'w')
        kfold = KFold(self.n_folds, shuffle=True)
        fold = 0
        for train_index, test_index in kfold.split(np.arange(self.n_samples)):
            g = fout.create_group('%d' % fold)
            g.create_dataset('train', data=train_index)
            g.create_dataset('test', data=test_index)
            fold += 1
        fout.close()
class BumhmmToGenomicData(CommandLineTool):
    description = 'Combine BUMHMM output files into one GenomicData file'
    arguments = [
        Argument('posterior_file', short_opt='-i', type=str, required=True, help='BUMHMM output file'),
        Argument('bumhmm_input_file', type=str, required=True),
        Argument('outfile', short_opt='-o', type=str, required=True, help='output file')
    ]

    def __call__(self):
        """Slice the flat BUMHMM posterior track back into per-sequence arrays
        (delimited by start/end offsets) and save them as GenomicData."""
        from genomic_data import GenomicData
        import numpy as np
        import h5py
        self.logger.info('read BUMHMM file: ' + self.posterior_file)
        posteriors = h5py.File(self.posterior_file, 'r')['posteriors'][:]
        self.logger.info('read BUMHMM input file: ' + self.bumhmm_input_file)
        f = h5py.File(self.bumhmm_input_file, 'r')
        start = f['start'][:]
        end = f['end'][:]
        name = f['name'][:]
        f.close()
        # One posterior slice per named sequence.
        values = [posteriors[start[i]:end[i]] for i in range(len(name))]
        self.logger.info('save file: ' + self.outfile)
        prepare_output_file(self.outfile)
        GenomicData.from_data(name, features={
            'bumhmm': values
        }).save(self.outfile)
class HyperParamGrid(CommandLineTool):
    description = 'Expand a grid specification of hyperparameters into a list in JSON format'
    arguments = [Argument('infile', short_opt='-i', default='-', help='parameter ranges in JSON format'),
                 Argument('sample', type=int, help='randomly sample up to a certain number of parameters')]

    def __call__(self):
        """Emit one JSON object per line for every point on the parameter grid,
        optionally down-sampled to --sample random points."""
        import json
        import itertools
        import random
        fin = open_file_or_stdin(self.infile)
        grid_spec = json.load(fin)
        fin.close()
        fout = sys.stdout
        param_names = list(grid_spec.keys())
        param_values = [grid_spec[name] for name in param_names]
        # Cartesian product over every listed value range.
        param_list = [dict(zip(param_names, combo))
                      for combo in itertools.product(*param_values)]
        if self.sample:
            param_list = random.sample(param_list, min(len(param_list), self.sample))
        for param in param_list:
            fout.write(json.dumps(param) + '\n')
class MetricTableCross(CommandLineTool):
    arguments = [Argument('experiment_type', type=str, required=True),
                 Argument('data_name', type=str, required=True),
                 Argument('region', type=str, required=True),
                 Argument('outfile', type=str, short_opt='-o', required=True, help='output file'),
                 Argument('metrics', type=list, default='accuracy,roc_auc,sensitivity,ppv')]

    def __call__(self):
        """Collect cross-dataset prediction metrics into one TSV table."""
        import glob
        import h5py
        records = []
        name_dict = {'d': 'data_name', 'w': 'window_size', 'p': 'percentile',
                     'm': 'model_name', 'r': 'region'}
        header = ['experiment_type', 'data_name', 'model_experiment_type',
                  'model_data_name', 'percentile', 'window_size', 'model_name'] + self.metrics
        for dirname in os.listdir('metrics/cross/{},{}'.format(self.experiment_type, self.data_name)):
            # Each sub-directory encodes the model's source dataset.
            model_experiment_type, model_data_name = dirname.split(',')
            pattern = 'metrics/cross/{},{}/{}/*.h5'.format(self.experiment_type, self.data_name, dirname)
            for filename in glob.glob(pattern):
                d = parse_filename(filename, name_dict)
                if d['region'] != self.region:
                    continue
                f = h5py.File(filename, 'r')
                record = [self.experiment_type, self.data_name,
                          model_experiment_type, model_data_name,
                          d['percentile'], d['window_size'], d['model_name']]
                grp = f['metrics']
                for metric in self.metrics:
                    record.append(str(grp[metric][()]))
                records.append(record)
        self.logger.info('save file: {}'.format(self.outfile))
        prepare_output_file(self.outfile)
        with open(self.outfile, 'w') as f:
            f.write('\t'.join(header) + '\n')
            for record in records:
                f.write('\t'.join(record) + '\n')
class CompareBumhmmWithCoverageAndDropoff(CommandLineTool):
    arguments = [Argument('posterior_file', short_opt='-i', type=str, required=True, help='BUMHMM output file'),
                 Argument('bumhmm_input_file', type=str, required=True),
                 Argument('outfile', short_opt='-o', type=str, required=True, help='output file')]

    def __call__(self):
        """Plot BUMHMM posteriors alongside per-replicate dropoff rate and
        coverage for 10 randomly chosen sequences, one PDF page each."""
        import h5py
        import numpy as np
        import_matplotlib()
        self.logger.info('read BUMHMM file: ' + self.posterior_file)
        posteriors = h5py.File(self.posterior_file, 'r')['posteriors'][:]
        self.logger.info('read BUMHMM input file: ' + self.bumhmm_input_file)
        f = h5py.File(self.bumhmm_input_file, 'r')
        start = f['start'][:]
        end = f['end'][:]
        name = f['name'][:]
        coverage = f['coverage'][:]
        sample_name = f['sample_name'][:]
        replicate = f['replicate'][:]
        dropoff_count = f['dropoff_count'][:]
        f.close()
        self.logger.info('open pdf file: ' + self.outfile)
        prepare_output_file(self.outfile)
        plt.rcParams['axes.labelsize'] = 'small'
        plt.rcParams['xtick.labelsize'] = 'x-small'
        plt.rcParams['ytick.labelsize'] = 'x-small'
        plt.rcParams['axes.titlesize'] = 'small'
        with PdfPages(self.outfile) as pdf:
            for i in np.random.choice(len(name), size=10):
                self.logger.info('plot %s' % name[i])
                # Show at most 300 positions, skipping the first 50.
                length = min(300, end[i] - start[i] - 50)
                index = np.arange(start[i] + 50, start[i] + length + 50)
                x = np.arange(50, 50 + length)
                fig, axes = plt.subplots(1 + 2 * coverage.shape[0],
                                         figsize=(15, 2 + 2 * coverage.shape[0]),
                                         sharex=True)
                posteriors_fillna = posteriors[index]
                # Grey bars mark missing posteriors, drawn slightly below zero.
                color = np.asarray(['#999999' if np.isnan(a) else '#0000ff' for a in posteriors_fillna])
                posteriors_fillna[np.isnan(posteriors_fillna)] = -0.05
                axes[0].bar(x, posteriors_fillna, color=color, edgecolor='none')
                axes[0].set_xlim(0, length)
                axes[0].set_ylim(-0.1, 1)
                axes[0].set_title('BUMHMM posteriors (%s)' % name[i])
                for j in range(coverage.shape[0]):
                    axes[2*j + 1].bar(x, dropoff_count[j, index].astype('float') / coverage[j, index], edgecolor='none')
                    axes[2*j + 1].set_title('Dropoff rate of %s (%s)' % (replicate[j], sample_name[j]))
                    axes[2*j + 1].set_ylim(0, 0.5)
                    axes[2*j + 2].bar(x, coverage[j, index], edgecolor='none')
                    axes[2*j + 2].set_title('Coverage of %s (%s)' % (replicate[j], sample_name[j]))
                    # Fix: removed a duplicated set_ylim(0, 0.5) on the dropoff
                    # axis (copy-paste leftover; it repeated the call above).
                plt.tight_layout()
                pdf.savefig(fig)
                plt.clf()
                plt.close(fig)
class IcshapeRawRtToGenomicData(CommandLineTool):
    description = 'Convert libX.rt files to GenomicData format'
    arguments = [
        Argument('infile', short_opt='-i', type=str, required=True, help='input icSHAPE rt file'),
        Argument('outfile', short_opt='-o', type=str, required=True, help='output GenomicData file')
    ]

    def __call__(self):
        """Parse an icSHAPE .rt file (two lines per transcript: RT stops then
        base density) into a GenomicData file with rpkm/length metadata."""
        from genomic_data import GenomicData
        import numpy as np
        self.logger.info('read input rt file: ' + self.infile)
        name, length, rpkm = [], [], []
        rt_stop, base_density = [], []
        with open(self.infile, 'r') as f:
            f.readline()  # skip the header line
            n_records = 0
            for lineno, line in enumerate(f):
                fields = line.strip().split('\t')
                if (lineno % 2) == 0:
                    # Even lines: transcript metadata plus RT-stop counts.
                    name.append(fields[0])
                    length.append(int(fields[1]))
                    rpkm.append(float(fields[2].split(',')[0]))
                    rt_stop.append(
                        np.asarray(fields[3:], dtype='float').astype('int32'))
                else:
                    # Odd lines: base-density track; a record is complete here.
                    base_density.append(
                        np.asarray(fields[3:], dtype='float').astype('int32'))
                    n_records += 1
        self.logger.info('successfully read %d records' % n_records)
        self.logger.info('create output file: ' + self.outfile)
        prepare_output_file(self.outfile)
        GenomicData.from_data(name, features={
            'rt_stop': rt_stop,
            'base_density': base_density
        }, meta={
            'rpkm': np.asarray(rpkm, dtype='float64'),
            'length': np.asarray(length, dtype='int64')
        }).save(self.outfile)
class MakeRegression(CommandLineTool):
    arguments = [Argument('n_samples', type=int, short_opt='-n', default=100),
                 Argument('n_features', type=int, short_opt='-p', default=100),
                 Argument('n_informative', type=int, default=10),
                 Argument('noise', type=float, default=0.0),
                 Argument('bias', type=float, default=0.0),
                 Argument('scale_targets', action='store_true', help='scale the target values to zero-mean and unit variance'),
                 Argument('test_ratio', type=float, default=0.3),
                 Argument('outfile', short_opt='-o', type=str, required=True)]

    def __call__(self):
        """Generate a synthetic regression dataset and save its train/test
        split to an HDF5 file."""
        import h5py
        from sklearn.datasets import make_regression
        from sklearn.model_selection import train_test_split
        from sklearn.preprocessing import StandardScaler
        X, y = make_regression(self.n_samples, self.n_features,
                               n_informative=self.n_informative,
                               bias=self.bias, noise=self.noise)
        if self.scale_targets:
            self.logger.info('scale target values using StandardScaler')
            # StandardScaler expects a 2D array; flatten again afterwards.
            y = StandardScaler().fit_transform(y.reshape(-1, 1)).reshape((-1,))
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_ratio)
        self.logger.info('save file: ' + self.outfile)
        prepare_output_file(self.outfile)
        f = h5py.File(self.outfile, 'w')
        for key, value in (('X_train', X_train), ('y_train', y_train),
                           ('X_test', X_test), ('y_test', y_test)):
            f.create_dataset(key, data=value)
        f.close()
class IcshapeRtToWav(CommandLineTool):
    arguments = [
        Argument('rt_file', short_opt='-i', type=str, required=True),
        Argument('outdir', short_opt='-o', type=str, required=True)
    ]

    def __call__(self):
        """Sonify the RT-stop tracks of the 10 highest-RPKM transcripts as
        stereo WAV files (one file per transcript)."""
        import wave
        import numpy as np
        rt = icshape_raw_rt_to_genomic_data(self.rt_file, self.logger)

        def modulate(values, wav_file, sample_rate=44100, n_channels=2,
                     max_amp=32767, x_freq=20):
            # Hold each value for one x_freq period, amplitude-modulating an
            # 880 Hz tone with log(value + 1).
            upsample_rate = float(sample_rate) / x_freq
            T = float(len(values)) / x_freq
            n_samples = int(sample_rate * T)
            x = np.empty(n_samples, dtype='float32')
            for i in range(len(values)):
                x[int(upsample_rate * i):int(upsample_rate * (i + 1))] = np.log(values[i] + 1)
            t = np.linspace(0, T, n_samples)
            y = max_amp * np.sin(2 * 880 * np.pi * t)
            y *= x
            # Fix: normalize by the peak |y| (np.abs(y).max()); the original
            # used np.abs(y.max()), which can overflow int16 whenever the
            # waveform minimum exceeds its maximum in magnitude.
            y *= float(max_amp) / np.abs(y).max()
            data = np.empty(n_samples * n_channels, dtype='int16')
            channel_index = np.arange(0, n_samples * n_channels, n_channels)
            data[channel_index] = y
            data[channel_index + 1] = data[channel_index]  # duplicate to 2nd channel
            wav = wave.open(wav_file, 'wb')
            wav.setnchannels(n_channels)
            wav.setsampwidth(2)
            wav.setframerate(sample_rate)
            wav.setnframes(n_samples)
            wav.setcomptype('NONE', 'no compression')
            # Fix: np.getbuffer was removed from NumPy; tobytes() is the
            # supported equivalent.
            wav.writeframes(data.tobytes())
            wav.close()

        # Top 10 transcripts by RPKM.
        for i in np.argsort(-rt.meta['rpkm'])[:10]:
            name = rt.names[i]
            values = rt.feature('rt_stop', name)
            wav_file = os.path.join(self.outdir, '%s.wav' % name)
            self.logger.info('create wav file: ' + wav_file)
            prepare_output_file(wav_file)
            modulate(values, wav_file)
class CheckStatus(CommandLineTool):
    arguments = [
        Argument('task_name', short_opt='-t', type=str, required=True,
                 choices=Task.get_all_task_names(), help='task name'),
        Argument('summary', action='store_true'),
        Argument('monitor', action='store_true'),
        Argument('interval', type=float, default=2, help='watch interval'),
        Argument('status_dir', type=str, default='status')
    ]

    def check_status(self, task):
        """Report which of the task's parameter sets have a status marker file,
        either as a running summary line or one line per parameter set."""
        if self.summary:
            finished = 0
            for params in task.paramlist:
                unique_name = task.tool.unique_name.format(**params)
                marker = os.path.join(self.status_dir, task.__class__.__name__, unique_name)
                if os.path.exists(marker):
                    finished += 1
                # Overwrite the current terminal line with the running count.
                sys.stdout.write('\r\bFinished: {}/{} ({:.2f}%)'.format(
                    finished, len(task.paramlist),
                    100 * float(finished) / len(task.paramlist)))
                sys.stdout.flush()
        else:
            for params in task.paramlist:
                unique_name = task.tool.unique_name.format(**params)
                marker = os.path.join(self.status_dir, task.__class__.__name__, unique_name)
                if os.path.exists(marker):
                    status = '\x1B[32mYES\x1B[0m'  # green YES
                else:
                    status = '\x1B[31mNO\x1B[0m'   # red NO
                print('{}({})\t{}'.format(task.__class__.__name__, unique_name, status))

    def __call__(self):
        task = Task.get_task(self.task_name)()
        if self.monitor:
            import time
            # Poll forever at the requested interval.
            while True:
                self.check_status(task)
                time.sleep(self.interval)
        else:
            self.check_status(task)
            sys.stdout.write('\n')
class CleanStatus(CommandLineTool):
    arguments = [
        Argument('task_name', short_opt='-t', type=str, required=True,
                 choices=Task.get_all_task_names(), help='task name'),
        Argument('status_dir', type=str, default='status')
    ]

    def __call__(self):
        """Print (not execute) the rm commands that would delete every status
        marker file of the given task."""
        task = Task.get_task(self.task_name)()
        for params in task.paramlist:
            unique_name = task.tool.unique_name.format(**params)
            marker = os.path.join(self.status_dir, task.__class__.__name__, unique_name)
            print('rm ' + marker)
class GenomicDataDistribution(CommandLineTool):
    description = 'Plot distribution of values in GenomicData files'
    arguments = [Argument('infile', short_opt='-i', type=str, required=True, help='input file in GenomicData format'),
                 Argument('feature', type=str, required=True, help='the feature to plot'),
                 Argument('outfile', short_opt='-o', type=str, required=True),
                 Argument('xlabel', type=str, default='Values'),
                 Argument('ylabel', type=str, default='Counts'),
                 Argument('weight', type=float, default=1),
                 Argument('title', type=str)]

    def __call__(self):
        """Draw a weighted 20-bin histogram of one feature's non-NaN values."""
        import_matplotlib()
        import numpy as np
        data = GenomicData(self.infile, feature_names=[self.feature])
        fig, ax = plt.subplots(figsize=(4, 4))
        feature_values = data.features[self.feature]
        # Drop NaN entries before binning.
        valid_data = feature_values[np.logical_not(np.isnan(feature_values))]
        ax.hist(valid_data, weights=np.full(len(valid_data), self.weight),
                bins=20, color='#808080')
        ax.set_xlabel(self.xlabel)
        ax.set_ylabel(self.ylabel)
        plt.tight_layout()
        if self.title:
            ax.set_title(self.title)
        self.logger.info('save figure: {}'.format(self.outfile))
        prepare_output_file(self.outfile)
        plt.savefig(self.outfile)
class DeepfoldSampleSizeTable(CommandLineTool):
    description = 'Generate a table of the number of samples in the Deepfold dataset and original dataset'
    arguments = [Argument('indirs', short_opt='-i', required=True, nargs='+'),
                 Argument('feature', type=str),
                 Argument('deepfold_dataset', type=str, required=False, default='r={},p=5,w=100.h5'),
                 Argument('outfile', short_opt='-o', required=True),
                 Argument('percentile', type=float, default=5),
                 Argument('region', type=str),
                 Argument('sequence_file', type=str, help='FASTA file')]

    def __call__(self):
        """Tabulate total/train/test sample counts per input directory and region."""
        import pandas as pd
        import numpy as np
        import h5py
        regions = ['all', '3UTR', '5UTR', 'lncRNA', 'CDS']
        records = []
        for indir in self.indirs:
            for region in regions:
                # Fix: honor the --deepfold-dataset filename template; the
                # original ignored it and always used the hard-coded default.
                deepfold_dataset = self.deepfold_dataset.format(region)
                data = GenomicData(os.path.join(indir, '{}.h5'.format(region)))
                if not self.feature:
                    # Default to the first available feature.
                    feature = list(data.features.keys())[0]
                else:
                    feature = self.feature
                n_samples_total = len(data.features[feature]) - np.isnan(data.features[feature]).sum()
                f = h5py.File(os.path.join(indir, 'deepfold', deepfold_dataset), 'r')
                n_samples_train = f['X_train'].shape[0]
                n_samples_test = f['X_test'].shape[0]
                f.close()
                records.append((indir, deepfold_dataset, region,
                                n_samples_total, n_samples_train, n_samples_test))
        df = pd.DataFrame.from_records(records, columns=('dataset', 'deepfold_dataset', 'region',
                                                         'n_samples_total', 'n_samples_train', 'n_samples_test'))
        self.logger.info('save file: {}'.format(self.outfile))
        prepare_output_file(self.outfile)
        df.to_csv(self.outfile, sep='\t', index=False)
class CompareDeepfold1DMetrics(CommandLineTool):
    description = 'Compare 1D structure prediction metrics grouped by dataset'
    arguments = [Argument('infile', short_opt='-i', type=str, help='metric table generated by MetricTable command'),
                 Argument('outfile', short_opt='-o', type=str)]

    def __call__(self):
        """For each model, keep the row with the highest roc_auc and save the
        resulting summary table as TSV."""
        import_matplotlib()
        import pandas as pd
        import numpy as np
        df = pd.read_table(self.infile)
        summary = []
        for name, subdf in df.groupby(['model_name']):
            i = subdf['roc_auc'].idxmax()
            # Fix: DataFrame.ix is deprecated and removed in modern pandas;
            # .loc is the label-based equivalent for an idxmax label.
            summary.append(subdf.loc[i, :])
        summary = pd.concat(summary, axis=1).T
        self.logger.info('save summary table: {}'.format(self.outfile))
        prepare_output_file(self.outfile)
        summary.to_csv(self.outfile, index=False, sep='\t')
class CompareDeepfold1DWithKnown(CommandLineTool):
    arguments = [Argument('infile', type=str, required=True, help='output file of PredictDeepfold1D or a directory'),
                 Argument('outfile', type=str, required=True, help='output plot file'),
                 Argument('max_plots', type=int, default=30, help='maximum number of sequences to plot'),
                 Argument('max_length', type=int, default=200)]

    def __call__(self):
        """Plot predicted vs. known 1D structure profiles, one PDF page per
        transcript, up to --max-plots pages."""
        import pandas as pd
        import_matplotlib()
        if os.path.isdir(self.infile):
            # Concatenate per-file tables from the directory.
            frames = []
            for count, filename in enumerate(os.listdir(self.infile), start=1):
                frames.append(pd.read_table('{}/{}'.format(self.infile, filename)))
                if count > self.max_plots:
                    break
            df = pd.concat(frames)
        else:
            df = pd.read_table(self.infile)
        prepare_output_file(self.outfile)
        self.logger.info('save file: {}'.format(self.outfile))
        with PdfPages(self.outfile) as pdf:
            n_plots = 0
            for name, sub_df in df.groupby('name'):
                fig, ax = plt.subplots(figsize=(15, 1.5))
                length = sub_df.shape[0]
                if length > self.max_length:
                    # Center the plotted window on the middle of the transcript.
                    start = length // 2 - self.max_length // 2
                    sub_df = sub_df.iloc[start:(start + self.max_length), :]
                ax.plot(sub_df['position'], sub_df['pred'], 'b-', label='prediction')
                ax.plot(sub_df['position'], sub_df['true'], 'k-', label='known')
                ax.legend(loc='upper right')
                ax.set_ylim(-0.1, 1.1)
                ax.set_title(name)
                plt.tight_layout()
                pdf.savefig(fig)
                plt.close(fig)
                n_plots += 1
                if n_plots >= self.max_plots:
                    break
class SampleSizeTable(CommandLineTool):
    description = 'Generate a table of the number of samples in the Deepfold dataset and original dataset'
    arguments = [Argument('indir', type=str, required=True, help='directory containing GenomicData files with path <dataset>/<region>.h5'),
                 Argument('feature', type=str),
                 Argument('outfile', short_opt='-o', required=True),
                 Argument('percentile', type=float),
                 Argument('region', type=str)]

    def __call__(self):
        """Print a TSV table of valid (non-NaN) sample counts per dataset and
        region, optionally restricted to the extreme percentile tails."""
        import pandas as pd
        import h5py
        import numpy as np
        regions = ['all', '3UTR', '5UTR', 'lncRNA', 'CDS', 'ncRNA', 'miRNA']
        if self.region:
            regions = [self.region]
        records = []
        for dataset in os.listdir(self.indir):
            for region in regions:
                data_file = os.path.join(self.indir, dataset, '%s.h5' % region)
                if not os.path.isfile(data_file):
                    self.logger.warn('GenomicData file {} does not exist'.format(data_file))
                    continue
                data = GenomicData(data_file)
                # Fall back to the first available feature if none was given.
                feature = self.feature if self.feature else data.features.keys()[0]
                values = data.features[feature]
                if self.percentile is not None:
                    # Count only values inside the two extreme percentile tails.
                    data_valid = values[np.logical_not(np.isnan(values))]
                    cutoff1 = np.percentile(data_valid, self.percentile)
                    cutoff2 = np.percentile(data_valid, 100 - self.percentile)
                    n_samples = np.logical_or(data_valid <= cutoff1, data_valid >= cutoff2).sum()
                else:
                    n_samples = len(values) - np.isnan(values).sum()
                records.append((dataset, region, n_samples))
        df = pd.DataFrame.from_records(records, columns=['dataset', 'region', 'n_samples'])
        print(df.to_csv(sep='\t', index=False))
class SelectBestModel(CommandLineTool):
    arguments = [Argument('cvdir', short_opt='-i', type=str, required=True,
                          help='directory containing prediction metric files in HDF5 format with datasets: y_true, y_pred'),
                 Argument('metric', type=str, default='accuracy'),
                 Argument('prefix', short_opt='-o', type=str, required=True)]

    def __call__(self):
        """Aggregate per-fold validation scores over all hyperparameter sets
        and write the best set to <prefix>.best_hyperparam.json.

        NOTE(review): the score is always computed with accuracy_score even
        when --metric names another metric; only the output column label
        changes. Confirm whether that is intended.
        """
        import h5py
        import pandas as pd
        from sklearn.metrics import accuracy_score
        hyperparams = []
        with open(os.path.join(self.cvdir, 'hyperparam.txt'), 'r') as f:
            for line in f:
                hyperparams.append(line.strip())
        f = h5py.File(os.path.join(self.cvdir, 'cv_index.h5'), 'r')
        n_folds = len(f.keys())
        f.close()
        scores = []
        for param_index, hyperparam in enumerate(hyperparams):
            for cv_fold in range(n_folds):
                metric_file = '%s/%d/%d.valid_metrics' % (self.cvdir, param_index, cv_fold)
                # Fix: close each metric file; the original leaked one open
                # handle per (hyperparameter, fold) pair.
                f = h5py.File(metric_file, 'r')
                try:
                    score = accuracy_score(f['y_pred_labels'][:], f['y_true'][:])
                finally:
                    f.close()
                scores.append((param_index, cv_fold, score, hyperparam))
        scores = pd.DataFrame.from_records(scores, columns=('param_index', 'cv_index', self.metric, 'hyperparam'))
        scores.to_csv(self.prefix + '.detail.txt', sep='\t', index=False, doublequote=False, quotechar="'")
        # Mean score per hyperparameter set across folds.
        scores_by_hyperparam = scores.groupby(['param_index'], as_index=False)[self.metric].mean()
        scores_by_hyperparam['hyperparam'] = hyperparams
        scores_by_hyperparam.to_csv(self.prefix + '.mean_by_hyperparam.txt', sep='\t', index=False, doublequote=False, quotechar="'")
        best_param_index = scores_by_hyperparam['param_index'][scores_by_hyperparam[self.metric].idxmax()]
        with open(self.prefix + '.best_hyperparam.json', 'w') as f:
            f.write(hyperparams[best_param_index])
        with open(self.prefix + '.best_param_index.txt', 'w') as f:
            f.write(str(best_param_index))
class PrintCommands(CommandLineTool):
    arguments = [
        Argument('task_name', short_opt='-t', type=str, required=True,
                 choices=Task.get_all_task_names(), help='task name'),
        Argument(
            'param_file', type=str,
            help=
            'parameter file in JSON format which is decoded as a list of dict')
    ]

    def __call__(self):
        """Print the generated command line for every parameter set of a task."""
        task = Task.get_task(self.task_name)()
        if self.param_file is not None:
            import json
            # Override the task's default parameter list with the user's file.
            with open(self.param_file, 'r') as f:
                task.paramlist = json.load(f)
        for cmd in task.generate_commands():
            print(cmd)
class MutateMap(CommandLineTool):
    description = ''
    arguments = [Argument('model_file', type=str, required=True),
                 Argument('n_sequences', type=int, default=100)]

    def __call__(self):
        """Score the effect of single-base mutations of random sequences on a
        trained keras model's predictions.

        NOTE(review): this method computes y_wt/y_mut but never stores or
        reports them, and the pre-allocated X is unused -- the implementation
        looks unfinished; confirm the intended output.
        """
        import numpy as np
        self.logger.info('load model: {}'.format(self.model_file))
        model = keras.models.load_model(self.model_file)
        window_size = K.int_shape(model.input)[1]
        n = K.int_shape(model.input)[2]
        # Fix: the original referenced the bare name `n_sequences`, which is
        # undefined in this scope (NameError); the argument lives on self.
        X = np.empty((self.n_sequences * (n - 1) * window_size, window_size), dtype='float32')
        X_wt = np.random.randint(n, size=(self.n_sequences, window_size))
        # NOTE(review): np.repeat with this count flattens X_wt; the mutation
        # bookkeeping below looks suspect -- verify against the intended design.
        X_mut = np.repeat(X_wt, self.n_sequences * (window_size * (n - 1)))
        for i in range(self.n_sequences):
            for j in range(n):
                X_mut[j::n] += (j + 1)
        X_mut = np.mod(X_mut, n)
        X_wt = onehot_encode(X_wt, range(n))
        X_mut = onehot_encode(X_mut, range(n))
        y_wt = model.predict(X_wt)
        y_mut = model.predict(X_mut)
class SelectModel(CommandLineTool):
    arguments = [Argument('metric_file', type=str, required=True, help='A table generated by MetricTable'),
                 Argument('outfile', short_opt='-o', type=str, required=False, help='parameters of best models in JSON format'),
                 Argument('num', short_opt='-n', type=int, default=1,
                          help='maximum number of models to select for each dataset. 0 for all models.'),
                 Argument('metric', type=str, default='accuracy')]

    def __call__(self):
        """Select the top --num models from a metric table and optionally dump
        their parameter rows as JSON."""
        import pandas as pd
        import json
        metric_table = pd.read_table(self.metric_file)
        if self.num <= 0:
            self.num = metric_table.shape[0]
        else:
            self.num = min(self.num, metric_table.shape[0])
        # Fix: rank by the requested --metric; the original always sorted by
        # the hard-coded 'accuracy' column regardless of the option.
        selected = metric_table.sort_values(self.metric, ascending=False).iloc[:self.num, :]
        if self.outfile is not None:
            paramlist = [row.to_dict() for index, row in selected.iterrows()]
            prepare_output_file(self.outfile)
            with open(self.outfile, 'w') as f:
                json.dump(paramlist, f, indent=2)
        print(selected)
class CreateCvIndex(CommandLineTool):
    arguments = [Argument('n_samples', short_opt='-n', type=int, required=True, help='number of samples'),
                 Argument('n_folds', short_opt='-k', type=int, required=True, help='number of folds'),
                 Argument('outfile', short_opt='-o', type=str, required=True,
                          help='output file in HDF5 format with dataset names: /<fold>/train, /<fold>/test')]

    def __call__(self):
        """Write shuffled K-fold train/test index splits to an HDF5 file,
        one group per fold."""
        import h5py
        from sklearn.model_selection import KFold
        import numpy as np
        self.logger.info('save file: ' + self.outfile)
        prepare_output_file(self.outfile)
        fout = h5py.File(self.outfile, 'w')
        splitter = KFold(self.n_folds, shuffle=True)
        sample_index = np.arange(self.n_samples)
        for fold, (train_index, test_index) in enumerate(splitter.split(sample_index)):
            grp = fout.create_group('%d' % fold)
            grp.create_dataset('train', data=train_index)
            grp.create_dataset('test', data=test_index)
        fout.close()
class ScoreStructure(CommandLineTool):
    description = 'Compare predicted and known structures in CT format and calculate the metrics'
    arguments = [Argument('true_file', type=str, required=True, help='CT format'),
                 Argument('pred_file', type=str, required=True, help='CT format'),
                 Argument('outfile', short_opt='-o', type=str),
                 Argument('exact', action='store_true', help='count exact pairs')]

    def score_ct(self, true_ct_file, pred_ct_file):
        """Score one predicted CT file against the known CT file and return
        the metrics dict (plus the predicted sequence length)."""
        ct_true = read_ct(true_ct_file)
        ct_pred = read_ct(pred_ct_file)
        scores = score_structure(make_pair_list(ct_true[2]), make_pair_list(ct_pred[2]), exact=self.exact)
        scores['length'] = len(ct_pred[1])
        return scores

    def __call__(self):
        """Write a metric table for either a pair of CT files or two whole
        directories of CT files matched by basename."""
        keys = ['length', 'sensitivity', 'ppv', 'tp_in_true', 'true_pairs', 'tp_in_pred', 'pred_pairs']
        fout = sys.stdout
        if self.outfile is not None:
            self.logger.info('save file: {}'.format(self.outfile))
            prepare_output_file(self.outfile)
            fout = open(self.outfile, 'w')
        if os.path.isdir(self.true_file) and os.path.isdir(self.pred_file):
            names = [os.path.splitext(a)[0] for a in os.listdir(self.pred_file)]
            fout.write('\t'.join(['name'] + keys) + '\n')
            for name in names:
                scores = self.score_ct('{}/{}.ct'.format(self.true_file, name),
                                       '{}/{}.ct'.format(self.pred_file, name))
                fout.write('\t'.join([name] + [str(scores[k]) for k in keys]) + '\n')
        else:
            scores = self.score_ct(self.true_file, self.pred_file)
            name = os.path.splitext(self.pred_file)[0]
            fout.write('\t'.join(['name'] + keys) + '\n')
            # Fix: the original wrote ct_true[0], a name not defined in this
            # scope (NameError at runtime); use the name derived from the
            # prediction file just above.
            fout.write('\t'.join([name] + [str(scores[k]) for k in keys]) + '\n')
        if self.outfile is not None:
            fout.close()
class CompareCtFiles(CommandLineTool):
    arguments = [Argument('indir1', type=str, required=True, help='directory containing ct files'),
                 Argument('indir2', type=str, required=True, help='directory containing ct files'),
                 Argument('group_name1', type=str, default='Group 1'),
                 Argument('group_name2', type=str, default='Group 2'),
                 Argument('outfile', type=str, required=True, help='output plot file'),
                 Argument('max_plots', type=int, default=20, help='maximum number of sequences to plot'),
                 Argument('random', action='store_true', help='randomly select structures to plot'),
                 Argument('max_length', type=int, default=200)]

    def __call__(self):
        """Plot paired-base indicator tracks from two CT directories side by
        side, one PDF page per structure present in both directories."""
        import_matplotlib()
        import numpy as np
        names1 = [os.path.splitext(x)[0] for x in os.listdir(self.indir1)]
        names2 = [os.path.splitext(x)[0] for x in os.listdir(self.indir2)]
        # NOTE(review): the --random flag is parsed but never used here.
        prepare_output_file(self.outfile)
        self.logger.info('save file: {}'.format(self.outfile))
        with PdfPages(self.outfile) as pdf:
            n_plots = 0
            for name in names1:
                if name not in names2:
                    continue
                name1, seq1, pairs1 = read_ct('{}/{}.ct'.format(self.indir1, name))
                name2, seq2, pairs2 = read_ct('{}/{}.ct'.format(self.indir2, name))
                fig, axes = plt.subplots(nrows=2, figsize=(12, 4), sharex=True)
                length = len(seq1)
                x = np.arange(length)
                # Collapse pairing partner indices to a 0/1 paired indicator.
                pairs1 = np.asarray(pairs1, dtype='int64')
                pairs1[pairs1 > 1] = 1
                pairs2 = np.asarray(pairs2, dtype='int64')
                pairs2[pairs2 > 1] = 1
                if length > self.max_length:
                    # Center the plotted window on the middle of the sequence.
                    start = length // 2 - self.max_length // 2
                    pairs1 = pairs1[start:(start + self.max_length)]
                    pairs2 = pairs2[start:(start + self.max_length)]
                    x = x[start:(start + self.max_length)]
                axes[0].bar(x, pairs1, label=self.group_name1, color='b', edgecolor='w')
                axes[0].set_title('{}({})'.format(name, self.group_name1))
                # Fix: the second panel's bar label used group_name1
                # (copy-paste); it shows indir2's data, so use group_name2.
                axes[1].bar(x, pairs2, label=self.group_name2, color='b', edgecolor='w')
                axes[1].set_title('{}({})'.format(name, self.group_name2))
                pdf.savefig(fig)
                n_plots += 1
                if n_plots >= self.max_plots:
                    break
class CheckReady(CommandLineTool):
    # Reports, for every parameter set of a task, whether its prerequisites
    # are ready (via task.ready) using colored YES/NO terminal output.
    arguments = [
        Argument('task_name', short_opt='-t', type=str, required=True,
                 choices=Task.get_all_task_names(), help='task name')
    ]

    def __call__(self):
        # NOTE(review): unlike CheckStatus/CleanStatus/PrintCommands, the task
        # is NOT instantiated here (no trailing "()"); paramlist, unique_name
        # and __name__ are read from the class object directly, and
        # unique_name is accessed without the ".tool" indirection the sibling
        # tools use. Confirm whether the missing instantiation is intentional.
        task = Task.get_task(self.task_name)
        for params in task.paramlist:
            unique_name = task.unique_name.format(**params)
            # ready() returns (is_ready, missing_prerequisites).
            ready, missing = task.ready(params)
            if ready:
                # green "YES"
                print '{}({})\t\x1B[32mYES\x1B[0m'.format(
                    task.__name__, unique_name)
            else:
                # red "NO" followed by the missing prerequisites
                print '{}({})\t\x1B[31mNO\x1B[0m\t{}'.format(
                    task.__name__, unique_name, missing)
class CompareStructurePredictionMetrics(CommandLineTool):
    description = 'Plot the difference between score metrics of two methods'
    arguments = [Argument('infile1', type=str, required=True, help='output file of ScoreStructure'),
        Argument('infile2', type=str, required=True, help='output file of ScoreStructure'),
        Argument('outfile', short_opt='-o', type=str, help='output plot file'),
        Argument('metric', type=str, default='sensitivity'),
        Argument('title', type=str, default='Distribution of {metric} {compare_method}'),
        Argument('compare_method', type=str, default='difference')]

    def __call__(self):
        """Histogram the per-structure difference of one metric between two score tables."""
        import pandas as pd
        import_matplotlib()

        scores1 = pd.read_table(self.infile1)
        scores2 = pd.read_table(self.infile2)
        # default pd.merge suffixes the shared metric columns with _x/_y
        joined = pd.merge(scores1, scores2, on='name')
        diff = joined[self.metric + '_x'] - joined[self.metric + '_y']

        fig, ax = plt.subplots(figsize=(10, 6))
        ax.hist(diff, bins=50)
        ax.set_title(self.title.format(metric=self.metric,
                                       compare_method=self.compare_method,
                                       mean=diff.mean(),
                                       median=diff.median()))
        ax.set_xlim(-1, 1)
        self.logger.info('save plot file: {}'.format(self.outfile))
        prepare_output_file(self.outfile)
        plt.savefig(self.outfile)
class TestEstimator(CommandLineTool):
    """Evaluate a trained classifier on a held-out test set and save metrics to HDF5.

    Loads X_test/y_test, applies a saved sklearn (zipped pickle) or keras model,
    then writes y_true, continuous scores (y_pred), hard labels (y_pred_labels)
    and one scalar dataset per requested metric.
    """
    arguments = [
        Argument('test_file', short_opt='-i', type=str, required=True,
                 help='the dataset in HDF5 format, required datasets: X_test, y_test'),
        Argument('model_file', type=str, required=True,
                 help='file path for saving the model (in Python pickle format)'),
        Argument('model_type', type=str, default='sklearn', choices=('sklearn', 'keras')),
        # NOTE(review): default is a plain string with type=list — assumes Argument
        # coerces it to a one-element list; confirm against Argument's implementation
        Argument('metrics', type=list, default='accuracy'),
        Argument('metric_file', short_opt='-o', type=str, required=True),
        Argument('flatten', action='store_true',
                 help='flatten the input dataset before applying the model'),
    ]

    def __call__(self):
        import h5py
        import zipfile

        self.logger.info('load test dataset: ' + self.test_file)
        with h5py.File(self.test_file, 'r') as fin:
            X_test = fin['X_test'][:]
            y_test = fin['y_test'][:]

        if self.flatten:
            self.logger.info('flatten the test data to dimension: (%d, %d)'%X_test.shape[:2])
            X_test = X_test.reshape((X_test.shape[0], -1))

        # load the model
        if self.model_type == 'keras':
            import_keras()
            self.logger.info('load keras model: ' + self.model_file)
            model = keras.models.load_model(self.model_file)
        elif self.model_type == 'sklearn':
            import cPickle
            self.logger.info('load sklearn model: ' + self.model_file)
            zipf = zipfile.ZipFile(self.model_file, 'r')
            f = zipf.open('model', 'r')
            model = cPickle.load(f)
            zipf.close()

        # predict hard labels plus a continuous score for ranking metrics
        if self.model_type == 'sklearn':
            y_pred_labels = model.predict(X_test)
            model_name = model.__class__.__name__
            if model_name == 'SVC':
                y_pred = model.decision_function(X_test)
            elif model_name == 'RandomForestClassifier':
                y_pred = model.predict_proba(X_test)[:, 1]
            else:
                raise ValueError('unknown sklearn model ' + model_name)
        elif self.model_type == 'keras':
            y_pred = model.predict(X_test)
            y_pred_labels = (y_pred >= 0.5).astype('int32')

        self.logger.info('save metrics: ' + self.metric_file)
        prepare_output_file(self.metric_file)
        fout = h5py.File(self.metric_file, 'w')
        fout.create_dataset('y_true', data=y_test)
        fout.create_dataset('y_pred', data=y_pred)
        fout.create_dataset('y_pred_labels', data=y_pred_labels)
        grp = fout.create_group('metrics')
        for metric in self.metrics:
            scorer = get_scorer(metric)
            # ROC-AUC needs continuous scores; the others take hard labels
            if metric == 'roc_auc':
                score = scorer(y_test, y_pred)
            else:
                score = scorer(y_test, y_pred_labels)
            self.logger.info('calculate metric {}: {}'.format(metric, score))
            grp.create_dataset(metric, data=score)
        fout.close()
class TrainEstimator(CommandLineTool): arguments = [Argument('train_file', short_opt='-i', type=str, required=True, help='the dataset in HDF5 format, required datasets: X_train, y_train'), Argument('cv_index_file', type=str, help='CV index created by CreateCvIndex'), Argument('cv_fold', type=int), Argument('model_name', type=str, required=True, help='name of the classifier'), Argument('model_type', type=str, default='sklearn', choices=('sklearn', 'keras')), Argument('model_file', short_opt='-o', type=str, help='file path for saving the model (in Python pickle format)'), Argument('model_script', type=str, help='load a model specification from a Python script (should define the model variable)'), Argument('valid_metric_file', type=str), Argument('flatten', action='store_true', help='flatten the input dataset before applying the model'), Argument('regress', action='store_true', help='train a regression model'), Argument('metrics', type=list), Argument('scale_targets', action='store_true', help='scale the targets values by mean and variance'), Argument('hyperparam', type=str, default='{}', help='model hyper-parameter in JSON format'), Argument('hyperparam_file', type=str, help='model hyper-parameter in JSON format from file')] def __call__(self): import json import h5py import cPickle import zipfile if self.hyperparam_file: with open(self.hyperparam_file, 'r') as f: hyperparam = json.load(f) else: hyperparam = json.loads(self.hyperparam) self.logger.info('load data: {}'.format(self.train_file)) fin = h5py.File(self.train_file, 'r') X_train = fin['X_train'][:] y_train = fin['y_train'][:] fin.close() X_valid = None y_valid = None if self.cv_index_file is not None: if self.cv_fold is None: raise ValueError('argument --cv-fold is required if --cv-index-file is specified') if self.valid_metric_file is None: raise ValueError('argument --valid-metric-file is required if --cv-index-file is specified') self.logger.info('load CV index: ' + self.cv_index_file) f = 
h5py.File(self.cv_index_file, 'r') train_index = f[str(self.cv_fold)]['train'][:] test_index = f[str(self.cv_fold)]['test'][:] f.close() X_valid = X_train[test_index] y_valid = y_train[test_index] X_train = X_train[train_index] y_train = y_train[train_index] if self.flatten: X_train = X_train.reshape((X_train.shape[0], -1)) self.logger.info('flatten the training data to dimension: (%d, %d)'%X_train.shape) if X_valid is not None: X_valid = X_valid.reshape((X_valid.shape[0], -1)) self.logger.info('flatten the validation data to dimension: (%d, %d)'%X_train.shape) if self.scale_targets: self.logger.info('scale the target values using StandardScaler') from sklearn.preprocessing import StandardScaler scaler = StandardScaler() y_train = scaler.fit_transform(y_train.reshape(-1, 1)).reshape((-1,)) if y_valid is not None: y_valid = scaler.transform(y_valid.reshape(-1, 1)).reshape((-1,)) if self.model_script: self.logger.info('create model from script: ' + self.model_script) if self.model_type == 'keras': self.logger.info('use the keras model') #with open(os.path.join(os.path.dirname(__file__), 'import_keras.py'), 'r') as f: # exec compile(f.read(), 'import_keras.py', 'exec') import_keras() with open(self.model_script, 'r') as f: exec compile(f.read(), self.model_script, 'exec') from keras.optimizers import SGD optimizer = SGD() if self.regress: loss = 'mean_squared_error' metrics = ['mae'] else: loss = 'binary_crossentropy' metrics = ['acc'] model.compile(optimizer=optimizer, loss=loss, metrics=metrics) model.summary() else: with open(self.model_script, 'r') as f: exec compile(f.read(), self.model_script, 'exec') else: self.logger.info('create model by name: ' + self.model_name) model = get_model(self.model_name, hyperparam) self.logger.info('train the model') if self.model_type == 'keras': model.fit(X_train, y_train, batch_size=100, epochs=20) else: self.logger.info('model parameters: ' + json.dumps(model.get_params())) model.fit(X_train, y_train) if self.model_file: 
self.logger.info('save model: {}'.format(self.model_file)) prepare_output_file(self.model_file) if self.model_type == 'keras': model.save(self.model_file) f = h5py.File(self.model_file, 'r+') f.create_dataset('hyperparam', data=json.dumps(hyperparam)) f.close() else: zipf = zipfile.ZipFile(self.model_file, 'w', zipfile.ZIP_DEFLATED) zipf.writestr('model', cPickle.dumps(model)) zipf.writestr('hyperparam', json.dumps(hyperparam)) zipf.close() if X_valid is not None: if self.metrics is None: if self.regress: self.metrics = ['mean_squared_error', 'r2'] else: self.metrics = ['accuracy'] self.logger.info('validate the model') if self.regress: y_pred = model.predict(X_valid) else: y_pred_labels = model.predict(X_valid) self.logger.info('save the metrics: ' + self.valid_metric_file) prepare_output_file(self.valid_metric_file) f = h5py.File(self.valid_metric_file, 'w') f.create_dataset('model_name', data=self.model_name) f.create_dataset('hyperparam', data=json.dumps(self.hyperparam)) f.create_dataset('y_true', data=y_valid) if self.regress: f.create_dataset('y_pred', data=y_pred) else: f.create_dataset('y_pred_labels', data=y_pred_labels) g = f.create_group('metrics') for metric in self.metrics: scorer = get_scorer(metric) if self.regress: score = scorer(y_valid, y_pred) else: score = scorer(y_valid, y_pred_labels) self.logger.info('calculate metric {}: {}'.format(metric, score)) g.create_dataset(metric, data=score) if self.scale_targets: g.create_dataset('scale_y_mean', data=scaler.mean_) g.create_dataset('scale_y_std', data=scaler.scale_) f.close()