Example #1
class SelectBestModel(CommandLineTool):
    arguments = [Argument('metric_dir', short_opt='-i', type=str, required=True,
            help='directory containing prediction metric files in HDF5 format with datasets: y_true, y_pred'),
        Argument('metric', type=str, default='accuracy'),
        Argument('outfile', short_opt='-o', type=str, required=True)]
    def __call__(self):
        import h5py
Example #2
class DeepfoldDatasetStatistics(CommandLineTool):
    description = 'Basic statistics (e.g. number of samples) for a deepfold dataset'
    arguments = [Argument('experiment_type', type=str, required=True),
        Argument('data_name', type=str, required=True),
        Argument('outfile', short_opt='-o', required=True)]
    def number_of_samples(self):
        import h5py
        import glob
        name_dict = {'d': 'data_name', 'w': 'window_size', 'p': 'percentile', 'm': 'model_name', 'r': 'region'}
        header = ['experiment_type', 'data_name', 'region', 'window_size', 'percentile', 'n_train', 'n_test']
        records = []
        for filename in glob.glob('data/{}/{}/deepfold/*.h5'.format(self.experiment_type, self.data_name)):
            d = parse_filename(filename, name_dict)
            f = h5py.File(filename, 'r')
            records.append((self.experiment_type, self.data_name, d['region'], d['window_size'], d['percentile'],
                f['y_train'].shape[0], f['y_test'].shape[0]))
        self.logger.info('save file: {}'.format(self.outfile))
        prepare_output_file(self.outfile)
        with open(self.outfile, 'w') as f:
            f.write('\t'.join(header))
            f.write('\n')
            for record in records:
                f.write('\t'.join(map(str, record)))
                f.write('\n')

    def __call__(self):
        self.number_of_samples()
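The parse_filename helper used here is not shown in this listing. A minimal sketch, assuming it maps the short keys of names like 'r=all,p=5,w=100.h5' (the pattern visible in DeepfoldSampleSizeTable below) to long names via name_dict:

import os

def parse_filename(filename, name_dict):
    # e.g. 'r=all,p=5,w=100.h5' -> {'region': 'all', 'percentile': '5', 'window_size': '100'}
    base = os.path.splitext(os.path.basename(filename))[0]
    d = {}
    for field in base.split(','):
        key, _, value = field.partition('=')
        d[name_dict.get(key, key)] = value
    return d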
Example #3
class DrawRnaStructureWithValues(CommandLineTool):
    description = 'Draw RNA secondary structure with color mapping from a list of values'
    arguments = [Argument('varna_path', type=str, required=True, help='path to VARNAvX-Y.jar'),
        Argument('ct_file', type=str, required=True),
        Argument('value_file', type=str, required=True),
        Argument('outfile', type=str, required=True),
        Argument('value_format', type=str, default='rme', choices=('rme',))]
    def __call__(self):
        import subprocess
        from formats import read_rme, read_ct
        values = read_rme(self.value_file).values()[0]
        title, seq, pairs = read_ct(self.ct_file)
        values_fillna = []
        for i in range(len(seq)):
            if i in values:
                values_fillna.append(values[i])
            else:
                values_fillna.append(0.5)
        colormap = ';'.join(map(lambda x: '%.3f'%x, values_fillna))
        prepare_output_file(self.outfile)
        cmdline = ['java', '-cp', self.varna_path,
            'fr.orsay.lri.varna.applications.VARNAcmd',
            '-i', self.ct_file, '-resolution', '5.0',
            '-colorMapStyle', 'rocknroll',
            '-colorMap', colormap, '-o', self.outfile]
        self.logger.info('execute: {}'.format(' '.join(cmdline)))
        p = subprocess.Popen(cmdline)
        p.wait()
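Assuming the framework dispatches tools by class name through a runner script (the script and file names below are hypothetical; the long-option spelling follows the error messages elsewhere in this listing, which refer to arguments like cv_fold as --cv-fold), an invocation might look like:

python tools.py DrawRnaStructureWithValues --varna-path VARNAv3-93.jar \
    --ct-file structure.ct --value-file values.rme --outfile structure.png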
Example #4
class LogRegWeights(CommandLineTool):
    description = 'Plot weights of a Logistic regression model as lines'
    arguments = [Argument('infile', short_opt='-i', type=str, required=True,
            help='model file in HDF5 format'),
        Argument('outfile', short_opt='-o', type=str, required=True),
        Argument('alphabet', type=str, default='ATCG')]

    def __call__(self):
        import h5py
        import numpy as np
        import_matplotlib()

        model_weights = h5py.File(self.infile, 'r')['/model_weights/dense_1/dense_1/kernel:0'][:]
        window_size = model_weights.shape[0]/len(self.alphabet)
        offset = (window_size + 1)/2
        model_weights = model_weights.reshape((window_size, len(self.alphabet)))
        fig, ax = plt.subplots(figsize=(20, 4))
        for i in range(len(self.alphabet)):
            ax.plot(np.arange(window_size), model_weights[:, i], '-', label=self.alphabet[i])
        ax.set_xlim(0, window_size)
        ax.set_ylabel('Weight')
        ax.set_xlabel('Position')
        ax.set_xticks(np.arange(window_size), minor=True)
        ax.set_xticks(np.arange(offset%5, window_size + 1, step=5))
        ax.set_xticklabels(np.arange(offset%5, window_size + 1, step=5) - window_size/2)
        ax.set_ylim(-2, 2)
        ax.legend()
        plt.tight_layout()
        self.logger.info('save figure: {}'.format(self.outfile))
        prepare_output_file(self.outfile)
        plt.savefig(self.outfile, dpi=150, bbox_inches='tight')
Example #5
class CreateCvIndex(CommandLineTool):
    arguments = [Argument('n_samples', short_opt='-n', type=int,
            help='number of samples'),
        Argument('data_file', short_opt='-i', type=str,
            help='determine the number of samples from input file (from y_train)'),
        Argument('n_folds', short_opt='-k', type=int, required=True,
            help='number of folds'),
        Argument('outfile', short_opt='-o', type=str, required=True,
            help='output file in HDF5 format with dataset names: /<fold>/train, /<fold>/test')]
    def __call__(self):
        import h5py
        from sklearn.model_selection import KFold
        import numpy as np
        if (self.n_samples is None) and (self.data_file is None):
            raise ValueError('either --n-samples or --data-file must be specified')
        if self.data_file:
            self.logger.info('determine number of samples from data file: {}'.format(self.data_file))
            fin = h5py.File(self.data_file, 'r')
            self.n_samples = fin['y_train'].shape[0]
            fin.close()
            self.logger.info('number of training samples: {}'.format(self.n_samples))
        self.logger.info('save file: ' + self.outfile)
        prepare_output_file(self.outfile)
        fout = h5py.File(self.outfile, 'w')
        kfold = KFold(self.n_folds, shuffle=True)
        fold = 0
        for train_index, test_index in kfold.split(np.arange(self.n_samples)):
            g = fout.create_group('%d'%fold)
            g.create_dataset('train', data=train_index)
            g.create_dataset('test', data=test_index)
            fold += 1
        fout.close()
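A minimal sketch of consuming the index file written above, following the /<fold>/train and /<fold>/test layout stated in the --outfile help (the file name is hypothetical). Note that KFold is constructed with shuffle=True and no random_state, so each run of CreateCvIndex yields a different split; pass a fixed random_state for reproducibility.

import h5py

f = h5py.File('cv_index.h5', 'r')
for fold in sorted(f.keys(), key=int):
    train_index = f[fold]['train'][:]
    test_index = f[fold]['test'][:]
    # slice X_train/y_train with these arrays for one CV round
f.close()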
Example #6
class BumhmmToGenomicData(CommandLineTool):
    description = 'Combine BUMHMM output files into one GenomicData file'
    arguments = [
        Argument('posterior_file',
                 short_opt='-i',
                 type=str,
                 required=True,
                 help='BUMHMM output file'),
        Argument('bumhmm_input_file', type=str, required=True),
        Argument('outfile',
                 short_opt='-o',
                 type=str,
                 required=True,
                 help='output file')
    ]

    def __call__(self):
        from genomic_data import GenomicData
        import numpy as np
        import h5py

        self.logger.info('read BUMHMM file: ' + self.posterior_file)
        posteriors = h5py.File(self.posterior_file, 'r')['posteriors'][:]
        self.logger.info('read BUMHMM input file: ' + self.bumhmm_input_file)
        f = h5py.File(self.bumhmm_input_file, 'r')
        start = f['start'][:]
        end = f['end'][:]
        name = f['name'][:]
        f.close()
        values = map(lambda i: posteriors[start[i]:end[i]], range(len(name)))
        self.logger.info('save file: ' + self.outfile)
        prepare_output_file(self.outfile)
        GenomicData.from_data(name, features={
            'bumhmm': values
        }).save(self.outfile)
Example #7
class HyperParamGrid(CommandLineTool):
    description = 'Expand a grid specification of hyperparameters into a list in JSON format'
    arguments = [Argument('infile', short_opt='-i', default='-',
            help='parameter ranges in JSON format'),
        Argument('sample', type=int,
            help='randomly sample up to a certain number of parameters')]
    def __call__(self):
        import json
        import itertools
        import random
        fin = open_file_or_stdin(self.infile)
        grid_spec = json.load(fin)
        fin.close()

        fout = sys.stdout
        param_names = []
        param_values = []
        for name, value in grid_spec.iteritems():
            param_names.append(name)
            param_values.append(value)
        param_list = []
        for param in itertools.product(*param_values):
            param_list.append(dict(zip(param_names, param)))
        if self.sample:
            param_list = random.sample(param_list, min(len(param_list), self.sample))
        for param in param_list:
            fout.write(json.dumps(param) + '\n')
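For example, a two-parameter grid spec expands to the full cross product, one JSON object per output line (key order may vary, since Python 2 dict iteration order is arbitrary):

# input grid spec:
{"C": [0.1, 1.0], "kernel": ["linear", "rbf"]}
# output:
{"C": 0.1, "kernel": "linear"}
{"C": 0.1, "kernel": "rbf"}
{"C": 1.0, "kernel": "linear"}
{"C": 1.0, "kernel": "rbf"}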
Example #8
class MetricTableCross(CommandLineTool):
    arguments = [Argument('experiment_type', type=str, required=True),
        Argument('data_name', type=str, required=True),
        Argument('region', type=str, required=True),
        Argument('outfile', type=str, short_opt='-o', required=True, help='output file'),
        Argument('metrics', type=list, default='accuracy,roc_auc,sensitivity,ppv')]
    def __call__(self):
        import glob
        import h5py
        records = []
        name_dict = {'d': 'data_name', 'w': 'window_size', 'p': 'percentile', 'm': 'model_name', 'r': 'region'}
        header = ['experiment_type', 'data_name', 'model_experiment_type', 'model_data_name',
            'percentile', 'window_size', 'model_name'] + self.metrics
        for dirname in os.listdir('metrics/cross/{},{}'.format(self.experiment_type, self.data_name)):
            model_experiment_type, model_data_name = dirname.split(',')
            for filename in glob.glob('metrics/cross/{},{}/{}/*.h5'.format(self.experiment_type, self.data_name, dirname)):
                d = parse_filename(filename, name_dict)
                if d['region'] != self.region:
                    continue
                f = h5py.File(filename, 'r')
                record = [self.experiment_type, self.data_name,
                    model_experiment_type, model_data_name,
                    d['percentile'], d['window_size'], d['model_name']]
                grp = f['metrics']
                for metric in self.metrics:
                    record.append(str(grp[metric][()]))
                records.append(record)
        self.logger.info('save file: {}'.format(self.outfile))
        prepare_output_file(self.outfile)
        with open(self.outfile, 'w') as f:
            f.write('\t'.join(header) + '\n')
            for record in records:
                f.write('\t'.join(record) + '\n')
Example #9
class CompareBumhmmWithCoverageAndDropoff(CommandLineTool):
    arguments = [Argument('posterior_file', short_opt='-i', type=str, required=True,
            help='BUMHMM output file'),
        Argument('bumhmm_input_file', type=str, required=True),
        Argument('outfile', short_opt='-o', type=str, required=True,
            help='output file')]
    def __call__(self):
        import h5py
        import numpy as np
        import_matplotlib()

        self.logger.info('read BUMHMM file: ' + self.posterior_file)
        posteriors = h5py.File(self.posterior_file, 'r')['posteriors'][:]
        self.logger.info('read BUMHMM input file: ' + self.bumhmm_input_file)
        f = h5py.File(self.bumhmm_input_file, 'r')
        start = f['start'][:]
        end = f['end'][:]
        name = f['name'][:]
        coverage = f['coverage'][:]
        sample_name = f['sample_name'][:]
        replicate = f['replicate'][:]
        dropoff_count = f['dropoff_count'][:]
        f.close()

        self.logger.info('open pdf file: ' + self.outfile)
        prepare_output_file(self.outfile)
        plt.rcParams['axes.labelsize'] = 'small'
        plt.rcParams['xtick.labelsize'] = 'x-small'
        plt.rcParams['ytick.labelsize'] = 'x-small'
        plt.rcParams['axes.titlesize'] = 'small'

        with PdfPages(self.outfile) as pdf:
            for i in np.random.choice(len(name), size=10):
                self.logger.info('plot %s'%name[i])
                length = min(300, end[i] - start[i] - 50)
                index = np.arange(start[i] + 50, start[i] + length + 50)
                x = np.arange(50, 50 + length)
                fig, axes = plt.subplots(1 + 2*coverage.shape[0], figsize=(15, 2 + 2*coverage.shape[0]), sharex=True)

                posteriors_fillna = posteriors[index]
                color = np.asarray(['#999999' if np.isnan(a) else '#0000ff' for a in posteriors_fillna])
                posteriors_fillna[np.isnan(posteriors_fillna)] = -0.05

                axes[0].bar(x, posteriors_fillna, color=color, edgecolor='none')
                axes[0].set_xlim(0, length)
                axes[0].set_ylim(-0.1, 1)
                axes[0].set_title('BUMHMM posteriors (%s)'%name[i])
                for j in range(coverage.shape[0]):
                    axes[2*j + 1].bar(x, dropoff_count[j, index].astype('float')/coverage[j, index], edgecolor='none')
                    axes[2*j + 1].set_title('Dropoff rate of %s (%s)'%(replicate[j], sample_name[j]))
                    axes[2*j + 1].set_ylim(0, 0.5)

                    axes[2*j + 2].bar(x, coverage[j, index], edgecolor='none')
                    axes[2*j + 2].set_title('Coverage of %s (%s)'%(replicate[j], sample_name[j]))
                plt.tight_layout()
                pdf.savefig(fig)
                plt.clf()
                plt.close(fig)
Example #10
class IcshapeRawRtToGenomicData(CommandLineTool):
    description = 'Convert libX.rt files to GenomicData format'
    arguments = [
        Argument('infile',
                 short_opt='-i',
                 type=str,
                 required=True,
                 help='input icSHAPE rt file'),
        Argument('outfile',
                 short_opt='-o',
                 type=str,
                 required=True,
                 help='output GenomicData file')
    ]

    def __call__(self):
        from genomic_data import GenomicData
        import numpy as np

        self.logger.info('read input rt file: ' + self.infile)
        name = []
        length = []
        rpkm = []
        rt_stop = []
        base_density = []
        with open(self.infile, 'r') as f:
            f.readline()
            n_records = 0
            for lineno, line in enumerate(f):
                c = line.strip().split('\t')
                if (lineno % 2) == 0:
                    name.append(c[0])
                    length.append(int(c[1]))
                    rpkm.append(float(c[2].split(',')[0]))
                    rt_stop.append(
                        np.asarray(c[3:], dtype='float').astype('int32'))
                else:
                    base_density.append(
                        np.asarray(c[3:], dtype='float').astype('int32'))
                n_records += 1
        self.logger.info('successfully read %d records' % n_records)

        self.logger.info('create output file: ' + self.outfile)
        prepare_output_file(self.outfile)
        GenomicData.from_data(name,
                              features={
                                  'rt_stop': rt_stop,
                                  'base_density': base_density
                              },
                              meta={
                                  'rpkm': np.asarray(rpkm, dtype='float64'),
                                  'length': np.asarray(length, dtype='int64')
                              }).save(self.outfile)
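The file layout assumed by this parser (inferred from the code above, not from an icSHAPE specification): one header line, then two tab-separated lines per transcript, the first carrying RT stop counts and the second base densities:

# <name>  <length>  <rpkm,...>  <rt_stop for positions 1..N>
# <name>  <length>  <rpkm,...>  <base_density for positions 1..N>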
Example #11
class MakeRegression(CommandLineTool):
    arguments = [Argument('n_samples', type=int, short_opt='-n', default=100),
        Argument('n_features', type=int, short_opt='-p', default=100),
        Argument('n_informative', type=int, default=10),
        Argument('noise', type=float, default=0.0),
        Argument('bias', type=float, default=0.0),
        Argument('scale_targets', action='store_true',
            help='scale the target values to zero-mean and unit variance'),
        Argument('test_ratio', type=float, default=0.3),
        Argument('outfile', short_opt='-o', type=str, required=True)]
    def __call__(self):
        import h5py
        from sklearn.datasets import make_regression
        from sklearn.model_selection import train_test_split
        from sklearn.preprocessing import StandardScaler

        X, y = make_regression(self.n_samples, self.n_features,
            n_informative=self.n_informative, bias=self.bias, noise=self.noise)
        if self.scale_targets:
            self.logger.info('scale target values using StandardScaler')
            scaler = StandardScaler()
            y = scaler.fit_transform(y.reshape(-1, 1))
            y = y.reshape((-1,))
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_ratio)

        self.logger.info('save file: ' + self.outfile)
        prepare_output_file(self.outfile)
        f = h5py.File(self.outfile, 'w')
        f.create_dataset('X_train', data=X_train)
        f.create_dataset('y_train', data=y_train)
        f.create_dataset('X_test',  data=X_test)
        f.create_dataset('y_test', data=y_test)
        f.close()
Example #12
class IcshapeRtToWav(CommandLineTool):
    arguments = [
        Argument('rt_file', short_opt='-i', type=str, required=True),
        Argument('outdir', short_opt='-o', type=str, required=True)
    ]

    def __call__(self):
        import wave
        import numpy as np
        rt = icshape_raw_rt_to_genomic_data(self.rt_file, self.logger)

        def modulate(values,
                     wav_file,
                     sample_rate=44100,
                     n_channels=2,
                     max_amp=32767,
                     x_freq=20):
            upsample_rate = float(sample_rate) / x_freq
            T = float(len(values)) / x_freq
            n_samples = int(sample_rate * T)
            x = np.empty(n_samples, dtype='float32')
            for i in range(len(values)):
                x[int(upsample_rate * i):int(upsample_rate *
                                             (i + 1))] = np.log(values[i] + 1)
            t = np.linspace(0, T, n_samples)
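            # amplitude-modulate an 880 Hz sine carrier with the log-scaled
            # RT stop signal, then rescale the peak to the full int16 range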
            y = max_amp * np.sin(2 * 880 * np.pi * t)
            y *= x
            y *= float(max_amp) / np.abs(y.max())
            data = np.empty(n_samples * n_channels, dtype='int16')
            channel_index = np.arange(0, n_samples * n_channels, n_channels)
            data[channel_index] = y
            data[channel_index + 1] = data[channel_index]

            wav = wave.open(wav_file, 'wb')
            wav.setnchannels(n_channels)
            wav.setsampwidth(2)
            wav.setframerate(sample_rate)
            wav.setnframes(n_samples)
            wav.setcomptype('NONE', 'no compression')
            wav.writeframes(np.getbuffer(data))
            wav.close()

        for i in np.argsort(-rt.meta['rpkm'])[:10]:
            name = rt.names[i]
            values = rt.feature('rt_stop', name)

            wav_file = os.path.join(self.outdir, '%s.wav' % name)
            self.logger.info('create wav file: ' + wav_file)
            prepare_output_file(wav_file)
            modulate(values, wav_file)
Example #13
class CheckStatus(CommandLineTool):
    arguments = [
        Argument('task_name',
                 short_opt='-t',
                 type=str,
                 required=True,
                 choices=Task.get_all_task_names(),
                 help='task name'),
        Argument('summary', action='store_true'),
        Argument('monitor', action='store_true'),
        Argument('interval', type=float, default=2, help='watch interval'),
        Argument('status_dir', type=str, default='status')
    ]

    def check_status(self, task):
        if self.summary:
            finished = 0
            for params in task.paramlist:
                unique_name = task.tool.unique_name.format(**params)
                if os.path.exists(
                        os.path.join(self.status_dir, task.__class__.__name__,
                                     unique_name)):
                    finished += 1
            sys.stdout.write('\rFinished: {}/{} ({:.2f}%)'.format(
                finished, len(task.paramlist),
                100 * float(finished) / len(task.paramlist)))
            sys.stdout.flush()
        else:
            for params in task.paramlist:
                unique_name = task.tool.unique_name.format(**params)
                if os.path.exists(
                        os.path.join(self.status_dir, task.__class__.__name__,
                                     unique_name)):
                    status = '\x1B[32mYES\x1B[0m'
                else:
                    status = '\x1B[31mNO\x1B[0m'
                print '{}({})\t{}'.format(task.__class__.__name__, unique_name,
                                          status)

    def __call__(self):
        task = Task.get_task(self.task_name)()
        if self.monitor:
            import time
            while True:
                self.check_status(task)
                time.sleep(self.interval)
        else:
            self.check_status(task)
        sys.stdout.write('\n')
Example #14
class CleanStatus(CommandLineTool):
    arguments = [
        Argument('task_name',
                 short_opt='-t',
                 type=str,
                 required=True,
                 choices=Task.get_all_task_names(),
                 help='task name'),
        Argument('status_dir', type=str, default='status')
    ]

    def __call__(self):
        task = Task.get_task(self.task_name)()
        for params in task.paramlist:
            unique_name = task.tool.unique_name.format(**params)
            print 'rm ' + os.path.join(self.status_dir,
                                       task.__class__.__name__, unique_name)
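Note that CleanStatus only prints the rm commands rather than executing them; to actually delete the status files, pipe its output through a shell, e.g. python tools.py CleanStatus -t <task> | sh (tools.py again standing in for the unnamed dispatcher script).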
Example #15
class GenomicDataDistribution(CommandLineTool):
    description = 'Plot distribution of values in GenomicData files'
    arguments = [Argument('infile', short_opt='-i', type=str, required=True,
            help='input file in GenomicData format'),
        Argument('feature', type=str, required=True, help='the feature to plot'),
        Argument('outfile', short_opt='-o', type=str, required=True),
        Argument('xlabel', type=str, default='Values'),
        Argument('ylabel', type=str, default='Counts'),
        Argument('weight', type=float, default=1),
        Argument('title', type=str)]
    def __call__(self):
        import_matplotlib()
        import numpy as np
        data = GenomicData(self.infile, feature_names=[self.feature])
        fig, ax = plt.subplots(figsize=(4, 4))
        valid_data = data.features[self.feature][np.logical_not(np.isnan(data.features[self.feature]))]
        ax.hist(valid_data, weights=np.full(len(valid_data), self.weight), bins=20, color='#808080')
        ax.set_xlabel(self.xlabel)
        ax.set_ylabel(self.ylabel)
        #ax.set_yticks(np.arange(len(counts)), map(lambda x: '%.1f'%x, counts.astype('float')*1e-6))
        plt.tight_layout()
        if self.title:
            ax.set_title(self.title)
        self.logger.info('save figure: {}'.format(self.outfile))
        prepare_output_file(self.outfile)
        plt.savefig(self.outfile)
Example #16
class DeepfoldSampleSizeTable(CommandLineTool):
    description = 'Generate a table of the number of samples in the Deepfold dataset and original dataset'
    arguments = [Argument('indirs', short_opt='-i', required=True, nargs='+'),
        Argument('feature', type=str),
        Argument('deepfold_dataset', type=str, required=False, default='r={},p=5,w=100.h5'),
        Argument('outfile', short_opt='-o', required=True),
        Argument('percentile', type=float, default=5),
        Argument('region', type=str),
        Argument('sequence_file', type=str, help='FASTA file')]

    def __call__(self):
        import pandas as pd
        import numpy as np
        import h5py
        regions = ['all', '3UTR', '5UTR', 'lncRNA', 'CDS']
        records = []
        for indir in self.indirs:
            for region in regions:
                deepfold_dataset = self.deepfold_dataset.format(region)
                data = GenomicData(os.path.join(indir, '{}.h5'.format(region)))
                if not self.feature:
                    feature = data.features.keys()[0]
                else:
                    feature = self.feature
                n_samples_total = len(data.features[feature]) - np.isnan(data.features[feature]).sum()
                f = h5py.File(os.path.join(indir, 'deepfold', deepfold_dataset), 'r')
                n_samples_train = f['X_train'].shape[0]
                n_samples_test = f['X_test'].shape[0]
                f.close()
                records.append((indir, deepfold_dataset, region, n_samples_total, n_samples_train, n_samples_test))
        df = pd.DataFrame.from_records(records, columns=('dataset', 'deepfold_dataset', 'region', 'n_samples_total', 'n_samples_train', 'n_samples_test'))
        self.logger.info('save file: {}'.format(self.outfile))
        prepare_output_file(self.outfile)
        df.to_csv(self.outfile, sep='\t', index=False)
Example #17
class CompareDeepfold1DMetrics(CommandLineTool):
    description = 'Compare 1D structure prediction metrics grouped by dataset'
    arguments = [Argument('infile', short_opt='-i', type=str,
        help='metric table generated by MetricTable command'),
        Argument('outfile', short_opt='-o', type=str)]
    def __call__(self):
        import_matplotlib()
        import pandas as pd
        import numpy as np
        df = pd.read_table(self.infile)
        summary = []
        for name, subdf in df.groupby(['model_name']):
            i = subdf['roc_auc'].idxmax()
            summary.append(subdf.ix[i, :])
        summary = pd.concat(summary, axis=1).T
        self.logger.info('save summary table: {}'.format(self.outfile))
        prepare_output_file(self.outfile)
        summary.to_csv(self.outfile, index=False, sep='\t')
Example #18
class CompareDeepfold1DWithKnown(CommandLineTool):
    arguments = [Argument('infile', type=str, required=True, help='output file of PredictDeepfold1D or a directory'),
        Argument('outfile', type=str, required=True, help='output plot file'),
        Argument('max_plots', type=int, default=30, help='maximum number of sequences to plot'),
        Argument('max_length', type=int, default=200)]
    def __call__(self):
        import pandas as pd
        import_matplotlib()

        if os.path.isdir(self.infile):
            df = []
            n = 0
            for filename in os.listdir(self.infile):
                df.append(pd.read_table('{}/{}'.format(self.infile, filename)))
                n += 1
                if n > self.max_plots:
                    break
            df = pd.concat(df)
        else:
            df = pd.read_table(self.infile)
        prepare_output_file(self.outfile)
        self.logger.info('save file: {}'.format(self.outfile))
        with PdfPages(self.outfile) as pdf:
            n_plots = 0
            for name, sub_df in df.groupby('name'):
                fig, ax = plt.subplots(figsize=(15, 1.5))
                length = sub_df.shape[0]
                if length > self.max_length:
                    start = length/2 - self.max_length/2
                    sub_df = sub_df.iloc[start:(start + self.max_length), :]
                ax.plot(sub_df['position'], sub_df['pred'], 'b-', label='prediction')
                ax.plot(sub_df['position'], sub_df['true'], 'k-', label='known')
                ax.legend(loc='upper right')
                ax.set_ylim(-0.1, 1.1)
                ax.set_title(name)
                plt.tight_layout()
                pdf.savefig(fig)
                plt.close(fig)
                n_plots += 1
                if n_plots >= self.max_plots:
                    break
Example #19
class SampleSizeTable(CommandLineTool):
    description = 'Generate a table of the number of samples in the Deepfold dataset and original dataset'
    arguments = [Argument('indir', type=str, required=True,
            help='directory containing GenomicData files with path <dataset>/<region>.h5'),
        Argument('feature', type=str),
        Argument('outfile', short_opt='-o', required=True),
        Argument('percentile', type=float),
        Argument('region', type=str)]
    def __call__(self):
        import pandas as pd
        import h5py
        import numpy as np
        regions = ['all', '3UTR', '5UTR', 'lncRNA', 'CDS', 'ncRNA', 'miRNA']
        if self.region:
            regions = [self.region]
        records = []
        for dataset in os.listdir(self.indir):
            for region in regions:
                data_file = os.path.join(self.indir, dataset, '%s.h5'%region)
                if not os.path.isfile(data_file):
                    self.logger.warn('GenomicData file {} does not exist'.format(data_file))
                    continue
                data = GenomicData(data_file)
                if not self.feature:
                    feature = data.features.keys()[0]
                    #self.logger.info('use the default feature %s because --feature is not given')
                else:
                    feature = self.feature
                if self.percentile is not None:
                    data_valid = data.features[feature][np.logical_not(np.isnan(data.features[feature]))]
                    cutoff1 = np.percentile(data_valid, self.percentile)
                    cutoff2 = np.percentile(data_valid, 100 - self.percentile)
                    n_samples = np.logical_or(data_valid <= cutoff1, data_valid >= cutoff2).sum()
                else:
                    n_samples = len(data.features[feature]) - np.isnan(data.features[feature]).sum()
                records.append((dataset, region, n_samples))
        df = pd.DataFrame.from_records(records, columns=['dataset', 'region', 'n_samples'])
        print df.to_csv(sep='\t', index=False)
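With --percentile 5, for example, only values at or below the 5th percentile or at or above the 95th percentile of the valid (non-NaN) values are counted, so n_samples comes out at roughly 10% of the valid values.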
Example #20
class SelectBestModel(CommandLineTool):
    arguments = [Argument('cvdir', short_opt='-i', type=str, required=True,
            help='directory containing prediction metric files in HDF5 format with datasets: y_true, y_pred'),
        Argument('metric', type=str, default='accuracy'),
        Argument('prefix', short_opt='-o', type=str, required=True)]
    def __call__(self):
        import h5py
        import json
        import pandas as pd
        from sklearn.metrics import roc_auc_score, accuracy_score

        hyperparams = []
        with open(os.path.join(self.cvdir, 'hyperparam.txt'), 'r') as f:
            for line in f:
                hyperparams.append(line.strip())
        f = h5py.File(os.path.join(self.cvdir, 'cv_index.h5'), 'r')
        n_folds = len(f.keys())
        f.close()

        scores = []
        for param_index, hyperparam in enumerate(hyperparams):
            for cv_fold in range(n_folds):
                metric_file = '%s/%d/%d.valid_metrics'%(self.cvdir, param_index, cv_fold)
                f = h5py.File(metric_file, 'r')
                score = accuracy_score(f['y_true'][:], f['y_pred_labels'][:])
                f.close()
                scores.append((param_index, cv_fold, score, hyperparam))
        scores = pd.DataFrame.from_records(scores, columns=('param_index', 'cv_index', self.metric, 'hyperparam'))
        scores.to_csv(self.prefix + '.detail.txt', sep='\t', index=False, doublequote=False, quotechar="'")

        scores_by_hyperparam = scores.groupby(['param_index'], as_index=False)[self.metric].mean()
        scores_by_hyperparam['hyperparam'] = hyperparams
        scores_by_hyperparam.to_csv(self.prefix + '.mean_by_hyperparam.txt', sep='\t', index=False, doublequote=False, quotechar="'")
        best_param_index = scores_by_hyperparam['param_index'][scores_by_hyperparam[self.metric].idxmax()]

        with open(self.prefix + '.best_hyperparam.json', 'w') as f:
            f.write(hyperparams[best_param_index])
        with open(self.prefix + '.best_param_index.txt', 'w') as f:
            f.write(str(best_param_index))
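For reference, the files written by this tool for a given --prefix, as implemented above:

# <prefix>.detail.txt              per-fold scores for every hyperparameter set
# <prefix>.mean_by_hyperparam.txt  scores averaged over the CV folds
# <prefix>.best_hyperparam.json    the best hyperparameter set in JSON format
# <prefix>.best_param_index.txt    index of the best hyperparameter set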
Example #21
class PrintCommands(CommandLineTool):
    arguments = [
        Argument('task_name',
                 short_opt='-t',
                 type=str,
                 required=True,
                 choices=Task.get_all_task_names(),
                 help='task name'),
        Argument('param_file',
                 type=str,
                 help='parameter file in JSON format, decoded as a list of dicts')
    ]

    def __call__(self):
        task = Task.get_task(self.task_name)()
        if self.param_file is not None:
            import json
            with open(self.param_file, 'r') as f:
                task.paramlist = json.load(f)
        for cmd in task.generate_commands():
            print cmd
Example #22
class MutateMap(CommandLineTool):
    description = ''
    arguments = [Argument('model_file', type=str, required=True),
        Argument('n_sequences', type=int, default=100)]
    def __call__(self):
        import numpy as np
        # import_keras() is assumed (as in the other tools in this listing)
        # to make the keras module and its backend K available in scope
        import_keras()

        self.logger.info('load model: {}'.format(self.model_file))
        model = keras.models.load_model(self.model_file)
        window_size = K.int_shape(model.input)[1]
        n = K.int_shape(model.input)[2]

        # build every single-base mutant of each random wild-type sequence:
        # each of the window_size positions is shifted to each of the (n - 1)
        # alternative bases, giving n_sequences*(n - 1)*window_size mutants
        X_wt = np.random.randint(n, size=(self.n_sequences, window_size))
        X_mut = np.repeat(X_wt, (n - 1)*window_size, axis=0)
        for i in range(X_mut.shape[0]):
            pos = (i % ((n - 1)*window_size)) // (n - 1)
            offset = (i % (n - 1)) + 1
            X_mut[i, pos] = (X_mut[i, pos] + offset) % n
        X_wt = onehot_encode(X_wt, range(n))
        X_mut = onehot_encode(X_mut, range(n))
        y_wt = model.predict(X_wt)
        y_mut = model.predict(X_mut)
Example #23
class SelectModel(CommandLineTool):
    arguments = [Argument('metric_file', type=str, required=True, help='A table generated by MetricTable'),
        Argument('outfile', short_opt='-o', type=str, required=False, help='parameters of best models in JSON format'),
        Argument('num', short_opt='-n', type=int, default=1,
            help='maximum number of models to select for each dataset. 0 for all models.'),
        Argument('metric', type=str, default='accuracy')]
    def __call__(self):
        import pandas as pd
        import json
        metric_table = pd.read_table(self.metric_file)
        if self.num <= 0:
            self.num = metric_table.shape[0]
        else:
            self.num = min(self.num, metric_table.shape[0])
        selected = metric_table.sort_values(self.metric, ascending=False).iloc[:self.num, :]
        if self.outfile is not None:
            paramlist = []
            for index, row in  selected.iterrows():
                paramlist.append(row.to_dict())
            prepare_output_file(self.outfile)
            with open(self.outfile, 'w') as f:
                json.dump(paramlist, f, indent=2)
        print selected
Example #24
class CreateCvIndex(CommandLineTool):
    arguments = [Argument('n_samples', short_opt='-n', type=int, required=True,
            help='number of samples'),
        Argument('n_folds', short_opt='-k', type=int, required=True,
            help='number of folds'),
        Argument('outfile', short_opt='-o', type=str, required=True,
            help='output file in HDF5 format with dataset names: /<fold>/train, /<fold>/test')]
    def __call__(self):
        import h5py
        from sklearn.model_selection import KFold
        import numpy as np

        self.logger.info('save file: ' + self.outfile)
        prepare_output_file(self.outfile)
        fout = h5py.File(self.outfile, 'w')
        kfold = KFold(self.n_folds, shuffle=True)
        fold = 0
        for train_index, test_index in kfold.split(np.arange(self.n_samples)):
            g = fout.create_group('%d'%fold)
            g.create_dataset('train', data=train_index)
            g.create_dataset('test', data=test_index)
            fold += 1
        fout.close()
Example #25
class ScoreStructure(CommandLineTool):
    description = 'Compare predicted and known structures in CT format and calculate the metrics'
    arguments = [Argument('true_file', type=str, required=True, help='CT format'),
        Argument('pred_file', type=str, required=True, help='CT format'),
        Argument('outfile', short_opt='-o', type=str),
        Argument('exact', action='store_true', help='count exact pairs')]
    def score_ct(self, true_ct_file, pred_ct_file):
        ct_true = read_ct(true_ct_file)
        ct_pred = read_ct(pred_ct_file)
        scores = score_structure(make_pair_list(ct_true[2]),
            make_pair_list(ct_pred[2]), exact=self.exact)
        scores['length'] = len(ct_pred[1])
        return scores

    def __call__(self):
        keys = ['length', 'sensitivity', 'ppv', 'tp_in_true', 'true_pairs', 'tp_in_pred', 'pred_pairs']
        fout = sys.stdout
        if self.outfile is not None:
            self.logger.info('save file: {}'.format(self.outfile))
            prepare_output_file(self.outfile)
            fout = open(self.outfile, 'w')
        if os.path.isdir(self.true_file) and os.path.isdir(self.pred_file):
            names = [os.path.splitext(a)[0] for a in os.listdir(self.pred_file)]
            fout.write('\t'.join(['name'] + keys) + '\n')
            for name in names:
                #self.logger.debug('read ct: {}'.format(name))
                scores = self.score_ct('{}/{}.ct'.format(self.true_file, name),
                    '{}/{}.ct'.format(self.pred_file, name))
                fout.write('\t'.join([name] + map(str, map(lambda x: scores[x], keys))) + '\n')
        else:
            scores = self.score_ct(self.true_file, self.pred_file)
            name = os.path.splitext(self.pred_file)[0]
            fout.write('\t'.join(['name'] + keys) + '\n')
            fout.write('\t'.join([name] + map(str, map(lambda x: scores[x], keys))) + '\n')
        if self.outfile is not None:
            fout.close()
Example #26
class CompareCtFiles(CommandLineTool):
    arguments = [Argument('indir1', type=str, required=True, help='directory containing ct files'),
        Argument('indir2', type=str, required=True, help='directory containing ct files'),
        Argument('group_name1', type=str, default='Group 1'),
        Argument('group_name2', type=str, default='Group 2'),
        Argument('outfile', type=str, required=True, help='output plot file'),
        Argument('max_plots', type=int, default=20, help='maximum number of sequences to plot'),
        Argument('random', action='store_true', help='randomly select structures to plot'),
        Argument('max_length', type=int, default=200)]
    def __call__(self):
        import_matplotlib()
        import numpy as np

        names1 = map(lambda x: os.path.splitext(x)[0], os.listdir(self.indir1))
        names2 = map(lambda x: os.path.splitext(x)[0], os.listdir(self.indir2))

        prepare_output_file(self.outfile)
        self.logger.info('save file: {}'.format(self.outfile))
        with PdfPages(self.outfile) as pdf:
            n_plots = 0
            for name in names1:
                if name not in names2:
                    continue
                name1, seq1, pairs1 = read_ct('{}/{}.ct'.format(self.indir1, name))
                name2, seq2, pairs2 = read_ct('{}/{}.ct'.format(self.indir2, name))
                fig, axes = plt.subplots(nrows=2, figsize=(12, 4), sharex=True)
                length = len(seq1)
                x = np.arange(length)
                pairs1 = np.asarray(pairs1, dtype='int64')
                pairs1[pairs1 > 1] = 1
                pairs2 = np.asarray(pairs2, dtype='int64')
                pairs2[pairs2 > 1] = 1
                if length > self.max_length:
                    start = length/2 - self.max_length/2
                    pairs1 = pairs1[start:(start + self.max_length)]
                    pairs2 = pairs2[start:(start + self.max_length)]
                    x = x[start:(start + self.max_length)]
                axes[0].bar(x, pairs1, label=self.group_name1, color='b', edgecolor='w')
                axes[0].set_title('{}({})'.format(name, self.group_name1))
                axes[1].bar(x, pairs2, label=self.group_name2, color='b', edgecolor='w')
                axes[1].set_title('{}({})'.format(name, self.group_name2))
                pdf.savefig(fig)
                plt.close(fig)
                n_plots += 1
                if n_plots >= self.max_plots:
                    break
Example #27
class CheckReady(CommandLineTool):
    arguments = [
        Argument('task_name',
                 short_opt='-t',
                 type=str,
                 required=True,
                 choices=Task.get_all_task_names(),
                 help='task name')
    ]

    def __call__(self):
        task = Task.get_task(self.task_name)
        for params in task.paramlist:
            unique_name = task.unique_name.format(**params)
            ready, missing = task.ready(params)
            if ready:
                print '{}({})\t\x1B[32mYES\x1B[0m'.format(
                    task.__name__, unique_name)
            else:
                print '{}({})\t\x1B[31mNO\x1B[0m\t{}'.format(
                    task.__name__, unique_name, missing)
Example #28
class CompareStructurePredictionMetrics(CommandLineTool):
    description = 'Plot the difference between score metrics of two methods'
    arguments = [Argument('infile1', type=str, required=True, help='output file of ScoreStructure'),
        Argument('infile2', type=str, required=True, help='output file of ScoreStructure'),
        Argument('outfile', short_opt='-o', type=str, help='output plot file'),
        Argument('metric', type=str, default='sensitivity'),
        Argument('title', type=str, default='Distribution of {metric} {compare_method}'),
        Argument('compare_method', type=str, default='difference')]
    def __call__(self):
        import pandas as pd
        import_matplotlib()
        table1 = pd.read_table(self.infile1)
        table2 = pd.read_table(self.infile2)
        merged = pd.merge(table1, table2, on='name')
        diff = merged['{}_x'.format(self.metric)] - merged['{}_y'.format(self.metric)]
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.hist(diff, bins=50)
        ax.set_title(self.title.format(metric=self.metric, compare_method=self.compare_method,
            mean=diff.mean(), median=diff.median()))
        ax.set_xlim(-1, 1)
        self.logger.info('save plot file: {}'.format(self.outfile))
        prepare_output_file(self.outfile)
        plt.savefig(self.outfile)
Example #29
class TestEstimator(CommandLineTool):
    arguments = [Argument('test_file', short_opt='-i', type=str, required=True,
            help='the dataset in HDF5 format, required datasets: X_test, y_test'),
        Argument('model_file', type=str, required=True,
            help='file path for saving the model (in Python pickle format)'),
        Argument('model_type', type=str, default='sklearn', choices=('sklearn', 'keras')),
        Argument('metrics', type=list, default='accuracy'),
        Argument('metric_file', short_opt='-o', type=str, required=True),
        Argument('flatten', action='store_true', help='flatten the input dataset before applying the model'),]
    def __call__(self):
        import h5py
        import zipfile

        self.logger.info('load test dataset: ' + self.test_file)
        f = h5py.File(self.test_file, 'r')
        X_test = f['X_test'][:]
        y_test = f['y_test'][:]
        f.close()
        if self.flatten:
            X_test = X_test.reshape((X_test.shape[0], -1))
            self.logger.info('flatten the test data to dimension: (%d, %d)'%X_test.shape)
        if self.model_type == 'keras':
            import_keras()
            self.logger.info('load keras model: ' + self.model_file)
            model = keras.models.load_model(self.model_file)
        elif self.model_type == 'sklearn':
            import cPickle
            self.logger.info('load sklearn model: ' + self.model_file)
            zipf = zipfile.ZipFile(self.model_file, 'r')
            f = zipf.open('model', 'r')
            model = cPickle.load(f)
            zipf.close()
        if self.model_type == 'sklearn':
            y_pred_labels = model.predict(X_test)
            model_name = model.__class__.__name__
            if model_name == 'SVC':
                y_pred = model.decision_function(X_test)
            elif model_name == 'RandomForestClassifier':
                y_pred = model.predict_proba(X_test)[:, 1]
            else:
                raise ValueError('unknown sklearn model ' + model_name)
        elif self.model_type == 'keras':
            y_pred = model.predict(X_test)
            y_pred_labels = (y_pred >= 0.5).astype('int32')

        self.logger.info('save metrics: ' + self.metric_file)
        prepare_output_file(self.metric_file)
        f = h5py.File(self.metric_file, 'w')
        f.create_dataset('y_true', data=y_test)
        f.create_dataset('y_pred', data=y_pred)
        f.create_dataset('y_pred_labels', data=y_pred_labels)
        g = f.create_group('metrics')
        for metric in self.metrics:
            scorer = get_scorer(metric)
            if metric == 'roc_auc':
                score = scorer(y_test, y_pred)
            else:
                score = scorer(y_test, y_pred_labels)
            self.logger.info('calculate metric {}: {}'.format(metric, score))
            g.create_dataset(metric, data=score)
        f.close()
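The metric file written above has the following HDF5 layout (the same y_true/y_pred_labels datasets are read back by SelectBestModel, and the metrics group by MetricTableCross):

# /y_true          ground-truth labels from the test set
# /y_pred          continuous scores (decision function or class probability)
# /y_pred_labels   thresholded class labels
# /metrics/<name>  one scalar dataset per requested metric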
Example #30
class TrainEstimator(CommandLineTool):
    arguments = [Argument('train_file', short_opt='-i', type=str, required=True,
            help='the dataset in HDF5 format, required datasets: X_train, y_train'),
        Argument('cv_index_file', type=str, help='CV index created by CreateCvIndex'),
        Argument('cv_fold', type=int),
        Argument('model_name', type=str, required=True,
            help='name of the classifier'),
        Argument('model_type', type=str, default='sklearn', choices=('sklearn', 'keras')),
        Argument('model_file', short_opt='-o', type=str,
            help='file path for saving the model (in Python pickle format)'),
        Argument('model_script', type=str,
            help='load a model specification from a Python script (should define the model variable)'),
        Argument('valid_metric_file', type=str),
        Argument('flatten', action='store_true', help='flatten the input dataset before applying the model'),
        Argument('regress', action='store_true', help='train a regression model'),
        Argument('metrics', type=list),
        Argument('scale_targets', action='store_true',
            help='scale the targets values by mean and variance'),
        Argument('hyperparam', type=str, default='{}', help='model hyper-parameter in JSON format'),
        Argument('hyperparam_file', type=str, help='model hyper-parameter in JSON format from file')]

    def __call__(self):
        import json
        import h5py
        import cPickle
        import zipfile

        if self.hyperparam_file:
            with open(self.hyperparam_file, 'r') as f:
                hyperparam = json.load(f)
        else:
            hyperparam = json.loads(self.hyperparam)

        self.logger.info('load data: {}'.format(self.train_file))
        fin = h5py.File(self.train_file, 'r')
        X_train = fin['X_train'][:]
        y_train = fin['y_train'][:]
        fin.close()
        X_valid = None
        y_valid = None

        if self.cv_index_file is not None:
            if self.cv_fold is None:
                raise ValueError('argument --cv-fold is required if --cv-index-file is specified')
            if self.valid_metric_file is None:
                raise ValueError('argument --valid-metric-file is required if --cv-index-file is specified')
            self.logger.info('load CV index: ' + self.cv_index_file)
            f = h5py.File(self.cv_index_file, 'r')
            train_index = f[str(self.cv_fold)]['train'][:]
            test_index = f[str(self.cv_fold)]['test'][:]
            f.close()
            X_valid = X_train[test_index]
            y_valid = y_train[test_index]
            X_train = X_train[train_index]
            y_train = y_train[train_index]

        if self.flatten:
            X_train = X_train.reshape((X_train.shape[0], -1))
            self.logger.info('flatten the training data to dimension: (%d, %d)'%X_train.shape)
            if X_valid is not None:
                X_valid = X_valid.reshape((X_valid.shape[0], -1))
                self.logger.info('flatten the validation data to dimension: (%d, %d)'%X_valid.shape)

        if self.scale_targets:
            self.logger.info('scale the target values using StandardScaler')
            from sklearn.preprocessing import StandardScaler
            scaler = StandardScaler()
            y_train = scaler.fit_transform(y_train.reshape(-1, 1)).reshape((-1,))
            if y_valid is not None:
                y_valid = scaler.transform(y_valid.reshape(-1, 1)).reshape((-1,))

        if self.model_script:
            self.logger.info('create model from script: ' + self.model_script)
            if self.model_type == 'keras':
                self.logger.info('use the keras model')
                #with open(os.path.join(os.path.dirname(__file__), 'import_keras.py'), 'r') as f:
                #    exec compile(f.read(), 'import_keras.py', 'exec')
                import_keras()
                with open(self.model_script, 'r') as f:
                    exec compile(f.read(), self.model_script, 'exec')
                from keras.optimizers import SGD
                optimizer = SGD()
                if self.regress:
                    loss = 'mean_squared_error'
                    metrics = ['mae']
                else:
                    loss = 'binary_crossentropy'
                    metrics = ['acc']
                model.compile(optimizer=optimizer,
                              loss=loss,
                              metrics=metrics)
                model.summary()
            else:
                with open(self.model_script, 'r') as f:
                    exec compile(f.read(), self.model_script, 'exec')
        else:
            self.logger.info('create model by name: ' + self.model_name)
            model = get_model(self.model_name, hyperparam)
        self.logger.info('train the model')
        if self.model_type == 'keras':
            model.fit(X_train, y_train, batch_size=100, epochs=20)
        else:
            self.logger.info('model parameters: ' + json.dumps(model.get_params()))
            model.fit(X_train, y_train)
        if self.model_file:
            self.logger.info('save model: {}'.format(self.model_file))
            prepare_output_file(self.model_file)
            if self.model_type == 'keras':
                model.save(self.model_file)
                f = h5py.File(self.model_file, 'r+')
                f.create_dataset('hyperparam', data=json.dumps(hyperparam))
                f.close()
            else:
                zipf = zipfile.ZipFile(self.model_file, 'w', zipfile.ZIP_DEFLATED)
                zipf.writestr('model', cPickle.dumps(model))
                zipf.writestr('hyperparam', json.dumps(hyperparam))
                zipf.close()

        if X_valid is not None:
            if self.metrics is None:
                if self.regress:
                    self.metrics = ['mean_squared_error', 'r2']
                else:
                    self.metrics = ['accuracy']
            self.logger.info('validate the model')
            if self.regress:
                y_pred = model.predict(X_valid)
            else:
                y_pred_labels = model.predict(X_valid)
            self.logger.info('save the metrics: ' + self.valid_metric_file)
            prepare_output_file(self.valid_metric_file)
            f = h5py.File(self.valid_metric_file, 'w')
            f.create_dataset('model_name', data=self.model_name)
            f.create_dataset('hyperparam', data=json.dumps(hyperparam))
            f.create_dataset('y_true', data=y_valid)
            if self.regress:
                f.create_dataset('y_pred', data=y_pred)
            else:
                f.create_dataset('y_pred_labels', data=y_pred_labels)
            g = f.create_group('metrics')
            for metric in self.metrics:
                scorer = get_scorer(metric)
                if self.regress:
                    score = scorer(y_valid, y_pred)
                else:
                    score = scorer(y_valid, y_pred_labels)
                self.logger.info('calculate metric {}: {}'.format(metric, score))
                g.create_dataset(metric, data=score)
            if self.scale_targets:
                g.create_dataset('scale_y_mean', data=scaler.mean_)
                g.create_dataset('scale_y_std', data=scaler.scale_)
            f.close()
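A minimal sketch of loading a model saved in sklearn mode, mirroring the writestr calls above ('model.zip' is a hypothetical path):

import zipfile
import json
import cPickle

zipf = zipfile.ZipFile('model.zip', 'r')
model = cPickle.load(zipf.open('model', 'r'))      # the pickled estimator
hyperparam = json.loads(zipf.read('hyperparam'))   # the saved hyperparameters
zipf.close()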