Code example #1
 def __call__(self):
     import glob
     import h5py
     records = []
     name_dict = {'d': 'data_name', 'w': 'window_size', 'p': 'percentile', 'm': 'model_name', 'r': 'region'}
     header = ['experiment_type', 'data_name', 'model_experiment_type', 'model_data_name',
         'percentile', 'window_size', 'model_name'] + self.metrics
     for dirname in os.listdir('metrics/cross/{},{}'.format(self.experiment_type, self.data_name)):
         model_experiment_type, model_data_name = dirname.split(',')
         for filename in glob.glob('metrics/cross/{},{}/{}/*.h5'.format(self.experiment_type, self.data_name, dirname)):
             d = parse_filename(filename, name_dict)
             if d['region'] != self.region:
                 continue
             f = h5py.File(filename, 'r')
             record = [self.experiment_type, self.data_name,
                 model_experiment_type, model_data_name,
                 d['percentile'], d['window_size'], d['model_name']]
             grp = f['metrics']
             for metric in self.metrics:
                 record.append(str(grp[metric][()]))
             f.close()
             records.append(record)
     self.logger.info('save file: {}'.format(self.outfile))
     prepare_output_file(self.outfile)
     with open(self.outfile, 'w') as f:
         f.write('\t'.join(header) + '\n')
         for record in records:
             f.write('\t'.join(record) + '\n')
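Note: the snippets in this listing call helpers such as `parse_filename` and `prepare_output_file` that are defined elsewhere in the project. A minimal sketch of what they might look like, inferred only from how they are called here (the actual implementations may differ):

import os

def parse_filename(filename, name_dict):
    # Assumed behavior: split a basename like 'r=all,p=5,w=100,m=cnn.h5'
    # into key=value pairs and map short keys to long names via name_dict.
    fields = {}
    basename = os.path.splitext(os.path.basename(filename))[0]
    for pair in basename.split(','):
        key, _, value = pair.partition('=')
        fields[name_dict.get(key, key)] = value
    return fields

def prepare_output_file(filename):
    # Assumed behavior: create the parent directory of an output file
    # if it does not exist yet.
    dirname = os.path.dirname(filename)
    if dirname and not os.path.isdir(dirname):
        os.makedirs(dirname)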
Code example #2
    def __call__(self):
        import h5py
        import numpy as np
        import_matplotlib()

        model_weights = h5py.File(self.infile, 'r')['/model_weights/dense_1/dense_1/kernel:0'][:]
        window_size = model_weights.shape[0]//len(self.alphabet)
        offset = (window_size + 1)//2
        model_weights = model_weights.reshape((window_size, len(self.alphabet)))
        fig, ax = plt.subplots(figsize=(20, 4))
        for i in range(len(self.alphabet)):
            ax.plot(np.arange(window_size), model_weights[:, i], '-', label=self.alphabet[i])
        ax.set_xlim(0, window_size)
        ax.set_ylabel('Weight')
        ax.set_xlabel('Position')
        ax.set_xticks(np.arange(window_size), minor=True)
        ax.set_xticks(np.arange(offset%5, window_size + 1, step=5))
        ax.set_xticklabels(np.arange(offset%5, window_size + 1, step=5) - window_size/2)
        ax.set_ylim(-2, 2)
        ax.legend()
        plt.tight_layout()
        self.logger.info('save figure: {}'.format(self.outfile))
        prepare_output_file(self.outfile)
        plt.savefig(self.outfile, dpi=150, bbox_inches='tight')
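`import_matplotlib()` (used throughout this listing) and the module-level `plt` and `PdfPages` names are assumed to come from a small helper that selects a non-interactive backend before pyplot is imported; a plausible sketch, not the project's actual code:

def import_matplotlib():
    # Assumed helper: pick the non-interactive Agg backend, then expose
    # plt and PdfPages as module-level globals for the callers above.
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    from matplotlib.backends.backend_pdf import PdfPages
    globals()['plt'] = plt
    globals()['PdfPages'] = PdfPages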
Code example #3
    def __call__(self):
        import h5py
        from sklearn.datasets import make_classification
        from sklearn.model_selection import train_test_split

        X, y = make_classification(self.n_samples, self.n_features,
            n_informative=self.n_informative,
            n_redundant=self.n_redundant,
            n_repeated=self.n_repeated,
            n_classes=self.n_classes,
            n_clusters_per_class=self.n_clusters_per_class,
            flip_y=self.flip_y,
            class_sep=self.class_sep,
            shift=self.shift,
            scale=self.scale)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_ratio)

        self.logger.info('save file: ' + self.outfile)
        prepare_output_file(self.outfile)
        f = h5py.File(self.outfile, 'w')
        f.create_dataset('X_train', data=X_train)
        f.create_dataset('y_train', data=y_train)
        f.create_dataset('X_test',  data=X_test)
        f.create_dataset('y_test', data=y_test)
        f.close()
Code example #4
def train(args):
    import numpy as np
    import keras
    import h5py
    from models import get_model
    from ioutils import prepare_output_file, make_dir

    logger.info('load training data: ' + args.input_file)
    fin = h5py.File(args.input_file, 'r')
    X_train = fin[args.xname][:]
    y_train = fin[args.yname][:]
    fin.close()

    valid_data = None
    if args.valid_file:
        logger.info('load validation data: ' + args.valid_file)
        fin = h5py.File(args.valid_file, 'r')
        X_valid = fin[args.valid_xname][:]
        y_valid = fin[args.valid_yname][:]
        fin.close()
        valid_data = (X_valid, y_valid)

    if args.n_threads >= 1:
        logger.info('set number of threads to {} for TensorFlow'.format(
            args.n_threads))
        set_keras_num_threads(args.n_threads)
    window_size = X_train.shape[1]
    model = get_model(args.model)(window_size)
    if args.regression:
        loss = 'mean_squared_error'
        metrics = ['mean_squared_error']
    else:
        loss = 'binary_crossentropy'
        metrics = ['accuracy']
    model.compile(optimizer='Adam', loss=loss, metrics=metrics)
    model.summary()

    callbacks = []
    if args.tensorboard_log_dir:
        from keras.callbacks import TensorBoard
        callbacks.append(TensorBoard(log_dir=args.tensorboard_log_dir))
    if args.keras_log is not None:
        logger.info('open CSV log file: {}'.format(args.keras_log))
        make_dir(os.path.dirname(args.keras_log))
        callbacks.append(keras.callbacks.CSVLogger(args.keras_log))

    logger.info('train model')
    model.fit(X_train,
              y_train,
              batch_size=args.batch_size,
              epochs=args.epochs,
              callbacks=callbacks,
              verbose=args.keras_verbose,
              validation_data=valid_data)
    logger.info('save model: {}'.format(args.model_file))
    prepare_output_file(args.model_file)
    model.save(args.model_file)
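`set_keras_num_threads` is not shown in this listing; with the TensorFlow 1.x backend this generation of Keras ran on, it could be implemented roughly as below (an assumption, not the project's actual code):

def set_keras_num_threads(n_threads):
    # Assumed implementation for Keras on a TensorFlow 1.x backend:
    # cap both intra-op and inter-op parallelism at n_threads.
    import tensorflow as tf
    from keras import backend as K
    config = tf.ConfigProto(
        intra_op_parallelism_threads=n_threads,
        inter_op_parallelism_threads=n_threads)
    K.set_session(tf.Session(config=config))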
Code example #5
    def __call__(self):
        import h5py
        import numpy as np
        import_matplotlib()
        from scipy import signal

        self.logger.info('read BUMHMM file: ' + self.posterior_file)
        posteriors = h5py.File(self.posterior_file, 'r')['posteriors'][:]
        self.logger.info('read BUMHMM input file: ' + self.bumhmm_input_file)
        f = h5py.File(self.bumhmm_input_file, 'r')
        start = f['start'][:]
        end = f['end'][:]
        name = f['name'][:]
        f.close()

        self.logger.info('open pdf file: ' + self.outfile)
        prepare_output_file(self.outfile)
        plt.rcParams['axes.labelsize'] = 'small'
        plt.rcParams['xtick.labelsize'] = 'x-small'
        plt.rcParams['ytick.labelsize'] = 'x-small'
        plt.rcParams['axes.titlesize'] = 'small'

        window_size = 300
        window = np.ones(window_size)
        with PdfPages(self.outfile) as pdf:
            n_plots = 0
            while n_plots < 50:
                i = np.random.choice(len(name))
                if end[i] - start[i] < window_size:
                    continue
                na_fraction = signal.convolve(np.isnan(posteriors[start[i]:end[i]]).astype('float'), window, mode='valid')/window_size
                valid_windows = np.nonzero(na_fraction < 0.75)[0]
                if len(valid_windows) <= 0:
                    continue

                self.logger.info('plot %s'%name[i])
                offset = valid_windows[0]
                length = min(window_size, end[i] - start[i] - offset)
                index = np.arange(start[i] + offset, start[i] + length + offset)
                posteriors_fillna = posteriors[index]
                na_index = np.nonzero(np.isnan(posteriors_fillna))[0]
                posteriors_fillna[na_index] = -0.1
                color = np.full(length, '#0000ff', dtype='S7')
                color[na_index] = '#999999'

                x = np.arange(offset, offset + length)
                fig, ax = plt.subplots(figsize=(15, 1.5))

                ax.bar(x, posteriors_fillna, color=color, edgecolor='none')
                ax.set_xlim(0, length)
                ax.set_ylim(-0.1, 1)
                ax.set_title('BUMHMM posteriors (%s)'%name[i])

                plt.tight_layout()
                pdf.savefig(fig)
                plt.clf()
                plt.close(fig)

                n_plots += 1
Code example #6
    def __call__(self):
        import keras
        import h5py

        self.logger.info('load training data: ' + self.infile)
        fin = h5py.File(self.infile, 'r')
        X_train = fin[self.xname][:]
        y_train = fin[self.yname][:]
        fin.close()

        valid_data = None
        if self.valid_file:
            self.logger.info('load validation data: ' + self.valid_file)
            fin = h5py.File(self.valid_file, 'r')
            X_valid = fin[self.valid_xname][:]
            y_valid = fin[self.valid_yname][:]
            fin.close()
            valid_data = (X_valid, y_valid)

        window_size = X_train.shape[1]
        from keras.optimizers import RMSprop
        optimizer = RMSprop(lr=self.learning_rate)
        # load model from a script; the script may override the
        # variables optimizer, loss and metrics defined in this scope
        regression = self.regression
        if self.regression:
            loss = 'mean_squared_error'
            metrics = ['mean_squared_error']
        else:
            loss = 'binary_crossentropy'
            metrics = ['accuracy']

        with open(self.model_script, 'r') as f:
            exec compile(f.read(), self.model_script, 'exec')

        model.compile(optimizer=optimizer,
                    loss=loss,
                    metrics=metrics)
        model.summary()

        callbacks = []
        if self.tensorboard_log_dir:
            from keras.callbacks import TensorBoard
            callbacks.append(TensorBoard(log_dir=self.tensorboard_log_dir))
        if self.keras_log is not None:
            self.logger.info('open CSV log file: {}'.format(self.keras_log))
            make_dir(os.path.dirname(self.keras_log))
            callbacks.append(keras.callbacks.CSVLogger(self.keras_log))

        self.logger.info('train model')
        model.fit(X_train, y_train,
            batch_size=self.batch_size, epochs=self.epochs,
            callbacks=callbacks, verbose=self.keras_verbose,
            validation_data=valid_data)
        self.logger.info('save model: {}'.format(self.model_file))
        prepare_output_file(self.model_file)
        model.save(self.model_file)
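The `exec compile(...)` call above expects `self.model_script` to bind a Keras model to the name `model`; it can read `window_size`, `optimizer`, `loss`, `metrics` and `regression` from the enclosing scope. A hypothetical script (the layer choices are illustrative only):

# Hypothetical model script consumed by the exec above; it must define
# a variable named `model`. `window_size` comes from the caller's scope,
# and the input is assumed to be a one-hot encoded sequence of width 4.
from keras.models import Sequential
from keras.layers import Conv1D, GlobalMaxPooling1D, Dense

model = Sequential()
model.add(Conv1D(64, 8, activation='relu', input_shape=(window_size, 4)))
model.add(GlobalMaxPooling1D())
model.add(Dense(1, activation='sigmoid'))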
Code example #7
    def __call__(self):
        import h5py
        import numpy as np
        import_matplotlib()

        self.logger.info('read BUMHMM file: ' + self.posterior_file)
        posteriors = h5py.File(self.posterior_file, 'r')['posteriors'][:]
        self.logger.info('read BUMHMM input file: ' + self.bumhmm_input_file)
        f = h5py.File(self.bumhmm_input_file, 'r')
        start = f['start'][:]
        end = f['end'][:]
        name = f['name'][:]
        coverage = f['coverage'][:]
        sample_name = f['sample_name'][:]
        replicate = f['replicate'][:]
        dropoff_count = f['dropoff_count'][:]
        f.close()

        self.logger.info('open pdf file: ' + self.outfile)
        prepare_output_file(self.outfile)
        plt.rcParams['axes.labelsize'] = 'small'
        plt.rcParams['xtick.labelsize'] = 'x-small'
        plt.rcParams['ytick.labelsize'] = 'x-small'
        plt.rcParams['axes.titlesize'] = 'small'

        with PdfPages(self.outfile) as pdf:
            for i in np.random.choice(len(name), size=10):
                self.logger.info('plot %s'%name[i])
                length = min(300, end[i] - start[i] - 50)
                index = np.arange(start[i] + 50, start[i] + length + 50)
                x = np.arange(50, 50 + length)
                fig, axes = plt.subplots(1 + 2*coverage.shape[0], figsize=(15, 2 + 2*coverage.shape[0]), sharex=True)

                posteriors_fillna = posteriors[index]
                color = np.asarray(['#999999' if np.isnan(a) else '#0000ff' for a in posteriors_fillna])
                posteriors_fillna[np.isnan(posteriors_fillna)] = -0.05

                axes[0].bar(x, posteriors_fillna, color=color, edgecolor='none')
                axes[0].set_xlim(0, length)
                axes[0].set_ylim(-0.1, 1)
                axes[0].set_title('BUMHMM posteriors (%s)'%name[i])
                for j in range(coverage.shape[0]):
                    axes[2*j + 1].bar(x, dropoff_count[j, index].astype('float')/coverage[j, index], edgecolor='none')
                    axes[2*j + 1].set_title('Dropoff rate of %s (%s)'%(replicate[j], sample_name[j]))
                    axes[2*j + 1].set_ylim(0, 0.5)

                    axes[2*j + 2].bar(x, coverage[j, index], edgecolor='none')
                    axes[2*j + 2].set_title('Coverage of %s (%s)'%(replicate[j], sample_name[j]))
                plt.tight_layout()
                pdf.savefig(fig)
                plt.clf()
                plt.close(fig)
Code example #8
    def __call__(self):
        import h5py
        import zipfile

        self.logger.info('load test dataset: ' + self.test_file)
        f = h5py.File(self.test_file, 'r')
        X_test = f['X_test'][:]
        y_test = f['y_test'][:]
        f.close()
        if self.flatten:
            X_test = X_test.reshape((X_test.shape[0], -1))
            self.logger.info('flatten the test data to dimension: (%d, %d)'%X_test.shape)
        if self.model_type == 'keras':
            import_keras()
            self.logger.info('load keras model: ' + self.model_file)
            model = keras.models.load_model(self.model_file)
        elif self.model_type == 'sklearn':
            import cPickle
            self.logger.info('load sklearn model: ' + self.model_file)
            zipf = zipfile.ZipFile(self.model_file, 'r')
            f = zipf.open('model', 'r')
            model = cPickle.load(f)
            zipf.close()
        if self.model_type == 'sklearn':
            y_pred_labels = model.predict(X_test)
            model_name = model.__class__.__name__
            if model_name == 'SVC':
                y_pred = model.decision_function(X_test)
            elif model_name == 'RandomForestClassifier':
                y_pred = model.predict_proba(X_test)[:, 1]
            else:
                raise ValueError('unknown sklearn model ' + model_name)
        elif self.model_type == 'keras':
            y_pred = model.predict(X_test)
            y_pred_labels = (y_pred >= 0.5).astype('int32')

        self.logger.info('save metrics: ' + self.metric_file)
        prepare_output_file(self.metric_file)
        f = h5py.File(self.metric_file, 'w')
        f.create_dataset('y_true', data=y_test)
        f.create_dataset('y_pred', data=y_pred)
        f.create_dataset('y_pred_labels', data=y_pred_labels)
        g = f.create_group('metrics')
        for metric in self.metrics:
            scorer = get_scorer(metric)
            if metric == 'roc_auc':
                score = scorer(y_test, y_pred)
            else:
                score = scorer(y_test, y_pred_labels)
            self.logger.info('calculate metric {}: {}'.format(metric, score))
            g.create_dataset(metric, data=score)
        f.close()
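`get_scorer` maps a metric name to a scoring function; given the calling convention above (`roc_auc` receives continuous scores, other metrics receive labels) it is presumably a thin lookup over `sklearn.metrics`. An assumed sketch:

def get_scorer(metric):
    # Assumed helper: map a metric name to an sklearn scoring function
    # with the (y_true, y_pred) calling convention used above.
    from sklearn import metrics as sk_metrics
    scorers = {
        'accuracy': sk_metrics.accuracy_score,
        'roc_auc': sk_metrics.roc_auc_score,
        'mean_squared_error': sk_metrics.mean_squared_error,
        'r2': sk_metrics.r2_score,
    }
    return scorers[metric]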
Code example #9
    def __call__(self):
        import numpy as np
        from sklearn.model_selection import train_test_split
        import h5py
        from common import sequence_to_array
        from scipy import signal

        self.logger.info('read input file: ' + self.infile)
        _, base_density, length, _ = read_background_rt(self.infile)
        names = base_density.keys()
        self.logger.info('read sequence file: ' + self.sequence_file)
        sequences = dict(read_fasta(self.sequence_file))

        if self.offset is None:
            self.offset = (self.window_size + 1) / 2
        X = []
        y = []
        if self.smooth:
            self.logger.info(
                'smooth the values using Gaussian window of width %.1f' %
                self.smooth_width)
            window = signal.gaussian(100, std=self.smooth_width)
        for name in names:
            # stop across transcripts as well: the break in the inner loop
            # only ends the current sequence
            if len(X) >= self.max_samples:
                break
            seq = sequences[name]
            values = base_density[name] / base_density[name].mean()
            if self.smooth:
                # smooth the signal
                values = signal.convolve(values, window, mode='same')
            for i in range(0, len(seq) - self.window_size, self.stride):
                X.append(sequence_to_array(seq[i:(i + self.window_size)]))
                y.append(values[i + self.offset])
                if len(X) >= self.max_samples:
                    break
        n_samples = len(X)
        self.logger.info('created {} samples'.format(n_samples))

        X = np.concatenate(X)
        X = X.reshape((n_samples, self.window_size, 4))
        y = np.asarray(y, dtype='float32')
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_ratio)

        self.logger.info('save file: ' + self.outfile)
        prepare_output_file(self.outfile)
        f = h5py.File(self.outfile, 'w')
        f.create_dataset('offset', data=int(self.offset))
        f.create_dataset('window_size', data=int(self.window_size))
        f.create_dataset('X_train', data=X_train)
        f.create_dataset('y_train', data=y_train)
        f.create_dataset('X_test', data=X_test)
        f.create_dataset('y_test', data=y_test)
        f.close()
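`sequence_to_array` from `common` is the one-hot encoder implied by the later reshape to `(n_samples, window_size, 4)`; a minimal sketch under that assumption (the alphabet order is a guess):

import numpy as np

def sequence_to_array(seq, alphabet='ATCG'):
    # Assumed helper: one-hot encode a nucleotide sequence into a
    # (len(seq), len(alphabet)) float array; unknown bases stay all-zero.
    array = np.zeros((len(seq), len(alphabet)), dtype='float32')
    for i, base in enumerate(seq):
        j = alphabet.find(base)
        if j >= 0:
            array[i, j] = 1.0
    return array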
Code example #10
 def __call__(self):
     import_matplotlib()
     import pandas as pd
     import numpy as np
     df = pd.read_table(self.infile)
     summary = []
     for name, subdf in df.groupby(['model_name']):
         i = subdf['roc_auc'].idxmax()
         summary.append(subdf.loc[i, :])
     summary = pd.concat(summary, axis=1).T
     self.logger.info('save summary table: {}'.format(self.outfile))
     prepare_output_file(self.outfile)
     summary.to_csv(self.outfile, index=False, sep='\t')
Code example #11
    def __call__(self):
        import json
        import h5py
        import cPickle
        from sklearn.metrics import roc_auc_score, accuracy_score

        hyperparam = json.loads(self.hyperparam)
        model = get_model(self.model_name, hyperparam)

        self.logger.info('load data: {}'.format(self.infile))
        fin = h5py.File(self.infile, 'r')
        X_train = fin['X_train'][:]
        y_train = fin['y_train'][:]
        fin.close()
        X_valid = None
        y_valid = None

        if self.cv_index_file is not None:
            if self.cv_fold is None:
                raise ValueError('argument --cv-fold is required if --cv-index-file is specified')
            self.logger.info('load CV index: ' + self.cv_index_file)
            f = h5py.File(self.cv_index_file, 'r')
            train_index = f[str(self.cv_fold)]['train'][:]
            test_index = f[str(self.cv_fold)]['test'][:]
            f.close()
            X_valid = X_train[test_index]
            y_valid = y_train[test_index]
            X_train = X_train[train_index]
            y_train = y_train[train_index]

        self.logger.info('train the model')
        model.fit(X_train, y_train)
        self.logger.info('save model: {}'.format(self.outfile))
        prepare_output_file(self.outfile)
        with open(self.outfile, 'wb') as f:
            cPickle.dump(model, f)

        if X_valid is not None:
            self.logger.info('validate the model')
            y_pred_labels = model.predict(X_valid)
            self.logger.info('save the metrics: ' + self.valid_metric_file)
            prepare_output_file(self.valid_metric_file)
            f = h5py.File(self.valid_metric_file, 'w')
            f.create_dataset('model_name', data=self.model_name)
            f.create_dataset('hyperparam', data=json.dumps(hyperparam))
            f.create_dataset('y_pred_labels', data=y_pred_labels)
            f.create_dataset('y_true', data=y_valid)
            g = f.create_group('metrics')
            g.create_dataset('accuracy', data=accuracy_score(y_valid, y_pred_labels))
            f.close()
Code example #12
 def __call__(self):
     import pandas as pd
     import_matplotlib()
     table1 = pd.read_table(self.infile1)
     table2 = pd.read_table(self.infile2)
     merged = pd.merge(table1, table2, on='name')
     diff = merged['{}_x'.format(self.metric)] - merged['{}_y'.format(self.metric)]
     fig, ax = plt.subplots(figsize=(10, 6))
     ax.hist(diff, bins=50)
     ax.set_title(self.title.format(metric=self.metric, compare_method=self.compare_method,
         mean=diff.mean(), median=diff.median()))
     ax.set_xlim(-1, 1)
     self.logger.info('save plot file: {}'.format(self.outfile))
     prepare_output_file(self.outfile)
     plt.savefig(self.outfile)
Code example #13
    def __call__(self):
        import h5py
        from sklearn.model_selection import KFold
        import numpy as np

        self.logger.info('save file: ' + self.outfile)
        prepare_output_file(self.outfile)
        fout = h5py.File(self.outfile, 'w')
        kfold = KFold(self.n_folds, shuffle=True)
        fold = 0
        for train_index, test_index in kfold.split(np.arange(self.n_samples)):
            g = fout.create_group('%d'%fold)
            g.create_dataset('train', data=train_index)
            g.create_dataset('test', data=test_index)
            fold += 1
        fout.close()
Code example #14
 def __call__(self):
     import_matplotlib()
     import numpy as np
     data = GenomicData(self.infile, feature_names=[self.feature])
     fig, ax = plt.subplots(figsize=(4, 4))
     valid_data = data.features[self.feature][np.logical_not(np.isnan(data.features[self.feature]))]
     ax.hist(valid_data, weights=np.full(len(valid_data), self.weight), bins=20, color='#808080')
     ax.set_xlabel(self.xlabel)
     ax.set_ylabel(self.ylabel)
     #ax.set_yticks(np.arange(len(counts)), map(lambda x: '%.1f'%x, counts.astype('float')*1e-6))
     plt.tight_layout()
     if self.title:
         ax.set_title(self.title)
     self.logger.info('save figure: {}'.format(self.outfile))
     prepare_output_file(self.outfile)
     plt.savefig(self.outfile)
Code example #15
 def __call__(self):
     import pandas as pd
     import json
     metric_table = pd.read_table(self.metric_file)
     if self.num <= 0:
         self.num = metric_table.shape[0]
     else:
         self.num = min(self.num, metric_table.shape[0])
     selected = metric_table.sort_values('accuracy', ascending=False).iloc[:self.num, :]
     if self.outfile is not None:
         paramlist = []
         for index, row in selected.iterrows():
             paramlist.append(row.to_dict())
         prepare_output_file(self.outfile)
         with open(self.outfile, 'w') as f:
             json.dump(paramlist, f, indent=2)
     print selected
Code example #16
    def __call__(self):
        import wave
        import numpy as np
        rt = icshape_raw_rt_to_genomic_data(self.rt_file, self.logger)

        def modulate(values,
                     wav_file,
                     sample_rate=44100,
                     n_channels=2,
                     max_amp=32767,
                     x_freq=20):
            upsample_rate = float(sample_rate) / x_freq
            T = float(len(values)) / x_freq
            n_samples = int(sample_rate * T)
            x = np.empty(n_samples, dtype='float32')
            for i in range(len(values)):
                x[int(upsample_rate * i):int(upsample_rate *
                                             (i + 1))] = np.log(values[i] + 1)
            t = np.linspace(0, T, n_samples)
            y = max_amp * np.sin(2 * 880 * np.pi * t)
            y *= x
            y *= float(max_amp) / np.abs(y).max()  # normalize by the peak absolute amplitude
            data = np.empty(n_samples * n_channels, dtype='int16')
            channel_index = np.arange(0, n_samples * n_channels, n_channels)
            data[channel_index] = y
            data[channel_index + 1] = data[channel_index]

            wav = wave.open(wav_file, 'wb')
            wav.setnchannels(n_channels)
            wav.setsampwidth(2)
            wav.setframerate(sample_rate)
            wav.setnframes(n_samples)
            wav.setcomptype('NONE', 'no compression')
            wav.writeframes(data.tobytes())  # np.getbuffer is Python 2 only; tobytes works everywhere
            wav.close()

        for i in np.argsort(-rt.meta['rpkm'])[:10]:
            name = rt.names[i]
            values = rt.feature('rt_stop', name)

            wav_file = os.path.join(self.outdir, '%s.wav' % name)
            self.logger.info('create wav file: ' + wav_file)
            prepare_output_file(wav_file)
            modulate(values, wav_file)
Code example #17
    def __call__(self):
        from genomic_data import GenomicData
        import numpy as np
        import h5py

        self.logger.info('read BUMHMM file: ' + self.posterior_file)
        posteriors = h5py.File(self.posterior_file, 'r')['posteriors'][:]
        self.logger.info('read BUMHMM input file: ' + self.bumhmm_input_file)
        f = h5py.File(self.bumhmm_input_file, 'r')
        start = f['start'][:]
        end = f['end'][:]
        name = f['name'][:]
        f.close()
        values = map(lambda i: posteriors[start[i]:end[i]], range(len(name)))
        self.logger.info('save file: ' + self.outfile)
        prepare_output_file(self.outfile)
        GenomicData.from_data(name, features={
            'bumhmm': values
        }).save(self.outfile)
Code example #18
 def number_of_samples(self):
     import h5py
     import glob
     name_dict = {'d': 'data_name', 'w': 'window_size', 'p': 'percentile', 'm': 'model_name', 'r': 'region'}
     header = ['experiment_type', 'data_name', 'region', 'window_size', 'percentile', 'n_train', 'n_test']
     records = []
     for filename in glob.glob('data/{}/{}/deepfold/*.h5'.format(self.experiment_type, self.data_name)):
         d = parse_filename(filename, name_dict)
         f = h5py.File(filename, 'r')
         records.append((self.experiment_type, self.data_name, d['region'], d['window_size'], d['percentile'],
             f['y_train'].shape[0], f['y_test'].shape[0]))
         f.close()
     self.logger.info('save file: {}'.format(self.outfile))
     prepare_output_file(self.outfile)
     with open(self.outfile, 'w') as f:
         f.write('\t'.join(header))
         f.write('\n')
         for record in records:
             f.write('\t'.join(map(str, record)))
             f.write('\n')
Code example #19
 def __call__(self):
     import subprocess
     from formats import read_rme, read_ct
     values = read_rme(self.value_file).values()[0]
     title, seq, pairs = read_ct(self.ct_file)
     values_fillna = []
     for i in range(len(seq)):
         if i in values:
             values_fillna.append(values[i])
         else:
             values_fillna.append(0.5)
     colormap = ';'.join(map(lambda x: '%.3f'%x, values_fillna))
     prepare_output_file(self.outfile)
     cmdline = ['java', '-cp', self.varna_path,
         'fr.orsay.lri.varna.applications.VARNAcmd',
         '-i', self.ct_file, '-resolution', '5.0',
         '-colorMapStyle', 'rocknroll',
         '-colorMap', colormap, '-o', self.outfile]
     self.logger.info('execute: {}'.format(' '.join(cmdline)))
     p = subprocess.Popen(cmdline)
     p.wait()
Code example #20
 def auc_lines(self, auc, params):
     import numpy as np
     fig, ax = plt.subplots(figsize=(5, 4))
     for keyval in params.keys():
         if self.xkey == 'window_size':
             window_sizes = [int(d['window_size']) for d in params[keyval]]
             sorted_index = np.argsort(window_sizes)
             auc[keyval] = map(lambda i: auc[keyval][i], sorted_index)
             params[keyval] = map(lambda i: params[keyval][i], sorted_index)
             ax.set_xticks(np.arange(len(window_sizes)))
             ax.set_xticklabels(sorted(window_sizes))
             ax.set_xlabel('Window Size')
         ax.plot(np.arange(len(auc[keyval])), auc[keyval], lw=1.5, label=keyval)
     ax.set_title(self.title)
     ax.set_ylabel('AUROC')
     ax.set_ylim(0.75, 1)
     ax.legend(loc='lower right')
     plt.tight_layout()
     self.logger.info('save figure: {}'.format(self.outfile))
     prepare_output_file(self.outfile)
     plt.savefig(self.outfile)
Code example #21
 def __call__(self):
     keys = ['length', 'sensitivity', 'ppv', 'tp_in_true', 'true_pairs', 'tp_in_pred', 'pred_pairs']
     fout = sys.stdout
     if self.outfile is not None:
         self.logger.info('save file: {}'.format(self.outfile))
         prepare_output_file(self.outfile)
         fout = open(self.outfile, 'w')
     if os.path.isdir(self.true_file) and os.path.isdir(self.pred_file):
         names = [os.path.splitext(a)[0] for a in os.listdir(self.pred_file)]
         fout.write('\t'.join(['name'] + keys) + '\n')
         for name in names:
             #self.logger.debug('read ct: {}'.format(name))
             scores = self.score_ct('{}/{}.ct'.format(self.true_file, name),
                 '{}/{}.ct'.format(self.pred_file, name))
             fout.write('\t'.join([name] + map(str, map(lambda x: scores[x], keys))) + '\n')
     else:
         scores = self.score_ct(self.true_file, self.pred_file)
         name = os.path.splitext(self.pred_file)[0]
         fout.write('\t'.join(['name'] + keys) + '\n')
         fout.write('\t'.join([name] + map(str, map(lambda x: scores[x], keys))) + '\n')
     if self.outfile is not None:
         fout.close()
Code example #22
    def __call__(self):
        from genomic_data import GenomicData
        import numpy as np

        self.logger.info('read input rt file: ' + self.infile)
        name = []
        length = []
        rpkm = []
        rt_stop = []
        base_density = []
        with open(self.infile, 'r') as f:
            f.readline()
            n_records = 0
            for lineno, line in enumerate(f):
                c = line.strip().split('\t')
                if (lineno % 2) == 0:
                    name.append(c[0])
                    length.append(int(c[1]))
                    rpkm.append(float(c[2].split(',')[0]))
                    rt_stop.append(
                        np.asarray(c[3:], dtype='float').astype('int32'))
                else:
                    base_density.append(
                        np.asarray(c[3:], dtype='float').astype('int32'))
                n_records += 1
        self.logger.info('successfully read %d records' % n_records)

        self.logger.info('create output file: ' + self.outfile)
        prepare_output_file(self.outfile)
        GenomicData.from_data(name,
                              features={
                                  'rt_stop': rt_stop,
                                  'base_density': base_density
                              },
                              meta={
                                  'rpkm': np.asarray(rpkm, dtype='float64'),
                                  'length': np.asarray(length, dtype='int64')
                              }).save(self.outfile)
Code example #23
 def __call__(self):
     import h5py
     from sklearn.model_selection import KFold
     import numpy as np
     if (self.n_samples is None) and (self.data_file is None):
         raise ValueError('either --n-samples or --data-file must be specified')
     if self.data_file:
         self.logger.info('determine number of samples from data file: ' + self.data_file)
         fin = h5py.File(self.data_file, 'r')
         self.n_samples = fin['y_train'].shape[0]
         fin.close()
         self.logger.info('number of training samples: {}'.format(self.n_samples))
     self.logger.info('save file: ' + self.outfile)
     prepare_output_file(self.outfile)
     fout = h5py.File(self.outfile, 'w')
     kfold = KFold(self.n_folds, shuffle=True)
     fold = 0
     for train_index, test_index in kfold.split(np.arange(self.n_samples)):
         g = fout.create_group('%d'%fold)
         g.create_dataset('train', data=train_index)
         g.create_dataset('test', data=test_index)
         fold += 1
     fout.close()
Code example #24
    def __call__(self):
        import h5py
        from sklearn.datasets import make_regression
        from sklearn.model_selection import train_test_split
        from sklearn.preprocessing import StandardScaler

        X, y = make_regression(self.n_samples, self.n_features,
            n_informative=self.n_informative, bias=self.bias, noise=self.noise)
        if self.scale_targets:
            self.logger.info('scale target values using StandardScaler')
            scaler = StandardScaler()
            y = scaler.fit_transform(y.reshape(-1, 1))
            y = y.reshape((-1,))
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_ratio)

        self.logger.info('save file: ' + self.outfile)
        prepare_output_file(self.outfile)
        f = h5py.File(self.outfile, 'w')
        f.create_dataset('X_train', data=X_train)
        f.create_dataset('y_train', data=y_train)
        f.create_dataset('X_test',  data=X_test)
        f.create_dataset('y_test', data=y_test)
        f.close()
Code example #25
    def roc_curve(self, fpr, tpr, auc, params):
        import numpy as np
        fig, ax = plt.subplots(figsize=(5, 5))
        plt.rcParams['font.size'] = 11
        plt.rcParams['legend.fontsize'] = 11
        ax.plot([0, 1], [0, 1], 'k--')
        for keyval in params.keys():
            best_index = np.argmax(auc[keyval])
            ax.plot(fpr[keyval][best_index], tpr[keyval][best_index],
                label='{} (AUC = {:.3f})'.format(keyval, auc[keyval][best_index]))
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1.1)
        ax.set_xticks(np.linspace(0, 1, 6))
        ax.set_yticks(np.linspace(0, 1, 6))
        ax.legend(loc='lower right')
        ax.set_title(self.title)
        plt.tight_layout()

        self.logger.info('save figure: {}'.format(self.outfile))
        prepare_output_file(self.outfile)
        plt.savefig(self.outfile)
Code example #26
    def __call__(self):
        import pandas as pd
        import_matplotlib()

        if os.path.isdir(self.infile):
            df = []
            n = 0
            for filename in os.listdir(self.infile):
                df.append(pd.read_table('{}/{}'.format(self.infile, filename)))
                n += 1
                if n >= self.max_plots:
                    break
            df = pd.concat(df)
        else:
            df = pd.read_table(self.infile)
        prepare_output_file(self.outfile)
        self.logger.info('save file: {}'.format(self.outfile))
        with PdfPages(self.outfile) as pdf:
            n_plots = 0
            for name, sub_df in df.groupby('name'):
                fig, ax = plt.subplots(figsize=(15, 1.5))
                length = sub_df.shape[0]
                if length > self.max_length:
                    start = length/2 - self.max_length/2
                    sub_df = sub_df.iloc[start:(start + self.max_length), :]
                ax.plot(sub_df['position'], sub_df['pred'], 'b-', label='prediction')
                ax.plot(sub_df['position'], sub_df['true'], 'k-', label='known')
                ax.legend(loc='upper right')
                ax.set_ylim(-0.1, 1.1)
                ax.set_title(name)
                plt.tight_layout()
                pdf.savefig(fig)
                plt.close(fig)
                n_plots += 1
                if n_plots >= self.max_plots:
                    break
Code example #27
    def __call__(self):
        import_matplotlib()
        import numpy as np
        from formats import read_ct

        names1 = map(lambda x: os.path.splitext(x)[0], os.listdir(self.indir1))
        names2 = map(lambda x: os.path.splitext(x)[0], os.listdir(self.indir2))

        prepare_output_file(self.outfile)
        self.logger.info('save file: {}'.format(self.outfile))
        with PdfPages(self.outfile) as pdf:
            n_plots = 0
            for name in names1:
                if name not in names2:
                    continue
                name1, seq1, pairs1 = read_ct('{}/{}.ct'.format(self.indir1, name))
                name2, seq2, pairs2 = read_ct('{}/{}.ct'.format(self.indir2, name))
                fig, axes = plt.subplots(nrows=2, figsize=(12, 4), sharex=True)
                length = len(seq1)
                x = np.arange(length)
                pairs1 = np.asarray(pairs1, dtype='int64')
                pairs1[pairs1 > 1] = 1
                pairs2 = np.asarray(pairs2, dtype='int64')
                pairs2[pairs2 > 1] = 1
                if length > self.max_length:
                    start = length/2 - self.max_length/2
                    pairs1 = pairs1[start:(start + self.max_length)]
                    pairs2 = pairs2[start:(start + self.max_length)]
                    x = x[start:(start + self.max_length)]
                axes[0].bar(x, pairs1, label=self.group_name1, color='b', edgecolor='w')
                axes[0].set_title('{}({})'.format(name, self.group_name1))
                axes[1].bar(x, pairs2, label=self.group_name2, color='b', edgecolor='w')
                axes[1].set_title('{}({})'.format(name, self.group_name2))
                pdf.savefig(fig)
                plt.close(fig)
                n_plots += 1
                if n_plots >= self.max_plots:
                    break
Code example #28
 def __call__(self):
     import pandas as pd
     import numpy as np
     import h5py
     regions = ['all', '3UTR', '5UTR', 'lncRNA', 'CDS']
     records = []
     for indir in self.indirs:
         for region in regions:
             deepfold_dataset = 'r={},p=5,w=100.h5'.format(region)
             data = GenomicData(os.path.join(indir, '{}.h5'.format(region)))
             if not self.feature:
                 feature = data.features.keys()[0]
             else:
                 feature = self.feature
             n_samples_total = len(data.features[feature]) - np.isnan(data.features[feature]).sum()
             f = h5py.File(os.path.join(indir, 'deepfold', deepfold_dataset), 'r')
             n_samples_train = f['X_train'].shape[0]
             n_samples_test = f['X_test'].shape[0]
             f.close()
             records.append((indir, deepfold_dataset, region, n_samples_total, n_samples_train, n_samples_test))
     df = pd.DataFrame.from_records(records, columns=('dataset', 'deepfold_dataset', 'region', 'n_samples_total', 'n_samples_train', 'n_samples_test'))
     self.logger.info('save file: {}'.format(self.outfile))
     prepare_output_file(self.outfile)
     df.to_csv(self.outfile, sep='\t', index=False)
Code example #29
    def __call__(self):
        import json
        import h5py
        import cPickle
        import zipfile

        if self.hyperparam_file:
            with open(self.hyperparam_file, 'r') as f:
                hyperparam = json.load(f)
        else:
            hyperparam = json.loads(self.hyperparam)

        self.logger.info('load data: {}'.format(self.train_file))
        fin = h5py.File(self.train_file, 'r')
        X_train = fin['X_train'][:]
        y_train = fin['y_train'][:]
        fin.close()
        X_valid = None
        y_valid = None

        if self.cv_index_file is not None:
            if self.cv_fold is None:
                raise ValueError('argument --cv-fold is required if --cv-index-file is specified')
            if self.valid_metric_file is None:
                raise ValueError('argument --valid-metric-file is required if --cv-index-file is specified')
            self.logger.info('load CV index: ' + self.cv_index_file)
            f = h5py.File(self.cv_index_file, 'r')
            train_index = f[str(self.cv_fold)]['train'][:]
            test_index = f[str(self.cv_fold)]['test'][:]
            f.close()
            X_valid = X_train[test_index]
            y_valid = y_train[test_index]
            X_train = X_train[train_index]
            y_train = y_train[train_index]

        if self.flatten:
            X_train = X_train.reshape((X_train.shape[0], -1))
            self.logger.info('flatten the training data to dimension: (%d, %d)'%X_train.shape)
            if X_valid is not None:
                X_valid = X_valid.reshape((X_valid.shape[0], -1))
                self.logger.info('flatten the validation data to dimension: (%d, %d)'%X_valid.shape)

        if self.scale_targets:
            self.logger.info('scale the target values using StandardScaler')
            from sklearn.preprocessing import StandardScaler
            scaler = StandardScaler()
            y_train = scaler.fit_transform(y_train.reshape(-1, 1)).reshape((-1,))
            if y_valid is not None:
                y_valid = scaler.transform(y_valid.reshape(-1, 1)).reshape((-1,))

        if self.model_script:
            self.logger.info('create model from script: ' + self.model_script)
            if self.model_type == 'keras':
                self.logger.info('use the keras model')
                #with open(os.path.join(os.path.dirname(__file__), 'import_keras.py'), 'r') as f:
                #    exec compile(f.read(), 'import_keras.py', 'exec')
                import_keras()
                with open(self.model_script, 'r') as f:
                    exec compile(f.read(), self.model_script, 'exec')
                from keras.optimizers import SGD
                optimizer = SGD()
                if self.regress:
                    loss = 'mean_squared_error'
                    metrics = ['mae']
                else:
                    loss = 'binary_crossentropy'
                    metrics = ['acc']
                model.compile(optimizer=optimizer,
                            loss=loss,
                            metrics=metrics)
                model.summary()
            else:
                with open(self.model_script, 'r') as f:
                    exec compile(f.read(), self.model_script, 'exec')
        else:
            self.logger.info('create model by name: ' + self.model_name)
            model = get_model(self.model_name, hyperparam)
        self.logger.info('train the model')
        if self.model_type == 'keras':
            model.fit(X_train, y_train, batch_size=100, epochs=20)
        else:
            self.logger.info('model parameters: ' + json.dumps(model.get_params()))
            model.fit(X_train, y_train)
        if self.model_file:
            self.logger.info('save model: {}'.format(self.model_file))
            prepare_output_file(self.model_file)
            if self.model_type == 'keras':
                model.save(self.model_file)
                f = h5py.File(self.model_file, 'r+')
                f.create_dataset('hyperparam', data=json.dumps(hyperparam))
                f.close()
            else:
                zipf = zipfile.ZipFile(self.model_file, 'w', zipfile.ZIP_DEFLATED)
                zipf.writestr('model', cPickle.dumps(model))
                zipf.writestr('hyperparam', json.dumps(hyperparam))
                zipf.close()

        if X_valid is not None:
            if self.metrics is None:
                if self.regress:
                    self.metrics = ['mean_squared_error', 'r2']
                else:
                    self.metrics = ['accuracy']
            self.logger.info('validate the model')
            if self.regress:
                y_pred = model.predict(X_valid)
            else:
                y_pred_labels = model.predict(X_valid)
            self.logger.info('save the metrics: ' + self.valid_metric_file)
            prepare_output_file(self.valid_metric_file)
            f = h5py.File(self.valid_metric_file, 'w')
            f.create_dataset('model_name', data=self.model_name)
            f.create_dataset('hyperparam', data=json.dumps(hyperparam))
            f.create_dataset('y_true', data=y_valid)
            if self.regress:
                f.create_dataset('y_pred', data=y_pred)
            else:
                f.create_dataset('y_pred_labels', data=y_pred_labels)
            g = f.create_group('metrics')
            for metric in self.metrics:
                scorer = get_scorer(metric)
                if self.regress:
                    score = scorer(y_valid, y_pred)
                else:
                    score = scorer(y_valid, y_pred_labels)
                self.logger.info('calculate metric {}: {}'.format(metric, score))
                g.create_dataset(metric, data=score)
            if self.scale_targets:
                g.create_dataset('scale_y_mean', data=scaler.mean_)
                g.create_dataset('scale_y_std', data=scaler.scale_)
            f.close()
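In the non-keras branch above, the exec'd model script is likewise expected to bind an estimator to the name `model`; a hypothetical example:

# Hypothetical model script for the sklearn branch of the exec above;
# it only needs to define an unfitted estimator named `model`.
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, n_jobs=-1)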
Code example #30
    def __call__(self):
        from formats import read_fasta
        from tqdm import tqdm
        import numpy as np
        import pandas as pd
        import matplotlib
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt

        self.logger.info('read sequence file: ' + self.sequence_file)
        sequences = dict(read_fasta(self.sequence_file))
        self.logger.info('read input file: ' + self.infile)
        data = GenomicData(self.infile)
        if self.feature is None:
            if len(data.features.keys()) == 1:
                self.feature = data.features.keys()[0]
            else:
                raise ValueError('multiple features found in the input file and the feature is not specified')

        # freqs[i]['A']: frequency of A in bin i
        freqs = []
        scores_all = data.features[self.feature]
        scores_avail = scores_all[np.logical_not(np.isnan(scores_all))]

        self.logger.info('use bin method: %s'%self.bin_method)
        if self.bin_method == 'percentile':
            qs = np.arange(1, self.bins + 1, dtype='float')*100.0/self.bins
            percentiles = np.zeros(self.bins + 1, dtype='float')
            percentiles[0] = scores_avail.min() - 1e-6
            for i in range(1, self.bins):
                percentiles[i] = np.percentile(scores_avail, qs[i - 1])
            percentiles[self.bins] = scores_avail.max() + 1e-6
        elif self.bin_method == 'value':
            density, percentiles = np.histogram(scores_avail, bins=self.bins, density=True)
            # weight the density by the bin widths so the cumulative sum
            # is a cumulative percentage
            qs = np.cumsum(density*np.diff(percentiles))*100.0
            percentiles[0] -= 1e-6
            percentiles[-1] += 1e-6
        else:
            raise ValueError('unknown bin method: %s'%self.bin_method)

        for i in range(self.bins):
            d = {a:0 for a in self.alphabet}
            freqs.append(d)
        self.logger.info('count base frequencies with offset %d'%self.offset)
        for name in tqdm(data.names):
            scores_ts = data.feature(self.feature, name)
            avail_ind = np.nonzero(np.logical_not(np.isnan(scores_ts)))[0]
            seq_ts = np.frombuffer(sequences[name], dtype='S1')
            avail_ind += self.offset
            if self.offset > 0:
                avail_ind = avail_ind[avail_ind < len(seq_ts)]
            elif self.offset < 0:
                avail_ind = avail_ind[avail_ind >= 0]
            scores_avail_ts = scores_ts[avail_ind - self.offset]
            seq_avail_ts = seq_ts[avail_ind]
            for i in range(self.bins):
                seq_bin = seq_avail_ts[np.logical_and(scores_avail_ts <= percentiles[i + 1],
                                                      scores_avail_ts > percentiles[i])]
                for a in self.alphabet:
                    freqs[i][a] += np.count_nonzero(seq_bin == a)
        # normalize base frequencies for each percentile
        freq_total = []
        for i in range(self.bins):
            total = sum(freqs[i].values())
            freq_total.append(total)
            for a in self.alphabet:
                if total == 0:
                    freqs[i][a] = 1.0/len(self.alphabet)
                else:
                    freqs[i][a] = float(freqs[i][a])/total
        table_file = self.prefix + '.txt'
        self.logger.info('save results to file: ' + table_file)
        prepare_output_file(table_file)
        df = []
        for i in range(self.bins):
            for a in self.alphabet:
                df.append((i, qs[i], percentiles[i], a, freq_total[i], freqs[i][a]))
        df = pd.DataFrame.from_records(df,
                                       columns=['bin', 'q', 'percentile', 'base', 'total_freq', 'fraction'])
        df.to_csv(table_file, sep='\t', index=False)
        # plot the distribution
        self.logger.info('create plot')
        plt.rcParams['font.family'] = 'Arial'
        plt.rcParams['axes.labelsize'] = 'medium'
        plt.rcParams['xtick.labelsize'] = 'x-small'
        plt.rcParams['ytick.labelsize'] = 'x-small'
        plt.rcParams['axes.titlesize'] = 'medium'

        fig, ax = plt.subplots(figsize=(7, 5))
        x = np.arange(self.bins)
        xticklabels = ['%.2f'%a for a in percentiles[1:]]
        for base in self.alphabet:
            sub_df = df[df['base'] == base]
            ax.plot(x, sub_df['fraction'], label=base)
        ax.set_xticks(x)
        ax.set_xticklabels(xticklabels)
        ax.set_ylim(0, 1)
        ax.set_xlabel('Values')
        ax.set_ylabel('Base fraction')
        ax.legend()
        plt.tight_layout()

        plot_file = self.prefix + '.pdf'
        self.logger.info('save plot to file: ' + plot_file)
        plt.savefig(plot_file)
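`read_fasta` from `formats` yields `(name, sequence)` pairs, since `dict(read_fasta(...))` is used above; a minimal sketch under that assumption:

def read_fasta(filename):
    # Assumed helper: yield (name, sequence) tuples from a FASTA file,
    # matching the dict(read_fasta(...)) usage above.
    name = None
    seq = []
    with open(filename, 'r') as f:
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                if name is not None:
                    yield name, ''.join(seq)
                name = line[1:].split()[0]
                seq = []
            elif line:
                seq.append(line)
        if name is not None:
            yield name, ''.join(seq)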