Example no. 1
def run_BUMHMM_for_icSHAPE(control_files, treatment_files, sequence_file):
    control_data = []
    for filename in control_files:
        control_data.append(GenomicData.load(filename))
    treatment_data = []
    for filename in treatment_files:
        treatment_data.append(GenomicData.load(filename))
    sequences = dict(read_fasta(sequence_file))
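    # keep only transcript names present in every control and treatment dataset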
    names, counts = np.unique(np.concatenate(
        map(lambda x: x.names, control_data) +
        map(lambda x: x.names, treatment_data)),
                              return_counts=True)
    common_names = names[counts >= (len(control_data) + len(treatment_data))]

    for name in common_names:
        run_BUMHMM(
            rt_stop_control=map(lambda x: x.feature(name, 'rt_stop'),
                                control_data),
            coverage_control=map(lambda x: x.feature(name, 'base_density'),
                                 control_data),
            rt_stop_treatment=map(lambda x: x.feature(name, 'rt_stop'),
                                  treatment_data),
            coverage_treatment=map(lambda x: x.feature(name, 'base_density'),
                                   treatment_data),
            seq=sequences[name])
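
A minimal usage sketch for run_BUMHMM_for_icSHAPE above; the replicate file names and the transcript FASTA path are placeholders, not files from the original project.

# Hypothetical invocation with two control and two treatment replicates
# stored as GenomicData HDF5 files (paths are placeholders).
run_BUMHMM_for_icSHAPE(
    control_files=['control_rep1.h5', 'control_rep2.h5'],
    treatment_files=['treatment_rep1.h5', 'treatment_rep2.h5'],
    sequence_file='transcripts.fa')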
Example no. 2
    def __call__(self):
        import pandas as pd
        import h5py
        import numpy as np
        regions = ['all', '3UTR', '5UTR', 'lncRNA', 'CDS', 'ncRNA', 'miRNA']
        if self.region:
            regions = [self.region]
        records = []
        for dataset in os.listdir(self.indir):
            for region in regions:
                data_file = os.path.join(self.indir, dataset, '%s.h5'%region)
                if not os.path.isfile(data_file):
                    self.logger.warn('GenomicData file {} does not exist'.format(data_file))
                    continue
                data = GenomicData(data_file)
                if not self.feature:
                    # default to the first feature when --feature is not given
                    feature = data.features.keys()[0]
                else:
                    feature = self.feature
                if self.percentile is not None:
                    # count samples in the two tails defined by the percentile cutoffs
                    data_valid = data.features[feature][np.logical_not(np.isnan(data.features[feature]))]
                    cutoff1 = np.percentile(data_valid, self.percentile)
                    cutoff2 = np.percentile(data_valid, 100 - self.percentile)
                    n_samples = np.logical_or(data_valid <= cutoff1, data_valid >= cutoff2).sum()
                else:
                    # count non-NaN samples
                    n_samples = len(data.features[feature]) - np.isnan(data.features[feature]).sum()
                records.append((dataset, region, n_samples))
        df = pd.DataFrame.from_records(records, columns=['dataset', 'region', 'n_samples'])
        print df.to_csv(sep='\t', index=False)
Example no. 3
def hmm_align_reactivities(args):
    import numpy as np
    from Bio import AlignIO
    from glob import glob
    import re
    import h5py
    from tqdm import tqdm
    from genomic_data import GenomicData

    logger.info('read reactivity file: ' + args.reactivity_file)
    data = GenomicData(args.reactivity_file)
    reactivities = {name:data.feature(args.feature, name) for name in data.names}

    logger.info('read alignment directory: ' + args.alignment_dir)
    pat_id = re.compile(r'[0-9]*\|*(?P<te_type>[^:]+)::(?P<seq_name>[^:]+):(?P<start>[0-9]+)-(?P<end>[0-9]+)')
    logger.info('create output file: ' + args.output_file)
    fout = h5py.File(args.output_file, 'w')

    for alignment_file in tqdm(glob(os.path.join(args.alignment_dir, '*.sto')), unit='file'):
        with open(alignment_file, 'r') as fin:
            reactivities_tx = []
            sequences_tx = []
            seq_names = []
            for record in AlignIO.read(fin, 'stockholm'):
                # parse transcriptomic coordinates from sequence names
                m = pat_id.match(record.id)
                if m is None:
                    raise ValueError('invalid record name %s in file: %s'%(record.id, alignment_file))
                r = reactivities.get(m.group('seq_name'))
                if r is not None:
                    r_aligned = np.full(len(record.seq), np.nan, dtype=np.float32)
                    # map reactivities to alignment
                    seq = np.frombuffer(str(record.seq), dtype='S1')
                    r_aligned[seq != '-'] = r[int(m.group('start')):int(m.group('end'))]
                    if np.all(np.isnan(r_aligned)):
                        continue
                    reactivities_tx.append(r_aligned.reshape((1, -1)))
                    sequences_tx.append(seq.reshape((1, -1)))
                    seq_names.append(record.id)
            if len(reactivities_tx) >= args.min_records:
                g = fout.create_group(os.path.splitext(os.path.basename(alignment_file))[0])
                g.create_dataset('reactivities', data=np.concatenate(reactivities_tx, axis=0))
                g.create_dataset('sequences', data=np.concatenate(sequences_tx, axis=0))
                g.create_dataset('seq_names', data=np.asarray(seq_names))
    fout.close()
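
For reference, a minimal sketch of reading back the HDF5 file written by hmm_align_reactivities; the group and dataset names come from the snippet above, while the file path is a placeholder.

import h5py

# Each group is one alignment (named after the .sto file) holding three datasets.
with h5py.File('aligned_reactivities.h5', 'r') as f:  # placeholder path
    for family in f.keys():
        g = f[family]
        reactivities = g['reactivities'][:]  # float32, shape (n_records, alignment_length)
        sequences = g['sequences'][:]        # bytes, shape (n_records, alignment_length)
        seq_names = g['seq_names'][:]
        print('%s: %d aligned records' % (family, reactivities.shape[0]))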
Example no. 4
    def __call__(self):
        from genomic_data import GenomicData
        import numpy as np
        import h5py

        self.logger.info('read BUMHMM file: ' + self.posterior_file)
        posteriors = h5py.File(self.posterior_file, 'r')['posteriors'][:]
        self.logger.info('read BUMHMM input file: ' + self.bumhmm_input_file)
        f = h5py.File(self.bumhmm_input_file, 'r')
        start = f['start'][:]
        end = f['end'][:]
        name = f['name'][:]
        f.close()
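        # split the flat posterior array into per-transcript arrays using start/end offsets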
        values = map(lambda i: posteriors[start[i]:end[i]], range(len(name)))
        self.logger.info('save file: ' + self.outfile)
        prepare_output_file(self.outfile)
        GenomicData.from_data(name, features={
            'bumhmm': values
        }).save(self.outfile)
Example no. 5
    def __call__(self):
        from genomic_data import GenomicData
        import numpy as np

        self.logger.info('read input rt file: ' + self.infile)
        name = []
        length = []
        rpkm = []
        rt_stop = []
        base_density = []
        with open(self.infile, 'r') as f:
            f.readline()
            n_records = 0
            for lineno, line in enumerate(f):
                c = line.strip().split('\t')
                if (lineno % 2) == 0:
                    # even lines: transcript metadata plus per-base RT stop counts
                    name.append(c[0])
                    length.append(int(c[1]))
                    rpkm.append(float(c[2].split(',')[0]))
                    rt_stop.append(
                        np.asarray(c[3:], dtype='float').astype('int32'))
                else:
                    # odd lines: per-base base density; one record spans two lines
                    base_density.append(
                        np.asarray(c[3:], dtype='float').astype('int32'))
                    n_records += 1
        self.logger.info('successfully read %d records' % n_records)

        self.logger.info('create output file: ' + self.outfile)
        prepare_output_file(self.outfile)
        GenomicData.from_data(name,
                              features={
                                  'rt_stop': rt_stop,
                                  'base_density': base_density
                              },
                              meta={
                                  'rpkm': np.asarray(rpkm, dtype='float64'),
                                  'length': np.asarray(length, dtype='int64')
                              }).save(self.outfile)
Example no. 6
def analyze_periodicity(args):
    import h5py
    import numpy as np
    import pandas as pd
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    import seaborn as sns
    sns.set()
    from genomic_data import GenomicData
    from formats import read_fasta

    logger.info('read input file: ' + args.input_file)
    reactivities = {}
    sequences = {}
    if args.assay_type == 'shapemap':
        with h5py.File(args.input_file, 'r') as f:
            for tx_id in f['rep1'].keys():
                reactivities[tx_id] = f['rep1/' + tx_id][:]
            for tx_id in f['seq'].keys():
                sequences[tx_id] = f['seq/' + tx_id][()]
    elif args.assay_type == 'icshape':
        icshape = GenomicData(args.input_file)
        for name in icshape.names:
            reactivities[name] = icshape.feature('icshape', name)
        for name, seq in read_fasta(args.sequence_file):
            if name in icshape.names:
                sequences[name] = np.frombuffer(seq, dtype='S1')
    seq_names = sequences.keys()

    reactivities_concat = np.concatenate(
        [reactivities[name] for name in seq_names])
    sequences_concat = np.concatenate([sequences[name] for name in seq_names])
    notnan_mask = ~np.isnan(reactivities_concat)

    # plot overall distribution of SHAPE reactivity
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.hist(reactivities_concat[notnan_mask], bins=50)
    ax.set_xlabel('Reactivity')
    ax.set_ylabel('Counts')
    # plt.savefig() was called without a path; assume an output file argument,
    # as in analyze_reactivity_periodicity below
    plt.savefig(args.output_file)
Example no. 7
def get_te_icshape(args):
    import numpy as np
    import h5py
    import pandas as pd
    from genomic_data import GenomicData

    logger.info('read TE region file: ' + args.bed_file)
    bed = pd.read_table(args.bed_file, header=None)
    bed[3] = bed[3].astype('S')

    logger.info('read icSHAPE data file: ' + args.icshape_file)
    icshape = GenomicData(args.icshape_file)

    logger.info('extract TE regions from icSHAPE data')
    te_data = []
    te_names = []
    for row in bed.itertuples(index=False):
        data = icshape.feature(args.feature, row[0])
        if data is not None:
            te_data.append(data[row[1]:row[2]])
            te_names.append('%s,%s,%d,%d'%(row[3], row[0], row[1], row[2]))
    
    logger.info('create output file: ' + args.output_file)
    GenomicData.from_data(names=te_names, features={args.feature: te_data}).save(args.output_file)
Example no. 8
    def __call__(self):
        import_matplotlib()
        import numpy as np
        data = GenomicData(self.infile, feature_names=[self.feature])
        fig, ax = plt.subplots(figsize=(4, 4))
        valid_data = data.features[self.feature][np.logical_not(np.isnan(data.features[self.feature]))]
        ax.hist(valid_data, weights=np.full(len(valid_data), self.weight), bins=20, color='#808080')
        ax.set_xlabel(self.xlabel)
        ax.set_ylabel(self.ylabel)
        #ax.set_yticks(np.arange(len(counts)), map(lambda x: '%.1f'%x, counts.astype('float')*1e-6))
        plt.tight_layout()
        if self.title:
            ax.set_title(self.title)
        self.logger.info('save figure: {}'.format(self.outfile))
        prepare_output_file(self.outfile)
        plt.savefig(self.outfile)
Example no. 9
def icshape_raw_rt_to_genomic_data(infile, logger=None):
    import numpy as np
    from genomic_data import GenomicData
    if not logger:
        logger = logging.getLogger('icshape_raw_rt_to_genomic_data')

    logger.info('read input rt file: ' + infile)
    name = []
    length = []
    rpkm = []
    rt_stop = []
    base_density = []
    with open(infile, 'r') as f:
        f.readline()
        n_records = 0
        for lineno, line in enumerate(f):
            c = line.strip().split('\t')
            if (lineno % 2) == 0:
                # even lines: transcript metadata plus per-base base density
                name.append(c[0])
                length.append(int(c[1]))
                rpkm.append(float(c[2].split(',')[0]))
                base_density.append(
                    np.asarray(c[3:], dtype='float').astype('int32'))
            else:
                # odd lines: per-base RT stop counts; one record spans two lines
                rt_stop.append(
                    np.asarray(c[3:], dtype='float').astype('int32'))
                n_records += 1
    logger.info('successfully read %d records' % n_records)

    data = GenomicData.from_data(name,
                                 features={
                                     'rt_stop': rt_stop,
                                     'base_density': base_density
                                 },
                                 meta={
                                     'rpkm': np.asarray(rpkm, dtype='float64'),
                                     'length': np.asarray(length,
                                                          dtype='int64')
                                 })
    return data
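
A minimal usage sketch, assuming a raw icSHAPE RT file at a placeholder path; GenomicData.save is the same method used by the other snippets in this collection.

# Hypothetical usage: convert a raw RT file and save it as GenomicData HDF5.
data = icshape_raw_rt_to_genomic_data('sample.rt')  # placeholder path
data.save('sample.h5')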
Example no. 10
    def __call__(self):
        import pandas as pd
        import numpy as np
        import h5py
        regions = ['all', '3UTR', '5UTR', 'lncRNA', 'CDS']
        records = []
        for indir in self.indirs:
            for region in regions:
                deepfold_dataset = 'r={},p=5,w=100.h5'.format(region)
                data = GenomicData(os.path.join(indir, '{}.h5'.format(region)))
                if not self.feature:
                    feature = data.features.keys()[0]
                else:
                    feature = self.feature
                n_samples_total = len(data.features[feature]) - np.isnan(data.features[feature]).sum()
                f = h5py.File(os.path.join(indir, 'deepfold', deepfold_dataset), 'r')
                n_samples_train = f['X_train'].shape[0]
                n_samples_test = f['X_test'].shape[0]
                f.close()
                records.append((indir, deepfold_dataset, region, n_samples_total, n_samples_train, n_samples_test))
        df = pd.DataFrame.from_records(records, columns=('dataset', 'deepfold_dataset', 'region', 'n_samples_total', 'n_samples_train', 'n_samples_test'))
        self.logger.info('save file: {}'.format(self.outfile))
        prepare_output_file(self.outfile)
        df.to_csv(self.outfile, sep='\t', index=False)
Example no. 11
    def __call__(self):
        from formats import read_fasta
        from tqdm import tqdm
        import numpy as np
        import pandas as pd
        import matplotlib
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt

        self.logger.info('read sequence file: ' + self.sequence_file)
        sequences = dict(read_fasta(self.sequence_file))
        self.logger.info('read input file: ' + self.infile)
        data = GenomicData(self.infile)
        if self.feature is None:
            if len(data.features.keys()) == 1:
                self.feature = data.features.keys()[0]
            else:
                raise ValueError('multiple features found in the input file and the feature is not specified')

        # freqs[i]['A']: frequency of A in bin i
        freqs = []
        scores_all = data.features[self.feature]
        scores_avail = scores_all[np.logical_not(np.isnan(scores_all))]

        self.logger.info('use bin method: %s'%self.bin_method)
        if self.bin_method == 'percentile':
            qs = np.arange(1, self.bins + 1, dtype='float')*100.0/self.bins
            percentiles = np.zeros(self.bins + 1, dtype='float')
            percentiles[0] = scores_avail.min() - 1e-6
            for i in range(1, self.bins):
                percentiles[i] = np.percentile(scores_avail, qs[i - 1])
            percentiles[self.bins] = scores_avail.max() + 1e-6
        elif self.bin_method == 'value':
            density, percentiles = np.histogram(scores_avail, bins=self.bins, density=True)
            # density=True gives a PDF; multiply by bin widths to get cumulative percentages
            qs = np.cumsum(density*np.diff(percentiles))*100.0
            percentiles[0] -= 1e-6
            percentiles[-1] += 1e-6
        else:
            raise ValueError('unknown bin method: %s'%self.bin_method)

        for i in range(self.bins):
            d = {a:0 for a in self.alphabet}
            freqs.append(d)
        self.logger.info('count base frequencies with offset %d'%self.offset)
        for name in tqdm(data.names):
            scores_ts = data.feature(self.feature, name)
            avail_ind = np.nonzero(np.logical_not(np.isnan(scores_ts)))[0]
            seq_ts = np.frombuffer(sequences[name], dtype='S1')
            avail_ind += self.offset
            if self.offset > 0:
                avail_ind = avail_ind[avail_ind < len(seq_ts)]
            elif self.offset < 0:
                avail_ind = avail_ind[avail_ind >= 0]
            scores_avail_ts = scores_ts[avail_ind - self.offset]
            seq_avail_ts = seq_ts[avail_ind]
            for i in range(self.bins):
                seq_bin = seq_avail_ts[np.logical_and(scores_avail_ts <= percentiles[i + 1],
                                                      scores_avail_ts > percentiles[i])]
                for a in self.alphabet:
                    freqs[i][a] += np.count_nonzero(seq_bin == a)
        # normalize base frequencies for each percentile
        freq_total = []
        for i in range(self.bins):
            total = sum(freqs[i].values())
            freq_total.append(total)
            for a in self.alphabet:
                if total == 0:
                    freqs[i][a] = 1.0/len(self.alphabet)
                else:
                    freqs[i][a] = float(freqs[i][a])/total
        table_file = self.prefix + '.txt'
        self.logger.info('save results to file: ' + table_file)
        prepare_output_file(table_file)
        df = []
        for i in range(self.bins):
            for a in self.alphabet:
                df.append((i, qs[i], percentiles[i], a, freq_total[i], freqs[i][a]))
        df = pd.DataFrame.from_records(df,
                                       columns=['bin', 'q', 'percentile', 'base', 'total_freq', 'fraction'])
        df.to_csv(table_file, sep='\t', index=False)
        # plot the distribution
        self.logger.info('create plot')
        plt.rcParams['font.family'] = 'Arial'
        plt.rcParams['axes.labelsize'] = 'medium'
        plt.rcParams['xtick.labelsize'] = 'x-small'
        plt.rcParams['ytick.labelsize'] = 'x-small'
        plt.rcParams['axes.titlesize'] = 'medium'

        fig, ax = plt.subplots(figsize=(7, 5))
        x = np.arange(self.bins)
        xticklabels = ['%.2f'%a for a in percentiles[1:]]
        for base in self.alphabet:
            sub_df = df[df['base'] == base]
            ax.plot(x, sub_df['fraction'], label=base)
        ax.set_xticks(x)
        ax.set_xticklabels(xticklabels)
        ax.set_ylim(0, 1)
        ax.set_xlabel('Values')
        ax.set_ylabel('Base fraction')
        ax.legend()
        plt.tight_layout()

        plot_file = self.prefix + '.pdf'
        self.logger.info('save plot to file: ' + plot_file)
        plt.savefig(plot_file)
Example no. 12
    def __call__(self):
        import_matplotlib()
        from matplotlib.backends.backend_pdf import PdfPages
        import numpy as np
        import h5py
        import pandas as pd
        from sklearn.metrics import roc_auc_score
        from scipy.stats import pearsonr, spearmanr, ttest_ind

        def normalized_mutual_information(p, epsilon=1e-12):
            # p is a 2D contingency table; normalize it to a joint distribution
            p = p + epsilon
            p /= p.sum()
            px = p.sum(axis=1)
            py = p.sum(axis=0)
            pxpy = np.dot(px.reshape(-1, 1), py.reshape(1, -1))
            # mutual information, normalized by the geometric mean of the
            # marginal entropies (the two negative signs cancel in the product)
            Ixy = np.sum(p*np.log(p/pxpy))
            Hx = np.sum(px*np.log(px))
            Hy = np.sum(py*np.log(py))
            return Ixy/np.sqrt(Hx*Hy)

        self.logger.info('read BUMHMM posteriors: ' + self.bumhmm_file)
        bumhmm = GenomicData(self.bumhmm_file, feature_names=['bumhmm'])
        self.logger.info('read icSHAPE scores: ' + self.icshape_file)
        icshape = GenomicData(self.icshape_file, feature_names=['icshape'])
        names, counts = np.unique(np.concatenate((bumhmm.names, icshape.names)), return_counts=True)
        common_names = names[counts >= 2]

        metrics = ('roc_auc', 't_test_p', 'pearsonr', 'spearmanr', 'normalized_mi')
        correlation = {}
        for metric in metrics:
            correlation[metric] = np.full(len(common_names), np.nan)
        values_bumhmm_selected = []
        values_icshape_selected = []
        for i, name in enumerate(common_names):
            values_icshape = icshape.feature('icshape', name)
            values_bumhmm = bumhmm.feature('bumhmm', name)
            valid_index = np.nonzero(np.logical_not(np.logical_or(np.isnan(values_icshape), np.isnan(values_bumhmm))))[0]
            values_icshape = values_icshape[valid_index]
            values_bumhmm = values_bumhmm[valid_index]
            values_bumhmm_binary = (values_bumhmm > 0.5).astype('int32')

            if values_bumhmm_binary.sum() in (0, len(values_bumhmm_binary)):
                self.logger.warn('ignoring %s because only one class is defined by the BUMHMM posteriors'%name)
                continue
            # keep values from the first 10 transcripts for the example scatter plot
            if len(values_icshape_selected) < 10:
                values_icshape_selected.append(values_icshape)
                values_bumhmm_selected.append(values_bumhmm)

            correlation['roc_auc'][i] = roc_auc_score(values_bumhmm_binary, values_icshape)
            a = values_icshape[values_bumhmm_binary == 0]
            b = values_icshape[values_bumhmm_binary == 1]
            correlation['t_test_p'][i] = ttest_ind(a, b)[1]
            correlation['pearsonr'][i] = pearsonr(values_icshape, values_bumhmm)[0]
            correlation['spearmanr'][i] = spearmanr(values_icshape, values_bumhmm)[0]

            bins = np.linspace(0.0, 1.0, 21)
            bin_index = np.digitize(values_icshape, bins) - 1
            pxy = np.empty((2, 20), dtype='float64')
            pxy[0] = np.histogram(a, bins=bins)[0]
            pxy[1] = np.histogram(b, bins=bins)[0]
            correlation['normalized_mi'][i] = normalized_mutual_information(pxy)

        for metric in correlation:
            correlation[metric][np.isinf(correlation[metric])] = np.nan
        df = pd.DataFrame(correlation)
        df['seqname'] = common_names
        df = df.dropna(axis=0, how='any')

        table_file = self.prefix + '.txt'
        self.logger.info('save correlations to file: ' + table_file)
        prepare_output_file(table_file)
        df.to_csv(table_file, sep='\t', index=False)

        plot_file = self.prefix + '.pdf'
        self.logger.info('save plot file: ' + plot_file)
        with PdfPages(plot_file) as pdf:
            values_bumhmm_selected = np.concatenate(values_bumhmm_selected)
            values_icshape_selected = np.concatenate(values_icshape_selected)
            fig, ax = plt.subplots(figsize=(8, 6))
            ax.scatter(values_icshape_selected, values_bumhmm_selected, s=1, edgecolor='none')
            ax.set_xlabel('icSHAPE scores')
            ax.set_ylabel('BUMHMM posterior probabilities')
            ax.set_title('icSHAPE scores and BUMHMM posteriors')
            pdf.savefig(fig)
            plt.clf()
            plt.close(fig)

            for metric in metrics:
                fig, ax = plt.subplots(figsize=(8, 6))
                if metric == 't_test_p':
                    ax.hist(np.log(df[metric] + 1e-12), bins=50)
                else:
                    ax.hist(df[metric], bins=50)
                ax.set_title('Correlation between icSHAPE scores and BUMHMM (%s)'%metric)
                ax.set_xlabel(metric)
                ax.set_ylabel('Counts')

                plt.tight_layout()
                pdf.savefig(fig)
                plt.clf()
                plt.close(fig)
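
As a sanity check, a toy call to the normalized_mutual_information helper defined above (assuming it is lifted out of __call__ to module level); the contingency counts are invented for illustration.

import numpy as np

# Invented 2x4 contingency table: rows = BUMHMM class (0/1),
# columns = icSHAPE score bins. Strongly associated rows and columns
# give a value well above 0; independent counts give a value near 0.
pxy = np.array([[30., 10., 5., 2.],
                [2., 5., 10., 30.]])
print(normalized_mutual_information(pxy))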
Example no. 13
    def __call__(self):
        from genomic_data import GenomicData
        import pandas as pd
        import numpy as np

        known = GenomicData(self.known_file, [self.feature])
        y_pred = []
        y_true = []
        names = []
        length = []
        for name, seq, structure, energy in read_rnafold(self.infile):
            names.append(name)
            structure = np.frombuffer(structure, dtype='S1')
            length.append(len(structure))
            y_pred.append((structure != '.').astype('int32'))
            y_true_seq = known.feature(self.feature, name)
            if y_true_seq is None:
                found = np.nonzero(map(lambda x: x.startswith(name), known.names))[0]
                if len(found) == 0:
                    raise ValueError('sequence {} could not be found'.format(name))
                elif len(found) == 1:
                    self.logger.warn('partial sequence name match {} => {}'.format(known.names[found[0]], name))
                    y_true_seq = known.feature(self.feature, known.names[found[0]])
                else:
                    raise ValueError('multiple partial matches found for {}'.format(name))
            y_true.append(y_true_seq)
        """
        y_pred = np.concatenate(y_pred)
        y_true = np.concatenate(y_true)

        scores = {}
        for metric in self.metrics:
            # y_pred is an array of continous scores
            scorer = get_scorer(metric)
            scores[metric] = scorer(y_true, y_pred)
            self.logger.info('metric {} = {}'.format(metric, scores[metric]))
        if self.outfile is not None:
            self.logger.info('save file: {}'.format(self.outfile))
            prepare_output_file(self.outfile)
            fout = h5py.File(self.outfile, 'w')
            fout.create_dataset('y_true', data=y_true)
            fout.create_dataset('y_pred', data=y_pred)
            fout.create_dataset('y_pred_labels', data=y_pred)
            grp = fout.create_group('metrics')
            for metric in self.metrics:
                scorer = get_scorer(metric)
                if get_scorer_type(metric) == 'continuous':
                    try:
                        score = scorer(y_true, y_pred)
                    except ValueError:
                        score = np.nan
                else:
                    score = scorer(y_true, y_pred_labels)
                
                grp.create_dataset(metric, data=scores[metric])
            fout.close()"""
        self.logger.info('calculate metrics by sequence')
        records = []
        for i in range(len(names)):
            y_true_ = y_true[i]
            y_pred_ = y_pred[i]
            y_pred_labels_ = y_pred_
            scores = []
            for metric in self.metrics:
                scorer = get_scorer(metric)
                if get_scorer_type(metric) == 'continuous':
                    try:
                        score = scorer(y_true_, y_pred_)
                    except ValueError:
                        score = np.nan
                else:
                    score = scorer(y_true_, y_pred_labels_)
                scores.append(score)
            records.append([names[i], length[i]] + scores)
        records = pd.DataFrame.from_records(records, columns=['name', 'length'] + self.metrics)
        self.logger.info('save metric by sequence file: ' + self.outfile)
        prepare_output_file(self.outfile)
        records.to_csv(self.outfile, sep='\t', index=False, na_rep='nan')
Example no. 14
    def __call__(self):
        import numpy as np
        import pandas as pd
        import h5py
        import keras
        from keras import backend as K
        from formats import read_rnafold, structure_to_pairs

        self.logger.info('load model: {}'.format(self.model_file))
        model = keras.models.load_model(self.model_file)
        window_size = K.int_shape(model.input)[1]
        self.logger.info('load input data (in %s format): %s'%(self.format, self.infile))
        have_structure = False
        if self.format == 'fasta':
            # list of tuples: (name, seq)
            input_data = list(read_fasta(self.infile))
        elif self.format == 'ct_dir':
            # read all .ct files from the directory
            # list of tuples: (name, seq, pairs)
            input_data = []
            for filename in os.listdir(self.infile):
                title, seq, pairs = read_ct(os.path.join(self.infile, filename))
                title = os.path.splitext(filename)[0]
                input_data.append((title, seq, pairs))
            have_structure = True
        elif self.format == 'ct':
            title, seq, pairs = read_ct(self.infile)
            title = os.path.splitext(os.path.basename(self.infile))[0]
            input_data = [(title, seq, pairs)]
            have_structure = True
        elif self.format == 'rnafold':
            input_data = []
            for name, seq, structure, energy in read_rnafold(self.infile, parse_energy=False):
                pairs = structure_to_pairs(structure)
                input_data.append((name, seq, pairs))
            have_structure = True
        elif self.format == 'genomic_data':
            from genomic_data import GenomicData
            input_data = []
            data = GenomicData(self.infile)
            for name in data.names:
                input_data.append((name,
                    data.feature('sequence', name).tostring(),
                    data.feature('reactivity', name)))
            del data
            have_structure = True

        # combine all structures (base-pairs) into one array in the ct file
        if have_structure:
            structure = []
            for i in range(len(input_data)):
                structure.append(np.asarray(input_data[i][2], dtype='int32'))
            structure = np.concatenate(structure)
        else:
            structure = None

        X = []
        names = []
        # offset default to the center of the window
        if self.offset is None:
            self.offset = (window_size + 1)/2
        offset = self.offset

        # convert sequences to windows
        windows = []
        length = []
        sequence = []
        for item in input_data:
            name = item[0]
            seq = item[1]
            windows += self.sequence_to_windows(seq, window_size, offset)
            names.append(name)
            length.append(len(seq))
            sequence.append(seq)
        # combine all sequences into one dataset
        sequence = np.frombuffer(''.join(sequence), dtype='S1')
        length = np.asarray(length, dtype='int64')

        n_samples = len(windows)
        windows = np.frombuffer(''.join(windows), dtype='S1').reshape((n_samples, window_size))
        X = onehot_encode(windows, self.alphabet)
        # set one-hot coding of padded sequence to [0.25, 0.25, 0.25, 0.25]
        X[X.sum(axis=2) == 0] = 1.0/len(self.alphabet)

        self.logger.info('run the model')
        y_pred = model.predict(X, batch_size=self.batch_size)
        y_pred = np.squeeze(y_pred)
        if self.swap_labels:
            self.logger.info('swap labels')
            y_pred = 1 - y_pred

        # start/end position of each transcript in the y_pred
        end = np.cumsum(length)
        start = end - length
        if len(y_pred.shape) > 1:
            # average the predictions
            self.logger.info('average windows for dense prediction')
            y_pred_dense = []
            for i in range(len(input_data)):
                y_pred_dense.append(self.predict_dense(y_pred[start[i]:end[i]], offset))

            if self.dense_pred_file:
                self.logger.info('save dense predictions: ' + self.dense_pred_file)
                f = h5py.File(self.dense_pred_file, 'w')
                for i in range(len(names)):
                    g = f.create_group(names[i])
                    g.create_dataset('predicted_values_dense', data=y_pred[start[i]:end[i]])
                    g.create_dataset('predicted_values_average', data=y_pred_dense[i])
                    # 0-based start/end position of each transcript in the array (y_pred, sequence, structure)
                    g.create_dataset('sequence', data=sequence[start[i]:end[i]])
                    if structure is not None:
                        g.create_dataset('structure', data=structure[start[i]:end[i]])
                f.close()

            y_pred = np.concatenate(y_pred_dense)
            y_pred_labels = np.round(y_pred).astype('int32')
        else:
            y_pred_labels = np.round(y_pred).astype('int32')

        if self.restraint_file:
            table = pd.DataFrame()
            table['name'] = np.repeat(np.asarray(names, dtype='S'), length)
            # start position of each transcript repeated over its length
            # (use a new name so the per-transcript start array is not clobbered)
            tx_start = np.repeat(end - length, length)
            # position (1-based) relative to the transcript
            table['position'] = np.arange(1, length.sum() + 1) - tx_start
            table['pred'] = y_pred_labels
            table['base'] = sequence
            if structure is not None:
                table['true'] = structure
            self.logger.info('write restraint file: ' + self.restraint_file)
            prepare_output_file(self.restraint_file)
            table.to_csv(self.restraint_file, sep='\t', index=False)
        if self.metric_file:
            self.logger.info('save metric file: ' + self.metric_file)
            prepare_output_file(self.metric_file)
            f = h5py.File(self.metric_file, 'w')
            from sklearn.metrics import accuracy_score
            f.create_dataset('y_pred', data=y_pred)
            f.create_dataset('y_pred_labels', data=y_pred_labels)
            if have_structure:
                y_true = (structure > 0).astype('int32')
                f.create_dataset('y_true', data=y_true)
                g = f.create_group('metrics')
                for metric in self.metrics:
                    scorer = get_scorer(metric)
                    if get_scorer_type(metric) == 'continuous':
                        score = scorer(y_true, y_pred)
                    else:
                        score = scorer(y_true, y_pred_labels)
                    self.logger.info('%s: %f'%(metric, score))
                    g.create_dataset(metric, data=score)
            f.close()
        if self.metric_by_sequence_file:
            self.logger.info('calculate metrics by sequence')
            records = []
            for i in range(len(names)):
                y_true_ = (structure[start[i]:end[i]] > 0).astype('int32')
                y_pred_ = y_pred[start[i]:end[i]]
                y_pred_labels_ = y_pred_labels[start[i]:end[i]]
                scores = []
                for metric in self.metrics:
                    scorer = get_scorer(metric)
                    if get_scorer_type(metric) == 'continuous':
                        try:
                            score = scorer(y_true_, y_pred_)
                        except ValueError:
                            score = np.nan
                    else:
                        score = scorer(y_true_, y_pred_labels_)
                    scores.append(score)
                records.append([names[i], length[i]] + scores)
            records = pd.DataFrame.from_records(records, columns=['name', 'length'] + self.metrics)
            self.logger.info('save metric by sequence file: ' + self.metric_by_sequence_file)
            prepare_output_file(self.metric_by_sequence_file)
            records.to_csv(self.metric_by_sequence_file, sep='\t', index=False, na_rep='nan')
        if self.pred_file:
            self.logger.info('save predictions to file: ' + self.pred_file)
            prepare_output_file(self.pred_file)
            f = h5py.File(self.pred_file, 'w')
            for i in range(len(names)):
                y_true_ = (structure[start[i]:end[i]] > 0).astype('int32')
                g = f.create_group(names[i])
                g.create_dataset('sequence', data=sequence[start[i]:end[i]])
                g.create_dataset('predicted_values', data=y_pred[start[i]:end[i]])
                g.create_dataset('predicted_labels', data=y_pred_labels[start[i]:end[i]])
                g.create_dataset('true_labels', data=y_true_)
            f.close()
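
The method above relies on two helpers that are not shown, sequence_to_windows and predict_dense. Below is a hypothetical sketch of sequence_to_windows, under the assumption implied by the code that it yields one fixed-size window per base, padded at the boundaries with a character outside the alphabet so that its one-hot encoding is all zeros; the exact convention in the original code may differ.

def sequence_to_windows(seq, window_size, offset, pad_char='N'):
    # Hypothetical sketch: one window per base of seq, with base i placed
    # at position (offset - 1) inside its window; boundary positions are
    # filled with pad_char (one-hot encoded as all zeros, later reset to 0.25).
    padded = pad_char*(offset - 1) + seq + pad_char*(window_size - offset)
    return [padded[i:i + window_size] for i in range(len(seq))]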
Example no. 15
def analyze_reactivity_periodicity(args):
    import h5py
    import numpy as np
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    from matplotlib.backends.backend_pdf import PdfPages
    import seaborn as sns
    sns.set()
    from genomic_data import GenomicData
    from scipy.fftpack import fft
    from formats import read_fasta
    from ioutils import prepare_output_file

    logger.info('read input file: ' + args.input_file)
    reactivities = {}
    if args.assay_type == 'shapemap':
        with h5py.File(args.input_file, 'r') as f:
            for tx_id in f['reactivities'].keys():
                reactivities[tx_id] = f['reactivities/' + tx_id][:]
    elif args.assay_type in ('icshape', 'rt_stop'):
        data = GenomicData(args.input_file)
        for name in data.names:
            reactivities[name] = data.feature(args.assay_type, name)
    seq_names = np.asarray(reactivities.keys(), dtype='S')
    # get coverage of each transcript
    lengths = np.asarray([reactivities[name].shape[0] for name in seq_names])
    coverage = np.asarray(
        [np.sum(~np.isnan(reactivities[name])) for name in seq_names])
    coverage = coverage.astype(np.float64) / lengths
    # filter sequences by coverage
    seq_names = seq_names[coverage >= args.min_coverage]
    reactivities = {name: reactivities[name] for name in seq_names}

    def calc_average_reactivities(reactivities, direction, aligned_length=100):
        reactivities_avg = np.full((len(reactivities), aligned_length), np.nan)
        for i, name in enumerate(reactivities.keys()):
            x = reactivities[name]
            L = min(x.shape[0], aligned_length)
            if direction == '5p':
                reactivities_avg[i, :L] = x[:L]
            elif direction == '3p':
                reactivities_avg[i, -L:] = x[-L:]
        transcript_counts = np.sum(~np.isnan(reactivities_avg), axis=0)
        reactivities_avg = np.nan_to_num(reactivities_avg)
        reactivities_avg = np.sum(
            reactivities_avg, axis=0) / transcript_counts.astype(np.float64)
        return reactivities_avg

    logger.info('create output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    aligned_length = args.aligned_length
    with PdfPages(args.output_file) as pdf:
        # 5'-end
        reactivities_avg_5p = calc_average_reactivities(
            reactivities, '5p', aligned_length=aligned_length)
        fig, ax = plt.subplots(figsize=(18, 4))
        ax.plot(np.arange(aligned_length), reactivities_avg_5p)
        ax.set_xlabel('Position in CDS from 5\'-end')
        ax.set_ylabel('Reactivity')
        ax.set_xlim(0, aligned_length)
        pdf.savefig()
        plt.close()

        # 3'-end
        reactivities_avg_3p = calc_average_reactivities(
            reactivities, '3p', aligned_length=aligned_length)
        fig, ax = plt.subplots(figsize=(18, 4))
        ax.plot(np.arange(-aligned_length, 0), reactivities_avg_3p)
        ax.set_xlabel('Position in CDS from 3\'-end')
        ax.set_ylabel('Reactivity')
        ax.set_xlim(-aligned_length, 0)
        pdf.savefig()
        plt.close()

        ## FFT
        plot_fft(reactivities_avg_5p, 'CDS from 5\'-end')
        pdf.savefig()
        plt.close()
        plot_fft(reactivities_avg_3p, 'CDS from 3\'-end')
        pdf.savefig()
        plt.close()
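
plot_fft is called above but not defined in this snippet. A minimal sketch of one possible implementation, reusing the scipy.fftpack.fft import from the function; the normalization and the 3-nt periodicity marker are assumptions, not the original code.

def plot_fft(x, title):
    # Hypothetical sketch: power spectrum of a mean reactivity profile;
    # a peak at frequency 1/3 would indicate codon (3-nt) periodicity.
    import numpy as np
    import matplotlib.pyplot as plt
    from scipy.fftpack import fft

    x = np.nan_to_num(np.asarray(x, dtype=np.float64))
    x = x - x.mean()                       # remove the DC component
    power = np.abs(fft(x))**2
    freqs = np.arange(len(x), dtype=np.float64)/len(x)
    half = len(x)//2                       # keep the non-redundant half
    fig, ax = plt.subplots(figsize=(18, 4))
    ax.plot(freqs[1:half], power[1:half])
    ax.axvline(1.0/3, linestyle='--', color='gray')  # 3-nt periodicity
    ax.set_xlabel('Frequency (1/nt)')
    ax.set_ylabel('Power')
    ax.set_title(title)
    return fig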