Example #1
def run_BUMHMM_for_icSHAPE(control_files, treatment_files, sequence_file):
    import numpy as np
    from formats import read_fasta
    from genomic_data import GenomicData
    # run_BUMHMM is assumed to be defined at module level

    control_data = [GenomicData.load(filename) for filename in control_files]
    treatment_data = [GenomicData.load(filename) for filename in treatment_files]
    sequences = dict(read_fasta(sequence_file))
    # keep only transcripts present in every control and treatment sample
    names, counts = np.unique(np.concatenate(
        [data.names for data in control_data] +
        [data.names for data in treatment_data]), return_counts=True)
    common_names = names[counts >= (len(control_data) + len(treatment_data))]

    for name in common_names:
        run_BUMHMM(
            rt_stop_control=[data.feature(name, 'rt_stop')
                             for data in control_data],
            coverage_control=[data.feature(name, 'base_density')
                              for data in control_data],
            rt_stop_treatment=[data.feature(name, 'rt_stop')
                               for data in treatment_data],
            coverage_treatment=[data.feature(name, 'base_density')
                                for data in treatment_data],
            seq=sequences[name])
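A minimal usage sketch (the file names are hypothetical; GenomicData HDF5 inputs and a transcript FASTA are assumed):

# hypothetical inputs for illustration only
control_files = ['dmso_rep1.h5', 'dmso_rep2.h5']
treatment_files = ['nai_rep1.h5', 'nai_rep2.h5']
run_BUMHMM_for_icSHAPE(control_files, treatment_files, 'transcripts.fa')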
def create_dataset(args):
    import numpy as np
    import h5py
    import pandas as pd
    from tqdm import tqdm
    from formats import read_fasta

    logger.info('read peak file: ' + args.peak_file)
    peaks = pd.read_csv(args.peak_file, sep='\t',
        names=['chrom', 'start', 'end', 'peak_id', 'label', 'strand'])
    peaks['peak_id'] = peaks['peak_id'].astype('U')
    peaks.index = peaks['peak_id']
    logger.info('read sequence file: ' + args.sequence_file)
    sequences = {name:seq for name, seq in read_fasta(args.sequence_file)}
    if args.reactivity_file is not None:
        logger.info('read reactivity file: ' + args.reactivity_file)
        reactivities = {}
        with h5py.File(args.reactivity_file, 'r') as f:
            for peak_id in f.keys():
                reactivities[peak_id.split(',')[0]] = f[peak_id][:]
        peak_ids = []
        for peak_id in sequences.keys():
            if peak_id in reactivities:
                coverage = np.sum(~np.isnan(reactivities[peak_id]))
            else:
                coverage = 0
            if coverage >= args.min_coverage:
                peak_ids.append(peak_id)
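                # only reachable when min_coverage == 0: peaks absent from
                # the reactivity file get an all-NaN profile below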
                if coverage == 0:
                    reactivities[peak_id] = np.full(len(sequences[peak_id]), np.nan, dtype=np.float32)
    else:
        peak_ids = list(sorted(sequences.keys()))

    def onehot_encode(x, alphabet='ATCG'):
        alphabet = np.frombuffer(bytearray(alphabet, encoding='ascii'), dtype='S1')
        x_shape = list(x.shape)
        encoded = (x.reshape(x_shape + [1]) == alphabet.reshape([1]*len(x_shape) + [-1])).astype(np.int32)
        return encoded

    X_seq = np.concatenate([np.frombuffer(bytearray(sequences[peak_id], encoding='ascii'), dtype='S1')[np.newaxis, :] for peak_id in peak_ids], axis=0)
    X_seq = onehot_encode(X_seq)
    if args.reactivity_file is not None:
        X_r   = np.concatenate([reactivities[peak_id][np.newaxis, :, np.newaxis] for peak_id in peak_ids], axis=0)
        # impute missing reactivities with the overall median value
        X_r[np.isnan(X_r)] = np.nanmedian(X_r.flatten())
        X = np.concatenate([X_seq, X_r], axis=2)
    else:
        X = X_seq
    y = peaks['label'][peak_ids]
    logger.info('create output file: ' + args.output_file)
    with h5py.File(args.output_file, 'w') as fout:
        fout.create_dataset('X', data=X)
        fout.create_dataset('y', data=y)
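The onehot_encode helper above works by broadcasting: a trailing axis is added to the byte array and compared against the alphabet, yielding a 0/1 indicator per position. A self-contained illustration:

import numpy as np

alphabet = np.frombuffer(bytearray('ATCG', encoding='ascii'), dtype='S1')
x = np.frombuffer(bytearray('ATCGA', encoding='ascii'), dtype='S1')
# (5, 1) == (1, 4) broadcasts to a (5, 4) indicator matrix
encoded = (x[:, np.newaxis] == alphabet[np.newaxis, :]).astype(np.int32)
# encoded[0] == [1, 0, 0, 0] (an 'A'), encoded[1] == [0, 1, 0, 0] (a 'T'), ...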
def predict(args):
    import numpy as np
    import keras
    import h5py
    import models
    from tqdm import tqdm
    import six.moves.cPickle as pickle
    from ioutils import prepare_output_file, make_dir
    from formats import read_fasta

    if args.n_threads >= 1:
        logger.info('set number of threads to {} for TensorFlow'.format(
            args.n_threads))
        set_keras_num_threads(args.n_threads)

    logger.info('load model: {}'.format(args.model_file))
    model_format = detect_model_format(args.model_file)
    logger.info('detected model format: ' + model_format)
    if model_format == 'keras':
        model = keras.models.load_model(args.model_file)
        window_size = model.input.shape[1].value
    elif model_format == 'sklearn':
        with open(args.model_file, 'rb') as f:
            model = pickle.load(f)

    # default offset
    if args.offset is None:
        offset = int(window_size) // 2
    else:
        offset = args.offset
    logger.info('load data: {}'.format(args.input_file))
    if args.format == 'fasta':
        names = []
        logger.info('create output file: ' + args.output_file)
        fout = h5py.File(args.output_file, 'w')
        for name, sequence in tqdm(read_fasta(args.input_file),
                                   unit='transcript'):
            names.append(name)
            sequence = np.frombuffer(bytearray(sequence, encoding='ascii'),
                                     dtype='S1')
            windows = split_windows_same(sequence,
                                         window_size,
                                         1,
                                         offset=offset)
            X = onehot_encode(windows)
            y_pred = model.predict(X, batch_size=args.batch_size)
            y_pred = np.squeeze(y_pred)
            if args.swap_labels:
                logger.info('swap labels')
                y_pred = 1 - y_pred
            fout.create_dataset(name, data=y_pred)
        fout.close()
    else:
        raise ValueError('unknown input format: ' + args.format)
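split_windows_same is not shown in this snippet; from its call site, a plausible sketch is a sliding window padded at both ends so every base receives one full-length window centered at offset (the padding character and exact behavior are assumptions):

import numpy as np

def split_windows_same(sequence, window_size, stride=1, offset=None):
    # sketch: pad with 'N' so window i is centered on base i + offset
    if offset is None:
        offset = window_size // 2
    padded = np.concatenate([
        np.full(offset, b'N', dtype='S1'),
        sequence,
        np.full(window_size - offset - 1, b'N', dtype='S1')])
    return np.stack([padded[i:i + window_size]
                     for i in range(0, len(sequence), stride)])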
Example #4
    def __call__(self):
        import numpy as np
        from sklearn.model_selection import train_test_split
        import h5py
        from common import sequence_to_array
        from scipy import signal
        from formats import read_fasta
        from ioutils import prepare_output_file
        # read_background_rt is assumed to be defined at module level

        self.logger.info('read input file: ' + self.infile)
        _, base_density, length, _ = read_background_rt(self.infile)
        names = base_density.keys()
        self.logger.info('read sequence file: ' + self.sequence_file)
        sequences = dict(read_fasta(self.sequence_file))

        if self.offset is None:
            self.offset = (self.window_size + 1) // 2
        X = []
        y = []
        if self.smooth:
            self.logger.info(
                'smooth the values using Gaussian window of width %.1f' %
                self.smooth_width)
            window = signal.windows.gaussian(100, std=self.smooth_width)
        for name in names:
            seq = sequences[name]
            values = base_density[name] / base_density[name].mean()
            if self.smooth:
                # smooth the signal
                values = signal.convolve(values, window, mode='same')
            for i in range(0, len(seq) - self.window_size, self.stride):
                X.append(sequence_to_array(seq[i:(i + self.window_size)]))
                y.append(values[i + self.offset])
                if len(X) >= self.max_samples:
                    break
            # break out of the outer loop too once max_samples is reached
            if len(X) >= self.max_samples:
                break
        n_samples = len(X)
        self.logger.info('created {} samples'.format(n_samples))

        X = np.concatenate(X)
        X = X.reshape((n_samples, self.window_size, 4))
        y = np.asarray(y, dtype='float32')
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_ratio)

        self.logger.info('save file: ' + self.outfile)
        prepare_output_file(self.outfile)
        f = h5py.File(self.outfile, 'w')
        f.create_dataset('offset', data=int(self.offset))
        f.create_dataset('window_size', data=int(self.window_size))
        f.create_dataset('X_train', data=X_train)
        f.create_dataset('y_train', data=y_train)
        f.create_dataset('X_test', data=X_test)
        f.create_dataset('y_test', data=y_test)
        f.close()
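For reference, the smoothing step above is a convolution with a bell-shaped window; note that the Gaussian window is unnormalized, so the smoothed values are scaled by window.sum() unless the window is normalized first. A small sketch:

import numpy as np
from scipy import signal

values = np.random.rand(500)
window = signal.windows.gaussian(100, std=5.0)
window /= window.sum()  # normalize to preserve the signal's overall scale
smoothed = signal.convolve(values, window, mode='same')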
def analyze_periodicity(args):
    import numpy as np
    import h5py
    import pandas as pd
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    import seaborn as sns
    sns.set()
    from genomic_data import GenomicData
    from formats import read_fasta

    logger.info('read input file: ' + args.input_file)
    reactivities = {}
    sequences = {}
    if args.assay_type == 'shapemap':
        with h5py.File(args.input_file, 'r') as f:
            for tx_id in f['rep1'].keys():
                reactivities[tx_id] = f['rep1/' + tx_id][:]
            for tx_id in f['seq'].keys():
                sequences[tx_id] = f['seq/' + tx_id][()]
    elif args.assay_type == 'icshape':
        icshape = GenomicData(args.input_file)
        for name in icshape.names:
            reactivities[name] = icshape.feature('icshape', name)
        for name, seq in read_fasta(args.sequence_file):
            if name in icshape.names:
                sequences[name] = np.frombuffer(
                    bytearray(seq, encoding='ascii'), dtype='S1')
    seq_names = sequences.keys()

    reactivities_concat = np.concatenate(
        [reactivities[name] for name in seq_names])
    sequences_concat = np.concatenate([sequences[name] for name in seq_names])
    notnan_mask = ~np.isnan(reactivities_concat)

    # plot overall distribution of SHAPE reactivity
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.hist(reactivities_concat[notnan_mask], bins=50)
    ax.set_xlabel('Reactivity')
    ax.set_ylabel('Counts')
    plt.savefig(args.output_file)  # output path assumed to come from args
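The shapemap branch expects an HDF5 layout with one group per replicate plus a seq group holding the sequences; a toy file with that layout (transcript IDs are illustrative) would look like:

import numpy as np
import h5py

with h5py.File('shapemap_toy.h5', 'w') as f:
    f.create_dataset('rep1/tx1', data=np.array([0.1, np.nan, 0.8], dtype=np.float32))
    f.create_dataset('seq/tx1', data=np.bytes_('ACG'))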
Example #6
    def __call__(self):
        from formats import read_fasta
        from tqdm import tqdm
        import numpy as np
        import pandas as pd
        import matplotlib
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
        from genomic_data import GenomicData
        from ioutils import prepare_output_file

        self.logger.info('read sequence file: ' + self.sequence_file)
        sequences = dict(read_fasta(self.sequence_file))
        self.logger.info('read input file: ' + self.infile)
        data = GenomicData(self.infile)
        if self.feature is None:
            if len(data.features.keys()) == 1:
                self.feature = list(data.features.keys())[0]
            else:
                raise ValueError('multiple features found in the input file and the feature is not specified')

        # freqs[i]['A']: frequency of A in bin i
        freqs = []
        scores_all = data.features[self.feature]
        scores_avail = scores_all[np.logical_not(np.isnan(scores_all))]

        self.logger.info('use bin method: %s'%self.bin_method)
        if self.bin_method == 'percentile':
            qs = np.arange(1, self.bins + 1, dtype='float')*100.0/self.bins
            percentiles = np.zeros(self.bins + 1, dtype='float')
            percentiles[0] = scores_avail.min() - 1e-6
            for i in range(1, self.bins):
                percentiles[i] = np.percentile(scores_avail, qs[i - 1])
            percentiles[self.bins] = scores_avail.max() + 1e-6
        elif self.bin_method == 'value':
            density, percentiles = np.histogram(scores_avail, bins=self.bins, density=True)
            qs = np.cumsum(density)*100.0
            percentiles[0] -= 1e-6
            percentiles[-1] += 1e-6
        else:
            raise ValueError('unknown bin method: %s'%self.bin_method)

        for i in range(self.bins):
            d = {a:0 for a in self.alphabet}
            freqs.append(d)
        self.logger.info('count base frequencies with offset %d'%self.offset)
        for name in tqdm(data.names):
            scores_ts = data.feature(self.feature, name)
            avail_ind = np.nonzero(np.logical_not(np.isnan(scores_ts)))[0]
            seq_ts = np.frombuffer(bytearray(sequences[name], encoding='ascii'),
                                   dtype='S1')
            avail_ind += self.offset
            if self.offset > 0:
                avail_ind = avail_ind[avail_ind < len(seq_ts)]
            elif self.offset < 0:
                avail_ind = avail_ind[avail_ind >= 0]
            scores_avail_ts = scores_ts[avail_ind - self.offset]
            seq_avail_ts = seq_ts[avail_ind]
            for i in range(self.bins):
                seq_bin = seq_avail_ts[np.logical_and(scores_avail_ts <= percentiles[i + 1],
                                                      scores_avail_ts > percentiles[i])]
                for a in self.alphabet:
                    # seq_bin holds bytes ('S1'); encode the letter before comparing
                    freqs[i][a] += np.count_nonzero(seq_bin == a.encode('ascii'))
        # normalize base frequencies for each percentile
        freq_total = []
        for i in range(self.bins):
            total = sum(freqs[i].values())
            freq_total.append(total)
            for a in self.alphabet:
                if total == 0:
                    freqs[i][a] = 1.0/len(self.alphabet)
                else:
                    freqs[i][a] = float(freqs[i][a])/total
        table_file = self.prefix + '.txt'
        self.logger.info('save results to file: ' + table_file)
        prepare_output_file(table_file)
        df = []
        for i in range(self.bins):
            for a in self.alphabet:
                df.append((i, qs[i], percentiles[i], a, freq_total[i], freqs[i][a]))
        df = pd.DataFrame.from_records(df,
                                       columns=['bin', 'q', 'percentile', 'base', 'total_freq', 'fraction'])
        df.to_csv(table_file, sep='\t', index=False)
        # plot the distribution
        self.logger.info('create plot')
        plt.rcParams['font.family'] = 'Arial'
        plt.rcParams['axes.labelsize'] = 'medium'
        plt.rcParams['xtick.labelsize'] = 'x-small'
        plt.rcParams['ytick.labelsize'] = 'x-small'
        plt.rcParams['axes.titlesize'] = 'medium'

        fig, ax = plt.subplots(figsize=(7, 5))
        x = np.arange(self.bins)
        xticklabels = ['%.2f'%a for a in percentiles[1:]]
        for base in self.alphabet:
            sub_df = df[df['base'] == base]
            ax.plot(x, sub_df['fraction'], label=base)
        ax.set_xticks(x)
        ax.set_xticklabels(xticklabels)
        ax.set_ylim(0, 1)
        ax.set_xlabel('Values')
        ax.set_ylabel('Base fraction')
        ax.legend()
        plt.tight_layout()

        plot_file = self.prefix + '.pdf'
        self.logger.info('save plot to file: ' + plot_file)
        plt.savefig(plot_file)
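The 'percentile' bin method above places the bin edges at equally spaced quantiles, so each bin collects roughly the same number of positions; np.percentile accepts a vector of quantiles, so an equivalent sketch is:

import numpy as np

scores = np.random.randn(10000)
bins = 10
edges = np.percentile(scores, np.linspace(0, 100, bins + 1))
edges[0] -= 1e-6   # widen the ends so min/max fall strictly inside
edges[-1] += 1e-6
counts, _ = np.histogram(scores, bins=edges)
# each entry of counts is ~len(scores) / bins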
Example #7
    def __call__(self):
        import os
        import numpy as np
        import pandas as pd
        import h5py
        import keras
        from keras import backend as K
        from ioutils import prepare_output_file
        # read_fasta and read_ct are assumed to live in formats as well
        from formats import read_fasta, read_ct, read_rnafold, structure_to_pairs

        self.logger.info('load model: {}'.format(self.model_file))
        model = keras.models.load_model(self.model_file)
        window_size = K.int_shape(model.input)[1]
        self.logger.info('load input data (in %s format): %s'%(self.format, self.infile))
        have_structure = False
        if self.format == 'fasta':
            # list of tuples: (name, seq)
            input_data = list(read_fasta(self.infile))
        elif self.format == 'ct_dir':
            # read all .ct files from the directory
            # list of tuples: (name, seq, pairs)
            input_data = []
            for filename in os.listdir(self.infile):
                title, seq, pairs = read_ct(os.path.join(self.infile, filename))
                title = os.path.splitext(filename)[0]
                input_data.append((title, seq, pairs))
            have_structure = True
        elif self.format == 'ct':
            title, seq, pairs = read_ct(self.infile)
            title = os.path.splitext(os.path.basename(self.infile))[0]
            input_data = [(title, seq, pairs)]
            have_structure = True
        elif self.format == 'rnafold':
            input_data = []
            for name, seq, structure, energy in read_rnafold(self.infile, parse_energy=False):
                pairs = structure_to_pairs(structure)
                input_data.append((name, seq, pairs))
            have_structure = True
        elif self.format == 'genomic_data':
            from genomic_data import GenomicData
            input_data = []
            data = GenomicData(self.infile)
            for name in data.names:
                input_data.append((name,
                    data.feature('sequence', name).tobytes().decode('ascii'),
                    data.feature('reactivity', name)))
            del data
            have_structure = True

        # combine all structures (base-pairs) into one array in the ct file
        if have_structure:
            structure = []
            for i in range(len(input_data)):
                structure.append(np.asarray(input_data[i][2], dtype='int32'))
            structure = np.concatenate(structure)
        else:
            structure = None

        X = []
        names = []
        # offset defaults to the center of the window
        if self.offset is None:
            self.offset = (window_size + 1) // 2
        offset = self.offset

        # convert sequences to windows
        windows = []
        length = []
        sequence = []
        for item in input_data:
            name = item[0]
            seq = item[1]
            windows += self.sequence_to_windows(seq, window_size, offset)
            names.append(name)
            length.append(len(seq))
            sequence.append(seq)
        # combine all sequences into one dataset
        sequence = np.frombuffer(bytearray(''.join(sequence), encoding='ascii'),
                                 dtype='S1')
        length = np.asarray(length, dtype='int64')

        n_samples = len(windows)
        windows = np.frombuffer(bytearray(''.join(windows), encoding='ascii'),
                                dtype='S1').reshape((n_samples, window_size))
        X = onehot_encode(windows, self.alphabet)
        # set one-hot coding of padded sequence to [0.25, 0.25, 0.25, 0.25]
        X[X.sum(axis=2) == 0] = 1.0/len(self.alphabet)

        self.logger.info('run the model')
        y_pred = model.predict(X, batch_size=self.batch_size)
        y_pred = np.squeeze(y_pred)
        if self.swap_labels:
            self.logger.info('swap labels')
            y_pred = 1 - y_pred

        # start/end position of each transcript in the y_pred
        end = np.cumsum(length)
        start = end - length
        if len(y_pred.shape) > 1:
            # average the predictions
            self.logger.info('average windows for dense prediction')
            y_pred_dense = []
            for i in range(len(input_data)):
                y_pred_dense.append(self.predict_dense(y_pred[start[i]:end[i]], offset))

            if self.dense_pred_file:
                self.logger.info('save dense predictions: ' + self.dense_pred_file)
                f = h5py.File(self.dense_pred_file, 'w')
                for i in range(len(names)):
                    g = f.create_group(names[i])
                    g.create_dataset('predicted_values_dense', data=y_pred[start[i]:end[i]])
                    g.create_dataset('predicted_values_average', data=y_pred_dense[i])
                    # 0-based start/end position of each transcript in the array (y_pred, sequence, structure)
                    g.create_dataset('sequence', data=sequence[start[i]:end[i]])
                    if structure is not None:
                        g.create_dataset('structure', data=structure[start[i]:end[i]])
                f.close()

            y_pred = np.concatenate(y_pred_dense)
            y_pred_labels = np.round(y_pred).astype('int32')
        else:
            y_pred_labels = np.round(y_pred).astype('int32')

        if self.restraint_file:
            table = pd.DataFrame()
            table['name'] = np.repeat(np.asarray(names, dtype='S'), length)
            # start position of each transcript, repeated once per base;
            # do not overwrite `start`, which is reused below
            start_repeated = np.repeat(start, length)
            # position (1-based) relative to the transcript
            position = np.arange(1, length.sum() + 1) - start_repeated
            table['position'] = position
            table['pred'] = y_pred_labels
            table['base'] = sequence
            if structure is not None:
                table['true'] = structure
            self.logger.info('write restraint file: ' + self.restraint_file)
            prepare_output_file(self.restraint_file)
            table.to_csv(self.restraint_file, sep='\t', index=False)
        if self.metric_file:
            self.logger.info('save metric file: ' + self.metric_file)
            prepare_output_file(self.metric_file)
            f = h5py.File(self.metric_file, 'w')
            from sklearn.metrics import accuracy_score
            f.create_dataset('y_pred', data=y_pred)
            f.create_dataset('y_pred_labels', data=y_pred_labels)
            if have_structure:
                y_true = (structure > 0).astype('int32')
                f.create_dataset('y_true', data=y_true)
                g = f.create_group('metrics')
                for metric in self.metrics:
                    scorer = get_scorer(metric)
                    if get_scorer_type(metric) == 'continuous':
                        score = scorer(y_true, y_pred)
                    else:
                        score = scorer(y_true, y_pred_labels)
                    self.logger.info('%s: %f'%(metric, score))
                    g.create_dataset(metric, data=score)
            f.close()
        if self.metric_by_sequence_file:
            self.logger.info('calculate metrics by sequence')
            records = []
            for i in range(len(names)):
                y_true_ = (structure[start[i]:end[i]] > 0).astype('int32')
                y_pred_ = y_pred[start[i]:end[i]]
                y_pred_labels_ = y_pred_labels[start[i]:end[i]]
                scores = []
                for metric in self.metrics:
                    scorer = get_scorer(metric)
                    if get_scorer_type(metric) == 'continuous':
                        try:
                            score = scorer(y_true_, y_pred_)
                        except ValueError:
                            score = np.nan
                    else:
                        score = scorer(y_true_, y_pred_labels_)
                    scores.append(score)
                records.append([names[i], length[i]] + scores)
            records = pd.DataFrame.from_records(records, columns=['name', 'length'] + self.metrics)
            self.logger.info('save metric by sequence file: ' + self.metric_by_sequence_file)
            prepare_output_file(self.metric_by_sequence_file)
            records.to_csv(self.metric_by_sequence_file, sep='\t', index=False, na_rep='nan')
        if self.pred_file:
            self.logger.info('save predictions to file: ' + self.pred_file)
            prepare_output_file(self.pred_file)
            f = h5py.File(self.pred_file, 'w')
            for i in range(len(names)):
                g = f.create_group(names[i])
                g.create_dataset('sequence', data=sequence[start[i]:end[i]])
                g.create_dataset('predicted_values', data=y_pred[start[i]:end[i]])
                g.create_dataset('predicted_labels', data=y_pred_labels[start[i]:end[i]])
                if structure is not None:
                    y_true_ = (structure[start[i]:end[i]] > 0).astype('int32')
                    g.create_dataset('true_labels', data=y_true_)
            f.close()
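structure_to_pairs comes from the repo's formats module and is not shown; judging by the (structure > 0) checks above, it presumably maps dot-bracket notation to 1-based partner indices with 0 for unpaired bases, which the usual stack algorithm computes. A sketch under that assumption:

def structure_to_pairs(structure):
    # sketch: dot-bracket -> 1-based pairing partner (0 = unpaired)
    pairs = [0] * len(structure)
    stack = []
    for i, ch in enumerate(structure):
        if ch == '(':
            stack.append(i)
        elif ch == ')':
            j = stack.pop()
            pairs[i] = j + 1
            pairs[j] = i + 1
    return pairs

structure_to_pairs('((..))')  # -> [6, 5, 0, 0, 2, 1]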
Example #8
    def __call__(self):
        import numpy as np
        import h5py
        from formats import read_fasta
        from ioutils import prepare_output_file

        control_data = []
        for filename in self.control_file:
            control_data.append(
                icshape_raw_rt_to_genomic_data(filename, self.logger))
        treatment_data = []
        for filename in self.treatment_file:
            treatment_data.append(
                icshape_raw_rt_to_genomic_data(filename, self.logger))
        combined_data = control_data + treatment_data
        self.logger.info('read sequence file: ' + self.sequence_file)
        sequences = dict(read_fasta(self.sequence_file))
        names, counts = np.unique(
            np.concatenate([data.names for data in combined_data]),
            return_counts=True)
        common_names = names[counts >= len(combined_data)]

        self.logger.info('create output file: ' + self.outfile)
        prepare_output_file(self.outfile)
        fout = h5py.File(self.outfile, 'w')
        ncol = len(control_data) + len(treatment_data)
        sample_name = np.asarray(
            ['C%d' % i for i in range(len(control_data))] +
            ['T%d' % i for i in range(len(treatment_data))],
            dtype='S')
        replicate = np.asarray(['control'] * len(control_data) +
                               ['treatment'] * len(treatment_data),
                               dtype='S')
        """
        for i, name in enumerate(common_names):
            self.logger.info('create group: ' + str(name))
            g = fout.create_group(name)
            coverage = np.vstack(map(lambda x: x.feature('base_density', name)[1:], combined_data))
            dropoff_count = np.vstack(map(lambda x: x.feature('rt_stop', name)[:-1], combined_data))
            rpkm = np.mean(map(lambda x: x.feature('rpkm', name), combined_data))
            g.create_dataset('coverage', data=coverage)
            g.create_dataset('dropoff_count', data=dropoff_count)
            g.create_dataset('sequence', data=np.asarray(sequences[name], dtype='S'))
            g.create_dataset('sample_name', data=sample_name)
            g.create_dataset('replicate', data=replicates)
            g.create_dataset('rpkm', data=rpkm)
        """
        coverage = [None] * len(combined_data)
        dropoff_count = [None] * len(combined_data)
        for i in range(len(combined_data)):
            coverage[i] = [None] * len(common_names)
            dropoff_count[i] = [None] * len(common_names)
            for j in range(len(common_names)):
                coverage[i][j] = combined_data[i].feature(
                    'base_density', common_names[j])[1:]
                coverage[i][j][:20] = 0
                dropoff_count[i][j] = combined_data[i].feature(
                    'rt_stop', common_names[j])[:-1]
                dropoff_count[i][j][:20] = 0
            if i == 0:
                length = np.asarray([len(c) for c in coverage[i]], dtype='int64')
                end = np.cumsum(length)
                start = end - length
            coverage[i] = np.concatenate(coverage[i])
            dropoff_count[i] = np.concatenate(dropoff_count[i])
        coverage = np.vstack(coverage)
        dropoff_count = np.vstack(dropoff_count)
        sequence = np.asarray(''.join(sequences[name] for name in common_names),
                              dtype='S')

        fout.create_dataset('name', data=common_names.astype('S'))
        fout.create_dataset('start', data=start)
        fout.create_dataset('end', data=end)
        fout.create_dataset('coverage', data=coverage)
        fout.create_dataset('dropoff_count', data=dropoff_count)
        fout.create_dataset('sequence', data=sequence)
        fout.create_dataset('replicate', data=replicate)
        fout.create_dataset('sample_name', data=sample_name)
        fout.close()
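The start/end bookkeeping above (and in several other examples here) recovers per-transcript slices from the concatenated arrays with a cumulative sum of lengths:

import numpy as np

length = np.array([3, 5, 2])
end = np.cumsum(length)   # [ 3,  8, 10]
start = end - length      # [ 0,  3,  8]
# transcript i occupies concatenated_array[start[i]:end[i]]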
def create_dataset(args):
    import numpy as np
    import h5py
    from tqdm import tqdm
    from formats import read_fasta
    from ioutils import prepare_output_file

    c = args.input_file.split(':')
    input_file = c[0]
    dataset = c[1] if len(c) > 1 else '/'
    logger.info('read input file: ' + input_file)
    g_input = open_hdf5_group(args.input_file, 'r')
    names = np.asarray(list(g_input.keys()))
    reactivities = {name: g_input[name][:] for name in names}

    logger.info('read sequence file: ' + args.sequence_file)
    sequences = {
        name: np.frombuffer(bytearray(seq, encoding='ascii'), dtype='S1')
        for name, seq in read_fasta(args.sequence_file)
    }

    if args.offset is None:
        offset = int(args.window_size) // 2
    else:
        offset = args.offset

    if args.cv_split_file is not None:
        cv_split = open_hdf5_group(args.cv_split_file, 'r')
        train_index = cv_split['train'][:]
        test_index = cv_split['test'][:]
        names_train = names[train_index]
        names_test = names[test_index]
        X_train, y_train = create_single_point_dataset(sequences, reactivities,
                                                       names_train, offset,
                                                       args.window_size,
                                                       args.stride)
        X_test, y_test = create_single_point_dataset(sequences, reactivities,
                                                     names_test, offset,
                                                     args.window_size,
                                                     args.stride)
        if args.balanced:
            logger.info('create balanced dataset')
            X_train, y_train = balance_dataset(X_train, y_train)
            logger.info('number of training samples: {}'.format(
                y_train.shape[0]))
            X_test, y_test = balance_dataset(X_test, y_test)
            logger.info('number of test samples: {}'.format(y_test.shape[0]))

        logger.info('create output file: ' + args.output_file)
        prepare_output_file(args.output_file)
        with h5py.File(args.output_file, 'w') as fout:
            fout.create_dataset('names_train', data=names_train.astype('S'))
            fout.create_dataset('X_train', data=X_train, compression=True)
            fout.create_dataset('y_train', data=y_train, compression=True)
            fout.create_dataset('names_test', data=names_test.astype('S'))
            fout.create_dataset('X_test', data=X_test, compression=True)
            fout.create_dataset('y_test', data=y_test, compression=True)
            fout.create_dataset('offset', data=offset)
    else:
        X, y = create_single_point_dataset(sequences, reactivities, names,
                                           offset, args.window_size,
                                           args.stride)
        logger.info('create output file: ' + args.output_file)
        prepare_output_file(args.output_file)
        with h5py.File(args.output_file, 'w') as fout:
            fout.create_dataset('names', data=names.astype('S'))
            fout.create_dataset('X', data=X, compression=True)
            fout.create_dataset('y', data=y, compression=True)
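create_single_point_dataset is defined elsewhere; from its call signature, a plausible sketch slides a fixed-size window along each transcript and uses the reactivity at the window's offset position as the regression target (every detail below is an assumption):

import numpy as np

def create_single_point_dataset(sequences, reactivities, names, offset,
                                window_size, stride):
    # sketch: one-hot windows labeled by the reactivity at `offset`
    alphabet = np.frombuffer(bytearray('ATCG', encoding='ascii'), dtype='S1')
    X, y = [], []
    for name in names:
        seq, react = sequences[name], reactivities[name]
        for i in range(0, len(seq) - window_size + 1, stride):
            target = react[i + offset]
            if np.isnan(target):
                continue  # skip positions without measured reactivity
            window = seq[i:i + window_size]
            X.append((window[:, np.newaxis] == alphabet).astype(np.int32))
            y.append(target)
    return np.stack(X), np.asarray(y, dtype=np.float32)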
def analyze_nucleotide_periodicity(args):
    import numpy as np
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    from matplotlib.backends.backend_pdf import PdfPages
    import seaborn as sns
    sns.set()
    from scipy.fftpack import fft
    from formats import read_fasta
    from ioutils import prepare_output_file

    logger.info('read sequence file: ' + args.input_file)
    sequences = {
        name: np.frombuffer(bytearray(seq, encoding='ascii'), dtype='S1')
        for name, seq in read_fasta(args.input_file)
    }
    aligned_length = args.aligned_length
    alphabet = args.alphabet

    def calc_nucleotide_freq(sequences,
                             direction,
                             alphabet='ATCG',
                             aligned_length=100):
        alphabet = np.frombuffer(bytearray(alphabet, encoding='ascii'),
                                 dtype='S1')
        m = np.full((len(sequences), aligned_length), 'N', dtype='S1')
        for i, name in enumerate(sequences.keys()):
            x = sequences[name]
            L = min(x.shape[0], aligned_length)
            if direction == '5p':
                m[i, :L] = x[:L]
            elif direction == '3p':
                m[i, -L:] = x[-L:]
        transcript_counts = np.sum(m != b'N', axis=0)
        m_onehot = (m[:, :, np.newaxis] == alphabet[np.newaxis, np.newaxis, :])
        m_counts = np.sum(m_onehot, axis=0).astype(np.float64)
        m_freq = m_counts / np.sum(m_counts, axis=1)[:, np.newaxis]
        return m_freq

    logger.info('create output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with PdfPages(args.output_file) as pdf:
        # 5'-end
        nucleotide_freq_5p = calc_nucleotide_freq(
            sequences, '5p', alphabet=alphabet, aligned_length=aligned_length)
        fig, ax = plt.subplots(figsize=(18, 4))
        for i, nucleotide in enumerate(alphabet):
            ax.plot(np.arange(aligned_length),
                    nucleotide_freq_5p[:, i],
                    label=nucleotide)
        ax.set_xlabel('Position in CDS from 5\'-end')
        ax.set_ylabel('Nucleotide frequency')
        ax.set_xlim(0, aligned_length)
        ax.set_ylim(0, 1)
        plt.legend()
        pdf.savefig()
        plt.close()

        # 3'-end
        nucleotide_freq_3p = calc_nucleotide_freq(
            sequences, '3p', alphabet=alphabet, aligned_length=aligned_length)
        fig, ax = plt.subplots(figsize=(18, 4))
        for i, nucleotide in enumerate(alphabet):
            ax.plot(np.arange(-aligned_length, 0),
                    nucleotide_freq_3p[:, i],
                    label=nucleotide)
        ax.set_xlabel('Position in CDS relative to the 3\'-end')
        ax.set_ylabel('Nucleotide frequency')
        ax.set_xlim(-aligned_length, 0)
        ax.set_ylim(0, 1)
        plt.legend()
        pdf.savefig()
        plt.close()

        # FFT
        for i, nucleotide in enumerate(alphabet):
            plot_fft(nucleotide_freq_5p[:, i],
                     'Nucleotide %s from 5\'-end' % nucleotide)
            pdf.savefig()
            plt.close()
        for i, nucleotide in enumerate(alphabet):
            plot_fft(nucleotide_freq_3p[:, i],
                     'Nucleotide %s from 3\'-end' % nucleotide)
            pdf.savefig()
            plt.close()
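plot_fft is defined elsewhere in the module; a plausible sketch plots the FFT amplitude spectrum of a positional frequency profile, where codon periodicity appears as a peak at one cycle per 3 nt (the layout and labels are assumptions):

import numpy as np
import matplotlib.pyplot as plt
from scipy.fftpack import fft

def plot_fft(values, title):
    # sketch: amplitude spectrum of a positional profile, DC term dropped
    n = len(values)
    amplitude = np.abs(fft(values - values.mean()))[1:n // 2]
    fig, ax = plt.subplots(figsize=(8, 3))
    ax.plot(np.arange(1, n // 2), amplitude)
    ax.set_xlabel('Frequency (cycles per profile)')
    ax.set_ylabel('Amplitude')
    ax.set_title(title)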