def run_BUMHMM_for_icSHAPE(control_files, treatment_files, sequence_file):
    control_data = [GenomicData.load(filename) for filename in control_files]
    treatment_data = [GenomicData.load(filename) for filename in treatment_files]
    sequences = dict(read_fasta(sequence_file))
    # keep only transcripts present in every control and treatment sample
    names, counts = np.unique(np.concatenate(
        [x.names for x in control_data] + [x.names for x in treatment_data]),
        return_counts=True)
    common_names = names[counts >= (len(control_data) + len(treatment_data))]
    for name in common_names:
        run_BUMHMM(
            rt_stop_control=[x.feature('rt_stop', name) for x in control_data],
            coverage_control=[x.feature('base_density', name) for x in control_data],
            rt_stop_treatment=[x.feature('rt_stop', name) for x in treatment_data],
            coverage_treatment=[x.feature('base_density', name) for x in treatment_data],
            seq=sequences[name])
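# Illustrative sketch (standalone, not part of the original pipeline): how the
# np.unique trick above selects transcripts present in every replicate. The
# transcript names below are made up; names are assumed unique within a replicate.
def _demo_common_names():
    import numpy as np
    replicate_names = [
        np.asarray(['tx1', 'tx2', 'tx3']),   # control replicate 1
        np.asarray(['tx1', 'tx3']),          # control replicate 2
        np.asarray(['tx1', 'tx2', 'tx3']),   # treatment replicate 1
    ]
    names, counts = np.unique(np.concatenate(replicate_names), return_counts=True)
    # a transcript is kept only if it appears in every replicate
    common_names = names[counts >= len(replicate_names)]
    return common_names  # array(['tx1', 'tx3'], dtype='<U3')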
def create_dataset(args):
    import numpy as np
    import h5py
    import pandas as pd
    from tqdm import tqdm
    from formats import read_fasta

    logger.info('read peak file: ' + args.peak_file)
    peaks = pd.read_table(args.peak_file,
                          names=['chrom', 'start', 'end', 'peak_id', 'label', 'strand'])
    peaks['peak_id'] = peaks['peak_id'].astype('U')
    peaks.index = peaks['peak_id']
    logger.info('read sequence file: ' + args.sequence_file)
    sequences = {name: seq for name, seq in read_fasta(args.sequence_file)}
    if args.reactivity_file is not None:
        logger.info('read reactivity file: ' + args.reactivity_file)
        reactivities = {}
        with h5py.File(args.reactivity_file, 'r') as f:
            for peak_id in f.keys():
                reactivities[peak_id.split(',')[0]] = f[peak_id][:]
        # keep peaks with enough covered (non-NaN) positions
        peak_ids = []
        for peak_id in sequences.keys():
            if peak_id in reactivities:
                coverage = np.sum(~np.isnan(reactivities[peak_id]))
            else:
                coverage = 0
            if coverage >= args.min_coverage:
                peak_ids.append(peak_id)
                if coverage == 0:
                    reactivities[peak_id] = np.full(len(sequences[peak_id]), np.nan, dtype=np.float32)
    else:
        peak_ids = list(sorted(sequences.keys()))

    def onehot_encode(x, alphabet='ATCG'):
        alphabet = np.frombuffer(bytearray(alphabet, encoding='ascii'), dtype='S1')
        x_shape = list(x.shape)
        encoded = (x.reshape(x_shape + [1]) == alphabet.reshape([1]*len(x_shape) + [-1])).astype(np.int32)
        return encoded

    X_seq = np.concatenate([np.frombuffer(bytearray(sequences[peak_id], encoding='ascii'), dtype='S1')[np.newaxis, :]
                            for peak_id in peak_ids], axis=0)
    X_seq = onehot_encode(X_seq)
    if args.reactivity_file is not None:
        X_r = np.concatenate([reactivities[peak_id][np.newaxis, :, np.newaxis]
                              for peak_id in peak_ids], axis=0)
        # impute missing reactivities with the median value
        X_r[np.isnan(X_r)] = np.nanmedian(X_r.flatten())
        X = np.concatenate([X_seq, X_r], axis=2)
    else:
        X = X_seq
    y = peaks['label'][peak_ids]
    logger.info('create output file: ' + args.output_file)
    with h5py.File(args.output_file, 'w') as fout:
        fout.create_dataset('X', data=X)
        fout.create_dataset('y', data=y)
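# Illustrative sketch (standalone, mirrors the nested onehot_encode above): one-hot
# encoding works by broadcasting an (N, L, 1) byte array against a (1, 1, 4)
# alphabet array, so a batch of shape (n_peaks, length) becomes (n_peaks, length, 4).
# The two example sequences are made up.
def _demo_onehot_encode():
    import numpy as np
    seqs = ['ATCG', 'GGCA']
    x = np.concatenate([np.frombuffer(bytearray(s, encoding='ascii'), dtype='S1')[np.newaxis, :]
                        for s in seqs], axis=0)                     # shape (2, 4), dtype S1
    alphabet = np.frombuffer(bytearray('ATCG', encoding='ascii'), dtype='S1')
    encoded = (x[:, :, np.newaxis] == alphabet[np.newaxis, np.newaxis, :]).astype(np.int32)
    assert encoded.shape == (2, 4, 4)
    assert encoded[0, 0].tolist() == [1, 0, 0, 0]                   # 'A' -> first channel
    return encoded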
def predict(args):
    import numpy as np
    import keras
    import h5py
    import models
    from tqdm import tqdm
    import six.moves.cPickle as pickle
    from ioutils import prepare_output_file, make_dir
    from formats import read_fasta

    if args.n_threads >= 1:
        logger.info('set number of threads to {} for TensorFlow'.format(args.n_threads))
        set_keras_num_threads(args.n_threads)
    logger.info('load model: {}'.format(args.model_file))
    model_format = detect_model_format(args.model_file)
    logger.info('detected model format: ' + model_format)
    if model_format == 'keras':
        model = keras.models.load_model(args.model_file)
        window_size = model.input.shape[1].value
    elif model_format == 'sklearn':
        with open(args.model_file, 'rb') as f:
            model = pickle.load(f)
    # default offset: center of the window
    if args.offset is None:
        offset = int(window_size) // 2
    else:
        offset = args.offset
    logger.info('load data: {}'.format(args.input_file))
    if args.format == 'fasta':
        names = []
        logger.info('create output file: ' + args.output_file)
        fout = h5py.File(args.output_file, 'w')
        for name, sequence in tqdm(read_fasta(args.input_file), unit='transcript'):
            names.append(name)
            sequence = np.frombuffer(bytearray(sequence, encoding='ascii'), dtype='S1')
            windows = split_windows_same(sequence, window_size, 1, offset=offset)
            X = onehot_encode(windows)
            y_pred = model.predict(X, batch_size=args.batch_size)
            y_pred = np.squeeze(y_pred)
            if args.swap_labels:
                logger.info('swap labels')
                y_pred = 1 - y_pred
            fout.create_dataset(name, data=y_pred)
        fout.close()
    else:
        raise ValueError('unknown input format: ' + args.format)
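# Hedged sketch: split_windows_same is a project helper whose implementation is not
# shown here. Under the assumption that it behaves like "same" padding (one window
# per position, padded with 'N' at the edges and centered by `offset`), it could
# look roughly like the hypothetical function below.
def _split_windows_same_sketch(sequence, window_size, stride=1, offset=None):
    import numpy as np
    if offset is None:
        offset = window_size // 2
    # pad so that the window for position i covers positions i-offset .. i-offset+window_size-1
    padded = np.concatenate([
        np.full(offset, b'N', dtype='S1'),
        sequence,
        np.full(window_size - offset - 1, b'N', dtype='S1')])
    windows = [padded[i:i + window_size] for i in range(0, len(sequence), stride)]
    return np.stack(windows)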
def __call__(self):
    import numpy as np
    from sklearn.model_selection import train_test_split
    import h5py
    from common import sequence_to_array
    from scipy import signal

    self.logger.info('read input file: ' + self.infile)
    _, base_density, length, _ = read_background_rt(self.infile)
    names = base_density.keys()
    self.logger.info('read sequence file: ' + self.sequence_file)
    sequences = dict(read_fasta(self.sequence_file))
    if self.offset is None:
        self.offset = (self.window_size + 1) // 2
    X = []
    y = []
    if self.smooth:
        self.logger.info('smooth the values using Gaussian window of width %.1f' % self.smooth_width)
        window = signal.gaussian(100, std=self.smooth_width)
    for name in names:
        seq = sequences[name]
        # normalize the base density of each transcript by its mean
        values = base_density[name] / base_density[name].mean()
        if self.smooth:
            # smooth the signal
            values = signal.convolve(values, window, mode='same')
        for i in range(0, len(seq) - self.window_size, self.stride):
            X.append(sequence_to_array(seq[i:(i + self.window_size)]))
            y.append(values[i + self.offset])
            if len(X) >= self.max_samples:
                break
    n_samples = len(X)
    self.logger.info('created {} samples'.format(n_samples))
    X = np.concatenate(X)
    X = X.reshape((n_samples, self.window_size, 4))
    y = np.asarray(y, dtype='float32')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_ratio)
    self.logger.info('save file: ' + self.outfile)
    prepare_output_file(self.outfile)
    f = h5py.File(self.outfile, 'w')
    f.create_dataset('offset', data=int(self.offset))
    f.create_dataset('window_size', data=int(self.window_size))
    f.create_dataset('X_train', data=X_train)
    f.create_dataset('y_train', data=y_train)
    f.create_dataset('X_test', data=X_test)
    f.create_dataset('y_test', data=y_test)
    f.close()
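# Illustrative sketch (standalone, not part of the class above): Gaussian smoothing
# of a coverage track with scipy, the same operation used above. This uses
# scipy.signal.windows.gaussian, the current location of the Gaussian window
# function; the input values are synthetic.
def _demo_gaussian_smoothing(smooth_width=5.0):
    import numpy as np
    from scipy import signal
    values = np.random.RandomState(0).poisson(lam=3.0, size=200).astype(float)
    values = values / values.mean()                        # same per-transcript normalization as above
    window = signal.windows.gaussian(100, std=smooth_width)
    smoothed = signal.convolve(values, window, mode='same')
    return smoothed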
def analyze_periodicity(args):
    import numpy as np
    import h5py
    import pandas as pd
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    import seaborn as sns
    sns.set()
    from formats import read_fasta
    from genomic_data import GenomicData

    logger.info('read input file: ' + args.input_file)
    reactivities = {}
    sequences = {}
    if args.assay_type == 'shapemap':
        with h5py.File(args.input_file, 'r') as f:
            for tx_id in f['rep1'].keys():
                reactivities[tx_id] = f['rep1/' + tx_id][:]
            for tx_id in f['seq'].keys():
                sequences[tx_id] = f['seq/' + tx_id][()]
    elif args.assay_type == 'icshape':
        icshape = GenomicData(args.input_file)
        for name in icshape.names:
            reactivities[name] = icshape.feature('icshape', name)
        for name, seq in read_fasta(args.sequence_file):
            if name in icshape.names:
                sequences[name] = np.frombuffer(bytearray(seq, encoding='ascii'), dtype='S1')
    seq_names = list(sequences.keys())
    reactivities_concat = np.concatenate([reactivities[name] for name in seq_names])
    sequences_concat = np.concatenate([sequences[name] for name in seq_names])
    notnan_mask = ~np.isnan(reactivities_concat)
    # plot overall distribution of SHAPE reactivity
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.hist(reactivities_concat[notnan_mask], bins=50)
    ax.set_xlabel('Reactivity')
    ax.set_ylabel('Counts')
    plt.savefig(args.output_file)
def __call__(self):
    from formats import read_fasta
    from tqdm import tqdm
    import numpy as np
    import pandas as pd
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    self.logger.info('read sequence file: ' + self.sequence_file)
    sequences = dict(read_fasta(self.sequence_file))
    self.logger.info('read input file: ' + self.infile)
    data = GenomicData(self.infile)
    if self.feature is None:
        if len(data.features.keys()) == 1:
            self.feature = list(data.features.keys())[0]
        else:
            raise ValueError('multiple features found in the input file and the feature is not specified')
    # freqs[i]['A']: frequency of A in bin i
    freqs = []
    scores_all = data.features[self.feature]
    scores_avail = scores_all[np.logical_not(np.isnan(scores_all))]
    self.logger.info('use bin method: %s' % self.bin_method)
    if self.bin_method == 'percentile':
        qs = np.arange(1, self.bins + 1, dtype='float')*100.0/self.bins
        percentiles = np.zeros(self.bins + 1, dtype='float')
        percentiles[0] = scores_avail.min() - 1e-6
        for i in range(1, self.bins):
            percentiles[i] = np.percentile(scores_avail, qs[i - 1])
        percentiles[self.bins] = scores_avail.max() + 1e-6
    elif self.bin_method == 'value':
        density, percentiles = np.histogram(scores_avail, bins=self.bins, density=True)
        qs = np.cumsum(density)*100.0
        percentiles[0] -= 1e-6
        percentiles[-1] += 1e-6
    else:
        raise ValueError('unknown bin method: %s' % self.bin_method)
    for i in range(self.bins):
        d = {a: 0 for a in self.alphabet}
        freqs.append(d)
    self.logger.info('count base frequencies with offset %d' % self.offset)
    for name in tqdm(data.names):
        scores_ts = data.feature(self.feature, name)
        avail_ind = np.nonzero(np.logical_not(np.isnan(scores_ts)))[0]
        seq_ts = np.frombuffer(bytearray(sequences[name], encoding='ascii'), dtype='S1')
        avail_ind += self.offset
        if self.offset > 0:
            avail_ind = avail_ind[avail_ind < len(seq_ts)]
        elif self.offset < 0:
            avail_ind = avail_ind[avail_ind >= 0]
        scores_avail_ts = scores_ts[avail_ind - self.offset]
        seq_avail_ts = seq_ts[avail_ind]
        for i in range(self.bins):
            seq_bin = seq_avail_ts[np.logical_and(scores_avail_ts <= percentiles[i + 1],
                                                  scores_avail_ts > percentiles[i])]
            for a in self.alphabet:
                freqs[i][a] += np.count_nonzero(seq_bin == a.encode('ascii'))
    # normalize base frequencies for each percentile
    freq_total = []
    for i in range(self.bins):
        total = sum(freqs[i].values())
        freq_total.append(total)
        for a in self.alphabet:
            if total == 0:
                freqs[i][a] = 1.0/len(self.alphabet)
            else:
                freqs[i][a] = float(freqs[i][a])/total
    table_file = self.prefix + '.txt'
    self.logger.info('save results to file: ' + table_file)
    prepare_output_file(table_file)
    df = []
    for i in range(self.bins):
        for a in self.alphabet:
            df.append((i, qs[i], percentiles[i], a, freq_total[i], freqs[i][a]))
    df = pd.DataFrame.from_records(df, columns=['bin', 'q', 'percentile', 'base', 'total_freq', 'fraction'])
    df.to_csv(table_file, sep='\t', index=False)
    # plot the distribution
    self.logger.info('create plot')
    plt.rcParams['font.family'] = 'Arial'
    plt.rcParams['axes.labelsize'] = 'medium'
    plt.rcParams['xtick.labelsize'] = 'x-small'
    plt.rcParams['ytick.labelsize'] = 'x-small'
    plt.rcParams['axes.titlesize'] = 'medium'
    fig, ax = plt.subplots(figsize=(7, 5))
    x = np.arange(self.bins)
    xticklabels = ['%.2f' % a for a in percentiles[1:]]
    for base in self.alphabet:
        sub_df = df[df['base'] == base]
        ax.plot(x, sub_df['fraction'], label=base)
    ax.set_xticks(x)
    ax.set_xticklabels(xticklabels)
    ax.set_ylim(0, 1)
    ax.set_xlabel('Values')
    ax.set_ylabel('Base fraction')
    ax.legend()
    plt.tight_layout()
    plot_file = self.prefix + '.pdf'
    self.logger.info('save plot to file: ' + plot_file)
    plt.savefig(plot_file)
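# Illustrative sketch (standalone): how the 'percentile' bin method above turns a
# score distribution into bin edges with roughly equal counts per bin. The scores
# here are synthetic.
def _demo_percentile_bins(bins=5):
    import numpy as np
    scores = np.random.RandomState(0).rand(1000)
    qs = np.arange(1, bins + 1, dtype='float') * 100.0 / bins
    edges = np.zeros(bins + 1)
    edges[0] = scores.min() - 1e-6
    edges[1:-1] = np.percentile(scores, qs[:-1])
    edges[-1] = scores.max() + 1e-6
    counts, _ = np.histogram(scores, bins=edges)
    return edges, counts   # counts are approximately equal across bins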
def __call__(self):
    import os
    import numpy as np
    import pandas as pd
    import h5py
    import keras
    from keras import backend as K
    from formats import read_rnafold, structure_to_pairs

    self.logger.info('load model: {}'.format(self.model_file))
    model = keras.models.load_model(self.model_file)
    window_size = K.int_shape(model.input)[1]
    self.logger.info('load input data (in %s format): %s' % (self.format, self.infile))
    have_structure = False
    if self.format == 'fasta':
        # list of tuples: (name, seq)
        input_data = list(read_fasta(self.infile))
    elif self.format == 'ct_dir':
        # read all .ct files from the directory
        # list of tuples: (name, seq, pairs)
        input_data = []
        for filename in os.listdir(self.infile):
            title, seq, pairs = read_ct(os.path.join(self.infile, filename))
            title = os.path.splitext(filename)[0]
            input_data.append((title, seq, pairs))
        have_structure = True
    elif self.format == 'ct':
        title, seq, pairs = read_ct(self.infile)
        title = os.path.splitext(os.path.basename(self.infile))[0]
        input_data = [(title, seq, pairs)]
        have_structure = True
    elif self.format == 'rnafold':
        input_data = []
        for name, seq, structure, energy in read_rnafold(self.infile, parse_energy=False):
            pairs = structure_to_pairs(structure)
            input_data.append((name, seq, pairs))
        have_structure = True
    elif self.format == 'genomic_data':
        from genomic_data import GenomicData
        input_data = []
        data = GenomicData(self.infile)
        for name in data.names:
            input_data.append((name,
                               data.feature('sequence', name).tostring().decode(),
                               data.feature('reactivity', name)))
        del data
        have_structure = True
    # combine all structures (base-pairs) into one array in the ct file
    if have_structure:
        structure = []
        for i in range(len(input_data)):
            structure.append(np.asarray(input_data[i][2], dtype='int32'))
        structure = np.concatenate(structure)
    else:
        structure = None
    X = []
    names = []
    # offset defaults to the center of the window
    if self.offset is None:
        self.offset = (window_size + 1) // 2
    offset = self.offset
    # convert sequences to windows
    windows = []
    length = []
    sequence = []
    for item in input_data:
        name = item[0]
        seq = item[1]
        windows += self.sequence_to_windows(seq, window_size, offset)
        names.append(name)
        length.append(len(seq))
        sequence.append(seq)
    # combine all sequences into one dataset
    sequence = np.frombuffer(bytearray(''.join(sequence), encoding='ascii'), dtype='S1')
    length = np.asarray(length, dtype='int64')
    n_samples = len(windows)
    windows = np.frombuffer(bytearray(''.join(windows), encoding='ascii'), dtype='S1').reshape((n_samples, window_size))
    X = onehot_encode(windows, self.alphabet)
    # set one-hot coding of padded sequence to [0.25, 0.25, 0.25, 0.25]
    X[X.sum(axis=2) == 0] = 1.0/len(self.alphabet)
    self.logger.info('run the model')
    y_pred = model.predict(X, batch_size=self.batch_size)
    y_pred = np.squeeze(y_pred)
    if self.swap_labels:
        self.logger.info('swap labels')
        y_pred = 1 - y_pred
    # start/end position of each transcript in y_pred
    end = np.cumsum(length)
    start = end - length
    if len(y_pred.shape) > 1:
        # average the predictions
        self.logger.info('average windows for dense prediction')
        y_pred_dense = []
        for i in range(len(input_data)):
            y_pred_dense.append(self.predict_dense(y_pred[start[i]:end[i]], offset))
        if self.dense_pred_file:
            self.logger.info('save dense predictions: ' + self.dense_pred_file)
            f = h5py.File(self.dense_pred_file, 'w')
            for i in range(len(names)):
                g = f.create_group(names[i])
                g.create_dataset('predicted_values_dense', data=y_pred[start[i]:end[i]])
                g.create_dataset('predicted_values_average', data=y_pred_dense[i])
                # 0-based start/end position of each transcript in the array (y_pred, sequence, structure)
                g.create_dataset('sequence', data=sequence[start[i]:end[i]])
                if structure is not None:
                    g.create_dataset('structure', data=structure[start[i]:end[i]])
            f.close()
        y_pred = np.concatenate(y_pred_dense)
        y_pred_labels = np.round(y_pred).astype('int32')
    else:
        y_pred_labels = np.round(y_pred).astype('int32')
    if self.restraint_file:
        header = ['name', 'position', 'pred', 'base']
        table = pd.DataFrame()
        table['name'] = np.repeat(np.asarray(names, dtype='S'), length)
        # start position of each transcript relative to y_pred
        start_repeated = np.repeat(end - length, length)
        # position (1-based) relative to the transcript
        position = np.arange(1, length.sum() + 1) - start_repeated
        table['position'] = position
        table['pred'] = y_pred_labels
        table['base'] = sequence
        table['true'] = structure
        self.logger.info('write restraint file: ' + self.restraint_file)
        prepare_output_file(self.restraint_file)
        table.to_csv(self.restraint_file, sep='\t', index=False)
    if self.metric_file:
        self.logger.info('save metric file: ' + self.metric_file)
        prepare_output_file(self.metric_file)
        f = h5py.File(self.metric_file, 'w')
        from sklearn.metrics import accuracy_score
        f.create_dataset('y_pred', data=y_pred)
        f.create_dataset('y_pred_labels', data=y_pred_labels)
        if have_structure:
            y_true = (structure > 0).astype('int32')
            f.create_dataset('y_true', data=y_true)
            g = f.create_group('metrics')
            for metric in self.metrics:
                scorer = get_scorer(metric)
                if get_scorer_type(metric) == 'continuous':
                    score = scorer(y_true, y_pred)
                else:
                    score = scorer(y_true, y_pred_labels)
                self.logger.info('%s: %f' % (metric, score))
                g.create_dataset(metric, data=score)
        f.close()
    if self.metric_by_sequence_file:
        self.logger.info('calculate metrics by sequence')
        records = []
        for i in range(len(names)):
            y_true_ = (structure[start[i]:end[i]] > 0).astype('int32')
            y_pred_ = y_pred[start[i]:end[i]]
            y_pred_labels_ = y_pred_labels[start[i]:end[i]]
            scores = []
            for metric in self.metrics:
                scorer = get_scorer(metric)
                if get_scorer_type(metric) == 'continuous':
                    try:
                        score = scorer(y_true_, y_pred_)
                    except ValueError:
                        score = np.nan
                else:
                    score = scorer(y_true_, y_pred_labels_)
                scores.append(score)
            records.append([names[i], length[i]] + scores)
        records = pd.DataFrame.from_records(records, columns=['name', 'length'] + self.metrics)
        self.logger.info('save metric by sequence file: ' + self.metric_by_sequence_file)
        prepare_output_file(self.metric_by_sequence_file)
        records.to_csv(self.metric_by_sequence_file, sep='\t', index=False, na_rep='nan')
    if self.pred_file:
        self.logger.info('save predictions to file: ' + self.pred_file)
        prepare_output_file(self.pred_file)
        f = h5py.File(self.pred_file, 'w')
        for i in range(len(names)):
            y_true_ = (structure[start[i]:end[i]] > 0).astype('int32')
            g = f.create_group(names[i])
            g.create_dataset('sequence', data=sequence[start[i]:end[i]])
            g.create_dataset('predicted_values', data=y_pred[start[i]:end[i]])
            g.create_dataset('predicted_labels', data=y_pred_labels[start[i]:end[i]])
            g.create_dataset('true_labels', data=y_true_)
        f.close()
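# Hedged sketch: predict_dense is a project method not shown here. Assuming one
# window per position (window i centered on position i via `offset`) and per-window
# predictions of shape (L, window_size), the window-averaging step could be
# implemented roughly as below.
def _predict_dense_sketch(y_pred_windows, offset):
    import numpy as np
    n, window_size = y_pred_windows.shape
    dense_sum = np.zeros(n)
    dense_count = np.zeros(n)
    for i in range(n):
        # window i covers positions i - offset .. i - offset + window_size - 1
        for j in range(window_size):
            pos = i - offset + j
            if 0 <= pos < n:
                dense_sum[pos] += y_pred_windows[i, j]
                dense_count[pos] += 1
    return dense_sum / np.maximum(dense_count, 1)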
def __call__(self):
    import numpy as np
    import h5py

    control_data = []
    for filename in self.control_file:
        control_data.append(icshape_raw_rt_to_genomic_data(filename, self.logger))
    treatment_data = []
    for filename in self.treatment_file:
        treatment_data.append(icshape_raw_rt_to_genomic_data(filename, self.logger))
    combined_data = control_data + treatment_data
    self.logger.info('read sequence file: ' + self.sequence_file)
    sequences = dict(read_fasta(self.sequence_file))
    # keep only transcripts present in every sample
    names, counts = np.unique(np.concatenate([x.names for x in combined_data]),
                              return_counts=True)
    common_names = names[counts >= len(combined_data)]
    self.logger.info('create output file: ' + self.outfile)
    prepare_output_file(self.outfile)
    fout = h5py.File(self.outfile, 'w')
    ncol = len(control_data) + len(treatment_data)
    sample_name = np.asarray(
        ['C%d' % i for i in range(len(control_data))] +
        ['T%d' % i for i in range(len(treatment_data))], dtype='S')
    replicate = np.asarray(['control'] * len(control_data) +
                           ['treatment'] * len(treatment_data), dtype='S')
    """
    for i, name in enumerate(common_names):
        self.logger.info('create group: ' + str(name))
        g = fout.create_group(name)
        coverage = np.vstack(map(lambda x: x.feature('base_density', name)[1:], combined_data))
        dropoff_count = np.vstack(map(lambda x: x.feature('rt_stop', name)[:-1], combined_data))
        rpkm = np.mean(map(lambda x: x.feature('rpkm', name), combined_data))
        g.create_dataset('coverage', data=coverage)
        g.create_dataset('dropoff_count', data=dropoff_count)
        g.create_dataset('sequence', data=np.asarray(sequences[name], dtype='S'))
        g.create_dataset('sample_name', data=sample_name)
        g.create_dataset('replicate', data=replicates)
        g.create_dataset('rpkm', data=rpkm)
    """
    coverage = [[]] * len(combined_data)
    dropoff_count = [[]] * len(combined_data)
    for i in range(len(combined_data)):
        coverage[i] = [None] * len(common_names)
        dropoff_count[i] = [None] * len(common_names)
        for j in range(len(common_names)):
            coverage[i][j] = combined_data[i].feature('base_density', common_names[j])[1:]
            coverage[i][j][:20] = 0
            dropoff_count[i][j] = combined_data[i].feature('rt_stop', common_names[j])[:-1]
            dropoff_count[i][j][:20] = 0
        if i == 0:
            length = np.asarray([len(a) for a in coverage[i]], dtype='int64')
            end = np.cumsum(length)
            start = end - length
        coverage[i] = np.concatenate(coverage[i])
        dropoff_count[i] = np.concatenate(dropoff_count[i])
    coverage = np.vstack(coverage)
    dropoff_count = np.vstack(dropoff_count)
    sequence = np.asarray(''.join(sequences[name] for name in common_names), dtype='S')
    fout.create_dataset('name', data=common_names)
    fout.create_dataset('start', data=start)
    fout.create_dataset('end', data=end)
    fout.create_dataset('coverage', data=coverage)
    fout.create_dataset('dropoff_count', data=dropoff_count)
    fout.create_dataset('sequence', data=sequence)
    fout.create_dataset('replicate', data=replicate)
    fout.create_dataset('sample_name', data=sample_name)
    fout.close()
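# Illustrative sketch (standalone): the start/end bookkeeping used above when
# concatenating per-transcript tracks into one long array. The lengths are made up.
def _demo_concat_offsets():
    import numpy as np
    lengths = np.asarray([5, 3, 7], dtype='int64')   # per-transcript track lengths
    end = np.cumsum(lengths)                         # [5, 8, 15]
    start = end - lengths                            # [0, 5, 8]
    # transcript j occupies concatenated[start[j]:end[j]] in the combined array
    return start, end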
def create_dataset(args):
    import numpy as np
    import h5py
    from tqdm import tqdm
    from formats import read_fasta
    from ioutils import prepare_output_file

    c = args.input_file.split(':')
    input_file = c[0]
    dataset = c[1] if len(c) > 1 else '/'
    logger.info('read input file: ' + input_file)
    g_input = open_hdf5_group(args.input_file, 'r')
    names = np.asarray(list(g_input.keys()))
    reactivities = {name: g_input[name][:] for name in names}
    logger.info('read sequence file: ' + args.sequence_file)
    sequences = {name: np.frombuffer(bytearray(seq, encoding='ascii'), dtype='S1')
                 for name, seq in read_fasta(args.sequence_file)}
    if args.offset is None:
        offset = int(args.window_size) // 2
    else:
        offset = args.offset
    if args.cv_split_file is not None:
        cv_split = open_hdf5_group(args.cv_split_file, 'r')
        train_index = cv_split['train'][:]
        test_index = cv_split['test'][:]
        names_train = names[train_index]
        names_test = names[test_index]
        X_train, y_train = create_single_point_dataset(sequences, reactivities, names_train,
                                                       offset, args.window_size, args.stride)
        X_test, y_test = create_single_point_dataset(sequences, reactivities, names_test,
                                                     offset, args.window_size, args.stride)
        if args.balanced:
            logger.info('create balanced dataset')
            X_train, y_train = balance_dataset(X_train, y_train)
            logger.info('number of training samples: {}'.format(y_train.shape[0]))
            X_test, y_test = balance_dataset(X_test, y_test)
            logger.info('number of test samples: {}'.format(y_test.shape[0]))
        logger.info('create output file: ' + args.output_file)
        prepare_output_file(args.output_file)
        with h5py.File(args.output_file, 'w') as fout:
            fout.create_dataset('names_train', data=names_train.astype('S'))
            fout.create_dataset('X_train', data=X_train, compression=True)
            fout.create_dataset('y_train', data=y_train, compression=True)
            fout.create_dataset('names_test', data=names_test.astype('S'))
            fout.create_dataset('X_test', data=X_test, compression=True)
            fout.create_dataset('y_test', data=y_test, compression=True)
            fout.create_dataset('offset', data=offset)
    else:
        X, y = create_single_point_dataset(sequences, reactivities, names,
                                           offset, args.window_size, args.stride)
        logger.info('create output file: ' + args.output_file)
        prepare_output_file(args.output_file)
        with h5py.File(args.output_file, 'w') as fout:
            fout.create_dataset('names', data=names.astype('S'))
            fout.create_dataset('X', data=X, compression=True)
            fout.create_dataset('y', data=y, compression=True)
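# Hedged sketch: create_single_point_dataset is a project helper not shown here.
# Assuming it pairs each sequence window with the reactivity of the single base at
# `offset` inside the window (skipping NaN targets), it could look roughly like the
# hypothetical function below.
def _create_single_point_dataset_sketch(sequences, reactivities, names, offset, window_size, stride):
    import numpy as np
    alphabet = np.frombuffer(bytearray('ATCG', encoding='ascii'), dtype='S1')
    X, y = [], []
    for name in names:
        seq = sequences[name]            # 1D array of dtype S1
        react = reactivities[name]       # 1D float array, may contain NaN
        for i in range(0, len(seq) - window_size + 1, stride):
            target = react[i + offset]
            if np.isnan(target):
                continue
            window = seq[i:i + window_size]
            # one-hot encode the window: (window_size, 4)
            X.append((window[:, np.newaxis] == alphabet[np.newaxis, :]).astype(np.float32))
            y.append(target)
    return np.stack(X), np.asarray(y, dtype=np.float32)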
def analyze_nucleotide_periodicity(args):
    import numpy as np
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    from matplotlib.backends.backend_pdf import PdfPages
    import seaborn as sns
    sns.set()
    from scipy.fftpack import fft
    from formats import read_fasta
    from ioutils import prepare_output_file

    logger.info('read sequence file: ' + args.input_file)
    sequences = {name: np.frombuffer(bytearray(seq, encoding='ascii'), dtype='S1')
                 for name, seq in read_fasta(args.input_file)}
    aligned_length = args.aligned_length
    alphabet = args.alphabet

    def calc_nucleotide_freq(sequences, direction, alphabet='ATCG', aligned_length=100):
        alphabet = np.frombuffer(bytearray(alphabet, encoding='ascii'), dtype='S1')
        m = np.full((len(sequences), aligned_length), b'N', dtype='S1')
        for i, name in enumerate(sequences.keys()):
            x = sequences[name]
            L = min(x.shape[0], aligned_length)
            if direction == '5p':
                m[i, :L] = x[:L]
            elif direction == '3p':
                m[i, -L:] = x[-L:]
        transcript_counts = np.sum(m != b'N', axis=0)
        m_onehot = (m[:, :, np.newaxis] == alphabet[np.newaxis, np.newaxis, :])
        m_counts = np.sum(m_onehot, axis=0).astype(np.float64)
        m_freq = m_counts / np.sum(m_counts, axis=1)[:, np.newaxis]
        return m_freq

    logger.info('create output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with PdfPages(args.output_file) as pdf:
        # 5'-end
        nucleotide_freq_5p = calc_nucleotide_freq(sequences, '5p', alphabet=alphabet,
                                                  aligned_length=aligned_length)
        fig, ax = plt.subplots(figsize=(18, 4))
        for i, nucleotide in enumerate(alphabet):
            ax.plot(np.arange(aligned_length), nucleotide_freq_5p[:, i], label=nucleotide)
        ax.set_xlabel('Position in CDS from 5\'-end')
        ax.set_ylabel('Nucleotide frequency')
        ax.set_xlim(0, aligned_length)
        ax.set_ylim(0, 1)
        plt.legend()
        pdf.savefig()
        plt.close()
        # 3'-end
        nucleotide_freq_3p = calc_nucleotide_freq(sequences, '3p', alphabet=alphabet,
                                                  aligned_length=aligned_length)
        fig, ax = plt.subplots(figsize=(18, 4))
        for i, nucleotide in enumerate(alphabet):
            ax.plot(np.arange(-aligned_length, 0), nucleotide_freq_3p[:, i], label=nucleotide)
        ax.set_xlabel('Distance from CDS from 3\'-end')
        ax.set_ylabel('Nucleotide frequency')
        ax.set_xlim(-aligned_length, 0)
        ax.set_ylim(0, 1)
        plt.legend()
        pdf.savefig()
        plt.close()
        # FFT of the nucleotide frequency profiles
        for i, nucleotide in enumerate(alphabet):
            plot_fft(nucleotide_freq_5p[:, i], 'Nucleotide %s from 5\'-end' % nucleotide)
            pdf.savefig()
            plt.close()
        for i, nucleotide in enumerate(alphabet):
            plot_fft(nucleotide_freq_3p[:, i], 'Nucleotide %s from 3\'-end' % nucleotide)
            pdf.savefig()
            plt.close()
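# Illustrative sketch (standalone): how a 3-nucleotide periodicity shows up in the
# kind of FFT that plot_fft visualizes above. The frequency track here is synthetic.
def _demo_periodicity_fft(length=99):
    import numpy as np
    from scipy.fftpack import fft
    positions = np.arange(length)
    freq_track = 0.25 + 0.1 * np.cos(2 * np.pi * positions / 3.0)   # period-3 signal
    spectrum = np.abs(fft(freq_track - freq_track.mean()))
    freqs = np.fft.fftfreq(length)
    peak_freq = np.abs(freqs[np.argmax(spectrum[:length // 2])])
    return peak_freq   # close to 1/3 for a period-3 signal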