def run_BUMHMM_for_icSHAPE(control_files, treatment_files, sequence_file):
    """Run BUMHMM on every transcript present in all icSHAPE replicates.

    Args:
        control_files: iterable of paths passed to ``GenomicData.load`` for
            the control replicates.
        treatment_files: iterable of paths passed to ``GenomicData.load`` for
            the treatment replicates.
        sequence_file: FASTA file; ``read_fasta`` is expected to yield
            ``(name, sequence)`` pairs (it is fed straight into ``dict``).

    The return value of ``run_BUMHMM`` is not collected here.
    """
    control_data = [GenomicData.load(filename) for filename in control_files]
    treatment_data = [GenomicData.load(filename) for filename in treatment_files]
    sequences = dict(read_fasta(sequence_file))

    # A name is "common" when it occurs once per replicate, i.e. its total
    # count over all replicates reaches the number of replicates.
    # Lists (not map objects) are used so this also works on Python 3.
    replicates = control_data + treatment_data
    names, counts = np.unique(
        np.concatenate([x.names for x in replicates]), return_counts=True)
    common_names = names[counts >= len(replicates)]

    # NOTE(review): the other call sites in this file pass the feature name
    # first (e.g. feature('icshape', name)); here the transcript name comes
    # first. Left unchanged -- confirm against GenomicData.feature.
    for name in common_names:
        run_BUMHMM(
            rt_stop_control=[x.feature(name, 'rt_stop') for x in control_data],
            coverage_control=[x.feature(name, 'base_density') for x in control_data],
            rt_stop_treatment=[x.feature(name, 'rt_stop') for x in treatment_data],
            coverage_treatment=[x.feature(name, 'base_density') for x in treatment_data],
            seq=sequences[name])
def __call__(self):
    """Print a TSV table with the number of usable samples per dataset/region.

    Reads ``<indir>/<dataset>/<region>.h5`` for every entry of ``self.indir``
    and every region (only ``self.region`` when it is set). Missing files are
    logged and skipped. A sample is a non-NaN value of the selected feature;
    when ``self.percentile`` is given, only values at or below that
    percentile, or at or above ``100 - percentile``, are counted.
    """
    import pandas as pd
    import h5py
    import numpy as np

    regions = ['all', '3UTR', '5UTR', 'lncRNA', 'CDS', 'ncRNA', 'miRNA']
    if self.region:
        regions = [self.region]

    records = []
    for dataset in os.listdir(self.indir):
        for region in regions:
            data_file = os.path.join(self.indir, dataset, '%s.h5' % region)
            if not os.path.isfile(data_file):
                self.logger.warning('GenomicData file {} does not exist'.format(data_file))
                continue
            data = GenomicData(data_file)
            if not self.feature:
                # Fall back to the first feature stored in the file.
                # list() keeps this working when keys() returns a view.
                feature = list(data.features.keys())[0]
            else:
                feature = self.feature

            values = data.features[feature]
            if self.percentile is not None:
                data_valid = values[np.logical_not(np.isnan(values))]
                cutoff1 = np.percentile(data_valid, self.percentile)
                cutoff2 = np.percentile(data_valid, 100 - self.percentile)
                n_samples = np.logical_or(data_valid <= cutoff1,
                                          data_valid >= cutoff2).sum()
            else:
                n_samples = len(values) - np.isnan(values).sum()
            records.append((dataset, region, n_samples))

    df = pd.DataFrame.from_records(records, columns=['dataset', 'region', 'n_samples'])
    # Parenthesised single-argument form prints identically on Python 2 and 3.
    print(df.to_csv(sep='\t', index=False))
def hmm_align_reactivities(args):
    """Map per-transcript reactivities onto Stockholm alignments.

    For every ``*.sto`` file in ``args.alignment_dir`` the record ids are
    parsed for a sequence name and a ``start-end`` range; the matching slice
    of the reactivity track is placed on the non-gap columns of the aligned
    sequence (gap columns stay NaN). Alignments with at least
    ``args.min_records`` usable records are written to ``args.output_file``
    (HDF5) as one group per alignment file with datasets ``reactivities``,
    ``sequences`` and ``seq_names``.

    Raises:
        ValueError: if a record id does not match the expected pattern.
    """
    import numpy as np
    from Bio import AlignIO
    from glob import glob
    import re
    import h5py
    from tqdm import tqdm
    from genomic_data import GenomicData

    logger.info('read reactivity file: ' + args.reactivity_file)
    data = GenomicData(args.reactivity_file)
    reactivities = {name: data.feature(args.feature, name) for name in data.names}
    logger.info('read alignment directory: ' + args.alignment_dir)
    pat_id = re.compile(r'[0-9]*\|*(?P<te_type>[^:]+)::(?P<seq_name>[^:]+):(?P<start>[0-9]+)-(?P<end>[0-9]+)')
    logger.info('create output file: ' + args.output_file)
    # The context manager closes the output file even when a malformed
    # record id aborts the loop with ValueError.
    with h5py.File(args.output_file, 'w') as fout:
        for alignment_file in tqdm(glob(os.path.join(args.alignment_dir, '*.sto')), unit='file'):
            reactivities_tx = []
            sequences_tx = []
            seq_names = []
            with open(alignment_file, 'r') as fin:
                for record in AlignIO.read(fin, 'stockholm'):
                    # parse transcriptomic coordinates from sequence names
                    m = pat_id.match(record.id)
                    if m is None:
                        raise ValueError('invalid record name %s in file: %s' % (record.id, alignment_file))
                    r = reactivities.get(m.group('seq_name'))
                    if r is None:
                        continue
                    r_aligned = np.full(len(record.seq), np.nan, dtype=np.float32)
                    # map reactivities to alignment
                    seq = np.frombuffer(str(record.seq), dtype='S1')
                    r_aligned[seq != '-'] = r[int(m.group('start')):int(m.group('end'))]
                    if np.all(np.isnan(r_aligned)):
                        continue
                    reactivities_tx.append(r_aligned.reshape((1, -1)))
                    sequences_tx.append(seq.reshape((1, -1)))
                    seq_names.append(record.id)
            # Guard against an empty list: np.concatenate([]) raises, which
            # would otherwise happen whenever min_records <= 0.
            if reactivities_tx and len(reactivities_tx) >= args.min_records:
                g = fout.create_group(os.path.splitext(os.path.basename(alignment_file))[0])
                g.create_dataset('reactivities', data=np.concatenate(reactivities_tx, axis=0))
                g.create_dataset('sequences', data=np.concatenate(sequences_tx, axis=0))
                g.create_dataset('seq_names', data=np.asarray(seq_names))
def __call__(self):
    """Convert flat BUMHMM posteriors into a per-sequence GenomicData file.

    ``self.posterior_file`` holds one concatenated ``posteriors`` array;
    ``self.bumhmm_input_file`` provides the ``start``/``end`` offsets and the
    ``name`` of every sequence. The slices are saved under the feature name
    ``bumhmm`` in ``self.outfile``.
    """
    from genomic_data import GenomicData
    import numpy as np
    import h5py

    self.logger.info('read BUMHMM file: ' + self.posterior_file)
    # Context managers make sure both HDF5 handles are released, including
    # when a dataset is missing.
    with h5py.File(self.posterior_file, 'r') as f:
        posteriors = f['posteriors'][:]
    self.logger.info('read BUMHMM input file: ' + self.bumhmm_input_file)
    with h5py.File(self.bumhmm_input_file, 'r') as f:
        start = f['start'][:]
        end = f['end'][:]
        name = f['name'][:]

    # A real list (not a lazy map object) so the result is the same on
    # Python 2 and Python 3.
    values = [posteriors[start[i]:end[i]] for i in range(len(name))]
    self.logger.info('save file: ' + self.outfile)
    prepare_output_file(self.outfile)
    GenomicData.from_data(name, features={'bumhmm': values}).save(self.outfile)
def __call__(self):
    """Convert an icSHAPE ``.rt`` text file into a GenomicData file.

    The first line of ``self.infile`` is skipped as a header. The remaining
    non-blank lines are consumed in pairs of tab-separated records: columns
    0-2 of the first line give name, length and RPKM (only the part before
    the first comma is used), and columns 3+ of both lines hold per-position
    values that are truncated to int32.

    Raises:
        ValueError: if the file ends in the middle of a two-line record.
    """
    from genomic_data import GenomicData
    import numpy as np

    self.logger.info('read input rt file: ' + self.infile)
    name = []
    length = []
    rpkm = []
    rt_stop = []
    base_density = []
    with open(self.infile, 'r') as f:
        f.readline()  # header line
        first_of_pair = True
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank lines (typically a trailing one) used to raise
                # IndexError on c[1]; they carry no data.
                continue
            c = stripped.split('\t')
            # NOTE(review): icshape_raw_rt_to_genomic_data in this file
            # assigns the two lines in the opposite order (first line ->
            # base_density, second -> rt_stop). One of the two is wrong;
            # confirm against the .rt format before relying on either.
            if first_of_pair:
                name.append(c[0])
                length.append(int(c[1]))
                rpkm.append(float(c[2].split(',')[0]))
                rt_stop.append(
                    np.asarray(c[3:], dtype='float').astype('int32'))
            else:
                base_density.append(
                    np.asarray(c[3:], dtype='float').astype('int32'))
            first_of_pair = not first_of_pair
    if len(rt_stop) != len(base_density):
        # An unpaired line would silently misalign the per-sequence lists.
        raise ValueError('incomplete record at end of rt file: ' + self.infile)
    n_records = len(name)
    self.logger.info('successfully read %d records' % n_records)
    self.logger.info('create output file: ' + self.outfile)
    prepare_output_file(self.outfile)
    GenomicData.from_data(name,
                          features={
                              'rt_stop': rt_stop,
                              'base_density': base_density
                          },
                          meta={
                              'rpkm': np.asarray(rpkm, dtype='float64'),
                              'length': np.asarray(length, dtype='int64')
                          }).save(self.outfile)
def analyze_periodicity(args):
    """Plot the overall distribution of reactivities from one input file.

    ``args.assay_type`` selects the reader:

    * ``'shapemap'``: HDF5 file with reactivities under ``rep1/<tx_id>`` and
      sequences under ``seq/<tx_id>``.
    * ``'icshape'``: GenomicData file with an ``icshape`` feature; sequences
      come from ``args.sequence_file`` (FASTA), restricted to names that have
      reactivities.

    The histogram of all non-NaN reactivities is saved to
    ``args.output_file``.
    """
    import h5py
    import numpy as np
    import pandas as pd
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    import seaborn as sns
    sns.set()
    from genomic_data import GenomicData

    logger.info('read input file: ' + args.input_file)
    reactivities = {}
    sequences = {}
    if args.assay_type == 'shapemap':
        with h5py.File(args.input_file, 'r') as f:
            for tx_id in f['rep1'].keys():
                reactivities[tx_id] = f['rep1/' + tx_id][:]
            for tx_id in f['seq'].keys():
                sequences[tx_id] = f['seq/' + tx_id][()]
    elif args.assay_type == 'icshape':
        icshape = GenomicData(args.input_file)
        for name in icshape.names:
            reactivities[name] = icshape.feature('icshape', name)
        for name, seq in read_fasta(args.sequence_file):
            # reactivities is keyed by exactly icshape.names, so the dict
            # lookup replaces a linear scan of the names array per record.
            if name in reactivities:
                sequences[name] = np.frombuffer(seq, dtype='S1')

    seq_names = list(sequences.keys())
    reactivities_concat = np.concatenate(
        [reactivities[name] for name in seq_names])
    # Not used by the plot below; kept as in the original, which looks
    # like work in progress.
    sequences_concat = np.concatenate([sequences[name] for name in seq_names])
    notnan_mask = ~np.isnan(reactivities_concat)

    # plot overall distribution of SHAPE reactivity
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.hist(reactivities_concat[notnan_mask], bins=50)
    ax.set_xlabel('Reactivity')
    ax.set_ylabel('Counts')
    # savefig() requires a target; the original call passed none and raised
    # TypeError. NOTE(review): args.output_file is assumed by analogy with
    # the sibling commands in this file -- confirm the option name.
    plt.savefig(args.output_file)
    plt.close(fig)
def get_te_icshape(args):
    """Cut per-region slices out of an icSHAPE track and save them.

    Each row of the header-less table ``args.bed_file`` supplies a sequence
    name (column 0), a start and end (columns 1 and 2) and a region label
    (column 3). Rows whose sequence has no ``args.feature`` data are skipped.
    The slices are stored in ``args.output_file`` under names of the form
    ``label,sequence,start,end``.
    """
    import numpy as np
    import h5py
    import pandas as pd
    from genomic_data import GenomicData

    logger.info('read TE region file: ' + args.bed_file)
    regions = pd.read_table(args.bed_file, header=None)
    regions[3] = regions[3].astype('S')

    logger.info('read icSHAPE data file: ' + args.icshape_file)
    icshape = GenomicData(args.icshape_file)

    logger.info('create output file: ' + args.output_file)
    te_names = []
    te_data = []
    for record in regions.itertuples(index=False):
        seq_name, start, end = record[0], record[1], record[2]
        values = icshape.feature(args.feature, seq_name)
        if values is None:
            continue
        te_data.append(values[start:end])
        te_names.append('%s,%s,%d,%d' % (record[3], seq_name, start, end))

    logger.info('create output file: ' + args.output_file)
    result = GenomicData.from_data(names=te_names,
                                   features={args.feature: te_data})
    result.save(args.output_file)
def __call__(self):
    """Save a 20-bin histogram of the non-NaN values of ``self.feature``.

    Every value contributes ``self.weight`` to its bin. Axis labels come
    from ``self.xlabel``/``self.ylabel``, the optional title from
    ``self.title``, and the figure is written to ``self.outfile``.
    """
    import_matplotlib()
    import numpy as np

    data = GenomicData(self.infile, feature_names=[self.feature])
    figure, axes = plt.subplots(figsize=(4, 4))

    nan_mask = np.isnan(data.features[self.feature])
    valid_data = data.features[self.feature][np.logical_not(nan_mask)]
    per_value_weight = np.full(len(valid_data), self.weight)
    axes.hist(valid_data, weights=per_value_weight, bins=20, color='#808080')

    axes.set_xlabel(self.xlabel)
    axes.set_ylabel(self.ylabel)
    plt.tight_layout()
    if self.title:
        axes.set_title(self.title)

    self.logger.info('save figure: {}'.format(self.outfile))
    prepare_output_file(self.outfile)
    plt.savefig(self.outfile)
def icshape_raw_rt_to_genomic_data(infile, logger=None):
    """Parse an icSHAPE ``.rt`` text file into a GenomicData object.

    The first line of ``infile`` is skipped as a header. The remaining
    non-blank lines are consumed in pairs of tab-separated records: columns
    0-2 of the first line give name, length and RPKM (only the part before
    the first comma is used), and columns 3+ of both lines hold per-position
    values that are truncated to int32.

    Args:
        infile: path of the ``.rt`` file.
        logger: optional logger; a logger named after this function is used
            when omitted.

    Returns:
        The object built by ``GenomicData.from_data`` with features
        ``rt_stop`` and ``base_density`` and metadata ``rpkm`` and ``length``.

    Raises:
        ValueError: if the file ends in the middle of a two-line record.
    """
    import numpy as np
    from genomic_data import GenomicData

    if not logger:
        logger = logging.getLogger('icshape_raw_rt_to_genomic_data')
    logger.info('read input rt file: ' + infile)
    name = []
    length = []
    rpkm = []
    rt_stop = []
    base_density = []
    with open(infile, 'r') as f:
        f.readline()  # header line
        first_of_pair = True
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank lines (typically a trailing one) used to raise
                # IndexError on c[1]; they carry no data.
                continue
            c = stripped.split('\t')
            # NOTE(review): the converter method in this file that logs the
            # same 'read input rt file' message assigns the two lines in the
            # opposite order (first line -> rt_stop). One of the two is
            # wrong; confirm against the .rt format.
            if first_of_pair:
                name.append(c[0])
                length.append(int(c[1]))
                rpkm.append(float(c[2].split(',')[0]))
                base_density.append(
                    np.asarray(c[3:], dtype='float').astype('int32'))
            else:
                rt_stop.append(
                    np.asarray(c[3:], dtype='float').astype('int32'))
            first_of_pair = not first_of_pair
    if len(rt_stop) != len(base_density):
        # An unpaired line would silently misalign the per-sequence lists.
        raise ValueError('incomplete record at end of rt file: ' + infile)
    n_records = len(name)
    logger.info('successfully read %d records' % n_records)
    data = GenomicData.from_data(name,
                                 features={
                                     'rt_stop': rt_stop,
                                     'base_density': base_density
                                 },
                                 meta={
                                     'rpkm': np.asarray(rpkm, dtype='float64'),
                                     'length': np.asarray(length, dtype='int64')
                                 })
    return data
def __call__(self):
    """Tabulate total/train/test sample counts for each dataset and region.

    For every directory in ``self.indirs`` and every region, the number of
    non-NaN values of the selected feature is read from ``<region>.h5`` and
    the number of rows of ``X_train``/``X_test`` from
    ``deepfold/r=<region>,p=5,w=100.h5``. The table is written to
    ``self.outfile`` as TSV.
    """
    import pandas as pd
    import numpy as np
    import h5py

    regions = ['all', '3UTR', '5UTR', 'lncRNA', 'CDS']
    records = []
    for indir in self.indirs:
        for region in regions:
            deepfold_dataset = 'r={},p=5,w=100.h5'.format(region)
            data = GenomicData(os.path.join(indir, '{}.h5'.format(region)))
            if not self.feature:
                # Default to the first feature in the file. list() keeps
                # this working when keys() returns a view.
                feature = list(data.features.keys())[0]
            else:
                feature = self.feature
            values = data.features[feature]
            n_samples_total = len(values) - np.isnan(values).sum()
            # The context manager releases the handle even if a dataset
            # key is missing.
            with h5py.File(os.path.join(indir, 'deepfold', deepfold_dataset), 'r') as f:
                n_samples_train = f['X_train'].shape[0]
                n_samples_test = f['X_test'].shape[0]
            records.append((indir, deepfold_dataset, region,
                            n_samples_total, n_samples_train, n_samples_test))

    df = pd.DataFrame.from_records(
        records,
        columns=('dataset', 'deepfold_dataset', 'region',
                 'n_samples_total', 'n_samples_train', 'n_samples_test'))
    self.logger.info('save file: {}'.format(self.outfile))
    prepare_output_file(self.outfile)
    df.to_csv(self.outfile, sep='\t', index=False)
def __call__(self):
    """Tabulate and plot base composition as a function of score.

    Scores of ``self.feature`` are divided into ``self.bins`` bins, either
    with equal counts (``bin_method == 'percentile'``) or equal width
    (``'value'``). For every scored position the base at
    ``position + self.offset`` is counted in the bin of the score. Per-bin
    base fractions are written to ``<prefix>.txt`` and plotted to
    ``<prefix>.pdf``.

    Raises:
        ValueError: if no feature is given and the file does not contain
            exactly one, or if the bin method is unknown.
    """
    from formats import read_fasta
    from tqdm import tqdm
    import numpy as np
    import pandas as pd
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    self.logger.info('read sequence file: ' + self.sequence_file)
    sequences = dict(read_fasta(self.sequence_file))
    self.logger.info('read input file: ' + self.infile)
    data = GenomicData(self.infile)
    if self.feature is None:
        # list() keeps this working when keys() returns a view.
        feature_names = list(data.features.keys())
        if len(feature_names) == 1:
            self.feature = feature_names[0]
        else:
            raise ValueError('multiple features found in the input file and the feature is not specified')

    scores_all = data.features[self.feature]
    scores_avail = scores_all[np.logical_not(np.isnan(scores_all))]
    self.logger.info('use bin method: %s' % self.bin_method)
    # percentiles holds the bins + 1 bin edges. The outer edges are widened
    # by 1e-6 so the extreme values fall inside the half-open bins (lo, hi].
    if self.bin_method == 'percentile':
        qs = np.arange(1, self.bins + 1, dtype='float')*100.0/self.bins
        percentiles = np.zeros(self.bins + 1, dtype='float')
        percentiles[0] = scores_avail.min() - 1e-6
        percentiles[1:self.bins] = np.percentile(scores_avail, qs[:-1])
        percentiles[self.bins] = scores_avail.max() + 1e-6
    elif self.bin_method == 'value':
        density, percentiles = np.histogram(scores_avail, bins=self.bins, density=True)
        # density is per unit of score, so it must be weighted by the bin
        # width before accumulating into a cumulative percentage.
        qs = np.cumsum(density*np.diff(percentiles))*100.0
        percentiles[0] -= 1e-6
        percentiles[-1] += 1e-6
    else:
        raise ValueError('unknown bin method: %s' % self.bin_method)

    # freqs[i]['A']: frequency of A in bin i
    freqs = [{a: 0 for a in self.alphabet} for _ in range(self.bins)]
    self.logger.info('count base frequencies with offset %d' % self.offset)
    for name in tqdm(data.names):
        scores_ts = data.feature(self.feature, name)
        avail_ind = np.nonzero(np.logical_not(np.isnan(scores_ts)))[0]
        seq_ts = np.frombuffer(sequences[name], dtype='S1')
        # Shift to the position whose base is counted, then drop indices
        # that fall off either end of the sequence.
        avail_ind += self.offset
        if self.offset > 0:
            avail_ind = avail_ind[avail_ind < len(seq_ts)]
        elif self.offset < 0:
            avail_ind = avail_ind[avail_ind >= 0]
        scores_avail_ts = scores_ts[avail_ind - self.offset]
        seq_avail_ts = seq_ts[avail_ind]
        for i in range(self.bins):
            in_bin = np.logical_and(scores_avail_ts <= percentiles[i + 1],
                                    scores_avail_ts > percentiles[i])
            seq_bin = seq_avail_ts[in_bin]
            for a in self.alphabet:
                freqs[i][a] += np.count_nonzero(seq_bin == a)

    # normalize base frequencies for each percentile
    freq_total = []
    for i in range(self.bins):
        total = sum(freqs[i].values())
        freq_total.append(total)
        for a in self.alphabet:
            if total == 0:
                # Empty bin: report a uniform composition.
                freqs[i][a] = 1.0/len(self.alphabet)
            else:
                freqs[i][a] = float(freqs[i][a])/total

    table_file = self.prefix + '.txt'
    self.logger.info('save results to file: ' + table_file)
    prepare_output_file(table_file)
    df = []
    for i in range(self.bins):
        for a in self.alphabet:
            df.append((i, qs[i], percentiles[i], a, freq_total[i], freqs[i][a]))
    df = pd.DataFrame.from_records(
        df, columns=['bin', 'q', 'percentile', 'base', 'total_freq', 'fraction'])
    df.to_csv(table_file, sep='\t', index=False)

    # plot the distribution
    self.logger.info('create plot')
    plt.rcParams['font.family'] = 'Arial'
    plt.rcParams['axes.labelsize'] = 'medium'
    plt.rcParams['xtick.labelsize'] = 'x-small'
    plt.rcParams['ytick.labelsize'] = 'x-small'
    plt.rcParams['axes.titlesize'] = 'medium'
    fig, ax = plt.subplots(figsize=(7, 5))
    x = np.arange(self.bins)
    xticklabels = ['%.2f' % a for a in percentiles[1:]]
    for base in self.alphabet:
        sub_df = df[df['base'] == base]
        ax.plot(x, sub_df['fraction'], label=base)
    ax.set_xticks(x)
    ax.set_xticklabels(xticklabels)
    ax.set_ylim(0, 1)
    ax.set_xlabel('Values')
    ax.set_ylabel('Base fraction')
    ax.legend()
    plt.tight_layout()
    plot_file = self.prefix + '.pdf'
    self.logger.info('save plot to file: ' + plot_file)
    plt.savefig(plot_file)
    plt.close(fig)
def __call__(self):
    """Compare BUMHMM posteriors with icSHAPE scores, one transcript at a time.

    For every name present in both files, positions where either track is
    NaN are dropped and the posteriors are binarised at 0.5. Transcripts
    whose binarised posteriors contain a single class are skipped. For the
    rest, ROC AUC, t-test p-value, Pearson and Spearman correlation and a
    normalised mutual information are computed. The table goes to
    ``<prefix>.txt``. ``<prefix>.pdf`` receives a scatter plot of the first
    ten usable transcripts and one histogram per metric.
    """
    import_matplotlib()
    import numpy as np
    import h5py
    import pandas as pd
    from sklearn.metrics import roc_auc_score
    from scipy.stats import pearsonr, spearmanr, ttest_ind

    def normalized_mutual_information(p, epsilon=1e-12):
        # p is a 2-D table of joint counts; epsilon avoids log(0).
        p = p + epsilon
        p /= p.sum()
        px = p.sum(axis=1)
        py = p.sum(axis=0)
        pxpy = np.dot(px.reshape(-1, 1), py.reshape(1, -1))
        Hxy = np.sum(p*np.log(p/pxpy))
        # Hx and Hy are negative entropies; their product is positive.
        Hx = np.sum(px*np.log(px))
        Hy = np.sum(py*np.log(py))
        return Hxy/np.sqrt(Hx*Hy)

    self.logger.info('read BUMHMM posteriors: ' + self.bumhmm_file)
    bumhmm = GenomicData(self.bumhmm_file, feature_names=['bumhmm'])
    self.logger.info('read icSHAPE scores: ' + self.icshape_file)
    icshape = GenomicData(self.icshape_file, feature_names=['icshape'])
    # Names that occur in both files appear twice in the concatenation.
    names, counts = np.unique(np.concatenate((bumhmm.names, icshape.names)),
                              return_counts=True)
    common_names = names[counts >= 2]

    metrics = ('roc_auc', 't_test_p', 'pearsonr', 'spearmanr', 'normalized_mi')
    correlation = {}
    for metric in metrics:
        correlation[metric] = np.full(len(common_names), np.nan)
    values_bumhmm_selected = []
    values_icshape_selected = []
    # 20 equal-width bins on [0, 1] for the mutual-information table.
    bins = np.linspace(0.0, 1.0, 21)
    for i, name in enumerate(common_names):
        values_icshape = icshape.feature('icshape', name)
        values_bumhmm = bumhmm.feature('bumhmm', name)
        valid_index = np.nonzero(np.logical_not(np.logical_or(
            np.isnan(values_icshape), np.isnan(values_bumhmm))))[0]
        values_icshape = values_icshape[valid_index]
        values_bumhmm = values_bumhmm[valid_index]
        values_bumhmm_binary = (values_bumhmm > 0.5).astype('int32')
        if values_bumhmm_binary.sum() in (0, len(values_bumhmm_binary)):
            self.logger.warning('ignoring %s because only one class is defined by the BUMHMM posteriors' % name)
            continue
        # Only the first ten usable transcripts go into the scatter plot.
        if len(values_icshape_selected) < 10:
            values_icshape_selected.append(values_icshape)
            values_bumhmm_selected.append(values_bumhmm)
        correlation['roc_auc'][i] = roc_auc_score(values_bumhmm_binary, values_icshape)
        a = values_icshape[values_bumhmm_binary == 0]
        b = values_icshape[values_bumhmm_binary == 1]
        correlation['t_test_p'][i] = ttest_ind(a, b)[1]
        correlation['pearsonr'][i] = pearsonr(values_icshape, values_bumhmm)[0]
        correlation['spearmanr'][i] = spearmanr(values_icshape, values_bumhmm)[0]
        pxy = np.empty((2, 20), dtype='float64')
        pxy[0] = np.histogram(a, bins=bins)[0]
        pxy[1] = np.histogram(b, bins=bins)[0]
        correlation['normalized_mi'][i] = normalized_mutual_information(pxy)

    # Infinite values become NaN so the dropna below removes them.
    for metric in correlation:
        correlation[metric][np.isinf(correlation[metric])] = np.nan
    df = pd.DataFrame(correlation)
    df['seqname'] = common_names
    df = df.dropna(axis=0, how='any')
    table_file = self.prefix + '.txt'
    self.logger.info('save correlations to file: ' + table_file)
    prepare_output_file(table_file)
    df.to_csv(table_file, sep='\t', index=False)

    plot_file = self.prefix + '.pdf'
    self.logger.info('save plot file: ' + plot_file)
    with PdfPages(plot_file) as pdf:
        # np.concatenate raises on an empty list, so the scatter page is
        # skipped when no transcript was usable.
        if values_icshape_selected:
            values_bumhmm_selected = np.concatenate(values_bumhmm_selected)
            values_icshape_selected = np.concatenate(values_icshape_selected)
            fig, ax = plt.subplots(figsize=(8, 6))
            ax.scatter(values_icshape_selected, values_bumhmm_selected,
                       s=1, edgecolor='none')
            ax.set_xlabel('icSHAPE scores')
            ax.set_ylabel('BUMHMM posterior probabilities')
            ax.set_title('icSHAPE scores and BUMHMM posteriors')
            pdf.savefig(fig)
            plt.clf()
            plt.close(fig)

        for metric in metrics:
            fig, ax = plt.subplots(figsize=(8, 6))
            # Equality test: the original `in ('t_test_p')` was a substring
            # check against a plain string, not tuple membership.
            if metric == 't_test_p':
                ax.hist(np.log(df[metric] + 1e-12), bins=50)
            else:
                ax.hist(df[metric], bins=50)
            ax.set_title('Correlation between icSHAPE scores and BUMHMM (%s)' % metric)
            ax.set_xlabel(metric)
            ax.set_ylabel('Counts')
            plt.tight_layout()
            pdf.savefig(fig)
            plt.clf()
            plt.close(fig)
def __call__(self):
    """Score RNAfold structure predictions against known per-base labels.

    Every record of ``self.infile`` (read with ``read_rnafold``) is turned
    into a 0/1 vector that is 1 wherever the structure string is not ``'.'``.
    The reference vector is looked up in ``self.known_file`` by exact name,
    falling back to a unique known name that starts with the record name.
    Each metric in ``self.metrics`` is evaluated per sequence and the table
    is written to ``self.outfile`` as TSV.

    Raises:
        ValueError: if a sequence has no match, or several prefix matches,
            among the known names.
    """
    from genomic_data import GenomicData
    import pandas as pd
    import numpy as np

    known = GenomicData(self.known_file, [self.feature])
    y_pred = []
    y_true = []
    names = []
    length = []
    for name, seq, structure, energy in read_rnafold(self.infile):
        names.append(name)
        structure = np.frombuffer(structure, dtype='S1')
        length.append(len(structure))
        y_pred.append((structure != '.').astype('int32'))
        y_true_seq = known.feature(self.feature, name)
        if y_true_seq is None:
            # A list comprehension instead of np.nonzero(map(...)): on
            # Python 3 a map object is not a boolean sequence and the old
            # form picked the wrong entry or failed.
            found = [known_name for known_name in known.names
                     if known_name.startswith(name)]
            if len(found) == 0:
                raise ValueError('sequence {} could not be found'.format(name))
            elif len(found) == 1:
                self.logger.warning('partial sequence name match {} => {}'.format(found[0], name))
                y_true_seq = known.feature(self.feature, found[0])
            else:
                raise ValueError('multiple partial matches found for {}'.format(name))
        y_true.append(y_true_seq)

    self.logger.info('calculate metrics by sequence')
    # Resolve each scorer once instead of once per sequence.
    scorers = [(get_scorer(metric), get_scorer_type(metric))
               for metric in self.metrics]
    records = []
    for i in range(len(names)):
        y_true_ = y_true[i]
        y_pred_ = y_pred[i]
        # The predictions are already 0/1, so they double as labels.
        y_pred_labels_ = y_pred_
        scores = []
        for scorer, scorer_type in scorers:
            if scorer_type == 'continuous':
                # Continuous scorers can raise ValueError; record NaN then.
                try:
                    score = scorer(y_true_, y_pred_)
                except ValueError:
                    score = np.nan
            else:
                score = scorer(y_true_, y_pred_labels_)
            scores.append(score)
        records.append([names[i], length[i]] + scores)
    records = pd.DataFrame.from_records(
        records, columns=['name', 'length'] + list(self.metrics))
    self.logger.info('save metric by sequence file: ' + self.outfile)
    prepare_output_file(self.outfile)
    records.to_csv(self.outfile, sep='\t', index=False, na_rep='nan')
def __call__(self):
    """Run a trained Keras model over sequences and write the requested outputs.

    Input is read according to ``self.format`` (``fasta``, ``ct_dir``,
    ``ct``, ``rnafold`` or ``genomic_data``). Every format except ``fasta``
    also provides a per-base reference ("structure"). Sequences are cut
    into windows with ``self.sequence_to_windows``, one-hot encoded and
    passed to the model. Multi-column predictions are reduced per transcript
    with ``self.predict_dense``. Optional outputs: dense predictions
    (``self.dense_pred_file``), a restraint table (``self.restraint_file``),
    overall metrics (``self.metric_file``), per-sequence metrics
    (``self.metric_by_sequence_file``) and per-sequence predictions
    (``self.pred_file``).

    Raises:
        ValueError: if ``self.format`` is unknown, or if per-sequence
            metrics are requested for input without reference structures.
    """
    import numpy as np
    import pandas as pd
    import h5py
    from formats import read_rnafold, structure_to_pairs

    self.logger.info('load model: {}'.format(self.model_file))
    model = keras.models.load_model(self.model_file)
    window_size = K.int_shape(model.input)[1]
    self.logger.info('load input data (in %s format): %s' % (self.format, self.infile))
    have_structure = False
    if self.format == 'fasta':
        # list of tuples: (name, seq)
        input_data = list(read_fasta(self.infile))
    elif self.format == 'ct_dir':
        # read all .ct files from the directory
        # list of tuples: (name, seq, pairs)
        input_data = []
        for filename in os.listdir(self.infile):
            title, seq, pairs = read_ct(os.path.join(self.infile, filename))
            title = os.path.splitext(filename)[0]
            input_data.append((title, seq, pairs))
        have_structure = True
    elif self.format == 'ct':
        title, seq, pairs = read_ct(self.infile)
        title = os.path.splitext(os.path.basename(self.infile))[0]
        input_data = [(title, seq, pairs)]
        have_structure = True
    elif self.format == 'rnafold':
        input_data = []
        for name, seq, structure, energy in read_rnafold(self.infile, parse_energy=False):
            pairs = structure_to_pairs(structure)
            input_data.append((name, seq, pairs))
        have_structure = True
    elif self.format == 'genomic_data':
        from genomic_data import GenomicData
        input_data = []
        data = GenomicData(self.infile)
        for name in data.names:
            # tobytes() is the current name of the deprecated tostring().
            input_data.append((name,
                               data.feature('sequence', name).tobytes(),
                               data.feature('reactivity', name)))
        del data
        have_structure = True
    else:
        # Previously this surfaced later as a NameError on input_data.
        raise ValueError('unknown input format: %s' % self.format)

    # combine all structures (base-pairs) into one array in the ct file
    if have_structure:
        structure = np.concatenate(
            [np.asarray(item[2], dtype='int32') for item in input_data])
    else:
        structure = None

    # offset default to the center of the window
    if self.offset is None:
        # Floor division keeps the offset an integer on Python 3 as well.
        self.offset = (window_size + 1)//2
    offset = self.offset

    # convert sequences to windows
    names = []
    windows = []
    length = []
    sequence = []
    for item in input_data:
        name = item[0]
        seq = item[1]
        windows += self.sequence_to_windows(seq, window_size, offset)
        names.append(name)
        length.append(len(seq))
        sequence.append(seq)
    # combine all sequences into one dataset
    sequence = np.frombuffer(''.join(sequence), dtype='S1')
    length = np.asarray(length, dtype='int64')
    n_samples = len(windows)
    windows = np.frombuffer(''.join(windows), dtype='S1').reshape((n_samples, window_size))
    X = onehot_encode(windows, self.alphabet)
    # set one-hot coding of padded sequence to [0.25, 0.25, 0.25, 0.25]
    X[X.sum(axis=2) == 0] = 1.0/len(self.alphabet)

    self.logger.info('run the model')
    y_pred = model.predict(X, batch_size=self.batch_size)
    y_pred = np.squeeze(y_pred)
    if self.swap_labels:
        self.logger.info('swap labels')
        y_pred = 1 - y_pred

    # 0-based start/end position of each transcript in the concatenated
    # arrays (y_pred, sequence, structure)
    end = np.cumsum(length)
    start = end - length

    if len(y_pred.shape) > 1:
        # average the predictions
        self.logger.info('average windows for dense prediction')
        y_pred_dense = [self.predict_dense(y_pred[start[i]:end[i]], offset)
                        for i in range(len(input_data))]
        if self.dense_pred_file:
            self.logger.info('save dense predictions: ' + self.dense_pred_file)
            with h5py.File(self.dense_pred_file, 'w') as f:
                for i in range(len(names)):
                    g = f.create_group(names[i])
                    g.create_dataset('predicted_values_dense', data=y_pred[start[i]:end[i]])
                    g.create_dataset('predicted_values_average', data=y_pred_dense[i])
                    g.create_dataset('sequence', data=sequence[start[i]:end[i]])
                    if structure is not None:
                        g.create_dataset('structure', data=structure[start[i]:end[i]])
        y_pred = np.concatenate(y_pred_dense)
    y_pred_labels = np.round(y_pred).astype('int32')

    if self.restraint_file:
        table = pd.DataFrame()
        table['name'] = np.repeat(np.asarray(names, dtype='S'), length)
        # Start of the owning transcript, repeated for each of its bases.
        # Kept in its own variable: the original referenced an undefined
        # name here and overwrote `start`, which the per-transcript slices
        # below still need.
        base_start = np.repeat(start, length)
        # position (1-based) relative to the transcript
        table['position'] = np.arange(1, length.sum() + 1) - base_start
        table['pred'] = y_pred_labels
        table['base'] = sequence
        table['true'] = structure
        self.logger.info('write restraint file: ' + self.restraint_file)
        prepare_output_file(self.restraint_file)
        table.to_csv(self.restraint_file, sep='\t', index=False)

    if self.metric_file:
        self.logger.info('save metric file: ' + self.metric_file)
        prepare_output_file(self.metric_file)
        from sklearn.metrics import accuracy_score
        with h5py.File(self.metric_file, 'w') as f:
            f.create_dataset('y_pred', data=y_pred)
            f.create_dataset('y_pred_labels', data=y_pred_labels)
            if have_structure:
                y_true = (structure > 0).astype('int32')
                f.create_dataset('y_true', data=y_true)
                g = f.create_group('metrics')
                for metric in self.metrics:
                    scorer = get_scorer(metric)
                    # 'continuous' was misspelled here, so continuous
                    # metrics were always scored on the rounded labels.
                    if get_scorer_type(metric) == 'continuous':
                        score = scorer(y_true, y_pred)
                    else:
                        score = scorer(y_true, y_pred_labels)
                    self.logger.info('%s: %f' % (metric, score))
                    g.create_dataset(metric, data=score)

    if self.metric_by_sequence_file:
        if structure is None:
            raise ValueError('reference structures are required to calculate metrics by sequence')
        self.logger.info('calculate metrics by sequence')
        records = []
        for i in range(len(names)):
            y_true_ = (structure[start[i]:end[i]] > 0).astype('int32')
            y_pred_ = y_pred[start[i]:end[i]]
            y_pred_labels_ = y_pred_labels[start[i]:end[i]]
            scores = []
            for metric in self.metrics:
                scorer = get_scorer(metric)
                if get_scorer_type(metric) == 'continuous':
                    # Continuous scorers can raise ValueError; record NaN.
                    try:
                        score = scorer(y_true_, y_pred_)
                    except ValueError:
                        score = np.nan
                else:
                    score = scorer(y_true_, y_pred_labels_)
                scores.append(score)
            records.append([names[i], length[i]] + scores)
        records = pd.DataFrame.from_records(
            records, columns=['name', 'length'] + list(self.metrics))
        self.logger.info('save metric by sequence file: ' + self.metric_by_sequence_file)
        prepare_output_file(self.metric_by_sequence_file)
        records.to_csv(self.metric_by_sequence_file, sep='\t', index=False, na_rep='nan')

    if self.pred_file:
        self.logger.info('save predictions to file: ' + self.pred_file)
        prepare_output_file(self.pred_file)
        with h5py.File(self.pred_file, 'w') as f:
            for i in range(len(names)):
                g = f.create_group(names[i])
                g.create_dataset('sequence', data=sequence[start[i]:end[i]])
                g.create_dataset('predicted_values', data=y_pred[start[i]:end[i]])
                # Store the rounded labels; the original wrote y_pred twice.
                g.create_dataset('predicted_labels', data=y_pred_labels[start[i]:end[i]])
                # FASTA input has no reference, so there are no true labels.
                if structure is not None:
                    y_true_ = (structure[start[i]:end[i]] > 0).astype('int32')
                    g.create_dataset('true_labels', data=y_true_)
def analyze_reactivity_periodicity(args):
    """Plot average reactivity profiles aligned at the 5' and 3' ends.

    Reactivities are read from ``args.input_file`` according to
    ``args.assay_type`` (``shapemap``: HDF5 group ``reactivities``;
    ``icshape``/``rt_stop``: the GenomicData feature of the same name).
    Transcripts whose fraction of non-NaN positions is below
    ``args.min_coverage`` are dropped. The first and last
    ``args.aligned_length`` positions are averaged over transcripts
    (ignoring NaN) and written, together with two ``plot_fft`` pages, to
    the multi-page PDF ``args.output_file``.
    """
    import h5py
    import numpy as np
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    from matplotlib.backends.backend_pdf import PdfPages
    import seaborn as sns
    sns.set()
    from genomic_data import GenomicData
    from scipy.fftpack import fft
    from formats import read_fasta
    from ioutils import prepare_output_file

    logger.info('read input file: ' + args.input_file)
    reactivities = {}
    if args.assay_type == 'shapemap':
        with h5py.File(args.input_file, 'r') as f:
            for tx_id in f['reactivities'].keys():
                reactivities[tx_id] = f['reactivities/' + tx_id][:]
    elif args.assay_type in ('icshape', 'rt_stop'):
        data = GenomicData(args.input_file)
        for name in data.names:
            reactivities[name] = data.feature(args.assay_type, name)

    # Keep the names as the dict's own keys. Converting them to an 'S'
    # array fails on a Python 3 keys view and would produce bytes that no
    # longer match str keys.
    seq_names = list(reactivities.keys())
    # get coverage of each transcript
    lengths = np.asarray([reactivities[name].shape[0] for name in seq_names])
    coverage = np.asarray(
        [np.sum(~np.isnan(reactivities[name])) for name in seq_names])
    coverage = coverage.astype(np.float64) / lengths
    # filter sequences by coverage (NaN coverage compares False and is dropped)
    reactivities = {name: reactivities[name]
                    for name, cov in zip(seq_names, coverage)
                    if cov >= args.min_coverage}

    def calc_average_reactivities(reactivities, direction, aligned_length=100):
        # One row per transcript, NaN-padded where the transcript is
        # shorter than aligned_length.
        reactivities_avg = np.full((len(reactivities), aligned_length), np.nan)
        for i, x in enumerate(reactivities.values()):
            L = min(x.shape[0], aligned_length)
            if direction == '5p':
                reactivities_avg[i, :L] = x[:L]
            elif direction == '3p':
                reactivities_avg[i, -L:] = x[-L:]
        # Column-wise mean over the transcripts that have a value there.
        transcript_counts = np.sum(~np.isnan(reactivities_avg), axis=0)
        reactivities_avg = np.nan_to_num(reactivities_avg)
        reactivities_avg = np.sum(
            reactivities_avg, axis=0) / transcript_counts.astype(np.float64)
        return reactivities_avg

    logger.info('create output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    aligned_length = args.aligned_length
    with PdfPages(args.output_file) as pdf:
        # 5'-end
        reactivities_avg_5p = calc_average_reactivities(
            reactivities, '5p', aligned_length=aligned_length)
        fig, ax = plt.subplots(figsize=(18, 4))
        ax.plot(np.arange(aligned_length), reactivities_avg_5p)
        ax.set_xlabel('Position in CDS from 5\'-end')
        ax.set_ylabel('Reactivity')
        ax.set_xlim(0, aligned_length)
        pdf.savefig()
        plt.close()

        # 3'-end
        reactivities_avg_3p = calc_average_reactivities(
            reactivities, '3p', aligned_length=aligned_length)
        fig, ax = plt.subplots(figsize=(18, 4))
        ax.plot(np.arange(-aligned_length, 0), reactivities_avg_3p)
        ax.set_xlabel('Distance from CDS from 3\'-end')
        ax.set_ylabel('Reactivity')
        ax.set_xlim(-aligned_length, 0)
        pdf.savefig()
        plt.close()

        ## FFT
        # NOTE(review): plot_fft is not defined in this function;
        # presumably a module-level helper that draws on the current
        # figure -- confirm.
        plot_fft(reactivities_avg_5p, 'CDS from 5\'-end')
        pdf.savefig()
        plt.close()

        plot_fft(reactivities_avg_3p, 'CDS from 3\'-end')
        pdf.savefig()
        plt.close()