def write_ensemble_metrics(row, id_field, init_len=14):
    """Return unpaired-probability summaries for one row, caching the bpp matrix.

    The base pair probability matrix is cached in the current working
    directory as a gzip-compressed ``.npy`` file named ``bpps_<id>.npy.gz``,
    where ``<id>`` is ``str(row[id_field])``. If that file exists it is
    loaded; otherwise the matrix is computed with
    ``bpps(row['sequence'], package='eternafold')`` and written to the cache.

    Args:
        row: Mapping-like record; must provide ``row[id_field]`` and, on a
            cache miss, ``row['sequence']``.
        id_field: Key in ``row`` whose value identifies the cache file.
        init_len: Number of leading positions summed into ``sup_init``.
            Defaults to 14, the value previously hard-coded.

    Returns:
        Tuple ``(aup, sup_init)``: the mean of the per-position unpaired
        probabilities, and the sum of the first ``init_len`` of them.
    """
    bp_filename = 'bpps_%s.npy.gz' % str(row[id_field])

    if os.path.exists(bp_filename):
        # Cache hit: the context manager guarantees the gzip handle is
        # released (the handle used to be left open after loading).
        with gzip.GzipFile(bp_filename, "r") as f:
            bpp_mat = np.load(f)
        print('loaded ', row[id_field])
    else:
        # Cache miss: calculate bpps, then persist them for the next call.
        bpp_mat = bpps(row['sequence'], package='eternafold')
        with gzip.GzipFile(bp_filename, 'w') as f:
            np.save(file=f, arr=bpp_mat)
        # Report only after the cache file has actually been written.
        print("wrote ", row[id_field])

    # Unpaired probability per position: one minus the column sums of the
    # pairing probabilities.
    punp_vector = 1 - np.sum(bpp_mat, axis=0)
    aup = np.mean(punp_vector)
    sup_init = np.sum(punp_vector[:init_len])
    return aup, sup_init
def threshknot_util(sequence, package='vienna_2', theta=0):
    """Keep only mutually-best base pairs in a base pair probability matrix.

    Args:
        sequence: RNA sequence; its length sets the size of the working
            selection matrix.
        package: Folding package name, forwarded to ``bpps``.
        theta: Probability cutoff. Entries less than or equal to ``theta``
            are zeroed before selection (with the default of 0, only
            entries that are already <= 0 are touched).

    Returns:
        The matrix returned by ``bpps`` (modified in place). An entry
        (i, j) keeps its probability only when both (i, j) and (j, i) equal
        the maximum of their respective columns; every other entry is set
        to zero. Surviving nonzero entries are the selected (possibly
        pseudoknotted) base pairs.
    """
    pair_probs = bpps(sequence, package=package)

    # Optional filter: drop base pair probabilities at or below the cutoff.
    pair_probs[pair_probs <= theta] = 0

    # ProbKnot heuristic, part 1: flag every entry that equals the maximum
    # of its column.
    n = len(sequence)
    is_col_max = np.zeros([n, n])
    is_col_max[pair_probs == np.max(pair_probs, axis=0)] = 1

    # ProbKnot heuristic, part 2: a pair is selected only if it is flagged
    # from both sides, i.e. in the matrix and in its transpose.
    mutual_best = np.logical_and(is_col_max, np.transpose(is_col_max))

    # Zero every probability that does not belong to a selected base pair.
    pair_probs[~mutual_best] = 0
    return pair_probs
def get_secstruct_mea(int_start, int_end, ref_seq, secstruct_interval=20):
    """Predict an MEA structure for an interval plus flanking context.

    The window ``[int_start - secstruct_interval, int_end + secstruct_interval)``
    is cut out of ``ref_seq`` (clamped to the sequence bounds), base pair
    probabilities are computed with ``bpps(..., package='contrafold_2')``,
    and MEA structures are built for gamma in {0.01, 0.1, 1, 10}. The
    structure with the highest expected MCC is kept.

    Args:
        int_start: Start index of the interval of interest in ``ref_seq``.
        int_end: End index (exclusive) of the interval of interest.
            Assumes ``0 <= int_start <= int_end <= len(ref_seq)``.
        ref_seq: Full reference sequence.
        secstruct_interval: Number of flanking positions requested on each
            side of the interval.

    Returns:
        Tuple ``(best_struct, best_mcc, conserved_str, sequence)``.
        ``best_struct`` is ``''`` and ``best_mcc`` is 0 when no gamma yields
        an expected MCC above 0. ``conserved_str`` marks the interval with
        ``*`` and the flanks with ``.`` and is aligned with ``sequence``.
    """
    # Clamp the window to ref_seq. A negative slice start would wrap around
    # to the end of the sequence and yield a wrong (often empty) window.
    win_start = max(int_start - secstruct_interval, 0)
    win_end = min(int_end + secstruct_interval, len(ref_seq))
    sequence = ref_seq[win_start:win_end]

    bp_matrix = bpps(sequence, package='contrafold_2')

    best_struct = ''
    best_mcc = 0
    for log_gamma in range(-2, 2):
        mea_mdl = MEA(bp_matrix, gamma=10**log_gamma)
        # score_expected() yields (sensitivity, ppv, mcc, f-score); only
        # the expected MCC is used for model selection.
        _exp_sen, _exp_ppv, exp_mcc, _exp_fscore = mea_mdl.score_expected()
        if exp_mcc > best_mcc:
            best_struct = mea_mdl.structure
            best_mcc = exp_mcc

    # Flank lengths follow the clamped window so the annotation stays
    # aligned with `sequence` even when the window hits a sequence boundary.
    left_flank = int_start - win_start
    right_flank = win_end - int_end
    conserved_str = (
        '.' * left_flank + '*' * (int_end - int_start) + '.' * right_flank
    )
    return (best_struct, best_mcc, conserved_str, sequence)
from arnie.utils import write_vector_to_file

# Command-line script: for every input sequence file, compute the base pair
# probability matrix and write the per-position unpaired probabilities
# (one minus the column sums) to <output dir>/<sequence id>.unp.
if __name__ == '__main__':
    p = argparse.ArgumentParser(
        description="Write unpaired posterior probabilities to files.")
    # Each positional value is opened as a file below, so the help text
    # describes files rather than a directory.
    p.add_argument("seq_dir", nargs='+', help="path(s) to *.seq files")
    p.add_argument("-o", help="name of output dir")
    p.add_argument("-p", "--package", default='vienna_2',
                   help="Package to use")

    if len(sys.argv) == 1:
        p.print_help(sys.stderr)
        sys.exit(1)

    args = p.parse_args()

    # NOTE(review): -o has no default, so args.o may be None. str() keeps the
    # historical behaviour of formatting it into the path (a directory
    # literally named "None"); consider making -o required.
    out_dir = str(args.o)
    # Create exactly the directory that is written to below. exist_ok avoids
    # the race between an existence check and the creation.
    os.makedirs(out_dir, exist_ok=True)

    for seqfile in args.seq_dir:
        print(seqfile)
        # The sequence is taken from the last line of the file; the context
        # manager closes the handle promptly.
        with open(seqfile, 'r') as seq_fh:
            seq = seq_fh.readlines()[-1].rstrip()
        seq_id = os.path.basename(seqfile).replace('.seq', '')

        unp_vector = 1 - np.sum(bpps.bpps(seq, package=args.package), axis=0)

        with open("%s/%s.unp" % (out_dir, seq_id), 'w') as f:
            write_vector_to_file(unp_vector, f)
from arnie.utils import write_matrix_to_file

# Command-line script: for every input sequence file, compute the base pair
# probability matrix and write it to <output dir>/<sequence id>.bpps.
if __name__ == '__main__':
    p = argparse.ArgumentParser(
        description="Write base pairing probability matrices to files.")
    # Each positional value is opened as a file below, so the help text
    # describes files rather than a directory.
    p.add_argument("seq_dir", nargs='+', help="path(s) to *.seq files")
    p.add_argument("-o", help="name of output dir")
    p.add_argument("-p", "--package", default='vienna_2',
                   help="Package to use")

    if len(sys.argv) == 1:
        p.print_help(sys.stderr)
        sys.exit(1)

    args = p.parse_args()

    # NOTE(review): -o has no default, so args.o may be None. str() keeps the
    # historical behaviour of formatting it into the path (a directory
    # literally named "None"); consider making -o required.
    out_dir = str(args.o)
    # Create exactly the directory that is written to below. exist_ok avoids
    # the race between an existence check and the creation.
    os.makedirs(out_dir, exist_ok=True)

    for seqfile in args.seq_dir:
        print(seqfile)
        # The sequence is taken from the last line of the file; the context
        # manager closes the handle promptly.
        with open(seqfile, 'r') as seq_fh:
            seq = seq_fh.readlines()[-1].rstrip()
        seq_id = os.path.basename(seqfile).replace('.seq', '')

        bp_matrix = bpps.bpps(seq, package=args.package)

        with open("%s/%s.bpps" % (out_dir, seq_id), 'w') as f:
            write_matrix_to_file(bp_matrix, f)
def test_bpps(pkg):
    """Run ``bpps`` on ``sample_seq`` with package ``pkg`` and print a sample.

    Prints a header naming the package, followed by the first element of
    the ``bpps`` result. Returns ``None``.
    """
    result = bpps(sample_seq, package=pkg)
    header = 'test bpps %s' % pkg
    print(header)
    first_entry = result[0]
    print(first_entry)
def calc_bpp(seq):
    """Return ``bpps`` for ``seq`` using the module-level ``args`` settings.

    The package is read from ``args.package`` and the ``T`` keyword from
    ``args.temp``; both are forwarded unchanged to ``bpps``.
    """
    fold_options = {'package': args.package, 'T': args.temp}
    return bpps(seq, **fold_options)