def simpleShuffleFasta(self, file):
    """Write a 3-let shuffled copy of a FASTA sequence, leaving N gaps in place.

    The sequence returned by ``self.readFasta(file)`` is upper-cased and
    walked left to right. A leading run of ``N`` that is at least four long
    is copied to the output unchanged; the stretch up to the next ``NNNN``
    (or up to the end of the sequence) is shuffled with
    ``ushuffle.shuffle(segment, len(segment), 3)``. The assembled sequence is
    passed through ``self.sep`` and written, below a ``>chrom`` header, to
    ``chrom + "_shuffle_3.fa"``.

    Parameters
    ----------
    file
        Passed unchanged to ``self.readFasta``, which must return a
        ``(chrom, sequence)`` pair.

    Returns
    -------
    None
    """
    chrom, chrseq = self.readFasta(file)
    chrseq = chrseq.upper()
    gap = "N" * 4
    pieces = []
    while chrseq:
        if chrseq.startswith(gap):
            # Copy the whole leading run of N verbatim.
            rest = chrseq.lstrip("N")
            pieces.append(chrseq[:len(chrseq) - len(rest)])
            chrseq = rest
            continue
        # str.find returns -1 (not False) when no further gap exists; the
        # old ``find(...) == True`` test only matched a gap at index 1.
        index = chrseq.find(gap)
        if index == -1:
            index = len(chrseq)
        segment = chrseq[:index]
        # The length handed to ushuffle must match the segment actually
        # passed in (the previous code passed index + 4).
        pieces.append(ushuffle.shuffle(segment, len(segment), 3))
        chrseq = chrseq[index:]
    out = self.sep("".join(pieces))
    with open(chrom + "_shuffle_3.fa", "w") as f:
        f.write(">" + chrom + "\n")
        f.write(out)
def simpleShuffleFasta(self, file):
    """Write a 3-let shuffled copy of a FASTA sequence, leaving N gaps in place.

    The sequence returned by ``self.readFasta(file)`` is upper-cased and
    walked left to right. A leading run of ``N`` that is at least four long
    is copied to the output unchanged; the stretch up to the next ``NNNN``
    (or up to the end of the sequence) is shuffled with
    ``ushuffle.shuffle(segment, len(segment), 3)``. The assembled sequence is
    passed through ``self.sep`` and written, below a ``>chrom`` header, to
    ``chrom + "_shuffle_3.fa"``.

    Parameters
    ----------
    file
        Passed unchanged to ``self.readFasta``, which must return a
        ``(chrom, sequence)`` pair.

    Returns
    -------
    None
    """
    chrom, chrseq = self.readFasta(file)
    chrseq = chrseq.upper()
    gap = "N" * 4
    pieces = []
    while chrseq:
        if chrseq.startswith(gap):
            # Copy the whole leading run of N verbatim.
            rest = chrseq.lstrip("N")
            pieces.append(chrseq[:len(chrseq) - len(rest)])
            chrseq = rest
            continue
        # str.find returns -1 (not False) when no further gap exists; the
        # old ``find(...) == True`` test only matched a gap at index 1.
        index = chrseq.find(gap)
        if index == -1:
            index = len(chrseq)
        segment = chrseq[:index]
        # The length handed to ushuffle must match the segment actually
        # passed in (the previous code passed index + 4).
        pieces.append(ushuffle.shuffle(segment, len(segment), 3))
        chrseq = chrseq[index:]
    out = self.sep("".join(pieces))
    with open(chrom + "_shuffle_3.fa", "w") as f:
        f.write(">" + chrom + "\n")
        f.write(out)
def padseq(input_filename,output_dirname,tablename='Zasha_20081002_plus20071102_curated'):
    """
    If we can get the real flank by querying NCBI, then get it.
    Else, pad with shuffled version (preserving dinucleotide freq).

    For every record of the FASTA file ``input_filename`` the text after the
    last ``_`` of the record id is used as ``id`` to look up
    ``acc, start, end, strand`` in ``tablename``. The region is widened by
    ``end - start + 1`` on both sides (the start is clamped to 1) and fetched
    from ``EUTIL_URL``. If the download cannot be parsed or does not have the
    expected length, the record's own sequence is written instead, flanked
    by two ``shuffle(s, seqlen, 2)`` results.

    Outputs (``<prefix>`` is ``output_dirname`` + '/' + the input basename
    without its last four characters):

    * ``<prefix>_padseqlen.fna`` -- padded sequences, headers of the form
      ``><record id>_<acc>/<final_start>-<final_end>``.
    * ``<prefix>_padseqlen.map`` -- ``dump`` of a dict mapping
      ``<acc>/<final_start>-<final_end>`` to the original record id.

    Records whose database row cannot be fetched are reported on stdout and
    skipped.
    """
    import tempfile

    conn = get_conn_ncRNA()
    cursor = conn.cursor()
    output_prefix = output_dirname+'/'+os.path.basename(input_filename)[:-4]
    map_padded_to_orig_id = {}

    # mkstemp replaces the insecure (and Python-2-only) os.tempnam; the file
    # is reused for every download and removed when we are done.
    tmp_fd, tmp_outfile = tempfile.mkstemp()
    os.close(tmp_fd)
    try:
        with open(output_prefix+'_padseqlen.fna','w') as f:
            with open(input_filename) as fasta_in:
                for r in SeqIO.parse(fasta_in,'fasta'):
                    # record ids look like <family>_<db id>
                    db_id = r.id[(r.id.rfind('_')+1):]
                    # NOTE(review): the query is assembled with str.format;
                    # db_id comes straight from the FASTA header -- consider
                    # the DB driver's parameter binding for the id value.
                    cursor.execute("select acc,start,end,strand from {0} where id={1}".format(tablename,db_id))
                    try:
                        acc,start,end,strand = cursor.fetchone()
                    except Exception:
                        # fetchone() gives None when no row matches
                        print("failed on {0}".format(db_id))
                        continue

                    seqlen = end-start+1
                    final_start = max(1,start-seqlen)
                    final_end = end+seqlen
                    padded_id = "{0}/{1}-{2}".format(acc,final_start,final_end)
                    map_padded_to_orig_id[padded_id] = r.id

                    url = EUTIL_URL + 'id=' + acc + '&strand=' + str(strand) + '&seq_start=' + str(final_start) + '&seq_stop=' + str(final_end)
                    urlretrieve(url, tmp_outfile)
                    try:
                        with open(tmp_outfile) as fetched:
                            rec = str(next(SeqIO.parse(fetched,'fasta')).seq)
                    except Exception:
                        # empty or unparsable download: fall back to shuffled flanks
                        rec = None

                    if rec is None or len(rec) != (final_end-final_start+1):
                        s = str(r.seq)
                        # NOTE(review): the length given to shuffle is the
                        # database span, not len(s) -- confirm they always agree.
                        padded = shuffle(s,seqlen,2) + s + shuffle(s,seqlen,2)
                    else:
                        padded = rec
                    f.write(">{0}_{1}\n{2}\n".format(r.id,padded_id,padded))
    finally:
        try:
            os.remove(tmp_outfile)
        except OSError:
            pass

    with open(output_prefix+'_padseqlen.map','w') as f:
        dump(map_padded_to_orig_id,f)
def shuffle_sequences(sequences, k=1):
    """Shuffle string sequences while preserving k-let frequencies.

    Parameters
    ----------
    sequences : iterable of str
        Sequences to shuffle. Each sequence is upper-cased before it is
        shuffled.
    k : int, optional
        k of k-let frequencies to preserve. For example, with k = 2,
        dinucleotide shuffle is performed. The default is k = 1
        (i.e., single-nucleotide shuffle).

    Returns
    -------
    list of str
        Shuffled (upper-case) sequences, in the same order as the input.

    Examples
    --------
    The result is random, so the output below is only illustrative.

    >>> seqs = ["AGCGTTCAA", "TACGAATCG"]
    >>> seqs_shuffled = shuffle_sequences(seqs, k=2)  # dinucleotide shuffle
    >>> seqs_shuffled
    ['AAGTTCGCA', 'TCGATAACG']
    """
    # ``shuffle`` is called with bytes and its result decoded, so each
    # sequence is encoded on the way in and decoded on the way out.
    return [
        shuffle(seq.upper().encode("utf-8"), k).decode("utf-8")
        for seq in sequences
    ]
def shuffle(self, seq):
    """Return a k-let shuffled copy of ``seq`` with random substitutions.

    ``seq`` is first shuffled by the ``shuffle`` function visible at module
    scope (presumably uShuffle's -- verify against the file's imports) using
    ``self.k``. Then ``self.mutations`` distinct positions are overwritten
    with bases drawn from "AGTC"; a drawn base may equal the one it
    replaces.
    """
    shuffled = shuffle(seq.encode(), self.k).decode("utf-8")
    bases = np.array(list(shuffled))
    # Positions are sampled without replacement, so each is hit at most once.
    positions = np.random.choice(len(seq), self.mutations, replace=False)
    bases[positions] = np.random.choice(list("AGTC"), self.mutations)
    return "".join(bases)
def create_one_negative_sample_preserve_singles_distribution(sample, k):
    """Build one negative sample by k-let shuffling the start of ``sample``.

    The first ``SAMPLE_LENGTH`` items of ``sample`` are joined with
    ``concatenate_bases``, shuffled with
    ``ushuffle.shuffle(..., SAMPLE_LENGTH, k)`` and converted with
    ``convert_sample_to_matrix``.

    Returns
    -------
    tuple
        ``(shuffled_string, matrix)`` where ``matrix`` is
        ``convert_sample_to_matrix(shuffled_string)``.
    """
    leading_bases = [sample[pos] for pos in range(SAMPLE_LENGTH)]
    joined = concatenate_bases(leading_bases)
    # shuffle the string sequence while preserving the k-let counts
    shuffled = ushuffle.shuffle(joined, SAMPLE_LENGTH, k)
    matrix = convert_sample_to_matrix(shuffled)
    return shuffled, matrix
def local_ushuffle(seq, dishuff=True):
    """Return a shuffled copy of ``seq``.

    With ``dishuff`` true the sequence is shuffled by
    ``ushuffle.shuffle(seq, len(seq), 2)`` (dinucleotide shuffle); otherwise
    its characters are permuted with ``random.shuffle``.
    """
    if not dishuff:
        letters = list(seq)
        random.shuffle(letters)
        return ''.join(letters)
    return ushuffle.shuffle(seq, len(seq), 2)
def generate_sequences(seqs, kmer, nfold, random_seed):
    """Print ``nfold`` shuffled background sequences per input record.

    After ``set_seed(random_seed)``, every record in ``seqs`` is shuffled
    ``nfold`` times with ``shuffle(..., kmer)``. Each result is wrapped in a
    ``SeqRecord`` with id ``background_seq_<n>`` (``n`` counts from 1 across
    all records) and printed to stdout in FASTA format.

    Returns
    -------
    tuple
        ``(gc_values, lengths, dinuc_totals)``: the ``GC(...)`` value and the
        length of every generated sequence, and the element-wise sum of
        ``dinuc_count`` over all generated sequences (a list starting as
        ``len(IUPAC_DINUC)`` zeros).
    """
    set_seed(random_seed)
    serial = 1
    gc_values = []
    lengths = []
    dinuc_totals = [0] * len(IUPAC_DINUC)
    for record in seqs:
        template = str(record.seq)
        description = "Background sequence for {0:s}".format(record.name)
        for _ in range(nfold):
            shuffled = shuffle(template.encode(), kmer).decode()
            background = SeqRecord(
                Seq(shuffled, IUPACData.ambiguous_dna_letters),
                id="background_seq_{0:d}".format(serial),
                description=description,
            )
            print(background.format("fasta"), end='')
            gc_values.append(GC(shuffled))
            lengths.append(len(shuffled))
            counts = dinuc_count(Seq(shuffled))
            dinuc_totals = [
                total + count for total, count in zip(dinuc_totals, counts)
            ]
            serial += 1
    return gc_values, lengths, dinuc_totals
def shuffle_window(ss, km, wl, step):
    """Shuffle ``ss`` window by window and return the result.

    Starting at offsets ``0, step, 2*step, ...`` below ``len(ss) - 1``, the
    slice of length ``wl`` at each offset is replaced by
    ``shuffle(slice, km)``. Windows are processed in order on the
    already-modified string, so overlapping windows see earlier shuffles.
    """
    result = ss[:]
    for start in range(0, len(result) - 1, step):
        stop = start + wl
        window = shuffle(str.encode(result[start:stop]), km).decode()
        result = result[:start] + window + result[stop:]
    return result
import re
import os
import sys
import glob
import mycustom
import ushuffle
from Bio import SeqIO
import pdb

# Writes 100 dinucleotide-shuffled copies of every sequence in the input
# FASTA, one output FASTA per bootstrap round.

# path with the fasta file to be simulated
filed = "/nfs/compgen-04/team218/ilias/nullomers_hg38_v2/hg38.fa"
sequenceO = mycustom.FastaFile(filed)

# Collect names and sequences in one pass so the script also works if
# FastaFile can only be iterated once (NOTE(review): not visible from here).
records = [(rec.name.upper(), rec.sequence.upper()) for rec in sequenceO]
names = [name for name, _ in records]
sequencesL = [sequence for _, sequence in records]

# Number of simulations
for k in range(1, 101):
    out_path = "sims_genome_dinucleotide/hg38_bootstrap_number_" + str(k) + "_controlling_dinucleotide_content.fasta"
    # ``with`` closes the file even if shuffling or writing fails
    with open(out_path, "w") as datafile:
        for name, sequence in zip(names, sequencesL):
            seq_random = ushuffle.shuffle(sequence, len(sequence), 2)
            datafile.write(">" + name + '_control_bootstrap_' + str(k) + '\n')
            datafile.write(seq_random + '\n')
def do_dinucleotide_shuffling(seq):
    """Return ``seq`` shuffled by ``ushuffle.shuffle`` with a k-let size of 2."""
    return ushuffle.shuffle(seq, len(seq), 2)
def _start_rnaduplex(first_seq, second_seq, rnaduplex_bin):
    """Start ``rnaduplex_bin`` with both sequences on stdin; return the Popen."""
    # NOTE(review): both sequences are interpolated unescaped into a shell
    # command (and into printf's format string). That is only safe for plain
    # nucleotide strings -- confirm the input file is trusted.
    return subprocess.Popen(
        'printf "%s\n%s" | %s' % (first_seq, second_seq, rnaduplex_bin),
        stdout=subprocess.PIPE,
        # text-mode output so the parsing below sees str on Python 2 and 3
        universal_newlines=True,
        shell=True)


def main(options):
    """Main logic of the script.

    Reads the tab-separated file ``options.input`` (at least eight columns;
    column 4 is the sequence id and column 8 the target sequence). The
    snoRNA name is the second-to-last ``:``-separated field of the sequence
    id; its sequence is the first record of ``<name>.fa`` inside
    ``options.snoRNA_paths``.

    For every row ``options.RNAduplex_bin`` is run three times: snoRNA vs
    target, a randomly chosen snoRNA vs target, and snoRNA vs the
    dinucleotide-shuffled target. The input row is written to
    ``options.output`` followed by these tab-separated columns: snoRNA
    length, ``get_GC_frac`` of the snoRNA, score, score with the random
    snoRNA, score with the shuffled target, completed structure, the two
    position fields of the real duplex, and the completed structure for the
    random snoRNA.
    """
    snorna_paths = options.snoRNA_paths
    snorna_paths_glob = glob.glob(os.path.join(snorna_paths, "*.fa"))
    with open(options.input) as infile, open(options.output, 'w') as outfile:
        for row in csv.reader(infile, delimiter="\t"):
            # The full unpack is kept so that short rows still fail loudly.
            chrom, start, end, seqid, count, strand, snorna, sequence = row[:8]
            # the snoRNA name comes from the sequence id, not from column 7
            snorna = seqid.split(":")[-2]

            with open(os.path.join(snorna_paths, snorna + '.fa')) as snorna_in:
                snorna_sequence = str(next(SeqIO.parse(snorna_in, 'fasta')).seq)
            with open(random.choice(snorna_paths_glob)) as snorna_in:
                snorna_sequence_random = str(
                    next(SeqIO.parse(snorna_in, 'fasta')).seq)

            shuffled_target = shuffle(sequence, len(sequence), 2)
            # NOTE(review): this result was never used; the call is kept so
            # the shuffler is driven exactly as before -- drop if not needed.
            shuffle(snorna_sequence, len(snorna_sequence), 2)

            # Start all three runs before collecting any output so they
            # execute concurrently.
            proc = _start_rnaduplex(
                snorna_sequence, sequence, options.RNAduplex_bin)
            proc_shuf = _start_rnaduplex(
                snorna_sequence_random, sequence, options.RNAduplex_bin)
            proc_shuf_tar = _start_rnaduplex(
                snorna_sequence, shuffled_target, options.RNAduplex_bin)
            sout, serr = proc.communicate()
            sout_shuf, serr_shuf = proc_shuf.communicate()
            sout_shuf_tar, serr_shuf_tar = proc_shuf_tar.communicate()

            # Example output line:
            # .(.((.(((((((((((((((((((....(((((&.)))).)....)))).))))))))))))))))).).  17,50 : 35,70 (-29.20)
            # NOTE(review): the score slice assumes the last token looks like
            # "(-29.20)" with no space after "(" -- confirm for energies >= 0.
            fields = sout.split()
            fields_shuf = sout_shuf.split()
            score_shuf = fields_shuf[-1][1:-1]
            score_shuf_tar = sout_shuf_tar.split()[-1][1:-1]
            score = fields[-1][1:-1]
            part1 = fields[1]
            part2 = fields[3]
            structure = get_complete_structure(snorna_sequence,
                                               fields[0].split("&")[0],
                                               part1)
            structure_random = get_complete_structure(
                snorna_sequence_random,
                fields_shuf[0].split("&")[0],
                fields_shuf[1])

            extra_columns = [
                len(snorna_sequence),
                get_GC_frac(snorna_sequence),
                score,
                score_shuf,
                score_shuf_tar,
                structure,
                part1,
                part2,
                structure_random,
            ]
            outfile.write("%s\t%s\n" % (
                "\t".join(row),
                "\t".join([str(i) for i in extra_columns])))
def kshuffle(seq, k=2):
    """Return ``shuffle(seq, len(seq), k)``; ``k`` defaults to 2."""
    length = len(seq)
    return shuffle(seq, length, k)
def shuffle_onehot(one_hot, k=1):
    """Shuffle one-hot encoded sequences, keeping k-let frequencies intact.

    Parameters
    ----------
    one_hot : np.ndarray
        One-hot encoded sequences with shape (N, L, A).
    k : int, optional
        Size of the k-lets whose frequencies are preserved. ``k = 1`` (the
        default) permutes the positions of every sequence independently;
        ``k = 2`` is a dinucleotide shuffle, and so on.

    Returns
    -------
    np.ndarray
        One-hot encoded shuffled sequences, of the same shape as ``one_hot``.

    Raises
    ------
    ValueError
        If ``k`` is neither equal to 1 nor at least 2.

    Examples
    --------
    The result is random, so the shuffled output below is only illustrative.

    >>> seqs = ["ACGT", "GTCA"]
    >>> one_hot = convert_one_hot(seqs)
    >>> one_hot
    array([[[1., 0., 0., 0.],
            [0., 1., 0., 0.],
            [0., 0., 1., 0.],
            [0., 0., 0., 1.]],
           [[0., 0., 1., 0.],
            [0., 0., 0., 1.],
            [0., 1., 0., 0.],
            [1., 0., 0., 0.]]])
    >>> one_hot_shuffled = shuffle_onehot(one_hot)
    >>> one_hot_shuffled
    array([[[0., 0., 0., 1.],
            [0., 1., 0., 0.],
            [1., 0., 0., 0.],
            [0., 0., 1., 0.]],
           [[1., 0., 0., 0.],
            [0., 0., 1., 0.],
            [0., 1., 0., 0.],
            [0., 0., 0., 1.]]])
    """
    if k == 1:
        seq_len = one_hot.shape[1]  # one_hot has shape (N, L, A)
        rng = np.random.default_rng()
        # one independent permutation of the L positions per sequence
        return np.array([x[rng.permutation(seq_len), :] for x in one_hot])

    if k >= 2:
        # convert one_hot to pandas Series of letters, string the letters of
        # each Series together, then k-let shuffle the upper-cased string
        shuffled = [
            shuffle(bytes(letters.str.cat().upper(), "utf-8"), k).decode("utf-8")
            for letters in convert_onehot_to_sequence(one_hot)
        ]
        return convert_one_hot(shuffled)

    raise ValueError("k must be an integer greater than or equal to 1")
def main(options):
    """Write a k-let shuffled copy of every record in a FASTA file.

    Each record of ``options.input`` is shuffled with
    ``ushuffle.shuffle(sequence, length, options.let_size)`` and written to
    ``options.output`` as ``>id`` followed by the shuffled sequence.
    """
    with open(options.input) as infile:
        with open(options.output, 'w') as outfile:
            for rec in SeqIO.parse(infile, 'fasta'):
                sequence = str(rec.seq)
                shuffled_seq = ushuffle.shuffle(
                    sequence, len(rec.seq), options.let_size)
                outfile.write(">%s\n%s\n" % (rec.id, shuffled_seq))
def new_cluster_pipe(rfam_fam, shuffle_ratio):
    """Cluster one Rfam family together with shuffled negative controls.

    NOTE: this function uses Python 2 syntax (``print >>``, tuple-parameter
    lambda, ``xrange``, ``iteritems``, list-returning ``filter``/``map``).

    Steps, as far as they are visible in this function:

    1. Unless ``<fasta>.M8N7Q16R2W3E2.WUblast`` already exists, write
       ``Rfam_<fam>_shuffle<X>X.fna`` containing every ``Rfam_fasta``
       sequence of the family (ids prefixed ``TP``), each followed by
       ``shuffle_ratio`` sequences from ``ushuffle.shuffle2()`` (ids
       prefixed ``FP``); run ``xdformat`` and ``blastn`` through
       ``os.system``; process the BLAST output with
       ``c1.step1_process_blast`` / ``c1.export_to_db``; and write the
       index -> id table to ``<blast_output>.nodes_to_index``.
    2. Read ``<blast_output>.parsed``, ``.sets_for_nodes`` and
       ``.nodes_to_index`` back into a fresh graph and two dicts
       (presumably the first two files are produced by ``c1.export_to_db``
       -- TODO confirm).
    3. Repeatedly delete nodes whose degree is below
       ``NEW_MIN_CLIQUE_SIZE``.
    4. While the graph has at least ``NEW_MIN_CLIQUE_SIZE`` nodes and edges:
       obtain a node set ``Q`` from ``p.grasp``, run ``run_cmfinder`` on it,
       write a Graphviz ``.gv`` file and a report line when a scan result
       is returned, then delete the edges between members of ``Q`` and prune
       low-degree nodes again.

    Progress is written to ``Rfam_<fam>_shuffle<X>X.report``.

    Parameters
    ----------
    rfam_fam
        Family name, interpolated into the SQL query and the output prefix.
    shuffle_ratio : int
        Number of shuffled sequences generated per real sequence (checked
        with an ``assert``).

    Returns
    -------
    ``(Q, G)`` if the clique sanity check finds a missing edge (in that case
    the report file is not closed here); otherwise ``None``.
    """
    assert type(shuffle_ratio) is int
    output_prefix = "Rfam_{fam}_shuffle{X}X".format(fam=rfam_fam, X=shuffle_ratio)
    fasta_filename = output_prefix+'.fna'
    blast_output = "{input}.M8N7Q16R2W3E2.WUblast".format(input=fasta_filename)
    report_f = open(output_prefix+'.report', 'w')
    # The FASTA/BLAST stage is skipped when its output file already exists.
    if not os.path.exists(blast_output):
        dummy_id = 0
        nodes_to_index = {}
        with open(fasta_filename, 'w') as f:
            with get_conn_ncRNA() as cursor:
                cursor.execute("select id,seq from Rfam_fasta where rfam_fam='{fam}' order by id".format(fam=rfam_fam))
                for _id,seq in cursor.fetchall():
                    # real family member ("true positive")
                    id = "TP{0}_{1}".format(dummy_id, _id)
                    f.write(">{id}\n{seq}\n".format(id=id, seq=seq))
                    nodes_to_index[id] = dummy_id
                    dummy_id += 1
                    # The return value is discarded; presumably this call
                    # primes ushuffle so that shuffle2() below yields
                    # dinucleotide shuffles of seq -- TODO confirm.
                    ushuffle.shuffle(seq, len(seq), 2)
                    for x in xrange(shuffle_ratio):
                        # shuffled negative control ("false positive")
                        id = "FP{0}_{1}".format(dummy_id, _id)
                        f.write(">{id}\n{seq}\n".format(id=id, seq=ushuffle.shuffle2()))
                        nodes_to_index[id] = dummy_id
                        dummy_id += 1
        start_t = time.time()
        # now blast it
        os.system("xdformat -n -o {input} {input}".format(input=fasta_filename))
        os.system("blastn -d {input} -i {input} -M 8 -N -7 -Q 16 -R 2 -E 2 \
                -W 3 -mformat 2 -cpus 4 -o {output}".format(input=fasta_filename, output=blast_output))
        report_f.write("(1) BLAST TIME: {0} sec\n".format(time.time()-start_t))
        # now parse the blast
        nodes_to_index = c1.NodesToIndex(nodes_to_index, -1)
        G = Graph()
        c1.step1_process_blast(blast_output=blast_output,\
                score_cutoff=35, nodes_to_index=nodes_to_index, G=G, program='WU')
        print >> sys.stderr, "Homology graph has {0} nodes, {1} edges....".format(\
                G.number_of_nodes(), G.number_of_edges())
        c1.export_to_db(G, nodes_to_index, 0, blast_output)
        # convert nodes_to_index into dict nodes_ind --> acc id
        nodes_to_index = dict( map(lambda (x,y):(y,x), nodes_to_index.d.items()) )
        with open(blast_output+'.nodes_to_index', 'w') as handle:
            for ind,id in nodes_to_index.iteritems():
                handle.write("{0}\t{1}\n".format(ind,id))
    # read back the .parsed and .sets_for_nodes files
    G = Graph()
    sets_for_nodes = {}
    nodes_to_index = {}
    # .parsed: one edge per line, tab-separated integers (first two are used)
    with open(blast_output+'.parsed') as handle:
        for line in handle:
            raw = map(int, line.strip().split('\t'))
            G.add_edge(raw[0],raw[1])
    # .sets_for_nodes: graph node -> index of its sequence plus start/end
    with open(blast_output+'.sets_for_nodes') as handle:
        for line in handle:
            raw = map(int, line.strip().split('\t'))
            sets_for_nodes[raw[0]] = {'nodes_ind':raw[1],'start':raw[2],'end':raw[3]}
    # .nodes_to_index: sequence index -> TP.../FP... id
    with open(blast_output+'.nodes_to_index') as handle:
        for line in handle:
            raw = line.strip().split()
            nodes_to_index[int(raw[0])] = raw[1]
    # count graph nodes that belong to shuffled (FP) sequences
    tmp = len(filter(lambda x: nodes_to_index[sets_for_nodes[x]['nodes_ind']].startswith('FP'), G.nodes_iter()))
    report_f.write("(2) AFTER parsing BLAST, graph has {0} negative control nodes, {1} TP nodes\n".format(tmp, G.number_of_nodes()-tmp))
    # remove low deg (< 3) nodes
    # (repeated because deleting nodes lowers the degree of their neighbours)
    x = filter(lambda n: G.degree(n)<NEW_MIN_CLIQUE_SIZE, G.nodes_iter())
    while len(x) > 0:
        G.delete_nodes_from(x)
        x = filter(lambda n: G.degree(n)<NEW_MIN_CLIQUE_SIZE, G.nodes_iter())
    tmp = len(filter(lambda x: nodes_to_index[sets_for_nodes[x]['nodes_ind']].startswith('FP'), G.nodes_iter()))
    report_f.write("(3) AFTER recursively removing nodes of degree < 3, graph has {0} negative control nodes, {1} TP nodes\n".format(tmp, G.number_of_nodes()-tmp))
    report_f.write("----------------------------------------------------------------------------\n")
    report_f.write("OUT\tDIR\tCLIQUE_SIZE\tSCANNED_TP\tSCANNED_FP\tCM_time\n")
    report_f.write("----------------------------------------------------------------------------\n")
    # for now just brute force....go through node by node as seeds
    dummy_round = 0
    while G.number_of_nodes()>=NEW_MIN_CLIQUE_SIZE and G.number_of_edges()>=NEW_MIN_CLIQUE_SIZE:
        # find perfect max cliques with a random starting node
        G_nodes = G.nodes()
        S,H = p.convert_graph_connectivity_to_sparse(G, G_nodes)
        tQ = p.grasp(S, H, gamma=1.0, maxitr=20, given_starting_node=None)
        # grasp returns positions into G_nodes; map them back to node ids
        Q = map(lambda x: G_nodes[x], tQ)
        if len(Q) < NEW_MIN_CLIQUE_SIZE:
            # delete these nodes
            G.delete_nodes_from(Q)
            continue
        # PERFECT CLIQUE SANITY TESTING, DELETE LATER
        for x in Q:
            for y in Q:
                if x!=y:
                    print >> sys.stderr, "testing....", x,y
                    try:
                        assert G.has_edge(x,y)
                    except:
                        # bail out with the offending set and graph;
                        # report_f is left open on this path
                        return Q,G
        Q.sort()
        print >> sys.stderr, "clique is...", Q
        prefix = output_prefix + str(dummy_round) + '_size' + str(len(Q)) + '_'
        dummy_round += 1
        start_t = time.time()
        scan_dir,scan_result = run_cmfinder(Q, nodes_to_index, sets_for_nodes, prefix, os.path.abspath(fasta_filename))
        cm_time = time.time()-start_t
        if scan_result is not None:
            # Graphviz file named after the scan directory, written to the
            # current working directory
            outf = open(os.path.basename(scan_dir)+'.gv','w')
            outf.write("""graph test{ edge [ dir=none ]; node [ style=filled, fontsize=2.0, height=0.1, width=0.1, fixedsize=true ]; """)
            # draw this graph
            scanned = {'TP':0, 'FP':0}
            for n in G.nodes_iter():
                id = nodes_to_index[sets_for_nodes[n]['nodes_ind']]
                # id is something like TP1_NC_XXXX.... or FP10_NC_XXXX....
                _id = id[:id.find('_')]
                shape = 'circle' if id.startswith('TP') else 'box'
                if n in Q:
                    # member of the current clique
                    outf.write("{0} [color=dodgerblue1, shape={1}];\n".format(n, shape))
                elif id in scan_result:
                    # not in the clique, but present in the scan result
                    outf.write("{0} [color=darkorange, shape={1}];\n".format(n, shape))
                    scanned[_id[:2]] += 1
                else:
                    outf.write("{0} [color=grey, shape={1}];\n".format(n, shape))
            for (n1,n2) in G.edges_iter(data=False):
                # id1/id2 are computed but not used in the output below
                id1 = nodes_to_index[sets_for_nodes[n1]['nodes_ind']]
                id1 = id1[:id1.find('_')]
                id2 = nodes_to_index[sets_for_nodes[n2]['nodes_ind']]
                id2 = id2[:id2.find('_')]
                outf.write("{0} -- {1};\n".format(n1,n2))
            outf.write("}")
            outf.close()
            report_f.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(outf.name,scan_dir,len(Q),scanned['TP'],scanned['FP'],cm_time))
            report_f.flush()
        # delete the edges from the graph
        G.delete_edges_from(itertools.combinations(Q, 2))
        # again, remove low-degree nodes
        x = filter(lambda n: G.degree(n)<NEW_MIN_CLIQUE_SIZE, G.nodes_iter())
        while len(x) > 0:
            G.delete_nodes_from(x)
            x = filter(lambda n: G.degree(n)<NEW_MIN_CLIQUE_SIZE, G.nodes_iter())
    report_f.close()