def simpleShuffleFasta(self, file):
		"""Shuffle a FASTA sequence between its N gaps and write the result to disk.

		The sequence returned by ``self.readFasta(file)`` is upper-cased and
		scanned left to right. Runs of "N" that are at least four long are
		copied through unchanged; every stretch between two such runs is
		shuffled with ``ushuffle.shuffle(segment, len(segment), 3)``.
		The joined result is passed through ``self.sep`` (defined elsewhere)
		and written to ``<chrom>_shuffle_3.fa`` in the current directory,
		preceded by a ``>chrom`` header line.

		file -- handed unchanged to ``self.readFasta``, which must return a
		        ``(name, sequence)`` pair.
		"""
		chrom, chrseq = self.readFasta(file)
		chrseq = chrseq.upper()
		total = len(chrseq)
		pieces = []
		pos = 0
		while pos < total:
			if chrseq.startswith("NNNN", pos):
				# Copy the whole N run verbatim.
				run_end = pos
				while run_end < total and chrseq[run_end] == "N":
					run_end += 1
				pieces.append(chrseq[pos:run_end])
				pos = run_end
				continue
			# str.find returns -1 when there is no further gap; comparing its
			# result with True only matched a gap at index 1.
			gap = chrseq.find("NNNN", pos)
			if gap == -1:
				gap = total
			segment = chrseq[pos:gap]
			# The length argument must describe the segment actually passed in.
			pieces.append(ushuffle.shuffle(segment, len(segment), 3))
			pos = gap
		out = self.sep("".join(pieces))
		with open(chrom+"_shuffle_3.fa", "w") as f:
			f.write(">"+chrom+"\n")
			f.write(out)
 def simpleShuffleFasta(self, file):
     """Shuffle a FASTA sequence between its N gaps and write the result to disk.

     The sequence returned by ``self.readFasta(file)`` is upper-cased and
     scanned left to right. Runs of "N" that are at least four long are
     copied through unchanged; every stretch between two such runs is
     shuffled with ``ushuffle.shuffle(segment, len(segment), 3)``.
     The joined result is passed through ``self.sep`` (defined elsewhere)
     and written to ``<chrom>_shuffle_3.fa`` in the current directory,
     preceded by a ``>chrom`` header line.

     file -- handed unchanged to ``self.readFasta``, which must return a
             ``(name, sequence)`` pair.
     """
     chrom, chrseq = self.readFasta(file)
     chrseq = chrseq.upper()
     total = len(chrseq)
     pieces = []
     pos = 0
     while pos < total:
         if chrseq.startswith("NNNN", pos):
             # Copy the whole N run verbatim.
             run_end = pos
             while run_end < total and chrseq[run_end] == "N":
                 run_end += 1
             pieces.append(chrseq[pos:run_end])
             pos = run_end
             continue
         # str.find returns -1 when there is no further gap; comparing its
         # result with True only matched a gap at index 1.
         gap = chrseq.find("NNNN", pos)
         if gap == -1:
             gap = total
         segment = chrseq[pos:gap]
         # The length argument must describe the segment actually passed in.
         pieces.append(ushuffle.shuffle(segment, len(segment), 3))
         pos = gap
     out = self.sep("".join(pieces))
     with open(chrom + "_shuffle_3.fa", "w") as f:
         f.write(">" + chrom + "\n")
         f.write(out)
def padseq(input_filename,output_dirname,tablename='Zasha_20081002_plus20071102_curated'):
	"""
		Pad every sequence of a FASTA file with flanks of its own length.

		If we can get the real flank by querying NCBI, then get it.
		Else, pad with shuffled version (preserving dinucleotide freq).

		For each record the database id is the text after the last '_' of the
		record id; it is looked up in `tablename` to obtain acc/start/end/strand.
		Records without a database row are reported on stdout and skipped.

		Outputs (prefix = output_dirname + '/' + input basename minus its last
		four characters):
		  <prefix>_padseqlen.fna -- padded sequences
		  <prefix>_padseqlen.map -- dump() of {"acc/start-end": original id}

		input_filename -- path of the FASTA file to pad
		output_dirname -- directory receiving the two output files
		tablename      -- database table queried for the coordinates
	"""
	import tempfile

	conn = get_conn_ncRNA()
	cursor = conn.cursor()

	output_prefix = output_dirname+'/'+os.path.basename(input_filename)[:-4]
	map_padded_to_orig_id = {}

	# mkstemp creates the scratch file itself, so there is no window between
	# picking the name and creating the file (unlike os.tempnam, which also no
	# longer exists in Python 3). The same path is reused for every download.
	tmp_fd, tmp_outfile = tempfile.mkstemp()
	os.close(tmp_fd)
	try:
		with open(output_prefix+'_padseqlen.fna','w') as f, open(input_filename) as in_handle:
			for r in SeqIO.parse(in_handle,'fasta'):
				db_id = r.id[(r.id.rfind('_')+1):]
				# NOTE(review): the query is assembled by string formatting; db_id
				# comes straight from the FASTA header. Switch to the driver's
				# parameter placeholder once its paramstyle is confirmed.
				cursor.execute("select acc,start,end,strand from {0} where id={1}".format(tablename,db_id))
				row = cursor.fetchone()
				if row is None:
					print("failed on {0}".format(db_id))
					continue
				acc,start,end,strand = row
				seqlen = end-start+1
				final_start = max(1,start-seqlen)
				final_end = end+seqlen

				map_padded_to_orig_id["{0}/{1}-{2}".format(acc,final_start,final_end)] = r.id

				url = EUTIL_URL + 'id=' + acc + '&strand=' + str(strand) + '&seq_start=' + str(final_start) + '&seq_stop=' + str(final_end)
				urlretrieve(url, tmp_outfile)

				# Best effort: anything unparsable in the download means
				# "no real flank available" and triggers the shuffled fallback.
				try:
					with open(tmp_outfile) as tmp_handle:
						rec = str(next(SeqIO.parse(tmp_handle,'fasta')).seq)
				except Exception:
					rec = None
				if rec is None or len(rec) != (final_end-final_start+1):
					s = str(r.seq)
					# two independent dinucleotide-preserving shuffles: front, then back
					tmp_front = shuffle(s,seqlen,2)
					tmp_back  = shuffle(s,seqlen,2)
					f.write(">{4}_{0}/{1}-{2}\n{3}\n".format(acc,final_start,final_end,tmp_front+s+tmp_back,r.id))
				else:
					f.write(">{4}_{0}/{1}-{2}\n{3}\n".format(acc,final_start,final_end,rec,r.id))
	finally:
		os.remove(tmp_outfile)

	with open(output_prefix+'_padseqlen.map','w') as f:
		dump(map_padded_to_orig_id,f)
Exemple #4
0
def shuffle_sequences(sequences, k=1):
    """Shuffle sequence strings while preserving k-let frequencies.

    Parameters
    ----------
    sequences : iterable of str
        Sequences to shuffle. Each one is upper-cased before shuffling.
    k : int, optional
        k of k-let frequencies to preserve. For example, with k = 2, dinucleotide
        shuffle is performed. The default is k = 1 (i.e., single-nucleotide shuffle).

    Returns
    -------
    list of str
        One shuffled, upper-case sequence per input sequence, in input order.

    Examples
    --------
    The output is random, so the values below are only illustrative.

    >>> seqs = ["AGCGTTCAA", "TACGAATCG"]
    >>> seqs_shuffled = shuffle_sequences(seqs, k=2) # dinucleotide shuffle
    >>> seqs_shuffled  # doctest: +SKIP
    ['AAGTTCGCA', 'TCGATAACG']
    """
    # `shuffle` works on bytes, hence the encode/decode round trip.
    return [
        shuffle(bytes(seq.upper(), "utf-8"), k).decode("utf-8")
        for seq in sequences
    ]
Exemple #5
0
 def shuffle(self, seq):
     """Return a k-let shuffled copy of ``seq`` with random base substitutions.

     ``seq`` is first shuffled with k = ``self.k``; then ``self.mutations``
     distinct positions are overwritten with bases drawn uniformly from
     "AGTC" (a drawn base may equal the one it replaces).
     """
     # Inside a method the bare name `shuffle` is looked up in module scope,
     # not on the class -- presumably ushuffle's shuffle; confirm the import.
     shuffled_text = shuffle(seq.encode(), self.k).decode("utf-8")
     bases = np.array(list(shuffled_text))
     positions = np.random.choice(len(seq), self.mutations, replace=False)
     replacements = np.random.choice(list("AGTC"), self.mutations)
     bases[positions] = replacements
     return "".join(bases)
Exemple #6
0
def create_one_negative_sample_preserve_singles_distribution(sample, k):
    """Build one negative sample by k-let shuffling the start of ``sample``.

    The first SAMPLE_LENGTH items of ``sample`` are joined with
    ``concatenate_bases`` and shuffled with ushuffle so that k-let counts are
    preserved.

    Returns a tuple ``(shuffled_string, matrix)`` where ``matrix`` is
    ``convert_sample_to_matrix(shuffled_string)``.
    """
    leading_bases = [sample[pos] for pos in range(SAMPLE_LENGTH)]
    joined = concatenate_bases(leading_bases)
    # shuffle the string sequence while preserving the k-let counts
    shuffled = ushuffle.shuffle(joined, SAMPLE_LENGTH, k)
    return shuffled, convert_sample_to_matrix(shuffled)
Exemple #7
0
def local_ushuffle(seq, dishuff = True):
    '''
    Return a shuffled copy of seq.

    dishuff=True  -- ushuffle with k=2 (dinucleotide shuffle)
    dishuff=False -- plain random permutation of the characters
    '''
    if not dishuff:
        letters = list(seq)
        random.shuffle(letters)
        return ''.join(letters)

    return ushuffle.shuffle(seq, len(seq), 2)
def generate_sequences(seqs, kmer, nfold, random_seed):
    """Print shuffled background sequences and collect their statistics.

    After seeding via ``set_seed(random_seed)``, every record in ``seqs`` is
    shuffled ``nfold`` times with k = ``kmer``. Each shuffled sequence is
    printed to stdout in FASTA format with the id ``background_seq_<n>``
    (n counts from 1 across all records).

    Returns a tuple ``(gc_values, lengths, dinuc_totals)``: the GC() value and
    length of every generated sequence, and the element-wise sum of
    ``dinuc_count`` over all generated sequences.
    """
    set_seed(random_seed)
    gc_values = []
    lengths = []
    dinuc_totals = [0] * len(IUPAC_DINUC)
    serial = 1
    for record in seqs:
        template = str(record.seq)
        description = "Background sequence for {0:s}".format(record.name)
        for _ in range(nfold):
            shuffled = shuffle(template.encode(), kmer).decode()
            background = SeqRecord(
                Seq(shuffled, IUPACData.ambiguous_dna_letters),
                id="background_seq_{0:d}".format(serial),
                description=description)
            print(background.format("fasta"), end='')
            gc_values.append(GC(shuffled))
            lengths.append(len(shuffled))
            counts = dinuc_count(Seq(shuffled))
            dinuc_totals = [
                total + seen for total, seen in zip(dinuc_totals, counts)
            ]
            serial += 1
    return gc_values, lengths, dinuc_totals
def shuffle_window(ss, km, wl, step):
    """Shuffle ``ss`` window by window and return the resulting string.

    Windows of length ``wl`` start at 0, step, 2*step, ... (every start below
    ``len(ss) - 1``). Each window is shuffled in place with k = ``km``; later
    windows see the output of earlier, overlapping ones.
    """
    result = ss[:]
    for start in range(0, len(result) - 1, step):
        stop = start + wl
        window = str.encode(result[start:stop])
        result = result[:start] + shuffle(window, km).decode() + result[stop:]
    return result
Exemple #10
0
import re,os,sys,glob
import mycustom
import ushuffle
from Bio import SeqIO
import pdb

# path with the fasta file to be simulated
filed = "/nfs/compgen-04/team218/ilias/nullomers_hg38_v2/hg38.fa"
sequenceO = mycustom.FastaFile(filed)

# Collect names and sequences in ONE pass over the FASTA object: this gives
# the same lists when the object is re-iterable, and still works if it turns
# out to be a one-shot iterator (a second pass would then come back empty).
names = []
sequencesL = []
for record in sequenceO:
    names.append(record.name.upper())
    sequencesL.append(record.sequence.upper())

# Number of simulations: one output FASTA per bootstrap round, each holding a
# dinucleotide-preserving shuffle (k=2) of every input sequence.
for k in range(1,101):
    out_path = "sims_genome_dinucleotide/hg38_bootstrap_number_"+str(k)+"_controlling_dinucleotide_content.fasta"
    # `with` closes the file even if a shuffle or write fails mid-way
    with open(out_path,"w") as datafile:
        for name, sequence in zip(names, sequencesL):
            seq_random = ushuffle.shuffle(sequence, len(sequence), 2)
            datafile.write(">"+name+'_control_bootstrap_'+str(k)+'\n')
            datafile.write(seq_random+'\n')
Exemple #11
0
def do_dinucleotide_shuffling(seq):
    """Return a shuffled copy of ``seq`` produced by ushuffle with k=2."""
    return ushuffle.shuffle(seq, len(seq), 2)
def _first_fasta_sequence(path):
    """Return the first sequence of the FASTA file at ``path`` as a string."""
    with open(path) as handle:
        return str(next(SeqIO.parse(handle, 'fasta')).seq)


def _start_rnaduplex(first_seq, second_seq, rnaduplex_bin):
    """Start ``rnaduplex_bin`` on two sequences piped in via printf; return the Popen."""
    # NOTE(review): both sequences and the binary are interpolated into a
    # shell command (the sequences even form printf's format string). This is
    # only safe for trusted input; kept as-is so that an RNAduplex_bin value
    # carrying extra command-line options keeps working.
    return subprocess.Popen(
        'printf "%s\n%s" | %s' % (first_seq, second_seq, rnaduplex_bin),
        stdout=subprocess.PIPE,
        shell=True,
        # text mode: without it Python 3 returns bytes and the scores would be
        # written out as b'...' reprs
        universal_newlines=True)


def main(options):
    """Main logic of the script.

    For every tab-separated row of ``options.input`` (at least 8 columns:
    chrom, start, end, seqid, count, strand, snorna, sequence) RNAduplex is run
    on three pairs:
      * the row's snoRNA (name taken from the second-to-last ':'-separated
        field of seqid; sequence read from ``<options.snoRNA_paths>/<name>.fa``)
        against the row's sequence,
      * a randomly chosen snoRNA from the same directory against the row's
        sequence,
      * the row's snoRNA against a dinucleotide-shuffled copy of the sequence.

    The original row is written to ``options.output`` followed by: snoRNA
    length, its GC fraction, the three scores (real, random snoRNA, shuffled
    target), the complete structure, both interaction coordinate fields, and
    the complete structure for the random snoRNA.
    """
    snorna_paths = options.snoRNA_paths
    snorna_paths_glob = glob.glob(os.path.join(snorna_paths, "*.fa"))

    with open(options.input) as infile, open(options.output, 'w') as outfile:
        for row in csv.reader(infile, delimiter="\t"):
            # The unpacking also enforces the 8-column minimum.
            chrom, start, end, seqid, count, strand, snorna, sequence = row[:8]
            # the snoRNA name from seqid overrides the one in the 7th column
            snorna = seqid.split(":")[-2]
            snorna_sequence = _first_fasta_sequence(
                os.path.join(snorna_paths, snorna + '.fa'))
            snorna_sequence_random = _first_fasta_sequence(
                random.choice(snorna_paths_glob))

            shuffled_target = shuffle(sequence, len(sequence), 2)
            # NOTE(review): this result is never used. The call is kept only so
            # the shuffler's random stream stays the same as before -- drop it
            # if reproducibility against earlier runs does not matter.
            shuffled_snorna = shuffle(snorna_sequence, len(snorna_sequence), 2)

            # Start all three runs before collecting any output.
            proc = _start_rnaduplex(
                snorna_sequence, sequence, options.RNAduplex_bin)
            proc_shuf = _start_rnaduplex(
                snorna_sequence_random, sequence, options.RNAduplex_bin)
            proc_shuf_tar = _start_rnaduplex(
                snorna_sequence, shuffled_target, options.RNAduplex_bin)
            sout, serr = proc.communicate()
            sout_shuf, serr_shuf = proc_shuf.communicate()
            sout_shuf_tar, serr_shuf_tar = proc_shuf_tar.communicate()

            # Example RNAduplex output line:
            # .(.((.(((((((((((((((((((....(((((&.)))).)....)))).))))))))))))))))).).  17,50  :  35,70  (-29.20)
            fields = sout.split()
            fields_shuf = sout_shuf.split()
            fields_shuf_tar = sout_shuf_tar.split()

            # last field is "(score)"; strip the parentheses
            score_shuf = fields_shuf[-1][1:-1]
            score_shuf_tar = fields_shuf_tar[-1][1:-1]
            score = fields[-1][1:-1]
            part1 = fields[1]
            part2 = fields[3]
            # only the part of the dot-bracket string before '&' is passed on
            structure = get_complete_structure(snorna_sequence,
                                               fields[0].split("&")[0],
                                               part1)
            structure_random = get_complete_structure(
                snorna_sequence_random,
                fields_shuf[0].split("&")[0],
                fields_shuf[1])
            outfile.write("%s\t%s\n" % ("\t".join(row), "\t".join([
                str(i) for i in [
                    len(snorna_sequence),
                    get_GC_frac(snorna_sequence), score, score_shuf,
                    score_shuf_tar, structure, part1, part2, structure_random
                ]
            ])))
Exemple #13
0
def kshuffle(seq, k=2):
    """Return ``shuffle(seq, len(seq), k)``; k defaults to 2."""
    length = len(seq)
    return shuffle(seq, length, k)
Exemple #14
0
def shuffle_onehot(one_hot, k=1):
    """Shuffle one-hot encoded sequences, preserving their k-let frequencies.

    Parameters
    ----------
    one_hot : np.ndarray
        One-hot encoded sequences with shape (N, L, A).
    k : int, optional
        Size of the k-lets whose frequencies are preserved. k = 1 (the default)
        permutes the positions of every sequence independently; k >= 2 converts
        the sequences to letters, shuffles them with `shuffle` and encodes the
        result again with `convert_one_hot`.

    Returns
    -------
    np.ndarray
        Shuffled one-hot sequences, of the same shape as one_hot.

    Raises
    ------
    ValueError
        If k is neither 1 nor at least 2.

    Examples
    --------
    The result is random; the output shown is only one possibility.

    >>> one_hot = convert_one_hot(["ACGT", "GTCA"])
    >>> shuffle_onehot(one_hot)  # doctest: +SKIP
    array([[[0., 0., 0., 1.],
            [0., 1., 0., 0.],
            [1., 0., 0., 0.],
            [0., 0., 1., 0.]],
    <BLANKLINE>
           [[1., 0., 0., 0.],
            [0., 0., 1., 0.],
            [0., 1., 0., 0.],
            [0., 0., 0., 1.]]])
    """
    if k == 1:
        # axis 1 of (N, L, A) is the sequence length
        seq_len = one_hot.shape[1]
        rng = np.random.default_rng()
        # a fresh permutation of the positions for every sequence
        return np.array([x[rng.permutation(seq_len), :] for x in one_hot])

    if k >= 2:
        # one pandas Series of letters per sequence, joined into a plain string
        letter_strings = [
            series.str.cat() for series in convert_onehot_to_sequence(one_hot)
        ]
        shuffled_strings = [
            shuffle(bytes(text.upper(), "utf-8"), k).decode("utf-8")
            for text in letter_strings
        ]
        return convert_one_hot(shuffled_strings)

    raise ValueError("k must be an integer greater than or equal to 1")
def main(options):
    """Write a k-let shuffled copy of every record of a FASTA file.

    Reads ``options.input``, shuffles each sequence with ushuffle using
    k = ``options.let_size`` and writes ``>id`` / shuffled sequence pairs to
    ``options.output``.
    """
    with open(options.input) as infile, open(options.output, 'w') as outfile:
        for rec in SeqIO.parse(infile, 'fasta'):
            sequence = str(rec.seq)
            length = len(rec.seq)
            shuffled_seq = ushuffle.shuffle(sequence, length, options.let_size)
            outfile.write(">%s\n%s\n" % (rec.id, shuffled_seq))
def new_cluster_pipe(rfam_fam, shuffle_ratio):
	"""
		Cluster one Rfam family together with shuffled negative controls and
		report how the cliques found in the BLAST homology graph behave.

		Steps, as implemented below:
		  1. Unless the BLAST output file already exists: write the family's
		     sequences (ids "TP<n>_<db id>") plus `shuffle_ratio` shuffled
		     copies of each (ids "FP<n>_<db id>") to a FASTA file, run xdformat
		     and blastn on it, process the hits through c1.step1_process_blast /
		     c1.export_to_db and save the index -> id mapping to
		     "<blast_output>.nodes_to_index".
		  2. Rebuild the graph from "<blast_output>.parsed",
		     "<blast_output>.sets_for_nodes" and "<blast_output>.nodes_to_index".
		  3. Repeatedly remove nodes whose degree is below NEW_MIN_CLIQUE_SIZE.
		  4. While enough nodes and edges remain: find a clique with p.grasp,
		     run run_cmfinder on it, write a Graphviz ".gv" file plus one report
		     line, then delete the clique's edges and prune low-degree nodes.

		Statistics are written to "<output_prefix>.report".

		rfam_fam      -- family name; matched against Rfam_fasta.rfam_fam and
		                 used in the output file names
		shuffle_ratio -- int; number of shuffled (FP) sequences written per
		                 real (TP) sequence

		Returns None when the clique loop finishes. If the clique sanity check
		fails, returns the tuple (Q, G) immediately; report_f is not closed on
		that path.
	"""
	assert type(shuffle_ratio) is int
	output_prefix = "Rfam_{fam}_shuffle{X}X".format(fam=rfam_fam, X=shuffle_ratio)
	fasta_filename = output_prefix+'.fna'
	blast_output = "{input}.M8N7Q16R2W3E2.WUblast".format(input=fasta_filename)

	report_f = open(output_prefix+'.report', 'w')
	# An existing BLAST output acts as a cache: the FASTA generation, the BLAST
	# run and the graph export are all skipped when it is present.
	if not os.path.exists(blast_output):
		dummy_id = 0
		nodes_to_index = {}
		with open(fasta_filename, 'w') as f:
			with get_conn_ncRNA() as cursor:
				# NOTE(review): rfam_fam is formatted straight into the SQL text
				cursor.execute("select id,seq from Rfam_fasta where rfam_fam='{fam}' order by id".format(fam=rfam_fam))
				for _id,seq in cursor.fetchall():
					# real family member
					id = "TP{0}_{1}".format(dummy_id, _id)
					f.write(">{id}\n{seq}\n".format(id=id, seq=seq))
					nodes_to_index[id] = dummy_id
					dummy_id += 1
					# the return value is discarded; presumably this call sets up the
					# state that the shuffle2() calls below draw from -- confirm
					# against the ushuffle documentation
					ushuffle.shuffle(seq, len(seq), 2)
					for x in xrange(shuffle_ratio):
						# shuffled negative control derived from the same sequence
						id = "FP{0}_{1}".format(dummy_id, _id)
						f.write(">{id}\n{seq}\n".format(id=id, seq=ushuffle.shuffle2()))
						nodes_to_index[id] = dummy_id
						dummy_id += 1
		start_t = time.time()				
		# now blast it
		os.system("xdformat -n -o {input} {input}".format(input=fasta_filename))
		os.system("blastn -d {input} -i {input} -M 8 -N -7 -Q 16 -R 2 -E 2 \
				-W 3 -mformat 2 -cpus 4 -o {output}".format(input=fasta_filename, output=blast_output))
		report_f.write("(1)  BLAST TIME: {0} sec\n".format(time.time()-start_t))

		# now parse the blast
		nodes_to_index = c1.NodesToIndex(nodes_to_index, -1)
		G = Graph()
		c1.step1_process_blast(blast_output=blast_output,\
				score_cutoff=35, nodes_to_index=nodes_to_index, G=G, program='WU')
		print >> sys.stderr, "Homology graph has {0} nodes, {1} edges....".format(\
				G.number_of_nodes(), G.number_of_edges())
		# presumably this writes the .parsed / .sets_for_nodes files read back below -- TODO confirm
		c1.export_to_db(G, nodes_to_index, 0, blast_output)
		# convert nodes_to_index into dict nodes_ind --> acc id
		nodes_to_index = dict( map(lambda (x,y):(y,x), nodes_to_index.d.items()) )
		with open(blast_output+'.nodes_to_index', 'w') as handle:
			for ind,id in nodes_to_index.iteritems():
				handle.write("{0}\t{1}\n".format(ind,id))

	# read back the .parsed and .sets_for_nodes files
	G = Graph()
	sets_for_nodes = {}
	nodes_to_index = {}
	# .parsed: one edge per line, the first two tab-separated ints are the endpoints
	with open(blast_output+'.parsed') as handle:
		for line in handle:
			raw = map(int, line.strip().split('\t'))
			G.add_edge(raw[0],raw[1])
	# .sets_for_nodes: graph node -> index of its sequence plus a start/end pair
	with open(blast_output+'.sets_for_nodes') as handle:
		for line in handle:
			raw = map(int, line.strip().split('\t'))
			sets_for_nodes[raw[0]] = {'nodes_ind':raw[1],'start':raw[2],'end':raw[3]}
	# .nodes_to_index: sequence index -> FASTA id ("TP..." or "FP...")
	with open(blast_output+'.nodes_to_index') as handle:
		for line in handle:
			raw = line.strip().split()
			nodes_to_index[int(raw[0])] = raw[1]

	# count the graph nodes that belong to shuffled (FP) sequences
	tmp = len(filter(lambda x: nodes_to_index[sets_for_nodes[x]['nodes_ind']].startswith('FP'), G.nodes_iter()))
	report_f.write("(2)  AFTER parsing BLAST, graph has {0} negative control nodes, {1} TP nodes\n".format(tmp, G.number_of_nodes()-tmp))

	# remove low deg (< 3) nodes
	# (the threshold actually applied is NEW_MIN_CLIQUE_SIZE; removal is repeated
	# because deleting nodes lowers the degree of their neighbours)
	x = filter(lambda n: G.degree(n)<NEW_MIN_CLIQUE_SIZE, G.nodes_iter())
	while len(x) > 0:
		G.delete_nodes_from(x)
		x = filter(lambda n: G.degree(n)<NEW_MIN_CLIQUE_SIZE, G.nodes_iter())
	tmp = len(filter(lambda x: nodes_to_index[sets_for_nodes[x]['nodes_ind']].startswith('FP'), G.nodes_iter()))
	report_f.write("(3)  AFTER recursively removing nodes of degree < 3, graph has {0} negative control nodes, {1} TP nodes\n".format(tmp, G.number_of_nodes()-tmp))
	report_f.write("----------------------------------------------------------------------------\n")
	report_f.write("OUT\tDIR\tCLIQUE_SIZE\tSCANNED_TP\tSCANNED_FP\tCM_time\n")
	report_f.write("----------------------------------------------------------------------------\n")

	# for now just brute force....go through node by node as seeds
	dummy_round = 0
	while G.number_of_nodes()>=NEW_MIN_CLIQUE_SIZE and G.number_of_edges()>=NEW_MIN_CLIQUE_SIZE:
		# find perfect max cliques with a random starting node
		G_nodes = G.nodes()
		S,H = p.convert_graph_connectivity_to_sparse(G, G_nodes)
		tQ = p.grasp(S, H, gamma=1.0, maxitr=20, given_starting_node=None)
		# tQ holds positions into G_nodes; translate them back to node labels
		Q = map(lambda x: G_nodes[x], tQ)
		if len(Q) < NEW_MIN_CLIQUE_SIZE: # delete these nodes
			G.delete_nodes_from(Q)
			continue
		# PERFECT CLIQUE SANITY TESTING, DELETE LATER
		for x in Q:
			for y in Q:
				if x!=y:
					print >> sys.stderr, "testing....", x,y
					try:
						assert G.has_edge(x,y)
					except:
						# not a clique: hand the offending set and graph back to the
						# caller (report_f is left open here)
						return Q,G
		Q.sort()
		print >> sys.stderr, "clique is...", Q
		prefix = output_prefix + str(dummy_round) + '_size' + str(len(Q)) + '_'
		dummy_round += 1
		start_t = time.time()
		scan_dir,scan_result = run_cmfinder(Q, nodes_to_index, sets_for_nodes, prefix, os.path.abspath(fasta_filename))
		cm_time = time.time()-start_t
		if scan_result is not None:
			outf = open(os.path.basename(scan_dir)+'.gv','w')
			outf.write("""graph test{
			edge [ dir=none ];
			node [ style=filled, fontsize=2.0, height=0.1, width=0.1, fixedsize=true ];
			""")
			# draw this graph
			# colours: blue = clique member, orange = other node whose id is in
			# scan_result, grey = everything else; circle = TP, box = FP
			scanned = {'TP':0, 'FP':0}
			for n in G.nodes_iter():
				id = nodes_to_index[sets_for_nodes[n]['nodes_ind']] # id is something like TP1_NC_XXXX.... or FP10_NC_XXXX....
				_id = id[:id.find('_')]
				shape = 'circle' if id.startswith('TP') else 'box'
				if n in Q:
					outf.write("{0} [color=dodgerblue1, shape={1}];\n".format(n, shape))
				elif id in scan_result:
					outf.write("{0} [color=darkorange, shape={1}];\n".format(n, shape))
					# _id[:2] is 'TP' or 'FP'
					scanned[_id[:2]] += 1
				else:
					outf.write("{0} [color=grey, shape={1}];\n".format(n, shape))
			for (n1,n2) in G.edges_iter(data=False):
				# id1 / id2 are computed but not used in the edge line written below
				id1 = nodes_to_index[sets_for_nodes[n1]['nodes_ind']]
				id1 = id1[:id1.find('_')]
				id2 = nodes_to_index[sets_for_nodes[n2]['nodes_ind']]
				id2 = id2[:id2.find('_')]
				outf.write("{0} -- {1};\n".format(n1,n2))
			outf.write("}")
			outf.close()
			report_f.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(outf.name,scan_dir,len(Q),scanned['TP'],scanned['FP'],cm_time))
			report_f.flush()
		# delete the edges from the graph
		G.delete_edges_from(itertools.combinations(Q, 2))
		# again, remove low-degree nodes
		x = filter(lambda n: G.degree(n)<NEW_MIN_CLIQUE_SIZE, G.nodes_iter())
		while len(x) > 0:
			G.delete_nodes_from(x)
			x = filter(lambda n: G.degree(n)<NEW_MIN_CLIQUE_SIZE, G.nodes_iter())  
	report_f.close()