def load_datasets(self, path_pos, path_neg, background_alg, SHOULD_SHORT=0, HEADERS=False, HARDMASK=False):
    """Load the positive sequences and build the matching background set.

    Positive entries are read with ``self.readfasta(path_pos, HARDMASK)``.
    The background (negative) set is chosen as follows:

    * ``path_neg`` is not None: read it with ``self.readfasta(path_neg)``
      (HARDMASK is not forwarded for the background file);
    * otherwise, if ``background_alg == "dinuclShuffle"``: one shuffled,
      upper-cased copy of every positive entry;
    * otherwise: a single placeholder entry made only of ``N``.

    Parameters:
        path_pos: passed to ``self.readfasta`` for the positive set.
        path_neg: passed to ``self.readfasta`` for the background, or None.
        background_alg: name of the background generator; only
            ``"dinuclShuffle"`` is recognised here.
        SHOULD_SHORT: when greater than zero, stop reading each file once
            this many sequence entries were collected. Falsy values and
            non-positive numbers mean "no cap".
        HEADERS: when true, entries starting with ``>`` are collected
            separately and returned as well.
        HARDMASK: forwarded to ``self.readfasta`` for the positive file.

    Returns:
        ``(positive, negative)``, or
        ``(positive, negative, header_pos, header_neg)`` when HEADERS is true.
    """
    # Convert the cap once instead of on every iteration; this also accepts
    # a numeric string without relying on Python 2 str/int ordering.
    limit = int(SHOULD_SHORT) if SHOULD_SHORT else 0

    def collect(lines):
        # Split an iterable of entries into (sequences, headers), honouring the cap.
        sequences = []
        headers = []
        for line in lines:
            # startswith() is safe on empty entries, unlike line[0].
            if HEADERS and line.startswith('>'):
                headers.append(line)
                continue
            sequences.append(line)
            if limit > 0 and len(sequences) == limit:
                break
        return sequences, headers

    # %-formatting gives the same output under the Python 2 print statement
    # and the Python 3 print function.
    print("HARDMASK: %s" % (HARDMASK,))

    # Load positive set from the path_pos FASTA.
    positive, header_pos = collect(self.readfasta(path_pos, HARDMASK))

    header_neg = []
    if path_neg is not None:
        # Load background from the path_neg FASTA.
        # Open question kept from the original author: should the background
        # be hard-masked with the same flag as the positive set?
        negative, header_neg = collect(self.readfasta(path_neg))
    elif background_alg == "dinuclShuffle":
        # Background is a shuffled copy of every positive entry.
        negative = [
            altschulEriksonDinuclShuffle.dinuclShuffle(line.upper())
            for line in positive
        ]
    else:
        # No background requested: single placeholder entry.
        negative = ["NNNNNNNNNNNNNNNNNNNNNNNNNNN"]

    print("Lengths: %d %d" % (len(positive), len(negative)))

    if HEADERS:
        return positive, negative, header_pos, header_neg
    return positive, negative
def generate_sequences(seqs, nfold):
    """Print ``nfold`` shuffled background sequences per input record.

    For every record in ``seqs`` the sequence text is cut into segments by
    ``split_seq``. Segments that start with ``N`` are copied through
    unchanged, every other non-empty segment is replaced by
    ``dinuclShuffle(segment)``, and the pieces are glued back together in
    order. Each resulting sequence is printed to standard output in FASTA
    format with the id ``background_seq_for_<record.name>``.

    Parameters:
        seqs: iterable of records exposing ``.seq`` and ``.name``.
        nfold: number of background sequences to produce per record.

    Returns:
        ``(bg_gc_list, bg_lengths)``: the ``GC(...)`` value and the length
        of every generated sequence, in output order.
    """
    bg_gc_list = []
    bg_lengths = []
    for record in seqs:
        seq = str(record.seq)
        for _ in range(nfold):
            # Collect pieces and join once rather than growing a string
            # with += inside the loop.
            pieces = []
            for sequence in split_seq(seq):
                if re.match("N", sequence):
                    # N-runs are kept in place, not shuffled.
                    pieces.append(sequence)
                elif sequence:
                    pieces.append(dinuclShuffle(sequence))
            new_sequence = "".join(pieces)
            new_seq = SeqRecord(
                Seq(new_sequence, generic_dna),
                id="background_seq_for_{0:s}".format(record.name),
                description="",
            )
            # end="" because the formatted FASTA text is printed as-is,
            # without an extra newline.
            print(new_seq.format("fasta"), end="")
            bg_gc_list.append(GC(new_sequence))
            bg_lengths.append(len(new_sequence))
    return bg_gc_list, bg_lengths
def generate_sequences(seqs, nfold):
    """Write ``nfold`` shuffled background sequences per input record.

    For every record in ``seqs`` the sequence text is cut into segments by
    ``split_seq``. Segments that start with ``N`` are copied through
    unchanged, every other non-empty segment is replaced by
    ``dinuclShuffle(segment)``, and the pieces are glued back together in
    order. Each resulting sequence is written to standard output in FASTA
    format. Ids are ``background_seq_<k>`` with ``k`` counting from 1 across
    all records; the description names the source record.

    Parameters:
        seqs: iterable of records exposing ``.seq`` and ``.name``.
        nfold: number of background sequences to produce per record.

    Returns:
        ``(bg_gc_list, bg_lengths)``: the ``GC(...)`` value and the length
        of every generated sequence, in output order.
    """
    # Local import keeps this block self-contained; sys.stdout.write replaces
    # the Python 2-only ``print x,`` statement and runs on Python 2 and 3.
    import sys

    cpt = 1  # running id across all generated sequences
    bg_gc_list = []
    bg_lengths = []
    for record in seqs:
        seq = str(record.seq)
        descr = "Background sequence for {0:s}".format(record.name)
        for _ in range(nfold):
            # Collect pieces and join once rather than growing a string
            # with += inside the loop.
            pieces = []
            for sequence in split_seq(seq):
                if re.match('N', sequence):
                    # N-runs are kept in place, not shuffled.
                    pieces.append(sequence)
                elif sequence:
                    pieces.append(dinuclShuffle(sequence))
            new_sequence = "".join(pieces)
            new_seq = SeqRecord(
                Seq(new_sequence, generic_dna),
                id="background_seq_{0:d}".format(cpt),
                description=descr,
            )
            # The formatted FASTA text is written as-is, no extra newline.
            sys.stdout.write(new_seq.format("fasta"))
            bg_gc_list.append(GC(new_sequence))
            bg_lengths.append(len(new_sequence))
            cpt += 1
    return bg_gc_list, bg_lengths
def main(fileName, NUM):
    """Emit NUM shuffled versions of the sequence stored in fileName.

    The sequence is obtained once through ``file2string(fileName)``. Every
    shuffle is written to standard output as two lines: a ``>`` header
    carrying the 1-based shuffle number, then the shuffled sequence.
    """
    source_seq = file2string(fileName)
    for number in range(1, NUM + 1):
        # Shuffle first, then write, so a failing shuffle leaves no orphan header.
        shuffled = dinuclShuffle(source_seq)
        sys.stdout.write(">%d\n" % number)
        sys.stdout.write("%s\n" % shuffled)
def main(fileName, NUM):
    """Write NUM independent shuffles of one sequence to standard output.

    ``file2string(fileName)`` supplies the sequence. Each shuffle produced
    by ``dinuclShuffle`` is written as a numbered ``>`` header line
    (numbering starts at 1) followed by the shuffled sequence on its own
    line.
    """
    original = file2string(fileName)
    for zero_based in range(NUM):
        record_number = zero_based + 1
        permuted = dinuclShuffle(original)
        stream = sys.stdout
        stream.write(">%d\n" % record_number)
        stream.write("%s\n" % permuted)
def shuffle_window(ss, wl, step):
    """Shuffle ``ss`` window by window and return the result.

    Window start positions run from 0 up to (but excluding) ``len(ss) - 1``
    in increments of ``step``. At each start, the slice of ``wl`` items is
    replaced by ``dinuclShuffle`` of that slice. Windows are processed left
    to right on the progressively updated sequence, so overlapping windows
    (``step < wl``) reshuffle content that was already shuffled.

    Parameters:
        ss: sequence to shuffle; it is copied with ``ss[:]`` first.
        wl: window length.
        step: distance between consecutive window starts.

    Returns:
        The shuffled sequence.
    """
    shuffled = ss[:]
    last_start = len(shuffled) - 1
    for start in range(0, last_start, step):
        stop = start + wl
        window = dinuclShuffle(shuffled[start:stop])
        shuffled = shuffled[:start] + window + shuffled[stop:]
    return shuffled
def shuffle_window(ss, wl, step):
    """Return a copy of ``ss`` shuffled locally in sliding windows.

    For every start position in ``range(0, len(ss) - 1, step)`` the ``wl``
    items beginning there are passed to ``dinuclShuffle`` and the result is
    spliced back in place of that slice. Each window is taken from the
    sequence as modified by the previous windows.

    Parameters:
        ss: input sequence (copied via ``ss[:]`` before any shuffling).
        wl: number of items per window.
        step: offset between successive window starts.

    Returns:
        The window-shuffled sequence.
    """
    result = ss[:]
    window_starts = range(0, len(result) - 1, step)
    for left in window_starts:
        right = left + wl
        prefix = result[0:left]
        window = result[left:right]
        suffix = result[right:]
        result = prefix + dinuclShuffle(window) + suffix
    return result