def generate_sequences(fg_bins, bg_bins, bg_dir, nfold, random_seed):
    """
    Choose randomly the background sequences in each bin of %GC.

    Follow the same distribution as the one of foreground sequences with a
    nfold ratio. For every %GC bin (0 to 100) with a non-zero foreground
    count, "fg_bins[percent] * nfold" records are sampled from the matching
    background bin and printed on standard output in FASTA format. When the
    bin cannot provide that many records, a warning is written on standard
    error and the whole bin is used instead.

    fg_bins -- per-%GC foreground counts, indexed from 0 to 100.
    bg_bins -- per-%GC background records; when empty, each bin is obtained
               with get_bins_from_bg_dir(bg_dir, percent).
    bg_dir -- directory handed to get_bins_from_bg_dir.
    nfold -- multiplier applied to the foreground counts.
    random_seed -- value given to random.seed before sampling.

    Return the list of %GC (one entry per printed sequence), the list of
    their lengths and the accumulated dinucleotide counts.

    """
    random.seed(random_seed)
    lengths = []
    gc_list = []
    dinuc = [0] * len(IUPAC) * len(IUPAC)
    for percent in range(0, 101):
        if not fg_bins[percent]:
            continue
        nb = fg_bins[percent] * nfold
        if bg_bins:
            bin_seq = bg_bins[percent]
        else:
            bin_seq = get_bins_from_bg_dir(bg_dir, percent)
        # Only the sampling is guarded: a ValueError raised while loading the
        # bin must not be reported as a too-small population, and must not
        # fall back on the bin left over from a previous iteration.
        try:
            sample = random.sample(bin_seq, nb)
        except ValueError:
            sys.stderr.write(
                "*** WARNING *** Sample larger than population for "
                "{0:d}% G+C content: {1:d} needed and {2:d} obtained\n".format(
                    percent, nb, len(bin_seq)))
            sample = bin_seq
        # One %GC entry per sequence actually emitted for this bin.
        gc_list.extend([percent] * len(sample))
        for r in sample:
            print(r.format("fasta"), end='')
            lengths.append(len(r.seq))
            dinuc = [x + y for x, y in zip(dinuc, dinuc_count(r.seq))]
    return gc_list, lengths, dinuc
def fg_len_GC_bins(fg_file):
    """
    Compute G+C content and length counts for all foreground sequences.

    Each record parsed from the FASTA file "fg_file" is assigned to one of
    101 %GC bins (0 to 100). Every bin is a dictionary mapping a sequence
    length to the number of foreground sequences of that length in the bin.

    Return the list of %GC values (as returned by GC), the list of bins, the
    list of sequence lengths and the accumulated dinucleotide counts.

    """
    gc_bins = [{} for _ in range(101)]
    gc_list = []
    lengths = []
    dinuc = [0] * len(IUPAC) * len(IUPAC)
    with open_for_parsing(fg_file) as stream:
        for record in SeqIO.parse(stream, "fasta"):
            gc = GC(record.seq)
            gc_list.append(gc)
            # The bin index must be an integer; round as the other binning
            # functions do, since GC may hand back a float.
            gc = int(round(gc))
            length = len(record)
            dinuc = [x + y for x, y in zip(dinuc, dinuc_count(record.seq))]
            lengths.append(length)
            gc_bins[gc][length] = gc_bins[gc].get(length, 0) + 1
    return gc_list, gc_bins, lengths, dinuc
def bg_GC_bins(bg_file, bg_dir):
    """
    Bin the background sequences by G+C content.

    Every record parsed from the FASTA file "bg_file" is appended to the bin
    (0 to 100) given by its rounded %GC. The bins are then handed to
    print_in_bg_dir together with "bg_dir".

    Return the list of (unrounded) %GC values, the list of bins, the list of
    sequence lengths and the accumulated dinucleotide counts.

    """
    bins = [[] for _ in range(101)]
    gc_values = []
    seq_lengths = []
    dinuc = [0] * len(IUPAC) * len(IUPAC)
    with open_for_parsing(bg_file) as stream:
        for rec in SeqIO.parse(stream, "fasta"):
            gc_value = GC(rec.seq)
            gc_values.append(gc_value)
            # Round to get an integer bin index (python3 fix).
            bins[int(round(gc_value))].append(rec)
            seq_lengths.append(len(rec.seq))
            counts = dinuc_count(rec.seq)
            dinuc = [total + cnt for total, cnt in zip(dinuc, counts)]
    print_in_bg_dir(bins, bg_dir)
    return gc_values, bins, seq_lengths, dinuc
def bg_len_GC_bins(bg_file, bg_dir):
    """
    Compute G+C content and group by length all background sequences.

    Each record parsed from the FASTA file "bg_file" is assigned to one of
    101 %GC bins (0 to 100). Every bin is a dictionary mapping a sequence
    length to the list of records of that length. The bins are then handed
    to print_in_bg_dir together with "bg_dir" and the flag True.

    Return the list of %GC values (as returned by GC), the list of bins, the
    list of sequence lengths and the accumulated dinucleotide counts.

    """
    gc_bins = [{} for _ in range(101)]
    gc_list = []
    lengths = []
    dinuc = [0] * len(IUPAC) * len(IUPAC)
    with open_for_parsing(bg_file) as stream:
        for record in SeqIO.parse(stream, "fasta"):
            gc = GC(record.seq)
            gc_list.append(gc)
            # The bin index must be an integer; round as the other binning
            # functions do, since GC may hand back a float.
            gc = int(round(gc))
            gc_bins[gc].setdefault(len(record), []).append(record)
            lengths.append(len(record.seq))
            dinuc = [x + y for x, y in zip(dinuc, dinuc_count(record.seq))]
    print_in_bg_dir(gc_bins, bg_dir, True)
    return gc_list, gc_bins, lengths, dinuc
def fg_GC_bins(fg, winlen, step):
    """
    Get %GC info for foreground sequences.

    For every record parsed from the FASTA file "fg", GC_info is called with
    the sequence, "winlen" and "step"; the first returned value is kept as
    the %GC of the sequence and, once rounded, selects the bin (0 to 100) in
    which the four remaining values are stored as a tuple.

    Return the list of (unrounded) %GC values, the result of
    avg_and_sd_gc_info applied to the bins, the list of sequence lengths and
    the accumulated dinucleotide counts.

    """
    per_bin_info = [[] for _ in range(101)]
    gc_values = []
    seq_lengths = []
    dinuc = [0] * len(IUPAC) * len(IUPAC)
    with open_for_parsing(fg) as stream:
        for rec in SeqIO.parse(stream, "fasta"):
            info = GC_info(rec.seq, winlen, step)
            gc_value, min_gc, max_gc, sd_gc, cv_gc = info
            gc_values.append(gc_value)
            # Round to get an integer bin index (python 3 fix).
            bin_index = int(round(gc_value))
            per_bin_info[bin_index].append((min_gc, max_gc, sd_gc, cv_gc))
            seq_lengths.append(len(rec.seq))
            counts = dinuc_count(rec.seq)
            dinuc = [total + cnt for total, cnt in zip(dinuc, counts)]
    return gc_values, avg_and_sd_gc_info(per_bin_info), seq_lengths, dinuc
def bg_len_GC_bins(bg, bg_dir):
    """
    Get %GC and lengths info for background sequences.

    Each record parsed from the FASTA file "bg" is assigned to one of 101
    %GC bins (0 to 100). Every bin is a dictionary mapping a sequence length
    to the list of records of that length. The bins are then handed to
    print_in_bg_dir together with "bg_dir" and the flag True.

    Return the list of %GC values (as returned by GC), the list of bins, the
    list of sequence lengths and the accumulated dinucleotide counts.

    """
    gc_bins = [{} for _ in range(101)]
    gc_list = []
    lengths = []
    dinuc = [0] * len(IUPAC) * len(IUPAC)
    with open_for_parsing(bg) as stream:
        for record in SeqIO.parse(stream, "fasta"):
            gc = GC(record.seq)
            gc_list.append(gc)
            # The bin index must be an integer; round as the other binning
            # functions do, since GC may hand back a float.
            gc = int(round(gc))
            gc_bins[gc].setdefault(len(record), []).append(record)
            lengths.append(len(record.seq))
            dinuc = [x + y for x, y in zip(dinuc, dinuc_count(record.seq))]
    print_in_bg_dir(gc_bins, bg_dir, True)
    return gc_list, gc_bins, lengths, dinuc
def fg_len_GC_bins(fg, winlen, step):
    """
    Get needed %GC and lengths info for foreground sequences.

    For every record parsed from the FASTA file "fg", GC_info is called with
    the sequence, "winlen" and "step"; the first returned value is kept as
    the %GC of the sequence and, once rounded, selects the bin (0 to 100).
    For each bin, the four remaining GC_info values are stored as a tuple
    and the number of sequences per length is counted in a dictionary.

    Return the list of (unrounded) %GC values, the result of
    avg_and_sd_len_gc_info applied to the per-bin length counts and GC info,
    the list of sequence lengths and the accumulated dinucleotide counts.

    """
    tmp_gc_bins = [[] for _ in range(101)]
    l_dic = [{} for _ in range(101)]
    gc_list = []
    lengths = []
    dinuc = [0] * len(IUPAC) * len(IUPAC)
    with open_for_parsing(fg) as stream:
        for record in SeqIO.parse(stream, "fasta"):
            gc, min_gc, max_gc, sd_gc, cv_gc = GC_info(record.seq, winlen,
                                                       step)
            gc_list.append(gc)
            # The bin index must be an integer; round as fg_GC_bins does,
            # since the value may be a float.
            gc = int(round(gc))
            tmp_gc_bins[gc].append((min_gc, max_gc, sd_gc, cv_gc))
            length = len(record)
            l_dic[gc][length] = l_dic[gc].get(length, 0) + 1
            lengths.append(length)
            dinuc = [x + y for x, y in zip(dinuc, dinuc_count(record.seq))]
    return gc_list, avg_and_sd_len_gc_info(l_dic, tmp_gc_bins), lengths, dinuc
def generate_len_sequences(fg_bins, bg_bins, bg_dir, deviation, winlen, step,
                           nfold):
    """
    Pick background sequences matching the foreground per %GC bin and length.

    The recursion limit is raised to 10000 first. Then, for every %GC bin
    (0 to 100) whose foreground entry has a truthy first element, the
    per-length counts found in "fg_bins[percent][0][0]" are multiplied by
    "nfold" and extract_seq_rec is asked for that many background sequences
    of each length. The background bin comes from "bg_bins" when it is not
    empty, from get_bins_len_from_bg_dir(bg_dir, percent) otherwise.

    Retrieved sequences are printed on standard output in FASTA format. A
    warning is written on standard error when the number obtained differs
    from the number needed.

    Return the list of %GC (one entry per printed sequence), the list of
    their lengths and the accumulated dinucleotide counts.

    """
    sys.setrecursionlimit(10000)
    gc_list = []
    lengths = []
    dinuc = [0] * len(IUPAC) * len(IUPAC)
    for percent in range(101):
        fg_info = fg_bins[percent]
        if not fg_info[0]:
            continue
        nb_needed = sum(fg_info[0][0].values()) * nfold
        if bg_bins:
            candidates = bg_bins[percent]
        else:
            candidates = get_bins_len_from_bg_dir(bg_dir, percent)
        picked = []
        # The key list is threaded through the successive calls.
        remaining_keys = sorted(candidates.keys())
        for size in fg_bins[percent][0][0].keys():
            wanted = fg_bins[percent][0][0][size] * nfold
            found, _, remaining_keys = extract_seq_rec(
                size, wanted, remaining_keys, candidates, [], 0,
                fg_bins[percent], deviation, winlen, step)
            picked.extend(found)
        nb_found = len(picked)
        if nb_found != nb_needed:
            sys.stderr.write(
                "\n*** WARNING *** Sample larger than population for "
                "{0:d}% G+C content: {1:d} needed and {2:d} obtained\n".format(
                    percent, nb_needed, nb_found))
        gc_list.extend([percent] * nb_found)
        for rec in picked:
            print("{0:s}".format(rec.format("fasta")), end='')
            lengths.append(len(rec))
            counts = dinuc_count(rec.seq)
            dinuc = [total + cnt for total, cnt in zip(dinuc, counts)]
    return gc_list, lengths, dinuc
def generate_sequences(seqs, kmer, winlen, step, nfold, random_seed):
    """
    Print "nfold" shuffled background sequences for each input record.

    After seeding through set_seed, every record of "seqs" is shuffled
    "nfold" times with shuffle_window(sequence, kmer, winlen, step). Each
    result is printed on standard output in FASTA format with a
    "background_seq_<n>" identifier, n being a counter starting at 1 and
    incremented after every generated sequence.

    Return the list of %GC of the generated sequences (as returned by GC),
    the list of their lengths and the accumulated dinucleotide counts.

    """
    set_seed(random_seed)
    bg_gc_list = []
    bg_lengths = []
    dinuc = [0] * len(IUPAC_DINUC)
    counter = 1
    for record in seqs:
        original = str(record.seq)
        description = "Background sequence for {0:s}".format(record.name)
        for _ in range(nfold):
            shuffled = shuffle_window(original, kmer, winlen, step)
            shuffled_seq = Seq(shuffled, IUPACData.ambiguous_dna_letters)
            seq_id = "background_seq_{0:d}".format(counter)
            background = SeqRecord(shuffled_seq, id=seq_id,
                                   description=description)
            print(background.format("fasta"), end='')
            bg_gc_list.append(GC(shuffled))
            bg_lengths.append(len(shuffled))
            counts = dinuc_count(shuffled)
            dinuc = [total + cnt for total, cnt in zip(dinuc, counts)]
            counter += 1
    return bg_gc_list, bg_lengths, dinuc
def generate_len_sequences(fg, bg, bg_dir, nfold):
    """
    Extract the sequences from the bg with similar sizes as in the fg.

    The recursion limit is raised to 10000 first. Then, for every %GC bin
    (0 to 100) with a non-empty foreground entry and for every length in
    that entry, extract_seq_rec is asked for "count * nfold" background
    sequences. The background bin is looked up again for every length: in
    "bg" when it is not empty, through
    get_bins_len_from_bg_dir(bg_dir, percent) otherwise.

    Retrieved sequences are printed on standard output in FASTA format. A
    warning is written on standard error when the number obtained differs
    from the number needed.

    Return the %GC list (one entry per printed sequence), the list of their
    lengths and the accumulated dinucleotide counts.

    """
    sys.setrecursionlimit(10000)
    lengths = []
    gc_list = []
    dinuc = [0] * len(IUPAC) * len(IUPAC)
    for percent in range(101):
        fg_counts = fg[percent]
        if not fg_counts:
            continue
        nb_needed = sum(fg_counts.values()) * nfold
        picked = []
        for size in fg[percent].keys():
            wanted = fg[percent][size] * nfold
            # Kept inside the loop on purpose: the bin is fetched anew for
            # every length, exactly as before.
            candidates = (bg[percent] if bg
                          else get_bins_len_from_bg_dir(bg_dir, percent))
            sorted_keys = sorted(candidates.keys())
            found, _ = extract_seq_rec(size, wanted, sorted_keys, candidates,
                                       [], 0)
            picked.extend(found)
        nb_found = len(picked)
        gc_list.extend([percent] * nb_found)
        if nb_found != nb_needed:
            sys.stderr.write(
                "*** WARNING *** Sample larger than population for "
                "{0:d}% G+C content: {1:d} needed and {2:d} obtained\n".format(
                    percent, nb_needed, nb_found))
        for rec in picked:
            lengths.append(len(rec))
            counts = dinuc_count(rec.seq)
            dinuc = [total + cnt for total, cnt in zip(dinuc, counts)]
            print("{0:s}".format(rec.format("fasta")), end='')
    return gc_list, lengths, dinuc
def generate_sequences(fg_bins, bg_bins, bg_dir, deviation, winlen, step,
                       nfold, random_seed):
    """
    Pick random background sequences matching the foreground per %GC bin.

    After seeding the random module, every %GC bin (0 to 100) whose
    foreground entry has a truthy first element is processed:
    "fg_bins[percent][0][0] * nfold" sequences are requested from
    extract_random_sample, which gets the background bin, the foreground
    entry, "deviation", "winlen" and "step". The background bin comes from
    "bg_bins" when it is not empty, from
    get_bins_from_bg_dir(bg_dir, percent) otherwise.

    The returned sample is printed on standard output in FASTA format. When
    extract_random_sample reports sequences left to find, a warning is
    written on standard error.

    Return the list of %GC (one entry per sequence counted as obtained), the
    list of sequence lengths and the accumulated dinucleotide counts.

    """
    random.seed(random_seed)
    gc_list = []
    lengths = []
    dinuc = [0] * len(IUPAC) * len(IUPAC)
    for percent in range(101):
        fg_info = fg_bins[percent]
        if not fg_info[0]:
            continue
        nb_needed = fg_info[0][0] * nfold
        if bg_bins:
            candidates = bg_bins[percent]
        else:
            candidates = get_bins_from_bg_dir(bg_dir, percent)
        missing, sample = extract_random_sample(candidates, fg_bins[percent],
                                                nb_needed, deviation, winlen,
                                                step)
        if not missing:
            gc_list.extend([percent] * nb_needed)
        else:
            sys.stderr.write(
                "\n*** WARNING *** Sample larger than population for "
                "{0:d}% G+C content: {1:d} needed and {2:d} obtained\n".format(
                    percent, nb_needed, nb_needed - missing))
            gc_list.extend([percent] * (nb_needed - missing))
        for rec in sample:
            print(rec.format("fasta"), end='')
            lengths.append(len(rec.seq))
            counts = dinuc_count(rec.seq)
            dinuc = [total + cnt for total, cnt in zip(dinuc, counts)]
    return gc_list, lengths, dinuc
def fg_GC_bins(fg_file):
    """
    Count the foreground sequences falling in each %GC bin.

    Every record parsed from the FASTA file "fg_file" increments the counter
    of the bin (0 to 100) given by its rounded %GC.

    Return the list of (unrounded) %GC values, the list of per-bin counts,
    the list of sequence lengths and the accumulated dinucleotide counts.

    """
    counts_per_bin = [0 for _ in range(101)]
    gc_values = []
    seq_lengths = []
    dinuc = [0] * len(IUPAC) * len(IUPAC)
    with open_for_parsing(fg_file) as stream:
        for rec in SeqIO.parse(stream, "fasta"):
            gc_value = GC(rec.seq)
            gc_values.append(gc_value)
            # Round to get an integer bin index (python 3 fix).
            counts_per_bin[int(round(gc_value))] += 1
            seq_lengths.append(len(rec.seq))
            counts = dinuc_count(rec.seq)
            dinuc = [total + cnt for total, cnt in zip(dinuc, counts)]
    return gc_values, counts_per_bin, seq_lengths, dinuc