def bg_len_GC_bins(bg_file, bg_dir):
    """
    Bin the background sequences by %GC and by sequence length.

    Every FASTA record read from `bg_file` is placed in one of 101 bins
    (0..100) chosen by its rounded G+C percentage. Each bin is a dict that
    maps a sequence length to the list of records having that length.
    The bins are then handed to `print_in_bg_dir(gc_bins, bg_dir, True)`
    (presumably to write them under `bg_dir` -- confirm in that helper).

    Return a 4-tuple:
      - list of the (unrounded) %GC values, one per record,
      - list of 101 dicts {length: [records]},
      - list of sequence lengths, one per record,
      - summed dinucleotide counts over all records.
    """
    gc_bins = [{} for _ in range(101)]
    gc_list = []
    lengths = []
    dinuc = [0] * len(IUPAC) * len(IUPAC)
    with open_for_parsing(bg_file) as stream:
        for record in SeqIO.parse(stream, "fasta"):
            gc = GC(record.seq)
            # Keep the raw value for the distribution returned to callers.
            gc_list.append(gc)
            # GC() may return a float percentage; list indices must be
            # integers, so round to the nearest whole-percent bin.
            gc = int(round(gc))
            gc_bins[gc].setdefault(len(record), []).append(record)
            lengths.append(len(record.seq))
            dinuc = [x + y for x, y in zip(dinuc, dinuc_count(record.seq))]
    print_in_bg_dir(gc_bins, bg_dir, True)
    return gc_list, gc_bins, lengths, dinuc
def fg_len_GC_bins(fg_file):
    """
    Count the foreground sequences per %GC bin and per sequence length.

    Every FASTA record read from `fg_file` is assigned to one of 101 bins
    (0..100) chosen by its rounded G+C percentage. Each bin is a dict that
    maps a sequence length to the number of records having that length.

    Return a 4-tuple:
      - list of the (unrounded) %GC values, one per record,
      - list of 101 dicts {length: count},
      - list of sequence lengths, one per record,
      - summed dinucleotide counts over all records.
    """
    gc_bins = [{} for _ in range(101)]
    gc_list = []
    lengths = []
    dinuc = [0] * len(IUPAC) * len(IUPAC)
    with open_for_parsing(fg_file) as stream:
        for record in SeqIO.parse(stream, "fasta"):
            gc = GC(record.seq)
            # Keep the raw value for the distribution returned to callers.
            gc_list.append(gc)
            # GC() may return a float percentage; list indices must be
            # integers, so round to the nearest whole-percent bin.
            gc = int(round(gc))
            length = len(record)
            dinuc = [x + y for x, y in zip(dinuc, dinuc_count(record.seq))]
            lengths.append(length)
            gc_bins[gc][length] = gc_bins[gc].get(length, 0) + 1
    return gc_list, gc_bins, lengths, dinuc
def bg_GC_bins(bg_file, bg_dir):
    """
    Bin the background sequences by their rounded %GC.

    Each FASTA record of `bg_file` is appended to one of 101 lists
    (index 0..100) selected by its G+C percentage rounded to an integer.
    The bins are passed to `print_in_bg_dir` along with `bg_dir`.

    Return the raw %GC values, the 101 bins of records, the sequence
    lengths, and the summed dinucleotide counts.
    """
    with open_for_parsing(bg_file) as handle:
        bins = [[] for _ in range(101)]
        gc_values = []
        seq_lengths = []
        dinuc_totals = [0] * len(IUPAC) * len(IUPAC)
        for rec in SeqIO.parse(handle, "fasta"):
            gc_value = GC(rec.seq)
            gc_values.append(gc_value)
            # Bin index is the percentage rounded to a whole number.
            bins[int(round(gc_value))].append(rec)
            seq_lengths.append(len(rec.seq))
            rec_counts = dinuc_count(rec.seq)
            dinuc_totals = [
                total + extra for total, extra in zip(dinuc_totals, rec_counts)
            ]
    print_in_bg_dir(bins, bg_dir)
    return gc_values, bins, seq_lengths, dinuc_totals
def fg_GC_bins(fg, winlen, step):
    """
    Collect per-bin %GC window statistics for the foreground sequences.

    For every FASTA record of `fg`, `GC_info(seq, winlen, step)` yields the
    overall %GC plus min, max, sd and cv values. The (min, max, sd, cv)
    tuple is stored in the bin (0..100) matching the rounded overall %GC.

    Return the raw %GC values, the result of `avg_and_sd_gc_info` applied
    to the bins, the sequence lengths, and the summed dinucleotide counts.
    """
    with open_for_parsing(fg) as handle:
        window_bins = [[] for _ in range(101)]
        gc_values = []
        seq_lengths = []
        dinuc_totals = [0] * len(IUPAC) * len(IUPAC)
        for rec in SeqIO.parse(handle, "fasta"):
            info = GC_info(rec.seq, winlen, step)
            mean_gc, lowest, highest, spread, coeff_var = info
            gc_values.append(mean_gc)
            # Bin index is the percentage rounded to a whole number.
            bin_index = int(round(mean_gc))
            window_bins[bin_index].append((lowest, highest, spread, coeff_var))
            seq_lengths.append(len(rec.seq))
            rec_counts = dinuc_count(rec.seq)
            dinuc_totals = [
                total + extra for total, extra in zip(dinuc_totals, rec_counts)
            ]
    return gc_values, avg_and_sd_gc_info(window_bins), seq_lengths, dinuc_totals
def bg_len_GC_bins(bg, bg_dir):
    """
    Get %GC and length binning for the background sequences.

    Every FASTA record read from `bg` is placed in one of 101 bins (0..100)
    chosen by its rounded G+C percentage. Each bin is a dict that maps a
    sequence length to the list of records having that length. The bins
    are then handed to `print_in_bg_dir(gc_bins, bg_dir, True)`
    (presumably to write them under `bg_dir` -- confirm in that helper).

    Return a 4-tuple:
      - list of the (unrounded) %GC values, one per record,
      - list of 101 dicts {length: [records]},
      - list of sequence lengths, one per record,
      - summed dinucleotide counts over all records.
    """
    gc_bins = [{} for _ in range(101)]
    gc_list = []
    lengths = []
    dinuc = [0] * len(IUPAC) * len(IUPAC)
    with open_for_parsing(bg) as stream:
        for record in SeqIO.parse(stream, "fasta"):
            gc = GC(record.seq)
            # Keep the raw value for the distribution returned to callers.
            gc_list.append(gc)
            # GC() may return a float percentage; list indices must be
            # integers, so round to the nearest whole-percent bin.
            gc = int(round(gc))
            gc_bins[gc].setdefault(len(record), []).append(record)
            lengths.append(len(record.seq))
            dinuc = [x + y for x, y in zip(dinuc, dinuc_count(record.seq))]
    print_in_bg_dir(gc_bins, bg_dir, True)
    return gc_list, gc_bins, lengths, dinuc
def fg_len_GC_bins(fg, winlen, step):
    """
    Get %GC window statistics and length counts for foreground sequences.

    For every FASTA record of `fg`, `GC_info(seq, winlen, step)` yields the
    overall %GC plus min, max, sd and cv values. Using the overall %GC
    rounded to an integer as bin index (0..100):
      - the (min, max, sd, cv) tuple is appended to that bin, and
      - a per-bin dict {length: count} is incremented for the record length.

    Return a 4-tuple:
      - list of the (unrounded) %GC values, one per record,
      - the result of `avg_and_sd_len_gc_info(l_dic, tmp_gc_bins)`,
      - list of sequence lengths, one per record,
      - summed dinucleotide counts over all records.
    """
    tmp_gc_bins = [[] for _ in range(101)]
    l_dic = [{} for _ in range(101)]
    gc_list = []
    lengths = []
    dinuc = [0] * len(IUPAC) * len(IUPAC)
    with open_for_parsing(fg) as stream:
        for record in SeqIO.parse(stream, "fasta"):
            gc, min_gc, max_gc, sd_gc, cv_gc = GC_info(record.seq, winlen, step)
            # Keep the raw value for the distribution returned to callers.
            gc_list.append(gc)
            # The mean %GC may be a float; list indices must be integers,
            # so round to the nearest whole-percent bin before indexing.
            gc = int(round(gc))
            tmp_gc_bins[gc].append((min_gc, max_gc, sd_gc, cv_gc))
            length = len(record)
            l_dic[gc][length] = l_dic[gc].get(length, 0) + 1
            lengths.append(length)
            dinuc = [x + y for x, y in zip(dinuc, dinuc_count(record.seq))]
    return gc_list, avg_and_sd_len_gc_info(l_dic, tmp_gc_bins), lengths, dinuc
def get_bins_from_bg_dir(bg_dir, percent):
    """
    Load every FASTA record stored in the bin file for `percent`.

    The file read is "<bg_dir>/bg_bin_<percent>.txt", where `percent` is
    formatted as an integer. Return the records as a list, in file order.
    """
    bin_path = "{0}/bg_bin_{1:d}.txt".format(bg_dir, percent)
    with open_for_parsing(bin_path) as handle:
        # Materialize the parser output while the file is still open.
        records = list(SeqIO.parse(handle, "fasta"))
    return records
def get_bins_len_from_bg_dir(bg_dir, percent):
    """
    Load the bin file for `percent` and group its records by length.

    The file read is "<bg_dir>/bg_bin_<percent>.txt", where `percent` is
    formatted as an integer. Return a dict mapping each sequence length
    to the list of records of that length, in file order.
    """
    bin_path = "{0}/bg_bin_{1:d}.txt".format(bg_dir, percent)
    by_length = {}
    with open_for_parsing(bin_path) as handle:
        for rec in SeqIO.parse(handle, "fasta"):
            # Create the per-length list on first sight, then append.
            by_length.setdefault(len(rec), []).append(rec)
    return by_length
def fg_GC_bins(fg_file):
    """
    Count the foreground sequences falling in each %GC bin.

    Each FASTA record of `fg_file` increments one of 101 counters
    (index 0..100) selected by its G+C percentage rounded to an integer.

    Return the raw %GC values, the 101 per-bin counts, the sequence
    lengths, and the summed dinucleotide counts.
    """
    with open_for_parsing(fg_file) as handle:
        bin_counts = [0 for _ in range(101)]
        gc_values = []
        seq_lengths = []
        dinuc_totals = [0] * len(IUPAC) * len(IUPAC)
        for rec in SeqIO.parse(handle, "fasta"):
            gc_value = GC(rec.seq)
            gc_values.append(gc_value)
            # Bin index is the percentage rounded to a whole number.
            bin_index = int(round(gc_value))
            bin_counts[bin_index] = bin_counts[bin_index] + 1
            seq_lengths.append(len(rec.seq))
            rec_counts = dinuc_count(rec.seq)
            dinuc_totals = [
                total + extra for total, extra in zip(dinuc_totals, rec_counts)
            ]
    return gc_values, bin_counts, seq_lengths, dinuc_totals