def GC_info(seq, win_len, step):
    """
    Calculate needed %GC information.

    Compute the overall %GC of ``seq`` (as returned by ``GC``) together with
    summary statistics of the %GC measured in windows of ``win_len`` letters
    sliding along the sequence every ``step`` positions.

    Parameters:
        seq: the sequence to analyse (anything supporting ``len`` and
            slicing that ``GC`` accepts).
        win_len: length of the sliding window.
        step: distance between the starts of two consecutive windows. A
            missing or non-positive value falls back to 1.

    Return the tuple ``(gc, min_gc, max_gc, sd, cv)`` where ``gc`` is the
    %GC of the whole sequence, ``min_gc`` / ``max_gc`` / ``sd`` are the
    minimum, maximum and standard deviation of the window %GC values, and
    ``cv`` is ``100 * sd / (gc + 1)``.

    When the window is at least as long as the sequence there is nothing to
    slide over and ``(gc, gc, gc, 0, 0)`` is returned.

    """
    gc = GC(seq)
    seq_len = len(seq)
    if win_len >= seq_len:
        return gc, gc, gc, 0, 0

    # A step that is missing or non-positive cannot drive range(); use a
    # stride of one position in that case.
    stride = step if step and step > 0 else 1

    # The "+ 1" makes the last full window (the one ending on the final
    # letter of the sequence) part of the statistics.
    window_gc = [
        GC(seq[start:start + win_len])
        for start in range(0, seq_len - win_len + 1, stride)
    ]
    sd = numpy.std(window_gc)
    # Applying +1 to GC to make sure we do not divide by 0
    return gc, min(window_gc), max(window_gc), sd, 100. * sd / (gc + 1.)
def fg_len_GC_bins(fg_file):
    """
    Compute G+C content for all sequences in the foreground.

    Read the FASTA file ``fg_file`` and, for every sequence, record its %GC
    (as returned by ``GC``) and its length. Sequences are counted in 101
    bins indexed by the %GC rounded to the nearest integer; each bin is a
    dictionary mapping a sequence length to the number of sequences having
    that length.

    Return the tuple ``(gc_list, gc_bins, lengths, dinuc)``: the list of
    %GC values, the list of per-bin ``{length: count}`` dictionaries, the
    list of sequence lengths, and the element-wise sum of the
    ``dinuc_count`` vectors of all sequences.

    """
    with open_for_parsing(fg_file) as stream:
        gc_bins = [{} for _ in range(101)]
        gc_list = []
        lengths = []
        dinuc = [0] * len(IUPAC) * len(IUPAC)
        for record in SeqIO.parse(stream, "fasta"):
            gc = GC(record.seq)
            gc_list.append(gc)
            # A list index has to be an integer, so the percentage is
            # rounded to select the bin (same rule as fg_GC_bins).
            gc_bin = int(round(gc))
            length = len(record)
            lengths.append(length)
            gc_bins[gc_bin][length] = gc_bins[gc_bin].get(length, 0) + 1
            dinuc = [x + y for x, y in zip(dinuc, dinuc_count(record.seq))]
    return gc_list, gc_bins, lengths, dinuc
def bg_len_GC_bins(bg_file, bg_dir):
    """
    Compute G+C content for all sequences in the background.

    Read the FASTA file ``bg_file`` and, for every sequence, record its %GC
    (as returned by ``GC``) and its length. Records are stored in 101 bins
    indexed by the %GC rounded to the nearest integer; each bin is a
    dictionary mapping a sequence length to the list of records having that
    length. The bins are then handed to ``print_in_bg_dir`` together with
    ``bg_dir``.

    Return the tuple ``(gc_list, gc_bins, lengths, dinuc)``: the list of
    %GC values, the list of per-bin ``{length: [records]}`` dictionaries,
    the list of sequence lengths, and the element-wise sum of the
    ``dinuc_count`` vectors of all sequences.

    """
    # NOTE(review): another function named bg_len_GC_bins is defined further
    # down in this module and rebinds the name; confirm which one is meant
    # to be kept.
    with open_for_parsing(bg_file) as stream:
        gc_bins = [{} for _ in range(101)]
        gc_list = []
        lengths = []
        dinuc = [0] * len(IUPAC) * len(IUPAC)
        for record in SeqIO.parse(stream, "fasta"):
            gc = GC(record.seq)
            gc_list.append(gc)
            # A list index has to be an integer, so the percentage is
            # rounded to select the bin (same rule as bg_GC_bins).
            gc_bin = int(round(gc))
            gc_bins[gc_bin].setdefault(len(record), []).append(record)
            lengths.append(len(record.seq))
            dinuc = [x + y for x, y in zip(dinuc, dinuc_count(record.seq))]
    # The True flag is passed only by the length-aware binning functions;
    # presumably it tells print_in_bg_dir the bins are keyed by length.
    print_in_bg_dir(gc_bins, bg_dir, True)
    return gc_list, gc_bins, lengths, dinuc
def bg_GC_bins(bg_file, bg_dir):
    """
    Bin the background sequences by %GC.

    Every record of the FASTA file ``bg_file`` is appended to one of 101
    bins, selected by its %GC (as returned by ``GC``) rounded to the
    nearest integer. The bins are then handed to ``print_in_bg_dir``
    together with ``bg_dir``.

    Return the tuple ``(gc_list, gc_bins, lengths, dinuc)``: the unrounded
    %GC values, the list of per-bin record lists, the sequence lengths, and
    the element-wise sum of the ``dinuc_count`` vectors of all sequences.

    """
    with open_for_parsing(bg_file) as stream:
        bins = [[] for _ in range(101)]
        gc_values = []
        seq_lengths = []
        dinuc_totals = [0] * (len(IUPAC) * len(IUPAC))
        for record in SeqIO.parse(stream, "fasta"):
            sequence = record.seq
            gc_value = GC(sequence)
            gc_values.append(gc_value)
            # Round to an integer so the percentage can index the bins.
            bins[int(round(gc_value))].append(record)
            seq_lengths.append(len(sequence))
            counts = dinuc_count(sequence)
            dinuc_totals = [
                total + count for total, count in zip(dinuc_totals, counts)
            ]
    print_in_bg_dir(bins, bg_dir)
    return gc_values, bins, seq_lengths, dinuc_totals
def bg_len_GC_bins(bg, bg_dir):
    """
    Get %GC and lengths info for background sequences.

    Read the FASTA file ``bg`` and, for every sequence, record its %GC (as
    returned by ``GC``) and its length. Records are stored in 101 bins
    indexed by the %GC rounded to the nearest integer; each bin is a
    dictionary mapping a sequence length to the list of records having that
    length. The bins are then handed to ``print_in_bg_dir`` together with
    ``bg_dir``.

    Return the tuple ``(gc_list, gc_bins, lengths, dinuc)``: the list of
    %GC values, the list of per-bin ``{length: [records]}`` dictionaries,
    the list of sequence lengths, and the element-wise sum of the
    ``dinuc_count`` vectors of all sequences.

    """
    # NOTE(review): an earlier function in this module is also named
    # bg_len_GC_bins; this later definition is the one that stays bound.
    with open_for_parsing(bg) as stream:
        gc_bins = [{} for _ in range(101)]
        gc_list = []
        lengths = []
        dinuc = [0] * len(IUPAC) * len(IUPAC)
        for record in SeqIO.parse(stream, "fasta"):
            gc = GC(record.seq)
            gc_list.append(gc)
            # A list index has to be an integer, so the percentage is
            # rounded to select the bin (same rule as bg_GC_bins).
            gc_bin = int(round(gc))
            gc_bins[gc_bin].setdefault(len(record), []).append(record)
            lengths.append(len(record.seq))
            dinuc = [x + y for x, y in zip(dinuc, dinuc_count(record.seq))]
    # The True flag is passed only by the length-aware binning functions;
    # presumably it tells print_in_bg_dir the bins are keyed by length.
    print_in_bg_dir(gc_bins, bg_dir, True)
    return gc_list, gc_bins, lengths, dinuc
def generate_sequences(seqs, kmer, winlen, step, nfold, random_seed):
    """
    Generate and print shuffled background sequences.

    After seeding through ``set_seed(random_seed)``, every record of
    ``seqs`` is shuffled ``nfold`` times with
    ``shuffle_window(sequence, kmer, winlen, step)``. Each shuffled
    sequence is written to standard output in FASTA format under the id
    ``background_seq_<n>``, where ``<n>`` starts at 1 and grows by one for
    each generated sequence.

    Return the tuple ``(bg_gc_list, bg_lengths, dinuc)``: the %GC of each
    generated sequence (as returned by ``GC``), the length of each
    generated sequence, and the element-wise sum of their ``dinuc_count``
    vectors.

    """
    set_seed(random_seed)
    seq_index = 1
    gc_values = []
    seq_lengths = []
    dinuc_totals = [0] * len(IUPAC_DINUC)
    for record in seqs:
        template = str(record.seq)
        # All shuffles of one input record share the same description.
        description = "Background sequence for {0:s}".format(record.name)
        for _ in range(nfold):
            shuffled = shuffle_window(template, kmer, winlen, step)
            background = SeqRecord(
                Seq(shuffled, IUPACData.ambiguous_dna_letters),
                id="background_seq_{0:d}".format(seq_index),
                description=description,
            )
            # The FASTA text already ends with a newline.
            print(background.format("fasta"), end='')
            gc_values.append(GC(shuffled))
            seq_lengths.append(len(shuffled))
            counts = dinuc_count(shuffled)
            dinuc_totals = [
                total + count for total, count in zip(dinuc_totals, counts)
            ]
            seq_index += 1
    return gc_values, seq_lengths, dinuc_totals
def fg_GC_bins(fg_file):
    """
    Count the foreground sequences per %GC bin.

    Every sequence of the FASTA file ``fg_file`` increments one of 101
    counters, selected by its %GC (as returned by ``GC``) rounded to the
    nearest integer.

    Return the tuple ``(gc_list, gc_bins, lengths, dinuc)``: the unrounded
    %GC values, the list of per-bin sequence counts, the sequence lengths,
    and the element-wise sum of the ``dinuc_count`` vectors of all
    sequences.

    """
    with open_for_parsing(fg_file) as stream:
        bin_counts = [0 for _ in range(101)]
        gc_values = []
        seq_lengths = []
        dinuc_totals = [0] * (len(IUPAC) * len(IUPAC))
        for record in SeqIO.parse(stream, "fasta"):
            sequence = record.seq
            gc_value = GC(sequence)
            gc_values.append(gc_value)
            # Round to an integer so the percentage can index the bins.
            bin_counts[int(round(gc_value))] += 1
            seq_lengths.append(len(sequence))
            counts = dinuc_count(sequence)
            dinuc_totals = [
                total + count for total, count in zip(dinuc_totals, counts)
            ]
    return gc_values, bin_counts, seq_lengths, dinuc_totals