Example #1
0
def GC_info(seq, win_len, step):
    """
    Calculate needed %GC information.

    Compute the G+C content of the whole sequence together with the minimal
    %GC, maximal %GC, standard deviation of %GC, and coefficient of variation
    (CV) of %GC observed in sliding windows.

    Arguments:
    seq -- the sequence to analyse.
    win_len -- length of the sliding windows.
    step -- distance between the start positions of two consecutive windows.
            A non-positive (or missing) step is treated as 1.

    Return the tuple (gc, min_gc, max_gc, sd, cv) where gc is the value given
    by GC() on the full sequence. When the window is at least as long as the
    sequence, no sliding is possible and (gc, gc, gc, 0, 0) is returned.

    """

    gc = GC(seq)
    seq_len = len(seq)
    if win_len >= seq_len:
        return gc, gc, gc, 0, 0
    # A step that does not move the window forward would never terminate (or
    # yield no window at all), so fall back to sliding one position at a time.
    stride = step if step and step > 0 else 1
    # The "+ 1" makes sure the window ending on the last nucleotide is
    # considered as well.
    window_gc = [GC(seq[start:start + win_len])
                 for start in range(0, seq_len - win_len + 1, stride)]
    sd = numpy.std(window_gc)
    # Applying +1 to GC to make sure we do not divide by 0
    cv = 100. * sd / (gc + 1.)
    return gc, min(window_gc), max(window_gc), sd, cv
Example #2
0
def fg_len_GC_bins(fg_file):
    """
    Compute G+C content for all sequences in the foreground.

    Computes the %GC content of every sequence and stores it in a list. The
    %GC is rounded to the nearest integer to select one of the 101 G+C
    percentage bins (0 to 100); each bin is a dictionary associating a
    sequence length to the number of sequences of that length falling in the
    bin.
    Return lists of GC contents, GC bins, lengths distrib, and the cumulated
    dinucleotide counts.

    """
    with open_for_parsing(fg_file) as stream:
        gc_bins = [{} for _ in range(101)]
        gc_list = []
        lengths = []
        dinuc = [0] * len(IUPAC) * len(IUPAC)
        for record in SeqIO.parse(stream, "fasta"):
            gc = GC(record.seq)
            gc_list.append(gc)
            # The %GC is a float and cannot be used as a list index in
            # python 3: round it to the closest integer bin.
            gc_bin = int(round(gc))
            length = len(record)
            dinuc = [x + y for x, y in zip(dinuc, dinuc_count(record.seq))]
            lengths.append(length)
            len_counts = gc_bins[gc_bin]
            len_counts[length] = len_counts.get(length, 0) + 1
    return gc_list, gc_bins, lengths, dinuc
Example #3
0
def bg_len_GC_bins(bg_file, bg_dir):
    """
    Compute G+C content for all sequences in the background.

    Compute and store the %GC information in a list. The %GC is rounded to the
    nearest integer to select one of the 101 G+C percentage bins (0 to 100);
    each bin is a dictionary associating a sequence length to the list of
    records of that length falling in the bin.
    The bins are passed to print_in_bg_dir along with "bg_dir".
    Return lists of GC contents, GC bins, lengths distrib, and the cumulated
    dinucleotide counts.

    """
    with open_for_parsing(bg_file) as stream:
        gc_bins = [{} for _ in range(101)]
        gc_list = []
        lengths = []
        dinuc = [0] * len(IUPAC) * len(IUPAC)
        for record in SeqIO.parse(stream, "fasta"):
            gc = GC(record.seq)
            gc_list.append(gc)
            # The %GC is a float and cannot be used as a list index in
            # python 3: round it to the closest integer bin.
            gc_bin = int(round(gc))
            gc_bins[gc_bin].setdefault(len(record), []).append(record)
            lengths.append(len(record.seq))
            dinuc = [x + y for x, y in zip(dinuc, dinuc_count(record.seq))]
    print_in_bg_dir(gc_bins, bg_dir, True)
    return gc_list, gc_bins, lengths, dinuc
Example #4
0
def bg_GC_bins(bg_file, bg_dir):
    """
    Bin the background sequences by G+C percentage.

    Every FASTA record read from "bg_file" has its %GC stored in a list and is
    appended to the bin (0 to 100) matching its %GC rounded to the nearest
    integer. Sequence lengths and cumulated dinucleotide counts are collected
    along the way.
    The bins are passed to print_in_bg_dir along with "bg_dir".
    Return lists of GC contents, GC bins, lengths distrib, and the cumulated
    dinucleotide counts.

    """
    with open_for_parsing(bg_file) as handle:
        gc_bins = [[] for _ in range(101)]
        gc_list = []
        lengths = []
        dinuc = [0] * len(IUPAC) * len(IUPAC)
        for entry in SeqIO.parse(handle, "fasta"):
            gc_percent = GC(entry.seq)
            gc_list.append(gc_percent)
            # Bins are indexed by integers, hence the rounding (python3 fix)
            gc_bins[int(round(gc_percent))].append(entry)
            lengths.append(len(entry.seq))
            counts = dinuc_count(entry.seq)
            dinuc = [total + new for total, new in zip(dinuc, counts)]
    print_in_bg_dir(gc_bins, bg_dir)
    return gc_list, gc_bins, lengths, dinuc
Example #5
0
def bg_len_GC_bins(bg, bg_dir):
    """
    Get GC and lengths info for background sequences.

    Compute G+C content for all sequences in the background and store the
    information in a list. The %GC is rounded to the nearest integer to select
    one of the 101 G+C percentage bins (0 to 100); each bin is a dictionary
    associating a sequence length to the list of records of that length
    falling in the bin.
    The bins are passed to print_in_bg_dir along with "bg_dir".

    Return lists of GC contents, GC bins, lengths distrib, and the cumulated
    dinucleotide counts.

    """

    with open_for_parsing(bg) as stream:
        gc_bins = [{} for _ in range(101)]
        gc_list = []
        lengths = []
        dinuc = [0] * len(IUPAC) * len(IUPAC)
        for record in SeqIO.parse(stream, "fasta"):
            gc = GC(record.seq)
            gc_list.append(gc)
            # The %GC is a float and cannot be used as a list index in
            # python 3: round it to the closest integer bin.
            gc_bin = int(round(gc))
            gc_bins[gc_bin].setdefault(len(record), []).append(record)
            lengths.append(len(record.seq))
            dinuc = [x + y for x, y in zip(dinuc, dinuc_count(record.seq))]
    print_in_bg_dir(gc_bins, bg_dir, True)
    return gc_list, gc_bins, lengths, dinuc
def generate_sequences(seqs, kmer, winlen, step, nfold, random_seed):
    """
    Print shuffled background sequences and collect their composition.

    The random seed is set through set_seed(random_seed). Each record of
    "seqs" is then shuffled "nfold" times with
    shuffle_window(seq, kmer, winlen, step). Every shuffled sequence is
    printed in FASTA format under the identifier "background_seq_<n>", where
    <n> is a counter starting at 1 and shared by all generated sequences.
    Return lists of GC contents and lengths of the generated sequences, and
    their cumulated dinucleotide counts.

    """
    set_seed(random_seed)
    seq_number = 1
    bg_gc_list, bg_lengths = [], []
    dinuc = [0] * len(IUPAC_DINUC)
    for record in seqs:
        original = str(record.seq)
        # The description only depends on the source record, so it is shared
        # by all the shuffled versions of that record.
        descr = "Background sequence for {0:s}".format(record.name)
        for _ in range(nfold):
            shuffled = shuffle_window(original, kmer, winlen, step)
            shuffled_record = SeqRecord(
                Seq(shuffled, IUPACData.ambiguous_dna_letters),
                id="background_seq_{0:d}".format(seq_number),
                description=descr)
            print(shuffled_record.format("fasta"), end='')
            bg_gc_list.append(GC(shuffled))
            bg_lengths.append(len(shuffled))
            counts = dinuc_count(shuffled)
            dinuc = [total + new for total, new in zip(dinuc, counts)]
            seq_number += 1
    return bg_gc_list, bg_lengths, dinuc
Example #7
0
def fg_GC_bins(fg_file):
    """
    Count the foreground sequences per G+C percentage bin.

    Every FASTA record read from "fg_file" has its %GC stored in a list and
    increments the counter of the bin (0 to 100) matching its %GC rounded to
    the nearest integer. Sequence lengths and cumulated dinucleotide counts
    are collected along the way.
    Return lists of GC contents, GC bins, lengths distrib, and dinuc compo.

    """
    with open_for_parsing(fg_file) as handle:
        gc_bins = [0 for _ in range(101)]
        gc_list = []
        lengths = []
        dinuc = [0] * len(IUPAC) * len(IUPAC)
        for entry in SeqIO.parse(handle, "fasta"):
            gc_percent = GC(entry.seq)
            gc_list.append(gc_percent)
            # Bins are indexed by integers, hence the rounding (python 3 fix)
            bin_index = int(round(gc_percent))
            gc_bins[bin_index] = gc_bins[bin_index] + 1
            lengths.append(len(entry.seq))
            counts = dinuc_count(entry.seq)
            dinuc = [total + new for total, new in zip(dinuc, counts)]
    return gc_list, gc_bins, lengths, dinuc