Example #1
0
def bg_len_GC_bins(bg_file, bg_dir):
    """
    Bin background sequences by %GC and, within each bin, by length.

    Every FASTA record read from bg_file is placed in one of 101 bins
    (indices 0 to 100) selected by its G+C percentage rounded to the nearest
    integer. Each bin is a dict mapping a sequence length to the list of
    records having that length. Once the file is read, the bins are handed to
    print_in_bg_dir together with bg_dir.

    Return a tuple (gc_list, gc_bins, lengths, dinuc): gc_list holds the
    unrounded value returned by GC for each record, gc_bins is the list of
    per-bin dicts, lengths holds the length of each sequence, and dinuc is
    the element-wise sum of dinuc_count over all sequences.

    """
    with open_for_parsing(bg_file) as stream:
        gc_bins = [{} for _ in range(101)]
        gc_list = []
        lengths = []
        dinuc = [0] * len(IUPAC) * len(IUPAC)
        for record in SeqIO.parse(stream, "fasta"):
            gc = GC(record.seq)
            # Keep the raw percentage for the distribution of GC contents.
            gc_list.append(gc)
            # A float cannot index a list, so round the percentage to the
            # nearest integer before using it as a bin index.
            gc_bin = int(round(gc))
            gc_bins[gc_bin].setdefault(len(record), []).append(record)
            lengths.append(len(record.seq))
            dinuc = [x + y for x, y in zip(dinuc, dinuc_count(record.seq))]
    print_in_bg_dir(gc_bins, bg_dir, True)
    return gc_list, gc_bins, lengths, dinuc
Example #2
0
def fg_len_GC_bins(fg_file):
    """
    Count foreground sequences per %GC bin and per length.

    Every FASTA record read from fg_file is assigned to one of 101 bins
    (indices 0 to 100) selected by its G+C percentage rounded to the nearest
    integer. Each bin is a dict mapping a sequence length to the number of
    records of that length falling in the bin.

    Return a tuple (gc_list, gc_bins, lengths, dinuc): gc_list holds the
    unrounded value returned by GC for each record, gc_bins is the list of
    per-bin count dicts, lengths holds the length of each record, and dinuc
    is the element-wise sum of dinuc_count over all sequences.

    """
    with open_for_parsing(fg_file) as stream:
        gc_bins = [{} for _ in range(101)]
        gc_list = []
        lengths = []
        dinuc = [0] * len(IUPAC) * len(IUPAC)
        for record in SeqIO.parse(stream, "fasta"):
            gc = GC(record.seq)
            # Keep the raw percentage for the distribution of GC contents.
            gc_list.append(gc)
            length = len(record)
            dinuc = [x + y for x, y in zip(dinuc, dinuc_count(record.seq))]
            lengths.append(length)
            # A float cannot index a list, so round the percentage to the
            # nearest integer before using it as a bin index.
            gc_bin = int(round(gc))
            gc_bins[gc_bin][length] = gc_bins[gc_bin].get(length, 0) + 1
    return gc_list, gc_bins, lengths, dinuc
Example #3
0
def bg_GC_bins(bg_file, bg_dir):
    """
    Group the background sequences by rounded G+C percentage.

    Each FASTA record read from bg_file is appended to one of 101 lists
    (indices 0 to 100), chosen by rounding the value returned by GC to the
    nearest integer. The resulting bins are then passed to print_in_bg_dir
    along with bg_dir.

    Return a tuple (gc_list, gc_bins, lengths, dinuc): the unrounded GC value
    of every record, the list of per-bin record lists, the length of every
    sequence, and the element-wise sum of dinuc_count over all sequences.

    """
    with open_for_parsing(bg_file) as stream:
        gc_bins = [[] for _ in range(101)]
        gc_list, lengths = [], []
        dinuc = [0] * len(IUPAC) * len(IUPAC)
        for record in SeqIO.parse(stream, "fasta"):
            gc_percent = GC(record.seq)
            gc_list.append(gc_percent)
            # The bin index must be an integer; the raw value stays in gc_list.
            gc_bins[int(round(gc_percent))].append(record)
            lengths.append(len(record.seq))
            counts = dinuc_count(record.seq)
            dinuc = [total + count for total, count in zip(dinuc, counts)]
    print_in_bg_dir(gc_bins, bg_dir)
    return gc_list, gc_bins, lengths, dinuc
Example #4
0
def fg_GC_bins(fg, winlen, step):
    """
    Collect per-bin %GC window statistics for the foreground sequences.

    For each FASTA record read from fg, GC_info is called with the sequence,
    winlen and step. Its first returned value is rounded to the nearest
    integer to select one of 101 bins (indices 0 to 100); the four remaining
    values (unpacked as min_gc, max_gc, sd_gc, cv_gc) are stored as a tuple
    in that bin.

    Return a tuple made of the list of unrounded GC values, the result of
    avg_and_sd_gc_info applied to the bins, the list of sequence lengths, and
    the element-wise sum of dinuc_count over all sequences.

    """

    with open_for_parsing(fg) as stream:
        window_stats_bins = [[] for _ in range(101)]
        gc_list, lengths = [], []
        dinuc = [0] * len(IUPAC) * len(IUPAC)
        for record in SeqIO.parse(stream, "fasta"):
            info = GC_info(record.seq, winlen, step)
            gc_percent, min_gc, max_gc, sd_gc, cv_gc = info
            gc_list.append(gc_percent)
            # The bin index must be an integer; the raw value stays in gc_list.
            bin_index = int(round(gc_percent))
            window_stats_bins[bin_index].append((min_gc, max_gc, sd_gc, cv_gc))
            lengths.append(len(record.seq))
            counts = dinuc_count(record.seq)
            dinuc = [total + count for total, count in zip(dinuc, counts)]
    return gc_list, avg_and_sd_gc_info(window_stats_bins), lengths, dinuc
Example #5
0
def bg_len_GC_bins(bg, bg_dir):
    """
    Get %GC and lengths info for background sequences.

    Every FASTA record read from bg is placed in one of 101 bins (indices 0
    to 100) selected by its G+C percentage rounded to the nearest integer.
    Each bin is a dict mapping a sequence length to the list of records
    having that length. Once the file is read, the bins are handed to
    print_in_bg_dir together with bg_dir.

    Return a tuple (gc_list, gc_bins, lengths, dinuc): gc_list holds the
    unrounded value returned by GC for each record, gc_bins is the list of
    per-bin dicts, lengths holds the length of each sequence, and dinuc is
    the element-wise sum of dinuc_count over all sequences.

    """

    with open_for_parsing(bg) as stream:
        gc_bins = [{} for _ in range(101)]
        gc_list = []
        lengths = []
        dinuc = [0] * len(IUPAC) * len(IUPAC)
        for record in SeqIO.parse(stream, "fasta"):
            gc = GC(record.seq)
            # Keep the raw percentage for the distribution of GC contents.
            gc_list.append(gc)
            # A float cannot index a list, so round the percentage to the
            # nearest integer before using it as a bin index.
            gc_bin = int(round(gc))
            gc_bins[gc_bin].setdefault(len(record), []).append(record)
            lengths.append(len(record.seq))
            dinuc = [x + y for x, y in zip(dinuc, dinuc_count(record.seq))]
    print_in_bg_dir(gc_bins, bg_dir, True)
    return gc_list, gc_bins, lengths, dinuc
Example #6
0
def fg_len_GC_bins(fg, winlen, step):
    """
    Get %GC window statistics and lengths info for foreground sequences.

    For each FASTA record read from fg, GC_info is called with the sequence,
    winlen and step. Its first returned value is rounded to the nearest
    integer to select one of 101 bins (indices 0 to 100). For that bin, the
    four remaining values (unpacked as min_gc, max_gc, sd_gc, cv_gc) are
    stored as a tuple, and a per-length counter of records is incremented.

    Return a tuple made of the list of unrounded GC values, the result of
    avg_and_sd_len_gc_info applied to the per-length counters and the
    per-bin statistics, the list of record lengths, and the element-wise sum
    of dinuc_count over all sequences.

    """

    with open_for_parsing(fg) as stream:
        tmp_gc_bins = [[] for _ in range(101)]
        l_dic = [{} for _ in range(101)]
        gc_list = []
        lengths = []
        dinuc = [0] * len(IUPAC) * len(IUPAC)
        for record in SeqIO.parse(stream, "fasta"):
            gc, min_gc, max_gc, sd_gc, cv_gc = GC_info(record.seq, winlen,
                                                       step)
            # Keep the raw percentage for the distribution of GC contents.
            gc_list.append(gc)
            # A float cannot index a list, so round the percentage to the
            # nearest integer before using it as a bin index.
            gc_bin = int(round(gc))
            tmp_gc_bins[gc_bin].append((min_gc, max_gc, sd_gc, cv_gc))
            length = len(record)
            l_dic[gc_bin][length] = l_dic[gc_bin].get(length, 0) + 1
            lengths.append(length)
            dinuc = [x + y for x, y in zip(dinuc, dinuc_count(record.seq))]
    return gc_list, avg_and_sd_len_gc_info(l_dic, tmp_gc_bins), lengths, dinuc
Example #7
0
def get_bins_from_bg_dir(bg_dir, percent):
    """
    Return the list of FASTA records stored in one background bin file.

    The file read is "<bg_dir>/bg_bin_<percent>.txt", where percent is
    formatted as an integer.

    """

    bin_path = "{0}/bg_bin_{1:d}.txt".format(bg_dir, percent)
    with open_for_parsing(bin_path) as stream:
        # Materialise the records while the stream is still open.
        return list(SeqIO.parse(stream, "fasta"))
Example #8
0
def get_bins_len_from_bg_dir(bg_dir, percent):
    """
    Return the records of one background bin file grouped by length.

    The file read is "<bg_dir>/bg_bin_<percent>.txt", where percent is
    formatted as an integer. The result is a dict mapping each record length
    to the list of records of that length, in file order.

    """

    bin_path = "{0}/bg_bin_{1:d}.txt".format(bg_dir, percent)
    records_by_length = {}
    with open_for_parsing(bin_path) as stream:
        for record in SeqIO.parse(stream, "fasta"):
            records_by_length.setdefault(len(record), []).append(record)
    return records_by_length
Example #9
0
def fg_GC_bins(fg_file):
    """
    Count the foreground sequences falling in each rounded %GC bin.

    For each FASTA record read from fg_file, the value returned by GC is
    rounded to the nearest integer and the counter at that index (0 to 100)
    is incremented.

    Return a tuple (gc_list, gc_bins, lengths, dinuc): the unrounded GC value
    of every record, the list of 101 per-bin counts, the length of every
    sequence, and the element-wise sum of dinuc_count over all sequences.

    """
    with open_for_parsing(fg_file) as stream:
        gc_list, lengths = [], []
        gc_bins = [0 for _ in range(101)]
        dinuc = [0] * len(IUPAC) * len(IUPAC)
        for record in SeqIO.parse(stream, "fasta"):
            gc_percent = GC(record.seq)
            gc_list.append(gc_percent)
            # The bin index must be an integer; the raw value stays in gc_list.
            bin_index = int(round(gc_percent))
            gc_bins[bin_index] = gc_bins[bin_index] + 1
            lengths.append(len(record.seq))
            counts = dinuc_count(record.seq)
            dinuc = [total + count for total, count in zip(dinuc, counts)]
    return gc_list, gc_bins, lengths, dinuc