Ejemplo n.º 1
0
def generate_sequences(fg_bins, bg_bins, bg_dir, nfold, random_seed):
    """
    Choose randomly the background sequences in each bin of %GC.

    For every %GC bin (0 to 100) that is non-empty in the foreground,
    ``fg_bins[percent] * nfold`` records are drawn without replacement from
    the matching background bin and printed in FASTA format on stdout. When
    the background bin cannot provide that many records, a warning is
    written on stderr and the whole bin is used instead.

    Arguments:
        fg_bins: indexable by %GC (0..100); each entry is the number of
            foreground sequences falling in that bin.
        bg_bins: indexable by %GC; each entry is the list of background
            records of that bin. When falsy, each bin is obtained from
            ``get_bins_from_bg_dir(bg_dir, percent)`` instead.
        bg_dir: directory handed to ``get_bins_from_bg_dir``.
        nfold: multiplier applied to the foreground counts.
        random_seed: value passed to ``random.seed``.

    Return the list of %GC (one entry per selected sequence), the list of
    the selected sequence lengths and the summed dinucleotide counts.

    """
    random.seed(random_seed)
    lengths = []
    gc_list = []
    dinuc = [0] * len(IUPAC) * len(IUPAC)
    for percent in range(0, 101):
        if not fg_bins[percent]:
            continue
        nb = fg_bins[percent] * nfold
        if bg_bins:
            bin_seq = bg_bins[percent]
        else:
            bin_seq = get_bins_from_bg_dir(bg_dir, percent)
        # Only the sampling is guarded: a ValueError raised while loading the
        # bin must not be reported as a too-small population, nor leave
        # bin_seq unbound or pointing at the previous bin.
        try:
            sample = random.sample(bin_seq, nb)
        except ValueError:
            sys.stderr.write("""*** WARNING ***
                    Sample larger than population for {0:d}% G+C content:
                    {1:d} needed and {2:d} obtained\n""".format(
                percent, nb, len(bin_seq)))
            # Fall back to every sequence available in the bin.
            sample = bin_seq
        # len(sample) is nb on success and len(bin_seq) on fallback.
        gc_list.extend([percent] * len(sample))
        for r in sample:
            print(r.format("fasta"), end='')
            lengths.append(len(r.seq))
            dinuc = [x + y for x, y in zip(dinuc, dinuc_count(r.seq))]
    return gc_list, lengths, dinuc
Ejemplo n.º 2
0
def fg_len_GC_bins(fg_file):
    """
    Compute G+C content for all sequences in the foreground.

    Compute the %GC content of every FASTA record read from ``fg_file``. For
    each %GC bin (0 to 100) a dictionary maps a sequence length to the number
    of foreground sequences of that length falling in the bin.

    Return the list of GC contents (as returned by ``GC``), the list of the
    101 per-bin length dictionaries, the list of sequence lengths and the
    summed dinucleotide counts.

    """
    with open_for_parsing(fg_file) as stream:
        gc_bins = [{} for _ in range(0, 101)]
        gc_list = []
        lengths = []
        dinuc = [0] * len(IUPAC) * len(IUPAC)
        for record in SeqIO.parse(stream, "fasta"):
            gc = GC(record.seq)
            gc_list.append(gc)
            # python 3 fix: a list can only be indexed with an integer, so
            # the %GC value is rounded to the nearest bin before being used.
            gc_bin = int(round(gc))
            length = len(record)
            lengths.append(length)
            dinuc = [x + y for x, y in zip(dinuc, dinuc_count(record.seq))]
            length_counts = gc_bins[gc_bin]
            length_counts[length] = length_counts.get(length, 0) + 1
    return gc_list, gc_bins, lengths, dinuc
Ejemplo n.º 3
0
def bg_GC_bins(bg_file, bg_dir):
    """
    Bin the background sequences by G+C content.

    Every FASTA record of ``bg_file`` is appended to the bin (0 to 100)
    matching its rounded %GC. The bins are then handed to
    ``print_in_bg_dir`` together with ``bg_dir``.

    Return the list of GC contents (unrounded), the list of the 101 bins of
    records, the list of sequence lengths and the summed dinucleotide counts.

    """
    with open_for_parsing(bg_file) as handle:
        dinuc_totals = [0] * len(IUPAC) * len(IUPAC)
        seq_lengths = []
        gc_values = []
        bins = [[] for _ in range(0, 101)]
        for rec in SeqIO.parse(handle, "fasta"):
            gc_value = GC(rec.seq)
            gc_values.append(gc_value)
            # python3 fix: the bin index has to be an integer
            bins[int(round(gc_value))].append(rec)
            seq_lengths.append(len(rec.seq))
            counts = dinuc_count(rec.seq)
            dinuc_totals = [total + count
                            for total, count in zip(dinuc_totals, counts)]
    print_in_bg_dir(bins, bg_dir)
    return gc_values, bins, seq_lengths, dinuc_totals
Ejemplo n.º 4
0
def bg_len_GC_bins(bg_file, bg_dir):
    """
    Compute G+C content for all sequences in the background.

    Compute the %GC content of every FASTA record read from ``bg_file``. For
    each %GC bin (0 to 100) a dictionary maps a sequence length to the list
    of background records of that length falling in the bin. The bins are
    then handed to ``print_in_bg_dir(gc_bins, bg_dir, True)``.

    Return the list of GC contents (as returned by ``GC``), the list of the
    101 per-bin dictionaries, the list of sequence lengths and the summed
    dinucleotide counts.

    """
    with open_for_parsing(bg_file) as stream:
        gc_bins = [{} for _ in range(0, 101)]
        gc_list = []
        lengths = []
        dinuc = [0] * len(IUPAC) * len(IUPAC)
        for record in SeqIO.parse(stream, "fasta"):
            gc = GC(record.seq)
            gc_list.append(gc)
            # python 3 fix: a list can only be indexed with an integer, so
            # the %GC value is rounded to the nearest bin before being used.
            gc_bin = int(round(gc))
            gc_bins[gc_bin].setdefault(len(record), []).append(record)
            lengths.append(len(record.seq))
            dinuc = [x + y for x, y in zip(dinuc, dinuc_count(record.seq))]
    # NOTE(review): the third argument presumably selects the length-aware
    # layout of the bins -- confirm against print_in_bg_dir.
    print_in_bg_dir(gc_bins, bg_dir, True)
    return gc_list, gc_bins, lengths, dinuc
Ejemplo n.º 5
0
def fg_GC_bins(fg, winlen, step):
    """
    Collect %GC information for the foreground sequences.

    For every FASTA record of ``fg``, ``GC_info(record.seq, winlen, step)``
    provides the %GC together with four further values (unpacked here as
    min, max, sd and cv). Those four values are stored as a tuple in the bin
    (0 to 100) matching the rounded %GC.

    Return the list of GC contents (unrounded), the result of
    ``avg_and_sd_gc_info`` applied to the bins, the list of sequence lengths
    and the summed dinucleotide counts.

    """

    with open_for_parsing(fg) as handle:
        dinuc_totals = [0] * len(IUPAC) * len(IUPAC)
        seq_lengths = []
        gc_values = []
        window_bins = [[] for _ in range(0, 101)]
        for rec in SeqIO.parse(handle, "fasta"):
            gc_value, gc_min, gc_max, gc_sd, gc_cv = GC_info(rec.seq, winlen,
                                                             step)
            gc_values.append(gc_value)
            # python 3 fix: the bin index has to be an integer
            bin_index = int(round(gc_value))
            window_bins[bin_index].append((gc_min, gc_max, gc_sd, gc_cv))
            seq_lengths.append(len(rec.seq))
            counts = dinuc_count(rec.seq)
            dinuc_totals = [total + count
                            for total, count in zip(dinuc_totals, counts)]
    return (gc_values, avg_and_sd_gc_info(window_bins), seq_lengths,
            dinuc_totals)
Ejemplo n.º 6
0
def bg_len_GC_bins(bg, bg_dir):
    """
    Get %GC and lengths info for background sequences.

    Compute the %GC content of every FASTA record read from ``bg``. For each
    %GC bin (0 to 100) a dictionary maps a sequence length to the list of
    background records of that length falling in the bin. The bins are then
    handed to ``print_in_bg_dir(gc_bins, bg_dir, True)``.

    Return the list of GC contents (as returned by ``GC``), the list of the
    101 per-bin dictionaries, the list of sequence lengths and the summed
    dinucleotide counts.

    """

    with open_for_parsing(bg) as stream:
        gc_bins = [{} for _ in range(0, 101)]
        gc_list = []
        lengths = []
        dinuc = [0] * len(IUPAC) * len(IUPAC)
        for record in SeqIO.parse(stream, "fasta"):
            gc = GC(record.seq)
            gc_list.append(gc)
            # python 3 fix: a list can only be indexed with an integer, so
            # the %GC value is rounded to the nearest bin before being used.
            gc_bin = int(round(gc))
            gc_bins[gc_bin].setdefault(len(record), []).append(record)
            lengths.append(len(record.seq))
            dinuc = [x + y for x, y in zip(dinuc, dinuc_count(record.seq))]
    # NOTE(review): the third argument presumably selects the length-aware
    # layout of the bins -- confirm against print_in_bg_dir.
    print_in_bg_dir(gc_bins, bg_dir, True)
    return gc_list, gc_bins, lengths, dinuc
Ejemplo n.º 7
0
def fg_len_GC_bins(fg, winlen, step):
    """
    Get needed %GC and lengths info for foreground sequences.

    For every FASTA record of ``fg``, ``GC_info(record.seq, winlen, step)``
    provides the %GC together with four further values (unpacked here as
    min, max, sd and cv). For each %GC bin (0 to 100) two things are kept:
    the list of those 4-tuples, and a dictionary mapping a sequence length
    to the number of foreground sequences of that length.

    Return the list of GC contents (as returned by ``GC_info``), the result
    of ``avg_and_sd_len_gc_info`` applied to the length dictionaries and the
    per-bin tuples, the list of sequence lengths and the summed dinucleotide
    counts.

    """

    with open_for_parsing(fg) as stream:
        tmp_gc_bins = [[] for _ in range(0, 101)]
        l_dic = [{} for _ in range(0, 101)]
        gc_list = []
        lengths = []
        dinuc = [0] * len(IUPAC) * len(IUPAC)
        for record in SeqIO.parse(stream, "fasta"):
            gc, min_gc, max_gc, sd_gc, cv_gc = GC_info(record.seq, winlen,
                                                       step)
            gc_list.append(gc)
            # python 3 fix: a list can only be indexed with an integer, so
            # the %GC value is rounded to the nearest bin before being used.
            gc_bin = int(round(gc))
            tmp_gc_bins[gc_bin].append((min_gc, max_gc, sd_gc, cv_gc))
            length = len(record)
            l_dic[gc_bin][length] = l_dic[gc_bin].get(length, 0) + 1
            lengths.append(length)
            dinuc = [x + y for x, y in zip(dinuc, dinuc_count(record.seq))]
    return gc_list, avg_and_sd_len_gc_info(l_dic, tmp_gc_bins), lengths, dinuc
Ejemplo n.º 8
0
def generate_len_sequences(fg_bins, bg_bins, bg_dir, deviation, winlen, step,
                           nfold):
    """
    Pick background sequences matching the foreground, bin by bin.

    For every %GC bin (0 to 100) whose foreground entry is non-empty, and for
    each foreground sequence size recorded in that bin, ``extract_seq_rec``
    is asked for ``count * nfold`` background sequences. The selected
    records are printed in FASTA format on stdout; a warning goes to stderr
    when a bin does not yield the expected number of sequences.

    The background bin comes from ``bg_bins`` when it is truthy, otherwise
    from ``get_bins_len_from_bg_dir(bg_dir, percent)``. The interpreter
    recursion limit is raised to 10000 as a side effect.

    Return the list of %GC (one entry per selected sequence), the list of
    their lengths and the summed dinucleotide counts.

    """

    sys.setrecursionlimit(10000)
    dinuc = [0] * len(IUPAC) * len(IUPAC)
    lengths = []
    gc_list = []
    for percent in range(0, 101):
        fg_info = fg_bins[percent]
        if not fg_info[0]:
            continue
        size_counts = fg_info[0][0]
        expected = sum(size_counts.values()) * nfold
        pool = (bg_bins[percent] if bg_bins
                else get_bins_len_from_bg_dir(bg_dir, percent))
        selected = []
        # The list of remaining sizes is threaded through successive calls.
        remaining_sizes = sorted(pool.keys())
        for size, count in size_counts.items():
            found, _, remaining_sizes = extract_seq_rec(
                size, count * nfold, remaining_sizes, pool, [], 0, fg_info,
                deviation, winlen, step)
            selected.extend(found)
        obtained = len(selected)
        if obtained != expected:
            sys.stderr.write("""\n*** WARNING ***
                Sample larger than population for {0:d}% G+C content:
                {1:d} needed and {2:d} obtained\n""".format(
                percent, expected, obtained))
        gc_list.extend([percent] * obtained)
        for rec in selected:
            print("{0:s}".format(rec.format("fasta")), end='')
            lengths.append(len(rec))
            counts = dinuc_count(rec.seq)
            dinuc = [total + count for total, count in zip(dinuc, counts)]
    return gc_list, lengths, dinuc
def generate_sequences(seqs, kmer, winlen, step, nfold, random_seed):
    """
    Generate ``nfold`` shuffled background sequences per input record.

    After seeding through ``set_seed(random_seed)``, each record of ``seqs``
    is shuffled ``nfold`` times with ``shuffle_window(seq, kmer, winlen,
    step)``. Every shuffled sequence is wrapped in a ``SeqRecord`` whose id
    is ``background_seq_<n>`` (n counting from 1 over all generated
    sequences) and printed in FASTA format on stdout.

    Return the list of GC contents of the generated sequences, the list of
    their lengths and the summed dinucleotide counts.

    """
    set_seed(random_seed)
    counter = 1
    gc_values = []
    seq_lengths = []
    dinuc_totals = [0] * len(IUPAC_DINUC)
    for record in seqs:
        original = str(record.seq)
        # The description names the source record only; it is shared by all
        # the sequences shuffled from that record.
        description = "Background sequence for {0:s}".format(record.name)
        for _ in range(nfold):
            shuffled = shuffle_window(original, kmer, winlen, step)
            background = SeqRecord(
                Seq(shuffled, IUPACData.ambiguous_dna_letters),
                id="background_seq_{0:d}".format(counter),
                description=description)
            print(background.format("fasta"), end='')
            gc_values.append(GC(shuffled))
            seq_lengths.append(len(shuffled))
            counts = dinuc_count(shuffled)
            dinuc_totals = [total + count
                            for total, count in zip(dinuc_totals, counts)]
            counter += 1
    return gc_values, seq_lengths, dinuc_totals
Ejemplo n.º 10
0
def generate_len_sequences(fg, bg, bg_dir, nfold):
    """
    Extract background sequences with sizes similar to the foreground ones.

    For every %GC bin (0 to 100) that is non-empty in ``fg``, and for each
    sequence size recorded in that bin, ``extract_seq_rec`` is asked for
    ``count * nfold`` background sequences. The background bin is taken from
    ``bg`` when it is truthy, otherwise it is obtained from
    ``get_bins_len_from_bg_dir(bg_dir, percent)``. Selected records are
    printed in FASTA format on stdout; a warning goes to stderr when a bin
    does not yield the expected number of sequences.

    The interpreter recursion limit is raised to 10000 as a side effect.

    Return the %GC list (one entry per selected sequence), the list of their
    lengths and the summed dinucleotide counts.

    """

    sys.setrecursionlimit(10000)
    dinuc = [0] * len(IUPAC) * len(IUPAC)
    gc_list = []
    lengths = []
    for percent in range(0, 101):
        size_counts = fg[percent]
        if not size_counts:
            continue
        expected = sum(size_counts.values()) * nfold
        selected = []
        for size, count in size_counts.items():
            wanted = count * nfold
            # The background bin and its sorted sizes are looked up for each
            # requested size.
            pool = (bg[percent] if bg
                    else get_bins_len_from_bg_dir(bg_dir, percent))
            sorted_sizes = sorted(pool.keys())
            found, _ = extract_seq_rec(size, wanted, sorted_sizes, pool, [],
                                       0)
            selected.extend(found)
        obtained = len(selected)
        gc_list.extend([percent] * obtained)
        if obtained != expected:
            sys.stderr.write("""*** WARNING ***
                    Sample larger than population for {0:d}% G+C content:
                    {1:d} needed and {2:d} obtained\n""".format(
                percent, expected, obtained))
        for rec in selected:
            lengths.append(len(rec))
            counts = dinuc_count(rec.seq)
            dinuc = [total + count for total, count in zip(dinuc, counts)]
            print("{0:s}".format(rec.format("fasta")), end='')
    return gc_list, lengths, dinuc
Ejemplo n.º 11
0
def generate_sequences(fg_bins, bg_bins, bg_dir, deviation, winlen, step,
                       nfold, random_seed):
    """
    Pick random background sequences in each bin of GC%.

    For every %GC bin (0 to 100) whose foreground entry is non-empty,
    ``extract_random_sample`` is asked for ``fg_bins[percent][0][0] * nfold``
    sequences from the background bin. That bin comes from ``bg_bins`` when
    it is truthy, otherwise from ``get_bins_from_bg_dir(bg_dir, percent)``.
    Selected records are printed in FASTA format on stdout; when the helper
    reports sequences left to find, a warning is written on stderr.

    Return the list of %GC (one entry per sequence counted as obtained), the
    list of the selected sequence lengths and the summed dinucleotide counts.

    """

    random.seed(random_seed)
    dinuc = [0] * len(IUPAC) * len(IUPAC)
    lengths = []
    gc_list = []
    for percent in range(0, 101):
        fg_info = fg_bins[percent]
        if not fg_info[0]:
            continue
        wanted = fg_info[0][0] * nfold
        pool = (bg_bins[percent] if bg_bins
                else get_bins_from_bg_dir(bg_dir, percent))
        missing, sample = extract_random_sample(pool, fg_info, wanted,
                                                deviation, winlen, step)
        if missing:
            sys.stderr.write("""\n*** WARNING ***
                Sample larger than population for {0:d}% G+C content:
                {1:d} needed and {2:d} obtained\n""".format(
                percent, wanted, wanted - missing))
            obtained = wanted - missing
        else:
            obtained = wanted
        gc_list.extend([percent] * obtained)
        for rec in sample:
            print(rec.format("fasta"), end='')
            lengths.append(len(rec.seq))
            counts = dinuc_count(rec.seq)
            dinuc = [total + count for total, count in zip(dinuc, counts)]
    return gc_list, lengths, dinuc
Ejemplo n.º 12
0
def fg_GC_bins(fg_file):
    """
    Count the foreground sequences falling in each %GC bin.

    The %GC content of every FASTA record of ``fg_file`` is computed with
    ``GC``; the counter of the bin (0 to 100) matching its rounded value is
    incremented.

    Return the list of GC contents (unrounded), the list of the 101 bin
    counts, the list of sequence lengths and the summed dinucleotide counts.

    """
    with open_for_parsing(fg_file) as handle:
        dinuc_totals = [0] * len(IUPAC) * len(IUPAC)
        seq_lengths = []
        gc_values = []
        bin_counts = [0 for _ in range(101)]
        for rec in SeqIO.parse(handle, "fasta"):
            gc_value = GC(rec.seq)
            gc_values.append(gc_value)
            # python 3 fix: the bin index has to be an integer
            bin_counts[int(round(gc_value))] += 1
            seq_lengths.append(len(rec.seq))
            counts = dinuc_count(rec.seq)
            dinuc_totals = [total + count
                            for total, count in zip(dinuc_totals, counts)]
    return gc_values, bin_counts, seq_lengths, dinuc_totals