Exemple #1
0
def GC_info(seq, win_len, step):
    """
    Calculate %GC statistics for a sequence and for its sliding windows.

    The overall %GC of ``seq`` is obtained from ``GC``. The same function is
    then applied to every window of ``win_len`` characters, moving ``step``
    positions between consecutive windows, up to and including the last full
    window of the sequence.

    seq -- sequence to analyse (must support ``len`` and slicing).
    win_len -- length of the sliding window.
    step -- offset between the starts of consecutive windows (>= 1).

    Return the tuple ``(gc, min_gc, max_gc, sd, cv)`` where ``gc`` is the %GC
    of the whole sequence, ``min_gc`` / ``max_gc`` / ``sd`` are the minimum,
    maximum and standard deviation of the per-window %GC, and ``cv`` is
    ``100 * sd / (gc + 1)``.
    When ``win_len >= len(seq)`` no sliding is possible and
    ``(gc, gc, gc, 0, 0)`` is returned.

    Raise ValueError if windows have to be computed and ``step`` is < 1.

    """

    gc = GC(seq)
    if win_len >= len(seq):
        # The sequence fits in a single window: there is no spread to report.
        return gc, gc, gc, 0, 0
    if step < 1:
        raise ValueError(
            "step must be a positive integer, got {!r}".format(step))
    # The "+ 1" keeps the last full window (starting at len(seq) - win_len),
    # which a plain range(0, len(seq) - win_len) would silently drop.
    window_gc = [GC(seq[start:start + win_len])
                 for start in range(0, len(seq) - win_len + 1, step)]
    sd = numpy.std(window_gc)
    # Applying +1 to GC to make sure we do not divide by 0
    return gc, min(window_gc), max(window_gc), sd, 100. * sd / (gc + 1.)
Exemple #2
0
def bg_len_GC_bins(bg, bg_dir):
    """
    Bin the background sequences by %GC and by length.

    Every record of the FASTA file ``bg`` is placed in one of 101 %GC bins
    (indices 0 to 100). Each bin is a dictionary mapping a sequence length to
    the list of records having that length and that %GC.
    Once the file has been read, the bins are handed to ``print_in_bg_dir``
    together with ``bg_dir``.

    Return the tuple ``(gc_list, gc_bins, lengths)``: the %GC of each record
    in file order, the bins described above, and the length of each record
    sequence in file order.

    """

    gc_bins = [{} for _ in range(101)]
    gc_list = []
    lengths = []
    # The context manager closes the file even if parsing raises.
    with open(bg) as stream:
        for record in SeqIO.parse(stream, "fasta"):
            # NOTE(review): assumes GC() returns an integer in 0..100 usable
            # as a list index -- confirm against its definition.
            gc = GC(record.seq)
            gc_list.append(gc)
            gc_bins[gc].setdefault(len(record), []).append(record)
            lengths.append(len(record.seq))
    print_in_bg_dir(gc_bins, bg_dir, True)
    return gc_list, gc_bins, lengths
def fg_len_GC_bins(fg_file):
    """
    Count the foreground sequences per %GC bin and per length.

    Every record of the FASTA file ``fg_file`` is assigned to one of 101 %GC
    bins (indices 0 to 100). Each bin is a dictionary mapping a sequence
    length to the number of records having that length and that %GC.

    Return the tuple ``(gc_list, gc_bins, lengths)``: the %GC of each record
    in file order, the bins described above, and the length of each record
    in file order.

    """
    gc_bins = [{} for _ in range(101)]
    gc_list = []
    lengths = []
    # The context manager closes the file even if parsing raises.
    with open(fg_file) as stream:
        for record in SeqIO.parse(stream, "fasta"):
            # NOTE(review): assumes GC() returns an integer in 0..100 usable
            # as a list index -- confirm against its definition.
            gc = GC(record.seq)
            gc_list.append(gc)
            length = len(record)
            lengths.append(length)
            gc_bins[gc][length] = gc_bins[gc].get(length, 0) + 1
    return gc_list, gc_bins, lengths
def bg_len_GC_bins(bg_file, bg_dir):
    """
    Bin the background sequences by %GC and by length.

    Every record of the FASTA file ``bg_file`` is placed in one of 101 %GC
    bins (indices 0 to 100). Each bin is a dictionary mapping a sequence
    length to the list of records having that length and that %GC.
    Once the file has been read, the bins are handed to ``print_in_bg_dir``
    together with ``bg_dir``.

    Return lists of GC contents, GC bins, and lengths distrib.

    """
    gc_bins = [{} for _ in range(101)]
    gc_list = []
    lengths = []
    # The context manager closes the file even if parsing raises.
    with open(bg_file) as stream:
        for record in SeqIO.parse(stream, "fasta"):
            # NOTE(review): assumes GC() returns an integer in 0..100 usable
            # as a list index -- confirm against its definition.
            gc = GC(record.seq)
            gc_list.append(gc)
            gc_bins[gc].setdefault(len(record), []).append(record)
            lengths.append(len(record.seq))
    print_in_bg_dir(gc_bins, bg_dir, True)
    return gc_list, gc_bins, lengths
def fg_GC_bins(fg_file):
    """
    Count the foreground sequences falling in each %GC bin.

    Every record of the FASTA file ``fg_file`` increments one of 101 counters
    (indices 0 to 100) selected by its %GC.

    Return the tuple ``(gc_list, gc_bins, lengths)``: the %GC of each record
    in file order, the per-bin counts, and the length of each record sequence
    in file order.

    """
    gc_bins = [0] * 101
    gc_list = []
    lengths = []
    # The context manager closes the file even if parsing raises.
    with open(fg_file) as stream:
        for record in SeqIO.parse(stream, "fasta"):
            # NOTE(review): assumes GC() returns an integer in 0..100 usable
            # as a list index -- confirm against its definition.
            gc = GC(record.seq)
            gc_list.append(gc)
            gc_bins[gc] += 1
            lengths.append(len(record.seq))
    return gc_list, gc_bins, lengths
def generate_sequences(seqs, winlen, step, nfold):
    """
    Produce ``nfold`` window-shuffled copies of every input record.

    Each record sequence is passed to ``shuffle_window`` with ``winlen`` and
    ``step``; the result is printed to standard output in FASTA format under
    the id ``background_seq_<record name>``.

    Return the tuple ``(gc_values, seq_lengths)`` holding the %GC and the
    length of every generated sequence, in generation order.
    """
    gc_values = []
    seq_lengths = []
    for rec in seqs:
        original = str(rec.seq)
        for _ in range(nfold):
            shuffled = shuffle_window(original, winlen, step)
            fasta_record = SeqRecord(
                Seq(shuffled, generic_dna),
                id="background_seq_{}".format(rec.name),
                description="",
            )
            # FASTA text already ends with a newline, hence end="".
            print(fasta_record.format("fasta"), end="")
            gc_values.append(GC(shuffled))
            seq_lengths.append(len(shuffled))
    return gc_values, seq_lengths
def bg_GC_bins(bg_file):
    """
    Bin the background sequences by %GC.

    Every record of the FASTA file ``bg_file`` is appended to one of 101
    lists (indices 0 to 100) selected by its %GC.

    Return the tuple ``(gc_list, gc_bins, lengths)``: the %GC of each record
    in file order, the per-bin lists of records, and the length of each
    record sequence in file order.

    """
    gc_bins = [[] for _ in range(101)]
    gc_list = []
    lengths = []
    # The context manager closes the file even if parsing raises.
    with open(bg_file) as stream:
        for record in SeqIO.parse(stream, "fasta"):
            # NOTE(review): assumes GC() returns an integer in 0..100 usable
            # as a list index -- confirm against its definition.
            gc = GC(record.seq)
            gc_list.append(gc)
            gc_bins[gc].append(record)
            lengths.append(len(record.seq))
    return gc_list, gc_bins, lengths
Exemple #8
0
def generate_sequences(seqs, winlen, step, nfold):
    """
    Produce ``nfold`` window-shuffled copies of every input record.

    The record sequence is cut into pieces by ``split_seq``. A piece that
    starts with "N" is copied unchanged, an empty piece is dropped, and any
    other piece is replaced by ``shuffle_window(piece, winlen, step)``.
    The rebuilt sequence is printed to standard output in FASTA format under
    the id ``background_seq_for_<record name>``.

    Return the tuple ``(gc_values, seq_lengths)`` holding the %GC and the
    length of every generated sequence, in generation order.
    """
    gc_values = []
    seq_lengths = []
    for rec in seqs:
        source = str(rec.seq)
        for _ in range(nfold):
            rebuilt = ""
            for piece in split_seq(source):
                if not re.match("N", piece):
                    if not piece:
                        continue
                    piece = shuffle_window(piece, winlen, step)
                rebuilt += piece
            fasta_record = SeqRecord(
                Seq(rebuilt, generic_dna),
                id="background_seq_for_{}".format(rec.name),
                description="",
            )
            # FASTA text already ends with a newline, hence end="".
            print(fasta_record.format("fasta"), end="")
            gc_values.append(GC(rebuilt))
            seq_lengths.append(len(rebuilt))
    return gc_values, seq_lengths
def generate_sequences(seqs, nfold):
    """
    Produce ``nfold`` randomly shuffled copies of every input record.

    Each copy is a random permutation of the characters of the record
    sequence, printed to standard output in FASTA format under the id
    ``background_seq_for_<record name>``.

    Return the tuple ``(gc_values, seq_lengths)`` holding the %GC and the
    length of every generated sequence, in generation order.

    """
    gc_values = []
    seq_lengths = []
    for rec in seqs:
        letters = str(rec.seq)
        for _ in range(nfold):
            # Sampling every position without replacement is a permutation.
            permuted = random.sample(letters, len(letters))
            shuffled = "".join(permuted)
            fasta_record = SeqRecord(
                Seq(shuffled, generic_dna),
                id="background_seq_for_{}".format(rec.name),
                description="",
            )
            # FASTA text already ends with a newline, hence end="".
            print(fasta_record.format("fasta"), end="")
            gc_values.append(GC(shuffled))
            seq_lengths.append(len(shuffled))
    return gc_values, seq_lengths
def bg_GC_bins(bg_file, bg_dir):
    """
    Bin the background sequences by %GC.

    Every record of the FASTA file ``bg_file`` is appended to one of 101
    lists (indices 0 to 100) selected by its %GC.
    Once the file has been read, the bins are handed to ``print_in_bg_dir``
    together with ``bg_dir``.

    Return the tuple ``(gc_list, gc_bins, lengths)``: the %GC of each record
    in file order, the per-bin lists of records, and the length of each
    record sequence in file order.

    """
    gc_bins = [[] for _ in range(101)]
    gc_list = []
    lengths = []
    # The context manager closes the file even if parsing raises.
    with open(bg_file) as stream:
        for record in SeqIO.parse(stream, "fasta"):
            # NOTE(review): assumes GC() returns an integer in 0..100 usable
            # as a list index -- confirm against its definition.
            gc = GC(record.seq)
            gc_list.append(gc)
            gc_bins[gc].append(record)
            lengths.append(len(record.seq))
    print_in_bg_dir(gc_bins, bg_dir)
    return gc_list, gc_bins, lengths
    wins = windows.slidingWindow(sequence,
                                 size=windowSize,
                                 step=overlap,
                                 fillvalue="-")
    # let's count the windows
    i = 0
    for window in wins:
        if i % 100 == 0:
            print("[STATUS] \t" + str(i) + " windows processed for " +
                  str(header) + ".",
                  end="\r")
        # get the sequence
        seq = ''.join(window)
        # calculate GC stats
        currentWindow = str(bin_int) + '-' + str(bin_int + windowSize)
        PerGC, GCSkew, nucleotideCounts = GC.GCStats(seq)
        # calculate kmer stats
        UniqueKmers = kmers.getUniqueKmers(seq, kmerLength)
        formattedHeader = header.replace(">.", "")
        GCperWindowCSV.write(formattedHeader + ',' + currentWindow + ',' +
                             str(PerGC) + ',' + str(GCSkew) + ',' +
                             str(UniqueKmers) + '\n')

        # keep running total of nucleotide counts
        totalNucleotideCounts = dictionaries.mergeDictionaries(
            totalNucleotideCounts, nucleotideCounts)

        if bin_int < seq_length - overlap:
            bin_int += overlap
        else:
            bin_int = 0