Example #1
0
def countReadsPerGC_worker(chromNameBam,
                           start, end, stepSize, regionSize,
                           chrNameBamToBit, verbose=False):
    """Count reads and compute the GC content of sampled genome regions.

    The positions to evaluate are obtained from ``getPositionsToSample``
    for the interval (start, end). For every sampled position ``i`` the
    region ``[i, i + regionSize)`` is evaluated: its GC content is taken
    from the 2bit file and the reads are counted in the BAM file.

    :param chromNameBam: chromosome name as used in the BAM file
    :param start: start of the genome region to sample
    :param end: end of the genome region to sample
    :param stepSize: passed to ``getPositionsToSample``
    :param regionSize: length of each evaluated region
    :param chrNameBamToBit: mapping from BAM chromosome names to the
                            chromosome names used in the 2bit file
    :param verbose: if True, print the region and the error whenever the
                    GC content of a region can not be computed
    :return: list of ``(numberReads, gc)`` tuples, one per evaluated region
    """

    chromNameBit = chrNameBamToBit[chromNameBam]
    tbit = py2bit.open(global_vars['2bit'])
    bam = bamHandler.openBam(global_vars['bam'])
    sub_reads_per_gc = []
    positions_to_sample = getPositionsToSample(chromNameBit,
                                               start, end, stepSize)

    # the chromosome length is constant, so it is looked up only once
    chrom_length = tbit.chroms(chromNameBit)

    for i in positions_to_sample:
        region_end = i + regionSize
        # stop if region extends over the chromosome end
        if chrom_length < region_end:
            break

        try:
            gc = getGC_content(tbit, chromNameBit, int(i), int(region_end))
        except Exception as detail:
            # best effort: regions without a GC value are skipped
            if verbose:
                print("{}:{}-{}".format(chromNameBit, i, region_end))
                print(detail)
            continue
        numberReads = bam.count(chromNameBam, i, region_end)
        sub_reads_per_gc.append((numberReads, gc))

    return sub_reads_per_gc
def countReadsPerGC_worker(chromNameBam,
                           start, end, stepSize, regionSize,
                           chrNameBamToBit, verbose=False):
    """Count reads and compute the GC content of sampled genome regions.

    The positions to evaluate are obtained from ``getPositionsToSample``
    for the interval (start, end). For every sampled position ``i`` the
    region ``[i, i + regionSize)`` is evaluated: its sequence is read from
    the 2bit file and the reads are counted in the BAM file.

    :param chromNameBam: chromosome name as used in the BAM file
    :param start: start of the genome region to sample
    :param end: end of the genome region to sample
    :param stepSize: passed to ``getPositionsToSample``
    :param regionSize: length of each evaluated region
    :param chrNameBamToBit: mapping from BAM chromosome names to the
                            chromosome names used in the 2bit file
    :param verbose: if True, print the region and the error whenever the
                    GC content of a region can not be computed
    :return: list of ``(numberReads, gc)`` tuples, one per evaluated region
    """

    chromNameBit = chrNameBamToBit[chromNameBam]
    # 2bit files are binary, so the file is opened in 'rb' mode. The
    # handle has to stay open while sequences are read, and the 'with'
    # block guarantees that it is closed afterwards.
    with open(global_vars['2bit'], 'rb') as twobit_handle:
        tbit = twobit.TwoBitFile(twobit_handle)
        bam = bamHandler.openBam(global_vars['bam'])
        sub_reads_per_gc = []
        positions_to_sample = getPositionsToSample(chromNameBit,
                                                   start, end, stepSize)

        for i in positions_to_sample:
            region_end = i + regionSize
            # stop if region extends over the chromosome end
            if tbit[chromNameBit].size < region_end:
                break

            try:
                gc = getGC_content(tbit[chromNameBit].get(i, region_end))
            except Exception as detail:
                # best effort: regions without a GC value are skipped
                if verbose:
                    # single-argument print() is valid in Python 2 and 3
                    print("{}:{}-{}".format(chromNameBit, i, region_end))
                    print(detail)
                continue
            numberReads = bam.count(chromNameBam, i, region_end)
            sub_reads_per_gc.append((numberReads, gc))

    return sub_reads_per_gc
Example #3
0
def getReadGCcontent(tbit, read, fragmentLength, chrNameBit):
    """
    Return the GC content of the fragment that `read` belongs to, scaled
    to `fragmentLength`.

    The fragments for forward and reverse reads are defined as follows::

           |- read.pos       |- read.aend
        ---+=================>-----------------------+---------    Forward strand

           |-fragStart                               |-fragEnd

        ---+-----------------------<=================+---------    Reverse strand
                                   |-read.pos        |-read.aend

           |-----------------------------------------|
                            read.tlen

    For a properly paired read whose absolute template length is smaller
    than ``2 * fragmentLength`` the fragment limits are derived from
    ``read.tlen``. In every other case the read is extended to
    ``fragmentLength`` bases: starting at ``read.pos`` for forward reads
    and ending at ``read.aend`` for reverse reads.

    :param tbit: genome sequence container, accessed as
                 ``tbit[chrNameBit][fragStart:fragEnd]``
    :param read: aligned read exposing ``pos``, ``aend``, ``tlen``,
                 ``qlen``, ``is_paired``, ``is_proper_pair`` and
                 ``is_reverse``
    :param fragmentLength: fragment length (a number)
    :param chrNameBit: chromosome name used to index `tbit`
    :return: ``int(np.round(gc * fragmentLength))``, or None when
             ``getGC_content`` fails or returns None
    """
    fragStart = None
    fragEnd = None

    if read.is_paired and read.is_proper_pair and abs(
            read.tlen) < 2 * fragmentLength:
        if read.is_reverse and read.tlen < 0:
            fragEnd = read.aend
            fragStart = read.aend + read.tlen
        elif read.tlen >= read.qlen:
            fragStart = read.pos
            fragEnd = read.pos + read.tlen

    # Compare with None explicitly: a fragment that starts at position 0
    # is valid and must not be replaced by the default extension.
    if fragStart is None:
        if read.is_reverse:
            fragEnd = read.aend
            fragStart = read.aend - fragmentLength
        else:
            fragStart = read.pos
            fragEnd = fragStart + fragmentLength
    # a reverse read close to the chromosome start can yield a negative start
    fragStart = max(0, fragStart)
    try:
        gc = getGC_content(tbit[chrNameBit][fragStart:fragEnd],
                           as_fraction=True)
    except Exception:
        return None
    if gc is None:
        return None

    # match the gc to the given fragmentLength
    gc = int(np.round(gc * fragmentLength))
    return gc
Example #4
0
def getReadGCcontent(tbit, read, fragmentLength, chrNameBit):
    """
    Return the GC content of the fragment that `read` belongs to, scaled
    to `fragmentLength`.

    The fragments for forward and reverse reads are defined as follows
    (``read.aend`` and ``read.tlen`` in the diagram correspond to
    ``read.reference_end`` and ``read.template_length`` in the code)::

           |- read.pos       |- read.aend
        ---+=================>-----------------------+---------    Forward strand

           |-fragStart                               |-fragEnd

        ---+-----------------------<=================+---------    Reverse strand
                                   |-read.pos        |-read.aend

           |-----------------------------------------|
                            read.tlen

    For a properly paired read whose absolute template length is smaller
    than ``2 * fragmentLength`` the fragment limits are derived from the
    template length. In every other case the read is extended to
    ``fragmentLength`` bases: starting at ``read.pos`` for forward reads
    and ending at ``read.reference_end`` for reverse reads.

    :param tbit: 2bit file handle, passed through to ``getGC_content``
    :param read: aligned read exposing ``pos``, ``reference_end``,
                 ``template_length``, ``query_alignment_length``,
                 ``is_paired``, ``is_proper_pair`` and ``is_reverse``
    :param fragmentLength: fragment length (a number)
    :param chrNameBit: chromosome name as used in the 2bit file
    :return: ``int(np.round(gc * fragmentLength))``, or None when
             ``getGC_content`` fails or returns None
    """
    fragStart = None
    fragEnd = None

    template_length = read.template_length
    if read.is_paired and read.is_proper_pair and abs(template_length) < 2 * fragmentLength:
        if read.is_reverse and template_length < 0:
            fragEnd = read.reference_end
            fragStart = read.reference_end + template_length
        elif template_length >= read.query_alignment_length:
            fragStart = read.pos
            fragEnd = read.pos + template_length

    # Compare with None explicitly: a fragment that starts at position 0
    # is valid and must not be replaced by the default extension.
    if fragStart is None:
        if read.is_reverse:
            fragEnd = read.reference_end
            fragStart = read.reference_end - fragmentLength
        else:
            fragStart = read.pos
            fragEnd = fragStart + fragmentLength
    # a reverse read close to the chromosome start can yield a negative start
    fragStart = max(0, fragStart)
    try:
        gc = getGC_content(tbit, chrNameBit, fragStart, fragEnd)
    except Exception:
        return None
    if gc is None:
        return None

    # match the gc to the given fragmentLength
    gc = int(np.round(gc * fragmentLength))
    return gc
Example #5
0
def getReadGCcontent(tbit, read, fragmentLength, chrNameBit):
    """
    Return the GC content of the fragment that `read` belongs to, scaled
    to `fragmentLength`.

    The fragments for forward and reverse reads are defined as follows::

           |- read.pos       |- read.aend
        ---+=================>-----------------------+---------    Forward strand

           |-fragStart                               |-fragEnd

        ---+-----------------------<=================+---------    Reverse strand
                                   |-read.pos        |-read.aend

           |-----------------------------------------|
                            read.tlen

    For a properly paired read whose absolute template length is smaller
    than ``2 * fragmentLength`` the fragment limits are derived from
    ``read.tlen``. In every other case the read is extended to
    ``fragmentLength`` bases: starting at ``read.pos`` for forward reads
    and ending at ``read.aend`` for reverse reads.

    :param tbit: genome sequence container, accessed as
                 ``tbit[chrNameBit].get(fragStart, fragEnd)``
    :param read: aligned read exposing ``pos``, ``aend``, ``tlen``,
                 ``qlen``, ``is_paired``, ``is_proper_pair`` and
                 ``is_reverse``
    :param fragmentLength: fragment length (a number)
    :param chrNameBit: chromosome name used to index `tbit`
    :return: ``int(np.round(gc * fragmentLength))``, or None when the GC
             content could not be determined
    """
    fragStart = None
    fragEnd = None

    if read.is_paired and read.is_proper_pair and abs(read.tlen) < 2 * fragmentLength:
        if read.is_reverse and read.tlen < 0:
            fragEnd = read.aend
            fragStart = read.aend + read.tlen
        elif read.tlen >= read.qlen:
            fragStart = read.pos
            fragEnd = read.pos + read.tlen

    # Compare with None explicitly: a fragment that starts at position 0
    # is valid and must not be replaced by the default extension.
    if fragStart is None:
        if read.is_reverse:
            fragEnd = read.aend
            fragStart = read.aend - fragmentLength
        else:
            fragStart = read.pos
            fragEnd = fragStart + fragmentLength
    # a reverse read close to the chromosome start can yield a negative
    # start; request the sequence from position 0 instead
    fragStart = max(0, fragStart)
    try:
        gc = getGC_content(tbit[chrNameBit].get(fragStart, fragEnd), as_fraction=True)
    except Exception:
        return None
    # guard against a missing GC value before doing arithmetic with it
    if gc is None:
        return None

    # match the gc to the given fragmentLength
    gc = int(np.round(gc * fragmentLength))
    return gc
Example #6
0
def tabulateGCcontent_worker(chromNameBam, start, end, stepSize,
                             fragmentLength,
                             chrNameBamToBit, verbose=False):
    r""" given genome regions, the GC content of the genome is tabulated for
    fragments of length 'fragmentLength' each 'stepSize' positions.

    :param chromNameBam: chromosome name as used in the BAM file
    :param start: start of the genome region to sample
    :param end: end of the genome region to sample
    :param stepSize: passed to ``getPositionsToSample``
    :param fragmentLength: dictionary; only the 'median' entry is used,
                           as the length of the evaluated fragments
    :param chrNameBamToBit: mapping from BAM chromosome names to the
                            chromosome names used in the 2bit file
    :param verbose: if True, progress and timing messages are printed
    :return: tuple (subN_gc, subF_gc) of integer arrays of length
             ``fragmentLength['median'] + 1`` indexed by GC content.
             subN_gc counts the evaluated genome positions and subF_gc
             the forward reads starting at those positions.
    :raises NameError: if start is larger than end

    >>> test = Tester()
    >>> args = test.testTabulateGCcontentWorker()
    >>> N_gc, F_gc = tabulateGCcontent_worker(*args)

    The forward read positions are:
    [1,  4,  10, 10, 16, 18]
    which correspond to a GC of
    [1,  1,  1,  1,  2,  1]

    The evaluated position are
    [0,  2,  4,  6,  8, 10, 12, 14, 16, 18]
    the corresponding GC is
    [2,  1,  1,  2,  2,  1,  2,  3,  2,  1]

    >>> print(N_gc)
    [0 4 5 1]
    >>> print(F_gc)
    [0 4 1 0]
    >>> test.set_filter_out_file()
    >>> chrNameBam2bit =  {'2L': 'chr2L'}

    Test for the filter out option
    >>> N_gc, F_gc = tabulateGCcontent_worker('2L', 0, 20, 2,
    ... {'median': 3}, chrNameBam2bit)
    >>> test.unset_filter_out_file()

    The evaluated positions are
    [ 0  2  8 10 12 14 16 18]
    >>> print(N_gc)
    [0 3 4 1]
    >>> print(F_gc)
    [0 3 1 0]

    Test for extra_sampling option
    >>> test.set_extra_sampling_file()
    >>> chrNameBam2bit =  {'2L': 'chr2L'}
    >>> res = tabulateGCcontent_worker('2L', 0, 20, 2,
    ... {'median': 3}, chrNameBam2bit)

    The new positions evaluated are
    [0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18]
    and the GC is
    [2, 1, 1, 0, 1, 2, 2, 1,  2,  3,  2,  1]
    >>> print(res[0])
    [1 5 5 1]
    >>> print(res[1])
    [0 5 1 0]

    """
    if start > end:
        raise NameError("start %d bigger that end %d" % (start, end))

    chromNameBit = chrNameBamToBit[chromNameBam]
    fragment_size = fragmentLength['median']

    # array to keep track of the GC from regions of length 'fragmentLength'
    # from the genome. The index of the array is used to
    # indicate the gc content. The values inside the
    # array are counts. Thus, if N_gc[10] = 3, that means
    # that 3 regions have a gc_content of 10.
    subN_gc = np.zeros(fragment_size + 1, dtype='int')
    subF_gc = np.zeros(fragment_size + 1, dtype='int')

    tbit = py2bit.open(global_vars['2bit'])
    bam = bamHandler.openBam(global_vars['bam'])
    # number of positions skipped because they reached 'max_reads';
    # tallied but not part of the returned values
    peak = 0
    startTime = time.time()

    if verbose:
        print("[{:.3f}] computing positions to "
              "sample".format(time.time() - startTime))

    positions_to_sample = getPositionsToSample(chromNameBit,
                                               start, end, stepSize)

    read_counts = []
    # Optimize IO.
    # if the sample regions are far apart from each
    # other is faster to go to each location and fetch
    # the reads found there.
    # Otherwise, if the regions to sample are close to
    # each other, is faster to load all the reads in
    # a large region into memory and consider only
    # those falling into the positions to sample.
    # The following code gets the reads
    # that are at sampling positions that lie close together.
    # At least two positions are needed to compute a mean distance.
    if len(positions_to_sample) > 1 and \
            np.mean(np.diff(positions_to_sample)) < 1000:
        start_pos = min(positions_to_sample)
        end_pos = max(positions_to_sample)
        if verbose:
            print("[{:.3f}] caching reads".format(time.time() - startTime))

        counts = np.bincount([r.pos - start_pos
                              for r in bam.fetch(chromNameBam, start_pos,
                                                 end_pos + 1)
                              if not r.is_reverse and r.pos >= start_pos],
                             minlength=end_pos - start_pos + 2)

        read_counts = counts[positions_to_sample - start_pos]
        if verbose:
            print("[{:.3f}] finish caching reads.".format(
                time.time() - startTime))

    countTime = time.time()
    # the chromosome length is constant, so it is looked up only once
    chrom_length = tbit.chroms(chromNameBit)

    # 'index' is reported after the loop, thus it needs a value even
    # when there are no positions to sample
    index = 0
    for index, i in enumerate(positions_to_sample):
        # stop if the end of the chromosome is reached
        if i + fragment_size > chrom_length:
            break

        try:
            gc = getGC_content(tbit, chromNameBit, int(i),
                               int(i + fragment_size), fraction=False)
        except Exception as detail:
            # best effort: positions without a GC value are skipped
            if verbose:
                print(detail)
            continue

        subN_gc[gc] += 1

        # count all reads at position 'i'
        # (len() is used because read_counts may be a numpy array)
        if len(read_counts) == 0:  # case when no cache was done
            num_reads = sum(1 for x in bam.fetch(chromNameBam, i, i + 1)
                            if not x.is_reverse and x.pos == i)
        else:
            num_reads = read_counts[index]

        if num_reads >= global_vars['max_reads']:
            peak += 1
            continue

        subF_gc[gc] += num_reads
        if verbose and index % 50000 == 0:
            endTime = time.time()
            elapsed = endTime - countTime
            print("%s processing %d (%.1f per sec) @ %s:%s-%s %s" %
                  (multiprocessing.current_process().name,
                   index, index / elapsed if elapsed > 0 else 0.0,
                   chromNameBit, start, end, stepSize))

    if verbose:
        endTime = time.time()
        elapsed = endTime - countTime
        print("%s processing %d (%.1f per sec) @ %s:%s-%s %s" %
              (multiprocessing.current_process().name,
               index, index / elapsed if elapsed > 0 else 0.0,
               chromNameBit, start, end, stepSize))
        print("%s total time %.1f @ %s:%s-%s %s" % (multiprocessing.current_process().name,
                                                    (endTime - startTime), chromNameBit, start, end, stepSize))

    return(subN_gc, subF_gc)
def tabulateGCcontent_worker(chromNameBam, start, end, stepSize,
                             fragmentLength,
                             chrNameBamToBit, verbose=False):
    r""" given genome regions, the GC content of the genome is tabulated for
    fragments of length 'fragmentLength' each 'stepSize' positions.

    :param chromNameBam: chromosome name as used in the BAM file
    :param start: start of the genome region to sample
    :param end: end of the genome region to sample
    :param stepSize: passed to ``getPositionsToSample``
    :param fragmentLength: dictionary; only the 'median' entry is used,
                           as the length of the evaluated fragments
    :param chrNameBamToBit: mapping from BAM chromosome names to the
                            chromosome names used in the 2bit file
    :param verbose: if True, progress and timing messages are printed
    :return: tuple (subN_gc, subF_gc) of integer arrays of length
             ``fragmentLength['median'] + 1`` indexed by GC content.
             subN_gc counts the evaluated genome positions and subF_gc
             the forward reads starting at those positions.
    :raises NameError: if start is larger than end

    >>> test = Tester()
    >>> args = test.testTabulateGCcontentWorker()
    >>> N_gc, F_gc = tabulateGCcontent_worker(*args)

    The forward read positions are:
    [1,  4,  10, 10, 16, 18]
    which correspond to a GC of
    [1,  1,  1,  1,  2,  1]

    The evaluated position are
    [0,  2,  4,  6,  8, 10, 12, 14, 16, 18]
    the corresponding GC is
    [2,  1,  1,  2,  2,  1,  2,  3,  2,  1]

    >>> print(N_gc)
    [0 4 5 1]
    >>> print(F_gc)
    [0 4 1 0]
    >>> test.set_filter_out_file()
    >>> chrNameBam2bit =  {'2L': 'chr2L'}

    Test for the filter out option
    >>> N_gc, F_gc = tabulateGCcontent_worker('2L', 0, 20, 2,
    ... {'median': 3}, chrNameBam2bit)
    >>> test.unset_filter_out_file()

    The evaluated positions are
    [ 0  2  8 10 12 14 16 18]
    >>> print(N_gc)
    [0 3 4 1]
    >>> print(F_gc)
    [0 3 1 0]

    Test for extra_sampling option
    >>> test.set_extra_sampling_file()
    >>> chrNameBam2bit =  {'2L': 'chr2L'}
    >>> res = tabulateGCcontent_worker('2L', 0, 20, 2,
    ... {'median': 3}, chrNameBam2bit)

    The new positions evaluated are
    [0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18]
    and the GC is
    [2, 1, 1, 0, 1, 2, 2, 1,  2,  3,  2,  1]
    >>> print(res[0])
    [1 5 5 1]
    >>> print(res[1])
    [0 5 1 0]

    """
    if start > end:
        raise NameError("start %d bigger that end %d" % (start, end))

    chromNameBit = chrNameBamToBit[chromNameBam]
    fragment_size = fragmentLength['median']

    # array to keep track of the GC from regions of length 'fragmentLength'
    # from the genome. The index of the array is used to
    # indicate the gc content. The values inside the
    # array are counts. Thus, if N_gc[10] = 3, that means
    # that 3 regions have a gc_content of 10.
    subN_gc = np.zeros(fragment_size + 1, dtype='int')
    subF_gc = np.zeros(fragment_size + 1, dtype='int')

    # 2bit files are binary, so the file is opened in 'rb' mode. The
    # handle has to stay open while sequences are read, and the 'with'
    # block guarantees that it is closed afterwards.
    with open(global_vars['2bit'], 'rb') as twobit_handle:
        tbit = twobit.TwoBitFile(twobit_handle)
        bam = bamHandler.openBam(global_vars['bam'])
        # number of positions skipped because they reached 'max_reads';
        # tallied but not part of the returned values
        peak = 0
        startTime = time.time()

        # single-argument print() calls are valid in Python 2 and 3
        if verbose:
            print("[{:.3f}] computing positions to "
                  "sample".format(time.time() - startTime))

        positions_to_sample = getPositionsToSample(chromNameBit,
                                                   start, end, stepSize)

        read_counts = []
        # Optimize IO.
        # if the sample regions are far apart from each
        # other is faster to go to each location and fetch
        # the reads found there.
        # Otherwise, if the regions to sample are close to
        # each other, is faster to load all the reads in
        # a large region into memory and consider only
        # those falling into the positions to sample.
        # The following code gets the reads
        # that are at sampling positions that lie close together.
        # At least two positions are needed to compute a mean distance.
        if len(positions_to_sample) > 1 and \
                np.mean(np.diff(positions_to_sample)) < 1000:
            start_pos = min(positions_to_sample)
            end_pos = max(positions_to_sample)
            if verbose:
                print("[{:.3f}] caching reads".format(
                    time.time() - startTime))

            counts = np.bincount([r.pos - start_pos
                                  for r in bam.fetch(chromNameBam, start_pos,
                                                     end_pos + 1)
                                  if not r.is_reverse and r.pos >= start_pos],
                                 minlength=end_pos - start_pos + 2)

            read_counts = counts[positions_to_sample - start_pos]
            if verbose:
                print("[{:.3f}] finish caching reads.".format(
                    time.time() - startTime))

        countTime = time.time()

        # 'index' is reported after the loop, thus it needs a value even
        # when there are no positions to sample
        index = 0
        for index, i in enumerate(positions_to_sample):
            # stop if the end of the chromosome is reached
            if i + fragment_size > tbit[chromNameBit].size:
                break

            try:
                gc = getGC_content(
                    tbit[chromNameBit].get(i, i + fragment_size),
                    as_fraction=False)
            except Exception as detail:
                # best effort: positions without a GC value are skipped
                if verbose:
                    print(detail)
                continue

            subN_gc[gc] += 1

            # count all reads at position 'i'
            # (len() is used because read_counts may be a numpy array)
            if len(read_counts) == 0:  # case when no cache was done
                num_reads = sum(1 for x in bam.fetch(chromNameBam, i, i + 1)
                                if not x.is_reverse and x.pos == i)
            else:
                num_reads = read_counts[index]

            if num_reads >= global_vars['max_reads']:
                peak += 1
                continue

            subF_gc[gc] += num_reads
            if verbose and index % 50000 == 0:
                endTime = time.time()
                elapsed = endTime - countTime
                print("%s processing %d (%.1f per sec) @ %s:%s-%s %s" %
                      (multiprocessing.current_process().name,
                       index, index / elapsed if elapsed > 0 else 0.0,
                       chromNameBit, start, end, stepSize))

    if verbose:
        endTime = time.time()
        elapsed = endTime - countTime
        print("%s processing %d (%.1f per sec) @ %s:%s-%s %s" %
              (multiprocessing.current_process().name,
               index, index / elapsed if elapsed > 0 else 0.0,
               chromNameBit, start, end, stepSize))
        print("%s total time %.1f @ %s:%s-%s %s" %
              (multiprocessing.current_process().name,
               (endTime - startTime), chromNameBit, start, end, stepSize))

    return(subN_gc, subF_gc)