Example #1
def get_scale_factor(args):

    scale_factor = args.scaleFactor
    bam_handle = bamHandler.openBam(args.bam)
    bam_mapped = parserCommon.bam_total_reads(bam_handle, args.ignoreForNormalization)
    blacklisted = parserCommon.bam_blacklisted_reads(bam_handle, args.ignoreForNormalization, args.blackListFileName)
    bam_mapped -= blacklisted

    if args.normalizeTo1x:
        # try to guess fragment length if the bam file contains paired end reads
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam,
                                                                    return_lengths=False,
                                                                    blackListFileName=args.blackListFileName,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose)
        if args.extendReads:
            if args.extendReads is True:
                # try to guess fragment length if the bam file contains paired end reads
                if frag_len_dict:
                    fragment_length = frag_len_dict['median']
                else:
                    exit("*ERROR*: library is not paired-end. Please provide an extension length.")
                if args.verbose:
                    print("Fragment length based on paired en data "
                          "estimated to be {}".format(frag_len_dict['median']))

            elif args.extendReads < 1:
                exit("*ERROR*: read extension must be bigger than one. Value give: {} ".format(args.extendReads))
            elif args.extendReads > 2000:
                exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(args.extendReads))
            else:
                fragment_length = args.extendReads

        else:
            # set as fragment length the read length
            fragment_length = int(read_len_dict['median'])
            if args.verbose:
                print "Estimated read length is {}".format(int(read_len_dict['median']))

        current_coverage = \
            float(bam_mapped * fragment_length) / args.normalizeTo1x
        # the scaling sets the coverage to match 1x
        scale_factor *= 1.0 / current_coverage
        if debug:
            print "Estimated current coverage {}".format(current_coverage)
            print "Scaling factor {}".format(args.scaleFactor)

    elif args.normalizeUsingRPKM:
        # RPKM = reads per tile / (total reads in millions * tile length in kb)
        million_reads_mapped = float(bam_mapped) / 1e6
        tile_len_in_kb = float(args.binSize) / 1000

        scale_factor *= 1.0 / (million_reads_mapped * tile_len_in_kb)

        if debug:
            print "scale factor using RPKM is {0}".format(args.scaleFactor)

    return scale_factor
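
For reference, the two normalizations above reduce to a few lines of arithmetic. A minimal sketch, assuming the same quantities as get_scale_factor (function and argument names here are illustrative, not part of deeptools):

# Hedged sketch of the scale-factor arithmetic shown above (illustrative names).
def one_x_scale_factor(bam_mapped, fragment_length, effective_genome_size, base=1.0):
    # 1x normalization: coverage = mapped reads * fragment length / genome size
    current_coverage = float(bam_mapped * fragment_length) / effective_genome_size
    return base / current_coverage

def rpkm_scale_factor(bam_mapped, bin_size, base=1.0):
    # RPKM: reads per bin / (total reads in millions * bin length in kb)
    return base / ((bam_mapped / 1e6) * (bin_size / 1000.0))

# e.g. 20M mapped reads, 200 bp fragments, 2.1 Gb genome -> ~1.9x coverage,
# so one_x_scale_factor(20e6, 200, 2.1e9) == 0.525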
Example #2
def main(args=None):
    args = process_args(args)

    global debug
    if args.verbose:
        debug = 1
    else:
        debug = 0

    func_args = {'scaleFactor': get_scale_factor(args)}
    if args.MNase:
        # check that library is paired end
        # using getFragmentAndReadSize
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam,
                                                                    return_lengths=False,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose)
        if frag_len_dict is None:
            exit("*Error*: For the --MNAse function a paired end library is required. ")

        wr = CenterFragment([args.bam],
                            binLength=args.binSize,
                            stepSize=args.binSize,
                            region=args.region,
                            numberOfProcessors=args.numberOfProcessors,
                            extendReads=args.extendReads,
                            minMappingQuality=args.minMappingQuality,
                            ignoreDuplicates=args.ignoreDuplicates,
                            center_read=args.centerReads,
                            zerosToNans=args.skipNonCoveredRegions,
                            samFlag_include=args.samFlagInclude,
                            samFlag_exclude=args.samFlagExclude,
                            verbose=args.verbose,
                            )

    else:
        wr = writeBedGraph.WriteBedGraph([args.bam],
                                         binLength=args.binSize,
                                         stepSize=args.binSize,
                                         region=args.region,
                                         numberOfProcessors=args.numberOfProcessors,
                                         extendReads=args.extendReads,
                                         minMappingQuality=args.minMappingQuality,
                                         ignoreDuplicates=args.ignoreDuplicates,
                                         center_read=args.centerReads,
                                         zerosToNans=args.skipNonCoveredRegions,
                                         samFlag_include=args.samFlagInclude,
                                         samFlag_exclude=args.samFlagExclude,
                                         verbose=args.verbose,
                                         )

    wr.run(writeBedGraph.scaleCoverage, func_args, args.outFileName,
           format=args.outFileFormat, smoothLength=args.smoothLength)
Example #3
def main(args=None):
    args = parse_arguments().parse_args(args)
    fragment_len_dict, read_len_dict = get_read_and_fragment_length(args.bam, return_lengths=True,
                                                                    blackListFileName=args.blackListFileName,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose,
                                                                    binSize=args.binSize,
                                                                    distanceBetweenBins=args.distanceBetweenBins)

    if fragment_len_dict:
        if fragment_len_dict['mean'] == 0:
            print "No pairs were found. Is the data from a paired-end sequencing experiment?"

        print "Sample size: {}\n".format(fragment_len_dict['sample_size'])

        print "\nFragment lengths:"
        print "Min.: {}\n1st Qu.: {}\nMean: {}\nMedian: {}\n" \
              "3rd Qu.: {}\nMax.: {}\nStd: {}".format(fragment_len_dict['min'],
                                                      fragment_len_dict['qtile25'],
                                                      fragment_len_dict['mean'],
                                                      fragment_len_dict['median'],
                                                      fragment_len_dict['qtile75'],
                                                      fragment_len_dict['max'],
                                                      fragment_len_dict['std'])
    else:
        print "No pairs were found. Is the data from a paired-end sequencing experiment?"

    print "\nRead lengths:"
    print "Min.: {}\n1st Qu.: {}\nMean: {}\nMedian: {}\n" \
          "3rd Qu.: {}\nMax.: {}\nStd: {}".format(read_len_dict['min'],
                                                  read_len_dict['qtile25'],
                                                  read_len_dict['mean'],
                                                  read_len_dict['median'],
                                                  read_len_dict['qtile75'],
                                                  read_len_dict['max'],
                                                  read_len_dict['std'])

    if args.histogram:
        import matplotlib
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
        plt.hist(fragment_len_dict['lengths'], 50,
                 range=(fragment_len_dict['min'], fragment_len_dict['mean'] * 2),
                 density=True)  # 'normed' was removed in matplotlib 3.x
        plt.xlabel('Fragment Length')
        plt.ylabel('Frequency')
        plt.title(args.plotTitle)
        plt.savefig(args.histogram, bbox_inches=0)
        plt.close()
Example #4
def getFragSize(bam, args):
        fragment_len_dict, read_len_dict = get_read_and_fragment_length(bam, return_lengths=True,
                                                                        blackListFileName=args.blackListFileName,
                                                                        numberOfProcessors=args.numberOfProcessors,
                                                                        verbose=args.verbose,
                                                                        binSize=args.binSize,
                                                                        distanceBetweenBins=args.distanceBetweenBins)
        print("\n\nBAM file : {}".format(bam))
        if fragment_len_dict:
            if fragment_len_dict['mean'] == 0:
                print("No pairs were found. Is the data from a paired-end sequencing experiment?")

            print("Sample size: {}\n".format(fragment_len_dict['sample_size']))

            print("Fragment lengths:")
            print("Min.: {}\n1st Qu.: {}\nMean: {}\nMedian: {}\n"
                  "3rd Qu.: {}\nMax.: {}\nStd: {}".format(fragment_len_dict['min'],
                                                          fragment_len_dict['qtile25'],
                                                          fragment_len_dict['mean'],
                                                          fragment_len_dict['median'],
                                                          fragment_len_dict['qtile75'],
                                                          fragment_len_dict['max'],
                                                          fragment_len_dict['std']))
        else:
            print("No pairs were found. Is the data from a paired-end sequencing experiment?")

        print("\nRead lengths:")
        print("Min.: {}\n1st Qu.: {}\nMean: {}\nMedian: {}\n"
              "3rd Qu.: {}\nMax.: {}\nStd: {}".format(read_len_dict['min'],
                                                      read_len_dict['qtile25'],
                                                      read_len_dict['mean'],
                                                      read_len_dict['median'],
                                                      read_len_dict['qtile75'],
                                                      read_len_dict['max'],
                                                      read_len_dict['std']))
        return fragment_len_dict
Example #5
def get_scale_factor(args):

    scale_factor = args.scaleFactor
    bam_handle = bamHandler.openBam(args.bam)
    bam_mapped = parserCommon.bam_total_reads(bam_handle,
                                              args.ignoreForNormalization)
    blacklisted = parserCommon.bam_blacklisted_reads(
        bam_handle, args.ignoreForNormalization, args.blackListFileName)
    bam_mapped -= blacklisted

    if args.normalizeTo1x:
        # try to guess fragment length if the bam file contains paired end reads
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(
            args.bam,
            return_lengths=False,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            verbose=args.verbose)
        if args.extendReads:
            if args.extendReads is True:
                # try to guess fragment length if the bam file contains paired end reads
                if frag_len_dict:
                    fragment_length = frag_len_dict['median']
                else:
                    exit(
                        "*ERROR*: library is not paired-end. Please provide an extension length."
                    )
                if args.verbose:
                    print(
                        "Fragment length based on paired en data "
                        "estimated to be {}".format(frag_len_dict['median']))

            elif args.extendReads < 1:
                exit(
                    "*ERROR*: read extension must be bigger than one. Value give: {} "
                    .format(args.extendReads))
            elif args.extendReads > 2000:
                exit(
                    "*ERROR*: read extension must be smaller that 2000. Value give: {} "
                    .format(args.extendReads))
            else:
                fragment_length = args.extendReads

        else:
            # set as fragment length the read length
            fragment_length = int(read_len_dict['median'])
            if args.verbose:
                print "Estimated read length is {}".format(
                    int(read_len_dict['median']))

        current_coverage = \
            float(bam_mapped * fragment_length) / args.normalizeTo1x
        # the scaling sets the coverage to match 1x
        scale_factor *= 1.0 / current_coverage
        if debug:
            print "Estimated current coverage {}".format(current_coverage)
            print "Scaling factor {}".format(args.scaleFactor)

    elif args.normalizeUsingRPKM:
        # RPKM = reads per tile / (total reads in millions * tile length in kb)
        million_reads_mapped = float(bam_mapped) / 1e6
        tile_len_in_kb = float(args.binSize) / 1000

        scale_factor *= 1.0 / (million_reads_mapped * tile_len_in_kb)

        if debug:
            print "scale factor using RPKM is {0}".format(args.scaleFactor)

    return scale_factor
Example #6
def get_scale_factors(args):

    bam1 = bamHandler.openBam(args.bamfile1, args.bamIndex1)
    bam2 = bamHandler.openBam(args.bamfile2, args.bamIndex2)

    bam1_mapped = parserCommon.bam_total_reads(bam1,
                                               args.ignoreForNormalization)
    bam2_mapped = parserCommon.bam_total_reads(bam2,
                                               args.ignoreForNormalization)

    if args.scaleFactors:
        scale_factors = map(float, args.scaleFactors.split(":"))
    else:
        if args.scaleFactorsMethod == 'SES':
            scalefactors_dict = estimateScaleFactor(
                [bam1.filename, bam2.filename],
                args.sampleLength,
                args.numberOfSamples,
                1,
                numberOfProcessors=args.numberOfProcessors,
                verbose=args.verbose,
                chrsToSkip=args.ignoreForNormalization)

            scale_factors = scalefactors_dict['size_factors']

            if args.verbose:
                print "Size factors using SES: {}".format(scale_factors)
                print "%s regions of size %s where used " % \
                    (scalefactors_dict['sites_sampled'],
                     args.sampleLength)

                print "size factor if the number of mapped " \
                    "reads would have been used:"
                print tuple(
                    float(min(bam1.mapped, bam2.mapped)) /
                    np.array([bam1.mapped, bam2.mapped]))

        elif args.scaleFactorsMethod == 'readCount':
            scale_factors = float(min(bam1_mapped, bam2_mapped)) / np.array(
                [bam1_mapped, bam2_mapped])
            if args.verbose:
                print "Size factors using total number " \
                    "of mapped reads: {}".format(scale_factors)

    # in case the subtract method is used, the final difference
    # would be normalized according to the given method
    if args.ratio == 'subtract':
        # The next lines identify which of the samples is not scaled down.
        # The normalization using RPKM or normalize to 1x would use
        # as reference such sample. Since the other sample would be
        # scaled to match the un-scaled one, the normalization factor
        # for both samples should be based on the unscaled one.
        # For example, if sample A is unscaled and sample B is scaled by 0.5,
        # then normalizing factor for A to report RPKM read counts
        # is also applied to B.
        if scale_factors[0] == 1:
            mappedReads = bam1_mapped
            bamfile = args.bamfile1
            bamindex = args.bamIndex1
        else:
            mappedReads = bam2_mapped
            bamfile = args.bamfile2
            bamindex = args.bamIndex2

        if args.scaleFactors is None:
            if args.normalizeTo1x:
                # try to guess fragment length if the bam file contains paired end reads
                from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
                frag_len_dict, read_len_dict = get_read_and_fragment_length(
                    bamfile,
                    bamindex,
                    return_lengths=False,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)
                if args.extendReads:
                    if args.extendReads is True:
                        # try to guess fragment length if the bam file contains paired end reads
                        if frag_len_dict:
                            fragment_length = frag_len_dict['median']
                        else:
                            exit(
                                "*ERROR*: library is not paired-end. Please provide an extension length."
                            )
                        if args.verbose:
                            print(
                                "Fragment length based on paired en data "
                                "estimated to be {}".format(
                                    frag_len_dict['median']))

                    elif args.extendReads < 1:
                        exit(
                            "*ERROR*: read extension must be bigger than one. Value give: {} "
                            .format(args.extendReads))
                    elif args.extendReads > 2000:
                        exit(
                            "*ERROR*: read extension must be smaller that 2000. Value give: {} "
                            .format(args.extendReads))
                    else:
                        fragment_length = args.extendReads

                else:
                    # set as fragment length the read length
                    fragment_length = int(read_len_dict['median'])
                    if args.verbose:
                        print "Estimated read length is {}".format(
                            int(read_len_dict['median']))

                current_coverage = float(
                    mappedReads * fragment_length) / args.normalizeTo1x
                # the coverage scale factor is 1 / coverage,
                coverage_scale_factor = 1.0 / current_coverage
                scale_factors = np.array(scale_factors) * coverage_scale_factor
                if args.verbose:
                    print "Estimated current coverage {}".format(
                        current_coverage)
                    print "Scale factor to convert " \
                          "current coverage to 1: {}".format(coverage_scale_factor)
            else:
                # by default normalize using RPKM
                # the RPKM is:
                # Num reads per tile/(total reads (in millions)*tile length in Kb)
                millionReadsMapped = float(mappedReads) / 1e6
                tileLengthInKb = float(args.binSize) / 1000
                coverage_scale_factor = 1.0 / (millionReadsMapped *
                                               tileLengthInKb)
                scale_factors = np.array(scale_factors) * coverage_scale_factor

                if args.verbose:
                    print "scale factor for   "
                    "RPKM is {0}".format(coverage_scale_factor)

    return scale_factors
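
The 'readCount' branch above boils down to scaling both samples toward the smaller library. A minimal sketch, assuming NumPy (the function name is illustrative):

import numpy as np

def read_count_size_factors(bam1_mapped, bam2_mapped):
    # Scale each sample relative to the smaller library, so the
    # less-sequenced sample keeps a factor of 1.
    smallest = float(min(bam1_mapped, bam2_mapped))
    return smallest / np.array([bam1_mapped, bam2_mapped])

# e.g. read_count_size_factors(10e6, 20e6) -> array([1. , 0.5])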
Example #7
def main(args=None):
    args = process_args(args)

    global debug
    if args.verbose:
        debug = 1
    else:
        debug = 0

    if args.normalizeTo1x or args.normalizeUsingRPKM:
        # if a normalization is required then compute the scale factors
        scale_factor = get_scale_factor(args)
    else:
        scale_factor = args.scaleFactor

    func_args = {'scaleFactor': scale_factor}

    if args.MNase:
        # check that library is paired end
        # using getFragmentAndReadSize
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam,
                                                                    return_lengths=False,
                                                                    blackListFileName=args.blackListFileName,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose)
        if frag_len_dict is None:
            sys.exit("*Error*: For the --MNAse function a paired end library is required. ")

        # Set some default fragment length bounds
        if args.minFragmentLength == 0:
            args.minFragmentLength = 130
        if args.maxFragmentLength == 0:
            args.maxFragmentLength = 200

        wr = CenterFragment([args.bam],
                            binLength=args.binSize,
                            stepSize=args.binSize,
                            region=args.region,
                            blackListFileName=args.blackListFileName,
                            numberOfProcessors=args.numberOfProcessors,
                            extendReads=args.extendReads,
                            minMappingQuality=args.minMappingQuality,
                            ignoreDuplicates=args.ignoreDuplicates,
                            center_read=args.centerReads,
                            zerosToNans=args.skipNonCoveredRegions,
                            samFlag_include=args.samFlagInclude,
                            samFlag_exclude=args.samFlagExclude,
                            minFragmentLength=args.minFragmentLength,
                            maxFragmentLength=args.maxFragmentLength,
                            verbose=args.verbose,
                            )

    elif args.Offset:
        if len(args.Offset) > 1:
            if args.Offset[0] == 0:
                sys.exit("*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment.")
            if args.Offset[1] > 0 and args.Offset[1] < args.Offset[0]:
                sys.exir("'Error*: The right side bound is less than the left-side bound. This is inappropriate.")
        else:
            if args.Offset[0] == 0:
                sys.exit("*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment.")
        wr = OffsetFragment([args.bam],
                            binLength=args.binSize,
                            stepSize=args.binSize,
                            region=args.region,
                            numberOfProcessors=args.numberOfProcessors,
                            extendReads=args.extendReads,
                            minMappingQuality=args.minMappingQuality,
                            ignoreDuplicates=args.ignoreDuplicates,
                            center_read=args.centerReads,
                            zerosToNans=args.skipNonCoveredRegions,
                            samFlag_include=args.samFlagInclude,
                            samFlag_exclude=args.samFlagExclude,
                            minFragmentLength=args.minFragmentLength,
                            maxFragmentLength=args.maxFragmentLength,
                            verbose=args.verbose)
        wr.filter_strand = args.filterRNAstrand
        wr.Offset = args.Offset

    elif args.filterRNAstrand:
        wr = filterRnaStrand([args.bam],
                             binLength=args.binSize,
                             stepSize=args.binSize,
                             region=args.region,
                             numberOfProcessors=args.numberOfProcessors,
                             extendReads=args.extendReads,
                             minMappingQuality=args.minMappingQuality,
                             ignoreDuplicates=args.ignoreDuplicates,
                             center_read=args.centerReads,
                             zerosToNans=args.skipNonCoveredRegions,
                             samFlag_include=args.samFlagInclude,
                             samFlag_exclude=args.samFlagExclude,
                             minFragmentLength=args.minFragmentLength,
                             maxFragmentLength=args.maxFragmentLength,
                             verbose=args.verbose,
                             )

        wr.filter_strand = args.filterRNAstrand
    else:
        wr = writeBedGraph.WriteBedGraph([args.bam],
                                         binLength=args.binSize,
                                         stepSize=args.binSize,
                                         region=args.region,
                                         blackListFileName=args.blackListFileName,
                                         numberOfProcessors=args.numberOfProcessors,
                                         extendReads=args.extendReads,
                                         minMappingQuality=args.minMappingQuality,
                                         ignoreDuplicates=args.ignoreDuplicates,
                                         center_read=args.centerReads,
                                         zerosToNans=args.skipNonCoveredRegions,
                                         samFlag_include=args.samFlagInclude,
                                         samFlag_exclude=args.samFlagExclude,
                                         minFragmentLength=args.minFragmentLength,
                                         maxFragmentLength=args.maxFragmentLength,
                                         verbose=args.verbose,
                                         )

    wr.run(writeBedGraph.scaleCoverage, func_args, args.outFileName,
           blackListFileName=args.blackListFileName,
           format=args.outFileFormat, smoothLength=args.smoothLength)
Example #8
def get_scale_factor(args):
    scale_factor = args.scaleFactor
    bam_mapped, bam_mapped_total = get_num_kept_reads(args)
    if args.normalizeTo1x:
        # Print output, since normalization info isn't printed to stderr otherwise
        sys.stderr.write("normalization: 1x\n")

        # try to guess fragment length if the bam file contains paired end reads
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam,
                                                                    return_lengths=False,
                                                                    blackListFileName=args.blackListFileName,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose)
        if args.extendReads:
            if args.extendReads is True:
                # try to guess fragment length if the bam file contains paired end reads
                if frag_len_dict:
                    fragment_length = frag_len_dict['median']
                else:
                    exit("*ERROR*: library is not paired-end. Please provide an extension length.")
                if args.verbose:
                    print(("Fragment length based on paired en data "
                          "estimated to be {}".format(frag_len_dict['median'])))

            elif args.extendReads < 1:
                exit("*ERROR*: read extension must be bigger than one. Value give: {} ".format(args.extendReads))
            elif args.extendReads > 2000:
                exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(args.extendReads))
            else:
                fragment_length = args.extendReads

        else:
            # set as fragment length the read length
            fragment_length = int(read_len_dict['median'])
            if args.verbose:
                print("Estimated read length is {}".format(int(read_len_dict['median'])))

        current_coverage = \
            float(bam_mapped * fragment_length) / args.normalizeTo1x
        # the scaling sets the coverage to match 1x
        scale_factor *= 1.0 / current_coverage
        if debug:
            print("Estimated current coverage {}".format(current_coverage))
            print("Scaling factor {}".format(args.scaleFactor))

    elif args.normalizeUsingRPKM:
        # Print output, since normalization info isn't printed to stderr otherwise
        sys.stderr.write("normalization: RPKM\n")

        # RPKM = reads per tile / (total reads in millions * tile length in kb)
        million_reads_mapped = float(bam_mapped) / 1e6
        tile_len_in_kb = float(args.binSize) / 1000

        scale_factor *= 1.0 / (million_reads_mapped * tile_len_in_kb)

        if debug:
            print("scale factor using RPKM is {0}".format(args.scaleFactor))
    else:
        # Print output, since normalization info isn't printed to stderr otherwise
        sys.stderr.write("normalization: depth\n")

        scale_factor *= bam_mapped / float(bam_mapped_total)

    if args.verbose:
        print("Final scaling factor: {}".format(scale_factor))

    return scale_factor
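
The final 'depth' branch simply scales by the fraction of reads kept after filtering, so the output still reflects the original library depth. As a one-line sketch (illustrative names):

def depth_scale_factor(kept_reads, total_reads, base=1.0):
    # 'depth' normalization: compensate for reads removed by filtering.
    return base * kept_reads / float(total_reads)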
Example #9
    def __init__(self,
                 bamFilesList,
                 binLength=50,
                 numberOfSamples=None,
                 numberOfProcessors=1,
                 verbose=False,
                 region=None,
                 bedFile=None,
                 extendReads=False,
                 blackListFileName=None,
                 minMappingQuality=None,
                 ignoreDuplicates=False,
                 chrsToSkip=[],
                 stepSize=None,
                 center_read=False,
                 samFlag_include=None,
                 samFlag_exclude=None,
                 zerosToNans=False,
                 skipZeroOverZero=False,
                 smoothLength=0,
                 minFragmentLength=0,
                 maxFragmentLength=0,
                 out_file_for_raw_data=None,
                 bed_and_bin=False,
                 statsList=[],
                 mappedList=[]):

        self.bamFilesList = bamFilesList
        self.binLength = binLength
        self.numberOfSamples = numberOfSamples
        self.blackListFileName = blackListFileName
        self.statsList = statsList
        self.mappedList = mappedList
        self.skipZeroOverZero = skipZeroOverZero
        self.bed_and_bin = bed_and_bin

        if extendReads and len(bamFilesList):
            from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
            frag_len_dict, read_len_dict = get_read_and_fragment_length(
                bamFilesList[0],
                return_lengths=False,
                blackListFileName=blackListFileName,
                numberOfProcessors=numberOfProcessors,
                verbose=verbose)
            if extendReads is True:
                # try to guess fragment length if the bam file contains paired end reads
                if frag_len_dict:
                    self.defaultFragmentLength = int(frag_len_dict['median'])
                else:
                    exit(
                        "*ERROR*: library is not paired-end. Please provide an extension length."
                    )
                if verbose:
                    print(
                        ("Fragment length based on paired en data "
                         "estimated to be {}".format(frag_len_dict['median'])))

            elif extendReads < read_len_dict['median']:
                sys.stderr.write(
                    "*WARNING*: read extension is smaller than read length (read length = {}). "
                    "Reads will not be extended.\n".format(
                        int(read_len_dict['median'])))
                self.defaultFragmentLength = 'read length'

            elif extendReads > 2000:
                exit(
                    "*ERROR*: read extension must be smaller that 2000. Value give: {} "
                    .format(extendReads))
            else:
                self.defaultFragmentLength = int(extendReads)

        else:
            self.defaultFragmentLength = 'read length'

        self.numberOfProcessors = numberOfProcessors
        self.verbose = verbose
        self.region = region
        self.bedFile = bedFile
        self.minMappingQuality = minMappingQuality
        self.ignoreDuplicates = ignoreDuplicates
        self.chrsToSkip = chrsToSkip
        self.stepSize = stepSize
        self.center_read = center_read
        self.samFlag_include = samFlag_include
        self.samFlag_exclude = samFlag_exclude
        self.minFragmentLength = minFragmentLength
        self.maxFragmentLength = maxFragmentLength
        self.zerosToNans = zerosToNans
        self.smoothLength = smoothLength

        if out_file_for_raw_data:
            self.save_data = True
            self.out_file_for_raw_data = out_file_for_raw_data
        else:
            self.save_data = False
            self.out_file_for_raw_data = None

        # check that either numberOfSamples or stepSize is set
        if numberOfSamples is None and stepSize is None and bedFile is None:
            raise ValueError(
                "either stepSize, numberOfSamples or bedFile have to be set")

        if self.defaultFragmentLength != 'read length':
            self.maxPairedFragmentLength = 4 * self.defaultFragmentLength
        else:
            self.maxPairedFragmentLength = 1000
        if self.maxFragmentLength > 0:
            self.maxPairedFragmentLength = self.maxFragmentLength

        if len(self.mappedList) == 0:
            try:
                for fname in self.bamFilesList:
                    bam, mapped, unmapped, stats = bamHandler.openBam(
                        fname,
                        returnStats=True,
                        nThreads=self.numberOfProcessors)
                    self.mappedList.append(mapped)
                    self.statsList.append(stats)
                    bam.close()
            except:
                self.mappedList = []
                self.statsList = []
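
The maxPairedFragmentLength bookkeeping near the end of __init__ follows a simple precedence rule. A minimal sketch of that rule (the standalone function is illustrative, not part of deeptools):

def max_paired_fragment_length(default_fragment_length, max_fragment_length=0):
    # 4x the estimated fragment length, or a 1000 bp fallback when only
    # the read length is known; an explicit maxFragmentLength always wins.
    if default_fragment_length != 'read length':
        limit = 4 * default_fragment_length
    else:
        limit = 1000
    return max_fragment_length if max_fragment_length > 0 else limit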
Example #10
def main(args=None):
    args = parse_arguments().parse_args(args)

    if args.extraSampling:
        extra_sampling_file = args.extraSampling.name
        args.extraSampling.close()
    else:
        extra_sampling_file = None

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile
    global_vars['filter_out'] = args.blackListFileName
    global_vars['extra_sampling_file'] = extra_sampling_file

    bit = twobit.TwoBitFile(open(global_vars['2bit']))
    bam = bamHandler.openBam(global_vars['bam'])

    if args.fragmentLength:
        fragment_len_dict = \
            {'median': args.fragmentLength}

    else:
        fragment_len_dict, __ = \
            get_read_and_fragment_length(args.bamfile, None,
                                         numberOfProcessors=args.numberOfProcessors,
                                         verbose=args.verbose)
        if not fragment_len_dict:
            print "\nPlease provide the fragment length used for the " \
                "sample preparation.\n"
            exit(1)

        fragment_len_dict = {'median': int(fragment_len_dict['median'])}

    chrNameBitToBam = tbitToBamChrName(bit.index.keys(), bam.references)

    global_vars['genome_size'] = sum([bit[x].size for x in bit.index])
    global_vars['total_reads'] = bam.mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    confidence_p_value = float(1) / args.sampleSize

    # chromSizes: list of tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]

    # use a Poisson distribution to identify peaks that should be discarded.
    # The mean is multiplied by 4 because the real distribution of reads
    # varies with GC content, and the global number of reads per bp may be
    # too low; empirically, a value of at least 4 times reads_per_bp was
    # needed. Similarly, the minimum uses the mean divided by 4.
    global_vars['max_reads'] = \
        poisson(4 * global_vars['reads_per_bp'] *
                fragment_len_dict['median']).isf(confidence_p_value)
    # this may be of not use, unless the depth of sequencing is really high
    # as this value is close to 0
    global_vars['min_reads'] = \
        poisson(0.25 * global_vars['reads_per_bp'] *
                fragment_len_dict['median']).ppf(confidence_p_value)

    for key in global_vars:
        print "{}: {}".format(key, global_vars[key])

    print "computing frequencies"
    # the GC of the genome is sampled each stepSize bp.
    stepSize = max(int(global_vars['genome_size'] / args.sampleSize), 1)
    print "stepSize: {}".format(stepSize)
    data = tabulateGCcontent(fragment_len_dict,
                             chrNameBitToBam, stepSize,
                             chromSizes,
                             numberOfProcessors=args.numberOfProcessors,
                             verbose=args.verbose,
                             region=args.region)

    np.savetxt(args.GCbiasFrequenciesFile.name, data)

    if args.biasPlot:
        reads_per_gc = countReadsPerGC(args.regionSize,
                                       chrNameBitToBam, stepSize * 10,
                                       chromSizes,
                                       numberOfProcessors=args.numberOfProcessors,
                                       verbose=args.verbose,
                                       region=args.region)
        plotGCbias(args.biasPlot, data, reads_per_gc, args.regionSize, image_format=args.plotFileFormat)
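
The Poisson read-count bounds computed above can be reproduced in isolation. A sketch assuming SciPy and the same 4x/0.25x heuristics (the wrapper function is illustrative):

from scipy.stats import poisson

def read_count_bounds(reads_per_bp, fragment_length, sample_size):
    # One-sided p-value derived from the number of sampled positions.
    p = 1.0 / sample_size
    mean_reads = reads_per_bp * fragment_length
    # Inflate the mean 4x (and deflate it 4x for the minimum) to absorb
    # GC-dependent variation, as the comments above explain.
    max_reads = poisson(4 * mean_reads).isf(p)     # upper-tail cutoff
    min_reads = poisson(0.25 * mean_reads).ppf(p)  # lower-tail cutoff
    return min_reads, max_reads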
Example #11
def get_scale_factors(args):
    if args.ratio == 'subtract':
        # We need raw counts in this case
        normalizeTo1x = args.normalizeTo1x
        normalizeUsingRPKM = args.normalizeUsingRPKM
        args.normalizeTo1x = False
        args.normalizeUsingRPKM = False

    # This is only used if we subtract
    mapped_reads = [None, None]

    if args.scaleFactors:
        scale_factors = list(map(float, args.scaleFactors.split(":")))
    elif args.scaleFactorsMethod == 'SES':
        scalefactors_dict = estimateScaleFactor(
            [args.bamfile1, args.bamfile2],
            args.sampleLength, args.numberOfSamples,
            1,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            verbose=args.verbose,
            chrsToSkip=args.ignoreForNormalization)

        scale_factors = scalefactors_dict['size_factors']

        if args.verbose:
            bam1 = bamHandler.openBam(args.bamfile1)
            bam2 = bamHandler.openBam(args.bamfile2)

            print("Size factors using SES: {}".format(scale_factors))
            print("%s regions of size %s where used " %
                  (scalefactors_dict['sites_sampled'],
                   args.sampleLength))

            print("ignoring filtering/blacklists, size factors if the number of mapped "
                  "reads would have been used:")
            print(tuple(
                float(min(bam1.mapped, bam2.mapped)) / np.array([bam1.mapped, bam2.mapped])))
            bam1.close()
            bam2.close()

    elif args.scaleFactorsMethod == 'readCount':
        args.bam = args.bamfile1
        args.scaleFactor = 1.0
        bam1_mapped, _ = get_num_kept_reads(args)
        args.bam = args.bamfile2
        bam2_mapped, _ = get_num_kept_reads(args)
        scale_factors = float(min(bam1_mapped, bam2_mapped)) / np.array([bam1_mapped, bam2_mapped])
        mapped_reads = [bam1_mapped, bam2_mapped]
        if args.verbose:
            print("Size factors using total number "
                  "of mapped reads: {}".format(scale_factors))

    # in case the subtract method is used, the final difference
    # would be normalized according to the given method
    if args.ratio == 'subtract':
        # The next lines identify which of the samples is not scaled down.
        # The normalization using RPKM or normalize to 1x would use
        # as reference such sample. Since the other sample would be
        # scaled to match the un-scaled one, the normalization factor due to RPKM or normalize1x
        # for both samples should be based on the unscaled one.
        # For example, if sample A is unscaled and sample B is scaled by 0.5,
        # then normalizing factor for A to report RPKM read counts
        # is also applied to B.

        if args.scaleFactors is None:
            # check which of the two samples is not scaled down
            if scale_factors[0] == 1:
                args.bam = args.bamfile1
                mapped_reads = mapped_reads[0]
            else:
                args.bam = args.bamfile2
                mapped_reads = mapped_reads[1]
            if mapped_reads is None:
                mapped_reads, _ = get_num_kept_reads(args)

        # Replace the arguments
        args.normalizeTo1x = normalizeTo1x
        args.normalizeUsingRPKM = normalizeUsingRPKM

        if args.scaleFactors is None:
            if args.normalizeTo1x:
                # try to guess fragment length if the bam file contains paired end reads
                from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
                frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam,
                                                                            return_lengths=False,
                                                                            blackListFileName=args.blackListFileName,
                                                                            numberOfProcessors=args.numberOfProcessors,
                                                                            verbose=args.verbose)
                if args.extendReads:
                    if args.extendReads is True:
                        # try to guess fragment length if the bam file contains paired end reads
                        if frag_len_dict:
                            fragment_length = frag_len_dict['median']
                        else:
                            exit("*ERROR*: library is not paired-end. Please provide an extension length.")
                        if args.verbose:
                            print(("Fragment length based on paired en data "
                                  "estimated to be {}".format(frag_len_dict['median'])))

                    elif args.extendReads < 1:
                        exit("*ERROR*: read extension must be bigger than one. Value give: {} ".format(args.extendReads))
                    elif args.extendReads > 2000:
                        exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(args.extendReads))
                    else:
                        fragment_length = args.extendReads

                else:
                    # set as fragment length the read length
                    fragment_length = int(read_len_dict['median'])
                    if args.verbose:
                        print("Estimated read length is {}".format(int(read_len_dict['median'])))

                current_coverage = float(mapped_reads * fragment_length) / args.normalizeTo1x
                # the coverage scale factor is 1 / coverage,
                coverage_scale_factor = 1.0 / current_coverage
                scale_factors = np.array(scale_factors) * coverage_scale_factor
                if args.verbose:
                    print("Estimated current coverage {}".format(current_coverage))
                    print("Scale factor to convert "
                          "current coverage to 1: {}".format(coverage_scale_factor))
            else:
                # by default normalize using RPKM
                # the RPKM is:
                # Num reads per tile/(total reads (in millions)*tile length in Kb)
                millionReadsMapped = float(mapped_reads) / 1e6
                tileLengthInKb = float(args.binSize) / 1000
                coverage_scale_factor = 1.0 / (millionReadsMapped * tileLengthInKb)
                scale_factors = np.array(scale_factors) * coverage_scale_factor
                if args.verbose:
                    print("Scale factor for RPKM is {0}".format(coverage_scale_factor))

    return scale_factors
Example #12
def main(args=None):
    args = process_args(args)

    global debug
    if args.verbose:
        debug = 1
    else:
        debug = 0

    if args.normalizeTo1x or args.normalizeUsingRPKM:
        # if a normalization is required then compute the scale factors
        scale_factor = get_scale_factor(args)
    else:
        scale_factor = args.scaleFactor

    func_args = {'scaleFactor': scale_factor}

    if args.MNase:
        # check that library is paired end
        # using getFragmentAndReadSize
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(
            args.bam,
            return_lengths=False,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            verbose=args.verbose)
        if frag_len_dict is None:
            sys.exit(
                "*Error*: For the --MNAse function a paired end library is required. "
            )

        # Set some default fragment length bounds
        if args.minFragmentLength == 0:
            args.minFragmentLength = 130
        if args.maxFragmentLength == 0:
            args.maxFragmentLength = 200

        wr = CenterFragment(
            [args.bam],
            binLength=args.binSize,
            stepSize=args.binSize,
            region=args.region,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            extendReads=args.extendReads,
            minMappingQuality=args.minMappingQuality,
            ignoreDuplicates=args.ignoreDuplicates,
            center_read=args.centerReads,
            zerosToNans=args.skipNonCoveredRegions,
            samFlag_include=args.samFlagInclude,
            samFlag_exclude=args.samFlagExclude,
            minFragmentLength=args.minFragmentLength,
            maxFragmentLength=args.maxFragmentLength,
            verbose=args.verbose,
        )

    elif args.Offset:
        if len(args.Offset) > 1:
            if args.Offset[0] == 0:
                sys.exit(
                    "*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment."
                )
            if args.Offset[1] > 0 and args.Offset[1] < args.Offset[0]:
                sys.exit(
                    "*Error*: The right-side bound is less than the left-side bound. This is inappropriate."
                )
        else:
            if args.Offset[0] == 0:
                sys.exit(
                    "*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment."
                )
        wr = OffsetFragment([args.bam],
                            binLength=args.binSize,
                            stepSize=args.binSize,
                            region=args.region,
                            numberOfProcessors=args.numberOfProcessors,
                            extendReads=args.extendReads,
                            minMappingQuality=args.minMappingQuality,
                            ignoreDuplicates=args.ignoreDuplicates,
                            center_read=args.centerReads,
                            zerosToNans=args.skipNonCoveredRegions,
                            samFlag_include=args.samFlagInclude,
                            samFlag_exclude=args.samFlagExclude,
                            minFragmentLength=args.minFragmentLength,
                            maxFragmentLength=args.maxFragmentLength,
                            verbose=args.verbose)
        wr.filter_strand = args.filterRNAstrand
        wr.Offset = args.Offset

    elif args.filterRNAstrand:
        wr = filterRnaStrand(
            [args.bam],
            binLength=args.binSize,
            stepSize=args.binSize,
            region=args.region,
            numberOfProcessors=args.numberOfProcessors,
            extendReads=args.extendReads,
            minMappingQuality=args.minMappingQuality,
            ignoreDuplicates=args.ignoreDuplicates,
            center_read=args.centerReads,
            zerosToNans=args.skipNonCoveredRegions,
            samFlag_include=args.samFlagInclude,
            samFlag_exclude=args.samFlagExclude,
            minFragmentLength=args.minFragmentLength,
            maxFragmentLength=args.maxFragmentLength,
            verbose=args.verbose,
        )

        wr.filter_strand = args.filterRNAstrand
    else:
        wr = writeBedGraph.WriteBedGraph(
            [args.bam],
            binLength=args.binSize,
            stepSize=args.binSize,
            region=args.region,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            extendReads=args.extendReads,
            minMappingQuality=args.minMappingQuality,
            ignoreDuplicates=args.ignoreDuplicates,
            center_read=args.centerReads,
            zerosToNans=args.skipNonCoveredRegions,
            samFlag_include=args.samFlagInclude,
            samFlag_exclude=args.samFlagExclude,
            minFragmentLength=args.minFragmentLength,
            maxFragmentLength=args.maxFragmentLength,
            verbose=args.verbose,
        )

    wr.run(writeBedGraph.scaleCoverage,
           func_args,
           args.outFileName,
           blackListFileName=args.blackListFileName,
           format=args.outFileFormat,
           smoothLength=args.smoothLength)
Example #13
def get_scale_factors(args):
    if args.ratio == 'subtract':
        # We need raw counts in this case
        normalizeTo1x = args.normalizeTo1x
        normalizeUsingRPKM = args.normalizeUsingRPKM
        args.normalizeTo1x = False
        args.normalizeUsingRPKM = False

    # This is only used if we subtract
    mapped_reads = [None, None]

    if args.scaleFactors:
        scale_factors = list(map(float, args.scaleFactors.split(":")))
    elif args.scaleFactorsMethod == 'SES':
        scalefactors_dict = estimateScaleFactor(
            [args.bamfile1, args.bamfile2],
            args.sampleLength,
            args.numberOfSamples,
            1,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            verbose=args.verbose,
            chrsToSkip=args.ignoreForNormalization)

        scale_factors = scalefactors_dict['size_factors']

        if args.verbose:
            bam1 = bamHandler.openBam(args.bamfile1)
            bam2 = bamHandler.openBam(args.bamfile2)

            print("Size factors using SES: {}".format(scale_factors))
            print("%s regions of size %s where used " %
                  (scalefactors_dict['sites_sampled'], args.sampleLength))

            print(
                "ignoring filtering/blacklists, size factors if the number of mapped "
                "reads would have been used:")
            print(
                tuple(
                    float(min(bam1.mapped, bam2.mapped)) /
                    np.array([bam1.mapped, bam2.mapped])))
            bam1.close()
            bam2.close()

    elif args.scaleFactorsMethod == 'readCount':
        args.bam = args.bamfile1
        args.scaleFactor = 1.0
        bam1_mapped, _ = get_num_kept_reads(args)
        args.bam = args.bamfile2
        bam2_mapped, _ = get_num_kept_reads(args)
        scale_factors = float(min(bam1_mapped, bam2_mapped)) / np.array(
            [bam1_mapped, bam2_mapped])
        mapped_reads = [bam1_mapped, bam2_mapped]
        if args.verbose:
            print("Size factors using total number "
                  "of mapped reads: {}".format(scale_factors))

    # in case the subtract method is used, the final difference
    # would be normalized according to the given method
    if args.ratio == 'subtract':
        # The next lines identify which of the samples is not scaled down.
        # The normalization using RPKM or normalize to 1x would use
        # as reference such sample. Since the other sample would be
        # scaled to match the un-scaled one, the normalization factor due to RPKM or normalize1x
        # for both samples should be based on the unscaled one.
        # For example, if sample A is unscaled and sample B is scaled by 0.5,
        # then normalizing factor for A to report RPKM read counts
        # is also applied to B.

        if args.scaleFactors is None:
            # check which of the two samples is not scaled down
            if scale_factors[0] == 1:
                args.bam = args.bamfile1
                mapped_reads = mapped_reads[0]
            else:
                args.bam = args.bamfile2
                mapped_reads = mapped_reads[1]
            if mapped_reads is None:
                mapped_reads, _ = get_num_kept_reads(args)

        # Replace the arguments
        args.normalizeTo1x = normalizeTo1x
        args.normalizeUsingRPKM = normalizeUsingRPKM

        if args.scaleFactors is None:
            if args.normalizeTo1x:
                # try to guess fragment length if the bam file contains paired end reads
                from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
                frag_len_dict, read_len_dict = get_read_and_fragment_length(
                    args.bam,
                    return_lengths=False,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)
                if args.extendReads:
                    if args.extendReads is True:
                        # try to guess fragment length if the bam file contains paired end reads
                        if frag_len_dict:
                            fragment_length = frag_len_dict['median']
                        else:
                            exit(
                                "*ERROR*: library is not paired-end. Please provide an extension length."
                            )
                        if args.verbose:
                            print(("Fragment length based on paired en data "
                                   "estimated to be {}".format(
                                       frag_len_dict['median'])))

                    elif args.extendReads < 1:
                        exit(
                            "*ERROR*: read extension must be bigger than one. Value give: {} "
                            .format(args.extendReads))
                    elif args.extendReads > 2000:
                        exit(
                            "*ERROR*: read extension must be smaller that 2000. Value give: {} "
                            .format(args.extendReads))
                    else:
                        fragment_length = args.extendReads

                else:
                    # set as fragment length the read length
                    fragment_length = int(read_len_dict['median'])
                    if args.verbose:
                        print("Estimated read length is {}".format(
                            int(read_len_dict['median'])))

                current_coverage = float(
                    mapped_reads * fragment_length) / args.normalizeTo1x
                # the coverage scale factor is 1 / coverage,
                coverage_scale_factor = 1.0 / current_coverage
                scale_factors = np.array(scale_factors) * coverage_scale_factor
                if args.verbose:
                    print("Estimated current coverage {}".format(
                        current_coverage))
                    print("Scale factor to convert "
                          "current coverage to 1: {}".format(
                              coverage_scale_factor))
            else:
                # by default normalize using RPKM
                # the RPKM is:
                # Num reads per tile/(total reads (in millions)*tile length in Kb)
                millionReadsMapped = float(mapped_reads) / 1e6
                tileLengthInKb = float(args.binSize) / 1000
                coverage_scale_factor = 1.0 / (millionReadsMapped *
                                               tileLengthInKb)
                scale_factors = np.array(scale_factors) * coverage_scale_factor
                if args.verbose:
                    print("Scale factor for RPKM is {0}".format(
                        coverage_scale_factor))

    return scale_factors
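
For reference, a minimal sketch (made-up read counts; deeptools-style names) of how the RPKM coverage factor computed above rescales both samples' size factors at once:

import numpy as np

# Hypothetical inputs: sample B was scaled by 0.5 to match sample A,
# the unscaled sample (A) has 20 million kept reads, and bins are 50 bp.
scale_factors = [1.0, 0.5]
mapped_reads = 20e6
bin_size = 50

million_reads_mapped = mapped_reads / 1e6                              # 20.0
tile_len_in_kb = bin_size / 1000.0                                     # 0.05
coverage_scale_factor = 1.0 / (million_reads_mapped * tile_len_in_kb)  # 1.0

scale_factors = np.array(scale_factors) * coverage_scale_factor
print(scale_factors)  # [1.  0.5] -> the same RPKM factor applies to both samples
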
Ejemplo n.º 14
0
def get_scale_factor(args, stats):
    scale_factor = args.scaleFactor
    bam_mapped, bam_mapped_total = get_num_kept_reads(args, stats)
    if args.normalizeUsing == 'RPGC':
        # Print output, since normalization stuff isn't printed to stderr otherwise
        sys.stderr.write("normalization: 1x (effective genome size {})\n".format(args.effectiveGenomeSize))

        # try to guess fragment length if the bam file contains paired end reads
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam,
                                                                    return_lengths=False,
                                                                    blackListFileName=args.blackListFileName,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose)
        if args.extendReads:
            if args.extendReads is True:
                # try to guess fragment length if the bam file contains paired end reads
                if frag_len_dict:
                    fragment_length = frag_len_dict['median']
                else:
                    exit("*ERROR*: library is not paired-end. Please provide an extension length.")
                if args.verbose:
                    print(("Fragment length based on paired en data "
                          "estimated to be {}".format(frag_len_dict['median'])))

            elif args.extendReads < 1:
                exit("*ERROR*: read extension must be bigger than one. Value give: {} ".format(args.extendReads))
            elif args.extendReads > 2000:
                exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(args.extendReads))
            else:
                fragment_length = args.extendReads

        else:
            # set as fragment length the read length
            fragment_length = int(read_len_dict['median'])
            if args.verbose:
                print("Estimated read length is {}".format(int(read_len_dict['median'])))

        current_coverage = \
            float(bam_mapped * fragment_length) / args.effectiveGenomeSize
        # the scaling sets the coverage to match 1x
        scale_factor *= 1.0 / current_coverage
        if debug:
            print("Estimated current coverage {}".format(current_coverage))
            print("Scaling factor {}".format(args.scaleFactor))

    elif args.normalizeUsing == 'RPKM':
        # Print output, since normalization stuff isn't printed to stderr otherwise
        sys.stderr.write("normalization: RPKM\n")

        # the RPKM is the # reads per tile / \
        #    ( total reads (in millions) * tile length in Kb)
        million_reads_mapped = float(bam_mapped) / 1e6
        tile_len_in_kb = float(args.binSize) / 1000

        scale_factor *= 1.0 / (million_reads_mapped * tile_len_in_kb)

        if debug:
            print("scale factor using RPKM is {0}".format(args.scaleFactor))

    elif args.normalizeUsing == 'CPM':
        # Print output, since normalization stuff isn't printed to stderr otherwise
        sys.stderr.write("normalization: CPM\n")

        # the CPM (norm is based on post-filtering total counts of reads in BAM "bam_mapped")
        million_reads_mapped = float(bam_mapped) / 1e6
        scale_factor *= 1.0 / (million_reads_mapped)

        if debug:
            print("scale factor using CPM is {0}".format(args.scaleFactor))

    elif args.normalizeUsing == 'BPM':
        # Print output, since normalization stuff isn't printed to stderr otherwise
        sys.stderr.write("normalization: BPM\n")
        # the BPM (norm is based on post-filtering total counts of reads in BAM "bam_mapped")
        # sampled_bins_sum = getSampledSum(args.bam)
        tile_len_in_kb = float(args.binSize) / 1000
        tpm_scaleFactor = (bam_mapped / tile_len_in_kb) / 1e6

        scale_factor *= 1 / (tpm_scaleFactor * tile_len_in_kb)
        if debug:
            print("scale factor using BPM is {0}".format(args.scaleFactor))

    else:
        # Print output, since normalization stuff isn't printed to stderr otherwise
        sys.stderr.write("normalization: none (signal scaled by the fraction of alignments kept after filtering)\n")

        scale_factor *= bam_mapped / float(bam_mapped_total)

    if args.verbose:
        print("Final scaling factor: {}".format(scale_factor))

    return scale_factor
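
As a quick sanity check on the arithmetic above, a standalone sketch with invented numbers: 10 million kept reads of 200 bp fragments over a 2 Gb effective genome are already at 1x coverage, so RPGC leaves the factor at 1, while CPM depends only on the library size.

bam_mapped = 10_000_000                # invented post-filtering read count
fragment_length = 200                  # invented fragment length
effective_genome_size = 2_000_000_000  # invented --effectiveGenomeSize

# RPGC: rescale so that the mean genome coverage becomes 1x
current_coverage = float(bam_mapped * fragment_length) / effective_genome_size  # 1.0
rpgc_factor = 1.0 / current_coverage                                            # 1.0

# CPM: counts per million kept reads, independent of the bin size
cpm_factor = 1.0 / (bam_mapped / 1e6)                                           # 0.1

print(rpgc_factor, cpm_factor)  # 1.0 0.1
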
Ejemplo n.º 15
0
def main(args=None):
    args = process_args(args)

    global debug
    if args.verbose:
        debug = 1
    else:
        debug = 0

    func_args = {'scaleFactor': get_scale_factor(args)}

    if args.MNase:
        # check that library is paired end
        # using getFragmentAndReadSize
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(
            args.bam,
            return_lengths=False,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            verbose=args.verbose)
        if frag_len_dict is None:
            exit(
                "*Error*: For the --MNAse function a paired end library is required. "
            )

        wr = CenterFragment(
            [args.bam],
            binLength=args.binSize,
            stepSize=args.binSize,
            region=args.region,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            extendReads=args.extendReads,
            minMappingQuality=args.minMappingQuality,
            ignoreDuplicates=args.ignoreDuplicates,
            center_read=args.centerReads,
            zerosToNans=args.skipNonCoveredRegions,
            samFlag_include=args.samFlagInclude,
            samFlag_exclude=args.samFlagExclude,
            verbose=args.verbose,
        )

    elif args.filterRNAstrand:
        wr = filterRnaStrand(
            [args.bam],
            binLength=args.binSize,
            stepSize=args.binSize,
            region=args.region,
            numberOfProcessors=args.numberOfProcessors,
            extendReads=args.extendReads,
            minMappingQuality=args.minMappingQuality,
            ignoreDuplicates=args.ignoreDuplicates,
            center_read=args.centerReads,
            zerosToNans=args.skipNonCoveredRegions,
            samFlag_include=args.samFlagInclude,
            samFlag_exclude=args.samFlagExclude,
            verbose=args.verbose,
        )

        wr.filter_strand = args.filterRNAstrand
    else:
        wr = writeBedGraph.WriteBedGraph(
            [args.bam],
            binLength=args.binSize,
            stepSize=args.binSize,
            region=args.region,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            extendReads=args.extendReads,
            minMappingQuality=args.minMappingQuality,
            ignoreDuplicates=args.ignoreDuplicates,
            center_read=args.centerReads,
            zerosToNans=args.skipNonCoveredRegions,
            samFlag_include=args.samFlagInclude,
            samFlag_exclude=args.samFlagExclude,
            verbose=args.verbose,
        )

    wr.run(writeBedGraph.scaleCoverage,
           func_args,
           args.outFileName,
           blackListFileName=args.blackListFileName,
           format=args.outFileFormat,
           smoothLength=args.smoothLength)
Ejemplo n.º 16
0
def main(args=None):
    args = process_args(args)

    global debug
    if args.verbose:
        sys.stderr.write("Specified --scaleFactor: {}\n".format(args.scaleFactor))
        debug = 1
    else:
        debug = 0

    if args.normalizeUsing == 'None':
        args.normalizeUsing = None  # For the sake of sanity
    elif args.normalizeUsing == 'RPGC' and not args.effectiveGenomeSize:
        sys.exit("RPGC normalization requires an --effectiveGenomeSize!\n")

    if args.normalizeUsing:
        # if a normalization is required then compute the scale factors
        bam, mapped, unmapped, stats = openBam(args.bam, returnStats=True, nThreads=args.numberOfProcessors)
        bam.close()
        scale_factor = get_scale_factor(args, stats)
    else:
        scale_factor = args.scaleFactor

    func_args = {'scaleFactor': scale_factor}

    # This fixes issue #520, where --extendReads wasn't honored if --filterRNAstrand was used
    if args.filterRNAstrand and not args.Offset:
        args.Offset = [1, -1]

    if args.MNase:
        # check that library is paired end
        # using getFragmentAndReadSize
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam,
                                                                    return_lengths=False,
                                                                    blackListFileName=args.blackListFileName,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose)
        if frag_len_dict is None:
            sys.exit("*Error*: For the --MNAse function a paired end library is required. ")

        # Set some default fragment length bounds
        if args.minFragmentLength == 0:
            args.minFragmentLength = 130
        if args.maxFragmentLength == 0:
            args.maxFragmentLength = 200

        wr = CenterFragment([args.bam],
                            binLength=args.binSize,
                            stepSize=args.binSize,
                            region=args.region,
                            blackListFileName=args.blackListFileName,
                            numberOfProcessors=args.numberOfProcessors,
                            extendReads=args.extendReads,
                            minMappingQuality=args.minMappingQuality,
                            ignoreDuplicates=args.ignoreDuplicates,
                            center_read=args.centerReads,
                            zerosToNans=args.skipNonCoveredRegions,
                            samFlag_include=args.samFlagInclude,
                            samFlag_exclude=args.samFlagExclude,
                            minFragmentLength=args.minFragmentLength,
                            maxFragmentLength=args.maxFragmentLength,
                            chrsToSkip=args.ignoreForNormalization,
                            verbose=args.verbose,
                            )

    elif args.Offset:
        if len(args.Offset) > 1:
            if args.Offset[0] == 0:
                sys.exit("*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment.")
            if args.Offset[1] > 0 and args.Offset[1] < args.Offset[0]:
                sys.exir("'Error*: The right side bound is less than the left-side bound. This is inappropriate.")
        else:
            if args.Offset[0] == 0:
                sys.exit("*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment.")
        wr = OffsetFragment([args.bam],
                            binLength=args.binSize,
                            stepSize=args.binSize,
                            region=args.region,
                            numberOfProcessors=args.numberOfProcessors,
                            extendReads=args.extendReads,
                            minMappingQuality=args.minMappingQuality,
                            ignoreDuplicates=args.ignoreDuplicates,
                            center_read=args.centerReads,
                            zerosToNans=args.skipNonCoveredRegions,
                            samFlag_include=args.samFlagInclude,
                            samFlag_exclude=args.samFlagExclude,
                            minFragmentLength=args.minFragmentLength,
                            maxFragmentLength=args.maxFragmentLength,
                            chrsToSkip=args.ignoreForNormalization,
                            verbose=args.verbose)
        wr.filter_strand = args.filterRNAstrand
        wr.Offset = args.Offset
    else:
        wr = writeBedGraph.WriteBedGraph([args.bam],
                                         binLength=args.binSize,
                                         stepSize=args.binSize,
                                         region=args.region,
                                         blackListFileName=args.blackListFileName,
                                         numberOfProcessors=args.numberOfProcessors,
                                         extendReads=args.extendReads,
                                         minMappingQuality=args.minMappingQuality,
                                         ignoreDuplicates=args.ignoreDuplicates,
                                         center_read=args.centerReads,
                                         zerosToNans=args.skipNonCoveredRegions,
                                         samFlag_include=args.samFlagInclude,
                                         samFlag_exclude=args.samFlagExclude,
                                         minFragmentLength=args.minFragmentLength,
                                         maxFragmentLength=args.maxFragmentLength,
                                         chrsToSkip=args.ignoreForNormalization,
                                         verbose=args.verbose,
                                         )

    wr.run(writeBedGraph.scaleCoverage, func_args, args.outFileName,
           blackListFileName=args.blackListFileName,
           format=args.outFileFormat, smoothLength=args.smoothLength)
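
The offset checks in the example above are easy to get wrong, so here they are pulled out as a hypothetical helper (name invented): offsets are 1-based positions inside each alignment, 0 is invalid, and a positive right bound must not fall left of the left bound. [1, -1] is the pair the example substitutes when --filterRNAstrand is set; in deeptools' convention negative offsets count from the 3' end, so that pair spans the whole alignment.

def validate_offsets(offsets):
    # 0 is never valid: positions inside an alignment are 1-based
    if offsets[0] == 0:
        raise ValueError("An offset of 0 isn't allowed, since offsets are 1-based.")
    if len(offsets) > 1 and offsets[1] > 0 and offsets[1] < offsets[0]:
        raise ValueError("The right side bound is less than the left-side bound.")
    return offsets

validate_offsets([1, -1])    # OK: whole alignment
validate_offsets([5, 12])    # OK: bases 5..12 of each alignment
# validate_offsets([12, 5])  # would raise: right bound left of left bound
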
Ejemplo n.º 17
0
def getFragSize(bam, args, idx, outRawFrags):
    fragment_len_dict, read_len_dict = get_read_and_fragment_length(
        bam,
        return_lengths=True,
        blackListFileName=args.blackListFileName,
        numberOfProcessors=args.numberOfProcessors,
        verbose=args.verbose,
        binSize=args.binSize,
        distanceBetweenBins=args.distanceBetweenBins)

    if outRawFrags:
        label = bam
        if args.samplesLabel and idx < len(args.samplesLabel):
            label = args.samplesLabel[idx]
        if fragment_len_dict:
            fragment_len_dict['lengths'] = [
                int(x) for x in fragment_len_dict['lengths']
            ]
            cnts = np.bincount(fragment_len_dict['lengths'],
                               minlength=int(fragment_len_dict['max']) + 1)
        else:
            read_len_dict['lengths'] = [
                int(x) for x in read_len_dict['lengths']
            ]
            cnts = np.bincount(read_len_dict['lengths'],
                               minlength=int(read_len_dict['max']) + 1)
        for length, v in enumerate(cnts):
            if v > 0:
                outRawFrags.write("{}\t{}\t{}\n".format(length, v, label))

    if args.samplesLabel and idx < len(args.samplesLabel):
        print("\n\nSample label: {}".format(args.samplesLabel[idx]))
    else:
        print("\n\nBAM file : {}".format(bam))

    if fragment_len_dict:
        if fragment_len_dict['mean'] == 0:
            print(
                "No pairs were found. Is the data from a paired-end sequencing experiment?"
            )

        print("Sample size: {}\n".format(fragment_len_dict['sample_size']))

        print("Fragment lengths:")
        print("Min.: {}\n1st Qu.: {}\nMean: {}\nMedian: {}\n"
              "3rd Qu.: {}\nMax.: {}\nStd: {}".format(
                  fragment_len_dict['min'], fragment_len_dict['qtile25'],
                  fragment_len_dict['mean'], fragment_len_dict['median'],
                  fragment_len_dict['qtile75'], fragment_len_dict['max'],
                  fragment_len_dict['std']))
        print(
            "MAD: {}\nLen. 10%: {}\nLen. 20%: {}\nLen. 30%: {}\nLen. 40%: {}\nLen. 60%: {}\nLen. 70%: {}\nLen. 80%: {}\nLen. 90%: {}\nLen. 99%: {}\n"
            .format(fragment_len_dict['mad'], fragment_len_dict['qtile10'],
                    fragment_len_dict['qtile20'], fragment_len_dict['qtile30'],
                    fragment_len_dict['qtile40'], fragment_len_dict['qtile60'],
                    fragment_len_dict['qtile70'], fragment_len_dict['qtile80'],
                    fragment_len_dict['qtile90'],
                    fragment_len_dict['qtile99']))
    else:
        print(
            "No pairs were found. Is the data from a paired-end sequencing experiment?"
        )

    print("\nRead lengths:")
    print("Sample size: {}\n".format(read_len_dict['sample_size']))
    print("Min.: {}\n1st Qu.: {}\nMean: {}\nMedian: {}\n"
          "3rd Qu.: {}\nMax.: {}\nStd: {}".format(
              read_len_dict['min'], read_len_dict['qtile25'],
              read_len_dict['mean'], read_len_dict['median'],
              read_len_dict['qtile75'], read_len_dict['max'],
              read_len_dict['std']))
    print(
        "MAD: {}\nLen. 10%: {}\nLen. 20%: {}\nLen. 30%: {}\nLen. 40%: {}\nLen. 60%: {}\nLen. 70%: {}\nLen. 80%: {}\nLen. 90%: {}\nLen. 99%: {}\n"
        .format(read_len_dict['mad'], read_len_dict['qtile10'],
                read_len_dict['qtile20'], read_len_dict['qtile30'],
                read_len_dict['qtile40'], read_len_dict['qtile60'],
                read_len_dict['qtile70'], read_len_dict['qtile80'],
                read_len_dict['qtile90'], read_len_dict['qtile99']))

    # The read and fragment lists will just eat up memory if not removed!
    if args.histogram:
        if fragment_len_dict:
            maxVal = fragment_len_dict['mean'] * 2
            minVal = fragment_len_dict['min']
        else:
            maxVal = read_len_dict['mean'] * 2
            minVal = read_len_dict['min']
        if args.maxFragmentLength > 0:
            maxVal = args.maxFragmentLength

        if fragment_len_dict:
            fragment_len_dict['lengths'] = getDensity(
                fragment_len_dict['lengths'], minVal, maxVal)
        if read_len_dict:
            read_len_dict['lengths'] = getDensity(read_len_dict['lengths'],
                                                  minVal, maxVal)
    else:
        if fragment_len_dict:
            del fragment_len_dict['lengths']
        if read_len_dict:
            del read_len_dict['lengths']

    return (fragment_len_dict, read_len_dict)
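
The raw-fragment table written above is just an np.bincount over integer lengths; a self-contained toy version of that step (fabricated lengths and label):

import numpy as np

lengths = [180, 195, 180, 210, 195, 180]  # toy fragment lengths
label = "sample1"                         # hypothetical sample label

cnts = np.bincount(lengths, minlength=max(lengths) + 1)
for size, v in enumerate(cnts):
    if v > 0:
        print("{}\t{}\t{}".format(size, v, label))
# 180	3	sample1
# 195	2	sample1
# 210	1	sample1
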
Ejemplo n.º 18
0
def main(args=None):
    args = parse_arguments().parse_args(args)

    if args.extraSampling:
        extra_sampling_file = args.extraSampling.name
        args.extraSampling.close()
    else:
        extra_sampling_file = None

    global global_vars
    global_vars = {}
    global_vars['2bit'] = args.genome
    global_vars['bam'] = args.bamfile
    global_vars['filter_out'] = args.blackListFileName
    global_vars['extra_sampling_file'] = extra_sampling_file

    tbit = py2bit.open(global_vars['2bit'])
    bam, mapped, unmapped, stats = bamHandler.openBam(global_vars['bam'], returnStats=True, nThreads=args.numberOfProcessors)

    if args.fragmentLength:
        fragment_len_dict = \
            {'median': args.fragmentLength}

    else:
        fragment_len_dict, __ = \
            get_read_and_fragment_length(args.bamfile, None,
                                         numberOfProcessors=args.numberOfProcessors,
                                         verbose=args.verbose)
        if not fragment_len_dict:
            print("\nPlease provide the fragment length used for the "
                  "sample preparation.\n")
            exit(1)

        fragment_len_dict = {'median': int(fragment_len_dict['median'])}

    chrNameBitToBam = tbitToBamChrName(list(tbit.chroms().keys()), bam.references)

    global_vars['genome_size'] = sum(tbit.chroms().values())
    global_vars['total_reads'] = mapped
    global_vars['reads_per_bp'] = \
        float(global_vars['total_reads']) / args.effectiveGenomeSize

    confidence_p_value = float(1) / args.sampleSize

    # chromSizes: list of tuples
    chromSizes = [(bam.references[i], bam.lengths[i])
                  for i in range(len(bam.references))]
    chromSizes = [x for x in chromSizes if x[0] in tbit.chroms()]

    # use a Poisson distribution to identify peaks that should be discarded.
    # I multiply by 4 because the real distribution of reads
    # varies depending on the GC content
    # and the global number of reads per bp may be too low.
    # Empirically, a value at least 4 times as big as the
    # reads_per_bp was found to work.
    # Similarly, for the min value, I divide by 4.
    global_vars['max_reads'] = poisson(4 * global_vars['reads_per_bp'] * fragment_len_dict['median']).isf(confidence_p_value)
    # this may be of no use unless the depth of sequencing is really high,
    # as this value is close to 0
    global_vars['min_reads'] = poisson(0.25 * global_vars['reads_per_bp'] * fragment_len_dict['median']).ppf(confidence_p_value)

    for key in global_vars:
        print("{}: {}".format(key, global_vars[key]))

    print("computing frequencies")
    # the GC of the genome is sampled every stepSize bp.
    stepSize = max(int(global_vars['genome_size'] / args.sampleSize), 1)
    print("stepSize: {}".format(stepSize))
    data = tabulateGCcontent(fragment_len_dict,
                             chrNameBitToBam, stepSize,
                             chromSizes,
                             numberOfProcessors=args.numberOfProcessors,
                             verbose=args.verbose,
                             region=args.region)

    np.savetxt(args.GCbiasFrequenciesFile.name, data)

    if args.biasPlot:
        reads_per_gc = countReadsPerGC(args.regionSize,
                                       chrNameBitToBam, stepSize * 10,
                                       chromSizes,
                                       numberOfProcessors=args.numberOfProcessors,
                                       verbose=args.verbose,
                                       region=args.region)
        if args.plotFileFormat == "plotly":
            plotlyGCbias(args.biasPlot, data, reads_per_gc, args.regionSize)
        else:
            plotGCbias(args.biasPlot, data, reads_per_gc, args.regionSize, image_format=args.plotFileFormat)
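
The max_reads/min_reads cutoffs above come straight from the Poisson survival and quantile functions; a worked sketch with invented depth values shows the scale of the thresholds:

from scipy.stats import poisson

reads_per_bp = 0.01                # invented genome-wide depth
fragment_median = 200              # invented median fragment length
confidence_p_value = 1.0 / 50000   # i.e. an --sampleSize of 50000

# upper cutoff: rate inflated by 4 to allow for GC-dependent variation
max_reads = poisson(4 * reads_per_bp * fragment_median).isf(confidence_p_value)
# lower cutoff: rate deflated by 4; typically 0 at usual depths
min_reads = poisson(0.25 * reads_per_bp * fragment_median).ppf(confidence_p_value)

print(max_reads, min_reads)  # roughly 22.0 and 0.0 for these numbers
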
Ejemplo n.º 19
0
def main(args=None):

    args = parse_arguments().parse_args(args)

    if not args.outRawCounts and not args.plotFile:
        sys.exit(
            "Error: You need to specify at least one of --plotFile or --outRawCounts!\n"
        )

    if args.labels is None:
        args.labels = args.bamfiles
    if args.smartLabels:
        args.labels = smartLabels(args.bamfiles)
    if len(args.labels) != len(args.bamfiles):
        sys.exit(
            "Error: The number of labels ({0}) does not match the number of BAM files ({1})!"
            .format(len(args.labels), len(args.bamfiles)))

    global gtf
    if not args.regionLabels and args.smartLabels:
        args.regionLabels = smartLabels(args.BED)
    gtf = Enrichment(args.BED,
                     keepExons=args.keepExons,
                     labels=args.regionLabels)

    # Get fragment size and chromosome dict
    fhs = [openBam(x) for x in args.bamfiles]
    chromSize, non_common_chr = getCommonChrNames(fhs, verbose=args.verbose)
    for fh in fhs:
        fh.close()

    frag_len_dict, read_len_dict = get_read_and_fragment_length(
        args.bamfiles[0],
        return_lengths=False,
        blackListFileName=args.blackListFileName,
        numberOfProcessors=args.numberOfProcessors,
        verbose=args.verbose)
    if args.extendReads:
        if args.extendReads is True:
            # try to guess fragment length if the bam file contains paired end reads
            if frag_len_dict:
                defaultFragmentLength = frag_len_dict['median']
            else:
                sys.exit(
                    "*ERROR*: library is not paired-end. Please provide an extension length."
                )
            if args.verbose:
                print("Fragment length based on paired en data "
                      "estimated to be {0}".format(frag_len_dict['median']))
        elif args.extendReads < read_len_dict['median']:
            sys.stderr.write(
                "*WARNING*: read extension is smaller than read length (read length = {}). "
                "Reads will not be extended.\n".format(
                    int(read_len_dict['median'])))
            defaultFragmentLength = 'read length'
        elif args.extendReads > 2000:
            sys.exit(
                "*ERROR*: read extension must be smaller that 2000. Value give: {} "
                .format(args.extendReads))
        else:
            defaultFragmentLength = args.extendReads
    else:
        defaultFragmentLength = 'read length'

    # Get the chunkLength
    chunkLength = getChunkLength(args, chromSize)

    # Map reduce to get the counts/file/feature
    res = mapReduce([args, defaultFragmentLength],
                    getEnrichment_worker,
                    chromSize,
                    genomeChunkLength=chunkLength,
                    region=args.region,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    features = res[0][1]
    featureCounts = []
    for i in list(range(len(args.bamfiles))):
        d = dict()
        for x in features:
            d[x] = 0
        featureCounts.append(d)

    # res is a list, with each element a list (length len(args.bamfiles)) of dicts
    totalCounts = [0] * len(args.bamfiles)
    for x in res:
        for i, y in enumerate(x[2]):
            totalCounts[i] += y
        for i, y in enumerate(x[0]):
            for k, v in y.items():
                featureCounts[i][k] += v

    # Make a plot
    if args.plotFile:
        plotEnrichment(args, featureCounts, totalCounts, features)

    # Raw counts
    if args.outRawCounts:
        of = open(args.outRawCounts, "w")
        of.write(
            "file\tfeatureType\tpercent\tfeatureReadCount\ttotalReadCount\n")
        for i, x in enumerate(args.labels):
            for k, v in featureCounts[i].items():
                of.write("{0}\t{1}\t{2:5.2f}\t{3}\t{4}\n".format(
                    x, k, (100.0 * v) / totalCounts[i], v, totalCounts[i]))
        of.close()
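
The reduction step above only sums per-chunk dictionaries; running the same bookkeeping on two fabricated chunk results (two BAM files) makes the shape of res explicit:

# Each element mimics a getEnrichment_worker result:
# (per-file feature-count dicts, feature names, per-file total counts)
res = [
    ([{"exon": 10, "intron": 5}, {"exon": 8, "intron": 2}], ["exon", "intron"], [100, 80]),
    ([{"exon": 4, "intron": 6}, {"exon": 9, "intron": 1}], ["exon", "intron"], [60, 90]),
]

features = res[0][1]
featureCounts = [{x: 0 for x in features} for _ in range(2)]
totalCounts = [0, 0]

for x in res:
    for i, y in enumerate(x[2]):
        totalCounts[i] += y
    for i, y in enumerate(x[0]):
        for k, v in y.items():
            featureCounts[i][k] += v

print(totalCounts)    # [160, 170]
print(featureCounts)  # [{'exon': 14, 'intron': 11}, {'exon': 17, 'intron': 3}]
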
Ejemplo n.º 20
0
def get_scale_factor(args):
    scale_factor = args.scaleFactor
    bam_mapped, bam_mapped_total = get_num_kept_reads(args)
    if args.normalizeTo1x:
        # Print output, since normalization stuff isn't printed to stderr otherwise
        sys.stderr.write(
            "normalization: 1x (effective genome size {})\n".format(
                args.normalizeTo1x))

        # try to guess fragment length if the bam file contains paired end reads
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(
            args.bam,
            return_lengths=False,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            verbose=args.verbose)
        if args.extendReads:
            if args.extendReads is True:
                # try to guess fragment length if the bam file contains paired end reads
                if frag_len_dict:
                    fragment_length = frag_len_dict['median']
                else:
                    exit(
                        "*ERROR*: library is not paired-end. Please provide an extension length."
                    )
                if args.verbose:
                    print(
                        ("Fragment length based on paired en data "
                         "estimated to be {}".format(frag_len_dict['median'])))

            elif args.extendReads < 1:
                exit(
                    "*ERROR*: read extension must be bigger than one. Value give: {} "
                    .format(args.extendReads))
            elif args.extendReads > 2000:
                exit(
                    "*ERROR*: read extension must be smaller that 2000. Value give: {} "
                    .format(args.extendReads))
            else:
                fragment_length = args.extendReads

        else:
            # set as fragment length the read length
            fragment_length = int(read_len_dict['median'])
            if args.verbose:
                print("Estimated read length is {}".format(
                    int(read_len_dict['median'])))

        current_coverage = \
            float(bam_mapped * fragment_length) / args.normalizeTo1x
        # the scaling sets the coverage to match 1x
        scale_factor *= 1.0 / current_coverage
        if debug:
            print("Estimated current coverage {}".format(current_coverage))
            print("Scaling factor {}".format(args.scaleFactor))

    elif args.normalizeUsingRPKM:
        # Print output, since normalization stuff isn't printed to stderr otherwise
        sys.stderr.write("normalization: RPKM\n")

        # the RPKM is the # reads per tile / \
        #    ( total reads (in millions) * tile length in Kb)
        million_reads_mapped = float(bam_mapped) / 1e6
        tile_len_in_kb = float(args.binSize) / 1000

        scale_factor *= 1.0 / (million_reads_mapped * tile_len_in_kb)

        if debug:
            print("scale factor using RPKM is {0}".format(args.scaleFactor))
    else:
        # Print output, since normalization stuff isn't printed to stderr otherwise
        sys.stderr.write("normalization: depth\n")

        scale_factor *= bam_mapped / float(bam_mapped_total)

    if args.verbose:
        print("Final scaling factor: {}".format(scale_factor))

    return scale_factor
Ejemplo n.º 21
0
def main(args=None):
    args = process_args(args)

    global debug
    if args.verbose:
        sys.stderr.write("Specified --scaleFactor: {}\n".format(
            args.scaleFactor))
        debug = 1
    else:
        debug = 0

    if args.normalizeUsing == 'None':
        args.normalizeUsing = None  # For the sake of sanity
    elif args.normalizeUsing == 'RPGC' and not args.effectiveGenomeSize:
        sys.exit("RPGC normalization requires an --effectiveGenomeSize!\n")

    if args.normalizeUsing:
        # if a normalization is required then compute the scale factors
        bam, mapped, unmapped, stats = openBam(
            args.bam, returnStats=True, nThreads=args.numberOfProcessors)
        bam.close()
        scale_factor = get_scale_factor(args, stats)
    else:
        scale_factor = args.scaleFactor

    func_args = {'scaleFactor': scale_factor}

    # This fixes issue #520, where --extendReads wasn't honored if --filterRNAstrand was used
    if args.filterRNAstrand and not args.Offset:
        args.Offset = [1, -1]

    if args.MNase:
        # check that library is paired end
        # using getFragmentAndReadSize
        from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
        frag_len_dict, read_len_dict = get_read_and_fragment_length(
            args.bam,
            return_lengths=False,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            verbose=args.verbose)
        if frag_len_dict is None:
            sys.exit(
                "*Error*: For the --MNAse function a paired end library is required. "
            )

        # Set some default fragment length bounds
        if args.minFragmentLength == 0:
            args.minFragmentLength = 130
        if args.maxFragmentLength == 0:
            args.maxFragmentLength = 200

        wr = CenterFragment(
            [args.bam],
            binLength=args.binSize,
            stepSize=args.binSize,
            region=args.region,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            extendReads=args.extendReads,
            minMappingQuality=args.minMappingQuality,
            ignoreDuplicates=args.ignoreDuplicates,
            center_read=args.centerReads,
            zerosToNans=args.skipNonCoveredRegions,
            samFlag_include=args.samFlagInclude,
            samFlag_exclude=args.samFlagExclude,
            minFragmentLength=args.minFragmentLength,
            maxFragmentLength=args.maxFragmentLength,
            chrsToSkip=args.ignoreForNormalization,
            verbose=args.verbose,
        )

    elif args.Offset:
        if len(args.Offset) > 1:
            if args.Offset[0] == 0:
                sys.exit(
                    "*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment."
                )
            if args.Offset[1] > 0 and args.Offset[1] < args.Offset[0]:
                sys.exit(
                    "*Error*: The right side bound is less than the left-side bound. This is inappropriate."
                )
        else:
            if args.Offset[0] == 0:
                sys.exit(
                    "*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment."
                )
        wr = OffsetFragment([args.bam],
                            binLength=args.binSize,
                            stepSize=args.binSize,
                            region=args.region,
                            numberOfProcessors=args.numberOfProcessors,
                            extendReads=args.extendReads,
                            minMappingQuality=args.minMappingQuality,
                            ignoreDuplicates=args.ignoreDuplicates,
                            center_read=args.centerReads,
                            zerosToNans=args.skipNonCoveredRegions,
                            samFlag_include=args.samFlagInclude,
                            samFlag_exclude=args.samFlagExclude,
                            minFragmentLength=args.minFragmentLength,
                            maxFragmentLength=args.maxFragmentLength,
                            chrsToSkip=args.ignoreForNormalization,
                            verbose=args.verbose)
        wr.filter_strand = args.filterRNAstrand
        wr.Offset = args.Offset
    else:
        wr = writeBedGraph.WriteBedGraph(
            [args.bam],
            binLength=args.binSize,
            stepSize=args.binSize,
            region=args.region,
            blackListFileName=args.blackListFileName,
            numberOfProcessors=args.numberOfProcessors,
            extendReads=args.extendReads,
            minMappingQuality=args.minMappingQuality,
            ignoreDuplicates=args.ignoreDuplicates,
            center_read=args.centerReads,
            zerosToNans=args.skipNonCoveredRegions,
            samFlag_include=args.samFlagInclude,
            samFlag_exclude=args.samFlagExclude,
            minFragmentLength=args.minFragmentLength,
            maxFragmentLength=args.maxFragmentLength,
            chrsToSkip=args.ignoreForNormalization,
            verbose=args.verbose,
        )

    wr.run(writeBedGraph.scaleCoverage,
           func_args,
           args.outFileName,
           blackListFileName=args.blackListFileName,
           format=args.outFileFormat,
           smoothLength=args.smoothLength)
Ejemplo n.º 22
0
    def __init__(self, bamFilesList, binLength=50, numberOfSamples=None, numberOfProcessors=1,
                 verbose=False, region=None,
                 bedFile=None, extendReads=False,
                 blackListFileName=None,
                 minMappingQuality=None,
                 ignoreDuplicates=False,
                 chrsToSkip=[],
                 stepSize=None,
                 center_read=False,
                 samFlag_include=None,
                 samFlag_exclude=None,
                 zerosToNans=False,
                 smoothLength=0,
                 minFragmentLength=0,
                 maxFragmentLength=0,
                 out_file_for_raw_data=None):

        self.bamFilesList = bamFilesList
        self.binLength = binLength
        self.numberOfSamples = numberOfSamples
        self.blackListFileName = blackListFileName

        if extendReads and len(bamFilesList):
            from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
            frag_len_dict, read_len_dict = get_read_and_fragment_length(bamFilesList[0],
                                                                        return_lengths=False,
                                                                        blackListFileName=blackListFileName,
                                                                        numberOfProcessors=numberOfProcessors,
                                                                        verbose=verbose)
            if extendReads is True:
                # try to guess fragment length if the bam file contains paired end reads
                if frag_len_dict:
                    self.defaultFragmentLength = int(frag_len_dict['median'])
                else:
                    exit("*ERROR*: library is not paired-end. Please provide an extension length.")
                if verbose:
                    print(("Fragment length based on paired en data "
                          "estimated to be {}".format(frag_len_dict['median'])))

            elif extendReads < read_len_dict['median']:
                sys.stderr.write("*WARNING*: read extension is smaller than read length (read length = {}). "
                                 "Reads will not be extended.\n".format(int(read_len_dict['median'])))
                self.defaultFragmentLength = 'read length'

            elif extendReads > 2000:
                exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(extendReads))
            else:
                self.defaultFragmentLength = int(extendReads)

        else:
            self.defaultFragmentLength = 'read length'

        self.numberOfProcessors = numberOfProcessors
        self.verbose = verbose
        self.region = region
        self.bedFile = bedFile
        self.minMappingQuality = minMappingQuality
        self.ignoreDuplicates = ignoreDuplicates
        self.chrsToSkip = chrsToSkip
        self.stepSize = stepSize
        self.center_read = center_read
        self.samFlag_include = samFlag_include
        self.samFlag_exclude = samFlag_exclude
        self.minFragmentLength = minFragmentLength
        self.maxFragmentLength = maxFragmentLength
        self.zerosToNans = zerosToNans
        self.smoothLength = smoothLength

        if out_file_for_raw_data:
            self.save_data = True
            self.out_file_for_raw_data = out_file_for_raw_data
        else:
            self.save_data = False
            self.out_file_for_raw_data = None

        # check that either numberOfSamples or stepSize is set
        if numberOfSamples is None and stepSize is None and bedFile is None:
            raise ValueError("either stepSize, numberOfSamples or bedFile have to be set")

        if self.defaultFragmentLength != 'read length':
            self.maxPairedFragmentLength = 4 * self.defaultFragmentLength
        else:
            self.maxPairedFragmentLength = 1000
        if self.maxFragmentLength > 0:
            self.maxPairedFragmentLength = self.maxFragmentLength
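
The constructor's tail encodes a small policy for the maximum paired-end fragment length; the same decision as a standalone helper (function name invented) is easier to test:

def max_paired_fragment_length(default_fragment_length, max_fragment_length=0):
    # an explicit --maxFragmentLength always wins
    if max_fragment_length > 0:
        return max_fragment_length
    # otherwise 4x the known fragment length, with a 1000 bp fallback
    # when only the read length is known
    if default_fragment_length != 'read length':
        return 4 * default_fragment_length
    return 1000

print(max_paired_fragment_length(200))            # 800
print(max_paired_fragment_length('read length'))  # 1000
print(max_paired_fragment_length(200, 500))       # 500
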
Ejemplo n.º 23
0
def main(args=None):

    args = parse_arguments().parse_args(args)

    if not args.outRawCounts and not args.plotFile:
        sys.exit("Error: You need to specify at least one of --plotFile or --outRawCounts!\n")

    if args.labels is None:
        args.labels = args.bamfiles
    if args.smartLabels:
        args.labels = smartLabels(args.bamfiles)
    if len(args.labels) != len(args.bamfiles):
        sys.exit("Error: The number of labels ({0}) does not match the number of BAM files ({1})!".format(len(args.labels), len(args.bamfiles)))

    # Ensure that if we're given an attributeKey that it's not empty
    if args.attributeKey == "":
        args.attributeKey = None

    global gtf
    if not args.regionLabels and args.smartLabels:
        args.regionLabels = smartLabels(args.BED)
    gtf = Enrichment(args.BED, keepExons=args.keepExons, labels=args.regionLabels, attributeKey=args.attributeKey)

    # Get fragment size and chromosome dict
    fhs = [openBam(x) for x in args.bamfiles]
    chromSize, non_common_chr = getCommonChrNames(fhs, verbose=args.verbose)
    for fh in fhs:
        fh.close()

    frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bamfiles[0],
                                                                return_lengths=False,
                                                                blackListFileName=args.blackListFileName,
                                                                numberOfProcessors=args.numberOfProcessors,
                                                                verbose=args.verbose)
    if args.extendReads:
        if args.extendReads is True:
            # try to guess fragment length if the bam file contains paired end reads
            if frag_len_dict:
                defaultFragmentLength = frag_len_dict['median']
            else:
                sys.exit("*ERROR*: library is not paired-end. Please provide an extension length.")
            if args.verbose:
                print("Fragment length based on paired en data "
                      "estimated to be {0}".format(frag_len_dict['median']))
        elif args.extendReads < read_len_dict['median']:
            sys.stderr.write("*WARNING*: read extension is smaller than read length (read length = {}). "
                             "Reads will not be extended.\n".format(int(read_len_dict['median'])))
            defaultFragmentLength = 'read length'
        elif args.extendReads > 2000:
            sys.exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(args.extendReads))
        else:
            defaultFragmentLength = args.extendReads
    else:
        defaultFragmentLength = 'read length'

    # Get the chunkLength
    chunkLength = getChunkLength(args, chromSize)

    # Map reduce to get the counts/file/feature
    res = mapReduce([args, defaultFragmentLength],
                    getEnrichment_worker,
                    chromSize,
                    genomeChunkLength=chunkLength,
                    region=args.region,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    features = res[0][1]
    featureCounts = []
    for i in list(range(len(args.bamfiles))):
        d = dict()
        for x in features:
            d[x] = 0
        featureCounts.append(d)

    # res is a list, with each element a list (length len(args.bamfiles)) of dicts
    totalCounts = [0] * len(args.bamfiles)
    for x in res:
        for i, y in enumerate(x[2]):
            totalCounts[i] += y
        for i, y in enumerate(x[0]):
            for k, v in y.items():
                featureCounts[i][k] += v

    # Make a plot
    if args.plotFile:
        plotEnrichment(args, featureCounts, totalCounts, features)

    # Raw counts
    if args.outRawCounts:
        of = open(args.outRawCounts, "w")
        of.write("file\tfeatureType\tpercent\tfeatureReadCount\ttotalReadCount\n")
        for i, x in enumerate(args.labels):
            for k, v in featureCounts[i].items():
                of.write("{0}\t{1}\t{2:5.2f}\t{3}\t{4}\n".format(x, k, (100.0 * v) / totalCounts[i], v, totalCounts[i]))
        of.close()
Ejemplo n.º 24
0
def get_scale_factors(args):

    bam1 = bamHandler.openBam(args.bamfile1)
    bam2 = bamHandler.openBam(args.bamfile2)

    bam1_mapped = parserCommon.bam_total_reads(bam1, args.ignoreForNormalization)
    bam2_mapped = parserCommon.bam_total_reads(bam2, args.ignoreForNormalization)

    if args.scaleFactors:
        scale_factors = list(map(float, args.scaleFactors.split(":")))
    else:
        if args.scaleFactorsMethod == 'SES':
            scalefactors_dict = estimateScaleFactor(
                [bam1.filename, bam2.filename],
                args.sampleLength, args.numberOfSamples,
                1,
                numberOfProcessors=args.numberOfProcessors,
                verbose=args.verbose,
                chrsToSkip=args.ignoreForNormalization)

            scale_factors = scalefactors_dict['size_factors']

            if args.verbose:
                print "Size factors using SES: {}".format(scale_factors)
                print "%s regions of size %s where used " % \
                    (scalefactors_dict['sites_sampled'],
                     args.sampleLength)

                print "size factor if the number of mapped " \
                    "reads would have been used:"
                print tuple(
                    float(min(bam1.mapped, bam2.mapped)) / np.array([bam1.mapped, bam2.mapped]))

        elif args.scaleFactorsMethod == 'readCount':
            scale_factors = float(min(bam1_mapped, bam2_mapped)) / np.array([bam1_mapped, bam2_mapped])
            if args.verbose:
                print "Size factors using total number " \
                    "of mapped reads: {}".format(scale_factors)

    # in case the subtract method is used, the final difference
    # would be normalized according to the given method
    if args.ratio == 'subtract':
        # The next lines identify which of the samples is not scaled down.
        # The normalization using RPKM or normalize to 1x would use
        # that sample as the reference. Since the other sample would be
        # scaled to match the un-scaled one, the normalization factor
        # for both samples should be based on the unscaled one.
        # For example, if sample A is unscaled and sample B is scaled by 0.5,
        # then the normalization factor for A to report RPKM read counts
        # is also applied to B.
        if scale_factors[0] == 1:
            mappedReads = bam1_mapped
            bamfile = args.bamfile1
        else:
            mappedReads = bam2_mapped
            bamfile = args.bamfile2

        if args.scaleFactors is None:
            if args.normalizeTo1x:
                # try to guess fragment length if the bam file contains paired end reads
                from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
                frag_len_dict, read_len_dict = get_read_and_fragment_length(bamfile,
                                                                            return_lengths=False,
                                                                            numberOfProcessors=args.numberOfProcessors,
                                                                            verbose=args.verbose)
                if args.extendReads:
                    if args.extendReads is True:
                        # try to guess fragment length if the bam file contains paired end reads
                        if frag_len_dict:
                            fragment_length = frag_len_dict['median']
                        else:
                            exit("*ERROR*: library is not paired-end. Please provide an extension length.")
                        if args.verbose:
                            print("Fragment length based on paired en data "
                                  "estimated to be {}".format(frag_len_dict['median']))

                    elif args.extendReads < 1:
                        exit("*ERROR*: read extension must be bigger than one. Value give: {} ".format(args.extendReads))
                    elif args.extendReads > 2000:
                        exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(args.extendReads))
                    else:
                        fragment_length = args.extendReads

                else:
                    # set as fragment length the read length
                    fragment_length = int(read_len_dict['median'])
                    if args.verbose:
                        print "Estimated read length is {}".format(int(read_len_dict['median']))

                current_coverage = float(mappedReads * fragment_length) / args.normalizeTo1x
                # the coverage scale factor is 1 / coverage
                coverage_scale_factor = 1.0 / current_coverage
                scale_factors = np.array(scale_factors) * coverage_scale_factor
                if args.verbose:
                    print "Estimated current coverage {}".format(current_coverage)
                    print "Scale factor to convert " \
                          "current coverage to 1: {}".format(coverage_scale_factor)
            else:
                # by default normalize using RPKM
                # the RPKM is:
                # Num reads per tile/(total reads (in millions)*tile length in Kb)
                millionReadsMapped = float(mappedReads) / 1e6
                tileLengthInKb = float(args.binSize) / 1000
                coverage_scale_factor = 1.0 / (millionReadsMapped * tileLengthInKb)
                scale_factors = np.array(scale_factors) * coverage_scale_factor

                if args.verbose:
                    print "scale factor for   "
                    "RPKM is {0}".format(coverage_scale_factor)

    return scale_factors
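
A quick numeric check of the readCount branch above (made-up library sizes): the smaller library keeps a factor of 1 and the larger one is scaled down.

import numpy as np

bam1_mapped, bam2_mapped = 30_000_000, 20_000_000
scale_factors = float(min(bam1_mapped, bam2_mapped)) / np.array([bam1_mapped, bam2_mapped])
print(scale_factors)  # [0.66666667 1.        ]
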
Ejemplo n.º 25
0
def getFragSize(bam, args, idx, outRawFrags):
    fragment_len_dict, read_len_dict = get_read_and_fragment_length(bam, return_lengths=True,
                                                                    blackListFileName=args.blackListFileName,
                                                                    numberOfProcessors=args.numberOfProcessors,
                                                                    verbose=args.verbose,
                                                                    binSize=args.binSize,
                                                                    distanceBetweenBins=args.distanceBetweenBins)

    if outRawFrags:
        label = bam
        if args.samplesLabel and idx < len(args.samplesLabel):
            label = args.samplesLabel[idx]
        if fragment_len_dict:
            fragment_len_dict['lengths'] = [int(x) for x in fragment_len_dict['lengths']]
            cnts = np.bincount(fragment_len_dict['lengths'], minlength=int(fragment_len_dict['max']) + 1)
        else:
            read_len_dict['lengths'] = [int(x) for x in read_len_dict['lengths']]
            cnts = np.bincount(read_len_dict['lengths'], minlength=int(read_len_dict['max']) + 1)
        for length, v in enumerate(cnts):
            if v > 0:
                outRawFrags.write("{}\t{}\t{}\n".format(length, v, label))

    if args.samplesLabel and idx < len(args.samplesLabel):
        print("\n\nSample label: {}".format(args.samplesLabel[idx]))
    else:
        print("\n\nBAM file : {}".format(bam))

    if fragment_len_dict:
        if fragment_len_dict['mean'] == 0:
            print("No pairs were found. Is the data from a paired-end sequencing experiment?")

        print("Sample size: {}\n".format(fragment_len_dict['sample_size']))

        print("Fragment lengths:")
        print("Min.: {}\n1st Qu.: {}\nMean: {}\nMedian: {}\n"
              "3rd Qu.: {}\nMax.: {}\nStd: {}".format(fragment_len_dict['min'],
                                                      fragment_len_dict['qtile25'],
                                                      fragment_len_dict['mean'],
                                                      fragment_len_dict['median'],
                                                      fragment_len_dict['qtile75'],
                                                      fragment_len_dict['max'],
                                                      fragment_len_dict['std']))
        print("MAD: {}\nLen. 10%: {}\nLen. 20%: {}\nLen. 30%: {}\nLen. 40%: {}\nLen. 60%: {}\nLen. 70%: {}\nLen. 80%: {}\nLen. 90%: {}\nLen. 99%: {}\n".format(fragment_len_dict['mad'],
                                                                                                                                                               fragment_len_dict['qtile10'],
                                                                                                                                                               fragment_len_dict['qtile20'],
                                                                                                                                                               fragment_len_dict['qtile30'],
                                                                                                                                                               fragment_len_dict['qtile40'],
                                                                                                                                                               fragment_len_dict['qtile60'],
                                                                                                                                                               fragment_len_dict['qtile70'],
                                                                                                                                                               fragment_len_dict['qtile80'],
                                                                                                                                                               fragment_len_dict['qtile90'],
                                                                                                                                                               fragment_len_dict['qtile99']))
    else:
        print("No pairs were found. Is the data from a paired-end sequencing experiment?")

    print("\nRead lengths:")
    print("Sample size: {}\n".format(read_len_dict['sample_size']))
    print("Min.: {}\n1st Qu.: {}\nMean: {}\nMedian: {}\n"
          "3rd Qu.: {}\nMax.: {}\nStd: {}".format(read_len_dict['min'],
                                                  read_len_dict['qtile25'],
                                                  read_len_dict['mean'],
                                                  read_len_dict['median'],
                                                  read_len_dict['qtile75'],
                                                  read_len_dict['max'],
                                                  read_len_dict['std']))
    print("MAD: {}\nLen. 10%: {}\nLen. 20%: {}\nLen. 30%: {}\nLen. 40%: {}\nLen. 60%: {}\nLen. 70%: {}\nLen. 80%: {}\nLen. 90%: {}\nLen. 99%: {}\n".format(read_len_dict['mad'],
                                                                                                                                                           read_len_dict['qtile10'],
                                                                                                                                                           read_len_dict['qtile20'],
                                                                                                                                                           read_len_dict['qtile30'],
                                                                                                                                                           read_len_dict['qtile40'],
                                                                                                                                                           read_len_dict['qtile60'],
                                                                                                                                                           read_len_dict['qtile70'],
                                                                                                                                                           read_len_dict['qtile80'],
                                                                                                                                                           read_len_dict['qtile90'],
                                                                                                                                                           read_len_dict['qtile99']))

    # The read and fragment lists will just eat up memory if not removed!
    if args.histogram:
        if fragment_len_dict:
            maxVal = fragment_len_dict['mean'] * 2
            minVal = fragment_len_dict['min']
        else:
            maxVal = read_len_dict['mean'] * 2
            minVal = read_len_dict['min']
        if args.maxFragmentLength > 0:
            maxVal = args.maxFragmentLength

        if fragment_len_dict:
            fragment_len_dict['lengths'] = getDensity(fragment_len_dict['lengths'], minVal, maxVal)
        if read_len_dict:
            read_len_dict['lengths'] = getDensity(read_len_dict['lengths'], minVal, maxVal)
    else:
        if fragment_len_dict:
            del fragment_len_dict['lengths']
        if read_len_dict:
            del read_len_dict['lengths']

    return (fragment_len_dict, read_len_dict)