def main(args=None):
    """Command-line entry point: count reads per feature and report enrichment.

    The function parses the command line, derives the fragment length used
    for read extension, runs ``getEnrichment_worker`` over the genome through
    ``mapReduce`` and finally produces a plot (``--plotFile``) and/or a
    tab-separated table of percentages (``--outRawCounts``).

    Parameters
    ----------
    args : list of str, optional
        Argument vector passed straight to ``parse_arguments().parse_args``.

    Notes
    -----
    ``sys.exit`` is called with an error message when neither output option
    is given, when the number of labels differs from the number of BAM files,
    when read extension is requested without a length on a library for which
    no fragment length could be estimated, or when the requested extension
    exceeds 2000.
    """
    args = parse_arguments().parse_args(args)

    if not args.outRawCounts and not args.plotFile:
        sys.exit(
            "Error: You need to specify at least one of --plotFile or --outRawCounts!\n"
        )

    if args.labels is None:
        args.labels = args.bamfiles
    if len(args.labels) != len(args.bamfiles):
        sys.exit(
            "Error: The number of labels ({0}) does not match the number of BAM files ({1})!"
            .format(len(args.labels), len(args.bamfiles)))

    # Get fragment size and chromosome dict.
    # The handles are only needed to read the chromosome names; close them
    # even when opening a later file or the comparison itself fails.
    fhs = []
    try:
        for bam_path in args.bamfiles:
            fhs.append(openBam(bam_path))
        chromSize, _ = getCommonChrNames(fhs, verbose=args.verbose)
    finally:
        for fh in fhs:
            fh.close()

    # Length statistics are estimated from the first BAM file only.
    frag_len_dict, read_len_dict = get_read_and_fragment_length(
        args.bamfiles[0],
        return_lengths=False,
        blackListFileName=args.blackListFileName,
        numberOfProcessors=args.numberOfProcessors,
        verbose=args.verbose)
    if args.extendReads:
        if args.extendReads is True:
            # try to guess fragment length if the bam file contains paired end reads
            if frag_len_dict:
                defaultFragmentLength = frag_len_dict['median']
            else:
                sys.exit(
                    "*ERROR*: library is not paired-end. Please provide an extension length."
                )
            if args.verbose:
                print("Fragment length based on paired en data "
                      "estimated to be {0}".format(frag_len_dict['median']))
        elif args.extendReads < read_len_dict['median']:
            sys.stderr.write(
                "*WARNING*: read extension is smaller than read length (read length = {}). "
                "Reads will not be extended.\n".format(
                    int(read_len_dict['median'])))
            defaultFragmentLength = 'read length'
        elif args.extendReads > 2000:
            sys.exit(
                "*ERROR*: read extension must be smaller that 2000. Value give: {} "
                .format(args.extendReads))
        else:
            defaultFragmentLength = args.extendReads
    else:
        defaultFragmentLength = 'read length'

    # Get the chunkLength
    chunkLength = getChunkLength(args, chromSize)

    # Map reduce to get the counts/file/feature
    res = mapReduce([args, defaultFragmentLength],
                    getEnrichment_worker,
                    chromSize,
                    genomeChunkLength=chunkLength,
                    region=args.region,
                    blackListFileName=args.blackListFileName,
                    numberOfProcessors=args.numberOfProcessors,
                    verbose=args.verbose)

    # Every result element is indexed as: [0] one dict of feature counts per
    # BAM file, [1] the features, [2] one total count per BAM file.
    # The feature list is taken from the first chunk.
    features = res[0][1]
    featureCounts = [dict.fromkeys(features, 0) for _ in args.bamfiles]

    # Sum the per-chunk results into per-file totals and per-file feature counts
    totalCounts = [0] * len(args.bamfiles)
    for chunk in res:
        for i, total in enumerate(chunk[2]):
            totalCounts[i] += total
        for i, counts in enumerate(chunk[0]):
            for feature, count in counts.items():
                featureCounts[i][feature] += count

    # Make a plot
    if args.plotFile:
        plotEnrichment(args, featureCounts, totalCounts, features)

    # Raw counts
    if args.outRawCounts:
        # NOTE(review): a file whose total count is 0 raises
        # ZeroDivisionError below; the context manager at least guarantees
        # that the output file is closed in that case.
        with open(args.outRawCounts, "w") as of:
            of.write("file\tfeatureType\tpercent\n")
            for label, counts, total in zip(args.labels, featureCounts,
                                            totalCounts):
                for feature, count in counts.items():
                    of.write("{0}\t{1}\t{2:5.2f}\n".format(
                        label, feature, (100.0 * count) / total))
    def run(self,
            func_to_call,
            func_args,
            out_file_name,
            blackListFileName=None,
            format="bedgraph",
            smoothLength=0):
        r"""
        Given a list of bamfiles, a function and a function arguments,
        this method writes a bedgraph file (or bigwig) file
        for a partition of the genome into tiles of given size
        and a value for each tile that corresponds to the given function
        and that is related to the coverage underlying the tile.

        Parameters
        ----------
        func_to_call : str
            function name to be called to convert the list of coverages computed
            for each bam file at each position into a single value. An example
            is a function that takes the ratio between the coverage of two
            bam files.
        func_args : dict
            dict of arguments to pass to `func`. E.g. {'scaleFactor':1.0}

        out_file_name : str
            name of the file to save the resulting data.

        blackListFileName : str, optional
            forwarded unchanged to ``mapReduce.mapReduce``.

        format : str
            ``'bedgraph'`` keeps the concatenated bedgraph file as the
            output; any other value converts it with ``bedGraphToBigWig``.

        smoothLength : int
            Distance in bp for smoothing the coverage per tile.

        Notes
        -----
        ``self.smoothLength`` is set, and when ``self.region`` is truthy the
        bin length is appended to it in place. All instance attributes are
        echoed to stderr. The intermediate file ``out_file_name + ".bg"`` is
        created next to the output and renamed or removed at the end.
        """
        self.__dict__["smoothLength"] = smoothLength
        bam_handlers = [bamHandler.openBam(x) for x in self.bamFilesList]
        genome_chunk_length = getGenomeChunkLength(bam_handlers,
                                                   self.binLength)
        # check if both bam files correspond to the same species
        # by comparing the chromosome names:
        chrom_names_and_size, non_common = getCommonChrNames(bam_handlers,
                                                             verbose=False)

        if self.region:
            # in case a region is used, append the tilesize
            self.region += ":{}".format(self.binLength)

        # Echo the current configuration (every instance attribute) to stderr
        for x in list(self.__dict__.keys()):
            sys.stderr.write("{}: {}\n".format(x, self.__getattribute__(x)))

        res = mapReduce.mapReduce([func_to_call, func_args],
                                  writeBedGraph_wrapper,
                                  chrom_names_and_size,
                                  self_=self,
                                  genomeChunkLength=genome_chunk_length,
                                  region=self.region,
                                  blackListFileName=blackListFileName,
                                  numberOfProcessors=self.numberOfProcessors)

        # concatenate intermediary bedgraph files; the context managers make
        # sure both handles are closed even if copying fails half-way
        bedgraph_file = out_file_name + ".bg"
        with open(bedgraph_file, 'wb') as out_file:
            for tempfilename in res:
                if tempfilename:
                    # concatenate all intermediate tempfiles into one
                    # bedgraph file
                    with open(tempfilename, 'rb') as chunk:
                        shutil.copyfileobj(chunk, out_file)
                    os.remove(tempfilename)

        if format == 'bedgraph':
            os.rename(bedgraph_file, out_file_name)
            if self.verbose:
                print("output file: {}".format(out_file_name))
        else:
            bedGraphToBigWig(chrom_names_and_size, bedgraph_file,
                             out_file_name, True)
            if self.verbose:
                print("output file: {}".format(out_file_name))
            os.remove(bedgraph_file)
Exemple #3
0
def writeBedGraph(bamFilesList,
                  outputFileName,
                  fragmentLength,
                  func,
                  funcArgs,
                  tileSize=25,
                  region=None,
                  numberOfProcessors=None,
                  format="bedgraph",
                  extendPairedEnds=True,
                  zerosToNans=True,
                  smoothLength=0,
                  minMappingQuality=None,
                  ignoreDuplicates=False,
                  fragmentFromRead_func=None,
                  centerRead=False):
    r"""
    Given a list of bamfiles, a function and a function arguments,
    this method writes a bedgraph file (or bigwig) file
    for a partition of the genome into tiles of given size
    and a value for each tile that corresponds to the given function
    and that is related to the coverage underlying the tile.

    ``tileSize``, ``fragmentLength``, ``bamFilesList``, ``func``,
    ``funcArgs``, ``extendPairedEnds``, ``smoothLength``, ``zerosToNans``,
    ``minMappingQuality``, ``ignoreDuplicates``, ``fragmentFromRead_func``
    and ``centerRead`` are packed, in that order, into the argument tuple
    handed to ``writeBedGraph_wrapper`` through ``mapReduce.mapReduce``.
    When ``region`` is given the tile size is appended to it before the call.

    The temporary files returned by the workers are concatenated into
    ``outputFileName + ".bg"``. With ``format == 'bedgraph'`` that file is
    renamed to ``outputFileName``; otherwise it is converted with
    ``bedGraphToBigWig`` and then removed.

    >>> test = Tester()
    >>> import tempfile
    >>> outFile = tempfile.NamedTemporaryFile()
    >>> funcArgs = {'scaleFactor': 1.0}
    >>> writeBedGraph( [test.bamFile1], outFile.name,
    ... 0, scaleCoverage, funcArgs, region='3R:0:200')
    >>> open(outFile.name, 'r').readlines()
    ['3R\t100\t200\t1.0\n']
    >>> outFile.close()

    """
    bamHandlers = [openBam(x) for x in bamFilesList]
    genomeChunkLength = getGenomeChunkLength(bamHandlers, tileSize)
    # check if both bam files correspond to the same species
    # by comparing the chromosome names:
    chromNamesAndSize = getCommonChrNames(bamHandlers, verbose=False)

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce(
        (tileSize, fragmentLength, bamFilesList, func, funcArgs,
         extendPairedEnds, smoothLength, zerosToNans, minMappingQuality,
         ignoreDuplicates, fragmentFromRead_func, centerRead),
        writeBedGraph_wrapper,
        chromNamesAndSize,
        genomeChunkLength=genomeChunkLength,
        region=region,
        numberOfProcessors=numberOfProcessors)

    # concatenate intermediary bedgraph files; context managers close both
    # the output and every temp file, also when copying raises
    bedGraphFile = outputFileName + ".bg"
    with open(bedGraphFile, 'wb') as outFile:
        for tempFileName in res:
            if tempFileName:
                # concatenate all intermediate tempfiles into one
                # bedgraph file
                with open(tempFileName, 'rb') as chunk:
                    shutil.copyfileobj(chunk, outFile)
                os.remove(tempFileName)

    # `debug` is not defined in this function; presumably a module-level
    # flag -- verify it exists in the enclosing module.
    if format == 'bedgraph':
        os.rename(bedGraphFile, outputFileName)
        if debug:
            # print() with a single pre-formatted string behaves the same on
            # Python 2 and Python 3
            print("output file: %s" % (outputFileName))
    else:
        bedGraphToBigWig(chromNamesAndSize, bedGraphFile, outputFileName,
                         False)
        if debug:
            print("output file: %s" % (outputFileName))
        os.remove(bedGraphFile)
def writeBedGraph(bamOrBwFileList,
                  outputFileName,
                  fragmentLength,
                  func,
                  funcArgs,
                  tileSize=25,
                  region=None,
                  blackListFileName=None,
                  numberOfProcessors=1,
                  format="bedgraph",
                  extendPairedEnds=True,
                  missingDataAsZero=False,
                  smoothLength=0,
                  fixed_step=False,
                  verbose=False):
    r"""
    Given a list of bamfiles, a function and a function arguments,
    this method writes a bedgraph file (or bigwig) file
    for a partition of the genome into tiles of given size
    and a value for each tile that corresponds to the given function
    and that is related to the coverage underlying the tile.

    Parameters
    ----------
    bamOrBwFileList : list of (file name, file format) pairs
        Only the formats ``'bam'`` and ``'bigwig'`` are looked at here.
    outputFileName : str
        Path of the file to write.
    fragmentLength, func, funcArgs, extendPairedEnds, smoothLength,
    missingDataAsZero, fixed_step
        Packed, together with ``tileSize`` and ``bamOrBwFileList``, into the
        argument tuple handed to ``writeBedGraph_wrapper`` through
        ``mapReduce.mapReduce``.
    tileSize : int
        Used to compute the genome chunk length and appended to ``region``.
    region : str, optional
        When given, ``":<tileSize>"`` is appended before the map-reduce call.
    blackListFileName, numberOfProcessors, verbose
        Forwarded to ``mapReduce.mapReduce``; ``numberOfProcessors`` is also
        passed as ``nThreads`` when opening BAM files and ``verbose`` to
        ``getCommonChrNames``.
    format : str
        ``'bedgraph'`` concatenates the temp files into ``outputFileName``;
        any other value passes the temp-file list to ``bedGraphToBigWig``.

    Notes
    -----
    If at least one BAM file is present, chromosome names and sizes come from
    the BAM files only. Otherwise they are collected from the bigwig files,
    keeping a chromosome only when it was seen in more than one bigwig file
    and using the smallest reported size.
    """
    bamHandles = []
    mappedList = []
    for indexedFile, fileFormat in bamOrBwFileList:
        if fileFormat == 'bam':
            bam, mapped, _unmapped, _stats = bamHandler.openBam(
                indexedFile, returnStats=True, nThreads=numberOfProcessors)
            bamHandles.append(bam)
            mappedList.append(mapped)

    if bamHandles:
        genomeChunkLength = getGenomeChunkLength(bamHandles, tileSize,
                                                 mappedList)
        # check if both bam files correspond to the same species
        # by comparing the chromosome names:
        chromNamesAndSize, __ = getCommonChrNames(bamHandles, verbose=verbose)
    else:
        genomeChunkLength = int(10e6)
        # names seen in more than one file; a set keeps the membership test
        # below constant-time
        cCommon = set()
        chromNamesAndSize = {}
        for fileName, fileFormat in bamOrBwFileList:
            if fileFormat != 'bigwig':
                continue

            fh = pyBigWig.open(fileName)
            try:
                for chromName, size in fh.chroms().items():
                    if chromName in chromNamesAndSize:
                        cCommon.add(chromName)
                        if chromNamesAndSize[chromName] != size:
                            print("\nWARNING\n"
                                  "Chromosome {} length reported in the "
                                  "input files differ.\n{} for {}\n"
                                  "{} for {}.\n\nThe smallest "
                                  "length will be used".format(
                                      chromName, chromNamesAndSize[chromName],
                                      bamOrBwFileList[0][0], size, fileName))
                            chromNamesAndSize[chromName] = min(
                                chromNamesAndSize[chromName], size)
                    else:
                        chromNamesAndSize[chromName] = size
            finally:
                # release the bigwig handle even if reading it fails
                fh.close()

        # get the list of common chromosome names and sizes
        chromNamesAndSize = [(k, v) for k, v in chromNamesAndSize.items()
                             if k in cCommon]

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce(
        (tileSize, fragmentLength, bamOrBwFileList, func, funcArgs,
         extendPairedEnds, smoothLength, missingDataAsZero, fixed_step),
        writeBedGraph_wrapper,
        chromNamesAndSize,
        genomeChunkLength=genomeChunkLength,
        region=region,
        blackListFileName=blackListFileName,
        numberOfProcessors=numberOfProcessors,
        verbose=verbose)

    # Determine the sorted order of the temp files: replace the chromosome
    # name (element 0 of each result) by its index in chromNamesAndSize so
    # that a plain sort follows that order, then elements 1 and 2.
    chrom_order = {
        chrom[0]: idx for idx, chrom in enumerate(chromNamesAndSize)
    }
    res = [[chrom_order[x[0]], x[1], x[2], x[3]] for x in res]
    res.sort()

    if format == 'bedgraph':
        with open(outputFileName, 'wb') as of:
            for r in res:
                # r[3] is the temp-file path; entries without one are skipped
                if r[3]:
                    with open(r[3], 'rb') as chunk:
                        shutil.copyfileobj(chunk, of)
                    os.remove(r[3])
    else:
        bedGraphToBigWig(chromNamesAndSize, [x[3] for x in res],
                         outputFileName)
def writeBedGraph(bamOrBwFileList,
                  outputFileName,
                  fragmentLength,
                  func,
                  funcArgs,
                  tileSize=25,
                  region=None,
                  blackListFileName=None,
                  numberOfProcessors=None,
                  format="bedgraph",
                  extendPairedEnds=True,
                  missingDataAsZero=False,
                  smoothLength=0,
                  fixed_step=False):
    r"""
    Given a list of bamfiles, a function and a function arguments,
    this method writes a bedgraph file (or bigwig) file
    for a partition of the genome into tiles of given size
    and a value for each tile that corresponds to the given function
    and that is related to the coverage underlying the tile.

    Parameters
    ----------
    bamOrBwFileList : list of (file name, file format) pairs
        Only the formats ``'bam'`` and ``'bigwig'`` are looked at here.
    outputFileName : str
        Path of the file to write; ``outputFileName + ".bg"`` is used as an
        intermediate file.
    fragmentLength, func, funcArgs, extendPairedEnds, smoothLength,
    missingDataAsZero, fixed_step
        Packed, together with ``tileSize`` and ``bamOrBwFileList``, into the
        argument tuple handed to ``writeBedGraph_wrapper`` through
        ``mapReduce.mapReduce``.
    tileSize : int
        Used to compute the genome chunk length and appended to ``region``.
    region : str, optional
        When given, ``":<tileSize>"`` is appended before the map-reduce call.
    blackListFileName, numberOfProcessors
        Forwarded to ``mapReduce.mapReduce``.
    format : str
        ``'bedgraph'`` renames the intermediate file to ``outputFileName``;
        any other value converts it with ``bedGraphToBigWig`` and removes it.

    Notes
    -----
    If at least one BAM file is present, chromosome names and sizes come from
    the BAM files only. Otherwise they are collected from the bigwig files,
    keeping a chromosome only when it was seen in more than one bigwig file
    and using the smallest reported size.
    """

    bamHandlers = [
        bamHandler.openBam(indexedFile)
        for indexedFile, fileFormat in bamOrBwFileList if fileFormat == 'bam'
    ]
    if bamHandlers:
        genomeChunkLength = getGenomeChunkLength(bamHandlers, tileSize)
        # check if both bam files correspond to the same species
        # by comparing the chromosome names:
        chromNamesAndSize, __ = getCommonChrNames(bamHandlers, verbose=False)
    else:
        genomeChunkLength = int(10e6)
        bigwigs = [
            fileName for fileName, fileFormat in bamOrBwFileList
            if fileFormat == 'bigwig'
        ]
        # names seen in more than one file; a set keeps the membership test
        # below constant-time
        cCommon = set()
        chromNamesAndSize = {}
        for bw in bigwigs:
            bwh = pyBigWig.open(bw)
            try:
                for chromName, size in bwh.chroms().items():
                    if chromName in chromNamesAndSize:
                        cCommon.add(chromName)
                        if chromNamesAndSize[chromName] != size:
                            print("\nWARNING\n"
                                  "Chromosome {} length reported in the "
                                  "bigwig files differ.\n{} for {}\n"
                                  "{} for {}.\n\nThe smallest "
                                  "length will be used".format(
                                      chromName, chromNamesAndSize[chromName],
                                      bigwigs[0], size, bw))
                            chromNamesAndSize[chromName] = min(
                                chromNamesAndSize[chromName], size)
                    else:
                        chromNamesAndSize[chromName] = size
            finally:
                # release the bigwig handle even if reading it fails
                bwh.close()

        # get the list of common chromosome names and sizes
        chromNamesAndSize = [(k, v) for k, v in chromNamesAndSize.items()
                             if k in cCommon]

    if region:
        # in case a region is used, append the tilesize
        region += ":{}".format(tileSize)

    res = mapReduce.mapReduce(
        (tileSize, fragmentLength, bamOrBwFileList, func, funcArgs,
         extendPairedEnds, smoothLength, missingDataAsZero, fixed_step),
        writeBedGraph_wrapper,
        chromNamesAndSize,
        genomeChunkLength=genomeChunkLength,
        region=region,
        blackListFileName=blackListFileName,
        numberOfProcessors=numberOfProcessors)

    # concatenate intermediary bedgraph files; context managers close both
    # the output and every temp file, also when copying raises
    bedGraphFile = outputFileName + ".bg"
    with open(bedGraphFile, 'wb') as outFile:
        for tempFileName in res:
            if tempFileName:
                # concatenate all intermediate tempfiles into one
                # bedgraph file
                with open(tempFileName, 'rb') as chunk:
                    shutil.copyfileobj(chunk, outFile)
                os.remove(tempFileName)

    # `debug` is not defined in this function; presumably a module-level
    # flag -- verify it exists in the enclosing module.
    if format == 'bedgraph':
        os.rename(bedGraphFile, outputFileName)
        if debug:
            print("output file: %s" % (outputFileName))
    else:
        bedGraphToBigWig(chromNamesAndSize, bedGraphFile, outputFileName, True)
        if debug:
            print("output file: %s" % (outputFileName))
        os.remove(bedGraphFile)
    def run(self, func_to_call, func_args, out_file_name, blackListFileName=None, format="bedgraph", smoothLength=0):
        r"""
        Given a list of bamfiles, a function and a function arguments,
        this method writes a bedgraph file (or bigwig) file
        for a partition of the genome into tiles of given size
        and a value for each tile that corresponds to the given function
        and that is related to the coverage underlying the tile.

        Parameters
        ----------
        func_to_call : str
            function name to be called to convert the list of coverages computed
            for each bam file at each position into a single value. An example
            is a function that takes the ratio between the coverage of two
            bam files.
        func_args : dict
            dict of arguments to pass to `func`. E.g. {'scaleFactor':1.0}

        out_file_name : str
            name of the file to save the resulting data.

        blackListFileName : str, optional
            forwarded unchanged to ``mapReduce.mapReduce``.

        format : str
            ``'bedgraph'`` concatenates the sorted temp files into
            ``out_file_name``; any other value passes the list of temp files
            to ``bedGraphToBigWig``.

        smoothLength : int
            Distance in bp for smoothing the coverage per tile.

        Notes
        -----
        ``self.smoothLength`` is set. When ``self.mappedList`` has fewer
        entries than ``self.bamFilesList``, the BAM files are opened with
        ``returnStats=True`` and ``self.mappedList`` / ``self.statsList`` are
        extended. When ``self.region`` is truthy the bin length is appended to
        it in place. All instance attributes except ``mappedList`` and
        ``statsList`` are echoed to stderr.
        """
        self.__dict__["smoothLength"] = smoothLength
        # Statistics are only requested while mappedList is shorter than the
        # list of BAM files
        getStats = len(self.mappedList) < len(self.bamFilesList)
        bam_handles = []
        for x in self.bamFilesList:
            if getStats:
                bam, mapped, _unmapped, stats = bamHandler.openBam(x, returnStats=True, nThreads=self.numberOfProcessors)
                self.mappedList.append(mapped)
                self.statsList.append(stats)
            else:
                bam = bamHandler.openBam(x)
            bam_handles.append(bam)

        genome_chunk_length = getGenomeChunkLength(bam_handles, self.binLength, self.mappedList)
        # check if both bam files correspond to the same species
        # by comparing the chromosome names:
        chrom_names_and_size, non_common = getCommonChrNames(bam_handles, verbose=False)

        if self.region:
            # in case a region is used, append the tilesize
            self.region += ":{}".format(self.binLength)

        # Echo the current configuration to stderr, leaving out the two
        # statistics lists
        for x in list(self.__dict__.keys()):
            if x in ["mappedList", "statsList"]:
                continue
            sys.stderr.write("{}: {}\n".format(x, self.__getattribute__(x)))

        res = mapReduce.mapReduce([func_to_call, func_args],
                                  writeBedGraph_wrapper,
                                  chrom_names_and_size,
                                  self_=self,
                                  genomeChunkLength=genome_chunk_length,
                                  region=self.region,
                                  blackListFileName=blackListFileName,
                                  numberOfProcessors=self.numberOfProcessors)

        # Determine the sorted order of the temp files: replace the chromosome
        # name (element 0 of each result) by its index in chrom_names_and_size
        # so that a plain sort follows that order, then elements 1 and 2.
        chrom_order = {
            chrom[0]: idx for idx, chrom in enumerate(chrom_names_and_size)
        }
        res = [[chrom_order[x[0]], x[1], x[2], x[3]] for x in res]
        res.sort()

        if format == 'bedgraph':
            # context managers close the output and every temp file even if
            # copying fails half-way
            with open(out_file_name, 'wb') as out_file:
                for r in res:
                    # r[3] is the temp-file path; entries without one are skipped
                    if r[3]:
                        with open(r[3], 'rb') as chunk:
                            shutil.copyfileobj(chunk, out_file)
                        os.remove(r[3])
        else:
            bedGraphToBigWig(chrom_names_and_size, [x[3] for x in res], out_file_name)