Exemple #1
0
def run(options):
    options = opt_validate(options)
    # weights = options.weights

    info("Read and build bedGraph for each replicate...")
    reps = []
    i = 1
    for ifile in options.ifile:
        info("Read file #%d" % i)
        reps.append(cBedGraphIO.bedGraphIO(ifile).build_bdgtrack())
        i += 1

    # first two reps

    info("combining #1 and #2 with method '%s'" % options.method)
    cmbtrack = reps[0].overlie(reps[1], func=options.method)
    ofile = os.path.join(options.outdir, options.ofile)
    info("Write bedGraph of combined scores...")
    ofhd = open(ofile, "wb")
    cmbtrack.write_bedGraph(
        ofhd,
        name="%s_combined_scores" % (options.method.upper()),
        description="Scores calculated by %s" % (options.method.upper()),
    )
    info("Finished '%s'! Please check '%s'!" % (options.method, ofile))
Exemple #2
0
def run( options ):
    options = opt_validate( options )
    info("Read and build bedGraph...")
    bio = BedGraphIO.bedGraphIO(options.ifile)
    btrack = bio.build_bdgtrack(baseline_value=0)

    info("Modify bedGraph...")
    if options.method.lower() == "p2q":
        btrack.p2q()
    elif options.method.lower() == "analen":
        btrack.analen()
    else:
        extraparam = float(options.extraparam[0])
        if options.method.lower() == "multiply":
            btrack.apply_func( lambda x: x * extraparam)
        elif options.method.lower() == "add":
            btrack.apply_func( lambda x: x + extraparam)
        elif options.method.lower() == "max":
            btrack.apply_func( lambda x: x if x> extraparam else extraparam )
        elif options.method.lower() == "min":
            btrack.apply_func( lambda x: x if x< extraparam else extraparam )
        
    ofile = os.path.join( options.outdir, options.ofile )
    info("Write bedGraph of modified scores...")
    ofhd = open(ofile,"wb")
    btrack.write_bedGraph(ofhd,name="%s_modified_scores" % (options.method.upper()),description="Scores calculated by %s" % (options.method.upper()))
    info("Finished '%s'! Please check '%s'!" % (options.method, ofile))
Exemple #3
0
def run( o_options ):
    """The Main function/pipeline for duplication filter.
    
    """
    # Parse options...
    options = opt_validate( o_options )
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error


    outputfile = open(options.oprefix+"_refinepeak.bed", "w")

    peakio = file(options.bedfile)
    peaks = PeakIO()
    for l in peakio:
        fs = l.rstrip().split()
        peaks.add( fs[0], int(fs[1]), int(fs[2]), name=fs[3] )

    peaks.sort()
    
    #1 Read tag files
    info("read tag files...")
    fwtrack = load_tag_files_options (options)
    
    retval = fwtrack.compute_region_tags_from_peaks( peaks, find_summit, window_size = options.windowsize, cutoff = options.cutoff )
    outputfile.write( "\n".join( map(lambda x: "%s\t%d\t%d\t%s\t%.2f" % x , retval) ) )
    info("Done!")
    info("Check output file: %s" % options.oprefix+"_refinepeak.bed")
Exemple #4
0
def run(options):
    options = opt_validate(options)
    #weights = options.weights

    info("Read and build bedGraph for each replicate...")
    reps = []
    i = 1
    for ifile in options.ifile:
        info("Read file #%d" % i)
        reps.append(BedGraphIO.bedGraphIO(ifile).build_bdgtrack())
        i += 1

    # first two reps

    info("combining tracks 1-%i with method '%s'" % (i - 1, options.method))
    cmbtrack = reps[0].overlie([reps[j] for j in range(1, i - 1)],
                               func=options.method)
    ofile = os.path.join(options.outdir, options.ofile)
    info("Write bedGraph of combined scores...")
    ofhd = open(ofile, "w")
    cmbtrack.write_bedGraph(
        ofhd,
        name="%s_combined_scores" % (options.method.upper()),
        description="Scores calculated by %s" % (options.method.upper()))
    info("Finished '%s'! Please check '%s'!" % (options.method, ofile))
Exemple #5
0
def run( options ):
    options = opt_validate( options )
    info("Read and build bedGraph...")
    bio = BedGraphIO.bedGraphIO(options.ifile)
    btrack = bio.build_bdgtrack(baseline_value=0)

    info("Modify bedGraph...")
    if options.method.lower() == "p2q":
        btrack.p2q()

    else:
        extraparam = float(options.extraparam[0])
        if options.method.lower() == "multiply":
            btrack.apply_func( lambda x: x * extraparam)
        elif options.method.lower() == "add":
            btrack.apply_func( lambda x: x + extraparam)
        elif options.method.lower() == "max":
            btrack.apply_func( lambda x: x if x> extraparam else extraparam )
        elif options.method.lower() == "min":
            btrack.apply_func( lambda x: x if x< extraparam else extraparam )
        
    ofile = os.path.join( options.outdir, options.ofile )
    info("Write bedGraph of modified scores...")
    ofhd = open(ofile,"w")
    btrack.write_bedGraph(ofhd,name="%s_modified_scores" % (options.method.upper()),description="Scores calculated by %s" % (options.method.upper()))
    info("Finished '%s'! Please check '%s'!" % (options.method, ofile))
Exemple #6
0
def run(o_options):
    """The Main function/pipeline for duplication filter.
    
    """
    # Parse options...
    options = opt_validate(o_options)
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error

    if options.outputfile != "stdout":
        outfhd = open(os.path.join(options.outdir, options.outputfile), "w")
    else:
        outfhd = sys.stdout

    #1 Read tag files
    info("read tag files...")
    fwtrack = load_tag_files_options(options)

    info("tag size = %d" % options.tsize)
    fwtrack.fw = options.tsize

    t0 = fwtrack.total
    info(" total tags in alignment file: %d" % (t0))
    if options.keepduplicates != "all":
        if options.keepduplicates == "auto":
            info(
                "calculate max duplicate tags in single position based on binomal distribution..."
            )
            max_dup_tags = cal_max_dup_tags(options.gsize, t0)
            info(" max_dup_tags based on binomal = %d" % (max_dup_tags))
            info(
                "filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)"
                % (max_dup_tags))
        else:
            info("user defined the maximum tags...")
            max_dup_tags = int(options.keepduplicates)
            info(
                "filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)"
                % (max_dup_tags))

        if not options.dryrun:
            fwtrack = fwtrack.filter_dup(max_dup_tags)
            t1 = fwtrack.total
        else:
            t1 = fwtrack.filter_dup_dryrun(max_dup_tags)

        info(" tags after filtering in alignment file: %d" % (t1))
        info(" Redundant rate of alignment file: %.2f" % (float(t0 - t1) / t0))

    if not options.dryrun:
        info("Write to BED file")
        fwtrack.print_to_bed(fhd=outfhd)
        info("finished! Check %s." % options.outputfile)
    else:
        info("Dry-run is finished!")
Exemple #7
0
def run( options ):
    options = opt_validate( options )
    scaling_factor = options.sfactor
    pseudo_depth = 1.0/scaling_factor   # not an actual depth, but its reciprocal, a trick to override SPMR while necessary.

    info("Read and build treatment bedGraph...")
    tbio = BedGraphIO.bedGraphIO(options.tfile)
    tbtrack = tbio.build_bdgtrack()

    info("Read and build control bedGraph...")
    cbio = BedGraphIO.bedGraphIO(options.cfile)
    cbtrack = cbio.build_bdgtrack()

    info("Build scoreTrackII...")
    sbtrack = tbtrack.make_scoreTrackII_for_macs( cbtrack, depth1 = pseudo_depth, depth2 = pseudo_depth )
    if abs(scaling_factor-1) > 1e-6:
        # Only for the case while your input is SPMR from MACS2 callpeak; Let's override SPMR.
        info("Values in your input bedGraph files will be multiplied by %f ..." % scaling_factor)
        sbtrack.change_normalization_method( ord('M') ) # a hack to override SPMR
    sbtrack.set_pseudocount( options.pseudocount )

    already_processed_method_list = []
    for (i, method) in enumerate(options.method):
        if method in already_processed_method_list:
            continue
        else:
            already_processed_method_list.append( method )

        info("Calculate scores comparing treatment and control by '%s'..." % method)
        if options.ofile:
            ofile = os.path.join( options.outdir, options.ofile[ i ] )
        else:
            ofile = os.path.join( options.outdir, options.oprefix + "_" + method + ".bdg" )
        # build score track
        if method == 'ppois':
            sbtrack.change_score_method( ord('p') )
        elif method == 'qpois':
            sbtrack.change_score_method( ord('q') )
        elif method == 'subtract':
            sbtrack.change_score_method( ord('d') )
        elif method == 'logFE':
            sbtrack.change_score_method( ord('f') )
        elif method == 'FE':
            sbtrack.change_score_method( ord('F') )
        elif method == 'logLR':             # log likelihood
            sbtrack.change_score_method( ord('l') )
        elif method == 'slogLR':             # log likelihood
            sbtrack.change_score_method( ord('s') )
        elif method == 'max':             
            sbtrack.change_score_method( ord('M') )            
        else:
            raise Exception("Can't reach here!")
        
        info("Write bedGraph of scores...")
        ofhd = open(ofile,"wb")
        sbtrack.write_bedGraph(ofhd,name="%s_Scores" % (method.upper()),description="Scores calculated by %s" % (method.upper()), column = 3)
        info("Finished '%s'! Please check '%s'!" % (method, ofile))
Exemple #8
0
def run( options ):
    options = opt_validate( options )
    scaling_factor = options.sfactor
    pseudo_depth = 1.0/scaling_factor   # not an actual depth, but its reciprocal, a trick to override SPMR while necessary.

    info("Read and build treatment bedGraph...")
    tbio = BedGraphIO.bedGraphIO(options.tfile)
    tbtrack = tbio.build_bdgtrack()

    info("Read and build control bedGraph...")
    cbio = BedGraphIO.bedGraphIO(options.cfile)
    cbtrack = cbio.build_bdgtrack()

    info("Build scoreTrackII...")
    sbtrack = tbtrack.make_scoreTrackII_for_macs( cbtrack, depth1 = pseudo_depth, depth2 = pseudo_depth )
    if abs(scaling_factor-1) > 1e-6:
        # Only for the case while your input is SPMR from MACS2 callpeak; Let's override SPMR.
        info("Values in your input bedGraph files will be multiplied by %f ..." % scaling_factor)
        sbtrack.change_normalization_method( ord('M') ) # a hack to override SPMR
    sbtrack.set_pseudocount( options.pseudocount )

    already_processed_method_list = []
    for (i, method) in enumerate(options.method):
        if method in already_processed_method_list:
            continue
        else:
            already_processed_method_list.append( method )

        info("Calculate scores comparing treatment and control by '%s'..." % method)
        if options.ofile:
            ofile = os.path.join( options.outdir, options.ofile[ i ] )
        else:
            ofile = os.path.join( options.outdir, options.oprefix + "_" + method + ".bdg" )
        # build score track
        if method == 'ppois':
            sbtrack.change_score_method( ord('p') )
        elif method == 'qpois':
            sbtrack.change_score_method( ord('q') )
        elif method == 'subtract':
            sbtrack.change_score_method( ord('d') )
        elif method == 'logFE':
            sbtrack.change_score_method( ord('f') )
        elif method == 'FE':
            sbtrack.change_score_method( ord('F') )
        elif method == 'logLR':             # log likelihood
            sbtrack.change_score_method( ord('l') )
        elif method == 'slogLR':             # log likelihood
            sbtrack.change_score_method( ord('s') )
        elif method == 'max':             
            sbtrack.change_score_method( ord('M') )            
        else:
            raise Exception("Can't reach here!")
        
        info("Write bedGraph of scores...")
        ofhd = open(ofile,"w")
        sbtrack.write_bedGraph(ofhd,name="%s_Scores" % (method.upper()),description="Scores calculated by %s" % (method.upper()), column = 3)
        info("Finished '%s'! Please check '%s'!" % (method, ofile))
Exemple #9
0
def run( o_options ):
    """The Main function/pipeline for duplication filter.
    
    """
    # Parse options...
    options = opt_validate( o_options )
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error

    options.PE_MODE = options.format in ('BAMPE','BEDPE')
    
    if options.outputfile != "stdout":
        outfhd = open( os.path.join( options.outdir, options.outputfile ) ,"w" )
    else:
        outfhd = sys.stdout
    
    #1 Read tag files
    if options.PE_MODE:
        info("# read input file in Paired-end mode.")
        inputtrack = load_frag_files_options ( options ) # return PETrackI object
        t0 = inputtrack.total # total fragments
        info("# total fragments/pairs in alignment file: %d" % (t0) )
    else:
        info("read tag files...")
        inputtrack = load_tag_files_options (options)
    
        info("tag size = %d" % options.tsize)
        inputtrack.fw = options.tsize

        t0 = inputtrack.total
        info(" total tags in alignment file: %d" % (t0))

    if options.keepduplicates != "all":
        if options.keepduplicates == "auto":
            info("calculate max duplicate tags in single position based on binomal distribution...")
            max_dup_tags = cal_max_dup_tags(options.gsize,t0)
            info(" max_dup_tags based on binomal = %d" % (max_dup_tags))
            info("filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (max_dup_tags))
        else:
            info("user defined the maximum tags...")
            max_dup_tags = int(options.keepduplicates)
            info("filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (max_dup_tags))

        t1 = inputtrack.filter_dup(max_dup_tags)

        info(" tags after filtering in alignment file: %d" % (t1))
        info(" Redundant rate of alignment file: %.2f" % (float(t0-t1)/t0))
            
    if not options.dryrun:
        info( "Write to BED file" )
        inputtrack.print_to_bed( fhd=outfhd )
        info( "finished! Check %s." % options.outputfile )
    else:
        info( "Dry-run is finished!" )
Exemple #10
0
def run(options0):
    options = opt_validate(options0)
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error

    options.PE_MODE = options.format in ('BAMPE', 'BEDPE')

    #0 check output file
    if options.outputfile:
        outfhd = open(os.path.join(options.outdir, options.outputfile), "w")
    else:
        outfhd = sys.stdout

    #1 Read tag files
    if options.PE_MODE:
        info("# read input file in Paired-end mode.")
        treat = load_frag_files_options(options)  # return PETrackI object
        t0 = treat.total  # total fragments
        info("# total fragments/pairs in alignment file: %d" % (t0))
    else:
        info("read tag files...")
        treat = load_tag_files_options(options)

        info("tag size = %d" % options.tsize)
        treat.fw = options.tsize

        t0 = treat.total
        info(" total tags in alignment file: %d" % (t0))

    if options.number:
        if options.number > t0:
            error(
                " Number you want is bigger than total number of tags in alignment file! Please specify a smaller number and try again!"
            )
            error(" %.2e > %.2e" % (options.number, t0))
            sys.exit(1)
        info(" Number of tags you want to keep: %.2e" % (options.number))
        options.percentage = float(options.number) / t0 * 100
    info(" Percentage of tags you want to keep: %.2f%%" % (options.percentage))

    if options.seed >= 0:
        info(" Random seed has been set as: %d" % options.seed)

    treat.sample_percent(options.percentage / 100.0, options.seed)

    info(" tags after random sampling in alignment file: %d" % (treat.total))

    info("Write to BED file")
    treat.print_to_bed(fhd=outfhd)
    info("finished! Check %s." % options.outputfile)
Exemple #11
0
def run( o_options ):
    """The Main function/pipeline for duplication filter.
    
    """
    # Parse options...
    options = opt_validate( o_options )
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error
    #0 output arguments
    options.PE_MODE = options.format in ('BAMPE','BEDPE')
    
    #1 Read tag files
    if options.PE_MODE:
        info("# read input file in Paired-end mode.")
        treat = load_frag_files_options ( options ) # return PETrackI object
        t0 = treat.total
        info("# total fragments/pairs in alignment file: %d" % (t0) )
    else:
        info("# read alignment files...")
        treat = load_tag_files_options  (options)
        t0 = treat.total    
        info("# tag size = %d" % options.tsize)
        treat.fw = options.tsize
        info("# total tags in alignment file: %d", t0)

    #2 Build Model
    info("# Build Peak Model...")
    if options.PE_MODE:
        d = treat.average_template_length
        info("# Average insertion length of all pairs is %d bps" % d)
        return
    
    try:
        peakmodel = PeakModel(treatment = treat,
                              max_pairnum = MAX_PAIRNUM,
                              opt = options
                              )
        info("# finished!")
        debug("#  Summary Model:")
        debug("#   min_tags: %d" % (peakmodel.min_tags))
        debug("#   d: %d" % (peakmodel.d))
        info("# predicted fragment length is %d bps" % peakmodel.d)
        info("# alternative fragment length(s) may be %s bps" % ','.join(map(str,peakmodel.alternative_d)))
        info("# Generate R script for model : %s" % (options.modelR))
        model2r_script(peakmodel,options.modelR, options.rfile )
        options.d = peakmodel.d

    except NotEnoughPairsException:
        warn("# Can't find enough pairs of symmetric peaks to build model!")
Exemple #12
0
def run( options0 ):
    options = opt_validate( options0 )
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error

    options.PE_MODE = options.format in ('BAMPE','BEDPE')

    #0 check output file
    if options.outputfile:
        outfhd = open( os.path.join( options.outdir, options.outputfile ), "w" )
    else:
        outfhd = sys.stdout
    
    #1 Read tag files
    if options.PE_MODE:
        info("# read input file in Paired-end mode.")
        treat = load_frag_files_options ( options ) # return PETrackI object
        t0 = treat.total # total fragments
        info("# total fragments/pairs in alignment file: %d" % (t0) )
    else:
        info("read tag files...")
        treat = load_tag_files_options (options)
    
        info("tag size = %d" % options.tsize)
        treat.fw = options.tsize

        t0 = treat.total
        info(" total tags in alignment file: %d" % (t0))
        
    if options.number:
        if options.number > t0:
            error(" Number you want is bigger than total number of tags in alignment file! Please specify a smaller number and try again!")
            error(" %.2e > %.2e" % (options.number, t0))
            sys.exit(1)
        info(" Number of tags you want to keep: %.2e" % (options.number))
        options.percentage = float(options.number)/t0*100
    info(" Percentage of tags you want to keep: %.2f%%" % (options.percentage))

    if options.seed >= 0:
        info(" Random seed has been set as: %d" % options.seed )

    treat.sample_percent(options.percentage/100.0, options.seed )

    info(" tags after random sampling in alignment file: %d" % (treat.total))

    info("Write to BED file")
    treat.print_to_bed(fhd=outfhd)
    info("finished! Check %s." % options.outputfile)
def run(o_options):
    """The Main function/pipeline for duplication filter.
    
    """
    # Parse options...
    options = opt_validate(o_options)
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error
    #0 output arguments
    assert options.format != 'BAMPE', "Pair-end data with BAMPE option currently doesn't work with pileup command. You can pretend your data to be single-end with -f BAM. Please try again!"

    #0 prepare output file
    outfile = os.path.join(options.outdir, options.outputfile)
    if os.path.isfile(outfile):
        info("# Existing file %s will be replaced!" % outfile)
        os.unlink(outfile)

    #1 Read tag files
    info("# read alignment files...")
    (tsize, treat) = load_tag_files_options(options)

    info("# tag size = %d", tsize)

    t0 = treat.total
    info("# total tags in alignment file: %d", t0)

    if options.bothdirection:
        info(
            "# Pileup alignment file, extend each read towards up/downstream direction with %d bps"
            % options.extsize)
        pileup_and_write(treat,
                         outfile,
                         options.extsize * 2,
                         1,
                         directional=False,
                         halfextension=False)
    else:
        info(
            "# Pileup alignment file, extend each read towards downstream direction with %d bps"
            % options.extsize)
        pileup_and_write(treat,
                         outfile,
                         options.extsize,
                         1,
                         directional=True,
                         halfextension=False)

    info("# Done! Check %s" % options.outputfile)
Exemple #14
0
def run( o_options ):
    """The Main function/pipeline for duplication filter.
    
    """
    # Parse options...
    options = opt_validate( o_options )
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error
    #0 output arguments
    options.PE_MODE = options.format in ('BAMPE','BEDPE')
    #assert options.format != 'BAMPE', "Pair-end data with BAMPE option currently doesn't work with pileup command. You can pretend your data to be single-end with -f BAM. Please try again!"


    #0 prepare output file
    outfile = os.path.join( options.outdir, options.outputfile )
    if os.path.isfile( outfile ):
        info("# Existing file %s will be replaced!" % outfile )
        os.unlink( outfile )
        
    #1 Read tag files
    info("# read alignment files...")
    if options.PE_MODE:
        info("# read input file in Paired-end mode.")
        treat = load_frag_files_options ( options ) # return PETrackI object
        t0 = treat.total # total fragments
        info("# total fragments/pairs in alignment file: %d" % (t0) )
        info("# Pileup paired-end alignment file.")
        pileup_and_write_pe(treat, outfile )
        
    else:
        (tsize, treat) = load_tag_files_options  (options)
    
        info("# tag size = %d", tsize)
    
        t0 = treat.total
        info("# total tags in alignment file: %d", t0)

        if options.bothdirection:
            info("# Pileup alignment file, extend each read towards up/downstream direction with %d bps" % options.extsize)
            pileup_and_write(treat, outfile, options.extsize * 2, 1, directional=False, halfextension=False)
        else:
            info("# Pileup alignment file, extend each read towards downstream direction with %d bps" % options.extsize)
            pileup_and_write(treat, outfile, options.extsize, 1, directional=True, halfextension=False)

    info("# Done! Check %s" % options.outputfile)
Exemple #15
0
def run( o_options ):
    """The Main function/pipeline for duplication filter.
    
    """
    # Parse options...
    options = opt_validate( o_options )
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error


    outputfile = open(options.oprefix+"_refinepeak.bed", "w")

    
    #if options.outputfile != "stdout":
    #    assert not os.path.exists(options.outputfile), "%s already exists, please check!" % options.outputfile
    #    outfhd = open(options.outputfile,"w")
    #else:
    #    outfhd = sys.stdout

    peakio = file(options.bedfile)
    peaks = PeakIO()
    for l in peakio:
        fs = l.rstrip().split()
        peaks.add( fs[0], int(fs[1]), int(fs[2]), name=fs[3] )

    peaks.sort()
    
    #for l in peakio:
    #l = peakio.readline()
    #fs = l.rstrip().split()
    #print fs
    
    #1 Read tag files
    info("read tag files...")
    fwtrack = load_tag_files_options (options)
    
    #info("tag size = %d" % options.tsize)
    #fwtrack.fw = options.tsize

    retval = fwtrack.compute_region_tags_from_peaks( peaks, find_summit, window_size = options.windowsize, cutoff = options.cutoff )
    outputfile.write( "\n".join( map(lambda x: "%s\t%d\t%d\t%s\t%.2f" % x , retval) ) )
    info("Done!")
    info("Check output file: %s" % options.oprefix+"_refinepeak.bed")
Exemple #16
0
def run( o_options ):
    """The Main function/pipeline for duplication filter.
    
    """
    # Parse options...
    options = opt_validate( o_options )
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error

    if options.outputfile != "stdout":
        assert not os.path.exists(options.outputfile), "%s already exists, please check!" % options.outputfile
        outfhd = open(options.outputfile,"w")
    else:
        outfhd = sys.stdout
    
    #1 Read tag files
    info("read tag files...")
    fwtrack = load_tag_files_options (options)
    
    info("tag size = %d" % options.tsize)
    fwtrack.fw = options.tsize

    t0 = fwtrack.total
    info(" total tags in alignment file: %d" % (t0))
    if options.keepduplicates != "all":
        if options.keepduplicates == "auto":
            info("calculate max duplicate tags in single position based on binomal distribution...")
            max_dup_tags = cal_max_dup_tags(options.gsize,t0)
            info(" max_dup_tags based on binomal = %d" % (max_dup_tags))
            info("filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (max_dup_tags))
        else:
            info("user defined the maximum tags...")
            max_dup_tags = int(options.keepduplicates)
            info("filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (max_dup_tags))

        fwtrack = fwtrack.filter_dup(max_dup_tags)
        t1 = fwtrack.total
        info(" tags after filtering in alignment file: %d" % (t1))
        info(" Redundant rate of alignment file: %.2f" % (float(t0-t1)/t0))
    info("Write to BED file")
    fwtrack.print_to_bed(fhd=outfhd)
    info("finished! Check %s." % options.outputfile)
Exemple #17
0
def run( o_options ):
    """The Main function/pipeline for duplication filter.
    
    """
    # Parse options...
    options = opt_validate( o_options )
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error
    #0 output arguments
    assert options.format != 'BAMPE', "Pair-end data with BAMPE option doesn't work with predictd command. You can pretend your data to be single-end with -f BAM. Please try again!"
    
    #1 Read tag files
    info("# read alignment files...")
    treat = load_tag_files_options  (options)
    
    info("# tag size = %d", options.tsize)
    
    t0 = treat.total
    info("# total tags in alignment file: %d", t0)

    #2 Build Model
    info("# Build Peak Model...")

    try:
        peakmodel = PeakModel(treatment = treat,
                              max_pairnum = MAX_PAIRNUM,
                              opt = options
                              )
        info("# finished!")
        debug("#  Summary Model:")
        debug("#   min_tags: %d" % (peakmodel.min_tags))
        debug("#   d: %d" % (peakmodel.d))
        info("# predicted fragment length is %d bps" % peakmodel.d)
        info("# alternative fragment length(s) may be %s bps" % ','.join(map(str,peakmodel.alternative_d)))
        info("# Generate R script for model : %s" % (options.modelR))
        model2r_script(peakmodel,options.modelR,options.rfile)
        options.d = peakmodel.d
        

    except NotEnoughPairsException:
        warn("# Can't find enough pairs of symmetric peaks to build model!")
def run(options0):
    options = opt_validate(options0)
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error
    #0 check output file
    if options.outputfile:
        outfhd = open(os.path.join(options.outdir, options.outputfile), "w")
    else:
        outfhd = sys.stdout

    #1 Read tag files
    info("read tag files...")
    fwtrack = load_tag_files_options(options)

    info("tag size = %d" % options.tsize)
    fwtrack.fw = options.tsize

    t0 = fwtrack.total
    info(" total tags in alignment file: %d" % (t0))
    if options.number:
        if options.number > t0:
            error(
                " Number you want is bigger than total number of tags in alignment file! Please specify a smaller number and try again!"
            )
            error(" %.2e > %.2e" % (options.number, t0))
            sys.exit(1)
        info(" Number of tags you want to keep: %.2e" % (options.number))
        options.percentage = float(options.number) / t0 * 100
    info(" Percentage of tags you want to keep: %.2f%%" % (options.percentage))

    if options.seed >= 0:
        info(" Random seed has been set as: %d" % options.seed)

    fwtrack.sample_percent(options.percentage / 100.0, options.seed)

    info(" tags after random sampling in alignment file: %d" % (fwtrack.total))

    info("Write to BED file")
    fwtrack.print_to_bed(fhd=outfhd)
    info("finished! Check %s." % options.outputfile)
Exemple #19
0
def run( o_options ):
    """The Main function/pipeline for duplication filter.
    
    """
    # Parse options...
    options = opt_validate( o_options )
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error
    #0 output arguments
    assert options.format != 'BAMPE', "Pair-end data with BAMPE option currently doesn't work with pileup command. You can pretend your data to be single-end with -f BAM. Please try again!"

    #0 prepare output file
    if options.outputfile != "stdout":
        assert not os.path.exists(options.outputfile), "%s already exists, please check!" % options.outputfile
        outfhd = open(options.outputfile,"w")
    else:
        outfhd = sys.stdout
    
    #1 Read tag files
    info("# read alignment files...")
    (tsize, treat) = load_tag_files_options  (options)
    
    info("# tag size = %d", tsize)
    
    t0 = treat.total
    info("# total tags in alignment file: %d", t0)

    if options.bothdirection:
        info("# Pileup alignment file, extend each read towards up/downstream direction with %d bps" % options.extsize)        
        treat_btrack = unified_pileup_bdg(treat, options.extsize * 2, 1, directional=False, halfextension=False)
        info("# save bedGraph to %s" % options.outputfile)
        treat_btrack.write_bedGraph( outfhd, "Pileup", "Pileup track with extsize %d on both directions" % options.extsize, trackline=False )
    else:
        info("# Pileup alignment file, extend each read towards downstream direction with %d bps" % options.extsize)
        treat_btrack = unified_pileup_bdg(treat, options.extsize, 1, directional=True, halfextension=False)
        info("# save bedGraph to %s" % options.outputfile)
        treat_btrack.write_bedGraph( outfhd, "Pileup", "Pileup track with extsize %d on 5' directions" % options.extsize, trackline=False )

    info("# Done! Check %s" % options.outputfile)
Exemple #20
0
def run( options0 ):
    options = opt_validate( options0 )
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error
    #0 check output file
    if options.outputfile:
        assert not os.path.exists(options.outputfile), "%s already exists, please check!" % options.outputfile
        outfhd = open(options.outputfile,"w")
    else:
        outfhd = sys.stdout
    
    #1 Read tag files
    info("read tag files...")
    fwtrack = load_tag_files_options (options)
    
    info("tag size = %d" % options.tsize)
    fwtrack.fw = options.tsize

    t0 = fwtrack.total
    info(" total tags in alignment file: %d" % (t0))
    if options.number:
        if options.number > t0:
            error(" Number you want is bigger than total number of tags in alignment file! Please specify a smaller number and try again!")
            error(" %.2e > %.2e" % (options.number, t0))
            sys.exit(1)
        info(" Number of tags you want to keep: %.2e" % (options.number))
        options.percentage = float(options.number)/t0*100
    info(" Percentage of tags you want to keep: %.2f%%" % (options.percentage))

    if options.seed >= 0:
        info(" Random seed has been set as: %d" % options.seed )

    fwtrack.sample_percent(options.percentage/100.0, options.seed )

    info(" tags after random sampling in alignment file: %d" % (fwtrack.total))

    info("Write to BED file")
    fwtrack.print_to_bed(fhd=outfhd)
    info("finished! Check %s." % options.outputfile)
Exemple #21
0
def run( o_options ):
    """The calculation based on the binomial distribution for how many tags to allow at one site.
    
    """
    # Parse options...
    options = opt_validate( o_options )
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error

    if options.outputfile != "stdout":
        outfhd = open( os.path.join( options.outdir, options.outputfile ) ,"w" )
    else:
        outfhd = sys.stdout
    
    #1 Read tag files
    if options.ifile:
	if not options.quiet:
            info("counting tags from input files...")
        fwtrack = load_tag_files_options (options)
        t0 = fwtrack.total
	if not options.quiet:
            info(" total tags in alignment file: %d" % (t0))
            info("tag size = %d" % options.tsize)
        fwtrack.fw = options.tsize
    elif options.numTags:
	t0 = options.numTags
    
    if not options.quiet:
    	info("calculate max duplicate tags in single position based on binomal distribution...")
    max_dup_tags = cal_max_dup_tags(options.gsize,t0)

    if not options.quiet:    
	info(" max_dup_tags based on binomal = %d" % (max_dup_tags))
    else:
	print max_dup_tags
Exemple #22
0
def run( o_options ):
    """The Main function/pipeline for duplication filter.
    
    """
    # Parse options...
    options = opt_validate( o_options )
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error

    if options.ofile:
        outputfile = open( os.path.join( options.outdir, options.ofile ), 'w' )
        options.oprefix = options.ofile
    else:
        outputfile = open( os.path.join( options.outdir, "%s_refinepeak.bed" % options.oprefix), "w" )


    peakio = open(options.bedfile,"rb")
    peaks = PeakIO()
    for l in peakio:
        fs = l.rstrip().split()
        peaks.add( fs[0], int(fs[1]), int(fs[2]), name=fs[3] )

    peaks.sort()
    peakio.close()
    
    #1 Read tag files
    info("read tag files...")
    fwtrack = load_tag_files_options (options)
    
    retval = fwtrack.compute_region_tags_from_peaks( peaks, find_summit, window_size = options.windowsize, cutoff = options.cutoff )
    outputfile.write( (b"\n".join( [b"%s\t%d\t%d\t%s\t%.2f" % x for x in retval] )).decode() )
    outputfile.close()
    info("Done!")
Exemple #23
0
def run( args ):
    """The Main function/pipeline for MACS.
    
    """
    # Parse options...
    options = opt_validate( args )
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error
    #0 output arguments
    info("\n"+options.argtxt)
    options.PE_MODE = options.format in ('BAMPE',)
    if options.PE_MODE: tag = 'fragment' # call things fragments not tags
    else: tag = 'tag'

    tempfile.tempdir = options.tempdir

    #1 Read tag files
    info("#1 read %s files...", tag)
    if options.PE_MODE: (treat, control) = load_frag_files_options (options)
    else:       (treat, control) = load_tag_files_options  (options)
    if control is not None: check_names(treat, control, error)
    
    info("#1 %s size = %d", tag, options.tsize)
    tagsinfo  = "# %s size is determined as %d bps\n" % (tag, options.tsize)
    
    t0 = treat.total
    tagsinfo += "# total %ss in treatment: %d\n" % (tag, t0)
    info("#1  total %ss in treatment: %d", tag, t0)
    # not ready yet
#    options.filteringmodel = True
#    if options.filteringmodel:
#        treat.separate_dups()
#        t0 = treat.total + treat.dups.total
#        t1 = treat.total
#        info("#1  Redundant rate of treatment: %.2f", float(t0 - t1) / t0)
#        tagsinfo += "# Redundant rate in treatment: %.2f\n" % (float(t0-t1)/t0)
#    elif options.keepduplicates != "all":
    if options.keepduplicates != "all":
        if options.keepduplicates == "auto":
            info("#1 calculate max duplicate %ss in single position based on binomial distribution...", tag)
            treatment_max_dup_tags = cal_max_dup_tags(options.gsize,t0)
            info("#1  max_dup_tags based on binomial = %d" % (treatment_max_dup_tags))
        else:
            info("#1 user defined the maximum %ss...", tag)
            treatment_max_dup_tags = int(options.keepduplicates)
        if options.PE_MODE:
            info("#1 filter out redundant fragments by allowing at most %d identical fragment(s)", treatment_max_dup_tags)
        else:
            info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)", treatment_max_dup_tags)
        treat.separate_dups(treatment_max_dup_tags) # changed 5-29
#        treat.filter_dup(treatment_max_dup_tags)
        t1 = treat.total
        info("#1  %ss after filtering in treatment: %d", tag, t1)
        tagsinfo += "# %ss after filtering in treatment: %d\n" % (tag, t1)
        if options.PE_MODE:
            tagsinfo += "# maximum duplicate fragments in treatment = %d\n" % (treatment_max_dup_tags)
        else:
            tagsinfo += "# maximum duplicate tags at the same position in treatment = %d\n" % (treatment_max_dup_tags)
        info("#1  Redundant rate of treatment: %.2f", float(t0 - t1) / t0)
        tagsinfo += "# Redundant rate in treatment: %.2f\n" % (float(t0-t1)/t0)
    else:
        t1 = t0

    if control is not None:
        c0 = control.total
        tagsinfo += "# total %ss in control: %d\n" % (tag, c0)
        info("#1  total %ss in control: %d", tag, c0)
        # not ready yet
        #if options.filteringmodel:
        #    control.separate_dups()
        #    c0 = treat.total + treat.dups.total
        #    c1 = treat.total
        #    info("#1  Redundant rate of treatment: %.2f", float(c0 - c1) / c0)
        #    tagsinfo += "# Redundant rate in treatment: %.2f\n" % (float(c0-c1)/c0)
        #elif options.keepduplicates != "all":
        if options.keepduplicates != "all":
            if options.keepduplicates == "auto":
                info("#1  for control, calculate max duplicate %ss in single position based on binomial distribution...", tag)
                control_max_dup_tags = cal_max_dup_tags(options.gsize,c0)
                info("#1  max_dup_tags based on binomial = %d" % (control_max_dup_tags))
            else:
                info("#1 user defined the maximum %ss...", tag)
                control_max_dup_tags = int(options.keepduplicates)
            if options.PE_MODE:
                info("#1 filter out redundant fragments by allowing at most %d identical fragment(s)", treatment_max_dup_tags)
            else:
                info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)", treatment_max_dup_tags)
#            control.filter_dup(treatment_max_dup_tags)
            control.separate_dups(treatment_max_dup_tags) # changed 5-29
            c1 = control.total
            
            info("#1  %ss after filtering in control: %d", tag, c1)
            tagsinfo += "# %ss after filtering in control: %d\n" % (tag, c1)
            if options.PE_MODE:
                tagsinfo += "# maximum duplicate fragments in control = %d\n" % (treatment_max_dup_tags)
            else:
                tagsinfo += "# maximum duplicate tags at the same position in control = %d\n" % (treatment_max_dup_tags)
            
            info("#1  Redundant rate of control: %.2f" % (float(c0-c1)/c0))
            tagsinfo += "# Redundant rate in control: %.2f\n" % (float(c0-c1)/c0)
        else:
            c1 = c0
    info("#1 finished!")

    #2 Build Model
    info("#2 Build Peak Model...")

    if options.nomodel:
        info("#2 Skipped...")
        if options.PE_MODE:
            #options.shiftsize = 0
            options.d = options.tsize
        else:
            options.d=options.extsize
        if options.shift > 0:
            info("#2 Sequencing ends will be shifted towards 3' by %d bp(s)" % (options.shift))
        elif options.shift < 0:
            info("#2 Sequencing ends will be shifted towards 5' by %d bp(s)" % (options.shift * -1))
        info("#2 Use %d as fragment length" % (options.d))
        options.scanwindow=2*options.d  # remove the effect of --bw
    else:
        try:
            peakmodel = PeakModel(treatment = treat,
                                  max_pairnum = MAX_PAIRNUM,
                                  opt = options
                                  )
            info("#2 finished!")
            debug("#2  Summary Model:")
            debug("#2   min_tags: %d" % (peakmodel.min_tags))
            debug("#2   d: %d" % (peakmodel.d))
            debug("#2   scan_window: %d" % (peakmodel.scan_window))
            info("#2 predicted fragment length is %d bps" % peakmodel.d)
            info("#2 alternative fragment length(s) may be %s bps" % ','.join(map(str,peakmodel.alternative_d)))
            info("#2.2 Generate R script for model : %s" % (options.modelR))
            model2r_script(peakmodel,options.modelR,options.name)
            options.d = peakmodel.d
            options.scanwindow= 2*options.d
            if options.d <= 2*options.tsize:
                warn("#2 Since the d (%.0f) calculated from paired-peaks are smaller than 2*tag length, it may be influenced by unknown sequencing problem!" % (options.d))
                if options.onauto:
                    options.d=options.extsize
                    options.scanwindow=2*options.d 
                    warn("#2 MACS will use %d as EXTSIZE/fragment length d. NOTE: if the d calculated is still acceptable, please do not use --fix-bimodal option!" % (options.d))
                else:
                    warn("#2 You may need to consider one of the other alternative d(s): %s" %  ','.join(map(str,peakmodel.alternative_d)))
                    warn("#2 You can restart the process with --nomodel --extsize XXX with your choice or an arbitrary number. Nontheless, MACS will continute computing.")
                
        except NotEnoughPairsException:
            if not options.onauto:
                sys.exit(1)
            warn("#2 Skipped...")
            options.d=options.extsize
            options.scanwindow=2*options.d 
            warn("#2 Since --fix-bimodal is set, MACS will use %d as fragment length" % (options.d))

    #3 Call Peaks
    info("#3 Call peaks...")
    if options.nolambda:
        info("# local lambda is disabled!")

    # decide options.tocontrol according to options.tolarge
    if control and options.PE_MODE:
        c1 = c1 * 2
    
    if control:
        if options.downsample:
            # use random sampling to balance treatment and control
            info("#3 User prefers to use random sampling instead of linear scaling.")
            if t1 > c1:
                info("#3 MACS is random sampling treatment %ss...", tag)
                if options.seed < 0:
                    warn("#3 Your results may not be reproducible due to the random sampling!")
                else:
                    info("#3 Random seed (%d) is used." % options.seed)
                treat.sample_num(c1, options.seed)
                info("#3 %d Tags from treatment are kept", treat.total)                
            elif c1 > t1: 
                info("#3 MACS is random sampling control %ss...", tag)
                if options.seed < 0:
                    warn("#3 Your results may not be reproducible due to the random sampling!")
                else:
                    info("#3 Random seed (%d) is used." % options.seed)
                control.sample_num(t1, options.seed)
                info("#3 %d %ss from control are kept", control.total, tag)
            # set options.tocontrol although it would;t matter now
            options.tocontrol = False
        else:
            if options.tolarge:
                if t1 > c1:
                    # treatment has more tags than control, since tolarge is
                    # true, we will scale control to treatment.
                    options.tocontrol = False
                else:
                    # treatment has less tags than control, since tolarge is
                    # true, we will scale treatment to control.
                    options.tocontrol = True
            else:
                if t1 > c1:
                    # treatment has more tags than control, since tolarge is
                    # false, we will scale treatment to control.
                    options.tocontrol = True
                else:
                    # treatment has less tags than control, since tolarge is
                    # false, we will scale control to treatment.
                    options.tocontrol = False

    peakdetect = PeakDetect(treat = treat,
                            control = control,
                            opt = options
                            )
    peakdetect.call_peaks()

    # filter out low FE peaks
    peakdetect.peaks.filter_fc( fc_low = options.fecutoff )

    #call refinepeak if needed.
    # if options.refine_peaks:
    #     info("#3 now put back duplicate reads...")
    #     treat.addback_dups()
    #     info("#3 calculate reads balance to refine peak summits...")
    #     refined_peaks = treat.refine_peak_from_tags_distribution ( peakdetect.peaks, options.d, 0 )
    #     info("#3 reassign scores for newly refined peak summits...")
    #     peakdetect.peaks = peakdetect.scoretrack.reassign_peaks( refined_peaks ) # replace
    #     #info("#3 write to file: %s ..." % options.name+"_refined_peaks.encodePeak" )
    #     #refinedpeakfile = open(options.name+"_refined_peaks.encodePeak", "w")
    #     #refined_peaks.write_to_narrowPeak (refinedpeakfile, name_prefix="%s_refined_peak_", name=options.name, score_column=score_column, trackline=options.trackline )

    #diag_result = peakdetect.diag_result()
    #4 output
    #4.1 peaks in XLS
    info("#4 Write output xls file... %s" % (options.peakxls))
    ofhd_xls = open( options.peakxls, "w" )
    ofhd_xls.write("# This file is generated by MACS version %s\n" % (MACS_VERSION))
    ofhd_xls.write(options.argtxt+"\n")

    ofhd_xls.write(tagsinfo)

    if options.shift > 0:
        ofhd_xls.write("#2 Sequencing ends will be shifted towards 3' by %d bp(s)\n" % (options.shift))
    elif options.shift < 0:
        ofhd_xls.write("#2 Sequencing ends will be shifted towards 5' by %d bp(s)\n" % (options.shift * -1))

    ofhd_xls.write("# d = %d\n" % (options.d))
    try:
        ofhd_xls.write("# alternative fragment length(s) may be %s bps\n" % ','.join(map(str,peakmodel.alternative_d)))
    except:
        # when --nomodel is used, there is no peakmodel object. Simply skip this line.
        pass
    if options.nolambda:
        ofhd_xls.write("# local lambda is disabled!\n")
    # pass write method so we can print too, and include name
    peakdetect.peaks.write_to_xls(ofhd_xls, name = options.name)
    ofhd_xls.close()
    #4.2 peaks in BED
    if options.log_pvalue:
        score_column = "pscore"
    elif options.log_qvalue:
        score_column = "qscore"
    #4.2 peaks in narrowPeak
    if not options.broad:
        #info("#4 Write peak bed file... %s" % (options.peakbed))
        #ofhd_bed = open(options.peakbed,"w")
        #peakdetect.peaks.write_to_bed (ofhd_bed, name_prefix="%s_peak_", name = options.name, description="Peaks for %s (Made with MACS v2, " + strftime("%x") + ")", score_column=score_column, trackline=options.trackline)
        #ofhd_bed.close()
        info("#4 Write peak in narrowPeak format file... %s" % (options.peakNarrowPeak))
        ofhd_bed = open( options.peakNarrowPeak, "w" )
        peakdetect.peaks.write_to_narrowPeak (ofhd_bed, name_prefix="%s_peak_", name=options.name, score_column=score_column, trackline=options.trackline )
        ofhd_bed.close()
        #4.2-2 summits in BED
        info("#4 Write summits bed file... %s" % (options.summitbed))
        ofhd_summits = open( options.summitbed, "w" )
        peakdetect.peaks.write_to_summit_bed (ofhd_summits, name_prefix="%s_peak_", name=options.name,
                                              description="Summits for %s (Made with MACS v2, " + strftime("%x") + ")",
                                              score_column=score_column, trackline=options.trackline )
        ofhd_summits.close()
    #4.2 broad peaks in bed12 or gappedPeak
    else:
        info("#4 Write broad peak in broadPeak format file... %s" % (options.peakBroadPeak))
        ofhd_bed = open( options.peakBroadPeak, "w" )
        peakdetect.peaks.write_to_broadPeak (ofhd_bed, name_prefix="%s_peak_", name=options.name, description=options.name, trackline=options.trackline)
        ofhd_bed.close()
        info("#4 Write broad peak in bed12/gappedPeak format file... %s" % (options.peakGappedPeak))
        ofhd_bed = open( options.peakGappedPeak, "w" )
        peakdetect.peaks.write_to_gappedPeak (ofhd_bed, name_prefix="%s_peak_", name=options.name, description=options.name, trackline=options.trackline)
        ofhd_bed.close()

    info("Done!")
Exemple #24
0
def run( args ):
    """The Main function/pipeline for MACS.
    
    """
    # Parse options...
    options = opt_validate( args )
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error
    #0 output arguments
    info("\n"+options.argtxt)
    options.PE_MODE = options.format in ('BAMPE',)
    if options.PE_MODE: tag = 'fragment' # call things fragments not tags
    else: tag = 'tag'
    
    #1 Read tag files
    info("#1 read %s files...", tag)
    if options.PE_MODE: (treat, control) = load_frag_files_options (options)
    else:       (treat, control) = load_tag_files_options  (options)
    if control is not None: check_names(treat, control, error)
    
    info("#1 %s size = %d", tag, options.tsize)
    tagsinfo  = "# %s size is determined as %d bps\n" % (tag, options.tsize)
    
    t0 = treat.total
    tagsinfo += "# total %ss in treatment: %d\n" % (tag, t0)
    info("#1  total %ss in treatment: %d", tag, t0)
    # not ready yet
#    options.filteringmodel = True
#    if options.filteringmodel:
#        treat.separate_dups()
#        t0 = treat.total + treat.dups.total
#        t1 = treat.total
#        info("#1  Redundant rate of treatment: %.2f", float(t0 - t1) / t0)
#        tagsinfo += "# Redundant rate in treatment: %.2f\n" % (float(t0-t1)/t0)
#    elif options.keepduplicates != "all":
    if options.keepduplicates != "all":
        if options.keepduplicates == "auto":
            info("#1 calculate max duplicate %ss in single position based on binomial distribution...", tag)
            treatment_max_dup_tags = cal_max_dup_tags(options.gsize,t0)
            info("#1  max_dup_tags based on binomial = %d" % (treatment_max_dup_tags))
        else:
            info("#1 user defined the maximum %ss...", tag)
            treatment_max_dup_tags = int(options.keepduplicates)
        if options.PE_MODE:
            info("#1 filter out redundant fragments by allowing at most %d identical fragment(s)", treatment_max_dup_tags)
        else:
            info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)", treatment_max_dup_tags)
        treat.separate_dups(treatment_max_dup_tags) # changed 5-29
#        treat.filter_dup(treatment_max_dup_tags)
        t1 = treat.total
        info("#1  %ss after filtering in treatment: %d", tag, t1)
        tagsinfo += "# %ss after filtering in treatment: %d\n" % (tag, t1)
        if options.PE_MODE:
            tagsinfo += "# maximum duplicate fragments in treatment = %d\n" % (treatment_max_dup_tags)
        else:
            tagsinfo += "# maximum duplicate tags at the same position in treatment = %d\n" % (treatment_max_dup_tags)
        info("#1  Redundant rate of treatment: %.2f", float(t0 - t1) / t0)
        tagsinfo += "# Redundant rate in treatment: %.2f\n" % (float(t0-t1)/t0)
    else:
        t1 = t0

    if control is not None:
        c0 = control.total
        tagsinfo += "# total %ss in control: %d\n" % (tag, c0)
        info("#1  total %ss in control: %d", tag, c0)
        # not ready yet
        #if options.filteringmodel:
        #    control.separate_dups()
        #    c0 = treat.total + treat.dups.total
        #    c1 = treat.total
        #    info("#1  Redundant rate of treatment: %.2f", float(c0 - c1) / c0)
        #    tagsinfo += "# Redundant rate in treatment: %.2f\n" % (float(c0-c1)/c0)
        #elif options.keepduplicates != "all":
        if options.keepduplicates != "all":
            if options.keepduplicates == "auto":
                info("#1  for control, calculate max duplicate %ss in single position based on binomial distribution...", tag)
                control_max_dup_tags = cal_max_dup_tags(options.gsize,c0)
                info("#1  max_dup_tags based on binomial = %d" % (control_max_dup_tags))
            else:
                info("#1 user defined the maximum %ss...", tag)
                control_max_dup_tags = int(options.keepduplicates)
            if options.PE_MODE:
                info("#1 filter out redundant fragments by allowing at most %d identical fragment(s)", treatment_max_dup_tags)
            else:
                info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)", treatment_max_dup_tags)
#            control.filter_dup(treatment_max_dup_tags)
            control.separate_dups(treatment_max_dup_tags) # changed 5-29
            c1 = control.total
            
            info("#1  %ss after filtering in control: %d", tag, c1)
            tagsinfo += "# %ss after filtering in control: %d\n" % (tag, c1)
            if options.PE_MODE:
                tagsinfo += "# maximum duplicate fragments in control = %d\n" % (treatment_max_dup_tags)
            else:
                tagsinfo += "# maximum duplicate tags at the same position in control = %d\n" % (treatment_max_dup_tags)
            
            info("#1  Redundant rate of control: %.2f" % (float(c0-c1)/c0))
            tagsinfo += "# Redundant rate in control: %.2f\n" % (float(c0-c1)/c0)
        else:
            c1 = c0
    info("#1 finished!")

    #2 Build Model
    info("#2 Build Peak Model...")

    if options.nomodel:
        info("#2 Skipped...")
        if options.PE_MODE:
            options.shiftsize = 0
            options.d = options.tsize
        else:
            options.d=options.shiftsize*2
        info("#2 Use %d as fragment length" % (options.d))
        options.scanwindow=2*options.d  # remove the effect of --bw
    else:
        try:
            peakmodel = PeakModel(treatment = treat,
                                  max_pairnum = MAX_PAIRNUM,
                                  opt = options
                                  )
            info("#2 finished!")
            debug("#2  Summary Model:")
            debug("#2   min_tags: %d" % (peakmodel.min_tags))
            debug("#2   d: %d" % (peakmodel.d))
            debug("#2   scan_window: %d" % (peakmodel.scan_window))
            info("#2 predicted fragment length is %d bps" % peakmodel.d)
            info("#2 alternative fragment length(s) may be %s bps" % ','.join(map(str,peakmodel.alternative_d)))
            info("#2.2 Generate R script for model : %s" % (options.modelR))
            model2r_script(peakmodel,options.modelR,options.name)
            options.d = peakmodel.d
            options.scanwindow= 2*options.d
            if options.d <= 2*options.tsize:
                warn("#2 Since the d (%.0f) calculated from paired-peaks are smaller than 2*tag length, it may be influenced by unknown sequencing problem!" % (options.d))
                if options.onauto:
                    options.d=options.shiftsize*2
                    options.scanwindow=2*options.d 
                    warn("#2 MACS will use %d as shiftsize, %d as fragment length. NOTE: if the d calculated is still acceptable, please do not use --fix-bimodal option!" % (options.shiftsize,options.d))
                else:
                    warn("#2 You may need to consider one of the other alternative d(s): %s" %  ','.join(map(str,peakmodel.alternative_d)))
                    warn("#2 You can restart the process with --nomodel --shiftsize XXX with your choice or an arbitrary number. Nontheless, MACS will continute computing.")
                
        except NotEnoughPairsException:
            if not options.onauto:
                sys.exit(1)
            warn("#2 Skipped...")
            options.d=options.shiftsize*2
            options.scanwindow=2*options.d 
            warn("#2 Since --fix-bimodal is set, MACS will use %d as shiftsize, %d as fragment length" % (options.shiftsize,options.d))


    #3 Call Peaks
    info("#3 Call peaks...")
    if options.nolambda:
        info("# local lambda is disabled!")

    # decide options.tocontrol according to options.tolarge
    if control:
        if options.downsample:
            # use random sampling to balance treatment and control
            info("#3 User prefers to use random sampling instead of linear scaling.")
            if t1 > c1:
                info("#3 MACS is random sampling treatment %ss...", tag)
                if options.seed < 0:
                    warn("#3 Your results may not be reproducible due to the random sampling!")
                else:
                    info("#3 Random seed (%d) is used." % options.seed)
                treat.sample_num(c1, options.seed)
                info("#3 %d Tags from treatment are kept", treat.total)                
            elif c1 > t1: 
                info("#3 MACS is random sampling control %ss...", tag)
                if options.seed < 0:
                    warn("#3 Your results may not be reproducible due to the random sampling!")
                else:
                    info("#3 Random seed (%d) is used." % options.seed)
                control.sample_num(t1, options.seed)
                info("#3 %d %ss from control are kept", control.total, tag)
            # set options.tocontrol although it would;t matter now
            options.tocontrol = False
        else:
            if options.tolarge:
                if t1 > c1:
                    # treatment has more tags than control, since tolarge is
                    # true, we will scale control to treatment.
                    options.tocontrol = False
                else:
                    # treatment has less tags than control, since tolarge is
                    # true, we will scale treatment to control.
                    options.tocontrol = True
            else:
                if t1 > c1:
                    # treatment has more tags than control, since tolarge is
                    # false, we will scale treatment to control.
                    options.tocontrol = True
                else:
                    # treatment has less tags than control, since tolarge is
                    # false, we will scale control to treatment.
                    options.tocontrol = False

    peakdetect = PeakDetect(treat = treat,
                            control = control,
                            opt = options
                            )
    peakdetect.call_peaks()

    #call refinepeak if needed.
    # if options.refine_peaks:
    #     info("#3 now put back duplicate reads...")
    #     treat.addback_dups()
    #     info("#3 calculate reads balance to refine peak summits...")
    #     refined_peaks = treat.refine_peak_from_tags_distribution ( peakdetect.peaks, options.d, 0 )
    #     info("#3 reassign scores for newly refined peak summits...")
    #     peakdetect.peaks = peakdetect.scoretrack.reassign_peaks( refined_peaks ) # replace
    #     #info("#3 write to file: %s ..." % options.name+"_refined_peaks.encodePeak" )
    #     #refinedpeakfile = open(options.name+"_refined_peaks.encodePeak", "w")
    #     #refined_peaks.write_to_narrowPeak (refinedpeakfile, name_prefix="%s_refined_peak_", name=options.name, score_column=score_column, trackline=options.trackline )

    #diag_result = peakdetect.diag_result()
    #4 output
    #4.1 peaks in XLS
    info("#4 Write output xls file... %s" % (options.peakxls))
    ofhd_xls = open(options.peakxls,"w")
    ofhd_xls.write("# This file is generated by MACS version %s\n" % (MACS_VERSION))
    ofhd_xls.write(options.argtxt+"\n")

    ofhd_xls.write(tagsinfo)

    ofhd_xls.write("# d = %d\n" % (options.d))
    try:
        ofhd_xls.write("# alternative fragment length(s) may be %s bps\n" % ','.join(map(str,peakmodel.alternative_d)))
    except:
        # when --nomodel is used, there is no peakmodel object. Simply skip this line.
        pass
    if options.nolambda:
        ofhd_xls.write("# local lambda is disabled!\n")
    # pass write method so we can print too, and include name
    peakdetect.peaks.write_to_xls(ofhd_xls, name = options.name)
    ofhd_xls.close()
    #4.2 peaks in BED
    if options.log_pvalue:
        score_column = "pscore"
    elif options.log_qvalue:
        score_column = "qscore"
    #4.2 peaks in narrowPeak
    if not options.broad:
        #info("#4 Write peak bed file... %s" % (options.peakbed))
        #ofhd_bed = open(options.peakbed,"w")
        #peakdetect.peaks.write_to_bed (ofhd_bed, name_prefix="%s_peak_", name = options.name, description="Peaks for %s (Made with MACS v2, " + strftime("%x") + ")", score_column=score_column, trackline=options.trackline)
        #ofhd_bed.close()
        info("#4 Write peak in narrowPeak format file... %s" % (options.peakNarrowPeak))
        ofhd_bed = open(options.peakNarrowPeak,"w")
        peakdetect.peaks.write_to_narrowPeak (ofhd_bed, name_prefix="%s_peak_", name=options.name, score_column=score_column, trackline=options.trackline )
        ofhd_bed.close()
        #4.2-2 summits in BED
        info("#4 Write summits bed file... %s" % (options.summitbed))
        ofhd_summits = open(options.summitbed,"w")
        peakdetect.peaks.write_to_summit_bed (ofhd_summits, name_prefix="%s_peak_", name=options.name,
                                              description="Summits for %s (Made with MACS v2, " + strftime("%x") + ")",
                                              score_column=score_column, trackline=options.trackline )
        ofhd_summits.close()
    #4.2 broad peaks in bed12 or gappedPeak
    else:
        info("#4 Write broad peak in broadPeak format file... %s" % (options.peakBroadPeak))
        ofhd_bed = open(options.peakBroadPeak,"w")
        peakdetect.peaks.write_to_broadPeak (ofhd_bed, name_prefix="%s_peak_", name=options.name, description=options.name, trackline=options.trackline)
        ofhd_bed.close()
        info("#4 Write broad peak in bed12/gappedPeak format file... %s" % (options.peakGappedPeak))
        ofhd_bed = open(options.peakGappedPeak,"w")
        peakdetect.peaks.write_to_gappedPeak (ofhd_bed, name_prefix="%s_peak_", name=options.name, description=options.name, trackline=options.trackline)
        ofhd_bed.close()

    info("Done!")
Exemple #25
0
def run( args ):
    """The Main function/pipeline for MACS.
    
    """
    # Parse options...
    options = opt_validate( args )
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error
    #0 output arguments
    info("\n"+options.argtxt)
    
    #1 Read tag files
    info("#1 read tag files...")
    (treat, control) = load_tag_files_options (options)
    # check common chromosome names
    if control:
        tchrnames = set(treat.get_chr_names())
        cchrnames = set(control.get_chr_names())
        commonnames = tchrnames.intersection(cchrnames)
        if len(commonnames)==0:
            error("No common chromosome names can be found from treatment and control! Check your input files! MACS will quit...")
            error("Chromosome names in treatment: %s" % ",".join(sorted(tchrnames)))
            error("Chromosome names in control: %s" % ",".join(sorted(cchrnames)))
            sys.exit()
    
    info("#1 tag size = %d" % options.tsize)
    tagsinfo  = "# tag size is determined as %d bps\n" % (options.tsize)

    t0 = treat.total
    tagsinfo += "# total tags in treatment: %d\n" % (t0)
    info("#1  total tags in treatment: %d" % (t0))
    if options.format == 'BAMPE':
        info("#1 BAMPE mode does not filter duplicates. Use samtools rmdup if needed ...")
        t1 = treat.total
    elif options.keepduplicates != "all":
        if options.keepduplicates == "auto":
            info("#1 calculate max duplicate tags in single position based on binomal distribution...")
            treatment_max_dup_tags = cal_max_dup_tags(options.gsize,t0)
            info("#1  max_dup_tags based on binomal = %d" % (treatment_max_dup_tags))
            info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (treatment_max_dup_tags))
        else:
            info("#1 user defined the maximum tags...")
            treatment_max_dup_tags = int(options.keepduplicates)
            info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (treatment_max_dup_tags))

        treat.filter_dup(treatment_max_dup_tags)
        t1 = treat.total
        info("#1  tags after filtering in treatment: %d" % (t1))
        tagsinfo += "# tags after filtering in treatment: %d\n" % (t1)
        tagsinfo += "# maximum duplicate tags at the same position in treatment = %d\n" % (treatment_max_dup_tags)
        info("#1  Redundant rate of treatment: %.2f" % (float(t0-t1)/t0))
        tagsinfo += "# Redundant rate in treatment: %.2f\n" % (float(t0-t1)/t0)
    else:
        t1 = treat.total

    if control:
        c0 = control.total
        tagsinfo += "# total tags in control: %d\n" % (c0)
        info("#1  total tags in control: %d" % (c0))
        if options.format == 'BAMPE':
            info("#1 BAMPE mode does not filter duplicates. Use samtools rmdup if needed ...")
            c1 = control.total
        elif options.keepduplicates != "all":
            if options.keepduplicates == "auto":
                info("#1  for control, calculate max duplicate tags in single position based on binomal distribution...")
                control_max_dup_tags = cal_max_dup_tags(options.gsize,c0)
                info("#1  max_dup_tags based on binomal = %d" % (control_max_dup_tags))
                info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (control_max_dup_tags))
            else:
                info("#1 user defined the maximum tags...")
                control_max_dup_tags = int(options.keepduplicates)
                info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (treatment_max_dup_tags))
            
            control.filter_dup(control_max_dup_tags)
            c1 = control.total
            info("#1  tags after filtering in control: %d" % (c1))
            tagsinfo += "# tags after filtering in control: %d\n" % (c1)
            tagsinfo += "# maximum duplicate tags at the same position in control = %d\n" % (control_max_dup_tags)
            
            info("#1  Redundant rate of control: %.2f" % (float(c0-c1)/c0))
            tagsinfo += "# Redundant rate in control: %.2f\n" % (float(c0-c1)/c0)
        else:
            c1 = control.total
        
            
    info("#1 finished!")

    #2 Build Model
    info("#2 Build Peak Model...")

    if options.nomodel:
        info("#2 Skipped...")
        if options.format == 'BAMPE':
            options.shiftsize = 0
            options.d = options.tsize
        else:
            options.d=options.shiftsize*2
        info("#2 Use %d as shiftsize, %d as fragment length" % (options.shiftsize,options.d))
        options.scanwindow=2*options.d  # remove the effect of --bw
    else:
        try:
            peakmodel = PeakModel(treatment = treat,
                                  max_pairnum = MAX_PAIRNUM,
                                  opt = options
                                  )
            info("#2 finished!")
            debug("#2  Summary Model:")
            debug("#2   min_tags: %d" % (peakmodel.min_tags))
            debug("#2   d: %d" % (peakmodel.d))
            debug("#2   scan_window: %d" % (peakmodel.scan_window))
            info("#2 predicted fragment length is %d bps" % peakmodel.d)
            info("#2 alternative fragment length(s) may be %s bps" % ','.join(map(str,peakmodel.alternative_d)))
            info("#2.2 Generate R script for model : %s" % (options.modelR))
            model2r_script(peakmodel,options.modelR,options.name)
            options.d = peakmodel.d
            options.scanwindow= 2*options.d
            if options.d <= 2*options.tsize:
                warn("#2 Since the d (%.0f) calculated from paired-peaks are smaller than 2*tag length, it may be influenced by unknown sequencing problem!" % (options.d))
                if options.onauto:
                    options.d=options.shiftsize*2
                    options.scanwindow=2*options.d 
                    warn("#2 MACS will use %d as shiftsize, %d as fragment length. NOTE: if the d calculated is still acceptable, please do not use --fix-bimodal option!" % (options.shiftsize,options.d))
                else:
                    warn("#2 You may need to consider one of the other alternative d(s): %s" %  ','.join(map(str,peakmodel.alternative_d)))
                    warn("#2 You can restart the process with --nomodel --shiftsize XXX with your choice or an arbitrary number. Nontheless, MACS will continute computing.")
                
        except NotEnoughPairsException:
            if not options.onauto:
                sys.exit(1)
            warn("#2 Skipped...")
            options.d=options.shiftsize*2
            options.scanwindow=2*options.d 
            warn("#2 Since --fix-bimodal is set, MACS will use %d as shiftsize, %d as fragment length" % (options.shiftsize,options.d))


    #3 Call Peaks
    info("#3 Call peaks...")
    if options.nolambda:
        info("# local lambda is disabled!")

    # decide options.tocontrol according to options.tolarge
    if control:
        if options.downsample:
            # use random sampling to balance treatment and control
            info("#3 User prefers to use random sampling instead of linear scaling.")
            if t1 > c1:
                info("#3 MACS is random sampling treatment tags...")
                warn("#3 Your results may not be reproducible due to the random sampling!")
                treat.sample_num(c1)
                info("#3 %d tags from treatment are kept" % treat.total)                
            elif c1 > t1: 
                info("#3 MACS is random sampling control tags...")
                warn("#3 Your results may not be reproducible due to the random sampling!")                
                control.sample_num(t1)
                info("#3 %d tags from control are kept" % control.total)
            # set options.tocontrol although it would;t matter now
            options.tocontrol = False
        else:
            if options.tolarge:
                if t1 > c1:
                    # treatment has more tags than control, since tolarge is
                    # true, we will scale control to treatment.
                    options.tocontrol = False
                else:
                    # treatment has less tags than control, since tolarge is
                    # true, we will scale treatment to control.
                    options.tocontrol = True
            else:
                if t1 > c1:
                    # treatment has more tags than control, since tolarge is
                    # false, we will scale treatment to control.
                    options.tocontrol = True
                else:
                    # treatment has less tags than control, since tolarge is
                    # false, we will scale control to treatment.
                    options.tocontrol = False

    peakdetect = PeakDetect(treat = treat,
                            control = control,
                            opt = options
                            )
    peakdetect.call_peaks()
    #diag_result = peakdetect.diag_result()
    #4 output
    #4.1 peaks in XLS
    info("#4 Write output xls file... %s" % (options.peakxls))
    ofhd_xls = open(options.peakxls,"w")
    ofhd_xls.write("# This file is generated by MACS version %s\n" % (MACS_VERSION))
    ofhd_xls.write(options.argtxt+"\n")

    ofhd_xls.write(tagsinfo)

    ofhd_xls.write("# d = %d\n" % (options.d))
    try:
        ofhd_xls.write("# alternative fragment length(s) may be %s bps\n" % ','.join(map(str,peakmodel.alternative_d)))
    except:
        # when --nomodel is used, there is no peakmodel object. Simply skip this line.
        pass
    if options.nolambda:
        ofhd_xls.write("# local lambda is disabled!\n")
    peakdetect.toxls(ofhd_xls, name = options.name)
    ofhd_xls.close()
    #4.2 peaks in BED
    if options.log_pvalue:
        score_column = "pscore"
    elif options.log_qvalue:
        score_column = "qscore"
    info("#4 Write peak bed file... %s" % (options.peakbed))
    ofhd_bed = open(options.peakbed,"w")
    peakdetect.peaks.write_to_bed (ofhd_bed, name_prefix="%s_peak_", name = options.name, description="Peaks for %s (Made with MACS v2, " + strftime("%x") + ")", score_column=score_column, trackline=options.trackline)
    ofhd_bed.close()
    #4.2 peaks in narrowPeak
    info("#4 Write peak in narrowPeak format file... %s" % (options.peakNarrowPeak))
    ofhd_bed = open(options.peakNarrowPeak,"w")
    peakdetect.peaks.write_to_narrowPeak (ofhd_bed, name_prefix="%s_peak_", name=options.name, score_column=score_column, trackline=options.trackline )
    ofhd_bed.close()
    #4.2 broad peaks in bed12
    if options.broad:
        info("#4 Write broad peak in bed12 format file... %s" % (options.peakBroadPeak))
        ofhd_bed = open(options.peakBroadPeak,"w")
        peakdetect.broadpeaks.write_to_gappedPeak (ofhd_bed, name_prefix="%s_peak_", name=options.name, description=options.name, trackline=options.trackline)
        ofhd_bed.close()
    #4.2-2 summits in BED
    info("#4 Write summits bed file... %s" % (options.summitbed))
    ofhd_summits = open(options.summitbed,"w")
    peakdetect.peaks.write_to_summit_bed (ofhd_summits, name_prefix="%s_peak_", name=options.name,
                                          description="Summits for %s (Made with MACS v2, " + strftime("%x") + ")",
                                          score_column=score_column, trackline=options.trackline )
    ofhd_summits.close()