Beispiel #1
0
def mapper(options, line):
    """Call peaks on the last BED interval found in ``line`` and print the result.

    :param options: options object; this function reads ``premRNA``, ``bam``,
        ``margin``, ``FDR_alpha``, ``threshold``, ``minreads``,
        ``poisson_cutoff``, ``plotit`` and ``SloP`` from it.
    :param line: BED-formatted text, parsed with
        ``pybedtools.BedTool(line, from_string=True)``.
    :raises ValueError: if ``line`` contains no interval, or if the block-size
        field cannot be parsed as comma-separated integers.
    """
    bedtool = pybedtools.BedTool(line, from_string=True)

    # Iterate to the end so that ``bedline`` is bound to the last interval;
    # start from None so an empty input is reported clearly instead of
    # failing later with an unbound local.
    bedline = None
    for bedline in bedtool:
        pass
    if bedline is None:
        raise ValueError("no BED interval found in input: %r" % (line,))

    if options.premRNA:
        length = bedline.stop - bedline.start
    else:
        # Field 10 holds the comma-separated exon (block) lengths. The list may
        # or may not end with a comma, so strip an optional trailing comma
        # rather than blindly dropping the last character.
        block_sizes = bedline[10].strip().rstrip(",")
        length = sum(int(size) for size in block_sizes.split(","))

    # ``print(x)`` with a single argument behaves the same on Python 2 and 3.
    # The literals None, 10, 1000 and False are forwarded positionally to
    # call_peaks; their meaning is defined by call_peaks, not visible here.
    print(call_peaks(
        [bedline.chrom, bedline.name, bedline.start, bedline.stop, bedline.strand],
        length,
        None,
        options.bam,
        int(options.margin),
        options.FDR_alpha,
        options.threshold,
        int(options.minreads),
        options.poisson_cutoff,
        options.plotit,
        10,
        1000,
        options.SloP,
        False,
    ))
Beispiel #2
0
def mapper(options, line):
    """Call peaks on the last BED interval found in ``line`` and print the result.

    :param options: options object; this function reads ``premRNA``, ``bam``,
        ``max_gap``, ``FDR_alpha``, ``threshold``, ``minreads``,
        ``poisson_cutoff``, ``plotit`` and ``SloP`` from it.
    :param line: BED-formatted text, parsed with
        ``pybedtools.BedTool(line, from_string=True)``.
    :raises ValueError: if ``line`` contains no interval, or if the block-size
        field cannot be parsed as comma-separated integers.
    """
    bedtool = pybedtools.BedTool(line, from_string=True)

    # Iterate to the end so that ``bedline`` is bound to the last interval;
    # start from None so an empty input is reported clearly instead of
    # failing later with an unbound local.
    bedline = None
    for bedline in bedtool:
        pass
    if bedline is None:
        raise ValueError("no BED interval found in input: %r" % (line,))

    if options.premRNA:
        length = bedline.stop - bedline.start
    else:
        # Field 10 holds the comma-separated exon (block) lengths. The list may
        # or may not end with a comma, so strip an optional trailing comma
        # rather than blindly dropping the last character.
        block_sizes = bedline[10].strip().rstrip(",")
        length = sum(int(size) for size in block_sizes.split(","))

    # ``print(x)`` with a single argument behaves the same on Python 2 and 3.
    # The literals 10, 1000 and False are forwarded positionally to
    # call_peaks; their meaning is defined by call_peaks, not visible here.
    print(call_peaks(
        [bedline.chrom, bedline.name, bedline.start, bedline.stop, bedline.strand],
        length,
        options.bam,
        int(options.max_gap),
        options.FDR_alpha,
        options.threshold,
        int(options.minreads),
        options.poisson_cutoff,
        options.plotit,
        10,
        1000,
        options.SloP,
        False,
    ))
Beispiel #3
0
def main(options):
    """Run peak calling for every selected gene and write the filtered peaks.

    The function checks the BAM file and its index, builds the per-gene
    intervals, optionally restricts them by gene id and by a maximum gene
    count, runs ``call_peaks`` once per interval (serially when
    ``options.debug`` is set, otherwise through a multiprocessing pool),
    filters the resulting peaks and writes the outputs.

    Outputs written:
      * ``<options.outfileF>.tsv`` -- the filtered peaks as returned by
        ``filter_peaks_dicts``;
      * ``<options.outfileF>`` -- the same peaks, sorted, as a BED file;
      * ``<options.outfileF>.pickle`` -- the unfiltered ``peaks_dicts`` list,
        only when ``options.save_pickle is True``.

    :param options: options object. ``options.np`` is overwritten with the CPU
        count when it equals ``'autodetect'``.
    :raises IOError: if ``options.bam`` does not exist on disk.
    """
    check_for_index(options.bam)

    if options.np == 'autodetect':
        options.np = multiprocessing.cpu_count()

    bamfile = options.bam
    if not os.path.exists(bamfile):
        message = "Bam file: %s is not defined" % (bamfile)
        logging.error(message)
        raise IOError(message)
    # re-set to include the full path to bamfile
    bamfile = os.path.abspath(bamfile)
    logging.info("bam file is set to %s\n" % (bamfile))

    if options.gtfFile:
        # TODO always False - no longer an option
        bedtool = build_transcript_data_gtf(
            pybedtools.BedTool(options.gtfFile), options.premRNA)
    else:
        bedtool = build_transcript_data_gtf_as_structure(
            options.species, options.premRNA)
    # saveas() returns a new file-backed BedTool; keep it instead of
    # discarding the result.
    bedtool = bedtool.saveas()

    # gets a bedtool of all genes to call peaks on
    if options.gene:
        # filter() gives a streaming BedTool; materialise it so that the
        # len() below does not exhaust it before the intervals are used.
        bedtool = bedtool.filter(
            lambda x: x.attrs['gene_id'] in options.gene).saveas()

    # options.maxgenes   # truncates for max bedtool
    if options.maxgenes:
        # Convert once so the comparison is numeric even if the option
        # arrived as a string.
        max_genes = int(options.maxgenes)
        gene_count = len(bedtool)
        logging.info(" number of genes before maxing : {}".format(gene_count))
        logging.info(" max genes from user input: {}".format(options.maxgenes))
        if max_genes < gene_count:
            bedtool = bedtool.random_subset(max_genes)
        else:
            logging.info(
                " number of genes <= max genes from user , not truncating genes"
            )

    exons = get_exon_bed(options.species)

    bedtool = bedtool.saveas()

    # One positional-argument tuple per interval, in the order call_peaks
    # expects. The literals 10 and 1000 are forwarded unchanged.
    tasks = [
        (
            bedtool_interval,
            bedtool_interval.attrs['effective_length'],
            bamfile,
            options.max_gap,
            options.FDR_alpha,
            options.threshold,
            options.binom,
            options.method,
            options.minreads,
            options.poisson_cutoff,
            options.plotit,
            10,
            1000,
            options.SloP,
            options.max_width,
            options.min_width,
            options.algorithm,
            # TODO options.algorithm now always "spline" !
            options.reverse_strand,
            exons) for bedtool_interval in bedtool
    ]

    # generate list of all peaks_dict's, (one peaks_dict per gene)
    peaks_dicts = []
    if options.debug:
        # Serial execution in this process; no worker pool is needed.
        peaks_dicts = [call_peaks(*task) for task in tasks]
    else:
        pool = multiprocessing.Pool(int(options.np))
        try:
            jobs = [pool.apply_async(call_peaks, task) for task in tasks]

            for job, task in zip(jobs, tasks):
                try:
                    peaks_dicts.append(job.get(timeout=options.timeout))
                except multiprocessing.TimeoutError:
                    # A timed-out gene is logged and skipped; it contributes
                    # no entry to peaks_dicts.
                    print()
                    logging.error(
                        "gene %s timed out after %s minutes on bedinterval: %s" %
                        (task[0].attrs['gene_id'], options.timeout / 60, task[0]))
        finally:
            # Close the pool even if a job raised something other than a
            # timeout.
            pool.close()

    logging.info("finished call_peaks on all genes")

    logging.info(" starting adding up transcriptome-wise reads")
    transcriptome_reads = count_transcriptome_reads(peaks_dicts)
    transcriptome_size = count_transcriptome_length(peaks_dicts)
    logging.info(" transcriptome size in bases: {}".format(transcriptome_size))
    logging.info(
        " transcriptome total number of reads: {}".format(transcriptome_reads))

    filtered_peak_bedtool_tsv = filter_peaks_dicts(peaks_dicts,
                                                   options.poisson_cutoff,
                                                   transcriptome_size,
                                                   transcriptome_reads,
                                                   options.use_global_cutoff,
                                                   options.bonferroni_correct,
                                                   options.algorithm,
                                                   options.SloP,
                                                   options.min_width,
                                                   bypassfiltering=False)

    ###############
    # writing files
    ###############
    outbedF = options.outfileF

    # writing tsv files
    with open(outbedF + ".tsv", 'w') as tsvfile:
        tsvfile.write(filtered_peak_bedtool_tsv)

    # writing bed files
    pybedtools.BedTool(filtered_peak_bedtool_tsv,
                       from_string=True).sort(stream=True).saveas(outbedF)

    # writing pickle files
    if options.save_pickle is True:
        # pickle writes bytes, so the file must be opened in binary mode.
        # TODO Can't pickle save after filtering ? as we have a tsv now, not a peaks_dicts list !?
        with open(outbedF + ".pickle", 'wb') as picklefile:
            pickle.dump(peaks_dicts, picklefile)
Beispiel #4
0
def func_star(varables):
    """Unpack ``varables`` and pass its items to ``call_peaks`` positionally.

    Turns a single-argument call ``f([1, 2])`` into ``f(1, 2)``, for callers
    that can only supply one argument per call.

    :param varables: iterable of positional arguments for ``call_peaks``.
    :return: whatever ``call_peaks`` returns.
    """
    result = call_peaks(*varables)
    return result
Beispiel #5
0
def main(options):
    """
    Run the whole pipeline

    The function checks the BAM file and its index, builds the per-gene
    intervals, optionally restricts them by gene id and by a maximum gene
    count, runs ``call_peaks`` once per interval (serially when
    ``options.debug`` is set, otherwise through a multiprocessing pool),
    filters the resulting peaks and writes them, sorted, to
    ``options.outfileF``. When ``options.save_pickle is True`` the unfiltered
    ``peaks_dicts`` list is also pickled to ``<options.outfileF>.pickle``.

    :param options: options object. ``options.np`` is overwritten with the CPU
        count when it equals ``'autodetect'``.
    :raises IOError: if ``options.bam`` does not exist on disk.
    :raises Warning: if no interval is left after gene selection.
    :rtype: None
    """
    ############ CHECKING FILE STATUS ############
    check_for_index(options.bam)
    bamfile = options.bam

    if not os.path.exists(bamfile):
        message = "Bam file: %s is not defined" % (bamfile)
        logging.error(message)
        raise IOError(message)
    # re-set to include the full path to bamfile
    bamfile = os.path.abspath(bamfile)
    logging.info("bam file is set to %s\n" % (bamfile))

    ########### PREPARE GENE LENGTH ################
    bedtool = build_transcript_data_gtf_as_structure(options.species, options.premRNA).saveas()

    # gets a bedtool of all genes to call peaks on
    if options.gene:
        # saveas() materialises the filtered result so that the len() calls
        # below do not consume it.
        bedtool = bedtool.filter(lambda x: x.attrs['gene_id'].split('.')[0] in options.gene).saveas()

    # options.maxgenes   # truncates for max bedtool
    if options.maxgenes:
        # Convert once so the comparison is numeric even if the option
        # arrived as a string.
        max_genes = int(options.maxgenes)
        gene_count = len(bedtool)
        logging.info(" number of genes before maxing : {}".format(gene_count))
        logging.info(" max genes from user input: {}".format(options.maxgenes))
        if max_genes < gene_count:
            bedtool = bedtool.random_subset(max_genes).saveas()
        else:
            logging.info(" number of genes <= max genes from user , not truncating genes")

    if len(bedtool) == 0:
        raise Warning('Bedtool length is 0; check gene id')

    exons = get_exon_bed(options.species)

    ############### PREPARE MULTIPROCESSING ##############################
    if options.np == 'autodetect':
        options.np = multiprocessing.cpu_count()

    # One positional-argument tuple per interval, in the order call_peaks
    # expects. The literals 10 and 1000 are forwarded unchanged.
    tasks = [(bedtool_interval, bedtool_interval.attrs['effective_length'], bamfile, options.max_gap, options.FDR_alpha,
              options.threshold, options.binom, options.method, options.minreads, options.poisson_cutoff,
              options.plotit, 10, 1000, options.SloP, options.max_width,
              options.min_width, options.algorithm,
              # TODO options.algorithm now always "spline" !
              options.reverse_strand, exons
              )
             for bedtool_interval in bedtool]

    logging.info('Total tasks: {}'.format(len(tasks)))

    ############## CALL PEAKS BY HEIGHT AND CURVE##########################
    # generate list of all peaks_dict's, (one peaks_dict per gene)
    peaks_dicts = []

    if options.debug:
        # Serial execution in this process; no worker pool is needed.
        peaks_dicts = [call_peaks(*task) for task in tasks]
    else:
        pool = multiprocessing.Pool(int(options.np))
        try:
            jobs = [pool.apply_async(call_peaks, task) for task in tasks]

            for job, task in zip(jobs, tasks):
                try:
                    peaks_dicts.append(job.get(timeout=options.timeout))
                except multiprocessing.TimeoutError:
                    # A timed-out gene is logged and skipped; it contributes
                    # no entry to peaks_dicts.
                    print()

                    logging.error("gene %s timed out after %s minutes on bedinterval: %s" % (
                        task[0].attrs['gene_id'], options.timeout / 60, task[0]))
        finally:
            # Close the pool even if a job raised something other than a
            # timeout.
            pool.close()

    logging.info("finished call_peaks on all genes")

    ################### FILTER PEAK BY READ #################################
    logging.info(" starting adding up transcriptome-wise reads")

    transcriptome_reads = count_transcriptome_reads(peaks_dicts)
    transcriptome_size = count_transcriptome_length(peaks_dicts)

    logging.info(" transcriptome size in bases: {}".format(transcriptome_size))
    logging.info(" transcriptome total number of reads: {}".format(transcriptome_reads))

    filtered_peak_bedtool_tsv = filter_peaks_dicts(peaks_dicts,
                                                   options.poisson_cutoff,
                                                   transcriptome_size,
                                                   transcriptome_reads,
                                                   options.use_global_cutoff,
                                                   options.bonferroni_correct,
                                                   options.algorithm,
                                                   options.SloP,
                                                   options.min_width,
                                                   bypassfiltering=False)

    ############### WRITE TO FILE #####################################
    # The BED file is only written when the filter step returned text.
    if isinstance(filtered_peak_bedtool_tsv, str):
        pybedtools.BedTool(filtered_peak_bedtool_tsv, from_string=True).sort(stream=True).saveas(options.outfileF)

    if options.save_pickle is True:
        # pickle writes bytes, so the file must be opened in binary mode.
        # TODO Can't pickle save after filtering ? as we have a tsv now, not a peaks_dicts list !?
        with open(options.outfileF + ".pickle", 'wb') as f:
            pickle.dump(peaks_dicts, f)
Beispiel #6
0
def func_star(varables):
    """Unpack ``varables`` and pass its items to ``call_peaks`` positionally.

    Turns a single-argument call ``f([1, 2])`` into ``f(1, 2)``, for callers
    that can only supply one argument per call.

    :param varables: iterable of positional arguments for ``call_peaks``.
    :return: whatever ``call_peaks`` returns.
    """
    result = call_peaks(*varables)
    return result