Ejemplo n.º 1
0
def augustus_pb(args, toil_options):
    """
    Main entry function for AugustusPB toil pipeline
    :param args: dictionary of arguments from CAT
    :param toil_options: toil options Namespace object
    :return:
    """
    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.genome_fasta = tools.toilInterface.write_fasta_to_filestore(
                t, args.genome_fasta)
            input_file_ids.chrom_sizes = FileID.forPath(
                t.importFile('file://' + args.chrom_sizes), args.chrom_sizes)
            input_file_ids.pb_cfg = FileID.forPath(
                t.importFile('file://' + args.pb_cfg), args.pb_cfg)
            input_file_ids.hints_gff = FileID.forPath(
                t.importFile('file://' + args.hints_gff), args.hints_gff)
            job = Job.wrapJobFn(setup,
                                args,
                                input_file_ids,
                                memory='16G',
                                disk='32G')
            raw_gtf_file_id, gtf_file_id, joined_gp_file_id = t.start(job)
        else:
            raw_gtf_file_id, gtf_file_id, joined_gp_file_id = t.restart()
        tools.fileOps.ensure_file_dir(args.augustus_pb_raw_gtf)
        t.exportFile(raw_gtf_file_id, 'file://' + args.augustus_pb_raw_gtf)
        t.exportFile(gtf_file_id, 'file://' + args.augustus_pb_gtf)
        t.exportFile(joined_gp_file_id, 'file://' + args.augustus_pb_gp)
def align_transcripts(args, toil_options):
    """
    Main entry function for transcript alignment toil pipeline
    :param args: dictionary of arguments from CAT
    :param toil_options: toil options Namespace object
    """
    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.ref_genome_fasta = tools.toilInterface.write_fasta_to_filestore(t, args.ref_genome_fasta)
            input_file_ids.genome_fasta = tools.toilInterface.write_fasta_to_filestore(t, args.genome_fasta)
            input_file_ids.annotation_gp = FileID.forPath(t.importFile('file://' + args.annotation_gp),
                                                          args.annotation_gp)
            input_file_ids.ref_db = FileID.forPath(t.importFile('file://' + args.ref_db_path), args.ref_db_path)
            input_file_ids.modes = {}
            file_ids = [input_file_ids.ref_genome_fasta, input_file_ids.genome_fasta, input_file_ids.annotation_gp,
                        input_file_ids.ref_db]
            for mode in args.transcript_modes:
                input_file_ids.modes[mode] = t.importFile('file://' + args.transcript_modes[mode]['gp'])
                file_ids.append(input_file_ids.modes[mode])
            disk_usage = tools.toilInterface.find_total_disk_usage(file_ids)
            job = Job.wrapJobFn(setup, args, input_file_ids, memory='16G', disk=disk_usage)
            results_file_ids = t.start(job)
        else:
            results_file_ids = t.restart()
        for file_path, file_id in results_file_ids.iteritems():
            tools.fileOps.ensure_file_dir(file_path)
            t.exportFile(file_id, 'file://' + file_path)
Ejemplo n.º 3
0
def hints_db(hints_args, toil_options):
    """
    Entry point for hints database Toil pipeline.
    """
    def validate_import_bam(t, bam_path, fasta_sequences, genome):
        validate_bam_fasta_pairs(bam_path, fasta_sequences, genome)
        return [FileID.forPath(t.importFile('file://' + bam_path), bam_path),
                FileID.forPath(t.importFile('file://' + bam_path + '.bai'), bam_path + '.bai')]

    fasta = pyfasta.Fasta(hints_args.fasta)
    fasta_sequences = {(x.split()[0], len(fasta[x])) for x in fasta.keys()}
    with Toil(toil_options) as t:
        if not t.options.restart:
            # load the RNA-seq data, if we have any
            bam_file_ids = {'BAM': {}, 'INTRONBAM': {}}
            for dtype in ['BAM', 'INTRONBAM']:
                if hints_args.genome not in hints_args.cfg[dtype]:
                    continue
                for bam_path in hints_args.cfg[dtype][hints_args.genome]:
                    bam_file_ids[dtype][os.path.basename(bam_path)] = validate_import_bam(t, bam_path,
                                                                                          fasta_sequences,
                                                                                          hints_args.genome)

            # load the IsoSeq data, if we have any
            iso_seq_file_ids = []
            if hints_args.genome in hints_args.cfg['ISO_SEQ_BAM']:
                for bam_path in hints_args.cfg['ISO_SEQ_BAM'][hints_args.genome]:
                    validate_bam_fasta_pairs(bam_path, fasta_sequences, hints_args.genome)
                    iso_seq_file_ids.append(validate_import_bam(t, bam_path, fasta_sequences, hints_args.genome))

            if hints_args.annotation is None:
                annotation_file_id = None
            else:
                annotation_file_id = FileID.forPath(t.importFile('file://' + hints_args.annotation),
                                                    hints_args.annotation)
            if hints_args.protein_fasta is None:
                protein_fasta_file_id = genome_fasta_file_id = None
            else:
                protein_fasta_file_id = FileID.forPath(t.importFile('file://' + hints_args.protein_fasta),
                                                       hints_args.protein_fasta)
                genome_fasta_file_id = FileID.forPath(t.importFile('file://' + hints_args.fasta), hints_args.fasta)

            input_file_ids = {'bams': bam_file_ids,
                              'iso_seq_bams': iso_seq_file_ids,
                              'annotation': annotation_file_id,
                              'protein_fasta': protein_fasta_file_id,
                              'genome_fasta': genome_fasta_file_id}
            if len(input_file_ids['bams']) + len(input_file_ids['iso_seq_bams']) > 0:
                logger.info('All BAMs validated for {}. Beginning Toil hints pipeline'.format(hints_args.genome))

            disk_usage = tools.toilInterface.find_total_disk_usage(input_file_ids)
            job = Job.wrapJobFn(setup_hints, input_file_ids, disk=disk_usage)
            combined_hints = t.start(job)
        else:
            logger.info('Restarting Toil hints pipeline for {}.'.format(hints_args.genome))
            combined_hints = t.restart()
        tools.fileOps.ensure_file_dir(hints_args.hints_path)
        t.exportFile(combined_hints, 'file://' + hints_args.hints_path)
def write_fasta_to_filestore(toil, fasta_local_path):
    """
    Convenience function that loads a fasta and its associated gdx/flat file into the fileStore.
    Assumes that the paths are consistent with the requirements (i.e. $path.gdx and $path.flat)
    :param toil: Toil context manager
    :param fasta_local_path: Path to local fasta to load.
    :return: List of fileStore IDs for fasta, fasta_gdx, fasta_flat
    """
    fasta_file_id = FileID.forPath(toil.importFile('file:///' + fasta_local_path), fasta_local_path)
    gdx_file_id = FileID.forPath(toil.importFile('file:///' + fasta_local_path + '.gdx'), fasta_local_path + '.gdx')
    flat_file_id = FileID.forPath(toil.importFile('file:///' + fasta_local_path + '.flat'), fasta_local_path + '.flat')
    return fasta_file_id, gdx_file_id, flat_file_id
def write_fasta_to_filestore(toil, fasta_local_path):
    """
    Convenience function that loads a fasta and its associated gdx/flat file into the fileStore.
    Assumes that the paths are consistent with the requirements (i.e. $path.gdx and $path.flat)
    :param toil: Toil context manager
    :param fasta_local_path: Path to local fasta to load.
    :return: List of fileStore IDs for fasta, fasta_gdx, fasta_flat
    """
    fasta_file_id = FileID.forPath(
        toil.importFile('file:///' + fasta_local_path), fasta_local_path)
    gdx_file_id = FileID.forPath(
        toil.importFile('file:///' + fasta_local_path + '.gdx'),
        fasta_local_path + '.gdx')
    flat_file_id = FileID.forPath(
        toil.importFile('file:///' + fasta_local_path + '.flat'),
        fasta_local_path + '.flat')
    return fasta_file_id, gdx_file_id, flat_file_id
def augustus(args, coding_gp, toil_options):
    """
    Main entry function for Augustus toil pipeline
    :param args: dictionary of arguments from CAT
    :param coding_gp: genePred with only coding transcripts
    :param toil_options: toil options Namespace object
    """
    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.genome_fasta = tools.toilInterface.write_fasta_to_filestore(t, args.genome_fasta)
            input_file_ids.tm_cfg = FileID.forPath(t.importFile('file://' + args.tm_cfg), args.tm_cfg)
            input_file_ids.coding_gp = FileID.forPath(t.importFile('file://' + coding_gp), coding_gp)
            input_file_ids.ref_psl = FileID.forPath(t.importFile('file://' + args.ref_psl), args.ref_psl)
            input_file_ids.tm_psl = FileID.forPath(t.importFile('file://' + args.filtered_tm_psl), args.filtered_tm_psl)
            input_file_ids.annotation_gp = FileID.forPath(t.importFile('file://' + args.annotation_gp),
                                                          args.annotation_gp)
            file_ids = [input_file_ids.genome_fasta, input_file_ids.coding_gp, input_file_ids.ref_psl,
                        input_file_ids.tm_psl, input_file_ids.annotation_gp]
            if args.augustus_tmr:
                input_file_ids.augustus_hints_db = FileID.forPath(t.importFile('file://' + args.augustus_hints_db),
                                                                  args.augustus_hints_db)
                input_file_ids.tmr_cfg = FileID.forPath(t.importFile('file://' + args.tmr_cfg), args.tmr_cfg)
                file_ids.append(args.augustus_hints_db)
            disk_usage = tools.toilInterface.find_total_disk_usage(file_ids)
            job = Job.wrapJobFn(setup, args, input_file_ids, disk_usage, disk=disk_usage)
            tm_file_id, tmr_file_id = t.start(job)
        else:
            tm_file_id, tmr_file_id = t.restart()
        tools.fileOps.ensure_file_dir(args.augustus_tm_gtf)
        t.exportFile(tm_file_id, 'file://' + args.augustus_tm_gtf)
        if tmr_file_id is not None:
            tools.fileOps.ensure_file_dir(args.augustus_tmr_gtf)
            t.exportFile(tmr_file_id, 'file://' + args.augustus_tmr_gtf)
def chaining(args, toil_options):
    """entry point to this program"""
    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.hal = FileID.forPath(t.importFile('file://' + args.hal), args.hal)
            input_file_ids.query_sizes = FileID.forPath(t.importFile('file://' + args.query_sizes), args.query_sizes)
            input_file_ids.query_two_bit = FileID.forPath(t.importFile('file://' + args.query_two_bit),
                                                          args.query_two_bit)
            target_two_bit_file_ids = {genome: FileID.forPath(t.importFile('file://' + f), f)
                                       for genome, f in args.target_two_bits.iteritems()}
            input_file_ids.target_two_bits = target_two_bit_file_ids
            job = Job.wrapJobFn(setup, args, input_file_ids)
            chain_file_ids = t.start(job)
        else:
            chain_file_ids = t.restart()
        for chain_file, chain_file_id in chain_file_ids.iteritems():
            tools.fileOps.ensure_file_dir(chain_file)
            t.exportFile(chain_file_id, 'file://' + chain_file)
def augustus_pb(args, toil_options):
    """
    Main entry function for AugustusPB toil pipeline
    :param args: dictionary of arguments from CAT
    :param toil_options: toil options Namespace object
    :return:
    """
    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.genome_fasta = tools.toilInterface.write_fasta_to_filestore(t, args.genome_fasta)
            input_file_ids.chrom_sizes = FileID.forPath(t.importFile('file://' + args.chrom_sizes), args.chrom_sizes)
            input_file_ids.pb_cfg = FileID.forPath(t.importFile('file://' + args.pb_cfg), args.pb_cfg)
            input_file_ids.hints_gff = FileID.forPath(t.importFile('file://' + args.hints_gff), args.hints_gff)
            job = Job.wrapJobFn(setup, args, input_file_ids, memory='16G', disk='32G')
            raw_gtf_file_id, gtf_file_id, joined_gp_file_id = t.start(job)
        else:
            raw_gtf_file_id, gtf_file_id, joined_gp_file_id = t.restart()
        tools.fileOps.ensure_file_dir(args.augustus_pb_raw_gtf)
        t.exportFile(raw_gtf_file_id, 'file://' + args.augustus_pb_raw_gtf)
        t.exportFile(gtf_file_id, 'file://' + args.augustus_pb_gtf)
        t.exportFile(joined_gp_file_id, 'file://' + args.augustus_pb_gp)
def augustus_cgp(args, toil_options):
    """
    Main entry function for AugustusCGP toil pipeline
    :param args: dictionary of arguments from CAT
    :param toil_options: toil options Namespace object
    :return:
    """
    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.hal = FileID.forPath(t.importFile('file://' + args.hal), args.hal)
            input_file_ids.chrom_sizes = FileID.forPath(t.importFile('file://' + args.query_sizes), args.query_sizes)
            input_file_ids.hints_db = FileID.forPath(t.importFile('file://' + args.hints_db), args.hints_db)
            if args.cgp_param is not None:
                input_file_ids.cgp_param = FileID.forPath(t.importFile('file://' + args.cgp_param), args.cgp_param)
            else:
                input_file_ids.cgp_param = None
                input_file_ids.gtf = FileID.forPath(t.importFile('file://' + args.gtf), args.gtf)
            input_file_ids.cgp_cfg = FileID.forPath(t.importFile('file://' + args.cgp_cfg), args.cgp_cfg)
            input_file_ids.fasta = {genome: FileID.forPath(t.importFile('file://' + fasta), fasta)
                                    for genome, fasta in args.fasta_files.iteritems()}
            du = tools.toilInterface.find_total_disk_usage([input_file_ids.hints_db], buffer='4G')
            job = Job.wrapJobFn(setup, args, input_file_ids, memory='8G', disk=du)
            results, stdout_file_ids, param_file_id = t.start(job)
        else:
            results, stdout_file_ids, param_file_id = t.restart()
        tools.fileOps.ensure_file_dir(args.stdout_file)
        with open(args.stdout_file, 'w') as outf, tools.fileOps.TemporaryFilePath() as tmp:
            for (chrom, start, chunksize), stdout_file in stdout_file_ids.iteritems():
                outf.write('## BEGIN CHUNK chrom: {} start: {} chunksize: {}\n'.format(chrom, start, chunksize))
                t.exportFile(stdout_file, 'file://' + tmp)
                for l in open(tmp):
                    outf.write(l)
        for genome, (raw_gtf_file_id, joined_gtf_file_id, joined_gp_file_id) in results.iteritems():
            tools.fileOps.ensure_file_dir(args.augustus_cgp_raw_gtf[genome])
            t.exportFile(raw_gtf_file_id, 'file://' + args.augustus_cgp_raw_gtf[genome])
            t.exportFile(joined_gtf_file_id, 'file://' + args.augustus_cgp_gtf[genome])
            t.exportFile(joined_gp_file_id, 'file://' + args.augustus_cgp_gp[genome])
        if args.cgp_param is None:
            t.exportFile(param_file_id, 'file://' + args.param_out_path)
def augustus_cgp(args, toil_options):
    """
    Main entry function for AugustusCGP toil pipeline
    :param args: dictionary of arguments from CAT
    :param toil_options: toil options Namespace object
    :return:
    """
    with Toil(toil_options) as t:
        if not t.options.restart:
            input_file_ids = argparse.Namespace()
            input_file_ids.hal = FileID.forPath(t.importFile('file://' + args.hal), args.hal)
            input_file_ids.chrom_sizes = FileID.forPath(t.importFile('file://' + args.query_sizes), args.query_sizes)
            input_file_ids.hints_db = FileID.forPath(t.importFile('file://' + args.hints_db), args.hints_db)
            if args.cgp_param is not None:
                input_file_ids.cgp_param = FileID.forPath(t.importFile('file://' + args.cgp_param), args.cgp_param)
            else:
                input_file_ids.cgp_param = None
                input_file_ids.gtf = FileID.forPath(t.importFile('file://' + args.gtf), args.gtf)
            input_file_ids.cgp_cfg = FileID.forPath(t.importFile('file://' + args.cgp_cfg), args.cgp_cfg)
            input_file_ids.fasta = {genome: FileID.forPath(t.importFile('file://' + fasta), fasta)
                                    for genome, fasta in args.fasta_files.iteritems()}
            du = tools.toilInterface.find_total_disk_usage([input_file_ids.hints_db], buffer='4G')
            job = Job.wrapJobFn(setup, args, input_file_ids, memory='8G', disk=du)
            results, stdout_file_ids, param_file_id = t.start(job)
        else:
            results, stdout_file_ids, param_file_id = t.restart()
        tools.fileOps.ensure_file_dir(args.stdout_file)
        with open(args.stdout_file, 'w') as outf, tools.fileOps.TemporaryFilePath() as tmp:
            for (chrom, start, chunksize), stdout_file in stdout_file_ids.iteritems():
                outf.write('## BEGIN CHUNK chrom: {} start: {} chunksize: {}\n'.format(chrom, start, chunksize))
                t.exportFile(stdout_file, 'file://' + tmp)
                for l in open(tmp):
                    outf.write(l)
        for genome, (raw_gtf_file_id, joined_gtf_file_id, joined_gp_file_id) in results.iteritems():
            tools.fileOps.ensure_file_dir(args.augustus_cgp_raw_gtf[genome])
            t.exportFile(raw_gtf_file_id, 'file://' + args.augustus_cgp_raw_gtf[genome])
            t.exportFile(joined_gtf_file_id, 'file://' + args.augustus_cgp_gtf[genome])
            t.exportFile(joined_gp_file_id, 'file://' + args.augustus_cgp_gp[genome])
        if args.cgp_param is None:
            t.exportFile(param_file_id, 'file://' + args.param_out_path)
Ejemplo n.º 11
0
 def validate_import_bam(t, bam_path, fasta_sequences, genome):
     validate_bam_fasta_pairs(bam_path, fasta_sequences, genome)
     return [FileID.forPath(t.importFile('file://' + bam_path), bam_path),
             FileID.forPath(t.importFile('file://' + bam_path + '.bai'), bam_path + '.bai')]