Example #1
0
def sort_ubam(ubams):
    """Create one ``picard_sortUbam.sh`` job per unaligned BAM.

    Each input path is wrapped in ``util.File``; the sorted BAM is written to
    the module-level ``tmpdir`` under the input's base name with its
    unaligned-BAM suffix replaced by ``.bam``.

    Args:
        ubams: iterable of unaligned BAM paths.

    Returns:
        list of ``sjm.Job`` objects, one per input, in input order. Each job
        carries the input file in ``job.input`` and the sorted BAM in
        ``job.output``.
    """
    # The original used ``path.rstrip('u.bam')``, which strips any trailing
    # run of the characters {u, ., b, a, m} rather than a suffix, so a name
    # such as "mama.u.bam" collapsed to an empty stem.  Strip one literal
    # suffix instead, longest first.  The list covers the spellings for which
    # the old code happened to produce a sensible "<stem>.bam".
    suffixes = ('.u.bam', '.ubam', 'u.bam', '.bam')

    jobs = []
    for ubam in ubams:
        ubam = util.File(ubam)

        stem = os.path.basename(ubam.path)
        for suffix in suffixes:
            if stem.endswith(suffix):
                stem = stem[:-len(suffix)]
                break
        obam = util.File(os.path.join(tmpdir, stem + '.bam'))

        job = sjm.Job('picard_sortUbam-%s' % ubam.prefix)
        job.memory = "20G"
        job.input = ubam
        job.output = obam
        job.append('picard_sortUbam.sh %s %s' % (job.input, job.output))
        jobs.append(job)
    return jobs
Example #2
0
def align_se(reads1, reads2):
    """Create one ``bwa_aln_se.sh`` job for each entry of ``reads1``.

    ``reads2`` is indexed in lockstep with ``reads1``.  The output BAM lives
    in the module-level ``tmpdir`` and is named after the first read file's
    prefix with every ``[._][Rr]1`` match removed, plus ``.sorted.bam``.

    Returns:
        list of ``sjm.Job`` objects in input order.
    """
    def _build(fq1, fq2):
        # One alignment job for a single (fq1, fq2) pair.
        fq1_file = util.File(fq1)
        # Not referenced again; constructed only because the original flow
        # constructs it too.
        fq2_file = util.File(fq2)
        out_name = re.sub(r'[._][Rr]1', '', fq1_file.prefix) + '.sorted.bam'
        out_bam = util.File(os.path.join(tmpdir, out_name))
        aln = sjm.Job('bwa_aln_se-%s' % fq1_file.prefix)
        aln.output = out_bam
        command = 'bwa_aln_se.sh %s %s %s %s' % (
            aln.output, fq1, fq2, readgroup)
        aln.append(command)
        return aln

    jobs = []
    for idx, fq1 in enumerate(reads1):
        jobs.append(_build(fq1, reads2[idx]))
    return jobs
Example #3
0
def handle_file_thread(path):
    """Forever move items from the queue registered for ``path`` into its file.

    A fresh unbounded ``queue.Queue`` is stored in ``map_file_to_queue`` under
    ``path``; every item taken from it is appended to the ``util.File`` for
    ``path``, which is then saved.  The loop has no exit condition, so this
    function never returns normally.
    """
    target = util.File(path)
    map_file_to_queue[path] = queue.Queue(0)
    while True:
        # The queue is looked up on every pass, exactly as before, so a
        # replacement stored in the map by other code is picked up.
        item = map_file_to_queue[path].get()
        target.append(item)
        target.save()
def gatk_mvcf(pjobs, vcfout):
    """Create the single ``gatk_catvcf.sh`` job that concatenates upstream VCFs.

    Args:
        pjobs: upstream jobs; each ``output`` is passed to the script and the
            new job depends on all of them.
        vcfout: file name of the combined VCF, placed in ``outdir``.

    Returns:
        The concatenation ``sjm.Job``.

    NOTE(review): ``bamfile`` is not a parameter; it is resolved from an
    outer scope that is not visible here -- confirm it is defined before
    this function is called.
    """
    inputs = []
    for upstream in pjobs:
        inputs.append(upstream.output)

    cat_job = sjm.Job('gatk_CatVCF-%s' % (bamfile.prefix))
    cat_job.memory = "10G"
    cat_job.output = util.File(os.path.join(outdir, vcfout))
    command = 'gatk_catvcf.sh %s %s' % (cat_job.output, ' '.join(inputs))
    cat_job.append(command)
    cat_job.depend(*pjobs)
    return cat_job
Example #5
0
def align_pe(reads1, reads2):
    """Create one ``bwa_aln_pe.sh`` job for each read-file pair.

    ``reads2`` is indexed in lockstep with ``reads1``.  The output BAM is
    placed in the module-level ``tmpdir`` and named after the first read
    file: for gzipped inputs a trailing FASTQ extension is dropped from the
    prefix, every ``[._][Rr]1`` match is removed, and ``.sorted.bam`` is
    appended.

    Args:
        reads1: sequence of first-in-pair read file paths.
        reads2: sequence of second-in-pair read file paths, same order.

    Returns:
        list of ``sjm.Job`` objects in input order.
    """
    # The original called ``prefix.rstrip('.fastq')``, which strips any
    # trailing run of the characters {., f, a, s, t, q} instead of the
    # extension, so a stem ending in one of those letters was truncated.
    # Remove one literal extension instead.  '.fq' is listed because the old
    # character-set strip also removed it.
    fastq_exts = ('.fastq', '.fq')

    jobs = []
    for i, read1 in enumerate(reads1):
        read2 = reads2[i]
        readfile1 = util.File(read1)
        # Not referenced again; kept because util.File's constructor is not
        # visible here and may do more than wrap the path.
        readfile2 = util.File(read2)

        stem = readfile1.prefix
        if readfile1.path.endswith('.gz'):
            # presumably prefix still carries ".fastq" for "x.fastq.gz" --
            # TODO confirm against util.File
            for ext in fastq_exts:
                if stem.endswith(ext):
                    stem = stem[:-len(ext)]
                    break
        bamname = re.sub(r'[._][Rr]1', '', stem) + '.sorted.bam'

        bam = util.File(os.path.join(tmpdir, bamname))
        job = sjm.Job('bwa_aln_pe-%s' % readfile1.prefix)
        job.output = bam
        job.memory = "15G"
        job.append('bwa_aln_pe.sh %s %s %s %s' %
                   (job.output, read1, read2, readgroup))
        jobs.append(job)
    return jobs
Example #6
0
def sam_flagstat(pjobs):
    """Create one ``samtools flagstat`` job per upstream job.

    Each new job reads the upstream job's output BAM, redirects the report to
    the path returned by ``chext("flagstat.txt")`` on that BAM, and depends
    on the upstream job.

    Returns:
        list of ``sjm.Job`` objects in input order.
    """
    def _flagstat_for(parent):
        # Build the flagstat job for a single upstream job.
        source = util.File(parent.output)
        stat = sjm.Job('samtools-flagstat-%s' % source.prefix)
        stat.memory = "10G"
        stat.output = source.chext("flagstat.txt")
        stat.append('samtools flagstat %s > %s' % (source, stat.output))
        stat.depend(parent)
        return stat

    return [_flagstat_for(parent) for parent in pjobs]
def gatk_joint(pjobs):
    """Create the joint-genotyping job over all upstream gVCFs.

    The output VCF is ``args.output`` inside ``args.outdir``.  The job runs
    ``gatk_gt_joint.sh`` on the ``output.path`` of every upstream job and
    depends on all of them.

    Returns:
        A one-element list holding the joint-genotyping ``sjm.Job``.
    """
    inputs = []
    for upstream in pjobs:
        inputs.append(upstream.output.path)

    target = util.File(os.path.join(args.outdir, args.output))

    joint = sjm.Job('GATK-joint-gt-%s' % target.name)
    joint.memory = "20G"
    joint.output = target
    command = 'gatk_gt_joint.sh %s %s' % (joint.output, ' '.join(inputs))
    joint.append(command)
    joint.depend(*pjobs)
    return [joint]
Example #8
0
def dedup_merge(pjobs, outbam):
    """Create the single ``picard_mdup.sh`` job over all upstream BAMs.

    Args:
        pjobs: upstream jobs; the ``output.path`` of each is passed to the
            script and the new job depends on all of them.
        outbam: file name of the resulting BAM, placed in ``outdir``.

    Returns:
        A one-element list holding the ``sjm.Job``.
    """
    inputs = [upstream.output.path for upstream in pjobs]

    mdup = sjm.Job('picard_mdup-%s' % outbam)
    mdup.memory = "20G"
    mdup.output = util.File(os.path.join(outdir, outbam))
    command = 'picard_mdup.sh %s %s' % (mdup.output, ' '.join(inputs))
    mdup.append(command)
    mdup.depend(*pjobs)
    return [mdup]
Example #9
0
def gatk_hc(pjobs):
    """Create one ``gatk_hc.sh`` job per upstream job.

    Each job writes ``<bam prefix>.g.vcf.gz`` into the module-level
    ``tmpdir``, copies ``regions`` from its upstream job, passes those
    regions to the script, and depends on the upstream job.

    Returns:
        list of ``sjm.Job`` objects in input order.
    """
    def _caller_for(parent):
        # Build the HaplotypeCaller job for a single upstream job.
        source = util.File(parent.output)
        caller = sjm.Job('gatk_haplotypecaller-%s' % (source.prefix))
        caller.memory = "40G"
        out_name = '%s.%s' % (source.prefix, 'g.vcf.gz')
        caller.output = os.path.join(tmpdir, out_name)
        caller.regions = parent.regions
        command = 'gatk_hc.sh %s %s %s' % (
            caller.output, source.path, parent.regions)
        caller.append(command)
        caller.depend(parent)
        return caller

    return [_caller_for(parent) for parent in pjobs]
Example #10
0
def gatk_gt(pjobs):
    """Create one ``gatk_gt.sh`` job per upstream job.

    The upstream output is opened with ``iszipfile=True``.  Each job writes
    ``<gvcf prefix>.gt.vcf.gz`` into the module-level ``tmpdir``, copies
    ``regions`` from its upstream job, passes those regions to the script,
    and depends on the upstream job.

    Returns:
        list of ``sjm.Job`` objects in input order.
    """
    result = []
    for parent in pjobs:
        source = util.File(parent.output, iszipfile=True)
        genotype = sjm.Job('gatk_genotypeGVCFs-%s' % (source.prefix))
        genotype.memory = "15G"
        out_name = '%s.%s' % (source.prefix, 'gt.vcf.gz')
        genotype.output = os.path.join(tmpdir, out_name)
        genotype.regions = parent.regions
        command = 'gatk_gt.sh %s %s %s' % (
            genotype.output, source.path, parent.regions)
        genotype.append(command)
        genotype.depend(parent)
        result.append(genotype)
    return result
Example #11
0
def merge_aln(pjobs):
    """Create one ``picard_mergeBam.sh`` job per upstream alignment job.

    For each upstream job, ``output`` is the aligned BAM and ``input`` the
    unaligned BAM.  The merged result is written next to the aligned BAM with
    its ``.aln.bam`` suffix replaced by ``.sort.bam``.

    Args:
        pjobs: upstream jobs exposing ``output`` (with ``path`` and ``name``)
            and ``input``.

    Returns:
        list of ``sjm.Job`` objects in input order, each depending on its
        upstream job.
    """
    # The original used ``path.rstrip('.aln.bam')``, which strips any
    # trailing run of the characters {., a, l, n, b, m} rather than the
    # suffix: "normal.aln.bam" became "nor".  Strip one literal suffix
    # instead.  '.bam' is a fallback for inputs without the ".aln" marker,
    # which the old code also reduced to their stem.
    suffixes = ('.aln.bam', '.bam')

    jobs = []
    for pjob in pjobs:
        alnbam = pjob.output
        ubam = pjob.input

        stem = alnbam.path
        for suffix in suffixes:
            if stem.endswith(suffix):
                stem = stem[:-len(suffix)]
                break

        job = sjm.Job('picard_mergeBam-%s' % alnbam.name)
        job.memory = "10G"
        job.output = util.File(stem + '.sort.bam')
        job.append('picard_mergeBam.sh %s %s %s' % (job.output, alnbam, ubam))
        job.depend(pjob)
        jobs.append(job)
    return jobs
Example #12
0
def gatk_recal(pjobs):
    """Create one ``gatk_recal.sh`` job per upstream job.

    Each job writes ``<bam prefix>.recal.bam`` into the module-level
    ``tmpdir`` and depends on its upstream job.  ``regions`` is copied from
    the upstream job onto the new job but is not passed to the script.

    Returns:
        list of ``sjm.Job`` objects in input order.
    """
    def _recal_for(parent):
        # Build the recalibration job for a single upstream job.
        source = util.File(parent.output)
        recal = sjm.Job('gatk_recalibrate-%s' % (source.prefix))
        recal.memory = "20G"
        out_name = '%s.%s' % (source.prefix, 'recal.bam')
        recal.output = os.path.join(tmpdir, out_name)
        recal.regions = parent.regions
        recal.append('gatk_recal.sh %s %s' % (recal.output, source.path))
        recal.depend(parent)
        return recal

    return [_recal_for(parent) for parent in pjobs]
Example #13
0
def merge_bam(pjobs, out_prefix, suffix=None):
    '''
    Create the job that merges all upstream BAMs and indexes the result.

    The merged BAM is written to the module-level ``tmpdir`` as
    ``<out_prefix>.<suffix>.bam``, or ``<out_prefix>.bam`` when no suffix is
    given.

    Args:
        pjobs: upstream jobs; the ``output.path`` of each is merged and the
            new job depends on all of them.
        out_prefix: base name of the merged BAM.
        suffix: optional tag used in the job name and the file name.

    Returns:
        The merge ``sjm.Job`` (a single job, not a list).

    Caveat: If output bam exists, needs to apply "-f" to overwrite or task will abort.
    '''
    bams = [pjob.output.path for pjob in pjobs]

    # With the default suffix=None the original formatted the literal text
    # "None" into both the job name and the output file name.  Fall back to
    # the prefix alone in that case.
    if suffix is None:
        label = out_prefix
        outname = os.path.join(tmpdir, '%s.bam' % out_prefix)
    else:
        label = suffix
        outname = os.path.join(tmpdir, '%s.%s.bam' % (out_prefix, suffix))

    job = sjm.Job('samtools_merge-%s' % label)
    job.memory = "5G"
    job.output = util.File(outname)
    job.append('samtools merge %s %s && samtools index %s' %
               (job.output, ' '.join(bams), job.output))
    job.depend(*pjobs)
    return job
def merge_gvcf(gvcfs):
    """Create one ``gatk_combine_gvcf.sh`` job per batch of gVCFs.

    ``gvcfs`` is cut into consecutive slices of ``args.merge_count`` entries.
    Batch ``i`` is combined into ``<args.temp_prefix>.batch<i>.g.vcf.gz``
    inside ``args.tempdir``.

    Returns:
        list of ``sjm.Job`` objects, one per batch, in batch order.
    """
    # Slice up front so that every batch exists before any job is created.
    batches = []
    for start in range(0, len(gvcfs), args.merge_count):
        batches.append(gvcfs[start:start + args.merge_count])

    jobs = []
    for index, batch in enumerate(batches):
        out_name = '%s.batch%d.g.vcf.gz' % (args.temp_prefix, index)
        combined = util.File(os.path.join(args.tempdir, out_name))
        combine = sjm.Job('gatk_combine_gvcf-%s' % combined.name)
        combine.memory = "40G"
        combine.output = combined
        command = 'gatk_combine_gvcf.sh %s %s' % (
            combine.output, " ".join(batch))
        combine.append(command)
        jobs.append(combine)
    return jobs
Example #15
0
# Command-line interface for the GATK pipeline driver.
p = argparse.ArgumentParser(description='run_gatk.py -b tiny_b38.bam -o `pwd` --tmp /rgs01/scratch_space/ -r $ref_genome.gatk -j cap_tiny.sjm')
p.add_argument('-b', '--bam', metavar='STR', required=True,
               help='Support for aligned and dedupped BAMs as input')
p.add_argument('-j', '--jobfile', metavar='FILE',
               help='The jobfile name (default: stdout)')
p.add_argument('-o', '--output', metavar='DIR', required=True,
               help='The output directory, will be created if not present')
p.add_argument('-r', '--regions_file', metavar='FILE', required=True,
               help='A file that defines the regions for the GATK parallel run')
p.add_argument('-A', '--account', metavar='STR',
               help='Account used to run the pipeline')
# --tmp used to be required=True, which contradicted its own help text and
# made the "fall back to the output directory" branch below unreachable.
# It is optional now; existing invocations that pass --tmp are unaffected.
p.add_argument('-T', '--tmp', metavar='DIR',
               help='The TMP directory for storing intermediate files, '
                    'will be created if not exist (default=output directory)')
p.add_argument('--skip_realn_recal', action='store_true',
               help='Skip GATK realignment and recalibration')
p.add_argument('--skip_recal', action='store_true',
               help='Skip GATK recalibration only')
p.add_argument('--submit', action='store_true', help='Submit the jobs')
args = p.parse_args()

# No job file means the job description goes to stdout (per the --jobfile help).
if args.jobfile is None:
    jobfile = None
else:
    jobfile = util.File(args.jobfile)

# set up directory
outdir = util.Dir(args.output)
logdir = util.Dir(outdir, 'log')
tmpdir = outdir
if args.tmp:
    tmpdir = util.Dir(args.tmp)
tmpdir.mkdirs()
outdir.mkdirs()

# Class-level defaults applied to every sjm.Job created afterwards.
sjm.Job.name_prefix = "GATK" + "."
sjm.Job.memory = "20G"  # default if not provided
sjm.Job.queue = "pcgp"
sjm.Job.project = "CompBio"
if args.account:
    sjm.Job.sge_options = "-A %s" % args.account

# Keep tmpdir as a plain string for path joins.  The original spelled this
# getattr(__builtins__, 'str'), which depends on __builtins__ being a module;
# that is a CPython implementation detail, so call the builtin directly.
tmpdir = str(tmpdir)
Example #16
0
# NOTE(review): --queue is parsed here, but sjm.Job.queue below is hard-coded
# to "pcgp" in this section -- confirm whether args.queue is applied elsewhere.
p.add_argument('-q', '--queue', metavar='NAME', default="normal",
               help='Queue for jobs (default: normal)')
p.add_argument('-t', '--threads', metavar='COUNT', type=int, default=4,
               help='Number of threads for BWA alignment, only works for SGE (default: 4)')
p.add_argument('--account', metavar='STR', default="swang",
               help='Accounting string for the purpose of cluster accounting.')
p.add_argument('--submit', action='store_true', help='Submit the jobs')
args = p.parse_args()

# Final outputs go to the output directory.
outdir = util.Dir(args.outdir)
outdir.mkdirs()

# Intermediate files go to a per-sample directory under the tmp root.
tmpdir = util.Dir(os.path.join(args.tmp, args.sm))
tmpdir.mkdirs()

if args.jobfile is None:
    jobfile = None
else:
    jobfile = util.File(args.jobfile)

# Read-group header handed to the alignment scripts; the doubled backslashes
# leave a literal "\\t" in the resulting string.
readgroup = "'@RG\\\\tID:%s\\\\tLB:%s\\\\tSM:%s\\\\tPL:%s'" % (args.id, args.lb, args.sm, args.pl)

# Class-level defaults applied to every sjm.Job created afterwards.
sjm.Job.name_prefix = "BWA-mapping" + "."
sjm.Job.memory = "%sG" % args.memory
sjm.Job.queue = "pcgp"
sjm.Job.project = "CompBio"

# Keep both directories as plain strings for path joins.  The original
# spelled this getattr(__builtins__, 'str'), which depends on __builtins__
# being a module; that is a CPython implementation detail, so call the
# builtin directly.
tmpdir = str(tmpdir)
outdir = str(outdir)

def align_pe(reads1, reads2):
    jobs=[]
    for i in range(0, len(reads1)):
        read1 = reads1[i]
Example #17
0
    def suggestor(filename, body):
        """Yield the patches that move (and optionally rename) one symbol.

        ``old_fullname``, ``new_fullname`` and ``project_root`` are taken
        from the enclosing scope, which is not visible here.  Both full
        names are split on their last dot into (module, symbol).

        Args:
            filename: path of the file being examined.
            body: the text of that file.

        Yields:
            ``khodemod.Patch`` objects.  When the old and new modules are the
            same, a single in-place patch.  Otherwise one patch deleting the
            definition from ``filename`` and one appending it to the end of
            the new module's file.  In both cases, whatever
            ``_add_init_py(new_filename)`` yields follows.

        Raises:
            khodemod.FatalError: if the old symbol is not among the module's
                top-level names, or the token expected to hold its name does
                not match it.
        """
        (old_module, old_symbol) = old_fullname.rsplit('.', 1)
        (new_module, new_symbol) = new_fullname.rsplit('.', 1)

        # We only need to operate on the old file (although we'll generate a
        # patch for the new one as well).  Caller should ensure this but we
        # check to be safe.
        if filename != util.filename_for_module_name(old_module):
            return

        file_info = util.File(filename, body)

        # Find where old_fullname is defined in old_module.
        # TODO(csilvers): traverse try/except, for, etc, and complain
        # if we see the symbol defined inside there.
        # TODO(csilvers): look for ast.AugAssign and complain if our
        # symbol is in there.
        old_module_toplevel = util.toplevel_names(file_info)
        if old_symbol not in old_module_toplevel:
            raise khodemod.FatalError(
                filename, 0, "Could not find symbol '%s' in '%s': "
                "maybe it's in a try/finally or if?" %
                (old_symbol, old_module))

        # Now get the startpos and endpos of this symbol's definition.
        node_to_move = old_module_toplevel[old_symbol]
        start, end = util.get_area_for_ast_node(node_to_move,
                                                file_info,
                                                include_previous_comments=True)
        definition_region = body[start:end]

        # Decide what text to add, which may require a rename.
        if old_symbol == new_symbol:
            new_definition_region = definition_region
        else:
            # Find the token with the name of the symbol, and update it.
            if isinstance(node_to_move, (ast.FunctionDef, ast.ClassDef)):
                # for/else: the else branch runs only when the loop finishes
                # without hitting `break`, i.e. no 'def'/'class' token exists.
                for token in file_info.tokens.get_tokens(node_to_move):
                    if token.string in ('def', 'class'):
                        break
                else:
                    raise khodemod.FatalError(
                        filename, 0, "Could not find symbol '%s' in "
                        "'%s': maybe it's defined weirdly?" %
                        (old_symbol, old_module))
                # We want the token after the def.
                name_token = file_info.tokens.next_token(token)
            else:  # isinstance(node_to_move, ast.Assign)
                # The name should be a single token, if we get here.
                # (The one-element unpacking raises ValueError otherwise.)
                name_token, = list(
                    file_info.tokens.get_tokens(node_to_move.targets[0]))

            if name_token.string != old_symbol:
                raise khodemod.FatalError(
                    filename, 0, "Could not find symbol '%s' in "
                    "'%s': maybe it's defined weirdly?" %
                    (old_symbol, old_module))
            # Splice the new name in place of the old name token, leaving the
            # rest of the definition text untouched.
            new_definition_region = (body[start:name_token.startpos] +
                                     new_symbol + body[name_token.endpos:end])

        if old_module == new_module:
            # Just patch the module in place.
            yield khodemod.Patch(filename, definition_region,
                                 new_definition_region, start, end)
        else:
            # Remove the region from the old file.
            # (If we've removed the remainder of the file,
            # _remove_empty_files_suggestor will clean up.)
            yield khodemod.Patch(filename, definition_region, '', start, end)

            # Add the region to the new file.
            new_filename = util.filename_for_module_name(new_module)
            new_file_body = khodemod.read_file(project_root,
                                               new_filename) or ''

            # Mess about with leading newlines.  First, we strip any existing
            # ones.  Then, if we are adding to an existing file, we add enough
            # to satisfy pep8.
            new_definition_region = new_definition_region.lstrip('\r\n')
            if new_file_body:
                # Count the newline characters at the seam: those ending the
                # existing file plus those starting the region.  The region
                # was just lstripped, so its share is zero here.
                current_newlines = (len(new_file_body) -
                                    len(new_file_body.rstrip('\r\n')) +
                                    len(new_definition_region) -
                                    len(new_definition_region.lstrip('\r\n')))
                if current_newlines < 3:
                    new_definition_region = ('\n' * (3 - current_newlines) +
                                             new_definition_region)

            # Now we need to add the new symbol to new_module.
            # TODO(benkraft): Allow, as an option, adding it after a specific
            # other symbol in new_module.
            # An empty old-text with start == end == len(new_file_body)
            # targets the very end of the new file.
            yield khodemod.Patch(new_filename, '', new_definition_region,
                                 len(new_file_body), len(new_file_body))

            # TODO(benkraft): Fix up imports in the new and old modules.

        # Recomputed here because the same-module branch above never sets it.
        new_filename = util.filename_for_module_name(new_module)
        for patch in _add_init_py(new_filename):
            yield patch
               metavar='FILE',
               required=True,
               help='Final single vcf output file if choose joint call')
g2.add_argument('--skip_realn_recal',
                action='store_true',
                help='Skip GATK realignment and recalibration')
g2.add_argument('--skip_recal',
                action='store_true',
                help='Skip GATK recalibration only')
p.add_argument('--submit', action='store_true', help='Submit the jobs')
args = p.parse_args()

# Wrap the job file path only when one was supplied on the command line.
if args.jobfile is None:
    jobfile = None
else:
    jobfile = util.File(args.jobfile)

# Directory layout: the log directory sits under the output directory, and
# intermediates go to --tmp when given, otherwise to the output directory.
outdir = util.Dir(args.output)
logdir = util.Dir(outdir, 'log')
tmpdir = outdir
if args.tmp:
    tmpdir = util.Dir(args.tmp)
tmpdir.mkdirs()
outdir.mkdirs()

# Class-level defaults applied to every sjm.Job created afterwards.
sjm.Job.name_prefix = "GATK" + "."
sjm.Job.memory = "20G"
sjm.Job.queue = "pcgp"
sjm.Job.project = "CompBio"
if args.account:
    sjm.Job.sge_options = "-A %s" % args.account

# Keep both directories as plain strings for path joins.  The original
# spelled this getattr(__builtins__, 'str'), which depends on __builtins__
# being a module; that is a CPython implementation detail, so call the
# builtin directly.
tmpdir = str(tmpdir)
outdir = str(outdir)