예제 #1
0
def calculate_depths(label, infiles, args):
    """Compute variant depths for one sample, logging progress to a file.

    The BAM and VCF paths are read from ``infiles[label]``.  Output and log
    names are derived from the BAM path with ``get_outlabel``; the log goes
    under ``args.logdir`` when that is set, otherwise next to the output.
    When ``have_file`` reports that the depths file is already available the
    calculation is skipped and only a note is written to the log.

    Args:
        label: Key into ``infiles`` selecting the sample.
        infiles: Mapping of label -> dict holding ``'bam'`` and ``'vcf'``.
        args: Options object; ``outdir``, ``logdir``, ``tag`` and ``force``
            are read here and the object is forwarded to the helpers.

    Raises:
        Exception: Any error raised during processing is re-raised with the
            VCF path appended to its ``args``.
    """
    bamfile = infiles[label]['bam']
    vcffile = infiles[label]['vcf']
    outlabel = get_outlabel(bamfile, 'bam', args.outdir)
    loglabel = get_outlabel(bamfile, 'bam', args.logdir) if args.logdir \
               else outlabel
    logfile = loglabel + ".calculate_depths.log"
    outfile = outlabel + '.depths{}.txt'.format(args.tag)
    sys.stderr.write("  Calculating depths for {}\n".format(outlabel))
    sys.stderr.write("    Log file: {}\n".format(logfile))
    # The with-block closes (and flushes) the log on success and on error;
    # previously the handle was left open.
    with open(logfile, 'w') as logfh:
        logfh.write("Bam file: {}\n".format(bamfile))
        logfh.write("VCF file: {}\n\n".format(vcffile))
        try:
            if have_file(outfile, args.force, stderr=logfh):
                logfh.write("  Already have {}\n".format(outfile))
            else:
                logfh.write("Start time: {}\n".format(timestamp()))
                (vcffields, variants) = parse_vcf(vcffile, logfh)
                depths = find_variant_depths(variants, bamfile, logfh, args)
                add_fwd_rev_depths(depths, variants, bamfile, args)
                print_depths(outfile, variants, depths, vcffields, logfh)
                logfh.write("End time: {}\n".format(timestamp()))
        except Exception as e:
            # Append the VCF path to the exception arguments, then re-raise.
            e.args += (vcffile, )
            raise
예제 #2
0
파일: run_gatk.py 프로젝트: eulaf/CFseq
def call_variants_gatk(bamfile, ref, args):
    """Index a BAM file and run GATK on it, logging to ``<label>.gatk.log``.

    The log label comes from ``args.logdir`` when set, otherwise from
    ``args.outdir``.  If the path returned by ``run_gatk`` exists as a file,
    it is reported on stderr.

    Args:
        bamfile: Path of the BAM file to process.
        ref: Reference argument forwarded to ``run_gatk``.
        args: Options object; ``outdir`` and ``logdir`` are read here and
            the object is forwarded to the helpers.

    Raises:
        Exception: Any error is re-raised with ``bamfile`` appended to its
            ``args`` after a hint pointing at the log file is written to
            stderr.
    """
    outlabel = get_outlabel(bamfile, args.outdir)
    loglabel = get_outlabel(bamfile, args.logdir if args.logdir else args.outdir)
    logfile = loglabel + ".gatk.log"
    # The with-block guarantees the log handle is closed on every exit path;
    # previously it was never closed.
    with open(logfile, "w") as logfh:
        sys.stderr.write("  Running GATK on {}\n".format(bamfile))
        sys.stderr.write("    Log file {}\n".format(logfile))
        try:
            logfh.write("Start time: {}\n".format(timestamp()))
            index_bam(bamfile, args, logfh)
            vcffile = run_gatk(bamfile, outlabel, ref, args, logfh)
            if os.path.isfile(vcffile):
                sys.stderr.write("    VCF file: {}\n".format(vcffile))
            logfh.write("Finish time: {}\n".format(timestamp()))
        except Exception as e:
            e.args += (bamfile,)
            sys.stderr.write(
                "Error running gatk.  Check log file {}\n".format(logfile))
            raise
예제 #3
0
def call_variants_freebayes(bamfile, reffile, bedfile, args):
    """Run freebayes on one BAM file and filter the resulting VCF.

    Progress is logged to ``<label>.freebayes.log`` when ``args.logdir`` is
    set, otherwise to stderr.  The filtered VCF path is appended to ``vcfs``
    and ``vcfs`` is returned.

    Args:
        bamfile: Path of the BAM file to process.
        reffile: Reference argument forwarded to ``run_freebayes``.
        bedfile: BED argument forwarded to ``run_freebayes``.
        args: Options object; ``logdir`` is read here and the object is
            forwarded to the helpers.

    Returns:
        The ``vcfs`` collection after the new filtered VCF was appended.
    """
    sys.stderr.write("Bam file: {}\n".format(bamfile))
    if args.logdir:
        label = get_outlabel(bamfile, args.logdir)
        outlog = label + '.freebayes.log'
        logfh = open(outlog, 'w')
    else:
        logfh = sys.stderr
    try:
        tag = "{}:\t".format(get_outlabel(bamfile))
        logfh.write("{}start time {}\n".format(tag, timestamp()))
        vcffile = run_freebayes(bamfile, reffile, bedfile, logfh, args)
        outvcf = vcffile.replace('.vcf', '') + ".filtered.vcf"
        filteredvcf = filter_freebayes_vcf(vcffile, outvcf, logfh, args)
        # NOTE(review): ``vcfs`` is not defined in this function; presumably
        # a module-level list -- confirm against the rest of the file.
        vcfs.append(filteredvcf)
        logfh.write("{}finish time {}\n".format(tag, timestamp()))
    finally:
        # Close our own log file even when a step above raises; never close
        # the shared stderr stream.
        if logfh is not sys.stderr:
            logfh.close()
    return vcfs
예제 #4
0
파일: run_bwa_mem.py 프로젝트: eulaf/CFseq
def align_create_bam(sample, fqfiles, ref, reflabel, args):
    """Map one sample with bwa and create a sorted BAM, logging to a file.

    The output label is ``<sample>-<reflabel>``; the log is written to
    ``<label>.bwa.log`` under ``args.logdir`` when that is set.  When
    ``args.outdir`` is set the output label is made absolute under it.

    Args:
        sample: Sample name, used in labels and messages.
        fqfiles: FASTQ input forwarded to ``run_bwa``.
        ref: Reference argument forwarded to ``run_bwa``.
        reflabel: Reference label used in the output label.
        args: Options object; ``logdir``, ``outdir``, ``force`` and ``sam``
            are read here.

    Raises:
        Exception: Any error is re-raised with ``sample`` appended to its
            ``args`` after a hint pointing at the log file is written to
            stderr.
    """
    outlabel = "{}-{}".format(sample, reflabel)
    loglabel = os.path.join(args.logdir, outlabel) if args.logdir \
               else outlabel
    logfile = loglabel + ".bwa.log"
    sys.stderr.write("  Mapping {}\n".format(sample))
    sys.stderr.write("    Log file {}\n".format(logfile))
    if args.outdir:
        outlabel = os.path.abspath(os.path.join(args.outdir, outlabel))
    # The with-block closes the log on success and on error; previously the
    # handle was never closed.
    with open(logfile, 'w') as logfh:
        try:
            logfh.write("Start time {}\n".format(timestamp()))
            samfile = run_bwa(sample, outlabel, ref, fqfiles, logfh,
                              args.force)
            # The value returned by create_sorted_bam is not used here.
            create_sorted_bam(samfile, outlabel, logfh, args.force, args.sam)
            logfh.write("Finish time {}\n".format(timestamp()))
        except Exception as e:
            e.args += (sample, )
            sys.stderr.write(
                "Error running bwa.  Check log file {}\n".format(logfile))
            raise
예제 #5
0
def gatk_pipeline(label, bamfiles, gatkdir, logdir, args):
    """Assemble and run the ``SCRIPTS['gatk_pipeline']`` command.

    The command line is echoed to stderr, preceded by a separator and the
    current timestamp, before it is executed.

    Args:
        label: Value passed with ``-l``.
        bamfiles: BAM file paths appended as positional arguments.
        gatkdir: Value passed with ``-o``.
        logdir: Passed with ``--logdir`` when truthy.
        args: Options object; ``processes``, ``maxreads`` and ``force`` are
            read.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
    """
    command = [SCRIPTS['gatk_pipeline'], '-l', label, '-o', gatkdir,
               '-p', str(args.processes)]
    if logdir:
        command.extend(['--logdir', logdir])
    if args.maxreads:
        command.extend(['-m', str(args.maxreads)])
    if args.force:
        command.append('-f')
    command.extend(bamfiles)

    cmdline = " ".join(command)
    sys.stderr.write("\n==================================================\n")
    sys.stderr.write("Current time: {}\n".format(timestamp()))
    sys.stderr.write("Running gatk pipeline: " + cmdline + "\n")
    # Failures propagate unchanged to the caller.
    subprocess.check_call(command)
예제 #6
0
파일: primer_trim.py 프로젝트: eulaf/CFseq
def align_and_trim(fqfile, primerfa, primerinfo, max_trim_len, args):
    """Align primers against one FASTQ file and trim them, logging to a file.

    Labels come from ``fastq_file_label``; the log is written to
    ``<label>.primer_trim.log`` under ``args.logdir`` when that is set,
    otherwise next to the output.  If the trimmed FASTQ path returned by
    ``trim_primers`` exists as a file, it is reported on stderr.

    Args:
        fqfile: Path of the FASTQ file to trim.
        primerfa: Primer FASTA argument forwarded to ``run_aligner``.
        primerinfo: Primer information forwarded to ``trim_primers``.
        max_trim_len: Forwarded to ``create_fasta_of_primer_region`` and
            ``trim_primers``.
        args: Options object; ``outdir``, ``logdir`` and ``force`` are read
            here and the object is forwarded to ``trim_primers``.

    Raises:
        Exception: Any error is re-raised with ``fqfile`` appended to its
            ``args`` after a hint pointing at the log file is written to
            stderr.
    """
    outlabel = fastq_file_label(fqfile, args.outdir)
    loglabel = fastq_file_label(fqfile, args.logdir) if args.logdir \
               else outlabel
    logfile = loglabel + ".primer_trim.log"
    sys.stderr.write("  Trimming {}\n".format(os.path.basename(fqfile)))
    sys.stderr.write("    Log file {}\n".format(logfile))
    # The with-block closes the log on success and on error; previously the
    # handle was never closed.
    with open(logfile, 'w') as logfh:
        try:
            logfh.write("Start time: {}\n".format(timestamp()))
            fafiles = create_fasta_of_primer_region(fqfile, max_trim_len,
                                                    outlabel, logfh,
                                                    args.force)
            alignout = run_aligner(fafiles, primerfa, outlabel, logfh,
                                   args.force)
            # trim_primers returns a pair; only the first item is used here.
            (trimfq, seqnamefile) = trim_primers(fqfile, alignout,
                                                 max_trim_len, primerinfo,
                                                 outlabel, logfh, args)
            if os.path.isfile(trimfq):
                sys.stderr.write("    Created {}\n".format(trimfq))
            logfh.write("Finish time: {}\n".format(timestamp()))
        except Exception as e:
            e.args += (fqfile, )
            sys.stderr.write(
                "Error trimming primers.  Check log file {}\n".format(logfile))
            raise
예제 #7
0
def run_annovar(vcffiles, outdir, args):
    """Assemble and run the ``SCRIPTS['run_annovar']`` command.

    The command line is echoed to stderr, preceded by a separator and the
    current timestamp, before it is executed.

    Args:
        vcffiles: VCF file paths appended as positional arguments.
        outdir: Value passed with ``-o``.
        args: Options object; ``label`` and ``force`` are read.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
    """
    cmd = [SCRIPTS['run_annovar'], '-o', outdir]
    if args.label:
        cmd += ['-l', args.label + '.annovar']
    if args.force:
        cmd.append('-f')
    cmd.extend(vcffiles)
    sys.stderr.write("\n==================================================\n")
    sys.stderr.write("Current time: {}\n".format(timestamp()))
    sys.stderr.write("Running annovar: " + " ".join(cmd) + "\n")
    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError as e:
        # check_call does not capture output, so e.output is always None;
        # report the exit status instead of printing "None".
        sys.stderr.write(
            "Error running annovar (exit status {}).\n".format(e.returncode))
        raise
예제 #8
0
def annotate_spreadsheets(spreadsheets, outdir, args):
    """Run the ``SCRIPTS['add_mol2k']`` command on a set of spreadsheets.

    The command line is echoed to stderr before it is executed.  Afterwards
    the outputs are looked up with ``find_outfiles`` (extension
    ``'mol2k.txt'``, delimiter ``'.results'``); the process exits with
    status 1 if none are found.

    Args:
        spreadsheets: Spreadsheet paths appended as positional arguments.
        outdir: Value passed with ``-o`` and searched for outputs.
        args: Options object; ``force`` is read.

    Returns:
        The non-empty result of ``find_outfiles``.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
        SystemExit: If no output files are found.
    """
    cmd = [SCRIPTS['add_mol2k'], '-o', outdir]
    if args.force:
        cmd.append('-f')
    cmd.extend(spreadsheets)
    sys.stderr.write("\n==================================================\n")
    sys.stderr.write("Current time: {}\n".format(timestamp()))
    sys.stderr.write("Annotating spreadsheets: " + " ".join(cmd) + "\n")
    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError as e:
        # check_call does not capture output, so e.output is always None;
        # report the exit status instead of printing "None".
        sys.stderr.write(
            "Error annotating spreadsheets (exit status {}).\n".format(
                e.returncode))
        raise
    outfiles = find_outfiles(outdir, spreadsheets, 'mol2k.txt',
                             delim='.results')
    if not outfiles:
        sys.exit(1)
    return outfiles
예제 #9
0
def trim_all_fastq(fqfiles, aligndir, logdir, args):
    """Run the ``SCRIPTS['primer_trim']`` command unless outputs exist.

    The command line is echoed to stderr first.  If ``find_outfiles``
    already yields one ``'trimmed.fastq'`` entry per input file, the command
    is skipped and those entries are returned.  Otherwise the command is
    run and the outputs are looked up again, so both paths return the same
    kind of value (previously the run path returned ``None``).

    Args:
        fqfiles: FASTQ paths appended as positional arguments.
        aligndir: Value passed with ``-o`` and searched for outputs.
        logdir: Passed with ``--logdir`` when truthy.
        args: Options object; ``processes``, ``force`` and ``label`` are
            read.

    Returns:
        The result of ``find_outfiles`` for the trimmed FASTQ files.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
    """
    cmd = [SCRIPTS['primer_trim'], '-o', aligndir]
    cmd += ['-p', str(args.processes)]
    if logdir:
        cmd += ['--logdir', logdir]
    if args.force:
        cmd.append('-f')
    if args.label:
        cmd += ['-s', args.label + '.summary.txt']
    cmd.extend(fqfiles)
    sys.stderr.write("\n==================================================\n")
    sys.stderr.write("Current time: {}\n".format(timestamp()))
    sys.stderr.write("Trimming fastq files: " + " ".join(cmd) + "\n")
    # NOTE(review): this shortcut is taken even when args.force is set --
    # confirm that skipping a forced re-run is intended.
    outfiles = find_outfiles(aligndir, fqfiles, 'trimmed.fastq', quiet=True)
    if len(outfiles) == len(fqfiles):
        sys.stderr.write("Already have trimmed fq files.\n")
        return outfiles
    # Errors from the child process propagate unchanged.
    subprocess.check_call(cmd)
    return find_outfiles(aligndir, fqfiles, 'trimmed.fastq', quiet=True)
예제 #10
0
def create_spreadsheet(label, samplevcfs, annovar_out, outdir):
    """Run the ``SCRIPTS['create_spreadsheet']`` command and find its output.

    Each entry of ``annovar_out`` is passed with its own ``-a`` flag.  The
    command line is echoed to stderr before it is executed.  Afterwards the
    ``'results.txt'`` output for ``label`` is looked up with
    ``find_outfiles``; the process exits with status 1 if none is found.

    Args:
        label: Value passed with ``-l`` and used to look up the output.
        samplevcfs: VCF paths appended as positional arguments.
        annovar_out: Iterable of files, each passed with ``-a``.
        outdir: Value passed with ``-o`` and searched for the output.

    Returns:
        The first entry of the ``find_outfiles`` result.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
        SystemExit: If no output file is found.
    """
    cmd = [SCRIPTS['create_spreadsheet'], '-l', label, '-o', outdir]
    for afile in annovar_out:
        cmd += ['-a', afile]
    # NOTE(review): ``args`` is not a parameter of this function; presumably
    # a module-level options object -- confirm against the rest of the file.
    if args.force:
        cmd.append('-f')
    cmd.extend(samplevcfs)
    sys.stderr.write("\n==================================================\n")
    sys.stderr.write("Current time: {}\n".format(timestamp()))
    sys.stderr.write("Creating spreadsheet: " + " ".join(cmd) + "\n")
    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError as e:
        # check_call does not capture output, so e.output is always None;
        # report the exit status instead of printing "None".
        sys.stderr.write(
            "Error creating spreadsheet (exit status {}).\n".format(
                e.returncode))
        raise
    outfile = find_outfiles(outdir, [label], 'results.txt', debug=args.debug)
    if not outfile:
        sys.exit(1)
    return outfile[0]
예제 #11
0
def separate_vcf(vcf, gatkdir, bamfiles, args):
    """Run the ``SCRIPTS['separate_vcf']`` command and find its outputs.

    The command line is echoed to stderr before it is executed.  Afterwards
    the ``'separated.vcf'`` outputs are looked up with ``find_outfiles``
    (delimiter ``'-'``); the process exits with status 1 if none are found.

    Args:
        vcf: VCF path appended as the positional argument.
        gatkdir: Value passed with ``-o`` and searched for outputs.
        bamfiles: BAM paths used to look up the outputs.
        args: Options object; ``logdir``, ``force`` and ``debug`` are read.

    Returns:
        The non-empty result of ``find_outfiles``.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
        SystemExit: If no output files are found.
    """
    cmd = [SCRIPTS['separate_vcf'], '-o', gatkdir]
    # Only pass --logdir when one is configured, as the sibling wrappers do;
    # an unset (None) value would otherwise break the " ".join below.
    if args.logdir:
        cmd += ['--logdir', args.logdir]
    if args.force:
        cmd.append('-f')
    cmd.append(vcf)
    sys.stderr.write("\n==================================================\n")
    sys.stderr.write("Current time: {}\n".format(timestamp()))
    sys.stderr.write("Separating gVCF: " + " ".join(cmd) + "\n")
    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError as e:
        # check_call does not capture output, so e.output is always None;
        # report the exit status instead of printing "None".
        sys.stderr.write(
            "Error separating gVCF (exit status {}).\n".format(e.returncode))
        raise
    samplevcfs = find_outfiles(gatkdir, bamfiles, 'separated.vcf', delim='-',
                               debug=args.debug)
    if not samplevcfs:
        sys.exit(1)
    return samplevcfs
예제 #12
0
def run_freebayes(bamfiles, fbdir, args):
    """Run the ``SCRIPTS['run_freebayes']`` command and find its outputs.

    The command line is echoed to stderr before it is executed.  Afterwards
    the ``'freebayes.filtered.vcf'`` outputs are looked up with
    ``find_outfiles``; the process exits with status 1 if none are found.

    Args:
        bamfiles: BAM paths appended as positional arguments.
        fbdir: Value passed with ``-o`` and searched for outputs.
        args: Options object; ``processes``, ``logdir``, ``force`` and
            ``debug`` are read.

    Returns:
        The non-empty result of ``find_outfiles``.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
        SystemExit: If no output files are found.
    """
    cmd = [SCRIPTS['run_freebayes'], '-o', fbdir]
    cmd += ['-p', str(args.processes)]
    if args.logdir:
        cmd += ['--logdir', args.logdir]
    if args.force:
        cmd.append('-f')
    cmd.extend(bamfiles)
    sys.stderr.write("\n==================================================\n")
    sys.stderr.write("Current time: {}\n".format(timestamp()))
    sys.stderr.write("Running freebayes: " + " ".join(cmd) + "\n")
    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError:
        # Terminate the message with a newline so whatever is written to
        # stderr next starts on its own line.
        sys.stderr.write("Error running freebayes pipeline.\n")
        raise
    outfiles = find_outfiles(fbdir, bamfiles, 'freebayes.filtered.vcf',
                             debug=args.debug)
    if not outfiles:
        sys.stderr.write("No freebayes .filtered.vcf files found.\n")
        sys.exit(1)
    return outfiles
예제 #13
0
def run_annovar(vcffiles, outdir, args):
    """Run the ``SCRIPTS['run_annovar']`` command and find its two outputs.

    The command line is echoed to stderr before it is executed.  Afterwards
    the ``'annovar-hgvs.variant_function'`` and
    ``'annovar-hgvs.exonic_variant_function'`` outputs for ``args.label``
    are looked up with ``find_outfiles``; the process exits with status 1
    if either lookup comes back empty.

    Args:
        vcffiles: VCF paths appended as positional arguments.
        outdir: Value passed with ``-o`` and searched for outputs.
        args: Options object; ``label``, ``force`` and ``debug`` are read.

    Returns:
        Tuple of the first entry from each of the two lookups.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
        SystemExit: If either output is missing.
    """
    cmd = [SCRIPTS['run_annovar'], '-o', outdir]
    if args.label:
        cmd += ['-l', args.label + '.annovar']
    if args.force:
        cmd.append('-f')
    cmd.extend(vcffiles)
    sys.stderr.write("\n==================================================\n")
    sys.stderr.write("Current time: {}\n".format(timestamp()))
    sys.stderr.write("Running annovar: " + " ".join(cmd) + "\n")
    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError as e:
        # check_call does not capture output, so e.output is always None;
        # report the exit status instead of printing "None".
        sys.stderr.write(
            "Error running annovar (exit status {}).\n".format(e.returncode))
        raise
    vffile = find_outfiles(outdir, [args.label],
                           'annovar-hgvs.variant_function',
                           debug=args.debug)
    evffile = find_outfiles(outdir, [args.label],
                            'annovar-hgvs.exonic_variant_function',
                            debug=args.debug)
    if not vffile or not evffile:
        sys.exit(1)
    return (vffile[0], evffile[0])
예제 #14
0
def run_freebayes(bamfiles, fbdir, logdir, args):
    """Run the ``SCRIPTS['run_freebayes']`` command, sending stderr to a log.

    The command line is echoed to stderr; the child's stderr is redirected
    to ``<logdir>/<args.label>.freebayes.log``.  Afterwards the
    ``'freebayes.filtered.vcf'`` outputs are looked up with
    ``find_outfiles``; the process exits with status 1 if none are found.

    Args:
        bamfiles: BAM paths appended as positional arguments.
        fbdir: Value passed with ``-o`` and searched for outputs.
        logdir: Directory in which the log file is created.
        args: Options object; ``processes``, ``force`` and ``label`` are
            read.

    Returns:
        The non-empty result of ``find_outfiles``.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
        SystemExit: If no output files are found.
    """
    command = [SCRIPTS['run_freebayes'], '-o', fbdir,
               '-p', str(args.processes)]
    if args.force:
        command.append('-f')
    command.extend(bamfiles)

    cmdline = " ".join(command)
    sys.stderr.write("\n==================================================\n")
    sys.stderr.write("Current time: {}\n".format(timestamp()))
    sys.stderr.write("Running freebayes: " + cmdline + "\n")

    logfile = os.path.join(logdir, args.label + '.freebayes.log')
    try:
        with open(logfile, 'w') as logfh:
            subprocess.check_call(command, stderr=logfh)
    except subprocess.CalledProcessError:
        hint = " Please check logfile {}\n".format(logfile)
        sys.stderr.write("Error running freebayes." + hint)
        raise

    vcffiles = find_outfiles(fbdir, bamfiles, 'freebayes.filtered.vcf')
    if vcffiles:
        return vcffiles
    sys.stderr.write("No freebayes .filtered.vcf files found.\n")
    sys.exit(1)
예제 #15
0
def calculate_depths(vcffiles, bamfiles, gatkdir, args):
    """Run the ``SCRIPTS['calculate_depths']`` command and find its outputs.

    The command line is echoed to stderr before it is executed.  Afterwards
    the ``'depths.txt'`` outputs are looked up with ``find_outfiles``; the
    process exits with status 1 if none are found.

    Args:
        vcffiles: VCF paths appended as positional arguments.
        bamfiles: BAM paths appended after the VCF paths and used to look
            up the outputs.
        gatkdir: Value passed with ``-o`` and searched for outputs.
        args: Options object; ``logdir``, ``processes``, ``force`` and
            ``debug`` are read.

    Returns:
        The non-empty result of ``find_outfiles``.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
        SystemExit: If no output files are found.
    """
    cmd = [SCRIPTS['calculate_depths'], '-o', gatkdir]
    if args.logdir:
        cmd += ['--logdir', args.logdir]
    cmd += ['-p', str(args.processes)]
    if args.force:
        cmd.append('-f')
    cmd.extend(vcffiles)
    cmd.extend(bamfiles)
    sys.stderr.write("\n==================================================\n")
    sys.stderr.write("Current time: {}\n".format(timestamp()))
    sys.stderr.write("Calculating depths: " + " ".join(cmd) + "\n")
    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError as e:
        # check_call does not capture output, so e.output is always None;
        # report the exit status instead of printing "None".
        sys.stderr.write(
            "Error calculating depths (exit status {}).\n".format(
                e.returncode))
        raise
    outfiles = find_outfiles(gatkdir, bamfiles, 'depths.txt',
                             debug=args.debug)
    if not outfiles:
        sys.stderr.write("No depths.txt files found.\n")
        sys.exit(1)
    return outfiles
예제 #16
0
def run_bwa_all_fastq(fqfiles, aligndir, logdir, args):
    """Run the ``SCRIPTS['run_bwa']`` command and find the resulting BAMs.

    The command line is echoed to stderr before it is executed.  Afterwards
    ``'genomic_refseq.bam'`` outputs are looked up with ``find_outfiles``
    for the inputs whose name contains ``'R1'``, using ``'_L001'`` as the
    delimiter when the first input contains it and ``'_R'`` otherwise.  The
    process exits with status 1 if none are found.

    Args:
        fqfiles: FASTQ paths appended as positional arguments.
        aligndir: Value passed with ``-o`` and searched for outputs.
        logdir: Passed with ``--logdir`` when truthy.
        args: Options object; ``processes``, ``force`` and ``sam`` are read.

    Returns:
        The non-empty result of ``find_outfiles``.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
        SystemExit: If no BAM files are found.
    """
    command = [SCRIPTS['run_bwa'], '-o', aligndir, '-p', str(args.processes)]
    if logdir:
        command.extend(['--logdir', logdir])
    if args.force:
        command.append('-f')
    if args.sam:
        command.append('-s')
    command.extend(fqfiles)

    cmdline = " ".join(command)
    sys.stderr.write("\n==================================================\n")
    sys.stderr.write("Current time: {}\n".format(timestamp()))
    sys.stderr.write("Running bwa: " + cmdline + "\n")
    try:
        subprocess.check_call(command)
    except subprocess.CalledProcessError:
        sys.stderr.write("Error running bwa.\n")
        raise

    if '_L001' in fqfiles[0]:
        delim = '_L001'
    else:
        delim = '_R'
    read1_files = [fq for fq in fqfiles if 'R1' in fq]
    outfiles = find_outfiles(aligndir, read1_files, 'genomic_refseq.bam',
                             delim=delim)
    if outfiles:
        return outfiles
    sys.stderr.write("No bam files found.\n")
    sys.exit(1)
예제 #17
0
def gatk_pipeline(label, bamfiles, gatkdir, logdir, args):
    """Build the ``SCRIPTS['gatk_pipeline']`` command line and execute it.

    A separator, the current timestamp and the full command line are
    written to stderr before the command runs.

    Args:
        label: Value passed with ``-l``.
        bamfiles: BAM file paths appended as positional arguments.
        gatkdir: Value passed with ``-o``.
        logdir: Passed with ``--logdir`` when truthy.
        args: Options object; ``processes``, ``maxreads`` and ``force`` are
            read.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
    """
    argv = [SCRIPTS['gatk_pipeline'], '-l', label, '-o', gatkdir]
    argv.extend(['-p', str(args.processes)])
    if logdir:
        argv.extend(['--logdir', logdir])
    if args.maxreads:
        argv.extend(['-m', str(args.maxreads)])
    if args.force:
        argv.append('-f')
    argv.extend(bamfiles)

    shown = " ".join(argv)
    sys.stderr.write("\n==================================================\n")
    sys.stderr.write("Current time: {}\n".format(timestamp()))
    sys.stderr.write("Running gatk pipeline: " + shown + "\n")
    # No local handling: any failure reaches the caller as raised.
    subprocess.check_call(argv)
예제 #18
0
def create_spreadsheet(label, samplevcfs, annovar_out, outdir):
    """Run the ``SCRIPTS['create_spreadsheet']`` command and find its output.

    Each entry of ``annovar_out`` is passed with its own ``-a`` flag.  The
    command line is echoed to stderr before it is executed.  Afterwards the
    ``'results.txt'`` output for ``label`` is looked up with
    ``find_outfiles``; the process exits with status 1 if none is found.

    Args:
        label: Value passed with ``-l`` and used to look up the output.
        samplevcfs: VCF paths appended as positional arguments.
        annovar_out: Iterable of files, each passed with ``-a``.
        outdir: Value passed with ``-o`` and searched for the output.

    Returns:
        The first entry of the ``find_outfiles`` result.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
        SystemExit: If no output file is found.
    """
    cmd = [SCRIPTS['create_spreadsheet'], '-l', label, '-o', outdir]
    for afile in annovar_out:
        cmd += ['-a', afile]
    # NOTE(review): ``args`` is not a parameter of this function; presumably
    # a module-level options object -- confirm against the rest of the file.
    if args.force:
        cmd.append('-f')
    cmd.extend(samplevcfs)
    sys.stderr.write("\n==================================================\n")
    sys.stderr.write("Current time: {}\n".format(timestamp()))
    sys.stderr.write("Creating spreadsheet: " + " ".join(cmd) + "\n")
    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError as e:
        # check_call does not capture output, so e.output is always None;
        # report the exit status instead of printing "None".
        sys.stderr.write(
            "Error creating spreadsheet (exit status {}).\n".format(
                e.returncode))
        raise
    outfile = find_outfiles(outdir, [label], 'results.txt', debug=args.debug)
    if not outfile:
        sys.exit(1)
    return outfile[0]
예제 #19
0
def run_gatk_all_bam(label, bamfiles, gatkdir, args):
    """Run the ``SCRIPTS['run_gatk']`` command and find its outputs.

    The command line is echoed to stderr before it is executed.  Afterwards
    the ``'gatk-cohort.vcf'`` outputs for the BAM files and the
    ``'gatk-merged.vcf'`` output for ``label`` are looked up with
    ``find_outfiles``; the process exits with status 1 if either lookup
    comes back empty.

    Args:
        label: Value passed with ``-c`` and used to look up the merged VCF.
        bamfiles: BAM paths appended as positional arguments.
        gatkdir: Value passed with ``-o`` and searched for outputs.
        args: Options object; ``processes``, ``logdir``, ``force``,
            ``debug`` and ``maxreads`` are read.

    Returns:
        Tuple ``(cohort_outputs, merged_vcf)``: the cohort lookup result and
        the first entry of the merged lookup result.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
        SystemExit: If either output is missing.
    """
    cmd = [SCRIPTS['run_gatk'], '-c', label, '-o', gatkdir]
    cmd += ['-p', str(args.processes)]
    # Only pass --logdir when one is configured, as the sibling wrappers do;
    # an unset (None) value would otherwise break the " ".join below.
    if args.logdir:
        cmd += ['--logdir', args.logdir]
    if args.force:
        cmd.append('-f')
    if args.debug:
        cmd.append('--debug')
    if args.maxreads:
        cmd += ['-m', str(args.maxreads)]
    cmd.extend(bamfiles)
    sys.stderr.write("\n==================================================\n")
    sys.stderr.write("Current time: {}\n".format(timestamp()))
    sys.stderr.write("Running gatk: " + " ".join(cmd) + "\n")
    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError as e:
        # check_call does not capture output, so e.output is always None;
        # report the exit status instead of printing "None".
        sys.stderr.write(
            "Error running gatk (exit status {}).\n".format(e.returncode))
        raise
    outfiles = find_outfiles(gatkdir, bamfiles, 'gatk-cohort.vcf',
                             debug=args.debug)
    gvcf = find_outfiles(gatkdir, [label], 'gatk-merged.vcf',
                         debug=args.debug)
    if not outfiles:
        sys.stderr.write("No gatk-cohort.vcf files found.\n")
        sys.exit(1)
    # Guard the gvcf[0] access below: an empty lookup used to surface as an
    # IndexError instead of a clean exit.
    if not gvcf:
        sys.stderr.write("No gatk-merged.vcf file found.\n")
        sys.exit(1)
    return (outfiles, gvcf[0])
예제 #20
0
def trim_all_fastq(fqfiles, aligndir, logdir, args):
    """Run the ``SCRIPTS['primer_trim']`` command unless outputs exist.

    The command line is echoed to stderr first.  If ``find_outfiles``
    already yields one ``'trimmed.fastq'`` entry per input file, the command
    is skipped and those entries are returned.  Otherwise the command is
    run and the outputs are looked up again, so both paths return the same
    kind of value (previously the run path returned ``None``).

    Args:
        fqfiles: FASTQ paths appended as positional arguments.
        aligndir: Value passed with ``-o`` and searched for outputs.
        logdir: Passed with ``--logdir`` when truthy.
        args: Options object; ``processes``, ``force`` and ``label`` are
            read.

    Returns:
        The result of ``find_outfiles`` for the trimmed FASTQ files.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
    """
    cmd = [
        SCRIPTS['primer_trim'],
        '-o',
        aligndir,
    ]
    cmd += ['-p', str(args.processes)]
    if logdir:
        cmd += ['--logdir', logdir]
    if args.force:
        cmd.append('-f')
    if args.label:
        cmd += ['-s', args.label + '.summary.txt']
    cmd.extend(fqfiles)
    sys.stderr.write("\n==================================================\n")
    sys.stderr.write("Current time: {}\n".format(timestamp()))
    sys.stderr.write("Trimming fastq files: " + " ".join(cmd) + "\n")
    # NOTE(review): this shortcut is taken even when args.force is set --
    # confirm that skipping a forced re-run is intended.
    outfiles = find_outfiles(aligndir, fqfiles, 'trimmed.fastq', quiet=True)
    if len(outfiles) == len(fqfiles):
        sys.stderr.write("Already have trimmed fq files.\n")
        return outfiles
    # Errors from the child process propagate unchanged.
    subprocess.check_call(cmd)
    return find_outfiles(aligndir, fqfiles, 'trimmed.fastq', quiet=True)