Example #1
def create_sorted_bam(samfile, outlabel, logfh, force, keepsam):
    bamfile = outlabel + '.bam'
    logfh.write("\nSorting sam: {}\n".format(samfile))
    logfh.write("Creating bam: {}\n".format(bamfile))
    cmd = picardExe[:] + [ 'SortSam', 'I='+samfile, 'O='+bamfile,
           'SORT_ORDER=coordinate']
    logfh.write(" ".join(cmd)+"\n")
    if have_file(bamfile, force, stderr=logfh):
        logfh.write("Already have bam file: {}\n".format(bamfile))
    else:
        logfh.flush()
        subprocess.call(cmd, stderr=logfh)
    if not keepsam and have_file(bamfile, stderr=logfh):
        remove_file(samfile, stderr=logfh)
    return bamfile
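
A minimal, self-contained sketch of the idiom this example uses (run the command only when its output is missing or force is set, logging stderr to the same handle). The have_file/remove_file helpers and picardExe are defined elsewhere in CFseq, so plain os.path and subprocess stand in for them here:

import os
import subprocess

def run_once(cmd, outfile, logfh, force=False):
    """Run cmd unless outfile already exists (and force is False)."""
    if os.path.isfile(outfile) and not force:
        logfh.write("Already have {}\n".format(outfile))
        return outfile
    logfh.write(" ".join(cmd) + "\n")
    logfh.flush()  # flush buffered log lines before the child writes to the handle
    subprocess.check_call(cmd, stderr=logfh)
    return outfile
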
Example #2
File: run_gatk.py Project: eulaf/CFseq
def run_gatk(bamfile, label, ref, args, logfh=sys.stderr):
    caller = 'HaplotypeCaller'
    outvcf = label + '.gatk.vcf'
    if args.cohort:
        outvcf = outvcf.replace('.vcf', '-cohort.vcf')
    cmd = gatkExe[:] + ['-T', caller, '--genotyping_mode', 'DISCOVERY']
    cmd.extend(['-R', ref, '-I', bamfile, '-o', outvcf])
    if args.intervals: cmd.extend(['-L', args.intervals])
    if args.dbsnp: cmd.extend(["--dbsnp", args.dbsnp])
    if args.maxreads:
        cmd.extend(['--maxReadsInRegionPerSample', str(args.maxreads)])
    if args.debug:
        bamout = label + '.gatk-debug.bam'
        cmd.extend(['-bamout', bamout])
    if args.cohort:
        cmd.extend(['-ERC', 'GVCF', '--variant_index_type', 'LINEAR'])
        cmd.extend(['--variant_index_parameter', '128000'])
    #    if args.basequal:
    #        cmd.extend(['-mbq', str(args.basequal)])
    logfh.write("\nGATK: " + " ".join(cmd) + "\n")
    if have_file(outvcf, args.force, stderr=logfh):
        logfh.write("  Already have {}.\n".format(outvcf))
    else:
        logfh.flush()
        check_call(cmd, stderr=logfh)
    if not os.path.isfile(outvcf):
        logfh.write("  Failed to create {}\n".format(outvcf))
        sys.stderr.write("  Failed to create {}\n".format(outvcf))
        sys.exit(1)
    return outvcf
Example #3
def calculate_depths(label, infiles, args):
    bamfile = infiles[label]['bam']
    vcffile = infiles[label]['vcf']
    outlabel = get_outlabel(bamfile, 'bam', args.outdir)
    loglabel = get_outlabel(bamfile, 'bam', args.logdir) if args.logdir \
               else outlabel
    logfile = loglabel + ".calculate_depths.log"
    outfile = outlabel + '.depths{}.txt'.format(args.tag)
    sys.stderr.write("  Calculating depths for {}\n".format(outlabel))
    sys.stderr.write("    Log file: {}\n".format(logfile))
    logfh = open(logfile, 'w')
    logfh.write("Bam file: {}\n".format(bamfile))
    logfh.write("VCF file: {}\n\n".format(vcffile))
    try:
        if have_file(outfile, args.force, stderr=logfh):
            logfh.write("  Already have {}\n".format(outfile))
        else:
            logfh.write("Start time: {}\n".format(timestamp()))
            (vcffields, variants) = parse_vcf(vcffile, logfh)
            depths = find_variant_depths(variants, bamfile, logfh, args)
            add_fwd_rev_depths(depths, variants, bamfile, args)
            print_depths(outfile, variants, depths, vcffields, logfh)
            logfh.write("End time: {}\n".format(timestamp()))
    except Exception as e:
        e.args += (vcffile, )
        raise
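
The try/except at the end is a re-raise idiom: tag the exception with the offending file, then re-raise it so the original traceback survives. A standalone sketch with a hypothetical parse():

def parse(path):
    raise ValueError("bad record")

try:
    try:
        parse("sample.vcf")
    except Exception as e:
        e.args += ("sample.vcf",)  # attach the file name to the exception
        raise                      # re-raise, keeping the original traceback
except ValueError as e:
    print(e.args)                  # ('bad record', 'sample.vcf')
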
Example #4
def create_annovar_input_file(vcffiles, outlabel, args):
    outfile = outlabel + '.avinput'
    cmd = [annovarInputExe, '-format', 'vcf4', '-allsample', '-withfreq',
           '-includeinfo']
    sys.stderr.write("\nCreating annovar input file: "+" ".join(cmd)+"\n")
    if have_file(outfile, args.force):
        sys.stderr.write("  Already have {}.\n".format(outfile))
    else: 
        lines = []
        for vcffile in vcffiles:
            sys.stderr.write("    Running: {} {}\n".format(" ".join(cmd),
                             vcffile))
            output = subprocess.check_output(cmd + [vcffile,])
            if not output:
                sys.stderr.write("  No output.\n")
            else:
                for line in output.split("\n"):
                    v = line.split("\t")[0:5]
                    if len(v)==5: 
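                        # convert start then end from CFTR-local to hg19
                        # coords; v[0] keeps the original chromosome until
                        # the second call no longer needs it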
                        (chrom, v[1]) = cftr.CFTR_to_hg19(v[0], v[1])
                        (v[0], v[2]) = cftr.CFTR_to_hg19(v[0], v[2])
                        lines.append("\t".join(map(str, v)))
        uniqlines = sorted(set(lines))
        with open(outfile, 'w') as ofh:
            ofh.write("\n".join(uniqlines))
        if not os.path.isfile(outfile):
            sys.stderr.write("  Failed to create {}\n".format(outfile))
            sys.exit(1)
    return outfile
Example #5
File: run_gatk.py Project: eulaf/CFseq
def cohort_merge_gvcfs(vcfs, ref, args):
    sys.stderr.write("Genotyping gVCFs: {} vcfs\n".format(len(vcfs)))
    label = args.cohort
    outlabel = os.path.join(args.outdir, label) if args.outdir else label
    outvcf = outlabel + ".gatk-merged.vcf"
    cmd = gatkExe[:] + ["-T", "GenotypeGVCFs"]
    cmd.extend(["-R", ref, "-o", outvcf])
    if args.intervals:
        cmd.extend(["-L", args.intervals])
    if args.dbsnp:
        cmd.extend(["--dbsnp", args.dbsnp])
    variants = ("--variant " + " --variant ".join(vcfs)).split(" ")
    cmd += variants
    if have_file(outvcf, args.force):
        sys.stderr.write("  Already have {}.\n".format(outvcf))
    else:
        loglabel = os.path.join(args.logdir, label) if args.logdir else label
        logfile = loglabel + ".gatk-merged.log"
        sys.stderr.write("    Log file {}\n".format(logfile))
        with open(logfile, "w") as logfh:
            logfh.write("CMD: {}\n".format(cmd))
            logfh.flush()
            check_call(cmd, stderr=logfh)
    if os.path.isfile(outvcf):
        sys.stderr.write("  Merged gVCF: {}\n".format(outvcf))
    else:
        sys.stderr.write("  Failed to create {}\n".format(outvcf))
        sys.exit(1)
    return outvcf
Example #6
def run_annovar(annov_input, outlabel, refdir, args):
    outfile1 = outlabel + '-hgvs.variant_function'
    outfile2 = outlabel + '-hgvs.exonic_variant_function'
    cmd = [annovarExe, '-build', 'hg19', '-hgvs', '-out', outlabel+'-hgvs',
           annov_input, refdir]
    sys.stderr.write("\nRunning annovar: "+" ".join(cmd)+"\n")
    if have_file(outfile1, args.force) and have_file(outfile2, args.force):
        sys.stderr.write("  Already have {} and {}.\n".format(outfile1,
                         outfile2))
    else: 
        subprocess.check_call(cmd)
        if not os.path.isfile(outfile1):
            sys.stderr.write("  Failed to create {}\n".format(outfile1))
            sys.exit(1)
        if not os.path.isfile(outfile2):
            sys.stderr.write("  Failed to create {}\n".format(outfile2))
            sys.exit(1)
    return (outfile1, outfile2)
Example #7
def run_bwa(sample, outlabel, ref, fqfiles, logfh, force):
    samfile = outlabel + ".sam"
    bamfile = outlabel + ".bam"
    logfh.write("Output sam: {}\n".format(samfile))
    readgroup = "\\t".join(['@RG', "ID:"+sample, "SM:"+sample, "PL:Illumina",
                            "LB:"+sample, "PU:unit1"])
    cmd = [bwaExe, 'mem', '-M', '-R', readgroup, ref] + fqfiles
    logfh.write(" ".join(cmd)+"\n")
    if have_file(samfile, force, stderr=logfh):
        logfh.write("Already have sam file: {}\n".format(samfile))
    elif have_file(bamfile, force, stderr=logfh):
        logfh.write("Already have bam file: {}\n".format(bamfile))
    else:
        logfh.flush()
        output = subprocess.check_output(cmd, stderr=logfh)
        with open(samfile, 'w') as ofh:
            ofh.write(output)
    return samfile
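
One note on the read group: bwa mem's -R value must arrive as a single argument containing literal "\t" escape sequences, which is why the code joins the fields with a two-character backslash-t rather than a real tab. A runnable sketch:

sample = "S1"
readgroup = "\\t".join(["@RG", "ID:" + sample, "SM:" + sample,
                        "PL:Illumina", "LB:" + sample, "PU:unit1"])
print(readgroup)  # @RG\tID:S1\tSM:S1\tPL:Illumina\tLB:S1\tPU:unit1
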
Example #8
File: run_gatk.py Project: eulaf/CFseq
def index_bam(bamfile, args, logfh=sys.stderr):
    label = bamfile.rstrip('bam').rstrip('.')
    outidx = label + '.bai'
    cmd = picardExe[:] + ['BuildBamIndex', 'I=' + bamfile]
    if not have_file(outidx, args.force, quiet=True, stderr=logfh):
        logfh.write("\nIndex bam: " + " ".join(cmd) + "\n")
        logfh.flush()
        check_call(cmd, stderr=logfh)
    if not os.path.isfile(outidx):
        logfh.write("  Failed to create {}\n".format(outidx))
        sys.stderr.write("  Failed to create {}\n".format(outidx))
        sys.exit(1)
Example #9
File: primer_trim.py Project: eulaf/CFseq
def create_fragment_report(fqpair, frag2primers, read_primer_file, force,
                           logfh):
    logfh.write("\nCreating fragment report\n".format(read_primer_file))
    if have_file(read_primer_file, force, stderr=logfh):
        logfh.write("  Already have {}\n".format(read_primer_file))
        return
    readnum_patt = re.compile('.*_(R[12]).*')
    readnums = [readnum_patt.sub('\\1_primer', fqfile) for fqfile in fqpair]
    logfh.write("  Writing {}\n".format(read_primer_file))
    fragcounts = {
        'tot_fragments': 0,
        'unidentified': 0,
        'singleton': 0,
        'paired-good': 0,
        'paired-other': 0,
        'misprime': 0,
    }
    with open(read_primer_file, 'w') as ofh:
        ofh.write("Fragment\t" + "\t".join(readnums) +\
                  "\tStatus\tEstimated_fragment_size\n")
        for fragname in sorted(frag2primers.keys()):
            row = [
                fragname,
            ]
            for fqfile in fqpair:
                if fqfile in frag2primers[fragname]['primers']:
                    row.append(frag2primers[fragname]['primers'][fqfile])
                else:
                    row.append('')
            status = frag2primers[fragname]['status']
            fragcounts[status] += 1
            row.extend([status, frag2primers[fragname]['ampsize']])
            ofh.write("\t".join([str(r) for r in row]) + "\n")
    fragcounts['tot_fragments'] = len(frag2primers.keys())
    logfh.write("{:<12}\t{:<6} fragments\t{:<5}%\n".format(
        "Fragment status", "Number", "Percent"))
    for k in fragcounts:
        perc = fragcounts[k] * 100.0 / fragcounts['tot_fragments']
        logfh.write("{:<12}\t{:>6} fragments\t{:5.1f}%\n".format(
            k, fragcounts[k], perc))
    logfh.write("{:<12}\t{:>6} fragments\n".format(
        "Total", fragcounts['tot_fragments']))
    fragcounts['paired'] = fragcounts['paired-good'] + \
                           fragcounts['paired-other']
    if os.path.isfile(read_primer_file):
        sys.stderr.write("    Fragment report {}\n".format(read_primer_file))
    return fragcounts
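
A standalone sketch of the fixed-width tally formatting used in the log summary ({:<12} left-pads the label, {:>6} right-aligns the count, {:5.1f} fixes the percentage width), with made-up counts:

counts = {"paired-good": 180, "singleton": 15, "unidentified": 5}
total = sum(counts.values())
for status, n in sorted(counts.items()):
    print("{:<12}\t{:>6} fragments\t{:5.1f}%".format(status, n, n * 100.0 / total))
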
Example #10
File: primer_trim.py Project: eulaf/CFseq
def create_primer_report(primerreads,
                         primerlist,
                         primer_read_file,
                         logfh,
                         force=False,
                         debug=False):
    """Create file listing each primer and the reads that match it.
    Include counts of unidentified and mismatched reads.  Tally percent
    of total reads amplified by each primer pair."""
    logfh.write("\nCreating primer report\n".format(primer_read_file))
    if have_file(primer_read_file, force, stderr=logfh):
        logfh.write("  Already have {}\n".format(primer_read_file))
        return
    logfh.write("  Writing {}\n".format(primer_read_file))
    tot_reads = 0
    keylist = primerlist + ['unidentified', 'misprime']
    primercounts = {}
    primerkeys = []
    with open(primer_read_file, 'w') as ofh:
        ofh.write("Primer\tNum_reads\tReads\n")
        for primer in keylist:
            numreads = len(primerreads[primer])
            tot_reads += numreads
            readlist = ", ".join(primerreads[primer])
            primerpair = primer[:-2] if primer.endswith(('_F', '_R')) else primer
            if primerpair in primercounts:
                primercounts[primerpair] += numreads
            else:
                primerkeys.append(primerpair)
                primercounts[primerpair] = numreads
            ofh.write("{}\t{}\t{}\n".format(primer, numreads, readlist))
    if debug:
        logfh.write("{:<12}\t{:<6} reads\t{:<5}%\n".format(
            "Primer", "Number", "Percent"))
        for k in keylist:
            numreads = len(primerreads[k])
            perc = numreads * 100.0 / tot_reads
            logfh.write("{:<12}\t{:>6} reads\t{:5.2f}%\n".format(
                k, numreads, perc))
        logfh.write("{:<12}\t{:<6} reads\n".format("Total", tot_reads))
    primercounts['tot_reads'] = tot_reads
    primerkeys.insert(0, 'tot_reads')
    if os.path.isfile(primer_read_file):
        sys.stderr.write(
            "    Primer read report {}\n".format(primer_read_file))
    return (primercounts, primerkeys)
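
The primer-pair name is derived with endswith rather than rstrip because str.rstrip strips a character set, not a suffix, and would mangle primer names ending in F, R, or underscore. A quick demonstration:

print("EX1F_F".rstrip("_F"))  # 'EX1' -- rstrip also eats the stem's trailing F
name = "EX1F_F"
print(name[:-2] if name.endswith(("_F", "_R")) else name)  # 'EX1F'
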
Example #11
def table_annovar(annov_input, outlabel, refdir, args):
    outfile = outlabel + '.hg19_multianno.txt'
    outfile1 = outlabel + '.variant_function'
    outfile2 = outlabel + '.exonic_variant_function'
    (protocol_list, operation_list) = annovar_protocol(annovarDBs, refdir)
    cmd = [ annovarTableExe, annov_input, refdir, '-out', outlabel, ]
    cmd.extend(['-buildver', 'hg19', '-nastring', '.'])
    cmd.extend(['-protocol', protocol_list, '-operation', operation_list])
    sys.stderr.write("\nRunning table_annovar: "+" ".join(cmd)+"\n")
    if have_file(outfile, args.force):
        sys.stderr.write("  Already have {}.\n".format(outfile))
    else: 
        subprocess.check_call(cmd)
        if not os.path.isfile(outfile):
            sys.stderr.write("  Failed to create {}\n".format(outfile))
            sys.exit(1)
    return outfile
Example #12
File: separate_vcf.py Project: eulaf/CFseq
def run_select_variants(vcffile, outvcf, sample, ref, logfh, args):
    logfh.write("\n-- SelectVariants --\n")
    cmd = gatkExe[:]
    cmd.extend(['-T', 'SelectVariants', '--excludeNonVariants']) 
    cmd.extend(['-R', ref])  
    cmd.extend(['--variant', vcffile])  
    cmd.extend(['-o', outvcf])  
    cmd.extend(['-sn', sample])  
    logfh.write("  Sample {}:\t{}\n".format(sample, outvcf))
    if have_file(outvcf, args.force):
        sys.stderr.write("  Already have {}.\n".format(outvcf))
    else: 
        logfh.write(" ".join(cmd)+"\n")
        logfh.flush()
        check_call(cmd, stderr=logfh)
        if os.path.isfile(outvcf):
            sys.stderr.write("  Created {}\n".format(outvcf))
        else:
            sys.stderr.write("  Failed to create {}\n".format(outvcf))
    return outvcf
Example #13
File: primer_trim.py Project: eulaf/CFseq
def run_aligner(queryfiles, primerfa, outlabel, logfh, force):
    outfile = outlabel + "-primers.cm.out"
    logfh.write("\nAlignment output in {}\n".format(outfile))
    if have_file(outfile, force, stderr=logfh):
        logfh.write("      Already have {}\n".format(outfile))
        return outfile
    try:
        with open(outfile, 'w') as ofh:
            for queryfile in queryfiles:
                logfh.write("    Aligning {}\n".format(queryfile))
                cmd = [CM_EXE, queryfile, primerfa, "-minscore", "12", 
                       "-minmatch", "8", "-tags", "-alignments"]
                logfh.write("      Running {}\n".format(" ".join(cmd)))
                logfh.flush()
                subprocess.check_call(cmd, stdout=ofh, stderr=logfh)
    except subprocess.CalledProcessError as e:
        sys.stderr.write("Error running cross_match for {}\n".format(outfile))
        raise
    if not os.path.isfile(outfile):
        sys.stderr.write("Output file {} not found.\n".format(outfile))
    return outfile
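
The loop funnels every cross_match run's stdout into one combined file by passing the same open handle to each check_call. A minimal sketch of the pattern, with a POSIX echo standing in for cross_match:

import subprocess

with open("combined.out", "w") as ofh:
    for query in ["q1.fa", "q2.fa"]:
        # each child appends to the shared handle in turn
        subprocess.check_call(["echo", query], stdout=ofh)
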
Example #14
File: primer_trim.py Project: eulaf/CFseq
def trim_primers(fqfile, alignout, max_trim_len, primerinfo, outlabel, logfh, 
                 args):
    """Returns trimmed fastq file and file with list of sequence names"""
    trimmedfq = outlabel + ".trimmed.fastq"
    seqfile = outlabel + ".seqlist.txt"
    logfh.write("    Trimming fq: {}\n".format(trimmedfq))
    if have_files([trimmedfq, seqfile], args.force, stderr=logfh):
        logfh.write("      Already have {}\n".format(trimmedfq))
        return (trimmedfq, seqfile)
    aligns = parse_alignout(alignout)
    seqlist = []
    with open(trimmedfq, 'w') as outfq:
        inseq = FastQParser(fqfile)
        for seqrec in inseq:
            seqlist.append(seqrec.id)
            if seqrec.id in aligns:
                primer = aligns[seqrec.id]['primer']
                if primerinfo[primer]['overlap']:
                    primerend = aligns[seqrec.id]['end'] +\
                                aligns[seqrec.id]['left']
                    subrec = seqrec[primerend:]
                    if args.debug:
                        logfh.write("{}\tTrimming\t{}\n".format(
                                         primer, seqrec.id))
                else:
                    if args.debug:
                        logfh.write("{}\tNot trimming\t{}\n".format(
                                         primer, seqrec.id))
                    subrec = seqrec
            else: #trim default max_primer_len+2
                subrec = seqrec[max_trim_len:]
            outfq.write("{}\n".format(subrec.fastq()))
    logfh.write("    Seq list: {}\n".format(seqfile))
    if have_file(seqfile, True, stderr=logfh):
        logfh.write("      Still have {}\n".format(seqfile))
        sys.exit()
    with open_file(seqfile, 'w') as ofh:
        ofh.write("\n".join(seqlist)+"\n")
    return (trimmedfq, seqfile)
Example #15
def run_freebayes(bamfile, reffile, bedfile, logfh, args):
    label = get_outlabel(bamfile, args.outdir)
    outvcf = label + '.freebayes.vcf'
    cmd = [freebayesExe, '-f', reffile, '-t', bedfile]
    cmd.extend(['-b', bamfile, '-v', outvcf])
    #    cmd.extend(['--max-complex-gap', '5',])
    if args.basequal:
        cmd.extend(['-q', str(args.basequal)])  
    if args.minaltcount: 
        cmd.extend(['-C', str(args.minaltcount)])
    logfh.write("\nCMD: "+" ".join(cmd)+"\n")
    if have_file(outvcf, args.force):
        sys.stderr.write("  Already have {}.\n".format(outvcf))
    else: 
        logfh.flush()
        check_call(cmd, stderr=logfh)
    if os.path.isfile(outvcf):
        sys.stderr.write("  Freebayes result in {}\n".format(outvcf))
    else:
        sys.stderr.write("  Failed to create {}\n".format(outvcf))
        sys.exit(1)
    return outvcf
Example #16
File: primer_trim.py Project: eulaf/CFseq
def create_fasta_of_primer_region(fqfile, max_trim_len, outlabel, logfh,
                                  force):
    """Creates fasta files of the first max_trim_len bases of
    each sequence in given fqfile.  Also, returns a list with names of
    all sequences in fqfile."""
    outfile = outlabel + ".primer_region{}.fa".format(max_trim_len)
    logfh.write("    Creating 5' fasta: {}\n".format(outfile))
    outfiles = [outfile]
    if have_file(outfile, force, stderr=logfh):
        logfh.write("      Already have {}\n".format(outfile))
    else:
        logfh.write("      Writing {}\n".format(outfile))
        numseqs = 0
        totseqs = 0
        outfa = open(outfile, 'w')
        inseq = FastQParser(fqfile)
        for seqrec in inseq:
            numseqs += 1
            subseq = seqrec.seq[0:max_trim_len]
            outfa.write(">{}\n{}\n".format(seqrec.id, subseq))
            if numseqs == MAX_READS:
                outfa.close()
                logfh.write("      Wrote {} seqs\n".format(numseqs))
                numfiles = len(outfiles) + 1
                outfile = outlabel + ".primer_region{}-{}.fa".format(
                    max_trim_len, numfiles)
                logfh.write("      Writing {}\n".format(outfile))
                outfiles.append(outfile)
                outfa = open(outfile, 'w')
                totseqs += numseqs
                numseqs = 0
        logfh.write("      Wrote {} seqs\n".format(numseqs))
        if len(outfiles) > 1:
            totseqs += numseqs
            logfh.write("  Wrote {} total seqs\n".format(totseqs))
    return outfiles
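
The rollover logic starts a new FASTA file every MAX_READS sequences; a self-contained sketch of the same chunking with plain strings:

MAX_READS = 3
records = ["r{}".format(i) for i in range(8)]
chunks, current = [], []
for rec in records:
    current.append(rec)
    if len(current) == MAX_READS:  # same point where the code opens a new file
        chunks.append(current)
        current = []
if current:  # keep the final partial chunk
    chunks.append(current)
print([len(c) for c in chunks])  # [3, 3, 2]
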
Example #17
def filter_freebayes_vcf(vcffile, outvcf, logfh, args):
    #    cmd = [ vcffilterExe, '-f', "QUAL > 20", "-f", "DP > 10" ]
    cmd = [vcffilterExe, '-s']
    if args.outdir:
        cmd.extend(['-o', args.outdir])
    if args.altbasequal:
        cmd.extend(['-a', str(args.altbasequal)])
    if args.dp:
        cmd.extend(['-d', str(args.dp)])
    if args.qual:
        cmd.extend(['-q', str(args.qual)])
    cmd.append(vcffile)
    logfh.write("\nCMD: " + " ".join(cmd) + "\n")
    if have_file(outvcf, args.force):
        sys.stderr.write("  Already have {}.\n".format(outvcf))
    else:
        logfh.flush()
        check_call(cmd, stderr=logfh)
    if os.path.isfile(outvcf):
        sys.stderr.write("  Filtered FreeBayes result {}\n".format(outvcf))
    else:
        sys.stderr.write("  Failed to create {}\n".format(outvcf))
        sys.exit(1)
    return outvcf
Example #18
File: primer_trim.py Project: eulaf/CFseq
                        help="Name for summary file.")
    parser.add_argument("-o", "--outdir", help="Directory for output files.")
    parser.add_argument("-f", "--force", default=False, action='store_true',
                        help="Overwrite existing files.")
    parser.add_argument("-p", "--processes", default=1, type=int,
                        help="Number of processes to run.")
    parser.add_argument("--debug", default=False, action='store_true',
                        help="Debug mode.")
    parser.add_argument("--logdir", help="Directory for log files.")

    if len(sys.argv)<2:
        parser.print_help()
        sys.exit()
    args = parser.parse_args()

    primerfa = RESOURCE['primer_fa']
    if not os.path.isfile(primerfa):
        sys.exit("Could not find resource {}\n".format(primerfa))
    (primerinfo, max_primer_len) = primer_info(primerfa)
    find_overlaps(primerinfo, args.debug)
    outfiles = align_trim_all_fqfiles(args.fqfiles, primerfa, 
                                      primerinfo, max_primer_len, args)
    if args.outdir: 
        args.summary = os.path.join(args.outdir, args.summary)
    if have_file(args.summary, args.force):
        sys.stderr.write("Already have {}.\n".format(args.summary))
    else:
        samplecounts = assess_all_primers(args.fqfiles, outfiles, primerinfo, 
                                          args)
        print_summary(samplecounts, args.summary)
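
These entry points share one argparse convention: invoking the script with no arguments prints the full help text rather than argparse's terse usage error. A minimal runnable template:

import sys
from argparse import ArgumentParser

parser = ArgumentParser(description="demo")
parser.add_argument("-f", "--force", default=False, action="store_true",
                    help="Overwrite existing files.")
if len(sys.argv) < 2:  # no arguments at all: show help and exit cleanly
    parser.print_help()
    sys.exit()
args = parser.parse_args()
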
Example #19
                        " .variant_function and" +\
                        " .exonic_variant_function files.", )
    parser.add_argument("-o", "--outdir", help="Directory for output file.",)
    parser.add_argument("-l", "--label", help="Label for output file.",)
    parser.add_argument("-f", "--force", default=False, action='store_true',
                        help="Overwrite existing files.")

    if len(sys.argv)<2:
        parser.print_help()
        sys.exit()
    args = parser.parse_args()

    outlabel = get_outlabel(args.vcffiles, args)
    outfile = outlabel + ".results.txt"
    rejectfile = outlabel + ".rejects.txt"
    if have_file(outfile, args.force):
        sys.stderr.write("  Already have {}.\n".format(outfile))
    else:
        flatdata = defaultdict(dict)
        for vcffile in args.vcffiles:
            (fields, vcfdata) = parse_vcf(vcffile)
            (flatdata, newfields) = flatten_vcf_data(fields, vcfdata, flatdata)
        annovar_data = get_annovar_data(args.annovar)
        bedfile = cftr.RESOURCE['analysis_roi_bed']
        if not os.path.isfile(bedfile): 
            sys.exit("BED file {} not found\n".format(bedfile))
        roi = cftr.parse_bedfile(bedfile)
        create_spreadsheet(newfields, flatdata, annovar_data, roi, outfile,
                           rejectfile, args)

Example #20
    parser.add_argument("-s", "--dbsnp", default=False, action='store_true',
                        help="Add dbsnp to ID field.",)
    parser.add_argument("-l", "--label", default='filtered',
                        help="Label for output files.",)
    parser.add_argument("-f", "--force", default=False, action='store_true',
                        help="Overwrite existing files.")

    if len(sys.argv)<2:
        parser.print_help()
        sys.exit()
    args = parser.parse_args()

    dbsnp = parse_dbsnp() if args.dbsnp else {}
    pos_removed = []
    for vcffile in args.vcffiles:
        sys.stderr.write("Processing {}.\n".format(vcffile))
        outlabel = os.path.basename(vcffile).rstrip('vcf').rstrip('.')
        outvcf = outlabel + '.{}.vcf'.format(args.label)
        if args.outdir:
            outvcf = os.path.join(args.outdir, outvcf)
        if have_file(outvcf, args.force):
            sys.stderr.write("  Already have {}.\n".format(outvcf))
        else:
            sys.stderr.write("Writing {}\n".format(outvcf))
            (header, fields, vcfinfo) = parse_vcf(vcffile)
            if args.dbsnp: add_dbsnp(vcfinfo, dbsnp)
            pos_rm = filter_vcf(header, fields, vcfinfo, outvcf, dbsnp, args)
            pos_removed.extend(pos_rm)
    sys.stderr.write("\nPositions removed: {}\n".format(", ".join(
        [str(j) for j in sorted([int(i) for i in set(pos_removed)])])))
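
The final log line dedupes the removed positions and sorts them numerically before joining; as a sketch with made-up positions:

pos_removed = ["117232214", "117199644", "117232214"]
print(", ".join(str(j) for j in sorted(set(int(i) for i in pos_removed))))
# 117199644, 117232214
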
Example #21
File: add_mol2k.py Project: eulaf/CFseq
    #    parser.add_argument("mol2k", help="Mol2k file of variants.")
    parser.add_argument("spreadsheets", nargs="+", help="Spreadsheet to check")
    parser.add_argument("-o", "--outdir", help="Directory for output.")
    parser.add_argument("-s", "--seen", default=False, action="store_true", help="Reset Seen counter.")
    parser.add_argument("-f", "--force", default=False, action="store_true", help="Overwrite existing files.")

    if len(sys.argv) < 2:
        parser.print_help()
        sys.exit()
    args = parser.parse_args()

    mol2kfile = cftr.RESOURCE["cftr_db"]
    if not os.path.isfile(mol2kfile):
        sys.exit("Could not find mol2k resource file: {}\n".format(mol2kfile))
    (mol2k, mol2k_fields) = parse_mol2k(mol2kfile, args.seen)
    seenfile = os.path.basename(mol2kfile).rstrip("txt").rstrip("csv").rstrip(".") + ".seen.txt"
    if args.outdir:
        seenfile = os.path.join(args.outdir, seenfile)
    for spreadsheet in args.spreadsheets:
        outfile = spreadsheet.rstrip("txt").rstrip(".") + ".mol2k.txt"
        if args.outdir:
            outfile = os.path.join(args.outdir, os.path.basename(outfile))
        if have_file(outfile, args.force):
            sys.stderr.write("  Already have {}.\n".format(outfile))
        else:
            find_mol2k_variants(spreadsheet, mol2k, outfile)
    if have_file(seenfile, args.force):
        sys.stderr.write("  Already have {}.\n".format(seenfile))
    else:
        print_mol2k_seen(mol2k, mol2k_fields, seenfile)
Example #22
                        help="Label for output files.")
    parser.add_argument("--debug", action="store_true", default=False,
                        help="Debug mode.")
    parser.add_argument("-f", "--force", default=False, action="store_true",
                        help="Overwrite existing files.")

    if len(sys.argv)<2:
        parser.print_help()
        sys.exit()
    args = parser.parse_args()

    outfile = args.outlabel + ".tgpolyt_counts.txt"
    summaryfile = args.outlabel + ".tgpolyt.txt"
    sys.stderr.write("Writing {}\n".format(outfile))
    sys.stderr.write("Writing {}\n".format(summaryfile))
    if have_file(outfile, args.force) and have_file(summaryfile, args.force):
        sys.stderr.write("Already have {} and {}\n".format(
                         outfile, summaryfile))
        sys.exit()
    outfields = ['sample', 'TG-polyT', 'frequency', 'num_reads', 
                 'tot_reads']
    summfields = ['sample', 'TG-polyT', 'hom_het', 'frequency', 
                  'num_reads']
    with open(outfile, 'w') as ofh, open(summaryfile, 'w') as sfh:
        ofh.write("\t".join(outfields)+"\n")
        sfh.write("\t".join(summfields)+"\n")
        for bamfile in args.bamfiles:
            sample = get_samplename(bamfile)
            sys.stderr.write("\nReading bam file: {}\n".format(bamfile))
            sys.stderr.write("  Sample: {}\n".format(sample))
            reads = get_reads_covering_region(bamfile, REGION)
            (tgpolyt, totreads) = count_tgpolyt(reads, REGION, args.debug)
            report_results(ofh, sfh, sample, tgpolyt, totreads, args)
Example #23
    parser = ArgumentParser(description=descr)
    parser.add_argument("primer2readsdir", 
                        help="Directory with primer2reads.txt files.")
    parser.add_argument("bamfiles", nargs="+", help="Bam files.")
    parser.add_argument("-o", "--outfile", default="uniformity.txt",
                        help="Name for output file.")
    parser.add_argument("-f", "--force", default=False, action="store_true",
                        help="Overwrite existing files.")
    parser.add_argument("-d", "--debug", default=False, action="store_true",
                        help="Keep intermediate files.")

    if len(sys.argv)<2:
        parser.print_help()
        sys.exit()
    args = parser.parse_args()

    if have_file(args.outfile, args.force):
        sys.stderr.write("  Already have {}\n".format(args.outfile))
        sys.exit()
    roi = parse_roi()
    p2rdict = get_primer2reads_files(args.primer2readsdir)
    ampcov = {}
    for bamfile in sorted(args.bamfiles):
        sample = os.path.basename(bamfile).split('-')[0].split('.')[0]
        if sample not in p2rdict:
            sys.stderr.write("No primer2read file for {}\n".format(bamfile))
            continue
        p2r = parse_primer2reads_file(p2rdict[sample])
        ampcov[sample] = get_amplicon_coverage(sample, bamfile, p2r, roi)
    compile_data(ampcov, roi, args.outfile)
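
The sample name is recovered by taking everything before the first '-' or '.' of the bam basename, so bam names must match the keys of the primer2reads files. For example, with a hypothetical file name:

import os
bam = "S7-run3.sorted.bam"
print(os.path.basename(bam).split('-')[0].split('.')[0])  # S7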