Example #1
def run(parser, args):
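    """Dispatch to the assembler backend selected by --method."""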
    if args.method == "spades":
        run_spades(parser, args)
    elif args.method == "dipspades":
        run_dipspades(parser, args)
    elif args.method == "megahit":
        run_megahit(parser, args)
    else:
        status("Unknow assembler method {}".format(args.method))
Example #2
def orient_to_start(fasta_in, fasta_out, folder='.', start=False):
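    """Rotate a circular assembly so it begins at a start gene.

    If no start FASTA is supplied, a consensus fragment of the fungal
    cytochrome b (cob) gene is used as the anchor.
    """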
    # if no start sequence given, then use cytochrome b (cob)
    startFile = os.path.join(folder, '{}.fasta'.format(uuid.uuid4()))
    if not start:
        # generated as spoa consensus from select fungal cob genes
        cob1 = 'atgagaattttaaaaagtcatcctttattaaaattagttaatagttatattattgattcaccacaaccttctaatattagttatttatgaaattttggatctttattagctttatgtttagttatacaaattgtaactggtgttacattagctatgcactatacacctaatgttgatttagcttttaattctgtagaacatattatgagagatgtaaataatggttgattaataagatatttacatgctaatactgcttcagcattctttttcttagttatatttacatataggtagaggattatattatggttcatataaatcacctagaacattaacatgagctattgg'
        with open(startFile, 'w') as outfile:
            outfile.write('>COB\n{}\n'.format(softwrap(cob1)))
    else:
        shutil.copyfile(start, startFile)

    # load sequence into dictionary
    initial_seq = ''
    header = ''
    with open(fasta_in, 'r') as infile:
        for title, seq in SimpleFastaParser(infile):
            initial_seq = seq
            header = title

    alignments = []
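    # map the start sequence to the assembly with minimap2; -c adds CIGARs to the PAF output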
    minimap2_cmd = ['minimap2', '-x', 'map-ont', '-c', fasta_in, startFile]
    for line in execute(minimap2_cmd, '.'):
        cols = line.rstrip().split('\t')
        alignments.append(cols)
    if len(alignments) == 1:
        # PAF columns (0-based): 2 = query start, 4 = strand, 7 = target start, 8 = target end
        cols = alignments[0]
        ref_strand = cols[4]
        ref_offset = int(cols[2])
        if ref_strand == '-':
            ref_start = int(cols[8]) + ref_offset
        else:
            ref_start = int(cols[7]) - ref_offset
        rotated = initial_seq[ref_start:] + initial_seq[:ref_start]
        if ref_strand == '-':
            rotated = RevComp(rotated)
        with open(fasta_out, 'w') as outfile:
            outfile.write('>{}\n{}\n'.format('mt', softwrap(rotated)))
    elif len(alignments) == 0:
        status(
            'ERROR: unable to rotate because did not find --starting sequence\n'
        )
        with open(fasta_out, 'w') as outfile:
            outfile.write('>{}\n{}\n'.format('mt', softwrap(initial_seq)))
    elif len(alignments) > 1:
        status('ERROR: unable to rotate because found multiple alignments\n')
        for x in alignments:
            sys.stderr.write('{}\n'.format(x))
        with open(fasta_out, 'w') as outfile:
            outfile.write('>{}\n{}\n'.format('mt', softwrap(initial_seq)))
    if os.path.isfile(startFile):
        os.remove(startFile)
Example #3
def run(parser, args):
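    """Sort FASTA records longest to shortest and renumber the headers."""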
    status('Sorting sequences by length longest --> shortest')
    AllSeqs = {}
    with open(args.input, 'r') as fasta_in:
        for Header, Seq in SimpleFastaParser(fasta_in):
            if Header not in AllSeqs and len(Seq) >= args.minlen:
                AllSeqs[Header] = len(Seq)
    sortSeqs = sorted(AllSeqs.items(),
                      key=operator.itemgetter(1),
                      reverse=True)
    orderedSeqs = [i[0] for i in sortSeqs]
    SeqRecords = SeqIO.to_dict(SeqIO.parse(args.input, 'fasta'))
    with open(args.out, 'w') as fasta_out:
        for i, x in enumerate(orderedSeqs):
            fasta_out.write('>{:}_{:}\n{:}\n'.format(
                args.name, i + 1, softwrap(str(SeqRecords[x].seq))))

    status('Output written to: {:}'.format(args.out))
Example #4
def run_dipspades(parser, args):
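    """Assemble reads with dipSPAdes and copy out the consensus contigs."""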

    if not args.workdir:
        args.workdir = 'dipspades_' + str(os.getpid())

    runcmd = [
        'dipspades.py', '--threads',
        str(args.cpus), '--cov-cutoff', 'auto', '--mem', args.memory, '-o',
        args.workdir
    ]

    if args.assembler_args:
        runcmd.extend(args.assembler_args)

    if args.haplocontigs:
        runcmd.extend(['--hap', args.haplocontigs])

    if args.tmpdir:
        runcmd.extend(['--tmp-dir', args.tmpdir])

    #find reads -- use --left/right or look for cleaned in tmpdir
    forReads, revReads = (None, ) * 2
    if args.left:
        forReads = os.path.abspath(args.left)
    if args.right:
        revReads = os.path.abspath(args.right)
    if not forReads:
        status('Unable to locate FASTQ raw reads, provide --left')
        sys.exit(1)

    if not revReads:
        runcmd = runcmd + ['-s', forReads]
    else:
        runcmd = runcmd + ['--pe1-1', forReads, '--pe1-2', revReads]

    # if the working directory already exists, this overrides everything above
    # and simply resumes the existing run
    if os.path.isdir(args.workdir):
        runcmd = ['dipspades.py', '-o', args.workdir, '--continue']

    # now run the dipSPAdes job
    status('Assembling FASTQ data using dipSPAdes')

    printCMD(runcmd)
    DEVNULL = open(os.devnull, 'w')
    if args.debug:
        subprocess.run(runcmd)
    else:
        subprocess.run(runcmd, stdout=DEVNULL, stderr=DEVNULL)
    #pull out assembly

    if args.out:
        finalOut = args.out
    else:
        finalOut = prefix + '.dipspades.fasta'
    dipspadesoutdir = os.path.join(args.workdir, 'dipspades')
    if os.path.isfile(os.path.join(dipspadesoutdir, 'consensus_contigs.fasta')):
        shutil.copyfile(os.path.join(dipspadesoutdir, 'consensus_contigs.fasta'),
                        finalOut)
        shutil.copyfile(
            os.path.join(dipspadesoutdir, 'paired_consensus_contigs.fasta'),
            prefix + ".dipspades_consensus_paired.fasta")
        shutil.copyfile(
            os.path.join(dipspadesoutdir, 'unpaired_consensus_contigs.fasta'),
            prefix + ".dipspades_consensus_unpaired.fasta")
        status('Dipspades assembly finished: {:}'.format(finalOut))
        status('Dipspades consensus contigs copied to: {:} and {:}'.format(
            prefix + ".dipspades_consensus_paired.fasta",
            prefix + ".dipspades_consensus_unpaired.fasta"))
        numSeqs, assemblySize = fastastats(finalOut)
        status('Assembly is {:,} scaffolds and {:,} bp'.format(
            numSeqs, assemblySize))
        numSeqs, assemblySize = fastastats(finalOut)
        status('Assembly is {:,} scaffolds and {:,} bp'.format(
            numSeqs, assemblySize))
    else:
        status(
            'Dipspades assembly output missing -- check the dipspades logfile in {:}.'
            .format(os.path.join(args.workdir, 'dipspades', 'dipspades.log')))

    if not args.pipe:
        status(
            'Your next command might be:\n\tAAFTF vecscreen -i {:} -c {:}\n'.
            format(finalOut, args.cpus))
Example #5
def run_megahit(parser, args):
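    """Assemble reads with MEGAHIT and copy out the final contigs."""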

    if not args.workdir:
        args.workdir = 'megahit_' + str(os.getpid())

    runcmd = ['megahit', '-t', str(args.cpus), '-o', args.workdir]

    if args.assembler_args:
        runcmd.extend(args.assembler_args)

    if args.memory:
        runcmd.extend(['--memory', args.memory])

    if args.tmpdir:
        runcmd.extend(['--tmp-dir', args.tmpdir])

    #find reads -- use --left/right or look for cleaned in tmpdir
    forReads, revReads = (None, ) * 2
    if args.left:
        forReads = os.path.abspath(args.left)
    if args.right:
        revReads = os.path.abspath(args.right)
    if not forReads:
        status('Unable to locate FASTQ raw reads, provide --left')
        sys.exit(1)

    if not revReads:
        runcmd = runcmd + ['-r', forReads]
    else:
        runcmd = runcmd + ['-1', forReads, '-2', revReads]

    if os.path.isdir(args.workdir):
        # megahit refuses to overwrite an existing output folder
        status("Cannot re-run with existing folder {}".format(args.workdir))
        sys.exit(1)

    # now run the megahit job
    status('Assembling FASTQ data using megahit')
    printCMD(runcmd)
    DEVNULL = open(os.devnull, 'w')
    if args.debug:
        subprocess.run(runcmd)
    else:
        subprocess.run(runcmd, stdout=DEVNULL, stderr=DEVNULL)
    #pull out assembly
    if args.out:
        finalOut = args.out
    else:
        finalOut = prefix + '.megahit.fasta'

    if os.path.isfile(os.path.join(args.workdir, 'final.contigs.fa')):
        shutil.copyfile(os.path.join(args.workdir, 'final.contigs.fa'),
                        finalOut)
        status('Megahit assembly finished: {:}'.format(finalOut))
        numSeqs, assemblySize = fastastats(finalOut)
        status('Assembly is {:,} scaffolds and {:,} bp'.format(
            numSeqs, assemblySize))
    else:
        status('Megahit assembly output missing -- check megahit logfile.')

    if not args.pipe:
        status(
            'Your next command might be:\n\tAAFTF vecscreen -i {:} -c {:}\n'.
            format(finalOut, args.cpus))
Example #6
def run(parser, args):
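    """Purge contaminant contigs using sourmash taxonomy and read coverage."""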

    if not args.workdir:
        args.workdir = 'aaftf-sourpurge_' + str(uuid.uuid4())[:8]
    if not os.path.exists(args.workdir):
        os.mkdir(args.workdir)

    bamthreads = 4
    if args.cpus < 4:
        bamthreads = 1

    #find reads
    forReads, revReads = (None, ) * 2
    if args.left:
        forReads = os.path.abspath(args.left)
    if args.right:
        revReads = os.path.abspath(args.right)
    if not forReads:
        status(
            'Unable to locate FASTQ raw reads; low-coverage filtering will be '
            'skipped. Provide -l,--left and/or -r,--right to enable it.')

    # parse database locations
    if not args.sourdb:
        try:
            DB = os.environ["AAFTF_DB"]
            SOUR = os.path.join(DB, 'genbank-k31.lca.json.gz')
        except KeyError:
            if args.AAFTF_DB:
                SOUR = os.path.join(args.AAFTF_DB, 'genbank-k31.lca.json.gz')
            else:
                status(
                    "$AAFTF_DB/genbank-k31.lca.json.gz not found, pass --sourdb"
                )
                sys.exit(1)
        if not os.path.isfile(SOUR):
            status(
                "{:} sourmash database not found, download and rename to genbank-k31.lca.json.gz"
                .format(SOUR))
            sys.exit(1)
    else:
        SOUR = os.path.abspath(args.sourdb)

    # hard coded tmpfile
    assembly_working = 'assembly.fasta'
    megablast_working = 'megablast.out'
    blobBAM = 'remapped.bam'
    shutil.copyfile(args.input, os.path.join(args.workdir, assembly_working))
    numSeqs, assemblySize = fastastats(
        os.path.join(args.workdir, assembly_working))
    status('Assembly is {:,} contigs and {:,} bp'.format(
        numSeqs, assemblySize))
    DEVNULL = open(os.devnull, 'w')

    #now filter for taxonomy with sourmash lca classify
    status('Running SourMash to get taxonomy classification for each contig')
    sour_sketch = os.path.basename(assembly_working) + '.sig'
    sour_compute = [
        'sourmash', 'compute', '-k', '31', '--scaled=1000', '--singleton',
        assembly_working
    ]
    printCMD(sour_compute)
    subprocess.run(sour_compute, cwd=args.workdir, stderr=DEVNULL)
    sour_classify = [
        'sourmash', 'lca', 'classify', '--db', SOUR, '--query', sour_sketch
    ]
    printCMD(sour_classify)
    # output csv: ID,status,superkingdom,phylum,class,order,family,genus,species,strain
    Taxonomy = {}
    UniqueTax = []
    sourmashTSV = os.path.join(args.workdir, 'sourmash.csv')
    with open(sourmashTSV, 'w') as sour_out:
        for line in execute(sour_classify, args.workdir):
            sour_out.write(line)
            if not line or line.startswith('\n') or line.startswith(
                    'ID') or line.count(',') < 9:
                continue
            line = line.strip()
            cols = line.split(',')
            if 'found' in cols:
                idx = cols.index('found')
                Taxonomy[cols[0]] = cols[idx + 1:]
                taxClean = [x for x in cols[idx + 1:] if x]
                UniqueTax.append('{:}'.format(';'.join(taxClean)))
            elif 'nomatch' in cols:
                idx = cols.index('nomatch')
                Taxonomy[cols[0]] = cols[idx + 1:]
    UniqueTax = set(UniqueTax)
    status('Found {:} taxonomic classifications for contigs:\n{:}'.format(
        len(UniqueTax), '\n'.join(UniqueTax)))
    if args.taxonomy:
        # user only requested the taxonomy report, so stop here
        sys.exit(0)
    Tax2Drop = []
    for k, v in Taxonomy.items():
        v = [x for x in v if x]  #remove empty items from list
        if args.debug:
            print('{:}\t{:}'.format(k, v))
        if len(v) > 0:
            if not any(i in v for i in args.phylum):
                Tax2Drop.append(k)

    #drop contigs from taxonomy before calculating coverage
    status('Dropping {:} contigs from taxonomy screen'.format(len(Tax2Drop)))
    sourTax = os.path.join(args.workdir, 'sourmashed-tax-screen.fasta')
    with open(sourTax, 'w') as outfile:
        with open(os.path.join(args.workdir, assembly_working),
                  'r') as infile:
            for record in SeqIO.parse(infile, 'fasta'):
                if record.id not in Tax2Drop:
                    SeqIO.write(record, outfile, 'fasta')

    # only do coverage trimming if reads provided
    Contigs2Drop = []  # stays empty if no reads were given for coverage screening
    if forReads:
        #check if BAM present, if so skip running
        if not os.path.isfile(os.path.join(args.workdir, blobBAM)):
            # index
            bwa_index = ['bwa', 'index', os.path.basename(sourTax)]
            status('Building BWA index')
            printCMD(bwa_index)
            subprocess.run(bwa_index, cwd=args.workdir, stderr=DEVNULL)
            #mapped reads to assembly using BWA
            bwa_cmd = [
                'bwa',
                'mem',
                '-t',
                str(args.cpus),
                os.path.basename(sourTax),  # assembly index base
                forReads
            ]
            if revReads:
                bwa_cmd.append(revReads)

            # run BWA and pipe to samtools sort (single- or paired-end)
            status('Aligning reads to assembly with BWA')
            printCMD(bwa_cmd)
            p1 = subprocess.Popen(bwa_cmd,
                                  cwd=args.workdir,
                                  stdout=subprocess.PIPE,
                                  stderr=DEVNULL)
            p2 = subprocess.Popen([
                'samtools', 'sort', '--threads',
                str(bamthreads), '-o', blobBAM, '-'
            ],
                                  cwd=args.workdir,
                                  stdout=subprocess.PIPE,
                                  stderr=DEVNULL,
                                  stdin=p1.stdout)
            p1.stdout.close()
            p2.communicate()
            subprocess.run(['samtools', 'index', blobBAM],
                           cwd=args.workdir)

        #now calculate coverage from BAM file
        status('Calculating read coverage per contig')
        FastaBed = os.path.join(args.workdir, 'assembly.bed')
        lengths = []
        with open(FastaBed, 'w') as bedout:
            with open(sourTax, 'r') as SeqIn:
                for record in SeqIO.parse(SeqIn, 'fasta'):
                    bedout.write('{:}\t{:}\t{:}\n'.format(
                        record.id, 0, len(record.seq)))
                    lengths.append(len(record.seq))

        N50 = calcN50(lengths)
        Coverage = {}
        coverageBed = os.path.join(args.workdir, 'coverage.bed')
        cov_cmd = ['samtools', 'bedcov', os.path.basename(FastaBed), blobBAM]
        printCMD(cov_cmd)
        with open(coverageBed, 'w') as bed_out:
            for line in execute(cov_cmd, args.workdir):
                bed_out.write(line)
                if not line or line.startswith('\n') or line.count('\t') < 3:
                    continue
                line = line.strip()
                cols = line.split('\t')
                # samtools bedcov output: contig, start, end, summed per-base depth
                cov = int(cols[3]) / float(cols[2])
                Coverage[cols[0]] = (int(cols[2]), cov)

        #get average coverage of N50 contigs
        n50Cov = []
        for k, v in Coverage.items():
            if args.debug:
                print('{:}; Len: {:}; Cov: {:.2f}'.format(k, v[0], v[1]))
            if v[0] >= N50:
                n50Cov.append(v[1])
        n50AvgCov = sum(n50Cov) / len(n50Cov)
        minpct = args.mincovpct / 100
        # should we make this a variable? 5% was something arbitrary
        min_coverage = float(n50AvgCov * minpct)
        status('Average coverage for N50 contigs is {:}X'.format(
            int(n50AvgCov)))

        #Start list of contigs to drop
        for k, v in Coverage.items():
            if v[1] <= min_coverage:
                Contigs2Drop.append(k)
        status(
            'Found {:,} contigs with coverage less than {:.2f}X ({:}%)'.format(
                len(Contigs2Drop), min_coverage, args.mincovpct))

    if args.debug:
        print('Contigs dropped due to coverage: {:}'.format(
            ','.join(Contigs2Drop)))
        print('Contigs dropped due to taxonomy: {:}'.format(
            ','.join(Tax2Drop)))

    DropFinal = Contigs2Drop + Tax2Drop
    DropFinal = set(DropFinal)
    status('Dropping {:,} total contigs based on taxonomy and coverage'.format(
        len(DropFinal)))
    with open(args.outfile, 'w') as outfile, open(sourTax, 'r') as seqin:
        for record in SeqIO.parse(seqin, 'fasta'):
            if record.id not in DropFinal:
                SeqIO.write(record, outfile, 'fasta')

    numSeqs, assemblySize = fastastats(args.outfile)
    status('Sourpurged assembly is {:,} contigs and {:,} bp'.format(
        numSeqs, assemblySize))
    if '_' in args.outfile:
        nextOut = args.outfile.split('_')[0] + '.rmdup.fasta'
    elif '.' in args.outfile:
        nextOut = args.outfile.split('.')[0] + '.rmdup.fasta'
    else:
        nextOut = args.outfile + '.rmdup.fasta'

    if checkfile(sourmashTSV):
        baseinput = os.path.basename(args.input)
        if '.' in baseinput:
            baseinput = baseinput.rsplit('.', 1)[0]

        shutil.copy(sourmashTSV, baseinput + '.sourmash-taxonomy.csv')

    if not args.debug:
        SafeRemove(args.workdir)

    if not args.pipe:
        status('Your next command might be:\n\tAAFTF rmdup -i {:} -o {:}\n'.
               format(args.outfile, nextOut))
Example #7
def run(parser, args):
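    """Assemble the mitochondrial genome with NOVOPlasty and orient the result."""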
    # first check that NOVOPlasty and minimap2 are installed, else exit
    programs = ['NOVOPlasty.pl', 'minimap2']
    for x in programs:
        if not which_path(x):
            status('ERROR: {} is not installed, exiting'.format(x))
            sys.exit(1)
    # first we need to generate working directory
    unique_id = str(uuid.uuid4())[:8]
    if not args.workdir:
        args.workdir = 'mito_' + unique_id
    if not os.path.isdir(args.workdir):
        os.makedirs(args.workdir)

    # now estimate read lengths of FASTQ
    read_len = GuessRL(args.left)

    # check for seed sequence, otherwise write one
    if not args.seed:
        if not args.reference:
            seedFasta = os.path.abspath(
                os.path.join(os.path.dirname(__file__), 'mito-seed.fasta'))
        else:
            seedFasta = os.path.abspath(args.reference)
    else:
        seedFasta = os.path.abspath(args.seed)

    # now write the novoplasty config file
    defaultConfig = os.path.join(os.path.dirname(__file__),
                                 'novoplasty-config.txt')
    novoConfig = os.path.join(args.workdir, 'novo-config.txt')
    if args.reference:
        refgenome = os.path.abspath(args.reference)
    else:
        refgenome = ''
    checkWords = ("<PROJECT>", "<MINLEN>", "<MAXLEN>", "<MAXMEM>", "<SEED>",
                  "<READLEN>", "<FORWARD>", "<REVERSE>", "<REFERENCE>")
    repWords = (unique_id, str(args.minlen), str(args.maxlen),
                str(int(getRAM() * .75)), seedFasta, str(read_len),
                os.path.abspath(args.left), os.path.abspath(args.right),
                refgenome)
    with open(novoConfig, 'w') as outfile:
        with open(defaultConfig, 'r') as infile:
            for line in infile:
                for check, rep in zip(checkWords, repWords):
                    line = line.replace(check, rep)
                outfile.write(line)

    # now we can finally run NOVOplasty.pl
    status('De novo assembling mitochondrial genome using NOVOplasty')
    cmd = ['NOVOPlasty.pl', '-c', 'novo-config.txt']
    printCMD(cmd)
    novolog = os.path.join(args.workdir, 'novoplasty.log')
    with open(novolog, 'w') as logfile:
        p1 = subprocess.Popen(cmd,
                              cwd=args.workdir,
                              stdout=logfile,
                              stderr=logfile)
        p1.communicate()

    # now parse the results
    draftMito = None
    circular = False
    for f in os.listdir(args.workdir):
        if f.startswith('Circularized_assembly_'):
            draftMito = os.path.join(args.workdir, f)
            circular = True
            break
        if f.startswith('Contigs_1_'):
            draftMito = os.path.join(args.workdir, f)
            break
        if f.startswith('Uncircularized_assemblies_'):
            draftMito = os.path.join(args.workdir, f)
            break
    if not draftMito:
        status('ERROR: no NOVOPlasty assembly output found in {}'.format(
            args.workdir))
        sys.exit(1)
    if circular:
        status('NOVOplasty assembled complete circular genome')
        if args.starting:
            status('Rotating assembly to start with {}'.format(args.starting))
        else:
            status('Rotating assembly to start with Cytochrome b (cob) gene')
        orient_to_start(draftMito,
                        args.out,
                        folder=args.workdir,
                        start=args.starting)
    else:
        numContigs = 0
        contigLength = 0
        with open(args.out, 'w') as outfile:
            with open(draftMito, 'r') as infile:
                for title, seq in SimpleFastaParser(infile):
                    numContigs += 1
                    contigLength += len(seq)
                    outfile.write('>contig_{}\n{}\n'.format(
                        numContigs, softwrap(seq)))
        status(
            'NOVOplasty assembled {} contigs consisting of {:,} bp, but was unable to circularize genome'
            .format(numContigs, contigLength))

    status('AAFTF mito complete: {}'.format(args.out))
    if not args.pipe:
        shutil.rmtree(args.workdir)
Example #8
def run(parser,args):
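    """Polish an assembly with iterative rounds of BWA mapping and Pilon."""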

    #find reads for pilon
    forReads, revReads = (None,)*2
    if args.left:
        forReads = os.path.abspath(args.left)
    if args.right:
        revReads = os.path.abspath(args.right)

    if not forReads:
        status('Unable to locate FASTQ raw reads, pass via -l,--left and/or -r,--right')
        sys.exit(1)

    custom_workdir = 1
    if not args.workdir:
        custom_workdir = 0
        args.workdir = 'aaftf-pilon_'+str(uuid.uuid4())[:8]
    if not os.path.exists(args.workdir):
        os.mkdir(args.workdir)

    bamthreads = 4
    if args.cpus < 4:
        bamthreads = args.cpus

    DEVNULL = open(os.devnull, 'w')
    for i in range(1, args.iterations+1):
        status('Starting Pilon polishing iteration {:}'.format(i))
        correctedFasta = 'pilon'+str(i)+'.fasta'
        if i == 1: #first loop
            initialFasta = args.infile
            shutil.copyfile(args.infile,
                            os.path.join(args.workdir,
                                         os.path.basename(args.infile)))
        else:
            initialFasta = os.path.join(args.workdir, 'pilon'+str(i-1)+'.fasta')

        pilonBAM = os.path.basename(initialFasta)+'.bwa.bam'
        if not os.path.isfile(os.path.join(args.workdir, pilonBAM)):
            bwa_index = ['bwa', 'index', os.path.basename(initialFasta)]
            printCMD(bwa_index)
            subprocess.run(bwa_index, cwd=args.workdir, stderr=DEVNULL)
            bwa_cmd = ['bwa', 'mem', '-t', str(args.cpus), os.path.basename(initialFasta), forReads]
            if revReads:
                bwa_cmd.append(revReads)

            #run BWA and pipe to samtools sort
            printCMD(bwa_cmd)
            p1 = subprocess.Popen(bwa_cmd, cwd=args.workdir,
                                  stdout=subprocess.PIPE, stderr=DEVNULL)
            p2 = subprocess.Popen(['samtools', 'sort',
                                   '-@', str(bamthreads),'-o', pilonBAM, '-'],
                                  cwd=args.workdir, stdout=subprocess.PIPE,
                                  stderr=DEVNULL, stdin=p1.stdout)
            p1.stdout.close()
            p2.communicate()

            #BAM file needs to be indexed for Pilon
            subprocess.run(['samtools', 'index', pilonBAM], cwd=args.workdir)

        #run Pilon
        pilon_cmd = ['pilon', '--genome', os.path.basename(initialFasta),
                     '--frags', pilonBAM,
                     '-Xmx{}g'.format(args.memory),
                     '--output', correctedFasta.split('.fasta')[0],
                     '--threads', str(args.cpus),
                     '--changes']
        pilon_log = 'pilon'+str(i)+'.log'
        printCMD(pilon_cmd)
        with open(os.path.join(args.workdir, pilon_log), 'w') as logfile:
            subprocess.run(pilon_cmd, cwd=args.workdir, stderr=logfile,
                           stdout=logfile)
        num_changes = line_count(os.path.join(args.workdir, 'pilon'+str(i)+'.changes'))

        status('Found {:,} changes in Pilon iteration {:}'.format(num_changes, i))

        # clean up as we iterate to prevent the tmp directory from blowing up;
        # BWA index files live in the workdir under the basename of the input FASTA
        base = os.path.join(args.workdir, os.path.basename(initialFasta))
        dirty = [base + ext for ext in ('.sa', '.amb', '.ann', '.pac', '.bwt')]
        dirty += [os.path.join(args.workdir, pilonBAM),
                  os.path.join(args.workdir, pilonBAM + '.bai')]
        for f in dirty:
            if os.path.isfile(f):
                os.remove(f)

    #copy last iteration to output
    if args.outfile:
        polishedFasta = args.outfile
    else:
        polishedFasta = os.path.basename(args.infile).split('.f')[0]+'.pilon.fasta'
    shutil.copyfile(os.path.join(args.workdir, 'pilon'+str(args.iterations)+'.fasta'), polishedFasta)

    status('AAFTF pilon completed {:} iterations.'.format(args.iterations))
    status('Pilon polished assembly: {:}'.format(polishedFasta))
    if '_' in polishedFasta:
        nextOut = polishedFasta.split('_')[0]+'.final.fasta'
    elif '.' in polishedFasta:
        nextOut = polishedFasta.split('.')[0]+'.final.fasta'
    else:
        nextOut = polishedFasta+'.final.fasta'

    if not args.debug and not custom_workdir:
        SafeRemove(args.workdir)

    if not args.pipe:
        status('Your next command might be:\n\tAAFTF sort -i {:} -o {:}\n'.format(polishedFasta, nextOut))
Example #9
def parse_clean_blastn(fastafile, prefix, blastn, stringent):
    '''
    Blast header rows:
    qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue score qlen
    '''

    cleaned = prefix + ".clean.fsa"
    logname = prefix + ".parse.log"

    excludes = {}
    VecHits = {}
    found_vector_seq = 0
    with open(blastn, "r") as vectab:
        rdr = csv.reader(vectab, delimiter="\t")
        for row in rdr:
            qaccver, saccver, pid, length, mismatch, gapopen, qstart, qend, sstart, send, evalue, bitscore, score, qlen = row
            if qaccver in contigs_to_remove:
                continue
            # vecscreen https://www.ncbi.nlm.nih.gov/tools/vecscreen/about/#Moderate
            # says to use score here (interpreted as raw score, not bitscore)
            # need to determine if the match is terminal or internal
            loc = [int(qstart), int(qend)]
            if loc[0] > loc[1]:
                loc = [loc[1], loc[0]]
            #check for location
            terminal = False
            position = None
            if loc[0] <= 25:
                terminal = True
                position = '5'
            if (int(qlen) - loc[1]) <= 25:
                terminal = True
                position = '3'
            Match = 0  # weak=0, moderate=1, strong=2
            score = int(score)
            if terminal:
                if score >= 19:
                    Match = 1
                if score >= 24:
                    Match = 2
            else:
                if score >= 25:
                    Match = 1
                if score >= 30:
                    Match = 2
            if Match == 0:
                continue
            # 'high' stringency keeps moderate and strong matches; otherwise strong only
            min_match = 1 if stringent == 'high' else 2
            if Match >= min_match:
                found_vector_seq += 1
                if qaccver not in VecHits:
                    VecHits[qaccver] = []
                VecHits[qaccver].append(
                    (saccver, int(qlen), loc, int(score), terminal, position))

    trimTerminal = 0
    splitContig = 0
    with open(cleaned, "w") as output_handle, open(logging, "w") as log:
        for record in SeqIO.parse(fastafile, "fasta"):
            FiveEnd = 0
            ThreeEnd = len(record.seq)
            internals = []
            slicer = []
            sInt = []
            Seq = str(record.seq)
            if record.id not in VecHits:
                if len(record.seq) >= 200:
                    output_handle.write('>{:}\n{:}\n'.format(
                        record.id, softwrap(Seq)))
            else:
                # VecHits holds a list of tuples per contig. Terminal hits just
                # truncate the nearest end; internal hits split the contig.
                # Multiple intervals within 50 bp of each other get cleaned up by
                # the repeated rounds, so here we only trim terminal matches and
                # split on internal ones.
                for hit in VecHits[record.id]:
                    ID, length, loc, score, terminal, pos = hit
                    if terminal and pos == '5':
                        if loc[1] > FiveEnd:
                            FiveEnd = loc[1]
                    elif terminal and pos == '3':
                        if loc[0] < ThreeEnd:
                            ThreeEnd = loc[0]
                    else:  #internal hits to add to list
                        if not loc in internals:
                            internals.append(loc)
                #now sort intervals
                sInt = sorted(internals, key=lambda x: int(x[0]))
                #now construct slicing list
                if len(sInt) < 1:
                    slicer = [FiveEnd, ThreeEnd]
                else:
                    slicer = [FiveEnd]
                    for x in sInt:
                        slicer = slicer + x
                    slicer.append(ThreeEnd)
                paired_slicer = list(group(slicer, 2))
                if len(paired_slicer) < 2:
                    status('Terminal trimming {:} to {:}'.format(
                        record.id, paired_slicer))
                    newSeq = Seq[paired_slicer[0][0]:paired_slicer[0][1]]
                    if len(newSeq) >= 200:
                        output_handle.write('>{:}\n{:}\n'.format(
                            record.id, softwrap(newSeq)))
                else:
                    status('Splitting contig {:} into {:}'.format(
                        record.id, paired_slicer))
                    for num, y in enumerate(paired_slicer):
                        newSeq = Seq[y[0]:y[1]]
                        if len(newSeq) >= 200:
                            output_handle.write('>split{:}_{:}\n{:}\n'.format(
                                num + 1, record.id, softwrap(newSeq)))

    return (found_vector_seq, cleaned)
Example #10
def run(parser, args):
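    """Screen contigs for vector and contaminant sequence, NCBI VecScreen style."""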
    if not args.workdir:
        args.workdir = 'aaftf-vecscreen_' + str(uuid.uuid4())[:8]
    if not os.path.exists(args.workdir):
        os.mkdir(args.workdir)

    # parse database locations: the --AAFTF_DB flag wins, then the AAFTF_DB env variable
    DB = None
    if args.AAFTF_DB:
        DB = args.AAFTF_DB
    else:
        DB = os.environ.get("AAFTF_DB")

    if args.percent_id:
        percentid_cutoff = args.percent_id

    infile = args.infile
    outfile = os.path.basename(args.outfile)
    outdir = os.path.dirname(args.outfile)
    if '.f' in outfile:
        prefix = outfile.rsplit('.f', 1)[0]
        print("prefix is ", prefix)
    else:
        prefix = str(os.getpid())
    if not outfile:
        outfile = "%s.vecscreen.fasta" % prefix

    outfile_vec = os.path.join(args.workdir,
                               "%s.tmp_vecscreen.fasta" % (prefix))

    # Common Euk/Prot contaminants for blastable DB later on
    status('Building BLAST databases for contamination screen.')
    makeblastdblist = []
    for d in DB_Links:
        if d == 'sourmash':
            continue
        url = DB_Links[d]
        dbname = os.path.basename(str(url))
        #logger.debug("testing for url=%s dbname=%s"%(url,dbname))
        if DB:
            file = os.path.join(DB, dbname)
        else:
            file = os.path.join(args.workdir, dbname)
        if file.endswith(".gz"):
            nogz = os.path.splitext(file)[0]
            if not os.path.exists(nogz):
                if not os.path.exists(file):
                    urllib.request.urlretrieve(url, file)

                with gzip.open(file, 'rb') as ingz, open(nogz, 'wb') as outfa:
                    shutil.copyfileobj(ingz, outfa)
#                call(['gunzip', '-k', file])
                make_blastdb('nucl', nogz, os.path.join(args.workdir, d))
            else:
                make_blastdb('nucl', nogz, os.path.join(args.workdir, d))
        else:
            if not os.path.exists(file):
                urllib.request.urlretrieve(url, file)
            make_blastdb('nucl', file, os.path.join(args.workdir, d))

    global contigs_to_remove
    contigs_to_remove = {}
    regions_to_trim = {}

    #qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore
    for contam in ["CONTAM_EUKS", "CONTAM_PROKS"]:
        status("%s Contamination Screen" % (contam))
        blastreport = os.path.join(args.workdir,
                                   "%s.%s.blastn" % (contam, prefix))
        blastnargs = [
            'blastn', '-query', infile, '-db',
            os.path.join(args.workdir, contam), '-num_threads',
            str(args.cpus), '-dust', 'yes', '-soft_masking', 'true',
            '-perc_identity', BlastPercent_ID_ContamMatch, '-lcase_masking',
            '-outfmt', '6', '-out', blastreport
        ]
        printCMD(blastnargs)
        call(blastnargs)
        hits = 0
        with open(blastreport) as report:
            colparser = csv.reader(report, delimiter="\t")
            for row in colparser:
                if ((float(row[2]) >= 98.0 and int(row[3]) >= 50)
                        or (float(row[2]) >= 94.0 and int(row[3]) >= 100)
                        or (float(row[2]) >= 90.0 and int(row[3]) >= 200)):
                    # qstart/qend can be reported in either orientation
                    start, end = sorted((int(row[6]), int(row[7])))
                    if row[0] not in regions_to_trim:
                        regions_to_trim[row[0]] = [(start, end, contam, row[1],
                                                    float(row[2]))]
                    else:
                        regions_to_trim[row[0]].append(
                            (start, end, contam, row[1], float(row[2])))
        status('{:} screening finished'.format(contam))

    eukCleaned = os.path.join(args.workdir,
                              "%s.euk-prot_cleaned.fasta" % (prefix))
    if len(regions_to_trim) > 0:
        with open(eukCleaned, 'w') as cleanout:
            with open(infile, 'r') as fastain:
                for record in SeqIO.parse(fastain, 'fasta'):
                    if record.id not in regions_to_trim:
                        cleanout.write('>{:}\n{:}\n'.format(
                            record.id, softwrap(str(record.seq))))
                    else:
                        Seq = str(record.seq)
                        regions = regions_to_trim[record.id]
                        status(
                            'Splitting {:} due to contamination: {:}'.format(
                                record.id, regions))
                        lastpos = 0
                        newSeq = ''
                        for i, x in enumerate(regions):
                            newSeq = Seq[lastpos:x[0]]
                            lastpos = x[1]
                            cleanout.write('>split{:}_{:}\n{:}\n'.format(
                                i, record.id, softwrap(newSeq)))
                            if i == len(regions) - 1:
                                newSeq = Seq[x[1]:]
                                cleanout.write('>split{:}_{:}\n{:}\n'.format(
                                    i + 1, record.id, softwrap(newSeq)))
    else:
        eukCleaned = infile

    # MITO screen
    status('Mitochondria Contamination Screen')
    mitoHits = []
    blastreport = os.path.join(args.workdir, "%s.%s.blastn" % ('MITO', prefix))
    blastnargs = [
        'blastn', '-query', eukCleaned, '-db',
        os.path.join(args.workdir, 'MITO'), '-num_threads',
        str(args.cpus), '-dust', 'yes', '-soft_masking', 'true',
        '-perc_identity', BlastPercent_ID_MitoMatch, '-lcase_masking',
        '-outfmt', '6', '-out', blastreport
    ]
    printCMD(blastnargs)
    call(blastnargs)
    with open(blastreport) as report:
        colparser = csv.reader(report, delimiter="\t")
        for row in colparser:
            if int(row[3]) >= 120:
                contigs_to_remove[row[0]] = ('MitoScreen', row[1],
                                             float(row[2]))
                mitoHits.append(row[0])
    status('Mito screening finished.')

    #vecscreen starts here
    status(
        'Starting VecScreen, will remove terminal matches and split internal matches'
    )
    rnd = 0
    count = 1
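    # repeat VecScreen rounds until a pass finds no new vector hits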
    while (count > 0):
        filepref = "%s.r%d" % (prefix, rnd)
        report = os.path.join(args.workdir, "%s.vecscreen.tab" % (filepref))
        if not os.path.exists(report):
            cmd = [
                'blastn', '-task', 'blastn', '-reward', '1', '-penalty', '-5',
                '-gapopen', '3', '-gapextend', '3', '-dust', 'yes',
                '-soft_masking', 'true', '-evalue', '700', '-searchsp',
                '1750000000000', '-db',
                os.path.join(args.workdir, 'UniVec'), '-outfmt',
                '6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore score qlen',
                '-num_threads',
                str(args.cpus), '-query', eukCleaned, '-out', report
            ]
            #logger.info('CMD: {:}'.format(printCMD(cmd,7)))
            call(cmd)
        # this needs to know/return the new fasta file?
        status("Parsing VecScreen round {:}: {:} for {:}".format(
            rnd + 1, filepref, report))
        (count,
         cleanfile) = parse_clean_blastn(eukCleaned,
                                         os.path.join(args.workdir, filepref),
                                         report, args.stringency)
        status("count is %d cleanfile is %s" % (count, cleanfile))
        if count == 0:  # no remaining vector matches above the score cutoff
            status("copying %s to %s" % (eukCleaned, outfile_vec))
            shutil.copy(eukCleaned, outfile_vec)
        else:
            rnd += 1
            eukCleaned = cleanfile

    status("{:,} contigs will be removed:".format(len(contigs_to_remove)))
    for k, v in sorted(contigs_to_remove.items()):
        print('\t{:} --> dbhit={:}; hit={:}; pident={:}'.format(
            k, v[0], v[1], v[2]))

    # this could instead use the outfile, strip .fasta/.fsa/.fna, and append
    # .mitochondria, but that assumes a bit about the naming structure

    mitochondria = os.path.join(outdir, prefix + '.mitochondria.fasta')
    with open(args.outfile, "w") as output_handle, open(mitochondria,
                                                        'w') as mito_handle:
        for record in SeqIO.parse(outfile_vec, "fasta"):
            if record.id not in contigs_to_remove:
                SeqIO.write(record, output_handle, "fasta")
            elif record.id in mitoHits:
                SeqIO.write(record, mito_handle, "fasta")
    status('Writing {:,} cleaned contigs to: {:}'.format(
        countfasta(args.outfile), args.outfile))
    status('Writing {:,} mitochondrial contigs to: {:}'.format(
        countfasta(mitochondria), mitochondria))
    if '_' in args.outfile:
        nextOut = args.outfile.split('_')[0] + '.sourpurge.fasta'
    elif '.' in args.outfile:
        nextOut = args.outfile.split('.')[0] + '.sourpurge.fasta'
    else:
        nextOut = args.outfile + '.sourpurge.fasta'

    if not args.pipe:
        status(
            'Your next command might be:\n\tAAFTF sourpurge -i {:} -o {:} -c {:} --phylum Ascomycota\n'
            .format(args.outfile, nextOut, args.cpus))

    if not args.debug:
        SafeRemove(args.workdir)
Example #11
def run(parser, args):
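    """Filter reads against PhiX, UniVec, and any user-supplied contaminant sequences."""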
    custom_workdir = 1
    if not args.workdir:
        custom_workdir = 0
        args.workdir = 'aaftf-filter_' + str(uuid.uuid4())[:8]
    if not os.path.exists(args.workdir):
        os.mkdir(args.workdir)

    # parse database locations: the --AAFTF_DB flag wins, then the AAFTF_DB env variable
    DB = None
    if args.AAFTF_DB:
        DB = args.AAFTF_DB
    else:
        DB = os.environ.get("AAFTF_DB")

    bamthreads = 4
    if args.cpus < 4:
        bamthreads = args.cpus

    newest_file_age = -1  # tracks the newest ctime among the contaminant files
    contam_filenames = []
    # db of contaminant (PhiX)
    for url in Contaminant_Accessions.values():
        acc = os.path.basename(url)
        if DB:
            acc_file = os.path.join(DB, acc)
        else:
            acc_file = os.path.join(args.workdir, acc)
        contam_filenames.append(acc_file)
        if not os.path.exists(acc_file):
            urllib.request.urlretrieve(url, acc_file)
        if (newest_file_age < 0
                or newest_file_age < os.path.getctime(acc_file)):
            newest_file_age = os.path.getctime(acc_file)

    # download univec too
    url = DB_Links['UniVec']
    acc = os.path.basename(DB_Links['UniVec'])
    if DB:
        acc_file = os.path.join(DB, acc)
    else:
        acc_file = os.path.join(args.workdir, acc)
    contam_filenames.append(acc_file)
    if not os.path.exists(acc_file):
        urllib.request.urlretrieve(url, acc_file)
    if (newest_file_age < 0
            or newest_file_age < os.path.getctime(acc_file)):
        newest_file_age = os.path.getctime(acc_file)

    if args.screen_accessions:
        for acc in args.screen_accessions:
            if DB:
                acc_file = os.path.join(DB, acc + ".fna")
                if not os.path.exists(acc_file):
                    acc_file = os.path.join(args.workdir, acc + ".fna")
            else:
                acc_file = os.path.join(args.workdir, acc + ".fna")
            contam_filenames.append(acc_file)
            if not os.path.exists(acc_file):
                url = SeqDBs['nucleotide'] % (acc)
                urllib.request.urlretrieve(url, acc_file)
            if (newest_file_age < 0
                    or newest_file_age < os.path.getctime(acc_file)):
                newest_file_age = os.path.getctime(acc_file)

    if args.screen_urls:
        for url in args.screen_urls:
            url_file = os.path.join(args.workdir, os.path.basename(url))
            contam_filenames.append(url_file)
            if not os.path.exists(url_file):
                urllib.request.urlretrieve(url, url_file)
            if (newest_file_age < 0
                    or newest_file_age < os.path.getctime(url_file)):
                newest_file_age = os.path.getctime(url_file)

    if args.screen_local:
        for f in args.screen_local:
            contam_filenames.append(os.path.abspath(f))

    # concat vector db
    status('Generating combined contamination database:\n{:}'.format(
        '\n'.join(contam_filenames)))
    contamdb = os.path.join(args.workdir, 'contamdb.fa')
    if (not os.path.exists(contamdb)
            or (os.path.getctime(contamdb) < newest_file_age)):
        with open(contamdb, 'wb') as wfd:
            for fname in contam_filenames:
                with open(fname,
                          'rb') as fd:  # reasonably fast copy for append
                    shutil.copyfileobj(fd, wfd)

    #find reads
    forReads, revReads = (None, ) * 2
    if args.left:
        forReads = os.path.abspath(args.left)
    if args.right:
        revReads = os.path.abspath(args.right)
    if not forReads:
        status("Must provide --left, unable to locate FASTQ reads")
        sys.exit(1)
    total = countfastq(forReads)
    if revReads:
        total = total * 2
    status('Loading {:,} total reads'.format(total))

    # seems like this needs to be stripping trailing extension?
    if not args.basename:
        if '_' in os.path.basename(forReads):
            args.basename = os.path.basename(forReads).split('_')[0]
        elif '.' in os.path.basename(forReads):
            args.basename = os.path.basename(forReads).split('.')[0]
        else:
            args.basename = os.path.basename(forReads)

    #logger.info('Loading {:,} FASTQ reads'.format(countfastq(forReads)))
    DEVNULL = open(os.devnull, 'w')
    alignBAM = os.path.join(args.workdir, args.basename + '_contam_db.bam')
    clean_reads = args.basename + "_filtered"
    refmatch_bbduk = [contamdb, 'phix', 'artifacts', 'lambda']
    if args.aligner == "bbduk":
        status('Kmer filtering reads using BBDuk')
        if args.memory:
            MEM = '-Xmx{:}g'.format(args.memory)
        else:
            MEM = '-Xmx{:}g'.format(round(0.6 * getRAM()))
        cmd = [
            'bbduk.sh', MEM, 't={:}'.format(args.cpus), 'hdist=1', 'k=27',
            'overwrite=true',
            'in=%s' % (forReads),
            'out=%s_1.fastq.gz' % (clean_reads)
        ]
        if revReads:
            cmd.extend(
                ['in2=%s' % (revReads),
                 'out2=%s_2.fastq.gz' % (clean_reads)])

        cmd.extend(['ref=%s' % (",".join(refmatch_bbduk))])
        #cmd.extend(['prealloc','qhdist=1'])
        printCMD(cmd)
        if args.debug:
            subprocess.run(cmd)
        else:
            subprocess.run(cmd, stderr=DEVNULL)

        if not args.debug and not custom_workdir:
            SafeRemove(args.workdir)

        clean = countfastq('{:}_1.fastq.gz'.format(clean_reads))
        if revReads:
            clean = clean * 2
        status('{:,} reads mapped to contamination database'.format(
            (total - clean)))
        status('{:,} reads unmapped and writing to file'.format(clean))

        if revReads:
            status('Filtering complete:\n\tFor: {:}\n\tRev: {:}'.format(
                clean_reads + '_1.fastq.gz', clean_reads + '_2.fastq.gz'))
        else:
            status('Filtering complete:\n\tFor: {:}'.format(
                clean_reads + '_1.fastq.gz'))
        if not args.pipe:
            status(
                'Your next command might be:\n\tAAFTF assemble -l {:} -r {:} -c {:} -o {:}\n'
                .format(clean_reads + '_1.fastq.gz',
                        clean_reads + '_2.fastq.gz', args.cpus,
                        args.basename + '.spades.fasta'))
        return

    elif args.aligner == 'bowtie2':
        # likely not used and less accurate than bbmap?
        if not os.path.isfile(alignBAM):
            status('Aligning reads to contamination database using bowtie2')
            if (not os.path.exists(contamdb + ".1.bt2")
                    or os.path.getctime(contamdb + ".1.bt2") <
                    os.path.getctime(contamdb)):
                # (re)build index if no index or index is older than
                # the db
                bowtie_index = ['bowtie2-build', contamdb, contamdb]
                printCMD(bowtie_index)
                subprocess.run(bowtie_index, stderr=DEVNULL, stdout=DEVNULL)

            bowtie_cmd = [
                'bowtie2', '-x',
                os.path.basename(contamdb), '-p',
                str(args.cpus), '--very-sensitive'
            ]
            if forReads and revReads:
                bowtie_cmd = bowtie_cmd + ['-1', forReads, '-2', revReads]
            elif forReads:
                bowtie_cmd = bowtie_cmd + ['-U', forReads]

            #now run and write to BAM sorted
            printCMD(bowtie_cmd)
            p1 = subprocess.Popen(bowtie_cmd,
                                  cwd=args.workdir,
                                  stdout=subprocess.PIPE,
                                  stderr=DEVNULL)
            p2 = subprocess.Popen([
                'samtools', 'sort', '-@',
                str(bamthreads), '-o',
                os.path.basename(alignBAM), '-'
            ],
                                  cwd=args.workdir,
                                  stdout=subprocess.PIPE,
                                  stderr=DEVNULL,
                                  stdin=p1.stdout)
            p1.stdout.close()
            p2.communicate()

    elif args.aligner == 'bwa':
        # likely less accurate than bbduk so may not be used
        if not os.path.isfile(alignBAM):
            status('Aligning reads to contamination database using BWA')
            if (not os.path.exists(contamdb + ".amb")
                    or os.path.getctime(contamdb + ".amb") <
                    os.path.getctime(contamdb)):
                bwa_index = ['bwa', 'index', contamdb]
                printCMD(bwa_index)
                subprocess.run(bwa_index, stderr=DEVNULL, stdout=DEVNULL)

            bwa_cmd = [
                'bwa', 'mem', '-t',
                str(args.cpus),
                os.path.basename(contamdb), forReads
            ]
            if revReads:
                bwa_cmd.append(revReads)

            #now run and write to BAM sorted
            printCMD(bwa_cmd)
            p1 = subprocess.Popen(bwa_cmd,
                                  cwd=args.workdir,
                                  stdout=subprocess.PIPE,
                                  stderr=DEVNULL)
            p2 = subprocess.Popen([
                'samtools', 'sort', '-@',
                str(bamthreads), '-o',
                os.path.basename(alignBAM), '-'
            ],
                                  cwd=args.workdir,
                                  stdout=subprocess.PIPE,
                                  stderr=DEVNULL,
                                  stdin=p1.stdout)
            p1.stdout.close()
            p2.communicate()

    elif args.aligner == 'minimap2':
        # likely not used but may be useful for pacbio/nanopore?
        if not os.path.isfile(alignBAM):
            status('Aligning reads to contamination database using minimap2')

            minimap2_cmd = [
                'minimap2', '-ax', 'sr', '-t',
                str(args.cpus),
                os.path.basename(contamdb), forReads
            ]
            if revReads:
                minimap2_cmd.append(revReads)

            #now run and write to BAM sorted
            printCMD(minimap2_cmd)
            p1 = subprocess.Popen(minimap2_cmd,
                                  cwd=args.workdir,
                                  stdout=subprocess.PIPE,
                                  stderr=DEVNULL)
            p2 = subprocess.Popen([
                'samtools', 'sort', '-@',
                str(bamthreads), '-o',
                os.path.basename(alignBAM), '-'
            ],
                                  cwd=args.workdir,
                                  stdout=subprocess.PIPE,
                                  stderr=DEVNULL,
                                  stdin=p1.stdout)
            p1.stdout.close()
            p2.communicate()
    else:
        status("Must specify bbduk, bowtie2, bwa, or minimap2 for filtering")
        sys.exit(1)

    if os.path.isfile(alignBAM):
        #display mapping stats in terminal
        subprocess.run(['samtools', 'index', alignBAM])
        mapped, unmapped = bam_read_count(alignBAM)
        status('{:,} reads mapped to contamination database'.format(mapped))
        status('{:,} reads unmapped and writing to file'.format(unmapped))
        # now output unmapped reads from the BAM file
        # -f 12 keeps pairs where both the read and its mate are unmapped
        if forReads and revReads:
            samtools_cmd = [
                'samtools', 'fastq', '-f', '12', '-1',
                clean_reads + '_1.fastq.gz', '-2', clean_reads + '_2.fastq.gz',
                alignBAM
            ]
        elif forReads:
            samtools_cmd = [
                'samtools', 'fastq', '-f', '4', '-1',
                clean_reads + '.fastq.gz', alignBAM
            ]
        subprocess.run(samtools_cmd, stderr=DEVNULL)
        if not args.debug:
            SafeRemove(args.workdir)
        if revReads:
            status('Filtering complete:\n\tFor: {:}\n\tRev: {:}'.format(
                clean_reads + '_1.fastq.gz', clean_reads + '_2.fastq.gz'))
            if not args.pipe:
                status(
                    'Your next command might be:\n\tAAFTF assemble -l {:} -r {:} -c {:} -o {:}\n'
                    .format(clean_reads + '_1.fastq.gz',
                            clean_reads + '_2.fastq.gz', args.cpus,
                            args.basename + '.spades.fasta'))
        else:
            status('Filtering complete:\n\tSingle: {:}'.format(clean_reads +
                                                               '.fastq.gz'))
            if not args.pipe:
                status(
                    'Your next command might be:\n\tAAFTF assemble -l {:} -c {:} -o {:}\n'
                    .format(clean_reads + '.fastq.gz', args.cpus,
                            args.basename + '.spades.fasta'))
Example #12
def run(parser, args):
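    """Adapter-trim reads with BBDuk or Trimmomatic."""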

    if not args.basename:
        if '_' in os.path.basename(args.left):
            args.basename = os.path.basename(args.left).split('_')[0]
        elif '.' in os.path.basename(args.left):
            args.basename = os.path.basename(args.left).split('.')[0]
        else:
            args.basename = os.path.basename(args.left)

    total = countfastq(args.left)
    if args.right:
        total = total * 2
    status('Loading {:,} total reads'.format(total))

    DEVNULL = open(os.devnull, 'w')
    if args.method == 'bbduk':
        if args.memory:
            MEM = '-Xmx{:}g'.format(args.memory)
        else:
            MEM = '-Xmx{:}g'.format(round(0.6 * getRAM()))

        status('Adapter trimming using BBDuk')
        cmd = [
            'bbduk.sh', MEM, 'ref=adapters', 't={:}'.format(args.cpus),
            'ktrim=r', 'k=23', 'mink=11', 'minlen={:}'.format(args.minlen),
            'hdist=1', 'ftm=5', 'tpe', 'tbo', 'overwrite=true'
        ]
        if args.left and args.right:
            cmd += [
                'in1={:}'.format(args.left), 'in2={:}'.format(args.right),
                'out1={:}_1P.fastq.gz'.format(args.basename),
                'out2={:}_2P.fastq.gz'.format(args.basename)
            ]
        elif args.left:
            cmd += [
                'in={:}'.format(args.left),
                'out={:}_1U.fastq.gz'.format(args.basename)
            ]

        printCMD(cmd)
        if args.debug:
            subprocess.run(cmd)
        else:
            subprocess.run(cmd, stderr=DEVNULL)

        if args.right:
            clean = countfastq('{:}_1P.fastq.gz'.format(args.basename))
            clean = clean * 2
            status('{:,} reads remaining and writing to file'.format(clean))
            status('Trimming finished:\n\tFor: {:}\n\tRev {:}'.format(
                args.basename + '_1P.fastq.gz',
                args.basename + '_2P.fastq.gz'))
            if not args.pipe:
                status(
                    'Your next command might be:\n\tAAFTF filter -l {:} -r {:} -o {:} -c {:}\n'
                    .format(args.basename + '_1P.fastq.gz',
                            args.basename + '_2P.fastq.gz', args.basename,
                            args.cpus))
        else:
            clean = countfastq('{:}_1U.fastq.gz'.format(args.basename))
            status('{:,} reads remaining and writing to file'.format(clean))
            status('Trimming finished:\n\tSingle: {:}'.format(args.basename +
                                                              '_1U.fastq.gz'))
            if not args.pipe:
                status(
                    'Your next command might be:\n\tAAFTF filter -l {:} -o {:} -c {:}\n'
                    .format(args.basename + '_1U.fastq.gz', args.basename,
                            args.cpus))

    elif args.method == 'trimmomatic':
        # locate the Trimmomatic JAR, preferring an explicitly supplied path
        if args.trimmomatic:
            jarfile = args.trimmomatic
        else:
            jarfile = find_trimmomatic()
        if not jarfile:
            status(
                'Trimmomatic cannot be found - please provide location of trimmomatic.jar file.'
            )
            sys.exit(1)

        if jarfile:
            path_to_adaptors = args.trimmomatic_adaptors
            leadingwindow = "LEADING:%d" % (args.trimmomatic_leadingwindow)
            trailingwindow = "TRAILING:%d" % (args.trimmomatic_trailingwindow)
            slidingwindow = "SLIDINGWINDOW:%s" % (
                args.trimmomatic_slidingwindow)

            quality = args.trimmomatic_quality
            quality = "-%s" % (quality)  # add leading dash

            if not os.path.exists(path_to_adaptors):
                if args.right:
                    path_to_adaptors = dirname(
                        jarfile) + "/adapters/TruSeq3-PE.fa"
                else:
                    path_to_adaptors = dirname(
                        jarfile) + "/adapters/TruSeq3-SE.fa"

                if not os.path.exists(path_to_adaptors):
                    findpath = dirname(jarfile)
                    path_to_adaptors = ""
                    while findpath:
                        if os.path.exists(findpath + "/share"):
                            if args.right:
                                path_to_adaptors = findpath + "/share/trimmomatic/adapters/TruSeq3-PE.fa"
                            else:
                                path_to_adaptors = findpath + "/share/trimmomatic/adapters/TruSeq3-SE.fa"
                            break
                        parent = dirname(findpath)
                        if parent == findpath:  # reached the filesystem root
                            break
                        findpath = parent

                if not os.path.exists(path_to_adaptors):
                    status(
                        "Cannot find adaptors file, please specify manually")
                    return

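            # substitute the adaptor FASTA path into the ILLUMINACLIP template
            # (default 'ILLUMINACLIP:%s:2:30:10')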
            clipstr = args.trimmomatic_clip % (path_to_adaptors)

            cmd = []

            if args.left and args.right:
                cmd = [
                    'java', '-jar', jarfile, 'PE', '-threads',
                    str(args.cpus), quality, args.left, args.right,
                    args.basename + '_1P.fastq', args.basename + '_1U.fastq',
                    args.basename + '_2P.fastq', args.basename + '_2U.fastq',
                    clipstr, leadingwindow, trailingwindow, slidingwindow,
                    "MINLEN:%d" % (args.minlen)
                ]
            elif args.left and not args.right:
                cmd = [
                    'java', '-jar', jarfile, 'SE', '-threads',
                    str(args.cpus), quality, args.left,
                    args.basename + '_1U.fastq', clipstr, leadingwindow,
                    trailingwindow, slidingwindow,
                    "MINLEN:%d" % (args.minlen)
                ]
            else:
                status("Must provide left and right pairs or single read set")
                return

            status('Running trimmomatic adapter and quality trimming')
            printCMD(cmd)
            if args.debug:
                subprocess.run(cmd)
            else:
                subprocess.run(cmd, stderr=DEVNULL)
            if args.right:
                status('Compressing trimmed PE FASTQ files')
                Fzip_inplace(args.basename + '_1P.fastq', args.cpus)
                Fzip_inplace(args.basename + '_2P.fastq', args.cpus)
                SafeRemove(args.basename + '_1U.fastq')
                SafeRemove(args.basename + '_2U.fastq')
                status('Trimming finished:\n\tFor: {:}\n\tRev: {:}'.format(
                    args.basename + '_1P.fastq.gz',
                    args.basename + '_2P.fastq.gz'))
                if not args.pipe:
                    status(
                        'Your next command might be:\n\tAAFTF filter -l {:} -r {:} -o {:} -c {:}\n'
                        .format(args.basename + '_1P.fastq.gz',
                                args.basename + '_2P.fastq.gz', args.basename,
                                args.cpus))
            else:
                status('Compressing trimmed SE FASTQ file')
                Fzip_inplace(args.basename + '_1U.fastq', args.cpus)
                status(
                    'Trimming finished:\n\tSingle: {:}'.format(args.basename +
                                                               '_1U.fastq.gz'))
                if not args.pipe:
                    status(
                        'Your next command might be:\n\tAAFTF filter -l {:} -o {:} -c {:}\n'
                        .format(args.basename + '_1U.fastq.gz', args.basename,
                                args.cpus))
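
This runner is normally dispatched from the CLI, but a direct invocation is straightforward. A hypothetical driver for the run() defined above; the Namespace fields mirror the bbduk branch, and the file names are placeholders:

from argparse import Namespace

trimargs = Namespace(left='sample_R1.fastq.gz', right='sample_R2.fastq.gz',
                     basename='sample', method='bbduk', cpus=4, memory=8,
                     minlen=75, debug=False, pipe=False)
run(None, trimargs)  # the parser argument is unused by this runner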
Example #13
def run(parser, args):
    #script to run entire AAFTF pipeline
    args_dict = vars(args)
    basename = args_dict['basename']
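    # vars(args) returns the Namespace's __dict__, so updating args_dict
    # below (e.g. the memory default) also updates args itself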
    RAM = round(0.75 * getRAM())
    if not args.memory:
        args_dict['memory'] = str(RAM)

    #run trimming with bbduk
    if not checkfile(basename + '_1P.fastq.gz'):
        trimOpts = [
            'memory', 'left', 'right', 'basename', 'cpus', 'debug', 'minlen'
        ]
        trimDict = {k: v for (k, v) in args_dict.items() if k in trimOpts}
        trimDict['method'] = 'bbduk'
        trimDict['pipe'] = True
        trimargs = Namespace(**trimDict)
        trim.run(parser, trimargs)
    else:
        if args.right:
            status('AAFTF trim output found: {:} {:}'.format(
                basename + '_1P.fastq.gz', basename + '_2P.fastq.gz'))
        else:
            status('AAFTF trim output found: {:}'.format(basename +
                                                         '_1P.fastq.gz'))
    if not checkfile(basename + '_1P.fastq.gz'):
        status('AAFTF trim failed')
        sys.exit(1)

    #run mitochondrial assembly on bbduk trimmed reads
    if args.right:
        if not checkfile(basename + '.mito.fasta'):
            mitoOpts = [
                'left', 'right', 'out', 'minlen', 'maxlen', 'seed', 'starting',
                'workdir', 'pipe', 'reference'
            ]
            mitoDict = {k: v for (k, v) in args_dict.items() if k in mitoOpts}
            mitoDict['left'] = basename + '_1P.fastq.gz'
            mitoDict['right'] = basename + '_2P.fastq.gz'
            mitoDict['out'] = basename + '.mito.fasta'
            mitoDict['minlen'] = 10000
            mitoDict['maxlen'] = 100000
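            # mirror the mito subcommand's default genome-size bounds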
            for x in mitoOpts:
                if not x in mitoDict:
                    mitoDict[x] = False
            mitoargs = Namespace(**mitoDict)
            mito.run(parser, mitoargs)
        else:
            status('AAFTF mito output found: {}'.format(basename +
                                                        '.mito.fasta'))
    else:
        status(
            'AAFTF mito requires PE reads, skipping mitochondrial de novo assembly'
        )

    #run filtering with bbduk
    if not checkfile(basename + '_filtered_1.fastq.gz'):
        filterOpts = [
            'screen_accessions', 'screen_urls', 'basename', 'cpus', 'debug',
            'memory', 'AAFTF_DB', 'workdir'
        ]
        filterDict = {k: v for (k, v) in args_dict.items() if k in filterOpts}
        filterDict['aligner'] = 'bbduk'
        filterDict['left'] = basename + '_1P.fastq.gz'
        if args.right:
            filterDict['right'] = basename + '_2P.fastq.gz'
        filterDict['pipe'] = True
        if checkfile(basename + '.mito.fasta'):
            filterDict['screen_local'] = [basename + '.mito.fasta']
        filterargs = Namespace(**filterDict)
        aaftf_filter.run(parser, filterargs)
    else:
        if args.right:
            status('AAFTF filter output found: {:} {:}'.format(
                basename + '_filtered_1.fastq.gz',
                basename + '_filtered_2.fastq.gz'))
        else:
            status('AAFTF filter output found: {:}'.format(
                basename + '_filtered_1.fastq.gz'))
    if not checkfile(basename + '_filtered_1.fastq.gz'):
        status('AAFTF filter failed')
        sys.exit(1)

    #run assembly with spades
    if not checkfile(basename + '.spades.fasta'):
        assembleOpts = [
            'memory', 'cpus', 'debug', 'workdir', 'method', 'assembler_args',
            'tmpdir'
        ]
        assembleDict = {
            k: v
            for (k, v) in args_dict.items() if k in assembleOpts
        }
        assembleDict['left'] = basename + '_filtered_1.fastq.gz'
        if args.right:
            assembleDict['right'] = basename + '_filtered_2.fastq.gz'
        assembleDict['out'] = basename + '.spades.fasta'
        assembleDict['spades_tmpdir'] = None
        assembleDict['pipe'] = True
        assembleargs = Namespace(**assembleDict)
        assemble.run(parser, assembleargs)
    else:
        status('AAFTF assemble output found: {:}'.format(basename +
                                                         '.spades.fasta'))
    if not checkfile(basename + '.spades.fasta'):
        status('AAFTF assemble failed')
        sys.exit(1)

    #run vecscreen
    if not checkfile(basename + '.vecscreen.fasta'):
        vecOpts = ['cpus', 'debug', 'workdir', 'AAFTF_DB']
        vecDict = {k: v for (k, v) in args_dict.items() if k in vecOpts}
        vecDict['percent_id'] = False
        vecDict['stringency'] = 'high'
        vecDict['infile'] = basename + '.spades.fasta'
        vecDict['outfile'] = basename + '.vecscreen.fasta'
        vecDict['pipe'] = True
        vecargs = Namespace(**vecDict)
        vecscreen.run(parser, vecargs)
    else:
        status('AAFTF vecscreen output found: {:}'.format(basename +
                                                          '.vecscreen.fasta'))
    if not checkfile(basename + '.vecscreen.fasta'):
        status('AAFTF vecscreen failed')
        sys.exit(1)

    #run sourmash purge
    if not checkfile(basename + '.sourpurge.fasta'):
        sourOpts = [
            'cpus', 'debug', 'workdir', 'AAFTF_DB', 'phylum', 'sourdb',
            'mincovpct'
        ]
        sourDict = {k: v for (k, v) in args_dict.items() if k in sourOpts}
        sourDict['left'] = basename + '_filtered_1.fastq.gz'
        if args.right:
            sourDict['right'] = basename + '_filtered_2.fastq.gz'
        sourDict['input'] = basename + '.vecscreen.fasta'
        sourDict['outfile'] = basename + '.sourpurge.fasta'
        sourDict['taxonomy'] = False
        sourDict['pipe'] = True
        sourargs = Namespace(**sourDict)
        sourpurge.run(parser, sourargs)
    else:
        status('AAFTF sourpurge output found: {:}'.format(basename +
                                                          '.sourpurge.fasta'))
    if not checkfile(basename + '.sourpurge.fasta'):
        status('AAFTF sourpurge failed')
        sys.exit(1)

    #run remove duplicates
    if not checkfile(basename + '.rmdup.fasta'):
        rmdupOpts = ['cpus', 'debug', 'workdir']
        rmdupDict = {k: v for (k, v) in args_dict.items() if k in rmdupOpts}
        rmdupDict['input'] = basename + '.sourpurge.fasta'
        rmdupDict['out'] = basename + '.rmdup.fasta'
        rmdupDict['minlen'] = args_dict['mincontiglen']
        rmdupDict['percent_id'] = 95
        rmdupDict['percent_cov'] = 95
        rmdupDict['exhaustive'] = False
        rmdupDict['pipe'] = True
        rmdupargs = Namespace(**rmdupDict)
        rmdup.run(parser, rmdupargs)
    else:
        status('AAFTF rmdup output found: {:}'.format(basename +
                                                      '.rmdup.fasta'))
    if not checkfile(basename + '.rmdup.fasta'):
        status('AAFTF rmdup failed')
        sys.exit(1)

    #run pilon to error-correct
    if not checkfile(basename + '.pilon.fasta'):
        pilonOpts = ['cpus', 'debug', 'workdir', 'iterations', 'memory']
        pilonDict = {k: v for (k, v) in args_dict.items() if k in pilonOpts}
        pilonDict['infile'] = basename + '.rmdup.fasta'
        pilonDict['outfile'] = basename + '.pilon.fasta'
        pilonDict['left'] = basename + '_filtered_1.fastq.gz'
        if args.right:
            pilonDict['right'] = basename + '_filtered_2.fastq.gz'
        pilonDict['pipe'] = True
        pilonargs = Namespace(**pilonDict)
        pilon.run(parser, pilonargs)
    else:
        status('AAFTF pilon output found: {:}'.format(basename +
                                                      '.pilon.fasta'))
    if not checkfile(basename + '.pilon.fasta'):
        status('AAFTF pilon failed')
        sys.exit(1)

    #sort and rename
    if not checkfile(basename + '.final.fasta'):
        sortDict = {
            'input': basename + '.pilon.fasta',
            'out': basename + '.final.fasta',
            'name': 'scaffold',
            'minlen': args_dict['mincontiglen']
        }
        sortargs = Namespace(**sortDict)
        aaftf_sort.run(parser, sortargs)
    else:
        status('AAFTF sort output found: {:}'.format(basename +
                                                     '.final.fasta'))
    if not checkfile(basename + '.final.fasta'):
        status('AAFTF sort failed')
        sys.exit(1)

    #assess the assembly
    assessDict = {'input': basename + '.final.fasta', 'report': False}
    assessargs = Namespace(**assessDict)
    assess.run(parser, assessargs)
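
The pipeline above repeats one pattern: subset the parsed arguments, overlay step-specific values, and rebuild a Namespace. A generic helper capturing that pattern might look like this (hypothetical; not part of AAFTF):

from argparse import Namespace

def subset_args(args, keys, **overrides):
    # keep only `keys` from args; absent keys default to False,
    # matching the fallback used for the mito step above
    d = {k: getattr(args, k, False) for k in keys}
    d.update(overrides)
    return Namespace(**d)

# e.g. the rmdup step could then be driven as:
# rmdupargs = subset_args(args, ['cpus', 'debug', 'workdir'],
#                         input=basename + '.sourpurge.fasta',
#                         out=basename + '.rmdup.fasta',
#                         minlen=args.mincontiglen, percent_id=95,
#                         percent_cov=95, exhaustive=False, pipe=True)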
Example #14
def main():

    #########################################
    # create the top-level parser
    #########################################
    parser = argparse.ArgumentParser(
        prog='AAFTF', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-q",
                        "--quiet",
                        help="Do not output warnings to stderr",
                        action="store_true",
                        dest="quiet")
    parser.add_argument("-v",
                        "--version",
                        help="Installed AAFTF version",
                        action="version",
                        version="%(prog)s " + str(myversion))

    subparsers = parser.add_subparsers(title='[sub-commands]',
                                       dest='command',
                                       parser_class=ArgumentParserWithDefaults)

    #########################################
    # create the individual tool parsers
    #########################################

    ##########
    # trim
    ##########
    # arguments
    # --trimmomatic: path to the Trimmomatic JAR (java is assumed to be on PATH)
    # -o / --out: output basename
    # -ml / --minlen: minimum read length after trimming

    # read info, either paired data are required or singleton
    # --left: left or forward reads
    # --right: right or reverse reads
    # unpaired/single-end reads are handled by passing --left alone

    parser_trim = subparsers.add_parser(
        'trim',
        description=
        "This command trims reads in FASTQ format to remove low quality reads and adaptor sequences",
        help='Trim FASTQ input reads')

    parser_trim.add_argument(
        '-o',
        '--out',
        type=str,
        required=False,
        dest='basename',
        help="Output basename, default to base name of --left reads")

    parser_trim.add_argument('-c',
                             '--cpus',
                             type=int,
                             metavar="cpus",
                             required=False,
                             default=1,
                             help="Number of CPUs/threads to use.")

    parser_trim.add_argument(
        '-ml',
        '--minlen',
        type=int,
        default=75,
        required=False,
        help="Minimum read length after trimming, default: 75")

    parser_trim.add_argument(
        '-l',
        '--left',
        type=str,
        required=True,
        help='left/forward reads of paired-end FASTQ or single-end FASTQ.')

    parser_trim.add_argument('-r',
                             '--right',
                             type=str,
                             required=False,
                             help='right/reverse reads of paired-end FASTQ.')

    parser_trim.add_argument('-v',
                             '--debug',
                             action='store_true',
                             help="Provide debugging messages")

    parser_trim.add_argument('--pipe',
                             action='store_true',
                             help="AAFTF is running in pipeline mode")

    parser_trim.add_argument('--method',
                             default='bbduk',
                             choices=['bbduk', 'trimmomatic'],
                             help='Program to use for adapter trimming')

    parser_trim.add_argument('-m',
                             '--memory',
                             type=int,
                             dest='memory',
                             required=False,
                             help="Max Memory (in GB)")

    tool_group = parser_trim.add_mutually_exclusive_group(required=False)

    tool_group.add_argument('--trimmomatic',
                            '--jar',
                            metavar='trimmomatic_jar',
                            type=str,
                            required=False,
                            help='Trimmomatic JAR path')
    trimmomatic_group = parser_trim.add_argument_group(
        title='Trimmomatic options',
        description="Trimmomatic trimming options")

    trimmomatic_group.add_argument(
        '--trimmomatic_adaptors',
        default="TruSeq3-PE.fa",
        help="Trimmomatic adaptor file, default: TruSeq3-PE.fa")

    trimmomatic_group.add_argument(
        '--trimmomatic_clip',
        default="ILLUMINACLIP:%s:2:30:10",
        help="Trimmomatic clipping, default: ILLUMINACLIP:TruSeq3-PE.fa:2:30:10"
    )

    trimmomatic_group.add_argument(
        '--trimmomatic_leadingwindow',
        default="3",
        type=int,
        help="Trimmomatic window processing arguments, default: LEADING:3")

    trimmomatic_group.add_argument(
        '--trimmomatic_trailingwindow',
        default="3",
        type=int,
        help="Trimmomatic window processing arguments, default: TRAILING:3")

    trimmomatic_group.add_argument(
        '--trimmomatic_slidingwindow',
        default="4:15",
        type=str,
        help=
        "Trimmomatic window processing arguments, default: SLIDINGWINDOW:4:15")
    trimmomatic_group.add_argument(
        '--trimmomatic_quality',
        default="phred33",
        help="Trimmomatic quality encoding -phred33 or phred64")

    ##########
    # mito: de novo assembly of the mitochondrial genome
    ##########

    parser_mito = subparsers.add_parser(
        'mito',
        description=
        "De novo assembly of mitochondrial genome using NOVOplasty, takes PE Illumina adapter trimmed data.",
        help='De novo assembly of mitochondrial genome')
    parser_mito.add_argument('-l',
                             '--left',
                             required=True,
                             help="Left (Forward) reads")

    parser_mito.add_argument('-r',
                             '--right',
                             required=True,
                             help="Right (Reverse) reads")

    parser_mito.add_argument('-o',
                             '--out',
                             type=str,
                             required=True,
                             help="Output FASTA file for mitochondrial genome")

    parser_mito.add_argument('--minlen',
                             default=10000,
                             type=int,
                             help="Minimum expected genome size")

    parser_mito.add_argument('--maxlen',
                             default=100000,
                             type=int,
                             help="Maximum expected genome size")

    parser_mito.add_argument(
        '-s',
        '--seed',
        required=False,
        help=
        "Seed sequence, ie related mitochondrial genome, Default: A. nidulans")

    parser_mito.add_argument(
        '--starting',
        required=False,
        help="FASTA file of start sequence, rotate genome to, default COB")

    parser_mito.add_argument('--reference',
                             required=False,
                             help="Run NOVOplasty in reference mode")

    parser_mito.add_argument(
        '-w',
        '--workdir',
        '--tmpdir',
        type=str,
        dest='workdir',
        required=False,
        help="Temporary directory to store datafiles and processes in")

    parser_mito.add_argument('--pipe',
                             action='store_true',
                             help="AAFTF is running in pipeline mode")

    ##########
    # filter
    ##########
    # arguments
    # -o / --out: output basename
    # -a / --screen_accessions: screening sequence GenBank accessions
    # -u / --screen_urls: screening sequence URLs (FASTA format)
    # -s / --screen_local: local FASTA file(s) to screen against
    # --debug: print debug messages and do not remove contamdb BAM file
    # read info, either paired data are required or singleton
    # --left: left or forward reads
    # --right: right or reverse reads
    # --aligner: bbduk, bwa, bowtie2, or minimap2 for read alignment to contamdb

    parser_filter = subparsers.add_parser(
        'filter',
        description=
        "Filter reads which match contaminant databases such as phiX",
        help='Filter contaminating reads')

    parser_filter.add_argument('-w',
                               '--workdir',
                               type=str,
                               help="temp directory")

    parser_filter.add_argument('-c',
                               '--cpus',
                               type=int,
                               metavar="cpus",
                               required=False,
                               default=1,
                               help="Number of CPUs/threads to use.")

    parser_filter.add_argument('-o',
                               '--out',
                               dest='basename',
                               type=str,
                               required=False,
                               help="Output basename")

    parser_filter.add_argument(
        '-v',
        '--debug',
        action='store_true',
        help=
        "Provide debugging messages and do not remove contamdb matching BAM")

    parser_filter.add_argument(
        '-a',
        '--screen_accessions',
        type=str,
        nargs="*",
        help="Genbank accession number(s) to screen out from initial reads.")

    parser_filter.add_argument(
        '-u',
        '--screen_urls',
        type=str,
        nargs="*",
        help="URLs to download and screen out initial reads.")

    parser_filter.add_argument(
        '-s',
        '--screen_local',
        type=str,
        nargs="+",
        help="Local FASTA file(s) to use contamination screen")

    parser_filter.add_argument('-l',
                               '--left',
                               required=True,
                               help="Left (Forward) reads")

    parser_filter.add_argument('-r',
                               '--right',
                               required=False,
                               help="Right (Reverse) reads")

    parser_filter.add_argument(
        '--AAFTF_DB',
        type=str,
        required=False,
        help="Path to AAFTF resources, defaults to $AAFTF_DB")

    parser_filter.add_argument(
        '--aligner',
        default='bbduk',
        choices=['bbduk', 'bowtie2', 'bwa', 'minimap2'],
        help='Aligner to use to map reads to contamination database')

    parser_filter.add_argument('-m',
                               '--memory',
                               type=int,
                               dest='memory',
                               required=False,
                               help="Max Memory (in GB)")

    parser_filter.add_argument('--pipe',
                               action='store_true',
                               help="AAFTF is running in pipeline mode")

    ##########
    # assemble
    ##########
    # arguments
    # -o / --out: output assembly FASTA
    # -l / --left and -r / --right: cleaned input reads
    # --method: assembler to run (spades, dipspades, megahit)
    # --tmpdir: temporary dir for the assembler

    parser_asm = subparsers.add_parser(
        'assemble',
        description="Run assembler on cleaned reads",
        help='Assemble reads')

    parser_asm.add_argument('--method',
                            type=str,
                            required=False,
                            default="spades",
                            help="Assembly method: spades, dipspades, megahit")

    parser_asm.add_argument(
        '-o',
        '--out',
        type=str,
        required=True,  # think about sensible replacement in future
        help="Output assembly FASTA")

    parser_asm.add_argument('-w',
                            '--workdir',
                            type=str,
                            dest='workdir',
                            help="assembly output directory")

    parser_asm.add_argument('-c',
                            '--cpus',
                            type=int,
                            metavar="cpus",
                            required=False,
                            default=1,
                            help="Number of CPUs/threads to use.")

    parser_asm.add_argument(
        '-m',
        '--memory',
        type=str,
        dest='memory',
        required=False,
        default='32',
        help="Memory (in GB) setting for SPAdes. Default is 32")

    parser_asm.add_argument('-l',
                            '--left',
                            required=False,
                            help="Left (Forward) reads")

    parser_asm.add_argument('-r',
                            '--right',
                            required=False,
                            help="Right (Reverse) reads")

    parser_asm.add_argument('-v',
                            '--debug',
                            action='store_true',
                            help="Print Spades stdout to terminal")

    parser_asm.add_argument('--tmpdir',
                            type=str,
                            required=False,
                            help="Assembler temporary dir")
    parser_asm.add_argument('--assembler_args',
                            action='append',
                            required=False,
                            help="Additional SPAdes/Megahit arguments")
    parser_asm.add_argument('--haplocontigs',
                            dest='haplocontigs',
                            default=False,
                            action='store_true',
                            help="For dipSPAdes take the haplocontigs file")

    parser_asm.add_argument('--pipe',
                            action='store_true',
                            help="AAFTF is running in pipeline mode")

    ##########
    # vecscreen
    ##########
    # arguments
    # -i / --input: input assembly file
    # -o / --outfile: output cleaned assembly
    # -w / --workdir / --tmpdir: working directory
    # -pid / --percent_id: identity cutoff for adaptor matches

    parser_vecscreen = subparsers.add_parser(
        'vecscreen',
        description="Screen contigs for vector and common contaminantion",
        help='Vector and Contaminant Screening of assembled contigs')

    parser_vecscreen.add_argument('-c',
                                  '--cpus',
                                  type=int,
                                  metavar="cpus",
                                  default=1,
                                  help="Number of CPUs/threads to use.")

    parser_vecscreen.add_argument('-i',
                                  '--input',
                                  '--infile',
                                  type=str,
                                  required=True,
                                  dest='infile',
                                  help="Input contigs or scaffold assembly")

    parser_vecscreen.add_argument(
        '-o',
        '--outfile',
        type=str,
        required=True,
        help="Output vector screened and cleaned assembly")

    parser_vecscreen.add_argument(
        '-pid',
        '--percent_id',
        type=int,
        required=False,
        help="Percent Identity cutoff for vecscreen adaptor matches")

    parser_vecscreen.add_argument(
        '-w',
        '--workdir',
        '--tmpdir',
        type=str,
        help="Working directory to store datafiles and processes in")

    parser_vecscreen.add_argument(
        '--AAFTF_DB',
        type=str,
        required=False,
        help="Path to AAFTF resources, defaults to $AAFTF_DB")

    parser_vecscreen.add_argument('-s',
                                  '--stringency',
                                  default='high',
                                  choices=['high', 'low'],
                                  help="Stringency to filter VecScreen hits")

    parser_vecscreen.add_argument('-v',
                                  '--debug',
                                  action='store_true',
                                  dest='debug',
                                  help="Provide debugging messages")

    parser_vecscreen.add_argument('--pipe',
                                  action='store_true',
                                  help="AAFTF is running in pipeline mode")

    ##########
    # sourpurge
    ##########
    # arguments
    # -i / --input: input assembly file
    # -o / --outfile: output cleaned assembly file
    # -l / --left and -r / --right: input reads
    # -c / --cpus: number of cpus
    # --tmpdir
    # --phylum: phylum to keep
    parser_sour = subparsers.add_parser(
        'sourpurge',
        description="Purge contigs based on sourmash results",
        help='Purge contigs based on sourmash results')

    parser_sour.add_argument('-i',
                             '--input',
                             type=str,
                             required=True,
                             help="Input contigs or scaffold assembly")

    parser_sour.add_argument(
        '-o',
        '--outfile',
        type=str,
        required=True,  # think about sensible replacement in future
        help="Output sourmash cleaned assembly")

    parser_sour.add_argument('-l',
                             '--left',
                             required=False,
                             help="Left (Forward) reads")

    parser_sour.add_argument('-r',
                             '--right',
                             required=False,
                             help="Right (Reverse) reads")

    parser_sour.add_argument(
        '-p',
        '--phylum',
        required=True,
        nargs="+",
        help="Phylum or Phyla to keep matches, i.e. Ascomycota")

    parser_sour.add_argument('--sourdb',
                             required=False,
                             help="SourMash LCA k-31 taxonomy database")

    parser_sour.add_argument('-mc',
                             '--mincovpct',
                             default=5,
                             type=int,
                             help="Minimum percent of N50 coverage to remove")

    parser_sour.add_argument('-c',
                             '--cpus',
                             type=int,
                             metavar="cpus",
                             default=1,
                             help="Number of CPUs/threads to use.")

    parser_sour.add_argument(
        '-w',
        '--workdir',
        '--tmpdir',
        type=str,
        dest='workdir',
        required=False,
        help="Temporary directory to store datafiles and processes in")

    parser_sour.add_argument('-v',
                             '--debug',
                             action='store_true',
                             dest='debug',
                             help="Provide debugging messages")

    parser_sour.add_argument(
        '--AAFTF_DB',
        type=str,
        required=False,
        help="Path to AAFTF resources, defaults to $AAFTF_DB")

    parser_sour.add_argument('--just-show-taxonomy',
                             dest='taxonomy',
                             action='store_true',
                             help="Show taxonomy information and exit")

    parser_sour.add_argument('--pipe',
                             action='store_true',
                             help="AAFTF is running in pipeline mode")

    ##########
    # rmdup
    ##########

    # -i / --input
    # -o / --out
    # --tmpdir
    # --percent_id
    # -pcov / --percent_cov
    # -ml / --minlen
    # --exhaustive
    # --debug

    parser_rmdup = subparsers.add_parser(
        'rmdup',
        description="Remove duplicate contigs",
        help='Remove duplicate contigs')
    parser_rmdup.add_argument(
        '-i',
        '--input',
        type=str,
        required=True,
        help="Input Assembly fasta file(contigs or scaffolds)")

    parser_rmdup.add_argument(
        '-o',
        '--out',
        type=str,
        required=True,
        help=
        "Output new version of assembly with duplicated contigs/scaffolds removed"
    )

    parser_rmdup.add_argument('-c',
                              '--cpus',
                              type=int,
                              metavar="cpus",
                              required=False,
                              default=1,
                              help="Number of CPUs/threads to use.")

    parser_rmdup.add_argument(
        '-w',
        '--workdir',
        '--tmpdir',
        dest='workdir',
        type=str,
        required=False,
        help="Temporary directory to store datafiles and processes in")

    parser_rmdup.add_argument(
        '-pid',
        '--percent_id',
        type=int,
        dest='percent_id',
        required=False,
        default=95,
        help="Percent Identity used in matching contigs for redundancy")

    parser_rmdup.add_argument(
        '-pcov',
        '--percent_cov',
        type=int,
        dest='percent_cov',
        required=False,
        default=95,
        help="Coverage of contig used to decide if it is redundant")

    parser_rmdup.add_argument(
        '-ml',
        '--minlen',
        type=int,
        required=False,
        default=500,
        help="Minimum contig length to keep, shorter ones are dropped")

    parser_rmdup.add_argument(
        '--exhaustive',
        action='store_true',
        help=
        "Compute overlaps for every contig, otherwise only process contigs for L75 and below"
    )

    parser_rmdup.add_argument(
        '--debug',
        action='store_true',
        help='Run rmdup in debugging mode for more output')

    parser_rmdup.add_argument('--pipe',
                              action='store_true',
                              help="AAFTF is running in pipeline mode")

    ##########
    # pilon
    ##########
    # arguments
    # -i / --infile: input assembly file
    # -o / --outfile: output polished assembly
    # -l / --left and -r / --right: paired reads used for polishing
    # --iterations: default 5
    # --tmpdir
    # --debug

    parser_pilon = subparsers.add_parser(
        'pilon',
        description="Polish contig sequences with Pilon",
        help='Polish contig sequences with Pilon')

    parser_pilon.add_argument('-o',
                              '--out',
                              '--outfile',
                              type=str,
                              dest='outfile',
                              required=True,
                              help="Output Pilon polished assembly")

    parser_pilon.add_argument('-i',
                              '--infile',
                              '--input',
                              type=str,
                              dest='infile',
                              required=True,
                              help="Input contigs or scaffold assembly")

    parser_pilon.add_argument('-c',
                              '--cpus',
                              type=int,
                              metavar="cpus",
                              default=1,
                              help="Number of CPUs/threads to use.")

    parser_pilon.add_argument('-m',
                              '--memory',
                              type=int,
                              default=4,
                              dest='memory',
                              required=False,
                              help="Max Memory (in GB) (default is 4gb)")

    parser_pilon.add_argument('-v',
                              '--debug',
                              action='store_true',
                              help="Provide debugging messages")

    parser_pilon.add_argument(
        '-it',
        '--iterations',
        type=int,
        default=5,
        help="Number of Polishing iterations to run (default is 5)")

    parser_pilon.add_argument(
        '-l',
        '--left',
        type=str,
        required=True,
        help=
        'Left/forward reads of paired-end FASTQ.'
    )

    parser_pilon.add_argument(
        '-r',
        '--right',
        type=str,
        required=True,
        help=
        'Right/reverse reads of paired-end FASTQ.'
    )

    parser_pilon.add_argument(
        '-w',
        '--workdir',
        '--tmpdir',
        type=str,
        dest='workdir',
        required=False,
        help="Temporary directory to store datafiles and processes in")

    parser_pilon.add_argument('--pipe',
                              action='store_true',
                              help="AAFTF is running in pipeline mode")

    ##########
    # sort/rename FASTA headers
    ##########
    # arguments
    # -i / --input: input assembly file
    # -o / --out: output assembly file
    # -n / --name: base name to use, default: scaffold

    parser_sort = subparsers.add_parser(
        'sort',
        description="Sort contigs by length and rename FASTA headers",
        help='Sort contigs by length and rename FASTA headers')

    parser_sort.add_argument('-i',
                             '--input',
                             '--infile',
                             required=True,
                             dest='input',
                             help='Input genome assembly FASTA')

    parser_sort.add_argument('-o',
                             '--out',
                             '--output',
                             required=True,
                             dest='out',
                             help='Output genome assembly FASTA')

    parser_sort.add_argument(
        '-ml',
        '--minlen',
        type=int,
        required=False,
        default=0,
        help="Minimum contig length to keep, shorter ones are dropped")

    parser_sort.add_argument('-n',
                             '--name',
                             '--basename',
                             default='scaffold',
                             dest='name',
                             help='Basename to rename FASTA headers')

    ##########
    # assess completeness
    ##########
    # arguments
    # -i / --input: input assembly file
    # -r / --report: report file (otherwise stdout)

    parser_assess = subparsers.add_parser(
        'assess',
        description="Assess completeness of genome assembly",
        help='Assess completeness of genome assembly')

    parser_assess.add_argument(
        '-i',
        '--input',
        '--infile',
        required=True,
        help=
        'Input genome assembly to test completeness and provide summary statistics'
    )

    parser_assess.add_argument(
        '-r',
        '--report',
        type=str,
        help=
        'Filename to save report information; otherwise printed to stdout')

    ##########
    # pipeline run it all
    ##########
    # arguments
    # -l / --left and -r / --right: input reads
    # -o / --out: output basename
    # remaining options are forwarded to the individual pipeline steps

    parser_pipeline = subparsers.add_parser(
        'pipeline',
        description="Run entire AAFTF pipeline automagically",
        help='Run AAFTF pipeline')

    parser_pipeline.add_argument('--tmpdir',
                                 type=str,
                                 required=False,
                                 help="Assembler temporary dir")
    parser_pipeline.add_argument('--assembler_args',
                                 action='append',
                                 required=False,
                                 help="Additional SPAdes/Megahit arguments")
    parser_pipeline.add_argument(
        '--method',
        type=str,
        required=False,
        default="spades",
        help="Assembly method: spades, dipspades, megahit")

    parser_pipeline.add_argument(
        '-l',
        '--left',
        type=str,
        required=True,
        help='left/forward reads of paired-end FASTQ or single-end FASTQ.')

    parser_pipeline.add_argument(
        '-r',
        '--right',
        type=str,
        required=False,
        help='right/reverse reads of paired-end FASTQ.')

    parser_pipeline.add_argument(
        '-o',
        '--out',
        type=str,
        required=True,
        dest='basename',
        help="Output basename, default to base name of --left reads")

    parser_pipeline.add_argument('-c',
                                 '--cpus',
                                 type=int,
                                 metavar="cpus",
                                 required=False,
                                 default=1,
                                 help="Number of CPUs/threads to use.")

    parser_pipeline.add_argument(
        '-m',
        '--memory',
        type=str,
        dest='memory',
        required=False,
        help="Memory (in GB) setting for SPAdes. Default is Auto")

    parser_pipeline.add_argument(
        '-ml',
        '--minlen',
        type=int,
        default=75,
        required=False,
        help="Minimum read length after trimming, default: 75")

    parser_pipeline.add_argument(
        '-a',
        '--screen_accessions',
        type=str,
        nargs="*",
        help="Genbank accession number(s) to screen out from initial reads.")

    parser_pipeline.add_argument(
        '-u',
        '--screen_urls',
        type=str,
        nargs="*",
        help="URLs to download and screen out initial reads.")

    parser_pipeline.add_argument(
        '-it',
        '--iterations',
        type=int,
        default=5,
        help="Number of Pilon Polishing iterations to run")

    parser_pipeline.add_argument('-mc',
                                 '--mincontiglen',
                                 type=int,
                                 default=500,
                                 required=False,
                                 help="Minimum length of contigs to keep")

    parser_pipeline.add_argument(
        '--AAFTF_DB',
        type=str,
        required=False,
        help="Path to AAFTF resources, defaults to $AAFTF_DB")

    parser_pipeline.add_argument('-w',
                                 '--workdir',
                                 type=str,
                                 help="temp directory")

    parser_pipeline.add_argument('-v',
                                 '--debug',
                                 action='store_true',
                                 help="Provide debugging messages")

    parser_pipeline.add_argument(
        '-p',
        '--phylum',
        required=True,
        nargs="+",
        help="Phylum or Phyla to keep matches, i.e. Ascomycota")

    parser_pipeline.add_argument('--sourdb',
                                 required=False,
                                 help="SourMash LCA k-31 taxonomy database")

    parser_pipeline.add_argument(
        '--mincovpct',
        default=5,
        type=int,
        help="Minimum percent of N50 coverage to remove")

    #set defaults
    parser.set_defaults(func=run_subtool)

    ### process args now ###
    # if no args then print help and exit
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)

    args = parser.parse_args()

    try:
        status('Running AAFTF v{:}'.format(myversion))
        args.func(parser, args)
    except IOError as e:
        if e.errno != 32:  # errno 32 is EPIPE; ignore broken-pipe errors
            raise
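
main() wires every sub-command to run_subtool via set_defaults; that dispatcher is defined elsewhere in AAFTF, but a plausible sketch (an assumption, not the package's actual code) simply routes args.command to the matching module's run():

import importlib

def run_subtool(parser, args):
    # hypothetical dispatcher: 'AAFTF.trim', 'AAFTF.filter', etc. are assumed
    # to be importable modules that each expose run(parser, args)
    mod = importlib.import_module('AAFTF.' + args.command)
    mod.run(parser, args)

if __name__ == '__main__':
    main()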