def run(parser, args):
    """Dispatch to the assembler implementation chosen by ``args.method``.

    Supported methods: ``spades``, ``dipspades``, ``megahit``.  Any other
    value is reported via ``status`` and nothing is run.
    """
    if args.method == "spades":
        run_spades(parser, args)
    elif args.method == "dipspades":
        run_dipspades(parser, args)
    elif args.method == "megahit":
        run_megahit(parser, args)
    else:
        # fix: message previously read "Unknow assembler method"
        status("Unknown assembler method {}".format(args.method))
def orient_to_start(fasta_in, fasta_out, folder='.', start=False):
    """Rotate a (presumed circular) mitochondrial contig so it begins at a start gene.

    The start gene (``start`` FASTA, or a built-in fungal cob consensus when
    ``start`` is falsy) is aligned to ``fasta_in`` with minimap2; the single
    contig is rotated (and reverse-complemented for minus-strand hits) so the
    gene sits at position 0, then written to ``fasta_out`` with header 'mt'.
    If zero or multiple alignments are found, the input sequence is written
    unchanged and an error is reported via ``status``.
    """
    # temp query FASTA holding the start gene; unique name so parallel runs don't collide
    startFile = os.path.join(folder, '{}.fasta'.format(uuid.uuid4()))
    if not start:
        # default start gene: cytochrome b (cob), generated as spoa consensus
        # from select fungal cob genes
        cob1 = 'atgagaattttaaaaagtcatcctttattaaaattagttaatagttatattattgattcaccacaaccttctaatattagttatttatgaaattttggatctttattagctttatgtttagttatacaaattgtaactggtgttacattagctatgcactatacacctaatgttgatttagcttttaattctgtagaacatattatgagagatgtaaataatggttgattaataagatatttacatgctaatactgcttcagcattctttttcttagttatatttacatataggtagaggattatattatggttcatataaatcacctagaacattaacatgagctattgg'
        with open(startFile, 'w') as outfile:
            outfile.write('>COB\n{}\n'.format(softwrap(cob1)))
    else:
        shutil.copyfile(start, startFile)
    # load the assembly sequence; NOTE(review): only the LAST record is kept,
    # so this assumes fasta_in holds a single contig -- confirm at call sites
    initial_seq = ''
    header = ''
    with open(fasta_in, 'r') as infile:
        for title, seq in SimpleFastaParser(infile):
            initial_seq = seq
            header = title
    # map the start gene against the contig; -c asks minimap2 for PAF output
    # with alignment (CIGAR) so coordinates below are populated
    alignments = []
    minimap2_cmd = ['minimap2', '-x', 'map-ont', '-c', fasta_in, startFile]
    for line in execute(minimap2_cmd, '.'):
        cols = line.rstrip().split('\t')
        alignments.append(cols)
    if len(alignments) == 1:
        # PAF columns (0-based): 2 = query start, 4 = strand,
        # 7 = target start, 8 = target end
        ref_strand = cols[4]
        ref_offset = int(cols[2])
        if ref_strand == '-':
            # minus strand: gene start corresponds to target END, shifted by
            # the unaligned query prefix -- TODO(review) confirm offset sign
            ref_start = int(cols[8]) + ref_offset
        else:
            ref_start = int(cols[7]) - ref_offset
        # rotate the circular sequence so the gene begins at position 0
        rotated = initial_seq[ref_start:] + initial_seq[:ref_start]
        if ref_strand == '-':
            rotated = RevComp(rotated)
        with open(fasta_out, 'w') as outfile:
            outfile.write('>{}\n{}\n'.format('mt', softwrap(rotated)))
    elif len(alignments) == 0:
        # no hit: report but still emit the unrotated sequence
        status(
            'ERROR: unable to rotate because did not find --starting sequence\n'
        )
        with open(fasta_out, 'w') as outfile:
            outfile.write('>{}\n{}\n'.format('mt', softwrap(initial_seq)))
    elif len(alignments) > 1:
        # ambiguous placement: report all hits and emit the unrotated sequence
        status('ERROR: unable to rotate because found multiple alignments\n')
        for x in alignments:
            sys.stderr.write('{}\n'.format(x))
        with open(fasta_out, 'w') as outfile:
            outfile.write('>{}\n{}\n'.format('mt', softwrap(initial_seq)))
    # clean up the temporary start-gene FASTA
    if os.path.isfile(startFile):
        os.remove(startFile)
def run(parser, args):
    """Sort contigs by length (longest first), rename, and write to ``args.out``.

    Contigs shorter than ``args.minlen`` are dropped; survivors are renamed
    ``<args.name>_1``, ``<args.name>_2``, ... in decreasing-length order.

    NOTE(review): headers are keyed by the full FastaParser title in AllSeqs
    but looked up in SeqIO.to_dict (first whitespace token) -- headers
    containing spaces would KeyError; confirm inputs have token-only headers.
    """
    status('Sorting sequences by length longest --> shortest')
    AllSeqs = {}
    # fix: 'rU' mode was removed in Python 3.11 -- plain 'r' is equivalent
    with open(args.input, 'r') as fasta_in:
        for Header, Seq in SimpleFastaParser(fasta_in):
            if Header not in AllSeqs:
                if len(Seq) >= args.minlen:
                    AllSeqs[Header] = len(Seq)
    sortSeqs = sorted(AllSeqs.items(), key=operator.itemgetter(1),
                      reverse=True)
    orderedSeqs = [i[0] for i in sortSeqs]
    # second pass gives random access to the sequences by header
    SeqRecords = SeqIO.to_dict(SeqIO.parse(args.input, 'fasta'))
    with open(args.out, 'w') as fasta_out:
        for i, x in enumerate(orderedSeqs):
            fasta_out.write('>{:}_{:}\n{:}\n'.format(
                args.name, i + 1, softwrap(str(SeqRecords[x].seq))))
    status('Output written to: {:}'.format(args.out))
def run_dipspades(parser, args):
    """Assemble reads with dipSPAdes and copy the consensus contigs out.

    Builds the dipspades.py command line from ``args`` (reads, threads,
    memory, optional haplocontigs/tmpdir/extra args), runs it (resuming with
    --continue when the workdir already exists), then copies the consensus
    assemblies to their final names and reports basic stats.
    """
    if not args.workdir:
        args.workdir = 'dipspades_' + str(os.getpid())
    runcmd = [
        'dipspades.py', '--threads', str(args.cpus), '--cov-cutoff', 'auto',
        '--mem', args.memory, '-o', args.workdir
    ]
    if args.assembler_args:
        runcmd.extend(args.assembler_args)
    if args.haplocontigs:
        runcmd.extend(['--hap', args.haplocontigs])
    if args.tmpdir:
        runcmd.extend(['--tmp-dir', args.tmpdir])
    # find reads -- use --left/right or look for cleaned in tmpdir
    forReads, revReads = None, None
    if args.left:
        forReads = os.path.abspath(args.left)
    if args.right:
        revReads = os.path.abspath(args.right)
    if not forReads:
        status('Unable to located FASTQ raw reads, provide --left')
        sys.exit(1)
    if not revReads:
        runcmd = runcmd + ['-s', forReads]
    else:
        runcmd = runcmd + ['--pe1-1', forReads, '--pe1-2', revReads]
    # this basically overrides everything above and only resumes an existing run
    if os.path.isdir(args.workdir):
        runcmd = ['dipspades.py', '-o', args.workdir, '--continue']
    # now run the spades job
    status('Assembling FASTQ data using Spades')
    printCMD(runcmd)
    if args.debug:
        subprocess.run(runcmd)
    else:
        # fix: use subprocess.DEVNULL instead of a leaked open(os.devnull) handle
        subprocess.run(runcmd, stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL)
    # pull out assembly
    # NOTE(review): `prefix` is a module-level name defined outside this view
    if args.out:
        finalOut = args.out
    else:
        finalOut = prefix + '.dipspades.fasta'
    dipspadesoutdir = os.path.join(args.workdir, 'dipspades')
    if os.path.isfile(os.path.join(args.workdir, 'consensus_contigs.fasta')):
        shutil.copyfile(os.path.join(args.workdir, 'consensus_contigs.fasta'),
                        finalOut)
        shutil.copyfile(
            os.path.join(dipspadesoutdir, 'paired_consensus_contigs.fasta'),
            prefix + ".dipspades_consensus_paired.fasta")
        # fix: previously copied the *paired* file to the unpaired output name
        shutil.copyfile(
            os.path.join(dipspadesoutdir, 'unpaired_consensus_contigs.fasta'),
            prefix + ".dipspades_consensus_unpaired.fasta")
        status('Dipspades assembly finished: {:}'.format(finalOut))
        # fix: status() was previously given a stray second positional argument
        status('Dipspades assembly copied over: {:} {:}'.format(
            prefix + ".dipspades_consensus_unpaired.fasta",
            prefix + ".dipspades_consensus_paired.fasta"))
        numSeqs, assemblySize = fastastats(finalOut)
        status('Assembly is {:,} scaffolds and {:,} bp'.format(
            numSeqs, assemblySize))
    else:
        status(
            'Spades assembly output missing -- check Dipspades logfile in {:}.'
            .format(os.path.join(args.workdir, 'dipspades', 'dipspades.log')))
    if not args.pipe:
        status(
            'Your next command might be:\n\tAAFTF vecscreen -i {:} -c {:}\n'.
            format(finalOut, args.cpus))
def run_megahit(parser, args):
    """Assemble reads with MEGAHIT and copy final.contigs.fa to the output.

    Builds the megahit command line from ``args`` (reads, threads, memory,
    tmpdir, extra args), refuses to run over an existing workdir (MEGAHIT
    requires a fresh -o directory), then copies the final contigs out and
    reports basic stats.
    """
    if not args.workdir:
        args.workdir = 'megahit_' + str(os.getpid())
    runcmd = ['megahit', '-t', str(args.cpus), '-o', args.workdir]
    if args.assembler_args:
        runcmd.extend(args.assembler_args)
    if args.memory:
        runcmd.extend(['--memory', args.memory])
    if args.tmpdir:
        runcmd.extend(['--tmp-dir', args.tmpdir])
    # find reads -- use --left/right or look for cleaned in tmpdir
    forReads, revReads = None, None
    if args.left:
        forReads = os.path.abspath(args.left)
    if args.right:
        revReads = os.path.abspath(args.right)
    if not forReads:
        status('Unable to located FASTQ raw reads, provide --left')
        sys.exit(1)
    if not revReads:
        runcmd = runcmd + ['-r', forReads]
    else:
        runcmd = runcmd + ['-1', forReads, '-2', revReads]
    if os.path.isdir(args.workdir):
        status("Cannot re-run with existing folder {}".format(args.workdir))
        # fix: previously only printed the message and ran megahit anyway,
        # which fails because megahit refuses an existing output directory
        sys.exit(1)
    # now run the megahit job
    status('Assembling FASTQ data using megahit')
    printCMD(runcmd)
    if args.debug:
        subprocess.run(runcmd)
    else:
        # fix: use subprocess.DEVNULL instead of a leaked open(os.devnull) handle
        subprocess.run(runcmd, stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL)
    # pull out assembly
    # NOTE(review): `prefix` is a module-level name defined outside this view
    if args.out:
        finalOut = args.out
    else:
        finalOut = prefix + '.megahit.fasta'
    if os.path.isfile(os.path.join(args.workdir, 'final.contigs.fa')):
        shutil.copyfile(os.path.join(args.workdir, 'final.contigs.fa'),
                        finalOut)
        status('Megahit assembly finished: {:}'.format(finalOut))
        numSeqs, assemblySize = fastastats(finalOut)
        status('Assembly is {:,} scaffolds and {:,} bp'.format(
            numSeqs, assemblySize))
    else:
        status('Megahit assembly output missing -- check megahit logfile.')
    if not args.pipe:
        status(
            'Your next command might be:\n\tAAFTF vecscreen -i {:} -c {:}\n'.
            format(finalOut, args.cpus))
def run(parser, args):
    """Purge contaminant contigs by sourmash taxonomy and (optionally) read coverage.

    Pipeline: sketch + lca-classify every contig with sourmash against a
    genbank LCA database; drop contigs whose taxonomy does not match
    ``args.phylum``; if reads were supplied, map them with bwa/samtools,
    compute per-contig coverage, and additionally drop contigs below
    ``args.mincovpct`` percent of the N50 contigs' average coverage.  The
    surviving contigs are written to ``args.outfile``.
    """
    if not args.workdir:
        args.workdir = 'aaftf-sourpurge_' + str(uuid.uuid4())[:8]
    if not os.path.exists(args.workdir):
        os.mkdir(args.workdir)
    bamthreads = 4
    if args.cpus < 4:
        bamthreads = 1
    # find reads; coverage filtering is skipped (not fatal) without them
    forReads, revReads = None, None
    if args.left:
        forReads = os.path.abspath(args.left)
    if args.right:
        revReads = os.path.abspath(args.right)
    if not forReads:
        status(
            'Unable to located FASTQ raw reads, low coverage will be skipped. Provide -l,--left or -r,--right to enable low coverage filtering.'
        )
    # parse database locations
    if not args.sourdb:
        try:
            DB = os.environ["AAFTF_DB"]
            # fix: this join previously ran unconditionally after the
            # try/except, raising NameError on DB when the env var was
            # missing and --AAFTF_DB had been used instead
            SOUR = os.path.join(DB, 'genbank-k31.lca.json.gz')
        except KeyError:
            if args.AAFTF_DB:
                SOUR = os.path.join(args.AAFTF_DB, 'genbank-k31.lca.json.gz')
            else:
                status(
                    "$AAFTF_DB/genbank-k31.lca.json.gz not found, pass --sourdb"
                )
                sys.exit(1)
        if not os.path.isfile(SOUR):
            status(
                "{:} sourmash database not found, download and rename to genbank-k31.lca.json.gz"
                .format(SOUR))
            sys.exit(1)
    else:
        SOUR = os.path.abspath(args.sourdb)
    # hard coded tmpfile names inside the workdir
    assembly_working = 'assembly.fasta'
    blobBAM = 'remapped.bam'
    shutil.copyfile(args.input, os.path.join(args.workdir, assembly_working))
    numSeqs, assemblySize = fastastats(
        os.path.join(args.workdir, assembly_working))
    status('Assembly is {:,} contigs and {:,} bp'.format(
        numSeqs, assemblySize))
    # fix: subprocess.DEVNULL replaces a leaked open(os.devnull) handle
    DEVNULL = subprocess.DEVNULL
    # taxonomy classification with sourmash lca classify
    status('Running SourMash to get taxonomy classification for each contig')
    sour_sketch = os.path.basename(assembly_working) + '.sig'
    sour_compute = [
        'sourmash', 'compute', '-k', '31', '--scaled=1000', '--singleton',
        assembly_working
    ]
    printCMD(sour_compute)
    subprocess.run(sour_compute, cwd=args.workdir, stderr=DEVNULL)
    sour_classify = [
        'sourmash', 'lca', 'classify', '--db', SOUR, '--query', sour_sketch
    ]
    printCMD(sour_classify)
    # output csv: ID,status,superkingdom,phylum,class,order,family,genus,species,strain
    Taxonomy = {}
    UniqueTax = []
    sourmashTSV = os.path.join(args.workdir, 'sourmash.csv')
    with open(sourmashTSV, 'w') as sour_out:
        for line in execute(sour_classify, args.workdir):
            sour_out.write(line)
            # skip header/blank/malformed lines
            if not line or line.startswith('\n') or line.startswith(
                    'ID') or line.count(',') < 9:
                continue
            line = line.strip()
            cols = line.split(',')
            if 'found' in cols:
                idx = cols.index('found')
                Taxonomy[cols[0]] = cols[idx + 1:]
                taxClean = [x for x in cols[idx + 1:] if x]
                UniqueTax.append('{:}'.format(';'.join(taxClean)))
            elif 'nomatch' in cols:
                idx = cols.index('nomatch')
                Taxonomy[cols[0]] = cols[idx + 1:]
    UniqueTax = set(UniqueTax)
    status('Found {:} taxonomic classifications for contigs:\n{:}'.format(
        len(UniqueTax), '\n'.join(UniqueTax)))
    if args.taxonomy:
        # report-only mode: stop after printing classifications
        # NOTE(review): exits with status 1 although this is not an error
        sys.exit(1)
    Tax2Drop = []
    for k, v in Taxonomy.items():
        v = [x for x in v if x]  # remove empty items from list
        if args.debug:
            print('{:}\t{:}'.format(k, v))
        if len(v) > 0:
            if not any(i in v for i in args.phylum):
                Tax2Drop.append(k)
    # drop contigs from taxonomy before calculating coverage
    status('Dropping {:} contigs from taxonomy screen'.format(len(Tax2Drop)))
    sourTax = os.path.join(args.workdir, 'sourmashed-tax-screen.fasta')
    with open(sourTax, 'w') as outfile:
        # fix: 'rU' mode was removed in Python 3.11
        with open(os.path.join(args.workdir, assembly_working),
                  'r') as infile:
            for record in SeqIO.parse(infile, 'fasta'):
                if record.id not in Tax2Drop:
                    SeqIO.write(record, outfile, 'fasta')
    # only do coverage trimming if reads provided
    Contigs2Drop = []  # stays empty if no reads given to gather by coverage
    if forReads:
        # check if BAM present, if so skip mapping
        if not os.path.isfile(os.path.join(args.workdir, blobBAM)):
            bwa_index = ['bwa', 'index', os.path.basename(sourTax)]
            status('Building BWA index')
            printCMD(bwa_index)
            subprocess.run(bwa_index, cwd=args.workdir, stderr=DEVNULL)
            # map reads to assembly using BWA
            bwa_cmd = [
                'bwa', 'mem', '-t',
                str(args.cpus),
                os.path.basename(sourTax),  # assembly index base
                forReads
            ]
            if revReads:
                bwa_cmd.append(revReads)
            # run BWA and pipe to samtools sort
            status('Aligning reads to assembly with BWA')
            printCMD(bwa_cmd)
            p1 = subprocess.Popen(bwa_cmd,
                                  cwd=args.workdir,
                                  stdout=subprocess.PIPE,
                                  stderr=DEVNULL)
            p2 = subprocess.Popen([
                'samtools', 'sort', '--threads',
                str(bamthreads), '-o', blobBAM, '-'
            ],
                                  cwd=args.workdir,
                                  stdout=subprocess.PIPE,
                                  stderr=DEVNULL,
                                  stdin=p1.stdout)
            p1.stdout.close()
            p2.communicate()
            subprocess.run(['samtools', 'index', blobBAM], cwd=args.workdir)
        # calculate coverage from BAM file
        status('Calculating read coverage per contig')
        FastaBed = os.path.join(args.workdir, 'assembly.bed')
        lengths = []
        with open(FastaBed, 'w') as bedout:
            # fix: 'rU' mode was removed in Python 3.11
            with open(sourTax, 'r') as SeqIn:
                for record in SeqIO.parse(SeqIn, 'fasta'):
                    bedout.write('{:}\t{:}\t{:}\n'.format(
                        record.id, 0, len(record.seq)))
                    lengths.append(len(record.seq))
        N50 = calcN50(lengths)
        Coverage = {}
        coverageBed = os.path.join(args.workdir, 'coverage.bed')
        cov_cmd = ['samtools', 'bedcov', os.path.basename(FastaBed), blobBAM]
        printCMD(cov_cmd)
        with open(coverageBed, 'w') as bed_out:
            for line in execute(cov_cmd, args.workdir):
                bed_out.write(line)
                if not line or line.startswith('\n') or line.count('\t') < 3:
                    continue
                line = line.strip()
                cols = line.split('\t')
                # bedcov col 3 = summed per-base depth; divide by length
                cov = int(cols[3]) / float(cols[2])
                Coverage[cols[0]] = (int(cols[2]), cov)
        # get average coverage of N50 contigs
        n50Cov = []
        for k, v in Coverage.items():
            if args.debug:
                print('{:}; Len: {:}; Cov: {:.2f}'.format(k, v[0], v[1]))
            if v[0] >= N50:
                n50Cov.append(v[1])
        n50AvgCov = sum(n50Cov) / len(n50Cov)
        minpct = args.mincovpct / 100
        # should we make this a variable? 5% was something arbitrary
        min_coverage = float(n50AvgCov * minpct)
        status('Average coverage for N50 contigs is {:}X'.format(
            int(n50AvgCov)))
        # start list of contigs to drop
        for k, v in Coverage.items():
            if v[1] <= min_coverage:
                Contigs2Drop.append(k)
        status(
            'Found {:,} contigs with coverage less than {:.2f}X ({:}%)'.format(
                len(Contigs2Drop), min_coverage, args.mincovpct))
    if args.debug:
        print('Contigs dropped due to coverage: {:}'.format(
            ','.join(Contigs2Drop)))
        print('Contigs dropped due to taxonomy: {:}'.format(
            ','.join(Tax2Drop)))
    DropFinal = set(Contigs2Drop + Tax2Drop)
    status('Dropping {:,} total contigs based on taxonomy and coverage'.format(
        len(DropFinal)))
    # fix: 'rU' mode was removed in Python 3.11
    with open(args.outfile, 'w') as outfile, open(sourTax, 'r') as seqin:
        for record in SeqIO.parse(seqin, 'fasta'):
            if record.id not in DropFinal:
                SeqIO.write(record, outfile, 'fasta')
    numSeqs, assemblySize = fastastats(args.outfile)
    status('Sourpurged assembly is {:,} contigs and {:,} bp'.format(
        numSeqs, assemblySize))
    # suggest the next pipeline step's output name
    if '_' in args.outfile:
        nextOut = args.outfile.split('_')[0] + '.rmdup.fasta'
    elif '.' in args.outfile:
        nextOut = args.outfile.split('.')[0] + '.rmdup.fasta'
    else:
        nextOut = args.outfile + '.rmdup.fasta'
    if checkfile(sourmashTSV):
        baseinput = os.path.basename(args.input)
        if '.' in baseinput:
            baseinput = baseinput.rsplit('.', 1)[0]
        shutil.copy(sourmashTSV, baseinput + '.sourmash-taxonomy.csv')
    if not args.debug:
        SafeRemove(args.workdir)
    if not args.pipe:
        status('Your next command might be:\n\tAAFTF rmdup -i {:} -o {:}\n'.
               format(args.outfile, nextOut))
def run(parser, args):
    """De novo assemble a mitochondrial genome with NOVOPlasty and orient it.

    Writes a NOVOPlasty config from the bundled template, runs the assembler,
    then either rotates a circularized assembly to a start gene (via
    ``orient_to_start``) or writes the uncircularized contigs as-is to
    ``args.out``.
    """
    # first check if NOVOPlasty and minimap2 are installed, else exit
    # fix: the check previously looked for 'NOVOplasty.pl' (wrong case) while
    # the command below invokes 'NOVOPlasty.pl'
    programs = ['NOVOPlasty.pl', 'minimap2']
    for x in programs:
        if not which_path(x):
            status('ERROR: {} is not installed, exiting'.format(x))
            sys.exit(1)
    # generate working directory
    unique_id = str(uuid.uuid4())[:8]
    if not args.workdir:
        args.workdir = 'mito_' + unique_id
    if not os.path.isdir(args.workdir):
        os.makedirs(args.workdir)
    # estimate read lengths of FASTQ
    read_len = GuessRL(args.left)
    # check for seed sequence, otherwise fall back to reference or bundled seed
    if not args.seed:
        if not args.reference:
            seedFasta = os.path.abspath(
                os.path.join(os.path.dirname(__file__), 'mito-seed.fasta'))
        else:
            seedFasta = os.path.abspath(args.reference)
    else:
        seedFasta = os.path.abspath(args.seed)
    # write the novoplasty config file by substituting placeholders in the template
    defaultConfig = os.path.join(os.path.dirname(__file__),
                                 'novoplasty-config.txt')
    novoConfig = os.path.join(args.workdir, 'novo-config.txt')
    if args.reference:
        refgenome = os.path.abspath(args.reference)
    else:
        refgenome = ''
    checkWords = ("<PROJECT>", "<MINLEN>", "<MAXLEN>", "<MAXMEM>", "<SEED>",
                  "<READLEN>", "<FORWARD>", "<REVERSE>", "<REFERENCE>")
    # 75% of system RAM is handed to NOVOPlasty as its memory cap
    repWords = (unique_id, str(args.minlen), str(args.maxlen),
                str(int(getRAM() * .75)), seedFasta, str(read_len),
                os.path.abspath(args.left), os.path.abspath(args.right),
                refgenome)
    with open(novoConfig, 'w') as outfile:
        with open(defaultConfig, 'r') as infile:
            for line in infile:
                for check, rep in zip(checkWords, repWords):
                    line = line.replace(check, rep)
                outfile.write(line)
    # run NOVOPlasty
    status('De novo assembling mitochondrial genome using NOVOplasty')
    cmd = ['NOVOPlasty.pl', '-c', 'novo-config.txt']
    printCMD(cmd)
    novolog = os.path.join(args.workdir, 'novoplasty.log')
    with open(novolog, 'w') as logfile:
        p1 = subprocess.Popen(cmd,
                              cwd=args.workdir,
                              stdout=logfile,
                              stderr=logfile)
        p1.communicate()
    # parse the results: prefer a circularized assembly, else contigs
    draftMito = None
    circular = False
    for f in os.listdir(args.workdir):
        if f.startswith('Circularized_assembly_'):
            draftMito = os.path.join(args.workdir, f)
            circular = True
            break
        if f.startswith('Contigs_1_'):
            draftMito = os.path.join(args.workdir, f)
            break
        if f.startswith('Uncircularized_assemblies_'):
            draftMito = os.path.join(args.workdir, f)
            break
    # fix: previously fell through to open(None) when NOVOPlasty produced
    # no recognizable output file
    if draftMito is None:
        status('ERROR: no NOVOPlasty output found in {}, check log: {}'.format(
            args.workdir, novolog))
        sys.exit(1)
    if circular:
        status('NOVOplasty assembled complete circular genome')
        if args.starting:
            status('Rotating assembly to start with {}'.format(args.starting))
        else:
            status('Rotating assembly to start with Cytochrome b (cob) gene')
        orient_to_start(draftMito,
                        args.out,
                        folder=args.workdir,
                        start=args.starting)
    else:
        numContigs = 0
        contigLength = 0
        with open(args.out, 'w') as outfile:
            with open(draftMito, 'r') as infile:
                for title, seq in SimpleFastaParser(infile):
                    numContigs += 1
                    contigLength += len(seq)
                    outfile.write('>contig_{}\n{}\n'.format(
                        numContigs, softwrap(seq)))
        status(
            'NOVOplasty assembled {} contigs consiting of {:,} bp, but was unable to circularize genome'
            .format(numContigs, contigLength))
    status('AAFTF mito complete: {}'.format(args.out))
    # NOTE(review): workdir is removed only when NOT piping -- confirm intent
    if not args.pipe:
        shutil.rmtree(args.workdir)
def run(parser,args):
    """Iteratively polish an assembly with Pilon.

    For each of ``args.iterations`` rounds: bwa-index the current assembly,
    map the reads, sort/index the BAM with samtools, and run Pilon with
    --changes.  The final iteration's FASTA is copied to ``args.outfile``
    (or ``<infile>.pilon.fasta``).
    """
    # find reads for pilon
    forReads, revReads = (None,)*2
    if args.left:
        forReads = os.path.abspath(args.left)
    if args.right:
        revReads = os.path.abspath(args.right)
    if not forReads:
        status('Unable to located FASTQ raw reads, pass via -l,--left and/or -r,--right')
        sys.exit(1)
    # remember whether the user supplied the workdir so we don't delete theirs
    custom_workdir = 1
    if not args.workdir:
        custom_workdir = 0
        args.workdir = 'aaftf-pilon_'+str(uuid.uuid4())[:8]
    if not os.path.exists(args.workdir):
        os.mkdir(args.workdir)
    bamthreads = 4
    if args.cpus < 4:
        bamthreads = args.cpus
    # NOTE(review): this handle is never closed; subprocess.DEVNULL would avoid that
    DEVNULL = open(os.devnull, 'w')
    for i in range(1, args.iterations+1):
        status('Starting Pilon polishing iteration {:}'.format(i))
        correctedFasta = 'pilon'+str(i)+'.fasta'
        if i == 1: #first loop
            initialFasta = args.infile
            shutil.copyfile(args.infile,
                            os.path.join(args.workdir,
                                         os.path.basename(args.infile)))
        else:
            # later rounds polish the previous round's output
            initialFasta = os.path.join(args.workdir,
                                        'pilon'+str(i-1)+'.fasta')
        pilonBAM = os.path.basename(initialFasta)+'.bwa.bam'
        if not os.path.isfile(os.path.join(args.workdir, pilonBAM)):
            # all external tools run with cwd=args.workdir, so basenames resolve there
            bwa_index = ['bwa', 'index', os.path.basename(initialFasta)]
            printCMD(bwa_index)
            subprocess.run(bwa_index, cwd=args.workdir, stderr=DEVNULL)
            bwa_cmd = ['bwa', 'mem', '-t', str(args.cpus),
                       os.path.basename(initialFasta), forReads]
            if revReads:
                bwa_cmd.append(revReads)
            # run BWA and pipe to samtools sort
            printCMD(bwa_cmd)
            p1 = subprocess.Popen(bwa_cmd, cwd=args.workdir,
                                  stdout=subprocess.PIPE, stderr=DEVNULL)
            p2 = subprocess.Popen(['samtools', 'sort', '-@', str(bamthreads),
                                   '-o', pilonBAM, '-'],
                                  cwd=args.workdir, stdout=subprocess.PIPE,
                                  stderr=DEVNULL, stdin=p1.stdout)
            p1.stdout.close()
            p2.communicate()
            # BAM file needs to be indexed for Pilon
            subprocess.run(['samtools', 'index', pilonBAM], cwd=args.workdir)
        # run Pilon
        pilon_cmd = ['pilon', '--genome', os.path.basename(initialFasta),
                     '--frags', pilonBAM,
                     '-Xmx{}g'.format(args.memory),
                     '--output', correctedFasta.split('.fasta')[0],
                     '--threads', str(args.cpus),
                     '--changes']
        pilon_log = 'pilon'+str(i)+'.log'
        printCMD(pilon_cmd)
        with open(os.path.join(args.workdir, pilon_log), 'w') as logfile:
            subprocess.run(pilon_cmd, cwd=args.workdir, stderr=logfile,
                           stdout=logfile)
        # each line in the .changes file is one correction Pilon applied
        num_changes = line_count(os.path.join(args.workdir,
                                              'pilon'+str(i)+'.changes'))
        status('Found {:,} changes in Pilon iteration {:}'.format(
            num_changes, i))
        # clean-up as we iterate to prevent tmp directory from blowing up
        # NOTE(review): on round 1 the bwa-index entries are infile-relative and
        # joined onto workdir below; on later rounds they are already full
        # workdir paths -- hence the i == 1 split
        dirty = [initialFasta+'.sa', initialFasta+'.amb',
                 initialFasta+'.ann', initialFasta+'.pac',
                 initialFasta+'.bwt',
                 os.path.join(args.workdir, pilonBAM),
                 os.path.join(args.workdir, pilonBAM+'.bai')]
        for f in dirty:
            if i == 1:
                if os.path.isfile(os.path.join(args.workdir, f)):
                    os.remove(os.path.join(args.workdir, f))
            else:
                if os.path.isfile(f):
                    os.remove(f)
    # copy last iteration to output
    if args.outfile:
        polishedFasta = args.outfile
    else:
        polishedFasta = os.path.basename(args.infile).split('.f')[0]+'.pilon.fasta'
    shutil.copyfile(os.path.join(args.workdir,
                                 'pilon'+str(args.iterations)+'.fasta'),
                    polishedFasta)
    status('AAFTF pilon completed {:} iterations.'.format(args.iterations))
    status('Pilon polished assembly: {:}'.format(polishedFasta))
    # suggest the next pipeline step's output name
    if '_' in polishedFasta:
        nextOut = polishedFasta.split('_')[0]+'.final.fasta'
    elif '.' in polishedFasta:
        nextOut = polishedFasta.split('.')[0]+'.final.fasta'
    else:
        nextOut = polishedFasta+'.final.fasta'
    # only remove the workdir if we created it and are not debugging
    if not args.debug and not custom_workdir:
        SafeRemove(args.workdir)
    if not args.pipe:
        status('Your next command might be:\n\tAAFTF sort -i {:} -o {:}\n'.format(polishedFasta, nextOut))
def parse_clean_blastn(fastafile, prefix, blastn, stringent):
    '''Parse a UniVec blastn report and write a vector-trimmed FASTA.

    Classifies each hit as weak/moderate/strong per NCBI VecScreen score
    thresholds (terminal hits, within 25 bp of either contig end, use lower
    cutoffs than internal hits).  With ``stringent == 'high'`` moderate+
    hits are acted on, otherwise only strong hits.  Terminal hits trim the
    nearer contig end; internal hits split the contig.  Fragments shorter
    than 200 bp are discarded.

    Blast header rows:
    qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue score qlen

    Returns a tuple ``(found_vector_seq, cleaned)``: the number of contigs
    with actionable hits and the path of the cleaned FASTA.
    '''
    cleaned = prefix + ".clean.fsa"
    logging = prefix + ".parse.log"
    excludes = {}
    # contig id -> list of (subject id, qlen, [start, end], score, terminal, position)
    VecHits = {}
    found_vector_seq = 0
    with open(blastn, "r") as vectab:
        rdr = csv.reader(vectab, delimiter="\t")
        for row in rdr:
            qaccver, saccver, pid, length, mismatch, gapopen, qstart, qend, sstart, send, evalue, bitscore, score, qlen = row
            # contigs_to_remove is a module-level dict shared with the caller;
            # skip contigs already flagged for removal
            if qaccver in contigs_to_remove:
                continue
            #vecscreen https://www.ncbi.nlm.nih.gov/tools/vecscreen/about/#Moderate
            #says to use score here (I'm interpret as score not bitscore)
            #need to determine if match is terminal or if internal
            loc = [int(qstart), int(qend)]
            if loc[0] > loc[1]:
                # normalize so loc is always [low, high]
                loc = [loc[1], loc[0]]
            # terminal = within 25 bp of either contig end
            terminal = False
            position = None
            if loc[0] <= 25:
                terminal = True
                position = '5'
            if (int(qlen) - loc[1]) <= 25:
                terminal = True
                position = '3'
            Match = 0  # weak=0, moderate=1, strong=2
            score = int(score)
            # VecScreen category thresholds differ for terminal vs internal hits
            if terminal:
                if score >= 19:
                    Match = 1
                if score >= 24:
                    Match = 2
            else:
                if score >= 25:
                    Match = 1
                if score >= 30:
                    Match = 2
            if Match == 0:
                continue
            # 'high' stringency acts on moderate+strong; otherwise strong only
            if stringent == 'high':
                if Match > 0:
                    found_vector_seq += 1
                    if not qaccver in VecHits:
                        VecHits[qaccver] = [(saccver, int(qlen), loc,
                                             int(score), terminal, position)]
                    else:
                        VecHits[qaccver].append(
                            (saccver, int(qlen), loc, int(score), terminal,
                             position))
            else:
                if Match > 1:
                    found_vector_seq += 1
                    if not qaccver in VecHits:
                        VecHits[qaccver] = [(saccver, int(qlen), loc,
                                             int(score), terminal, position)]
                    else:
                        VecHits[qaccver].append(
                            (saccver, int(qlen), loc, int(score), terminal,
                             position))
    trimTerminal = 0
    splitContig = 0
    with open(cleaned, "w") as output_handle, open(logging, "w") as log:
        for record in SeqIO.parse(fastafile, "fasta"):
            # trimming window: FiveEnd/ThreeEnd shrink inward from the contig ends
            FiveEnd = 0
            ThreeEnd = len(record.seq)
            internals = []
            slicer = []
            sInt = []
            Seq = str(record.seq)
            if not record.id in VecHits:
                # no vector hits: keep as-is (if long enough)
                if len(record.seq) >= 200:
                    output_handle.write('>{:}\n{:}\n'.format(
                        record.id, softwrap(Seq)))
            else:
                #VecHits contains list of tuples of information, if terminal, then just truncate
                #off the closest side. Also, need to check if multiple intervals are within 50
                #bp of each other, that whole interval is removed.
                #should be able to accomplish above with the several rounds that it runs with,
                #so split on internal and trim terminal. done.
                for hit in VecHits[record.id]:
                    ID, length, loc, score, terminal, pos = hit
                    if terminal and pos == '5':
                        if loc[1] > FiveEnd:
                            FiveEnd = loc[1]
                    elif terminal and pos == '3':
                        if loc[0] < ThreeEnd:
                            ThreeEnd = loc[0]
                    else:
                        #internal hits to add to list
                        if not loc in internals:
                            internals.append(loc)
                #now sort intervals
                sInt = sorted(internals, key=lambda x: int(x[0]))
                #now construct slicing list: alternating keep/cut boundaries
                if len(sInt) < 1:
                    slicer = [FiveEnd, ThreeEnd]
                else:
                    slicer = [FiveEnd]
                    for x in sInt:
                        slicer = slicer + x
                    slicer.append(ThreeEnd)
                # group() pairs the flat boundary list into (start, end) keep-slices
                paired_slicer = list(group(slicer, 2))
                if len(paired_slicer) < 2:
                    # single slice: only terminal trimming was needed
                    status('Terminal trimming {:} to {:}'.format(
                        record.id, paired_slicer))
                    newSeq = Seq[paired_slicer[0][0]:paired_slicer[0][1]]
                    if len(newSeq) >= 200:
                        output_handle.write('>{:}\n{:}\n'.format(
                            record.id, softwrap(newSeq)))
                else:
                    # multiple slices: split the contig around internal hits
                    status('Spliting contig {:} into {:}'.format(
                        record.id, paired_slicer))
                    for num, y in enumerate(paired_slicer):
                        newSeq = Seq[y[0]:y[1]]
                        if len(newSeq) >= 200:
                            output_handle.write('>split{:}_{:}\n{:}\n'.format(
                                num + 1, record.id, softwrap(newSeq)))
    return (found_vector_seq, cleaned)
def run(parser, args):
    """Screen an assembly for vector and contaminant sequence.

    Stages: (1) download/build BLAST databases; (2) blastn against common
    euk/prok contaminants and split contigs at strong hits; (3) blastn
    against mitochondrial DB, flagging whole contigs for removal; (4)
    iterative NCBI-style VecScreen against UniVec via ``parse_clean_blastn``
    until no hits remain.  Cleaned contigs go to ``args.outfile``;
    mito-flagged contigs go to ``<prefix>.mitochondria.fasta``.
    """
    if not args.workdir:
        args.workdir = 'aaftf-vecscreen_' + str(uuid.uuid4())[:8]
    if not os.path.exists(args.workdir):
        os.mkdir(args.workdir)
    # parse database locations: CLI flag wins, else $AAFTF_DB, else workdir
    DB = None
    if not args.AAFTF_DB:
        try:
            DB = os.environ["AAFTF_DB"]
        except KeyError:
            # fix: removed dead `if args.AAFTF_DB` branch -- it is unreachable
            # inside `if not args.AAFTF_DB`; DB simply stays None
            pass
    else:
        DB = args.AAFTF_DB
    infile = args.infile
    outfile = os.path.basename(args.outfile)
    outdir = os.path.dirname(args.outfile)
    if '.f' in outfile:
        prefix = outfile.rsplit('.f', 1)[0]
        print("prefix is ", prefix)
    else:
        prefix = str(os.getpid())
    if not outfile:
        outfile = "%s.vecscreen.fasta" % prefix
    outfile_vec = os.path.join(args.workdir,
                               "%s.tmp_vecscreen.fasta" % (prefix))
    # Common Euk/Prot contaminants for blastable DB later on
    status('Building BLAST databases for contamination screen.')
    for d in DB_Links:
        if d == 'sourmash':
            continue
        url = DB_Links[d]
        dbname = os.path.basename(str(url))
        if DB:
            dbfile = os.path.join(DB, dbname)
        else:
            dbfile = os.path.join(args.workdir, dbname)
        if dbfile.endswith(".gz"):
            nogz = os.path.splitext(dbfile)[0]
            if not os.path.exists(nogz):
                if not os.path.exists(dbfile):
                    urllib.request.urlretrieve(url, dbfile)
                with gzip.open(dbfile, 'rb') as ingz, open(nogz,
                                                           'wb') as outfa:
                    shutil.copyfileobj(ingz, outfa)
            make_blastdb('nucl', nogz, os.path.join(args.workdir, d))
        else:
            if not os.path.exists(dbfile):
                urllib.request.urlretrieve(url, dbfile)
            make_blastdb('nucl', dbfile, os.path.join(args.workdir, d))
    # shared with parse_clean_blastn, which reads it at module level
    global contigs_to_remove
    contigs_to_remove = {}
    regions_to_trim = {}
    # qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore
    for contam in ["CONTAM_EUKS", "CONTAM_PROKS"]:
        status("%s Contamination Screen" % (contam))
        blastreport = os.path.join(args.workdir,
                                   "%s.%s.blastn" % (contam, prefix))
        blastnargs = [
            'blastn', '-query', infile, '-db',
            os.path.join(args.workdir, contam), '-num_threads',
            str(args.cpus), '-dust', 'yes', '-soft_masking', 'true',
            '-perc_identity', BlastPercent_ID_ContamMatch, '-lcase_masking',
            '-outfmt', '6', '-out', blastreport
        ]
        printCMD(blastnargs)
        call(blastnargs)
        with open(blastreport) as report:
            colparser = csv.reader(report, delimiter="\t")
            for row in colparser:
                # identity/length tiers for a significant contaminant match
                if ((float(row[2]) >= 98.0 and int(row[3]) >= 50)
                        or (float(row[2]) >= 94.0 and int(row[3]) >= 100)
                        or (float(row[2]) >= 90.0 and int(row[3]) >= 200)):
                    # fix: start/end are now computed for EVERY matching row;
                    # previously the append branch reused coordinates from a
                    # prior row, recording wrong trim regions
                    if int(row[6]) < int(row[7]):
                        start = int(row[6])
                        end = int(row[7])
                    else:
                        start = int(row[7])
                        end = int(row[6])
                    if row[0] not in regions_to_trim:
                        regions_to_trim[row[0]] = [(start, end, contam,
                                                    row[1], float(row[2]))]
                    else:
                        regions_to_trim[row[0]].append(
                            (start, end, contam, row[1], float(row[2])))
        status('{:} screening finished'.format(contam))
    eukCleaned = os.path.join(args.workdir,
                              "%s.euk-prot_cleaned.fasta" % (prefix))
    if len(regions_to_trim) > 0:
        with open(eukCleaned, 'w') as cleanout:
            # fix: 'rU' mode was removed in Python 3.11
            with open(infile, 'r') as fastain:
                for record in SeqIO.parse(fastain, 'fasta'):
                    if record.id not in regions_to_trim:
                        cleanout.write('>{:}\n{:}\n'.format(
                            record.id, softwrap(str(record.seq))))
                    else:
                        # split the contig around each contaminated region
                        Seq = str(record.seq)
                        regions = regions_to_trim[record.id]
                        status(
                            'Splitting {:} due to contamination: {:}'.format(
                                record.id, regions))
                        lastpos = 0
                        newSeq = ''
                        for i, x in enumerate(regions):
                            newSeq = Seq[lastpos:x[0]]
                            lastpos = x[1]
                            cleanout.write('>split{:}_{:}\n{:}\n'.format(
                                i, record.id, softwrap(newSeq)))
                            if i == len(regions) - 1:
                                # tail after the final contaminated region
                                newSeq = Seq[x[1]:]
                                cleanout.write('>split{:}_{:}\n{:}\n'.format(
                                    i + 1, record.id, softwrap(newSeq)))
    else:
        eukCleaned = infile
    # MITO screen: matches >= 120 bp flag the whole contig as mitochondrial
    status('Mitochondria Contamination Screen')
    mitoHits = []
    blastreport = os.path.join(args.workdir,
                               "%s.%s.blastn" % ('MITO', prefix))
    blastnargs = [
        'blastn', '-query', eukCleaned, '-db',
        os.path.join(args.workdir, 'MITO'), '-num_threads',
        str(args.cpus), '-dust', 'yes', '-soft_masking', 'true',
        '-perc_identity', BlastPercent_ID_MitoMatch, '-lcase_masking',
        '-outfmt', '6', '-out', blastreport
    ]
    printCMD(blastnargs)
    call(blastnargs)
    with open(blastreport) as report:
        colparser = csv.reader(report, delimiter="\t")
        for row in colparser:
            if int(row[3]) >= 120:
                contigs_to_remove[row[0]] = ('MitoScreen', row[1],
                                             float(row[2]))
                mitoHits.append(row[0])
    status('Mito screening finished.')
    # vecscreen: iterate blastn vs UniVec + trim/split until no hits remain
    status(
        'Starting VecScreen, will remove terminal matches and split internal matches'
    )
    rnd = 0
    count = 1
    while (count > 0):
        filepref = "%s.r%d" % (prefix, rnd)
        report = os.path.join(args.workdir, "%s.vecscreen.tab" % (filepref))
        if not os.path.exists(report):
            # NCBI VecScreen-recommended blastn parameters
            cmd = [
                'blastn', '-task', 'blastn', '-reward', '1', '-penalty', '-5',
                '-gapopen', '3', '-gapextend', '3', '-dust', 'yes',
                '-soft_masking', 'true', '-evalue', '700', '-searchsp',
                '1750000000000', '-db',
                os.path.join(args.workdir, 'UniVec'), '-outfmt',
                '6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore score qlen',
                '-num_threads',
                str(args.cpus), '-query', eukCleaned, '-out', report
            ]
            call(cmd)
        status("Parsing VecScreen round {:}: {:} for {:}".format(
            rnd + 1, filepref, report))
        (count, cleanfile) = parse_clean_blastn(
            eukCleaned, os.path.join(args.workdir, filepref), report,
            args.stringency)
        status("count is %d cleanfile is %s" % (count, cleanfile))
        if count == 0:  # no vector matches remain
            status("copying %s to %s" % (eukCleaned, outfile_vec))
            shutil.copy(eukCleaned, outfile_vec)
        else:
            rnd += 1
            eukCleaned = cleanfile
    status("{:,} contigs will be removed:".format(len(contigs_to_remove)))
    for k, v in sorted(contigs_to_remove.items()):
        print('\t{:} --> dbhit={:}; hit={:}; pident={:}'.format(
            k, v[0], v[1], v[2]))
    # this could instead use the outfile and strip .fasta/fsa/fna and add
    # mito on it I suppose, but assumes a bit about the naming structure
    mitochondria = os.path.join(outdir, prefix + '.mitochondria.fasta')
    with open(args.outfile, "w") as output_handle, open(
            mitochondria, 'w') as mito_handle:
        for record in SeqIO.parse(outfile_vec, "fasta"):
            if record.id not in contigs_to_remove:
                SeqIO.write(record, output_handle, "fasta")
            elif record.id in mitoHits:
                SeqIO.write(record, mito_handle, "fasta")
    status('Writing {:,} cleaned contigs to: {:}'.format(
        countfasta(args.outfile), args.outfile))
    status('Writing {:,} mitochondrial contigs to: {:}'.format(
        countfasta(mitochondria), mitochondria))
    # suggest the next pipeline step's output name
    if '_' in args.outfile:
        nextOut = args.outfile.split('_')[0] + '.sourpurge.fasta'
    elif '.' in args.outfile:
        nextOut = args.outfile.split('.')[0] + '.sourpurge.fasta'
    else:
        nextOut = args.outfile + '.sourpurge.fasta'
    if not args.pipe:
        status(
            'Your next command might be:\n\tAAFTF sourpurge -i {:} -o {:} -c {:} --phylum Ascomycota\n'
            .format(args.outfile, nextOut, args.cpus))
    if not args.debug:
        SafeRemove(args.workdir)
def run(parser, args):
    """Filter reads that match contaminant databases (phiX, UniVec, user-supplied).

    Builds a combined contamination FASTA (downloaded accessions/URLs plus
    local files), then removes matching reads either by k-mer filtering
    (bbduk) or by alignment (bowtie2/bwa/minimap2) followed by extraction of
    unmapped reads with samtools.  Cleaned reads are written as
    <basename>_filtered_1.fastq.gz (and _2 for paired input).
    """
    # Track whether the user supplied --workdir; an auto-generated workdir
    # is removed at the end unless --debug is set.
    custom_workdir = 1
    if not args.workdir:
        custom_workdir = 0
        args.workdir = 'aaftf-filter_' + str(uuid.uuid4())[:8]
    if not os.path.exists(args.workdir):
        os.mkdir(args.workdir)
    #parse database locations
    DB = None
    if not args.AAFTF_DB:
        try:
            DB = os.environ["AAFTF_DB"]
        except KeyError:
            # NOTE(review): args.AAFTF_DB is falsy on this path, so the
            # branch below can never assign — effectively DB stays None.
            if args.AAFTF_DB:
                DB = args.AAFTF_DB
            else:
                pass
    else:
        DB = args.AAFTF_DB
    # samtools sort threads; capped by the requested CPU count.
    bamthreads = 4
    if args.cpus < 4:
        bamthreads = args.cpus
    # earliest_file_age records the newest ctime among DB inputs so the
    # combined contamdb is rebuilt when any component is newer than it.
    earliest_file_age = -1
    contam_filenames = []
    # db of contaminant (PhiX)
    for url in Contaminant_Accessions.values():
        acc = os.path.basename(url)
        if DB:
            acc_file = os.path.join(DB, acc)
        else:
            acc_file = os.path.join(args.workdir, acc)
        contam_filenames.append(acc_file)
        if not os.path.exists(acc_file):
            urllib.request.urlretrieve(url, acc_file)
        if (earliest_file_age < 0
                or earliest_file_age < os.path.getctime(acc_file)):
            earliest_file_age = os.path.getctime(acc_file)
    # download univec too
    url = DB_Links['UniVec']
    acc = os.path.basename(DB_Links['UniVec'])
    if DB:
        acc_file = os.path.join(DB, acc)
    else:
        acc_file = os.path.join(args.workdir, acc)
    contam_filenames.append(acc_file)
    if not os.path.exists(acc_file):
        urllib.request.urlretrieve(url, acc_file)
    if (earliest_file_age < 0
            or earliest_file_age < os.path.getctime(acc_file)):
        earliest_file_age = os.path.getctime(acc_file)
    # user-requested GenBank accessions to screen against
    if args.screen_accessions:
        for acc in args.screen_accessions:
            if DB:
                acc_file = os.path.join(DB, acc + ".fna")
                # fall back to the workdir copy when not cached in the DB dir
                if not os.path.exists(acc_file):
                    acc_file = os.path.join(args.workdir, acc + ".fna")
            else:
                acc_file = os.path.join(args.workdir, acc + ".fna")
            contam_filenames.append(acc_file)
            if not os.path.exists(acc_file):
                url = SeqDBs['nucleotide'] % (acc)
                urllib.request.urlretrieve(url, acc_file)
            if (earliest_file_age < 0
                    or earliest_file_age < os.path.getctime(acc_file)):
                earliest_file_age = os.path.getctime(acc_file)
    # user-requested FASTA URLs to screen against
    if args.screen_urls:
        for url in args.screen_urls:
            url_file = os.path.join(args.workdir, os.path.basename(url))
            contam_filenames.append(url_file)
            if not os.path.exists(url_file):
                urllib.request.urlretrieve(url, url_file)
            if (earliest_file_age < 0
                    or earliest_file_age < os.path.getctime(url_file)):
                earliest_file_age = os.path.getctime(url_file)
    # local FASTA files (e.g. a mito assembly from the pipeline) are used as-is
    if args.screen_local:
        for f in args.screen_local:
            contam_filenames.append(os.path.abspath(f))
    # concat vector db
    status('Generating combined contamination database:\n{:}'.format(
        '\n'.join(contam_filenames)))
    contamdb = os.path.join(args.workdir, 'contamdb.fa')
    # rebuild the combined db only if missing or older than its newest input
    if (not os.path.exists(contamdb)
            or (os.path.getctime(contamdb) < earliest_file_age)):
        with open(contamdb, 'wb') as wfd:
            for fname in contam_filenames:
                with open(fname, 'rb') as fd:
                    # reasonably fast copy for append
                    shutil.copyfileobj(fd, wfd)
    #find reads
    forReads, revReads = (None, ) * 2
    if args.left:
        forReads = os.path.abspath(args.left)
    if args.right:
        revReads = os.path.abspath(args.right)
    if not forReads:
        status("Must provide --left, unable to locate FASTQ reads")
        sys.exit(1)
    total = countfastq(forReads)
    if revReads:
        total = total * 2
    status('Loading {:,} total reads'.format(total))
    # seems like this needs to be stripping trailing extension?
    if not args.basename:
        if '_' in os.path.basename(forReads):
            args.basename = os.path.basename(forReads).split('_')[0]
        elif '.' in os.path.basename(forReads):
            args.basename = os.path.basename(forReads).split('.')[0]
        else:
            args.basename = os.path.basename(forReads)
    #logger.info('Loading {:,} FASTQ reads'.format(countfastq(forReads)))
    DEVNULL = open(os.devnull, 'w')
    alignBAM = os.path.join(args.workdir, args.basename + '_contam_db.bam')
    clean_reads = args.basename + "_filtered"
    # bbduk reference list: combined db plus its built-in phix/artifact sets
    refmatch_bbduk = [contamdb, 'phix', 'artifacts', 'lambda']
    if args.aligner == "bbduk":
        # k-mer based filtering: no BAM produced, function returns early below
        status('Kmer filtering reads using BBDuk')
        if args.memory:
            MEM = '-Xmx{:}g'.format(args.memory)
        else:
            MEM = '-Xmx{:}g'.format(round(0.6 * getRAM()))
        cmd = [
            'bbduk.sh', MEM, 't={:}'.format(args.cpus), 'hdist=1', 'k=27',
            'overwrite=true', 'in=%s' % (forReads),
            'out=%s_1.fastq.gz' % (clean_reads)
        ]
        if revReads:
            cmd.extend(
                ['in2=%s' % (revReads),
                 'out2=%s_2.fastq.gz' % (clean_reads)])
        cmd.extend(['ref=%s' % (",".join(refmatch_bbduk))])
        #cmd.extend(['prealloc','qhdist=1'])
        printCMD(cmd)
        if args.debug:
            subprocess.run(cmd)
        else:
            subprocess.run(cmd, stderr=DEVNULL)
        if not args.debug and not custom_workdir:
            SafeRemove(args.workdir)
        clean = countfastq('{:}_1.fastq.gz'.format(clean_reads))
        if revReads:
            clean = clean * 2
        status('{:,} reads mapped to contamination database'.format(
            (total - clean)))
        status('{:,} reads unmapped and writing to file'.format(clean))
        # NOTE(review): this message always reports paired filenames even
        # for single-end input — confirm intended.
        status('Filtering complete:\n\tFor: {:}\n\tRev: {:}'.format(
            clean_reads + '_1.fastq.gz', clean_reads + '_2.fastq.gz'))
        if not args.pipe:
            status(
                'Your next command might be:\n\tAAFTF assemble -l {:} -r {:} -c {:} -o {:}\n'
                .format(clean_reads + '_1.fastq.gz',
                        clean_reads + '_2.fastq.gz', args.cpus,
                        args.basename + '.spades.fasta'))
        return
    elif args.aligner == 'bowtie2':
        # likely not used and less accurate than bbmap?
        if not os.path.isfile(alignBAM):
            status('Aligning reads to contamination database using bowtie2')
            if (not os.path.exists(contamdb + ".1.bt2")
                    or os.path.getctime(contamdb + ".1.bt2") <
                    os.path.getctime(contamdb)):
                # (re)build index if no index or index is older than
                # the db
                bowtie_index = ['bowtie2-build', contamdb, contamdb]
                printCMD(bowtie_index)
                subprocess.run(bowtie_index, stderr=DEVNULL, stdout=DEVNULL)
            bowtie_cmd = [
                'bowtie2', '-x',
                os.path.basename(contamdb), '-p',
                str(args.cpus), '--very-sensitive'
            ]
            if forReads and revReads:
                bowtie_cmd = bowtie_cmd + ['-1', forReads, '-2', revReads]
            elif forReads:
                bowtie_cmd = bowtie_cmd + ['-U', forReads]
            #now run and write to BAM sorted
            printCMD(bowtie_cmd)
            # pipe aligner stdout straight into samtools sort (runs in workdir)
            p1 = subprocess.Popen(bowtie_cmd,
                                  cwd=args.workdir,
                                  stdout=subprocess.PIPE,
                                  stderr=DEVNULL)
            p2 = subprocess.Popen([
                'samtools', 'sort', '-@',
                str(bamthreads), '-o',
                os.path.basename(alignBAM), '-'
            ],
                                  cwd=args.workdir,
                                  stdout=subprocess.PIPE,
                                  stderr=DEVNULL,
                                  stdin=p1.stdout)
            # close our copy of the pipe so p1 sees SIGPIPE if p2 exits early
            p1.stdout.close()
            p2.communicate()
    elif args.aligner == 'bwa':
        # likely less accurate than bbduk so may not be used
        if not os.path.isfile(alignBAM):
            status('Aligning reads to contamination database using BWA')
            if (not os.path.exists(contamdb + ".amb")
                    or os.path.getctime(contamdb + ".amb") <
                    os.path.getctime(contamdb)):
                # (re)build BWA index when missing or stale
                bwa_index = ['bwa', 'index', contamdb]
                printCMD(bwa_index)
                subprocess.run(bwa_index, stderr=DEVNULL, stdout=DEVNULL)
            bwa_cmd = [
                'bwa', 'mem', '-t',
                str(args.cpus),
                os.path.basename(contamdb), forReads
            ]
            if revReads:
                bwa_cmd.append(revReads)
            #now run and write to BAM sorted
            printCMD(bwa_cmd)
            p1 = subprocess.Popen(bwa_cmd,
                                  cwd=args.workdir,
                                  stdout=subprocess.PIPE,
                                  stderr=DEVNULL)
            p2 = subprocess.Popen([
                'samtools', 'sort', '-@',
                str(bamthreads), '-o',
                os.path.basename(alignBAM), '-'
            ],
                                  cwd=args.workdir,
                                  stdout=subprocess.PIPE,
                                  stderr=DEVNULL,
                                  stdin=p1.stdout)
            p1.stdout.close()
            p2.communicate()
    elif args.aligner == 'minimap2':
        # likely not used but may be useful for pacbio/nanopore?
        if not os.path.isfile(alignBAM):
            status('Aligning reads to contamination database using minimap2')
            minimap2_cmd = [
                'minimap2', '-ax', 'sr', '-t',
                str(args.cpus),
                os.path.basename(contamdb), forReads
            ]
            if revReads:
                minimap2_cmd.append(revReads)
            #now run and write to BAM sorted
            printCMD(minimap2_cmd)
            p1 = subprocess.Popen(minimap2_cmd,
                                  cwd=args.workdir,
                                  stdout=subprocess.PIPE,
                                  stderr=DEVNULL)
            p2 = subprocess.Popen([
                'samtools', 'sort', '-@',
                str(bamthreads), '-o',
                os.path.basename(alignBAM), '-'
            ],
                                  cwd=args.workdir,
                                  stdout=subprocess.PIPE,
                                  stderr=DEVNULL,
                                  stdin=p1.stdout)
            p1.stdout.close()
            p2.communicate()
    else:
        status("Must specify bowtie2, bwa, or minimap2 for filtering")
    # Alignment-based branches converge here: extract unmapped reads from BAM.
    if os.path.isfile(alignBAM):
        #display mapping stats in terminal
        subprocess.run(['samtools', 'index', alignBAM])
        mapped, unmapped = bam_read_count(alignBAM)
        status('{:,} reads mapped to contamination database'.format(mapped))
        status('{:,} reads unmapped and writing to file'.format(unmapped))
        #now output unmapped reads from bamfile
        #this needs to be -f 5 so unmapped-pairs
        if forReads and revReads:
            # -f 12: read unmapped AND mate unmapped (both ends clean)
            samtools_cmd = [
                'samtools', 'fastq', '-f', '12', '-1',
                clean_reads + '_1.fastq.gz', '-2',
                clean_reads + '_2.fastq.gz', alignBAM
            ]
        elif forReads:
            # -f 4: read unmapped (single-end)
            samtools_cmd = [
                'samtools', 'fastq', '-f', '4', '-1',
                clean_reads + '.fastq.gz', alignBAM
            ]
        subprocess.run(samtools_cmd, stderr=DEVNULL)
        if not args.debug:
            SafeRemove(args.workdir)
        if revReads:
            status('Filtering complete:\n\tFor: {:}\n\tRev: {:}'.format(
                clean_reads + '_1.fastq.gz', clean_reads + '_2.fastq.gz'))
            if not args.pipe:
                status(
                    'Your next command might be:\n\tAAFTF assemble -l {:} -r {:} -c {:} -o {:}\n'
                    .format(clean_reads + '_1.fastq.gz',
                            clean_reads + '_2.fastq.gz', args.cpus,
                            args.basename + '.spades.fasta'))
        else:
            status('Filtering complete:\n\tSingle: {:}'.format(clean_reads +
                                                               '.fastq.gz'))
            if not args.pipe:
                status(
                    'Your next command might be:\n\tAAFTF assemble -l {:} -c {:} -o {:}\n'
                    .format(clean_reads + '.fastq.gz', args.cpus,
                            args.basename + '.spades.fasta'))
def run(parser, args):
    """Adapter/quality trim FASTQ reads with BBDuk (default) or Trimmomatic.

    Derives ``args.basename`` from the left reads when not given, runs the
    chosen trimmer, and writes gzipped trimmed reads as
    <basename>_1P/_2P.fastq.gz (paired) or <basename>_1U.fastq.gz (single).

    Fix: removed an accidentally duplicated status() call in the
    "Cannot find adaptors file" error path.
    """
    # default the output basename from the left-reads filename
    if not args.basename:
        if '_' in os.path.basename(args.left):
            args.basename = os.path.basename(args.left).split('_')[0]
        elif '.' in os.path.basename(args.left):
            args.basename = os.path.basename(args.left).split('.')[0]
        else:
            args.basename = os.path.basename(args.left)
    total = countfastq(args.left)
    if args.right:
        total = total * 2
    status('Loading {:,} total reads'.format(total))
    DEVNULL = open(os.devnull, 'w')
    if args.method == 'bbduk':
        # JVM heap: user-supplied or 60% of system RAM
        if args.memory:
            MEM = '-Xmx{:}g'.format(args.memory)
        else:
            MEM = '-Xmx{:}g'.format(round(0.6 * getRAM()))
        status('Adapter trimming using BBDuk')
        cmd = [
            'bbduk.sh', MEM, 'ref=adapters', 't={:}'.format(args.cpus),
            'ktrim=r', 'k=23', 'mink=11', 'minlen={:}'.format(args.minlen),
            'hdist=1', 'ftm=5', 'tpe', 'tbo', 'overwrite=true'
        ]
        if args.left and args.right:
            cmd += [
                'in1={:}'.format(args.left), 'in2={:}'.format(args.right),
                'out1={:}_1P.fastq.gz'.format(args.basename),
                'out2={:}_2P.fastq.gz'.format(args.basename)
            ]
        elif args.left:
            cmd += [
                'in={:}'.format(args.left),
                'out={:}_1U.fastq.gz'.format(args.basename)
            ]
        printCMD(cmd)
        # only show tool stderr when debugging
        if args.debug:
            subprocess.run(cmd)
        else:
            subprocess.run(cmd, stderr=DEVNULL)
        if args.right:
            clean = countfastq('{:}_1P.fastq.gz'.format(args.basename))
            clean = clean * 2
            status('{:,} reads remaining and writing to file'.format(clean))
            status('Trimming finished:\n\tFor: {:}\n\tRev {:}'.format(
                args.basename + '_1P.fastq.gz',
                args.basename + '_2P.fastq.gz'))
            if not args.pipe:
                status(
                    'Your next command might be:\n\tAAFTF filter -l {:} -r {:} -o {:} -c {:}\n'
                    .format(args.basename + '_1P.fastq.gz',
                            args.basename + '_2P.fastq.gz', args.basename,
                            args.cpus))
        else:
            clean = countfastq('{:}_1U.fastq.gz'.format(args.basename))
            status('{:,} reads remaining and writing to file'.format(clean))
            status('Trimming finished:\n\tSingle: {:}'.format(args.basename +
                                                              '_1U.fastq.gz'))
            if not args.pipe:
                status(
                    'Your next command might be:\n\tAAFTF filter -l {:} -o {:} -c {:}\n'
                    .format(args.basename + '_1U.fastq.gz', args.basename,
                            args.cpus))
    elif args.method == 'trimmomatic':
        #find path
        # NOTE(review): an auto-discovered trimmomatic takes precedence over
        # --trimmomatic; confirm that ordering is intended.
        trimmomatic_path = find_trimmomatic()
        if trimmomatic_path:
            jarfile = trimmomatic_path
        elif args.trimmomatic:
            jarfile = args.trimmomatic
        else:
            status(
                'Trimmomatic cannot be found - please provide location of trimmomatic.jar file.'
            )
            sys.exit(1)
        if jarfile:
            path_to_adaptors = args.trimmomatic_adaptors
            leadingwindow = "LEADING:%d" % (args.trimmomatic_leadingwindow)
            trailingwindow = "TRAILING:%d" % (args.trimmomatic_trailingwindow)
            slidingwindow = "SLIDINGWINDOW:%s" % (
                args.trimmomatic_slidingwindow)
            quality = args.trimmomatic_quality
            quality = "-%s" % (quality)  # add leading dash
            # locate an adaptor FASTA: next to the jar, then walking up to a
            # conda-style share/ directory
            if not os.path.exists(path_to_adaptors):
                if args.right:
                    path_to_adaptors = dirname(
                        jarfile) + "/adapters/TruSeq3-PE.fa"
                else:
                    path_to_adaptors = dirname(
                        jarfile) + "/adapters/TruSeq3-SE.fa"
                if not os.path.exists(path_to_adaptors):
                    findpath = dirname(jarfile)
                    path_to_adaptors = ""
                    while findpath:
                        if os.path.exists(findpath + "/share"):
                            if args.right:
                                path_to_adaptors = findpath + "/share/trimmomatic/adapters/TruSeq3-PE.fa"
                            else:
                                path_to_adaptors = findpath + "/share/trimmomatic/adapters/TruSeq3-SE.fa"
                            break
                        findpath = dirname(findpath)
                if not os.path.exists(path_to_adaptors):
                    # fixed: this message was emitted twice
                    status(
                        "Cannot find adaptors file, please specify manually")
                    return
            clipstr = args.trimmomatic_clip % (path_to_adaptors)
            cmd = []
            if args.left and args.right:
                cmd = [
                    'java', '-jar', jarfile, 'PE', '-threads',
                    str(args.cpus), quality, args.left, args.right,
                    args.basename + '_1P.fastq',
                    args.basename + '_1U.fastq',
                    args.basename + '_2P.fastq',
                    args.basename + '_2U.fastq', clipstr, leadingwindow,
                    trailingwindow, slidingwindow,
                    "MINLEN:%d" % (args.minlen)
                ]
            elif args.left and not args.right:
                cmd = [
                    'java', '-jar', jarfile, 'SE', '-threads',
                    str(args.cpus), quality, args.left,
                    args.basename + '_1U.fastq', clipstr, leadingwindow,
                    trailingwindow, slidingwindow,
                    "MINLEN:%d" % (args.minlen)
                ]
            else:
                status("Must provide left and right pairs or single read set")
                return
            status('Running trimmomatic adapter and quality trimming')
            printCMD(cmd)
            if args.debug:
                subprocess.run(cmd)
            else:
                subprocess.run(cmd, stderr=DEVNULL)
            if args.right:
                status('Compressing trimmed PE FASTQ files')
                Fzip_inplace(args.basename + '_1P.fastq', args.cpus)
                Fzip_inplace(args.basename + '_2P.fastq', args.cpus)
                # unpaired survivors are not kept
                SafeRemove(args.basename + '_1U.fastq')
                SafeRemove(args.basename + '_2U.fastq')
                status('Trimming finished:\n\tFor: {:}\n\tRev {:}'.format(
                    args.basename + '_1P.fastq.gz',
                    args.basename + '_2P.fastq.gz'))
                if not args.pipe:
                    status(
                        'Your next command might be:\n\tAAFTF filter -l {:} -r {:} -o {:} -c {:}\n'
                        .format(args.basename + '_1P.fastq.gz',
                                args.basename + '_2P.fastq.gz',
                                args.basename, args.cpus))
            else:
                status('Compressing trimmed SE FASTQ file')
                Fzip_inplace(args.basename + '_1U.fastq', args.cpus)
                status('Trimming finished:\n\tSingle: {:}'.format(
                    args.basename + '_1U.fastq.gz'))
                if not args.pipe:
                    status(
                        'Your next command might be:\n\tAAFTF filter -l {:} -o {:} -c {:}\n'
                        .format(args.basename + '_1U.fastq.gz',
                                args.basename, args.cpus))
def run(parser, args):
    """Run the entire AAFTF pipeline end-to-end.

    Chains the subcommands trim -> mito -> filter -> assemble -> vecscreen ->
    sourpurge -> rmdup -> pilon -> sort -> assess, skipping any stage whose
    output file already exists, and exiting with status 1 if a stage's
    expected output is missing afterwards.

    Fix: corrected the tool name in failure messages ('AATFT' -> 'AAFTF').
    """
    #script to run entire AAFTF pipeline
    args_dict = vars(args)
    basename = args_dict['basename']
    # default memory budget for stages: 75% of system RAM
    RAM = round(0.75 * getRAM())
    if not args.memory:
        args_dict['memory'] = str(RAM)
    #run trimming with bbduk
    if not checkfile(basename + '_1P.fastq.gz'):
        trimOpts = [
            'memory', 'left', 'right', 'basename', 'cpus', 'debug', 'minlen'
        ]
        # forward only the relevant pipeline options to the stage
        trimDict = {k: v for (k, v) in args_dict.items() if k in trimOpts}
        trimDict['method'] = 'bbduk'
        trimDict['pipe'] = True
        trimargs = Namespace(**trimDict)
        trim.run(parser, trimargs)
    else:
        if args.right:
            status('AAFTF trim output found: {:} {:}'.format(
                basename + '_1P.fastq.gz', basename + '_2P.fastq.gz'))
        else:
            status('AAFTF trim output found: {:}'.format(basename +
                                                         '_1P.fastq.gz'))
    if not checkfile(basename + '_1P.fastq.gz'):
        status('AAFTF trim failed')
        sys.exit(1)
    #run mitochondrial assembly on bbduk trimmed reads
    if args.right:
        if not checkfile(basename + '.mito.fasta'):
            mitoOpts = [
                'left', 'right', 'out', 'minlen', 'maxlen', 'seed',
                'starting', 'workdir', 'pipe', 'reference'
            ]
            mitoDict = {
                k: v
                for (k, v) in args_dict.items() if k in mitoOpts
            }
            mitoDict['left'] = basename + '_1P.fastq.gz'
            mitoDict['right'] = basename + '_2P.fastq.gz'
            mitoDict['out'] = basename + '.mito.fasta'
            mitoDict['minlen'] = 10000
            mitoDict['maxlen'] = 100000
            # ensure every expected attribute exists on the Namespace
            for x in mitoOpts:
                if not x in mitoDict:
                    mitoDict[x] = False
            mitoargs = Namespace(**mitoDict)
            mito.run(parser, mitoargs)
        else:
            status('AAFTF mito output found: {}'.format(basename +
                                                        '.mito.fasta'))
    else:
        status(
            'AAFTF mito requires PE reads, skipping mitochondrial de novo assembly'
        )
    #run filtering with bbduk
    if not checkfile(basename + '_filtered_1.fastq.gz'):
        filterOpts = [
            'screen_accessions', 'screen_urls', 'basename', 'cpus', 'debug',
            'memory', 'AAFTF_DB', 'workdir'
        ]
        filterDict = {
            k: v
            for (k, v) in args_dict.items() if k in filterOpts
        }
        filterDict['aligner'] = 'bbduk'
        filterDict['left'] = basename + '_1P.fastq.gz'
        if args.right:
            filterDict['right'] = basename + '_2P.fastq.gz'
        filterDict['pipe'] = True
        # screen out the mito assembly if one was produced
        if checkfile(basename + '.mito.fasta'):
            filterDict['screen_local'] = [basename + '.mito.fasta']
        filterargs = Namespace(**filterDict)
        aaftf_filter.run(parser, filterargs)
    else:
        if args.right:
            status('AAFTF filter output found: {:} {:}'.format(
                basename + '_filtered_1.fastq.gz',
                basename + '_filtered_2.fastq.gz'))
        else:
            status('AAFTF filter output found: {:}'.format(
                basename + '_filtered_1.fastq.gz'))
    if not checkfile(basename + '_filtered_1.fastq.gz'):
        status('AAFTF filter failed')
        sys.exit(1)
    #run assembly with spades
    if not checkfile(basename + '.spades.fasta'):
        assembleOpts = [
            'memory', 'cpus', 'debug', 'workdir', 'method',
            'assembler_args', 'tmpdir'
        ]
        assembleDict = {
            k: v
            for (k, v) in args_dict.items() if k in assembleOpts
        }
        assembleDict['left'] = basename + '_filtered_1.fastq.gz'
        if args.right:
            assembleDict['right'] = basename + '_filtered_2.fastq.gz'
        assembleDict['out'] = basename + '.spades.fasta'
        assembleDict['spades_tmpdir'] = None
        assembleDict['pipe'] = True
        assembleargs = Namespace(**assembleDict)
        assemble.run(parser, assembleargs)
    else:
        status('AAFTF assemble output found: {:}'.format(basename +
                                                         '.spades.fasta'))
    if not checkfile(basename + '.spades.fasta'):
        status('AAFTF assemble failed')
        sys.exit(1)
    #run vecscreen
    if not checkfile(basename + '.vecscreen.fasta'):
        vecOpts = ['cpus', 'debug', 'workdir', 'AAFTF_DB']
        vecDict = {k: v for (k, v) in args_dict.items() if k in vecOpts}
        vecDict['percent_id'] = False
        vecDict['stringency'] = 'high'
        vecDict['infile'] = basename + '.spades.fasta'
        vecDict['outfile'] = basename + '.vecscreen.fasta'
        vecDict['pipe'] = True
        vecargs = Namespace(**vecDict)
        vecscreen.run(parser, vecargs)
    else:
        status('AAFTF vecscreen output found: {:}'.format(basename +
                                                          '.vecscreen.fasta'))
    if not checkfile(basename + '.vecscreen.fasta'):
        status('AAFTF vecscreen failed')
        sys.exit(1)
    #run sourmash purge
    if not checkfile(basename + '.sourpurge.fasta'):
        sourOpts = [
            'cpus', 'debug', 'workdir', 'AAFTF_DB', 'phylum', 'sourdb',
            'mincovpct'
        ]
        sourDict = {k: v for (k, v) in args_dict.items() if k in sourOpts}
        sourDict['left'] = basename + '_filtered_1.fastq.gz'
        if args.right:
            sourDict['right'] = basename + '_filtered_2.fastq.gz'
        sourDict['input'] = basename + '.vecscreen.fasta'
        sourDict['outfile'] = basename + '.sourpurge.fasta'
        sourDict['taxonomy'] = False
        sourDict['pipe'] = True
        sourargs = Namespace(**sourDict)
        sourpurge.run(parser, sourargs)
    else:
        status('AAFTF sourpurge output found: {:}'.format(basename +
                                                          '.sourpurge.fasta'))
    if not checkfile(basename + '.sourpurge.fasta'):
        status('AAFTF sourpurge failed')
        sys.exit(1)
    #run remove duplicates
    if not checkfile(basename + '.rmdup.fasta'):
        rmdupOpts = ['cpus', 'debug', 'workdir']
        rmdupDict = {k: v for (k, v) in args_dict.items() if k in rmdupOpts}
        rmdupDict['input'] = basename + '.sourpurge.fasta'
        rmdupDict['out'] = basename + '.rmdup.fasta'
        rmdupDict['minlen'] = args_dict['mincontiglen']
        rmdupDict['percent_id'] = 95
        rmdupDict['percent_cov'] = 95
        rmdupDict['exhaustive'] = False
        rmdupDict['pipe'] = True
        rmdupargs = Namespace(**rmdupDict)
        rmdup.run(parser, rmdupargs)
    else:
        status('AAFTF rmdup output found: {:}'.format(basename +
                                                      '.rmdup.fasta'))
    if not checkfile(basename + '.rmdup.fasta'):
        status('AAFTF rmdup failed')
        sys.exit(1)
    #run pilon to error-correct
    if not checkfile(basename + '.pilon.fasta'):
        pilonOpts = ['cpus', 'debug', 'workdir', 'iterations', 'memory']
        pilonDict = {k: v for (k, v) in args_dict.items() if k in pilonOpts}
        pilonDict['infile'] = basename + '.rmdup.fasta'
        pilonDict['outfile'] = basename + '.pilon.fasta'
        pilonDict['left'] = basename + '_filtered_1.fastq.gz'
        if args.right:
            pilonDict['right'] = basename + '_filtered_2.fastq.gz'
        pilonDict['pipe'] = True
        pilonargs = Namespace(**pilonDict)
        pilon.run(parser, pilonargs)
    else:
        status('AAFTF pilon output found: {:}'.format(basename +
                                                      '.pilon.fasta'))
    if not checkfile(basename + '.pilon.fasta'):
        status('AAFTF pilon failed')
        sys.exit(1)
    #sort and rename
    if not checkfile(basename + '.final.fasta'):
        sortDict = {
            'input': basename + '.pilon.fasta',
            'out': basename + '.final.fasta',
            'name': 'scaffold',
            'minlen': args_dict['mincontiglen']
        }
        sortargs = Namespace(**sortDict)
        aaftf_sort.run(parser, sortargs)
    else:
        status('AAFTF sort output found: {:}'.format(basename +
                                                     '.final.fasta'))
    if not checkfile(basename + '.final.fasta'):
        status('AAFTF sort failed')
        sys.exit(1)
    #assess the assembly
    assessDict = {'input': basename + '.final.fasta', 'report': False}
    assessargs = Namespace(**assessDict)
    assess.run(parser, assessargs)
def main(): ######################################### # create the top-level parser ######################################### parser = argparse.ArgumentParser( prog='AAFTF', formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("-q", "--quiet", help="Do not output warnings to stderr", action="store_true", dest="quiet") parser.add_argument("-v", "--version", help="Installed AAFTF version", action="version", version="%(prog)s " + str(myversion)) subparsers = parser.add_subparsers(title='[sub-commands]', dest='command', parser_class=ArgumentParserWithDefaults) ######################################### # create the individual tool parsers ######################################### ########## # trim ########## # arguments # --trimmomatic: arguments are path to JAR or application respectively # assume java is PATH already for trimmomatic # -o / --outdir: write outdir # -p / --prefix: outfile prefix # -ml / --minlen: min read length # read info, either paired data are required or singleton # --left: left or forward reads # --right: right or reverse reads # currently singleton / unpaired reads not supported? 
parser_trim = subparsers.add_parser( 'trim', description= "This comamnd trims reads in FASTQ format to remove low quality reads and trim adaptor sequences", help='Trim FASTQ input reads') parser_trim.add_argument( '-o', '--out', type=str, required=False, dest='basename', help="Output basename, default to base name of --left reads") parser_trim.add_argument('-c', '--cpus', type=int, metavar="cpus", required=False, default=1, help="Number of CPUs/threads to use.") parser_trim.add_argument( '-ml', '--minlen', type=int, default=75, required=False, help="Minimum read length after trimming, default: 75") parser_trim.add_argument( '-l', '--left', type=str, required=True, help='left/forward reads of paired-end FASTQ or single-end FASTQ.') parser_trim.add_argument('-r', '--right', type=str, required=False, help='right/reverse reads of paired-end FASTQ.') parser_trim.add_argument('-v', '--debug', action='store_true', help="Provide debugging messages") parser_trim.add_argument('--pipe', action='store_true', help="AAFTF is running in pipeline mode") parser_trim.add_argument('--method', default='bbduk', choices=['bbduk', 'trimmomatic'], help='Program to use for adapter trimming') parser_trim.add_argument('-m', '--memory', type=int, dest='memory', required=False, help="Max Memory (in GB)") tool_group = parser_trim.add_mutually_exclusive_group(required=False) tool_group.add_argument('--trimmomatic', '--jar', metavar='trimmomatic_jar', type=str, required=False, help='Trimmomatic JAR path') trimmomatic_group = parser_trim.add_argument_group( title='Trimmomatic options', description="Trimmomatic trimming options") trimmomatic_group.add_argument( '--trimmomatic_adaptors', default="TruSeq3-PE.fa", help="Trimmomatic adaptor file, default: TruSeq3-PE.fa") trimmomatic_group.add_argument( '--trimmomatic_clip', default="ILLUMINACLIP:%s:2:30:10", help="Trimmomatic clipping, default: ILLUMINACLIP:TruSeq3-PE.fa:2:30:10" ) trimmomatic_group.add_argument( '--trimmomatic_leadingwindow', 
default="3", type=int, help="Trimmomatic window processing arguments, default: LEADING:3") trimmomatic_group.add_argument( '--trimmomatic_trailingwindow', default="3", type=int, help="Trimmomatic window processing arguments, default: TRAILING:3") trimmomatic_group.add_argument( '--trimmomatic_slidingwindow', default="4:15", type=str, help= "Trimmomatic window processing arguments, default: SLIDINGWINDOW:4:15") trimmomatic_group.add_argument( '--trimmomatic_quality', default="phred33", help="Trimmomatic quality encoding -phred33 or phred64") ########## # mito-asm assembly mitochondrial genome ########## parser_mito = subparsers.add_parser( 'mito', description= "De novo assembly of mitochondrial genome using NOVOplasty, takes PE Illumina adapter trimmed data.", help='De novo assembly of mitochondrial genome') parser_mito.add_argument('-l', '--left', required=True, help="Left (Forward) reads") parser_mito.add_argument('-r', '--right', required=True, help="Right (Reverse) reads") parser_mito.add_argument('-o', '--out', type=str, required=True, help="Output FASTA file for mitochondrial genome") parser_mito.add_argument('--minlen', default=10000, type=int, help="Minimum expected genome size") parser_mito.add_argument('--maxlen', default=100000, type=int, help="Maximum expected genome size") parser_mito.add_argument( '-s', '--seed', required=False, help= "Seed sequence, ie related mitochondrial genome, Default: A. 
nidulans") parser_mito.add_argument( '--starting', required=False, help="FASTA file of start sequence, rotate genome to, default COB") parser_mito.add_argument('--reference', required=False, help="Run NOVOplasty in reference mode") parser_mito.add_argument( '-w', '--workdir', '--tmpdir', type=str, dest='workdir', required=False, help="Temporary directory to store datafiles and processes in") parser_mito.add_argument('--pipe', action='store_true', help="AAFTF is running in pipeline mode") ########## # filter ########## # arguments # -i / --indir: input dir # -p / --prefix: in fastq reads and output prefix # -a / --screen_accessions: screening sequence GenBank accessions # -u / --screen_urls: screening sequence URLs (fasta format) # --debug: print debug messages and do no remove contamdb BAM file # read info, either paired data are required or singleton # --left: left or forward reads # --right: right or reverse reads # or value from --prefix # --aligner: bbduk bwa, bowtie2, minimap for read alignment to contamdb parser_filter = subparsers.add_parser( 'filter', description= "Filter reads which match contaminant databases such as phiX", help='Filter contaminanting reads') parser_filter.add_argument('-w', '--workdir', type=str, help="temp directory") parser_filter.add_argument('-c', '--cpus', type=int, metavar="cpus", required=False, default=1, help="Number of CPUs/threads to use.") parser_filter.add_argument('-o', '--out', dest='basename', type=str, required=False, help="Output basename") parser_filter.add_argument( '-v', '--debug', action='store_true', help= "Provide debugging messages and do not remove contamdb matching BAM") parser_filter.add_argument( '-a', '--screen_accessions', type=str, nargs="*", help="Genbank accession number(s) to screen out from initial reads.") parser_filter.add_argument( '-u', '--screen_urls', type=str, nargs="*", help="URLs to download and screen out initial reads.") parser_filter.add_argument( '-s', '--screen_local', type=str, 
nargs="+", help="Local FASTA file(s) to use contamination screen") parser_filter.add_argument('-l', '--left', required=True, help="Left (Forward) reads") parser_filter.add_argument('-r', '--right', required=False, help="Right (Reverse) reads") parser_filter.add_argument( '--AAFTF_DB', type=str, required=False, help="Path to AAFTF resources, defaults to $AAFTF_DB") parser_filter.add_argument( '--aligner', default='bbduk', choices=['bbduk', 'bowtie2', 'bwa', 'minimap2'], help='Aligner to use to map reads to contamination database') parser_filter.add_argument('-m', '--memory', type=int, dest='memory', required=False, help="Max Memory (in GB)") parser_filter.add_argument('--pipe', action='store_true', help="AAFTF is running in pipeline mode") ########## # assemble ########## # arguments # -i / --indir: input folder # -o / --outdir: output folder # -p / --prefix: input/outfile prefix # --paired or --unpaired # --spades # --tmpdir: tempdir for spades parser_asm = subparsers.add_parser( 'assemble', description="Run assembler on cleaned reads", help='Assemble reads') parser_asm.add_argument('--method', type=str, required=False, default="spades", help="Assembly method: spades, dipspades, megahit") parser_asm.add_argument( '-o', '--out', type=str, required=True, # think about sensible replacement in future help="Output assembly FASTA") parser_asm.add_argument('-w', '--workdir', type=str, dest='workdir', help="assembly output directory") parser_asm.add_argument('-c', '--cpus', type=int, metavar="cpus", required=False, default=1, help="Number of CPUs/threads to use.") parser_asm.add_argument( '-m', '--memory', type=str, dest='memory', required=False, default='32', help="Memory (in GB) setting for SPAdes. 
Default is 32")
    # (the line above completes the help string of an argument defined
    #  before this chunk — presumably the assembler memory option; confirm)
    parser_asm.add_argument('-l', '--left', required=False,
                            help="Left (Forward) reads")
    parser_asm.add_argument('-r', '--right', required=False,
                            help="Right (Reverse) reads")
    parser_asm.add_argument('-v', '--debug', action='store_true',
                            help="Print Spades stdout to terminal")
    parser_asm.add_argument('--tmpdir', type=str, required=False,
                            help="Assembler temporary dir")
    parser_asm.add_argument('--assembler_args', action='append',
                            required=False,
                            help="Additional SPAdes/Megahit arguments")
    parser_asm.add_argument('--haplocontigs', dest='haplocontigs',
                            default=False, action='store_true',
                            help="For dipSPAdes take the haplocontigs file")
    parser_asm.add_argument('--pipe', action='store_true',
                            help="AAFTF is running in pipeline mode")

    ##########
    # vecscreen
    ##########
    # arguments
    # -i / --input: input assembly file
    # -o / --outfile: output cleaned assembly
    # --prefix: Prefix for output / temp files
    # --tmpdir
    # --pid / percent_id
    parser_vecscreen = subparsers.add_parser(
        'vecscreen',
        description="Screen contigs for vector and common contaminantion",
        help='Vector and Contaminant Screening of assembled contigs')
    parser_vecscreen.add_argument('-c', '--cpus', type=int, metavar="cpus",
                                  default=1,
                                  help="Number of CPUs/threads to use.")
    parser_vecscreen.add_argument('-i', '--input', '--infile', type=str,
                                  required=True, dest='infile',
                                  help="Input contigs or scaffold assembly")
    parser_vecscreen.add_argument(
        '-o', '--outfile', type=str, required=True,
        help="Output vector screened and cleaned assembly")
    # NOTE: '-pid' is a single-dash multi-character option name, registered
    # explicitly with argparse alongside '--percent_id'
    parser_vecscreen.add_argument(
        '-pid', '--percent_id', type=int, required=False,
        help="Percent Identity cutoff for vecscreen adaptor matches")
    parser_vecscreen.add_argument(
        '-w', '--workdir', '--tmpdir', type=str,
        help="Working directory to store datafiles and processes in")
    parser_vecscreen.add_argument(
        '--AAFTF_DB', type=str, required=False,
        help="Path to AAFTF resources, defaults to $AAFTF_DB")
    parser_vecscreen.add_argument('-s', '--stringency', default='high',
                                  choices=['high', 'low'],
                                  help="Stringency to filter VecScreen hits")
    parser_vecscreen.add_argument('-v', '--debug', action='store_true',
                                  dest='debug',
                                  help="Provide debugging messages")
    parser_vecscreen.add_argument('--pipe', action='store_true',
                                  help="AAFTF is running in pipeline mode")

    ##########
    # sourpurge
    ##########
    # arguments
    # -a / --assembly: input assembly file
    # -o / --out: output cleaned assembly file
    # -p / --prefix: datafile prefix and temp/output file prefix
    # -i / --indir: directory where sequence reads are located
    # -c / --cpus: number of cpus
    # --tmpdir
    # --phylum: phylum to keep
    parser_sour = subparsers.add_parser(
        'sourpurge',
        description="Purge contigs based on sourmash results",
        help='Purge contigs based on sourmash results')
    parser_sour.add_argument('-i', '--input', type=str, required=True,
                             help="Input contigs or scaffold assembly")
    parser_sour.add_argument(
        '-o', '--outfile', type=str, required=True,
        # think about sensible replacement in future
        help="Output sourmash cleaned assembly")
    parser_sour.add_argument('-l', '--left', required=False,
                             help="Left (Forward) reads")
    parser_sour.add_argument('-r', '--right', required=False,
                             help="Right (Reverse) reads")
    parser_sour.add_argument(
        '-p', '--phylum', required=True, nargs="+",
        help="Phylum or Phyla to keep matches, i.e. Ascomycota")
    parser_sour.add_argument('--sourdb', required=False,
                             help="SourMash LCA k-31 taxonomy database")
    parser_sour.add_argument('-mc', '--mincovpct', default=5, type=int,
                             help="Minimum percent of N50 coverage to remove")
    parser_sour.add_argument('-c', '--cpus', type=int, metavar="cpus",
                             default=1,
                             help="Number of CPUs/threads to use.")
    parser_sour.add_argument(
        '-w', '--workdir', '--tmpdir', type=str, dest='workdir',
        required=False,
        help="Temporary directory to store datafiles and processes in")
    parser_sour.add_argument('-v', '--debug', action='store_true',
                             dest='debug',
                             help="Provide debugging messages")
    parser_sour.add_argument(
        '--AAFTF_DB', type=str, required=False,
        help="Path to AAFTF resources, defaults to $AAFTF_DB")
    parser_sour.add_argument('--just-show-taxonomy', dest='taxonomy',
                             action='store_true',
                             help="Show taxonomy information and exit")
    parser_sour.add_argument('--pipe', action='store_true',
                             help="AAFTF is running in pipeline mode")

    ##########
    # rmdup
    ##########
    # -i / --input
    # -o / --out
    # --tmpdir
    # --percent_id
    # --mincovpct
    # -ml / --minlen
    # --exhaustive
    # --debug
    parser_rmdup = subparsers.add_parser(
        'rmdup',
        description="Remove duplicate contigs",
        help='Remove duplicate contigs')
    parser_rmdup.add_argument(
        '-i', '--input', type=str, required=True,
        help="Input Assembly fasta file(contigs or scaffolds)")
    parser_rmdup.add_argument(
        '-o', '--out', type=str, required=True,
        help=
        "Output new version of assembly with duplicated contigs/scaffolds removed"
    )
    parser_rmdup.add_argument('-c', '--cpus', type=int, metavar="cpus",
                              required=False, default=1,
                              help="Number of CPUs/threads to use.")
    parser_rmdup.add_argument(
        '-w', '--workdir', '--tmpdir', dest='workdir', type=str,
        required=False,
        help="Temporary directory to store datafiles and processes in")
    parser_rmdup.add_argument(
        '-pid', '--percent_id', type=int, dest='percent_id', required=False,
        default=95,
        help="Percent Identity used in matching contigs for redundancy")
    parser_rmdup.add_argument(
        '-pcov', '--percent_cov', type=int, dest='percent_cov',
        required=False, default=95,
        help="Coverage of contig used to decide if it is redundant")
    parser_rmdup.add_argument(
        '-ml', '--minlen', type=int, required=False, default=500,
        help="Minimum contig length to keep, shorter ones are dropped")
    parser_rmdup.add_argument(
        '--exhaustive', action='store_true',
        help=
        "Compute overlaps for every contig, otherwise only process contigs for L75 and below"
    )
    parser_rmdup.add_argument(
        '--debug', action='store_true',
        help='Run rmdup in debugging mode for more output')
    parser_rmdup.add_argument('--pipe', action='store_true',
                              help="AAFTF is running in pipeline mode")

    ##########
    # pilon
    ##########
    # arguments
    # -i / --in: input assembly file
    # -o / --out: output cleaned assembly
    # -rp / --reads-prefix: input/outfile reads prefix
    # --iterations: default 5
    # --tmpdir
    # --debug
    parser_pilon = subparsers.add_parser(
        'pilon',
        description="Polish contig sequences with Pilon",
        help='Polish contig sequences with Pilon')
    parser_pilon.add_argument('-o', '--out', '--outfile', type=str,
                              dest='outfile', required=True,
                              help="Output Pilon polished assembly")
    parser_pilon.add_argument('-i', '--infile', '--input', type=str,
                              dest='infile', required=True,
                              help="Input contigs or scaffold assembly")
    parser_pilon.add_argument('-c', '--cpus', type=int, metavar="cpus",
                              default=1,
                              help="Number of CPUs/threads to use.")
    parser_pilon.add_argument('-m', '--memory', type=int, default=4,
                              dest='memory', required=False,
                              help="Max Memory (in GB) (default is 4gb)")
    parser_pilon.add_argument('-v', '--debug', action='store_true',
                              help="Provide debugging messages")
    parser_pilon.add_argument(
        '-it', '--iterations', type=int, default=5,
        help="Number of Polishing iterations to run (default is 5)")
    parser_pilon.add_argument(
        '-l', '--left', type=str, required=True,
        help=
        'The name of the left/forward reads of paired-end FASTQ formatted reads.'
    )
    parser_pilon.add_argument(
        '-r', '--right', type=str, required=True,
        help=
        'The name of the right/reverse reads of paired-end FASTQ formatted reads.'
    )
    parser_pilon.add_argument(
        '-w', '--workdir', '--tmpdir', type=str, dest='workdir',
        required=False,
        help="Temporary directory to store datafiles and processes in")
    parser_pilon.add_argument('--pipe', action='store_true',
                              help="AAFTF is running in pipeline mode")

    ##########
    # sort/rename FASTA headers
    ##########
    # arguments
    # -i / --input: input assembly file
    # -o / --out: output assembly file
    # -n / --name: base name to use default=scaffolds_
    parser_sort = subparsers.add_parser(
        'sort',
        description="Sort contigs by length and rename FASTA headers",
        help='Sort contigs by length and rename FASTA headers')
    parser_sort.add_argument('-i', '--input', '--infile', required=True,
                             dest='input',
                             help='Input genome assembly FASTA')
    parser_sort.add_argument('-o', '--out', '--output', required=True,
                             dest='out',
                             help='Output genome assembly FASTA')
    parser_sort.add_argument(
        '-ml', '--minlen', type=int, required=False, default=0,
        help="Minimum contig length to keep, shorter ones are dropped")
    parser_sort.add_argument('-n', '--name', '--basename',
                             default='scaffold', dest='name',
                             help='Basename to rename FASTA headers')

    ##########
    # assess completeness
    ##########
    # arguments
    # -i / --input: input assembly file
    # -r / --report: report file (otherwise stdout)
    # --tmpdir
    parser_assess = subparsers.add_parser(
        'assess',
        description="Assess completeness of genome assembly",
        help='Assess completeness of genome assembly')
    parser_assess.add_argument(
        '-i', '--input', '--infile', required=True,
        help=
        'Input genome assembly to test completeness and provide summary statistics'
    )
    parser_assess.add_argument(
        '-r', '--report', type=str,
        help=
        'Filename to save report information otherwise will print to stdout')

    ##########
    # pipeline run it all
    ##########
    # arguments cover the whole pipeline: input reads, assembler choice,
    # screening accessions/URLs, polishing iterations, and sourmash options
    parser_pipeline = subparsers.add_parser(
        'pipeline',
        description="Run entire AAFTF pipeline automagically",
        help='Run AAFTF pipeline')
    parser_pipeline.add_argument('--tmpdir', type=str, required=False,
                                 help="Assembler temporary dir")
    parser_pipeline.add_argument('--assembler_args', action='append',
                                 required=False,
                                 help="Additional SPAdes/Megahit arguments")
    parser_pipeline.add_argument(
        '--method', type=str, required=False, default="spades",
        help="Assembly method: spades, dipspades, megahit")
    parser_pipeline.add_argument(
        '-l', '--left', type=str, required=True,
        help='left/forward reads of paired-end FASTQ or single-end FASTQ.')
    parser_pipeline.add_argument(
        '-r', '--right', type=str, required=False,
        help='right/reverse reads of paired-end FASTQ.')
    parser_pipeline.add_argument(
        '-o', '--out', type=str, required=True, dest='basename',
        help="Output basename, default to base name of --left reads")
    parser_pipeline.add_argument('-c', '--cpus', type=int, metavar="cpus",
                                 required=False, default=1,
                                 help="Number of CPUs/threads to use.")
    parser_pipeline.add_argument(
        '-m', '--memory', type=str, dest='memory', required=False,
        help="Memory (in GB) setting for SPAdes. Default is Auto")
    parser_pipeline.add_argument(
        '-ml', '--minlen', type=int, default=75, required=False,
        help="Minimum read length after trimming, default: 75")
    parser_pipeline.add_argument(
        '-a', '--screen_accessions', type=str, nargs="*",
        help="Genbank accession number(s) to screen out from initial reads.")
    parser_pipeline.add_argument(
        '-u', '--screen_urls', type=str, nargs="*",
        help="URLs to download and screen out initial reads.")
    parser_pipeline.add_argument(
        '-it', '--iterations', type=int, default=5,
        help="Number of Pilon Polishing iterations to run")
    parser_pipeline.add_argument('-mc', '--mincontiglen', type=int,
                                 default=500, required=False,
                                 help="Minimum length of contigs to keep")
    parser_pipeline.add_argument(
        '--AAFTF_DB', type=str, required=False,
        help="Path to AAFTF resources, defaults to $AAFTF_DB")
    parser_pipeline.add_argument('-w', '--workdir', type=str,
                                 help="temp directory")
    parser_pipeline.add_argument('-v', '--debug', action='store_true',
                                 help="Provide debugging messages")
    parser_pipeline.add_argument(
        '-p', '--phylum', required=True, nargs="+",
        help="Phylum or Phyla to keep matches, i.e. Ascomycota")
    parser_pipeline.add_argument('--sourdb', required=False,
                                 help="SourMash LCA k-31 taxonomy database")
    parser_pipeline.add_argument(
        '--mincovpct', default=5, type=int,
        help="Minimum percent of N50 coverage to remove")

    # set defaults: every subparser dispatches through run_subtool
    parser.set_defaults(func=run_subtool)

    ### process args now ###
    # if no args then print help and exit
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)
    args = parser.parse_args()
    try:
        status('Running AAFTF v{:}'.format(myversion))
        args.func(parser, args)
    except IOError as e:
        # errno 32 is EPIPE: ignore broken-pipe errors (SIGPIPE) so that
        # piping output through e.g. `head` does not produce a traceback
        if e.errno != 32:
            raise