def make_blastdb(type, file, name):
    indexfile = name
    if type == 'nucl':
        indexfile += ".nin"
    else:
        indexfile += ".pin"
    # rebuild only if the index is missing or older than the source FASTA
    if not os.path.exists(indexfile) or \
            os.path.getctime(indexfile) < os.path.getctime(file):
        cmd = ['makeblastdb', '-dbtype', type, '-in', file, '-out', name]
        printCMD(cmd)
        call(cmd, stdout=DEVNULL, stderr=DEVNULL)
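
# A minimal sketch (not part of AAFTF) of the staleness test make_blastdb
# performs, factored out for clarity; 'index' and 'source' are hypothetical
# paths such as 'contam.nin' and 'contam.fasta'.
def _blastdb_is_stale(index, source):
    # rebuild when the index is absent or older than its source FASTA
    return (not os.path.exists(index)
            or os.path.getctime(index) < os.path.getctime(source))
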
def run_dipspades(parser, args):
    if not args.workdir:
        args.workdir = 'dipspades_' + str(os.getpid())
    runcmd = ['dipspades.py', '--threads', str(args.cpus),
              '--cov-cutoff', 'auto', '--mem', args.memory,
              '-o', args.workdir]
    if args.assembler_args:
        runcmd.extend(args.assembler_args)
    if args.haplocontigs:
        runcmd.extend(['--hap', args.haplocontigs])
    if args.tmpdir:
        runcmd.extend(['--tmp-dir', args.tmpdir])
    # find reads -- use --left/right or look for cleaned in tmpdir
    forReads, revReads = (None,) * 2
    if args.left:
        forReads = os.path.abspath(args.left)
    if args.right:
        revReads = os.path.abspath(args.right)
    if not forReads:
        status('Unable to locate FASTQ raw reads, provide --left')
        sys.exit(1)
    if not revReads:
        runcmd = runcmd + ['-s', forReads]
    else:
        runcmd = runcmd + ['--pe1-1', forReads, '--pe1-2', revReads]
    # this basically overrides everything above and only runs --continue
    # if the working directory already exists
    if os.path.isdir(args.workdir):
        runcmd = ['dipspades.py', '-o', args.workdir, '--continue']
    # now run the spades job
    status('Assembling FASTQ data using Spades')
    printCMD(runcmd)
    DEVNULL = open(os.devnull, 'w')
    if args.debug:
        subprocess.run(runcmd)
    else:
        subprocess.run(runcmd, stdout=DEVNULL, stderr=DEVNULL)
    # pull out assembly
    if args.out:
        finalOut = args.out
    else:
        finalOut = prefix + '.dipspades.fasta'
    dipspadesoutdir = os.path.join(args.workdir, 'dipspades')
    if os.path.isfile(os.path.join(args.workdir, 'consensus_contigs.fasta')):
        shutil.copyfile(os.path.join(args.workdir, 'consensus_contigs.fasta'),
                        finalOut)
        shutil.copyfile(
            os.path.join(dipspadesoutdir, 'paired_consensus_contigs.fasta'),
            prefix + ".dipspades_consensus_paired.fasta")
        shutil.copyfile(
            os.path.join(dipspadesoutdir, 'unpaired_consensus_contigs.fasta'),
            prefix + ".dipspades_consensus_unpaired.fasta")
        status('Dipspades assembly finished: {:}'.format(finalOut))
        status('Dipspades consensus contigs copied over: {:} {:}'.format(
            prefix + ".dipspades_consensus_paired.fasta",
            prefix + ".dipspades_consensus_unpaired.fasta"))
        numSeqs, assemblySize = fastastats(finalOut)
        status('Assembly is {:,} scaffolds and {:,} bp'.format(
            numSeqs, assemblySize))
    else:
        status('Spades assembly output missing -- check Dipspades logfile in {:}.'
               .format(os.path.join(dipspadesoutdir, 'dipspades.log')))
    if not args.pipe:
        status('Your next command might be:\n\tAAFTF vecscreen -i {:} -c {:}\n'
               .format(finalOut, args.cpus))
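
# For reference, a sketch of the dipspades command line assembled above,
# with hypothetical values for threads, memory, and read files:
#   dipspades.py --threads 8 --cov-cutoff auto --mem 32 -o dipspades_1234 \
#       --pe1-1 fwd_1.fastq.gz --pe1-2 rev_2.fastq.gz
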
def run_megahit(parser, args):
    if not args.workdir:
        args.workdir = 'megahit_' + str(os.getpid())
    runcmd = ['megahit', '-t', str(args.cpus), '-o', args.workdir]
    if args.assembler_args:
        runcmd.extend(args.assembler_args)
    if args.memory:
        runcmd.extend(['--memory', args.memory])
    if args.tmpdir:
        runcmd.extend(['--tmp-dir', args.tmpdir])
    # find reads -- use --left/right or look for cleaned in tmpdir
    forReads, revReads = (None,) * 2
    if args.left:
        forReads = os.path.abspath(args.left)
    if args.right:
        revReads = os.path.abspath(args.right)
    if not forReads:
        status('Unable to locate FASTQ raw reads, provide --left')
        sys.exit(1)
    if not revReads:
        runcmd = runcmd + ['-r', forReads]
    else:
        runcmd = runcmd + ['-1', forReads, '-2', revReads]
    # megahit will not overwrite an existing output folder
    if os.path.isdir(args.workdir):
        status("Cannot re-run with existing folder {}".format(args.workdir))
    # now run the megahit job
    status('Assembling FASTQ data using megahit')
    printCMD(runcmd)
    DEVNULL = open(os.devnull, 'w')
    if args.debug:
        subprocess.run(runcmd)
    else:
        subprocess.run(runcmd, stdout=DEVNULL, stderr=DEVNULL)
    # pull out assembly
    if args.out:
        finalOut = args.out
    else:
        finalOut = prefix + '.megahit.fasta'
    if os.path.isfile(os.path.join(args.workdir, 'final.contigs.fa')):
        shutil.copyfile(os.path.join(args.workdir, 'final.contigs.fa'),
                        finalOut)
        status('Megahit assembly finished: {:}'.format(finalOut))
        numSeqs, assemblySize = fastastats(finalOut)
        status('Assembly is {:,} scaffolds and {:,} bp'.format(
            numSeqs, assemblySize))
    else:
        status('Megahit assembly output missing -- check megahit logfile.')
    if not args.pipe:
        status('Your next command might be:\n\tAAFTF vecscreen -i {:} -c {:}\n'
               .format(finalOut, args.cpus))
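
# For reference, a sketch of the megahit command line assembled above
# (hypothetical values; paired-end case):
#   megahit -t 8 -o megahit_1234 -1 fwd_1.fastq.gz -2 rev_2.fastq.gz
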
def run(parser, args):
    if not args.workdir:
        args.workdir = 'aaftf-sourpurge_' + str(uuid.uuid4())[:8]
    if not os.path.exists(args.workdir):
        os.mkdir(args.workdir)
    bamthreads = 4
    if args.cpus < 4:
        bamthreads = 1
    # find reads
    forReads, revReads = (None,) * 2
    if args.left:
        forReads = os.path.abspath(args.left)
    if args.right:
        revReads = os.path.abspath(args.right)
    if not forReads:
        status('Unable to locate FASTQ raw reads, low coverage will be '
               'skipped. Provide -l,--left or -r,--right to enable low '
               'coverage filtering.')
        # sys.exit(1)
    # parse database locations
    if not args.sourdb:
        try:
            DB = os.environ["AAFTF_DB"]
            SOUR = os.path.join(DB, 'genbank-k31.lca.json.gz')
        except KeyError:
            if args.AAFTF_DB:
                SOUR = os.path.join(args.AAFTF_DB, 'genbank-k31.lca.json.gz')
            else:
                status("$AAFTF_DB/genbank-k31.lca.json.gz not found, pass --sourdb")
                sys.exit(1)
        if not os.path.isfile(SOUR):
            status("{:} sourmash database not found, download and rename to "
                   "genbank-k31.lca.json.gz".format(SOUR))
            sys.exit(1)
    else:
        SOUR = os.path.abspath(args.sourdb)

    # hard coded tmpfiles
    assembly_working = 'assembly.fasta'
    megablast_working = 'megablast.out'
    blobBAM = 'remapped.bam'
    shutil.copyfile(args.input, os.path.join(args.workdir, assembly_working))
    numSeqs, assemblySize = fastastats(
        os.path.join(args.workdir, assembly_working))
    status('Assembly is {:,} contigs and {:,} bp'.format(
        numSeqs, assemblySize))
    DEVNULL = open(os.devnull, 'w')

    # now filter for taxonomy with sourmash lca classify
    status('Running SourMash to get taxonomy classification for each contig')
    sour_sketch = os.path.basename(assembly_working) + '.sig'
    sour_compute = ['sourmash', 'compute', '-k', '31', '--scaled=1000',
                    '--singleton', assembly_working]
    printCMD(sour_compute)
    subprocess.run(sour_compute, cwd=args.workdir, stderr=DEVNULL)
    sour_classify = ['sourmash', 'lca', 'classify', '--db', SOUR,
                     '--query', sour_sketch]
    printCMD(sour_classify)
    # output csv: ID,status,superkingdom,phylum,class,order,family,genus,species,strain
    Taxonomy = {}
    UniqueTax = []
    sourmashTSV = os.path.join(args.workdir, 'sourmash.csv')
    with open(sourmashTSV, 'w') as sour_out:
        for line in execute(sour_classify, args.workdir):
            sour_out.write(line)
            if not line or line.startswith('\n') or \
                    line.startswith('ID') or line.count(',') < 9:
                continue
            line = line.strip()
            cols = line.split(',')
            if 'found' in cols:
                idx = cols.index('found')
                Taxonomy[cols[0]] = cols[idx + 1:]
                taxClean = [x for x in cols[idx + 1:] if x]
                UniqueTax.append('{:}'.format(';'.join(taxClean)))
            elif 'nomatch' in cols:
                idx = cols.index('nomatch')
                Taxonomy[cols[0]] = cols[idx + 1:]
    UniqueTax = set(UniqueTax)
    status('Found {:} taxonomic classifications for contigs:\n{:}'.format(
        len(UniqueTax), '\n'.join(UniqueTax)))
    if args.taxonomy:
        sys.exit(1)
    Tax2Drop = []
    for k, v in Taxonomy.items():
        v = [x for x in v if x]  # remove empty items from list
        if args.debug:
            print('{:}\t{:}'.format(k, v))
        if len(v) > 0:
            if not any(i in v for i in args.phylum):
                Tax2Drop.append(k)

    # drop contigs from taxonomy before calculating coverage
    status('Dropping {:} contigs from taxonomy screen'.format(len(Tax2Drop)))
    sourTax = os.path.join(args.workdir, 'sourmashed-tax-screen.fasta')
    with open(sourTax, 'w') as outfile:
        with open(os.path.join(args.workdir, assembly_working), 'r') as infile:
            for record in SeqIO.parse(infile, 'fasta'):
                if not record.id in Tax2Drop:
                    SeqIO.write(record, outfile, 'fasta')

    # only do coverage trimming if reads provided
    Contigs2Drop = []  # this will be empty if no reads given to gather by coverage
    if forReads:
        # check if BAM present, if so skip running
        if not os.path.isfile(os.path.join(args.workdir, blobBAM)):
            # index
            bwa_index = ['bwa', 'index', os.path.basename(sourTax)]
            status('Building BWA index')
            printCMD(bwa_index)
            subprocess.run(bwa_index, cwd=args.workdir, stderr=DEVNULL)
            # map reads to assembly using BWA
            bwa_cmd = ['bwa', 'mem',
                       '-t', str(args.cpus),
                       os.path.basename(sourTax),  # assembly index base
                       forReads]
            if revReads:
                bwa_cmd.append(revReads)
            # run BWA and pipe to samtools sort
            status('Aligning reads to assembly with BWA')
            printCMD(bwa_cmd)
            p1 = subprocess.Popen(bwa_cmd, cwd=args.workdir,
                                  stdout=subprocess.PIPE, stderr=DEVNULL)
            p2 = subprocess.Popen(['samtools', 'sort',
                                   '--threads', str(bamthreads),
                                   '-o', blobBAM, '-'],
                                  cwd=args.workdir,
                                  stdout=subprocess.PIPE,
                                  stderr=DEVNULL,
                                  stdin=p1.stdout)
            p1.stdout.close()
            p2.communicate()
            subprocess.run(['samtools', 'index', blobBAM], cwd=args.workdir)

        # now calculate coverage from BAM file
        status('Calculating read coverage per contig')
        FastaBed = os.path.join(args.workdir, 'assembly.bed')
        lengths = []
        with open(FastaBed, 'w') as bedout:
            with open(sourTax, 'r') as SeqIn:
                for record in SeqIO.parse(SeqIn, 'fasta'):
                    bedout.write('{:}\t{:}\t{:}\n'.format(
                        record.id, 0, len(record.seq)))
                    lengths.append(len(record.seq))
        N50 = calcN50(lengths)
        Coverage = {}
        coverageBed = os.path.join(args.workdir, 'coverage.bed')
        cov_cmd = ['samtools', 'bedcov', os.path.basename(FastaBed), blobBAM]
        printCMD(cov_cmd)
        with open(coverageBed, 'w') as bed_out:
            for line in execute(cov_cmd, args.workdir):
                bed_out.write(line)
                if not line or line.startswith('\n') or line.count('\t') < 3:
                    continue
                line = line.strip()
                cols = line.split('\t')
                cov = int(cols[3]) / float(cols[2])
                Coverage[cols[0]] = (int(cols[2]), cov)
        # get average coverage of N50 contigs
        n50Cov = []
        for k, v in Coverage.items():
            if args.debug:
                print('{:}; Len: {:}; Cov: {:.2f}'.format(k, v[0], v[1]))
            if v[0] >= N50:
                n50Cov.append(v[1])
        n50AvgCov = sum(n50Cov) / len(n50Cov)
        minpct = args.mincovpct / 100
        # should we make this a variable? 5% was something arbitrary
        min_coverage = float(n50AvgCov * minpct)
        status('Average coverage for N50 contigs is {:}X'.format(
            int(n50AvgCov)))
        # start list of contigs to drop
        for k, v in Coverage.items():
            if v[1] <= min_coverage:
                Contigs2Drop.append(k)
        status('Found {:,} contigs with coverage less than {:.2f}X ({:}%)'.format(
            len(Contigs2Drop), min_coverage, args.mincovpct))

    if args.debug:
        print('Contigs dropped due to coverage: {:}'.format(
            ','.join(Contigs2Drop)))
        print('Contigs dropped due to taxonomy: {:}'.format(
            ','.join(Tax2Drop)))
    DropFinal = Contigs2Drop + Tax2Drop
    DropFinal = set(DropFinal)
    status('Dropping {:,} total contigs based on taxonomy and coverage'.format(
        len(DropFinal)))
    with open(args.outfile, 'w') as outfile, open(sourTax, 'r') as seqin:
        for record in SeqIO.parse(seqin, 'fasta'):
            if not record.id in DropFinal:
                SeqIO.write(record, outfile, 'fasta')
    numSeqs, assemblySize = fastastats(args.outfile)
    status('Sourpurged assembly is {:,} contigs and {:,} bp'.format(
        numSeqs, assemblySize))
    if '_' in args.outfile:
        nextOut = args.outfile.split('_')[0] + '.rmdup.fasta'
    elif '.' in args.outfile:
        nextOut = args.outfile.split('.')[0] + '.rmdup.fasta'
    else:
        nextOut = args.outfile + '.rmdup.fasta'

    if checkfile(sourmashTSV):
        baseinput = os.path.basename(args.input)
        if '.' in baseinput:
            baseinput = baseinput.rsplit('.', 1)[0]
        shutil.copy(sourmashTSV, baseinput + '.sourmash-taxonomy.csv')

    if not args.debug:
        SafeRemove(args.workdir)
    if not args.pipe:
        status('Your next command might be:\n\tAAFTF rmdup -i {:} -o {:}\n'
               .format(args.outfile, nextOut))
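
# Worked example of the coverage cutoff above, with hypothetical numbers:
# samtools bedcov reports total aligned bases per contig, so a 200,000 bp
# contig with 9,000,000 aligned bases has 9000000 / 200000 = 45.0X coverage.
# With --mincovpct 5 and an N50-contig average of 45X, the cutoff is
# 45 * 0.05 = 2.25X, and any contig at or below 2.25X is dropped.
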
def run(parser, args):
    # first check if NOVOPlasty and minimap2 are installed, else exit
    programs = ['NOVOPlasty.pl', 'minimap2']
    for x in programs:
        if not which_path(x):
            status('ERROR: {} is not installed, exiting'.format(x))
            sys.exit(1)
    # first we need to generate working directory
    unique_id = str(uuid.uuid4())[:8]
    if not args.workdir:
        args.workdir = 'mito_' + unique_id
    if not os.path.isdir(args.workdir):
        os.makedirs(args.workdir)
    # now estimate read lengths of FASTQ
    read_len = GuessRL(args.left)
    # check for seed sequence, otherwise write one
    if not args.seed:
        if not args.reference:
            seedFasta = os.path.abspath(
                os.path.join(os.path.dirname(__file__), 'mito-seed.fasta'))
        else:
            seedFasta = os.path.abspath(args.reference)
    else:
        seedFasta = os.path.abspath(args.seed)
    # now write the novoplasty config file
    defaultConfig = os.path.join(os.path.dirname(__file__),
                                 'novoplasty-config.txt')
    novoConfig = os.path.join(args.workdir, 'novo-config.txt')
    if args.reference:
        refgenome = os.path.abspath(args.reference)
    else:
        refgenome = ''
    checkWords = ("<PROJECT>", "<MINLEN>", "<MAXLEN>", "<MAXMEM>", "<SEED>",
                  "<READLEN>", "<FORWARD>", "<REVERSE>", "<REFERENCE>")
    repWords = (unique_id,
                str(args.minlen),
                str(args.maxlen),
                str(int(getRAM() * .75)),
                seedFasta,
                str(read_len),
                os.path.abspath(args.left),
                os.path.abspath(args.right),
                refgenome)
    with open(novoConfig, 'w') as outfile:
        with open(defaultConfig, 'r') as infile:
            for line in infile:
                for check, rep in zip(checkWords, repWords):
                    line = line.replace(check, rep)
                outfile.write(line)
    # now we can finally run NOVOPlasty.pl
    status('De novo assembling mitochondrial genome using NOVOplasty')
    cmd = ['NOVOPlasty.pl', '-c', 'novo-config.txt']
    printCMD(cmd)
    novolog = os.path.join(args.workdir, 'novoplasty.log')
    with open(novolog, 'w') as logfile:
        p1 = subprocess.Popen(cmd, cwd=args.workdir,
                              stdout=logfile, stderr=logfile)
        p1.communicate()
    # now parse the results
    draftMito = None
    circular = False
    for f in os.listdir(args.workdir):
        if f.startswith('Circularized_assembly_'):
            draftMito = os.path.join(args.workdir, f)
            circular = True
            break
        if f.startswith('Contigs_1_'):
            draftMito = os.path.join(args.workdir, f)
            break
        if f.startswith('Uncircularized_assemblies_'):
            draftMito = os.path.join(args.workdir, f)
            break
    # guard against NOVOPlasty producing no output at all
    if not draftMito:
        status('NOVOplasty did not produce an assembly -- check {}'.format(
            novolog))
        sys.exit(1)
    if circular:
        status('NOVOplasty assembled complete circular genome')
        if args.starting:
            status('Rotating assembly to start with {}'.format(args.starting))
        else:
            status('Rotating assembly to start with Cytochrome b (cob) gene')
        orient_to_start(draftMito, args.out, folder=args.workdir,
                        start=args.starting)
    else:
        numContigs = 0
        contigLength = 0
        with open(args.out, 'w') as outfile:
            with open(draftMito, 'r') as infile:
                for title, seq in SimpleFastaParser(infile):
                    numContigs += 1
                    contigLength += len(seq)
                    outfile.write('>contig_{}\n{}\n'.format(
                        numContigs, softwrap(seq)))
        status('NOVOplasty assembled {} contigs consisting of {:,} bp, '
               'but was unable to circularize genome'.format(
                   numContigs, contigLength))
    status('AAFTF mito complete: {}'.format(args.out))
    if not args.pipe:
        shutil.rmtree(args.workdir)
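
# A minimal sketch (hypothetical token/value) of the pairwise placeholder
# substitution used to fill the NOVOPlasty config template above:
#   line = "Project name          = <PROJECT>"
#   for check, rep in zip(("<PROJECT>",), ("mito_ab12cd34",)):
#       line = line.replace(check, rep)
#   # -> "Project name          = mito_ab12cd34"
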
def run(parser, args):
    # find reads for pilon
    forReads, revReads = (None,) * 2
    if args.left:
        forReads = os.path.abspath(args.left)
    if args.right:
        revReads = os.path.abspath(args.right)
    if not forReads:
        status('Unable to locate FASTQ raw reads, pass via -l,--left and/or -r,--right')
        sys.exit(1)

    custom_workdir = 1
    if not args.workdir:
        custom_workdir = 0
        args.workdir = 'aaftf-pilon_' + str(uuid.uuid4())[:8]
    if not os.path.exists(args.workdir):
        os.mkdir(args.workdir)
    bamthreads = 4
    if args.cpus < 4:
        bamthreads = args.cpus

    DEVNULL = open(os.devnull, 'w')
    for i in range(1, args.iterations + 1):
        status('Starting Pilon polishing iteration {:}'.format(i))
        correctedFasta = 'pilon' + str(i) + '.fasta'
        if i == 1:  # first loop
            initialFasta = args.infile
            shutil.copyfile(args.infile,
                            os.path.join(args.workdir,
                                         os.path.basename(args.infile)))
        else:
            initialFasta = os.path.join(args.workdir,
                                        'pilon' + str(i - 1) + '.fasta')
        pilonBAM = os.path.basename(initialFasta) + '.bwa.bam'
        if not os.path.isfile(os.path.join(args.workdir, pilonBAM)):
            bwa_index = ['bwa', 'index', os.path.basename(initialFasta)]
            printCMD(bwa_index)
            subprocess.run(bwa_index, cwd=args.workdir, stderr=DEVNULL)
            bwa_cmd = ['bwa', 'mem', '-t', str(args.cpus),
                       os.path.basename(initialFasta), forReads]
            if revReads:
                bwa_cmd.append(revReads)
            # run BWA and pipe to samtools sort
            printCMD(bwa_cmd)
            p1 = subprocess.Popen(bwa_cmd, cwd=args.workdir,
                                  stdout=subprocess.PIPE, stderr=DEVNULL)
            p2 = subprocess.Popen(['samtools', 'sort', '-@', str(bamthreads),
                                   '-o', pilonBAM, '-'],
                                  cwd=args.workdir,
                                  stdout=subprocess.PIPE,
                                  stderr=DEVNULL,
                                  stdin=p1.stdout)
            p1.stdout.close()
            p2.communicate()
            # BAM file needs to be indexed for Pilon
            subprocess.run(['samtools', 'index', pilonBAM], cwd=args.workdir)
        # run Pilon
        pilon_cmd = ['pilon', '--genome', os.path.basename(initialFasta),
                     '--frags', pilonBAM,
                     '-Xmx{}g'.format(args.memory),
                     '--output', correctedFasta.split('.fasta')[0],
                     '--threads', str(args.cpus),
                     '--changes']
        pilon_log = 'pilon' + str(i) + '.log'
        printCMD(pilon_cmd)
        with open(os.path.join(args.workdir, pilon_log), 'w') as logfile:
            subprocess.run(pilon_cmd, cwd=args.workdir,
                           stderr=logfile, stdout=logfile)
        num_changes = line_count(os.path.join(args.workdir,
                                              'pilon' + str(i) + '.changes'))
        status('Found {:,} changes in Pilon iteration {:}'.format(
            num_changes, i))
        # clean up as we iterate to prevent tmp directory from blowing up;
        # the BWA index files and BAMs all live in the working directory,
        # so join each basename onto it before removing
        dirty = [pilonBAM, pilonBAM + '.bai'] + \
                [os.path.basename(initialFasta) + ext
                 for ext in ('.sa', '.amb', '.ann', '.pac', '.bwt')]
        for f in dirty:
            fpath = os.path.join(args.workdir, f)
            if os.path.isfile(fpath):
                os.remove(fpath)

    # copy last iteration to output
    if args.outfile:
        polishedFasta = args.outfile
    else:
        polishedFasta = os.path.basename(args.infile).split('.f')[0] + '.pilon.fasta'
    shutil.copyfile(os.path.join(args.workdir,
                                 'pilon' + str(args.iterations) + '.fasta'),
                    polishedFasta)
    status('AAFTF pilon completed {:} iterations.'.format(args.iterations))
    status('Pilon polished assembly: {:}'.format(polishedFasta))
    if '_' in polishedFasta:
        nextOut = polishedFasta.split('_')[0] + '.final.fasta'
    elif '.' in polishedFasta:
        nextOut = polishedFasta.split('.')[0] + '.final.fasta'
    else:
        nextOut = polishedFasta + '.final.fasta'
    if not args.debug and not custom_workdir:
        SafeRemove(args.workdir)
    if not args.pipe:
        status('Your next command might be:\n\tAAFTF sort -i {:} -o {:}\n'
               .format(polishedFasta, nextOut))
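
# A possible refinement (not current behavior): since --changes writes one
# line per correction, the loop above could stop early once an iteration
# reports zero changes, e.g. immediately after num_changes is computed:
#   if num_changes == 0:
#       break
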
def run(parser, args):
    if not args.workdir:
        args.workdir = 'aaftf-vecscreen_' + str(uuid.uuid4())[:8]
    if not os.path.exists(args.workdir):
        os.mkdir(args.workdir)

    # parse database locations: command-line flag wins, then $AAFTF_DB
    DB = None
    if args.AAFTF_DB:
        DB = args.AAFTF_DB
    else:
        try:
            DB = os.environ["AAFTF_DB"]
        except KeyError:
            pass

    if args.percent_id:
        percentid_cutoff = args.percent_id

    infile = args.infile
    outfile = os.path.basename(args.outfile)
    outdir = os.path.dirname(args.outfile)
    if '.f' in outfile:
        prefix = outfile.rsplit('.f', 1)[0]
        print("prefix is ", prefix)
    else:
        prefix = str(os.getpid())
    if not outfile:
        outfile = "%s.vecscreen.fasta" % prefix
    outfile_vec = os.path.join(args.workdir,
                               "%s.tmp_vecscreen.fasta" % (prefix))

    # common Euk/Prok contaminants for blastable DB later on
    status('Building BLAST databases for contamination screen.')
    for d in DB_Links:
        if d == 'sourmash':
            continue
        url = DB_Links[d]
        dbname = os.path.basename(str(url))
        if DB:
            file = os.path.join(DB, dbname)
        else:
            file = os.path.join(args.workdir, dbname)
        if file.endswith(".gz"):
            nogz = os.path.splitext(file)[0]
            if not os.path.exists(nogz):
                if not os.path.exists(file):
                    urllib.request.urlretrieve(url, file)
                with gzip.open(file, 'rb') as ingz, open(nogz, 'wb') as outfa:
                    shutil.copyfileobj(ingz, outfa)
            make_blastdb('nucl', nogz, os.path.join(args.workdir, d))
        else:
            if not os.path.exists(file):
                urllib.request.urlretrieve(url, file)
            make_blastdb('nucl', file, os.path.join(args.workdir, d))

    global contigs_to_remove
    contigs_to_remove = {}
    regions_to_trim = {}
    # columns: qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore
    for contam in ["CONTAM_EUKS", "CONTAM_PROKS"]:
        status("%s Contamination Screen" % (contam))
        blastreport = os.path.join(args.workdir,
                                   "%s.%s.blastn" % (contam, prefix))
        blastnargs = ['blastn',
                      '-query', infile,
                      '-db', os.path.join(args.workdir, contam),
                      '-num_threads', str(args.cpus),
                      '-dust', 'yes', '-soft_masking', 'true',
                      '-perc_identity', BlastPercent_ID_ContamMatch,
                      '-lcase_masking', '-outfmt', '6',
                      '-out', blastreport]
        printCMD(blastnargs)
        call(blastnargs)
        with open(blastreport) as report:
            colparser = csv.reader(report, delimiter="\t")
            for row in colparser:
                # keep a hit when identity and alignment length clear
                # progressively stricter thresholds
                if ((float(row[2]) >= 98.0 and int(row[3]) >= 50)
                        or (float(row[2]) >= 94.0 and int(row[3]) >= 100)
                        or (float(row[2]) >= 90.0 and int(row[3]) >= 200)):
                    # normalize query coordinates so start < end
                    start = min(int(row[6]), int(row[7]))
                    end = max(int(row[6]), int(row[7]))
                    if not row[0] in regions_to_trim:
                        regions_to_trim[row[0]] = [(start, end, contam,
                                                    row[1], float(row[2]))]
                    else:
                        regions_to_trim[row[0]].append(
                            (start, end, contam, row[1], float(row[2])))
        status('{:} screening finished'.format(contam))

    eukCleaned = os.path.join(args.workdir,
                              "%s.euk-prot_cleaned.fasta" % (prefix))
    if len(regions_to_trim) > 0:
        with open(eukCleaned, 'w') as cleanout:
            with open(infile, 'r') as fastain:
                for record in SeqIO.parse(fastain, 'fasta'):
                    if not record.id in regions_to_trim:
                        cleanout.write('>{:}\n{:}\n'.format(
                            record.id, softwrap(str(record.seq))))
                    else:
                        Seq = str(record.seq)
                        regions = regions_to_trim[record.id]
                        status('Splitting {:} due to contamination: {:}'.format(
                            record.id, regions))
                        lastpos = 0
                        newSeq = ''
                        for i, x in enumerate(regions):
                            newSeq = Seq[lastpos:x[0]]
                            lastpos = x[1]
                            cleanout.write('>split{:}_{:}\n{:}\n'.format(
                                i, record.id, softwrap(newSeq)))
                            if i == len(regions) - 1:
                                newSeq = Seq[x[1]:]
                                cleanout.write('>split{:}_{:}\n{:}\n'.format(
                                    i + 1, record.id, softwrap(newSeq)))
    else:
        eukCleaned = infile

    # MITO screen
    status('Mitochondria Contamination Screen')
    mitoHits = []
    blastreport = os.path.join(args.workdir,
                               "%s.%s.blastn" % ('MITO', prefix))
    blastnargs = ['blastn',
                  '-query', eukCleaned,
                  '-db', os.path.join(args.workdir, 'MITO'),
                  '-num_threads', str(args.cpus),
                  '-dust', 'yes', '-soft_masking', 'true',
                  '-perc_identity', BlastPercent_ID_MitoMatch,
                  '-lcase_masking', '-outfmt', '6',
                  '-out', blastreport]
    printCMD(blastnargs)
    call(blastnargs)
    with open(blastreport) as report:
        colparser = csv.reader(report, delimiter="\t")
        for row in colparser:
            if int(row[3]) >= 120:
                contigs_to_remove[row[0]] = ('MitoScreen', row[1],
                                             float(row[2]))
                mitoHits.append(row[0])
    status('Mito screening finished.')

    # vecscreen starts here
    status('Starting VecScreen, will remove terminal matches and split internal matches')
    rnd = 0
    count = 1
    while (count > 0):
        filepref = "%s.r%d" % (prefix, rnd)
        report = os.path.join(args.workdir,
                              "%s.vecscreen.tab" % (filepref))
        if not os.path.exists(report):
            cmd = ['blastn', '-task', 'blastn',
                   '-reward', '1', '-penalty', '-5',
                   '-gapopen', '3', '-gapextend', '3',
                   '-dust', 'yes', '-soft_masking', 'true',
                   '-evalue', '700', '-searchsp', '1750000000000',
                   '-db', os.path.join(args.workdir, 'UniVec'),
                   '-outfmt',
                   '6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore score qlen',
                   '-num_threads', str(args.cpus),
                   '-query', eukCleaned,
                   '-out', report]
            call(cmd)
        # this needs to know/return the new fasta file
        status("Parsing VecScreen round {:}: {:} for {:}".format(
            rnd + 1, filepref, report))
        (count, cleanfile) = parse_clean_blastn(
            eukCleaned, os.path.join(args.workdir, filepref),
            report, args.stringency)
        status("count is %d cleanfile is %s" % (count, cleanfile))
        if count == 0:  # if there are no vector matches < than the pid cutoff
            status("copying %s to %s" % (eukCleaned, outfile_vec))
            shutil.copy(eukCleaned, outfile_vec)
        else:
            rnd += 1
            eukCleaned = cleanfile

    status("{:,} contigs will be removed:".format(len(contigs_to_remove)))
    for k, v in sorted(contigs_to_remove.items()):
        print('\t{:} --> dbhit={:}; hit={:}; pident={:}'.format(
            k, v[0], v[1], v[2]))
    # this could instead use the outfile and strip .fasta/fsa/fna and add
    # mito on it I suppose, but assumes a bit about the naming structure
    mitochondria = os.path.join(outdir, prefix + '.mitochondria.fasta')
    with open(args.outfile, "w") as output_handle, \
            open(mitochondria, 'w') as mito_handle:
        for record in SeqIO.parse(outfile_vec, "fasta"):
            if not record.id in contigs_to_remove:
                SeqIO.write(record, output_handle, "fasta")
            elif record.id in mitoHits:
                SeqIO.write(record, mito_handle, "fasta")
    status('Writing {:,} cleaned contigs to: {:}'.format(
        countfasta(args.outfile), args.outfile))
    status('Writing {:,} mitochondrial contigs to: {:}'.format(
        countfasta(mitochondria), mitochondria))
    if '_' in args.outfile:
        nextOut = args.outfile.split('_')[0] + '.sourpurge.fasta'
    elif '.' in args.outfile:
        nextOut = args.outfile.split('.')[0] + '.sourpurge.fasta'
    else:
        nextOut = args.outfile + '.sourpurge.fasta'
    if not args.pipe:
        status('Your next command might be:\n\tAAFTF sourpurge -i {:} -o {:} -c {:} --phylum Ascomycota\n'
               .format(args.outfile, nextOut, args.cpus))
    if not args.debug:
        SafeRemove(args.workdir)
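
# A sketch (not part of AAFTF) collecting the contamination-hit thresholds
# used in the CONTAM_EUKS/CONTAM_PROKS loop above into one predicate:
def _is_strong_contam_hit(pident, length):
    # progressively longer alignments are required at lower identity
    return ((pident >= 98.0 and length >= 50)
            or (pident >= 94.0 and length >= 100)
            or (pident >= 90.0 and length >= 200))
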
def run(parser, args):
    custom_workdir = 1
    if not args.workdir:
        custom_workdir = 0
        args.workdir = 'aaftf-filter_' + str(uuid.uuid4())[:8]
    if not os.path.exists(args.workdir):
        os.mkdir(args.workdir)

    # parse database locations: command-line flag wins, then $AAFTF_DB
    DB = None
    if args.AAFTF_DB:
        DB = args.AAFTF_DB
    else:
        try:
            DB = os.environ["AAFTF_DB"]
        except KeyError:
            pass

    bamthreads = 4
    if args.cpus < 4:
        bamthreads = args.cpus

    earliest_file_age = -1
    contam_filenames = []
    # db of contaminant (PhiX)
    for url in Contaminant_Accessions.values():
        acc = os.path.basename(url)
        if DB:
            acc_file = os.path.join(DB, acc)
        else:
            acc_file = os.path.join(args.workdir, acc)
        contam_filenames.append(acc_file)
        if not os.path.exists(acc_file):
            urllib.request.urlretrieve(url, acc_file)
        if (earliest_file_age < 0
                or earliest_file_age < os.path.getctime(acc_file)):
            earliest_file_age = os.path.getctime(acc_file)

    # download univec too
    url = DB_Links['UniVec']
    acc = os.path.basename(DB_Links['UniVec'])
    if DB:
        acc_file = os.path.join(DB, acc)
    else:
        acc_file = os.path.join(args.workdir, acc)
    contam_filenames.append(acc_file)
    if not os.path.exists(acc_file):
        urllib.request.urlretrieve(url, acc_file)
    if (earliest_file_age < 0
            or earliest_file_age < os.path.getctime(acc_file)):
        earliest_file_age = os.path.getctime(acc_file)

    if args.screen_accessions:
        for acc in args.screen_accessions:
            if DB:
                acc_file = os.path.join(DB, acc + ".fna")
                if not os.path.exists(acc_file):
                    acc_file = os.path.join(args.workdir, acc + ".fna")
            else:
                acc_file = os.path.join(args.workdir, acc + ".fna")
            contam_filenames.append(acc_file)
            if not os.path.exists(acc_file):
                url = SeqDBs['nucleotide'] % (acc)
                urllib.request.urlretrieve(url, acc_file)
            if (earliest_file_age < 0
                    or earliest_file_age < os.path.getctime(acc_file)):
                earliest_file_age = os.path.getctime(acc_file)

    if args.screen_urls:
        for url in args.screen_urls:
            url_file = os.path.join(args.workdir, os.path.basename(url))
            contam_filenames.append(url_file)
            if not os.path.exists(url_file):
                urllib.request.urlretrieve(url, url_file)
            if (earliest_file_age < 0
                    or earliest_file_age < os.path.getctime(url_file)):
                earliest_file_age = os.path.getctime(url_file)

    if args.screen_local:
        for f in args.screen_local:
            contam_filenames.append(os.path.abspath(f))

    # concat contaminant dbs
    status('Generating combined contamination database:\n{:}'.format(
        '\n'.join(contam_filenames)))
    contamdb = os.path.join(args.workdir, 'contamdb.fa')
    if (not os.path.exists(contamdb)
            or (os.path.getctime(contamdb) < earliest_file_age)):
        with open(contamdb, 'wb') as wfd:
            for fname in contam_filenames:
                with open(fname, 'rb') as fd:
                    # reasonably fast copy for append
                    shutil.copyfileobj(fd, wfd)

    # find reads
    forReads, revReads = (None,) * 2
    if args.left:
        forReads = os.path.abspath(args.left)
    if args.right:
        revReads = os.path.abspath(args.right)
    if not forReads:
        status("Must provide --left, unable to locate FASTQ reads")
        sys.exit(1)

    total = countfastq(forReads)
    if revReads:
        total = total * 2
    status('Loading {:,} total reads'.format(total))

    # derive a basename by stripping the trailing extension
    if not args.basename:
        if '_' in os.path.basename(forReads):
            args.basename = os.path.basename(forReads).split('_')[0]
        elif '.' in os.path.basename(forReads):
            args.basename = os.path.basename(forReads).split('.')[0]
        else:
            args.basename = os.path.basename(forReads)

    DEVNULL = open(os.devnull, 'w')
    alignBAM = os.path.join(args.workdir, args.basename + '_contam_db.bam')
    clean_reads = args.basename + "_filtered"
    refmatch_bbduk = [contamdb, 'phix', 'artifacts', 'lambda']
    if args.aligner == "bbduk":
        status('Kmer filtering reads using BBDuk')
        if args.memory:
            MEM = '-Xmx{:}g'.format(args.memory)
        else:
            MEM = '-Xmx{:}g'.format(round(0.6 * getRAM()))
        cmd = ['bbduk.sh', MEM, 't={:}'.format(args.cpus),
               'hdist=1', 'k=27', 'overwrite=true',
               'in=%s' % (forReads),
               'out=%s_1.fastq.gz' % (clean_reads)]
        if revReads:
            cmd.extend(['in2=%s' % (revReads),
                        'out2=%s_2.fastq.gz' % (clean_reads)])
        cmd.extend(['ref=%s' % (",".join(refmatch_bbduk))])
        printCMD(cmd)
        if args.debug:
            subprocess.run(cmd)
        else:
            subprocess.run(cmd, stderr=DEVNULL)
        if not args.debug and not custom_workdir:
            SafeRemove(args.workdir)
        clean = countfastq('{:}_1.fastq.gz'.format(clean_reads))
        if revReads:
            clean = clean * 2
        status('{:,} reads mapped to contamination database'.format(
            total - clean))
        status('{:,} reads unmapped and writing to file'.format(clean))
        status('Filtering complete:\n\tFor: {:}\n\tRev: {:}'.format(
            clean_reads + '_1.fastq.gz', clean_reads + '_2.fastq.gz'))
        if not args.pipe:
            status('Your next command might be:\n\tAAFTF assemble -l {:} -r {:} -c {:} -o {:}\n'
                   .format(clean_reads + '_1.fastq.gz',
                           clean_reads + '_2.fastq.gz',
                           args.cpus, args.basename + '.spades.fasta'))
        return
    elif args.aligner == 'bowtie2':
        # likely not used and less accurate than bbduk?
        if not os.path.isfile(alignBAM):
            status('Aligning reads to contamination database using bowtie2')
            if (not os.path.exists(contamdb + ".1.bt2")
                    or os.path.getctime(contamdb + ".1.bt2") <
                    os.path.getctime(contamdb)):
                # (re)build index if no index or index is older than the db
                bowtie_index = ['bowtie2-build', contamdb, contamdb]
                printCMD(bowtie_index)
                subprocess.run(bowtie_index, stderr=DEVNULL, stdout=DEVNULL)
            bowtie_cmd = ['bowtie2', '-x', os.path.basename(contamdb),
                          '-p', str(args.cpus), '--very-sensitive']
            if forReads and revReads:
                bowtie_cmd = bowtie_cmd + ['-1', forReads, '-2', revReads]
            elif forReads:
                bowtie_cmd = bowtie_cmd + ['-U', forReads]
            # now run and write to sorted BAM
            printCMD(bowtie_cmd)
            p1 = subprocess.Popen(bowtie_cmd, cwd=args.workdir,
                                  stdout=subprocess.PIPE, stderr=DEVNULL)
            p2 = subprocess.Popen(['samtools', 'sort', '-@', str(bamthreads),
                                   '-o', os.path.basename(alignBAM), '-'],
                                  cwd=args.workdir,
                                  stdout=subprocess.PIPE,
                                  stderr=DEVNULL,
                                  stdin=p1.stdout)
            p1.stdout.close()
            p2.communicate()
    elif args.aligner == 'bwa':
        # likely less accurate than bbduk so may not be used
        if not os.path.isfile(alignBAM):
            status('Aligning reads to contamination database using BWA')
            if (not os.path.exists(contamdb + ".amb")
                    or os.path.getctime(contamdb + ".amb") <
                    os.path.getctime(contamdb)):
                bwa_index = ['bwa', 'index', contamdb]
                printCMD(bwa_index)
                subprocess.run(bwa_index, stderr=DEVNULL, stdout=DEVNULL)
            bwa_cmd = ['bwa', 'mem', '-t', str(args.cpus),
                       os.path.basename(contamdb), forReads]
            if revReads:
                bwa_cmd.append(revReads)
            # now run and write to sorted BAM
            printCMD(bwa_cmd)
            p1 = subprocess.Popen(bwa_cmd, cwd=args.workdir,
                                  stdout=subprocess.PIPE, stderr=DEVNULL)
            p2 = subprocess.Popen(['samtools', 'sort', '-@', str(bamthreads),
                                   '-o', os.path.basename(alignBAM), '-'],
                                  cwd=args.workdir,
                                  stdout=subprocess.PIPE,
                                  stderr=DEVNULL,
                                  stdin=p1.stdout)
            p1.stdout.close()
            p2.communicate()
    elif args.aligner == 'minimap2':
        # likely not used but may be useful for pacbio/nanopore?
        if not os.path.isfile(alignBAM):
            status('Aligning reads to contamination database using minimap2')
            minimap2_cmd = ['minimap2', '-ax', 'sr', '-t', str(args.cpus),
                            os.path.basename(contamdb), forReads]
            if revReads:
                minimap2_cmd.append(revReads)
            # now run and write to sorted BAM
            printCMD(minimap2_cmd)
            p1 = subprocess.Popen(minimap2_cmd, cwd=args.workdir,
                                  stdout=subprocess.PIPE, stderr=DEVNULL)
            p2 = subprocess.Popen(['samtools', 'sort', '-@', str(bamthreads),
                                   '-o', os.path.basename(alignBAM), '-'],
                                  cwd=args.workdir,
                                  stdout=subprocess.PIPE,
                                  stderr=DEVNULL,
                                  stdin=p1.stdout)
            p1.stdout.close()
            p2.communicate()
    else:
        status("Must specify one of bbduk, bowtie2, bwa, or minimap2 for filtering")

    if os.path.isfile(alignBAM):
        # display mapping stats in terminal
        subprocess.run(['samtools', 'index', alignBAM])
        mapped, unmapped = bam_read_count(alignBAM)
        status('{:,} reads mapped to contamination database'.format(mapped))
        status('{:,} reads unmapped and writing to file'.format(unmapped))
        # now output unmapped reads from the BAM file: -f 12 keeps pairs
        # where read and mate are both unmapped, -f 4 unmapped single reads
        if forReads and revReads:
            samtools_cmd = ['samtools', 'fastq', '-f', '12',
                            '-1', clean_reads + '_1.fastq.gz',
                            '-2', clean_reads + '_2.fastq.gz',
                            alignBAM]
        elif forReads:
            samtools_cmd = ['samtools', 'fastq', '-f', '4',
                            '-1', clean_reads + '.fastq.gz',
                            alignBAM]
        subprocess.run(samtools_cmd, stderr=DEVNULL)
        if not args.debug:
            SafeRemove(args.workdir)
        if revReads:
            status('Filtering complete:\n\tFor: {:}\n\tRev: {:}'.format(
                clean_reads + '_1.fastq.gz', clean_reads + '_2.fastq.gz'))
            if not args.pipe:
                status('Your next command might be:\n\tAAFTF assemble -l {:} -r {:} -c {:} -o {:}\n'
                       .format(clean_reads + '_1.fastq.gz',
                               clean_reads + '_2.fastq.gz',
                               args.cpus, args.basename + '.spades.fasta'))
        else:
            status('Filtering complete:\n\tSingle: {:}'.format(
                clean_reads + '.fastq.gz'))
            if not args.pipe:
                status('Your next command might be:\n\tAAFTF assemble -l {:} -c {:} -o {:}\n'
                       .format(clean_reads + '.fastq.gz', args.cpus,
                               args.basename + '.spades.fasta'))
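
# SAM flag arithmetic behind the samtools fastq extraction above:
# -f 12 requires both 0x4 (read unmapped) and 0x8 (mate unmapped), so only
# fully unmapped pairs are kept; -f 4 keeps unmapped single-end reads.
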
def run(parser, args):
    if not args.basename:
        if '_' in os.path.basename(args.left):
            args.basename = os.path.basename(args.left).split('_')[0]
        elif '.' in os.path.basename(args.left):
            args.basename = os.path.basename(args.left).split('.')[0]
        else:
            args.basename = os.path.basename(args.left)

    total = countfastq(args.left)
    if args.right:
        total = total * 2
    status('Loading {:,} total reads'.format(total))

    DEVNULL = open(os.devnull, 'w')
    if args.method == 'bbduk':
        if args.memory:
            MEM = '-Xmx{:}g'.format(args.memory)
        else:
            MEM = '-Xmx{:}g'.format(round(0.6 * getRAM()))
        status('Adapter trimming using BBDuk')
        cmd = ['bbduk.sh', MEM, 'ref=adapters', 't={:}'.format(args.cpus),
               'ktrim=r', 'k=23', 'mink=11',
               'minlen={:}'.format(args.minlen),
               'hdist=1', 'ftm=5', 'tpe', 'tbo', 'overwrite=true']
        if args.left and args.right:
            cmd += ['in1={:}'.format(args.left),
                    'in2={:}'.format(args.right),
                    'out1={:}_1P.fastq.gz'.format(args.basename),
                    'out2={:}_2P.fastq.gz'.format(args.basename)]
        elif args.left:
            cmd += ['in={:}'.format(args.left),
                    'out={:}_1U.fastq.gz'.format(args.basename)]
        printCMD(cmd)
        if args.debug:
            subprocess.run(cmd)
        else:
            subprocess.run(cmd, stderr=DEVNULL)
        if args.right:
            clean = countfastq('{:}_1P.fastq.gz'.format(args.basename))
            clean = clean * 2
            status('{:,} reads remaining and writing to file'.format(clean))
            status('Trimming finished:\n\tFor: {:}\n\tRev: {:}'.format(
                args.basename + '_1P.fastq.gz',
                args.basename + '_2P.fastq.gz'))
            if not args.pipe:
                status('Your next command might be:\n\tAAFTF filter -l {:} -r {:} -o {:} -c {:}\n'
                       .format(args.basename + '_1P.fastq.gz',
                               args.basename + '_2P.fastq.gz',
                               args.basename, args.cpus))
        else:
            clean = countfastq('{:}_1U.fastq.gz'.format(args.basename))
            status('{:,} reads remaining and writing to file'.format(clean))
            status('Trimming finished:\n\tSingle: {:}'.format(
                args.basename + '_1U.fastq.gz'))
            if not args.pipe:
                status('Your next command might be:\n\tAAFTF filter -l {:} -o {:} -c {:}\n'
                       .format(args.basename + '_1U.fastq.gz',
                               args.basename, args.cpus))
    elif args.method == 'trimmomatic':
        # find path to the trimmomatic jar
        trimmomatic_path = find_trimmomatic()
        if trimmomatic_path:
            jarfile = trimmomatic_path
        elif args.trimmomatic:
            jarfile = args.trimmomatic
        else:
            status('Trimmomatic cannot be found - please provide location of trimmomatic.jar file.')
            sys.exit(1)
        path_to_adaptors = args.trimmomatic_adaptors
        leadingwindow = "LEADING:%d" % (args.trimmomatic_leadingwindow)
        trailingwindow = "TRAILING:%d" % (args.trimmomatic_trailingwindow)
        slidingwindow = "SLIDINGWINDOW:%s" % (args.trimmomatic_slidingwindow)
        quality = args.trimmomatic_quality
        quality = "-%s" % (quality)  # add leading dash
        if not os.path.exists(path_to_adaptors):
            if args.right:
                path_to_adaptors = dirname(jarfile) + "/adapters/TruSeq3-PE.fa"
            else:
                path_to_adaptors = dirname(jarfile) + "/adapters/TruSeq3-SE.fa"
            if not os.path.exists(path_to_adaptors):
                findpath = dirname(jarfile)
                path_to_adaptors = ""
                while findpath:
                    if os.path.exists(findpath + "/share"):
                        if args.right:
                            path_to_adaptors = findpath + "/share/trimmomatic/adapters/TruSeq3-PE.fa"
                        else:
                            path_to_adaptors = findpath + "/share/trimmomatic/adapters/TruSeq3-SE.fa"
                        break
                    findpath = dirname(findpath)
            if not os.path.exists(path_to_adaptors):
                status("Cannot find adaptors file, please specify manually")
                return
        clipstr = args.trimmomatic_clip % (path_to_adaptors)
        cmd = []
        if args.left and args.right:
            cmd = ['java', '-jar', jarfile, 'PE',
                   '-threads', str(args.cpus), quality,
                   args.left, args.right,
                   args.basename + '_1P.fastq', args.basename + '_1U.fastq',
                   args.basename + '_2P.fastq', args.basename + '_2U.fastq',
                   clipstr, leadingwindow, trailingwindow, slidingwindow,
                   "MINLEN:%d" % (args.minlen)]
        elif args.left and not args.right:
            cmd = ['java', '-jar', jarfile, 'SE',
                   '-threads', str(args.cpus), quality,
                   args.left, args.basename + '_1U.fastq',
                   clipstr, leadingwindow, trailingwindow, slidingwindow,
                   "MINLEN:%d" % (args.minlen)]
        else:
            status("Must provide left and right pairs or single read set")
            return
        status('Running trimmomatic adapter and quality trimming')
        printCMD(cmd)
        if args.debug:
            subprocess.run(cmd)
        else:
            subprocess.run(cmd, stderr=DEVNULL)
        if args.right:
            status('Compressing trimmed PE FASTQ files')
            Fzip_inplace(args.basename + '_1P.fastq', args.cpus)
            Fzip_inplace(args.basename + '_2P.fastq', args.cpus)
            SafeRemove(args.basename + '_1U.fastq')
            SafeRemove(args.basename + '_2U.fastq')
            status('Trimming finished:\n\tFor: {:}\n\tRev: {:}'.format(
                args.basename + '_1P.fastq.gz',
                args.basename + '_2P.fastq.gz'))
            if not args.pipe:
                status('Your next command might be:\n\tAAFTF filter -l {:} -r {:} -o {:} -c {:}\n'
                       .format(args.basename + '_1P.fastq.gz',
                               args.basename + '_2P.fastq.gz',
                               args.basename, args.cpus))
        else:
            status('Compressing trimmed SE FASTQ file')
            Fzip_inplace(args.basename + '_1U.fastq', args.cpus)
            status('Trimming finished:\n\tSingle: {:}'.format(
                args.basename + '_1U.fastq.gz'))
            if not args.pipe:
                status('Your next command might be:\n\tAAFTF filter -l {:} -o {:} -c {:}\n'
                       .format(args.basename + '_1U.fastq.gz',
                               args.basename, args.cpus))
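
# For reference, a sketch of the paired-end trimmomatic command assembled
# above (hypothetical paths and settings; '-phred33' stands in for the
# trimmomatic_quality value):
#   java -jar trimmomatic.jar PE -threads 8 -phred33 left.fq.gz right.fq.gz \
#       base_1P.fastq base_1U.fastq base_2P.fastq base_2U.fastq \
#       ILLUMINACLIP:TruSeq3-PE.fa:2:30:10 LEADING:3 TRAILING:3 \
#       SLIDINGWINDOW:4:15 MINLEN:75
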