def orient_to_start(fasta_in, fasta_out, folder='.', start=False):
    """Rotate a circular assembly so that it begins at a chosen start sequence.

    The start sequence (a user supplied FASTA, or a built-in cob consensus) is
    aligned to ``fasta_in`` with minimap2.  When exactly one alignment is
    reported the sequence is rotated (and reverse-complemented for a
    minus-strand hit) so that the start sequence sits at position 0.  With zero
    or several alignments the sequence is written unrotated and an error is
    reported through ``status``.

    Args:
        fasta_in: FASTA file with the assembly; if it holds several records
            only the last one is used.
        fasta_out: path of the FASTA file to write (record is named ``mt``).
        folder: directory in which the temporary start FASTA is created.
        start: path to a FASTA with the desired start sequence; any falsy
            value selects the built-in cytochrome b (cob) consensus.
    """
    startFile = os.path.join(folder, '{}.fasta'.format(uuid.uuid4()))
    # try/finally so the temporary start file never outlives this call,
    # even when minimap2 or the FASTA parsing raises
    try:
        # if not starting, then use cytochrome oxidase (cob)
        if not start:
            # generated as spoa consensus from select fungal cob genes
            cob1 = 'atgagaattttaaaaagtcatcctttattaaaattagttaatagttatattattgattcaccacaaccttctaatattagttatttatgaaattttggatctttattagctttatgtttagttatacaaattgtaactggtgttacattagctatgcactatacacctaatgttgatttagcttttaattctgtagaacatattatgagagatgtaaataatggttgattaataagatatttacatgctaatactgcttcagcattctttttcttagttatatttacatataggtagaggattatattatggttcatataaatcacctagaacattaacatgagctattgg'
            with open(startFile, 'w') as outfile:
                outfile.write('>COB\n{}\n'.format(softwrap(cob1)))
        else:
            shutil.copyfile(start, startFile)

        # load sequence (the last record wins when there are several)
        initial_seq = ''
        with open(fasta_in, 'r') as infile:
            for _title, seq in SimpleFastaParser(infile):
                initial_seq = seq

        # align the start sequence (query) against the assembly (target)
        alignments = []
        minimap2_cmd = ['minimap2', '-x', 'map-ont', '-c', fasta_in, startFile]
        for line in execute(minimap2_cmd, '.'):
            if not line.strip():
                # a blank line is not an alignment record
                continue
            alignments.append(line.rstrip().split('\t'))

        if len(alignments) == 1:
            # PAF columns: 2 = query start, 4 = strand,
            # 7 = target start, 8 = target end
            hit = alignments[0]
            ref_strand = hit[4]
            ref_offset = int(hit[2])
            if ref_strand == '-':
                ref_start = int(hit[8]) + ref_offset
            else:
                ref_start = int(hit[7]) - ref_offset
            if initial_seq:
                # the genome is circular: wrap offsets that run past either end
                ref_start %= len(initial_seq)
            rotated = initial_seq[ref_start:] + initial_seq[:ref_start]
            if ref_strand == '-':
                rotated = RevComp(rotated)
            with open(fasta_out, 'w') as outfile:
                outfile.write('>{}\n{}\n'.format('mt', softwrap(rotated)))
        elif len(alignments) == 0:
            status(
                'ERROR: unable to rotate because did not find --starting sequence\n'
            )
            with open(fasta_out, 'w') as outfile:
                outfile.write('>{}\n{}\n'.format('mt', softwrap(initial_seq)))
        else:
            status('ERROR: unable to rotate because found multiple alignments\n')
            for x in alignments:
                sys.stderr.write('{}\n'.format(x))
            with open(fasta_out, 'w') as outfile:
                outfile.write('>{}\n{}\n'.format('mt', softwrap(initial_seq)))
    finally:
        if os.path.isfile(startFile):
            os.remove(startFile)
def run(parser, args):
    """Write the input FASTA sorted longest-to-shortest with renamed records.

    Records shorter than ``args.minlen`` are dropped; for repeated headers the
    first record that passes the length filter is kept.  Output records are
    named ``<args.name>_<rank>`` with rank starting at 1.

    Args:
        parser: unused; kept for the sub-command calling convention.
        args: namespace providing ``input``, ``out``, ``minlen`` and ``name``.
    """
    status('Sorting sequences by length longest --> shortest')
    # Keep the sequences from this single pass.  Looking them up again through
    # SeqIO.to_dict would key by record.id, which differs from the full title
    # line whenever the header carries a description.
    AllSeqs = {}
    # plain 'r': the legacy 'U' flag is rejected by Python 3.11+
    with open(args.input, 'r') as fasta_in:
        for Header, Seq in SimpleFastaParser(fasta_in):
            if Header not in AllSeqs and len(Seq) >= args.minlen:
                AllSeqs[Header] = Seq
    # sorted() is stable, so equal-length records keep their input order
    orderedSeqs = sorted(AllSeqs.values(), key=len, reverse=True)
    with open(args.out, 'w') as fasta_out:
        for rank, seq in enumerate(orderedSeqs, start=1):
            fasta_out.write('>{:}_{:}\n{:}\n'.format(
                args.name, rank, softwrap(seq)))
    status('Output written to: {:}'.format(args.out))
def run(parser, args):
    """Assemble a mitochondrial genome with NOVOPlasty and write it to args.out.

    Steps: verify the external programs, create the working directory, fill in
    the NOVOPlasty config template, run NOVOPlasty, then either rotate a
    circularized assembly to the requested start sequence or write the
    uncircularized contigs as ``contig_<n>``.

    Args:
        parser: unused; kept for the sub-command calling convention.
        args: namespace providing ``workdir``, ``left``, ``right``, ``seed``,
            ``reference``, ``minlen``, ``maxlen``, ``starting``, ``out`` and
            ``pipe``.

    Exits with status 1 when a required program is missing or when NOVOPlasty
    produced no recognizable assembly file.
    """
    # One spelling for both the PATH check and the command that is run;
    # previously these differed in case, which matters on case-sensitive
    # filesystems.
    novoplasty = 'NOVOPlasty.pl'
    for x in (novoplasty, 'minimap2'):
        if not which_path(x):
            status('ERROR: {} is not installed, exiting'.format(x))
            sys.exit(1)

    # first we need to generate working directory
    unique_id = str(uuid.uuid4())[:8]
    if not args.workdir:
        args.workdir = 'mito_' + unique_id
    if not os.path.isdir(args.workdir):
        os.makedirs(args.workdir)

    # now estimate read lengths of FASTQ
    read_len = GuessRL(args.left)

    # seed preference: explicit seed, then the reference, then the bundled seed
    if args.seed:
        seedFasta = os.path.abspath(args.seed)
    elif args.reference:
        seedFasta = os.path.abspath(args.reference)
    else:
        seedFasta = os.path.abspath(
            os.path.join(os.path.dirname(__file__), 'mito-seed.fasta'))

    # now write the novoplasty config file by filling in the template
    defaultConfig = os.path.join(os.path.dirname(__file__),
                                 'novoplasty-config.txt')
    novoConfig = os.path.join(args.workdir, 'novo-config.txt')
    refgenome = os.path.abspath(args.reference) if args.reference else ''
    replacements = (
        ("<PROJECT>", unique_id),
        ("<MINLEN>", str(args.minlen)),
        ("<MAXLEN>", str(args.maxlen)),
        ("<MAXMEM>", str(int(getRAM() * .75))),
        ("<SEED>", seedFasta),
        ("<READLEN>", str(read_len)),
        ("<FORWARD>", os.path.abspath(args.left)),
        ("<REVERSE>", os.path.abspath(args.right)),
        ("<REFERENCE>", refgenome),
    )
    with open(novoConfig, 'w') as outfile, open(defaultConfig, 'r') as infile:
        for line in infile:
            for check, rep in replacements:
                line = line.replace(check, rep)
            outfile.write(line)

    # now we can finally run NOVOPlasty (config path is relative to workdir)
    status('De novo assembling mitochondrial genome using NOVOplasty')
    cmd = [novoplasty, '-c', 'novo-config.txt']
    printCMD(cmd)
    novolog = os.path.join(args.workdir, 'novoplasty.log')
    with open(novolog, 'w') as logfile:
        p1 = subprocess.Popen(cmd,
                              cwd=args.workdir,
                              stdout=logfile,
                              stderr=logfile)
        p1.communicate()

    # Pick the result file by priority instead of by directory listing order,
    # so a circularized assembly is never shadowed by a contigs file that
    # happens to be listed first.
    draftMito = None
    circular = False
    produced = sorted(os.listdir(args.workdir))
    for wanted in ('Circularized_assembly_', 'Contigs_1_',
                   'Uncircularized_assemblies_'):
        matches = [f for f in produced if f.startswith(wanted)]
        if matches:
            draftMito = os.path.join(args.workdir, matches[0])
            circular = wanted == 'Circularized_assembly_'
            break

    if draftMito is None:
        # keep the working directory so the log can be inspected
        status('ERROR: NOVOplasty did not produce an assembly, see {}'.format(
            novolog))
        sys.exit(1)

    if circular:
        status('NOVOplasty assembled complete circular genome')
        if args.starting:
            status('Rotating assembly to start with {}'.format(args.starting))
        else:
            status('Rotating assembly to start with Cytochrome b (cob) gene')
        orient_to_start(draftMito,
                        args.out,
                        folder=args.workdir,
                        start=args.starting)
    else:
        numContigs = 0
        contigLength = 0
        with open(args.out, 'w') as outfile, open(draftMito, 'r') as infile:
            for _title, seq in SimpleFastaParser(infile):
                numContigs += 1
                contigLength += len(seq)
                outfile.write('>contig_{}\n{}\n'.format(
                    numContigs, softwrap(seq)))
        status(
            'NOVOplasty assembled {} contigs consiting of {:,} bp, but was unable to circularize genome'
            .format(numContigs, contigLength))

    status('AAFTF mito complete: {}'.format(args.out))
    if not args.pipe:
        shutil.rmtree(args.workdir)
def parse_clean_blastn(fastafile, prefix, blastn, stringent):
    '''
    Trim or split contigs at vector hits found in a tabular BLAST report.

    Blast header rows:
    qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore score qlen

    Hits are graded on the raw score: terminal hits (within 25 bp of either
    contig end) are moderate at >= 19 and strong at >= 24, internal hits are
    moderate at >= 25 and strong at >= 30.  With stringent == 'high' moderate
    and strong hits are acted on, otherwise only strong hits.  Terminal hits
    truncate the contig, internal hits split it; pieces shorter than 200 bp
    are dropped.  Queries listed in the module-level ``contigs_to_remove``
    are ignored.

    Args:
        fastafile: FASTA file that was used as the BLAST query.
        prefix: path prefix for ``<prefix>.clean.fsa`` and ``<prefix>.parse.log``.
        blastn: path to the 14-column tabular BLAST report.
        stringent: 'high' to act on moderate hits as well as strong ones.

    Returns:
        (number of hits acted on, path of the cleaned FASTA file)
    '''
    min_keep_len = 200
    cleaned = prefix + ".clean.fsa"
    logfile = prefix + ".parse.log"
    # weak=0, moderate=1, strong=2
    min_match = 1 if stringent == 'high' else 2
    VecHits = {}
    found_vector_seq = 0
    with open(blastn, "r") as vectab:
        for row in csv.reader(vectab, delimiter="\t"):
            if not row:
                continue
            (qaccver, saccver, pid, length, mismatch, gapopen, qstart, qend,
             sstart, send, evalue, bitscore, score, qlen) = row
            if qaccver in contigs_to_remove:
                continue
            #vecscreen https://www.ncbi.nlm.nih.gov/tools/vecscreen/about/#Moderate
            #says to use score here (I'm interpret as score not bitscore)
            #need to determine if match is terminal or if internal
            loc = sorted((int(qstart), int(qend)))
            qlen = int(qlen)
            score = int(score)
            #check for location; a hit close to both ends is treated as 3'
            terminal = False
            position = None
            if loc[0] <= 25:
                terminal = True
                position = '5'
            if (qlen - loc[1]) <= 25:
                terminal = True
                position = '3'
            moderate, strong = (19, 24) if terminal else (25, 30)
            if score >= strong:
                Match = 2
            elif score >= moderate:
                Match = 1
            else:
                Match = 0
            if Match < min_match:
                continue
            found_vector_seq += 1
            VecHits.setdefault(qaccver, []).append(
                (saccver, qlen, loc, score, terminal, position))

    # the log file is created (empty) as before; nothing is written to it
    with open(cleaned, "w") as output_handle, open(logfile, "w"):
        for record in SeqIO.parse(fastafile, "fasta"):
            Seq = str(record.seq)
            if record.id not in VecHits:
                if len(Seq) >= min_keep_len:
                    output_handle.write('>{:}\n{:}\n'.format(
                        record.id, softwrap(Seq)))
                continue
            #VecHits contains list of tuples of information, if terminal, then just truncate
            #off the closest side; internal hits split the contig.  Anything left over is
            #picked up by the following screening rounds.
            #BLAST coordinates are 1-based inclusive, so a hit covers Seq[start-1:end];
            #the slice boundary in front of a hit is therefore start-1.
            FiveEnd = 0
            ThreeEnd = len(Seq)
            internals = []
            for _sacc, _qlen, loc, _score, terminal, pos in VecHits[record.id]:
                if terminal and pos == '5':
                    FiveEnd = max(FiveEnd, loc[1])
                elif terminal and pos == '3':
                    ThreeEnd = min(ThreeEnd, loc[0] - 1)
                else:
                    #internal hits to add to list
                    cut = [loc[0] - 1, loc[1]]
                    if cut not in internals:
                        internals.append(cut)
            #now sort intervals and construct slicing list
            internals.sort(key=lambda x: x[0])
            slicer = [FiveEnd]
            for cut in internals:
                slicer.extend(cut)
            slicer.append(ThreeEnd)
            paired_slicer = list(group(slicer, 2))
            if len(paired_slicer) < 2:
                status('Terminal trimming {:} to {:}'.format(
                    record.id, paired_slicer))
                newSeq = Seq[paired_slicer[0][0]:paired_slicer[0][1]]
                if len(newSeq) >= min_keep_len:
                    output_handle.write('>{:}\n{:}\n'.format(
                        record.id, softwrap(newSeq)))
            else:
                status('Spliting contig {:} into {:}'.format(
                    record.id, paired_slicer))
                for num, y in enumerate(paired_slicer):
                    newSeq = Seq[y[0]:y[1]]
                    if len(newSeq) >= min_keep_len:
                        output_handle.write('>split{:}_{:}\n{:}\n'.format(
                            num + 1, record.id, softwrap(newSeq)))

    return (found_vector_seq, cleaned)
def _vecscreen_prepare_dbs(DB, workdir):
    """Download (if needed), decompress and index every screening database."""
    for d in DB_Links:
        if d == 'sourmash':
            continue
        url = DB_Links[d]
        dbname = os.path.basename(str(url))
        dbfile = os.path.join(DB if DB else workdir, dbname)
        if dbfile.endswith(".gz"):
            fasta = os.path.splitext(dbfile)[0]
            if not os.path.exists(fasta):
                if not os.path.exists(dbfile):
                    urllib.request.urlretrieve(url, dbfile)
                with gzip.open(dbfile, 'rb') as ingz, open(fasta,
                                                           'wb') as outfa:
                    shutil.copyfileobj(ingz, outfa)
        else:
            fasta = dbfile
            if not os.path.exists(fasta):
                urllib.request.urlretrieve(url, fasta)
        make_blastdb('nucl', fasta, os.path.join(workdir, d))


def _vecscreen_split_on_regions(seq, regions):
    """Return the non-empty pieces of seq left after cutting out regions.

    regions holds tuples whose first two items are 1-based inclusive
    (start, end) coordinates; they may be unsorted and may overlap.
    """
    pieces = []
    lastpos = 0
    for start, end in sorted((r[0], r[1]) for r in regions):
        cut = start - 1  # 1-based inclusive start -> exclusive slice end
        if cut > lastpos:
            pieces.append(seq[lastpos:cut])
        lastpos = max(lastpos, end)
    if lastpos < len(seq):
        pieces.append(seq[lastpos:])
    return pieces


def run(parser, args):
    """Screen an assembly for contaminant, mitochondrial and vector sequence.

    Contigs are split around eukaryote/prokaryote contaminant hits, contigs
    with mitochondrial hits of at least 120 bp are moved to a separate
    ``<prefix>.mitochondria.fasta`` file, and vector matches are trimmed or
    split out in repeated rounds until a round finds none.

    Args:
        parser: unused; kept for the sub-command calling convention.
        args: namespace providing ``workdir``, ``AAFTF_DB``, ``infile``,
            ``outfile``, ``cpus``, ``stringency``, ``pipe`` and ``debug``.
    """
    if not args.workdir:
        args.workdir = 'aaftf-vecscreen_' + str(uuid.uuid4())[:8]
    if not os.path.exists(args.workdir):
        os.mkdir(args.workdir)

    # database location: command line first, then the environment
    DB = args.AAFTF_DB or os.environ.get("AAFTF_DB")

    infile = args.infile
    outfile = os.path.basename(args.outfile)
    outdir = os.path.dirname(args.outfile)
    if '.f' in outfile:
        prefix = outfile.rsplit('.f', 1)[0]
        print("prefix is ", prefix)
    else:
        prefix = str(os.getpid())
    outfile_vec = os.path.join(args.workdir,
                               "%s.tmp_vecscreen.fasta" % (prefix))

    # Common Euk/Prot contaminats for blastable DB later on
    status('Building BLAST databases for contamination screen.')
    _vecscreen_prepare_dbs(DB, args.workdir)

    # module-level because parse_clean_blastn reads it
    global contigs_to_remove
    contigs_to_remove = {}
    regions_to_trim = {}
    #qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore
    for contam in ["CONTAM_EUKS", "CONTAM_PROKS"]:
        status("%s Contamination Screen" % (contam))
        blastreport = os.path.join(args.workdir,
                                   "%s.%s.blastn" % (contam, prefix))
        blastnargs = [
            'blastn', '-query', infile, '-db',
            os.path.join(args.workdir, contam), '-num_threads',
            str(args.cpus), '-dust', 'yes', '-soft_masking', 'true',
            '-perc_identity', BlastPercent_ID_ContamMatch, '-lcase_masking',
            '-outfmt', '6', '-out', blastreport
        ]
        printCMD(blastnargs)
        call(blastnargs)
        with open(blastreport) as report:
            for row in csv.reader(report, delimiter="\t"):
                if not row:
                    continue
                pident = float(row[2])
                alnlen = int(row[3])
                if not ((pident >= 98.0 and alnlen >= 50) or
                        (pident >= 94.0 and alnlen >= 100) or
                        (pident >= 90.0 and alnlen >= 200)):
                    continue
                # coordinates are taken from this row for every hit; before,
                # later hits on a contig reused the first hit's coordinates
                start, end = sorted((int(row[6]), int(row[7])))
                regions_to_trim.setdefault(row[0], []).append(
                    (start, end, contam, row[1], pident))
        status('{:} screening finished'.format(contam))

    eukCleaned = os.path.join(args.workdir,
                              "%s.euk-prot_cleaned.fasta" % (prefix))
    if len(regions_to_trim) > 0:
        # plain 'r': the legacy 'U' flag is rejected by Python 3.11+
        with open(eukCleaned, 'w') as cleanout, open(infile, 'r') as fastain:
            for record in SeqIO.parse(fastain, 'fasta'):
                if record.id not in regions_to_trim:
                    cleanout.write('>{:}\n{:}\n'.format(
                        record.id, softwrap(str(record.seq))))
                    continue
                regions = regions_to_trim[record.id]
                status('Splitting {:} due to contamination: {:}'.format(
                    record.id, regions))
                pieces = _vecscreen_split_on_regions(str(record.seq), regions)
                for i, newSeq in enumerate(pieces):
                    cleanout.write('>split{:}_{:}\n{:}\n'.format(
                        i, record.id, softwrap(newSeq)))
    else:
        eukCleaned = infile

    # MITO screen
    status('Mitochondria Contamination Screen')
    mitoHits = []
    blastreport = os.path.join(args.workdir,
                               "%s.%s.blastn" % ('MITO', prefix))
    blastnargs = [
        'blastn', '-query', eukCleaned, '-db',
        os.path.join(args.workdir, 'MITO'), '-num_threads',
        str(args.cpus), '-dust', 'yes', '-soft_masking', 'true',
        '-perc_identity', BlastPercent_ID_MitoMatch, '-lcase_masking',
        '-outfmt', '6', '-out', blastreport
    ]
    printCMD(blastnargs)
    call(blastnargs)
    with open(blastreport) as report:
        for row in csv.reader(report, delimiter="\t"):
            if not row:
                continue
            if int(row[3]) >= 120:
                contigs_to_remove[row[0]] = ('MitoScreen', row[1],
                                             float(row[2]))
                mitoHits.append(row[0])
    status('Mito screening finished.')

    #vecscreen starts here
    status(
        'Starting VecScreen, will remove terminal matches and split internal matches'
    )
    rnd = 0
    count = 1
    while (count > 0):
        filepref = "%s.r%d" % (prefix, rnd)
        report = os.path.join(args.workdir, "%s.vecscreen.tab" % (filepref))
        # an existing report for this round is reused rather than recomputed
        if not os.path.exists(report):
            cmd = [
                'blastn', '-task', 'blastn', '-reward', '1', '-penalty', '-5',
                '-gapopen', '3', '-gapextend', '3', '-dust', 'yes',
                '-soft_masking', 'true', '-evalue', '700', '-searchsp',
                '1750000000000', '-db',
                os.path.join(args.workdir, 'UniVec'), '-outfmt',
                '6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore score qlen',
                '-num_threads',
                str(args.cpus), '-query', eukCleaned, '-out', report
            ]
            call(cmd)
        status("Parsing VecScreen round {:}: {:} for {:}".format(
            rnd + 1, filepref, report))
        (count, cleanfile) = parse_clean_blastn(
            eukCleaned, os.path.join(args.workdir, filepref), report,
            args.stringency)
        status("count is %d cleanfile is %s" % (count, cleanfile))
        if count == 0:
            # no vector matches left: the current input is the final result
            status("copying %s to %s" % (eukCleaned, outfile_vec))
            shutil.copy(eukCleaned, outfile_vec)
        else:
            rnd += 1
            eukCleaned = cleanfile

    status("{:,} contigs will be removed:".format(len(contigs_to_remove)))
    for k, v in sorted(contigs_to_remove.items()):
        print('\t{:} --> dbhit={:}; hit={:}; pident={:}'.format(
            k, v[0], v[1], v[2]))

    # this could instead use the outfile and strip .fasta/fsa/fna and add mito on it I suppose, but assumes
    # a bit about the naming structure
    mitochondria = os.path.join(outdir, prefix + '.mitochondria.fasta')
    with open(args.outfile, "w") as output_handle, open(mitochondria,
                                                        'w') as mito_handle:
        for record in SeqIO.parse(outfile_vec, "fasta"):
            if record.id not in contigs_to_remove:
                SeqIO.write(record, output_handle, "fasta")
            elif record.id in mitoHits:
                SeqIO.write(record, mito_handle, "fasta")
    status('Writing {:,} cleaned contigs to: {:}'.format(
        countfasta(args.outfile), args.outfile))
    status('Writing {:,} mitochondrial contigs to: {:}'.format(
        countfasta(mitochondria), mitochondria))

    if '_' in args.outfile:
        nextOut = args.outfile.split('_')[0] + '.sourpurge.fasta'
    elif '.' in args.outfile:
        nextOut = args.outfile.split('.')[0] + '.sourpurge.fasta'
    else:
        nextOut = args.outfile + '.sourpurge.fasta'

    if not args.pipe:
        status(
            'Your next command might be:\n\tAAFTF sourpurge -i {:} -o {:} -c {:} --phylum Ascomycota\n'
            .format(args.outfile, nextOut, args.cpus))

    if not args.debug:
        SafeRemove(args.workdir)