from pymummer import coords_file, nucmer


def run_nucmer(ref, query, output, min_percent_id=95, run_promer=False):
    '''Run nucmer and return a list of alignment objects'''
    runner = nucmer.Runner(
        ref,
        query,
        output,
        min_id=min_percent_id,
        coords_header=False,
        maxmatch=True,
        simplify=False,
        promer=run_promer)  # nucmer default break length is 200
    runner.run()
    file_reader = coords_file.reader(output)
    alignments = [coord for coord in file_reader]
    return alignments
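
# A minimal usage sketch for run_nucmer() above, assuming the pymummer
# imports already shown. The FASTA paths and identity cutoff are
# illustrative, not from the original script. nucmer needs uncompressed
# FASTA input, and each returned object exposes the show-coords fields
# (ref_start, qry_start, percent_identity, ...).
hits = run_nucmer('ref.fasta', 'query.fasta', 'out.coords', min_percent_id=90)
for hit in hits:
    # Print the coordinates and identity of each alignment.
    print(hit.ref_name, hit.ref_start, hit.ref_end,
          hit.qry_name, hit.qry_start, hit.qry_end,
          hit.percent_identity)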
def getTIRs(elements=None, flankdist=10, minid=80, minterm=10, minseed=5,
            diagfactor=0.3, mites=False, report='split', temp=None,
            keeptemp=False, alignTool='nucmer', verbose=False):
    """
    Align elements to self and attempt to identify TIRs.
    Optionally attempt to construct synthetic MITEs from TIRs.
    """
    # Set temp directory to cwd if none.
    if not temp:
        temp = os.getcwd()
    # For each candidate LTR element
    for rec in elements:
        # Create temp paths for single element fasta and alignment coords
        tempFasta = os.path.join(temp, cleanID(rec.id) + '.fasta')
        tempCoords = tempFasta + '.coords'
        # Write current element to single fasta
        manageTemp(record=rec, tempPath=tempFasta, scrub=False)
        # Align to self with nucmer
        if alignTool == 'nucmer':
            # Compose Nucmer script for current element vs self
            runner = nucmer.Runner(tempFasta, tempFasta, tempCoords,
                                   min_id=minid,
                                   min_length=minseed,
                                   diagfactor=diagfactor,
                                   mincluster=minterm,
                                   breaklen=200,
                                   maxmatch=True,
                                   simplify=False)
            # Execute nucmer
            runner.run()
        elif alignTool == 'blastn':
            # Alternatively, use blastn as the search tool and write
            # nucmer.coords-like output.
            cmds = makeBlast(seq=tempFasta, outfile=tempCoords, pid=minid)
            run_blast(cmds, verbose=verbose)
        # Import coords file to iterator object
        file_reader = coords_file.reader(tempCoords)
        # Exclude hits to self. Also converts iterator output to a stable list
        alignments = [hit for hit in file_reader if not hit.is_self_hit()]
        # Filter hits shorter than min length (done internally for nucmer,
        # not blastn)
        alignments = [hit for hit in alignments
                      if hit.ref_end - hit.ref_start >= minterm]
        # Keep only hits on opposite strands (inverted repeats); same-strand
        # hits are tandem repeats / LTRs
        alignments = [hit for hit in alignments if not hit.on_same_strand()]
        # Filter for 5' repeats which begin within flankdist bases of
        # element start
        alignments = [hit for hit in alignments if hit.ref_start <= flankdist]
        # Scrub overlapping ref / query segments, and also complementary
        # 3' to 5' flank hits
        alignments = [hit for hit in alignments if hit.ref_end < hit.qry_end]
        # Sort largest to smallest distance between end of the 5' repeat (ref)
        # and start of the 3' repeat (qry):
        # x.qry_end - x.ref_end = length of internal segment
        alignments = sorted(alignments,
                            key=lambda x: (x.qry_end - x.ref_end),
                            reverse=True)
        # If alignments exist after filtering, report features using the
        # alignment pair with the largest internal segment, i.e. the first
        # element in the sorted list.
        if alignments:
            if verbose:
                for x in alignments:
                    print(x)
            if report == 'all':
                yield rec
            if report in ['split', 'external']:
                # yield TIR slice - append "_TIR"
                extSeg = rec[alignments[0].ref_start:alignments[0].ref_end + 1]
                extSeg.id = extSeg.id + "_TIR"
                extSeg.name = extSeg.id
                extSeg.description = "[" + rec.id + " TIR segment]"
                yield extSeg
            if report in ['split', 'internal']:
                # yield internal slice - append "_I"
                intSeg = rec[alignments[0].ref_end:alignments[0].qry_end + 1]
                intSeg.id = intSeg.id + "_I"
                intSeg.name = intSeg.id
                intSeg.description = "[" + rec.id + " internal segment]"
                yield intSeg
            if mites:
                # Assemble TIRs into hypothetical MITEs
                synMITE = (rec[alignments[0].ref_start:alignments[0].ref_end + 1]
                           + rec[alignments[0].qry_end:alignments[0].qry_start + 1])
                synMITE.id = synMITE.id + "_synMITE"
                synMITE.name = synMITE.id
                synMITE.description = ("[Synthetic MITE constructed from "
                                       + rec.id + " TIRs]")
                yield synMITE
        else:
            # If alignment list is empty after filtering, print an alert
            # and continue
            print('No TIRs found for candidate element: %s' % rec.id)
        # Scrub single fasta and coords file for current element.
        if not keeptemp:
            manageTemp(tempPath=tempFasta, scrub=True)
            manageTemp(tempPath=tempCoords, scrub=True)
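
# A hedged usage sketch for getTIRs() above; the file names are hypothetical,
# and cleanID()/manageTemp()/makeBlast()/run_blast() are assumed to come from
# the same module. getTIRs() is a generator over Biopython SeqRecords (slicing
# and concatenation of rec require SeqRecord semantics); with report='split'
# each element yields a "_TIR" and an "_I" record.
from Bio import SeqIO

elements = list(SeqIO.parse('candidate_elements.fasta', 'fasta'))
with open('tir_segments.fasta', 'w') as handle:
    for seg in getTIRs(elements=elements, minid=85, report='split',
                       alignTool='nucmer', verbose=False):
        SeqIO.write(seg, handle, 'fasta')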
    else:
        contigs.append(x)
        #print("long", x.id)

for pathname in glob.glob("*.fasta"):
    basename = os.path.basename(pathname)
    for x in short_contigs:
        if x.id in basename:
            runner = nucmer.Runner(basename, basename,
                                   "%(x)s_out.coords" % {'x': x.id},
                                   maxmatch=True, simplify=False,
                                   mincluster=2000, min_id=99,
                                   min_length=2000, coords_header=True)
            runner.run()

# The lines below save fasta files of the contigs, if desired
#SeqIO.write(short_contigs, "short_contigs.fasta", "fasta")
#SeqIO.write(lin_contigs, "lin_contigs.fasta", "fasta")

# The lines below are for visually checking which files are repetitive or not
'''
for pathname in glob.glob("*.coords"):
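
# A sketch, under assumptions, of screening the *_out.coords files produced
# above programmatically rather than visually: a contig whose self-alignment
# reports non-self hits is likely repetitive. Treating "any non-self hit" as
# the signal is illustrative, not from the original script.
import glob
from pymummer import coords_file

for coords_path in glob.glob("*_out.coords"):
    # coords_file.reader skips header lines, so coords_header=True is fine.
    non_self = [h for h in coords_file.reader(coords_path)
                if not h.is_self_hit()]
    if non_self:
        print(coords_path, 'looks repetitive:', len(non_self), 'non-self hits')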
# Check that a valid numeric value was provided for the identity threshold
if not args.minidentity.isdigit():
    print('Error: Identity (-i) threshold must be a numeric value')
    exit()

# Set a default header if a user-defined header name is absent
if args.header is None:
    args.header = 'reordered_contig'
print(args.header)

# Load the query file as a faidx index
contigfile = Fasta(args.query)

# Run nucmer and filter the coords output file
runner = nucmer.Runner(args.reference, args.query, args.coordinates,
                       min_id=args.minidentity, coords_header=False)
runner.run()

# Open output files
coordsfile = open(args.coordinates)
outfile = open(args.output, 'w')

# Reorder the sequences based on the reference genome coordinates
reordered = ''
#print(contigfile[0].name)
for line in coordsfile:
    fields = line.replace('[BEGIN]', '').rstrip('\n').split('\t')[:-1]
    #print(fields)
    start = int(fields[2])
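
# An alternative sketch using pymummer's own parser instead of manual
# tab-splitting: coords_file.reader() turns each line of the coords output
# into an alignment object, which avoids hard-coded column indices such as
# fields[2] above. Field names follow the attributes pymummer exposes.
from pymummer import coords_file

for aln in coords_file.reader(args.coordinates):
    print(aln.ref_name, aln.ref_start, aln.ref_end,
          aln.qry_name, aln.qry_start, aln.qry_end)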
def main():
    p = argparse.ArgumentParser()
    p.add_argument('oddities_csv')
    p.add_argument('genomes_dir', help='fastani database dir')
    p.add_argument('--percent-threshold', type=float, default=95.0)
    p.add_argument('--length-threshold', type=int, default=0)
    p.add_argument('-v', '--verbose', action='store_true')
    p.add_argument('--genome-extension', default='', type=str)
    args = p.parse_args()

    print('loading', args.oddities_csv)
    print('getting genomes from:', args.genomes_dir)
    print('length threshold for alignments (bp):', args.length_threshold)
    print('lower cutoff for identity (%):', args.percent_threshold)

    prefix = args.oddities_csv
    assert prefix.endswith('.csv')
    prefix = prefix[:-4]

    fp = open(args.oddities_csv, 'rt')
    r = csv.DictReader(fp)

    alignments_dir = prefix + '.alignments'
    print('putting alignments in:', alignments_dir)
    try:
        os.mkdir(alignments_dir)
    except FileExistsError:
        print('warning: directory already exists!')
    print('----')

    for row in r:
        cluster_name = row['cluster']
        ident1 = os.path.basename(row['ident1'])
        ident2 = os.path.basename(row['ident2'])
        if args.verbose:
            print(cluster_name, ident1, ident2)

        # copy & name genome files "clusterx.y.IDENT.fa"; gunzip if
        # necessary, since nucmer doesn't handle gzip.
        fn1 = find_genome_filename(args.genomes_dir, ident1,
                                   args.genome_extension)
        genome1 = os.path.join(alignments_dir,
                               '{}.{}.fa'.format(cluster_name, ident1))
        copy_and_gunzip_genome(fn1, genome1)

        fn2 = find_genome_filename(args.genomes_dir, ident2,
                                   args.genome_extension)
        genome2 = os.path.join(alignments_dir,
                               '{}.{}.fa'.format(cluster_name, ident2))
        copy_and_gunzip_genome(fn2, genome2)

        nucmer_output_name = os.path.join(alignments_dir, cluster_name + '.a')
        if not os.path.exists(nucmer_output_name):
            print('running {} alignments...'.format(cluster_name))
            runner = nucmer.Runner(genome1, genome2, nucmer_output_name)
            runner.run()
            print('...done!')
        else:
            if args.verbose:
                print('using cached alignments file', nucmer_output_name)

        file_reader = coords_file.reader(nucmer_output_name)
        alignments = [coord for coord in file_reader
                      if not coord.is_self_hit()]

        # alignment obj attributes: 'frame', 'hit_length_qry',
        # 'hit_length_ref', 'intersects_variant', 'is_self_hit',
        # 'on_same_strand', 'percent_identity', 'qry_coords',
        # 'qry_coords_from_ref_coord', 'qry_end', 'qry_length', 'qry_name',
        # 'qry_start', 'ref_coords', 'ref_coords_from_qry_coord', 'ref_end',
        # 'ref_length', 'ref_name', 'ref_start', 'reverse_query',
        # 'reverse_reference', 'to_msp_crunch'

        # sort alignments by length of hit, longest first
        alignments.sort(key=lambda x: -x.hit_length_qry)

        # track alignments over a particular threshold
        keep_alignments = []
        all_bp = 0
        aligned_bp = 0
        weighted_percent_identity = 0.
        skipped_bp = 0
        skipped_aln = 0
        for alignment in alignments:
            weighted_percent_identity += \
                alignment.percent_identity * alignment.hit_length_qry
            all_bp += alignment.hit_length_qry

            # do we pass the length and percent identity thresholds?
            # if so, keep!
            if alignment.hit_length_qry >= args.length_threshold and \
               alignment.percent_identity >= args.percent_threshold:
                aligned_bp += alignment.hit_length_qry
                keep_alignments.append(alignment)
            else:
                skipped_bp += alignment.hit_length_qry
                skipped_aln += 1

        # ditch if no alignments
        if not keep_alignments:
            print('** FLAG: no kept alignments for {}, punting.'.format(
                cluster_name))
            print('')
            continue

        # set up the printed out info
        lca_name = "(root)"            # an empty lca => root of taxonomy
        if row['lca']:
            lca_name = row['lca']
        shared_kmers = int(row['shared_kmers'])
        ksize = int(row['ksize'])

        # nice output! with some flags.
        print('{}: {:.0f}kb aln ({:.0f}k {}-mers) across {}; '
              'longest contig: {:.0f} kb'.format(
                  cluster_name, aligned_bp / 1000, shared_kmers / 1000,
                  ksize, lca_name,
                  keep_alignments[0].hit_length_qry / 1000))
        print('weighted percent identity across alignments: {:.1f}%'.format(
            weighted_percent_identity / all_bp))
        print('skipped {:.0f} kb of alignments in {} alignments '
              '(< {} bp or < {:.0f}% identity)'.format(
                  skipped_bp / 1000, skipped_aln, args.length_threshold,
                  args.percent_threshold))

        # note: abs() must wrap the log itself, not the comparison
        if abs(math.log(shared_kmers / aligned_bp)) > 1:
            print('** FLAG, oddly too little or too many aligned bp vs k-mers')

        ### track & remove contigs from query genome (genome2)
        keep_d = defaultdict(set)
        for aln in keep_alignments:
            keep_d[aln.qry_name].add(aln)

        bp_removed = remove_contigs(ident2, genome2, keep_d)
        flag_2 = 0
        if bp_removed > 2.5 * aligned_bp:
            flag_2 = 1
            # reset: rm the kept file, and leave removed empty.
            os.unlink(genome2 + '.kept.fa')
            with open(genome2 + '.removed.fa', 'wt') as fp:
                pass

        ### track & remove contigs from ref genome (genome1)
        keep_d = defaultdict(set)
        for aln in keep_alignments:
            keep_d[aln.ref_name].add(aln)

        bp_removed = remove_contigs(ident1, genome1, keep_d)
        flag_1 = 0
        if bp_removed > 2.5 * aligned_bp:
            flag_1 = 1
            # reset: rm the kept file, and leave removed empty.
            os.unlink(genome1 + '.kept.fa')
            with open(genome1 + '.removed.fa', 'wt') as fp:
                pass

        # output summary of flags
        if flag_1 and flag_2:
            print('** FLAGFLAG, too much removed from both!')
        elif flag_1 and not flag_2:
            print('** FLAG, {} is probably contaminated '
                  '(too much rm from {})'.format(ident2, ident1))
        elif flag_2 and not flag_1:
            print('** FLAG, {} is probably contaminated '
                  '(too much rm from {})'.format(ident1, ident2))
        print('')
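
# A hypothetical sketch of the remove_contigs() helper called above; the real
# implementation lives elsewhere in this script and is not shown here. To be
# consistent with the call sites, it must write <genome>.kept.fa and
# <genome>.removed.fa, move whole contigs that carry kept alignments into the
# removed file, and return the number of bp removed. screed is an assumption
# (it fits this script's sourmash ecosystem); any FASTA reader would do.
import screed


def remove_contigs(ident, genome, keep_d):
    # keep_d maps contig name -> set of kept alignments on that contig
    bp_removed = 0
    with open(genome + '.kept.fa', 'wt') as kept, \
         open(genome + '.removed.fa', 'wt') as removed:
        for record in screed.open(genome):
            name = record.name.split()[0]
            if name in keep_d:
                # contig has kept alignments => remove it from the genome
                removed.write('>{}\n{}\n'.format(record.name, record.sequence))
                bp_removed += len(record.sequence)
            else:
                kept.write('>{}\n{}\n'.format(record.name, record.sequence))
    return bp_removed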