def _write(self):
    """Write the closest representative hit of each user genome to ``self.path``.

    One tab-separated row is written per genome in ``self.genomes`` (in sorted
    order). Only hits whose alignment fraction is at least ``self.min_af`` are
    considered; of those, the hit with the highest ANI (ties broken by the
    highest AF) is reported together with the reference taxonomy and a flag
    stating whether the hit reaches both the value returned by
    ``self.gtdb_radii.get_rep_ani`` for that reference and
    ``self.gtdb_min_af``.

    Genomes without a qualifying hit (either absent from ``self.results`` or
    with every hit below ``self.min_af``) get ``no result`` in every column
    after the genome ID, so that all rows have the same width as the header.
    """
    columns = ('user_genome', 'reference_genome', 'fastani_ani', 'fastani_af',
               'reference_taxonomy', 'satisfies_gtdb_circumscription_criteria')
    # Derive the placeholder width from the header so both "no hit" cases
    # always produce a row with the same number of fields as the header.
    placeholder = '\t'.join(['no result'] * (len(columns) - 1))

    with open(self.path, 'w') as fh:
        fh.write('\t'.join(columns) + '\n')
        for gid in sorted(self.genomes):
            hits = self.results[gid] if gid in self.results else {}
            candidates = [(ref_gid, hit) for ref_gid, hit in hits.items()
                          if hit['af'] >= self.min_af]
            if not candidates:
                fh.write(f'{gid}\t{placeholder}\n')
                continue

            # min() returns the first minimal item, which matches taking
            # element 0 of a stable sort on the same key.
            ref_gid, best = min(candidates,
                                key=lambda x: (-x[1]['ani'], -x[1]['af']))
            canonical_rid = canonical_gid(ref_gid)
            taxonomy_str = ';'.join(self.taxonomy[canonical_rid])
            gtdb_ani_radius = self.gtdb_radii.get_rep_ani(canonical_rid)
            closest_ani = best['ani']
            closest_af = best['af']
            satisfies = closest_ani >= gtdb_ani_radius and closest_af >= self.gtdb_min_af
            fh.write(f'{gid}\t{ref_gid}'
                     f'\t{closest_ani}\t{closest_af}'
                     f'\t{taxonomy_str}'
                     f'\t{satisfies}\n')
    self.logger.info(f'Closest representative hits saved to: {self.path}')
def _read(self):
    """Load the file at ``self.path`` into the representative and species indices.

    Each line is expected to hold three tab-separated fields: species,
    genome ID and ANI. The genome ID is passed through ``canonical_gid`` and
    the ANI is converted to ``float``. Two lookups are populated:

    * ``self._rep_idx[genome]   -> {'species': ..., 'ani': ...}``
    * ``self._species_idx[species] -> {'rep': ..., 'ani': ...}``
    """
    by_rep = dict()
    by_species = dict()
    # Publish the (still empty) indices before touching the file, so the
    # attributes exist even if opening or parsing fails.
    self._rep_idx, self._species_idx = by_rep, by_species

    with open(self.path) as handle:
        records = handle.readlines()
        for record in records:
            fields = record.strip().split('\t')
            species, raw_gid, raw_ani = fields
            rep_gid = canonical_gid(raw_gid)
            ani_value = float(raw_ani)
            by_rep[rep_gid] = {'species': species, 'ani': ani_value}
            by_species[species] = {'rep': rep_gid, 'ani': ani_value}
def _write(self):
    """Write every query/reference hit in ``self.results`` to ``self.path``.

    Rows are tab-separated. Queries are emitted in sorted order; within a
    query, hits are ordered by descending AF, then descending ANI, then
    reference genome ID. The taxonomy column is the ``;``-joined entry of
    ``self.taxonomy`` for the canonical form of the reference ID.
    """

    def _hit_rank(entry):
        # Negate the numeric fields so an ascending sort yields the largest first.
        gid, hit = entry
        return -hit['af'], -hit['ani'], gid

    with open(self.path, 'w') as fh:
        fh.write('user_genome\treference_genome\tfastani_ani\tfastani_af\treference_taxonomy\n')
        for qry_gid, ref_hits in sorted(self.results.items()):
            ordered_hits = list(ref_hits.items())
            ordered_hits.sort(key=_hit_rank)
            for ref_gid, ref_hit in ordered_hits:
                lineage = self.taxonomy[canonical_gid(ref_gid)]
                taxonomy_str = ';'.join(lineage)
                fh.write(f'{qry_gid}\t{ref_gid}'
                         f'\t{ref_hit["ani"]}\t{ref_hit["af"]}'
                         f'\t{taxonomy_str}\n')
    self.logger.info(f'Summary of results saved to: {self.path}')
def read(self, taxonomy_file: str, canonical_ids: bool = False) -> Dict[str, List[str]]:
    """Read Greengenes-style taxonomy file.

    Expected format is:
        <id>\t<taxonomy string>

    where the taxonomy string has the formats:
        d__; p__; c__; o__; f__; g__; s__

    Parameters
    ----------
    taxonomy_file : str
        Path to a Greengenes-style taxonomy file.
    canonical_ids : bool
        True if to use the canonical ID format, False otherwise.

    Returns
    -------
    Dict[str, List[str]]
        d[unique_id] -> list of whitespace-stripped taxa, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened or read (propagated unchanged).
    GTDBTkExit
        If a line does not consist of exactly two tab-separated fields.
    IndexError
        If the taxonomy field of a line is empty.
    """
    # Read the file before entering the parse handler: an I/O failure has no
    # line number to report and must not be masked by the logging code.
    with open(taxonomy_file, 'r') as f:
        lines = f.readlines()

    d = {}
    line_no = 0
    try:
        for line_no, line in enumerate(lines, start=1):
            line_split = line.split('\t')
            if len(line_split) != 2:
                raise GTDBTkExit(f'Not a tab-separated line: {line}')

            unique_id = line_split[0]
            if canonical_ids:
                unique_id = canonical_gid(unique_id)

            tax_str = line_split[1].rstrip()
            if tax_str[-1] == ';':
                # remove trailing semicolons which sometimes
                # appear in Greengenes-style taxonomy files
                tax_str = tax_str[0:-1]

            d[unique_id] = [x.strip() for x in tax_str.split(';')]
    except Exception:
        # Report where parsing stopped, then let the caller see the real error.
        self.logger.error('Failed to parse taxonomy file on line %d' % line_no)
        raise

    return d
def read(self, taxonomy_file, canonical_ids=False):
    """Read Greengenes-style taxonomy file.

    Expected format is:
        <id>\t<taxonomy string>

    where the taxonomy string has the formats:
        d__; p__; c__; o__; f__; g__; s__

    Parameters
    ----------
    taxonomy_file : str
        Greengenes-style taxonomy file.
    canonical_ids : bool
        If True, each ID is passed through ``canonical_gid`` before being
        used as a key.

    Returns
    -------
    dict[str, list[str]]
        d[unique_id] -> [d__<taxon>, ..., s__<taxon>]

    Raises
    ------
    OSError
        If the file cannot be opened or read (propagated unchanged).
    IndexError
        If a line has no tab-separated taxonomy field, or that field is empty.
    """
    # Read the file before entering the parse handler: an I/O failure has no
    # line number to report and must not be masked by the logging code.
    with open(taxonomy_file, 'r') as f:
        lines = f.readlines()

    d = {}
    line_no = 0
    try:
        for line_no, line in enumerate(lines, start=1):
            line_split = line.split('\t')

            unique_id = line_split[0]
            if canonical_ids:
                unique_id = canonical_gid(unique_id)

            tax_str = line_split[1].rstrip()
            if tax_str[-1] == ';':
                # remove trailing semicolons which sometimes
                # appear in Greengenes-style taxonomy files
                tax_str = tax_str[0:-1]

            d[unique_id] = [x.strip() for x in tax_str.split(';')]
    except Exception:
        # Report where parsing stopped, then let the caller see the real error.
        self.logger.error('Failed to parse taxonomy file on line %d' % line_no)
        raise

    return d
def _write(self):
    """Write the closest representative hit of each user genome to ``self.path``.

    One tab-separated row is written per genome in ``self.genomes`` (in sorted
    order). Only hits whose alignment fraction is at least ``self.min_af`` are
    considered; of those, the hit with the highest ANI (ties broken by the
    highest AF) is reported together with the reference taxonomy.

    Genomes without a qualifying hit (either absent from ``self.results`` or
    with every hit below ``self.min_af``) get ``no result`` in every column
    after the genome ID, so that all rows have the same width as the header.
    """
    columns = ('user_genome', 'reference_genome', 'fastani_ani', 'fastani_af',
               'reference_taxonomy')
    # Derive the placeholder width from the header so both "no hit" cases
    # always produce a row with the same number of fields as the header.
    placeholder = '\t'.join(['no result'] * (len(columns) - 1))

    with open(self.path, 'w') as fh:
        fh.write('\t'.join(columns) + '\n')
        for gid in sorted(self.genomes):
            hits = self.results[gid] if gid in self.results else {}
            candidates = [(ref_gid, hit) for ref_gid, hit in hits.items()
                          if hit['af'] >= self.min_af]
            if not candidates:
                fh.write(f'{gid}\t{placeholder}\n')
                continue

            # min() returns the first minimal item, which matches taking
            # element 0 of a stable sort on the same key.
            ref_gid, best = min(candidates,
                                key=lambda x: (-x[1]['ani'], -x[1]['af']))
            canonical_rid = canonical_gid(ref_gid)
            taxonomy_str = ';'.join(self.taxonomy[canonical_rid])
            fh.write(f'{gid}\t{ref_gid}'
                     f'\t{best["ani"]}\t{best["af"]}'
                     f'\t{taxonomy_str}\n')
    self.logger.info(f'Closest representative hits saved to: {self.path}')