def test_aniblastall_concordance(
    paths_concordance_fna,
    path_concordance_jspecies,
    tolerance_anib_hi,
    fragment_length,
    tmp_path,
):
    """Check ANIblastall percentage identities against JSpecies output.

    The input genomes are fragmented, the ANIblastall job graph is run, and
    the resulting percentage identity matrix (scaled by 100) is compared to
    the "ANIb" table parsed from the JSpecies file. Every element must match
    to within ``tolerance_anib_hi``.
    """
    # Total sequence length of each input genome
    genome_lengths = pyani_files.get_sequence_lengths(paths_concordance_fna)

    # Fragment the inputs and build the BLAST job graph under tmp_path
    fragment_files, fragment_sizes = anib.fragment_fasta_files(
        paths_concordance_fna, tmp_path, fragment_length
    )
    cmd_builder = anib.make_blastcmd_builder("ANIblastall", tmp_path)
    jobgraph = anib.make_job_graph(paths_concordance_fna, fragment_files, cmd_builder)

    # A non-zero value from the runner means the jobs did not run correctly
    exit_status = run_mp.run_dependency_graph(jobgraph)
    assert 0 == exit_status

    # Collect percentage identities from the BLAST output
    blast_results = anib.process_blast(
        tmp_path, genome_lengths, fragment_sizes, mode="ANIblastall"
    )
    identity = blast_results.percentage_identity

    # Sort rows and columns, rescale, then compare element-wise to JSpecies
    observed = (identity.sort_index(axis=0).sort_index(axis=1) * 100.0).values
    expected = parse_jspecies(path_concordance_jspecies)["ANIb"].values
    assert observed - expected == pytest.approx(0, abs=tolerance_anib_hi)
def test_aniblastall_concordance(self):
    """Check ANIblastall percentage identities against the JSpecies target.

    Runs ANIblastall in a "blastall" subdirectory of ``self.outdir``, writes
    the identity and difference matrices to tab-separated files, and requires
    the largest absolute difference to be below
    ``self.tolerance["ANIblastall"]``.
    """
    # All BLAST working files go in their own subdirectory
    workdir = self.outdir / "blastall"
    workdir.mkdir(exist_ok=True)

    # Fragment inputs, then build and run the job graph
    fragment_files, fragment_sizes = anib.fragment_fasta_files(
        self.infiles, workdir, self.fragsize
    )
    cmd_builder = anib.make_blastcmd_builder("ANIblastall", workdir)
    jobgraph = anib.make_job_graph(self.infiles, fragment_files, cmd_builder)
    self.assertEqual(0, run_mp.run_dependency_graph(jobgraph))

    # Parse the BLAST output and keep a copy of the raw identity matrix
    identity = anib.process_blast(
        workdir, self.orglengths, fragment_sizes, mode="ANIblastall"
    ).percentage_identity
    identity.to_csv(self.outdir / "pyani_aniblastall.tab", sep="\t")

    # Sort, rescale, and take the element-wise difference from JSpecies
    identity = identity.sort_index(axis=0).sort_index(axis=1) * 100.0
    delta = pd.DataFrame(
        identity.values - self.target["ANIb"].values,
        index=identity.index,
        columns=identity.columns,
    )
    delta.to_csv(self.outdir / "pyani_aniblastall_diff.tab", sep="\t")

    worst = delta.abs().values.max()
    self.assertLess(worst, self.tolerance["ANIblastall"])
def test_dependency_graph_run(self):
    """Check that the multiprocessing runner returns 0 for an ANIb job graph."""
    # Only the first element of the fragmentation result (the fragment
    # files) is needed to build the graph
    fragment_files = anib.fragment_fasta_files(
        self.infiles, self.outdir, self.fraglen
    )[0]
    cmd_builder = anib.make_blastcmd_builder("ANIb", self.outdir)
    jobgraph = anib.make_job_graph(self.infiles, fragment_files, cmd_builder)
    assert_equal(0, run_multiprocessing.run_dependency_graph(jobgraph))
def test_dependency_graph_run(self):
    """Check that the multiprocessing runner returns 0 for an ANIb job graph."""
    # Only the first element of the fragmentation result (the fragment
    # files) is needed to build the graph
    fragment_files = anib.fragment_fasta_files(
        self.infiles, self.outdir, self.fraglen
    )[0]
    cmd_builder = anib.make_blastcmd_builder("ANIb", self.outdir)
    jobgraph = anib.make_job_graph(self.infiles, fragment_files, cmd_builder)
    exit_status = run_multiprocessing.run_dependency_graph(jobgraph)
    self.assertEqual(0, exit_status)
def test_aniblastall_concordance(self):
    """Check ANIblastall percentage identities against the JSpecies target.

    Runs ANIblastall in a "blastall" subdirectory of ``self.outdir``, writes
    the identity and difference matrices to tab-separated files, and requires
    the largest absolute difference to be below
    ``self.tolerance["ANIblastall"]``.
    """
    # All BLAST working files go in their own subdirectory
    workdir = os.path.join(self.outdir, "blastall")
    os.makedirs(workdir, exist_ok=True)

    # Fragment inputs, then build and run the job graph
    fragment_files, fragment_sizes = anib.fragment_fasta_files(
        self.infiles, workdir, self.fragsize
    )
    cmd_builder = anib.make_blastcmd_builder("ANIblastall", workdir)
    jobgraph = anib.make_job_graph(self.infiles, fragment_files, cmd_builder)
    assert_equal(0, run_mp.run_dependency_graph(jobgraph))

    # Parse the BLAST output and keep a copy of the raw identity matrix
    identity = anib.process_blast(
        workdir, self.orglengths, fragment_sizes, mode="ANIblastall"
    ).percentage_identity
    identity.to_csv(os.path.join(self.outdir, "pyani_aniblastall.tab"), sep="\t")

    # Sort, rescale, and take the element-wise difference from JSpecies
    identity = identity.sort_index(axis=0).sort_index(axis=1) * 100.0
    delta = pd.DataFrame(
        identity.values - self.target["ANIb"].values,
        index=identity.index,
        columns=identity.columns,
    )
    delta.to_csv(os.path.join(self.outdir, "pyani_aniblastall_diff.tab"), sep="\t")

    worst = delta.abs().values.max()
    assert_less(worst, self.tolerance["ANIblastall"])
def test_dependency_graph_run(path_fna_two, fragment_length, tmp_path):
    """Check that the dependency-graph runner returns 0 for an ANIb job graph."""
    # Only the first element of the fragmentation result (the fragment
    # files) is needed to build the graph
    fragment_files = fragment_fasta_files(path_fna_two, tmp_path, fragment_length)[0]
    cmd_builder = make_blastcmd_builder("ANIb", tmp_path)
    jobgraph = make_job_graph(path_fna_two, fragment_files, cmd_builder)
    exit_status = run_dependency_graph(jobgraph)
    assert 0 == exit_status
def test_aniblastall_concordance():
    """Test concordance of ANIblastall method with JSpecies output.

    Fragments the input genomes, runs the ANIblastall job graph, and compares
    the resulting percentage identity matrix (scaled by 100) to the "ANIb"
    table parsed from the JSpecies output file. The pyani, JSpecies and
    difference matrices are written to the output directory and printed for
    reference; the test passes when the largest absolute difference is below
    ANIB_THRESHOLD.
    """
    # Make/check output directory
    mode = "ANIblastall"
    outdirname = delete_and_remake_outdir(mode)

    # Get dataframes of JSpecies output
    aniblastall_jspecies = parse_table(JSPECIES_OUTFILE, "ANIb")

    # Identify our input files, and the total lengths of each organism seq
    infiles = pyani_files.get_fasta_files(INDIRNAME)
    org_lengths = pyani_files.get_sequence_lengths(infiles)

    # Test ANIblastall concordance:
    # Make fragments
    fragfiles, fraglengths = anib.fragment_fasta_files(
        infiles, outdirname, pyani_config.FRAGSIZE
    )

    # Build jobgraph
    jobgraph = anib.make_job_graph(
        infiles, fragfiles, anib.make_blastcmd_builder(mode, outdirname)
    )
    print("\nJobgraph:\n", jobgraph)
    print("\nJob 0:\n", jobgraph[0].script)

    # Run jobgraph with multiprocessing
    run_dependency_graph(jobgraph)
    print("Ran multiprocessing jobs")

    # Process BLAST; the pid data is the percentage_identity attribute
    aniblastall_data = anib.process_blast(
        outdirname, org_lengths, fraglengths, mode=mode
    )
    aniblastall_pid = (
        aniblastall_data.percentage_identity.sort_index(axis=0).sort_index(axis=1)
        * 100.0
    )

    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same underlying array on both old and new pandas versions.
    index, columns = aniblastall_pid.index, aniblastall_pid.columns
    diffmat = aniblastall_pid.values - aniblastall_jspecies.values
    aniblastall_diff = pd.DataFrame(diffmat, index=index, columns=columns)

    # Write dataframes to file, for reference
    aniblastall_pid.to_csv(os.path.join(outdirname, "ANIblastall_pid.tab"), sep="\t")
    aniblastall_jspecies.to_csv(
        os.path.join(outdirname, "ANIblastall_jspecies.tab"), sep="\t"
    )
    aniblastall_diff.to_csv(os.path.join(outdirname, "ANIblastall_diff.tab"), sep="\t")
    print("ANIblastall concordance test output placed in %s" % outdirname)
    print("ANIblastall PID:\n", aniblastall_pid)
    print("ANIblastall JSpecies:\n", aniblastall_jspecies)
    print("ANIblastall diff:\n", aniblastall_diff)

    # We'd like the absolute difference reported to be < ANIB_THRESHOLD
    # NOTE(review): the original comment named ANIBLASTALL_THRESHOLD, but the
    # code has always compared against ANIB_THRESHOLD - confirm which is meant.
    max_diff = aniblastall_diff.abs().values.max()
    print("Maximum difference for ANIblastall: %e" % max_diff)
    assert_less(max_diff, ANIB_THRESHOLD)
def test_fragment_files(path_fna_all, tmp_path, dir_tgt_fragments, fragment_length):
    """Check that no fragment produced for ANIb/ANIblastall exceeds the limit."""
    result = anib.fragment_fasta_files(path_fna_all, tmp_path, fragment_length)

    # The last element of the result holds one mapping of fragment lengths
    # per entry; every recorded length must be within bounds
    sizes_by_entry = result[-1]
    for fragment_sizes in sizes_by_entry.values():
        for size in fragment_sizes.values():
            assert size <= fragment_length
def test_fragment_files(self):
    """Check fragment files exist and no fragment exceeds ``self.fraglen``."""
    result = anib.fragment_fasta_files(self.infnames, self.outdir, self.fraglen)

    # Every expected output file must have been written
    for expected_path in self.outfnames:
        assert os.path.isfile(expected_path)

    # The last element of the result holds one mapping of fragment lengths
    # per entry; every recorded length must be within bounds
    sizes_by_entry = result[-1]
    for fragment_sizes in sizes_by_entry.values():
        for size in fragment_sizes.values():
            assert size <= self.fraglen
def test_blastn_graph(path_fna_all, tmp_path, fragment_length):
    """Check the structure of the job graph built for BLASTN+ (ANIb) jobs."""
    fragment_files = anib.fragment_fasta_files(
        path_fna_all, tmp_path, fragment_length
    )[0]
    cmd_builder = anib.make_blastcmd_builder("ANIb", tmp_path)
    jobgraph = anib.make_job_graph(path_fna_all, fragment_files, cmd_builder)

    # Each job's own script must be a blastn command, and it must depend on
    # exactly one other job whose script is a makeblastdb command
    for job in jobgraph:
        assert job.script.startswith("blastn")
        assert len(job.dependencies) == 1
        (first_dependency,) = job.dependencies[:1]
        assert first_dependency.script.startswith("makeblastdb")
def test_blastall_graph(self):
    """Check the structure of the job graph built for legacy BLASTN jobs."""
    fragment_files = anib.fragment_fasta_files(
        self.infiles, self.outdir, self.fraglen
    )[0]
    cmd_builder = anib.make_blastcmd_builder("ANIblastall", self.outdir)
    jobgraph = anib.make_job_graph(self.infiles, fragment_files, cmd_builder)

    # Each job's own script must be a "blastall -p blastn" command, and it
    # must depend on exactly one other job whose script is a formatdb command
    for job in jobgraph:
        assert job.script.startswith("blastall -p blastn")
        assert_equal(1, len(job.dependencies))
        first_dependency = job.dependencies[0]
        assert first_dependency.script.startswith("formatdb")
def test_anib_concordance(self):
    """ANIb results concordant with JSpecies.

    We expect ANIb results to be quite different, as the BLASTN
    algorithm changed substantially between BLAST and BLAST+

    Runs ANIb in a "blastn" subdirectory of ``self.outdir``, writes the
    identity and full difference matrices to tab-separated files, and checks
    the low-identity (< 90) and high-identity (>= 90) comparisons against
    ``self.tolerance["ANIb_lo"]`` and ``self.tolerance["ANIb_hi"]``
    respectively.
    """
    # Perform ANIb on the input directory contents
    outdir = os.path.join(self.outdir, "blastn")
    os.makedirs(outdir, exist_ok=True)
    fragfiles, fraglengths = anib.fragment_fasta_files(
        self.infiles, outdir, self.fragsize
    )
    jobgraph = anib.make_job_graph(
        self.infiles, fragfiles, anib.make_blastcmd_builder("ANIb", outdir)
    )
    assert_equal(0, run_mp.run_dependency_graph(jobgraph))
    results = anib.process_blast(outdir, self.orglengths, fraglengths, mode="ANIb")
    result_pid = results.percentage_identity
    result_pid.to_csv(os.path.join(self.outdir, "pyani_anib.tab"), sep="\t")

    # Compare JSpecies output to results. We do this in two blocks,
    # masked according to whether the expected result is greater than
    # 90% identity, or less than that threshold.
    # The complete difference matrix is written to output, though
    result_pid = result_pid.sort_index(axis=0).sort_index(axis=1) * 100.0
    lo_result = result_pid.mask(result_pid >= 90).fillna(0)
    hi_result = result_pid.mask(result_pid < 90).fillna(0)
    lo_target = self.target["ANIb"].mask(self.target["ANIb"] >= 90).fillna(0)
    hi_target = self.target["ANIb"].mask(self.target["ANIb"] < 90).fillna(0)

    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same underlying array on both old and new pandas versions.
    lo_diffmat = lo_result.values - lo_target.values
    hi_diffmat = hi_result.values - hi_target.values
    diffmat = result_pid.values - self.target["ANIb"].values
    lo_diff = pd.DataFrame(
        lo_diffmat, index=result_pid.index, columns=result_pid.columns
    )
    hi_diff = pd.DataFrame(
        hi_diffmat, index=result_pid.index, columns=result_pid.columns
    )
    anib_diff = pd.DataFrame(
        diffmat, index=result_pid.index, columns=result_pid.columns
    )
    anib_diff.to_csv(os.path.join(self.outdir, "pyani_anib_diff.tab"), sep="\t")
    assert_less(lo_diff.abs().values.max(), self.tolerance["ANIb_lo"])
    assert_less(hi_diff.abs().values.max(), self.tolerance["ANIb_hi"])
def test_anib_concordance(
    paths_concordance_fna,
    path_concordance_jspecies,
    tolerance_anib_hi,
    tolerance_anib_lo,
    threshold_anib_lo_hi,
    fragment_length,
    tmp_path,
):
    """Check ANIb percentage identities against JSpecies output.

    ANIb results are expected to differ noticeably from JSpecies, because
    the BLASTN algorithm changed substantially between BLAST and BLAST+
    (the megaBLAST algorithm is now the default for BLASTN). The comparison
    is therefore split at ``threshold_anib_lo_hi``, with a separate
    tolerance applied on each side.
    """
    # Total sequence length of each input genome
    genome_lengths = pyani_files.get_sequence_lengths(paths_concordance_fna)

    # Fragment the inputs, then build and run the BLAST jobs under tmp_path
    fragment_files, fragment_sizes = anib.fragment_fasta_files(
        paths_concordance_fna, tmp_path, fragment_length
    )
    cmd_builder = anib.make_blastcmd_builder("ANIb", tmp_path)
    jobgraph = anib.make_job_graph(paths_concordance_fna, fragment_files, cmd_builder)
    # A non-zero value from the runner means the jobs did not run correctly
    exit_status = run_mp.run_dependency_graph(jobgraph)
    assert 0 == exit_status

    # Collect percentage identities, sorted on both axes and rescaled
    identity = anib.process_blast(
        tmp_path, genome_lengths, fragment_sizes, mode="ANIb"
    ).percentage_identity
    observed = identity.sort_index(axis=0).sort_index(axis=1) * 100.0
    expected = parse_jspecies(path_concordance_jspecies)["ANIb"]

    # "Low" keeps only values below the threshold and "high" keeps only those
    # at or above it; masked-out cells are zero-filled before differencing
    lo_observed = observed.mask(observed >= threshold_anib_lo_hi).fillna(0).values
    hi_observed = observed.mask(observed < threshold_anib_lo_hi).fillna(0).values
    lo_expected = expected.mask(expected >= threshold_anib_lo_hi).fillna(0).values
    hi_expected = expected.mask(expected < threshold_anib_lo_hi).fillna(0).values

    assert lo_observed - lo_expected == pytest.approx(0, abs=tolerance_anib_lo)
    assert hi_observed - hi_expected == pytest.approx(0, abs=tolerance_anib_hi)
def test_anib_concordance(self):
    """Check ANIb percentage identities against the JSpecies target.

    ANIb results are expected to differ noticeably from JSpecies, because
    the BLASTN algorithm changed substantially between BLAST and BLAST+.
    The comparison is therefore split at 90% identity, with
    ``self.tolerance["ANIb_lo"]`` applied below the split and
    ``self.tolerance["ANIb_hi"]`` at or above it.
    """
    # All BLAST working files go in their own subdirectory
    workdir = os.path.join(self.outdir, "blastn")
    os.makedirs(workdir, exist_ok=True)

    # Fragment inputs, then build and run the job graph
    fragment_files, fragment_sizes = anib.fragment_fasta_files(
        self.infiles, workdir, self.fragsize
    )
    cmd_builder = anib.make_blastcmd_builder("ANIb", workdir)
    jobgraph = anib.make_job_graph(self.infiles, fragment_files, cmd_builder)
    assert_equal(0, run_mp.run_dependency_graph(jobgraph))

    # Parse the BLAST output and keep a copy of the raw identity matrix
    identity = anib.process_blast(
        workdir, self.orglengths, fragment_sizes, mode="ANIb"
    ).percentage_identity
    identity.to_csv(os.path.join(self.outdir, "pyani_anib.tab"), sep="\t")

    observed = identity.sort_index(axis=0).sort_index(axis=1) * 100.0
    expected = self.target["ANIb"]
    labels = {"index": observed.index, "columns": observed.columns}

    # Low block: cells at or above 90 are blanked and zero-filled
    lo_delta = pd.DataFrame(
        observed.mask(observed >= 90).fillna(0).values
        - expected.mask(expected >= 90).fillna(0).values,
        **labels,
    )
    # High block: cells below 90 are blanked and zero-filled
    hi_delta = pd.DataFrame(
        observed.mask(observed < 90).fillna(0).values
        - expected.mask(expected < 90).fillna(0).values,
        **labels,
    )

    # The unmasked difference matrix is what gets written to file
    full_delta = pd.DataFrame(observed.values - expected.values, **labels)
    full_delta.to_csv(os.path.join(self.outdir, "pyani_anib_diff.tab"), sep="\t")

    assert_less(lo_delta.abs().values.max(), self.tolerance["ANIb_lo"])
    assert_less(hi_delta.abs().values.max(), self.tolerance["ANIb_hi"])
def make_sequence_fragments(
    args: Namespace, logger: Logger, infiles: List[Path], blastdir: Path
) -> Tuple[List, Dict]:
    """Fragment input sequences and cache the fragment lengths as JSON.

    :param args:  Namespace of command-line arguments (``fragsize`` is read)
    :param logger:  logging object
    :param infiles:  iterable of sequence files to fragment
    :param blastdir:  path of directory that receives the fragment output
        and the ``fraglengths.json`` cache

    Returns the (fragment files, fragment lengths) pair produced by
    ``anib.fragment_fasta_files``. The fragment lengths are also dumped to
    ``fraglengths.json`` in ``blastdir`` so a later run with --skip_blastn
    can reload them.
    """
    fragment_files, fragment_sizes = anib.fragment_fasta_files(
        infiles, blastdir, args.fragsize
    )

    # Persist the lengths in case we re-run with --skip_blastn
    cache_path = blastdir / "fraglengths.json"
    logger.info(f"Writing cache of fragment lengths to {cache_path}")
    with open(cache_path, "w") as handle:
        json.dump(fragment_sizes, handle)

    return fragment_files, fragment_sizes
def unified_anib(infiles, org_lengths):
    """Calculate ANIb for files in input directory.

    - infiles - paths to each input file
    - org_lengths - dictionary of input sequence lengths, keyed by sequence

    Calculates ANI by the ANIb method, as described in Goris et al. (2007)
    Int J Syst Evol Micr 57: 81-91. doi:10.1099/ijs.0.64483-0. There are
    some minor differences depending on whether BLAST+ or legacy BLAST
    (BLASTALL) methods are used.

    All FASTA format files (selected by suffix) in the input directory are
    used to construct BLAST databases, placed in the output directory.
    Each file's contents are also split into sequence fragments of length
    args.fragsize, and the multiple FASTA file that results written to the
    output directory. These are BLASTNed, pairwise, against the databases.

    The BLAST output is interrogated for all fragment matches that cover
    at least 70% of the query sequence, with at least 30% nucleotide
    identity over the full length of the query sequence. This is an odd
    choice and doesn't correspond to the twilight zone limit as implied by
    Goris et al. We persist with their definition, however.

    Only these qualifying matches contribute to the total aligned length,
    and total aligned sequence identity used to calculate ANI.

    The results are processed to give matrices of aligned sequence length
    (aln_lengths.tab), similarity error counts (sim_errors.tab), ANIs
    (perc_ids.tab), and minimum aligned percentage (perc_aln.tab) of
    each genome, for each pairwise comparison. These are written to the
    output directory in plain text tab-separated format.

    Reads the module-level ``args`` and ``logger`` objects. Returns the
    object produced by ``anib.process_blast``.

    Raises ZeroDivisionError (after logging, and after the BLAST directory
    has been compressed when requested) if ``anib.process_blast`` raises it.
    """
    logger.info("Running %s", args.method)
    blastdir = os.path.join(args.outdirname, ALIGNDIR[args.method])
    logger.info("Writing BLAST output to %s", blastdir)

    # Cumulative return value of the multiprocessing run. It is only set by
    # that scheduler, so default it to 0 for the SGE and --skip_blastn paths;
    # the error handler below reads it.
    cumval = 0

    # Build BLAST databases and run pairwise BLASTN
    if not args.skip_blastn:
        # Make sequence fragments
        logger.info("Fragmenting input files, and writing to %s", args.outdirname)
        # Fraglengths does not get reused with BLASTN
        fragfiles, fraglengths = anib.fragment_fasta_files(
            infiles, blastdir, args.fragsize
        )
        # Export fragment lengths as JSON, in case we re-run with --skip_blastn
        with open(os.path.join(blastdir, "fraglengths.json"), "w") as outfile:
            json.dump(fraglengths, outfile)

        # Run BLAST database-building and executables from a jobgraph
        logger.info("Creating job dependency graph")
        jobgraph = anib.make_job_graph(
            infiles, fragfiles, anib.make_blastcmd_builder(args.method, blastdir)
        )

        if args.scheduler == "multiprocessing":
            logger.info("Running jobs with multiprocessing")
            logger.info("Running job dependency graph")
            cumval = run_mp.run_dependency_graph(jobgraph, logger=logger)
            if 0 < cumval:
                logger.warning(
                    "At least one BLAST run failed. %s may fail.", args.method
                )
            else:
                logger.info("All multiprocessing jobs complete.")
        else:
            # Announce the scheduler before submitting, not afterwards
            logger.info("Running jobs with SGE")
            run_sge.run_dependency_graph(jobgraph, logger=logger)
    else:
        # Import fragment lengths from JSON
        if args.method == "ANIblastall":
            # Plain "r": the legacy "rU" mode raises ValueError on
            # Python 3.11 and later
            with open(os.path.join(blastdir, "fraglengths.json"), "r") as infile:
                fraglengths = json.load(infile)
        else:
            fraglengths = None
        logger.warning("Skipping BLASTN runs (as instructed)!")

    # Process pairwise BLASTN output
    logger.info("Processing pairwise %s BLAST output.", args.method)
    failure = None
    data = None
    try:
        data = anib.process_blast(
            blastdir, org_lengths, fraglengths=fraglengths, mode=args.method
        )
    except ZeroDivisionError as err:
        failure = err
        logger.error("One or more BLAST output files has a problem.")
        if not args.skip_blastn:
            if 0 < cumval:
                logger.error(
                    "This is possibly due to BLASTN run failure, "
                    "please investigate"
                )
            else:
                logger.error(
                    "This is possibly due to a BLASTN comparison "
                    "being too distant for use."
                )
        logger.error(last_exception())

    if not args.nocompress:
        logger.info("Compressing/deleting %s", blastdir)
        compress_delete_outdir(blastdir)

    # There is no result to return after a failure, so surface the original
    # error instead of failing on an unbound ``data`` at the return statement
    if failure is not None:
        raise failure

    # Return processed BLAST data
    return data
def subcmd_anib(args: Namespace, logger: Logger) -> None:
    """Perform ANIb on all genome files in an input directory.

    :param args:  Namespace, command-line arguments
    :param logger:  logging object

    Finds ANI by the ANIb method, as described in Goris J, Konstantinidis KT,
    Klappenbach JA, Coenye T, Vandamme P, et al. (2007) DNA-DNA hybridization
    values and their relationship to whole-genome sequence similarities.
    Int J Syst Evol Micr 57: 81-91. doi:10.1099/ijs.0.64483-0.

    All FASTA format files (selected by suffix) in the input directory are
    fragmented into (by default 1020nt) consecutive sections, and a BLAST+
    database constructed from the whole genome input. The BLAST+ blastn tool
    is then used to query each set of fragments against each BLAST+ database,
    in turn.

    For each query, the BLAST+ .tab output is parsed to obtain alignment
    length, identity and similarity error count. Alignments below a threshold
    are not included in the calculation (this introduces systematic bias with
    respect to ANIm). The results are processed to calculate the ANI
    percentages, coverage, and similarity error.

    The calculated values are stored in the local SQLite3 database.

    NOTE: this subcommand is unfinished. It raises NotImplementedError once
    the per-genome fragment files have been written; the code after that
    point is retained as work in progress and is not currently reachable.

    :raises SystemExit: if the database connection, run/genome registration,
        or output directory creation fails
    :raises NotImplementedError: always, when the steps above succeed
    """
    logger.info("Running ANIb analysis")  # announce that we're starting

    # Get BLAST+ version - this will be used in the database entries
    blastn_version = anib.get_version(args.blastn_exe)
    logger.info(f"BLAST+ blastn version: {blastn_version}")

    # Use provided name, or make new one for this analysis
    start_time = datetime.datetime.now()
    name = args.name or "_".join(["ANIb", start_time.isoformat()])
    logger.info(f"Analysis name: {name}")

    # Connect to existing database (which may be "clean" or have old analyses)
    logger.info(f"Connecting to database {args.dbpath}")
    try:
        session = get_session(args.dbpath)
    except Exception:
        logger.error(
            f"Could not connect to database {args.dbpath} (exiting)", exc_info=True
        )
        raise SystemExit(1)

    # Add information about this run to the database
    logger.info(f"Adding run info to database {args.dbpath}...")
    try:
        run = add_run(
            session,
            method="ANIb",
            cmdline=args.cmdline,
            date=start_time,
            status="started",
            name=name,
        )
    except PyaniORMException:
        logger.error("Could not add run to the database (exiting)", exc_info=True)
        raise SystemExit(1)
    logger.info(f"\t...added run ID: {run} to the database")

    # Identify input files for comparison, and populate the database
    logger.info(f"Adding files for {run} to database...")
    try:
        genome_ids = add_run_genomes(
            session, run, args.indir, args.classes, args.labels
        )
    except PyaniORMException:
        logger.error(
            f"Could not add genomes to database for run {run} (exiting)",
            exc_info=True,
        )
        # Exit as the message says; continuing would fail on the unbound
        # genome_ids in the next log call
        raise SystemExit(1)
    logger.info(f"\t...added genome IDs: {genome_ids}")

    # Get list of genomes for this analysis from the database
    logger.info("Compiling genomes for comparison")
    genomes = run.genomes.all()
    logger.info(f"\tCollected {len(genomes)} genomes for this run")

    # Create output directories. We create the main parent directory
    # (args.outdir), but also subdirectories for the BLAST databases,
    logger.info(f"Creating output directory {args.outdir}")
    try:
        os.makedirs(args.outdir, exist_ok=True)
    except OSError:
        logger.error(
            f"Could not create output directory {args.outdir} (exiting)",
            exc_info=True,
        )
        # SystemExit terminates the program; SystemError is reserved for
        # interpreter-internal errors
        raise SystemExit(1)
    fragdir = Path(str(args.outdir)) / "fragments"
    blastdbdir = Path(str(args.outdir)) / "blastdbs"
    logger.info("\t...creating subdirectories")
    os.makedirs(fragdir, exist_ok=True)
    os.makedirs(blastdbdir, exist_ok=True)

    # Create a new sequence fragment file and a new BLAST+ database for each
    # input genome, and add this data to the database as a row in BlastDB
    logger.info("Creating input sequence fragment files...")
    for genome in genomes:
        fragpath, fraglengths = fragment_fasta_file(
            Path(str(genome.path)), Path(str(fragdir)), args.fragsize
        )
        print(fragpath, len(fraglengths))
        # TODO: register the fragment file/database once implemented:
        # blastdb = add_blastdb(
        #     session, genome, run, fragpath, dbpath, fraglengths, dbcmd
        # )

    # Work in progress: nothing below this line is reachable yet
    raise NotImplementedError

    # Generate all pair permutations of genome IDs as a list of
    # (Genome, Genome) tuples
    logger.info(
        "Compiling pairwise comparisons (this can take time for large datasets)..."
    )
    comparisons = list(permutations(tqdm(genomes, disable=args.disable_tqdm), 2))
    logger.info(f"\t...total parwise comparisons to be performed: {len(comparisons)}")

    # Check for existing comparisons; if one has already been done (for the same
    # software package, version, and setting) we add the comparison to this run,
    # but remove it from the list of comparisons to be performed
    logger.info("Checking database for existing comparison data...")
    comparisons_to_run = filter_existing_comparisons(
        session, run, comparisons, "blastn", blastn_version, args.fragsize, None
    )
    logger.info(
        f"\t...after check, still need to run {len(comparisons_to_run)} comparisons"
    )

    # If there are no comparisons to run, update the Run matrices and exit
    # from this function
    if not comparisons_to_run:
        logger.info("All comparison results present in database (skipping comparisons)")
        logger.info("Updating summary matrices with existing results")
        update_comparison_matrices(session, run)
        return

    # If we are in recovery mode, we are salvaging output from a previous
    # run, and do not necessarily need to rerun all the jobs. In this case,
    # we prepare a list of output files we want to recover from the results
    # in the output directory.
    if args.recovery:
        logger.warning("Entering recovery mode...")
        logger.info(
            f"\tIn this mode, existing comparison output from {args.outdir} is reused"
        )
        existingfiles = collect_existing_output(args.outdir, "blastn", args)
        logger.info(
            f"\tIdentified {len(existingfiles)} existing output files for reuse"
        )
    else:
        existingfiles = None
        logger.info("\tIdentified no existing output files")

    # Split the input genome files into contiguous fragments of the specified
    # size, as described in Goris et al. We create a new directory to hold
    # sequence fragments, away from the main genomes
    logger.info(f"Splitting input genome files into {args.fragsize}nt fragments...")
    fragdir = Path(args.outdir) / "fragments"
    os.makedirs(fragdir, exist_ok=True)
    fragfiles, fraglens = anib.fragment_fasta_files(
        [Path(str(_.path)) for _ in genomes],
        Path(args.outdir) / "fragments",
        args.fragsize,
    )
    logger.info(f"...wrote {len(fragfiles)} fragment files to {fragdir}")

    # Create list of BLASTN jobs for each comparison still to be performed
    logger.info("Creating blastn jobs for ANIb...")
    joblist = generate_joblist(
        comparisons_to_run, existingfiles, fragfiles, fraglens, args, logger
    )
    logger.info(f"...created {len(joblist)} blastn jobs")

    raise NotImplementedError