def test_anim_concordance(
    paths_concordance_fna, path_concordance_jspecies, tolerance_anim, tmp_path
):
    """Verify that ANIm percentage identities agree with JSpecies output.

    NUCmer and delta-filter commands are generated for the input FASTA
    paths, run through the multiprocessing runner, and the resulting
    .delta files are processed. The percentage identity matrix (scaled
    to percent) must match the JSpecies "ANIm" table to within
    ``tolerance_anim``.
    """
    # nucmer and delta-filter command generation is handled separately
    # because Travis-CI doesn't cope with the changes made for local
    # SGE/OGE integration. A scheduler flag passed to jobgroup generation
    # in anim.py might make this unnecessary (still a TODO).
    nucmer_cmds, filter_cmds = anim.generate_nucmer_commands(
        paths_concordance_fna, tmp_path
    )
    delta_dir = tmp_path / "nucmer_output"
    delta_dir.mkdir(exist_ok=True, parents=True)
    run_mp.multiprocessing_run(nucmer_cmds)

    # The cluster won't accept redirection or semicolon separation in a
    # single command, and the wrapper written to work around that
    # (delta_filter_wrapper.py) can't be called under Travis-CI. So each
    # command is rebuilt here: the first token is dropped (presumably the
    # wrapper) and the last token becomes the redirection target.
    redirected_cmds = []
    for filter_cmd in filter_cmds:
        tokens = filter_cmd.split()
        redirected_cmds.append(" > ".join([" ".join(tokens[1:-1]), tokens[-1]]))
    run_mp.multiprocessing_run(redirected_cmds)

    seq_lengths = pyani_files.get_sequence_lengths(paths_concordance_fna)
    results = anim.process_deltadir(delta_dir, seq_lengths)
    identity = results.percentage_identity
    identity.to_csv(tmp_path / "pyani_anim.tab", sep="\t")

    # Compare against JSpecies, with rows and columns in sorted order
    observed = (identity.sort_index(axis=0).sort_index(axis=1) * 100.0).values
    expected = parse_jspecies(path_concordance_jspecies)["ANIm"].values
    assert observed - expected == pytest.approx(0, abs=tolerance_anim)
def test_anim_concordance(self):
    """Verify that ANIm percentage identities agree with JSpecies output.

    Runs NUCmer and delta-filter over ``self.infiles``, processes the
    .delta files in ``self.deltadir``, writes the identity matrix and its
    difference from the JSpecies "ANIm" target into ``self.outdir``, and
    requires the largest absolute difference to be below the ANIm
    tolerance.
    """
    # nucmer and delta-filter command generation is handled separately
    # because Travis-CI doesn't cope with the changes made for local
    # SGE/OGE integration. A scheduler flag passed to jobgroup generation
    # in anim.py might make this unnecessary (still a TODO).
    nucmer_cmds, filter_cmds = anim.generate_nucmer_commands(
        self.infiles, self.outdir
    )
    run_mp.multiprocessing_run(nucmer_cmds)

    # The cluster won't accept redirection or semicolon separation in a
    # single command, and delta_filter_wrapper.py can't be called under
    # Travis-CI, so each command is rebuilt as "<command> > <outfile>"
    # with its first token dropped.
    redirected_cmds = []
    for filter_cmd in filter_cmds:
        tokens = filter_cmd.split()
        redirected_cmds.append(" > ".join([" ".join(tokens[1:-1]), tokens[-1]]))
    run_mp.multiprocessing_run(redirected_cmds)

    results = anim.process_deltadir(self.deltadir, self.orglengths)
    identity = results.percentage_identity
    identity.to_csv(self.outdir / "pyani_anim.tab", sep="\t")

    # Compare against JSpecies, with rows and columns in sorted order
    identity = identity.sort_index(axis=0).sort_index(axis=1) * 100.0
    difference = pd.DataFrame(
        identity.values - self.target["ANIm"].values,
        index=identity.index,
        columns=identity.columns,
    )
    difference.to_csv(self.outdir / "pyani_anim_diff.tab", sep="\t")
    self.assertLess(difference.abs().values.max(), self.tolerance["ANIm"])
def test_anim_concordance(self):
    """ANIm percentage identities agree with JSpecies output.

    Runs NUCmer and delta-filter over ``self.infiles``, processes the
    .delta files in ``self.deltadir``, writes the identity matrix and its
    difference from the JSpecies "ANIm" target into ``self.outdir``, and
    requires the largest absolute difference to be below the ANIm
    tolerance.
    """
    # nucmer and delta-filter command generation is handled separately
    # because Travis-CI doesn't cope with the changes made for local
    # SGE/OGE integration. A scheduler flag passed to jobgroup generation
    # in anim.py might make this unnecessary (still a TODO).
    nucmer_cmds, filter_cmds = anim.generate_nucmer_commands(
        self.infiles, self.outdir
    )
    run_mp.multiprocessing_run(nucmer_cmds)

    # The cluster won't accept redirection or semicolon separation in a
    # single command, and delta_filter_wrapper.py can't be called under
    # Travis-CI, so each command is rebuilt as "<command> > <outfile>"
    # with its first token dropped.
    redirected_cmds = []
    for filter_cmd in filter_cmds:
        tokens = filter_cmd.split()
        redirected_cmds.append(" > ".join([" ".join(tokens[1:-1]), tokens[-1]]))
    run_mp.multiprocessing_run(redirected_cmds)

    results = anim.process_deltadir(self.deltadir, self.orglengths)
    identity = results.percentage_identity
    identity.to_csv(os.path.join(self.outdir, "pyani_anim.tab"), sep="\t")

    # Compare against JSpecies, with rows and columns in sorted order
    identity = identity.sort_index(axis=0).sort_index(axis=1) * 100.0
    difference = pd.DataFrame(
        identity.values - self.target["ANIm"].values,
        index=identity.index,
        columns=identity.columns,
    )
    difference.to_csv(os.path.join(self.outdir, "pyani_anim_diff.tab"), sep="\t")
    assert_less(difference.abs().values.max(), self.tolerance["ANIm"])
def test_process_deltadir(self):
    """Process a directory of .delta files into ANIResults.

    Sequence lengths are read from the FASTA files in ``self.seqdir``,
    the .delta files in ``self.deltadir`` are processed, and the resulting
    percentage identity dataframe must equal ``self.df_pid`` once both
    have their columns and rows sorted.
    """
    seqfiles = pyani_files.get_fasta_files(self.seqdir)
    orglengths = pyani_files.get_sequence_lengths(seqfiles)
    result = anim.process_deltadir(self.deltadir, orglengths)
    # `axis` is passed by keyword: recent pandas releases no longer accept
    # it positionally in DataFrame.sort_index().
    assert_frame_equal(
        result.percentage_identity.sort_index(axis=1).sort_index(),
        self.df_pid.sort_index(axis=1).sort_index(),
    )
def calculate_anim(infiles, org_lengths):
    """Return ANIm results for the given input files.

    - infiles - paths to each input file
    - org_lengths - dictionary of input sequence lengths, keyed by sequence

    Finds ANI by the ANIm method, as described in Richter et al (2009)
    Proc Natl Acad Sci USA 106: 19126-19131 doi:10.1073/pnas.0906412106.

    Unless NUCmer is skipped (args.skip_nucmer), NUCmer jobs are generated
    for the input files and run with either multiprocessing or SGE,
    depending on args.scheduler. The output in args.outdirname is then
    processed with anim.process_deltadir(), and its result is returned.

    Raises ZeroDivisionError if processing the NUCmer output fails with
    that error; diagnostics are logged before the exception propagates.
    """
    logger.info("Running ANIm")
    logger.info("Generating NUCmer command-lines")

    # Only the multiprocessing scheduler reports a cumulative return value.
    # Defaulting to zero keeps the failure diagnostics below from hitting
    # an unbound name when jobs were run through SGE.
    cumval = 0

    # Schedule NUCmer runs
    if not args.skip_nucmer:
        joblist = anim.generate_nucmer_jobs(
            infiles,
            args.outdirname,
            nucmer_exe=args.nucmer_exe,
            maxmatch=args.maxmatch,
        )
        if args.scheduler == "multiprocessing":
            logger.info("Running jobs with multiprocessing")
            cumval = run_mp.run_dependency_graph(
                joblist, verbose=args.verbose, logger=logger
            )
            logger.info("Cumulative return value: %d", cumval)
            if 0 < cumval:
                logger.warning(
                    "At least one NUCmer comparison failed. ANIm may fail."
                )
            else:
                logger.info("All multiprocessing jobs complete.")
        else:
            logger.info("Running jobs with SGE")
            run_sge.run_dependency_graph(
                joblist, verbose=args.verbose, logger=logger
            )
    else:
        logger.warning("Skipping NUCmer run (as instructed)!")

    # Process resulting .delta files
    logger.info("Processing NUCmer .delta files.")
    try:
        data = anim.process_deltadir(args.outdirname, org_lengths)
    except ZeroDivisionError:
        logger.error("One or more NUCmer output files has a problem.")
        if not args.skip_nucmer:
            if 0 < cumval:
                logger.error(
                    "This is possibly due to NUCmer run failure, "
                    "please investigate"
                )
            else:
                logger.error(
                    "This is possibly due to a NUCmer comparison "
                    "being too distant for use. Please consider "
                    "using the --maxmatch option."
                )
        logger.error(last_exception())
        # There is no result to return; propagate the real error rather
        # than failing on an unassigned `data` below.
        raise
    return data
def test_process_deltadir(self):
    """Process a directory of .delta files into ANIResults.

    Sequence lengths are read from the FASTA files in ``self.seqdir``,
    the .delta files in ``self.deltadir`` are processed, and the resulting
    percentage identity dataframe must equal ``self.df_pid`` once both
    have their columns and rows sorted.
    """
    seqfiles = pyani_files.get_fasta_files(self.seqdir)
    orglengths = pyani_files.get_sequence_lengths(seqfiles)
    result = anim.process_deltadir(self.deltadir, orglengths)
    # `axis` is passed by keyword: recent pandas releases no longer accept
    # it positionally in DataFrame.sort_index().
    assert_frame_equal(
        result.percentage_identity.sort_index(axis=1).sort_index(),
        self.df_pid.sort_index(axis=1).sort_index(),
    )
def test_deltadir_parsing(delta_output_dir):
    """Process test directory of .delta files into ANIResults.

    Sequence lengths are read from the FASTA files in
    ``delta_output_dir.seqdir``, the .delta files in
    ``delta_output_dir.deltadir`` are processed, and the resulting
    percentage identity dataframe must equal
    ``delta_output_dir.deltaresult`` once both have their columns and rows
    sorted.
    """
    seqfiles = pyani_files.get_fasta_files(delta_output_dir.seqdir)
    orglengths = pyani_files.get_sequence_lengths(seqfiles)
    result = anim.process_deltadir(delta_output_dir.deltadir, orglengths)
    # `axis` is passed by keyword: recent pandas releases no longer accept
    # it positionally in DataFrame.sort_index().
    assert_frame_equal(
        result.percentage_identity.sort_index(axis=1).sort_index(),
        delta_output_dir.deltaresult.sort_index(axis=1).sort_index(),
    )
def test_anim_concordance():
    """Test concordance of ANIm method with JSpecies output.

    Runs pairwise NUCmer on the FASTA files in INDIRNAME, processes the
    .delta output, and compares the percentage identity matrix (scaled to
    percent, rows and columns sorted) with the JSpecies 'ANIm' table. The
    pyani, JSpecies and difference tables are written to the output
    directory for reference, and the largest absolute difference must be
    below ANIM_THRESHOLD.
    """
    # Make/check output directory
    mode = "ANIm"
    outdirname = delete_and_remake_outdir(mode)
    nucmername = os.path.join(outdirname, 'nucmer_output')
    os.makedirs(nucmername, exist_ok=True)

    # Get dataframes of JSpecies output
    anim_jspecies = parse_table(JSPECIES_OUTFILE, 'ANIm')

    # Identify our input files, and the total lengths of each organism seq
    infiles = pyani_files.get_fasta_files(INDIRNAME)
    org_lengths = pyani_files.get_sequence_lengths(infiles)

    # Test ANIm concordance:
    # Run pairwise NUCmer
    cmdlist = anim.generate_nucmer_commands(infiles, outdirname,
                                            pyani_config.NUCMER_DEFAULT)
    print('\n'.join(cmdlist))
    multiprocessing_run(cmdlist)

    # Process .delta files
    results = anim.process_deltadir(nucmername, org_lengths)
    anim_pid = \
        results.percentage_identity.sort_index(axis=0).sort_index(axis=1) * 100.

    print("ANIm data\n", results)

    index, columns = anim_pid.index, anim_pid.columns
    # DataFrame.as_matrix() has been removed from pandas; .values gives the
    # same underlying array on old and new releases alike.
    diffmat = anim_pid.values - anim_jspecies.values
    anim_diff = pd.DataFrame(diffmat, index=index, columns=columns)

    # Write dataframes to file, for reference
    anim_pid.to_csv(os.path.join(outdirname, 'ANIm_pid.tab'), sep='\t')
    anim_jspecies.to_csv(os.path.join(outdirname, 'ANIm_jspecies.tab'),
                         sep='\t')
    anim_diff.to_csv(os.path.join(outdirname, 'ANIm_diff.tab'), sep='\t')
    print("ANIm concordance test output placed in %s" % outdirname)
    print("ANIm PID\n", anim_pid)
    print("ANIm JSpecies\n", anim_jspecies)
    print("ANIm diff\n", anim_diff)

    # We'd like the absolute difference reported to be < ANIM_THRESHOLD
    max_diff = anim_diff.abs().values.max()
    print("Maximum difference for ANIm: %e" % max_diff)
    assert_less(max_diff, ANIM_THRESHOLD)
def test_anim_concordance():
    """Test concordance of ANIm method with JSpecies output.

    Runs pairwise NUCmer on the FASTA files in INDIRNAME, processes the
    .delta output, and compares the second element of the processed
    results (scaled by 100, rows and columns sorted) with the JSpecies
    'ANIm' table. The pyani, JSpecies and difference tables are written to
    the output directory for reference, and the largest absolute
    difference must be below ANIM_THRESHOLD.
    """
    # Make/check output directory
    mode = "ANIm"
    outdirname = make_outdir(mode)

    # Get dataframes of JSpecies output
    anim_jspecies = parse_table(JSPECIES_OUTFILE, 'ANIm')

    # Identify our input files, and the total lengths of each organism seq
    infiles = pyani_files.get_fasta_files(INDIRNAME)
    org_lengths = pyani_files.get_sequence_lengths(infiles)

    # Test ANIm concordance:
    # Run pairwise NUCmer
    cmdlist = anim.generate_nucmer_commands(infiles, outdirname,
                                            pyani_config.NUCMER_DEFAULT)
    multiprocessing_run(cmdlist, verbose=False)

    # Process .delta files
    anim_data = anim.process_deltadir(outdirname, org_lengths)
    # DataFrame.sort() has been removed from pandas; sort_index() performs
    # the same by-label ordering on both axes.
    anim_pid = anim_data[1].sort_index(axis=0).sort_index(axis=1) * 100.

    # Single-argument print(...) calls behave identically as a Python 2
    # statement and as the Python 3 function.
    print(anim_data)

    index, columns = anim_pid.index, anim_pid.columns
    # DataFrame.as_matrix() has been removed from pandas; .values gives the
    # same underlying array.
    diffmat = anim_pid.values - anim_jspecies.values
    anim_diff = pd.DataFrame(diffmat, index=index, columns=columns)

    # Write dataframes to file, for reference
    anim_pid.to_csv(os.path.join(outdirname, 'ANIm_pid.tab'), sep='\t')
    anim_jspecies.to_csv(os.path.join(outdirname, 'ANIm_jspecies.tab'),
                         sep='\t')
    anim_diff.to_csv(os.path.join(outdirname, 'ANIm_diff.tab'), sep='\t')
    print("ANIm concordance test output placed in %s" % outdirname)
    print("%s %s %s" % (anim_pid, anim_jspecies, anim_diff))

    # We'd like the absolute difference reported to be < ANIM_THRESHOLD
    max_diff = anim_diff.abs().values.max()
    print("Maximum difference for ANIm: %e" % max_diff)
    assert_less(max_diff, ANIM_THRESHOLD)
def calculate_anim(infiles, org_lengths):
    """Return ANIm results for the given input files.

    - infiles - paths to each input file
    - org_lengths - dictionary of input sequence lengths, keyed by sequence

    Finds ANI by the ANIm method, as described in Richter et al (2009)
    Proc Natl Acad Sci USA 106: 19126-19131 doi:10.1073/pnas.0906412106.

    Unless NUCmer is skipped (args.skip_nucmer), NUCmer jobs are generated
    for the input files and run with either multiprocessing or SGE,
    depending on args.scheduler. The .delta output directory is then
    processed with anim.process_deltadir() and, unless args.nocompress is
    set, compressed/deleted afterwards. The processed results are returned.

    If the results flag a zero percentage identity error after a
    multiprocessing run that reported failures, the script exits with
    status 1.
    """
    logger.info("Running ANIm")
    logger.info("Generating NUCmer command-lines")
    deltadir = os.path.join(args.outdirname, ALIGNDIR['ANIm'])
    logger.info("Writing nucmer output to %s", deltadir)

    # Schedule NUCmer runs
    if args.skip_nucmer:
        logger.warning("Skipping NUCmer run (as instructed)!")
    else:
        jobs = anim.generate_nucmer_jobs(
            infiles,
            args.outdirname,
            nucmer_exe=args.nucmer_exe,
            jobprefix=args.jobprefix,
        )
        if args.scheduler != 'multiprocessing':
            logger.info("Running jobs with SGE")
            logger.info("Jobarray group size set to %d", args.sgegroupsize)
            run_sge.run_dependency_graph(
                jobs,
                logger=logger,
                jgprefix=args.jobprefix,
                sgegroupsize=args.sgegroupsize,
            )
        else:
            logger.info("Running jobs with multiprocessing")
            if args.workers is not None:
                logger.info("(using %d worker threads, if available)",
                            args.workers)
            else:
                logger.info("(using maximum number of available worker threads)")
            cumval = run_mp.run_dependency_graph(
                jobs, workers=args.workers, logger=logger
            )
            logger.info("Cumulative return value: %d", cumval)
            if 0 < cumval:
                logger.warning(
                    "At least one NUCmer comparison failed. ANIm may fail."
                )
            else:
                logger.info("All multiprocessing jobs complete.")

    # Process resulting .delta files
    logger.info("Processing NUCmer .delta files.")
    results = anim.process_deltadir(deltadir, org_lengths, logger=logger)

    # Zero percentage identity error: only diagnosed for multiprocessing
    # runs, as that is the only path on which cumval is set.
    if (
        results.zero_error
        and not args.skip_nucmer
        and args.scheduler == 'multiprocessing'
    ):
        if 0 < cumval:
            logger.error(
                "This has possibly been a NUCmer run failure, please investigate"
            )
            logger.error(last_exception())
            sys.exit(1)
        logger.error(
            "This is possibly due to a NUCmer comparison "
            "being too distant for use. Please consider "
            "using the --maxmatch option."
        )
        logger.error(
            "This is alternatively due to NUCmer run "
            "failure, analysis will continue, but please "
            "investigate."
        )

    if not args.nocompress:
        logger.info("Compressing/deleting %s", deltadir)
        compress_delete_outdir(deltadir)

    # Return processed data from .delta files
    return results
def calculate_anim(args: Namespace, infiles: List[Path], org_lengths: Dict) -> pyani_tools.ANIResults:
    """Return ANIm results for the given input files.

    :param args:  Namespace, command-line arguments
    :param infiles:  list of paths to each input file
    :param org_lengths:  dict, input sequence lengths, keyed by sequence

    Finds ANI by the ANIm method, as described in Richter et al (2009)
    Proc Natl Acad Sci USA 106: 19126-19131 doi:10.1073/pnas.0906412106.

    Unless NUCmer is skipped (args.skip_nucmer), NUCmer jobs are generated
    for the input files and run with either multiprocessing or SGE,
    depending on args.scheduler. The .delta output directory is then
    processed with anim.process_deltadir() and, unless args.nocompress is
    set, compressed/deleted afterwards. The processed results are returned.

    Raises SystemExit(1) if the results flag a zero percentage identity
    error after a multiprocessing run that reported failures.
    """
    logger = logging.getLogger(__name__)
    logger.info("Running ANIm")
    logger.info("Generating NUCmer command-lines")
    deltadir = args.outdirname / ALIGNDIR["ANIm"]
    logger.info("Writing nucmer output to %s", deltadir)

    # Schedule NUCmer runs
    if args.skip_nucmer:
        logger.warning("Skipping NUCmer run (as instructed)!")
    else:
        jobs = anim.generate_nucmer_jobs(
            infiles,
            args.outdirname,
            nucmer_exe=args.nucmer_exe,
            filter_exe=args.filter_exe,
            maxmatch=args.maxmatch,
            jobprefix=args.jobprefix,
        )
        if args.scheduler != "multiprocessing":
            logger.info("Running jobs with SGE")
            logger.info("Jobarray group size set to %d", args.sgegroupsize)
            run_sge.run_dependency_graph(
                jobs,
                jgprefix=args.jobprefix,
                sgegroupsize=args.sgegroupsize,
                sgeargs=args.sgeargs,
            )
        else:
            logger.info("Running jobs with multiprocessing")
            if args.workers is not None:
                logger.info("(using %d worker threads, if available)", args.workers)
            else:
                logger.info("(using maximum number of available worker threads)")
            cumval = run_mp.run_dependency_graph(
                jobs, workers=args.workers, logger=logger
            )
            logger.info("Cumulative return value: %d", cumval)
            if cumval > 0:
                logger.warning("At least one NUCmer comparison failed. ANIm may fail.")
            else:
                logger.info("All multiprocessing jobs complete.")

    # Process resulting .delta files
    logger.info("Processing NUCmer .delta files.")
    results = anim.process_deltadir(deltadir, org_lengths, logger=logger)

    # Zero percentage identity error: only diagnosed for multiprocessing
    # runs, as that is the only path on which cumval is set.
    if (
        results.zero_error
        and not args.skip_nucmer
        and args.scheduler == "multiprocessing"
    ):
        if cumval > 0:
            logger.error(
                "This has possibly been a NUCmer run failure, please investigate",
                exc_info=True,
            )
            raise SystemExit(1)
        logger.error(
            "This is possibly due to:\n\t(i) a NUCmer comparison being too distant "
            "for use (please consider using the --maxmatch option)\n\t(ii) NUCmer run "
            "failure (analysis will continue, but please investigate)"
        )

    if not args.nocompress:
        logger.info("Compressing/deleting %s", deltadir)
        compress_delete_outdir(deltadir, logger)

    # Return processed data from .delta files
    return results
def calculate_anim(infiles, org_lengths):
    """Return ANIm results for the given input files.

    - infiles - paths to each input file
    - org_lengths - dictionary of input sequence lengths, keyed by sequence

    Finds ANI by the ANIm method, as described in Richter et al (2009)
    Proc Natl Acad Sci USA 106: 19126-19131 doi:10.1073/pnas.0906412106.

    Unless NUCmer is skipped (args.skip_nucmer), NUCmer jobs are generated
    for the input files and run with either multiprocessing or SGE,
    depending on args.scheduler. The output in args.outdirname is then
    processed with anim.process_deltadir(), and its result is returned.

    Raises ZeroDivisionError if processing the NUCmer output fails with
    that error; diagnostics are logged before the exception propagates.
    """
    logger.info("Running ANIm")
    logger.info("Generating NUCmer command-lines")

    # Only the multiprocessing scheduler reports a cumulative return value.
    # Defaulting to zero keeps the failure diagnostics below from hitting
    # an unbound name when jobs were run through SGE.
    cumval = 0

    # Schedule NUCmer runs
    if not args.skip_nucmer:
        joblist = anim.generate_nucmer_jobs(infiles, args.outdirname,
                                            nucmer_exe=args.nucmer_exe,
                                            maxmatch=args.maxmatch)
        if args.scheduler == 'multiprocessing':
            logger.info("Running jobs with multiprocessing")
            cumval = run_mp.run_dependency_graph(joblist,
                                                 verbose=args.verbose,
                                                 logger=logger)
            logger.info("Cumulative return value: %d", cumval)
            if 0 < cumval:
                logger.warning("At least one NUCmer comparison failed. "
                               "ANIm may fail.")
            else:
                logger.info("All multiprocessing jobs complete.")
        else:
            logger.info("Running jobs with SGE")
            run_sge.run_dependency_graph(joblist, verbose=args.verbose,
                                         logger=logger)
    else:
        logger.warning("Skipping NUCmer run (as instructed)!")

    # Process resulting .delta files
    logger.info("Processing NUCmer .delta files.")
    try:
        data = anim.process_deltadir(args.outdirname, org_lengths)
    except ZeroDivisionError:
        logger.error("One or more NUCmer output files has a problem.")
        if not args.skip_nucmer:
            if 0 < cumval:
                logger.error("This is possibly due to NUCmer run failure, "
                             "please investigate")
            else:
                logger.error("This is possibly due to a NUCmer comparison "
                             "being too distant for use. Please consider "
                             "using the --maxmatch option.")
        logger.error(last_exception())
        # There is no result to return; propagate the real error rather
        # than failing on an unassigned `data` below.
        raise
    return data
def calculate_anim(infiles, org_lengths):
    """Return ANIm results for the given input files.

    - infiles - paths to each input file
    - org_lengths - dictionary of input sequence lengths, keyed by sequence

    Finds ANI by the ANIm method, as described in Richter et al (2009)
    Proc Natl Acad Sci USA 106: 19126-19131 doi:10.1073/pnas.0906412106.

    Unless NUCmer is skipped (args.skip_nucmer), NUCmer jobs are generated
    for the input files and run with either multiprocessing or SGE,
    depending on args.scheduler. The .delta output directory is then
    processed with anim.process_deltadir() and, unless args.nocompress is
    set, compressed/deleted afterwards.

    The last element of the processed data is treated as a zero percentage
    identity error flag; every element before it is returned as a tuple.
    If the flag is set after a multiprocessing run that reported failures,
    the script exits with status 1.
    """
    logger.info("Running ANIm")
    logger.info("Generating NUCmer command-lines")
    deltadir = os.path.join(args.outdirname, ALIGNDIR['ANIm'])
    logger.info("Writing nucmer output to %s" % deltadir)

    # Schedule NUCmer runs
    if args.skip_nucmer:
        logger.warning("Skipping NUCmer run (as instructed)!")
    else:
        jobs = anim.generate_nucmer_jobs(
            infiles,
            args.outdirname,
            nucmer_exe=args.nucmer_exe,
            maxmatch=args.maxmatch,
            jobprefix=args.jobprefix,
        )
        if args.scheduler != 'multiprocessing':
            logger.info("Running jobs with SGE")
            logger.info("Jobarray group size set to %d" % args.sgegroupsize)
            run_sge.run_dependency_graph(
                jobs,
                verbose=args.verbose,
                logger=logger,
                jgprefix=args.jobprefix,
                sgegroupsize=args.sgegroupsize,
            )
        else:
            logger.info("Running jobs with multiprocessing")
            if args.workers is not None:
                logger.info("(using %d worker threads, if available)" %
                            args.workers)
            else:
                logger.info("(using maximum number of available worker threads)")
            cumval = run_mp.run_dependency_graph(
                jobs,
                workers=args.workers,
                verbose=args.verbose,
                logger=logger,
            )
            logger.info("Cumulative return value: %d" % cumval)
            if 0 < cumval:
                logger.warning(
                    "At least one NUCmer comparison failed. ANIm may fail."
                )
            else:
                logger.info("All multiprocessing jobs complete.")

    # Process resulting .delta files
    logger.info("Processing NUCmer .delta files.")
    data = anim.process_deltadir(deltadir, org_lengths, logger=logger)

    # Zero percentage identity error: only diagnosed for multiprocessing
    # runs, as that is the only path on which cumval is set.
    if (
        data[-1]
        and not args.skip_nucmer
        and args.scheduler == 'multiprocessing'
    ):
        if 0 < cumval:
            logger.error(
                "This has possibly been a NUCmer run failure, please investigate"
            )
            logger.error(last_exception())
            sys.exit(1)
        logger.error(
            "This is possibly due to a NUCmer comparison "
            "being too distant for use. Please consider "
            "using the --maxmatch option."
        )
        logger.error(
            "This is alternatively due to NUCmer run "
            "failure, analysis will continue, but please "
            "investigate."
        )

    if not args.nocompress:
        logger.info("Compressing/deleting %s" % deltadir)
        compress_delete_outdir(deltadir)

    # Return processed data from .delta files, without the error flag
    return tuple(data[:-1])