def test_process_deltadir(self):
    """Check that a directory of .delta files is processed into ANIResults.

    The percentage-identity table produced by ``anim.process_deltadir`` for
    ``self.deltadir`` is compared with the reference frame ``self.df_pid``.
    """
    seqfiles = pyani_files.get_fasta_files(self.seqdir)
    orglengths = pyani_files.get_sequence_lengths(seqfiles)
    result = anim.process_deltadir(self.deltadir, orglengths)

    # Sort columns, then rows, on both frames so that label order cannot
    # cause a spurious mismatch. The axis is given by keyword because
    # pandas >= 2.0 no longer accepts it positionally.
    observed = result.percentage_identity.sort_index(axis=1).sort_index()
    expected = self.df_pid.sort_index(axis=1).sort_index()
    assert_frame_equal(observed, expected)
def test_aniblastall_concordance():
    """Test concordance of ANIblastall method with JSpecies output.

    The FASTA files in ``INDIRNAME`` are fragmented, a BLAST job graph is
    built and run, and the resulting percentage identities (scaled by 100)
    are compared with the ``ANIb`` table parsed from ``JSPECIES_OUTFILE``.
    The observed, reference and difference tables are written to the output
    directory for inspection. The test passes when the largest absolute
    difference is below ``ANIB_THRESHOLD``.
    """
    # Make/check output directory
    mode = "ANIblastall"
    outdirname = delete_and_remake_outdir(mode)

    # Get dataframes of JSpecies output
    aniblastall_jspecies = parse_table(JSPECIES_OUTFILE, 'ANIb')

    # Identify our input files, and the total lengths of each organism seq
    infiles = pyani_files.get_fasta_files(INDIRNAME)
    org_lengths = pyani_files.get_sequence_lengths(infiles)

    # Test ANIblastall concordance:
    # Make fragments
    fragfiles, fraglengths = anib.fragment_fasta_files(
        infiles, outdirname, pyani_config.FRAGSIZE)

    # Build jobgraph
    jobgraph = anib.make_job_graph(
        infiles, fragfiles,
        anib.make_blastcmd_builder("ANIblastall", outdirname))
    print("\nJobgraph:\n", jobgraph)
    print("\nJob 0:\n", jobgraph[0].script)

    # Run jobgraph with multiprocessing
    run_dependency_graph(jobgraph)
    print("Ran multiprocessing jobs")

    # Process BLAST output; the identities are scaled by 100 before being
    # compared with the JSpecies table
    aniblastall_data = anib.process_blast(outdirname, org_lengths,
                                          fraglengths, mode="ANIblastall")
    aniblastall_pid = (
        aniblastall_data.percentage_identity
        .sort_index(axis=0)
        .sort_index(axis=1) * 100.
    )

    # Element-wise difference on the raw arrays. DataFrame.as_matrix() was
    # removed in pandas 1.0, so the .values accessor is used instead.
    index, columns = aniblastall_pid.index, aniblastall_pid.columns
    diffmat = aniblastall_pid.values - aniblastall_jspecies.values
    aniblastall_diff = pd.DataFrame(diffmat, index=index, columns=columns)

    # Write dataframes to file, for reference
    aniblastall_pid.to_csv(os.path.join(outdirname, 'ANIblastall_pid.tab'),
                           sep='\t')
    aniblastall_jspecies.to_csv(
        os.path.join(outdirname, 'ANIblastall_jspecies.tab'), sep='\t')
    aniblastall_diff.to_csv(os.path.join(outdirname, 'ANIblastall_diff.tab'),
                            sep='\t')
    print("ANIblastall concordance test output placed in %s" % outdirname)
    print("ANIblastall PID:\n", aniblastall_pid)
    print("ANIblastall JSpecies:\n", aniblastall_jspecies)
    print("ANIblastall diff:\n", aniblastall_diff)

    # We'd like the absolute difference reported to be < ANIB_THRESHOLD
    max_diff = aniblastall_diff.abs().values.max()
    print("Maximum difference for ANIblastall: %e" % max_diff)
    assert_less(max_diff, ANIB_THRESHOLD)
def test_process_deltadir(self):
    """Check that a directory of .delta files is processed into ANIResults.

    The percentage-identity table produced by ``anim.process_deltadir`` for
    ``self.deltadir`` is compared with the reference frame ``self.df_pid``.
    """
    seqfiles = pyani_files.get_fasta_files(self.seqdir)
    orglengths = pyani_files.get_sequence_lengths(seqfiles)
    result = anim.process_deltadir(self.deltadir, orglengths)

    # Both frames are sorted by column and then row labels before the
    # comparison. The axis is passed by keyword: pandas >= 2.0 made the
    # arguments of sort_index() keyword-only.
    assert_frame_equal(
        result.percentage_identity.sort_index(axis=1).sort_index(),
        self.df_pid.sort_index(axis=1).sort_index())
def test_deltadir_parsing(delta_output_dir):
    """Process test directory of .delta files into ANIResults.

    :param delta_output_dir: object providing ``seqdir`` (input sequences),
        ``deltadir`` (.delta files) and ``deltaresult`` (the reference
        percentage-identity frame)
    """
    seqfiles = pyani_files.get_fasta_files(delta_output_dir.seqdir)
    orglengths = pyani_files.get_sequence_lengths(seqfiles)
    result = anim.process_deltadir(delta_output_dir.deltadir, orglengths)

    # Sort by column and then row labels so label order does not matter.
    # The axis must be a keyword argument for pandas >= 2.0.
    assert_frame_equal(
        result.percentage_identity.sort_index(axis=1).sort_index(),
        delta_output_dir.deltaresult.sort_index(axis=1).sort_index(),
    )
def test_anib_concordance():
    """Test concordance of ANIb method with JSpecies output.

    This may take some time. Please be patient.

    The FASTA files in ``INDIRNAME`` are fragmented, BLAST databases are
    built and pairwise BLASTN is run, then the percentage identities (scaled
    by 100) are compared with the ``ANIb`` table parsed from
    ``JSPECIES_OUTFILE``. The test passes when the largest absolute
    difference is below ``ANIB_THRESHOLD``.
    """
    # Make/check output directory
    mode = "ANIb"
    outdirname = make_outdir(mode)

    # Get dataframes of JSpecies output
    anib_jspecies = parse_table(JSPECIES_OUTFILE, 'ANIb')

    # Identify our input files, and the total lengths of each organism seq
    infiles = pyani_files.get_fasta_files(INDIRNAME)
    org_lengths = pyani_files.get_sequence_lengths(infiles)

    # Test ANIb concordance:
    # Make fragments
    fragfiles, fraglengths = anib.fragment_FASTA_files(
        infiles, outdirname, pyani_config.FRAGSIZE)

    # Build databases
    cmdlist = anib.generate_blastdb_commands(
        infiles, outdirname, pyani_config.MAKEBLASTDB_DEFAULT, mode="ANIb")
    multiprocessing_run(cmdlist)

    # Run pairwise BLASTN
    cmdlist = anib.generate_blastn_commands(
        fragfiles, outdirname, pyani_config.BLASTN_DEFAULT, mode="ANIb")
    multiprocessing_run(cmdlist, verbose=False)

    # Process BLAST; the pid data is in anib_data[1]
    anib_data = anib.process_blast(outdirname, org_lengths, fraglengths,
                                   mode="ANIb")
    # DataFrame.sort() has been removed from pandas; sort_index() orders the
    # rows and columns by label in the same way.
    anib_pid = anib_data[1].sort_index(axis=0).sort_index(axis=1) * 100.

    # DataFrame.as_matrix() was removed in pandas 1.0; .values gives the
    # same underlying array.
    index, columns = anib_pid.index, anib_pid.columns
    diffmat = anib_pid.values - anib_jspecies.values
    anib_diff = pd.DataFrame(diffmat, index=index, columns=columns)

    # Write dataframes to file, for reference
    anib_pid.to_csv(os.path.join(outdirname, 'ANIb_pid.tab'), sep='\t')
    anib_jspecies.to_csv(os.path.join(outdirname, 'ANIb_jspecies.tab'),
                         sep='\t')
    anib_diff.to_csv(os.path.join(outdirname, 'ANIb_diff.tab'), sep='\t')

    # Single-argument print() calls behave the same on Python 2 and 3
    print("ANIb concordance test output placed in %s" % outdirname)
    print(anib_pid)
    print(anib_jspecies)
    print(anib_diff)

    # We'd like the absolute difference reported to be < ANIB_THRESHOLD
    max_diff = anib_diff.abs().values.max()
    print("Maximum difference for ANIb: %e" % max_diff)
    assert_less(max_diff, ANIB_THRESHOLD)
def unified_anib(indirname, User_ID):
    """Run an ANIblastall analysis over the FASTA files in a directory.

    An output directory named ``<User_ID>_out/`` is created by appending
    that name directly to ``indirname``, sequences are fragmented, the BLAST
    database-building and search jobs are run from a dependency graph, and
    the BLAST output is processed.

    :param indirname: input directory holding the FASTA files; it is joined
        to the output directory name by plain string concatenation, so it
        presumably has to end with a path separator (confirm with callers)
    :param User_ID: identifier used in the output directory and log file
        names
    :returns: element 1 of the value returned by ``anib.process_blast``
    :raises ZeroDivisionError: re-raised, after logging, when processing the
        BLAST output fails with this error
    """
    # Build the output path once; every later step reuses it
    outdirname = indirname + '{0}_out/'.format(User_ID)
    os.mkdir(outdirname)
    # Set permissions directly rather than through a shell command string
    # built from User_ID
    os.chmod(outdirname, 0o777)

    logging.basicConfig(
        level=logging.DEBUG,
        filename="/home/linproject/Workspace/LIN_log/logfile_{0}".format(
            User_ID),
        filemode="a+",
        format="%(asctime)-15s %(levelname)-8s %(message)s")

    infiles = pyani_files.get_fasta_files(indirname)
    org_lengths = pyani_files.get_sequence_lengths(infiles)
    fragsize = pyani_config.FRAGSIZE

    # Refuse to continue if any input path contains whitespace. The names
    # from os.listdir() are relative to indirname, so they are joined to it
    # before being made absolute.
    for fname in os.listdir(indirname):
        if ' ' in os.path.abspath(os.path.join(indirname, fname)):
            logging.error("File or directory '%s' contains whitespace" % fname)
            logging.error("This will cause issues with MUMmer and BLAST")
            logging.error("(exiting)")
            sys.exit(1)

    fragfiles, fraglengths = anib.fragment_FASTA_files(infiles, outdirname,
                                                       fragsize)

    # Export fragment lengths as JSON, in case we re-run BLASTALL with
    # --skip_blastn
    with open(os.path.join(outdirname, 'fraglengths.json'), 'w') as outfile:
        json.dump(fraglengths, outfile)

    # Which executables are we using?
    format_exe = pyani_config.FORMATDB_DEFAULT
    blast_exe = pyani_config.BLASTALL_DEFAULT

    # Run BLAST database-building and executables from a jobgraph
    logging.info("Creating job dependency graph")
    jobgraph = anib.make_job_graph(infiles, fragfiles, outdirname,
                                   format_exe, blast_exe, 'ANIblastall')
    logging.info("Running jobs with multiprocessing")
    logging.info("Running job dependency graph")
    cumval = run_mp.run_dependency_graph(jobgraph, verbose=False,
                                         logger=logging)
    if 0 < cumval:
        logging.warning("At least one BLAST run failed. " +
                        "%s may fail." % 'ANIblastall')
    else:
        logging.info("All multiprocessing jobs complete.")

    # Process pairwise BLASTN output
    logging.info("Processing pairwise %s BLAST output." % 'ANIblastall')
    try:
        data = anib.process_blast(outdirname, org_lengths,
                                  fraglengths=fraglengths,
                                  mode='ANIblastall')
    except ZeroDivisionError:
        logging.error("One or more BLAST output files has a problem.")
        if 0 < cumval:
            logging.error("This is possibly due to BLASTN run failure, " +
                          "please investigate")
        else:
            logging.error("This is possibly due to a BLASTN comparison " +
                          "being too distant for use.")
        logging.error(last_exception())
        # No result exists at this point; propagate the real error instead
        # of failing later on an unbound local
        raise
    return data[1]
def test_anim_concordance():
    """Test concordance of ANIm method with JSpecies output.

    Pairwise NUCmer commands are generated for the FASTA files in
    ``INDIRNAME`` and run, the .delta files in the ``nucmer_output``
    subdirectory are processed, and the percentage identities (scaled by
    100) are compared with the ``ANIm`` table parsed from
    ``JSPECIES_OUTFILE``. The test passes when the largest absolute
    difference is below ``ANIM_THRESHOLD``.
    """
    # Make/check output directory
    mode = "ANIm"
    outdirname = delete_and_remake_outdir(mode)
    nucmername = os.path.join(outdirname, 'nucmer_output')
    os.makedirs(nucmername, exist_ok=True)

    # Get dataframes of JSpecies output
    anim_jspecies = parse_table(JSPECIES_OUTFILE, 'ANIm')

    # Identify our input files, and the total lengths of each organism seq
    infiles = pyani_files.get_fasta_files(INDIRNAME)
    org_lengths = pyani_files.get_sequence_lengths(infiles)

    # Test ANIm concordance:
    # Run pairwise NUCmer
    cmdlist = anim.generate_nucmer_commands(infiles, outdirname,
                                            pyani_config.NUCMER_DEFAULT)
    print('\n'.join(cmdlist))
    multiprocessing_run(cmdlist)

    # Process .delta files
    results = anim.process_deltadir(nucmername, org_lengths)
    anim_pid = (
        results.percentage_identity.sort_index(axis=0).sort_index(axis=1)
        * 100.
    )
    print("ANIm data\n", results)

    # Element-wise difference on the raw arrays. DataFrame.as_matrix() was
    # removed in pandas 1.0, so the .values accessor is used instead.
    index, columns = anim_pid.index, anim_pid.columns
    diffmat = anim_pid.values - anim_jspecies.values
    anim_diff = pd.DataFrame(diffmat, index=index, columns=columns)

    # Write dataframes to file, for reference
    anim_pid.to_csv(os.path.join(outdirname, 'ANIm_pid.tab'), sep='\t')
    anim_jspecies.to_csv(os.path.join(outdirname, 'ANIm_jspecies.tab'),
                         sep='\t')
    anim_diff.to_csv(os.path.join(outdirname, 'ANIm_diff.tab'), sep='\t')
    print("ANIm concordance test output placed in %s" % outdirname)
    print("ANIm PID\n", anim_pid)
    print("ANIm JSpecies\n", anim_jspecies)
    print("ANIm diff\n", anim_diff)

    # We'd like the absolute difference reported to be < ANIM_THRESHOLD
    max_diff = anim_diff.abs().values.max()
    print("Maximum difference for ANIm: %e" % max_diff)
    assert_less(max_diff, ANIM_THRESHOLD)
def test_anim_concordance():
    """Test concordance of ANIm method with JSpecies output.

    Pairwise NUCmer commands are generated for the FASTA files in
    ``INDIRNAME`` and run, the .delta output is processed, and the
    percentage identities (scaled by 100) are compared with the ``ANIm``
    table parsed from ``JSPECIES_OUTFILE``. The test passes when the largest
    absolute difference is below ``ANIM_THRESHOLD``.
    """
    # Make/check output directory
    mode = "ANIm"
    outdirname = make_outdir(mode)

    # Get dataframes of JSpecies output
    anim_jspecies = parse_table(JSPECIES_OUTFILE, 'ANIm')

    # Identify our input files, and the total lengths of each organism seq
    infiles = pyani_files.get_fasta_files(INDIRNAME)
    org_lengths = pyani_files.get_sequence_lengths(infiles)

    # Test ANIm concordance:
    # Run pairwise NUCmer
    cmdlist = anim.generate_nucmer_commands(infiles, outdirname,
                                            pyani_config.NUCMER_DEFAULT)
    multiprocessing_run(cmdlist, verbose=False)

    # Process .delta files; the pid data is in anim_data[1]
    anim_data = anim.process_deltadir(outdirname, org_lengths)
    # DataFrame.sort() has been removed from pandas; sort_index() orders the
    # rows and columns by label in the same way.
    anim_pid = anim_data[1].sort_index(axis=0).sort_index(axis=1) * 100.
    print(anim_data)

    # DataFrame.as_matrix() was removed in pandas 1.0; .values gives the
    # same underlying array.
    index, columns = anim_pid.index, anim_pid.columns
    diffmat = anim_pid.values - anim_jspecies.values
    anim_diff = pd.DataFrame(diffmat, index=index, columns=columns)

    # Write dataframes to file, for reference
    anim_pid.to_csv(os.path.join(outdirname, 'ANIm_pid.tab'), sep='\t')
    anim_jspecies.to_csv(os.path.join(outdirname, 'ANIm_jspecies.tab'),
                         sep='\t')
    anim_diff.to_csv(os.path.join(outdirname, 'ANIm_diff.tab'), sep='\t')

    # Single-argument print() calls behave the same on Python 2 and 3
    print("ANIm concordance test output placed in %s" % outdirname)
    print(anim_pid)
    print(anim_jspecies)
    print(anim_diff)

    # We'd like the absolute difference reported to be < ANIM_THRESHOLD
    max_diff = anim_diff.abs().values.max()
    print("Maximum difference for ANIm: %e" % max_diff)
    assert_less(max_diff, ANIM_THRESHOLD)
def setUp(self):
    """Prepare paths, reference data and tolerances for the tests.

    Input sequences are located and measured, the JSpecies reference table
    is parsed, and the output directories are created if they are missing.
    """
    def concordance_dir(kind):
        # Every test directory follows tests/<kind>/concordance
        return os.path.join("tests", kind, "concordance")

    # Directory layout
    self.indir = concordance_dir("test_input")
    self.outdir = concordance_dir("test_output")
    self.tgtdir = concordance_dir("test_targets")
    self.deltadir = os.path.join(self.outdir, "nucmer_output")

    # Input sequences and the reference table they are compared against
    self.infiles = pyani_files.get_fasta_files(self.indir)
    self.orglengths = pyani_files.get_sequence_lengths(self.infiles)
    target_table = os.path.join(self.tgtdir, "jspecies_output.tab")
    self.target = parse_jspecies(target_table)

    # Tolerances, keyed by method name
    self.tolerance = {
        "ANIm": 0.1,
        "ANIb_lo": 5,
        "ANIb_hi": 0.1,
        "ANIblastall": 0.1,
        "TETRA": 0.1,
    }
    self.fragsize = 1020

    # Output locations must exist before any test writes to them
    for directory in (self.outdir, self.deltadir):
        os.makedirs(directory, exist_ok=True)
def setUp(self):
    """Set values and parameters for tests.

    Paths are built under ``tests/``, the input sequences are located and
    measured, the JSpecies reference table is parsed, and the output
    directories are created if they do not already exist.
    """
    testdir = Path("tests")
    self.indir = testdir / "test_input" / "concordance"
    self.outdir = testdir / "test_output" / "concordance"
    self.tgtdir = testdir / "test_targets" / "concordance"
    self.deltadir = self.outdir / "nucmer_output"
    self.infiles = pyani_files.get_fasta_files(self.indir)
    self.orglengths = pyani_files.get_sequence_lengths(self.infiles)
    self.target = parse_jspecies(self.tgtdir / "jspecies_output.tab")
    # Tolerances, keyed by method name
    self.tolerance = {
        "ANIm": 0.1,
        "ANIb_lo": 5,
        "ANIb_hi": 0.1,
        "ANIblastall": 0.1,
        "TETRA": 0.1,
    }
    self.fragsize = 1020
    # parents=True so that a missing tests/test_output directory is created
    # too, rather than mkdir() raising FileNotFoundError
    self.outdir.mkdir(parents=True, exist_ok=True)
    self.deltadir.mkdir(parents=True, exist_ok=True)
def test_tetra_concordance():
    """Test concordance of TETRA method with JSpecies output.

    A tetranucleotide Z-score is calculated for each FASTA file in
    ``INDIRNAME``, the Z-scores are turned into a correlation table, and
    that table is compared with the ``Tetra`` table parsed from
    ``JSPECIES_OUTFILE``. The test passes when the largest absolute
    difference is below ``TETRA_THRESHOLD``.
    """
    # Make/check output directory
    mode = "TETRA"
    outdirname = make_outdir(mode)

    # Get dataframes of JSpecies output
    tetra_jspecies = parse_table(JSPECIES_OUTFILE, 'Tetra')

    # Identify our input files (sequence lengths are not used by TETRA)
    infiles = pyani_files.get_fasta_files(INDIRNAME)

    # Test TETRA concordance; each organism is keyed by its file stem
    tetra_zscores = {}
    for filename in infiles:
        org = os.path.splitext(os.path.split(filename)[-1])[0]
        tetra_zscores[org] = tetra.calculate_tetra_zscore(filename)
    tetra_correlations = tetra.calculate_correlations(tetra_zscores)

    # DataFrame.as_matrix() was removed in pandas 1.0; .values gives the
    # same underlying array.
    index, columns = tetra_correlations.index, tetra_correlations.columns
    tetra_diff = pd.DataFrame(
        tetra_correlations.values - tetra_jspecies.values,
        index=index, columns=columns)

    # Write dataframes to file, for reference
    tetra_correlations.to_csv(
        os.path.join(outdirname, 'tetra_correlations.tab'), sep='\t')
    tetra_jspecies.to_csv(os.path.join(outdirname, 'tetra_jspecies.tab'),
                          sep='\t')
    tetra_diff.to_csv(os.path.join(outdirname, 'tetra_diff.tab'), sep='\t')

    # Single-argument print() calls behave the same on Python 2 and 3
    print("TETRA concordance test output placed in %s" % outdirname)
    print(tetra_correlations)
    print(tetra_jspecies)
    print(tetra_diff)

    # We'd like the absolute difference reported to be < TETRA_THRESHOLD
    max_diff = tetra_diff.abs().values.max()
    print("Maximum difference for TETRA: %e" % max_diff)
    assert_less(max_diff, TETRA_THRESHOLD)
def test_tetra_concordance():
    """Test concordance of TETRA method with JSpecies output.

    A tetranucleotide Z-score is calculated for each FASTA file in
    ``INDIRNAME``, the Z-scores are turned into a correlation table, and
    that table is compared with the ``Tetra`` table parsed from
    ``JSPECIES_OUTFILE``. The test passes when the largest absolute
    difference is below ``TETRA_THRESHOLD``.
    """
    # Make/check output directory
    mode = "TETRA"
    outdirname = delete_and_remake_outdir(mode)

    # Get dataframes of JSpecies output
    tetra_jspecies = parse_table(JSPECIES_OUTFILE, 'Tetra')

    # Identify our input files (sequence lengths are not used by TETRA)
    infiles = pyani_files.get_fasta_files(INDIRNAME)

    # Test TETRA concordance; each organism is keyed by its file stem
    tetra_zscores = {}
    for filename in infiles:
        org = os.path.splitext(os.path.split(filename)[-1])[0]
        tetra_zscores[org] = tetra.calculate_tetra_zscore(filename)
    tetra_correlations = tetra.calculate_correlations(tetra_zscores)

    # DataFrame.as_matrix() was removed in pandas 1.0; .values gives the
    # same underlying array.
    index, columns = tetra_correlations.index, tetra_correlations.columns
    tetra_diff = pd.DataFrame(
        tetra_correlations.values - tetra_jspecies.values,
        index=index, columns=columns)

    # Write dataframes to file, for reference
    tetra_correlations.to_csv(
        os.path.join(outdirname, 'tetra_correlations.tab'), sep='\t')
    tetra_jspecies.to_csv(os.path.join(outdirname, 'tetra_jspecies.tab'),
                          sep='\t')
    tetra_diff.to_csv(os.path.join(outdirname, 'tetra_diff.tab'), sep='\t')
    print("TETRA concordance test output placed in %s" % outdirname)
    print("TETRA correlations:\n", tetra_correlations)
    print("TETRA JSpecies:\n", tetra_jspecies)
    print("TETRA diff:\n", tetra_diff)

    # We'd like the absolute difference reported to be < TETRA_THRESHOLD
    max_diff = tetra_diff.abs().values.max()
    print("Maximum difference for TETRA: %e" % max_diff)
    assert_less(max_diff, TETRA_THRESHOLD)
# Either skip straight to rendering (--rerender) or validate the scheduler
# and collect the input files and their sequence lengths.
if args.rerender:
    logger.warning("--rerender option used")
    logger.warning("Producing graphics with no new recalculations")
else:
    # Have we got a valid scheduler choice?
    schedulers = ["multiprocessing", "SGE"]
    if args.scheduler not in schedulers:
        logger.error("scheduler %s not recognised (exiting)", args.scheduler)
        logger.error("Valid schedulers are: %s", '; '.join(schedulers))
        sys.exit(1)
    logger.info("Using scheduler method: %s", args.scheduler)

    # Get input files
    logger.info("Identifying FASTA files in %s", args.indirname)
    infiles = pyani_files.get_fasta_files(args.indirname)
    logger.info("Input files:\n\t%s", '\n\t'.join(infiles))

    # Are we subsampling? If so, make the selection here
    if args.subsample:
        infiles = subsample_input(infiles)
        logger.info("Sampled input files:\n\t%s", '\n\t'.join(infiles))

    # Get lengths of input sequences
    logger.info("Processing input sequence lengths")
    org_lengths = pyani_files.get_sequence_lengths(infiles)
    # One tab-indented "name: length" entry per line of the log message
    logger.info("Sequence lengths:\n" + os.linesep.join(
        ["\t%s: %d" % (k, v) for k, v in list(org_lengths.items())]))

    # Run appropriate method on the contents of the input directory,
    # and write out corresponding results.
# Either skip straight to rendering (--rerender) or validate the scheduler
# and collect the input files and their sequence lengths.
if args.rerender:
    logger.warning("--rerender option used")
    logger.warning("Producing graphics with no new recalculations")
else:
    # Have we got a valid scheduler choice?
    schedulers = ["multiprocessing", "SGE"]
    if args.scheduler not in schedulers:
        logger.error("scheduler %s not recognised (exiting)", args.scheduler)
        logger.error("Valid schedulers are: %s", "; ".join(schedulers))
        sys.exit(1)
    logger.info("Using scheduler method: %s", args.scheduler)

    # Get input files. They are reported through the logger only; the
    # leftover debug print() of the same list has been dropped.
    logger.info("Identifying FASTA files in %s", args.indirname)
    infiles = pyani_files.get_fasta_files(args.indirname, recurse=True)
    logger.info("Input files:\n\t%s", "\n\t".join(infiles))

    # Are we subsampling? If so, make the selection here
    if args.subsample:
        infiles = subsample_input(infiles)
        logger.info("Sampled input files:\n\t%s", "\n\t".join(infiles))

    # Get lengths of input sequences
    logger.info("Processing input sequence lengths")
    org_lengths = pyani_files.get_sequence_lengths(infiles)
    # One tab-indented "name: length" entry per line of the log message
    logger.info("Sequence lengths:\n" + os.linesep.join(
        ["\t%s: %d" % (k, v) for k, v in org_lengths.items()]))

    # Run appropriate method on the contents of the input directory,
logger.error("ANI method %s not recognised (exiting)" % args.method) logger.error("Valid methods are: %s" % methods.keys()) sys.exit(1) logger.info("Using ANI method: %s" % args.method) # Have we got a valid scheduler choice? schedulers = ["multiprocessing", "SGE"] if args.scheduler not in schedulers: logger.error("scheduler %s not recognised (exiting)" % args.scheduler) logger.error("Valid schedulers are: %s" % '; '.join(schedulers)) sys.exit(1) logger.info("Using scheduler method: %s" % args.scheduler) # Get input files logger.info("Identifying FASTA files in %s" % args.indirname) infiles = pyani_files.get_fasta_files(args.indirname) logger.info("Input files:\n\t%s" % '\n\t'.join(infiles)) # Get lengths of input sequences logger.info("Processing input sequence lengths") org_lengths = pyani_files.get_sequence_lengths(infiles) logger.info("Sequence lengths:\n" + os.linesep.join(["\t%s: %d" % (k, v) for k, v in org_lengths.items()])) # Run appropriate method on the contents of the input directory, # and write out corresponding results. logger.info("Carrying out %s analysis" % args.method) results = methods[args.method][0](infiles, org_lengths) write(results, methods[args.method][1])
def run_main(argsin: Optional[Namespace] = None) -> int:
    """Run the average_nucleotide_identity.py workflow from start to finish.

    :param argsin: Namespace, command-line arguments; handed to
        ``process_arguments``
    :returns: 0 once the run has completed
    """
    started = time.time()

    # Process command-line and build logger
    args = process_arguments(argsin)
    logger = logging.getLogger(__name__)
    config_logger(args)

    # Ensure argument validity and get method function/config
    test_class_label_paths(args, logger)
    test_scheduler(args, logger)
    method_function, method_config = get_method(args)
    make_outdirs(args)

    # Calculations are only carried out when --rerender was not requested
    if not args.rerender:
        # Locate the input sequences
        logger.info("Identifying FASTA files in %s", args.indirname)
        infiles = pyani_files.get_fasta_files(args.indirname)
        logger.info("Input files:\n\t%s", "\n\t".join(map(str, infiles)))

        # Optionally cut the inputs down to a subsample
        if args.subsample:
            infiles = subsample_input(args, logger, infiles)
            logger.info(
                "Sampled input files:\n\t%s", "\n\t".join(map(str, infiles))
            )

        # Measure the inputs and report one "name: length" entry per line
        logger.info("Processing input sequence lengths")
        org_lengths = pyani_files.get_sequence_lengths(infiles)
        length_report = os.linesep.join(
            "\t%s: %d" % (name, length)
            for name, length in org_lengths.items()
        )
        logger.info("Sequence lengths:\n%s", length_report)

        # Run the chosen method and write out its results; TETRA is the one
        # method that is called with the input files alone
        logger.info("Carrying out %s analysis", args.method)
        if args.method != "TETRA":
            results = method_function(args, infiles, org_lengths)
        else:
            results = method_function(infiles)
        write(args, results)
    else:
        logger.warning(
            "--rerender option used. Producing graphics with no new recalculations"
        )

    # Do we want graphical output?
    if args.graphics or args.rerender:
        logger.info("Rendering output graphics")
        logger.info("Formats requested: %s", args.gformat)
        for graphics_format in args.gformat.split(","):
            logger.info("Graphics format: %s", graphics_format)
            logger.info("Graphics method: %s", args.gmethod)
            draw(args, method_config, graphics_format)

    # Close any open matplotlib figures
    plt.close("all")

    # Report completion and elapsed wall-clock time
    logger.info("Done: %s.", time.asctime())
    logger.info("Time taken: %.2fs", (time.time() - started))

    return 0