def calculate_tetra(infiles):
    """Compute TETRA correlation scores for a collection of input files.

    - infiles - paths to each input file

    A Z-score result is calculated for every input file and stored under
    the file's name with its directory and final extension removed; the
    collected results are then passed to tetra.calculate_correlations().

    Returns a one-element tuple holding the calculated correlations.

    The TETRA method is described in:

    Richter M, Rossello-Mora R (2009) Shifting the genomic gold standard for
    the prokaryotic species definition. Proc Natl Acad Sci USA 106:
    19126-19131. doi:10.1073/pnas.0906412106.

    and

    Teeling et al. (2004) Application of tetranucleotide frequencies for the
    assignment of genomic fragments. Env. Microbiol. 6(9): 938-947.
    doi:10.1111/j.1462-2920.2004.00624.x
    """
    logger.info("Running TETRA.")

    # Stage one: per-file Z-scores, keyed by the bare file name
    logger.info("Calculating TETRA Z-scores for each sequence.")
    zscores_by_org = {}
    for seqfile in infiles:
        logger.info("Calculating TETRA Z-scores for %s", seqfile)
        basename = os.path.split(seqfile)[-1]
        stem, _extension = os.path.splitext(basename)
        zscores_by_org[stem] = tetra.calculate_tetra_zscore(seqfile)

    # Stage two: correlate the Z-scores of every sequence with each other
    logger.info("Calculating TETRA correlation scores.")
    return (tetra.calculate_correlations(zscores_by_org),)
def calculate_tetra(infiles: List[Path]) -> pd.DataFrame:
    """Compute TETRA correlation scores for a collection of input files.

    :param infiles:  list, paths to each input file

    A Z-score result is calculated for every input file and stored under
    the path's stem; the collected results are then passed to
    tetra.calculate_correlations(), whose result is returned.

    The TETRA method is described in:

    Richter M, Rossello-Mora R (2009) Shifting the genomic gold standard for
    the prokaryotic species definition. Proc Natl Acad Sci USA 106:
    19126-19131. doi:10.1073/pnas.0906412106.

    and

    Teeling et al. (2004) Application of tetranucleotide frequencies for the
    assignment of genomic fragments. Env. Microbiol. 6(9): 938-947.
    doi:10.1111/j.1462-2920.2004.00624.x
    """
    logger = logging.getLogger(__name__)
    logger.info("Running TETRA.")

    # Stage one: per-file Z-scores, keyed by path stem
    logger.info("Calculating TETRA Z-scores for each sequence.")
    zscores_by_stem = {}
    for seqpath in infiles:
        logger.info("Calculating TETRA Z-scores for %s", seqpath)
        zscores_by_stem[seqpath.stem] = tetra.calculate_tetra_zscore(seqpath)

    # Stage two: correlate the Z-scores of every sequence with each other
    logger.info("Calculating TETRA correlation scores.")
    return tetra.calculate_correlations(zscores_by_stem)
def calculate_tetra(infiles, org_lengths):
    """Calculate TETRA for files in input directory.

    - infiles - paths to each input file
    - org_lengths - dictionary of input sequence lengths, keyed by sequence
      (not referenced by this function; retained so existing callers that
      pass it keep working)

    Returns a one-element tuple holding the result of
    tetra.calculate_correlations() for the per-file Z-scores.

    Calculates TETRA correlation scores, as described in:

    Richter M, Rossello-Mora R (2009) Shifting the genomic gold standard for
    the prokaryotic species definition. Proc Natl Acad Sci USA 106:
    19126-19131. doi:10.1073/pnas.0906412106.

    and

    Teeling et al. (2004) Application of tetranucleotide frequencies for the
    assignment of genomic fragments. Env. Microbiol. 6(9): 938-947.
    doi:10.1111/j.1462-2920.2004.00624.x
    """
    logger.info("Running TETRA.")
    # First, find Z-scores
    logger.info("Calculating TETRA Z-scores for each sequence.")
    tetra_zscores = {}
    for filename in infiles:
        # Pass the filename as a logging argument so the message is only
        # formatted when the record is actually emitted
        logger.info("Calculating TETRA Z-scores for %s", filename)
        # Key each result by the file name without directory or extension
        org = os.path.splitext(os.path.basename(filename))[0]
        tetra_zscores[org] = tetra.calculate_tetra_zscore(filename)
    # Then calculate Pearson correlation between Z-scores for each sequence
    logger.info("Calculating TETRA correlation scores.")
    tetra_correlations = tetra.calculate_correlations(tetra_zscores)
    return (tetra_correlations,)
def test_correlations(path_fna_all, dir_targets):
    """Check the TETRA correlation table against the stored target file.

    Only the first two ordered input files are used, so a single pairwise
    correlation is verified. The expected table is read from
    ``tetra/correlation.tab`` below ``dir_targets``.
    """
    first_two = ordered(path_fna_all)[:2]
    zscores = calculate_tetra_zscores(first_two)
    observed = calculate_correlations(zscores)

    target_path = dir_targets / "tetra" / "correlation.tab"
    expected = pd.read_csv(target_path, sep="\t", index_col=0)

    assert_frame_equal(observed, expected)
def test_correlations(self):
    """Check the TETRA correlation table against the stored target file.

    Only the first two ordered files from ``self.infiles`` are used, so a
    single pairwise correlation is verified. The expected table is read
    from ``correlation.tab`` in ``self.tgtdir``.
    """
    first_two = ordered(self.infiles)[:2]
    zscores = tetra.calculate_tetra_zscores(first_two)
    observed = tetra.calculate_correlations(zscores)

    target_path = self.tgtdir / "correlation.tab"
    expected = pd.read_csv(target_path, sep="\t", index_col=0)

    assert_frame_equal(observed, expected)
def test_correlations(self):
    """Check the TETRA correlation table against the stored target file.

    Only the first two ordered files from ``self.infiles`` are used, so a
    single pairwise correlation is verified. The expected table is read
    from ``correlation.tab`` in ``self.tgtdir``.
    """
    first_two = ordered(self.infiles)[:2]
    zscores = tetra.calculate_tetra_zscores(first_two)
    observed = tetra.calculate_correlations(zscores)

    target_path = os.path.join(self.tgtdir, 'correlation.tab')
    expected = pd.read_csv(target_path, sep='\t', index_col=0)

    assert_frame_equal(observed, expected)
def test_tetra_concordance(
    paths_concordance_fna, path_concordance_jspecies, tolerance_tetra, tmp_path
):
    """Check that TETRA correlations agree with the JSpecies values.

    Z-scores are computed for every path in ``paths_concordance_fna`` and
    correlated; the resulting values are compared element-wise with the
    "Tetra" entry parsed from ``path_concordance_jspecies``. Every
    difference must be within ``tolerance_tetra`` of zero.

    ``tmp_path`` is requested but not used in the body.
    """
    # Per-file Z-scores keyed by path stem
    zscores_by_stem = {
        seqpath.stem: tetra.calculate_tetra_zscore(seqpath)
        for seqpath in paths_concordance_fna
    }
    observed = tetra.calculate_correlations(zscores_by_stem).values

    # Reference values from the JSpecies output
    expected = parse_jspecies(path_concordance_jspecies)["Tetra"].values

    difference = observed - expected
    assert difference == pytest.approx(0, abs=tolerance_tetra)
def test_tetra_concordance(self):
    """Check that TETRA correlations agree with JSpecies within tolerance.

    Z-scores are computed for every file in ``self.infiles``, correlated,
    and compared element-wise with the "Tetra" entry of ``self.target``.
    The correlation table and the difference table are both written to
    ``self.outdir``. The largest absolute difference must be less than
    ``self.tolerance["TETRA"]``.
    """
    # Per-file Z-scores, keyed by file name without directory or final extension
    zscores_by_org = {}
    for seqfile in self.infiles:
        basename = os.path.split(seqfile)[-1]
        label = os.path.splitext(basename)[0]
        zscores_by_org[label] = tetra.calculate_tetra_zscore(seqfile)
    correlations = tetra.calculate_correlations(zscores_by_org)
    correlations.to_csv(os.path.join(self.outdir, "pyani_tetra.tab"), sep="\t")

    # Difference from the JSpecies values, carrying the correlation labels
    differences = pd.DataFrame(
        correlations.values - self.target["Tetra"].values,
        index=correlations.index,
        columns=correlations.columns,
    )
    differences.to_csv(
        os.path.join(self.outdir, "pyani_tetra_diff.tab"), sep="\t"
    )

    largest_deviation = differences.abs().values.max()
    assert_less(largest_deviation, self.tolerance["TETRA"])
def test_tetra_concordance(self):
    """Check that TETRA correlations agree with JSpecies within tolerance.

    Z-scores are computed for every path in ``self.infiles``, correlated,
    and compared element-wise with the "Tetra" entry of ``self.target``.
    The correlation table and the difference table are both written to
    ``self.outdir``. The largest absolute difference must be less than
    ``self.tolerance["TETRA"]``.
    """
    # Per-file Z-scores keyed by path stem
    zscores_by_stem = {
        seqpath.stem: tetra.calculate_tetra_zscore(seqpath)
        for seqpath in self.infiles
    }
    correlations = tetra.calculate_correlations(zscores_by_stem)
    correlations.to_csv(self.outdir / "pyani_tetra.tab", sep="\t")

    # Difference from the JSpecies values, carrying the correlation labels
    differences = pd.DataFrame(
        correlations.values - self.target["Tetra"].values,
        index=correlations.index,
        columns=correlations.columns,
    )
    differences.to_csv(self.outdir / "pyani_tetra_diff.tab", sep="\t")

    largest_deviation = differences.abs().values.max()
    self.assertLess(largest_deviation, self.tolerance["TETRA"])
def test_tetra_concordance():
    """Test concordance of TETRA method with JSpecies output.

    Z-scores are calculated for every FASTA file found under INDIRNAME,
    correlated, and compared element-wise with the 'Tetra' table parsed
    from JSPECIES_OUTFILE. The correlations, the JSpecies table and their
    difference are written to the output directory and printed for
    reference. The test passes when the largest absolute difference is
    less than TETRA_THRESHOLD.
    """
    # Make/check output directory
    mode = "TETRA"
    outdirname = make_outdir(mode)

    # Get dataframe of JSpecies output
    tetra_jspecies = parse_table(JSPECIES_OUTFILE, 'Tetra')

    # Identify our input files
    infiles = pyani_files.get_fasta_files(INDIRNAME)

    # Test TETRA concordance
    tetra_zscores = {}
    for filename in infiles:
        # Key each result by the file name without directory or extension
        org = os.path.splitext(os.path.basename(filename))[0]
        tetra_zscores[org] = tetra.calculate_tetra_zscore(filename)
    tetra_correlations = tetra.calculate_correlations(tetra_zscores)
    index, columns = tetra_correlations.index, tetra_correlations.columns
    # .values replaces DataFrame.as_matrix(), which newer pandas releases
    # no longer provide
    tetra_diff = pd.DataFrame(
        tetra_correlations.values - tetra_jspecies.values,
        index=index,
        columns=columns,
    )

    # Write dataframes to file, for reference
    tetra_correlations.to_csv(
        os.path.join(outdirname, 'tetra_correlations.tab'), sep='\t'
    )
    tetra_jspecies.to_csv(
        os.path.join(outdirname, 'tetra_jspecies.tab'), sep='\t'
    )
    tetra_diff.to_csv(os.path.join(outdirname, 'tetra_diff.tab'), sep='\t')
    print("TETRA concordance test output placed in %s" % outdirname)
    print(tetra_correlations, tetra_jspecies, tetra_diff)

    # We'd like the absolute difference reported to be < TETRA_THRESHOLD
    max_diff = tetra_diff.abs().values.max()
    print("Maximum difference for TETRA: %e" % max_diff)
    assert_less(max_diff, TETRA_THRESHOLD)
def test_tetra_concordance():
    """Test concordance of TETRA method with JSpecies output.

    Z-scores are calculated for every FASTA file found under INDIRNAME,
    correlated, and compared element-wise with the 'Tetra' table parsed
    from JSPECIES_OUTFILE. The correlations, the JSpecies table and their
    difference are written to the output directory and printed for
    reference. The test passes when the largest absolute difference is
    less than TETRA_THRESHOLD.
    """
    # Make/check output directory
    mode = "TETRA"
    outdirname = delete_and_remake_outdir(mode)

    # Get dataframe of JSpecies output
    tetra_jspecies = parse_table(JSPECIES_OUTFILE, 'Tetra')

    # Identify our input files
    infiles = pyani_files.get_fasta_files(INDIRNAME)

    # Test TETRA concordance
    tetra_zscores = {}
    for filename in infiles:
        # Key each result by the file name without directory or extension
        org = os.path.splitext(os.path.basename(filename))[0]
        tetra_zscores[org] = tetra.calculate_tetra_zscore(filename)
    tetra_correlations = tetra.calculate_correlations(tetra_zscores)
    index, columns = tetra_correlations.index, tetra_correlations.columns
    # .values replaces DataFrame.as_matrix(), which newer pandas releases
    # no longer provide
    tetra_diff = pd.DataFrame(
        tetra_correlations.values - tetra_jspecies.values,
        index=index,
        columns=columns,
    )

    # Write dataframes to file, for reference
    tetra_correlations.to_csv(
        os.path.join(outdirname, 'tetra_correlations.tab'), sep='\t'
    )
    tetra_jspecies.to_csv(
        os.path.join(outdirname, 'tetra_jspecies.tab'), sep='\t'
    )
    tetra_diff.to_csv(os.path.join(outdirname, 'tetra_diff.tab'), sep='\t')
    print("TETRA concordance test output placed in %s" % outdirname)
    print("TETRA correlations:\n", tetra_correlations)
    print("TETRA JSpecies:\n", tetra_jspecies)
    print("TETRA diff:\n", tetra_diff)

    # We'd like the absolute difference reported to be < TETRA_THRESHOLD
    max_diff = tetra_diff.abs().values.max()
    print("Maximum difference for TETRA: %e" % max_diff)
    assert_less(max_diff, TETRA_THRESHOLD)