def calculate_tetra(infiles):
    """Calculate TETRA for files in input directory.

    - infiles - paths to each input file

    Returns a one-element tuple holding the result of
    tetra.calculate_correlations() applied to the per-file Z-scores.
    Z-scores are keyed by each file's basename with its final extension
    removed.

    Calculates TETRA correlation scores, as described in:

    Richter M, Rossello-Mora R (2009) Shifting the genomic gold standard for
    the prokaryotic species definition. Proc Natl Acad Sci USA 106:
    19126-19131. doi:10.1073/pnas.0906412106.

    and

    Teeling et al. (2004) Application of tetranucleotide frequencies for the
    assignment of genomic fragments. Env. Microbiol. 6(9): 938-947.
    doi:10.1111/j.1462-2920.2004.00624.x
    """
    logger.info("Running TETRA.")
    # First, find Z-scores
    logger.info("Calculating TETRA Z-scores for each sequence.")
    tetra_zscores = {}
    for filename in infiles:
        logger.info("Calculating TETRA Z-scores for %s", filename)
        # Key each result by the file's basename, minus its extension
        org = os.path.splitext(os.path.basename(filename))[0]
        tetra_zscores[org] = tetra.calculate_tetra_zscore(filename)
    # Then calculate Pearson correlation between Z-scores for each sequence
    logger.info("Calculating TETRA correlation scores.")
    tetra_correlations = tetra.calculate_correlations(tetra_zscores)
    # Wrapped in a 1-tuple, as before, so the return shape is unchanged
    return (tetra_correlations, )
def calculate_tetra(infiles: List[Path]) -> pd.DataFrame:
    """Calculate TETRA for files in input directory.

    :param infiles:  list, paths to each input file
    :returns:  the result of tetra.calculate_correlations() applied to the
        per-file Z-scores, which are keyed by each path's stem

    Progress is reported through the module's logger (obtained with
    logging.getLogger(__name__)); no logger argument is taken.

    Calculates TETRA correlation scores, as described in:

    Richter M, Rossello-Mora R (2009) Shifting the genomic gold standard for
    the prokaryotic species definition. Proc Natl Acad Sci USA 106:
    19126-19131. doi:10.1073/pnas.0906412106.

    and

    Teeling et al. (2004) Application of tetranucleotide frequencies for the
    assignment of genomic fragments. Env. Microbiol. 6(9): 938-947.
    doi:10.1111/j.1462-2920.2004.00624.x
    """
    logger = logging.getLogger(__name__)

    logger.info("Running TETRA.")
    # First, find Z-scores
    logger.info("Calculating TETRA Z-scores for each sequence.")
    tetra_zscores = {}
    for filename in infiles:
        logger.info("Calculating TETRA Z-scores for %s", filename)
        # NOTE: inputs sharing a stem would overwrite one another here
        tetra_zscores[filename.stem] = tetra.calculate_tetra_zscore(filename)
    # Then calculate Pearson correlation between Z-scores for each sequence
    logger.info("Calculating TETRA correlation scores.")
    tetra_correlations = tetra.calculate_correlations(tetra_zscores)
    return tetra_correlations
def calculate_tetra(infiles, org_lengths):
    """Calculate TETRA for files in input directory.

    - infiles - paths to each input file
    - org_lengths - dictionary of input sequence lengths, keyed by sequence
      (not read by this function)

    Returns a one-element tuple holding the result of
    tetra.calculate_correlations() applied to the per-file Z-scores.
    Z-scores are keyed by each file's basename with its final extension
    removed.

    Calculates TETRA correlation scores, as described in:

    Richter M, Rossello-Mora R (2009) Shifting the genomic gold standard for the
    prokaryotic species definition. Proc Natl Acad Sci USA 106: 19126-19131.
    doi:10.1073/pnas.0906412106.

    and

    Teeling et al. (2004) Application of tetranucleotide frequencies for the
    assignment of genomic fragments. Env. Microbiol. 6(9): 938-947.
    doi:10.1111/j.1462-2920.2004.00624.x
    """
    logger.info("Running TETRA.")
    # First, find Z-scores
    logger.info("Calculating TETRA Z-scores for each sequence.")
    tetra_zscores = {}
    for filename in infiles:
        # Pass the filename as a logging argument so interpolation is
        # deferred until the record is actually emitted
        logger.info("Calculating TETRA Z-scores for %s", filename)
        # Key each result by the file's basename, minus its extension
        org = os.path.splitext(os.path.basename(filename))[0]
        tetra_zscores[org] = tetra.calculate_tetra_zscore(filename)
    # Then calculate Pearson correlation between Z-scores for each sequence
    logger.info("Calculating TETRA correlation scores.")
    tetra_correlations = tetra.calculate_correlations(tetra_zscores)
    # Wrapped in a 1-tuple, as before, so the return shape is unchanged
    return (tetra_correlations, )
Beispiel #4
0
def test_correlations(path_fna_all, dir_targets):
    """Check the TETRA correlation of two inputs against the stored target.

    The first two entries of the ordered input list are used, so only a
    single pairwise correlation is computed and compared with the table
    read from ``dir_targets / "tetra" / "correlation.tab"``.
    """
    # Two inputs are enough to exercise one correlation
    first_pair = ordered(path_fna_all)[:2]
    zscores = calculate_tetra_zscores(first_pair)
    observed = calculate_correlations(zscores)

    expected_path = dir_targets / "tetra" / "correlation.tab"
    expected = pd.read_csv(expected_path, sep="\t", index_col=0)

    assert_frame_equal(observed, expected)
Beispiel #5
0
 def test_correlations(self):
     """Check the TETRA correlation of two inputs against the stored target.

     Uses the first two entries of the ordered ``self.infiles`` and compares
     the result with ``correlation.tab`` under ``self.tgtdir``.
     """
     # Two inputs are enough to exercise one correlation
     first_pair = ordered(self.infiles)[:2]
     zscores = tetra.calculate_tetra_zscores(first_pair)
     observed = tetra.calculate_correlations(zscores)

     expected_path = self.tgtdir / "correlation.tab"
     expected = pd.read_csv(expected_path, sep="\t", index_col=0)

     assert_frame_equal(observed, expected)
Beispiel #6
0
 def test_correlations(self):
     """Check the TETRA correlation of two inputs against the stored target.

     Uses the first two entries of the ordered ``self.infiles`` and compares
     the result with ``correlation.tab`` inside ``self.tgtdir``.
     """
     # Two inputs are enough to exercise one correlation
     first_pair = ordered(self.infiles)[:2]
     zscores = tetra.calculate_tetra_zscores(first_pair)
     observed = tetra.calculate_correlations(zscores)

     expected_path = os.path.join(self.tgtdir, 'correlation.tab')
     expected = pd.read_csv(expected_path, sep='\t', index_col=0)

     assert_frame_equal(observed, expected)
Beispiel #7
0
def test_tetra_concordance(
    paths_concordance_fna, path_concordance_jspecies, tolerance_tetra, tmp_path
):
    """Verify that TETRA correlations agree with the JSpecies "Tetra" table.

    Every element of the difference between the computed correlation values
    and the parsed JSpecies values must be within ``tolerance_tetra`` of 0.
    """
    # Z-scores for every input, keyed by the path's stem
    zscores = {
        fasta.stem: tetra.calculate_tetra_zscore(fasta)
        for fasta in paths_concordance_fna
    }
    observed = tetra.calculate_correlations(zscores).values

    # Reference values parsed from the JSpecies output
    expected = parse_jspecies(path_concordance_jspecies)["Tetra"].values

    assert observed - expected == pytest.approx(0, abs=tolerance_tetra)
Beispiel #8
0
    def test_tetra_concordance(self):
        """Verify that TETRA correlations agree with the JSpecies "Tetra" table.

        The correlations and their difference from the target are written to
        ``self.outdir``; the largest absolute difference must be less than
        ``self.tolerance["TETRA"]``.
        """
        # Z-scores for every input, keyed by basename without extension
        zscores = {}
        for path in self.infiles:
            stem, _ext = os.path.splitext(os.path.basename(path))
            zscores[stem] = tetra.calculate_tetra_zscore(path)

        correlations = tetra.calculate_correlations(zscores)
        correlations_path = os.path.join(self.outdir, "pyani_tetra.tab")
        correlations.to_csv(correlations_path, sep="\t")

        # Element-wise difference from the JSpecies reference, kept on disk
        delta = correlations.values - self.target["Tetra"].values
        tetra_diff = pd.DataFrame(
            delta, index=correlations.index, columns=correlations.columns
        )
        diff_path = os.path.join(self.outdir, "pyani_tetra_diff.tab")
        tetra_diff.to_csv(diff_path, sep="\t")

        largest = tetra_diff.abs().values.max()
        assert_less(largest, self.tolerance["TETRA"])
Beispiel #9
0
    def test_tetra_concordance(self):
        """Verify that TETRA correlations agree with the JSpecies "Tetra" table.

        The correlations and their difference from the target are written to
        ``self.outdir``; the largest absolute difference must be less than
        ``self.tolerance["TETRA"]``.
        """
        # Z-scores for every input, keyed by the path's stem
        zscores = {
            fasta.stem: tetra.calculate_tetra_zscore(fasta)
            for fasta in self.infiles
        }
        correlations = tetra.calculate_correlations(zscores)
        correlations.to_csv(self.outdir / "pyani_tetra.tab", sep="\t")

        # Element-wise difference from the JSpecies reference, kept on disk
        delta = correlations.values - self.target["Tetra"].values
        tetra_diff = pd.DataFrame(
            delta, index=correlations.index, columns=correlations.columns
        )
        tetra_diff.to_csv(self.outdir / "pyani_tetra_diff.tab", sep="\t")

        largest = tetra_diff.abs().values.max()
        self.assertLess(largest, self.tolerance["TETRA"])
Beispiel #10
0
def test_tetra_concordance():
    """Test concordance of TETRA method with JSpecies output.

    Computes TETRA correlations for the files returned by
    pyani_files.get_fasta_files(INDIRNAME), subtracts the 'Tetra' table
    parsed from JSPECIES_OUTFILE, writes all three tables to the output
    directory for reference, and asserts that the largest absolute
    difference is less than TETRA_THRESHOLD.
    """
    # Make/check output directory
    mode = "TETRA"
    outdirname = make_outdir(mode)

    # Get dataframes of JSpecies output
    tetra_jspecies = parse_table(JSPECIES_OUTFILE, 'Tetra')

    # Identify our input files, and the total lengths of each organism seq
    infiles = pyani_files.get_fasta_files(INDIRNAME)
    # NOTE(review): org_lengths is never read below; the call is kept in
    # case it is relied on to validate the inputs - confirm and remove.
    org_lengths = pyani_files.get_sequence_lengths(infiles)

    # Test TETRA concordance
    tetra_zscores = {}
    for filename in infiles:
        # Key each result by the file's basename, minus its extension
        org = os.path.splitext(os.path.basename(filename))[0]
        tetra_zscores[org] = tetra.calculate_tetra_zscore(filename)
    tetra_correlations = tetra.calculate_correlations(tetra_zscores)
    index, columns = tetra_correlations.index, tetra_correlations.columns
    # .values replaces DataFrame.as_matrix(), which newer pandas lacks
    tetra_diff = pd.DataFrame(tetra_correlations.values -
                              tetra_jspecies.values,
                              index=index, columns=columns)

    # Write dataframes to file, for reference
    tetra_correlations.to_csv(os.path.join(outdirname,
                                           'tetra_correlations.tab'),
                              sep='\t')
    tetra_jspecies.to_csv(os.path.join(outdirname,
                                       'tetra_jspecies.tab'),
                          sep='\t')
    tetra_diff.to_csv(os.path.join(outdirname,
                                   'tetra_diff.tab'),
                      sep='\t')
    # Single-argument print(...) calls emit the same text whether print is
    # a statement (Python 2) or a function (Python 3)
    print("TETRA concordance test output placed in %s" % outdirname)
    print("%s %s %s" % (tetra_correlations, tetra_jspecies, tetra_diff))

    # We'd like the absolute difference reported to be < TETRA_THRESHOLD
    max_diff = tetra_diff.abs().values.max()
    print("Maximum difference for TETRA: %e" % max_diff)
    assert_less(max_diff, TETRA_THRESHOLD)
Beispiel #11
0
def test_tetra_concordance():
    """Test concordance of TETRA method with JSpecies output.

    Computes TETRA correlations for the files returned by
    pyani_files.get_fasta_files(INDIRNAME), subtracts the 'Tetra' table
    parsed from JSPECIES_OUTFILE, writes all three tables to a freshly
    remade output directory for reference, and asserts that the largest
    absolute difference is less than TETRA_THRESHOLD.
    """
    # Make/check output directory
    mode = "TETRA"
    outdirname = delete_and_remake_outdir(mode)

    # Get dataframes of JSpecies output
    tetra_jspecies = parse_table(JSPECIES_OUTFILE, 'Tetra')

    # Identify our input files, and the total lengths of each organism seq
    infiles = pyani_files.get_fasta_files(INDIRNAME)
    # NOTE(review): org_lengths is never read below; the call is kept in
    # case it is relied on to validate the inputs - confirm and remove.
    org_lengths = pyani_files.get_sequence_lengths(infiles)

    # Test TETRA concordance
    tetra_zscores = {}
    for filename in infiles:
        # Key each result by the file's basename, minus its extension
        org = os.path.splitext(os.path.basename(filename))[0]
        tetra_zscores[org] = tetra.calculate_tetra_zscore(filename)
    tetra_correlations = tetra.calculate_correlations(tetra_zscores)
    index, columns = tetra_correlations.index, tetra_correlations.columns
    # .values replaces DataFrame.as_matrix(), which newer pandas lacks
    tetra_diff = pd.DataFrame(tetra_correlations.values -
                              tetra_jspecies.values,
                              index=index, columns=columns)

    # Write dataframes to file, for reference
    tetra_correlations.to_csv(os.path.join(outdirname,
                                           'tetra_correlations.tab'),
                              sep='\t')
    tetra_jspecies.to_csv(os.path.join(outdirname, 'tetra_jspecies.tab'),
                          sep='\t')
    tetra_diff.to_csv(os.path.join(outdirname, 'tetra_diff.tab'), sep='\t')
    print("TETRA concordance test output placed in %s" % outdirname)
    print("TETRA correlations:\n", tetra_correlations)
    print("TETRA JSpecies:\n", tetra_jspecies)
    print("TETRA diff:\n", tetra_diff)

    # We'd like the absolute difference reported to be < TETRA_THRESHOLD
    max_diff = tetra_diff.abs().values.max()
    print("Maximum difference for TETRA: %e" % max_diff)
    assert_less(max_diff, TETRA_THRESHOLD)