Ejemplo n.º 1
0
def test_anim_concordance(
    paths_concordance_fna, path_concordance_jspecies, tolerance_anim, tmp_path
):
    """Verify that ANIm percentage identities agree with JSpecies.

    NUCmer and delta-filter commands are run over the input genomes, the
    files in ``nucmer_output`` are processed, and the resulting
    percentage-identity table (multiplied by 100) must match the JSpecies
    "ANIm" values to within ``tolerance_anim``.
    """
    nucmer_dir = tmp_path / "nucmer_output"

    # nucmer and delta-filter command generation is kept separate because
    # Travis-CI doesn't play nicely with the changes made for local SGE/OGE
    # integration. A scheduler flag passed to jobgroup generation in the
    # anim.py module might make this avoidable (TODO).
    nucmer_cmds, filter_cmds = anim.generate_nucmer_commands(
        paths_concordance_fna, tmp_path
    )
    nucmer_dir.mkdir(exist_ok=True, parents=True)
    run_mp.multiprocessing_run(nucmer_cmds)

    # The cluster won't take redirection or semicolon separation in
    # individual commands, and the wrapper written for that
    # (delta_filter_wrapper.py) can't be called under Travis-CI. Each
    # command is therefore rebuilt: the first token is dropped and the
    # last token becomes the redirection target.
    redirected_cmds = []
    for filter_cmd in filter_cmds:
        tokens = filter_cmd.split()
        redirected_cmds.append(" ".join(tokens[1:-1]) + " > " + tokens[-1])
    run_mp.multiprocessing_run(redirected_cmds)

    genome_lengths = pyani_files.get_sequence_lengths(paths_concordance_fna)

    anim_results = anim.process_deltadir(nucmer_dir, genome_lengths)
    pid_frame = anim_results.percentage_identity
    pid_frame.to_csv(tmp_path / "pyani_anim.tab", sep="\t")

    # Sort rows and columns by label and scale by 100 before comparing
    # against the JSpecies table
    observed = (pid_frame.sort_index(axis=0).sort_index(axis=1) * 100.0).values
    expected = parse_jspecies(path_concordance_jspecies)["ANIm"].values

    assert observed - expected == pytest.approx(0, abs=tolerance_anim)
Ejemplo n.º 2
0
def test_aniblastall_concordance(
    paths_concordance_fna,
    path_concordance_jspecies,
    tolerance_anib_hi,
    fragment_length,
    tmp_path,
):
    """Verify that ANIblastall percentage identities agree with JSpecies.

    The input genomes are fragmented, the ANIblastall job graph is run,
    and the processed percentage-identity table (multiplied by 100) must
    match the JSpecies "ANIb" values to within ``tolerance_anib_hi``.
    """
    # Lengths of the input genomes
    genome_lengths = pyani_files.get_sequence_lengths(paths_concordance_fna)

    # Fragment the inputs, then build and run the BLAST job graph
    fragment_paths, fragment_lengths = anib.fragment_fasta_files(
        paths_concordance_fna, tmp_path, fragment_length
    )
    cmd_builder = anib.make_blastcmd_builder("ANIblastall", tmp_path)
    job_graph = anib.make_job_graph(
        paths_concordance_fna, fragment_paths, cmd_builder
    )
    exit_status = run_mp.run_dependency_graph(job_graph)
    assert 0 == exit_status  # Jobs must run correctly

    # Process the BLAST output
    blast_results = anib.process_blast(
        tmp_path, genome_lengths, fragment_lengths, mode="ANIblastall"
    )
    pid_frame = blast_results.percentage_identity

    # Sort rows and columns by label and scale by 100 before comparing
    # against the JSpecies table
    observed = (pid_frame.sort_index(axis=0).sort_index(axis=1) * 100.0).values
    expected = parse_jspecies(path_concordance_jspecies)["ANIb"].values
    assert observed - expected == pytest.approx(0, abs=tolerance_anib_hi)
Ejemplo n.º 3
0
 def test_process_deltadir(self):
     """Process a directory of .delta files into ANIResults.

     The percentage-identity table computed from ``self.deltadir`` must
     equal ``self.df_pid`` once both are sorted by column and row labels.
     """
     seqfiles = pyani_files.get_fasta_files(self.seqdir)
     orglengths = pyani_files.get_sequence_lengths(seqfiles)
     result = anim.process_deltadir(self.deltadir, orglengths)
     # The axis is passed by keyword: positional arguments to sort_index
     # were deprecated in pandas 1.3 and are rejected from pandas 2.0.
     assert_frame_equal(
         result.percentage_identity.sort_index(axis=1).sort_index(),
         self.df_pid.sort_index(axis=1).sort_index(),
     )
Ejemplo n.º 4
0
def test_aniblastall_concordance():
    """Test concordance of ANIblastall method with JSpecies output.

    Runs the ANIblastall job graph over the files in INDIRNAME, writes the
    computed, reference and difference tables to the output directory, and
    asserts that the largest absolute difference is below ANIB_THRESHOLD.
    """
    # Make/check output directory
    mode = "ANIblastall"
    outdirname = delete_and_remake_outdir(mode)

    # Get dataframes of JSpecies output
    aniblastall_jspecies = parse_table(JSPECIES_OUTFILE, 'ANIb')

    # Identify our input files, and the total lengths of each organism seq
    infiles = pyani_files.get_fasta_files(INDIRNAME)
    org_lengths = pyani_files.get_sequence_lengths(infiles)

    # Test ANIblastall concordance:
    # Make fragments
    fragfiles, fraglengths = anib.fragment_fasta_files(infiles, outdirname,
                                                       pyani_config.FRAGSIZE)

    # Build jobgraph
    jobgraph = anib.make_job_graph(
        infiles, fragfiles,
        anib.make_blastcmd_builder("ANIblastall", outdirname))
    print("\nJobgraph:\n", jobgraph)
    print("\nJob 0:\n", jobgraph[0].script)

    # Run jobgraph with multiprocessing
    run_dependency_graph(jobgraph)
    print("Ran multiprocessing jobs")

    # Process BLAST; the pid data is the percentage_identity attribute
    aniblastall_data = anib.process_blast(outdirname,
                                          org_lengths,
                                          fraglengths,
                                          mode="ANIblastall")
    aniblastall_pid = \
        aniblastall_data.percentage_identity.sort_index(axis=0).\
        sort_index(axis=1) * 100.

    # DataFrame.as_matrix() was removed in pandas 1.0; .values gives the
    # same underlying array and exists in every pandas release.
    index, columns = aniblastall_pid.index, aniblastall_pid.columns
    diffmat = aniblastall_pid.values - aniblastall_jspecies.values
    aniblastall_diff = pd.DataFrame(diffmat, index=index, columns=columns)

    # Write dataframes to file, for reference
    aniblastall_pid.to_csv(os.path.join(outdirname, 'ANIblastall_pid.tab'),
                           sep='\t')
    aniblastall_jspecies.to_csv(os.path.join(outdirname,
                                             'ANIblastall_jspecies.tab'),
                                sep='\t')
    aniblastall_diff.to_csv(os.path.join(outdirname, 'ANIblastall_diff.tab'),
                            sep='\t')
    print("ANIblastall concordance test output placed in %s" % outdirname)
    print("ANIblastall PID:\n", aniblastall_pid)
    print("ANIblastall JSpecies:\n", aniblastall_jspecies)
    print("ANIblastall diff:\n", aniblastall_diff)

    # We'd like the absolute difference reported to be < ANIB_THRESHOLD
    max_diff = aniblastall_diff.abs().values.max()
    print("Maximum difference for ANIblastall: %e" % max_diff)
    assert_less(max_diff, ANIB_THRESHOLD)
Ejemplo n.º 5
0
 def test_process_deltadir(self):
     """Process a directory of .delta files into ANIResults.

     The percentage-identity table computed from ``self.deltadir`` must
     equal ``self.df_pid`` once both are sorted by column and row labels.
     """
     seqfiles = pyani_files.get_fasta_files(self.seqdir)
     orglengths = pyani_files.get_sequence_lengths(seqfiles)
     result = anim.process_deltadir(self.deltadir, orglengths)
     # The axis is passed by keyword: positional arguments to sort_index
     # were deprecated in pandas 1.3 and are rejected from pandas 2.0.
     assert_frame_equal(
         result.percentage_identity.sort_index(axis=1).sort_index(),
         self.df_pid.sort_index(axis=1).sort_index(),
     )
Ejemplo n.º 6
0
def test_deltadir_parsing(delta_output_dir):
    """Process test directory of .delta files into ANIResults.

    The percentage-identity table computed from
    ``delta_output_dir.deltadir`` must equal
    ``delta_output_dir.deltaresult`` once both are sorted by column and
    row labels.
    """
    seqfiles = pyani_files.get_fasta_files(delta_output_dir.seqdir)
    orglengths = pyani_files.get_sequence_lengths(seqfiles)
    result = anim.process_deltadir(delta_output_dir.deltadir, orglengths)
    # The axis is passed by keyword: positional arguments to sort_index
    # were deprecated in pandas 1.3 and are rejected from pandas 2.0.
    assert_frame_equal(
        result.percentage_identity.sort_index(axis=1).sort_index(),
        delta_output_dir.deltaresult.sort_index(axis=1).sort_index(),
    )
Ejemplo n.º 7
0
def test_anib_concordance():
    """Test concordance of ANIb method with JSpecies output.

    This may take some time. Please be patient.

    Builds BLAST databases and runs pairwise BLASTN over the files in
    INDIRNAME, writes the computed, reference and difference tables to the
    output directory, and asserts that the largest absolute difference is
    below ANIB_THRESHOLD.
    """
    # Make/check output directory
    mode = "ANIb"
    outdirname = make_outdir(mode)

    # Get dataframes of JSpecies output
    anib_jspecies = parse_table(JSPECIES_OUTFILE, 'ANIb')

    # Identify our input files, and the total lengths of each organism seq
    infiles = pyani_files.get_fasta_files(INDIRNAME)
    org_lengths = pyani_files.get_sequence_lengths(infiles)

    # Test ANIb concordance:
    # Make fragments
    fragfiles, fraglengths = anib.fragment_FASTA_files(infiles, outdirname,
                                                       pyani_config.FRAGSIZE)
    # Build databases
    cmdlist = anib.generate_blastdb_commands(infiles, outdirname,
                                             pyani_config.MAKEBLASTDB_DEFAULT,
                                             mode="ANIb")
    multiprocessing_run(cmdlist)
    # Run pairwise BLASTN
    cmdlist = anib.generate_blastn_commands(fragfiles, outdirname,
                                            pyani_config.BLASTN_DEFAULT,
                                            mode="ANIb")
    multiprocessing_run(cmdlist, verbose=False)
    # Process BLAST; the pid data is in anib_data[1]
    anib_data = anib.process_blast(outdirname, org_lengths, fraglengths,
                                   mode="ANIb")
    # DataFrame.sort() has been removed from pandas; with no column given
    # it sorted by label, which is what sort_index() does.
    anib_pid = anib_data[1].sort_index(axis=0).sort_index(axis=1) * 100.

    # DataFrame.as_matrix() was removed in pandas 1.0; .values gives the
    # same underlying array.
    index, columns = anib_pid.index, anib_pid.columns
    diffmat = anib_pid.values - anib_jspecies.values
    anib_diff = pd.DataFrame(diffmat, index=index, columns=columns)

    # Write dataframes to file, for reference
    anib_pid.to_csv(os.path.join(outdirname, 'ANIb_pid.tab'), sep='\t')
    anib_jspecies.to_csv(os.path.join(outdirname, 'ANIb_jspecies.tab'),
                         sep='\t')
    anib_diff.to_csv(os.path.join(outdirname, 'ANIb_diff.tab'), sep='\t')
    # Single-argument print(...) calls give the same output under both
    # Python 2 and Python 3.
    print("ANIb concordance test output placed in %s" % outdirname)
    print("%s %s %s" % (anib_pid, anib_jspecies, anib_diff))

    # We'd like the absolute difference reported to be < ANIB_THRESHOLD
    max_diff = anib_diff.abs().values.max()
    print("Maximum difference for ANIb: %e" % max_diff)
    assert_less(max_diff, ANIB_THRESHOLD)
Ejemplo n.º 8
0
def unified_anib(indirname, User_ID):
    """Run an ANIblastall analysis on the FASTA files found in a directory.

    Output is written to a newly created directory whose name is the plain
    string concatenation ``indirname + "<User_ID>_out/"``, and progress is
    logged to a per-user log file.

    Args:
        indirname: directory holding the input FASTA files.
        User_ID: identifier used to name the output directory and log file.

    Returns:
        Element 1 of the ``anib.process_blast`` result (presumably the
        percentage-identity table - confirm against the pyani version in
        use).

    Raises:
        OSError: if the output directory cannot be created.
        SystemExit: if an input path contains whitespace.
        ZeroDivisionError: re-raised, after logging, when the BLAST output
            cannot be processed.
    """
    # Build BLAST databases and run pairwise BLASTN
    # Fraglengths does not get reused with BLASTN
    outdirname = indirname + '{0}_out/'.format(User_ID)
    os.mkdir(outdirname)
    # os.chmod avoids building a shell command line from the path
    os.chmod(outdirname, 0o777)
    logging.basicConfig(level=logging.DEBUG, filename="/home/linproject/Workspace/LIN_log/logfile_{0}".format(User_ID),
                        filemode="a+", format="%(asctime)-15s %(levelname)-8s %(message)s")
    infiles = pyani_files.get_fasta_files(indirname)
    org_lengths = pyani_files.get_sequence_lengths(infiles)
    fragsize = pyani_config.FRAGSIZE
    for fname in os.listdir(indirname):
        # Resolve each entry relative to indirname; a bare filename would
        # otherwise be resolved against the current working directory.
        if ' ' in os.path.abspath(os.path.join(indirname, fname)):
            logging.error("File or directory '%s' contains whitespace" % fname)
            logging.error("This will cause issues with MUMmer and BLAST")
            logging.error("(exiting)")
            sys.exit(1)
    fragfiles, fraglengths = anib.fragment_FASTA_files(infiles, outdirname, fragsize)
    # Export fragment lengths as JSON, in case we re-run BLASTALL with
    # --skip_blastn
    with open(os.path.join(outdirname, 'fraglengths.json'), 'w') as outfile:
        json.dump(fraglengths, outfile)
    # Which executables are we using?
    format_exe = pyani_config.FORMATDB_DEFAULT
    blast_exe = pyani_config.BLASTALL_DEFAULT
    # Run BLAST database-building and executables from a jobgraph
    logging.info("Creating job dependency graph")
    jobgraph = anib.make_job_graph(infiles, fragfiles, outdirname, format_exe, blast_exe, 'ANIblastall')

    logging.info("Running jobs with multiprocessing")
    logging.info("Running job dependency graph")
    cumval = run_mp.run_dependency_graph(jobgraph, verbose=False,
                                         logger=logging)
    if 0 < cumval:
        logging.warning("At least one BLAST run failed. " +
                       "%s may fail." % 'ANIblastall')
    else:
        logging.info("All multiprocessing jobs complete.")

    # Process pairwise BLASTN output
    logging.info("Processing pairwise %s BLAST output." % 'ANIblastall')
    try:
        data = anib.process_blast(outdirname, org_lengths,
                                  fraglengths=fraglengths, mode='ANIblastall')
    except ZeroDivisionError:
        logging.error("One or more BLAST output files has a problem.")
        if 0 < cumval:
            logging.error("This is possibly due to BLASTN run failure, " +
                         "please investigate")
        else:
            logging.error("This is possibly due to ara BLASTN comparison " +
                         "being too distant for use.")
        logging.error(last_exception())
        # There is no result to return; propagate the real failure rather
        # than falling through to an unbound ``data``.
        raise
    return data[1]
Ejemplo n.º 9
0
def test_anib_concordance():
    """Test concordance of ANIb method with JSpecies output.

    This may take some time. Please be patient.

    Builds BLAST databases and runs pairwise BLASTN over the files in
    INDIRNAME, writes the computed, reference and difference tables to the
    output directory, and asserts that the largest absolute difference is
    below ANIB_THRESHOLD.
    """
    # Make/check output directory
    mode = "ANIb"
    outdirname = make_outdir(mode)

    # Get dataframes of JSpecies output
    anib_jspecies = parse_table(JSPECIES_OUTFILE, 'ANIb')

    # Identify our input files, and the total lengths of each organism seq
    infiles = pyani_files.get_fasta_files(INDIRNAME)
    org_lengths = pyani_files.get_sequence_lengths(infiles)

    # Test ANIb concordance:
    # Make fragments
    fragfiles, fraglengths = anib.fragment_FASTA_files(infiles, outdirname,
                                                       pyani_config.FRAGSIZE)
    # Build databases
    cmdlist = anib.generate_blastdb_commands(infiles,
                                             outdirname,
                                             pyani_config.MAKEBLASTDB_DEFAULT,
                                             mode="ANIb")
    multiprocessing_run(cmdlist)
    # Run pairwise BLASTN
    cmdlist = anib.generate_blastn_commands(fragfiles,
                                            outdirname,
                                            pyani_config.BLASTN_DEFAULT,
                                            mode="ANIb")
    multiprocessing_run(cmdlist, verbose=False)
    # Process BLAST; the pid data is in anib_data[1]
    anib_data = anib.process_blast(outdirname,
                                   org_lengths,
                                   fraglengths,
                                   mode="ANIb")
    # DataFrame.sort() has been removed from pandas; with no column given
    # it sorted by label, which is what sort_index() does.
    anib_pid = anib_data[1].sort_index(axis=0).sort_index(axis=1) * 100.

    # DataFrame.as_matrix() was removed in pandas 1.0; .values gives the
    # same underlying array.
    index, columns = anib_pid.index, anib_pid.columns
    diffmat = anib_pid.values - anib_jspecies.values
    anib_diff = pd.DataFrame(diffmat, index=index, columns=columns)

    # Write dataframes to file, for reference
    anib_pid.to_csv(os.path.join(outdirname, 'ANIb_pid.tab'), sep='\t')
    anib_jspecies.to_csv(os.path.join(outdirname, 'ANIb_jspecies.tab'),
                         sep='\t')
    anib_diff.to_csv(os.path.join(outdirname, 'ANIb_diff.tab'), sep='\t')
    # Single-argument print(...) calls give the same output under both
    # Python 2 and Python 3.
    print("ANIb concordance test output placed in %s" % outdirname)
    print("%s %s %s" % (anib_pid, anib_jspecies, anib_diff))

    # We'd like the absolute difference reported to be < ANIB_THRESHOLD
    max_diff = anib_diff.abs().values.max()
    print("Maximum difference for ANIb: %e" % max_diff)
    assert_less(max_diff, ANIB_THRESHOLD)
Ejemplo n.º 10
0
def test_parse_blastdir(anib_output_dir):
    """Parse directory of BLAST+ output.

    The percentage-identity table computed from
    ``anib_output_dir.blastdir`` in "ANIb" mode must equal
    ``anib_output_dir.blastresult`` once both are sorted by column and row
    labels.
    """
    orglengths = pyani_files.get_sequence_lengths(anib_output_dir.infiles)
    fraglengths = anib.get_fraglength_dict(anib_output_dir.fragfiles)
    result = anib.process_blast(anib_output_dir.blastdir,
                                orglengths,
                                fraglengths,
                                mode="ANIb")
    # The axis is passed by keyword: positional arguments to sort_index
    # were deprecated in pandas 1.3 and are rejected from pandas 2.0.
    assert_frame_equal(
        result.percentage_identity.sort_index(axis=1).sort_index(),
        anib_output_dir.blastresult.sort_index(axis=1).sort_index(),
    )
Ejemplo n.º 11
0
 def test_legacy_blastdir_processing(self):
     """Parse directory of legacy .blast_tab output.

     The percentage-identity table computed from ``self.aniblastalldir``
     in "ANIblastall" mode must equal ``self.aniblastalltgt`` once both
     are sorted by column and row labels.
     """
     orglengths = pyani_files.get_sequence_lengths(self.infnames)
     fraglengths = anib.get_fraglength_dict(self.fragfiles)
     # ANIblastall
     result = anib.process_blast(self.aniblastalldir,
                                 orglengths,
                                 fraglengths,
                                 mode="ANIblastall")
     # The axis is passed by keyword: positional arguments to sort_index
     # were deprecated in pandas 1.3 and are rejected from pandas 2.0.
     assert_frame_equal(
         result.percentage_identity.sort_index(axis=1).sort_index(),
         self.aniblastalltgt.sort_index(axis=1).sort_index(),
     )
Ejemplo n.º 12
0
def test_anib_concordance(
    paths_concordance_fna,
    path_concordance_jspecies,
    tolerance_anib_hi,
    tolerance_anib_lo,
    threshold_anib_lo_hi,
    fragment_length,
    tmp_path,
):
    """Verify that ANIb percentage identities agree with JSpecies.

    ANIb results are expected to be quite different, because the BLASTN
    algorithm changed substantially between BLAST and BLAST+ (megaBLAST
    is now the BLASTN default). Values below and at-or-above
    ``threshold_anib_lo_hi`` are therefore compared separately, with
    ``tolerance_anib_lo`` and ``tolerance_anib_hi`` respectively.
    """

    def split_at_threshold(frame):
        """Return (low, high) arrays; cells outside each band become 0."""
        low = frame.mask(frame >= threshold_anib_lo_hi).fillna(0).values
        high = frame.mask(frame < threshold_anib_lo_hi).fillna(0).values
        return low, high

    # Lengths of the input genomes
    genome_lengths = pyani_files.get_sequence_lengths(paths_concordance_fna)

    # Fragment the inputs, then build and run the BLAST job graph
    fragment_paths, fragment_lengths = anib.fragment_fasta_files(
        paths_concordance_fna, tmp_path, fragment_length
    )
    cmd_builder = anib.make_blastcmd_builder("ANIb", tmp_path)
    job_graph = anib.make_job_graph(
        paths_concordance_fna, fragment_paths, cmd_builder
    )
    exit_status = run_mp.run_dependency_graph(job_graph)
    assert 0 == exit_status  # Jobs must run correctly

    # Process the BLAST output
    blast_results = anib.process_blast(
        tmp_path, genome_lengths, fragment_lengths, mode="ANIb"
    )
    pid_frame = blast_results.percentage_identity

    # Observed and expected tables are each split by their own values into
    # a "low" and a "high" identity block before being compared.
    observed = pid_frame.sort_index(axis=0).sort_index(axis=1) * 100.0
    low_observed, high_observed = split_at_threshold(observed)

    expected = parse_jspecies(path_concordance_jspecies)["ANIb"]
    low_expected, high_expected = split_at_threshold(expected)

    differences = (low_observed - low_expected, high_observed - high_expected)
    tolerances = (
        pytest.approx(0, abs=tolerance_anib_lo),
        pytest.approx(0, abs=tolerance_anib_hi),
    )
    assert differences == tolerances
Ejemplo n.º 13
0
def test_anim_concordance():
    """Test concordance of ANIm method with JSpecies output.

    Runs pairwise NUCmer over the files in INDIRNAME, writes the computed,
    reference and difference tables to the output directory, and asserts
    that the largest absolute difference is below ANIM_THRESHOLD.
    """
    # Make/check output directory
    mode = "ANIm"
    outdirname = delete_and_remake_outdir(mode)
    nucmername = os.path.join(outdirname, 'nucmer_output')
    os.makedirs(nucmername, exist_ok=True)

    # Get dataframes of JSpecies output
    anim_jspecies = parse_table(JSPECIES_OUTFILE, 'ANIm')

    # Identify our input files, and the total lengths of each organism seq
    infiles = pyani_files.get_fasta_files(INDIRNAME)
    org_lengths = pyani_files.get_sequence_lengths(infiles)

    # Test ANIm concordance:
    # Run pairwise NUCmer
    cmdlist = anim.generate_nucmer_commands(infiles, outdirname,
                                            pyani_config.NUCMER_DEFAULT)
    print('\n'.join(cmdlist))
    multiprocessing_run(cmdlist)
    # Process .delta files
    results = anim.process_deltadir(nucmername, org_lengths)
    anim_pid = \
        results.percentage_identity.sort_index(axis=0).sort_index(axis=1) * 100.

    print("ANIm data\n", results)

    # DataFrame.as_matrix() was removed in pandas 1.0; .values gives the
    # same underlying array and exists in every pandas release.
    index, columns = anim_pid.index, anim_pid.columns
    diffmat = anim_pid.values - anim_jspecies.values
    anim_diff = pd.DataFrame(diffmat, index=index, columns=columns)

    # Write dataframes to file, for reference
    anim_pid.to_csv(os.path.join(outdirname, 'ANIm_pid.tab'), sep='\t')
    anim_jspecies.to_csv(os.path.join(outdirname, 'ANIm_jspecies.tab'),
                         sep='\t')
    anim_diff.to_csv(os.path.join(outdirname, 'ANIm_diff.tab'), sep='\t')
    print("ANIm concordance test output placed in %s" % outdirname)
    print("ANIm PID\n", anim_pid)
    print("ANIm JSpecies\n", anim_jspecies)
    print("ANIm diff\n", anim_diff)

    # We'd like the absolute difference reported to be < ANIM_THRESHOLD
    max_diff = anim_diff.abs().values.max()
    print("Maximum difference for ANIm: %e" % max_diff)
    assert_less(max_diff, ANIM_THRESHOLD)
Ejemplo n.º 14
0
def test_anim_concordance():
    """Test concordance of ANIm method with JSpecies output.

    Runs pairwise NUCmer over the files in INDIRNAME, writes the computed,
    reference and difference tables to the output directory, and asserts
    that the largest absolute difference is below ANIM_THRESHOLD.
    """
    # Make/check output directory
    mode = "ANIm"
    outdirname = make_outdir(mode)

    # Get dataframes of JSpecies output
    anim_jspecies = parse_table(JSPECIES_OUTFILE, 'ANIm')

    # Identify our input files, and the total lengths of each organism seq
    infiles = pyani_files.get_fasta_files(INDIRNAME)
    org_lengths = pyani_files.get_sequence_lengths(infiles)

    # Test ANIm concordance:
    # Run pairwise NUCmer
    cmdlist = anim.generate_nucmer_commands(infiles, outdirname,
                                            pyani_config.NUCMER_DEFAULT)
    multiprocessing_run(cmdlist, verbose=False)
    # Process .delta files; the pid data is element 1 of the result
    anim_data = anim.process_deltadir(outdirname, org_lengths)
    # DataFrame.sort() has been removed from pandas; with no column given
    # it sorted by label, which is what sort_index() does.
    anim_pid = anim_data[1].sort_index(axis=0).sort_index(axis=1) * 100.

    # Single-argument print(...) calls give the same output under both
    # Python 2 and Python 3.
    print(anim_data)

    # DataFrame.as_matrix() was removed in pandas 1.0; .values gives the
    # same underlying array.
    index, columns = anim_pid.index, anim_pid.columns
    diffmat = anim_pid.values - anim_jspecies.values
    anim_diff = pd.DataFrame(diffmat, index=index, columns=columns)

    # Write dataframes to file, for reference
    anim_pid.to_csv(os.path.join(outdirname, 'ANIm_pid.tab'), sep='\t')
    anim_jspecies.to_csv(os.path.join(outdirname, 'ANIm_jspecies.tab'),
                         sep='\t')
    anim_diff.to_csv(os.path.join(outdirname, 'ANIm_diff.tab'), sep='\t')
    print("ANIm concordance test output placed in %s" % outdirname)
    print("%s %s %s" % (anim_pid, anim_jspecies, anim_diff))

    # We'd like the absolute difference reported to be < ANIM_THRESHOLD
    max_diff = anim_diff.abs().values.max()
    print("Maximum difference for ANIm: %e" % max_diff)
    assert_less(max_diff, ANIM_THRESHOLD)
Ejemplo n.º 15
0
 def setUp(self):
     """Prepare paths, input data, targets and tolerances for the tests.

     Also creates the output directory and its ``nucmer_output``
     subdirectory if they do not exist yet.
     """
     base, suite = "tests", "concordance"
     self.indir = os.path.join(base, "test_input", suite)
     self.outdir = os.path.join(base, "test_output", suite)
     self.tgtdir = os.path.join(base, "test_targets", suite)
     self.deltadir = os.path.join(self.outdir, "nucmer_output")

     # Input genomes and their lengths
     self.infiles = pyani_files.get_fasta_files(self.indir)
     self.orglengths = pyani_files.get_sequence_lengths(self.infiles)

     # JSpecies reference table that results are compared against
     jspecies_table = os.path.join(self.tgtdir, "jspecies_output.tab")
     self.target = parse_jspecies(jspecies_table)

     # Allowed absolute difference for each comparison
     tight = 0.1
     self.tolerance = {
         "ANIm": tight,
         "ANIb_lo": 5,
         "ANIb_hi": tight,
         "ANIblastall": tight,
         "TETRA": tight,
     }
     self.fragsize = 1020

     for directory in (self.outdir, self.deltadir):
         os.makedirs(directory, exist_ok=True)
Ejemplo n.º 16
0
 def setUp(self):
     """Set values and parameters for tests.

     Also creates the output directory and its ``nucmer_output``
     subdirectory, including any missing parent directories.
     """
     testdir = Path("tests")
     self.indir = testdir / "test_input" / "concordance"
     self.outdir = testdir / "test_output" / "concordance"
     self.tgtdir = testdir / "test_targets" / "concordance"
     self.deltadir = self.outdir / "nucmer_output"
     self.infiles = pyani_files.get_fasta_files(self.indir)
     self.orglengths = pyani_files.get_sequence_lengths(self.infiles)
     self.target = parse_jspecies(self.tgtdir / "jspecies_output.tab")
     # Allowed absolute difference for each comparison
     self.tolerance = {
         "ANIm": 0.1,
         "ANIb_lo": 5,
         "ANIb_hi": 0.1,
         "ANIblastall": 0.1,
         "TETRA": 0.1,
     }
     self.fragsize = 1020
     # parents=True so a missing tests/test_output directory does not make
     # mkdir raise FileNotFoundError.
     self.outdir.mkdir(exist_ok=True, parents=True)
     self.deltadir.mkdir(exist_ok=True, parents=True)
Ejemplo n.º 17
0
def test_tetra_concordance():
    """Test concordance of TETRA method with JSpecies output.

    Calculates TETRA correlations for the files in INDIRNAME, writes the
    computed, reference and difference tables to the output directory, and
    asserts that the largest absolute difference is below TETRA_THRESHOLD.
    """
    # Make/check output directory
    mode = "TETRA"
    outdirname = make_outdir(mode)

    # Get dataframes of JSpecies output
    tetra_jspecies = parse_table(JSPECIES_OUTFILE, 'Tetra')

    # Identify our input files, and the total lengths of each organism seq
    infiles = pyani_files.get_fasta_files(INDIRNAME)
    org_lengths = pyani_files.get_sequence_lengths(infiles)

    # Test TETRA concordance
    tetra_zscores = {}
    for filename in infiles:
        org = os.path.splitext(os.path.split(filename)[-1])[0]
        tetra_zscores[org] = tetra.calculate_tetra_zscore(filename)
    tetra_correlations = tetra.calculate_correlations(tetra_zscores)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values gives the
    # same underlying array.
    index, columns = tetra_correlations.index, tetra_correlations.columns
    tetra_diff = pd.DataFrame(
        tetra_correlations.values - tetra_jspecies.values,
        index=index, columns=columns)

    # Write dataframes to file, for reference
    tetra_correlations.to_csv(os.path.join(outdirname,
                                           'tetra_correlations.tab'),
                              sep='\t')
    tetra_jspecies.to_csv(os.path.join(outdirname, 'tetra_jspecies.tab'),
                          sep='\t')
    tetra_diff.to_csv(os.path.join(outdirname, 'tetra_diff.tab'), sep='\t')
    # Single-argument print(...) calls give the same output under both
    # Python 2 and Python 3.
    print("TETRA concordance test output placed in %s" % outdirname)
    print("%s %s %s" % (tetra_correlations, tetra_jspecies, tetra_diff))

    # We'd like the absolute difference reported to be < TETRA_THRESHOLD
    max_diff = tetra_diff.abs().values.max()
    print("Maximum difference for TETRA: %e" % max_diff)
    assert_less(max_diff, TETRA_THRESHOLD)
Ejemplo n.º 18
0
def test_tetra_concordance():
    """Test concordance of TETRA method with JSpecies output.

    Calculates TETRA correlations for the files in INDIRNAME, writes the
    computed, reference and difference tables to the output directory, and
    asserts that the largest absolute difference is below TETRA_THRESHOLD.
    """
    # Make/check output directory
    mode = "TETRA"
    outdirname = delete_and_remake_outdir(mode)

    # Get dataframes of JSpecies output
    tetra_jspecies = parse_table(JSPECIES_OUTFILE, 'Tetra')

    # Identify our input files, and the total lengths of each organism seq
    infiles = pyani_files.get_fasta_files(INDIRNAME)
    org_lengths = pyani_files.get_sequence_lengths(infiles)

    # Test TETRA concordance
    tetra_zscores = {}
    for filename in infiles:
        org = os.path.splitext(os.path.split(filename)[-1])[0]
        tetra_zscores[org] = tetra.calculate_tetra_zscore(filename)
    tetra_correlations = tetra.calculate_correlations(tetra_zscores)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values gives the
    # same underlying array and exists in every pandas release.
    index, columns = tetra_correlations.index, tetra_correlations.columns
    tetra_diff = pd.DataFrame(
        tetra_correlations.values - tetra_jspecies.values,
        index=index, columns=columns)

    # Write dataframes to file, for reference
    tetra_correlations.to_csv(os.path.join(outdirname,
                                           'tetra_correlations.tab'),
                              sep='\t')
    tetra_jspecies.to_csv(os.path.join(outdirname, 'tetra_jspecies.tab'),
                          sep='\t')
    tetra_diff.to_csv(os.path.join(outdirname, 'tetra_diff.tab'), sep='\t')
    print("TETRA concordance test output placed in %s" % outdirname)
    print("TETRA correlations:\n", tetra_correlations)
    print("TETRA JSpecies:\n", tetra_jspecies)
    print("TETRA diff:\n", tetra_diff)

    # We'd like the absolute difference reported to be < TETRA_THRESHOLD
    max_diff = tetra_diff.abs().values.max()
    print("Maximum difference for TETRA: %e" % max_diff)
    assert_less(max_diff, TETRA_THRESHOLD)
            sys.exit(1)
        logger.info("Using scheduler method: %s", args.scheduler)

        # Get input files
        logger.info("Identifying FASTA files in %s", args.indirname)
        infiles = pyani_files.get_fasta_files(args.indirname)
        logger.info("Input files:\n\t%s", '\n\t'.join(infiles))

        # Are we subsampling? If so, make the selection here
        if args.subsample:
            infiles = subsample_input(infiles)
            logger.info("Sampled input files:\n\t%s", '\n\t'.join(infiles))

        # Get lengths of input sequences
        logger.info("Processing input sequence lengths")
        org_lengths = pyani_files.get_sequence_lengths(infiles)
        logger.info("Sequence lengths:\n" + os.linesep.join(
            ["\t%s: %d" % (k, v) for k, v in list(org_lengths.items())]))

        # Run appropriate method on the contents of the input directory,
        # and write out corresponding results.
        logger.info("Carrying out %s analysis", args.method)
        if args.method == "TETRA":
            results = methods[args.method][0](infiles)
        else:
            results = methods[args.method][0](infiles, org_lengths)
        write(results)

    # Do we want graphical output?
    if args.graphics or args.rerender:
        logger.info("Rendering output graphics")
def run_main(argsin: Optional[Namespace] = None) -> int:
    """Run the main process of the average_nucleotide_identity.py script.

    Unless ``--rerender`` was given, the ANI comparison selected by
    ``args.method`` is run on the FASTA files in ``args.indirname`` and the
    results are written out. Graphics are then drawn when requested.

    :param argsin:  Namespace, command-line arguments

    Returns 0 once all steps have completed.
    """
    start_time = time.time()

    # Command-line processing and logger set-up
    args = process_arguments(argsin)
    logger = logging.getLogger(__name__)
    config_logger(args)

    # Validate arguments, pick the method function/config, prepare outputs
    test_class_label_paths(args, logger)
    test_scheduler(args, logger)
    method_function, method_config = get_method(args)
    make_outdirs(args)

    if not args.rerender:
        # Run ANI comparisons
        logger.info("Identifying FASTA files in %s", args.indirname)
        infiles = pyani_files.get_fasta_files(args.indirname)
        logger.info("Input files:\n\t%s",
                    "\n\t".join(str(path) for path in infiles))

        # Optionally restrict the analysis to a subsample of the inputs
        if args.subsample:
            infiles = subsample_input(args, logger, infiles)
            logger.info("Sampled input files:\n\t%s",
                        "\n\t".join(str(path) for path in infiles))

        # Lengths of the input sequences
        logger.info("Processing input sequence lengths")
        org_lengths = pyani_files.get_sequence_lengths(infiles)
        seqlens = os.linesep.join(
            "\t%s: %d" % item for item in org_lengths.items())
        logger.info("Sequence lengths:\n%s", seqlens)

        # The TETRA method function takes only the input files; every
        # other method also receives args and the sequence lengths.
        logger.info("Carrying out %s analysis", args.method)
        if args.method == "TETRA":
            method_args = (infiles,)
        else:
            method_args = (args, infiles, org_lengths)
        results = method_function(*method_args)
        write(args, results)
    else:
        # Calculations are skipped; only graphics are regenerated
        logger.warning(
            "--rerender option used. Producing graphics with no new recalculations"
        )

    # Graphical output, one pass per requested format
    if args.graphics or args.rerender:
        logger.info("Rendering output graphics")
        logger.info("Formats requested: %s", args.gformat)
        for gfmt in args.gformat.split(","):
            logger.info("Graphics format: %s", gfmt)
            logger.info("Graphics method: %s", args.gmethod)
            draw(args, method_config, gfmt)

    # Close any open matplotlib figures
    plt.close("all")

    # Report completion and elapsed time
    logger.info("Done: %s.", time.asctime())
    logger.info("Time taken: %.2fs", (time.time() - start_time))

    return 0
Ejemplo n.º 21
0
    # Have we got a valid scheduler choice?
    schedulers = ["multiprocessing", "SGE"]
    if args.scheduler not in schedulers:
        logger.error("scheduler %s not recognised (exiting)" % args.scheduler)
        logger.error("Valid schedulers are: %s" % '; '.join(schedulers))
        sys.exit(1)
    logger.info("Using scheduler method: %s" % args.scheduler)

    # Get input files
    logger.info("Identifying FASTA files in %s" % args.indirname)
    infiles = pyani_files.get_fasta_files(args.indirname)
    logger.info("Input files:\n\t%s" % '\n\t'.join(infiles))

    # Get lengths of input sequences
    logger.info("Processing input sequence lengths")
    org_lengths = pyani_files.get_sequence_lengths(infiles)
    logger.info("Sequence lengths:\n" +
                os.linesep.join(["\t%s: %d" % (k, v) for
                                 k, v in org_lengths.items()]))

    # Run appropriate method on the contents of the input directory,
    # and write out corresponding results.
    logger.info("Carrying out %s analysis" % args.method)
    results = methods[args.method][0](infiles, org_lengths)
    write(results, methods[args.method][1])

    # Do we want graphical output?
    if args.graphics:
        logger.info("Rendering output graphics")
        logger.info("Formats requested: %s" % args.gformat)
        for gfmt in args.gformat.split(','):