Example #1
0
def test_anim_concordance(
    paths_concordance_fna, path_concordance_jspecies, tolerance_anim, tmp_path
):
    """Verify pyani ANIm percentage identities agree with the JSpecies table.

    NUCmer and delta-filter are run under ``tmp_path``, the resulting
    percentage identity matrix is written to ``pyani_anim.tab``, and every
    element (scaled to a percentage) must lie within ``tolerance_anim`` of
    the ``"ANIm"`` table parsed from ``path_concordance_jspecies``.
    """
    # The NUCmer and delta-filter command lists are produced together but
    # executed as two separate batches: Travis-CI does not cope with the
    # changes made for local SGE/OGE integration. A scheduler flag passed
    # to jobgroup generation in anim.py might make this unnecessary (TODO).
    nucmer_cmds, filter_cmds = anim.generate_nucmer_commands(
        paths_concordance_fna, tmp_path
    )
    nucmer_dir = tmp_path / "nucmer_output"
    nucmer_dir.mkdir(exist_ok=True, parents=True)
    run_mp.multiprocessing_run(nucmer_cmds)

    # The cluster rejects redirection/semicolons inside a single command,
    # and the wrapper written for that (delta_filter_wrapper.py) cannot be
    # called under Travis-CI. So each generated command is taken apart:
    # the first token is dropped and the final token becomes the target
    # of a shell redirection.
    redirected_cmds = []
    for filter_cmd in filter_cmds:
        tokens = filter_cmd.split()
        redirected_cmds.append(" ".join(tokens[1:-1]) + " > " + tokens[-1])
    run_mp.multiprocessing_run(redirected_cmds)

    genome_lengths = pyani_files.get_sequence_lengths(paths_concordance_fna)

    anim_results = anim.process_deltadir(nucmer_dir, genome_lengths)
    identity = anim_results.percentage_identity
    identity.to_csv(tmp_path / "pyani_anim.tab", sep="\t")

    # Sort both axes so rows/columns line up with the JSpecies table, and
    # rescale to percentages before comparing
    sorted_identity = identity.sort_index(axis=0).sort_index(axis=1)
    observed = (sorted_identity * 100.0).values
    expected = parse_jspecies(path_concordance_jspecies)["ANIm"].values

    assert observed - expected == pytest.approx(0, abs=tolerance_anim)
Example #2
0
def test_multiprocessing_run():
    """Run five small shell loops through multiprocessing_run().

    Command number ``v`` (0 to 4) loops over the integers below ``v`` and
    echoes one line per value. The runner's return value is not checked.
    """
    template = 'for i in %s; do echo "Thread %d: value ${i}"; done'
    commands = []
    for count in range(5):
        values = ' '.join(str(num) for num in range(count))
        commands.append(template % (values, count))
    run_multiprocessing.multiprocessing_run(commands)
Example #3
0
    def test_anim_concordance(self):
        """ANIm percentage identities agree with JSpecies within tolerance."""
        # The NUCmer and delta-filter command lists are produced together
        # but executed as two separate batches: Travis-CI does not cope
        # with the changes made for local SGE/OGE integration. A scheduler
        # flag passed to jobgroup generation in anim.py might make this
        # unnecessary (TODO).
        nucmer_cmds, filter_cmds = anim.generate_nucmer_commands(
            self.infiles, self.outdir
        )
        run_mp.multiprocessing_run(nucmer_cmds)

        # The cluster rejects redirection/semicolons inside one command,
        # and the wrapper written for that (delta_filter_wrapper.py) cannot
        # be called under Travis-CI. So each generated command is taken
        # apart: the first token is dropped and the final token becomes
        # the target of a shell redirection.
        redirected_cmds = []
        for filter_cmd in filter_cmds:
            tokens = filter_cmd.split()
            redirected_cmds.append(" ".join(tokens[1:-1]) + " > " + tokens[-1])
        run_mp.multiprocessing_run(redirected_cmds)

        anim_results = anim.process_deltadir(self.deltadir, self.orglengths)
        identity = anim_results.percentage_identity
        identity.to_csv(os.path.join(self.outdir, "pyani_anim.tab"), sep="\t")

        # Sort both axes and rescale to percentages, then record the
        # element-wise difference from the JSpecies "ANIm" table
        scaled = identity.sort_index(axis=0).sort_index(axis=1) * 100.0
        differences = pd.DataFrame(
            scaled.values - self.target["ANIm"].values,
            index=scaled.index,
            columns=scaled.columns,
        )
        differences.to_csv(
            os.path.join(self.outdir, "pyani_anim_diff.tab"), sep="\t"
        )
        largest_gap = differences.abs().values.max()
        assert_less(largest_gap, self.tolerance["ANIm"])
Example #4
0
    def test_anim_concordance(self):
        """Verify ANIm percentage identities agree with JSpecies output."""
        # The NUCmer and delta-filter command lists are produced together
        # but executed as two separate batches: Travis-CI does not cope
        # with the changes made for local SGE/OGE integration. A scheduler
        # flag passed to jobgroup generation in anim.py might make this
        # unnecessary (TODO).
        nucmer_cmds, filter_cmds = anim.generate_nucmer_commands(
            self.infiles, self.outdir
        )
        run_mp.multiprocessing_run(nucmer_cmds)

        # The cluster rejects redirection/semicolons inside one command,
        # and the wrapper written for that (delta_filter_wrapper.py) cannot
        # be called under Travis-CI. So each generated command is taken
        # apart: the first token is dropped and the final token becomes
        # the target of a shell redirection.
        redirected_cmds = []
        for filter_cmd in filter_cmds:
            tokens = filter_cmd.split()
            redirected_cmds.append(" ".join(tokens[1:-1]) + " > " + tokens[-1])
        run_mp.multiprocessing_run(redirected_cmds)

        anim_results = anim.process_deltadir(self.deltadir, self.orglengths)
        identity = anim_results.percentage_identity
        identity.to_csv(self.outdir / "pyani_anim.tab", sep="\t")

        # Sort both axes and rescale to percentages, then record the
        # element-wise difference from the JSpecies "ANIm" table
        scaled = identity.sort_index(axis=0).sort_index(axis=1) * 100.0
        differences = pd.DataFrame(
            scaled.values - self.target["ANIm"].values,
            index=scaled.index,
            columns=scaled.columns,
        )
        differences.to_csv(self.outdir / "pyani_anim_diff.tab", sep="\t")
        largest_gap = differences.abs().values.max()
        self.assertLess(largest_gap, self.tolerance["ANIm"])
Example #5
0
def test_multiprocessing_run():
    """Exercise multiprocessing_run() with five generated shell loops.

    For each ``v`` in 0..4 a ``for`` loop over the integers below ``v`` is
    built; each iteration echoes a line. Nothing is asserted about the
    value returned by the runner.
    """
    loop_template = 'for i in %s; do echo "Thread %d: value ${i}"; done'
    shell_loops = []
    for thread_id in range(5):
        numbers = ' '.join(map(str, range(thread_id)))
        shell_loops.append(loop_template % (numbers, thread_id))
    run_multiprocessing.multiprocessing_run(shell_loops)
Example #6
0
def test_anib_concordance():
    """Test concordance of ANIb method with JSpecies output.

    Fragments the input FASTA files, builds BLAST databases, runs the
    pairwise BLASTN jobs and compares the resulting percentage identity
    table against the JSpecies "ANIb" table. The test passes when the
    largest absolute difference is below ANIB_THRESHOLD.

    This may take some time. Please be patient.
    """
    # Make/check output directory
    mode = "ANIb"
    outdirname = make_outdir(mode)

    # Get dataframes of JSpecies output
    anib_jspecies = parse_table(JSPECIES_OUTFILE, 'ANIb')

    # Identify our input files, and the total lengths of each organism seq
    infiles = pyani_files.get_fasta_files(INDIRNAME)
    org_lengths = pyani_files.get_sequence_lengths(infiles)

    # Test ANIb concordance:
    # Make fragments
    fragfiles, fraglengths = anib.fragment_FASTA_files(infiles, outdirname,
                                                       pyani_config.FRAGSIZE)
    # Build databases
    cmdlist = anib.generate_blastdb_commands(infiles, outdirname,
                                             pyani_config.MAKEBLASTDB_DEFAULT,
                                             mode=mode)
    multiprocessing_run(cmdlist)
    # Run pairwise BLASTN
    cmdlist = anib.generate_blastn_commands(fragfiles, outdirname,
                                            pyani_config.BLASTN_DEFAULT,
                                            mode=mode)
    multiprocessing_run(cmdlist, verbose=False)
    # Process BLAST; the pid data is in anib_data[1]
    anib_data = anib.process_blast(outdirname, org_lengths, fraglengths,
                                   mode=mode)
    # Sort rows and columns by label (DataFrame.sort() no longer exists in
    # current pandas), then rescale to percentages
    anib_pid = anib_data[1].sort_index(axis=0).sort_index(axis=1) * 100.0

    # Element-wise difference; .values replaces the removed as_matrix()
    index, columns = anib_pid.index, anib_pid.columns
    diffmat = anib_pid.values - anib_jspecies.values
    anib_diff = pd.DataFrame(diffmat, index=index, columns=columns)

    # Write dataframes to file, for reference
    anib_pid.to_csv(os.path.join(outdirname, 'ANIb_pid.tab'), sep='\t')
    anib_jspecies.to_csv(os.path.join(outdirname, 'ANIb_jspecies.tab'),
                         sep='\t')
    anib_diff.to_csv(os.path.join(outdirname, 'ANIb_diff.tab'), sep='\t')
    # Single-argument print() calls give the same output on Python 2 and 3
    print("ANIb concordance test output placed in %s" % outdirname)
    print("%s %s %s" % (anib_pid, anib_jspecies, anib_diff))

    # We'd like the absolute difference reported to be < ANIB_THRESHOLD
    max_diff = anib_diff.abs().values.max()
    print("Maximum difference for ANIb: %e" % max_diff)
    assert_less(max_diff, ANIB_THRESHOLD)
Example #7
0
def test_anib_concordance():
    """Test concordance of ANIb method with JSpecies output.

    Fragments the input FASTA files, builds BLAST databases, runs the
    pairwise BLASTN jobs and compares the resulting percentage identity
    table against the JSpecies "ANIb" table. The test passes when the
    largest absolute difference is below ANIB_THRESHOLD.

    This may take some time. Please be patient.
    """
    # Make/check output directory
    mode = "ANIb"
    outdirname = make_outdir(mode)

    # Get dataframes of JSpecies output
    anib_jspecies = parse_table(JSPECIES_OUTFILE, 'ANIb')

    # Identify our input files, and the total lengths of each organism seq
    infiles = pyani_files.get_fasta_files(INDIRNAME)
    org_lengths = pyani_files.get_sequence_lengths(infiles)

    # Test ANIb concordance:
    # Make fragments
    fragfiles, fraglengths = anib.fragment_FASTA_files(infiles, outdirname,
                                                       pyani_config.FRAGSIZE)
    # Build databases
    cmdlist = anib.generate_blastdb_commands(infiles,
                                             outdirname,
                                             pyani_config.MAKEBLASTDB_DEFAULT,
                                             mode=mode)
    multiprocessing_run(cmdlist)
    # Run pairwise BLASTN
    cmdlist = anib.generate_blastn_commands(fragfiles,
                                            outdirname,
                                            pyani_config.BLASTN_DEFAULT,
                                            mode=mode)
    multiprocessing_run(cmdlist, verbose=False)
    # Process BLAST; the pid data is in anib_data[1]
    anib_data = anib.process_blast(outdirname,
                                   org_lengths,
                                   fraglengths,
                                   mode=mode)
    # Sort rows and columns by label (DataFrame.sort() no longer exists in
    # current pandas), then rescale to percentages
    anib_pid = anib_data[1].sort_index(axis=0).sort_index(axis=1) * 100.0

    # Element-wise difference; .values replaces the removed as_matrix()
    index, columns = anib_pid.index, anib_pid.columns
    diffmat = anib_pid.values - anib_jspecies.values
    anib_diff = pd.DataFrame(diffmat, index=index, columns=columns)

    # Write dataframes to file, for reference
    anib_pid.to_csv(os.path.join(outdirname, 'ANIb_pid.tab'), sep='\t')
    anib_jspecies.to_csv(os.path.join(outdirname, 'ANIb_jspecies.tab'),
                         sep='\t')
    anib_diff.to_csv(os.path.join(outdirname, 'ANIb_diff.tab'), sep='\t')
    # Single-argument print() calls give the same output on Python 2 and 3
    print("ANIb concordance test output placed in %s" % outdirname)
    print("%s %s %s" % (anib_pid, anib_jspecies, anib_diff))

    # We'd like the absolute difference reported to be < ANIB_THRESHOLD
    max_diff = anib_diff.abs().values.max()
    print("Maximum difference for ANIb: %e" % max_diff)
    assert_less(max_diff, ANIB_THRESHOLD)
Example #8
0
def test_anim_concordance():
    """Test concordance of ANIm method with JSpecies output.

    Runs the generated NUCmer commands, processes the .delta output found
    in the ``nucmer_output`` subdirectory and compares the percentage
    identity table against the JSpecies "ANIm" table. The test passes when
    the largest absolute difference is below ANIM_THRESHOLD.
    """
    # Make/check output directory
    mode = "ANIm"
    outdirname = delete_and_remake_outdir(mode)
    nucmername = os.path.join(outdirname, 'nucmer_output')
    os.makedirs(nucmername, exist_ok=True)

    # Get dataframes of JSpecies output
    anim_jspecies = parse_table(JSPECIES_OUTFILE, 'ANIm')

    # Identify our input files, and the total lengths of each organism seq
    infiles = pyani_files.get_fasta_files(INDIRNAME)
    org_lengths = pyani_files.get_sequence_lengths(infiles)

    # Test ANIm concordance:
    # Run pairwise NUCmer
    cmdlist = anim.generate_nucmer_commands(infiles, outdirname,
                                            pyani_config.NUCMER_DEFAULT)
    print('\n'.join(cmdlist))
    multiprocessing_run(cmdlist)
    # Process .delta files
    results = anim.process_deltadir(nucmername, org_lengths)
    # Sort rows and columns by label, then rescale to percentages
    sorted_pid = results.percentage_identity.sort_index(axis=0)
    anim_pid = sorted_pid.sort_index(axis=1) * 100.0

    print("ANIm data\n", results)

    # Element-wise difference; .values replaces as_matrix(), which was
    # removed in pandas 1.0
    index, columns = anim_pid.index, anim_pid.columns
    diffmat = anim_pid.values - anim_jspecies.values
    anim_diff = pd.DataFrame(diffmat, index=index, columns=columns)

    # Write dataframes to file, for reference
    anim_pid.to_csv(os.path.join(outdirname, 'ANIm_pid.tab'), sep='\t')
    anim_jspecies.to_csv(os.path.join(outdirname, 'ANIm_jspecies.tab'),
                         sep='\t')
    anim_diff.to_csv(os.path.join(outdirname, 'ANIm_diff.tab'), sep='\t')
    print("ANIm concordance test output placed in %s" % outdirname)
    print("ANIm PID\n", anim_pid)
    print("ANIm JSpecies\n", anim_jspecies)
    print("ANIm diff\n", anim_diff)

    # We'd like the absolute difference reported to be < ANIM_THRESHOLD
    max_diff = anim_diff.abs().values.max()
    print("Maximum difference for ANIm: %e" % max_diff)
    assert_less(max_diff, ANIM_THRESHOLD)
Example #9
0
def test_anim_concordance():
    """Test concordance of ANIm method with JSpecies output.

    Runs the generated NUCmer commands, processes the .delta output in the
    output directory and compares the percentage identity table against
    the JSpecies "ANIm" table. The test passes when the largest absolute
    difference is below ANIM_THRESHOLD.
    """
    # Make/check output directory
    mode = "ANIm"
    outdirname = make_outdir(mode)

    # Get dataframes of JSpecies output
    anim_jspecies = parse_table(JSPECIES_OUTFILE, 'ANIm')

    # Identify our input files, and the total lengths of each organism seq
    infiles = pyani_files.get_fasta_files(INDIRNAME)
    org_lengths = pyani_files.get_sequence_lengths(infiles)

    # Test ANIm concordance:
    # Run pairwise NUCmer
    cmdlist = anim.generate_nucmer_commands(infiles, outdirname,
                                            pyani_config.NUCMER_DEFAULT)
    multiprocessing_run(cmdlist, verbose=False)
    # Process .delta files; the pid data is taken from anim_data[1]
    anim_data = anim.process_deltadir(outdirname, org_lengths)
    # Sort rows and columns by label (DataFrame.sort() no longer exists in
    # current pandas), then rescale to percentages
    anim_pid = anim_data[1].sort_index(axis=0).sort_index(axis=1) * 100.0

    # Single-argument print() calls give the same output on Python 2 and 3
    print(anim_data)

    # Element-wise difference; .values replaces the removed as_matrix()
    index, columns = anim_pid.index, anim_pid.columns
    diffmat = anim_pid.values - anim_jspecies.values
    anim_diff = pd.DataFrame(diffmat, index=index, columns=columns)

    # Write dataframes to file, for reference
    anim_pid.to_csv(os.path.join(outdirname, 'ANIm_pid.tab'), sep='\t')
    anim_jspecies.to_csv(os.path.join(outdirname, 'ANIm_jspecies.tab'),
                         sep='\t')
    anim_diff.to_csv(os.path.join(outdirname, 'ANIm_diff.tab'), sep='\t')
    print("ANIm concordance test output placed in %s" % outdirname)
    print("%s %s %s" % (anim_pid, anim_jspecies, anim_diff))

    # We'd like the absolute difference reported to be < ANIM_THRESHOLD
    max_diff = anim_diff.abs().values.max()
    print("Maximum difference for ANIm: %e" % max_diff)
    assert_less(max_diff, ANIM_THRESHOLD)
Example #10
0
def calculate_anim(infiles, org_lengths):
    """Returns ANIm result dataframes for files in input directory.

    - infiles - paths to each input file
    - org_lengths - dictionary of input sequence lengths, keyed by sequence

    Finds ANI by the ANIm method, as described in Richter et al (2009)
    Proc Natl Acad Sci USA 106: 19126-19131 doi:10.1073/pnas.0906412106.

    All FASTA format files (selected by suffix) in the input directory
    are compared against each other, pairwise, using NUCmer (which must
    be in the path). NUCmer output is stored in the output directory.

    The NUCmer .delta file output is parsed to obtain an alignment length
    and similarity error count for every unique region alignment between
    the two organisms, as represented by the sequences in the FASTA files.

    These are processed to give matrices of aligned sequence lengths,
    average nucleotide identity (ANI) percentages, coverage (aligned
    percentage of whole genome), and similarity error count for each
    pairwise comparison.

    Run options are read from the module-level ``args`` object and
    progress is reported through the module-level ``logger``. The return
    value is whatever anim.process_deltadir() produces.

    Raises NotImplementedError if NUCmer is to be run and args.scheduler
    is not 'multiprocessing'. A ZeroDivisionError raised while processing
    the .delta files is logged and then re-raised.
    """
    logger.info("Running ANIm")
    logger.info("Generating NUCmer command-lines")
    # Schedule NUCmer runs
    if not args.skip_nucmer:
        cmdlist = anim.generate_nucmer_commands(infiles,
                                                args.outdirname,
                                                nucmer_exe=args.nucmer_exe,
                                                maxmatch=args.maxmatch)
        logger.info("NUCmer commands:\n" + os.linesep.join(cmdlist))
        if args.scheduler == 'multiprocessing':
            logger.info("Running jobs with multiprocessing")
            cumval = multiprocessing_run(cmdlist, verbose=args.verbose)
            logger.info("Cumulative return value: %d" % cumval)
            if cumval > 0:
                logger.warning("At least one NUCmer comparison failed. "
                               "ANIm may fail.")
            else:
                logger.info("All multiprocessing jobs complete.")
        else:
            logger.info("Running jobs with SGE")
            raise NotImplementedError
    else:
        logger.warning("Skipping NUCmer run (as instructed)!")

    # Process resulting .delta files
    logger.info("Processing NUCmer .delta files.")
    try:
        data = anim.process_deltadir(args.outdirname, org_lengths)
    except ZeroDivisionError:
        logger.error("One or more NUCmer output files has a problem.")
        # cumval is only bound when NUCmer was actually run above
        if not args.skip_nucmer:
            if cumval > 0:
                logger.error("This is possibly due to NUCmer run failure, "
                             "please investigate")
            else:
                logger.error("This is possibly due to a NUCmer comparison "
                             "being too distant for use. Please consider "
                             "using the --maxmatch option.")
        logger.error(last_exception())
        # No result was bound to `data`; re-raise rather than falling
        # through to `return data`, which would raise UnboundLocalError
        # and hide the real cause.
        raise
    return data
Example #11
0
 def test_multiprocessing_run(self):
     """Check multiprocessing_run() returns 0 for the commands in self.cmdlist."""
     outcome = run_multiprocessing.multiprocessing_run(self.cmdlist)
     self.assertEqual(0, outcome)
Example #12
0
def test_multiprocessing_run(mp_cmdlist):
    """Check multiprocessing_run() returns zero for the ``mp_cmdlist`` commands."""
    outcome = multiprocessing_run(mp_cmdlist)
    assert outcome == 0
Example #13
0
 def test_multiprocessing_run(self):
     """multiprocessing_run() returns 0 for the commands in self.cmdlist."""
     outcome = run_multiprocessing.multiprocessing_run(self.cmdlist)
     assert_equal(0, outcome)
def calculate_anim(infiles, org_lengths):
    """Returns ANIm result dataframes for files in input directory.

    - infiles - paths to each input file
    - org_lengths - dictionary of input sequence lengths, keyed by sequence

    Finds ANI by the ANIm method, as described in Richter et al (2009)
    Proc Natl Acad Sci USA 106: 19126-19131 doi:10.1073/pnas.0906412106.

    All FASTA format files (selected by suffix) in the input directory
    are compared against each other, pairwise, using NUCmer (which must
    be in the path). NUCmer output is stored in the output directory.

    The NUCmer .delta file output is parsed to obtain an alignment length
    and similarity error count for every unique region alignment between
    the two organisms, as represented by the sequences in the FASTA files.

    These are processed to give matrices of aligned sequence lengths,
    average nucleotide identity (ANI) percentages, coverage (aligned
    percentage of whole genome), and similarity error count for each
    pairwise comparison.

    Run options are read from the module-level ``args`` object and
    progress is reported through the module-level ``logger``. The return
    value is whatever anim.process_deltadir() produces.

    Raises NotImplementedError if NUCmer is to be run and args.scheduler
    is not 'multiprocessing'. A ZeroDivisionError raised while processing
    the .delta files is logged and then re-raised.
    """
    logger.info("Running ANIm")
    logger.info("Generating NUCmer command-lines")
    # Schedule NUCmer runs
    if not args.skip_nucmer:
        cmdlist = anim.generate_nucmer_commands(infiles, args.outdirname,
                                                nucmer_exe=args.nucmer_exe,
                                                maxmatch=args.maxmatch)
        logger.info("NUCmer commands:\n" + os.linesep.join(cmdlist))
        if args.scheduler == 'multiprocessing':
            logger.info("Running jobs with multiprocessing")
            cumval = multiprocessing_run(cmdlist, verbose=args.verbose)
            logger.info("Cumulative return value: %d" % cumval)
            if cumval > 0:
                logger.warning("At least one NUCmer comparison failed. "
                               "ANIm may fail.")
            else:
                logger.info("All multiprocessing jobs complete.")
        else:
            logger.info("Running jobs with SGE")
            raise NotImplementedError
    else:
        logger.warning("Skipping NUCmer run (as instructed)!")

    # Process resulting .delta files
    logger.info("Processing NUCmer .delta files.")
    try:
        data = anim.process_deltadir(args.outdirname, org_lengths)
    except ZeroDivisionError:
        logger.error("One or more NUCmer output files has a problem.")
        # cumval is only bound when NUCmer was actually run above
        if not args.skip_nucmer:
            if cumval > 0:
                logger.error("This is possibly due to NUCmer run failure, "
                             "please investigate")
            else:
                logger.error("This is possibly due to a NUCmer comparison "
                             "being too distant for use. Please consider "
                             "using the --maxmatch option.")
        logger.error(last_exception())
        # No result was bound to `data`; re-raise rather than falling
        # through to `return data`, which would raise UnboundLocalError
        # and hide the real cause.
        raise
    return data
def unified_anib(infiles, org_lengths):
    """Calculate ANIb for files in input directory.

    - infiles - paths to each input file
    - org_lengths - dictionary of input sequence lengths, keyed by sequence

    Calculates ANI by the ANIb method, as described in Goris et al. (2007)
    Int J Syst Evol Micr 57: 81-91. doi:10.1099/ijs.0.64483-0. There are
    some minor differences depending on whether BLAST+ or legacy BLAST
    (BLASTALL) methods are used.

    All FASTA format files (selected by suffix) in the input directory are
    used to construct BLAST databases, placed in the output directory.
    Each file's contents are also split into sequence fragments of length
    args.fragsize, and the multiple FASTA file that results written to
    the output directory. These are BLASTNed, pairwise, against the
    databases.

    The BLAST output is interrogated for all fragment matches that cover
    at least 70% of the query sequence, with at least 30% nucleotide
    identity over the full length of the query sequence. This is an odd
    choice and doesn't correspond to the twilight zone limit as implied by
    Goris et al. We persist with their definition, however.  Only these
    qualifying matches contribute to the total aligned length, and total
    aligned sequence identity used to calculate ANI.

    The results are processed to give matrices of aligned sequence length
    (aln_lengths.tab), similarity error counts (sim_errors.tab), ANIs
    (perc_ids.tab), and minimum aligned percentage (perc_aln.tab) of
    each genome, for each pairwise comparison. This function returns
    whatever anib.process_blast() produces; it does not itself write
    those tables to file.

    Run options are read from the module-level ``args`` object and
    progress is reported through the module-level ``logger``.

    Raises NotImplementedError if BLAST jobs are to be run and
    args.scheduler is not 'multiprocessing'. A ZeroDivisionError raised
    while processing the BLAST output is logged and then re-raised.
    """
    logger.info("Running %s" % args.method)
    # Build BLAST databases and run pairwise BLASTN
    if not args.skip_blastn:
        # Make sequence fragments
        logger.info("Fragmenting input files, and writing to %s" %
                    args.outdirname)
        # Fraglengths does not get reused with BLASTN
        fragfiles, fraglengths = anib.fragment_FASTA_files(infiles,
                                                           args.outdirname,
                                                           args.fragsize)
        # Export fragment lengths as JSON, in case we re-run BLASTALL with
        # --skip_blastn
        if args.method == "ANIblastall":
            with open(os.path.join(args.outdirname,
                                   'fraglengths.json'), 'w') as outfile:
                json.dump(fraglengths, outfile)

        # Which executables are we using?
        if args.method == "ANIblastall":
            blastdb_exe = args.formatdb_exe
            blastn_exe = args.blastall_exe
        else:
            blastdb_exe = args.makeblastdb_exe
            blastn_exe = args.blastn_exe

        # Build BLASTN databases
        logger.info("Constructing %s BLAST databases" % args.method)
        cmdlist = anib.generate_blastdb_commands(infiles, args.outdirname,
                                                 blastdb_exe=blastdb_exe,
                                                 mode=args.method)
        logger.info("Generated commands:\n%s" % '\n'.join(cmdlist))
        if args.scheduler == 'multiprocessing':
            logger.info("Running jobs with multiprocessing")
            cumval = multiprocessing_run(cmdlist, verbose=args.verbose)
            if cumval > 0:
                logger.warning("At least one makeblastdb run failed. " +
                               "%s may fail." % args.method)
            else:
                logger.info("All multiprocessing jobs complete.")
        else:
            logger.info("Running jobs with SGE")
            raise NotImplementedError

        # Run pairwise BLASTN
        logger.info("Running %s BLASTN jobs" % args.method)
        cmdlist = anib.generate_blastn_commands(fragfiles, args.outdirname,
                                                blastn_exe, mode=args.method)
        logger.info("Generated commands:\n%s" % '\n'.join(cmdlist))
        if args.scheduler == 'multiprocessing':
            logger.info("Running jobs with multiprocessing")
            # From here on cumval holds the BLASTN batch result, replacing
            # the database-build result
            cumval = multiprocessing_run(cmdlist, verbose=args.verbose)
            logger.info("Cumulative return value: %d" % cumval)
            if cumval > 0:
                logger.warning("At least one BLASTN comparison failed. " +
                               "%s may fail." % args.method)
            else:
                logger.info("All multiprocessing jobs complete.")
        else:
            logger.info("Running jobs with SGE")
            raise NotImplementedError
    else:
        # Import fragment lengths from JSON
        if args.method == "ANIblastall":
            # Plain 'r': the 'U' mode flag is rejected by open() on
            # Python 3.11+
            with open(os.path.join(args.outdirname, 'fraglengths.json'),
                      'r') as infile:
                fraglengths = json.load(infile)
        else:
            fraglengths = None
        logger.warning("Skipping BLASTN runs (as instructed)!")

    # Process pairwise BLASTN output
    logger.info("Processing pairwise %s BLAST output." % args.method)
    try:
        data = anib.process_blast(args.outdirname, org_lengths,
                                  fraglengths=fraglengths, mode=args.method)
    except ZeroDivisionError:
        logger.error("One or more BLAST output files has a problem.")
        # cumval is only bound when the BLAST jobs were actually run above
        if not args.skip_blastn:
            if cumval > 0:
                logger.error("This is possibly due to BLASTN run failure, "
                             "please investigate")
            else:
                logger.error("This is possibly due to a BLASTN comparison "
                             "being too distant for use.")
        logger.error(last_exception())
        # No result was bound to `data`; re-raise rather than falling
        # through to `return data`, which would raise UnboundLocalError
        # and hide the real cause.
        raise
    return data
Example #16
0
 def test_multiprocessing_run(self):
     """multiprocessing_run() returns 0 for the commands in self.cmdlist."""
     outcome = run_multiprocessing.multiprocessing_run(self.cmdlist)
     assert_equal(0, outcome)
Example #17
0
def unified_anib(infiles, org_lengths):
    """Calculate ANIb for files in input directory.

    - infiles - paths to each input file
    - org_lengths - dictionary of input sequence lengths, keyed by sequence

    Calculates ANI by the ANIb method, as described in Goris et al. (2007)
    Int J Syst Evol Micr 57: 81-91. doi:10.1099/ijs.0.64483-0. There are
    some minor differences depending on whether BLAST+ or legacy BLAST
    (BLASTALL) methods are used.

    All FASTA format files (selected by suffix) in the input directory are
    used to construct BLAST databases, placed in the output directory.
    Each file's contents are also split into sequence fragments of length
    args.fragsize, and the multiple FASTA file that results written to
    the output directory. These are BLASTNed, pairwise, against the
    databases.

    The BLAST output is interrogated for all fragment matches that cover
    at least 70% of the query sequence, with at least 30% nucleotide
    identity over the full length of the query sequence. This is an odd
    choice and doesn't correspond to the twilight zone limit as implied by
    Goris et al. We persist with their definition, however.  Only these
    qualifying matches contribute to the total aligned length, and total
    aligned sequence identity used to calculate ANI.

    The results are processed to give matrices of aligned sequence length
    (aln_lengths.tab), similarity error counts (sim_errors.tab), ANIs
    (perc_ids.tab), and minimum aligned percentage (perc_aln.tab) of
    each genome, for each pairwise comparison. This function returns
    whatever anib.process_blast() produces; it does not itself write
    those tables to file.

    Run options are read from the module-level ``args`` object and
    progress is reported through the module-level ``logger``.

    Raises NotImplementedError if BLAST jobs are to be run and
    args.scheduler is not 'multiprocessing'. A ZeroDivisionError raised
    while processing the BLAST output is logged and then re-raised.
    """
    logger.info("Running %s" % args.method)
    # Build BLAST databases and run pairwise BLASTN
    if not args.skip_blastn:
        # Make sequence fragments
        logger.info("Fragmenting input files, and writing to %s" %
                    args.outdirname)
        # Fraglengths does not get reused with BLASTN
        fragfiles, fraglengths = anib.fragment_FASTA_files(
            infiles, args.outdirname, args.fragsize)
        # Export fragment lengths as JSON, in case we re-run BLASTALL with
        # --skip_blastn
        if args.method == "ANIblastall":
            with open(os.path.join(args.outdirname, 'fraglengths.json'),
                      'w') as outfile:
                json.dump(fraglengths, outfile)

        # Which executables are we using?
        if args.method == "ANIblastall":
            blastdb_exe = args.formatdb_exe
            blastn_exe = args.blastall_exe
        else:
            blastdb_exe = args.makeblastdb_exe
            blastn_exe = args.blastn_exe

        # Build BLASTN databases
        logger.info("Constructing %s BLAST databases" % args.method)
        cmdlist = anib.generate_blastdb_commands(infiles,
                                                 args.outdirname,
                                                 blastdb_exe=blastdb_exe,
                                                 mode=args.method)
        logger.info("Generated commands:\n%s" % '\n'.join(cmdlist))
        if args.scheduler == 'multiprocessing':
            logger.info("Running jobs with multiprocessing")
            cumval = multiprocessing_run(cmdlist, verbose=args.verbose)
            if cumval > 0:
                logger.warning("At least one makeblastdb run failed. " +
                               "%s may fail." % args.method)
            else:
                logger.info("All multiprocessing jobs complete.")
        else:
            logger.info("Running jobs with SGE")
            raise NotImplementedError

        # Run pairwise BLASTN
        logger.info("Running %s BLASTN jobs" % args.method)
        cmdlist = anib.generate_blastn_commands(fragfiles,
                                                args.outdirname,
                                                blastn_exe,
                                                mode=args.method)
        logger.info("Generated commands:\n%s" % '\n'.join(cmdlist))
        if args.scheduler == 'multiprocessing':
            logger.info("Running jobs with multiprocessing")
            # From here on cumval holds the BLASTN batch result, replacing
            # the database-build result
            cumval = multiprocessing_run(cmdlist, verbose=args.verbose)
            logger.info("Cumulative return value: %d" % cumval)
            if cumval > 0:
                logger.warning("At least one BLASTN comparison failed. " +
                               "%s may fail." % args.method)
            else:
                logger.info("All multiprocessing jobs complete.")
        else:
            logger.info("Running jobs with SGE")
            raise NotImplementedError
    else:
        # Import fragment lengths from JSON
        if args.method == "ANIblastall":
            # Plain 'r': the 'U' mode flag is rejected by open() on
            # Python 3.11+
            with open(os.path.join(args.outdirname, 'fraglengths.json'),
                      'r') as infile:
                fraglengths = json.load(infile)
        else:
            fraglengths = None
        logger.warning("Skipping BLASTN runs (as instructed)!")

    # Process pairwise BLASTN output
    logger.info("Processing pairwise %s BLAST output." % args.method)
    try:
        data = anib.process_blast(args.outdirname,
                                  org_lengths,
                                  fraglengths=fraglengths,
                                  mode=args.method)
    except ZeroDivisionError:
        logger.error("One or more BLAST output files has a problem.")
        # cumval is only bound when the BLAST jobs were actually run above
        if not args.skip_blastn:
            if cumval > 0:
                logger.error("This is possibly due to BLASTN run failure, "
                             "please investigate")
            else:
                logger.error("This is possibly due to a BLASTN comparison "
                             "being too distant for use.")
        logger.error(last_exception())
        # No result was bound to `data`; re-raise rather than falling
        # through to `return data`, which would raise UnboundLocalError
        # and hide the real cause.
        raise
    return data