def test_find_integrase_gembase(self):
        """
        With the gembase option enabled, find_integrase must write the
        intI and phage_int result files (raw and table) in the tmp dir.
        """
        # Set the option before the Config is built, as the other gembase
        # tests of this file do.
        self.args.gembase = True
        cfg = Config(self.args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        replicon_name = 'acba.007.p01.13'
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))

        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)

        # The protein file is stored under the replicon *name* while the
        # reference file is looked up with the replicon *id*.
        prot_file = os.path.join(self.tmp_dir, replicon_name + ".prt")
        shutil.copyfile(
            self.find_data(os.path.join('Proteins', replicon.id + ".prt")),
            prot_file)

        integrase.find_integrase(replicon.id, prot_file, self.tmp_dir, cfg)

        for suffix in ('_intI.res', '_intI_table.res', '_phage_int.res',
                       '_phage_int_table.res'):
            res = os.path.join(self.tmp_dir, replicon.id + suffix)
            self.assertTrue(os.path.exists(res),
                            "expected result file is missing: {}".format(res))
    def test_find_integrase_no_gembase_no_protfile_no_prodigal(self):
        """
        With hmmsearch set to 'foo', find_integrase must raise a RuntimeError
        whose message reports "No such file or directory: 'foo'".
        """
        self.args.hmmsearch = 'foo'
        self.args.gembase = False
        cfg = Config(self.args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        replicon_name = 'acba.007.p01.13'
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)

        # __len__ is patched on the class itself. The try block starts only
        # once the patch is in place, so the cleanup can never fail on an
        # unbound name and hide the real error of a broken setup.
        replicon_cls = replicon.__class__
        len_ori = replicon_cls.__len__
        replicon_cls.__len__ = lambda x: 500000
        try:
            prot_file = os.path.join(self.tmp_dir, replicon.id + ".prt")

            shutil.copyfile(
                self.find_data(os.path.join('Proteins', replicon.id + ".prt")),
                prot_file)

            with self.assertRaises(RuntimeError) as ctx:
                integrase.find_integrase(replicon.id, prot_file, self.tmp_dir,
                                         cfg)
            # raw string: "\[" is an invalid escape in a normal literal
            self.assertTrue(
                re.search(
                    r"failed : \[Errno 2\] No such file or directory: 'foo'",
                    str(ctx.exception)))
        finally:
            replicon_cls.__len__ = len_ori
    def test_find_integrase_no_gembase_no_protfile(self):
        """
        find_integrase must raise EmptyFileError when the protein file it is
        given ('foo.prt') is empty and the replicon length is patched to 500000.
        """
        # Set the option before the Config is built, as the sibling tests do.
        self.args.gembase = False
        cfg = Config(self.args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        replicon_name = 'acba.007.p01.13'
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)

        # __len__ is patched on the class itself. The try block starts only
        # once the patch is in place, so the cleanup can never fail on an
        # unbound name and hide the real error of a broken setup.
        replicon_cls = replicon.__class__
        len_ori = replicon_cls.__len__
        replicon_cls.__len__ = lambda x: 500000
        try:
            prot_file = os.path.join(self.tmp_dir, "foo.prt")
            # create an empty protein file
            with open(prot_file, 'w'):
                pass
            with self.catch_log():
                with self.assertRaises(EmptyFileError):
                    integrase.find_integrase(replicon.id, prot_file,
                                             self.tmp_dir, cfg)
        finally:
            replicon_cls.__len__ = len_ori
    def test_find_integrase_no_gembase_no_protfile_short_seq(self):
        """
        With the replicon length patched to 200, find_integrase must still
        write the intI and phage_int result files (raw and table).
        """
        # Set the option before the Config is built, as the sibling tests do.
        self.args.gembase = False
        cfg = Config(self.args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        replicon_name = 'acba.007.p01.13'
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        prot_name = 'ACBA.007.P01_13'
        prot_path = self.find_data(
            os.path.join('Proteins', prot_name + '.prt'))

        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)

        # __len__ is patched on the class itself. The try block starts only
        # once the patch is in place, so the cleanup can never fail on an
        # unbound name and hide the real error of a broken setup.
        replicon_cls = replicon.__class__
        len_ori = replicon_cls.__len__
        replicon_cls.__len__ = lambda x: 200
        try:
            prot_file = os.path.join(self.tmp_dir, replicon.id + ".prt")
            shutil.copyfile(prot_path, prot_file)

            integrase.find_integrase(replicon.id, prot_file, self.tmp_dir, cfg)
            for suffix in ('_intI.res', '_intI_table.res', '_phage_int.res',
                           '_phage_int_table.res'):
                res = os.path.join(self.tmp_dir, replicon.id + suffix)
                self.assertTrue(os.path.exists(res))
        finally:
            replicon_cls.__len__ = len_ori
    def test_find_integrase_no_gembase_with_protfile_empty(self):
        """
        With the replicon length patched to 200 and an empty protein file,
        find_integrase must raise EmptyFileError with the expected message.
        """
        # Set the option before the Config is built, as the sibling tests do.
        self.args.gembase = False
        cfg = Config(self.args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        replicon_name = 'acba.007.p01.13'
        replicon_path = self.find_data(
            os.path.join('Replicons', replicon_name + '.fst'))
        topologies = Topology('lin')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            replicon = next(sequences_db)

        # __len__ is patched on the class itself. The try block starts only
        # once the patch is in place, so the cleanup can never fail on an
        # unbound name and hide the real error of a broken setup.
        replicon_cls = replicon.__class__
        len_ori = replicon_cls.__len__
        replicon_cls.__len__ = lambda x: 200
        try:
            prot_file = os.path.join(self.tmp_dir, replicon.id + ".prt")
            # create an empty protein file
            with open(prot_file, 'w'):
                pass
            with self.assertRaises(EmptyFileError) as ctx:
                with self.catch_log():
                    integrase.find_integrase(replicon.id, prot_file,
                                             self.tmp_dir, cfg)
            self.assertTrue(
                re.match(
                    "^The protein file: '.*' is empty cannot perform hmmsearch on it.$",
                    str(ctx.exception)))
        finally:
            replicon_cls.__len__ = len_ori
    def setUp(self):
        """
        Define variables common to all tests.

        Reads the first record of the test replicon, (re)creates an empty temporary
        working directory, builds the configuration, copies the reference protein file,
        calls ``find_integrase`` once and replaces ``annotation.call`` with the result
        of ``self.mute_call(_annot_call_ori)``.
        """
        replicon_name = "acba.007.p01.13"
        self.replicon_path = self.find_data(os.path.join('Replicons', replicon_name + '.fst'))
        topologies = Topology('lin')
        with FastaIterator(self.replicon_path) as sequences_db:
            sequences_db.topologies = topologies
            self.replicon = next(sequences_db)

        # always start from an empty working directory
        self.tmp_dir = os.path.join(tempfile.gettempdir(), 'tmp_test_integron_finder')
        if os.path.isdir(self.tmp_dir):
            shutil.rmtree(self.tmp_dir)
        os.makedirs(self.tmp_dir)

        # Resfams is too big to be in tests/data
        # search directly in data
        self.hmm_files = [os.path.normpath(
            os.path.join(os.path.dirname(__file__), "..", "data", "Functional_annotation", "Resfams.hmm")
        )]
        # Define integron_finder variables
        args = argparse.Namespace()
        args.gembase = False
        args.annot_parser_name = None
        # shutil.which replaces distutils.spawn.find_executable (distutils is deprecated
        # and removed from the standard library in Python 3.12); both return None when
        # the program is not found on the PATH.
        args.hmmsearch = shutil.which("hmmsearch")
        args.prodigal = shutil.which("prodigal")
        args.cpu = 1
        args.out_dir = self.tmp_dir
        self.cfg = Config(args)
        self.cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        prot_dir = os.path.join(self.tmp_dir, 'Proteins')
        os.makedirs(prot_dir)
        self.prot_file = os.path.join(prot_dir, self.replicon.name + ".prt")
        shutil.copyfile(self.find_data(os.path.join('Proteins', self.replicon.id + ".prt")), self.prot_file)
        self.prot_db = ProdigalDB(self.replicon, self.cfg, prot_file=self.prot_file)

        # paths of the files the tests expect to find in the working directory
        exp_suffixes = ("_Resfams_fa_table.res",
                        "_intI_table.res",
                        "_phage_int_table.res",
                        "_Resfams_fa.res",
                        "_intI.res",
                        "_phage_int.res",
                        "_subseqprot.tmp")
        self.exp_files = [os.path.join(self.tmp_dir, self.replicon.id + suffix) for suffix in exp_suffixes]

        self.prot_dtype = {"pos_beg": 'int',
                           "pos_end": 'int',
                           "strand": 'int',
                           "evalue": 'float',
                           "type_elt": 'str',
                           "annotation": 'str',
                           "model": 'str',
                           "distance_2attC": 'float'}

        # Search the integrases in the protein file copied above; the result is shared by the tests
        self.integrases = find_integrase(self.replicon.id, self.prot_file, self.tmp_dir, self.cfg)
        annotation.call = self.mute_call(_annot_call_ori)
    def test_find_integrase_gembase_hmmer_error(self):
        """
        With gembase enabled and cpu set to 'foo', find_integrase must raise
        a RuntimeError whose message ends with 'failed return code = 1'.
        """
        self.args.gembase = True
        self.args.cpu = 'foo'
        cfg = Config(self.args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        # read the first record of the test replicon
        name = 'acba.007.p01.13'
        fasta_path = os.path.join(self._data_dir, 'Replicons', name + '.fst')
        linear = Topology('lin')
        with FastaIterator(fasta_path) as seq_db:
            seq_db.topologies = linear
            replicon = next(seq_db)

        # put the reference protein file in the working directory
        prt_basename = replicon.id + ".prt"
        prot_file = os.path.join(self.tmp_dir, prt_basename)
        reference_prt = os.path.join(self._data_dir, 'Proteins', prt_basename)
        shutil.copyfile(reference_prt, prot_file)

        with self.assertRaises(RuntimeError) as ctx:
            integrase.find_integrase(replicon.id, prot_file, self.tmp_dir, cfg)
        message = str(ctx.exception)
        self.assertTrue(message.endswith('failed return code = 1'))
    def test_find_integrase_gembase_no_hmmer_no_replicon(self):
        """
        No protein file is created in the working directory: find_integrase
        must raise a RuntimeError naming the missing protein file.
        """
        self.args.gembase = True
        self.args.hmmsearch = 'foo'
        cfg = Config(self.args)
        cfg._prefix_data = os.path.join(os.path.dirname(__file__), 'data')

        # read the first record of the test replicon
        name = 'acba.007.p01.13'
        fasta_path = os.path.join(self._data_dir, 'Replicons', name + '.fst')
        linear = Topology('lin')
        with FastaIterator(fasta_path) as seq_db:
            seq_db.topologies = linear
            replicon = next(seq_db)

        # this path is deliberately never created
        prot_file = os.path.join(self.tmp_dir, replicon.id + ".prt")
        expected_msg = "The protein file: '{}' does not exists cannot perform hmmsearch on it.".format(prot_file)

        with self.catch_log():
            with self.assertRaises(RuntimeError) as ctx:
                integrase.find_integrase(replicon.id, prot_file, self.tmp_dir, cfg)
            self.assertEqual(expected_msg, str(ctx.exception))
Beispiel #9
0
def find_integron_in_one_replicon(replicon, config):
    """
    scan replicon for integron.

      * presence of integrase
      * presence of attC sites
      * presence of promoters and attI sites

    depending on the configuration

     * perform functional annotation

    produce a file containing presence of putative integrons

    depending on configuration

        * produce genbank file with replicon and annotations with integrons
        * produce schema of replicon with integrons (in pdf)

    Intermediate results are written in ``config.tmp_dir(replicon.id)``; files already
    present there (integrase results, attC table, ``integron_max.pickle``) are reused
    instead of being recomputed. This directory is removed at the end unless
    ``config.keep_tmp`` is set.

    :param replicon: the replicon to analyse.
    :type replicon: a :class:`Bio.SeqRecord` object.
    :param config: The configuration
    :type config: a :class:`integron_finder.config.Config` object.
    :returns: the path to the integron file (<replicon_id>.integrons)
              and the summary file (<replicon_id.summary>).
              if there is no integron the summary file is None.
              if the replicon is skipped because an EmptyFileError is raised,
              both values are empty strings.
    :rtype: tuple (str integron_file, str summary_file) or (str integron_file, None)
    :raises IntegronError: if functional annotation is requested without --path-func-annot
                           and neither 'bank_hmm' nor config.func_annot_path exists.
    :raises OSError: if the temporary result directory cannot be created.
    """
    result_tmp_dir = config.tmp_dir(replicon.id)
    try:
        os.mkdir(result_tmp_dir)
    except OSError:
        # An already existing directory is fine (its content may be reused below).
        # Any other failure is re-raised here instead of surfacing later as an
        # unrelated write error.
        if not os.path.isdir(result_tmp_dir):
            raise
    tmp_replicon_path = os.path.join(result_tmp_dir, replicon.id + '.fst')
    SeqIO.write(replicon, tmp_replicon_path, "fasta")
    # create attr path
    # used to generate protein file with prodigal
    replicon.path = tmp_replicon_path

    # func_annot_path is the canonical path for Functional_annotation
    # path_func_annot is the path provide on the command line
    if config.func_annot and not config.no_proteins and not config.path_func_annot:
        if os.path.exists('bank_hmm'):
            fa_hmm = scan_hmm_bank('bank_hmm')
        elif os.path.exists(config.func_annot_path):
            fa_hmm = scan_hmm_bank(config.func_annot_path)
        else:
            raise IntegronError("the dir '{}' neither 'bank_hmm' exists, specify the location of hmm "
                                "profile with --path-func-annot option".format(config.func_annot_path))
        is_func_annot = True

    elif config.path_func_annot and config.no_proteins is False:
        fa_hmm = scan_hmm_bank(config.path_func_annot)
        is_func_annot = True
    else:
        fa_hmm = None
        is_func_annot = False

    if is_func_annot and not fa_hmm:
        _log.warning("No hmm profiles for functional annotation detected, skip functional annotation step.")

    if config.gembase_path:
        protein_db = GembaseDB(replicon, config, gembase_path=config.gembase_path)
    elif config.gembase:
        protein_db = GembaseDB(replicon, config)
    else:
        protein_db = ProdigalDB(replicon, config)

    ##################
    # Default search #
    ##################
    intI_file = os.path.join(result_tmp_dir, replicon.id + "_intI.res")
    phageI_file = os.path.join(result_tmp_dir, replicon.id + "_phage_int.res")
    attC_default_file = os.path.join(result_tmp_dir, replicon.id + "_attc_table.res")

    try:
        if not config.no_proteins:
            if not os.path.isfile(intI_file) or not os.path.isfile(phageI_file):
                find_integrase(replicon.id, protein_db.protfile, result_tmp_dir, config)
        _log.info("Starting Default search ... :")
        if not os.path.isfile(attC_default_file):
            # find attc with cmsearch
            # NOTE(review): find_attc receives replicon.name whereas attC_default_file is
            # built from replicon.id -- confirm that name and id always coincide.
            find_attc(tmp_replicon_path, replicon.name, config.cmsearch, result_tmp_dir, config.model_attc_path,
                      incE=config.evalue_attc,
                      cpu=config.cpu)

        _log.info("Default search done... : ")
        integrons = find_integron(replicon, protein_db, attC_default_file, intI_file, phageI_file, config)

        #########################
        # Search with local_max #
        #########################
        if config.local_max:
            _log.info("Starting search with local_max...:")
            integron_max_path = os.path.join(result_tmp_dir, "integron_max.pickle")
            if not os.path.isfile(integron_max_path):
                circular = replicon.topology == 'circ'
                integron_max = find_attc_max(integrons, replicon, config.distance_threshold,
                                             config.model_attc_path,
                                             max_attc_size=config.max_attc_size,
                                             min_attc_size=config.min_attc_size,
                                             circular=circular, out_dir=result_tmp_dir,
                                             cpu=config.cpu,
                                             evalue_attc=config.evalue_attc)
                integron_max.to_pickle(integron_max_path)
                _log.info("Search with local_max done... :")

            else:
                # NOTE(review): unpickling executes arbitrary code; this file is only safe
                # as long as the temporary directory is written by this program alone.
                integron_max = pd.read_pickle(integron_max_path)
                # re-apply the current thresholds to the cached hits
                attc_size = abs(integron_max.pos_end - integron_max.pos_beg)
                integron_max = integron_max[(integron_max.evalue < config.evalue_attc) &
                                            (attc_size < config.max_attc_size) &
                                            (config.min_attc_size < attc_size)]
                _log.info("Search with local_max was already done, continue... :")

            integrons = find_integron(replicon, protein_db, integron_max, intI_file, phageI_file, config)

        ##########################
        # Add promoters and attI #
        ##########################
        for integron in integrons:
            integron_type = integron.type()
            if integron_type != "In0" and not config.no_proteins:  # complete & CALIN
                _log.info("Adding proteins ... :")
                integron.add_proteins(protein_db)

            if config.promoter_attI:
                _log.info("Adding promoters and attI ... :")
                if integron_type == "complete":
                    integron.add_promoter()
                    integron.add_attI()
                elif integron_type == "In0":
                    integron.add_attI()
                    integron.add_promoter()
        #########################
        # Functional annotation #
        #########################
        if is_func_annot and fa_hmm:
            _log.info("Starting functional annotation ...:")
            func_annot(integrons, replicon, protein_db, fa_hmm, config, result_tmp_dir)

        #######################
        # Writing out results #
        #######################
        _log.info("Writing out results for replicon {}".format(replicon.id))

        if config.pdf:
            # j numbers every integron, so the pdf names keep the rank of the integron
            # in the whole list even though only the complete ones are drawn
            for j, integron in enumerate(integrons, 1):
                if integron.type() == "complete":
                    integron.draw_integron(file=os.path.join(config.result_dir, "{}_{}.pdf".format(replicon.id, j)))

        base_outfile = os.path.join(config.result_dir, replicon.id)
        integron_file = base_outfile + ".integrons"
        _log.debug("Writing integron_file {}".format(integron_file))
        if integrons:
            integrons_report = results.integrons_report(integrons)
            integrons_report.to_csv(integron_file, sep="\t", index=False, na_rep="NA")

            summary = results.summary(integrons_report)
            summary_file = base_outfile + ".summary"
            summary.to_csv(summary_file, sep="\t", na_rep="NA", index=False,
                           columns=['ID_replicon', 'ID_integron', 'complete', 'In0', 'CALIN'])
            if config.gbk:
                add_feature(replicon, integrons_report, protein_db, config.distance_threshold)
                SeqIO.write(replicon, os.path.join(config.result_dir, replicon.id + ".gbk"), "genbank")
        else:
            with open(integron_file, "w") as out_f:
                out_f.write("# No Integron found\n")
            summary_file = None
    except integron_finder.EmptyFileError:
        # the replicon is skipped: no result file is reported for it
        _log.warning('############ Skip replicon {} ############'.format(replicon.name))
        integron_file = ''
        summary_file = ''
    #########################
    # clean temporary files #
    #########################

    if not config.keep_tmp:
        try:
            shutil.rmtree(result_tmp_dir)
        except Exception as err:
            # best effort: a failed cleanup must not discard the results
            _log.warning("Cannot remove temporary results : '{} : {}'".format(result_tmp_dir, str(err)))

    return integron_file, summary_file
Beispiel #10
0
def find_integron_in_one_replicon(replicon, config):
    """
    scan replicon for integron.

      * presence of integrase
      * presence of attC sites
      * presence of promoters and attI sites

    depending on the configuration

     * perform functional annotation

    produce a file containing presence of putative integrons

    depending on configuration

        * produce genbank file with replicon and annotations with integrons
        * produce schema of replicon with integrons (in pdf)

    Intermediate results are written in ``config.tmp_dir(replicon.id)``;
    files already present there (integrase results, attC table,
    ``integron_max.pickle``) are reused instead of being recomputed.
    This directory is removed at the end unless ``config.keep_tmp`` is set.

    :param replicon: the replicon to analyse.
    :type replicon: a :class:`Bio.SeqRecord` object.
    :param config: The configuration
    :type config: a :class:`integron_finder.config.Config` object.
    :returns: the path to the integron file (<replicon_id>.integrons)
              and the summary file (<replicon_id.summary>).
              if there is no integron the summary file is None.
              if the replicon is skipped because an EmptyFileError is raised,
              both values are empty strings.
    :rtype: tuple (str integron_file, str summary_file) or (str integron_file, None)
    :raises IntegronError: if functional annotation is requested without
                           --path-func-annot and neither 'bank_hmm' nor
                           config.func_annot_path exists.
    :raises OSError: if the temporary result directory cannot be created.
    """
    result_tmp_dir = config.tmp_dir(replicon.id)
    try:
        os.mkdir(result_tmp_dir)
    except OSError:
        # An already existing directory is fine (its content may be reused
        # below). Any other failure is re-raised here instead of surfacing
        # later as an unrelated write error.
        if not os.path.isdir(result_tmp_dir):
            raise
    tmp_replicon_path = os.path.join(result_tmp_dir, replicon.id + '.fst')
    SeqIO.write(replicon, tmp_replicon_path, "fasta")
    # create attr path
    # used to generate protein file with prodigal
    replicon.path = tmp_replicon_path

    # func_annot_path is the canonical path for Functional_annotation
    # path_func_annot is the path provide on the command line
    if config.func_annot and not config.no_proteins and not config.path_func_annot:
        if os.path.exists('bank_hmm'):
            fa_hmm = scan_hmm_bank('bank_hmm')
        elif os.path.exists(config.func_annot_path):
            fa_hmm = scan_hmm_bank(config.func_annot_path)
        else:
            raise IntegronError(
                "the dir '{}' neither 'bank_hmm' exists, specify the location of hmm "
                "profile with --path-func-annot option".format(
                    config.func_annot_path))
        is_func_annot = True

    elif config.path_func_annot and config.no_proteins is False:
        fa_hmm = scan_hmm_bank(config.path_func_annot)
        is_func_annot = True
    else:
        fa_hmm = None
        is_func_annot = False

    if is_func_annot and not fa_hmm:
        _log.warning(
            "No hmm profiles for functional annotation detected, skip functional annotation step."
        )

    if config.gembase_path:
        protein_db = GembaseDB(replicon,
                               config,
                               gembase_path=config.gembase_path)
    elif config.gembase:
        protein_db = GembaseDB(replicon, config)
    else:
        protein_db = ProdigalDB(replicon, config)

    ##################
    # Default search #
    ##################
    intI_file = os.path.join(result_tmp_dir, replicon.id + "_intI.res")
    phageI_file = os.path.join(result_tmp_dir, replicon.id + "_phage_int.res")
    attC_default_file = os.path.join(result_tmp_dir,
                                     replicon.id + "_attc_table.res")

    try:
        if not config.no_proteins:
            if not os.path.isfile(intI_file) or not os.path.isfile(
                    phageI_file):
                find_integrase(replicon.id, protein_db.protfile,
                               result_tmp_dir, config)
        _log.info("Starting Default search ... :")
        if not os.path.isfile(attC_default_file):
            # find attc with cmsearch
            # NOTE(review): find_attc receives replicon.name whereas
            # attC_default_file is built from replicon.id -- confirm that
            # name and id always coincide.
            find_attc(tmp_replicon_path,
                      replicon.name,
                      config.cmsearch,
                      result_tmp_dir,
                      config.model_attc_path,
                      incE=config.evalue_attc,
                      cpu=config.cpu)

        _log.info("Default search done... : ")
        integrons = find_integron(replicon, protein_db, attC_default_file,
                                  intI_file, phageI_file, config)

        #########################
        # Search with local_max #
        #########################
        if config.local_max:
            _log.info("Starting search with local_max...:")
            integron_max_path = os.path.join(result_tmp_dir,
                                             "integron_max.pickle")
            if not os.path.isfile(integron_max_path):
                circular = replicon.topology == 'circ'
                integron_max = find_attc_max(
                    integrons,
                    replicon,
                    config.distance_threshold,
                    config.model_attc_path,
                    max_attc_size=config.max_attc_size,
                    min_attc_size=config.min_attc_size,
                    circular=circular,
                    out_dir=result_tmp_dir,
                    cpu=config.cpu,
                    evalue_attc=config.evalue_attc)
                integron_max.to_pickle(integron_max_path)
                _log.info("Search with local_max done... :")

            else:
                # NOTE(review): unpickling executes arbitrary code; this file
                # is only safe as long as the temporary directory is written
                # by this program alone.
                integron_max = pd.read_pickle(integron_max_path)
                # re-apply the current thresholds to the cached hits
                attc_size = abs(integron_max.pos_end - integron_max.pos_beg)
                integron_max = integron_max[
                    (integron_max.evalue < config.evalue_attc)
                    & (attc_size < config.max_attc_size)
                    & (config.min_attc_size < attc_size)]
                _log.info(
                    "Search with local_max was already done, continue... :")

            integrons = find_integron(replicon, protein_db, integron_max,
                                      intI_file, phageI_file, config)

        ##########################
        # Add promoters and attI #
        ##########################
        for integron in integrons:
            integron_type = integron.type()
            # complete & CALIN
            if integron_type != "In0" and not config.no_proteins:
                _log.info("Adding proteins ... :")
                integron.add_proteins(protein_db)

            if config.promoter_attI:
                _log.info("Adding promoters and attI ... :")
                if integron_type == "complete":
                    integron.add_promoter()
                    integron.add_attI()
                elif integron_type == "In0":
                    integron.add_attI()
                    integron.add_promoter()
        #########################
        # Functional annotation #
        #########################
        if is_func_annot and fa_hmm:
            _log.info("Starting functional annotation ...:")
            func_annot(integrons, replicon, protein_db, fa_hmm, config,
                       result_tmp_dir)

        #######################
        # Writing out results #
        #######################
        _log.info("Writing out results for replicon {}".format(replicon.id))

        if config.pdf:
            # j numbers every integron, so the pdf names keep the rank of the
            # integron in the whole list even though only the complete ones
            # are drawn
            for j, integron in enumerate(integrons, 1):
                if integron.type() == "complete":
                    integron.draw_integron(file=os.path.join(
                        config.result_dir, "{}_{}.pdf".format(replicon.id, j)))

        base_outfile = os.path.join(config.result_dir, replicon.id)
        integron_file = base_outfile + ".integrons"
        _log.debug("Writing integron_file {}".format(integron_file))
        if integrons:
            integrons_report = results.integrons_report(integrons)
            integrons_report.to_csv(integron_file,
                                    sep="\t",
                                    index=False,
                                    na_rep="NA")

            summary = results.summary(integrons_report)
            summary_file = base_outfile + ".summary"
            summary.to_csv(summary_file,
                           sep="\t",
                           na_rep="NA",
                           index=False,
                           columns=[
                               'ID_replicon', 'ID_integron', 'complete', 'In0',
                               'CALIN'
                           ])
            if config.gbk:
                add_feature(replicon, integrons_report, protein_db,
                            config.distance_threshold)
                SeqIO.write(
                    replicon,
                    os.path.join(config.result_dir, replicon.id + ".gbk"),
                    "genbank")
        else:
            with open(integron_file, "w") as out_f:
                out_f.write("# No Integron found\n")
            summary_file = None
    except integron_finder.EmptyFileError:
        # the replicon is skipped: no result file is reported for it
        _log.warning('############ Skip replicon {} ############'.format(
            replicon.name))
        integron_file = ''
        summary_file = ''
    #########################
    # clean temporary files #
    #########################

    if not config.keep_tmp:
        try:
            shutil.rmtree(result_tmp_dir)
        except Exception as err:
            # best effort: a failed cleanup must not discard the results
            _log.warning("Cannot remove temporary results : '{} : {}'".format(
                result_tmp_dir, str(err)))

    return integron_file, summary_file