def test_find_attc_max_linear(self):
        """
        Run find_attc_max with ``circular=False`` on the fixture integron once an
        integrase and two attC sites have been attached to it, and compare the
        returned table with the expected two-row frame.
        """
        # -- integrase attached to the fixture integron
        integrase_fields = {
            'pos_beg': 1545830,
            'pos_end': 1546807,
            'strand': -1,
            'evalue': 1.1e-21,
            'type_elt': 'protein',
            'annotation': 'intI',
            'model': 'intersection_tyr_intI',
            'distance_2attC': np.nan,
        }
        integrase_df = pd.DataFrame(integrase_fields,
                                    index=['OBAL001.B.00005.C001_141'],
                                    columns=self.columns)
        self.integron.integrase = integrase_df.astype(dtype=self.dtype)

        # -- the two attC sites attached to the same integron
        attc_fields = {
            'pos_beg': [1547800, 1548775],
            'pos_end': [1547859, 1548834],
            'strand': [1, 1],
            'evalue': [0.00049, 0.00009],
            'type_elt': ['attC', 'attC'],
            'annotation': ['attC', 'attC'],
            'model': ['attc_4', 'attc_4'],
            'distance_2attC': [np.nan, 916.0],
        }
        attc_df = pd.DataFrame(attc_fields,
                               index=['attc_001', 'attc_002'],
                               columns=self.columns)
        self.integron.attC = attc_df.astype(dtype=self.dtype)

        # -- function under test
        observed = find_attc_max([self.integron],
                                 self.replicon,
                                 self.cfg.distance_threshold,
                                 self.cfg.model_attc_path,
                                 self.cfg.max_attc_size,
                                 self.cfg.min_attc_size,
                                 circular=False,
                                 out_dir=self.tmp_dir)

        # -- expected table
        expected_fields = {
            'Accession_number': ['OBAL001.B.00005.C001', 'OBAL001.B.00005.C001'],
            'cm_attC': ['attC_4', 'attC_4'],
            'cm_debut': [4, 1],
            'cm_fin': [44, 47],
            'pos_beg': [1547800, 1548775],
            'pos_end': [1547859, 1548834],
            'sens': ['+', '+'],
            'evalue': [0.000240, 0.000045],
        }
        expected = pd.DataFrame(expected_fields, index=[0, 1], columns=self.max_cols)
        expected = expected.astype(dtype=self.max_dtype)
        pdt.assert_frame_equal(observed, expected)
    def test_find_attc_max_In0(self):
        """
        Run find_attc_max with ``circular=True`` on an integron to which only an
        integrase is assigned here, and check that the returned table is empty.
        """
        replicon_name = 'ESCO001.B.00018.P002'
        fasta_rel_path = os.path.join('Replicons', replicon_name + '.fst')
        replicon_path = self.find_data(fasta_rel_path)

        # Read the first sequence of the file, declared as circular.
        circ_topology = Topology('circ')
        with FastaIterator(replicon_path) as sequences_db:
            sequences_db.topologies = circ_topology
            replicon = next(sequences_db)

        integron = Integron(replicon, self.cfg)

        integrase_fields = {
            'pos_beg': [90229],
            'pos_end': [91242],
            'strand': -1,
            'evalue': 1.4e-24,
            'type_elt': 'protein',
            'annotation': 'intI',
            'model': 'intersection_tyr_intI',
            'distance_2attC': np.nan,
        }
        integrase_df = pd.DataFrame(integrase_fields,
                                    index=['ESCO001.B.00018.P002_106'],
                                    columns=self.columns)
        integron.integrase = integrase_df.astype(dtype=self.dtype)

        observed = find_attc_max([integron],
                                 replicon,
                                 self.cfg.distance_threshold,
                                 self.cfg.model_attc_path,
                                 self.cfg.max_attc_size,
                                 self.cfg.min_attc_size,
                                 circular=True,
                                 out_dir=self.tmp_dir)

        # An empty frame carrying only the expected columns and dtypes.
        expected = pd.DataFrame(columns=self.max_cols).astype(dtype=self.max_dtype)
        pdt.assert_frame_equal(observed, expected)
    def test_find_attc_max_linear(self):
        """
        Run find_attc_max with ``circular=False`` on the fixture integron once an
        integrase and two attC sites have been attached to it, and compare the
        returned table with the expected two-row frame.
        """
        # Integrase attached to the fixture integron.
        integrase_fields = {
            'pos_beg': 1545830,
            'pos_end': 1546807,
            'strand': -1,
            'evalue': 1.1e-21,
            'type_elt': 'protein',
            'annotation': 'intI',
            'model': 'intersection_tyr_intI',
            'distance_2attC': np.nan,
        }
        integrase_df = pd.DataFrame(integrase_fields,
                                    index=['OBAL001.B.00005.C001_141'],
                                    columns=self.columns)
        self.integron.integrase = integrase_df.astype(dtype=self.dtype)

        # The two attC sites attached to the same integron.
        attc_fields = {
            'pos_beg': [1547800, 1548775],
            'pos_end': [1547859, 1548834],
            'strand': [1, 1],
            'evalue': [0.00049, 0.00009],
            'type_elt': ['attC', 'attC'],
            'annotation': ['attC', 'attC'],
            'model': ['attc_4', 'attc_4'],
            'distance_2attC': [np.nan, 916.0],
        }
        attc_df = pd.DataFrame(attc_fields,
                               index=['attc_001', 'attc_002'],
                               columns=self.columns)
        self.integron.attC = attc_df.astype(dtype=self.dtype)

        observed = find_attc_max([self.integron],
                                 self.replicon,
                                 self.cfg.distance_threshold,
                                 self.cfg.model_attc_path,
                                 self.cfg.max_attc_size,
                                 self.cfg.min_attc_size,
                                 circular=False,
                                 out_dir=self.tmp_dir)

        expected_fields = {
            'Accession_number': ['OBAL001.B.00005.C001', 'OBAL001.B.00005.C001'],
            'cm_attC': ['attC_4', 'attC_4'],
            'cm_debut': [4, 1],
            'cm_fin': [44, 47],
            'pos_beg': [1547800, 1548775],
            'pos_end': [1547859, 1548834],
            'sens': ['+', '+'],
            'evalue': [0.000240, 0.000045],
        }
        expected = pd.DataFrame(expected_fields, index=[0, 1], columns=self.max_cols)
        expected = expected.astype(dtype=self.max_dtype)
        pdt.assert_frame_equal(observed, expected)
    def test_find_attc_max_calin(self):
        """
        Run find_attc_max with ``circular=True`` on the fixture integron after
        assigning it a single attC site, and compare the returned table with the
        expected one-row frame.
        """
        # Only an attC table is assigned to the integron in this test.
        attc_fields = {
            'pos_beg': [421689],
            'pos_end': [421764],
            'strand': [1],
            'evalue': [0.13],
            'type_elt': ['attC'],
            'annotation': ['attC'],
            'model': ['attc_4'],
            'distance_2attC': [np.nan],
        }
        attc_df = pd.DataFrame(attc_fields, index=['attc_001'], columns=self.columns)
        self.integron.attC = attc_df.astype(dtype=self.dtype)

        observed = find_attc_max([self.integron],
                                 self.replicon,
                                 self.cfg.distance_threshold,
                                 self.cfg.model_attc_path,
                                 self.cfg.max_attc_size,
                                 self.cfg.min_attc_size,
                                 circular=True,
                                 out_dir=self.tmp_dir)

        expected_fields = {
            'Accession_number': ['OBAL001.B.00005.C001'],
            'cm_attC': ['attC_4'],
            'cm_debut': [1],
            'cm_fin': [47],
            'pos_beg': [421689],
            'pos_end': [421764],
            'sens': ['+'],
            'evalue': [0.062],
        }
        expected = pd.DataFrame(expected_fields, index=[0], columns=self.max_cols)
        expected = expected.astype(dtype=self.max_dtype)

        pdt.assert_frame_equal(observed, expected)
    def test_find_attc_max_calin(self):
        """
        Run find_attc_max with ``circular=True`` on the fixture integron after
        assigning it a single attC site, and compare the returned table with the
        expected one-row frame.
        """
        # Only an attC table is assigned to the integron in this test.
        attc_fields = {
            'pos_beg': [421689],
            'pos_end': [421764],
            'strand': [1],
            'evalue': [0.13],
            'type_elt': ['attC'],
            'annotation': ['attC'],
            'model': ['attc_4'],
            'distance_2attC': [np.nan],
        }
        attc_df = pd.DataFrame(attc_fields, index=['attc_001'], columns=self.columns)
        self.integron.attC = attc_df.astype(dtype=self.dtype)

        observed = find_attc_max([self.integron],
                                 self.replicon,
                                 self.cfg.distance_threshold,
                                 self.cfg.model_attc_path,
                                 self.cfg.max_attc_size,
                                 self.cfg.min_attc_size,
                                 circular=True,
                                 out_dir=self.tmp_dir)

        expected_fields = {
            'Accession_number': ['OBAL001.B.00005.C001'],
            'cm_attC': ['attC_4'],
            'cm_debut': [1],
            'cm_fin': [47],
            'pos_beg': [421689],
            'pos_end': [421764],
            'sens': ['+'],
            'evalue': [0.062],
        }
        expected = pd.DataFrame(expected_fields, index=[0], columns=self.max_cols)
        expected = expected.astype(dtype=self.max_dtype)

        pdt.assert_frame_equal(observed, expected)
Example #6
0
def find_integron_in_one_replicon(replicon, config):
    """
    Scan a replicon for integrons.

    The search looks for:

      * presence of integrase
      * presence of attC sites
      * presence of promoters and attI sites

    Depending on the configuration:

      * perform functional annotation
      * produce a genbank file with the replicon and its integron annotations
      * produce a schema of the replicon with its integrons (in pdf)

    Unless the replicon is skipped, a file reporting the putative integrons
    is written in ``config.result_dir``.

    :param replicon: the replicon to analyse.
    :type replicon: a :class:`Bio.SeqRecord` object.
    :param config: The configuration
    :type config: a :class:`integron_finder.config.Config` object.
    :returns: the path to the integron file (<replicon_id>.integrons)
              and the summary file (<replicon_id>.summary).
              If there is no integron the summary file is None.
              If the replicon is skipped because an
              :class:`integron_finder.EmptyFileError` was raised,
              both values are empty strings.
    :rtype: tuple (str integron_file, str summary_file) or (str integron_file, None)
    :raises IntegronError: if functional annotation is requested without an explicit
                           profile path and neither 'bank_hmm' nor ``config.func_annot_path`` exists.
    """
    result_tmp_dir = config.tmp_dir(replicon.id)
    try:
        os.mkdir(result_tmp_dir)
    except FileExistsError:
        # The directory already exists (intermediate results found in it are reused below).
        # Any other OSError (permissions, missing parent, ...) is a real problem and propagates.
        pass
    tmp_replicon_path = os.path.join(result_tmp_dir, replicon.id + '.fst')
    SeqIO.write(replicon, tmp_replicon_path, "fasta")
    # create attr path
    # used to generate protein file with prodigal
    replicon.path = tmp_replicon_path

    # func_annot_path is the canonical path for Functional_annotation
    # path_func_annot is the path provided on the command line
    fa_hmm = None  # stays None when functional annotation is not requested
    if config.func_annot and not config.no_proteins and not config.path_func_annot:
        if os.path.exists('bank_hmm'):
            fa_hmm = scan_hmm_bank('bank_hmm')
        elif os.path.exists(config.func_annot_path):
            fa_hmm = scan_hmm_bank(config.func_annot_path)
        else:
            raise IntegronError("the dir '{}' neither 'bank_hmm' exists, specify the location of hmm "
                                "profile with --path-func-annot option".format(config.func_annot_path))
        is_func_annot = True

    elif config.path_func_annot and config.no_proteins is False:
        fa_hmm = scan_hmm_bank(config.path_func_annot)
        is_func_annot = True
    else:
        is_func_annot = False

    if is_func_annot and not fa_hmm:
        _log.warning("No hmm profiles for functional annotation detected, skip functional annotation step.")

    if config.gembase_path:
        protein_db = GembaseDB(replicon, config, gembase_path=config.gembase_path)
    elif config.gembase:
        protein_db = GembaseDB(replicon, config)
    else:
        protein_db = ProdigalDB(replicon, config)

    ##################
    # Default search #
    ##################
    intI_file = os.path.join(result_tmp_dir, replicon.id + "_intI.res")
    phageI_file = os.path.join(result_tmp_dir, replicon.id + "_phage_int.res")
    attC_default_file = os.path.join(result_tmp_dir, replicon.id + "_attc_table.res")

    try:
        if not config.no_proteins:
            # run the integrase search only if one of its result files is missing
            if not os.path.isfile(intI_file) or not os.path.isfile(phageI_file):
                find_integrase(replicon.id, protein_db.protfile, result_tmp_dir, config)
        _log.info("Starting Default search ... :")
        if not os.path.isfile(attC_default_file):
            # find attc with cmsearch
            find_attc(tmp_replicon_path, replicon.name, config.cmsearch, result_tmp_dir, config.model_attc_path,
                      incE=config.evalue_attc,
                      cpu=config.cpu)

        _log.info("Default search done... : ")
        integrons = find_integron(replicon, protein_db, attC_default_file, intI_file, phageI_file, config)

        #########################
        # Search with local_max #
        #########################
        if config.local_max:
            _log.info("Starting search with local_max...:")
            integron_max_pickle = os.path.join(result_tmp_dir, "integron_max.pickle")
            if not os.path.isfile(integron_max_pickle):
                circular = replicon.topology == 'circ'
                integron_max = find_attc_max(integrons, replicon, config.distance_threshold,
                                             config.model_attc_path,
                                             max_attc_size=config.max_attc_size,
                                             min_attc_size=config.min_attc_size,
                                             circular=circular, out_dir=result_tmp_dir,
                                             cpu=config.cpu,
                                             evalue_attc=config.evalue_attc)
                integron_max.to_pickle(integron_max_pickle)
                _log.info("Search with local_max done... :")

            else:
                # NOTE: unpickling can execute arbitrary code. This file is expected to be the one
                # written by the to_pickle call above during a previous run; the temporary directory
                # must not hold untrusted content.
                integron_max = pd.read_pickle(integron_max_pickle)
                # re-apply the current evalue and size thresholds to the cached hits
                attc_size = abs(integron_max.pos_end - integron_max.pos_beg)
                integron_max = integron_max[(integron_max.evalue < config.evalue_attc) &
                                            (attc_size < config.max_attc_size) &
                                            (config.min_attc_size < attc_size)]
                _log.info("Search with local_max was already done, continue... :")

            integrons = find_integron(replicon, protein_db, integron_max, intI_file, phageI_file, config)

        ##########################
        # Add promoters and attI #
        ##########################
        for integron in integrons:
            integron_type = integron.type()
            if integron_type != "In0":  # complete & CALIN
                if not config.no_proteins:
                    _log.info("Adding proteins ... :")
                    integron.add_proteins(protein_db)

            if config.promoter_attI:
                _log.info("Adding promoters and attI ... :")
                if integron_type == "complete":
                    integron.add_promoter()
                    integron.add_attI()
                elif integron_type == "In0":
                    integron.add_attI()
                    integron.add_promoter()
        #########################
        # Functional annotation #
        #########################
        if is_func_annot and fa_hmm:
            _log.info("Starting functional annotation ...:")
            func_annot(integrons, replicon, protein_db, fa_hmm, config, result_tmp_dir)

        #######################
        # Writing out results #
        #######################
        _log.info("Writing out results for replicon {}".format(replicon.id))

        if config.pdf:
            # only complete integrons are drawn; files are numbered by position in the list
            for j, integron in enumerate(integrons, 1):
                if integron.type() == "complete":
                    integron.draw_integron(file=os.path.join(config.result_dir, "{}_{}.pdf".format(replicon.id, j)))

        base_outfile = os.path.join(config.result_dir, replicon.id)
        integron_file = base_outfile + ".integrons"
        _log.debug("Writing integron_file {}".format(integron_file))
        if integrons:
            integrons_report = results.integrons_report(integrons)
            integrons_report.to_csv(integron_file, sep="\t", index=False, na_rep="NA")

            summary = results.summary(integrons_report)
            summary_file = base_outfile + ".summary"
            summary.to_csv(summary_file, sep="\t", na_rep="NA", index=False,
                           columns=['ID_replicon', 'ID_integron', 'complete', 'In0', 'CALIN'])
            if config.gbk:
                add_feature(replicon, integrons_report, protein_db, config.distance_threshold)
                SeqIO.write(replicon, os.path.join(config.result_dir, replicon.id + ".gbk"), "genbank")
        else:
            with open(integron_file, "w") as out_f:
                out_f.write("# No Integron found\n")
            summary_file = None
    except integron_finder.EmptyFileError:
        # the replicon is skipped: no result file is reported for it
        _log.warning('############ Skip replicon {} ############'.format(replicon.name))
        integron_file = ''
        summary_file = ''
    #########################
    # clean temporary files #
    #########################

    if not config.keep_tmp:
        try:
            shutil.rmtree(result_tmp_dir)
        except Exception as err:
            # failing to clean up must not hide the results: only warn
            _log.warning("Cannot remove temporary results : '{} : {}'".format(result_tmp_dir, str(err)))

    return integron_file, summary_file
Example #7
0
def find_integron_in_one_replicon(replicon, config):
    """
    Scan a replicon for integrons.

    The search looks for:

      * presence of integrase
      * presence of attC sites
      * presence of promoters and attI sites

    Depending on the configuration:

      * perform functional annotation
      * produce a genbank file with the replicon and its integron annotations
      * produce a schema of the replicon with its integrons (in pdf)

    Unless the replicon is skipped, a file reporting the putative integrons
    is written in ``config.result_dir``.

    :param replicon: the replicon to analyse.
    :type replicon: a :class:`Bio.SeqRecord` object.
    :param config: The configuration
    :type config: a :class:`integron_finder.config.Config` object.
    :returns: the path to the integron file (<replicon_id>.integrons)
              and the summary file (<replicon_id>.summary).
              If there is no integron the summary file is None.
              If the replicon is skipped because an
              :class:`integron_finder.EmptyFileError` was raised,
              both values are empty strings.
    :rtype: tuple (str integron_file, str summary_file) or (str integron_file, None)
    :raises IntegronError: if functional annotation is requested without an explicit
                           profile path and neither 'bank_hmm' nor ``config.func_annot_path`` exists.
    """
    result_tmp_dir = config.tmp_dir(replicon.id)
    try:
        os.mkdir(result_tmp_dir)
    except FileExistsError:
        # The directory already exists (intermediate results found in it are reused below).
        # Any other OSError (permissions, missing parent, ...) is a real problem and propagates.
        pass
    tmp_replicon_path = os.path.join(result_tmp_dir, replicon.id + '.fst')
    SeqIO.write(replicon, tmp_replicon_path, "fasta")
    # create attr path
    # used to generate protein file with prodigal
    replicon.path = tmp_replicon_path

    # func_annot_path is the canonical path for Functional_annotation
    # path_func_annot is the path provided on the command line
    fa_hmm = None  # stays None when functional annotation is not requested
    if config.func_annot and not config.no_proteins and not config.path_func_annot:
        if os.path.exists('bank_hmm'):
            fa_hmm = scan_hmm_bank('bank_hmm')
        elif os.path.exists(config.func_annot_path):
            fa_hmm = scan_hmm_bank(config.func_annot_path)
        else:
            raise IntegronError(
                "the dir '{}' neither 'bank_hmm' exists, specify the location of hmm "
                "profile with --path-func-annot option".format(
                    config.func_annot_path))
        is_func_annot = True

    elif config.path_func_annot and config.no_proteins is False:
        fa_hmm = scan_hmm_bank(config.path_func_annot)
        is_func_annot = True
    else:
        is_func_annot = False

    if is_func_annot and not fa_hmm:
        _log.warning(
            "No hmm profiles for functional annotation detected, skip functional annotation step."
        )

    if config.gembase_path:
        protein_db = GembaseDB(replicon,
                               config,
                               gembase_path=config.gembase_path)
    elif config.gembase:
        protein_db = GembaseDB(replicon, config)
    else:
        protein_db = ProdigalDB(replicon, config)

    ##################
    # Default search #
    ##################
    intI_file = os.path.join(result_tmp_dir, replicon.id + "_intI.res")
    phageI_file = os.path.join(result_tmp_dir, replicon.id + "_phage_int.res")
    attC_default_file = os.path.join(result_tmp_dir,
                                     replicon.id + "_attc_table.res")

    try:
        if not config.no_proteins:
            # run the integrase search only if one of its result files is missing
            if not os.path.isfile(intI_file) or not os.path.isfile(
                    phageI_file):
                find_integrase(replicon.id, protein_db.protfile,
                               result_tmp_dir, config)
        _log.info("Starting Default search ... :")
        if not os.path.isfile(attC_default_file):
            # find attc with cmsearch
            find_attc(tmp_replicon_path,
                      replicon.name,
                      config.cmsearch,
                      result_tmp_dir,
                      config.model_attc_path,
                      incE=config.evalue_attc,
                      cpu=config.cpu)

        _log.info("Default search done... : ")
        integrons = find_integron(replicon, protein_db, attC_default_file,
                                  intI_file, phageI_file, config)

        #########################
        # Search with local_max #
        #########################
        if config.local_max:
            _log.info("Starting search with local_max...:")
            integron_max_pickle = os.path.join(result_tmp_dir,
                                               "integron_max.pickle")
            if not os.path.isfile(integron_max_pickle):
                circular = replicon.topology == 'circ'
                integron_max = find_attc_max(
                    integrons,
                    replicon,
                    config.distance_threshold,
                    config.model_attc_path,
                    max_attc_size=config.max_attc_size,
                    min_attc_size=config.min_attc_size,
                    circular=circular,
                    out_dir=result_tmp_dir,
                    cpu=config.cpu,
                    evalue_attc=config.evalue_attc)
                integron_max.to_pickle(integron_max_pickle)
                _log.info("Search with local_max done... :")

            else:
                # NOTE: unpickling can execute arbitrary code. This file is expected to be
                # the one written by the to_pickle call above during a previous run; the
                # temporary directory must not hold untrusted content.
                integron_max = pd.read_pickle(integron_max_pickle)
                # re-apply the current evalue and size thresholds to the cached hits
                attc_size = abs(integron_max.pos_end - integron_max.pos_beg)
                integron_max = integron_max[
                    (integron_max.evalue < config.evalue_attc)
                    & (attc_size < config.max_attc_size)
                    & (config.min_attc_size < attc_size)]
                _log.info(
                    "Search with local_max was already done, continue... :")

            integrons = find_integron(replicon, protein_db, integron_max,
                                      intI_file, phageI_file, config)

        ##########################
        # Add promoters and attI #
        ##########################
        for integron in integrons:
            integron_type = integron.type()
            if integron_type != "In0":  # complete & CALIN
                if not config.no_proteins:
                    _log.info("Adding proteins ... :")
                    integron.add_proteins(protein_db)

            if config.promoter_attI:
                _log.info("Adding promoters and attI ... :")
                if integron_type == "complete":
                    integron.add_promoter()
                    integron.add_attI()
                elif integron_type == "In0":
                    integron.add_attI()
                    integron.add_promoter()
        #########################
        # Functional annotation #
        #########################
        if is_func_annot and fa_hmm:
            _log.info("Starting functional annotation ...:")
            func_annot(integrons, replicon, protein_db, fa_hmm, config,
                       result_tmp_dir)

        #######################
        # Writing out results #
        #######################
        _log.info("Writing out results for replicon {}".format(replicon.id))

        if config.pdf:
            # only complete integrons are drawn; files are numbered by position in the list
            for j, integron in enumerate(integrons, 1):
                if integron.type() == "complete":
                    integron.draw_integron(file=os.path.join(
                        config.result_dir, "{}_{}.pdf".format(replicon.id, j)))

        base_outfile = os.path.join(config.result_dir, replicon.id)
        integron_file = base_outfile + ".integrons"
        _log.debug("Writing integron_file {}".format(integron_file))
        if integrons:
            integrons_report = results.integrons_report(integrons)
            integrons_report.to_csv(integron_file,
                                    sep="\t",
                                    index=False,
                                    na_rep="NA")

            summary = results.summary(integrons_report)
            summary_file = base_outfile + ".summary"
            summary.to_csv(summary_file,
                           sep="\t",
                           na_rep="NA",
                           index=False,
                           columns=[
                               'ID_replicon', 'ID_integron', 'complete', 'In0',
                               'CALIN'
                           ])
            if config.gbk:
                add_feature(replicon, integrons_report, protein_db,
                            config.distance_threshold)
                SeqIO.write(
                    replicon,
                    os.path.join(config.result_dir, replicon.id + ".gbk"),
                    "genbank")
        else:
            with open(integron_file, "w") as out_f:
                out_f.write("# No Integron found\n")
            summary_file = None
    except integron_finder.EmptyFileError:
        # the replicon is skipped: no result file is reported for it
        _log.warning('############ Skip replicon {} ############'.format(
            replicon.name))
        integron_file = ''
        summary_file = ''
    #########################
    # clean temporary files #
    #########################

    if not config.keep_tmp:
        try:
            shutil.rmtree(result_tmp_dir)
        except Exception as err:
            # failing to clean up must not hide the results: only warn
            _log.warning("Cannot remove temporary results : '{} : {}'".format(
                result_tmp_dir, str(err)))

    return integron_file, summary_file