Ejemplo n.º 1
0
    def batch_tag(self, sentences):
        # Write the test corpus to a temporary file
        (fd, test_file) = mkstemp('.txt', 'test')
        self.write_test_corpus(sentences, os.fdopen(fd, 'w'))

        try:
            # Run mallet on the test file.
            stdout, stderr = call_mallet([self._RUN_CRF,
                '--model-file', os.path.abspath(self.crf_info.model_filename),
                '--test-file', test_file], stdout='pipe')

            # Decode the output
            labels = self.parse_mallet_output(stdout)

            # strip __start__ and __end__
            if self.crf_info.add_start_state and self.crf_info.add_end_state:
                labels = [labs[1:-1] for labs in labels]
            elif self.crf_info.add_start_state:
                labels = [labs[1:] for labs in labels]
            elif self.crf_info.add_end_state:
                labels = [labs[:-1] for labs in labels]

            # Combine the labels and the original sentences.
            return [zip(sent, label) for (sent,label) in
                    zip(sentences, labels)]

        finally:
            os.remove(test_file)
Ejemplo n.º 2
0
    def batch_tag(self, sentences):
        # Write the test corpus to a temporary file
        (fd, test_file) = mkstemp('.txt', 'test')
        self.write_test_corpus(sentences, os.fdopen(fd, 'w'))
        
        try:
            # Run mallet on the test file.
            stdout, stderr = call_mallet([self._RUN_CRF,
                '--model-file', os.path.abspath(self.crf_info.model_filename),
                '--test-file', test_file], stdout='pipe')

            # Decode the output
            labels = self.parse_mallet_output(stdout)

            # strip __start__ and __end__
            if self.crf_info.add_start_state and self.crf_info.add_end_state:
                labels = [labs[1:-1] for labs in labels]
            elif self.crf_info.add_start_state:
                labels = [labs[1:] for labs in labels]
            elif self.crf_info.add_end_state:
                labels = [labs[:-1] for labs in labels]

            # Combine the labels and the original sentences.
            return [zip(sent, label) for (sent,label) in
                    zip(sentences, labels)]
            
        finally:
            os.remove(test_file)
Ejemplo n.º 3
0
    def train(cls, feature_detector, corpus, filename=None,
              weight_groups=None, gaussian_variance=1, default_label='O',
              transduction_type='VITERBI', max_iterations=500,
              add_start_state=True, add_end_state=True, trace=1):
        """
        Train a new linear chain CRF tagger based on the given corpus
        of training sequences.  This tagger will be backed by a crf
        model file, containing both a serialized Mallet model and
        information about the CRF's structure.  This crf model file
        will not be automatically deleted -- if you wish to delete
        it, you must delete it manually.  The filename of the model
        file for a MalletCRF crf is available as ``crf.filename()``.

        :type corpus: list(tuple(str, str))
        :param corpus: Training data, represented as a list of
            sentences, where each sentence is a list of (token, tag) tuples.
        :type filename: str
        :param filename: The filename that should be used for the crf
            model file that backs the new MalletCRF.  If no
            filename is given, then a new filename will be chosen
            automatically.
        :type weight_groups: list(CRFInfo.WeightGroup)
        :param weight_groups: Specifies how input-features should
            be mapped to joint-features.  See CRFInfo.WeightGroup
            for more information.
        :type gaussian_variance: float
        :param gaussian_variance: The gaussian variance of the prior
            that should be used to train the new CRF.
        :type default_label: str
        :param default_label: The "label for initial context and
            uninteresting tokens" (from Mallet's SimpleTagger.java.)
            It's unclear whether this currently has any effect.
        :type transduction_type: str
        :param transduction_type: The type of transduction used by
            the CRF.  Can be VITERBI, VITERBI_FBEAM, VITERBI_BBEAM,
            VITERBI_FBBEAM, or VITERBI_FBEAMKL.
        :type max_iterations: int
        :param max_iterations: The maximum number of iterations that
            should be used for training the CRF.
        :type add_start_state: bool
        :param add_start_state: If true, then NLTK will add a special
            start state, named '__start__'.  The initial cost for
            the start state will be set to 0; and the initial cost for
            all other states will be set to +inf.
        :type add_end_state: bool
        :param add_end_state: If true, then NLTK will add a special
            end state, named '__end__'.  The final cost for the end
            state will be set to 0; and the final cost for all other
            states will be set to +inf.
        :type trace: int
        :param trace: Controls the verbosity of trace output generated
            while training the CRF.  Higher numbers generate more verbose
            output.
        """
        t0 = time.time() # Record starting time.

        # If they did not supply a model filename, then choose one.
        if filename is None:
            (fd, filename) = mkstemp('.crf', 'model')
            os.fdopen(fd).close()

        # Ensure that the filename ends with '.zip'
        if not filename.endswith('.crf'):
            filename += '.crf'

        if trace >= 1:
            print('[MalletCRF] Training a new CRF: %s' % filename)

        # Create crf-info object describing the new CRF.
        crf_info = MalletCRF._build_crf_info(
            corpus, gaussian_variance, default_label, max_iterations,
            transduction_type, weight_groups, add_start_state,
            add_end_state, filename, feature_detector)

        # Create a zipfile, and write crf-info to it.
        if trace >= 2:
            print('[MalletCRF] Adding crf-info.xml to %s' % filename)
        zf = zipfile.ZipFile(filename, mode='w')
        zf.writestr('crf-info.xml', crf_info.toxml()+'\n')
        zf.close()

        # Create the CRF object.
        crf = MalletCRF(filename, feature_detector)

        # Write the Training corpus to a temporary file.
        if trace >= 2:
            print('[MalletCRF] Writing training corpus...')
        (fd, train_file) = mkstemp('.txt', 'train')
        crf.write_training_corpus(corpus, os.fdopen(fd, 'w'))

        try:
            if trace >= 1:
                print('[MalletCRF] Calling mallet to train CRF...')
            cmd = [MalletCRF._TRAIN_CRF,
                   '--model-file', os.path.abspath(filename),
                   '--train-file', train_file]
            if trace > 3:
                call_mallet(cmd)
            else:
                p = call_mallet(cmd, stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                blocking=False)
                MalletCRF._filter_training_output(p, trace)
        finally:
            # Delete the temp file containing the training corpus.
            os.remove(train_file)

        if trace >= 1:
            print('[MalletCRF] Training complete.')
            print('[MalletCRF]   Model stored in: %s' % filename)
        if trace >= 2:
            dt = time.time()-t0
            print('[MalletCRF]   Total training time: %d seconds' % dt)

        # Return the completed CRF.
        return crf
Ejemplo n.º 4
0
    def train(cls, feature_detector, corpus, filename=None,
              weight_groups=None, gaussian_variance=1, default_label='O',
              transduction_type='VITERBI', max_iterations=500,
              add_start_state=True, add_end_state=True, trace=1):
        """
        Train a new linear chain CRF tagger based on the given corpus
        of training sequences.  This tagger will be backed by a crf
        model file, containing both a serialized Mallet model and
        information about the CRF's structure.  This crf model file
        will not be automatically deleted -- if you wish to delete
        it, you must delete it manually.  The filename of the model
        file for a MalletCRF crf is available as ``crf.filename()``.

        :type corpus: list(tuple(str, str))
        :param corpus: Training data, represented as a list of
            sentences, where each sentence is a list of (token, tag) tuples.
        :type filename: str
        :param filename: The filename that should be used for the crf
            model file that backs the new MalletCRF.  If no
            filename is given, then a new filename will be chosen
            automatically.
        :type weight_groups: list(CRFInfo.WeightGroup)
        :param weight_groups: Specifies how input-features should
            be mapped to joint-features.  See CRFInfo.WeightGroup
            for more information.
        :type gaussian_variance: float
        :param gaussian_variance: The gaussian variance of the prior
            that should be used to train the new CRF.
        :type default_label: str
        :param default_label: The "label for initial context and
            uninteresting tokens" (from Mallet's SimpleTagger.java.)
            It's unclear whether this currently has any effect.
        :type transduction_type: str
        :param transduction_type: The type of transduction used by
            the CRF.  Can be VITERBI, VITERBI_FBEAM, VITERBI_BBEAM,
            VITERBI_FBBEAM, or VITERBI_FBEAMKL.
        :type max_iterations: int
        :param max_iterations: The maximum number of iterations that
            should be used for training the CRF.
        :type add_start_state: bool
        :param add_start_state: If true, then NLTK will add a special
            start state, named '__start__'.  The initial cost for
            the start state will be set to 0; and the initial cost for
            all other states will be set to +inf.
        :type add_end_state: bool
        :param add_end_state: If true, then NLTK will add a special
            end state, named '__end__'.  The final cost for the end
            state will be set to 0; and the final cost for all other
            states will be set to +inf.
        :type trace: int
        :param trace: Controls the verbosity of trace output generated
            while training the CRF.  Higher numbers generate more verbose
            output.
        """
        t0 = time.time() # Record starting time.

        # If they did not supply a model filename, then choose one.
        if filename is None:
            (fd, filename) = mkstemp('.crf', 'model')
            os.fdopen(fd).close()

        # Ensure that the filename ends with '.zip'
        if not filename.endswith('.crf'):
            filename += '.crf'
        
        if trace >= 1:
            print '[MalletCRF] Training a new CRF: %s' % filename
                
        # Create crf-info object describing the new CRF.
        crf_info = MalletCRF._build_crf_info(
            corpus, gaussian_variance, default_label, max_iterations,
            transduction_type, weight_groups, add_start_state,
            add_end_state, filename, feature_detector)
        
        # Create a zipfile, and write crf-info to it.
        if trace >= 2:
            print '[MalletCRF] Adding crf-info.xml to %s' % filename
        zf = zipfile.ZipFile(filename, mode='w')
        zf.writestr('crf-info.xml', crf_info.toxml()+'\n')
        zf.close()
        
        # Create the CRF object.
        crf = MalletCRF(filename, feature_detector)

        # Write the Training corpus to a temporary file.
        if trace >= 2:
            print '[MalletCRF] Writing training corpus...'
        (fd, train_file) = mkstemp('.txt', 'train')
        crf.write_training_corpus(corpus, os.fdopen(fd, 'w'))

        try:
            if trace >= 1:
                print '[MalletCRF] Calling mallet to train CRF...'
            cmd = [MalletCRF._TRAIN_CRF,
                   '--model-file', os.path.abspath(filename),
                   '--train-file', train_file]
            if trace > 3:
                call_mallet(cmd)
            else:
                p = call_mallet(cmd, stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                blocking=False)
                MalletCRF._filter_training_output(p, trace)
        finally:
            # Delete the temp file containing the training corpus.
            os.remove(train_file)
            
        if trace >= 1:
            print '[MalletCRF] Training complete.'
            print '[MalletCRF]   Model stored in: %s' % filename
        if trace >= 2:
            dt = time.time()-t0
            print '[MalletCRF]   Total training time: %d seconds' % dt
                
        # Return the completed CRF.
        return crf