Ejemplo n.º 1
0
    def batch_tag(self, sentences):
        """
        Tag a batch of sentences by writing them to a temporary test
        file, running Mallet on that file, and pairing the labels that
        Mallet produced with the original sentences.

        @param sentences: The sentences to tag.  Each sentence is
            zipped element-by-element with its label sequence.
        @return: A list containing, for each (sentence, label
            sequence) pair, a list of (element, label) tuples.
        """
        # Create the temporary file that will hold the test corpus.
        (fd, test_file) = mkstemp('.txt', 'test')

        try:
            # Write the corpus *inside* the try block, so that the
            # temporary file is still removed if writing fails.
            stream = os.fdopen(fd, 'w')
            try:
                self.write_test_corpus(sentences, stream)
            finally:
                # Make sure everything is flushed to disk before mallet
                # reads the file.  Closing a file object that is
                # already closed is a no-op, so this is safe even if
                # write_test_corpus closes the stream itself.
                stream.close()

            # Run mallet on the test file.
            stdout, stderr = call_mallet([
                self._RUN_CRF,
                '--model-file', os.path.abspath(self.crf_info.model_filename),
                '--test-file', test_file,
            ], stdout='pipe')

            # Decode the output
            labels = self.parse_mallet_output(stdout)

            # strip __start__ and __end__
            if self.crf_info.add_start_state and self.crf_info.add_end_state:
                labels = [labs[1:-1] for labs in labels]
            elif self.crf_info.add_start_state:
                labels = [labs[1:] for labs in labels]
            elif self.crf_info.add_end_state:
                labels = [labs[:-1] for labs in labels]

            # Combine the labels and the original sentences.  The inner
            # zip is wrapped in list() so that callers always receive
            # real lists, even where zip() returns a one-shot iterator.
            return [
                list(zip(sent, label))
                for (sent, label) in zip(sentences, labels)
            ]

        finally:
            os.remove(test_file)
Ejemplo n.º 2
0
    def batch_tag(self, sentences):
        """
        Run Mallet over C{sentences} and return them with their
        predicted labels attached.

        The sentences are written to a temporary file, which is handed
        to Mallet via C{--test-file} and removed again before this
        method returns (whether or not tagging succeeded).

        @param sentences: The sentences to tag.
        @return: A list with one entry per (sentence, label sequence)
            pair; each entry is a list of (element, label) tuples.
        """
        (fd, test_file) = mkstemp('.txt', 'test')

        try:
            # Write the test corpus to the temporary file.  This is
            # done inside the try block so that a failure while
            # writing does not leave the temporary file behind.
            corpus_stream = os.fdopen(fd, 'w')
            try:
                self.write_test_corpus(sentences, corpus_stream)
            finally:
                # Guarantee the data is flushed and the handle released
                # before mallet opens the file and before it is
                # removed.  close() on an already-closed file is a
                # no-op.
                corpus_stream.close()

            # Run mallet on the test file.
            model_path = os.path.abspath(self.crf_info.model_filename)
            cmd = [self._RUN_CRF,
                   '--model-file', model_path,
                   '--test-file', test_file]
            stdout, stderr = call_mallet(cmd, stdout='pipe')

            # Decode the output
            labels = self.parse_mallet_output(stdout)

            # Strip the labels for the artificial __start__ and
            # __end__ states, if the model was built with them.
            if self.crf_info.add_start_state and self.crf_info.add_end_state:
                labels = [labs[1:-1] for labs in labels]
            elif self.crf_info.add_start_state:
                labels = [labs[1:] for labs in labels]
            elif self.crf_info.add_end_state:
                labels = [labs[:-1] for labs in labels]

            # Combine the labels and the original sentences.  list()
            # keeps the result a list of lists even where zip() yields
            # a lazy iterator.
            return [list(zip(sent, label)) for (sent, label) in
                    zip(sentences, labels)]

        finally:
            os.remove(test_file)
Ejemplo n.º 3
0
    def train(cls,
              feature_detector,
              corpus,
              filename=None,
              weight_groups=None,
              gaussian_variance=1,
              default_label='O',
              transduction_type='VITERBI',
              max_iterations=500,
              add_start_state=True,
              add_end_state=True,
              trace=1):
        """
        Train a new linear chain CRF tagger based on the given corpus
        of training sequences.  This tagger will be backed by a I{crf
        model file}, containing both a serialized Mallet model and
        information about the CRF's structure.  This crf model file
        will I{not} be automatically deleted -- if you wish to delete
        it, you must delete it manually.  The filename of the model
        file for a MalletCRF C{crf} is available as C{crf.filename}.

        @param feature_detector: Passed through unchanged to
            C{MalletCRF._build_crf_info} and to the C{MalletCRF}
            constructor.  Presumably the callable used to extract
            features from tokens -- confirm against C{MalletCRF}.

        @type corpus: C{list} of C{tuple}
        @param corpus: Training data, represented as a list of
            sentences, where each sentence is a list of (token, tag)
            tuples.

        @type filename: C{str}
        @param filename: The filename that should be used for the crf
            model file that backs the new C{MalletCRF}.  If no
            filename is given, then a new filename will be chosen
            automatically.  If the name does not end with C{'.crf'},
            that extension is appended.

        @type weight_groups: C{list} of L{CRFInfo.WeightGroup}
        @param weight_groups: Specifies how input-features should
            be mapped to joint-features.  See L{CRFInfo.WeightGroup}
            for more information.

        @type gaussian_variance: C{float}
        @param gaussian_variance: The gaussian variance of the prior
            that should be used to train the new CRF.

        @type default_label: C{str}
        @param default_label: The "label for initial context and
            uninteresting tokens" (from Mallet's SimpleTagger.java.)
            It's unclear whether this currently has any effect.

        @type transduction_type: C{str}
        @param transduction_type: The type of transduction used by
            the CRF.  Can be VITERBI, VITERBI_FBEAM, VITERBI_BBEAM,
            VITERBI_FBBEAM, or VITERBI_FBEAMKL.

        @type max_iterations: C{int}
        @param max_iterations: The maximum number of iterations that
            should be used for training the CRF.

        @type add_start_state: C{bool}
        @param add_start_state: If true, then NLTK will add a special
            start state, named C{'__start__'}.  The initial cost for
            the start state will be set to 0; and the initial cost for
            all other states will be set to +inf.

        @type add_end_state: C{bool}
        @param add_end_state: If true, then NLTK will add a special
            end state, named C{'__end__'}.  The final cost for the end
            state will be set to 0; and the final cost for all other
            states will be set to +inf.

        @type trace: C{int}
        @param trace: Controls the verbosity of trace output generated
            while training the CRF.  Higher numbers generate more verbose
            output.  Values C{>= 1} print progress messages, values
            C{>= 2} additionally report each step and the total
            training time, and values C{> 3} run Mallet without
            passing its output through C{_filter_training_output}.

        @rtype: C{MalletCRF}
        @return: The newly created tagger, backed by the model file
            C{filename}.
        """
        t0 = time.time()  # Record starting time.

        # If they did not supply a model filename, then choose one.
        if filename is None:
            (fd, filename) = mkstemp('.crf', 'model')
            # Only the name is needed; release the descriptor at once.
            os.close(fd)

        # Ensure that the filename ends with '.crf'
        if not filename.endswith('.crf'):
            filename += '.crf'

        if trace >= 1:
            print('[MalletCRF] Training a new CRF: %s' % filename)

        # Create crf-info object describing the new CRF.
        crf_info = MalletCRF._build_crf_info(corpus, gaussian_variance,
                                             default_label, max_iterations,
                                             transduction_type, weight_groups,
                                             add_start_state, add_end_state,
                                             filename, feature_detector)

        # Create a zipfile, and write crf-info to it.
        if trace >= 2:
            print('[MalletCRF] Adding crf-info.xml to %s' % filename)
        zf = zipfile.ZipFile(filename, mode='w')
        try:
            zf.writestr('crf-info.xml', crf_info.toxml() + '\n')
        finally:
            # Always release the archive handle, even if serializing
            # or writing the crf-info fails.
            zf.close()

        # Create the CRF object.
        crf = MalletCRF(filename, feature_detector)

        # Write the Training corpus to a temporary file.
        if trace >= 2:
            print('[MalletCRF] Writing training corpus...')
        (fd, train_file) = mkstemp('.txt', 'train')

        try:
            # The corpus is written inside the try block so that the
            # temporary file is removed even if writing fails.
            stream = os.fdopen(fd, 'w')
            try:
                crf.write_training_corpus(corpus, stream)
            finally:
                # Flush and release the handle before mallet reads the
                # file.  close() on an already-closed file is a no-op,
                # so this is safe even if write_training_corpus closes
                # the stream itself.
                stream.close()

            if trace >= 1:
                print('[MalletCRF] Calling mallet to train CRF...')
            cmd = [
                MalletCRF._TRAIN_CRF, '--model-file',
                os.path.abspath(filename), '--train-file', train_file
            ]
            if trace > 3:
                call_mallet(cmd)
            else:
                # Capture mallet's combined stdout/stderr and let
                # _filter_training_output decide what to show.
                p = call_mallet(cmd,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                blocking=False)
                MalletCRF._filter_training_output(p, trace)
        finally:
            # Delete the temp file containing the training corpus.
            os.remove(train_file)

        if trace >= 1:
            print('[MalletCRF] Training complete.')
            print('[MalletCRF]   Model stored in: %s' % filename)
        if trace >= 2:
            dt = time.time() - t0
            print('[MalletCRF]   Total training time: %d seconds' % dt)

        # Return the completed CRF.
        return crf
Ejemplo n.º 4
0
    def train(cls, feature_detector, corpus, filename=None,
              weight_groups=None, gaussian_variance=1, default_label='O',
              transduction_type='VITERBI', max_iterations=500,
              add_start_state=True, add_end_state=True, trace=1):
        """
        Train a new linear chain CRF tagger based on the given corpus
        of training sequences.  This tagger will be backed by a I{crf
        model file}, containing both a serialized Mallet model and
        information about the CRF's structure.  This crf model file
        will I{not} be automatically deleted -- if you wish to delete
        it, you must delete it manually.  The filename of the model
        file for a MalletCRF C{crf} is available as C{crf.filename}.

        @param feature_detector: Handed on, unmodified, to
            C{MalletCRF._build_crf_info} and to the C{MalletCRF}
            constructor.  Presumably the feature-extraction callable
            for tokens -- verify against C{MalletCRF}.

        @type corpus: C{list} of C{tuple}
        @param corpus: Training data, represented as a list of
            sentences, where each sentence is a list of (token, tag)
            tuples.

        @type filename: C{str}
        @param filename: The filename that should be used for the crf
            model file that backs the new C{MalletCRF}.  If no
            filename is given, then a new filename will be chosen
            automatically.  A C{'.crf'} extension is appended when
            the given name does not already end with it.

        @type weight_groups: C{list} of L{CRFInfo.WeightGroup}
        @param weight_groups: Specifies how input-features should
            be mapped to joint-features.  See L{CRFInfo.WeightGroup}
            for more information.

        @type gaussian_variance: C{float}
        @param gaussian_variance: The gaussian variance of the prior
            that should be used to train the new CRF.

        @type default_label: C{str}
        @param default_label: The "label for initial context and
            uninteresting tokens" (from Mallet's SimpleTagger.java.)
            It's unclear whether this currently has any effect.

        @type transduction_type: C{str}
        @param transduction_type: The type of transduction used by
            the CRF.  Can be VITERBI, VITERBI_FBEAM, VITERBI_BBEAM,
            VITERBI_FBBEAM, or VITERBI_FBEAMKL.

        @type max_iterations: C{int}
        @param max_iterations: The maximum number of iterations that
            should be used for training the CRF.

        @type add_start_state: C{bool}
        @param add_start_state: If true, then NLTK will add a special
            start state, named C{'__start__'}.  The initial cost for
            the start state will be set to 0; and the initial cost for
            all other states will be set to +inf.

        @type add_end_state: C{bool}
        @param add_end_state: If true, then NLTK will add a special
            end state, named C{'__end__'}.  The final cost for the end
            state will be set to 0; and the final cost for all other
            states will be set to +inf.

        @type trace: C{int}
        @param trace: Controls the verbosity of trace output generated
            while training the CRF.  Higher numbers generate more verbose
            output: C{>= 1} prints progress messages, C{>= 2} also
            reports individual steps and the total training time, and
            C{> 3} runs Mallet without sending its output through
            C{_filter_training_output}.

        @rtype: C{MalletCRF}
        @return: The tagger that was just trained, backed by the model
            file C{filename}.
        """
        t0 = time.time() # Record starting time.

        # If they did not supply a model filename, then choose one.
        if filename is None:
            (fd, filename) = mkstemp('.crf', 'model')
            # We only want the name; close the descriptor right away.
            os.close(fd)

        # Ensure that the filename ends with '.crf'
        if not filename.endswith('.crf'):
            filename += '.crf'

        if trace >= 1:
            print('[MalletCRF] Training a new CRF: %s' % filename)

        # Create crf-info object describing the new CRF.
        crf_info = MalletCRF._build_crf_info(
            corpus, gaussian_variance, default_label, max_iterations,
            transduction_type, weight_groups, add_start_state,
            add_end_state, filename, feature_detector)

        # Create a zipfile, and write crf-info to it.
        if trace >= 2:
            print('[MalletCRF] Adding crf-info.xml to %s' % filename)
        zf = zipfile.ZipFile(filename, mode='w')
        try:
            zf.writestr('crf-info.xml', crf_info.toxml()+'\n')
        finally:
            # Close the archive even when writing crf-info fails, so
            # the file handle is never left dangling.
            zf.close()

        # Create the CRF object.
        crf = MalletCRF(filename, feature_detector)

        # Write the Training corpus to a temporary file.
        if trace >= 2:
            print('[MalletCRF] Writing training corpus...')
        (fd, train_file) = mkstemp('.txt', 'train')

        try:
            # Writing happens under the try/finally below so that a
            # failure here still removes the temporary file.
            train_stream = os.fdopen(fd, 'w')
            try:
                crf.write_training_corpus(corpus, train_stream)
            finally:
                # Ensure the corpus is flushed and the handle released
                # before mallet runs.  Closing twice is harmless, so it
                # does not matter whether write_training_corpus already
                # closed the stream.
                train_stream.close()

            if trace >= 1:
                print('[MalletCRF] Calling mallet to train CRF...')
            cmd = [MalletCRF._TRAIN_CRF,
                   '--model-file', os.path.abspath(filename),
                   '--train-file', train_file]
            if trace > 3:
                call_mallet(cmd)
            else:
                # Merge stderr into stdout and hand the process to
                # _filter_training_output, which decides what to echo.
                p = call_mallet(cmd, stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                blocking=False)
                MalletCRF._filter_training_output(p, trace)
        finally:
            # Delete the temp file containing the training corpus.
            os.remove(train_file)

        if trace >= 1:
            print('[MalletCRF] Training complete.')
            print('[MalletCRF]   Model stored in: %s' % filename)
        if trace >= 2:
            dt = time.time()-t0
            print('[MalletCRF]   Total training time: %d seconds' % dt)

        # Return the completed CRF.
        return crf