def batch_tag(self, sentences):
    """
    Tag a batch of sentences by running mallet on a temporary test file.

    The sentences are written to a temporary file with
    C{self.write_test_corpus}, mallet is invoked on that file through
    C{call_mallet}, and the labels parsed from its output are paired
    with the items of the corresponding input sentence.  The temporary
    file is always removed, even if writing or tagging fails.

    @param sentences: The sentences to tag.  Each sentence must be
        iterable, since its items are zipped with the predicted labels.
    @return: A list with one entry per sentence; each entry is a list
        of C{(item, label)} pairs.  Pairing uses C{zip}, so an entry is
        truncated to the shorter of the sentence and its label sequence.
    """
    # Write the test corpus to a temporary file.
    (fd, test_file) = mkstemp('.txt', 'test')
    try:
        stream = os.fdopen(fd, 'w')
        try:
            self.write_test_corpus(sentences, stream)
        finally:
            # Make sure the data is flushed and the handle released
            # before mallet reads the file.  Closing an already-closed
            # file object is a no-op, so this is safe even if the
            # writer closes the stream itself.
            stream.close()

        # Run mallet on the test file.
        stdout, _stderr = call_mallet(
            [self._RUN_CRF,
             '--model-file', os.path.abspath(self.crf_info.model_filename),
             '--test-file', test_file],
            stdout='pipe')

        # Decode the output.
        labels = self.parse_mallet_output(stdout)

        # Strip the labels of the artificial __start__ / __end__ states,
        # when the model was built with them.
        first = 1 if self.crf_info.add_start_state else 0
        last = -1 if self.crf_info.add_end_state else None
        if first or last:
            labels = [labs[first:last] for labs in labels]

        # Combine the labels and the original sentences.
        return [list(zip(sent, label))
                for (sent, label) in zip(sentences, labels)]
    finally:
        os.remove(test_file)
def batch_tag(self, sentences):
    """
    Tag a batch of sentences by running mallet on a temporary test file.

    The sentences are written to a temporary file with
    C{self.write_test_corpus}, mallet is invoked on that file through
    C{call_mallet}, and the labels parsed from its output are paired
    with the items of the corresponding input sentence.  The temporary
    file is always removed, even if writing or tagging fails.

    @param sentences: The sentences to tag.  Each sentence must be
        iterable, since its items are zipped with the predicted labels.
    @return: A list with one entry per sentence; each entry is a list
        of C{(item, label)} pairs.  Pairing uses C{zip}, so an entry is
        truncated to the shorter of the sentence and its label sequence.
    """
    # Write the test corpus to a temporary file.
    (fd, test_file) = mkstemp('.txt', 'test')
    try:
        stream = os.fdopen(fd, 'w')
        try:
            self.write_test_corpus(sentences, stream)
        finally:
            # Make sure the data is flushed and the handle released
            # before mallet reads the file.  Closing an already-closed
            # file object is a no-op, so this is safe even if the
            # writer closes the stream itself.
            stream.close()

        # Run mallet on the test file.
        stdout, _stderr = call_mallet(
            [self._RUN_CRF,
             '--model-file', os.path.abspath(self.crf_info.model_filename),
             '--test-file', test_file],
            stdout='pipe')

        # Decode the output.
        labels = self.parse_mallet_output(stdout)

        # Strip the labels of the artificial __start__ / __end__ states,
        # when the model was built with them.
        first = 1 if self.crf_info.add_start_state else 0
        last = -1 if self.crf_info.add_end_state else None
        if first or last:
            labels = [labs[first:last] for labs in labels]

        # Combine the labels and the original sentences.
        return [list(zip(sent, label))
                for (sent, label) in zip(sentences, labels)]
    finally:
        os.remove(test_file)
def train(cls, feature_detector, corpus, filename=None,
          weight_groups=None, gaussian_variance=1, default_label='O',
          transduction_type='VITERBI', max_iterations=500,
          add_start_state=True, add_end_state=True, trace=1):
    """
    Train a new linear chain CRF tagger based on the given corpus
    of training sequences.  This tagger will be backed by a I{crf
    model file}, containing both a serialized Mallet model and
    information about the CRF's structure.  This crf model file
    will I{not} be automatically deleted -- if you wish to delete
    it, you must delete it manually.  The filename of the model
    file for a MalletCRF C{crf} is available as C{crf.filename}.

    @param feature_detector: The feature detector for the new CRF.
        It is passed unchanged to C{MalletCRF._build_crf_info} and
        to the C{MalletCRF} constructor.

    @type corpus: C{list} of C{tuple}
    @param corpus: Training data, represented as a list of
        sentences, where each sentence is a list of (token, tag)
        tuples.

    @type filename: C{str}
    @param filename: The filename that should be used for the crf
        model file that backs the new C{MalletCRF}.  If no
        filename is given, then a new filename will be chosen
        automatically.  If the given filename does not end in
        C{'.crf'}, that suffix is appended.

    @type weight_groups: C{list} of L{CRFInfo.WeightGroup}
    @param weight_groups: Specifies how input-features should
        be mapped to joint-features.  See L{CRFInfo.WeightGroup}
        for more information.

    @type gaussian_variance: C{float}
    @param gaussian_variance: The gaussian variance of the prior
        that should be used to train the new CRF.

    @type default_label: C{str}
    @param default_label: The "label for initial context and
        uninteresting tokens" (from Mallet's SimpleTagger.java.)
        It's unclear whether this currently has any effect.

    @type transduction_type: C{str}
    @param transduction_type: The type of transduction used by
        the CRF.  Can be VITERBI, VITERBI_FBEAM, VITERBI_BBEAM,
        VITERBI_FBBEAM, or VITERBI_FBEAMKL.

    @type max_iterations: C{int}
    @param max_iterations: The maximum number of iterations that
        should be used for training the CRF.

    @type add_start_state: C{bool}
    @param add_start_state: If true, then NLTK will add a special
        start state, named C{'__start__'}.  The initial cost for
        the start state will be set to 0; and the initial cost for
        all other states will be set to +inf.

    @type add_end_state: C{bool}
    @param add_end_state: If true, then NLTK will add a special
        end state, named C{'__end__'}.  The final cost for the end
        state will be set to 0; and the final cost for all other
        states will be set to +inf.

    @type trace: C{int}
    @param trace: Controls the verbosity of trace output generated
        while training the CRF.  Higher numbers generate more
        verbose output.

    @return: The newly trained C{MalletCRF}, backed by C{filename}.
    """
    t0 = time.time()  # Record starting time.

    # If they did not supply a model filename, then choose one.
    if filename is None:
        (fd, filename) = mkstemp('.crf', 'model')
        os.fdopen(fd).close()

    # Ensure that the filename ends with '.crf'
    if not filename.endswith('.crf'):
        filename += '.crf'

    # NOTE: each print below takes a single parenthesised expression,
    # which produces the same output as a print statement and as a
    # print() call.
    if trace >= 1:
        print('[MalletCRF] Training a new CRF: %s' % filename)

    # Create crf-info object describing the new CRF.
    crf_info = MalletCRF._build_crf_info(
        corpus, gaussian_variance, default_label, max_iterations,
        transduction_type, weight_groups, add_start_state,
        add_end_state, filename, feature_detector)

    # Create a zipfile, and write crf-info to it.
    if trace >= 2:
        print('[MalletCRF] Adding crf-info.xml to %s' % filename)
    zf = zipfile.ZipFile(filename, mode='w')
    try:
        zf.writestr('crf-info.xml', crf_info.toxml() + '\n')
    finally:
        # Release the archive handle even if serialisation fails.
        zf.close()

    # Create the CRF object.
    crf = MalletCRF(filename, feature_detector)

    # Write the Training corpus to a temporary file.
    if trace >= 2:
        print('[MalletCRF] Writing training corpus...')
    (fd, train_file) = mkstemp('.txt', 'train')
    try:
        stream = os.fdopen(fd, 'w')
        try:
            crf.write_training_corpus(corpus, stream)
        finally:
            # Flush and release the handle before mallet reads the
            # file.  Closing an already-closed file object is a no-op,
            # so this is safe even if the writer closes the stream.
            stream.close()

        if trace >= 1:
            print('[MalletCRF] Calling mallet to train CRF...')
        cmd = [MalletCRF._TRAIN_CRF,
               '--model-file', os.path.abspath(filename),
               '--train-file', train_file]
        if trace > 3:
            # Most verbose setting: let mallet's output through as-is.
            call_mallet(cmd)
        else:
            # Otherwise capture mallet's combined stdout/stderr and let
            # _filter_training_output decide what to show.
            p = call_mallet(cmd, stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT, blocking=False)
            MalletCRF._filter_training_output(p, trace)
    finally:
        # Delete the temp file containing the training corpus.
        os.remove(train_file)

    if trace >= 1:
        print('[MalletCRF] Training complete.')
        print('[MalletCRF]   Model stored in: %s' % filename)
    if trace >= 2:
        dt = time.time() - t0
        print('[MalletCRF]   Total training time: %d seconds' % dt)

    # Return the completed CRF.
    return crf
def train(cls, feature_detector, corpus, filename=None,
          weight_groups=None, gaussian_variance=1, default_label='O',
          transduction_type='VITERBI', max_iterations=500,
          add_start_state=True, add_end_state=True, trace=1):
    """
    Train a new linear chain CRF tagger based on the given corpus
    of training sequences.  This tagger will be backed by a I{crf
    model file}, containing both a serialized Mallet model and
    information about the CRF's structure.  This crf model file
    will I{not} be automatically deleted -- if you wish to delete
    it, you must delete it manually.  The filename of the model
    file for a MalletCRF C{crf} is available as C{crf.filename}.

    @param feature_detector: The feature detector for the new CRF.
        It is passed unchanged to C{MalletCRF._build_crf_info} and
        to the C{MalletCRF} constructor.

    @type corpus: C{list} of C{tuple}
    @param corpus: Training data, represented as a list of
        sentences, where each sentence is a list of (token, tag)
        tuples.

    @type filename: C{str}
    @param filename: The filename that should be used for the crf
        model file that backs the new C{MalletCRF}.  If no
        filename is given, then a new filename will be chosen
        automatically.  If the given filename does not end in
        C{'.crf'}, that suffix is appended.

    @type weight_groups: C{list} of L{CRFInfo.WeightGroup}
    @param weight_groups: Specifies how input-features should
        be mapped to joint-features.  See L{CRFInfo.WeightGroup}
        for more information.

    @type gaussian_variance: C{float}
    @param gaussian_variance: The gaussian variance of the prior
        that should be used to train the new CRF.

    @type default_label: C{str}
    @param default_label: The "label for initial context and
        uninteresting tokens" (from Mallet's SimpleTagger.java.)
        It's unclear whether this currently has any effect.

    @type transduction_type: C{str}
    @param transduction_type: The type of transduction used by
        the CRF.  Can be VITERBI, VITERBI_FBEAM, VITERBI_BBEAM,
        VITERBI_FBBEAM, or VITERBI_FBEAMKL.

    @type max_iterations: C{int}
    @param max_iterations: The maximum number of iterations that
        should be used for training the CRF.

    @type add_start_state: C{bool}
    @param add_start_state: If true, then NLTK will add a special
        start state, named C{'__start__'}.  The initial cost for
        the start state will be set to 0; and the initial cost for
        all other states will be set to +inf.

    @type add_end_state: C{bool}
    @param add_end_state: If true, then NLTK will add a special
        end state, named C{'__end__'}.  The final cost for the end
        state will be set to 0; and the final cost for all other
        states will be set to +inf.

    @type trace: C{int}
    @param trace: Controls the verbosity of trace output generated
        while training the CRF.  Higher numbers generate more
        verbose output.

    @return: The newly trained C{MalletCRF}, backed by C{filename}.
    """
    t0 = time.time()  # Record starting time.

    # If they did not supply a model filename, then choose one.
    if filename is None:
        (fd, filename) = mkstemp('.crf', 'model')
        os.fdopen(fd).close()

    # Ensure that the filename ends with '.crf'
    if not filename.endswith('.crf'):
        filename += '.crf'

    # NOTE: each print below takes a single parenthesised expression,
    # which produces the same output as a print statement and as a
    # print() call.
    if trace >= 1:
        print('[MalletCRF] Training a new CRF: %s' % filename)

    # Create crf-info object describing the new CRF.
    crf_info = MalletCRF._build_crf_info(
        corpus, gaussian_variance, default_label, max_iterations,
        transduction_type, weight_groups, add_start_state,
        add_end_state, filename, feature_detector)

    # Create a zipfile, and write crf-info to it.
    if trace >= 2:
        print('[MalletCRF] Adding crf-info.xml to %s' % filename)
    zf = zipfile.ZipFile(filename, mode='w')
    try:
        zf.writestr('crf-info.xml', crf_info.toxml() + '\n')
    finally:
        # Release the archive handle even if serialisation fails.
        zf.close()

    # Create the CRF object.
    crf = MalletCRF(filename, feature_detector)

    # Write the Training corpus to a temporary file.
    if trace >= 2:
        print('[MalletCRF] Writing training corpus...')
    (fd, train_file) = mkstemp('.txt', 'train')
    try:
        stream = os.fdopen(fd, 'w')
        try:
            crf.write_training_corpus(corpus, stream)
        finally:
            # Flush and release the handle before mallet reads the
            # file.  Closing an already-closed file object is a no-op,
            # so this is safe even if the writer closes the stream.
            stream.close()

        if trace >= 1:
            print('[MalletCRF] Calling mallet to train CRF...')
        cmd = [MalletCRF._TRAIN_CRF,
               '--model-file', os.path.abspath(filename),
               '--train-file', train_file]
        if trace > 3:
            # Most verbose setting: let mallet's output through as-is.
            call_mallet(cmd)
        else:
            # Otherwise capture mallet's combined stdout/stderr and let
            # _filter_training_output decide what to show.
            p = call_mallet(cmd, stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT, blocking=False)
            MalletCRF._filter_training_output(p, trace)
    finally:
        # Delete the temp file containing the training corpus.
        os.remove(train_file)

    if trace >= 1:
        print('[MalletCRF] Training complete.')
        print('[MalletCRF]   Model stored in: %s' % filename)
    if trace >= 2:
        dt = time.time() - t0
        print('[MalletCRF]   Total training time: %d seconds' % dt)

    # Return the completed CRF.
    return crf