def batch_tag(self, sentences):
    """
    Tag a batch of sentences by running the trained Mallet CRF on them.

    The sentences are written to a temporary test file, Mallet is run
    on that file, and the labels it outputs are paired with the
    original tokens.  The temporary file is always removed, even if
    writing the corpus or running Mallet fails.

    :param sentences: The sentences to tag.  They are iterated twice
        (once to write the test corpus and once to pair tokens with
        labels), so a re-iterable sequence is expected.
    :return: A list with one entry per sentence, each entry being a
        list of (token, label) pairs.
    """
    # Write the test corpus to a temporary file.
    (fd, test_file) = mkstemp('.txt', 'test')
    try:
        stream = os.fdopen(fd, 'w')
        try:
            self.write_test_corpus(sentences, stream)
        finally:
            # Make sure the data is flushed before mallet reads the
            # file.  Closing an already-closed file is a no-op, so this
            # is safe even if write_test_corpus closed the stream.
            stream.close()

        # Run mallet on the test file.
        stdout, _stderr = call_mallet(
            [self._RUN_CRF,
             '--model-file', os.path.abspath(self.crf_info.model_filename),
             '--test-file', test_file],
            stdout='pipe')

        # Decode the output.
        labels = self.parse_mallet_output(stdout)

        # Strip the artificial __start__ and __end__ labels, if the
        # model was trained with them.
        first = 1 if self.crf_info.add_start_state else 0
        last = -1 if self.crf_info.add_end_state else None
        if first or last is not None:
            labels = [labs[first:last] for labs in labels]

        # Combine the labels and the original sentences.  list() makes
        # each tagged sentence a real list on Python 3, where zip()
        # returns a one-shot iterator.
        return [list(zip(sent, label))
                for (sent, label) in zip(sentences, labels)]
    finally:
        os.remove(test_file)
def train(cls, feature_detector, corpus, filename=None,
          weight_groups=None, gaussian_variance=1, default_label='O',
          transduction_type='VITERBI', max_iterations=500,
          add_start_state=True, add_end_state=True, trace=1):
    """
    Train a new linear chain CRF tagger based on the given corpus
    of training sequences.  This tagger will be backed by a crf
    model file, containing both a serialized Mallet model and
    information about the CRF's structure.  This crf model file
    will not be automatically deleted -- if you wish to delete
    it, you must delete it manually.  The filename of the model
    file for a MalletCRF crf is available as ``crf.filename()``.

    :param feature_detector: The feature detector for the new CRF.
        It is recorded in the crf-info description and passed to the
        ``MalletCRF`` constructor.
    :type corpus: list(list(tuple(str, str)))
    :param corpus: Training data, represented as a list of
        sentences, where each sentence is a list of (token, tag) tuples.
    :type filename: str
    :param filename: The filename that should be used for the crf
        model file that backs the new MalletCRF.  If no filename is
        given, then a new filename will be chosen automatically.
        If the filename does not end with '.crf', that suffix is
        appended.
    :type weight_groups: list(CRFInfo.WeightGroup)
    :param weight_groups: Specifies how input-features should
        be mapped to joint-features.  See CRFInfo.WeightGroup
        for more information.
    :type gaussian_variance: float
    :param gaussian_variance: The gaussian variance of the prior
        that should be used to train the new CRF.
    :type default_label: str
    :param default_label: The "label for initial context and
        uninteresting tokens" (from Mallet's SimpleTagger.java.)
        It's unclear whether this currently has any effect.
    :type transduction_type: str
    :param transduction_type: The type of transduction used by
        the CRF.  Can be VITERBI, VITERBI_FBEAM, VITERBI_BBEAM,
        VITERBI_FBBEAM, or VITERBI_FBEAMKL.
    :type max_iterations: int
    :param max_iterations: The maximum number of iterations that
        should be used for training the CRF.
    :type add_start_state: bool
    :param add_start_state: If true, then NLTK will add a special
        start state, named '__start__'.  The initial cost for
        the start state will be set to 0; and the initial cost for
        all other states will be set to +inf.
    :type add_end_state: bool
    :param add_end_state: If true, then NLTK will add a special
        end state, named '__end__'.  The final cost for the end
        state will be set to 0; and the final cost for all other
        states will be set to +inf.
    :type trace: int
    :param trace: Controls the verbosity of trace output generated
        while training the CRF.  Higher numbers generate more verbose
        output.
    :return: The newly trained ``MalletCRF``, backed by the model file.
    """
    t0 = time.time()  # Record starting time.

    # If they did not supply a model filename, then choose one.
    if filename is None:
        (fd, filename) = mkstemp('.crf', 'model')
        # Only the name is needed here; the file is rewritten below.
        os.close(fd)

    # Ensure that the filename ends with '.crf'
    if not filename.endswith('.crf'):
        filename += '.crf'

    if trace >= 1:
        print('[MalletCRF] Training a new CRF: %s' % filename)

    # Create crf-info object describing the new CRF.
    crf_info = MalletCRF._build_crf_info(
        corpus, gaussian_variance, default_label, max_iterations,
        transduction_type, weight_groups, add_start_state,
        add_end_state, filename, feature_detector)

    # Create a zipfile, and write crf-info to it.
    if trace >= 2:
        print('[MalletCRF] Adding crf-info.xml to %s' % filename)
    zf = zipfile.ZipFile(filename, mode='w')
    try:
        zf.writestr('crf-info.xml', crf_info.toxml()+'\n')
    finally:
        # Always release the archive handle, even if serialization fails.
        zf.close()

    # Create the CRF object.
    crf = MalletCRF(filename, feature_detector)

    # Write the Training corpus to a temporary file.
    if trace >= 2:
        print('[MalletCRF] Writing training corpus...')
    (fd, train_file) = mkstemp('.txt', 'train')
    try:
        stream = os.fdopen(fd, 'w')
        try:
            crf.write_training_corpus(corpus, stream)
        finally:
            # Make sure the data is flushed before mallet reads the
            # file.  Closing an already-closed file is a no-op.
            stream.close()

        if trace >= 1:
            print('[MalletCRF] Calling mallet to train CRF...')
        cmd = [MalletCRF._TRAIN_CRF,
               '--model-file', os.path.abspath(filename),
               '--train-file', train_file]
        if trace > 3:
            # Let mallet write directly to our stdout/stderr.
            call_mallet(cmd)
        else:
            # Capture mallet's output and show only what the trace
            # level asks for.
            p = call_mallet(cmd, stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT, blocking=False)
            MalletCRF._filter_training_output(p, trace)
    finally:
        # Delete the temp file containing the training corpus.
        os.remove(train_file)

    if trace >= 1:
        print('[MalletCRF] Training complete.')
        print('[MalletCRF] Model stored in: %s' % filename)
    if trace >= 2:
        dt = time.time()-t0
        print('[MalletCRF] Total training time: %d seconds' % dt)

    # Return the completed CRF.
    return crf
def train(cls, feature_detector, corpus, filename=None,
          weight_groups=None, gaussian_variance=1, default_label='O',
          transduction_type='VITERBI', max_iterations=500,
          add_start_state=True, add_end_state=True, trace=1):
    """
    Train a new linear chain CRF tagger based on the given corpus
    of training sequences.  This tagger will be backed by a crf
    model file, containing both a serialized Mallet model and
    information about the CRF's structure.  This crf model file
    will not be automatically deleted -- if you wish to delete
    it, you must delete it manually.  The filename of the model
    file for a MalletCRF crf is available as ``crf.filename()``.

    :param feature_detector: The feature detector for the new CRF.
        It is recorded in the crf-info description and passed to the
        ``MalletCRF`` constructor.
    :type corpus: list(list(tuple(str, str)))
    :param corpus: Training data, represented as a list of
        sentences, where each sentence is a list of (token, tag) tuples.
    :type filename: str
    :param filename: The filename that should be used for the crf
        model file that backs the new MalletCRF.  If no filename is
        given, then a new filename will be chosen automatically.
        If the filename does not end with '.crf', that suffix is
        appended.
    :type weight_groups: list(CRFInfo.WeightGroup)
    :param weight_groups: Specifies how input-features should
        be mapped to joint-features.  See CRFInfo.WeightGroup
        for more information.
    :type gaussian_variance: float
    :param gaussian_variance: The gaussian variance of the prior
        that should be used to train the new CRF.
    :type default_label: str
    :param default_label: The "label for initial context and
        uninteresting tokens" (from Mallet's SimpleTagger.java.)
        It's unclear whether this currently has any effect.
    :type transduction_type: str
    :param transduction_type: The type of transduction used by
        the CRF.  Can be VITERBI, VITERBI_FBEAM, VITERBI_BBEAM,
        VITERBI_FBBEAM, or VITERBI_FBEAMKL.
    :type max_iterations: int
    :param max_iterations: The maximum number of iterations that
        should be used for training the CRF.
    :type add_start_state: bool
    :param add_start_state: If true, then NLTK will add a special
        start state, named '__start__'.  The initial cost for
        the start state will be set to 0; and the initial cost for
        all other states will be set to +inf.
    :type add_end_state: bool
    :param add_end_state: If true, then NLTK will add a special
        end state, named '__end__'.  The final cost for the end
        state will be set to 0; and the final cost for all other
        states will be set to +inf.
    :type trace: int
    :param trace: Controls the verbosity of trace output generated
        while training the CRF.  Higher numbers generate more verbose
        output.
    :return: The newly trained ``MalletCRF``, backed by the model file.
    """
    # NOTE: every print below is written as print(<single expression>),
    # which produces the same output whether print is a statement
    # (Python 2) or a function (Python 3).
    t0 = time.time()  # Record starting time.

    # If they did not supply a model filename, then choose one.
    if filename is None:
        (fd, filename) = mkstemp('.crf', 'model')
        # Only the name is needed here; the file is rewritten below.
        os.close(fd)

    # Ensure that the filename ends with '.crf'
    if not filename.endswith('.crf'):
        filename += '.crf'

    if trace >= 1:
        print('[MalletCRF] Training a new CRF: %s' % filename)

    # Create crf-info object describing the new CRF.
    crf_info = MalletCRF._build_crf_info(
        corpus, gaussian_variance, default_label, max_iterations,
        transduction_type, weight_groups, add_start_state,
        add_end_state, filename, feature_detector)

    # Create a zipfile, and write crf-info to it.
    if trace >= 2:
        print('[MalletCRF] Adding crf-info.xml to %s' % filename)
    zf = zipfile.ZipFile(filename, mode='w')
    try:
        zf.writestr('crf-info.xml', crf_info.toxml()+'\n')
    finally:
        # Always release the archive handle, even if serialization fails.
        zf.close()

    # Create the CRF object.
    crf = MalletCRF(filename, feature_detector)

    # Write the Training corpus to a temporary file.
    if trace >= 2:
        print('[MalletCRF] Writing training corpus...')
    (fd, train_file) = mkstemp('.txt', 'train')
    try:
        stream = os.fdopen(fd, 'w')
        try:
            crf.write_training_corpus(corpus, stream)
        finally:
            # Make sure the data is flushed before mallet reads the
            # file.  Closing an already-closed file is a no-op.
            stream.close()

        if trace >= 1:
            print('[MalletCRF] Calling mallet to train CRF...')
        cmd = [MalletCRF._TRAIN_CRF,
               '--model-file', os.path.abspath(filename),
               '--train-file', train_file]
        if trace > 3:
            # Let mallet write directly to our stdout/stderr.
            call_mallet(cmd)
        else:
            # Capture mallet's output and show only what the trace
            # level asks for.
            p = call_mallet(cmd, stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT, blocking=False)
            MalletCRF._filter_training_output(p, trace)
    finally:
        # Delete the temp file containing the training corpus.
        os.remove(train_file)

    if trace >= 1:
        print('[MalletCRF] Training complete.')
        print('[MalletCRF] Model stored in: %s' % filename)
    if trace >= 2:
        dt = time.time()-t0
        print('[MalletCRF] Total training time: %d seconds' % dt)

    # Return the completed CRF.
    return crf