def patent_invention_train(mallet_file,
                           features="invention",
                           version="1",
                           xval=0,
                           verbose=False,
                           stats_file=None):
    """Train a classifier model from an existing .mallet training instances file.

    This is a thin wrapper around mallet.py. The file named by mallet_file
    must already exist and should be given with its full path; its directory
    is used as the training output directory, so the other files needed for
    mallet processing end up next to it. A MalletTraining instance does the
    actual work: building the .vectors file from the .mallet file and then
    creating the model.

    features is handed to MalletTraining, version to MalletConfig, and xval
    is passed on as the number_xval setting. The verbose and stats_file
    arguments are accepted but not used in this function."""

    # Classifier settings, forwarded to MalletConfig as keyword arguments.
    classifier_settings = {
        "classifier_type": "MaxEnt",
        "number_xval": xval,
        "training_portion": 0,
        "prune_p": False,
        "infogain_pruning": "5000",
        "count_pruning": "3",
    }

    # Everything mallet produces goes in the directory of the .mallet file.
    output_dir = os.path.dirname(mallet_file)
    mallet_config = mallet.MalletConfig(
        config.MALLET_DIR, 'itrain', 'iclassify', version, output_dir, '/tmp',
        **classifier_settings)
    trainer = mallet.MalletTraining(mallet_config, features)

    # make_utraining_file3 is deliberately not called here, because the
    # annotations used for this classifier are not based on doc_feats; the
    # .mallet file is supplied by the caller instead.
    trainer.write_train_mallet_vectors_file()
    trainer.mallet_train_classifier()
# Example #2
0
def patent_utraining_data3(mallet_file,
                           annotation_file,
                           annotation_count,
                           fnames,
                           features=None,
                           version="1",
                           xval=0,
                           verbose=False,
                           stats_file=None):
    """Build training data from annotations and train a classifier model.

    This is a thin wrapper around mallet.py. It loads a dictionary of phrase
    labels from annotation_file (via load_phrase_labels3, which also receives
    annotation_count), sets up the mallet configuration with the directory of
    mallet_file as the training output directory, and then lets a
    MalletTraining instance do the rest: creating the .mallet file, creating
    the .vectors file from the mallet file, and creating the model.

    fnames is passed to make_utraining_file3 together with the phrase labels.
    features is handed to MalletTraining, version to MalletConfig, and xval
    is passed on as the number_xval setting. The verbose and stats_file
    arguments are accepted but not used in this function."""
    phrase_labels = load_phrase_labels3(annotation_file, annotation_count)

    # Classifier settings, forwarded to MalletConfig as keyword arguments.
    classifier_settings = {
        "classifier_type": "MaxEnt",
        "number_xval": xval,
        "training_portion": 0,
        "prune_p": False,
        "infogain_pruning": "5000",
        "count_pruning": "3",
    }

    # Only the directory part of mallet_file is used in this function.
    output_dir = os.path.dirname(mallet_file)
    mallet_config = mallet.MalletConfig(
        config.MALLET_DIR, 'train', 'classify', version, output_dir, '/tmp',
        **classifier_settings)

    trainer = mallet.MalletTraining(mallet_config, features)
    trainer.make_utraining_file3(fnames, phrase_labels)
    trainer.mallet_train_classifier()
 def _create_mallet_file(self):
     """Generate the mallet training file for this model and record statistics.

     Loads the phrase labels, configures mallet with self.model as both the
     first and the fifth MalletConfig argument, and asks a MalletTraining
     instance to build the training file from the files selected by
     self.file_list under self.input_dataset.path. The label and term
     statistics collected on the MalletTraining instance are then passed to
     _create_info_stats_file."""
     # Presumably this populates self.d_phr2label, which is read below.
     self._load_phrase_labels()

     # Classifier settings, forwarded to MalletConfig as keyword arguments.
     classifier_settings = {
         "classifier_type": "MaxEnt",
         "number_xval": 0,
         "training_portion": 0,
         "prune_p": False,
         "infogain_pruning": "5000",
         "count_pruning": "3",
     }
     mallet_config = mallet.MalletConfig(
         self.model, 'train', 'classify', '0', self.model, '/tmp',
         **classifier_settings)
     trainer = mallet.MalletTraining(mallet_config)

     source_files = filename_generator(self.input_dataset.path, self.file_list)
     trainer.make_utraining_file3(source_files, self.d_phr2label, verbose=VERBOSE)

     # Hand the counts gathered during training-file creation to the stats writer.
     self._create_info_stats_file(
         trainer.stats_labeled_count,
         trainer.stats_unlabeled_count,
         trainer.stats_terms,
         trainer.stats_terms_y,
         trainer.stats_terms_n)