Example #1
 def save_to_file(self, model_file):
     """\
     Save the model to a pickle file or stream (supports GZip compression).
     """
     log_info('Saving model to file ' + str(model_file))
     fh = file_stream(model_file, mode='wb', encoding=None)
     pickle.Pickler(fh, pickle.HIGHEST_PROTOCOL).dump(self)
     fh.close()
     log_info('Model successfully saved.')
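The save/load examples rely on a file_stream helper that is not shown on this page. The sketch below is only an assumption of how such a helper could behave (GZip chosen from a '.gz' suffix, a raw binary stream when encoding is None); the project's actual implementation may differ.
 import gzip
 import io

 def file_stream(filename, mode='r', encoding='UTF-8'):
     """Hypothetical stand-in for the file_stream helper used above:
     open a GZip stream for '.gz' names, a plain file otherwise, and
     return a binary stream when encoding is None (as needed for
     pickling)."""
     if hasattr(filename, 'read') or hasattr(filename, 'write'):
         return filename  # already an open stream, use it as-is
     opener = gzip.open if str(filename).endswith('.gz') else open
     if encoding is None:
         return opener(filename, mode)
     binary_mode = mode if 'b' in mode else mode + 'b'
     return io.TextIOWrapper(opener(filename, binary_mode), encoding=encoding)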
Example #2
 def load_training_set(self, filename, encoding='UTF-8'):
     """\
     Load the given training data set into memory and strip it if
     configured to via the train_part parameter.
     """
     log_info('Loading training data set from ' + str(filename) + '...')
     train = DataSet()
     train.load_from_arff(filename, encoding)
     if self.train_part < 1:
         train = train.subset(0, int(round(self.train_part * len(train))),
                              copy=False)
     return train
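A small illustration of the train_part stripping above, with hypothetical numbers: only a leading fraction of the loaded instances is kept for training.
 train_part = 0.75      # hypothetical configuration value
 n_instances = 1000     # hypothetical size of the loaded ARFF data set
 keep = int(round(train_part * n_instances))
 print(keep)            # -> 750: train.subset(0, keep) keeps this prefix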
Example #3
 def load_from_file(model_file):
     """\
     Load the model from a pickle file or stream
     (supports GZip compression).
     """
     log_info('Loading model from file ' + str(model_file))
     fh = file_stream(model_file, mode='rb', encoding=None)
     unpickler = pickle.Unpickler(fh)
     model = unpickler.load()
     fh.close()
     log_info('Model loaded successfully.')
     return model
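The same pickle-over-GZip pattern can be reproduced standalone; the sketch below uses illustrative data and file names rather than anything from the project.
 import gzip
 import pickle

 # Round-trip an object through a GZip-compressed pickle, mirroring the
 # save_to_file / load_from_file pair above (illustrative data only).
 data = {'weights': [0.1, 0.2, 0.3]}
 with gzip.open('model.pickle.gz', 'wb') as fh:
     pickle.Pickler(fh, pickle.HIGHEST_PROTOCOL).dump(data)
 with gzip.open('model.pickle.gz', 'rb') as fh:
     restored = pickle.Unpickler(fh).load()
 assert restored == data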
Example #4
 def train_on_data(self, train):
     """\
     Train model on the specified training data set (which must be a loaded
     DataSet object).
     """
     log_info('Preparing data set...')
     self.data_headers = train.get_headers()
     train_vect = self.__vectorize(train)
     train_classes = self.get_classes(train)
     # if all the training data have the same class, use a dummy classifier
     if train.get_attrib(self.class_attr).num_values == 1:
         self.feature_filter = None
         self.classifier = DummyClassifier()
     # filter features
     log_info('Filtering...')
     train_filt = self.__filter_features(train_vect, train_classes)
     # train the classifier
     log_info('Training...')
     if self.use_weights:
         self.classifier.fit(train_filt, train_classes,
                             sample_weight=train.inst_weights)
     else:
         self.classifier.fit(train_filt, train_classes)
     self.classifier_trained = True
     log_info('Training done.')
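The single-class fallback above replaces the regular classifier with a constant predictor. If DummyClassifier is (or behaves like) scikit-learn's sklearn.dummy.DummyClassifier, the fallback looks roughly as follows; the feature values and labels here are made up.
 from sklearn.dummy import DummyClassifier

 # All training instances share one class, so an ordinary classifier
 # cannot be fit; a constant "most frequent class" predictor is used
 # instead (illustrative data only).
 X = [[0.0, 1.0], [1.0, 0.0], [0.5, 0.5]]
 y = ['yes', 'yes', 'yes']
 clf = DummyClassifier(strategy='most_frequent')
 clf.fit(X, y)
 print(clf.predict([[0.2, 0.8]]))   # -> ['yes']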
Example #5
 def process_document(self, filename):
     """\
     Read a Tecto-Template file and return its contents as
     a Document object.
     """
     fh = file_stream(filename, encoding=self.encoding)
     doc = Document(filename)
     for line in fh:
         bundle = doc.create_bundle()
         zone = bundle.create_zone(self.language, self.selector)
         ttree = zone.create_ttree()
         self.parse_line(line, ttree)
         log_info('Parsed a tree with %d nodes.' %
                  len(ttree.get_descendants()))
     fh.close()
     return doc
Example #6
 def apply_to(self, string, language=None, selector=None):
     """
     Apply the whole scenario to a string (which should be readable by
     the first block of the scenario), return the sentence(s) of the
     given target language and selector.
     """
     # check if we know the target language and selector
     language = language or self.global_args["language"]
     selector = selector or self.global_args.get("selector", "")
     # the first block is supposed to be a reader which creates the document
     fh = StringIO(string)
     doc = self.blocks[0].process_document(fh)
     # apply all other blocks
     for block_no, block in enumerate(self.blocks[1:], start=2):
         log_info("Applying block " + str(block_no) + "/" + str(len(self.blocks)) + ": " + block.__class__.__name__)
         block.process_document(doc)
     # return the text of all bundles for the specified sentence
     return "\n".join([b.get_zone(language, selector).sentence for b in doc.bundles])
Example #7
 def load_blocks(self):
     "Load all blocks into memory, finding and creating class objects."
     self.blocks = []
     for block_no, block_data in enumerate(self.scenario_data, start=1):
         # create the block name and import it
         if "." in block_data["block"]:
             class_subpath, class_name = block_data["block"].rsplit(".", 1)
             class_subpath += "."
         else:
             class_subpath, class_name = "", block_data["block"]
         class_package = ("alex.components.nlg.tectotpl.block." +
                          class_subpath + class_name.lower())
         log_info("Loading block " + str(block_no) + "/" +
                  str(len(self.scenario_data)) + ": " + class_name)
         exec("import " + class_package)
         class_obj = getattr(sys.modules[class_package], class_name)
         # create the block object
         args = self.global_args.copy()
         args.update(block_data.get("args", {}))
         self.blocks.append(class_obj(self, args))
         # load models etc.
         self.blocks[-1].load()
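The dynamic import above relies on the convention that each block class lives in a module named after the lowercased class name under alex.components.nlg.tectotpl.block. The sketch below shows the same lookup with importlib instead of exec; it is only an illustration (the block name is hypothetical), and calling it requires the target package to be importable.
 import importlib

 def load_block_class(dotted_name,
                      prefix='alex.components.nlg.tectotpl.block.'):
     """Illustrative alternative to the exec-based import above: resolve
     a block name such as 'some_subpackage.SomeBlock' (hypothetical) to
     its class object."""
     if '.' in dotted_name:
         class_subpath, class_name = dotted_name.rsplit('.', 1)
         class_subpath += '.'
     else:
         class_subpath, class_name = '', dotted_name
     module = importlib.import_module(prefix + class_subpath +
                                      class_name.lower())
     return getattr(module, class_name)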
Example #8
 def train(self, train_file, work_dir, memory=8, encoding='UTF-8'):
     """\
     Read training data, split them and train the individual models
     (in cluster jobs).
     """
     # load the entire data set
     train = self.load_training_set(train_file, encoding)
     self.data_headers = train.get_headers()
     # train a backoff model
     log_info('Training a backoff model...')
     self.backoff_model = self.__train_backoff_model(train)
     # split it
     log_info('Split...')
     train_split = train.split(eval(self.divide_func), keep_copy=False)
     jobs = []
     model_files = {}
     # save training files and create training jobs
     for key, subset in train_split.iteritems():
         fn = re.sub(r'(.arff(.gz)?)?$', '-' + key + '.arff.gz', train_file)
         fn = os.path.join(work_dir, os.path.basename(fn))
         subset.save_to_arff(fn, encoding)
         job, model_file = Model.create_training_job(self.config, work_dir,
                                                     fn, memory=memory,
                                                     encoding=encoding)
         jobs.append(job)
         model_files[key] = model_file
     # submit the training jobs and wait for all of them
     log_info('Submitting training jobs...')
     for job in jobs:
         job.submit()
     log_info('Waiting for jobs...')
     for job in jobs:
         job.wait()
     # load all models
     log_info('Training complete. Assembling model files...')
     for key, model_file in model_files.iteritems():
         self.models[key] = Model.load_from_file(model_file)
     self.trained = True
     log_info('Training done.')