def save_to_file(self, model_file):
    """\
    Save the model to a pickle file or stream (supports GZip compression).
    """
    log_info('Saving model to file ' + str(model_file))
    fh = file_stream(model_file, mode='wb', encoding=None)
    pickle.Pickler(fh, pickle.HIGHEST_PROTOCOL).dump(self)
    fh.close()
    log_info('Model successfully saved.')
def load_training_set(self, filename, encoding='UTF-8'):
    """\
    Load the given training data set into memory and strip it if
    configured to via the train_part parameter.
    """
    log_info('Loading training data set from ' + str(filename) + '...')
    train = DataSet()
    train.load_from_arff(filename, encoding)
    if self.train_part < 1:
        train = train.subset(0, int(round(self.train_part * len(train))),
                             copy=False)
    return train
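# Usage sketch (illustrative only, not from the original module; the file name
# and the train_part value are assumptions): with train_part set to 0.5,
# loading a 1000-instance ARFF file keeps only its first 500 instances, since
# subset(0, round(0.5 * len(train))) truncates the loaded set.
#
#     train = model.load_training_set('train.arff')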
@staticmethod
def load_from_file(model_file):
    """\
    Load the model from a pickle file or stream (supports GZip compression).
    """
    log_info('Loading model from file ' + str(model_file))
    fh = file_stream(model_file, mode='rb', encoding=None)
    unpickler = pickle.Unpickler(fh)
    model = unpickler.load()
    fh.close()
    log_info('Model loaded successfully.')
    return model
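# Usage sketch (not part of the original module): round-tripping a trained
# model through the two pickle helpers above; the '.pickle.gz' file name is
# an assumption (file_stream is expected to handle the GZip compression
# transparently based on the extension).
#
#     model.save_to_file('model.pickle.gz')
#     model = Model.load_from_file('model.pickle.gz')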
def train_on_data(self, train):
    """\
    Train model on the specified training data set (which must be a loaded
    DataSet object).
    """
    log_info('Preparing data set...')
    self.data_headers = train.get_headers()
    train_vect = self.__vectorize(train)
    train_classes = self.get_classes(train)
    # if all the training data have the same class, use a dummy classifier
    if train.get_attrib(self.class_attr).num_values == 1:
        self.feature_filter = None
        self.classifier = DummyClassifier()
    # filter features
    log_info('Filtering...')
    train_filt = self.__filter_features(train_vect, train_classes)
    # train the classifier
    log_info('Training...')
    if self.use_weights:
        self.classifier.fit(train_filt, train_classes,
                            sample_weight=train.inst_weights)
    else:
        self.classifier.fit(train_filt, train_classes)
    self.classifier_trained = True
    log_info('Training done.')
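# Usage sketch (illustrative; the file names are hypothetical): a typical
# in-process training run combining the helpers above — load the ARFF data,
# fit the classifier, and persist the result.
#
#     train = model.load_training_set('train.arff.gz')
#     model.train_on_data(train)
#     model.save_to_file('model.pickle.gz')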
def process_document(self, filename):
    """\
    Read a Tecto-Template file and return its contents as a Document object.
    """
    fh = file_stream(filename, encoding=self.encoding)
    doc = Document(filename)
    for line in fh:
        bundle = doc.create_bundle()
        zone = bundle.create_zone(self.language, self.selector)
        ttree = zone.create_ttree()
        self.parse_line(line, ttree)
        log_info('Parsed a tree with %d nodes.' %
                 len(ttree.get_descendants()))
    fh.close()
    return doc
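# Usage sketch (illustrative; the file name is an assumption): the reader
# above creates one bundle with one t-tree per input line, so the number of
# bundles mirrors the number of lines in the Tecto-Template file.
#
#     doc = reader.process_document('input.ttpl')
#     log_info('Read %d trees.' % len(doc.bundles))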
def apply_to(self, string, language=None, selector=None):
    """
    Apply the whole scenario to a string (which should be readable by the
    first block of the scenario), return the sentence(s) of the given
    target language and selector.
    """
    # check if we know the target language and selector
    language = language or self.global_args["language"]
    selector = selector or self.global_args.get("selector", "")
    # the first block is supposed to be a reader which creates the document
    fh = StringIO(string)
    doc = self.blocks[0].process_document(fh)
    # apply all other blocks
    for block_no, block in enumerate(self.blocks[1:], start=2):
        log_info("Applying block " + str(block_no) + "/" +
                 str(len(self.blocks)) + ": " + block.__class__.__name__)
        block.process_document(doc)
    # return the sentences of all bundles for the given language and selector
    return "\n".join([b.get_zone(language, selector).sentence
                      for b in doc.bundles])
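# Usage sketch (illustrative; the construction call, input string, and
# language value are assumptions): run the whole block pipeline over a
# Tecto-Template string and print the generated sentence(s).
#
#     scenario.load_blocks()
#     template = 'hello|interj'
#     print scenario.apply_to(template, language='en')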
def load_blocks(self):
    "Load all blocks into memory, finding and creating class objects."
    self.blocks = []
    for block_no, block_data in enumerate(self.scenario_data, start=1):
        # create the block name and import it
        if "." in block_data["block"]:
            class_subpath, class_name = block_data["block"].rsplit(".", 1)
            class_subpath += "."
        else:
            class_subpath, class_name = "", block_data["block"]
        class_package = ("alex.components.nlg.tectotpl.block." +
                         class_subpath + class_name.lower())
        log_info("Loading block " + str(block_no) + "/" +
                 str(len(self.scenario_data)) + ": " + class_name)
        exec("import " + class_package)
        class_obj = getattr(sys.modules[class_package], class_name)
        # create the block object
        args = self.global_args.copy()
        args.update(block_data.get("args", {}))
        self.blocks.append(class_obj(self, args))
        # load models etc.
        self.blocks[-1].load()
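# Data sketch (an assumption inferred from the keys accessed above, not a
# verbatim config): each item of self.scenario_data is expected to carry a
# "block" name, optionally prefixed with a dotted sub-package path, plus
# optional "args" that override the scenario-wide global_args, e.g.:
#
#     scenario_data = [
#         {"block": "read.TectoTemplates", "args": {"encoding": "UTF-8"}},
#         {"block": "MyBlock"},
#     ]
#
# "read.TectoTemplates" would then be imported as the module
# alex.components.nlg.tectotpl.block.read.tectotemplates and instantiated via
# the class TectoTemplates found inside it.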
def train(self, train_file, work_dir, memory=8, encoding='UTF-8'):
    """\
    Read training data, split them and train the individual models
    (in cluster jobs).
    """
    # load the entire data set
    train = self.load_training_set(train_file, encoding)
    self.data_headers = train.get_headers()
    # train a backoff model
    log_info('Training a backoff model...')
    self.backoff_model = self.__train_backoff_model(train)
    # split it
    log_info('Split...')
    train_split = train.split(eval(self.divide_func), keep_copy=False)
    jobs = []
    model_files = {}
    # save training files and create training jobs
    for key, subset in train_split.iteritems():
        fn = re.sub(r'(\.arff(\.gz)?)?$', '-' + key + '.arff.gz', train_file)
        fn = os.path.join(work_dir, os.path.basename(fn))
        subset.save_to_arff(fn, encoding)
        job, model_file = Model.create_training_job(self.config, work_dir,
                                                    fn, memory=memory,
                                                    encoding=encoding)
        jobs.append(job)
        model_files[key] = model_file
    # submit the training jobs and wait for all of them
    log_info('Submitting training jobs...')
    for job in jobs:
        job.submit()
    log_info('Waiting for jobs...')
    for job in jobs:
        job.wait()
    # load all models
    log_info('Training complete. Assembling model files...')
    for key, model_file in model_files.iteritems():
        self.models[key] = Model.load_from_file(model_file)
    self.trained = True
    log_info('Training done.')
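# Configuration sketch (hypothetical value; the exact callable signature is an
# assumption): self.divide_func is eval'd into the callable handed to
# DataSet.split(), i.e. something along the lines of
#
#     divide_func = "lambda inst: inst['pos']"
#
# so that one training job and one sub-model are created per distinct key,
# with the separately trained backoff model available as a fallback.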