def load_from_file(fname):
    """Load the candidate generator from a pickle file, applying
    backward-compatibility fixes for models saved by older versions."""
    log_info('Loading model from ' + fname)
    with file_stream(fname, mode='rb', encoding=None) as fh:
        candgen = pickle.load(fh)
        # various backward compatibility tricks
        if type(candgen) == dict:
            child_type_counts = candgen
            candgen = RandomCandidateGenerator({})
            candgen.child_type_counts = child_type_counts
            candgen.child_num_cdfs = pickle.load(fh)
            candgen.max_children = pickle.load(fh)
        if not hasattr(candgen, 'node_limits'):
            candgen.node_limits = None
        if not hasattr(candgen, 'child_type_counts'):
            candgen.child_type_counts = candgen.form_counts
            candgen.child_num_cdfs = candgen.child_cdfs
        if not hasattr(candgen, 'exp_child_num'):
            candgen.exp_child_num = candgen.exp_from_cdfs(candgen.child_num_cdfs)
        if not hasattr(candgen, 'compatible_dais'):
            candgen.compatible_dais = None
            candgen.compatible_dais_type = None
            candgen.compatible_dais_limit = 1000
        if not hasattr(candgen, 'compatible_slots'):
            candgen.compatible_slots = False
        return candgen
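# The hasattr() upgrade chain above is a common trick for loading pickles
# saved by older code versions. A minimal self-contained sketch of the same
# pattern (the toy class and attribute names are illustrative only):
import pickle

class ToyModel(object):
    """Current model class; older pickles may lack newly added attributes."""
    def __init__(self):
        self.counts = {}
        self.node_limits = None  # attribute introduced in a later version

def load_toy_model(fname):
    with open(fname, 'rb') as fh:
        model = pickle.load(fh)
    # upgrade old pickles in place: supply defaults for missing attributes
    if not hasattr(model, 'node_limits'):
        model.node_limits = None
    return model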
def process_document(self, filename): "Read a YAML file and return its contents as a Document object" f = file_stream(filename, encoding=None) data = yaml.load(f) doc = Document(filename, data) f.close() return doc
def read_ttrees(ttree_file):
    """Read t-trees from a YAML/Pickle file."""
    if "pickle" in ttree_file:
        # if pickled, read just the pickle
        fh = file_stream(ttree_file, mode="rb", encoding=None)
        unpickler = pickle.Unpickler(fh)
        ttrees = unpickler.load()
        fh.close()
    else:
        # if not pickled, read YAML and save a pickle nearby
        yaml_reader = YAMLReader(scenario=None, args={})
        ttrees = yaml_reader.process_document(ttree_file)
        pickle_file = ttree_file.replace("yaml", "pickle")
        fh = file_stream(pickle_file, mode="wb", encoding=None)
        pickle.Pickler(fh, pickle.HIGHEST_PROTOCOL).dump(ttrees)
        fh.close()
    return ttrees
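# Caveat: str.replace() substitutes every occurrence, so the derived pickle
# path is only correct when 'yaml' appears solely in the file extension:
assert 'data/train.yaml.gz'.replace('yaml', 'pickle') == 'data/train.pickle.gz'
assert 'yaml-data/train.yaml'.replace('yaml', 'pickle') == 'pickle-data/train.pickle'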
def save_to_file(self, model_file):
    """\
    Save the model to a pickle file or stream (supports GZip compression).
    """
    log_info('Saving model to file ' + str(model_file))
    fh = file_stream(model_file, mode='wb', encoding=None)
    pickle.Pickler(fh, pickle.HIGHEST_PROTOCOL).dump(self)
    fh.close()
    log_info('Model successfully saved.')
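# file_stream() is the project's helper for opening plain or gzipped files
# with an optional encoding wrapper; a minimal sketch of the behavior assumed
# throughout this code (not the actual implementation):
import codecs
import gzip

def file_stream_sketch(filename, mode='r', encoding='UTF-8'):
    # open transparently through gzip if the file name indicates compression
    fh = gzip.open(filename, mode) if filename.endswith('.gz') else open(filename, mode)
    if encoding is not None:
        # wrap in a codec reader/writer; encoding=None means raw bytes
        wrap = codecs.getreader(encoding) if 'r' in mode else codecs.getwriter(encoding)
        fh = wrap(fh)
    return fh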
def read_das(da_file):
    """Read dialogue acts from a file, one per line."""
    das = []
    with file_stream(da_file) as fh:
        for line in fh:
            da = DialogueAct()
            da.parse(line)
            das.append(da)
    return das
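# DialogueAct.parse() is defined elsewhere; as a purely hypothetical sketch,
# assuming the usual 'act_type(slot=value)&act_type(slot)' textual format:
def parse_da_line(line):
    """Parse one DA line into (act_type, slot, value) triples (hypothetical)."""
    dais = []
    for part in line.strip().split('&'):
        act_type, _, rest = part.partition('(')
        slot, _, value = rest.rstrip(')').partition('=')
        dais.append((act_type, slot or None, value or None))
    return dais

# parse_da_line('inform(food=Chinese)&request(area)')
# -> [('inform', 'food', 'Chinese'), ('request', 'area', None)]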
def process_document(self, doc):
    """Write a YAML document."""
    data = []
    for bundle in doc.bundles:
        data.append(self.serialize_bundle(bundle))
    out = file_stream(self.get_output_file_name(doc), 'w', encoding=None)
    out.write(yaml.safe_dump(data, allow_unicode=True, explicit_start=True))
    out.close()
def load_from_file(model_file):
    """\
    Load the model from a pickle file or stream (supports GZip compression).
    """
    log_info('Loading model from file ' + str(model_file))
    fh = file_stream(model_file, mode='rb', encoding=None)
    unpickler = pickle.Unpickler(fh)
    model = unpickler.load()
    fh.close()
    log_info('Model loaded successfully.')
    return model
def percrank_train(args):
    """Train a perceptron ranker according to command-line arguments."""
    opts, files = getopt(args, 'c:d:s:j:w:e:')
    candgen_model = None
    train_size = 1.0
    parallel = False
    jobs_number = 0
    work_dir = None
    experiment_id = None

    for opt, arg in opts:
        if opt == '-d':  # debug output file
            set_debug_stream(file_stream(arg, mode='w'))
        elif opt == '-s':  # portion of the training data to use
            train_size = float(arg)
        elif opt == '-c':  # pre-trained candidate generator model
            candgen_model = arg
        elif opt == '-j':  # number of parallel jobs
            parallel = True
            jobs_number = int(arg)
        elif opt == '-w':  # working directory for parallel training
            work_dir = arg
        elif opt == '-e':  # experiment ID
            experiment_id = arg

    if len(files) != 4:
        sys.exit(__doc__)
    fname_rank_config, fname_train_das, fname_train_ttrees, fname_rank_model = files

    log_info('Training perceptron ranker...')
    rank_config = Config(fname_rank_config)
    if candgen_model:
        rank_config['candgen_model'] = candgen_model

    # select the ranker class (neural or plain perceptron)
    if rank_config.get('nn'):
        if rank_config['nn'] == 'emb':
            ranker_class = EmbNNRanker
        else:
            ranker_class = SimpleNNRanker
    else:
        ranker_class = PerceptronRanker

    if not parallel:
        ranker = ranker_class(rank_config)
    else:
        rank_config['jobs_number'] = jobs_number
        if work_dir is None:
            work_dir, _ = os.path.split(fname_rank_config)
        ranker = ParallelRanker(rank_config, work_dir, experiment_id, ranker_class)

    ranker.train(fname_train_das, fname_train_ttrees, data_portion=train_size)
    ranker.save_to_file(fname_rank_model)
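# For reference, getopt() with the 'c:d:s:j:w:e:' spec (every flag takes a
# value) splits recognized options from the positional arguments:
from getopt import getopt

opts, files = getopt(['-s', '0.8', '-j', '4',
                      'rank.cfg', 'train.das', 'train.yaml', 'model.pickle.gz'],
                     'c:d:s:j:w:e:')
# opts  == [('-s', '0.8'), ('-j', '4')]
# files == ['rank.cfg', 'train.das', 'train.yaml', 'model.pickle.gz']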
def process_document(self, filename):
    """\
    Read a Tecto-Template file and return its contents as a Document object.
    """
    fh = file_stream(filename, encoding=self.encoding)
    doc = Document(filename)
    for line in fh:
        bundle = doc.create_bundle()
        zone = bundle.create_zone(self.language, self.selector)
        ttree = zone.create_ttree()
        self.parse_line(line, ttree)
        log_info('Parsed a tree with %d nodes.' % len(ttree.get_descendants()))
    fh.close()
    return doc
def save_to_arff(self, filename, encoding='UTF-8'):
    """Save the data set to an ARFF file."""
    # open the file
    fh = file_stream(filename, 'w', encoding)
    # print the relation name
    print >> fh, '@relation ' + (self.relation_name
                                 if self.relation_name is not None else '<noname>')
    # print the list of attributes
    for attrib in self.attribs:
        print >> fh, '@attribute ' + attrib.name + ' ' + attrib.get_arff_type()
    # print instances
    print >> fh, '@data'
    for inst, weight in zip(self.data, self.inst_weights):
        print >> fh, self.__get_arff_line(inst, weight)
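# The writer above emits a standard ARFF layout; the header produced looks
# like this (the exact instance line format is up to __get_arff_line):
#
#   @relation my_data
#   @attribute word string
#   @attribute count numeric
#   @data
#   'hello',3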
def load_from_arff(self, filename, encoding='UTF-8'):
    """Load an ARFF file/stream, filling the data structures."""
    # initialize
    if not self.is_empty:
        raise IOError('Cannot store second data set into the same object.')
    status = 'header'  # we first assume to read the header
    line_num = 1  # line counter
    instances = []
    weights = []
    # open the file
    fh = file_stream(filename, encoding=encoding)
    # parse the file
    for line in fh:
        line = line.strip()
        # skip comments
        if line.startswith('%'):
            continue
        # relation name
        elif line.lower().startswith('@relation'):
            self.relation_name = line.split(None, 1)[1]
        # attribute definition
        elif line.lower().startswith('@attribute'):
            attr_name, attr_type = line.split(None, 2)[1:]
            self.attribs.append(Attribute(attr_name, attr_type))
        # data section start
        elif line.lower().startswith('@data'):
            status = 'data'
        # data lines
        elif status == 'data' and line != '':
            inst, weight = self.__parse_line(line, line_num)
            instances.append(inst)
            weights.append(weight)
        line_num += 1
    fh.close()
    # store the resulting matrix
    self.data = instances
    self.inst_weights = weights
    # remember attribute names
    self.attribs_by_name = {attr.name: idx for idx, attr in enumerate(self.attribs)}
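# The header parsing relies on maxsplit so that attribute types containing
# whitespace (e.g. nominal value lists) survive intact:
line = '@attribute class {spam, ham}'
attr_name, attr_type = line.split(None, 2)[1:]
assert attr_name == 'class' and attr_type == '{spam, ham}'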
def create_training_job(config, work_dir, train_file,
                        name=None, memory=8, encoding='UTF-8'):
    """\
    Submit a training process on the cluster which will save the model to a pickle.
    Return the submitted job and the future location of the model pickle.
    train_file cannot be a stream, it must be an actual file.
    """
    # purge name
    if name is None:
        name = 'TR-' + re.sub(r'[^A-Za-z0-9_]', '_', train_file)
    else:
        name = re.sub(r'[^A-Za-z0-9_]', '_', name)
    # create working directory, if not existing
    if not os.path.isdir(work_dir):
        os.mkdir(work_dir)
    train_file = os.path.abspath(train_file)
    # generate model file name
    model_file = os.path.abspath(os.path.join(work_dir, name + '-model.pickle.gz'))
    config_pickle = os.path.abspath(os.path.join(work_dir, name + '-cfg.pickle.gz'))
    # create the configuration pickle
    fh = file_stream(config_pickle, mode='wb', encoding=None)
    pickle.Pickler(fh, pickle.HIGHEST_PROTOCOL).dump(config)
    fh.close()
    # create the job
    job = Job(name=name, work_dir=work_dir)
    job.code = "fh = file_stream('" + config_pickle + "', mode='rb', encoding=None)\n" + \
               "cfg = pickle.Unpickler(fh).load()\n" + \
               "fh.close()\n" + \
               "model = Model(cfg)\n" + \
               "model.train('" + train_file + "', encoding='" + encoding + "')\n" + \
               "model.save_to_file('" + model_file + "')\n"
    job.header += "from alex.components.nlg.tectotpl.tool.ml.model import Model\n" + \
                  "import pickle\n" + \
                  "from alex.components.nlg.tectotpl.core.util import file_stream\n"
    return job, model_file
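# Building job.code by string concatenation breaks on paths containing quotes;
# a more defensive variant (a sketch, not the project's API) quotes all values
# with %r / repr():
def build_job_code(config_pickle, train_file, model_file, encoding):
    return ("fh = file_stream(%r, mode='rb', encoding=None)\n"
            "cfg = pickle.Unpickler(fh).load()\n"
            "fh.close()\n"
            "model = Model(cfg)\n"
            "model.train(%r, encoding=%r)\n"
            "model.save_to_file(%r)\n"
            % (config_pickle, train_file, encoding, model_file))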
def run_worker(head_host, head_port, debug_out=None):
    """Run a worker: start a training server and register it with the head."""
    # setup debugging output, if applicable
    if debug_out is not None:
        set_debug_stream(file_stream(debug_out, mode='w'))
    # start the server (in the background)
    log_info('Creating worker server...')
    server = ThreadPoolServer(service=RankerTrainingService, nbThreads=1)
    server_thread = Thread(target=server.start)
    server_thread.start()
    my_host = socket.getfqdn()
    log_info('Worker server created at %s:%d. Connecting to head at %s:%d...' %
             (my_host, server.port, head_host, head_port))
    # notify the head about this server
    conn = connect(head_host, head_port, config={'allow_pickle': True})
    conn.root.register_worker(my_host, server.port)
    conn.close()
    log_info('Worker is registered with the head.')
    # now serve until we're killed (the server thread will continue to run)
    server_thread.join()
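# The worker/head wiring above uses RPyC; a minimal service of the same shape,
# with an illustrative service class standing in for RankerTrainingService:
import rpyc
from rpyc.utils.server import ThreadPoolServer

class EchoService(rpyc.Service):
    def exposed_echo(self, message):
        # remotely callable by clients as conn.root.echo(...)
        return message

# port=0 lets the OS pick a free port, published afterwards as server.port;
# start() blocks, hence the background thread in run_worker() above
server = ThreadPoolServer(EchoService, port=0, nbThreads=1)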
def load_from_file(model_fname):
    """Load a pre-trained model from a file."""
    log_info("Loading ranker from %s..." % model_fname)
    with file_stream(model_fname, 'rb', encoding=None) as fh:
        return pickle.load(fh)
def save_to_file(self, fname):
    """Save the model to a pickle file."""
    log_info('Saving model to ' + fname)
    with file_stream(fname, mode='wb', encoding=None) as fh:
        pickle.dump(self, fh, pickle.HIGHEST_PROTOCOL)
def asearch_gen(args):
    """A*-search generation."""
    opts, files = getopt(args, 'e:d:w:c:s:')
    eval_file = None
    fname_ttrees_out = None
    cfg_file = None
    eval_selector = ''

    for opt, arg in opts:
        if opt == '-e':  # evaluation file with gold-standard trees
            eval_file = arg
        elif opt == '-s':  # selector of the gold trees in the evaluation file
            eval_selector = arg
        elif opt == '-d':  # debug output file
            set_debug_stream(file_stream(arg, mode='w'))
        elif opt == '-w':  # output file for the generated trees
            fname_ttrees_out = arg
        elif opt == '-c':  # configuration file
            cfg_file = arg

    if len(files) != 3:
        sys.exit('Invalid arguments.\n' + __doc__)
    fname_cand_model, fname_rank_model, fname_da_test = files

    log_info('Initializing...')
    candgen = RandomCandidateGenerator.load_from_file(fname_cand_model)
    ranker = PerceptronRanker.load_from_file(fname_rank_model)
    cfg = Config(cfg_file) if cfg_file else {}
    cfg.update({'candgen': candgen, 'ranker': ranker})
    tgen = ASearchPlanner(cfg)

    log_info('Generating...')
    das = read_das(fname_da_test)

    if eval_file is None:
        gen_doc = Document()
    else:
        eval_doc = read_ttrees(eval_file)
        if eval_selector == tgen.selector:
            gen_doc = Document()
        else:
            gen_doc = eval_doc

    # generate and evaluate
    if eval_file is not None:
        # generate + analyze open & close lists
        lists_analyzer = ASearchListsAnalyzer()
        for num, (da, gold_tree) in enumerate(
                zip(das, trees_from_doc(eval_doc, tgen.language, eval_selector)), start=1):
            log_debug("\n\nTREE No. %03d" % num)
            open_list, close_list = tgen.generate_tree(da, gen_doc, return_lists=True)
            lists_analyzer.append(gold_tree, open_list, close_list)
            gen_tree = close_list.peek()[0]
            if gen_tree != gold_tree:
                log_debug("\nDIFFING TREES:\n"
                          + tgen.ranker.diffing_trees_with_scores(da, gold_tree, gen_tree)
                          + "\n")

        log_info('Gold tree BEST: %.4f, on CLOSE: %.4f, on ANY list: %.4f'
                 % lists_analyzer.stats())

        # evaluate the generated trees against the golden trees
        eval_ttrees = ttrees_from_doc(eval_doc, tgen.language, eval_selector)
        gen_ttrees = ttrees_from_doc(gen_doc, tgen.language, tgen.selector)

        log_info('Evaluating...')
        evaler = Evaluator()
        for eval_bundle, eval_ttree, gen_ttree, da in zip(
                eval_doc.bundles, eval_ttrees, gen_ttrees, das):
            # add some stats about the tree directly into the output file
            add_bundle_text(eval_bundle, tgen.language, tgen.selector + 'Xscore',
                            "P: %.4f R: %.4f F1: %.4f"
                            % p_r_f1_from_counts(*corr_pred_gold(eval_ttree, gen_ttree)))
            # collect overall stats
            evaler.append(eval_ttree, gen_ttree,
                          ranker.score(TreeData.from_ttree(eval_ttree), da),
                          ranker.score(TreeData.from_ttree(gen_ttree), da))
        # print overall stats
        log_info("NODE precision: %.4f, Recall: %.4f, F1: %.4f" % evaler.p_r_f1())
        log_info("DEP precision: %.4f, Recall: %.4f, F1: %.4f" % evaler.p_r_f1(EvalTypes.DEP))
        log_info("Tree size stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" % evaler.tree_size_stats())
        log_info("Score stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" % evaler.score_stats())
        log_info("Common subtree stats:\n -- SIZE: %s\n -- ΔGLD: %s\n -- ΔPRD: %s"
                 % evaler.common_subtree_stats())
    # just generate
    else:
        for da in das:
            tgen.generate_tree(da, gen_doc)

    # write output
    if fname_ttrees_out is not None:
        log_info('Writing output...')
        write_ttrees(gen_doc, fname_ttrees_out)
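# p_r_f1_from_counts() and corr_pred_gold() are defined elsewhere; the standard
# computation the names suggest is (a sketch under that assumption):
def p_r_f1_sketch(corr, pred, gold):
    """Precision/recall/F1 from correct, predicted, and gold node counts."""
    precision = corr / float(pred) if pred else 0.0
    recall = corr / float(gold) if gold else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1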
def save_to_file(self, model_fname):
    """Save the model to a file."""
    log_info("Saving ranker to %s..." % model_fname)
    with file_stream(model_fname, 'wb', encoding=None) as fh:
        pickle.dump(self, fh, protocol=pickle.HIGHEST_PROTOCOL)