Example #1
 def load_from_file(fname):
     log_info('Loading model from ' + fname)
     with file_stream(fname, mode='rb', encoding=None) as fh:
         candgen = pickle.load(fh)
         # various backward compatibility tricks
         if type(candgen) == dict:
             child_type_counts = candgen
             candgen = RandomCandidateGenerator({})
             candgen.child_type_counts = child_type_counts
             candgen.child_num_cdfs = pickle.load(fh)
             candgen.max_children = pickle.load(fh)
         if not hasattr(candgen, 'node_limits'):
             candgen.node_limits = None
         if not hasattr(candgen, 'child_type_counts'):
             candgen.child_type_counts = candgen.form_counts
             candgen.child_num_cdfs = candgen.child_cdfs
         if not hasattr(candgen, 'exp_child_num'):
             candgen.exp_child_num = candgen.exp_from_cdfs(candgen.child_num_cdfs)
         if not hasattr(candgen, 'compatible_dais'):
             candgen.compatible_dais = None
             candgen.compatible_dais_type = None
             candgen.compatible_dais_limit = 1000
         if not hasattr(candgen, 'compatible_slots'):
             candgen.compatible_slots = False
         return candgen
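Note: the first backward-compatibility branch above handles an older on-disk format in which three objects were pickled back-to-back into a single stream (a plain dict of child type counts, then the child-number CDFs, then the maximum child counts). A file in that legacy format would have been produced roughly like the sketch below; the variable names come from the loader, the writer itself is hypothetical, and pickle/file_stream are assumed imported as elsewhere in these examples.

# hypothetical writer for the legacy format handled by load_from_file above
with file_stream(fname, mode='wb', encoding=None) as fh:
    pickle.dump(child_type_counts, fh, pickle.HIGHEST_PROTOCOL)  # plain dict
    pickle.dump(child_num_cdfs, fh, pickle.HIGHEST_PROTOCOL)
    pickle.dump(max_children, fh, pickle.HIGHEST_PROTOCOL)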
Example #2
 def process_document(self, filename):
     "Read a YAML file and return its contents as a Document object"
     f = file_stream(filename, encoding=None)
     data = yaml.load(f)
     doc = Document(filename, data)
     f.close()
     return doc
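With PyYAML 5.1 and later, calling yaml.load() without an explicit Loader emits a warning because it can construct arbitrary Python objects. A safer variant of the reader above (a sketch, not the project's code; file_stream, yaml and Document are assumed to be imported as in the original) could look like:

def process_document_safe(self, filename):
    "Read a YAML file with the safe loader and return a Document object"
    with file_stream(filename, encoding=None) as f:
        data = yaml.safe_load(f)  # restricts input to plain YAML types
    return Document(filename, data)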
Example #3
File: futil.py Project: fooyou/tgen
def read_ttrees(ttree_file):
    """Read t-trees from a YAML/Pickle file."""
    if "pickle" in ttree_file:
        # if pickled, read just the pickle
        fh = file_stream(ttree_file, mode="rb", encoding=None)
        unpickler = pickle.Unpickler(fh)
        ttrees = unpickler.load()
        fh.close()
    else:
        # if not pickled, read YAML and save a pickle nearby
        yaml_reader = YAMLReader(scenario=None, args={})
        ttrees = yaml_reader.process_document(ttree_file)
        pickle_file = ttree_file.replace("yaml", "pickle")
        fh = file_stream(pickle_file, mode="wb", encoding=None)
        pickle.Pickler(fh, pickle.HIGHEST_PROTOCOL).dump(ttrees)
        fh.close()
    return ttrees
Example #4
 def save_to_file(self, model_file):
     """\
     Save the model to a pickle file or stream (supports GZip compression).
     """
     log_info('Saving model to file ' + str(model_file))
     fh = file_stream(model_file, mode='wb', encoding=None)
     pickle.Pickler(fh, pickle.HIGHEST_PROTOCOL).dump(self)
     fh.close()
     log_info('Model successfully saved.')
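The file_stream helper used throughout these examples comes from alex.components.nlg.tectotpl.core.util (its import is visible in the create_training_job example below) and, per the docstrings, transparently supports GZip compression. Its implementation is not shown here; a minimal sketch of such a helper, assuming it only needs to pick gzip vs. plain files by suffix and optionally wrap the stream in a codec, might be:

import codecs
import gzip

def file_stream(filename, mode='r', encoding='UTF-8'):
    """Open a (possibly .gz-compressed) file; wrap it in a codec reader/writer
    when a text encoding is given. Sketch only, not the real implementation."""
    if filename.endswith('.gz'):
        fh = gzip.open(filename, mode)
    else:
        fh = open(filename, mode)
    if encoding is None:
        return fh  # raw/binary stream, e.g. for pickling
    if 'w' in mode or 'a' in mode:
        return codecs.getwriter(encoding)(fh)
    return codecs.getreader(encoding)(fh)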
Example #5
File: futil.py Project: fooyou/tgen
def read_das(da_file):
    """Read dialogue acts from a file, one-per-line."""
    das = []
    with file_stream(da_file) as fh:
        for line in fh:
            da = DialogueAct()
            da.parse(line)
            das.append(da)
    return das
Example #6
File: yaml.py Project: AoJ/alex
 def process_document(self, doc):
     "Write a YAML document"
     data = []
     for bundle in doc.bundles:
         data.append(self.serialize_bundle(bundle))
     out = file_stream(self.get_output_file_name(doc), 'w', encoding=None)
     out.write(yaml.safe_dump(data, allow_unicode=True,
                              explicit_start=True))
     out.close()
Example #7
 def load_from_file(model_file):
     """\
     Load the model from a pickle file or stream
     (supports GZip compression).
     """
     log_info('Loading model from file ' + str(model_file))
     fh = file_stream(model_file, mode='rb', encoding=None)
     unpickler = pickle.Unpickler(fh)
     model = unpickler.load()
     fh.close()
     log_info('Model loaded successfully.')
     return model
Example #8
def percrank_train(args):
    opts, files = getopt(args, 'c:d:s:j:w:e:')
    candgen_model = None
    train_size = 1.0
    parallel = False
    jobs_number = 0
    work_dir = None
    experiment_id = None

    for opt, arg in opts:
        if opt == '-d':
            set_debug_stream(file_stream(arg, mode='w'))
        elif opt == '-s':
            train_size = float(arg)
        elif opt == '-c':
            candgen_model = arg
        elif opt == '-j':
            parallel = True
            jobs_number = int(arg)
        elif opt == '-w':
            work_dir = arg
        elif opt == '-e':
            experiment_id = arg

    if len(files) != 4:
        sys.exit(__doc__)

    fname_rank_config, fname_train_das, fname_train_ttrees, fname_rank_model = files
    log_info('Training perceptron ranker...')

    rank_config = Config(fname_rank_config)
    if candgen_model:
        rank_config['candgen_model'] = candgen_model
    if rank_config.get('nn'):
        if rank_config['nn'] == 'emb':
            ranker_class = EmbNNRanker
        else:
            ranker_class = SimpleNNRanker
    else:
        ranker_class = PerceptronRanker
    if not parallel:
        ranker = ranker_class(rank_config)
    else:
        rank_config['jobs_number'] = jobs_number
        if work_dir is None:
            work_dir, _ = os.path.split(fname_rank_config)
        ranker = ParallelRanker(rank_config, work_dir, experiment_id, ranker_class)
    ranker.train(fname_train_das, fname_train_ttrees, data_portion=train_size)
    ranker.save_to_file(fname_rank_model)
Example #9
 def process_document(self, filename):
     """\
     Read a Tecto-Template file and return its contents as
     a Document object.
     """
     fh = file_stream(filename, encoding=self.encoding)
     doc = Document(filename)
     for line in fh:
         bundle = doc.create_bundle()
         zone = bundle.create_zone(self.language, self.selector)
         ttree = zone.create_ttree()
         self.parse_line(line, ttree)
         log_info('Parsed a tree with %d nodes.' %
                  len(ttree.get_descendants()))
     fh.close()
     return doc
Example #10
File: dataset.py Project: AoJ/alex
 def save_to_arff(self, filename, encoding='UTF-8'):
     """
     Save the data set to an ARFF file
     """
     # open the file
     fh = file_stream(filename, 'w', encoding)
     # print the relation name
     print >> fh, '@relation ' + (self.relation_name
                                  if self.relation_name is not None
                                  else '<noname>')
     # print the list of attributes
     for attrib in self.attribs:
         print >> fh, '@attribute ' + attrib.name + ' ' + \
                 attrib.get_arff_type()
     # print instances
     print >> fh, '@data'
     for inst, weight in zip(self.data, self.inst_weights):
         print >> fh, self.__get_arff_line(inst, weight)
Example #11
 def create_training_job(config,
                         work_dir,
                         train_file,
                         name=None,
                         memory=8,
                         encoding='UTF-8'):
     """\
     Submit a training process on the cluster which will save the
     model to a pickle. Return the submitted job and the future location of
     the model pickle.
     train_file cannot be a stream, it must be an actual file.
     """
     # purge name
     if name is None:
         name = 'TR-' + re.sub(r'[^A-Za-z0-9_]', '_', train_file)
     else:
         name = re.sub(r'[^A-Za-z0-9_]', '_', name)
     # create working directory, if not existing
     if not os.path.isdir(work_dir):
         os.mkdir(work_dir)
     train_file = os.path.abspath(train_file)
     # generate model file name
     model_file = os.path.abspath(
         os.path.join(work_dir, name + '-model.pickle.gz'))
     config_pickle = os.path.abspath(
         os.path.join(work_dir, name + '-cfg.pickle.gz'))
     # create the configuration pickle
     fh = file_stream(config_pickle, mode='wb', encoding=None)
     pickle.Pickler(fh, pickle.HIGHEST_PROTOCOL).dump(config)
     fh.close()
     # create the job
     job = Job(name=name, work_dir=work_dir)
     job.code = "fh = file_stream('" + config_pickle + \
             "', mode='rb', encoding=None)\n" + \
             "cfg = pickle.Unpickler(fh).load()\n" + \
             "fh.close()\n" + \
             "model = Model(cfg)\n" + \
             "model.train('" + train_file + "', encoding='" + \
             encoding + "')\n" \
             "model.save_to_file('" + model_file + "')\n"
     job.header += "from alex.components.nlg.tectotpl.tool.ml.model import Model\n" + \
             "import pickle\n" + \
             "from alex.components.nlg.tectotpl.core.util import file_stream\n"
     return job, model_file
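For illustration, the header and code strings assembled above correspond to a short worker script along the following lines; the paths are hypothetical placeholders, and the header is presumably emitted before the code when the job is written out.

from alex.components.nlg.tectotpl.tool.ml.model import Model
import pickle
from alex.components.nlg.tectotpl.core.util import file_stream

fh = file_stream('/work/TR-train_txt-cfg.pickle.gz', mode='rb', encoding=None)
cfg = pickle.Unpickler(fh).load()
fh.close()
model = Model(cfg)
model.train('/work/train.txt', encoding='UTF-8')
model.save_to_file('/work/TR-train_txt-model.pickle.gz')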
Example #12
def run_worker(head_host, head_port, debug_out=None):
    # setup debugging output, if applicable
    if debug_out is not None:
        set_debug_stream(file_stream(debug_out, mode='w'))
    # start the server (in the background)
    log_info('Creating worker server...')
    server = ThreadPoolServer(service=RankerTrainingService, nbThreads=1)
    server_thread = Thread(target=server.start)
    server_thread.start()
    my_host = socket.getfqdn()
    log_info('Worker server created at %s:%d. Connecting to head at %s:%d...' %
             (my_host, server.port, head_host, head_port))
    # notify main about this server
    conn = connect(head_host, head_port, config={'allow_pickle': True})
    conn.root.register_worker(my_host, server.port)
    conn.close()
    log_info('Worker is registered with the head.')
    # now serve until we're killed (the server thread will continue to run)
    server_thread.join()
Example #13
File: dataset.py Project: AoJ/alex
 def load_from_arff(self, filename, encoding='UTF-8'):
     """
     Load an ARFF file/stream, filling the data structures.
     """
     # initialize
     if not self.is_empty:
         raise IOError('Cannot store second data set into the same object.')
     status = 'header'  # we first assume to read the header
     line_num = 1  # line counter
     instances = []
     weights = []
     # open the file
     fh = file_stream(filename, encoding=encoding)
     # parse the file
     for line in fh:
         line = line.strip()
         # skip comments
         if line.startswith('%'):
             continue
         # relation name
         elif line.lower().startswith('@relation'):
             self.relation_name = line.split(None, 1)[1]
         # attribute definition
         elif line.lower().startswith('@attribute'):
             attr_name, attr_type = line.split(None, 2)[1:]
             self.attribs.append(Attribute(attr_name, attr_type))
         # data section start
         elif line.lower().startswith('@data'):
             status = 'data'
         # data lines
         elif status == 'data' and line != '':
             inst, weight = self.__parse_line(line, line_num)
             instances.append(inst)
             weights.append(weight)
         line_num += 1
     fh.close()
     # store the resulting matrix
     self.data = instances
     self.inst_weights = weights
     # remember attribute names
     self.attribs_by_name = {attr.name: idx
                             for idx, attr in enumerate(self.attribs)}
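For reference, the loader above expects standard Weka-style ARFF input: an optional @relation line, one @attribute line per column, instance lines after @data, and '%' comment lines that are skipped. A minimal hypothetical input is sketched below as a Python string; the exact instance-line syntax, including instance weights, is handled by __parse_line, which is not shown in the example.

# hypothetical ARFF content accepted by load_from_arff above
EXAMPLE_ARFF = """\
% a comment line, skipped by the parser
@relation example
@attribute word string
@attribute count numeric
@data
'hello',1
'world',2
"""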
Example #14
File: rank.py Project: fooyou/tgen
 def load_from_file(model_fname):
     """Load a pre-trained model from a file."""
     log_info("Loading ranker from %s..." % model_fname)
     with file_stream(model_fname, 'rb', encoding=None) as fh:
         return pickle.load(fh)
Example #15
 def save_to_file(self, fname):
     log_info('Saving model to ' + fname)
     with file_stream(fname, mode='wb', encoding=None) as fh:
         pickle.dump(self, fh, pickle.HIGHEST_PROTOCOL)
Example #16
def asearch_gen(args):
    """A*search generation"""

    opts, files = getopt(args, 'e:d:w:c:s:')
    eval_file = None
    fname_ttrees_out = None
    cfg_file = None
    eval_selector = ''

    for opt, arg in opts:
        if opt == '-e':
            eval_file = arg
        elif opt == '-s':
            eval_selector = arg
        elif opt == '-d':
            set_debug_stream(file_stream(arg, mode='w'))
        elif opt == '-w':
            fname_ttrees_out = arg
        elif opt == '-c':
            cfg_file = arg

    if len(files) != 3:
        sys.exit('Invalid arguments.\n' + __doc__)
    fname_cand_model, fname_rank_model, fname_da_test = files

    log_info('Initializing...')
    candgen = RandomCandidateGenerator.load_from_file(fname_cand_model)
    ranker = PerceptronRanker.load_from_file(fname_rank_model)
    cfg = Config(cfg_file) if cfg_file else {}
    cfg.update({'candgen': candgen, 'ranker': ranker})
    tgen = ASearchPlanner(cfg)

    log_info('Generating...')
    das = read_das(fname_da_test)

    if eval_file is None:
        gen_doc = Document()
    else:
        eval_doc = read_ttrees(eval_file)
        if eval_selector == tgen.selector:
            gen_doc = Document()
        else:
            gen_doc = eval_doc

    # generate and evaluate
    if eval_file is not None:
        # generate + analyze open&close lists
        lists_analyzer = ASearchListsAnalyzer()
        for num, (da, gold_tree) in enumerate(zip(das,
                                                  trees_from_doc(eval_doc, tgen.language, eval_selector)),
                                              start=1):
            log_debug("\n\nTREE No. %03d" % num)
            open_list, close_list = tgen.generate_tree(da, gen_doc, return_lists=True)
            lists_analyzer.append(gold_tree, open_list, close_list)
            gen_tree = close_list.peek()[0]
            if gen_tree != gold_tree:
                log_debug("\nDIFFING TREES:\n" + tgen.ranker.diffing_trees_with_scores(da, gold_tree, gen_tree) + "\n")

        log_info('Gold tree BEST: %.4f, on CLOSE: %.4f, on ANY list: %.4f' % lists_analyzer.stats())

        # evaluate the generated trees against golden trees
        eval_ttrees = ttrees_from_doc(eval_doc, tgen.language, eval_selector)
        gen_ttrees = ttrees_from_doc(gen_doc, tgen.language, tgen.selector)

        log_info('Evaluating...')
        evaler = Evaluator()
        for eval_bundle, eval_ttree, gen_ttree, da in zip(eval_doc.bundles, eval_ttrees, gen_ttrees, das):
            # add some stats about the tree directly into the output file
            add_bundle_text(eval_bundle, tgen.language, tgen.selector + 'Xscore',
                            "P: %.4f R: %.4f F1: %.4f" % p_r_f1_from_counts(*corr_pred_gold(eval_ttree, gen_ttree)))

            # collect overall stats
            evaler.append(eval_ttree,
                          gen_ttree,
                          ranker.score(TreeData.from_ttree(eval_ttree), da),
                          ranker.score(TreeData.from_ttree(gen_ttree), da))
        # print overall stats
        log_info("NODE precision: %.4f, Recall: %.4f, F1: %.4f" % evaler.p_r_f1())
        log_info("DEP  precision: %.4f, Recall: %.4f, F1: %.4f" % evaler.p_r_f1(EvalTypes.DEP))
        log_info("Tree size stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" % evaler.tree_size_stats())
        log_info("Score stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" % evaler.score_stats())
        log_info("Common subtree stats:\n -- SIZE: %s\n -- ΔGLD: %s\n -- ΔPRD: %s" %
                 evaler.common_subtree_stats())
    # just generate
    else:
        for da in das:
            tgen.generate_tree(da, gen_doc)

    # write output
    if fname_ttrees_out is not None:
        log_info('Writing output...')
        write_ttrees(gen_doc, fname_ttrees_out)
Example #17
File: rank.py Project: fooyou/tgen
 def save_to_file(self, model_fname):
     """Save the model to a file."""
     log_info("Saving ranker to %s..." % model_fname)
     with file_stream(model_fname, 'wb', encoding=None) as fh:
         pickle.dump(self, fh, protocol=pickle.HIGHEST_PROTOCOL)