Example #1
    def _init_training(self, das_file, ttree_file, data_portion):
        """Initialize training.

        Store input data, initialize 1-hot feature representations for input and output and
        transform training data accordingly, initialize the classification neural network.
        """
        # read input
        log_info('Reading DAs from ' + das_file + '...')
        das = read_das(das_file)
        log_info('Reading t-trees from ' + ttree_file + '...')
        ttree_doc = read_ttrees(ttree_file)
        trees = trees_from_doc(ttree_doc, self.language, self.selector)

        # make training data smaller if necessary
        train_size = int(round(data_portion * len(trees)))
        self.train_trees = trees[:train_size]
        self.train_das = das[:train_size]

        # add empty tree + empty DA to training data
        # (i.e. forbid the network to keep any of its outputs "always-on")
        train_size += 1
        self.train_trees.append(TreeData())
        empty_da = DialogueAct()
        empty_da.parse('inform()')
        self.train_das.append(empty_da)

        self.train_order = range(len(self.train_trees))
        log_info('Using %d training instances.' % train_size)

        # initialize input features/embeddings
        if self.tree_embs:
            self.dict_size = self.tree_embs.init_dict(self.train_trees)
            self.X = np.array([self.tree_embs.get_embeddings(tree) for tree in self.train_trees])
        else:
            self.tree_feats = Features(['node: presence t_lemma formeme'])
            self.tree_vect = DictVectorizer(sparse=False, binarize_numeric=True)
            self.X = [self.tree_feats.get_features(tree, {}) for tree in self.train_trees]
            self.X = self.tree_vect.fit_transform(self.X)

        # initialize output features
        self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
        self.da_vect = DictVectorizer(sparse=False, binarize_numeric=True)
        self.y = [self.da_feats.get_features(None, {'da': da}) for da in self.train_das]
        self.y = self.da_vect.fit_transform(self.y)

        # initialize I/O shapes
        self.input_shape = [list(self.X[0].shape)]
        self.num_outputs = len(self.da_vect.get_feature_names())

        # initialize NN classifier
        self._init_neural_network()
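
A minimal sketch of what the 1-hot vectorization step above produces, using scikit-learn's DictVectorizer as a stand-in for tgen's own vectorizer (which additionally offers binarize_numeric); the feature dicts below are made up for illustration:

from sklearn.feature_extraction import DictVectorizer

feat_dicts = [
    {'dat: inform': 1, 'svp: food=Italian': 1},  # hypothetical DA feature dicts
    {'dat: inform': 1, 'svp: area=centre': 1},
]
vect = DictVectorizer(sparse=False)
y = vect.fit_transform(feat_dicts)
# y.shape == (2, 3): one 0/1 column per distinct feature
# vect.feature_names_ lists the columns (what get_feature_names() is used for above)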
Example #2
 def load_surface_forms(self, surface_forms_fname):
     """Load all proper name surface forms from a file."""
     log_info('Loading surface forms from %s...' % surface_forms_fname)
     with file_stream(surface_forms_fname) as fh:
         data = json.load(fh)
     for slot, values in data.iteritems():
         sf_all = {}
         sf_formeme = {}
         sf_tag = {}
         for value in values.keys():
             for surface_form in values[value]:
                 form, tag = surface_form.split("\t")
                 if slot == 'street':  # add street number placeholders to addresses
                     value += ' _'
                     slot = 'address'
                 # store the value globally + for all possible tag subsets/formemes
                 sf_all[value] = sf_all.get(value, []) + [form]
                 sf_tag[value] = sf_tag.get(value, {})
                 sf_formeme[value] = sf_formeme.get(value, {})
                 for tag_subset in self._get_tag_subsets(tag):
                     sf_tag[value][tag_subset] = sf_tag[value].get(
                         tag_subset, []) + [form]
                 for formeme in self._get_compatible_formemes(tag):
                     sf_formeme[value][formeme] = sf_formeme[value].get(
                         formeme, []) + [form]
         self._sf_all[slot] = sf_all
         self._sf_by_formeme[slot] = sf_formeme
         self._sf_by_tag[slot] = sf_tag
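
The surface-forms file itself is not shown here; the sketch below illustrates the JSON shape implied by the parsing loop above (slot -> value -> list of tab-separated "form<TAB>tag" strings). The slot, value and tag are hypothetical:

# Hypothetical surface_forms.json content implied by the code above:
# {"street": {"Main Street": ["Main Street\tNNP", "Main St\tNNP"]}}
surface_form = "Main Street\tNNP"
form, tag = surface_form.split("\t")  # form == 'Main Street', tag == 'NNP'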
Example #3
    def save_to_file(self, model_fname):
        """This will actually just move the best generator (which is saved in a temporary file)
        to the final location."""
        log_info('Moving generator to %s...' % model_fname)
        orig_model_fname = self.model_temp_path
        shutil.move(orig_model_fname, model_fname)
        orig_tf_session_fname = re.sub(r'(.pickle)?(.gz)?$', '.tfsess',
                                       orig_model_fname)
        tf_session_fname = re.sub(r'(.pickle)?(.gz)?$', '.tfsess', model_fname)
        if os.path.isfile(orig_tf_session_fname):
            shutil.move(orig_tf_session_fname, tf_session_fname)

        # move the reranking classifier model files as well, if they exist
        orig_clfilter_fname = re.sub(r'((.pickle)?(.gz)?)$', r'.tftreecl\1',
                                     orig_model_fname)
        orig_clfilter_tf_fname = re.sub(r'((.pickle)?(.gz)?)$', r'.tfsess',
                                        orig_clfilter_fname)

        if os.path.isfile(orig_clfilter_fname) and os.path.isfile(
                orig_clfilter_tf_fname):
            clfilter_fname = re.sub(r'((.pickle)?(.gz)?)$', r'.tftreecl\1',
                                    model_fname)
            clfilter_tf_fname = re.sub(r'((.pickle)?(.gz)?)$', r'.tfsess',
                                       clfilter_fname)
            shutil.move(orig_clfilter_fname, clfilter_fname)
            shutil.move(orig_clfilter_tf_fname, clfilter_tf_fname)
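
The companion file names are derived purely by regex substitution on the model file name. A small sketch with a hypothetical name shows what the two patterns above produce; note that the codebase relies on Python 2 re.sub semantics (Python 3.7+ handles the trailing empty match differently):

import re

model_fname = 'model.pickle.gz'  # hypothetical name
re.sub(r'(.pickle)?(.gz)?$', '.tfsess', model_fname)
# -> 'model.tfsess' (suffix replaced)
re.sub(r'((.pickle)?(.gz)?)$', r'.tftreecl\1', model_fname)
# -> 'model.tftreecl.pickle.gz' (the captured suffix is kept and re-appended)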
Example #4
    def train(self, train_sents, valid_sents=None):
        """Train the RNNLM on the given data (list of lists of tokens).
        @param train_sents: training data (list of lists of tokens, lexicalized)
        @param valid_sents: validation data (list of lists of tokens, lexicalized, may be None \
            if no validation should be performed)
        """
        self._init_training(train_sents, valid_sents)

        top_perp = float('nan')

        for iter_no in xrange(1, self.passes + 1):
            # preparing parameters
            iter_alpha = self.alpha * np.exp(-self.alpha_decay * iter_no)
            self._train_order = range(len(self._train_data))
            if self.randomize:
                rnd.shuffle(self._train_order)
            # training
            self._training_pass(iter_no, iter_alpha)

            # validation
            if (self.validation_freq and iter_no > self.min_passes
                    and iter_no % self.validation_freq == 0):
                perp = self._valid_perplexity()
                log_info("Perplexity: %.3f" % perp)
                # if we have the best model so far, save it as a checkpoint (overwrite previous)
                if math.isnan(top_perp) or perp < top_perp:
                    top_perp = perp
                    self._save_checkpoint()

        self._restore_checkpoint()  # restore the best parameters so far
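
The per-pass learning rate follows a simple exponential decay, iter_alpha = alpha * exp(-alpha_decay * iter_no). A short worked example with made-up config values:

import numpy as np

alpha, alpha_decay = 0.1, 0.05  # hypothetical config values
for iter_no in range(1, 4):
    iter_alpha = alpha * np.exp(-alpha_decay * iter_no)
    # pass 1: ~0.0951, pass 2: ~0.0905, pass 3: ~0.0861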
Example #5
def percrank_train(args):
    opts, files = getopt(args, 'c:d:s:j:w:e:r:')
    candgen_model = None
    train_size = 1.0
    parallel = False
    jobs_number = 0
    work_dir = None
    experiment_id = None

    for opt, arg in opts:
        if opt == '-d':
            set_debug_stream(file_stream(arg, mode='w'))
        elif opt == '-s':
            train_size = float(arg)
        elif opt == '-c':
            candgen_model = arg
        elif opt == '-j':
            parallel = True
            jobs_number = int(arg)
        elif opt == '-w':
            work_dir = arg
        elif opt == '-e':
            experiment_id = arg
        elif opt == '-r' and arg:
            rnd.seed(arg)

    if len(files) != 4:
        sys.exit(__doc__)

    fname_rank_config, fname_train_das, fname_train_ttrees, fname_rank_model = files
    log_info('Training perceptron ranker...')

    rank_config = Config(fname_rank_config)
    if candgen_model:
        rank_config['candgen_model'] = candgen_model
    if rank_config.get('nn'):
        from tgen.rank_nn import SimpleNNRanker, EmbNNRanker
        if rank_config['nn'] in ['emb', 'emb_trees', 'emb_prev']:
            ranker_class = EmbNNRanker
        else:
            ranker_class = SimpleNNRanker
    else:
        ranker_class = PerceptronRanker

    log_info('Using %s for ranking' % ranker_class.__name__)

    if not parallel:
        ranker = ranker_class(rank_config)
    else:
        rank_config['jobs_number'] = jobs_number
        if work_dir is None:
            work_dir, _ = os.path.split(fname_rank_config)
        ranker = ParallelRanker(rank_config, work_dir, experiment_id,
                                ranker_class)

    ranker.train(fname_train_das, fname_train_ttrees, data_portion=train_size)

    # avoid the "maximum recursion depth exceeded" error
    sys.setrecursionlimit(100000)
    ranker.save_to_file(fname_rank_model)
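
For reference, a minimal sketch of how getopt splits such a command line into options and positional files (the option values and file names below are made up):

from getopt import getopt

argv = ['-s', '0.5', '-j', '4',
        'ranker.yaml', 'train-das.txt', 'train-trees.yaml.gz', 'ranker.pickle.gz']
opts, files = getopt(argv, 'c:d:s:j:w:e:r:')
# opts  == [('-s', '0.5'), ('-j', '4')]
# files == ['ranker.yaml', 'train-das.txt', 'train-trees.yaml.gz', 'ranker.pickle.gz']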
Example #6
def read_system_training_data(filename):
    insts = []
    for inst in pd.read_csv(filename, index_col=None,
                            encoding='UTF-8').to_dict('records'):
        insts.append({
            'dataset':
            'E2E',
            'mr':
            DA.parse_diligent_da(inst['mr']).to_cambridge_da_string(),
            'delex_mr':
            DA.parse_diligent_da(inst['mr']).get_delexicalized(
                set(['name', 'near'])).to_cambridge_da_string(),
            'system':
            'HUMAN',
            'system_ref':
            None,
            'orig_ref':
            inst['ref'],
            'informativeness':
            None,
            'naturalness':
            None,
            'quality':
            None,
            'is_real':
            0
        })
    log_info(
        "Using %d different training human references to create fake pairs" %
        len(insts))
    return insts
Example #7
 def _delex_texts(self):
     """Delexicalize texts in the buffers and save them separately in the member variables,
     along with the delexicalization instructions used for the operation."""
     self._delexed_texts = []
     self._absts = []
     for text_idx, (text, da) in enumerate(zip(self._sents, self._das)):
         delex_text = []
         absts = []
         # do the delexicalization, keep track of which slots we used
         for tok_idx, (form, lemma, tag) in enumerate(text):
             slot = da.has_value(lemma)
             if slot and slot in self._abst_slots:
                 delex_text.append(('X-' + slot, 'X-' + slot, tag))
                 absts.append(Abst(slot, lemma, form, tok_idx, tok_idx + 1))
             else:
                 delex_text.append((form, lemma, tag))
         # fix coordinated delexicalized values
         self._delex_fix_coords(delex_text, da, absts)
         covered_slots = set([a.slot for a in absts])
         # check and warn if we left something non-delexicalized
         for dai in da:
             if (dai.slot in self._abst_slots
                     and dai.value not in [None, 'none', 'dont_care']
                     and dai.slot not in covered_slots):
                 log_info(
                     "Cannot delexicalize slot  %s  at %d:\nDA: %s\nTx: %s\n"
                     % (dai.slot, text_idx, unicode(da), " ".join(
                         [form for form, _, _ in text])))
         # save the delexicalized text and the delexicalization instructions
         self._delexed_texts.append(delex_text)
         self._absts.append(absts)
Example #8
 def save_to_file(self, model_fname):
     log_info("Saving classifier to %s..." % model_fname)
     with file_stream(model_fname, 'wb', encoding=None) as fh:
         pickle.dump(self.__class__, fh, protocol=pickle.HIGHEST_PROTOCOL)
         pickle.dump(self.get_all_settings(),
                     fh,
                     protocol=pickle.HIGHEST_PROTOCOL)
Example #9
def create_fake_pairs(fake_insts, data_len):
    """Given fake instances (ordered by the level of distortion & in the same order across the
    distortion levels: A-0, B-0..., A-1, B-1..., A-2, B-2... etc.), this creates pairs
    of instances for ranking (e.g. A-0 is better than A-2 etc.)."""
    log_info('Creating fake pairs...')
    # create a new dataframe with the same columns, plus 2nd system reference
    fake_pairs = []
    max_distort = len(fake_insts) / data_len  # should be an integer
    for inst_no in xrange(data_len):
        # add perfect vs. imperfect
        distort_levels = [(0, lev) for lev in range(1, max_distort)]
        # sample 5 pairs of different degrees of distortion
        pairs = list(combinations(range(1, max_distort), 2))
        distort_levels += [
            pairs[i] for i in np.random.choice(len(pairs), 5, replace=False)
        ]
        # choose the instances based on the distortion levels, create the pair instances
        for better, worse in distort_levels:
            new_inst = dict(fake_insts.iloc[inst_no + better * data_len])
            new_inst['system_ref2'] = fake_insts.iloc[inst_no + worse *
                                                      data_len]['system_ref']
            del new_inst['informativeness']
            del new_inst['naturalness']
            del new_inst['quality']
            # add both naturalness and quality, ignore informativeness here
            for quant in ['naturalness', 'quality']:
                fake_pairs.append(dict(new_inst, **{quant: 1}))
    log_info('Created %d fake pairs.' % len(fake_pairs))
    return pd.DataFrame.from_records(fake_pairs)
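
A small worked example of the pairing scheme above, with a hypothetical max_distort of 4 and the random sampling left out; level 0 is the undistorted reference and higher levels are more distorted, so the first member of each pair is always the better one:

from itertools import combinations

max_distort = 4  # hypothetical value
distort_levels = [(0, lev) for lev in range(1, max_distort)]
# -> [(0, 1), (0, 2), (0, 3)]: perfect vs. each distortion level
pairs = list(combinations(range(1, max_distort), 2))
# -> [(1, 2), (1, 3), (2, 3)]: candidates for the sampled distorted-vs-distorted pairs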
Example #10
    def train(self, fnames, train_trees, valid_trees=None):
        """Train the lexicalizer (including its LM, if applicable).
        @param fnames: file names for surface forms (JSON) and training data lexicalization \
            instructions
        @param train_trees: loaded generator training data (TreeData trees/lists of lemma-tag \
            or form-tag pairs)
        """
        log_info('Training lexicalizer...')
        if not fnames:
            return
        valid_abst_fname = None
        if ',' in fnames:
            fnames = fnames.split(',')
            if len(fnames) == 3:
                surface_forms_fname, train_abst_fname, valid_abst_fname = fnames
            else:
                surface_forms_fname, train_abst_fname = fnames
        else:
            surface_forms_fname, train_abst_fname = fnames, None

        self.load_surface_forms(surface_forms_fname)
        if train_abst_fname and not isinstance(self._form_select,
                                               RandomFormSelect):
            log_info(
                'Training lexicalization LM from training trees and %s...' %
                train_abst_fname)
            self._form_select.train(*self._prepare_train_toks(
                train_trees, train_abst_fname, valid_trees, valid_abst_fname))
Example #11
    def load_from_file(model_fname):
        """Load the generator from a file (actually two files, one for configuration and one
        for the TensorFlow graph, which must be stored separately).

        @param model_fname: file name (for the configuration file); TF graph must be stored with a \
            different extension
        """
        log_info("Loading generator from %s..." % model_fname)
        with file_stream(model_fname, 'rb', encoding=None) as fh:
            data = pickle.load(fh)
            ret = Seq2SeqGen(cfg=data['cfg'])
            ret.load_all_settings(data)

        if ret.classif_filter:
            classif_filter_fname = re.sub(r'((.pickle)?(.gz)?)$', r'.tftreecl\1', model_fname)
            if os.path.isfile(classif_filter_fname):
                ret.classif_filter = RerankingClassifier.load_from_file(classif_filter_fname)
            else:
                log_warn("Classification filter data not found, ignoring.")
                ret.classif_filter = False

        # re-build TF graph and restore the TF session
        tf_session_fname = re.sub(r'(.pickle)?(.gz)?$', '.tfsess', model_fname)
        ret._init_neural_network()
        ret.saver.restore(ret.session, tf_session_fname)

        return ret
Example #12
    def load_from_file(model_fname):
        """Load the generator from a file (actually two files, one for configuration and one
        for the TensorFlow graph, which must be stored separately).

        @param model_fname: file name (for the configuration file); TF graph must be stored with a \
            different extension
        """
        log_info("Loading generator from %s..." % model_fname)
        with file_stream(model_fname, 'rb', encoding=None) as fh:
            data = pickle.load(fh)
            ret = Seq2SeqGen(cfg=data['cfg'])
            ret.load_all_settings(data)

        if ret.classif_filter:
            classif_filter_fname = re.sub(r'((.pickle)?(.gz)?)$',
                                          r'.tftreecl\1', model_fname)
            if os.path.isfile(classif_filter_fname):
                ret.classif_filter = RerankingClassifier.load_from_file(
                    classif_filter_fname)
            else:
                log_warn("Classification filter data not found, ignoring.")
                ret.classif_filter = False

        # re-build TF graph and restore the TF session
        tf_session_fname = re.sub(r'(.pickle)?(.gz)?$', '.tfsess', model_fname)
        ret._init_neural_network()
        ret.saver.restore(ret.session, tf_session_fname)

        return ret
Example #13
    def save_to_file(self, model_fname):
        """Save the whole ensemble into a file (get all settings and parameters, dump them in a
        pickle)."""
        # TODO support for lexicalizer

        log_info("Saving generator to %s..." % model_fname)
        with file_stream(model_fname, 'wb', encoding=None) as fh:
            pickle.dump(self.__class__, fh, protocol=pickle.HIGHEST_PROTOCOL)
            pickle.dump(self.cfg, fh, protocol=pickle.HIGHEST_PROTOCOL)

            gens_dump = []
            for gen in self.gens:
                setting = gen.get_all_settings()
                parset = gen.get_model_params()
                setting['classif_filter'] = self.classif_filter is not None
                gens_dump.append((setting, parset))

            pickle.dump(gens_dump, fh, protocol=pickle.HIGHEST_PROTOCOL)

            if self.classif_filter:
                pickle.dump(self.classif_filter.get_all_settings(),
                            fh,
                            protocol=pickle.HIGHEST_PROTOCOL)
                pickle.dump(self.classif_filter.get_model_params(),
                            fh,
                            protocol=pickle.HIGHEST_PROTOCOL)
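
Since pickle.dump is called several times on the same stream, the objects have to be read back with pickle.load in the same order. A rough sketch of the matching load side (plain open and a made-up file name instead of file_stream; not the library's actual loader):

import pickle

with open('ensemble.pickle', 'rb') as fh:  # hypothetical file name
    cls = pickle.load(fh)        # self.__class__
    cfg = pickle.load(fh)        # self.cfg
    gens_dump = pickle.load(fh)  # list of (settings, params) pairs, one per generator
    # two more loads follow only if a classification filter was saved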
Example #14
    def evaluate_file(self, das_file, ttree_file):
        """Evaluate the reranking classifier on a given pair of DA/tree files (show the
        total Hamming distance and total number of DAIs)

        @param das_file: DA file path
        @param ttree_file: trees/sentences file path
        @return: a tuple (total DAIs, distance)
        """
        log_info('Reading DAs from ' + das_file + '...')
        das = read_das(das_file)
        log_info('Reading t-trees/tokens from ' + ttree_file + '...')
        trees = read_trees_or_tokens(ttree_file, self.mode, self.language, self.selector)
        if self.mode in ['tokens', 'tagged_lemmas']:
            trees = self._tokens_to_flat_trees(trees, use_tags=self.mode == 'tagged_lemmas')

        tot_len = 0
        tot_dist = 0
        classif_das = []
        for da, tree in zip(das, trees):
            tot_len += len(da)
            dist, classif = self.dist_to_da(da, [tree], return_classif=True)
            tot_dist += dist[0]
            classif_das.append(DA.parse_features(classif[0]))

        return tot_len, tot_dist, classif_das
Example #15
    def save_to_file(self, model_fname):
        """Save the generator to a file (actually two files, one for configuration and one
        for the TensorFlow graph, which must be stored separately).

        @param model_fname: file name (for the configuration file); TF graph will be stored with a \
            different extension
        """
        log_info("Saving generator to %s..." % model_fname)
        if self.classif_filter:
            classif_filter_fname = re.sub(r'((.pickle)?(.gz)?)$',
                                          r'.tftreecl\1', model_fname)
            self.classif_filter.save_to_file(classif_filter_fname)
        if self.lexicalizer:
            lexicalizer_fname = re.sub(r'((.pickle)?(.gz)?)$', r'.lexic\1',
                                       model_fname)
            self.lexicalizer.save_to_file(lexicalizer_fname)

        with file_stream(model_fname, 'wb', encoding=None) as fh:
            pickle.dump(self.get_all_settings(),
                        fh,
                        protocol=pickle.HIGHEST_PROTOCOL)
        tf_session_fname = re.sub(r'(.pickle)?(.gz)?$', '.tfsess', model_fname)
        if hasattr(self, 'checkpoint_path') and self.checkpoint_path:
            shutil.copyfile(self.checkpoint_path, tf_session_fname)
        else:
            self.saver.save(self.session, tf_session_fname)
Example #16
def get_sys_outputs(data):
    """Get instances with individual system outputs (regardless of pairs)."""
    sys_outputs = {}
    mrs = {}
    for inst in data:
        mrs[inst['mr']] = inst['delex_mr']
        sys_outputs[(inst['mr'], inst['system'])] = inst['system_ref']
        sys_outputs[(inst['mr'], inst['system2'])] = inst['system_ref2']

    sys_outs_list = []
    for (mr, sys_name), output in sys_outputs.iteritems():
        sys_outs_list.append({
            'dataset': 'E2E',
            'mr': mr,
            'delex_mr': mrs[mr],
            'system': sys_name,
            'system_ref': None,
            'orig_ref': output,
            'informativeness': None,
            'naturalness': None,
            'quality': None,
            'is_real': 0
        })
    log_info('Using %d different system outputs to create fake pairs.' %
             len(sys_outs_list))
    return sys_outs_list
Example #17
 def exposed_init_training(self, cfg):
     """Create the Seq2SeqGen object."""
     cfg = pickle.loads(cfg)
     tstart = time.time()
     log_info('Initializing training...')
     self.seq2seq = Seq2SeqGen(cfg)
     log_info('Training initialized. Time taken: %f secs.' % (time.time() - tstart))
Example #18
def percrank_train(args):
    opts, files = getopt(args, 'c:d:s:j:w:e:r:')
    candgen_model = None
    train_size = 1.0
    parallel = False
    jobs_number = 0
    work_dir = None
    experiment_id = None

    for opt, arg in opts:
        if opt == '-d':
            set_debug_stream(file_stream(arg, mode='w'))
        elif opt == '-s':
            train_size = float(arg)
        elif opt == '-c':
            candgen_model = arg
        elif opt == '-j':
            parallel = True
            jobs_number = int(arg)
        elif opt == '-w':
            work_dir = arg
        elif opt == '-e':
            experiment_id = arg
        elif opt == '-r' and arg:
            rnd.seed(arg)

    if len(files) != 4:
        sys.exit(__doc__)

    fname_rank_config, fname_train_das, fname_train_ttrees, fname_rank_model = files
    log_info('Training perceptron ranker...')

    rank_config = Config(fname_rank_config)
    if candgen_model:
        rank_config['candgen_model'] = candgen_model
    if rank_config.get('nn'):
        from tgen.rank_nn import SimpleNNRanker, EmbNNRanker
        if rank_config['nn'] in ['emb', 'emb_trees', 'emb_prev']:
            ranker_class = EmbNNRanker
        else:
            ranker_class = SimpleNNRanker
    else:
        ranker_class = PerceptronRanker

    log_info('Using %s for ranking' % ranker_class.__name__)

    if not parallel:
        ranker = ranker_class(rank_config)
    else:
        rank_config['jobs_number'] = jobs_number
        if work_dir is None:
            work_dir, _ = os.path.split(fname_rank_config)
        ranker = ParallelRanker(rank_config, work_dir, experiment_id, ranker_class)

    ranker.train(fname_train_das, fname_train_ttrees, data_portion=train_size)

    # avoid the "maximum recursion depth exceeded" error
    sys.setrecursionlimit(100000)
    ranker.save_to_file(fname_rank_model)
Example #19
 def _save_checkpoint(self):
     """Save a checkpoint to a temporary path; set `self.checkpoint_path` to the path
     where it is saved; if called repeatedly, will always overwrite the last checkpoint."""
     if not self.checkpoint_path:
         fh, path = tempfile.mkstemp(".ckpt", "tgen-", self.checkpoint_path)
         self.checkpoint_path = path
     log_info('Saving checkpoint to %s' % self.checkpoint_path)
     self.saver.save(self.session, self.checkpoint_path)
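
Note that tempfile.mkstemp returns a pair (OS-level file descriptor, path) and its third positional argument is the target directory (None on the first call here, so the system temp dir is used). A tiny sketch; closing the descriptor is the caller's responsibility:

import os
import tempfile

fd, path = tempfile.mkstemp('.ckpt', 'tgen-', None)  # suffix, prefix, dir
os.close(fd)  # the descriptor is separate from the path
# 'path' now names an empty temp file, e.g. '/tmp/tgen-XXXXXX.ckpt'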
Example #21
    def save_to_file(self, lexicalizer_fname):
        """Save the lexicalizer model to a file (and a second file with the LM, if needed)."""
        log_info("Saving lexicalizer to %s..." % lexicalizer_fname)
        with file_stream(lexicalizer_fname, 'wb', encoding=None) as fh:
            pickle.dump(self.get_all_settings(), fh, protocol=pickle.HIGHEST_PROTOCOL)

        if not isinstance(self._form_select, RandomFormSelect):
            self._form_select.save_model(lexicalizer_fname)
Example #22
 def _save_checkpoint(self):
     """Save a checkpoint to a temporary path; set `self.checkpoint_path` to the path
     where it is saved; if called repeatedly, will always overwrite the last checkpoint."""
     if not self.checkpoint_path:
         path = tempfile.mkdtemp(suffix="", prefix="tftreecl-")
         self.checkpoint_path = os.path.join(path, "ckpt")
     log_info('Saving checkpoint to %s' % self.checkpoint_path)
     self.saver.save(self.session, self.checkpoint_path)
Example #24
def add_fake_data(train_data, real_data, add_from='', create_pairs=''):
    """Adding fake data to the training set (return just the training set
    if there's nothing to add).
    @param train_data: training data (correct CV part if applicable)
    @param real_data: basis on which the fake data should be created
    @param add_from: T = include human refs from training data, \
        S = include system outputs in training data (in addition to real_data)
    @param create_pairs: create training pairs to rank ('' - not at all, \
        'add' - in addition to regular fakes, 'only' - exclusively)
    @return the enhanced (or unchanged) training set
    """
    if 'T' in add_from:
        log_info(
            "Will create fake data from human references in training data.")
        human_data = train_data.copy()
        refs = human_data['orig_ref'].str.split(' <\|> ').apply(pd.Series,
                                                                1).stack()
        refs.index = refs.index.droplevel(-1)
        refs.name = 'orig_ref'
        del human_data['orig_ref']
        human_data = human_data.join(refs).reset_index()
        human_data = human_data.groupby(
            ['mr', 'orig_ref'],  # delete scores
            as_index=False).agg(lambda vals: None)
        real_data = pd.concat((real_data, human_data), sort=True)
        train_data['orig_ref'] = ''

    if 'S' in add_from:
        log_info("Will create fake data from system outputs in training data.")
        # we keep the scores here, but use the outputs as orig references
        sys_outs = train_data.copy()
        del sys_outs['orig_ref']  # delete original human refs first
        sys_outs = sys_outs.rename(columns={'system_ref': 'orig_ref'})
        real_data = pd.concat((real_data, sys_outs), sort=True)

    # there is some fake data to be created and added
    if len(real_data):
        log_info("Creating fake data...")
        fake_data = create_fake_data(
            real_data,
            train_data.columns,
            score_type=('hter' if args.hter_score else 'nlg'))
        log_info("Created %d fake instances." % len(fake_data))
        # now we can add fake pairwise rankings
        if create_pairs:
            fake_pairs = create_fake_pairs(fake_data, len(real_data))
            if create_pairs == 'only':
                log_info(
                    'Only keeping fake pairs, forgetting individual instances.'
                )
                return pd.concat([fake_pairs, train_data], sort=True)
            else:
                return pd.concat([fake_data, fake_pairs, train_data],
                                 sort=True)
        return pd.concat([fake_data, train_data])

    # no fake data to be added -> return just the original
    return train_data
Example #25
    def train(self,
              das,
              trees,
              data_portion=1.0,
              valid_das=None,
              valid_trees=None):
        """Run training on the given training data.

        @param das: name of source file with training DAs, or list of DAs
        @param trees: name of source file with corresponding trees/sentences, or list of trees
        @param data_portion: portion of the training data to be used (defaults to 1.0)
        @param valid_das: validation data DAs
        @param valid_trees: list of lists of corresponding paraphrases (same length as valid_das)
        """

        log_info('Training reranking classifier...')

        # initialize training
        self._init_training(das, trees, data_portion)
        if self.mode in ['tokens', 'tagged_lemmas'
                         ] and valid_trees is not None:
            valid_trees = [
                self._tokens_to_flat_trees(
                    paraphrases, use_tags=self.mode == 'tagged_lemmas')
                for paraphrases in valid_trees
            ]

        # start training
        top_comb_cost = float('nan')

        for iter_no in xrange(1, self.passes + 1):
            self.train_order = range(len(self.train_trees))
            if self.randomize:
                rnd.shuffle(self.train_order)
            pass_cost, pass_diff = self._training_pass(iter_no)

            if self.validation_freq and iter_no > self.min_passes and iter_no % self.validation_freq == 0:

                valid_diff = 0
                if valid_das:
                    valid_diff = np.sum([
                        np.sum(self.dist_to_da(d, t))
                        for d, t in zip(valid_das, valid_trees)
                    ])

                # cost combining validation and training data performance
                # (+ "real" cost with negligible weight)
                comb_cost = 1000 * valid_diff + 100 * pass_diff + pass_cost
                log_info('Combined validation cost: %8.3f' % comb_cost)

                # if we have the best model so far, save it as a checkpoint (overwrite previous)
                if math.isnan(top_comb_cost) or comb_cost < top_comb_cost:
                    top_comb_cost = comb_cost
                    self._save_checkpoint()

        # restore last checkpoint (best performance on devel data)
        self.restore_checkpoint()
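
The weighting in comb_cost strongly favours validation-set performance over training-pass statistics; a quick calculation with made-up numbers shows the relative scale of the three terms:

valid_diff, pass_diff, pass_cost = 3, 12, 250.0  # hypothetical values
comb_cost = 1000 * valid_diff + 100 * pass_diff + pass_cost
# comb_cost == 4450.0, dominated by the validation term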
Example #26
def convert_model(model_fname):

    reset_default_graph()

    param_fname = re.sub(r'((.pickle)?(.gz)?)$', r'.params.gz', model_fname)
    log_info('Converting %s to %s...' % (model_fname, param_fname))
    model = Seq2SeqBase.load_from_file(model_fname)
    with file_stream(param_fname, 'wb', encoding=None) as fh:
        pickle.dump(model.get_model_params(), fh, protocol=pickle.HIGHEST_PROTOCOL)
Example #27
def seq2seq_train(args):

    ap = ArgumentParser(prog=' '.join(sys.argv[0:2]))

    ap.add_argument('-s', '--train-size', type=float,
                    help='Portion of the training data to use (default: 1.0)', default=1.0)
    ap.add_argument('-d', '--debug-logfile', type=str, help='Debug output file name')
    ap.add_argument('-j', '--jobs', type=int, help='Number of parallel jobs to use')
    ap.add_argument('-w', '--work-dir', type=str, help='Main working directory for parallel jobs')
    ap.add_argument('-e', '--experiment-id', type=str,
                    help='Experiment ID for parallel jobs (used as job name prefix)')
    ap.add_argument('-r', '--random-seed', type=str,
                    help='Initial random seed (used as string).')
    ap.add_argument('-c', '--context-file', type=str,
                    help='Input ttree/text file with context utterances')
    ap.add_argument('-v', '--valid-data', type=str,
                    help='Validation data paths (2-3 comma-separated files: DAs, trees/sentences, contexts)')
    ap.add_argument('-l', '--lexic-data', type=str,
                    help='Lexicalization data paths (1-2 comma-separated files: surface forms,' +
                    'training lexic. instructions)')
    ap.add_argument('-t', '--tb-summary-dir', '--tensorboard-summary-dir', '--tensorboard', type=str,
                    help='Directory where Tensorboard summaries are saved during training')

    ap.add_argument('seq2seq_config_file', type=str, help='Seq2Seq generator configuration file')
    ap.add_argument('da_train_file', type=str, help='Input training DAs')
    ap.add_argument('tree_train_file', type=str, help='Input training trees/sentences')
    ap.add_argument('seq2seq_model_file', type=str,
                    help='File name where to save the trained Seq2Seq generator model')

    args = ap.parse_args(args)

    if args.debug_logfile:
        set_debug_stream(file_stream(args.debug_logfile, mode='w'))
    if args.random_seed:
        rnd.seed(args.random_seed)

    log_info('Training sequence-to-sequence generator...')

    config = Config(args.seq2seq_config_file)

    if args.tb_summary_dir:  # override Tensorboard setting
        config['tb_summary_dir'] = args.tb_summary_dir
    if args.jobs:  # parallelize when training
        config['jobs_number'] = args.jobs
        if not args.work_dir:
            work_dir, _ = os.path.split(args.seq2seq_config_file)
        generator = ParallelSeq2SeqTraining(config, args.work_dir or work_dir, args.experiment_id)
    else:  # just a single training instance
        generator = Seq2SeqGen(config)

    generator.train(args.da_train_file, args.tree_train_file,
                    data_portion=args.train_size, context_file=args.context_file,
                    validation_files=args.valid_data, lexic_files=args.lexic_data)

    sys.setrecursionlimit(100000)
    generator.save_to_file(args.seq2seq_model_file)
Example #29
 def expand(self):
     log_info("Expanding...")
     for da_key, (da, orig_pos) in self.orig_da_positions.iteritems():
         if da_key not in self.transl_da_positions:
             print >> sys.stderr, "DA key not found: %s" % da_key
             print >> sys.stderr, "Original positions: %s" % ", ".join(
                 [str(p) for p in orig_pos])
             continue
         _, transl_pos = self.transl_da_positions[da_key]
         self.expand_da(da, orig_pos, transl_pos)
Example #30
 def write_outputs(self):
     log_info("Writing outputs...")
     write_texts(self.out_texts_file, self.out_texts)
     write_toks(self.out_delex_texts_file,
                self.out_delex_texts,
                capitalize=False,
                detok=False,
                lowercase=True)
     write_das(self.out_das_file, self.out_das)
     write_das(self.out_delex_das_file, self.out_delex_das)
Example #31
def convert_model(model_fname):

    reset_default_graph()

    param_fname = re.sub(r'((.pickle)?(.gz)?)$', r'.params.gz', model_fname)
    log_info('Converting %s to %s...' % (model_fname, param_fname))
    model = Seq2SeqBase.load_from_file(model_fname)
    with file_stream(param_fname, 'wb', encoding=None) as fh:
        pickle.dump(model.get_model_params(),
                    fh,
                    protocol=pickle.HIGHEST_PROTOCOL)
Example #32
    def _create_delex_texts(self):
        """Delexicalize texts in the buffers and save them separately in the member variables,
        along with the delexicalization instructions used for the operation."""
        self._delex_texts = []
        self._absts = []
        for text_idx, (text, da) in enumerate(zip(self._texts, self._das)):
            delex_text = []
            absts = []
            # do the delexicalization, keep track of which slots we used
            for tok_idx, (form, lemma, tag) in enumerate(text):
                # abstract away from numbers
                abst_form = re.sub(r'( |^)[0-9]+( |$)', r'\1_\2', form.lower())
                abst_lemma = re.sub(r'( |^)[0-9]+( |$)', r'\1_\2', lemma)
                # try to find if the surface form belongs to some slot
                slot, value = self._rev_sf_dict.get(
                    (abst_form, abst_lemma, tag), (None, None))
                # if we found a slot, get back the numbers
                if slot:
                    for num_match in re.finditer(r'(?: |^)([0-9]+)(?: |$)',
                                                 lemma):
                        value = re.sub(r'_',
                                       num_match.group(1),
                                       value,
                                       count=1)
                # fall back to directly comparing against the DA value
                else:
                    slot = da.has_value(lemma)
                    value = lemma

                # if we found something, delexicalize it (check if the value corresponds to the DA!)
                if (slot and slot in self._abst_slots
                        and da.value_for_slot(slot)
                        not in [None, 'none', 'dont_care']
                        and value in da.value_for_slot(slot)):
                    delex_text.append(('X-' + slot, 'X-' + slot, tag))
                    absts.append(Abst(slot, value, form, tok_idx, tok_idx + 1))
                # otherwise keep the token as it is
                else:
                    delex_text.append((form, lemma, tag))
            # fix coordinated delexicalized values
            self._delex_fix_coords(delex_text, da, absts)
            covered_slots = set([a.slot for a in absts])
            # check and warn if we left something non-delexicalized
            for dai in da:
                if (dai.slot in self._abst_slots
                        and dai.value not in [None, 'none', 'dont_care']
                        and dai.slot not in covered_slots):
                    log_info(
                        "Cannot delexicalize slot  %s  at %d:\nDA: %s\nTx: %s\n"
                        % (dai.slot, text_idx, str(da), " ".join(
                            [form for form, _, _ in text])))
            # save the delexicalized text and the delexicalization instructions
            self._delex_texts.append(delex_text)
            self._absts.append(absts)
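
The number-abstraction regex replaces standalone digit groups with an underscore so that numbered surface forms can still be matched, and the matched digits are later substituted back into the value. A short sketch of the round trip with made-up strings:

import re

form = 'arrives at 7'
abst_form = re.sub(r'( |^)[0-9]+( |$)', r'\1_\2', form.lower())
# abst_form == 'arrives at _'

value = 'platform _'  # hypothetical value from the reverse surface-form dict
for num_match in re.finditer(r'(?: |^)([0-9]+)(?: |$)', form):
    value = re.sub(r'_', num_match.group(1), value, count=1)
# value == 'platform 7'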
Example #33
    def _init_training(self, das_file, ttree_file, data_portion):
        # load data, determine number of features etc. etc.
        super(SimpleNNRanker, self)._init_training(das_file, ttree_file, data_portion)

        self._init_neural_network()

        self.w_after_iter = []
        self.update_weights_sum()

        log_debug('\n***\nINIT:')
        log_debug(self._feat_val_str())
        log_info('Training ...')
Example #35
 def _load_contexts(self, das, context_file):
     """Load input context utterances from a .yaml.gz/.pickle.gz/.txt file and add them to the
     given DAs (each returned item is then a tuple of context + DA)."""
     # read contexts, combine them with corresponding DAs for easier handling
     if context_file is None:
         raise ValueError('Expected context utterances file name!')
     log_info('Reading context utterances from %s...' % context_file)
     if context_file.endswith('.txt'):
         contexts = read_tokens(context_file)
     else:
         contexts = tokens_from_doc(read_ttrees(context_file), self.language, self.selector)
     return [(context, da) for context, da in zip(contexts, das)]
Example #36
    def load_from_file(lexicalizer_fname):
        """Load the lexicalizer model from a file (and a second file with the LM, if needed)."""
        log_info("Loading lexicalizer from %s..." % lexicalizer_fname)
        with file_stream(lexicalizer_fname, 'rb', encoding=None) as fh:
            data = pickle.load(fh)
            ret = Lexicalizer(cfg=data['cfg'])
            ret.__dict__.update(data)
            ret._form_select = ret._form_select(data['cfg'])

        if not isinstance(ret._form_select, RandomFormSelect):
            ret._form_select.load_model(lexicalizer_fname)
        return ret
Example #38
    def load_from_file(model_fname):
        log_info("Loading classifier from %s..." % model_fname)

        with file_stream(model_fname, 'rb', encoding=None) as fh:
            typeid = pickle.load(fh)
            if typeid != E2EPatternClassifier:
                raise ValueError('Wrong type identifier in file %s' %
                                 model_fname)
            cfg = pickle.load(fh)

        ret = E2EPatternClassifier(cfg)
        ret.__dict__.update(cfg)  # load the trained settings
        return ret
Example #39
def convert(args):
    """Main conversion function (using command-line arguments as parsed by Argparse)."""
    log_info('Loading...')
    reader = Reader(args.tagger_model, args.abst_slots)
    reader.load_surface_forms(args.surface_forms)
    log_info('Processing input files...')
    insts = reader.process_dataset(args.input_data)
    log_info('Loaded %d data items.' % len(insts))

    # write all data groups
    # outputs: plain delex, plain lex, interleaved delex & lex, CoNLL-U delex & lex, DAs, abstrs
    writer = Writer()

    log_info('Writing %s (size: %d)...' % (args.out_prefix, len(insts)))

    writer.write_absts(args.out_prefix + '-abst.txt', insts)

    writer.write_das(args.out_prefix + '-das_l.txt', insts)
    writer.write_das(args.out_prefix + '-das.txt', insts, delex=True)

    writer.write_text(args.out_prefix + '-text_l.txt', 'plain', insts)
    writer.write_text(args.out_prefix + '-text.txt', 'plain', insts, delex=True)
    writer.write_text(args.out_prefix + '-tls_l.txt', 'interleaved', insts)
    writer.write_text(args.out_prefix + '-tls.txt', 'interleaved', insts, delex=True)
    writer.write_text(args.out_prefix + '-text_l.conll', 'conll', insts)
    writer.write_text(args.out_prefix + '-text.conll', 'conll', insts, delex=True)
Example #40
def percrank_train(args):
    opts, files = getopt(args, 'c:d:s:j:w:e:')
    candgen_model = None
    train_size = 1.0
    parallel = False
    jobs_number = 0
    work_dir = None
    experiment_id = None

    for opt, arg in opts:
        if opt == '-d':
            set_debug_stream(file_stream(arg, mode='w'))
        elif opt == '-s':
            train_size = float(arg)
        elif opt == '-c':
            candgen_model = arg
        elif opt == '-j':
            parallel = True
            jobs_number = int(arg)
        elif opt == '-w':
            work_dir = arg
        elif opt == '-e':
            experiment_id = arg

    if len(files) != 4:
        sys.exit(__doc__)

    fname_rank_config, fname_train_das, fname_train_ttrees, fname_rank_model = files
    log_info('Training perceptron ranker...')

    rank_config = Config(fname_rank_config)
    if candgen_model:
        rank_config['candgen_model'] = candgen_model
    if rank_config.get('nn'):
        if rank_config['nn'] == 'emb':
            ranker_class = EmbNNRanker
        else:
            ranker_class = SimpleNNRanker
    else:
        ranker_class = PerceptronRanker
    if not parallel:
        ranker = ranker_class(rank_config)
    else:
        rank_config['jobs_number'] = jobs_number
        if work_dir is None:
            work_dir, _ = os.path.split(fname_rank_config)
        ranker = ParallelRanker(rank_config, work_dir, experiment_id, ranker_class)
    ranker.train(fname_train_das, fname_train_ttrees, data_portion=train_size)
    ranker.save_to_file(fname_rank_model)
Example #41
 def _load_trees(self, ttree_file, selector=None):
     """Load input trees/sentences from a .yaml.gz/.pickle.gz (trees) or .txt (sentences) file."""
     log_info('Reading t-trees/sentences from ' + ttree_file + '...')
     if ttree_file.endswith('.txt'):
         if not self.use_tokens:
             raise ValueError("Cannot read trees from a .txt file (%s)!" % ttree_file)
         return read_tokens(ttree_file)
     else:
         ttree_doc = read_ttrees(ttree_file)
         if selector is None:
             selector = self.selector
         if self.use_tokens:
             return tokens_from_doc(ttree_doc, self.language, selector)
         else:
             return trees_from_doc(ttree_doc, self.language, selector)
Example #42
    def save_to_file(self, model_fname):
        """Save the generator to a file (actually two files, one for configuration and one
        for the TensorFlow graph, which must be stored separately).

        @param model_fname: file name (for the configuration file); TF graph will be stored with a \
            different extension
        """
        log_info("Saving classifier to %s..." % model_fname)
        with file_stream(model_fname, 'wb', encoding=None) as fh:
            pickle.dump(self.get_all_settings(), fh, protocol=pickle.HIGHEST_PROTOCOL)
        tf_session_fname = re.sub(r'(.pickle)?(.gz)?$', '.tfsess', model_fname)
        if self.checkpoint_path:
            shutil.copyfile(self.checkpoint_path, tf_session_fname)
        else:
            self.saver.save(self.session, tf_session_fname)
Example #43
    def _delex_texts(self):
        """Delexicalize texts in the buffers and save them separately in the member variables,
        along with the delexicalization instructions used for the operation."""
        self._delexed_texts = []
        self._absts = []
        for text_idx, (text, da) in enumerate(zip(self._sents, self._das)):
            delex_text = []
            absts = []
            # do the delexicalization, keep track of which slots we used
            for tok_idx, (form, lemma, tag) in enumerate(text):
                # abstract away from numbers
                abst_form = re.sub(r'( |^)[0-9]+( |$)', r'\1_\2', form.lower())
                abst_lemma = re.sub(r'( |^)[0-9]+( |$)', r'\1_\2', lemma)
                # try to find if the surface form belongs to some slot
                slot, value = self._rev_sf_dict.get((abst_form, abst_lemma, tag), (None, None))
                # if we found a slot, get back the numbers
                if slot:
                    for num_match in re.finditer(r'(?: |^)([0-9]+)(?: |$)', lemma):
                        value = re.sub(r'_', num_match.group(1), value, count=1)
                # fall back to directly comparing against the DA value
                else:
                    slot = da.has_value(lemma)
                    value = lemma

                # if we found something, delexicalize it
                if (slot and slot in self._abst_slots and
                        da.value_for_slot(slot) not in [None, 'none', 'dont_care']):
                    delex_text.append(('X-' + slot, 'X-' + slot, tag))
                    absts.append(Abst(slot, value, form, tok_idx, tok_idx + 1))
                # otherwise keep the token as it is
                else:
                    delex_text.append((form, lemma, tag))
            # fix coordinated delexicalized values
            self._delex_fix_coords(delex_text, da, absts)
            covered_slots = set([a.slot for a in absts])
            # check and warn if we left something non-delexicalized
            for dai in da:
                if (dai.slot in self._abst_slots and
                        dai.value not in [None, 'none', 'dont_care'] and
                        dai.slot not in covered_slots):
                    log_info("Cannot delexicalize slot  %s  at %d:\nDA: %s\nTx: %s\n" %
                             (dai.slot,
                              text_idx,
                              unicode(da),
                              " ".join([form for form, _, _ in text])))
            # save the delexicalized text and the delexicalization instructions
            self._delexed_texts.append(delex_text)
            self._absts.append(absts)
Example #44
    def train(self, das, trees, data_portion=1.0, valid_das=None, valid_trees=None):
        """Run training on the given training data.

        @param das: name of source file with training DAs, or list of DAs
        @param trees: name of source file with corresponding trees/sentences, or list of trees
        @param data_portion: portion of the training data to be used (defaults to 1.0)
        @param valid_das: validation data DAs
        @param valid_trees: list of lists of corresponding paraphrases (same length as valid_das)
        """

        log_info('Training reranking classifier...')

        # initialize training
        self._init_training(das, trees, data_portion)
        if self.mode in ['tokens', 'tagged_lemmas'] and valid_trees is not None:
            valid_trees = [self._tokens_to_flat_trees(paraphrases,
                                                      use_tags=self.mode == 'tagged_lemmas')
                           for paraphrases in valid_trees]

        # start training
        top_comb_cost = float('nan')

        for iter_no in xrange(1, self.passes + 1):
            self.train_order = range(len(self.train_trees))
            if self.randomize:
                rnd.shuffle(self.train_order)
            pass_cost, pass_diff = self._training_pass(iter_no)

            if self.validation_freq and iter_no > self.min_passes and iter_no % self.validation_freq == 0:

                valid_diff = 0
                if valid_das:
                    valid_diff = np.sum([np.sum(self.dist_to_da(d, t))
                                         for d, t in zip(valid_das, valid_trees)])

                # cost combining validation and training data performance
                # (+ "real" cost with negligible weight)
                comb_cost = 1000 * valid_diff + 100 * pass_diff + pass_cost
                log_info('Combined validation cost: %8.3f' % comb_cost)

                # if we have the best model so far, save it as a checkpoint (overwrite previous)
                if math.isnan(top_comb_cost) or comb_cost < top_comb_cost:
                    top_comb_cost = comb_cost
                    self._save_checkpoint()

        # restore last checkpoint (best performance on devel data)
        self.restore_checkpoint()
Example #45
    def load_from_file(model_fname):
        """Load the reranker from a file (actually two files, one for configuration and one
        for the TensorFlow graph, which must be stored separately).

        @param model_fname: file name (for the configuration file); TF graph must be stored with a \
            different extension
        """
        log_info("Loading reranker from %s..." % model_fname)
        with file_stream(model_fname, 'rb', encoding=None) as fh:
            data = pickle.load(fh)
            ret = RerankingClassifier(cfg=data['cfg'])
            ret.load_all_settings(data)

        # re-build TF graph and restore the TF session
        tf_session_fname = os.path.abspath(re.sub(r'(.pickle)?(.gz)?$', '.tfsess', model_fname))
        ret._init_neural_network()
        ret.saver.restore(ret.session, tf_session_fname)
        return ret
Example #46
def candgen_train(args):
    opts, files = getopt(args, 'p:lnc:sd:t:')

    prune_threshold = 1
    parent_lemmas = False
    node_limits = False
    comp_type = None
    comp_limit = None
    comp_slots = False
    tree_classif = False

    for opt, arg in opts:
        if opt == '-p':
            prune_threshold = int(arg)
        elif opt == '-d':
            set_debug_stream(file_stream(arg, mode='w'))
        elif opt == '-l':
            parent_lemmas = True
        elif opt == '-n':
            node_limits = True
        elif opt == '-c':
            comp_type = arg
            if ':' in comp_type:
                comp_type, comp_limit = comp_type.split(':', 1)
                comp_limit = int(comp_limit)
        elif opt == '-t':
            tree_classif = Config(arg)
        elif opt == '-s':
            comp_slots = True

    if len(files) != 3:
        sys.exit("Invalid arguments.\n" + __doc__)
    fname_da_train, fname_ttrees_train, fname_cand_model = files

    log_info('Training candidate generator...')
    candgen = RandomCandidateGenerator({'prune_threshold': prune_threshold,
                                        'parent_lemmas': parent_lemmas,
                                        'node_limits': node_limits,
                                        'compatible_dais_type': comp_type,
                                        'compatible_dais_limit': comp_limit,
                                        'compatible_slots': comp_slots,
                                        'tree_classif': tree_classif})
    candgen.train(fname_da_train, fname_ttrees_train)
    candgen.save_to_file(fname_cand_model)
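A minimal invocation sketch, assuming the command-line dispatcher hands this function everything after the action name; all file names are hypothetical:

# '-p 2' sets the pruning threshold, '-s' enables the compatible-slots check.
candgen_train(['-p', '2', '-s',
               'train-das.txt', 'train-ttrees.yaml.gz', 'candgen-model.pickle.gz'])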
Example #47
    def _init_neural_network(self):
        """Create the neural network for classification, according to the self.nn_shape
        parameter (as set in configuration)."""
        layers = []
        if self.tree_embs:
            layers.append([Embedding('emb', self.dict_size, self.emb_size, 'uniform_005')])

        # feedforward networks
        if self.nn_shape.startswith('ff'):
            if self.tree_embs:
                layers.append([Flatten('flat')])
            num_ff_layers = 2
            if self.nn_shape[-1] in ['0', '1', '3', '4']:
                num_ff_layers = int(self.nn_shape[-1])
            layers += self._ff_layers('ff', num_ff_layers)

        # convolutional networks
        elif 'conv' in self.nn_shape or 'pool' in self.nn_shape:
            assert self.tree_embs  # convolution makes no sense without embeddings
            num_conv = 0
            if 'conv' in self.nn_shape:
                num_conv = 1
            if 'conv2' in self.nn_shape:
                num_conv = 2
            pooling = None
            if 'maxpool' in self.nn_shape:
                pooling = T.max
            elif 'avgpool' in self.nn_shape:
                pooling = T.mean
            layers += self._conv_layers('conv', num_conv, pooling)
            layers.append([Flatten('flat')])
            layers += self._ff_layers('ff', 1)

        # input types: integer 3D for tree embeddings (batch + 2D embeddings),
        #              float 2D (matrix) for binary input (batch + features)
        input_types = (T.itensor3,) if self.tree_embs else (T.fmatrix,)

        # create the network, connect layers
        self.classif = ClassifNN(layers, self.input_shape, input_types, normgrad=False)
        log_info("Network shape:\n\n" + str(self.classif))
Example #48
def convert(args):
    """Main conversion function (using command-line arguments as parsed by Argparse)."""
    log_info('Loading...')
    analyzer = MorphoAnalyzer(args.tagger_model, args.abst_slots)
    analyzer.load_surface_forms(args.surface_forms)
    log_info('Processing input files...')
    analyzer.process_files(args.input_text_file, args.input_da_file, args.skip_hello)
    log_info('Loaded %d data items.' % analyzer.buf_length())

    # outputs: plain delex, plain lex, interleaved delex & lex, CoNLL-U delex & lex, DAs, abstrs
    # TODO maybe do relexicalization, but not now (no time)

    if args.split:
        # get file name prefixes and compute data sizes for all the parts to be split
        out_names = re.split(r'[, ]+', args.out_prefix)
        data_sizes = [int(part_size) for part_size in args.split.split(':')]
        assert len(out_names) == len(data_sizes)
        # compute sizes for all but the 1st part (+ round them)
        total = float(sum(data_sizes))
        remain = analyzer.buf_length()
        for part_no in xrange(len(data_sizes) - 1, 0, -1):
            part_size = int(round(analyzer.buf_length() * (data_sizes[part_no] / total)))
            data_sizes[part_no] = part_size
            remain -= part_size
        # put whatever remained into the 1st part
        data_sizes[0] = remain
    else:
        # use just one part -- containing all the data
        data_sizes = [analyzer.buf_length()]
        out_names = [args.out_prefix]

    # write all data parts
    offset = 0
    for part_size, part_name in zip(data_sizes, out_names):
        log_info('Writing %s (size: %d)...' % (part_name, part_size))
        subrange = slice(offset, offset + part_size)

        analyzer.write_absts(part_name + '-abst.txt', subrange)

        analyzer.write_das(part_name + '-das_l.txt', subrange)
        analyzer.write_das(part_name + '-das.txt', subrange, delex=True)

        analyzer.write_text(part_name + '-text_l.txt', 'plain', subrange)
        analyzer.write_text(part_name + '-text.txt', 'plain', subrange, delex=True)
        analyzer.write_text(part_name + '-tls_l.txt', 'interleaved', subrange)
        analyzer.write_text(part_name + '-tls.txt', 'interleaved', subrange, delex=True)
        analyzer.write_text(part_name + '-text_l.conll', 'conll', subrange)
        analyzer.write_text(part_name + '-text.conll', 'conll', subrange, delex=True)

        offset += part_size
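The split-size computation above rounds every part except the first and assigns the remainder to the first part, so the sizes always sum to the number of loaded items. A self-contained sketch of just that arithmetic (the counts and ratios are made up):

buf_length = 1003              # hypothetical number of loaded data items
data_sizes = [8, 1, 1]         # hypothetical split ratios, e.g. '8:1:1'

total = float(sum(data_sizes))
remain = buf_length
for part_no in range(len(data_sizes) - 1, 0, -1):
    part_size = int(round(buf_length * (data_sizes[part_no] / total)))
    data_sizes[part_no] = part_size
    remain -= part_size
data_sizes[0] = remain

print(data_sizes)              # [803, 100, 100]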
Example #49
    def train(self, das_file, ttree_file, data_portion=1.0,
              context_file=None, validation_files=None):
        """
        The main training process: initialize and perform a specified number of
        training passes, validating every couple of iterations.

        @param das_file: training data file with DAs
        @param ttree_file: training data file with output t-trees/sentences
        @param data_portion: portion of training data to be actually used, defaults to 1.0
        @param context_file: path to training file with contexts (trees/sentences)
        @param validation_files: paths to validation data (DAs, trees/sentences, possibly contexts)
        """
        # load and prepare data and initialize the neural network
        self._init_training(das_file, ttree_file, data_portion, context_file, validation_files)

        # do the training passes
        for iter_no in xrange(1, self.passes + 1):

            self.train_order = range(len(self.train_enc))
            if self.randomize:
                rnd.shuffle(self.train_order)

            self._training_pass(iter_no)

            # validate every couple of iterations
            if iter_no % self.validation_freq == 0 and self.validation_size > 0:

                cur_train_out = self.process_das(self.train_das[:self.batch_size])
                log_info("Current train output:\n" +
                         "\n".join([" ".join(n.t_lemma for n in tree.nodes[1:])
                                    if self.use_tokens
                                    else unicode(tree)
                                    for tree in cur_train_out]))

                cur_valid_out = self.process_das(self.valid_das[:self.batch_size])
                cur_cost = self._compute_valid_cost(cur_valid_out, self.valid_trees)
                log_info("Current validation output:\n" +
                         "\n".join([" ".join(n.t_lemma for n in tree.nodes[1:])
                                    if self.use_tokens
                                    else unicode(tree)
                                    for tree in cur_valid_out]))
                log_info('IT %d validation cost: %5.4f' % (iter_no, cur_cost))

                # if we have the best model so far, save it as a checkpoint (overwrite previous)
                if math.isnan(self.top_k_costs[0]) or cur_cost < self.top_k_costs[0]:
                    self._save_checkpoint()

                if self._should_stop(iter_no, cur_cost):
                    log_info("Stoping criterion met.")
                    break
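A minimal usage sketch for this training entry point, assuming `tgen` is a configured Seq2Seq generator instance; all file names are hypothetical, and the validation files are given as a comma-separated list (see _load_valid_data in Example #51 below):

# Validation data: DA file + tree/sentence file, comma-separated (contexts optional).
tgen.train('train-das.txt', 'train-ttrees.yaml.gz', data_portion=1.0,
           validation_files='valid-das.txt,valid-ttrees.yaml.gz')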
Example #50
    def load_from_file(model_fname):
        """Load the whole ensemble from a file (load settings and model parameters, then build the
        ensemble network)."""

        log_info("Loading ensemble generator from %s..." % model_fname)

        with file_stream(model_fname, 'rb', encoding=None) as fh:
            typeid = pickle.load(fh)
            if typeid != Seq2SeqEnsemble:
                raise ValueError('Wrong type identifier in file %s' % model_fname)
            cfg = pickle.load(fh)
            ret = Seq2SeqEnsemble(cfg)
            gens_dump = pickle.load(fh)
            if 'classif_filter' in cfg:
                rerank_settings = pickle.load(fh)
                rerank_params = pickle.load(fh)
            else:
                rerank_settings = None
                rerank_params = None

        ret.build_ensemble(gens_dump, rerank_settings, rerank_params)
        return ret
Example #51
    def _load_valid_data(self, valid_data_paths):
        """Load validation data from separate files (comma-separated list of files with DAs, trees,
        and optionally contexts is expected)."""
        # parse validation data file specification
        valid_data_paths = valid_data_paths.split(',')
        if len(valid_data_paths) == 3:  # with contexts (this does not determine if they're used)
            valid_das_file, valid_trees_file, valid_context_file = valid_data_paths
        else:
            valid_das_file, valid_trees_file = valid_data_paths

        # load the validation data
        log_info('Reading DAs from ' + valid_das_file + '...')
        self.valid_das = read_das(valid_das_file)
        self.valid_trees = self._load_trees(valid_trees_file, selector=self.ref_selectors)
        if self.use_context:
            self.valid_das = self._load_contexts(self.valid_das, valid_context_file)

        # reorder validation data for multiple references (see also _cut_valid_data)
        valid_size = len(self.valid_trees)
        if self.multiple_refs:
            num_refs, refs_stored = self._check_multiple_ref_type(valid_size)

            # serial: different instances next to each other, then synonymous in the same order
            if refs_stored == 'serial':
                valid_tree_chunks = [chunk for chunk in
                                     chunk_list(self.valid_trees, valid_size / num_refs)]
                self.valid_trees = [[chunk[i] for chunk in valid_tree_chunks]
                                    for i in xrange(valid_size / num_refs)]
                if len(self.valid_das) > len(self.valid_trees):
                    self.valid_das = self.valid_das[0:valid_size / num_refs]
            # parallel: synonymous instances next to each other
            elif refs_stored == 'parallel':
                self.valid_trees = [chunk for chunk in chunk_list(self.valid_trees, num_refs)]
                if len(self.valid_das) > len(self.valid_trees):
                    self.valid_das = self.valid_das[::num_refs]

        # no multiple references; make lists of size 1 to simplify working with the data
        else:
            self.valid_trees = [[tree] for tree in self.valid_trees]
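The two multi-reference layouts are easiest to see on a toy example with three DAs and two references each; this standalone sketch mirrors the reordering without using the actual chunk_list helper:

# 'serial' layout: all first references, then all second references in the same order.
refs = ['a1', 'b1', 'c1', 'a2', 'b2', 'c2']
num_refs, valid_size = 2, len(refs)
step = valid_size // num_refs
chunks = [refs[i:i + step] for i in range(0, valid_size, step)]
print([[chunk[i] for chunk in chunks] for i in range(step)])
# [['a1', 'a2'], ['b1', 'b2'], ['c1', 'c2']]

# 'parallel' layout: references for the same DA sit next to each other.
refs = ['a1', 'a2', 'b1', 'b2', 'c1', 'c2']
print([refs[i:i + num_refs] for i in range(0, len(refs), num_refs)])
# [['a1', 'a2'], ['b1', 'b2'], ['c1', 'c2']]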
Example #52
    def save_to_file(self, model_fname):
        """Save the whole ensemble into a file (get all settings and parameters, dump them in a
        pickle)."""

        log_info("Saving generator to %s..." % model_fname)
        with file_stream(model_fname, 'wb', encoding=None) as fh:
            pickle.dump(self.__class__, fh, protocol=pickle.HIGHEST_PROTOCOL)
            pickle.dump(self.cfg, fh, protocol=pickle.HIGHEST_PROTOCOL)

            gens_dump = []
            for gen in self.gens:
                setting = gen.get_all_settings()
                parset = gen.get_model_params()
                setting['classif_filter'] = self.classif_filter is not None
                gens_dump.append((setting, parset))

            pickle.dump(gens_dump, fh, protocol=pickle.HIGHEST_PROTOCOL)

            if self.classif_filter:
                pickle.dump(self.classif_filter.get_all_settings(), fh,
                            protocol=pickle.HIGHEST_PROTOCOL)
                pickle.dump(self.classif_filter.get_model_params(), fh,
                            protocol=pickle.HIGHEST_PROTOCOL)
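The records written here must be read back in the same order by load_from_file above (Example #50). A hedged sketch of the mirrored read, reusing the same file_stream helper and pickle module as the code above; the file name is hypothetical:

with file_stream('ensemble-model.pickle.gz', 'rb', encoding=None) as fh:
    typeid = pickle.load(fh)      # 1. class identifier (the Seq2SeqEnsemble class)
    cfg = pickle.load(fh)         # 2. configuration dict
    gens_dump = pickle.load(fh)   # 3. list of (settings, params), one per generator
    # 4.-5. present only if cfg['classif_filter'] is set:
    #       reranker settings and reranker parameters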
Example #53
    def _training_pass(self, iter_no):
        """Perform one pass through the training data (epoch).
        @param iter_no: pass number (for logging)
        """
        it_cost = 0.0
        it_learning_rate = self.alpha * np.exp(-self.alpha_decay * iter_no)
        log_info('IT %d alpha: %8.5f' % (iter_no, it_learning_rate))

        for batch_no in self.train_order:

            # feed data into the TF session:

            # initial state
            initial_state = np.zeros([self.batch_size, self.emb_size])
            feed_dict = {self.initial_state: initial_state,
                         self.learning_rate: it_learning_rate}

            # encoder inputs
            for i in xrange(len(self.train_enc[batch_no])):
                feed_dict[self.enc_inputs[i]] = self.train_enc[batch_no][i]

            # decoder inputs
            for i in xrange(len(self.train_dec[batch_no])):
                feed_dict[self.dec_inputs[i]] = self.train_dec[batch_no][i]

            # the last target output (padding, to have the same number of steps as there are decoder
            # inputs) is always 'VOID' for all instances of the batch
            feed_dict[self.targets[-1]] = len(self.train_dec[batch_no][0]) * [self.tree_embs.VOID]

            # run the TF session (one optimizer step == train_func) and get the cost
            # (1st value returned is None, throw it away)
            _, cost = self.session.run([self.train_func, self.cost], feed_dict=feed_dict)

            it_cost += cost

        log_info('IT %d total cost: %8.5f' % (iter_no, it_cost))
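The learning-rate schedule above is a plain exponential decay over passes; a standalone sketch with made-up configuration values:

import numpy as np

alpha, alpha_decay = 0.001, 0.05   # hypothetical configuration values
for iter_no in range(1, 4):
    print('IT %d alpha: %8.5f' % (iter_no, alpha * np.exp(-alpha_decay * iter_no)))
# IT 1 alpha:  0.00095
# IT 2 alpha:  0.00090
# IT 3 alpha:  0.00086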
Example #54
def eval_tokens(das, eval_tokens, gen_tokens):
    """Evaluate generated tokens and print out statistics."""
    postprocess_tokens(eval_tokens, das)
    postprocess_tokens(gen_tokens, das)

    evaluator = BLEUMeasure()
    for pred_sent, gold_sents in zip(gen_tokens, eval_tokens):
        evaluator.append(pred_sent, gold_sents)
    log_info("BLEU score: %.4f" % (evaluator.bleu() * 100))

    evaluator = Evaluator()
    for pred_sent, gold_sents in zip(gen_tokens, eval_tokens):
        for gold_sent in gold_sents:  # effectively an average over all gold paraphrases
            evaluator.append(gold_sent, pred_sent)

    log_info("TOKEN precision: %.4f, Recall: %.4f, F1: %.4f" % evaluator.p_r_f1(EvalTypes.TOKEN))
    log_info("Sentence length stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" % evaluator.size_stats())
    log_info("Common subphrase stats:\n -- SIZE: %s\n -- ΔGLD: %s\n -- ΔPRD: %s" %
             evaluator.common_substruct_stats())
Example #55
def rerank_cl_eval(args):
    ap = ArgumentParser(prog=' '.join(sys.argv[0:2]))
    ap.add_argument('-l', '--language', type=str,
                    help='Override classifier language (for t-tree input files)')
    ap.add_argument('-s', '--selector', type=str,
                    help='Override classifier selector (for t-tree input files)')
    ap.add_argument('fname_cl_model', type=str, help='Path to trained reranking classifier model')
    ap.add_argument('fname_test_da', type=str, help='Path to test DA file')
    ap.add_argument('fname_test_sent', type=str, help='Path to test trees file (must be trees!)')
    args = ap.parse_args(args)

    log_info("Loading reranking classifier...")
    rerank_cl = RerankingClassifier.load_from_file(args.fname_cl_model)
    if args.language is not None:
        rerank_cl.language = args.language
    if args.selector is not None:
        rerank_cl.selector = args.selector

    log_info("Evaluating...")
    tot_len, dist = rerank_cl.evaluate_file(args.fname_test_da, args.fname_test_sent)
    log_info("Penalty: %d, Total DAIs %d." % (dist, tot_len))
Example #56
def seq2seq_gen(args):
    """Sequence-to-sequence generation"""

    ap = ArgumentParser(prog=' '.join(sys.argv[0:2]))

    ap.add_argument('-e', '--eval-file', type=str, help='A ttree/text file for evaluation')
    ap.add_argument('-a', '--abstr-file', type=str,
                    help='Lexicalization file (a.k.a. abstraction instructions, for postprocessing)')
    ap.add_argument('-r', '--ref-selector', type=str, default='',
                    help='Selector for reference trees in the evaluation file')
    ap.add_argument('-t', '--target-selector', type=str, default='',
                    help='Target selector for generated trees in the output file')
    ap.add_argument('-d', '--debug-logfile', type=str, help='Debug output file name')
    ap.add_argument('-w', '--output-file', type=str, help='Output tree/text file')
    ap.add_argument('-b', '--beam-size', type=int,
                    help='Override beam size for beam search decoding')
    ap.add_argument('-c', '--context-file', type=str,
                    help='Input ttree/text file with context utterances')

    ap.add_argument('seq2seq_model_file', type=str, help='Trained Seq2Seq generator model')
    ap.add_argument('da_test_file', type=str, help='Input DAs for generation')

    args = ap.parse_args(args)

    if args.debug_logfile:
        set_debug_stream(file_stream(args.debug_logfile, mode='w'))

    # load the generator
    tgen = Seq2SeqBase.load_from_file(args.seq2seq_model_file)
    if args.beam_size is not None:
        tgen.beam_size = args.beam_size

    # read input files (DAs, contexts)
    das = read_das(args.da_test_file)
    if args.context_file:
        if not tgen.use_context and not tgen.context_bleu_weight:
            log_warn('Generator is not trained to use context, ignoring context input file.')
        else:
            if args.context_file.endswith('.txt'):
                contexts = read_tokens(args.context_file)
            else:
                contexts = tokens_from_doc(read_ttrees(args.context_file),
                                           tgen.language, tgen.selector)
            das = [(context, da) for context, da in zip(contexts, das)]
    elif tgen.use_context or tgen.context_bleu_weight:
        log_warn('Generator is trained to use context. ' +
                 'Using empty contexts, expect lower performance.')
        das = [([], da) for da in das]

    # generate
    log_info('Generating...')
    gen_trees = []
    for num, da in enumerate(das, start=1):
        log_debug("\n\nTREE No. %03d" % num)
        gen_trees.append(tgen.generate_tree(da))
        if num % 100 == 0:
            log_info("Generated tree %d" % num)
    log_info(tgen.get_slot_err_stats())

    # evaluate the generated trees against golden trees (delexicalized)
    eval_doc = None
    if args.eval_file and not args.eval_file.endswith('.txt'):
        eval_doc = read_ttrees(args.eval_file)
        evaler = Evaluator()
        evaler.process_eval_doc(eval_doc, gen_trees, tgen.language, args.ref_selector,
                                args.target_selector or tgen.selector)

    # lexicalize, if required
    if args.abstr_file and tgen.lexicalizer:
        log_info('Lexicalizing...')
        tgen.lexicalize(gen_trees, args.abstr_file)

    # we won't need contexts anymore, but we do need DAs
    if tgen.use_context or tgen.context_bleu_weight:
        das = [da for _, da in das]

    # evaluate the generated & lexicalized tokens (F1 and BLEU scores)
    if args.eval_file and args.eval_file.endswith('.txt'):
        eval_tokens(das, read_tokens(args.eval_file, ref_mode=True),
                    [t.to_tok_list() for t in gen_trees])

    # write output .yaml.gz or .txt
    if args.output_file is not None:
        log_info('Writing output...')
        if args.output_file.endswith('.txt'):
            gen_toks = [t.to_tok_list() for t in gen_trees]
            postprocess_tokens(gen_toks, das)
            write_tokens(gen_toks, args.output_file)
        else:
            write_ttrees(create_ttree_doc(gen_trees, eval_doc, tgen.language,
                                          args.target_selector or tgen.selector),
                         args.output_file)
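A minimal invocation sketch, assuming the dispatcher at the bottom of this file passes everything after the action name to this function; the model, reference and output file names are hypothetical:

# Roughly what 'seq2seq_gen -e ref.yaml.gz -w out.yaml.gz model.pickle.gz test-das.txt'
# would dispatch to:
seq2seq_gen(['-e', 'ref.yaml.gz', '-w', 'out.yaml.gz',
             'model.pickle.gz', 'test-das.txt'])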
Example #57
def asearch_gen(args):
    """A*search generation"""
    from pytreex.core.document import Document

    opts, files = getopt(args, 'e:d:w:c:s:')
    eval_file = None
    fname_ttrees_out = None
    cfg_file = None
    eval_selector = ''

    for opt, arg in opts:
        if opt == '-e':
            eval_file = arg
        elif opt == '-s':
            eval_selector = arg
        elif opt == '-d':
            set_debug_stream(file_stream(arg, mode='w'))
        elif opt == '-w':
            fname_ttrees_out = arg
        elif opt == '-c':
            cfg_file = arg

    if len(files) != 3:
        sys.exit('Invalid arguments.\n' + __doc__)
    fname_cand_model, fname_rank_model, fname_da_test = files

    log_info('Initializing...')
    candgen = RandomCandidateGenerator.load_from_file(fname_cand_model)
    ranker = PerceptronRanker.load_from_file(fname_rank_model)
    cfg = Config(cfg_file) if cfg_file else {}
    cfg.update({'candgen': candgen, 'ranker': ranker})
    tgen = ASearchPlanner(cfg)

    log_info('Generating...')
    das = read_das(fname_da_test)

    if eval_file is None:
        gen_doc = Document()
    else:
        eval_doc = read_ttrees(eval_file)
        if eval_selector == tgen.selector:
            gen_doc = Document()
        else:
            gen_doc = eval_doc

    # generate and evaluate
    if eval_file is not None:
        # generate + analyze open&close lists
        lists_analyzer = ASearchListsAnalyzer()
        for num, (da, gold_tree) in enumerate(zip(das,
                                                  trees_from_doc(eval_doc, tgen.language, eval_selector)),
                                              start=1):
            log_debug("\n\nTREE No. %03d" % num)
            gen_tree = tgen.generate_tree(da, gen_doc)
            lists_analyzer.append(gold_tree, tgen.open_list, tgen.close_list)
            if gen_tree != gold_tree:
                log_debug("\nDIFFING TREES:\n" + tgen.ranker.diffing_trees_with_scores(da, gold_tree, gen_tree) + "\n")

        log_info('Gold tree BEST: %.4f, on CLOSE: %.4f, on ANY list: %.4f' % lists_analyzer.stats())

        # evaluate the generated trees against golden trees
        eval_ttrees = ttrees_from_doc(eval_doc, tgen.language, eval_selector)
        gen_ttrees = ttrees_from_doc(gen_doc, tgen.language, tgen.selector)

        log_info('Evaluating...')
        evaler = Evaluator()
        for eval_bundle, eval_ttree, gen_ttree, da in zip(eval_doc.bundles, eval_ttrees, gen_ttrees, das):
            # add some stats about the tree directly into the output file
            add_bundle_text(eval_bundle, tgen.language, tgen.selector + 'Xscore',
                            "P: %.4f R: %.4f F1: %.4f" % p_r_f1_from_counts(*corr_pred_gold(eval_ttree, gen_ttree)))

            # collect overall stats
            evaler.append(eval_ttree,
                          gen_ttree,
                          ranker.score(TreeData.from_ttree(eval_ttree), da),
                          ranker.score(TreeData.from_ttree(gen_ttree), da))
        # print overall stats
        log_info("NODE precision: %.4f, Recall: %.4f, F1: %.4f" % evaler.p_r_f1())
        log_info("DEP  precision: %.4f, Recall: %.4f, F1: %.4f" % evaler.p_r_f1(EvalTypes.DEP))
        log_info("Tree size stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" % evaler.size_stats())
        log_info("Score stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" % evaler.score_stats())
        log_info("Common subtree stats:\n -- SIZE: %s\n -- ΔGLD: %s\n -- ΔPRD: %s" %
                 evaler.common_substruct_stats())
    # just generate
    else:
        for da in das:
            tgen.generate_tree(da, gen_doc)

    # write output
    if fname_ttrees_out is not None:
        log_info('Writing output...')
        write_ttrees(gen_doc, fname_ttrees_out)
Example #58
def sample_gen(args):
    from pytreex.core.document import Document
    opts, files = getopt(args, 'r:n:o:w:')
    num_to_generate = 1
    oracle_eval_file = None
    fname_ttrees_out = None

    for opt, arg in opts:
        if opt == '-n':
            num_to_generate = int(arg)
        elif opt == '-o':
            oracle_eval_file = arg
        elif opt == '-w':
            fname_ttrees_out = arg

    if len(files) != 2:
        sys.exit(__doc__)
    fname_cand_model, fname_da_test = files

    # load model
    log_info('Initializing...')
    candgen = RandomCandidateGenerator.load_from_file(fname_cand_model)

    ranker = candgen

    tgen = SamplingPlanner({'candgen': candgen, 'ranker': ranker})
    # generate
    log_info('Generating...')
    gen_doc = Document()
    das = read_das(fname_da_test)
    for da in das:
        for _ in xrange(num_to_generate):  # repeat generation n times
            tgen.generate_tree(da, gen_doc)

    # evaluate if needed
    if oracle_eval_file is not None:
        log_info('Evaluating oracle F1...')
        log_info('Loading gold data from ' + oracle_eval_file)
        gold_trees = ttrees_from_doc(read_ttrees(oracle_eval_file), tgen.language, tgen.selector)
        gen_trees = ttrees_from_doc(gen_doc, tgen.language, tgen.selector)
        log_info('Gold data loaded.')
        correct, predicted, gold = 0, 0, 0
        for gold_tree, gen_chunk in zip(gold_trees, chunk_list(gen_trees, num_to_generate)):
            # find best of predicted trees (in terms of F1)
            _, tc, tp, tg = max([(f1_from_counts(c, p, g), c, p, g) for c, p, g
                                 in map(lambda gen_tree: corr_pred_gold(gold_tree, gen_tree),
                                        gen_chunk)],
                                key=lambda x: x[0])
            correct += tc
            predicted += tp
            gold += tg
        # evaluate oracle F1
        log_info("Oracle Precision: %.6f, Recall: %.6f, F1: %.6f" % p_r_f1_from_counts(correct, predicted, gold))
    # write output
    if fname_ttrees_out is not None:
        log_info('Writing output...')
        write_ttrees(gen_doc, fname_ttrees_out)
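The oracle evaluation picks, for every gold tree, the best-scoring of the n sampled trees before accumulating counts. A standalone sketch of that selection with made-up node counts (the f1 helper here is a stand-in for f1_from_counts):

def f1(c, p, g):
    """F1 from correct/predicted/gold counts (stand-in for f1_from_counts)."""
    prec = c / float(p) if p else 0.0
    rec = c / float(g) if g else 0.0
    return 2 * prec * rec / (prec + rec) if prec + rec else 0.0

# (correct, predicted, gold) counts for three samples of one gold tree -- made up
samples = [(4, 6, 7), (5, 6, 7), (3, 8, 7)]
best = max(samples, key=lambda cpg: f1(*cpg))
print(best)   # (5, 6, 7) -- only the best sample is counted towards oracle F1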
Example #59
if __name__ == '__main__':

    if len(sys.argv) < 2:
        sys.exit(__doc__)

    action = sys.argv[1]
    args = sys.argv[2:]

    log_info('Running on %s version %s' % (platform.python_implementation(),
                                           platform.python_version()))

    if action == 'candgen_train':
        candgen_train(args)
    elif action == 'percrank_train':
        percrank_train(args)
    elif action == 'sample_gen':
        sample_gen(args)
    elif action == 'asearch_gen':
        asearch_gen(args)
    elif action == 'seq2seq_train':
        seq2seq_train(args)
    elif action == 'seq2seq_gen':
        seq2seq_gen(args)
    elif action == 'treecl_train':
        treecl_train(args)
Example #60
    def _init_training(self, das_file, ttree_file, data_portion, context_file, validation_files):
        """Load training data, prepare batches, build the NN.

        @param das_file: training DAs (file path)
        @param ttree_file: training t-trees (file path)
        @param data_portion: portion of the data to be actually used for training
        @param context_file: training contexts (file path)
        @param validation_files: validation file paths (or None)
        """
        # read training data
        log_info('Reading DAs from ' + das_file + '...')
        das = read_das(das_file)
        trees = self._load_trees(ttree_file)
        if self.use_context:
            das = self._load_contexts(das, context_file)

        # make training data smaller if necessary
        train_size = int(round(data_portion * len(trees)))
        self.train_trees = trees[:train_size]
        self.train_das = das[:train_size]

        # load separate validation data files...
        if validation_files:
            self._load_valid_data(validation_files)
        # ... or save part of the training data for validation:
        elif self.validation_size > 0:
            self._cut_valid_data()  # will set train_trees, valid_trees, train_das, valid_das
        log_info('Using %d training, %d validation instances.' %
                 (len(self.train_das), len(self.valid_das)))

        # initialize embeddings
        if self.use_context:
            self.da_embs = ContextDAEmbeddingSeq2SeqExtract(cfg=self.cfg)
        else:
            self.da_embs = DAEmbeddingSeq2SeqExtract(cfg=self.cfg)
        if self.use_tokens:
            self.tree_embs = TokenEmbeddingSeq2SeqExtract(cfg=self.cfg)
        else:
            self.tree_embs = TreeEmbeddingSeq2SeqExtract(cfg=self.cfg)

        self.da_dict_size = self.da_embs.init_dict(self.train_das)
        self.tree_dict_size = self.tree_embs.init_dict(self.train_trees)
        self.max_tree_len = self.tree_embs.get_embeddings_shape()[0]
        self.max_da_len = self.da_embs.get_embeddings_shape()[0]

        # prepare training batches
        self.train_enc = [cut_batch_into_steps(b)
                          for b in grouper([self.da_embs.get_embeddings(da)
                                            for da in self.train_das],
                                           self.batch_size, None)]
        self.train_dec = [cut_batch_into_steps(b)
                          for b in grouper([self.tree_embs.get_embeddings(tree)
                                            for tree in self.train_trees],
                                           self.batch_size, None)]

        # train the classifier for filtering n-best lists
        if self.classif_filter:
            self.classif_filter.train(self.train_das, self.train_trees,
                                      valid_das=self.valid_das,
                                      valid_trees=self.valid_trees)
            self.classif_filter.restore_checkpoint()  # restore the best performance on devel data

        # convert validation data to flat trees to enable F1 measuring
        if self.validation_size > 0 and self.use_tokens:
            self.valid_trees = self._valid_data_to_flat_trees(self.valid_trees)

        # initialize top costs
        self.top_k_costs = [float('nan')] * self.top_k
        self.checkpoint_path = None

        # build the NN
        self._init_neural_network()

        # initialize the NN variables
        self.session.run(tf.initialize_all_variables())