def _training_pass(self, pass_no):
    """Run one training pass over the whole (shuffled) training data:
    generate rival candidates for each gold-standard tree, update weights
    whenever the ranker prefers a rival, and store diagnostic values.

    @param pass_no: number of the current pass (used for logging and for \
        resolving pass-dependent rival-generation iteration limits)
    """
    pass_start_time = time.time()
    self.reset_diagnostics()
    self.update_weights_sum()
    log_debug('\n***\nTR %05d:' % pass_no)
    # resolve rival-generation settings for this pass
    # (iteration limits may depend on the pass number)
    rgen_max_iter = self._get_num_iters(pass_no, self.rival_gen_max_iter)
    rgen_max_defic_iter = self._get_num_iters(pass_no, self.rival_gen_max_defic_iter)
    rgen_beam_size = self.rival_gen_beam_size
    rgen_prune_size = self.rival_gen_prune_size
    rgen_strategy = self._get_rival_gen_strategy(pass_no)
    for tree_no in self.train_order:
        log_debug('TREE-NO: %d' % tree_no)
        log_debug('SENT: %s' % self.train_sents[tree_no])
        # wrap the gold-standard DA + tree + features + current model score
        gold = Inst(da=self.train_das[tree_no],
                    tree=self.train_trees[tree_no],
                    score=self._score(self.train_feats[tree_no]),
                    feats=self.train_feats[tree_no])
        # obtain some 'rival', alternative incorrect candidates
        for strategy in rgen_strategy:
            # generate using current weights
            if strategy == 'gen_cur_weights':
                gen = self._gen_cur_weights(gold, rgen_max_iter, rgen_max_defic_iter,
                                            rgen_prune_size, rgen_beam_size)
            # generate while trying to update weights
            elif strategy == 'gen_update':
                gen = self._gen_update(gold, rgen_max_iter, rgen_max_defic_iter,
                                       rgen_prune_size, rgen_beam_size)
            # check against other possible candidates/combinations
            else:
                gen = self._get_rival_candidates(gold, tree_no, strategy)
            # evaluate the top-scoring generated tree against gold t-tree
            # (disregarding whether it was selected as the best one)
            self.evaluator.append(TreeNode(gold.tree), TreeNode(gen.tree),
                                  gold.score, gen.score)
            # update weights if the system doesn't give the highest score
            # to the gold standard tree
            if gold.score < gen.score:
                self._update_weights(gold, gen)
        # store a copy of the current weights for averaging
        self.store_iter_weights()
    # debug print: current weights and pass accuracy
    log_debug(self._feat_val_str(), '\n***')
    log_debug('PASS ACCURACY: %.3f' % self.evaluator.tree_accuracy())
    # print pass statistics (wall-clock duration of this pass)
    # NOTE(review): the original comment said "print and return statistics",
    # but nothing is returned here -- confirm whether callers expect a value
    self._print_pass_stats(pass_no, datetime.timedelta(seconds=(time.time() - pass_start_time)))
def init_run(self, input_da, max_iter=None, max_defic_iter=None, prune_size=None, beam_size=None):
    """Prepare a fresh A*-search generation run for the given input DA.

    Any parameter given here overrides the previously stored setting (e.g.,
    the one loaded from configuration when the object was created).

    @param input_da: the input DA for which a tree is to be generated
    @param max_iter: maximum number of iterations (hard termination)
    @param max_defic_iter: maximum number of deficit iterations (soft termination)
    @param prune_size: beam size for open list pruning
    @param beam_size: beam size for candidate expansion (expand more at a time if > 1)
    """
    log_debug('GEN TREE for DA: %s' % unicode(input_da))
    # seed the open list with the empty tree, keyed by
    # (-(score + future promise), -score, -future promise) for the min-heap
    empty_tree = TreeData()
    base_score = self.ranker.score(empty_tree, input_da)
    base_futpr = self.ranker.get_future_promise(empty_tree)
    seed_key = (-(base_score + base_futpr), -base_score, -base_futpr)
    self.open_list = CandidateList({empty_tree: seed_key})
    self.close_list = CandidateList()
    # reset the run-state counters and remember the input
    self.input_da = input_da
    self.defic_iter = 0
    self.num_iter = 0
    self.candgen.init_run(input_da)
    # apply any per-run overrides of the stored search settings
    overrides = (('max_iter', max_iter),
                 ('max_defic_iter', max_defic_iter),
                 ('prune_size', prune_size),
                 ('beam_size', beam_size))
    for attr, value in overrides:
        if value is not None:
            setattr(self, attr, value)
def _gen_update(self, gold, max_iter, max_defic_iter, prune_size, beam_size):
    """Try generating using the current weights, but update the weights
    after each iteration if the result is not going in the right direction
    (i.e., the current best candidate is not a subtree of the gold-standard tree).

    @param gold: the gold-standard Inst holding the input DA for generation and the reference tree
    @param max_iter: maximum number of A*-search iterations to run
    @param max_defic_iter: maximum number of deficit A*-search iterations (stopping criterion)
    @param prune_size: beam size for open list pruning
    @param beam_size: beam size for candidate expansion (expand more per iteration if > 1)
    @return: The best generated tree that is different from the gold-standard tree
    @rtype: Inst
    """
    log_debug('GEN-UPDATE')
    self.asearch_planner.init_run(gold.da, max_iter, max_defic_iter, prune_size, beam_size)
    while not self.asearch_planner.check_finalize():
        # run one A*search iteration
        self.asearch_planner.run_iter()
        # stop if there's nothing on the open list
        if not self.asearch_planner.open_list:
            break
        # look if we are on the right track to the gold tree:
        # compute the common-subtree indexes of the current top candidate
        cur_top, score = self.asearch_planner.open_list.peek()
        csi, _ = gold.tree.common_subtree_idxs(cur_top)
        # the current top is NOT entirely a subtree of the gold tree -> update
        if len(csi) != len(cur_top):
            feats = self._extract_feats(cur_top, gold.da)
            gen = Inst(tree=cur_top, da=gold.da, feats=feats, score=score)
            # for small wrong trees,
            # fake the current open list to only include a subtree of the gold tree
            # TODO fake it better, include more variants
            # update using a subtree of the gold tree
            if len(cur_top) < len(gold.tree):
                # gold-only node indexes, ordered by node depth
                # (NB: the `cmp=' keyword works in Python 2 only)
                diff = sorted(list(set(range(len(gold.tree))) - set(csi)),
                              cmp=gold.tree._compare_node_depth)
                # extend the common indexes with just enough shallow gold-only
                # nodes to match the size of the current top candidate
                # (the slice end is negative here, dropping the deepest surplus)
                gold_sub = gold.tree.get_subtree(csi + diff[0:len(cur_top) - len(gold.tree)])
                self.asearch_planner.open_list.clear()
                self.asearch_planner.open_list.push(gold_sub, score)
                # TODO speed up by remembering the features in planner
                feats = self._extract_feats(gold_sub, gold.da)
                gold_sub = Inst(tree=gold_sub, da=gold.da, feats=feats, score=0)
                self._update_weights(gold_sub, gen)
            # otherwise, update using the full gold tree
            else:
                self._update_weights(gold, gen)
    return self.get_best_generated(gold)
def _init_training(self, das_file, ttree_file, data_portion):
    """Load the training data via the parent class and initialize the
    perceptron weight vector.

    @param das_file: training DAs file path
    @param ttree_file: training trees file path
    @param data_portion: portion of the training data to use
    """
    # data loading / feature extraction is handled by the parent ranker
    super(PerceptronRanker, self)._init_training(das_file, ttree_file, data_portion)
    # start from all-ones weights (one weight per feature column)
    num_feats = self.train_feats.shape[1]
    self.w = np.ones(num_feats)
    self.update_weights_sum()
    # alternative: random Gaussian initialization
    # self.w = np.array([rnd.gauss(0, self.alpha) for _ in xrange(self.train_feats.shape[1])])
    log_debug('\n***\nINIT:')
    log_debug(self._feat_val_str())
    log_info('Training ...')
def generate_tree(self, da, gen_doc=None, return_lists=False):
    """Generate a tree for the given DA (1-best only).

    @param da: the input DA
    @param gen_doc: if given, the result is also stored into this PyTreex document
    @param return_lists: if True, return the search's open and close lists \
        (for inspection) instead of the tree
    @return: open & close lists if return_lists is set, None if gen_doc is \
        given, the best tree otherwise
    """
    # run the A*-search and take the 1-best result from the close list
    olist, clist = self.run(da, self.max_iter, self.max_defic_iter)
    top_tree, top_score = clist.peek()
    log_debug("RESULT: %12.5f %s" % (top_score, unicode(top_tree)))
    # store the result into the target document, if requested
    if gen_doc:
        target_zone = self.get_target_zone(gen_doc)
        target_zone.ttree = top_tree.create_ttree()
        target_zone.sentence = unicode(da)
    # return the lists for inspection, nothing (already stored), or the tree
    if return_lists:
        return olist, clist
    return None if gen_doc else top_tree
def generate_tree(self, da, gen_doc=None):
    """Generate a tree for the given DA (1-best only) and return it.

    @param da: The input DA
    @param gen_doc: Save the generated tree into this PyTreex document, if given
    @return: the generated tree
    """
    # run the A*-search; the best candidate ends up on the close list
    self.run(da)
    result, result_score = self.close_list.peek()
    log_debug("RESULT: %12.5f %s" % (result_score, unicode(result)))
    # optionally store the result into the given PyTreex document
    if gen_doc:
        tgt_zone = self.get_target_zone(gen_doc)
        tgt_zone.ttree = result.create_ttree()
        tgt_zone.sentence = unicode(da)
    return result
def _gen_cur_weights(self, gold, max_iter, max_defic_iter, prune_size, beam_size):
    """Run the A*-search planner (which scores candidates via this ranker's
    current weights) on the gold instance's DA and return the best generated
    tree that differs from the gold one.

    @param gold: the gold-standard Inst holding the input DA and the reference tree
    @param max_iter: maximum number of A*-search iterations to run
    @param max_defic_iter: maximum number of deficit A*-search iterations (stopping criterion)
    @param prune_size: beam size for open list pruning
    @param beam_size: beam size for candidate expansion (expand more per iteration if > 1)
    @return: the best generated tree that is different from the gold-standard tree
    @rtype: Inst
    """
    log_debug('GEN-CUR-WEIGHTS')
    # TODO make asearch_planner remember features (for last iteration, maybe)
    planner = self.asearch_planner
    planner.run(gold.da, max_iter, max_defic_iter, prune_size, beam_size)
    return self.get_best_generated(gold)
def _check_pending_request(self, sc, job_no, req):
    """Check whether the given request has finished (i.e., the job is loaded
    or the job has finished its training run). If it has, collect its value;
    workers that finished *loading* are returned to the pool of free services.

    @param sc: a ServiceConn object that stores the worker connection parameters
    @param job_no: current job number (is None for jobs loading)
    @param req: the request itself
    @return: the value returned by the finished request, or None \
        (for unfinished requests)
    """
    result = None
    if job_no is not None:
        log_debug('Checking %d' % job_no)
    # checking if the request has finished
    if req.ready:
        if job_no is not None:
            log_debug('Ready %d' % job_no)
            log_info('Retrieved finished request %d' % job_no)
        if req.error:
            log_info('Error found on request: job #%d, worker %s:%d' %
                     (job_no if job_no is not None else -1, sc.host, sc.port))
        result = req.value
        # remove from list of pending requests
        # TODO return to pool of free requests (but needs to store the results somewhere)
        # NOTE(review): only loading requests (job_no is None) return their
        # worker to free_services; workers that finished a training job do
        # not -- this matches the TODO above, but confirm it is intentional
        self.pending_requests.remove((sc, job_no, req))
        if job_no is None:
            self.free_services.append(sc)
    return result
def _check_pending_request(self, sc, job_no, req):
    """Poll a single pending request; if it has finished, collect its value.

    Finished loading requests (job_no is None) return their worker to the
    pool of free services; finished training requests only yield their value.

    @param sc: a ServiceConn object that stores the worker connection parameters
    @param job_no: current job number (is None for jobs loading)
    @param req: the request itself
    @return: the value of the finished request, or None for unfinished requests
    """
    if job_no is not None:
        log_debug('Checking %d' % job_no)
    # not done yet -- nothing to collect
    if not req.ready:
        return None
    if job_no is not None:
        log_debug('Ready %d' % job_no)
        log_info('Retrieved finished request %d' % job_no)
    if req.error:
        log_info('Error found on request: job #%d, worker %s:%d' %
                 (job_no if job_no is not None else -1, sc.host, sc.port))
    value = req.value
    # drop the request from the pending set; loading workers become free again
    # TODO return to pool of free requests (but needs to store the results somewhere)
    self.pending_requests.remove((sc, job_no, req))
    if job_no is None:
        self.free_services.append(sc)
    return value
def run_iter(self):
    """Run one iteration of the A*-search generation algorithm: move the best
    candidate(s) from the open list to the close list, try to expand them in
    all ways possible, and put the results back on the open list. Keeps track
    of the number of iterations and deficit iterations so that the termination
    condition evaluates properly.
    """
    # pop up to beam_size best candidates off the open list onto the close list
    cands = []
    while len(cands) < self.beam_size and self.open_list:
        cand, score = self.open_list.pop()
        self.close_list.push(cand, score[1])  # only use score without future promise
        cands.append(cand)
    # BUGFIX: log only when something was actually popped -- the original
    # guarded this with `len(cands) == 0', under which `cand' and `score'
    # are unbound (NameError)
    if cands:
        log_debug("-- IT %4d: O %5d S %12.5f -- %s" %
                  (self.num_iter, len(self.open_list), -score[1], unicode(cand)))
    # expand ALL popped candidates, skipping successors that are already closed
    # BUGFIX: the comprehension clauses were in the wrong order
    # (`for succ in ...get_all_successors(cand) for cand in cands'), so only
    # the successors of the last popped candidate were generated, repeated
    # len(cands) times
    successors = [succ
                  for cand in cands
                  for succ in self.candgen.get_all_successors(cand)
                  if succ not in self.close_list]
    if successors:
        # add candidates with score (negative for the min-heap)
        scores = self.ranker.score_all(successors, self.input_da)
        futprs = self.ranker.get_future_promise_all(successors)
        self.open_list.push_all([(succ, (-(score + futpr), -score, -futpr))
                                 for succ, score, futpr in zip(successors, scores, futprs)])
        # pruning (if supposed to do it)
        # TODO do not even add them on the open list when pruning
        if self.prune_size is not None:
            pruned = self.open_list.prune(self.prune_size)
            self.close_list.push_all(pruned)
    self.num_iter += 1
    # check where the score is higher -- on the open or on the close list
    # keep track of 'deficit' iterations (and do not allow more than the threshold)
    # TODO decide how to check this: should we check the combined score against the close list?
    if self.open_list and self.close_list:
        open_best_score, close_best_score = self.open_list.peek()[1][1], self.close_list.peek()[1]
        if open_best_score <= close_best_score:  # scores are negative, less is better
            self.defic_iter = 0
        else:
            self.defic_iter += 1
    if self.num_iter == self.max_iter:
        log_debug('ITERATION LIMIT REACHED')
    elif self.defic_iter == self.max_defic_iter:
        log_debug('DEFICIT ITERATION LIMIT REACHED')
def _check_pending_request(self, iter_no, sc, req_portion, req):
    """Check whether the given request has finished (i.e., job is loaded or
    job has processed the given data portion). If the request is finished,
    the worker that processed it is moved to the pool of free services.

    @param iter_no: current iteration number (for logging)
    @param sc: a ServiceConn object that stores the worker connection parameters
    @param req_portion: current data portion number (is None for jobs loading)
    @param req: the request itself
    @return: the value returned by the finished data processing request, or None \
        (for loading requests or unfinished requests)
    """
    result = None
    if req_portion is not None:
        log_debug('Checking %d' % req_portion)
    # checking if the request has finished
    if req.ready:
        # loading requests -- do nothing (just logging)
        if req_portion is None:
            if req.error:
                log_info('Error loading on %s:%d' % (sc.host, sc.port))
            else:
                log_info('Worker %s:%d finished loading.' % (sc.host, sc.port))
        # data processing request -- retrieve the value
        else:
            log_debug('Ready %d' % req_portion)
            log_info('Retrieved finished request %d / %d' % (iter_no, req_portion))
            if req.error:
                log_info('Error found on request: IT %d PORTION %d, WORKER %s:%d' %
                         (iter_no, req_portion, sc.host, sc.port))
            # NOTE(review): pickle.loads on worker-returned data is only safe
            # because the workers are spawned by this process (trusted input)
            result = pickle.loads(req.value)
        # add the worker to the pool of free services (both loading and data processing requests)
        self.pending_requests.remove((sc, req_portion, req))
        self.free_services.append(sc)
    if req_portion is not None:
        log_debug('Done with %d' % req_portion)
    return result
def _check_pending_request(self, iter_no, sc, req_portion, req):
    """Poll one pending request; if it has finished, collect its value and
    return the worker to the pool of free services.

    @param iter_no: current iteration number (for logging)
    @param sc: a ServiceConn object that stores the worker connection parameters
    @param req_portion: current data portion number (is None for jobs loading)
    @param req: the request itself
    @return: the value returned by the finished data processing request, or None \
        (for loading requests or unfinished requests)
    """
    is_data_request = req_portion is not None
    if is_data_request:
        log_debug('Checking %d' % req_portion)
    result = None
    if req.ready:
        if not is_data_request:
            # a loading request -- nothing to retrieve, just log the outcome
            if req.error:
                log_info('Error loading on %s:%d' % (sc.host, sc.port))
            else:
                log_info('Worker %s:%d finished loading.' % (sc.host, sc.port))
        else:
            # a data-processing request -- unpickle the returned value
            log_debug('Ready %d' % req_portion)
            log_info('Retrieved finished request %d / %d' % (iter_no, req_portion))
            if req.error:
                log_info('Error found on request: IT %d PORTION %d, WORKER %s:%d' %
                         (iter_no, req_portion, sc.host, sc.port))
            result = pickle.loads(req.value)
        # the worker is free again, whatever the request type was
        self.pending_requests.remove((sc, req_portion, req))
        self.free_services.append(sc)
    if is_data_request:
        log_debug('Done with %d' % req_portion)
    return result
def get_best_generated(self, gold):
    """Return the best generated tree that is different from the gold-standard
    tree (to be used for updates, if it scores better). Also, keep track of
    logging and update analyzer lists.

    @param gold: the gold-standard Inst from which the generated tree must differ
    @rtype: Inst
    """
    # record open/close list statistics for this run
    self.lists_analyzer.append(gold.tree,
                               self.asearch_planner.open_list,
                               self.asearch_planner.close_list)
    # pop candidates off the close list until one differs from the gold tree
    # NOTE(review): if the close list is empty, or runs out while every popped
    # tree equals the gold one, `gen_score' is unbound and the Inst creation
    # below raises NameError -- presumably the callers guarantee a non-empty
    # close list with a differing tree; confirm
    gen_tree = gold.tree
    while self.asearch_planner.close_list and gen_tree == gold.tree:
        gen_tree, gen_score = self.asearch_planner.close_list.pop()
    # scores are negative on the close list -- reverse the sign
    gen = Inst(tree=gen_tree, da=gold.da, score=-gen_score,
               feats=self._extract_feats(gen_tree, gold.da))
    # debug print: which tree would be selected, with both scores
    log_debug('SEL: GOLD' if gold.score >= gen.score else 'SEL: GEN')
    log_debug("GOLD:\t", "%12.5f" % gold.score, "\t", gold.tree)
    log_debug("GEN :\t", "%12.5f" % gen.score, "\t", gen.tree)
    return gen
def train(self, das_file, ttree_file, data_portion=1.0, context_file=None, validation_files=None):
    """Run parallel perceptron training, start and manage workers.

    @param das_file: training DAs file path (passed to workers as a relative path)
    @param ttree_file: training trees file path (passed to workers as a relative path)
    @param data_portion: portion of the training data to use
    @param context_file: optional context file path
    @param validation_files: optional comma-separated list of validation file paths
    """
    # initialize the ranker instance
    log_info('Initializing...')
    # run server to process registering clients
    self._init_server()
    # spawn training jobs
    log_info('Spawning jobs...')
    host_short, _ = self.host.split('.', 1)  # short host name for job names
    for j in xrange(self.jobs_number):
        # set up debugging logfile only if we have it on the head
        debug_logfile = ('"PRT%02d.debug-out.txt.gz"' % j) if is_debug_stream() else 'None'
        job = Job(header='from tgen.parallel_seq2seq_train import run_training',
                  code=('run_training("%s", %d, %s)' % (self.host, self.port, debug_logfile)),
                  name=self.experiment_id + ("PRT%02d-%s-%d" % (j, host_short, self.port)),
                  work_dir=self.work_dir)
        job.submit(memory=self.job_memory, queue=self.queue_settings)
        self.jobs.append(job)
    # run the training passes
    try:
        cur_assign = 0
        results = [None] * self.jobs_number
        rnd_seeds = [rnd.random() for _ in xrange(self.jobs_number)]
        # assign training and wait for it to finish
        while cur_assign < self.jobs_number or self.pending_requests:
            log_debug('Starting loop over services.')
            # check if some of the pending computations have finished
            for sc, job_no, req in list(self.pending_requests):
                res = self._check_pending_request(sc, job_no, req)
                if res is not None:
                    results[job_no] = res, sc
            # check for free services and assign new computation
            while cur_assign < self.jobs_number and self.free_services:
                log_debug('Assigning request %d' % cur_assign)
                sc = self.free_services.popleft()
                log_info('Assigning request %d to %s:%d' % (cur_assign, sc.host, sc.port))
                # make validation paths relative to the workers' working directory
                # NOTE(review): this conversion is re-applied on EVERY assignment,
                # so from the second worker on, relpath runs on already-relative
                # paths and likely corrupts them -- should be hoisted above the
                # loop; confirm and fix
                if validation_files is not None:
                    validation_files = ','.join([os.path.relpath(f, self.work_dir)
                                                 for f in validation_files.split(',')])
                train_func = async(sc.conn.root.train)
                req = train_func(rnd_seeds[cur_assign],
                                 os.path.relpath(das_file, self.work_dir),
                                 os.path.relpath(ttree_file, self.work_dir),
                                 data_portion,
                                 os.path.relpath(context_file, self.work_dir) if context_file else None,
                                 validation_files)
                self.pending_requests.add((sc, cur_assign, req))
                cur_assign += 1
                log_debug('Assigned %d' % cur_assign)
            # sleep for a while
            log_debug('Sleeping.')
            time.sleep(self.poll_interval)
        log_info("Results:\n" + "\n".join("%.5f %s:%d" % (cost, sc.host, sc.port)
                                          for cost, sc in results))
        self.model_temp_path = os.path.join(self.work_dir, self.TEMPFILE_NAME)
        # sort the results by cost (lowest first)
        results.sort(key=lambda res: res[0])
        # average the computed models
        if self.average_models:
            log_info('Creating ensemble models...')
            # use only top k if required
            results_for_ensemble = (results[:self.average_models_top_k]
                                    if self.average_models_top_k > 0
                                    else results)
            ensemble_model = self.build_ensemble_model(results_for_ensemble)
            log_info('Saving the ensemble model temporarily to %s...' % self.model_temp_path)
            ensemble_model.save_to_file(self.model_temp_path)
        # select the best result on devel data + save it
        else:
            best_cost, best_sc = results[0]
            log_info('Best cost: %f (computed at %s:%d).' % (best_cost, best_sc.host, best_sc.port))
            log_info('Saving best generator temporarily to %s...' % self.model_temp_path)
            # use relative path (working directory of worker jobs is different)
            best_sc.conn.root.save_model(os.path.relpath(self.model_temp_path, self.work_dir))
    # kill all jobs
    finally:
        for job in self.jobs:
            job.delete()
def _get_rival_candidates(self, gold, tree_no, strategy):
    """Generate some rival candidates for a DA and the correct (gold) tree,
    given a strategy; using other DAs for the correct tree, other trees for
    the correct DA, or random trees.

    NB: This has not been shown to be usable in practice; use _gen_cur_weights() instead.

    TODO: checking for trees identical to the gold one slows down the process

    @param gold: the gold-standard Inst (DA, tree, features, score)
    @param tree_no: the index of the current training data item (tree, DA)
    @param strategy: the rival generation strategy ('other_inst' or 'other_da')
    @return: the top-scoring rival, wrapped as an Inst
    @rtype: Inst
    """
    train_trees = self.train_trees
    rival_das, rival_trees, rival_feats = [], [], []
    if strategy != 'other_da':
        rival_das = [gold.da] * self.rival_number
    # use current DA but change trees when computing features
    if strategy == 'other_inst':
        # use alternative indexes, avoid the correct one
        # (the gold index is remapped to the last one, which rnd.sample,
        # drawing from xrange(len - 1), can never return)
        rival_idxs = map(lambda idx: len(train_trees) - 1 if idx == tree_no else idx,
                         rnd.sample(xrange(len(train_trees) - 1), self.rival_number))
        other_inst_trees = [train_trees[rival_idx] for rival_idx in rival_idxs]
        rival_trees.extend(other_inst_trees)
        rival_feats.extend([self._extract_feats(tree, gold.da) for tree in other_inst_trees])
    # use the current gold tree but change DAs when computing features
    if strategy == 'other_da':
        rival_idxs = map(lambda idx: len(train_trees) - 1 if idx == tree_no else idx,
                         rnd.sample(xrange(len(train_trees) - 1), self.rival_number))
        other_inst_das = [self.train_das[rival_idx] for rival_idx in rival_idxs]
        rival_das.extend(other_inst_das)
        rival_trees.extend([self.train_trees[tree_no]] * self.rival_number)
        rival_feats.extend([self._extract_feats(self.train_trees[tree_no], da)
                            for da in other_inst_das])
    # # candidates generated using the random planner (use the current DA)
    # if strategy == 'random':
    #     random_trees = []
    #     while len(random_trees) < self.rival_number:
    #         tree = self.sampling_planner.generate_tree(da)
    #         if (tree != train_trees[tree_no]):  # don't generate trees identical to the gold one
    #             random_trees.append(tree)
    #     rival_trees.extend(random_trees)
    #     rival_feats.extend([self._extract_feats(tree, da) for tree in random_trees])
    # score them along with the right one
    rival_scores = [self._score(r) for r in rival_feats]
    top_rival_idx = rival_scores.index(max(rival_scores))
    gen = Inst(tree=rival_trees[top_rival_idx], da=rival_das[top_rival_idx],
               score=rival_scores[top_rival_idx], feats=rival_feats[top_rival_idx])
    # debug print: candidate trees
    log_debug('#RIVALS: %02d' % len(rival_feats))
    log_debug('SEL: GOLD' if gold.score >= gen.score else ('SEL: RIVAL#%d' % top_rival_idx))
    log_debug('ALL CAND TREES:')
    for ttree, score in zip([gold.tree] + rival_trees, [gold.score] + rival_scores):
        log_debug("%12.5f" % score, "\t", ttree)
    return gen
def train(self, das_file, ttree_file, data_portion=1.0):
    """Run parallel perceptron training, start and manage workers.

    @param das_file: training DAs file path
    @param ttree_file: training trees file path
    @param data_portion: portion of the training data to use
    """
    # initialize the ranker instance
    log_info('Initializing...')
    self.loc_ranker._init_training(das_file, ttree_file, data_portion)
    # run server to process registering clients
    self._init_server()
    # spawn training jobs
    log_info('Spawning jobs...')
    host_short, _ = self.host.split('.', 1)  # short host name for job names
    for j in xrange(self.jobs_number):
        # set up debugging logfile only if we have it on the head
        debug_logfile = ('"PRT%02d.debug-out.txt.gz"' % j) if is_debug_stream() else 'None'
        job = Job(header='from tgen.parallel_percrank_train import run_worker',
                  code=('run_worker("%s", %d, %s)' % (self.host, self.port, debug_logfile)),
                  name=self.experiment_id + ("PRT%02d-%s-%d" % (j, host_short, self.port)),
                  work_dir=self.work_dir)
        job.submit(self.job_memory)
        self.jobs.append(job)
    # run the training passes
    try:
        for iter_no in xrange(1, self.loc_ranker.passes + 1):
            log_info('Pass %d...' % iter_no)
            log_debug('\n***\nTR%05d:' % iter_no)
            iter_start_time = time.time()
            cur_portion = 0
            results = [None] * self.data_portions
            # serialize the current weights for shipping to the workers
            w_dump = pickle.dumps(self.loc_ranker.get_weights(), protocol=pickle.HIGHEST_PROTOCOL)
            rnd_seeds = [rnd.random() for _ in xrange(self.data_portions)]
            # wait for free services / assign computation
            while cur_portion < self.data_portions or self.pending_requests:
                log_debug('Starting loop over services.')
                # check if some of the pending computations have finished
                # NOTE(review): `if res:' would also drop a legitimately falsy
                # result -- confirm workers never return one
                for sc, req_portion, req in list(self.pending_requests):
                    res = self._check_pending_request(iter_no, sc, req_portion, req)
                    if res:
                        results[req_portion] = res
                # check for free services and assign new computation
                while cur_portion < self.data_portions and self.free_services:
                    log_debug('Assigning request %d' % cur_portion)
                    sc = self.free_services.popleft()
                    log_info('Assigning request %d / %d to %s:%d' %
                             (iter_no, cur_portion, sc.host, sc.port))
                    train_func = async(sc.conn.root.training_pass)
                    req = train_func(w_dump, iter_no, rnd_seeds[cur_portion],
                                     *self._get_portion_bounds(cur_portion))
                    self.pending_requests.add((sc, cur_portion, req))
                    cur_portion += 1
                    log_debug('Assigned %d' % cur_portion)
                # sleep for a while
                log_debug('Sleeping.')
                time.sleep(self.poll_interval)
            # delete the temporary ranker dump when the 1st iteration is complete
            if self.ranker_dump_path:
                log_info('Removing temporary ranker dump at %s.' % self.ranker_dump_path)
                os.remove(self.ranker_dump_path)
                self.ranker_dump_path = None
            # gather/average the diagnostic statistics
            self.loc_ranker.set_diagnostics_average([d for _, d in results])
            # take an average of weights; set it as new w
            self.loc_ranker.set_weights_average([w for w, _ in results])
            self.loc_ranker.store_iter_weights()  # store a copy of w for averaged perceptron
            # print statistics
            log_debug(self.loc_ranker._feat_val_str(), '\n***')
            self.loc_ranker._print_pass_stats(iter_no,
                                              datetime.timedelta(seconds=(time.time() - iter_start_time)))
        # after all passes: average weights if set to do so
        if self.loc_ranker.averaging is True:
            self.loc_ranker.set_weights_iter_average()
    # kill all jobs
    finally:
        for job in self.jobs:
            job.delete()
def train(self, das_file, ttree_file, data_portion=1.0, context_file=None, validation_files=None): """Run parallel perceptron training, start and manage workers.""" # initialize the ranker instance log_info('Initializing...') # run server to process registering clients self._init_server() # spawn training jobs log_info('Spawning jobs...') host_short, _ = self.host.split('.', 1) # short host name for job names for j in xrange(self.jobs_number): # set up debugging logfile only if we have it on the head debug_logfile = ('"PRT%02d.debug-out.txt.gz"' % j) if is_debug_stream() else 'None' job = Job(header='from tgen.parallel_seq2seq_train import run_training', code=('run_training("%s", %d, %s)' % (self.host, self.port, debug_logfile)), name=self.experiment_id + ("PRT%02d-%s-%d" % (j, host_short, self.port)), work_dir=self.work_dir) job.submit(memory=self.job_memory, queue=self.queue_settings) self.jobs.append(job) # run the training passes try: cur_assign = 0 results = [None] * self.jobs_number rnd_seeds = [rnd.random() for _ in xrange(self.jobs_number)] # assign training and wait for it to finish while cur_assign < self.jobs_number or self.pending_requests: log_debug('Starting loop over services.') # check if some of the pending computations have finished for sc, job_no, req in list(self.pending_requests): res = self._check_pending_request(sc, job_no, req) if res is not None: results[job_no] = res, sc # check for free services and assign new computation while cur_assign < self.jobs_number and self.free_services: log_debug('Assigning request %d' % cur_assign) sc = self.free_services.popleft() log_info('Assigning request %d to %s:%d' % (cur_assign, sc.host, sc.port)) if validation_files is not None: validation_files = ','.join([os.path.relpath(f, self.work_dir) for f in validation_files.split(',')]) train_func = async(sc.conn.root.train) req = train_func(rnd_seeds[cur_assign], os.path.relpath(das_file, self.work_dir), os.path.relpath(ttree_file, self.work_dir), 
data_portion, os.path.relpath(context_file, self.work_dir) if context_file else None, validation_files) self.pending_requests.add((sc, cur_assign, req)) cur_assign += 1 log_debug('Assigned %d' % cur_assign) # sleep for a while log_debug('Sleeping.') time.sleep(self.poll_interval) log_info("Results:\n" + "\n".join("%.5f %s:%d" % (cost, sc.host, sc.port) for cost, sc in results)) self.model_temp_path = os.path.join(self.work_dir, self.TEMPFILE_NAME) results.sort(key=lambda res: res[0]) # average the computed models if self.average_models: log_info('Creating ensemble models...') # use only top k if required results_for_ensemble = (results[:self.average_models_top_k] if self.average_models_top_k > 0 else results) ensemble_model = self.build_ensemble_model(results_for_ensemble) log_info('Saving the ensemble model temporarily to %s...' % self.model_temp_path) ensemble_model.save_to_file(self.model_temp_path) # select the best result on devel data + save it else: best_cost, best_sc = results[0] log_info('Best cost: %f (computed at %s:%d).' % (best_cost, best_sc.host, best_sc.port)) log_info('Saving best generator temporarily to %s...' % self.model_temp_path) # use relative path (working directory of worker jobs is different) best_sc.conn.root.save_model(os.path.relpath(self.model_temp_path, self.work_dir)) # kill all jobs finally: for job in self.jobs: job.delete()
def _get_rival_candidates(self, gold, tree_no, strategy):
    """Generate rival (incorrect) candidates for the gold DA + tree pair,
    according to the given strategy: pair the gold DA with other training
    trees ('other_inst'), or the gold tree with other training DAs ('other_da').

    NB: This has not been shown to be usable in practice; use _gen_cur_weights() instead.

    TODO: checking for trees identical to the gold one slows down the process

    @param gold: the gold-standard Inst (DA, tree, features, score)
    @param tree_no: the index of the current training data item (tree, DA)
    @param strategy: the rival generation strategy ('other_inst' or 'other_da')
    @return: the top-scoring rival, wrapped as an Inst
    @rtype: Inst
    """
    all_trees = self.train_trees
    rival_das, rival_trees, rival_feats = [], [], []
    if strategy != 'other_da':
        rival_das = [gold.da] * self.rival_number

    def sample_other_idxs():
        # sample random training indexes; the gold one is remapped to the
        # last index, which the sample (drawn from xrange(len - 1)) never hits
        return [len(all_trees) - 1 if i == tree_no else i
                for i in rnd.sample(xrange(len(all_trees) - 1), self.rival_number)]

    if strategy == 'other_inst':
        # keep the gold DA, pair it with other training trees
        alt_trees = [all_trees[i] for i in sample_other_idxs()]
        rival_trees.extend(alt_trees)
        rival_feats.extend([self._extract_feats(t, gold.da) for t in alt_trees])
    if strategy == 'other_da':
        # keep the gold tree, pair it with other training DAs
        alt_das = [self.train_das[i] for i in sample_other_idxs()]
        rival_das.extend(alt_das)
        rival_trees.extend([self.train_trees[tree_no]] * self.rival_number)
        rival_feats.extend([self._extract_feats(self.train_trees[tree_no], da)
                            for da in alt_das])
    # score all rivals and pick the best-scoring one
    rival_scores = [self._score(f) for f in rival_feats]
    best_idx = rival_scores.index(max(rival_scores))
    gen = Inst(tree=rival_trees[best_idx], da=rival_das[best_idx],
               score=rival_scores[best_idx], feats=rival_feats[best_idx])
    # debug print: candidate trees
    log_debug('#RIVALS: %02d' % len(rival_feats))
    log_debug('SEL: GOLD' if gold.score >= gen.score else ('SEL: RIVAL#%d' % best_idx))
    log_debug('ALL CAND TREES:')
    for cand_tree, cand_score in zip([gold.tree] + rival_trees, [gold.score] + rival_scores):
        log_debug("%12.5f" % cand_score, "\t", cand_tree)
    return gen
def train(self, das_file, ttree_file, data_portion=1.0): """Run parallel perceptron training, start and manage workers.""" # initialize the ranker instance log_info('Initializing...') self.loc_ranker._init_training(das_file, ttree_file, data_portion) # run server to process registering clients self._init_server() # spawn training jobs log_info('Spawning jobs...') host_short, _ = self.host.split('.', 1) # short host name for job names for j in xrange(self.jobs_number): # set up debugging logfile only if we have it on the head debug_logfile = ('"PRT%02d.debug-out.txt.gz"' % j) if is_debug_stream() else 'None' job = Job( header='from tgen.parallel_percrank_train import run_worker', code=('run_worker("%s", %d, %s)' % (self.host, self.port, debug_logfile)), name=self.experiment_id + ("PRT%02d-%s-%d" % (j, host_short, self.port)), work_dir=self.work_dir) job.submit(self.job_memory) self.jobs.append(job) # run the training passes try: for iter_no in xrange(1, self.loc_ranker.passes + 1): log_info('Pass %d...' 
% iter_no) log_debug('\n***\nTR%05d:' % iter_no) iter_start_time = time.time() cur_portion = 0 results = [None] * self.data_portions w_dump = pickle.dumps(self.loc_ranker.get_weights(), protocol=pickle.HIGHEST_PROTOCOL) rnd_seeds = [rnd.random() for _ in xrange(self.data_portions)] # wait for free services / assign computation while cur_portion < self.data_portions or self.pending_requests: log_debug('Starting loop over services.') # check if some of the pending computations have finished for sc, req_portion, req in list(self.pending_requests): res = self._check_pending_request( iter_no, sc, req_portion, req) if res: results[req_portion] = res # check for free services and assign new computation while cur_portion < self.data_portions and self.free_services: log_debug('Assigning request %d' % cur_portion) sc = self.free_services.popleft() log_info('Assigning request %d / %d to %s:%d' % (iter_no, cur_portion, sc.host, sc.port)) train_func = async (sc.conn.root.training_pass) req = train_func( w_dump, iter_no, rnd_seeds[cur_portion], *self._get_portion_bounds(cur_portion)) self.pending_requests.add((sc, cur_portion, req)) cur_portion += 1 log_debug('Assigned %d' % cur_portion) # sleep for a while log_debug('Sleeping.') time.sleep(self.poll_interval) # delete the temporary ranker dump when the 1st iteration is complete if self.ranker_dump_path: log_info('Removing temporary ranker dump at %s.' 
% self.ranker_dump_path) os.remove(self.ranker_dump_path) self.ranker_dump_path = None # gather/average the diagnostic statistics self.loc_ranker.set_diagnostics_average( [d for _, d in results]) # take an average of weights; set it as new w self.loc_ranker.set_weights_average([w for w, _ in results]) self.loc_ranker.store_iter_weights( ) # store a copy of w for averaged perceptron # print statistics log_debug(self.loc_ranker._feat_val_str(), '\n***') self.loc_ranker._print_pass_stats( iter_no, datetime.timedelta(seconds=(time.time() - iter_start_time))) # after all passes: average weights if set to do so if self.loc_ranker.averaging is True: self.loc_ranker.set_weights_iter_average() # kill all jobs finally: for job in self.jobs: job.delete()
def _training_pass(self, pass_no):
    """Perform a single pass over the training data: generate rival
    candidates for each gold instance, update the weights where a rival
    outscores the gold tree (keeping weight copies for possible
    averaging), and collect diagnostic values.

    @param pass_no: number of the current training pass
    """
    start_time = time.time()
    self.reset_diagnostics()
    self.update_weights_sum()
    log_debug('\n***\nTR %05d:' % pass_no)

    # resolve the rival-generation settings for this pass
    max_iter = self._get_num_iters(pass_no, self.rival_gen_max_iter)
    max_defic_iter = self._get_num_iters(pass_no, self.rival_gen_max_defic_iter)
    beam_size = self.rival_gen_beam_size
    prune_size = self.rival_gen_prune_size
    strategies = self._get_rival_gen_strategy(pass_no)

    for tree_no in self.train_order:
        log_debug('TREE-NO: %d' % tree_no)
        log_debug('SENT: %s' % self.train_sents[tree_no])
        gold = Inst(da=self.train_das[tree_no],
                    tree=self.train_trees[tree_no],
                    score=self._score(self.train_feats[tree_no]),
                    feats=self.train_feats[tree_no])

        # obtain some 'rival', alternative incorrect candidates
        for strategy in strategies:
            if strategy == 'gen_cur_weights':
                # generate using current weights
                gen = self._gen_cur_weights(gold, max_iter, max_defic_iter,
                                            prune_size, beam_size)
            elif strategy == 'gen_update':
                # generate while trying to update weights
                gen = self._gen_update(gold, max_iter, max_defic_iter,
                                       prune_size, beam_size)
            else:
                # check against other possible candidates/combinations
                gen = self._get_rival_candidates(gold, tree_no, strategy)

            # evaluate the top-scoring generated tree against the gold t-tree
            # (disregarding whether it was selected as the best one)
            self.evaluator.append(TreeNode(gold.tree), TreeNode(gen.tree),
                                  gold.score, gen.score)

            # update weights unless the gold-standard tree got the highest score
            if gold.score < gen.score:
                self._update_weights(gold, gen)

        # store a copy of the current weights for averaging
        self.store_iter_weights()

    # debug print: current weights and pass accuracy
    log_debug(self._feat_val_str(), '\n***')
    log_debug('PASS ACCURACY: %.3f' % self.evaluator.tree_accuracy())
    # print and return statistics
    self._print_pass_stats(pass_no,
                           datetime.timedelta(seconds=(time.time() - start_time)))
def run(self, da, max_iter=None, max_defic_iter=None, beam_size=None):
    """Run the A*-search generation and after it finishes, return the open
    and close lists.

    @param da: the input dialogue act
    @param max_iter: maximum number of iterations for generation \
        (defaults to self.max_iter)
    @param max_defic_iter: maximum number of consecutive 'deficit' \
        iterations (open list's best score worse than close list's) \
        before stopping; None = no limit
    @param beam_size: beam size for pruning the open list; None = no pruning
    @rtype: tuple
    @return: the resulting open and close lists
    """
    # initialization: seed the open list with the empty tree
    empty_tree = TreeData()
    et_score = self.ranker.score(empty_tree, da)
    et_futpr = self.ranker.get_future_promise(empty_tree)
    open_list = CandidateList({empty_tree: (-(et_score + et_futpr), -et_score, -et_futpr)})
    close_list = CandidateList()
    num_iter = 0
    defic_iter = 0
    cdfs = self.candgen.get_merged_child_type_cdfs(da)
    node_limits = self.candgen.get_merged_limits(da)
    if not max_iter:
        max_iter = self.max_iter

    log_debug('GEN TREE for DA: %s' % unicode(da))

    # main search loop
    while open_list and num_iter < max_iter and (max_defic_iter is None
                                                 or defic_iter <= max_defic_iter):
        # log_debug("   OPEN : %s" % str(open_list))
        # log_debug("   CLOSE: %s" % str(close_list))
        cand, score = open_list.pop()
        close_list.push(cand, score[1])  # only use score without future promise
        log_debug("-- IT %4d: O %5d S %12.5f -- %s" %
                  (num_iter, len(open_list), -score[1], unicode(cand)))
        successors = [succ for succ
                      in self.candgen.get_all_successors(cand, cdfs, node_limits)
                      if succ not in close_list]
        if successors:
            # add candidates with score (negative for the min-heap)
            scores = self.ranker.score_all(successors, da)
            futprs = self.ranker.get_future_promise_all(successors)
            open_list.push_all([(succ, (-(score + futpr), -score, -futpr))
                                for succ, score, futpr
                                in zip(successors, scores, futprs)])
        # pruning (if supposed to do it)
        # TODO do not even add them on the open list when pruning
        if beam_size is not None:
            pruned = open_list.prune(beam_size)
            close_list.push_all(pruned)
        num_iter += 1
        # check where the score is higher -- on the open or on the close list
        # keep track of 'deficit' iterations (and do not allow more than the threshold)
        # TODO decide how to check this: should we check the combined score
        # against the close list?
        if open_list and close_list:
            open_best_score, close_best_score = open_list.peek()[1][1], close_list.peek()[1]
            if open_best_score <= close_best_score:  # scores are negative, less is better
                defic_iter = 0
            else:
                defic_iter += 1

    if num_iter == max_iter:
        log_debug('ITERATION LIMIT REACHED')
    # fix: on deficit exit, defic_iter has already been incremented PAST the
    # limit (== max_defic_iter + 1), so the original '== max_defic_iter'
    # check never fired; the None guard avoids py2's int > None == True
    elif max_defic_iter is not None and defic_iter > max_defic_iter:
        log_debug('DEFICIT ITERATION LIMIT REACHED')

    # now push everything from open list to close list, getting rid of future cost
    while open_list:
        cand, score = open_list.pop()
        close_list.push(cand, score[1])

    return open_list, close_list
def _training_pass(self, pass_no):
    """Run one training pass, update weights (store them for possible averaging),
    and store diagnostic values.

    For each training item, rival candidates are generated per strategy,
    scored together with the gold candidate, and the weights are updated
    whenever the gold candidate is not the top scorer.

    @param pass_no: number of the current training pass (used for logging
        and for pass-dependent iteration limits)
    """
    pass_start_time = time.time()
    self.reset_diagnostics()
    self.update_weights_sum()
    log_debug('\n***\nTR %05d:' % pass_no)
    # rival-generation limits may vary by pass number
    rgen_max_iter = self._get_num_iters(pass_no, self.rival_gen_max_iter)
    rgen_max_defic_iter = self._get_num_iters(pass_no, self.rival_gen_max_defic_iter)
    rgen_beam_size = self.rival_gen_beam_size
    for tree_no in self.train_order:
        # obtain some 'rival', alternative incorrect candidates
        gold_da, gold_tree, gold_feats = self.train_das[tree_no], self.train_trees[tree_no], self.train_feats[tree_no]
        for strategy in self.rival_gen_strategy:
            # NOTE(review): rival_das/rival_trees/rival_feats are assumed to be
            # parallel lists of equal length -- confirm in _get_rival_candidates
            rival_das, rival_trees, rival_feats = self._get_rival_candidates(tree_no, strategy, rgen_max_iter, rgen_max_defic_iter, rgen_beam_size)
            # gold features go first, so index 0 == gold below
            cands = [gold_feats] + rival_feats
            # score them along with the right one
            scores = [self._score(cand) for cand in cands]
            top_cand_idx = scores.index(max(scores))
            # best rival only (skip the gold score at position 0)
            top_rival_idx = scores[1:].index(max(scores[1:]))
            top_rival_tree = rival_trees[top_rival_idx]
            top_rival_da = rival_das[top_rival_idx]
            # find the top-scoring generated tree, evaluate against gold t-tree
            # (disregarding whether it was selected as the best one)
            self.evaluator.append(TreeNode(gold_tree), TreeNode(top_rival_tree), scores[0], max(scores[1:]))
            # debug print: candidate trees
            log_debug('TTREE-NO: %04d, SEL_CAND: %04d, LEN: %02d' % (tree_no, top_cand_idx, len(cands)))
            log_debug('SENT: %s' % self.train_sents[tree_no])
            log_debug('ALL CAND TREES:')
            for ttree, score in zip([gold_tree] + rival_trees, scores):
                log_debug("%12.5f" % score, "\t", ttree)
            # update weights if the system doesn't give the highest score to the right one
            if top_cand_idx != 0:
                self._update_weights(gold_da, top_rival_da, gold_tree, top_rival_tree, gold_feats, cands[top_cand_idx])
        # store a copy of the current weights for averaging
        self.store_iter_weights()
    # debug print: current weights and pass accuracy
    log_debug(self._feat_val_str(), '\n***')
    log_debug('PASS ACCURACY: %.3f' % self.evaluator.tree_accuracy())
    # print and return statistics
    self._print_pass_stats(pass_no, datetime.timedelta(seconds=(time.time() - pass_start_time)))