Esempio n. 1
0
    def test_IdentityNeedTerminalTransduce(self):
        """Transduce an NP whose children are swapped by the grammar.

        'beautiful' and 'house' have explicit terminal rules, but 'home'
        does not, so the Identity back-off must supply its translation.
        """
        swap_np = XTRule(self.S, tree_or_string('(NP ?x0|JJ ?x1|NN)'),
                         tree_or_string('(NP ?x1|NN ?x0|JJ)'), {
                             (0, ): self.S,
                             (1, ): self.S
                         }, 1.0)
        keep_jj = XTRule(self.S, tree_or_string('(JJ ?x0|)'),
                         tree_or_string('(JJ ?x0|)'), {(0, ): 't'}, 1.0)
        keep_nn = XTRule(self.S, tree_or_string('(NN ?x0|)'),
                         tree_or_string('(NN ?x0|)'), {(0, ): 't'}, 0.8)
        adj_terminal = XTRule('t', tree_or_string('beautiful'),
                              tree_or_string('bonita'), {}, 1.0)
        noun_terminal = XTRule('t', tree_or_string('house'),
                               tree_or_string('casa'), {}, 1.0)
        rules = [swap_np, keep_jj, keep_nn, adj_terminal, noun_terminal]
        rule_backoffs = [Identity()]
        rule_index = RuleIndexT2T(rules, rule_backoffs)
        tree1 = tree_or_string(u'(NP (JJ beautiful) (NN home))')
        tree2 = None
        transducer = xT(self.S, rules, rule_backoffs)
        wrtg = transducer.Transduce(tree1)
        derivation = wrtg.ObtainBestDerivation()

        # Source projection keeps source leaves but applies the NP swap.
        src_projection, _ = SourceProjectionFromDerivationStrict(derivation)
        expected_src_projection = tree_or_string(
            u'(NP (NN home) (JJ beautiful))')
        self.assertEqual(expected_src_projection, src_projection)

        # Target projection translates 'beautiful' but passes 'home'
        # through via the Identity back-off.
        trg_projection, _ = TargetProjectionFromDerivation(derivation)
        expected_trg_projection = tree_or_string(u'(NP (NN home) (JJ bonita))')
        self.assertEqual(expected_trg_projection, trg_projection)
Esempio n. 2
0
 def setUp(self):
     """Build a tiny ambiguous transducer (A -> O or A -> P) for the tests."""
     self.S = 'q0'
     a_to_o = XTRule(self.S, tree_or_string('(A ?x0|)'),
                     tree_or_string('(O ?x0|)'), {(0, ): self.S}, 1.0)
     a_to_p = XTRule(self.S, tree_or_string('(A ?x0|)'),
                     tree_or_string('(P ?x0|)'), {(0, ): self.S}, 1.0)
     leaf = XTRule(self.S, tree_or_string('a'), tree_or_string('o'), {},
                   1.0)
     self.rules = [a_to_o, a_to_p, leaf]
     self.transducer = xT(self.S, self.rules)
     self.model = PerceptronModel
Esempio n. 3
0
def ObtainWRTGAugmented(weighted_tree_pair):
  """
  Given a transducer and a weighted source/target tree, it returns a tuple
  that contains the wRTG and the weighted pair. If the transducer fails at
  explaining the source/target tree with the rules it has, then it returns
  a tuple (None, None). The weights of the RTG are not estimated here.

  A one-character progress mark is printed to stderr per pair:
  'o' success, 'x' augmented parse failed, 'X' source-only parse failed
  (upper-cased when no gold target tree is provided).

  global variables used here (bad practice, but need for parallelization):
    * transducer
    * feat_inst
    * model_class
    * GetScoreOfDerivation
    * CombineScoresOfDerivations
  """
  global transducer
  intree_str, outtree_str, pair_weight = weighted_tree_pair
  # First pass: parse the source tree alone (target is None) to discover
  # which productions/rules fire on it.
  wrtg = ObtainWRTG((intree_str, None, pair_weight), print_result=False)[0]
  if not wrtg or not wrtg.P:
    # Source-only parse failed; nothing to augment.
    output = (None, None)
    result_str = 'X'
  else:
    # Derive extra predicate rules from the entities found in the
    # source-side productions, and merge them (deduplicated) with the
    # transducer's existing rules.
    productions = AddAllPredicatesForEntitiesFromProds(wrtg.P, transducer.linker)
    rules = list(set(p.rhs.rule for p in productions))
    rules_augmented = list(set(transducer.rules[:] + rules))
    transducer_aug = xT(
      transducer.start_state, rules_augmented, transducer.rule_index.rule_backoffs)
    # Temporarily swap the global transducer so ObtainWRTG (which reads the
    # global) uses the augmented rule set, then restore the original.
    # NOTE(review): the restore is not in a try/finally — an exception inside
    # ObtainWRTG would leave the global pointing at the augmented transducer.
    transducer_back = transducer
    transducer = transducer_aug
    wrtg = ObtainWRTG((intree_str, outtree_str, pair_weight), print_result=False)[0]
    transducer = transducer_back
    if not wrtg or not wrtg.P:
      output = (None, None)
      result_str = 'x'
    else:
      # Attach the scoring callbacks and populate model features on the wRTG.
      wrtg.ScoreDerivation = GetScoreOfDerivation
      wrtg.CombineDerivationScores = CombineScoresOfDerivations
      if feat_inst:
        feat_inst.SetContext({'src_tree' : intree_str})
      model_class.populate_wrtg_feats(wrtg, feat_inst)
      output = (wrtg, weighted_tree_pair)
      result_str = 'o'
  sys.stdout.flush()
  # Upper-case marks signal "no gold target" (decoding-style) runs.
  result_str = result_str if outtree_str is not None else result_str.upper()
  print(result_str, end='', file=sys.stderr)
  return output
    def run(self):
        """Train a (averaged) perceptron tree-transducer model.

        Loads the corpus and rules named in self.__input, builds a
        transducer with the configured similarity back-offs, trains the
        selected model, and always saves the model and closes all
        resources in the finally block — even on error or Ctrl-C.
        """
        # Build feature instantiator.
        descriptions_filename = self.__feat_inst
        feat_inst = FeatureInstantiator(
            descriptions_filename,
            feat_names_filename=self.__feat_names_filename)

        # Corpus is a list of [tree, string]s.
        corpus_filename = self.__input[0]
        corpus = LoadCorpus(corpus_filename)
        # The algorithm for training tree transducers expects a triplet:
        #   source tree, target_tree, weight (of the pair).
        # All pairs get the same uniform weight of 0.5 here.
        corpus = [(src_tree, trg_tree, 0.5) for (src_tree, trg_tree) in corpus]

        # Build transducer with back-off cost functions.  Each option adds
        # one back-off; imports are deferred so unused dependencies are
        # never loaded.
        rule_backoffs = []
        if self.__lsdictent:
            from linguistics.similarity_dict_entities import DictEntities
            dict_filename, feature_weight = self.__lsdictent
            rule_backoffs.append(
                DictEntities(dict_filename, float(feature_weight)))
        if self.__lsdictbent:
            from linguistics.similarity_dict_entities import DictBridgeEntities
            dict_filename, feature_weight = self.__lsdictbent
            rule_backoffs.append(
                DictBridgeEntities(dict_filename, float(feature_weight)))
        if self.__lsdictpred:
            from linguistics.similarity_dict_entities import DictPredicates
            dict_filename, feature_weight = self.__lsdictpred
            dict_predicates = DictPredicates(dict_filename,
                                             float(feature_weight))
            rule_backoffs.append(dict_predicates)
        if self.__lssempred:
            from linguistics.similarity_dict_predicates import SemprePredicates
            dict_filename, feature_weight = self.__lssempred
            sempre_predicates = SemprePredicates(dict_filename,
                                                 float(feature_weight))
            rule_backoffs.append(sempre_predicates)

        # The Linker is only needed for entity/predicate linking or
        # production filtering.
        if self.__lsent or self.__lsbent or self.__lspred or self.__filter_prods:
            from qald.grounding import Linker
            linker = Linker()
        else:
            linker = None

        # Load, deduplicate and index the transduction rules.
        rules_filename = self.__input[1]
        rules = loadrules(rules_filename,
                          fmt=self.__fmtPrint,
                          num_occur=self.__numOccur)
        rules = list(set(rules))
        initial_state = GetInitialState(rules_filename, self.__fmtPrint)
        transducer = xT(initial_state, rules, rule_backoffs)
        transducer.linker = linker

        cvt_inserter = None
        if self.__insert_cvts:
            from decoder.decode_qa_utils import CVTInserter
            cvt_inserter = CVTInserter(cache_filename='.cvt_cache')

        if self.__filter_prods:
            from lm.lm_qald_cohesion import ProductionFilter
            prod_filter = ProductionFilter(linker, self.__cores)
        else:
            prod_filter = None

        # NOTE(review): if self.__model is neither 'perceptron_avg' nor
        # 'perceptron', `model` is never bound and the next line raises
        # NameError — presumably option parsing guarantees one of the two.
        if self.__model == 'perceptron_avg':
            from training.train_perceptron_avg import AveragedPerceptronModel
            model = AveragedPerceptronModel(prod_filter,
                                            cvt_inserter=cvt_inserter)
        elif self.__model == 'perceptron':
            from training.train_perceptron import PerceptronModel
            model = PerceptronModel(prod_filter, cvt_inserter=cvt_inserter)
        model.max_iterations = self.__kMaxIterations
        model.learning_rate = self.__learning_rate

        # For QA, "correct" means both trees retrieve the same non-empty
        # answers; otherwise it is exact string equality.
        query_manager = None
        if self.__task == 'qa':
            from qald.sparql_utils import QueryManager
            query_manager = QueryManager()
            from decoder.decode_qa_utils import QueryLambdaDCSC

            def AreAnswersEqual(src_tree, trg_tree):
                src_results = QueryLambdaDCSC(src_tree)
                trg_results = QueryLambdaDCSC(trg_tree)
                return src_results == trg_results and src_results is not None

            model.trg_equals_gold = AreAnswersEqual
            from decoder.decode_qa_utils import GetBestValidDerivations
            model.GetBestValidDerivations = GetBestValidDerivations
        else:

            def AreTreesEqual(src_tree, trg_tree):
                assert IsString(src_tree) and IsString(trg_tree)
                return src_tree == trg_tree

            model.trg_equals_gold = AreTreesEqual
        model.query_manager = query_manager
        model.augment_wrtgs = self.__augment_wrtgs

        try:
            model.train(transducer,
                        corpus,
                        feat_inst=feat_inst,
                        ncores=self.__cores)
        except KeyboardInterrupt:
            sys.exit(1)
        finally:
            # Persist whatever was learned and release every resource,
            # even on interrupt or failure.
            model.save(self.__output)
            if feat_inst:
                feat_inst.Close()
            for rule_backoff in rule_backoffs:
                rule_backoff.Close()
            if cvt_inserter:
                cvt_inserter.close()
            if linker:
                linker.close()
            if query_manager:
                query_manager.close()
Esempio n. 5
0
    def test_PerceptronExampleArticle(self):
        """Structured-perceptron example: features 8, 12 and 18 start with
        inflated weight 2.0; one training iteration at learning rate 0.1
        must reduce feat 18 to 1.9, and ten iterations must let the
        transducer produce the gold target tree.
        """
        rule0 = XTRule(self.S, tree_or_string('(A ?x0| ?x1|)'),
                       tree_or_string('(A (R ?x1| ?x0|) (S X))'), {
                           (0, 0): self.S,
                           (0, 1): self.S
                       }, 1.0)
        rule1 = XTRule(self.S, tree_or_string('(B ?x0| ?x1|)'),
                       tree_or_string('U'), {}, 1.0)
        rule2 = XTRule(self.S, tree_or_string('(C ?x0| ?x1|)'),
                       tree_or_string('(T ?x0| ?x1|)'), {
                           (0, ): self.S,
                           (1, ): self.S
                       }, 1.0)
        rule3 = XTRule(self.S, tree_or_string('(C ?x0| ?x1|)'),
                       tree_or_string('(T ?x1| ?x0|)'), {
                           (0, ): self.S,
                           (1, ): self.S
                       }, 1.0)
        rule4 = XTRule(self.S, tree_or_string('F'), tree_or_string('V'), {},
                       1.0)
        rule5 = XTRule(self.S, tree_or_string('F'), tree_or_string('W'), {},
                       1.0)
        rule6 = XTRule(self.S, tree_or_string('G'), tree_or_string('V'), {},
                       1.0)
        rule7 = XTRule(self.S, tree_or_string('G'), tree_or_string('W'), {},
                       1.0)
        rule8 = XTRule(
            self.S,
            tree_or_string('G'),  # This rule does not apply.
            tree_or_string('Z'),
            {},
            1.0)
        rules = [rule0, rule1, rule2, rule3, rule4, rule5, rule6, rule7, rule8]
        # Each rule i fires features i and i+10 with value 1.0.
        rules[0].features = [(0, 1.0), (10, 1.0)]
        rules[1].features = [(1, 1.0), (11, 1.0)]
        rules[2].features = [(2, 1.0), (12, 1.0)]
        rules[3].features = [(3, 1.0), (13, 1.0)]
        rules[4].features = [(4, 1.0), (14, 1.0)]
        rules[5].features = [(5, 1.0), (15, 1.0)]
        rules[6].features = [(6, 1.0), (16, 1.0)]
        rules[7].features = [(7, 1.0), (17, 1.0)]
        rules[8].features = [(8, 1.0), (18, 1.0)]
        # Features 8, 12 and 18 are deliberately over-weighted so that the
        # wrong derivation initially wins.
        feat_weights = {
            0: 1.0,
            1: 1.0,
            2: 1.0,
            3: 1.0,
            4: 1.0,
            5: 1.0,
            6: 1.0,
            7: 1.0,
            8: 2.0,
            9: 1.0,
            10: 1.0,
            11: 1.0,
            12: 2.0,
            13: 1.0,
            14: 1.0,
            15: 1.0,
            16: 1.0,
            17: 1.0,
            18: 2.0
        }
        transducer = xT(self.S, rules)
        input1 = '(A (B D E) (C F G))'
        output1 = '(A (R (T V W) U) (S X))'
        pair_weight = 1.0
        corpus = [(input1, output1, pair_weight)]

        perceptron_model = self.model()
        perceptron_model.max_iterations = 1
        # Fixed: was 'learing_rate', a typo that silently set an unused
        # attribute instead of the model's actual learning rate.
        perceptron_model.learning_rate = .1
        perceptron_model.feat_weights = copy.deepcopy(feat_weights)
        perceptron_model.train(transducer, corpus)

        # One update at rate .1 lowers feat 18 from 2.0 to 1.9.  The
        # diagnostic message now shows the trained weights (the original
        # printed the untouched template, which could never explain a
        # failure).
        self.assertEqual(
            perceptron_model.feat_weights[18], 1.9,
            'feat_weights: {0}'.format(perceptron_model.feat_weights))

        # Check that, at first, for non-estimated feature weights, the transducer
        # produces a grammar that does not obtain the desired target tree. However,
        # when running the structured perceptron to estimate the feature weights,
        # the transducer produces a grammar that obtains the desired target tree.
        transducer = xT(self.S, rules)
        wrtg = transducer.Transduce(tree_or_string(input1))
        perceptron_model.weight_wrtg(wrtg)
        # NOTE(review): .next() is the Python-2 iterator protocol; under
        # Python 3 this would be next(wrtg.GenerateTrees()) — confirm the
        # interpreter version before changing.
        trg_tree = wrtg.GenerateTrees().next()[0]
        self.assertNotEqual(repr(trg_tree), output1)

        transducer = xT(self.S, rules)
        perceptron_model = self.model()
        perceptron_model.max_iterations = 10
        # Fixed: same 'learing_rate' typo as above.
        perceptron_model.learning_rate = .1
        perceptron_model.feat_weights = copy.deepcopy(feat_weights)
        perceptron_model.train(transducer, corpus)

        wrtg = transducer.Transduce(tree_or_string(input1))
        perceptron_model.weight_wrtg(wrtg)
        trg_tree = wrtg.GenerateTrees().next()[0]
        self.assertEqual(repr(trg_tree), output1)
Esempio n. 6
0
    def run(self):
        """Decode input trees with a trained transducer model.

        Builds a transducer with the configured back-offs, loads a trained
        perceptron model, obtains a wRTG per input tree, weights it with
        the model, and prints the n-best results as JSON.

        The globals below are (re)bound here so that worker processes
        spawned for parallel decoding can see them.
        """
        global feat_inst
        global lm
        global rule_backoffs
        global timeout
        global nbest
        global cvt_inserter
        global linker
        global query_manager

        query_manager = QueryManager()

        timeout = self.__timeout
        nbest = self.__nbest

        # Build transducer with rule back-offs.  Each CLI option appends
        # one back-off; imports are deferred so unused dependencies are
        # never loaded.
        rules_filename = self.__input[1]
        if self.__lsdictent:
            from linguistics.similarity_dict_entities import DictEntities
            dict_filename, feature_weight = self.__lsdictent
            rule_backoffs.append(
                DictEntities(dict_filename, float(feature_weight)))
        if self.__lsdictbent:
            from linguistics.similarity_dict_entities import DictBridgeEntities
            dict_filename, feature_weight = self.__lsdictbent
            rule_backoffs.append(
                DictBridgeEntities(dict_filename, float(feature_weight)))
        if self.__lsdictpred:
            from linguistics.similarity_dict_entities import DictPredicates
            dict_filename, feature_weight = self.__lsdictpred
            dict_predicates = DictPredicates(dict_filename,
                                             float(feature_weight))
            rule_backoffs.append(dict_predicates)

        if self.__lssempred:
            from linguistics.similarity_dict_predicates import SemprePredicates
            dict_filename, feature_weight = self.__lssempred
            sempre_predicates = SemprePredicates(dict_filename,
                                                 float(feature_weight))
            rule_backoffs.append(sempre_predicates)

        # The Linker is only needed for entity/predicate linking or
        # production filtering; otherwise the module-level default stands.
        if self.__lsent or self.__lsbent or self.__lspred or self.__filter_prods:
            from qald.grounding import Linker
            linker = Linker()

        rules = loadrules(rules_filename, fmt=self.__fmtPrint)
        initial_state = GetInitialState(rules_filename, self.__fmtPrint)
        transducer = xT(initial_state, list(set(rules)), rule_backoffs)
        transducer.linker = linker

        # TODO: What about passing the feature instantiator as a parameter
        # to the model?
        # Build model and set its parameters.
        # NOTE(review): if the model name is neither 'perceptron' nor
        # 'perceptron_avg', model_cls stays unbound and the later uses
        # raise NameError — presumably option parsing prevents that.
        model, params_filename = self.__model
        if model == 'perceptron':
            from training.train_perceptron import PerceptronModel
            model = PerceptronModel()
            model_cls = PerceptronModel
        elif model == 'perceptron_avg':
            from training.train_perceptron_avg import AveragedPerceptronModel
            model = AveragedPerceptronModel()
            model_cls = AveragedPerceptronModel
        model.load(params_filename)

        # Load previously produced feature descriptions filename, so that
        # the same feature IDs are assigned to the same feature names.
        feat_inst = FeatureInstantiator(description_filename=self.__feat_inst,
                                        feat_names_filename=self.__feat_names)

        # Load a type-checking structured language model (if requested).
        if self.__lm:
            from lm.lm_qald import GetLMScoreOfDerivations, TypeCheckLM
            lm = TypeCheckLM(cache_filename='.lm_cache')
            lm_scoring_func = \
              lambda score, state: GetLMScoreOfDerivations(lm, score, state)
        else:
            # No LM: scoring is the identity on the model score.
            lm_scoring_func = lambda score, state: score

        if self.__insert_cvts:
            from decoder.decode_qa_utils import CVTInserter
            cvt_inserter = CVTInserter(cache_filename='.cvt_cache')

        # Read input trees and make corpus of triplets (intree, outtree, weight).
        # Target trees are None: this is decoding, not training.
        sentences_filename = self.__input[0]
        with codecs.open(sentences_filename, 'r', 'utf-8') as finput:
            intrees_str = [intree_str for intree_str in finput]
        corpus_src = [(s, None, 1.0) for s in intrees_str]
        # Obtain their wRTGs.
        if self.__augment_wrtgs:
            wrtgs, weighted_tree_pairs = ObtainWRTGsAugmented(
                corpus_src,
                transducer,
                feat_inst,
                model_cls,
                ncores=self.__cores)
        else:
            wrtgs, weighted_tree_pairs = ObtainWRTGs(corpus_src,
                                                     transducer,
                                                     feat_inst,
                                                     model_cls,
                                                     ncores=self.__cores)
        feat_inst.sync_id2desc()
        # Remove productions whose predicates do not link to any entity in the grammar.
        # from pudb import set_trace; set_trace()
        if self.__filter_prods:
            from lm.lm_qald_cohesion import ProductionFilter
            prod_filter = ProductionFilter(linker, self.__cores)
            wrtgs = prod_filter.filter_prods_from_wrtgs(wrtgs, corpus_src)

        # Weight rules of wRTGs according to the statistical model.
        # Entries can be None when the transducer failed on an input.
        for wrtg in wrtgs:
            if wrtg is not None:
                model.weight_wrtg(wrtg)

        if self.__debug:
            # Expose the instantiator on each wRTG so feature IDs can be
            # rendered as names in debug output.
            for wrtg in wrtgs:
                if wrtg is not None:
                    wrtg.feat_inst = feat_inst

        outputs = ProduceResultsFromTrees(wrtgs,
                                          intrees_str,
                                          ncores=self.__cores)
        print(dumps(outputs, indent=2))
        return