def test_IdentityNeedTerminalTransduce(self):
  rule0 = XTRule(self.S, tree_or_string('(NP ?x0|JJ ?x1|NN)'),
                 tree_or_string('(NP ?x1|NN ?x0|JJ)'),
                 {(0,): self.S, (1,): self.S}, 1.0)
  rule1 = XTRule(self.S, tree_or_string('(JJ ?x0|)'),
                 tree_or_string('(JJ ?x0|)'),
                 {(0,): 't'}, 1.0)
  rule2 = XTRule(self.S, tree_or_string('(NN ?x0|)'),
                 tree_or_string('(NN ?x0|)'),
                 {(0,): 't'}, 0.8)
  rule3 = XTRule('t', tree_or_string('beautiful'),
                 tree_or_string('bonita'), {}, 1.0)
  rule4 = XTRule('t', tree_or_string('house'),
                 tree_or_string('casa'), {}, 1.0)
  rules = [rule0, rule1, rule2, rule3, rule4]
  rule_backoffs = [Identity()]
  # 'home' has no terminal rule ('house' does), so the Identity back-off
  # must supply one when transducing this tree.
  tree1 = tree_or_string(u'(NP (JJ beautiful) (NN home))')
  transducer = xT(self.S, rules, rule_backoffs)
  wrtg = transducer.Transduce(tree1)
  derivation = wrtg.ObtainBestDerivation()
  src_projection, _ = SourceProjectionFromDerivationStrict(derivation)
  expected_src_projection = tree_or_string(
    u'(NP (NN home) (JJ beautiful))')
  self.assertEqual(expected_src_projection, src_projection)
  trg_projection, _ = TargetProjectionFromDerivation(derivation)
  expected_trg_projection = tree_or_string(u'(NP (NN home) (JJ bonita))')
  self.assertEqual(expected_trg_projection, trg_projection)
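# The behavior exercised above can be pictured as the back-off synthesizing
# an identity rule on demand for uncovered leaves. The sketch below
# illustrates that idea only; it is not the actual Identity implementation,
# and make_identity_rule is a hypothetical helper.
def make_identity_rule(state, leaf):
  # Map an uncovered source leaf to itself, with no child states.
  return XTRule(state, tree_or_string(leaf), tree_or_string(leaf), {}, 1.0)

# e.g. make_identity_rule('t', 'home') stands in for the missing 'home' rule.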
def setUp(self):
  self.S = 'q0'
  rule0 = XTRule(self.S, tree_or_string('(A ?x0|)'),
                 tree_or_string('(O ?x0|)'),
                 {(0,): self.S}, 1.0)
  rule1 = XTRule(self.S, tree_or_string('(A ?x0|)'),
                 tree_or_string('(P ?x0|)'),
                 {(0,): self.S}, 1.0)
  rule2 = XTRule(self.S, tree_or_string('a'), tree_or_string('o'), {}, 1.0)
  self.rules = [rule0, rule1, rule2]
  self.transducer = xT(self.S, self.rules)
  self.model = PerceptronModel
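# The two A-rules above make the fixture transducer deliberately
# nondeterministic: '(A a)' derives both '(O o)' and '(P o)'. Below is a
# sketch of a test that enumerates both outputs; it is not part of the
# original suite, and it assumes (as the tests below do) that repr() of a
# generated tree yields its s-expression.
def test_NondeterministicTransduceSketch(self):
  wrtg = self.transducer.Transduce(tree_or_string('(A a)'))
  trees = [repr(t[0]) for t in wrtg.GenerateTrees()]
  self.assertIn('(O o)', trees)
  self.assertIn('(P o)', trees)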
def ObtainWRTGAugmented(weighted_tree_pair):
  """
  Given a weighted source/target tree pair, returns a tuple that contains
  the wRTG and the weighted pair. If the transducer fails to explain the
  source/target pair with the rules it has, it returns (None, None).
  The weights of the RTG are not estimated here.
  Global variables used here (bad practice, but needed for parallelization):
    * transducer
    * feat_inst
    * model_class
    * GetScoreOfDerivation
    * CombineScoresOfDerivations
  """
  global transducer
  intree_str, outtree_str, pair_weight = weighted_tree_pair
  # First pass: build a wRTG from the source tree alone.
  wrtg = ObtainWRTG((intree_str, None, pair_weight), print_result=False)[0]
  if not wrtg or not wrtg.P:
    output = (None, None)
    result_str = 'X'
  else:
    # Augment the rule set with predicate rules for the linked entities,
    # then re-run the transduction constrained by the target tree.
    productions = AddAllPredicatesForEntitiesFromProds(
      wrtg.P, transducer.linker)
    rules = list(set(p.rhs.rule for p in productions))
    rules_augmented = list(set(transducer.rules[:] + rules))
    transducer_aug = xT(
      transducer.start_state, rules_augmented,
      transducer.rule_index.rule_backoffs)
    # Temporarily swap in the augmented transducer for the second pass.
    transducer_back = transducer
    transducer = transducer_aug
    wrtg = ObtainWRTG(
      (intree_str, outtree_str, pair_weight), print_result=False)[0]
    transducer = transducer_back
    if not wrtg or not wrtg.P:
      output = (None, None)
      result_str = 'x'
    else:
      wrtg.ScoreDerivation = GetScoreOfDerivation
      wrtg.CombineDerivationScores = CombineScoresOfDerivations
      if feat_inst:
        feat_inst.SetContext({'src_tree': intree_str})
        model_class.populate_wrtg_feats(wrtg, feat_inst)
      output = (wrtg, weighted_tree_pair)
      result_str = 'o'
  sys.stdout.flush()
  # Progress marker: upper-case when decoding without a gold target tree.
  result_str = result_str if outtree_str is not None else result_str.upper()
  print(result_str, end='', file=sys.stderr)
  return output
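# The globals above are typically populated in each worker via a pool
# initializer. Below is a minimal sketch of that pattern, assuming the
# workers import this module; _init_worker and
# obtain_wrtgs_augmented_parallel are hypothetical names, not this repo's API.
import multiprocessing

def _init_worker(shared_transducer, shared_feat_inst, shared_model_class):
  # Each worker process gets its own copy of the module-level globals.
  global transducer, feat_inst, model_class
  transducer = shared_transducer
  feat_inst = shared_feat_inst
  model_class = shared_model_class

def obtain_wrtgs_augmented_parallel(corpus, shared_transducer,
                                    shared_feat_inst, shared_model_class,
                                    ncores=4):
  # Each corpus item is an (intree_str, outtree_str, pair_weight) triplet.
  pool = multiprocessing.Pool(
    processes=ncores, initializer=_init_worker,
    initargs=(shared_transducer, shared_feat_inst, shared_model_class))
  try:
    return pool.map(ObtainWRTGAugmented, corpus)
  finally:
    pool.close()
    pool.join()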
def run(self):
  # Build feature instantiator.
  descriptions_filename = self.__feat_inst
  feat_inst = FeatureInstantiator(
    descriptions_filename,
    feat_names_filename=self.__feat_names_filename)
  # The corpus is a list of [tree, string] pairs.
  corpus_filename = self.__input[0]
  corpus = LoadCorpus(corpus_filename)
  # The algorithm for training tree transducers expects a triplet:
  # source tree, target tree, weight (of the pair).
  corpus = [(src_tree, trg_tree, 0.5) for (src_tree, trg_tree) in corpus]
  # Build transducer with back-off cost functions.
  rule_backoffs = []
  if self.__lsdictent:
    from linguistics.similarity_dict_entities import DictEntities
    dict_filename, feature_weight = self.__lsdictent
    rule_backoffs.append(DictEntities(dict_filename, float(feature_weight)))
  if self.__lsdictbent:
    from linguistics.similarity_dict_entities import DictBridgeEntities
    dict_filename, feature_weight = self.__lsdictbent
    rule_backoffs.append(
      DictBridgeEntities(dict_filename, float(feature_weight)))
  if self.__lsdictpred:
    from linguistics.similarity_dict_entities import DictPredicates
    dict_filename, feature_weight = self.__lsdictpred
    rule_backoffs.append(DictPredicates(dict_filename, float(feature_weight)))
  if self.__lssempred:
    from linguistics.similarity_dict_predicates import SemprePredicates
    dict_filename, feature_weight = self.__lssempred
    rule_backoffs.append(
      SemprePredicates(dict_filename, float(feature_weight)))
  if self.__lsent or self.__lsbent or self.__lspred or self.__filter_prods:
    from qald.grounding import Linker
    linker = Linker()
  else:
    linker = None
  rules_filename = self.__input[1]
  rules = loadrules(rules_filename, fmt=self.__fmtPrint,
                    num_occur=self.__numOccur)
  rules = list(set(rules))
  initial_state = GetInitialState(rules_filename, self.__fmtPrint)
  transducer = xT(initial_state, rules, rule_backoffs)
  transducer.linker = linker
  cvt_inserter = None
  if self.__insert_cvts:
    from decoder.decode_qa_utils import CVTInserter
    cvt_inserter = CVTInserter(cache_filename='.cvt_cache')
  if self.__filter_prods:
    from lm.lm_qald_cohesion import ProductionFilter
    prod_filter = ProductionFilter(linker, self.__cores)
  else:
    prod_filter = None
  if self.__model == 'perceptron_avg':
    from training.train_perceptron_avg import AveragedPerceptronModel
    model = AveragedPerceptronModel(prod_filter, cvt_inserter=cvt_inserter)
  elif self.__model == 'perceptron':
    from training.train_perceptron import PerceptronModel
    model = PerceptronModel(prod_filter, cvt_inserter=cvt_inserter)
  model.max_iterations = self.__kMaxIterations
  model.learning_rate = self.__learning_rate
  query_manager = None
  if self.__task == 'qa':
    from qald.sparql_utils import QueryManager
    query_manager = QueryManager()
    from decoder.decode_qa_utils import QueryLambdaDCSC

    def AreAnswersEqual(src_tree, trg_tree):
      src_results = QueryLambdaDCSC(src_tree)
      trg_results = QueryLambdaDCSC(trg_tree)
      return src_results == trg_results and src_results is not None

    model.trg_equals_gold = AreAnswersEqual
    from decoder.decode_qa_utils import GetBestValidDerivations
    model.GetBestValidDerivations = GetBestValidDerivations
  else:

    def AreTreesEqual(src_tree, trg_tree):
      assert IsString(src_tree) and IsString(trg_tree)
      return src_tree == trg_tree

    model.trg_equals_gold = AreTreesEqual
  model.query_manager = query_manager
  model.augment_wrtgs = self.__augment_wrtgs
  try:
    model.train(transducer, corpus, feat_inst=feat_inst, ncores=self.__cores)
  except KeyboardInterrupt:
    sys.exit(1)
  finally:
    # Persist the model and release external resources, even on interrupt.
    model.save(self.__output)
    if feat_inst:
      feat_inst.Close()
    for rule_backoff in rule_backoffs:
      rule_backoff.Close()
    if cvt_inserter:
      cvt_inserter.close()
    if linker:
      linker.close()
    if query_manager:
      query_manager.close()
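# The finally-ladder above is correct but repetitive. Below is a sketch of
# the same cleanup using contextlib.ExitStack, assuming each resource keeps
# its current Close()/close() method; close_all is a hypothetical helper,
# not part of this repo.
import contextlib

def close_all(*resources):
  # Close every non-None resource; all closers run even if one raises.
  with contextlib.ExitStack() as stack:
    for res in resources:
      if res is None:
        continue
      closer = getattr(res, 'Close', None) or getattr(res, 'close')
      stack.callback(closer)

# Usage would replace the if-ladder in the finally block, e.g.:
# close_all(feat_inst, cvt_inserter, linker, query_manager, *rule_backoffs)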
def test_PerceptronExampleArticle(self):
  rule0 = XTRule(self.S, tree_or_string('(A ?x0| ?x1|)'),
                 tree_or_string('(A (R ?x1| ?x0|) (S X))'),
                 {(0, 0): self.S, (0, 1): self.S}, 1.0)
  rule1 = XTRule(self.S, tree_or_string('(B ?x0| ?x1|)'),
                 tree_or_string('U'), {}, 1.0)
  rule2 = XTRule(self.S, tree_or_string('(C ?x0| ?x1|)'),
                 tree_or_string('(T ?x0| ?x1|)'),
                 {(0,): self.S, (1,): self.S}, 1.0)
  rule3 = XTRule(self.S, tree_or_string('(C ?x0| ?x1|)'),
                 tree_or_string('(T ?x1| ?x0|)'),
                 {(0,): self.S, (1,): self.S}, 1.0)
  rule4 = XTRule(self.S, tree_or_string('F'), tree_or_string('V'), {}, 1.0)
  rule5 = XTRule(self.S, tree_or_string('F'), tree_or_string('W'), {}, 1.0)
  rule6 = XTRule(self.S, tree_or_string('G'), tree_or_string('V'), {}, 1.0)
  rule7 = XTRule(self.S, tree_or_string('G'), tree_or_string('W'), {}, 1.0)
  rule8 = XTRule(self.S, tree_or_string('G'),
                 tree_or_string('Z'),  # This rule does not apply.
                 {}, 1.0)
  rules = [rule0, rule1, rule2, rule3, rule4, rule5, rule6, rule7, rule8]
  # Rule i fires features i and i + 10, each with value 1.0.
  for i, rule in enumerate(rules):
    rule.features = [(i, 1.0), (i + 10, 1.0)]
  # All feature weights start at 1.0, except 8, 12 and 18, which start at 2.0.
  feat_weights = {i: 1.0 for i in range(19)}
  feat_weights.update({8: 2.0, 12: 2.0, 18: 2.0})
  transducer = xT(self.S, rules)
  input1 = '(A (B D E) (C F G))'
  output1 = '(A (R (T V W) U) (S X))'
  pair_weight = 1.0
  corpus = [(input1, output1, pair_weight)]
  perceptron_model = self.model()
  perceptron_model.max_iterations = 1
  perceptron_model.learning_rate = .1
  perceptron_model.feat_weights = copy.deepcopy(feat_weights)
  perceptron_model.train(transducer, corpus)
  # One epoch with learning rate 0.1 demotes feature 18 (fired by rule8,
  # whose output Z does not appear in the gold tree) from 2.0 to 1.9.
  self.assertEqual(perceptron_model.feat_weights[18], 1.9,
                   'feat_weights: {0}'.format(feat_weights))
  # Check that, after only one training epoch, the weighted grammar's best
  # tree is still not the desired target tree. After re-training with ten
  # epochs of the structured perceptron, the best tree matches the target.
  transducer = xT(self.S, rules)
  wrtg = transducer.Transduce(tree_or_string(input1))
  perceptron_model.weight_wrtg(wrtg)
  trg_tree = wrtg.GenerateTrees().next()[0]
  self.assertNotEqual(repr(trg_tree), output1)
  transducer = xT(self.S, rules)
  perceptron_model = self.model()
  perceptron_model.max_iterations = 10
  perceptron_model.learning_rate = .1
  perceptron_model.feat_weights = copy.deepcopy(feat_weights)
  perceptron_model.train(transducer, corpus)
  wrtg = transducer.Transduce(tree_or_string(input1))
  perceptron_model.weight_wrtg(wrtg)
  trg_tree = wrtg.GenerateTrees().next()[0]
  self.assertEqual(repr(trg_tree), output1)
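# The 2.0 -> 1.9 check above follows the standard structured-perceptron
# update, w <- w + eta * (phi(gold) - phi(predicted)). Below is a
# self-contained sketch of that rule; perceptron_update, phi_gold and
# phi_pred are illustrative names, not this repo's API.
def perceptron_update(feat_weights, phi_gold, phi_pred, learning_rate=0.1):
  # Promote features of the gold derivation, demote those of the prediction.
  for feat_id, value in phi_gold.items():
    feat_weights[feat_id] = (
      feat_weights.get(feat_id, 0.0) + learning_rate * value)
  for feat_id, value in phi_pred.items():
    feat_weights[feat_id] = (
      feat_weights.get(feat_id, 0.0) - learning_rate * value)
  return feat_weights

# Feature 18 fires only in the (wrong) predicted derivation: 2.0 - 0.1 = 1.9.
assert perceptron_update({18: 2.0}, {}, {18: 1.0})[18] == 1.9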
def run(self):
  global feat_inst
  global lm
  global rule_backoffs
  global timeout
  global nbest
  global cvt_inserter
  global linker
  global query_manager
  query_manager = QueryManager()
  timeout = self.__timeout
  nbest = self.__nbest
  # Build transducer with rule back-offs.
  rules_filename = self.__input[1]
  if self.__lsdictent:
    from linguistics.similarity_dict_entities import DictEntities
    dict_filename, feature_weight = self.__lsdictent
    rule_backoffs.append(DictEntities(dict_filename, float(feature_weight)))
  if self.__lsdictbent:
    from linguistics.similarity_dict_entities import DictBridgeEntities
    dict_filename, feature_weight = self.__lsdictbent
    rule_backoffs.append(
      DictBridgeEntities(dict_filename, float(feature_weight)))
  if self.__lsdictpred:
    from linguistics.similarity_dict_entities import DictPredicates
    dict_filename, feature_weight = self.__lsdictpred
    rule_backoffs.append(DictPredicates(dict_filename, float(feature_weight)))
  if self.__lssempred:
    from linguistics.similarity_dict_predicates import SemprePredicates
    dict_filename, feature_weight = self.__lssempred
    rule_backoffs.append(
      SemprePredicates(dict_filename, float(feature_weight)))
  if self.__lsent or self.__lsbent or self.__lspred or self.__filter_prods:
    from qald.grounding import Linker
    linker = Linker()
  rules = loadrules(rules_filename, fmt=self.__fmtPrint)
  initial_state = GetInitialState(rules_filename, self.__fmtPrint)
  transducer = xT(initial_state, list(set(rules)), rule_backoffs)
  transducer.linker = linker
  # TODO: What about passing the feature instantiator as a parameter
  # to the model?
  # Build model and set its parameters.
  model, params_filename = self.__model
  if model == 'perceptron':
    from training.train_perceptron import PerceptronModel
    model = PerceptronModel()
    model_cls = PerceptronModel
  elif model == 'perceptron_avg':
    from training.train_perceptron_avg import AveragedPerceptronModel
    model = AveragedPerceptronModel()
    model_cls = AveragedPerceptronModel
  model.load(params_filename)
  # Load the previously produced feature descriptions file, so that the same
  # feature IDs are assigned to the same feature names.
  feat_inst = FeatureInstantiator(description_filename=self.__feat_inst,
                                  feat_names_filename=self.__feat_names)
  # Load a type-checking structured language model (if requested).
  if self.__lm:
    from lm.lm_qald import GetLMScoreOfDerivations, TypeCheckLM
    lm = TypeCheckLM(cache_filename='.lm_cache')
    lm_scoring_func = \
      lambda score, state: GetLMScoreOfDerivations(lm, score, state)
  else:
    lm_scoring_func = lambda score, state: score
  if self.__insert_cvts:
    from decoder.decode_qa_utils import CVTInserter
    cvt_inserter = CVTInserter(cache_filename='.cvt_cache')
  # Read input trees and build a corpus of (intree, outtree, weight) triplets.
  sentences_filename = self.__input[0]
  with codecs.open(sentences_filename, 'r', 'utf-8') as finput:
    intrees_str = [intree_str for intree_str in finput]
  corpus_src = [(s, None, 1.0) for s in intrees_str]
  # Obtain their wRTGs.
  if self.__augment_wrtgs:
    wrtgs, weighted_tree_pairs = ObtainWRTGsAugmented(
      corpus_src, transducer, feat_inst, model_cls, ncores=self.__cores)
  else:
    wrtgs, weighted_tree_pairs = ObtainWRTGs(
      corpus_src, transducer, feat_inst, model_cls, ncores=self.__cores)
  feat_inst.sync_id2desc()
  # Remove productions whose predicates do not link to any entity in the
  # grammar.
  if self.__filter_prods:
    from lm.lm_qald_cohesion import ProductionFilter
    prod_filter = ProductionFilter(linker, self.__cores)
    wrtgs = prod_filter.filter_prods_from_wrtgs(wrtgs, corpus_src)
  # Weight the rules of each wRTG according to the statistical model. Failed
  # parses are kept as None to preserve alignment with intrees_str.
  for wrtg in wrtgs:
    if wrtg is not None:
      model.weight_wrtg(wrtg)
  if self.__debug:
    for wrtg in wrtgs:
      if wrtg is not None:
        wrtg.feat_inst = feat_inst
  outputs = ProduceResultsFromTrees(wrtgs, intrees_str, ncores=self.__cores)
  print(dumps(outputs, indent=2))
  return
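# The two lm_scoring_func branches above implement a small score-rewriting
# hook: a (score, state) -> score function, presumably applied when scoring
# derivations. Below is a sketch of the pattern; make_lm_scoring_func and
# rescore are illustrative names standing in for the TypeCheckLM wiring,
# not this repo's API.
def make_lm_scoring_func(lm_model=None):
  if lm_model is None:
    # No LM requested: scores pass through unchanged.
    return lambda score, state: score
  # With an LM: rescore each derivation score given its state.
  return lambda score, state: lm_model.rescore(score, state)

# Mirrors the else-branch above:
assert make_lm_scoring_func()(0.5, 'q0') == 0.5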