def no_test_tonto(self): taggroups=self.taggroups bilphrasesSet=ruleLearningLib.AlignmentTemplateSet(taggroups) originalATList=list() numAt=0 print >> sys.stderr, "Reading ALignment Templates/ Bilingual Phrases...." for line in sys.stdin: numAt+=1 line=line.decode('utf-8').strip() at = ruleLearningLib.AlignmentTemplate() piecesOfline=line.split(u'|') textat=u'|'.join(piecesOfline[1:5]) freq=piecesOfline[0].strip() sllemmastext=piecesOfline[5].strip() tllemmastext=piecesOfline[6].strip() sllemmas=sllemmastext.split(u'\t') tllemmas=tllemmastext.split(u'\t') at.parse(textat) at.freq=int(freq) tl_lemmas_from_dictionary_text=piecesOfline[7].strip() tl_lemmas_from_dictionary_list=tl_lemmas_from_dictionary_text.split(u'\t') bilphrase=copy.deepcopy(at) bilphrase.tl_lemmas_from_dictionary=tl_lemmas_from_dictionary_list bilphrase.lexicalise_all(sllemmas,tllemmas) bilphrase.id=numAt bilphrasesSet.add(bilphrase) originalATList.append((at,sllemmas,tllemmas,tl_lemmas_from_dictionary_list)) #print bilphrase.tl_lemmas_from_dictionary print >> sys.stderr, " ....."+str(len(originalATList))+" items." solution=generaliseATs.generalise_by_linear_program(bilphrasesSet,originalATList,taggroups) for at in solution: print at
import argparse import ruleLearningLib import sys,gzip,math ENCODING='utf-8' if __name__ == "__main__": parser = argparse.ArgumentParser(description='Filter TT1 alignment templates and keep only those with the lexical categories from a TT2.0 solution') parser.add_argument('--allowed_boxes') args = parser.parse_args(sys.argv[1:]) allowedseqs=set() for line in open(args.allowed_boxes): line=line.decode(ENCODING).strip() cats=line.split(u"__") allowedseqs.add(tuple(cats)) for line in sys.stdin: line=line.decode(ENCODING).strip() at = ruleLearningLib.AlignmentTemplate() at.parse(line) slseq=tuple([ l.get_pos() for l in at.parsed_sl_lexforms ]) if slseq in allowedseqs: print line.encode(ENCODING)
# Verbose diagnostics on stderr when True.
DEBUG = False

parser = argparse.ArgumentParser(
    description='Chooses alignment templates.')
parser.add_argument('--alignment_template', required=True)
parser.add_argument('--tag_groups_file_name', required=True)
parser.add_argument('--emptyrestrictionsmatcheverything', action='store_true')
parser.add_argument('--tt1_beam', action='store_true')
args = parser.parse_args(sys.argv[1:])

# Second argument None: no tag-sequences file is supplied here.
ruleLearningLib.AT_LexicalTagsProcessor.initialize(
    args.tag_groups_file_name, None)

#parse AT
# The alignment template to check is passed on the command line.
myAT = ruleLearningLib.AlignmentTemplate()
myAT.parse(args.alignment_template.decode('utf-8'))
if DEBUG:
    print >> sys.stderr, "AT: " + myAT.to_string()

# Each stdin line is a '|'-separated record whose first field is dropped
# before parsing (presumably a frequency/count prefix — TODO confirm
# against the producer of this input).
for line in sys.stdin:
    line = line.strip().decode('utf-8')
    bilphrase = ruleLearningLib.AlignmentTemplate()
    if not args.tt1_beam:
        bilphrase.parse(u'|'.join(line.split(u'|')[1:]))
    else:
        # NOTE(review): the extra True flag changes parse behaviour in
        # tt1_beam mode — semantics defined in AlignmentTemplate.parse.
        bilphrase.parse(u'|'.join(line.split(u'|')[1:]), True)
    if not args.tt1_beam:
        bilphrase.add_explicit_restrictions()
    if DEBUG:
        print >> sys.stderr, "Checking: " + bilphrase.to_string()
def setUp(self):
    """Build the alignment-template fixtures shared by the tests.

    atN are (partially) generalised alignment templates; bilN are fully
    lexicalised bilingual phrases paired with the atN of the same number.
    The parse strings have four '|'-separated fields: SL side, TL side,
    word alignments, and restriction tags (field meaning inferred from
    the other scripts in this project — confirm in AlignmentTemplate.parse).
    """
    self.at1 = ruleLearningLib.AlignmentTemplate()
    self.at2 = ruleLearningLib.AlignmentTemplate()
    self.at3 = ruleLearningLib.AlignmentTemplate()
    self.at4 = ruleLearningLib.AlignmentTemplate()
    self.at5 = ruleLearningLib.AlignmentTemplate()
    self.at6 = ruleLearningLib.AlignmentTemplate()
    self.at7 = ruleLearningLib.AlignmentTemplate()
    self.at8 = ruleLearningLib.AlignmentTemplate()
    self.at9 = ruleLearningLib.AlignmentTemplate()
    self.at10 = ruleLearningLib.AlignmentTemplate()
    self.at11 = ruleLearningLib.AlignmentTemplate()
    self.at12 = ruleLearningLib.AlignmentTemplate()
    self.at13 = ruleLearningLib.AlignmentTemplate()
    self.at14 = ruleLearningLib.AlignmentTemplate()
    self.at15 = ruleLearningLib.AlignmentTemplate()
    self.at16 = ruleLearningLib.AlignmentTemplate()
    self.at17 = ruleLearningLib.AlignmentTemplate()
    self.at18 = ruleLearningLib.AlignmentTemplate()
    self.bil18 = ruleLearningLib.AlignmentTemplate()
    self.at19 = ruleLearningLib.AlignmentTemplate()
    self.bil19 = ruleLearningLib.AlignmentTemplate()
    self.at20 = ruleLearningLib.AlignmentTemplate()
    self.bil20 = ruleLearningLib.AlignmentTemplate()
    # Templates with wildcard attributes (*gender, *numberat) are the
    # generalised forms; ')NNN' tags are reference markers on the TL side.
    self.at1.parse(u"<det><ind><m><sg> <n><m><sg> | <det><ind><f><pl> <n><f><pl> | 0:0 1:1 | <det> <n><f><pl>")
    self.at2.parse(u"<det><def><m><sg> dinero<n><m><sg> | <det><def><m><pl> diner<n><m><pl> | 0:0 1:1 | <det><def> <n><m><pl>")
    self.at3.parse(u"<det><def><m><sg> <n><m><sg> | <det><def><f><pl> <n><f><pl> | 0:0 1:1 | <det><def> <n><f><pl>")
    self.at4.parse(u"<det><ind><*gender><*numberat> <n><*gender><*numberat> | <det><ind><*gender><*numberat> <n><*gender><*numberat> | 0:0 1:1 | <det> <n>")
    self.at5.parse(u"<det><def><*gender><*numberat> <n><*gender><*numberat> | <det><def><*gender><*numberat> <n><*gender><*numberat> | 0:0 1:1 | <det><def> <n>")
    self.at6.parse(u"<vbser><ifi><p3><sg> <vblex><pp><m><sg> | anar<vaux><p3><sg> <vbser><inf> <vblex><pp><m><sg> | 0:1 1:2 | <vbser> <vblex>")
    self.at7.parse(u"<det><def><m><*numberat> <n><m><*numberat> | <det><def><m><*numberat> <n><m><*numberat> | 0:0 1:1 | <det><def> <n>")
    self.at8.parse(u"<det><def><m><*numberat> <n><f><*numberat> | <det><def><m><*numberat> <n><m><*numberat> | 0:0 1:1 | <det><def> <n>")
    self.at9.parse(u"<det><*determinertype><*gender><*numberat> <n><*gender><*numberat> | <det><*determinertype><*gender><*numberat> <n><*gender><*numberat> | 0:0 1:1 | <det> <n>")
    self.at10.parse(u"<det><def><f><sg> <n><f><sg> | <det><def><f><pl> <n><f><pl> | 0:0 1:1 | <det><def> <n><f><pl>")
    self.at13.parse(u"el<det><def><m><*numberat> <n><f><*numberat> | <det><def><m><*numberat> <n><m><*numberat> | 0:0 1:1 | <det><def> <n>")
    self.at14.parse(u"el<det><def><*gender><*numberat> <n><f><sg> | <det><def><m><*numberat> <n><m><*numberat> | 0:0 1:1 | <det><def> <n>")
    self.at15.parse(u"el<det><def><*gender><*numberat> <n><m><sg> | <det><def><m><*numberat> <n><m><*numberat> | 0:0 1:1 | <det><def> <n>")
    self.at16.parse(u"<det><*determinertype><*gender><*numberat> <n><*gender><*numberat> | <det><)000determinertype><)001gender><)001numberat> <n><)001gender><)001numberat> | 0:0 1:1 | <det> <n>")
    self.at17.parse(u"<det><ind><m><sg> <n><m><sg> | <det><ind><f><pl> <n><f><pl> | 0:0 1:1 | <det> <n><f><pl>")
    self.at11.parse(u"<vbser><ifi><p3><sg> <vblex><pp><m><sg> | anar<vaux><p3><sg> <vbser><inf> <vblex><pp><m><sg> | 0:1 1:2 | <vbser> <vblex>")
    self.at12.parse(u"suyo<det><pos><mf><sp> <n><empty_tag_ntype><f><sg> <pr> | el<det><def><f><sp> <n><empty_tag_ntype><f><sg> <pr> | 0:0 1:1 2:2 | __CLOSEWORD__ <n> <pr>")
    self.at18.parse(u"<det><def><f><*numberat> <n><empty_tag_ntype><f><*numberat> | <det><def><f><)000numberat> <n><empty_tag_ntype><f><)001numberat> | 0:0 1:1 | <det><def><f><pl> <n>")
    self.bil18.parse(u"el<det><def><f><pl> casa<n><empty_tag_ntype><f><pl> | el<det><def><f><pl> casa<n><empty_tag_ntype><f><pl> | 0:0 1:1 | <det><def> <n>")
    self.bil18.tl_lemmas_from_dictionary=[u"el",u"casa"]
    self.at19.parse(u"el<det><def><*gender><*numberat> <n><empty_tag_ntype><*gender><*numberat> | <n><empty_tag_ntype><)001gender><)000numberat> | 0:0 1:0 | <det> <n>")
    self.bil19.parse(u"el<det><def><f><sg> casa<n><empty_tag_ntype><f><sg> | casa<n><empty_tag_ntype><f><sg> | 0:0 1:0 | <det> <n>")
    self.bil19.tl_lemmas_from_dictionary=[u"el",u"casa"]
    self.at20.parse(u"<n><empty_tag_ntype><f><*numberat> <adj><empty_tag_adjtype><f><*numberat> | <n><empty_tag_ntype><f><)000numberat> <adj><empty_tag_adjtype><f><)001numberat> | 0:0 1:1 | <n><empty_tag_ntype><f> <adj><empty_tag_adjtype><f>")
    self.bil20.parse(u"companyia<n><empty_tag_ntype><f><pl> elèctric<adj><empty_tag_adjtype><f><pl> | compañía<n><empty_tag_ntype><f><pl> eléctrico<adj><empty_tag_adjtype><f><pl> | 0:0 1:1 | <n> <adj>")
    self.bil20.tl_lemmas_from_dictionary=[u"compañía",u"eléctrico"]
    self.atlist=[self.at1,self.at2,self.at3,self.at4,self.at5]
    # FIX: use a context manager so the tag-groups file is closed even if
    # read_tag_groups raises (was a bare open()/close() pair).
    with open("taggroups", 'r') as myfile:
        self.taggroups = ruleLearningLib.read_tag_groups(myfile)
    ruleLearningLib.AT_LexicalTagsProcessor.initialize("taggroups","tagsequences")
def test_explicit_restrictions(self):
    """After add_explicit_restrictions, at1 must equal a template whose
    restriction field spells out the full tag sequences.
    """
    expected = ruleLearningLib.AlignmentTemplate()
    expected.parse(u"<det><ind><m><sg> <n><m><sg> | <det><ind><f><pl> <n><f><pl> | 0:0 1:1 | <det><ind><m><sg> <n><f><pl>")
    # Clone so the shared at1 fixture is left untouched for other tests.
    clone = self.at1.fast_clone()
    clone.add_explicit_restrictions()
    self.assertEqual(clone, expected)