# NOTE(review): this chunk has lost its original line breaks/indentation and is
# collapsed onto one line; it must be re-wrapped before it can run.  Apparent
# flow: finish loading the alternative rule list (the loop header for
# at.parse/altRuleList.add is not visible here), build ruleLists, optionally
# load a tab-separated boxes index into boxesDic, then for each stdin line
# parse a ParallelSentence and print its beam-search hypotheses joined by
# "|||" (UTF-8), reporting progress on stderr.  Beware: the inline
# "#wrong sentence" comment below comments out everything after it on this
# collapsed line.
at.parse(line) altRuleList.add(at) gfile.close() ruleLists=[ruleList,altRuleList] boxesCoverage=False boxesDic=dict() if args.final_boxes_index: for line in open(args.final_boxes_index): parts=line.split("\t") boxesDic[parts[1].strip()]=int(parts[0]) boxesCoverage=True numLine=0 for line in sys.stdin: numLine+=1 line=line.rstrip('\n').decode('utf-8') parts=line.split('|') if len(parts) > 5 and not args.tt1_beam: #wrong sentence print "" else: parallelSentence=ParallelSentence() parallelSentence.parse(line, parseTlLemmasFromDic=True,forRBPE=args.rbpe) if not args.tt1_beam: parallelSentence.add_explicit_empty_tags() finalHypotheses=parallelSentence.compute_coverages_and_bleu(ruleLists,int(args.beam_size),boxesCoverage,boxesDic,args.allow_incompatible_rules,args.tt1_beam) print u"|||".join([unicode(h) for h in finalHypotheses]).encode('utf-8') print >> sys.stderr, "finished line "+str(numLine)
# NOTE(review): collapsed chunk (line breaks/indentation lost); re-wrap before
# running.  Apparent flow: finish loading AlignmentTemplate rules into ruleList
# (the enclosing loop header is outside this chunk), then load sentences from
# args.sentences (transparently handling .gz via gzip.open), parse each line
# as a ParallelSentence (adding explicit empty tags unless args.tt1_beam),
# then optionally read the boxes index file into boxesDic, and initialise the
# per-sentence best-hypothesis bookkeeping.  Chunk ends mid-script.
line=line.strip().decode('utf-8') at=AlignmentTemplate() at.parse(line) ruleList.add(at) gfile.close() #load sentences sentences=list() if args.sentences: if args.sentences.lower().endswith('.gz'): gfile=gzip.open(args.sentences) else: gfile=open(args.sentences) for line in gfile: line=line.strip().decode('utf-8') parallelSentence=ParallelSentence() parallelSentence.parse(line, parseTlLemmasFromDic=True,forRBPE=args.rbpe) if not args.tt1_beam: parallelSentence.add_explicit_empty_tags() sentences.append(parallelSentence) gfile.close() boxesDic=dict() if args.final_boxes_index: for line in open(args.final_boxes_index): parts=line.split("\t") boxesDic[parts[1].strip()]=int(parts[0]) #read best rule application for each sentence bestHypothesisForEachSentence=list() emptyIndexes=set()
''' Created on 12/02/2014 @author: vitaka ''' #stdinput: sentences to be translated from beamSearchLib import RuleList, ParallelSentence, RuleApplicationHypothesis from ruleLearningLib import AlignmentTemplate import ruleLearningLib import argparse import gzip import sys if __name__ == "__main__": parser = argparse.ArgumentParser(description='') parser.add_argument('--tag_groups_file_name', required=True) parser.add_argument('--tag_sequences_file_name', required=True) args = parser.parse_args(sys.argv[1:]) ruleLearningLib.AT_LexicalTagsProcessor.initialize( args.tag_groups_file_name, args.tag_sequences_file_name) for line in sys.stdin: line = line.rstrip('\n').decode('utf-8') print "Parsing ..." print line.encode('utf-8') parallelSentence = ParallelSentence() parallelSentence.parse(line, parseTlLemmasFromDic=True) parallelSentence.add_explicit_empty_tags() print "Everything OK"
''' Created on 12/02/2014 @author: vitaka ''' #stdinput: sentences to be translated from beamSearchLib import RuleList, ParallelSentence, RuleApplicationHypothesis from ruleLearningLib import AlignmentTemplate import ruleLearningLib import argparse import gzip import sys if __name__ == "__main__": parser = argparse.ArgumentParser(description='') parser.add_argument('--tag_groups_file_name',required=True) parser.add_argument('--tag_sequences_file_name',required=True) args = parser.parse_args(sys.argv[1:]) ruleLearningLib.AT_LexicalTagsProcessor.initialize(args.tag_groups_file_name,args.tag_sequences_file_name) for line in sys.stdin: line=line.rstrip('\n').decode('utf-8') print "Parsing ..." print line.encode('utf-8') parallelSentence=ParallelSentence() parallelSentence.parse(line, parseTlLemmasFromDic=True) parallelSentence.add_explicit_empty_tags() print "Everything OK"
# NOTE(review): collapsed chunk (line breaks/indentation lost); re-wrap before
# running.  Apparent flow: finish loading rules into ruleList (loop header not
# visible), log progress to stderr, load sentences from args.sentences
# (.gz-aware), parse each as a ParallelSentence with explicit empty tags, then
# read the boxes index into both a forward map (boxesDic: label -> int) and an
# inverse map (boxesInvDic: int -> label).  Chunk ends mid-script.
at.parse(line) ruleList.add(at) gfile.close() print >> sys.stderr, "... done" print >> sys.stderr, "Loading sentences ..." #load sentences sentences = list() if args.sentences: if args.sentences.lower().endswith(".gz"): gfile = gzip.open(args.sentences) else: gfile = open(args.sentences) for line in gfile: line = line.strip().decode('utf-8') parallelSentence = ParallelSentence() parallelSentence.parse(line, parseTlLemmasFromDic=True, forRBPE=args.rbpe) parallelSentence.add_explicit_empty_tags() sentences.append(parallelSentence) gfile.close() print >> sys.stderr, "... done" boxesInvDic = dict() boxesDic = dict() if args.final_boxes_index: for line in open(args.final_boxes_index): parts = line.split("\t") boxesInvDic[int(parts[0])] = parts[1].strip() boxesDic[parts[1].strip()] = int(parts[0])
# NOTE(review): collapsed chunk (line breaks/indentation lost); re-wrap before
# running.  Apparent flow: set up a BilingualSequenceLexSet and a method flag
# (MYMETHOD is disabled by --advisors_method), then — presumably under the
# "extremes_variant != antiphrases" branch, whose body continues beyond this
# chunk — read gzipped sentences, parse each as a ParallelSentence, and patch
# up upstream data errors by truncating or padding parsed_restrictions (with
# synthetic "__EMPTYRESTRICTION__" entries) until its length matches
# parsed_sl_lexforms.
bslSet=ruleLearningLib.BilingualSequenceLexSet() parallelSentences=list() MYMETHOD=True if args.advisors_method: MYMETHOD=False if args.extremes_variant != "antiphrases": numLine=0 for line in gzip.open(args.sentences): numLine+=1 print >> sys.stderr, "Line "+str(numLine) line=line.rstrip('\n').decode('utf-8') #parts=line.split(u" | ") parallelSentence=ParallelSentence() #parallelSentence.parse(u" | ".join(parts[1:]), parseTlLemmasFromDic=False) parallelSentence.parse(line, parseTlLemmasFromDic=True) #ugly fix for errors in previous steps: if length of restrictions ot tl lemmas form dictionary does not #match length of sl lexical forms, add synthetic stuff differenceRestrictions=len(parallelSentence.parsed_restrictions) - len(parallelSentence.parsed_sl_lexforms) if differenceRestrictions > 0: #we have more restrictions than SL lexforms: remove additional restrictions for num in range(differenceRestrictions): parallelSentence.parsed_restrictions.pop() elif differenceRestrictions < 0: for num in range(-differenceRestrictions): newrestriction=ruleLearningLib.AT_Restriction() newrestriction.parse(u"__EMPTYRESTRICTION__") parallelSentence.parsed_restrictions.append(newrestriction)
# NOTE(review): collapsed chunk (line breaks/indentation lost); re-wrap before
# running.  Depends on ruleLists/args defined earlier in the (unseen) script.
# Apparent flow: optionally load the boxes index into boxesDic and enable
# boxes coverage, then for each stdin line either print an empty line (input
# with too many '|' fields is treated as malformed, unless in tt1_beam mode)
# or parse a ParallelSentence and print its coverage/BLEU hypotheses joined
# by "|||" (UTF-8), with per-line progress on stderr.  Beware: the inline
# "#wrong sentence" comment comments out the rest of this collapsed line.
boxesCoverage = False boxesDic = dict() if args.final_boxes_index: for line in open(args.final_boxes_index): parts = line.split("\t") boxesDic[parts[1].strip()] = int(parts[0]) boxesCoverage = True numLine = 0 for line in sys.stdin: numLine += 1 line = line.rstrip('\n').decode('utf-8') parts = line.split('|') if len(parts) > 5 and not args.tt1_beam: #wrong sentence print "" else: parallelSentence = ParallelSentence() parallelSentence.parse(line, parseTlLemmasFromDic=True, forRBPE=args.rbpe) if not args.tt1_beam: parallelSentence.add_explicit_empty_tags() finalHypotheses = parallelSentence.compute_coverages_and_bleu( ruleLists, int(args.beam_size), boxesCoverage, boxesDic, args.allow_incompatible_rules, args.tt1_beam) print u"|||".join([unicode(h) for h in finalHypotheses]).encode('utf-8') print >> sys.stderr, "finished line " + str(numLine)
''' Created on 04/12/2013 @author: vitaka ''' from beamSearchLib import ParallelSentence import sys print "digraph aligments{" prevSubgraph=None for line in sys.stdin: line=line.rstrip('\n').decode('utf-8') #parts=line.split(u" | ") parallelSentence=ParallelSentence() #parallelSentence.parse(u" | ".join(parts[1:]), parseTlLemmasFromDic=False) parallelSentence.parse(line, parseTlLemmasFromDic=False) parallelSentence.extract_antiphrases() for antiphrase in parallelSentence.antiphrases: print >> sys.stderr, antiphrase content,subgraphName = parallelSentence.draw_dot() print content if prevSubgraph!=None: print "DUMMY_"+prevSubgraph+" -> DUMMY_"+subgraphName+" [style=invis];" prevSubgraph=subgraphName print "}"
# NOTE(review): collapsed chunk (line breaks/indentation lost); re-wrap before
# running.  Relies on ENCODING (and presumably the imports/entry-point)
# defined outside this chunk.  Apparent flow: parse CLI options, open the SL
# and TL output files, read gzipped sentences, convert each ParallelSentence
# to "retratos" format, collect every POS tag and '|'-separated subtag of
# known lexforms into tagset, and write one retratos line per side.
# --output_dic_header is declared but not used within this visible chunk —
# presumably consumed further down; verify against the full script.
parser = argparse.ArgumentParser(description='Extracts bilingual phrases.') parser.add_argument('--sentences') parser.add_argument('--output_sl') parser.add_argument('--output_tl') parser.add_argument('--output_dic_header') args = parser.parse_args(sys.argv[1:]) outsl=open(args.output_sl,'w') outtl=open(args.output_tl,'w') tagset=set() for line in gzip.open(args.sentences): line=line.rstrip('\n').decode(ENCODING) #parts=line.split(u" | ") parallelSentence=ParallelSentence() #parallelSentence.parse(u" | ".join(parts[1:]), parseTlLemmasFromDic=False) parallelSentence.parse(line, parseTlLemmasFromDic=True) retratos_s,retratos_t = parallelSentence.to_retratos_format() for lf in parallelSentence.parsed_sl_lexforms+parallelSentence.parsed_tl_lexforms: if not lf.is_unknown(): tagset.add(lf.get_pos()) for tag in lf.get_tags(): subtags=tag.split(u"|") for st in subtags: tagset.add(st) outsl.write(retratos_s.encode(ENCODING)+"\n") outtl.write(retratos_t.encode(ENCODING)+"\n")
# NOTE(review): collapsed chunk that begins MID-EXPRESSION — the leading
# "len(...)) + ..." is the tail of a print/log statement whose start is not
# visible (note the unbalanced closing parenthesis).  Re-wrap before running.
# Apparent flow: first pass reads gzipped sentences, parses each as a
# ParallelSentence and extracts its antiphrases; second pass iterates the
# collected sentences, logs each sentence and antiphrase to stderr, and
# extracts the bilingual phrases containing each antiphrase on the left side.
# Chunk ends mid-loop (probsLeft is initialised but never used here).
len(allowedCountsLextags.freqmap.keys())) + " seqs of lexical tags" numSentence = 0 countsToRemoveCache = dict() parallelSentences = list() #process sentences for line in gzip.open(args.sentences): numSentence += 1 line = line.rstrip('\n').decode('utf-8') #parts=line.split(u" | ") parallelSentence = ParallelSentence() #parallelSentence.parse(u" | ".join(parts[1:]), parseTlLemmasFromDic=False) parallelSentence.parse(line, parseTlLemmasFromDic=False) parallelSentence.extract_antiphrases() parallelSentences.append(parallelSentence) for numSentence, parallelSentence in enumerate(parallelSentences): numSentence += 1 print >> sys.stderr, "Parallel sentence " + str(numSentence) print >> sys.stderr, parallelSentence for antiphrase in parallelSentence.antiphrases: print >> sys.stderr, "\t antiphrase: " + str(antiphrase) bilphrasesLeft = parallelSentence.extract_bilphrases_containing_antiphrase( antiphrase, ParallelSentence.SIDE_LEFT) probsLeft = list()
''' Created on 04/12/2013 @author: vitaka ''' from beamSearchLib import ParallelSentence import sys print "digraph aligments{" prevSubgraph = None for line in sys.stdin: line = line.rstrip('\n').decode('utf-8') #parts=line.split(u" | ") parallelSentence = ParallelSentence() #parallelSentence.parse(u" | ".join(parts[1:]), parseTlLemmasFromDic=False) parallelSentence.parse(line, parseTlLemmasFromDic=False) parallelSentence.extract_antiphrases() for antiphrase in parallelSentence.antiphrases: print >> sys.stderr, antiphrase content, subgraphName = parallelSentence.draw_dot() print content if prevSubgraph != None: print "DUMMY_" + prevSubgraph + " -> DUMMY_" + subgraphName + " [style=invis];" prevSubgraph = subgraphName print "}"