# Map each final box key (second TSV column) to its numeric index (first
# column).  Only populated when an index file was supplied on the command line.
boxesDic = dict()
if args.final_boxes_index:
    # BUGFIX: use 'with' so the index file is closed even on a parse error
    # (the original open() leaked the file handle).
    with open(args.final_boxes_index) as indexFile:
        for line in indexFile:
            parts = line.split("\t")
            boxesDic[parts[1].strip()] = int(parts[0])

#read best rule application for each sentence
# One entry per stdin line, "|||"-separated records; field 0 holds the
# serialized hypothesis.  Empty or unparseable lines yield a None placeholder
# so indexes stay aligned with 'sentences'; their positions are remembered in
# emptyIndexes for later stages to skip.
bestHypothesisForEachSentence = list()
emptyIndexes = set()
numLine = 0
for line in sys.stdin:
    line = line.decode('utf-8').strip()
    if len(line) > 0:
        parts = line.split(u"|||")
        try:
            bestHypothesisForEachSentence.append(
                RuleApplicationHypothesis.create_and_parse(
                    parts[0], parseTranslation=True))
        except ruleLearningLib.AT_ParsingError:
            bestHypothesisForEachSentence.append(None)
            emptyIndexes.add(numLine)
            print >> sys.stderr, "ERROR parsing line: " + line.encode('utf-8')
    else:
        bestHypothesisForEachSentence.append(None)
        emptyIndexes.add(numLine)
    numLine += 1

# Sanity check: exactly one hypothesis slot per input sentence.
if len(bestHypothesisForEachSentence) != len(sentences):
    print >> sys.stderr, "ERROR: different length of sentences and best hyportheses"
    # BUGFIX: exit with a non-zero status so the failure is visible to the
    # calling shell (plain exit() reported success / status 0).
    exit(1)

for numSentence, bestHyp in enumerate(bestHypothesisForEachSentence):
    sentence = sentences[numSentence]
    # (loop body continues beyond this chunk)
''' from beamSearchLib import RuleApplicationHypothesis,RuleList,ParallelSentence import argparse import ruleLearningLib import sys,gzip from ruleLearningLib import debug,AlignmentTemplate if __name__ == "__main__": parser = argparse.ArgumentParser(description='select alternative sets of Ats which maximise 1-BLEU score') parser.add_argument('--tag_groups_file_name',required=True) parser.add_argument('--tag_sequences_file_name',required=True) parser.add_argument('--debug', action='store_true') args = parser.parse_args(sys.argv[1:]) if args.debug: DEBUG=True ruleLearningLib.DEBUG=True ruleLearningLib.AT_LexicalTagsProcessor.initialize(args.tag_groups_file_name,args.tag_sequences_file_name) l_best_hypothesis=list() for line in sys.stdin: line=line.decode('utf-8').strip() parts=line.split(u"|||") l_best_hypothesis.append(RuleApplicationHypothesis.create_and_parse(parts[0])) resultTuples=RuleApplicationHypothesis.select_boxes_from_alternative_at_sets(l_best_hypothesis) for boxid,altatset in resultTuples: print str(boxid)+"\t"+str(altatset)
ll_hypothesis = list() if args.input: inputfile = gzip.open(args.input) else: inputfile = sys.stdin print >> sys.stderr, "Loading scores ..." for line in inputfile: line = line.decode('utf-8').strip() parts = line.split(u"|||") if nfirst != None: parts = parts[:nfirst] ll_hypothesis.append([ RuleApplicationHypothesis.create_and_parse(part) for part in parts if len(part) > 0 ]) if args.input: inputfile.close() print >> sys.stderr, "... done" print >> sys.stderr, "Maximising score of " + str( len(ll_hypothesis)) + " sentences" if args.only_hyps_with_maximum_local or args.super_heuristic or args.select_boxes_minimum: #remove all non-maximum hypotheses for numSentence, l_hypothesis in enumerate(ll_hypothesis): firstNotMaximumIndex = len(l_hypothesis) if firstNotMaximumIndex > 0:
from ruleLearningLib import debug, AlignmentTemplate if __name__ == "__main__": parser = argparse.ArgumentParser( description='select alternative sets of Ats which maximise 1-BLEU score' ) parser.add_argument('--tag_groups_file_name', required=True) parser.add_argument('--tag_sequences_file_name', required=True) parser.add_argument('--debug', action='store_true') args = parser.parse_args(sys.argv[1:]) if args.debug: DEBUG = True ruleLearningLib.DEBUG = True ruleLearningLib.AT_LexicalTagsProcessor.initialize( args.tag_groups_file_name, args.tag_sequences_file_name) l_best_hypothesis = list() for line in sys.stdin: line = line.decode('utf-8').strip() parts = line.split(u"|||") l_best_hypothesis.append( RuleApplicationHypothesis.create_and_parse(parts[0])) resultTuples = RuleApplicationHypothesis.select_boxes_from_alternative_at_sets( l_best_hypothesis) for boxid, altatset in resultTuples: print str(boxid) + "\t" + str(altatset)
# Populate boxesDic from the final boxes index file: each line is
# "<numeric id>\t<box key>"; the dict maps key -> id.
# NOTE(review): boxesDic is presumably initialised earlier in this file —
# it is not defined in this chunk; confirm against the surrounding code.
if args.final_boxes_index:
    for line in open(args.final_boxes_index):
        parts = line.split("\t")
        boxesDic[parts[1].strip()] = int(parts[0])
#read best rule application for each sentence
# One entry per stdin line, "|||"-separated records.  None marks empty or
# unparseable lines; their positions are collected in emptyIndexes so that
# indexes stay aligned with 'sentences'.
bestHypothesisForEachSentence = list()
emptyIndexes = set()
numLine = 0
for line in sys.stdin:
    line = line.decode('utf-8').strip()
    if len(line) > 0:
        parts = line.split(u"|||")
        try:
            # Field 0 holds the serialized hypothesis; the translation text
            # is parsed as well (parseTranslation=True).
            bestHypothesisForEachSentence.append(
                RuleApplicationHypothesis.create_and_parse(
                    parts[0], parseTranslation=True))
        except ruleLearningLib.AT_ParsingError:
            # Keep a placeholder so positions still match 'sentences'.
            bestHypothesisForEachSentence.append(None)
            emptyIndexes.add(numLine)
            print >> sys.stderr, "ERROR parsing line: " + line.encode(
                'utf-8')
    else:
        bestHypothesisForEachSentence.append(None)
        emptyIndexes.add(numLine)
    numLine += 1
# Abort with a non-zero status when the hypothesis count does not match the
# sentence count.
if len(bestHypothesisForEachSentence) != len(sentences):
    print >> sys.stderr, "ERROR: different length of sentences and best hyportheses"
    exit(1)
# (loop body continues beyond this chunk)
for numSentence, bestHyp in enumerate(bestHypothesisForEachSentence):