            at.parse(line)
            altRuleList.add(at)
        gfile.close()
        ruleLists=[ruleList,altRuleList]
    
    boxesCoverage=False
    boxesDic=dict()
    if args.final_boxes_index:
        for line in open(args.final_boxes_index):
            parts=line.split("\t")
            boxesDic[parts[1].strip()]=int(parts[0])
        boxesCoverage=True
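    # The --final_boxes_index file read above is assumed to be tab-separated,
    # one entry per line, "<integer index>\t<box name>", e.g. (box names here
    # are hypothetical, for illustration only):
    #     0	chunk_NP
    #     1	chunk_VP
    # so boxesDic maps each box name back to its integer position.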
    
    numLine=0
    for line in sys.stdin:
        numLine+=1
        line=line.rstrip('\n').decode('utf-8')
        parts=line.split('|')
        if len(parts) > 5 and not args.tt1_beam:
            #wrong sentence
            print ""
        else:
            parallelSentence=ParallelSentence()
            parallelSentence.parse(line, parseTlLemmasFromDic=True,forRBPE=args.rbpe)
            if not args.tt1_beam:
                parallelSentence.add_explicit_empty_tags()
            finalHypotheses=parallelSentence.compute_coverages_and_bleu(ruleLists,int(args.beam_size),boxesCoverage,boxesDic,args.allow_incompatible_rules,args.tt1_beam)
            print u"|||".join([unicode(h) for h in finalHypotheses]).encode('utf-8')
            print >> sys.stderr, "finished line "+str(numLine)

Example #2
            
        line=line.strip().decode('utf-8')
        at=AlignmentTemplate()
        at.parse(line)
        ruleList.add(at)
    gfile.close()

#load sentences
sentences=list()
if args.sentences:
    if args.sentences.lower().endswith('.gz'):
        gfile=gzip.open(args.sentences)
    else:
        gfile=open(args.sentences)
    for line in gfile:
        line=line.strip().decode('utf-8')
        parallelSentence=ParallelSentence()
        parallelSentence.parse(line, parseTlLemmasFromDic=True, forRBPE=args.rbpe)
        if not args.tt1_beam:
            parallelSentence.add_explicit_empty_tags()
        sentences.append(parallelSentence)
    gfile.close()

boxesDic=dict()
if args.final_boxes_index:
    for line in open(args.final_boxes_index):
        parts=line.split("\t")
        boxesDic[parts[1].strip()]=int(parts[0])

#read best rule application for each sentence
bestHypothesisForEachSentence=list()
emptyIndexes=set()
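
The gzip-or-plain open logic above recurs in several of these examples; a minimal sketch of the same idea factored into a helper (the name smart_open is mine, not part of the original code):

def smart_open(path):
    #transparently open either a gzip-compressed or a plain-text file;
    #assumes the gzip module is imported, as in the examples above
    if path.lower().endswith('.gz'):
        return gzip.open(path)
    return open(path)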
Example #3
'''
Created on 12/02/2014

@author: vitaka
'''
#stdinput: sentences to be translated
from beamSearchLib import RuleList, ParallelSentence, RuleApplicationHypothesis
from ruleLearningLib import AlignmentTemplate
import ruleLearningLib
import argparse
import gzip
import sys

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--tag_groups_file_name', required=True)
    parser.add_argument('--tag_sequences_file_name', required=True)
    args = parser.parse_args(sys.argv[1:])

    ruleLearningLib.AT_LexicalTagsProcessor.initialize(
        args.tag_groups_file_name, args.tag_sequences_file_name)
    for line in sys.stdin:
        line = line.rstrip('\n').decode('utf-8')
        print "Parsing ..."
        print line.encode('utf-8')
        parallelSentence = ParallelSentence()
        parallelSentence.parse(line, parseTlLemmasFromDic=True)
        parallelSentence.add_explicit_empty_tags()
    print "Everything OK"
'''
Created on 12/02/2014

@author: vitaka
'''
#stdinput: sentences to be translated
from beamSearchLib import RuleList, ParallelSentence, RuleApplicationHypothesis
from ruleLearningLib import AlignmentTemplate
import ruleLearningLib
import argparse
import gzip
import sys

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--tag_groups_file_name',required=True)
    parser.add_argument('--tag_sequences_file_name',required=True)
    args = parser.parse_args(sys.argv[1:])
    
    ruleLearningLib.AT_LexicalTagsProcessor.initialize(args.tag_groups_file_name,args.tag_sequences_file_name)
    for line in sys.stdin:
        line=line.rstrip('\n').decode('utf-8')
        print "Parsing ..."
        print line.encode('utf-8')
        parallelSentence=ParallelSentence()
        parallelSentence.parse(line, parseTlLemmasFromDic=True)
        parallelSentence.add_explicit_empty_tags()
    print "Everything OK"
Example #5
            at.parse(line)
            ruleList.add(at)
        gfile.close()
    print >> sys.stderr, "... done"

    print >> sys.stderr, "Loading sentences ..."
    #load sentences
    sentences = list()
    if args.sentences:
        if args.sentences.lower().endswith(".gz"):
            gfile = gzip.open(args.sentences)
        else:
            gfile = open(args.sentences)
        for line in gfile:
            line = line.strip().decode('utf-8')
            parallelSentence = ParallelSentence()
            parallelSentence.parse(line,
                                   parseTlLemmasFromDic=True,
                                   forRBPE=args.rbpe)
            parallelSentence.add_explicit_empty_tags()
            sentences.append(parallelSentence)
        gfile.close()
    print >> sys.stderr, "... done"

    boxesInvDic = dict()
    boxesDic = dict()
    if args.final_boxes_index:
        for line in open(args.final_boxes_index):
            parts = line.split("\t")
            boxesInvDic[int(parts[0])] = parts[1].strip()
            boxesDic[parts[1].strip()] = int(parts[0])
Example #6
bslSet=ruleLearningLib.BilingualSequenceLexSet()

parallelSentences=list()

MYMETHOD=True
if args.advisors_method:
    MYMETHOD=False

if args.extremes_variant != "antiphrases":
    numLine=0
    for line in gzip.open(args.sentences):
        numLine+=1
        print >> sys.stderr, "Line "+str(numLine)
        line=line.rstrip('\n').decode('utf-8')
        #parts=line.split(u" | ")
        parallelSentence=ParallelSentence()
        #parallelSentence.parse(u" | ".join(parts[1:]), parseTlLemmasFromDic=False)
        parallelSentence.parse(line, parseTlLemmasFromDic=True)

        #ugly fix for errors in previous steps: if the number of restrictions of TL lemmas from the
        #dictionary does not match the number of SL lexical forms, pad or truncate accordingly
        differenceRestrictions=len(parallelSentence.parsed_restrictions) - len(parallelSentence.parsed_sl_lexforms)
        if differenceRestrictions > 0:
            #we have more restrictions than SL lexforms: remove the extra restrictions
            for num in range(differenceRestrictions):
                parallelSentence.parsed_restrictions.pop()
        elif differenceRestrictions < 0:
            #we have fewer restrictions than SL lexforms: pad with synthetic empty restrictions
            for num in range(-differenceRestrictions):
                newrestriction=ruleLearningLib.AT_Restriction()
                newrestriction.parse(u"__EMPTYRESTRICTION__")
                parallelSentence.parsed_restrictions.append(newrestriction)
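
The length fix above pads or truncates parsed_restrictions until it matches parsed_sl_lexforms. A minimal stand-alone sketch of that pad-or-truncate pattern (pad_or_truncate and make_empty are hypothetical names, not part of the original code):

def pad_or_truncate(items, targetLength, make_empty):
    #drop surplus entries from the end
    while len(items) > targetLength:
        items.pop()
    #pad with synthetic empty entries until lengths match
    while len(items) < targetLength:
        items.append(make_empty())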
Example #7
    boxesCoverage = False
    boxesDic = dict()
    if args.final_boxes_index:
        for line in open(args.final_boxes_index):
            parts = line.split("\t")
            boxesDic[parts[1].strip()] = int(parts[0])
        boxesCoverage = True

    numLine = 0
    for line in sys.stdin:
        numLine += 1
        line = line.rstrip('\n').decode('utf-8')
        parts = line.split('|')
        if len(parts) > 5 and not args.tt1_beam:
            #wrong sentence
            print ""
        else:
            parallelSentence = ParallelSentence()
            parallelSentence.parse(line,
                                   parseTlLemmasFromDic=True,
                                   forRBPE=args.rbpe)
            if not args.tt1_beam:
                parallelSentence.add_explicit_empty_tags()
            finalHypotheses = parallelSentence.compute_coverages_and_bleu(
                ruleLists, int(args.beam_size), boxesCoverage, boxesDic,
                args.allow_incompatible_rules, args.tt1_beam)
            print u"|||".join([unicode(h)
                               for h in finalHypotheses]).encode('utf-8')
            print >> sys.stderr, "finished line " + str(numLine)
'''
Created on 04/12/2013

@author: vitaka
'''
from beamSearchLib import ParallelSentence
import sys


print "digraph aligments{"
prevSubgraph=None
for line in sys.stdin:
    line=line.rstrip('\n').decode('utf-8')
    #parts=line.split(u" | ")
    parallelSentence=ParallelSentence()
    #parallelSentence.parse(u" | ".join(parts[1:]), parseTlLemmasFromDic=False)
    parallelSentence.parse(line, parseTlLemmasFromDic=False)
    parallelSentence.extract_antiphrases()
    
    for antiphrase in parallelSentence.antiphrases:
        print >> sys.stderr, antiphrase
    
    content,subgraphName = parallelSentence.draw_dot()
    print content
    if prevSubgraph is not None:
        print "DUMMY_"+prevSubgraph+" -> DUMMY_"+subgraphName+" [style=invis];"
    prevSubgraph=subgraphName
print "}"
Example #9
parser = argparse.ArgumentParser(description='Extracts bilingual phrases.')
parser.add_argument('--sentences')
parser.add_argument('--output_sl')
parser.add_argument('--output_tl')
parser.add_argument('--output_dic_header')
args = parser.parse_args(sys.argv[1:])

outsl=open(args.output_sl,'w')
outtl=open(args.output_tl,'w')

tagset=set()

for line in gzip.open(args.sentences):
    line=line.rstrip('\n').decode(ENCODING)
    #parts=line.split(u" | ")
    parallelSentence=ParallelSentence()
    #parallelSentence.parse(u" | ".join(parts[1:]), parseTlLemmasFromDic=False)
    parallelSentence.parse(line, parseTlLemmasFromDic=True)
    retratos_s,retratos_t = parallelSentence.to_retratos_format()

    for lf in parallelSentence.parsed_sl_lexforms+parallelSentence.parsed_tl_lexforms:
        if not lf.is_unknown():
            tagset.add(lf.get_pos())
            for tag in lf.get_tags():
                subtags=tag.split(u"|")
                for st in subtags:
                    tagset.add(st)

    outsl.write(retratos_s.encode(ENCODING)+"\n")
    outtl.write(retratos_t.encode(ENCODING)+"\n")

Example #10
 
            len(allowedCountsLextags.freqmap.keys())) + " seqs of lexical tags"

    numSentence = 0

    countsToRemoveCache = dict()

    parallelSentences = list()

    #process sentences
    for line in gzip.open(args.sentences):

        numSentence += 1

        line = line.rstrip('\n').decode('utf-8')
        #parts=line.split(u" | ")
        parallelSentence = ParallelSentence()
        #parallelSentence.parse(u" | ".join(parts[1:]), parseTlLemmasFromDic=False)
        parallelSentence.parse(line, parseTlLemmasFromDic=False)
        parallelSentence.extract_antiphrases()
        parallelSentences.append(parallelSentence)

    for numSentence, parallelSentence in enumerate(parallelSentences, 1):
        print >> sys.stderr, "Parallel sentence " + str(numSentence)
        print >> sys.stderr, parallelSentence

        for antiphrase in parallelSentence.antiphrases:
            print >> sys.stderr, "\t antiphrase: " + str(antiphrase)
            bilphrasesLeft = parallelSentence.extract_bilphrases_containing_antiphrase(
                antiphrase, ParallelSentence.SIDE_LEFT)
            probsLeft = list()
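
The fragment is cut off here: for each antiphrase it gathers the bilingual phrases that contain the antiphrase on the left side and prepares a probability list. A hedged sketch of how that list might then be filled (score_bilphrase is my own illustrative name, not the original API):

#for bilphrase in bilphrasesLeft:
#    probsLeft.append(score_bilphrase(bilphrase))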
Example #11
'''
Created on 04/12/2013

@author: vitaka
'''
from beamSearchLib import ParallelSentence
import sys

print "digraph aligments{"
prevSubgraph = None
for line in sys.stdin:
    line = line.rstrip('\n').decode('utf-8')
    #parts=line.split(u" | ")
    parallelSentence = ParallelSentence()
    #parallelSentence.parse(u" | ".join(parts[1:]), parseTlLemmasFromDic=False)
    parallelSentence.parse(line, parseTlLemmasFromDic=False)
    parallelSentence.extract_antiphrases()

    for antiphrase in parallelSentence.antiphrases:
        print >> sys.stderr, antiphrase

    content, subgraphName = parallelSentence.draw_dot()
    print content
    if prevSubgraph is not None:
        print "DUMMY_" + prevSubgraph + " -> DUMMY_" + subgraphName + " [style=invis];"
    prevSubgraph = subgraphName
print "}"