コード例 #1
0
ファイル: setview.py プロジェクト: proycon/colibrita
def main():
    """Pretty-print every sentence pair of a set file to stdout.

    Usage: setview.py <setfile>
    Exits with status 2 when no file argument is given.
    """
    try:
        filename = sys.argv[1]
    except IndexError:  # was a bare except; only a missing argument should trigger usage
        print("Please specify a file to view",file=sys.stderr)
        sys.exit(2)

    reader = Reader(filename)
    for sentencepair in reader:
        print("----------- Sentence #" + str(sentencepair.id) + " -----------")
        if sentencepair.input:
            print("Input: ", end="")
            print(sentencepair.inputstr(True,"blue"))
        if sentencepair.ref:
            print("Reference: ", end="")
            print(sentencepair.refstr(True,"green"))
        if sentencepair.output:
            print("Output: ", end="")
            # BUG FIX: the original printed refstr() under the "Output:" label;
            # use outputstr() so the actual system output is shown
            print(sentencepair.outputstr(True,"yellow"))
        if sentencepair.source:
            print("Source: ", end="")
            print(sentencepair.source)
        if sentencepair.category:
            print("Category: ", end="")
            print(sentencepair.category)
        # Find the last Fragment in the reference (if any) and list its alternatives.
        # Guard against ref being None (e.g. sets stripped by setdist), which
        # previously raised a TypeError here.
        fragment = None
        for x in (sentencepair.ref or ()):
            if isinstance(x, Fragment):
                fragment = x
        if fragment and fragment.alternatives:
            print("Alternatives: ", end="")
            print("; ".join([str(x) for x in fragment.alternatives]))
        print()
    reader.close()
コード例 #2
0
ファイル: setdist.py プロジェクト: proycon/colibrita
def main():
    """Copy a set file, clearing the ref, source and category fields of every
    sentence pair (keeping only the input side).

    Usage: setdist.py <inputset> <outputset>
    Exits with status 2 when arguments are missing.
    """
    try:
        inputset = sys.argv[1]
        outputset = sys.argv[2]
    except IndexError:  # was a bare except, which would also mask SystemExit/KeyboardInterrupt
        print("Syntax: inputset outputset",file=sys.stderr)
        sys.exit(2)

    reader = Reader(inputset)
    writer = Writer(outputset)
    for sentencepair in reader:
        # strip everything except the input side
        sentencepair.ref = None
        sentencepair.source = None
        sentencepair.category = None
        writer.write(sentencepair)
    writer.close()
    reader.close()
コード例 #3
0
ファイル: setmerge.py プロジェクト: proycon/colibrita
def main():
    """Merge several set files into one output set, renumbering ids
    consecutively across all inputs.

    Usage: setmerge.py <outputset> <inputset> [<inputset2> ...]
    Exits with status 2 when arguments are missing.
    """
    try:
        outputset = sys.argv[1]
    except IndexError:
        print("Syntax: outputset inputset inputset2...", file=sys.stderr)
        sys.exit(2)
    inputsets = sys.argv[2:]
    # BUG FIX: sys.argv[2:] never raises, so the original usage check was
    # unreachable when no input sets were given and an empty output set was
    # silently produced; check the slice explicitly instead.
    if not inputsets:
        print("Syntax: outputset inputset inputset2...", file=sys.stderr)
        sys.exit(2)

    seqid = 0  # running id over all merged sets (renamed: 'id' shadows the builtin)
    writer = Writer(outputset)
    for inputset in inputsets:
        reader = Reader(inputset)
        for sentencepair in reader:
            seqid += 1
            sentencepair.id = seqid
            writer.write(sentencepair)
        reader.close()
    writer.close()
コード例 #4
0
ファイル: setshuffle.py プロジェクト: proycon/colibrita
def main():
    """Randomly shuffle the sentence pairs of a set file and renumber them
    from 1 in the new order.

    Usage: setshuffle.py <inputset> <outputset>
    Exits with status 2 when arguments are missing.
    """
    try:
        inputset = sys.argv[1]
        outputset = sys.argv[2]
    except IndexError:  # was a bare except; only missing arguments should trigger usage
        print("Syntax: inputset outputset",file=sys.stderr)
        sys.exit(2)

    # materialise all pairs so they can be shuffled in memory
    reader = Reader(inputset)
    sentencepairs = list(reader)
    reader.close()

    random.shuffle(sentencepairs)
    writer = Writer(outputset)
    for i, sentencepair in enumerate(sentencepairs):
        sentencepair.id = i+1  # ids are 1-based
        writer.write(sentencepair)
    writer.close()
コード例 #5
0
ファイル: common.py プロジェクト: proycon/colibrita
def generate(testoutput, ttablefile, gizamodelfile_s2t, gizamodelfile_t2s, patternmodelfile_source, patternmodelfile_target, classfile_source, classfile_target, size =0, joinedprobabilitythreshold = 0.01, divergencefrombestthreshold=0.8,DEBUG = False):
    """Extract sentence-pair instances and write them to a set file.

    Streams candidate pairs from extractpairs() and writes every pair that
    makesentencepair() marks as valid.  When size > 0, all instances are first
    written to testoutput + '.tmp' and a random sample of `size` instances is
    then copied to testoutput with fresh consecutive ids; otherwise instances
    are written straight to testoutput.
    """


    if size > 0:
        # sampling requested: write everything to a temporary file first
        print("Extracting instances, writing to " + testoutput + '.tmp',file=sys.stderr)
        writer = Writer(testoutput+'.tmp')
    else:
        print("Extracting instances, writing to " + testoutput,file=sys.stderr)
        writer = Writer(testoutput)



    prevsentence = -1  # sentinel: no input sentence processed yet
    id = 0             # running instance id; ends up as the total instance count
    for sourcepattern, targetpattern, sourceoffset, targetoffset, sourcesentence, targetsentence, sentence in extractpairs(ttablefile, gizamodelfile_s2t, gizamodelfile_t2s, patternmodelfile_source, patternmodelfile_target, classfile_source, classfile_target, joinedprobabilitythreshold, divergencefrombestthreshold, DEBUG):
        id += 1
        if sentence != prevsentence:
            # progress report, once per input sentence
            print(datetime.datetime.now().strftime('%H:%M:%S'), "Input sentence #" + str(sentence) + " , Output sentence #" + str(id), file=sys.stderr)
            prevsentence = sentence
        valid, sentencepair = makesentencepair(id, sourcepattern, targetpattern, sourceoffset, targetoffset, sourcesentence, targetsentence)
        if valid:
            writer.write(sentencepair)

    writer.close()

    if size > 0:
        print("Sampling " + str(size),file=sys.stderr)
        # NOTE(review): random.sample raises ValueError when size > id (fewer
        # instances were extracted than requested) — confirm callers guarantee this
        selected_ids = set(random.sample( range(1,id+1), size ))
        writer = Writer(testoutput)
        reader = Reader(testoutput+'.tmp')
        newid = 0  # sampled pairs are renumbered consecutively from 1
        for sentencepair in reader:
            if int(sentencepair.id) in selected_ids:
                newid += 1
                sentencepair.id = newid
                writer.write(sentencepair)
        reader.close()
        writer.close()
        # NOTE(review): the temporary '.tmp' file is left on disk after sampling
コード例 #6
0
ファイル: settok.py プロジェクト: proycon/colibrita
def _retokenise(sentencepair, attr, label, fraglang, ctxlang, retok_alternatives):
    """Re-tokenise one side (attribute *attr*) of *sentencepair* in place.

    The fragment itself is tokenised in *fraglang*, the surrounding context —
    and, when *retok_alternatives* is True, the fragment's alternatives — in
    *ctxlang*.  *label* is the prefix for the debug output on stderr.
    """
    for left, fragment, right in sentencepair.fragments(getattr(sentencepair, attr)):
        print(label, left, file=sys.stderr)
        print("                      F=", fragment.value, file=sys.stderr)
        print("                      R=", right, file=sys.stderr)
        left = tok(left, ctxlang) if left.strip() else ""
        alternatives = fragment.alternatives
        # build a fresh fragment with tokenised content; keep the original id
        fragment = Fragment(tok(fragment.value, fraglang), id=fragment.id)
        if retok_alternatives:
            for alternative in alternatives:
                fragment.alternatives.append(Alternative(tok(alternative.value, ctxlang)))
        right = tok(right, ctxlang) if right.strip() else ""
        if left and right:
            newcontent = left + (fragment,) + right
        elif left:
            newcontent = left + (fragment,)
        elif right:
            newcontent = (fragment,) + right
        else:
            # BUG FIX: the original left the result variable unbound (NameError)
            # when the fragment spanned the whole sentence; the tokenised
            # sentence is then just the fragment itself
            newcontent = (fragment,)
        setattr(sentencepair, attr, newcontent)


def main():
    """Tokenise all sides (reference, output, input) of a set file.

    Usage: settok.py <inputset> <outputset> <l1> <l2>
    l1 is the language of the input fragments; l2 is the language of
    everything else (context, reference/output fragments, alternatives).
    """
    try:
        inputset = sys.argv[1]
        outputset = sys.argv[2]
        l1 = sys.argv[3]
        l2 = sys.argv[4]
    except IndexError:  # was a bare except; only missing arguments should trigger usage
        print("Syntax: inputset outputset l1 l2", file=sys.stderr)
        sys.exit(2)

    writer = Writer(outputset)
    reader = Reader(inputset)
    for sentencepair in reader:
        if sentencepair.ref:
            _retokenise(sentencepair, 'ref', "Tokenising reference: L=", l2, l2, True)
        if sentencepair.output:
            # BUG FIX: the output branch now keeps the fragment id and tokenises
            # alternatives from alt.value with the language argument (the
            # original called tok(alt) on the Alternative object itself and
            # dropped the fragment id — cf. the reference branch)
            _retokenise(sentencepair, 'output', "Tokenising output:    L=", l2, l2, True)
        if sentencepair.input:
            # input fragments are tokenised in l1; alternatives were never
            # re-tokenised for the input side in the original, so keep that
            _retokenise(sentencepair, 'input', "Tokenising input:     L=", l1, l2, False)
        writer.write(sentencepair)
    reader.close()
    writer.close()
コード例 #7
0
ファイル: moses.py プロジェクト: proycon/colibrita
def main():
    """Translate the marked fragment of every sentence pair in a dataset with
    Moses and rerank the n-best hypotheses with an ARPA language model.

    Writes <output>.moses.ini, runs Moses to produce <output>.nbestlist
    (unless it already exists), then for each sentence replaces the input
    fragment with the best reranked hypothesis and writes the result to
    <output>.output.xml.
    """
    parser = argparse.ArgumentParser(description="Colibrita - Translation Assistance using Moses", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-f','--dataset', type=str,help="Dataset file", action='store',default="",required=True)
    parser.add_argument('--debug','-d', help="Debug", action='store_true', default=False)
    parser.add_argument('-o','--output',type=str,help="Output prefix", required = True)
    parser.add_argument('-T','--ttable', type=str,help="Phrase translation table (file) to use when testing with --lm and without classifier training", action='store',required=True)
    parser.add_argument('--lm',type=str, help="Language model (file in ARPA format, as produced by for instance SRILM)", action='store',required=True)
    parser.add_argument('--lmweight',type=float, help="Language model weight for Moses ", action='store',default=0.5)
    parser.add_argument('--lmorder',type=float, help="Language model order", action='store',default=3)
    parser.add_argument('--dweight',type=float, help="Distortion weight for Moses", action='store',default=0.6)
    parser.add_argument('--tmweights',type=str, help="Translation model weights for Moses (comma separated)", action='store',default="0.20,0.20,0.20,0.20,0.20")
    parser.add_argument('--lmweightrr',type=float, help="Language model weight in reranking", action='store',default=1)
    parser.add_argument('--tweightrr',type=float, help="Translation model weight in reranking", action='store',default=1)
    parser.add_argument('-n','--n',type=int,help="Number of output hypotheses per sentence", default=25)
    parser.add_argument('-a','--a',type=int,help="Add alternative translations, up to the specified numer", default=0)

    args = parser.parse_args()

    #if os.path.exists(args.output):
    #    print("Output already " + args.output + " already exists, doing nothing..",file=sys.stderr)
    #    sys.exit(2)
    #else:
    #    os.mkdir(args.output)

    # fail fast on missing model files before any work is done
    if not os.path.exists(args.ttable):
        print("Translation table " + args.ttable + " does not exist", file=sys.stderr)
        sys.exit(2)

    if not os.path.exists(args.lm):
        print("Language model " + args.lm + " does not exist", file=sys.stderr)
        sys.exit(2)


    data = Reader(args.dataset)

    # Write a minimal moses.ini pointing at the supplied phrase table and LM.
    f = open(args.output + '.moses.ini','w',encoding='utf-8')
    f.write("[input-factors]\n0\n\n")
    f.write("[mapping]\n0 T 0\n\n")
    f.write("[ttable-file]\n0 0 0 5 " + args.ttable + "\n\n")
    f.write("[lmodel-file]\n0 0 " + str(args.lmorder) + " " + args.lm + "\n\n")
    f.write("[ttable-limit]\n20\n\n")
    f.write("[weight-d]\n" + str(args.dweight) + "\n\n")
    f.write("[weight-l]\n" + str(args.lmweight) + "\n\n")
    # one weight-t line per comma-separated translation model weight
    f.write("[weight-t]\n" + "\n".join(args.tmweights.split(',')) + "\n\n")
    f.write("[weight-w]\n-1\n")
    f.write("[distortion-limit]\n6\n")
    f.close()


    if not os.path.exists(args.output + ".nbestlist"):
        cmd = 'moses -f ' + args.output + '.moses.ini -n-best-list ' + args.output + '.nbestlist ' + str(args.n)
        print("Calling moses: " + cmd,file=sys.stderr)
        p = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stdin=subprocess.PIPE,stderr=subprocess.PIPE)
        # feed each input fragment to Moses on stdin, one fragment per line
        for sentencepair in data:
            for left, sourcefragment, right in sentencepair.inputfragments():
                p.stdin.write( (str(sourcefragment) + "\n").encode('utf-8'))
        # NOTE(review): communicate() flushes and closes stdin itself; the
        # explicit close() afterwards acts on an already-closed pipe — confirm
        # the intended order (close before communicate) here
        p.communicate()
        p.stdin.close()

        # rewind the dataset reader so it can be iterated again below
        data.reset()
    else:
        print("Moses output already exists, not overwriting. Delete " + args.output + ".nbestlist if you want a fresh run.",file=sys.stderr)


    print("Loading Language model", file=sys.stderr)
    lm = ARPALanguageModel(args.lm)

    print("Processing moses output...",file=sys.stderr)

    # Parse the n-best list into one list of (hypothesis, tscore) per sentence.
    previndex = -1       # sentence index of the previously parsed line
    sentenceoutput = []  # sentenceoutput[i] = hypotheses for sentence i
    hypotheses = []
    with open(args.output+'.nbestlist','r',encoding='utf-8') as f:
        for line in f:
            # Moses n-best format: index ||| hypothesis ||| raw scores ||| total
            fields = [ x.strip() for x in  line.strip().split("|||") ]
            print(fields,file=sys.stderr)
            index = int(fields[0])
            if index != previndex:
                # a new sentence starts; flush the hypotheses of the previous one
                if hypotheses:
                    sentenceoutput.append( hypotheses )
                hypotheses = []
            previndex = index
            solution = fields[1]
            rawscores = fields[2].split(' ')
            print(rawscores,file=sys.stderr)
            # assumes the translation-model score is the 10th value of the raw
            # score vector for this feature configuration — TODO confirm
            tscore = float(rawscores[9])
            hypotheses.append( (solution, tscore) )
        sentenceoutput.append( hypotheses ) #don't forget last one

    writer = Writer(args.output + '.output.xml')
    for i, sentencepair in enumerate(data):
        # start from a copy of the input; the fragment gets replaced below
        sentencepair.output = copy(sentencepair.input)
        hypotheses = sentenceoutput[i]
        for left, inputfragment, right in sentencepair.inputfragments():
            candidatesentences = []
            # sentinels: any real (log) score is higher than these
            bestlmscore = -999999999
            besttscore = -999999999
            for hypothesis, tscore in hypotheses:
                #compute new lm score
                outputfragment = Fragment(tuple(hypothesis.split(' ')), inputfragment.id)
                candidatesentence = sentencepair.replacefragment(inputfragment, outputfragment, sentencepair.output)
                lminput = " ".join(sentencepair._str(candidatesentence)).split(" ") #joining and splitting deliberately to ensure each word is one item
                lmscore = lm.score(lminput)
                # lm.score is expected to return a log probability (hence <= 0)
                assert lmscore <= 0
                # track the best scores for normalisation in the reranking step
                if lmscore > bestlmscore:
                    bestlmscore = lmscore
                if tscore > besttscore:
                    besttscore = tscore

                candidatesentences.append( ( candidatesentence, hypothesis, tscore, lmscore ) )

            #compute scores
            # rerank: weighted scores relative to the best hypothesis (<= 0 each)
            solutions = []
            for candidatesentence, targetpattern, tscore, lmscore in candidatesentences:
                tscore = args.tweightrr * (tscore-besttscore)
                lmscore = args.lmweightrr * (lmscore-bestlmscore)
                score = tscore + lmscore
                print(targetpattern + " --- tscore=" + str(tscore) + ", lmscore=" + str(lmscore),file=sys.stderr)
                solutions.append( (score, targetpattern) )

            # best (highest) combined score first
            solutions = sorted(solutions, key=lambda x: -1 * x[0])

            translation = tuple(solutions[0][1].split())
            outputfragment = Fragment(translation, inputfragment.id)
            print("\t" + str(inputfragment) + " -> " + str(outputfragment), file=sys.stderr)

            if args.a:
                # attach up to args.a runner-up translations as alternatives
                for score, solution in solutions[1:1+args.a]:
                    outputfragment.alternatives.append( Alternative( tuple(solution.split()), confidence=score) )

            sentencepair.output = sentencepair.replacefragment(inputfragment, outputfragment, sentencepair.output)

            # NOTE(review): the write happens inside the fragment loop, so
            # sentence pairs without any input fragment are never written
            writer.write(sentencepair)

            break #only support one iteration for now, one fragment per sentence
    writer.close()


    print("All done.", file=sys.stderr)