Beispiel #1
0
def main():
    """Merge new sentence pairs from an input set into an output set.

    Skips pairs whose input sentence is already present in the output set
    (compared by hash), pairs longer than *maxwords*, and pairs before the
    1-based *offset*.  Pairs are handed to ``processbuffer`` in batches.

    Command line: inputset outputset [offset [maxwords]]
    """
    if len(sys.argv) < 3 or len(sys.argv) > 5:
        print("Syntax: inputset outputset offset maxwords",file=sys.stderr)
        sys.exit(2)
    try:
        inputset, outputset, offset, maxwords = sys.argv[1:]
        offset = int(offset)
        maxwords = int(maxwords)
    except ValueError:  # fewer than four arguments were given
        maxwords = 99  # default cap on input sentence length (in words)
        try:
            inputset, outputset, offset = sys.argv[1:]
            offset = int(offset)
        except ValueError:  # only two arguments: start from the beginning
            inputset, outputset = sys.argv[1:]
            offset = 1

    buffer = []
    BUFFERSIZE = 10
    tmpfile = False
    # Hashes of input sentences already present in the output set
    inputs = set()
    if os.path.exists(outputset):
        # Output set exists: copy it to a temp file so new pairs are appended
        writer = Writer(outputset + '.tmp')
        reader = Reader(outputset)
        for sentencepair in reader:
            inputs.add(hash(sentencepair.input))
            writer.write(sentencepair)
        tmpfile = True
    else:
        writer = Writer(outputset)

    num = 0
    reader = Reader(inputset)
    quit = False
    for sentencepair in reader:
        if len(sentencepair.input) <= maxwords:
            num += 1
            if hash(sentencepair.input) not in inputs:
                if num >= offset:
                    buffer.append(sentencepair)
                    if len(buffer) == BUFFERSIZE:
                        buffer, quit = processbuffer(buffer, reader, writer, inputs, num - BUFFERSIZE)
                        if quit:
                            break

    # Flush any remaining buffered pairs (unless processbuffer asked to stop)
    if buffer and not quit:
        processbuffer(buffer, reader, writer, inputs, num)

    writer.close()
    if tmpfile:
        # Replace the original output set with the merged version
        os.rename(outputset + '.tmp', outputset)
Beispiel #2
0
def main():
    """Strip annotations (ref, source, category) from a sentence-pair set.

    Command line: inputset outputset
    """
    try:
        inputset = sys.argv[1]
        outputset = sys.argv[2]
    except IndexError:  # missing command-line arguments
        print("Syntax: inputset outputset",file=sys.stderr)
        sys.exit(2)

    reader = Reader(inputset)
    writer = Writer(outputset)
    for sentencepair in reader:
        # Blank out every annotation field, keeping only the core pair
        sentencepair.ref = None
        sentencepair.source = None
        sentencepair.category = None
        writer.write(sentencepair)
    writer.close()
    reader.close()
Beispiel #3
0
def main():
    """Concatenate one or more sentence-pair sets into one output set,
    renumbering all pairs sequentially from 1.

    Command line: outputset inputset [inputset2 ...]
    """
    # The original try/except could not catch a missing input set because
    # sys.argv[2:] never raises -- validate the argument count explicitly.
    if len(sys.argv) < 3:
        print("Syntax: outputset inputset inputset2...", file=sys.stderr)
        sys.exit(2)
    outputset = sys.argv[1]
    inputsets = sys.argv[2:]

    pairid = 0  # running id across all input sets (avoids shadowing builtin id)
    writer = Writer(outputset)
    for inputset in inputsets:
        reader = Reader(inputset)
        for sentencepair in reader:
            pairid += 1
            sentencepair.id = pairid
            writer.write(sentencepair)
        reader.close()
    writer.close()
Beispiel #4
0
def main():
    """Randomly shuffle a sentence-pair set, renumbering pairs from 1.

    Command line: inputset outputset
    """
    try:
        inputset = sys.argv[1]
        outputset = sys.argv[2]
    except IndexError:  # missing command-line arguments
        print("Syntax: inputset outputset",file=sys.stderr)
        sys.exit(2)

    # Load everything into memory so it can be shuffled in place
    reader = Reader(inputset)
    sentencepairs = list(reader)
    reader.close()

    writer = Writer(outputset)
    random.shuffle(sentencepairs)
    for i, sentencepair in enumerate(sentencepairs):
        sentencepair.id = i + 1  # fresh sequential ids after shuffling
        writer.write(sentencepair)
    writer.close()
Beispiel #5
0
def generate(testoutput, ttablefile, gizamodelfile_s2t, gizamodelfile_t2s, patternmodelfile_source, patternmodelfile_target, classfile_source, classfile_target, size =0, joinedprobabilitythreshold = 0.01, divergencefrombestthreshold=0.8,DEBUG = False):
    """Extract sentence-pair instances from the alignment/pattern models and
    write them to *testoutput*.

    When size > 0 the instances are first written to testoutput + '.tmp' and
    a random sample of *size* of them (renumbered from 1) is then written to
    *testoutput*; otherwise every valid instance goes straight to
    *testoutput*.
    """


    if size > 0:
        # Two-stage mode: extract everything to a temp file, sample afterwards
        print("Extracting instances, writing to " + testoutput + '.tmp',file=sys.stderr)
        writer = Writer(testoutput+'.tmp')
    else:
        print("Extracting instances, writing to " + testoutput,file=sys.stderr)
        writer = Writer(testoutput)



    prevsentence = -1
    id = 0  # running instance id; counts every extracted pair, valid or not (shadows builtin id())
    for sourcepattern, targetpattern, sourceoffset, targetoffset, sourcesentence, targetsentence, sentence in extractpairs(ttablefile, gizamodelfile_s2t, gizamodelfile_t2s, patternmodelfile_source, patternmodelfile_target, classfile_source, classfile_target, joinedprobabilitythreshold, divergencefrombestthreshold, DEBUG):
        id += 1
        if sentence != prevsentence:
            # Progress report each time extraction moves to a new input sentence
            print(datetime.datetime.now().strftime('%H:%M:%S'), "Input sentence #" + str(sentence) + " , Output sentence #" + str(id), file=sys.stderr)
            prevsentence = sentence
        valid, sentencepair = makesentencepair(id, sourcepattern, targetpattern, sourceoffset, targetoffset, sourcesentence, targetsentence)
        if valid:
            writer.write(sentencepair)

    writer.close()

    if size > 0:
        print("Sampling " + str(size),file=sys.stderr)
        # NOTE(review): random.sample raises ValueError when size > id
        # (fewer instances extracted than requested) -- confirm intended.
        selected_ids = set(random.sample( range(1,id+1), size ))
        writer = Writer(testoutput)
        reader = Reader(testoutput+'.tmp')
        newid = 0
        for sentencepair in reader:
            if int(sentencepair.id) in selected_ids:
                newid += 1
                sentencepair.id = newid  # renumber the sampled instances consecutively
                writer.write(sentencepair)
        reader.close()
        writer.close()
        # NOTE(review): the intermediate testoutput + '.tmp' file is left on disk.
Beispiel #6
0
def main():
    """Interactive console editor for a sentence-pair set file.

    Command line: set L1 L2 -- loads *set* if it already exists, then runs a
    simple command loop (enter 'h' at the prompt for the command list).
    """
    global sources, categories
    if len(sys.argv) < 4:
        print("Syntax: set L1 L2",file=sys.stderr)
        sys.exit(2)
    setfile= sys.argv[1]
    l1= sys.argv[2]
    l2= sys.argv[3]

    sentencepairs = []
    if os.path.exists(setfile):
        print("Loading existing file: ", setfile)
        reader = Reader(setfile)
        for sentencepair in reader:
            sentencepairs.append(sentencepair)
            # Tally source/category frequencies into the module-level counters
            if sentencepair.source:
                sources[sentencepair.source] += 1
            if sentencepair.category:
                categories[sentencepair.category] += 1
        print(str(len(sentencepairs)) + " sentences loaded")
    else:
        print("New file: ", setfile,file=sys.stderr)

    print("Type h for help")

    # Index of the current sentence pair; None until the first navigation command
    cursor = None

    quit = False
    while not quit:
        cmd = input("> ")
        if cmd.lower() == 'q':
            # Save and quit: write all pairs back to the set file
            writer = Writer(setfile,l1,l2)
            for sentencepair in sentencepairs:
                writer.write(sentencepair)
            writer.close()
            quit = True
        elif cmd.lower() == 'h':
            print("q\tSave and quit",file=sys.stderr)
            print("n\tNew sentence pair",file=sys.stderr)
            #print("d\tDelete sentence pair",file=sys.stderr)
            print("a\tAdd alternative",file=sys.stderr)
            print(">\tNext sentence pair",file=sys.stderr)
            print("<\tPrevious sentence pair",file=sys.stderr)
            print("12\tGo to sentence pair #12", file=sys.stderr)
            print("w\tWrite changes to disk", file=sys.stderr)
        elif cmd.lower() == "<":
            # Move to the previous pair, wrapping around to the last one
            if cursor is None:
                cursor = len(sentencepairs) - 1
            else:
                cursor = cursor - 1
                if cursor < 0:
                    cursor = len(sentencepairs) - 1
            showsentencepair(sentencepairs, cursor)
        elif cmd.lower() == ">":
            # Move to the next pair, wrapping around to the first one
            if cursor is None:
                cursor = 0
            else:
                cursor = cursor + 1
                if cursor >= len(sentencepairs):
                    cursor = 0
            showsentencepair(sentencepairs, cursor)
        elif cmd.lower().isdigit():
            # Jump to pair #N (1-based), clamped to the valid range
            cursor = int(cmd.lower()) - 1
            if cursor < 0:
                cursor = 0
            if cursor >= len(sentencepairs):
                cursor = len(sentencepairs) - 1
        elif cmd.lower() == 'n':
            cursor = newsentencepair(sentencepairs)
        elif cmd.lower() == 'w':
            # Write changes to disk without quitting
            writer = Writer(setfile,l1,l2)
            for sentencepair in sentencepairs:
                writer.write(sentencepair)
            writer.close()
        elif cmd.lower() == 'p':
            # Print the current pair (defaults to the first one)
            if cursor is None:
                cursor = 0
            showsentencepair(sentencepairs, cursor)
        elif cmd.lower() == 'a':
            # Add an alternative translation to the current pair
            if cursor is None:
                cursor = 0
            addalternative(sentencepairs[cursor])
        else:
            print("No such command, type h for help", file=sys.stderr)
Beispiel #7
0
def _retokenise(sentencepair, value, fraglang, ctxlang, copy_alts, label):
    """Return *value* with its fragment and surrounding context re-tokenised.

    The fragment text is tokenised with *fraglang*, the left/right context
    (and any alternatives) with *ctxlang*.  When *copy_alts* is set, the
    fragment's alternatives are re-tokenised and carried over.  *label* is
    the exact stderr prefix for the first diagnostic line.
    """
    result = value
    for left, fragment, right in sentencepair.fragments(value):
        print(label, left, file=sys.stderr)
        print("                      F=", fragment.value, file=sys.stderr)
        print("                      R=", right, file=sys.stderr)
        left = tok(left, ctxlang) if left.strip() else ""
        right = tok(right, ctxlang) if right.strip() else ""
        alts = fragment.alternatives
        # Preserve the fragment id (bugfix: the original output branch
        # dropped it, unlike the ref and input branches)
        fragment = Fragment(tok(fragment.value, fraglang), id=fragment.id)
        if copy_alts:
            for alt in alts:
                # Bugfix: the original output branch called tok(alt) on the
                # Alternative object itself and without a language argument
                fragment.alternatives.append(Alternative(tok(alt.value, ctxlang)))
        if left and right:
            result = left + (fragment,) + right
        elif left:
            result = left + (fragment,)
        elif right:
            result = (fragment,) + right
        else:
            # Bugfix: the original left the result unassigned (NameError or a
            # stale value from a previous pair) when both contexts were empty
            result = (fragment,)
    return result


def main():
    """Re-tokenise the ref, output and input fields of every sentence pair.

    The input fragment is tokenised with l1, everything else with l2.

    Command line: inputset outputset l1 l2
    """
    try:
        inputset = sys.argv[1]
        outputset = sys.argv[2]
        l1 = sys.argv[3]
        l2 = sys.argv[4]
    except IndexError:  # missing command-line arguments
        print("Syntax: inputset outputset l1 l2", file=sys.stderr)
        sys.exit(2)

    writer = Writer(outputset)
    reader = Reader(inputset)
    for sentencepair in reader:
        if sentencepair.ref:
            sentencepair.ref = _retokenise(sentencepair, sentencepair.ref, l2, l2, True, "Tokenising reference: L=")
        if sentencepair.output:
            sentencepair.output = _retokenise(sentencepair, sentencepair.output, l2, l2, True, "Tokenising output:    L=")
        if sentencepair.input:
            # Input fragments keep their alternatives untouched (as original)
            sentencepair.input = _retokenise(sentencepair, sentencepair.input, l1, l2, False, "Tokenising input:     L=")
        writer.write(sentencepair)
    reader.close()
    writer.close()
Beispiel #8
0
def main():
    """Translation assistance via Moses: translate input fragments with Moses,
    rerank the n-best hypotheses with a language model, and write the chosen
    (plus optional alternative) translations to an output set.
    """
    parser = argparse.ArgumentParser(description="Colibrita - Translation Assistance using Moses", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-f','--dataset', type=str,help="Dataset file", action='store',default="",required=True)
    parser.add_argument('--debug','-d', help="Debug", action='store_true', default=False)
    parser.add_argument('-o','--output',type=str,help="Output prefix", required = True)
    parser.add_argument('-T','--ttable', type=str,help="Phrase translation table (file) to use when testing with --lm and without classifier training", action='store',required=True)
    parser.add_argument('--lm',type=str, help="Language model (file in ARPA format, as produced by for instance SRILM)", action='store',required=True)
    parser.add_argument('--lmweight',type=float, help="Language model weight for Moses ", action='store',default=0.5)
    parser.add_argument('--lmorder',type=float, help="Language model order", action='store',default=3)
    parser.add_argument('--dweight',type=float, help="Distortion weight for Moses", action='store',default=0.6)
    parser.add_argument('--tmweights',type=str, help="Translation model weights for Moses (comma separated)", action='store',default="0.20,0.20,0.20,0.20,0.20")
    parser.add_argument('--lmweightrr',type=float, help="Language model weight in reranking", action='store',default=1)
    parser.add_argument('--tweightrr',type=float, help="Translation model weight in reranking", action='store',default=1)
    parser.add_argument('-n','--n',type=int,help="Number of output hypotheses per sentence", default=25)
    parser.add_argument('-a','--a',type=int,help="Add alternative translations, up to the specified numer", default=0)

    args = parser.parse_args()

    #if os.path.exists(args.output):
    #    print("Output already " + args.output + " already exists, doing nothing..",file=sys.stderr)
    #    sys.exit(2)
    #else:
    #    os.mkdir(args.output)

    if not os.path.exists(args.ttable):
        print("Translation table " + args.ttable + " does not exist", file=sys.stderr)
        sys.exit(2)

    if not os.path.exists(args.lm):
        print("Language model " + args.lm + " does not exist", file=sys.stderr)
        sys.exit(2)


    data = Reader(args.dataset)

    # Write a Moses configuration file with the supplied models and weights
    f = open(args.output + '.moses.ini','w',encoding='utf-8')
    f.write("[input-factors]\n0\n\n")
    f.write("[mapping]\n0 T 0\n\n")
    f.write("[ttable-file]\n0 0 0 5 " + args.ttable + "\n\n")
    f.write("[lmodel-file]\n0 0 " + str(args.lmorder) + " " + args.lm + "\n\n")
    f.write("[ttable-limit]\n20\n\n")
    f.write("[weight-d]\n" + str(args.dweight) + "\n\n")
    f.write("[weight-l]\n" + str(args.lmweight) + "\n\n")
    f.write("[weight-t]\n" + "\n".join(args.tmweights.split(',')) + "\n\n")
    f.write("[weight-w]\n-1\n")
    f.write("[distortion-limit]\n6\n")
    f.close()


    if not os.path.exists(args.output + ".nbestlist"):
        # Run Moses once, feeding each input fragment on stdin; Moses writes
        # its n-best list to <output>.nbestlist itself
        cmd = 'moses -f ' + args.output + '.moses.ini -n-best-list ' + args.output + '.nbestlist ' + str(args.n)
        print("Calling moses: " + cmd,file=sys.stderr)
        p = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE,stdin=subprocess.PIPE,stderr=subprocess.PIPE)
        for sentencepair in data:
            for left, sourcefragment, right in sentencepair.inputfragments():
                p.stdin.write( (str(sourcefragment) + "\n").encode('utf-8'))
        # NOTE(review): communicate() already closes stdin and waits for the
        # process, so the explicit close afterwards is redundant -- confirm.
        p.communicate()
        p.stdin.close()

        # Rewind the dataset so it can be iterated again below
        data.reset()
    else:
        print("Moses output already exists, not overwriting. Delete " + args.output + ".nbestlist if you want a fresh run.",file=sys.stderr)


    print("Loading Language model", file=sys.stderr)
    lm = ARPALanguageModel(args.lm)

    print("Processing moses output...",file=sys.stderr)

    # Parse the n-best list: lines are "index ||| hypothesis ||| scores ..."
    # grouped per input index; collect one hypothesis list per input sentence
    previndex = -1
    sentenceoutput = []
    hypotheses = []
    with open(args.output+'.nbestlist','r',encoding='utf-8') as f:
        for line in f:
            fields = [ x.strip() for x in  line.strip().split("|||") ]
            print(fields,file=sys.stderr)
            index = int(fields[0])
            if index != previndex:
                # New input index: flush the hypotheses of the previous one
                if hypotheses:
                    sentenceoutput.append( hypotheses )
                hypotheses = []
            previndex = index
            solution = fields[1]
            rawscores = fields[2].split(' ')
            print(rawscores,file=sys.stderr)
            # NOTE(review): the translation score is taken from position 9 of
            # the raw score vector -- depends on the Moses feature layout.
            tscore = float(rawscores[9])
            hypotheses.append( (solution, tscore) )
        sentenceoutput.append( hypotheses ) #don't forget last one

    writer = Writer(args.output + '.output.xml')
    for i, sentencepair in enumerate(data):
        sentencepair.output = copy(sentencepair.input)
        hypotheses = sentenceoutput[i]
        for left, inputfragment, right in sentencepair.inputfragments():
            candidatesentences = []
            # Track the best (maximum) scores so scores can be normalised
            bestlmscore = -999999999
            besttscore = -999999999
            for hypothesis, tscore in hypotheses:
                #compute new lm score
                outputfragment = Fragment(tuple(hypothesis.split(' ')), inputfragment.id)
                candidatesentence = sentencepair.replacefragment(inputfragment, outputfragment, sentencepair.output)
                lminput = " ".join(sentencepair._str(candidatesentence)).split(" ") #joining and splitting deliberately to ensure each word is one item
                lmscore = lm.score(lminput)
                assert lmscore <= 0
                if lmscore > bestlmscore:
                    bestlmscore = lmscore
                if tscore > besttscore:
                    besttscore = tscore

                candidatesentences.append( ( candidatesentence, hypothesis, tscore, lmscore ) )

            #compute scores
            # Combine translation and LM scores (each relative to the best
            # observed value, weighted by the reranking weights)
            solutions = []
            for candidatesentence, targetpattern, tscore, lmscore in candidatesentences:
                tscore = args.tweightrr * (tscore-besttscore)
                lmscore = args.lmweightrr * (lmscore-bestlmscore)
                score = tscore + lmscore
                print(targetpattern + " --- tscore=" + str(tscore) + ", lmscore=" + str(lmscore),file=sys.stderr)
                solutions.append( (score, targetpattern) )

            # Best (highest combined score) solution first
            solutions = sorted(solutions, key=lambda x: -1 * x[0])

            translation = tuple(solutions[0][1].split())
            outputfragment = Fragment(translation, inputfragment.id)
            print("\t" + str(inputfragment) + " -> " + str(outputfragment), file=sys.stderr)

            if args.a:
                # Attach up to args.a runner-up translations as alternatives
                for score, solution in solutions[1:1+args.a]:
                    outputfragment.alternatives.append( Alternative( tuple(solution.split()), confidence=score) )

            sentencepair.output = sentencepair.replacefragment(inputfragment, outputfragment, sentencepair.output)

            # NOTE(review): the write happens inside the fragment loop, so
            # sentence pairs without input fragments are never written -- confirm.
            writer.write(sentencepair)

            break #only support one iteration for now, one fragment per sentence
    writer.close()


    print("All done.", file=sys.stderr)