Exemple #1
0
def main():
    """Print a human-readable view of every sentence pair in a set file.

    The file name is taken from ``sys.argv[1]``. When it is missing, a usage
    message is written to stderr and the process exits with status 2.

    For each sentence pair the id is printed, followed by whichever of the
    input, reference, output, source and category fields are set. If the
    reference contains a ``Fragment`` with alternatives, those are listed too.
    """
    try:
        filename = sys.argv[1]
    except IndexError:
        print("Please specify a file to view",file=sys.stderr)
        sys.exit(2)

    reader = Reader(filename)
    for sentencepair in reader:
        print("----------- Sentence #" + str(sentencepair.id) + " -----------")
        if sentencepair.input:
            print("Input: ", end="")
            print(sentencepair.inputstr(True,"blue"))
        if sentencepair.ref:
            print("Reference: ", end="")
            print(sentencepair.refstr(True,"green"))
        if sentencepair.output:
            print("Output: ", end="")
            # The original printed refstr() here, which showed the reference
            # under the "Output:" label.
            # NOTE(review): assumes the sentence pair exposes outputstr()
            # alongside inputstr()/refstr() - confirm against the class.
            print(sentencepair.outputstr(True,"yellow"))
        if sentencepair.source:
            print("Source: ", end="")
            print(sentencepair.source)
        if sentencepair.category:
            print("Category: ", end="")
            print(sentencepair.category)

        # Keep the last Fragment found in the reference. The reference may be
        # unset (None), so fall back to an empty sequence instead of crashing.
        fragment = None
        for element in sentencepair.ref or ():
            if isinstance(element, Fragment):
                fragment = element
        if fragment and fragment.alternatives:
            print("Alternatives: ", end="")
            print("; ".join(str(alt) for alt in fragment.alternatives))
        print()
    reader.close()
Exemple #2
0
def main():
    """Copy a set, blanking the ref, source and category of each pair.

    Reads the input set named by ``sys.argv[1]`` and writes every sentence
    pair to the output set named by ``sys.argv[2]`` after setting its
    ``ref``, ``source`` and ``category`` attributes to ``None``.

    When either argument is missing, a usage message is written to stderr
    and the process exits with status 2.
    """
    try:
        inputset = sys.argv[1]
        outputset = sys.argv[2]
    except IndexError:
        # Only a missing argument should trigger the usage message; a bare
        # except would also swallow KeyboardInterrupt and SystemExit.
        print("Syntax: inputset outputset",file=sys.stderr)
        sys.exit(2)

    reader = Reader(inputset)
    writer = Writer(outputset)
    for sentencepair in reader:
        # Strip everything except the input side before writing.
        sentencepair.ref = None
        sentencepair.source = None
        sentencepair.category = None
        writer.write(sentencepair)
    writer.close()
    reader.close()
Exemple #3
0
def main():
    """Concatenate several input sets into one output set, renumbering ids.

    ``sys.argv[1]`` names the output set and ``sys.argv[2:]`` the input sets.
    Sentence pairs are written in the order the input sets are given, and
    each one receives a new consecutive id starting at 1.

    When the output set or all input sets are missing, a usage message is
    written to stderr and the process exits with status 2.
    """
    outputset = sys.argv[1] if len(sys.argv) > 1 else None
    inputsets = sys.argv[2:]
    # Slicing never raises, so the missing-input case has to be checked
    # explicitly; otherwise an empty output set would be written silently.
    if outputset is None or not inputsets:
        print("Syntax: outputset inputset inputset2...", file=sys.stderr)
        sys.exit(2)

    next_id = 0
    writer = Writer(outputset)
    for inputset in inputsets:
        reader = Reader(inputset)
        for sentencepair in reader:
            next_id += 1
            sentencepair.id = next_id
            writer.write(sentencepair)
        reader.close()
    writer.close()
Exemple #4
0
def main():
    """Write a randomly shuffled, renumbered copy of a set.

    Reads all sentence pairs from the input set named by ``sys.argv[1]``
    into memory, shuffles them with ``random.shuffle`` and writes them to
    the output set named by ``sys.argv[2]`` with new ids 1..N in the
    shuffled order.

    When either argument is missing, a usage message is written to stderr
    and the process exits with status 2.
    """
    try:
        inputset = sys.argv[1]
        outputset = sys.argv[2]
    except IndexError:
        print("Syntax: inputset outputset",file=sys.stderr)
        sys.exit(2)

    # The whole set must be held in memory to be shuffled.
    reader = Reader(inputset)
    sentencepairs = list(reader)
    reader.close()

    writer = Writer(outputset)
    random.shuffle(sentencepairs)
    for new_id, sentencepair in enumerate(sentencepairs, start=1):
        sentencepair.id = new_id
        writer.write(sentencepair)
    writer.close()
Exemple #5
0
def generate(testoutput, ttablefile, gizamodelfile_s2t, gizamodelfile_t2s, patternmodelfile_source, patternmodelfile_target, classfile_source, classfile_target, size =0, joinedprobabilitythreshold = 0.01, divergencefrombestthreshold=0.8,DEBUG = False):
    """Extract sentence pairs and write them to *testoutput*, optionally sampled.

    Every tuple yielded by ``extractpairs`` is turned into a sentence pair
    with ``makesentencepair``; pairs reported as valid are written out.

    When ``size`` is 0 (the default) all valid pairs go straight to
    ``testoutput``. When ``size`` is positive, the pairs are first written to
    ``testoutput + '.tmp'``; a random sample of at most ``size`` of them is
    then copied to ``testoutput`` and renumbered from 1. The temporary file is
    left in place.

    Args:
        testoutput: Path of the set to write.
        ttablefile, gizamodelfile_s2t, gizamodelfile_t2s,
        patternmodelfile_source, patternmodelfile_target, classfile_source,
        classfile_target: Passed through unchanged to ``extractpairs``.
        size: Number of pairs to sample; 0 disables sampling.
        joinedprobabilitythreshold: Passed through to ``extractpairs``.
        divergencefrombestthreshold: Passed through to ``extractpairs``.
        DEBUG: Passed through to ``extractpairs``.
    """
    sampling = size > 0
    extraction_target = testoutput + '.tmp' if sampling else testoutput
    print("Extracting instances, writing to " + extraction_target,file=sys.stderr)
    writer = Writer(extraction_target)

    prevsentence = -1
    pair_id = 0
    # Ids are handed out before validity is known, so the written ids may
    # have gaps; remember exactly which ones made it into the file.
    written_ids = []
    for sourcepattern, targetpattern, sourceoffset, targetoffset, sourcesentence, targetsentence, sentence in extractpairs(ttablefile, gizamodelfile_s2t, gizamodelfile_t2s, patternmodelfile_source, patternmodelfile_target, classfile_source, classfile_target, joinedprobabilitythreshold, divergencefrombestthreshold, DEBUG):
        pair_id += 1
        if sentence != prevsentence:
            print(datetime.datetime.now().strftime('%H:%M:%S'), "Input sentence #" + str(sentence) + " , Output sentence #" + str(pair_id), file=sys.stderr)
            prevsentence = sentence
        valid, sentencepair = makesentencepair(pair_id, sourcepattern, targetpattern, sourceoffset, targetoffset, sourcesentence, targetsentence)
        if valid:
            writer.write(sentencepair)
            written_ids.append(pair_id)

    writer.close()

    if sampling:
        print("Sampling " + str(size),file=sys.stderr)
        # Sample only from ids that were actually written, and never ask for
        # more than exist: random.sample raises ValueError otherwise.
        samplesize = min(size, len(written_ids))
        if samplesize < size:
            print("Only " + str(samplesize) + " instances available, keeping all of them",file=sys.stderr)
        selected_ids = set(random.sample(written_ids, samplesize))
        writer = Writer(testoutput)
        reader = Reader(testoutput+'.tmp')
        newid = 0
        for sentencepair in reader:
            if int(sentencepair.id) in selected_ids:
                newid += 1
                sentencepair.id = newid
                writer.write(sentencepair)
        reader.close()
        writer.close()
Exemple #6
0
def _tokenise_side(sentencepair, segments, label, fragment_lang, context_lang, copy_alternatives):
    """Return *segments* with its contexts and fragment re-tokenised.

    For each (left, fragment, right) triple yielded by
    ``sentencepair.fragments(segments)``, the contexts are tokenised with
    ``context_lang`` and the fragment value with ``fragment_lang``; the
    fragment id is preserved. When ``copy_alternatives`` is true, the
    fragment's alternatives are tokenised with ``fragment_lang`` as well.

    ``label`` is the first stderr progress string for this side. If no
    triple is yielded, *segments* is returned unchanged. If several are
    yielded, the result of the last one is returned.
    """
    tokenised = segments
    for left, fragment, right in sentencepair.fragments(segments):
        print(label, left, file=sys.stderr)
        print("                      F=", fragment.value, file=sys.stderr)
        print("                      R=", right, file=sys.stderr)

        # Blank contexts become empty tuples so the concatenation below works
        # in every case, including when both contexts are empty.
        left = tok(left, context_lang) if left.strip() else ()
        right = tok(right, context_lang) if right.strip() else ()

        newfragment = Fragment(tok(fragment.value, fragment_lang), id=fragment.id)
        if copy_alternatives:
            for alt in fragment.alternatives:
                newfragment.alternatives.append(Alternative(tok(alt.value, fragment_lang)))

        tokenised = left + (newfragment,) + right
    return tokenised


def main():
    """Tokenise the reference, output and input of every sentence pair.

    Command-line arguments: ``inputset outputset l1 l2``. When any of them is
    missing, a usage message is written to stderr and the process exits with
    status 2.

    Reference and output are tokenised entirely with ``l2``. For the input,
    the fragment is tokenised with ``l1`` and its surrounding context with
    ``l2``. Alternatives are carried over for reference and output fragments
    only.
    """
    try:
        inputset = sys.argv[1]
        outputset = sys.argv[2]
        l1 = sys.argv[3]
        l2 = sys.argv[4]
    except IndexError:
        print("Syntax: inputset outputset l1 l2", file=sys.stderr)
        sys.exit(2)

    writer = Writer(outputset)
    reader = Reader(inputset)
    for sentencepair in reader:
        if sentencepair.ref:
            sentencepair.ref = _tokenise_side(
                sentencepair, sentencepair.ref, "Tokenising reference: L=", l2, l2, True)

        if sentencepair.output:
            sentencepair.output = _tokenise_side(
                sentencepair, sentencepair.output, "Tokenising output:    L=", l2, l2, True)

        if sentencepair.input:
            # Input fragment is in l1 while its context is in l2.
            sentencepair.input = _tokenise_side(
                sentencepair, sentencepair.input, "Tokenising input:     L=", l1, l2, False)

        writer.write(sentencepair)
    reader.close()
    writer.close()