def main():
    """Print every sentence pair of a set file in human-readable form.

    Command line: ``<prog> filename``.

    For each sentence pair yielded by ``Reader(filename)`` the id is printed,
    followed by whichever of input, reference, output, source and category
    are set. If the reference contains a ``Fragment`` with alternatives, the
    alternatives of the last such fragment are listed as well.

    Exits with status 2 when no filename is given.
    """
    try:
        filename = sys.argv[1]
    except IndexError:
        # Only a missing argument is a usage error; anything else should
        # surface normally instead of being swallowed by a bare except.
        print("Please specify a file to view",file=sys.stderr)
        sys.exit(2)

    reader = Reader(filename)
    try:
        for sentencepair in reader:
            print("----------- Sentence #" + str(sentencepair.id) + " -----------")
            if sentencepair.input:
                print("Input: ", end="")
                print(sentencepair.inputstr(True, "blue"))
            if sentencepair.ref:
                print("Reference: ", end="")
                print(sentencepair.refstr(True, "green"))
            if sentencepair.output:
                print("Output: ", end="")
                # The original printed refstr() here, i.e. the reference was
                # shown under the "Output:" label.
                # NOTE(review): assumes SentencePair provides outputstr()
                # alongside inputstr()/refstr() - confirm.
                print(sentencepair.outputstr(True, "yellow"))
            if sentencepair.source:
                print("Source: ", end="")
                print(sentencepair.source)
            if sentencepair.category:
                print("Category: ", end="")
                print(sentencepair.category)

            # Keep the last Fragment found in the reference; a pair without a
            # reference simply has no fragment to report.
            fragment = None
            for element in sentencepair.ref or ():
                if isinstance(element, Fragment):
                    fragment = element
            if fragment and fragment.alternatives:
                print("Alternatives: ", end="")
                print("; ".join([str(alt) for alt in fragment.alternatives]))
            print()
    finally:
        # Release the input file even if printing a pair fails.
        reader.close()
def main():
    """Copy a set while blanking reference, source and category.

    Command line: ``<prog> inputset outputset``.

    Every sentence pair read from ``inputset`` has its ``ref``, ``source``
    and ``category`` attributes set to ``None`` before being written to
    ``outputset``.

    Exits with status 2 when fewer than two arguments are given.
    """
    try:
        inputset = sys.argv[1]
        outputset = sys.argv[2]
    except IndexError:
        # A missing positional argument is the only expected failure here.
        print("Syntax: inputset outputset",file=sys.stderr)
        sys.exit(2)

    reader = Reader(inputset)
    try:
        writer = Writer(outputset)
        try:
            for sentencepair in reader:
                sentencepair.ref = None
                sentencepair.source = None
                sentencepair.category = None
                writer.write(sentencepair)
        finally:
            writer.close()
    finally:
        # Close the handles even when reading or writing fails midway.
        reader.close()
def main():
    """Concatenate several sets into one, renumbering the sentence pairs.

    Command line: ``<prog> outputset inputset [inputset2 ...]``.

    The sentence pairs of each input set are written to ``outputset`` in the
    order the input sets are given; ids are reassigned sequentially starting
    at 1 across all inputs.

    Exits with status 2 when the output set or all input sets are missing.
    """
    args = sys.argv[1:]
    if len(args) < 2:
        # Slicing sys.argv never raises, so the original try/except could not
        # detect a call without input sets and silently produced an empty
        # output; check the argument count explicitly instead.
        print("Syntax: outputset inputset inputset2...", file=sys.stderr)
        sys.exit(2)
    outputset = args[0]
    inputsets = args[1:]

    next_id = 0
    writer = Writer(outputset)
    try:
        for inputset in inputsets:
            reader = Reader(inputset)
            try:
                for sentencepair in reader:
                    next_id += 1
                    sentencepair.id = next_id
                    writer.write(sentencepair)
            finally:
                reader.close()
    finally:
        # Always finish the output, even if one of the inputs fails.
        writer.close()
def main():
    """Write a randomly shuffled, renumbered copy of a set.

    Command line: ``<prog> inputset outputset``.

    All sentence pairs of ``inputset`` are loaded into memory, shuffled with
    ``random.shuffle`` and written to ``outputset`` with ids 1..N in the new
    order.

    Exits with status 2 when fewer than two arguments are given.
    """
    try:
        inputset = sys.argv[1]
        outputset = sys.argv[2]
    except IndexError:
        # A missing positional argument is the only expected failure here.
        print("Syntax: inputset outputset",file=sys.stderr)
        sys.exit(2)

    reader = Reader(inputset)
    try:
        # Shuffling needs the whole set at once.
        sentencepairs = list(reader)
    finally:
        reader.close()

    writer = Writer(outputset)
    try:
        random.shuffle(sentencepairs)
        for new_id, sentencepair in enumerate(sentencepairs, 1):
            sentencepair.id = new_id
            writer.write(sentencepair)
    finally:
        writer.close()
def generate(testoutput, ttablefile, gizamodelfile_s2t, gizamodelfile_t2s, patternmodelfile_source, patternmodelfile_target, classfile_source, classfile_target, size =0, joinedprobabilitythreshold = 0.01, divergencefrombestthreshold=0.8,DEBUG = False):
    """Extract sentence pairs and write them to ``testoutput``.

    Every tuple yielded by ``extractpairs(...)`` is turned into a sentence
    pair with ``makesentencepair``; pairs reported as valid are written out.
    Ids count every extracted tuple, so invalid pairs leave gaps in the
    numbering of the extraction output.

    When ``size`` is greater than 0 the extraction goes to
    ``testoutput + '.tmp'`` first; a random sample of at most ``size`` of the
    written pairs is then copied to ``testoutput`` and renumbered from 1.
    The temporary file is left in place.

    Progress messages are printed to stderr.
    """
    sampling = size > 0
    # With sampling the full extraction is only an intermediate result.
    extractionfile = testoutput + '.tmp' if sampling else testoutput
    print("Extracting instances, writing to " + extractionfile,file=sys.stderr)

    writer = Writer(extractionfile)
    written_ids = []  # ids of pairs that actually ended up in the file
    prevsentence = -1
    pair_id = 0
    try:
        for sourcepattern, targetpattern, sourceoffset, targetoffset, sourcesentence, targetsentence, sentence in extractpairs(ttablefile, gizamodelfile_s2t, gizamodelfile_t2s, patternmodelfile_source, patternmodelfile_target, classfile_source, classfile_target, joinedprobabilitythreshold, divergencefrombestthreshold, DEBUG):
            pair_id += 1
            if sentence != prevsentence:
                print(datetime.datetime.now().strftime('%H:%M:%S'), "Input sentence #" + str(sentence) + " , Output sentence #" + str(pair_id), file=sys.stderr)
                prevsentence = sentence
            valid, sentencepair = makesentencepair(pair_id, sourcepattern, targetpattern, sourceoffset, targetoffset, sourcesentence, targetsentence)
            if valid:
                writer.write(sentencepair)
                if sampling:
                    written_ids.append(pair_id)
    finally:
        writer.close()

    if sampling:
        print("Sampling " + str(size),file=sys.stderr)
        # Sample only from ids that were written: the original sampled from
        # range(1, id+1), which includes ids of invalid pairs (yielding fewer
        # than `size` results) and raised ValueError when size exceeded the
        # number of extracted pairs.
        samplesize = min(size, len(written_ids))
        if samplesize < size:
            print("Only " + str(samplesize) + " instances available, sampling all of them",file=sys.stderr)
        selected_ids = set(random.sample(written_ids, samplesize))

        writer = Writer(testoutput)
        try:
            reader = Reader(testoutput+'.tmp')
            try:
                newid = 0
                for sentencepair in reader:
                    if int(sentencepair.id) in selected_ids:
                        newid += 1
                        sentencepair.id = newid
                        writer.write(sentencepair)
            finally:
                reader.close()
        finally:
            writer.close()
def _retokenise(sentencepair, sequence, label, contextlang, fragmentlang, copyalternatives):
    """Return a tokenised copy of *sequence*, or None if it has no fragment.

    *sequence* is split with ``sentencepair.fragments``; the text left and
    right of the fragment is tokenised with ``tok(..., contextlang)`` and the
    fragment value with ``tok(..., fragmentlang)``. The fragment id is kept.
    When *copyalternatives* is true, the fragment's alternatives are
    tokenised (in *fragmentlang*) and attached to the new fragment.

    If ``fragments`` yields more than once, the result of the last iteration
    wins, as in the original code.
    """
    result = None
    for left, fragment, right in sentencepair.fragments(sequence):
        print("Tokenising " + label + ": L=", left, file=sys.stderr)
        print(" F=", fragment.value, file=sys.stderr)
        print(" R=", right, file=sys.stderr)

        # Blank context becomes an empty tuple so the concatenation below
        # also covers a fragment without any context on either side (the
        # original left its result unassigned in that case).
        left = tok(left, contextlang) if left.strip() else ()
        right = tok(right, contextlang) if right.strip() else ()

        newfragment = Fragment(tok(fragment.value, fragmentlang), id=fragment.id)
        if copyalternatives:
            for alt in fragment.alternatives:
                newfragment.alternatives.append(Alternative(tok(alt.value, fragmentlang)))

        result = left + (newfragment,) + right
    return result


def main():
    """Tokenise the reference, output and input of every sentence pair.

    Command line: ``<prog> inputset outputset l1 l2``.

    For reference and output, context, fragment and alternatives are all
    tokenised with language ``l2``. For the input, the context is tokenised
    with ``l2`` and the fragment with ``l1``; input alternatives are not
    carried over. Each processed pair is written to ``outputset``.

    Exits with status 2 when fewer than four arguments are given.
    """
    try:
        inputset = sys.argv[1]
        outputset = sys.argv[2]
        l1 = sys.argv[3]
        l2 = sys.argv[4]
    except IndexError:
        # A missing positional argument is the only expected failure here.
        print("Syntax: inputset outputset l1 l2", file=sys.stderr)
        sys.exit(2)

    writer = Writer(outputset)
    try:
        reader = Reader(inputset)
        try:
            for sentencepair in reader:
                if sentencepair.ref:
                    ref = _retokenise(sentencepair, sentencepair.ref, "reference", l2, l2, True)
                    if ref is not None:
                        sentencepair.ref = ref
                if sentencepair.output:
                    # Same treatment as the reference: the original dropped
                    # the fragment id here and called tok() on the
                    # Alternative object without a language.
                    out = _retokenise(sentencepair, sentencepair.output, "output", l2, l2, True)
                    if out is not None:
                        sentencepair.output = out
                if sentencepair.input:
                    inp = _retokenise(sentencepair, sentencepair.input, "input", l2, l1, False)
                    if inp is not None:
                        sentencepair.input = inp
                writer.write(sentencepair)
        finally:
            reader.close()
    finally:
        writer.close()