def calculate():
    if log.level >= 1:
        sys.stderr.write("(4) Calculating probabilities\n")
    count = 1
    dropped = 0
    for r in gram.iterkeys():
        ewordsnorm = r.e.handle()
        scores = r.scores
        weight = scores[0]

        if weight <= opts.cutoff:
            dropped += 1
            continue

        try:
            newscores = [
                costfromprob(float(weight) / xsum[r.lhs]),       # p(e,f|x)
                #-math.log10(float(weight)/esum[(r.lhs, ewordsnorm)]), # p(f|e,x)
                costfromprob(float(weight) / esum[ewordsnorm]),  # p(f|e)
                #-math.log10(float(weight)/fsum[(r.lhs, r.f)]),  # p(e|f,x)
                costfromprob(float(weight) / fsum[r.f]),         # p(e|f)
                #-math.log10(float(fsum[r.f])/allsum),           # p(f)
                #-math.log10(float(esum[ewordsnorm])/allsum),    # p(e)
            ]

            # the rest of the fields we find the weighted average of, using the first field as weight
            # fields 2 and 3 are interpreted as probabilities, the rest as costs. this is ugly
            if len(scores) >= 3:
                newscores.extend([
                    costfromprob(scores[1] / weight),  # lexical weight
                    costfromprob(scores[2] / weight),  # lexical weight
                ])
            # anything else
            newscores.extend([score / weight for score in scores[3:]])

            r.scores = newscores
            output_file.write("%s\n" % r.to_line())
        except (OverflowError, ZeroDivisionError, KeyError):
            sys.stderr.write("warning: division by zero or log of zero: %s, xsum=%s fsum=%s esum=%s allsum=%s\n"
                             % (r.to_line(), xsum[r.lhs], fsum[r.f], esum[ewordsnorm], allsum))

        if log.level >= 1 and count % interval == 0:
            sys.stderr.write("time: %f, rules out: %d, dropped: %d\n"
                             % (monitor.cpu(), count, dropped))
        count += 1

    # obsolete
    """for (x,s) in xsum.iteritems():
        if x != PHRASE:
            sys.stderr.write("output PHRASE -> %s\n" % sym.tostring(x))
            # or should it be relative to PHRASE?
            x = sym.setindex(x, 1)
            try:
                r = rule.Rule(PHRASE, rule.Phrase([x]), rule.Phrase([x]),
                              scores=[-math.log10(float(s)/allsum), 0.0, 0.0, 0.0, 0.0])
            except OverflowError:
                sys.stderr.write("warning: overflow error: x=%s, xsum=%s, allsum=%s\n" % (x, s, allsum))
            output_file.write("%s\n" % r.to_line())"""

    log.write("%d dropped total\n" % dropped)
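# NOTE (illustrative sketch): costfromprob() is used above but not defined in this
# excerpt.  Judging from the commented-out -math.log10(...) expressions next to it,
# it most likely converts a probability into a negative-log10 cost; the definition
# below is an assumption, not the original implementation.
import math

def costfromprob(prob):
    # -log10 cost; prob <= 0 raises an exception (the "log of zero" case
    # mentioned in the warning message above).
    return -math.log10(prob)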
def tabulate():
    if log.level >= 1:
        sys.stderr.write("(3) Tabulating filtered phrases\n")
    count = 1

    inputfiles = []
    for input in inputs:
        if os.path.isdir(input):
            inputfiles.extend(os.path.join(input, name) for name in os.listdir(input))
        else:
            inputfiles.append(input)
    inputfiles = [file(inputfile) for inputfile in inputfiles]

    global fsum, esum, allsum, xsum, gram
    fsum = {}     # c(lhs, french)
    esum = {}     # c(lhs, english)
    allsum = 0.0  # c(*)
    xsum = {}     # c(lhs)
    gram = {}

    # read in all rules with matching english sides at the same time.
    # this way, we can sum only those english sides that ever appeared
    # with a french side that passes the filter.
    for rules in read_rule_blocks(inputfiles):
        flag = False
        blocksum = 0.
        for r in rules:
            scores = r.scores
            weight = scores[0]

            allsum += weight
            blocksum += weight
            xsum[r.lhs] = xsum.get(r.lhs, 0.0) + weight

            if ffilter is None or ffilter.match(r.f):
                # there used to be a shortcut here -- if fsum.has_key(r.f)
                #fsum[(r.lhs,r.f)] = fsum.get((r.lhs,r.f), 0.0) + weight
                fsum[r.f] = fsum.get(r.f, 0.0) + weight
                if r in gram:
                    gram[r] += r
                else:
                    gram[r] = r
                flag = True

            if log.level >= 1 and count % interval == 0:
                sys.stderr.write("time: %f, memory: %s, rules in: %d, rules counted: %d\n"
                                 % (monitor.cpu(), monitor.memory(), count, len(gram)))
            count += 1

        if flag:
            ewordsnorm = rules[0].e.handle()
            if ewordsnorm in esum:
                sys.stderr.write("warning: files not sorted properly\n")
            esum[ewordsnorm] = blocksum
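# NOTE (illustrative sketch, not the original helper): tabulate() relies on
# read_rule_blocks() to yield lists of rules whose English sides are identical,
# which only works if the inputs are sorted by English side (hence the
# "files not sorted properly" warning above).  The grouping logic for a single,
# already-merged stream of rule objects would look roughly like this; the real
# helper presumably also merges several sorted files.
def group_rules_by_english(rules):
    block = []
    for r in rules:
        if block and r.e.handle() != block[0].e.handle():
            yield block
            block = []
        block.append(r)
    if block:
        yield block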
def split_and_print(self, latticeIter):
    for lat in latticeIter:
        #print "DEBUG: " + str(lat)
        self.linesExamined += 1
        if self.linesExamined % 100 == 0:
            sys.stderr.write("%d sentences, %d skipped, %s per second\n"
                             % (self.linesExamined, self.linesSkipped,
                                float(self.linesExamined) / monitor.cpu()))

        edges = []
        initial_start = 0
        next_fake_id = -1  # when splitting words, we add fake ids (which are negative) -- to be fixed later

        for fed in sorted(lat.lines, lambda x, y: cmp(x.span, y.span)):
            # only supports simple lattices right now
            # TODO: change to support blocks?

            # add basic edge to edges list (ids will change later)
            edges.append(fed)

            sent_initial = fed.span[0] == initial_start
            if fed.span == (0, 1) and fed.label == '<foreign-sentence>':
                # foreign sentence is handled specially
                initial_start = 1
                continue

            fw = fed.label
            remaining_fw = fw
            remaining_fw_start_pos, remaining_fw_end_pos = fed.span

            if opts.add_identity_arcs and make_identity(fw):
                edges.append(lattice.Edge(span=(remaining_fw_start_pos, remaining_fw_end_pos),
                                          label=fw,
                                          properties={'identity': '10^-1',
                                                      'target': 'NNP("' + fw + '")'}))

            if self.morphTable:
                used = set([])  # to avoid double use of affixes
                for m in self.morphTable.iter():
                    fanalysis = m.analyze_arabic(remaining_fw, sent_initial, used)
                    if fanalysis:
                        if fanalysis.prefixes and len(fanalysis.prefixes) > 0:
                            sent_initial = False  # only keep the sentence initial flag if there are no prefixes...?
                            for pr in fanalysis.prefixes:
                                edges.append(lattice.Edge(span=(remaining_fw_start_pos, next_fake_id),
                                                          label=pr,
                                                          properties={'s_morph': '10^-1'}))
                                remaining_fw_start_pos = next_fake_id
                                next_fake_id -= 1  # fake id's "increase" in negative space
                        if fanalysis.suffixes and len(fanalysis.suffixes) > 0:
                            fanalysis.suffixes.reverse()  # we want to walk backward over this list
                            for sf in fanalysis.suffixes:
                                edges.append(lattice.Edge(span=(next_fake_id, remaining_fw_end_pos),
                                                          label=sf,
                                                          properties={'s_morph': '10^-1'}))
                                remaining_fw_end_pos = next_fake_id
                                next_fake_id -= 1  # fake id's "increase" in negative space
                        # place the baseword itself there
                        if fanalysis.baseword:
                            edges.append(lattice.Edge(span=(remaining_fw_start_pos, remaining_fw_end_pos),
                                                      label=fanalysis.baseword,
                                                      properties={'s_morph': '10^-1'}))
                            remaining_fw = fanalysis.baseword
                        else:
                            sys.stderr.write("Analysis of word '%s' missing baseword using morph transform %s\n"
                                             % (remaining_fw, str(m)))

        self.f_outfile.write(str(lattice.Lattice(lines=fix_edges(edges),
                                                 properties=lat.properties)) + ";\n")
        #except Error, inst:
        #    print "Error encountered splitting affixes:", inst
        #    self.linesSkipped += 1
        #    if self.f_outfile:
        #        self.f_outfile.write(????)
        #    continue

    sys.stderr.write("%d sentences, %d skipped, %s per second\n"
                     % (self.linesExamined, self.linesSkipped,
                        float(self.linesExamined) / monitor.cpu()))
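# Worked example (token and affixes are purely illustrative): splitting a word that
# occupies span (3, 4) into prefixes ["w+", "Al+"] and baseword "ktAb" keeps the
# original full-word edge at (3, 4) and adds edges that use fake negative node ids,
# which fix_edges() later renumbers into real lattice positions:
#
#   lattice.Edge(span=(3, -1),  label="w+",   properties={'s_morph': '10^-1'})
#   lattice.Edge(span=(-1, -2), label="Al+",  properties={'s_morph': '10^-1'})
#   lattice.Edge(span=(-2, 4),  label="ktAb", properties={'s_morph': '10^-1'})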
def write(s):
    s = "t=%s wt=%s %s" % (dec(monitor.cpu()), dec(time.time() - tstart), s)
    for l in s.splitlines(True):
        file.write(prefix + l)
    file.flush()
from forest import get_weights

weights = get_weights(opts.weights)  # should have a special "Identity" vector (11...1)

trim_weights = 1  # svector.Vector("gt_prob=1")
slim_features = False
redundant_rules = False
output_foreign_start = False
foreign_sentence_tag = "<foreign-sentence>"
max_edges_per_node = 1000000
start_sent_id = 1

# command-line: cat <isi-forest> | ./convert_forest_to_my.py <rules> <f_sent> <bylines> <refs>+

import monitor

sys.stderr.write("t=%s start\n" % monitor.cpu())

## if len(sys.argv) < 2:
##     print >> sys.stderr, "WARNING: no rule files supplied -- output forest" \
##           + "will contain ruleids only"
##     rules = None
## else:

forestfile = sys.stdin
srcfile = open(opts.foreign)
bylinefile = open(opts.byline)
reffiles = [open(f) for f in args]  ## the remaining of the input are assumed to be refs

## print >> logs, "rules file %s" % rulefile
## print >> logs, "source file %s" % srcfile
def cpu():
    return monitor.cpu()
            dcoststr = "{%s}" % ",".join("%s:%s" % (quotejson(f), v)
                                         for (f, v) in ded.dcost.iteritems())
            edgestrs.append(' {"head": %s, "tails": [%s], "features": %s}\n'
                            % (ni, ",".join(tailstrs), dcoststr))

    result.append(' "edges": [\n%s\n ]\n' % ",\n".join(edgestrs))
    result.append('}')
    return "".join(result)


if __name__ == "__main__":
    import monitor
    import getopt

    weights = None
    opts, args = getopt.getopt(sys.argv[1:], "w:")
    for opt, optarg in opts:
        if opt == "-w":
            weights = svector.Vector(open(optarg).read())

    sys.stderr.write("t=%s start\n" % monitor.cpu())
    for li, line in enumerate(sys.stdin):
        f = forest_from_text(line)
        if weights:
            f.reweight(weights)
        print forest_to_xml(f, mode="english", weights=weights)
        sys.stderr.write("t=%s read line %s\n" % (monitor.cpu(), li))
        sys.stderr.flush()
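# For reference, each hyperedge emitted by the edge loop above becomes a JSON object
# of the form (node indices and feature names here are purely illustrative):
#
#   {"head": 7, "tails": [2,5], "features": {"lm1":3.25,"gt_prob":1.1}}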
def main(argv=None):
    '''Call this from the command-line to create a pre-computed binary data array for later use'''
    if argv is None:
        argv = sys.argv

    parser = optparse.OptionParser(usage="Usage: %prog [-s|-d|-a|-p] <input file> <output file>" +
                                   "\n\nNote: -d,-s,-a, and -p are mutually exclusive")
    parser.add_option("-d", "--data-array", action="store_true", default=False,
                      dest="da", help="Compile file into data array (default)")
    parser.add_option("-s", "--suffix-array", action="store_true", default=False,
                      dest="sa", help="Compile file into suffix array")
    parser.add_option("-a", "--alignment", action="store_true", default=False,
                      dest="a", help="Compile file into alignment")
    parser.add_option("-l", "--lexical", action="store_true", default=False,
                      dest="l", help="Compile file into lex file")
    parser.add_option("-x", "--compute_lexical", action="store", nargs=2,
                      dest="lex_args", help="Compute lex file from data",
                      metavar="<f file> <e file>")
    parser.add_option("-p", "--parse", action="store_true", default=False,
                      dest="p", help="Compile file into parse")
    parser.add_option("-b", "--binary-infile", action="store_true", default=False,
                      dest="bin", help="Input file is binary (default: text)")
    parser.add_option("-t", "--text-outfile", action="store_true", default=False,
                      dest="text", help="Output file is text (default: binary)")
    parser.add_option("-e", "--enhanced-outfile", action="store_true", default=False,
                      dest="enhanced", help="Output file is enhanced text (default: binary)")
    parser.add_option("-r", action="store", nargs=7, dest="precomp_args",
                      help="Precompute collocations (Hiero only)",
                      metavar="max-len=<INT> max-nt=<INT> max-size=<INT> min-gap=<INT> rank1=<INT> rank2=<INT> sa=<FILE>")
    (options, args) = parser.parse_args()

    filetype_opts = [options.da, options.sa, options.a, options.p]
    if (len(filter(lambda x: x, filetype_opts))) > 1 or len(args) != 2:
        parser.print_help()
        sys.exit(1)

    (infilename, outfilename) = args
    if options.bin:
        bin = " binary"
    else:
        bin = ""

    start_time = monitor.cpu()
    if options.precomp_args:
        if options.bin:
            obj = precomputation.Precomputation(infilename, from_binary=True)
        else:
            keys = set(["max-len", "max-nt", "max-size", "min-gap", "rank1", "rank2", "sa"])
            precomp_opts = {}
            sys.stderr.write("Precomputing statistics for list %s\n" % infilename)
            for pair in options.precomp_args:
                (key, val) = pair.split("=")
                if key in keys:
                    keys.remove(key)
                    if key != "sa":
                        val = int(val)
                    precomp_opts[key] = val
                else:
                    sys.stderr.write("Unknown keyword arg %s for -r (must be one of: max-len, max-nt, max-size, min-gap, rank1, rank2)\n" % key)
                    return 1
            sa = csuf.SuffixArray(precomp_opts["sa"], True)
            obj = precomputation.Precomputation(infilename, sa,
                                                precompute_rank=precomp_opts["rank1"],
                                                precompute_secondary_rank=precomp_opts["rank2"],
                                                max_length=precomp_opts["max-len"],
                                                max_nonterminals=precomp_opts["max-nt"],
                                                train_max_initial_size=precomp_opts["max-size"],
                                                train_min_gap_size=precomp_opts["min-gap"])
    elif options.sa:
        sys.stderr.write("Reading %s as%s suffix array...\n" % (infilename, bin))
        obj = csuf.SuffixArray(infilename, options.bin)
    elif options.a:
        sys.stderr.write("Reading %s as%s alignment array...\n" % (infilename, bin))
        obj = calignment.Alignment(infilename, options.bin)
    elif options.p:
        sys.stderr.write("Reading %s as%s parse array...\n" % (infilename, bin))
        obj = parse.ParseArray(infilename, options.bin)
    elif options.l:
        sys.stderr.write("Reading %s as%s lex array...\n" % (infilename, bin))
        obj = clex.CLex(infilename, options.bin)
    elif options.lex_args:
        ffile = options.lex_args[0]
        efile = options.lex_args[1]
        sys.stderr.write("Computing lex array from:\n A=%s\n F=%s\n E=%s\n" % (infilename, ffile, efile))
        fsarray = csuf.SuffixArray(ffile, True)
        earray = cdat.DataArray(efile, True)
        aarray = calignment.Alignment(infilename, True)
        obj = clex.CLex(aarray, from_data=True, earray=earray, fsarray=fsarray)
    else:
        sys.stderr.write("Reading %s as%s data array...\n" % (infilename, bin))
        obj = cdat.DataArray(infilename, options.bin)
    sys.stderr.write(" Total time for read: %f\n" % (monitor.cpu() - start_time))

    start_time = monitor.cpu()
    if options.text:
        sys.stderr.write("Writing text file %s...\n" % outfilename)
        obj.write_text(outfilename)
    elif options.enhanced:
        sys.stderr.write("Writing enhanced text file %s...\n" % outfilename)
        obj.write_enhanced(outfilename)
    else:
        sys.stderr.write("Writing binary file %s...\n" % outfilename)
        obj.write_binary(outfilename)
    sys.stderr.write("Finished.\n")
    sys.stderr.write(" Total time for write: %f\n" % (monitor.cpu() - start_time))

    mem_use = float(monitor.memory())
    metric = "B"
    if mem_use / 1000 > 1:
        mem_use /= 1000
        metric = "KB"
    if mem_use / 1000 > 1:
        mem_use /= 1000
        metric = "MB"
    if mem_use / 1000 > 1:
        mem_use /= 1000
        metric = "GB"
    sys.stderr.write(" Memory usage: %.1f%s\n" % (mem_use, metric))
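# Example invocations (the script and file names are hypothetical; the flags are the
# ones defined in main() above):
#
#   compile_bin.py -d corpus.en corpus.en.bin      # compile a text corpus into a data array (default)
#   compile_bin.py -s corpus.fr corpus.fr.sa.bin   # compile a text corpus into a suffix array
#   compile_bin.py -a training.align align.bin     # compile word alignments into an alignment array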
if log.level >= 1: log.write("Reading configuration from %s\n" % opts.config) execfile(opts.config) if len(args) >= 1 and args[0] != "-": input_file = file(args[0], "r") else: input_file = sys.stdin if len(args) >= 2 and args[1] != "-": output_file = file(args[1], "w") else: output_file = sys.stdout gc.collect() if log.level >= 1: log.write("all structures loaded, memory %s, time %s\n" % (monitor.memory(), monitor.cpu())) log.write("models: %s\n" % (" ".join(str(x.name) for x in models))) sents = sgml.read_raw(input_file) for sent in sents: mark = sent.getmark() if mark is not None: (tag, attrs) = mark if tag == "seg": sent.unmark() dattrs = sgml.attrs_to_dict(attrs) sent.meta = attrs extract_grammar(sent)
        # if only one chunk, don't validate match.
        for sentnum in matchsents:
            if n_chunks == 1 or self.match1(pattern, self.substrs[sentnum], self.n[sentnum]):
                yield sentnum


if __name__ == "__main__":
    import optparse
    optparser = optparse.OptionParser()
    optparser.add_option('-l', '--maxlen', dest='maxlen', type="int", default=None,
                         help='maximum initial base phrase size')
    (opts, args) = optparser.parse_args()

    maxlen = opts.maxlen
    filterfilename = args[0]

    log.write("t=%s building filter from %s\n" % (monitor.cpu(), filterfilename))
    ffilter = Filter([line.split() for line in file(filterfilename)], maxlen=maxlen)

    log.write("t=%s begin filtering\n" % monitor.cpu())
    progress = 0
    for line in sys.stdin:
        #rule, _ = simplerule.Rule.from_str_hiero(line)
        #frhs = rule.frhs
        frhs = [simplerule.Nonterminal.from_str(f) for f in line.split(" ||| ", 3)[1].split()]
        for si in ffilter.match(frhs):
            print("%s\t%s" % (si, line.rstrip()))
        progress += 1
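# The rules arriving on stdin are expected to be |||-delimited, e.g. (rule text purely
# illustrative):
#
#   [X] ||| le [X,1] chat ||| the [X,1] cat ||| 0.3 0.4
#
# Field 1 (the French right-hand side) is what gets matched against the filter; each
# matching sentence number si is printed as "si<TAB>rule".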