Example #1
def calculate():
    if log.level >= 1:
        sys.stderr.write("(4) Calculating probabilities\n")

    count = 1
    dropped = 0
    for r in gram.iterkeys():
        ewordsnorm = r.e.handle()
        scores = r.scores
        weight = scores[0]
        if weight <= opts.cutoff:
            dropped += 1
            continue
        try:
            newscores = [
                costfromprob(float(weight) / xsum[r.lhs]),  # p(e,f|x)
                #-math.log10(float(weight)/esum[(r.lhs, ewordsnorm)]), # p(f|e,x)
                costfromprob(float(weight) / esum[ewordsnorm]),  # p(f|e)
                #-math.log10(float(weight)/fsum[(r.lhs, r.f)]),        # p(e|f,x)
                costfromprob(float(weight) / fsum[r.f]),  # p(e|f)
                #-math.log10(float(fsum[r.f])/allsum),          # p(f)
                #-math.log10(float(esum[ewordsnorm])/allsum),    # p(e)
            ]
            # take the weighted average of the remaining fields, using the first field as the weight;
            # fields 2 and 3 are interpreted as probabilities, the rest as costs (this is ugly)
            if len(scores) >= 3:
                newscores.extend([
                    costfromprob(scores[1] / weight),  # lexical weight
                    costfromprob(scores[2] / weight),  # lexical weight
                ])
                # anything else
                newscores.extend([score / weight for score in scores[3:]])
            r.scores = newscores
            output_file.write("%s\n" % r.to_line())
        except (OverflowError, ZeroDivisionError, KeyError):
            sys.stderr.write(
                "warning: division by zero or log of zero: %s, xsum=%s fsum=%s esum=%s allsum=%s\n"
                % (r.to_line(), xsum[r.lhs], fsum[r.f], esum[ewordsnorm],
                   allsum))
        if log.level >= 1 and count % interval == 0:
            sys.stderr.write("time: %f, rules out: %d, dropped: %d\n" %
                             (monitor.cpu(), count, dropped))
        count += 1

    # obsolete
    """for (x,s) in xsum.iteritems():
        if x != PHRASE:
            sys.stderr.write("output PHRASE -> %s\n" % sym.tostring(x))
            # or should it be relative to PHRASE?
            x = sym.setindex(x, 1)
            try:
                r = rule.Rule(PHRASE, rule.Phrase([x]), rule.Phrase([x]), scores=[-math.log10(float(s)/allsum), 0.0, 0.0, 0.0, 0.0])
            except OverflowError:
                sys.stderr.write("warning: overflow error: x=%s, xsum=%s, allsum=%s\n" % (x, s, allsum))
            output_file.write("%s\n" % r.to_line())"""
    log.write("%d dropped total\n" % dropped)
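The costfromprob helper used throughout this example is not shown on this page. Judging from the commented-out -math.log10(...) alternatives above, it presumably just converts a probability into a negative base-10 log cost; a minimal sketch under that assumption:

import math

def costfromprob(p):
    # assumed implementation: negative base-10 log cost, mirroring the
    # commented-out -math.log10(...) lines above; p <= 0 raises an exception
    return -math.log10(p)

Under this reading, costfromprob(0.01) yields a cost of 2.0.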
Example #2
def tabulate():
    if log.level >= 1:
        sys.stderr.write("(3) Tabulating filtered phrases\n")
    count = 1

    inputfiles = []
    for input in inputs:
        if os.path.isdir(input):
            inputfiles.extend(
                os.path.join(input, name) for name in os.listdir(input))
        else:
            inputfiles.append(input)
    inputfiles = [file(inputfile) for inputfile in inputfiles]

    global fsum, esum, allsum, xsum, gram
    fsum = {}  # c(lhs, french)
    esum = {}  # c(lhs, english)
    allsum = 0.0  # c(*)
    xsum = {}  # c(lhs)
    gram = {}

    # read in all rules with matching english sides at the same time.
    # this way, we can sum only those english sides that ever appeared
    # with a french side that passes the filter.

    for rules in read_rule_blocks(inputfiles):
        flag = False
        blocksum = 0.
        for r in rules:
            scores = r.scores
            weight = scores[0]
            allsum += weight
            blocksum += weight
            xsum[r.lhs] = xsum.get(r.lhs, 0.0) + weight
            if ffilter is None or ffilter.match(
                    r.f
            ):  # there used to be a shortcut here -- if fsum.has_key(r.f)
                #fsum[(r.lhs,r.f)] = fsum.get((r.lhs,r.f), 0.0) + weight
                fsum[r.f] = fsum.get(r.f, 0.0) + weight
                if r in gram:
                    gram[r] += r
                else:
                    gram[r] = r
                flag = True
            if log.level >= 1 and count % interval == 0:
                sys.stderr.write(
                    "time: %f, memory: %s, rules in: %d, rules counted: %d\n" %
                    (monitor.cpu(), monitor.memory(), count, len(gram)))

            count += 1
        if flag:
            ewordsnorm = rules[0].e.handle()
            if ewordsnorm in esum:
                sys.stderr.write("warning: files not sorted properly\n")
            esum[ewordsnorm] = blocksum
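read_rule_blocks is not listed here either. The comment above and the "files not sorted properly" warning imply that it yields consecutive runs of rules sharing the same normalized English side, relying on the input files being pre-sorted by that key; the real function presumably also merges several sorted files and builds rule objects rather than raw lines. A standalone sketch of just the grouping idea:

import itertools

def blocks_by_key(sorted_items, key):
    # yield each run of consecutive items sharing the same key as one block;
    # only correct if the input really is grouped by that key, which is what
    # tabulate() checks with its "files not sorted properly" warning
    for _, group in itertools.groupby(sorted_items, key=key):
        yield list(group)

# toy usage: rule lines pre-grouped by their English side (first field)
lines = ["e1 ||| f1 ||| 2", "e1 ||| f2 ||| 1", "e2 ||| f1 ||| 3"]
for block in blocks_by_key(lines, key=lambda l: l.split(" ||| ")[0]):
    print("%d rule(s) for %s" % (len(block), block[0].split(" ||| ")[0]))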
Example #3
def calculate():
    if log.level >= 1:
        sys.stderr.write("(4) Calculating probabilities\n")

    count = 1
    dropped = 0
    for r in gram.iterkeys():
        ewordsnorm = r.e.handle()
        scores = r.scores
        weight = scores[0]
        if weight <= opts.cutoff:
            dropped += 1
            continue
        try:
            newscores = [
                costfromprob(float(weight)/xsum[r.lhs]),             # p(e,f|x)
                #-math.log10(float(weight)/esum[(r.lhs, ewordsnorm)]), # p(f|e,x)
                costfromprob(float(weight)/esum[ewordsnorm]), # p(f|e)
                #-math.log10(float(weight)/fsum[(r.lhs, r.f)]),        # p(e|f,x)
                costfromprob(float(weight)/fsum[r.f]),        # p(e|f)
                #-math.log10(float(fsum[r.f])/allsum),          # p(f)
                #-math.log10(float(esum[ewordsnorm])/allsum),    # p(e)
                ]
            # take the weighted average of the remaining fields, using the first field as the weight;
            # fields 2 and 3 are interpreted as probabilities, the rest as costs (this is ugly)
            if len(scores) >= 3:
                newscores.extend([
                    costfromprob(scores[1]/weight),             # lexical weight
                    costfromprob(scores[2]/weight),             # lexical weight
                ])
                # anything else
                newscores.extend([score/weight for score in scores[3:]])
            r.scores = newscores
            output_file.write("%s\n" % r.to_line())
        except (OverflowError, ZeroDivisionError, KeyError):
            sys.stderr.write("warning: division by zero or log of zero: %s, xsum=%s fsum=%s esum=%s allsum=%s\n" % (r.to_line(), xsum[r.lhs], fsum[r.f], esum[ewordsnorm], allsum))
        if log.level >= 1 and count%interval == 0:
            sys.stderr.write("time: %f, rules out: %d, dropped: %d\n" % (monitor.cpu(), count, dropped))
        count += 1

    # obsolete
    """for (x,s) in xsum.iteritems():
        if x != PHRASE:
            sys.stderr.write("output PHRASE -> %s\n" % sym.tostring(x))
            # or should it be relative to PHRASE?
            x = sym.setindex(x, 1)
            try:
                r = rule.Rule(PHRASE, rule.Phrase([x]), rule.Phrase([x]), scores=[-math.log10(float(s)/allsum), 0.0, 0.0, 0.0, 0.0])
            except OverflowError:
                sys.stderr.write("warning: overflow error: x=%s, xsum=%s, allsum=%s\n" % (x, s, allsum))
            output_file.write("%s\n" % r.to_line())"""
    log.write("%d dropped total\n" % dropped)
Example #4
def tabulate():
    if log.level >= 1:
        sys.stderr.write("(3) Tabulating filtered phrases\n")
    count = 1

    inputfiles = []
    for input in inputs:
        if os.path.isdir(input):
            inputfiles.extend(os.path.join(input, name) for name in os.listdir(input))
        else:
            inputfiles.append(input)
    inputfiles = [file(inputfile) for inputfile in inputfiles]

    global fsum, esum, allsum, xsum, gram
    fsum = {} # c(lhs, french)
    esum = {} # c(lhs, english)
    allsum = 0.0 # c(*)
    xsum = {} # c(lhs)
    gram = {}

    # read in all rules with matching english sides at the same time.
    # this way, we can sum only those english sides that ever appeared
    # with a french side that passes the filter.

    for rules in read_rule_blocks(inputfiles):
        flag = False
        blocksum = 0.
        for r in rules:
            scores = r.scores
            weight = scores[0]
            allsum += weight
            blocksum += weight
            xsum[r.lhs] = xsum.get(r.lhs, 0.0) + weight
            if ffilter is None or ffilter.match(r.f): # there used to be a shortcut here -- if fsum.has_key(r.f)
                #fsum[(r.lhs,r.f)] = fsum.get((r.lhs,r.f), 0.0) + weight
                fsum[r.f] = fsum.get(r.f, 0.0) + weight
                if r in gram:
                    gram[r] += r
                else:
                    gram[r] = r
                flag = True
            if log.level >= 1 and count%interval == 0:
                sys.stderr.write("time: %f, memory: %s, rules in: %d, rules counted: %d\n" % (monitor.cpu(), monitor.memory(), count, len(gram)))

            count += 1
        if flag:
            ewordsnorm = rules[0].e.handle()
            if ewordsnorm in esum:
                sys.stderr.write("warning: files not sorted properly\n")
            esum[ewordsnorm] = blocksum
Example #5
    def split_and_print(self, latticeIter):
        for lat in latticeIter:
            #print "DEBUG: " + str(lat)
            self.linesExamined += 1
            if self.linesExamined % 100 == 0:
                sys.stderr.write("%d sentences, %d skipped, %s per second\n" %
                                 (self.linesExamined, self.linesSkipped,
                                  float(self.linesExamined) / monitor.cpu()))
            edges = []
            initial_start = 0
            next_fake_id = -1  # when splitting words, we add fake ids (which are negative) -- to be fixed later
            for fed in sorted(lat.lines, lambda x, y: cmp(x.span, y.span)):
                # only supports simple lattices right now
                # TODO:  change to support blocks?

                # add basic edge to edges list (ids will change later)
                edges.append(fed)

                sent_initial = fed.span[0] == initial_start
                if fed.span == (0, 1) and fed.label == '<foreign-sentence>':
                    # foreign sentence is handled specially
                    initial_start = 1
                    continue

                fw = fed.label
                remaining_fw = fw

                remaining_fw_start_pos, remaining_fw_end_pos = fed.span

                if opts.add_identity_arcs and make_identity(fw):
                    edges.append(
                        lattice.Edge(span=(remaining_fw_start_pos,
                                           remaining_fw_end_pos),
                                     label=fw,
                                     properties={
                                         'identity': '10^-1',
                                         'target': 'NNP("' + fw + '")'
                                     }))

                if self.morphTable:
                    used = set([])  # to avoid double use of affixes
                    for m in self.morphTable.iter():
                        fanalysis = m.analyze_arabic(remaining_fw,
                                                     sent_initial, used)
                        if fanalysis:
                            if fanalysis.prefixes and len(
                                    fanalysis.prefixes) > 0:
                                sent_initial = False  # only keep the sentence initial flag if there are no prefixes...?
                                for pr in fanalysis.prefixes:
                                    edges.append(
                                        lattice.Edge(
                                            span=(remaining_fw_start_pos,
                                                  next_fake_id),
                                            label=pr,
                                            properties={'s_morph': '10^-1'}))
                                    remaining_fw_start_pos = next_fake_id
                                    next_fake_id -= 1  # fake id's "increase" in negative space

                            if fanalysis.suffixes and len(
                                    fanalysis.suffixes) > 0:
                                fanalysis.suffixes.reverse(
                                )  # we want to walk backward over this list
                                for sf in fanalysis.suffixes:
                                    edges.append(
                                        lattice.Edge(
                                            span=(next_fake_id,
                                                  remaining_fw_end_pos),
                                            label=sf,
                                            properties={'s_morph': '10^-1'}))
                                    remaining_fw_end_pos = next_fake_id
                                    next_fake_id -= 1  # fake id's "increase" in negative space

                            # place the baseword itself there
                            if fanalysis.baseword:
                                edges.append(
                                    lattice.Edge(
                                        span=(remaining_fw_start_pos,
                                              remaining_fw_end_pos),
                                        label=fanalysis.baseword,
                                        properties={'s_morph': '10^-1'}))
                                remaining_fw = fanalysis.baseword
                            else:
                                sys.stderr.write(
                                    "Analysis of word '%s' using morph transform %s is missing a baseword\n"
                                    % (remaining_fw, str(m)))

            self.f_outfile.write(
                str(
                    lattice.Lattice(lines=fix_edges(edges),
                                    properties=lat.properties)) + ";\n")

            #except Error, inst:
            #    print "Error encountered splitting affixes:", inst
            #    self.linesSkipped += 1
            #    if self.f_outfile:
            #        self.f_outfile.write(????)
            #    continue

        sys.stderr.write("%d sentences, %d skipped, %s per second\n" %
                         (self.linesExamined, self.linesSkipped,
                          float(self.linesExamined) / monitor.cpu()))
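fix_edges is not shown; the comments above only say that the temporary negative ids introduced for split points get fixed later. A self-contained toy sketch of one way such a renumbering could work (the real fix_edges operates on lattice.Edge objects and may order the new nodes differently):

def renumber_fake_ids(edges):
    # edges are (start, end, label) triples; negative ids mark the temporary
    # split points. Map each distinct negative id to a fresh id past the
    # largest real (non-negative) position, in order of appearance.
    real = [p for (s, e, _) in edges for p in (s, e) if p >= 0]
    next_id = max(real) + 1 if real else 0
    mapping = {}
    fixed = []
    for (s, e, label) in edges:
        span = []
        for p in (s, e):
            if p < 0 and p not in mapping:
                mapping[p] = next_id
                next_id += 1
            span.append(mapping.get(p, p))
        fixed.append((span[0], span[1], label))
    return fixed

# toy usage: word at (1, 2) split into a prefix edge (1, -1) and a base edge (-1, 2)
print(renumber_fake_ids([(0, 1, 'a'), (1, 2, 'wb'), (1, -1, 'w+'), (-1, 2, 'b')]))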
Example #6
def write(s):
    s = "t=%s wt=%s %s" % (dec(monitor.cpu()), dec(time.time() - tstart), s)
    for l in s.splitlines(True):
        file.write(prefix + l)
    file.flush()
Example #7
    def split_and_print(self, latticeIter):
        for lat in latticeIter:
            #print "DEBUG: " + str(lat)
            self.linesExamined += 1
            if self.linesExamined % 100 == 0:
                sys.stderr.write("%d sentences, %d skipped, %s per second\n" % (self.linesExamined, self.linesSkipped, float(self.linesExamined)/monitor.cpu()))
            edges = []
            initial_start = 0
            next_fake_id = -1  # when splitting words, we add fake ids (which are negative) -- to be fixed later
            for fed in sorted(lat.lines, lambda x, y: cmp(x.span, y.span)):
                # only supports simple lattices right now
                # TODO:  change to support blocks?

                # add basic edge to edges list (ids will change later)
                edges.append(fed)

                sent_initial = fed.span[0] == initial_start
                if fed.span == (0,1) and fed.label == '<foreign-sentence>':
                    # foreign sentence is handled specially
                    initial_start = 1
                    continue
                    
                fw = fed.label
                remaining_fw = fw

                remaining_fw_start_pos, remaining_fw_end_pos = fed.span

                if opts.add_identity_arcs and make_identity(fw):
                    edges.append(lattice.Edge(span=(remaining_fw_start_pos, remaining_fw_end_pos), label=fw, properties={'identity':'10^-1','target':'NNP("' + fw + '")'}))

                if self.morphTable:
                    used = set([]) # to avoid double use of affixes
                    for m in self.morphTable.iter():
                        fanalysis = m.analyze_arabic(remaining_fw, sent_initial, used)
                        if fanalysis:
                            if fanalysis.prefixes and len(fanalysis.prefixes) > 0:
                                sent_initial = False  # only keep the sentence initial flag if there are no prefixes...?
                                for pr in fanalysis.prefixes:
                                    edges.append(lattice.Edge(span=(remaining_fw_start_pos, next_fake_id), label=pr, properties={'s_morph':'10^-1'}))
                                    remaining_fw_start_pos = next_fake_id
                                    next_fake_id -= 1  # fake id's "increase" in negative space
                                
                            if fanalysis.suffixes and len(fanalysis.suffixes) > 0:
                                fanalysis.suffixes.reverse()  # we want to walk backward over this list
                                for sf in fanalysis.suffixes:
                                    edges.append(lattice.Edge(span=(next_fake_id, remaining_fw_end_pos), label=sf, properties={'s_morph':'10^-1'}))
                                    remaining_fw_end_pos = next_fake_id
                                    next_fake_id -= 1  # fake id's "increase" in negative space

                            # place the baseword itself there
                            if fanalysis.baseword:                                
                                edges.append(lattice.Edge(span=(remaining_fw_start_pos, remaining_fw_end_pos), label=fanalysis.baseword, properties={'s_morph':'10^-1'}))
                                remaining_fw = fanalysis.baseword
                            else:
                                sys.stderr.write("Analysis of word '%s' using morph transform %s is missing a baseword\n" % (remaining_fw, str(m)))

            self.f_outfile.write(str(lattice.Lattice(lines=fix_edges(edges),properties=lat.properties)) + ";\n")
            
            #except Error, inst:
            #    print "Error encountered splitting affixes:", inst
            #    self.linesSkipped += 1
            #    if self.f_outfile:
            #        self.f_outfile.write(????)
            #    continue

        sys.stderr.write("%d sentences, %d skipped, %s per second\n" % (self.linesExamined, self.linesSkipped, float(self.linesExamined)/monitor.cpu()))
Example #8
    from forest import get_weights
    weights = get_weights(opts.weights)

    # should have a special "Identity" vector (11...1)
    trim_weights = 1 # svector.Vector("gt_prob=1")
    slim_features = False
    redundant_rules = False
    output_foreign_start = False
    foreign_sentence_tag = "<foreign-sentence>"
    max_edges_per_node = 1000000
    start_sent_id = 1

    # command-line: cat <isi-forest> | ./convert_forest_to_my.py <rules> <f_sent> <bylines> <refs>+
    import monitor
    sys.stderr.write("t=%s start\n" % monitor.cpu())
##    if len(sys.argv) < 2:
##        print >> sys.stderr, "WARNING: no rule files supplied -- output forest" \
##              + "will contain ruleids only"
##        rules = None
##    else:

    
    forestfile = sys.stdin
    srcfile = open(opts.foreign)
    bylinefile = open(opts.byline)
    reffiles = [open(f) for f in args] ## the remaining inputs are assumed to be refs


##    print >> logs, "rules file %s" % rulefile
##    print >> logs, "source file %s" % srcfile
Example #9
def cpu():
	return monitor.cpu()
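Every example on this page leans on a small monitor module for CPU time and memory, but the module itself is not listed. A minimal Unix-only sketch of what it could look like (an assumption; Example #12 appears to treat monitor.memory() as a byte count, so the real helper probably reads /proc or rescales):

# monitor.py -- hypothetical sketch, not the module the examples actually import
import resource

def cpu():
    # total user + system CPU time of this process, in seconds
    usage = resource.getrusage(resource.RUSAGE_SELF)
    return usage.ru_utime + usage.ru_stime

def memory():
    # peak resident set size as reported by getrusage
    # (kilobytes on Linux; the real module may report bytes instead)
    return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss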
Example #10
            dcoststr = "{%s}" % ",".join("%s:%s" % (quotejson(f), v)
                                         for (f, v) in ded.dcost.iteritems())
            edgestrs.append(
                '    {"head": %s, "tails": [%s], "features": %s}\n' %
                (ni, ",".join(tailstrs), dcoststr))

    result.append('  "edges": [\n%s\n  ]\n' % ",\n".join(edgestrs))

    result.append('}')
    return "".join(result)


if __name__ == "__main__":
    import monitor
    import getopt

    weights = None
    opts, args = getopt.getopt(sys.argv[1:], "w:")
    for opt, optarg in opts:
        if opt == "-w":
            weights = svector.Vector(open(optarg).read())

    sys.stderr.write("t=%s start\n" % monitor.cpu())
    for li, line in enumerate(sys.stdin):
        f = forest_from_text(line)
        if weights:
            f.reweight(weights)
        print forest_to_xml(f, mode="english", weights=weights)
        sys.stderr.write("t=%s read line %s\n" % (monitor.cpu(), li))
        sys.stderr.flush()
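The quotejson helper used when emitting the feature dictionary is not shown. Assuming it only needs to turn a feature name into a quoted, escaped JSON string literal, json.dumps would suffice:

import json

def quotejson(s):
    # assumed behaviour: e.g. quotejson('gt_prob') -> '"gt_prob"'
    return json.dumps(s)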
Example #11
def write(s):
    s = "t=%s wt=%s %s" % (dec(monitor.cpu()), dec(time.time()-tstart), s)
    for l in s.splitlines(True):
        file.write(prefix + l)
    file.flush()
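This write helper (also embedded in Example #6) depends on module-level names defined elsewhere in the log module: dec, tstart, prefix, and file. A hypothetical minimal setup, with guessed defaults, that would let the snippet run as-is:

# assumed surrounding context for write() -- the names appear in the source, the values are guesses
import sys
import time

file = sys.stderr      # output stream; the original code also uses the name 'file' for its stream
prefix = ""            # string prepended to every log line
tstart = time.time()   # wall-clock start time used for the wt= field

def dec(x):
    # compact formatting for the t=/wt= timestamps; the real format is unknown
    return "%.2f" % x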
Example #12
def main(argv=None):
	'''Call this from the command-line to create a 
	pre-computed binary data array for later use'''
	if argv is None:
		argv = sys.argv

	parser = optparse.OptionParser(usage="Usage: %prog [-s|-d|-a|-p] <input file> <output file>"+
								"\n\nNote: -d,-s,-a, and -p are mutually exclusive")
	parser.add_option("-d", "--data-array", 
					action="store_true", default=False,
					dest="da", help="Compile file into data array (default)")
	parser.add_option("-s", "--suffix-array", 
					action="store_true", default=False,
					dest="sa", help="Compile file into suffix array")
	parser.add_option("-a", "--alignment", 
					action="store_true", default=False,
					dest="a", help="Compile file into alignment")
	parser.add_option("-l", "--lexical", 
					action="store_true", default=False,
					dest="l", help="Compile file into lex file")
	parser.add_option("-x", "--compute_lexical", action="store", nargs=2,
					dest="lex_args", help="Compute lex file from data",
					metavar="<f file> <e file>")
	parser.add_option("-p", "--parse", 
					action="store_true", default=False,
					dest="p", help="Compile file into parse")
	parser.add_option("-b", "--binary-infile", 
					action="store_true", default=False,
					dest="bin", help="Input file is binary (default: text)")
	parser.add_option("-t", "--text-outfile", 
					action="store_true", default=False,
					dest="text", help="Output file is text (default: binary)")
	parser.add_option("-e", "--enhanced-outfile", 
					action="store_true", default=False,
					dest="enhanced", help="Output file is enhanced text (default: binary)")
	parser.add_option("-r", action="store", nargs=7,
					dest="precomp_args", help="Precompute collocations (Hiero only)", 
					metavar="max-len=<INT> max-nt=<INT> max-size=<INT> min-gap=<INT> rank1=<INT> rank2=<INT> sa=<FILE>")
	(options, args) = parser.parse_args()

	filetype_opts =  [options.da, options.sa, options.a, options.p]

	if (len(filter(lambda x: x, filetype_opts))) > 1 or len(args) != 2:
		parser.print_help()
		sys.exit(1)

	(infilename, outfilename) = args
	if options.bin:
		bin = " binary"
	else:
		bin = ""

	start_time = monitor.cpu()
	if options.precomp_args:
		if options.bin:
			obj = precomputation.Precomputation(infilename, from_binary=True)
		else:
			keys = set(["max-len", "max-nt", "max-size", "min-gap", "rank1", "rank2", "sa"])
			precomp_opts = {} 
			sys.stderr.write("Precomputing statistics for list %s\n" % infilename)
			for pair in options.precomp_args:
				(key, val) = pair.split("=")
				if key in keys:
					keys.remove(key)
					if key != "sa":
						val = int(val)
					precomp_opts[key] = val
				else:
					sys.stderr.write("Unknown keyword arg %s for -r (must be one of: max-len, max-nt, max-size, min-gap, rank1, rank2)\n" % key)
					return 1
			sa = csuf.SuffixArray(precomp_opts["sa"], True)
			obj = precomputation.Precomputation(infilename, sa, 
				precompute_rank=precomp_opts["rank1"], 
				precompute_secondary_rank=precomp_opts["rank2"], 
				max_length=precomp_opts["max-len"], 
				max_nonterminals=precomp_opts["max-nt"], 
				train_max_initial_size=precomp_opts["max-size"], 
				train_min_gap_size=precomp_opts["min-gap"])
	elif options.sa:
		sys.stderr.write("Reading %s as%s suffix array...\n" % (infilename, bin))
		obj = csuf.SuffixArray(infilename, options.bin)
	elif options.a:
		sys.stderr.write("Reading %s as%s alignment array...\n" % (infilename, bin))
		obj = calignment.Alignment(infilename, options.bin)
	elif options.p:
		sys.stderr.write("Reading %s as%s parse array...\n" % (infilename, bin))
		obj = parse.ParseArray(infilename, options.bin)
	elif options.l:
		sys.stderr.write("Reading %s as%s lex array...\n" % (infilename, bin))
		obj = clex.CLex(infilename, options.bin)
	elif options.lex_args:
		ffile = options.lex_args[0]
		efile = options.lex_args[1]
		sys.stderr.write("Computing lex array from:\n A=%s\n F=%s\n E=%s\n" % (infilename, ffile, efile))
		fsarray = csuf.SuffixArray(ffile, True)
		earray = cdat.DataArray(efile, True)
		aarray = calignment.Alignment(infilename, True)
		obj = clex.CLex(aarray, from_data=True, earray=earray, fsarray=fsarray)
	else:
		sys.stderr.write("Reading %s as%s data array...\n" % (infilename, bin))
		obj = cdat.DataArray(infilename, options.bin)

	sys.stderr.write("  Total time for read: %f\n" % (monitor.cpu() - start_time))
	start_time = monitor.cpu()
	if options.text:
		sys.stderr.write("Writing text file %s...\n" % outfilename)
		obj.write_text(outfilename)
	elif options.enhanced:
		sys.stderr.write("Writing enhanced text file %s...\n" % outfilename)
		obj.write_enhanced(outfilename)
	else:
		sys.stderr.write("Writing binary file %s...\n" % outfilename)
		obj.write_binary(outfilename)
	sys.stderr.write("Finished.\n")
	sys.stderr.write("  Total time for write: %f\n" % (monitor.cpu() - start_time))

	mem_use = float(monitor.memory())
	metric = "B"
	if mem_use / 1000 > 1:
		mem_use /= 1000
		metric = "KB"
	if mem_use / 1000 > 1:
		mem_use /= 1000
		metric = "MB"
	if mem_use / 1000 > 1:
		mem_use /= 1000
		metric = "GB"
	sys.stderr.write("  Memory usage: %.1f%s\n" % (mem_use, metric))
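The memory report at the end of main() scales the byte count by repeated division. Purely as an illustration (not part of the original script), the same thresholds can be written as a loop:

def format_memory(n):
    # same logic as the cascade above: move to the next unit while n/1000 > 1,
    # stopping at GB
    for unit in ("B", "KB", "MB"):
        if n / 1000.0 <= 1:
            return "%.1f%s" % (n, unit)
        n /= 1000.0
    return "%.1fGB" % n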
Example #13
        if log.level >= 1:
            log.write("Reading configuration from %s\n" % opts.config)
        execfile(opts.config)

    if len(args) >= 1 and args[0] != "-":
        input_file = file(args[0], "r")
    else:
        input_file = sys.stdin

    if len(args) >= 2 and args[1] != "-":
        output_file = file(args[1], "w")
    else:
        output_file = sys.stdout

    gc.collect()
    if log.level >= 1:
        log.write("all structures loaded, memory %s, time %s\n" % (monitor.memory(), monitor.cpu()))
        log.write("models: %s\n" % (" ".join(str(x.name) for x in models)))

    sents = sgml.read_raw(input_file)
    for sent in sents:
        mark = sent.getmark()
        if mark is not None:
            (tag, attrs) = mark
            if tag == "seg":
                sent.unmark()
                dattrs = sgml.attrs_to_dict(attrs)
                sent.meta = attrs
        extract_grammar(sent)

Example #14
        # if only one chunk, don't validate match.
        for sentnum in matchsents:
            if n_chunks == 1 or self.match1(pattern, self.substrs[sentnum], self.n[sentnum]):
                yield sentnum

if __name__ == "__main__":
    import optparse
    optparser = optparse.OptionParser()
    optparser.add_option('-l', '--maxlen', dest='maxlen', type="int", default=None, help='maximum initial base phrase size')

    (opts,args) = optparser.parse_args()

    maxlen = opts.maxlen

    filterfilename = args[0]
    log.write("t=%s building filter from %s\n" % (monitor.cpu(), filterfilename))

    ffilter = Filter([line.split() for line in file(filterfilename)], maxlen=maxlen)

    log.write("t=%s begin filtering\n" % monitor.cpu())
    progress = 0
    for line in sys.stdin:
        #rule, _ = simplerule.Rule.from_str_hiero(line)
        #frhs = rule.frhs
        frhs = [simplerule.Nonterminal.from_str(f) for f in line.split(" ||| ", 3)[1].split()]
        for si in ffilter.match(frhs):
            print("%s\t%s" % (si,line.rstrip()))
        progress += 1
            
    
Example #15
def cpu():
    return monitor.cpu()