def input(self, fwords, meta):
    """Collect per-word English translation options from SGML markup.

    Resets ``self.ewords`` to a list holding one empty dict per entry of
    ``fwords``.  For every ``(tag, attrs, i, j)`` item of ``meta`` whose
    attributes contain ``eword``, ``self.ewords[i]`` is replaced by a dict
    mapping each ``|``-separated alternative (stripped, then passed through
    ``sym.fromstring``) to its cost.

    The costs come from the first source that applies:

    * ``cost`` attribute: ``|``-separated floats, used as given;
    * ``prob`` attribute: ``|``-separated probabilities, converted to
      ``-log10(p)``;
    * otherwise: a uniform distribution over the alternatives.

    Args:
        fwords: Source-side words; only the number of entries is used here.
        meta: Iterable of ``(tag, attrs, i, j)`` tuples.  ``attrs`` is
            handed to ``sgml.attrs_to_dict``; ``i`` and ``j`` delimit the
            annotated span (presumably positions into ``fwords`` -- confirm
            against the caller).
    """
    self.ewords = [{} for _ in fwords]

    for _tag, raw_attrs, i, j in meta:
        attrs = sgml.attrs_to_dict(raw_attrs)

        # Use ``in`` instead of ``dict.has_key``: has_key does not exist in
        # Python 3, and the cost/prob tests below already rely on ``in``.
        if 'eword' not in attrs:
            continue

        # Only a warning: the entry is still stored at index ``i`` below.
        if j - i != 1:
            log.write(
                "warning: eword attribute given for multi-word French expression"
            )

        ewords = [sym.fromstring(e.strip()) for e in attrs['eword'].split('|')]

        if 'cost' in attrs:
            costs = [float(x) for x in attrs['cost'].split('|')]
        elif 'prob' in attrs:
            costs = [-math.log10(float(x)) for x in attrs['prob'].split('|')]
        else:
            # Uniform distribution: every alternative gets the same cost.
            uniform_cost = -math.log10(1.0 / len(ewords))
            costs = [uniform_cost] * len(ewords)

        # zip() stops at the shorter sequence, so if the number of costs and
        # words differ, the surplus items are dropped without notice.
        self.ewords[i] = dict(zip(ewords, costs))
def input(self, input):
    """Build span-indexed translation rules from a sentence's SGML markup.

    Resets ``self.rules`` to an empty ``defaultdict(list)``.  For every
    ``(tag, attrs, i, j)`` item of ``input.fmeta`` whose attributes contain
    ``english``, one rule is created per (English phrase, label) pair and
    appended, wrapped in a 1-tuple, to ``self.rules[i, j]``.

    Recognised attributes (all multi-valued ones are ``|``-separated):

    * ``english``: the candidate English phrases (required to trigger).
    * ``cost`` / ``prob``: one value per phrase; ``prob`` is converted to
      ``-log10(p)``.  Without either, a uniform distribution is used.
    * ``features``: one feature name per phrase; or ``feature``: a single
      name shared by all phrases.  Without either, the name is ``'sgml'``.
    * ``label``: left-hand-side labels; defaults to the upper-cased tag.

    Args:
        input: Object providing ``fmeta`` (iterable of
            ``(tag, attrs, i, j)``) and ``fwords`` (sliced as
            ``fwords[i:j]`` for the source side of each rule).

    Raises:
        ValueError: If the number of costs/probabilities, or the number of
            names in ``features``, differs from the number of phrases.
    """
    self.rules = collections.defaultdict(list)

    for tag, raw_attrs, i, j in input.fmeta:
        attrs = sgml.attrs_to_dict(raw_attrs)

        # ``in`` replaces the Python-2-only ``dict.has_key`` throughout.
        if 'english' not in attrs:
            continue

        ephrases = attrs['english'].split('|')
        n_phrases = len(ephrases)

        if 'cost' in attrs:
            costs = [float(x) for x in attrs['cost'].split('|')]
        elif 'prob' in attrs:
            costs = [-math.log10(float(x)) for x in attrs['prob'].split('|')]
        else:
            # Uniform distribution over the phrases.
            uniform_cost = -math.log10(1.0 / n_phrases)
            costs = [uniform_cost] * n_phrases

        if len(costs) != n_phrases:
            sys.stderr.write("wrong number of probabilities/costs")
            raise ValueError("wrong number of probabilities/costs")

        if 'features' in attrs:
            features = attrs['features'].split('|')
            if len(features) != n_phrases:
                sys.stderr.write("wrong number of feature names")
                raise ValueError("wrong number of feature names")
        elif 'feature' in attrs:
            features = [attrs['feature']] * n_phrases
        else:
            features = ['sgml'] * n_phrases

        if 'label' in attrs:
            labels = attrs['label'].split('|')
        else:
            labels = [tag.upper()]

        # bug: if new nonterminals are introduced at this point,
        # they will not participate in the topological sort
        for ephrase, cost, feature in zip(ephrases, costs, features):
            # A separate loop name keeps the outer ``tag`` from being rebound.
            for label in labels:
                r = rule.Rule(
                    sym.fromtag(label),
                    rule.Phrase(input.fwords[i:j]),
                    rule.Phrase([sym.fromstring(e) for e in ephrase.split()]),
                    scores=svector.Vector('%s' % feature, cost),
                )
                self.rules[i, j].append((r,))
def input(self, fwords, meta):
    """Collect per-word English translation options from SGML markup.

    Resets ``self.ewords`` to a list holding one empty dict per entry of
    ``fwords``.  For every ``(tag, attrs, i, j)`` item of ``meta`` whose
    attributes contain ``eword``, ``self.ewords[i]`` is replaced by a dict
    mapping each ``|``-separated alternative (stripped, then passed through
    ``sym.fromstring``) to its cost.

    The costs come from the first source that applies:

    * ``cost`` attribute: ``|``-separated floats, used as given;
    * ``prob`` attribute: ``|``-separated probabilities, converted to
      ``-log10(p)``;
    * otherwise: a uniform distribution over the alternatives.

    Args:
        fwords: Source-side words; only the number of entries is used here.
        meta: Iterable of ``(tag, attrs, i, j)`` tuples.  ``attrs`` is
            handed to ``sgml.attrs_to_dict``; ``i`` and ``j`` delimit the
            annotated span (presumably positions into ``fwords`` -- confirm
            against the caller).
    """
    self.ewords = [{} for _ in fwords]

    for _tag, raw_attrs, i, j in meta:
        attrs = sgml.attrs_to_dict(raw_attrs)

        # Use ``in`` instead of ``dict.has_key``: has_key does not exist in
        # Python 3, and the cost/prob tests below already rely on ``in``.
        if 'eword' not in attrs:
            continue

        # Only a warning: the entry is still stored at index ``i`` below.
        if j - i != 1:
            log.write(
                "warning: eword attribute given for multi-word French expression"
            )

        ewords = [sym.fromstring(e.strip()) for e in attrs['eword'].split('|')]

        if 'cost' in attrs:
            costs = [float(x) for x in attrs['cost'].split('|')]
        elif 'prob' in attrs:
            costs = [-math.log10(float(x)) for x in attrs['prob'].split('|')]
        else:
            # Uniform distribution: every alternative gets the same cost.
            uniform_cost = -math.log10(1.0 / len(ewords))
            costs = [uniform_cost] * len(ewords)

        # zip() stops at the shorter sequence, so if the number of costs and
        # words differ, the surplus items are dropped without notice.
        self.ewords[i] = dict(zip(ewords, costs))
# Driver fragment.  The original line breaks and indentation are missing from
# this view, so the exact nesting of the statements below cannot be confirmed
# here.  Steps, in order of appearance:
#   1. When log.level >= 1, log the path held in opts.config;
#      execfile(opts.config) runs that file as Python code, so the
#      configuration file must come from a trusted source.
#   2. Pick input_file / output_file from args[0] / args[1]; a missing
#      argument or "-" selects sys.stdin / sys.stdout.  Files opened with
#      file() are not closed anywhere in the visible code.
#   3. Call gc.collect(); then, when log.level >= 1, log monitor.memory(),
#      monitor.cpu() and the .name of every object in `models`.
#   4. Read sentences with sgml.read_raw(input_file).  For a sentence whose
#      mark is a "seg" tag, call sent.unmark() and set sent.meta.
#      extract_grammar(sent) is then called (whether for every sentence or
#      only for marked ones depends on the lost indentation).
# NOTE(review): execfile() and file() are Python 2 builtins.
# NOTE(review): dattrs is assigned but never read in the visible code, and
# sent.meta receives the raw `attrs` rather than `dattrs` -- confirm intended.
# NOTE(review): output_file is not used in the visible code; presumably it is
# consumed further down -- confirm.
if log.level >= 1: log.write("Reading configuration from %s\n" % opts.config) execfile(opts.config) if len(args) >= 1 and args[0] != "-": input_file = file(args[0], "r") else: input_file = sys.stdin if len(args) >= 2 and args[1] != "-": output_file = file(args[1], "w") else: output_file = sys.stdout gc.collect() if log.level >= 1: log.write("all structures loaded, memory %s, time %s\n" % (monitor.memory(), monitor.cpu())) log.write("models: %s\n" % (" ".join(str(x.name) for x in models))) sents = sgml.read_raw(input_file) for sent in sents: mark = sent.getmark() if mark is not None: (tag, attrs) = mark if tag == "seg": sent.unmark() dattrs = sgml.attrs_to_dict(attrs) sent.meta = attrs extract_grammar(sent)