# NOTE(review): the statements below are a whitespace-collapsed fragment. The
# loops that enclose the first few statements begin before this point and the
# original indentation is lost, so the nesting cannot be confirmed from here.
#
# What the visible statements do, in textual order:
#   * When fst_count is a multiple of 10, str_fst is minimized; fst_count is
#     incremented (presumably once per FST, outside the `if` -- TODO confirm).
#   * A newline is written to stderr.
#   * ustr_fst is intersected into ustr_fsts, which is appended to
#     all_ustr_fsts; likewise str_fst -> str_fsts -> all_str_fsts.
#   * ustr_model and str_model are each compiled from the regex `base`, then
#     intersected with every entry of all_ustr_fsts / all_str_fsts, printing
#     a zero-based "<i> of <total>" progress line per entry (Python 2 print
#     statement).  minimize() follows each intersect in the text; whether it
#     runs per iteration or once after the loop cannot be told from here.
#   * The two models are written to argv[1] + ".ustr" and argv[1] + ".str"
#     through libhfst output streams of TROPICAL_OPENFST_TYPE.  `out` is
#     rebound for the second stream; no explicit close/flush is visible.
if fst_count % 10 == 0: str_fst.minimize() fst_count += 1 stderr.write("\n") ustr_fsts.intersect(ustr_fst) all_ustr_fsts.append(ustr_fsts) str_fsts.intersect(str_fst) all_str_fsts.append(str_fsts) ustr_model = libhfst.regex(base) str_model = libhfst.regex(base) for i, fst in enumerate(all_ustr_fsts): print "USTR: %u of %u" % (i, len(all_ustr_fsts)) ustr_model.intersect(fst) ustr_model.minimize() for i, fst in enumerate(all_str_fsts): print "STR: %u of %u" % (i, len(all_str_fsts)) str_model.intersect(fst) str_model.minimize() out = libhfst.create_hfst_output_stream(argv[1] + ".ustr", libhfst.TROPICAL_OPENFST_TYPE, 1) out.write(ustr_model) out = libhfst.create_hfst_output_stream(argv[1] + ".str", libhfst.TROPICAL_OPENFST_TYPE, 1) out.write(str_model)
# NOTE(review): whitespace-collapsed fragment.  It opens with the last two
# statements of a function whose header is not visible and ends part-way
# through the main input loop, so only the visible statements are described.
#
# Function tail:
#   * rule_str is center, left_context and center_context joined by single
#     spaces; the function returns the pair (regex(rule_str), rule_str).
#
# Script entry point (`if __name__ == '__main__':`):
#   * Initial state: is_structured = 0, an empty unstructured_model dict,
#     structured_rules and structured_model both compiled from regex('?*'),
#     and an empty seen_struct_feats set.
#   * Outputs: argv[1] + '.ustr' is opened as a plain binary file (oustr);
#     argv[1] + '.str' is opened as an HFST output stream of
#     TROPICAL_OPENFST_TYPE (ostr).  No close of either is visible here.
#   * Each stdin line is stripped and numbered; empty lines are skipped.
#     A line equal to STRUCTID switches to structured mode and one equal to
#     UNSTRUCTID switches back, each with a message on stderr.  Any other
#     line, while in structured mode, is split on single spaces into
#     `fields`; the fragment ends before those fields are used.
#   * NOTE(review): under Python 2, map() would read all of stdin into a
#     list before the loop starts -- confirm the target Python version.
rule_str = ' '.join([center, left_context, center_context]) return regex(rule_str), rule_str if __name__ == '__main__': is_structured = 0 unstructured_model = {} structured_rules = regex('?*') structured_model = regex('?*') oustr = open(argv[1] + '.ustr', 'wb') ostr = create_hfst_output_stream(argv[1] + '.str', TROPICAL_OPENFST_TYPE, 1) seen_struct_feats = set() for i, line in enumerate(map(lambda x: x.strip(), stdin)): if line == '': continue if line == STRUCTID: stderr.write("Structured features.\n") is_structured = 1 elif line == UNSTRUCTID: stderr.write("Unstructured features.\n") is_structured = 0 else: if is_structured: fields = line.split(' ')
# NOTE(review): whitespace-collapsed fragment.  The original indentation is
# lost and the final loop continues past the end of what is visible, so the
# nesting described here is a best reading of the statement order.
#
# Candidate outputs per input symbol:
#   * For every key i of iocounts, the (count, output) pairs of iocounts[i]
#     are sorted and reversed, i.e. visited from highest to lowest count.
#     Each output is appended to outputs[i] and its count added to `tot`;
#     the inner loop stops once an entry with count m >= TH has been added.
#   * NOTE(review): `tot` is accumulated but never read in the visible code,
#     and the cutoff tests the single count `m` rather than the running
#     total -- confirm whether `tot >= TH` was intended.
#   * '#' and '_#_' are each registered as their own only added output
#     (presumably once, after the loop -- TODO confirm).
#
# Models and per-line expressions:
#   * `out` is an HFST output stream of TROPICAL_OPENFST_TYPE created with
#     an empty filename (presumably standard output -- verify against the
#     libhfst docs); ustr_model and str_model are read from argv[2] and
#     argv[3].
#   * For each stripped stdin line a "LINE: <n>\r" progress marker goes to
#     stderr (before the empty-line check); empty lines are skipped.
#   * Every '0' in the line is replaced by '"0"', the line is padded with
#     '_#_ _#_ # ' in front and ' # _#_ _#_' behind, and the result is split
#     on single spaces into `chars`.
#   * For each symbol, `expr` gains: the escaped symbol, a bracketed
#     '|'-separated list of its escaped outputs from `outputs`, then ' £ '.
for i in iocounts: odistr = sorted([(iocounts[i][o], o) for o in iocounts[i]]) odistr.reverse() tot = 0 for m, o in odistr: outputs[i].append(o) tot += m if m >= TH: break outputs['#'].append('#') outputs['_#_'].append('_#_') out = libhfst.create_hfst_output_stream("", libhfst.TROPICAL_OPENFST_TYPE, 1) ustr_model = libhfst.HfstInputStream(argv[2]).read() str_model = libhfst.HfstInputStream(argv[3]).read() for i, line in enumerate(imap(lambda x: x.strip(), stdin)): stderr.write("LINE: %u\r" % i) expr = '' if line == '': continue chars = ('_#_ _#_ # ' + line.replace('0','"0"') + ' # _#_ _#_').split(' ') for char in chars: expr += ('%s [%s] £ ' % (escape(char), '|'.join([escape(c) for c in outputs[char]])))