import logging
import sys

# Automaton, read_dict, read_corpus, get_alphabet and normalize_corpus are
# assumed to be provided elsewhere in this package.


def create_wfsa(options):
    # open output file or write to stdout
    output = open(options.output, "w") if options.output else sys.stdout

    # read initial transitions if given
    it = options.initial_transitions
    initial_transitions = Automaton.read_transitions(it) if it else {}

    # create a uniform automaton with the given number of states per letter,
    # optionally predefining some transitions
    if options.emitfile:
        numbers_per_letters = read_dict(open(options.emitfile))
        automaton = Automaton.create_uniform_automaton(
            numbers_per_letters, initial_transitions=initial_transitions)
        if options.smooth:
            automaton.smooth()
        automaton.dump(output)
        return

    if options.numstate:
        input_ = sys.stdin
        corpus = read_corpus(input_, options.separator)
        alphabet = get_alphabet(corpus)
        numbers_per_letters = dict([(letter, options.numstate)
                                    for letter in alphabet])
        if options.num_epsilons:
            numbers_per_letters["EPSILON"] = options.num_epsilons
        automaton = Automaton.create_uniform_automaton(numbers_per_letters,
                                                       initial_transitions)
        if options.smooth:
            automaton.smooth()
        automaton.dump(output)
        return

    if options.init_from_corpus:
        if len(initial_transitions) > 0:
            raise Exception("Using initial transitions (-I option) when " +
                            "creating automaton from corpus is not implemented")
        input_ = open(options.init_from_corpus)
        corpus = read_corpus(input_, options.separator)
        corpus = normalize_corpus(corpus)
        automaton = Automaton.create_from_corpus(corpus)
        if options.smooth:
            automaton.smooth()
        automaton.dump(output)
        return

    # fallback: no recognized way to build an automaton was requested
    logging.error("Options are not complete, something is missing to create " +
                  "an Automaton")
    sys.exit(-1)
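
# A minimal sketch of how the `options` object consumed by create_wfsa() could
# be built with argparse. The attribute names mirror those used above; the
# concrete flag spellings (apart from -I, which the error message references)
# are assumptions for illustration, not the project's actual CLI.
def parse_create_wfsa_options(argv=None):
    import argparse
    parser = argparse.ArgumentParser(description="Create a WFSA (sketch)")
    parser.add_argument("-o", "--output", default=None,
                        help="output file; stdout if omitted")
    parser.add_argument("-I", "--initial-transitions",
                        dest="initial_transitions", default=None,
                        help="file with predefined transitions")
    parser.add_argument("-E", "--emitfile", default=None,
                        help="file mapping letters to state counts")
    parser.add_argument("-n", "--numstate", type=int, default=0,
                        help="uniform number of states per letter")
    parser.add_argument("--num-epsilons", dest="num_epsilons", type=int,
                        default=0, help="number of EPSILON states")
    parser.add_argument("-s", "--separator", default=None,
                        help="token separator used when reading the corpus")
    parser.add_argument("--smooth", action="store_true",
                        help="smooth the automaton before dumping")
    parser.add_argument("-c", "--init-from-corpus", dest="init_from_corpus",
                        default=None,
                        help="corpus to initialize the automaton from")
    return parser.parse_args(argv)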
def run_uniform_exp(self, quantizer, distance, emissions, state_bits,
                    entropy):
    exp_name = "{0}-{1}-{2}-{3}-{4}".format(
        quantizer.levels, abs(quantizer.neg_cutoff), 'm', emissions,
        distance[0])
    logging.info("Running {0}".format(exp_name))
    learnt_wfsa_filename = "{0}/{1}".format(
        self.workdir, "learnt_{0}.wfsa".format(exp_name))
    corpus = (self.morpheme_corpus if emissions == "m"
              else self.unigram_corpus)

    # read an already learnt automaton, or learn a new one and dump it
    if os.path.exists(learnt_wfsa_filename):
        # read already learnt automaton
        learnt_wfsa = Automaton.create_from_dump(open(learnt_wfsa_filename))
        learnt_wfsa.quantizer = quantizer
        learnt_wfsa.round_and_normalize()
    else:
        # create and learn new automaton
        alphabet = get_alphabet(corpus)
        numbers_per_letters = dict([(letter, 1) for letter in alphabet])
        wfsa = Automaton.create_uniform_automaton(numbers_per_letters)
        wfsa.finalize()
        wfsa.quantizer = quantizer
        wfsa.round_and_normalize()
        cp = lambda *x: checkpoint_dump(
            wfsa, "{0}/cp_{1}".format(self.workdir, exp_name), *x)
        logging.info('learning starts here')
        learnt_wfsa = learn_wfsa(wfsa, corpus, distance, cp)

        # dump the learnt automaton so later runs can reuse it
        with open(learnt_wfsa_filename, "w") as of:
            learnt_wfsa.dump(of)

    # encode automaton
    encoder = Encoder(entropy)
    bits_a, bits_e, bits_t, err, hq, tc = encode_wfsa(
        learnt_wfsa, corpus, encoder)
    return [exp_name, bits_a, bits_e, bits_t, err, hq, tc]
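
# A hedged sketch of how run_uniform_exp() might be driven by an experiment
# loop. The distance identifiers and the concrete state_bits/entropy values are
# illustrative assumptions, not taken from the project; only the call signature
# and the shape of the returned row come from the method above. Note that
# state_bits is accepted but not used in the method as shown.
def run_uniform_grid_sketch(exp, quantizer, entropy):
    results = []
    for distance in ("kullback", "squarerr"):   # assumed distance names
        for emissions in ("m", "u"):            # "m": morpheme corpus, else unigram
            row = exp.run_uniform_exp(quantizer, distance, emissions,
                                      state_bits=5, entropy=entropy)
            # row == [exp_name, bits_a, bits_e, bits_t, err, hq, tc]
            results.append(row)
    return results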