def main(): wfsa = Automaton.create_from_dump(open(sys.argv[1])) if len(sys.argv) > 2: remaining = float(sys.argv[2]) lang = wfsa.language(remaining) else: lang = wfsa.language() for w in lang: di = wfsa.state_indices["$"] prob = math.exp(lang[w][di]) print "{0} {1}".format("".join(w), prob)
def main(): # read automaton wfsa = Automaton.create_from_dump(open(sys.argv[1])) # read corpus corpus = read_corpus(open(sys.argv[2]), separator=sys.argv[3], skip=[sys.argv[4]]) normalize_corpus(corpus) # call distance_from_corpus distances = {} dist = wfsa.distance_from_corpus(corpus, Automaton.kullback, distances=distances) # print out result for k, v in distances.iteritems(): print k, v
def main(): automaton = Automaton.create_from_dump(open(sys.argv[1])) corpus = read_corpus(open(sys.argv[2])) normalize_corpus(corpus) entropy = float(sys.argv[3]) string_bits = "u" if len(sys.argv) > 4: string_bits = sys.argv[4] q = LogLinQuantizer(10, -20) automaton.quantizer = q encoder = Encoder(entropy, string_bits) print encoder.encode(automaton, corpus)
def run_3state_exp(self, quantizer, distance, harant, emissions, state_bits):
    """Run one 3-state FSA experiment: learn (or reload) the automaton,
    encode it against the matching corpus, and return the result row.

    quantizer: quantizer with .levels and .neg_cutoff used in the name
    distance: distance spec; distance[0] is used in the experiment name
    harant: iterable of morphemes (strings or tuples joined with "@")
    emissions: "m" selects the morpheme corpus/encoder, else unigram
    state_bits: bits per state, set on the encoder
    Returns [exp_name, bits_a, bits_e, bits_t, err, hq, tc].
    """
    aut_name = "{0}-{1}-{2}-{3}-{4}".format(
        quantizer.levels, abs(quantizer.neg_cutoff),
        # isinstance is the idiomatic (and subclass-safe) type test
        "_".join(("@".join(h) if isinstance(h, tuple) else h)
                 for h in harant),
        emissions, distance[0])
    exp_name = "{0}-{1}".format(aut_name, state_bits)
    logging.info("Running {0}".format(exp_name))
    learnt_wfsa_filename = "{0}/{1}".format(
        self.workdir, "learnt_{0}.wfsa".format(aut_name))
    corpus = (self.morpheme_corpus if emissions == "m"
              else self.unigram_corpus)
    # read Automaton or learn it and dump it finally
    if os.path.exists(learnt_wfsa_filename):
        # read already learnt automaton; close the handle when done
        with open(learnt_wfsa_filename) as dump_file:
            learnt_wfsa = Automaton.create_from_dump(dump_file)
        learnt_wfsa.quantizer = quantizer
        learnt_wfsa.round_and_normalize()
    else:
        # create and learn new automaton
        wfsa = create_new_three_state_fsa(self.morpheme_corpus,
                                          harant, emissions)
        wfsa.finalize()
        wfsa.quantizer = quantizer
        wfsa.round_and_normalize()
        # checkpoint callback used by the learner during training
        cp = lambda *x: checkpoint_dump(
            wfsa, "{0}/cp_{1}".format(self.workdir, aut_name), *x)
        learnt_wfsa = learn_wfsa(wfsa, corpus, distance, cp)
        # dump the freshly learnt automaton for reuse by later runs
        with open(learnt_wfsa_filename, "w") as of:
            learnt_wfsa.dump(of)
    # encode automaton
    encoder = (self.morpheme_encoder if emissions == "m"
               else self.unigram_encoder)
    encoder.state_bits = state_bits
    bits_a, bits_e, bits_t, err, hq, tc = encode_wfsa(
        learnt_wfsa, corpus, encoder)
    return [exp_name, bits_a, bits_e, bits_t, err, hq, tc]
def run_uniform_exp(self, quantizer, distance, emissions, state_bits,
                    entropy):
    """Run one uniform-automaton experiment: learn (or reload) a uniform
    WFSA over the corpus alphabet, encode it, and return the result row.

    quantizer: quantizer with .levels and .neg_cutoff used in the name
    distance: distance spec; distance[0] is used in the experiment name
    emissions: "m" selects the morpheme corpus, else unigram
    state_bits: kept in the experiment name (encoder uses `entropy`)
    entropy: entropy passed to the Encoder
    Returns [exp_name, bits_a, bits_e, bits_t, err, hq, tc].
    """
    exp_name = "{0}-{1}-{2}-{3}-{4}".format(
        quantizer.levels, abs(quantizer.neg_cutoff), 'm',
        emissions, distance[0])
    logging.info("Running {0}".format(exp_name))
    learnt_wfsa_filename = "{0}/{1}".format(
        self.workdir, "learnt_{0}.wfsa".format(exp_name))
    corpus = (self.morpheme_corpus if emissions == "m"
              else self.unigram_corpus)
    # read Automaton or learn it and dump it finally
    if os.path.exists(learnt_wfsa_filename):
        # read already learnt automaton; close the handle when done
        with open(learnt_wfsa_filename) as dump_file:
            learnt_wfsa = Automaton.create_from_dump(dump_file)
        learnt_wfsa.quantizer = quantizer
        learnt_wfsa.round_and_normalize()
    else:
        # create and learn new automaton: one state per alphabet letter
        alphabet = get_alphabet(corpus)
        numbers_per_letters = dict((letter, 1) for letter in alphabet)
        wfsa = Automaton.create_uniform_automaton(numbers_per_letters)
        wfsa.finalize()
        wfsa.quantizer = quantizer
        wfsa.round_and_normalize()
        # checkpoint callback used by the learner during training
        cp = lambda *x: checkpoint_dump(
            wfsa, "{0}/cp_{1}".format(self.workdir, exp_name), *x)
        logging.info('learning starts here')
        learnt_wfsa = learn_wfsa(wfsa, corpus, distance, cp)
        # dump the freshly learnt automaton for reuse by later runs
        with open(learnt_wfsa_filename, "w") as of:
            learnt_wfsa.dump(of)
    # encode automaton
    encoder = Encoder(entropy)
    bits_a, bits_e, bits_t, err, hq, tc = encode_wfsa(
        learnt_wfsa, corpus, encoder)
    return [exp_name, bits_a, bits_e, bits_t, err, hq, tc]
def main(options):
    """Learn an automaton from a corpus, driven by parsed CLI options.

    Required: options.automaton_file. Optional: quantizer, corpus
    (defaults to stdin), separator, output (defaults to stdout).
    Raises Exception when no automaton file is given.
    """
    if not options.automaton_file:
        raise Exception("Automaton \"option\" (-a) is mandatory")
    # close the dump/quantizer files once parsed instead of leaking them
    with open(options.automaton_file) as automaton_file:
        automaton = Automaton.create_from_dump(automaton_file)
    if options.quantizer:
        with open(options.quantizer) as quantizer_file:
            automaton.quantizer = AbstractQuantizer.read(quantizer_file)
        automaton.round_and_normalize()
    if options.corpus:
        with open(options.corpus) as input_:
            corpus = read_corpus(input_, options.separator)
    else:
        corpus = read_corpus(sys.stdin, options.separator)
    corpus = normalize_corpus(corpus)
    learner = Learner.create_from_options(automaton, corpus, options)
    learner.main()
    if options.output:
        # with-block guarantees the dump is flushed and closed
        with open(options.output, "w") as output:
            learner.automaton.dump(output)
    else:
        learner.automaton.dump(sys.stdout)
def main():
    """Build distance-cache paths for an automaton against a corpus.

    argv[1]: WFSA dump file
    argv[2]: corpus file, read with "#" as separator
    """
    # close both input files deterministically
    with open(sys.argv[1]) as dump_file:
        automaton = Automaton.create_from_dump(dump_file)
    with open(sys.argv[2]) as corpus_file:
        corpus = read_corpus(corpus_file, "#")
    dc = DistanceCache(automaton, corpus)
    dc.build_paths()