Esempio n. 1
0
def create_wfsa(options):
    """Create an Automaton as requested by ``options`` and dump it.

    Exactly one construction mode is used; they are checked in this order:

    1. ``options.emitfile``: a uniform automaton whose per-letter state
       counts are read from that file with ``read_dict``.
    2. ``options.numstate``: a uniform automaton with ``numstate`` states
       for every letter of the alphabet of the corpus read from stdin
       (plus an ``"EPSILON"`` entry when ``options.num_epsilons`` is set).
    3. ``options.init_from_corpus``: an automaton created from the
       normalized corpus read from that file.

    In every mode the automaton is smoothed first when ``options.smooth``
    is set and is then dumped to ``options.output`` (stdout when no output
    path is given).

    Raises:
        Exception: if initial transitions are supplied together with
            ``options.init_from_corpus``.

    If none of the three modes is selected an error is logged and
    ``sys.exit(-1)`` is called.
    """
    # open output file or write to stdout
    output = open(options.output, "w") if options.output else sys.stdout
    try:
        # read initial transitions if given
        it = options.initial_transitions
        initial_transitions = Automaton.read_transitions(it) if it else {}

        if options.emitfile:
            # uniform automaton with the given number of states per letter
            # and the possibility of predefining some transitions
            with open(options.emitfile) as emit_file:
                numbers_per_letters = read_dict(emit_file)
                automaton = Automaton.create_uniform_automaton(
                    numbers_per_letters,
                    initial_transitions=initial_transitions)
        elif options.numstate:
            corpus = read_corpus(sys.stdin, options.separator)
            alphabet = get_alphabet(corpus)
            numbers_per_letters = dict(
                (letter, options.numstate) for letter in alphabet)
            if options.num_epsilons:
                numbers_per_letters["EPSILON"] = options.num_epsilons
            automaton = Automaton.create_uniform_automaton(
                numbers_per_letters, initial_transitions)
        elif options.init_from_corpus:
            if len(initial_transitions) > 0:
                raise Exception(
                    "Using initial transitions (-I option) when "
                    "creating automaton from corpus is not implemented"
                )
            # The file stays open until the automaton is built, in case the
            # corpus helpers consume their input lazily.
            with open(options.init_from_corpus) as input_:
                corpus = read_corpus(input_, options.separator)
                corpus = normalize_corpus(corpus)
                automaton = Automaton.create_from_corpus(corpus)
        else:
            # fallback
            logging.error(
                "Options are not complete, something is missing to create "
                "an Automaton")
            sys.exit(-1)

        # Smooth before dumping, and only when requested, identically for
        # every construction mode.
        if options.smooth:
            automaton.smooth()
        automaton.dump(output)
    finally:
        # Never close the process-wide stdout, only files opened here.
        if output is not sys.stdout:
            output.close()
Esempio n. 2
0
    def run_uniform_exp(self, quantizer, distance, emissions, state_bits, entropy):
        """Run one uniform-automaton experiment and encode the result.

        The experiment name is built from ``quantizer.levels``,
        ``abs(quantizer.neg_cutoff)``, the literal ``'m'``, ``emissions``
        and ``distance[0]``. If ``<workdir>/learnt_<name>.wfsa`` already
        exists, the automaton is loaded from it; otherwise a uniform
        automaton with one state per letter of the corpus alphabet is
        created, learnt with ``learn_wfsa`` and dumped to that file.

        Args:
            quantizer: assigned to the automaton's ``quantizer`` attribute
                before ``round_and_normalize()`` is called.
            distance: passed to ``learn_wfsa``; its first element is part
                of the experiment name.
            emissions: ``"m"`` selects ``self.morpheme_corpus``, any other
                value selects ``self.unigram_corpus``.
            state_bits: not used by this method.
            entropy: passed to the ``Encoder`` constructor.

        Returns:
            The list ``[exp_name, bits_a, bits_e, bits_t, err, hq, tc]``
            where the last six values come from ``encode_wfsa``.
        """
        exp_name = "{0}-{1}-{2}-{3}-{4}".format(
            quantizer.levels,
            abs(quantizer.neg_cutoff),
            'm',
            emissions,
            distance[0])

        logging.info("Running {0}".format(exp_name))
        learnt_wfsa_filename = "{0}/{1}".format(
            self.workdir, "learnt_{0}.wfsa".format(exp_name))

        if emissions == "m":
            corpus = self.morpheme_corpus
        else:
            corpus = self.unigram_corpus

        # read Automaton or learn it and dump it finally
        if os.path.exists(learnt_wfsa_filename):
            # read already learnt automaton; close the dump file afterwards
            # NOTE(review): assumes create_from_dump reads the whole stream
            # before returning -- confirm against Automaton.
            with open(learnt_wfsa_filename) as dump_file:
                learnt_wfsa = Automaton.create_from_dump(dump_file)
            learnt_wfsa.quantizer = quantizer
            learnt_wfsa.round_and_normalize()
        else:
            # create and learn new automaton
            alphabet = get_alphabet(corpus)
            numbers_per_letters = dict((letter, 1) for letter in alphabet)
            wfsa = Automaton.create_uniform_automaton(numbers_per_letters)
            wfsa.finalize()
            wfsa.quantizer = quantizer
            wfsa.round_and_normalize()

            checkpoint_prefix = "{0}/cp_{1}".format(self.workdir, exp_name)

            def cp(*x):
                # checkpoint callback handed to learn_wfsa
                return checkpoint_dump(wfsa, checkpoint_prefix, *x)

            logging.info('learning starts here')
            learnt_wfsa = learn_wfsa(wfsa, corpus, distance, cp)

            # dump
            with open(learnt_wfsa_filename, "w") as of:
                learnt_wfsa.dump(of)

        # encode automaton
        encoder = Encoder(entropy)
        bits_a, bits_e, bits_t, err, hq, tc = encode_wfsa(
            learnt_wfsa, corpus, encoder)
        return [exp_name, bits_a, bits_e, bits_t, err, hq, tc]