Example #1
    def length_kld(self, mypcfg):
        """
		Compute KLD between the distribution given by the weights and the true distribution.
		"""
        ui = inside.UnaryInside(mypcfg)
        table = ui.compute_inside_smart(len(self.weights))

        kld = 0.0
        total = sum(self.weights)
        for length, w in enumerate(self.weights):
            if w > 0:
                p = w / total  # target probability of this length
                q = table[length, ui.start]  # model probability under the PCFG
                kld += p * math.log(p / q)
        return kld
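The loop above is the standard discrete KL divergence KL(P || Q) = sum_x p(x) log(p(x) / q(x)), with P taken from the (unnormalised) length weights and Q read off the inside table. A minimal standalone sketch, with a plain list standing in for the project-specific inside table:

import math

def discrete_kld(target_weights, model_probs):
    # P comes from unnormalised target weights; model_probs[i] is the model
    # probability of length i. Diverges if the model misses a supported length.
    total = sum(target_weights)
    return sum((w / total) * math.log((w / total) / model_probs[i])
               for i, w in enumerate(target_weights) if w > 0)

# Target mass on lengths 1-3; a nearby model gives a small positive KLD.
print(discrete_kld([0.0, 0.5, 0.3, 0.2], [0.0, 0.45, 0.35, 0.20]))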
Example #2
    def sample(self, ignore_errors=False, viterbi=False):
        """
		return a PCFG that is well behaved.
		This may not generate strings of all lengths.
		"""
        cfg = self.cfgfactory.sample_trim()
        unary = "_unary_"
        # Build a PCFG over a single placeholder terminal: every lexical
        # rule A -> a is collapsed into A -> _unary_.
        unary_pcfg = pcfg.PCFG()
        unary_pcfg.start = cfg.start
        unary_pcfg.terminals.add(unary)
        unary_pcfg.nonterminals = set(cfg.nonterminals)
        productions = set()
        lexical_index = defaultdict(list)
        sorted_prods = sorted(cfg.productions)
        for prod in sorted_prods:
            if len(prod) == 3:
                productions.add(prod)
            else:
                productions.add((prod[0], unary))
                lexical_index[prod[0]].append(prod[1])
        unary_pcfg.productions = list(productions)
        unary_pcfg.parameters = {prod: 1.0 for prod in productions}
        unary_pcfg.normalise()

        self.current = unary_pcfg

        # Compute the best achievable log probability for the target length distribution.
        logging.info("Training with LENGTH_EM_MAX_LENGTH %d ",
                     LENGTH_EM_MAX_LENGTH)
        targetlp = self.length_distribution.ml_lp(LENGTH_EM_MAX_LENGTH)
        valid_target = self.length_distribution.ml_lp_general(
            LENGTH_EM_MAX_LENGTH)
        logging.info("Target LP = %f, %f", targetlp, valid_target)
        ui = inside.UnaryInside(unary_pcfg)

        for i in range(LENGTH_EM_ITERATIONS):

            logging.info("Starting EM iteration %d, target = %f", i, targetlp)
            lp, kld = ui.train_once_smart(self.length_distribution.weights,
                                          LENGTH_EM_MAX_LENGTH)
            logging.info("KLD from target %f", kld)
            if kld < TERMINATION_KLD:
                logging.info("Converged enough.  %f < %f ", kld,
                             TERMINATION_KLD)
                break
        else:
            logging.warning(
                "Reached maximum number of iterations without reaching convergence threshold. Final KLD is %f.",
                kld)

        unary_pcfg.parameters = ui.get_params()
        unary_pcfg.trim_zeros()
        final_pcfg = pcfg.PCFG()
        #print("nonterminals", unary_pcfg.nonterminals)
        ## Nondeterminism hee
        sorted_nonterminals = list(cfg.nonterminals)
        sorted_nonterminals.sort()
        for nt in sorted_nonterminals:

            if (nt, unary) in unary_pcfg.parameters:
                totalp = unary_pcfg.parameters[(nt, unary)]
                k = len(lexical_index[nt])
                probs = self.lexical_distribution.sample(k)
                assert len(probs) == k
                for a, p in zip(lexical_index[nt], probs):
                    prod = (nt, a)

                    final_pcfg.productions.append(prod)
                    final_pcfg.parameters[prod] = p * totalp
        for prod in unary_pcfg.productions:
            if len(prod) == 3:
                final_pcfg.productions.append(prod)
                final_pcfg.parameters[prod] = unary_pcfg.parameters[prod]
        final_pcfg.start = unary_pcfg.start
        final_pcfg.nonterminals = unary_pcfg.nonterminals
        final_pcfg.terminals = cfg.terminals
        final_pcfg.normalise()
        return final_pcfg
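The lexical step above splits the trained mass of each A -> _unary_ rule across that nonterminal's real words, so the length distribution fitted by EM is preserved. Assuming lexical_distribution.sample(k) returns k probabilities summing to one (a Dirichlet draw fits that contract), the step reduces to this sketch; the function and argument names are illustrative, not the project's API:

import numpy

def split_unary_mass(total_prob, words, alpha=1.0):
    # Replace one A -> _unary_ rule of probability total_prob with lexical
    # rules A -> w whose probabilities sum back to total_prob.
    probs = numpy.random.dirichlet([alpha] * len(words))
    return {w: total_prob * p for w, p in zip(words, probs)}

rules = split_unary_mass(0.4, ["cat", "dog", "fish"])
assert abs(sum(rules.values()) - 0.4) < 1e-9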
Example #3
    def sample_smart(self):
        """
		First constriuct unary one sampling from dirichlet.
		Then train.
		Then resample.
		"""
        logging.info("Sampling cfg")
        self.cfgfactory.number_terminals = 1
        mycfg = self.cfgfactory.sample_full()
        logging.info("Sampled unary cfg")
        ## Number of binary productions per left-hand side.
        nb = (self.nonterminals - 1)**2
        parameters = {}
        terminals = list(mycfg.terminals)
        unary_terminal = terminals[0]
        snt = sorted(mycfg.nonterminals)
        prods = sorted(mycfg.productions)
        for a in snt:

            unary_prob = numpy.random.random()
            logging.info("Unary probs %s %f" % (a, unary_prob))
            #print(lprobs.shape)
            #for i,prod in enumerate([ prod for prod in mycfg.productions if len(prod) == 2 and prod[0] == a]):
            parameters[(a, unary_terminal)] = unary_prob

            probs = self.binary_distribution.sample(nb)
            for i, prod in enumerate(
                [prod for prod in prods if len(prod) == 3 and prod[0] == a]):
                parameters[prod] = (1 - unary_prob) * probs[i]

        unary_pcfg = pcfg.PCFG(cfg=mycfg)
        unary_pcfg.parameters = parameters
        ## Trim zero-probability rules: very small concentration parameters
        ## can produce them and cause errors downstream.
        unary_pcfg.trim_zeros()
        unary_pcfg.set_log_parameters()
        logging.info("1,1,1 param %e" %
                     unary_pcfg.parameters[("NT1", "NT1", "NT1")])
        ## Now that we have a unary grammar, train it.
        logging.info("Training with LENGTH_EM_MAX_LENGTH %d ",
                     LENGTH_EM_MAX_LENGTH)
        targetlp = self.length_distribution.ml_lp(LENGTH_EM_MAX_LENGTH)
        valid_target = self.length_distribution.ml_lp_general(
            LENGTH_EM_MAX_LENGTH)
        logging.info("Target LP = %f, %f", targetlp, valid_target)
        ui = inside.UnaryInside(unary_pcfg)
        logging.info("Entropy = %f", unary_pcfg.derivational_entropy())
        for i in range(LENGTH_EM_ITERATIONS):
            logging.info("Starting EM iteration %d, target = %f ", i, targetlp)
            lp, kld = ui.train_once_smart(self.length_distribution.weights,
                                          LENGTH_EM_MAX_LENGTH)
            logging.info("KLD from target %f", kld)
            if kld < TERMINATION_KLD:
                logging.info("Converged enough.  %f < %f ", kld,
                             TERMINATION_KLD)
                break
        else:
            logging.warning(
                "Reached maximum number of iterations without reaching convergence threshold. Final KLD is %f.",
                kld)

        unary_pcfg.parameters = ui.get_params()
        unary_pcfg.trim_zeros()
        ## Now the unary grammar carries the trained parameters.
        logging.info("Entropy = %f", unary_pcfg.derivational_entropy())
        final_pcfg = pcfg.PCFG()
        final_pcfg.nonterminals = unary_pcfg.nonterminals
        final_pcfg.start = unary_pcfg.start
        final_pcfg.terminals = sorted(utility.generate_lexicon(self.terminals))
        logging.info("Terminals from %s to %s" %
                     (final_pcfg.terminals[0], final_pcfg.terminals[-1]))
        for prod, alpha in unary_pcfg.parameters.items():
            if len(prod) == 3:
                if alpha > 0:
                    final_pcfg.productions.append(prod)
                    final_pcfg.parameters[prod] = alpha
            else:
                ## Sample conditional probabilities for the lexical rules.
                lprobs = self.lexical_distribution.sample(self.terminals)

                for i, a in enumerate(final_pcfg.terminals):
                    newprod = (prod[0], a)
                    newalpha = alpha * lprobs[i]
                    if newalpha > 0:
                        final_pcfg.productions.append(newprod)
                        final_pcfg.parameters[newprod] = newalpha
        final_pcfg.set_log_parameters()
        return final_pcfg
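Both samplers defer the length computation to inside.UnaryInside. As it is used here, the table satisfies I[1][A] = p(A -> _unary_) and, for n > 1, I[n][A] = sum over rules A -> B C of p(A -> B C) * sum_{k=1}^{n-1} I[k][B] * I[n-k][C]: the probability that A derives a string of length n. A minimal sketch of that dynamic program (not the project's implementation, which is presumably vectorised):

def length_inside_table(unary_probs, binary_rules, max_length):
    # unary_probs: {A: p(A -> _unary_)}; binary_rules: {(A, B, C): p(A -> B C)}.
    # Returns table with table[n][A] = Pr(A derives a string of length n).
    table = [{} for _ in range(max_length + 1)]
    for a, p in unary_probs.items():
        table[1][a] = p
    for n in range(2, max_length + 1):
        for (a, b, c), p in binary_rules.items():
            s = sum(table[k].get(b, 0.0) * table[n - k].get(c, 0.0)
                    for k in range(1, n))
            table[n][a] = table[n].get(a, 0.0) + p * s
    return table

# Toy grammar S -> S S (0.3) | _unary_ (0.7): P(len 1) = 0.7, P(len 2) = 0.147, ...
t = length_inside_table({"S": 0.7}, {("S", "S", "S"): 0.3}, 5)
print([round(t[n].get("S", 0.0), 5) for n in range(1, 6)])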
Example #4
    def sample_uniform(self):
        """
		Sample uniformly. 
		Then train.
		Then resample.
		"""
        logging.info("Sampling uniform cfg")
        self.cfgfactory.number_terminals = 1
        lp = 1 - (0.5**self.terminals)
        logging.info("Lexical pron for sampling is %e" % lp)
        mycfg = self.cfgfactory.sample_uniform(lp=lp, bp=0.5)
        logging.info("Sampled unary cfg")

        parameters = {}
        terminals = list(mycfg.terminals)
        unary_terminal = terminals[0]
        snt = sorted(mycfg.nonterminals)
        prods = sorted(mycfg.productions)
        # Count the binary productions for each left-hand side.
        nbs = Counter()
        for p in prods:
            if len(p) == 3:
                nbs[p[0]] += 1
        for a in snt:

            unary_prob = numpy.random.random()
            logging.info("Unary probs %s %f" % (a, unary_prob))
            #print(lprobs.shape)
            #for i,prod in enumerate([ prod for prod in mycfg.productions if len(prod) == 2 and prod[0] == a]):
            parameters[(a, unary_terminal)] = unary_prob

            probs = self.binary_distribution.sample(nbs[a])
            for i, prod in enumerate(
                [prod for prod in prods if len(prod) == 3 and prod[0] == a]):
                parameters[prod] = (1 - unary_prob) * probs[i]

        unary_pcfg = pcfg.PCFG(cfg=mycfg)
        unary_pcfg.parameters = parameters
        ## Trim zero-probability rules: very small concentration parameters
        ## can produce them and cause errors downstream.
        unary_pcfg.trim_zeros()
        unary_pcfg.set_log_parameters()

        ## Now that we have a unary grammar, train it.
        logging.info("Training with LENGTH_EM_MAX_LENGTH %d ",
                     LENGTH_EM_MAX_LENGTH)
        targetlp = self.length_distribution.ml_lp(LENGTH_EM_MAX_LENGTH)
        valid_target = self.length_distribution.ml_lp_general(
            LENGTH_EM_MAX_LENGTH)
        logging.info("Target LP = %f, %f", targetlp, valid_target)
        ui = inside.UnaryInside(unary_pcfg)
        logging.info("Entropy = %f", unary_pcfg.derivational_entropy())
        for i in range(LENGTH_EM_ITERATIONS):
            logging.info("Starting EM iteration %d, target = %f ", i, targetlp)
            lp, kld = ui.train_once_smart(self.length_distribution.weights,
                                          LENGTH_EM_MAX_LENGTH)
            logging.info("KLD from target %f", kld)
            if kld < TERMINATION_KLD:
                logging.info("Converged enough.  %f < %f ", kld,
                             TERMINATION_KLD)
                break
        else:
            logging.warning(
                "Reached maximum number of iterations without reaching convergence threshold. Final KLD is %f.",
                kld)

        unary_pcfg.parameters = ui.get_params()
        unary_pcfg.trim_zeros()
        ## Now the unary grammar carries the trained parameters.
        logging.info("Entropy = %f", unary_pcfg.derivational_entropy())
        final_pcfg = pcfg.PCFG()
        final_pcfg.nonterminals = unary_pcfg.nonterminals
        final_pcfg.start = unary_pcfg.start
        final_pcfg.terminals = sorted(utility.generate_lexicon(self.terminals))
        logging.info("Terminals from %s to %s" %
                     (final_pcfg.terminals[0], final_pcfg.terminals[-1]))

        ## Now sample the lexical productions.
        for prod, alpha in unary_pcfg.parameters.items():
            if len(prod) == 3:
                final_pcfg.productions.append(prod)
                final_pcfg.parameters[prod] = alpha
            else:
                ## It's a unary rule: pick one terminal uniformly (so the
                ## lexicon is nonempty), then flip a coin for each of the rest.
                initial = numpy.random.choice(self.terminals)  # random index, since self.terminals is a count
                lps = []
                for i, a in enumerate(final_pcfg.terminals):
                    if numpy.random.random() < 0.5 or i == initial:
                        newprod = (prod[0], a)
                        final_pcfg.productions.append(newprod)
                        lps.append(newprod)
                lprobs = self.lexical_distribution.sample(len(lps))
                for i, newprod in enumerate(lps):
                    final_pcfg.parameters[newprod] = alpha * lprobs[i]
        final_pcfg.set_log_parameters()
        return final_pcfg
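sample_uniform handles the lexicon differently from sample_smart: each nonterminal keeps each terminal with probability 1/2, one forced pick guarantees the set is nonempty, and the kept set is then weighted by a draw from the lexical distribution. A standalone sketch of that selection, again with illustrative names rather than the project's API:

import numpy

def sample_lexicon_subset(terminals, alpha=1.0):
    # Keep each terminal with probability 0.5, guaranteeing at least one,
    # then assign conditional probabilities via a Dirichlet draw.
    forced = numpy.random.choice(len(terminals))  # index of the guaranteed pick
    kept = [t for i, t in enumerate(terminals)
            if i == forced or numpy.random.random() < 0.5]
    probs = numpy.random.dirichlet([alpha] * len(kept))
    return dict(zip(kept, probs))

print(sample_lexicon_subset(["a", "b", "c", "d"]))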
Example #5
                    default="length.pdf")
parser.add_argument("--logscale",
                    help="Y axis in log scale",
                    action="store_true")
## Other options: control output format, what probs are calculated.

args = parser.parse_args()
max_len = args.maxlength + 1
x = np.arange(1, max_len)
ys = []
for f in args.inputfilenames:
    mypcfg = pcfg.load_pcfg_from_file(f)
    print(f)
    upcfg = mypcfg.make_unary()

    insider = inside.UnaryInside(upcfg)
    table = insider.compute_inside_smart(max_len)
    start = insider.start
    y = np.zeros(args.maxlength)

    for length in range(1, max_len):

        p = table[length, start]
        print(length, p)
        y[length - 1] = p
    ys.append(y)

alpha = 1.0 / math.sqrt(len(ys))
for y in ys:
    plt.plot(x, y, 'b', alpha=alpha)
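The fragment ends before the figure is written out. A hedged completion: only the length.pdf default and the --logscale flag are visible above, and args.outputfilename is a hypothetical attribute name for whatever the truncated add_argument call defines:

if args.logscale:
    plt.yscale("log")  # --logscale: show tail probabilities on a log axis
plt.xlabel("length")
plt.ylabel("probability")
plt.savefig(args.outputfilename)  # hypothetical attribute; its add_argument call is truncated above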