Example #1
def compress_grammar(grammar):
    comp_dict = get_compress_dict(grammar)
    grammar._productions = list(filter(lambda p: p.lhs() not in comp_dict, grammar._productions))
    for ind, prod in enumerate(grammar._productions):
        rhs = [r if r not in comp_dict else comp_dict[r] for r in prod.rhs()]
        new_prod = ProbabilisticProduction(prod.lhs(), rhs, prob=prod.prob())
        grammar._productions[ind] = new_prod
    return grammar
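A note on the fix above: in NLTK, ProbabilisticProduction takes its probability as a constructor keyword and exposes it through the prob() method, so a production has to be rebuilt rather than mutated. A minimal sketch using only public NLTK classes:

from nltk.grammar import Nonterminal, ProbabilisticProduction

NP, Det, N = Nonterminal('NP'), Nonterminal('Det'), Nonterminal('N')
p = ProbabilisticProduction(NP, [Det, N], prob=0.7)
print(p.lhs(), p.rhs(), p.prob())  # NP (Det, N) 0.7
# prob() is a method, not a writable attribute; to change a probability,
# construct a new ProbabilisticProduction with the desired prob=.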
Example #2
def create_duplications(grammar, dup_prob):
    dup_prods = []
    for ind, prod in enumerate(grammar._productions):
        rhs = prod.rhs()
        if len(rhs) == 1 and isinstance(rhs[0], str):
            if prod.prob() != 1.0:
                raise ValueError("Can't handle this currently")
            new_prod = ProbabilisticProduction(prod.lhs(), rhs, prob=1-dup_prob)
            grammar._productions[ind] = new_prod
            dup_prod = ProbabilisticProduction(prod.lhs(), [rhs[0], prod.lhs()], prob=dup_prob)
            dup_prods.append(dup_prod)
    grammar._productions.extend(dup_prods)
    return grammar
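For intuition, this rewrite turns each deterministic lexical rule into a geometric self-duplication. A small sketch of the resulting pair of rules, assuming dup_prob = 0.3 and a toy nonterminal A:

from nltk.grammar import Nonterminal, ProbabilisticProduction

A = Nonterminal('A')
dup_prob = 0.3
rules = [
    ProbabilisticProduction(A, ['a'], prob=1 - dup_prob),  # A -> 'a' [0.7]
    ProbabilisticProduction(A, ['a', A], prob=dup_prob),   # A -> 'a' A [0.3]
]
for r in rules:
    print(r)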
Example #3
    def test_productions(self):
        t = Tree.fromstring("""
                (S
                    (NP (Det el) (Noun gato))
                    (VP (Verb come) (NP (Noun pescado) (Adj crudo)))
                )
            """)

        # Fix with respect to the official test: pass start='S'
        model = UPCFG([t], start='S')

        prods = model.productions()

        prods2 = [
            ProbabilisticProduction(N('S'), [N('NP'), N('VP')], prob=1.0),
            ProbabilisticProduction(N('NP'), [N('Det'), N('Noun')], prob=0.5),
            ProbabilisticProduction(N('Det'), ['Det'], prob=1.0),
            ProbabilisticProduction(N('Noun'), ['Noun'], prob=1.0),
            ProbabilisticProduction(N('VP'), [N('Verb'), N('NP')], prob=1.0),
            ProbabilisticProduction(N('Verb'), ['Verb'], prob=1.0),
            ProbabilisticProduction(N('NP'), [N('Noun'), N('Adj')], prob=0.5),
            ProbabilisticProduction(N('Adj'), ['Adj'], prob=1.0),
        ]

        self.assertEqual(set(prods), set(prods2))
Example #4
def _learning_by_biclustering(G, C, T):
    print("learning...")
    global biclusters
    global ignore_mc_ec
    
    ## find the valid bicluster Bc in T that leads to the maximal posterior gain (Eq.2)
    BC = None
    
    ## first attempt
    attempts = 3
    while BC is None and attempts > 0:
        attempts -= 1
        BC = _get_best_bicluster(T, C)
    
    if BC is None:
        ignore_mc_ec = True
    
        ## second attempt
        attempts = 2
        while BC is None and attempts > 0:
            attempts -= 1
            BC = _get_best_bicluster(T, C)
    
        if BC is None:
            return False, G, C, T, None
        ignore_mc_ec = False
        
    ## create an AND symbol N and two OR symbols A, B
    N = Nonterminal("_AND_"+str(_get_and_symb_index()))
    A = Nonterminal("_OR_"+str(_get_or_symb_index()))
    B = Nonterminal("_OR_"+str(_get_or_symb_index()))
    bc = BC.to_numpy()  # .as_matrix() in older pandas
    s = np.sum(bc)
    row_prob = np.sum(bc, 1)/s
    col_prob = np.sum(bc, 0)/s
    ## create the rules
    rules = []
    rules += [ProbabilisticProduction(A, [_format_nt(BC.index[i])], prob=row_prob[i])
              for i in range(BC.shape[0])]
    rules += [ProbabilisticProduction(B, [_format_nt(BC.columns[j])], prob=col_prob[j])
              for j in range(BC.shape[1])]
    rules += [ProbabilisticProduction(N, [A, B], prob=1.)]
    ## updates
    G_updated = PCFG(G.start(), G.productions() + rules) # add the rules to G
    C_reduced = _reduce_corpus(C, BC, N) # reduce the corpus
    T_updated = _create_t(C_reduced) # update T
    biclusters[(N.symbol(),A.symbol(),B.symbol())] = BC # save BC for the learned group
    return True, G_updated, C_reduced, T_updated, N
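The OR-rule probabilities above are simply the row and column marginals of the bicluster's count matrix. A standalone numpy sketch of that step, with made-up counts:

import numpy as np

bc = np.array([[3, 1],
               [2, 2]])        # toy bicluster: rows are A's choices, columns are B's
s = np.sum(bc)                 # 8
row_prob = np.sum(bc, 1) / s   # [0.5, 0.5]     -> probs of the A -> row-symbol rules
col_prob = np.sum(bc, 0) / s   # [0.625, 0.375] -> probs of the B -> column-symbol rules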
Example #5
def pcfg_generate(grammar):

    def non_terminal_into_terminal(non_terminal):
        nt_productions = grammar.productions(Nonterminal(str(non_terminal)))
        my_dict = dict()
        for pr in nt_productions:
            my_dict[pr.rhs()] = pr.prob()
        nt_productions_probDist = DictionaryProbDist(my_dict)
        generated = nt_productions_probDist.generate()
        return list(generated)

    def nts_into_ts(generated_nts):
        for index in range(len(generated_nts)):
            old_nt = generated_nts[index]
            try:
                t = non_terminal_into_terminal(generated_nts[index])
            except Exception:
                # old_nt is already a terminal; nothing to expand.
                continue
            productions_corpus.append(ProbabilisticProduction(Nonterminal(old_nt), tuple(t), prob=0))
            generated_nts[index] = nts_into_ts(Tree(old_nt, t))
        return generated_nts

    productions = grammar.productions()
    dic = dict()
    for pr in productions:
        dic[pr.rhs()] = pr.prob()
    productions_probDist = DictionaryProbDist(dic)
    generated = productions_probDist.generate()
    productions_corpus.append(ProbabilisticProduction(Nonterminal('S'), generated, prob=0))
    generated = Tree('S', [generated[0], generated[1]])
    return nts_into_ts(generated)
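The sampling above relies on nltk.probability.DictionaryProbDist, which draws a key with probability proportional to its value (productions_corpus is a module-level list the snippet assumes to exist). A minimal sketch of the distribution object on its own:

from nltk.probability import DictionaryProbDist

dist = DictionaryProbDist({('NP', 'VP'): 0.8, ('VP',): 0.2})
rhs = dist.generate()  # returns one of the keys, here a right-hand-side tuple
print(rhs)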
Example #6
    def parse(self, tokens):
        tagged = nltk.pos_tag(tokens)
        missing = False
        for tok, pos in tagged:
            if not self._grammar._lexical_index.get(tok):
                missing = True
                self._grammar._productions.append(
                    ProbabilisticProduction(Nonterminal(pos), [tok],
                                            prob=0.000001))


                # WeightedProduction(Nonterminal(pos), [tok], prob=0.000001))
        if missing:
            self._grammar._calculate_indexes()

        # returns a generator, so call 'next' to get the ProbabilisticTree
        tree = super(PCFGViterbiParser, self).parse(tokens)
        if issubclass(tree.__class__, nltk.tree.Tree):
            print('returning a tree')
            return tree
        elif isinstance(tree, types.GeneratorType):
            try:
                return next(tree)
            except StopIteration:
                tweet = ' '.join(tokens)
                print("Couldn't parse {}".format(tweet))
                return None
        else:
            error("Type of tree is: {}".format(type(tree)))
Example #7
def main(args):

    sentence = args.sentence.lower()
    args.sentence = sentence
    tokens = sentence.split()
    grammar = loadGrammar(args)
    nonterm = getnonterm(grammar)
    terminalProductionRules = getTerminalProbability(args, grammar, nonterm)
    HSrules = grammar.productions(Nonterminal('HS'))
    for rule in HSrules:
        grammar.productions().remove(rule)

    ESrules = grammar.productions(Nonterminal('ES'))
    for rule in ESrules:
        grammar.productions().remove(rule)

    grammar.productions().extend(terminalProductionRules)

    for token in tokens:
        grammar.productions().append(
            ProbabilisticProduction(Nonterminal(token.upper()),
                                    [str(token)],  # unicode(token) in Python 2
                                    prob=1))

    #print "Grammars"
    grammarlist = str(grammar).split('\n')[1:]

    #print "Transfered"
    strgrammar = ''
    for p in grammar.productions():
        rhs = p.rhs()
        rhsstr = ''
        for r in rhs:
            if is_terminal(r):
                rhsstr += '\'' + str(r) + '\' '
            else:
                rhsstr += str(r) + ' '
        strgrammar += str(p.lhs()) + ' -> ' + rhsstr + ' [' + '{0:.8f}'.format(
            p.prob()) + ']\n'
    #print strgrammar

    grammar = PCFG.fromstring(strgrammar.split('\n'))
    #'''
    #grammar = loadGrammar(args)

    #tokens = args.sentence.lower().split()
    #nonterm = getnonterm(grammar)

    CYK(tokens, nonterm, grammar)
    #with open(args.grammar_file, 'r') as f:
    #        content = f.read()

    #trees = corpus2trees(content)
    #productions = trees2productions(trees)
    #listnonterm = []
    #grammar = nltk.grammar.induce_pcfg(nltk.grammar.Nonterminal('SS'), productions)
    #print grammar

    #'''
Example #8
def pcfg_bcl(C, alpha=ALPHA, gd_thr=LPG_DIFF_THRESHOLD, mc_thr=MC_THRESHOLD):
    print("\ninitializing...")
    global ALPHA
    global LPG_DIFF_THRESHOLD
    global MC_THRESHOLD
    global and_symb_count
    global or_symb_count
    global ignore_mc_ec
    ALPHA = alpha
    LPG_DIFF_THRESHOLD = gd_thr
    MC_THRESHOLD = mc_thr
    and_symb_count = 0
    or_symb_count = 0
    ignore_mc_ec = False
    
    ## create an empty grammar G
    S = Nonterminal("_START_")
    R = [ProbabilisticProduction(S, [""], prob=1.)]
    G = PCFG(S, R)
    
    T = _create_t(C) # create a table T
    
    ## repeat until no further rule to be learned
    i = 0
    while not _finished(T):
        i += 1
        print("\niter. n° %d" % (i,))
        found, G, C, T, N = _learning_by_biclustering(G, C, T)
        if not found:
            print("NO MORE RULES CAN BE LEARNED")
            break
        G, C, T = _attaching(N, G, C, T)
    G = _postprocessing(G, C)
    print("\n", G) # DEBUG
    return G
Example #9
def get_productions(productions):

    probabilities = dict()
    productions_to_return = list(set(productions))

    for prod in productions:
        if str(prod) in probabilities:
            probabilities[str(prod)] += 1
        else:
            probabilities[str(prod)] = 1

    amount_of_interior_nodes = len([prod.lhs() for prod in productions if prod.lhs() != Nonterminal('S')])

    lhs_of_prods = set([prod.lhs() for prod in productions])

    print('number of interior nodes: {}'.format(amount_of_interior_nodes))

    for lhs in lhs_of_prods:
        number_of_occurrences = 0
        for prob in probabilities:
            if prob.startswith(str(lhs) + " "):
                number_of_occurrences += probabilities[prob]
        for prob in probabilities:
            if prob.startswith(str(lhs) + " "):
                probabilities[prob] = probabilities[prob] / number_of_occurrences

    for index in range(len(productions_to_return)):
        prod = productions_to_return[index]
        productions_to_return[index] = ProbabilisticProduction(prod.lhs(), prod.rhs(),
                                                               prob=probabilities[str(prod)])

    return productions_to_return
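What this function computes by hand, relative-frequency (MLE) probabilities per left-hand side, is also what NLTK's built-in induce_pcfg does. A one-line library equivalent, if a full PCFG object is acceptable instead of a production list:

from nltk import Nonterminal, induce_pcfg

# productions: the same list of nltk.grammar.Production objects as above
grammar = induce_pcfg(Nonterminal('S'), productions)
print(grammar.productions())  # ProbabilisticProductions with MLE probabilities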
Example #10
def get_productions(productions):

    probabilities = dict()
    productions_to_return = list(set(productions))

    for prod in productions:
        if str(prod) in probabilities:
            probabilities[str(prod)] += 1
        else:
            probabilities[str(prod)] = 1

    lhs_of_prods = set([prod.lhs() for prod in productions])

    for lhs in lhs_of_prods:
        number_of_occurrences = 0
        for prob in probabilities:
            if prob.startswith(str(lhs) + " "):
                number_of_occurrences += probabilities[prob]
        for prob in probabilities:
            if prob.startswith(str(lhs) + " "):
                probabilities[prob] = probabilities[prob] / number_of_occurrences

    for index in range(len(productions_to_return)):
        prod = productions_to_return[index]
        productions_to_return[index] = ProbabilisticProduction(
            prod.lhs(), prod.rhs(), prob=probabilities[str(prod)])
    dist = FreqDist(productions_to_return)
    #dist.plot(len(probabilities))

    return productions_to_return, dist
Example #11
    def __init__(self, parsed_sents, start='sentence', horzMarkov=None):
        """
        parsed_sents -- list of training trees.
        start -- start symbol.
        horzMarkov -- None for default. A number n >= 0 for horizontal markov.
        """
        self.start = start

        count_Y_Z = defaultdict(lambda: defaultdict(int))
        count_X = defaultdict(int)
        for t in parsed_sents:
            # Work on a deep copy: unlexicalize mutates the tree, and we
            # don't want to modify the original.
            unle_trees = unlexicalize(t.copy(deep=True))
            # chomsky normal form with horizontal markov.
            unle_trees.chomsky_normal_form(horzMarkov=horzMarkov)
            # collapse subtrees with a single child.
            unle_trees.collapse_unary(collapsePOS=True)
            for prod in unle_trees.productions():
                count_Y_Z[prod.lhs()][prod.rhs()] += 1
                count_X[prod.lhs()] += 1

        # create a list of productions.
        productions = []
        for X, c_X in count_X.items():
            for (Y_Z, c_Y_Z) in count_Y_Z[X].items():
                q = c_Y_Z / float(c_X)
                productions.append(ProbabilisticProduction(X, Y_Z, prob=q))

        self.production = productions

        grammar = PCFG(Nonterminal(start), productions)
        self.parser = CKYParser(grammar)
Example #12
    def nts_into_ts(generated_nts):
        for index in range(len(generated_nts)):
            old_nt = generated_nts[index]
            try:
                t = non_terminal_into_terminal(generated_nts[index])
            except Exception:
                # old_nt is already a terminal; nothing to expand.
                continue
            productions_corpus.append(ProbabilisticProduction(Nonterminal(old_nt), tuple(t), prob=0))
            generated_nts[index] = nts_into_ts(Tree(old_nt, t))
        return generated_nts
Example #13
    def test_horz_markov_0(self):
        t = Tree.fromstring("(NP (Det el) (Noun gato) (Adj negro))")

        model = UPCFG([t], horzMarkov=0)

        prods = model.productions()

        prods2 = [
            # the right-binarized productions:
            ProbabilisticProduction(N('NP'), [N('Det'), N('NP|<>')], prob=1.0),
            ProbabilisticProduction(N('NP|<>'),
                                    [N('Noun'), N('Adj')],
                                    prob=1.0),
            ProbabilisticProduction(N('Det'), ['Det'], prob=1.0),
            ProbabilisticProduction(N('Noun'), ['Noun'], prob=1.0),
            ProbabilisticProduction(N('Adj'), ['Adj'], prob=1.0),
        ]

        self.assertEqual(set(prods), set(prods2))
Example #14
    def parse_batch(self, tagged):
        missing = False
        tokens = []
        for tok, pos in tagged:
            tokens.append(tok)
            if not self._grammar._lexical_index.get(tok):
                missing = True
                self._grammar._productions.append(ProbabilisticProduction(Nonterminal(pos), [tok], prob=0.000001))
        if missing:
            self._grammar._calculate_indexes()
        return super(PCFGViterbiParser, self).parse(tokens)
Example #15
    def test_horz_markov_None(self):
        t = Tree.fromstring("(NP (Det el) (Noun gato) (Adj negro))")

        # Fix with respect to the official test: pass start='NP'
        model = UPCFG([t], start='NP')  # horzMarkov=None by default

        prods = model.productions()

        prods2 = [
            # the right-binarized productions:
            ProbabilisticProduction(N('NP'),
                                    [N('Det'), N('NP|<Noun-Adj>')],
                                    prob=1.0),
            ProbabilisticProduction(N('NP|<Noun-Adj>'),
                                    [N('Noun'), N('Adj')],
                                    prob=1.0),
            ProbabilisticProduction(N('Det'), ['Det'], prob=1.0),
            ProbabilisticProduction(N('Noun'), ['Noun'], prob=1.0),
            ProbabilisticProduction(N('Adj'), ['Adj'], prob=1.0),
        ]

        self.assertEqual(set(prods), set(prods2))
Example #16
    def __init__(self, parsed_sents, start='sentence', horzMarkov=None):
        """
        parsed_sents -- list of training trees.
        """
        # { A -> B : count(A -> B) }
        productions_counts = defaultdict(int)
        # { A : count(A) }
        lhs_count = defaultdict(int)  # left_hand_side_count

        self.start = start  # for the CKY parser's grammar
        self.prods = []  # list of productions

        # Make a copy of t: unlexicalize modifies the tree in place.
        # Original: unlexicalize_tree = [unlexicalize(t) for t in parsed_sents]
        unlex_sents = [unlexicalize(t.copy(deep=True)) for t in parsed_sents]

        for t in unlex_sents:
            t.chomsky_normal_form(horzMarkov=horzMarkov)
            t.collapse_unary(collapsePOS=True, collapseRoot=True)
            for prod in t.productions():
                # type(prod): <class 'nltk.grammar.Production'>
                # type(prod.lhs): <class 'nltk.grammar.Nonterminal'>
                # type(prod.rhs): <class 'tuple'>
                #   Each element of prod.rhs() is of type
                #       <class 'nltk.grammar.Nonterminal'>
                productions_counts[prod] += 1
                lhs_count[prod.lhs()] += 1

        for prod, count_prod in productions_counts.items():
            # type(prod): <class 'nltk.grammar.Production'>
            #     prod : A -> B
            # type(count_prod): int
            #     count_prod : count(A -> B)
            count_lhs = lhs_count.get(prod.lhs(), 0)

            # type(prod.lhs): <class 'nltk.grammar.Nonterminal'>
            # type(prod.rhs): <class 'tuple'>
            q_ML = float(count_prod) / count_lhs
            self.prods += [ProbabilisticProduction(prod.lhs(),
                                                   prod.rhs(),
                                                   prob=q_ML)]
            # Each element of self.prods is of type
            #     <class 'nltk.grammar.ProbabilisticProduction'>

        # type(PCFG(...)) = <class 'nltk.grammar.PCFG'>
        # PCFG(start, productions)
        #       type(start): Nonterminal
        #       type(productions): list(Production)
        grammar = PCFG(Nonterminal(start), self.prods)
        self.my_parser = CKYParser(grammar)
Example #17
    def read_productions(self, productions_filename):
        productions = []
        with io.open(productions_filename, 'r', encoding='utf8') as f:
            for line in f:
                line = line.strip()
                components = line.split(u'+')
                lhs = Nonterminal(components[0])
                rhs = tuple([
                    Nonterminal(nt.strip()) for nt in components[1].split(u' ')
                ])
                prob = float(components[2])
                pp = ProbabilisticProduction(lhs, rhs, prob=prob)
                productions.append(pp)
        self.grammar = PCFG(Nonterminal('S'), productions)
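This loader assumes a '+'-delimited line format: left-hand side, space-separated right-hand side, probability. Note it wraps every right-hand-side token in Nonterminal, so it only handles nonlexical rules. A hypothetical input file consistent with the code:

S+NP VP+0.8
S+VP+0.2
NP+Det N+1.0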
Example #18
    def parse(self, tokens, tagger=None):
        # tokens = self._preprocess(list(tokens))
        if tagger is None:
            tagged = nltk.pos_tag(tokens)
        else:
            tagged = tagger.tag(tokens)
        # print(tagged)
        missing = False
        for tok, pos in tagged:
            if not self._grammar._lexical_index.get(tok):
                missing = True
                self._grammar._productions.append(ProbabilisticProduction(Nonterminal(pos), [tok], prob=0.000001))
        if missing:
            self._grammar._calculate_indexes()
        return super(PCFGViterbiParser, self).parse(tokens)
Example #19
    def parse(self, tokens):
        tokens = self._preprocess(list(tokens))
        tagged = nltk.pos_tag(tokens)

        missing = False
        for tok, pos in tagged:
            if not self._grammar._lexical_index.get(tok):
                missing = True
                self._grammar._productions.append(
                    ProbabilisticProduction(Nonterminal(pos), [tok],
                                            prob=0.000001))
        if missing:
            self._grammar._calculate_indexes()

        print('HI')
        testlist = super(PCFGViterbiParser, self).parse(tokens)
        for test in testlist:
            test.draw()
        return super(PCFGViterbiParser, self).parse(tokens)
Example #20
    def parse(self, tokens):
        #tokens = self._preprocess(list(tokens))
        tagged = nltk.pos_tag(tokens)
        # tagged = tokens
        # print(tagged)
        # tokens = [i[0] for i in tagged]
        # print("TOOOOKKENNSS-------------")
        # print(tokens)

        missing = False
        for tok, pos in tagged:
            if not self._grammar._lexical_index.get(tok):
                missing = True
                self._grammar._productions.append(
                    ProbabilisticProduction(Nonterminal(pos), [tok],
                                            prob=0.000001))
        if missing:
            self._grammar._calculate_indexes()

        print(self._grammar)
        return super(PCFGViterbiParser, self).parse(tokens)
Example #21
def _postprocessing(G, C):
    print("\npostprocessing...")
    ## remove the _START_ -> ... rule
    rules = []
    for prod in G.productions():
        if G.start().symbol() not in prod.lhs().symbol():
            rules.append(prod)
    if len(rules) == 0:
        return G
    ## create an OR symbol S
    S = Nonterminal("_START_")
    sss = {} # single symbol sentences
    ## for each sentence s in C do
    ##   if s is fully reduced to a single symbol x then
    ##   add S -> x to G, or if the rule already exists, increase its weight by 1
    for sentence in sent_tokenize(C):
        sentence = re.sub(r'[^\w\s]', '', sentence)
        t = word_tokenize(sentence)
        if len(t) == 1:
            sss[t[0]] = 1 if not t[0] in sss else sss[t[0]] + 1
    weight_sum = sum([sss[k] for k in sss])
    rules += [ProbabilisticProduction(S, [_format_nt(k)], prob=sss[k]/weight_sum) for k in sss]
    return PCFG(S, rules)
Example #22
    def renormalize(self, height=10**4, tol=10**(-17), min_height=100):
        """Return the renormalized grammar.

        Raise ValueError if the coverage of at least one nonterminal
        equals zero.
        Input:
            height - maximal height of the parse trees over which the
                coverage is calculated.
            tol - tolerance used as a stopping condition: stop when the
                change is smaller than the input tolerance.
            min_height - overrides the tolerance stopping condition and
                calculates coverage for all heights <= min_height. It
                also determines for how many previous steps the change
                is measured, i.e. for levels (height-1 - min_height/2).
        """
        coverages_dict = self.list_coverages(height, tol, min_height)
        if min(coverages_dict[A] for A in coverages_dict) < tol:  # input tol
            print([A for A in coverages_dict if coverages_dict[A] < tol])
            raise ValueError("Not all coverages are positive, so"
                             + " renormalization cannot be performed (it"
                             + " would divide by zero).")

        def chi(prod, coverages_dict):
            """Renormalize a production probability as in Chi's paper, Eq. (22)."""
            subprobabs = prod.prob()
            for symbol in prod.rhs():
                if not isinstance(symbol, Nonterminal):
                    continue  # terminals contribute a factor of 1
                else:
                    subprobabs *= coverages_dict[symbol]
            return subprobabs/coverages_dict[prod.lhs()]

        prods = [ProbabilisticProduction(prod.lhs(), prod.rhs(),
                                         prob=chi(prod, coverages_dict))
                 for prod in self.grammar.productions()]
        return PCFG(self.grammar.start(), prods)
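The chi helper implements the renormalization of Chi's Eq. (22): each rule probability is multiplied by the coverage of every nonterminal on its right-hand side and divided by the coverage of its left-hand side. A self-contained sketch of that arithmetic, with made-up coverage values:

from nltk.grammar import Nonterminal, ProbabilisticProduction

S, A = Nonterminal('S'), Nonterminal('A')
coverages = {S: 0.9, A: 0.6}  # toy coverage values
prod = ProbabilisticProduction(S, [A, 'a'], prob=0.5)

subprobabs = prod.prob()      # 0.5
for symbol in prod.rhs():
    if isinstance(symbol, Nonterminal):
        subprobabs *= coverages[symbol]    # * 0.6; terminals contribute 1
print(subprobabs / coverages[S])           # 0.5 * 0.6 / 0.9 = 0.333...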
Example #23
def tree_to_production(tree):
    return ProbabilisticProduction(get_tag(tree), [get_tag(child) for child in tree], prob=0)
Example #24
def _read_production(line, nonterm_parser, probabilistic=False):
    """
    Parse a grammar rule, given as a string, and return
    a list of productions.
    """
    pos = 0

    # Parse the left-hand side.
    lhs, pos = nonterm_parser(line, pos)

    # Skip over the arrow.
    m = _ARROW_RE.match(line, pos)
    if not m: raise ValueError('Expected an arrow')
    pos = m.end()

    # Parse the right hand side.
    probabilities = [0.0]
    rhsides = [[]]
    optionals = [[]]  # keep track of optional productions
    while pos < len(line):
        # Probability.
        m = _PROBABILITY_RE.match(line, pos)
        if probabilistic and m:
            pos = m.end()
            probabilities[-1] = float(m.group(1)[1:-1])
            if probabilities[-1] > 1.0:
                raise ValueError('Production probability %f, '
                                 'should not be greater than 1.0' %
                                 (probabilities[-1], ))

        # Vertical bar -- start new rhside.
        elif line[pos] == '|':
            m = _DISJUNCTION_RE.match(line, pos)
            probabilities.append(0.0)
            rhsides.append([])
            optionals.append([])
            pos = m.end()

        # String -- add terminal.
        elif line[pos] in "\'\"":
            m = _TERMINAL_RE.match(line, pos)
            if not m: raise ValueError('Unterminated string')
            rhsides[-1].append(m.group(1)[1:-1])
            optionals[-1].append(False)
            pos = m.end()

        # Opening bracket -- start optional production.
        elif line[pos] == '[':
            m = _OPTIONAL_RE.match(line, pos)  # just get rid of spaces

            pos = m.end()
            # should refactor out the following
            if line[pos] in "\'\"":
                m = _TERMINAL_RE.match(line, pos)
                if not m: raise ValueError('Unterminated string')
                rhsides[-1].append(m.group(1)[1:-1])
                pos = m.end()
            else:
                nonterm, pos = nonterm_parser(line, pos)  # Eats the spaces
                rhsides[-1].append(nonterm)
            # end of refactor
            optionals[-1].append(True)
            if line[pos] != ']':
                raise ValueError('Unterminated optional bracket')
            m = _OPTIONAL_END_RE.match(line, pos)
            pos = m.end()

        # Anything else -- nonterminal.
        else:
            nonterm, pos = nonterm_parser(line, pos)  # Eats the spaces
            rhsides[-1].append(nonterm)
            optionals[-1].append(False)

    # Expand productions with optional elements
    rhsides_temp = []

    for (optionality, rhs) in zip(optionals, rhsides):
        # there may be more than one rhs, separated by | (disjunction)
        if True in optionality:
            if probabilistic:
                raise ValueError(
                    'Optional terms not allowed in probabilistic grammar')
            optterms = [i for (i, isopt) in enumerate(optionality) if isopt]
            opttermlists = powerset(
                optterms)  # all possible combinations of optionals
            for optlist in opttermlists:
                rhstemp = rhs[:]
                for i in sorted(optlist, reverse=True):
                    del rhstemp[i]
                rhsides_temp.append(rhstemp)
        else:
            rhsides_temp.append(rhs)
    # probabilities won't work with optionality!
    if probabilistic:
        return [
            ProbabilisticProduction(lhs, rhs, prob=probability)
            for (rhs, probability) in zip(rhsides, probabilities)
        ]
    else:
        return [Production(lhs, rhs) for rhs in rhsides_temp]
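For illustration, given the optional-bracket syntax this parser accepts, a (non-probabilistic) rule such as:

NP -> [ 'the' ] N

would expand into the two productions NP -> 'the' N and NP -> N, while in probabilistic mode optional terms are rejected, as the code enforces.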
Example #25
def getTerminalProbability(args, pcfg_grammar, list_nonterm):
    #args.save_dir = args.english_save_dir
    #(modelen, charsen, vocaben) = getModel(args, 'en')
    #args.save_dir = args.hindi_save_dir
    #(modelhi, charhi, vocabhi) = getModel(args, 'hi')

    p = []
    args.nonterm = 'HS'
    args.save_dir = args.hindi_save_dir
    args.num_sentence = 1000
    args.length = len(args.sentence)
    segmentshi = []
    #print "PCFG grammar", pcfg_grammar
    (lengthlist, listterminal) = getLength(args, pcfg_grammar, list_nonterm)
    #print "lengthlist", lengthlist
    for length in lengthlist:
        segmentshi.extend(createSegment(length, args.sentence.lower()))
    probdicthi = getModel(args, 'hi', segmentshi)
    listProb = list(probdicthi.values())  # list() needed in Python 3
    segmentshi = list(set(segmentshi))

    args.nonterm = 'ES'
    args.save_dir = args.english_save_dir
    (lengthlist, listterminal) = getLength(args, pcfg_grammar, list_nonterm)
    segmentsen = []
    for length in lengthlist:
        segmentsen.extend(createSegment(length, args.sentence.lower()))
    probdicten = getModel(args, 'en', segmentsen)
    listProb.extend(probdicten.values())
    segmentsen = list(set(segmentsen))

    listProb = sorted(listProb)
    denom = (len(listProb) * (len(listProb) + 1)) / 2
    prob1 = 0
    for segment in segmentshi:
        probnew = (listProb.index(probdicthi[segment]) + 1.0) / (denom + 1)
        probnew = float("{0:.8f}".format(round(probnew, 8)))
        prob1 += probnew
        #print segment, probnew
        p.append(
            ProbabilisticProduction(
                Nonterminal('HS'),
                [Nonterminal(token.upper()) for token in segment.split()],
                prob=probnew))
    p.append(
        ProbabilisticProduction(Nonterminal('HS'), ['Dummy'],
                                prob=(1.0 - prob1)))

    #print 'HS', prob1, 1.0-prob1
    prob1 = 0
    for segment in segmentsen:
        probnew = (listProb.index(probdicten[segment]) + 1.0) / (denom + 1)
        probnew = float("{0:.8f}".format(round(probnew, 8)))
        prob1 += probnew
        p.append(
            ProbabilisticProduction(
                Nonterminal('ES'),
                [Nonterminal(token.upper()) for token in segment.split()],
                prob=probnew))

    #print 'ES',prob1, 1.0-prob1
    p.append(
        ProbabilisticProduction(Nonterminal('ES'), ['Dummy'],
                                prob=(1.0 - prob1)))

    return p
Example #26
def _attaching(N, G, C, T):
    print("attaching...")
    C_derived = _apply_grammar(G, C)
    ORs = [] # list of OR nonterminals
    for prod in G.productions():
        nt = prod.lhs()
        if "OR" in nt.symbol() and nt not in ORs:
            ORs.append(nt)
    ## for each OR symbol O in G do
    for O in ORs:
        ## if O leads to a valid expanded bicluster
        ## as well as a posterior gain (Eq.3) larger than a threshold then
        
        #
        #   AND-OR group
        
        group = None
        pos = None # left or right (odd = False, even = True)
        ## retrieve O's AND-OR group
        for g in biclusters:
            if O.symbol() in g[1] or O.symbol() in g[2]:
                group = g
                break
        ## retrieve O's position within the group
        num = int(O.symbol()[4:]) # OR number, e.g. "_OR_2" -> 2
        pos = (num % 2 == 0)
        
        #
        #   BC_tilde and BC_tilde_prime
        
        ## create BC_t (BC_tilde)
        BC_t = biclusters[group].copy()
        ## fill BC_t
        for pair in _get_bicluster_pairs(BC_t):
            BC_t.at[pair] = _count_occ(" ".join(pair), C_derived)
        ## create BC_t_1 (BC_tilde_prime) (proposed new rule OR -> AND)
        BC_t_1 = BC_t.copy()
        ## . fill BC_t_1
        if not pos:
            ## new row (OR on the left)
            new_row = [_count_occ(" ".join((N.symbol(),x)), C) for x in BC_t.columns]
            BC_t_1.loc[N.symbol(),:] = new_row
            BC_t_1 = BC_t_1.astype(int)
        else:
            ## new column (OR on the right)
            new_col = [_count_occ(" ".join((x,N.symbol())), C) for x in BC_t.index]
            BC_t_1.loc[:,N.symbol()] = new_col
            BC_t_1 = BC_t_1.astype(int)
        
        #
        #   EC_tilde and EC_tilde_prime

        ## create and fill EC_t
        EC_t = _create_ec(BC_t, C_derived, _create_t(C_derived))
        ## create EC_t_1
        EC_t_1 = EC_t.copy()
        ## . add the new rows of EC_t_1
        if not pos:
            ## OR on the left
            new_row_indices = [(N.symbol(),col) for col in BC_t_1.columns]
        else:
            ## OR on the right
            new_row_indices = [(row,N.symbol()) for row in BC_t_1.index]
        ## . fill the new rows of EC_t_1
        for i in new_row_indices:
            i_str = _tuple_to_ec_index(i, True)
            EC_t_1.loc[i_str,:] = [-1]*EC_t_1.shape[1]
            for j in EC_t_1.columns:
                e, c = " ".join(i), list(_ec_index_to_tuple(j, False)) # expression, context
                c = tuple(["" if _represents_int(x) else x for x in c])
                EC_t_1.loc[i_str,j] = _count_occ(" ".join([c[0],e,c[1]]).strip(), C)
        EC_t_1 = EC_t_1.astype(int)
        bc_t_1 = BC_t_1.to_numpy()  # .as_matrix() in older pandas
        ec_t_1 = EC_t_1.to_numpy()
        bc_t = BC_t.to_numpy()
        ec_t = EC_t.to_numpy()
        
        #
        #   LOG POSTERIOR GAIN DIFFERENCE (Eq.3)
        
        ## are BC and EC valid (MC)?
        ## note the added parentheses: skip unless all four checks hold
        if not (_is_mc(bc_t_1) and _is_mc(ec_t_1) and _is_mc(bc_t) and _is_mc(ec_t)):
            continue
        
        lpg_diff = _log_posterior_gain(bc_t_1, ec_t_1)
        lpg_diff -= _log_posterior_gain(bc_t, ec_t)
        
        if lpg_diff > LPG_DIFF_THRESHOLD:
            print("new rule: %s -> %s" % (O.symbol(),N.symbol()))
            bc = BC_t_1.to_numpy()  # .as_matrix() in older pandas
            s = np.sum(bc)
            row_prob = np.sum(bc, 1)/s
            col_prob = np.sum(bc, 0)/s
            ## rules
            rules = []
            for prod in G.productions():
                if O.symbol() not in prod.lhs().symbol():
                    rules.append(prod)
            ## add the new rules
            if not pos:
                ## OR on the left
                probs = row_prob
                rhs_symbols = [x for x in BC_t.index]+[N]
                for i in range(BC_t_1.shape[0]):
                    rules.append(ProbabilisticProduction(O, [rhs_symbols[i]], prob=probs[i]))
            else:
                ## OR on the right
                probs = col_prob
                rhs_symbols = [x for x in BC_t.columns]+[N]
                for j in range(BC_t_1.shape[1]):
                    rules.append(ProbabilisticProduction(O, [rhs_symbols[j]], prob=probs[j]))

            ## updates
            biclusters[group] = BC_t_1.copy() # update the AND-OR group
            G = PCFG(G.start(), rules) # update G
            C = _reduce_corpus(C, biclusters[group], N, True) # reduce C
            T = _create_t(C) # update T

    return G, C, T
Example #27
        probabilities[str(prod)] = 1

lhs_of_prods = set([prod.lhs() for prod in original_production_corpus])

for lhs in lhs_of_prods:
    number_of_occurrences = 0
    for prob in probabilities:
        if prob.startswith(str(lhs) + " "):
            number_of_occurrences += probabilities[prob]
    for prob in probabilities:
        if prob.startswith(str(lhs) + " "):
            probabilities[prob] = probabilities[prob] / number_of_occurrences

for index in range(len(productions_corpus)):
    prod = productions_corpus[index]
    productions_corpus[index] = ProbabilisticProduction(prod.lhs(), prod.rhs(), prob=probabilities[str(prod)])

productions_toy_pcfg2 = toy_pcfg2.productions()

lhs_of_prods = set([str(prod.lhs()) for prod in original_production_corpus] + [str(prod.lhs()) for prod in
                                                                               productions_toy_pcfg2])


def compute_kl_divergence(mle_dist1, mle_dist2):
    ans = 0
    for p in mle_dist1.freqdist():
        for q in mle_dist2.freqdist():
            if p.rhs() == q.rhs():
                ans += p.prob() * math.log(p.prob() / q.prob())
    return ans
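compute_kl_divergence applies the usual formula KL(P||Q) = sum_x p(x) * log(p(x) / q(x)), matching productions by right-hand side. A tiny numeric check of the formula with plain floats:

import math

p = {'a': 0.8, 'b': 0.2}  # toy distribution P
q = {'a': 0.5, 'b': 0.5}  # toy distribution Q
kl = sum(p[x] * math.log(p[x] / q[x]) for x in p)
print(kl)  # 0.8*ln(1.6) + 0.2*ln(0.4) ≈ 0.193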
Example #28
def baseline(depth=5, n=500):
    ## nonterminal symbols
    S = Nonterminal("S")
    NP = Nonterminal("NP")
    VP = Nonterminal("VP")
    PP = Nonterminal("PP")
    Det = Nonterminal("Det")
    Vt = Nonterminal("Vt")
    Vc = Nonterminal("Vc")
    Vi = Nonterminal("Vi")
    N = Nonterminal("N")
    P = Nonterminal("P")
    ## probabilistic production rules
    R = [
        ProbabilisticProduction(S, [NP, VP], prob=1.),
        ProbabilisticProduction(NP, [Det, N], prob=1.),
        ProbabilisticProduction(VP, [Vt, NP], prob=1 / 3),
        ProbabilisticProduction(VP, [Vc, PP], prob=1 / 3),
        ProbabilisticProduction(VP, [Vi], prob=1 / 3),
        ProbabilisticProduction(PP, [P, NP], prob=1.),
        ProbabilisticProduction(Det, ["a"], prob=.5),
        ProbabilisticProduction(Det, ["the"], prob=.5),
        ProbabilisticProduction(Vt, ["touches"], prob=.5),
        ProbabilisticProduction(Vt, ["covers"], prob=.5),
        ProbabilisticProduction(Vi, ["rolls"], prob=.5),
        ProbabilisticProduction(Vi, ["bounces"], prob=.5),
        ProbabilisticProduction(Vc, ["is"], prob=1.),
        ProbabilisticProduction(N, ["circle"], prob=1 / 3),
        ProbabilisticProduction(N, ["square"], prob=1 / 3),
        ProbabilisticProduction(N, ["triangle"], prob=1 / 3),
        ProbabilisticProduction(P, ["above"], prob=.5),
        ProbabilisticProduction(P, ["below"], prob=.5)
    ]
    G = PCFG(S, R)  # grammar
    C = ""  # corpus
    ## all possible sentences
    print("\n")
    for i, sent in enumerate(generate.generate(G, depth=depth, n=n), 1):
        s = ' '.join(sent)
        C += s + '. '
        print('%3d. %s%s' % (i, s, '.'))
    return G, C
Example #29
def langley_1(depth=5, n=500):
    ## nonterminal symbols
    S = Nonterminal("S")
    NP = Nonterminal("NP")
    VP = Nonterminal("VP")
    AP = Nonterminal("AP")
    Adj = Nonterminal("Adj")
    Det = Nonterminal("Det")
    Vt = Nonterminal("Vt")
    Vi = Nonterminal("Vi")
    N = Nonterminal("N")
    ## probabilistic production rules
    R = [
        ProbabilisticProduction(S, [NP, VP], prob=1.),
        ProbabilisticProduction(VP, [Vi], prob=.5),
        ProbabilisticProduction(VP, [Vt, NP], prob=.5),
        ProbabilisticProduction(NP, [Det, N], prob=.5),
        ProbabilisticProduction(NP, [Det, AP, N], prob=.5),
        ProbabilisticProduction(AP, [Adj], prob=.5),
        ProbabilisticProduction(AP, [Adj, AP], prob=.5),
        ProbabilisticProduction(Det, ["the"], prob=1.),
        ProbabilisticProduction(Vt, ["saw"], prob=.5),
        ProbabilisticProduction(Vt, ["heard"], prob=.5),
        ProbabilisticProduction(Vi, ["ate"], prob=.5),
        ProbabilisticProduction(Vi, ["slept"], prob=.5),
        ProbabilisticProduction(N, ["cat"], prob=.5),
        ProbabilisticProduction(N, ["dog"], prob=.5),
        ProbabilisticProduction(Adj, ["big"], prob=.5),
        ProbabilisticProduction(Adj, ["old"], prob=.5)
    ]
    G = PCFG(S, R)  # grammar
    C = ""  # corpus
    ## all possible sentences
    print("\n")
    for i, sent in enumerate(generate.generate(G, depth=depth, n=n), 1):
        s = ' '.join(sent)
        C += s + '. '
        print('%3d. %s%s' % (i, s, '.'))
    return G, C