import numpy as np
from copy import deepcopy

# PCFG, OOV, tagged_sent_to_tree, list_to_parsed_sentence and tree_to_sentence
# are project-local helpers assumed to be defined elsewhere.


class CYK:
    def __init__(self, corpus):
        # PCFG and OOV class
        self.pcfg = PCFG(corpus)
        self.oov = OOV(self.pcfg.lexicon, self.pcfg.list_all_tags, self.pcfg.tokens)
        # Initialize the CYK probability matrix
        self.proba_matrix = None
        self.cyk_matrix = None

    # Apply the CYK algorithm
    def CYK_algorithm(self, sentence):
        # Initialize
        n = len(sentence)
        r = self.pcfg.nb_all_tags
        P = np.zeros((n, n, r))
        cyk_matrix = np.zeros((n, n, r, 3))
        # First level P[0, :, :]
        for idx_word, word in enumerate(sentence):
            # Get the closest word in the lexicon
            word = self.oov.closest_word(word)
            if word is None:
                for idx_tag, tag in enumerate(self.pcfg.list_all_tags):
                    if tag in self.pcfg.terminal_tags:
                        P[0, idx_word, idx_tag] = self.pcfg.terminal_tags[tag]
            else:
                for idx_tag, tag in enumerate(self.pcfg.list_all_tags):
                    if tag in self.pcfg.inv_lexicon[word]:
                        P[0, idx_word, idx_tag] = self.pcfg.inv_lexicon[word][tag]
        # Other levels
        for l in range(1, n):
            for s in range(n - l):
                for tag in self.pcfg.grammar:
                    idx_tag = self.pcfg.dic_all_tags[tag]
                    for p in range(l):
                        for rule in self.pcfg.grammar[tag]:
                            left_tag = rule.split(' ')[0]
                            right_tag = rule.split(' ')[1]
                            b = self.pcfg.dic_all_tags[left_tag]
                            c = self.pcfg.dic_all_tags[right_tag]
                            prob_splitting = (self.pcfg.grammar[tag][rule]
                                              * P[p, s, b] * P[l - p - 1, s + p + 1, c])
                            if prob_splitting > P[l, s, idx_tag]:
                                P[l, s, idx_tag] = prob_splitting
                                cyk_matrix[l, s, idx_tag] = [p, b, c]
        self.proba_matrix = P
        self.cyk_matrix = cyk_matrix.astype(int)

    # Remove new tags and de-telescope tags
    def clean_tags(self, tree):
        # remove new tags of type X|X1X2... (coming from the BIN rule)
        nodes = deepcopy(tree.nodes)
        for node in nodes:
            children = list(tree.successors(node))
            if len(children) == 0:
                pass
            elif len(children) == 1 and len(list(tree.successors(children[0]))) == 0:
                pass
            else:
                parent = list(tree.predecessors(node))
                if len(parent) == 0:
                    pass
                else:
                    tag = tree.nodes[node]["name"]
                    if (self.pcfg.dic_all_tags[tag] >= self.pcfg.nb_tags) and ("|" in tag):
                        for child in tree.successors(node):
                            tree.add_edge(parent[0], child)
                        tree.remove_node(node)
        # Decomposing A&B -> w into A -> B -> w
        max_node = np.max(tree.nodes())
        nodes = deepcopy(tree.nodes)
        for node in nodes:
            children = list(tree.successors(node))
            if len(children) == 0 or len(list(tree.predecessors(node))) == 0:
                pass
            elif len(children) == 1 and len(list(tree.successors(children[0]))) == 0:
                tag = tree.nodes[node]["name"]
                if (self.pcfg.dic_all_tags[tag] >= self.pcfg.nb_tags) and ("&" in tag):
                    # new tag from the UNIT rule
                    word = children[0]
                    idx_cut = None
                    for (idx, c) in enumerate(tag):
                        if c == "&":
                            idx_cut = idx
                    tree.nodes[node]["name"] = tag[:idx_cut]
                    idx_pre_terminal_node = max_node + 1
                    tree.add_node(idx_pre_terminal_node, name=tag[idx_cut + 1:])
                    max_node += 1
                    tree.remove_edge(node, word)
                    tree.add_edge(node, idx_pre_terminal_node)
                    tree.add_edge(idx_pre_terminal_node, word)

    # Parse part of a sentence
    def parse_substring(self, s, l, idx_tag, sentence):
        if l == 0:
            return sentence[s]
        else:
            cut = self.cyk_matrix[l, s, idx_tag, 0]
            idx_left_tag = self.cyk_matrix[l, s, idx_tag, 1]
            idx_right_tag = self.cyk_matrix[l, s, idx_tag, 2]
            left_tag = self.pcfg.list_all_tags[idx_left_tag]
            right_tag = self.pcfg.list_all_tags[idx_right_tag]
            return [[left_tag, self.parse_substring(s, cut, idx_left_tag, sentence)],
                    [right_tag, self.parse_substring(s + cut + 1, l - cut - 1,
                                                     idx_right_tag, sentence)]]

    # Returns the parsed sentence
    def parse(self, sentence):
        sentence = sentence.split(' ')
        length_sentence = len(sentence)
        if length_sentence > 1:
            self.CYK_algorithm(sentence)
            idx_root_tag = self.pcfg.dic_all_tags['SENT']
            if self.proba_matrix[length_sentence - 1][0][idx_root_tag] == 0:
                # no valid parsing
                return None
            parsing_list = self.parse_substring(0, length_sentence - 1, idx_root_tag, sentence)
        else:
            word = sentence[0]
            word_lexicon = self.oov.closest_word(word)
            if word_lexicon is None:
                tag = max(self.pcfg.terminal_tags, key=self.pcfg.terminal_tags.get)
            else:
                tag = max(self.pcfg.inv_lexicon[word_lexicon],
                          key=self.pcfg.inv_lexicon[word_lexicon].get)
            parsing_list = '(' + tag + ' ' + word + ')'
        # converting the parsing stored as a string into a tree
        tree = tagged_sent_to_tree("( (SENT " + list_to_parsed_sentence(parsing_list) + "))",
                                   remove_after_hyphen=False)
        self.clean_tags(tree)
        return tree_to_sentence(tree)
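# ---- Illustrative sketch (not part of the original code) ----
# The nested loops in CYK_algorithm above implement the classic CYK recurrence
#   P[l, s, A] = max over cut p and rules A -> B C of
#                P(A -> B C) * P[p, s, B] * P[l-p-1, s+p+1, C]
# Below is a minimal, self-contained rendering of that recurrence on a made-up
# toy grammar and sentence; only the dynamic program itself mirrors the class.
import numpy as np


def toy_cyk(words, lexicon, rules, tags):
    # lexicon: {(tag, word): prob}, rules: {(A, B, C): prob}, tags: list of tags
    tag_id = {t: i for i, t in enumerate(tags)}
    n, r = len(words), len(tags)
    P = np.zeros((n, n, r))
    for s, w in enumerate(words):  # length-1 spans (words)
        for (tag, word), prob in lexicon.items():
            if word == w:
                P[0, s, tag_id[tag]] = prob
    for l in range(1, n):  # longer spans, bottom-up
        for s in range(n - l):
            for (A, B, C), prob in rules.items():
                for p in range(l):
                    cand = prob * P[p, s, tag_id[B]] * P[l - p - 1, s + p + 1, tag_id[C]]
                    P[l, s, tag_id[A]] = max(P[l, s, tag_id[A]], cand)
    return P


tags = ["S", "NP", "VP", "Det", "N", "V"]
lexicon = {("Det", "the"): 1.0, ("N", "cat"): 0.5, ("N", "dog"): 0.5, ("V", "sees"): 1.0}
rules = {("S", "NP", "VP"): 1.0, ("NP", "Det", "N"): 1.0, ("VP", "V", "NP"): 1.0}
words = "the cat sees the dog".split()
P = toy_cyk(words, lexicon, rules, tags)
print(P[len(words) - 1, 0, tags.index("S")])  # probability of the best S parse: 0.25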
import re
from collections import deque

from tqdm import tqdm

# Node and OOV are project-local helpers assumed to be defined elsewhere.


class PCFG(object):
    def __init__(self, fname):
        self.count1 = {}
        self.count2 = {}
        self.count3 = {}
        self.rules = {}
        self.lexicon = {}
        self.nt = []
        self.oov = OOV('polyglot-fr.pkl')
        with open(fname, 'r') as f:
            self.training_corpus = f.readlines()

    def parse_corpus(self):
        ''' Parse the sentences of a corpus to compute a probabilistic grammar in CNF '''
        for sentence in self.training_corpus:
            root = self._parse_tree(sentence)
            self._update_rules(root)
        self._compute_probabilities()
        self._convert_to_cnf()

    def _parse_tree(self, sentence):
        ''' Parse the tree in string format and convert it into a data structure '''
        list_words = re.split(r'(\(|\))', sentence)
        root = Node('ROOT')
        curr_node = root
        for i in range(1, len(list_words) - 2, 2):  # Ignore first and last parenthesis
            word = list_words[i] + list_words[i + 1]
            if i == 1:
                continue
            if word[0] == '(':
                split = word[1:].split(' ', 1)
                non_terminal = split[0].split('-', 1)[0]  # Ignore hyphen
                if split[-1] == '':
                    new_node = Node(non_terminal)
                else:
                    terminal = split[1]
                    new_node = Node(non_terminal, anchor=terminal)
                curr_node.add_child(new_node)
                new_node.add_parent(curr_node)
                curr_node = new_node
            else:
                curr_node = curr_node.parents[-1]
        return root

    def _update_rules(self, root):
        ''' Perform a BFS on the derived tree to count the rules '''
        queue = deque()
        queue.append(root)
        marked = [root]
        while queue:
            node = queue.pop()
            rule = node.data + ' ->'
            for child in node.children:
                if child not in marked:
                    rule += ' ' + child.data
                    queue.append(child)
                    marked.append(child)
            if node.is_terminal:
                rule += " '" + node.anchor + "'"
                # Keep track of the count of ... -> anchor
                if node.anchor in self.count3:
                    self.count3[node.anchor] += 1
                else:
                    self.count3[node.anchor] = 1
            # Keep track of the count of alpha -> beta
            if rule in self.count1:
                self.count1[rule] += 1
            else:
                self.count1[rule] = 1
            # Keep track of the count of alpha -> ...
            if node.data in self.count2:
                self.count2[node.data] += 1
            else:
                self.count2[node.data] = 1

    def _compute_probabilities(self):
        ''' Compute the probability of each parsed rule from the counts gathered
            over the derived trees of the corpus '''
        for rule in self.count1:
            split = rule.split('->')
            rhs = split[-1].strip()
            lhs = split[0].strip()
            if rhs[0] == "'" and rhs[-1] == "'":  # if anchor
                rhs = rhs[1:-1]  # remove the quotes
                pair = (lhs, self.count1[rule] / self.count2[lhs])
                if rhs in self.lexicon:
                    self.lexicon[rhs].append(pair)
                else:
                    self.lexicon[rhs] = [pair]
                if lhs not in self.nt:
                    self.nt.append(lhs)
            else:
                self.rules[rule] = self.count1[rule] / self.count2[lhs]

    def _add_to_dict(self, key, value, dictionary):
        ''' Add a pair (key, value) to dictionary '''
        if key in dictionary:
            if value not in dictionary[key]:
                dictionary[key].append(value)
        else:
            dictionary[key] = [value]

    def _add_to_nt_list(self, value):
        ''' Add a value to the non-terminal list '''
        if value not in self.nt:
            self.nt.append(value)

    def _convert_to_cnf(self):
        ''' Transform N-ary rules (N > 2) into binary rules '''
        binary_rules = {}
        unary_rules = {}
        for rule in self.rules:
            proba = self.rules[rule]
            split = rule.split('->')
            lhs = split[0].strip()
            rhs = split[-1].strip()
            symbols = rhs.split(' ')
            if len(symbols) > 2:  # if more than two symbols in the rule
                new_symbol = lhs + '|' + "+".join(symbols[1:])
                new_rhs = symbols[0] + ' ' + new_symbol
                self._add_to_dict(lhs, (new_rhs, proba), binary_rules)
                self._add_to_nt_list(lhs)
                for i in range(1, len(symbols) - 2):
                    new_lhs = new_symbol
                    new_symbol = lhs + '|' + "+".join(symbols[i + 1:])
                    new_rhs = symbols[i] + ' ' + new_symbol
                    self._add_to_dict(new_lhs, (new_rhs, 1), binary_rules)
                    self._add_to_nt_list(new_lhs)
                new_rhs = symbols[-2] + ' ' + symbols[-1]
                self._add_to_dict(new_symbol, (new_rhs, 1), binary_rules)
                self._add_to_nt_list(new_symbol)
            elif len(symbols) == 2:
                self._add_to_dict(lhs, (rhs, self.rules[rule]), binary_rules)
                self._add_to_nt_list(lhs)
            else:
                self._add_to_dict(lhs, (rhs, self.rules[rule]), unary_rules)
                self._add_to_nt_list(lhs)
        self.unary_rules = unary_rules
        self.binary_rules = binary_rules
        self.oov.vocabulary = list(self.lexicon.keys())

    def cky(self, original_sequence, substitute_sequence):
        ''' Implement the probabilistic CKY algorithm '''
        n = len(original_sequence)
        best = [[{} for i in range(n + 1)] for j in range(n + 1)]
        back = [[{} for i in range(n + 1)] for j in range(n + 1)]
        # Init
        for i in range(n + 1):
            for j in range(n + 1):
                for X in self.nt:
                    best[i][j][X] = 0
        # Handle the terminal lexicon
        for i in range(1, n + 1):
            substitute_word = substitute_sequence[i - 1]
            original_word = original_sequence[i - 1]
            for X, p in self.lexicon[substitute_word]:
                if p > best[i - 1][i][X]:
                    best[i - 1][i][X] = p
                    back[i - 1][i][X] = original_word
            # Handle unary rules
            self._handle_unary(back, best, i - 1, i)
        for l in range(2, n + 1):
            for i in range(n - l + 1):
                j = i + l
                for k in range(i + 1, j):
                    # Handle binary rules
                    for X in self.binary_rules:
                        for rhs, p in self.binary_rules[X]:
                            Y, Z = rhs.split(' ')
                            p_prime = p * best[i][k][Y] * best[k][j][Z]
                            if p_prime > best[i][j][X]:
                                best[i][j][X] = p_prime
                                back[i][j][X] = (k, Y, Z)
                # Handle unary rules
                self._handle_unary(back, best, i, j)
        return back, best

    def _handle_unary(self, back, best, i, j):
        ''' Auxiliary function that treats unary rules in the probabilistic CKY algorithm '''
        again = True
        while again:
            again = False
            for X in self.unary_rules:
                for rhs, p in self.unary_rules[X]:
                    Y = rhs.split(' ')[0]
                    p_prime = p * best[i][j][Y]
                    if p_prime > best[i][j][X]:
                        best[i][j][X] = p_prime
                        back[i][j][X] = Y
                        again = True

    def build_tree(self, i, j, non_terminal, back):
        ''' Generate the tree from the backpointers computed by the P-CKY algorithm '''
        node = back[i][j][non_terminal]
        tree = Node(non_terminal)
        if type(node) is tuple:  # If binary
            k, left_non_terminal, right_non_terminal = node
            left_node = self.build_tree(i, k, left_non_terminal, back)
            right_node = self.build_tree(k, j, right_non_terminal, back)
            tree.add_child(left_node)
            tree.add_child(right_node)
            left_node.add_parent(tree)
            right_node.add_parent(tree)
            return tree
        else:  # If unary
            if node in self.nt:  # If not an anchor
                left_node = self.build_tree(i, j, node, back)
                tree.add_child(left_node)
                left_node.add_parent(tree)
                return tree
            else:  # If anchor
                return Node(non_terminal, node)

    def generate_tree(self, sequence):
        ''' Generate the parse tree corresponding to the most likely derivation
            of the input sequence '''
        original_sequence = sequence.split()
        substitute_sequence = self.oov.process(original_sequence)
        back, best = self.cky(original_sequence, substitute_sequence)
        if "ROOT" not in back[0][len(original_sequence)]:  # Not parsable
            return None
        tree = self.build_tree(0, len(original_sequence), 'ROOT', back)
        tree.un_cnf()
        return tree

    def compute_accuracy(self, gt_tree, predicted_tree):
        ''' Compute the percentage of tokens for which the parser chooses the
            correct part-of-speech '''
        map_token_pos_gt = {}
        gt_tree.extract_leaves(map_token_pos_gt)
        map_token_pos_pred = {}
        predicted_tree.extract_leaves(map_token_pos_pred)
        acc = 0
        for token in map_token_pos_gt.keys():
            gt_pos = map_token_pos_gt[token]
            pred_pos = map_token_pos_pred[token]
            if gt_pos == pred_pos:
                acc += 1
        return acc / len(map_token_pos_gt.keys())

    def evaluate(self, input_file):
        ''' Compute the average accuracy by comparing the trees parsed with CKY
            against the gold trees of the last 10% of the corpus '''
        mean_accuracy = 0
        with open(input_file, 'r') as f:
            testing_sentences = f.readlines()
        for idx, test_sentence in enumerate(tqdm(testing_sentences)):
            gt_tree = self._parse_tree(test_sentence)
            raw_tokens = gt_tree.compute_raw_tokens()
            predicted_tree = self.generate_tree(raw_tokens)
            if predicted_tree is not None:
                mean_accuracy += self.compute_accuracy(gt_tree, predicted_tree)
            else:  # sentence not parsable
                print("Sentence number %d: \"%s\" not parsable" % (idx, raw_tokens))
        mean_accuracy = mean_accuracy / len(testing_sentences)
        print("Final average accuracy:", mean_accuracy)

    def predict(self, input_file, output_file):
        ''' Predict parse trees for an input file of raw tokens and write them
            to an output file '''
        with open(input_file, 'r') as f:
            testing_sentences = f.readlines()
        with open(output_file, "w") as pred_file:
            for idx, test_sentence in enumerate(tqdm(testing_sentences)):
                predicted_tree = self.generate_tree(test_sentence)
                if predicted_tree is not None:
                    pred_file.write(predicted_tree.to_string() + "\n")
                else:  # sentence not parsable
                    print("Sentence number %d: \"%s\" not parsable" % (idx, test_sentence.strip()))
                    pred_file.write("( )\n")
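# ---- Illustrative sketch (not part of the original code) ----
# A standalone rendering of the binarization scheme used in _convert_to_cnf
# above: an n-ary rule lhs -> X1 X2 ... Xn is split into a chain of binary
# rules through artificial symbols named "lhs|Xi+...+Xn". The sample rule is
# made up for demonstration; the naming convention is the one used above.
def binarize(lhs, symbols):
    """Return the list of binary rules replacing lhs -> ' '.join(symbols)."""
    if len(symbols) <= 2:
        return [(lhs, ' '.join(symbols))]
    rules = []
    new_symbol = lhs + '|' + "+".join(symbols[1:])
    rules.append((lhs, symbols[0] + ' ' + new_symbol))
    for i in range(1, len(symbols) - 2):
        new_lhs = new_symbol
        new_symbol = lhs + '|' + "+".join(symbols[i + 1:])
        rules.append((new_lhs, symbols[i] + ' ' + new_symbol))
    rules.append((new_symbol, symbols[-2] + ' ' + symbols[-1]))
    return rules


print(binarize('SENT', ['NP', 'VP', 'PONCT']))
# [('SENT', 'NP SENT|VP+PONCT'), ('SENT|VP+PONCT', 'VP PONCT')]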
from collections import defaultdict
from itertools import product
from os.path import dirname, join, realpath
from time import time

import numpy as np

# OOV, Tree and levenshtein_distance are project-local helpers assumed to be
# defined elsewhere.


class PCFG:
    def __init__(self):
        self.oov = OOV()
        self.train = []
        self.test = []
        self.poses = set()
        # ex// Un: 56
        self.tokens = defaultdict(int)
        # ex// (A, B): 41
        self.count_grammar = defaultdict(int)
        # ex// (A, a): 11
        self.count_lexicon = defaultdict(int)
        # ex// Un: [N, NP]
        self.token_to_pos = defaultdict(set)
        # ex// (B, C): [A1, A2, A3]
        self.right_to_pos = defaultdict(set)
        # ex// A: 22
        self.preterminals_pos = defaultdict(int)
        # ex// (N, Un): 0.23
        self.prob_pos_to_token = defaultdict(int)
        # ex// (A, (B, C)): 56
        self.count_left_to_right = defaultdict(int)
        # ex// (A, (B, C)): 0.11
        self.prob_left_to_right = defaultdict(int)

    def from_path(self, path):
        """Load and split a treebank located at path"""
        dataset = open(join(dirname(realpath(__file__)), path), 'r').read().splitlines()
        np.random.shuffle(dataset)
        sep1, sep2 = int(len(dataset) * 0.8), int(len(dataset) * 0.9)
        self.train, self.test = dataset[:sep2], dataset[sep2:]

    def count_occurences(self):
        """Count occurrences of the different grammar rules and compute probabilities"""
        for line in self.train:
            new_tree = Tree()
            new_tree.fit(line)
            for pos, _dict in new_tree.count_rules.items():
                self.poses.add(pos)
                for left, count in _dict.items():
                    self.count_grammar[(pos, left)] += count
                    self.right_to_pos[left].add(pos)
            for pos, _dict in new_tree.count_lexicon.items():
                for token, count in _dict.items():
                    self.count_lexicon[(pos, token[0])] += count
                    self.tokens[token[0]] += 1
                    self.token_to_pos[token[0]].add(pos)
        # compute proba for A --> token
        for (pos, token), count in self.count_lexicon.items():
            self.preterminals_pos[pos] += count
        for (pos, token), count in self.count_lexicon.items():
            self.prob_pos_to_token[(pos, token)] = count / self.preterminals_pos[pos]
        # compute proba for A --> BC
        for (pos, _), count in self.count_grammar.items():
            self.count_left_to_right[pos] += count
        for (pos, right_side), count in self.count_grammar.items():
            self.prob_left_to_right[(pos, right_side)] = count / self.count_left_to_right[pos]

    def fit(self):
        """Compute grammar probabilities"""
        self.count_occurences()
        self.proba_grammar = {**self.prob_pos_to_token, **self.prob_left_to_right}
        self.non_terminal = set(x[0] for x in self.proba_grammar.keys())
        self.pos_2_ind = {pos: i for i, pos in enumerate(self.non_terminal)}
        self.ind_2_pos = {v: k for k, v in self.pos_2_ind.items()}

    def pcky(self, tokens):
        """Probabilistic CYK algorithm"""
        since = time()
        # normalize input: OOV module
        words = self.normalize_tokens(tokens)
        N = len(words)
        V = len(self.non_terminal)
        table = np.zeros((N + 1, N + 1, V))
        back = np.zeros((N + 1, N + 1, V), dtype=tuple)
        for j in range(1, N + 1):
            for A in self.token_to_pos[words[j - 1]]:
                table[j - 1, j, self.pos_2_ind[A]] = self.proba_grammar[(A, words[j - 1])]
            for i in range(j - 2, -1, -1):
                for k in range(i + 1, j):
                    ind_B = np.nonzero(table[i, k, :] > 0)[0]
                    B_list = [self.ind_2_pos[x] for x in ind_B]
                    ind_C = np.nonzero(table[k, j, :] > 0)[0]
                    C_list = [self.ind_2_pos[x] for x in ind_C]
                    for BC in product(B_list, C_list):
                        for A in self.right_to_pos[BC]:
                            indA = self.pos_2_ind[A]
                            indB = self.pos_2_ind[BC[0]]
                            indC = self.pos_2_ind[BC[1]]
                            value = self.proba_grammar[(A, BC)] * table[i, k, indB] * table[k, j, indC]
                            if table[i, j, indA] < value:
                                table[i, j, indA] = value
                                back[i, j, indA] = (k, *BC)
        print("Took {}s".format(int(time() - since)))
        if not back[0, N, self.pos_2_ind["SENT"]]:
            return None
        tree = self.build_tree(tokens, back, 0, N, "SENT")
        return " ".join(self.debinarize(tree.split()))

    def build_tree(self, words, back, i, j, pos):
        """Transform the output of CYK into a parenthesized parsed sentence"""
        n = j - i
        if n == 1:
            return " ( " + pos + " " + words[i] + " ) "
        else:
            k, B, C = back[i, j, self.pos_2_ind[pos]]
            return ("( " + pos + " "
                    + self.build_tree(words, back, i, k, B) + " "
                    + self.build_tree(words, back, k, j, C) + ") ")

    def debinarize(self, s):
        """Reverse the Chomsky binarisation"""
        for i, x in enumerate(s):
            if "$" in x and s[i - 1] == "(":
                c = 1
                for j, y in enumerate(s[i + 1:]):
                    if y == '(':
                        c += 1
                    elif y == ")":
                        c -= 1
                    if c == 0:
                        return self.debinarize(s[:i - 1] + s[i + 1:i + 1 + j] + s[i + 1 + j + 1:])
        return s

    def predict(self, line):
        """Predict the parse of a line from the training dataset"""
        new = self.line_to_tokens(line)
        return self.pcky(new)

    def line_to_tokens(self, line):
        """Transform a line from the dataset into a list of tokens"""
        tokenized = line.replace("(", " ( ").replace(")", " ) ").split()[1:-1]
        remove = False
        new = []
        for i, x in enumerate(tokenized):
            if tokenized[i] == "(" and tokenized[i + 1] != "(":
                remove = True
            elif tokenized[i] == "(":
                new.append(x)
            else:
                if not remove:
                    new.append(x)
                else:
                    remove = False
        new = list(filter(lambda x: x not in [')', '('], new))
        return new

    def prepare_line_for_prediction(self, line):
        """Tokenize a line from the dataset"""
        tokenized = line.replace("(", " ( ").replace(")", " ) ").split()[1:-1]
        new = []
        for i, x in enumerate(tokenized):
            if "-" in x and tokenized[i - 1] == "(":
                new.append(x.split("-")[0])
            else:
                new.append(x)
        return " ".join(new)

    def normalize_word(self, word):
        """OOV module: first try a small Levenshtein distance; otherwise return
        the closest word by cosine similarity"""
        if word in self.tokens.keys():
            return word
        lv_distances = defaultdict(list)
        for token in self.tokens.keys():
            distance = levenshtein_distance(word, token)
            for i in range(1, 3):
                if distance == i:
                    lv_distances[i].append(token)
                    break
        for i in range(1, 3):
            if lv_distances[i]:
                return lv_distances[i][0]
        return self.oov.closest_to_tokens(word, self.tokens.keys())

    def normalize_tokens(self, tokens):
        """Apply self.normalize_word to a list of tokens"""
        return [self.normalize_word(token) for token in tokens]
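# ---- Illustrative sketch (not part of the original code) ----
# normalize_word above relies on a levenshtein_distance helper imported from
# the project. A minimal dynamic-programming stand-in with the same call
# signature could look like this (assumption: the project version behaves the
# same way).
def levenshtein_distance(a, b):
    # previous[j] holds the edit distance between the current prefix of a and b[:j]
    previous = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        current = [i]
        for j, cb in enumerate(b, 1):
            current.append(min(previous[j] + 1,                 # deletion
                               current[j - 1] + 1,              # insertion
                               previous[j - 1] + (ca != cb)))   # substitution
        previous = current
    return previous[-1]


assert levenshtein_distance("chat", "chats") == 1
assert levenshtein_distance("chien", "chien") == 0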
import numpy as np
from copy import deepcopy

# PCFG, OOV, postagged_sent_to_tree and tree_to_postagged_sent are
# project-local helpers assumed to be defined elsewhere.


class CYK_Parser:
    # My parser, based on the probabilistic CYK algorithm
    def __init__(self, corpus_train):
        self.PCFG = PCFG(corpus_train)
        self.OOV = OOV(self.PCFG.lexicon, self.PCFG.list_all_symbols, self.PCFG.freq_tokens)
        # note: if the id of a symbol is above self.PCFG.nb_tags,
        # it is an artificial symbol introduced by the Chomsky normalization
        self.symbol_to_id = {symbol: i for (i, symbol) in enumerate(self.PCFG.list_all_symbols)}
        # instead of storing tags, store grammar rules with their corresponding indices
        # in grammar_ids: rules are stored with an additional hierarchical level for speed-up;
        # in other words, self.grammar_ids[X][Y][Z] stores P(rule X -> YZ),
        # where self.grammar_ids, self.grammar_ids[X], and self.grammar_ids[X][Y]
        # are all dictionaries
        self.grammar_ids = {}
        for (root_tag, rules) in self.PCFG.grammar.items():
            # root_tag is the left-hand symbol of the grammar rule
            idx_root_tag = self.symbol_to_id[root_tag]
            self.grammar_ids[idx_root_tag] = {}
            dico = {}
            for (split, proba) in rules.items():
                # split is the right-hand term, and proba the probability of the rule
                idx_left_tag = self.symbol_to_id[split[0]]
                idx_right_tag = self.symbol_to_id[split[1]]
                if idx_left_tag in dico:
                    dico[idx_left_tag][idx_right_tag] = proba
                else:
                    dico[idx_left_tag] = {idx_right_tag: proba}
            self.grammar_ids[idx_root_tag] = dico
        # for a given word, which are its tags, with the corresponding probabilities
        # P(tag -> word)? This is what self.lexicon_inverted stores
        self.lexicon_inverted = {word: {} for word in self.OOV.words_lexicon}
        for tag in self.PCFG.lexicon:
            for word in self.PCFG.lexicon[tag]:
                self.lexicon_inverted[word][tag] = self.PCFG.lexicon[tag][word]

    def compute_CYK_tables(self, sentence, viz_oov=False):
        # compute the CYK tables:
        # - looking for the probabilities of the most likely trees parsing the substrings
        #   of the sentence, for increasing substring length (from 1 to the sentence length)
        # - storing each time the position of the cut and the rule (right-hand term)
        #   enabling to reach the most likely parse tree for a given root tag
        nb_words = len(sentence)
        max_proba_derivation = np.zeros((nb_words, nb_words, self.PCFG.nb_all_symbols))
        # max_proba_derivation[s, l, a] is the maximum probability of
        # a parse where symbol a derives the substring x_s...x_(s+l)
        split_reaching_max = np.zeros((nb_words, nb_words, self.PCFG.nb_all_symbols, 3))
        # split_reaching_max[s, l, a, 0] stores the index cut
        # split_reaching_max[s, l, a, 1] stores the symbol b
        # split_reaching_max[s, l, a, 2] stores the symbol c
        # such that
        # (i) b derives x_s...x_(s+cut), c derives x_(s+cut)...x_(s+l),
        #     and a rewrites bc (a -> bc is in the grammar)
        # (ii) the splitting <cut, b, c> defined by (i) is the one enabling
        #      to reach the maximum probability for a to derive x_s...x_(s+l)
        #      (i.e. enabling to reach max_proba_derivation[s, l, a])

        # probabilities of tags for unary strings (words)
        for (position_word, word) in enumerate(sentence):
            token_to_tag = word
            if not (word in self.OOV.words_lexicon):
                if viz_oov:
                    print(word + " is an OOV")
                token_to_tag = self.OOV.closest_in_corpus(word, viz_closest=viz_oov)
                if viz_oov:
                    if token_to_tag is None:
                        print("No closest token found")
                        print("")
                    else:
                        print("Closest token found: " + token_to_tag)
                        print("")
            if token_to_tag is None:
                for (tag, counts) in self.PCFG.freq_terminal_tags.items():
                    if tag in self.symbol_to_id:
                        # avoid the case where a tag appears in the lexicon
                        # but not in the grammar rules
                        id_tag = self.symbol_to_id[tag]
                        max_proba_derivation[position_word, 0, id_tag] = counts
            else:
                for (tag, proba) in self.lexicon_inverted[token_to_tag].items():
                    if tag in self.symbol_to_id:
                        # avoid the case where a tag appears in the lexicon
                        # but not in the grammar rules
                        id_tag = self.symbol_to_id[tag]
                        max_proba_derivation[position_word, 0, id_tag] = proba
        for l in range(1, nb_words):
            # we consider symbols deriving strings of length l+1...
            for s in range(nb_words - l):
                # ... starting at index s of the sentence
                for idx_root_tag in self.grammar_ids:
                    # ... root_tag is the symbol deriving the considered string
                    # (rule left-hand term)
                    for cut in range(0, l):
                        # ... such a symbol can rewrite as two symbols AB, with A deriving
                        # the substring up to index cut included, and B deriving the
                        # substring from index cut+1
                        for idx_left_tag in self.grammar_ids[idx_root_tag]:  # left symbol A
                            proba_left_derivation = max_proba_derivation[s, cut, idx_left_tag]
                            if proba_left_derivation > max_proba_derivation[s, l, idx_root_tag]:
                                for (idx_right_tag, proba_split) in \
                                        self.grammar_ids[idx_root_tag][idx_left_tag].items():
                                    # right symbol B
                                    proba_right_derivation = max_proba_derivation[
                                        s + cut + 1, l - cut - 1, idx_right_tag]
                                    proba_decomposition = (proba_split * proba_left_derivation
                                                           * proba_right_derivation)
                                    if proba_decomposition > max_proba_derivation[s, l, idx_root_tag]:
                                        # we found a new decomposition <cut, split[0], split[1]>
                                        # reaching a higher probability for root_tag to derive
                                        # x_s...x_(s+l)
                                        max_proba_derivation[s, l, idx_root_tag] = proba_decomposition
                                        split_reaching_max[s, l, idx_root_tag, 0] = cut
                                        split_reaching_max[s, l, idx_root_tag, 1] = idx_left_tag
                                        split_reaching_max[s, l, idx_root_tag, 2] = idx_right_tag
        self.max_proba_derivation = max_proba_derivation
        self.split_reaching_max = split_reaching_max.astype(int)

    def parse_substring(self, s, l, idx_root_tag, sentence):
        # parse the substring of the sentence beginning at index s, of length l+1,
        # and tagged as idx_root_tag
        if l == 0:
            return sentence[s]
        else:
            # split enabling to reach max_proba_derivation[s, l, idx_root_tag]
            cut = self.split_reaching_max[s, l, idx_root_tag, 0]
            idx_left_tag = self.split_reaching_max[s, l, idx_root_tag, 1]
            idx_right_tag = self.split_reaching_max[s, l, idx_root_tag, 2]
            left_tag = self.PCFG.list_all_symbols[idx_left_tag]
            right_tag = self.PCFG.list_all_symbols[idx_right_tag]
            return [[left_tag, self.parse_substring(s, cut, idx_left_tag, sentence)],
                    [right_tag, self.parse_substring(s + cut + 1, l - cut - 1,
                                                     idx_right_tag, sentence)]]

    def remove_artificial_symbols(self, T):
        # remove artificial symbols from T, the tree structure encoding the parse
        # debinarize: remove artificial symbols of type X|X1X2X3... (coming from the
        # BIN rule), attaching the children of an artificial symbol to its own father
        nodes = deepcopy(T.nodes)
        for node in nodes:
            children = list(T.successors(node))
            if len(children) == 0:
                pass
            elif len(children) == 1 and len(list(T.successors(children[0]))) == 0:
                pass
            else:
                father = list(T.predecessors(node))
                if len(father) == 0:
                    pass
                else:
                    symbol = T.nodes[node]["name"]
                    if (self.symbol_to_id[symbol] >= self.PCFG.nb_tags) and ("|" in symbol):
                        # artificial symbol from the BIN rule
                        for child in T.successors(node):
                            T.add_edge(father[0], child)
                        T.remove_node(node)
        # add pre-terminal symbols: remove artificial symbols of type A&B (coming from
        # the UNIT rule), decomposing A&B into two symbols A and B
        # (A father of B, B father of the word)
        max_id_node = np.max(T.nodes())
        nodes = deepcopy(T.nodes)
        for node in nodes:
            children = list(T.successors(node))
            if len(children) == 0 or len(list(T.predecessors(node))) == 0:
                pass
            elif len(children) == 1 and len(list(T.successors(children[0]))) == 0:
                symbol = T.nodes[node]["name"]
                if (self.symbol_to_id[symbol] >= self.PCFG.nb_tags) and ("&" in symbol):
                    # artificial symbol from the UNIT rule
                    word = children[0]
                    idx_cut = None
                    for (idx, c) in enumerate(symbol):
                        if c == "&":
                            idx_cut = idx
                    T.nodes[node]["name"] = symbol[:idx_cut]
                    idx_pre_terminal_node = max_id_node + 1
                    T.add_node(idx_pre_terminal_node, name=symbol[idx_cut + 1:])
                    max_id_node += 1
                    T.remove_edge(node, word)
                    T.add_edge(node, idx_pre_terminal_node)
                    T.add_edge(idx_pre_terminal_node, word)

    def reformat_parsing(self, parsing):
        # convert a parse stored as nested lists into the required format
        # (with nested brackets)
        if type(parsing) == str:
            return parsing
        else:
            string = ""
            for el in parsing:
                root_tag = el[0]
                parsing_substring = el[1]
                string = (string + "(" + root_tag + " "
                          + self.reformat_parsing(parsing_substring) + ")" + " ")
            string = string[:-1]
            return string

    def parse(self, sentence, remove_artificial_symbols=True, viz_oov=False):
        # parse a sentence
        # remove_artificial_symbols: if False, keep the Chomsky artificial symbols
        # viz_oov: if True, print how OOV words are handled
        sentence = sentence.split()
        nb_words = len(sentence)
        if nb_words > 1:
            self.compute_CYK_tables(sentence, viz_oov=viz_oov)
            idx_root_tag = self.symbol_to_id["SENT"]
            if self.max_proba_derivation[0][nb_words - 1][idx_root_tag] == 0:
                # no valid parse
                return None
            parsing_list = self.parse_substring(0, nb_words - 1, idx_root_tag, sentence)
        else:
            word = sentence[0]
            token_to_tag = self.OOV.closest_in_corpus(word, viz_closest=viz_oov)
            if token_to_tag is None:
                tag = max(self.PCFG.freq_terminal_tags, key=self.PCFG.freq_terminal_tags.get)
            else:
                tag = max(self.lexicon_inverted[token_to_tag],
                          key=self.lexicon_inverted[token_to_tag].get)
            parsing_list = "(" + tag + " " + word + ")"
        if remove_artificial_symbols:
            # convert the parse stored as a string into a tree
            T = postagged_sent_to_tree("( (SENT " + self.reformat_parsing(parsing_list) + "))",
                                       remove_after_hyphen=False)
            # nx.draw(T, labels=nx.get_node_attributes(T, "name"), arrows=False,
            #         pos=graphviz_layout(T, prog='dot'))
            self.remove_artificial_symbols(T)
            return tree_to_postagged_sent(T)  # return the parse as a string
        else:
            return "( (SENT " + self.reformat_parsing(parsing_list) + "))"  # as a string
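# ---- Illustrative sketch (not part of the original code) ----
# remove_artificial_symbols above does tree surgery on a networkx DiGraph whose
# nodes carry a "name" attribute. The toy graph below shows the BIN-rule case:
# the children of an artificial "SENT|VP+PONCT" node are reattached to its
# father before the node is removed. Node ids and names are made up.
import networkx as nx

T = nx.DiGraph()
T.add_node(0, name="SENT")
T.add_node(1, name="NP")
T.add_node(2, name="SENT|VP+PONCT")  # artificial symbol from binarization
T.add_node(3, name="VP")
T.add_node(4, name="PONCT")
T.add_edges_from([(0, 1), (0, 2), (2, 3), (2, 4)])

artificial = 2
father = list(T.predecessors(artificial))[0]
for child in list(T.successors(artificial)):
    T.add_edge(father, child)
T.remove_node(artificial)

print(sorted(T.nodes[n]["name"] for n in T.successors(0)))  # ['NP', 'PONCT', 'VP']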
import numpy as np
from copy import deepcopy

# PCFG, OOV, tagged_sent_to_tree and tree_to_sentence are project-local helpers
# assumed to be defined elsewhere.


class CYK:
    """Class for applying the CYK algorithm"""

    def __init__(self, corpus_train):
        self.PCFG = PCFG(corpus_train)
        self.OOV = OOV(self.PCFG.lexicon, self.PCFG.list_all_tags, self.PCFG.freq_tokens)
        self.tag_to_id = {tag: i for (i, tag) in enumerate(self.PCFG.list_all_tags)}
        self.lexicon_inverted = {word: {} for word in self.OOV.words_lexicon}
        for tag in self.PCFG.lexicon:
            for word in self.PCFG.lexicon[tag]:
                self.lexicon_inverted[word][tag] = self.PCFG.lexicon[tag][word]
        # self.grammar_dicts[X][Y][Z] stores P(rule X->YZ)
        self.grammar_dicts = {}
        for (root_tag, rules) in self.PCFG.grammar.items():
            # root_tag is the left-hand tag of the grammar rule
            idx_root_tag = self.tag_to_id[root_tag]
            self.grammar_dicts[idx_root_tag] = {}
            dico = {}
            for (split, proba) in rules.items():
                # split is the right-hand term, and proba the probability of the rule
                idx_left_tag = self.tag_to_id[split[0]]
                idx_right_tag = self.tag_to_id[split[1]]
                if idx_left_tag in dico:
                    dico[idx_left_tag][idx_right_tag] = proba
                else:
                    dico[idx_left_tag] = {idx_right_tag: proba}
            self.grammar_dicts[idx_root_tag] = dico

    def list_to_sentence(self, parsing):
        """Go from the list to the string representation"""
        if type(parsing) == str:
            return parsing
        else:
            string = ""
            for p in parsing:
                root_tag = p[0]
                parsing_substring = p[1]
                string = (string + "(" + root_tag + " "
                          + self.list_to_sentence(parsing_substring) + ")" + " ")
            string = string[:-1]  # Remove the extra space
            return string

    def parse_substring(self, s, l, idx_root_tag, sentence):
        """Parse part of a sentence into a list"""
        if l == 0:
            return sentence[s]
        else:
            # split enabling to reach max_proba_derivation[s, l, idx_root_tag]
            cut = self.cyk_matrix[s, l, idx_root_tag, 0]
            idx_left_tag = self.cyk_matrix[s, l, idx_root_tag, 1]
            idx_right_tag = self.cyk_matrix[s, l, idx_root_tag, 2]
            left_tag = self.PCFG.list_all_tags[idx_left_tag]
            right_tag = self.PCFG.list_all_tags[idx_right_tag]
            return [[left_tag, self.parse_substring(s, cut, idx_left_tag, sentence)],
                    [right_tag, self.parse_substring(s + cut + 1, l - cut - 1,
                                                     idx_right_tag, sentence)]]

    def clean_tags(self, tree):
        """Remove artificial tags and de-telescope tags"""
        # remove artificial tags of type X|X1X2X3... (coming from the BIN rule)
        nodes = deepcopy(tree.nodes)
        for node in nodes:
            children = list(tree.successors(node))
            if len(children) == 0:
                pass
            elif len(children) == 1 and len(list(tree.successors(children[0]))) == 0:
                pass
            else:
                father = list(tree.predecessors(node))
                if len(father) == 0:
                    pass
                else:
                    tag = tree.nodes[node]["name"]
                    if (self.tag_to_id[tag] >= self.PCFG.nb_tags) and ("|" in tag):
                        # artificial tag from the BIN rule
                        for child in tree.successors(node):
                            tree.add_edge(father[0], child)
                        tree.remove_node(node)
        # decomposing (A&B w) into (A (B w))
        max_id_node = np.max(tree.nodes())
        nodes = deepcopy(tree.nodes)
        for node in nodes:
            children = list(tree.successors(node))
            if len(children) == 0 or len(list(tree.predecessors(node))) == 0:
                pass
            elif len(children) == 1 and len(list(tree.successors(children[0]))) == 0:
                tag = tree.nodes[node]["name"]
                if (self.tag_to_id[tag] >= self.PCFG.nb_tags) and ("&" in tag):
                    # artificial tag from the UNIT rule
                    word = children[0]
                    idx_cut = None
                    for (idx, c) in enumerate(tag):
                        if c == "&":
                            idx_cut = idx
                    tree.nodes[node]["name"] = tag[:idx_cut]
                    idx_pre_terminal_node = max_id_node + 1
                    tree.add_node(idx_pre_terminal_node, name=tag[idx_cut + 1:])
                    max_id_node += 1
                    tree.remove_edge(node, word)
                    tree.add_edge(node, idx_pre_terminal_node)
                    tree.add_edge(idx_pre_terminal_node, word)

    def compute_CYK(self, sentence, viz_oov=False):
        """Apply the CYK algorithm (heavily influenced by
        https://en.wikipedia.org/wiki/CYK_algorithm)"""
        n = len(sentence)
        prob_matrix = np.zeros((n, n, self.PCFG.nb_all_tags))
        cyk_matrix = np.zeros((n, n, self.PCFG.nb_all_tags, 3))
        # probabilities of tags for the unary rule
        for (position_word, word) in enumerate(sentence):
            token_to_tag = word
            if not (word in self.OOV.words_lexicon):
                token_to_tag = self.OOV.closest_in_corpus(word, viz_closest=viz_oov)
            if token_to_tag is None:
                for (tag, counts) in self.PCFG.freq_terminal_tags.items():
                    if tag in self.tag_to_id:
                        id_tag = self.tag_to_id[tag]
                        prob_matrix[position_word, 0, id_tag] = counts
            else:
                for (tag, proba) in self.lexicon_inverted[token_to_tag].items():
                    if tag in self.tag_to_id:
                        id_tag = self.tag_to_id[tag]
                        prob_matrix[position_word, 0, id_tag] = proba
        for l in range(1, n):
            for s in range(n - l):
                for idx_root_tag in self.grammar_dicts:
                    for cut in range(0, l):
                        for idx_left_tag in self.grammar_dicts[idx_root_tag]:
                            proba_left_derivation = prob_matrix[s, cut, idx_left_tag]
                            if proba_left_derivation > prob_matrix[s, l, idx_root_tag]:
                                # skips useless iterations
                                for (idx_right_tag, proba_split) in \
                                        self.grammar_dicts[idx_root_tag][idx_left_tag].items():
                                    proba_right_derivation = prob_matrix[
                                        s + cut + 1, l - cut - 1, idx_right_tag]
                                    proba_decomposition = (proba_split * proba_left_derivation
                                                           * proba_right_derivation)
                                    if proba_decomposition > prob_matrix[s, l, idx_root_tag]:
                                        prob_matrix[s, l, idx_root_tag] = proba_decomposition
                                        cyk_matrix[s, l, idx_root_tag] = [cut, idx_left_tag,
                                                                          idx_right_tag]
        self.prob_matrix = prob_matrix
        self.cyk_matrix = cyk_matrix.astype(int)

    def parse(self, sentence, viz_oov=False):
        """Return a parsed and tagged sentence from a natural sentence"""
        sentence = sentence.split()
        nb_words = len(sentence)
        if nb_words > 1:
            self.compute_CYK(sentence, viz_oov=viz_oov)
            idx_root_tag = self.tag_to_id["SENT"]
            if self.prob_matrix[0][nb_words - 1][idx_root_tag] == 0:
                # no valid parse
                return None
            parsing_list = self.parse_substring(0, nb_words - 1, idx_root_tag, sentence)
        else:
            word = sentence[0]
            token_to_tag = self.OOV.closest_in_corpus(word, viz_closest=viz_oov)
            if token_to_tag is None:
                tag = max(self.PCFG.freq_terminal_tags, key=self.PCFG.freq_terminal_tags.get)
            else:
                tag = max(self.lexicon_inverted[token_to_tag],
                          key=self.lexicon_inverted[token_to_tag].get)
            parsing_list = "(" + tag + " " + word + ")"
        # converting the parse stored as a string into a tree
        tree = tagged_sent_to_tree("( (SENT " + self.list_to_sentence(parsing_list) + "))",
                                   remove_after_hyphen=False)
        self.clean_tags(tree)
        return tree_to_sentence(tree)
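# ---- Illustrative sketch (not part of the original code) ----
# The lexicon_inverted construction in __init__ above flips a tag -> word -> prob
# mapping into word -> tag -> prob, so that CYK can look up P(tag -> word) for
# each word of the sentence. The probabilities below are made up.
lexicon = {"DET": {"le": 0.6, "la": 0.4},
           "NC": {"la": 0.1, "chat": 0.9}}
lexicon_inverted = {word: {} for tags in lexicon.values() for word in tags}
for tag in lexicon:
    for word in lexicon[tag]:
        lexicon_inverted[word][tag] = lexicon[tag][word]
print(lexicon_inverted["la"])  # {'DET': 0.4, 'NC': 0.1}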
import sys

import nltk

# PCFG, OOV, CYKSolver and the variables trainfilename / embedfilename are
# assumed to be defined earlier in the original script.

trees = []
with open(trainfilename, 'r') as f:
    for line in f:
        trees.append(nltk.Tree.fromstring(line))

# preprocess the tree forms: ignore functional labels and binarize to CNF
for tree in trees:
    # ignore_func_labels(tree)
    tree.chomsky_normal_form(horzMarkov=2)

# learn the PCFG
lexicon, grammar, vocabulary, symbols = PCFG(trees)

# for OOV
oovwords = OOV(embedfilename, vocabulary)

# parse new sentences with CYK, based on the learned PCFG
parser = CYKSolver(lexicon, grammar, vocabulary, symbols, oovwords)
for line in sys.stdin:
    if line == '\n':
        continue
    parsedtree = parser.compute(line.split())
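# ---- Illustrative sketch (not part of the original code) ----
# What chomsky_normal_form(horzMarkov=2) does to a ternary node: nltk introduces
# artificial "|" symbols that remember at most two horizontal siblings. The toy
# tree is made up; the printed shape should look roughly like the comment below.
import nltk

t = nltk.Tree.fromstring("(S (NP D N) (VP V NP PP))")
t.chomsky_normal_form(horzMarkov=2)
print(t)
# expected shape: (S (NP D N) (VP V (VP|<NP-PP> NP PP)))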