import numpy as np
from copy import deepcopy

# PCFG, OOV, tagged_sent_to_tree, list_to_parsed_sentence and tree_to_sentence
# are project-local helpers assumed to be defined elsewhere.


class CYK:
    def __init__(self, corpus):
        # PCFG and OOV class
        self.pcfg = PCFG(corpus)
        self.oov = OOV(self.pcfg.lexicon, self.pcfg.list_all_tags, self.pcfg.tokens)
        # Initialize the CYK probability matrix
        self.proba_matrix = None
        self.cyk_matrix = None

    # Apply the CYK algorithm
    def CYK_algorithm(self, sentence):
        # Initialize
        n = len(sentence)
        r = self.pcfg.nb_all_tags
        P = np.zeros((n, n, r))
        cyk_matrix = np.zeros((n, n, r, 3))
        # First level P[0, :, :]
        for idx_word, word in enumerate(sentence):
            # Get the closest word in the lexicon
            word = self.oov.closest_word(word)
            if word is None:
                for idx_tag, tag in enumerate(self.pcfg.list_all_tags):
                    if tag in self.pcfg.terminal_tags:
                        P[0, idx_word, idx_tag] = self.pcfg.terminal_tags[tag]
            else:
                for idx_tag, tag in enumerate(self.pcfg.list_all_tags):
                    if tag in self.pcfg.inv_lexicon[word]:
                        P[0, idx_word, idx_tag] = self.pcfg.inv_lexicon[word][tag]
        # Other levels
        for l in range(1, n):
            for s in range(n - l):
                for tag in self.pcfg.grammar:
                    idx_tag = self.pcfg.dic_all_tags[tag]
                    for p in range(l):
                        for rule in self.pcfg.grammar[tag]:
                            left_tag = rule.split(' ')[0]
                            right_tag = rule.split(' ')[1]
                            b = self.pcfg.dic_all_tags[left_tag]
                            c = self.pcfg.dic_all_tags[right_tag]
                            prob_splitting = (self.pcfg.grammar[tag][rule]
                                              * P[p, s, b] * P[l - p - 1, s + p + 1, c])
                            if prob_splitting > P[l, s, idx_tag]:
                                P[l, s, idx_tag] = prob_splitting
                                cyk_matrix[l, s, idx_tag] = [p, b, c]
        self.proba_matrix = P
        self.cyk_matrix = cyk_matrix.astype(int)

    # Remove new tags and de-telescope tags
    def clean_tags(self, tree):
        # remove new tags of type X|X1X2... (coming from the BIN rule)
        nodes = deepcopy(tree.nodes)
        for node in nodes:
            children = list(tree.successors(node))
            if len(children) == 0:
                pass
            elif len(children) == 1 and len(list(tree.successors(children[0]))) == 0:
                pass
            else:
                parent = list(tree.predecessors(node))
                if len(parent) == 0:
                    pass
                else:
                    tag = tree.nodes[node]["name"]
                    if (self.pcfg.dic_all_tags[tag] >= self.pcfg.nb_tags) and ("|" in tag):
                        for child in tree.successors(node):
                            tree.add_edge(parent[0], child)
                        tree.remove_node(node)
        # Decomposing A&B -> w into A -> B -> w
        max_node = np.max(tree.nodes())
        nodes = deepcopy(tree.nodes)
        for node in nodes:
            children = list(tree.successors(node))
            if len(children) == 0 or len(list(tree.predecessors(node))) == 0:
                pass
            elif len(children) == 1 and len(list(tree.successors(children[0]))) == 0:
                tag = tree.nodes[node]["name"]
                if (self.pcfg.dic_all_tags[tag] >= self.pcfg.nb_tags) and ("&" in tag):
                    # new tag from the UNIT rule
                    word = children[0]
                    idx_cut = None
                    for (idx, c) in enumerate(tag):
                        if c == "&":
                            idx_cut = idx
                    tree.nodes[node]["name"] = tag[:idx_cut]
                    idx_pre_terminal_node = max_node + 1
                    tree.add_node(idx_pre_terminal_node, name=tag[idx_cut + 1:])
                    max_node += 1
                    tree.remove_edge(node, word)
                    tree.add_edge(node, idx_pre_terminal_node)
                    tree.add_edge(idx_pre_terminal_node, word)

    # Parse part of a sentence
    def parse_substring(self, s, l, idx_tag, sentence):
        if l == 0:
            return sentence[s]
        else:
            cut = self.cyk_matrix[l, s, idx_tag, 0]
            idx_left_tag = self.cyk_matrix[l, s, idx_tag, 1]
            idx_right_tag = self.cyk_matrix[l, s, idx_tag, 2]
            left_tag = self.pcfg.list_all_tags[idx_left_tag]
            right_tag = self.pcfg.list_all_tags[idx_right_tag]
            return [[left_tag, self.parse_substring(s, cut, idx_left_tag, sentence)],
                    [right_tag, self.parse_substring(s + cut + 1, l - cut - 1,
                                                     idx_right_tag, sentence)]]

    # Returns the parsed sentence
    def parse(self, sentence):
        sentence = sentence.split(' ')
        length_sentence = len(sentence)
        if length_sentence > 1:
            self.CYK_algorithm(sentence)
            idx_root_tag = self.pcfg.dic_all_tags['SENT']
            if self.proba_matrix[length_sentence - 1][0][idx_root_tag] == 0:
                # no valid parsing
                return None
            parsing_list = self.parse_substring(0, length_sentence - 1, idx_root_tag, sentence)
        else:
            word = sentence[0]
            word_lexicon = self.oov.closest_word(word)
            if word_lexicon is None:
                tag = max(self.pcfg.terminal_tags, key=self.pcfg.terminal_tags.get)
            else:
                tag = max(self.pcfg.inv_lexicon[word_lexicon],
                          key=self.pcfg.inv_lexicon[word_lexicon].get)
            parsing_list = '(' + tag + ' ' + word + ')'
        # converting the parsing stored as a string into a tree
        tree = tagged_sent_to_tree("( (SENT " + list_to_parsed_sentence(parsing_list) + "))",
                                   remove_after_hyphen=False)
        self.clean_tags(tree)
        return tree_to_sentence(tree)
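# ---- Illustrative sketch (not part of the original code) ----
# The nested loops in CYK_algorithm above implement the classic CYK recurrence
#   P[l, s, A] = max over cut p and rules A -> B C of
#                P(A -> B C) * P[p, s, B] * P[l-p-1, s+p+1, C]
# Below is a minimal, self-contained rendering of that recurrence on a made-up
# toy grammar and sentence; only the dynamic program itself mirrors the class.
import numpy as np


def toy_cyk(words, lexicon, rules, tags):
    # lexicon: {(tag, word): prob}, rules: {(A, B, C): prob}, tags: list of tags
    tag_id = {t: i for i, t in enumerate(tags)}
    n, r = len(words), len(tags)
    P = np.zeros((n, n, r))
    for s, w in enumerate(words):  # length-1 spans (words)
        for (tag, word), prob in lexicon.items():
            if word == w:
                P[0, s, tag_id[tag]] = prob
    for l in range(1, n):  # longer spans, bottom-up
        for s in range(n - l):
            for (A, B, C), prob in rules.items():
                for p in range(l):
                    cand = prob * P[p, s, tag_id[B]] * P[l - p - 1, s + p + 1, tag_id[C]]
                    P[l, s, tag_id[A]] = max(P[l, s, tag_id[A]], cand)
    return P


tags = ["S", "NP", "VP", "Det", "N", "V"]
lexicon = {("Det", "the"): 1.0, ("N", "cat"): 0.5, ("N", "dog"): 0.5, ("V", "sees"): 1.0}
rules = {("S", "NP", "VP"): 1.0, ("NP", "Det", "N"): 1.0, ("VP", "V", "NP"): 1.0}
words = "the cat sees the dog".split()
P = toy_cyk(words, lexicon, rules, tags)
print(P[len(words) - 1, 0, tags.index("S")])  # probability of the best S parse: 0.25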
import re
from collections import deque

from tqdm import tqdm

# Node and OOV are project-local helpers assumed to be defined elsewhere.


class PCFG(object):
    def __init__(self, fname):
        self.count1 = {}
        self.count2 = {}
        self.count3 = {}
        self.rules = {}
        self.lexicon = {}
        self.nt = []
        self.oov = OOV('polyglot-fr.pkl')
        with open(fname, 'r') as f:
            self.training_corpus = f.readlines()

    def parse_corpus(self):
        ''' Parse the sentences of a corpus to compute a probabilistic grammar in CNF '''
        for sentence in self.training_corpus:
            root = self._parse_tree(sentence)
            self._update_rules(root)
        self._compute_probabilities()
        self._convert_to_cnf()

    def _parse_tree(self, sentence):
        ''' Parse the tree in string format and convert it into a data structure '''
        list_words = re.split(r'(\(|\))', sentence)
        root = Node('ROOT')
        curr_node = root
        for i in range(1, len(list_words) - 2, 2):  # Ignore first and last parenthesis
            word = list_words[i] + list_words[i + 1]
            if i == 1:
                continue
            if word[0] == '(':
                split = word[1:].split(' ', 1)
                non_terminal = split[0].split('-', 1)[0]  # Ignore hyphen
                if split[-1] == '':
                    new_node = Node(non_terminal)
                else:
                    terminal = split[1]
                    new_node = Node(non_terminal, anchor=terminal)
                curr_node.add_child(new_node)
                new_node.add_parent(curr_node)
                curr_node = new_node
            else:
                curr_node = curr_node.parents[-1]
        return root

    def _update_rules(self, root):
        ''' Perform a BFS on the derived tree to count the rules '''
        queue = deque()
        queue.append(root)
        marked = [root]
        while queue:
            node = queue.pop()
            rule = node.data + ' ->'
            for child in node.children:
                if child not in marked:
                    rule += ' ' + child.data
                    queue.append(child)
                    marked.append(child)
            if node.is_terminal:
                rule += " '" + node.anchor + "'"
                # Keep track of the count of ... -> anchor
                if node.anchor in self.count3:
                    self.count3[node.anchor] += 1
                else:
                    self.count3[node.anchor] = 1
            # Keep track of the count of alpha -> beta
            if rule in self.count1:
                self.count1[rule] += 1
            else:
                self.count1[rule] = 1
            # Keep track of the count of alpha -> ...
            if node.data in self.count2:
                self.count2[node.data] += 1
            else:
                self.count2[node.data] = 1

    def _compute_probabilities(self):
        ''' Compute the probability of each parsed rule from the counts gathered
            over the derived trees of the corpus '''
        for rule in self.count1:
            split = rule.split('->')
            rhs = split[-1].strip()
            lhs = split[0].strip()
            if rhs[0] == "'" and rhs[-1] == "'":  # if anchor
                rhs = rhs[1:-1]  # remove the quotes
                pair = (lhs, self.count1[rule] / self.count2[lhs])
                if rhs in self.lexicon:
                    self.lexicon[rhs].append(pair)
                else:
                    self.lexicon[rhs] = [pair]
                if lhs not in self.nt:
                    self.nt.append(lhs)
            else:
                self.rules[rule] = self.count1[rule] / self.count2[lhs]

    def _add_to_dict(self, key, value, dictionary):
        ''' Add a pair (key, value) to dictionary '''
        if key in dictionary:
            if value not in dictionary[key]:
                dictionary[key].append(value)
        else:
            dictionary[key] = [value]

    def _add_to_nt_list(self, value):
        ''' Add a value to the non-terminal list '''
        if value not in self.nt:
            self.nt.append(value)

    def _convert_to_cnf(self):
        ''' Transform N-ary rules (N > 2) into binary rules '''
        binary_rules = {}
        unary_rules = {}
        for rule in self.rules:
            proba = self.rules[rule]
            split = rule.split('->')
            lhs = split[0].strip()
            rhs = split[-1].strip()
            symbols = rhs.split(' ')
            if len(symbols) > 2:  # if more than two symbols in the rule
                new_symbol = lhs + '|' + "+".join(symbols[1:])
                new_rhs = symbols[0] + ' ' + new_symbol
                self._add_to_dict(lhs, (new_rhs, proba), binary_rules)
                self._add_to_nt_list(lhs)
                for i in range(1, len(symbols) - 2):
                    new_lhs = new_symbol
                    new_symbol = lhs + '|' + "+".join(symbols[i + 1:])
                    new_rhs = symbols[i] + ' ' + new_symbol
                    self._add_to_dict(new_lhs, (new_rhs, 1), binary_rules)
                    self._add_to_nt_list(new_lhs)
                new_rhs = symbols[-2] + ' ' + symbols[-1]
                self._add_to_dict(new_symbol, (new_rhs, 1), binary_rules)
                self._add_to_nt_list(new_symbol)
            elif len(symbols) == 2:
                self._add_to_dict(lhs, (rhs, self.rules[rule]), binary_rules)
                self._add_to_nt_list(lhs)
            else:
                self._add_to_dict(lhs, (rhs, self.rules[rule]), unary_rules)
                self._add_to_nt_list(lhs)
        self.unary_rules = unary_rules
        self.binary_rules = binary_rules
        self.oov.vocabulary = list(self.lexicon.keys())

    def cky(self, original_sequence, substitute_sequence):
        ''' Implement the probabilistic CKY algorithm '''
        n = len(original_sequence)
        best = [[{} for i in range(n + 1)] for j in range(n + 1)]
        back = [[{} for i in range(n + 1)] for j in range(n + 1)]
        # Init
        for i in range(n + 1):
            for j in range(n + 1):
                for X in self.nt:
                    best[i][j][X] = 0
        # Handle the terminal lexicon
        for i in range(1, n + 1):
            substitute_word = substitute_sequence[i - 1]
            original_word = original_sequence[i - 1]
            for X, p in self.lexicon[substitute_word]:
                if p > best[i - 1][i][X]:
                    best[i - 1][i][X] = p
                    back[i - 1][i][X] = original_word
            # Handle unary rules
            self._handle_unary(back, best, i - 1, i)
        for l in range(2, n + 1):
            for i in range(n - l + 1):
                j = i + l
                for k in range(i + 1, j):
                    # Handle binary rules
                    for X in self.binary_rules:
                        for rhs, p in self.binary_rules[X]:
                            Y, Z = rhs.split(' ')
                            p_prime = p * best[i][k][Y] * best[k][j][Z]
                            if p_prime > best[i][j][X]:
                                best[i][j][X] = p_prime
                                back[i][j][X] = (k, Y, Z)
                # Handle unary rules
                self._handle_unary(back, best, i, j)
        return back, best

    def _handle_unary(self, back, best, i, j):
        ''' Auxiliary function that treats unary rules in the probabilistic CKY algorithm '''
        again = True
        while again:
            again = False
            for X in self.unary_rules:
                for rhs, p in self.unary_rules[X]:
                    Y = rhs.split(' ')[0]
                    p_prime = p * best[i][j][Y]
                    if p_prime > best[i][j][X]:
                        best[i][j][X] = p_prime
                        back[i][j][X] = Y
                        again = True

    def build_tree(self, i, j, non_terminal, back):
        ''' Generate the tree from the backpointers computed by the P-CKY algorithm '''
        node = back[i][j][non_terminal]
        tree = Node(non_terminal)
        if type(node) is tuple:  # If binary
            k, left_non_terminal, right_non_terminal = node
            left_node = self.build_tree(i, k, left_non_terminal, back)
            right_node = self.build_tree(k, j, right_non_terminal, back)
            tree.add_child(left_node)
            tree.add_child(right_node)
            left_node.add_parent(tree)
            right_node.add_parent(tree)
            return tree
        else:  # If unary
            if node in self.nt:  # If not an anchor
                left_node = self.build_tree(i, j, node, back)
                tree.add_child(left_node)
                left_node.add_parent(tree)
                return tree
            else:  # If anchor
                return Node(non_terminal, node)

    def generate_tree(self, sequence):
        ''' Generate the parse tree corresponding to the most likely derivation
            of the input sequence '''
        original_sequence = sequence.split()
        substitute_sequence = self.oov.process(original_sequence)
        back, best = self.cky(original_sequence, substitute_sequence)
        if "ROOT" not in back[0][len(original_sequence)]:  # Not parsable
            return None
        tree = self.build_tree(0, len(original_sequence), 'ROOT', back)
        tree.un_cnf()
        return tree

    def compute_accuracy(self, gt_tree, predicted_tree):
        ''' Compute the percentage of tokens for which the parser chooses the
            correct part-of-speech '''
        map_token_pos_gt = {}
        gt_tree.extract_leaves(map_token_pos_gt)
        map_token_pos_pred = {}
        predicted_tree.extract_leaves(map_token_pos_pred)
        acc = 0
        for token in map_token_pos_gt.keys():
            gt_pos = map_token_pos_gt[token]
            pred_pos = map_token_pos_pred[token]
            if gt_pos == pred_pos:
                acc += 1
        return acc / len(map_token_pos_gt.keys())

    def evaluate(self, input_file):
        ''' Compute the average accuracy by comparing the trees parsed with CKY
            against the gold trees of the last 10% of the corpus '''
        mean_accuracy = 0
        with open(input_file, 'r') as f:
            testing_sentences = f.readlines()
        for idx, test_sentence in enumerate(tqdm(testing_sentences)):
            gt_tree = self._parse_tree(test_sentence)
            raw_tokens = gt_tree.compute_raw_tokens()
            predicted_tree = self.generate_tree(raw_tokens)
            if predicted_tree is not None:
                mean_accuracy += self.compute_accuracy(gt_tree, predicted_tree)
            else:  # sentence not parsable
                print("Sentence number %d: \"%s\" not parsable" % (idx, raw_tokens))
        mean_accuracy = mean_accuracy / len(testing_sentences)
        print("Final average accuracy:", mean_accuracy)

    def predict(self, input_file, output_file):
        ''' Predict parse trees for an input file of raw tokens and write them
            to an output file '''
        with open(input_file, 'r') as f:
            testing_sentences = f.readlines()
        with open(output_file, "w") as pred_file:
            for idx, test_sentence in enumerate(tqdm(testing_sentences)):
                predicted_tree = self.generate_tree(test_sentence)
                if predicted_tree is not None:
                    pred_file.write(predicted_tree.to_string() + "\n")
                else:  # sentence not parsable
                    print("Sentence number %d: \"%s\" not parsable" % (idx, test_sentence.strip()))
                    pred_file.write("( )\n")
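# ---- Illustrative sketch (not part of the original code) ----
# A standalone rendering of the binarization scheme used in _convert_to_cnf
# above: an n-ary rule lhs -> X1 X2 ... Xn is split into a chain of binary
# rules through artificial symbols named "lhs|Xi+...+Xn". The sample rule is
# made up for demonstration; the naming convention is the one used above.
def binarize(lhs, symbols):
    """Return the list of binary rules replacing lhs -> ' '.join(symbols)."""
    if len(symbols) <= 2:
        return [(lhs, ' '.join(symbols))]
    rules = []
    new_symbol = lhs + '|' + "+".join(symbols[1:])
    rules.append((lhs, symbols[0] + ' ' + new_symbol))
    for i in range(1, len(symbols) - 2):
        new_lhs = new_symbol
        new_symbol = lhs + '|' + "+".join(symbols[i + 1:])
        rules.append((new_lhs, symbols[i] + ' ' + new_symbol))
    rules.append((new_symbol, symbols[-2] + ' ' + symbols[-1]))
    return rules


print(binarize('SENT', ['NP', 'VP', 'PONCT']))
# [('SENT', 'NP SENT|VP+PONCT'), ('SENT|VP+PONCT', 'VP PONCT')]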
from collections import defaultdict
from itertools import product
from os.path import dirname, join, realpath
from time import time

import numpy as np

# OOV, Tree and levenshtein_distance are project-local helpers assumed to be
# defined elsewhere.


class PCFG:
    def __init__(self):
        self.oov = OOV()
        self.train = []
        self.test = []
        self.poses = set()
        # ex// Un: 56
        self.tokens = defaultdict(int)
        # ex// (A, B): 41
        self.count_grammar = defaultdict(int)
        # ex// (A, a): 11
        self.count_lexicon = defaultdict(int)
        # ex// Un: [N, NP]
        self.token_to_pos = defaultdict(set)
        # ex// (B, C): [A1, A2, A3]
        self.right_to_pos = defaultdict(set)
        # ex// A: 22
        self.preterminals_pos = defaultdict(int)
        # ex// (N, Un): 0.23
        self.prob_pos_to_token = defaultdict(int)
        # ex// (A, (B, C)): 56
        self.count_left_to_right = defaultdict(int)
        # ex// (A, (B, C)): 0.11
        self.prob_left_to_right = defaultdict(int)

    def from_path(self, path):
        """Load and split a treebank located at path"""
        dataset = open(join(dirname(realpath(__file__)), path), 'r').read().splitlines()
        np.random.shuffle(dataset)
        sep1, sep2 = int(len(dataset) * 0.8), int(len(dataset) * 0.9)
        self.train, self.test = dataset[:sep2], dataset[sep2:]

    def count_occurences(self):
        """Count occurrences of the different grammar rules and compute probabilities"""
        for line in self.train:
            new_tree = Tree()
            new_tree.fit(line)
            for pos, _dict in new_tree.count_rules.items():
                self.poses.add(pos)
                for left, count in _dict.items():
                    self.count_grammar[(pos, left)] += count
                    self.right_to_pos[left].add(pos)
            for pos, _dict in new_tree.count_lexicon.items():
                for token, count in _dict.items():
                    self.count_lexicon[(pos, token[0])] += count
                    self.tokens[token[0]] += 1
                    self.token_to_pos[token[0]].add(pos)
        # compute proba for A --> token
        for (pos, token), count in self.count_lexicon.items():
            self.preterminals_pos[pos] += count
        for (pos, token), count in self.count_lexicon.items():
            self.prob_pos_to_token[(pos, token)] = count / self.preterminals_pos[pos]
        # compute proba for A --> BC
        for (pos, _), count in self.count_grammar.items():
            self.count_left_to_right[pos] += count
        for (pos, right_side), count in self.count_grammar.items():
            self.prob_left_to_right[(pos, right_side)] = count / self.count_left_to_right[pos]

    def fit(self):
        """Compute grammar probabilities"""
        self.count_occurences()
        self.proba_grammar = {**self.prob_pos_to_token, **self.prob_left_to_right}
        self.non_terminal = set(x[0] for x in self.proba_grammar.keys())
        self.pos_2_ind = {pos: i for i, pos in enumerate(self.non_terminal)}
        self.ind_2_pos = {v: k for k, v in self.pos_2_ind.items()}

    def pcky(self, tokens):
        """Probabilistic CYK algorithm"""
        since = time()
        # normalize input: OOV module
        words = self.normalize_tokens(tokens)
        N = len(words)
        V = len(self.non_terminal)
        table = np.zeros((N + 1, N + 1, V))
        back = np.zeros((N + 1, N + 1, V), dtype=tuple)
        for j in range(1, N + 1):
            for A in self.token_to_pos[words[j - 1]]:
                table[j - 1, j, self.pos_2_ind[A]] = self.proba_grammar[(A, words[j - 1])]
            for i in range(j - 2, -1, -1):
                for k in range(i + 1, j):
                    ind_B = np.nonzero(table[i, k, :] > 0)[0]
                    B_list = [self.ind_2_pos[x] for x in ind_B]
                    ind_C = np.nonzero(table[k, j, :] > 0)[0]
                    C_list = [self.ind_2_pos[x] for x in ind_C]
                    for BC in product(B_list, C_list):
                        for A in self.right_to_pos[BC]:
                            indA = self.pos_2_ind[A]
                            indB = self.pos_2_ind[BC[0]]
                            indC = self.pos_2_ind[BC[1]]
                            value = self.proba_grammar[(A, BC)] * table[i, k, indB] * table[k, j, indC]
                            if table[i, j, indA] < value:
                                table[i, j, indA] = value
                                back[i, j, indA] = (k, *BC)
        print("Took {}s".format(int(time() - since)))
        if not back[0, N, self.pos_2_ind["SENT"]]:
            return None
        tree = self.build_tree(tokens, back, 0, N, "SENT")
        return " ".join(self.debinarize(tree.split()))

    def build_tree(self, words, back, i, j, pos):
        """Transform the output of CYK into a parenthesized parsed sentence"""
        n = j - i
        if n == 1:
            return " ( " + pos + " " + words[i] + " ) "
        else:
            k, B, C = back[i, j, self.pos_2_ind[pos]]
            return ("( " + pos + " "
                    + self.build_tree(words, back, i, k, B) + " "
                    + self.build_tree(words, back, k, j, C) + ") ")

    def debinarize(self, s):
        """Reverse the Chomsky binarisation"""
        for i, x in enumerate(s):
            if "$" in x and s[i - 1] == "(":
                c = 1
                for j, y in enumerate(s[i + 1:]):
                    if y == '(':
                        c += 1
                    elif y == ")":
                        c -= 1
                    if c == 0:
                        return self.debinarize(s[:i - 1] + s[i + 1:i + 1 + j] + s[i + 1 + j + 1:])
        return s

    def predict(self, line):
        """Predict the parse of a line from the training dataset"""
        new = self.line_to_tokens(line)
        return self.pcky(new)

    def line_to_tokens(self, line):
        """Transform a line from the dataset into a list of tokens"""
        tokenized = line.replace("(", " ( ").replace(")", " ) ").split()[1:-1]
        remove = False
        new = []
        for i, x in enumerate(tokenized):
            if tokenized[i] == "(" and tokenized[i + 1] != "(":
                remove = True
            elif tokenized[i] == "(":
                new.append(x)
            else:
                if not remove:
                    new.append(x)
                else:
                    remove = False
        new = list(filter(lambda x: x not in [')', '('], new))
        return new

    def prepare_line_for_prediction(self, line):
        """Tokenize a line from the dataset"""
        tokenized = line.replace("(", " ( ").replace(")", " ) ").split()[1:-1]
        new = []
        for i, x in enumerate(tokenized):
            if "-" in x and tokenized[i - 1] == "(":
                new.append(x.split("-")[0])
            else:
                new.append(x)
        return " ".join(new)

    def normalize_word(self, word):
        """OOV module: first try a small Levenshtein distance; otherwise return
        the closest word by cosine similarity"""
        if word in self.tokens.keys():
            return word
        lv_distances = defaultdict(list)
        for token in self.tokens.keys():
            distance = levenshtein_distance(word, token)
            for i in range(1, 3):
                if distance == i:
                    lv_distances[i].append(token)
                    break
        for i in range(1, 3):
            if lv_distances[i]:
                return lv_distances[i][0]
        return self.oov.closest_to_tokens(word, self.tokens.keys())

    def normalize_tokens(self, tokens):
        """Apply self.normalize_word to a list of tokens"""
        return [self.normalize_word(token) for token in tokens]
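# ---- Illustrative sketch (not part of the original code) ----
# normalize_word above relies on a levenshtein_distance helper imported from
# the project. A minimal dynamic-programming stand-in with the same call
# signature could look like this (assumption: the project version behaves the
# same way).
def levenshtein_distance(a, b):
    # previous[j] holds the edit distance between the current prefix of a and b[:j]
    previous = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        current = [i]
        for j, cb in enumerate(b, 1):
            current.append(min(previous[j] + 1,                 # deletion
                               current[j - 1] + 1,              # insertion
                               previous[j - 1] + (ca != cb)))   # substitution
        previous = current
    return previous[-1]


assert levenshtein_distance("chat", "chats") == 1
assert levenshtein_distance("chien", "chien") == 0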
import numpy as np
from copy import deepcopy

# PCFG, OOV, postagged_sent_to_tree and tree_to_postagged_sent are
# project-local helpers assumed to be defined elsewhere.


class CYK_Parser:
    # My parser, based on the probabilistic CYK algorithm
    def __init__(self, corpus_train):
        self.PCFG = PCFG(corpus_train)
        self.OOV = OOV(self.PCFG.lexicon, self.PCFG.list_all_symbols, self.PCFG.freq_tokens)
        # note: if the id of a symbol is above self.PCFG.nb_tags,
        # it is an artificial symbol introduced by the Chomsky normalization
        self.symbol_to_id = {symbol: i for (i, symbol) in enumerate(self.PCFG.list_all_symbols)}
        # instead of storing tags, store grammar rules with their corresponding indices
        # in grammar_ids: rules are stored with an additional hierarchical level for speed-up;
        # in other words, self.grammar_ids[X][Y][Z] stores P(rule X -> YZ),
        # where self.grammar_ids, self.grammar_ids[X], and self.grammar_ids[X][Y]
        # are all dictionaries
        self.grammar_ids = {}
        for (root_tag, rules) in self.PCFG.grammar.items():
            # root_tag is the left-hand symbol of the grammar rule
            idx_root_tag = self.symbol_to_id[root_tag]
            self.grammar_ids[idx_root_tag] = {}
            dico = {}
            for (split, proba) in rules.items():
                # split is the right-hand term, and proba the probability of the rule
                idx_left_tag = self.symbol_to_id[split[0]]
                idx_right_tag = self.symbol_to_id[split[1]]
                if idx_left_tag in dico:
                    dico[idx_left_tag][idx_right_tag] = proba
                else:
                    dico[idx_left_tag] = {idx_right_tag: proba}
            self.grammar_ids[idx_root_tag] = dico
        # for a given word, which are its tags, with the corresponding probabilities
        # P(tag -> word)? This is what self.lexicon_inverted stores
        self.lexicon_inverted = {word: {} for word in self.OOV.words_lexicon}
        for tag in self.PCFG.lexicon:
            for word in self.PCFG.lexicon[tag]:
                self.lexicon_inverted[word][tag] = self.PCFG.lexicon[tag][word]

    def compute_CYK_tables(self, sentence, viz_oov=False):
        # compute the CYK tables:
        # - looking for the probabilities of the most likely trees parsing the substrings
        #   of the sentence, for increasing substring length (from 1 to the sentence length)
        # - storing each time the position of the cut and the rule (right-hand term)
        #   enabling to reach the most likely parse tree for a given root tag
        nb_words = len(sentence)
        max_proba_derivation = np.zeros((nb_words, nb_words, self.PCFG.nb_all_symbols))
        # max_proba_derivation[s, l, a] is the maximum probability of
        # a parse where symbol a derives the substring x_s...x_(s+l)
        split_reaching_max = np.zeros((nb_words, nb_words, self.PCFG.nb_all_symbols, 3))
        # split_reaching_max[s, l, a, 0] stores the index cut
        # split_reaching_max[s, l, a, 1] stores the symbol b
        # split_reaching_max[s, l, a, 2] stores the symbol c
        # such that
        # (i) b derives x_s...x_(s+cut), c derives x_(s+cut)...x_(s+l),
        #     and a rewrites bc (a -> bc is in the grammar)
        # (ii) the splitting <cut, b, c> defined by (i) is the one enabling
        #      to reach the maximum probability for a to derive x_s...x_(s+l)
        #      (i.e. enabling to reach max_proba_derivation[s, l, a])

        # probabilities of tags for unary strings (words)
        for (position_word, word) in enumerate(sentence):
            token_to_tag = word
            if not (word in self.OOV.words_lexicon):
                if viz_oov:
                    print(word + " is an OOV")
                token_to_tag = self.OOV.closest_in_corpus(word, viz_closest=viz_oov)
                if viz_oov:
                    if token_to_tag is None:
                        print("No closest token found")
                        print("")
                    else:
                        print("Closest token found: " + token_to_tag)
                        print("")
            if token_to_tag is None:
                for (tag, counts) in self.PCFG.freq_terminal_tags.items():
                    if tag in self.symbol_to_id:
                        # avoid the case where a tag appears in the lexicon
                        # but not in the grammar rules
                        id_tag = self.symbol_to_id[tag]
                        max_proba_derivation[position_word, 0, id_tag] = counts
            else:
                for (tag, proba) in self.lexicon_inverted[token_to_tag].items():
                    if tag in self.symbol_to_id:
                        # avoid the case where a tag appears in the lexicon
                        # but not in the grammar rules
                        id_tag = self.symbol_to_id[tag]
                        max_proba_derivation[position_word, 0, id_tag] = proba
        for l in range(1, nb_words):
            # we consider symbols deriving strings of length l+1...
            for s in range(nb_words - l):
                # ... starting at index s of the sentence
                for idx_root_tag in self.grammar_ids:
                    # ... root_tag is the symbol deriving the considered string
                    # (rule left-hand term)
                    for cut in range(0, l):
                        # ... such a symbol can rewrite as two symbols AB, with A deriving
                        # the substring up to index cut included, and B deriving the
                        # substring from index cut+1
                        for idx_left_tag in self.grammar_ids[idx_root_tag]:  # left symbol A
                            proba_left_derivation = max_proba_derivation[s, cut, idx_left_tag]
                            if proba_left_derivation > max_proba_derivation[s, l, idx_root_tag]:
                                for (idx_right_tag, proba_split) in \
                                        self.grammar_ids[idx_root_tag][idx_left_tag].items():
                                    # right symbol B
                                    proba_right_derivation = max_proba_derivation[
                                        s + cut + 1, l - cut - 1, idx_right_tag]
                                    proba_decomposition = (proba_split * proba_left_derivation
                                                           * proba_right_derivation)
                                    if proba_decomposition > max_proba_derivation[s, l, idx_root_tag]:
                                        # we found a new decomposition <cut, split[0], split[1]>
                                        # reaching a higher probability for root_tag to derive
                                        # x_s...x_(s+l)
                                        max_proba_derivation[s, l, idx_root_tag] = proba_decomposition
                                        split_reaching_max[s, l, idx_root_tag, 0] = cut
                                        split_reaching_max[s, l, idx_root_tag, 1] = idx_left_tag
                                        split_reaching_max[s, l, idx_root_tag, 2] = idx_right_tag
        self.max_proba_derivation = max_proba_derivation
        self.split_reaching_max = split_reaching_max.astype(int)

    def parse_substring(self, s, l, idx_root_tag, sentence):
        # parse the substring of the sentence beginning at index s, of length l+1,
        # and tagged as idx_root_tag
        if l == 0:
            return sentence[s]
        else:
            # split enabling to reach max_proba_derivation[s, l, idx_root_tag]
            cut = self.split_reaching_max[s, l, idx_root_tag, 0]
            idx_left_tag = self.split_reaching_max[s, l, idx_root_tag, 1]
            idx_right_tag = self.split_reaching_max[s, l, idx_root_tag, 2]
            left_tag = self.PCFG.list_all_symbols[idx_left_tag]
            right_tag = self.PCFG.list_all_symbols[idx_right_tag]
            return [[left_tag, self.parse_substring(s, cut, idx_left_tag, sentence)],
                    [right_tag, self.parse_substring(s + cut + 1, l - cut - 1,
                                                     idx_right_tag, sentence)]]

    def remove_artificial_symbols(self, T):
        # remove artificial symbols from T, the tree structure encoding the parse
        # debinarize: remove artificial symbols of type X|X1X2X3... (coming from the
        # BIN rule), attaching the children of an artificial symbol to its own father
        nodes = deepcopy(T.nodes)
        for node in nodes:
            children = list(T.successors(node))
            if len(children) == 0:
                pass
            elif len(children) == 1 and len(list(T.successors(children[0]))) == 0:
                pass
            else:
                father = list(T.predecessors(node))
                if len(father) == 0:
                    pass
                else:
                    symbol = T.nodes[node]["name"]
                    if (self.symbol_to_id[symbol] >= self.PCFG.nb_tags) and ("|" in symbol):
                        # artificial symbol from the BIN rule
                        for child in T.successors(node):
                            T.add_edge(father[0], child)
                        T.remove_node(node)
        # add pre-terminal symbols: remove artificial symbols of type A&B (coming from
        # the UNIT rule), decomposing A&B into two symbols A and B
        # (A father of B, B father of the word)
        max_id_node = np.max(T.nodes())
        nodes = deepcopy(T.nodes)
        for node in nodes:
            children = list(T.successors(node))
            if len(children) == 0 or len(list(T.predecessors(node))) == 0:
                pass
            elif len(children) == 1 and len(list(T.successors(children[0]))) == 0:
                symbol = T.nodes[node]["name"]
                if (self.symbol_to_id[symbol] >= self.PCFG.nb_tags) and ("&" in symbol):
                    # artificial symbol from the UNIT rule
                    word = children[0]
                    idx_cut = None
                    for (idx, c) in enumerate(symbol):
                        if c == "&":
                            idx_cut = idx
                    T.nodes[node]["name"] = symbol[:idx_cut]
                    idx_pre_terminal_node = max_id_node + 1
                    T.add_node(idx_pre_terminal_node, name=symbol[idx_cut + 1:])
                    max_id_node += 1
                    T.remove_edge(node, word)
                    T.add_edge(node, idx_pre_terminal_node)
                    T.add_edge(idx_pre_terminal_node, word)

    def reformat_parsing(self, parsing):
        # convert a parse stored as nested lists into the required format
        # (with nested brackets)
        if type(parsing) == str:
            return parsing
        else:
            string = ""
            for el in parsing:
                root_tag = el[0]
                parsing_substring = el[1]
                string = (string + "(" + root_tag + " "
                          + self.reformat_parsing(parsing_substring) + ")" + " ")
            string = string[:-1]
            return string

    def parse(self, sentence, remove_artificial_symbols=True, viz_oov=False):
        # parse a sentence
        # remove_artificial_symbols: if False, keep the Chomsky artificial symbols
        # viz_oov: if True, print how OOV words are handled
        sentence = sentence.split()
        nb_words = len(sentence)
        if nb_words > 1:
            self.compute_CYK_tables(sentence, viz_oov=viz_oov)
            idx_root_tag = self.symbol_to_id["SENT"]
            if self.max_proba_derivation[0][nb_words - 1][idx_root_tag] == 0:
                # no valid parse
                return None
            parsing_list = self.parse_substring(0, nb_words - 1, idx_root_tag, sentence)
        else:
            word = sentence[0]
            token_to_tag = self.OOV.closest_in_corpus(word, viz_closest=viz_oov)
            if token_to_tag is None:
                tag = max(self.PCFG.freq_terminal_tags, key=self.PCFG.freq_terminal_tags.get)
            else:
                tag = max(self.lexicon_inverted[token_to_tag],
                          key=self.lexicon_inverted[token_to_tag].get)
            parsing_list = "(" + tag + " " + word + ")"
        if remove_artificial_symbols:
            # convert the parse stored as a string into a tree
            T = postagged_sent_to_tree("( (SENT " + self.reformat_parsing(parsing_list) + "))",
                                       remove_after_hyphen=False)
            # nx.draw(T, labels=nx.get_node_attributes(T, "name"), arrows=False,
            #         pos=graphviz_layout(T, prog='dot'))
            self.remove_artificial_symbols(T)
            return tree_to_postagged_sent(T)  # return the parse as a string
        else:
            return "( (SENT " + self.reformat_parsing(parsing_list) + "))"  # as a string
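# ---- Illustrative sketch (not part of the original code) ----
# remove_artificial_symbols above does tree surgery on a networkx DiGraph whose
# nodes carry a "name" attribute. The toy graph below shows the BIN-rule case:
# the children of an artificial "SENT|VP+PONCT" node are reattached to its
# father before the node is removed. Node ids and names are made up.
import networkx as nx

T = nx.DiGraph()
T.add_node(0, name="SENT")
T.add_node(1, name="NP")
T.add_node(2, name="SENT|VP+PONCT")  # artificial symbol from binarization
T.add_node(3, name="VP")
T.add_node(4, name="PONCT")
T.add_edges_from([(0, 1), (0, 2), (2, 3), (2, 4)])

artificial = 2
father = list(T.predecessors(artificial))[0]
for child in list(T.successors(artificial)):
    T.add_edge(father, child)
T.remove_node(artificial)

print(sorted(T.nodes[n]["name"] for n in T.successors(0)))  # ['NP', 'PONCT', 'VP']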
import numpy as np
from copy import deepcopy

# PCFG, OOV, tagged_sent_to_tree and tree_to_sentence are project-local helpers
# assumed to be defined elsewhere.


class CYK:
    """Class for applying the CYK algorithm"""

    def __init__(self, corpus_train):
        self.PCFG = PCFG(corpus_train)
        self.OOV = OOV(self.PCFG.lexicon, self.PCFG.list_all_tags, self.PCFG.freq_tokens)
        self.tag_to_id = {tag: i for (i, tag) in enumerate(self.PCFG.list_all_tags)}
        self.lexicon_inverted = {word: {} for word in self.OOV.words_lexicon}
        for tag in self.PCFG.lexicon:
            for word in self.PCFG.lexicon[tag]:
                self.lexicon_inverted[word][tag] = self.PCFG.lexicon[tag][word]
        # self.grammar_dicts[X][Y][Z] stores P(rule X->YZ)
        self.grammar_dicts = {}
        for (root_tag, rules) in self.PCFG.grammar.items():
            # root_tag is the left-hand tag of the grammar rule
            idx_root_tag = self.tag_to_id[root_tag]
            self.grammar_dicts[idx_root_tag] = {}
            dico = {}
            for (split, proba) in rules.items():
                # split is the right-hand term, and proba the probability of the rule
                idx_left_tag = self.tag_to_id[split[0]]
                idx_right_tag = self.tag_to_id[split[1]]
                if idx_left_tag in dico:
                    dico[idx_left_tag][idx_right_tag] = proba
                else:
                    dico[idx_left_tag] = {idx_right_tag: proba}
            self.grammar_dicts[idx_root_tag] = dico

    def list_to_sentence(self, parsing):
        """Go from the list to the string representation"""
        if type(parsing) == str:
            return parsing
        else:
            string = ""
            for p in parsing:
                root_tag = p[0]
                parsing_substring = p[1]
                string = (string + "(" + root_tag + " "
                          + self.list_to_sentence(parsing_substring) + ")" + " ")
            string = string[:-1]  # Remove the extra space
            return string

    def parse_substring(self, s, l, idx_root_tag, sentence):
        """Parse part of a sentence into a list"""
        if l == 0:
            return sentence[s]
        else:
            # split enabling to reach max_proba_derivation[s, l, idx_root_tag]
            cut = self.cyk_matrix[s, l, idx_root_tag, 0]
            idx_left_tag = self.cyk_matrix[s, l, idx_root_tag, 1]
            idx_right_tag = self.cyk_matrix[s, l, idx_root_tag, 2]
            left_tag = self.PCFG.list_all_tags[idx_left_tag]
            right_tag = self.PCFG.list_all_tags[idx_right_tag]
            return [[left_tag, self.parse_substring(s, cut, idx_left_tag, sentence)],
                    [right_tag, self.parse_substring(s + cut + 1, l - cut - 1,
                                                     idx_right_tag, sentence)]]

    def clean_tags(self, tree):
        """Remove artificial tags and de-telescope tags"""
        # remove artificial tags of type X|X1X2X3... (coming from the BIN rule)
        nodes = deepcopy(tree.nodes)
        for node in nodes:
            children = list(tree.successors(node))
            if len(children) == 0:
                pass
            elif len(children) == 1 and len(list(tree.successors(children[0]))) == 0:
                pass
            else:
                father = list(tree.predecessors(node))
                if len(father) == 0:
                    pass
                else:
                    tag = tree.nodes[node]["name"]
                    if (self.tag_to_id[tag] >= self.PCFG.nb_tags) and ("|" in tag):
                        # artificial tag from the BIN rule
                        for child in tree.successors(node):
                            tree.add_edge(father[0], child)
                        tree.remove_node(node)
        # decomposing (A&B w) into (A (B w))
        max_id_node = np.max(tree.nodes())
        nodes = deepcopy(tree.nodes)
        for node in nodes:
            children = list(tree.successors(node))
            if len(children) == 0 or len(list(tree.predecessors(node))) == 0:
                pass
            elif len(children) == 1 and len(list(tree.successors(children[0]))) == 0:
                tag = tree.nodes[node]["name"]
                if (self.tag_to_id[tag] >= self.PCFG.nb_tags) and ("&" in tag):
                    # artificial tag from the UNIT rule
                    word = children[0]
                    idx_cut = None
                    for (idx, c) in enumerate(tag):
                        if c == "&":
                            idx_cut = idx
                    tree.nodes[node]["name"] = tag[:idx_cut]
                    idx_pre_terminal_node = max_id_node + 1
                    tree.add_node(idx_pre_terminal_node, name=tag[idx_cut + 1:])
                    max_id_node += 1
                    tree.remove_edge(node, word)
                    tree.add_edge(node, idx_pre_terminal_node)
                    tree.add_edge(idx_pre_terminal_node, word)

    def compute_CYK(self, sentence, viz_oov=False):
        """Apply the CYK algorithm (heavily influenced by
        https://en.wikipedia.org/wiki/CYK_algorithm)"""
        n = len(sentence)
        prob_matrix = np.zeros((n, n, self.PCFG.nb_all_tags))
        cyk_matrix = np.zeros((n, n, self.PCFG.nb_all_tags, 3))
        # probabilities of tags for the unary rule
        for (position_word, word) in enumerate(sentence):
            token_to_tag = word
            if not (word in self.OOV.words_lexicon):
                token_to_tag = self.OOV.closest_in_corpus(word, viz_closest=viz_oov)
            if token_to_tag is None:
                for (tag, counts) in self.PCFG.freq_terminal_tags.items():
                    if tag in self.tag_to_id:
                        id_tag = self.tag_to_id[tag]
                        prob_matrix[position_word, 0, id_tag] = counts
            else:
                for (tag, proba) in self.lexicon_inverted[token_to_tag].items():
                    if tag in self.tag_to_id:
                        id_tag = self.tag_to_id[tag]
                        prob_matrix[position_word, 0, id_tag] = proba
        for l in range(1, n):
            for s in range(n - l):
                for idx_root_tag in self.grammar_dicts:
                    for cut in range(0, l):
                        for idx_left_tag in self.grammar_dicts[idx_root_tag]:
                            proba_left_derivation = prob_matrix[s, cut, idx_left_tag]
                            if proba_left_derivation > prob_matrix[s, l, idx_root_tag]:
                                # skips useless iterations
                                for (idx_right_tag, proba_split) in \
                                        self.grammar_dicts[idx_root_tag][idx_left_tag].items():
                                    proba_right_derivation = prob_matrix[
                                        s + cut + 1, l - cut - 1, idx_right_tag]
                                    proba_decomposition = (proba_split * proba_left_derivation
                                                           * proba_right_derivation)
                                    if proba_decomposition > prob_matrix[s, l, idx_root_tag]:
                                        prob_matrix[s, l, idx_root_tag] = proba_decomposition
                                        cyk_matrix[s, l, idx_root_tag] = [cut, idx_left_tag,
                                                                          idx_right_tag]
        self.prob_matrix = prob_matrix
        self.cyk_matrix = cyk_matrix.astype(int)

    def parse(self, sentence, viz_oov=False):
        """Return a parsed and tagged sentence from a natural sentence"""
        sentence = sentence.split()
        nb_words = len(sentence)
        if nb_words > 1:
            self.compute_CYK(sentence, viz_oov=viz_oov)
            idx_root_tag = self.tag_to_id["SENT"]
            if self.prob_matrix[0][nb_words - 1][idx_root_tag] == 0:
                # no valid parse
                return None
            parsing_list = self.parse_substring(0, nb_words - 1, idx_root_tag, sentence)
        else:
            word = sentence[0]
            token_to_tag = self.OOV.closest_in_corpus(word, viz_closest=viz_oov)
            if token_to_tag is None:
                tag = max(self.PCFG.freq_terminal_tags, key=self.PCFG.freq_terminal_tags.get)
            else:
                tag = max(self.lexicon_inverted[token_to_tag],
                          key=self.lexicon_inverted[token_to_tag].get)
            parsing_list = "(" + tag + " " + word + ")"
        # converting the parse stored as a string into a tree
        tree = tagged_sent_to_tree("( (SENT " + self.list_to_sentence(parsing_list) + "))",
                                   remove_after_hyphen=False)
        self.clean_tags(tree)
        return tree_to_sentence(tree)
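# ---- Illustrative sketch (not part of the original code) ----
# The lexicon_inverted construction in __init__ above flips a tag -> word -> prob
# mapping into word -> tag -> prob, so that CYK can look up P(tag -> word) for
# each word of the sentence. The probabilities below are made up.
lexicon = {"DET": {"le": 0.6, "la": 0.4},
           "NC": {"la": 0.1, "chat": 0.9}}
lexicon_inverted = {word: {} for tags in lexicon.values() for word in tags}
for tag in lexicon:
    for word in lexicon[tag]:
        lexicon_inverted[word][tag] = lexicon[tag][word]
print(lexicon_inverted["la"])  # {'DET': 0.4, 'NC': 0.1}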
import sys

import nltk

# PCFG, OOV, CYKSolver and the variables trainfilename / embedfilename are
# assumed to be defined earlier in the original script.

trees = []
with open(trainfilename, 'r') as f:
    for line in f:
        trees.append(nltk.Tree.fromstring(line))

# preprocess the tree forms: ignore functional labels and binarize to CNF
for tree in trees:
    # ignore_func_labels(tree)
    tree.chomsky_normal_form(horzMarkov=2)

# learn the PCFG
lexicon, grammar, vocabulary, symbols = PCFG(trees)

# for OOV
oovwords = OOV(embedfilename, vocabulary)

# parse new sentences with CYK, based on the learned PCFG
parser = CYKSolver(lexicon, grammar, vocabulary, symbols, oovwords)
for line in sys.stdin:
    if line == '\n':
        continue
    parsedtree = parser.compute(line.split())
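# ---- Illustrative sketch (not part of the original code) ----
# What chomsky_normal_form(horzMarkov=2) does to a ternary node: nltk introduces
# artificial "|" symbols that remember at most two horizontal siblings. The toy
# tree is made up; the printed shape should look roughly like the comment below.
import nltk

t = nltk.Tree.fromstring("(S (NP D N) (VP V NP PP))")
t.chomsky_normal_form(horzMarkov=2)
print(t)
# expected shape: (S (NP D N) (VP V (VP|<NP-PP> NP PP)))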