def dependency_graph(tagging, indexing):
    """
    Return helper object for dependency parser results.
    Only accept tagging and indexing outputs from dependency models.
    """
    result = []
    for i in range(len(tagging)):
        result.append(
            '%d\t%s\t_\t_\t_\t_\t%d\t%s\t_\t_'
            % (i + 1, tagging[i][0], int(indexing[i][1]), tagging[i][1])
        )
    return DependencyGraph('\n'.join(result), top_relation_label='root')
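
# A minimal sketch of the ten-column CoNLL rows dependency_graph() builds.
# NLTK's DependencyGraph is used here as a stand-in for the project's own
# class (which adds helpers such as traverse_children on top of it); the
# tagging/indexing tuples are hypothetical model outputs, not real ones.
from nltk.parse import DependencyGraph as NLTKDependencyGraph

tagging = [('Husein', 'nsubj'), ('suka', 'root'), ('makan', 'xcomp')]
indexing = [('Husein', 2), ('suka', 0), ('makan', 2)]

rows = []
for i in range(len(tagging)):
    # columns: ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC
    rows.append('%d\t%s\t_\t_\t_\t_\t%d\t%s\t_\t_'
                % (i + 1, tagging[i][0], int(indexing[i][1]), tagging[i][1]))
d = NLTKDependencyGraph('\n'.join(rows), top_relation_label='root')
print(d.tree())  # (suka Husein makan)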
def predict(self, string: str):
    """
    Tag a string.

    Parameters
    ----------
    string : str

    Returns
    -------
    result : Tuple
        (DependencyGraph, tagging, indexing)
    """
    input_ids, input_masks, segment_ids, s_tokens = xlnet_tokenization(
        self._tokenizer, [string], space_after_punct=True
    )
    s_tokens = s_tokens[0]
    r = self._execute(
        inputs=[input_ids, segment_ids, input_masks],
        input_labels=['Placeholder', 'Placeholder_1', 'Placeholder_2'],
        output_labels=['logits', 'heads_seq'],
    )
    tagging, depend = r['logits'], r['heads_seq']
    tagging = [self._idx2tag[i] for i in tagging[0]]
    # shift predicted heads so that, after the shift, a head of 0 marks the root
    depend = depend[0] - self._minus
    for i in range(len(depend)):
        if depend[i] == 0 and tagging[i] != 'root':
            tagging[i] = 'root'
        elif depend[i] != 0 and tagging[i] == 'root':
            depend[i] = 0
    tagging = merge_sentencepiece_tokens_tagging(s_tokens, tagging, model='xlnet')
    tagging = list(zip(*tagging))
    indexing = merge_sentencepiece_tokens_tagging(s_tokens, depend, model='xlnet')
    indexing = list(zip(*indexing))
    result, indexing_ = [], []
    for i in range(len(tagging)):
        index = int(indexing[i][1])
        if index > len(tagging):
            index = len(tagging)  # clamp heads pointing past the sentence
        elif (i + 1) == index:
            index = index + 1  # avoid a token pointing at itself
        elif index == -1:
            index = i
        indexing_.append((indexing[i][0], index))
        result.append(
            '%d\t%s\t_\t_\t_\t_\t%d\t%s\t_\t_'
            % (i + 1, tagging[i][0], index, tagging[i][1])
        )
    d = DependencyGraph('\n'.join(result), top_relation_label='root')
    return d, tagging, indexing_
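
# Toy illustration of the head/label consistency repair inside predict():
# a head of 0 must carry the 'root' relation, and a 'root' relation must
# point at head 0. Tags and heads below are made up for the example.
import numpy as np

tags = ['nsubj', 'root', 'xcomp']
heads = np.array([2, 1, 2])  # the 'root' token wrongly points at token 1

for i in range(len(heads)):
    if heads[i] == 0 and tags[i] != 'root':
        tags[i] = 'root'  # head says root, so relabel the relation
    elif heads[i] != 0 and tags[i] == 'root':
        heads[i] = 0      # relation says root, so repoint the head to 0
print(tags, heads)        # ['nsubj', 'root', 'xcomp'] [2 0 2]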
def predict(self, string: str):
    """
    Tag a string.

    Parameters
    ----------
    string : str

    Returns
    -------
    result : Tuple
        (DependencyGraph, tagging, indexing)
    """
    input_ids, input_masks, segment_ids, s_tokens = xlnet_tokenization(
        self._tokenizer, [string]
    )
    s_tokens = s_tokens[0]
    tagging, depend = self._sess.run(
        [self._logits, self._heads_seq],
        feed_dict={
            self._X: input_ids,
            self._segment_ids: segment_ids,
            self._input_masks: input_masks,
        },
    )
    tagging = [self._idx2tag[i] for i in tagging[0]]
    # shift predicted heads so that a head of 0 marks the root
    depend = depend[0] - 1
    for i in range(len(depend)):
        if depend[i] == 0 and tagging[i] != 'root':
            tagging[i] = 'root'
        elif depend[i] != 0 and tagging[i] == 'root':
            depend[i] = 0
    tagging = merge_sentencepiece_tokens_tagging(s_tokens, tagging, model='xlnet')
    tagging = list(zip(*tagging))
    indexing = merge_sentencepiece_tokens_tagging(s_tokens, depend, model='xlnet')
    indexing = list(zip(*indexing))
    result, indexing_ = [], []
    for i in range(len(tagging)):
        index = int(indexing[i][1])
        if index > len(tagging):
            index = len(tagging)  # clamp heads pointing past the sentence
        indexing_.append((indexing[i][0], index))
        result.append(
            '%d\t%s\t_\t_\t_\t_\t%d\t%s\t_\t_'
            % (i + 1, tagging[i][0], index, tagging[i][1])
        )
    d = DependencyGraph('\n'.join(result), top_relation_label='root')
    return d, tagging, indexing_
def predict(self, string: str):
    """
    Tag a string.

    Parameters
    ----------
    string : str

    Returns
    -------
    result : Tuple
        (DependencyGraph, tagging, indexing)
    """
    parsed_sequence, bert_sequence = parse_bert_tagging(
        string, self._tokenizer
    )
    tagging, depend = self._sess.run(
        [self._logits, self._heads_seq],
        feed_dict={self._X: [parsed_sequence]},
    )
    tagging = [self._idx2tag[i] for i in tagging[0]]
    # shift predicted heads so that a head of 0 marks the root
    depend = depend[0] - 1
    for i in range(len(depend)):
        if depend[i] == 0 and tagging[i] != 'root':
            tagging[i] = 'root'
        elif depend[i] != 0 and tagging[i] == 'root':
            depend[i] = 0
    tagging = merge_sentencepiece_tokens_tagging(bert_sequence, tagging)
    tagging = list(zip(*tagging))
    indexing = merge_sentencepiece_tokens_tagging(bert_sequence, depend)
    indexing = list(zip(*indexing))
    result, indexing_ = [], []
    for i in range(len(tagging)):
        index = int(indexing[i][1])
        if index > len(tagging):
            index = len(tagging)  # clamp heads pointing past the sentence
        indexing_.append((indexing[i][0], index))
        result.append(
            '%d\t%s\t_\t_\t_\t_\t%d\t%s\t_\t_'
            % (i + 1, tagging[i][0], index, tagging[i][1])
        )
    d = DependencyGraph('\n'.join(result), top_relation_label='root')
    return d, tagging, indexing_
def predict(self, string: str):
    """
    Tag a string.

    Parameters
    ----------
    string : str

    Returns
    -------
    result : Tuple
        (DependencyGraph, tagging, indexing)
    """
    parsed_sequence, input_mask, bert_sequence = parse_bert_tagging(
        string, self._tokenizer
    )
    r = self._execute(
        inputs=[[parsed_sequence]],
        input_labels=['Placeholder'],
        output_labels=['logits', 'heads_seq'],
    )
    tagging, depend = r['logits'], r['heads_seq']
    tagging = [self._idx2tag[i] for i in tagging[0]]
    # shift predicted heads so that a head of 0 marks the root
    depend = depend[0] - 1
    for i in range(len(depend)):
        if depend[i] == 0 and tagging[i] != 'root':
            tagging[i] = 'root'
        elif depend[i] != 0 and tagging[i] == 'root':
            depend[i] = 0
    tagging = merge_sentencepiece_tokens_tagging(bert_sequence, tagging)
    tagging = list(zip(*tagging))
    indexing = merge_sentencepiece_tokens_tagging(bert_sequence, depend)
    indexing = list(zip(*indexing))
    result, indexing_ = [], []
    for i in range(len(tagging)):
        index = int(indexing[i][1])
        if index > len(tagging):
            index = len(tagging)  # clamp heads pointing past the sentence
        indexing_.append((indexing[i][0], index))
        result.append(
            '%d\t%s\t_\t_\t_\t_\t%d\t%s\t_\t_'
            % (i + 1, tagging[i][0], index, tagging[i][1])
        )
    d = DependencyGraph('\n'.join(result), top_relation_label='root')
    return d, tagging, indexing_
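
# Hypothetical usage shared by all four predict() variants above.
# load_dependency_model is a placeholder, not a real loader in the library;
# substitute whatever returns a DependencyBERT / DependencyXLNET instance.
model = load_dependency_model()

d, tagging, indexing = model.predict('Husein Zolkepli suka makan ayam.')
print(tagging)   # [(word, relation), ...]
print(indexing)  # [(word, head index), ...]; head 0 marks the root
print(d.tree())  # DependencyGraph exposes the usual tree view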
def parse_from_dependency(
    models,
    string: str,
    references: List[str] = ['dia', 'itu', 'ini', 'saya', 'awak', 'kamu', 'kita', 'kami', 'mereka'],
    rejected_references: List[str] = ['saya', 'awak', 'kamu', 'kita', 'kami', 'mereka', 'nya'],
    acceptable_subjects: List[str] = ['flat', 'subj', 'nsubj', 'csubj', 'obj'],
    acceptable_nested_subjects: List[str] = ['compound', 'flat'],
    split_nya: bool = True,
    aggregate: Callable = np.mean,
    top_k: int = 20,
):
    """
    Apply Coreference Resolution using stacks of dependency models.

    Parameters
    ----------
    models: list
        list of dependency models, each must have a `vectorize` method.
    string: str
    references: List[str], optional (default=['dia', 'itu', 'ini', 'saya', 'awak', 'kamu', 'kita', 'kami', 'mereka'])
        list of references.
    rejected_references: List[str], optional (default=['saya', 'awak', 'kamu', 'kita', 'kami', 'mereka', 'nya'])
        list of rejected references during populating subjects.
    acceptable_subjects: List[str], optional
        List of dependency labels for subjects.
    acceptable_nested_subjects: List[str], optional
        List of dependency labels for nested subjects, eg, syarikat (obl) facebook (compound).
    split_nya: bool, optional (default=True)
        split `nya`, eg, `disifatkannya` -> `disifatkan`, `nya`.
    aggregate: Callable, optional (default=numpy.mean)
        Aggregate function to aggregate list of vectors from `model.vectorize`.
    top_k: int, optional (default=20)
        only accept the top_k nearest candidates to assume a coherence.

    Returns
    -------
    result: Dict[text, coref]
        {'text': ['Husein', 'Zolkepli', 'suka', 'makan', 'ayam', '.', 'Dia', 'pun', 'suka', 'makan', 'daging', '.'],
         'coref': {6: {'index': [0, 1], 'text': ['Husein', 'Zolkepli']}}}
    """
    if not isinstance(models, list):
        raise ValueError('models must be a list')
    for m in range(len(models)):
        if type(models[m]) not in [DependencyBERT, DependencyXLNET]:
            raise ValueError(
                'model must be one of [malaya.model.bert.DependencyBERT, malaya.model.xlnet.DependencyXLNET]'
            )
    if split_nya:
        string = _split_nya(string)
        references = references + ['nya']

    tagging, indexing = voting_stack(models, string)
    result = []
    for i in range(len(tagging)):
        result.append(
            '%d\t%s\t_\t_\t_\t_\t%d\t%s\t_\t_'
            % (i + 1, tagging[i][0], int(indexing[i][1]), tagging[i][1])
        )
    d_object = DependencyGraph('\n'.join(result), top_relation_label='root')

    # populate candidate subjects from acceptable dependency labels
    rs = []
    for i in range(len(indexing)):
        for s in acceptable_subjects:
            if d_object.nodes[i]['rel'] == s:
                r = []
                for n_s in acceptable_nested_subjects:
                    s_ = d_object.traverse_children(i, [n_s], initial_label=[s])
                    s_ = _combined(s_)
                    r.extend(s_)
                r = [
                    w for w in r
                    if w.lower() not in references and w.lower() not in rejected_references
                ]
                rs.extend(r)
    rs = cluster_words(rs, lowercase=True)

    # vectorize with every model, then aggregate the per-token vectors
    vs, X = [], None
    for m in range(len(models)):
        v = models[m].vectorize(string)
        X = [i[0] for i in v]
        y = [i[1] for i in v]
        vs.append(y)
    V = aggregate(vs, axis=0)

    indices, word_indices = {}, []
    for no, row in enumerate(rs):
        ind = []
        for word in row.split():
            indices[word] = indices.get(word, no)
            ind.append(X.index(word))
        word_indices.append(ind)

    index_word = []
    for key in indices:
        index_word.append(X.index(key))

    index_references = []
    for i in range(len(X)):
        if X[i].lower() in references:
            index_references.append(i)

    similarities = cosine_similarity(V)
    results = {}
    for r in index_references:
        r_ = [r, r - 1]
        i_ = -1
        # subject verb object .
        # for a subject, we want to reject words before the punct
        while X[r + i_] in PUNCTUATION:
            i_ -= 1
            r_.append(r + i_)
        index_word_ = [i for i in index_word if i < r]
        sorted_indices = similarities[r].argsort()[-top_k:][::-1]
        sorted_indices = sorted_indices[
            np.isin(sorted_indices, index_word_) & ~np.isin(sorted_indices, r_)
        ]
        if len(sorted_indices):
            s = rs[indices[X[sorted_indices[0]]]]
            index = word_indices[indices[X[sorted_indices[0]]]]
            results[r] = {'index': index, 'text': s.split()}
    return {'text': X, 'coref': results}
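
# Self-contained sketch of the antecedent lookup at the end of
# parse_from_dependency(): for a reference token r, rank every token by
# cosine similarity, then keep only candidate subject indices occurring
# before r. The vectors are random stand-ins for model.vectorize output.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

rng = np.random.default_rng(0)
V = rng.normal(size=(8, 16))  # one vector per token
r = 6                         # index of the reference token, e.g. 'Dia'
index_word = [0, 1, 4]        # indices of candidate subject words
excluded = [r, r - 1]         # the reference itself and its left neighbour

similarities = cosine_similarity(V)
candidates = similarities[r].argsort()[-5:][::-1]  # top-5 nearest tokens
index_word_ = [i for i in index_word if i < r]
candidates = candidates[np.isin(candidates, index_word_)
                        & ~np.isin(candidates, excluded)]
if len(candidates):
    print('antecedent token index:', candidates[0])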
def parse_from_dependency(
    tagging: List[Tuple[str, str]],
    indexing: List[Tuple[str, str]],
    subjects: List[List[str]] = [['flat', 'subj', 'nsubj', 'csubj']],
    relations: List[List[str]] = [
        ['acl', 'xcomp', 'ccomp', 'obj', 'conj', 'advcl'],
        ['obj'],
    ],
    objects: List[List[str]] = [['obj', 'compound', 'flat', 'nmod', 'obl']],
    get_networkx: bool = True,
):
    """
    Generate knowledge graphs from dependency parsing; we suggest using dependency parsing v1.

    Parameters
    ----------
    tagging: List[Tuple[str, str]]
        `tagging` result from a dependency model.
    indexing: List[Tuple[str, str]]
        `indexing` result from a dependency model.
    subjects: List[List[str]], optional
        List of dependency labels for subjects.
    relations: List[List[str]], optional
        List of dependency labels for relations.
    objects: List[List[str]], optional
        List of dependency labels for objects.
    get_networkx: bool, optional (default=True)
        If True, will generate a networkx.MultiDiGraph.

    Returns
    -------
    result: Dict[result, G]
    """
    if get_networkx:
        try:
            import pandas as pd
            import networkx as nx
        except BaseException:
            logging.warning(
                'pandas and networkx not installed. Please install them by `pip install pandas networkx` and try again. Will skip generating networkx.MultiDiGraph.'
            )
            get_networkx = False

    result = []
    for i in range(len(tagging)):
        result.append(
            '%d\t%s\t_\t_\t_\t_\t%d\t%s\t_\t_'
            % (i + 1, tagging[i][0], int(indexing[i][1]), tagging[i][1])
        )
    d_object = DependencyGraph('\n'.join(result), top_relation_label='root')

    results = []
    for i in range(1, len(indexing), 1):
        if d_object.nodes[i]['rel'] == 'root':
            subjects_, relations_ = [], []
            for s in subjects:
                s_ = d_object.traverse_children(
                    i, s, initial_label=[d_object.nodes[i]['rel']]
                )
                s_ = _combined(s_)
                s_ = [c[1:] for c in s_]
                subjects_.extend(s_)
            for s in relations:
                s_ = d_object.traverse_children(
                    i, s, initial_label=[d_object.nodes[i]['rel']]
                )
                s_ = _combined(s_)
                relations_.extend(s_)
            subjects_ = _get_unique(subjects_)
            subject = _get_longest(subjects_)
            relations_ = _get_unique(relations_)
            for relation in relations_:
                objects_ = []
                k = relation[-1][1]
                for s in objects:
                    s_ = d_object.traverse_children(
                        k, s, initial_label=[d_object.nodes[k]['rel']]
                    )
                    s_ = _combined(s_)
                    objects_.extend(s_)
                objects_ = _get_unique(objects_)
                obj = _get_longest(objects_)
                if obj[0][0] == relation[-1][0] and len(obj) == 1:
                    results.append({
                        'subject': subject,
                        'relation': relation[:-1],
                        'object': relation[-1:],
                    })
                else:
                    if obj[0][0] == relation[-1][0]:
                        obj = obj[1:]
                    results.append({
                        'subject': subject,
                        'relation': relation,
                        'object': obj,
                    })

    post_results = []
    for r in results:
        r = _postprocess(r)
        if r:
            post_results.append(r)

    r = {'result': post_results}

    if get_networkx:
        df = pd.DataFrame(post_results)
        G = nx.from_pandas_edgelist(
            df,
            source='subject',
            target='object',
            edge_attr='relation',
            create_using=nx.MultiDiGraph(),
        )
        r['G'] = G
    return r
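
# Hypothetical usage: the tagging/indexing tuples below stand in for real
# dependency-model output for "Husein suka makan ayam"; a sentence this
# small may well yield an empty triple list after _postprocess. With
# get_networkx=True, the result would also carry a networkx.MultiDiGraph
# under 'G', renderable with nx.draw(r['G'], with_labels=True).
tagging = [('Husein', 'nsubj'), ('suka', 'root'), ('makan', 'xcomp'), ('ayam', 'obj')]
indexing = [('Husein', 2), ('suka', 0), ('makan', 2), ('ayam', 3)]

r = parse_from_dependency(tagging, indexing, get_networkx=False)
for triple in r['result']:
    print(triple['subject'], '->', triple['relation'], '->', triple['object'])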