Example #1
def parse_from_dependency(models, string: str,
                          references: List[str] = ['dia', 'itu', 'ini', 'saya', 'awak', 'kamu', 'kita', 'kami', 'mereka'],
                          rejected_references: List[str] = ['saya', 'awak', 'kamu', 'kita', 'kami', 'mereka', 'nya'],
                          acceptable_subjects: List[str] = ['flat', 'subj', 'nsubj', 'csubj', 'obj'],
                          acceptable_nested_subjects: List[str] = ['compound', 'flat'],
                          split_nya: bool = True,
                          aggregate: Callable = np.mean,
                          top_k: int = 20):
    """
    Apply Coreference Resolution using stacks of dependency models.

    Parameters
    ----------
    models: list
        list of dependency models; each must have a `vectorize` method.
    string: str
    references: List[str], optional (default=['dia', 'itu', 'ini', 'saya', 'awak', 'kamu', 'kita', 'kami', 'mereka'])
        list of references.
    rejected_references: List[str], optional (default=['saya', 'awak', 'kamu', 'kita', 'kami', 'mereka', 'nya'])
        list of references to reject while populating subjects.
    acceptable_subjects: List[str], optional
        List of dependency labels for subjects.
    acceptable_nested_subjects: List[str], optional
        List of dependency labels for nested subjects, e.g., syarikat (obl) facebook (compound).
    split_nya: bool, optional (default=True)
        split `nya`, e.g., `disifatkannya` -> `disifatkan`, `nya`.
    aggregate: Callable, optional (default=numpy.mean)
        Aggregate function to aggregate list of vectors from `model.vectorize`.
    top_k: int, optional (default=20)
        only consider the `top_k` most similar candidates when resolving a reference.

    Returns
    -------
    result: Dict[str, Any]
        dictionary with keys `text` and `coref`, e.g.,
        {'text': ['Husein','Zolkepli','suka','makan','ayam','.','Dia','pun','suka','makan','daging','.'],
        'coref': {6: {'index': [0, 1], 'text': ['Husein', 'Zolkepli']}}}
    """
    if not isinstance(models, list):
        raise ValueError('models must be a list')

    for model in models:
        if not isinstance(model, (DependencyBERT, DependencyXLNET)):
            raise ValueError(
                'model must be one of [malaya.model.bert.DependencyBERT, malaya.model.xlnet.DependencyXLNET]'
            )

    if split_nya:
        string = _split_nya(string)
        references = references + ['nya']

    # majority-vote the tag and head index for every token across the stacked models
    tagging, indexing = voting_stack(models, string)

    # build CoNLL-style rows (token, head index, relation) for DependencyGraph
    result = []
    for i in range(len(tagging)):
        result.append(
            '%d\t%s\t_\t_\t_\t_\t%d\t%s\t_\t_'
            % (i + 1, tagging[i][0], int(indexing[i][1]), tagging[i][1])
        )

    d_object = DependencyGraph('\n'.join(result), top_relation_label='root')

    # collect candidate subject spans from the dependency tree
    rs = []
    for i in range(len(indexing)):
        for s in acceptable_subjects:
            if d_object.nodes[i]['rel'] == s:
                r = []
                for n_s in acceptable_nested_subjects:
                    s_ = d_object.traverse_children(i, [n_s], initial_label=[s])
                    s_ = _combined(s_)
                    r.extend(s_)
                r = [w for w in r if w.lower() not in references and w.lower() not in rejected_references]
                rs.extend(r)
    rs = cluster_words(rs, lowercase=True)

    # vectorize the string with every model; X keeps the tokens,
    # vs collects each model's per-token vectors
    vs, X = [], None
    for model in models:
        v = model.vectorize(string)
        X = [i[0] for i in v]
        y = [i[1] for i in v]
        vs.append(y)
    # aggregate (default: numpy.mean) the stacked vectors across models
    V = aggregate(vs, axis=0)

    # map each candidate word to its phrase index and token positions
    indices, word_indices = {}, []
    for no, row in enumerate(rs):
        ind = []
        for word in row.split():
            indices[word] = indices.get(word, no)
            ind.append(X.index(word))
        word_indices.append(ind)

    index_word = []
    for key in indices:
        index_word.append(X.index(key))

    index_references = []
    for i in range(len(X)):
        if X[i].lower() in references:
            index_references.append(i)

    # pairwise cosine similarity between the aggregated token vectors
    similarities = cosine_similarity(V)

    results = {}

    for r in index_references:
        r_ = [r, r - 1]
        i_ = -1

        # `subject verb object . reference`: reject candidate words that sit
        # immediately before the punctuation boundary
        while X[r + i_] in PUNCTUATION:
            i_ -= 1
            r_.append(r + i_)

        index_word_ = [i for i in index_word if i < r]
        sorted_indices = similarities[r].argsort()[-top_k:][::-1]
        sorted_indices = sorted_indices[
            np.isin(sorted_indices, index_word_) & ~np.isin(sorted_indices, r_)
        ]
        if len(sorted_indices):
            s = rs[indices[X[sorted_indices[0]]]]
            index = word_indices[indices[X[sorted_indices[0]]]]
            results[r] = {'index': index, 'text': s.split()}

    return {'text': X, 'coref': results}
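
A minimal usage sketch for the function above, assuming the malaya library layout (`malaya.dependency.transformer` loaders, with `parse_from_dependency` in scope); the model names and sentence are illustrative only:

import malaya

# load two dependency models to stack; any DependencyBERT / DependencyXLNET
# instances exposing a `vectorize` method should work (assumption)
model_bert = malaya.dependency.transformer(model='bert')
model_xlnet = malaya.dependency.transformer(model='xlnet')

string = 'Husein Zolkepli suka makan ayam. Dia pun suka makan daging.'
result = parse_from_dependency([model_bert, model_xlnet], string)
print(result['coref'])
# expected shape: {6: {'index': [0, 1], 'text': ['Husein', 'Zolkepli']}}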
Example #2
def parse_from_dependency(
        tagging: List[Tuple[str, str]],
        indexing: List[Tuple[str, str]],
        subjects: List[List[str]] = [['flat', 'subj', 'nsubj', 'csubj']],
        relations: List[List[str]] = [[
            'acl', 'xcomp', 'ccomp', 'obj', 'conj', 'advcl'
        ], ['obj']],
        objects: List[List[str]] = [['obj', 'compound', 'flat', 'nmod',
                                     'obl']],
        get_networkx: bool = True):
    """
    Generate knowledge graphs from dependency parsing; we suggest using dependency parsing v1.

    Parameters
    ----------
    tagging: List[Tuple[str, str]]
        `tagging` result from a dependency model.
    indexing: List[Tuple[str, str]]
        `indexing` result from a dependency model.
    subjects: List[List[str]], optional
        List of dependency labels for subjects.
    relations: List[List[str]], optional
        List of dependency labels for relations.
    objects: List[List[str]], optional
        List of dependency labels for objects.
    get_networkx: bool, optional (default=True)
        If True, will generate networkx.MultiDiGraph.

    Returns
    -------
    result: Dict[str, Any]
        dictionary with key `result` and, if `get_networkx`, key `G`.
    """

    if get_networkx:
        try:
            import pandas as pd
            import networkx as nx
        except ImportError:
            logging.warning(
                'pandas and networkx not installed. Please install them with `pip install pandas networkx` and try again. Skipping networkx.MultiDiGraph generation.'
            )
            get_networkx = False

    # build CoNLL-style rows (token, head index, relation) for DependencyGraph
    result = []
    for i in range(len(tagging)):
        result.append(
            '%d\t%s\t_\t_\t_\t_\t%d\t%s\t_\t_' %
            (i + 1, tagging[i][0], int(indexing[i][1]), tagging[i][1]))

    d_object = DependencyGraph('\n'.join(result), top_relation_label='root')
    results = []
    # traverse from the root node to gather subject, relation and object spans
    for i in range(1, len(indexing)):
        if d_object.nodes[i]['rel'] == 'root':
            subjects_, relations_ = [], []
            for s in subjects:
                s_ = d_object.traverse_children(
                    i, s, initial_label=[d_object.nodes[i]['rel']])
                s_ = _combined(s_)
                s_ = [c[1:] for c in s_]
                subjects_.extend(s_)
            for s in relations:
                s_ = d_object.traverse_children(
                    i, s, initial_label=[d_object.nodes[i]['rel']])
                s_ = _combined(s_)
                relations_.extend(s_)
            subjects_ = _get_unique(subjects_)
            subject = _get_longest(subjects_)
            relations_ = _get_unique(relations_)

            for relation in relations_:
                objects_ = []
                # the last token of the relation span anchors the object search
                k = relation[-1][1]
                for s in objects:
                    s_ = d_object.traverse_children(
                        k, s, initial_label=[d_object.nodes[k]['rel']])
                    s_ = _combined(s_)
                    objects_.extend(s_)
                objects_ = _get_unique(objects_)
                obj = _get_longest(objects_)
                # if the object collapses to the relation's final token, keep
                # that token as the object and trim it from the relation
                if obj[0][0] == relation[-1][0] and len(obj) == 1:
                    results.append({
                        'subject': subject,
                        'relation': relation[:-1],
                        'object': relation[-1:]
                    })
                else:
                    if obj[0][0] == relation[-1][0]:
                        obj = obj[1:]
                    results.append({
                        'subject': subject,
                        'relation': relation,
                        'object': obj
                    })

    # postprocess each triplet and drop the ones rejected by _postprocess
    post_results = []
    for r in results:
        r = _postprocess(r)
        if r:
            post_results.append(r)

    r = {'result': post_results}

    if get_networkx:
        df = pd.DataFrame(post_results)
        G = nx.from_pandas_edgelist(
            df,
            source='subject',
            target='object',
            edge_attr='relation',
            create_using=nx.MultiDiGraph(),
        )
        r['G'] = G

    return r
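
A minimal usage sketch for the function above; how `tagging` and `indexing` are produced is an assumption (here via a hypothetical `model.predict` returning both), only the (word, label) / (word, head-index) pair shapes are required:

import malaya

model = malaya.dependency.transformer(model='bert')  # assumed loader
string = 'Husein Zolkepli suka makan ayam di Kuala Lumpur'
tagging, indexing = model.predict(string)  # hypothetical return pair

r = parse_from_dependency(tagging, indexing)
for triplet in r['result']:
    print(triplet['subject'], triplet['relation'], triplet['object'])

# if pandas and networkx are installed, r['G'] is a networkx.MultiDiGraph
if 'G' in r:
    print(r['G'].number_of_edges())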