Code example #1
File: constituency_dataset.py Project: lei1993/HanLP
def track(node):
    # Inner helper of build_tree (code example #6): `leaves` comes from the
    # enclosing scope and `node` is an iterator over (i, j, label) triples.
    i, j, label = next(node)
    if j == i + 1:
        # a width-1 span maps to a single leaf
        children = [leaves[i]]
    else:
        # a wider span is binary: consume the left and right subtrees recursively
        children = track(node) + track(node)
    if label.endswith('|<>'):
        # dummy labels introduced by binarization are dropped
        return children
    # collapsed unary chains like 'S+VP' are expanded back into nested trees
    labels = label.split('+')
    tree = Tree(labels[-1], children)
    for label in reversed(labels[:-1]):
        tree = Tree(label, [tree])
    return [tree]
Code example #2
File: constituency_dataset.py Project: lei1993/HanLP
def load_file(self, filepath: str):
    # Each non-empty line of the file holds one bracketed tree.
    with open(filepath) as src:
        for line in src:
            line = line.strip()
            if not line:
                continue
            yield {'constituency': Tree.fromstring(line)}
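
A minimal way to drive the loader (a sketch; the `dataset` object and the path are illustrative):

# usage sketch: `dataset` and the path below are illustrative
for sample in dataset.load_file('data/ctb/train.txt'):
    print(sample['constituency'].leaves())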
Code example #3
File: document.py Project: emorynlp/elit
def list_to_tree(L):
    if isinstance(L, str):
        return L
    # Unwrap singleton wrappers like [['S', ...]] down to a [label, children] pair.
    while len(L) == 1:
        L = L[0]
        if isinstance(L, str):
            return L
    return Tree(L[0], [list_to_tree(child) for child in L[1]])
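
The expected input is a nested [label, children] structure; a quick check (a sketch, assuming an nltk-compatible Tree as in the snippets above):

L = ['S', [['NP', [['PRP', ['She']]]], ['VP', [['VBZ', ['sings']]]]]]
print(list_to_tree(L))
# (S (NP (PRP She)) (VP (VBZ sings)))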
Code example #4
def load_bracketed_trees(chtbs) -> List[Tree]:
    trees = []
    for f in chtbs:
        with open(f, encoding='utf-8') as src:
            content = src.read()
            # Keep the accumulator and the raw text blocks under different
            # names: the original rebound `trees` to the string blocks and
            # then appended to the very list it was iterating over.
            blocks = [x for x in content.split('\n\n') if x.strip()]
            for block in blocks:
                trees.append(Tree.fromstring(block))
    return trees
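
Usage sketch (the glob pattern is illustrative):

from glob import glob
trees = load_bracketed_trees(sorted(glob('ctb/bracketed/chtb_*.txt')))
print(len(trees))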
Code example #5
def make_ctb_tasks(chtbs, out_root, part):
    for task in ['cws', 'pos', 'par', 'dep']:
        os.makedirs(join(out_root, task), exist_ok=True)
    timer = CountdownTimer(len(chtbs))
    par_path = join(out_root, 'par', f'{part}.txt')
    with open(join(out_root, 'cws', f'{part}.txt'), 'w', encoding='utf-8') as cws, \
            open(join(out_root, 'pos', f'{part}.tsv'), 'w', encoding='utf-8') as pos, \
            open(par_path, 'w', encoding='utf-8') as par:
        for f in chtbs:
            with open(f, encoding='utf-8') as src:
                content = src.read()
                trees = split_str_to_trees(content)
                for tree in trees:
                    try:
                        tree = Tree.fromstring(tree)
                    except ValueError:
                        print(tree)
                        exit(1)
                    words = []
                    for word, tag in tree.pos():
                        if tag == '-NONE-' or not tag:
                            continue
                        tag = tag.split('-')[0]
                        if tag == 'X':  # 铜_NN 30_CD x_X 25_CD x_X 14_CD cm_NT 1999_NT
                            tag = 'FW'
                        pos.write('{}\t{}\n'.format(word, tag))
                        words.append(word)
                    cws.write(' '.join(words))
                    par.write(tree.pformat(margin=sys.maxsize))
                    for fp in cws, pos, par:
                        fp.write('\n')
            timer.log(
                f'Preprocessing the [blue]{part}[/blue] set of CTB [blink][yellow]...[/yellow][/blink]',
                erase=False)
    remove_all_ec(par_path)
    dep_path = join(out_root, 'dep', f'{part}.conllx')
    convert_to_stanford_dependency_330(par_path, dep_path)
    sents = list(read_conll(dep_path))
    with open(dep_path, 'w') as out:
        for sent in sents:
            for i, cells in enumerate(sent):
                tag = cells[3]
                tag = tag.split('-')[0]  # NT-SHORT ---> NT
                if tag == 'X':  # 铜_NN 30_CD x_X 25_CD x_X 14_CD cm_NT 1999_NT
                    tag = 'FW'
                cells[3] = cells[4] = tag
                out.write('\t'.join(str(x) for x in cells))
                out.write('\n')
            out.write('\n')
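
The same POS normalization appears twice above, once on the tree's (word, tag) pairs and once on the CoNLL cells. Factored out as a helper it reads as follows (a sketch; the function name is invented):

def normalize_ctb_tag(tag: str) -> str:
    # NT-SHORT -> NT; the rare X tag (foreign-word fragments) becomes FW
    tag = tag.split('-')[0]
    return 'FW' if tag == 'X' else tag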
Code example #6
def build_tree(tokens: List[str], sequence):
    r"""
    Builds a constituency tree from the sequence. The sequence is generated in pre-order.
    During building the tree, the sequence is de-binarized to the original format (i.e.,
    the suffixes ``|<>`` are ignored, the collapsed labels are recovered).

    Args:
        tokens (List[str]):
            All tokens in a sentence.
        sequence (list[tuple]):
            A list of tuples used for generating a tree.
            Each tuple consists of the indices of the left/right span boundaries and the label of the span.

    Returns:
        A result constituency tree.

    Examples:
        >>> tokens = ['She', 'enjoys', 'playing', 'tennis', '.']
        >>> sequence = [(0, 5, 'S'), (0, 4, 'S|<>'), (0, 1, 'NP'), (1, 4, 'VP'), (1, 2, 'VP|<>'),
                        (2, 4, 'S+VP'), (2, 3, 'VP|<>'), (3, 4, 'NP'), (4, 5, 'S|<>')]
        >>> print(build_tree(tokens, sequence))
        (TOP
          (S
            (NP (_ She))
            (VP (_ enjoys) (S (VP (_ playing) (NP (_ tennis)))))
            (_ .)))
    """
    if not tokens:  # User passed in [], which is the tokenized result of ''
        return Tree('TOP', [])
    tree = Tree('TOP', [Tree('_', [t]) for t in tokens])
    root = tree.label()
    leaves = [
        subtree for subtree in tree.subtrees()
        if not isinstance(subtree[0], Tree)
    ]

    def track(node):
        i, j, label = next(node)
        if j == i + 1:
            children = [leaves[i]]
        else:
            children = track(node) + track(node)
        if label.endswith('|<>'):
            return children
        labels = label.split('+')
        tree = Tree(labels[-1], children)
        for label in reversed(labels[:-1]):
            tree = Tree(label, [tree])
        return [tree]

    return Tree(root, track(iter(sequence)))
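
A possible inverse of build_tree, modeled on supar-style span factorization (a sketch; the function and its exact contract are assumptions, not part of the snippets above): walk a binarized tree in pre-order and emit one (i, j, label) triple per constituent.

def factorize(tree, i=0):
    # preterminals like (_ She) consume one token and emit no span
    if len(tree) == 1 and not isinstance(tree[0], Tree):
        return i + 1, []
    j, spans = i, []
    for child in tree:
        j, child_spans = factorize(child, j)
        spans += child_spans
    # parent span first, then the children's spans: pre-order, as build_tree expects
    return j, [(i, j, tree.label())] + spans

Applied to the single child of a binarized tree (binarize is code example #8), factorize(binarize(tree)[0])[1] reproduces exactly the sequence shown in the docstring above.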
Code example #7
def dfs_linearize_constituency(sample: dict,
                               tokenizer: PENMANBartTokenizer,
                               remove_space=False) -> dict:
    amr = sample.get('amr', None)
    if amr:
        l, e = tokenizer.linearize(amr)
        sample['graph_tokens'] = e['linearized_graphs']
        sample['graph_token_ids'] = l
        tree = Tree.from_list(json.loads(sample['amr'].metadata['con_list']))
        # Replace literal brackets occurring as leaves with placeholder
        # tokens so they cannot be confused with tree brackets below.
        for each in tree.subtrees(lambda x: x.height() == 2):
            if each[0] == '(':
                each[0] = '<LBR>'
            elif each[0] == ')':
                each[0] = '<RBR>'
        text = tree.pformat(margin=10e7)
        tokens = []
        buffer = []
        for c in text:
            if c == '(' or c == ')':
                tokens.append(''.join(buffer))
                tokens.append(c)
                buffer.clear()
                continue
            buffer.append(c)
        if buffer:
            tokens.append(''.join(buffer))
        tokens = [x.strip() for x in tokens]
        tokens = [x for x in tokens if x]
        restore_bracket = {'<LBR>': '(', '<RBR>': ')'}
        tokens = [restore_bracket.get(x, x) for x in tokens]
        ids = []
        for each in tokens:
            pairs = each.split(' ', 1)
            if len(pairs) == 2:
                con, token = pairs
                ids.append(
                    tokenizer.convert_tokens_to_ids(tokenizer.INIT + con))
                ids.extend(tokenizer.encode(token, add_special_tokens=False))
            else:
                ids.append(
                    tokenizer.convert_tokens_to_ids(tokenizer.INIT + each))
        if remove_space:
            text = ''.join(text.split())
        sample['text'] = text
        sample['text_token_ids'] = [tokenizer.bos_token_id
                                    ] + ids + [tokenizer.eos_token_id]
    return sample
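
The character loop above tokenizes the single-line tree string on parentheses; the earlier <LBR>/<RBR> substitution protects literal brackets occurring as leaves. Stand-alone, the loop behaves like this sketch:

def split_on_brackets(text: str):
    # same logic as the loop in dfs_linearize_constituency
    tokens, buffer = [], []
    for c in text:
        if c in '()':
            tokens.append(''.join(buffer))
            tokens.append(c)
            buffer.clear()
            continue
        buffer.append(c)
    if buffer:
        tokens.append(''.join(buffer))
    return [x.strip() for x in tokens if x.strip()]

print(split_on_brackets('(S (NP She) (VP sings))'))
# ['(', 'S', '(', 'NP She', ')', '(', 'VP sings', ')', ')']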
Code example #8
File: constituency_dataset.py Project: lei1993/HanLP
def binarize(tree: Tree):
    r"""
    Conducts binarization over the tree.

    First, the tree is transformed to satisfy `Chomsky Normal Form (CNF)`_.
    Here we call :meth:`~tree.Tree.chomsky_normal_form` to conduct left-binarization.
    Second, all unary productions in the tree are collapsed.

    Args:
        tree (tree.Tree):
            The tree to be binarized.

    Returns:
        The binarized tree.

    Examples:
        >>> tree = Tree.fromstring('''
                                        (TOP
                                          (S
                                            (NP (_ She))
                                            (VP (_ enjoys) (S (VP (_ playing) (NP (_ tennis)))))
                                            (_ .)))
                                        ''')
        >>> print(binarize(tree))
        (TOP
          (S
            (S|<>
              (NP (_ She))
              (VP
                (VP|<> (_ enjoys))
                (S+VP (VP|<> (_ playing)) (NP (_ tennis)))))
            (S|<> (_ .))))

    .. _Chomsky Normal Form (CNF):
        https://en.wikipedia.org/wiki/Chomsky_normal_form
    """

    tree: Tree = tree.copy(True)
    nodes = [tree]
    while nodes:
        node = nodes.pop()
        if isinstance(node, Tree):
            nodes.extend(node)
            if len(node) > 1:
                for i, child in enumerate(node):
                    # Wrap preterminals of n-ary nodes in a dummy `label|<>`
                    # node so that CNF conversion leaves the POS level intact.
                    if not isinstance(child[0], Tree):
                        node[i] = Tree(f"{node.label()}|<>", [child])
    # left-factored CNF with no horizontal/vertical Markovization
    tree.chomsky_normal_form('left', 0, 0)
    tree.collapse_unary()

    return tree
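
A quick sanity check of the binarized shape (a sketch, assuming an nltk-compatible Tree as in the docstring):

from nltk import Tree

t = Tree.fromstring(
    '(TOP (S (NP (_ She)) (VP (_ enjoys) (S (VP (_ playing) (NP (_ tennis))))) (_ .)))')
bt = binarize(t)
assert all(len(st) <= 2 for st in bt.subtrees())  # CNF: at most two children per node
print(t == bt)  # False: binarize works on a deep copy and never mutates its input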
Code example #9
File: visualization.py Project: lei1993/HanLP
def list_to_tree(L):
    if isinstance(L, str):
        return L
    return Tree(L[0], [list_to_tree(child) for child in L[1]])
Code example #10
    def _conll_rows_to_sentence(self,
                                conll_rows: List[str]) -> OntonotesSentence:
        document_id: str = None
        sentence_id: int = None
        # The words in the sentence.
        sentence: List[str] = []
        # The pos tags of the words in the sentence.
        pos_tags: List[str] = []
        # the pieces of the parse tree.
        parse_pieces: List[str] = []
        # The lemmatised form of the words in the sentence which
        # have SRL or word sense information.
        predicate_lemmas: List[str] = []
        # The FrameNet ID of the predicate.
        predicate_framenet_ids: List[str] = []
        # The sense of the word, if available.
        word_senses: List[float] = []
        # The current speaker, if available.
        speakers: List[str] = []

        verbal_predicates: List[str] = []
        span_labels: List[List[str]] = []
        current_span_labels: List[str] = []

        # Cluster id -> List of (start_index, end_index) spans.
        clusters: DefaultDict[int, List[Tuple[int, int]]] = defaultdict(list)
        # Cluster id -> List of start_indices which are open for this id.
        coref_stacks: DefaultDict[int, List[int]] = defaultdict(list)

        for index, row in enumerate(conll_rows):
            conll_components = row.split()

            document_id = conll_components[0]
            sentence_id = int(conll_components[1])
            word = conll_components[3]
            pos_tag = conll_components[4]
            parse_piece = conll_components[5]

            # Replace brackets in text and pos tags
            # with a different token for parse trees.
            if pos_tag != "XX" and word != "XX":
                if word == "(":
                    parse_word = "-LRB-"
                elif word == ")":
                    parse_word = "-RRB-"
                else:
                    parse_word = word
                if pos_tag == "(":
                    pos_tag = "-LRB-"
                if pos_tag == ")":
                    pos_tag = "-RRB-"
                (left_brackets, right_hand_side) = parse_piece.split("*")
                # only keep ')' if there are nested brackets with nothing in them.
                right_brackets = right_hand_side.count(")") * ")"
                parse_piece = f"{left_brackets} ({pos_tag} {parse_word}) {right_brackets}"
            else:
                # There are some bad annotations in the CONLL data.
                # They contain no information, so to make this explicit,
                # we just set the parse piece to be None which will result
                # in the overall parse tree being None.
                parse_piece = None

            lemmatised_word = conll_components[6]
            framenet_id = conll_components[7]
            word_sense = conll_components[8]
            speaker = conll_components[9]

            if not span_labels:
                # If this is the first word in the sentence, create
                # empty lists to collect the NER and SRL BIO labels.
                # We can't do this upfront, because we don't know how many
                # components we are collecting, as a sentence can have
                # variable numbers of SRL frames.
                span_labels = [[] for _ in conll_components[10:-1]]
                # Create variables representing the current label for each label
                # sequence we are collecting.
                current_span_labels = [None for _ in conll_components[10:-1]]

            self._process_span_annotations_for_word(conll_components[10:-1],
                                                    span_labels,
                                                    current_span_labels)

            # If any annotation marks this word as a verb predicate,
            # we need to record its index. This also has the side effect
            # of ordering the verbal predicates by their location in the
            # sentence, automatically aligning them with the annotations.
            word_is_verbal_predicate = any("(V" in x
                                           for x in conll_components[11:-1])
            if word_is_verbal_predicate:
                verbal_predicates.append(word)

            self._process_coref_span_annotations_for_word(
                conll_components[-1], index, clusters, coref_stacks)

            sentence.append(word)
            pos_tags.append(pos_tag)
            parse_pieces.append(parse_piece)
            predicate_lemmas.append(
                lemmatised_word if lemmatised_word != "-" else None)
            predicate_framenet_ids.append(
                framenet_id if framenet_id != "-" else None)
            word_senses.append(
                float(word_sense) if word_sense != "-" else None)
            speakers.append(speaker if speaker != "-" else None)

        named_entities = span_labels[0]
        srl_frames = [
            (predicate, labels)
            for predicate, labels in zip(verbal_predicates, span_labels[1:])
        ]

        if all(parse_pieces):
            parse_tree = Tree.fromstring("".join(parse_pieces))
        else:
            parse_tree = None
        coref_span_tuples: Set[TypedSpan] = {
            (cluster_id, span)
            for cluster_id, span_list in clusters.items() for span in span_list
        }
        return OntonotesSentence(
            document_id,
            sentence_id,
            sentence,
            pos_tags,
            parse_tree,
            predicate_lemmas,
            predicate_framenet_ids,
            word_senses,
            speakers,
            named_entities,
            srl_frames,
            coref_span_tuples,
        )
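
To make the '*' splicing in the parse-piece handling concrete, here is a hypothetical three-token sentence with invented column-5 pieces (nltk's Tree assumed, as in the reader itself):

from nltk import Tree

pieces = ['(S(NP*)', '(VP*', '*))']  # column 5: one piece per token, '*' marks the slot
words = ['She', 'sings', '.']
tags = ['PRP', 'VBZ', '.']
assembled = []
for piece, tag, word in zip(pieces, tags, words):
    left_brackets, right_hand_side = piece.split('*')
    right_brackets = right_hand_side.count(')') * ')'
    assembled.append(f'{left_brackets} ({tag} {word}) {right_brackets}')
print(Tree.fromstring(''.join(assembled)))
# (S (NP (PRP She)) (VP (VBZ sings) (. .)))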