def parse_doc_ptb(doc_id, doc_tkd_toks):
    """Dirty PTB parser"""
    # get PTB trees
    ptb_name = _guess_ptb_name(doc_id)
    if ptb_name is None:
        return None

    # use tweaked tokens
    doc_tokens = doc_tkd_toks
    tokens_iter = iter(doc_tokens)

    trees = []
    lex_heads = []
    for tree in PTB_READER.parsed_sents(ptb_name):
        # apply standard cleaning to the tree:
        # remove empty nodes, then strip function tags
        tree_no_empty = prune_tree(tree, is_non_empty)
        tree_no_empty_no_gf = transform_tree(tree_no_empty,
                                             strip_subcategory)
        # align the cleaned tree's leaves with the tweaked tokens
        leaves = tree_no_empty_no_gf.leaves()
        tslice = itertools.islice(tokens_iter, len(leaves))
        clean_tree = ConstituencyTree.build(tree_no_empty_no_gf, tslice)
        trees.append(clean_tree)

        # lexicalize the PTB tree: find the head word of each constituent;
        # constituents and their heads are designated by their Gorn address
        # ("tree position" in NLTK) in the tree
        lheads = find_lexical_heads(clean_tree)
        lex_heads.append(lheads)

    return trees  # , lex_heads
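
# Usage sketch (illustrative, not part of the original module): the
# `doc_tkd_toks` argument is assumed to be the span-aligned token list
# produced by `tokenize_doc_ptb` below, so a typical call chain is:
#
#     tokens = tokenize_doc_ptb(doc_id, doc_text)
#     if tokens is not None:
#         trees = parse_doc_ptb(doc_id, tokens)
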
def tokenize_doc_ptb(doc_id, doc_text):
    """Dirty PTB tokenizer"""
    ptb_name = _guess_ptb_name(doc_id)
    if ptb_name is None:
        return None

    # get doc text
    # here we cheat and get it from the RST-DT tree
    # was: rst_text = doc.orig_rsttree.text()
    rst_text = doc_text
    tagged_tokens = PTB_READER.tagged_words(ptb_name)
    # drop empty-category tokens, then tweak the survivors
    # (enumeration indices still count the empty tokens)
    tweaked1, tweaked2 = itertools.tee(
        _tweak_token(ptb_name)(i, tok)
        for i, tok in enumerate(tagged_tokens)
        if not is_empty_category(tok[1]))
    spans = generic_token_spans(rst_text, tweaked1,
                                txtfn=lambda x: x.tweaked_word)
    # NB: izip is itertools.izip (Python 2); plain zip on Python 3
    result = [_mk_token(t, s) for t, s in izip(tweaked2, spans)]

    return result
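
# Usage sketch (illustrative; exact token attributes depend on `_mk_token`,
# defined elsewhere in this module): each returned item pairs a tweaked
# PTB token with its character span in `doc_text`, e.g.
#
#     tokens = tokenize_doc_ptb(doc_id, doc_text)
#     if tokens is not None:
#         for tok in tokens[:3]:
#             print(tok)  # token text/tag plus (begin, end) offsets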