Code example #1
File: featurise.py  Project: ogh/contra
def main(args):
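    # NOTE: Python 2 code. Relies on module-level stdin/stdout/stderr (from sys)
    # and on the project's own tokenize, FOCUS_DUMMY, prev_next_graph and
    # F_FUNC_BY_F_SET, none of which are shown in this excerpt. Each input line
    # is expected to carry five tab-separated fields; the second is the label
    # and the third and fifth are the pre- and post-context around the focus.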
    argp = _argparser().parse_args(args[1:])

    # TODO: Update the default to the best one we got after experiments
    # TODO: Adding a default has unforeseen consequences
    assert argp.features

    for line in (l.rstrip('\n') for l in stdin):
        _, lbl, pre, _, post = line.split('\t')
        
        # Tokenise the context
        # XXX: Discards meaningful spaces
        pre_toks = tokenize(pre.strip()).split()
        post_toks = tokenize(post.strip()).split()

        toks = pre_toks[-3:] + [FOCUS_DUMMY] + post_toks[:3]

        graph, nodes = prev_next_graph(toks)
        for node in nodes:
            if node.value == FOCUS_DUMMY:
                focus = node
                break
        else:
            assert False

        f_vec = {}
        for f_set in argp.features:
            for f_name, f_val in F_FUNC_BY_F_SET[f_set](nodes, graph, focus):
                f_vec[f_name] = f_val

        if not f_vec:
            print >> stderr, 'WARNING: No features generated!'
            continue

        stdout.write(lbl)
        stdout.write('\t')
        stdout.write(' '.join('{0}:{1}'.format(f_name, f_vec[f_name])
            for f_name in sorted(f_vec)))
        stdout.write('\n')

    return 0
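This excerpt ends at the return; the full script presumably finishes with the usual entry-point guard, along these lines (a sketch only, not part of the excerpt):

if __name__ == '__main__':
    # Hypothetical entry point; the actual file may differ.
    from sys import argv, exit
    exit(main(argv))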
Code example #2
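# NOTE: relies on module-level names defined elsewhere in the same script
# (several of them appear in the imports/constants example further below):
# np, pd, gtbtokenize, idfs, min_alpha, UNMAPPED_IDF_CONST and WC_THRES.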
def gen_word_list(term_dict):
    word_list = {}
    max_ip_count = 0
    for k in term_dict:
        term_ptrs = gtbtokenize.tokenize(term_dict[k]["label"]).split()
        for word in term_ptrs:
            if word not in word_list:
                word_list[word] = {"terms": set(), "mags": [], "magnitude": 0,
                                   "color": 0, "unique_ips": 0,
                                   "opacity": min_alpha, "label": word}
                if word.lower() in idfs:
                    word_list[word]["idf"] = idfs[word.lower()]["IDF"]
                else:
                    word_list[word]["idf"] = UNMAPPED_IDF_CONST
            word_list[word]["terms"].add(term_dict[k]["term_id"])
            word_list[word]["mags"].append(term_dict[k]["magnitude"])
            word_list[word]["unique_ips"] += term_dict[k]["unique_ips"]
            if word_list[word]["unique_ips"] > max_ip_count:
                max_ip_count = word_list[word]["unique_ips"]
    for k in word_list:
        word_list[k]["magnitude"] = np.log2(np.median(word_list[k]["mags"]) * word_list[k]["idf"])
        word_list[k]["color"] = len(word_list[k]["terms"])
        word_list[k]["opacity"] = min_alpha + (1 - min_alpha) * np.log2(word_list[k]["unique_ips"]) / np.log2(max_ip_count)
        word_list[k]["terms"] = list(word_list[k]["terms"])
    word_df = pd.DataFrame.from_dict(word_list, orient="index")
    word_df = word_df.sort_values("magnitude", ascending=False)
    # Keep only the WC_THRES highest-magnitude words and return that selection.
    sel_word_list = word_df[0:WC_THRES]
    return sel_word_list
Code example #3
def gtb_token_boundary_gen(text):
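    # Tokenize with the GTB (GENIA treebank) tokenizer, then align the
    # whitespace-split tokens back against the original text to recover their
    # boundaries; _token_boundaries_by_alignment is a module-private helper
    # that is not shown in this excerpt.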
    from gtbtokenize import tokenize
    tokens = tokenize(text).split()
    for o in _token_boundaries_by_alignment(tokens, text):
        yield o
Code example #4
File: extractTIABs.py  Project: spyysalo/pubmed
def tokenize_multiline(text):
    return '\n'.join(tokenize(s) for s in text.split('\n'))
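A small usage illustration (a sketch only, assuming gtbtokenize is installed and that tokenize here refers to gtbtokenize.tokenize): each input line is tokenized on its own, so the line structure of an abstract is preserved.

from gtbtokenize import tokenize

def tokenize_multiline(text):
    return '\n'.join(tokenize(s) for s in text.split('\n'))

# Hypothetical two-line input; tokens come back space-separated, one output
# line per input line.
abstract = "IL-2 activates T-cells.\nThe effect was dose-dependent."
for line in tokenize_multiline(abstract).split('\n'):
    print(line.split())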
Code example #5
import json, re, sys, os, collections, csv, math, time
import numpy as np
import gtbtokenize
import networkx as nx
import sklearn.metrics as metrics
from operator import itemgetter
import pandas as pd
from utils import MatrixIO, FileUtils

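# The two regexes below split camelCase identifiers into separate words; `f`
# strips everything except lowercase letters and digits, and `tokenize` wraps
# the GTB tokenizer and lower-cases its output.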
first_cap_re = re.compile('(.)([A-Z][a-z]+)')
all_cap_re = re.compile('([a-z0-9])([A-Z])')
f = lambda x: re.sub(r'[^a-z0-9]', "", x)
tokenize = lambda x: gtbtokenize.tokenize(x).lower()
EMBEDDING_SIZE = 100

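# Corpus constants and input paths. N is presumably the document count used
# for IDF weighting; UNMAPPED_IDF_CONST is the fallback IDF assigned to words
# missing from the IDF table (see gen_word_list above).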
N = 24358723
UNMAPPED_IDF_CONST = 4
MIN_ED = 4
vector_file = "../lod_query/biomed_vectors_p.txt"
vocab_file = "../lod_query/biomed_vocab_p.txt"
idf_file = "../lod_query/idf_file.tsv"
vocab_dict = {}
enc_vocab_dict = {}
stopWords = set([
    "a", "also", "although", "am", "an", "and", "are", ".", "NNNN", "VVVV",
    "as", "at", "back", "be", "became", "because", "become", "becomes",
    "becoming", "been", "being", "bill", "both", "bottom", "but", "by", "call",
    "can", "con", "could", "de", "do", "done", "eg", "etc", "even", "ever",
    "find", "for", "found", "from", "get", "give", "go", "had", "has", "have",
    "he", "her", "here", "hers", "herself", "him", "himself", "his", "how",
    "however", "if", "in", "inc", "into", "is", "it", "its", "itself", "keep",