Code Example #1
def generate_ngrams(line):
    result = []
    line = line.strip()
    for sentence in line_filter(' '.join(default_tokenize_func(line))):
        tokens_plain = []
        sentence = sentence.split()
        i = 0
        while i < len(sentence):
            for j in range(min(len(sentence), i + 20), i, -1):
                token = ' '.join(sentence[i:j])
                if i + 1 == j and i == 0:
                    # if first word in sentence -> do not attempt to link, could be wrong (Apple)
                    tokens_plain.append(token.lower())
                elif token in unambiguous_labels:
                    # TODO: check it doesn't span titles
                    uri = unambiguous_labels[token]
                    # get types
                    tokens_plain.append('<dbpedia:' + uri + '>')
                    i = j - 1
                    break
            i += 1
        for n in range(1, N + 1):
            for ngram in nltk.ngrams(tokens_plain, n):
                result.append((' '.join(ngram), 1))
    return result
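
The helpers line_filter and default_tokenize_func, the unambiguous_labels dictionary and the n-gram order N are module-level names defined elsewhere in the project this listing comes from. Below is a minimal, self-contained sketch of the same greedy longest-match idea, with a made-up toy_labels dictionary and a plain counting loop in place of nltk.ngrams; every name in the sketch is illustrative, not the project's.

def toy_generate_ngrams(sentence, toy_labels, n_max=2, max_span=20):
    # Greedy longest-match linking: at each position try the longest span
    # first (up to max_span tokens) and replace a dictionary hit with its
    # linked URI token, then skip past the matched span.
    words = sentence.split()
    tokens_plain = []
    i = 0
    while i < len(words):
        for j in range(min(len(words), i + max_span), i, -1):
            token = ' '.join(words[i:j])
            if i + 1 == j and i == 0:
                # the first single word of a sentence is never linked ("Apple")
                tokens_plain.append(token.lower())
            elif token in toy_labels:
                tokens_plain.append('<dbpedia:' + toy_labels[token] + '>')
                i = j - 1
                break
        i += 1
    # plain n-gram counting in place of nltk.ngrams
    result = []
    for n in range(1, n_max + 1):
        for k in range(len(tokens_plain) - n + 1):
            result.append((' '.join(tokens_plain[k:k + n]), 1))
    return result

toy_labels = {'new york': 'New_York_City', 'barack obama': 'Barack_Obama'}
print(toy_generate_ngrams('yesterday barack obama visited new york', toy_labels))

Because spans are tried longest-first, a multi-word label is always preferred over its individual words, which is what lets "barack obama" become a single linked token here.
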
Code Example #2
def unpack_achors(line):
    label, uri_list = line.split('\t')
    # tokenize for commas
    label = ' '.join(tokenize_possessive(default_tokenize_func(label)))
    # all (uri, count) pairs recorded for this label
    uri_counts = ListPacker.unpack(uri_list)
    total_count = sum(int(c) for _, c in uri_counts)
    print(label + '\t' + str(len(uri_counts)) + '\t' + str(total_count))
Code Example #3
def link(sentence):
    tokens = default_tokenize_func(sentence)
    pos_tokens = nltk.pos_tag(tokens)
    candidates = extract_candidates(pos_tokens)
    if len(candidates) > 0:
        graph = SemanticGraph(candidates)
        graph.do_iterative_removal()
        graph.do_linking()
    return candidates
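
SemanticGraph, extract_candidates, do_iterative_removal and do_linking are project internals that are not shown in this listing. As a rough, hedged illustration only (this is an assumption, not the project's extract_candidates), a candidate extractor over nltk.pos_tag output could group maximal runs of proper nouns into mentions:

def toy_extract_candidates(pos_tokens):
    # Group maximal runs of proper nouns (NNP/NNPS) into mention candidates.
    candidates = []
    current = []
    for word, tag in pos_tokens:
        if tag in ('NNP', 'NNPS'):
            current.append(word)
        else:
            if current:
                candidates.append(' '.join(current))
            current = []
    if current:
        candidates.append(' '.join(current))
    return candidates

tagged = [('Barack', 'NNP'), ('Obama', 'NNP'), ('visited', 'VBD'),
          ('Berlin', 'NNP'), ('.', '.')]
print(toy_extract_candidates(tagged))  # ['Barack Obama', 'Berlin']
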
Code Example #4
def unpack_achors(line):
    label, uri_list = line.split('\t')
    # tokenize for commas
    label = ' '.join(tokenize_possessive(default_tokenize_func(label)))
    # should be only one
    uri_counts = ListPacker.unpack(uri_list)
    if len(uri_counts) > 1:
        return
    uri, count = uri_counts[0]
    print(label + '\t' + uri + '\t' + count)
Code Example #5
def filter_labels(line):
    label, uri_list = line.split('\t')
    # tokenize for commas
    label = ' '.join(tokenize_possessive(default_tokenize_func(label)))
    # all (uri, count) pairs recorded for this label
    uri_counts = [(uri, int(count))
                  for uri, count in ListPacker.unpack(uri_list)]
    total = sum(count for _, count in uri_counts)

    for uri, count in uri_counts:
        if count / total > args.percentile and count > args.min_count:
            print(label + '\t' + uri + '\t' + str(count))
            break
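
filter_labels depends on the project's ListPacker format and on the command-line thresholds args.percentile and args.min_count. The sketch below keeps only the dominance test, with the thresholds passed in explicitly; the function name and the threshold values are illustrative, not the project's.

def dominant_uri(uri_counts, percentile=0.9, min_count=5):
    # uri_counts: list of (uri, count) pairs for one label, counts as ints.
    # Return the dominant URI if it gets a large enough share of the anchor
    # occurrences and a minimum absolute count, otherwise None.
    total = sum(count for _, count in uri_counts)
    for uri, count in uri_counts:
        if count / total > percentile and count > min_count:
            return uri
    return None

print(dominant_uri([('Paris', 29), ('Paris,_Texas', 1)]))   # Paris
print(dominant_uri([('Apple_Inc.', 12), ('Apple', 10)]))     # None
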
Code Example #6
def generate_ngrams(line):
    labels = []
    line = line.strip()
    for sentence in line_filter(' '.join(
            tokenize_possessive(default_tokenize_func(line)))):
        sentence = sentence.split()
        i = 0
        while i < len(sentence):
            for j in range(min(len(sentence), i + 20), i, -1):
                token = ' '.join(sentence[i:j])
                if i + 1 == j and i == 0:
                    # if first word in sentence -> skip, could be wrong (Apple)
                    continue
                elif token in organic_label_dict:
                    labels.append(((token, organic_label_dict[token]), 1))
                    i = j - 1
                    break
            i += 1
    return labels