Python Cutterの例

プログラミング言語: Python

名前空間/パッケージ名: title_lib

クラス/型: Cutter

hotexamples.comのコード掲載数: 4

Python Cutter - 4件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのtitle_lib.Cutterの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

よく使われるメソッド

表示非表示

cut(1)

init(1)

コード例 #1

ファイルを表示

ファイル: title_clean.py プロジェクト: yan9liu/title_search

def main(title_file):
    """Tokenise the titles in *title_file* and write them to ``title_file + ".clean"``.

    Every input line is split on tabs after stripping; the first field is
    used as the id (``pid``) and the second as the title.  Bracketed spans
    in the title are replaced by a single space, the result is passed
    through ``normalize`` and ``Cutter.cut``, and the resulting tokens are
    de-duplicated while keeping their first-seen order.  One output line is
    written per successfully processed input line: ``pid`` followed by each
    token, tab-separated.

    ``Cutter`` and ``normalize`` are not defined in this block; they must be
    provided by the enclosing module.

    Args:
        title_file: Path of the UTF-8 encoded input file.

    Lines that fail to process (for example, lines with no second field)
    are printed together with the exception and skipped.
    """
    # Same pattern text as the original literal; the backslashes are doubled
    # so the string no longer relies on invalid escape sequences such as "\<".
    # It matches from an opening bracket (<, (, （, 【, [) lazily up to the
    # first closing bracket (], 】, ）, ), >).
    bracketed = re.compile(u'[\\<\\(（【\\[].*?[\\]】）\\)\\>]')

    # The with-statement guarantees both files are closed even if reading,
    # decoding or writing raises part-way through.
    with codecs.open(title_file, 'r', 'utf-8') as reader, \
            codecs.open(title_file + ".clean", 'w', 'utf-8') as writer:
        for cnt, line in enumerate(reader, 1):
            # Progress indicator for large inputs.
            if cnt % 100000 == 0:
                print(cnt)
            try:
                splits = line.strip().split('\t')
                pid = splits[0]
                title = splits[1]
                title_repl = bracketed.sub(" ", title)
                # OrderedDict.fromkeys drops duplicate tokens but keeps
                # their original order.
                tokens = list(OrderedDict.fromkeys(
                    Cutter.cut(normalize(title_repl))))
            except Exception as e:
                # Deliberately best-effort: report the bad line and go on.
                print(line)
                print(e)
                continue

            writer.write(pid)
            for token in tokens:
                writer.write("\t" + token)
            writer.write("\n")

コード例 #2

ファイルを表示

ファイル: title_clean.py プロジェクト: yan9liu/title_search

def main(title_file):
    """Write a tokenised copy of *title_file* to ``title_file + ".clean"``.

    Each line is stripped and split on tabs: field 0 is taken as the id
    (``pid``) and field 1 as the title.  Bracketed spans are replaced by a
    space, then the title goes through ``normalize`` and ``Cutter.cut``.
    Duplicate tokens are removed, keeping first-seen order, and the line
    ``pid<TAB>token<TAB>token...`` is written to the output file.

    ``Cutter`` and ``normalize`` are not defined in this block; they must be
    supplied by the enclosing module.

    Args:
        title_file: Path of the UTF-8 encoded input file.

    A line that raises during processing (e.g. it has no second field) is
    printed along with the exception and then skipped.
    """
    # Identical pattern text to the original literal, with backslashes
    # doubled so the string contains no invalid escapes such as "\<".
    # Opening bracket (<, (, （, 【, [), lazy match, first closing bracket
    # (], 】, ）, ), >).
    bracket_span = re.compile(u"[\\<\\(（【\\[].*?[\\]】）\\)\\>]")

    # Context managers close both files even when an error escapes the loop.
    with codecs.open(title_file, "r", "utf-8") as reader, \
            codecs.open(title_file + ".clean", "w", "utf-8") as writer:
        for cnt, line in enumerate(reader, 1):
            # Progress indicator for large inputs.
            if cnt % 100000 == 0:
                print(cnt)
            try:
                splits = line.strip().split("\t")
                pid = splits[0]
                title = splits[1]
                title_repl = bracket_span.sub(" ", title)
                # OrderedDict.fromkeys removes duplicates, preserving order.
                tokens = list(
                    OrderedDict.fromkeys(Cutter.cut(normalize(title_repl))))
            except Exception as e:
                # Deliberately best-effort: log the offending line, move on.
                print(line)
                print(e)
                continue

            writer.write(pid)
            for token in tokens:
                writer.write("\t" + token)
            writer.write("\n")

コード例 #3

ファイルを表示

ファイル: title_clean.py プロジェクト: yan9liu/title_search

# coding:utf-8

import codecs
from collections import OrderedDict
import re
import sys

from title_lib import Cutter, normalize

# Call Cutter.init() at import time so it has run before main() uses
# Cutter.cut().  NOTE(review): what init() sets up is defined in title_lib
# and is not visible in this file.
Cutter.init()


def main(title_file):
    """Read tab-separated titles from *title_file* and tokenise each one.

    Opens *title_file* for reading and ``title_file + ".clean"`` for
    writing, both as UTF-8.  For every line, the first tab-separated field
    is taken as ``pid`` and the second as the title; bracketed spans are
    replaced by a space, and the result is passed through ``normalize`` and
    ``Cutter.cut``.

    NOTE(review): this excerpt ends inside the ``except`` handler.  In the
    visible code nothing is written to ``writer``, neither file is closed,
    and a failed line is not skipped explicitly -- presumably the listing
    was truncated; confirm against the full file.
    """
    reader = codecs.open(title_file, 'r', 'utf-8')
    writer = codecs.open(title_file + ".clean", 'w', 'utf-8')
    cnt = 0
    for line in reader:
        cnt += 1
        # Progress indicator: print the running line count every 100000 lines.
        if cnt % 100000 == 0:
            print cnt
        try:
            splits = line.strip().split('\t')
            pid = splits[0]
            # Raises IndexError (handled below) when the line has no second field.
            title = splits[1]
            # Replace everything from an opening bracket (<, (, （, 【, [) up to
            # the first closing bracket (], 】, ）, ), >) with a single space.
            title_repl = re.subn(u'[\<\(（【\[].*?[\]】）\)\>]', " ", title)[0]
            # OrderedDict.fromkeys removes duplicate tokens while keeping
            # their first-seen order.
            tokens = OrderedDict.fromkeys(Cutter.cut(
                normalize(title_repl))).keys()
        except Exception as e:
            # Best-effort handling: report the offending line and the error.
            print line
            print e

コード例 #4

ファイルを表示

ファイル: title_clean.py プロジェクト: yan9liu/title_search

# coding:utf-8

import codecs
from collections import OrderedDict
import re
import sys

from title_lib import Cutter, normalize


# Call Cutter.init() at import time so it has run before main() uses
# Cutter.cut().  NOTE(review): what init() sets up is defined in title_lib
# and is not visible in this file.
Cutter.init()


def main(title_file):
    """Read tab-separated titles from *title_file* and tokenise each one.

    Opens *title_file* for reading and ``title_file + ".clean"`` for
    writing, both as UTF-8.  For every line, the first tab-separated field
    is taken as ``pid`` and the second as the title; bracketed spans are
    replaced by a space, and the result is passed through ``normalize`` and
    ``Cutter.cut``.

    NOTE(review): the visible excerpt stops inside the ``except`` handler,
    before any use of ``writer`` or ``tokens`` and before either file is
    closed; the remainder of the function may lie outside this view.
    """
    reader = codecs.open(title_file, "r", "utf-8")
    writer = codecs.open(title_file + ".clean", "w", "utf-8")
    cnt = 0
    for line in reader:
        cnt += 1
        # Progress indicator: print the running line count every 100000 lines.
        if cnt % 100000 == 0:
            print cnt
        try:
            splits = line.strip().split("\t")
            pid = splits[0]
            # Raises IndexError (handled below) when the line has no second field.
            title = splits[1]
            # Replace everything from an opening bracket (<, (, （, 【, [) up to
            # the first closing bracket (], 】, ）, ), >) with a single space.
            title_repl = re.subn(u"[\<\(（【\[].*?[\]】）\)\>]", " ", title)[0]
            # OrderedDict.fromkeys removes duplicate tokens while keeping
            # their first-seen order.
            tokens = OrderedDict.fromkeys(Cutter.cut(normalize(title_repl))).keys()
        except Exception as e:
            # Best-effort handling: report the offending line and the error.
            print line
            print e