コード例 #1
0
ファイル: title_clean.py プロジェクト: yan9liu/title_search
def main(title_file):
    """Tokenize the titles in *title_file* into ``<title_file>.clean``.

    Every input line is split on tabs; the first field is taken as the id
    and the second as the title.  Bracketed spans in the title are replaced
    by a space, the result is passed through ``normalize`` and
    ``Cutter.cut``, and the de-duplicated tokens are written after the id,
    tab-separated, one record per output line.

    Lines that cannot be parsed or tokenized are printed together with the
    error and skipped.  The running line count is printed every 100000
    lines.

    Args:
        title_file: Path of the UTF-8 encoded input file.
    """
    # Compile once up front instead of resolving the pattern on every line.
    bracketed = re.compile(u'[\<\((【\[].*?[\]】)\)\>]')

    # The with-statement closes both files even when reading, decoding or
    # writing raises part-way through the loop.
    with codecs.open(title_file, 'r', 'utf-8') as reader, \
            codecs.open(title_file + ".clean", 'w', 'utf-8') as writer:
        for cnt, line in enumerate(reader, 1):
            if cnt % 100000 == 0:
                print(cnt)
            try:
                splits = line.strip().split('\t')
                pid = splits[0]
                title = splits[1]
                title_repl = bracketed.sub(" ", title)
                # OrderedDict drops repeated tokens while keeping the order
                # in which they first appeared.
                tokens = list(OrderedDict.fromkeys(
                    Cutter.cut(normalize(title_repl))))
            except Exception as e:
                # Deliberately best-effort: report the offending line and
                # carry on with the rest of the file.
                print(line)
                print(e)
                continue

            writer.write(pid)
            for token in tokens:
                writer.write("\t" + token)
            writer.write("\n")
コード例 #2
0
ファイル: title_clean.py プロジェクト: yan9liu/title_search
def main(title_file):
    """Write a tokenized copy of *title_file* to ``<title_file>.clean``.

    Each input line is split on tabs, with the first field used as the id
    and the second as the title.  Bracketed spans in the title are replaced
    by a space, the title is run through ``normalize`` and ``Cutter.cut``,
    and the unique tokens (first-seen order) are written after the id,
    separated by tabs, one record per line.

    A line that fails to parse or tokenize is printed along with the error
    and skipped.  The number of lines read so far is printed every 100000
    lines.

    Args:
        title_file: Path of the UTF-8 encoded input file.
    """
    # Hoisted out of the loop: the pattern never changes between lines.
    bracket_span = re.compile(u"[\<\((【\[].*?[\]】)\)\>]")

    # Both handles are released by the with-statement, including when an
    # exception escapes the loop (e.g. a decode error while iterating).
    with codecs.open(title_file, "r", "utf-8") as reader, \
            codecs.open(title_file + ".clean", "w", "utf-8") as writer:
        for cnt, line in enumerate(reader, 1):
            if cnt % 100000 == 0:
                print(cnt)
            try:
                splits = line.strip().split("\t")
                pid = splits[0]
                title = splits[1]
                title_repl = bracket_span.sub(" ", title)
                # use OrderedDict to keep token order while removing repeats
                tokens = list(
                    OrderedDict.fromkeys(Cutter.cut(normalize(title_repl))))
            except Exception as e:
                # Best-effort processing: log the bad line, keep going.
                print(line)
                print(e)
                continue

            # One write per record: id followed by its tab-separated tokens.
            writer.write("\t".join([pid] + tokens) + "\n")
コード例 #3
0
ファイル: title_clean.py プロジェクト: yan9liu/title_search
# coding:utf-8

import codecs
from collections import OrderedDict
import re
import sys

from title_lib import Cutter, normalize

# Module-level call: runs once when this file is imported or executed, before
# main() can be invoked.
# NOTE(review): presumably prepares the tokenizer used by Cutter.cut() below --
# confirm against title_lib.
Cutter.init()


def main(title_file):
    """Parse and tokenize each line of *title_file*.

    Opens ``title_file`` for reading and ``title_file + ".clean"`` for
    writing (both as UTF-8).  For every line it splits on tabs, takes the
    first field as the id and the second as the title, replaces bracketed
    spans in the title with a space, and tokenizes the normalized result.
    A line that raises during this step is printed together with the error.

    NOTE(review): this excerpt looks truncated -- in the code shown here
    nothing is written to ``writer`` and neither file is closed.
    """
    reader = codecs.open(title_file, 'r', 'utf-8')
    writer = codecs.open(title_file + ".clean", 'w', 'utf-8')
    cnt = 0
    for line in reader:
        cnt += 1
        # Progress indicator: print the running line count every 100000 lines.
        if cnt % 100000 == 0:
            print cnt
        try:
            splits = line.strip().split('\t')
            # Raises IndexError (handled below) when the line has no tab.
            pid = splits[0]
            title = splits[1]
            # Replace each bracketed span (opening bracket, shortest run of
            # characters, closing bracket) with a single space.
            title_repl = re.subn(u'[\<\((【\[].*?[\]】)\)\>]', " ", title)[0]
            # use OrderedDict to keep token order
            tokens = OrderedDict.fromkeys(Cutter.cut(
                normalize(title_repl))).keys()
        except Exception as e:
            # Report the offending line and the error it triggered.
            print line
            print e
コード例 #4
0
ファイル: title_clean.py プロジェクト: yan9liu/title_search
# coding:utf-8

import codecs
from collections import OrderedDict
import re
import sys

from title_lib import Cutter, normalize


# Module-level call: runs once when this file is imported or executed, before
# main() can be invoked.
# NOTE(review): presumably prepares the tokenizer used by Cutter.cut() below --
# confirm against title_lib.
Cutter.init()


def main(title_file):
    reader = codecs.open(title_file, "r", "utf-8")
    writer = codecs.open(title_file + ".clean", "w", "utf-8")
    cnt = 0
    for line in reader:
        cnt += 1
        if cnt % 100000 == 0:
            print cnt
        try:
            splits = line.strip().split("\t")
            pid = splits[0]
            title = splits[1]
            title_repl = re.subn(u"[\<\((【\[].*?[\]】)\)\>]", " ", title)[0]
            # use OrderedDict to keep token order
            tokens = OrderedDict.fromkeys(Cutter.cut(normalize(title_repl))).keys()
        except Exception as e:
            print line
            print e