예제 #1
0
 def __init__(self, encoding="utf8", mecab_option=default_option):
     """Store the configuration and build the MeCab tagger and normalizer.

     encoding     -- kept on the instance as ``self.encoding``
                     (default ``"utf8"``).
     mecab_option -- value handed to ``MeCab.Tagger``; defaults to the
                     ``default_option`` name defined outside this method.
     """
     self.encoding, self.mecab_option = encoding, mecab_option
     # The tagger is created from the option just stored on the instance.
     self.tagger = MeCab.Tagger(self.mecab_option)
     # A default-constructed TextNormalizer is kept next to the tagger.
     self.normalizer = TextNormalizer()
예제 #2
0
#!/usr/bin/python
#encoding: utf8

import sys
from NormalizeText import TextNormalizer

if __name__ == '__main__':
    # Command-line driver: read lines from standard input, decode each one
    # from UTF-8, pass it through TextNormalizer.normalize and print the
    # result.
    #
    # Any command-line arguments are forwarded positionally to the
    # TextNormalizer constructor; with no arguments the slice is empty, so
    # the normalizer is built with its defaults.
    normalizer = TextNormalizer(*sys.argv[1:])
    while True:
        try:
            line = raw_input()
        except EOFError:
            # End of input (Ctrl-D or an exhausted pipe): stop quietly
            # instead of terminating with a traceback.
            break
        print(normalizer.normalize(line.decode('utf8')))
예제 #3
0
from NormalizeText import TextNormalizer
import datetime
from BeautifulSoup import BeautifulSoup as BS
from config import *
import urllib
import urllib2
import MeCabParser
import pldautils

# Regular expressions for spotting date and event expressions in text.
# Backslashes meant for the regex engine are written doubled, and "/" is left
# unescaped, so the literals contain no unknown string escape sequences.

# A year of four or more digits (no leading zero), followed by "/" or "年"
# and then a digit; group 1 captures the year.
pattern_year = re.compile(u"([1-9][0-9]{3,})[/年][0-9]")
# A month followed by "/" or "月", then a day followed by "/" or "日".
# Each number is one or two digits without a leading zero; the second digit
# may be 0 so that values such as 10, 20 and 30 are matched too.
p_date = re.compile(u"[1-9][0-9]?[/月][1-9][0-9]?[/日]")
# Bound ``split`` that cuts a string on "/", "月" and "日".
date_split = re.compile(u"[/月日]").split
# A whole line containing at least one of the listed keywords (words such as
# today, tomorrow, reservation, release, venue, exhibition).
# NOTE(review): unlike the neighbouring patterns this literal has no ``u``
# prefix, so under Python 2 it is a byte-string pattern -- confirm that the
# callers match it against text of the same type.
p_event = re.compile(
    "^.*(本日|今日|明日|明後日|募集|予約|歓迎|発売|発表|開催|実施|参加|受付|会場|開場|申し込み|展示|開演|主催).*$")
# A day-of-week kanji wrapped in ASCII parentheses, e.g. "(月)".
p_youbi = re.compile(u"\\([月火水木金土日]\\)")
# Module-level TextNormalizer instance, built with default arguments when
# this module is imported.
normalizer = TextNormalizer()
# Base URL of the Yahoo! Japan Open Local Platform geocoder endpoint.
yurl = "http://geo.search.olp.yahooapis.jp/OpenLocalPlatform/V1/geoCoder"

# Names and abbreviations of TV broadcasters, one entry per line.
# NOTE(review): the ASCII entries are lower-case -- presumably compared
# against already-normalized text; confirm against the caller.
tv_list = [
    u"nhk",
    u"テレ東",
    u"テレビ東京",
    u"日テレ",
    u"日本テレビ",
    u"テレビ朝日",
    u"テレ朝",
    u"tbs",
    u"フジテレビ",
]


def urlopen(q):
    """Open the geocoder URL ``yurl`` for the query text *q*.

    q -- the search text, either a byte string or a ``unicode`` object;
         unicode input is encoded as UTF-8 first. The text must not be
         percent-encoded already, because it is escaped here.

    Returns whatever ``urllib2.urlopen`` returns for the built URL; any
    error it raises propagates to the caller.
    """
    if isinstance(q, unicode):
        q = q.encode("utf8")
    # Percent-encode the query so that spaces, "&", "#" and non-ASCII bytes
    # cannot corrupt the URL or spill into other parameters.
    query = urllib.quote(q)
    return urllib2.urlopen(yurl + "?appid=" + yid + "&query=" + query)


class EventTweetTokenizer(MeCabParser.Parser):
    """tweetから余計なものを削除してトークナイズ"""