def __init__(self, encoding="utf8", mecab_option=default_option):
    """Record the settings and create the tagger and normalizer helpers.

    encoding     -- stored on the instance unchanged (defaults to "utf8").
    mecab_option -- stored on the instance and handed to MeCab.Tagger.
    """
    self.encoding, self.mecab_option = encoding, mecab_option
    # The tagger is built from the value just stored on the instance.
    self.tagger = MeCab.Tagger(self.mecab_option)
    self.normalizer = TextNormalizer()
#!/usr/bin/python
#encoding: utf8
"""Read UTF-8 lines from stdin and print each one after normalization.

Command-line arguments, if any, are forwarded positionally to
TextNormalizer; with no arguments it is constructed with its defaults.
"""
import sys

from NormalizeText import TextNormalizer

if __name__ == '__main__':
    # An empty argument slice expands to no arguments, so this single call
    # covers both the "no arguments" and the "arguments given" cases.
    normalizer = TextNormalizer(*sys.argv[1:])
    while 1:
        try:
            s = raw_input()
        except EOFError:
            # stdin was closed (end of a pipe/file or Ctrl-D): finish
            # quietly instead of dying with a traceback.
            break
        # Input bytes are decoded as UTF-8 before being normalized.
        print(normalizer.normalize(s.decode('utf8')))
from NormalizeText import TextNormalizer
import datetime
from BeautifulSoup import BeautifulSoup as BS
from config import *
import urllib
import urllib2
import MeCabParser
import pldautils

# A year of four or more digits, followed by "/" or 年 and one more digit.
pattern_year = re.compile(u"([1-9][0-9]{3,})[\/年][0-9]")
# Month and day, each followed by its separator, e.g. "3/5/" or "10月20日".
# Each number is one non-zero digit optionally followed by any digit, so
# values containing a zero (10, 20, 30) are matched as well.
p_date = re.compile(u"[1-9][0-9]?[\/月][1-9][0-9]?[\/日]")
# Splits a date string on "/", 月 or 日.
date_split = re.compile(u"[\/月日]").split
# Matches text containing one of a fixed set of event-related keywords
# (today, tomorrow, reservation, on sale, venue, ...).
# NOTE(review): unlike the other patterns this one is a byte string, not a
# unicode literal -- confirm which string type it is matched against.
p_event = re.compile(
    "^.*(本日|今日|明日|明後日|募集|予約|歓迎|発売|発表|開催|実施|参加|受付|会場|開場|申し込み|展示|開演|主催).*$")
# A day-of-week kanji in ASCII parentheses, e.g. "(月)".
p_youbi = re.compile(u"\([月火水木金土日]\)")
normalizer = TextNormalizer()
# Names of TV broadcasters (Latin-script entries are lower case).
tv_list = [
    u"nhk",
    u"テレ東",
    u"テレビ東京",
    u"日テレ",
    u"日本テレビ",
    u"テレビ朝日",
    u"テレ朝",
    u"tbs",
    u"フジテレビ",
]
# Endpoint of the geocoder web API queried by urlopen().
yurl = "http://geo.search.olp.yahooapis.jp/OpenLocalPlatform/V1/geoCoder"


def urlopen(q):
    """Open the geocoder URL for query *q* and return the response object.

    q -- the query text; a unicode value is encoded to UTF-8 bytes first,
         any other value is used as given.

    Returns whatever urllib2.urlopen returns for the request URL.
    `yid` (the application id) is not defined here -- presumably it comes
    from `from config import *`; confirm.
    """
    if isinstance(q, unicode):
        q = q.encode("utf8")
    # NOTE(review): q is concatenated into the URL without percent-encoding;
    # check whether callers quote it before changing this.
    return urllib2.urlopen(yurl + "?appid=" + yid + "&query=" + q)


class EventTweetTokenizer(MeCabParser.Parser):
    """Remove unneeded content from a tweet and tokenize it."""