class Parser(object):
    """Wrapper around a MeCab tagger that normalizes text before parsing.

    Text is run through ``TextNormalizer.normalize`` and encoded with
    ``encoding`` before it is handed to ``MeCab.Tagger.parseToNode``.
    """

    def __init__(self, encoding="utf8", mecab_option=default_option):
        """Create the tagger and the text normalizer.

        encoding     -- codec used to decode byte-string input and to encode
                        the normalized text passed to MeCab.
        mecab_option -- option string passed to ``MeCab.Tagger``.
        """
        self.encoding = encoding
        self.mecab_option = mecab_option
        self.tagger = MeCab.Tagger(self.mecab_option)
        self.normalizer = TextNormalizer()

    def node(self, s):
        """Return the result of ``parseToNode`` for the normalized form of *s*.

        A byte string is first decoded with ``self.encoding``; the text is
        then normalized and re-encoded.  If any of those steps raises, the
        empty string is parsed instead (best effort, no exception escapes).
        """
        try:
            if isinstance(s, str):
                s = s.decode(self.encoding)
            s = self.normalizer.normalize(s)
            s = s.encode(self.encoding)
        except Exception:
            # Deliberate best effort: unparseable input degrades to "".
            # ``Exception`` (rather than a bare ``except``) lets
            # KeyboardInterrupt and SystemExit propagate.
            s = ""
        # NOTE(review): some MeCab bindings need the input buffer to outlive
        # the returned node -- confirm against the binding in use.
        return self.tagger.parseToNode(s)

    def parse(self, s, to_unicode=False):
        """Return the list of non-empty token surfaces found in *s*.

        s          -- text to tokenize (see ``node`` for accepted input).
        to_unicode -- when true, each surface is decoded with
                      ``self.encoding`` before being collected.
        """
        tokens = []
        node = self.node(s)
        while node:
            surface = node.surface
            # Nodes with an empty surface are skipped.
            if surface != "":
                if to_unicode:
                    surface = surface.decode(self.encoding)
                tokens.append(surface)
            node = node.next
        return tokens
#!/usr/bin/python
#encoding: utf8
# Command-line filter: read lines from standard input, decode them as UTF-8,
# and print each one after passing it through TextNormalizer.normalize.
import sys

from NormalizeText import TextNormalizer

if __name__ == '__main__':
    # Any command-line arguments are forwarded to TextNormalizer; with none
    # given, the empty slice unpacks to a plain TextNormalizer() call.
    normalizer = TextNormalizer(*sys.argv[1:])
    while True:
        try:
            line = raw_input()
        except EOFError:
            # End of input: stop quietly instead of dying with a traceback.
            break
        print(normalizer.normalize(line.decode('utf8')))
def __init__(self, encoding="utf8", mecab_option=default_option):
    """Store the configuration and build the tagger and normalizer.

    encoding     -- stored as ``self.encoding``.
    mecab_option -- stored as ``self.mecab_option`` and passed to
                    ``MeCab.Tagger``.
    """
    # Keep the configuration on the instance.
    self.encoding, self.mecab_option = encoding, mecab_option
    # Collaborators are created in the same order as before: tagger first,
    # then the normalizer.
    self.tagger = MeCab.Tagger(mecab_option)
    self.normalizer = TextNormalizer()
from NormalizeText import TextNormalizer
import datetime
# ``re`` is used below but was not imported in the visible part of this
# module; importing it before the star import keeps name resolution unchanged
# if ``config`` already re-exports it.
import re
from BeautifulSoup import BeautifulSoup as BS
from config import *
import urllib
import urllib2
import MeCabParser
import pldautils

# Four-or-more digit year followed by "/" or 年 and another digit.
pattern_year = re.compile(u"([1-9][0-9]{3,})[\/年][0-9]")
# Month/day expression such as 5/7/ or 5月7日.
# NOTE(review): the digit classes exclude 0, so e.g. 10月20日 does not match --
# confirm whether that is intended.
p_date = re.compile(u"[1-9]{1,2}[\/月][1-9]{1,2}[\/日]")
# Splits a date expression on "/", 月 or 日.
date_split = re.compile(u"[\/月日]").split
# Lines containing one of the listed event-related keywords.
# NOTE(review): this is a byte-string pattern, unlike the u"" patterns around
# it -- confirm it is only matched against byte strings.
p_event = re.compile(
    "^.*(本日|今日|明日|明後日|募集|予約|歓迎|発売|発表|開催|実施|参加|受付|会場|開場|申し込み|展示|開演|主催).*$")
# Parenthesized day-of-week marker such as (月).
p_youbi = re.compile(u"\([月火水木金土日]\)")

normalizer = TextNormalizer()

# Names of TV broadcasters.
tv_list = [
    u"nhk",
    u"テレ東",
    u"テレビ東京",
    u"日テレ",
    u"日本テレビ",
    u"テレビ朝日",
    u"テレ朝",
    u"tbs",
    u"フジテレビ",
]

# Endpoint queried by ``urlopen``.
yurl = "http://geo.search.olp.yahooapis.jp/OpenLocalPlatform/V1/geoCoder"


def urlopen(q):
    """Open the ``yurl`` endpoint for query *q* and return the response.

    q -- query text; a ``unicode`` value is encoded to UTF-8 first.

    Returns whatever ``urllib2.urlopen`` returns for the built URL.  The
    application id ``yid`` is not defined in the visible part of this module
    (presumably supplied by ``from config import *`` -- confirm).
    """
    if isinstance(q, unicode):
        q = q.encode("utf8")
    # Percent-encode only what is illegal in a URL (spaces, non-ASCII bytes,
    # quotes, ...).  "%" and the reserved characters are left alone so that
    # callers passing already-encoded or multi-parameter strings get exactly
    # the URL they got before.
    url_safe = "%/:=&?~#+!$,;'@()*[]"
    q = urllib.quote(q, safe=url_safe)
    return urllib2.urlopen(yurl + "?appid=" + yid + "&query=" + q)


class EventTweetTokenizer(MeCabParser.Parser):
    """Strip unneeded content from a tweet and tokenize it."""