def test_cnvk():
    logging.info("=========================================")
    logging.info("=                 cnvk                  =")
    logging.info("=========================================")
    test_cases = get_test_cases()
    for tc in test_cases:
        title = tc['title']
        body = tc['body']

        logging.info("hiragana (full-width) to katakana (full-width) for %s" % title)
        calc_time(cnvk.convert, body, cnvk.HIRA2KATA)
        logging.debug("result: %s" % cnvk.convert(body, cnvk.HIRA2KATA))

        logging.info("katakana (full-width) to hiragana (full-width) for %s" % title)
        calc_time(cnvk.convert, body, cnvk.Z_KATA, cnvk.KATA2HIRA)
        logging.debug("result: %s" % cnvk.convert(body, cnvk.Z_KATA, cnvk.KATA2HIRA))

        logging.info("hiragana (full-width) to katakana (half-width) for %s" % title)
        calc_time(cnvk.convert, body, cnvk.HIRA2KATA, cnvk.H_KATA)
        logging.debug("result: %s" % cnvk.convert(body, cnvk.HIRA2KATA, cnvk.H_KATA))

        logging.info("half-width to full-width for %s" % title)
        calc_time(cnvk.convert, body, cnvk.Z_ASCII)
        logging.debug("result: %s" % cnvk.convert(body, cnvk.Z_ASCII))

        logging.info("full-width to half-width for %s" % title)
        calc_time(cnvk.convert, body, cnvk.H_ASCII)
        logging.debug("result: %s" % cnvk.convert(body, cnvk.H_ASCII))
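# test_cnvk() above relies on two helpers that are not shown in this section;
# the sketch below is an assumption reconstructed from how they are called,
# not the actual implementation.
import logging
import time

def get_test_cases():
    # Hypothetical fixture: the test only needs dicts with 'title' and 'body'.
    return [{'title': 'sample', 'body': u'ひらがなとカタカナとtext123'}]

def calc_time(func, *args):
    # Time one call to func and log the elapsed wall-clock time.
    start = time.time()
    func(*args)
    logging.info("elapsed: %f sec" % (time.time() - start))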
# Shared character-replacement table, factored out of the identical inline
# mapping arguments that were repeated in every cnvk.convert() call below.
# (The {u"-": u"‐"} entry appears twice in the source and is kept as-is.)
EXTRA_MAPS = [
    {u"⋯": u"…"}, {u"–": u"―"}, {u"—": u"―"}, {u"-": u"‐"}, {u"-": u"‐"},
    {u"~": u"〜"}, {u"·": u"・"}, {u"⋅": u"・"}, {u" ": u" "}, {u"›": u"〉"},
    {u"‹": u"〈"}, {u"»": u"》"}, {u"«": u"《"}, {u"≥": u"≧"}, {u"≤": u"≦"},
    {u"µ": u"μ"}, {u"〝": u"“"}, {u"〟": u"”"}, {u"⁄": u"/"}, {u"=": u"="},
]


def sampling(http, cou, number):
    list2 = []
    html2 = requests.get(http)
    soup2 = BeautifulSoup(html2.content, "html.parser")
    # Drop footnote markers such as [1] before extracting text.
    [s.extract() for s in soup2('sup')]
    #[s.replace_with('削除済') for s in soup2(text=re.compile('#'))]
    title_and_trash = soup2.select('[class~=firstHeading]')
    title = title_and_trash[0].get_text()
    title = cnvk.convert(title, cnvk.Z_ASCII, cnvk.Z_KATA, *EXTRA_MAPS)
    title, cou2 = filt(title, cou)
    starting_point = soup2.select('[class~=toclimit-3],[class~=toc]')
    if len(starting_point) == 0:
        return cou, False
    fo = codecs.open('Random_Contents.txt', 'a', 'utf-8')
    fo2 = codecs.open('Random_Lists.txt', 'a', 'utf-8')
    print(title)
    ti = '\n=' + title + '=\n'
    line2 = ti.strip()
    Len = len(line2)
    number += Len
    fo.write(ti)
    fo2.write(ti)
    fo.close()
    fo2.close()
    # Lead paragraphs sit before the table of contents; find_previous_siblings
    # returns them nearest-first, so reverse to restore document order.
    followers = starting_point[0].find_previous_siblings('p')
    for k in followers:
        follower = k.get_text()
        list2.append(follower)
    list2.reverse()
    for line in list2:
        line = cnvk.convert(line, cnvk.ZAC, cnvk.ZK, *EXTRA_MAPS)
        line, cou2 = filt(line, cou2)
        line2 = line.strip()
        Len = len(line2)
        number += Len
        fo = codecs.open('Random_Contents.txt', 'a', 'utf-8')
        fo.write(line)
        fo.close()
    return cou2, html2
def TITLEs(url, cou):
    html2 = requests.get(url)
    soup2 = BeautifulSoup(html2.content, "html.parser")
    [s.extract() for s in soup2('sup')]
    title_and_trash = soup2.select('[class~=firstHeading]')
    title = title_and_trash[0].get_text()
    title = cnvk.convert(title, cnvk.Z_ASCII, cnvk.Z_KATA, *EXTRA_MAPS)
    title, _ = filt(title, cou)
    return title
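# A hedged sketch of how sampling() and sampling_detail() appear to chain
# together (the URL and variable names here are illustrative, not from the
# source):
#
#   url = "https://ja.wikipedia.org/wiki/Special:Random"
#   cou, html2 = sampling(url, cou, number)   # html2 is the requests response,
#   if html2:                                 # or False when no TOC was found
#       cou = sampling_detail(html2, cou, number)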
def tf_cnvk(text):
    # Normalize half-width characters to full-width (for feeding into KH Coder).
    return cnvk.convert(text, cnvk.Z_NUM, cnvk.Z_ALPHA, cnvk.Z_KIGO, cnvk.Z_KATA)  # KATA2HIRA
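# A quick usage sketch for tf_cnvk(). The exact output depends on cnvk's
# conversion tables, but with the Z_NUM/Z_ALPHA/Z_KIGO/Z_KATA flags the
# half-width digits, letters, symbols, and katakana below should come back
# in their full-width forms:
#
#   print(tf_cnvk(u"KHcoder 2.0テスト!"))
#   # expected (assumed): u"KHcoder 2.0テスト!"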
# coding: utf-8
'''
Created on 2014/02/20

@author: nahki
'''
import cnvk

if __name__ == '__main__':
    ex_word = ''
    of = open('kaomoji_zenkaku.csv', 'w', 1000)
    with open('kaomoji_jisho.txt', 'r', 1000) as f:
        for line in f:
            # Normalize each emoticon to full-width before writing it out.
            word = cnvk.convert(line.upper(), cnvk.Z_NUM, cnvk.Z_ALPHA,
                                cnvk.Z_KIGO, cnvk.Z_KATA).strip()
            # Skip a word identical to the previous line.
            if word == ex_word:
                ex_word = word
                print word
                continue
            cost = int(max(-32768, 6000 - 200 * len(word) ** 1.3))
            res = u"%s,,,%d,名詞,一般,*,*,*,*,*,*,*,%s\n" % (word, cost, u"kaomoji")
            of.write(res)
            ex_word = word
    of.close()
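# The rows written above use the MeCab user-dictionary CSV format:
#
#   surface,left-id,right-id,cost,POS fields...,source-tag
#
# Leaving the two context-ID fields empty lets mecab-dict-index assign them
# automatically; 名詞,一般 registers each emoticon as a general noun, and the
# trailing "kaomoji" is a free-form source tag. The cost formula gives shorter
# entries weaker (higher) costs, e.g. a 7-character emoticon gets
# int(max(-32768, 6000 - 200 * 7 ** 1.3)) = 3490.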
fin_name = 'jawiki-latest-all-titles-in-ns0'
fout_name = 'wikipedia.csv'

fin = codecs.open(fin_name, "r", "utf-8")
fout = codecs.open(fout_name, "w", "utf-8")
for line in fin:
    word = line.rstrip()
    # Skip titles that are too short, purely numeric, date-like,
    # dot-prefixed, or contain commas (they would break the CSV).
    if len(word) <= 3:
        continue
    if re.compile(r'^[-.0-9]+$').search(word) is not None:
        continue
    if re.compile(r'/[0-9]{4}.').search(word) is not None:
        continue
    if re.compile(r'^\.').search(word) is not None:
        continue
    if re.compile(r',').search(word) is not None:
        continue
    print word
    word = word.upper()
    # word = word.replace(u'"', u'""')
    # word = word.replace(u'〜', u'~')
    """
    Tweaking

        score = [-36000.0, -400 * (title.size ** 1.5)].max.to_i

    a little gave good results. The standard score for nouns in naist-jdic is
    around 6000, so the formula was adjusted to spread the distribution a bit
    wider from there down toward -32768, the minimum of a 16-bit signed
    integer:

        score = [-32768.0, (6000 - 200 * (title.size ** 1.3))].max.to_i

    With this formula, Japanese nouns up to roughly 20 characters (in UTF-8)
    end up spread across the range.
    """
    cost = int(max(-32768, 6000 - 200 * len(word) ** 1.3))
    word = cnvk.convert(word.replace(u'_', u' '), cnvk.Z_NUM, cnvk.Z_ALPHA,
                        cnvk.Z_KIGO, cnvk.Z_KATA).strip()
    fout.write(u"\"%s\",,,%d,名詞,一般,*,*,*,*,*,*,*,%s\n" % (word, cost, u"Wikipedia"))
fin.close()
fout.close()
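# For a feel of how the cost formula spreads scores by title length, this
# standalone, illustrative loop prints the cost for a few lengths; longer
# titles get lower costs, floored at -32768:
#
#   for n in (4, 10, 20, 40):
#       print n, int(max(-32768, 6000 - 200 * n ** 1.3))
#   # 4 -> 4787, 10 -> 2009, 20 -> -3825, 40 -> -18194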
def make_stopwords():
    u"""Build and print a stopword set ready for copy-pasting."""
    import mojimoji
    import cnvk
    stopwords = set()

    # Single hiragana characters, their katakana forms, and half-width katakana.
    hira = u"あいうえおかきくけこさしすせそたちつてとなにぬねのはひふへほまみむめもらりるれろやゐゆゑよわをんがぎぐげござじずぜぞだぢづでどばびぶべぼぱぴぷぺぽぁぃぅぇぉゃゅょっゔ"
    kata = []
    for h in hira:
        kata.append(cnvk.convert(h, cnvk.HIRA2KATA, cnvk.Z_KATA))
    kata.append(u"ヴ")
    hankata = []
    for k in kata:
        hankata.append(mojimoji.zen_to_han(k))

    # Digits: half-width, two-digit numbers 10-20, full-width, kanji numerals.
    kazu = u"0123456789"
    for n in range(10, 21):
        stopwords.add(unicode(n))
    zenkazu = mojimoji.han_to_zen(kazu)
    kazukan = u"一二三四五六七八九十百千万億兆"

    # Latin alphabet, half- and full-width.
    minialpha = u"abcdefghijklmnopqrstuvwxyz"
    bigalpha = u"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    han_minialpha = mojimoji.han_to_zen(minialpha)
    han_bigalpha = mojimoji.han_to_zen(bigalpha)

    # Common hiragana function words, plus their katakana and half-width forms.
    hiramoji = [
        u"する", u"なる", u"てる", u"れる", u"やる", u"いる", u"さん", u"なん", u"くん", u"それ", u"こと",
        u"ちゃん", u"ある", u"これ", u"して", u"くれる", u"くださる", u"そう", u"せる", u"した", u"いか",
        u"ので", u"よう", u"てるん", u"もん", u"られる", u"あそこ", u"あたり", u"あちら", u"あっち", u"あと",
        u"あな", u"あなた", u"あれ", u"いくつ", u"いつ", u"いま", u"いろいろ", u"うち", u"おおまか", u"おまえ", u"おれ",
        u"がい", u"かく", u"かたちの", u"かやの", u"から", u"がら", u"きた", u"こせ", u"ここ", u"こっち", u"こと", u"ごと",
        u"こちら", u"これ", u"これら", u"ごろ", u"さまざま", u"さらい", u"しかた", u"しよう", u"すか", u"ずつ", u"すね",
        u"そう", u"そこ", u"そちら", u"そっち", u"そで", u"それ", u"それぞれ", u"それなり", u"たくさん", u"たち", u"たび",
        u"ため", u"ちゃ", u"てん", u"とおり", u"とき", u"どこ", u"どこか", u"ところ", u"どちら", u"どれ", u"なか", u"なかば",
        u"なに", u"など", u"なん", u"はじめ", u"はず", u"はるか", u"ひと", u"ひとつ", u"ふく", u"ぶり", u"べつ", u"へん", u"べん",
        u"ほう", u"ほか", u"まさ", u"まし", u"まとも", u"まま", u"みたい", u"みつ", u"みなさん", u"みんな", u"もと", u"もの",
        u"もん", u"やつ", u"よう", u"よそ", u"わけ", u"わたし", u"くる", u"すぎる", u"れる", u"いう", u"くださる", u"ちゃう",
        u"つく", u"せる", u"てるん", u"すぎ", u"ところ", u"おれ", u"ぼく", u"わたし", u"てる", u"しまう", u"みる",
    ]
    katamoji = []
    for h in hiramoji:
        katamoji.append(cnvk.convert(h, cnvk.HIRA2KATA, cnvk.Z_KATA))
    han_katamoji = []
    for k in katamoji:
        han_katamoji.append(mojimoji.zen_to_han(k))

    # Common kanji words.
    kanmoji = [
        u"笑", u"今", u"気", u"今日", u"明日", u"方", u"人", u"俺", u"私", u"僕", u"時", u"思う", u"行く",
        u"言う", u"見る", u"出す", u"年", u"月", u"日", u"分", u"秒", u"週", u"火", u"水", u"木", u"金", u"土",
        u"国", u"都", u"道", u"府", u"県", u"市", u"区", u"町", u"村", u"各", u"第", u"何", u"的", u"度", u"達",
        u"誰", u"者", u"類", u"用", u"別", u"等", u"際", u"系", u"品", u"化", u"所", u"毎", u"回", u"匹", u"個",
        u"席", u"束", u"歳", u"円", u"毎", u"前", u"後", u"左", u"右", u"次", u"先", u"春", u"夏", u"秋", u"冬",
        u"下記", u"上記", u"時間", u"今回", u"前回", u"場合", u"自分", u"ヶ所", u"ヵ所", u"カ所", u"箇所",
        u"ヶ月", u"カ月", u"箇月", u"名前", u"本当", u"確か", u"時点", u"様々", u"結局", u"半ば", u"以前",
        u"以後", u"以降", u"未満", u"以上", u"以下", u"毎日", u"自体", u"何人", u"手段", u"感じ", u"同じ",
        u"点", u"君",
    ]

    # Symbols: every half-width symbol cnvk knows, plus a few extras.
    h_kigou = cnvk.H_KIGO
    kigou = []
    for h in h_kigou:
        for x in h:
            kigou.append(x)
    kigou.append(u"ω")
    kigou.append(u"ー")
    kigou.append(u"д")

    # Reference: Japanese stopwords for keyword extraction suited to inferring
    # content (https://www.jstage.jst.go.jp/article/jjske/12/4/12_511/_pdf)
    kokubu_words = [
        u"ない", u"高い", u"多い", u"少ない", u"強い", u"大きい", u"小さい", u"長い", u"ながい",
        u"良い", u"よい", u"いい", u"悪い",
        u"ある", u"いる", u"なる", u"行く", u"いく", u"来る", u"とる",
        u"見る", u"みる", u"言う", u"いう", u"得る", u"過ぎる", u"すぎる",
        u"する", u"やる", u"行なう", u"行う", u"おこなう", u"出来る", u"できる",
        u"おもう", u"思う", u"考える", u"かんがえる", u"わかる", u"見える",
        u"知る", u"しれる", u"いえる", u"示す", u"述べる", u"書く", u"かく", u"よる",
        u"異なる", u"違う", u"ちがう", u"くらべる",
        u"入れる", u"出る", u"でる", u"入る", u"はいる",
        u"使う", u"用いる", u"もちいる", u"持つ", u"もつ", u"作る", u"つくる",
        u"なす", u"起こる", u"おこる", u"つく", u"つける", u"聞く", u"よぶ",
        u"かれる", u"つまり", u"上", u"下", u"次", u"つぎ",
        u"わが国", u"自分", u"人々", u"人びと", u"別", u"他", u"間", u"話", u"例", u"形", u"日",
        u"家", u"手", u"名", u"身",
        u"そのもの", u"一つ", u"あと",
        # 2016/01/24: added more unevenly distributed words and hiragana that
        # had been forgotten.
        u"きゃ", u"きゅ", u"きょ", u"しゃ", u"しゅ", u"しょ", u"ちゃ", u"ちゅ", u"ちょ",
        u"にゃ", u"にゅ", u"にょ",
        u"ひゃ", u"ひゅ", u"ひょ", u"みゃ", u"みゅ", u"みょ", u"りゃ", u"りゅ", u"りょ", u"ゎ",
        u"事", u"目", u"とこ", u"中", u"字", u"お前", u"全部", u"きみ", u"もらう",
    ]

    for h in hira:
        stopwords.add(h)
    for k in kata:
        stopwords.add(k)
    for h in hankata:
        stopwords.add(h)
    for k in kazu:
        stopwords.add(k)
    for z in zenkazu:
        stopwords.add(z)
    for k in kazukan:
        stopwords.add(k)
    for m in minialpha:
        stopwords.add(m)
    for b in bigalpha:
        stopwords.add(b)
    for h in han_minialpha:
        stopwords.add(h)
    for h in han_bigalpha:
        stopwords.add(h)
    for h in hiramoji:
        stopwords.add(h)
    for k in katamoji:
        stopwords.add(k)
    for h in han_katamoji:
        stopwords.add(h)
    for k in kanmoji:
        stopwords.add(k)
    for k in kigou:
        stopwords.add(k)
    for k in kokubu_words:
        stopwords.add(k)

    # Print the set as a copy-pasteable Python literal.
    print "set([",
    for s in sorted(stopwords):
        print "u\"{0}\",".format(s),
    print "])"
import re
import cnvk

# Character inventory of JIS X 0208, read from a UTF-8 (with BOM) listing,
# plus newline and space so they survive filtering.
jisx0208 = []
with open("unicode.txt", "r", encoding="utf-8-sig") as f:
    for line in f:
        line = line.strip()
        jisx0208.append(line)
jisx0208.append('\n')
jisx0208.append(" ")
Jisx0208 = set(jisx0208)

"""
with open("Wikitexttrueb.full", "r") as f, open("Wikitexttruea.full", "a") as g:
    for line in f:
        #line = cnvk.convert(line, cnvk.Z_ASCII, cnvk.Z_KATA)
        line = re.sub('[0-90-9]+', '*', line)
        #line = re.sub("\*[\*]+", "*", line)
        g.write(line)
"""

with open("Random_Contentsadjusttrueb.full", "r") as f, \
     open("Random_Contentsadjusttrueaver2.full", "a") as g:
    for line in f:
        # Normalize to full-width ASCII/katakana, then mask digit runs with '*'.
        line = cnvk.convert(line, cnvk.Z_ASCII, cnvk.Z_KATA)
        line = re.sub('[0-90-9]+', '*', line)
        #line = re.sub("\*[\*]+", "*", line)
        g.write(line)
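# One pass over a line does, roughly (the full-width forms are assumed from
# cnvk's Z_ASCII/Z_KATA flags, not verified output):
#
#   line = "2014年2月18日 test123"
#   line = cnvk.convert(line, cnvk.Z_ASCII, cnvk.Z_KATA)  # -> "2014年2月18日 test123"
#   line = re.sub('[0-90-9]+', '*', line)                  # -> "*年*月*日 test*"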
# coding: utf-8
'''
Created on 2014/02/18

@author: nahki
'''
import MeCab
import cnvk
import unicodedata
from separatewords import MecabTokenize

tagger = MeCab.Tagger("-Ochasen")

# Sample input text (Japanese tokenizer test data, kept as-is):
# 明日のジェル検。めっちゃ緊張する…。お腹痛い_(´;ω;‘_)⌒)_。ハンドモデルママンにお願いするけど。ママンの爪大丈夫かしら…?。いまさら不安…
text = cnvk.convert(u'明日ママ を 芦田愛菜 に空目。',
                    cnvk.Z_NUM, cnvk.Z_ALPHA, cnvk.Z_KIGO, cnvk.Z_KATA)
res = MecabTokenize.tokenize(text)
print res
for r in res:
    print "result", r

"""
node = tagger.parseToNode(text.encode('utf-8'))  # (´・ω・`)
while node:
    #print "%s %s" % (node.surface, node.feature)
    print node.surface, node.feature
    node = node.next
"""
def sampling_detail(http, cou, number):
    List = []
    soup2 = BeautifulSoup(http.content, "html.parser")
    # Strip footnote markers, math annotations, section-edit links, galleries,
    # maintenance boxes, geo coordinates, and plain lists before extraction.
    [s.extract() for s in soup2('sup')]
    [s.extract() for s in soup2('annotation')]
    [s.extract() for s in soup2.select('.mw-editsection')]
    [s.extract() for s in soup2.select('.gallerybox')]
    [s.extract() for s in soup2.select('.mbox-text')]
    [s.extract() for s in soup2.select('.geo-multi-punct')]
    [s.extract() for s in soup2.select('.geo-nondefault')]
    [s.extract() for s in soup2.select('.geo-default')]
    [s.extract() for s in soup2.select('.plainlist')]
    block = soup2.select('h2 > span[class~=mw-headline]')
    for i in block:
        over = i.prettify()
        # Stop once the references/footnotes/external-links sections begin.
        if over.find('id="出典"') > -1 or over.find('id="注釈"') > -1 \
                or over.find('id="脚注"') > -1 or over.find('id="註釈"') > -1 \
                or over.find('id="外部リンク"') > -1:
            break
        item1 = i.get_text()
        item1 = cnvk.convert(item1, cnvk.ZAC, cnvk.ZK, *EXTRA_MAPS)
        item1, cou = filt(item1, cou)
        if item1.find('注釈') > -1 or item1.find('脚注') > -1 or item1.find('註釈') > -1:
            break
        List.append('\n==' + item1 + '==\n')
        texts = i.find_all_next(['h2', 'h3', 'h4', 'p', 'li', 'dd', 'dt', 'blockquote'])
        overlap = []
        temp2_prev = texts[0].prettify()
        temp2_tx_prev = ''
        for j in texts:
            temp2 = j.prettify()
            if temp2.find('h2') > -1:
                break
            elif temp2.find('h3') > -1 and temp2.find('mw-headline') > -1:
                heading2 = j.select('.mw-headline')
                item2 = heading2[0].get_text()
                item2 = cnvk.convert(item2, cnvk.ZAC, cnvk.ZK, *EXTRA_MAPS)
                item2, cou = filt(item2, cou)
                if item2 not in overlap:
                    List.append('\n===' + item2 + '===\n')
                    overlap.append(item2)
                temp2_prev = temp2
            elif temp2.find('h4') > -1 and temp2.find('mw-headline') > -1:
                heading3 = j.select('.mw-headline')
                item3 = heading3[0].get_text()
                item3 = cnvk.convert(item3, cnvk.ZAC, cnvk.ZK, *EXTRA_MAPS)
                item3, cou = filt(item3, cou)
                # Nest one level deeper when the previous element was an h3.
                if temp2_prev.find('h3') == -1 and item3 not in overlap:
                    List.append('\n===' + item3 + '===\n')
                    overlap.append(item3)
                if temp2_prev.find('h3') > -1 and item3 not in overlap:
                    List.append('\n====' + item3 + '====\n')
                    overlap.append(item3)
                temp2_prev = temp2
            elif temp2.find('blockquote') > -1:
                text0 = j.get_text()
                text0 = cnvk.convert(text0, cnvk.ZAC, cnvk.ZK, *EXTRA_MAPS)
                text0, cou = filt(text0, cou)
                if text0 not in overlap:
                    List.append('<block>' + text0 + '<block>\n')
                    overlap.append(text0)
                temp2_tx_prev = text0
                temp2_prev = temp2
            elif temp2.find('<dt>') > -1:
                item4 = j.get_text()
                item4 = cnvk.convert(item4, cnvk.ZAC, cnvk.ZK, *EXTRA_MAPS)
                item4, cou = filt(item4, cou)
                if temp2_tx_prev.find(item4) == -1:
                    if temp2_prev.find('h3') == -1 and item4 not in overlap:
                        List.append('\n===' + item4 + '===\n')
                        overlap.append(item4)
                    if temp2_prev.find('h3') > -1 and item4 not in overlap:
                        List.append('\n====' + item4 + '====\n')
                        overlap.append(item4)
                temp2_prev = temp2
            elif temp2.find('<p>') > -1 or temp2.find('<li>') > -1 or temp2.find('<dd>') > -1:
                if temp2.find('mwe-math-element') == -1:
                    text = j.get_text()
                    text = cnvk.convert(text, cnvk.ZAC, cnvk.ZK, *EXTRA_MAPS)
                    text, cou = filt(text, cou)
                    if temp2_tx_prev.find(text) == -1:
                        if text not in overlap:
                            List.append(text)
                            overlap.append(text)
                elif temp2.find('mwe-math-element') > -1:
                    # Math markup is replaced with a placeholder token.
                    text = '<math-element>\n'
                    if temp2_tx_prev.find(text) == -1:
                        List.append(text)
                temp2_prev = temp2
    fo = codecs.open('Random_Contents.txt', 'a', 'utf-8')
    for line in List:
        print(line)
        line2 = line.strip()
        Len = len(line2)
        number += Len
        fo.write(line)
        fo.write('\n')
    fo.close()
    return cou