def extension_moji():
    csv_file = "../ds_haiti_t.csv"
    extension_list = None
    all_moji_list = [chr(i) for i in range(ord('A'), ord('Z') + 1)] \
        + [chr(i) for i in range(ord('あ'), ord('ん') + 1)] \
        + [chr(i) for i in range(ord('ア'), ord('ン') + 1)]
    with open(csv_file, 'r', encoding='cp932') as f:
        reader = csv.reader(f)
        extension_list = [next(reader)]
        for i in tqdm(range(500)):
            row = next(reader)
            tmp = copy(row)
            tmp_text = mojimoji.zen_to_han(tmp[1])
            if mojimoji.zen_to_han(str(tmp[3])) not in tmp_text:
                print(i, tmp_text)
            for ex_i, ex_moji in enumerate(all_moji_list):
                ex_tmp = copy(tmp)
                ex_tmp_text = tmp_text.replace(
                    mojimoji.zen_to_han(str(ex_tmp[3])), str(ex_moji))
                ex_tmp[1] = ex_tmp_text
                ex_tmp[3] = ex_i
                extension_list.append(ex_tmp)
                # print(extension_list[-1])
    with open('../haiti_moji_extension_ds.csv', 'w', encoding="cp932") as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerows(extension_list)
def register():
    ''' Handle DNS registration request (POST). '''
    ipaddress = mojimoji.zen_to_han(request.form['ipaddress'].strip())
    hostname = mojimoji.zen_to_han(request.form['hostname'].strip())
    fullname = '{0}.{1}'.format(hostname, DOMAIN)

    error = None
    if not ipaddress:
        error = "Enter IP address. (IPアドレスを いれてください)"
    elif not is_valid_ipv4_address(ipaddress):
        error = "Invalid IP address format. (IPアドレスの フォーマットが まちがっています)"
    elif not hostname:
        error = "Enter a name. (なまえを いれてください)"
    elif not is_valid_hostname(hostname):
        error = "Invalid hostname. Use only alphabets, numbers, and hyphen. (なまえの フォーマットが まちがっています。アルファベット、すうじ、ハイフンだけが つかえます)"
    elif ipaddress not in IPLIST:
        error = "This IP address is not ours. (このIPアドレスは、わたくしたちの ものでは ありません)"

    if error is None:
        error = add_dns_resource(ipaddress, fullname)

    if error:
        session['error'] = error
        session['ipaddress'] = ipaddress
        session['hostname'] = hostname
        return redirect(url_for('show_error'))
    else:
        session['ipaddress'] = ipaddress
        session['fullname'] = fullname
        return redirect(url_for('show_success'))
def process(text):
    a = None
    b = None
    err_corr = text.split("\t")
    if len(err_corr) == 2:
        err = mojimoji.zen_to_han(err_corr[0].rstrip('\n'), kana=False)
        err = mojimoji.han_to_zen(err, ascii=False, digit=False)
        corr = mojimoji.zen_to_han(err_corr[1].rstrip('\n'), kana=False)
        corr = mojimoji.han_to_zen(corr, ascii=False, digit=False)
        err_lang = utils.lang_check(err, lang)
        corr_lang = utils.lang_check(corr, lang)
        if err_lang and corr_lang:
            errs = list(err)
            corrs = list(corr)
            del_num, ins_num = ld.levenshtein_distance(errs, corrs)
            del_portion = del_num / len(errs)
            ins_portion = ins_num / len(corrs)
            if (del_num < d_num and ins_num < i_num and del_portion < 0.4 and ins_portion < 0.4) \
                    and (corrs[-1] == '。' or corrs[-1] == '?' or corrs[-1] == '!') \
                    and (corrs[-2] not in numlist) and ('__' not in corr) and (len(corr) > 6):
                # cleaning strings like: 1)
                err = re.sub(r"\d+\)\s+", "", err)
                corr = re.sub(r"\d+\)\s+", "", corr)
                err = re.sub(r"\(\s", "", err)
                corr = re.sub(r"\(\s", "", corr)
                err = re.sub(r"\s\)", "", err)
                corr = re.sub(r"\s\)", "", corr)
                # cleaning strings like: 1.)
                err = re.sub(r"\d+\.\)\s*", "", err)
                corr = re.sub(r"\d+\.\)\s*", "", corr)
                # cleaning strings like: 1.
                err = re.sub(r"\d+\.\s*", "", err)
                corr = re.sub(r"\d+\.\s*", "", corr)
                # cleaning strings beginning with ・
                err = re.sub(r"・\s+", "", err)
                corr = re.sub(r"・\s+", "", corr)
                # cleaning strings beginning with *
                err = re.sub(r"\*\s+", "", err)
                corr = re.sub(r"\*\s+", "", corr)
                # cleaning strings beginning with **
                err = re.sub(r"\*\*\s+", "", err)
                corr = re.sub(r"\*\*\s+", "", corr)
                # cleaning strings beginning with -
                err = re.sub(r"-\s+", "", err)
                corr = re.sub(r"-\s+", "", corr)
                # cleaning the conversation tag A:
                err = re.sub(r"A:\s*", "", err)
                corr = re.sub(r"A:\s*", "", corr)
                # cleaning the conversation tag B:
                err = re.sub(r"B:\s*", "", err)
                corr = re.sub(r"B:\s*", "", corr)
                a = err
                b = corr
    return a, b
def fetch_from_ranking(year):
    assert len(year) == 6
    r = requests.get(
        f"https://www.karatetsu.com/ranking/index.php?top_ym={year}")
    soup = BeautifulSoup(r.content, "html5lib")
    return [(match_paren.sub(
                "", zen_to_han(s.select("td:nth-of-type(3)")[0].text, kana=False)),
             zen_to_han(s.select("td:nth-of-type(4)")[0].text, kana=False))
            for s in soup.select("#ranking tr")[2:]]
def load_file(path):
    with open_file(path) as f:
        for line in f:
            line = mojimoji.zen_to_han(line, kana=False)
            line = digit_pattern.sub('#', line)
            words = line.rstrip().split(' ')
            yield words
def title_torkenize(sentence):
    sentence = mojimoji.zen_to_han(sentence)
    sentence = re.sub(
        "[\._-―─!@#$%^&\-‐|\\*\“()_■×+α※÷⇒♬◉ᴗ͈ˬ—●▲★☆⭐️⭕⚡⚠①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮♡⭐︎〇◎◆♦▼◼◇△□(:〜~+=)/*&^%$#@!~`)♪ᴖ◡ᴖー{}[]↑↓←→➡⇩™・⊡…\[\]\"\'\”\’:;<>?<>〔〕\r\−〈〉?、、。。・,\./『』【】「」「」→←○《》≪≫\n\u3000]",
        " ", sentence)
    sentence = re.sub("[あ-ん]", " ", sentence)
    sentence = re.sub("( | )+", " ", sentence)
    sentence = sentence.lower()
    # remove "〇〇専用" / "〇〇様" (listings reserved for a specific buyer)
    sentence = re.sub("[^ ]*専用", "", sentence)
    sentence = re.sub("[^ ]*様", "", sentence)
    # remove single-letter alphabet tokens
    sentence = re.sub(" [a-z]{1}[^(a-z)]", " ", sentence)
    # remove emoji
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        "]+",
        flags=re.UNICODE)
    sentence = emoji_pattern.sub(r'', sentence)
    sentence = sentence.strip()
    return sentence
def convert(fo, heisei):
    if heisei == 31:
        ext = "xlsx"
    else:
        ext = "xls"
    sheets = pd.read_excel(f"xls/{heisei}.{ext}", sheet_name=None)
    for sheet_name, df in sheets.items():
        m = re.search(r"\d+", sheet_name)
        if m is None:
            continue
        # pylint: disable=c-extension-no-member
        month = mojimoji.zen_to_han(m.group())
        if int(month) >= 4:
            year = heisei + 1988
        else:
            year = heisei + 1988 + 1
        for row in df.itertuples():
            if row[2] != "日":
                continue
            day = row[1]
            ymd = f"{year}-{month}-{day}"
            try:
                t = dt.strptime(ymd, "%Y-%m-%d")
            except ValueError as e:
                print(e)
                continue
            fo.write(t.strftime("%Y-%m-%d,"))
            fo.write(','.join(map(str, row[3:8])))
            fo.write("\n")
def get_x_y_text_from_xml(page):
    """
    Extract x, y, and text from the XML and return the result as a list.
    :param page: XML extracted with ElementTree
    :return: 1-D list of dicts holding x, y, and text
    """
    x_y_text_list = []
    for textbox in page:
        for textline in textbox:
            for text in textline:
                if text.text != '\n' and 'bbox' in text.attrib:
                    bbox = text.attrib['bbox'].split(',')
                    x_y_text_list.append({
                        'x': float(bbox[0]),
                        'y': float(bbox[1]),
                        'text': mojimoji.zen_to_han(text.text, kana=False)
                    })
    if len(x_y_text_list) == 0:
        os.remove(tmp_file_path)
        frame = inspect.currentframe()
        abort(500, {
            'code': frame.f_lineno,
            'msg': '情報抽出中にエラーが発生しました',
            'param': None
        })
    return x_y_text_list
def make_corpus(docs, debug=False):
    """
    Build a corpus from multiple documents.
    @docs list of documents
    @return list of tokenized documents
    """
    docs = list(
        map(
            lambda d: list(
                filter(lambda x: x.strip() != "", re.split("\n|。", d.lower()))
            ),
            docs))
    docs = [
        list(map(lambda x: mojimoji.zen_to_han(x), lines)) for lines in docs
    ]
    analyzer = Analyzer([
        UnicodeNormalizeCharFilter(),
        RegexReplaceCharFilter(r'[(\)、。「」]', ' ')
    ], JanomeTokenizer(), [
        POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
        ExtractAttributeFilter('base_form')
    ])
    corpus = [
        list(
            itertools.chain.from_iterable(
                [list(analyzer.analyze(l)) for l in lines]))
        for lines in docs
    ]
    if debug:
        # corpus is a list of token lists, so join each document before printing
        print("\n".join(" ".join(tokens) for tokens in corpus))
    return corpus
def preprocess(doc, debug=False):
    """
    Take a document, preprocess it, and split it into a list of tokenized sentences.
    @param doc target document
    @return list of sentences contained in the preprocessed document
    """
    doc = doc.lower()
    lines = re.split("\n|。", doc)
    lines = list(filter(lambda x: x != "", map(lambda x: x.strip(), lines)))
    sentences = copy.deepcopy(lines)
    lines = list(map(lambda x: mojimoji.zen_to_han(x), lines))
    analyzer = Analyzer([
        UnicodeNormalizeCharFilter(),
        RegexReplaceCharFilter(r'[(\)、。「」]', ' ')
    ], JanomeTokenizer(), [
        POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
        ExtractAttributeFilter('base_form')
    ])
    corpus = [' '.join(analyzer.analyze(l)) + '。' for l in lines]
    if debug:
        print("\n".join(corpus))
    return sentences, corpus
async def on_message(message):
    if message.channel.id != channel_id:
        return
    # ignore messages from bots
    if message.author.bot:
        return
    # redundant
    if re.search(r'(ママ|まま)(おねがい|お願い)', message.content):
        if message.author.name == 'GESU':
            await message.channel.send("はいはい、ファラリスの雄牛に入りたいのね")
        else:
            await message.channel.send("ママじゃないよ")
    # handle reservations requested in a message
    if re.search(r'(まー|マー|麻)(じゃん|ジャン|雀).*(予約|よやく)', message.content):
        await reserv(message)
    # handle "who is next" requests in a message
    if re.search(r'(次は誰|次誰)', message.content):
        await next_reserv(message)
    # handle thanks to the previous host
    if re.search(r'.*ありがと.*', message.content):
        if len(turn_list) == 0:
            return
        if turn_list[0].author.name == message.author.name:
            if len(turn_list) == 1:
                await message.channel.send("お疲れ様:heart:")
                turn_list.clear()
                return
            await next_reserv(message)
    # handle rescue IDs
    if re.search(r'^[a-zA-Z0-9\s]+$', mojimoji.zen_to_han(message.content), re.IGNORECASE):
        if len(turn_list) < 2:
            return
        if turn_list[1].author.name == message.author.name:
            await message.channel.send("ふぇぇ……誰にも呼ばれない")
            await next_reserv(message)
    # handle mentions
    if bot.user in message.mentions:
        # accept a reservation
        if re.search(r'.*予約.*', message.content):
            await reserv(message)
        # process the reservation queue
        elif re.search(r'.*次.*', message.content):
            await next_reserv(message)
        elif message.content == 'help':
            await help(message)
        else:
            await message.channel.send("私の扱い方だわに")
            await help(message)
    await bot.process_commands(message)
def extractEntryAndIndex():
    print('Start extracting entry and index')
    # read the temporary file
    f = codecs.open(f_temp_path, 'r', 'utf-8')
    entryIdForIndex = 0
    pbar = tqdm(range(15958))
    for line in f:
        start = line.find("<dt id=")
        # Extract Entry
        if start > -1:
            end = line.find("<a")
            entryId = line[start + 8:20]
            title = line[22:end - 1]
            storeEntryToDB(entryId, title)
            entryIdForIndex += 1
            pbar.update(1)
        # Extract index
        elif line.find("<key") > -1:
            # Ignore the Kana type and store index into database
            if line.find('type="かな"') < 0:
                value_end = line.find("</key>")
                title_end = line.find("type=")
                value = line[title_end + 10:value_end]
                value = mojimoji.zen_to_han(value, kana=False).lower()
                title = line[12:title_end - 2]
                storeIndexToDB(entryIdForIndex, value, title)
        elif line.find("") > 0:
            break
    f.close()
def preprocessing(text: str) -> str:
    result: str = text
    # full-width -> half-width
    result = mojimoji.zen_to_han(result, kana=False)
    # number -> kanji
    result = re.sub(
        r'\d+',
        lambda m: kanjize.int2kanji(int(m.group(0))),
        result,
    )
    # remove 'w' laughter sequences (笑)
    result = re.sub(
        r'[a-z]+',
        lambda m: '' if re.match(r'^w+$', m.group(0), re.IGNORECASE) else m.group(0),
        result,
        flags=re.IGNORECASE)
    # remove symbolic characters
    result = re.sub(r'[\x21-\x2F\x3A-\x40\x5B-\x60\x7B-\x7E]', '', result)
    result = re.sub(r'[!-/:-@[-`{、。”’・ ]', '', result)
    # remove emoji
    result = ''.join(ch for ch in result
                     if ch not in emoji.UNICODE_EMOJI['en'])
    return result
def convert_children_count(children_str):
    # if the value is a string, convert it
    if type(children_str) is str:
        return mojimoji.zen_to_han(str(children_str).strip("人"))
    else:
        # otherwise return an empty string
        return ''
def handle_message(event):
    if event.reply_token == "00000000000000000000000000000000":
        return
    message = mojimoji.zen_to_han(event.message.text.lower(), kana=False)
    # extract the time scope
    if re.match(".*(日|デイリー|daily).*", message):
        scope = "daily"
    elif re.match(".*(週|ウィークリー|weekly).*", message):
        scope = "weekly"
    elif re.match(".*(月|マンスリー|monthly).*", message):
        scope = "monthly"
    else:
        scope = None
    # if a scope was found, get the number of articles
    if scope is None:
        return "No reply"
    else:
        max_amount = re.search("[0-9]+[(つ|こ|個|本|記事)]", message)
        if max_amount is not None:
            max_amount = int(re.search("[0-9]+", max_amount.group()).group())
        else:
            max_amount = 5
        info = get_trend_info(scope, max_amount)
        info["scope"] = scope
        reply = make_message(info)
        line_bot_api.reply_message(event.reply_token, TextSendMessage(text=reply))
def format_zen_han(l):
    import mojimoji
    l = l.decode('utf-8') if type(l) == str else l
    l = mojimoji.zen_to_han(l, kana=False)  # full-width digits/letters -> half-width
    l = mojimoji.han_to_zen(l, digit=False, ascii=False)  # half-width kana -> full-width
    l = l.encode('utf-8')
    return l
def text_to_char_index(full_vocab, real_vocab_number, chara_bukken_revised,
                       sentence_text, addition_translate, comp_width=COMP_WIDTH,
                       preprocessed_char_number=0, skip_unknown=False, shuffle=None):
    # mode:
    #   average: repeat the original index comp_width times for the embedding layer
    #   padding: pad the original index to comp_width with zeros for the embedding layer
    # char_emb_dim: char embedding size
    # comp_width: number of components used
    if preprocessed_char_number == 0:
        preprocessed_char_number = len(full_vocab)
    # convert digits and Latin letters to half-width (hankaku)
    text = mojimoji.zen_to_han(sentence_text, kana=False)
    # convert kana to full-width (zenkaku)
    text = mojimoji.han_to_zen(text, digit=False, ascii=False)
    # convert katakana to hiragana
    _, katakana2hiragana, _ = _make_kana_convertor()
    text = katakana2hiragana(text)
    text = text.translate(addition_translate)
    # finally, lowercase
    text = text.lower()
    # expand every character into comp_width components
    ch2id = {}
    for i, w in enumerate(full_vocab):
        ch2id[w] = i
    int_text = []
    # print(text)
    for c in text:
        # print(c)
        try:
            i = ch2id[c]
        except KeyError:
            print("Unknown Character: ", c)
            if skip_unknown:
                continue  # skip unknown words
            else:
                i = 1  # assign to unknown words
        # print(i)
        if real_vocab_number < i < preprocessed_char_number:
            comps = chara_bukken_revised[i]
            if shuffle == "flip":
                comps = comps[::-1]
            # print(comps)
            if len(comps) >= comp_width:
                int_text += comps[:comp_width]
            else:
                int_text += comps + [0] * (comp_width - len(comps))
        else:
            if shuffle == "random":
                if i < real_vocab_number:
                    i = (i + 20) % real_vocab_number
            int_text += [i] + [0] * (comp_width - 1)
    return int_text
def __word_normiraze(self, word):
    """Normalize a word."""
    word = word.lower()
    word = mojimoji.han_to_zen(word, ascii=False, digit=False)
    word = mojimoji.zen_to_han(word, kana=False)
    return word
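# Not part of the original source: a minimal usage sketch of the same two-step mojimoji
# normalization used above (half-width kana -> full-width, then full-width letters/digits
# -> half-width). The sample string and the expected output are illustrative assumptions.
import mojimoji

word = "Ｐｙｔｈｏｎ３とﾊﾟｲｿﾝ"
word = word.lower()                                         # full-width letters are lowercased too
word = mojimoji.han_to_zen(word, ascii=False, digit=False)  # half-width kana -> full-width kana
word = mojimoji.zen_to_han(word, kana=False)                # full-width letters/digits -> half-width
print(word)  # expected: python3とパイソン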
def extension_day():
    day_1 = ['1日目', '一日目', '木曜日', '1', '一', '木']
    day_2 = ['2日目', '二日目', '金曜日', '2', '二', '金']
    csv_file = "../ds_haiti.csv"
    extension_list = None
    with open(csv_file, 'r', encoding='cp932') as f:
        reader = csv.reader(f)
        extension_list = [next(reader)]
        # reader_list = [r for r in reader]
        for row in tqdm(reader):
            tmp = copy(row)
            tmp[2] = int(tmp[2]) - 1
            tmp_text = mojimoji.zen_to_han(tmp[1])
            if tmp[2] == 0:
                for d in day_1:
                    tmp_day = copy(tmp)
                    tmp_text_ex = tmp_text.replace('木曜日', str(d))
                    tmp_day[1] = tmp_text_ex
                    extension_list.append(copy(tmp_day))
            else:
                for d in day_2:
                    tmp_day = copy(tmp)
                    tmp_text_ex = tmp_text.replace('金曜日', str(d))
                    tmp_day[1] = tmp_text_ex
                    extension_list.append(copy(tmp_day))
    with open('../haiti_day_extension_ds.csv', 'w', encoding="cp932") as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerows(extension_list)
def tokenizer(text):
    text = mojimoji.zen_to_han(text.replace("\n", ""), kana=False)
    parsed = tagger.parse(text).split("\n")
    parsed = [t.split("\t") for t in parsed]
    parsed = list(filter(lambda x: x[0] != "" and x[0] != "EOS", parsed))
    parsed = [p[2] for p in parsed]
    return parsed
def parse_address(text, debug=False):
    ''' Get location '''
    text = mojimoji.zen_to_han(text, kana=False)
    text = re.sub(r'[\((].*[)\)]', '', text)
    text = re.sub(r'[\s、]', '', text)
    _ward = re.search(r'[a-zA-Z一-龥ぁ-んァ-ヶ・ー]+区', text)
    if _ward:
        ward = _ward.group()
        text = re.sub(ward, '', text)  # remove
    else:
        raise ValueError(text)
    text = text.replace('丁目', '-')
    text = text.replace('I', '1')
    text = re.sub(r'(以下|詳細)*未定', '', text)
    text = re.sub(r'[ー‐―−]', '-', text)
    text = re.sub(r'(\d)番[地]*', r'\1-', text)
    text = re.sub(r'[-]{2,}', r'-', text)
    text = re.sub(r'([a-zA-Z一-龥ぁ-んァ-ヶ・ー])(\d)', r'\1-\2', text)
    text = re.sub(r'(\d)号', r'\1', text)
    lines = text.split('-')
    lines += ['' for i in range(4 - len(lines))]  # adjust length
    return (ward, *lines)
def _normalize(s):
    """
    Apply the mecab-ipadic-neologd normalization rules (partially modified).

    ref: https://github.com/neologd/mecab-ipadic-neologd/wiki/Regexp.ja

    Parameters
    ----------
    s : str
        raw text

    Returns
    -------
    str
        normalized text
    """
    s = s.strip()
    s = regex.sub('[˗֊‐‑‒–⁃⁻₋−]+', '-', s)  # normalize hyphens
    s = regex.sub('[﹣－ｰ—―─━ー]+', 'ー', s)  # normalize choonpus
    s = regex.sub('[~∼∾〜〰～]', '〜', s)  # normalize tildes
    s = _remove_extra_spaces(s)
    s = regex.sub('[’]', '\'', s)
    s = regex.sub('[”]', '"', s)
    s = mojimoji.han_to_zen(s, digit=False, ascii=False)
    s = mojimoji.zen_to_han(s, kana=False)
    s = s.lower()
    return s
def test_mojimoji():
    logging.info("=========================================")
    logging.info("= mojimoji =")
    logging.info("=========================================")
    test_cases = get_test_cases()
    for tc in test_cases:
        title = tc['title']
        body = tc['body']

        logging.info("ひらがな(全角) to カタカナ(全角) for %s" % title)
        logging.info("Not implemented")

        logging.info("カタカナ(全角) to ひらがな(全角) for %s" % title)
        logging.info("Not implemented")

        logging.info("ひらがな(全角) to カタカナ(半角) for %s" % title)
        logging.info("Not implemented")

        logging.info("半角 to 全角 for %s" % title)
        calc_time(mojimoji.han_to_zen, body)
        logging.debug("result: %s" % mojimoji.han_to_zen(body))

        logging.info("全角 to 半角 for %s" % title)
        calc_time(mojimoji.zen_to_han, body)
        logging.debug("result: %s" % mojimoji.zen_to_han(body))
async def process_commands(self, message):
    if message.author.bot:
        return

    # Backup original content
    # (a new MessageAccumulation instance is created for every message)
    self.__message_stocker = MessageAccumulation(self, message)

    # Rewriting
    if message.content.startswith(self.command_prefix):
        # Only rewrite messages that start with command_prefix.
        # Keep the string manipulation within limits that do not interfere with command processing.

        # full-width/half-width conversion of command-like characters
        zen_to_han_command_line = self.regex_command.sub(
            lambda match: mojimoji.zen_to_han(match.group(0)).lower(),
            message.content)

        # Mark line breaks, temporarily join everything into one line, then split on commands.
        linking_command = ("🃴".join(zen_to_han_command_line.splitlines())).replace('　', ' ')
        split_linking_command = self.regex_command.split(linking_command)
        removal_blank_line = [item for item in split_linking_command if item != ""]
        #print(f"removal_blank_line: {removal_blank_line}")

        # Reassemble each command with the command symbol at its head.
        command_line_list = []
        split_line = []
        for item in removal_blank_line:
            if self.regex_command.match(item):
                if len(split_line) >= 1:
                    command_line_list.append("".join(split_line))
                    split_line = []
            split_line.append(item)
        if len(split_line) >= 1:
            command_line_list.append("".join(split_line))
        #print(f"command_line_list: {command_line_list}")

        # Rewrite message.content and process each command as if it were a normal message.
        for command_line in command_line_list:
            cr_lines = [item for item in command_line.replace('🃴', '\n').splitlines() if item != ""]
            for line in cr_lines:
                # new message.content
                message.content = line
                print("----Modified message:\n" + message.content)

                # NOTE: Do not delete the invoking user's message here (e.g. via message.delete);
                # the same message could otherwise end up being deleted twice.
                ctx = await self.get_context(message)
                self.__message_stocker.set_last_context(ctx)
                await self.invoke(ctx)

        # Send the stocked messages all at once.
        await self.__message_stocker.release_send()
    else:
        # Otherwise behave exactly as usual.
        ctx = await self.get_context(message)
        self.__message_stocker.set_last_context(ctx)
        await self.invoke(ctx)
def oracle_create():
    logger.debug("start oracle_create")
    form = OracleCreateForm()
    if request.method == 'GET':
        logger.debug("start oracle_create GET")
        return render_template(ORACLE_PATH + '/create.html', form=form)

    logger.debug("start oracle_create POST")
    form = OracleCreateForm(request.form)
    if request.method == 'POST' and form.validate():
        session['username'] = request.form['username']
        session['email'] = request.form['email']
        # logger.debug("UT_ENTRYTEMP insert")
        entry = Entry.dict()
        entry['UNAME'] = session['username']
        entry['UKANA'] = mojimoji.zen_to_han(session['username'])
        entry['UEMAIL'] = session['email']
        if Entry.insert(entry):
            flash("登録できました。", "success")
        else:
            flash("登録できませんでした。", "danger")
        return redirect("/sample/oracle/")
    return render_template(ORACLE_PATH + 'create.html', form=form)
def executeConvert(self):
    for path in glob.glob('./pdf/*'):
        input_path = path
        output_file = os.path.splitext(os.path.basename(input_path))[0]
        output_path = os.path.join('./text', output_file + '.txt')

        rsrcmgr = PDFResourceManager()
        codec = 'utf-8'
        params = LAParams()
        text = ""
        with StringIO() as output:
            device = TextConverter(rsrcmgr, output, codec=codec, laparams=params)
            with open(input_path, 'rb') as input:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(input):
                    interpreter.process_page(page)
                text += output.getvalue()
            device.close()
            output.close()

        # trim the half-width spaces introduced by the conversion
        text = re.sub(r' |　', '', text.strip())
        text = mojimoji.zen_to_han(text)

        # output text
        with open(output_path, "wb") as f:
            f.write(text.encode('utf-8', "ignore"))
def result():
    if request.method == 'POST':
        query = request.form["target_text"]
        ud_rate = request.form["ud_rate"]
        ud_rate_percent = float(ud_rate) / 100
        query = mojimoji.zen_to_han(query, kana=False)

        txt_seg = TextSegmentation()
        r_dict = txt_seg.segment_text(query, 99)  # query, limit
        r_dict_popped = txt_seg.pop_search_words(ud_rate_percent, r_dict)
        #### When called via the router, the dict elements somehow stop being lists in the last loop
        r_dict_joined = txt_seg.join_dict_elements(r_dict_popped, 3)  # minimum elements
        search_word_dict = txt_seg.reindex_r_dict(r_dict_joined)

        twi = TwiSearch(session)
        search_result = twi.make_search_result(search_word_dict)

        jf = JsonFormatter()
        init_tweet_list_json = jf.init_tweet_list_json(search_word_dict, search_result)
        search_word_json = jf.search_dict_to_json(search_word_dict)
        tweet_list_json = jf.input_tweet_list_json(search_word_dict, search_result, init_tweet_list_json)
        tweet_list_json = jf.del_empty_json(tweet_list_json, search_word_dict)

        # Save function
        # from model import Model
        # model = Model()
        # model.save_result_tweet('json_data/result_tweet_json8.json', tweet_list_json)
        # tweet_list_json = model.load_search_result('json_data/result_tweet_json8.json')

        return render_template("result.html",
                               ud_rate=ud_rate,
                               tweet_list_json=tweet_list_json,
                               search_word_json=search_word_json)
def mysql_create():
    logger.debug("start mysql_create")
    form = MySqlCreateForm()
    if request.method == 'GET':
        logger.debug("start mysql_create GET")
        return render_template(MYSQL_PATH + 'create.html', form=form)

    logger.debug("start mysql_create POST")
    form = MySqlCreateForm(request.form)
    if request.method == 'POST' and form.validate():
        session['username'] = request.form['username']
        session['email'] = request.form['email']
        # logger.debug("UT_ENTRYTEMP insert")
        user = User.dict()
        user['UNAME'] = session['username']
        user['UKANA'] = mojimoji.zen_to_han(session['username'])
        user['UEMAIL'] = session['email']
        if User.insert(user):
            flash("登録できました。", "success")
        else:
            flash("登録できませんでした。", "danger")
        return redirect(MYSQL_PATH)
    return render_template(MYSQL_PATH + 'create.html', form=form)
def tokenize(self, manuscript: str) -> list:
    token_list = []
    append = token_list.append
    try:
        tokens = self._m.parse(manuscript).split('\n')
    except IndexError:
        print(manuscript)
        return None
    for tok in tokens:
        # surface form\tPOS,POS subtype 1,POS subtype 2,POS subtype 3,conjugation form,conjugation type,base form,reading,pronunciation
        tok = re.split(r'[\,\t]', tok)
        if len(tok) < 10:
            continue
        ps = tok[1]
        if ps not in ['名詞', '動詞', '形容詞']:
            continue
        # use the base form if it exists
        w = tok[7]
        if w == '*' or w == '':
            # otherwise use the surface form (the word exactly as it appears in the manuscript)
            w = tok[0]
        if w == '' or w == '\n':
            continue
        # convert full-width alphanumerics to half-width
        w = mojimoji.zen_to_han(w, kana=False, digit=False)
        # convert half-width katakana to full-width
        w = mojimoji.han_to_zen(w, digit=False, ascii=False)
        # lowercase all English letters
        w = w.lower()
        append(w)
    return token_list
def get_shoplist_pref(filedir: str = ""):
    """
    filedir : path to the text file holding shop information for each prefecture

    Build and return a list of shop names for a single prefecture.
    """
    shoplist = []
    contents = pd.read_csv(filedir)
    ltd = re.compile(r"([(株式)(有限)(合資)]+会社){1}")
    bracket = re.compile(r"\(.+\)")
    for shopname in contents["name"]:
        # convert characters other than katakana to half-width
        shopname = moji.zen_to_han(shopname, kana=False)
        # remove strings enclosed in parentheses
        shopname = bracket.sub("", shopname)
        # remove corporate suffixes like 〇〇会社
        shopname = ltd.sub("", shopname)
        # If the name contains "/", store both the part before the separator
        # and the name with the separator removed.
        if shopname.find("/") > -1:
            shoplist.append(shopname[:shopname.find("/")])
            shopname = shopname.replace("/", "")
        shoplist.append(shopname)
    return shoplist
def normalize_numbers(sentence: str, replacement: str = "0", pattern: str = None) -> str:
    r"""Replace numbers matching the regex pattern with replacement.

    Parameters
    ----------
    sentence : str
        text to process
    replacement : str, optional
        replacement string, by default "0"
    pattern : str, optional
        regex pattern to apply; if None, the default pattern r"(\d+[,，]*)+" is used,
        by default None

    Returns
    -------
    str
        text after number normalization
    """
    if not pattern:
        pattern = normalize_numbers_default_pattern
    hankaku_num_sentence = mojimoji.zen_to_han(sentence, kana=False, digit=True, ascii=False)
    return re.sub(pattern, replacement, hankaku_num_sentence)
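# Not part of the original source: a small usage sketch for normalize_numbers.
# normalize_numbers_default_pattern is not shown above, so the value below is a
# hypothetical stand-in based on the docstring; the expected output assumes mojimoji's
# default handling of full-width digits.
import re
import mojimoji

normalize_numbers_default_pattern = r"(\d+[,，]*)+"  # hypothetical default

print(normalize_numbers("このりんごは１２０円、あのりんごは1,200円です"))
# expected: このりんごは0円、あのりんごは0円です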
def normalize_text(text):
    try:
        text = zen_to_han(text, kana=False)
    except TypeError:
        pass  # non-unicode object
    text = re.sub(u'\r?\n', '', text, flags=re.M)
    text = re.sub(u'-', '-', text, flags=re.M)
    return text
def normalize_text(text):
    text = str(text)
    text = text.lower()  # all lower
    text = mojimoji.zen_to_han(text)  # all hankaku
    text = norm_numeric(text)
    text = norm_emoji(text)
    text = norm_url(text)
    text = norm_continuous_char(text)
    return text
def test_zen_to_han():
    eq_(u'ｱｲｳｴｵ', mojimoji.zen_to_han(u'アイウエオ'))
    eq_(u'ｶﾞｷﾞｸﾞｹﾞｺﾞ', mojimoji.zen_to_han(u'ガギグゲゴ'))
    eq_(u'ﾊﾟﾋﾟﾌﾟﾍﾟﾎﾟ', mojimoji.zen_to_han(u'パピプペポ'))
    eq_(u'0123', mojimoji.zen_to_han(u'０１２３'))
    eq_(u'abcABC', mojimoji.zen_to_han(u'ａｂｃＡＢＣ'))
    eq_(u'#?!', mojimoji.zen_to_han(u'＃？！'))
    eq_(u'あいうえお', mojimoji.zen_to_han(u'あいうえお'))
def wakachi(self):
    u"""Perform word segmentation (wakachigaki).

    Returns:
        dict holding the result
    """
    md = config.m_mecab_dic
    tagger = MeCab.Tagger(md.option)
    tagger.parse('')
    emoji = re.compile(u'^U00')
    kigou = re.compile(u'^[!-~]$')
    # normalize full-width / half-width characters
    self.text = mojimoji.zen_to_han(self.text, kana=False, digit=True, ascii=True)
    self.text = mojimoji.han_to_zen(self.text, kana=True, digit=False, ascii=False)
    node = tagger.parseToNode(self.text.encode('utf-8'))
    words = []
    while node:
        pos = node.feature.split(",")[md.pos]
        if pos == "形容詞" or pos == "形容動詞" or pos == "動詞" or pos == "名詞":
            if len(node.feature.split(",")) <= md.base:
                base = node.surface
            else:
                base = node.feature.split(",")[md.base]
            if base == "*":
                base = node.surface
            # exclude emoji and single hiragana/katakana/symbol characters
            if (emoji.match(unicode(base)) is not None) or (kigou.match(unicode(base)) is not None):
                pass
            # exclude stopwords
            elif unicode(base) in get_stopwords():
                pass
            else:
                # store uppercase letters lowercased
                words.append(base.lower())
        node = node.next
    wakachi = map(str, words)
    wakachi = " ".join(wakachi)
    if "\n" in wakachi:
        wakachi = wakachi.split("\n", 1)[0].strip()
    self.wakachigaki = wakachi
    return {'_id': self.id, 'screen_name': self.screen_name, 'text': self.text, 'wakachi': wakachi}
def get_bow(content):
    """
    We assume that the argument is written in Japanese.
    """
    # Convert full-width to half-width.
    content = mojimoji.zen_to_han(content.decode('utf-8')).encode('utf-8').lower()
    # Morphological analysis
    bow = mecab.parse(content.lower())
    rst = []
    for w in bow['nouns'] + bow['verbs']:  # Extract nouns and verbs.
        if not sw.is_stop_word(w):
            rst.append(w)
    return rst
def normalize(self, str):
    """
    Normalize a Japanese string.

    Full-width alphanumerics and symbols are converted to half-width (katakana is excluded).

    >>> normalizer = BasicNormalizer()
    >>> normalizer.normalize('日本語あいうえおアイウエオ')
    '日本語あいうえおアイウエオ'
    >>> normalizer.normalize('0123456789')
    '0123456789'
    >>> normalizer.normalize('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    >>> normalizer.normalize('abcdefghijklmnopqrstuvwxyz')
    'abcdefghijklmnopqrstuvwxyz'
    >>> normalizer.normalize('a !”#$%&’()*+,−./:;<=>?@[¥]^_‘{|}〜')
    'a !"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'

    Line-break and tab characters are replaced with spaces.

    >>> normalizer.normalize('a\\tb\\rc\\nd')
    'a b c d'

    Consecutive spaces are collapsed.

    >>> normalizer.normalize('a b c d')
    'a b c d'

    Leading and trailing whitespace is removed.

    >>> normalizer.normalize('\\t\\r\\n a\\t\\r\\n ')
    'a'
    """
    str = mojimoji.zen_to_han(str, kana=False)
    str = re.sub('[\t\r\n]', ' ', str)
    str = re.sub(' {2,}', ' ', str)
    str = str.strip()
    return str
def make_stopwords():
    u"""Build and print a stopword set ready for copy and paste."""
    import mojimoji
    import cnvk
    stopwords = set()
    hira = u"あいうえおかきくけこさしすせそたちつてとなにぬねのはひふへほまみむめもらりるれろやゐゆゑよわをんがぎぐげござじずぜぞだぢづでどばびぶべぼぱぴぷぺぽぁぃぅぇぉゃゅょっゔ"
    kata = []
    for h in hira:
        kata.append(cnvk.convert(h, cnvk.HIRA2KATA, cnvk.Z_KATA))
    kata.append(u"ヴ")
    hankata = []
    for k in kata:
        hankata.append(mojimoji.zen_to_han(k))
    kazu = u"0123456789"
    stopwords.add(u"10")
    stopwords.add(u"11")
    stopwords.add(u"12")
    stopwords.add(u"13")
    stopwords.add(u"14")
    stopwords.add(u"15")
    stopwords.add(u"16")
    stopwords.add(u"17")
    stopwords.add(u"18")
    stopwords.add(u"19")
    stopwords.add(u"20")
    stopwords.add(u"１０")
    stopwords.add(u"１１")
    stopwords.add(u"１２")
    stopwords.add(u"１３")
    stopwords.add(u"１４")
    stopwords.add(u"１５")
    stopwords.add(u"１６")
    stopwords.add(u"１７")
    stopwords.add(u"１８")
    stopwords.add(u"１９")
    stopwords.add(u"２０")
    zenkazu = mojimoji.han_to_zen(kazu)
    kazukan = u"一二三四五六七八九十百千万億兆"
    minialpha = u"abcdefghijklmnopqrstuvwxyz"
    bigalpha = u"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    han_minialpha = mojimoji.han_to_zen(minialpha)
    han_bigalpha = mojimoji.han_to_zen(bigalpha)
    hiramoji = [u"する", u"なる", u"てる", u"れる", u"やる", u"いる", u"さん", u"なん", u"くん", u"それ", u"こと",
                u"ちゃん", u"ある", u"これ", u"して", u"くれる", u"くださる", u"そう", u"せる", u"した", u"いか",
                u"ので", u"よう", u"てるん", u"もん", u"られる", u"あそこ", u"あたり", u"あちら", u"あっち", u"あと",
                u"あな", u"あなた", u"あれ", u"いくつ", u"いつ", u"いま", u"いろいろ", u"うち", u"おおまか", u"おまえ", u"おれ",
                u"がい", u"かく", u"かたちの", u"かやの", u"から", u"がら", u"きた", u"こせ", u"ここ", u"こっち", u"こと", u"ごと",
                u"こちら", u"これ", u"これら", u"ごろ", u"さまざま", u"さらい", u"しかた", u"しよう", u"すか", u"ずつ", u"すね",
                u"そう", u"そこ", u"そちら", u"そっち", u"そで", u"それ", u"それぞれ", u"それなり", u"たくさん", u"たち", u"たび",
                u"ため", u"ちゃ", u"てん", u"とおり", u"とき", u"どこ", u"どこか", u"ところ", u"どちら", u"どれ", u"なか", u"なかば",
                u"なに", u"など", u"なん", u"はじめ", u"はず", u"はるか", u"ひと", u"ひとつ", u"ふく", u"ぶり", u"べつ", u"へん", u"べん",
                u"ほう", u"ほか", u"まさ", u"まし", u"まとも", u"まま", u"みたい", u"みつ", u"みなさん", u"みんな", u"もと", u"もの",
                u"もん", u"やつ", u"よう", u"よそ", u"わけ", u"わたし", u"くる", u"すぎる", u"れる", u"いう", u"くださる", u"ちゃう",
                u"つく", u"せる", u"てるん", u"すぎ", u"ところ", u"おれ", u"ぼく", u"わたし", u"てる", u"しまう", u"みる",
                ]
    katamoji = []
    for h in hiramoji:
        katamoji.append(cnvk.convert(h, cnvk.HIRA2KATA, cnvk.Z_KATA))
    han_katamoji = []
    for k in katamoji:
        han_katamoji.append(mojimoji.zen_to_han(k))
    kanmoji = ["笑", "今", "気", "今日", "明日", "方", "人", "俺", "私", "僕", "時", "思う", "行く", "言う", "見る", "出す", "年", "月", "日", "分", "秒", "週", "火", "水", "木", "金", "土", "国", "都",
               "道", "府", "県", "市", "区", "町", "村", "各", "第", "何", "的", "度", "達", "誰", "者", "類", "用", "別", "等", "際", "系", "品", "化", "所", "毎", "回", "匹", "個", "席", "束", "歳", "円", "毎",
               "前", "後", "左", "右", "次", "先", "春", "夏", "秋", "冬", "下記", "上記", "時間", "今回", "前回", "場合", "自分", "ヶ所", "ヵ所", "カ所", "箇所", "ヶ月", "カ月", "箇月", "名前", "本当", "確か", "時点",
               "様々", "結局", "半ば", "以前", "以後", "以降", "未満", "以上", "以下", "毎日", "自体", "何人", "手段", "感じ", "同じ", "点", "君"]
    h_kigou = cnvk.H_KIGO
    kigou = []
    for h in h_kigou:
        for x in h:
            kigou.append(x)
    kigou.append(u"ω")
    kigou.append(u'ー')
    kigou.append(u"д")
    # Reference: Japanese stopwords for keyword extraction suited to inferring document content
    # (https://www.jstage.jst.go.jp/article/jjske/12/4/12_511/_pdf)
    kokubu_words = [u"ない", u"高い", u"多い", u"少ない", "強い", "大きい", "小さい", "長い", "ながい",
                    u"良い", u"よい", u"いい", "悪い",
                    u"ある", "いる", "なる", "行く", "いく", "来る", "とる",
                    "見る", "みる", "言う", "いう", "得る", "過ぎる", "すぎる",
                    "する", "やる", "行なう", "行う", "おこなう", "出来る", "できる",
                    "おもう", "思う", "考える", "かんがえる", "わかる", "見える",
                    "知る", "しれる", "いえる", "示す", "述べる", "書く", "かく", "よる",
                    "異なる", "違う", "ちがう", "くらべる",
                    "入れる", "出る", "でる", "入る", "はいる",
                    "使う", "用いる", "もちいる", "持つ", "もつ", "作る", "つくる",
                    "なす", "起こる", "おこる", "つく", "つける", "聞く", "よぶ",
                    "かれる", "つまり", "上", "下", "次", "つぎ",
                    "わが国", "自分", "人々", "人びと", "別", "他", "間", "話", "例", "形", "日", "家", "手", "名", "身",
                    "そのもの", "一つ", "あと",
                    # 2016/01/24: added more highly skewed words and hiragana that had been forgotten
                    "きゃ", "きゅ", "きょ", "しゃ", "しゅ", "しょ", "ちゃ", "ちゅ", "ちょ", "にゃ", "にゅ", "にょ",
                    "ひゃ", "ひゅ", "ひょ", "みゃ", "みゅ", "みょ", "りゃ", "りゅ", "りょ", "ゎ",
                    "事", "目", "とこ", "中", "字", "お前", "全部", "きみ", "もらう",
                    ]
    for h in hira:
        stopwords.add(h)
    for k in kata:
        stopwords.add(k)
    for h in hankata:
        stopwords.add(h)
    for k in kazu:
        stopwords.add(k)
    for z in zenkazu:
        stopwords.add(z)
    for k in kazukan:
        stopwords.add(k)
    for m in minialpha:
        stopwords.add(m)
    for b in bigalpha:
        stopwords.add(b)
    for h in han_minialpha:
        stopwords.add(h)
    for h in han_bigalpha:
        stopwords.add(h)
    for h in hiramoji:
        stopwords.add(h)
    for k in katamoji:
        stopwords.add(k)
    for h in han_katamoji:
        stopwords.add(h)
    for k in kanmoji:
        stopwords.add(unicode(k))
    for k in kigou:
        stopwords.add(k)
    for k in kokubu_words:
        stopwords.add(unicode(k))
    print "set([",
    for s in sorted(stopwords):
        print "u\"{0}\",".format(s),
    print "])"
def z2h(filename):
    with open(filename, "r") as f:
        for line in f:
            print(mojimoji.zen_to_han(line), end="")
# -*- encoding: utf-8 -*-
__author__ = "koichi-ezato"
__date__ = "$2014/10/10"

import mojimoji

# encode unicode to utf-8
def unicode_to_utf8(r):
    return r.encode('utf-8')

# convert all full-width characters to half-width
print '----- 全角→半角変換 -----\r\n'
print 'target:アイウａｂｃ０１２\r\n'

zenAll = u'アイウａｂｃ０１２'
r = mojimoji.zen_to_han(zenAll)
print unicode_to_utf8(r)

# convert all full-width characters except full-width kana to half-width
r = mojimoji.zen_to_han(zenAll, kana=False)
print unicode_to_utf8(r)

# convert all full-width characters except full-width digits to half-width
r = mojimoji.zen_to_han(zenAll, digit=False)
print unicode_to_utf8(r)

# convert all full-width characters except full-width ASCII to half-width
r = mojimoji.zen_to_han(zenAll, ascii=False)
print unicode_to_utf8(r)

print '\r\n----- 全角→半角変換 -----\r\n'