Example #1
def extension_moji():
    # Augment the dataset: substitute the answer character (column 3)
    # with every letter in A-Z, あ-ん, and ア-ン.
    csv_file = "../ds_haiti_t.csv"
    extension_list = None
    all_moji_list = [chr(i) for i in range(ord('A'),
                                           ord('Z') + 1)
                     ] + [chr(i) for i in range(ord('あ'),
                                                ord('ん') + 1)
                          ] + [chr(i) for i in range(ord('ア'),
                                                     ord('ン') + 1)]
    with open(csv_file, 'r', encoding='cp932') as f:
        reader = csv.reader(f)
        extension_list = [next(reader)]
        for i in tqdm(range(500)):
            row = next(reader)
            tmp = copy(row)
            tmp_text = mojimoji.zen_to_han(tmp[1])
            if (mojimoji.zen_to_han(str(tmp[3])) not in tmp_text):
                print(i, tmp_text)
            for ex_i, ex_moji in enumerate(all_moji_list):
                ex_tmp = copy(tmp)
                ex_tmp_text = tmp_text.replace(
                    mojimoji.zen_to_han(str(ex_tmp[3])), str(ex_moji))
                ex_tmp[1] = ex_tmp_text
                ex_tmp[3] = ex_i
                extension_list.append(ex_tmp)
            # print(extension_list[-1])
    with open('../haiti_moji_extension_ds.csv', 'w', encoding="cp932") as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerows(extension_list)
Example #2
def register():
    '''
    Handle DNS registration request (POST).
    '''
    ipaddress = mojimoji.zen_to_han(request.form['ipaddress'].strip())
    hostname = mojimoji.zen_to_han(request.form['hostname'].strip())
    fullname = '{0}.{1}'.format(hostname, DOMAIN)
    error = None

    if not ipaddress:
        error = "Enter IP address. (IPアドレスを いれてください)"
    elif not is_valid_ipv4_address(ipaddress):
        error = "Invalid IP address format. (IPアドレスの フォーマットが まちがっています)"
    elif not hostname:
        error = "Enter a name. (なまえを いれてください)"
    elif not is_valid_hostname(hostname):
        error = "Invalid hostname. Use only alphabets, numbers, and hyphen. (なまえの フォーマットが まちがっています。アルファベット、すうじ、ハイフンだけが つかえます)"
    elif ipaddress not in IPLIST:
        error = "This IP address is not ours. (このIPアドレスは、わたくしたちの ものでは ありません)"

    if error is None:
        error = add_dns_resource(ipaddress, fullname)

    if error:
        session['error'] = error
        session['ipaddress'] = ipaddress
        session['hostname'] = hostname
        return redirect(url_for('show_error'))
    else:
        session['ipaddress'] = ipaddress
        session['fullname'] = fullname
        return redirect(url_for('show_success'))
Example #3
def process(text):
    a = None
    b = None
    err_corr = text.split("\t")
    if len(err_corr) == 2:
        err = mojimoji.zen_to_han(err_corr[0].rstrip('\n'), kana=False)
        err = mojimoji.han_to_zen(err, ascii=False, digit=False)
        corr = mojimoji.zen_to_han(err_corr[1].rstrip('\n'), kana=False)
        corr = mojimoji.han_to_zen(corr, ascii=False, digit=False)
        err_lang = utils.lang_check(err, lang)
        corr_lang = utils.lang_check(corr, lang)

        if err_lang and corr_lang:

            errs = list(err)
            corrs = list(corr)
            del_num, ins_num = ld.levenshtein_distance(errs, corrs)
            del_portion = del_num / len(errs)
            ins_portion = ins_num / len(corrs)


            if (del_num < d_num and ins_num < i_num and del_portion < 0.4 and ins_portion < 0.4)\
                    and (corrs[-1]== '。' or corrs[-1]== '?' or corrs[-1]== '!') \
                    and (corrs[-2] not in numlist) and ('__' not in corr) and (len(corr)>6):
                # clean numbering like "1) "
                err = re.sub(r"\d+\)\s+", "", err)
                corr = re.sub(r"\d+\)\s+", "", corr)
                err = re.sub(r"\(\s", "", err)
                corr = re.sub(r"\(\s", "", corr)
                err = re.sub(r"\s\)", "", err)
                corr = re.sub(r"\s\)", "", corr)
                # clean numbering like "1.) "
                err = re.sub(r"\d+\.\)\s*", "", err)
                corr = re.sub(r"\d+\.\)\s*", "", corr)
                # clean numbering like "1. "
                err = re.sub(r"\d+\.\s*", "", err)
                corr = re.sub(r"\d+\.\s*", "", corr)
                # clean strings beginning with ・
                err = re.sub(r"・\s+", "", err)
                corr = re.sub(r"・\s+", "", corr)
                # clean strings beginning with *
                err = re.sub(r"\*\s+", "", err)
                corr = re.sub(r"\*\s+", "", corr)
                # clean strings beginning with **
                err = re.sub(r"\*\*\s+", "", err)
                corr = re.sub(r"\*\*\s+", "", corr)
                # clean strings beginning with -
                err = re.sub(r"-\s+", "", err)
                corr = re.sub(r"-\s+", "", corr)
                # clean conversation tags
                err = re.sub(r"A:\s*", "", err)
                corr = re.sub(r"A:\s*", "", corr)
                err = re.sub(r"B:\s*", "", err)
                corr = re.sub(r"B:\s*", "", corr)
                a = err
                b = corr

                return a, b
Example #4
def fetch_from_ranking(year):
    assert (len(year) == 6)
    r = requests.get(
        f"https://www.karatetsu.com/ranking/index.php?top_ym={year}")
    soup = BeautifulSoup(r.content, "html5lib")
    return [(match_paren.sub(
        "", zen_to_han(s.select("td:nth-of-type(3)")[0].text, kana=False)),
             zen_to_han(s.select("td:nth-of-type(4)")[0].text, kana=False))
            for s in soup.select("#ranking tr")[2:]]
Example #5
def load_file(path):
    with open_file(path) as f:
        for line in f:
            line = mojimoji.zen_to_han(line, kana=False)
            line = digit_pattern.sub('#', line)
            words = line.rstrip().split(' ')
            yield words
Example #6
def title_torkenize(sentence):
    sentence = mojimoji.zen_to_han(sentence)
    sentence = re.sub(
        "[\._-―─!@#$%^&\-‐|\\*\“()_■×+α※÷⇒♬◉ᴗ͈ˬ—●▲★☆⭐️⭕⚡⚠①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮♡⭐︎〇◎◆♦▼◼◇△□(:〜~+=)/*&^%$#@!~`)♪ᴖ◡ᴖー{}[]↑↓←→➡⇩™・⊡…\[\]\"\'\”\’:;<>?<>〔〕\r\−〈〉?、、。。・,\./『』【】「」「」→←○《》≪≫\n\u3000]",
        " ", sentence)
    sentence = re.sub("[あ-ん]", " ", sentence)
    sentence = re.sub("( | )+", " ", sentence)
    sentence = sentence.lower()
    # remove "〇〇様専用" (reserved-for-a-specific-customer) phrases
    sentence = re.sub("[^ ]*専用", "", sentence)
    sentence = re.sub("[^ ]*様", "", sentence)
    # remove single-letter alphabetics
    sentence = re.sub(" [a-z][^a-z]", " ", sentence)
    # remove emoji
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        "]+",
        flags=re.UNICODE)
    sentence = emoji_pattern.sub(r'', sentence)
    sentence = sentence.strip()

    return sentence
Example #7
def convert(fo, heisei):
    if heisei == 31:
        ext = "xlsx"
    else:
        ext = "xls"
    sheets = pd.read_excel(f"xls/{heisei}.{ext}", sheet_name=None)

    for sheet_name, df in sheets.items():
        m = re.search(r"\d+", sheet_name)
        if m is None:
            continue
        # pylint: disable=c-extension-no-member
        month = mojimoji.zen_to_han(m.group())
        if int(month) >= 4:
            year = heisei + 1988
        else:
            year = heisei + 1988 + 1
        for row in df.itertuples():
            if row[2] != "日":
                continue
            day = row[1]
            ymd = f"{year}-{month}-{day}"
            try:
                t = dt.strptime(ymd, "%Y-%m-%d")
            except ValueError as e:
                print(e)
                continue
            fo.write(t.strftime("%Y-%m-%d,"))
            fo.write(','.join(map(str, row[3:8])))
            fo.write("\n")
Example #8
def get_x_y_text_from_xml(page):
    """
    xmlからx,y,textを抽出した結果をリストとして返す
    :param page:    ElementTreeで抽出したxml
    :return:        x,y,textの辞書が格納された1次元配列
    """

    x_y_text_list = []
    for textbox in page:
        for textline in textbox:
            for text in textline:
                if text.text != '\n' and 'bbox' in text.attrib:
                    bbox = text.attrib['bbox'].split(',')
                    x_y_text_list.append({
                        'x': float(bbox[0]),
                        'y': float(bbox[1]),
                        'text': mojimoji.zen_to_han(text.text, kana=False)
                    })

    if len(x_y_text_list) == 0:
        os.remove(tmp_file_path)
        frame = inspect.currentframe()
        abort(500, {
            'code': frame.f_lineno,
            'msg': '情報抽出中にエラーが発生しました',
            'param': None
        })

    return x_y_text_list
Example #9
def make_corpus(docs, debug=False):
    """
    複数の文書からコーパスを作成する
    @docs 文書のリスト
    @return トークナイズされた文書のリスト
    """
    docs = list(
        map(
            lambda d: list(
                filter(lambda x: x.strip() != "", re.split("\n|。", d.lower()))
            ), docs))

    docs = [
        list(map(lambda x: mojimoji.zen_to_han(x), lines)) for lines in docs
    ]

    analyzer = Analyzer([
        UnicodeNormalizeCharFilter(),
        RegexReplaceCharFilter(r'[(\)、。「」]', ' ')
    ], JanomeTokenizer(), [
        POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
        ExtractAttributeFilter('base_form')
    ])

    corpus = [
        list(
            itertools.chain.from_iterable(
                [list(analyzer.analyze(l)) for l in lines])) for lines in docs
    ]

    if debug:
        print("\n".join(" ".join(tokens) for tokens in corpus))

    return corpus
Example #10
def preprocess(doc, debug=False):
    """
    ドキュメントを引数にとってそれを前処理した上でトークナイズされた文のリストに分割する
    @param doc 対象のドキュメント
    @return 前処理されたドキュメントに含まれる文のリスト
    """

    doc = doc.lower()

    lines = re.split("\n|。", doc)
    lines = list(filter(lambda x: x != "", map(lambda x: x.strip(), lines)))
    sentences = copy.deepcopy(lines)
    lines = list(map(lambda x: mojimoji.zen_to_han(x), lines))

    analyzer = Analyzer([
        UnicodeNormalizeCharFilter(),
        RegexReplaceCharFilter(r'[(\)、。「」]', ' ')
    ], JanomeTokenizer(), [
        POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
        ExtractAttributeFilter('base_form')
    ])
    corpus = [' '.join(analyzer.analyze(l)) + '。' for l in lines]
    if debug:
        print("\n".join(corpus))

    return sentences, corpus
Example #11
async def on_message(message):
    if message.channel.id != channel_id: return

    # ignore bot messages
    if message.author.bot:
        return

    # verbose
    if re.search(r'(ママ|まま)(おねがい|お願い)', message.content):
        if message.author.name == 'GESU':
            await message.channel.send("はいはい、ファラリスの雄牛に入りたいのね")
        else:
            await message.channel.send("ママじゃないよ")

    # message handler (reservation)
    if re.search(r'(まー|マー|麻)(じゃん|ジャン|雀).*(予約|よやく)', message.content):
        await reserv(message)

    # message handler (advance the reservation queue)
    if re.search(r'(次は誰|次誰)', message.content):
        await next_reserv(message)

    # message handler (thanks from the previous starter)
    if re.search(r'.*ありがと.*', message.content):
        if len(turn_list) == 0:
            return

        if turn_list[0].author.name == message.author.name:
            if len(turn_list) == 1:
                await message.channel.send("お疲れ様:heart:")
                turn_list.clear()
                return

            await next_reserv(message)

    # message handler (rescue ID)
    if re.search(r'^[a-zA-Z0-9\s]+$', mojimoji.zen_to_han(message.content),
                 re.IGNORECASE):
        if len(turn_list) < 2:
            return

        if turn_list[1].author.name == message.author.name:
            await message.channel.send("ふぇぇ……誰にも呼ばれない")
            await next_reserv(message)

    # mention handler
    if bot.user in message.mentions:
        # accept a reservation
        if re.search(r'.*予約.*', message.content):
            await reserv(message)
        # process the reservation queue
        elif re.search(r'.*次.*', message.content):
            await next_reserv(message)
        elif message.content == 'help':
            await help(message)
        else:
            await message.channel.send("私の扱い方だわに")
            await help(message)

    await bot.process_commands(message)
Example #12
def extractEntryAndIndex():

    print('Start extracting entry and index')

    # read the temporary file
    f = codecs.open(f_temp_path, 'r', 'utf-8')
    entryIdForIndex = 0
    pbar = tqdm(range(15958))

    for line in f:
        start = line.find("<dt id=")
        # Extract Entry
        if start > -1:
            end = line.find("<a")
            entryId = line[start + 8:20]
            title = line[22:end - 1]
            storeEntryToDB(entryId, title)
            entryIdForIndex += 1
            pbar.update(1)

        # Extract index
        elif line.find("<key") > -1:
            # Ignore the Kana type and store index into database
            if line.find('type="かな"') < 0:
                value_end = line.find("</key>")
                title_end = line.find("type=")
                value = line[title_end + 10:value_end]
                value = mojimoji.zen_to_han(value, kana=False).lower()
                title = line[12:title_end - 2]
                storeIndexToDB(entryIdForIndex, value, title)

        elif line.find("&#x01;") > 0:
            break

    f.close()
Example #13
def preprocessing(text: str) -> str:
    result: str = text
    # full-width -> half-width
    result = mojimoji.zen_to_han(result, kana=False)
    # number -> kanji
    result = re.sub(
        r'\d+',
        lambda m: kanjize.int2kanji(int(m.group(0))),
        result,
    )
    # remove "w" laughter (笑)
    result = re.sub(
        r'[a-z]+',
        lambda m: ''
        if re.match(r'^w+$', m.group(0), re.IGNORECASE) else m.group(0),
        result,
        flags=re.IGNORECASE)
    # remove symbolic char
    result = re.sub(r'[\x21-\x2F\x3A-\x40\x5B-\x60\x7B-\x7E]', '', result)
    result = re.sub(r'[！-／：-＠［-｀｛、。”’・　 ]', '', result)
    # remove emoji
    result = ''.join(ch for ch in result
                     if ch not in emoji.UNICODE_EMOJI['en'])

    return result
Example #14
def convert_children_count(children_str):
    # convert when the value is a string
    if type(children_str) is str:
        return mojimoji.zen_to_han(children_str.strip("人"))
    else:
        # otherwise return an empty string
        return ''
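
A quick sanity check of the conversion above (hypothetical inputs; a minimal sketch, not from the original source):

print(convert_children_count('３人'))               # full-width '３' -> '3'
print(repr(convert_children_count(float('nan'))))  # non-string (e.g. NaN from pandas) -> ''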
Example #15
def handle_message(event):
    if event.reply_token == "00000000000000000000000000000000":
        return

    message = mojimoji.zen_to_han(event.message.text.lower(), kana=False)

    # extract the period (scope)
    if re.match(".*(日|デイリー|daily).*", message):
        scope = "daily"
    elif re.match(".*(週|ウィークリー|weekly).*", message):
        scope = "weekly"
    elif re.match(".*(月|マンスリー|monthly).*", message):
        scope = "monthly"
    else:
        scope = None

    # if a scope was found, get the number of articles
    if scope is None:
        return "No reply"
    else:
        max_amount = re.search("[0-9]+(つ|こ|個|本|記事)", message)
        if max_amount is not None:
            max_amount = int(re.search("[0-9]+", max_amount.group()).group())
        else:
            max_amount = 5

    info = get_trend_info(scope, max_amount)
    info["scope"] = scope
    reply = make_message(info)

    line_bot_api.reply_message(event.reply_token, TextSendMessage(text=reply))
Example #16
def format_zen_han(l):
  import mojimoji
  l = l.decode('utf-8') if type(l) == str else l
  l = mojimoji.zen_to_han(l, kana=False)  # full-width digits/letters -> half-width
  l = mojimoji.han_to_zen(l, digit=False, ascii=False)  # half-width kana -> full-width
  l = l.encode('utf-8')
  return l
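
A minimal Python 3 sketch of the same kana-preserving round trip (the function above targets Python 2, hence the decode/encode; the sample string is hypothetical):

import mojimoji

s = 'Ｔｅｓｔ１２３ ﾃｽﾄ'
s = mojimoji.zen_to_han(s, kana=False)                # 'Test123 ﾃｽﾄ'
s = mojimoji.han_to_zen(s, digit=False, ascii=False)  # 'Test123 テスト'
print(s)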
Example #17
def text_to_char_index(full_vocab,
                       real_vocab_number,
                       chara_bukken_revised,
                       sentence_text,
                       addition_translate,
                       comp_width=COMP_WIDTH,
                       preprocessed_char_number=0,
                       skip_unknown=False,
                       shuffle=None):
    # mode:
    # average: will repeat the original index to #comp_width for the process of the embedding layer
    # padding: will pad the original index to #comp_width with zero for the process of the embedding layer
    # char_emb_dim  char embedding size
    # comp_width  #components used

    if preprocessed_char_number == 0:
        preprocessed_char_number = len(full_vocab)

    # convert digits and Latin letters to hankaku (half-width)
    text = mojimoji.zen_to_han(sentence_text, kana=False)
    # convert kana to zenkaku (full-width)
    text = mojimoji.han_to_zen(text, digit=False, ascii=False)
    # convert katakana to hiragana
    _, katakana2hiragana, _ = _make_kana_convertor()
    _, katakana2hiragana, _ = _make_kana_convertor()
    text = katakana2hiragana(text)
    text = text.translate(addition_translate)
    # finally, lowercase
    text = text.lower()
    # expanding every character with 3 components
    ch2id = {}
    for i, w in enumerate(full_vocab):
        ch2id[w] = i
    int_text = []
    # print(text)
    for c in text:
        # print(c)
        try:
            i = ch2id[c]
        except KeyError:
            print("Unknown Character: ", c)
            if skip_unknown:
                continue  # skip unknown words
            else:
                i = 1  # assign to unknown words
        # print(i)
        if real_vocab_number < i < preprocessed_char_number:
            comps = chara_bukken_revised[i]
            if shuffle == "flip":
                comps = comps[::-1]
            # print(comps)
            if len(comps) >= comp_width:
                int_text += comps[:comp_width]
            else:
                int_text += comps + [0] * (comp_width - len(comps))
        else:
            if shuffle == "random":
                if i < real_vocab_number:
                    i = (i + 20) % real_vocab_number
            int_text += [i] + [0] * (comp_width - 1)
    return int_text
Example #18
    def __word_normiraze(self, word):
        """単語の正規化"""
        word = word.lower()
        word = mojimoji.han_to_zen(word, ascii=False, digit=False)
        word = mojimoji.zen_to_han(word, kana=False)

        return word
Example #19
def extension_day():
    # variant spellings for day 1 (Thursday) and day 2 (Friday)
    day_1 = ['1日目','一日目','木曜日','1','一','木']
    day_2 = ['2日目','二日目','金曜日','2','二','金']
    csv_file = "../ds_haiti.csv"
    extension_list = None
    with open(csv_file,'r',encoding='cp932') as f:
        reader = csv.reader(f)
        extension_list = [next(reader)]
        # reader_list = [r for r in reader]
        for row in tqdm(reader):
            tmp = copy(row)
            tmp[2] = int(tmp[2]) - 1
            tmp_text = mojimoji.zen_to_han(tmp[1])
            if tmp[2] == 0:
                for d in day_1:
                    tmp_day = copy(tmp)
                    tmp_text_ex = tmp_text.replace('木曜日',str(d))
                    tmp_day[1] = tmp_text_ex
                    extension_list.append(copy(tmp_day))
            else:
                for d in day_2:
                    tmp_day = copy(tmp)
                    tmp_text_ex = tmp_text.replace('金曜日',str(d))
                    tmp_day[1] = tmp_text_ex
                    extension_list.append(copy(tmp_day))
    with open('../haiti_day_extension_ds.csv', 'w' ,encoding="cp932") as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerows(extension_list)
Example #20
def tokenizer(text):
    text = mojimoji.zen_to_han(text.replace("\n", ""), kana=False)
    parsed = tagger.parse(text).split("\n")
    parsed = [t.split("\t") for t in parsed]
    parsed = list(filter(lambda x: x[0] != "" and x[0] != "EOS", parsed))
    parsed = [p[2] for p in parsed]
    return parsed
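
Note: this snippet assumes a module-level MeCab tagger whose output carries the base form in the third tab-separated field; a ChaSen-formatted tagger fits that layout (an assumption, since the original setup is not shown):

import MeCab

# ChaSen output: surface \t reading \t base form \t POS ...
tagger = MeCab.Tagger('-Ochasen')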
Example #21
def parse_address(text, debug=False):
    '''
    Get location
    '''
    text = mojimoji.zen_to_han(text, kana=False)
    text = re.sub(r'[\((].*[)\)]', '', text)
    text = re.sub(r'[\s、]', '', text)
    _ward = re.search(r'[a-zA-Z一-龥ぁ-んァ-ヶ・ー]+区', text)
    if _ward:
        ward = _ward.group()
        text = re.sub(ward, '', text)  # remove the ward from the remaining text
    else:
        raise ValueError(text)

    text = text.replace('丁目', '-')
    text = text.replace('I', '1')
    text = re.sub(r'(以下|詳細)*未定', '', text)
    text = re.sub(r'[ー‐―−]', '-', text)
    text = re.sub(r'(\d)番[地]*', r'\1-', text)
    text = re.sub(r'[-]{2,}', r'-', text)
    text = re.sub(r'([a-zA-Z一-龥ぁ-んァ-ヶ・ー])(\d)', r'\1-\2', text)
    text = re.sub(r'(\d)号', r'\1', text)
    lines = text.split('-')
    lines += ['' for i in range(4 - len(lines))]  # adjust length
    return (ward, *lines)
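
A rough trace of parse_address on a hypothetical address (the expected tuple follows from reading the regexes above, not from the original source):

print(parse_address('中央区日本橋1丁目2番3号'))
# -> ('中央区', '日本橋', '1', '2', '3')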
Example #22
def _normalize(s):
    """
    mecab-ipadic-neologd の正規化処理(一部修正)を適用する
    ref: https://github.com/neologd/mecab-ipadic-neologd/wiki/Regexp.ja
    
    Parameters
    ----------
    s : str
        raw text

    Returns
    -------
    str
        normalized text
    """

    s = s.strip()

    s = regex.sub('[˗֊‐‑‒–⁃⁻₋−]+', '-', s)  # normalize hyphens
    s = regex.sub('[﹣－ｰ—―─━ー]+', 'ー', s)  # normalize choonpus
    s = regex.sub('[~∼∾〜〰～]', '〜', s)  # normalize tildes

    s = _remove_extra_spaces(s)
    s = regex.sub('[’]', '\'', s)
    s = regex.sub('[”]', '"', s)

    s = mojimoji.han_to_zen(s, digit=False, ascii=False)
    s = mojimoji.zen_to_han(s, kana=False)
    s = s.lower()

    return s
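
A rough before/after sketch using the classic example from the neologd wiki (_remove_extra_spaces is not shown here, so this assumes it collapses spaces between CJK characters as that recipe does):

print(_normalize('検索 エンジン 自作 入門 を 買い ました!!!'))
# -> '検索エンジン自作入門を買いました!!!'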
Example #23
def test_mojimoji():
    logging.info("=========================================")
    logging.info("=               mojimoji                =")
    logging.info("=========================================")
    test_cases = get_test_cases()
    for tc in test_cases:
        title = tc['title']
        body = tc['body']

        logging.info("ひらがな(全角) to カタカナ(全角) for %s" % title)
        logging.info("Not implemented")

        logging.info("カタカナ(全角) to ひらがな(全角) for %s" % title)
        logging.info("Not implemented")

        logging.info("ひらがな(全角) to カタカナ(半角) for %s" % title)
        logging.info("Not implemented")

        logging.info("半角 to 全角 for %s" % title)
        calc_time(mojimoji.han_to_zen, body)
        logging.debug("result: %s" % mojimoji.han_to_zen(body))

        logging.info("全角 to 半角 for %s" % title)
        calc_time(mojimoji.zen_to_han, body)
        logging.debug("result: %s" % mojimoji.zen_to_han(body))
Example #24
    async def process_commands(self, message):
        if message.author.bot:
            return

        # Back up the original content.
        # A new instance is created for each message.
        self.__message_stocker = MessageAccumulation(self, message)

        # rewrite
        if message.content.startswith(self.command_prefix):
            # Only modify the content when it starts with command_prefix.
            # The string manipulation must not interfere with command processing.

            # zen-to-han conversion of command-like characters
            zen_to_han_command_line = self.regex_command.sub(
                lambda match: mojimoji.zen_to_han(match.group(0)).lower(),
                message.content)

            # Mark the line breaks, join everything into one line, then split on commands.
            linking_command = ("🃴".join(zen_to_han_command_line.splitlines())).replace('　', ' ')
            split_linking_command = self.regex_command.split(linking_command)
            removal_blank_line = [item for item in split_linking_command if item != ""]
            #print(f"removal_blank_line: {removal_blank_line}")

            # Reassemble the content, command symbol first.
            command_line_list = []
            split_line = []
            for item in removal_blank_line:
                if self.regex_command.match(item):
                    if len(split_line) >= 1:
                        command_line_list.append("".join(split_line))
                    split_line = []
                split_line.append(item)
            if len(split_line) >= 1:
                command_line_list.append("".join(split_line))
            #print(f"command_line_list: {command_line_list}")

            # Rewrite message.content and disguise it as a normal message.
            for command_line in command_line_list:
                cr_lines = [item for item in command_line.replace('🃴', '\n').splitlines() if item != ""]
                for line in cr_lines:
                    # new message.content
                    message.content = line
                    print("----Modified message:\n" + message.content)

                    # NOTE: If message.delete were used here to remove the
                    # command author's message, a single message could be
                    # deleted twice, so be careful.
                    ctx = await self.get_context(message)
                    self.__message_stocker.set_last_context(ctx)
                    await self.invoke(ctx)

            # Send the accumulated messages all at once.
            await self.__message_stocker.release_send()

        else:
            # Otherwise, behave as usual.
            ctx = await self.get_context(message)
            self.__message_stocker.set_last_context(ctx)
            await self.invoke(ctx)
Example #25
def oracle_create():

    logger.debug("start oracle_create")
    form = OracleCreateForm()

    if request.method == 'GET':
        logger.debug("start oracle_create GET")
        return render_template(ORACLE_PATH + '/create.html', form=form)

    logger.debug("start oracle_create POST")
    form = OracleCreateForm(request.form)

    if request.method == 'POST' and form.validate():
        session['username'] = request.form['username']
        session['email'] = request.form['email']

        #logger.debug("UT_ENTRYTEMP insert")
        entry = Entry.dict()
        entry['UNAME'] = session['username']
        entry['UKANA'] = mojimoji.zen_to_han(session['username'])
        entry['UEMAIL'] = session['email']

        if Entry.insert(entry):
            flash("登録できました。", "success")
        else:
            flash("登録できませんでした。", "danger")

        return redirect("/sample/oracle/")

    return render_template(ORACLE_PATH + 'create.html', form=form)
Example #26
    def executeConvert(self):
        for path in glob.glob('./pdf/*'):
            input_path = path
            output_file = os.path.splitext(os.path.basename(input_path))[0]
            output_path = os.path.join('./text', output_file + '.txt')

            rsrcmgr = PDFResourceManager()
            codec = 'utf-8'
            params = LAParams()
            text = ""
            with StringIO() as output:
                device = TextConverter(rsrcmgr,
                                       output,
                                       codec=codec,
                                       laparams=params)
                with open(input_path, 'rb') as input:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(input):
                        interpreter.process_page(page)

                    text += output.getvalue()
                device.close()
            output.close()
            # half-width spaces crop up, so trim them
            text = re.sub(r' |　', '', text.strip())
            text = mojimoji.zen_to_han(text)
            # output text
            with open(output_path, "wb") as f:
                f.write(text.encode('utf-8', "ignore"))
Example #27
def result():
    if request.method == 'POST':

        query = request.form["target_text"]
        ud_rate = request.form["ud_rate"]
        ud_rate_percent = float(ud_rate) / 100
        query = mojimoji.zen_to_han(query, kana=False)

        txt_seg = TextSegmentation()
        r_dict           = txt_seg.segment_text(query, 99)       # query, limit
        r_dict_popped    = txt_seg.pop_search_words(ud_rate_percent, r_dict)  #### when called via the router, the dict elements somehow stop being lists on the last loop
        r_dict_joined    = txt_seg.join_dict_elements(r_dict_popped, 3) # minimum elements
        search_word_dict = txt_seg.reindex_r_dict(r_dict_joined)

        twi = TwiSearch(session)
        search_result = twi.make_search_result(search_word_dict)

        jf = JsonFormatter()
        init_tweet_list_json = jf.init_tweet_list_json(search_word_dict, search_result)
        search_word_json = jf.search_dict_to_json(search_word_dict)
        tweet_list_json = jf.input_tweet_list_json(search_word_dict, search_result, init_tweet_list_json)
        tweet_list_json = jf.del_empty_json(tweet_list_json, search_word_dict)

        # Save function
        # from model import Model
        # model = Model()
        # model.save_result_tweet('json_data/result_tweet_json8.json', tweet_list_json)
        # tweet_list_json = model.load_search_result('json_data/result_tweet_json8.json')

        return render_template("result.html", ud_rate = ud_rate, tweet_list_json=tweet_list_json, search_word_json=search_word_json)
Example #28
def mysql_create():

    logger.debug("start mysql_create")
    form = MySqlCreateForm()

    if request.method == 'GET':
        logger.debug("start mysql_create GET")
        return render_template(MYSQL_PATH + 'create.html', form=form)

    logger.debug("start mysql_create POST")
    form = MySqlCreateForm(request.form)

    if request.method == 'POST' and form.validate():
        session['username'] = request.form['username']
        session['email'] = request.form['email']

        #logger.debug("UT_ENTRYTEMP insert")
        user = User.dict()
        user['UNAME'] = session['username']
        user['UKANA'] = mojimoji.zen_to_han(session['username'])
        user['UEMAIL'] = session['email']

        if User.insert(user):
            flash("登録できました。", "success")
        else:
            flash("登録できませんでした。", "danger")

        return redirect(MYSQL_PATH)

    return render_template(MYSQL_PATH + 'create.html', form=form)
Example #29
 def tokenize(self, manuscript: str) -> list:
     token_list = []
     append = token_list.append
     try:
         tokens = self._m.parse(manuscript).split('\n')
     except IndexError:
         print(manuscript)
         return None
     for tok in tokens:
         # surface form\tPOS,POS sub1,POS sub2,POS sub3,inflection form,inflection type,base form,reading,pronunciation
         tok = re.split(r'[\,\t]', tok)
         if len(tok) < 10:
             continue
         ps = tok[1]
         if ps not in ['名詞', '動詞', '形容詞']:
             continue
         # if a base form exists, add it to the list
         w = tok[7]
         if w == '*' or w == '':
             # otherwise add the surface form (the word as written in the manuscript)
             w = tok[0]
         if w == '' or w == '\n':
             continue
         # convert all full-width alphanumerics to half-width
         w = mojimoji.zen_to_han(w, kana=False, digit=False)
         # convert all half-width katakana to full-width
         w = mojimoji.han_to_zen(w, digit=False, ascii=False)
         # lowercase all English
         w = w.lower()
         append(w)
     return token_list
Example #30
def get_shoplist_pref(filedir: str = ""):
    """
    filedir : 都道府県ごとの店情報が格納されたテキストファイルの場所

    1つの都道府県に関して,店の名前のリストを作成し,返す
    """

    shoplist = []
    contents = pd.read_csv(filedir)
    ltd = re.compile(r"(株式|有限|合資)会社")
    bracket = re.compile(r"\(.+\)")
    for shopname in contents["name"]:
        # カタカナ以外の文字を半角へ
        shopname = moji.zen_to_han(shopname, kana=False)
        # 括弧に囲まれた文字列を削除
        shopname = bracket.sub("", shopname)
        # 〇〇会社という文字列は除く
        shopname = ltd.sub("", shopname)
        # /で区切られていたら区切られる前の文字列と
        # 区切り文字を消した文字列を格納する
        if shopname.find("/") > -1:
            shoplist.append(shopname[:shopname.find("/")])
            shopname = shopname.replace("/", "")
        shoplist.append(shopname)
    return shoplist
Example #31
def normalize_numbers(sentence: str,
                      replacement: str = "0",
                      pattern: str = None) -> str:
    """正規表現patternに該当する数字をreplacementに置換する

    Parameters
    ----------
    sentence : str
        処理したい文章
    replacement : str, optional
        処理後の置換文字列, by default "0"
    pattern : str, optional
        処理したい正規表現のパターン
        Noneの場合はデフォルトパターン"(\d+[,,]*)+"で処理, by default None

    Returns
    -------
    str
        数字正規化処理後の文字列
    """

    if not pattern:
        pattern = normalize_numbers_default_pattern
    hankaku_num_sentence = mojimoji.zen_to_han(sentence,
                                               kana=False,
                                               digit=True,
                                               ascii=False)
    return re.sub(pattern, replacement, hankaku_num_sentence)
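
A quick usage sketch, assuming the module-level default matches the docstring pattern:

normalize_numbers_default_pattern = r"(\d+[,，]*)+"

print(normalize_numbers("価格は１，０００円です"))
# -> '価格は0円です'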
Example #32
def normalize_text(text):
    try:
        text = zen_to_han(text, kana=False)
    except TypeError:
        pass  # non-unicode object

    text = re.sub(u'\r?\n', '', text, flags=re.M)
    text = re.sub(u'－', '-', text, flags=re.M)
    return text
Example #33
def normalize_text(text):
    text = str(text)
    text = text.lower()  # all lower
    text = mojimoji.zen_to_han(text)  # all hankaku
    text = norm_numeric(text)
    text = norm_emoji(text)
    text = norm_url(text)
    text = norm_continuous_char(text)
    return text
Example #34
def test_zen_to_han():
    eq_(u'ｱｲｳｴｵ', mojimoji.zen_to_han(u'アイウエオ'))
    eq_(u'ｶﾞｷﾞｸﾞｹﾞｺﾞ', mojimoji.zen_to_han(u'ガギグゲゴ'))
    eq_(u'ﾊﾟﾋﾟﾌﾟﾍﾟﾎﾟ', mojimoji.zen_to_han(u'パピプペポ'))
    eq_(u'0123', mojimoji.zen_to_han(u'０１２３'))
    eq_(u'abcABC', mojimoji.zen_to_han(u'ａｂｃＡＢＣ'))
    eq_(u'#?!', mojimoji.zen_to_han(u'＃？！'))
    eq_(u'あいうえお', mojimoji.zen_to_han(u'あいうえお'))
Example #35
    def wakachi(self):
        u"""分かち書きを行う

        Returns:
            辞書型で結果を返す
        """
        md = config.m_mecab_dic

        tagger = MeCab.Tagger(md.option)
        tagger.parse('')


        emoji = re.compile(u'^U00')
        kigou = re.compile(u'^[!-~]$')

        # normalize full-width/half-width characters
        self.text = mojimoji.zen_to_han(self.text, kana=False, digit=True, ascii=True)
        self.text = mojimoji.han_to_zen(self.text, kana=True, digit=False, ascii=False)

        node = tagger.parseToNode(self.text.encode('utf-8'))
        words = []

        while node:
            pos = node.feature.split(",")[md.pos]
            if pos == "形容詞" or pos == "形容動詞" or pos == "動詞" or pos == "名詞":
                if len(node.feature.split(",")) <= md.base:
                    base = node.surface
                else:
                    base = node.feature.split(",")[md.base]

                if base == "*":
                    base = node.surface
                # exclude emoji and single hiragana/katakana characters
                if (emoji.match(unicode(base)) is not None) or (kigou.match(unicode(base)) is not None):
                    pass
                # exclude if contained in the stopword list
                elif unicode(base) in get_stopwords():
                    pass
                else:
                    # store uppercase letters lowercased
                    words.append(base.lower())
            node = node.next

        wakachi = map(str, words)
        wakachi = " ".join(wakachi)

        if "\n" in wakachi:
            wakachi = wakachi.split("\n", 1)[0].strip()
        self.wakachigaki = wakachi

        return {'_id': self.id, 'screen_name': self.screen_name, 'text': self.text, 'wakachi': wakachi}
Example #36
def get_bow(content):
    """
    We assume that the argument is written in Japanese.
    """

    # Convert full-width to half-width.
    content = mojimoji.zen_to_han(content.decode('utf-8')).encode('utf-8').lower()

    # Morphological analysis
    bow = mecab.parse(content.lower())
    rst = []
    for w in bow['nouns']+bow['verbs']: # Extract nouns and verbs.
        if not sw.is_stop_word(w):
            rst.append(w)
    return rst
Example #37
    def normalize(self, str):
        """
        Normalize a Japanese string.

        Full-width alphanumerics and symbols are converted to half-width (katakana excluded).

        >>> normalizer = BasicNormalizer()
        >>> normalizer.normalize('日本語あいうえおアイウエオ')
        '日本語あいうえおアイウエオ'
        >>> normalizer.normalize('0123456789')
        '0123456789'
        >>> normalizer.normalize('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
        'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        >>> normalizer.normalize('abcdefghijklmnopqrstuvwxyz')
        'abcdefghijklmnopqrstuvwxyz'
        >>> normalizer.normalize('a !”#$%&’()*+,−./:;<=>?@[¥]^_‘{|}〜')
        'a !"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'

        Newlines and tabs are replaced with spaces.

        >>> normalizer.normalize('a\\tb\\rc\\nd')
        'a b c d'

        Consecutive spaces are collapsed into one.

        >>> normalizer.normalize('a b  c   d')
        'a b c d'

        Leading and trailing whitespace is removed.

        >>> normalizer.normalize('\\t\\r\\n a\\t\\r\\n ')
        'a'
        """

        str = mojimoji.zen_to_han(str, kana=False)
        str = re.sub('[\t\r\n]', ' ', str)
        str = re.sub(' {2,}', ' ', str)
        str = str.strip()
        return str
Example #38
def make_stopwords():
    u"""コピペ用ストップワードを作成して表示

    """
    import mojimoji
    import cnvk
    stopwords=set()
    hira=u"あいうえおかきくけこさしすせそたちつてとなにぬねのはひふへほまみむめもらりるれろやゐゆゑよわをんがぎぐげござじずぜぞだぢづでどばびぶべぼぱぴぷぺぽぁぃぅぇぉゃゅょっゔ"
    kata=[]
    for h in hira:
        kata.append(cnvk.convert(h,cnvk.HIRA2KATA,cnvk.Z_KATA))
    kata.append(u"ヴ")
    hankata=[]
    for k in kata:
        hankata.append(mojimoji.zen_to_han(k))
    kazu=u"0123456789"
    stopwords.add(u"10")
    stopwords.add(u"11")
    stopwords.add(u"12")
    stopwords.add(u"13")
    stopwords.add(u"14")
    stopwords.add(u"15")
    stopwords.add(u"16")
    stopwords.add(u"17")
    stopwords.add(u"18")
    stopwords.add(u"19")
    stopwords.add(u"20")
    stopwords.add(u"10")
    stopwords.add(u"11")
    stopwords.add(u"12")
    stopwords.add(u"13")
    stopwords.add(u"14")
    stopwords.add(u"15")
    stopwords.add(u"16")
    stopwords.add(u"17")
    stopwords.add(u"18")
    stopwords.add(u"19")
    stopwords.add(u"20")
    zenkazu=mojimoji.han_to_zen(kazu)
    kazukan=u"一二三四五六七八九十百千万億兆"
    minialpha=u"abcdefghijklmnopqrstuvwxyz"
    bigalpha=u"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    han_minialpha=mojimoji.han_to_zen(minialpha)
    han_bigalpha=mojimoji.han_to_zen(bigalpha)
    hiramoji=[u"する",u"なる",u"てる",u"れる",u"やる",u"いる",u"さん",u"なん",u"くん",u"それ",u"こと",\
              u"ちゃん",u"ある",u"これ",u"して",u"くれる",u"くださる",u"そう",u"せる",u"した",u"いか",\
              u"ので",u"よう",u"てるん",u"もん",u"られる",u"あそこ",u"あたり",u"あちら",u"あっち",u"あと",\
              u"あな",u"あなた",u"あれ",u"いくつ",u"いつ",u"いま",u"いろいろ",u"うち",u"おおまか",u"おまえ",u"おれ",
              u"がい",u"かく",u"かたちの",u"かやの",u"から",u"がら",u"きた",u"こせ",u"ここ",u"こっち",u"こと",u"ごと",\
              u"こちら",u"これ",u"これら",u"ごろ",u"さまざま",u"さらい",u"しかた",u"しよう",u"すか",u"ずつ",u"すね",\
              u"そう",u"そこ",u"そちら",u"そっち",u"そで",u"それ",u"それぞれ",u"それなり",u"たくさん",u"たち",u"たび",\
              u"ため",u"ちゃ",u"てん",u"とおり",u"とき",u"どこ",u"どこか",u"ところ",u"どちら",u"どれ",u"なか",u"なかば",\
              u"なに",u"など",u"なん",u"はじめ",u"はず",u"はるか",u"ひと",u"ひとつ",u"ふく",u"ぶり",u"べつ",u"へん",u"べん",\
              u"ほう",u"ほか",u"まさ",u"まし",u"まとも",u"まま",u"みたい",u"みつ",u"みなさん",u"みんな",u"もと",u"もの",\
              u"もん",u"やつ",u"よう",u"よそ",u"わけ",u"わたし",u"くる",u"すぎる",u"れる",u"いう",u"くださる",u"ちゃう",\
              u"つく",u"せる",u"てるん",u"すぎ",u"ところ",u"おれ",u"ぼく",u"わたし",u"てる",u"しまう",u"みる",
              ]

    katamoji=[]
    for h in hiramoji:
        katamoji.append(cnvk.convert(h,cnvk.HIRA2KATA,cnvk.Z_KATA))
    han_katamoji=[]
    for k in katamoji:
        han_katamoji.append(mojimoji.zen_to_han(k))

    kanmoji=["笑","今","気","今日","明日","方","人","俺","私","僕","時","思う","行く","言う","見る","出す","年","月","日","分","秒","週","火","水","木","金","土","国","都",\
             "道","府","県","市","区","町","村","各","第","何","的","度","達","誰","者","類","用","別","等","際","系","品","化","所","毎","回","匹","個","席","束","歳","円","毎",\
             "前","後","左","右","次","先","春","夏","秋","冬","下記","上記","時間","今回","前回","場合","自分","ヶ所","ヵ所","カ所","箇所","ヶ月","カ月","箇月","名前","本当","確か","時点",\
             "様々","結局","半ば","以前","以後","以降","未満","以上","以下","毎日","自体","何人","手段","感じ","同じ","点","君"]

    h_kigou=cnvk.H_KIGO
    kigou=[]
    for h in h_kigou:
        for x in h:
            kigou.append(x)
    kigou.append(u"ω")
    kigou.append(u'ー')
    kigou.append(u"д")

    # Reference: Japanese stopwords for keyword extraction suited to inferring content (https://www.jstage.jst.go.jp/article/jjske/12/4/12_511/_pdf)
    kokubu_words=[u"ない",u"高い",u"多い",u"少ない","強い","大きい","小さい","長い","ながい",
                  u"良い",u"よい",u"いい","悪い",
                  u"ある","いる","なる","行く","いく","来る","とる",
                  "見る","みる","言う","いう","得る","過ぎる","すぎる",
                  "する","やる","行なう","行う","おこなう","出来る","できる",
                  "おもう","思う","考える","かんがえる","わかる","見える",
                  "知る","しれる","いえる","示す","述べる","書く","かく","よる",
                  "異なる","違う","ちがう","くらべる",
                  "入れる","出る","でる","入る","はいる",
                  "使う","用いる","もちいる","持つ","もつ","作る","つくる",
                  "なす","起こる","おこる","つく","つける","聞く","よぶ",
                  "かれる","つまり","上","下","次","つぎ",
                  "わが国","自分","人々","人びと","別","他","間","話","例","形","日","家","手","名","身",
                  "そのもの","一つ","あと",

                  # 2016/01/24: added more unevenly distributed words and hiragana that had been left out
                  "きゃ","きゅ","きょ","しゃ","しゅ","しょ","ちゃ","ちゅ","ちょ","にゃ","にゅ","にょ",
                  "ひゃ","ひゅ","ひょ","みゃ","みゅ","みょ","りゃ","りゅ","りょ","ゎ",
                  "事","目","とこ","中","字","お前","全部","きみ","もらう",
                  ]

    for h in hira:
        stopwords.add(h)
    for k in kata:
        stopwords.add(k)
    for h in hankata:
        stopwords.add(h)
    for k in kazu:
        stopwords.add(k)
    for z in zenkazu:
        stopwords.add(z)
    for k in kazukan:
        stopwords.add(k)
    for m in minialpha:
        stopwords.add(m)
    for b in bigalpha:
        stopwords.add(b)
    for h in han_minialpha:
        stopwords.add(h)
    for h in han_bigalpha:
        stopwords.add(h)
    for h in hiramoji:
        stopwords.add(h)
    for k in katamoji:
        stopwords.add(k)
    for h in han_katamoji:
        stopwords.add(h)
    for k in kanmoji:
        stopwords.add(unicode(k))
    for k in kigou:
        stopwords.add(k)
    for k in kokubu_words:
        stopwords.add(unicode(k))
    print "set([",
    for s in sorted(stopwords):
        print "u\"{0}\",".format(s),
    print "])"
Example #39
def z2h(filename):
    with open(filename, "r") as f:
        for line in f:
            print(mojimoji.zen_to_han(line), end="")
# -*- encoding: utf-8 -*-
__author__= "koichi-ezato"
__date__ = "$2014/10/10"

import mojimoji

# encode unicode to utf-8
def unicode_to_utf8(r):
	return r.encode('utf-8')

# convert all full-width characters to half-width
print '----- 全角→半角変換 -----\r\n'
print 'target:アイウａｂｃ０１２\r\n'

zenAll = u'アイウａｂｃ０１２'
r = mojimoji.zen_to_han(zenAll)

print unicode_to_utf8(r)

# convert all full-width characters except full-width kana to half-width
r = mojimoji.zen_to_han(zenAll, kana = False)
print unicode_to_utf8(r)

# convert all full-width characters except full-width digits to half-width
r = mojimoji.zen_to_han(zenAll, digit = False)
print unicode_to_utf8(r)

# convert all full-width characters except full-width ASCII to half-width
r = mojimoji.zen_to_han(zenAll, ascii = False)
print unicode_to_utf8(r)
print '\r\n----- 全角→半角変換 -----\r\n'