Python normalizeの例、neologdn.normalize Pythonの例

コード例 #1

0

ファイルを表示

def main():
    args = get_args()
    vocab = Vocab(args.vocab_path, args.vocab_size)  # create a vocabulary
    hps = get_hps()
    if not args.data_path == "":
        batcher = Batcher(args.data_path, vocab, hps, args.single_pass)
        import pdb
        pdb.set_trace()
        x = batcher.next_batch()
        import pdb
        pdb.set_trace()
        pass
    else:
        with open(args.json_path) as f:
            art = json.load(f)
        article = neologdn.normalize(art['body'])
        abstract = neologdn.normalize(art['title'])
        m = MeCab('-Owakati')
        parsed_article = m.parse(article)
        abs_words = m.parse(abstract).split()
        ex = B.Example(parsed_article, abs_words, vocab, hps)
        b = B.Batch([ex], hps, vocab)
        import pdb
        pdb.set_trace()
        pass

コード例 #2

0

ファイルを表示

def json_batch(fname, hps, vocab):
    with open(fname) as f:
        art = json.load(f)
    article = neologdn.normalize(art['body'])
    abstract = neologdn.normalize(art['title'])
    m = MeCab('-Owakati')
    parsed_article = m.parse(article)
    abs_words = m.parse(abstract).split()
    ex = B.Example(parsed_article, abs_words, vocab, hps)
    b = B.Batch([ex], hps, vocab)
    return b

コード例 #3

0

ファイルを表示

def read_from_csv(path, parse_func):
    lists = []
    with open(path, 'r', encoding='utf-8') as f:
        sources = csv.reader(f, delimiter=',')
        for src, dst in sources:
            src = parse_func(
                neologdn.normalize(
                    src.replace('"', '').replace("～～～", '～')))
            dst = parse_func(
                neologdn.normalize(
                    dst.replace('"', '').replace('～～～', '～')))
            lists += [[' '.join(src),
                       ' '.join(dst)]]
    return lists

コード例 #4

0

ファイルを表示

ファイル: preprocessing.py プロジェクト: kaorisugi/twittter_sentiment

    def main(self):
        tweets_num = 0
        stopwords = self.Stop_Words()
        df_tweet, tweets = self.Load_tweets()
        #ツイートを分かち書きしてcsvに出力(モード'a'はデータ追加、モード'w'は新規作成)
        with open('./output/' + self.out_file, self.mode) as f:
            for i in tweets:
                tweets_num += 1
                i = neologdn.normalize(i)
                i = re.sub('\n', "", i)
                i = re.sub(r'[!-~]', "", i)  #半角記号,数字,英字を削除
                i = re.sub(r'[︰-＠]', "", i)  #全角記号削除
                i = self.format_text(i)  #記号削除
                i = re.sub(
                    r'[【】●ㅅ●Ф☆✩︎♡→←▼①②③④⑤『』ω《》∠∇∩♪∀◞ཀCщ≧≦ ́◤◢■◆★※↑↓〇◯○◎⇒▽◉Θ♫♬〃“”◇ᄉ⊂⊃д°]',
                    "", i)
                #i = re.sub(r'[‥…？！〜「」｢｣:：♪♩『』→↓↑〈〉・゜・´Д´°ω°•ω•★＊☆♡（）✔Θ∀´∀｀˘ω˘‼бωб￣▽￣]', "", i)
                i = self.remove_emoji(i)
                i = self.Tokenizer(i, stopwords)
                i = ' '.join(i)  #リストを文字列に変換
                i = str(i)
                f.write(i)

        with open('./output/' + self.out_file) as f:
            wakati = f.read()

        print('csv出力完了：' + self.out_file)
        print("学習用ツイート数（判定用ツイート含む/短すぎるツイートは削除）：", tweets_num)
        print("[分かち書きサンプル]\n", wakati[:100])
        print()
        return df_tweet, self.similar

コード例 #5

0

ファイルを表示

ファイル: get_cloud.py プロジェクト: futnag/tweetcloud

def normalize_string(text):
    normalized_text = neologdn.normalize(text).lower()
    replaced_text = re.sub("[!?@「」()、。・（）…/_:;\s]", "", normalized_text)
    # replaced_text = re.sub("[!?@「」()、。（）…/_:;\d\s]", "", normalized_text)
    # replaced_text = re.sub("[!?@「」()、。（）…/_:;\d\sa-zA-Z]", "", normalized_text)

    return replaced_text

コード例 #6

0

ファイルを表示

    def most_words(self):
        nsdtweet = self.api.user_timeline(screen_name="@nsd244", count=200)
        #nsdtext = nsdtweet[0].text
        words = []
        print(len(nsdtweet))
        for status in nsdtweet:
            tex = neologdn.normalize(status.text)  # 正規化
            tex = ''.join(c for c in tex
                          if c not in emoji.UNICODE_EMOJI)  # 絵文字の除去
            tex = re.sub(r'https?://[\w/:%#\$&\?\(\)~\.=\+\-]+', '',
                         tex)  # URLの除去
            tex = re.sub(r'(\d)([,.])(\d+)', r'\1\3', tex)  # 桁区切りの除去
            tex = re.sub(r'\d+', '0', tex)  # 数字の置換
            tex = re.sub(r'[!-/:-@[-`{-~]', r' ', tex)  # 半角記号の置換
            tex = re.sub(u'[■-♯]', ' ',
                         tex)  # 全角記号の置換 (ここでは0x25A0 - 0x266Fのブロックのみを除去)

            m = MeCab.Tagger(
                "/usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd")

            for line in m.parse(tex).splitlines()[:-1]:
                surface, feature = line.split('\t')
                if feature.startswith(
                        "名詞"
                ) and ',非自立,' not in feature and surface != "0" and surface != "RT":
                    words.append(surface)

        #print(words)
        counter = Counter(words)
        out = []
        for word, cnt in counter.most_common(10):
            out.append("単語：" + word + ", 出現回数:" + str(cnt) + "\n")

        self.api.update_status(status="@nsd244" + "\n".join(map(str, out)),
                               in_reply_to_status_id=nsdtweet[0].id)

コード例 #7

0

ファイルを表示

ファイル: text_preprocess.py プロジェクト: chendongliang87/JapaneseTokenizers

def normalize_text(input_text,
                   dictionary_mode='ipadic',
                   new_line_replaced='。',
                   is_replace_eos=True,
                   is_kana=True,
                   is_ascii=True,
                   is_digit=True):
    """* What you can do
    - It converts input-text into normalized-text which is good for tokenizer input.

    * Params
    - new_line_replaced: a string which replaces from \n string.
    """
    # type: (text_type,text_type,text_type,bool,bool,bool,bool)->text_type
    if is_replace_eos:
        without_new_line = input_text.replace('\n', new_line_replaced)
    else:
        without_new_line = new_line_replaced

    if dictionary_mode == 'neologd':
        return neologdn.normalize(
            normalize_text_normal_ipadic(without_new_line))
    else:
        return normalize_text_normal_ipadic(without_new_line,
                                            kana=is_kana,
                                            ascii=is_ascii,
                                            digit=is_digit)

コード例 #8

0

ファイルを表示

 def _normalize_hours(self, hours):
     hoursn = hours
     hoursn = hoursn.replace('～', '-')
     hoursn = hoursn.replace('・','･')
     hoursn = neologdn.normalize(hoursn)
     hoursn = hoursn[:100]
     return hoursn

コード例 #9

0

ファイルを表示

ファイル: keitaiso.py プロジェクト: hiraokusky/Jatext-Classification

    def normalize(self, word):
        """
        テキストを正規化する
        """
        # 前後空白を削除
        word = word.strip()
        # 日本語の区切りをシンプルに変換
        word = word.translate(self.tt_seps)
        # 小文字化
        word = word.lower()
        # 漢数字をアラビア数字にする
        word = self.kansuji2arabic(word)
        # NFKC（Normalization Form Compatibility Composition）で
        # 半角カタカナ、全角英数、ローマ数字・丸数字、異体字などなどを正規化。
        word = unicodedata.normalize("NFKC", word)
        # アルファベットやアラビア数字、括弧やエクスクラメーションマークなどの記号は、半角に統一
        # カタカナは、全角に統一
        # "うまーーーい!!" → "うまーい!" など、重ね表現を除去
        # 一方で、"やばっっっ!!" は除去できてません
        # repeat引数に1を渡すと、2文字以上の重ね表現は1文字にできますが、そうすると"Good"は"God"になってしまったりします
        # ”〜”などの記号も除去されてます
        word = neologdn.normalize(word)

        # もろもろ正規化したあとのパターンマッチング変換

        # URLを削除
        word = re.sub(r'https?://[\w/:%#\$&\?\(\)~\.=\+\-]+', '', word)
        # 桁区切りの除去と数字の置換
        word = re.sub(r'(\d)([,.])(\d+)', r'\1\3', word)
        word = re.sub(r'\d+', '0', word)
        # 記号削除
        word = re.sub(r'[\(\)\<\>\[\]\【\】\《\》\≪\≫\/\#\?\・]', '', word)

        return word

コード例 #10

0

ファイルを表示

ファイル: run_server.py プロジェクト: uratoss/chainer_traner_seq2seq

def predict():
    response = {"success": False, "Content-Type": "application/json"}
    if flask.request.method == "POST":
        if flask.request.get_json().get("xs"):
            user_input = flask.request.get_json().get("xs")
            normalized = neologdn.normalize(user_input)
            s = m.parse(normalized).replace('\n', '').strip().split()
            print('xs is ', s)
            xs = []
            for x in s:
                try:
                    xs.append(vocab[x])
                except (KeyError):
                    xs.append(random.uniform(0, len(vocab) - 1))
            xs.append(vocab['<eos>'])
            xs = xp.array(xs).astype(xp.int32)
            dummy = [(xs, xp.zeros(1).astype(xp.int32))]

            with chainer.using_config("train", False), chainer.using_config(
                    "enable_backprop", False):
                ys_list = model(dummy)[0]
                ys = []
                for y in ys_list:
                    if int(y) is vocab["<eos>"]:
                        break
                    ys.append(rvocab[int(y)])

            # classify the input feature
            response["ys"] = ''.join(ys)
            print('ys is ', response["ys"])

            # indicate that the request was a success
            response["success"] = True
    # return the data dictionary as a JSON response
    return flask.jsonify(response)

コード例 #11

0

ファイルを表示

ファイル: dataloader.py プロジェクト: snjstudent/my_dialogue_system

 def _preprocess(self, sentence: str) -> str:
     sentence = sentence.replace('●', '')
     s = sentence
     # 参考元：https://qiita.com/gacky01/items/26cd642731e3eddde60d
     while s.find("（") != -1:
         start_1 = s.find("（")
         if s.find("）") != -1:
             end_1 = s.find("）")
             if start_1 >= end_1:
                 s = s.replace(s[end_1], "")
             else:
                 s = s.replace(s[start_1:end_1 + 1], "")
             if len(s) == 0:
                 continue
         else:
             s = s[0:start_1]
     while s.find("【") != -1:
         start_4 = s.find("【")
         if s.find("】") != -1:
             end_4 = s.find("】")
             s = s.replace(s[start_4:end_4 + 1], "")
         else:
             s = s[0:start_4]
     sentence = s
     return neologdn.normalize(re.sub(r'\d+', '0',
                                      sentence)).replace("\n", "")

コード例 #12

0

ファイルを表示

ファイル: create_wordcloud.py プロジェクト: futnag/create_wordcloud

def normalize_string(text):
    """
    文字列から余計な記号などを取り除く
    """
    normalized_text = neologdn.normalize(text).lower()
    replaced_text = re.sub("[!?@「」()、。・（）…/_:;\s]", "", normalized_text)
    return replaced_text

コード例 #13

0

ファイルを表示

def normalize_text(input_text,
                   dictionary_mode='ipadic',
                   new_line_replaced='。',
                   is_replace_eos=True,
                   is_kana=True,
                   is_ascii=True,
                   is_digit=True):
    """* What you can do
    - It converts input-text into normalized-text which is good for tokenizer input.

    * Params
    - new_line_replaced: a string which replaces from \n string.
    """
    # type: (str,str,str,bool,bool,bool,bool)->str
    if is_replace_eos:
        without_new_line = input_text.replace('\n', new_line_replaced)
    else:
        without_new_line = new_line_replaced

    if dictionary_mode == 'neologd':
        # this code comes from https://github.com/neologd/mecab-ipadic-neologd/wiki/Regexp.ja
        return neologdn.normalize(without_new_line)
    else:
        return normalize_text_normal_ipadic(without_new_line,
                                            kana=is_kana,
                                            ascii=is_ascii,
                                            digit=is_digit)

コード例 #14

0

ファイルを表示

def text_clean(text):
    text = text.replace('\u3000', '')
    text = neologdn.normalize(text, repeat=3)
    text = ''.join(['' if c in emoji.UNICODE_EMOJI else c for c in text])
    text = re.sub(r'(\d)([,.])(\d+)', r'\1\3', text)
    text = re.sub(r'\d+', '0', text)
    text = re.sub(r'[!-/:-@[-`{-~]', r'', text)
    text = re.sub(u'[■-♯]', '', text)
    text = regex.sub(r'^(\p{Nd}+\p{Zs})(.*)$', r'\2', text)
    text = text.strip()
    text = text.replace('“', '')
    text = text.replace('…', '')
    text = text.replace('『', '「')
    text = text.replace('』', '」')
    text = text.replace('《', '「')
    text = text.replace('》', '」')
    text = text.replace('〕', '）')
    text = text.replace('〔', '（')
    text = text.replace('〈', '（')
    text = text.replace('〉', '）')
    text = text.replace('→', '')
    text = text.replace(',', '、')
    text = text.replace('，', '、')
    text = text.replace('．', '。')
    text = text.replace('.', '。')
    text = text.replace(' ', '')
    return text

コード例 #15

0

ファイルを表示

ファイル: text_2_token.py プロジェクト: Kyohei-Prj/Sentence_Vec

def start_preprocess(path):

    with open(path, mode='r', encoding='utf-8') as f:
        text = f.readlines()

    # preprocess text
    text = [neo.normalize(sentence, repeat=2).lower() for sentence in text]
    text = replacer(text, r'<.+>', '')
    text = replacer(text, r'\d+', '')
    text = [sentence.rstrip() for sentence in text]
    pattern = r'[、。「」〈〉『』【】＆＊・（）＄＃＠。、？！｀＋￥％：〔〕“”!"#$%&()*+,-./:;<=>?@^_`{|}~]'
    text = replacer(text, pattern, '')

    # tokenize text
    text_str = ' '.join(text)
    tokens = tokenizer(text_str)
    token_list = tokens.split(' ')

    # remove stopwords
    processed_text = remove_stopwords(token_list)
    processed_text = ' '.join(processed_text)

    # save to txt
    save_path = path.replace('text', 'processed_text')
    with open(save_path, mode='w') as f:
        f.write(processed_text)

コード例 #16

0

ファイルを表示

def normalize_text(input_text,
                   dictionary_mode='ipadic',
                   new_line_replaced='。',
                   is_replace_eos=True,
                   is_kana=True,
                   is_ascii=True,
                   is_digit=True):
    """* What you can do
    - It converts input-text into normalized-text which is good for tokenizer input.

    * Params
    - new_line_replaced: a string which replaces from \n string.
    """
    # type: (text_type,text_type,text_type,bool,bool,bool,bool)->text_type
    if is_replace_eos:
        without_new_line = input_text.replace('\n', new_line_replaced)
    else:
        without_new_line = new_line_replaced

    if dictionary_mode=='neologd' and is_neologdn_valid:
        return neologdn.normalize(normalize_text_normal_ipadic(without_new_line))
    elif dictionary_mode=='neologd' and is_neologdn_valid == False:
        raise Exception("You could not call neologd dictionary bacause you do NOT install the package neologdn.")
    else:
        return normalize_text_normal_ipadic(without_new_line, kana=is_kana, ascii=is_ascii, digit=is_digit)

コード例 #17

0

ファイルを表示

def split_into_words(text, tokenizer):
    # tokens = tokenizer.tokenize(text)
    normalized_text = neologdn.normalize(text)
    normalized_text = re.sub(r'[!-/:-@[-`{-~]', r' ', normalized_text)
    tokens = [token for token in tokenizer.analyze(normalized_text)]

    ret = []
    for idx in range(len(tokens)):
        token = tokens[idx]
        if idx + 1 == len(tokens):
            if parts[0] == '名詞' and parts[1] != '接尾' and parts[1] != '副詞可能':
                ret.append(token.base_form)
            elif parts[0] == '名詞':
                continue
            else:
                ret.append(token.base_form)
            break
        post_token = tokens[idx + 1]
        parts = token.part_of_speech.split(',')
        post_parts = post_token.part_of_speech.split(',')
        if parts[0] == '名詞':
            if parts[1] == '一般' and post_parts[0] == '名詞' and post_parts[
                    1] == '接尾':
                ret.append(token.base_form + post_token.base_form)
            elif parts[1] == '一般':
                ret.append(token.base_form)
            elif parts[1] == '接尾':
                continue
            elif parts[1] == '副詞可能':
                continue
            else:
                ret.append(token.base_form)
        else:
            ret.append(token.base_form)
    return ret

コード例 #18

0

ファイルを表示

def searchTeacher(text, bool):
    text = neologdn.normalize(text)  #正規化
    text = text.split("，")[0] if "，" in text else text
    text = text.replace(" ", "")
    text = text.replace("C1", "")

    sql_lecture = f"select * from lecture_assessments where subject LIKE '%{text}%'"
    lecture_info = get_dict_resultset(sql_lecture)  #検索結果が入っている.

    teacher_info_list = []
    if bool:
        #[{'subject': '実践機械学習', 'teacher': '篠原歩', 'difficulty': '仏', 'worth': '', 'comment': 'Pythonに関する授業',
        # 'test': '', 'report': '', 'attendance': ''},
        # {'subject': '実践機械学習', 'teacher': '篠原歩', 'difficulty': '仏', 'worth': '', 'comment': '機械学習に興味があるけどよく知らないという人にはよさそう',
        # 'test': '', 'report': 'あり', 'attendance': 'あり'}]　こういう辞書のリストをつくる
        teacher_info_list = [{
            key: value
            for key, value in zip(keys, _lecture_info)
        } for _lecture_info in lecture_info]
    else:
        if lecture_info:
            teacher_info_list = list(
                set([_lecture_info[1] for _lecture_info in lecture_info
                     ]))  #教授名だけのリスト. 一度setにしてからlistに戻すことで,重複している要素を除いている.
            if teacher_info_list[0] == "":  #先頭が空であることが多いので,それを除去.
                teacher_info_list = teacher_info_list[1:]

    return teacher_info_list

コード例 #19

0

ファイルを表示

ファイル: Morphological_analysis.py プロジェクト: naikasann/Naivebayes_used_sklearn

    def text_to_ward(self, text):
        m = MeCab.Tagger("-d C:\mecab-ipadic-neologd")
        m.parse(" ")

        buff = neologdn.normalize(text)
        m_text = m.parse(buff)
        basic_word = []

        m_text = m_text.split("\n")
        for row in m_text:
            word = row.split("\t")[0]
            if word == "EOS":
                break
            else:
                pos = row.split("\t")[1].split(",")
                parts = pos[0]
                if "記号" in parts:
                    if word != "。":
                        continue
                    basic_word.append(word)
                elif "助" in parts:
                    pass
                elif "形容詞" in parts or "動詞" in parts:
                    basic_word.append(pos[6])
                    pass
                elif "名詞" in parts or "副詞" in parts:
                    basic_word.append(word)
                    pass

        result_word = " ".join(basic_word)
        return result_word

コード例 #20

0

ファイルを表示

def web_rand(url="",fields={}):
    https = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',ca_certs=certifi.where(),
    headers={"User-Agent":"Janome_doe"})
    try:html=https.request('POST',str(url).split("?")[0]+"?"+parse.quote(str(url).split("?")[1],safe="=&-"))
    except: print("err");return "ERROR:invalid endpoint"
    html=html.data.decode('utf-8').translate(str.maketrans("","","\"\'\\/<>%`?;"))#Not_secure_filename!
    return neologdn.normalize(html).translate(str.maketrans("","","_:| ～-#"))

コード例 #21

0

ファイルを表示

def normalize_text(text: str) -> str:
    """テキストを正規化する.

    :param text: 正規化対象テキスト
    :return: 正規化後テキスト
    """
    normalized_text = neologdn.normalize(text)
    return str(_normalize_circle_char(string=normalized_text))

コード例 #22

0

ファイルを表示

ファイル: calc_user_vec.py プロジェクト: hcpmiyuki/odegather-api-

 def cleansing_text(self, text):
     text = self.cleansing_space(text)
     text = self.cleansing_url(text)
     text = self.cleansing_emoji(text)
     text = self.cleansing_unity(text)
     text = self.cleansing_num(text)
     text = neologdn.normalize(text)
     return text

コード例 #23

0

ファイルを表示

def normalize(text: str) -> str:
    """
    テキストの正規化
    """
    text = text.replace("\n", " ").strip()  # 改行を除去して1行の長いテキストとみなす
    text = re.sub(r'(\d)([,.])(\d+)', r'\1\3', text)  # 数字の桁区切りの除去
    text = re.sub(r'\d+', '0', text)  # 数字をすべて0に統一
    text = neologdn.normalize(text)  # 全角・半角の統一と重ね表現の除去
    return text

コード例 #24

0

ファイルを表示

    def preprocessing(self, text):
        text = re.sub(r'\n', '', text)
        text = re.sub(r'\r', '', text)
        text = mojimoji.han_to_zen(text, digit=False, ascii=False)
        text = mojimoji.zen_to_han(text, kana=True)
        text = ''.join(c for c in text if c not in emoji.UNICODE_EMOJI)
        text = neologdn.normalize(text)        

        return text

コード例 #25

0

ファイルを表示

def normalize(text):
    text_without_account = re.sub(r'@[a-zA-Z0-9_]+', '', text)  # remove twitter_account
    text_without_url = re.sub(r'https?://[\w/;:%#\$&\?\(\)~\.=\+\-]+', '', text_without_account)  # remove URL
    text_normalized = neologdn.normalize(text_without_url).replace('&lt;', '<').replace('&gt;', '>').replace('&amp;', '&')
    text_without_emoji = ''.join(['' if c in emoji.UNICODE_EMOJI else c for c in text_normalized])
    #tmp = re.sub(r'(\d)([,.])(\d+)', r'\1\3', text_without_emoji)
    #text_replaced_number = re.sub(r'\d+', '0', tmp)
    text_replaced_indention = ' '.join(text_without_emoji.splitlines())
    return text_replaced_indention.lower()

コード例 #26

0

ファイルを表示

ファイル: oov.py プロジェクト: yagays/oov_magnitude_ja

    def query(self, word):
        normalized_word = neologdn.normalize(word)

        if word in self.vocab:
            return self.w2v[word]
        elif normalized_word in self.vocab:
            return self.w2v[normalized_word]
        else:
            return self.out_of_vocab_vector(normalized_word)

コード例 #27

0

ファイルを表示

def preprocessing(text: str) -> str:
    # メンション除去
    text = re.sub(r'@[a-zA-Z0-9_]+', '', text)
    # リンク除去
    text = re.sub(r'https?://[\w/:%#\$&\?\(\)~\.=\+\-]+', '', text)
    # 絵文字除去
    text = ''.join(['' if c in emoji.UNICODE_EMOJI["en"] else c for c in text])
    # いい感じな正規化
    text = neologdn.normalize(text)
    return text

コード例 #28

0

ファイルを表示

def extractWords(text):
    text = removeEmoji(text)
    text = neologdn.normalize(text)
    words = []
    analyzedResults = tagger.parse(text).split("\n")
    for result in analyzedResults:
        splittedWord = result.split(",")[0].split("\t")[0]
        if not splittedWord in stopWords:
            words.append(splittedWord)
    return words

コード例 #29

0

ファイルを表示

ファイル: basic_MeCab.py プロジェクト: mf1611/scripts

def extract_noun(text: str):
    norm_text = neologdn.normalize(text)
    parsed = parse_text(norm_text)
    noun_df = parsed[
        parsed.type.str.startswith('名詞-一般') | 
        parsed['type'].str.startswith('名詞-固有名詞') |
        parsed.type.str.startswith('名詞-サ変接続') |
        parsed.type.str.startswith('名詞-形容動詞語幹')
    ]
    return ' '.join(noun_df.orig.tolist())

コード例 #30

0

ファイルを表示

ファイル: mecab_wrapper_python2.py プロジェクト: yusukefs/JapaneseTokenizers

    def tokenize(self, sentence,
                 normalized=True,
                 is_feature=False,
                 is_surface=False,
                 return_list=False,
                 func_normalizer=normalize_text):
        """* What you can do
        - Call mecab tokenizer, and return tokenized objects

        """
        # type: (str, bool, bool, bool, bool, Callable[[str], str])->Union[List[str], TokenizedSenetence]
        ### decide normalization function depending on dictType
        if func_normalizer is None and self._dictType == 'neologd':
            normalized_sentence = neologdn.normalize(sentence)
        elif func_normalizer == normalize_text:
            normalized_sentence = normalize_text(sentence, dictionary_mode=self._dictType)
        elif func_normalizer is None:
            normalized_sentence = sentence
        else:
            normalized_sentence = func_normalizer(sentence)

        assert isinstance(sentence, string_types)
        tokenized_objects = []

        # don't delete this variable. encoded_text protects sentence from deleting
        encoded_text = normalized_sentence.encode('utf-8')

        node = self.mecabObj.parseToNode(encoded_text)
        node = node.next
        while node.next is not None:

            word_surface = node.surface.decode('utf-8')

            tuple_pos, word_stem = self.__feature_parser(node.feature.decode('utf-8'), word_surface)

            tokenized_obj = TokenizedResult(
                node_obj=node,
                tuple_pos=tuple_pos,
                word_stem=word_stem,
                word_surface=word_surface,
                is_feature=is_feature,
                is_surface=is_surface
            )
            tokenized_objects.append(tokenized_obj)
            node = node.next

        tokenized_sentence = TokenizedSenetence(
            sentence=sentence,
            tokenized_objects=tokenized_objects
        )

        if return_list:
            return tokenized_sentence.convert_list_object()
        else:
            return tokenized_sentence

コード例 #31

0

ファイルを表示

 def test_normalize_lengthened(self):
     self.assertEqual(normalize("うまああああああああああああい", repeat=7), "うまあああああああい")
     self.assertEqual(normalize("かわいいいいいるい", repeat=6), "かわいいいいいるい")

コード例 #32

0

ファイルを表示

ファイル: test_neologdn.py プロジェクト: pombredanne/neologdn

 def test_normalize(self):
     self.assertEqual(normalize("０"), "0")
     self.assertEqual(normalize("ﾊﾝｶｸ"), "ハンカク")
     self.assertEqual(normalize("o₋o"), "o-o")
     self.assertEqual(normalize("majika━"), "majikaー")
     self.assertEqual(normalize("わ〰い"), "わい")
     self.assertEqual(normalize("スーパーーーー"), "スーパー")
     self.assertEqual(normalize("!#"), "!#")
     self.assertEqual(normalize("ゼンカク　スペース"), "ゼンカクスペース")
     self.assertEqual(normalize("お             お"), "おお")
     self.assertEqual(normalize("      おお"), "おお")
     self.assertEqual(normalize("おお      "), "おお")
     self.assertEqual(normalize("検索 エンジン 自作 入門 を 買い ました!!!"),\
                      "検索エンジン自作入門を買いました!!!")
     self.assertEqual(normalize("アルゴリズム C"), "アルゴリズムC")
     self.assertEqual(normalize("　　　ＰＲＭＬ　　副　読　本　　　"), "PRML副読本")
     self.assertEqual(normalize("Coding the Matrix"), "Coding the Matrix")
     self.assertEqual(normalize("南アルプスの　天然水　Ｓｐａｒｋｉｎｇ　Ｌｅｍｏｎ　レモン一絞り"),\
                      "南アルプスの天然水Sparking Lemonレモン一絞り")
     self.assertEqual(normalize("南アルプスの　天然水-　Ｓｐａｒｋｉｎｇ*　Ｌｅｍｏｎ+　レモン一絞り"),\
                      "南アルプスの天然水- Sparking*Lemon+レモン一絞り")
     self.assertEqual(normalize(u'ﾊﾟﾊﾟ'), u"パパ")
     self.assertEqual(normalize(u'a˗֊‐‑‒–⁃⁻₋−'), "a-")
     self.assertEqual(normalize(u'あ﹣－ｰ—―─━ー'), u"あー")
     self.assertEqual(normalize(u'チルダ~∼∾〜〰～'), u"チルダ")

コード例 #33

0

ファイルを表示

ファイル: test_neologdn.py プロジェクト: ikegami-yukino/neologdn

 def test_suppress_removal_of_spaces_between_Japanese(self):
     self.assertEqual(normalize('巴 マミ', remove_space=False), '巴 マミ')

コード例 #34

0

ファイルを表示

ファイル: test_neologdn.py プロジェクト: ikegami-yukino/neologdn

 def test_normalize(self):
     self.assertEqual(normalize('０'), '0')
     self.assertEqual(normalize('ﾊﾝｶｸ'), 'ハンカク')
     self.assertEqual(normalize('o₋o'), 'o-o')
     self.assertEqual(normalize('majika━'), 'majikaー')
     self.assertEqual(normalize('わ〰い'), 'わい')
     self.assertEqual(normalize('スーパーーーー'), 'スーパー')
     self.assertEqual(normalize('!#'), '!#')
     self.assertEqual(normalize('ゼンカク　スペース'), 'ゼンカクスペース')
     self.assertEqual(normalize('お             お'), 'おお')
     self.assertEqual(normalize('      おお'), 'おお')
     self.assertEqual(normalize('おお      '), 'おお')
     self.assertEqual(normalize('検索 エンジン 自作 入門 を 買い ました!!!'),\
                      '検索エンジン自作入門を買いました!!!')
     self.assertEqual(normalize('アルゴリズム C'), 'アルゴリズムC')
     self.assertEqual(normalize('　　　ＰＲＭＬ　　副　読　本　　　'), 'PRML副読本')
     self.assertEqual(normalize('Coding the Matrix'), 'Coding the Matrix')
     self.assertEqual(normalize('南アルプスの　天然水　Ｓｐａｒｋｉｎｇ　Ｌｅｍｏｎ　レモン一絞り'),\
                      '南アルプスの天然水Sparking Lemonレモン一絞り')
     self.assertEqual(normalize('南アルプスの　天然水-　Ｓｐａｒｋｉｎｇ*　Ｌｅｍｏｎ+　レモン一絞り'),\
                      '南アルプスの天然水- Sparking*Lemon+レモン一絞り')
     self.assertEqual(normalize('ﾊﾟﾊﾟ'), 'パパ')
     self.assertEqual(normalize('a˗֊‐‑‒–⁃⁻₋−'), 'a-')
     self.assertEqual(normalize('あ﹣－ｰ—―─━ー'), 'あー')
     self.assertEqual(normalize('チルダ~∼∾〜〰～'), 'チルダ')
     self.assertEqual(normalize('う゛ほﾟ'), 'ゔぽ')