Example 1
def run(input, output):
    data = input.read()
    result = []
    for line in data.split('\n'):
        cols = line.split('\t')
        if len(cols) == 5 and cols[0].isdigit():
            # Columns: municipality code, prefecture name (kanji), city name (kanji), prefecture name (kana), city name (kana)
            result.append({
                'code': unicodedata.normalize('NFKC', cols[0]).strip(),
                'pref': unicodedata.normalize('NFKC', cols[1]).strip(),
                'city': unicodedata.normalize('NFKC', cols[2]).strip(),
                'pref_k': jaconv.h2z(cols[3]).strip(),
                'city_k': jaconv.h2z(cols[4]).strip(),
                'pref_h': jaconv.kata2hira(jaconv.h2z(cols[3])).strip(),
                'city_h': jaconv.kata2hira(jaconv.h2z(cols[4])).strip()
            })

    output.write(
        json.dumps(
            {
                'title': 'jp_citycode',
                'version': DATA_VERSION,
                'table': result
            },
            ensure_ascii=False).encode("utf-8"))
    click.echo('%d件処理しました' % len(result))  # "Processed %d records"
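A hedged usage sketch for this example: run expects a text-mode handle for the TSV input and a binary-mode handle for the JSON output. The filenames below are invented, and unicodedata, jaconv, json, click, and DATA_VERSION are assumed to be imported or defined in the surrounding module.

# Hypothetical usage; filenames invented, DATA_VERSION assumed defined above.
with open('citycode.tsv', encoding='utf-8') as fin, \
        open('citycode.json', 'wb') as fout:
    run(fin, fout)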
Example 2
def run(self):
    data = self.load()
    jumanpp = Juman()
    output = []
    for _, row in data.iterrows():
        zenkaku = jaconv.h2z(row["sentence"], ascii=True, digit=True)
        splited = [
            mrph.midasi for mrph in jumanpp.analysis(zenkaku).mrph_list()
        ]
        if self.task_name == 'QA_B':
            qa_zenkaku = jaconv.h2z(
                f"{row['target']}の{row['aspect']}は{row['sentiment']}",
                ascii=True,
                digit=True,
            )
        else:
            qa_zenkaku = " "
        qa_splited = [
            mrph.midasi
            for mrph in jumanpp.analysis(qa_zenkaku).mrph_list()
        ]
        output.append({
            "context": " ".join(splited),
            "qa": " ".join(qa_splited),
            "label": 1
        })
    self.dump(pd.DataFrame(output))
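The h2z call here widens half-width ASCII and digits because Juman++ expects full-width input. A minimal sketch of that step alone (the example string is invented):

import jaconv

# Half-width digits and letters become full-width before morphological analysis.
jaconv.h2z('価格は1000円です', ascii=True, digit=True)
# -> '価格は１０００円です'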
Example 3
def encode_plus(mecab,
                tokenizer,
                text_question,
                text_ending,
                text_context,
                max_length=512):
    #Question
    text_question = jaconv.h2z(text_question,
                               kana=True,
                               digit=True,
                               ascii=True)
    tokens_question = tokenize_with_mecab(mecab, text_question)

    #Ending
    text_ending = jaconv.h2z(text_ending, kana=True, digit=True, ascii=True)
    tokens_ending = tokenize_with_mecab(mecab, text_ending)

    #Context
    text_context = jaconv.h2z(text_context, kana=True, digit=True, ascii=True)
    tokens_context = tokenize_with_mecab(mecab, text_context)

    tokens_a = ["[CLS]"] + tokens_question + ["[SEP]"
                                              ] + tokens_ending + ["[SEP]"]
    tokens_b = tokens_context + ["[SEP]"]

    input_ids_a = tokenizer.convert_tokens_to_ids(tokens_a)
    input_ids_b = tokenizer.convert_tokens_to_ids(tokens_b)

    len_a = len(input_ids_a)
    len_b = len(input_ids_b)

    if len_a + len_b > max_length:
        input_ids_b = input_ids_b[:max_length - len_a]
        input_ids_b[max_length - len_a - 1] = 3  #[SEP]
    elif len_a + len_b < max_length:
        padding_length = max_length - (len_a + len_b)
        input_ids_b = input_ids_b + [0 for i in range(padding_length)]

    #Input IDs
    input_ids = input_ids_a + input_ids_b
    input_ids = torch.tensor(input_ids)

    #Attention mask
    attention_mask = torch.ones(max_length, dtype=torch.long)
    for i in range(len_a + len_b, max_length):
        attention_mask[i] = 0

    #Token type IDs
    token_type_ids = torch.ones(max_length, dtype=torch.long)
    for i in range(len_a):
        token_type_ids[i] = 0

    encoding = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "token_type_ids": token_type_ids
    }

    return encoding
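The tokenize_with_mecab helper is not shown in this example. A minimal sketch of what it presumably does, using the standard MeCab binding (an assumption, not the original helper):

import MeCab

def tokenize_with_mecab(mecab, text):
    # Collect surface forms from a MeCab parse; BOS/EOS nodes have an
    # empty surface and are skipped.
    tokens = []
    node = mecab.parseToNode(text)
    while node:
        if node.surface:
            tokens.append(node.surface)
        node = node.next
    return tokens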
Example 4
def handleM4A(path):
    # ./GENRE/Compilations/ARTIST/ALBUM/SONG.m4a
    temp = path.replace("\\", "/")
    # ------------------------------------------------------------------
    song = temp[temp.rfind("/") + 1:]
    song = jaconv.z2h(song, kana=False, digit=True, ascii=True)
    song = jaconv.h2z(song, kana=True, digit=False, ascii=False)
    temp = temp[:temp.rfind("/")]
    # ------------------------------------------------------------------
    album = temp[temp.rfind("/") + 1:]
    album = jaconv.z2h(album, kana=False, digit=True, ascii=True)
    album = jaconv.h2z(album, kana=True, digit=False, ascii=False)
    temp = temp[:temp.rfind("/")]
    # ------------------------------------------------------------------
    artist = temp[temp.rfind("/") + 1:]
    artist = jaconv.z2h(artist, kana=False, digit=True, ascii=True)
    artist = jaconv.h2z(artist, kana=True, digit=False, ascii=False)
    temp = temp[:temp.rfind("/")]
    # ------------------------------------------------------------------
    genre = temp[temp.rfind("/") + 1:]
    genre = jaconv.z2h(genre, kana=False, digit=True, ascii=True)
    genre = jaconv.h2z(genre, kana=True, digit=False, ascii=False)
    temp = temp[:temp.rfind("/")]
    # ------------------------------------------------------------------
    # take artist as Compilations
    category = temp[temp.rfind("/") + 1:]
    temp = temp[:temp.rfind("/")]
    if category == "__02_Compilations__":
        artist = "__Compilations__"
    elif category == "__01_Favorites__":
        pass
    # ------------------------------------------------------------------
    mp4 = MP4(path)
    # ------------------------------------------------------------------
    MyLogger.info(path)
    mp4.tags[TRACK_TITLE] = song
    mp4.tags[ALBUM] = album
    mp4.tags[ALBUM_ARTIST] = artist
    mp4.tags[ALBUM_SORT_ORDER] = conv.do(album)
    mp4.tags[ARTIST] = artist
    mp4.tags[ARTIST_SORT_ORDER] = conv.do(artist)
    mp4.tags[GENRE] = genre
    MyLogger.info("mp4.tags[TRACK_TITLE]", str(mp4.tags[TRACK_TITLE]))
    MyLogger.info("mp4.tags[ALBUM]", str(mp4.tags[ALBUM]))
    MyLogger.info("mp4.tags[ALBUM_ARTIST]", str(mp4.tags[ALBUM_ARTIST]))
    MyLogger.info("mp4.tags[ALBUM_SORT_ORDER]",
                  str(mp4.tags[ALBUM_SORT_ORDER]))
    MyLogger.info("mp4.tags[ARTIST]", str(mp4.tags[ARTIST]))
    MyLogger.info("mp4.tags[ARTIST_SORT_ORDER]",
                  str(mp4.tags[ARTIST_SORT_ORDER]))
    MyLogger.info("mp4.tags[GENRE]", str(mp4.tags[GENRE]))
Example 5
def genEnSpeech(text, gender):
    kakasi = pykakasi.kakasi()

    kakasi.setMode("H", "a")
    kakasi.setMode("K", "a")
    kakasi.setMode("J", "a")
    kakasi.setMode("r", "Hepburn")

    conv = kakasi.getConverter()

    t2s_client = texttospeech.TextToSpeechClient()

    synthesis_input = texttospeech.types.SynthesisInput(
        text=conv.do(h2z(text)))

    ssml_gender = texttospeech.enums.SsmlVoiceGender.FEMALE
    if gender == 'male':
        ssml_gender = texttospeech.enums.SsmlVoiceGender.MALE

    voice = texttospeech.types.VoiceSelectionParams(language_code='en-US',
                                                    ssml_gender=ssml_gender)

    audio_config = texttospeech.types.AudioConfig(
        audio_encoding=texttospeech.enums.AudioEncoding.MP3)

    response = t2s_client.synthesize_speech(synthesis_input, voice,
                                            audio_config)

    return response.audio_content
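A hedged usage sketch (requires Google Cloud credentials; the filename and input string are invented):

# Synthesize speech for the romanized reading of a Japanese string.
audio = genEnSpeech('こんにちは、世界', 'female')
with open('greeting.mp3', 'wb') as f:
    f.write(audio)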
Example 6
def g2p(input_yomi):
    # Convert everything to full-width katakana
    input_yomi = jaconv.h2z(input_yomi)
    input_yomi = jaconv.hira2kata(input_yomi)

    output_yomi = []

    for i, item in enumerate(input_yomi):

        # A prolonged sound mark at the start is not read, so skip it
        if i == 0 and (item == "ー" or item == "〜"):
            pass

        # Not at the end of the string: check whether the next character is a small kana (sutegana)
        elif i < len(input_yomi)-1:
            if input_yomi[i+1] in sutegana:
                youon = item+input_yomi[i+1]
                # Output the phoneme for the contracted sound (youon)
                if youon in g2p_list:
                    output_yomi.append(g2p_list[youon])
                # Not a youon: output the phonemes of each kana separately
                else:
                    output_yomi += nonyouon_before_st(input_yomi, i)
                    output_yomi += nonyouon_before_st(input_yomi, i+1)
            else:
                output_yomi += nonyouon(input_yomi, i, item)
        # Last character
        else:
            output_yomi += nonyouon(input_yomi, i, item)

    output_str = " ".join(output_yomi)
    output_yomi = output_str.split()
    # Return the phonemes
    return output_yomi
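The helpers sutegana, g2p_list, nonyouon, and nonyouon_before_st are defined elsewhere. A minimal sketch of the data the lookups imply; the contents are abbreviated and invented for illustration:

# Small kana that form contracted sounds (youon) with the preceding kana.
sutegana = set('ァィゥェォャュョヮ')
# Mapping from a two-character youon to its phonemes (invented excerpt).
g2p_list = {'キャ': 'ky a', 'シュ': 'sh u', 'チョ': 'ch o'}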
Example 7
def str_cleanUp(st):
    st = st.replace(" ", "")
    st = st.replace("・", "")
    st = st.replace("&", "アンド")
    st = jaconv.h2z(st, kana=True)
    st = jaconv.hira2kata(st)
    return st
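For instance, spaces and middle dots are dropped, '&' becomes 'アンド', half-width kana are widened, and hiragana is converted to katakana (the example string is invented):

str_cleanUp('ぎん & ﾊﾟｰﾙ')  # -> 'ギンアンドパール'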
Example 8
def re_cellstr(str):
    # Strip whitespace around the string.
    str = str.strip()
    # Replace in-cell line breaks with '▽'.
    str = str.replace('\n', '▽')
    # Convert half-width katakana to full-width.
    str = jaconv.h2z(str)

    # Convert full-width ASCII to half-width (full-width spaces become ASCII
    # spaces too), after protecting characters that must stay full-width.
    # '〜' (kara)
    str = re.sub('[〜~]', '〓から〓', str)
    # Full-width round and square brackets
    str = re.sub('(（)(.+?)(）)', r'〓Rカッコ〓\2〓Rカッコ〓', str)
    str = re.sub('(［)(.+?)(］)', r'〓Bカッコ〓\2〓Bカッコ〓', str)
    str = jaconv.z2h(str, kana=False, ascii=True, digit=False)
    # Restore '〜'
    str = str.replace('〓から〓', '〜')
    # Restore the full-width brackets
    str = re.sub('(〓Rカッコ〓)(.+?)(〓Rカッコ〓)', r'（\2）', str)
    str = re.sub('(〓Bカッコ〓)(.+?)(〓Bカッコ〓)', r'［\2］', str)

    # Collapse runs of whitespace into a single space.
    str = re.sub(r"\s+", " ", str)
    # Replace commas so the row does not gain an extra CSV column.
    str = re.sub(",", "/", str)
    return str
Example 9
def search(text):
    RequestURL = 'http://usjinfo.com/wait/realtime.php'
    html = urllib.request.urlopen(RequestURL).read()
    # att stays False when no search keyword follows '#USJ'
    att = False
    if text.find('#USJ') == 0 and len(text) > 5:
        att = text[4:].split()
    soup = BeautifulSoup(html, "html.parser")
    ReturnText = []
    attractions = []
    item = {}
    li = soup.find_all('ul', class_='list')[1].find_all('li')
    for tag in li:
        if tag.text.find('現在')  != -1:
            pass
        elif tag.text.find('分') != -1 or tag.text.find('休止') != -1 or tag.text.find('終了') != -1:
            item.update({'info': tag.text.replace(' ','').replace('\n', '')})
            attractions.append(item)
        else:
            item = {'title': jaconv.h2z(tag.text.replace(' ','').replace('\n', ''))}
    for se in attractions:
        if att is not False:
            if se['title'].find(att[0]) != -1:
                ReturnText.append(se['title'] + ': ' + se['info'])
        elif att is False:
            ReturnText.append(se['title'] + ': ' + se['info'])
    if not ReturnText:
        ReturnText.append('見つかりませんでした。検索キーワードを見直すか、管理者に問い合わせてください。')  # "Nothing found. Revise the search keyword or contact the administrator."

    return '\n'.join(ReturnText)
Example 10
def _analyze_pas(self) -> None:
    """Extract predicate-argument structures from <述語項構造:> tags in the KNP string."""
    sid2idx = {
        sid: idx
        for idx, sid in enumerate(self.sid2sentence.keys())
    }
    for tag in self.tag_list():
        if tag.pas is None:
            continue
        pas = Pas(
            BasePhrase(tag, self.tag2dtid[tag], tag.pas.sid,
                       self.mrph2dmid), self.mrph2dmid)
        for case, arguments in tag.pas.arguments.items():
            if self.relax_cases:
                if case in ALL_CASES and case.endswith('≒'):
                    case = case.rstrip('≒')  # ガ≒ -> ガ
            for arg in arguments:
                arg.midasi = jaconv.h2z(arg.midasi,
                                        digit=True)  # 不特定:人1 -> 不特定:人１
                # exophor
                if arg.flag == 'E':
                    entity = self._create_entity(exophor=arg.midasi,
                                                 eid=arg.eid)
                    pas.add_special_argument(case, arg.midasi, entity.eid,
                                             '')
                else:
                    sid = self.sentences[sid2idx[arg.sid] - arg.sdist].sid
                    arg_bp = self._get_bp(sid, arg.tid)
                    mention = self._create_mention(arg_bp)
                    pas.add_argument(case, mention, '', self.mrph2dmid)
        if pas.arguments:
            self._pas[pas.dtid] = pas
Example 11
def test_h2z():
    assert_equal(jaconv.h2z('ﾃｨﾛﾌｨﾅｰﾚ'), 'ティロフィナーレ')
    assert_equal(jaconv.h2z('ﾃｨﾛﾌｨﾅｰﾚ', ignore='ｨ'), 'テｨロフｨナーレ')
    _compare(jaconv.h2z, HALF_KANA, FULL_KANA)
    _compare(partial(jaconv.h2z, ascii=True), HALF_ASCII, FULL_ASCII)
    _compare(partial(jaconv.h2z, digit=True), HALF_DIGIT, FULL_DIGIT)

    for ascii in (True, False):
        for digit in (True, False):
            for kana in (True, False):
                assert_equal(
                    jaconv.h2z(_concat(HALF_KANA if kana else FULL_KANA,
                                        HALF_ASCII if ascii else FULL_ASCII,
                                        HALF_DIGIT if digit else FULL_DIGIT),
                                ascii=ascii, digit=digit, kana=kana),
                    _concat(FULL_KANA, FULL_ASCII, FULL_DIGIT))
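The _compare and _concat helpers come from jaconv's test suite and are not shown here; minimal sketches consistent with how they are called (assumed, not copied from jaconv):

def _compare(convert, src, dst):
    # Each character of src should convert to the matching character of dst.
    for half, full in zip(src, dst):
        assert_equal(convert(half), full)

def _concat(*strings):
    return ''.join(strings)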
Example 12
    def template_specific(self, doc, tag, txt):
        with tag("div",
                 style=css_position(self.template["nickname"]),
                 klass="text"):
            with tag("p"):
                doc.asis(
                    jaconv.h2z(self.author[:4].upper(), ascii=True,
                               digit=True))

        with tag("div",
                 style=css_position(self.template["stats"]),
                 klass="text"):
            with tag("p"):
                doc.asis("""
                    <span style="clear: both; float: left">レベル</span><span style="float: right">%s</span>
                    <span style="clear: both; float: left">HP</span><span style="float: right">%s</span>
                    <span style="clear: both; float: left">MP</span><span style="float: right">%s</span>
                    <span style="clear: both; float: left">G</span><span style="float: right">%s</span>
                    <span style="clear: both; float: left">E</span><span style="float: right">%s</span>
                """.strip() % (
                    random_wide(1, 99),
                    random_wide(1, 999),
                    random_wide(1, 999),
                    random_wide(1, 9999),
                    random_wide(1, 9999),
                ))
Example 14
def make_table(etl_filename, csv_filename):
    i = 0
    etl_filename = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                etl_filename)
    with open(etl_filename, 'rb') as f:
        while True:
            # reading one image from 'sys.argv[1]' like a 'ETL9G_01'
            s = f.read(RECORD_SIZE)
            if s is None or len(s) < RECORD_SIZE:
                break
            # unpacking from the binary file; refer to 'https://techacademy.jp/magazine/19058'
            # refer to 'http://etlcdb.db.aist.go.jp/specification-of-etl-8'
            r = struct.unpack(">H2sHBBBBBBIHHHHBBBBHH2016s4x", s)
            img = Image.frombytes('F', (64, 63), r[20], 'bit', (4, 0))
            img = np.array(img.convert('L'))  # 0..15
            #print(r[0:5], type(r[3].to_bytes(1,"big")))
            #print(r[3].to_bytes(1,"big"))
            #print(r[3].to_bytes(1,"big").decode('shift_jis'))
            lbl = r[3].to_bytes(1, "big").decode('shift_jis')  # character code
            lbl = jaconv.h2z(lbl, digit=True, ascii=True)

            #lbl = r[3].to_bytes(1,"big").decode('utf-8') # character code
            #dirname = bytes.fromhex(lbl).decode('jis_x_0201')
            #input()
            # write index and char to 'table.csv'

            with open(csv_filename, mode='a', newline='',
                      encoding='utf-8') as csvf:
                writer = csv.writer(csvf)
                writer.writerow([str(r[0] - 1), str(lbl), "ok"])
            i += 1
Example 15
def insert_str(IN_STR, STR_NUM):
	#print(IN_STR)
	IN_STR_ARR = IN_STR.split('★')
	outstr_arr = []
	for ELEM_STR in IN_STR_ARR:
		if ELEM_STR == "":
			continue
		#print("ELEMENT : " + ELEM_STR)
		IN_STR2 = jaconv.h2z(ELEM_STR,digit=False,ascii=True)
		#print('INPUT : ' + IN_STR2)
		str_sz = len(IN_STR2)
		str_setnum = int(len(IN_STR2) / STR_NUM)
		#print(str_setnum)
		for j in range(0, str_setnum + 1):
			linedata = []
			for i in range(0, STR_NUM):
				idx = j * STR_NUM + i
				#print(idx)
				if idx >= str_sz:
					continue
				#print(IN_STR2[idx])
				linedata.append(IN_STR2[idx])
			linedata2 = ''.join(linedata)
			outstr_arr.append(linedata2)
	outstr = '\n'.join(outstr_arr)
	#outstr = outstr.replace("。\n", "。")
	#print(outstr)
	return(outstr)
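In effect the function widens ASCII to full-width and hard-wraps each '★'-separated chunk at STR_NUM characters per line. An illustration (note the trailing blank line when the length is an exact multiple of STR_NUM, an artifact of the str_setnum + 1 loop):

print(insert_str('abcdef', 3))
# ａｂｃ
# ｄｅｆ
# (blank line)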
Example 16
def _analyze_pas(self) -> None:
    """Extract predicate-argument structures represented in <述語項構造:> tags."""
    sid2idx = {
        sid: idx
        for idx, sid in enumerate(self.sid2sentence.keys())
    }
    for bp in self.bp_list():
        if bp.tag.pas is None:
            continue
        pas = Pas(bp)
        for case, arguments in bp.tag.pas.arguments.items():
            if self.relax_cases:
                if case in ALL_CASES and case.endswith('≒'):
                    case = case.rstrip('≒')  # ガ≒ -> ガ
            for arg in arguments:
                arg.midasi = jaconv.h2z(arg.midasi,
                                        digit=True)  # 不特定:人1 -> 不特定:人１
                # exophor
                if arg.flag == 'E':
                    entity = self._create_entity(exophor=arg.midasi,
                                                 eid=arg.eid)
                    pas.add_special_argument(case, arg.midasi, entity.eid,
                                             '')
                else:
                    sid = self.sentences[sid2idx[arg.sid] - arg.sdist].sid
                    arg_bp = self._get_bp(sid, arg.tid)
                    _ = self._create_mention(arg_bp)
                    pas.add_argument(case, arg_bp, '')
        if pas.arguments:
            self._pas[pas.dtid] = pas
Example 17
def chunk_srt(text):
    chunks = []

    # Filter out subs with no useful characters, like '♬~'.
    subs = [sub for sub in srt.parse(text)
            if remove_spaces_punctuation(sub.content)]

    grouped_subs = group_subs_list(subs)

    for group in grouped_subs:
        text_pieces = []
        html_pieces = []

        for sub in group:
            start_time = sub.start.total_seconds()
            end_time = sub.end.total_seconds()

            # h2z only affects kana by default, which is what we want here.
            cleaned_content = jaconv.h2z(sub.content).strip()

            text_pieces.append(cleaned_content)

            html_pieces.append(
                f'<p t0="{start_time:.3f}" t1="{end_time:.3f}">' +
                html.escape(cleaned_content).replace('\n', '<br>') + '</p>')

        chunks.append({
            'text': '\n'.join(text_pieces),
            'html': '\n'.join(html_pieces),
        })

    return chunks
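remove_spaces_punctuation and group_subs_list are defined elsewhere. A plausible sketch of the first, consistent with the comment about filtering lines like '♬~' (an assumption, not the original helper):

def remove_spaces_punctuation(s):
    # Keep only letters, digits, kana, and kanji, so purely symbolic
    # subtitle lines reduce to the empty string.
    return ''.join(ch for ch in s if ch.isalnum())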
Example 18
def orig_XML_to_doc_obj(tree):
    """Convert orig to document tree.

    Under construction.
    """
    doc = Document('dummy')
    root = tree.getroot()
    for sent in root.findall('.//Sentence'):
        if sent.text == '' or sent.text is None:
            continue
        text = jaconv.h2z(sent.text, digit=True, ascii=True)
        doc.sentences.append(Sentence(text))
    return doc
Example 19
    def convert(self, sent):
        sent = jaconv.z2h(sent, kana=False, ascii=True, digit=True)
        iters = re.finditer(r'([a-zA-Z][a-zA-Z\s]*)$', sent)
        output_word = ""
        pos = 0
        for i in iters:
            s_pos, e_pos = i.span()
            word = i.groups()[0]
            word = re.sub(r'^\s', '', word)
            word = re.sub(r'\s$', '', word)
            s_word = ""

            while pos < s_pos:
                output_word += sent[pos]
                pos += 1

            if word in self.med_dic:
                s_word = self.med_dic[word]
            elif word.lower() in self.med_dic:
                s_word = self.med_dic[word.lower()]
            else:
                s_word = word

            if s_word == '':
                s_word = word

            output_word += s_word
            pos = e_pos

        while pos < len(sent):
            output_word += sent[pos]
            pos += 1

        return jaconv.h2z(output_word, kana=True, ascii=True, digit=True)
Example 20
async def on_message(self, ctx):
    if ctx.author.bot:
        return
    if all([
            ctx.channel.id != cs.Zyanken_room,
            ctx.channel.id != cs.Test_room
    ]):
        return

    for hand in ["グー", "チョキ", "パー"]:
        # Check whether the message contains rock, scissors, or paper
        # (グー, チョキ, パー), in that order
        if hand not in jaconv.hira2kata(jaconv.h2z(ctx.content)):
            continue
        # img, hand, msg, emoji1, emoji2 = zf.honda_to_zyanken(hand, ctx.author.id)
        img, hand, msg, emoji1, emoji2 = zf.honda_to_zyanken_breaktime(
            hand, ctx.author.id)
        if str(ctx.author.id) not in zf.No_reply:
            await ctx.add_reaction(emoji1)
            await ctx.add_reaction(emoji2)
            await ctx.channel.send(f"{ctx.author.mention} {hand}\n{msg}",
                                   file=discord.File(img),
                                   delete_after=5)
        if cs.Zyanken not in [roles.id for roles in ctx.author.roles]:
            guild = self.bot.get_guild(ctx.guild.id)
            await guild.get_member(ctx.author.id
                                   ).add_roles(get_role(guild, cs.Zyanken))
        """
        if emoji2 == "🎉" and len(zf.Former_winner) <= 5:
            guild = self.bot.get_guild(ctx.guild.id)
            await guild.get_member(ctx.author.id).add_roles(get_role(guild, cs.Winner))
            if ctx.author.id not in zf.Former_winner:
                zf.Former_winner.append(ctx.author.id)
        """
        break
Example 21
def processing(pconf, address):
    for p in pconf.get('print_only_then', []):
        if address[p['col']] != p['value']:
            return None

    for p in pconf.get('dont_print_then', []):
        if address[p['col']] == p['value']:
            return None

    for p in pconf.get('replace', []):
        address[p['col']] = address[p['col']].replace(p['from'], p['to'])

    for p in pconf.get('split', []):
        for i, v in enumerate(address[p['col']]):
            address[f'{p["col"]}_{i:02}'] = v

    for p in pconf.get('default', []):
        if address.get(p['col'], '') == '':
            address[p['col']] = p['value']

    for p in pconf.get('show_only_if_exists', []):
        if address.get(p['exists'], '') == '':
            address[p['col']] = ''

    for k, v in address.items():
        address[k] = jaconv.h2z(v, ascii=True, digit=True)

    return address
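A hedged usage sketch with invented pconf and address data; every surviving value comes back through jaconv.h2z(ascii=True, digit=True), i.e. full-width:

# Hypothetical configuration and record, invented for illustration.
pconf = {
    'dont_print_then': [{'col': 'status', 'value': 'moved'}],
    'replace': [{'col': 'city', 'from': 'Tokyo', 'to': '東京'}],
    'default': [{'col': 'honorific', 'value': '様'}],
}
address = {'status': 'ok', 'city': 'Tokyo', 'name': '山田太郎', 'honorific': ''}
print(processing(pconf, address))
# {'status': 'ｏｋ', 'city': '東京', 'name': '山田太郎', 'honorific': '様'}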
Example 22
def text_ins_reg(ins):
    # Begin search-and-replace.
    # ### Convert full-width digits to half-width.
    ins = jaconv.z2h(ins, kana=False, ascii=False, digit=True)
    # ### Strip spaces (half- or full-width) from the ends of the string.
    ins = re.sub('^[ 　]+|[ 　]+$', '', ins)
    # ### Full-width digit group separators -> half-width. The lookarounds
    # ### capture nothing, so the replacement is a literal comma.
    ins = re.sub(r'(?<=\d)，(?=\d+)', ',', ins)
    # ### Unify punctuation (general-prose version).
    ins = ins.replace('，', '、')  # scientific-prose version: ','
    ins = ins.replace('．', '。')  # scientific-prose version: '.'
    # ### Convert full-width ASCII to half-width:
    # protect full-width spaces with a placeholder, convert, then restore them.
    ins = ins.replace('　', '〓')
    ins = jaconv.z2h(ins, kana=False, ascii=True, digit=False)
    ins = ins.replace('〓', '　')
    # ### Convert half-width katakana to full-width.
    ins = jaconv.h2z(ins)
    # ### Convert ASCII '()' and '[]' to full-width.
    # ### (Without the r prefix, '\1' would be an escape byte, not a backreference.)
    ins = re.sub(r'\((.+?)\)', r'（\1）', ins)
    ins = re.sub(r'\[(.+?)\]', r'［\1］', ins)
    # ### Convert the ':' in time expressions to full-width.
    ins = re.sub(r'(\d{1,2}):(\d{2})', r'\1：\2', ins)
    # ### Keep a full-width period after numbers that open list items.
    # ### ### When it was a kuten (。)
    ins = re.sub(r'^(\d{1,3})。', r'\1．', ins, flags=re.MULTILINE)
    # ### ### When it was a period
    ins = re.sub(r'^(\d{1,3})\.\s', r'\1．', ins, flags=re.MULTILINE)
    # ### Known issue: unwanted spaces around the string are not removed.
    # ins = ins.strip()
    ins = re.sub(r'^\s+', '', ins, flags=re.MULTILINE)
    # ##############################################
    return ins
Example 23
def read_file(in_path, han2han=False, encode='cp932'):
    '''
    Read the input data.

    Args:
        in_path: path to the input file
        han2han: if True, leave half-width kana as-is
        encode : encoding of the input file; defaults to cp932
                 (the Windows variant of Shift_JIS)

    Returns:
        lines: the file contents as a list of strings, one element per line
    '''

    lines = []

    with codecs.open(in_path, 'r', encode) as f:

        # Process the file one line at a time
        for line in f:
            str = line.rstrip()

            # Skip empty lines
            if len(str) == 0:
                continue

            if not han2han:
                # half-width kana -> full-width
                str = jaconv.h2z(str)

            # Append the line
            lines.append(str)

    return lines
Example 24
def test_jaconv():
    logging.info("=========================================")
    logging.info("=               jaconv                  =")
    logging.info("=========================================")
    test_cases = get_test_cases()
    for tc in test_cases:
        title = tc['title']
        body = tc['body']

        logging.info("hiragana (full-width) to katakana (full-width) for %s" % title)
        calc_time(jaconv.hira2kata, body)
        logging.debug("result: %s" % jaconv.hira2kata(body))

        logging.info("katakana (full-width) to hiragana (full-width) for %s" % title)
        calc_time(jaconv.kata2hira, body)
        logging.debug("result: %s" % jaconv.kata2hira(body))

        logging.info("hiragana (full-width) to katakana (half-width) for %s" % title)
        calc_time(jaconv.hira2hkata, body)
        logging.debug("result: %s" % jaconv.hira2hkata(body))

        logging.info("half-width to full-width for %s" % title)
        calc_time(jaconv.h2z, body)
        logging.debug("result: %s" % jaconv.h2z(body))

        logging.info("full-width to half-width for %s" % title)
        calc_time(jaconv.z2h, body)
        logging.debug("result: %s" % jaconv.z2h(body))
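calc_time and get_test_cases are not shown. A minimal sketch of a timing helper matching the calls above (an assumption, not the original):

import time

def calc_time(func, text):
    # Time a single conversion call and log the elapsed wall-clock time.
    start = time.perf_counter()
    func(text)
    logging.info("elapsed: %f sec", time.perf_counter() - start)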
Example 25
def normalize_text_normal_ipadic(input_text, kana=True, ascii=True, digit=True):
    """
    * All half-width katakana is converted into full-width katakana
    * All half-width English letters and numeric strings are converted into full-width ones
    """
    # type: (text_type,bool,bool,bool)->text_type
    return jaconv.h2z(input_text, kana=kana, ascii=ascii, digit=digit)
Example 26
def func_normalize_text(text):
    # type: (str)->str
    """* What you can do
    - Normalizes the input text into a form suitable for KNP analysis.
    """
    if six.PY2 and isinstance(text, str):
        text = text.decode('utf-8')
    return jaconv.h2z(text=re.sub(r'\s', '', string=text),
                      kana=True,
                      ascii=True,
                      digit=True)
Example 27
def normalize_txt(text):
    return jaconv.h2z(jaconv.z2h(text.strip(),
                                 kana=False,
                                 digit=True,
                                 ascii=True),
                      kana=True,
                      digit=False,
                      ascii=False).lower()
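Net effect: ASCII letters and digits end up half-width and lower-cased, while half-width kana end up full-width. For example (the input string is invented):

normalize_txt('ＡＢＣ１２３ｶﾅ')  # -> 'abc123カナ'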
Example 28
def prepare(self, text):
    text = normalize.shorten_repeat(text, 3)
    text = jaconv.h2z(text)
    text = re_a_tag.sub('', text)
    text = kigou.sub('', text)
    for (old, new) in self.paraphrases['before'].items():
        text = text.replace(old, new)
    return text
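normalize.shorten_repeat, re_a_tag, and kigou come from the surrounding module and are not shown. Plausible sketches of the two patterns, invented for illustration only:

import re

re_a_tag = re.compile(r'<a[^>]*?>|</a>')  # assumed: strip anchor-tag markup
kigou = re.compile(r'[■□◆◇★☆※]')  # assumed: a set of symbol characters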
Example 29
    def _value_to_zenkaku(value: str) -> str:
        # Note: returns None (not a str) when the value contains 'null'.
        if re.search(r'null', value):
            return None

        zenkaku = jaconv.h2z(value, kana=True, digit=True, ascii=True)

        return zenkaku
Example 30
def normalize_jpn(text):
    text = suuji(text)
    text = jaconv.h2z(text.upper(),
                      ignore='',
                      kana=True,
                      ascii=True,
                      digit=True)
    text = text.replace("ー", "ー")
    return text
Example 31
def normalize_char_width(string: str) -> str:
    """Normalize character widths in string to a set standard.

    Converts all katakana to full-width, and all Latin alphabet and numeric
    characters to half-width.
    """
    out_str = jaconv.h2z(string, kana=True, ascii=False, digit=False)
    out_str = jaconv.z2h(out_str, kana=False, ascii=True, digit=True)
    return out_str
Example 32
def normalize(text, emoticon=False, repeat=None):
    text = HTMLParser().unescape(text)
    text = text.replace('\r', '\n')
    if emoticon is False:
        text = remove_emoticon(text)
        text = jaconv.h2z(text)
        text = text.replace('よぉ', 'よ')
        text = text.replace('よお', 'よ')
    if repeat:
        text = shorten_repeat(text, repeat)
    return text
Example 33
def normalize_text_normal_ipadic(input_text):
    """
    * All half-width katakana is converted into full-width katakana
    * All half-width English letters and numeric strings are converted into full-width ones
    """
    return jaconv.h2z(input_text, kana=True, ascii=True, digit=True)
Example 34
MECAB_ARGS_KEYS = 'rdulDOapmMFUBESxbPCtco'

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    for key in MECAB_ARGS_KEYS:
        parser.add_argument('-%s' % key)
    parser.add_argument('-N', type=int)
    args = parser.parse_args()
    mecab_arg = ''
    for key in MECAB_ARGS_KEYS:
        arg = getattr(args, key)
        if arg:
            mecab_arg += ' -%s%s' % (key, arg)
    if not args.F:
        mecab_arg += DEFALUT_FORMAT
    mecab = MeCab.Tagger(mecab_arg)
    while True:
        sentence = input()
        sentence = jaconv.h2z(sentence)
        if args.N:
            mecab.parseNBestInit(sentence)
            for i in range(args.N):
                result = mecab.next()
                if result:
                    print(result)
                else:
                    break
        else:
            result = mecab.parse(sentence)
            print(result)