def save_to_csv(self, filename, data):
    """Append one salon record to the per-prefecture CSV file."""
    # Normalize the address to half-width, then drop the leading zip code.
    cleaned_address = zenhan.z2h(data['address'], zenhan.ALL)
    cleaned_address = re.sub(r'%s\d+-\d+' % u'〒', '', cleaned_address).strip()
    row = [
        data['name'],
        data['name_kata'],
        cleaned_address,
        u'\n'.join(data['routes'] or u''),
        data['prefecture'],
        data['area'],
        zenhan.z2h(data['phone'], zenhan.ALL),
        data['working_hours'],
        data['holydays'],
        data['shop_url'],
        data['credit_cards_comment'],
        u'・'.join(data['credit_cards'] or u''),
        data['seats'],
        data['stylist'],
        data['parking'],
        unicode(data['cut_price']),
        data['page_url'],
    ]
    CsvWriter.write_to_csv(filename, row, firs_row=self.first_row)
def normalize(self,text):
    """Normalize widths: ASCII and digits to half-width, katakana to full-width."""
    text = zenhan.z2h(text, mode=1)  # alphabet: full-width -> half-width
    text = zenhan.z2h(text, mode=2)  # digits: full-width -> half-width
    return zenhan.h2z(text, mode=4)  # katakana: half-width -> full-width
def delete_aft(line): text = zenhan.z2h(line, mode=1) #アルファベット(全角→半角) text = zenhan.z2h(text, mode=2) #数字(全角→半角) text = zenhan.h2z(text, mode=4) #カタカナ(半角→全角) text = re.sub( r'[\u0000-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u3004\u3007-\u303F\u3099-\u30A0\u30FB\u30FD-\u4DFF\uA000-\uE0FFF]', "", text) #その他文字列削除 return text
def delete_symbol(line):
    """Normalize character widths and strip symbols/unsupported characters.

    Returns the cleaned text with ASCII/digits half-width, katakana
    full-width, punctuation ranges removed, and any characters outside the
    handled Unicode range deleted.
    """
    text = zenhan.z2h(line, mode=1)  # alphabet: full-width -> half-width
    text = zenhan.z2h(text, mode=2)  # digits: full-width -> half-width
    text = zenhan.h2z(text, mode=4)  # katakana: half-width -> full-width
    # Characters outside the supported range (to be removed individually below).
    symbol = re.sub(r'[\u0000-\uE0FFF]', "", text)
    # Remove punctuation, symbols and other non-content ranges.
    text = re.sub(
        r'[\u0000-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u3004\u3007-\u303F\u3099-\u30A0\u30FB\u30FD-\u4DFF\uA000-\uE0FFF]',
        "", text)
    # Delete the characters that Unicode handling above did not cover.
    if not symbol == "":
        # BUG FIX: the collected characters are interpolated into a regex
        # character class; escape them so ']', '\\', '^' or '-' cannot
        # break or silently change the pattern.
        text = re.sub("[%s]" % re.escape(symbol), "", text)
    return text
def conversion_data_format(input_data_dict):
    """変換処理を行いリストに格納し、それを返す処理。

    Convert one raw reception record into the output row (a list).
    """
    reception_dt = parser.parse(input_data_dict['reception_date'])
    return [
        input_data_dict['name'].strip(),
        GENDER_MODIFIED_MAP[input_data_dict['gender']],
        input_data_dict['birthday'],
        input_data_dict['email'],
        zenhan.z2h(input_data_dict['tel']).replace('ー', '-'),
        input_data_dict['post_code'].replace('ー', ''),
        input_data_dict['address'],
        SAMPLE_PRODUCT_MAP[input_data_dict['item_num']],
        reception_dt.strftime('%Y/%m/%d %H:%M:%S'),
    ]
def normalize(ingredient):
    """Yield cleaned ingredient names extracted from *ingredient*."""
    text = ingredient.strip()
    # Strip surrounding decoration patterns and the optional-prefix marker.
    for pattern in SURROUNDS:
        text = pattern.sub(lambda m: '', text)
    text = OPTIONAL_START.sub(lambda m: '', text)
    m = UNCLOSED_PAREN.match(text)
    if m:
        text = m.groups()[0]
    text = zenhan.z2h(text, mode=1)  # ascii
    text = zenhan.h2z(text, mode=4)  # kana
    # convert all katakana to hiragana
    text = hiragana(text)
    m = STARTS_WITH_ALPHA.match(text)
    if m and not text.startswith('S&B'):
        text = m.groups()[0]
    for pattern in SPECIAL_SYMBOLS:
        text = pattern.sub(lambda m: '', text)
    # Split into individual ingredients, trim each, and drop empties.
    for part in SPLIT.split(text):
        part = ENDS_WITH.sub(lambda m: '', part).strip()
        if part:
            yield part
def load(self, f):
    """Load word sequences, decoding the B/I/O wiki-entity annotations.

    Yields each sequence after tagging every word with a "stype" and
    attaching the half-width-normalized mention to each B-tagged word.
    """
    for wseq in super(WikiEdaTree, self).load(f):
        eposlist = []  # indices of B-tagged (entity-start) words
        for i, word in enumerate(wseq.word_list):
            if "misc" in word and len(word["misc"]) > 0:
                if len(word["misc"]) < 2:  # space B/I
                    raise FormattingException("malformed annotation: %s" % "".join(word["misc"]))
                word["_wpadding"] = word["misc"].pop(0)
                stype = word["misc"].pop(0)
                if stype not in ("B", "I"):
                    raise FormattingException("malformed annotation: %s" % "".join(word["misc"]))
                if stype == "B":
                    word["stype"] = self.WIKI_B
                    word["misc"].pop(0)
                    word["entity"] = "".join(word["misc"])
                    del word["misc"]
                    eposlist.append(i)
                else:
                    # An I tag must continue a preceding B or I word.
                    if i <= 0 \
                            or "stype" not in wseq.word_list[i - 1] \
                            or wseq.word_list[i - 1]["stype"] not in (self.WIKI_B, self.WIKI_I):
                        raise FormattingException("malformed annotation: I-without-B: %s" % word["wid"])
                    word["stype"] = self.WIKI_I
                    # BUG FIX: removed a stray no-op expression statement
                    # (`self.WIKI_I`) that followed this assignment.
            else:
                word["stype"] = self.WIKI_O
        # Assemble each mention from its B word plus following I words.
        for epos in eposlist:
            mention_orig = wseq.word_list[epos]["surface"]
            for i in xrange(epos + 1, len(wseq.word_list)):
                if wseq.word_list[i]["stype"] == self.WIKI_I:
                    mention_orig += wseq.word_list[i]["surface"]
                else:
                    break
            wseq.word_list[epos]["mention"] = z2h(mention_orig, mode=3)
        yield wseq
def clean_text(text):
    """Lowercase, NFKC-normalize, and half-width-convert ASCII/digits."""
    lowered = unicodedata.normalize('NFKC', text.lower())
    return zenhan.z2h(lowered, zenhan.ASCII | zenhan.DIGIT)
def zenNum2hanNum(strings):
    """Convert full-width digits to half-width; other characters pass through."""
    unicode_text = MultiBytes.convert2unicode(strings)
    return zenhan.z2h(unicode_text, mode=2)
def clean_text(text):
    """Return *text* lowercased, NFKC-normalized, with half-width ASCII/digits."""
    result = text.lower()
    result = unicodedata.normalize('NFKC', result)
    result = zenhan.z2h(result, zenhan.ASCII | zenhan.DIGIT)
    return result
def setEffect(string, material):
    """Extract the card-effect text: strip tags, normalize, drop the material."""
    text = p.sub("", string)
    text = text.replace('-', '-')
    text = zenhan.z2h(text, 3)          # ASCII and digits to half-width
    text = text.replace(material, "")   # remove the material portion
    return text.replace("。", "。\n")    # one sentence per line
def normalize(word):
    """Lowercase and half-width-normalize *word*, stripping a known prefix.

    Returns the word with the first matching entry of _trim_words removed
    from its start; the word is returned unchanged when none matches.
    """
    word = zenhan.z2h(word.lower(), zenhan.ASCII).strip()
    for t_word in _trim_words:
        if word.startswith(t_word):
            # BUG FIX: str.lstrip(t_word) strips any run of the *characters*
            # in t_word, not the prefix string itself; slice the prefix off.
            return word[len(t_word):]
    return word
def test_zenhan():
    """Log timing results for zenhan conversions over every test case."""
    banner = "========================================="
    logging.info(banner)
    logging.info("= zenhan =")
    logging.info(banner)
    mode = zenhan.ASCII | zenhan.KANA | zenhan.DIGIT
    for tc in get_test_cases():
        title = tc['title']
        body = tc['body']
        # Conversions zenhan does not support are logged as unimplemented.
        for label in ("ひらがな(全角) to カタカナ(全角) for %s",
                      "カタカナ(全角) to ひらがな(全角) for %s",
                      "ひらがな(全角) to カタカナ(半角) for %s"):
            logging.info(label % title)
            logging.info("Not implemented")
        logging.info("半角 to 全角 for %s" % title)
        calc_time(zenhan.h2z, body, mode)
        logging.debug("result: %s" % zenhan.h2z(body, mode))
        logging.info("全角 to 半角 for %s" % title)
        calc_time(zenhan.z2h, body, mode)
        logging.debug("result: %s" % zenhan.z2h(body, mode))
def setMaterial(string):
    """Return the material text, cut at the first '<br' and stripped of tags."""
    text = zenhan.z2h(string.replace('-', '-'), 3)
    if '<br' in text:
        text = text[:text.index('<br')]
    return p.sub("", text)
def delete_twitter(line):
    """Normalize a tweet and strip URLs, mentions, hashtags and symbols."""
    text = zenhan.z2h(line, mode=1)  # alphabet: full-width -> half-width
    text = zenhan.z2h(text, mode=2)  # digits: full-width -> half-width
    text = zenhan.h2z(text, mode=4)  # katakana: half-width -> full-width
    # Characters outside the supported range (removed individually below).
    symbol = re.sub(r'[\u0000-\uE0FFF]', "", text)
    text = re.sub(r'https?://[\w/:%#\$&\?\(\)~\.=\+\-…]+', "", text)  # URLs
    text = re.sub(r'@[\w/:%#\$&\?\(\)~\.=\+\-…]+', "", text)  # user names
    text = re.sub(r'#[\w/:%#\$&\?\(\)~\.=\+\-…]+', "", text)  # hashtags
    # Remove punctuation, symbols and other non-content ranges.
    text = re.sub(
        r'[\u0000-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u3004\u3007-\u303F\u3099-\u30A0\u30FB\u30FD-\u4DFF\uA000-\uE0FFF]',
        "", text)
    # Delete the characters the Unicode handling above did not cover.
    if not symbol == "":
        # BUG FIX: the collected characters are interpolated into a regex
        # character class; escape them so ']', '\\', '^' or '-' cannot
        # break or silently change the pattern.
        text = re.sub("[%s]" % re.escape(symbol), "", text)
    return text
def get_tweet(auth, g):
    """Fetch new mentions and reply to each via the conversation model.

    Updates g.last_mention to the newest mention id; on any per-tweet
    failure, replies with a canned "unknown" message instead.
    """
    url = "https://api.twitter.com/1.1/statuses/mentions_timeline.json"
    tweets = requests.get(url, auth=auth, params={
        "count": "200",
        "since_id": g.last_mention
    }).json()
    if len(tweets) > 0:
        g.last_mention = tweets[0]['id_str']
    for tweet in tweets:
        try:
            got_tweet = tweet['text']
            patternScreenName = r"@[a-zA-Z0-9/:%#\$&\?\(\)~\.=\+\-_]+"
            patternUrl = r"https?://[a-zA-Z0-9/:%#\$&\?\(\)~\.=\+\-_]+"
            # Strip mentions, URLs and whitespace noise from the tweet body.
            got_tweet = re.sub(patternScreenName, "", got_tweet)
            got_tweet = re.sub(patternUrl, "", got_tweet)
            got_tweet = re.sub(r'[\r|\t]', '', got_tweet)
            got_tweet = got_tweet.replace('\n', '')
            print('kitayo:' + got_tweet)
            utterLine = conv.parser(zh.z2h(got_tweet).lower())
            utterLineR = utterLine[::-1]
            text = "@" + str(
                tweet['user']['screen_name']) + " " + conv.conversation(
                    utterLineR, conv.model, conv.dictionary, conv.id2wd)
            # Replace NG-word replies with a canned safe message.
            if any([text.find(ng) != -1 for ng in ng_words]):
                text = "@" + str(tweet['user']['screen_name']) + str(
                    get_ng_word())
            put_tweet(auth, text, tweet['id'], g)
        except Exception:
            # BUG FIX: was a bare `except:`, which also swallowed
            # SystemExit/KeyboardInterrupt; keep the best-effort fallback
            # but only for ordinary errors.
            print("wakarazu")
            text = "@" + str(tweet['user']['screen_name']) + str(
                get_unknown_word())
            put_tweet(auth, text, tweet['id'], g)
def copy_tmp_to_forpdf(filename, gakki):
    """Copy course numbers and titles from the 'temp' sheet to 'ForPDF'.

    filename: path of the Excel workbook to update in place.
    gakki: quarter label written to D1 as "Q<gakki>" when non-empty.
    Returns True on success, False when the file is open elsewhere.
    """
    dt = datetime.today()
    try:
        wb = px.load_workbook(filename)
        ws1 = wb['ForPDF']
        ws2 = wb['temp']
        kogi_bango = ""
        # Columns B..F (chr(66)..chr(70)) hold one weekday each.
        for i in range(66, 71):
            for j1 in range(2, 17, 2):
                # Kougi-bango: write the course number.
                # Map the temp-sheet row j1 to the ForPDF row r1.
                r1 = round(3 * j1 / 2 + 2)
                kogi_bango = ws2[cell(c=chr(i), i=j1)].value
                if kogi_bango is None:
                    pass
                elif isinstance(kogi_bango, str):
                    # Keep only the first space-separated token of the code.
                    kogi_bango0 = kogi_bango.split(" ", 1)[0].strip()
                    ws1[cell(c=chr(i), i=r1)].value = ""
                    ws1[cell(c=chr(i), i=r1)].value =\
                        zh.z2h(text=kogi_bango0, mode=7)
                elif isinstance(kogi_bango, int):
                    # Zero-pad numeric course codes to six digits.
                    kogi_bango0 = "{:06d}".format(kogi_bango)
                    ws1[cell(c=chr(i), i=r1)].value = ""
                    ws1[cell(c=chr(i), i=r1)].value =\
                        zh.z2h(text=kogi_bango0, mode=7)
            for j2 in range(3, 18, 2):
                # Write the course title on the row below its number.
                r2 = round(((3 * j2 + 7) / 2) - 1)
                kogi_me = ws2[cell(c=chr(i), i=j2)].value
                if kogi_me is not None:
                    # Shorten long names so the cell fits on the PDF.
                    kogi_me = kogi_me.replace("-", "-").\
                        replace("英語コミュニケーション", "EC")
                    ws1[cell(c=chr(i), i=r2)].value = ""
                    ws1[cell(c=chr(i), i=r2)].value =\
                        zh.z2h(text=kogi_me, mode=3)
        # Stamp the generation date and the quarter label.
        ws1["F2"].value = dt.strftime("%Y/%m/%d")
        if gakki != "":
            ws1["D1"].value = "Q{}".format(gakki)
        wb.save(filename)
        print("Successfully completed")
        return True
    except PermissionError:
        print("The file was not closed.")
        return False
def norm(s):
    """Normalize a label: drop ※-notes, unify widths, and trim whitespace."""
    head = s.split("※", 1)[0]
    head = head.replace(" ", " ").replace("-", "-")
    head = zenhan.z2h(head, mode=7)  # ASCII/digits/kana to half-width
    head = zenhan.h2z(head, mode=4)  # kana back to full-width
    return head.strip()
def import_file(cls, filename, kind):
    """Store data from csv files into Elasticsearch.

    filename: string — path of the CSV to import.
    kind: hotel/restaurant — selects the target ES document class.
    Raises NotImplementedError for any other kind.
    """
    import csv
    import progressbar
    import time
    from ghost_spider.elastic import LatteHotelEs, LatteRestaurantEs
    to_class = None
    if kind == 'hotel':
        to_class = LatteHotelEs
    elif kind == 'restaurant':
        to_class = LatteRestaurantEs
    else:
        raise NotImplementedError()
    csvfile = open(filename, 'rb')
    fieldnames = cls.fieldnames
    reader = csv.DictReader(csvfile, fieldnames)
    try:
        to_class.DEBUG = False
        next(reader)  # skip the title line
        rows = list(reader)
        total = len(rows)
        progress = progressbar.AnimatedProgressBar(end=total, width=100)
        bulk = ""
        count_lines = 0
        for line, row in enumerate(rows):
            progress += 1
            progress.show_progress()
            data = {}
            # Decode every scalar cell to unicode (file opened in bytes mode).
            for k, v in row.iteritems():
                if v:
                    if not isinstance(v, (list, tuple)):
                        data.update({k: v.decode('utf-8')})
            # Derived/normalized fields for search.
            data["name_low"] = data["name"].lower()
            data["name_cleaned"] = to_class.analyze(data["name"].lower(), 'baseform_analyzer')
            data["name_cleaned"] = zenhan.z2h(data["name_cleaned"], zenhan.ASCII)
            data["url"] = data["url"].lower()
            data["kind"] = data["kind"].split('|') if data.get('kind') else []
            bulk += to_class.bulk_data(data, action="create")
            count_lines += 1
            # Flush the bulk buffer to Elasticsearch every 200 rows.
            if (count_lines % 200) == 0:
                to_class.send(bulk)
                bulk = ""
        if bulk:
            to_class.send(bulk)
        progress.show_progress()
        print " "
    finally:
        if csvfile:
            csvfile.close()
def conv(txt, unic=False):
    """Normalize *txt*: to half-width, lowercase, then back to full-width.

    Returns unicode when *unic* is true, UTF-8 encoded bytes otherwise.
    """
    normalized = zenhan.h2z(zenhan.z2h(unicode(txt)).lower())
    if unic:
        return normalized
    return normalized.encode('utf8')
def wakati(str):
    """Split text with MeCab, keeping content words only.

    Drops particles, auxiliary verbs, adverbs, symbols, and
    dependent/pronoun sub-categories.
    """
    normalized = zenhan.z2h(str, mode=3).lower()
    words = []
    for line in mecab.parse(normalized).split("\n"):
        cols = line.split("\t")
        if len(cols) < 2:
            continue
        features = cols[1].split(",")
        if features[0] in ["助詞", "助動詞", "副詞", "記号"]:
            continue
        if features[1] in ["非自立", "代名詞"]:
            continue
        words.append(cols[0])
    return words
def run(self, edit):
    """Toggle each selection between half-width and full-width kana."""
    for region in self.view.sel():
        selected = self.view.substr(region)
        if selected == "":
            continue
        to_zen = zenhan.h2z(selected, zenhan.KANA)
        to_han = zenhan.z2h(selected, zenhan.KANA)
        # Prefer widening; fall back to narrowing when nothing widened.
        if to_zen != selected:
            self.view.replace(edit, region, to_zen)
        elif to_han != selected:
            self.view.replace(edit, region, to_han)
def run(self, edit):
    """Swap kana width in every non-empty selection (zen <-> han)."""
    for sel_region in self.view.sel():
        text = self.view.substr(sel_region)
        if not text:
            continue
        widened = zenhan.h2z(text, zenhan.KANA)
        narrowed = zenhan.z2h(text, zenhan.KANA)
        if widened != text:
            self.view.replace(edit, sel_region, widened)
        elif narrowed != text:
            self.view.replace(edit, sel_region, narrowed)
def zenhan_search(self, statement, numOfResult):
    """Search with both half-width and full-width variants of *statement*."""
    han_statement = zenhan.z2h(statement)
    zen_statement = zenhan.h2z(statement)
    if han_statement != zen_statement:
        # Width matters: query both variants.
        to_search = (self.tokenizer.split_query(han_statement)
                     + self.tokenizer.split_query(zen_statement))
    else:
        to_search = self.tokenizer.split_query(statement)
    return self._search(to_search, numOfResult)
def save_for_production(cls, filename, data):
    """Append one hotel row to the production CSV for its prefecture."""
    # Normalize the address to half-width and drop the leading zip code.
    address = zenhan.z2h(data['address'], zenhan.ALL)
    address = re.sub(r'%s\d+-\d+' % u'〒', '', address).strip()
    row = [
        data['name'],
        data['name_kata'],
        address,
        data['parent_url_key'],
        zenhan.z2h(data['phone'], zenhan.ALL),
        data.get('kind') or '',
    ]
    CsvWriter.write_to_csv(filename, row, firs_row=cls.production_first_row)
def normalize(data):
    """NFKC-normalize *data* and map Unicode dashes to '-' in numeric strings."""
    nbsp = b"\xC2\xA0".decode("UTF-8")
    text = unicodedata.normalize(
        "NFKC", zenhan.z2h(zenhan.h2z(data.replace(nbsp, ""))))
    # Dash variants U+2010 .. U+2015.
    dash_bytes = [b'\xe2\x80\x90', b'\xe2\x80\x91', b'\xe2\x80\x92',
                  b'\xe2\x80\x93', b'\xe2\x80\x94', b'\xe2\x80\x95']
    dashes = "".join(b.decode("UTF-8") for b in dash_bytes)
    # Only unify dashes when the whole string is numeric-ish.
    if re.match("^[0-9\\+\\-{0}]+$".format(dashes), text):
        for dash in dashes:
            text = text.replace(dash, "-")
    return text
def normalizeText(string):
    """Strip URLs, @mentions and #hashtags, trim, and normalize width/case."""
    text = re.sub(r"https?://[a-zA-Z0-9/:%#\$&\?\(\)~\.=\+\-_]+", "", string)
    text = re.sub(r"@[a-zA-Z0-9/:%#\$&\?\(\)~\.=\+\-_]+", "", text)
    text = re.sub(r"#[a-zA-Z0-9/:%#\$&\?\(\)~\.=\+\-_]+", "", text)
    text = re.sub("(^(\s)*)|((\s)*$)", "", text)  # trim surrounding whitespace
    text = text.replace("\n", "")
    return zh.z2h(text).lower()
def extract_data_from_html(filename):
    """Parse the grade-report HTML and return (updated_date, gpa_list).

    updated_date: struct_time parsed from the report's date cell.
    gpa_list: list of dicts with course/grade fields; rows with an empty
    grade_points cell are skipped.
    """
    # html to table
    url_filename = return_urlfilename(filename)
    html = urlopen(url_filename)
    bsObj = BeautifulSoup(html, "html.parser")
    table = bsObj.findAll("table")
    # table to update_date (digits converted to half-width before parsing)
    updated_date = strptime(z2h(table[1].get_text().strip(), 2), "%Y年%m月%d日")
    # table to gpa_list
    gpa_table = table[4]
    gpa_rows = gpa_table.findAll("tr")
    gpa_list = []
    # Column order of the grade table.
    gpa_items = [
        "course_title", "lecturer", "year_completed", "grade_points", "grade",
        "credits", "gp"
    ]
    gpa_rows.pop(0)  # drop the header row
    for gpa_row in gpa_rows:
        tmp_dict = {item: None for item in gpa_items}
        is_append_list = True
        for i, cell in enumerate(gpa_row.findAll(['td', 'th'])):
            # Half-width normalize, with ideographic spaces mapped to ASCII.
            tmp_celltext =\
                z2h(cell.get_text().strip().replace("\u3000", " "), 3)
            # Column 3 is grade_points; an empty value marks a non-graded row.
            if i == 3 and tmp_celltext == "":
                is_append_list = False
                continue
            tmp_dict[gpa_items[i]] = tmp_celltext
        if is_append_list is True:
            # Cast the numeric columns.
            tmp_dict["grade_points"] = int(tmp_dict["grade_points"])
            tmp_dict["credits"] = float(tmp_dict["credits"])
            tmp_dict["gp"] = float(tmp_dict["gp"])
            gpa_list.append(tmp_dict)
    return updated_date, gpa_list
def save_to_csv(cls, filename, data):
    """Append one location row to the per-prefecture CSV file."""
    # Half-width ASCII in the address, then drop the leading zip code.
    address = zenhan.z2h(data['address'], zenhan.ASCII)
    address = re.sub(r'%s\d+-\d+' % u'〒', '', address).strip()
    row = [
        data['name'],
        data['name_kata'],
        address,
        data['prefecture'],
        data['area'],
        zenhan.z2h(data['phone'], zenhan.ALL),
        data['kind'],
        data.get('latte_url') or u'',
        data['page_url'],
        data['id'],
    ]
    CsvWriter.write_to_csv(filename, row, firs_row=cls.first_row)
def tokenize(text):
    """Yield half-width, lowercased surface forms of the nouns in *text*.

    MeCab feature column 0 == '名詞' selects nouns; any surface that fails
    normalization yields the placeholder '0' instead.
    """
    node = mecab.parseToNode(text)
    while node:
        if node.feature.split(',')[0] == '名詞':
            try:
                yield zenhan.z2h(node.surface.lower().strip())
            except Exception:
                # BUG FIX: was a bare `except:` (also caught SystemExit /
                # KeyboardInterrupt); keep the best-effort placeholder.
                yield '0'
        node = node.next
def search(browser):
    """Interactively search Amazon JP and return the chosen item's name.

    Prompts for a query and an item number, then navigates *browser* to
    the selected product's detail page.
    """
    # Enter the search query on the Amazon top page.
    query = input("商品名入力: ")
    browser.get('https://www.amazon.co.jp/')
    search_box = browser.find_element_by_id('twotabsearchtextbox')
    search_box.send_keys(query)
    # Submit the search.
    search_btn = browser.find_element_by_class_name('nav-input')
    search_btn.click()
    time.sleep(2)
    # List the result items.
    items = browser.find_elements_by_class_name('s-result-item')
    time.sleep(2)
    for num, item in enumerate(items):
        try:
            item_name = item.find_element_by_tag_name('h2').text
            print(num, ':', item_name)
            print('-' * 20)
        except Exception:
            # BUG FIX: was a bare `except:`; some result tiles carry no
            # <h2>, so those are still skipped silently.
            pass
    # Pick an item by number (full-width digits accepted).
    select_num = str(input('商品番号入力: '))
    select_item_num = 'result_' + zenhan.z2h(select_num)
    time.sleep(1)
    select_item_html = browser.find_element_by_id(select_item_num)
    # Resolve the detail-page URL and the item name.
    select_item = select_item_html.find_element_by_class_name('a-link-normal')
    select_item_url = select_item.get_attribute('href')
    select_item_name = select_item_html.find_element_by_tag_name('h2').text
    browser.get(select_item_url)
    time.sleep(1)
    return select_item_name
def month_date_end_search(line):
    """Find the end date (with an explicit month) in a schedule line.

    Returns (month, day) as a pair of ints.
    """
    zen_tilde = '~'
    # full-width space
    zen_space = ' '
    # full-width zero
    zen_zero = '0'
    nichi = '日'
    tsuki = '月'
    dollar = '$'
    # Work around single-digit days by replacing full-width spaces/tildes
    # with a full-width zero so the two-character slice below always works.
    line = line.replace(zen_space, zen_zero)
    line = line.replace(zen_tilde, zen_zero)
    index_month = line.find(tsuki)
    # Replace the month marker so a single-digit day still slices correctly.
    line = line.replace(tsuki, zen_zero, 1)
    # Locate the second occurrence of the day marker.
    index_second_date = line.find(nichi, index_month + 1)
    # Correct mis-writings using the day/weekday ('$') position relation.
    index_second_dollar = line.find(dollar, index_month + 1)
    if index_second_date + 1 != index_second_dollar:
        index_second_date = index_second_dollar
    # Return (month, day), converting full-width digits to half-width.
    return int(zenhan.z2h(line[index_month - 2:index_month])), int(zenhan.z2h(line[index_second_date - 2:index_second_date]))
def date_start_search(line): """予定開始の日付を検出し,strで返す.""" # 全角スペース zen_space = ' ' # 全角0 zen_zero = '0' nichi = '日' dollar = '$' # 全角スペースを0に置き換えることで無理やり対応 line = line.replace(zen_space, zen_zero) index = line.find(nichi) # 日と曜日の位置関係から誤表記を訂正 index_first_dollar = line.find(dollar, index + 1) if index + 1 != index_first_dollar: index = index_first_dollar # ex. 1 → 01 #if line[index - 1] == zen_space: # line[index - 1] = zen_zero return zenhan.z2h(line[index - 2:index])
def getCardURL(name):
    """Look up the card page URL for *name* on ocg.xpg.jp.

    Returns the href string when an exact name match is found, False on an
    HTTP error, or None when no card matches.
    """
    url = "https://ocg.xpg.jp/search/search.fcgi?Name=" + urllib.parse.quote(
        name.encode('Shift_JIS')) + "&Mode=0"
    try:
        fp = requests.get(url)
        soup = BeautifulSoup(fp.content, "html.parser")
        # BUG FIX: was `fp.close` (attribute access only) — the response
        # was never actually closed.
        fp.close()
        time.sleep(1)
        texts = soup.find_all("a", href=re.compile("/c/+"))
        for text in texts:
            # Strip tags, normalize widths, and cut at the '【' marker.
            name_tmp = p.sub("", str(text))
            name_tmp = zenhan.z2h(name_tmp.replace('-', '-'), 3)
            if '【' in name_tmp:
                name_tmp = name_tmp[:name_tmp.index("【")]
            if name == name_tmp:
                return text.get("href")
    except urllib.error.HTTPError:
        # NOTE(review): requests raises requests.exceptions.RequestException,
        # not urllib.error.HTTPError — confirm which errors should be caught.
        time.sleep(1)
        return False
def date_end_search(line):
    """Find the end date (without a month marker) and return it as a str."""
    zen_tilde = '~'
    # full-width space
    zen_space = ' '
    # full-width zero
    zen_zero = '0'
    nichi = '日'
    dollar = '$'
    # Work around single-digit days by replacing full-width spaces and
    # tildes with a full-width zero so the two-character slice works.
    line = line.replace(zen_space, zen_zero)
    line = line.replace(zen_tilde, zen_zero)
    index_first_date = line.find(nichi)
    # Locate the second occurrence of the day marker.
    index_second_date = line.find(nichi, index_first_date + 1)
    # Correct mis-writings using the day/weekday ('$') position relation.
    index_second_dollar = line.find(dollar, index_first_date + 2)
    if index_second_date + 1 != index_second_dollar:
        index_second_date = index_second_dollar
    return zenhan.z2h(line[index_second_date - 2:index_second_date])
def changeClassroom(self, classroom):
    """Abbreviate a classroom name and convert it to half-width."""
    if match(r".*,.*", classroom):
        # Multiple rooms listed: leave as-is.
        short = classroom
    elif classroom == "工学部1号館情報実習室1(CAE室)":
        short = "工1-CAE室"
    elif match(r"一般教育棟.*", classroom):
        short = classroom.replace("一般教育棟", "").replace("教室", "")
    elif match(r"工学部.*", classroom):
        short = (classroom.replace('工学部', "工")
                 .replace("号館第", "-")
                 .replace("号館", "-")
                 .replace("講義室", ""))
    elif match(r"情報実習室.*", classroom):
        short = classroom.replace("情報実習室", "情")
    elif match(r"理学部.*", classroom):
        short = (classroom.replace("理学部", "理")
                 .replace("号館第", "-")
                 .replace("号館", "-")
                 .replace("講義室", ""))
    else:
        short = classroom
    short = short.replace(" ", "")
    return zh.z2h(text=short, mode=3)
def scp_number(msg):
    """Resolve an SCP number (plus optional branch code) to its record.

    Returns the matching row (list), a "does not exist" message, a branch
    reply string, or None when no number can be parsed.
    """
    msg = zenhan.z2h(msg.casefold()).replace("-", "").replace("scp", "")
    number = re.sub("\\D", "", msg)
    # BUG FIX: `number is (None and "")` reduced to `number is None`,
    # which is always False for a str; test for "no digits" by value.
    if number == "":
        return None
    brt = msg.replace(number, "")
    if brt == "":
        brt = "en"
    if brt not in BRANCHS:  # needs improvement
        reply = get_country_from_code(brt)
        return reply
    try:
        dictionary = pd.read_csv(currentpath + "/data/scps.csv", index_col=0)
    except FileNotFoundError as e:
        print(e)
        # BUG FIX: `dictionary` was left unbound here, so the query below
        # raised NameError; bail out instead.
        return None
    result = dictionary.query('branches in @brt')
    result = result.query('url.str.contains(@number)', engine='python')
    result = result[0:1].values.tolist()
    result = list(itertools.chain(*result))
    # BUG FIX: `number is re.sub(...)` compared string identity (always
    # False for freshly-built strings); compare the extracted digits by
    # value so partial matches are rejected.
    if len(result) == 0 or number != re.sub("\\D", "", result[0]):
        if len(number) > 4:
            return None
        if "en" in brt:
            return("scp-" + str(number) + "はまだ存在しません")
        else:
            return("scp-" + str(number) + "-" + str(brt) + "はまだ存在しません")
    return(result)
def setName(string):
    """Extract a card name: strip tags, normalize widths, cut at '【'."""
    name = zenhan.z2h(p.sub("", string).replace('-', '-'), 3)
    # NOTE(review): presence is tested with '】' but the cut uses '【' —
    # assumes both brackets always appear together; confirm with the data.
    if '】' in name:
        return name[:name.index('【')]
    return name
def med_facility(tdfk, facid):
    """Build a facility code: 'M' + 2-digit prefecture + 7-digit facility id."""
    pref = zenhan.z2h(tdfk, mode=7, ignore=()).zfill(2)
    fac = zenhan.z2h(facid, mode=7, ignore=()).zfill(7)
    return 'M' + pref + fac
def get_recipe(url, dish):
    """Scrape a recipe page and return {ingredient: [amount_per_person, unit]}.

    Finds the paragraph containing the ingredient list ("材料" and "人"),
    extracts the serving count, then parses each ingredient line into a
    name, a per-person amount, and a full-width unit string.
    """
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    response1 = response.read()
    soup = BeautifulSoup(response1, "html.parser")
    recipe = ""
    # The ingredients block is the <p> mentioning both "材料" and "人".
    for p in soup.findAll('p', text=False):
        if p.text.find("人") != -1 and p.text.find("材料") != -1:
            recipe = p.text
    temps = recipe.split("\n")
    elements = dict()
    amount = 0
    people = 0
    for temp in temps:
        # calculate number of people (done)
        if temp.find("材料") != -1:
            people = float(re.search("[0-9]", zenhan.z2h(temp, 2)).group())
        # get each element for one man (done)
        elif temp != "":
            # Strip bullet markers, then split name from amount.
            element = temp.replace("●", "").replace("○", "").replace("〇", "").replace("◎", "").lstrip(" ")
            if temp.find("…") != -1:
                element = element.split("…")
            else:
                element = element.split(None, 1)
            if people != 0:
                # convert all string to hankaku
                if len(element) >= 2:
                    han_element = zenhan.z2h(element[1], 2)
                else:
                    break
                # march unit: everything that is not a digit/fraction char.
                unit = re.search("[^0-9\/~ ]+", han_element).group(0)
                string_amount = re.search("[0-9\/ ]+", han_element.replace(unit, ""))
                if string_amount != None:
                    # Sum whole-and-fraction parts (e.g. "1 1/2") per person.
                    amount = float(sum(Fraction(s) for s in string_amount.group(0).split())) / people
                else:
                    amount = 0
            else:
                print "people=0"
            # Unit is stored back in full-width form.
            elements.update({element[0]: [amount, zenhan.h2z(unit, 4)]})
    return elements
def normalize(text):
    """Convert full-width digits and ASCII in *text* to half-width."""
    conversion_mode = zenhan.DIGIT | zenhan.ASCII
    return zenhan.z2h(text, mode=conversion_mode)
def normalize_text(text):
    """Trim, half-width-normalize, then apply the replacement map."""
    result = zenhan.z2h(text.strip(), mode=7)
    for old, new in normalize_replace_map:
        result = result.replace(old, new)
    return result
def test_z2h_digit_only(self):
    """z2h with DIGIT converts digits only."""
    result = zenhan.z2h(self.original, zenhan.DIGIT)
    self.assertEqual(result, u("゚abcDE゙F123456アガサダナバビプペ゚"))
def zenAlphaNum2hanAlphaNum(strings):
    """Convert full-width alphanumerics to half-width."""
    unicode_text = MultiBytes.convert2unicode(strings)
    return zenhan.z2h(unicode_text, mode=3)
def test_z2h_all(self):
    """z2h with ALL matches combining ASCII|DIGIT|KANA explicitly."""
    result = zenhan.z2h(self.original, zenhan.ALL)
    self.assertEqual(result, u("゚abcDE゙F123456アガサダナバビプペ゚"))
    self.assertEqual(
        result,
        zenhan.z2h(self.original, zenhan.ASCII | zenhan.DIGIT | zenhan.KANA))
def test_z2h_ascii_and_digit(self):
    """z2h with ASCII|DIGIT converts letters and digits, not kana."""
    result = zenhan.z2h(self.original, zenhan.ASCII | zenhan.DIGIT)
    self.assertEqual(result, u("゚abcDE゙F123456アガサダナバビプペ゚"))
i = 0 for p1 in tmp1.find_all("td", {"class": "a-center tb-color001"}): for p2 in p1.find_all("span"): # 銘柄コード #print(p2.attrs['id']) data['code'][i] = p2.attrs['id'] i += 1 for i, p1 in enumerate( tmp1.find_all(attrs={ "rowspan": "2", "class": "a-center tb-color001 w-space" })): #上場日 data['jojodate'][i] = zenhan.z2h(p1.contents[0].string).strip() i = 0 for p1 in tmp1.find_all(attrs={"rowspan": "2", "class": "a-left tb-color001"}): #会社名 data['name'][i] = p1.find("a").text.replace('*', '').strip() i += 1 i = 0 for p1 in tmp1.find_all("tr"): for p2 in p1.find_all("td", {"class": "a-center tb-color001"}): for p3 in p2: if ("第一部" in p3 or "第二部" in p3 or "マザーズ" in p3 or "JQスタンダード" in p3):
def test_z2h_ascii_and_kana(self):
    """z2h with ASCII|KANA converts letters and kana, not digits."""
    result = zenhan.z2h(self.original, zenhan.ASCII | zenhan.KANA)
    self.assertEqual(result, u("゚abcDE゙F123456アガサダナバビプペ゚"))
def zenkaku_to_hankaku(text):
    """Convert full-width characters in *text* to half-width."""
    # mode=7 covers ASCII, digits, and kana.
    return zenhan.z2h(text, mode=7)
def setPendulumScale(scale):
    """Parse the pendulum scale number that follows '赤' in *scale*."""
    digits = scale[scale.index('赤') + 1:]
    return int(zenhan.z2h(digits))
def test_z2h_kana_only(self):
    """z2h with KANA converts kana only."""
    result = zenhan.z2h(self.original, zenhan.KANA)
    self.assertEqual(result, u("゚abcDE゙F123456アガサダナバビプペ゚"))
def test_z2h_digit_and_kana(self):
    """z2h with DIGIT|KANA converts digits and kana, not letters."""
    result = zenhan.z2h(self.original, zenhan.DIGIT | zenhan.KANA)
    self.assertEqual(result, u("゚abcDE゙F123456アガサダナバビプペ゚"))
def messaging_service():
    """LINE-bot webhook handler for the "言えまてん" quiz bot.

    Parses the webhook payload, then dispatches on event type and message
    text: follow/greeting, help, a multi-step quiz-submission dialog
    (post1..post4), and quiz/answer lookup by number. Returns True.
    """
    logger.info('messaging_service()')
    strreq = request.data
    # request.data may already be str on some servers; decode when bytes.
    try:
        strreq = strreq.decode()
    except AttributeError:
        pass
    jsonreq = json.loads(strreq)
    if debuglog:
        logger.info(str(jsonreq))
    reply_token = ''
    message = ''
    strMessage = ''
    user_id = ''
    etype = ''
    postbackdata = ''
    timestamp = datetime.now()
    # Pull the fields of interest out of the webhook events.
    for e in jsonreq['events']:
        etype = e['type']
        reply_token = e['replyToken']
        user_id = e['source']['userId']
        if etype == 'message':
            message = e['message']
            strMessage = message['text']
            timestamp = int(e['timestamp'])
        elif etype == 'follow':
            timestamp = int(e['timestamp'])
        elif etype == 'postback':
            logger.info(str(e))
            postbackdata = e['postback']['data']
            timestamp = int(e['timestamp'])
    date = datetime.now()
    send_line_bot_log(user_id, f"{user_id},{strMessage}," + date.strftime('%Y/%m/%d %H:%M:%S'))
    strIntent = ""
    # Dialog state carried between turns: induction, quiz, answer, nickname.
    _i = " "
    _q = " "
    _a = " "
    _n = " "
    # follow event: greet and ask whether Clova quiz messages are welcome.
    if etype == 'follow' or strMessage == u'クイズ連携':
        send_line_bot_log(user_id, f"{user_id},follow message," + date.strftime('%Y/%m/%d %H:%M:%S'))
        mess = []
        mess.append("言えまてんBotです!フォローありがとう!")
        mess.append({
            'type': 'template',
            'altText': '確認',
            'template': {
                'type': 'confirm',
                'text': 'Clovaはお持ちですか?言えまてんクイズからのメッセージをこちらに送信しても大丈夫ですか?',
                'actions': [
                    {
                        'type': 'postback',
                        'label': 'はい',
                        'data': 'res=yes'
                    },
                    {
                        'type': 'postback',
                        'label': 'いいえ',
                        'data': 'res=no'
                    }
                ]
            }
        })
        send_line_reply(reply_token, mess)
    # postback: record the yes/no answer of the confirm template above.
    if etype == 'postback':
        if postbackdata == 'res=yes':
            text = "ありがとう!言えまてんクイズの使い方を知りたい時は「使い方」と言ってみてね!"
            send_line_reply(reply_token, text)
            insert(user_id, "follow", "follow", "ok", _i, _q, _a, _n)
        else:
            text = "残念です!言えまてんクイズの使い方を知りたい時は「使い方」と言ってみてね!"
            send_line_reply(reply_token, text)
            insert(user_id, "follow", "follow", "ng", _i, _q, _a, _n)
        return True
    # Restore the submission-dialog state if it is less than 8 hours old.
    intent = getIntent(user_id, "reply")
    if 'date' in intent:
        postDate = intent['date']
        if datetime.strptime(postDate, '%Y/%m/%d %H:%M:%S') > datetime.now() - timedelta(hours=8):
            strIntent = intent['intent']
            _i = intent['induction']
            _q = intent['quiz']
            _a = intent['answer']
    # NOTE(review): for follow events `message` is still '' here, so
    # message['text'] below would raise — confirm follow events never
    # reach this dispatch in practice.
    if re.compile("こんにちは|Hello|こんばんは|おはよう").search(message['text']):
        # Greeting.
        text = "こんにちは!言えまてんボットです。よろしくね。「使い方」というと説明するよ!"
        send_line_reply(reply_token, text)
    elif re.compile("^(使い方|つかいかた|Help|ヘルプ)").search(message['text']):
        # Usage instructions.
        mess = u'言えまてんクイズの使い方です。最初にひとつの言葉を10回繰り返して言ってもらいます。\n'
        mess += u'次に、その言葉に少し関係のある問題を出すので答えを考えてね。問題は全部で' + str(len(quiz)) + 'つありますよ。\n'
        mess += u'答えがあっていると正解!です。もう一度問題をやるか聞かれたら「はい」か「いいえ」と答えてね。\n'
        mess += u'「問題1」、「問題1の答え」のようにBotに言うと問題についてお答えします。問題を思いついた人は「応募」と言ってみてね。\n'
        send_line_reply(reply_token, mess)
    elif strIntent == "post1" and len(message['text']) > 1:
        # Submission step 1: the phrase to repeat.
        _i = message['text']
        insert(user_id, "reply", "post2", message, _i, _q, _a, _n)
        mess = []
        mess.append("「" + message['text'] + "」ですね。わかりました。")
        mess.append("次に問題を教えてください。")
        send_line_reply(reply_token, mess)
    elif strIntent == "post2" and len(message['text']) > 5:
        # Submission step 2: the quiz question.
        _q = message['text']
        insert(user_id, "reply", "post3", message, _i, _q, _a, _n)
        mess = []
        mess.append("問題は「" + message['text'] + "」ですね。わかりました。")
        mess.append("次は答えを教えてください。")
        send_line_reply(reply_token, mess)
    elif strIntent == "post3" and len(message['text']) > 0:
        # Submission step 3: the answer.
        _a = message['text']
        insert(user_id, "reply", "post4", message, _i, _q, _a, _n)
        mess = []
        mess.append("答えは「" + message['text'] + "」ですね。わかりました。")
        mess.append("最後にニックネームを教えて!もし採用されたら問題の解説の時に紹介するね。内緒にしたい時は「匿名」と答えてね。")
        send_line_reply(reply_token, mess)
    elif strIntent == "post4" and len(message['text']) > 0:
        # Submission step 4: nickname; forward the submission via SNS.
        _n = message['text']
        insert(user_id, "reply", "finish", message, _i, _q, _a, _n)
        mess = []
        mess.append("「" + message['text'] + "」さん、応募ありがとう!")
        mess.append("参考にするね!")
        send_line_reply(reply_token, mess)
        send_sns(str(jsonreq), _i, _q, _a, _n)
    elif re.compile("^(応募|おうぼ|投稿)").search(message['text']):
        # Start of the submission dialog.
        insert(user_id, "reply", "post1", message, _i, _q, _a, _n)
        mess = []
        mess.append("言えまてんクイズです。新しい問題を応募してます。面白い問題を考えた人は、1.10回繰り返す言葉(キリンとか)、2.問題、3.答えの3つを教えてね。")
        mess.append("では、10回繰り返して言ってもらうフレーズを教えてください。")
        send_line_reply(reply_token, mess)
    elif re.compile("^問題[0-9]{1,2}$").search(zenhan.z2h(message['text'])):
        # Quiz lookup by number ("問題N").
        match = re.compile("[0-9]{1,2}").search(zenhan.z2h(message['text']))
        num = int(match.group())
        if num > 0:
            if len(quiz) >= num:
                text = f"問題{num}: {quiz[num]['q']}"
                send_line_reply(reply_token, text)
            else:
                text = f"問題{num}がみつかりません。"
                send_line_reply(reply_token, text)
    elif re.compile("^問題[0-9]{1,2}の答").search(zenhan.z2h(message['text'])):
        # Answer lookup by number ("問題Nの答え").
        match = re.compile("[0-9]{1,2}").search(zenhan.z2h(message['text']))
        num = int(match.group())
        if num > 0:
            if len(quiz) >= num:
                mess = []
                mess.append(f"問題{num}の答え=> {quiz[num]['a'][0]}")
                mess.append(f"問題{num}の解説=> {quiz[num]['i']}")
                send_line_reply(reply_token, mess)
            else:
                text = f"問題{num}がみつかりません。"
                send_line_reply(reply_token, text)
    return True
def normalize(data):
    """Strip NBSP, round-trip widths via zenhan, and NFKC-normalize."""
    nbsp = b"\xC2\xA0".decode("UTF-8")
    cleaned = zenhan.z2h(zenhan.h2z(data.replace(nbsp, "")))
    return unicodedata.normalize("NFKC", cleaned)
def test_z2h_ascii_only(self):
    """z2h with ASCII converts letters only."""
    result = zenhan.z2h(self.original, zenhan.ASCII)
    self.assertEqual(result, u("゚abcDE゙F123456アガサダナバビプペ゚"))