def parse_header(parsed_mail, field):
    # type: (List[str], str) -> str
    """Return the decoded value of *field* from a parsed mail message.

    RFC 2047 encoded-words are decoded chunk by chunk; Japanese charsets
    get special-cased fixups (nkf when available, otherwise codec aliases
    plus ISO-2022-JP escape-sequence rewrites).
    """
    decoded = []
    raw_header = parsed_mail.get(field, '')
    # decode_header does not work well in some case,
    # eg. FW: =?ISO-2022-JP?B?GyRCR1s/LklURz0bKEI=?=:
    # so unfold the header ourselves and split it into encoded-word chunks.
    chunks = re.split(r'(=\?[^?]+\?[BQ]\?[^?]+\?=)',
                      re.sub(r'\r?\n\s+', ' ', raw_header))
    # Drop the single space between two adjacent encoded-words: RFC 2047
    # says such whitespace is not part of the decoded text.
    i = 0
    while i < len(chunks):
        if chunks[i].startswith('=?') and chunks[i].endswith('?=') and \
                i < len(chunks) - 2 and \
                chunks[i + 1] == ' ' and \
                chunks[i + 2].startswith('=?') and chunks[i + 2].endswith('?='):
            del (chunks[i + 1])
        i += 1
    for chunk in chunks:
        if chunk.find('=?') >= 0:
            for decoded_chunk, charset in decode_header(chunk):
                if charset:
                    if charset == 'ISO-2022-JP':
                        if callable(nkf):
                            # nkf handles vendor extensions beyond the codec
                            decoded_chunk = nkf('-Jw', decoded_chunk)
                            charset = 'utf-8'
                        else:
                            charset = 'ISO-2022-JP-2004'
                            # rewrite escape sequences so the -2004 codec
                            # accepts plain JIS-encoded text
                            decoded_chunk = decoded_chunk \
                                .replace(b'\033$B', b'\033$(Q') \
                                .replace(b'\033(J', b'\033(B')
                    elif charset == 'SJIS':
                        if callable(nkf):
                            decoded_chunk = nkf('-Sw', decoded_chunk)
                            charset = 'utf-8'
                        else:
                            # CP932 is the superset commonly meant by "SJIS"
                            charset = 'CP932'
                    elif charset == 'EUC-JP':
                        if callable(nkf):
                            decoded_chunk = nkf('-Ew', decoded_chunk)
                            charset = 'utf-8'
                        else:
                            charset = 'EUCJIS2004'
                    try:
                        decoded_chunk = decoded_chunk.decode(
                            charset, errors='replace')
                    except TypeError:
                        # decoded_chunk may already be text; keep it as-is
                        pass
                else:
                    # NOTE(review): assumes unencoded parts arrive as bytes;
                    # on Python 3 decode_header may return str here — confirm.
                    decoded_chunk = decoded_chunk.decode()
                decoded.append(decoded_chunk)
        elif chunk:
            decoded.append(chunk)
    return ''.join(decoded)
def file_read(file_path):
    """Read *file_path*, convert its content to UTF-8 with nkf, and
    return it decoded as text.
    """
    # Use a context manager so the file handle is closed promptly
    # (the original leaked it via a bare open().read()).
    with open(file_path) as fp:
        contents = fp.read()
    contents = nkf.nkf("-w -d", contents)\
        .decode("utf_8")
    return contents
def create_img(name, tenki, kuji, honbun):
    """Render a dated fortune ("omikuji") slip as a 384px-wide image.

    name   -- addressee name, printed as '<name> 局長殿'
    tenki  -- weather line, split on '、' and centered line by line
    kuji   -- fortune text; first two whitespace-separated tokens printed
    honbun -- body text, line-folded by nkf -f22 before drawing

    Returns the final 1-bit-thresholded PIL image (also saved as output.png).
    """
    # draw_text() advances this module-level cursor; reset it per image.
    global g_pos_y
    g_pos_y = 0
    img = PIL.Image.new('RGB', (384, 1000), (255,255,255))
    date = datetime.datetime.today().strftime("%Y/%m/%d")
    # draw_text(img, ' ')
    draw_text(img, ' ')
    draw_text(img, date, align='center', size=28)
    draw_text(img, ' ')
    draw_text(img, name + ' 局長殿', align = 'center', size=40)
    draw_text(img, ' ')
    u = tenki.split('、')
    for v in u:
        draw_text(img, v, align = 'center', size = 32)
    draw_text(img, ' ')
    draw_text(img, '================================')
    draw_text(img, 'KDNおみくじ', align='center', size=48)
    draw_text(img, '================================')
    draw_text(img, ' ')
    kuji = kuji.split()
    draw_text(img, kuji[0], align='center', size = 36)
    draw_text(img, ' ')
    draw_text(img, kuji[1], align='center', size = 36)
    draw_text(img, ' ')
    # -f22: let nkf fold the body to ~22 chars per line before drawing
    u = nkf.nkf('-f22', honbun).decode('utf-8').splitlines()
    for v in u:
        draw_text(img, v, size = 32)
    # Threshold to pure black/white (e.g. for a receipt-style printer —
    # TODO confirm intended output device).
    gray = img.convert('L')
    img = gray.point(lambda x: 0 if x < 128 else 255)
    # img.show()
    img.save('output.png')
    return img
def decode(text, encoding=None, *args):
    """Decode byte string *text* to text.

    If no encoding is given, or the caller supplied the bogus mail default
    ISO-8859-1, the real encoding is guessed with nkf. Unknown codecs fall
    back to an nkf conversion to UTF-8. Extra *args are forwarded to
    bytes.decode (e.g. an errors mode).
    """
    if not encoding or encoding in ('ISO-8859-1', 'iso-8859-1'):
        encoding = nkf.guess(text)
        if encoding in ('BINARY', 'ISO-8859-1'):
            # nkf could not identify it either; assume UTF-8
            encoding = 'utf8'
    encoding = normalize_encoding(encoding)
    # Idiom fix: `encoding not in` instead of `not encoding in`.
    if encoding not in all_encodings:
        # Codec unknown to Python: let nkf force the payload to UTF-8.
        return nkf.nkf('-w', text).decode('utf8')
    return text.decode(encoding, *args)
def process_institution():
    """Merge import/catalog/institution*.csv into refine/institution.csv.

    Each input is normalized to UTF-8 via nkf, unicode hyphens are folded
    to ASCII '-', and both comma- and tab-delimited layouts are tried.
    All files must share one header row, and institution ids (column 0)
    must be unique across files.
    """
    institution_rows = []
    institution_fields = None
    for f in glob.glob("import/catalog/institution*.csv"):
        # Close the input promptly instead of leaking the handle.
        with open(f, "rb") as fp:
            raw = fp.read()
        data = normalize(nkf.nkf("-w", raw).decode("UTF-8"))
        # Fold unicode hyphen (U+2010) and minus sign (U+2212) to '-'.
        for hyphen in [b"\xe2\x80\x90", b"\xe2\x88\x92"]:
            data = data.replace(hyphen.decode("UTF-8"), "-")

        def fetch_fields(reader):
            # Return the first row with more than one column (the header);
            # a single-column row means the delimiter guess was wrong.
            for row in reader:
                if len(row) > 1:
                    return row
            return None

        if sys.version_info.major < 3:
            data = data.encode("UTF-8")
            readers = [
                csv.reader(io.BytesIO(data)),
                csv.reader(io.BytesIO(data), dialect="excel-tab")
            ]
        else:
            readers = [
                csv.reader(io.StringIO(data)),
                csv.reader(io.StringIO(data), dialect="excel-tab")
            ]
        for rd in readers:
            fields = fetch_fields(rd)
            if fields:
                if institution_fields is None:
                    institution_fields = fields
                else:
                    # Every file must carry the same header.
                    assert institution_fields == fields
                for r in rd:
                    r = [
                        x.replace("\r\n", " ").replace("\n", " ").replace("\r", " ").strip()
                        for x in r
                    ]
                    if not is_blank_row(r):
                        institution_rows.append(r)
                break
    # Institution ids must be globally unique.
    ids = [r[0] for r in institution_rows]
    assert len(ids) == len(set(ids))
    outname = "refine/institution.csv"
    if sys.version_info.major < 3:
        out_fp = open(outname, "wb")
    else:
        out_fp = codecs.open(outname, "wb", encoding="UTF-8")
    try:
        out = csv.writer(out_fp)
        out.writerow(institution_fields)
        # writerows instead of a side-effect list comprehension.
        out.writerows(institution_rows)
    finally:
        out_fp.close()
def extract_message(message):
    """Recursively extract decoded text bodies from an email message.

    Multipart messages return a flat list of extracted parts; leaf parts
    return a (Content-Type, decoded-body) tuple, or None for empty bodies.
    The charset is sniffed with chardet and Japanese encodings get
    special-cased conversion (nkf when available, codec aliases otherwise).
    """
    if message.is_multipart():
        messages = []
        for m in message.get_payload():
            extracted = EmailParser.extract_message(m)
            if extracted:
                if isinstance(extracted, list):
                    # nested multiparts are flattened into one list
                    messages.extend(extracted)
                else:
                    messages.append(extracted)
        return messages
    body = message.get_payload(decode=True)
    if not body:
        return None
    # Sniff the charset from the bytes rather than trusting the headers.
    charset = chardet.detect(body)['encoding']
    if charset is None:
        charset = 'utf-8'
    elif charset == 'ISO-2022-JP':
        if callable(nkf):
            # nkf handles vendor extensions beyond the stock codec
            body = nkf('-Jwx', body)
            charset = 'utf-8'
        else:
            charset = 'ISO-2022-JP-2004'
            # rewrite escape sequences so the -2004 codec accepts plain JIS
            body = body.replace(b'\033$B', b'\033$(Q').replace(b'\033(J', b'\033(B')
    elif charset == 'SHIFT_JIS':
        if callable(nkf):
            body = nkf('-Swx', body)
            charset = 'utf-8'
        else:
            # CP932 is the superset commonly meant by "Shift_JIS"
            charset = 'CP932'
    elif charset == 'EUC-JP':
        if callable(nkf):
            body = nkf('-Ew', body)
            charset = 'utf-8'
        else:
            charset = 'EUCJIS2004'
    return message['Content-Type'], body.decode(encoding=charset, errors='replace')
def process_institution():
    """Merge import/catalog/institution*.csv into refine/institution.csv.

    NOTE(review): this is a near-verbatim duplicate of another
    process_institution definition in this file — confirm which one is live.

    Each input is normalized to UTF-8 via nkf, unicode hyphens are folded
    to ASCII '-', and both comma- and tab-delimited layouts are tried.
    All files must share one header row, and ids must be unique.
    """
    institution_rows = []
    institution_fields = None
    for f in glob.glob("import/catalog/institution*.csv"):
        data = normalize(nkf.nkf("-w", open(f, "rb").read()).decode("UTF-8"))
        # Fold unicode hyphen (U+2010) and minus sign (U+2212) to '-'.
        for hyphen in [b"\xe2\x80\x90", b"\xe2\x88\x92"]:
            data = data.replace(hyphen.decode("UTF-8"), "-")

        def fetch_fields(reader):
            # First row with >1 column is the header; a single-column row
            # means the delimiter guess was wrong.
            for row in reader:
                if len(row) > 1:
                    return row
            return None

        if sys.version_info.major < 3:
            data = data.encode("UTF-8")
            readers = [csv.reader(io.BytesIO(data)),
                       csv.reader(io.BytesIO(data), dialect="excel-tab")]
        else:
            readers = [csv.reader(io.StringIO(data)),
                       csv.reader(io.StringIO(data), dialect="excel-tab")]
        for rd in readers:
            fields = fetch_fields(rd)
            if fields:
                if institution_fields is None:
                    institution_fields = fields
                else:
                    # every file must carry the same header
                    assert institution_fields == fields
                for r in rd:
                    r = [x.replace("\r\n", " ").replace("\n"," ").replace("\r"," ").strip()
                         for x in r]
                    if not is_blank_row(r):
                        institution_rows.append(r)
                break
    # institution ids (column 0) must be globally unique
    ids = [r[0] for r in institution_rows]
    assert len(ids) == len(set(ids))
    outname = "refine/institution.csv"
    if sys.version_info.major < 3:
        out = csv.writer(open(outname, "wb"))
    else:
        out = csv.writer(codecs.open(outname, "wb", encoding="UTF-8"))
    out.writerow(institution_fields)
    [out.writerow(r) for r in institution_rows]
def __init__(
        self,
        iterable,
        dialect='excel',
        error_mode="strict",
        encoding=None,
        headers=None,
        *args,
        **kwargs
):
    """Wrap *iterable* in a csv reader with a known encoding.

    io.StringIO input is re-encoded with the caller-supplied *encoding*;
    anything else is forced to UTF-8 through nkf. With *headers* set a
    plain csv.reader is used, otherwise a csv.DictReader.
    """
    if isinstance(iterable, io.StringIO):
        assert encoding
        self.encoding = encoding
        iterable = StringIO.StringIO(iterable.read().encode(encoding))
    else:
        # force utf8 with nkf module
        self.encoding = 'utf8'
        iterable = StringIO.StringIO(nkf.nkf('-w', iterable.read()))
    self.headers = headers
    # Conditional expression instead of the fragile `cond and a or b` idiom.
    self.reader = (
        csv.reader(iterable, dialect=dialect, *args, **kwargs)
        if headers else
        csv.DictReader(iterable, dialect=dialect, *args, **kwargs)
    )
    self.dialect = self.reader.dialect
    self.line_num = 1
    self.error_mode = error_mode
def __init__(self, iterable, dialect='excel', error_mode="strict",
             encoding=None, headers=None, *args, **kwargs):
    """Wrap *iterable* in a csv reader with a known encoding.

    NOTE(review): near-verbatim duplicate of another __init__ in this
    file — confirm which definition is live.

    io.StringIO input is re-encoded with the caller-supplied *encoding*;
    anything else is forced to UTF-8 through nkf. With *headers* set a
    plain csv.reader is used, otherwise a csv.DictReader.
    """
    if isinstance(iterable, io.StringIO):
        assert encoding
        self.encoding = encoding
        iterable = StringIO.StringIO(iterable.read().encode(encoding))
    else:
        # force utf8 with nkf module
        self.encoding = 'utf8'
        iterable = StringIO.StringIO(nkf.nkf('-w', iterable.read()))
    self.headers = headers
    # `headers and A or B`: reader objects are always truthy, so this
    # behaves like a ternary — picks csv.reader when headers is set.
    self.reader = headers and \
        csv.reader(iterable, dialect=dialect, *args, **kwargs) or \
        csv.DictReader(iterable, dialect=dialect, *args, **kwargs)
    self.dialect = self.reader.dialect
    self.line_num = 1
    self.error_mode = error_mode
def get_pos_detail3(self): return self.pos_detail3 # Wordクラスの情報出力関数 def print_word(word): print(word.get_surface()) print(word.get_pos()) print(word.get_pos_detail1()) print(word.get_pos_detail2()) print(word.get_pos_detail3()) # 日本語を標準出力できるように sys.stdout = codecs.getwriter("utf_8")(sys.stdout) contents = open("./hoge.txt").read() contents = nkf.nkf("-w -d", contents) # 形態素解析する # 注意:MeCab解析する文字列は必ずencodeされていること. # 結果は,decodeして使用すること. # 参考:http://shogo82148.github.io/blog/2012/12/15/mecab-python/ result = MeCab.Tagger("")\ .parse(contents)\ .decode("utf-8") # 形態素に分解して,word_arrに突っ込む lines = result.split("\n") pattern = r"^(.*?)\t(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?)$" doc_arr = [] for line in lines: word_arr = []
def htmltotext(os_pathname):
    """Read the HTML file at *os_pathname*, normalize it to UTF-8 with
    nkf, and return the text extracted by elinks.
    """
    # Context manager closes the handle (original leaked it).
    with open(os_pathname) as fp:
        html = fp.read()
    html = nkf.nkf("-w", html)
    return elinks(html)
def utf8_cleanup(text):
    """Normalize *text* to UTF-8 bytes.

    Native str values are converted through nkf; anything else is
    assumed to be text and encoded as UTF-8 directly.
    """
    if not isinstance(text, str):
        return text.encode("utf-8")
    return nkf.nkf("-w", text)
def decode(self, input, errors='strict'):
    """Codec decode: ISO-2022-JP (JIS) bytes -> unicode.

    nkf converts the JIS input to UTF-8, which is then decoded with
    replacement for any malformed sequences.
    """
    utf8_bytes = nkf.nkf('-m0 -x -J -w', input)
    text = unicode(utf8_bytes, 'utf8', 'replace')
    return text, len(input)
def do(filename):
    """Read the HTML file *filename*, normalize it to UTF-8 with nkf,
    and return the text extracted by elinks.
    """
    # Context manager closes the handle (original leaked it).
    with open(filename) as fp:
        html = fp.read()
    html = nkf.nkf("-w", html)
    return elinks.extract_from_html(html)
def encode(self, input, errors='strict'):
    """Codec encode: unicode -> ISO-2022-JP (JIS) bytes.

    The text is first serialized as UTF-8 (replacing unencodable
    characters), then converted to JIS by nkf.
    """
    utf8_bytes = input.encode('utf8', 'replace')
    jis_bytes = nkf.nkf('-m0 -x -W -j', utf8_bytes)
    return (jis_bytes, len(input))
def pdftotext(os_pathname):
    """Extract the text of a PDF with /usr/bin/pdftotext and return it
    normalized to UTF-8 via nkf.
    """
    raw = process_output(["/usr/bin/pdftotext", os_pathname, "-"])
    return nkf.nkf("-w", raw)
def text(os_pathname):
    """Return the file's content normalized to UTF-8 via nkf.

    Files larger than 10 MiB are refused with a placeholder string
    instead of being read into memory.
    """
    if os.stat(os_pathname).st_size > 1024 * 1024 * 10:
        return "***TOO LARGE TEXT FILE***"
    # Context manager closes the handle (original leaked it).
    with open(os_pathname) as fp:
        return nkf.nkf("-w", fp.read())
def to_hiragana(s):
    """Convert katakana in *s* to hiragana with nkf and return the
    result as UTF-8 text.
    """
    converted = nkf.nkf("-w --hiragana", s)
    return converted.decode("utf-8")
# Wordクラスの情報出力関数 def print_word(word): print(word.get_surface()) print(word.get_pos()) print(word.get_pos_detail1()) print(word.get_pos_detail2()) print(word.get_pos_detail3()) # 日本語を標準出力できるように sys.stdout = codecs.getwriter("utf_8")(sys.stdout) contents = open("./appry.txt").read() contents = nkf.nkf("-w -d", contents) # 形態素解析する # 注意:MeCab解析する文字列は必ずencodeされていること. # 結果は,decodeして使用すること. # 参考:http://shogo82148.github.io/blog/2012/12/15/mecab-python/ result = MeCab.Tagger("")\ .parse(contents)\ .decode("utf-8") # 形態素をWordクラスにして,その配列を作る lines = result.split("\n") pattern = r"^(.*?)\t(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?)$" word_arr = [] for line in lines: iterator = re.finditer(pattern, line)
def load_mai2009(filename):
    """Load the Mainichi-2009 corpus file, normalize it to UTF-8 with
    nkf, strip trailing whitespace, and return it as text.
    """
    # Context manager closes the handle (original leaked it).
    with open(filename) as fp:
        text = fp.read()
    text = nkf.nkf('-w', text)
    text = text.rstrip()
    return text.decode('utf-8')
def encode(self, input, errors='strict'):
    """Codec encode: unicode -> Shift_JIS bytes.

    The text is first serialized as UTF-8 (replacing unencodable
    characters), then converted to Shift_JIS by nkf.
    """
    utf8_bytes = input.encode('utf8', 'replace')
    sjis_bytes = nkf.nkf('-m0 -x -W -s', utf8_bytes)
    return (sjis_bytes, len(input))
def decode(self, input, errors='strict'):
    """Codec decode: Shift_JIS bytes -> unicode.

    nkf converts the Shift_JIS input to UTF-8, which is then decoded
    with replacement for any malformed sequences.
    """
    utf8_bytes = nkf.nkf('-m0 -x -S -w', input)
    text = unicode(utf8_bytes, 'utf8', 'replace')
    return text, len(input)