/
preprocessing.py
115 lines (94 loc) · 3.86 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import zenhan
SURROUNDS = (
re.compile(ur'\(.*\).*'),
re.compile(ur'\(.*\).*'),
re.compile(ur'\(.*\).*'),
re.compile(ur'\(.*\).*'),
re.compile(ur'\[.*\].*'),
re.compile(ur'\<.*\>.*'),
re.compile(ur'<.*>.*'),
re.compile(ur'【.*】.*'),
)
UNCLOSED_PAREN = re.compile(ur"(\w+)\([^(]+", re.UNICODE)
STARTS_WITH_ALPHA = re.compile(ur"^[a-zA-Z][.:()()\s]*([^a-zA-Z]+)")
ENDS_WITH = re.compile(ur"(等|など)$")
SPLIT = re.compile(ur'or|OR|または|又は|/|\+|あれば|、|・')
OPTIONAL_START = re.compile(ur'^(好みの|お好みの|お好みにより|あれば|お好きの|お好きな|市販の)')
SPECIAL_SYMBOLS = (
re.compile(ur'\*'),
re.compile(ur'@'),
# re.compile(ur'\u30fb'), # "・"
re.compile(ur'[\u2000-\u206f]'), # punctuation
re.compile(ur'[\u2460-\u24ff]'), # enclosed alphanumerics
re.compile(ur'[\u2500-\u257f]'), # box drawing
re.compile(ur'[\u25a0-\u25ff]'), # geometric shapes
re.compile(ur'[\u2600-\u26ff]'), # miscellaneous symbols
re.compile(ur'[\u2700-\u27bf]'), # dingbats
re.compile(ur'[\u3000|\u3002-\u303f]'), # cjk symbols and punctuation
# except "、"
re.compile(ur'[\uff00-\uffef]'), # halfwidth and fullwdith forms
)
def make_function_hiragana():
re_katakana = re.compile(ur'[ァ-ヴ]')
def hiragana(text):
"""ひらがな変換"""
return re_katakana.sub(lambda x: unichr(ord(x.group(0)) - 0x60), text)
return hiragana
hiragana = make_function_hiragana()
def normalize(ingredient):
ingredient = ingredient.strip()
for SURROUND in SURROUNDS:
ingredient = SURROUND.sub(lambda s: '', ingredient)
ingredient = OPTIONAL_START.sub(lambda s: '', ingredient)
match = UNCLOSED_PAREN.match(ingredient)
if match:
ingredient = match.groups()[0]
ingredient = zenhan.z2h(ingredient, mode=1) # ascii
ingredient = zenhan.h2z(ingredient, mode=4) # kana
# convert all katakana to hiragana
ingredient = hiragana(ingredient)
match = STARTS_WITH_ALPHA.match(ingredient)
if match and not ingredient.startswith('S&B'):
ingredient = match.groups()[0]
for SPECIAL_SYMBOL in SPECIAL_SYMBOLS:
ingredient = SPECIAL_SYMBOL.sub(lambda s: '', ingredient)
ingredients = SPLIT.split(ingredient)
ingredients = map(lambda ingr: ENDS_WITH.sub(lambda s: '', ingr), ingredients)
ingredients = map(lambda ingr: ingr.strip(), ingredients)
ingredients = filter(lambda ingr: ingr, ingredients)
for ingredient in ingredients:
yield ingredient
if __name__ == '__main__':
ingredients = [
u'a醤油',
u'Bごま油',
u'A 塩',
u'A.醤油',
u'A)片栗粉',
u'里芋(冷凍もの可',
u'ケチャップ+ソース+醤油',
u'(じゃこ白胡麻海苔',
u'EXオリーブオイル',
u'スィートチリソース(今回はID:1384495 を使いました。)',
u'梅干し(ほぐして種ごと',
u'あればローリエ',
u'お好きな葉野菜',
u'酒あれば泡盛',
u'市販のカレールー',
u'しめじ、えのき等',
u'キウリ、ハム、錦糸卵 ',
u'水・ごま油・酒・すり胡麻',
u'ニンニク(みじん切り)...A ',
u'ニンニク<みじん切り>...A ',
u'ニンニク<みじん切り>...A ',
u'豚薄切り肉【巻ける位のものなら何でも】',
u'一味唐辛子or七味唐辛子',
u'生Cilantro(コリアンダー)',
u'@醤油',
]
for ingredient in ingredients:
for normalized_ingredient in normalize(ingredient):
print normalized_ingredient.encode('utf8')