def parseyomi(t):
    """Extract the first yomigana (reading) from a translation entry.

    Two layouts are handled:
    - text starting with '@': the result is whatever skstr.findbetween
      yields between '@' and the following newline;
    - text starting with '- {': the candidate is taken between '- {' and
      '}', and is returned only when it is non-empty and its first
      character passes jpchars.iskana.

    Any other input yields None.

    @param  t  unicode
    @return  unicode or None
    """
    if t.startswith('@'):
        return skstr.findbetween(t, '@', '\n')
    if not t.startswith('- {'):
        return None
    reading = skstr.findbetween(t, '- {', '}')
    if not reading:
        return None
    # Only accept the candidate when it begins with a kana character.
    return reading if jpchars.iskana(reading[0]) else None
def parseyomi(t, sep=None):
    """Return the yomigana enclosed in u'【' ... u'】' within a translation.

    @param  t  unicode
    @param*  sep  unicode  separator for multiple rules; accepted but
                           currently ignored (not implemented)
    @return  unicode  the value skstr.findbetween yields for the bracket
                      pair, or '' when that value is falsy
    """
    yomi = skstr.findbetween(t, u'【', u'】')
    if yomi:
        return yomi
    return ''
def parserole(t, sep=None):
    """Return the role enclosed in '[' ... ']' within a translation.

    @param  t  unicode
    @param*  sep  unicode  separator for multiple rules; accepted but
                           currently ignored (not implemented)
    @return  unicode  the value skstr.findbetween yields for the bracket
                      pair, or '' when that value is falsy
    """
    role = skstr.findbetween(t, '[', ']')
    if role:
        return role
    return ''
def _parsedesc(self, h):
    """Extract the markup located between the description comment markers.

    @param  h  unicode  html
    @return  unicode  whatever skstr.findbetween yields for the two markers
                      (presumably None when they are missing -- TODO confirm)
    """
    begin_marker = '<!-- description -->'
    end_marker = '<!-- /description -->'
    return skstr.findbetween(h, begin_marker, end_marker)
def _iterparse(self, h):
    """Yield one dict per product entry matched in the given html.

    Only the region between '<div class="relative">' and the
    u'<!-- ▲メイン -->' comment is scanned with self._rx_parse.  Regex
    group 1 is used as the brand, group 2 as the product id and group 3
    as the title.  A match is skipped when its id or title is empty, or
    when the id is not a non-zero integer.

    @param  h  unicode
    @yield  {kw}  dict with keys 'id' (int), 'url', 'title', 'brand'
    """
    h = skstr.findbetween(h, '<div class="relative">', u'<!-- ▲メイン -->')
    if not h:
        return
    for m in self._rx_parse.finditer(h):
        brand, key, title = m.group(1, 2, 3)
        if not (key and title):
            continue
        try:
            key = int(key)
        except ValueError:
            # Non-numeric id: skip the entry.  This used to be a bare
            # `except:`, which would also have hidden KeyboardInterrupt
            # and unrelated programming errors.
            continue
        if not key:
            continue
        yield {
            'id': key,
            'url': "https://www.melonbooks.co.jp/detail/detail.php?product_id=%s" % key,
            'title': unescapehtml(title),
            'brand': unescapehtml(brand),
            # 'price' is not parsed here
        }
def parsedef(line):
    """Extract the text that skstr.findbetween locates between '{' and '}'.

    @param  line  str
    @return  unicode or None
    """
    # Imported at call time, exactly as the original did.
    from sakurakit import skstr
    definition = skstr.findbetween(line, '{', '}')
    return definition
def translate(text, to='en', fr='ja'):
    """Translate text by POSTing it to HONYAKU_API and decoding the reply.

    The response body is searched for the span between HONYAKU_TEXT_START
    and HONYAKU_TEXT_STOP; that span is decoded as UTF-8 (undecodable
    bytes are ignored) and passed through skstr.unescapehtml.

    @param  text  unicode not None
    @param*  fr  unicode not None, must be valid language code
    @param*  to  unicode not None, must be valid language code
    @return  unicode or None

    None is returned when the HTTP response is not OK, when the translated
    span cannot be located in the response, or when the connection fails
    (the failure is reported through dwarn).
    """
    try:
        r = session.post(
            HONYAKU_API,
            headers=GZIP_HEADERS,
            data={
                'SSRC': text,
                'SLANG': niftydef.nifty_lang(fr),
                'TLANG': niftydef.nifty_lang(to),
                # 'txtDirection' and 'XMODE' are not needed by the service
            },
        )
        # Example response:
        # ret('', {"translatedText":"If you can be calm, true-kun or 210 Yen."}, 200, null, null);
        if r.ok:
            ret = skstr.findbetween(r.content, HONYAKU_TEXT_START, HONYAKU_TEXT_STOP)
            # Guard against a reply without the expected markers; calling
            # .decode on None would raise AttributeError instead of
            # honouring the documented "unicode or None" contract.
            if ret is not None:
                ret = ret.decode('utf8', errors='ignore')
                return skstr.unescapehtml(ret)
    except requests.ConnectionError as e:
        # `except X as e` is accepted by Python 2.6+ and Python 3 alike,
        # unlike the old `except X, e` spelling.
        dwarn("connection error", e.args)
    return None
def _parsetitle(self, h):
    """Extract the title found between 'width:auto;">' and '</strong>'.

    The candidate is rejected when it is empty or still contains a '<'
    (i.e. leftover markup); otherwise it is passed through unescapehtml.

    @param  h  unicode  html
    @return  unicode or None
    """
    title = skstr.findbetween(h, 'width:auto;">', '</strong>')
    if not title or '<' in title:
        return None
    return unescapehtml(title)
def _parseintro(self, h):
    """Extract the introduction found after '<div class="richeditor">'.

    The span ends at the next u'<!--'.  A non-empty result is returned
    with surrounding whitespace stripped; a falsy result from
    skstr.findbetween is returned unchanged.

    @param  h  unicode  html
    @return  unicode
    """
    intro = skstr.findbetween(h, '<div class="richeditor">', u'<!--')
    return intro.strip() if intro else intro
def addFile(self, path):
    """Load radical decompositions from a text file into self.chars / self.rads.

    Each line is split at its first ':' into a key (left) and a payload
    (right).  The comma-separated items found between '(' and ')' in the
    payload become the decomposition tuple for that key.

    The file is read in two sections, switched by a sticky flag: once a
    line whose key is shorter than 5 characters has been seen, that line
    and every following one is stored in self.chars under its key.  Lines
    before that point are treated as radical definitions: the key is
    parsed as an int, RAD_BASE is subtracted, and the tuple is appended to
    self.rads after padding the list with None up to that index.

    @param  path  unicode
    @return  NOTE(review): no return statement is visible in this block,
             so the call evaluates to None (the old docstring said bool)
    @raise  whatever codecs.open, int() or the decoding below raise;
            nothing is caught here
    """
    import codecs
    RAD_BASE = 10000  # offset subtracted from numeric radical ids
    with codecs.open(path, 'r', self.ENCODING) as f:
        charSection = False
        for line in f:
            left, mid, right = line.partition(':')
            # A former check that stopped at keys of length 2..4
            # ("stop at utf16 character") is disabled.
            # Sticky: stays True once the first short key has been seen.
            charSection = charSection or len(left) < 5
            # NOTE(review): presumably findbetween returns None when the
            # payload has no parentheses, in which case .split raises
            # AttributeError -- TODO confirm.
            radicals = findbetween(right, '(', ')').split(',')
            for i, r in enumerate(radicals):
                if len(r) == 3:
                    # presumably a single character stored as 3 UTF-8
                    # bytes -- TODO confirm against self.ENCODING
                    radicals[i] = r.decode('utf8')
                elif len(r) >= 5:
                    # numeric radical id: convert to an offset from RAD_BASE
                    radicals[i] = int(r) - RAD_BASE
                # Items of any other length (the original comment mentions
                # UTF32 characters) are kept unchanged, not skipped.
            # str.split always returns at least one item, so this is
            # always true here.
            if radicals:
                radicals = tuple(
                    radicals
                )  # use tuple instead of list to significantly reduce memory usage
                if left == u'密':
                    # NOTE(review): `l` is never used; looks like leftover
                    # debugging code.
                    l = findbetween(right, '(', ')').split(',')
                if charSection:  # character section
                    # (a former assertion that len(left) == 1 is disabled)
                    self.chars[left] = radicals
                else:  # radical section
                    index = int(left) - RAD_BASE
                    # Pad so the new entry lands at position `index`.
                    while len(self.rads) < index:
                        self.rads.append(None)
                    self.rads.append(radicals)
def _parsecreator(self, h, key):
    """Extract the '/'-separated creator names listed after '<br>KEY:'.

    The span runs from '<br>%s:' % key to the next '<br>'.  It is rejected
    when it is empty or still contains a '<'; otherwise it is passed
    through unescapehtml and split on '/'.

    @param  h  unicode  html
    @param  key  unicode
    @return  [unicode]  not None
    """
    names = skstr.findbetween(h, '<br>%s:' % key, '<br>')
    if not names or '<' in names:
        return []
    return unescapehtml(names).split('/')
def _parseprice(self, h):
    """Extract the price that follows u'<td class="price">¥' in the html.

    The text up to the next '<' is stripped, thousands separators (',')
    are removed and the remainder is converted with int().

    @param  h  unicode  html
    @return  int  the parsed price, or 0 when it cannot be parsed
    """
    try:
        text = skstr.findbetween(h, u'<td class="price">¥', '<')
        return int(text.strip().replace(',', ''))
    except Exception:
        # Best effort: a missing price cell or a non-numeric value maps
        # to 0.  `except Exception` (rather than a bare `except:`) keeps
        # that behaviour without swallowing KeyboardInterrupt/SystemExit.
        return 0
def _parsebrand(self, h):
    """Extract the brand name from the html.

    The first source is the text between u'の通販・購入(' and u')'.  When
    that is falsy, group 1 of self._rx_brand_circle is used instead if
    the pattern matches.  The candidate is rejected when it is empty or
    still contains a '<'; otherwise it is passed through unescapehtml.

    @param  h  unicode  html
    @return  unicode or None
    """
    brand = skstr.findbetween(h, u'の通販・購入(', u')')
    if not brand:
        found = self._rx_brand_circle.search(h)
        brand = found.group(1) if found else brand
    if not brand or '<' in brand:
        return None
    return unescapehtml(brand)
def parsedef(t):
    """Get a short definition out of a translation entry.

    Works on the text between '<N>' and '</N>':
    1. for each of u',', u';', u'(' in that order, keep only what
       precedes its first occurrence;
    2. for each of u']', u'>', u')', u'】', u'~' in that order, keep only
       what follows its first occurrence;
    3. give up (return '') if the remainder contains '...' or u'“';
    4. strip surrounding whitespace.

    @param  t  unicode
    @return  unicode  possibly empty, never None
    """
    body = skstr.findbetween(t, '<N>', '</N>')
    if not body:
        return ''
    # Trailing cut: drop everything from the first terminator onwards.
    for stop in u',;(':
        pos = body.find(stop)
        if pos >= 0:
            body = body[:pos]
    # Leading cut: drop everything up to and including the first marker.
    for lead in u']>)】~':
        pos = body.find(lead)
        if pos >= 0:
            body = body[pos + 1:]
    if any(mark in body for mark in ('...', u'“')):
        return ''
    return body.strip() or ''