Exemple #1
0
def parseyomi(t):
  """Extract the first yomigana from a translation entry.

  Two layouts are handled:
  - text starting with '@': the result is whatever skstr.findbetween
    finds between '@' and a newline;
  - text starting with '- {': the result is the text between '- {' and '}',
    accepted only when its first character passes jpchars.iskana.

  @param  t  unicode
  @return  unicode or None
  """
  if t.startswith('@'):
    return skstr.findbetween(t, '@', '\n')
  if not t.startswith('- {'):
    return None
  yomi = skstr.findbetween(t, '- {', '}')
  if yomi and jpchars.iskana(yomi[0]):
    return yomi
  return None
Exemple #2
0
def parseyomi(t, sep=None):
    """Extract the yomigana found between the brackets 【 and 】.

    @param  t  unicode
    @param* sep  unicode  separator for multiple rules (accepted but unused)
    @return  unicode  empty string when nothing is found
    """
    yomi = skstr.findbetween(t, u'【', u'】')
    if yomi:
        return yomi
    # sep is not implemented, so there is never more than one result.
    return ''
Exemple #3
0
def parserole(t, sep=None):
    """Extract the role found between '[' and ']' in a translation.

    @param  t  unicode
    @param* sep  unicode  separator for multiple rules (accepted but unused)
    @return  unicode  empty string when nothing is found
    """
    role = skstr.findbetween(t, '[', ']')
    if role:
        return role
    # sep is not implemented, so there is never more than one result.
    return ''
Exemple #4
0
 def _parsedesc(self, h):
     """Return the HTML located between the description comment markers.

     @param  h  unicode  html
     @return  unicode  result of skstr.findbetween for the two markers
     """
     start = '<!-- description -->'
     stop = '<!-- /description -->'
     return skstr.findbetween(h, start, stop)
Exemple #5
0
 def _iterparse(self, h):
     """Yield one dict per product matched in the main listing section.

     Only the part of the page between '<div class="relative">' and the
     '<!-- ▲メイン -->' marker is scanned with self._rx_parse.  A match is
     skipped when its key or title group is empty, or when the key is not
     a non-zero integer.

     @param  h  unicode
     @yield  {'id': int, 'url': str, 'title': unicode, 'brand': unicode}
     """
     h = skstr.findbetween(h, '<div class="relative">', u'<!-- ▲メイン -->')
     if not h:
         return
     for m in self._rx_parse.finditer(h):
         brand = m.group(1)
         key = m.group(2)
         title = m.group(3)
         if not (key and title):
             continue
         try:
             key = int(key)
         except (TypeError, ValueError):
             # Non-numeric product id: ignore this match instead of failing.
             continue
         if not key:
             continue
         # The price is not parsed here.
         yield {
             'id': key,
             'url':
             "https://www.melonbooks.co.jp/detail/detail.php?product_id=%s"
             % key,
             'title': unescapehtml(title),
             'brand': unescapehtml(brand),
         }
Exemple #6
0
def parsedef(line):
    """Return what skstr.findbetween finds between '{' and '}' in a line.

    @param  line  str
    @return  unicode or None
    """
    # Imported lazily, inside the function, as in the original code.
    from sakurakit import skstr
    definition = skstr.findbetween(line, '{', '}')
    return definition
Exemple #7
0
def translate(text, to='en', fr='ja'):
    """Translate text by posting it to HONYAKU_API.

    The translated payload is cut out of the raw response body (the part
    between HONYAKU_TEXT_START and HONYAKU_TEXT_STOP), decoded as UTF-8 with
    undecodable bytes dropped, and HTML-unescaped.

    @param  text  unicode not None
    @param* to  unicode not None, must be valid language code
    @param* fr  unicode not None, must be valid language code
    @return  unicode or None

    None is returned when the response status is not OK, when the payload
    markers yield no result, or when the connection fails (the failure is
    reported through dwarn).
    """
    try:
        r = session.post(
            HONYAKU_API,
            headers=GZIP_HEADERS,
            data={
                'SSRC': text,
                'SLANG': niftydef.nifty_lang(fr),
                'TLANG': niftydef.nifty_lang(to),
                # 'txtDirection' and 'XMODE' are not needed by the service.
            },
        )

        # Example body:
        # ret('', {"translatedText":"If you can be calm, true-kun or 210 Yen."}, 200, null, null);
        if not r.ok:
            return None

        ret = skstr.findbetween(r.content, HONYAKU_TEXT_START,
                                HONYAKU_TEXT_STOP)
        if ret is None:
            # Unexpected body layout: nothing to decode.
            return None
        ret = ret.decode('utf8', errors='ignore')
        return skstr.unescapehtml(ret)

    except requests.ConnectionError as e:
        dwarn("connection error", e.args)
    return None
Exemple #8
0
 def _parsetitle(self, h):
     """Extract the title located between 'width:auto;">' and '</strong>'.

     A candidate that still contains '<' is rejected.

     @param  h  unicode  html
     @return  unicode or None  HTML-unescaped title
     """
     title = skstr.findbetween(h, 'width:auto;">', '</strong>')
     if not title or '<' in title:
         return None
     return unescapehtml(title)
Exemple #9
0
 def _parseintro(self, h):
     """Extract the introduction that follows '<div class="richeditor">'.

     The text runs up to the next '<!--' and is stripped of surrounding
     whitespace; a falsy lookup result is handed back unchanged.

     @param  h  unicode  html
     @return  unicode
     """
     intro = skstr.findbetween(h, '<div class="richeditor">', u'<!--')
     return intro.strip() if intro else intro
Exemple #10
0
    def addFile(self, path):
        """Load a radical dictionary file into self.chars and self.rads.

        Every line is split at its first ':'.  The comma-separated items
        found between '(' and ')' on the right-hand side form the entry's
        radicals.  Lines are treated as radical definitions until the first
        line whose left-hand side is shorter than 5 characters; from that
        line on, every line is treated as a character entry.

        @param  path  unicode
        @return  bool  NOTE(review): no value is returned explicitly, so
                 callers currently receive None -- confirm what is expected
        @raise  whatever codecs.open raises when the file cannot be read, and
                ValueError when a radical index is not a valid integer
        """
        import codecs
        RAD_BASE = 10000
        with codecs.open(path, 'r', self.ENCODING) as f:
            charSection = False
            for line in f:
                left, _, right = line.partition(':')
                # Sticky switch: once a short key is seen, the rest of the
                # file is handled as the character section.
                charSection = charSection or len(left) < 5

                radicals = findbetween(right, '(', ')').split(',')
                for i, r in enumerate(radicals):
                    if len(r) == 3:
                        # presumably one UTF-8 encoded character -- TODO confirm
                        radicals[i] = r.decode('utf8')
                    elif len(r) >= 5:
                        # Numeric reference, stored relative to RAD_BASE.
                        radicals[i] = int(r) - RAD_BASE
                    # Items of any other length (e.g. UTF32 characters) are
                    # kept as they are.

                # split() always yields at least one item; a tuple is used
                # instead of a list to significantly reduce memory usage.
                radicals = tuple(radicals)

                if charSection:  # character section
                    self.chars[left] = radicals

                else:  # radical section
                    index = int(left) - RAD_BASE
                    # Pad with None so this entry is appended at position
                    # `index` when entries arrive with gaps.
                    while len(self.rads) < index:
                        self.rads.append(None)
                    self.rads.append(radicals)
Exemple #11
0
 def _parsecreator(self, h, key):
     """Extract the creators listed after '<br>KEY:' up to the next '<br>'.

     The value is HTML-unescaped and split on '/'.  A value that still
     contains '<' is rejected.

     @param  h  unicode  html
     @param  key  unicode
     @return  [unicode] not None
     """
     marker = '<br>%s:' % key
     names = skstr.findbetween(h, marker, '<br>')
     if not names or '<' in names:
         return []
     return unescapehtml(names).split('/')
Exemple #12
0
 def _parseprice(self, h):
     """Extract the price from the '<td class="price">' cell.

     The text after the yen sign and before the next '<' is stripped, its
     thousands separators (',') are removed, and it is converted to int.

     @param  h  unicode  html
     @return  int  0 when the price is missing or is not a number
     """
     try:
         text = skstr.findbetween(h, u'<td class="price">¥', '<')
         return int(text.strip().replace(',', ''))
     except (AttributeError, TypeError, ValueError):
         # AttributeError/TypeError: nothing usable was found (e.g. None);
         # ValueError: the cell content is not an integer.
         return 0
Exemple #13
0
 def _parsebrand(self, h):
     """Extract the brand name from the page.

     The parenthesised text after 'の通販・購入' is tried first; when that
     yields nothing, group 1 of self._rx_brand_circle is used instead.  A
     candidate that still contains '<' is rejected.

     @param  h  unicode  html
     @return  unicode or None  HTML-unescaped brand
     """
     brand = skstr.findbetween(h, u'の通販・購入(', u')')
     if not brand:
         m = self._rx_brand_circle.search(h)
         brand = m.group(1) if m else brand
     if not brand or '<' in brand:
         return None
     return unescapehtml(brand)
Exemple #14
0
def parsedef(t):
    """Get a short definition out of the <N>...</N> part of a translation.

    The text is cut at the first occurrence of each leading delimiter, then
    everything up to and including each closing delimiter is dropped.
    Results containing '...' or an opening curly quote are discarded.

    @param  t  unicode
    @return  unicode  empty string when there is no usable definition
    """
    t = skstr.findbetween(t, '<N>', '</N>')
    if not t:
        return ''
    # Keep only what precedes each of these characters.
    for c in u',;(':
        t = t.partition(c)[0]
    # Keep only what follows each of these characters, when present.
    for c in u']>)】~':
        _, found, rest = t.partition(c)
        if found:
            t = rest
    if '...' in t or u'“' in t:
        return ''
    return t.strip() or ''