Beispiel #1
0
 def _iterparse(self, h):
     """Iterate over the products found in a search result page.

     Only the text that skstr.findbetween returns for the
     '<div class="relative">' start marker and the main-section end
     comment is scanned.  A match is skipped when its key or title is
     empty, or when the key does not convert to a non-zero integer.

     @param  h  unicode
     @yield  {kw}  keys: 'id' (int), 'url' (str), 'title', 'brand'
     """
     h = skstr.findbetween(h, '<div class="relative">', u'<!-- ▲メイン -->')
     if not h:
         return
     for m in self._rx_parse.finditer(h):
         brand = m.group(1)
         key = m.group(2)
         title = m.group(3)
         if not (key and title):
             continue
         try:
             key = int(key)
         except (TypeError, ValueError):
             # Only conversion failures are expected here; anything else
             # should propagate instead of being silently swallowed.
             continue
         if not key:
             continue
         yield {
             'id': key,
             'url':
             "https://www.melonbooks.co.jp/detail/detail.php?product_id=%s"
             % key,
             'title': unescapehtml(title),
             'brand': unescapehtml(brand),
             #'price': price, # price is not parsed here
         }
Beispiel #2
0
  def _iterparse(self, h):
    """Iterate over the items found by self._rx_parse in the page.

    Each match is searched again for url/title, brand, image and price.
    A match is skipped when it has no url/title, or when either
    self._parseurlid or self._parseurlpath returns a false value for the url.

    @param  h  unicode
    @yield  {kw}  keys: 'url', 'id', 'title', 'image', 'brand', 'price'
    """
    for m in self._rx_parse.finditer(h):
      hh = m.group()
      mm = self._rx_url_title.search(hh)
      if not mm:
        continue
      url = mm.group(1)
      title = clean_title(unescapehtml(mm.group(2)))

      itemid = self._parseurlid(url)
      path = self._parseurlpath(url)
      if not (itemid and path):
        continue

      mm = self._rx_brand.search(hh)
      brand = unescapehtml(mm.group(1)) if mm else None

      mm = self._rx_img.search(hh)
      img = mm.group(1).replace('r.jpg', '.jpg') if mm else None

      mm = self._rx_price.search(hh)
      try:
        price = int(mm.group(1).replace(',', ''))
      except (AttributeError, TypeError, ValueError):
        # No price match (mm or its group is None), or the text is not a
        # number; unrelated errors are no longer swallowed.
        price = 0

      yield {
        'url': "http://www.toranoana.jp" + url,     # str not None
        'id': itemid,
        'title': title, # unicode not None
        'image': img, # str or None
        'brand': brand, # unicode or None
        'price': price, # int not None
      }
Beispiel #3
0
    def _iterparse(self, h):
        """Iterate over the games listed between the two section headings.

        The first item yielded is the matched game, with 'date' and 'brand'
        parsed from the whole page.  The remaining items come from
        self._rx_product; each is assigned the last year heading that starts
        at or before the product's own position.  Nothing is yielded when the
        section or the matched game cannot be found.  A ValueError raised by
        int() ends the iteration after a warning.

        @param  h  unicode
        @yield  {kw}
        """
        try:
            begin = h.find(u"検索結果(タイトル)")  # int
            end = h.find(u"ブランド別製品リスト")  # int
            if not (begin > 0 and end > begin):
                return
            section = h[begin:end]

            # [(int year, int offset of the year heading in section)]
            years = [(int(ym.group(1)), ym.start())
                     for ym in self._rx_year.finditer(section)]
            if not years:
                dwarn("cannot find release years, maybe, unknown years")

            # The matched game comes first
            first_id = first_title = None
            found = self._rx_first_id.search(section)
            if found:
                first_id = int(found.group(1))
                if first_id:
                    found = self._rx_first_title.search(section)
                    if found:
                        first_title = unescapehtml(found.group(1))

            if not (first_id and first_title):
                return

            first_year = years[0][0] if years else None
            first_brand = self._parsebrand(h)
            yield {
                'id': first_id,
                'title': first_title,
                'date': self._parsedate(h),
                'brand': first_brand,
                'year': first_year,
            }

            # Then every product, dated by comparing its offset against the
            # offsets of the year headings
            for pm in self._rx_product.finditer(section):
                product_id = int(pm.group(1))
                product_title = unescapehtml(pm.group(2))
                product_year = None
                for y, offset in years:
                    if offset > pm.start():
                        break
                    product_year = y
                yield {
                    'id': product_id,
                    'title': product_title,
                    'year': product_year,
                }

        except ValueError:  # raised by int()
            dwarn("failed to convert to int")
Beispiel #4
0
 def _iterparsewriters(self, h):
     """Iterate over the staff names in the writers info of the page.

     Yields nothing when self._rx_info_writers does not match.

     @param  h  unicode  html
     @yield  unicode
     """
     found = self._rx_info_writers.search(h)
     if not found:
         return
     staffline = unescapehtml(found.group(1))
     for staff in self._rx_staff.finditer(staffline):
         yield unescapehtml(staff.group(1))
Beispiel #5
0
def t_unicode(t):
    """Unescape, decode as UTF-8 and strip the given text.

    Bytes that cannot be decoded are ignored.

    @param  t  str
    @return  unicode or None  None when t is empty or None
    """
    if not t:
        return None
    decoded = unescapehtml(t).decode('utf8', errors='ignore')
    return decoded.strip()
Beispiel #6
0
def translate(text, to='en', fr='ja'):
    """Request a translation of text and return the extracted result.

    The result is taken from the first group of __re_search in the response
    body, decoded as UTF-8 (undecodable bytes ignored) and unescaped.

    @param  text  unicode not None
    @param  fr  unicode not None, must be valid language code
    @param  to  unicode not None, must be valid language code
    @return  unicode or None  None when the connection fails, the response
             is not ok or too short, or the result cannot be found in it
    """
    try:
        r = session.get(
            api(to, fr),
            headers=GZIP_HEADERS,
            params={'before': text})

        #print r.headers['Content-Type']
        content = r.content

        # A response that is not ok or is this short is not parsed.
        # Return None rather than handing the raw page to the caller.
        if not r.ok or len(content) <= 1000:
            return None

        # Extract text within '<textarea .*name="after">' and '</textarea>'
        m = __re_search.search(content)
        if not m:
            dwarn("content not matched: %s" % content)
            return None
        return unescapehtml(m.group(1).decode('utf8', errors='ignore'))

    #except socket.error, e:
    #  dwarn("socket error", e.args)
    except requests.ConnectionError as e:
        dwarn("connection error", e.args)
    return None
Beispiel #7
0
    def _parsejson(self, data):
        """@reimp
        Clean up every entry of data['items'] in place and return the list.

        For each item: the '::inedited:: ' placeholder roman title is
        blanked, the entry of _PATCHES for the item id (if any) is merged in,
        the text fields are unescaped and right-stripped, and an integer
        'releaseDayNumber' whose decimal form ends with 44 is lowered so that
        it ends with 28.

        @param  data
        @return  [{kw}]  the same list object as data['items']
        @raise  whatever the key lookups raise when an expected key is missing
        """
        items = data['items']
        for item in items:
            if item['romanTitle'] == '::inedited:: ':
                item['romanTitle'] = ''

            patch = _PATCHES.get(item['id'])
            if patch:
                item.update(patch)

            for field in ('title', 'romanTitle', 'brand'):
                text = item[field]
                if text:
                    # remove right most space
                    item[field] = unescapehtml(text).rstrip()

            day = item.get('releaseDayNumber')
            # date number should not end with > 31
            if day and isinstance(day, int) and str(day).endswith('44'):
                item['releaseDayNumber'] = day - (44 - 28)  # change to 28
        return items
Beispiel #8
0
 def _iterparsebrands(self, h):
     """Iterate over the brands listed in the page.

     The first group of self._rx_brands is split on the ideographic comma
     and each piece is searched for a brand id and a brand name.  Any
     exception raised while parsing is reported with dwarn and ends the
     iteration; brands yielded before the failure are kept.

     @param  h  unicode  html
     @yield  {kw}  keys: 'id' (int), 'name' (unicode), 'img' (str)
     """
     try:
         m = self._rx_brands.search(h)
         if m:
             for hh in m.group(1).split(u'、'):
                 brandid = int(self._rx_brands_id.search(hh).group(1))
                 name = unescapehtml(
                     self._rx_brands_name.search(hh).group(1))
                 yield {
                     'id': brandid,  # int
                     'name': name,  # unicode
                     'img':
                     "http://media.erogetrailers.com/img/brand/%i.png" %
                     brandid,  # str
                     #'url': "http://erogetrailers.com/brand/%i" % id, # not used
                 }
     except Exception as e:
         dwarn(e)
Beispiel #9
0
    def _parsemetadesc(self, h):
        """Parse the title and price out of the meta description of the page.

        Keys are only present when the corresponding pattern matched; 'price'
        is additionally omitted when the matched text is not an integer after
        removing commas.

        @param  h  unicode  html
        @return  kw  may contain 'title' (unicode) and 'price' (int)
        """
        ret = {}
        m = self._rx_meta_desc.search(h)
        if not m:
            return ret
        desc = m.group(1)

        m = self._rx_desc_title.search(desc)
        if m:
            ret['title'] = unescapehtml(m.group(1))

        #m = self._rx_desc_brand.search(desc)
        #if m:
        #  brand = unescapehtml(m.group(1))
        #  if brand[-1] == u'の':
        #    brand = brand[:-1]
        #  ret['brand'] = brand

        m = self._rx_desc_price.search(desc)
        if m:
            price = m.group(1).replace(',', '')
            try:
                ret['price'] = int(price)
            except ValueError:
                # Price text is not a number: leave 'price' out of the result
                pass
        return ret
Beispiel #10
0
def translate(text, to='en', fr='ja'):
    """Post text to HONYAKU_API and return the translation found in the reply.

    The translation is the part of the response body that skstr.findbetween
    returns for HONYAKU_TEXT_START and HONYAKU_TEXT_STOP, decoded as UTF-8
    (undecodable bytes ignored) and unescaped.

    @param  text  unicode not None
    @param* fr  unicode not None, must be valid language code
    @param* to  unicode not None, must be valid language code
    @return  unicode or None  None when the connection fails, the response
             is not ok, or no translated text is found in it
    """
    try:
        r = session.post(
            HONYAKU_API,
            headers=GZIP_HEADERS,
            data={
                'SSRC': text,
                'SLANG': niftydef.nifty_lang(fr),
                'TLANG': niftydef.nifty_lang(to),
                #'txtDirection': fr + to, # not needed
                #'XMODE': 0, # not needed
            },
        )

        content = r.content
        # Example: ret('', {"translatedText":"If you can be calm, true-kun or 210 Yen."}, 200, null, null);
        if r.ok:
            found = skstr.findbetween(content, HONYAKU_TEXT_START,
                                      HONYAKU_TEXT_STOP)
            # findbetween results are checked for falseness elsewhere in
            # this code base; do not call decode() on a missing result
            if found:
                found = found.decode('utf8', errors='ignore')
                return skstr.unescapehtml(found)

    #except socket.error, e:
    #  dwarn("socket error", e.args)
    except requests.ConnectionError as e:
        dwarn("connection error", e.args)
    return None
Beispiel #11
0
    def translate(self, t, to='auto', fr='auto'):
        """Post t to self.api and return the translated text from the reply.

        The result is the part of the response body between
        self._TEXT_BEGIN and the next self._TEXT_END, unescaped.

        @param  t  unicode
        @param* to  str
        @param* fr  str
        @return  unicode or None  None when the connection fails or the
                 markers are not found in the response
        """
        try:
            r = self.session.post(self.api,
                                  headers=self.headers,
                                  data={
                                      'hl': googledef.lang2locale(to),
                                      'sl': googledef.lang2locale(fr),
                                      'q': t,
                                  })

            h = r.content
            if h:
                # find() reports "not found" as -1; index 0 is a valid match
                start = h.find(self._TEXT_BEGIN)
                if start != -1:
                    start += len(self._TEXT_BEGIN)
                    stop = h.find(self._TEXT_END, start)
                    if stop != -1:
                        return unescapehtml(h[start:stop])

        #except socket.error, e:
        #  dwarn("socket error", e.args)
        except requests.ConnectionError as e:
            dwarn("connection error", e.args)
        return None
Beispiel #12
0
 def _parsetitle(self, h):
     """Return the title taken from the title meta of the page.

     Whatever self._rx_title matches is removed from the meta text before
     it is unescaped and stripped.

     @param  h  unicode  html
     @return  unicode or None  None when the meta text is empty or missing
     """
     meta = self._parsemeta(self._rx_meta_title, h)
     if not meta:
         return None
     trimmed = self._rx_title.sub('', meta)
     return unescapehtml(trimmed).strip()
Beispiel #13
0
 def _parsebanner(self, h):
     """Return the unescaped first group of self._rx_banner found in h.

     @param  h  unicode  html
     @return  unicode or None  None when the pattern does not match
     """
     found = self._rx_banner.search(h)
     if not found:
         return None
     return unescapehtml(found.group(1))
Beispiel #14
0
 def _parsetitle(self, h):
     """Return the first meta keyword of the page, unescaped.

     @param  h  unicode  html
     @return  unicode or None  None when self._parsemetakw returns nothing
     """
     keywords = self._parsemetakw(h)
     return unescapehtml(keywords[0]) if keywords else None
Beispiel #15
0
 def _parsetitle(self, h):
   """Return the unescaped first group of self._re_title found in h.

   @param  h  unicode
   @return  unicode or None  None when the pattern does not match
   """
   found = self._re_title.search(h)
   if found is None:
     return None
   return unescapehtml(found.group(1))
Beispiel #16
0
 def _parsecomment(self, h):
     """Return the unescaped first group of self._rx_comment found in h.

     @param  h  unicode  html
     @return  unicode or None  None when the pattern does not match
     """
     found = self._rx_comment.search(h)
     return unescapehtml(found.group(1)) if found else None
Beispiel #17
0
 def _parsetitle(self, h):
     """Return the text between 'width:auto;">' and '</strong>', unescaped.

     The text is rejected when it is empty or still contains a '<'.

     @param  h  unicode  html
     @return  unicode or None
     """
     text = skstr.findbetween(h, 'width:auto;">', '</strong>')
     if not text or '<' in text:
         return None
     return unescapehtml(text)
Beispiel #18
0
 def _parsekeywords(self, h):
     """Return the keywords meta of the page, unescaped and split on ','.

     @param  h  unicode  html
     @return  [unicode] or None  None when the meta text is empty or missing
     """
     meta = self._parsemeta(self._rx_meta_keywords, h)
     if not meta:
         return None
     return unescapehtml(meta).split(',')
Beispiel #19
0
 def _parseseries(self, h):
     """Return the unescaped first group of self._rx_series found in h.

     @param  h  unicode  html
     @return  unicode or None  None when the pattern does not match
     """
     found = self._rx_series.search(h)
     if found is None:
         return None
     return unescapehtml(found.group(1))
Beispiel #20
0
def _unescape_term_text(text):
    """Unescape the entities in text and substitute the '&eos;' marker.

    Text that is empty or lacks either '&' or ';' cannot hold an entity and
    is returned unchanged.

    @param  text  unicode
    @return  unicode
    """
    if text and '&' in text and ';' in text:
        unescaped = skstr.unescapehtml(text)
        return unescaped.replace('&eos;', defs.TERM_ESCAPE_EOS)
    return text
Beispiel #21
0
 def _iterparsedescriptions(self, h):
     """Iterate over the description fragments matched by self._rx_desc.

     Each fragment goes through self._replacelinks, then
     self._removescripts, and is unescaped last.

     @param  h  unicode  html
     @yield  unicode
     """
     for fragment in self._rx_desc.finditer(h):
         linked = self._replacelinks(fragment.group())
         cleaned = self._removescripts(linked)
         yield unescapehtml(cleaned)
Beispiel #22
0
 def _parsedate(self, h):
     """Return the unescaped first group of self._rx_info_date found in h.

     @param  h  unicode  html
     @return  unicode or None  None when the pattern does not match
     """
     found = self._rx_info_date.search(h)
     return unescapehtml(found.group(1)) if found else None
Beispiel #23
0
 def _parsetd(self, rx, h):
     """Return the unescaped first group of rx found in h.

     @param  rx  compiled pattern with at least one group
     @param  h  unicode  html
     @return  unicode or None  None when the pattern does not match
     """
     found = rx.search(h)
     if not found:
         return None
     return unescapehtml(found.group(1))
Beispiel #24
0
 def _iterparseddlinks(self, *args, **kwargs):
   """Iterate over the links inside the text returned by self._parsedd.

   All arguments are forwarded to self._parsedd.  Nothing is yielded when
   it returns an empty value.

   @yield  unicode
   """
   text = self._parsedd(*args, **kwargs)
   if not text:
     return
   for link in self._rx_link.finditer(text):
     yield unescapehtml(link.group(1))
Beispiel #25
0
 def _parsetitle(self, h):
   """Return the text before the first ',' of the meta keyword, unescaped.

   @param  h  unicode  html
   @return  unicode or None  None when self._parsemetakeyword returns nothing
   """
   keyword = self._parsemetakeyword(h)
   if not keyword:
     return None
   head = keyword.partition(',')[0]
   return unescapehtml(head)
Beispiel #26
0
 def _parsebrand(self, h):
     """Return the first group of self._rx_brand, unescaped and stripped.

     @param  h  unicode  html
     @return  unicode or None  None when the pattern does not match
     """
     found = self._rx_brand.search(h)
     if not found:
         return None
     brand = unescapehtml(found.group(1))
     return brand.strip()  # there is a space in the beginning
Beispiel #27
0
 def _iterparsefields(self, h):
     """Iterate over the fields of self._rx_fields that are present in h.

     A field whose pattern does not match is left out.

     @param  h  unicode
     @yield  (str key, unicode)  value is unescaped and stripped
     """
     for key, pattern in self._rx_fields:
         found = pattern.search(h)
         if not found:
             continue
         value = unescapehtml(found.group(1))
         yield key, value.strip()
Beispiel #28
0
 def _parsebrand(self, h):
     """Return the brand text with its slash separators turned into commas.

     The first group of self._rx_brand is unescaped, then " / " and the
     remaining full-width slashes are each replaced with ','.

     @param  h  unicode  html
     @return  unicode or None  None when the pattern does not match
     """
     found = self._rx_brand.search(h)
     if not found:
         return None
     brand = unescapehtml(found.group(1))
     brand = brand.replace(" / ", ',')
     return brand.replace(u"/", ',')
Beispiel #29
0
    def _iterparsecharacters(self, h):
        """Iterate over the character entries found in the page.

        Character images are probed by name, "<prefix>c01.jpg" up to
        "<prefix>c99.jpg", where prefix is the text matched by
        self._rx_image.  Each entry spans from its image name to the next
        '</table>'.  Iteration stops at the first image name that does not
        occur in h, or whose entry has no closing '</table>'.

        @param  h  unicode
        @yield  kw  keys: 'id' (int), 'img', 'label', 'name', 'yomi', 'cv'
        """
        m = self._rx_image.search(h)
        if not m:
            return
        prefix = m.group()
        for i in xrange(1, 100):
            img = "%sc%02d.jpg" % (prefix, i)
            start = h.find(img)
            if start < 0:
                break
            stop = h.find('</table>', start)
            if stop < 0:
                # Unterminated entry: slicing with -1 would take the rest of
                # the page as this character's description
                break
            desc = h[start:stop]

            m = self._rx_label.search(desc)
            label = unescapehtml(m.group(1)) if m else ''

            m = self._rx_cv.search(desc)
            cv = unescapehtml(m.group(1)) if m else ''

            name = yomi = ''
            m = self._rx_chara.search(desc)
            if m:
                # Ideographic space (U+3000) -> ASCII space; written as an
                # escape so it survives editors and re-encoding
                name = unescapehtml(m.group(1)).replace(u'\u3000', ' ')
                # Example: ●羽馬 紫織(はば・しおり)
                # The reading is enclosed in full-width parentheses
                beg = name.find(u'(')
                if beg > 0:
                    end = name.rfind(u')')
                    if end > 0:
                        yomi = name[beg + 1:end].replace(u'・', ' ')
                        name = name[:beg]

            yield {
                'id': i,  # int
                'img': self.HOST + img,
                'label': label,
                'name': name.strip(),
                'yomi': yomi.strip(),
                'cv': cv,  # unicode
            }
Beispiel #30
0
 def _parseddlink(self, *args, **kwargs):
   """Return the first link inside the text returned by self._parsedd.

   All arguments are forwarded to self._parsedd.

   @return  unicode not None  '' when there is no text or no link in it
   """
   text = self._parsedd(*args, **kwargs)
   found = self._rx_link.search(text) if text else None
   if not found:
     return ''
   return unescapehtml(found.group(1))