Esempio n. 1
0
    def __getitem__(self, key):
        """Return image records for ``key``.

        Each record is a dict with the keys ``'name'`` (computed by
        ``get_image_name``) and ``'original_url'``.

        For ``'player'``, ``'thumbnail'``, ``'sample'`` and ``'sample_large'``
        a second XPath entry is tried when the primary one extracts nothing.
        For ``'player'`` the image URL is parsed out of the extracted text.

        Returns ``None`` when no image URL could be found.
        Raises ``KeyError(key)`` for a key this item does not know.
        """
        # Primary key -> XPath config entry tried when the primary is empty.
        fallback_keys = {
            'player': 'player2',
            'thumbnail': 'thumbnail2',
            'sample': 'sample2',
            'sample_large': 'sample2_large',
        }

        if key in fallback_keys:
            extracted = self._hxs.select(self._xpath_config[key]).extract()
            if not extracted:
                fallback_xpath = self._xpath_config[fallback_keys[key]]
                extracted = self._hxs.select(fallback_xpath).extract()
        elif key in self.keys():
            extracted = self._hxs.select(self._xpath_config[key]).extract()
        else:
            raise KeyError(key)

        if key == 'player':
            # Start from "nothing found" so that a player text without any
            # usable 'image' line yields None instead of an unbound local.
            image_urls = None
            if extracted:
                for line in re.split(r'\n+\s*', extracted[0]):
                    if 'image' in line:
                        match = re.search(r'http.*\.jpg', line)
                        if match:
                            # A later matching line overrides an earlier one.
                            image_urls = [match.group(0)]
        else:
            image_urls = extracted

        if image_urls:
            return [{'name': get_image_name(image_url), 'original_url': image_url}
                    for image_url in image_urls]
        return None
    def __getitem__(self, key):
        """Return the values stored under ``key`` for this page.

        ``'name'`` gives the stripped extracted names; the two
        ``'name_index_*'`` keys give one ``None`` per extracted name;
        ``'image'`` gives ``{'name', 'original_url'}`` records with absolute
        URLs. Any other key raises ``KeyError(key)``.
        """
        if key == 'name':
            extracted_names = self._hxs.select(self._xpath_config[key]).extract()
            return [raw_name.strip() for raw_name in extracted_names]

        if key in ('name_index_hiragana', 'name_index_katakana'):
            # No kana index is derived here (an earlier regex-based lookup is
            # disabled), so the result is a placeholder list of None values
            # with the same length as the name list.
            name_count = len(self._hxs.select(self._xpath_config['name']).extract())
            return [None] * name_count

        if key == 'image':
            base_url = 'http://www.onacle.tv'
            absolute_urls = []
            for extracted_path in self._hxs.select(self._xpath_config[key]).extract():
                # 'no_photo' paths get an explicit '/' separator; all other
                # paths are appended to the base URL directly.
                if 'no_photo' in extracted_path:
                    template = '%s/%s'
                else:
                    template = '%s%s'
                absolute_urls.append(template % (base_url, extracted_path))
            records = []
            for absolute_url in absolute_urls:
                records.append({'name': get_image_name(absolute_url),
                                'original_url': absolute_url})
            return records

        raise KeyError(key)
Esempio n. 3
0
 def __getitem__(self, key):
     """Return the values stored under ``key`` for this page.

     ``'name'`` gives the stripped extracted names. The ``'name_index_*'``
     keys give the converted index character repeated once per name, or one
     ``None`` per name when no index heading was extracted. ``'image'``
     gives ``{'name', 'original_url'}`` records. Any other key raises
     ``KeyError(key)``.
     """
     if key == 'name':
         extracted_names = self._hxs.select(self._xpath_config[key]).extract()
         return [text.strip() for text in extracted_names]

     if key in ('name_index_hiragana', 'name_index_katakana'):
         headings = self._hxs.select(self._xpath_config['name_index']).extract()
         if not headings:
             # No index heading on the page (kana=other): one None per name.
             return [None] * len(self['name'])
         # The index is the second character of the first extracted heading.
         index_char = headings[0][1]
         if key == 'name_index_hiragana':
             converted = cnvk.convert(index_char, cnvk.KATA2HIRA)
         else:
             converted = cnvk.convert(index_char, cnvk.HIRA2KATA)
         return [converted] * len(self['name'])

     if key == 'image':
         records = []
         for url in self._hxs.select(self._xpath_config[key]).extract():
             records.append({'name': get_image_name(url), 'original_url': url})
         return records

     raise KeyError(key)
 def __getitem__(self, key):
     """Return image records for ``key`` built from www.heyzo.com URLs.

     The title id is the last directory component of the response URL.
     ``'face'`` and ``'player'`` are fixed file names under the title's
     image directory; ``'sample'``, ``'thumbnail'`` and ``'thumbnail_large'``
     are parsed out of the extracted markup/script text.

     Returns a list of ``{'name', 'original_url'}`` dicts, or ``None`` when
     no URL was found (including other keys listed in ``self.keys()``).
     Raises ``KeyError(key)`` for an unknown key.
     """
     page_url = self._response.url
     # str.rstrip('index.html') strips a *set of characters*, not a suffix,
     # and can eat into the preceding path segment; cut the suffix exactly.
     if page_url.endswith('index.html'):
         page_url = page_url[:-len('index.html')]
     title_id = page_url.split('/')[-2]

     images_base = 'http://www.heyzo.com/contents/3000/%s/images/' % title_id
     gallery_url = 'http://www.heyzo.com/contents/3000/%s/gallery/%s'
     image_urls = []

     if key == 'face':
         image_urls = ['%sthumbnail.jpg' % images_base]
     elif key == 'player':
         image_urls = ['%splayer_thumbnail_450.jpg' % images_base]
     elif key == 'sample':
         extracted = self._hxs.select(self._xpath_config[key]).extract()
         if extracted:
             anchors = re.findall(r'<a class="sample-capture">[\s\S].*?</a>', extracted[0])
             # Escape the id so it is matched literally inside the pattern.
             capture = re.compile(r'/contents/\d+/%s/images/capture\d+\.jpg'
                                  % re.escape(title_id))
             for anchor in anchors:
                 match = capture.search(anchor)
                 if match:  # skip anchors without a capture image
                     image_urls.append('http://www.heyzo.com%s' % match.group(0))
     elif key == 'thumbnail' or key == 'thumbnail_large':
         # Both keys scan the same script text; small thumbnails carry a
         # 'thumbnail_' prefix, large ones are the bare numbered file names.
         if key == 'thumbnail':
             wanted = re.compile(r'thumbnail_\d+\.jpg')
         else:
             wanted = re.compile(r'\d+\.jpg')
         extracted = self._hxs.select(self._xpath_config[key]).extract()
         script = re.search(r'} else {[\s\S]*', extracted[0]) if extracted else None
         if script:
             # NOTE: '[thumbnail_]*' is a character class, not a literal
             # prefix; it is kept as-is and narrowed by `wanted` below.
             for candidate in re.findall(r'[thumbnail_]*\d+\.jpg', script.group(0)):
                 match = wanted.match(candidate)
                 if match:
                     image_urls.append(gallery_url % (title_id, match.group(0)))
     elif key in self.keys():
         pass
     else:
         raise KeyError(key)

     if image_urls:
         return [{'name': get_image_name(image_url), 'original_url': image_url}
                 for image_url in image_urls]
     return None
 def __getitem__(self, key):
     """Return the values stored under ``key`` for this page.

     ``'name'`` gives the stripped extracted names. The ``'name_index_*'``
     keys take the single character found between corner brackets in the
     name_index element, convert it with ``cnvk`` and repeat it once per
     extracted name. ``'image'`` gives ``{'name', 'original_url'}`` records
     with the my.tokyo-hot.com host prepended.

     Raises ``ValueError`` when no bracketed index character is found and
     ``KeyError(key)`` for an unknown key.
     """
     if key == 'name':
         str_list = self._hxs.select(self._xpath_config[key]).extract()
         return [s.strip() for s in str_list]
     elif key == 'name_index_hiragana' or key == 'name_index_katakana':
         # u'' instead of ur'': the pattern has no backslashes, so the value
         # is identical, and ur'' is a SyntaxError on Python 3.
         bracketed = self._hxs.select(self._xpath_config['name_index']).re(u'「.」')
         if not bracketed:
             raise ValueError(bracketed)
         name_index = bracketed[0].strip(u'「').strip(u'」').strip()
         if key == 'name_index_hiragana':
             converted = cnvk.convert(name_index, cnvk.KATA2HIRA)
         else:
             converted = cnvk.convert(name_index, cnvk.HIRA2KATA)
         name_count = len(self._hxs.select(self._xpath_config['name']).extract())
         return [converted] * name_count
     elif key == 'image':
         image_urls = ['http://my.tokyo-hot.com%s' % image_path
                       for image_path in self._hxs.select(self._xpath_config[key]).extract()]
         return [{'name': get_image_name(image_url), 'original_url': image_url}
                 for image_url in image_urls]
     else:
         raise KeyError(key)
 def __getitem__(self, key):
     """Return image records for ``key``, never an empty result.

     ``'player'`` is a fixed file under the page directory. ``'sample'`` and
     ``'sample_large'`` are twelve generated file names, produced only when
     the XPath for the key extracts something. ``'face'``, ``'thumbnail'``
     and ``'thumbnail_large'`` are relative paths matched inside the
     extracted values and joined to the page directory.

     When nothing is found, a single placeholder record with an empty
     ``'original_url'`` is returned (a name is needed so that a DB record
     can still be created). Raises ``KeyError(key)`` for an unknown key.
     """
     page_url = self._response.url
     # str.rstrip('index.html') strips a *set of characters*, not a suffix;
     # remove the suffix exactly instead.
     if page_url.endswith('index.html'):
         base_url = page_url[:-len('index.html')]
     else:
         base_url = page_url

     if key == 'player':
         image_urls = ['%simages/l_l.jpg' % base_url]
     elif key == 'sample' or key == 'sample_large':
         image_urls = self._hxs.select(self._xpath_config[key]).extract()
         if image_urls:
             # The extracted values only signal that samples exist; the
             # actual file names are generated for numbers 1 through 12.
             if key == 'sample':
                 filename_format = 'g_t0%02d.jpg'
             else:
                 filename_format = r'g_big0%02d.jpg'
             image_urls = ['%simages/%s' % (base_url, filename_format % num)
                           for num in range(1, 13)]
     elif key == 'face' or key == 'thumbnail' or key == 'thumbnail_large':
         relative_patterns = {
             'face': r'images/n\.jpg',
             'thumbnail': r'images/\d_\d_s\.gif',
             'thumbnail_large': r'images/\d_\d\.jpg',
         }
         pattern = re.compile(relative_patterns[key])
         image_urls = []
         for candidate in self._hxs.select(self._xpath_config[key]).extract():
             match = pattern.search(candidate)
             if match:  # ignore entries that do not contain the expected path
                 image_urls.append('%s%s' % (base_url, match.group(0)))
     else:
         raise KeyError(key)

     if not image_urls:
         # Dummy file name derived from the page URL and key; the empty
         # original_url marks the record as having no image.
         placeholder_url = '%s/%s' % (self._response.url, key)
         return [{'name': get_image_name(placeholder_url), 'original_url': ''}]
     return [{'name': get_image_name(image_url), 'original_url': image_url}
             for image_url in image_urls]
    def __getitem__(self, key):
        """Return image records for ``key``.

        Only ``"player"`` yields URLs (taken verbatim from the extraction).
        Every other key listed in ``self.keys()`` yields ``None``, as does an
        empty extraction. Unknown keys raise ``KeyError(key)``.
        """
        if key == "player":
            found_urls = list(self._hxs.select(self._xpath_config[key]).extract())
        elif key not in self.keys():
            raise KeyError(key)
        else:
            found_urls = []

        if not found_urls:
            return None

        records = []
        for found_url in found_urls:
            records.append({"name": get_image_name(found_url), "original_url": found_url})
        return records
 def __getitem__(self, key):
     """Return image records for ``key``.

     ``'face'`` and ``'thumbnail'`` URLs are taken verbatim from the
     extraction. The ``'player'`` URL is searched for inside the first
     extracted value. Other keys listed in ``self.keys()`` produce no images.

     Returns a list of ``{'name', 'original_url'}`` dicts, or ``None`` when
     nothing was found. Raises ``KeyError(key)`` for an unknown key.
     """
     image_urls = []
     if key == 'face' or key == 'thumbnail':
         image_urls = self._hxs.select(self._xpath_config[key]).extract()
     elif key == 'player':
         # Evaluate the XPath once and reuse the result.
         extracted = self._hxs.select(self._xpath_config[key]).extract()
         if extracted:
             # NOTE(review): the dots in 'enkou55.com' and 'player.jpg' are
             # unescaped and therefore match any character; kept unchanged.
             pattern = re.compile(r'http://.*\.enkou55.com/images/title/\d{2}/\d{2}/\d{2}/player.jpg')
             match = pattern.search(extracted[0])
             if match:  # no player URL in the markup -> no images
                 image_urls = [match.group()]
     elif key in self.keys():
         pass
     else:
         raise KeyError(key)

     if image_urls:
         return [{'name': get_image_name(image_url), 'original_url': image_url}
                 for image_url in image_urls]
     return None
Esempio n. 9
0
 def __getitem__(self, key):
     """Return the values stored under ``key`` for this page.

     ``'name'`` walks the name_index entries; entry *n* is paired with the
     *n*-th XPath in ``self._xpath_config['name']``. It returns all stripped
     names and, as a side effect, assigns the matching per-name index lists
     to ``self['name_index_hiragana']`` and ``self['name_index_katakana']``.

     The two ``'name_index_*'`` keys return ``None`` from this method.
     ``'image'`` returns ``{'name', 'original_url'}`` records with the
     www.caribbeancom.com host prepended. Unknown keys raise
     ``KeyError(key)``.
     """
     if key == 'name':
         # Evaluate the index XPath once instead of on every iteration.
         index_selectors = self._hxs.select(self._xpath_config['name_index'])
         index_values = index_selectors.extract()

         name_list = []
         name_index_hiragana_list = []
         name_index_katakana_list = []
         for position in range(len(index_selectors)):
             group_names = self._hxs.select(self._xpath_config[key][position]).extract()
             name_index = index_values[position]
             # One converted index per name in this group. A TypeError from
             # cnvk.convert now propagates with its original message.
             name_index_hiragana_list += (
                 [cnvk.convert(name_index, cnvk.KATA2HIRA)] * len(group_names))
             name_index_katakana_list += (
                 [cnvk.convert(name_index, cnvk.HIRA2KATA)] * len(group_names))
             name_list += [name.strip() for name in group_names]
         self['name_index_hiragana'] = name_index_hiragana_list
         self['name_index_katakana'] = name_index_katakana_list
         return name_list
     elif key == 'name_index_hiragana' or key == 'name_index_katakana':
         # Nothing is computed here; these lists are assigned while handling
         # the 'name' key above.
         return None
     elif key == 'image':
         image_urls = ['http://www.caribbeancom.com%s' % image_path
                       for image_path in self._hxs.select(self._xpath_config[key]).extract()]
         return [{'name': get_image_name(image_url), 'original_url': image_url}
                 for image_url in image_urls]
     else:
         raise KeyError(key)
 def __getitem__(self, key):
     """Return image records for ``key`` built from www.caribbeancom.com URLs.

     ``'player'`` is a fixed file under the page directory. ``'sample'``
     prepends the host to every extracted path. ``'sample_large'`` handles
     two page layouts: when the first value contains ``'g_big'`` every value
     is used with ``/member`` removed; otherwise at most the first five
     values are used. Other keys listed in ``self.keys()`` produce no images.

     Returns a list of ``{'name', 'original_url'}`` dicts, or ``None`` when
     nothing was found. Raises ``KeyError(key)`` for an unknown key.
     """
     host = 'http://www.caribbeancom.com'
     image_urls = []
     if key == 'player':
         page_url = self._response.url
         # str.rstrip('index.html') strips a *set of characters*, not a
         # suffix; remove the suffix exactly instead.
         if page_url.endswith('index.html'):
             page_url = page_url[:-len('index.html')]
         image_urls = ['%simages/l_l.jpg' % page_url]
     elif key == 'sample':
         image_urls = ['%s%s' % (host, image_path)
                       for image_path in self._hxs.select(self._xpath_config[key]).extract()]
     elif key == 'sample_large':
         value_list = self._hxs.select(self._xpath_config[key]).extract()
         if value_list:
             if 'g_big' in value_list[0]:  # old design
                 image_urls = ['%s%s' % (host, re.sub(r'/member', '', image_path))
                               for image_path in value_list]
             else:  # new design
                 # Slice instead of indexing 0..4 so that pages with fewer
                 # than five samples do not raise IndexError.
                 image_urls = ['%s%s' % (host, image_path)
                               for image_path in value_list[:5]]
     elif key in self.keys():
         pass
     else:
         raise KeyError(key)

     if image_urls:
         return [{'name': get_image_name(image_url), 'original_url': image_url}
                 for image_url in image_urls]
     return None
 def __getitem__(self, key):
     """Return image records for ``key`` built from www.onacle.tv URLs.

     ``'face'``, ``'player'`` and ``'sample'`` prepend the host to every
     extracted path. ``'thumbnail'`` collects every
     ``/video/<id>/<id>_<n>.jpg`` path found in the extracted values, where
     ``<id>`` is the ``videocode`` query parameter of the response URL.
     Other keys listed in ``self.keys()`` produce no images.

     Returns a list of ``{'name', 'original_url'}`` dicts, or ``None`` when
     nothing was found. Raises ``KeyError(key)`` only for an unknown key.
     """
     base_url = 'http://www.onacle.tv'
     image_urls = []
     if key == 'face' or key == 'player' or key == 'sample':
         image_urls = ['%s%s' % (base_url, image_path)
                       for image_path in self._hxs.select(self._xpath_config[key]).extract()]
     elif key == 'thumbnail':
         # A URL without 'videocode' must not raise KeyError here: callers
         # could not tell it apart from an unknown item key. Treat it as
         # "no thumbnails" instead.
         video_codes = parse_qs(urlparse(self._response.url).query).get('videocode')
         if video_codes:
             # Escape the id so it is matched literally inside the pattern.
             original_id = re.escape(video_codes[0])
             pattern = re.compile(r'/video/%s/%s_\d+\.jpg' % (original_id, original_id))
             image_paths = []
             for fragment in self._hxs.select(self._xpath_config[key]).extract():
                 image_paths += pattern.findall(fragment)
             image_urls = ['%s%s' % (base_url, image_path) for image_path in image_paths]
     elif key in self.keys():
         pass
     else:
         raise KeyError(key)

     if image_urls:
         return [{'name': get_image_name(image_url), 'original_url': image_url}
                 for image_url in image_urls]
     return None
 def __getitem__(self, key):
     """Return image records for ``key``.

     ``'face'`` and ``'thumbnail'`` URLs are taken verbatim from the
     extraction. ``'player'`` is built from the numeric id in the last path
     component of the response URL. ``'sample'``, ``'sample_large'`` and
     ``'thumbnail_large'`` never produce images.

     Returns a list of ``{'name', 'original_url'}`` dicts, or ``None`` when
     there are no images. Raises ``KeyError(key)`` for an unknown key and
     ``ValueError`` when the ``'player'`` id is not an integer.
     """
     if key == 'face' or key == 'thumbnail':
         image_urls = self._hxs.select(self._xpath_config[key]).extract()
     elif key == 'player':
         # NOTE: strip('.html') removes any of the characters '.', 'h', 't',
         # 'm', 'l' from both ends, not the literal suffix. It is left
         # unchanged because int() rejects anything that is not numeric.
         title_id = int(self._response.url.split('/')[-1].strip('.html'))
         image_urls = ['http://image.xxx-av.com/image/%d/movie_main.jpg' % title_id]
     elif key == 'sample' or key == 'sample_large' or key == 'thumbnail_large':
         image_urls = []
     else:
         # Include the offending key, as the sibling item classes do.
         raise KeyError(key)

     if image_urls:
         return [{'name': get_image_name(image_url), 'original_url': image_url}
                 for image_url in image_urls]
     return None
 def __getitem__(self, key):
     """Return the values stored under ``key`` for this page.

     ``self._xpath_config['name']`` is a sequence of XPaths. Each extracted
     entry is split on whitespace: the first token is the name and, when
     there are exactly two tokens, the second is its kana reading.

     * ``'name'``: the first token of every entry.
     * ``'name_hiragana'`` / ``'name_katakana'``: the converted reading, or
       ``None`` for entries that do not have exactly two tokens.
     * ``'name_index_hiragana'`` / ``'name_index_katakana'``: for the *n*-th
       name_index entry, its converted value repeated once per entry of the
       *n*-th name XPath.
     * ``'image'``: ``{'name', 'original_url'}`` records.

     Raises ``KeyError(key)`` for an unknown key.
     """
     if key == 'name':
         name_list = []
         for name_xpath in self._xpath_config[key]:
             for entry in self._hxs.select(name_xpath).extract():
                 name_list.append(entry.split()[0])
         return name_list
     elif key == 'name_hiragana' or key == 'name_katakana':
         if key == 'name_hiragana':
             direction = cnvk.KATA2HIRA
         else:
             direction = cnvk.HIRA2KATA
         name_kana_list = []
         for name_xpath in self._xpath_config['name']:
             for entry in self._hxs.select(name_xpath).extract():
                 tokens = entry.split()  # split once per entry
                 if len(tokens) == 2:
                     name_kana_list.append(cnvk.convert(tokens[1], direction))
                 else:
                     name_kana_list.append(None)
         return name_kana_list
     elif key == 'name_index_hiragana' or key == 'name_index_katakana':
         if key == 'name_index_hiragana':
             direction = cnvk.KATA2HIRA
         else:
             direction = cnvk.HIRA2KATA
         # Evaluate the index XPath once instead of on every iteration.
         index_selectors = self._hxs.select(self._xpath_config['name_index'])
         index_values = index_selectors.extract()
         name_index_kana_list = []
         for position in range(len(index_selectors)):
             group_size = len(self._hxs.select(self._xpath_config['name'][position]).extract())
             # A TypeError from cnvk.convert now propagates with its
             # original message rather than being replaced.
             name_index_kana_list += (
                 [cnvk.convert(index_values[position], direction)] * group_size)
         return name_index_kana_list
     elif key == 'image':
         image_urls = self._hxs.select(self._xpath_config[key]).extract()
         return [{'name': get_image_name(image_url), 'original_url': image_url}
                 for image_url in image_urls]
     else:
         raise KeyError(key)
 def __getitem__(self, key):
     """Return the values stored under ``key`` for this page.

     ``'name'`` gives the stripped extracted names. The ``'name_index_*'``
     keys give the converted first name_index value repeated once per
     extracted name, or one ``None`` per name when no index was extracted.
     ``'image'`` gives ``{'name', 'original_url'}`` records. Any other key
     raises ``KeyError(key)``.
     """
     if key == 'name':
         extracted_names = self._hxs.select(self._xpath_config[key]).extract()
         return [text.strip() for text in extracted_names]

     if key in ('name_index_hiragana', 'name_index_katakana'):
         headings = self._hxs.select(self._xpath_config['name_index']).extract()
         if not headings:
             # No index heading on the page (kana=other): one None per name.
             name_count = len(self._hxs.select(self._xpath_config['name']).extract())
             return [None] * name_count
         index_text = headings[0].strip()
         if key == 'name_index_hiragana':
             converted = cnvk.convert(index_text, cnvk.KATA2HIRA)
         else:
             converted = cnvk.convert(index_text, cnvk.HIRA2KATA)
         name_count = len(self._hxs.select(self._xpath_config['name']).extract())
         return [converted] * name_count

     if key == 'image':
         records = []
         for url in self._hxs.select(self._xpath_config[key]).extract():
             records.append({'name': get_image_name(url), 'original_url': url})
         return records

     raise KeyError(key)
Esempio n. 15
0
 def __getitem__(self, key):
     """Return the values stored under ``key`` for this page.

     * ``'name'``: the stripped extracted names.
     * ``'name_index_hiragana'`` / ``'name_index_katakana'``: for every
       extracted name_index value, its ``cnvk`` conversion repeated once per
       ``alt="..."`` attribute found in that index's section of the raw
       response body.
     * ``'image'``: ``{'name', 'original_url'}`` records with the
       www.heyzo.com host prepended.

     Raises ``KeyError(key)`` for an unknown key, ``IndexError`` when a
     section pattern finds nothing in the body, and ``TypeError`` (carrying
     only the type of the result list) when the conversion step raises
     ``TypeError``.
     """
     if key == 'name':
         return [name.strip() for name in self._hxs.select(self._xpath_config[key]).extract()]
     elif key == 'name_index_hiragana' or key == 'name_index_katakana':
         name_index_kana_list = []
         indices = self._hxs.select(self._xpath_config['name_index']).extract()
         for i in range(len(indices)):
             # A section runs from this index's <dt> up to the next index's
             # <dt>; the last section runs up to a closing </dl>.
             # NOTE(review): the index text is interpolated into the regex
             # without re.escape, and '[\S\s]*' is greedy.
             if i + 1 == len(indices):
                 pattern = re.compile(r'<dt>%s</dt>[\S\s]*</dl>' % indices[i].encode('utf-8'))
             else:
                 pattern = re.compile(r'<dt>%s</dt>[\S\s]*<dt>%s</dt>' % (indices[i].encode('utf-8'),
                                                                          indices[i + 1].encode('utf-8')))
             # Take the first section match in the raw body and pull the text
             # out of every alt="... />" occurrence; only the count is used.
             # NOTE(review): relies on the private attribute _response._body
             # and on utf-8 encoded patterns -- presumably the body is raw
             # utf-8 bytes; confirm.
             actresses = [re.sub('alt="|" />', '', name) for name in re.findall(r'alt=".* />', re.findall(pattern, self._response._body)[0])]
             try:
                 if key == 'name_index_hiragana':
                     name_index_kana_list += list(itertools.repeat(cnvk.convert(indices[i], cnvk.KATA2HIRA),
                                                                   len(actresses)))
                 elif key == 'name_index_katakana':
                     name_index_kana_list += list(itertools.repeat(cnvk.convert(indices[i], cnvk.HIRA2KATA),
                                                                   len(actresses)))
                 else:
                     # Not reachable: the enclosing branch admits only the
                     # two keys tested above.
                     raise TypeError
             except TypeError:
                 # The original error message is replaced by the type of the
                 # result list.
                 raise TypeError(type(name_index_kana_list))
             else:
                 # Has no effect: the for statement rebinds i on the next
                 # iteration and i is not read after the loop.
                 i += 1
         return name_index_kana_list
     elif key == 'image':
         # Extracted paths are host-relative; prepend the site host.
         image_urls = ['http://www.heyzo.com%s' % image_url
                       for image_url in self._hxs.select(self._xpath_config[key]).extract()]
         return [{'name': get_image_name(image_url), 'original_url': image_url} for image_url in image_urls]
     else:
         raise KeyError(key)