def __getitem__(self, key):
    """Return image records for ``key`` scraped from the current page.

    The XPath for ``key`` is read from ``self._xpath_config``.  The keys
    ``player``, ``thumbnail``, ``sample`` and ``sample_large`` retry with a
    second XPath (``player2``, ``thumbnail2``, ``sample2``,
    ``sample2_large``) when the first one extracts nothing.

    For ``player`` the first extracted node is split into lines and the
    ``http...jpg`` URL of the last line containing ``'image'`` is used.

    Returns:
        A list of ``{'name': ..., 'original_url': ...}`` dicts, or ``None``
        when no image URL was found.

    Raises:
        KeyError: if ``key`` is neither one of the four keys above nor in
            ``self.keys()``.
    """
    # Primary key -> XPath key to retry with when the primary finds nothing.
    fallbacks = {
        'player': 'player2',
        'thumbnail': 'thumbnail2',
        'sample': 'sample2',
        'sample_large': 'sample2_large',
    }
    if key not in fallbacks and key not in self.keys():
        raise KeyError(key)

    extracted = self._hxs.select(self._xpath_config[key]).extract()
    if not extracted and key in fallbacks:
        extracted = self._hxs.select(self._xpath_config[fallbacks[key]]).extract()

    if key == 'player':
        # Start from "nothing found" so a player node without any usable
        # image line yields None instead of an unbound local.
        image_urls = None
        if extracted:
            for line in re.split(r'\n+\s*', extracted[0]):
                if 'image' not in line:
                    continue
                match = re.search(r'http.*\.jpg', line)
                if match:
                    # A later matching line overrides an earlier one.
                    image_urls = [match.group(0)]
    else:
        image_urls = extracted

    if image_urls:
        return [{'name': get_image_name(image_url), 'original_url': image_url}
                for image_url in image_urls]
    return None
def __getitem__(self, key):
    """Return the scraped value list for ``key``.

    * ``name``: the extracted name strings, stripped.
    * ``name_index_hiragana`` / ``name_index_katakana``: a list of ``None``
      with one entry per extracted name.
    * ``image``: ``{'name': ..., 'original_url': ...}`` dicts whose URLs are
      prefixed with the site root.

    Raises:
        KeyError: for any other key.
    """
    if key == 'name':
        raw_names = self._hxs.select(self._xpath_config[key]).extract()
        return [raw.strip() for raw in raw_names]

    if key in ('name_index_hiragana', 'name_index_katakana'):
        # Placeholder values only: one None per name on the page.
        name_count = len(self._hxs.select(self._xpath_config['name']).extract())
        return [None] * name_count

    if key == 'image':
        site_root = 'http://www.onacle.tv'
        records = []
        for path in self._hxs.select(self._xpath_config[key]).extract():
            # Paths containing 'no_photo' are joined with an extra slash.
            if 'no_photo' in path:
                absolute_url = '%s/%s' % (site_root, path)
            else:
                absolute_url = '%s%s' % (site_root, path)
            records.append({'name': get_image_name(absolute_url),
                            'original_url': absolute_url})
        return records

    raise KeyError(key)
def __getitem__(self, key):
    """Return the scraped value list for ``key``.

    * ``name``: the extracted name strings, stripped.
    * ``name_index_hiragana`` / ``name_index_katakana``: the second
      character of the first ``name_index`` node, converted with ``cnvk``
      and repeated once per entry of ``self['name']``; a list of ``None``
      of the same length when no index node is extracted.
    * ``image``: ``{'name': ..., 'original_url': ...}`` dicts.

    Raises:
        KeyError: for any other key.
    """
    if key == 'name':
        raw_names = self._hxs.select(self._xpath_config[key]).extract()
        return [raw.strip() for raw in raw_names]

    if key in ('name_index_hiragana', 'name_index_katakana'):
        index_nodes = self._hxs.select(self._xpath_config['name_index']).extract()
        if not index_nodes:
            # No index node on the page (e.g. kana=other): pad with None.
            return [None] * len(self['name'])
        index_char = index_nodes[0][1]
        if key == 'name_index_hiragana':
            table = cnvk.KATA2HIRA
        else:
            table = cnvk.HIRA2KATA
        converted = cnvk.convert(index_char, table)
        return [converted] * len(self['name'])

    if key == 'image':
        records = []
        for url in self._hxs.select(self._xpath_config[key]).extract():
            records.append({'name': get_image_name(url), 'original_url': url})
        return records

    raise KeyError(key)
def __getitem__(self, key):
    """Return image records for ``key`` on a heyzo.com title page.

    The title id is the last directory component of the response URL
    (after a trailing ``index.html`` is removed).

    * ``face`` / ``player``: a fixed file name under the title's image
      directory.
    * ``sample``: capture image paths found inside ``sample-capture``
      anchors of the first extracted node.
    * ``thumbnail`` / ``thumbnail_large``: gallery file names found after
      ``} else {`` in the first extracted node; ``thumbnail`` keeps the
      ``thumbnail_<n>.jpg`` names, ``thumbnail_large`` the ``<n>.jpg`` ones.
    * any other key in ``self.keys()``: no images.

    Returns:
        A list of ``{'name': ..., 'original_url': ...}`` dicts, or ``None``
        when nothing was found.

    Raises:
        KeyError: if ``key`` is not handled and not in ``self.keys()``.
    """
    page_url = self._response.url
    suffix = 'index.html'
    # str.rstrip() strips a *set of characters*, not a suffix, so the file
    # name is removed explicitly to avoid eating into the path.
    if page_url.endswith(suffix):
        base_url = page_url[:-len(suffix)]
    else:
        base_url = page_url
    title_id = base_url.split('/')[-2]

    image_base = 'http://www.heyzo.com/contents/3000/%s/images/' % title_id
    gallery_url = 'http://www.heyzo.com/contents/3000/%s/gallery/%s'
    image_urls = []

    if key == 'face':
        image_urls = ['%sthumbnail.jpg' % image_base]
    elif key == 'player':
        image_urls = ['%splayer_thumbnail_450.jpg' % image_base]
    elif key == 'sample':
        extracted = self._hxs.select(self._xpath_config[key]).extract()
        if extracted:
            anchors = re.findall(r'<a class="sample-capture">[\s\S].*?</a>', extracted[0])
            # The id is interpolated into a regex, so escape it.
            capture = re.compile(
                r'/contents/\d+/%s/images/capture\d+\.jpg' % re.escape(title_id))
            for anchor in anchors:
                match = capture.search(anchor)
                if match:  # skip anchors that carry no capture image
                    image_urls.append('http://www.heyzo.com%s' % match.group(0))
    elif key == 'thumbnail' or key == 'thumbnail_large':
        extracted = self._hxs.select(self._xpath_config[key]).extract()
        script = re.search(r'} else {[\s\S]*', extracted[0]) if extracted else None
        if script:
            if key == 'thumbnail':
                wanted = re.compile(r'thumbnail_\d+\.jpg')
            else:
                wanted = re.compile(r'\d+\.jpg')
            for candidate in re.findall(r'[thumbnail_]*\d+\.jpg', script.group(0)):
                match = wanted.match(candidate)
                if match:
                    image_urls.append(gallery_url % (title_id, match.group(0)))
    elif key in self.keys():
        pass
    else:
        raise KeyError(key)

    if image_urls:
        return [{'name': get_image_name(image_url), 'original_url': image_url}
                for image_url in image_urls]
    return None
def __getitem__(self, key): if key == 'name': str_list = self._hxs.select(self._xpath_config[key]).extract() return [s.strip() for s in str_list] elif key == 'name_index_hiragana' or key == 'name_index_katakana': str_list = self._hxs.select(self._xpath_config['name_index']).re(ur'「.」') if str_list: name_index = str_list[0].strip(u'「').strip(u'」').strip() if key == 'name_index_hiragana': return [i for i in itertools.repeat(cnvk.convert(name_index, cnvk.KATA2HIRA), len(self._hxs.select(self._xpath_config['name']).extract()))] elif key == 'name_index_katakana': return [i for i in itertools.repeat(cnvk.convert(name_index, cnvk.HIRA2KATA), len(self._hxs.select(self._xpath_config['name']).extract()))] else: raise KeyError(key) else: raise ValueError(str_list) elif key == 'image': #image_urls = self._hxs.select(self._xpath_config[key]).extract() image_urls = ['http://my.tokyo-hot.com%s' % image_url for image_url in self._hxs.select(self._xpath_config[key]).extract()] #return [{'name': get_image_name(image_url), # 'original_url': 'http://my.tokyo-hot.com%s' % image_url} for image_url in image_urls] return [{'name': get_image_name(image_url), 'original_url': image_url} for image_url in image_urls] else: raise KeyError(key)
def __getitem__(self, key):
    """Return image records for ``key``, never an empty result.

    URLs are built relative to the response URL with a trailing
    ``index.html`` removed.

    * ``player``: ``images/l_l.jpg``.
    * ``sample`` / ``sample_large``: when the XPath extracts anything, the
      twelve generated file names ``g_t0NN.jpg`` / ``g_big0NN.jpg``.
    * ``face`` / ``thumbnail`` / ``thumbnail_large``: the ``images/...``
      path matched inside each extracted value.

    Returns:
        A list of ``{'name': ..., 'original_url': ...}`` dicts.  When no URL
        is found, a single record with a dummy name and an empty
        ``original_url`` is returned.

    Raises:
        KeyError: for any other key.
    """
    page_url = self._response.url
    suffix = 'index.html'
    # str.rstrip() strips a *set of characters*, not a suffix, so the file
    # name is removed explicitly to avoid eating into the path.
    base_url = page_url[:-len(suffix)] if page_url.endswith(suffix) else page_url

    if key == 'player':
        image_urls = ['%simages/l_l.jpg' % base_url]
    elif key == 'sample' or key == 'sample_large':
        image_urls = self._hxs.select(self._xpath_config[key]).extract()
        if image_urls:
            # The extracted values are only used as a presence check; the
            # file names themselves follow a fixed numbering 01..12.
            name_format = 'g_t0%02d.jpg' if key == 'sample' else 'g_big0%02d.jpg'
            image_urls = ['%simages/%s' % (base_url, name_format % num)
                          for num in range(1, 13)]
    elif key == 'face' or key == 'thumbnail' or key == 'thumbnail_large':
        patterns = {
            'face': r'images/n\.jpg',
            'thumbnail': r'images/\d_\d_s\.gif',
            'thumbnail_large': r'images/\d_\d\.jpg',
        }
        pattern = re.compile(patterns[key])
        image_urls = []
        for value in self._hxs.select(self._xpath_config[key]).extract():
            match = pattern.search(value)
            if match:  # ignore values that do not contain the expected path
                image_urls.append('%s%s' % (base_url, match.group(0)))
    else:
        raise KeyError(key)

    if not image_urls:
        # Return a dummy file name with an empty original_url: a 'name' is
        # needed for a record to be created in the DB.
        placeholder_url = '%s/%s' % (self._response.url, key)
        return [{'name': get_image_name(placeholder_url), 'original_url': ''}]
    return [{'name': get_image_name(image_url), 'original_url': image_url}
            for image_url in image_urls]
def __getitem__(self, key):
    """Return image records for ``key``.

    Only ``player`` yields images (the URLs extracted by its XPath).  Any
    other key in ``self.keys()`` yields ``None``.

    Returns:
        A list of ``{"name": ..., "original_url": ...}`` dicts, or ``None``
        when there are no URLs.

    Raises:
        KeyError: if ``key`` is not ``player`` and not in ``self.keys()``.
    """
    if key != "player":
        if key not in self.keys():
            raise KeyError(key)
        return None

    player_urls = list(self._hxs.select(self._xpath_config[key]).extract())
    if not player_urls:
        return None

    records = []
    for url in player_urls:
        records.append({"name": get_image_name(url), "original_url": url})
    return records
def __getitem__(self, key):
    """Return image records for ``key``.

    * ``face`` / ``thumbnail``: the URLs extracted by the key's XPath.
    * ``player``: the ``player.jpg`` URL found inside the first extracted
      node, if any.
    * any other key in ``self.keys()``: no images.

    Returns:
        A list of ``{'name': ..., 'original_url': ...}`` dicts, or ``None``
        when nothing was found.

    Raises:
        KeyError: if ``key`` is not handled and not in ``self.keys()``.
    """
    image_urls = []
    if key == 'face' or key == 'thumbnail':
        image_urls = self._hxs.select(self._xpath_config[key]).extract()
    elif key == 'player':
        # Evaluate the XPath once and reuse the result.
        extracted = self._hxs.select(self._xpath_config[key]).extract()
        if extracted:
            pattern = re.compile(
                r'http://.*\.enkou55.com/images/title/\d{2}/\d{2}/\d{2}/player.jpg')
            match = pattern.search(extracted[0])
            if match:  # a node without the player URL simply yields nothing
                image_urls = [match.group()]
    elif key in self.keys():
        pass
    else:
        raise KeyError(key)

    if image_urls:
        return [{'name': get_image_name(image_url), 'original_url': image_url}
                for image_url in image_urls]
    return None
def __getitem__(self, key):
    """Return the scraped value list for ``key``.

    * ``name``: for every extracted ``name_index`` label, the names selected
      by the XPath at the same position in ``self._xpath_config['name']``
      are stripped and collected.  As a side effect the matching hiragana
      and katakana index lists (one entry per name) are stored through
      ``self['name_index_hiragana']`` and ``self['name_index_katakana']``.
    * ``name_index_hiragana`` / ``name_index_katakana``: nothing is computed
      here and ``None`` is returned.
      NOTE(review): presumably callers read the values stored by the
      ``name`` lookup some other way -- confirm.
    * ``image``: ``{'name': ..., 'original_url': ...}`` dicts with the site
      root prepended to each extracted path.

    Raises:
        KeyError: for any other key.
        TypeError: propagated unchanged from ``cnvk.convert``.
    """
    if key == 'name':
        name_list = []
        name_index_hiragana_list = []
        name_index_katakana_list = []
        # Extract the index labels once instead of re-running the XPath on
        # every loop iteration.
        name_indices = self._hxs.select(self._xpath_config['name_index']).extract()
        for position, name_index in enumerate(name_indices):
            str_list = self._hxs.select(self._xpath_config[key][position]).extract()
            count = len(str_list)
            name_index_hiragana_list += [cnvk.convert(name_index, cnvk.KATA2HIRA)] * count
            name_index_katakana_list += [cnvk.convert(name_index, cnvk.HIRA2KATA)] * count
            name_list += [s.strip() for s in str_list]
        self['name_index_hiragana'] = name_index_hiragana_list
        self['name_index_katakana'] = name_index_katakana_list
        return name_list
    elif key == 'name_index_hiragana' or key == 'name_index_katakana':
        return None
    elif key == 'image':
        image_urls = ['http://www.caribbeancom.com%s' % image_url
                      for image_url in self._hxs.select(self._xpath_config[key]).extract()]
        return [{'name': get_image_name(image_url), 'original_url': image_url}
                for image_url in image_urls]
    else:
        raise KeyError(key)
def __getitem__(self, key):
    """Return image records for ``key`` on a caribbeancom.com title page.

    * ``player``: ``images/l_l.jpg`` relative to the response URL with a
      trailing ``index.html`` removed.
    * ``sample``: every extracted path, prefixed with the site root.
    * ``sample_large``: if the first extracted path contains ``g_big`` (old
      design) every path is used with ``/member`` removed; otherwise (new
      design) at most the first five paths are used.
    * any other key in ``self.keys()``: no images.

    Returns:
        A list of ``{'name': ..., 'original_url': ...}`` dicts, or ``None``
        when nothing was found.

    Raises:
        KeyError: if ``key`` is not handled and not in ``self.keys()``.
    """
    page_url = self._response.url
    suffix = 'index.html'
    # str.rstrip() strips a *set of characters*, not a suffix, so the file
    # name is removed explicitly to avoid eating into the path.
    base_url = page_url[:-len(suffix)] if page_url.endswith(suffix) else page_url

    image_urls = []
    if key == 'player':
        image_urls = ['%simages/l_l.jpg' % base_url]
    elif key == 'sample':
        image_urls = ['http://www.caribbeancom.com%s' % image_url
                      for image_url in self._hxs.select(self._xpath_config[key]).extract()]
    elif key == 'sample_large':
        value_list = self._hxs.select(self._xpath_config[key]).extract()
        if value_list:
            if 'g_big' in value_list[0]:
                # old design
                image_urls = ['http://www.caribbeancom.com%s' % re.sub(r'/member', '', image_url)
                              for image_url in value_list]
            else:
                # new design: slicing tolerates pages with fewer than five
                # entries, where indexing 0..4 would raise IndexError.
                image_urls = ['http://www.caribbeancom.com%s' % image_url
                              for image_url in value_list[:5]]
    elif key in self.keys():
        pass
    else:
        raise KeyError(key)

    if image_urls:
        return [{'name': get_image_name(image_url), 'original_url': image_url}
                for image_url in image_urls]
    return None
def __getitem__(self, key):
    """Return image records for ``key``.

    * ``face`` / ``player`` / ``sample``: every extracted path, prefixed
      with the site root.
    * ``thumbnail``: every ``/video/<id>/<id>_<n>.jpg`` path found inside
      the extracted values, where ``<id>`` is the ``videocode`` query
      parameter of the response URL.
    * any other key in ``self.keys()``: no images.

    Returns:
        A list of ``{'name': ..., 'original_url': ...}`` dicts, or ``None``
        when nothing was found.

    Raises:
        KeyError: if ``key`` is not handled and not in ``self.keys()``.
    """
    site_root = 'http://www.onacle.tv'

    if key in ('face', 'player', 'sample'):
        found_urls = ['%s%s' % (site_root, path)
                      for path in self._hxs.select(self._xpath_config[key]).extract()]
    elif key == 'thumbnail':
        video_code = parse_qs(urlparse(self._response.url).query)['videocode'][0]
        thumb_re = re.compile(r'/video/%s/%s_\d+\.jpg' % (video_code, video_code))
        found_urls = []
        if self._hxs.select(self._xpath_config[key]).extract():
            matched_paths = []
            for fragment in self._hxs.select(self._xpath_config[key]).extract():
                matched_paths.extend(thumb_re.findall(fragment))
            found_urls = ['%s%s' % (site_root, path) for path in matched_paths]
    elif key in self.keys():
        found_urls = []
    else:
        raise KeyError(key)

    if not found_urls:
        return None
    records = []
    for url in found_urls:
        records.append({'name': get_image_name(url), 'original_url': url})
    return records
def __getitem__(self, key):
    """Return image records for ``key``.

    * ``face`` / ``thumbnail``: the URLs extracted by the key's XPath.
    * ``player``: a fixed URL built from the integer title id taken from
      the last path segment of the response URL.
    * ``sample`` / ``sample_large`` / ``thumbnail_large``: no images.

    Returns:
        A list of ``{'name': ..., 'original_url': ...}`` dicts, or ``None``
        when there are no URLs.

    Raises:
        KeyError: for any other key.
    """
    if key in ('sample', 'sample_large', 'thumbnail_large'):
        # These keys are accepted but never produce any image.
        return None

    if key in ('face', 'thumbnail'):
        found_urls = self._hxs.select(self._xpath_config[key]).extract()
    elif key == 'player':
        last_segment = self._response.url.split('/')[-1]
        title_id = int(last_segment.strip('.html'))
        found_urls = ['http://image.xxx-av.com/image/%d/movie_main.jpg' % title_id]
    else:
        raise KeyError

    if not found_urls:
        return None
    records = []
    for url in found_urls:
        records.append({'name': get_image_name(url), 'original_url': url})
    return records
def __getitem__(self, key):
    """Return the scraped value list for ``key``.

    ``self._xpath_config['name']`` is a sequence of XPaths; results are
    concatenated in that order.

    * ``name``: the first whitespace-separated token of every extracted
      entry.
    * ``name_hiragana`` / ``name_katakana``: for entries made of exactly two
      tokens, the second token converted with ``cnvk``; ``None`` otherwise.
    * ``name_index_hiragana`` / ``name_index_katakana``: each extracted
      ``name_index`` label converted with ``cnvk`` and repeated once per
      name selected by the XPath at the same position.
    * ``image``: ``{'name': ..., 'original_url': ...}`` dicts.

    Raises:
        KeyError: for any other key.
        TypeError: if the index conversion raises ``TypeError``.
    """
    if key == 'name':
        first_tokens = []
        for xpath in self._xpath_config[key]:
            for entry in self._hxs.select(xpath).extract():
                first_tokens.append(entry.split()[0].strip())
        return first_tokens

    if key in ('name_hiragana', 'name_katakana'):
        if key == 'name_hiragana':
            table = cnvk.KATA2HIRA
        else:
            table = cnvk.HIRA2KATA
        kana_names = []
        for xpath in self._xpath_config['name']:
            for entry in self._hxs.select(xpath).extract():
                tokens = entry.split()
                if len(tokens) == 2:
                    kana_names.append(cnvk.convert(tokens[1].strip(), table))
                else:
                    kana_names.append(None)
        return kana_names

    if key in ('name_index_hiragana', 'name_index_katakana'):
        if key == 'name_index_hiragana':
            table = cnvk.KATA2HIRA
        else:
            table = cnvk.HIRA2KATA
        kana_indices = []
        index_count = len(self._hxs.select(self._xpath_config['name_index']))
        for position in range(index_count):
            group_names = self._hxs.select(self._xpath_config['name'][position]).extract()
            index_label = self._hxs.select(self._xpath_config['name_index']).extract()[position]
            try:
                converted = cnvk.convert(index_label, table)
            except TypeError:
                raise TypeError(type(kana_indices))
            kana_indices.extend([converted] * len(group_names))
        return kana_indices

    if key == 'image':
        records = []
        for url in self._hxs.select(self._xpath_config[key]).extract():
            records.append({'name': get_image_name(url), 'original_url': url})
        return records

    raise KeyError(key)
def __getitem__(self, key):
    """Return the scraped value list for ``key``.

    * ``name``: the extracted name strings, stripped.
    * ``name_index_hiragana`` / ``name_index_katakana``: the stripped first
      ``name_index`` node converted with ``cnvk`` and repeated once per
      extracted name; a list of ``None`` of the same length when no index
      node is extracted.
    * ``image``: ``{'name': ..., 'original_url': ...}`` dicts.

    Raises:
        KeyError: for any other key.
    """
    if key == 'name':
        raw_names = self._hxs.select(self._xpath_config[key]).extract()
        return [raw.strip() for raw in raw_names]

    if key in ('name_index_hiragana', 'name_index_katakana'):
        index_nodes = self._hxs.select(self._xpath_config['name_index']).extract()
        if not index_nodes:
            # No index node on the page (e.g. kana=other): pad with None.
            name_count = len(self._hxs.select(self._xpath_config['name']).extract())
            return [None] * name_count
        index_label = index_nodes[0].strip()
        if key == 'name_index_hiragana':
            table = cnvk.KATA2HIRA
        else:
            table = cnvk.HIRA2KATA
        converted = cnvk.convert(index_label, table)
        name_count = len(self._hxs.select(self._xpath_config['name']).extract())
        return [converted] * name_count

    if key == 'image':
        records = []
        for url in self._hxs.select(self._xpath_config[key]).extract():
            records.append({'name': get_image_name(url), 'original_url': url})
        return records

    raise KeyError(key)
def __getitem__(self, key):
    """Return the scraped value list for ``key``.

    * ``name``: the extracted name strings, stripped.
    * ``name_index_hiragana`` / ``name_index_katakana``: for each extracted
      index label, the section of the raw response body from its ``<dt>``
      up to the next label's ``<dt>`` (or ``</dl>`` for the last label) is
      located; the label, converted with ``cnvk``, is repeated once per
      ``alt="..." />`` occurrence found in that section.
    * ``image``: ``{'name': ..., 'original_url': ...}`` dicts with the site
      root prepended to each extracted path.

    Raises:
        KeyError: for any other key.
        TypeError: if the index conversion raises ``TypeError``.
        IndexError: if a label's section is not found in the body.
    """
    if key == 'name':
        raw_names = self._hxs.select(self._xpath_config[key]).extract()
        return [raw.strip() for raw in raw_names]

    if key in ('name_index_hiragana', 'name_index_katakana'):
        if key == 'name_index_hiragana':
            table = cnvk.KATA2HIRA
        else:
            table = cnvk.HIRA2KATA
        kana_indices = []
        labels = self._hxs.select(self._xpath_config['name_index']).extract()
        last_position = len(labels) - 1
        for position, label in enumerate(labels):
            if position == last_position:
                section_re = re.compile(
                    r'<dt>%s</dt>[\S\s]*</dl>' % label.encode('utf-8'))
            else:
                section_re = re.compile(
                    r'<dt>%s</dt>[\S\s]*<dt>%s</dt>'
                    % (label.encode('utf-8'), labels[position + 1].encode('utf-8')))
            section = section_re.findall(self._response._body)[0]
            actresses = [re.sub('alt="|" />', '', tag)
                         for tag in re.findall(r'alt=".* />', section)]
            try:
                converted = cnvk.convert(label, table)
            except TypeError:
                raise TypeError(type(kana_indices))
            kana_indices.extend([converted] * len(actresses))
        return kana_indices

    if key == 'image':
        records = []
        for path in self._hxs.select(self._xpath_config[key]).extract():
            absolute_url = 'http://www.heyzo.com%s' % path
            records.append({'name': get_image_name(absolute_url),
                            'original_url': absolute_url})
        return records

    raise KeyError(key)