def parse(self, response):
    for cell in response.xpath('//td'):
        try:
            links = extract_a(cell)
            next(links)            # discard the first link in the cell
            url, t = next(links)   # the second link carries the article
        except StopIteration:
            continue
        if not t or '=' not in url:
            continue
        a = get_article(url, t)
        if a is None:
            continue
        desc = ''.join(cell.xpath(desc_xp).extract()).strip()
        if desc:
            a['description'] = desc
        article_json(a)
        yield a
    if get_type(response.url) == 'mono':
        pn = pagen
    else:
        pn = '(//div[@class="paginationControl"])[1]'
    # Follow only the numeric pagination links.
    for url, t in extract_a(response.xpath(pn)):
        try:
            yield response.follow(url, meta={'page': int(t)})
        except ValueError:
            pass
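# None of the helpers used in this section (extract_a, extract_t, get_article,
# article_json, get_type, ...) are defined here. Minimal sketches of the three
# generic ones, inferred purely from their call sites, might look like the
# following; the real implementations may differ.
from urllib.parse import urlparse


def extract_a(selector):
    # Assumed contract: yield an (href, text) pair for every <a> under
    # `selector`; callers both iterate it and pull single links with next().
    for a in selector.xpath('.//a'):
        yield (a.xpath('@href').extract_first(),
               a.xpath('normalize-space(.)').extract_first())


def extract_t(selector, p='.//text()'):
    # Assumed contract: join and strip the text nodes addressed by `p`
    # (one caller overrides it with p='text()[2]', another passes
    # 'li/a/@href' positionally).
    return ''.join(selector.xpath(p).extract()).strip()


def get_type(url):
    # Assumed contract: the leading path segment ('mono', 'digital', ...)
    # is the video type the parsers branch on.
    return urlparse(url).path.strip('/').split('/')[0]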
def parse(self, response): for url, t in extract_a(response.xpath('//td[@class="num"]')): yield Request(response.urljoin(url), callback=self.thread_parse) for url, t in extract_a(response.xpath('(//div[@class="pg"])[1]')): try: yield Request(response.urljoin(url), meta={'page': int(t)}) except ValueError: pass
def parse(self, response): for section in response.xpath('//div[@class="d-sect"]')[2:-1]: sname = extract_t(section.xpath('p')) for url, t in extract_a(section): g = get_article(url) g['category'] = sname yield response.follow(url, meta={'genre': g}, callback=m_parse)
def parse(self, response):
    if get_type(response.url) == 'mono':
        mora = '(//td[@class="makerlist-box-t2" or @class="initial"])'
        xp = {
            'main': '//td[@class="w50"]',
            'name': './/a[@class="bold"]',
            'description': './/div[@class="maker-text"]',
        }
        subt = {
            'main': subt_main,
            'name': 'td/a',
            'description': '(td)[2]',
        }
        # Mono pages carry a secondary maker table; emit it separately.
        yield from makers(response, subt)
    else:
        mora = '(//ul[starts-with(@class,"d-mod")])[position()>1]'
        xp = {
            'main': '//div[@class="d-unit"]',
            'name': './/span[@class="d-ttllarge"]',
            'description': './/p',
        }
    g = response.meta.get('genre')
    yield from makers(response, xp, g)
    if g:
        # Genre-scoped crawls stop here; only the plain list follows the index.
        return
    for url, t in extract_a(response.xpath(mora)):
        yield response.follow(url)
def parse(self, response):
    v_type = get_type(response.url)
    for actress in response.css('div.act-box').xpath('.//li'):
        url, t = next(extract_a(actress))
        if v_type == 'digital':
            url = url[:-13]  # drop the fixed 13-character trailing segment
        a = get_article(url, t)
        if a is None:
            continue
        name, alias = alias_re.match(t).groups()
        if alias is not None:
            a['name'] = name
            a['alias'] = alias
        extra = actress.xpath('.//span/text()').extract()
        if extra:
            a['kana'], alias_kana = alias_re.match(extra[0]).groups()
            if alias_kana is not None:
                a['alias_kana'] = alias_kana
            try:
                a['count'] = int(extra[1].split(':')[1])
            except (IndexError, ValueError):
                pass
        a['image'] = actress.xpath('.//img/@src').extract_first()
        article_json(a)
        yield a
    for url, t in extract_a(response.xpath(pagen)):
        try:
            page = int(t)
            if page == 1:
                continue  # already on the first page
            yield response.follow(url, meta={'page': page})
        except ValueError:
            continue
    for url, t in extract_a(response.xpath(aiueo)):
        yield response.follow(url)
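# alias_re, pagen and aiueo are module-level constants not shown in this
# section. The call sites above expect alias_re.match(text).groups() to give
# (name, alias-or-None) -- e.g. from 'Name（Alias）' -- so a pattern along
# these lines would fit (assumption):
import re

alias_re = re.compile(r'(.+?)(?:（(.+)）)?$')
# pagen / aiueo are assumed to be XPath strings selecting the pagination
# block and the kana (あいうえお) index links respectively.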
def parse(self, response):
    v_type = get_type(response.url)
    desc = response.css('div.mg-b20.lh4')
    if v_type == 'mono':
        desc = desc.xpath('p')
    item = {
        'type': v_type,
        'url': response.url.split('?')[0],
        'title': extract_t(response.xpath('//h1')),
        'cover': response.xpath(cover_xp).extract_first(),
        'description': extract_t(desc),
    }
    urls = {}
    # Info table: one row per labelled field, dispatched through info_box.
    for row in response.xpath('//td[@class="nw"]/..'):
        info = extract_t(row.xpath('td'))[:-1]  # drop the trailing delimiter
        try:
            info, parser = info_box[info]
        except KeyError:
            continue
        if parser == 'PERFORMER':
            item.update(get_performers(row.xpath('td'), urls))
        elif parser is None:
            item[info] = extract_t(row.xpath('td[2]'))
        else:
            try:
                item[info] = parser(get_articles(row.xpath('td'), urls))
            except StopIteration:
                pass
    sample = response.xpath('//a[starts-with(@id,"sample-image")]/img')
    if sample:
        item['samples'] = len(sample)
        item['sample_link'] = sample.xpath('@src').extract_first()
    # Mutual links and the full performer list load via AJAX; recover the
    # endpoints from the inline scripts.
    m_l = response.xpath('//script[contains(.,"#mutual-link")]/text()')
    if m_l:
        m_l = response.urljoin(mutual_l.format(*m_l.re(r":\s*'(.*)',")))
        item['mutual'] = set(i[0] for i in extract_a(get_aux(m_l)))
    a_p = response.xpath('//script[contains(.,"#a_performer")]/text()')
    if a_p:
        a_p = response.urljoin(a_p.re_first(r"url: '(.*)',"))
        item.update(get_performers(get_aux(a_p), urls))
    # Crawl every article page discovered above.
    for url, a in urls.items():
        a['type'] = v_type
        yield response.follow(url, meta={'article': a}, callback=a_parse)
    item['JSON_FILENAME'] = JSON_FILENAME
    yield item
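# get_aux fetches the auxiliary AJAX endpoints discovered in the inline
# scripts above and returns something extract_a / get_performers can query.
# A blocking sketch under that assumption (the real code may well schedule a
# Scrapy Request instead of fetching synchronously):
import requests
from parsel import Selector


def get_aux(url):
    return Selector(text=requests.get(url).text)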
def get_articles(links, urls=None, only_id=True):
    for url, t in extract_a(links):
        a = get_article(url, t, _type=False)
        if a is None:
            continue
        if urls is not None and url not in urls:
            urls[url] = a
        if only_id:
            yield a['id']
        else:
            yield a['article'], a['id']
def get_articles(links, urls=None, only_id=True):
    for url, t in extract_a(links):
        if url.startswith('javascript:'):
            continue
        a = get_article(url, t)
        if a is None:
            continue
        if urls is not None and url not in urls:
            urls[url] = a
        if only_id:
            yield a['id']
        else:
            yield a['article'], a['id']
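# The two get_articles variants above appear to belong to different spider
# modules: the first passes _type=False through to get_article, the second
# instead skips 'javascript:' pseudo-links. Either way the `urls` dict doubles
# as a dedupe cache and a work queue -- the detail parsers drain it into
# follow-up requests after consuming the generator, roughly:
#
#     urls = {}
#     item['label'] = parser(get_articles(row.xpath('td'), urls))
#     ...
#     for url, a in urls.items():
#         yield response.follow(url, meta={'article': a}, callback=a_parse)
#
# ('label' is illustrative; the real field names come from info_box.)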
def makers(response, xp, genre=None):
    # pop() the container XPath so only per-field paths remain for the
    # update below.
    for mk in response.xpath(xp.pop('main')):
        url = next(extract_a(mk))[0]
        m = get_article(url)
        if m is None:
            continue
        if genre is not None:
            # Genre-scoped pass: record the membership only.
            m['genre'] = {genre['id']}
            yield m
            continue
        m.update({k: extract_t(mk.xpath(v)) for k, v in xp.items()})
        img = mk.xpath('.//img/@src').extract_first()
        if img:
            m['image'] = img
        article_json(m)
        yield m
def parse(self, response): if get_type(response.url) == 'mono': xp = '//div[@class="sect01"]' s_xp = 'table/@summary' else: xp = '//div[@class="d-area area-list"]' s_xp = 'div[@class="d-capt"]/text()' for section in response.xpath(xp)[1:]: sname = section.xpath(s_xp).extract_first() for url, t in extract_a(section): if url.startswith('#'): continue item = get_article(url, t) if item is None: continue item['category'] = sname article_json(item) yield item
def parse(self, response):
    p_type, pid = get_pid(response.url)
    if not pid:
        self.logger.warning('no pid in %s', response.url)
        return
    desc = response.xpath('//div[@class="title2"]/following-sibling::p')
    if p_type == 'PPV':
        desc = response.xpath('//ul[@class="review"]/li[1]')
    item = {
        'pid': pid,
        'type': p_type,
        'url': response.url,
        'title': extract_t(response.xpath('//h2')),
        'description': extract_t(desc),
    }
    vid = extract_t(response.xpath('//div[@class="top-title"]'))
    if vid:
        item['vid'] = vid.split(': ')[1]
    for src in response.xpath(cover_xp).extract():
        if 'imgs.aventertainments' in src:
            item['cover'] = src
            break
    urls = {}
    # Main info list: one <li> per labelled field, dispatched through info_box.
    for li in response.xpath(main_xp):
        info = extract_t(li.xpath('span') or li)
        try:
            info, parser = info_box[info[:-1]]
        except KeyError:
            continue
        if parser is None:
            item[info] = extract_t(li, p='text()[2]')
        else:
            try:
                i = parser(get_articles(li, urls))
            except StopIteration:
                i = None
            item[info] = i
    # The detail box can extend list-valued fields collected above.
    for details in response.xpath('//div[@id="detailbox"]'):
        info = extract_t(details.xpath('span'))
        try:
            info, parser = info_box[info[:-1]]
        except KeyError:
            continue
        if parser is not None:
            try:
                item[info] += parser(get_articles(details, urls))
            except StopIteration:
                pass
    try:
        item['keyword'] = sorted(set(item.pop('keyword')))
    except KeyError:
        pass
    sample = response.xpath('//div[@class="TabbedPanels"]//img')
    if sample:
        item['sample_link'] = sample.xpath('@src').extract_first()
    th = response.css('ul.thumbs')
    if th:
        item['gallery'] = tuple(extract_t(ul, 'li/a/@href') for ul in th)
    mutual = response.xpath('//div[@id="mini-tabs"]')
    if mutual:
        item['mutual'] = sorted(i[0] for i in extract_a(mutual))
    for url, a in urls.items():
        a['type'] = p_type
        yield response.follow(url, meta={'article': a}, callback=a_parse)
    item['JSON_FILENAME'] = JSON_FILENAME
    yield item
def studios(links):
    for url, t in extract_a(links):
        studio = get_article(url, t)
        if studio is None:
            # get_article may return None for unrecognized links.
            continue
        article_json(studio)
        yield studio
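# article_json persists each article dict as it is scraped. Given the
# JSON_FILENAME attached to the items above, an append-a-line-per-record
# writer is one plausible sketch (assumption -- the real sink is not shown):
import json


def article_json(article, filename=JSON_FILENAME):
    with open(filename, 'a', encoding='utf-8') as fp:
        # default=list lets set-valued fields such as 'genre' or 'mutual'
        # serialize as JSON arrays.
        json.dump(article, fp, ensure_ascii=False, default=list)
        fp.write('\n')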