def get(base, dst):
    """Walk every listing page built from ``base`` and run the staged pipeline.

    ``base`` is a ``%``-format template that receives the page number; page 1
    is inspected first to discover how many pages exist.  Each pager link is
    reduced to the integer found after the last ``-`` in the part of the href
    that precedes the first ``.``; the largest such integer (at least 1) is
    the page count.

    One ``{'dst': dst, 'url': <page url>}`` dict is built per page and pushed
    through ``t.reducer`` with ``mapper_product``, ``mapper_pg``,
    ``mapper_img`` and finally ``mapper_get``.  The dicts yielded by each of
    the first three stages are merged, and the merged values become the input
    of the next stage.

    Always returns ``True``.
    """

    def merge_stage(items, mapper):
        # Fold every mapping yielded by the reducer into one dict.
        combined = {}
        for chunk in t.reducer(items, mapper):
            combined.update(chunk)
        return combined

    pager_hrefs = t.exp(base % r'1').xpath(r'//*[@class = "pg"]//a/@href')
    page_numbers = [
        int(href.split(r'.')[0].split(r'-')[-1]) for href in pager_hrefs
    ]
    # A listing without pager links still has its first page.
    pcount = max([1] + page_numbers)

    productplist = [
        {r'dst': dst, r'url': base % page}
        for page in range(1, pcount + 1)
    ]

    threadinfos = merge_stage(productplist, mapper_product)
    pginfos = merge_stage(threadinfos.values(), mapper_pg)
    imginfos = merge_stage(pginfos.values(), mapper_img)

    # NOTE(review): the result of this last reducer call is discarded; if
    # t.reducer is lazy, mapper_get may never run -- confirm against t.reducer.
    t.reducer(imginfos.values(), mapper_get)
    return True
def query_pg_images(url):
    """Return absolute image URLs found under ``class="adw"`` on ``url``.

    Every ``img/@src`` below an element whose class is ``adw`` is joined
    against ``url`` with ``t.http_urljoin``.

    Returns:
        A list of joined URLs (possibly empty), or ``None`` if the lookup
        raised any ordinary exception.
    """
    try:
        sources = t.exp(url).xpath(r'//*[@class = "adw"]//img/@src')
        return [t.http_urljoin(url, src) for src in sources]
    except Exception:
        # Best-effort contract: callers receive None on failure.  Unlike the
        # previous ``finally: return``, KeyboardInterrupt and SystemExit are
        # no longer swallowed.
        return None
def query_pgs(url):
    """Return ``url`` followed by the pager links found on that page.

    Pager links are the ``href`` values of class-less ``<a>`` elements below
    an element whose class is ``pg``; each is joined against ``url`` with
    ``t.http_urljoin``.

    Returns:
        A list whose first item is ``url`` itself, or ``None`` if the lookup
        raised any ordinary exception.
    """
    try:
        hrefs = t.exp(url).xpath(r'//*[@class = "pg"]//a[not(@class)]/@href')
        return [url] + [t.http_urljoin(url, href) for href in hrefs]
    except Exception:
        # Best-effort contract: callers receive None on failure.  Unlike the
        # previous ``finally: return``, KeyboardInterrupt and SystemExit are
        # no longer swallowed.
        return None
def query_threadpages(url):
    """Return the URLs of all pages of the thread that starts at ``url``.

    The highest number among the link texts under ``div#pages`` is taken as
    the page count (at least 1).  Pages 2..N are addressed as ``"<n>.html"``
    joined against ``url`` with ``t.http_urljoin``.

    Returns:
        A list starting with ``url`` itself, or ``None`` if the lookup raised
        any ordinary exception.
    """
    try:
        labels = t.exp(url).xpath(r'//div[@id = "pages"]/a/text()')
        # isdecimal() matches exactly what int() can parse; isnumeric() also
        # accepts characters such as fractions and superscripts, which made
        # int() raise and aborted the whole lookup.
        numbers = [int(label) for label in labels if label.isdecimal()]
        maxp = max([1] + numbers)

        urls = [url]
        urls.extend(
            t.http_urljoin(url, r'%s.html' % page)
            for page in range(2, maxp + 1)
        )
        return urls
    except Exception:
        # Best-effort contract: callers receive None on failure.  Unlike the
        # previous ``finally: return``, KeyboardInterrupt and SystemExit are
        # no longer swallowed.
        return None
def query_product_page(url, dst):
    """Collect the thread descriptors listed on one product page.

    For every element whose class is ``group`` a dict is built with:

    * ``cover``   -- ``t.exps`` of the ``img/@src`` values below ``class="photo"``
    * ``url``     -- the first link below ``class="bution"``, joined against ``url``
    * ``subject`` -- ``"<fixsubject(link text)>[<digits of the thread url>]"``
    * ``dst``     -- the ``dst`` argument, passed through unchanged

    Returns:
        A dict mapping thread URL to its descriptor.  ``None`` is returned if
        the lookup raised any ordinary exception or if any descriptor field
        is empty.
    """
    try:
        threadinfos = {}
        for group in t.exp(url, r'utf-8').xpath(r'//*[@class = "group"]'):
            link = group.xpath(r'.//*[@class = "bution"]//a')[0]
            threadurl = t.http_urljoin(url, t.expa(link, r'href'))
            thread = {
                r'cover': t.exps(
                    group.xpath(r'.//*[@class = "photo"]//img/@src')),
                r'url': threadurl,
                # Strip every non-digit from the thread URL for the suffix.
                r'subject': r'%s[%s]' % (
                    fixsubject(t.expt(link)), re.sub(r'\D', r'', threadurl)),
                r'dst': dst,
            }
            # NOTE(review): one incomplete entry discards the whole page, as
            # in the original code -- confirm that skipping just this entry
            # is not what was intended.
            if not all(thread.values()):
                return None
            threadinfos[threadurl] = thread
        return threadinfos
    except Exception:
        # Best-effort contract: callers receive None on failure.  Unlike the
        # previous ``finally: return``, KeyboardInterrupt and SystemExit are
        # no longer swallowed.
        return None
def query_threadpageimages(url):
    """Return the ``src`` values of images directly under ``div.content``.

    The values are returned exactly as the XPath query yields them; they are
    not joined against ``url``.

    Returns:
        The XPath result, or ``None`` if the lookup raised any ordinary
        exception.
    """
    try:
        return t.exp(url).xpath(r'//div[@class = "content"]/img/@src')
    except Exception:
        # Best-effort contract: callers receive None on failure.  Unlike the
        # previous ``finally: return``, KeyboardInterrupt and SystemExit are
        # no longer swallowed.
        return None
def query_navpage_threadobjtbl(url, err=None):
    """Return the ``<li>`` nodes under ``div.hezi/ul`` of a navigation page.

    ``err`` is forwarded unchanged to ``t.exp``.  When ``t.exp`` returns
    ``None`` this function returns ``None`` as well; otherwise it returns
    the XPath result.
    """
    tree = t.exp(url, err=err)
    return None if tree is None else tree.xpath(r'//div[@class = "hezi"]/ul/li')
def query_product_total(url, err=None):
    """Return ``t.exps`` of the text under ``div.shoulushuliang/span``.

    ``err`` is forwarded unchanged to ``t.exp``.  When ``t.exp`` returns
    ``None`` this function returns ``None`` without querying anything.
    """
    tree = t.exp(url, err=err)
    if tree is not None:
        texts = tree.xpath(r'//div[@class = "shoulushuliang"]/span/text()')
        return t.exps(texts)
    return None