def padmag(url):
    """Yield one record per content image found on the page at ``url``.

    The page is fetched and parsed, the ``<title>`` text is split on
    ``' - '`` to build the tag list, and a description is assembled from the
    paragraphs under ``div.content``. Only images whose ``src`` starts with
    ``http://www.padmag.cn/wp-content/`` produce a record; images without a
    ``src`` attribute are skipped.

    Args:
        url: Address of the page to fetch.

    Yields:
        Tuples ``(title, tags, url, msg, src)`` where ``msg`` is the
        description prefixed with a 1-based counter of the accepted images.
    """
    page = fetch(url)
    dom = html.fromstring(page)
    title = sel('title')(dom)[0].text.strip()
    # Built as a real list (not a lazy map object) because the very same
    # ``tags`` object is yielded with every record.
    tags = [esc_tag(part) for part in title.split(' - ')]

    def extract_desc():
        """Join paragraph lead text and kept link targets, one per line."""
        skipped_markers = ('attachment_id=', 'tag=', 'cat=')

        def exclude(href):
            # Links carrying any of these query markers are left out.
            return any(marker in href for marker in skipped_markers)

        pieces = []
        for p in sel('div.content p')(dom):
            if p.text:
                pieces.append(p.text)
            for a in sel('a')(p):
                # .get() instead of [] so anchors without an href are
                # skipped rather than raising KeyError.
                href = a.attrib.get('href')
                if href and not exclude(href):
                    pieces.append(href)
        stripped = (piece.strip() for piece in pieces)
        return '\n'.join(piece for piece in stripped if piece)

    desc = extract_desc()
    wanted_prefix = 'http://www.padmag.cn/wp-content/'
    count = 0
    for img in sel('div.content img')(dom):
        src = img.attrib.get('src')
        if not src or not src.startswith(wanted_prefix):
            continue
        # Number only the images that are actually emitted.
        count += 1
        msg = '(%d): %s' % (count, desc)
        yield title, tags, url, msg, src
def leica(url):
    """Yield one record per ``p img.insertimage`` image on the page at ``url``.

    The stripped ``<title>`` text is cut right after the first closing
    corner bracket; both pieces plus a fixed magazine name are passed through
    ``esc_tag`` to form the tag list.

    Args:
        url: Address of the page to fetch.

    Yields:
        Tuples ``(title, tags, url, msg, src)`` where ``msg`` is the title
        prefixed with the 1-based position of the image in brackets.
    """
    page = fetch(url)
    dom = html.fromstring(page)
    title = sel('title')(dom)[0].text.strip()
    # When the bracket is absent find() returns -1, so the first piece is
    # empty and the second piece is the whole title.
    pos = title.find(u'』')
    tag_sources = [title[:pos+1], title[pos+1:], u'Leica中文摄影杂志']
    # Built as a real list (not a lazy map object) because the very same
    # ``tags`` object is yielded with every record.
    tags = [esc_tag(part) for part in tag_sources]
    for number, img in enumerate(sel('p img.insertimage')(dom), 1):
        src = img.attrib['src']
        msg = '[%d]%s' % (number, title)
        yield title, tags, url, msg, src
def extract_desc():
    """Build a newline-separated description from ``div.content`` paragraphs.

    ``dom`` is not a parameter; it is read from the surrounding scope. For
    each paragraph the leading text (if any) is collected, followed by the
    ``href`` of every link inside it, except links whose target contains
    ``attachment_id=``, ``tag=`` or ``cat=``.

    Returns:
        The collected pieces, stripped of surrounding whitespace, with empty
        pieces dropped, joined by ``'\\n'``.
    """
    skipped_markers = ('attachment_id=', 'tag=', 'cat=')

    def exclude(href):
        # Links carrying any of these query markers are left out.
        return any(marker in href for marker in skipped_markers)

    pieces = []
    for p in sel('div.content p')(dom):
        if p.text:
            pieces.append(p.text)
        for a in sel('a')(p):
            # .get() instead of [] so anchors without an href are skipped
            # rather than raising KeyError.
            href = a.attrib.get('href')
            if href and not exclude(href):
                pieces.append(href)
    stripped = (piece.strip() for piece in pieces)
    return '\n'.join(piece for piece in stripped if piece)
def wsj(url):
    """Yield one record per ``#sliderBox li`` slide on the page at ``url``.

    The ``<title>`` text is split on ``:``, ``-`` and ``_`` to build the tag
    list. For each slide the first ``img`` supplies the picture path and the
    first ``p`` supplies the caption text.

    Args:
        url: Address of the page to fetch.

    Yields:
        Tuples ``(title, tags, url, msg, img)`` where ``msg`` is
        ``'(position/total) caption'`` and ``img`` is the picture path with
        ``..`` segments removed, prefixed with ``http://cn.wsj.com/``.
    """
    page = fetch(url)
    dom = html.fromstring(page)
    title = sel('title')(dom)[0].text
    # Normalise every separator to '|' so a single split is enough.
    normalized = title.replace(u':', '|').replace('-', '|').replace('_', '|')
    # Built as a real list (not a lazy map object) because the very same
    # ``tags`` object is yielded with every record.
    tags = [esc_tag(part) for part in normalized.split('|')]
    items = sel('#sliderBox li')(dom)
    total = len(items)
    for position, li in enumerate(items, 1):
        img_el = sel('img')(li)[0]
        caption_el = sel('p')(li)[0]
        src = img_el.attrib['src']
        # Drop '..' segments so the relative path hangs off the site root.
        path = '/'.join([seg for seg in src.split('/') if seg != '..'])
        img = 'http://cn.wsj.com/%s' % path
        msg = '(%d/%d) %s' % (position, total, caption_el.text)
        yield title, tags, url, msg, img