def next_depth(self, response):
    """Record a finished fetch: harvest the page's outgoing links,
    invoke the completion callback, and mark the URL(s) as crawled.

    response -- fetch result exposing .body, .url, .effective_url
                (post-redirect URL) and .args.
    """
    page_url = response.effective_url
    # Queue every discovered link; remember (link, referer, title)
    # only for links that inject_url actually accepted.
    for href, anchor_text in URL.link_title(response.body, page_url):
        if self.inject_url(href, response.args):
            self.link_title_db.add(href, page_url, anchor_text)
    if callable(self.callback):
        self.callback(response)
    # 2 == "fully crawled"; when a redirect occurred, mark both the
    # final and the originally requested URL so neither is re-fetched.
    self.crawled[page_url] = 2
    if page_url != response.url:
        self.crawled[response.url] = 2
    self.referer = page_url
def parser(self, html, sp, homepage):
    """Extract outbound blog/homepage links from a profile page.

    html     -- raw page body; falsy input short-circuits to None.
    sp       -- service-provider key: 'baidu', 'sohu', '163', or any
                other value for generic anchor extraction.
    homepage -- the page's own URL; excluded from the results.

    Returns a de-duplicated list of normalized URLs, or None when the
    page (or a required follow-up request, for 'baidu') yields nothing.
    """
    if not html:
        return None
    links = []
    if sp == 'baidu':
        # The friend list sits behind an API endpoint keyed on the
        # encoded user name embedded in the profile page.
        username = re.findall(r'nameEnc: "([^^].*?)"', html)
        if not username:
            return None
        link = 'http://frd.baidu.com/api/friend.getlist?un=%s' % username[0]
        mario = Mario()
        response = mario.get(link)
        if not response or not response.body:
            return None
        names = re.findall(
            r'\["([^^].*?)","[^^].*?","[^^].*?","[^^].*?",\d+,"[^^].*?",\d+,\d+\]',
            response.body)
        if not names:
            return None
        bsp = BSP()
        for n in names:
            u = bsp.normalize('http://hi.baidu.com/sys/checkuser/%s' % n)
            if u and u[1] != homepage and u[1] not in links:
                # BUG FIX: was links.append(u). Every other branch
                # collects the normalized URL (index 1), and the guard
                # above tests 'u[1] not in links' -- appending the whole
                # tuple defeated de-duplication and returned a list
                # mixing tuples with strings.
                links.append(u[1])
    elif sp == 'sohu':
        bsp = BSP()
        for url in re.findall(r'"link" : "([^^].*?)"', html, re.I):
            r = bsp.normalize(url)
            if r and r[1] != homepage and r[1] not in links:
                links.append(r[1])
    elif sp == '163':
        # NOTE(review): this pattern is not valid regex -- '"******"'
        # is a "multiple repeat" and re will raise re.error at call
        # time. It appears the original capture group was scrubbed
        # (e.g. by credential masking); restore from upstream source.
        # Kept byte-identical because the true pattern is unknowable
        # from here.
        usernames = re.findall('.userName="******"', html)
        bsp = BSP()
        for u in usernames:
            if not u:
                continue
            # NOTE(review): 'http:%s...' lacks '//' -- verify valid163
            # really expects this exact form before "fixing" it.
            link = bsp.valid163(u, 'http:%s.blog.163.com/' % u, '163')
            if link and link[1] and link[1] not in links:
                links.append(link[1])
    else:
        # Generic fallback: normalize every anchor found in the page.
        bsp = BSP()
        for href, title in URL.link_title(html, homepage):
            if not href:
                continue
            r = bsp.normalize(href)
            if r and r[1] != homepage and r[1] not in links:
                links.append(r[1])
    return links