예제 #1
0
 def next_depth(self, response):
     """Feed the links of a fetched page back into the crawl and mark it done.

     Every (link, title) pair that ``URL.link_title`` yields for the response
     body is offered to ``self.inject_url``; accepted links are recorded in
     ``self.link_title_db`` together with the page they were found on.
     Afterwards the optional ``self.callback`` is invoked with the response,
     the response URLs are flagged in ``self.crawled`` with the value 2, and
     ``self.referer`` is set to the effective URL.
     """
     # NOTE: storing the crawled URL in lightcloud is disabled:
     #with_timeout(1, self.lightcloud.set, LightCloud.crawled_url_key(response.effective_url), response.url, timeout_value=None)
     source_url = response.effective_url
     for link, title in URL.link_title(response.body, source_url):
         accepted = self.inject_url(link, response.args)
         if accepted:
             self.link_title_db.add(link, source_url, title)

     callback = self.callback
     if callable(callback):
         callback(response)

     # Re-read the URLs after the callback has run, then flag both the
     # effective URL and (when it differs) the originally requested one.
     final_url = response.effective_url
     requested_url = response.url
     self.crawled[final_url] = 2
     if final_url != requested_url:
         self.crawled[requested_url] = 2
     self.referer = final_url
예제 #2
0
 def parser(self, html, sp, homepage):
     """Extract outgoing blog links from ``html`` for service provider ``sp``.

     The extraction strategy depends on ``sp``:

     * ``'baidu'``: read the ``nameEnc`` value from the page, fetch that
       user's friend list over HTTP (via ``Mario``) and build one
       check-user URL per friend name.
     * ``'sohu'``: collect every ``"link" : "..."`` value in the page.
     * ``'163'``: collect every ``userName="..."`` value and validate the
       matching blog URL through ``BSP.valid163``.
     * anything else: walk the links yielded by ``URL.link_title``.

     Each candidate is passed through ``BSP``; element ``[1]`` of the result
     (presumably the normalized URL -- confirm against ``BSP``) is what gets
     stored. Except in the ``'163'`` branch, links equal to ``homepage`` are
     skipped. Duplicates are dropped while preserving discovery order.

     Args:
         html: page source to scan.
         sp: service provider key selecting the extraction strategy.
         homepage: URL of the blog being parsed.

     Returns:
         A list of link strings, or ``None`` when ``html`` is empty or, for
         ``'baidu'``, when the user name, the friend-list response or the
         friend names cannot be obtained.
     """
     if not html:
         return None
     links = []
     if sp == 'baidu':
         username = re.findall(r'nameEnc: "([^^].*?)"', html)
         if not username:
             return None
         link = 'http://frd.baidu.com/api/friend.getlist?un=%s' % username[0]
         response = Mario().get(link)
         if not response or not response.body:
             return None
         names = re.findall(
             r'\["([^^].*?)","[^^].*?","[^^].*?","[^^].*?",\d+,"[^^].*?",\d+,\d+\]',
             response.body)
         if not names:
             return None
         bsp = BSP()
         for n in names:
             u = bsp.normalize('http://hi.baidu.com/sys/checkuser/%s' % n)
             if u and u[1] != homepage and u[1] not in links:
                 # Store the URL itself, like every other branch does, so
                 # that the ``u[1] not in links`` check can actually dedupe.
                 links.append(u[1])
     elif sp == 'sohu':
         urls = re.findall(r'"link" : "([^^].*?)"', html, re.I)
         bsp = BSP()
         for url in urls:
             r = bsp.normalize(url)
             if r and r[1] != homepage and r[1] not in links:
                 links.append(r[1])
     elif sp == '163':
         # NOTE(review): the capture group follows the same form as the
         # other branches' patterns -- confirm against real 163 pages.
         usernames = re.findall(r'.userName="([^^].*?)"', html)
         bsp = BSP()
         for u in usernames:
             if not u:
                 continue
             link = bsp.valid163(u, 'http://%s.blog.163.com/' % u, '163')
             if link and link[1] and link[1] not in links:
                 links.append(link[1])
     else:
         bsp = BSP()
         for link, title in URL.link_title(html, homepage):
             if not link:
                 continue
             r = bsp.normalize(link)
             if r and r[1] != homepage and r[1] not in links:
                 links.append(r[1])
     return links