Example #1
0
 def parse_item(self, response):
     """Build a web-graph item describing one crawled page.

     The returned item stores the page URL under 'node' and, under 'edge',
     the list of outgoing links that stay below the crawled site prefix.

     Each href is resolved against the page URL *before* it is compared
     with the prefix, so relative links such as "page.html" are kept when
     they point inside the site. Hrefs that carry their own non-matching
     scheme (for example "javascript:" pseudo-links) are left unchanged by
     the join and therefore never match the prefix.
     """
     site_prefix = "http://perso.ens-lyon.fr/baptiste.roziere/"
     hxs = HtmlXPathSelector(response)

     item = webgraphItem()
     item['node'] = response.url

     # Debug trace of the page currently being processed.
     print("#######################")
     print(response.url)
     print("#######################")

     edges = []
     for href in hxs.select('//a/@href').extract():
         # Resolve first: filtering the raw href would drop relative links.
         target = urljoin_rfc(response.url, href)
         if target.startswith(site_prefix):
             edges.append(target)
     item['edge'] = edges
     return item
Example #2
0
 def parse_item(self, response):
     """Build a web-graph item describing one crawled page.

     The returned item stores the page URL under 'node' and, under 'edge',
     the list of distinct outgoing links that stay on the crawled site,
     in the order they first appear in the page.

     Each href is resolved against the page URL *before* it is filtered
     and de-duplicated, so relative links are kept when they point inside
     the site, and a relative and an absolute spelling of the same target
     produce a single edge.
     """
     site_prefix = "http://www.cdiscount.com"
     hxs = HtmlXPathSelector(response)

     item = webgraphItem()
     item['node'] = response.url

     # Debug trace of the page currently being processed.
     print("#######################")
     print(response.url)
     print("#######################")

     edges = []
     seen = set()
     for href in hxs.select('//a/@href').extract():
         # Resolve first: filtering the raw href would drop relative links.
         target = urljoin_rfc(response.url, href)
         if not target.startswith(site_prefix) or target in seen:
             continue
         seen.add(target)
         edges.append(target)
     item['edge'] = edges
     return item