import hashlib
import os

from bs4 import BeautifulSoup as bs
from scrapy.http import Request

# NewsItem (see sketch below) and PATH, the base directory for mirrored
# pages, are assumed to be defined elsewhere in the project.

def parse(self, response):
    if 'news.ycombinator.com' in response.url:
        soup = bs(response.body, 'html.parser')
        # Each 'title' cell's first descendant is the story link;
        # collect (text, href) pairs.
        items = [(x[0].text, x[0].get('href'))
                 for x in filter(None, [x.findChildren()
                                        for x in soup.findAll('td', {'class': 'title'})])]
        for item in items:
            print(item)
            news_item = NewsItem()
            news_item['title'] = item[0]
            news_item['url'] = item[1]
            try:
                # Request() raises ValueError for scheme-less (relative) hrefs.
                yield Request(item[1], callback=self.parse)
            except ValueError:
                yield Request('http://news.ycombinator.com/' + item[1],
                              callback=self.parse)
            yield news_item
    else:
        # Mirror any non-HN page under a directory named by the URL's SHA-1.
        sha1_response = hashlib.sha1(response.url.encode('utf-8')).hexdigest()
        folder = PATH + '/' + sha1_response
        if not os.path.exists(folder):
            os.makedirs(folder)
        with open(folder + '/index.html', 'wb') as file_obj:
            file_obj.write(response.body)
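# A minimal sketch of the NewsItem used above, assuming a plain scrapy.Item
# whose fields match the two keys the spider fills in; the project's actual
# items.py is not part of this snippet.
import scrapy

class NewsItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()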
def praseRefer(content):
    soup = bs(content, 'html.parser')
    for item in soup.findAll("div", {"class": "ebookLst_s"}):
        for ii in item.findAll("div", {"class": "con"}):
            jj = ii.findChildren()
            # The second descendant is the product link; strip the URL
            # prefix to recover the Dangdang product id.
            link = jj[1].get('href')
            product_id = link.replace(
                "http://product.dangdang.com/product.aspx?product_id=", '')
            name = jj[1].text
            try:
                insert(name, product_id)
            except Exception:
                # Ignore rows that fail to insert (e.g. duplicates).
                pass
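# A hedged sketch of the insert() helper called above, assuming the scraped
# (name, product_id) pairs land in a local SQLite table; the real storage
# backend is not shown, so the database file, table, and column names here
# are hypothetical.
import sqlite3

def insert(name, product_id):
    conn = sqlite3.connect('dangdang.db')
    try:
        conn.execute(
            "INSERT INTO products (name, product_id) VALUES (?, ?)",
            (name, product_id))
        conn.commit()
    finally:
        conn.close()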
def parse(self, response):
    filename = response.url.split("/")[-2]
    soup = bs(response.body, 'html.parser')
    # On a repository page, the 'actions' list holds the download link.
    raw_links = [x.findChildren()[0].findChildren()[0].get('href')
                 for x in soup.findAll('ul', {'class': 'actions'})]
    if raw_links:
        item = AliasScrapeItem()
        item['raw_url'] = 'http://github.com' + raw_links[0]
        item['repo_url'] = response.url
        yield item
    else:
        # On a listing page, follow each repository title link and recurse.
        children = soup.findAll('h2', {'class': 'title'})
        children = [x.findChildren()[0].get('href') for x in children]
        # open(filename, 'wb').write(response.body)
        for url in children:
            yield Request('http://github.com' + url, callback=self.parse)
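# A minimal sketch of the AliasScrapeItem yielded above, assuming a plain
# scrapy.Item whose two fields match what parse() fills in; the project's
# actual item definition is not part of this snippet.
import scrapy

class AliasScrapeItem(scrapy.Item):
    raw_url = scrapy.Field()
    repo_url = scrapy.Field()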