Example #1
File: hnspider.py Project: smchui/hncrawl
    def parse(self, response):
        # needs: import hashlib, os; from scrapy import Request;
        # from bs4 import BeautifulSoup as bs; plus the NewsItem item class
        if 'news.ycombinator.com' in response.url:
            soup = bs(response.body, 'html.parser')
            # collect (title, href) pairs from every td.title cell that
            # contains a child tag; filter(None, ...) drops empty cells
            items = [(x[0].text, x[0].get('href'))
                     for x in filter(None, [
                         x.findChildren()
                         for x in soup.findAll('td', {'class': 'title'})
                     ])]

            for item in items:
                print(item)
                news_item = NewsItem()
                news_item['title'] = item[0]
                news_item['url'] = item[1]
                # Request() raises ValueError for a scheme-less (relative)
                # link, so fall back to an absolute HN URL
                try:
                    yield Request(item[1], callback=self.parse)
                except ValueError:
                    yield Request('http://news.ycombinator.com/' + item[1],
                                  callback=self.parse)

                yield news_item

        else:
            # cache the fetched page under a SHA-1 of its URL
            sha1_response = hashlib.sha1(response.url.encode('utf-8')).hexdigest()
            folder = PATH + '/' + sha1_response
            if not os.path.exists(folder):
                os.makedirs(folder)
            with open(folder + '/index.html', 'wb') as file_obj:
                file_obj.write(response.body)
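The extraction pattern above also runs standalone, outside Scrapy. A minimal sketch, assuming BeautifulSoup 4 (where findChildren is a compatibility alias of find_all); the HTML string is invented for illustration:

from bs4 import BeautifulSoup

html = """
<table>
  <tr><td class="title"><a href="http://example.com/story">Story title</a></td></tr>
  <tr><td class="title"><a href="item?id=42">Relative link</a></td></tr>
  <tr><td class="title">no link here</td></tr>
</table>
"""
soup = BeautifulSoup(html, 'html.parser')
# findChildren() returns every descendant tag; cells without child
# tags yield [] and are dropped by filter(None, ...)
items = [(x[0].text, x[0].get('href'))
         for x in filter(None, [td.findChildren()
                                for td in soup.findAll('td', {'class': 'title'})])]
print(items)
# [('Story title', 'http://example.com/story'), ('Relative link', 'item?id=42')]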
Example #2
File: hnspider.py Project: Ivansek/hncrawl
  def parse(self, response):
    if 'news.ycombinator.com' in response.url:
      soup = bs(response.body, 'html.parser')
      # same (title, href) extraction as example #1
      items = [(x[0].text, x[0].get('href')) for x in
        filter(None, [
          x.findChildren() for x in
            soup.findAll('td', {'class': 'title'})
        ])]

      for item in items:
        print(item)
        news_item = NewsItem()
        news_item['title'] = item[0]
        news_item['url'] = item[1]
        # relative hrefs make Request() raise ValueError; retry as absolute
        try:
          yield Request(item[1], callback=self.parse)
        except ValueError:
          yield Request('http://news.ycombinator.com/' + item[1], callback=self.parse)

        yield news_item

    else:
      # store the page body under PATH<sha1-of-url>/html
      sha1_response = hashlib.sha1(response.url.encode('utf-8')).hexdigest()
      if not os.path.exists(PATH + sha1_response):
        os.makedirs(PATH + sha1_response)
      with open(PATH + sha1_response + '/html', 'wb') as file_obj:
        file_obj.write(response.body)
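Both hncrawl variants lean on scrapy.Request raising ValueError when given a URL without a scheme. A sketch of that fallback as a standalone helper (the function name is ours, not the projects'):

from scrapy import Request

def hn_request(href, callback):
    # Request() raises ValueError ("Missing scheme in request url")
    # for relative links such as 'item?id=42'
    try:
        return Request(href, callback=callback)
    except ValueError:
        return Request('http://news.ycombinator.com/' + href, callback=callback)

In current Scrapy the usual idiom is response.urljoin(href), which resolves relative links against the page's own URL without needing the try/except.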
Example #3
def praseRefer(content):

	soup = bs(content, 'html.parser')

	# each div.ebookLst_s block holds div.con entries whose second
	# child tag is the product link
	for item in soup.findAll("div", {"class": "ebookLst_s"}):
		for ii in item.findAll("div", {"class": "con"}):
			jj = ii.findChildren()

			link = jj[1].get('href')
			# the product id is the query value at the end of the link
			product_id = link.replace("http://product.dangdang.com/product.aspx?product_id=", '')
			name = jj[1].text
			try:
				insert(name, product_id)
			except Exception:
				# ignore duplicate rows and other insert failures
				pass
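The replace() call only works while the link matches that exact prefix. A more defensive sketch, parsing the query string instead (the helper name is illustrative):

from urllib.parse import urlparse, parse_qs

def product_id_from_link(link):
    # e.g. 'http://product.dangdang.com/product.aspx?product_id=12345' -> '12345'
    values = parse_qs(urlparse(link).query).get('product_id')
    return values[0] if values else None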
Example #4
  def parse(self, response):
    filename = response.url.split("/")[-2]
    soup = bs(response.body, 'html.parser')

    # ul.actions -> first descendant tag -> its first descendant tag
    # is the raw-file link on a repository page
    raw_links = [x.findChildren()[0].findChildren()[0].get('href') for x in
      soup.findAll('ul', {'class': 'actions'})
    ]
    if raw_links:
      item = AliasScrapeItem()
      item['raw_url'] = 'http://github.com' + raw_links[0]
      item['repo_url'] = response.url
      yield item

    else:
      # listing page: follow every h2.title link to a repository
      children = soup.findAll('h2', {'class': 'title'})
      children = [x.findChildren()[0].get('href') for x in children]
      #open(filename, 'wb').write(response.body)

      for url in children:
        yield Request('http://github.com' + url, callback=self.parse)
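The chained findChildren()[0] calls assume one exact level of nesting and raise IndexError on an empty ul.actions. With BeautifulSoup 4 the same href can also be pulled via a CSS selector; a sketch on invented markup:

from bs4 import BeautifulSoup

html = '<ul class="actions"><li><a href="/user/repo/raw/master">raw</a></li></ul>'
soup = BeautifulSoup(html, 'html.parser')

# as in the example above: ul -> first descendant tag -> its first descendant tag
raw_links = [ul.findChildren()[0].findChildren()[0].get('href')
             for ul in soup.findAll('ul', {'class': 'actions'})]
# equivalent selector form, assuming the same <ul><li><a> nesting
raw_links_css = [a.get('href') for a in soup.select('ul.actions > li > a')]
print(raw_links, raw_links_css)  # ['/user/repo/raw/master'] ['/user/repo/raw/master']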