Esempio n. 1
0
  def parse(self):
    """Fetch the post page and fill in the post's title, links and images.

    The page at ``self.post.url`` is requested, its response encoding is set
    to ``utf-8`` and the text is parsed with BeautifulSoup. The title is read
    from the ``post_tit scalable`` div and the body from the
    ``post_ct scalable`` div, both looked up inside the ``board_view`` div.

    Returns:
      ``self.post``. It is returned unmodified when the page request does not
      answer with a 2xx status or when the expected markup is missing;
      otherwise its ``title``, ``links`` and ``images`` attributes are set.
    """
    response = self._get_response(self.post.url)
    response.encoding = 'utf-8'
    # Floor division keeps the 2xx class check correct even where `/` is true
    # division (Python 3 or `from __future__ import division`).
    if response.status_code // 100 != 2:
      return self.post

    doc = BeautifulSoup.BeautifulSoup(response.text)
    # Compare against None explicitly: `find` returns None when nothing
    # matches, and dereferencing that would raise AttributeError.
    board_view = doc.find('div', attrs={'class': 'board_view'})
    if board_view is None:
      return self.post
    title_tag = board_view.find('div', attrs={'class': 'post_tit scalable'})
    body = board_view.find('div', attrs={'class': 'post_ct scalable'})
    if title_tag is None or body is None:
      return self.post

    # NOTE(review): under Python 2, str() encodes with the default codec, so a
    # non-ASCII title would raise UnicodeEncodeError unless the default
    # encoding is changed elsewhere in the project — confirm.
    title = str(unicode(title_tag.text.strip()))

    # links: keep hrefs containing 'http' that do not look like image files
    # and that contain at least one dot.
    links = []
    for a in body.findAll('a'):
      href = a.get('href')
      if not href or 'http' not in href:
        continue
      lower = href.lower()
      if any(ext in lower for ext in ('.jpg', '.png', '.gif')):
        continue
      if '.' not in href:
        continue
      links.append(href)

    # images: download every <img> whose src contains 'http'; skip the ones
    # that fail to download or come back empty.
    images = []
    for img in body.findAll('img'):
      src = img.get('src')
      if not src or 'http' not in src:
        continue
      # Separate name so the page-level `response` is not rebound here.
      image_response = self._get_response(src)
      if image_response.status_code // 100 != 2:
        continue
      if not image_response.content:
        continue
      images.append(attachment.Image(image_response.content))

    self.post.title = title
    self.post.links = links
    self.post.images = images
    return self.post
Esempio n. 2
0
  def parse(self):
    """Fetch the post page and fill in the post's title, files and images.

    The page at ``self.post.url`` is requested, its response encoding is set
    to ``euc-kr`` and the text is parsed with BeautifulSoup. The title comes
    from the ``h1`` inside the ``mw_basic_view_subject`` cell, attachments
    from the ``mw_basic_view_file`` cells and images from the
    ``view_content`` div.

    Returns:
      ``self.post``. It is returned unmodified when the page request does not
      answer with a 2xx status or when the title or content markup is
      missing; otherwise its ``title``, ``magnets``, ``files`` and ``images``
      attributes are set. ``magnets`` is always an empty list because nothing
      in this method collects magnet links.
    """
    response = self._get_response(self.post.url)
    response.encoding = 'euc-kr'
    # Floor division keeps the 2xx class check correct even where `/` is true
    # division (Python 3 or `from __future__ import division`).
    if response.status_code // 100 != 2:
      return self.post

    doc = BeautifulSoup.BeautifulSoup(response.text)
    # Compare against None explicitly: `find` returns None when nothing
    # matches, and dereferencing that would raise AttributeError.
    subject = doc.find('td', attrs={'class': 'mw_basic_view_subject'})
    heading = subject.find('h1') if subject is not None else None
    body = doc.find('div', attrs={'id': 'view_content'})
    if heading is None or body is None:
      return self.post

    # NOTE(review): under Python 2, str() encodes with the default codec, so a
    # non-ASCII title would raise UnicodeEncodeError unless the default
    # encoding is changed elsewhere in the project — confirm.
    title = str(unicode(heading.text.strip()))

    magnets = []
    files = []
    images = []

    # The download path is embedded in the anchor's onclick handler, after
    # this marker and before the closing "',".
    marker = "file_download('."
    for cell in doc.findAll('td', attrs={'class': 'mw_basic_view_file'}):
      a = cell.find('a')
      if a is None:
        continue
      onclick = a.get('onclick', None)
      # Without the marker the split below would return the whole handler
      # text and produce a meaningless URL, so skip such anchors.
      if not onclick or marker not in onclick:
        continue
      text = a.text
      # Only torrent and subtitle attachments are downloaded.
      if '.torrent' not in text and '.smi' not in text:
        continue

      href = onclick.split(marker)[-1].split("',")[0]
      url = 'http://' + self.hostname + '/bbs' + href
      file_response = self._get_response(url)
      if file_response.status_code // 100 != 2 or not file_response.content:
        continue
      files.append(attachment.File(name=text, content=file_response.content))

    for img in body.findAll('img'):
      src = img.get('src')
      if not src:
        continue
      if 'http://' not in src and 'https://' not in src:
        # No scheme in src: strip parent-directory segments and prefix the
        # site host.
        src = src.replace('../..', '').replace('..', '')
        src = 'http://' + self.hostname + src
      image_response = self._get_response(src)
      if image_response.status_code // 100 != 2 or not image_response.content:
        continue
      images.append(attachment.Image(content=image_response.content))

    self.post.title = title
    self.post.magnets = magnets
    self.post.files = files
    self.post.images = images
    return self.post