def parse(self):
    """Fetch the post page and fill in its title, links and images.

    Requests ``self.post.url`` and reads the body as UTF-8.  When the
    status is not 2xx, ``self.post`` is returned untouched.  Otherwise the
    title is taken from the ``post_tit scalable`` div and the post body
    from the ``post_ct scalable`` div, both looked up inside the
    ``board_view`` div.

    Links are the ``href`` values of anchors in the post body that contain
    ``http``, contain at least one dot, and do not contain a ``.jpg``,
    ``.png`` or ``.gif`` marker.  Images are downloaded from every ``img``
    whose ``src`` contains ``http``; downloads with a non-2xx status or an
    empty body are skipped.

    Returns:
        ``self.post``, with ``title``, ``links`` and ``images`` assigned
        when the page was fetched successfully.
    """
    response = self._get_response(self.post.url)
    response.encoding = 'utf-8'
    # Floor division keeps this a "2xx class" check even where ``/`` is
    # true division (201 / 100 == 2.01 would otherwise be rejected).
    if response.status_code // 100 != 2:
        return self.post

    doc = BeautifulSoup.BeautifulSoup(response.text)
    # NOTE(review): find() presumably returns None when an element is
    # missing, so a page with a different layout fails on the next
    # attribute access -- confirm callers expect that.
    board_view = doc.find('div', attrs={'class': 'board_view'})
    title_div = board_view.find('div', attrs={'class': 'post_tit scalable'})
    title = str(unicode(title_div.text.strip()))
    post = board_view.find('div', attrs={'class': 'post_ct scalable'})

    # links
    image_markers = ('.jpg', '.png', '.gif')
    links = []
    for a in post.findAll('a'):
        href = a.get('href')
        if not href or 'http' not in href:
            continue
        lower = href.lower()
        # Anchors pointing at image files are not reported as links.
        if any(marker in lower for marker in image_markers):
            continue
        if '.' not in href:
            continue
        links.append(href)

    # images
    images = []
    for img in post.findAll('img'):
        src = img.get('src')
        if not src or 'http' not in src:
            continue
        # Separate name so the page response above is not clobbered.
        image_response = self._get_response(src)
        if image_response.status_code // 100 != 2:
            continue
        if not image_response.content:
            continue
        images.append(attachment.Image(image_response.content))

    self.post.title = title
    self.post.links = links
    self.post.images = images
    return self.post
def parse(self):
    """Fetch the post page and fill in its title, files and images.

    Requests ``self.post.url`` and reads the body as EUC-KR.  When the
    status is not 2xx, ``self.post`` is returned untouched.  Otherwise the
    title is taken from the ``h1`` inside the ``mw_basic_view_subject``
    cell.

    Attachments come from the ``mw_basic_view_file`` cells: the path found
    between ``file_download('.`` and ``',`` in the anchor's ``onclick``
    attribute is appended to ``http://<self.hostname>/bbs``, and the file
    is downloaded only when the anchor text contains ``.torrent`` or
    ``.smi``.  Images come from every ``img`` inside the ``view_content``
    div; a ``src`` without an ``http://`` / ``https://`` marker has its
    ``..`` segments stripped and is prefixed with ``http://<self.hostname>``.
    Downloads with a non-2xx status or an empty body are skipped.

    Returns:
        ``self.post``, with ``title``, ``magnets`` (always an empty list
        here), ``files`` and ``images`` assigned when the page was fetched
        successfully.
    """
    response = self._get_response(self.post.url)
    response.encoding = 'euc-kr'
    # An error page does not carry the expected markup; hand the post back
    # unparsed instead of failing on the lookups below.  Floor division
    # keeps this a "2xx class" check even where ``/`` is true division.
    if response.status_code // 100 != 2:
        return self.post

    doc = BeautifulSoup.BeautifulSoup(response.text)
    subject = doc.find('td', attrs={'class': 'mw_basic_view_subject'})
    title = str(unicode(subject.find('h1').text.strip()))
    file_cells = doc.findAll('td', attrs={'class': 'mw_basic_view_file'})
    post = doc.find('div', attrs={'id': 'view_content'})

    magnets = []  # nothing in this parser collects magnets
    files = []
    images = []

    for cell in file_cells:
        a = cell.find('a')
        if a is None:
            # A file cell without an anchor has nothing to download.
            continue
        text = a.text
        onclick = a.get('onclick', None)
        if not onclick:
            continue
        # The download path is the first argument of file_download(...),
        # written relative to the current directory ("./...").
        href = onclick.split("file_download('.")[-1].split("',")[0]
        url = 'http://' + self.hostname + '/bbs' + href
        if '.torrent' in text or '.smi' in text:
            file_response = self._get_response(url)
            if file_response.status_code // 100 != 2 or not file_response.content:
                continue
            files.append(attachment.File(name=text, content=file_response.content))

    for img in post.findAll('img'):
        src = img.get('src')
        if not src:
            continue
        if 'http://' not in src and 'https://' not in src:
            # Relative source: drop the parent-directory segments and
            # resolve against the site root.
            src = src.replace('../..', '').replace('..', '')
            src = 'http://' + self.hostname + src
        image_response = self._get_response(src)
        if image_response.status_code // 100 != 2 or not image_response.content:
            continue
        images.append(attachment.Image(content=image_response.content))

    self.post.title = title
    self.post.magnets = magnets
    self.post.files = files
    self.post.images = images
    return self.post