def find_imgs(self, uri):
    """Download every ``<input type="image">`` picture on the page ``HOST + uri``.

    Each image is fetched with ``self.get``, then handed to ``save`` under a
    file name made of ``sha1(content)`` plus everything from the last ``.``
    of the image URL onwards.

    Args:
        uri: Path appended to ``HOST`` to build the page URL.

    Returns:
        A list of ``{'url': <image src>, 'hash': <saved file name>}`` dicts,
        in the order ``find_all`` yields the tags. Empty when the page has
        no usable image inputs.
    """
    url = HOST + uri
    # NOTE(review): no parser is named here, so the one used depends on
    # what is installed alongside BeautifulSoup -- confirm this is intended.
    soup = BeautifulSoup(self.get(url))
    img_list = []
    for tag in soup.find_all('input', type="image"):
        # An image input without a src has nothing to download; skip it
        # rather than letting a KeyError abort the whole page.
        img = tag.get('src')
        if not img:
            continue
        content = self.get(img)
        filename = sha1(content) + img[img.rfind('.'):]
        save(content, filename)
        img_list.append({
            'url': img,
            'hash': filename,
        })
    return img_list
def parse_imgs(self):
    """Collect the images and metadata of the answer page held in ``self.soup``.

    Every distinct image URL that ``PATTERN_IMG`` finds inside the answer
    body is fetched with ``self.get`` and passed to ``save`` under a name
    made of ``sha1(content)`` plus everything from the last ``.`` of the URL.

    Returns:
        A dict with the keys ``url``, ``agree_cnt``, ``a_link``, ``a_name``,
        ``r_time``, ``e_time``, ``comment_cnt``, ``imgs`` (saved file names,
        in first-seen order) and ``_id``. ``None`` when the page has no
        answer body (e.g. the answer was hidden) or the body has no images.
    """
    answer_soup = self.soup.find('div', class_='zm-editable-content clearfix')
    if answer_soup is None:
        # Without a body there is nothing to scan for images.
        return None

    # De-duplicate while keeping the order in which the URLs were matched,
    # so the resulting 'imgs' list is deterministic.
    seen = set()
    imgs = []
    for img in PATTERN_IMG.findall(unicode(answer_soup)):
        if img not in seen:
            seen.add(img)
            imgs.append(img)
    if not imgs:
        return None

    answer = {
        'url': ZHIHU_URL + self.soup.find('div', class_='zm-item-rich-text js-collapse-body')['data-entry-url'],
        'agree_cnt': 0,
        'a_link': '',
        'a_name': u'匿名用户',
        'r_time': '',
        'e_time': '',
        'comment_cnt': '',
        'imgs': [],
        '_id': '',
    }

    # NOTE(review): trytry() presumably swallows exceptions raised in its
    # body, leaving the default value in place -- confirm against its
    # definition.
    with trytry():
        count = self.soup.find('span', class_='count').getText().strip().lower()
        if count.endswith('k'):
            # Abbreviated counts may carry a decimal part ("1.2k"), so scale
            # numerically instead of appending zeros to the text.
            answer['agree_cnt'] = int(round(float(count[:-1]) * 1000))
        else:
            answer['agree_cnt'] = int(count)

    for img in imgs:
        content = self.get(img, timeout=120)
        filename = sha1(content) + img[img.rfind('.'):]
        save(content, filename)
        answer['imgs'].append(filename)

    # The author block may be absent; keep the defaults already stored in
    # 'a_link' / 'a_name' in that case instead of failing on None.
    author = self.soup.find('div', class_='zm-item-answer-author-info')
    author_link = None
    if author is not None:
        author_link = author.find('a', class_='author-link')
    if author_link:
        answer['a_link'] = ZHIHU_URL + author_link['href']
        answer['a_name'] = author_link.getText().strip()

    with trytry():
        answer['r_time'], answer['e_time'] = self.parse_edit_time(
            self.soup.find('a', class_='answer-date-link'))

    with trytry():
        comment = self.soup.find('a', class_=' meta-item toggle-comment').getText().strip()
        if comment != u'添加评论':
            # Drop the last three characters of the link text (presumably a
            # unit suffix after the number) and keep what precedes them.
            answer['comment_cnt'] = comment[:-3].strip()

    answer['_id'] = answer['url'].replace(
        'https://www.zhihu.com/question/', '').replace('/answer/', '-')
    return answer
def parse_cat_tr(self, tr):
    """Turn one table row into a record dict, downloading its images.

    Args:
        tr: Row element; it must contain exactly five ``td`` cells.

    Returns:
        A dict with the keys ``_id``, ``category``, ``title``, ``img_count``,
        ``raw_path``, ``pub_date`` and ``images``. ``None`` when the row does
        not have five cells, its title does not match ``CATEGORY_PATTERN``,
        ``self.check_exists`` reports the ``_id`` as known, or
        ``self.find_imgs`` returns nothing.
    """
    cells = tr.find_all('td')
    if len(cells) != 5:
        return None

    # The title lives in the second cell; strip newlines and tabs from it.
    title = cells[1].getText().strip().replace('\n', '').replace('\t', '')
    matches = CATEGORY_PATTERN.findall(title)
    if not matches:
        return None

    # Only the first match is used: (category, title, image count, ...).
    first = matches[0]
    record_id = sha1(title)
    data = {
        '_id': record_id,
        'category': self.mapping_category(first[0]),
        'title': first[1],
        'img_count': int(first[2]),
        'raw_path': cells[1].find('a')['href'],
        'pub_date': cells[2].find('div', class_='f10').getText(),
    }

    # Skip rows already recorded before doing any image downloads.
    if self.check_exists(record_id):
        return None

    imgs = self.find_imgs(data['raw_path'])
    if imgs:
        data['images'] = imgs
        return data
    return None