Beispiel #1
0
 def find_imgs(self, uri):
     """Download every ``<input type="image">`` picture found on a page.

     The page at ``HOST + uri`` is fetched with ``self.get`` and parsed
     with BeautifulSoup. Each image is downloaded, handed to ``save`` under
     a name built from ``sha1(content)`` plus the extension of the image
     URL's path, and recorded in the result.

     :param uri: path appended to ``HOST`` to form the page URL.
     :return: list of ``{'url': <image src>, 'hash': <stored filename>}``
         dicts, one per downloaded image, in the order ``find_all``
         yields the inputs; empty when no input carries a ``src``.
     """
     # Local imports keep the method self-contained; the fallback covers
     # Python 2, where ``urlsplit`` lives in the ``urlparse`` module.
     import posixpath
     try:
         from urllib.parse import urlsplit
     except ImportError:
         from urlparse import urlsplit

     url = HOST + uri
     soup = BeautifulSoup(self.get(url))
     img_list = []
     for tag in soup.find_all('input', type="image"):
         img = tag.get('src')
         if not img:
             # An image input without a source has nothing to download.
             continue
         content = self.get(img)
         # Derive the extension from the URL *path* only. Slicing from the
         # last '.' of the whole URL would return host/path fragments
         # (e.g. '.com/img') for extension-less images and would drag any
         # query string into the stored filename.
         extension = posixpath.splitext(urlsplit(img).path)[1]
         filename = sha1(content) + extension
         save(content, filename)
         img_list.append({
             'url': img,
             'hash': filename,
         })
     return img_list
Beispiel #2
0
 def find_imgs(self, uri):
     """Download every ``<input type="image">`` picture found on a page.

     The page at ``HOST + uri`` is fetched with ``self.get`` and parsed
     with BeautifulSoup. Each image is downloaded, handed to ``save`` under
     a name built from ``sha1(content)`` plus the extension of the image
     URL's path, and recorded in the result.

     :param uri: path appended to ``HOST`` to form the page URL.
     :return: list of ``{'url': <image src>, 'hash': <stored filename>}``
         dicts, one per downloaded image, in the order ``find_all``
         yields the inputs; empty when no input carries a ``src``.
     """
     # Local imports keep the method self-contained; the fallback covers
     # Python 2, where ``urlsplit`` lives in the ``urlparse`` module.
     import posixpath
     try:
         from urllib.parse import urlsplit
     except ImportError:
         from urlparse import urlsplit

     url = HOST + uri
     soup = BeautifulSoup(self.get(url))
     img_list = []
     for tag in soup.find_all('input', type="image"):
         img = tag.get('src')
         if not img:
             # An image input without a source has nothing to download.
             continue
         content = self.get(img)
         # Derive the extension from the URL *path* only. Slicing from the
         # last '.' of the whole URL would return host/path fragments
         # (e.g. '.com/img') for extension-less images and would drag any
         # query string into the stored filename.
         extension = posixpath.splitext(urlsplit(img).path)[1]
         filename = sha1(content) + extension
         save(content, filename)
         img_list.append({
             'url': img,
             'hash': filename,
         })
     return img_list
Beispiel #3
0
    def parse_imgs(self):
        """Collect the images and metadata of the answer held in ``self.soup``.

        Image URLs are pulled out of the answer body (the ``div`` whose class
        is ``zm-editable-content clearfix``) with ``PATTERN_IMG``. Every
        distinct URL is downloaded through ``self.get`` and handed to
        ``save`` under a name built from ``sha1(content)`` plus the URL's
        trailing extension.

        :return: ``None`` when the answer body is missing or holds no image;
            otherwise a dict with the keys ``url``, ``agree_cnt``,
            ``a_link``, ``a_name``, ``r_time``, ``e_time``, ``comment_cnt``,
            ``imgs`` (list of stored filenames) and ``_id``.
        """
        answer_soup = self.soup.find('div', class_='zm-editable-content clearfix')
        if not answer_soup:
            # No answer body means no images. Previously only pages carrying
            # the moderation notice returned here; every other page fell
            # through and regex-scanned the text u'None', which ends in the
            # same ``None`` result as long as PATTERN_IMG does not match that
            # literal (assumed - it is an image URL pattern).
            return None
        imgs = list(set(PATTERN_IMG.findall(unicode(answer_soup))))
        if not imgs:
            return None
        answer = {
            'url': ZHIHU_URL + self.soup.find('div', class_='zm-item-rich-text js-collapse-body')['data-entry-url'],
            'agree_cnt': 0, 'a_link': '', 'a_name': u'匿名用户',
            'r_time': '', 'e_time': '', 'comment_cnt': '', 'imgs': [], '_id': '',
        }
        # NOTE(review): ``trytry`` is defined elsewhere; it is used here as a
        # best-effort guard and presumably suppresses exceptions, leaving the
        # defaults above in place - confirm.
        with trytry():
            count = self.soup.find('span', class_='count').getText().strip().lower()
            if count.endswith('k'):
                # Abbreviated totals ("12k", "1.2k"). Appending '000' to the
                # digits turned "1.2k" into "1.2000", which int() rejects, so
                # scale numerically instead; round() absorbs float error.
                answer['agree_cnt'] = int(round(float(count[:-1]) * 1000))
            else:
                answer['agree_cnt'] = int(count)

        for img in imgs:
            content = self.get(img, timeout=120)
            filename = sha1(content) + img[img.rfind('.'):]
            save(content, filename)
            answer['imgs'].append(filename)

        author = self.soup.find('div', class_='zm-item-answer-author-info')
        # Without an author block (or without a profile link inside it) the
        # anonymous defaults set above are kept.
        author_link = None
        if author is not None:
            author_link = author.find('a', class_='author-link')
        if author_link:
            answer['a_link'] = ZHIHU_URL + author_link['href']
            answer['a_name'] = author_link.getText().strip()

        with trytry():
            answer['r_time'], answer['e_time'] = self.parse_edit_time(self.soup.find('a', class_='answer-date-link'))

        with trytry():
            comment = self.soup.find('a', class_=' meta-item toggle-comment').getText().strip()
            if comment != u'添加评论':
                # Drop the three-character suffix after the number.
                answer['comment_cnt'] = comment[:-3].strip()
        # "<question id>-<answer id>", derived from the answer URL.
        answer['_id'] = answer['url'].replace('https://www.zhihu.com/question/', '').replace('/answer/', '-')
        return answer
Beispiel #4
0
 def parse_cat_tr(self, tr):
     """Build a record (with downloaded images) from one listing table row.

     The second cell supplies the title text and the link, the third cell
     the publication date. The title is split with ``CATEGORY_PATTERN``
     into category, display title and image count.

     :param tr: row element exposing ``find_all('td')``.
     :return: dict with ``_id``, ``category``, ``title``, ``img_count``,
         ``raw_path``, ``pub_date`` and ``images``; ``None`` when the row
         does not have exactly five cells, the title does not match
         ``CATEGORY_PATTERN``, ``check_exists`` reports the id, or
         ``find_imgs`` returns nothing.
     """
     cells = tr.find_all('td')
     if len(cells) != 5:
         return None
     title_cell, date_cell = cells[1], cells[2]

     # Collapse the title onto one line before matching.
     stripped = title_cell.getText().strip()
     title = stripped.replace('\n', '').replace('\t', '')
     matches = CATEGORY_PATTERN.findall(title)
     if not matches:
         return None
     parts = matches[0]

     record_id = sha1(title)
     category = self.mapping_category(parts[0])
     img_count = int(parts[2])
     raw_path = title_cell.find('a')['href']
     pub_date = date_cell.find('div', class_='f10').getText()
     record = {
         '_id': record_id,
         'category': category,
         'title': parts[1],
         'img_count': img_count,
         'raw_path': raw_path,
         'pub_date': pub_date,
     }

     # Skip rows already known before downloading anything.
     if self.check_exists(record_id):
         return None
     images = self.find_imgs(raw_path)
     if images:
         record['images'] = images
         return record
     return None
Beispiel #5
0
 def parse_cat_tr(self, tr):
     """Build a record (with downloaded images) from one listing table row.

     The second cell supplies the title text and the link, the third cell
     the publication date. The title is split with ``CATEGORY_PATTERN``
     into category, display title and image count.

     :param tr: row element exposing ``find_all('td')``.
     :return: dict with ``_id``, ``category``, ``title``, ``img_count``,
         ``raw_path``, ``pub_date`` and ``images``; ``None`` when the row
         does not have exactly five cells, the title does not match
         ``CATEGORY_PATTERN``, ``check_exists`` reports the id, or
         ``find_imgs`` returns nothing.
     """
     cells = tr.find_all('td')
     if len(cells) != 5:
         return None
     title_cell, date_cell = cells[1], cells[2]

     # Collapse the title onto one line before matching.
     stripped = title_cell.getText().strip()
     title = stripped.replace('\n', '').replace('\t', '')
     matches = CATEGORY_PATTERN.findall(title)
     if not matches:
         return None
     parts = matches[0]

     record_id = sha1(title)
     category = self.mapping_category(parts[0])
     img_count = int(parts[2])
     raw_path = title_cell.find('a')['href']
     pub_date = date_cell.find('div', class_='f10').getText()
     record = {
         '_id': record_id,
         'category': category,
         'title': parts[1],
         'img_count': img_count,
         'raw_path': raw_path,
         'pub_date': pub_date,
     }

     # Skip rows already known before downloading anything.
     if self.check_exists(record_id):
         return None
     images = self.find_imgs(raw_path)
     if images:
         record['images'] = images
         return record
     return None