def get_content(self):
    """Fetch the article JSON at ``self.url`` and store it as an Evernote note.

    The ``content`` HTML of the response is parsed, its images are replaced
    through ``self.change_img``, and the resulting markup is passed to
    ``EvernoteMethod.makeNote`` together with the title (newlines removed)
    and the ``zhuanlan.zhihu.com`` URL built from the response ``id``.

    Returns:
        None.
    """
    logger.info("get_content url %s", self.url)
    r = requests.get(self.url, headers=self.headers)
    logger.info("get_content res %s ", r.text)
    res_json = r.json()

    # dict.get() only applies its default when the key is absent; a JSON
    # null still yields None, so fall back explicitly.
    content = res_json.get('content') or ''
    title = res_json.get('title') or ''
    article_id = res_json.get('id', '')

    soup = BeautifulSoup(content, "html5lib")
    # The result is discarded: presumably remove_attrs cleans each tag as a
    # side effect while being evaluated as the filter -- TODO confirm.
    soup.find_all(self.remove_attrs)
    # html5lib wraps the fragment in <html>/<head>/<body>; drop the wrappers
    # so only the fragment itself is serialized.
    soup.html.unwrap()
    soup.head.unwrap()
    soup.body.unwrap()

    note_url = 'https://zhuanlan.zhihu.com/p/%s' % article_id
    resources = self.change_img(soup)

    # Note titles must be a single line.
    title = title.replace('\n', '')
    logger.info("note_url %s", note_url)
    logger.info("title %s", title)

    EvernoteMethod.makeNote(
        self.noteStore,
        title.encode('utf8'),
        str(soup),
        note_url,
        resources,
        self.parent_note,
    )
def change_img(self, soup):
    """Replace each ``<img>`` that has a source URL with an ``<en-media>`` tag.

    The image URLs are handed to ``EvernoteMethod.getRemoteRes``; every
    matching ``<img>`` is then replaced in place by an ``<en-media>`` tag
    whose ``type`` is the resource's ``mime`` and whose ``hash`` is the hex
    form of the resource's ``data.bodyHash``.

    Args:
        soup: Parsed document; modified in place.

    Returns:
        The value returned by ``EvernoteMethod.getRemoteRes``.
    """
    # Tag.get() rather than tag['src']: an <img> with no src attribute would
    # otherwise raise KeyError. Filtering once keeps URLs and tags aligned.
    sourced_imgs = [img for img in soup.find_all("img") if img.get('src')]
    resources = EvernoteMethod.getRemoteRes(
        [img['src'] for img in sourced_imgs])

    # Assumes getRemoteRes returns one resource per URL, in the same
    # order -- TODO confirm against EvernoteMethod.
    for index, img in enumerate(sourced_imgs):
        resource = resources[index]
        hexhash = binascii.hexlify(resource.data.bodyHash)
        # hexlify returns bytes on Python 3; store text so the attribute is
        # serialized as the bare hex digest.
        if not isinstance(hexhash, str):
            hexhash = hexhash.decode('ascii')

        media_tag = soup.new_tag('en-media')
        media_tag['type'] = resource.mime
        media_tag['hash'] = hexhash
        img.replace_with(media_tag)

    return resources
def get_content(self):
    """Fetch the answer JSON at ``self.url``, save it as a note, and mark it done.

    The ``content`` HTML of the response is parsed, its images are replaced
    through ``self.change_img``, and the markup is passed to
    ``EvernoteMethod.makeNote`` with the question title (newlines removed)
    and the question/answer URL. Afterwards the ``CollectionQueue`` row whose
    ``api_url`` equals ``self.url`` (if any) is flagged as collected with the
    current time and the note's ``guid``, and the SQS message identified by
    ``self.receipt_handle`` is deleted.

    Returns:
        None.
    """
    r = requests.get(self.url, headers=self.headers)
    print(self.url)
    print(r.text)
    res_json = r.json()

    # dict.get() only applies its default when the key is absent; a JSON
    # null still yields None, so fall back explicitly.
    content = res_json.get('content') or ''
    question = res_json.get('question') or {}
    title = question.get('title') or ''
    question_id = question.get('id', '')
    answer_id = res_json.get('id', '')

    soup = BeautifulSoup(content, "html5lib")
    # The result is discarded: presumably remove_attrs cleans each tag as a
    # side effect while being evaluated as the filter -- TODO confirm.
    soup.find_all(self.remove_attrs)
    # html5lib wraps the fragment in <html>/<head>/<body>; drop the wrappers
    # so only the fragment itself is serialized.
    soup.html.unwrap()
    soup.head.unwrap()
    soup.body.unwrap()

    note_url = 'http://www.zhihu.com/question/%s/answer/%s' % (
        question_id, answer_id)
    resources = self.change_img(soup)

    # Note titles must be a single line.
    title = title.replace('\n', '')
    print("note_url %s" % note_url)
    print("title %s" % title)

    note = EvernoteMethod.makeNote(
        self.noteStore,
        title.encode('utf8'),
        str(soup),
        note_url,
        resources,
        self.parent_note,
    )

    session = create_session()
    try:
        find_queue = session.query(CollectionQueue).filter(
            CollectionQueue.api_url == self.url).first()
        if find_queue:
            find_queue.is_collected = 1
            find_queue.collected_time = int(time.time())
            find_queue.note_guid = note.guid
            session.commit()
        # NOTE(review): the message is acknowledged whether or not a queue
        # row was found, because the note has already been created at this
        # point -- confirm this matches the intended retry policy.
        sqs_conn.delete_message_from_handle(zhihufav_sqs, self.receipt_handle)
    finally:
        # Always release the session, even if the query or commit fails.
        session.close()