Exemple #1
0
    def get_content(self):
        """Fetch the article behind ``self.url`` and store it as an Evernote note.

        The JSON response's ``content`` HTML is parsed, its ``<img>`` tags are
        replaced through ``self.change_img``, and the resulting markup is
        passed to ``EvernoteMethod.makeNote`` together with the article title
        and its ``zhuanlan.zhihu.com`` URL.

        Raises:
            requests.exceptions.Timeout: if the server does not answer within
                the request timeout.
        """
        logger.info("get_content url %s" % self.url)
        # Bound the request so a stalled server cannot block the caller forever.
        r = requests.get(self.url, headers=self.headers, timeout=30)
        logger.info("get_content res %s " % r.text)
        res_json = r.json()
        content = res_json.get('content', '')
        soup = BeautifulSoup(content, "html5lib")
        # The result is discarded; presumably remove_attrs cleans each tag as a
        # side effect while acting as the filter -- confirm against its definition.
        soup.find_all(self.remove_attrs)
        # html5lib wraps the fragment in a full <html><head><body> document;
        # unwrap those so only the article markup remains.
        soup.html.unwrap()
        soup.head.unwrap()
        soup.body.unwrap()

        title = res_json.get('title', '')
        # question_id = res_json.get('question', {}).get('id', '')
        article_id = res_json.get('id', '')
        note_url = 'https://zhuanlan.zhihu.com/p/%s' % (article_id,)
        resources = self.change_img(soup)
        # Note titles are single-line: drop embedded newlines.
        title = title.replace('\n', '')
        logger.info("note_url %s" % note_url)
        logger.info("title %s" % title)
        html_content = str(soup)
        EvernoteMethod.makeNote(self.noteStore, title.encode('utf8'), html_content, note_url, resources, self.parent_note)
Exemple #2
0
 def change_img(self, soup):
     """Replace every ``<img>`` that has a source with an ``<en-media>`` tag.

     The image URLs are handed to ``EvernoteMethod.getRemoteRes``; the
     returned resources are assumed to be in the same order as the URLs
     (the tags are matched to them by position).

     Args:
         soup: parsed document; modified in place.

     Returns:
         The resources returned by ``EvernoteMethod.getRemoteRes``.
     """
     # Tag.get() returns None for a missing attribute, so <img> tags without
     # a src are skipped instead of raising KeyError.
     img_tags = [img for img in soup.find_all("img") if img.get('src')]
     img_arr = [img['src'] for img in img_tags]
     resources = EvernoteMethod.getRemoteRes(img_arr)
     for index, img in enumerate(img_tags):
         resource = resources[index]
         # hexlify() yields bytes on Python 3; decode so the attribute is
         # serialized as plain hex text rather than a bytes repr.
         hexhash = binascii.hexlify(resource.data.bodyHash).decode('ascii')
         new_tag = soup.new_tag('en-media')
         new_tag['type'] = resource.mime
         new_tag['hash'] = hexhash
         img.replace_with(new_tag)
     return resources
Exemple #3
0
    def get_content(self):
        """Fetch the answer behind ``self.url``, save it as a note, and mark it collected.

        The JSON response's ``content`` HTML is parsed, its ``<img>`` tags are
        replaced through ``self.change_img``, and the markup is passed to
        ``EvernoteMethod.makeNote`` with the question title and the answer URL.
        If a ``CollectionQueue`` row whose ``api_url`` equals ``self.url``
        exists, it is flagged as collected, stamped with the current time and
        the note's guid, committed, and the message for ``self.receipt_handle``
        is deleted from the queue.

        Raises:
            requests.exceptions.Timeout: if the server does not answer within
                the request timeout.
        """
        # Bound the request so a stalled server cannot block the caller forever.
        r = requests.get(self.url, headers=self.headers, timeout=30)
        print(self.url)
        print(r.text)
        res_json = r.json()
        content = res_json.get('content', '')
        soup = BeautifulSoup(content, "html5lib")
        # The result is discarded; presumably remove_attrs cleans each tag as a
        # side effect while acting as the filter -- confirm against its definition.
        soup.find_all(self.remove_attrs)
        # html5lib wraps the fragment in a full <html><head><body> document;
        # unwrap those so only the answer markup remains.
        soup.html.unwrap()
        soup.head.unwrap()
        soup.body.unwrap()

        question = res_json.get('question', {})
        title = question.get('title', '')
        question_id = question.get('id', '')
        answer_id = res_json.get('id', '')
        note_url = 'http://www.zhihu.com/question/%s/answer/%s' % (question_id, answer_id)
        resources = self.change_img(soup)
        # Note titles are single-line: drop embedded newlines.
        title = title.replace('\n', '')
        print("note_url %s" % note_url)
        print("title %s" % title)
        html_content = str(soup)
        note = EvernoteMethod.makeNote(self.noteStore, title.encode('utf8'), html_content, note_url, resources, self.parent_note)
        session = create_session()
        try:
            find_queue = session.query(CollectionQueue).filter(CollectionQueue.api_url == self.url).first()
            if find_queue:
                find_queue.is_collected = 1
                find_queue.collected_time = int(time.time())
                find_queue.note_guid = note.guid
                session.commit()

                # Only remove the queue message once the row is committed.
                sqs_conn.delete_message_from_handle(zhihufav_sqs, self.receipt_handle)
        finally:
            # Always release the session, even if the query, commit or
            # message deletion raises.
            session.close()