Esempio n. 1
0
    def run(self):
        """Store the question at ``self.url`` together with its answers.

        Everything runs inside the ``self.threadingSum`` context manager.
        When the database has no record for ``self.url`` yet, the question
        row is inserted first; then, for every answer link of the question,
        the answer row and its image URLs are inserted and the answer
        content is passed to ``self.storeTheAnswer``. A question that is
        already recorded is left untouched.

        The database handler is closed on every path: when the question
        already exists, after a successful import, and when any step raises
        (the exception still propagates to the caller).
        """
        with self.threadingSum:
            logging.debug("%s start" % self.url)
            dbHandler = DbHandler()
            try:
                if not dbHandler.hasQuestion(self.url):
                    # Insert the new question.
                    question = Question(self.url)
                    title = question.get_title()
                    detail = question.get_detail()
                    answerNum = question.get_answer_num()
                    followersNum = question.get_followers_num()
                    # Tags are stored as a single ';'-separated string.
                    tags = ";".join(question.get_tags())
                    questionDict = {
                        "url": self.url,
                        "title": title,
                        "detail": detail,
                        "followers": followersNum,
                        "answerNum": answerNum,
                        "tags": tags,
                    }
                    dbHandler.insertNewQuestion(questionDict)

                    # The id is looked up after the insert so the answers
                    # can reference the question row.
                    zh_qid = dbHandler.getQueIdByUrl(self.url)

                    # Insert the new answers.
                    for answer_link in question.get_all_answer_link():
                        answer = Answer(answer_link)
                        author = answer.get_author()
                        votes = answer.get_votes()
                        answerDict = {
                            "url": answer_link,
                            "author": author,
                            "zh_qid": zh_qid,
                            "votes": votes,
                        }
                        dbHandler.insertNewAnswer(answerDict)

                        # Insert the image URLs of this answer.
                        zh_aid = dbHandler.getAnsIdByUrl(answer_link)
                        for imgUrl in answer.get_all_pics():
                            dbHandler.insertNewImgUrl(zh_aid, imgUrl)

                        contents = answer.get_answer_content()
                        self.storeTheAnswer(zh_aid, contents)
            finally:
                # Previously the handler was closed only at the end of the
                # "new question" branch, leaking it in every other case.
                dbHandler.close()

            logging.debug("%s done" % self.url)
Esempio n. 2
0
        with self.threadingSum:
            logging.debug("%s start!" % self.url)
            # The last path segment of the URL is used as the file name.
            pic_name = self.url.split("/")[-1]
            target = os.path.join(os.getcwd(), "tmp", "html", "images",
                                  pic_name)
            request = requests.get(self.url, stream=True, timeout=10)
            try:
                with open(target, 'wb') as fd:
                    # iter_content() defaults to 1-byte chunks; ask for
                    # larger chunks so the file is not written byte by byte.
                    for chunk in request.iter_content(chunk_size=8192):
                        fd.write(chunk)
            finally:
                # A streamed response keeps its connection until it is
                # closed, so release it even if the download fails midway.
                request.close()
            logging.debug("%s done!" % self.url)


if __name__ == '__main__':
    # Semaphore shared by all download threads (presumably caps how many
    # run at once at 20 -- confirm against DownloadImg).
    threadingSum = threading.Semaphore(20)

    # NOTE(review): dbHandler and question are not used below. They are kept
    # because their constructors may have side effects that are not visible
    # here; drop them once that is confirmed unnecessary.
    dbHandler = DbHandler()

    question = Question("http://www.zhihu.com/question/26702926")

    answer = Answer("http://www.zhihu.com/question/26702926/answer/33843851")
    img_urls = answer.get_all_pics()

    # Remember exactly the threads started here. Joining everything returned
    # by threading.enumerate() would also wait on threads this script does
    # not own, which may never finish.
    workers = []
    for url in img_urls:
        t = DownloadImg(threadingSum, url)
        t.start()
        workers.append(t)

    for t in workers:
        t.join()