Example 1
def domain():
    # Entry point: walk every generated URL for every tracked account
    # and feed each fetched page to the parser.
    weibospider = Weibo_Spider()
    ID_urls = weibospider.ID_urls
    for i in range(len(ID_urls)):
        for j in range(len(ID_urls[i])):
            logger.info('Crawling account %d, page %d' % (i + 1, j + 1))
            weibospider.get_content(text=getData(ID_urls[i][j]))
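
All of these snippets call a getData helper that is not shown on this page. Since every caller decodes its result, it presumably returns the raw response body as bytes. A minimal sketch, assuming a plain requests GET (the headers and timeout are illustrative, and real code would also need login cookies):

import requests

def getData(url):
    # Hypothetical stand-in for the project's fetch helper: returns the
    # raw response body as bytes, since callers do text.decode("ascii").
    resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"},
                        timeout=10)
    resp.raise_for_status()
    return resp.content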
Example 2
def process_item(self, item):
    """Check the item's type, process it accordingly, then write it to the database."""
    if isinstance(item, dict):
        # Skip posts already stored (same author, body and publish time).
        if self.data.find_one({
                "nickname": item["nickname"],
                "Post": item["Post"],
                "Pubtime": item["Pubtime"]
        }):
            return "null"
        else:
            self.data.insert_one(item)  # insert() is deprecated in pymongo
            logger.info("Inserting data into database...")
            return "ok"
Example 3
def get_urls(self):
    logger.info('Generating URLs for every page to crawl...')
    ID_urls = {}
    for id in range(len(self.ID)):
        urls = []
        for i in range(1, self.ID_page_num[id] + 1):
            # Initial chunk of page i.
            urls.append(self.host + "page=" + str(i) + "&pagebar=0&id=" +
                        str(self.ID[id]))
            # Two lazy-loaded chunks of the same page.
            for j in range(0, 2):
                urls.append(self.host + "page=" + str(i) + "&pagebar=" +
                            str(j) + "&id=" + str(self.ID[id]) +
                            "&pre_page=" + str(i))
        ID_urls[id] = urls
    return ID_urls
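
Each profile page is fetched in three chunks: one initial request (pagebar=0 without pre_page) plus two lazy-load segments (pagebar=0 and pagebar=1 with pre_page set), which is why three URLs are generated per page. A quick sketch of the resulting shape, with placeholder values standing in for self.host and the account ID:

# Placeholder values for illustration only.
host = "https://weibo.example/mbloglist?"
uid = "1234567890"

urls = [host + "page=1&pagebar=0&id=" + uid]
for j in range(2):
    urls.append(host + "page=1&pagebar=%d&id=%s&pre_page=1" % (j, uid))
print(urls)  # three URLs covering profile page 1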
Example 4
def get_page(self):
    logger.info('Fetching the total page count for every account...')
    ID_page_num = {}
    for id in range(len(self.ID)):
        text = getData(url=self.host + "page=1&pagebar=1&id=" +
                       str(self.ID[id]) + "&pre_page=1")
        content = json.loads(text.decode("ascii"))['data']
        # Extract the total page count embedded in the HTML fragment.
        reg = r'countPage=(\d+)"'
        try:
            page_num = int(re.findall(reg, content, re.S)[0])
        except IndexError:
            # No match: treat the account as having zero pages.
            page_num = 0
        ID_page_num[id] = page_num
    return ID_page_num
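
The page count is not a separate JSON field; it sits inside the returned HTML fragment, hence the regex. A quick check of the pattern against a representative fragment (the sample string is made up):

import re

reg = r'countPage=(\d+)"'
sample = 'action-data="type=feed&countPage=23"'  # made-up HTML fragment
print(int(re.findall(reg, sample, re.S)[0]))  # prints 23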
Example 5
def get_content(self, text):
    mongo = MongoDB()
    reg = r'<em>(\d+)</em>'
    logger.info('Parsing the fetched page data...')
    content = json.loads(text.decode("ascii"))['data']
    soup = BeautifulSoup(
        "<html><head></head><body>" + content + "</body></html>", "lxml")
    details = soup.find_all("div", attrs={"class": "WB_detail"})
    handles = soup.find_all("div", attrs={"class": "WB_handle"})
    for i in range(len(details)):
        item = {}
        # Author nickname.
        item["nickname"] = details[i].find(
            "div", attrs={"class": "WB_info"}).find("a").get_text()
        # Post body, stripped of newlines, spaces and zero-width characters.
        item["Post"] = details[i].find(
            "div", attrs={"class": "WB_text W_f14"}).get_text().replace(
                "\n", "").replace(" ", "").replace("\u200b", "")

        # Publication time.
        item["Pubtime"] = details[i].find(
            "a", attrs={"class": "S_txt2"}).get("title")

        # Repost count.
        forward = re.findall(reg, str(handles[i].find(
            "span", attrs={"class": "line S_line1",
                           "node-type": "forward_btn_text"})), re.S)
        item["Transfer_num"] = int(forward[0]) if forward else 0

        # Comment count.
        comment = re.findall(reg, str(handles[i].find(
            "span", attrs={"class": "line S_line1",
                           "node-type": "comment_btn_text"})), re.S)
        item["Comment_num"] = int(comment[0]) if comment else 0

        # Like count.
        like = re.findall(reg, str(handles[i].find(
            "span", attrs={"node-type": "like_status"})), re.S)
        item["Like_num"] = int(like[0]) if like else 0

        # Time at which this item was scraped.
        item["Scraltime"] = time.strftime("%Y-%m-%d %H:%M:%S",
                                          time.localtime())

        # Stop at the first duplicate: everything older is already stored.
        if mongo.process_item(item) == "null":
            break
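
A note on the early break: process_item (Example 2) returns "null" when a post with the same nickname, body and publish time is already in MongoDB. The feed presumably lists posts newest-first, so the first duplicate means everything after it on the page has already been stored, and parsing can stop for that page.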