Beispiel #1
0
    def start_requests(self):
        """Yield the initial requests for the crawl.

        Emits, in order:

        1. one request for a fixed weibo.com profile page (no callback is
           passed for this request);
        2. for every ID drained from ``self.scrawl_ID``, a request for page 1
           of ``self.tweets_pattern``, handled by ``self.parse_weibo``;
        3. for every entry of ``self.weibos``, a request for page 1 of
           ``self.comment_pattern``, handled by ``self.parse_comment``.

        Every drained ID is added to ``self.finish_ID`` before its request is
        yielded.
        """
        yield Request(url="https://weibo.com/47452014")

        # This variant only crawls tweets and comments; follow, fan, profile
        # and like crawling is not enabled here, so no items or URLs are
        # prepared for them.
        while self.scrawl_ID:
            user_id = self.scrawl_ID.pop()
            self.finish_ID.add(user_id)  # remember the ID as already crawled
            user_id = str(user_id)
            yield Request(url=self.tweets_pattern % (user_id, 1),
                          meta={"ID": user_id, "current_page": 1},
                          dont_filter=True,
                          callback=self.parse_weibo,
                          headers=self.default_headers)  # crawl the tweets

        for weibo in self.weibos:
            yield Request(url=self.comment_pattern % (weibo, 1),
                          meta={"weiboId": weibo, "current_page": 1},
                          dont_filter=True,
                          callback=self.parse_comment,
                          headers=self.default_headers)  # crawl the comments
Beispiel #2
0
    def start_requests(self):
        """Yield one tweet-list request per ID drained from ``self.scrawl_ID``.

        Each ID is popped from ``self.scrawl_ID``, added to
        ``self.finish_ID`` and then requested as page 1 of the user's
        filtered profile feed; the response is handled by ``self.parse2``
        and receives the stringified ID in ``meta["ID"]``.
        """
        # Only the tweet crawl is enabled in this variant; follow, fan and
        # profile-information requests are not issued, so nothing is prepared
        # for them.
        while len(self.scrawl_ID) > 0:
            user_id = self.scrawl_ID.pop()
            self.finish_ID.add(user_id)  # remember the ID as already crawled
            user_id = str(user_id)
            yield Request(url="http://weibo.cn/%s/profile?filter=1&page=1" % user_id,
                          meta={"ID": user_id},
                          callback=self.parse2)  # crawl the tweets
    def request_fans_follows(self, id):
        """Yield requests for a user's follow list and fan list.

        For the given user ``id`` two requests are produced, both handled by
        ``self.parse3``.  Each request carries in its ``meta`` a fresh item
        (``"item"``) and the empty list stored inside that item
        (``"result"``), so the callback receives the same list object under
        both names.
        """
        follows = []
        followsItems = FollowsItem()
        followsItems["wb_usr_id"] = id
        followsItems["follows"] = follows

        fans = []
        fansItems = FansItem()
        fansItems["wb_usr_id"] = id
        fansItems["fans"] = fans

        yield Request(url="http://weibo.cn/%s/follow" % id,
                      meta={"item": followsItems, "result": follows},
                      callback=self.parse3)  # crawl the followed accounts
        yield Request(url="http://weibo.cn/%s/fans" % id,
                      meta={"item": fansItems, "result": fans},
                      callback=self.parse3)  # crawl the fans
Beispiel #4
0
    def start_requests(self):
        """Yield one "likes" request per ID drained from ``self.scrawl_ID``.

        Each ID is popped from ``self.scrawl_ID``, added to
        ``self.finish_ID`` and then requested as page 1 of
        ``self.like_pattern``; the response is handled by
        ``self.parse_weibo2`` and receives ``{"id": ..., "current_page": 1}``
        as its ``meta``.
        """
        # Only the likes crawl is enabled in this variant; follow, fan,
        # profile and tweet requests are not issued, so nothing is prepared
        # for them.
        while self.scrawl_ID:
            user_id = self.scrawl_ID.pop()
            self.finish_ID.add(user_id)  # remember the ID as already crawled
            user_id = str(user_id)
            yield Request(url=self.like_pattern % (user_id, 1),
                          meta={"id": user_id, "current_page": 1},
                          callback=self.parse_weibo2)
Beispiel #5
0
    def start_requests(self):
        """Yield the four per-user requests for every ID in ``self.scrawl_ID``.

        IDs are popped until ``self.scrawl_ID`` is empty; each one is added
        to ``self.finish_ID`` and then produces, in this order, requests for
        the follow list and the fan list (both ``self.parse3``), the profile
        information (``self.parse0``) and the first page of tweets
        (``self.parse2``).
        """
        while len(self.scrawl_ID) > 0:
            uid = self.scrawl_ID.pop()
            self.finish_ID.add(uid)  # remember the ID as already crawled
            uid = str(uid)

            # Each item shares its list with the request meta, so the
            # callback sees the same list object as "result" and inside
            # "item".
            follow_list = []
            follow_item = FollowsItem()
            follow_item["_id"] = uid
            follow_item["follows"] = follow_list

            fan_list = []
            fan_item = FansItem()
            fan_item["_id"] = uid
            fan_item["fans"] = fan_list

            follow_page = "https://weibo.cn/%s/follow" % uid
            fan_page = "https://weibo.cn/%s/fans" % uid
            tweet_page = "https://weibo.cn/%s/profile?filter=1&page=1" % uid
            info_page = "https://weibo.cn/attgroup/opening?uid=%s" % uid

            # followed accounts
            yield Request(url=follow_page,
                          meta={"item": follow_item, "result": follow_list},
                          callback=self.parse3)
            # fans
            yield Request(url=fan_page,
                          meta={"item": fan_item, "result": fan_list},
                          callback=self.parse3)
            # profile information
            yield Request(url=info_page, meta={"ID": uid}, callback=self.parse0)
            # tweets
            yield Request(url=tweet_page, meta={"ID": uid}, callback=self.parse2)
Beispiel #6
0
    def parse1(self, response):
        """Parse a user's profile page (personal information, part 2).

        Takes the information item from ``response.meta["item"]``, fills in
        every profile field that can be found in the page text and yields
        the item.  If at least one experience entry mentions Hunan
        University of Technology, it then also yields requests for the
        user's follow list and fan list (handled by ``self.parse3``); that
        second step reads the user ID from ``response.meta["ID"]``, so
        callers must supply it.
        """
        informationItems = response.meta["item"]
        selector = Selector(response)
        # Join all text nodes with ";" so that a "label:value" pair is
        # terminated by the ";" preceding the next text node.
        text1 = ";".join(
            selector.xpath('body/div[@class="c"]/text()').extract())

        def find_field(label):
            # Values following `label` and a separator, up to the next ";".
            # The character class accepts ":", the full-width colon and
            # also a literal "|" (inside [] the bar is not an alternation).
            return re.findall(label + u'[:|\uff1a](.*?);', text1)

        nickname = find_field(u'\u6635\u79f0')  # nickname
        gender = find_field(u'\u6027\u522b')  # gender
        place = find_field(u'\u5730\u533a')  # region (province and city)
        signature = find_field(u'\u7b80\u4ecb')  # bio / signature
        birthday = find_field(u'\u751f\u65e5')  # birthday
        sexorientation = find_field(u'\u6027\u53d6\u5411')  # sexual orientation
        marriage = find_field(u'\u611f\u60c5\u72b6\u51b5')  # relationship status
        url = find_field(u'\u4e92\u8054\u7f51')  # homepage link
        # Work / education entries: text between a "·" bullet and the next ";".
        Experience = re.findall(u'[·](.*?);', text1)

        if nickname:
            informationItems["NickName"] = nickname[0]
        if gender:
            informationItems["Gender"] = gender[0]
        if place:
            place = place[0].split(" ")
            informationItems["Province"] = place[0]
            if len(place) > 1:
                informationItems["City"] = place[1]
        if signature:
            informationItems["Signature"] = signature[0]
        if birthday:
            try:
                parsed_birthday = datetime.datetime.strptime(
                    birthday[0], "%Y-%m-%d")
            except ValueError:
                # The value is not a YYYY-MM-DD date; leave the field unset.
                pass
            else:
                # NOTE(review): the 8-hour shift presumably converts China
                # Standard Time (UTC+8) to UTC - confirm.
                informationItems["Birthday"] = (
                    parsed_birthday - datetime.timedelta(hours=8))
        # The orientation is derived by comparing it with the gender, so it
        # can only be classified when the gender was found as well
        # (previously a missing gender raised IndexError here and the whole
        # item was lost).
        if sexorientation and gender:
            if sexorientation[0] == gender[0]:
                informationItems["Sex_Orientation"] = "gay"
            else:
                informationItems["Sex_Orientation"] = "Heterosexual"
        if marriage:
            informationItems["Marriage"] = marriage[0]
        if url:
            informationItems["URL"] = url[0]
        # Add work / education experience.
        if Experience:
            informationItems["Experience"] = Experience
        yield informationItems

        # Only continue to the follow / fan lists for users whose experience
        # mentions Hunan University of Technology.
        if not any(u"\u6e56\u5357\u5de5\u4e1a\u5927\u5b66" in ex
                   for ex in Experience):
            return

        ID = response.meta["ID"]
        follows = []
        followsItems = FollowsItem()
        followsItems["_id"] = ID
        followsItems["follows"] = follows
        fans = []
        fansItems = FansItem()
        fansItems["_id"] = ID
        fansItems["fans"] = fans

        yield Request(url="http://weibo.cn/%s/follow" % ID,
                      meta={"item": followsItems, "result": follows},
                      callback=self.parse3)  # crawl the followed accounts
        yield Request(url="http://weibo.cn/%s/fans" % ID,
                      meta={"item": fansItems, "result": fans},
                      callback=self.parse3)  # crawl the fans