Esempio n. 1
0
                print "Network Exception!!! ", e
            finally:
                datas = getMatchList(
                    html,
                    '<div class="WB_feed_detail clearfix">(*)<!--/主评论-->')
                # print len(datas)

                for data in datas:
                    photo_url = "http:" + getMatch(data, '<img.*?src="(*)"')
                    uid = getMatch(data, 'usercard="id=(*)"')
                    nickname = getMatch(data, 'usercard=.*?alt="(*)"')
                    mid = getMatch(data, '&cid=(*)&')
                    timestamp = getMatch(
                        data, '<div class="WB_from S_txt2">(*)  来自')
                    if timestamp:
                        timestamp = long(getTimeStamp(timestamp))
                    else:
                        timestamp = 0

                    if timestamp <= self.lasttime:
                        tags = True
                        break

                    text = getMatch(data,
                                    '<div class="WB_text S_txt2">(*)</div>')
                    if text:
                        text = extractForHTML(text)
                    else:
                        text = ''
                    r_mid = getMatch(data, 'mid=(*)&')
                    r_uid = self.uid
Esempio n. 2
0
    def commentInbox(self):
        cr_url = 'http://weibo.com/comment/inbox?&page=1&pids=Pl_Content_Commentlist'
        json_list = []
        tags = False

        comment_url = cr_url
        while True:
            print comment_url
            while True:
                try:
                    request = urllib2.Request(comment_url,
                                              headers=self._headers)
                    response = urllib2.urlopen(request, timeout=60)
                    html = response.read().decode('string_escape').replace(
                        '\\/', '/')
                    print html
                    break
                except Exception, e:
                    print "Network Exception!!! ", e
                    time.sleep(5)
                    continue
            #finally:
            datas = getMatchList(
                html, '<div class="WB_feed_detail clearfix">(*)<!--/主评论-->')
            # print len(datas)

            for data in datas:
                photo_url = "http:" + getMatch(data, '<img.*?src="(*)"')
                uid = getMatch(data, 'usercard="id=(*)"')
                nickname = getMatch(data, 'page_frame" title="(*)"')
                mid = getMatch(data, '&cid=(*)&')
                timestamp = getMatch(data,
                                     '<div class="WB_from S_txt2">(*)  来自')
                if timestamp:
                    timestamp = long(getTimeStamp(timestamp))
                else:
                    timestamp = 0

                if timestamp <= self.mlasttime:
                    tags = True
                    break

                text = getMatch(data, '<div class="WB_text">(*)</div>')
                if text:
                    text = extractForHTML(text)
                else:
                    text = ''
                r_mid = getMatch(data, 'mid=(*)&')
                r_uid = self.uid
                #commet_type = 'make'
                commet_type = 'receive'

                _type = 'stranger'
                type1 = ''
                type2 = ''
                for fljson in self.follow:
                    fjson = json.loads(fljson)
                    if fjson['uid'] == uid:
                        type1 = 'follow'
                        break
                for fljson in self.fans:
                    fjson = json.loads(fljson)
                    if fjson['uid'] == uid:
                        type2 = 'followed'
                        break
                if type1 and type2:
                    _type = 'friend'
                elif type1:
                    _type = type1
                elif type2:
                    _type = type2
                if uid == r_uid:
                    _type = 'self'

                wb_item = {
                    'photo_url': photo_url,
                    'uid': uid,
                    'nick_name': nickname,
                    'mid': mid,
                    'timestamp': timestamp,
                    'text': text,
                    'root_mid': r_mid,
                    'root_uid': r_uid,
                    'weibo_type': _type,
                    'comment_type': commet_type,
                    'update_time': self.update_time
                }

                wb_json = json.dumps(wb_item)
                json_list.append(wb_json)

            # 分页
            next_pageUrl = getUrlToPattern(html,
                                           comment_url,
                                           pattern='page',
                                           text_pattern='下一页')
            # print next_pageUrl
            if next_pageUrl:
                comment_url = next_pageUrl[0]
            elif not next_pageUrl or tags:
                break
Esempio n. 3
0
    def messages(self):
        cr_url = 'http://weibo.com/messages?pids=Pl_Content_MessageList&page=1'
        de_url = 'http://weibo.com/aj/message/getbyid?ajwvr=6&count=50&uid=%s&_t=0&__rnd=%d'
        json_list = []
        tags = False

        comment_url = cr_url
        while True:
            print comment_url
            while True:
                try:
                    request = urllib2.Request(comment_url,
                                              headers=self._headers)
                    response = urllib2.urlopen(request, timeout=60)
                    html = response.read().decode('string_escape').replace(
                        '\\/', '/')
                    break
                except Exception, e:
                    print "Network Exception!!! ", e
                    continue
                #print 'html:', html

            #finally:
            datas = getMatchList(
                html,
                '<div class="private_list SW_fun_bg S_line2 clearfix".*?<!-- 下拉列表 -->'
            )

            for data in datas:
                photo_url = "http:" + getMatch(data, '<img.*?src="(*)"')
                uid = getMatch(data, 'usercard="id=(*)"')
                nickname = getMatch(data, '<img.*?alt="(*)"')
                r_uid = self.uid

                counts = getMatch(
                    data, '<em class="W_new_count S_spetxt_bg">(*)</em>')
                if counts and counts.isdigit():
                    counts = long(counts)
                else:
                    counts = 0
                _type = 'stranger'
                type1 = ''
                type2 = ''
                for fljson in self.follow:
                    fjson = json.loads(fljson)
                    if fjson['uid'] == uid:
                        type1 = 'follow'
                        break
                for fljson in self.fans:
                    fjson = json.loads(fljson)
                    if fjson['uid'] == uid:
                        type2 = 'followed'
                        break
                if type1 and type2:
                    _type = 'friend'
                elif type1:
                    _type = type1
                elif type2:
                    _type = type2
                if uid == r_uid:
                    _type = 'self'
                    while True:
                        try:
                            detailUrl = de_url % (uid, int(time.time() * 1000))
                            #print 'detail_url:', detailUrl
                            request = urllib2.Request(detailUrl)
                            response = urllib2.urlopen(request, timeout=60)
                            ms_content = json.loads(response.read())
                            break
                        except Exception, e:
                            print "Network Exception!!! ", e
                            continue
                #else:
                    html = ms_content["data"]["html"]
                    ms_datas = getMatchList(
                        html,
                        u'(<!-- 单行文字-->|<div class="space">).*?<!--/附件信息-->')
                    # print datas[0]
                    last_time = 0
                    for ms_data in ms_datas:
                        mid_uid = getMatch(ms_data, 'usercard="id=(*)"')
                        mid = getMatch(ms_data, 'mid="(*)"')
                        timestamp = getMatch(
                            ms_data, 'prompt_font S_txt2 S_bg1">(*)</legend>')
                        #soup = BeautifulSoup(ms_data)
                        #timestamp_bs4 = soup.find_all('legend', class_=["prompt_font", "S_txt2", "S_bg1"])
                        if timestamp:
                            timestamp = long(getTimeStamp(timestamp))
                            last_time = timestamp
                        else:
                            timestamp = last_time

                        if timestamp < self.lasttime:
                            tags = True
                            print 'timestamp<lasttime, timestamp, lasttime:', timestamp, self.lasttime
                            #break
                            next

                        text = getMatch(ms_data,
                                        u'<div class="cont">.*?<!--/附件信息-->')
                        if text:
                            text = extractForHTML(text)
                            text = commentExtract(text)

                        if mid_uid == uid:
                            private_type = 'receive'
                        elif mid_uid == r_uid:
                            private_type = 'make'
                        else:
                            private_type = ''

                        wb_item = {
                            'photo_url': photo_url,
                            'uid': uid,
                            'nick_name': nickname,
                            'mid': mid,
                            'timestamp': timestamp,
                            'text': text,
                            'root_uid': r_uid,
                            'weibo_type': _type,
                            'private_type': private_type,
                            'w_new_count': counts,
                            'update_time': self.update_time
                        }

                        wb_json = json.dumps(wb_item)
                        #print 'wb_json:::',wb_json
                        json_list.append(wb_json)