def get_followees(self, pid):
    url = 'http://www.weibo.com/p/' + pid + '/follow?from=page_' + pid[:6] + '&wvr=6&mod=headfollow#place'
    while True:
        fetcher = self.fetchers[self.main_fetcher]
        html = open_url(fetcher, url)
        uid = self.parser.parse_uid(html)
        if uid == -1:
            self.ban_account()
            continue
        elif self.parser.is_visitor(html) is True:
            self.reset_account()
            continue
        fee_page_num = self.get_followee_page_num(html)
        if fee_page_num is not None:
            break
        else:
            log.warning('Cannot get followee page total number - pid:%s' % (pid,))
            time.sleep(random.randint(Config.SLEEP_WHEN_EXCEPTION, 2 * Config.SLEEP_WHEN_EXCEPTION))
    if fee_page_num == 0:
        print 'He/She does not follow anyone.'
        return
    else:
        print 'Getting followee page 1 of %d...' % (fee_page_num,)
        followees = self.parser.parse_followees(html, pid, datetime.now())
        # followees cannot be None here: the html has already been validated by
        # self.get_followee_page_num(html) -> self.parser.parse_followee_page_num(html)
        self.followee_list.extend(followees)
    if fee_page_num == 1:
        return
    for i in xrange(2, fee_page_num + 1):
        while True:
            url = 'http://www.weibo.com/p/%s/follow?from=page_%s&wvr=6&mod=headfollow&page=%d#place' % (pid, pid[:6], i)
            print 'Getting followee page %d of %d...' % (i, fee_page_num)
            html = open_url(fetcher, url)
            time.sleep(random.randint(Config.SLEEP_BETWEEN_2FPAGES, 2 * Config.SLEEP_BETWEEN_2FPAGES))
            followees = self.parser.parse_followees(html, pid, datetime.now())
            if followees is None:  # dirty html
                log.warning('Cannot parse followee page correctly - pid:%s' % (pid,))
                time.sleep(random.randint(Config.SLEEP_WHEN_EXCEPTION, 2 * Config.SLEEP_WHEN_EXCEPTION))
                continue
            self.followee_list.extend(followees)
            break
def get_pid(self, uid):
    """
    :param uid:
    :return: corresponding pid, or -1 if the user is an enterprise account
    """
    fetcher = self.fetchers[self.main_fetcher]
    url = 'http://www.weibo.com/u/%s' % (uid,)
    while True:
        html = open_url(fetcher, url)
        parsed_uid = self.parser.parse_uid(html)
        if parsed_uid == -1:
            self.ban_account()
            continue
        elif self.parser.is_visitor(html) is True:
            self.reset_account()
            continue
        # make sure that the html is correct
        is_enterprise = self.parser.parse_is_enterprise(html)
        if is_enterprise is True:
            return -1  # -1 denotes this user is an enterprise account
        pid = self.parser.parse_pid(html)
        if pid is not None:
            return pid
        else:
            log.error('Cannot get pid for uid:%s' % (uid,))
            time.sleep(random.randint(Config.SLEEP_WHEN_EXCEPTION, 2 * Config.SLEEP_WHEN_EXCEPTION))
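# Illustration only (not from the original repo): the pid handled throughout this spider is
# the profile page id, i.e. a 6-character page/domain prefix followed by the plain uid.
# That layout is why get_profile() below recovers the uid with pid[6:] and the follow-page
# URLs are built with pid[:6]. The sample values here are made up for demonstration.
sample_pid = '100505' + '2971804112'    # assumed personal-page prefix + uid
assert sample_pid[:6] == '100505'       # page/domain type used in the follow-page URLs
assert sample_pid[6:] == '2971804112'   # plain uid, as used by get_profile()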
def get_timeline_page_num(self, uid):
    """
    :param uid:
    :return: page number, plus the already-fetched parts of the first page, which decreases accesses to the Sina server
    """
    htmls = []  # keep the fetched pages to decrease accesses to Sina
    while True:
        first_page_head = self.fetch_timelines_by_page_bar(uid, 1, 0)
        if first_page_head is None:  # no posts at all
            return 0, htmls
        else:
            htmls.append(first_page_head)
        time.sleep(random.randint(Config.SLEEP_BETWEEN_2FPAGES, 2 * Config.SLEEP_BETWEEN_2FPAGES))
        first_page_body = self.fetch_timelines_by_page_bar(uid, 1, 1)
        if first_page_body is None:
            return 1, htmls
        else:
            htmls.append(first_page_body)
        time.sleep(random.randint(Config.SLEEP_BETWEEN_2FPAGES, 2 * Config.SLEEP_BETWEEN_2FPAGES))
        first_page_tail = self.fetch_timelines_by_page_bar(uid, 1, 2)
        if first_page_tail is None:  # just one page of timelines
            return 1, htmls
        else:
            htmls.append(first_page_tail)
        time.sleep(random.randint(Config.SLEEP_BETWEEN_2FPAGES, 2 * Config.SLEEP_BETWEEN_2FPAGES))
        # this page number is not accurate, so it is corrected in the next step
        pnum = self.parser.parse_timeline_page_num(first_page_tail)
        if pnum is None or pnum == 1:
            return 1, htmls
        while True:
            url = 'http://www.weibo.com/%s?page=%d&pids=Pl_Content_HomeFeed' % (uid, pnum)
            test_html = open_url(self.fetchers[self.main_fetcher], url)
            time.sleep(random.randint(Config.SLEEP_BETWEEN_2FPAGES, 2 * Config.SLEEP_BETWEEN_2FPAGES))
            no_post = 'W_icon icon_warnB'
            if no_post in test_html:
                pnum -= 1  # the reported last page is empty, so step the page number back
            else:
                break
        return pnum, htmls
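# A minimal sketch (not part of the original class) of how get_timeline_page_num() and
# fetch_timelines_by_page_bar() fit together: the first call returns the corrected page
# count plus the parts of page 1 that were already downloaded, and every remaining page is
# then pulled in its three lazily-loaded parts (bnum 0, 1, 2). The method name
# collect_all_timeline_pages is an assumption made for this example.
def collect_all_timeline_pages(self, uid):
    pnum, htmls = self.get_timeline_page_num(uid)
    for page in xrange(2, pnum + 1):
        for bar in (0, 1, 2):
            part = self.fetch_timelines_by_page_bar(uid, page, bar)
            if part is None:  # this part of the page holds no posts
                break
            htmls.append(part)
            time.sleep(random.randint(Config.SLEEP_BETWEEN_2FPAGES, 2 * Config.SLEEP_BETWEEN_2FPAGES))
    return htmls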
def get_profile(self, pid):
    '''
    get profile information for the user marked with pid
    :param pid: page id
    :return:
    '''
    url = 'http://www.weibo.com/p/%s/info?mod=pedit_more' % (pid,)
    uid = pid[6:]
    is_taobao = None
    while is_taobao is None:
        try:
            is_taobao = self.is_taobao(uid)  # get taobao information in advance
            if is_taobao == -1:
                self.ban_account()
                is_taobao = None
            elif is_taobao == -2:
                self.reset_account()
                is_taobao = None
        except Exception as e:
            print e.message
        time.sleep(random.randint(Config.SLEEP_BETWEEN_2FPAGES, 2 * Config.SLEEP_BETWEEN_2FPAGES))
    profile = None
    print 'Getting profile page...'
    while profile is None:
        fetcher = self.fetchers[self.main_fetcher]
        html = open_url(fetcher, url)
        if self.parser.parse_uid(html) == -1:  # -1 means the working account may have been banned
            self.ban_account()
            continue
        elif self.parser.is_visitor(html) is True:  # check whether the working account fell into visitor status
            self.reset_account()
            continue
        profile = self.parser.parse_profile(html, pid, is_taobao, datetime.now())
        time.sleep(random.randint(Config.SLEEP_BETWEEN_2FPAGES, 2 * Config.SLEEP_BETWEEN_2FPAGES))
    self.profile_list.append(profile)
def is_taobao(self, uid):
    '''
    :param uid: user ID
    :return: the taobao flag ('1' or '0') indicating whether the user is a taobao shopkeeper,
             -1 if the working account is banned, or -2 if it is in visitor status
    '''
    fetcher = self.fetchers[self.main_fetcher]
    url = 'http://www.weibo.com/u/' + uid
    html = open_url(fetcher, url)
    with open('debug_taobao.txt', 'w') as writer:  # keep the raw page for debugging
        writer.write(html)
    if self.parser.parse_uid(html) == -1:
        return -1  # account is banned
    elif self.parser.is_visitor(html) is True:
        return -2  # account is in visitor status
    return self.parser.parse_is_taobao(html)
def ban_account(self):
    url = 'http://sass.weibo.com/unfreeze'
    html = open_url(self.fetchers[self.main_fetcher], url)
    is_exceptional = self.parser.is_exceptional(html)
    is_frozen = self.parser.is_frozen(html)
    if is_exceptional is False and is_frozen is False:
        return
    account = self.users[self.main_fetcher].acct
    Dao.Account.ban(account)
    emphasis_print('One account has been banned!!!')
    self.users.pop(self.main_fetcher)
    self.fetchers.pop(self.main_fetcher)
    if self.main_fetcher == len(self.fetchers):
        self.main_fetcher = 0
    if len(self.fetchers) == 0:
        raise Exception('No valid account!')
def get_server_data(self, opener):
    print "Getting server time and nonce..."
    server_data = open_url(opener, self.server_url)  # get server_data
    return self.parse_server_data(server_data)
def fetch_timelines_by_page_bar(self, uid, pnum, bnum):
    """
    fetch timelines by specifying page number and bar number
    :param uid:
    :param pnum: page number
    :param bnum: bar number
    :return: html containing timelines or None if there are no timelines
    """
    body = {
        # These parameters were obtained by packet capture. Sina Weibo loads a timeline page
        # lazily (waterfall style), so a whole page cannot be fetched with a single request.
        '__rnd': 1343647638078,
        '_k': 1343647471134109,
        '_t': 0,
        'count': 15,
        'end_id': 3473519214542343,
        'max_id': 3473279479126179,
        'page': 1,
        'pagebar': 1,
        'pre_page': 1,
        'uid': uid
    }
    body['page'] = pnum
    if bnum == 0:
        body['count'] = '50'
        body['pagebar'] = ''
        body['pre_page'] = pnum - 1
    elif bnum == 1:
        body['count'] = '15'
        body['pagebar'] = '0'
        body['pre_page'] = pnum
    elif bnum == 2:
        body['count'] = '15'
        body['pagebar'] = '1'
        body['pre_page'] = pnum
    url = 'http://weibo.com/aj/mblog/mbloglist?' + urllib.urlencode(body)
    while True:
        try:
            print 'Getting timeline page %d part %d...' % (pnum, bnum + 1)  # bnum ranges from 0 to 2
            jsn_data = open_url(self.fetchers[self.main_fetcher], url)
            if self.parser.is_frozen(jsn_data):
                self.ban_account()
                continue
            data = json.loads(jsn_data)
            html = data['data']
            if u'WB_feed_type SW_fun S_line2' in html:
                return html
            else:
                return None
        except Exception as e:
            if 'No valid account!' in e.message:
                raise e
            if 'No JSON object could be decoded' in e.message:
                if self.parser.is_visitor(jsn_data) is True:
                    self.reset_account()
                else:
                    self.ban_account()
            log.warning(e.message)
            time.sleep(random.randint(Config.SLEEP_WHEN_EXCEPTION, 2 * Config.SLEEP_WHEN_EXCEPTION))
            continue
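# Quick illustration (assumed, not taken from the original repo) of the request built above
# for page 2, part 1 of a timeline: between the three parts only page, count, pagebar and
# pre_page change, while the remaining fields stay at the captured constants.
example = {'page': 2, 'count': '15', 'pagebar': '0', 'pre_page': 2, 'uid': '2971804112'}
print 'http://weibo.com/aj/mblog/mbloglist?' + urllib.urlencode(example)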
# -*- coding: UTF-8 -*-
__author__ = 'chzhu'

import Spider
from Spider import Spider as SP
from Dao import Task, Account

if __name__ == '__main__':
    # user_1 = Spider.User('*****@*****.**', 'a123456')
    # user_2 = Spider.User('*****@*****.**', 'w1127401044')
    # user_list = [user_1, user_2]
    #
    # sp = SP(user_list)
    #
    # uid_list = Task.get_all()
    #
    # for uid in uid_list:
    #     sp.collect_user_profiles_only(uid)
    #     sp.save_only_profile()

    user = Spider.User('*****@*****.**', 'pp9999')
    user_list = [user]
    sp = SP(user_list)

    from Utility import open_url
    uid = '2971804112'
    url = 'http://www.weibo.com/aj/v6/user/newcard?ajwvr=6&id=%s&type=0&refer_flag=0000011002_&callback=STK_145465715259821'
    r = open_url(sp.fetchers[sp.main_fetcher], url % uid)
    print 'dd'