Example #1
import json
import threading
import time

import requests

# CrawlDatabaseManager, start_uid and CRAWL_DELAY are defined elsewhere in the project


class UsersCrawler:
    url_format = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_%s&page=%d'

    querystring = {"version": "v4"}

    payload = "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"version\"\r\n\r\nv4\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW--"
    headers = {
        'content-type':
        "multipart/form-data; boundary=----WebKitFormBoundary7MA4YWxkTrZu0gW",
        'host': "m.weibo.cn",
        'connection': "keep-alive",
        'cache-control': "no-cache",
        'upgrade-insecure-requests': "1",
        'user-agent':
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        'accept':
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        'accept-encoding': "gzip, deflate, sdch, br",
        'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6",
        'cookie':
        "SCF=AlTf48qNezF12LbNvCHGGee_Nymdun-Sp9kGATl9gjhJAPPkj2QBT2-Y2MECfIjqy1QjvcBbdVr9HWi6hgbgnTQ.; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhEEfKT6-E_qQ8I2HTu2.Vu5JpX5o2p5NHD95Qp1hq4She41K-pWs4DqcjGC2Hkg.y8Kntt; SUB=_2A250CvjnDeRhGeBP7FoW9SvEwjiIHXVX9JivrDV6PUJbkdANLUvGkW1966OJJxi88Ah66us23Spcr23Dpw..; SUHB=0cSXjt5Dqq_ieZ; _T_WM=e0f6480701da87741a5b948440f9d665; SSOLoginState=1495508844; ALF=1498100844; H5_INDEX=0_all; H5_INDEX_TITLE=%E4%BD%A0%E5%B7%B2%E7%BB%8F%E8%A2%AB%E7%A7%BB%E9%99%A4%E7%BE%A4%E8%81%8A; M_WEIBOCN_PARAMS=featurecode%3D20000320%26oid%3D4110491498745329%26luicode%3D10000011%26lfid%3D231051_-_followers_-_5979396421",
        'postman-token': "0b85ea3b-073b-a799-4593-61095e4ed01a"
    }

    # response = requests.request("GET", url, data=payload, headers=headers, params=querystring)

    db_manager = None

    threads = []

    run = False

    def __init__(self):
        self.db_manager = CrawlDatabaseManager(10)

    def get_users(self, uid, page):
        url = (self.url_format) % (uid, page)
        response = requests.request("GET",
                                    url,
                                    data=self.payload,
                                    headers=self.headers,
                                    params=self.querystring)
        return response.text

    def get_uid(self):
        return self.db_manager.dequeue_user()

    def start(self):
        self.run = True
        # this class defines crawl_users (not crawl_feeds), so target that method
        t = threading.Thread(target=self.crawl_users, name=None)
        self.threads.append(t)
        # set daemon so the main thread can exit when it receives Ctrl-C
        t.daemon = True
        t.start()

    def crawl_users(self):
        kickstart = True
        self.run = True

        while self.run:
            if kickstart:
                kickstart = False
                uid = start_uid
            else:
                uid = self.get_uid()
            user_str = self.get_users(uid, 1)
            users = json.loads(user_str)
            for user in users['cards'][1]['card_group']:
                name = user['user']['screen_name']
                user_id = user['user']['id']
                followers_count = user['user']['followers_count']
                follow_count = user['user']['follow_count']
                description = user['user']['description']
                self.db_manager.enqueue_user(user_id,
                                             name=name,
                                             follow_count=follow_count,
                                             followers_count=followers_count,
                                             description=description)
            time.sleep(CRAWL_DELAY)
            print(users)
            break
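The class above leans on a project-level CrawlDatabaseManager plus the start_uid and CRAWL_DELAY globals, none of which appear in the snippet. A minimal, hypothetical in-memory stand-in (not the project's real manager) is enough to exercise UsersCrawler.crawl_users locally, though the request itself still needs valid Weibo cookies:

import collections

class CrawlDatabaseManager:
    # Hypothetical in-memory stand-in for the project's real database manager.

    def __init__(self, max_threads=10):
        self.queue = collections.deque()
        self.seen = set()

    def enqueue_user(self, user_id, **attrs):
        # Remember each user once, keeping its attributes next to the id.
        if user_id not in self.seen:
            self.seen.add(user_id)
            self.queue.append((user_id, attrs))

    def dequeue_user(self):
        # Hand back the next queued user id, or None when the queue is empty.
        return self.queue.popleft()[0] if self.queue else None

CRAWL_DELAY = 5              # assumed polite delay between requests, in seconds
start_uid = '1234567890'     # assumed seed uid; replace with a real Weibo uid

crawler = UsersCrawler()
crawler.crawl_users()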
Example #2
def __init__(self):
    self.db_manager = CrawlDatabaseManager(10)
Example #3
def __init__(self):
    self.db_manager = CrawlDatabaseManager()
    self.crawl_feeds = FeedsCrawler()
Example #4
class UsersCrawler:
    url_format = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_%s&page=%d'

    #    querystring = {"version":"v4"}

    #    payload = "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"version\"\r\n\r\nv4\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW--"
    headers = {
        'host':
        "m.weibo.cn",
        'connection':
        "keep-alive",
        'cache-control':
        "no-cache",
        'upgrade-insecure-requests':
        "1",
        'user-agent':
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        'accept':
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        'accept-encoding':
        "gzip, deflate, sdch, br",
        'accept-language':
        "zh-CN,en-US;q=0.8,en;q=0.6",
        'cookie':
        "_T_WM=0c8015962736d2199e208ca8adb7d301; SCF=AgS4nly86VG3e5zq2Usv2iN_bQIUj-rGsjZvQJLMLobUOXiNKaU1wEUxFjfWYNVchEv1_4Mfh08RgGthig-4Qcg.; SUB=_2A253p1ZcDeRhGeNG71YS9S7Jwj-IHXVVaHoUrDV6PUJbkdANLXbgkW1NS0f7LHe4BMxiV6LodCnt1vITwWSST2DI; SUHB=0xnCvYrYJqVQyb; SSOLoginState=1520641555; M_WEIBOCN_PARAMS=uicode%3D20000174"
    }

    # response = requests.request("GET", url, data=payload, headers=headers, params=querystring)

    db_manager = None

    threads = []

    run = False

    kickstart = True

    user_count = 0

    max_threads = 10

    def __init__(self):
        self.db_manager = CrawlDatabaseManager()
        self.crawl_feeds = FeedsCrawler()

    def get_users(self, uid, page):
        url = (self.url_format) % (uid, page)
        response = requests.request("GET", url, headers=self.headers)
        response.encoding = response.apparent_encoding
        return response.text

    def get_uid(self):
        return self.db_manager.dequeue_user()

    def start(self):
        self.run = True
        # pass the bound method itself; calling it here would run it in the current thread
        t = threading.Thread(target=self.crawl_feeds.crawl_feeds, name=None)
        self.threads.append(t)
        # set daemon so the main thread can exit when it receives Ctrl-C
        t.daemon = True
        t.start()

    def crawl_users(self):
        self.outrange = False
        self.run = True
        self.page = 1
        if self.kickstart:
            self.kickstart = False
            uid = start_uid
        else:
            uid = self.get_uid()[1]
        while self.run:
            print(self.page)
            count = 0
            user_str = self.get_users(uid, self.page)
            users = json.loads(user_str)
            index = 1 if self.page == 1 else 0
            try:
                page_count = len(users['data']['cards'][index]['card_group'])
            except (KeyError, IndexError, TypeError):
                print("page %d has no follow information, stopping" % self.page)
                break
            for i in range(page_count):
                user = users['data']['cards'][index]['card_group'][i]
                try:
                    name = user['user']['screen_name']
                    if name.startswith("#"):
                        continue
                    user_id = user['user']['id']
                    followers_count = user['user']['followers_count']
                    follow_count = user['user']['follow_count']
                    description = user['user']['description']
                    print(name)
                except (KeyError, TypeError):
                    continue
                if followers_count > 15 * follow_count:
                    self.db_manager.enqueue_user(
                        user_id=user_id,
                        name=name,
                        follow_count=follow_count,
                        followers_count=followers_count,
                        description=description)
                    self.user_count += 1
                    count += 1
                else:
                    continue
            self.page += 1
            time.sleep(CRAWL_DELAY)
        print(self.page)
        print(
            "%d users were enqueued this time, %d users have been enqueued since the start"
            % (count, self.user_count))
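Example #4 wraps its nested JSON lookups in broad try/except blocks. A small helper along these lines (a sketch, not part of the original code) makes the same navigation explicit and reusable:

def card_group(payload, index):
    # Return the card_group list at the given card index, or None if the
    # response does not have the expected shape.
    try:
        return payload['data']['cards'][index]['card_group']
    except (KeyError, IndexError, TypeError):
        return None

# usage inside crawl_users, with `users` being the decoded JSON for one page:
# group = card_group(users, 1 if self.page == 1 else 0)
# if group is None:
#     break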
Example #5
class FeedsCrawler():

    url_format = "https://m.weibo.cn/api/container/getIndex?uid=%s&type=uid&value=%s&containerid=107603%s&page=%d"

    # querystring = {"version":"v4"}

    # payload = "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"version\"\r\n\r\nv4\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW--"
    headers = {
        'content-type':
        "multipart/form-data; boundary=----WebKitFormBoundary7MA4YWxkTrZu0gW",
        'host': "m.weibo.cn",
        'connection': "keep-alive",
        'cache-control': "no-cache",
        'upgrade-insecure-requests': "1",
        'user-agent':
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        'accept':
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        'accept-encoding': "gzip, deflate, sdch, br",
        'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6",
        'cookie':
        "_T_WM=0c8015962736d2199e208ca8adb7d301; SCF=AgS4nly86VG3e5zq2Usv2iN_bQIUj-rGsjZvQJLMLobUOXiNKaU1wEUxFjfWYNVchEv1_4Mfh08RgGthig-4Qcg.; SUB=_2A253p1ZcDeRhGeNG71YS9S7Jwj-IHXVVaHoUrDV6PUJbkdANLXbgkW1NS0f7LHe4BMxiV6LodCnt1vITwWSST2DI; SUHB=0xnCvYrYJqVQyb; SSOLoginState=1520641555; M_WEIBOCN_PARAMS=uicode%3D20000174",
        'postman-token': "0b85ea3b-073b-a799-4593-61095e4ed01a"
    }

    db_manager = CrawlDatabaseManager()

    feeds_db_manager = FeedsMongoManager()

    threads = []

    run = False

    def __init__(self):
        pass

    # created_time formats handled below: "5s前" (5s ago), "5分钟前" (5 minutes ago),
    # "5小时前" (5 hours ago), "05:05", "05-05", "2017-05-05"
    def get_time(self, created_time):
        print(created_time)
        if u's' in created_time:
            time_sec = re.findall(u'(\d*)s', created_time)[0]
            time = mktime(
                strptime(
                    strftime("%Y-%m-%d-%H-%M-") + time_sec,
                    "%Y-%m-%d-%H-%M-%S"))
            return time
        elif u'分钟' in created_time:
            time_min = str(60 - int(re.findall(u'(\d*)分', created_time)[0]))
            time = mktime(
                strptime(
                    strftime("%Y-%m-%d-%H-%S-") + time_min,
                    "%Y-%m-%d-%H-%S-%M"))
            return time
        elif u'小时' in created_time:
            time_hour = str(24 - int(re.findall(u'(\d*)小', created_time)[0]))
            time = mktime(
                strptime(
                    strftime("%Y-%m-%d-%M-%S-") + time_hour,
                    "%Y-%m-%d-%M-%S-%H"))
            return time
        elif u':' in created_time:
            time_hour_min = created_time
            time = mktime(
                strptime(
                    strftime("%Y-%m-%d-%S-") + time_hour_min,
                    "%Y-%m-%d-%S-%H:%M"))
            return time
        elif len(created_time) == 5:
            time_mon_day = created_time
            time = mktime(
                strptime(
                    strftime("%Y-%H-%M-%S-") + time_mon_day,
                    "%Y-%H-%M-%S-%m-%d"))
            return time
        elif len(created_time) == 10:
            time_year_mon_day = created_time
            time = mktime(
                strptime(
                    strftime("%H-%M-%S-") + time_year_mon_day,
                    "%H-%M-%S-%Y-%m-%d"))
            return time
        else:
            return None

    def get_feeds(self, uid, page):
        url = (self.url_format) % (uid, uid, uid, page)
        req = requests.request("GET", url, headers=self.headers)
        req.encoding = req.apparent_encoding
        return req.text

    def get_uid(self):
        uid = self.db_manager.dequeue_user()
        if uid is None:
            return None
        return uid[1]


    # def start(self):
    #     self.run = True
    #     t = threading.Thread(target=self.crawl_feeds, name=None)
    #     self.threads.append(t)
    #     # set daemon so the main thread can exit when it receives Ctrl-C
    #     t.daemon = True
    #     t.start()

    def crawl_feeds(self):
        self.run = True

        while self.run:
            uid = self.get_uid()
            print("start new user crawling")
            if uid is None:
                self.run = False
                break
            for page in range(1, MAX_PAGE):
                feeds_str = self.get_feeds(uid, page)
                feeds = json.loads(feeds_str)
                for feed in feeds['data']['cards']:
                    if feed['card_type'] != 9:
                        continue
                    try:
                        if 'mblog' in feed.keys():
                            self.feeds_db_manager.insert_feed(
                                feed,
                                self.get_time(feed['mblog']['created_at']))
                            print('--------\n' +
                                  feed['mblog']['user']['screen_name'] +
                                  '\n--------\n' +
                                  re.sub(r'<.*?>', '', feed['mblog']['text']))
                        else:
                            continue
                    except Exception as err:
                        print(err)
                        continue
                    # item_id = feed['itemid']
                    # scheme = feed['scheme']
                    # uid = feed['mblog']['user']['id']
                    # name = feed['mblog']['user']['screen_name']
                    # profile_image_url = feed['mblog']['user']['profile_image_url']
                    # created_at = feed['mblog']['created_at']
                    # text = feed['mblog']['text']
                    # feed_id = feed['mblog']['id']
                    # reposts_count = feed['mblog']['reposts_count']
                    # comments_count = feed['mblog']['comments_count']
                    # attitudes_count = feed['mblog']['attitudes_count']
                    # page_info = feed['mblog']['page_info']
                    # pics = feed['mblog']['pics']

            time.sleep(CRAWL_DELAY)
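Both FeedsCrawler variants have to turn Weibo's relative created_at strings (for example "5秒前", "5分钟前", "5小时前", "今天 05:05", "05-05", "2017-05-05") into epoch seconds. A compact, best-effort sketch of the same conversion, written against those assumed formats rather than the project's exact helper:

import re
import time

def created_at_to_epoch(created_at, now=None):
    # Best-effort conversion of a Weibo relative timestamp to epoch seconds.
    now = time.time() if now is None else now
    m = re.search(r'(\d+)\s*(?:s|秒)', created_at)   # "N seconds ago"
    if m:
        return now - int(m.group(1))
    m = re.search(r'(\d+)\s*分钟', created_at)       # "N minutes ago"
    if m:
        return now - int(m.group(1)) * 60
    m = re.search(r'(\d+)\s*小时', created_at)       # "N hours ago"
    if m:
        return now - int(m.group(1)) * 3600
    m = re.search(r'(\d{2}:\d{2})', created_at)      # "今天 HH:MM"
    if m and u'今天' in created_at:
        today = time.strftime('%Y-%m-%d ')
        return time.mktime(time.strptime(today + m.group(1), '%Y-%m-%d %H:%M'))
    # "2017-05-05" as-is, or "05-05" with the current year prepended
    for candidate in (created_at, time.strftime('%Y-') + created_at):
        try:
            return time.mktime(time.strptime(candidate, '%Y-%m-%d'))
        except ValueError:
            pass
    return None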
Example #6
class FeedsCrawler:

    url_format = "https://m.weibo.cn/api/container/getIndex?uid=%s&type=uid&value=%s&containerid=107603%s&page=%d"

    querystring = {"version": "v4"}

    payload = "------WebKitFormBoundary7MA4YWxkTrZu0gW\r\nContent-Disposition: form-data; name=\"version\"\r\n\r\nv4\r\n------WebKitFormBoundary7MA4YWxkTrZu0gW--"
    headers = {
        'content-type':
        "multipart/form-data; boundary=----WebKitFormBoundary7MA4YWxkTrZu0gW",
        'host': "m.weibo.cn",
        'connection': "keep-alive",
        'cache-control': "no-cache",
        'upgrade-insecure-requests': "1",
        'user-agent':
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        'accept':
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        'accept-encoding': "gzip, deflate, sdch, br",
        'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6",
        'cookie':
        "SCF=AlTf48qNezF12LbNvCHGGee_Nymdun-Sp9kGATl9gjhJAPPkj2QBT2-Y2MECfIjqy1QjvcBbdVr9HWi6hgbgnTQ.; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhEEfKT6-E_qQ8I2HTu2.Vu5JpX5o2p5NHD95Qp1hq4She41K-pWs4DqcjGC2Hkg.y8Kntt; SUB=_2A250CvjnDeRhGeBP7FoW9SvEwjiIHXVX9JivrDV6PUJbkdANLUvGkW1966OJJxi88Ah66us23Spcr23Dpw..; SUHB=0cSXjt5Dqq_ieZ; _T_WM=e0f6480701da87741a5b948440f9d665; SSOLoginState=1495508844; ALF=1498100844; H5_INDEX=0_all; H5_INDEX_TITLE=%E4%BD%A0%E5%B7%B2%E7%BB%8F%E8%A2%AB%E7%A7%BB%E9%99%A4%E7%BE%A4%E8%81%8A; M_WEIBOCN_PARAMS=featurecode%3D20000320%26oid%3D4110491498745329%26luicode%3D10000011%26lfid%3D231051_-_followers_-_5979396421",
        'postman-token': "0b85ea3b-073b-a799-4593-61095e4ed01a"
    }

    db_manager = CrawlDatabaseManager(10)

    feeds_db_manager = FeedsMongoManager()

    threads = []

    run = False

    def __init__(self):
        pass

    def get_time(self, created_time):
        # created_time = created_time.replace(' ', '')
        if u'秒前' in created_time:
            return time.time()
        mins = re.findall(u'(.*)分钟', created_time)
        if len(mins) > 0:
            return time.time() - int(mins[0]) * 60

        hours = re.findall(u'(.*)小时', created_time)
        if len(hours) > 0:
            return time.time() - int(hours[0]) * 3600

        today_time = re.findall(u'今天.?(\d\d:\d\d)', created_time)
        if len(today_time) > 0:
            ct = time.strftime(u'%Y/%m/%d ') + today_time[0]
            return time.mktime(time.strptime(ct, u'%Y/%m/%d %H:%M'))

        md = re.findall(u'\d{2}\-\d{2}', created_time)

        if len(md) > 0:
            return time.mktime(time.strptime(md[0], u'%m-%d'))

        yesterday = re.findall(u'昨天.*(\d{2}:\d{2})', created_time)
        if len(yesterday) > 0:
            return time.time() - 86400 + int(yesterday[0][0:2]) * 3600 + int(
                yesterday[0][3:]) * 60

        str_time = re.findall(u'.*\d\d:\d\d', created_time)

        if len(str_time) > 0:
            # re.findall returns a list; work on the first matched string
            str_time = str_time[0]
            if u'月' in str_time:
                return time.mktime(
                    time.strptime(
                        time.strftime(u'%Y-') + str_time, u'%Y-%m月%d日 %H:%M'))
            else:
                try:
                    return time.mktime(
                        time.strptime(str_time, u'%Y-%m-%d %H:%M'))
                except Exception:
                    return time.mktime(
                        time.strptime(
                            time.strftime(u'%Y-') + str_time,
                            u'%Y-%m-%d %H:%M'))

    def fetch_feeds(self, uid, page):
        url = (self.url_format) % (uid, uid, uid, page)
        return requests.request("GET",
                                url,
                                data=self.payload,
                                headers=self.headers,
                                params=self.querystring)

    def next_uid(self):
        uid = self.db_manager.dequeue_user()
        if uid is None:
            return None
        return uid['user_id']

    def start(self):
        self.run = True
        t = threading.Thread(target=self.crawl_feeds, name=None)
        self.threads.append(t)
        # set daemon so the main thread can exit when it receives Ctrl-C
        t.daemon = True
        t.start()

    def crawl_feeds(self):
        self.run = True

        while self.run:

            uid = self.next_uid()
            if uid is None:
                print("No more users available")
                self.run = False
                break

            print('uid ', uid)
            page = 1
            while page < MAX_PAGE:
                feeds_str = self.fetch_feeds(uid, page)
                page += 1
                feeds = json.loads(feeds_str.text)
                for feed in feeds['data']['cards']:
                    print('running')
                    if feed['card_type'] != 9:
                        continue
                    if 'mblog' in feed:
                        self.feeds_db_manager.insert_feed(
                            feed, self.get_time(feed['mblog']['created_at']))
                        print('--------\n' +
                              feed['mblog']['user']['screen_name'] +
                              '\n--------\n' + feed['mblog']['text'])

                    # item_id = feed['itemid']
                    # scheme = feed['scheme']
                    # uid = feed['mblog']['user']['id']
                    # name = feed['mblog']['user']['screen_name']
                    # profile_image_url = feed['mblog']['user']['profile_image_url']
                    # created_at = feed['mblog']['created_at']
                    # text = feed['mblog']['text']
                    # feed_id = feed['mblog']['id']
                    # reposts_count = feed['mblog']['reposts_count']
                    # comments_count = feed['mblog']['comments_count']
                    # attitudes_count = feed['mblog']['attitudes_count']
                    # page_info = feed['mblog']['page_info']
                    # pics = feed['mblog']['pics']

            time.sleep(CRAWL_DELAY)
            print(feeds)
Example #7
def __init__(self, uid):
    self.db_manager = CrawlDatabaseManager(10)
    self.root_uid = uid
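Each crawler runs its worker loop on a daemon thread so that Ctrl-C in the main thread ends the whole process. A minimal driver sketch for the Example #6 FeedsCrawler (assuming the project's CrawlDatabaseManager and FeedsMongoManager are importable and configured):

import time

crawler = FeedsCrawler()
crawler.start()                          # crawl on a background daemon thread
try:
    while any(t.is_alive() for t in crawler.threads):
        time.sleep(1)                    # keep the main thread free to catch Ctrl-C
except KeyboardInterrupt:
    crawler.run = False                  # ask the worker loop to stop; daemon
                                         # threads also die with the main thread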