Example No. 1
    def __init__(self, search_key, user_name=USER_NAME, passwd=PASSWD):
        # login to sinaweibo
        self.driver = webdriver.PhantomJS()
        self.wl = WeiboLogin(user_name, passwd,
                             self.driver)  # the interface for authorization

        if self.wl.login():
            logging.info('login successfully')
        else:
            logging.info('login failed')
            sys.exit(1)
        self.sk = search_key.strip()
        return
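# Note: USER_NAME and PASSWD are module-level constants assumed to be defined
# elsewhere in this project, and webdriver.PhantomJS() requires the
# now-discontinued PhantomJS binary to be available on the PATH.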
Example No. 2
    def __init__(self, name="*****@*****.**", password="******", uid="1709818975", *args, **kwargs):
        super(UserInfoCrawl, self).__init__(*args, **kwargs)
        self.uid = uid
        self.start_urls = ["http://weibo.com"]
        self.allowed_domains = ["weibo.com", "weibo.cn"]
        self.url_base = "http://weibo.cn"
        self.first_flag_info = True  # skip crawling this account's own weibo posts
        self.first_flag_home = True  # the account's own profile page is handled differently from other users'

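        # Reuse cached login cookies from a previous run when available;
        # delete weibocookie.json to force a fresh login.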
        if os.path.exists("weibocookie.json"):
            with open("weibocookie.json", "r") as f:
                self.cookie = json.load(f)
        else:
            self.weibo = WeiboLogin()
            self.session = self.weibo.login(name, password)
            cookiejar = requests.utils.dict_from_cookiejar(self.session.cookies)

            # Set sina weibo cookie
            self.cookie = {'ALF': cookiejar['ALF'],
                           'sso_info': cookiejar['sso_info'],
                           'SUB': cookiejar['SUB'],
                           'SUBP': cookiejar['SUBP'],
                           'SUE': cookiejar['SUE'],
                           'SUHB': cookiejar['SUHB'],
                           'SUP': cookiejar['SUP'],
                           'SUS': cookiejar['SUS']}
            with open("weibocookie.json", "w") as f:
                json.dump(self.cookie, f)
Example No. 3
    def get_weibo_token(self, appkey, appsecret, url, username, password):
        logging.info("preparing weibo OAuth2:")
        logging.info("appkey: %s username: %s" % (appkey, username))
        self.weibo_client = APIClient(app_key=appkey,
                                      app_secret=appsecret,
                                      redirect_uri=url)
        code = WeiboLogin(username, password, appkey, url).get_code()
        logging.info("code: %s" % code)
        r = self.weibo_client.request_access_token(code)
        self.weibo_client.set_access_token(r.access_token, r.expires_in)
        logging.info("token: %s" % r.access_token)
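# A minimal usage sketch (assumptions: `crawler` is an instance of whatever class
# defines get_weibo_token above, and the key/secret/credentials are placeholders):
#
#     crawler.get_weibo_token(appkey="YOUR_APP_KEY",
#                             appsecret="YOUR_APP_SECRET",
#                             url="https://api.weibo.com/oauth2/default.html",
#                             username="user@example.com",
#                             password="password")
#     # crawler.weibo_client now holds an APIClient with a valid access token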
Example No. 4
    def __init__(self, search_key, user_name=USER_NAME, passwd=PASSWD):
        # login to sinaweibo
        self.driver = webdriver.PhantomJS()
        self.wl = WeiboLogin(user_name, passwd, self.driver) # the interface for authorization

        if self.wl.login():
            logging.info('login successfully')
        else:
            logging.info('login failed')
            sys.exit(1)
        self.sk = search_key.strip()
        return
Example No. 5
    def __init__(self,
                 name="*****@*****.**",
                 password="******",
                 uid="09424248189",
                 *args,
                 **kwargs):
        super(UserInfoCrawl, self).__init__(*args, **kwargs)
        self.uid = uid
        self.start_urls = ["http://weibo.com"]
        self.allowed_domains = ["weibo.com", "weibo.cn"]
        self.url_base = "http://weibo.cn"
        self.first_flag_info = True  # skip crawling this account's own weibo posts
        self.first_flag_home = True  # the account's own profile page is handled differently from other users'

        if os.path.exists("weibocookie.json"):
            with open("weibocookie.json", "r") as f:
                self.cookie = json.load(f)
        else:
            self.weibo = WeiboLogin()
            self.session = self.weibo.login(name, password)
            cookiejar = requests.utils.dict_from_cookiejar(
                self.session.cookies)

            # Set sina weibo cookie
            self.cookie = {
                'ALF': cookiejar['ALF'],
                'sso_info': cookiejar['sso_info'],
                'SUB': cookiejar['SUB'],
                'SUBP': cookiejar['SUBP'],
                'SUE': cookiejar['SUE'],
                'SUHB': cookiejar['SUHB'],
                'SUP': cookiejar['SUP'],
                'SUS': cookiejar['SUS']
            }
            with open("weibocookie.json", "w") as f:
                json.dump(self.cookie, f)
Example No. 6
class UserInfoCrawl(Spider):
    name = "weibo_user_info"

    # allowed_domains = ["weibo.cn"]

    def __init__(self,
                 name="*****@*****.**",
                 password="******",
                 uid="09424248189",
                 *args,
                 **kwargs):
        super(UserInfoCrawl, self).__init__(*args, **kwargs)
        self.uid = uid
        self.start_urls = ["http://weibo.com"]
        self.allowed_domains = ["weibo.com", "weibo.cn"]
        self.url_base = "http://weibo.cn"
        self.first_flag_info = True  # skip crawling this account's own weibo posts
        self.first_flag_home = True  # the account's own profile page is handled differently from other users'

        if os.path.exists("weibocookie.json"):
            with open("weibocookie.json", "r") as f:
                self.cookie = json.load(f)
        else:
            self.weibo = WeiboLogin()
            self.session = self.weibo.login(name, password)
            cookiejar = requests.utils.dict_from_cookiejar(
                self.session.cookies)

            # Set sina weibo cookie
            self.cookie = {
                'ALF': cookiejar['ALF'],
                'sso_info': cookiejar['sso_info'],
                'SUB': cookiejar['SUB'],
                'SUBP': cookiejar['SUBP'],
                'SUE': cookiejar['SUE'],
                'SUHB': cookiejar['SUHB'],
                'SUP': cookiejar['SUP'],
                'SUS': cookiejar['SUS']
            }
            with open("weibocookie.json", "w") as f:
                json.dump(self.cookie, f)

    def start_requests(self):
        # Parse weibo homepage
        home_url = "http://weibo.cn/u/%s" % self.uid
        yield Request(url=home_url,
                      cookies=self.cookie,
                      callback=self._parse_homepage,
                      errback=self.parse_error)

    def _parse_homepage(self, response):
        html = response.body
        soup = BeautifulSoup(html, "lxml")
        # follower count
        fans_count, uid = self.get_fans_count(soup)

        # number of weibo posts
        weibo_count = self.get_weibo_count(soup)

        # followees (accounts this user follows)
        follow_count, follow_url = self.get_follows(soup)

        # weibo posts; only the first one is crawled
        weibo_item = self.parse_weibo_context(soup, uid)
        if weibo_item is not None:
            yield weibo_item

        weibo_social = WeiboSocialConnection()
        weibo_social["user_id"] = uid
        weibo_social["weibo"] = weibo_count
        weibo_social["fans"] = fans_count
        weibo_social["follow"] = follow_count
        if weibo_count > 10:
            yield weibo_social

        # profile details page
        detail_url_ele = soup.find("a", text=u"资料")
        if detail_url_ele:
            detail_url = self.url_base + detail_url_ele["href"]
            yield Request(url=detail_url,
                          cookies=self.cookie,
                          callback=self.parse_info,
                          errback=self.parse_error,
                          priority=1)

        if follow_url:
            yield Request(url=follow_url,
                          cookies=self.cookie,
                          callback=self.parse_follow,
                          errback=self.parse_error)

    def parse_error(self, response):
        logger.error("post:%s" % response.url)

    def parse_info(self, response):
        html = response.body
        soup = BeautifulSoup(html, "lxml")
        info_tip_ele = soup.find("div", text=u"基本信息")
        uid = self.get_uid_from_response(response)
        info = {}
        if info_tip_ele:
            info_ele = info_tip_ele.next_sibling
            if self.first_flag_info:
                self.first_flag_info = False
                # info_eles = info_ele.find_all("a")
                # for ele in info_eles:
                #     if ele.text in [u"昵称", u"性别", u"地区", u"生日", u"简介"]:
                #         info[ele.text.encode("utf-8")] = ele.next_sibling.encode("utf-8")
                #         print ele.text, ele.next_sibling
            else:
                info_eles = info_ele.strings
                user_info = WeiboUserInfoItem()
                user_info["user_id"] = uid
                for ele in info_eles:
                    el = ele.split(":")
                    if len(el) == 2 and el[0] in [
                            u"昵称", u"性别", u"地区", u"生日", u"简介"
                    ]:
                        info[el[0]] = el[1]
                        info_item = el[1].encode("utf-8")
                        if el[0] == u"昵称":
                            user_info["user_name"] = info_item
                        elif el[0] == u"性别":
                            user_info["sex"] = info_item
                        elif el[0] == u"地区":
                            region = info_item.split(" ")
                            if len(region) == 1:
                                user_info["province"] = ""
                                user_info["city"] = region[0]
                            else:
                                user_info["province"] = region[0]
                                user_info["city"] = region[1]
                        elif el[0] == u"生日":
                            if len(info_item.split("-")) < 3:
                                user_info["birthday"] = "2050-" + info_item
                            else:
                                user_info["birthday"] = info_item
                            p = re.compile(r"^\d{4}-\d{2}-\d{2}$")
                            if not p.findall(user_info["birthday"]):
                                user_info["birthday"] = None
                        elif el[0] == u"简介":
                            user_info["abstract"] = info_item.encode("utf-8", "ignore").replace(" ", "").\
                        replace("\n", "").replace("\xc2\xa0", "").replace("\xF0\x9F\x91\x8A", "").\
                                replace("\xF0\x9F\x91\xBC", "").replace("\xF0\x9F\x8C\xB8\xF0\x9F", "")
                yield user_info

    def parse_follow(self, response):
        html = response.body
        soup = BeautifulSoup(html, "lxml")
        table_eles = soup.find_all("table")
        for ele in table_eles:
            follower_url = ele.find("a")["href"]
            yield Request(url=follower_url,
                          cookies=self.cookie,
                          callback=self._parse_homepage,
                          errback=self.parse_error)

    def get_uid_from_response(self, response):
        if isinstance(response, str):
            url = response
        else:
            url = response.url
        pattern = re.compile(r'/(\d+)/?')
        res = re.findall(pattern, url)
        id = 0
        if res:
            id = int(res[0])
            # print "id:", id
        return id

    def parse_weibo_context(self, soup, uid):
        weibo_info = WeiboItem()
        if self.first_flag_home:
            self.first_flag_home = False
            return None
        else:
            contexts = soup.find_all("div", class_="c")
            for item in contexts:
                try:
                    context = item.find("span", class_="ctt")
                    if not context:
                        continue
                    weibo_text = context.text.encode("utf-8", "ignore").replace(" ", "").\
                        replace("\n", "").replace("\xc2\xa0", "").replace("\xF0\x9F\x91\x8A", "").\
                        replace("\xF0\x9F\x91\xBC", "").replace("\xF0\x9F\x8C\xB8\xF0\x9F", "")
                    parent_ele = context.parent.parent
                    like_ele = parent_ele.find(text=re.compile(u"^赞\[\d*\]$"))
                    relay_ele = parent_ele.find(
                        text=re.compile(u"^转发\[\d*\]$"))
                    comment_ele = parent_ele.find(
                        text=re.compile(u"^评论\[\d*\]$"))
                    issue_time_ele = parent_ele.find("span", class_="ct")
                    issue_time = issue_time_ele.text
                    issue_time = issue_time.encode("utf-8")

                    issue = issue_time.split("来自")
                    issue_datetime = ""
                    if len(issue) > 0:
                        if "分钟" in issue[0]:
                            min = filter(str.isdigit, issue[0])
                            t = datetime.datetime.now() - datetime.timedelta(
                                minutes=int(min))
                            issue_datetime = t.strftime("%Y-%m-%d %H:%M:%S")
                        elif "今天" in issue[0]:
                            time = issue[0].replace("今天 ", "").replace(
                                "\xc2\xa0", "")
                            issue_datetime = datetime.datetime.now().strftime(
                                "%Y-%m-%d ") + time
                        else:
                            issue_datetime = issue[0].replace(
                                "月", "-").replace("日",
                                                  "").replace("\xc2\xa0", "")
                            if issue[0].count("-") < 2:
                                issue_datetime = datetime.datetime.now(
                                ).strftime("%Y-") + issue_datetime
                    issue_device = issue[1] if len(issue) > 1 else None

                    weibo_info["context"] = weibo_text
                    weibo_info["user_id"] = uid
                    weibo_info["issue_time"] = issue_datetime.strip()
                    weibo_info["get_time"] = datetime.datetime.now().strftime(
                        "%Y-%m-%d %H:%M:%S")
                    weibo_info["like_count"] = filter(str.isdigit,
                                                      like_ele.encode("utf-8"))
                    weibo_info["relay_count"] = filter(
                        str.isdigit, relay_ele.encode("utf-8"))
                    weibo_info["comment_count"] = filter(
                        str.isdigit, comment_ele.encode("utf-8"))
                    weibo_info["device"] = issue_device

                    # print issue_datetime, issue_device, weibo_text
                    # print like_ele.encode("utf-8"), relay_ele.encode("utf-8"), comment_ele.encode("utf-8")
                    return weibo_info
                    # only the first weibo post is crawled
                except Exception as e:
                    logger.error(e)
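# A minimal sketch of launching this spider (assuming it lives inside a Scrapy
# project that defines WeiboItem, WeiboUserInfoItem, WeiboSocialConnection and
# the logger used above):
#
#     scrapy crawl weibo_user_info -a uid=<target_uid>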
Example No. 7
#coding:utf-8
import urllib2
import post_encode
from weibo_login import WeiboLogin
import get_weibo
if __name__ == '__main__':
    Login = WeiboLogin('17089368196', 'tttt5555')
    if Login.login():
        print "login successful"
    # the page parameter can be looped over to crawl multiple result pages
    html = urllib2.urlopen("http://s.weibo.com/weibo/%25E5%2591%25A8%25E6%2589%25AC%25E9%259D%2592&page=3").read()
    # call the function that parses the html content
    get_weibo.write_all_info(html)
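# Note: the long path segment in the URL above is the percent-encoded search
# keyword as s.weibo.com expects it, and page=3 selects the third result page.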
Example No. 8
import requests
import post_encode
from weibo_login import WeiboLogin
import get_weibo
import re
from lxml import html
from lxml import etree
import string
import random
import time
import os
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
#if __name__ == '__main__':
Login = WeiboLogin('*****@*****.**', 'dan5493')
if Login.login():
    print "login successful"
urls_1, names_1 = get_weibo.get_url()
length = len(urls_1)
for i in range(length):
    urls = []
    names_2 = re.sub('\\\\n', '', names_1[i])
    names_3 = re.sub('   ', '', names_2)
    new_path = get_weibo.path(names_3.decode('unicode_escape'))
    url_1 = re.sub('\\\\', '', urls_1[i])
    url_2 = 'http://s.weibo.com' + url_1
    url_3 = re.sub('Refer=top', 'page=1.html', url_2)  # first page of each topic
    print url_3
    sleeptime_rand = random.randint(3, 10)
    time.sleep(sleeptime_rand)
Example No. 9
class UserInfoCrawl(Spider):
    name = "weibo_user_info"
    # allowed_domains = ["weibo.cn"]

    def __init__(self, name="*****@*****.**", password="******", uid="1709818975", *args, **kwargs):
        super(UserInfoCrawl, self).__init__(*args, **kwargs)
        self.uid = uid
        self.start_urls = ["http://weibo.com"]
        self.allowed_domains = ["weibo.com", "weibo.cn"]
        self.url_base = "http://weibo.cn"
        self.first_flag_info = True  # skip crawling this account's own weibo posts
        self.first_flag_home = True  # the account's own profile page is handled differently from other users'

        if os.path.exists("weibocookie.json"):
            with open("weibocookie.json", "r") as f:
                self.cookie = json.load(f)
        else:
            self.weibo = WeiboLogin()
            self.session = self.weibo.login(name, password)
            cookiejar = requests.utils.dict_from_cookiejar(self.session.cookies)

            # Set sina weibo cookie
            self.cookie = {'ALF': cookiejar['ALF'],
                           'sso_info': cookiejar['sso_info'],
                           'SUB': cookiejar['SUB'],
                           'SUBP': cookiejar['SUBP'],
                           'SUE': cookiejar['SUE'],
                           'SUHB': cookiejar['SUHB'],
                           'SUP': cookiejar['SUP'],
                           'SUS': cookiejar['SUS']}
            with open("weibocookie.json", "w") as f:
                json.dump(self.cookie, f)

    def start_requests(self):
        # Parse weibo homepage
        home_url = "http://weibo.cn/u/%s" % self.uid
        yield Request(url=home_url, cookies=self.cookie, callback=self._parse_homepage, errback=self.parse_error)

    def _parse_homepage(self, response):
        html = response.body
        soup = BeautifulSoup(html, "lxml")
        # follower count
        fans_count, uid = self.get_fans_count(soup)

        # number of weibo posts
        weibo_count = self.get_weibo_count(soup)

        # followees (accounts this user follows)
        follow_count, follow_url = self.get_follows(soup)

        # weibo posts; only the first one is crawled
        weibo_item = self.parse_weibo_context(soup, uid)
        if weibo_item is not None:
            yield weibo_item

        weibo_social = WeiboSocialConnection()
        weibo_social["user_id"] = uid
        weibo_social["weibo"] = weibo_count
        weibo_social["fans"] = fans_count
        weibo_social["follow"] = follow_count
        if weibo_count > 10:
            yield weibo_social

        # profile details page
        detail_url_ele = soup.find("a", text=u"资料")
        if detail_url_ele:
            detail_url = self.url_base + detail_url_ele["href"]
            yield Request(url=detail_url, cookies=self.cookie,
                          callback=self.parse_info, errback=self.parse_error,
                          priority=1)

        if follow_url:
            yield Request(url=follow_url, cookies=self.cookie, callback=self.parse_follow, errback=self.parse_error)

    def parse_error(self, response):
        logger.error("post:%s" % response.url)

    def parse_info(self, response):
        html = response.body
        soup = BeautifulSoup(html, "lxml")
        info_tip_ele = soup.find("div", text=u"基本信息")
        uid = self.get_uid_from_response(response)
        info = {}
        if info_tip_ele:
            info_ele = info_tip_ele.next_sibling
            if self.first_flag_info:
                self.first_flag_info = False
                # info_eles = info_ele.find_all("a")
                # for ele in info_eles:
                #     if ele.text in [u"昵称", u"性别", u"地区", u"生日", u"简介"]:
                #         info[ele.text.encode("utf-8")] = ele.next_sibling.encode("utf-8")
                #         print ele.text, ele.next_sibling
            else:
                info_eles = info_ele.strings
                user_info = WeiboUserInfoItem()
                user_info["user_id"] = uid
                for ele in info_eles:
                    el = ele.split(":")
                    if len(el) == 2 and el[0] in [u"昵称", u"性别", u"地区", u"生日", u"简介"]:
                        info[el[0]] = el[1]
                        info_item = el[1].encode("utf-8")
                        if el[0] == u"昵称":
                            user_info["user_name"] = info_item
                        elif el[0] == u"性别":
                            user_info["sex"] = info_item
                        elif el[0] == u"地区":
                            region = info_item.split(" ")
                            if len(region) == 1:
                                user_info["province"] = ""
                                user_info["city"] = region[0]
                            else:
                                user_info["province"] = region[0]
                                user_info["city"] = region[1]
                        elif el[0] == u"生日":
                            if len(info_item.split("-")) < 3:
                                user_info["birthday"] = "2050-" + info_item
                            else:
                                user_info["birthday"] = info_item
                            p = re.compile(r"^\d{4}-\d{2}-\d{2}$")
                            if not p.findall(user_info["birthday"]):
                                user_info["birthday"] = None
                        elif el[0] == u"简介":
                            user_info["abstract"] = info_item.encode("utf-8", "ignore").replace(" ", "").\
                        replace("\n", "").replace("\xc2\xa0", "").replace("\xF0\x9F\x91\x8A", "").\
                                replace("\xF0\x9F\x91\xBC", "").replace("\xF0\x9F\x8C\xB8\xF0\x9F", "")
                yield user_info

    def parse_follow(self, response):
        html = response.body
        soup = BeautifulSoup(html, "lxml")
        table_eles = soup.find_all("table")
        for ele in table_eles:
            follower_url = ele.find("a")["href"]
            yield Request(url=follower_url, cookies=self.cookie, callback=self._parse_homepage, errback=self.parse_error)

    def get_uid_from_response(self, response):
        if isinstance(response, str):
            url = response
        else:
            url = response.url
        pattern = re.compile(r'/(\d+)/?')
        res = re.findall(pattern, url)
        id = 0
        if res:
            id = int(res[0])
            # print "id:", id
        return id

    def parse_weibo_context(self, soup, uid):
        weibo_info = WeiboItem()
        if self.first_flag_home:
            self.first_flag_home = False
            return None
        else:
            contexts = soup.find_all("div", class_="c")
            for item in contexts:
                try:
                    context = item.find("span", class_="ctt")
                    if not context:
                        continue
                    weibo_text = context.text.encode("utf-8", "ignore").replace(" ", "").\
                        replace("\n", "").replace("\xc2\xa0", "").replace("\xF0\x9F\x91\x8A", "").\
                        replace("\xF0\x9F\x91\xBC", "").replace("\xF0\x9F\x8C\xB8\xF0\x9F", "")
                    parent_ele = context.parent.parent
                    like_ele = parent_ele.find(text=re.compile(u"^赞\[\d*\]$"))
                    relay_ele = parent_ele.find(text=re.compile(u"^转发\[\d*\]$"))
                    comment_ele = parent_ele.find(text=re.compile(u"^评论\[\d*\]$"))
                    issue_time_ele = parent_ele.find("span", class_="ct")
                    issue_time = issue_time_ele.text
                    issue_time = issue_time.encode("utf-8")

                    issue = issue_time.split("来自")
                    issue_datetime = ""
                    if len(issue) > 0:
                        if "分钟" in issue[0]:
                            min = filter(str.isdigit, issue[0])
                            t = datetime.datetime.now() - datetime.timedelta(minutes=int(min))
                            issue_datetime = t.strftime("%Y-%m-%d %H:%M:%S")
                        elif "今天" in issue[0]:
                            time = issue[0].replace("今天 ", "").replace("\xc2\xa0", "")
                            issue_datetime = datetime.datetime.now().strftime("%Y-%m-%d ") + time
                        else:
                            issue_datetime = issue[0].replace("月", "-").replace("日", "").replace("\xc2\xa0", "")
                            if issue[0].count("-") < 2:
                                issue_datetime = datetime.datetime.now().strftime("%Y-") + issue_datetime
                    issue_device = issue[1] if len(issue) > 1 else None

                    weibo_info["context"] = weibo_text
                    weibo_info["user_id"] = uid
                    weibo_info["issue_time"] = issue_datetime.strip()
                    weibo_info["get_time"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    weibo_info["like_count"] = filter(str.isdigit, like_ele.encode("utf-8"))
                    weibo_info["relay_count"] = filter(str.isdigit, relay_ele.encode("utf-8"))
                    weibo_info["comment_count"] = filter(str.isdigit, comment_ele.encode("utf-8"))
                    weibo_info["device"] = issue_device


                    # print issue_datetime, issue_device, weibo_text
                    # print like_ele.encode("utf-8"), relay_ele.encode("utf-8"), comment_ele.encode("utf-8")
                    return weibo_info
                    # only the first weibo post is crawled
                except Exception as e:
                    logger.error(e)
Example No. 10
from weibo import APIClient, APIError
from weibo_login import WeiboLogin, WeiboLoginError

APP_KEY = "3226611318"
APP_SECRET = "4f94b19d1d30c6bce2505e69d22cd62e"
CALLBACK_URL = "https://api.weibo.com/oauth2/default.html"

print("start login...")

client = APIClient(app_key=APP_KEY,
                   app_secret=APP_SECRET,
                   redirect_uri=CALLBACK_URL)

code = ''
try:
    code = WeiboLogin("*****@*****.**", "s2013h1cfr", APP_KEY,
                      CALLBACK_URL).get_code()
except WeiboLoginError as e:
    print("Login Fail [%s]: %s" % (e.error_code, e.error))
    exit(1)

print("code: %s" % code)

r = client.request_access_token(code)

access_token = r.access_token
expires_in = r.expires_in

print("token: %s" % access_token)
print("expires in %s" % expires_in)

client.set_access_token(access_token, expires_in)
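# At this point `client` holds a valid access token and can call the Weibo open
# API on behalf of the logged-in account (the exact call style depends on the
# version of the `weibo` SDK imported above).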
Example No. 11
class WeiboCrawler():
    '''
    crawl weibo using keywords
    '''
    def __init__(self, search_key, user_name=USER_NAME, passwd=PASSWD):
        # login to sinaweibo
        self.driver = webdriver.PhantomJS()
        self.wl = WeiboLogin(user_name, passwd, self.driver) # the interface for authorization

        if self.wl.login():
            logging.info('login successfully')
        else:
            logging.info('login failed')
            sys.exit(1)
        self.sk = search_key.strip()
        return

    def __del__(self):
        self.driver.quit()
        return

    def crawl(self, page_count=1, comments=False):
        '''
        crawl the weibo using the keywords

        page_count: how many pages would be crawled
        '''
        self.results = []
        # get the mids from each result page
        pages = list(range(1, page_count+1))
        random.shuffle(pages)

        for t in ('hot', 'time'):
            for i in pages:
                url_to_crawl = self.get_search_url(i)
                logging.info('crawling page {}:{}'.format(i, url_to_crawl))
                self.driver.get(url_to_crawl)
                # wait the page loading the content
                try:
                    element = WebDriverWait(self.driver, 5).until(
                            lambda x: x.find_elements_by_class_name('feed_list')
                            )
                except TimeoutException:
                    logging.info('there is no weibo content in {}'.format(url_to_crawl))
                    logging.info('you are considered as a robot')
                    logging.info(self.driver.current_url)
                    self.driver.get_screenshot_as_file('./screenshot/error.png')

                    # let user input the verification code
                    verify_user(self.driver, 'search')
                    # break


                weibo_list = self.get_weibo_list(self.driver.page_source) # mid is used to crawl the original weibo content, using batch mode
                self.results.extend(weibo_list)

                # sleep some time to prevent hitting too much
                # time.sleep(1)
            else:
                continue
            break

        # for r in results:
        #     logging.info_dict(r)
        logging.info('total result {}'.format(len(self.results)))


        if comments:
            logging.info('crawling the comments')
            self.crawl_comments()
        return

    def get_search_url(self, page=1, w_type='hot'):
        '''
        compose a search url based on page_num and weibo type
        '''
        # logging.info('generating the url')
        url=''
        url += 'http://'
        url += search_domain
        url += '/wb'
        url += urllib.parse.quote('/'+self.sk)
        url += '&'
        url += urllib.parse.urlencode([
            ('page', page),
            ('xsort', w_type)
            ])

        return url


    def get_weibo_list(self, content):
        '''
        parse the weibo content in the current result page
        content: the source page of the keywords result

        return: a list of weibo object
        '''
        weibo_list = []
        soup = BeautifulSoup(content, 'html5lib')
        for t in soup.find_all('dl', class_='feed_list'):
            if t.has_attr('mid'):
                weibo = self.parse_weibo(t)
                if weibo:
                    weibo_list.append(weibo)
        logging.info('There are {} weibo on this page'.format(len(weibo_list)))
        return weibo_list

    def parse_weibo(self, t):
        '''
        parse weibo object from html
        t: the tag object that has weibo content

        Return weibo object
        '''
        weibo = {}

        try:
            weibo['keywords'] = self.sk.split(' ') #keywords is a list of words
            weibo['mid'] = t['mid']

            # the user name
            weibo['screen_name'] = t.find(name='dt', class_='face').find('a').get('title')
            weibo['user_profile'] = t.find(name='dt', class_='face').find('a').get('href')

            # the content of weibo
            weibo['text'] = t.find(name='dd', class_='content').find('em').get_text().strip()
            # the source url of the weibo
            weibo['source_url'] = t.find(name='a', class_='date').get('href').strip()
            logging.info(weibo['source_url'])

            # logging.info(weibo['text'])

            # meta data
            epoch_length = len(str(int(time.time())))
            time_str = t.find('dd', class_='content').find('p', class_='info W_linkb W_textb').find(name='a', class_='date').get('date')[0:epoch_length]
            time_now = time.localtime(int(time_str))
            weibo['created_at'] = datetime.datetime(*time_now[0:6])
            weibo['source'] = t.find('dd', class_='content').find('p', class_='info W_linkb W_textb').find('a', rel='nofollow').string.strip()

            pop_str = t.find('dd', class_='content').find('p', class_='info W_linkb W_textb').find('span').get_text().strip().replace('\n', '')

            pop_type = {
                    # key: source representation, value: attr
                    '赞': 'like_count',
                    '转发': 'repost_count',
                    '评论': 'comment_count'
                    }
            for key in list(pop_type.keys()):
                pattern = re.compile(r'.*(%s\((\d+)\)).*' % key)
                match = pattern.match(pop_str)
                if match:
                    # logging.info match.group(1)
                    # logging.info match.group(2)
                    weibo[pop_type[key]] = int(match.group(2))
                else:
                    # logging.info key, 'not found.'
                    weibo[pop_type[key]] = 0

        except Exception as e:
            logging.info(e)
            return None

        # logging.info_dict(weibo)
        return weibo

    def save(self, dist_dir='result'):
        '''
        save the search results to file
        '''
        if dist_dir not in os.listdir(os.curdir):
            os.mkdir(dist_dir)
        for w in self.results:
            file_name = ''.join([
                    '_'.join([k for k in w['keywords']]),
                    w['mid']
                    ])
            file_name += '.txt'
            f = codecs.open(os.path.join(dist_dir, file_name), 'w', 'utf-8')
            json.dump(w, f, ensure_ascii = False, default=json_util.default, indent = 2)
            # logging.info(w['text'])
            logging.info('wrote to file {}'.format(file_name))
        return

    def crawl_comments(self):
        '''
        crawl the comments after getting all the results and update the results list --> self
        '''
        client = self.wl.authorize_app()
        if client:
            for w in self.results:
                # logging.info(w['mid'])
                w['comments'] = []
                crawler = WeiboCommentsCrawler(client, weibo_mid = w['mid'])
                r = crawler.crawl()

                # filter out the unrelated fields
                for c in r:
                    c.pop('status')
                w['comments'].extend(r)
        else:
            logging.error('authorization failed; cannot fetch the comment list')
        return
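# A minimal usage sketch (assuming the USER_NAME/PASSWD defaults are configured
# and PhantomJS is available; the keyword is only an illustration):
#
#     wc = WeiboCrawler(u'some keyword')
#     wc.crawl(page_count=2, comments=False)  # fetch two result pages
#     wc.save('result')                       # one file per crawled weibo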
Example No. 12
# -*- coding:utf-8 -*-
import sys
from weibo_login import WeiboLogin
from spider.SearchSpider import SearchSpider
from spider.RCSpider import RCSpider
import requests
import time
import csv
from logconfig import LogConfig
logger = LogConfig.get_logger()
reload(sys)
sys.setdefaultencoding("utf-8")

# log in and keep the session
s_login = requests.session()
w = WeiboLogin()
s_login = w.login_un()
logger.info("login has finished")
time.sleep(1.5)

Search_urls = [
    # search keywords: 杨洋 傅园慧 (confirmed working)
    "http://m.weibo.cn/page/pageJson?containerid=&containerid=100103type%3D1%26q%3D%E6%9D%A8%E6%B4%8B+%E5%82%85%E5%9B%AD%E6%85%A7&type=all&queryVal=%E6%9D%A8%E6%B4%8B+%E5%82%85%E5%9B%AD%E6%85%A7&luicode=20000174&title=%E6%9D%A8%E6%B4%8B+%E5%82%85%E5%9B%AD%E6%85%A7&v_p=11&ext=&fid=100103type%3D1%26q%3D%E6%9D%A8%E6%B4%8B+%E5%82%85%E5%9B%AD%E6%85%A7&uicode=10000011&next_cursor=&page=",
    # search keyword: 傅园慧 (confirmed working)
    "http://m.weibo.cn/page/pageJson?containerid=&containerid=100103type%3D1%26q%3D%E5%82%85%E5%9B%AD%E6%85%A7&type=all&queryVal=%E5%82%85%E5%9B%AD%E6%85%A7&luicode=10000011&lfid=100103type%3D1%26q%3D%E5%82%85%E5%9B%AD%E6%85%A7&title=%E5%82%85%E5%9B%AD%E6%85%A7&v_p=11&ext=&fid=100103type%3D1%26q%3D%E5%82%85%E5%9B%AD%E6%85%A7&uicode=10000011&next_cursor=&page=",
    # search keyword: 洪荒少女 (confirmed working)
    "http://m.weibo.cn/page/pageJson?containerid=&containerid=100103type%3D1%26q%3D%E6%B4%AA%E8%8D%92%E5%B0%91%E5%A5%B3&type=all&queryVal=%E6%B4%AA%E8%8D%92%E5%B0%91%E5%A5%B3&title=%E6%B4%AA%E8%8D%92%E5%B0%91%E5%A5%B3&v_p=11&ext=&fid=100103type%3D1%26q%3D%E6%B4%AA%E8%8D%92%E5%B0%91%E5%A5%B3&uicode=10000011&next_cursor=&page=",
    # search keyword: 洪荒少女傅园慧 (confirmed working)
    "http://m.weibo.cn/page/pageJson?containerid=&containerid=100103type%3D1%26q%3D%E6%B4%AA%E8%8D%92%E5%B0%91%E5%A5%B3%E5%82%85%E5%9B%AD%E6%85%A7&type=all&queryVal=%E6%B4%AA%E8%8D%92%E5%B0%91%E5%A5%B3%E5%82%85%E5%9B%AD%E6%85%A7&luicode=10000011&lfid=100103type%3D1%26q%3D%E6%B4%AA%E8%8D%92%E5%B0%91%E5%A5%B3&title=%E6%B4%AA%E8%8D%92%E5%B0%91%E5%A5%B3%E5%82%85%E5%9B%AD%E6%85%A7&v_p=11&ext=&fid=100103type%3D1%26q%3D%E6%B4%AA%E8%8D%92%E5%B0%91%E5%A5%B3%E5%82%85%E5%9B%AD%E6%85%A7&uicode=10000011&next_cursor=&page=",
    # search keyword: 傅园慧表情包 (confirmed working)
    "http://m.weibo.cn/page/pageJson?containerid=&containerid=100103type%3D1%26q%3D%E5%82%85%E5%9B%AD%E6%85%A7%E8%A1%A8%E6%83%85%E5%8C%85&type=all&queryVal=%E5%82%85%E5%9B%AD%E6%85%A7%E8%A1%A8%E6%83%85%E5%8C%85&luicode=10000011&lfid=100103type%3D1%26q%3D%E6%B4%AA%E8%8D%92%E5%B0%91%E5%A5%B3%E5%82%85%E5%9B%AD%E6%85%A7&title=%E5%82%85%E5%9B%AD%E6%85%A7%E8%A1%A8%E6%83%85%E5%8C%85&v_p=11&ext=&fid=100103type%3D1%26q%3D%E5%82%85%E5%9B%AD%E6%85%A7%E8%A1%A8%E6%83%85%E5%8C%85&uicode=10000011&next_cursor=&page=",
Example No. 13
#coding:utf-8
import urllib2
import post_encode
import time
import os
from weibo_login import WeiboLogin
import get_weibo
if __name__ == '__main__':
    #Login = WeiboLogin('17089368196', 'tttt5555')
    Login = WeiboLogin('username', 'password')  # placeholder credentials
    if Login.login():
        print "login successful"
    rnd = long((time.time()) * 1000)
    # the page parameter can be looped over to crawl multiple pages
    init_url = "http://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id=3917356680052180&max_id=3917762218921722"
    url = init_url + "&page=1&_rnd=" + str(rnd)
    html = urllib2.urlopen(url)
    # call the function that parses the html content
    forward_html = urllib2.urlopen(
        "http://weibo.com/1336593085/D7hwE0dzC?type=repost#_rnd1449550776472"
    ).read()
    #print forward_html
    origin_uid = get_weibo.get_origin_weibo(forward_html)
    (uid, origin_uid2_no, uid2_no, time_no, time_no2, total_forward,
     total_page, current_page) = get_weibo.get_forward(html, forward_html)
    # loop to crawl multiple pages
    print total_page
    uid = []
    origin_uid2 = []
    uid2 = []
    forward_time = []
Example No. 14
class WeiboCrawler():
    '''
    crawl weibo using keywords
    '''
    def __init__(self, search_key, user_name=USER_NAME, passwd=PASSWD):
        # login to sinaweibo
        self.driver = webdriver.PhantomJS()
        self.wl = WeiboLogin(user_name, passwd,
                             self.driver)  # the interface for authorization

        if self.wl.login():
            logging.info('login successfully')
        else:
            logging.info('login failed')
            sys.exit(1)
        self.sk = search_key.strip()
        return

    def __del__(self):
        self.driver.quit()
        return

    def crawl(self, page_count=1, comments=False):
        '''
        crawl the weibo using the keywords

        page_count: how many pages would be crawled
        '''
        self.results = []
        # get the mids from each result page
        pages = list(range(1, page_count + 1))
        random.shuffle(pages)

        for t in ('hot', 'time'):
            for i in pages:
                url_to_crawl = self.get_search_url(i)
                logging.info('crawling page {}:{}'.format(i, url_to_crawl))
                self.driver.get(url_to_crawl)
                # wait the page loading the content
                try:
                    element = WebDriverWait(self.driver, 5).until(
                        lambda x: x.find_elements_by_class_name('feed_list'))
                except TimeoutException:
                    logging.info(
                        'there is no weibo content in {}'.format(url_to_crawl))
                    logging.info('you are considered as a robot')
                    logging.info(self.driver.current_url)
                    self.driver.get_screenshot_as_file(
                        './screenshot/error.png')

                    # let user input the verification code
                    verify_user(self.driver, 'search')
                    # break

                weibo_list = self.get_weibo_list(
                    self.driver.page_source
                )  # mid is used to crawl the original weibo content, using batch mode
                self.results.extend(weibo_list)

                # sleep some time to prevent hitting too much
                # time.sleep(1)
            else:
                continue
            break

        # for r in results:
        #     logging.info_dict(r)
        logging.info('total result {}'.format(len(self.results)))

        if comments:
            logging.info('crawling the comments')
            self.crawl_comments()
        return

    def get_search_url(self, page=1, w_type='hot'):
        '''
        compose a search url based on page_num and weibo type
        '''
        # logging.info('generating the url')
        url = ''
        url += 'http://'
        url += search_domain
        url += '/wb'
        url += urllib.parse.quote('/' + self.sk)
        url += '&'
        url += urllib.parse.urlencode([('page', page), ('xsort', w_type)])

        return url

    def get_weibo_list(self, content):
        '''
        parse the weibo content in the current result page
        content: the source page of the keywords result

        return: a list of weibo object
        '''
        weibo_list = []
        soup = BeautifulSoup(content, 'html5lib')
        for t in soup.find_all('dl', class_='feed_list'):
            if t.has_attr('mid'):
                weibo = self.parse_weibo(t)
                if weibo:
                    weibo_list.append(weibo)
        logging.info('There are {} weibo on this page'.format(len(weibo_list)))
        return weibo_list

    def parse_weibo(self, t):
        '''
        parse weibo object from html
        t: the tag object that has weibo content

        Return weibo object
        '''
        weibo = {}

        try:
            weibo['keywords'] = self.sk.split(
                ' ')  #keywords is a list of words
            weibo['mid'] = t['mid']

            # the user name
            weibo['screen_name'] = t.find(name='dt',
                                          class_='face').find('a').get('title')
            weibo['user_profile'] = t.find(name='dt',
                                           class_='face').find('a').get('href')

            # the content of weibo
            weibo['text'] = t.find(
                name='dd', class_='content').find('em').get_text().strip()
            # the source url of the weibo
            weibo['source_url'] = t.find(name='a',
                                         class_='date').get('href').strip()
            logging.info(weibo['source_url'])

            # logging.info(weibo['text'])

            # meta data
            epoch_length = len(str(int(time.time())))
            time_str = t.find('dd', class_='content').find(
                'p', class_='info W_linkb W_textb').find(
                    name='a', class_='date').get('date')[0:epoch_length]
            time_now = time.localtime(int(time_str))
            weibo['created_at'] = datetime.datetime(*time_now[0:6])
            weibo['source'] = t.find('dd', class_='content').find(
                'p', class_='info W_linkb W_textb').find(
                    'a', rel='nofollow').string.strip()

            pop_str = t.find('dd', class_='content').find(
                'p', class_='info W_linkb W_textb').find(
                    'span').get_text().strip().replace('\n', '')

            pop_type = {
                # key: source representation, value: attr
                '赞': 'like_count',
                '转发': 'repost_count',
                '评论': 'comment_count'
            }
            for key in list(pop_type.keys()):
                pattern = re.compile(r'.*(%s\((\d+)\)).*' % key)
                match = pattern.match(pop_str)
                if match:
                    # logging.info match.group(1)
                    # logging.info match.group(2)
                    weibo[pop_type[key]] = int(match.group(2))
                else:
                    # logging.info key, 'not found.'
                    weibo[pop_type[key]] = 0

        except Exception as e:
            logging.info(e)
            return None

        # logging.info_dict(weibo)
        return weibo

    def save(self, dist_dir='result'):
        '''
        save the search results to file
        '''
        if dist_dir not in os.listdir(os.curdir):
            os.mkdir(dist_dir)
        for w in self.results:
            file_name = ''.join(
                ['_'.join([k for k in w['keywords']]), w['mid']])
            file_name += '.txt'
            f = codecs.open(os.path.join(dist_dir, file_name), 'w', 'utf-8')
            json.dump(w,
                      f,
                      ensure_ascii=False,
                      default=json_util.default,
                      indent=2)
            # logging.info(w['text'])
            logging.info('wrote to file {}'.format(file_name))
        return

    def crawl_comments(self):
        '''
        crawl the comments after getting all the results and update the results list --> self
        '''
        client = self.wl.authorize_app()
        if client:
            for w in self.results:
                # logging.info(w['mid'])
                w['comments'] = []
                crawler = WeiboCommentsCrawler(client, weibo_mid=w['mid'])
                r = crawler.crawl()

                # filter out the unrelated fields
                for c in r:
                    c.pop('status')
                w['comments'].extend(r)
        else:
            logging.error('authorization failed; cannot fetch the comment list')
        return
Example No. 15
def check_uid(uid):
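    # Fetch the user's weibo.com page; return the string captured by user_pat
    # (defined elsewhere) when it matches, True when the page loads but the
    # pattern does not match, and implicitly None when weibo reports a page error.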
    info = wb.get("http://weibo.com/{}".format(uid)).content
    if "page_error" not in info:
        matched = user_pat.search(info.decode("utf-8"))
        if matched:
            return matched.group(1)
        else:
            return True


app = Flask(__name__)
app.config.from_pyfile("config.py")


wb = WeiboLogin(app.config["USERNAME"],
                app.config["PASSWORD"],
                app.config["COOKIE_FILE"])
wb.load_cookies()


@app.route("/weibo", methods=["POST", "GET"])
def weibo():
    wb.test_log_status()
    args = request.form or request.args
    user = args.get("user", "")
    uid = args.get("uid", "")
    action = args.get("action", "start")
    door = args.get("door", "")
    if request.method == "GET":
        base64_image = ""
        if not wb.logged: