def __init__(self, sid):
    self._sid = sid
    self._login = WeiboLogin(settings.ACCOUNT, settings.PASSWORD)
    # Save crawled content under a directory named after the blogger's ID
    self._dir = 'resource/' + sid
    self._parser = WeiboParser(self._dir)
def login_hook(opener, **kw):
    username = str(kw['username'])
    passwd = str(kw['password'])
    loginer = AccountLogin(opener, username, passwd)
    ret = loginer.login()
    return ret
def main():
    # RedisCookies.clean()
    weiboLogin = WeiboLogin()
    success = []
    failed = []
    for account in ACCOUNTS:
        try:
            LOGGER.info('get cookies for %s' % str(account))
            cookies = weiboLogin.login_by_selenium(account['user'], account['password'])
            if cookies is not None and 'SSOLoginState' in cookies and 'SUBP' in cookies and 'SUHB' in cookies:
                success.append(account)
                RedisCookies.save_cookies(account['user'], cookies)
            else:
                failed.append(account)
        except Exception:
            LOGGER.error("get cookies failed")
            traceback.print_exc()
            failed.append(account)
    LOGGER.info("%d accounts login success" % len(success))
    LOGGER.info("%d accounts login failed" % len(failed))
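# The RedisCookies helper used above is project-specific and not shown here.
# Below is a minimal sketch of what such a helper might look like, assuming
# cookies are stored as JSON strings in a Redis hash keyed by account name.
# The key layout and method bodies are assumptions for illustration only.
import json

import redis


class RedisCookiesSketch:
    _client = redis.StrictRedis(host='localhost', port=6379, db=0)
    _hash_key = 'weibo:cookies'  # hypothetical key name

    @classmethod
    def save_cookies(cls, user, cookies):
        # Store the cookie dict for this account as a JSON string.
        cls._client.hset(cls._hash_key, user, json.dumps(cookies))

    @classmethod
    def fetch_cookies(cls, user):
        # Return the saved cookie dict for this account, or None if absent.
        raw = cls._client.hget(cls._hash_key, user)
        return json.loads(raw) if raw else None

    @classmethod
    def clean(cls):
        # Drop all saved cookies.
        cls._client.delete(cls._hash_key)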
def refresh_cookies(self, ck_dir):
    """Refresh the cookie db."""
    idx = 0
    # Delete all existing cookies
    # NOTE: cookie_dir (the on-disk cookie store) is presumably defined elsewhere
    # in the class/module; ck_dir is the folder holding account credential files.
    if os.path.isdir(cookie_dir):
        shutil.rmtree(cookie_dir)
    os.mkdir(cookie_dir)
    # Load accounts (user<TAB>password lines) from the given folder
    accounts = []
    for root, dirs, files in os.walk(ck_dir):
        for filespath in files:
            full_name = os.path.join(root, filespath)
            with open(full_name) as f:
                for line in f.readlines():
                    if line:
                        u, p = line.split('\t')
                        if u and p:
                            accounts.append((u.strip(), p.strip()))
    # Log in with each account and save its cookies
    for u, p in accounts:
        opener = MechanizeOpener(
            user_agent='Baiduspider+(+http://www.baidu.com/search/spider.htm)',
            timeout=10)
        opener.browser.set_proxies({'http': get_ip_proxy(size=10)})
        lm = WeiboLogin(opener, u, p)
        try:
            status = lm.login()
        except Exception as ex:
            self.logger.warn("login error:%s" % u)
            self.logger.error(ex)
            continue
        if status:
            idx += 1
            opener.cj.save(os.path.join(cookie_dir, '%d.txt' % idx),
                           ignore_discard=True, ignore_expires=True)
            self.validated.append("%s\t%s\r\n" % (u, p))
        opener.close()
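# Companion sketch: reading one of the cookie files saved above back into a
# cookie jar, assuming the jar was written in Mozilla/Netscape text format
# (the save() arguments are compatible with either FileCookieJar flavor).
# The file path is a placeholder; this load step is not part of the original method.
try:
    from http.cookiejar import MozillaCookieJar  # Python 3
except ImportError:
    from cookielib import MozillaCookieJar       # Python 2

cj = MozillaCookieJar()
cj.load('cookies/1.txt', ignore_discard=True, ignore_expires=True)
for cookie in cj:
    print(cookie.name, cookie.value)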
def crawl_author():
    mongo_url = "localhost"
    mongo_db = "weibo"
    mongo_coll = "author"
    account = input("enter your weibo account:\n")
    pwd = input("enter your password:\n")
    login = WeiboLogin(account, pwd)
    sp = AuthorSpider(login)
    with AuthorPipeline(mongo_url, mongo_db, mongo_coll) as pipe:
        for item in sp.crawl():
            pipe.save(item)
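# The AuthorPipeline used above is not shown. The following is a plausible
# sketch of such a MongoDB-backed pipeline (a context manager with a save()
# method) built on pymongo; the class name and behavior are assumptions.
from pymongo import MongoClient


class AuthorPipelineSketch:
    def __init__(self, mongo_url, mongo_db, mongo_coll):
        self._url = mongo_url
        self._db_name = mongo_db
        self._coll_name = mongo_coll

    def __enter__(self):
        # Open the connection when entering the with-block.
        self._client = MongoClient(self._url)
        self._coll = self._client[self._db_name][self._coll_name]
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self._client.close()

    def save(self, item):
        # Insert one crawled author record (assumed to be a dict-like item).
        self._coll.insert_one(dict(item))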
def crawl_weibo(start=0, width=300):
    with open("weibo_ids.csv") as f:
        ids = f.readlines()
    ids = [s.strip().strip('"') for s in ids[1:]]
    account = input("enter your weibo account:\n")
    pwd = input("enter your password:\n")
    lg = WeiboLogin(account, pwd)
    sp = WeiboSpider2(lg)
    mongo_url = "localhost"
    mongo_db = "weibo"
    mongo_coll = "posts_newfields"
    id_ = "1006062557129567"
    with WeiboPipeline(mongo_url, mongo_db, mongo_coll) as pipe:
        for i, id_ in enumerate(ids[:start + width]):
            print(i)
            if i < start:
                continue
            for item in sp.crawl(id_, "2015-01-01 00:00"):
                pipe.update(item)
# -*- coding: utf-8 -*-
from crawler import WeiboCrawler
from login import WeiboLogin

if __name__ == '__main__':
    username = '******'
    pwd = 'Cc19900201'
    keywords = ['#中国人寿#', '#中国人寿保险#', '#中国人寿保险公司#']
    WeiboLogin(username, pwd).login()
    WeiboCrawler(
        isConnectMySQL=True,
        htmlOutputDir='/Users/cchen224/Downloads/China Life').search(keywords)
    print('Finally!!')
def login_hook(opener, **kw):
    username = kw['username']
    passwd = kw['password']
    loginer = WeiboLogin(opener, username, passwd)
    return loginer.login()
if __name__ == '__main__':
    base_url = "https://www.weibo.com/u/%s?is_all=1"
    wb_url = "https://www.weibo.com/p/%s/home?pids=Pl_Official_MyProfileFeed__22&is_search=0&visible=0&is_all=1&is_tag=0&profile_ftype=1&page=%s#feedtop"
    ajax_url = "https://www.weibo.com/p/aj/v6/mblog/mbloglist?ajwvr=6&domain=%s&is_search=0&visible=0&is_all=1&is_tag=0&profile_ftype=1&page=%s&pagebar=%s&pl_name=Pl_Official_MyProfileFeed__22&id=%s&pre_page=%s"
    # url = ajax_url % (domain_id, page, page_bar, page_id)
    sleep_time = 0.1
    page_limit = 50
    max_pages = 100

    # S1: Get cookie
    print('Get logined cookie...')
    username = '******'
    password = '******'
    dl = WeiboLogin(username=username, password=password)
    with open('login.cookie', 'rb') as f:
        dl.session.cookies.update(pickle.load(f))

    # S2: Create spider
    print('Initialized the Spider...')
    spider = WeiboSpider(dl)
    log = spider.IOHandle['log']

    # S3: Treat uids
    for uid in spider.uids:
        print("start to treat uid:", uid)
        log.write("start to treat uid: %s\n" % uid)
        spider.uid = uid
        # 3.0
        spider.make_output_handles()
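# The script above only shows loading 'login.cookie'. A file like it could be
# produced once after a successful login, assuming dl.session is a
# requests.Session (as the cookies.update() call suggests). This save step is
# an assumption for illustration, not part of the original script.
import pickle

dl = WeiboLogin(username=username, password=password)
dl.login()  # hypothetical: whatever call performs the actual login
with open('login.cookie', 'wb') as f:
    pickle.dump(dl.session.cookies, f)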
class Spider:
    def __init__(self, sid):
        self._sid = sid
        self._login = WeiboLogin(settings.ACCOUNT, settings.PASSWORD)
        # Save crawled content under a directory named after the blogger's ID
        self._dir = 'resource/' + sid
        self._parser = WeiboParser(self._dir)

    def __spider(self, url):
        cookies = self._login.cookies()  # fetch the login cookies
        if not cookies:
            if not self._login.login():
                print("==> login failed, exit!")
                exit(-1)
            cookies = self._login.cookies()  # re-read cookies after a fresh login
        headers = {'User-Agent': USER_AGENT}
        # Request the URL; the response page contains the post content
        response = requests.get(url, cookies=cookies, headers=headers)
        if response.status_code != 200:
            print('network error')
            return
        # re.S makes '.' match newlines too, so line breaks do not break the match
        match = re.search('Sina Visitor System', response.text, re.S)
        if not match:
            # Logged in successfully; start parsing the post content
            content = BeautifulSoup(response.text, 'lxml', exclude_encodings=['gbk'])
            self._parser.parse(content)
            print("match")
        else:
            print('<Sina Visitor System> error')

    def running(self, start_page, end_page):
        print("*" * 50)
        print("*" + " Start crawling sina")
        print("*" + " Account: %s" % settings.ACCOUNT)
        print("*" + " Sid: %s" % self._sid)
        print("*" + " page: %d - %d" % (start_page, end_page))
        print("*" * 50)
        # Create the output directory if it does not exist yet
        if not os.path.exists(self._dir):
            os.makedirs(self._dir)
        for i in range(start_page, end_page + 1):
            _pager = WeiboPage(self._sid, i)
            print("===> START Page <%d --- 1>" % i)
            self.__spider(_pager.first())
            print("===> END Page <%d --- 1>" % i)
            print("===> START Page <%d --- 2>" % i)
            self.__spider(_pager.second())
            print("===> END Page <%d --- 2>" % i)
            print("===> START Page <%d --- 3>" % i)
            self.__spider(_pager.third())
            print("===> END Page <%d --- 3>" % i)
            if i != end_page:
                time_random = random.randint(10, 20)
                print("==> Sleep %d seconds" % (time_random + 10))
                time.sleep(time_random + 10)
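# WeiboPage (used by Spider.running above) is not shown in this collection.
# A weibo.com profile feed page is delivered in three parts: the initial HTML
# plus two lazy-loaded "pagebar" AJAX chunks, which is what the
# first()/second()/third() calls map to. The sketch below illustrates that
# idea only; the URL templates and parameters are assumptions modeled on the
# ajax_url pattern shown earlier, not the project's actual implementation.
class WeiboPageSketch:
    HTML_URL = 'https://weibo.com/p/%s/home?is_all=1&page=%d'
    AJAX_URL = ('https://weibo.com/p/aj/v6/mblog/mbloglist?ajwvr=6'
                '&domain=%s&is_all=1&page=%d&pagebar=%d&id=%s&pre_page=%d')

    def __init__(self, sid, page):
        self._sid = sid    # page ID of the blogger being crawled
        self._page = page  # feed page number

    def first(self):
        # Part 1: the initial HTML of the feed page.
        return self.HTML_URL % (self._sid, self._page)

    def second(self):
        # Part 2: first lazy-loaded chunk (pagebar=0).
        return self.AJAX_URL % (self._sid, self._page, 0, self._sid, self._page)

    def third(self):
        # Part 3: second lazy-loaded chunk (pagebar=1).
        return self.AJAX_URL % (self._sid, self._page, 1, self._sid, self._page)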
# coding=utf-8
from login import WeiboLogin
import collectData

if __name__ == '__main__':
    uid = '18640376585'
    psw = '89364013'
    WeiboLogin(uid, psw)
    collectData.main()
def login_hook(opener, **kw):
    username = str(kw["username"])
    passwd = str(kw["password"])
    loginer = WeiboLogin(opener, username, passwd)
    return loginer.login()
""" #新的短信验证方式,不支持多线程同时登陆,会造成发生短信颜值码频率太高的错误,所以先使用单线程的方式实现 uname = str(user_config['job']['login'][0]['username']) passwd = str(user_config['job']['login'][0]['password']) user_agent = """ Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.107 Safari/537.36""" cookie = """SINAGLOBAL=122.234.236.211_1449673121.694525; Apache=122.234.236.211_1449673121.694527; SUB=_2AkMhNM6Vf8NhqwJRmPoUxW_naItzygjEiebDAH_sJxJjHlEO7FBtRgGyzabhoI02ECY9_U0P29jX; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWu0sYw.Q-ey7_1U9OXyjuM; ULOGIN_IMG=gz-fcc5afecc7602110ffffd666df9d024f0051""" cookies_filename = os.path.join(os.path.dirname(os.path.abspath(__file__)), cookies_filename) opener = MechanizeOpener(user_agent=user_agent, cookie_filename=cookies_filename) #opener.browser.addheaders = [('User-agent',user_agent),('Connection','keep-alive'),('Cookie',cookie)] opener.browser.addheaders = [('User-agent',user_agent),('Connection','keep-alive')] loginer = WeiboLogin(opener,uname,passwd) #TODO:尝试直接使用上次的cookie,不重新登录... #is_need_login = False is_need_login = True if not is_need_login or loginer.login() == True: msg = """已经成功登录微博,请继续使用opener对象访问微博的其他页面,比如:\n response = opener.open('URL地址','要提交的数据,请先用urllib.urlencode进行编码') """ try: from IPython import embed embed(banner2 = msg) except ImportError: import code code.interact(msg, local=globals()) else: print '登录失败,每天发送验证短信的次数只有4-5次'