# -- Registration script fragment. connect_mongodb() and yonghuming() (a random
# -- username generator) are project helpers defined elsewhere in this repo.
import time

import requests

from rk import RClient


def register(phone):
    coll_insert = connect_mongodb('192.168.0.235', 27017, 'fgg', 'user_info')
    s = requests.session()
    rc = RClient('ruokuaimiyao', 'goojia123456', '95632',
                 '6b8205cf61944329a5841c30e5ed0d5d')
    proxies = {'http': 'http://192.168.0.93:4234/'}
    c = int(time.time() * 1000)
    s.get('http://www.fungugu.com/ShenQingShiYong/fillInformation',
          proxies=proxies)
    jrbqiantai = s.cookies.get_dict()['jrbqiantai']
    # Rebuild the tracking cookies (the '=' between cookie name and value was
    # missing in the original).
    cookie = 'Hm_lpvt_203904e114edfe3e6ab6bc0bc04207cd=' + str(c) + \
             ';Hm_lvt_203904e114edfe3e6ab6bc0bc04207cd=' + str(c) + \
             ';jrbqiantai=' + jrbqiantai
    headers = {
        'Cookie': cookie,
        'Referer': 'http://www.fungugu.com/ShenQingShiYong/fillInformation',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36'
    }
    print(headers)
    # Keep refreshing the picture captcha until the site accepts our answer.
    while True:
        url = ('http://www.fungugu.com/yzmControlller/refreshPicCode?id=pwdCode&'
               + str(c))
        response = s.get(url=url, proxies=proxies)
        code = rc.rk_create(response.content, 3040)['Result']
        print('code:', code)
        params = {
            'code': code,
            'type': 'pwdCode',
        }
        code_url = 'http://www.fungugu.com/yzmControlller/verificationPicCode'
        result = s.post(url=code_url, params=params, proxies=proxies)
        print(result.content)
        if 'true' in result.text:
            break
    params = {'phone': phone, 'picCode': code}
    # Send the SMS verification code to the phone number
    send_url = 'http://www.fungugu.com/yzmControlller/sendCode'
    result = s.post(url=send_url, params=params, proxies=proxies)
    print(result.content.decode())
    print(phone)
    # phone_verification = get_msg(token, phone)
    phone_verification = input('SMS code: ')
    params = {'securityCode': phone_verification, 'phone': phone}
    phone_url = 'http://www.fungugu.com/yzmControlller/verificationSMSCode'
    result = s.post(url=phone_url, params=params, proxies=proxies)
    print('aaa:', result.content)
    user_name = yonghuming()
    params = {
        'keyCode': '',
        'keHuShouJi': phone,
        'yongHuMing': user_name,
        'keHuMiMa': '4ac9fa21a775e4239e4c72317cdca870',
        'quanXianChengShi': '上海',
        'jiGouMingCheng': user_name,
    }
    register_url = 'http://www.fungugu.com/ShenQingShiYong/completionUserInfo'
    result = s.post(url=register_url, params=params, proxies=proxies)
    print(result.text)
    if 'true' in result.text:
        data = {
            'user_name': user_name,
        }
        coll_insert.insert_one(data)
        print('insert succeeded')
    else:
        print('error')
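# Usage sketch (hypothetical): register a batch of numbers read from a text
# file, one phone number per line. 'phones.txt' is illustrative, not a file
# from this repo.
if __name__ == '__main__':
    with open('phones.txt') as f:
        for line in f:
            phone = line.strip()
            if phone:
                register(phone)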
# -*- coding: utf-8 -*-
import requests
import re
import time
import getpass
from rk import RClient

'''
Global cookies; the cookies returned by every network request are merged in here.
'''
all_cookies = {}

# Get a Ruokuai image-recognition client
rc = RClient('ljc1998', 'ljc19980217.', '117226',
             'abf23a6f920644d9b8db7908b773f16a')

'''
URLs used by this script
'''
url = {
    'yiban_login': '******',  # GET, login page
    'yiban_do_login': '******',  # POST, submit login
    'yiban_index': 'https://www.yiban.cn/',  # GET, Yiban home page
    'yiban_app_base': 'https://q.yiban.cn/app/index/appid/',  # GET, Yiban app page; append the app id
    'yiban_signup_get': 'https://q.yiban.cn/signup/getSignupAjax/',  # POST, query lecture status
    'yiban_signup_insert': 'https://q.yiban.cn/signup/insertBoxAjax/',  # POST, grab-a-lecture entry point
    'yiban_captcha_get': 'https://www.yiban.cn/captcha/index/'  # GET, fetch the login captcha
}

'''
Utility function: merge the cookies returned by a network request into the
global cookies, and return the global cookies.
'''
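# A minimal sketch of that utility, assuming it receives a requests CookieJar;
# the original body is cut off above, so the name and signature are assumptions.
def merge_cookies(new_cookies):
    all_cookies.update(requests.utils.dict_from_cookiejar(new_cookies))
    return all_cookies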
# -- Sogou-WeChat crawler fragment. Project-level names (WechatInfo,
# -- UnableProxies, CrawledData, NewGenerationProxy, Config, DBinfo, q_proxies)
# -- are defined elsewhere in this repo.
import logging
import random
import re
import time
import traceback
from urllib import parse

import pymysql
import requests
import wechatsogou
from wechatsogou.structuring import WechatSogouStructuring

from rk import RClient


class Crawl(object):
    def __init__(self):
        # Quiet the chatty third-party loggers, then attach our own handler.
        logging.getLogger("wechatsogou").setLevel(logging.WARNING)
        logging.getLogger("peewee").setLevel(logging.WARNING)
        logging.getLogger("requests").setLevel(logging.WARNING)
        self.logger = logging.getLogger()
        handler = logging.StreamHandler()
        formatter = logging.Formatter(
            '%(asctime)s [%(threadName)s][%(levelname)s] %(message)s')
        handler.setFormatter(formatter)
        self.logger.addHandler(handler)
        self.logger.setLevel(logging.DEBUG)

        self.get_bad_proxies()
        self.WxTable = WechatInfo()
        self.get_conn()
        self.create_target()
        self.get_saved_data()
        self.proxies_list = NewGenerationProxy({
            'anony': 'L4',
            'post': 'false',
            'speed': 3000
        })
        proxyLine = self.proxies_list.getProxy()
        self.wx_api = wechatsogou.WechatSogouAPI(timeout=8,
                                                 proxies={
                                                     'http': proxyLine,
                                                     'https': proxyLine
                                                 })
        SpiderConfig = Config.SpiderConfig
        self.headers = SpiderConfig.headers.json()
        self.weChat_table = WechatInfo()
        self.proxies_table = UnableProxies()
        self.crawled_table = CrawledData()
        self.rk = RClient('ghost2017b', 'Ghost2017b', '107539',
                          'a8bd936aa1574ddb96d14564c1a0d022')

    def get_conn(self):
        conn = pymysql.connect(host=DBinfo.MYSQL_HOST,
                               port=DBinfo.MYSQL_PORT,
                               user=DBinfo.MYSQL_USER,
                               passwd=str(DBinfo.MYSQL_PASSWD),
                               db=DBinfo.MYSQL_DANAME,
                               charset='utf8')
        return conn

    def get_saved_data(self):
        conn = self.get_conn()
        cursor = conn.cursor()
        sql = """select main_url from crawleddata"""
        cursor.execute(sql)
        self.saved_data_list = [x for x, in cursor.fetchall()]

    def get_bad_proxies(self):
        conn = self.get_conn()
        cursor = conn.cursor()
        sql = """select object from unableproxies"""
        cursor.execute(sql)
        self.bad_proxies_list = [x for x, in cursor.fetchall()]

    def create_target(self):
        self.target_list = []
        conn = self.get_conn()
        cursor = conn.cursor()
        sql = """select jobTypeId, jobTypeName from data_job_type where level = '3'"""
        cursor.execute(sql)
        data = cursor.fetchall()
        for num, item in enumerate(data):
            jobTypeId = item[0]
            jobTypeName = item[1]
            target = {
                'jobTypeId': jobTypeId,
                'jobTypeName': jobTypeName,
                'items': []
            }
            self.target_list.append(target)
        cursor.close()

    # def get_proxy(self, scene='default'):
    #     while True:
    #         # url = 'http://api.xdaili.cn/xdaili-api//greatRecharge/getGreatIp?spiderId=c6ffc08035cc49f49ed15c834ba2c8ee&orderno=YZ20185305368PAYAqN&returnType=1&count=1'
    #         # url = 'http://api.xdaili.cn/xdaili-api//greatRecharge/getGreatIp?spiderId=c6ffc08035cc49f49ed15c834ba2c8ee&orderno=YZ2018634506zLQTDj&returnType=2&count=1'
    #         # url = 'http://webapi.http.zhimacangku.com/getip?num=1&type=1&pro=&city=0&yys=0&port=1&time=1&ts=0&ys=0&cs=0&lb=1&sb=0&pb=4&mr=1&regions='
    #         url = 'http://dev.kdlapi.com/api/getproxy/?orderid=953059683695998&num=1&b_pcchrome=1&b_pcie=1&b_pcff=1&protocol=2&method=1&an_an=1&an_ha=1&sep=1'
    #         html = requests.get(url).text
    #         time.sleep(random.randint(5, 10))
    #         if re.findall('普通订单5秒钟内只允许提取1次', html):
    #             self.logger.debug('need to get proxy ' + scene + ' again!')
    #             continue
    #         try:
    #             proxyLine = html
    #             proxies = {
    #                 'https': proxyLine
    #             }
    #         except:
    #             pprint(html)
    #             continue
    #         if str(proxies) in self.bad_proxies_list:
    #             self.logger.debug('ip is bad')
    #             continue
    #         else:
    #             self.logger.debug(u'Update %s proxy to %s' % (scene, proxyLine))
    #             return proxies  # proxyLine proxies

    ''' for run.py '''

    def get_target_list(self, target):
        page = 10
        keyword = target['jobTypeName']
        target['items'] = []
        target['main_url'] = []
        for i in range(page):
            url = (u'http://weixin.sogou.com/weixin?type=2&page=%s&ie=utf8&query=%s&interation='
                   % (i + 1, parse.quote(str(keyword))))
            if url in self.saved_data_list:
                self.logger.debug('the url had been crawled %s/%s'
                                  % (keyword, i + 1))
                continue
            while True:
                def identify_image_callback(image):
                    # Hand the captcha image to Ruokuai and return the answer.
                    result = self.rk.rk_create(image, 3060)
                    if 'Result' in result:
                        self.logger.debug(u'Captcha: %s, ID: %s'
                                          % (result['Result'], result['Id']))
                        return result['Result']
                    self.logger.debug(result)
                    return ''

                try:
                    target_little_list = self.wx_api.search_article(
                        target['jobTypeName'],
                        page=i + 1,
                        identify_image_callback=identify_image_callback,
                    )
                    target['main_url'].append(url)
                    self.logger.debug(
                        u'Name: %s, Page: %s, Count: %s'
                        % (target['jobTypeName'], i, len(target_little_list)))
                    break
                except Exception as e:
                    # Rotate to a fresh proxy and retry the same page.
                    self.logger.debug(traceback.format_exc())
                    self.logger.debug(u'Name: %s, Page: %s, Error: %s'
                                      % (target['jobTypeName'], i, repr(e)))
                    proxyLine = self.proxies_list.getProxy()
                    self.logger.debug(u'Update proxy to %s' % proxyLine)
                    self.wx_api.requests_kwargs['proxies'] = {
                        "https": proxyLine,
                    }
            if len(target_little_list) == 0:
                break
            target['items'].append(target_little_list)
        return target

    def get_target_list_v2(self, target):
        page = 10
        keyword = target['jobTypeName']
        target['items'] = []
        target['main_url'] = []
        first_data = 'init_proxy'
        for i in range(page):
            while True:
                url = (u'http://weixin.sogou.com/weixin?type=2&page=%s&ie=utf8&query=%s&interation='
                       % (i + 1, parse.quote(keyword)))
                self.headers['Referer'] = url
                self.headers['Cookie'] = 'SUV="";SNUID="";'
                if url not in self.saved_data_list:
                    try:
                        self.logger.debug("queue get before size:%s" % q_proxies.qsize())
                        proxies = q_proxies.get()
                        self.logger.debug("queue get after size :%s" % q_proxies.qsize())
                        resp = requests.get(
                            url,
                            proxies=proxies,
                            headers=self.headers,
                            timeout=8,
                        )
                        if resp.ok:
                            if u'antispider' in resp.url:
                                # TODO: record proxies that were flagged as a spider in the database
                                # self.proxies_table.create(object=proxies['http'])
                                self.logger.debug(
                                    u'Name: %s, Page: %s, DetachAntiSpider: %s'
                                    % (keyword, str(i + 1), proxies['http']))
                                first_data = 'detach_spider'
                                continue
                            else:
                                time.sleep(random.randint(2, 5))
                                target_little_list = WechatSogouStructuring.get_article_by_search(
                                    resp.text)
                                if len(target_little_list) == 0:
                                    break  # break: this page has no data
                                target['items'].append(target_little_list)
                                target['main_url'].append(url)
                                self.logger.debug(
                                    'get item %s page %d total %d '
                                    % (keyword, i + 1, len(target_little_list)))
                                break  # break: success
                        else:
                            self.logger.debug(
                                u'Name: %s, Page: %s, HttpError: %s'
                                % (keyword, str(i + 1), str(resp.status_code)))
                            first_data = 'http_error'
                    except Exception as e:
                        self.logger.debug(u'Name: %s, Page: %s, Error: %s'
                                          % (keyword, i + 1, type(e)))
                        first_data = 'catch_exception'
                else:
                    self.logger.debug('the url had been crawled')
                    break  # break: already crawled
        self.logger.debug(u'Name: %s, Total: %s' % (keyword, len(target['items'])))
        return target

    def get_data(self, target):
        try:
            for num, topItem in enumerate(target['items']):
                url = target['main_url'][num]
                self.crawled_table.create(
                    main_url=url,
                    target=target['jobTypeName'],
                )
                for item in topItem:
                    detail_url = item['article']['url']
                    html = requests.get(url=detail_url, headers=self.headers).text
                    # The original pattern had 'lang=="en"', which can never match
                    # the page's 'lang="en"' attribute; fixed here.
                    content = re.findall(
                        r'<div class="rich_media_content " lang="en" id="js_content">([\s\S]*)</div>',
                        html)
                    biz = re.findall(r'var biz = ""\|\|"(.*?)"', html)
                    is_delete = re.findall('该内容已被发布者删除', html)
                    if len(is_delete) == 0:
                        wechatId = re.findall('"profile_meta_value">(.*?)</span>', html)
                        if len(wechatId) == 0:
                            wechatId = ''
                        elif wechatId[0] == '':
                            wechatId = ''
                        else:
                            wechatId = wechatId[0]
                        if len(biz) == 0:
                            biz = ''
                        else:
                            biz = biz[0]
                        obj_wechat = self.weChat_table.create(
                            url=detail_url,
                            jobTypeId=target['jobTypeId'],
                            title=item['article']['title'],
                            content=''.join(content),
                            desc=item['article']['abstract'],
                            createAt=time.strftime(
                                '%Y-%m-%dT%H:%M:%S',
                                time.localtime(item['article']['time'])),
                            wechatId=wechatId,
                            wechatName=item['gzh']['wechat_name'],
                            biz=biz,
                        )
                        self.logger.debug(
                            'item: %s save id: %s'
                            % (target['jobTypeName'], str(obj_wechat.id)))
                    else:
                        self.logger.debug('pass for item is deleted by author')
        except Exception as e:
            self.logger.warning('something wrong : ' + repr(e))
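# A minimal driver sketch for the class above (the repo's actual entry point,
# run.py, is not shown here): crawl every level-3 job type and persist it.
if __name__ == '__main__':
    crawler = Crawl()
    for target in crawler.target_list:
        target = crawler.get_target_list_v2(target)
        crawler.get_data(target)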
# -- Zhima registration fragment. insert_mysql is a method of a class not shown
# -- here; SuMa (SMS platform client), ZhiMa (target-site client) and TABLE are
# -- defined elsewhere in this repo.
    def insert_mysql(self, result):
        self.cursor.execute(
            """insert into {table} (username, password, mobile, api, cookie, id)
               values (%s, %s, %s, %s, %s, %s)""".format(table=TABLE),
            (result['username'], str(result['password']), str(result['mobile']),
             result['api'], result['cookie'], result['id']))
        self.connect.commit()


if __name__ == '__main__':
    while True:
        try:
            suma = SuMa()
            rc = RClient('tosshl1985', 'sa123456', '106174',
                         '4a5d4d1e0d334a5b8f59e1ad94ada2cc')
            zhima = ZhiMa()
            mobile = suma.mobile
            session = zhima.getPic()
            vcode = rc.run()
            code = zhima.getVocde(mobile, vcode, session)
            if code == '1':
                print('ok')
                phone_code = suma.getVcodeAndHoldMobilenum()
                if phone_code:
                    zhima.login(mobile, vcode, phone_code, session)
                else:
                    suma.addIgnoreList(mobile)
        except Exception as e:
            print(e)
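# For reference, a hypothetical schema for the table insert_mysql writes to.
# Only the column names come from the INSERT above; the types are assumptions.
#
#   CREATE TABLE some_account_table (   -- actual name comes from TABLE
#       username VARCHAR(64),
#       password VARCHAR(128),
#       mobile   VARCHAR(20),
#       api      VARCHAR(255),
#       cookie   TEXT,
#       id       VARCHAR(64)
#   );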