class BaseSpider(Spider):
    name = 'basespider'

    def __init__(self, *a, **kw):
        super(BaseSpider, self).__init__(*a, **kw)

        self.urls = []
        self.headers = {}
        self.timeout = 10
        self.sql = SqlHelper()
        self.dir_log = 'log/proxy/%s' % self.name
        self.is_record_web_page = False

    def init(self):
        self.meta = {
            'download_timeout': self.timeout,
        }

        utils.make_dir(self.dir_log)

        command = utils.get_create_table_command(config.free_ipproxy_table)
        self.sql.execute(command)

    def start_requests(self):
        for i, url in enumerate(self.urls):
            yield Request(
                url=url,
                headers=self.headers,
                meta=self.meta,
                dont_filter=True,
                callback=self.parse_page,
                errback=self.error_parse,
            )

    def parse_page(self, response):
        # Subclasses override this to extract proxies; the base class only
        # optionally records the raw page.
        self.write(response.body)

    def error_parse(self, failure):
        request = failure.request

    def add_proxy(self, proxy):
        utils.sql_insert_proxy(self.sql, config.free_ipproxy_table, proxy)

    def write(self, data):
        if self.is_record_web_page:
            filename = '%s/%s.html' % (
                self.dir_log,
                datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S:%f'))
            with open(filename, 'w') as f:
                f.write(data)

    def close(spider, reason):
        # Scrapy passes the spider instance as the first argument here,
        # so `spider` plays the role of `self`.
        spider.sql.commit()
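# A concrete crawler only needs to fill in self.urls/self.headers and override
# parse_page() to call add_proxy(). A minimal sketch, assuming a hypothetical
# target site, its page layout, and the proxy dict fields expected by
# utils.sql_insert_proxy (ip/port/country/anonymity/https/speed/source):
class ExampleProxySpider(BaseSpider):
    name = 'examplespider'  # hypothetical spider, for illustration only

    def __init__(self, *a, **kw):
        super(ExampleProxySpider, self).__init__(*a, **kw)
        self.urls = ['http://www.example.com/free-proxy-list']  # assumed URL
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        self.init()

    def parse_page(self, response):
        self.write(response.body)
        for row in response.xpath('//table//tr')[1:]:  # assumed page layout
            proxy = {
                'ip': row.xpath('td[1]/text()').extract_first(),
                'port': row.xpath('td[2]/text()').extract_first(),
                'country': row.xpath('td[3]/text()').extract_first(),
                'anonymity': None,
                'https': 'no',
                'speed': 0,
                'source': self.name,
            }
            self.add_proxy(proxy)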
def GET(self):
    try:
        sql = SqlHelper()
        inputs = web.input()
        name = inputs.get('name')
        ip = inputs.get('ip')

        # Delete the proxy, then confirm it can no longer be selected.
        command = "DELETE FROM {0} WHERE ip='{1}'".format(name, ip)
        sql.execute(command)

        command = "SELECT ip FROM {0} WHERE ip='{1}'".format(name, ip)
        res = sql.query_one(command)
        return res is None
    except:
        pass

    return False
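# The commands above interpolate request parameters straight into the SQL text.
# A sketch of a parameterized variant, written against a raw pymysql connection
# because SqlHelper's interface is not shown here (whether it accepts bound
# parameters is an assumption); the table name still has to be whitelisted,
# since identifiers cannot be bound as parameters:
import pymysql

ALLOWED_TABLES = {'free_ipproxy', 'httpbin'}  # assumed table names

def delete_proxy(connection, table, ip):
    if table not in ALLOWED_TABLES:
        return False
    with connection.cursor() as cursor:
        cursor.execute("DELETE FROM {0} WHERE ip=%s".format(table), (ip,))
        cursor.execute("SELECT ip FROM {0} WHERE ip=%s".format(table), (ip,))
        deleted = cursor.fetchone() is None
    connection.commit()
    return deleted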
class SendSms(object):
    def __init__(self):
        self.sql = SqlHelper()
        self.weather_table_name = config.weather_table
        self.user_table_name = config.user_table

    def send_sms(self):
        command = "SELECT * FROM {};".format(self.user_table_name)
        self.sql.execute(command)
        users = self.sql.cursor.fetchall()

        if users is not None:
            for user in users:
                utils.log('send_sms get user info user:%s' % str(user))

                # Only send the SMS when the user-defined time matches the
                # current system time.
                user_time = user[5]
                time_info = user_time.split(':')
                u_hour = time_info[0]
                u_minute = time_info[1]

                # Current system time.
                s_hour = datetime.datetime.now().hour
                s_minute = datetime.datetime.now().minute

                if int(u_hour) == s_hour and int(u_minute) == s_minute:
                    utils.log('send sms to user:%s' % str(user))

                    command = ("SELECT * FROM {0} WHERE city_name='{1}' "
                               "ORDER BY id DESC LIMIT 1;"
                               .format(self.weather_table_name, user[3]))
                    self.sql.execute(command)
                    weather = self.sql.cursor.fetchone()

                    if weather is not None:
                        temp_code = 'SMS_41855112'
                        phone = user[2]
                        info = {
                            'name': user[1],
                            'city': user[3],
                            'weather': weather[15],
                            'temp': '%s ~ %s' % (weather[9], weather[8]),
                            'aqilevel': utils.get_aqi_level_info(weather[12]),
                        }

                        sms = AliyunSms()
                        sms.send_sms(temp_code, info, phone)
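# send_sms() only fires when the user's configured hour and minute exactly
# match the current time, so whatever drives it has to call it roughly once a
# minute. A minimal scheduling sketch (the loop and the 60-second interval are
# assumptions, not part of the original code):
import time

def run_sms_loop():
    sender = SendSms()
    while True:
        sender.send_sms()
        time.sleep(60)  # poll once per minute so no configured time is skipped

if __name__ == '__main__':
    run_sms_loop()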
class Validator(Spider):
    name = 'base'

    concurrent_requests = 16
    retry_enabled = False

    def __init__(self, name=None, **kwargs):
        super(Validator, self).__init__(name, **kwargs)

        self.sql = SqlHelper()
        self.dir_log = 'log/validator/%s' % self.name
        self.timeout = 10
        self.urls = []
        self.headers = None
        self.success_mark = ''
        self.is_record_web_page = False

    def init(self):
        utils.make_dir(self.dir_log)

        command = utils.get_create_table_command(self.name)
        self.sql.create_table(command)

    @classmethod
    def update_settings(cls, settings):
        settings.setdict(cls.custom_settings or {
            'CONCURRENT_REQUESTS': cls.concurrent_requests,
            'RETRY_ENABLED': cls.retry_enabled,
        }, priority='spider')

    def start_requests(self):
        count = utils.get_table_length(self.sql, self.name)
        count_free = utils.get_table_length(self.sql, config.httpbin_table)

        ids = utils.get_table_ids(self.sql, self.name)
        ids_free = utils.get_table_ids(self.sql, config.httpbin_table)

        for i in range(0, count + count_free):
            table = self.name if (i < count) else config.httpbin_table
            id = ids[i] if i < count else ids_free[i - len(ids)]

            proxy = utils.get_proxy_info(self.sql, table, id)
            if proxy is None:
                continue

            for url in self.urls:
                cur_time = time.time()
                yield Request(
                    url=url,
                    headers=self.headers,
                    meta={
                        'cur_time': cur_time,
                        'download_timeout': self.timeout,
                        'proxy_info': proxy,
                        'table': table,
                        'id': proxy.get('id'),
                        'proxy': 'http://%s:%s' % (proxy.get('ip'), proxy.get('port')),
                        'vali_count': proxy.get('vali_count', 0),
                    },
                    dont_filter=True,
                    callback=self.success_parse,
                    errback=self.error_parse,
                )

    def success_parse(self, response):
        utils.log('success_parse speed:%s meta:%s' %
                  (time.time() - response.meta.get('cur_time'), response.meta))

        proxy = response.meta.get('proxy_info')
        table = response.meta.get('table')
        id = response.meta.get('id')
        ip = proxy.get('ip')

        self.save_page(ip, response.body)

        # An empty success_mark means any response counts as success.
        if self.success_mark == '' or self.success_mark in response.body:
            speed = time.time() - response.meta.get('cur_time')
            if table == self.name:
                if speed > self.timeout:
                    command = utils.get_delete_data_command(table, id)
                    self.sql.execute(command)
                else:
                    vali_count = response.meta.get('vali_count', 0) + 1
                    command = utils.get_update_data_command(table, id, speed, vali_count)
                    self.sql.execute(command)
            else:
                if speed < self.timeout:
                    command = utils.get_insert_data_command(self.name)
                    msg = (None, proxy.get('ip'), proxy.get('port'), proxy.get('country'),
                           proxy.get('anonymity'), proxy.get('https'), speed,
                           proxy.get('source'), None, 1)
                    self.sql.insert_data(command, msg, commit=True)
        else:
            # The success mark was not found, so the returned content is bogus;
            # delete the proxy from the current table.
            if table == self.name:
                command = utils.get_delete_data_command(table, id)
                self.sql.execute(command)

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse value:%s url:%s meta:%s' %
                  (failure.value, request.url, request.meta))

        proxy = failure.request.meta.get('proxy_info')
        table = failure.request.meta.get('table')
        id = failure.request.meta.get('id')

        if table == self.name:
            command = utils.get_delete_data_command(table, id)
            self.sql.execute(command)
        else:
            # TODO: when validation fails, handle the specific error type
            # (HttpError, DNSLookupError, TimeoutError, ...) instead of ignoring it.
            pass

            # request = failure.request.meta
            # utils.log('request meta:%s' % str(request))
            #
            # # Log all errback failures; to do something special for some
            # # errors you may need the failure's type.
            # self.logger.error(repr(failure))
            #
            # if failure.check(HttpError):
            #     # you can get the response
            #     response = failure.value.response
            #     self.logger.error('HttpError on %s', response.url)
            # elif failure.check(DNSLookupError):
            #     # this is the original request
            #     request = failure.request
            #     self.logger.error('DNSLookupError on %s', request.url)
            # elif failure.check(TimeoutError):
            #     request = failure.request
            #     self.logger.error('TimeoutError on url:%s', request.url)

    def save_page(self, ip, data):
        filename = '{time} {ip}'.format(
            time=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S:%f'),
            ip=ip)
        utils.log('filename:%s' % filename)

        if self.is_record_web_page:
            with open('%s/%s.html' % (self.dir_log, filename), 'w') as f:
                f.write(data)

    def close(spider, reason):
        spider.sql.commit()
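# A concrete validator only has to fill in urls, headers and success_mark and
# call init(). A minimal sketch of such a subclass (the target URL and the
# success mark are illustrative assumptions, not part of the project):
class ExampleValidator(Validator):
    name = 'example'  # also used as the table that stores validated proxies

    def __init__(self, name=None, **kwargs):
        super(ExampleValidator, self).__init__(name, **kwargs)
        self.urls = ['http://www.example.com/']   # assumed target URL
        self.headers = {'Accept': 'text/html'}
        self.success_mark = 'Example Domain'      # substring expected in a good page
        self.init()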
def runspider(request):
    data = {
        'status': 'failure',
        'guid': '0',
        'info': '',
    }

    try:
        # The production environment uses a POST request.
        url = request.POST.get('url')
        force = request.POST.get('force', 'false')

        pattern = re.compile(r'\d+', re.S)
        product_id = re.search(pattern, url).group()

        sql = SqlHelper()
        utils.log('product_id:%s' % product_id)

        if 'item.jd.com' in url and product_id is not None:
            data['status'] = 'success'
            data['guid'] = str(uuid.uuid4())
            data['info'] = '成功接收数据,正在为您抓取并分析数据,精彩稍候呈现'

            command = "SELECT id FROM {table} WHERE id={product_id}". \
                format(table=config.jd_item_table, product_id=product_id)
            result = sql.query_one(command)

            if result is None:
                name = 'jd'
                cmd = 'cd {dir};python manage.py real_time_analysis -a name={name} -a guid={guid} ' \
                      '-a product_id={product_id} -a url={url};'. \
                    format(url=str(url), name=name, dir=settings.BASE_DIR,
                           guid=data.get('guid'), product_id=product_id)
                subprocess.Popen(cmd, shell=True)
            else:
                if force == 'false':
                    utils.log('data already in the database, loading the analysis result from it')
                    command = "SELECT * FROM {0} WHERE product_id={1} ORDER BY id". \
                        format(config.analysis_item_table, product_id)
                    result = sql.query(command)

                    for res in result:
                        utils.push_redis(data.get('guid'), res[1], res[2], res[3],
                                         save_to_mysql=False)
                else:
                    # NOTE: the original column name 'produce_id' looked like a typo;
                    # the SELECT above filters the same table on product_id.
                    command = "DELETE FROM {0} WHERE product_id={1}".format(
                        config.analysis_item_table, product_id)
                    sql.execute(command)

                    # Re-run the analysis.
                    cmd = 'cd {dir};python manage.py analysis -a url={url} -a name={name} -a guid={guid} -a ' \
                          'product_id={product_id};'. \
                        format(url=url, name='jd', dir=settings.BASE_DIR,
                               guid=data.get('guid'), product_id=product_id)
                    subprocess.Popen(cmd, shell=True)
        else:
            data['info'] = '传入网址有误,请检查后重新输入,请输入以下格式的网址:\n%s' % \
                           'https://item.jd.com/3995645.html'
    except Exception as e:
        logging.error('run spider exception:%s' % e)
        data['info'] = '出现错误,错误原因:%s' % e
sql = SqlHelper()

spiders = [
    XiCiDaiLiSpider,
    SixSixIpSpider,
    IpOneEightOneSpider,
    KuaiDaiLiSpider,  # the site added a JS check before access (anti-crawling)
    GatherproxySpider,
    HidemySpider,
    ProxylistplusSpider,
    FreeProxyListsSpider,
    # PeulandSpider,  # target site is no longer available
    UsProxySpider,
    ProxyDBSpider,
    ProxyRoxSpider,
]

while True:
    utils.log('*******************run spider start...*******************')

    # Drop proxies saved more than 0.2 day (about 4.8 hours) ago.
    command = "DELETE FROM {table} WHERE save_time < SUBDATE(NOW(), INTERVAL 0.2 DAY)".format(
        table=config.free_ipproxy_table)
    sql.execute(command)

    for spider in spiders:
        scrapydo.run_spider(spider)

    utils.log('*******************run spider waiting...*******************')
    time.sleep(1200)
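# scrapydo.run_spider() relies on scrapydo's crochet-based reactor having been
# initialised, so a scrapydo.setup() call has to run once before the loop above.
# A minimal sketch of the presumed surrounding boilerplate (the import paths for
# the project's own modules are assumptions):
import time
import scrapydo

import utils
import config
from sqlhelper import SqlHelper  # assumed module path

scrapydo.setup()  # install the reactor once, before the first run_spider() call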
class Validator(Spider):
    name = 'base'

    def __init__(self, name=None, **kwargs):
        super(Validator, self).__init__(name, **kwargs)

        self.sql = SqlHelper()
        self.dir_log = 'log/validator/%s' % self.name
        self.timeout = 10
        self.urls = []
        self.headers = None
        self.success_mark = ''

    def init(self):
        utils.make_dir(self.dir_log)

        command = utils.get_create_table_command(self.name)
        self.sql.create_table(command)

    def start_requests(self):
        count = utils.get_table_length(self.sql, self.name)
        count_free = utils.get_table_length(self.sql, free_ipproxy_table)

        for i in range(0, count + count_free):
            table = self.name if (i < count) else free_ipproxy_table

            proxy = utils.get_proxy_info(self.sql, table, i)
            if proxy is None:
                continue

            for url in self.urls:
                cur_time = time.time()
                yield Request(
                    url=url,
                    headers=self.headers,
                    meta={
                        'cur_time': cur_time,
                        'download_timeout': self.timeout,
                        'proxy_info': proxy,
                        'table': table,
                        'id': proxy.get('id'),
                        'proxy': 'http://%s:%s' % (proxy.get('ip'), proxy.get('port')),
                    },
                    dont_filter=True,
                    callback=self.success_parse,
                    errback=self.error_parse,
                )

    def success_parse(self, response):
        utils.log('name:%s success_parse proxy:%s meta:%s' %
                  (self.name, str(response.meta.get('proxy_info')), str(response.meta)))

        filename = datetime.datetime.now().strftime('%Y-%m-%d %H_%M_%S_%f')
        self.save_page(filename, response.body)

        # str.find() returns -1 when the mark is missing, so compare against -1
        # instead of relying on its truthiness; an empty mark always succeeds.
        if self.success_mark == '' or response.body.find(self.success_mark) != -1:
            proxy = response.meta.get('proxy_info')
            speed = time.time() - response.meta.get('cur_time')
            table = response.meta.get('table')
            id = response.meta.get('id')
            utils.log('speed:%s table:%s id:%s' % (speed, table, id))

            if table == self.name:
                if speed > self.timeout:
                    command = utils.get_delete_data_command(table, id)
                    self.sql.execute(command)
                else:
                    command = utils.get_update_data_command(table, id, speed)
                    self.sql.execute(command)
            else:
                if speed < self.timeout:
                    command = utils.get_insert_data_command(self.name)
                    msg = (None, proxy.get('ip'), proxy.get('port'), proxy.get('country'),
                           proxy.get('anonymity'), proxy.get('https'), speed,
                           proxy.get('source'), None)
                    self.sql.insert_data(command, msg)

    def error_parse(self, failure):
        utils.log('error_parse value:%s' % failure.value)

        proxy = failure.request.meta.get('proxy_info')
        table = failure.request.meta.get('table')
        id = failure.request.meta.get('id')

        if table == self.name:
            command = utils.get_delete_data_command(table, id)
            self.sql.execute(command)
        else:
            # TODO: when validation fails, handle the specific error type
            # (HttpError, DNSLookupError, TimeoutError, ...) instead of ignoring it.
            pass

            # request = failure.request.meta
            # utils.log('request meta:%s' % str(request))
            #
            # # Log all errback failures; to do something special for some
            # # errors you may need the failure's type.
            # self.logger.error(repr(failure))
            #
            # if failure.check(HttpError):
            #     # you can get the response
            #     response = failure.value.response
            #     self.logger.error('HttpError on %s', response.url)
            # elif failure.check(DNSLookupError):
            #     # this is the original request
            #     request = failure.request
            #     self.logger.error('DNSLookupError on %s', request.url)
            # elif failure.check(TimeoutError):
            #     request = failure.request
            #     self.logger.error('TimeoutError on url:%s', request.url)

    def save_page(self, filename, data):
        if get_project_settings().get('IS_RECODE_HTML', False):
            with open('%s/%s.html' % (self.dir_log, filename), 'w') as f:
                f.write(data)
def main(n):
    _tc = TimeCnt()
    _tc.cnt_time()

    # Alternative data sources kept for reference:
    # shandong_ = SqlHelper(Config_map_shandong)
    # guangxi_ = SqlHelper(Config_guangxi)
    # storm_shandong_ = SqlHelper(Configstormshandong)
    # storm_110_ = SqlHelper(Configooo)
    # The data could also come from MongoDB via load_mysqL_from_localcpk.load_mongodb_conn().

    _tc.cnt_time()
    today = todayStr()

    mysql_bond_risk_ = SqlHelper(Config_bond_risk)

    # Labels and company names from records at least n days old.
    label_120_ = mysql_bond_risk_.execute(
        "select label from middleTable where (to_days(now()) - to_days(date) >= %d);" % n)
    compname_120_ = mysql_bond_risk_.execute(
        "select compname from middleTable where (to_days(now()) - to_days(date) >= %d);" % n)
    # Earlier variant restricted to a 720-day input window:
    # label_120_ = mysql_bond_risk_.execute("select label from middleTable where "
    #     "(to_days(now()) - to_days(date_input) <= 720 and to_days(now()) - to_days(date) <= %d);" % n)
    # label_120_, compname_120_ = get_label_time_window(bond_risk_, n, n - 120)

    # Deduplicate labels and company names.
    label_lst_ = list(set(label_120_))
    _tc.cnt_time()
    compname_lst_ = list(set(compname_120_))

    _dic = get_label_120(mysql_bond_risk_, compname_lst_, label_lst_, n)
    _tc.cnt_time()

    _panel = pd.Panel(_dic)
    _panel = _panel.fillna(0.0)
    # filter_data_by_time(bond_risk_, today)
    _tc.cnt_time()

    df_4_model = pd.DataFrame(index=compname_lst_, columns=model_labels + model_labels_2)
    df_4_model = df_4_model.fillna(0.0)
    df_4_model = df_4_model.astype(np.float64)

    _index = list(df_4_model.index)
    _columns = model_labels
    _columns_2 = model_labels_2

    _cnt = 0
    for i in _index:
        _cnt += 1
        # Progress print (disabled):
        # if _cnt % 100 == 1:
        #     print(">>>> handle the", i, _cnt)
        # if _cnt > 300:
        #     break

        for c in _columns:
            df_4_model.loc[i, c] = cell_fill(_panel, i, c)

        df_4_model.loc[i, "企业名称"] = i
        df_4_model.loc[i, "发布日期"] = datetime.datetime.now()
        df_4_model.loc[i, "credit_recent"] = 0
        df_4_model.loc[i, "credit_ago"] = 0
        df_4_model.loc[i, "credit_trend"] = 0
        df_4_model.loc[i, "60"] = _panel[i].loc[60, :].sum()
        df_4_model.loc[i, "120"] = _panel[i].loc[120, :].sum()
        df_4_model.loc[i, "180"] = _panel[i].loc[180, :].sum()

        # Keyword counts for every risk category over the 60/120/180-day windows
        # (behaviour-preserving loop over the columns that were listed one by one).
        for risk in ["债券风险", "个人风险", "财务风险", "经营风险", "行业风险", "企业风险"]:
            for window in ["60", "120", "180"]:
                column = risk + window
                df_4_model.loc[i, column] = group_cnt_key_word(column, i, _panel)

        # df_4_model = df_4_model.applymap(lambda x: np.NaN if x == 0 else x)
        df_4_model.loc[i, "sub120_60"] = df_4_model.loc[i, "120"] - df_4_model.loc[i, "60"]
        df_4_model.loc[i, "sub180_120"] = df_4_model.loc[i, "180"] - df_4_model.loc[i, "120"]

    _x = df_4_model.drop(["企业名称", "发布日期", "Label"], 1)

    _z = pd.read_csv("/home/siyuan/bond_risk/_z.csv").drop(["Unnamed: 0", "发布日期", "Label"], 1)
    # _z.index = _z["企业名称"]
    _z = _z.drop("企业名称", axis=1)
    _x.columns = list(_z.columns)

    # Filter: keep companies with activity in the 120-day window.
    # Earlier variants filtered on sub120_60 > 0 or "60" > 0.
    _x = _x[_x["120"] > 0]
    train_separator = len(_x.index)

    _pred_data = pd.concat([_x, _z], axis=0)
    _pred_data = set_dummy(_pred_data, False)

    # Predict labels with the pre-trained xgboost model.
    bst = xgb.Booster()
    bst.load_model("/home/siyuan/data/xgb.model")

    result_ = predict(bst, _pred_data, _pred_data.iloc[1])
    dict_ = dict(zip(list(_pred_data.index), result_))
    dict_res = dict(zip(list(_pred_data.index)[:train_separator], result_[:train_separator]))

    # Write the predictions back to MySQL.
    cnt = 0
    for i in dict_res.keys():
        sql_ = "INSERT INTO resultTable VALUES('', '%s', CURTIME(), '%s');" % (
            i, str(format(dict_res[i], '.9e')))
        sql_res_ = mysql_bond_risk_.execute(sql_)
        cnt += 1

    # pdb.set_trace()  # debugging breakpoint, left disabled
    mysql_bond_risk_.connect.commit()
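# The predict() helper used above is not shown in this listing. A plausible
# sketch of what it might do, under the assumption that it simply wraps the
# feature DataFrame in an xgboost DMatrix and returns the booster's raw
# scores (the role of its third argument is unknown and omitted here):
import xgboost as xgb

def predict_sketch(booster, data_frame):
    # Hypothetical stand-in for the project's predict() helper.
    dmat = xgb.DMatrix(data_frame.values, feature_names=list(data_frame.columns))
    return booster.predict(dmat)  # one score per row, later formatted with '.9e'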
def runspider(request):
    data = {
        'status': 'failure',
        'guid': '0',
        'info': '',
    }

    try:
        # The production environment uses a POST request.
        url = request.POST.get('url')
        force = request.POST.get('force', 'false')

        pattern = re.compile('user-rate-')
        urls = re.split(pattern, url)
        user_id = urls[1]

        pattern = re.compile(r'\w+', re.S)
        user_id = re.search(pattern, user_id).group()

        sql = SqlHelper()
        utils.log('user_id:%s' % user_id)

        if 'rate.taobao.com' in url and user_id is not None:
            data['status'] = 'success'
            data['guid'] = str(random.randint(1000000000000, 9999999999999)) + '_' + \
                           str(random.randint(100, 999))
            data['info'] = '成功接收数据,正在为您抓取并分析数据,精彩稍候呈现'

            command = "SELECT id FROM {table} WHERE id={user_id}". \
                format(table=config.tb_item_table, user_id=user_id)
            result = sql.query_one(command)

            if result is None:
                name = 'tb_comment'
                cmd = 'python manage.py real_time_analysis -a name={name} -a guid={guid} ' \
                      '-a user_id={user_id} -a url={url};'. \
                    format(url=str(url), name=name, dir=settings.BASE_DIR,
                           guid=data.get('guid'), user_id=user_id)
                logging.warn(cmd)
                subprocess.Popen(cmd, shell=True)
            else:
                if force == 'false':
                    utils.log('data already in the database, loading the analysis result from it')
                    command = "SELECT * FROM {0} WHERE user_id={1} ORDER BY id". \
                        format(config.analysis_item_table, user_id)
                    result = sql.query(command)

                    for res in result:
                        utils.push_redis(data.get('guid'), res[1], res[2], res[3],
                                         save_to_mysql=False)
                else:
                    # NOTE: the original column name 'produce_id' looked like a typo;
                    # the SELECT above filters the same table on user_id.
                    command = "DELETE FROM {0} WHERE user_id={1}".format(
                        config.analysis_item_table, user_id)
                    sql.execute(command)

                    # Re-run the analysis.
                    cmd = 'cd {dir};python manage.py analysis -a url={url} -a name={name} -a guid={guid} -a ' \
                          'user_id={user_id};'. \
                        format(url=url, name='tb', dir=settings.BASE_DIR,
                               guid=data.get('guid'), user_id=user_id)
                    subprocess.Popen(cmd, shell=True)
        else:
            data['info'] = '传入网址有误,请检查后重新输入,请输入以下格式的网址:\n%s' % \
                           'https://rate.taobao.com/user-rate-UvGv0MFc0vFILvgTT.htm'
    except Exception as e:
        logging.error('run spider exception:%s' % e)
        data['info'] = '出现错误,错误原因:%s' % e
class GameInfo(CrawlSpider):
    name = 'game_info'

    def __init__(self, *a, **kw):
        super(GameInfo, self).__init__(*a, **kw)

        self.dir_game = 'log/%s' % self.name
        self.sql = SqlHelper()
        self.init()

        utils.make_dir(self.dir_game)
        self.error_count = 0

    def init(self):
        command = ("CREATE TABLE IF NOT EXISTS {} ("
                   "`id` INT(8) NOT NULL AUTO_INCREMENT,"
                   "`name` TEXT NOT NULL,"
                   "`price` INT(5) NOT NULL,"
                   "`metacritic_score` FLOAT DEFAULT NULL,"
                   "`user_reviews_count` INT(6) NOT NULL,"
                   "`positive_user_reviews_count` INT(6) NOT NULL,"
                   "`positive_percent` FLOAT NOT NULL,"
                   "`negative_user_reviews_count` INT(6) NOT NULL,"
                   "`steam_user_reviews_count` INT(6) NOT NULL,"
                   "`non_steam_user_reviews_count` INT(6) NOT NULL,"
                   "`english_user_reviews_count` INT(6) NOT NULL,"
                   "`non_english_user_reviews_count` INT(6) NOT NULL,"
                   "`tag_list` TEXT DEFAULT NULL,"
                   "`achievements_count` INT(4) DEFAULT NULL,"
                   "`category` TEXT NOT NULL,"
                   "`genre` TEXT NOT NULL,"
                   "`developer` TEXT NOT NULL,"
                   "`publisher` TEXT NOT NULL,"
                   "`release_date` TEXT NOT NULL,"
                   "`url` TEXT NOT NULL,"
                   "`language_number` INT(3) DEFAULT NULL,"
                   "`description` TEXT DEFAULT NULL,"
                   "`save_time` TIMESTAMP NOT NULL,"
                   "PRIMARY KEY(id)"
                   ") ENGINE=InnoDB".format(config.steam_game_info_table))
        self.sql.create_table(command)

    def start_requests(self):
        command = "SELECT * FROM {} WHERE is_crawled='no' AND type='app'".format(
            config.steam_game_urls_table)
        data = self.sql.query(command)

        for i, item in enumerate(data):
            yield Request(
                url=item[3],
                dont_filter=True,
                method='GET',
                headers={
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'Accept-Encoding': 'gzip, deflate',
                    'Accept-Language': 'en-US,en;q=0.5',
                    'Connection': 'keep-alive',
                    'Host': 'store.steampowered.com',
                    'Upgrade-Insecure-Requests': '1',
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) '
                                  'Gecko/20100101 Firefox/51.0',
                },
                meta={
                    'item': item,
                    'id': item[0],
                },
                cookies={
                    'mature_content': '1',
                },
                callback=self.parse_game,
                errback=self.error_parse,
            )

    def parse_game(self, response):
        self.log('parse_game url:%s' % response.url)

        id = response.meta.get('id')

        # file_name = '%s/%s.html' % (self.dir_game, id)
        # self.save_page(file_name, response.body)

        if u'Please enter your birth date to continue' in response.body:
            self.log('Please enter your birth date to continue meta:%s' % response.meta)
            url = 'http://store.steampowered.com/agecheck/app/%s/' % str(id)
            # Submit arbitrary birth-date values to get past the age gate.
            return FormRequest(
                url=url,
                dont_filter=True,
                method='POST',
                formdata={
                    'ageDay': str(range(1, 25)),
                    'ageMonth': 'January',
                    'ageYear': str(range(1980, 1995)),
                    'snr': '1_agecheck_agecheck__age-gate',
                },
                callback=self.parse_game)

        soup = BeautifulSoup(response.body, 'lxml')
        sel = Selector(text=response.body)

        name = sel.xpath('//div[@class="apphub_AppName"]/text()').extract_first()
        if name == '' or name is None:
            self.log('no get data meta:%s' % response.meta)
            return

        price = sel.xpath('//div[@class="game_purchase_price price"]/text()').extract_first()
        try:
            p = price.split('¥')
            price = int(p[1])
        except:
            price = -1

        # Metacritic score of the game.
        metacritic_score = sel.xpath('//div[@class="score high"]/text()').extract_first()
        try:
            metacritic_score = int(metacritic_score)
        except:
            metacritic_score = -1

        # Total number of user reviews.
        user_reviews_count = sel.xpath(
            '//label[@for="review_type_all"]/span/text()').extract_first()
        user_reviews_count = self.count_to_int(user_reviews_count)

        # Number of positive user reviews.
        positive_user_reviews_count = sel.xpath(
            '//label[@for="review_type_positive"]/span/text()').extract_first()
        positive_user_reviews_count = self.count_to_int(positive_user_reviews_count)

        # Percentage of positive reviews (guard against a zero review count).
        if user_reviews_count > 0 and positive_user_reviews_count != -1:
            positive_percent = positive_user_reviews_count * 1.0 / user_reviews_count * 100
        else:
            positive_percent = 0

        # Number of negative user reviews.
        negative_user_reviews_count = sel.xpath(
            '//label[@for="review_type_negative"]/span/text()').extract_first()
        negative_user_reviews_count = self.count_to_int(negative_user_reviews_count)

        # Reviews from users who bought the game on Steam.
        steam_user_reviews_count = sel.xpath(
            '//label[@for="purchase_type_steam"]/span/text()').extract_first()
        steam_user_reviews_count = self.count_to_int(steam_user_reviews_count)

        # Reviews from users who bought the game elsewhere.
        non_steam_user_reviews_count = sel.xpath(
            '//label[@for="purchase_type_non_steam"]/span/text()').extract_first()
        non_steam_user_reviews_count = self.count_to_int(non_steam_user_reviews_count)

        # Number of English reviews.
        english_user_reviews_count = sel.xpath(
            '//label[@for="review_language_mine"]/span/text()').extract_first()
        english_user_reviews_count = self.count_to_int(english_user_reviews_count)

        # Number of non-English reviews.
        non_english_user_reviews_count = user_reviews_count - english_user_reviews_count

        # Tag list of the game.
        try:
            tags = soup.find(attrs={'class': 'glance_tags popular_tags'})
            tag_list = tags.text.replace('\t', '')
            tag_list = tag_list.replace('\n', ',')
        except:
            tag_list = ''

        # Number of achievements.
        achievements = sel.xpath('//div[@id="achievement_block"]/div/text()').extract_first()
        try:
            achievements_count = re.search('\d+', achievements, re.S).group(0)
            achievements_count = int(achievements_count)
        except:
            achievements_count = 0

        # Category breadcrumb, e.g. All Games > Action Games > Counter-Strike.
        try:
            category = soup.find(name='div', attrs={'class': 'breadcrumbs'}).text
            category = category.replace('\t', '')
            category = category.replace('\n', '')
        except:
            category = ''

        # Genre.
        genre = sel.xpath('//div[@class="block_content"]/div/div/a/text()').extract_first()

        # Developer.
        developer = sel.xpath('//div[@class="block_content"]/div/div/a[2]/text()').extract_first()

        # Publisher.
        publisher = sel.xpath('//div[@class="block_content"]/div/div/a[3]/text()').extract_first()

        # Release date.
        release_date = sel.xpath('//div[@class="release_date"]/span/text()').extract_first()

        # Number of supported languages.
        language_number = len(
            sel.xpath('//table[@class="game_language_options"]/tr').extract()) - 1

        # Game description.
        description = sel.xpath(
            '//div[@class="game_description_snippet"]/text()').extract_first()

        # Crawl time (left to the database default).
        save_time = None

        msg = (id, name, price, response.url, metacritic_score, user_reviews_count,
               positive_user_reviews_count, positive_percent, negative_user_reviews_count,
               steam_user_reviews_count, non_steam_user_reviews_count,
               english_user_reviews_count, non_english_user_reviews_count, tag_list,
               achievements_count, category, genre, developer, publisher, release_date,
               language_number, description, save_time)

        command = ("INSERT IGNORE INTO {} "
                   "(id, name, price, url, metacritic_score, user_reviews_count, "
                   "positive_user_reviews_count, positive_percent, negative_user_reviews_count, "
                   "steam_user_reviews_count, non_steam_user_reviews_count, "
                   "english_user_reviews_count, non_english_user_reviews_count, tag_list, "
                   "achievements_count, category, genre, developer, publisher, release_date, "
                   "language_number, description, save_time)"
                   "VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, "
                   "%s, %s, %s, %s, %s, %s)".format(config.steam_game_info_table))
        self.sql.insert_data(command, msg)

        command = "UPDATE {0} SET is_crawled='yes' WHERE id='{1}'".format(
            config.steam_game_urls_table, id)
        self.sql.execute(command)

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse url:%s meta:%s' % (request.url, request.meta))

    def get_id(self, url):
        if '/sub/' in url:
            pattern = re.compile('/sub/(\d+)/')
            type = 'sub'
        elif '/app/' in url:
            pattern = re.compile('/app/(\d+)/', re.S)
            type = 'app'
        elif '/bundle/' in url:
            pattern = re.compile('/bundle/(\d+)/', re.S)
            type = 'bundle'
        else:
            pattern = re.compile('/(\d+)/', re.S)
            type = 'other'
            utils.log('get_id other url:%s' % url)

        id = re.search(pattern, url)
        if id:
            id = id.group(1)
            return id

        # No id could be extracted; hand out a unique negative placeholder.
        self.error_count = self.error_count + 1
        utils.log('get_id error url:%s' % url)
        return -self.error_count

    def count_to_int(self, data):
        # Convert review counts such as '(1,234)' to an int; -1 means "unknown".
        try:
            ret = data
            ret = ret.replace('(', '')
            ret = ret.replace(')', '')
            ret = ret.replace(',', '')
            return int(ret)
        except:
            return -1

    def save_page(self, file_name, data):
        with open(file_name, 'w') as f:
            f.write(data)