def crawl(): log.info("Force crawling? {}".format(config.FORCE_CRAWL)) csgo_items = [] raw_categories = csgo_all_categories() categories = final_categories(raw_categories) # crawl by categories and price section if len(raw_categories) != len(categories): total_category = len(categories) for index, category in enumerate(categories, start=1): t = asyncio.run(crawl_goods_by_price_section(category)) if t is None: break else: csgo_items.extend(t) log.info('GET category {}/{} for ({}).'.format( index, total_category, category)) else: # crawl by price section without category csgo_items.extend( asyncio.run(crawl_goods_by_price_section(None)) or []) return csgo_items
def crawl_item_history_price(index, item, total_price_number):
    history_prices = []
    steam_price_url = steam_price_history_url(item)
    log.info('GET steam history price {}/{} for ({}): {}'.format(
        index, total_price_number, item.name, steam_price_url))
    steam_history_prices = get_json_dict(steam_price_url, steam_cookies, True)
    # key existence check
    if (steam_history_prices is not None) and ('prices' in steam_history_prices):
        raw_price_history = steam_history_prices['prices']
        if len(raw_price_history) > 0:
            days = min((datetime.today().date() - datetime.strptime(
                raw_price_history[0][0], '%b %d %Y %H: +0').date()).days, 7)
        else:
            days = 0
        for pair in reversed(raw_price_history):
            if len(pair) == 3:
                for i in range(0, int(pair[2])):
                    history_prices.append(float(pair[1]))
            if (datetime.today().date() - datetime.strptime(
                    pair[0], '%b %d %Y %H: +0').date()).days > days:
                break
        # set history price if exist
        if len(history_prices) != 0:
            item.set_history_prices(history_prices, days)
            log.info(
                'totally {} pieces of price history in {} days for {}\n'.format(
                    len(history_prices), days, item.name))
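# A quick, self-contained illustration of the timestamp handling above. The sample
# entry is made up for the example, but it follows the '%b %d %Y %H: +0' format and
# the [timestamp, median_price, volume] shape that the function parses.
#
# from datetime import datetime
#
# pair = ["Apr 25 2020 01: +0", 180.94, "3"]          # hypothetical price-history entry
# parsed_day = datetime.strptime(pair[0], '%b %d %Y %H: +0').date()
# age_in_days = (datetime.today().date() - parsed_day).days
# print(parsed_day, age_in_days)                       # entries older than `days` (<= 7) are skipped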
async def asyncfetch(url):
    urlid = url_id(url)
    log.info('Reading cached response for {}'.format(urlid))
    async with aiofiles.open(os.path.join(cache_root, urlid), "r", encoding='utf-8') as f:
        return await f.read()
def crawl_item_history_price(index, item, total_price_number):
    history_prices = []
    item_id = item.id
    steam_price_url = steam_price_history_url(item)
    log.info('GET steam history price {}/{} for ({}): {}'.format(
        index, total_price_number, item.name, steam_price_url))
    steam_history_prices = get_json_dict(steam_price_url, steam_cookies, True)
    if steam_history_prices is not None:
        raw_price_history = steam_history_prices['prices']
        days = min(len(raw_price_history), 7)
        for pair in reversed(raw_price_history):
            if len(pair) == 3:
                history_prices.append(float(pair[1]))
            if len(history_prices) == days:
                break
        # set history price if exist
        if len(history_prices) != 0:
            item.set_history_prices(history_prices, days)
            log.info(
                'totally {} pieces of price history in {} days for {}\n'.format(
                    len(history_prices), days, item.name))
def get_json_dict(url, proxy=None, times=1):
    if times > RETRY_TIMES:
        log.error(
            'Timeout for {} beyond the maximum({}) retry times. SKIP!'.format(
                url, RETRY_TIMES))
        return None
    timer.sleep_awhile()  # sleep randomly for 1~2 seconds
    try:
        if proxy is not None:
            log.info("Using proxy {}".format(proxy))
            return requests.get(url, headers=headers, cookies=cookies, timeout=5,
                                proxies={'http': proxy}).json()
        else:
            log.info("No proxy")
            return requests.get(url, headers=headers, cookies=cookies, timeout=5).json()  # fetch the JSON payload
    except Timeout:  # the request timed out while fetching the JSON
        log.warn("timeout for {}. Try again.".format(url))
        # pass the proxy through on retry; otherwise the retry counter would be
        # bound to the proxy parameter
        return get_json_dict(url, proxy, times + 1)
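# For reference, a minimal sketch of the same retry behaviour written as an explicit
# loop instead of recursion. RETRY_TIMES, headers and cookies are placeholders that
# mirror the names used above; this is an illustrative alternative, not the project's API.
import requests
from requests.exceptions import Timeout

RETRY_TIMES = 3            # assumed config value
headers, cookies = {}, {}  # assumed request headers / cookies

def get_json_dict_loop(url, proxy=None):
    # Try the request up to RETRY_TIMES times; return None after the last timeout.
    proxies = {'http': proxy} if proxy else None
    for attempt in range(1, RETRY_TIMES + 1):
        try:
            return requests.get(url, headers=headers, cookies=cookies,
                                timeout=5, proxies=proxies).json()
        except Timeout:
            print("timeout for {} (attempt {}/{})".format(url, attempt, RETRY_TIMES))
    return None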
def csgo_all_categories():
    """ Parse the Buff entry-page HTML to get the names of all top-level csgo categories. """
    prefix = '<div class="h1z1-selType type_csgo" id="j_h1z1-selType">'
    suffix = '</ul> </div> </div> <div class="criteria">'
    # to match all csgo skin categories
    category_regex = re.compile(r'<li value="(.+?)"', re.DOTALL)  # regex that captures each category name
    # entry page
    root_url = goods_root_url()  # root url of the Buff goods page
    log.info("GET root page: " + root_url)
    root_html = http_util.open_url(root_url)  # defined in src/util; simply fetches the url with urllib
    remove_prefix = root_html.split(prefix, 1)[1]  # keep everything after the prefix string
    core_html = remove_prefix.split(suffix, 1)[0]  # then keep everything before the suffix string
    # the two split() calls neatly cut out the fragment we care about
    # all categories
    categories = category_regex.findall(core_html)  # all category names
    log.info("All Buff categories({}): {}".format(len(categories), categories))
    return categories
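# A minimal, self-contained illustration of the split-then-regex extraction above.
# The HTML fragment and the category values are invented for the example, not taken from Buff.
import re

html = ('<div class="h1z1-selType type_csgo" id="j_h1z1-selType">'
        '<ul><li value="weapon_ak47">AK-47</li>'
        '<li value="weapon_awp">AWP</li></ul> </div> </div> <div class="criteria">rest...')

prefix = '<div class="h1z1-selType type_csgo" id="j_h1z1-selType">'
suffix = '</ul> </div> </div> <div class="criteria">'

# cut out the fragment between prefix and suffix, then capture each <li value="...">
core = html.split(prefix, 1)[1].split(suffix, 1)[0]
print(re.compile(r'<li value="(.+?)"', re.DOTALL).findall(core))
# ['weapon_ak47', 'weapon_awp']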
def write_arb_tab():
    curr_arb_tab = arbitrage_data()
    curr_arb_tab = curr_arb_tab.reset_index()
    print(curr_arb_tab)
    mydb = open_db()
    myDBcursor = mydb.cursor()
    table = 'ArbTable'
    myDBcursor.execute("DROP TABLE IF EXISTS " + table)
    sql_create_tab = "CREATE TABLE " + table + " \
        (ItemName VARCHAR(50), SteamPrice DECIMAL(10,2), SteamDayVolume INTEGER, \
        SteamWeekVolume INTEGER, BitskinsPrice DECIMAL(10,2), BitskinsNum INTEGER, \
        UpdateTime INTEGER, WaxpeerPrice DECIMAL(10,2), WaxpeerNum INTEGER, \
        AbsoluteProfit DECIMAL(10,5), PercentProfit DECIMAL(10,5) )"
    myDBcursor.execute(sql_create_tab)
    cols = "ItemName, SteamPrice, SteamDayVolume, SteamWeekVolume, BitskinsPrice, BitskinsNum, \
        UpdateTime, WaxpeerPrice, WaxpeerNum, AbsoluteProfit, PercentProfit"
    for i, row in curr_arb_tab.iterrows():
        log.info('Writing Arbitrage Table into SQL ' + str(tuple(row)))
        sql = "INSERT INTO " + table + " (" + cols + ") VALUES (" + "%s," * (len(row) - 1) + "%s)"
        myDBcursor.execute(sql, tuple(row))
    mydb.commit()
    myDBcursor.close()
    mydb.close()
    return
def sleep_awhile():
    if proxies[0] is not None:
        interval = random.uniform(0.1, 0.3)
    else:
        interval = random.randint(1, 2)  # sleep randomly for 1~2 seconds when no proxy is used
    log.info("sleep {}s at {}".format(interval, datetime.datetime.now()))
    time.sleep(interval)
def get_raw_data():
    url = "http://api.steamanalyst.com/csgo/XHoXEDUxbtdHXFZlc"
    resp = requests.get(url)
    log.info("Fetching General Steam Community market data")
    if not resp.ok:
        log.warning("Can't fetch Steam Community Market data")
        return
    toReturn = resp.json()
    return toReturn['results']
def get_market_data():
    url = 'https://bitskins.com/api/v1/get_price_data_for_items_on_sale/?api_key=' + API_KEY + '&code=' + CODE + '&app_id=' + APP_ID
    resp = requests.get(url)
    log.info("Fetching Bitskins market data")
    if not resp.ok:
        log.warning("Can't fetch Bitskins data")
        return
    price_list = resp.json()
    return price_list['data']['items']
def get_market_data():
    url = 'https://api.waxpeer.com/v1/prices?game=csgo&min_price=0&max_price=100000000'
    resp = requests.get(url)
    log.info("Fetching Waxpeer market data")
    if not resp.ok:
        log.warning("Can't fetch Waxpeer data")
        return
    price_list = resp.json()
    return price_list['items']
def sleep_awhile(is_steam_request=0):
    low = max(FREQUENCY_INTERVAL_LOW, 10)
    high = max(10, FREQUENCY_INTERVAL_HIGH)
    if is_steam_request == 1:
        interval = 1 / (random.randint(5, 10))
    else:
        interval = random.randint(low, high)
    log.info("sleep {}s at {}".format(interval, datetime.datetime.now()))
    time.sleep(interval)
def crawl_history_price(csgo_items):
    total_price_number = len(csgo_items)
    log.info('Total {} items to get history price.'.format(total_price_number))
    for index, item in enumerate(csgo_items, start=1):
        try:
            crawl_item_history_price(index, item, total_price_number)
        except Exception as e:
            log.error(traceback.format_exc())
def crawl_history_price(csgo_items):
    total_price_number = len(csgo_items)
    log.info('{} items crawled from Buff satisfy the crawl conditions.'.format(total_price_number))
    proxies = proxyGet(30)
    proxy_pool = cycle(proxies)  # build the cycle once; a fresh cycle() per iteration would always yield the first proxy
    for index, item in enumerate(csgo_items, start=1):
        # enumerate from 1 so index records the position of the item being crawled
        proxy = next(proxy_pool)
        crawl_item_history_price(index, item, total_price_number, proxy)
def collect_all_categories(categories):
    csgo_items = []
    # for category in [categories.pop()]:
    for category in categories:
        csgo_items.extend(collect_single_category(category))
    log.info(
        "Finish parsing All csgo items. Total effective items: {}\n".format(
            len(csgo_items)))
    return csgo_items
def mannul_delete_item(itemToDelete):
    mydb = open_db()
    myDBcursor = mydb.cursor()
    table = 'ArbTable'
    sql_delete = "DELETE FROM " + table + " WHERE ItemName = '" + itemToDelete + "'"
    log.info(sql_delete)
    myDBcursor.execute(sql_delete)
    mydb.commit()
    myDBcursor.close()
    mydb.close()
    return
def run(self):
    start = datetime.datetime.now()
    log.info("Start Time: {}".format(start))
    table = item_crawler.crawl()
    if (table is not None) and len(table) > 0:
        suggestion.suggest(table)
    else:
        log.error('No correct csgo items remain. Please check if conditions are too strict.')
    end = datetime.datetime.now()
    log.info("END: {}. TIME USED: {}.".format(end, end - start))
    self._signal.emit()
def crawl(): log.info("Force crawling? {}".format(FORCE_CRAWL)) if (not FORCE_CRAWL) and os.path.exists(DATABASE_FILE): log.info('{} exists, load data from local!'.format(DATABASE_FILE)) table = load_local() else: log.info('Crawl data from website!') if CRAWL_MIN_PRICE_ITEM == BUFF_GOODS_LIMITED_MIN_PRICE and CRAWL_MAX_PRICE_ITEM == BUFF_GOODS_LIMITED_MAX_PRICE: log.info('Price section unspecified, crawl all items!') table = crawl_the_whole_website() else: log.info('Price section specified, crawl price between {} and {}'. format(CRAWL_MIN_PRICE_ITEM, CRAWL_MAX_PRICE_ITEM)) table = crawl_only_price_section() return table
def crawl_website():
    csgo_items = []
    raw_categories = csgo_all_categories()
    categories = final_categories(raw_categories)
    # crawl by categories and price section
    if len(raw_categories) != len(categories):
        total_category = len(categories)
        for index, category in enumerate(categories, start=1):
            csgo_items.extend(crawl_goods_by_price_section(category))
            log.info('GET category {}/{} for ({}).'.format(index, total_category, category))
    else:
        # crawl by price section without category
        csgo_items.extend(crawl_goods_by_price_section(None))
    enrich_item_with_price_history(csgo_items, CRAWL_STEAM_ASYNC)
    return persist_util.tabulate(csgo_items)
def csgo_all_categories():
    prefix = '<div class="h1z1-selType type_csgo" id="j_h1z1-selType">'
    suffix = '</ul> </div> </div> <div class="criteria">'
    # to match all csgo skin categories
    category_regex = re.compile(r'<li value="(.+?)"', re.DOTALL)
    # entry page
    root_url = goods_root_url()
    log.info("GET: " + root_url)
    root_html = http_util.open_url(root_url)
    remove_prefix = root_html.split(prefix, 1)[1]
    core_html = remove_prefix.split(suffix, 1)[0]
    # all categories
    categories = category_regex.findall(core_html)
    log.info("All categories({}): {}".format(len(categories), categories))
    return categories
def crawl_item_history_price(index, item, total_price_number, proxy):
    history_prices = []
    item_id = item.id
    steam_price_url = steam_price_history_url(item_id)  # build the steam price api url from item.id
    log.info('GET steam price info for {} ({}/{}): steam price api url {}'.format(
        item.name, index, total_price_number, steam_price_url))
    steam_history_prices = requester.get_json_dict(steam_price_url, proxy)
    """ The response json looks like:
    {
        "code": "OK",
        "data": {
            "currency": "\u4eba\u6c11\u5e01",
            "currency_symbol": "\u00a5",
            "days": 7,
            "price_history": [
                [1587834000000, 180.94],
            ],
            "price_type": "Steam\u4ef7\u683c",
            "steam_price_currency": "\u5143"
        },
        "msg": null
    }
    """
    if steam_history_prices is not None:
        days = steam_history_prices['data']['days']
        raw_price_history = steam_history_prices['data']['price_history']
        for pair in raw_price_history:
            if len(pair) == 2:
                history_prices.append(float(pair[1]) * DOLLAR_TO_CNY)  # collect the price-history list
        # set history price if exist
        if len(history_prices) != 0:
            # set_history_prices also derives other stats, e.g. the average price, inside the Item class
            item.set_history_prices(history_prices, days)
            log.info('{} has {} transactions in the last {} days\n'.format(
                item.name, len(history_prices), days))
def crawl_item_history_price(index, item, total_price_number):
    history_prices = []
    steam_price_url = steam_price_history_url(item)
    log.info('GET steam history price {}/{} for ({}): {}'.format(
        index, total_price_number, item.name, steam_price_url))
    # (in synchronous crawling, is_steam_request shortens the crawl interval for the steam market)
    steam_history_prices = get_json_dict(steam_price_url, steam_cookies, is_steam_request=1)
    # key existence check
    if (steam_history_prices is not None) and ('prices' in steam_history_prices):
        days = key_existence_check(item, history_prices, steam_history_prices)
        log.info(
            'totally {} pieces of price history in {} days for {}\n'.format(
                len(history_prices), days, item.name))
async def async_crawl_item_history_price(index, item, session):
    history_prices = []
    steam_price_url = steam_price_history_url(item)
    log.info('prepare to GET steam history price {} for ({}): {}'.format(
        index, item.name, steam_price_url))
    steam_history_prices = await async_get_json_dict(steam_price_url, config.STEAM_COOKIE, session, proxy=True)
    # key existence check
    if (steam_history_prices is not None) and ('prices' in steam_history_prices):
        days = key_existence_check(item, history_prices, steam_history_prices)
        log.info(
            'got steam history price {} for {}({} pieces of price history): {}'.format(
                index, item.name, len(history_prices), steam_price_url))
def collect_item(item):
    """ Convert one crawled json entry into an Item object. """
    buff_id = item['id']
    name = item['name']
    min_price = item['sell_min_price']
    sell_num = item['sell_num']
    steam_url = item['steam_market_url']
    steam_predict_price = item['goods_info']['steam_price_cny']
    buy_max_price = item['buy_max_price']
    # restrict price of an item
    if float(min_price) < CRAWL_MIN_PRICE_ITEM:
        # In theory this should not happen, because the min/max price bounds were already
        # applied when the items were fetched. It can only occur when the market moves and
        # a price crosses the boundary between requests.
        log.info("{} price is lower than {}. Drop it!".format(name, CRAWL_MIN_PRICE_ITEM))
        return None
    elif float(min_price) > CRAWL_MAX_PRICE_ITEM:
        log.info("{} price is higher than {}. Drop it!".format(name, CRAWL_MAX_PRICE_ITEM))
        return None
    else:
        log.info("GET ITEM {}, parsed into an Item object.".format(name))
        return Item(buff_id, name, min_price, sell_num, steam_url, steam_predict_price, buy_max_price)
def crawl_goods_by_price_section(category=None):
    root_url = goods_section_root_url(category)
    log.info('GET: {}'.format(root_url))
    root_json = requester.get_json_dict(root_url)
    category_items = []
    if root_json is not None:
        total_page = root_json['data']['total_page']
        total_count = root_json['data']['total_count']
        log.info('Totally {} items of {} pages to crawl.'.format(total_count, total_page))
        # get each page
        for page_num in range(1, total_page + 1):
            log.info('Page {} / {}'.format(page_num, total_page))
            page_url = goods_section_page_url(category, page_num)
            page_json = requester.get_json_dict(page_url)
            if page_json is not None:
                # items on this page
                items_json = page_json['data']['items']
                for item in items_json:
                    # get item
                    csgo_item = collect_item(item)
                    if csgo_item is not None:
                        category_items.append(csgo_item)
    return category_items
def crawl_only_price_section():
    root_url = goods_section_root_url()
    log.info('GET: {}'.format(root_url))
    root_json = requester.get_json_dict(root_url)
    csgo_items = []
    if root_json is not None:
        total_page = root_json['data']['total_page']
        total_count = root_json['data']['total_count']
        log.info('Totally {} items of {} pages to crawl.'.format(total_count, total_page))
        # get each page
        for page_num in range(1, total_page + 1):
            log.info('Page {} / {}'.format(page_num, total_page))
            page_url = goods_section_page_url(page_num)
            page_json = requester.get_json_dict(page_url)
            if page_json is not None:
                # items on this page
                items_json = page_json['data']['items']
                for item in items_json:
                    # get item
                    csgo_item = collect_item(item)
                    if csgo_item is not None:
                        csgo_items.append(csgo_item)
    enrich_item_with_price_history(csgo_items)
    return persist_util.tabulate(csgo_items)
def write_pricesnapshot_comb(input_df):
    input_df = input_df.reset_index()
    item_list = input_df.values.tolist()
    mydb = open_db()
    myDBcursor = mydb.cursor()
    sql = "USE CSGO"
    myDBcursor.execute(sql)
    table = "PriceSnapshot"
    for row in item_list:
        log.info("Combined data writing to SQL: " + str(row[0]))
        ItemName = row[0]
        SteamPrice = float(row[1])
        SteamDayVolume = int(row[2])
        SteamWeekVolume = int(row[3])
        SteamUpdateTime = int(datetime.datetime.now().timestamp())
        BitskinsPrice = float(row[4])
        BitskinsNum = int(row[5])
        BitSkinsUpdateTime = int(row[6])
        WaxpeerPrice = float(row[7])
        WaxpeerNum = int(row[8])
        WaxpeerUpdateTime = int(datetime.datetime.now().timestamp())
        sql = f"""INSERT INTO {table} (ItemName, SteamPrice, SteamDayVolume, SteamWeekVolume, SteamUpdateTime, BitskinsPrice, BitskinsNum, BitSkinsUpdateTime, \
            WaxpeerPrice, WaxpeerNum, WaxpeerUpdateTime) \
            VALUES ("{ItemName}",{SteamPrice},{SteamDayVolume},{SteamWeekVolume},{SteamUpdateTime},{BitskinsPrice},{BitskinsNum},{BitSkinsUpdateTime},\
            {WaxpeerPrice},{WaxpeerNum},{WaxpeerUpdateTime}) \
            ON DUPLICATE KEY UPDATE \
            SteamPrice={SteamPrice}, SteamDayVolume={SteamDayVolume}, SteamWeekVolume={SteamWeekVolume}, SteamUpdateTime={SteamUpdateTime}, BitskinsPrice={BitskinsPrice}, BitskinsNum={BitskinsNum}, \
            BitSkinsUpdateTime={BitSkinsUpdateTime}, WaxpeerPrice={WaxpeerPrice}, WaxpeerNum={WaxpeerNum}, WaxpeerUpdateTime={WaxpeerUpdateTime}"""
        myDBcursor.execute(sql)
    mydb.commit()
    myDBcursor.close()
    mydb.close()
    return
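# Item names can contain quotes and other special characters, which makes the f-string
# interpolation above fragile. A minimal sketch of the same upsert with placeholders,
# assuming open_db() returns a mysql-connector / PyMySQL style connection whose cursor
# accepts %s parameters. write_pricesnapshot_row is a hypothetical helper, not part of the project.
def write_pricesnapshot_row(cursor, row):
    """Upsert one (ItemName, ..., WaxpeerUpdateTime) tuple using placeholders."""
    sql = ("INSERT INTO PriceSnapshot (ItemName, SteamPrice, SteamDayVolume, SteamWeekVolume, "
           "SteamUpdateTime, BitskinsPrice, BitskinsNum, BitSkinsUpdateTime, "
           "WaxpeerPrice, WaxpeerNum, WaxpeerUpdateTime) "
           "VALUES (" + ", ".join(["%s"] * 11) + ") "
           "ON DUPLICATE KEY UPDATE "
           "SteamPrice=VALUES(SteamPrice), SteamDayVolume=VALUES(SteamDayVolume), "
           "SteamWeekVolume=VALUES(SteamWeekVolume), SteamUpdateTime=VALUES(SteamUpdateTime), "
           "BitskinsPrice=VALUES(BitskinsPrice), BitskinsNum=VALUES(BitskinsNum), "
           "BitSkinsUpdateTime=VALUES(BitSkinsUpdateTime), WaxpeerPrice=VALUES(WaxpeerPrice), "
           "WaxpeerNum=VALUES(WaxpeerNum), WaxpeerUpdateTime=VALUES(WaxpeerUpdateTime)")
    # the driver escapes each value, so quotes in item names cannot break the statement
    cursor.execute(sql, tuple(row))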
def write_buff_data(item_dic):
    log.info("Writing to SQL: " + str(item_dic))
    ItemName = item_dic['ItemName']
    BuffPrice = item_dic['BuffPrice']
    BuffNum = item_dic['BuffNum']
    BuffUpdateTime = item_dic['BuffUpdateTime']
    mydb = open_db()
    myDBcursor = mydb.cursor()
    table = "PriceSnapshot"
    sql = "USE CSGO"
    myDBcursor.execute(sql)
    sql = f"""INSERT INTO {table} (ItemName, BuffPrice, BuffNum, BuffUpdateTime)
              VALUES ("{ItemName}",{BuffPrice},{BuffNum},{BuffUpdateTime})
              ON DUPLICATE KEY UPDATE
              BuffPrice={BuffPrice}, BuffNum={BuffNum}, BuffUpdateTime={BuffUpdateTime}"""
    myDBcursor.execute(sql)
    mydb.commit()
    myDBcursor.close()
    mydb.close()
    return
def crawl_item_history_price(index, item, total_price_number):
    history_prices = []
    item_id = item.id
    steam_price_url = steam_price_history_url(item_id)
    log.info('GET steam history price {}/{} for ({}): {}'.format(
        index, total_price_number, item.name, steam_price_url))
    steam_history_prices = requester.get_json_dict(steam_price_url)
    if steam_history_prices is not None:
        days = steam_history_prices['data']['days']
        raw_price_history = steam_history_prices['data']['price_history']
        for pair in raw_price_history:
            if len(pair) == 2:
                history_prices.append(float(pair[1]) * DOLLAR_TO_CNY)
        # set history price if exist
        if len(history_prices) != 0:
            item.set_history_prices(history_prices, days)
            log.info(
                'totally {} pieces of price history in {} days for {}\n'.format(
                    len(history_prices), days, item.name))
async def async_crawl_history_price(csgo_items):
    total_price_number = len(csgo_items)
    log.info('Total {} items to get history price.'.format(total_price_number))
    tasks = []
    # 30min
    timeout = aiohttp.ClientTimeout(total=30 * 60)
    if PROXY:
        # use socks
        connector = ProxyConnector.from_url(PROXY, limit=5)
    else:
        connector = aiohttp.TCPConnector(limit=5)
    async with aiohttp.ClientSession(cookies=steam_cookies, headers=get_headers(),
                                     connector=connector, timeout=timeout) as session:
        for index, item in enumerate(csgo_items, start=1):
            try:
                tasks.append(
                    async_crawl_item_history_price(index, item, total_price_number, session))
            except Exception as e:
                log.error(traceback.format_exc())
            # run the pending tasks in batches of 100:
            if len(tasks) > 100:
                try:
                    await asyncio.gather(*tasks)
                except Exception as e:
                    log.error(traceback.format_exc())
                tasks = []
        try:
            await asyncio.gather(*tasks)
        except Exception as e:
            log.error(traceback.format_exc())
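# For reference, the batch-and-gather pattern used above reduced to a small, self-contained
# sketch. fake_fetch and the batch size are illustrative stand-ins, not part of the crawler.
import asyncio

async def fake_fetch(i):
    # stand-in for async_crawl_item_history_price: pretend to do I/O, then return
    await asyncio.sleep(0.01)
    return i

async def run_in_batches(n_items, batch_size=100):
    tasks, results = [], []
    for i in range(n_items):
        tasks.append(fake_fetch(i))
        if len(tasks) >= batch_size:           # flush a full batch
            results.extend(await asyncio.gather(*tasks))
            tasks = []
    if tasks:                                  # flush the remainder
        results.extend(await asyncio.gather(*tasks))
    return results

print(len(asyncio.run(run_in_batches(250))))   # 250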