Code Example #1
def crawl():
    log.info("Force crawling? {}".format(config.FORCE_CRAWL))
    csgo_items = []

    raw_categories = csgo_all_categories()

    categories = final_categories(raw_categories)

    # crawl by categories and price section
    if len(raw_categories) != len(categories):
        total_category = len(categories)
        for index, category in enumerate(categories, start=1):
            t = asyncio.run(crawl_goods_by_price_section(category))
            if t is None:
                break
            else:
                csgo_items.extend(t)
            log.info('GET category {}/{} for ({}).'.format(
                index, total_category, category))
    else:
        # crawl by price section without category
        csgo_items.extend(
            asyncio.run(crawl_goods_by_price_section(None)) or [])

    return csgo_items
Code Example #2
def crawl_item_history_price(index, item, total_price_number):
    history_prices = []

    steam_price_url = steam_price_history_url(item)
    log.info('GET steam history price {}/{} for ({}): {}'.format(
        index, total_price_number, item.name, steam_price_url))
    steam_history_prices = get_json_dict(steam_price_url, steam_cookies, True)

    # key existence check
    if (steam_history_prices is not None) and ('prices'
                                               in steam_history_prices):
        raw_price_history = steam_history_prices['prices']
        if len(raw_price_history) > 0:
            days = min((datetime.today().date() - datetime.strptime(
                raw_price_history[0][0], '%b %d %Y %H: +0').date()).days, 7)
        else:
            days = 0
        for pair in reversed(raw_price_history):
            if len(pair) == 3:
                for i in range(0, int(pair[2])):
                    history_prices.append(float(pair[1]))
            if (datetime.today().date() - datetime.strptime(
                    pair[0], '%b %d %Y %H: +0').date()).days > days:
                break

        # set history price if exist
        if len(history_prices) != 0:
            item.set_history_prices(history_prices, days)

        log.info(
            'totally {} pieces of price history in {} days for {}\n'.format(
                len(history_prices), days, item.name))
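Judging from the parsing above, each row of the Steam 'prices' array has the shape [date string, median price, volume], with the date in the '%b %d %Y %H: +0' format, and the price is appended once per unit of volume. A minimal sketch of that per-row expansion (the sample row is made up for illustration):

from datetime import datetime

def expand_price_row(pair):
    # pair = [date string, median price, volume as a string]
    day = datetime.strptime(pair[0], '%b %d %Y %H: +0').date()
    return day, [float(pair[1])] * int(pair[2])

day, prices = expand_price_row(['Oct 14 2021 01: +0', 1.23, '5'])
# day == date(2021, 10, 14); prices == [1.23, 1.23, 1.23, 1.23, 1.23]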
Code Example #3
File: cache.py Project: zcs19960929/oddish
async def asyncfetch(url):
    urlid = url_id(url)
    log.info('Successful attempt to fetch from {}'.format(urlid))
    async with aiofiles.open(os.path.join(cache_root, urlid),
                             "r",
                             encoding='utf-8') as f:
        return await f.read()
Code Example #4
File: history_price_crawler.py Project: xh119/oddish
def crawl_item_history_price(index, item, total_price_number):
    history_prices = []

    item_id = item.id
    steam_price_url = steam_price_history_url(item)
    log.info('GET steam history price {}/{} for ({}): {}'.format(
        index, total_price_number, item.name, steam_price_url))
    steam_history_prices = get_json_dict(steam_price_url, steam_cookies, True)

    if steam_history_prices is not None:
        raw_price_history = steam_history_prices['prices']
        days = min(len(raw_price_history), 7)
        for pair in reversed(raw_price_history):
            if len(pair) == 3:
                history_prices.append(float(pair[1]))
            if len(history_prices) == days:
                break

        # set history price if exist
        if len(history_prices) != 0:
            item.set_history_prices(history_prices, days)

        log.info(
            'totally {} pieces of price history in {} days for {}\n'.format(
                len(history_prices), days, item.name))
Code Example #5
def get_json_dict(url, proxy=None, times=1):
    if times > RETRY_TIMES:
        log.error(
            'Timeout for {} beyond the maximum({}) retry times. SKIP!'.format(
                url, RETRY_TIMES))
        return None

    timer.sleep_awhile()
    # random sleep of roughly 1-2 seconds
    try:
        if proxy is not None:
            log.info("使用代理{}".format(proxy))
            return requests.get(url,
                                headers=headers,
                                cookies=cookies,
                                timeout=5,
                                proxies={
                                    'http': proxy
                                }).json()
        else:
            log.info("无代理".format(proxy))
            return requests.get(url,
                                headers=headers,
                                cookies=cookies,
                                timeout=5).json()
        # fetch and parse the JSON
    except Timeout:
        # the request timed out
        log.warn("timeout for {}. Try again.".format(url))
        return get_json_dict(url, proxy, times + 1)
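The same retry policy can also be written without recursion. A rough equivalent, assuming the same requests, Timeout, headers, cookies, timer, log and RETRY_TIMES names as above:

def get_json_dict_iterative(url, proxy=None):
    # retry up to RETRY_TIMES times, sleeping a bit before each attempt
    for attempt in range(1, RETRY_TIMES + 1):
        timer.sleep_awhile()
        try:
            proxies = {'http': proxy} if proxy is not None else None
            return requests.get(url, headers=headers, cookies=cookies,
                                timeout=5, proxies=proxies).json()
        except Timeout:
            log.warn("timeout for {} (attempt {}/{}). Try again.".format(
                url, attempt, RETRY_TIMES))
    log.error('Timeout for {} beyond the maximum({}) retry times. SKIP!'.format(
        url, RETRY_TIMES))
    return None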
Code Example #6
File: item_crawler.py Project: Nortonary/BUFFCrawler
def csgo_all_categories():
    """
    Parse the HTML to get the names of all top-level CSGO categories on Buff.
    """
    prefix = '<div class="h1z1-selType type_csgo" id="j_h1z1-selType">'
    suffix = '</ul> </div> </div> <div class="criteria">'
    # to match all csgo skin categories
    category_regex = re.compile(r'<li value="(.+?)"', re.DOTALL)
    # regex for extracting the category names
    # entry page
    root_url = goods_root_url()
    # root URL of the Buff home page

    log.info("GET home page: " + root_url)
    root_html = http_util.open_url(root_url)
    # defined in src/util; essentially a urllib fetch of the URL, see it for the parameters

    remove_prefix = root_html.split(prefix, 1)[1]
    # use split to keep everything after the prefix string
    core_html = remove_prefix.split(suffix, 1)[0]
    # then keep everything before the suffix string
    # this split-based slicing is a neat way to narrow the search

    # all categories
    categories = category_regex.findall(core_html)
    # all category names
    log.info("All Buff categories({}): {}".format(len(categories), categories))
    return categories
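The prefix/suffix split above is just a cheap way to narrow the regex search to the single <div> that lists the categories. The same idea on a toy string (the HTML below is made up, not Buff's real markup):

import re

html = ('<div id="menu"> <ul> <li value="knife">Knife</li>'
        ' <li value="pistol">Pistol</li> </ul> </div> <div class="other">')
core = html.split('<div id="menu">', 1)[1].split('<div class="other">', 1)[0]
print(re.findall(r'<li value="(.+?)"', core))  # ['knife', 'pistol']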
Code Example #7
def write_arb_tab():
    curr_arb_tab = arbitrage_data()
    curr_arb_tab = curr_arb_tab.reset_index()
    print(curr_arb_tab)

    mydb = open_db()
    myDBcursor = mydb.cursor()
    table = 'ArbTable'

    myDBcursor.execute("DROP TABLE IF EXISTS " + table)

    sql_create_tab = "CREATE TABLE " + table + " \
         (ItemName VARCHAR(50), SteamPrice DECIMAL(10,2), SteamDayVolume INTEGER, \
             SteamWeekVolume INTEGER, BitskinsPrice DECIMAL(10,2), BitskinsNum INTEGER, \
                 UpdateTime INTEGER, WaxpeerPrice DECIMAL(10,2), WaxpeerNum INTEGER, \
                     AbsoluteProfit DECIMAL(10,5), PercentProfit DECIMAL(10,5) )"

    myDBcursor.execute(sql_create_tab)

    cols = "ItemName, SteamPrice, SteamDayVolume, SteamWeekVolume, BitskinsPrice, BitskinsNum, \
        UpdateTime, WaxpeerPrice, WaxpeerNum, AbsoluteProfit, PercentProfit"

    for i, row in curr_arb_tab.iterrows():
        log.info('Writing Arbitrage Table into SQL ' + str(tuple(row)))
        sql = "INSERT INTO " + table + " (" + cols + ") VALUES (" + "%s," * (
            len(row) - 1) + "%s)"
        myDBcursor.execute(sql, tuple(row))

    mydb.commit()
    myDBcursor.close()
    mydb.close()
    return
Code Example #8
File: timer.py Project: Nortonary/BUFFCrawler
def sleep_awhile():
    if proxies[0] is not None:
        interval = random.uniform(0.1, 0.3)
    else:
        interval = random.randint(1, 2)
    log.info("sleep {}s at {}".format(interval, datetime.datetime.now()))
    # random sleep: 0.1-0.3 s with a proxy, otherwise 1-2 s
    time.sleep(interval)
Code Example #9
def get_raw_data():
    url = "http://api.steamanalyst.com/csgo/XHoXEDUxbtdHXFZlc"
    resp = requests.get(url)
    log.info("Fetching General Steam Community market data")
    if not resp.ok:
        log.warning("Can't fetch Steam Community Market data ")
        return
    toReturn = resp.json()
    return toReturn['results']
Code Example #10
def get_market_data():
    url = 'https://bitskins.com/api/v1/get_price_data_for_items_on_sale/?api_key=' + API_KEY + '&code=' + CODE + '&app_id=' + APP_ID
    resp = requests.get(url)
    log.info("Fetching Bitskins market data")
    if not resp.ok:
        log.warning("Can't fetch Bitskins data ")
        return
    price_list = resp.json()
    return price_list['data']['items']
Code Example #11
def get_market_data():
    url = 'https://api.waxpeer.com/v1/prices?game=csgo&min_price=0&max_price=100000000'
    resp = requests.get(url)
    log.info("Fetching Waxpeer market data")
    if not resp.ok:
        log.warning("Can't fetch Waxpeer data ")
        return
    price_list = resp.json()
    return price_list['items']
Code Example #12
def sleep_awhile(is_steam_request=0):
    low = max(FREQUENCY_INTERVAL_LOW, 10)
    high = max(10, FREQUENCY_INTERVAL_HIGH)
    if is_steam_request == 1:
        interval = 1 / (random.randint(5, 10))
    else:
        interval = random.randint(low, high)
    log.info("sleep {}s at {}".format(interval, datetime.datetime.now()))
    time.sleep(interval)
Code Example #13
def crawl_history_price(csgo_items):
    total_price_number = len(csgo_items)
    log.info('Total {} items to get history price.'.format(total_price_number))

    for index, item in enumerate(csgo_items, start=1):
        try:
            crawl_item_history_price(index, item, total_price_number)
        except Exception as e:
            log.error(traceback.format_exc())
Code Example #14
def crawl_history_price(csgo_items):
    total_price_number = len(csgo_items)
    log.info('{} items crawled from Buff meet the crawl conditions.'.format(total_price_number))

    proxies = proxyGet(30)
    # build the proxy cycle once, outside the loop
    proxy_pool = cycle(proxies)

    for index, item in enumerate(csgo_items, start=1):
        # enumerate from 1, so index is the item's sequence number
        # crawl every item in csgo_items, rotating through the proxy pool
        proxy = next(proxy_pool)
        crawl_item_history_price(index, item, total_price_number, proxy)
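itertools.cycle only round-robins when the same iterator object is reused across iterations, which is why the pool is built once before the loop. A quick illustration:

from itertools import cycle

pool = cycle(['proxy1', 'proxy2', 'proxy3'])
print([next(pool) for _ in range(5)])
# ['proxy1', 'proxy2', 'proxy3', 'proxy1', 'proxy2']
# calling next(cycle(proxies)) inside the loop would yield 'proxy1' every time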
Code Example #15
def collect_all_categories(categories):
    csgo_items = []

    # for category in [categories.pop()]:
    for category in categories:
        csgo_items.extend(collect_single_category(category))

    log.info(
        "Finish parsing All csgo items. Total effective items: {}\n".format(
            len(csgo_items)))
    return csgo_items
Code Example #16
def mannul_delete_item(itemToDelete):
    mydb = open_db()
    myDBcursor = mydb.cursor()
    table = 'ArbTable'

    sql_delete = " DELETE FROM " + table + " WHERE " + "ItemName = '" + itemToDelete + "'"
    log.info(sql_delete)
    myDBcursor.execute(sql_delete)
    mydb.commit()
    myDBcursor.close()
    mydb.close()
    return
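mannul_delete_item interpolates the item name straight into the SQL string, so a name containing a quote would break the statement. The cursor already supports placeholders (as in the INSERT of Code Example #7); a hedged variant of the same DELETE, assuming the same MySQL-style driver:

sql_delete = "DELETE FROM " + table + " WHERE ItemName = %s"
myDBcursor.execute(sql_delete, (itemToDelete,))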
Code Example #17
File: oddish.py Project: puppylpg/oddish
    def run(self):
        start = datetime.datetime.now()
        log.info("Start Time: {}".format(start))

        table = item_crawler.crawl()

        if (table is not None) and len(table) > 0:
            suggestion.suggest(table)
        else:
            log.error(
                'No correct csgo items remain. Please check whether the conditions are too strict.'
            )

        end = datetime.datetime.now()
        log.info("END: {}. TIME USED: {}.".format(end, end - start))
        self._signal.emit()
Code Example #18
def crawl():
    log.info("Force crawling? {}".format(FORCE_CRAWL))
    if (not FORCE_CRAWL) and os.path.exists(DATABASE_FILE):
        log.info('{} exists, load data from local!'.format(DATABASE_FILE))
        table = load_local()
    else:
        log.info('Crawl data from website!')
        if CRAWL_MIN_PRICE_ITEM == BUFF_GOODS_LIMITED_MIN_PRICE and CRAWL_MAX_PRICE_ITEM == BUFF_GOODS_LIMITED_MAX_PRICE:
            log.info('Price section unspecified, crawl all items!')
            table = crawl_the_whole_website()
        else:
            log.info('Price section specified, crawl price between {} and {}'.
                     format(CRAWL_MIN_PRICE_ITEM, CRAWL_MAX_PRICE_ITEM))
            table = crawl_only_price_section()

    return table
Code Example #19
File: item_crawler.py Project: chaffurry/oddish
def crawl_website():
    csgo_items = []

    raw_categories = csgo_all_categories()

    categories = final_categories(raw_categories)

    # crawl by categories and price section
    if len(raw_categories) != len(categories):
        total_category = len(categories)
        for index, category in enumerate(categories, start=1):
            csgo_items.extend(crawl_goods_by_price_section(category))
            log.info('GET category {}/{} for ({}).'.format(index, total_category, category))
    else:
        # crawl by price section without category
        csgo_items.extend(crawl_goods_by_price_section(None))

    enrich_item_with_price_history(csgo_items, CRAWL_STEAM_ASYNC)
    return persist_util.tabulate(csgo_items)
Code Example #20
File: item_crawler.py Project: asdlei99/oddish
def csgo_all_categories():
    prefix = '<div class="h1z1-selType type_csgo" id="j_h1z1-selType">'
    suffix = '</ul> </div> </div> <div class="criteria">'
    # to match all csgo skin categories
    category_regex = re.compile(r'<li value="(.+?)"', re.DOTALL)

    # entry page
    root_url = goods_root_url()

    log.info("GET: " + root_url)
    root_html = http_util.open_url(root_url)

    remove_prefix = root_html.split(prefix, 1)[1]
    core_html = remove_prefix.split(suffix, 1)[0]

    # all categories
    categories = category_regex.findall(core_html)
    log.info("All categories({}): {}".format(len(categories), categories))
    return categories
Code Example #21
def crawl_item_history_price(index, item, total_price_number, proxy):
    history_prices = []

    item_id = item.id
    steam_price_url = steam_price_history_url(item_id)
    # get the Steam price API endpoint for item.id
    log.info('GET steam price info for {}, item {} of {}: price API endpoint {}'.format(item.name, index, total_price_number, steam_price_url))
    steam_history_prices = requester.get_json_dict(steam_price_url, proxy)
    """
    The JSON looks like:
    {
      "code": "OK", 
      "data": {
        "currency": "\u4eba\u6c11\u5e01", 
        "currency_symbol": "\u00a5", 
        "days": 7, 
        "price_history": [
                            [
                                1587834000000, 
                                180.94
                            ], 
                         ], 
        "price_type": "Steam\u4ef7\u683c", 
        "steam_price_currency": "\u5143"
      }, 
      "msg": null
    }
    """
    if steam_history_prices is not None:
        days = steam_history_prices['data']['days']
        raw_price_history = steam_history_prices['data']['price_history']
        for pair in raw_price_history:
            if len(pair) == 2:
                history_prices.append(float(pair[1]) * DOLLAR_TO_CNY)
                # collect each price point into the history list

        # set history prices if any exist
        if len(history_prices) != 0:
            item.set_history_prices(history_prices, days)
            # besides storing the prices, the Item class also derives stats such as the average price

        log.info('{} has {} days of history with {} price records in total\n'.format(item.name, days, len(history_prices)))
Code Example #22
def crawl_item_history_price(index, item, total_price_number):
    history_prices = []

    steam_price_url = steam_price_history_url(item)
    log.info('GET steam history price {}/{} for ({}): {}'.format(
        index, total_price_number, item.name, steam_price_url))

    # (for synchronous crawling, is_steam_request shortens the sleep interval between Steam Market requests)
    steam_history_prices = get_json_dict(steam_price_url,
                                         steam_cookies,
                                         is_steam_request=1)

    # key existence check
    if (steam_history_prices is not None) and ('prices'
                                               in steam_history_prices):
        days = key_existence_check(item, history_prices, steam_history_prices)

        log.info(
            'totally {} pieces of price history in {} days for {}\n'.format(
                len(history_prices), days, item.name))
Code Example #23
async def async_crawl_item_history_price(index, item, session):
    history_prices = []

    steam_price_url = steam_price_history_url(item)
    log.info('prepare to GET steam history price {} for ({}): {}'.format(
        index, item.name, steam_price_url))

    steam_history_prices = await async_get_json_dict(steam_price_url,
                                                     config.STEAM_COOKIE,
                                                     session,
                                                     proxy=True)

    # key existence check
    if (steam_history_prices is not None) and ('prices'
                                               in steam_history_prices):
        days = key_existence_check(item, history_prices, steam_history_prices)

        log.info(
            'got steam history price {} for {}({} pieces of price history): {}'
            .format(index, item.name, len(history_prices), steam_price_url))
Code Example #24
File: item_crawler.py Project: Nortonary/BUFFCrawler
def collect_item(item):
    """
    Convert one crawled data record into an Item object.
    """
    buff_id = item['id']
    name = item['name']
    min_price = item['sell_min_price']
    sell_num = item['sell_num']
    steam_url = item['steam_market_url']
    steam_predict_price = item['goods_info']['steam_price_cny']
    buy_max_price = item['buy_max_price']

    # restrict the price of an item
    if float(min_price) < CRAWL_MIN_PRICE_ITEM:
        # in theory this should not happen,
        # because items are already fetched within the min/max price bounds;
        # it can only occur when the market price suddenly crosses a boundary between requests
        log.info("{} price is lower than {}. Drop it!".format(name, CRAWL_MIN_PRICE_ITEM))
        return None
    elif float(min_price) > CRAWL_MAX_PRICE_ITEM:
        log.info("{} price is higher than {}. Drop it!".format(name, CRAWL_MAX_PRICE_ITEM))
        return None
    else:
        log.info("GET ITEM {} , 已经解析到Item类对象.".format(name))
        return Item(buff_id, name, min_price, sell_num, steam_url, steam_predict_price, buy_max_price)
Code Example #25
def crawl_goods_by_price_section(category=None):
    root_url = goods_section_root_url(category)
    log.info('GET: {}'.format(root_url))

    root_json = requester.get_json_dict(root_url)

    category_items = []

    if root_json is not None:
        total_page = root_json['data']['total_page']
        total_count = root_json['data']['total_count']
        log.info('Totally {} items of {} pages to crawl.'.format(
            total_count, total_page))
        # get each page
        for page_num in range(1, total_page + 1):
            log.info('Page {} / {}'.format(page_num, total_page))
            page_url = goods_section_page_url(category, page_num)
            page_json = requester.get_json_dict(page_url)
            if page_json is not None:
                # items on this page
                items_json = page_json['data']['items']
                for item in items_json:
                    # get item
                    csgo_item = collect_item(item)
                    if csgo_item is not None:
                        category_items.append(csgo_item)

    return category_items
Code Example #26
def crawl_only_price_section():
    root_url = goods_section_root_url()
    log.info('GET: {}'.format(root_url))

    root_json = requester.get_json_dict(root_url)

    csgo_items = []

    if root_json is not None:
        total_page = root_json['data']['total_page']
        total_count = root_json['data']['total_count']
        log.info('Totally {} items of {} pages to crawl.'.format(
            total_count, total_page))
        # get each page
        for page_num in range(1, total_page + 1):
            log.info('Page {} / {}'.format(page_num, total_page))
            page_url = goods_section_page_url(page_num)
            page_json = requester.get_json_dict(page_url)
            if page_json is not None:
                # items on this page
                items_json = page_json['data']['items']
                for item in items_json:
                    # get item
                    csgo_item = collect_item(item)
                    if csgo_item is not None:
                        csgo_items.append(csgo_item)

    enrich_item_with_price_history(csgo_items)
    return persist_util.tabulate(csgo_items)
Code Example #27
def write_pricesnapshot_comb(input_df):
    input_df = input_df.reset_index()
    item_list = input_df.values.tolist()

    mydb = open_db()
    myDBcursor = mydb.cursor()
    sql = "USE CSGO"
    myDBcursor.execute(sql)
    table = "PriceSnapshot"

    for row in item_list:
        log.info("Combined data writing to SQL: " + str(row[0]))
        ItemName = row[0]
        SteamPrice = float(row[1])
        SteamDayVolume = int(row[2])
        SteamWeekVolume = int(row[3])
        SteamUpdateTime = int(datetime.datetime.now().timestamp())
        BitskinsPrice = float(row[4])
        BitskinsNum = int(row[5])
        BitSkinsUpdateTime = int(row[6])
        WaxpeerPrice = float(row[7])
        WaxpeerNum = int(row[8])
        WaxpeerUpdateTime = int(datetime.datetime.now().timestamp())

        sql = f"""INSERT INTO {table} (ItemName, SteamPrice, SteamDayVolume, SteamWeekVolume, SteamUpdateTime, BitskinsPrice, BitskinsNum, BitSkinsUpdateTime, \
            WaxpeerPrice, WaxpeerNum, WaxpeerUpdateTime ) \
                VALUES ("{ItemName}",{SteamPrice},{SteamDayVolume},{SteamWeekVolume},{SteamUpdateTime},{BitskinsPrice},{BitskinsNum},{BitSkinsUpdateTime},\
                {WaxpeerPrice},{WaxpeerNum},{WaxpeerUpdateTime}) \
                    ON DUPLICATE KEY UPDATE \
                        SteamPrice={SteamPrice}, SteamDayVolume={SteamDayVolume}, SteamWeekVolume={SteamWeekVolume}, SteamUpdateTime={SteamUpdateTime}, BitskinsPrice={BitskinsPrice}, BitskinsNum={BitskinsNum}\
                            , BitSkinsUpdateTime={BitSkinsUpdateTime}, WaxpeerPrice={WaxpeerPrice}, WaxpeerNum={WaxpeerNum}, WaxpeerUpdateTime={WaxpeerUpdateTime}
        """
        myDBcursor.execute(sql)
    mydb.commit()
    myDBcursor.close()
    mydb.close()

    return
Code Example #28
def write_buff_data(item_dic):
    log.info("Writing to SQL: " + str(item_dic))
    ItemName = item_dic['ItemName']
    BuffPrice = item_dic['BuffPrice']
    BuffNum = item_dic['BuffNum']
    BuffUpdateTime = item_dic['BuffUpdateTime']

    mydb = open_db()
    myDBcursor = mydb.cursor()

    table = "PriceSnapshot"

    sql = "USE CSGO"
    myDBcursor.execute(sql)
    sql = f"""INSERT INTO {table} (ItemName, BuffPrice, BuffNum, BuffUpdateTime) VALUES ("{ItemName}",{BuffPrice},{BuffNum},{BuffUpdateTime}) ON DUPLICATE KEY UPDATE \
        BuffPrice={BuffPrice}, BuffNum={BuffNum}, BuffUpdateTime={BuffUpdateTime}"""
    myDBcursor.execute(sql)

    mydb.commit()
    myDBcursor.close()
    mydb.close()

    return
Code Example #29
def crawl_item_history_price(index, item, total_price_number):
    history_prices = []

    item_id = item.id
    steam_price_url = steam_price_history_url(item_id)
    log.info('GET steam history price {}/{} for ({}): {}'.format(
        index, total_price_number, item.name, steam_price_url))
    steam_history_prices = requester.get_json_dict(steam_price_url)

    if steam_history_prices is not None:
        days = steam_history_prices['data']['days']
        raw_price_history = steam_history_prices['data']['price_history']
        for pair in raw_price_history:
            if len(pair) == 2:
                history_prices.append(float(pair[1]) * DOLLAR_TO_CNY)

        # set history price if exist
        if len(history_prices) != 0:
            item.set_history_prices(history_prices, days)

        log.info(
            'totally {} pieces of price history in {} days for {}\n'.format(
                len(history_prices), days, item.name))
Code Example #30
async def async_crawl_history_price(csgo_items):
    total_price_number = len(csgo_items)
    log.info('Total {} items to get history price.'.format(total_price_number))

    tasks = []

    # 30min
    timeout = aiohttp.ClientTimeout(total=30 * 60)
    if PROXY:
        # use socks
        connector = ProxyConnector.from_url(PROXY, limit=5)
    else:
        connector = aiohttp.TCPConnector(limit=5)
    async with aiohttp.ClientSession(cookies=steam_cookies,
                                     headers=get_headers(),
                                     connector=connector,
                                     timeout=timeout) as session:
        for index, item in enumerate(csgo_items, start=1):
            try:
                tasks.append(
                    async_crawl_item_history_price(index, item,
                                                   total_price_number,
                                                   session))
            except Exception as e:
                log.error(traceback.format_exc())
            # run the tasks in batches of 100:
            if len(tasks) > 100:
                try:
                    await asyncio.gather(*tasks)
                except Exception as e:
                    log.error(traceback.format_exc())
                tasks = []
        try:
            await asyncio.gather(*tasks)
        except Exception as e:
            log.error(traceback.format_exc())
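The batching above drains the task list roughly every 100 coroutines so that the whole item list is never awaited at once. The same idea as a standalone helper (a generic sketch, not part of the project):

import asyncio

async def gather_in_batches(coroutines, batch_size=100):
    # await the coroutines in fixed-size batches instead of all at once
    results = []
    for start in range(0, len(coroutines), batch_size):
        batch = coroutines[start:start + batch_size]
        results.extend(await asyncio.gather(*batch, return_exceptions=True))
    return results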