Ejemplo n.º 1
0
def csgo_all_categories():
    """
    Return the names of all top-level categories found on the buff goods page.

    The goods root page is downloaded, the category selector fragment is cut
    out of the HTML between two fixed markers, and the ``value`` attribute of
    every ``<li value="...">`` inside that fragment is collected.

    Returns:
        list: the captured ``value`` strings, in the order they appear.

    Raises:
        IndexError: if the opening marker is not present in the page.
    """
    # Fixed HTML markers that delimit the category selector.
    start_marker = '<div class="h1z1-selType type_csgo" id="j_h1z1-selType">'
    end_marker = '</ul> </div> </div> <div class="criteria">'

    # Entry page: the root URL of the buff goods listing.
    entry_url = goods_root_url()
    log.info("GET 主页在此!: " + entry_url)

    # Project HTTP helper; per the original author it lives under src/util
    # and is a thin urllib-based fetch (see that module for its parameters).
    page = http_util.open_url(entry_url)

    # Keep only what follows the opening marker ...
    after_start = page.split(start_marker, 1)[1]
    # ... then drop everything from the closing marker onwards. If the
    # closing marker is missing, the whole remainder is kept.
    selector_html, _, _ = after_start.partition(end_marker)

    # Regex capturing the category name stored in each <li value="...">.
    li_value_pattern = re.compile(r'<li value="(.+?)"', re.DOTALL)
    names = li_value_pattern.findall(selector_html)

    log.info("所有buff大类({}): {}".format(len(names), names))
    return names
Ejemplo n.º 2
0
def csgo_all_categories():
    """
    Fetch the goods root page and return every csgo category value it lists.

    Returns:
        list: the ``value`` attribute of each ``<li value="...">`` found in
        the category selector section of the page, in order of appearance.

    Raises:
        IndexError: if the selector's opening marker is not in the page.
    """
    # Entry page.
    entry_url = goods_root_url()
    log.info("GET: " + entry_url)
    page = http_util.open_url(entry_url)

    # Slice out the selector section: everything after the opening marker,
    # up to (not including) the closing marker.
    selector_open = '<div class="h1z1-selType type_csgo" id="j_h1z1-selType">'
    selector_close = '</ul> </div> </div> <div class="criteria">'
    section = page.split(selector_open, 1)[1].split(selector_close, 1)[0]

    # Each category is carried in the value attribute of an <li> element.
    found = re.findall(r'<li value="(.+?)"', section, re.DOTALL)
    log.info("All categories({}): {}".format(len(found), found))
    return found
Ejemplo n.º 3
0
def crawl_the_whole_website():
    """
    Crawl every csgo category from the goods root page and tabulate the items.

    Steps:
        1. Download the goods root page and extract the category values from
           its category selector section.
        2. Collect the items of all those categories.
        3. Pass the items to ``enrich_item_with_price_history`` (its return
           value is not used here; presumably it updates them in place).
        4. Tabulate the items and return the result.

    Returns:
        Whatever ``persist_util.tabulate`` produces for the collected items.

    Raises:
        IndexError: if the selector's opening marker is not in the page.
    """
    # Entry page.
    start_url = goods_root_url()
    log.info("GET: " + start_url)
    html = http_util.open_url(start_url)

    # Markers that bracket the category selector in the page HTML.
    opening = '<div class="h1z1-selType type_csgo" id="j_h1z1-selType">'
    closing = '</ul> </div> </div> <div class="criteria">'
    tail = html.split(opening, 1)[1]
    selector_block = tail.split(closing, 1)[0]

    # Matches all csgo skin categories (one per <li value="...">).
    li_value = re.compile(r'<li value="(.+?)"', re.DOTALL)
    category_names = li_value.findall(selector_block)

    log.info("All categories: ")
    # Sample of the category list logged below, as recorded by the original
    # author:
    # weapon_knife_survival_bowie, weapon_knife_butterfly, weapon_knife_falchion, weapon_knife_flip, weapon_knife_gut,
    # weapon_knife_tactical, weapon_knife_m9_bayonet, weapon_bayonet, weapon_knife_karambit, weapon_knife_push,
    # weapon_knife_stiletto, weapon_knife_ursus, weapon_knife_gypsy_jackknife, weapon_knife_widowmaker,
    # weapon_knife_css, weapon_knife_cord, weapon_knife_canis, weapon_knife_outdoor, weapon_knife_skeleton,
    # weapon_hkp2000, weapon_usp_silencer, weapon_glock, weapon_p250, weapon_fiveseven, weapon_cz75a, weapon_tec9,
    # weapon_revolver, weapon_deagle, weapon_elite, weapon_galilar, weapon_scar20, weapon_awp, weapon_ak47,
    # weapon_famas, weapon_m4a1, weapon_m4a1_silencer, weapon_sg556, weapon_ssg08, weapon_aug, weapon_g3sg1,
    # weapon_p90, weapon_mac10, weapon_ump45, weapon_mp7, weapon_bizon, weapon_mp9, weapon_mp5sd, weapon_sawedoff,
    # weapon_xm1014, weapon_nova, weapon_mag7, weapon_m249, weapon_negev,
    # weapon_bloodhound_gloves, weapon_driver_gloves, weapon_hand_wraps, weapon_moto_gloves, weapon_specialist_gloves,
    # weapon_sport_gloves, weapon_hydra_gloves,
    # csgo_type_tool, csgo_type_spray, csgo_type_collectible, csgo_type_ticket, csgo_tool_gifttag, csgo_type_musickit,
    # csgo_type_weaponcase, csgo_tool_weaponcase_keytag, type_customplayer
    log.info(", ".join(category_names))

    # Collect -> enrich with price history -> tabulate.
    items = collect_all_categories(category_names)
    enrich_item_with_price_history(items)
    return persist_util.tabulate(items)