def get_geo_from_address(address):
    """Resolve *address* (Shanghai) to an entity.Location via Baidu APIs.

    Tries the place-suggestion API first; falls back to the geocoder API
    when the suggestion call yields no usable location.  Returns None when
    both lookups fail.
    """
    # Only the first whitespace-separated token is queried — presumably the
    # significant part of the address; TODO confirm against callers.
    address = address.split(' ')[0]
    suggest_url = ("http://api.map.baidu.com/place/v2/suggestion?region=上海市"
                   "&city_limit=true&query={0}&ak={1}&output=json").format(
                       address, BAIDU_APP_KEY)
    # BUGFIX: on a non-zero API status the response may omit 'result'
    # entirely; .get() lets us fall through to the geocoder instead of
    # raising KeyError.
    results = crawlLib.Crawler(suggest_url).to_json().get('result') or []
    if results and 'location' in results[0]:
        loc = results[0]['location']
        return entity.Location(loc['lng'], loc['lat'])
    geocode_url = ("http://api.map.baidu.com/geocoder/v2/?city=上海市"
                   "&address={0}&ak={1}&output=json").format(
                       address, BAIDU_APP_KEY)
    obj = crawlLib.Crawler(geocode_url).to_json()
    if obj['status'] == 0:  # 0 == success per Baidu geocoder convention
        loc = obj['result']['location']
        return entity.Location(loc['lng'], loc['lat'])
    return None
def get_shop_review_star_num(shop_id):
    """Scrape the review-star breakdown for one shop.

    Returns (comment_num, star_5, star_4, star_3, star_2, star_1) as raw
    text counts, or six Nones when the page cannot be parsed.
    """
    data = crawlLib.Crawler(
        REVIEW_URL.format(shop_id)).parse_content(mode="complex")
    try:
        # Query the DOM once instead of six times; dd[0] is the total
        # comment count, dd[1..5] are the 5..1 star buckets.
        dds = data.find("div", class_="comment-star").find_all("dd")
        # Each count is wrapped in one extra char on each side ("(123)"),
        # hence the [1:-1] strip.
        counts = [dd.find("em", class_="col-exp").text[1:-1] for dd in dds[:6]]
        comment_num, star_5_num, star_4_num, star_3_num, star_2_num, star_1_num = counts
    except Exception:
        # Best-effort: any markup change yields all-None rather than a crash.
        comment_num = star_5_num = star_4_num = star_3_num = star_2_num = star_1_num = None
    return comment_num, star_5_num, star_4_num, star_3_num, star_2_num, star_1_num
def get_all_available_routes(origin, destination):
    """Query Baidu transit directions between two locations.

    Returns the (routes, taxi) portions of the API result.
    """
    start = "{0},{1}".format(origin.lat, origin.lng)
    end = "{0},{1}".format(destination.lat, destination.lng)
    url = ("http://api.map.baidu.com/direction/v2/transit?tactics_incity=4"
           "&origin={0}&destination={1}&ak={2}").format(start, end, BAIDU_APP_KEY)
    payload = crawlLib.Crawler(url).to_json()
    result = payload['result']
    return result['routes'], result['taxi']
def get_all_regions_by_district(district):
    """List the sub-regions of one district from its navigation page.

    *district* is a (name, code, id) triple; returns a list of
    (text, href-tail, href-tail-minus-first-char, district_id) tuples.
    """
    district_name, district_code, district_id = district
    page = crawlLib.Crawler(SH_URL + "/" + district_code).parse_content()
    nav = page.find("div", id="region-nav-sub", class_="nc-items nc-sub")
    regions = []
    for anchor in nav.find_all(tag_filter):
        href = anchor['href']
        slash = href.rfind("/")
        regions.append(
            (anchor.text, href[slash + 1:], href[slash + 2:], district_id))
    return regions
def get_shop_details(shop_id):
    """Fetch shop statistics from the shop-details JSON endpoint.

    Returns (phone_no, total_hits, today_hits, monthly_hits, weekly_hits,
    last_week_hits, lat, lng).
    """
    info = crawlLib.Crawler(
        SHOP_DETAILS_URL.format(shop_id)).to_json()['msg']['shopInfo']
    return (info['phoneNo'], info['hits'], info['todayHits'],
            info['monthlyHits'], info['weeklyHits'], info['prevWeeklyHits'],
            info['glat'], info['glng'])
def get_all_subcategory_by_category(category):
    """List the sub-categories of one category; empty list on parse failure.

    *category* is a (name, code, id) triple; returns a list of
    (text, href-tail, href-tail-minus-first-char, category_id) tuples.
    """
    category_name, category_code, category_id = category
    category_url = SH_URL + "/" + category_code
    print(category_url)  # debug trace of the page being fetched
    page = crawlLib.Crawler(category_url).parse_content()
    try:
        nav = page.find("div", id="classfy-sub", class_="nc-items nc-sub")
        subcategories = []
        for anchor in nav.find_all(tag_filter):
            href = anchor['href']
            slash = href.rfind("/")
            subcategories.append(
                (anchor.text, href[slash + 1:], href[slash + 2:], category_id))
        return subcategories
    except Exception:
        # Some category pages have no sub-category navigation block.
        return []
def crawl_all_base_info():
    """Crawl the base district/category/region data and persist it to CSV."""
    print("开始抓取基础数据...")
    content = crawlLib.Crawler(SH_URL).parse_content()
    districts, category = get_all_base_info(content)
    regions = get_all_regions(districts)
    print("基础数据抓取完成!")
    # Disabled extras kept for reference:
    # subcategory = get_all_subcategory(category)
    # csvLib.write_records_to_csv(CBD_CSV, cbd, FIELD_DELIMITER)
    # csvLib.write_records_to_csv(METRO_CSV, metros, FIELD_DELIMITER)
    # csvLib.write_records_to_csv(SUBCATEGORY_CSV, subcategory, FIELD_DELIMITER)
    for csv_path, records in ((DISTRICT_CSV, districts),
                              (CATEGORY_CSV, category),
                              (REGION_CSV, regions)):
        csvLib.write_records_to_csv(csv_path, records, FIELD_DELIMITER)
    print("数据写入完成!")
def test_ljs():
    """Scan lu.com product listings and open the first qualifying product.

    A product qualifies when amount / rate <= 10500; on a hit the product
    page is opened in the browser and the process exits with status 1.
    """
    base_link = "https://list.lu.com"
    # BUGFIX: the query string had been HTML-entity-mangled — "&curren"
    # had collapsed to "¤" and "&not" to "¬" — restoring the intended
    # &currentPage=1 and &notHasBuyFeeRate= parameters.
    url = ("https://list.lu.com/list/r030?minMoney=50000&maxMoney=55000"
           "&minDays=&maxDays=&minRate=&maxRate=&mode=&subType=&instId="
           "&haitongGrade=&fundGroupId=&searchWord=&trade=&isCx="
           "&currentPage=1&orderType=R030_INVEST_RATE&orderAsc=false"
           "&notHasBuyFeeRate=&rootProductCategoryEnum=")
    content = crawlLib.Crawler(url).parse_content()
    prod_list = content.find("ul", class_="main-list").find_all(
        "li", class_="product-list clearfix has-bottom is-2col ")
    for prod in prod_list:
        prod_link = base_link + prod.find("dt", class_="product-name").a['href']
        prod_rate = float(prod.find("li", class_="interest-rate").find(
            "p", class_="num-style").text.replace("%", ""))
        prod_amount = float(prod.find("div", class_="product-amount").find(
            "em", class_="num-style").text.replace(",", ""))
        if prod_amount / prod_rate <= 10500:
            print("{0} {1} {2}".format(prod_link, prod_rate, prod_amount))
            webbrowser.open_new(prod_link)
            sys.exit(1)
def crawl_all_shops_by_category(category, order_type, score_threshold, limit_num=1000):
    # Crawl up to `limit_num` shops of one category, paging through the
    # listing (at most 50 pages) until a shop's taste score drops below
    # `score_threshold` or the limit is exceeded.
    # Returns four parallel lists of tuples: (info, heat, score, comment).
    category_id = category.category_id
    category_code = category.category_code
    category_name = category.category_name
    # order_type selects the listing sort order via a URL suffix.
    url = SH_URL + "/" + category_code + get_seq_suffix_from_type(order_type)
    stop_flag = 0  # set to 1 when the stop condition fires inside the inner loop
    total_num = 0  # shops examined so far (includes the one that triggers the stop)
    info_list = []
    heat_list = []
    score_list = []
    cmt_list = []
    for page in range(1, 51):  # listing exposes at most 50 pages
        if stop_flag == 1:
            # NOTE(review): this completion message only prints when the stop
            # condition fired, not when all 50 pages were exhausted.
            print "【{0}】爬取完成,总数据条数:{1}".format(category_name, len(info_list))
            break
        page_num = "p{0}".format(page)
        p_url = url + page_num
        print p_url
        content = crawlLib.Crawler(p_url).parse_content()
        for c in content.find("div", id="shop-all-list").find_all("li", class_=""):
            result = get_shop_result(c, category_id)
            if result is None:
                continue
            score = result.get_score()
            heat = result.get_heat()
            info = result.get_info()
            comment = result.get_comment()
            total_num += 1
            # Presumably the listing is score-ordered, so the first shop below
            # the threshold ends the crawl; that shop itself is NOT recorded.
            if float(score.taste_score
                     ) < score_threshold or total_num > limit_num:
                stop_flag = 1
                break
            info_list.append(info.to_tuple())
            heat_list.append(heat.to_tuple())
            score_list.append(score.to_tuple())
            cmt_list.append(comment.to_tuple())
    return info_list, heat_list, score_list, cmt_list
def test_xy():
    """Scan yingzt.com investment listings and open the first qualifying one.

    A project qualifies when 3 < months <= 6 and amount > 5000; on a hit
    its page is opened in the browser and the process exits with status 1.
    """
    csrf_token = "85316bf555379961d6c0752652bc30eb"
    url = ("https://www.yingzt.com/invest/apiList?app_ver=2&loanGroup=101"
           "&period=ALL&interest=ALL&repay=ALL&order=DESC&orderBy=available"
           "&p1=1&_fromAjax_=1&_csrfToken_={0}&_=1504344376474").format(
               csrf_token)
    content = crawlLib.Crawler(url).crawl()
    content = json.loads(content)['data']['html']
    from bs4 import BeautifulSoup
    html = BeautifulSoup(content, "lxml")
    for proj in html.find_all("li", class_="clearfix"):
        p = proj.find("div", class_="info-top")
        proj_name = p.text.strip()
        proj_link = p.a['href']
        # BUGFIX: the accumulator previously shadowed the builtin `str`;
        # build the detail text with join instead of repeated +=.
        details = "".join(
            "\t" + item.find("p").text
            for item in proj.find("ul", class_="info-detail").find_all("li"))
        months, amount, link = filter_months(proj_name + details + '\t' + proj_link)
        if months > 3 and months <= 6 and amount > 5000:
            print("{0} {1} {2}".format(link, months, amount))
            webbrowser.open_new(link)
            sys.exit(1)