def get_geo_from_address(address):
    """Resolve *address* (Shanghai) to an entity.Location via Baidu APIs.

    Tries the place-suggestion API first; falls back to the geocoder API
    when the suggestion call yields no usable location.  Returns None when
    both lookups fail.
    """
    # Only the first whitespace-separated token is queried — presumably the
    # significant part of the address; TODO confirm against callers.
    address = address.split(' ')[0]
    suggest_url = ("http://api.map.baidu.com/place/v2/suggestion?region=上海市"
                   "&city_limit=true&query={0}&ak={1}&output=json").format(
                       address, BAIDU_APP_KEY)
    # BUGFIX: on a non-zero API status the response may omit 'result'
    # entirely; .get() lets us fall through to the geocoder instead of
    # raising KeyError.
    results = crawlLib.Crawler(suggest_url).to_json().get('result') or []
    if results and 'location' in results[0]:
        loc = results[0]['location']
        return entity.Location(loc['lng'], loc['lat'])
    geocode_url = ("http://api.map.baidu.com/geocoder/v2/?city=上海市"
                   "&address={0}&ak={1}&output=json").format(
                       address, BAIDU_APP_KEY)
    obj = crawlLib.Crawler(geocode_url).to_json()
    if obj['status'] == 0:  # 0 == success per Baidu geocoder convention
        loc = obj['result']['location']
        return entity.Location(loc['lng'], loc['lat'])
    return None
def get_shop_review_star_num(shop_id):
    """Scrape the review-star breakdown for one shop.

    Returns (comment_num, star_5, star_4, star_3, star_2, star_1) as raw
    text counts, or six Nones when the page cannot be parsed.
    """
    data = crawlLib.Crawler(
        REVIEW_URL.format(shop_id)).parse_content(mode="complex")
    try:
        # Query the DOM once instead of six times; dd[0] is the total
        # comment count, dd[1..5] are the 5..1 star buckets.
        dds = data.find("div", class_="comment-star").find_all("dd")
        # Each count is wrapped in one extra char on each side ("(123)"),
        # hence the [1:-1] strip.
        counts = [dd.find("em", class_="col-exp").text[1:-1] for dd in dds[:6]]
        comment_num, star_5_num, star_4_num, star_3_num, star_2_num, star_1_num = counts
    except Exception:
        # Best-effort: any markup change yields all-None rather than a crash.
        comment_num = star_5_num = star_4_num = star_3_num = star_2_num = star_1_num = None
    return comment_num, star_5_num, star_4_num, star_3_num, star_2_num, star_1_num
def get_all_available_routes(origin, destination):
    """Query Baidu transit directions between two locations.

    Returns the (routes, taxi) portions of the API result.
    """
    start = "{0},{1}".format(origin.lat, origin.lng)
    end = "{0},{1}".format(destination.lat, destination.lng)
    url = ("http://api.map.baidu.com/direction/v2/transit?tactics_incity=4"
           "&origin={0}&destination={1}&ak={2}").format(start, end, BAIDU_APP_KEY)
    payload = crawlLib.Crawler(url).to_json()
    result = payload['result']
    return result['routes'], result['taxi']
def get_all_regions_by_district(district):
    """List the sub-regions of one district from its navigation page.

    *district* is a (name, code, id) triple; returns a list of
    (text, href-tail, href-tail-minus-first-char, district_id) tuples.
    """
    district_name, district_code, district_id = district
    page = crawlLib.Crawler(SH_URL + "/" + district_code).parse_content()
    nav = page.find("div", id="region-nav-sub", class_="nc-items nc-sub")
    regions = []
    for anchor in nav.find_all(tag_filter):
        href = anchor['href']
        slash = href.rfind("/")
        regions.append(
            (anchor.text, href[slash + 1:], href[slash + 2:], district_id))
    return regions
def get_shop_details(shop_id):
    """Fetch shop statistics from the shop-details JSON endpoint.

    Returns (phone_no, total_hits, today_hits, monthly_hits, weekly_hits,
    last_week_hits, lat, lng).
    """
    info = crawlLib.Crawler(
        SHOP_DETAILS_URL.format(shop_id)).to_json()['msg']['shopInfo']
    return (info['phoneNo'], info['hits'], info['todayHits'],
            info['monthlyHits'], info['weeklyHits'], info['prevWeeklyHits'],
            info['glat'], info['glng'])
def get_all_subcategory_by_category(category):
    """List the sub-categories of one category; empty list on parse failure.

    *category* is a (name, code, id) triple; returns a list of
    (text, href-tail, href-tail-minus-first-char, category_id) tuples.
    """
    category_name, category_code, category_id = category
    category_url = SH_URL + "/" + category_code
    print(category_url)  # debug trace of the page being fetched
    page = crawlLib.Crawler(category_url).parse_content()
    try:
        nav = page.find("div", id="classfy-sub", class_="nc-items nc-sub")
        subcategories = []
        for anchor in nav.find_all(tag_filter):
            href = anchor['href']
            slash = href.rfind("/")
            subcategories.append(
                (anchor.text, href[slash + 1:], href[slash + 2:], category_id))
        return subcategories
    except Exception:
        # Some category pages have no sub-category navigation block.
        return []
def crawl_all_base_info():
    """Crawl the base district/category/region data and persist it to CSV."""
    print("开始抓取基础数据...")
    content = crawlLib.Crawler(SH_URL).parse_content()
    districts, category = get_all_base_info(content)
    regions = get_all_regions(districts)
    print("基础数据抓取完成!")
    # Disabled extras kept for reference:
    # subcategory = get_all_subcategory(category)
    # csvLib.write_records_to_csv(CBD_CSV, cbd, FIELD_DELIMITER)
    # csvLib.write_records_to_csv(METRO_CSV, metros, FIELD_DELIMITER)
    # csvLib.write_records_to_csv(SUBCATEGORY_CSV, subcategory, FIELD_DELIMITER)
    for csv_path, records in ((DISTRICT_CSV, districts),
                              (CATEGORY_CSV, category),
                              (REGION_CSV, regions)):
        csvLib.write_records_to_csv(csv_path, records, FIELD_DELIMITER)
    print("数据写入完成!")
def test_ljs():
    """Scan lu.com product listings and open the first qualifying product.

    A product qualifies when amount / rate <= 10500; on a hit the product
    page is opened in the browser and the process exits with status 1.
    """
    base_link = "https://list.lu.com"
    # BUGFIX: the query string had been HTML-entity-mangled — "&curren"
    # had collapsed to "¤" and "&not" to "¬" — restoring the intended
    # &currentPage=1 and &notHasBuyFeeRate= parameters.
    url = ("https://list.lu.com/list/r030?minMoney=50000&maxMoney=55000"
           "&minDays=&maxDays=&minRate=&maxRate=&mode=&subType=&instId="
           "&haitongGrade=&fundGroupId=&searchWord=&trade=&isCx="
           "&currentPage=1&orderType=R030_INVEST_RATE&orderAsc=false"
           "&notHasBuyFeeRate=&rootProductCategoryEnum=")
    content = crawlLib.Crawler(url).parse_content()
    prod_list = content.find("ul", class_="main-list").find_all(
        "li", class_="product-list clearfix has-bottom is-2col ")
    for prod in prod_list:
        prod_link = base_link + prod.find("dt", class_="product-name").a['href']
        prod_rate = float(prod.find("li", class_="interest-rate").find(
            "p", class_="num-style").text.replace("%", ""))
        prod_amount = float(prod.find("div", class_="product-amount").find(
            "em", class_="num-style").text.replace(",", ""))
        if prod_amount / prod_rate <= 10500:
            print("{0} {1} {2}".format(prod_link, prod_rate, prod_amount))
            webbrowser.open_new(prod_link)
            sys.exit(1)
def crawl_all_shops_by_category(category, order_type, score_threshold, limit_num=1000):
    # Crawl up to `limit_num` shops of one category, paging through the
    # listing (at most 50 pages) until a shop's taste score drops below
    # `score_threshold` or the limit is exceeded.
    # Returns four parallel lists of tuples: (info, heat, score, comment).
    category_id = category.category_id
    category_code = category.category_code
    category_name = category.category_name
    # order_type selects the listing sort order via a URL suffix.
    url = SH_URL + "/" + category_code + get_seq_suffix_from_type(order_type)
    stop_flag = 0  # set to 1 when the stop condition fires inside the inner loop
    total_num = 0  # shops examined so far (includes the one that triggers the stop)
    info_list = []
    heat_list = []
    score_list = []
    cmt_list = []
    for page in range(1, 51):  # listing exposes at most 50 pages
        if stop_flag == 1:
            # NOTE(review): this completion message only prints when the stop
            # condition fired, not when all 50 pages were exhausted.
            print "【{0}】爬取完成,总数据条数:{1}".format(category_name, len(info_list))
            break
        page_num = "p{0}".format(page)
        p_url = url + page_num
        print p_url
        content = crawlLib.Crawler(p_url).parse_content()
        for c in content.find("div", id="shop-all-list").find_all("li", class_=""):
            result = get_shop_result(c, category_id)
            if result is None:
                continue
            score = result.get_score()
            heat = result.get_heat()
            info = result.get_info()
            comment = result.get_comment()
            total_num += 1
            # Presumably the listing is score-ordered, so the first shop below
            # the threshold ends the crawl; that shop itself is NOT recorded.
            if float(score.taste_score
                     ) < score_threshold or total_num > limit_num:
                stop_flag = 1
                break
            info_list.append(info.to_tuple())
            heat_list.append(heat.to_tuple())
            score_list.append(score.to_tuple())
            cmt_list.append(comment.to_tuple())
    return info_list, heat_list, score_list, cmt_list
def test_xy():
    """Scan yingzt.com investment listings and open the first qualifying one.

    A project qualifies when 3 < months <= 6 and amount > 5000; on a hit
    its page is opened in the browser and the process exits with status 1.
    """
    csrf_token = "85316bf555379961d6c0752652bc30eb"
    url = ("https://www.yingzt.com/invest/apiList?app_ver=2&loanGroup=101"
           "&period=ALL&interest=ALL&repay=ALL&order=DESC&orderBy=available"
           "&p1=1&_fromAjax_=1&_csrfToken_={0}&_=1504344376474").format(
               csrf_token)
    content = crawlLib.Crawler(url).crawl()
    content = json.loads(content)['data']['html']
    from bs4 import BeautifulSoup
    html = BeautifulSoup(content, "lxml")
    for proj in html.find_all("li", class_="clearfix"):
        p = proj.find("div", class_="info-top")
        proj_name = p.text.strip()
        proj_link = p.a['href']
        # BUGFIX: the accumulator previously shadowed the builtin `str`;
        # build the detail text with join instead of repeated +=.
        details = "".join(
            "\t" + item.find("p").text
            for item in proj.find("ul", class_="info-detail").find_all("li"))
        months, amount, link = filter_months(proj_name + details + '\t' + proj_link)
        if months > 3 and months <= 6 and amount > 5000:
            print("{0} {1} {2}".format(link, months, amount))
            webbrowser.open_new(link)
            sys.exit(1)