def zxcs_download(ori_url): content = tools.get_html_content(ori_url, "body .wrap #pleft .pagefujian .down_2 a") for item in content.items(): url = item.attr.href text = item.text() print "zxcs_download: ", url, text zxcs_rar(url)
def get_mnewest(ori_param_json): print ori_param_json["url"] + "============================" mnewest = tools.get_html_content(ori_param_json["url"], "div", "class", "mnewest") list_ul = mnewest[0].find_all("a") for item in list_ul: if len(item.contents) == 2 and item.img: # print item # print "------------------------------------------------------------" # print item.get("href") # print item.img.get("src") # item.img.get("src") AttributeError: 'NoneType' object has no attribute 'get' # print item.img.get("title") param_json = {} param_json["url"] = "%s%s" % (PV190_ROOTURL, item.get("href")) img = item.img.get("src") param_json["src"] = "%s%s" % (PV190_ROOTURL, img) if ( img.startswith("/")) else img param_json["dir"] = ori_param_json["dir"] param_json["file"] = item.img.get("title") get_pagecon(param_json) list_ul = mnewest[0].div.find_all("a") for item in list_ul: if item.get_text().find("下一页") > 0: nextpage_url = item.get("href") if len(nextpage_url) > 5: param_json = {} param_json["url"] = "%s%s" % (PV190_ROOTURL, nextpage_url) if ( nextpage_url.startswith("/")) else nextpage_url param_json["dir"] = ori_param_json["dir"] get_mnewest(param_json)
def zxcs_rar(ori_url): content = tools.get_html_content(ori_url, "body .wrap .content .downfile a") for item in content.items(): url = item.attr.href text = item.text() print "zxcs_rar: ", url, text tools.write_file(url, "%s%s%s" % (OUTFILE, text, ".txt"))
def zadzs_download(ori_url, sort): content = tools.get_html_content(ori_url, "body .wrap .content") book_name = content.find("h2").text() for item in content.items("div[class='panel-body'] a:contains('TXT格式下载')"): url = "%s%s" % (ROOT_URL, item.attr.href) print "zadzs_download: ", url, item.text() zadzs_book_info(url, sort, book_name)
def get_fangxie_list(ori_url): house_list = [] print "get_fangxie_list: %s" % ori_url content = tools.get_html_content(ori_url, "div", "class", "right_cont") if content: #获取房产信息列表 ul_list = content[0].ul.contents for item in ul_list: if item.span: url_json = {} url_json["ori_url"] = "%s%s" % (CDFANGXIE, item.span.a.get("href")) zone = item.span.a.get("title").split("|") url_json["zone"] = zone[0] url_json["name"] = zone[1] house = get_fangxie_info(url_json) if house and len(house) > 0: house_list.append(house) #获取下一页链接 page_list = content[0].div.b if page_list: for item in page_list: if type(item) == type(content[0]) and item.text == u"下一页": pass next_house_list = get_fangxie_list( "%s%s" % (CDFANGXIE, item["href"])) if next_house_list and len(next_house_list) > 0: house_list.extend(next_house_list) break return house_list
def get_fangxie_info(url_json): print url_json content = tools.get_html_content(url_json["ori_url"], "div", "class", "infor") if content and len(content) > 0: p_list = content[0].find_all("p") if p_list and len(p_list) > 0: for item in p_list: span = item.span if not span.text: continue data = str(span.text) print data if data.startswith(u"项目咨询电话:"): url_json["phone"] = data.split(u":")[1] elif data.startswith(u"预/现售证号:"): url_json["sale"] = data.split(":")[1] elif data.startswith(u"房屋用途:"): url_json["usage"] = data.split(u":")[1] elif data.startswith(u"预售面积(平方米):"): url_json["area"] = data.split(u":")[1] elif data.startswith(u"上市时间:"): url_json["date"] = data.split(u":")[1] elif data.startswith(u"购房登记规则点击下载"): try: url_json["url_reg"] = span.span.a["href"] except Exception, err: print err elif data.startswith(u"成品住房装修方案价格表点击下载"): try: url_json["url_house"] = span.span.a["href"] except Exception, err: print err
def get_zxcs_latest(ori_url):
    """Download every book linked from the latest-books listing page.

    :param ori_url: URL of the zxcs latest listing.
    :return: None
    """
    links = tools.get_html_content(ori_url, ".wrap #content a")
    for anchor in links.items():
        zxcs_download(anchor.attr.href)
def get_wp_n_m(ori_url):
    """Walk each category in the 'wp_n_m_r_t' menu and crawl its listing.

    :param ori_url: URL of the page carrying the category menu.
    """
    menu = tools.get_html_content(ori_url, "span", "class", "wp_n_m_r_t")
    for entry in menu:
        sub_url = entry.a.get("href")
        # Only site-relative category links are followed.
        if not sub_url.startswith("/"):
            continue
        print(entry.a.get_text())
        param_json = {
            "url": "%s%s" % (PV190_ROOTURL, sub_url),
            "dir": entry.a.get_text(),
        }
        get_mnewest(param_json)
def get_zadzs_all(ori_url):
    """Enumerate the zadzs category menu and kick off per-category crawling.

    NOTE(review): both `break` statements stop after the first sub-link of the
    first menu section — looks like a deliberate debug limiter; behavior is
    preserved unchanged.

    :param ori_url: URL of the zadzs home/menu page.
    """
    sections = tools.get_html_content(
        ori_url, "body .g-bdw div[id=J_MM][class=m-menu] .sub")
    for section in sections.items():
        title = section.find("h3").text()
        for link in section.find("a").items():
            url = "%s%s" % (ROOT_URL, link.attr.href)
            zadzs_sort(url, url, title)
            break
        break
def zxcs_sort(ori_url):
    """Crawl one zxcs category page, then recurse into the next page.

    :param ori_url: URL of the category listing page.
    """
    pleft = tools.get_html_content(ori_url, "body .wrap #pleft")
    # Download every book listed on this page.
    for book in pleft.find("dl[id=plist] dt a").items():
        zxcs_download(book.attr.href)
    # The current page number sits in the pager's <span>; the anchor whose
    # text equals current+1 is the next page (absent on the last page, so the
    # loop below simply does not run and the recursion terminates).
    current = pleft.find("div[id=pagenavi] span").text()
    next_selector = "div[id=pagenavi] a:contains('" + str(int(current) + 1) + "')"
    for pager in pleft.find(next_selector).items():
        zxcs_sort(pager.attr.href)
def zadzs_sort(ori_url, head_url, sort): print "zadzs_sort: ", ori_url content = tools.get_html_content(ori_url, "body .g-mnc") book_list = content.find("tbody tr div[class=book-name] a") for item in book_list.items(): url = "%s%s" % (ROOT_URL, item.attr.href) # print "zadzs_sort: ", sort, item.text(), url zadzs_bookview(url, sort) break nexit_page = content.find("span[class=nums] a:contains('下一页')") for item in nexit_page.items(): url = "%s%s" % (head_url, item.attr.href) # print url, item.text() zadzs_sort(url, head_url, sort)
def get_pagecon(ori_param_json): print ori_param_json["url"] + "============================" flag = ori_param_json["file"] torrent_data = "" img_data = "\n" + ori_param_json["src"] flag_data = "\n%s%s%s" % (flag, FLAG, ori_param_json["src"]) data_filepath = "%s%s%s" % (OUT_DIR, ori_param_json["dir"].decode( 'utf8').encode('gb2312'), PV190_FILE) torrent_filepath = "%s%s%s" % (OUT_DIR, ori_param_json["dir"].decode( 'utf8').encode('gb2312'), PV190_TORRENT) flag_filepath = "%s%s%s" % (OUT_DIR, ori_param_json["dir"].decode( 'utf8').encode('gb2312'), FLAG_FILE) pagecon = tools.get_html_content(ori_param_json["url"], "div", "class", "pagecon") if len(pagecon) <= 0: return list_ul = pagecon[0].find_all("img") for item in list_ul: img = item.get("src") img_url = "%s%s" % (PV190_ROOTURL, img) if ( img.startswith("/")) else img print img_url img_data = "%s\n%s" % (img_data, img_url) flag_data = "%s\n%s%s%s" % (flag_data, flag, FLAG, img_url) torrent = pagecon[0].get_text() torrent_url_list = torrent.split(HTTP_SPLIT) if len(torrent_url_list) <= 1: return for item in torrent_url_list: if item.find(".torrent") >= 0: torrent_url = HTTP_SPLIT + item[:(item.find(".torrent") + 8)] print torrent_url torrent_data = "%s\n%s" % (torrent_data, torrent_url) flag_data = "%s\n%s%s%s" % (flag_data, flag, FLAG, torrent_url) tools.write_file(torrent_data, torrent_filepath) tools.write_file(img_data, data_filepath) tools.write_file(flag_data, flag_filepath)
def get_zxcs_all(ori_url):
    """Iterate every category link on the zxcs sort index and crawl each.

    :param ori_url: URL of the zxcs index page.
    """
    categories = tools.get_html_content(
        ori_url, "body .wrap #sort ul li a:contains('(')")
    for category in categories.items():
        zxcs_sort(category.attr.href)
def zadzs_bookview(ori_url, sort): content = tools.get_html_content(ori_url, "body div[class=ops] a") for item in content.items(): url = "%s%s" % (ROOT_URL, item.attr.href) print "zadzs_rar: ", url, item.text() zadzs_download(url, sort)