def appCommunityHandlerNoStyle(community_id):
    """Request the app strategy URL for one community through the proxy pool.

    The URL is ``lianjia_app_host`` followed by the stringified community id.
    Each entry of the module-level ``proxies`` list is passed as the
    ``proxies`` argument of ``requests.get`` (2 second timeout) until one
    request answers with HTTP 200. After every failed attempt (exception or
    non-200 status) the global pool is replaced with the result of
    ``our_proxy.getListProxies()``.

    Args:
        community_id: Identifier of the community; converted with ``str``.

    Returns:
        None.
    """
    global proxies
    cid = str(community_id)
    print('3->>>>>hand community:' + cid + ' start=====')
    app_url = lianjia_app_host + cid
    print(app_url)
    # One attempt per proxy that is in the pool on entry. The previous
    # ``range(0, len(proxies) - 1)`` never tried the last proxy and made no
    # request at all when the pool held a single entry.
    for attempt in range(len(proxies)):
        if not proxies:
            # The refreshed pool came back empty; nothing left to try.
            break
        # The pool may have been replaced by a shorter list on an earlier
        # iteration, so wrap the index instead of risking an IndexError.
        # (Assigning to the loop variable, as the old code did, has no
        # effect: the ``for`` statement rebinds it on every iteration.)
        p = proxies[attempt % len(proxies)]
        try:
            strategy_res = requests.get(app_url, proxies=p, timeout=2)
        except Exception:
            print('3->>>>>hand community get error:' + cid + ' end=====' +
                  str(p))
            proxies = our_proxy.getListProxies()
            continue
        if strategy_res.status_code == 200:
            # NOTE(review): the successful response is not used or returned
            # in the code visible here - confirm whether handling is missing.
            break
        print('3->>>>>hand community http code error:' + cid + ' end=====' +
              str(p))
        proxies = our_proxy.getListProxies()
def httpGet(url):
    """Fetch ``url`` through the proxy pool and return the parsed page.

    A ``User-Agent`` header with randomized version numbers is built for the
    call. Each entry of the module-level ``proxies`` list is passed as the
    ``proxies`` argument of ``session.get`` (2 second timeout) until one
    request completes without raising. After every failed attempt the global
    pool is replaced with the result of ``our_proxy.getListProxies()``.

    Any response that comes back counts as success; the HTTP status code is
    not inspected.

    Args:
        url: Address to fetch.

    Returns:
        A ``BeautifulSoup`` tree built from the response text, or ``None``
        when no proxy produced a response.
    """
    global proxies
    # Random version components for the User-Agent string.
    a = str(random.randint(1, 10))
    aa = str(random.randint(1, 10))
    b = str(random.randint(1, 11))
    c = str(random.randint(1, 12))
    d = str(random.randint(1, 8))
    user_agent = ('Mozilla/' + a + '.' + aa + ' (Macintosh; Intel Mac OS X ' +
                  b + '_' + c + '_' + d + ')')
    headers = {'User-Agent': user_agent}
    session = requests.session()
    page = None
    # One attempt per proxy that is in the pool on entry. The previous
    # ``range(0, len(proxies) - 1)`` skipped the last proxy.
    for attempt in range(len(proxies)):
        if not proxies:
            # The refreshed pool came back empty; nothing left to try.
            break
        # Wrap the index: the pool may have been replaced by a shorter list.
        p = proxies[attempt % len(proxies)]
        try:
            page = session.get(url, proxies=p, headers=headers, timeout=2)
        except Exception:
            proxies = our_proxy.getListProxies()
            print('http get failed,proxy:' + str(p))
            continue
        # The old ``type(page) != 'NoneType'`` test compared a type object
        # with a string and was therefore always true; a returned response
        # is simply treated as success.
        print('http get ok,proxy:' + str(p))
        break
    if page is None:
        return None
    # Callers in this file call ``find``/``find_all`` on the result, so hand
    # back a parsed tree.
    # NOTE(review): the parser choice ('html.parser') is an assumption -
    # confirm against the parser the project intends to use.
    return BeautifulSoup(page.text, 'html.parser')
def getPageCommunityCount(url):
    """Return the community count shown on the list page at ``url``.

    The page is fetched with ``httpGet``; the count is the ``string`` of the
    first ``span`` inside the ``div`` whose class is ``list-head clear``.
    On any exception the global ``proxies`` pool is replaced with the result
    of ``our_proxy.getListProxies()`` and the fetch is retried, with no
    upper bound on the number of retries.

    Args:
        url: Address of the list page.

    Returns:
        The ``string`` attribute of the located ``span`` element.
    """
    global proxies
    # Retry in a loop rather than by recursion so that a long run of
    # failures cannot exhaust the interpreter's recursion limit.
    while True:
        try:
            page_soup = httpGet(url)
            page_community_count_div = page_soup.find(
                'div', attrs={'class': 'list-head clear'})
            # The count used to be stored in a local and never returned,
            # although the caller converts this function's result to int.
            return page_community_count_div.find('span').string
        except Exception:
            print('get community count failed,try again:' + url)
            proxies = our_proxy.getListProxies()
def listHandler(url):
    """Process every result page of the community list rooted at ``url``.

    Refreshes the module-level proxy pool, reads the total community count
    with ``getPageCommunityCount``, derives the number of pages at 20
    communities per page, and calls ``pageHandler`` for each page URL of the
    form ``url + 'd<N>/'`` with N starting at 1.

    Args:
        url: Base address of the list; page suffixes are appended to it.

    Returns:
        None.
    """
    # Without this declaration the assignment below only created an unused
    # local, so the refreshed pool never reached the functions that read the
    # module-level ``proxies``.
    global proxies
    proxies = our_proxy.getListProxies()
    print('1->>>>>hand list:' + url + ' start=====')
    p_community_count = getPageCommunityCount(url)
    # Round up so a partially filled last page is still visited.
    cc = int(math.ceil(int(p_community_count) / float(20)))
    print('community total:' + str(p_community_count) + ';page total:' +
          str(cc))
    for page_number in range(1, cc + 1):
        pageHandler(url + 'd' + str(page_number) + '/')
    print('1->>>>>hand list:' + url + ' end=====')
def setDistrictName(district_url):
    """Read the district name from a district page and create its folders.

    The page is fetched with ``httpGet``. The name is the text of the third
    ``a`` element inside the ``div`` whose class is ``fl l-txt``. The name is
    then passed to ``mkdirPcDoc`` and ``mkdirAppDoc``. On any exception in
    these steps the global ``proxies`` pool is replaced with the result of
    ``our_proxy.getListProxies()`` and the whole sequence is retried, with no
    upper bound on the number of retries.

    Args:
        district_url: Address of the district page.

    Returns:
        The district name text taken from the page.
    """
    global proxies
    # Retry in a loop rather than by recursion so that a long run of
    # failures cannot exhaust the interpreter's recursion limit.
    while True:
        try:
            page = httpGet(district_url)
            l_txt = page.find('div', attrs={'class': 'fl l-txt'})
            l_txt_a = l_txt.find_all('a')
            district_name = l_txt_a[2].text
            mkdirPcDoc(district_name)
            mkdirAppDoc(district_name)
            return district_name
        except Exception:
            # The previous message was copied from the community-count
            # function and reported the wrong operation.
            print('get district name failed,try again:' + district_url)
            proxies = our_proxy.getListProxies()
# -*- coding: utf-8 -*- import shield_proxy, our_proxy, comm_mapping import requests from bs4 import BeautifulSoup import re import random import os.path, sys import math import time import json reload(sys) sys.setdefaultencoding('utf8') lianjia_host = "http://sh.lianjia.com" lianjia_app_host = "http://m.sh.lianjia.com/api/v1/m/strategy/contents/" proxies = our_proxy.getListProxies() dirname, filename = os.path.split(os.path.abspath(sys.argv[0])) print "running from", dirname print "file is", filename home = dirname + '/链家' pc = home + '/pc' app = home + '/app' district_name = '' if (not os.path.exists(home)): os.makedirs(home) if (not os.path.exists(pc)): os.makedirs(pc) if (not os.path.exists(app)):