def start_requests(self):
    """Reset the district table and request the listing page of every city.

    Calls ``truncate_table('t_web_lj_district')``, then reads ``id`` and
    ``url`` of every ``t_web_lj_city`` row whose ``url`` is not null. For
    each row a request to ``<url>ershoufang/`` is yielded; the city ``id``
    travels in ``meta['id']`` and the response is handled by
    ``self.get_district``.

    :return: generator of ``Request`` objects.
    """
    Mysql().truncate_table('t_web_lj_district')

    city_sql = ''' select id,url from t_web_lj_city where url is not null '''
    city_rows = Mysql().query_by_sql(city_sql)

    for city in city_rows:
        listing_url = city['url'] + 'ershoufang/'
        request_meta = {'id': city['id']}
        yield Request(
            listing_url,
            meta=request_meta,
            callback=self.get_district,
            dont_filter=True,
        )
def start_requests(self):
    """Reset the community table and request the listing page of every district.

    Calls ``truncate_table('t_web_lj_community')``, then joins
    ``t_web_lj_district`` to ``t_web_lj_city`` on ``city_id``. For each
    joined row a request to ``<city url>ershoufang/<district route>/`` is
    yielded; the district ``id`` travels in ``meta['id']`` and the response
    is handled by ``self.get_community``.

    :return: generator of ``Request`` objects.
    """
    Mysql().truncate_table('t_web_lj_community')

    district_sql = ''' select di.id,di.route,ci.url from t_web_lj_city ci,t_web_lj_district di where di.city_id=ci.id '''
    district_rows = Mysql().query_by_sql(district_sql)

    for district in district_rows:
        listing_url = district['url'] + 'ershoufang/' + district['route'] + '/'
        request_meta = {'id': district['id']}
        yield Request(
            listing_url,
            meta=request_meta,
            callback=self.get_community,
            dont_filter=True,
        )
def get_residence_url(self, response):
    """Schedule a detail request for every unseen residence on a listing page.

    Each ``<li>`` under ``/html/body/div[4]/div[1]/ul`` is treated as one
    residence entry. An entry is skipped when the lookup of its URL in
    ``t_web_lj_residence`` returns a list, or when one of the links it is
    built from is missing. For every other entry a request handled by
    ``self.get_residence_info`` is yielded, with ``meta['d_c']`` set to
    ``'<district>_<community>'`` (the second-to-last path segment of the
    entry's ``district`` and ``bizcircle`` links).

    If the pager element reports more pages than the current one, a request
    for the next page is yielded back to this callback.

    :param response: the downloaded listing page.
    :return: generator of ``Request`` objects.
    :raises ValueError, SyntaxError: if the pager's ``page-data`` attribute
        is not a plain literal.
    """
    # Local import keeps the block self-contained; only the pager needs it.
    import ast

    page = Selector(response)

    for entry_html in page.xpath('/html/body/div[4]/div[1]/ul/li').extract():
        entry = Selector(text=entry_html)

        url = entry.xpath('//*[@class="img"]/@href').extract_first()
        if url is None:
            # No link to follow; previously this surfaced as an exception
            # that aborted the rest of the page.
            continue

        # NOTE(review): the scraped URL is interpolated straight into the
        # SQL text. Switch to a parameterized query if Mysql.query_by_sql
        # supports one - its signature is not visible from here.
        is_exist = Mysql().query_by_sql(
            "select * from t_web_lj_residence where url='%s'" % url)
        # Presumably query_by_sql returns a list only when rows matched,
        # i.e. the residence is already stored - confirm against Mysql.
        if isinstance(is_exist, list):
            continue

        district_href = entry.xpath(
            '//*[@class="district"]/@href').extract_first()
        community_href = entry.xpath(
            '//*[@class="bizcircle"]/@href').extract_first()
        if district_href is None or community_href is None:
            # Without both links the 'd_c' key cannot be built.
            continue

        district = district_href.split('/')[-2]
        community = community_href.split('/')[-2]
        yield Request(url,
                      meta={'d_c': district + '_' + community},
                      callback=self.get_residence_info,
                      dont_filter=True)

    page_box = page.xpath(
        '//*[@class="page-box house-lst-page-box"]').extract_first()
    if page_box is None:
        return

    page_data = Selector(text=page_box).xpath('//@page-data').extract_first()
    if page_data is None:
        return

    # The attribute comes from a remote page, so it must never reach eval().
    # literal_eval accepts the same dict literal without executing code, and
    # parsing once replaces the two identical eval() calls.
    pager = ast.literal_eval(page_data)
    total_page = pager['totalPage']
    cur_page = pager['curPage']

    if total_page > cur_page:
        # Keep the URL up to and including the first '/' found at or after
        # index 30, then append the next page segment.
        # NOTE(review): the offset 30 is tied to the site's URL layout.
        base_url = response.url[0:response.url.find('/', 30) + 1]
        yield Request(base_url + 'pg' + str(cur_page + 1) + '/',
                      callback=self.get_residence_url,
                      dont_filter=True)
def start_requests(self):
    """Request the second-hand listing page of every stored community.

    Joins ``t_web_lj_community``, ``t_web_lj_district`` and
    ``t_web_lj_city`` and, for each row, yields a request to
    ``<city url>ershoufang/<community route>/co32/`` handled by
    ``self.get_esf_url``.

    :return: generator of ``Request`` objects.
    """
    community_sql = ''' select co.route,c.url from t_web_lj_community co,t_web_lj_district d,t_web_lj_city c where d.id=co.district_id and d.city_id=c.id '''
    community_rows = Mysql().query_by_sql(community_sql)

    for community in community_rows:
        listing_url = (
            community['url'] + 'ershoufang/' + community['route'] + '/co32/'
        )
        yield Request(listing_url,
                      callback=self.get_esf_url,
                      dont_filter=True)
def start_requests(self):
    """Fill ``self.d_c`` and request the residence listing of every community.

    Joins ``t_web_lj_community``, ``t_web_lj_district`` and
    ``t_web_lj_city``. In a first pass every row is recorded in
    ``self.d_c`` under the key ``'<district route>_<community route>'``
    with the community ``id`` as value. In a second pass a request to
    ``<city url>xiaoqu/<community route>/`` is yielded per row, handled by
    ``self.get_residence_url``.

    :return: generator of ``Request`` objects.
    """
    community_sql = ''' select co.id,di.route d_r,co.route c_r,ci.url from t_web_lj_community co,t_web_lj_district di,t_web_lj_city ci where co.district_id=di.id and di.city_id=ci.id; '''
    community_rows = Mysql().query_by_sql(community_sql)

    # The mapping is completed before the first request is yielded.
    # Presumably later callbacks look communities up in it - confirm.
    for row in community_rows:
        community_id = row['id']
        lookup_key = row['d_r'] + '_' + row['c_r']
        self.d_c[lookup_key] = community_id

    for row in community_rows:
        listing_url = row['url'] + 'xiaoqu/' + row['c_r'] + '/'
        yield Request(listing_url,
                      callback=self.get_residence_url,
                      dont_filter=True)
def start_requests(self):
    """Request the residence listing of every community, tagged with its city.

    Joins ``t_web_lj_community``, ``t_web_lj_district`` and
    ``t_web_lj_city`` and, for each row, yields a request to
    ``<city url>xiaoqu/<community route>/`` handled by
    ``self.get_residence_url``. The city's ``cn_name`` travels in
    ``meta['rsd_ci']``.

    :return: generator of ``Request`` objects.
    """
    community_sql = ''' select ci.cn_name,co.route c_r,ci.url from t_web_lj_community co,t_web_lj_district di,t_web_lj_city ci where co.district_id=di.id and di.city_id=ci.id; '''
    community_rows = Mysql().query_by_sql(community_sql)

    for row in community_rows:
        listing_url = row['url'] + 'xiaoqu/' + row['c_r'] + '/'
        request_meta = {'rsd_ci': row['cn_name']}
        yield Request(listing_url,
                      meta=request_meta,
                      callback=self.get_residence_url)
dct['ring_num'] = data['ring_num'] dct['lj_num'] = data['lj_num'] dct['house_age'] = data['house_age'] dct['property_type'] = data['property_type'] dct['house_type'] = data['house_type'] dct['house_owner'] = data['house_owner'] dct['listing_date'] = data['listing_date'] dct['total_price'] = data['total_price'] dct['unit_price'] = data['unit_price'] dct['last_deal'] = data['last_deal'] dct['mortgage'] = data['mortgage'] dct['house_backup'] = data['house_backup'] dct['bsn_dt'] = None dct['tms'] = time.strftime("%Y-%m-%d %X", time.localtime()) dct['url'] = data['url'] dct['webbst_nm'] = u'链家' dct['crawl_time'] = data['crawl_time'] dct['residence_url'] = None dct['residence_id'] = data['residence_id'] # line = json.dumps(OrderedDict(dct), ensure_ascii=False, sort_keys=False) + '\n' # f.write(line) # break Mysql().insert_by_dict('t_web_lj_esf', dct) end_time = datetime.now() print 'end_time:', end_time print 'seconds:', (end_time - start_time).seconds
def process_item(self, item, spider):
    """Hand the item to ``Mysql.insert_by_item`` and pass it on unchanged.

    :param item: the scraped item to store.
    :param spider: the spider that produced the item (not used here).
    :return: the same ``item`` object that was passed in.
    """
    storage = Mysql()
    storage.insert_by_item(item)
    return item