def start(self):
    """Crawl rental (zufang) listings for every area of the target city.

    Builds the district/area lists, opens one city-wide CSV file for the
    results, then fans the per-area crawling out to a thread pool.
    Progress and timing information is printed to stdout.
    """
    city = get_city()
    self.today_path = create_date_path("{0}/zufang".format(SPIDER_NAME), city, self.date_string)
    # collect_area_zufang('sh', 'beicai')  # For debugging, keep it here
    t1 = time.time()  # start timing

    # District list of the city; an optional third CLI argument restricts
    # the crawl to a single district.
    if len(sys.argv) == 3:
        districts = get_districts(city, sys.argv[2])
    else:
        districts = get_districts(city)
    print('City: {0}'.format(city))
    print('Districts: {0}'.format(districts))

    # All results for the city are written into one CSV file.
    district_file = self.today_path + "/city_{0}.csv".format(city)
    df = codecs.open(district_file, "w", 'utf_8_sig')
    try:
        ZuFangBaseSpider.districtCsv = csv.writer(df)
        ZuFangBaseSpider.districtCsv.writerow(
            ['区域', '板块', '小区', '户型', '面积', '租金', '元/平'])

        # Collect the areas (板块) of every district and remember the
        # area -> district mapping, e.g. {'beicai': 'pudongxinqu'}.
        areas = list()
        for district in districts:
            areas_of_district = get_areas(city, district)
            print('{0}: Area list: {1}'.format(district, areas_of_district))
            areas.extend(areas_of_district)
            for area in areas_of_district:
                area_dict[area] = district
        # area_dict['huangxinggongyuan'] = district
        # areas = ['huangxinggongyuan']
        print("Area:", areas)
        print("District and areas:", area_dict)

        # Thread-pool arguments: one ((city, area), None) tuple per area.
        nones = [None for _ in range(len(areas))]
        city_list = [city for _ in range(len(areas))]
        args = zip(zip(city_list, areas), nones)
        # areas = areas[0: 1]

        # One worker request per area; each worker writes its own data.
        pool_size = thread_pool_size
        pool = threadpool.ThreadPool(pool_size)
        my_requests = threadpool.makeRequests(self.collect_area_zufang_data, args)
        for req in my_requests:
            pool.putRequest(req)
        pool.wait()
        pool.dismissWorkers(pool_size, do_join=True)  # shut workers down when done
    finally:
        # BUGFIX: the CSV file handle was never closed before; close it once
        # all worker threads writing through districtCsv have finished.
        df.close()

    # Timing summary.
    t2 = time.time()
    print("Total crawl {0} areas.".format(len(areas)))
    print("Total cost {0} second to crawl {1} data items.".format(
        t2 - t1, self.total_num))
def start(self):
    """Crawl second-hand (ershou) listings for every area of the target city.

    Gathers all districts and their areas, records the area -> district
    mapping in the module-level ``area_dict``, then crawls each area in a
    thread pool via ``collect_area_ershou_data``.
    """
    city = get_city()
    self.today_path = create_date_path("{0}/ershou".format(SPIDER_NAME), city, self.date_string)

    # District list of the city.
    districts = get_districts(city)
    print('City: {0}'.format(city))
    print('Districts: {0}'.format(districts))

    # Collect the areas (板块) of every district and remember the
    # area -> district mapping, e.g. {'beicai': 'pudongxinqu'}.
    areas = list()
    for district in districts:
        areas_of_district = get_areas(city, district)
        print('{0}: Area list: {1}'.format(district, areas_of_district))
        areas.extend(areas_of_district)
        for area in areas_of_district:
            area_dict[area] = district
    print("Area:", areas)
    print("District and areas:", area_dict)

    # Thread-pool arguments: one ((city, area), None) tuple per area.
    nones = [None for _ in range(len(areas))]
    city_list = [city for _ in range(len(areas))]
    args = zip(zip(city_list, areas), nones)
    # areas = areas[0: 1]  # For debugging

    # One worker request per area; each worker writes its own file.
    pool_size = thread_pool_size
    pool = threadpool.ThreadPool(pool_size)
    my_requests = threadpool.makeRequests(self.collect_area_ershou_data, args)
    # IDIOM: plain loop instead of a side-effect-only list comprehension.
    for req in my_requests:
        pool.putRequest(req)
    pool.wait()
    pool.dismissWorkers(pool_size, do_join=True)  # shut workers down when done
def start(self):
    """Crawl new-home (loupan) data for the whole city and report timing."""
    target_city = get_city()
    print('Today date is: %s' % self.date_string)
    self.today_path = create_date_path("{0}/loupan".format(SPIDER_NAME), target_city, self.date_string)

    # Time the city-wide crawl.
    started_at = time.time()
    self.collect_city_loupan_data(target_city)
    finished_at = time.time()

    # Summary of the run.
    print("Total crawl {0} loupan.".format(self.total_num))
    print("Total cost {0} second ".format(finished_at - started_at))
def start(self):
    """Crawl rental (zufang) listings organized by metro line and station.

    Instead of administrative districts, this variant iterates the city's
    metro lines and uses each line's stations as the crawl "areas".
    Each station is crawled by a thread-pool worker.
    """
    city = get_city()
    self.today_path = create_date_path("{0}/zufang".format(SPIDER_NAME), city, self.date_string)
    # collect_area_zufang('sh', 'beicai')  # For debugging, keep it here
    t1 = time.time()  # start timing

    # districts = get_districts(city)
    # districts = ['nanshanqu', 'futianqu', 'baoanqu']
    # Metro lines play the role of "districts" here; each element is a
    # dict with at least a 'name' key (established by the usage below).
    districts = get_metro_lines(city)
    print('City: {0}\n'.format(city))
    # print('Districts: {0}'.format(districts))

    # Collect the stations of every line and remember the
    # station -> line mapping.
    areas = list()
    for district in districts:
        # areas_of_district = get_areas(city, district)  # by district
        # By metro line:
        areas_of_district = get_metro_stations(city, district)
        print('{0}号线: Area list:'.format(district['name']))
        areas.extend(areas_of_district)
        for area in areas_of_district:
            area_dict[area['name']] = district['name']
            print(area['name'])
        print('\n')
    # print("Area:", areas)
    # print("District and areas:", area_dict)

    # Thread-pool arguments: one ((city, area), None) tuple per station.
    nones = [None for _ in range(len(areas))]
    city_list = [city for _ in range(len(areas))]
    args = zip(zip(city_list, areas), nones)
    # areas = areas[0: 1]

    # One worker request per station; each worker writes its own file.
    pool_size = thread_pool_size
    pool = threadpool.ThreadPool(pool_size)
    my_requests = threadpool.makeRequests(self.collect_area_zufang_data, args)
    for req in my_requests:
        pool.putRequest(req)
    pool.wait()
    pool.dismissWorkers(pool_size, do_join=True)  # shut workers down when done

    # Timing summary.
    t2 = time.time()
    print("Total crawl {0} areas.".format(len(areas)))
    print("Total cost {0} second to crawl {1} data items.".format(
        t2 - t1, self.total_num))
    # BUGFIX: removed a stray trailing `print(district)` debug leftover — it
    # printed only the last loop value and raised NameError when the metro
    # line list was empty.
def start(self):
    """Crawl rental (zufang) listings; district/area pairs come pre-grouped.

    ``get_districts`` here returns both the plain district list and a
    ``result`` structure whose items carry an ``"area"`` list and a
    ``"districts"`` name. Each (city, district, area) triple becomes one
    thread-pool task for ``collect_area_zufang_data``.
    """
    city = get_city()
    self.today_path = create_date_path("{0}/zufang".format(SPIDER_NAME), city, self.date_string)
    t1 = time.time()  # start timing

    districts, result = get_districts(city)
    print('City: {0}'.format(city))
    print('Districts: {0}'.format(districts))

    # Build the thread-pool argument list: ([city, district, area], None)
    # per area, as expected by threadpool.makeRequests.
    paragram = list()
    for res in result:
        areas_of_district = res["area"]
        district_name = res["districts"]
        self.account += 1  # count processed districts
        for area in areas_of_district:
            paragram.append(([city, district_name, area], None))
    # print(paragram)

    # One worker request per area; each worker writes its own file.
    # BUGFIX: `pool_size = thread_pool_size` had been commented out while
    # `pool_size` was still used below, raising NameError at runtime.
    pool_size = thread_pool_size
    pool = threadpool.ThreadPool(pool_size)
    my_requests = threadpool.makeRequests(self.collect_area_zufang_data, paragram)
    for req in my_requests:
        pool.putRequest(req)
    # helper.commit()
    pool.wait()
    pool.dismissWorkers(pool_size, do_join=True)  # shut workers down when done

    # Timing summary.
    # BUGFIX: the old report used len(areas) on a list that was never
    # populated (always 0); report the real number of crawl tasks instead.
    t2 = time.time()
    print("Total crawl {0} areas.".format(len(paragram)))
    print("Total cost {0} second to crawl data items.".format(t2 - t1))
def start(self):
    """Crawl rental (zufang) listings; results go to per-district folders.

    Builds the district/area lists, creates one output directory per
    district via ``create_zufang_path`` (stored as ``self.path_dict``,
    e.g. {'tianhe': '/data/zufang/gz/20190527/tianhe/'}), then crawls
    each area in a thread pool via ``collect_area_zufang_info``.
    """
    t1 = time.time()  # start timing
    city = get_city()
    # print(city)

    # District list of the city.
    districts = get_districts(city)
    # print('City: {0}'.format(city))
    # print('Districts: {0}'.format(districts))

    # Collect the areas (板块) of every district and remember the
    # area -> district mapping.
    areas = list()
    for district in districts:
        area_of_district = get_areas(city, district)
        # print('{0} : Area_list : {1}'.format(district, area_of_district))
        areas.extend(area_of_district)
        for area in area_of_district:
            area_dict[area] = district
    # print('Area :', areas)
    # print('district and area :', area_dict)

    # Map each district to its output directory for this date.
    path_list = create_zufang_path(city, districts, self.date_string)
    self.path_dict = dict(zip(districts, path_list))
    # BUGFIX: previously printed a shadowing, always-empty local `path_dict`
    # instead of the mapping that was actually built.
    print(self.path_dict)
    # self.collect_area_zufang_info('gz','shiqiao1')

    # Thread-pool arguments: one ((city, area), None) tuple per area.
    nones = [None for _ in range(len(areas))]
    city_list = [city for _ in range(len(areas))]
    args = zip(zip(city_list, areas), nones)

    # One worker request per area; makeRequests splits each tuple into the
    # request value (passed to the worker) and the expected result value.
    pool_size = thread_pool_size
    pool = threadpool.ThreadPool(pool_size)
    my_request = threadpool.makeRequests(self.collect_area_zufang_info, args)
    for req in my_request:
        pool.putRequest(req)
    pool.wait()
    # Consistency with the sibling spiders: release the workers cleanly.
    pool.dismissWorkers(pool_size, do_join=True)

    # Timing summary.
    t2 = time.time()
    print("Total crawl {0} areas".format(len(areas)))
    print("Total cost {0} second to crawl {1} data items".format(
        t2 - t1, self.total_num))
def start(self):
    """Crawl residential-community (xiaoqu) data for every area of the city.

    Gathers all districts and their areas, records the area -> district
    mapping in the module-level ``area_dict``, then crawls each area in a
    thread pool via ``collect_area_xiaoqu_data`` and prints timing stats.
    """
    city = get_city()
    self.today_path = create_date_path("{0}/xiaoqu".format(SPIDER_NAME), city, self.date_string)
    t1 = time.time()  # start timing

    # # for test
    # area_dict['gumei'] = 'minhang'
    # areas = list(['gumei'])

    # District list of the city.
    districts = get_districts(city)
    print('City: {0}'.format(city))
    print('Districts: {0}'.format(districts))

    # Collect the areas (板块) of every district and remember the
    # area -> district mapping, e.g. {'beicai': 'pudongxinqu'}.
    areas = list()
    for district in districts:
        areas_of_district = get_areas(city, district)
        print('{0}: Area list: {1}'.format(district, areas_of_district))
        areas.extend(areas_of_district)
        for area in areas_of_district:
            area_dict[area] = district
    print("Area:", areas)
    print("District and areas:", area_dict)

    # Thread-pool arguments: one ((city, area), None) tuple per area.
    nones = [None for _ in range(len(areas))]
    city_list = [city for _ in range(len(areas))]
    args = zip(zip(city_list, areas), nones)
    # areas = areas[0: 1]

    # One worker request per area; each worker writes its own file.
    pool_size = thread_pool_size
    pool = threadpool.ThreadPool(pool_size)
    my_requests = threadpool.makeRequests(self.collect_area_xiaoqu_data, args)
    # IDIOM: plain loop instead of a side-effect-only list comprehension.
    for req in my_requests:
        pool.putRequest(req)
    pool.wait()
    pool.dismissWorkers(pool_size, do_join=True)  # shut workers down when done

    # Timing summary.
    t2 = time.time()
    print("Total crawl {0} areas.".format(len(areas)))
    print("Total cost {0} second to crawl {1} data items.".format(
        t2 - t1, self.total_num))
def hebin(self):
    """Merge every per-area CSV under today's zufang folder into ``all.csv``.

    The first input file contributes the header row; the remaining files
    are appended header-less. Files that fail to parse are skipped so one
    bad CSV does not abort the whole merge.
    """
    city = get_city()
    self.today_path = create_date_path("{0}/zufang".format(SPIDER_NAME), city, self.date_string)
    folder_path = os.path.abspath(self.today_path)   # folder holding the per-area CSVs
    save_file_path = os.path.abspath(self.today_path)  # merged file goes to the same folder
    save_file_name = r'all.csv'

    # Work from the data folder.
    os.chdir(folder_path)
    # BUGFIX: only merge real CSV inputs, and skip a pre-existing all.csv so
    # a re-run does not fold the previous merge result back into itself.
    file_list = [name for name in os.listdir()
                 if name.endswith('.csv') and name != save_file_name]
    if not file_list:
        # BUGFIX: previously an empty folder raised IndexError on file_list[0].
        return

    # First CSV: keep its header so the merged file has column names.
    # BUGFIX: this write used header=False, producing a header-less all.csv
    # even though the intent was to carry the first file's header over.
    df = pd.read_csv(os.path.join(save_file_path, file_list[0]), encoding="utf_8_sig")
    df.to_csv(os.path.join(save_file_path, save_file_name),
              encoding="utf_8_sig", index=False, header=True)

    # Append the remaining CSVs without their header rows.
    for name in file_list[1:]:
        try:
            df = pd.read_csv(os.path.join(save_file_path, name),
                             engine='python', encoding="utf_8_sig")
            df.to_csv(os.path.join(save_file_path, save_file_name),
                      encoding="utf_8_sig", index=False, header=False, mode='a+')
        except Exception:
            # Narrowed from a bare `except:`; a malformed or empty CSV is
            # skipped deliberately (best-effort merge).
            continue
def start(self):
    """Crawl second-hand (ershou) listings area by area, then retry gaps.

    Builds the district/area lists (skipping None entries), crawls each
    area in a thread pool via ``collect_area_ershou_info``, and afterwards
    re-collects any areas recorded in ``self.nodata_list`` that yielded no
    data on the first pass.
    """
    t1 = time.time()  # start timing
    city = get_city()  # target city read from the terminal
    districts = get_districts(city)  # district list of the city

    # Collect the areas (板块) of every district and remember the
    # area -> district mapping; tolerate None districts/areas from the API.
    areas = list()
    for district in districts:
        area_of_district = get_areas(city, district)
        if area_of_district is not None:
            areas.extend(area_of_district)
            for area in area_of_district:
                if area is not None:
                    area_dict[area] = district
                else:
                    print("area:{}".format(area))
        else:
            print("area_of_district:{}".format(area_of_district))
    # print('Area :', areas)
    # print('district and area :', area_dict)

    # Map each district to its output directory for this date, e.g.
    # {'tianhe': '/data/zufang/gz/20190527/tianhe/', ...}.
    path_list = create_ershou_path(city, districts, self.date_string)
    self.path_dict = dict(zip(districts, path_list))
    # print(self.path_dict)

    # Thread-pool arguments: ((city_name, area_name), result) tuples.
    result = (None for _ in range(len(areas)))
    city_list = (city for _ in range(len(areas)))
    args = zip(zip(city_list, areas), result)

    # One worker request per area; makeRequests splits each tuple into the
    # request value (passed to the worker) and the expected result value.
    # print("延迟5秒...")
    # time.sleep(5)
    pool_size = thread_pool_size
    pool = threadpool.ThreadPool(pool_size)
    my_request = threadpool.makeRequests(self.collect_area_ershou_info, args)
    for req in my_request:
        pool.putRequest(req)
    pool.wait()

    # Timing summary.
    t2 = time.time()
    print("总共爬取 {0} 个areas".format(len(areas)))
    print("总共耗时 {0} 秒爬取 {1} 条数据".format(t2 - t1, self.total_num))

    # BUGFIX: the old loop did `chinese_area_dict[self.nodata_list[i]]`
    # while iterating the list's ELEMENTS, i.e. it indexed the list by its
    # own values — translate each failed area name directly instead.
    for area_name in self.nodata_list:
        self.chinese_nodata_list.append(chinese_area_dict[area_name])
    # BUGFIX: `!= None` is always true for a list, so the "all data
    # complete" branch was unreachable; test for emptiness instead.
    if self.nodata_list:
        print("以下板块没有获取到数据:... \n {0}".format(self.chinese_nodata_list))
        self.recollect_area_ershou_info(city, self.nodata_list)
    else:
        print("所有数据已爬取完成,数据完整..")
    return