def get_ershou_img_urls(city):
    """Collect image URLs from previously crawled ershou (second-hand house) CSVs.

    Scans every ``*.csv`` file under
    ``{DATA_PATH}/{SPIDER_NAME}/ershou/{city}/{today}`` and extracts each
    ``https://`` image URL embedded in the lines.

    :param city: city abbreviation (e.g. ``"sh"``) matching a crawled data dir
    :return: list of image URL strings (duplicates possible, order preserved)
    """
    urls = list()
    date = get_date_string()  # today's date string; the crawler writes into a per-date dir
    # Path layout mirrors what ershou.py writes: DATA_PATH/SPIDER_NAME/ershou/<city>/<date>
    csv_dir = "{0}/{1}/ershou/{2}/{3}".format(DATA_PATH, base_spider.SPIDER_NAME, city, date)
    if not os.path.exists(csv_dir):
        # No crawled data for this city/date yet -- tell the user how to produce it.
        print("{0} does not exist.".format(csv_dir))
        print("Please run 'python ershou.py' firstly.")
        print("Bye.")
        exit(0)
    print('OK, start to process ' + get_chinese_city(city))

    # Gather all CSV files in the directory.
    # FIX: use endswith(".csv") -- the original check (name[-3:] != "csv") also
    # matched names like "datacsv" that lack the dot.
    files = [
        csv_dir + "/" + name
        for name in os.listdir(csv_dir)
        if name.endswith(".csv")
    ]

    # Extract every https:// URL.  str.split never raises on a str, so the
    # original try/except here was dead code; likewise the `count` accumulator
    # was never read and has been removed.
    for path in files:
        with open(path, 'r') as f:  # default platform encoding, as the original used
            for line in f:
                parts = line.strip().split("https://")
                # More than one part means the line actually contained a URL;
                # keep only the last one, matching the original behavior.
                if len(parts) > 1:
                    url = parts[-1]
                    urls.append("https://" + url)
                    print("https://" + url)
    print(len(urls))
    return urls
async with session.get(image_url, headers=create_headers()) as req: image = await req.read() fp = await aiofiles.open(save_path, 'wb') await fp.write(image) if __name__ == '__main__': # urls = ["https://img.ljcdn.com/370600-inspection/test-9925c97c-fc99-4d1a-97fa-2fd6d3209027.png!m_fill,w_280,h_210,f_jpg?from=ke.com", # "https://img.ljcdn.com/370600-inspection/df98f65c-427e-4d7d-91a7-425a5d682af5.jpg!m_fill,w_280,h_210,f_jpg?from=ke.com", # "https://img.ljcdn.com/370600-inspection/test-9925c97c-fc99-4d1a-97fa-2fd6d3209027.png!m_fill,w_280,h_210,f_jpg?from=ke.com", # "https://img.ljcdn.com/370600-inspection/df98f65c-427e-4d7d-91a7-425a5d682af5.jpg!m_fill,w_280,h_210,f_jpg?from=ke.com"] # 指定城市 start = time.time() city = "yt" urls = get_ershou_img_urls(city) loop = asyncio.get_event_loop() date = get_date_string() csv_dir = "{0}/{1}/ershou/{2}/{3}".format(DATA_PATH, SPIDER_NAME, city, date) to_do = [ download_images("{0}/{1}.jpg".format(csv_dir, i), urls[i]) for i in range(len(urls)) ] print("Start to download, please wait.") wait_future = asyncio.wait(to_do) resp = loop.run_until_complete(wait_future) loop.close() print("Download {0} images, cost {1} seconds.".format( len(urls), time.time() - start))
data = pd.DataFrame() for file_name in total_files: if district in file_name: df_data = read_data(file_name) df_data.columns = columns data = data.append(df_data, ignore_index=True) return data if __name__ == '__main__': merge_type = r'ershou' # r'ershou' merge_district = r'nanshanqu' # r'nanshanqu' today = get_date_string() input_dir = r'/Users/a123/PycharmProjects/lianjia-beike-spider/data/ke/' + merge_type + r'/sz/' + today print('input_dir:', input_dir) out_file = os.path.join( '/Users/a123/PycharmProjects/lianjia-beike-spider/data/ke/sz', merge_district + r'_' + merge_type + '.csv') print('out_file:', out_file) xiaoqu_columns = [ r'日期', r'区', r'片区', r'小区', r'参考均价', r'在售套数', r'房屋年代', r'90天成交', r'在租房源', r'户型数', r'建筑类型', r'物业费用', r'物业公司', r'开发商', r'楼栋总数', r'房屋总数' ] ershou_columns = [r'日期', r'区', r'片区', r'小区', r'总价', r'关注人数', r'发布时间'] + HOUSE_DETAIL_INFO + \ ['建筑年代']