    def start(self):
        city = get_city()
        self.today_path = create_date_path("{0}/zufang".format(SPIDER_NAME),
                                           city, self.date_string)
        # collect_area_zufang('sh', 'beicai')  # For debugging, keep it here
        t1 = time.time()  # start timing

        # Get the list of districts (区县) in the city
        # Crawl only the district passed on the command line, if any
        if len(sys.argv) == 3:
            districts = get_districts(city, sys.argv[2])
        else:
            districts = get_districts(city)

        print('City: {0}'.format(city))
        print('Districts: {0}'.format(districts))

        # Write all districts of the city into a single CSV file
        district_file = self.today_path + "/city_{0}.csv".format(city)
        df = codecs.open(district_file, "w", 'utf_8_sig')
        ZuFangBaseSpider.districtCsv = csv.writer(df)
        ZuFangBaseSpider.districtCsv.writerow(
            ['区域', '板块', '小区', '户型', '面积', '租金', '元/平'])

        # Get the areas (板块) of each district
        areas = list()
        for district in districts:
            areas_of_district = get_areas(city, district)
            print('{0}: Area list:  {1}'.format(district, areas_of_district))
            # list.extend() appends every element of the argument list to the end of areas
            areas.extend(areas_of_district)
            # Use a dict to map each area to its district, e.g. {'beicai': 'pudongxinqu'}
            for area in areas_of_district:
                area_dict[area] = district

        # For debugging: restrict the crawl to a single area
        # area_dict['huangxinggongyuan'] = district
        # areas = ['huangxinggongyuan']
        print("Area:", areas)
        print("District and areas:", area_dict)

        # Prepare the arguments for the thread pool
        nones = [None for i in range(len(areas))]
        city_list = [city for i in range(len(areas))]
        args = zip(zip(city_list, areas), nones)
        # areas = areas[0: 1]

        # Write one file per area and start a worker thread for each
        pool_size = thread_pool_size
        pool = threadpool.ThreadPool(pool_size)
        my_requests = threadpool.makeRequests(self.collect_area_zufang_data,
                                              args)
        [pool.putRequest(req) for req in my_requests]
        pool.wait()
        pool.dismissWorkers(pool_size, do_join=True)  # exit the workers when done

        # Stop timing and report the results
        t2 = time.time()
        print("Total crawl {0} areas.".format(len(areas)))
        print("Total cost {0} second to crawl {1} data items.".format(
            t2 - t1, self.total_num))
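A note on the per-city CSV above: codecs.open(..., 'utf_8_sig') writes a UTF-8 BOM so that Excel can detect the encoding and show the Chinese column names correctly. A minimal, self-contained sketch of the same pattern (the file name and the sample row are made up):

import codecs
import csv

# Hypothetical output file; the spider writes one such file per city.
with codecs.open("city_sh.csv", "w", "utf_8_sig") as f:
    writer = csv.writer(f)
    writer.writerow(['区域', '板块', '小区', '户型', '面积', '租金', '元/平'])
    writer.writerow(['pudongxinqu', 'beicai', '某小区', '2室1厅', '89', '6500', '73'])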
Example #2
    def start(self):
        city = get_city()
        self.today_path = create_date_path("{0}/ershou".format(SPIDER_NAME), city, self.date_string)

        # Get the list of districts (区县) in the city
        districts = get_districts(city)
        print('City: {0}'.format(city))
        print('Districts: {0}'.format(districts))

        # Get the areas (板块) of each district
        areas = list()
        for district in districts:
            areas_of_district = get_areas(city, district)
            print('{0}: Area list:  {1}'.format(district, areas_of_district))
            # list.extend() appends every element of the argument list to the end of areas
            areas.extend(areas_of_district)
            # Use a dict to map each area to its district, e.g. {'beicai': 'pudongxinqu'}
            for area in areas_of_district:
                area_dict[area] = district
        print("Area:", areas)
        print("District and areas:", area_dict)

        # Prepare the arguments for the thread pool
        nones = [None for i in range(len(areas))]
        city_list = [city for i in range(len(areas))]
        args = zip(zip(city_list, areas), nones)
        # areas = areas[0: 1]   # For debugging

        # Write one file per area and start a worker thread for each
        pool_size = thread_pool_size
        pool = threadpool.ThreadPool(pool_size)
        my_requests = threadpool.makeRequests(self.collect_area_ershou_data, args)
        [pool.putRequest(req) for req in my_requests]
        pool.wait()
        pool.dismissWorkers(pool_size, do_join=True)  # exit the workers when done
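The examples above all feed threadpool.makeRequests with zip(zip(city_list, areas), nones), i.e. one ((city, area), None) tuple per area: the inner tuple becomes the worker's positional arguments and the trailing None stands for "no keyword arguments". A runnable sketch of the same pattern, with a stand-in worker and made-up area names:

import threadpool

def worker(city, area):
    # Stand-in for collect_area_zufang_data / collect_area_ershou_data.
    print("crawling", city, area)

areas = ['beicai', 'gumei']                      # hypothetical areas
city_list = ['sh' for _ in areas]
nones = [None for _ in areas]
args = list(zip(zip(city_list, areas), nones))   # [(('sh', 'beicai'), None), ...]

pool = threadpool.ThreadPool(2)
for req in threadpool.makeRequests(worker, args):
    pool.putRequest(req)
pool.wait()
pool.dismissWorkers(2, do_join=True)             # exit the workers when done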
Example #3
    def start(self):
        city = get_city()
        print('Today date is: %s' % self.date_string)
        self.today_path = create_date_path("{0}/loupan".format(SPIDER_NAME), city, self.date_string)

        t1 = time.time()  # start timing
        self.collect_city_loupan_data(city)
        t2 = time.time()  # stop timing and report the results

        print("Total crawl {0} loupan.".format(self.total_num))
        print("Total cost {0} second ".format(t2 - t1))
Example #4
    def start(self):
        city = get_city()
        self.today_path = create_date_path("{0}/zufang".format(SPIDER_NAME),
                                           city, self.date_string)
        # collect_area_zufang('sh', 'beicai')  # For debugging, keep it here
        t1 = time.time()  # start timing

        # Get the list of districts (区县) in the city
        # districts = get_districts(city)
        # districts = ['nanshanqu', 'futianqu', 'baoanqu']
        # This variant crawls by metro line instead of by district
        districts = get_metro_lines(city)

        print('City: {0}\n'.format(city))
        # print('Districts: {0}'.format(districts))

        # Get the areas of each line (here an area is a metro station)
        areas = list()
        for district in districts:
            # by district:
            # areas_of_district = get_areas(city, district)
            # by metro line:
            areas_of_district = get_metro_stations(city, district)
            print('Line {0}: Area list:'.format(district['name']))
            # list.extend() appends every element of the argument list to the end of areas
            areas.extend(areas_of_district)
            # Use a dict to map each area to its district, e.g. {'beicai': 'pudongxinqu'}
            for area in areas_of_district:
                area_dict[area['name']] = district['name']
                print(area['name'])
            print('\n')
        # print("Area:", areas)
        # print("District and areas:", area_dict)

        # Prepare the arguments for the thread pool
        nones = [None for i in range(len(areas))]
        city_list = [city for i in range(len(areas))]
        args = zip(zip(city_list, areas), nones)
        # areas = areas[0: 1]

        # Write one file per area and start a worker thread for each
        pool_size = thread_pool_size
        pool = threadpool.ThreadPool(pool_size)
        my_requests = threadpool.makeRequests(self.collect_area_zufang_data,
                                              args)
        [pool.putRequest(req) for req in my_requests]
        pool.wait()
        pool.dismissWorkers(pool_size, do_join=True)  # exit the workers when done

        # Stop timing and report the results
        t2 = time.time()
        print("Total crawl {0} areas.".format(len(areas)))
        print("Total cost {0} second to crawl {1} data items.".format(
            t2 - t1, self.total_num))
        print(district)  # debug: prints the last metro line processed
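This metro variant assumes that get_metro_lines and get_metro_stations return dicts rather than plain strings, which is why the code reads district['name'] and area['name']. A small sketch of the assumed shapes and of the resulting station-to-line mapping (all values invented):

# Hypothetical shapes for the metro helpers used above.
metro_lines = [{'name': '1'}, {'name': '2'}]                 # one dict per line
stations_by_line = {'1': [{'name': 'luohu'}, {'name': 'guomao'}],
                    '2': [{'name': 'chiwan'}]}               # one dict per station

area_dict = {}
for line in metro_lines:
    for station in stations_by_line[line['name']]:
        area_dict[station['name']] = line['name']            # station -> line number

print(area_dict)   # {'luohu': '1', 'guomao': '1', 'chiwan': '2'}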
Example #5
    def start(self):
        city = get_city()
        self.today_path = create_date_path("{0}/zufang".format(SPIDER_NAME),
                                           city, self.date_string)
        t1 = time.time()  # start timing
        districts, result = get_districts(city)
        print('City: {0}'.format(city))
        print('Districts: {0}'.format(districts))
        # Get the areas (板块) of each district
        areas = list()
        params = list()
        for res in result:
            areas_of_district = res["area"]
            district_name = res["districts"]
            self.account += 1
            for area in areas_of_district:
                params.append(([city, district_name, area], None))
            #     self.collect_area_zufang_data(city, district_name, area)
            # self.get_area_zufang_info(city, district_name, area)
        # print(params)
        #     print('{0}: Area list:  {1}'.format(areas_of_district))
        #
        #     areas.extend(areas_of_district)
        #     # Use a dict to map each area to its district, e.g. {'beicai': 'pudongxinqu'}
        #     for area in areas_of_district:
        #         area_dict[area] = district
        # print("Area:", areas)
        # print("District and areas:", area_dict)

        # Prepare the arguments for the thread pool
        # nones = [None for i in range(len(areas))]
        # city_list = [city for i in range(len(areas))]
        # args = zip(zip(city_list, areas), nones)
        # areas = areas[0: 1]

        # Write one file per area and start a worker thread for each
        pool_size = thread_pool_size
        pool = threadpool.ThreadPool(pool_size)
        my_requests = threadpool.makeRequests(self.collect_area_zufang_data,
                                              params)
        [pool.putRequest(req) for req in my_requests]
        # helper.commit()
        pool.wait()
        pool.dismissWorkers(pool_size, do_join=True)  # exit the workers when done

        # Stop timing and report the results
        t2 = time.time()
        print("Total crawl {0} areas.".format(len(params)))
        print("Total cost {0} second to crawl data items.".format(t2 - t1))
Example #6
    def start(self):
        # start timing
        t1 = time.time()
        city = get_city()
        path_list = list()
        path_dict = dict()
        # print(city)
        # Get the list of districts in the city
        districts = get_districts(city)
        # print('City: {0}'.format(city))
        # print('Districts: {0}'.format(districts))

        # Get the areas of each district
        areas = list()
        for district in districts:
            area_of_district = get_areas(city, district)
            # print('{0} : Area_list : {1}'.format(district,area_of_district))
            areas.extend(area_of_district)
            # Use a dict to map each area to its district
            for area in area_of_district:
                area_dict[area] = district
        # print('Area :',areas)
        # print('district and area :',area_dict)

        # Build the save paths as a dict like {'tianhe': '/data/zufang/gz/20190527/tianhe/', ...}
        path_list = create_zufang_path(city, districts, self.date_string)
        self.path_dict = dict(zip(districts, path_list))
        print(self.path_dict)

        # self.collect_area_zufang_info('gz','shiqiao1')
        # Prepare the arguments for the thread pool
        nones = [None for i in range(len(areas))]
        city_list = [city for i in range(len(areas))]
        args = zip(zip(city_list, areas), nones)

        # Write one file per area and start a worker thread for each.
        # When an item in the args list is a 2-tuple, makeRequests splits it:
        # the first element is the positional arguments passed to the worker
        # function, and the second element is a dict of keyword arguments (None here).
        pool_size = thread_pool_size
        pool = threadpool.ThreadPool(pool_size)
        my_request = threadpool.makeRequests(self.collect_area_zufang_info,
                                             args)
        [pool.putRequest(req) for req in my_request]
        pool.wait()

        # stop timing
        t2 = time.time()
        print("Total crawl {0} areas".format(len(areas)))
        print("Total cost {0} second to crawl {1} data items".format(
            t2 - t1, self.total_num))
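A minimal sketch of how dict(zip(...)) builds the district-to-path mapping described above; the directories are invented and create_zufang_path is assumed to return one path per district, in the same order as districts:

districts = ['tianhe', 'haizhu']                         # hypothetical districts
path_list = ['/data/zufang/gz/20190527/tianhe/',         # hypothetical result of create_zufang_path
             '/data/zufang/gz/20190527/haizhu/']

path_dict = dict(zip(districts, path_list))
print(path_dict)
# {'tianhe': '/data/zufang/gz/20190527/tianhe/', 'haizhu': '/data/zufang/gz/20190527/haizhu/'}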
Example #7
    def start(self):
        city = get_city()
        self.today_path = create_date_path("{0}/xiaoqu".format(SPIDER_NAME),
                                           city, self.date_string)
        t1 = time.time()  # start timing

        # # for test
        # area_dict['gumei'] = 'minhang'
        # areas = list(['gumei'])

        # Get the list of districts (区县) in the city
        districts = get_districts(city)
        print('City: {0}'.format(city))
        print('Districts: {0}'.format(districts))

        # Get the areas (板块) of each district
        areas = list()
        for district in districts:
            areas_of_district = get_areas(city, district)
            print('{0}: Area list:  {1}'.format(district, areas_of_district))
            # list.extend() appends every element of the argument list to the end of areas
            areas.extend(areas_of_district)
            # Use a dict to map each area to its district, e.g. {'beicai': 'pudongxinqu'}
            for area in areas_of_district:
                area_dict[area] = district
        print("Area:", areas)
        print("District and areas:", area_dict)

        # Prepare the arguments for the thread pool
        nones = [None for i in range(len(areas))]
        city_list = [city for i in range(len(areas))]
        args = zip(zip(city_list, areas), nones)
        # areas = areas[0: 1]

        # Write one file per area and start a worker thread for each
        pool_size = thread_pool_size
        pool = threadpool.ThreadPool(pool_size)
        my_requests = threadpool.makeRequests(self.collect_area_xiaoqu_data,
                                              args)
        [pool.putRequest(req) for req in my_requests]
        pool.wait()
        pool.dismissWorkers(pool_size, do_join=True)  # exit the workers when done

        # Stop timing and report the results
        t2 = time.time()
        print("Total crawl {0} areas.".format(len(areas)))
        print("Total cost {0} second to crawl {1} data items.".format(
            t2 - t1, self.total_num))
    def hebin(self):
        city = get_city()
        self.today_path = create_date_path("{0}/zufang".format(SPIDER_NAME),
                                           city, self.date_string)
        Folder_Path = os.path.abspath(self.today_path)  # folder with the CSV files to merge (full path; avoid non-ASCII characters in it)
        SaveFile_Path = os.path.abspath(self.today_path)  # directory where the merged file is saved
        SaveFile_Name = r'all.csv'  # file name of the merged output

        # Change the current working directory
        os.chdir(Folder_Path)
        # Collect the CSV file names in this folder, skipping the merged
        # output left over from a previous run
        file_list = [f for f in os.listdir()
                     if f.endswith('.csv') and f != SaveFile_Name]

        # Read the first CSV file, including its header row
        df = pd.read_csv(os.path.join(SaveFile_Path, file_list[0]),
                         encoding="utf_8_sig")  # default encoding UTF-8; change it if the text is garbled

        # Write the first CSV file to the merged output, keeping its header once
        df.to_csv(os.path.join(SaveFile_Path, SaveFile_Name),
                  encoding="utf_8_sig",
                  index=False,
                  header=True)

        # Loop over the remaining CSV files and append them to the merged output (without their headers)
        for i in range(1, len(file_list)):
            try:
                df = pd.read_csv(os.path.join(SaveFile_Path, file_list[i]),
                                 engine='python',
                                 encoding="utf_8_sig")
                df.to_csv(os.path.join(SaveFile_Path, SaveFile_Name),
                          encoding="utf_8_sig",
                          index=False,
                          header=False,
                          mode='a+')
            except Exception:
                # Skip files that cannot be parsed
                continue
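A more compact alternative to the read-and-append loop in hebin is to concatenate the frames in memory with pandas.concat, which keeps the header exactly once; a minimal sketch, assuming every per-area CSV shares the same columns (the folder path is made up):

import os
import pandas as pd

folder = '/data/zufang/sh/20190527'          # hypothetical output directory
files = [f for f in os.listdir(folder)
         if f.endswith('.csv') and f != 'all.csv']
frames = [pd.read_csv(os.path.join(folder, f), encoding='utf_8_sig') for f in files]
pd.concat(frames, ignore_index=True).to_csv(
    os.path.join(folder, 'all.csv'), index=False, encoding='utf_8_sig')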
Example #9
    def start(self):
        path_dict = dict()
        nodata = list()
        # start timing
        t1 = time.time()
        city = get_city()  # get the city entered on the command line
        districts = get_districts(city)  # get the list of districts of the city
        # Get the areas of each district
        areas = list()
        for district in districts:
            area_of_district = get_areas(city, district)
            # Use a dict to map each area to its district
            if area_of_district is not None:
                areas.extend(area_of_district)
                for area in area_of_district:
                    if area is not None:
                        area_dict[area] = district
                    else:
                        print("area:{}".format(area))
            else:
                print("area_of_district:{}".format(area_of_district))

        # print('Area :',areas)
        # print('district and area :',area_dict)

        # Build the save paths as a dict like {'tianhe': '/data/ershou/gz/20190527/tianhe/', ...}
        path_list = create_ershou_path(city, districts, self.date_string)
        self.path_dict = dict(zip(districts, path_list))
        # print(path_dict)

        # Prepare the thread arguments: a list of ((city_name, area_name), result) tuples
        result = (None for i in range(len(areas)))
        city_list = (city for i in range(len(areas)))
        args = zip(zip(city_list, areas), result)

        # Write one file per area and start a worker thread for each.
        # When an item in the args list is a 2-tuple, makeRequests splits it:
        # the first element is the positional arguments passed to the worker
        # function, and the second element is a dict of keyword arguments (None here).
        # print("Wait 5 seconds...")
        # time.sleep(5)
        pool_size = thread_pool_size
        pool = threadpool.ThreadPool(pool_size)
        my_request = threadpool.makeRequests(self.collect_area_ershou_info,
                                             args)
        [pool.putRequest(req) for req in my_request]
        pool.wait()

        # stop timing
        t2 = time.time()
        print("Crawled {0} areas in total".format(len(areas)))
        print("Total cost {0} seconds to crawl {1} data items".format(
            t2 - t1, self.total_num))
        for area in self.nodata_list:
            self.chinese_nodata_list.append(chinese_area_dict[area])
        if self.nodata_list:
            print("No data was collected for these areas: \n {0}".format(
                self.chinese_nodata_list))
            self.recollect_area_ershou_info(city, self.nodata_list)
        else:
            print("All data has been crawled; the dataset is complete.")

        return