def process_fangtianxia_parcel_raw_data(city_name):
    """Turn a city's Fangtianxia parcel raw data into a ready-data file.

    The raw file is located with ``get_raw_data_file_path``, parsed with
    ``pd.read_table``, passed through ``process_raw_data_to_ready`` and the
    result is written tab-separated and UTF-8 encoded to the matching
    ready-data path. Nothing is returned.

    Args:
        city_name: City identifier forwarded to ``get_raw_data_file_path``.
    """
    source = CrawlerSourceName.FANGTIANXIA.value
    label = CrawlerDataLabel.PARCEL.value
    raw_path = get_raw_data_file_path(
        city_name, CrawlerDataType.RAW_DATA.value, source, label)
    ready_path = get_raw_data_file_path(
        city_name, CrawlerDataType.READY_DATA.value, source, label)

    # error_bad_lines=False asks pandas to drop malformed rows, not raise.
    # NOTE(review): this keyword was deprecated in pandas 1.3 and removed in
    # 2.0 (successor: on_bad_lines) -- confirm the pinned pandas version.
    parcels = pd.read_table(raw_path, error_bad_lines=False)
    process_raw_data_to_ready(parcels).to_csv(
        path_or_buf=ready_path, sep='\t', encoding='utf-8')
def process_lianjia_new_community_raw_data(city_name):
    """Turn a city's Lianjia new-community raw data into a ready-data file.

    Reads the raw file resolved by ``get_raw_data_file_path``, converts it
    with ``process_new_community_raw_data_to_ready`` and writes the result
    as a tab-separated, UTF-8 encoded file at the ready-data path. Nothing
    is returned.

    Args:
        city_name: City identifier forwarded to ``get_raw_data_file_path``.
    """
    source = CrawlerSourceName.LIANJIA.value
    label = CrawlerDataLabel.NEW_COMMUNITY.value
    input_path = get_raw_data_file_path(
        city_name, CrawlerDataType.RAW_DATA.value, source, label)
    output_path = get_raw_data_file_path(
        city_name, CrawlerDataType.READY_DATA.value, source, label)

    # Malformed rows are skipped rather than aborting the whole load.
    # NOTE(review): error_bad_lines was removed in pandas 2.0 (successor:
    # on_bad_lines) -- confirm the pinned pandas version.
    communities = pd.read_table(input_path, error_bad_lines=False)
    converted = process_new_community_raw_data_to_ready(communities)
    converted.to_csv(path_or_buf=output_path, sep='\t', encoding='utf-8')
def process_baidu_poi_raw_data(city_name):
    """Turn a city's Baidu POI raw data into a ready-data file.

    Loads the raw file, runs it through ``process_raw_data_to_ready`` and
    saves the outcome tab-separated and UTF-8 encoded at the ready-data
    path. Both paths come from ``get_raw_data_file_path``. Nothing is
    returned.

    Args:
        city_name: City identifier forwarded to ``get_raw_data_file_path``.
    """
    # Source and label are shared by the input and the output location.
    location = (CrawlerSourceName.BAIDU.value,
                CrawlerDataLabel.BAIDU_POI.value)
    src_path = get_raw_data_file_path(
        city_name, CrawlerDataType.RAW_DATA.value, *location)
    dst_path = get_raw_data_file_path(
        city_name, CrawlerDataType.READY_DATA.value, *location)

    # error_bad_lines=False asks pandas to drop malformed rows, not raise.
    # NOTE(review): removed in pandas 2.0 in favour of on_bad_lines --
    # confirm the pinned pandas version.
    poi_table = pd.read_table(src_path, error_bad_lines=False)
    process_raw_data_to_ready(poi_table).to_csv(
        path_or_buf=dst_path, sep='\t', encoding='utf-8')
def process_lianjia_second_hand_community_raw_data(city_name):
    """Turn a city's Lianjia second-hand community raw data into ready data.

    The raw file is parsed with ``pd.read_table``, converted by
    ``process_second_hand_community_raw_data_to_ready`` and written
    tab-separated and UTF-8 encoded to the ready-data path. Nothing is
    returned.

    Args:
        city_name: City identifier forwarded to ``get_raw_data_file_path``.
    """
    # Resolve the input (raw) and output (ready) paths in that order; they
    # differ only in the data-type component.
    raw_path, ready_path = (
        get_raw_data_file_path(
            city_name,
            data_type.value,
            CrawlerSourceName.LIANJIA.value,
            CrawlerDataLabel.SECOND_HAND_COMMUNITY.value,
        )
        for data_type in (CrawlerDataType.RAW_DATA,
                          CrawlerDataType.READY_DATA)
    )

    # Malformed rows are skipped rather than aborting the whole load.
    # NOTE(review): error_bad_lines was removed in pandas 2.0 (successor:
    # on_bad_lines) -- confirm the pinned pandas version.
    listings = pd.read_table(raw_path, error_bad_lines=False)
    prepared = process_second_hand_community_raw_data_to_ready(listings)
    prepared.to_csv(path_or_buf=ready_path, sep='\t', encoding='utf-8')
def process_fangtianxia_parcel_raw_data(city_name):
    """Produce the ready-data file for a city's Fangtianxia parcel records.

    Steps: resolve the raw and ready paths via ``get_raw_data_file_path``,
    load the raw table, apply ``process_raw_data_to_ready`` and write the
    result as tab-separated UTF-8 text. Nothing is returned.

    Args:
        city_name: City identifier forwarded to ``get_raw_data_file_path``.
    """
    origin = CrawlerSourceName.FANGTIANXIA.value
    category = CrawlerDataLabel.PARCEL.value

    def path_for(data_type):
        # Only the data-type component differs between input and output.
        return get_raw_data_file_path(
            city_name, data_type.value, origin, category)

    source_file = path_for(CrawlerDataType.RAW_DATA)
    target_file = path_for(CrawlerDataType.READY_DATA)

    # error_bad_lines=False asks pandas to drop malformed rows, not raise.
    # NOTE(review): removed in pandas 2.0 in favour of on_bad_lines --
    # confirm the pinned pandas version.
    raw_table = pd.read_table(source_file, error_bad_lines=False)
    ready_table = process_raw_data_to_ready(raw_table)
    ready_table.to_csv(path_or_buf=target_file, sep='\t', encoding='utf-8')
def process_anjuke_new_community_raw_data(city_name):
    """Load a city's Anjuke new-community raw data and return it processed.

    Unlike a file-to-file conversion, nothing is written here: the parsed
    raw table is handed to ``process_new_community_raw_data_to_ready`` and
    its result is returned to the caller.

    Args:
        city_name: City identifier forwarded to ``get_raw_data_file_path``.

    Returns:
        Whatever ``process_new_community_raw_data_to_ready`` returns for
        the loaded table.
    """
    source_path = get_raw_data_file_path(
        city_name,
        CrawlerDataType.RAW_DATA.value,
        CrawlerSourceName.ANJUKE.value,
        CrawlerDataLabel.NEW_COMMUNITY.value,
    )
    # Malformed rows are skipped rather than aborting the whole load.
    # NOTE(review): error_bad_lines was removed in pandas 2.0 (successor:
    # on_bad_lines) -- confirm the pinned pandas version.
    return process_new_community_raw_data_to_ready(
        pd.read_table(source_path, error_bad_lines=False))
def write_second_community_raw_data_in_rect_to_file(self, raw_data_list):
    """Write Anjuke second-hand community raw records to the raw-data file.

    The destination is resolved from ``self.city_name`` through
    ``get_raw_data_file_path``; the actual writing is delegated to
    ``self.write_to_file`` with the Anjuke second-community header list.

    Args:
        raw_data_list: Records passed through unchanged to
            ``self.write_to_file``.
    """
    destination = get_raw_data_file_path(
        self.city_name,
        CrawlerDataType.RAW_DATA.value,
        CrawlerSourceName.ANJUKE.value,
        CrawlerDataLabel.SECOND_HAND_COMMUNITY.value,
    )
    self.write_to_file(
        ANJUKE_SECOND_COMMUNITY_RAW_DATA_HEADER_LIST,
        destination,
        raw_data_list,
    )
def write_second_community_raw_data_in_rect_to_file(self, raw_data_list):
    """Write Lianjia second-hand community raw records to the raw-data file.

    The header list depends on whether ``self.city_name`` appears in
    ``LIANJIA_SPECIFIC_SECOND_HAND_COMMUNITY_CITY_NAME_LIST``; the writing
    itself is delegated to ``self.write_to_file``.

    Args:
        raw_data_list: Records passed through unchanged to
            ``self.write_to_file``.
    """
    destination = get_raw_data_file_path(
        self.city_name,
        CrawlerDataType.RAW_DATA.value,
        CrawlerSourceName.LIANJIA.value,
        CrawlerDataLabel.SECOND_HAND_COMMUNITY.value,
    )

    if self.city_name in LIANJIA_SPECIFIC_SECOND_HAND_COMMUNITY_CITY_NAME_LIST:
        # NOTE(review): a READY_DATA header constant is used for a raw-data
        # file here -- confirm this is intentional.
        columns = LIANJIA_SPECIFIC_SECOND_COMMUNITY_READY_DATA_HEADER_LIST
    else:
        columns = LIANJIA_SECOND_COMMUNITY_RAW_DATA_HEADER_LIST

    self.write_to_file(columns, destination, raw_data_list)
def crawl_lianjia_new_community_raw_data(self):
    """Crawl Lianjia new-community records for this city and save them.

    Cities listed in ``LIANJIA_SPECIFIC_NEW_COMMUNITY_CITY_NAME_LIST`` use
    the city-specific fetcher, whose result is consumed as one flat
    collection of records. Every other city uses the generic fetcher, whose
    result is consumed as a mapping of record collections. All records are
    flattened into a single list, saved with ``save_raw_data_in_tsv_file``
    and the running total ``self.new_community_data_num`` is logged.
    """
    collected = []
    city_url = self.get_city_url_for_lianjia()

    if self.city_name in LIANJIA_SPECIFIC_NEW_COMMUNITY_CITY_NAME_LIST:
        records = self.get_specific_lianjia_new_community_data_with_url(
            city_url)
        # NOTE(review): this branch overwrites the counter while the generic
        # branch below adds to it -- confirm the asymmetry is intended.
        self.new_community_data_num = len(records)
        collected.extend(records)
    else:
        grouped = self.get_lianjia_new_community_data_with_url(city_url)
        for group in grouped.values():
            self.new_community_data_num += len(group)
            collected.extend(group)

    output_path = get_raw_data_file_path(
        self.city_name,
        CrawlerDataType.RAW_DATA.value,
        CrawlerSourceName.LIANJIA.value,
        CrawlerDataLabel.NEW_COMMUNITY.value,
    )
    save_raw_data_in_tsv_file(output_path, collected)

    summary = 'city : {} ][gross lianjia new community data : {}'.format(
        self.city_name, self.new_community_data_num)
    self.logger.info(summary)
def write_second_community_raw_data_in_rect_to_file(self, raw_data_list):
    """Persist Lianjia second-hand community raw records for this city.

    Resolves the raw-data file path for ``self.city_name``, selects the
    header list by membership of the city in
    ``LIANJIA_SPECIFIC_SECOND_HAND_COMMUNITY_CITY_NAME_LIST`` and hands
    everything to ``self.write_to_file``.

    Args:
        raw_data_list: Records passed through unchanged to
            ``self.write_to_file``.
    """
    out_path = get_raw_data_file_path(
        self.city_name, CrawlerDataType.RAW_DATA.value,
        CrawlerSourceName.LIANJIA.value,
        CrawlerDataLabel.SECOND_HAND_COMMUNITY.value)

    uses_specific_layout = (
        self.city_name in LIANJIA_SPECIFIC_SECOND_HAND_COMMUNITY_CITY_NAME_LIST
    )
    if not uses_specific_layout:
        headers = LIANJIA_SECOND_COMMUNITY_RAW_DATA_HEADER_LIST
    else:
        # NOTE(review): a READY_DATA header constant is used for a raw-data
        # file here -- confirm this is intentional.
        headers = LIANJIA_SPECIFIC_SECOND_COMMUNITY_READY_DATA_HEADER_LIST

    self.write_to_file(headers, out_path, raw_data_list)
def format_anjuke_second_hand_community_raw_data(city_name):
    """Load a city's Anjuke second-hand community raw data and format it.

    Args:
        city_name: City identifier forwarded to ``get_raw_data_file_path``.

    Returns:
        The result of ``format_second_community_raw_data`` applied to the
        table parsed from the raw-data file.
    """
    source_path = get_raw_data_file_path(
        city_name,
        CrawlerDataType.RAW_DATA.value,
        CrawlerSourceName.ANJUKE.value,
        CrawlerDataLabel.SECOND_HAND_COMMUNITY.value,
    )
    # error_bad_lines=False asks pandas to drop malformed rows, not raise.
    # NOTE(review): removed in pandas 2.0 in favour of on_bad_lines --
    # confirm the pinned pandas version.
    loaded = pd.read_table(source_path, error_bad_lines=False)
    return format_second_community_raw_data(loaded)
def process_anjuke_new_community_raw_data(city_name):
    """Return the processed Anjuke new-community data for a city.

    The raw-data file is parsed with ``pd.read_table`` and the table is
    converted by ``process_new_community_raw_data_to_ready``. No file is
    written by this function.

    Args:
        city_name: City identifier forwarded to ``get_raw_data_file_path``.

    Returns:
        The value produced by ``process_new_community_raw_data_to_ready``.
    """
    path_parts = (
        CrawlerDataType.RAW_DATA.value,
        CrawlerSourceName.ANJUKE.value,
        CrawlerDataLabel.NEW_COMMUNITY.value,
    )
    input_file = get_raw_data_file_path(city_name, *path_parts)

    # Malformed rows are skipped rather than aborting the whole load.
    # NOTE(review): error_bad_lines was removed in pandas 2.0 (successor:
    # on_bad_lines) -- confirm the pinned pandas version.
    table = pd.read_table(input_file, error_bad_lines=False)
    processed = process_new_community_raw_data_to_ready(table)
    return processed
def crawl_lianjia_new_community_raw_data(self):
    """Fetch, flatten and store Lianjia new-community records for the city.

    The fetcher is chosen by whether ``self.city_name`` is in
    ``LIANJIA_SPECIFIC_NEW_COMMUNITY_CITY_NAME_LIST``. The specific fetcher's
    result is used as a flat collection; the generic fetcher's result is
    used as a mapping whose values are collections of records. The flattened
    list is saved with ``save_raw_data_in_tsv_file`` and the count held in
    ``self.new_community_data_num`` is logged.
    """
    city_url = self.get_city_url_for_lianjia()
    is_specific_city = (
        self.city_name in LIANJIA_SPECIFIC_NEW_COMMUNITY_CITY_NAME_LIST
    )

    if is_specific_city:
        fetched = self.get_specific_lianjia_new_community_data_with_url(
            city_url)
        # NOTE(review): the counter is reset here but only incremented in
        # the generic branch -- confirm the difference is deliberate.
        self.new_community_data_num = len(fetched)
        rows = list(fetched)
    else:
        fetched = self.get_lianjia_new_community_data_with_url(city_url)
        batches = list(fetched.values())
        self.new_community_data_num += sum(len(batch) for batch in batches)
        rows = [row for batch in batches for row in batch]

    tsv_path = get_raw_data_file_path(
        self.city_name, CrawlerDataType.RAW_DATA.value,
        CrawlerSourceName.LIANJIA.value,
        CrawlerDataLabel.NEW_COMMUNITY.value)
    save_raw_data_in_tsv_file(tsv_path, rows)

    self.logger.info(
        'city : {} ][gross lianjia new community data : {}'.format(
            self.city_name, self.new_community_data_num))
def write_parcel_raw_data_in_rect_to_file(self, raw_data_list):
    """Write Fangtianxia parcel raw records to this city's raw-data file.

    The path is resolved from ``self.city_name`` with
    ``get_raw_data_file_path`` and the write is delegated to
    ``self.write_to_file`` using ``FANGTIANXIA_PARCEL_RAW_DATA_HEADER_LIST``.

    Args:
        raw_data_list: Records passed through unchanged to
            ``self.write_to_file``.
    """
    parcel_file = get_raw_data_file_path(
        self.city_name,
        CrawlerDataType.RAW_DATA.value,
        CrawlerSourceName.FANGTIANXIA.value,
        CrawlerDataLabel.PARCEL.value,
    )
    self.write_to_file(
        FANGTIANXIA_PARCEL_RAW_DATA_HEADER_LIST, parcel_file, raw_data_list)
def write_poi_raw_data_in_rect_to_file(self, raw_data_list):
    """Write Baidu POI raw records to this city's raw-data file.

    The path is resolved from ``self.city_name`` with
    ``get_raw_data_file_path`` and the write is delegated to
    ``self.write_to_file`` using ``BAIDU_POI_RAW_DATA_HEADER_LIST``.

    Args:
        raw_data_list: Records passed through unchanged to
            ``self.write_to_file``.
    """
    poi_file = get_raw_data_file_path(
        self.city_name,
        CrawlerDataType.RAW_DATA.value,
        CrawlerSourceName.BAIDU.value,
        CrawlerDataLabel.BAIDU_POI.value,
    )
    self.write_to_file(
        BAIDU_POI_RAW_DATA_HEADER_LIST, poi_file, raw_data_list)
def write_second_community_raw_data_in_rect_to_file(self, raw_data_list):
    """Persist Anjuke second-hand community raw records for this city.

    Resolves the raw-data file path for ``self.city_name`` and forwards the
    Anjuke second-community header list, that path and the records to
    ``self.write_to_file``.

    Args:
        raw_data_list: Records passed through unchanged to
            ``self.write_to_file``.
    """
    path_parts = (
        CrawlerDataType.RAW_DATA.value,
        CrawlerSourceName.ANJUKE.value,
        CrawlerDataLabel.SECOND_HAND_COMMUNITY.value,
    )
    out_path = get_raw_data_file_path(self.city_name, *path_parts)
    self.write_to_file(
        ANJUKE_SECOND_COMMUNITY_RAW_DATA_HEADER_LIST, out_path, raw_data_list)