def get_finish_active_file(
        in_file_fold='../Data/TxtData/TripAdvisorActiveUserPageInfoNew/',
        out_file='finish_active_user_list.txt'):
    """Write the base name of every entry listed for ``in_file_fold`` to ``out_file``.

    Each entry returned by ``usefulAPI.get_dir_files`` is cut at its first
    '.', and the resulting names are written one per line (no trailing
    newline).

    Args:
        in_file_fold: Folder whose entries are listed.
        out_file: Path of the text file to (over)write.
    """
    # presumably the ``False`` flag makes get_dir_files return bare names
    # rather than full paths -- TODO confirm against usefulAPI.
    file_name_list = usefulAPI.get_dir_files(in_file_fold, False)

    # Keep only the text before the first '.', e.g. 'abc.json' -> 'abc'.
    finished_names = [file_name.split('.')[0] for file_name in file_name_list]

    # Context manager guarantees the handle is flushed and closed.
    with open(out_file, 'w+') as out_handle:
        out_handle.write('\n'.join(finished_names))
def get_finish_hotel_id_file(out_file='finish_hotel_id.txt'):
    """Prune incomplete hotel review folders and record the complete ones.

    Every entry listed under the hotel review page folder is treated as a
    per-hotel sub-folder.  A sub-folder with fewer than 10 listed files is
    considered unfinished: each listed file is printed and deleted, then the
    sub-folder itself is removed.  The ids of all remaining hotels are
    written to ``out_file``, one per line, in first-seen listing order.

    WARNING: this function deletes files and directories on disk.

    Args:
        out_file: Path of the text file to (over)write with finished ids.

    Raises:
        OSError: If a file or directory cannot be removed (from ``os``).
    """
    in_file_fold = '../Data/HtmlData/TripAdvisorHotelReviewPage/'
    hotel_id_list = usefulAPI.get_dir_files(in_file_fold, False)

    finished_hotel_ids = []
    seen_hotel_ids = set()  # keeps each id at most once in the output
    for hotel_id in hotel_id_list:
        hotel_dir = in_file_fold + hotel_id + '/'
        # presumably the ``True`` flag yields paths usable by os.remove
        # directly -- TODO confirm against usefulAPI.
        file_list = usefulAPI.get_dir_files(hotel_dir, True)
        if len(file_list) < 10:
            for file_path in file_list:
                print(file_path)
                os.remove(file_path)
            os.rmdir(hotel_dir)
        elif hotel_id not in seen_hotel_ids:
            seen_hotel_ids.add(hotel_id)
            finished_hotel_ids.append(hotel_id)

    # Context manager guarantees the handle is flushed and closed.
    with open(out_file, 'w+') as out_handle:
        out_handle.write('\n'.join(finished_hotel_ids))
def get_all_user_info(in_user_file_fold):
    """Load user-info records from every file in a folder, de-duplicated by uid.

    Each file listed for ``in_user_file_fold`` is parsed with
    ``usefulDataStruct.load_user_info_dat_txt``.  A record is kept only the
    first time its ``'uid'`` value is encountered; later records carrying
    the same uid are dropped.

    Args:
        in_user_file_fold: Folder containing the user-info files.

    Returns:
        list: The unique user-info dicts in first-seen order.

    Raises:
        KeyError: If a loaded record has no ``'uid'`` key.
    """
    seen_uids = set()
    user_info_dict_list = []
    for file_name in usefulAPI.get_dir_files(in_user_file_fold, True):
        loaded_records = usefulDataStruct.load_user_info_dat_txt(
            user_info_file=file_name)
        for user_info_dict in loaded_records:
            uid = user_info_dict['uid']
            if uid in seen_uids:
                continue
            seen_uids.add(uid)
            user_info_dict_list.append(user_info_dict)
    return user_info_dict_list
def load_all_user_info_dict_list(
        in_file_fold='../Data/TxtData/TripAdvisorActiveUserPageInfoNew/',
        out_file='../Data/TxtData/TripAdvisorActiveUserPageInfoJson/active_user_more_info.json'
):
    """Merge the user-info records of every file in a folder into one JSON file.

    Each file listed for ``in_file_fold`` is loaded with
    ``usefulDataStruct.load_user_info_dict_json`` and its records are
    appended to one combined list.  The combined length is printed, the list
    is written to ``out_file`` through
    ``usefulDataStruct.print_out_user_info_dat_to_json``, and then returned.

    Args:
        in_file_fold: Folder containing the per-user files.
        out_file: Destination path handed to the JSON writer.

    Returns:
        list: All loaded records, in file-listing order.
    """
    file_name_list = usefulAPI.get_dir_files(in_file_fold, True)

    all_user_info_dict_list = []
    for file_name in file_name_list:
        # extend() grows the list in place; rebuilding it with ``a = a + b``
        # on every iteration is quadratic in the total number of records.
        all_user_info_dict_list.extend(
            usefulDataStruct.load_user_info_dict_json(file_name))

    # NOTE(review): the source was collapsed onto one line; the count is
    # assumed to be printed once after the loop -- confirm against history.
    print(len(all_user_info_dict_list))
    usefulDataStruct.print_out_user_info_dat_to_json(all_user_info_dict_list,
                                                     out_file)
    return all_user_info_dict_list
def change_dat(
        in_file_fold='../Data/TxtData/TripAdvisorActiveUserPageInfo/',
        out_file_fold='../Data/TxtData/TripAdvisorActiveUserPageInfoNew/'):
    """Split the user-info text files into one JSON file per user record.

    Every file listed for ``in_file_fold`` is parsed with
    ``usefulDataStruct.load_user_info_dat_txt`` using the tab-separated
    column layout below.  Each parsed record is then written on its own, as
    a one-element list, to ``<out_file_fold><uid>.json``.

    Args:
        in_file_fold: Folder holding the source text files.
        out_file_fold: Folder prefix for the per-user JSON files; it is
            concatenated directly with the uid, so it should end with '/'.
    """
    column_layout = 'user_name\tuid\tsrc\thotel_id\tall_review_number\thotel_review_number\tmore_info_page\tuser_info_str\tfull_profile_page\tall_review_dis'

    for file_name in usefulAPI.get_dir_files(in_file_fold, True):
        parsed_records = usefulDataStruct.load_user_info_dat_txt(
            user_info_file=file_name, dict_str=column_layout)
        for record in parsed_records:
            target_path = out_file_fold + record['uid'] + '.json'
            # The JSON writer takes a list, so wrap the single record.
            usefulDataStruct.print_out_user_info_dat_to_json(
                user_info_dict_list=[record], out_json_file=target_path)
def get_prepare_info(in_file_fold, all_user_info_dict_file):
    """Pair each downloaded user page with its user-info record.

    The records loaded from ``all_user_info_dict_file`` are indexed by their
    ``'uid'`` value.  For every entry listed for ``in_file_fold`` the uid is
    obtained by removing every occurrence of '.html' from the entry name,
    and the matching record is looked up.

    Args:
        in_file_fold: Folder of user pages; concatenated directly with each
            entry name, so it should end with '/'.
        all_user_info_dict_file: File passed to
            ``usefulDataStruct.load_user_info_dat_txt``.

    Returns:
        tuple: ``(user_home_file_list, active_user_info_list)`` -- two
        parallel lists of page paths and their user-info dicts.

    Raises:
        KeyError: If a page's uid has no record in the loaded user info.
    """
    records_by_uid = {}
    for record in usefulDataStruct.load_user_info_dat_txt(
            all_user_info_dict_file):
        records_by_uid[record['uid']] = record

    user_home_file_list = []
    active_user_info_list = []
    for page_name in usefulAPI.get_dir_files(in_file_fold, False):
        uid = page_name.replace('.html', '')
        # A membership guard used to exist here but is disabled, so an
        # unknown uid raises KeyError on the lookup below.
        user_home_file_list.append(in_file_fold + page_name)
        active_user_info_list.append(records_by_uid[uid])

    return user_home_file_list, active_user_info_list
def ana_all_hotel_pages(
        all_dat_filefold='../Data/HtmlData/TripAdvisorHotelInCitesHomePage/',
        out_json_file='../Data/TxtData/TripAdvisorHotelInCitesHomePage/hotel_homepage_info.json'
):
    """Analyse every city hotel page in a folder and dump the result as JSON.

    For each entry listed for ``all_dat_filefold`` the city name is formed
    by joining, without separator, the '_'-delimited parts of the entry name
    that precede the first part equal to 'Hotels'.  The page is then handed
    to ``ana_city_hotel_homepage`` together with the shared
    ``home_info_list`` accumulator.  Afterwards the list length is printed,
    ``check_repeat`` is run on it and it is written via
    ``usefulAPI.print_out_dat_json``.

    Args:
        all_dat_filefold: Folder of page files; concatenated directly with
            each entry name, so it should end with '/'.
        out_json_file: Destination path for the JSON dump.
    """
    page_file_list = usefulAPI.get_dir_files(all_dat_filefold, False)
    total_pages = len(page_file_list)
    home_info_list = []

    for index, page_file in enumerate(page_file_list):
        leading_parts = []
        for part in page_file.split('_'):
            if part == 'Hotels':
                break
            leading_parts.append(part)
        city_name = ''.join(leading_parts)

        # NOTE(review): the source was collapsed onto one line; all three
        # progress prints are assumed to sit under the every-50th-page
        # check -- confirm against history.
        if index % 50 == 0:
            print('pageNum: ' + str(index) + ' / ' + str(total_pages))
            print('page_file: ' + page_file)
            print('city_name: ' + city_name)

        ana_city_hotel_homepage(all_dat_filefold + page_file, city_name,
                                home_info_list)

    print('length of all home info list = ' + str(len(home_info_list)))
    check_repeat(home_info_list)
    usefulAPI.print_out_dat_json(home_info_list, out_json_file)
def filter_demo_users(
        in_file_fold='../Data/TxtData/TripAdvisorActiveUserPageInfoNew/',
        out_file='a.json'):
    """Keep the users whose info string has text before 'from' and save them.

    Every file listed for ``in_file_fold`` is parsed as JSON.  A user is
    kept when its ``'user_info_str'`` is not 'NULL', splits into exactly two
    parts on '-_-_-_-_-', and the second part (after case-normalising
    'From', 'Man' and 'Woman') has a non-empty prefix before 'from'.  The
    counts of loaded and kept users are printed, and the kept users are
    written through ``usefulDataStruct.print_out_user_info_dat_to_json``.

    The normalised string is only used for the check; the stored records
    are written out unmodified.

    Args:
        in_file_fold: Folder containing the per-user JSON files.
        out_file: Destination path handed to the JSON writer.
    """
    all_user_dict_list = []
    for file_path in usefulAPI.get_dir_files(in_file_fold, True):
        # Context manager closes each input file instead of leaking handles.
        with open(file_path, 'r') as in_handle:
            all_user_dict_list.append(json.loads(in_handle.read()))
    print(len(all_user_dict_list))

    filter_user_dict_list = []
    for user_dict_info in all_user_dict_list:
        if user_dict_info['user_info_str'] == 'NULL':
            continue
        user_info_str_list = user_dict_info['user_info_str'].split('-_-_-_-_-')
        # NOTE(review): the source was collapsed onto one line; the append
        # is assumed to belong to the two-part branch -- confirm.
        if len(user_info_str_list) == 2:
            demo_part = user_info_str_list[1].replace(
                'From', 'from').replace('Man', 'man').replace('Woman', 'woman')
            if demo_part.split('from')[0] == '':
                continue
            filter_user_dict_list.append(user_dict_info)
    print(len(filter_user_dict_list))

    usefulDataStruct.print_out_user_info_dat_to_json(filter_user_dict_list,
                                                     out_file)
def get_hotel_review_files(
        in_file_fold='../Data/HtmlData/TripAdvisorHotelReviewPage/hotel_206921/'
):
    """Return the listing of a hotel's review-page folder.

    Args:
        in_file_fold: Folder to list; defaults to the folder of hotel 206921.

    Returns:
        Whatever ``usefulAPI.get_dir_files(in_file_fold, True)`` returns.
    """
    review_files = usefulAPI.get_dir_files(in_file_fold, True)
    return review_files
#coding=utf-8
from UsefulLibs import usefulAPI
import shutil

if __name__ == '__main__':
    # Move the contents of every listed entry of the hotel home-page folder
    # up into that folder itself.
    dst_file_fold = '../Data/HtmlData/TripAdvisorHotelInCitesHomePage/'

    src_file_fold_list = usefulAPI.get_dir_files(dst_file_fold, True)
    print(src_file_fold_list)

    for src_file_fold in src_file_fold_list:
        # Each listed entry is treated as a sub-folder.
        nested_files = usefulAPI.get_dir_files(src_file_fold + '/', True)
        for file_name in nested_files:
            shutil.move(file_name, dst_file_fold)
def get_hotel_review_file_list(self):
    """Store the listing of this hotel's review folder on the instance.

    The folder path is ``self.hotel_reviw_file_fold + self.hotel_id + '/'``
    (the attribute name is spelled ``hotel_reviw_file_fold``).  The result
    of ``usefulAPI.get_dir_files`` is saved to
    ``self.hotel_review_file_list``; nothing is returned.
    """
    hotel_dir = self.hotel_reviw_file_fold + self.hotel_id + '/'
    listing = usefulAPI.get_dir_files(hotel_dir, is_contain_dir=True)
    self.hotel_review_file_list = listing