Exemple #1
0
def get_finish_active_file(
        in_file_fold='../Data/TxtData/TripAdvisorActiveUserPageInfoNew/',
        out_file='finish_active_user_list.txt'):
    """Write the stem of every entry in ``in_file_fold`` to ``out_file``.

    Each name returned by ``usefulAPI.get_dir_files(in_file_fold, False)``
    is cut at its first ``'.'``; the resulting strings are written one per
    line, without a trailing newline.

    :param in_file_fold: directory whose entries are listed.
    :param out_file: path of the text file that is created or truncated.
    """
    # NOTE(review): the names are presumably '<uid>.json' files, so the stem
    # would be the user id -- confirm against the code that fills the folder.
    file_name_list = usefulAPI.get_dir_files(in_file_fold, False)
    finished_names = [file_name.split('.')[0] for file_name in file_name_list]
    # A context manager guarantees the handle is flushed and closed, instead
    # of relying on the garbage collector to do it.
    with open(out_file, 'w+') as out_handle:
        out_handle.write('\n'.join(finished_names))
def get_finish_hotel_id_file(out_file='finish_hotel_id.txt'):
    """Record finished hotel folders and delete the incomplete ones.

    Every entry of the hotel review page folder is treated as a per-hotel
    sub-folder. A sub-folder with fewer than 10 entries is considered
    incomplete: each of its entries is printed and removed, then the
    sub-folder itself is removed. The ids of all remaining sub-folders are
    written to ``out_file``, one per line, in listing order.

    WARNING: this function deletes files and directories on disk.

    :param out_file: path of the text file that is created or truncated.
    """
    in_file_fold = '../Data/HtmlData/TripAdvisorHotelReviewPage/'
    finished_hotel_ids = []
    for hotel_id in usefulAPI.get_dir_files(in_file_fold, False):
        hotel_fold = in_file_fold + hotel_id + '/'
        # The results are handed straight to os.remove, so with the second
        # argument True the helper is expected to yield usable paths.
        file_list = usefulAPI.get_dir_files(hotel_fold, True)
        if len(file_list) >= 10:
            finished_hotel_ids.append(hotel_id)
            continue
        for file_path in file_list:
            print(file_path)
            os.remove(file_path)
        os.rmdir(hotel_fold)
    # Close the output deterministically rather than leaking the handle.
    with open(out_file, 'w+') as out_handle:
        out_handle.write('\n'.join(finished_hotel_ids))
Exemple #3
0
def get_all_user_info(in_user_file_fold):
    """Load the user records of every file in a folder, de-duplicated by uid.

    Each path returned by ``usefulAPI.get_dir_files(in_user_file_fold, True)``
    is parsed with ``usefulDataStruct.load_user_info_dat_txt``. When several
    records share the same ``'uid'`` only the first one encountered is kept.

    :param in_user_file_fold: folder holding the user info files.
    :return: list of user info dicts with unique ``'uid'`` values, in the
        order they were first seen.
    """
    seen_uids = set()
    file_name_list = usefulAPI.get_dir_files(in_user_file_fold, True)
    user_info_dict_list = []
    for file_name in file_name_list:
        loaded_records = usefulDataStruct.load_user_info_dat_txt(
            user_info_file=file_name)
        for user_info_dict in loaded_records:
            uid = user_info_dict['uid']
            if uid in seen_uids:
                continue
            seen_uids.add(uid)
            user_info_dict_list.append(user_info_dict)
    return user_info_dict_list
def load_all_user_info_dict_list(
    in_file_fold='../Data/TxtData/TripAdvisorActiveUserPageInfoNew/',
    out_file='../Data/TxtData/TripAdvisorActiveUserPageInfoJson/active_user_more_info.json'
):
    """Merge the user records of every file in a folder into a single file.

    Each path returned by ``usefulAPI.get_dir_files(in_file_fold, True)`` is
    loaded with ``usefulDataStruct.load_user_info_dict_json``; the combined
    records are counted on stdout and passed to
    ``usefulDataStruct.print_out_user_info_dat_to_json`` together with
    ``out_file``.

    :param in_file_fold: folder holding the per-user files.
    :param out_file: destination handed to the JSON writer.
    :return: the combined list of user info dicts.
    """
    all_user_info_dict_list = []
    for file_name in usefulAPI.get_dir_files(in_file_fold, True):
        # extend() grows the list in place; the former ``a = a + b`` copied
        # the whole accumulated list once per input file.
        all_user_info_dict_list.extend(
            usefulDataStruct.load_user_info_dict_json(file_name))
    print(len(all_user_info_dict_list))
    usefulDataStruct.print_out_user_info_dat_to_json(all_user_info_dict_list,
                                                     out_file)
    return all_user_info_dict_list
def change_dat(
        in_file_fold='../Data/TxtData/TripAdvisorActiveUserPageInfo/',
        out_file_fold='../Data/TxtData/TripAdvisorActiveUserPageInfoNew/'):
    """Split the user info files of a folder into one output file per record.

    Every path returned by ``usefulAPI.get_dir_files(in_file_fold, True)`` is
    parsed with ``usefulDataStruct.load_user_info_dat_txt`` using the
    tab-separated column list below. Each record is then passed on its own,
    as a one-element list, to
    ``usefulDataStruct.print_out_user_info_dat_to_json`` with the target
    ``out_file_fold + <uid> + '.json'``.

    :param in_file_fold: folder holding the source files.
    :param out_file_fold: prefix for the output paths; it is concatenated
        as-is, so it must already end with a path separator.
    """
    dict_str = 'user_name\tuid\tsrc\thotel_id\tall_review_number\thotel_review_number\tmore_info_page\tuser_info_str\tfull_profile_page\tall_review_dis'
    for file_name in usefulAPI.get_dir_files(in_file_fold, True):
        user_info_dict_list = usefulDataStruct.load_user_info_dat_txt(
            user_info_file=file_name, dict_str=dict_str)
        for user_info_dict in user_info_dict_list:
            # The target path depends only on the uid, so records sharing a
            # uid are all directed to the same output file.
            out_file = out_file_fold + user_info_dict['uid'] + '.json'
            # Pass a fresh one-element list instead of rebinding the name of
            # the list that is currently being iterated.
            usefulDataStruct.print_out_user_info_dat_to_json(
                user_info_dict_list=[user_info_dict],
                out_json_file=out_file)
def get_prepare_info(in_file_fold, all_user_info_dict_file):
    """Pair every user home page in a folder with its user info record.

    The records loaded from ``all_user_info_dict_file`` are indexed by their
    ``'uid'`` (a later record replaces an earlier one with the same uid).
    For each name returned by ``usefulAPI.get_dir_files(in_file_fold, False)``
    the uid is the name with every ``'.html'`` removed.

    :param in_file_fold: folder prefix of the home page files; it is
        concatenated with each name as-is.
    :param all_user_info_dict_file: file parsed by
        ``usefulDataStruct.load_user_info_dat_txt``.
    :return: ``(paths, infos)`` -- two lists of equal length where
        ``infos[i]`` is the record belonging to ``paths[i]``.
    :raises KeyError: if a page's uid has no record.
    """
    info_by_uid = dict(
        (info['uid'], info)
        for info in usefulDataStruct.load_user_info_dat_txt(
            all_user_info_dict_file))

    home_page_paths = []
    matched_infos = []
    for page_name in usefulAPI.get_dir_files(in_file_fold, False):
        home_page_paths.append(in_file_fold + page_name)
        matched_infos.append(info_by_uid[page_name.replace('.html', '')])
    return home_page_paths, matched_infos
def ana_all_hotel_pages(
    all_dat_filefold='../Data/HtmlData/TripAdvisorHotelInCitesHomePage/',
    out_json_file='../Data/TxtData/TripAdvisorHotelInCitesHomePage/hotel_homepage_info.json'
):
    """Analyse every city hotel home page in a folder and dump the results.

    For each name returned by ``usefulAPI.get_dir_files(all_dat_filefold,
    False)`` the city name is built by joining, without separator, the
    ``'_'``-delimited parts that precede the first part equal to
    ``'Hotels'`` (all parts when there is none). The page is then handed to
    ``ana_city_hotel_homepage`` together with a shared result list.
    Progress is printed for every 50th page. Finally the list is passed to
    ``check_repeat`` and to ``usefulAPI.print_out_dat_json``.

    :param all_dat_filefold: folder prefix of the pages; it is concatenated
        with each name as-is.
    :param out_json_file: destination handed to the JSON writer.
    """
    page_files = usefulAPI.get_dir_files(all_dat_filefold, False)
    total_pages = len(page_files)
    collected_info = []
    for page_idx, page_file in enumerate(page_files):
        name_parts = page_file.split('_')
        if 'Hotels' in name_parts:
            # Keep only what comes before the first 'Hotels' marker.
            name_parts = name_parts[:name_parts.index('Hotels')]
        city_name = ''.join(name_parts)
        if page_idx % 50 == 0:
            print('pageNum: ' + str(page_idx) + ' / ' + str(total_pages))
            print('page_file: ' + page_file)
            print('city_name: ' + city_name)
        ana_city_hotel_homepage(all_dat_filefold + page_file, city_name,
                                collected_info)
    print('length of all home info list = ' + str(len(collected_info)))
    check_repeat(collected_info)
    usefulAPI.print_out_dat_json(collected_info, out_json_file)
Exemple #8
0
def filter_demo_users(
        in_file_fold='../Data/TxtData/TripAdvisorActiveUserPageInfoNew/',
        out_file='a.json'):
    """Keep the users whose info string carries text before 'from'.

    Every path returned by ``usefulAPI.get_dir_files(in_file_fold, True)`` is
    parsed as JSON. A parsed record is kept when all of the following hold:

    * its ``'user_info_str'`` is not ``'NULL'``;
    * splitting that string on ``'-_-_-_-_-'`` yields exactly two parts;
    * the second part, after normalising ``'From'``/``'Man'``/``'Woman'`` to
      lower case, has a non-empty prefix before its first ``'from'``.

    The number of loaded and of kept records is printed, and the kept
    records are passed to ``usefulDataStruct.print_out_user_info_dat_to_json``.

    :param in_file_fold: folder holding the per-user JSON files.
    :param out_file: destination handed to the JSON writer.
    """
    # NOTE(review): the text before 'from' presumably holds demographic
    # details such as age or gender -- confirm against the scraped data.
    all_user_dict_list = []
    for file_path in usefulAPI.get_dir_files(in_file_fold, True):
        # Close each input as soon as it is read; the folder may hold many
        # files and leaked handles could exhaust the descriptor limit.
        with open(file_path, 'r') as in_handle:
            all_user_dict_list.append(json.loads(in_handle.read()))
    print(len(all_user_dict_list))

    filter_user_dict_list = []
    for user_dict_info in all_user_dict_list:
        user_info_str = user_dict_info['user_info_str']
        if user_info_str == 'NULL':
            continue
        user_info_str_list = user_info_str.split('-_-_-_-_-')
        if len(user_info_str_list) != 2:
            continue
        demo_str = user_info_str_list[1].replace('From', 'from').replace(
            'Man', 'man').replace('Woman', 'woman')
        if demo_str.split('from')[0] == '':
            continue
        filter_user_dict_list.append(user_dict_info)
    print(len(filter_user_dict_list))
    usefulDataStruct.print_out_user_info_dat_to_json(filter_user_dict_list,
                                                     out_file)
def get_hotel_review_files(
        in_file_fold='../Data/HtmlData/TripAdvisorHotelReviewPage/hotel_206921/'
):
    """Return the listing of a hotel review page folder.

    Thin wrapper around ``usefulAPI.get_dir_files(in_file_fold, True)``.

    :param in_file_fold: folder to list; defaults to the folder of one
        specific hotel (``hotel_206921``).
    :return: whatever ``usefulAPI.get_dir_files`` returns for that folder
        (presumably a list of paths -- verify against the helper).
    """
    return usefulAPI.get_dir_files(in_file_fold, True)
#coding=utf-8

from UsefulLibs import usefulAPI
import shutil

if __name__ == '__main__':
    # Flatten the city home page folder: move everything found inside each
    # listed entry of the destination folder up into the destination itself.
    dst_file_fold = '../Data/HtmlData/TripAdvisorHotelInCitesHomePage/'
    src_file_fold_list = usefulAPI.get_dir_files(dst_file_fold, True)
    print(src_file_fold_list)
    for sub_fold in src_file_fold_list:
        # Each listed entry is treated as a directory and listed in turn.
        for nested_path in usefulAPI.get_dir_files(sub_fold + '/', True):
            shutil.move(nested_path, dst_file_fold)
 def get_hotel_review_file_list(self):
     """Store the listing of this hotel's review folder on the instance.

     The folder is ``self.hotel_reviw_file_fold + self.hotel_id + '/'``; the
     result of ``usefulAPI.get_dir_files`` for it is saved in
     ``self.hotel_review_file_list``. Nothing is returned.
     """
     file_fold = self.hotel_reviw_file_fold + self.hotel_id + '/'
     self.hotel_review_file_list = usefulAPI.get_dir_files(
         file_fold, is_contain_dir=True)