def merge_per_time(dataset):
    """
    Merge the dataset entries that are separated by time.
    :dataset: target dataset
    :return: none
    """
    global data_mode, out_dir, per_day_path, valid_urls, dict_new_ts

    write_log('Merging per_time Start')
    time_files = get_files_under_path(per_day_path + '/per_time')

    list_merged = []

    write_log('Merging per_time : Load Start')
    for time_path in time_files:
        with open(time_path, 'r') as f_data:
            list_per_time = json.load(f_data)
        list_merged += list_per_time
        list_per_time = None
    write_log('Merging per_time : Load End')

    write_log('Merging per_time : Sort Start')
    # (timestamp, user_id, url)
    list_merged = list(filter(lambda x: x[2] in valid_urls, list_merged))
    list_merged.sort(key=lambda x: x[0])

    # time interval compression
    new_timestamp = 1
    if dataset == 'glob_':
        dict_new_ts = {}
        prev_ts = -1
        for ts in [x[0] for x in list_merged]:
            if prev_ts < 0:
                dict_new_ts[str(ts)] = new_timestamp
                prev_ts = ts
                continue
            if prev_ts == ts:
                continue
            # cap the gap between consecutive distinct timestamps at three hours
            new_timestamp += min(ts - prev_ts, 60 * 60 * 3)
            dict_new_ts[str(ts)] = new_timestamp
            prev_ts = ts

        list_merged = [(dict_new_ts[str(x[0])], x[1], x[2]) for x in list_merged]
    write_log('Merging per_time : Sort End')

    with open(out_dir + '/per_time.json', 'w') as f_time:
        json.dump(list_merged, f_time)
    list_merged = None

    write_log('Merging per_time End')
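# Illustrative sketch, not part of the original pipeline: the 'time interval
# compression' above remaps raw timestamps so that the gap between consecutive
# distinct events is capped at three hours, keeping long idle periods from
# stretching the time axis. The helper name `compress_timestamps` below is
# hypothetical and only documents the idea on a plain sorted list of timestamps.
def compress_timestamps(sorted_ts, max_gap=60 * 60 * 3):
    """Map each distinct timestamp (keyed as a string) to a gap-capped new timestamp."""
    mapping = {}
    new_ts = 1
    prev_ts = None
    for ts in sorted_ts:
        if prev_ts is None:
            mapping[str(ts)] = new_ts  # first event starts the compressed axis at 1
        elif ts != prev_ts:
            new_ts += min(ts - prev_ts, max_gap)
            mapping[str(ts)] = new_ts
        prev_ts = ts
    return mapping

# Example: compress_timestamps([100, 100, 160, 100000])
#   -> {'100': 1, '160': 61, '100000': 10861}   (the ~27.7 h gap is capped at 10800 s)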
def generate_simple_dataset():
    global one_week_path, simple_path

    for data_path in get_files_under_path(one_week_path):
        simple_data = ''
        with open(data_path, 'r') as f_data:
            for i in range(1000):
                simple_data += f_data.readline().strip() + '\n'

        target_path = os.path.join(simple_path, os.path.basename(data_path))
        with open(target_path, 'w') as f_simple:
            f_simple.write(simple_data)
def merge_per_user(dataset):
    """
    Merge the dataset entries that are separated by user.
    :dataset: target dataset
    :return: none
    """
    global data_mode, out_dir, per_day_path, valid_urls, dict_new_ts

    write_log('Merging per_user Start')
    user_files = get_files_under_path(per_day_path + '/per_user')

    dict_merged = {}
    total_count = len(user_files)
    count = 0
    for user_path in user_files:
        write_log('Merging per_user : {}/{}'.format(count, total_count))
        count += 1
        with open(user_path, 'r') as f_data:
            dict_per_user = json.load(f_data)
        write_log('Merging per_user Loaded: {}/{}'.format(count, total_count))

        for key in dict_per_user.keys():
            dict_merged[key] = dict_merged.get(key, []) + dict_per_user[key]
        write_log('Merging per_user Merged: {}/{}'.format(count, total_count))
        dict_per_user = None

    write_log('Merging per_user : sorting start')
    for user_id in dict_merged:
        # (timestamp, url)
        dict_merged[user_id] = list(
            filter(lambda x: x[1] in valid_urls, dict_merged[user_id]))

        # time interval compression
        if dataset == 'glob_':
            dict_merged[user_id] = [(dict_new_ts[str(x[0])], x[1])
                                    for x in dict_merged[user_id]]

        dict_merged[user_id].sort(key=lambda x: x[0])
    write_log('Merging per_user : sorting end')

    write_log('Merging per_user : writing start')
    with open(out_dir + '/per_user.json', 'w') as f_user:
        json.dump(dict_merged, f_user)

    write_log('Merging per_user End')
    dict_merged = None
def generate_merged_sequences():
    """
    Generate the merged sequences from the separated inputs.
    :return: none (the merged sequences for all users are stored in the global merged_sequences)
    """
    global separated_output_dir_path, merged_sequences, dict_per_user, dict_usr2idx

    merged_sequences = []

    separated_files = get_files_under_path(separated_output_dir_path)
    for separated_file in separated_files:
        with open(separated_file, 'r') as f_dict:
            separated_dict = json.load(f_dict)

        # separated_dict[user_id] = {
        #     'start_time': start_time,
        #     'end_time': end_time,
        #     'sequence': idx_sequence,
        #     'time_sequence': time_sequence,
        # }
        for user_id, dict_data in separated_dict.items():
            seq_len = len(dict_data['sequence'])
            # skip sequences with a single click
            if seq_len <= 1:
                continue

            sequence_entry = (dict_data['start_time'], dict_data['end_time'],
                              dict_usr2idx[user_id], dict_data['sequence'],
                              dict_data['time_sequence'])
            merged_sequences.append(sequence_entry)

            # st = 0
            # st_step = max(1, int((seq_len - 20) / 5) + 1)
            # while (st == 0) or (st + 20 <= seq_len):
            #     cur_seq = dict_data['sequence'][st:st+20]
            #     cur_t_seq = dict_data['time_sequence'][st:st+20]
            #
            #     sequence_entry = (cur_t_seq[0], cur_t_seq[-1], dict_usr2idx[user_id],
            #                       cur_seq, cur_t_seq)
            #
            #     merged_sequences.append(sequence_entry)
            #
            #     st += st_step

    merged_sequences.sort(key=lambda x: x[0])
def main():
    """
    main function
    """
    global data_mode, out_dir, data_path, dict_url2id

    options, args = parser.parse_args()
    if (options.mode is None) or (options.output is None) or (options.dataset is None):
        return

    data_mode = options.mode
    out_dir = options.output
    dataset = options.dataset

    if dataset not in ['adressa', 'glob']:
        print('Wrong dataset name : {}'.format(dataset))
        return

    if dataset == 'adressa':
        data_path = 'data/' + data_mode
        worker_fn = raw_to_per_day
    elif dataset == 'glob':
        data_path = 'data/glob'
        if data_mode == 'simple':
            data_path += '/simple'
        else:
            data_path += '/clicks'
        worker_fn = raw_to_per_day_glob

    os.system('mkdir -p {}'.format(out_dir + '/per_user'))
    os.system('mkdir -p {}'.format(out_dir + '/per_time'))

    works = get_files_under_path(data_path)

    dict_url2id = {}
    with ThreadPool(8) as pool:
        pool.map(worker_fn, works)

    with open(out_dir + '/url2id.json', 'w') as f_dict:
        json.dump(dict_url2id, f_dict)
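# Usage sketch (assumption): the OptionParser configured elsewhere in this script
# exposes the destinations `mode`, `output`, and `dataset`; the exact flag names are
# not shown in this section, so the invocation below is illustrative only.
#
#   python3 <this_script>.py --mode simple --output cache/adressa --dataset adressa
#
# For dataset 'adressa' this reads raw files from data/<mode>, maps raw_to_per_day
# over them with a ThreadPool(8), creates <output>/per_user and <output>/per_time
# for the workers, and dumps the shared URL dictionary to <output>/url2id.json.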
def generate_merged_sequences():
    global separated_output_dir_path, merged_sequences

    merged_sequences = []

    separated_files = get_files_under_path(separated_output_dir_path)
    for separated_file in separated_files:
        with open(separated_file, 'r') as f_dict:
            separated_dict = json.load(f_dict)

        # separated_dict[user_id] = {
        #     'start_time': start_time,
        #     'end_time': end_time,
        #     'sequence': idx_sequence,
        # }
        for user_id, dict_data in separated_dict.items():
            sequence_entry = (dict_data['start_time'], dict_data['end_time'],
                              dict_data['sequence'])
            merged_sequences.append(sequence_entry)

    merged_sequences.sort(key=lambda x: x[0])
def generate_rnn_input(seperated_input_path=None, output_path=None):
    """
    Generate the RNN input for each task.
    :seperated_input_path: path of the input directory storing RNN inputs separated by user
    :output_path: output path where the RNN input is saved
    :return: none
    """
    global dict_url_idx, dict_per_time

    if (seperated_input_path is None) or (output_path is None):
        return

    merged_sequences = []

    write_log('Merging separated infos ...')
    for seperated_path in get_files_under_path(seperated_input_path):
        with open(seperated_path, 'r') as f_dict:
            seperated_dict = json.load(f_dict)

        # seperated_dict[user_id] = {
        #     'start_time': start_time,
        #     'end_time': end_time,
        #     'sequence': idx_sequence,
        # }
        # dict_url_idx
        for user_id, dict_data in seperated_dict.items():
            sequence_entry = (dict_data['start_time'], dict_data['end_time'],
                              dict_data['sequence'])
            merged_sequences.append(sequence_entry)
    write_log('Merging separated infos ... Done !')

    write_log('Sort by time : start')
    merged_sequences.sort(key=lambda x: x[0])
    write_log('Sort by time : end')

    timestamp_tuple = list(map(lambda x: tuple((x[0], x[1])), merged_sequences))
    seq_len = list(map(lambda x: len(x[2]), merged_sequences))
    sequence = list(map(lambda x: x[2], merged_sequences))

    write_log('Generate idx2url : start')
    merged_sequences = None
    dict_idx2url = {idx: word for word, idx in dict_url_idx.items()}
    write_log('Generate idx2url : end')

    write_log('Generate candidate data structure : start')
    dict_time_idx = {}
    prev_timestamp = None
    for (timestamp, user_id, url) in dict_per_time:
        if prev_timestamp != timestamp:
            if prev_timestamp is not None:
                dict_time_idx[prev_timestamp]['next_time'] = timestamp
            dict_time_idx[timestamp] = {
                'prev_time': prev_timestamp,
                'next_time': None,
                'indices': {},
            }

        idx_of_url = dict_url_idx[url]
        dict_time_idx[timestamp]['indices'][idx_of_url] = \
            dict_time_idx[timestamp]['indices'].get(idx_of_url, 0) + 1
        prev_timestamp = timestamp
    write_log('Generate candidate data structure : end')

    write_log('Save rnn_inputs : start')
    dict_rnn_input = {
        'timestamp': timestamp_tuple,
        'seq_len': seq_len,
        'sequence': sequence,
        'idx2url': dict_idx2url,
        'time_idx': dict_time_idx,
    }
    with open(output_path, 'w') as f_input:
        json.dump(dict_rnn_input, f_input)
    write_log('Save rnn_inputs : end')
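# Illustrative sketch (assumed toy data, not from the original code): given
#
#   dict_per_time = [(10, 'u1', 'urlA'), (10, 'u2', 'urlB'), (25, 'u1', 'urlA')]
#   dict_url_idx  = {'urlA': 0, 'urlB': 1}
#
# the candidate structure built above would be
#
#   dict_time_idx = {
#       10: {'prev_time': None, 'next_time': 25,   'indices': {0: 1, 1: 1}},
#       25: {'prev_time': 10,   'next_time': None, 'indices': {0: 1}},
#   }
#
# i.e. a timestamp-keyed linked list whose 'indices' map counts how many clicks each
# article index received at that (possibly compressed) timestamp, presumably so that
# later stages can look up candidate articles around a given event time.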