Example #1
def merge_per_time(dataset):
    """
        merge the dataset, which is split into per-time files
        :dataset: target dataset
        :return: none
    """
    global data_mode, out_dir, per_day_path, valid_urls, dict_new_ts
    write_log('Merging per_time Start')

    time_files = get_files_under_path(per_day_path + '/per_time')

    list_merged = []

    write_log('Merging per_time : Load Start')
    for time_path in time_files:
        with open(time_path, 'r') as f_data:
            list_per_time = json.load(f_data)

        list_merged += list_per_time
        list_per_time = None
    write_log('Merging per_time : Load End')

    write_log('Merging per_time : Sort Start')
    # (timestamp, user_id, url)
    list_merged = list(filter(lambda x: x[2] in valid_urls, list_merged))
    list_merged.sort(key=lambda x: x[0])

    # time interval compression: remap raw timestamps onto a compressed axis,
    # capping any gap between consecutive events at 3 hours
    new_timestamp = 1
    if dataset == 'glob_':
        dict_new_ts = {}
        prev_ts = -1
        for ts in [x[0] for x in list_merged]:
            if prev_ts < 0:
                dict_new_ts[str(ts)] = new_timestamp
                prev_ts = ts
                continue

            if prev_ts == ts:
                continue

            new_timestamp += min(ts - prev_ts, 60 * 60 * 3)
            dict_new_ts[str(ts)] = new_timestamp

            prev_ts = ts
        list_merged = [(dict_new_ts[str(x[0])], x[1], x[2])
                       for x in list_merged]

    write_log('Merging per_time : Sort End')

    with open(out_dir + '/per_time.json', 'w') as f_time:
        json.dump(list_merged, f_time)

    list_merged = None

    write_log('Merging per_time End')
Example #2
def generate_simple_dataset():
    """
        generate a small sample dataset from the first 1,000 lines of each file
        :return: none
    """
    global one_week_path, simple_path

    for data_path in get_files_under_path(one_week_path):
        simple_data = ''
        with open(data_path, 'r') as f_data:
            # keep only the first 1,000 lines of each file, stopping early at EOF
            for i in range(1000):
                line = f_data.readline()
                if not line:
                    break
                simple_data += line.strip() + '\n'

        target_path = os.path.join(simple_path, os.path.basename(data_path))
        with open(target_path, 'w') as f_simple:
            f_simple.write(simple_data)
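
Both examples above rely on helpers such as get_files_under_path and write_log that are defined elsewhere in the repository; a minimal sketch of what they might look like (the traversal order and log format are assumptions):

import os
import datetime


def get_files_under_path(path):
    # Hypothetical helper: collect the full paths of all regular files under `path`.
    return [os.path.join(root, name)
            for root, _, names in os.walk(path)
            for name in sorted(names)]


def write_log(message):
    # Hypothetical helper: print a timestamped progress message.
    print('[{}] {}'.format(datetime.datetime.now().isoformat(), message))
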
Example #3
def merge_per_user(dataset):
    """
        merge the dataset, which is split into per-user files
        :dataset: target dataset
        :return: none
    """
    global data_mode, out_dir, per_day_path, valid_urls, dict_new_ts

    write_log('Merging per_user Start')
    user_files = get_files_under_path(per_day_path + '/per_user')

    dict_merged = {}

    total_count = len(user_files)
    count = 0
    for user_path in user_files:
        write_log('Merging per_user : {}/{}'.format(count, total_count))
        count += 1
        with open(user_path, 'r') as f_data:
            dict_per_user = json.load(f_data)
        write_log('Merging per_user Loaded: {}/{}'.format(count, total_count))

        for key in dict_per_user.keys():
            dict_merged[key] = dict_merged.get(key, []) + dict_per_user[key]

        write_log('Merging per_user Merged: {}/{}'.format(count, total_count))
        dict_per_user = None

    write_log('Merging per_user : sorting start')
    for user_id in dict_merged:
        # (timestamp, url)
        dict_merged[user_id] = list(
            filter(lambda x: x[1] in valid_urls, dict_merged[user_id]))
        # time interval compression
        if dataset == 'glob_':
            dict_merged[user_id] = [(dict_new_ts[str(x[0])], x[1])
                                    for x in dict_merged[user_id]]
        dict_merged[user_id].sort(key=lambda x: x[0])
    write_log('Merging per_user : sorting end')

    write_log('Merging per_user : writing start')
    with open(out_dir + '/per_user.json', 'w') as f_user:
        json.dump(dict_merged, f_user)
    write_log('Merging per_user End')

    dict_merged = None
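
A possible way to drive the two merge functions above, assuming this runs inside the same module; the globals below are placeholders that would normally be filled by the earlier preprocessing steps:

# Hypothetical driver code; the paths and the valid_urls filter are placeholders.
per_day_path = 'cache/adressa/per_day'
out_dir = 'cache/adressa'
valid_urls = set()          # normally the set of URLs that survive filtering

merge_per_time('adressa')   # writes out_dir/per_time.json (and builds dict_new_ts for 'glob_')
merge_per_user('adressa')   # writes out_dir/per_user.json, reusing dict_new_ts when dataset is 'glob_'
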
Example #4
def generate_merged_sequences():
    """
        generate the merged sequences from the separated inputs
        :return: none (the merged sequences are stored in the global merged_sequences)
    """
    global separated_output_dir_path, merged_sequences, dict_per_user, dict_usr2idx

    merged_sequences = []
    separated_files = get_files_under_path(separated_output_dir_path)

    for separated_file in separated_files:
        with open(separated_file, 'r') as f_dict:
            separated_dict = json.load(f_dict)

#        separated_dict[user_id] = {
#            'start_time': start_time,
#            'end_time': end_time,
#            'sequence': idx_sequence,
#            'time_sequence': time_sequence,
#        }

        for user_id, dict_data in separated_dict.items():
            seq_len = len(dict_data['sequence'])
            if seq_len <= 1:
                continue

            sequence_entry = (dict_data['start_time'], dict_data['end_time'],
                              dict_usr2idx[user_id], dict_data['sequence'],
                              dict_data['time_sequence'])
            merged_sequences.append(sequence_entry)
#            st = 0
#            st_step = max(1, int((seq_len - 20) / 5) + 1)
#            while (st == 0) or (st + 20 <= seq_len):
#                cur_seq = dict_data['sequence'][st:st+20]
#                cur_t_seq = dict_data['time_sequence'][st:st+20]
#
#                sequence_entry = (cur_t_seq[0], cur_t_seq[-1], dict_usr2idx[user_id],
#                    cur_seq, cur_t_seq)
#
#                merged_sequences.append(sequence_entry)
#
#                st += st_step

    merged_sequences.sort(key=lambda x: x[0])
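
The function above assumes dict_usr2idx maps every user id to an integer index; a minimal sketch of how such a mapping could be built from the per_user.json file written by merge_per_user (the path is a placeholder):

import json

# Hypothetical construction of dict_usr2idx from the merged per-user dictionary.
with open('cache/adressa/per_user.json', 'r') as f_user:
    dict_per_user = json.load(f_user)

# Assign a dense integer index to every user id in a stable order.
dict_usr2idx = {user_id: idx for idx, user_id in enumerate(sorted(dict_per_user.keys()))}
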
Example #5
def main():
    """
        main function
    """
    global data_mode, out_dir, data_path, dict_url2id
    options, args = parser.parse_args()

    if (options.mode is None) or (options.output is None) or (options.dataset is None):
        return

    data_mode = options.mode
    out_dir = options.output
    dataset = options.dataset

    if dataset not in ['adressa', 'glob']:
        print('Wrong dataset name : {}'.format(dataset))
        return

    if dataset == 'adressa':
        data_path = 'data/' + data_mode
        worker_fn = raw_to_per_day
    elif dataset == 'glob':
        data_path = 'data/glob'
        if data_mode == 'simple':
            data_path += '/simple'
        else:
            data_path += '/clicks'
        worker_fn = raw_to_per_day_glob

    os.system('mkdir -p {}'.format(out_dir + '/per_user'))
    os.system('mkdir -p {}'.format(out_dir + '/per_time'))

    works = get_files_under_path(data_path)

    dict_url2id = {}
    with ThreadPool(8) as pool:
        pool.map(worker_fn, works)

    with open(out_dir + '/url2id.json', 'w') as f_dict:
        json.dump(dict_url2id, f_dict)
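
main() reads options.mode, options.output and options.dataset from an optparse parser defined elsewhere; a minimal sketch of how that parser might be declared (the flag names and help strings are assumptions, only the dest names are implied by main()):

from optparse import OptionParser

# Hypothetical parser definition.
parser = OptionParser()
parser.add_option('-m', '--mode', dest='mode', help='data mode (e.g. simple)')
parser.add_option('-o', '--output', dest='output', help='output directory')
parser.add_option('-d', '--dataset', dest='dataset', help='dataset name: adressa or glob')
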
Example #6
def generate_merged_sequences():
    global separated_output_dir_path, merged_sequences

    merged_sequences = []
    separated_files = get_files_under_path(separated_output_dir_path)

    for separated_file in separated_files:
        with open(separated_file, 'r') as f_dict:
            separated_dict = json.load(f_dict)


#        separated_dict[user_id] = {
#            'start_time': start_time,
#            'end_time': end_time,
#            'sequence': idx_sequence,
#        }

        for user_id, dict_data in separated_dict.items():
            sequence_entry = (dict_data['start_time'], dict_data['end_time'],
                              dict_data['sequence'])
            merged_sequences.append(sequence_entry)

    merged_sequences.sort(key=lambda x: x[0])
Example #7
def generate_rnn_input(seperated_input_path=None, output_path=None):
    """
    generate an RNN input of each task
    :seperated_input_path: path of the input directory storing RNN input seperated by the user
    :output_path: path of output to save RNN input
    :return: none
    """
    global dict_url_idx, dict_per_time

    if (seperated_input_path is None) or (output_path is None):
        return

    merged_sequences = []

    write_log('Merging seperated infos ...')
    for seperated_path in get_files_under_path(seperated_input_path):
        with open(seperated_path, 'r') as f_dict:
            seperated_dict = json.load(f_dict)

#        seperated_dict[user_id] = {
#            'start_time': start_time,
#            'end_time': end_time,
#            'sequence': idx_sequence,
#        }

        # dict_url_idx
        for user_id, dict_data in seperated_dict.items():
            sequence_entry = (dict_data['start_time'], dict_data['end_time'],
                              dict_data['sequence'])
            merged_sequences.append(sequence_entry)

    write_log('Merging seperated infos ...  Done !')
    write_log('Sort by time : start')
    merged_sequences.sort(key=lambda x: x[0])
    write_log('Sort by time : end')

    timestamp_tuple = list(map(lambda x: (x[0], x[1]), merged_sequences))
    seq_len = list(map(lambda x: len(x[2]), merged_sequences))
    sequence = list(map(lambda x: x[2], merged_sequences))

    write_log('Generate idx2url : start')
    merged_sequences = None
    dict_idx2url = {idx: word for word, idx in dict_url_idx.items()}
    write_log('Generate idx2url : end')

    write_log('Generate candidate data structure : start')
    dict_time_idx = {}

    prev_timestamp = None
    for (timestamp, user_id, url) in dict_per_time:
        if prev_timestamp != timestamp:
            if prev_timestamp is not None:
                dict_time_idx[prev_timestamp]['next_time'] = timestamp
            dict_time_idx[timestamp] = {
                'prev_time': prev_timestamp,
                'next_time': None,
                'indices': {},
            }

        idx_of_url = dict_url_idx[url]
        dict_time_idx[timestamp]['indices'][idx_of_url] = \
            dict_time_idx[timestamp]['indices'].get(idx_of_url, 0) + 1

        prev_timestamp = timestamp

    write_log('Generate candidate data structure : end')

    write_log('Save rnn_inputs : start')
    dict_rnn_input = {
        'timestamp': timestamp_tuple,
        'seq_len': seq_len,
        'sequence': sequence,
        'idx2url': dict_idx2url,
        'time_idx': dict_time_idx,
    }

    with open(output_path, 'w') as f_input:
        json.dump(dict_rnn_input, f_input)
    write_log('Save rnn_inputs : end')
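
A small sketch of how the saved RNN input could be read back for a quick sanity check; the output path is a placeholder:

import json

with open('cache/adressa/rnn_input.json', 'r') as f_input:
    dict_rnn_input = json.load(f_input)

# Each sequence is aligned with its (start_time, end_time) pair and its length.
for (start_time, end_time), length in zip(dict_rnn_input['timestamp'][:5],
                                          dict_rnn_input['seq_len'][:5]):
    print(start_time, end_time, length)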