def main(): """ main function """ global data_mode, out_dir, per_day_path, w2v_path, valid_urls options, args = parser.parse_args() if (options.mode == None) or (options.output == None) or (options.input == None) or \ (options.w2v_path == None) or (options.dataset == None): return data_mode = options.mode per_day_path = options.input out_dir = options.output w2v_path = options.w2v_path dataset = options.dataset if dataset not in ['adressa', 'glob']: print('Wrong dataset name : {}'.format(dataset)) return os.system('mkdir -p {}'.format(out_dir)) write_log('w2v Load : start') with open(w2v_path, 'r') as f_w2v: dict_w2v = json.load(f_w2v) write_log('w2v Load : end') valid_urls = dict_w2v.keys() dict_w2v = None merge_per_time(dataset) merge_per_user(dataset)
def load(self, epoch=None, model_path=None):
    if model_path is None or not os.path.exists(model_path):
        return 0

    states = torch.load(model_path)

    self._model.load_state_dict(states['model'])
    self._optimizer.load_state_dict(states['optimizer'])

    write_log('Model loaded!! - {}'.format(model_path))
    return states['epoch']
def save(self, epoch=0, model_path=None):
    if model_path is None:
        return

    states = {
        'epoch': epoch,
        'model': self._model.state_dict(),
        'optimizer': self._optimizer.state_dict(),
    }

    torch.save(states, model_path)
    write_log('Model saved! - {}'.format(model_path))
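# A minimal usage sketch (not part of the original sources) of how save()/load() above
# are typically chained to resume training. The trainer object, its train_single_epoch()
# method, the checkpoint path, and num_epochs are hypothetical placeholders.
def resume_and_train(trainer, num_epochs, model_path='checkpoint.pth.tar'):
    start_epoch = trainer.load(model_path=model_path)  # returns 0 when no checkpoint exists
    for epoch in range(start_epoch, num_epochs):
        trainer.train_single_epoch(epoch)              # hypothetical per-epoch training step
        trainer.save(epoch=epoch + 1, model_path=model_path)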
def main(): """ main function """ global contentdata_path, dict_article_info options, args = parser.parse_args() if (options.output == None) or (options.url2id == None) or (options.input == None) \ or (options.dataset == None) or (options.glob_meta == None): return contentdata_path = options.input out_path = options.output url2id_path = options.url2id dataset = options.dataset glob_meta_path = options.glob_meta if dataset not in ['adressa', 'glob']: print('Wrong dataset name : {}'.format(dataset)) return dict_article_info = {} if dataset == 'adressa': with open(url2id_path, 'r') as f_dict: dict_url2id = json.load(f_dict) write_log('Starting threads') with ThreadPool(8) as pool: pool.map(extract_article_info, dict_url2id.items()) write_log('Thread works done') elif dataset == 'glob': with open(glob_meta_path, 'r') as f_meta: lines = f_meta.readlines() dict_header_idx = None for line in lines: line = line.strip() if dict_header_idx == None: dict_header_idx = {} for i, k in enumerate(line.split(',')): dict_header_idx[k] = i continue line_split = line.split(',') url = 'url_{}'.format(line_split[dict_header_idx['article_id']]) category_id = 'cate_{}'.format(line_split[dict_header_idx['category_id']]) dict_article_info[url] = { 'category0': category_id, } write_log('Save to {}'.format(out_path)) with open(out_path, 'w') as f_json: json.dump(dict_article_info, f_json) write_log('Done')
def preprocess_rnn_input(args=(-1, [])):
    """
    worker task: generate the RNN input sequence for each user in its share of user ids
    :args: (worker_id, user_ids) tuple of task arguments
    :return: none
    """
    global dict_per_user, dict_url_idx, seperated_output_path

    max_seq_len = 20
    worker_id, user_ids = args
    write_log('worker({}) : start'.format(worker_id))

    dict_data = {}
    for user_id in user_ids:
        # remove consecutive duplicates from the click sequence, e.g.
        # "cx:i68bn3gbf0ql786n:1hyr7mridb1el": [[1483570820, "http://adressa.no/100sport/ballsport/byasen-fiasko-mot-tabelljumboen-228288b.html"]]
        sequence = []
        prev_url = None
        for seq_entry in dict_per_user[user_id]:
            timestamp, url = seq_entry
            if (prev_url is None) or (url != prev_url):
                prev_url = url
                sequence.append(seq_entry)

        seq_len = len(sequence)
        if seq_len < 2:
            continue
        if seq_len > max_seq_len:
            sequence = sequence[-max_seq_len:]

        start_time = sequence[0][0]
        end_time = sequence[-1][0]
        idx_sequence = list(map(lambda x: dict_url_idx[x[1]], sequence))

        dict_data[user_id] = {
            'start_time': start_time,
            'end_time': end_time,
            'sequence': idx_sequence,
        }

    with open(seperated_output_path + '/' + str(worker_id) + '_data.json', 'w') as f_out:
        json.dump(dict_data, f_out)

    write_log('worker({}) : end'.format(worker_id))
def main(): """ main function """ global dict_per_user, dict_per_time, dict_url_idx, seperated_output_path options, args = parser.parse_args() if (options.data_path == None) or (options.output_file_path == None): return per_time_path = options.data_path + '/per_time.json' per_user_path = options.data_path + '/per_user.json' output_path = options.output_file_path seperated_output_path = output_path + '/seperated' if not os.path.exists(output_path): os.system('mkdir -p ' + output_path) if not os.path.exists(seperated_output_path): os.system('mkdir -p ' + seperated_output_path) write_log('Preprocessing ...') with open(per_user_path, 'r') as f_user: dict_per_user = json.load(f_user) with open(per_time_path, 'r') as f_time: dict_per_time = json.load(f_time) user_ids = list(dict_per_user.keys()) dict_url_idx = generate_unique_url_idxs() write_log('Preprocessing End : total {} user_ids'.format(len(user_ids))) n_div = 100 multi_worker = MultiWorker(worker_count=10) works = list(map(lambda x: (x[0], x[1]), [(i, user_ids[i::n_div]) for i in range(n_div)])) multi_worker.work(works=works, work_function=preprocess_rnn_input) multi_worker = None # genrate_rnn_input generate_rnn_input(seperated_input_path=seperated_output_path, output_path=output_path + '/rnn_input.json')
def raw_to_per_day(raw_path):
    """
    extract user-specific interaction data for each file in parallel
    :raw_path: path of the raw data file
    :return: none
    """
    global out_dir, dict_url2id

    write_log('Processing : {}'.format(raw_path))
    with open(raw_path, 'r') as f_raw:
        lines = f_raw.readlines()

    dict_per_user = {}
    list_per_time = []

    total_count = len(lines)
    count = 0
    for line in lines:
        if count % 10000 == 0:
            write_log('Processing({}) : {}/{}'.format(raw_path, count, total_count))
        count += 1

        line = line.strip()
        line_json = json.loads(line)

        user_id = line_json.get('userId', None)
        url = find_best_url(event_dict=line_json)
        time = line_json.get('time', -1)
        article_id = line_json.get('id', None)

        if (user_id is None) or (url is None) or (time < 0) or (article_id is None):
            continue

        if dict_per_user.get(user_id, None) is None:
            dict_per_user[user_id] = []
        dict_per_user[user_id].append(tuple((time, url)))

        list_per_time.append(tuple((time, user_id, url)))
        dict_url2id[url] = article_id

    lines = None

    per_user_path = out_dir + '/per_user/' + os.path.basename(raw_path)
    per_time_path = out_dir + '/per_time/' + os.path.basename(raw_path)

    with open(per_user_path, 'w') as f_user:
        json.dump(dict_per_user, f_user)
    with open(per_time_path, 'w') as f_time:
        json.dump(list_per_time, f_time)

    dict_per_user = None
    list_per_time = None

    write_log('Done : {}'.format(raw_path))
def work(self, works, work_function):
    self._working_sema = Semaphore(1)
    self._child_count = 0

    total_work_count = len(works)
    cur_work_done = 0

    for work in works:
        # throttle: wait until a worker slot is free
        while True:
            if self._child_count < self._worker_count:
                break
            time.sleep(1)

        cur_work_done += 1
        if (cur_work_done % 1000) == 0:
            write_log('working : {}/{}'.format(cur_work_done, total_work_count))

        if self._time_to_die:
            break

        def run_on_subproc(work):
            # run each work item in a forked child process so its memory is
            # released when the child exits; this thread just waits for the child
            child_pid = os.fork()

            # child process
            if child_pid == 0:
                work_function(work)
                exit(0)

            os.waitpid(child_pid, 0)

            self._working_sema.acquire()
            self._child_count -= 1
            self._working_sema.release()

        self._working_sema.acquire()
        self._child_count += 1
        Thread(target=run_on_subproc, args=(work, )).start()
        self._working_sema.release()

    # wait for the remaining children to finish
    while self._child_count > 0:
        time.sleep(1)
def raw_to_per_day_glob(raw_path):
    global out_dir, dict_url2id

    write_log('Processing : {}'.format(raw_path))
    with open(raw_path, 'r') as f_raw:
        lines = f_raw.readlines()

    dict_per_user = {}
    list_per_time = []

    total_count = len(lines)
    count = 0
    dict_header_idx = None
    for line in lines:
        if count % 10000 == 0:
            write_log('Processing({}) : {}/{}'.format(raw_path, count, total_count))
        count += 1

        line = line.strip()

        # the first line is the CSV header: remember the column index of each field
        if dict_header_idx is None:
            dict_header_idx = {}
            for i, k in enumerate(line.split(',')):
                dict_header_idx[k] = i
            continue

        line_split = line.split(',')
        user_id = 'uid_{}'.format(line_split[dict_header_idx['user_id']])
        time = int(line_split[dict_header_idx['click_timestamp']]) // 1000
        url = 'url_{}'.format(line_split[dict_header_idx['click_article_id']])
        article_id = 'id_{}'.format(line_split[dict_header_idx['click_article_id']])

        if (user_id is None) or (url is None) or (time < 0) or (article_id is None):
            continue

        if dict_per_user.get(user_id, None) is None:
            dict_per_user[user_id] = []
        dict_per_user[user_id].append(tuple((time, url)))

        list_per_time.append(tuple((time, user_id, url)))
        dict_url2id[url] = article_id

    lines = None

    per_user_path = out_dir + '/per_user/' + os.path.splitext(os.path.basename(raw_path))[0]
    per_time_path = out_dir + '/per_time/' + os.path.splitext(os.path.basename(raw_path))[0]

    with open(per_user_path, 'w') as f_user:
        json.dump(dict_per_user, f_user)
    with open(per_time_path, 'w') as f_time:
        json.dump(list_per_time, f_time)

    dict_per_user = None
    list_per_time = None

    write_log('Done : {}'.format(raw_path))
def generate_w2v_map():
    """
    generate a url-to-embedding map by training doc2vec on the article contents
    :return: none
    """
    global article_info_path, output_path, embedding_dimension, model_path

    write_log('W2V Load article info : Start')
    with open(article_info_path, 'r') as f_art:
        article_info = json.load(f_art)
    write_log('W2V Load article info : End')

    write_log('W2V Generate labeled_sentences : Start')
    labeled_sentences = []
    for url, dict_info in article_info.items():
        sentence_header = dict_info.get('sentence_header', None)
        sentence_body = dict_info.get('sentence_body', None)

        if (sentence_header is None) or (sentence_body is None):
            continue

        words = []
        for sentence in sentence_header + sentence_body:
            for word in sentence.split(' '):
                words.append(word)

        labeled_sentence = gensim.models.doc2vec.LabeledSentence(words=words, tags=[url])
        labeled_sentences.append(labeled_sentence)
    write_log('W2V Generate labeled_sentences : End')

    w2v_model = gensim.models.Doc2Vec(alpha=.025, min_alpha=.001, min_count=1,
                                      vector_size=embedding_dimension, window=10,
                                      dm=0, dbow_words=1, workers=16, epochs=10)
    w2v_model.build_vocab(labeled_sentences)

    for epoch in range(20):
        start_time = time.time()
        write_log('W2V epoch {} : Start'.format(epoch))

        random.shuffle(labeled_sentences)
        w2v_model.train(labeled_sentences,
                        total_examples=w2v_model.corpus_count,
                        epochs=w2v_model.epochs)
        w2v_model.alpha -= 0.001
        w2v_model.min_alpha = w2v_model.alpha

        write_log('W2V epoch {} ends : took {}'.format(epoch, time.time() - start_time))

    w2v_model.save(model_path)

    dict_w2v = {}
    for url in article_info.keys():
        dict_w2v[url] = w2v_model[url].tolist()
    dict_w2v['url_pad'] = [float(0)] * embedding_dimension

    write_log('W2V json dump : start')
    with open(output_path, 'w') as out_f:
        json.dump(dict_w2v, out_f)
    write_log('W2V json dump : end')
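# A minimal sketch (not part of the original sources) of how the JSON written above is
# consumed downstream: it maps each article URL to its doc2vec embedding, plus an
# all-zero vector under the key 'url_pad' that can be used for sequence padding.
# The path argument is a hypothetical placeholder.
def load_url_embeddings(w2v_json_path):
    with open(w2v_json_path, 'r') as f_w2v:
        dict_w2v = json.load(f_w2v)
    pad_vector = dict_w2v['url_pad']   # zero vector of length embedding_dimension
    return dict_w2v, pad_vector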
def merge_per_time(dataset):
    """
    merge the per-time interaction data that is separated by day
    :dataset: target dataset
    :return: none
    """
    global data_mode, out_dir, per_day_path, valid_urls, dict_new_ts

    write_log('Merging per_time Start')
    time_files = get_files_under_path(per_day_path + '/per_time')

    list_merged = []
    write_log('Merging per_time : Load Start')
    for time_path in time_files:
        with open(time_path, 'r') as f_data:
            list_per_time = json.load(f_data)
        list_merged += list_per_time
        list_per_time = None
    write_log('Merging per_time : Load End')

    write_log('Merging per_time : Sort Start')
    # (timestamp, user_id, url)
    list_merged = list(filter(lambda x: x[2] in valid_urls, list_merged))
    list_merged.sort(key=lambda x: x[0])

    # time interval compression: gaps between distinct timestamps are capped at 3 hours
    new_timestamp = 1
    if dataset == 'glob_':
        dict_new_ts = {}
        prev_ts = -1
        for ts in [x[0] for x in list_merged]:
            if prev_ts < 0:
                dict_new_ts[str(ts)] = new_timestamp
                prev_ts = ts
                continue
            if prev_ts == ts:
                continue

            new_timestamp += min(ts - prev_ts, 60 * 60 * 3)
            dict_new_ts[str(ts)] = new_timestamp
            prev_ts = ts

        list_merged = [(dict_new_ts[str(x[0])], x[1], x[2]) for x in list_merged]
    write_log('Merging per_time : Sort End')

    with open(out_dir + '/per_time.json', 'w') as f_time:
        json.dump(list_merged, f_time)
    list_merged = None

    write_log('Merging per_time End')
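# A standalone sketch (not part of the original sources) of the time-interval
# compression performed above for the glob-style data: identical timestamps map to
# the same compressed value, and gaps between distinct timestamps are capped at
# 3 hours so long idle periods do not stretch the timeline.
def compress_timestamps(sorted_timestamps, max_gap=60 * 60 * 3):
    dict_new_ts = {}
    new_ts = 1
    prev_ts = -1
    for ts in sorted_timestamps:
        if prev_ts < 0:
            dict_new_ts[str(ts)] = new_ts
        elif ts != prev_ts:
            new_ts += min(ts - prev_ts, max_gap)
            dict_new_ts[str(ts)] = new_ts
        prev_ts = ts
    return dict_new_ts

# e.g. compress_timestamps([100, 100, 200, 20000]) == {'100': 1, '200': 101, '20000': 10901}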
def merge_per_user(dataset):
    """
    merge the per-user interaction data that is separated by day
    :dataset: target dataset
    :return: none
    """
    global data_mode, out_dir, per_day_path, valid_urls, dict_new_ts

    write_log('Merging per_user Start')
    user_files = get_files_under_path(per_day_path + '/per_user')

    dict_merged = {}
    total_count = len(user_files)
    count = 0
    for user_path in user_files:
        write_log('Merging per_user : {}/{}'.format(count, total_count))
        count += 1
        with open(user_path, 'r') as f_data:
            dict_per_user = json.load(f_data)
        write_log('Merging per_user Loaded: {}/{}'.format(count, total_count))

        for key in dict_per_user.keys():
            dict_merged[key] = dict_merged.get(key, []) + dict_per_user[key]
        write_log('Merging per_user Merged: {}/{}'.format(count, total_count))
        dict_per_user = None

    write_log('Merging per_user : sorting start')
    for user_id in dict_merged:
        # (timestamp, url)
        dict_merged[user_id] = list(
            filter(lambda x: x[1] in valid_urls, dict_merged[user_id]))

        # time interval compression (reuse the mapping built in merge_per_time)
        if dataset == 'glob_':
            dict_merged[user_id] = [(dict_new_ts[str(x[0])], x[1])
                                    for x in dict_merged[user_id]]

        dict_merged[user_id].sort(key=lambda x: x[0])
    write_log('Merging per_user : sorting end')

    write_log('Merging per_user : writing start')
    with open(out_dir + '/per_user.json', 'w') as f_user:
        json.dump(dict_merged, f_user)
    write_log('Merging per_user End')

    dict_merged = None
def generate_rnn_input(seperated_input_path=None, output_path=None):
    """
    merge the per-worker RNN inputs into a single RNN input file
    :seperated_input_path: path of the directory storing the RNN inputs separated by worker
    :output_path: path of the merged RNN input file to write
    :return: none
    """
    global dict_url_idx, dict_per_time

    if (seperated_input_path is None) or (output_path is None):
        return

    merged_sequences = []
    write_log('Merging seperated infos ...')
    for seperated_path in get_files_under_path(seperated_input_path):
        with open(seperated_path, 'r') as f_dict:
            seperated_dict = json.load(f_dict)

        # seperated_dict[user_id] = {
        #     'start_time': start_time,
        #     'end_time': end_time,
        #     'sequence': idx_sequence,
        # }
        for user_id, dict_data in seperated_dict.items():
            sequence_entry = (dict_data['start_time'], dict_data['end_time'],
                              dict_data['sequence'])
            merged_sequences.append(sequence_entry)
    write_log('Merging seperated infos ... Done !')

    write_log('Sort by time : start')
    merged_sequences.sort(key=lambda x: x[0])
    write_log('Sort by time : end')

    timestamp_tuple = list(map(lambda x: tuple((x[0], x[1])), merged_sequences))
    seq_len = list(map(lambda x: len(x[2]), merged_sequences))
    sequence = list(map(lambda x: x[2], merged_sequences))

    write_log('Generate idx2url : start')
    merged_sequences = None
    dict_idx2url = {idx: word for word, idx in dict_url_idx.items()}
    write_log('Generate idx2url : end')

    write_log('Generate candidate data structure : start')
    dict_time_idx = {}
    prev_timestamp = None
    for (timestamp, user_id, url) in dict_per_time:
        if prev_timestamp != timestamp:
            if prev_timestamp is not None:
                dict_time_idx[prev_timestamp]['next_time'] = timestamp
            dict_time_idx[timestamp] = {
                'prev_time': prev_timestamp,
                'next_time': None,
                'indices': {},
            }

        idx_of_url = dict_url_idx[url]
        dict_time_idx[timestamp]['indices'][idx_of_url] = \
            dict_time_idx[timestamp]['indices'].get(idx_of_url, 0) + 1
        prev_timestamp = timestamp
    write_log('Generate candidate data structure : end')

    write_log('Save rnn_inputs : start')
    dict_rnn_input = {
        'timestamp': timestamp_tuple,
        'seq_len': seq_len,
        'sequence': sequence,
        'idx2url': dict_idx2url,
        'time_idx': dict_time_idx,
    }
    with open(output_path, 'w') as f_input:
        json.dump(dict_rnn_input, f_input)
    write_log('Save rnn_inputs : end')
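# A hedged illustration (made-up values, not from the original sources) of the
# 'time_idx' structure built above: a linked list over distinct timestamps whose
# 'indices' field counts how often each article index was clicked at that time,
# which is what the candidate-generation step draws on.
example_time_idx = {
    1483570820: {'prev_time': None, 'next_time': 1483570825, 'indices': {3: 2, 7: 1}},
    1483570825: {'prev_time': 1483570820, 'next_time': None, 'indices': {12: 1}},
}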
def extract_article_content(content_dir):
    target_files = []
    for file_name in os.listdir(content_dir):
        file_path = os.path.join(content_dir, file_name)
        if not os.path.isfile(file_path):
            continue
        target_files.append(file_path)

    output = {}
    for file_idx, file_path in enumerate(target_files):
        lines = []
        with open(file_path, 'r') as f_con:
            lines = [line.strip() for line in f_con.readlines() if len(line.strip()) > 0]

        for line in lines:
            try:
                dict_cont = json.loads(line)
            except:
                print('Error: {}'.format(line))
                continue

            dict_data = {}
            for field in dict_cont.get('fields', []):
                field_name = field.get('field', None)
                field_value = field.get('value', None)

                if not field_name or not field_value:
                    continue
                if field_name not in ['url', 'cannonicalUrl', 'referrerUrl',
                                      'title', 'body', 'category0', 'category1']:
                    continue

                dict_data[field_name] = field_value

            # find the best URL
            best_url = find_best_url(dict_data)
            if not best_url:
                continue
            for key in ['url', 'cannonicalUrl', 'referrerUrl']:
                dict_data.pop(key, None)

            # preprocess title & body
            if ('title' not in dict_data) or ('body' not in dict_data):
                continue

            def preprocess_sentence(sentences):
                new_sentences = []
                regex_remove = re.compile('[\'|\"|,|\-|\\.| |\?|«|»|:|!|–|@|\\(|\\)|−]+')
                for sentence in sentences:
                    sentence = re.sub(regex_remove, ' ', sentence)
                    new_sentences.append(sentence.strip())
                return new_sentences

            dict_data['sentence_header'] = preprocess_sentence([dict_data['title']])
            dict_data['sentence_body'] = preprocess_sentence(dict_data['body'])
            for key in ['title', 'body']:
                dict_data.pop(key, None)

            output[best_url] = dict_data

    write_log('Save to Json : start')
    with open(out_dir, 'w') as f_json:
        json.dump(output, f_json)
    write_log('Save to Json : end')
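# A standalone hedged sketch (not part of the original sources) of the punctuation
# stripping that preprocess_sentence above applies to titles and bodies: runs of
# quotes, commas, dashes, dots, question marks, guillemets, etc. collapse to a
# single space and the result is trimmed.
import re

def strip_punctuation(sentence):
    regex_remove = re.compile('[\'|\"|,|\-|\\.| |\?|«|»|:|!|–|@|\\(|\\)|−]+')
    return re.sub(regex_remove, ' ', sentence).strip()

# strip_punctuation('Byåsen-fiasko mot «tabelljumboen»!') == 'Byåsen fiasko mot tabelljumboen'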