import json
import os
import re

# Assumed to be defined elsewhere in this module: write_log(), find_best_url(),
# and the globals out_dir (output directory) and dict_url2id (url -> article id).


def raw_to_per_day(raw_path):
    """
    Extract user-specific interaction data; each raw file is processed in parallel.
    :param raw_path: path of the raw data file
    :return: None
    """
    global out_dir, dict_url2id

    write_log('Processing : {}'.format(raw_path))
    with open(raw_path, 'r') as f_raw:
        lines = f_raw.readlines()

    dict_per_user = {}   # user_id -> [(time, url), ...]
    list_per_time = []   # [(time, user_id, url), ...] in file order

    total_count = len(lines)
    count = 0
    for line in lines:
        if count % 10000 == 0:
            write_log('Processing({}) : {}/{}'.format(raw_path, count, total_count))
        count += 1

        line_json = json.loads(line.strip())

        user_id = line_json.get('userId', None)
        url = find_best_url(event_dict=line_json)
        time = line_json.get('time', -1)
        article_id = line_json.get('id', None)

        # Skip events with any missing field
        if (user_id is None) or (url is None) or (time < 0) or (article_id is None):
            continue

        if dict_per_user.get(user_id, None) is None:
            dict_per_user[user_id] = []
        dict_per_user[user_id].append((time, url))
        list_per_time.append((time, user_id, url))

        dict_url2id[url] = article_id

    lines = None  # release the raw lines before dumping

    per_user_path = out_dir + '/per_user/' + os.path.basename(raw_path)
    per_time_path = out_dir + '/per_time/' + os.path.basename(raw_path)

    with open(per_user_path, 'w') as f_user:
        json.dump(dict_per_user, f_user)
    with open(per_time_path, 'w') as f_time:
        json.dump(list_per_time, f_time)

    dict_per_user = None
    list_per_time = None

    write_log('Done : {}'.format(raw_path))
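# A minimal driver sketch (an assumption, not part of the original source): the
# docstring above says raw files are handled in parallel, so a multiprocessing
# pool over the files in a hypothetical input directory `raw_dir` would fit.
# Note that under multiprocessing each worker gets its own copy of the
# dict_url2id global; updates made in workers do not reach the parent process.
def run_raw_to_per_day_parallel(raw_dir, n_workers=8):
    from multiprocessing import Pool
    raw_files = [os.path.join(raw_dir, name) for name in os.listdir(raw_dir)
                 if os.path.isfile(os.path.join(raw_dir, name))]
    with Pool(processes=n_workers) as pool:
        pool.map(raw_to_per_day, raw_files)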
def extract_article_content(content_dir):
    """
    Parse every content dump under content_dir and collect the preprocessed
    title/body sentences of each article, keyed by its best URL.
    """
    target_files = []
    for file_name in os.listdir(content_dir):
        file_path = os.path.join(content_dir, file_name)
        if not os.path.isfile(file_path):
            continue
        target_files.append(file_path)

    # Punctuation/whitespace to strip from sentences. The original pattern
    # separated the characters with '|', which inside a character class are
    # literal pipes rather than alternation; the redundant separators are
    # dropped here.
    regex_remove = re.compile(r'[\'",\-. ?«»:!–@()−]+')

    def preprocess_sentence(sentences):
        return [re.sub(regex_remove, ' ', sentence).strip() for sentence in sentences]

    output = {}
    for file_idx, file_path in enumerate(target_files):
        with open(file_path, 'r') as f_con:
            lines = [line.strip() for line in f_con.readlines() if len(line.strip()) > 0]

        for line in lines:
            try:
                dict_cont = json.loads(line)
            except json.JSONDecodeError:
                print('Error: {}'.format(line))
                continue

            # Keep only the fields we need from the 'fields' list
            dict_data = {}
            for field in dict_cont.get('fields', []):
                field_name = field.get('field', None)
                field_value = field.get('value', None)
                if not field_name or not field_value:
                    continue
                if field_name not in ['url', 'cannonicalUrl', 'referrerUrl',
                                      'title', 'body', 'category0', 'category1']:
                    continue
                dict_data[field_name] = field_value

            # Find the best URL, then drop the raw URL fields
            best_url = find_best_url(dict_data)
            if not best_url:
                continue
            for key in ['url', 'cannonicalUrl', 'referrerUrl']:
                dict_data.pop(key, None)

            # Preprocess title & body into sentence lists
            if ('title' not in dict_data) or ('body' not in dict_data):
                continue
            dict_data['sentence_header'] = preprocess_sentence([dict_data['title']])
            dict_data['sentence_body'] = preprocess_sentence(dict_data['body'])
            for key in ['title', 'body']:
                dict_data.pop(key, None)

            output[best_url] = dict_data

    # NOTE: unlike in raw_to_per_day(), out_dir is used here as the output
    # *file* path, not a directory.
    write_log('Save to Json : start')
    with open(out_dir, 'w') as f_json:
        json.dump(output, f_json)
    write_log('Save to Json : end')
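# A quick self-check sketch (an assumption, not part of the original module):
# demonstrates what the punctuation-stripping regex in extract_article_content()
# does to a sample headline. The sample text is hypothetical.
if __name__ == '__main__':
    _regex_remove = re.compile(r'[\'",\-. ?«»:!–@()−]+')
    sample = '«Hello», world - a test?'
    print(re.sub(_regex_remove, ' ', sample).strip())  # -> 'Hello world a test'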