class MyUser:
    """In-memory wrapper around a single Twitter user's profile and tweets."""
    lut = dict()
    user_pkl_base = './twitter/pkls'
    user_json_base = './twitter/users'
    user_label_base = './twitter/labels'
    # NOTE: these listings are evaluated once, at class-definition time
    user_pkl_files = iu.list_children(user_pkl_base, pattern='.pkl', full_path=True)
    user_json_files = iu.list_children(user_json_base, pattern='.txt', full_path=True)
    label_files = iu.list_children(user_label_base, pattern='.txt', full_path=True)

    def __init__(self, profile, twarr):
        self.twarr: List[dict] = twarr
        self.profile: dict = profile
        self.uid: str = profile[tk.id_str]

    def sort_twarr_by_time(self):
        """Order the tweet list chronologically by each tweet's created_at."""
        def _timestamp(tweet):
            return tmu.timestamp_of_created_at(tweet[tk.created_at])
        self.twarr = sorted(self.twarr, key=_timestamp)

    def get_created_at_list(self):
        """Return the raw created_at field of every tweet, in current order."""
        return [tweet[tk.created_at] for tweet in self.twarr]

    def reindex_twarr(self, index_arr: List):
        """Reorder tweets so position j holds the tweet previously at index_arr[j].

        index_arr must be a permutation of range(len(self.twarr)).
        """
        size = len(self.twarr)
        assert len(index_arr) == size
        assert set(index_arr) == set(range(size))
        self.twarr = [self.twarr[pos] for pos in index_arr]
def get_log_path(self):
    """Locate a log directory: interactive pick if args.c, else the most recent.

    Searches the working directory for dirs named like "log<digit>...",
    falling back to ./logs when none are found at the top level.
    """
    candidates = iu.list_children('./', iu.DIR, r'^log\d', full_path=True)
    if not candidates:
        candidates = iu.list_children('./logs', iu.DIR, r'^log\d', full_path=True)
    picker = iu.choose_from if self.args.c else iu.most_recent
    return picker(candidates)
def plot_max_c_probs():
    """Run per_store_path on every gid=67 store under the newest log directory."""
    paths = iu.list_children('./', iu.DIR, '^log', True)
    # The original used `(iu.choose_from if False else iu.most_recent)(paths)`;
    # the interactive branch was dead code, so always take the most recent dir.
    log_path = iu.most_recent(paths)
    store_paths = iu.list_children(log_path, iu.DIR, pattern='gid=67', full_path=True)
    for store_path in store_paths:
        try:
            per_store_path(store_path)
            tf.reset_default_graph()  # TF1-style graph reset between stores
        except Exception as e:
            # best-effort: report the failure and continue with the next store
            print(e)
def filter_into_temp(self):
    """Parse raw Reuters SGML files into (topic, text) docs and dump them.

    Keeps only articles that have exactly one topic and a non-empty body;
    titles are optional and default to ''.
    """
    from bs4 import BeautifulSoup
    files = iu.list_children(self.orgn_file, full_path=True)
    array = list()
    for fidx, file in enumerate(files):
        print(fidx)
        tree = BeautifulSoup(''.join(iu.read_lines(file)), "html.parser")
        for article in tree.find_all("reuters"):
            topics = list(article.topics.children)
            if not len(topics) == 1:
                continue
            # BUGFIX: the original wrapped .encode(...) in str(), which in
            # Python 3 produces the literal "b'...'" repr. Round-trip
            # encode/decode instead to actually strip unwanted characters.
            topic = topics[0].text.encode('ascii', 'ignore').decode('ascii')
            text = article.find('text')
            if text is None or text.body is None:
                continue
            title = text.title.text.encode('utf-8', 'ignore').decode('utf-8') \
                if text.title is not None else ''
            title = ' '.join(pu.tokenize(title, pu.tokenize_pattern))
            body = text.body.text.encode('utf-8', 'ignore').decode('utf-8')
            body = ' '.join(pu.tokenize(body, pu.tokenize_pattern))
            array.append((topic, '{}, {}'.format(title, body)))
    docarr = du.make_docarr(
        [(idx, topic, body) for idx, (topic, body) in enumerate(array)])
    print(len(docarr))
    print(Counter([d.topic for d in docarr]))
    print(len(sorted(set([d.topic for d in docarr]))))
    du.dump_docarr(self.temp_file, docarr)
def filter_into_temp(self):
    """Flatten per-topic tweet files into (id, topic_id, text) docs and dump them.

    Each input file holds one topic's tweet array; '#' characters are
    stripped from the tweet text.
    """
    paths = iu.list_children(self.orgn_file, full_path=True)
    docs = [
        (str(tweet['id']), topic_idx, tweet['text'].replace('#', ''))
        for topic_idx, path in enumerate(paths)
        for tweet in iu.load_array(path)
    ]
    du.dump_docarr(self.temp_file, du.make_docarr(docs))
def get_log_path(str_list, make_new: bool):
    """Return a log directory path.

    When make_new is True, create (and wipe) a fresh dir named from the
    current date and the joined str_list; otherwise interactively choose
    among the existing log* dirs in the working directory.
    """
    if make_new:
        stamp = tmu.format_date()[2:]
        log_path = './log_{}_{}'.format(stamp, '+'.join(str_list))
        iu.mkdir(log_path, rm_prev=True)
    else:
        existing = iu.list_children('./', iu.DIR, 'log', full_path=True)
        log_path = iu.choose_from(existing)
    print('log path:', log_path)
    return log_path
def extract_tweets():
    """Fan raw user files out to worker processes that filter tweets to json/pkl.

    Returns the list of per-process results from mu.multi_process (the
    original bound this to an unused local; surfacing it is backward
    compatible for callers that ignored the None return).
    """
    in_path = '/home/cdong/works/uclu/data/userClustering_origin/data'
    out_path_json = '/home/cdong/works/uclu/data/twitter/users/'
    out_path_pkl = '/home/cdong/works/uclu/data/twitter/pkls/'
    files = iu.list_children(in_path, ctype=iu.FILE, pattern='^E', full_path=True)
    print('total files', len(files))
    files_parts = au.split_multi_process(files, 20)  # 20 worker partitions
    args_list = [(part, out_path_json, out_path_pkl) for part in files_parts]
    return mu.multi_process(filter_tw_from_files, args_list)
def rename_ground_truth():
    """Normalize ground-truth label file names and dump reformatted contents.

    Single digits in the name are zero-padded, the third '-'-separated
    field is rotated to the front, and the reformatted labels are written
    under the new name.
    """
    in_path = '/home/cdong/works/uclu/data/userClustering_origin/groud-truth-clusters'
    out_path = '/home/cdong/works/uclu/data/twitter/labels'
    for file in iu.list_children(in_path, ctype=iu.FILE, full_path=True, pattern=r'^\d'):
        fname = iu.get_name(file)
        # zero-pad every standalone single digit, e.g. "3-..." -> "03-..."
        padded = re.sub(r'\b(\d)\b', r'0\1', fname)
        parts = padded.split('-', maxsplit=3)
        parts.insert(0, parts.pop(2))  # move the third field to the front
        fname_new = '-'.join(parts)
        print(fname, '=>', fname_new, '\n')
        formated = reformat_ground_truth(file)
        iu.dump_array(iu.join(out_path, fname_new), formated)
def main(self):
    """Aggregate per-run score logs under the chosen log dir into summary tables.

    Reads every gid*.txt log file, averages the last few test-bearing
    score dicts per run, then prints/saves grouped pandas summaries.
    NOTE(review): reconstructed from whitespace-mangled source — indentation
    of the commented-out regions and the final `return` is a best guess.
    """
    log_path = self.get_log_path()
    print('log path:', log_path)
    # one log file per run, named like "gid=...<params>....txt"
    log_files = iu.list_children(log_path, pattern=r'^gid.+\.txt$', full_path=True)
    best_list = list()
    for file in log_files:
        # hyper-parameter entries are encoded in the file name
        entries = au.name2entries(name=iu.get_name(file), postfix='.txt', exclude=self.exclude)
        # score lines are JSON dicts; keep only those carrying validation NDCG
        scores = [
            iu.loads(l) for l in iu.read_lines(file)
            if (l.startswith('{') and 'v_NDCG' in l)
        ]
        scores_with_test = [s for s in scores if 't_NDCG' in s]
        if len(scores) == 0 or len(scores_with_test) == 0:
            print(au.entries2name(entries), 'lacks test info')
            continue
        best_scores = scores_with_test[-3:]  # average over the last 3 test evals
        name2score = pd.DataFrame()
        for idx, rvs2scores in enumerate(best_scores):
            rvs2scores.pop('brk_cnt')  # bookkeeping counter, not a metric
            for title, value in rvs2scores.items():
                name2score.loc[idx, title] = value
            # for rvs, score in rvs2scores.items():
            #     for name, value in score.items():
            #         title = '{}_{}'.format(rvs[0], name)
        name2score = name2score.mean(axis=0).round(4)
        name2score['ep'] = len(scores)  # number of logged score lines
        best_list.append((dict(entries), name2score.to_dict()))
    # build one row per run: hyper-parameters + averaged metrics
    table = pd.DataFrame()
    for i, (name2param, name2score) in enumerate(best_list):
        for k, v in list(name2param.items()) + list(name2score.items()):
            table.loc[i, k] = v
    table.fillna('-', inplace=True)
    temp = 'mmm'
    pre = 't'  # rank runs by the sum of the three test metrics
    table[temp] = table['%s_NDCG' % pre] + table['%s_MAP' % pre] + table['%s_MRR' % pre]
    table = table.sort_values(by=temp)
    table.drop([temp, K.lr, K.reg], axis=1, inplace=True)
    # table = table.query('dpt=="1"')
    if self.args.s:
        table.to_csv(iu.join(log_path, 'summary.csv'))
    # print(table.columns)
    # print(table)
    # group_col = [K.dn, K.mix, K.act, K.dpt]
    for value, df in table.groupby(K.vs):
        df.pop(K.ep)
        print(value)
        print(df)
        mean = df.groupby(K.dn).mean()
        print(mean)
        mean.to_csv('%s.csv' % value)
    return
    # NOTE(review): everything below is unreachable because of the return
    # above — presumably an older summarisation path kept for reference.
    group_col = [K.dn]
    grouped = table.groupby(group_col)
    kv_df_list = list()
    summ = pd.DataFrame()
    import numpy as np
    for idx, (values, table) in enumerate(grouped):
        # print(list(zip(group_col, values)))
        kv = dict(zip(group_col, values))
        kv['final'] = np.mean(table['v_NDCG'] + table['v_MAP'] + table['v_MRR']) / 3
        kv['final'] = kv['final'].round(3)
        kv_df_list.append([kv, table])
        columns = [
            '%s_%s' % (a, b) for a in ['v', 't'] for b in ['NDCG', 'MAP', 'MRR']
        ]
        s = table[columns].mean(0)
        print(dict(s))
        # print(s.index)
        # print(s[s.index])
        # print(list(s.data))
        # summ.loc[idx, 'data'] = values
        # summ.loc[idx, columns] = list(s.data)
        # NOTE(review): DataFrame.append returns a new frame and the result is
        # discarded here, so `summ` stays empty — looks like a latent bug.
        summ.append(dict(s), ignore_index=True)
        # print(table, '\n')
    print(summ)
# a = list(_sampler.trick_generate(64, 16)) # exit() # _d = Data20ng() # docarr = _d.load_docarr() # print(_d.name, np.mean([len(d.tokenids[:500]) for d in docarr])) # exit() # summary_datasets() # exit() # to_btm() # exit() from clu.data.remote_transfer import transfer_files, OUT, Nodes _files = iu.list_children('./', pattern='btm', full_path=True) print(_files) input('continue?') transfer_files(_files, _files, OUT, Nodes.alias_gpu) exit() # for _o in object_list: # tf, topics = _o.get_matrix_topics(using='tf') # iu.dump_pickle('{}_tf.pkl'.format(_d.name), [tf, topics]) # tfidf, topics = _o.get_matrix_topics_for_vade() # iu.dump_pickle('{}_tfidf.pkl'.format(_d.name), [tfidf, topics]) # exit() for _d in [Data20ng]: # for _d in object_list: _d = _d() print(_d.name)