def fill_new(self, files):
    """Prepend the corpus base path and this dataset's name to file name(s).

    Args:
        files: a single file name (str) or a list of file names.

    Returns:
        One joined path (str) for a single name, or a list of joined paths.

    Raises:
        ValueError: if `files` is neither a str nor a list.
    """
    base = '/home/cdong/works/clu/data/corpus'
    if isinstance(files, list):
        return [iu.join(base, self.name, f) for f in files]
    elif isinstance(files, str):
        return iu.join(base, self.name, files)
    # original raised ValueError('wtf') -- keep the exception type but give
    # the caller an actionable message
    raise ValueError('files must be a str or a list of str, got {}'.format(type(files)))
def filter_tw_from_files(files, out_path_json, out_path_pkl):
    """For each input file: filter its tweets, order them chronologically,
    prepend the user profile, and dump the result as both pickle and json."""
    for idx, file in enumerate(files):
        # lightweight progress marker every 10 files
        if idx > 0 and idx % 10 == 0:
            print(idx, end=' ', flush=True)
        profile, twarr = filter_tw_from_file(file)
        by_time = lambda tw: tmu.timestamp_of_created_at(tw[tk.created_at])
        ordered = [profile] + sorted(twarr, key=by_time)
        fname = iu.get_name(file)
        json_name = fname[fname.rfind('_') + 1:]
        pkl_name = json_name.replace('txt', 'pkl')
        iu.dump_pickle(iu.join(out_path_pkl, pkl_name), ordered)
        iu.dump_array(iu.join(out_path_json, json_name), ordered)
def per_store_path(store_path):
    """Restore a trained model from `store_path`, collect each sample's max
    cluster probability over the eval batches, and plot their histogram."""
    print(store_path)
    # hyper-parameters were dumped next to the checkpoint at train time
    args = iu.load_json(iu.join(store_path, 'hyper'))
    print('restore args from file', args)
    sampler = Sampler(args[dn_])
    w_embed, c_embed = sampler.d_obj.load_word_cluster_embed()
    eval_batches = sampler.eval_batches
    print('sample over')
    model_class = {v.__name__: v for v in [N5]}[args[vs_]]
    model = model_class(args)
    model.build_model(w_embed, c_embed)
    print('model build over')
    sess = get_session(1, 0.1, allow_growth=True, run_init=True)
    model.set_session(sess)
    tf.train.Saver(tf.trainable_variables()).restore(
        sess, iu.join(store_path, 'model.ckpt'))
    p_maxes = list()
    for batch in eval_batches:
        fd = model.get_fd_by_batch(batch)
        c_probs = sess.run(model.pc_probs, feed_dict=fd)
        # one max probability per sample in the batch
        p_maxes.extend(np.max(c_probs, axis=1).reshape(-1))
    sess.close()
    print('clusters get over')
    ax = plt.figure(figsize=(4, 3)).add_subplot(111)
    ax.tick_params(direction='in', right=True, top=True, labelsize=9)
    font = {'family': 'sans-serif', 'weight': 'normal', 'size': 12}
    ax.set_xlabel('max probability', font)
    ax.set_ylabel('density', font)
    span = 10000
    ax.hist(p_maxes, density=False, bins=np.arange(0, span) / span)
    show()
def __init__(self, args: dict):
    """Cache hyper-parameters from `args`, open a logger whose file name
    encodes the non-None arguments, and optionally prepare checkpoint paths."""
    self.args = args
    # hyper-parameters, pulled out of the args dict for convenient access
    for attr, key in [
        ('gid', C.gid), ('gpu_id', C.gi), ('gpu_frac', C.gp),
        ('epoch_num', C.ep), ('batch_size', C.bs), ('neg_size', C.ns_),
        ('data_name', C.dn), ('model_name', C.vs), ('w_init', C.wini_),
        ('c_init', C.cini_), ('scale', C.sc), ('c_num', C.cn_),
        ('log_path', C.lg),
    ]:
        setattr(self, attr, args[key])
    kept = [(k, v) for k, v in args.items() if v is not None]
    log_name = au.entries2name(kept, exclude={C.gi, C.gp, C.lg}, postfix='.txt')
    self.log_file = iu.join(self.log_path, log_name)
    self.logger = lu.get_logger(self.log_file)
    # recording checkpoints/hypers to a writer path is currently disabled
    self.is_record = False
    if self.is_record:
        self.writer_path = iu.join(self.log_path, 'gid={}'.format(self.gid))
        self.param_file = iu.join(self.writer_path, 'model.ckpt')
        self.hyper_file = iu.join(self.writer_path, 'hyper')
        iu.mkdir(self.writer_path)
        iu.dump_json(self.hyper_file, args)
    self.history = list()
    self.writer_step = 0
    self.ppp(args)
def rename_ground_truth():
    """Normalize ground-truth label file names (zero-pad single digits and
    rotate the dash-separated fields) and dump the reformatted content."""
    in_path = '/home/cdong/works/uclu/data/userClustering_origin/groud-truth-clusters'
    out_path = '/home/cdong/works/uclu/data/twitter/labels'
    files = iu.list_children(in_path, ctype=iu.FILE, full_path=True, pattern=r'^\d')
    for file in files:
        fname = iu.get_name(file)
        # zero-pad any lone digit, e.g. "3" -> "03"
        padded = re.sub(r'\b(\d)\b', r'0\1', fname)
        parts = padded.split('-', maxsplit=3)
        # move the third field to the front
        parts.insert(0, parts.pop(2))
        new_name = '-'.join(parts)
        print(fname, '=>', new_name, '\n')
        iu.dump_array(iu.join(out_path, new_name), reformat_ground_truth(file))
def main(self):
    """Aggregate per-run log files into score tables.

    For every ``gid*.txt`` log under the log path, the last few scored
    epochs that contain test metrics are averaged into one row; the rows
    are assembled into a table, sorted by the overall test NDCG+MAP+MRR,
    optionally saved to csv, and printed grouped by model version.
    """
    log_path = self.get_log_path()
    print('log path:', log_path)
    log_files = iu.list_children(log_path, pattern=r'^gid.+\.txt$', full_path=True)
    best_list = list()
    for file in log_files:
        entries = au.name2entries(name=iu.get_name(file), postfix='.txt',
                                  exclude=self.exclude)
        scores = [iu.loads(l) for l in iu.read_lines(file)
                  if (l.startswith('{') and 'v_NDCG' in l)]
        scores_with_test = [s for s in scores if 't_NDCG' in s]
        if len(scores) == 0 or len(scores_with_test) == 0:
            print(au.entries2name(entries), 'lacks test info')
            continue
        # average the last three epochs that carry test metrics
        name2score = pd.DataFrame()
        for idx, rvs2scores in enumerate(scores_with_test[-3:]):
            rvs2scores.pop('brk_cnt')
            for title, value in rvs2scores.items():
                name2score.loc[idx, title] = value
        name2score = name2score.mean(axis=0).round(4)
        name2score['ep'] = len(scores)
        best_list.append((dict(entries), name2score.to_dict()))
    table = pd.DataFrame()
    for i, (name2param, name2score) in enumerate(best_list):
        for k, v in list(name2param.items()) + list(name2score.items()):
            table.loc[i, k] = v
    table.fillna('-', inplace=True)
    # rank rows by the sum of the three test metrics, then drop the helper column
    temp, pre = 'mmm', 't'
    table[temp] = table['%s_NDCG' % pre] + table['%s_MAP' % pre] + table['%s_MRR' % pre]
    table = table.sort_values(by=temp)
    table.drop([temp, K.lr, K.reg], axis=1, inplace=True)
    if self.args.s:
        table.to_csv(iu.join(log_path, 'summary.csv'))
    for value, df in table.groupby(K.vs):
        df.pop(K.ep)
        print(value)
        print(df)
        mean = df.groupby(K.dn).mean()
        print(mean)
        mean.to_csv('%s.csv' % value)
    return
    # NOTE(review): everything below is unreachable because of the return
    # above; kept for reference.  Two fixes applied in case it is re-enabled:
    # the loop variable no longer shadows `table`, and the `summ` accumulation
    # is assigned back (the original called DataFrame.append and discarded the
    # returned frame, so `summ` stayed empty forever).
    group_col = [K.dn]
    grouped = table.groupby(group_col)
    kv_df_list = list()
    summ = pd.DataFrame()
    import numpy as np
    for idx, (values, sub) in enumerate(grouped):
        kv = dict(zip(group_col, values))
        kv['final'] = np.mean(sub['v_NDCG'] + sub['v_MAP'] + sub['v_MRR']) / 3
        kv['final'] = kv['final'].round(3)
        kv_df_list.append([kv, sub])
        columns = ['%s_%s' % (a, b)
                   for a in ['v', 't'] for b in ['NDCG', 'MAP', 'MRR']]
        s = sub[columns].mean(0)
        print(dict(s))
        summ = pd.concat([summ, pd.DataFrame([dict(s)])], ignore_index=True)
    print(summ)
def fill(self, *names):
    """Join the given path components under this dataset's cqa data directory."""
    return iu.join('/home/cdong/works/cqa/data/{}'.format(self.name), *names)
def fill(self, files):
    """Prepend the short-text corpus base path and this dataset's name to
    every file name in `files`.

    Args:
        files: iterable of file names.

    Returns:
        list of joined paths.
    """
    # bug fix: the original signature was `def fill(files)` with no `self`,
    # yet the body reads `self.name` -- every call raised NameError.
    base = '/home/cdong/works/research/input_and_outputs/short_text_corpus'
    return [iu.join(base, self.name, f) for f in files]