Ejemplo n.º 1
0
 def fill_new(self, files):
     """Map corpus-relative file name(s) to absolute path(s).

     :param files: a single file name (str) or a list of file names.
     :return: one joined path for a str input, a list of joined paths
         for a list input (mirrors the input type).
     :raises ValueError: if *files* is neither a list nor a str.
     """
     base = '/home/cdong/works/clu/data/corpus'
     if isinstance(files, list):
         return [iu.join(base, self.name, f) for f in files]
     elif isinstance(files, str):
         return iu.join(base, self.name, files)
     else:
         # was: ValueError('wtf') -- name the offending type instead
         raise ValueError('files must be a list or str, got {}'.format(type(files)))
Ejemplo n.º 2
0
def filter_tw_from_files(files, out_path_json, out_path_pkl):
    """For each input file, filter its tweets, sort them by creation time,
    prepend the user profile, and dump the result as both pickle and json.

    Progress (the file index) is printed every 10 files.
    """
    def created_at_ts(tw):
        return tmu.timestamp_of_created_at(tw[tk.created_at])

    for idx, path in enumerate(files):
        if idx > 0 and idx % 10 == 0:
            print(idx, end=' ', flush=True)
        profile, twarr = filter_tw_from_file(path)
        twarr = sorted(twarr, key=created_at_ts)
        twarr.insert(0, profile)  # profile rides first, before the tweets

        # output names: text after the last '_' of the input name;
        # the pickle twin just swaps the 'txt' extension for 'pkl'
        name = iu.get_name(path)
        json_name = name[name.rfind('_') + 1:]
        pkl_name = json_name.replace('txt', 'pkl')
        iu.dump_pickle(iu.join(out_path_pkl, pkl_name), twarr)
        iu.dump_array(iu.join(out_path_json, json_name), twarr)
Ejemplo n.º 3
0
def per_store_path(store_path):
    """Restore a trained model from *store_path*, run it over the eval
    batches, and plot a histogram of each sample's max cluster probability.

    Expects *store_path* to contain a 'hyper' json (the training args)
    and a 'model.ckpt' TF checkpoint.
    """
    print(store_path)
    hyper_file = iu.join(store_path, 'hyper')
    param_file = iu.join(store_path, 'model.ckpt')
    # hyper-parameters dumped at training time; vs_/dn_ select model/data
    args = iu.load_json(hyper_file)
    print('restore args from file', args)
    model_name = args[vs_]
    data_name = args[dn_]

    sampler = Sampler(data_name)
    w_embed, c_embed = sampler.d_obj.load_word_cluster_embed()
    eval_batches = sampler.eval_batches
    print('sample over')

    # resolve the model class by its __name__; N5 is the only candidate here
    model_class = {v.__name__: v for v in [N5]}[model_name]
    model = model_class(args)
    model.build_model(w_embed, c_embed)
    print('model build over')
    # session must exist before restore; variables are loaded from ckpt
    sess = get_session(1, 0.1, allow_growth=True, run_init=True)
    model.set_session(sess)
    tf.train.Saver(tf.trainable_variables()).restore(sess, param_file)

    # per-sample maximum over cluster probabilities, across all eval batches
    p_maxes = list()
    for batch in eval_batches:
        c_probs = sess.run(model.pc_probs,
                           feed_dict=model.get_fd_by_batch(batch))
        p_maxes.extend(np.max(c_probs, axis=1).reshape(-1))
    sess.close()
    print('clusters get over')

    ax = plt.figure(figsize=(4, 3)).add_subplot(111)
    ax.tick_params(direction='in', right=True, top=True, labelsize=9)
    font = {'family': 'sans-serif', 'weight': 'normal', 'size': 12}
    # x-axis
    ax.set_xlabel('max probability', font)
    # y-axis
    ax.set_ylabel('density', font)
    # ax.set_yticks(np.arange(0, 1, 0.1))
    # ax.set_ylim(0, 1)
    # ax.legend(name, loc='lower right', fontsize=9, frameon=False,
    #           borderaxespad=0.3, labelspacing=0.3)
    # 10000 equal-width bins over [0, 1)
    span = 10000
    ax.hist(p_maxes, density=False, bins=np.arange(0, span) / span)
    show()
Ejemplo n.º 4
0
    def __init__(self, args: dict):
        """Cache hyper-parameters from *args* and set up file logging."""
        self.args = args
        # hyper-parameters pulled from the argument dict (KeyError if absent)
        for attr, key in [
            ('gid', C.gid), ('gpu_id', C.gi), ('gpu_frac', C.gp),
            ('epoch_num', C.ep), ('batch_size', C.bs), ('neg_size', C.ns_),
            ('data_name', C.dn), ('model_name', C.vs),
            ('w_init', C.wini_), ('c_init', C.cini_),
            ('scale', C.sc), ('c_num', C.cn_),
            ('log_path', C.lg),
        ]:
            setattr(self, attr, args[key])

        # the log file name encodes every non-None hyper-parameter
        kv_pairs = [(k, v) for k, v in args.items() if v is not None]
        log_name = au.entries2name(kv_pairs,
                                   exclude={C.gi, C.gp, C.lg},
                                   postfix='.txt')
        self.log_file = iu.join(self.log_path, log_name)
        self.logger = lu.get_logger(self.log_file)

        # self.is_record = Nodes.is_1702()
        self.is_record = False
        if self.is_record:
            # checkpoint + hyper-parameter dump directory, keyed by gid
            self.writer_path = iu.join(self.log_path,
                                       'gid={}'.format(self.gid))
            self.param_file = iu.join(self.writer_path, 'model.ckpt')
            self.hyper_file = iu.join(self.writer_path, 'hyper')
            iu.mkdir(self.writer_path)
            iu.dump_json(self.hyper_file, args)

        self.history = list()
        self.writer_step = 0
        self.ppp(args)
Ejemplo n.º 5
0
def rename_ground_truth():
    """Normalize ground-truth cluster file names and dump the reformatted
    label files into the twitter labels directory.

    Normalization: zero-pad lone digits (so names sort lexically) and
    move the third '-'-separated field to the front.
    """
    in_path = '/home/cdong/works/uclu/data/userClustering_origin/groud-truth-clusters'
    out_path = '/home/cdong/works/uclu/data/twitter/labels'
    files = iu.list_children(in_path, ctype=iu.FILE, full_path=True,
                             pattern=r'^\d')
    for file in files:
        old_name = iu.get_name(file)
        padded = re.sub(r'\b(\d)\b', '0\\1', old_name)
        parts = padded.split('-', maxsplit=3)
        parts.insert(0, parts.pop(2))  # rotate the 3rd field to the front
        new_name = '-'.join(parts)
        print(old_name, '=>', new_name, '\n')

        formated = reformat_ground_truth(file)
        iu.dump_array(iu.join(out_path, new_name), formated)
Ejemplo n.º 6
0
    def main(self):
        """Aggregate per-run log files into a score summary table.

        For each ``gid*.txt`` log under the log path: parse its json
        score lines, average the last three epochs that carry test
        scores, and collect one table row per run. The table is sorted
        by the sum of the three test metrics, optionally dumped to
        ``summary.csv`` (when ``self.args.s``), and per-model,
        per-dataset means are printed and dumped to ``<model>.csv``.
        """
        log_path = self.get_log_path()
        print('log path:', log_path)
        log_files = iu.list_children(log_path,
                                     pattern=r'^gid.+\.txt$',
                                     full_path=True)
        best_list = list()
        for file in log_files:
            entries = au.name2entries(name=iu.get_name(file),
                                      postfix='.txt',
                                      exclude=self.exclude)
            # score lines are json objects; validation lines carry v_NDCG
            scores = [
                iu.loads(line) for line in iu.read_lines(file)
                if (line.startswith('{') and 'v_NDCG' in line)
            ]
            scores_with_test = [s for s in scores if 't_NDCG' in s]
            if len(scores) == 0 or len(scores_with_test) == 0:
                print(au.entries2name(entries), 'lacks test info')
                continue
            # average the last three test-bearing epochs
            best_scores = scores_with_test[-3:]
            name2score = pd.DataFrame()
            for idx, rvs2scores in enumerate(best_scores):
                rvs2scores.pop('brk_cnt')
                for title, value in rvs2scores.items():
                    name2score.loc[idx, title] = value
            name2score = name2score.mean(axis=0).round(4)
            name2score['ep'] = len(scores)  # number of logged epochs
            best_list.append((dict(entries), name2score.to_dict()))

        # one row per run: hyper-parameters followed by averaged scores
        table = pd.DataFrame()
        for i, (name2param, name2score) in enumerate(best_list):
            for k, v in list(name2param.items()) + list(name2score.items()):
                table.loc[i, k] = v
        table.fillna('-', inplace=True)
        # sort by the sum of the three test metrics, then drop helper cols
        temp = 'mmm'
        pre = 't'
        table[temp] = (table['%s_NDCG' % pre] + table['%s_MAP' % pre] +
                       table['%s_MRR' % pre])
        table = table.sort_values(by=temp)
        table.drop([temp, K.lr, K.reg], axis=1, inplace=True)
        # table = table.query('dpt=="1"')
        if self.args.s:
            table.to_csv(iu.join(log_path, 'summary.csv'))

        # per-model breakdown: print runs and per-dataset mean scores
        for value, df in table.groupby(K.vs):
            df.pop(K.ep)
            print(value)
            print(df)
            mean = df.groupby(K.dn).mean()
            print(mean)
            mean.to_csv('%s.csv' % value)
        # NOTE(review): the original ended with an unconditional `return`
        # followed by ~25 lines of unreachable grouping/summary code
        # (including a dead function-level numpy import and a discarded
        # `summ.append` result); that dead code has been removed.
Ejemplo n.º 7
0
 def fill(self, *names):
     """Join *names* under this dataset's cqa data directory."""
     root = '/home/cdong/works/cqa/data/{}'.format(self.name)
     return iu.join(root, *names)
Ejemplo n.º 8
0
 def fill(self, files):
     """Return full corpus paths for *files* under this dataset's directory.

     :param files: iterable of corpus-relative file names.
     :return: list of absolute paths.

     Bug fix: the original signature omitted ``self`` although the body
     reads ``self.name``, so every call raised a NameError.
     """
     base = '/home/cdong/works/research/input_and_outputs/short_text_corpus'
     return [iu.join(base, self.name, f) for f in files]