Example #1
0
def get_table_topic(df, lda, common_dict, model_name):
    """Return the topic weights of a whole table treated as one document.

    Every column of ``df`` is run through ``train_LDA.process_col`` with the
    keyword options decoded from ``model_name`` by ``name2dic``; the results
    are concatenated in column order, turned into a bag-of-words with
    ``common_dict.doc2bow`` and looked up in ``lda``.

    Returns:
        A list holding the second element of each entry yielded by
        ``lda[bag_of_words]``.

    NOTE(review): ``lda`` / ``common_dict`` are presumably a gensim LDA model
    and Dictionary. If the model omits some topics from its answer, the list
    is shorter than the topic count and positions no longer map to topic
    ids -- confirm against the caller.
    """
    options = name2dic(model_name)

    # Flatten the processed columns into a single token sequence.
    tokens = [
        token
        for column in df.columns
        for token in train_LDA.process_col(df[column], **options)
    ]

    bag_of_words = common_dict.doc2bow(tokens)
    return [entry[1] for entry in lda[bag_of_words]]
Example #2
0
    # Pull run-time settings off the parsed command-line arguments.
    batch_size = args.batch_size
    corpus_list = args.corpus_list

    # A column-count limit is only used when the model type is 'CRF';
    # every other model type gets None.
    MAX_COL_COUNT = args.MAX_COL_COUNT if args.model_type == 'CRF' else None

    # Fixed list of five seeds (how they are consumed is outside this excerpt).
    seed_list = [1001, 1002, 1003, 1004, 1005]
    ####################
    # Preparations
    ####################
    valid_types = get_valid_types(TYPENAME)

    # Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("PyTorch device={}".format(device))

    # The topic dimension is read from the 'tn' entry of the decoded topic name.
    if topic_name:
        topic_dim = int(name2dic(topic_name)['tn'])
    else:
        topic_dim = None

    # Append the 'topic' group to the sherlock feature groups when a topic is set.
    # NOTE(review): this test uses args.topic while the one above uses
    # topic_name -- presumably the same setting; confirm they cannot disagree.
    if args.topic is not None:
        feature_group_list = args.sherlock_feature_groups + ['topic']
    else:
        feature_group_list = args.sherlock_feature_groups

    # 1. Dataset
    t1 = time()  # timestamp taken before dataset creation
    print("Creating Dataset object...")
    # Fit the label encoder on the valid type names.
    label_enc = LabelEncoder()
    label_enc.fit(valid_types)

    # load data through table instance
Example #3
0
    def __init__(self,
                 corpus,
                 sherlock_features: List[str] = None,
                 topic_feature: str = None,
                 label_enc: LabelEncoder = None,
                 id_filter: List[str] = None,
                 max_col_count: int = None):
        """Load the per-table feature tensors of one corpus into ``self.data_dic``.

        Each requested feature group is read from a pickle under ``tmp_path``
        when that file exists; otherwise it is rebuilt from the feature
        frame, converted to ``torch.FloatTensor`` and pickled for next time.

        Args:
            corpus: corpus name, used to build the header/feature/pickle
                file names.
            sherlock_features: names of the Sherlock feature groups to load.
                ``None`` (the default) is treated as "no groups".
            topic_feature: name of the topic feature; its ``'tn'`` entry (as
                decoded by ``name2dic``) gives the topic vector length.
                ``None`` skips topic loading.
            label_enc: label encoder, stored on the instance.
            id_filter: if not None, row labels used to restrict
                ``self.df_header``. Filtering does not affect the pickled
                files used or the dictionaries loaded.
            max_col_count: if not None, pad the returning tensors to
                max_col_count columns (stored here, applied elsewhere).
        """
        # The signature defaults to None; normalise so that the loop below
        # (and any later iteration over the attribute) does not raise.
        if sherlock_features is None:
            sherlock_features = []

        self.sherlock_features = sherlock_features  # list of sherlock features
        self.topic_feature = topic_feature  # name of topic_feature
        self.label_enc = label_enc
        self.max_col_count = max_col_count

        self.df_header = load_tmp_df(header_path, tmp_path, '{}_{}_header_valid'.format(corpus, TYPENAME), table=True)

        # filter training/testing sets
        # filtering won't affect the pickled file used or the dictionary loaded
        if id_filter is not None:
            self.df_header = self.df_header.loc[id_filter]

        self.data_dic = {}

        start = time.time()

        # The full sherlock frame is loaded at most once, and only when some
        # feature group has no cached pickle yet.
        sherlock_loaded = False

        for f_g in sherlock_features:
            dic_pkl_file = join(tmp_path, '{}_{}_{}.pkl'.format(corpus, TYPENAME, f_g))

            if os.path.exists(dic_pkl_file):
                # NOTE: pickle is only safe on trusted files; these are
                # caches written by this same method.
                with open(dic_pkl_file, "rb") as fin:
                    self.data_dic[f_g] = pickle.load(fin)
                continue

            if not sherlock_loaded:
                self.df_sherlock = load_tmp_df(feature_path, tmp_path, '{}_{}_sherlock_features'.format(corpus, TYPENAME), table=False)
                sherlock_loaded = True

            print("Prepare seperate files for feature groups...")
            # Look the columns up once instead of once per table group.
            cols = feature_group_cols[f_g]
            grouped = self.df_sherlock.groupby('table_id').apply(
                lambda x: x[cols].fillna(EMBEDDING_FILL).values.astype('float'))
            feat_dict = {table_id: torch.FloatTensor(values)
                         for table_id, values in grouped.to_dict().items()}
            self.data_dic[f_g] = feat_dict

            with open(dic_pkl_file, "wb") as fout:
                pickle.dump(feat_dict, fout, protocol=2)

        if topic_feature is not None:
            self.topic_no = int(name2dic(self.topic_feature)['tn'])

            dic_pkl_file = join(tmp_path, '{}_{}_{}.pkl'.format(corpus, TYPENAME, topic_feature))
            if os.path.exists(dic_pkl_file):
                with open(dic_pkl_file, "rb") as fin:
                    self.data_dic['topic'] = pickle.load(fin)
            else:
                # Stored vectors may be shorter than topic_no; the missing
                # trailing entries are filled with the uniform value 1/topic_no.
                fill = 1.0 / self.topic_no

                def pad_vec(x):
                    return np.pad(x, (0, self.topic_no - len(x)),
                                  'constant',
                                  constant_values=(0.0, fill))

                self.df_topic = load_tmp_df(feature_path, tmp_path, '{}_{}_topic-{}_features'.format(corpus, TYPENAME, topic_feature), table=True)
                # SECURITY NOTE(review): eval() executes whatever text is in
                # the 'table_topic' column. Acceptable only if the feature
                # file is produced by this project; consider
                # ast.literal_eval if the stored format allows it.
                padded = self.df_topic.apply(lambda x: pad_vec(eval(x['table_topic'])), axis=1).to_dict()
                topic_dict = {idx: torch.FloatTensor(vec) for idx, vec in padded.items()}

                self.data_dic['topic'] = topic_dict
                with open(dic_pkl_file, "wb") as fout:
                    pickle.dump(topic_dict, fout, protocol=2)

        print("Total data preparation time:", time.time()-start)