def get_table_topic(df, lda, common_dict, model_name):
    """Compute the LDA topic vector for an entire table.

    Each column of ``df`` is tokenized with ``train_LDA.process_col`` (using
    the options encoded in ``model_name``), all tokens are pooled into one
    document, and the trained ``lda`` model scores that document against
    ``common_dict``.

    Returns a list of per-topic weights (the second element of each
    (topic_id, weight) pair the model emits).
    """
    opts = name2dic(model_name)
    # Pool the processed tokens of every column into a single token sequence.
    tokens = []
    for column in df.columns:
        tokens += train_LDA.process_col(df[column], **opts)
    bow = common_dict.doc2bow(tokens)
    return [pair[1] for pair in lda[bow]]
# ---- run configuration -------------------------------------------------
batch_size = args.batch_size
corpus_list = args.corpus_list
# column-count padding is only meaningful for the CRF model
MAX_COL_COUNT = args.MAX_COL_COUNT if args.model_type == 'CRF' else None
seed_list = [1001, 1002, 1003, 1004, 1005]

####################
# Preparations
####################
valid_types = get_valid_types(TYPENAME)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("PyTorch device={}".format(device))

# dimensionality of the topic feature, when a topic model is configured
topic_dim = int(name2dic(topic_name)['tn']) if topic_name else None

# append the 'topic' group to the sherlock feature groups when requested
if args.topic is not None:
    feature_group_list = args.sherlock_feature_groups + ['topic']
else:
    feature_group_list = args.sherlock_feature_groups

# 1. Dataset
# NOTE(review): this calls time() while the dataset code uses time.time();
# presumably `from time import time` is in scope here — confirm the imports.
t1 = time()
print("Creating Dataset object...")
label_enc = LabelEncoder()
label_enc.fit(valid_types)
# load data through table instance
def __init__(self, corpus, sherlock_features: List[str] = None, topic_feature: str = None,
             label_enc: LabelEncoder = None, id_filter: List[str] = None,
             max_col_count: int = None):
    """Load (or build and cache) the per-table feature dictionaries for ``corpus``.

    Args:
        corpus: name of the corpus whose header/feature tables are loaded.
        sherlock_features: names of the sherlock feature groups to load;
            None or [] skips sherlock features entirely.
        topic_feature: name of the topic-model feature; None disables topics.
        label_enc: fitted LabelEncoder over the valid semantic types.
        id_filter: if given, restrict the header table to these table ids.
        max_col_count: if not None, pad the returned tensors to this many columns.
    """
    self.sherlock_features = sherlock_features  # list of sherlock feature groups
    self.topic_feature = topic_feature          # name of topic_feature
    self.label_enc = label_enc
    self.max_col_count = max_col_count

    self.df_header = load_tmp_df(header_path, tmp_path,
                                 '{}_{}_header_valid'.format(corpus, TYPENAME), table=True)
    # filter training/testing sets
    # filtering won't affect the pickled file used or the dictionary loaded
    if id_filter is not None:
        self.df_header = self.df_header.loc[id_filter]

    self.data_dic = {}
    start = time.time()

    sherlock_loaded = False
    # BUGFIX: sherlock_features defaults to None, so the original
    # `len(sherlock_features) > 0` raised TypeError when the default was used;
    # a plain truthiness test covers both None and [].
    if sherlock_features:
        for f_g in sherlock_features:
            dic_pkl_file = join(tmp_path, '{}_{}_{}.pkl'.format(corpus, TYPENAME, f_g))
            if os.path.exists(dic_pkl_file):
                # cached feature-group dict: table_id -> FloatTensor
                with open(dic_pkl_file, "rb") as fin:
                    self.data_dic[f_g] = pickle.load(fin)
            else:
                # the raw sherlock feature table is loaded at most once,
                # however many groups have to be (re)built
                if not sherlock_loaded:
                    self.df_sherlock = load_tmp_df(feature_path, tmp_path,
                                                   '{}_{}_sherlock_features'.format(corpus, TYPENAME),
                                                   table=False)
                    sherlock_loaded = True
                print("Prepare seperate files for feature groups...")
                feat_dict = self.df_sherlock.groupby('table_id').apply(
                    lambda x: x[feature_group_cols[f_g]].fillna(EMBEDDING_FILL).values.astype('float')).to_dict()
                for i in feat_dict:
                    feat_dict[i] = torch.FloatTensor(feat_dict[i])
                self.data_dic[f_g] = feat_dict
                # cache for the next run
                with open(dic_pkl_file, "wb") as fout:
                    pickle.dump(feat_dict, fout, protocol=2)

    if topic_feature is not None:
        self.topic_no = int(name2dic(self.topic_feature)['tn'])
        dic_pkl_file = join(tmp_path, '{}_{}_{}.pkl'.format(corpus, TYPENAME, topic_feature))
        if os.path.exists(dic_pkl_file):
            with open(dic_pkl_file, "rb") as fin:
                self.data_dic['topic'] = pickle.load(fin)
        else:
            # pad short topic vectors on the right with the uniform weight 1/topic_no
            fill = 1.0 / self.topic_no
            pad_vec = lambda x: np.pad(x, (0, self.topic_no - len(x)),
                                       'constant', constant_values=(0.0, fill))
            self.df_topic = load_tmp_df(feature_path, tmp_path,
                                        '{}_{}_topic-{}_features'.format(corpus, TYPENAME, topic_feature),
                                        table=True)
            # SECURITY: eval() on the serialized 'table_topic' strings executes
            # arbitrary code if the tmp files are untrusted — consider
            # ast.literal_eval for plain list literals.
            topic_dict = self.df_topic.apply(lambda x: pad_vec(eval(x['table_topic'])), axis=1).to_dict()
            for i in topic_dict:
                topic_dict[i] = torch.FloatTensor(topic_dict[i])
            self.data_dic['topic'] = topic_dict
            # cache for the next run
            with open(dic_pkl_file, "wb") as fout:
                pickle.dump(topic_dict, fout, protocol=2)
    print("Total data preparation time:", time.time() - start)