def save_dict(self, path, gz=False):
    """Save the vocabulary dictionary into a file.

    Parameters:
        path (str): Path to the file.
        gz (bool): Whether to gzip-compress the output.
    """
    dic = {'w2i': self.w2i, 'i2w': self.i2w}
    savepkl(dic, path, gz=gz)
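# For context, a minimal sketch of the `savepkl` helper as called in this
# file (object first, path second, optional gzip). This is an assumption
# from the call sites, not the actual implementation; note the table-prep
# scripts later in this section call a variant with the opposite argument
# order, `savepkl(path, obj)`.
import gzip
import pickle

def savepkl(obj, path, gz=False):
    open_fn = gzip.open if gz else open
    with open_fn(str(path), 'wb') as f:
        pickle.dump(obj, f)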
def _save_cache(self, path: Path):
    r"""Save the dataset to cache in the data directory.

    :param path: The path to the data directory.
    """
    cache_dir = path / '_cached'
    params_index_path = cache_dir / 'params_index.pkl'
    params = self._get_params()
    if not cache_dir.exists():
        cache_dir.mkdir()
        index = 0
        params_index = [params]
    else:
        params_index = loadpkl(params_index_path)
        # Reuse the first slot whose batch file is missing; otherwise
        # append a new slot at the end.
        index = next((idx for idx in range(len(params_index))
                      if not (cache_dir / f'{idx}.pkl').exists()),
                     len(params_index))
        params_index[index:(index + 1)] = [params]  # replace or append
    savepkl(params_index, params_index_path)
    load_path = cache_dir / f'{index}.pkl'
    savepkl(self.batches, load_path)
    LOGGER.info(f"Dataset cached to {load_path}, with settings: {params}")
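# A hedged sketch of the companion load path (hypothetical; the real
# loader may differ). Under the layout above, `params_index.pkl` maps
# slot index -> settings and slot i's batches live in `{i}.pkl`, so
# loading means finding the slot whose cached settings match the
# current ones.
def _load_cache_sketch(self, path: Path):
    cache_dir = path / '_cached'
    params_index_path = cache_dir / 'params_index.pkl'
    if not params_index_path.exists():
        return None
    params_index = loadpkl(params_index_path)
    params = self._get_params()
    for idx, cached_params in enumerate(params_index):
        batch_path = cache_dir / f'{idx}.pkl'
        if cached_params == params and batch_path.exists():
            return loadpkl(batch_path)
    return None  # cache miss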
def generate_vocab(X):
    '''
    Generating word distribution dataframe
    '''
    result = flatten_1_deg(flatten_1_deg(flatten_1_deg(X)))
    query_l = [tokenize_str(q) for q in list(baseline_f['query'].unique())]
    query_l = flatten_1_deg(query_l)
    result += query_l
    # print(result[:10])
    count = Counter(result)
    c = [[w, count[w]] for w in count.keys()]
    df = pd.DataFrame(c)
    df.sort_values(by=[1], ascending=False, inplace=True)
    df.to_csv('./data/word_distr_2D_complete.csv', index=False, columns=None)

    '''
    Getting the vocab from the data
    '''
    vocab = list(set(count.keys()))
    vocab.insert(0, '<PAD>')
    vocab.insert(0, '<UNK>')  # after both inserts: index 0 = <UNK>, index 1 = <PAD>
    print(f'vocab: {len(vocab)}\n')
    savepkl(f'./data/vocab_{MAX_COL_LEN}-{MAX_ROW_LEN}.pkl', vocab)
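# Illustrative only (the `encode` helper below is hypothetical): the saved
# vocab list is meant to serve as a token -> index map, with
# out-of-vocabulary tokens falling back to <UNK>.
vocab = ['<UNK>', '<PAD>', 'cat', 'dog']   # stand-in for the saved vocab
w2i = {w: i for i, w in enumerate(vocab)}

def encode(tokens):
    return [w2i.get(t, w2i['<UNK>']) for t in tokens]

print(encode(['dog', 'never-seen-word']))  # -> [3, 0]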
    # (tail of pad_table) Pad the table with all-<PAD> rows up to MAX_ROW_LEN.
    # The repeated inner list is shared between cells, which is safe here
    # because the padding is never mutated.
    for i in range(0, table_prep_params['MAX_ROW_LEN'] - rows):
        table.append(
            [['<PAD>'] * table_prep_params['LENGTH_PER_CELL']]
            * table_prep_params['MAX_COL_LEN'])
    return table


def table_words2index(tables):
    # Map every token in every table to its vocab index. Note that `w2i[y]`
    # raises a KeyError for out-of-vocabulary tokens, so tables are expected
    # to contain only in-vocab words at this point.
    w2i = {w: i for i, w in enumerate(vocab)}
    for i, t in enumerate(tables):
        tables[i] = np.vectorize(lambda y: w2i[y])(np.array(t)).tolist()
    return tables


    p = Pool(processes=40)
    X = p.map(pad_table, X)
    p.close()
    p.join()
    X = table_words2index(X)
    X = np.array(X)
    print(X.shape)
    savepkl('./data/xp_2D_10-50_pad.pkl', X)
else:
    device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
    dataset = T2VDataset(X, y, vocab, device, config)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
    X_, y_ = next(iter(dataloader))
    print(X_.shape, y_.shape)
    print(time.time() - start)
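# A small shape sanity check, assuming the hypothetical settings implied by
# the 'xp_2D_10-50_pad.pkl' filename: 10 columns, 50 rows, and (say) 20
# tokens per cell. After padding, the tables stack into a 4-D array.
import numpy as np

pad_row = [['<PAD>'] * 20] * 10    # one all-padding row: 10 cells x 20 tokens
padded_table = [pad_row] * 50      # one fully padded table: 50 rows
assert np.array([padded_table]).shape == (1, 50, 10, 20)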
def _extra_init(self, loaded_batches: bool):
    self.rel_vocab = Vocab.from_dict(self._path / 'rel_names.pkl', mode='i2w')
    self.vocab: Dict[str, Vocab] = {
        "word": self.word_vocab,
        "rel": self.rel_vocab,
    }
    # Largest index among "unknown relation" types, which are encoded as
    # relation IDs below -3.
    self.max_unkrel = max(
        (-rel_typ - 3 for rel_typ in self.rel_vocab.i2w if rel_typ < -3),
        default=0)

    if self._use_fasttext:
        def _alias_path(name):
            path = Path(self._fasttext_model_path)
            return path.parent / (path.name + f'.{name}')

        # gather all entity aliases and compute fastText embeddings
        alias_dict_path = _alias_path('alias_dict.pkl')
        if alias_dict_path.exists():
            alias_dict: Dict[str, int] = loadpkl(alias_dict_path)
            loaded = True
        else:
            # Self-indexing dict: each new alias is assigned the next index.
            alias_dict = defaultdict(lambda: len(alias_dict))
            loaded = False
        if not loaded_batches:
            # Replace alias strings in every relation with their indices.
            for dataset in self.data.values():
                for example in dataset:
                    for idx, rel in enumerate(example.relations):  # type: ignore
                        example.relations[idx] = rel._replace(  # type: ignore
                            obj_alias=[alias_dict[s] for s in rel.obj_alias])
        if not alias_dict_path.exists():
            alias_dict = dict(alias_dict)
            savepkl(alias_dict, alias_dict_path)

        alias_vectors_path = _alias_path('alias_vectors.pt')
        if not alias_vectors_path.exists() or not loaded:
            import fastText
            ft_model = fastText.load_model(self._fasttext_model_path)
            alias_vectors = []
            alias_list = utils.reverse_map(alias_dict)
            for alias in utils.progress(alias_list,
                                        desc="Building fastText vectors",
                                        ascii=True, ncols=80):
                # An alias embedding is the sum of its word vectors.
                vectors = [ft_model.get_word_vector(w) for w in alias.split()]
                vectors = np.sum(vectors, axis=0).tolist()
                alias_vectors.append(vectors)
            alias_vectors = torch.tensor(alias_vectors)
            torch.save(alias_vectors, alias_vectors_path)

    if not loaded_batches and (self._exclude_entity_disamb
                               or self._exclude_alias_disamb):
        # no need to do this if batches are loaded
        if self._exclude_entity_disamb:
            # gather training set stats
            self.entity_count_per_type = self.gather_entity_stats(
                self.data['train'])
        for dataset in self.data.values():
            for idx in range(len(dataset)):
                dataset[idx] = self.remove_ambiguity(
                    dataset[idx],
                    self._exclude_entity_disamb,
                    self._exclude_alias_disamb)
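# `utils.reverse_map` is assumed to invert the {alias: index} mapping into a
# list where position i holds the alias assigned index i; this works because
# the self-indexing defaultdict above assigns contiguous indices 0..n-1.
# A plausible sketch (the real helper may differ):
def reverse_map(d):
    result = [None] * len(d)
    for key, idx in d.items():
        result[idx] = key
    return result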
# Testing tables
#   table-0614-640.json
#   table-1225-209.json
# print_table(X[6705])

if __name__ == "__main__":
    baseline_f = pd.read_csv('../global_data/features.csv')
    tables_subset_3k = list(baseline_f['table_id'])
    tables_subset = list(
        set(tables_subset_3k + random.sample(all_tables, 20000)))

    '''
    Generating Positive X, y dataset from the tables
    '''
    X_p, y_p = data_prep_pipeline(tables_subset, '+')
    savepkl(f'./data/xp_2D_{MAX_COL_LEN}-{MAX_ROW_LEN}.pkl', X_p)
    savepkl(f'./data/yp_2D_{MAX_COL_LEN}-{MAX_ROW_LEN}.pkl', y_p)

    '''
    Generating Negative X, y dataset from the tables (1.3 x X_p)
    '''
    size = int(len(X_p) * 1.3)
    with Pool(40) as p:
        # Consume the imap iterator into a list so the pool actually runs
        # all the work before the context exits.
        X_n = list(tqdm(p.imap(generate_neg_table, range(size)), total=size))
    X_n, y_n = data_prep_pipeline(X_n, '-')
    savepkl(f'./data/xn_2D_{MAX_COL_LEN}-{MAX_ROW_LEN}.pkl', X_n)
    savepkl(f'./data/yn_2D_{MAX_COL_LEN}-{MAX_ROW_LEN}.pkl', y_n)
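# A hedged sketch (not in the original) of recombining the saved positive
# and negative splits into one labeled set; assumes a `loadpkl(path)`
# counterpart to the `savepkl(path, obj)` calls above:
X_all = np.concatenate([
    np.array(loadpkl(f'./data/xp_2D_{MAX_COL_LEN}-{MAX_ROW_LEN}.pkl')),
    np.array(loadpkl(f'./data/xn_2D_{MAX_COL_LEN}-{MAX_ROW_LEN}.pkl')),
])
y_all = np.concatenate([
    np.array(loadpkl(f'./data/yp_2D_{MAX_COL_LEN}-{MAX_ROW_LEN}.pkl')),
    np.array(loadpkl(f'./data/yn_2D_{MAX_COL_LEN}-{MAX_ROW_LEN}.pkl')),
])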
    X = p.map(tokenize_table, X)
    p.close()
    p.join()
    # print_table(X[0])

    # Remove totally empty tables; vocab generation and clipping cells to
    # max_cell_len are currently disabled (see the commented calls below).
    # generate_vocab(X)
    X = remove_empty_tables(X)
    # X = cell_overflow_cap(X)
    X = np.array(X)
    print(X.shape)
    # print_table(X[0])
    return X


if __name__ == "__main__":
    baseline_f = pd.read_csv('../global_data/features.csv')
    tables_subset_3k = list(baseline_f['table_id'])
    tables_subset = list(
        set(tables_subset_3k + random.sample(all_tables, 20000)))
    savepkl('./data/wo_strnum3.0/postive_tables_set.pkl', tables_subset)

    read_all_tables = [read_table(js)['data'] for js in tables_subset]
    X = data_prep_pipeline(read_all_tables)
    savepkl('./data/wo_strnum3.0/x_tokenised.pkl', X)
    # savepkl(f'./data/xp_{MAX_COL_LEN}-{MAX_ROW_LEN}.pkl', X)
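# A minimal sketch of the `read_table` helper assumed above: load one
# table's JSON file and return the parsed dict, whose 'data' key holds the
# cell grid. The directory layout and filename scheme here are guesses:
import json

def read_table(table_id):
    with open(f'../global_data/tables/{table_id}') as f:
        return json.load(f)  # expected to contain a 'data' key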