Example #1
    def save_dict(self, path, gz=False):
        """Save dictionary into a file.

        Parameters:
            path (str): Path to the file.
            gz (bool): Whether to gzip-compress the file.
        """
        dic = {'w2i': self.w2i, 'i2w': self.i2w}
        savepkl(dic, path, gz=gz)
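
The savepkl and loadpkl helpers are project-specific, not a standard library. A minimal sketch consistent with the call signature used in Examples #1, #2 and #5 (savepkl(obj, path, gz=False)) could look like the following; note that Examples #3, #6 and #7 appear to come from a different project whose savepkl takes the path as the first argument.

import gzip
import pickle

def savepkl(obj, path, gz=False):
    # Pickle obj to path, optionally gzip-compressing the file.
    opener = gzip.open if gz else open
    with opener(path, 'wb') as f:
        pickle.dump(obj, f)

def loadpkl(path, gz=False):
    # Inverse of savepkl: load a (possibly gzipped) pickle file.
    opener = gzip.open if gz else open
    with opener(path, 'rb') as f:
        return pickle.load(f)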
Example #2
    def _save_cache(self, path: Path):
        r"""Save the dataset to cache in the data directory.

        :param path: The path to the data directory.
        """
        cache_dir = path / '_cached'
        params_index_path = cache_dir / 'params_index.pkl'
        params = self._get_params()
        if not cache_dir.exists():
            cache_dir.mkdir()
            index = 0
            params_index = [params]
        else:
            params_index = loadpkl(params_index_path)
            index = next((idx for idx in range(len(params_index))
                          if not (cache_dir / f'{idx}.pkl').exists()),
                         len(params_index))
            params_index[index:(index + 1)] = [params]  # replace or append
        savepkl(params_index, params_index_path)
        load_path = cache_dir / f'{index}.pkl'
        savepkl(self.batches, load_path)
        LOGGER.info(f"Dataset cached to {load_path}, with settings: {params}")
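
The line params_index[index:(index + 1)] = [params] relies on list slice assignment: if index points at an existing entry it is overwritten in place, and if index equals len(params_index) the slice is empty, so the new entry is appended. A small standalone illustration:

params_index = ['a', 'b']
params_index[1:2] = ['B']      # index 1 exists -> replaced in place
assert params_index == ['a', 'B']
params_index[2:3] = ['c']      # index == len -> empty slice, so appended
assert params_index == ['a', 'B', 'c']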
Example #3
def generate_vocab(X):
    '''
    Generating word distribution dataframe
    '''
    result = flatten_1_deg(flatten_1_deg(flatten_1_deg(X)))
    query_l = [tokenize_str(i) for i in list(baseline_f['query'].unique())]
    query_l = flatten_1_deg(query_l)
    result += query_l
    # print(result[:10])
    count = Counter(result)
    c = [[i, count[i]] for i in count.keys()]
    df = pd.DataFrame(c)
    df.sort_values(by=[1], ascending=False, inplace=True)
    df.to_csv('./data/word_distr_2D_complete.csv', index=False, columns=None)
    '''
    Getting the vocab from the data
    '''
    vocab = list(set(count.keys()))
    vocab.insert(0, '<PAD>')
    vocab.insert(0, '<UNK>')
    print(f'vocab: {len(vocab)}\n')
    savepkl(f'./data/vocab_{MAX_COL_LEN}-{MAX_ROW_LEN}.pkl', vocab)
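
Because '<PAD>' is inserted at position 0 first and '<UNK>' second, the final vocabulary starts with '<UNK>' at index 0 and '<PAD>' at index 1. A quick check of that ordering, using hypothetical tokens in place of Counter(result).keys():

vocab = ['table', 'cell']      # hypothetical tokens
vocab.insert(0, '<PAD>')
vocab.insert(0, '<UNK>')
assert vocab[:2] == ['<UNK>', '<PAD>']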
Example #4
            for i in range(0, table_prep_params['MAX_ROW_LEN'] - rows):
                table.append(
                    [['<PAD>'] * table_prep_params['LENGTH_PER_CELL']] *
                    table_prep_params['MAX_COL_LEN'])
            return table

        def table_words2index(tables):
            w2i = {w: i for i, w in enumerate(vocab)}
            for i, t in enumerate(tables):
                tables[i] = np.vectorize(lambda y: w2i[y])(
                    np.array(t)).tolist()
            return tables

        p = Pool(processes=40)
        X = p.map(pad_table, X)
        p.close()
        p.join()
        X = table_words2index(X)
        X = np.array(X)
        print(X.shape)
        savepkl('./data/xp_2D_10-50_pad.pkl', X)
    else:
        device = torch.device(
            f"cuda:{1}" if torch.cuda.is_available() else 'cpu')
        dataset = T2VDataset(X, y, vocab, device, config)
        dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
        X_, y_ = next(iter(dataloader))
        print(X_.shape, y_.shape)

    print(time.time() - start)
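
The table_words2index helper above maps every token in a nested table to its vocabulary index with np.vectorize, which only works if every token appears in vocab. A minimal standalone illustration with hypothetical tokens and vocabulary:

import numpy as np

vocab = ['<UNK>', '<PAD>', 'year', 'city']
w2i = {w: i for i, w in enumerate(vocab)}
table = [['year', 'city'], ['<PAD>', '<PAD>']]
indexed = np.vectorize(lambda y: w2i[y])(np.array(table)).tolist()
assert indexed == [[2, 3], [1, 1]]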
Example #5
    def _extra_init(self, loaded_batches: bool):
        self.rel_vocab = Vocab.from_dict(self._path / 'rel_names.pkl',
                                         mode='i2w')
        self.vocab: Dict[str, Vocab] = {
            "word": self.word_vocab,
            "rel": self.rel_vocab
        }

        self.max_unkrel = max(
            (-rel_typ - 3 for rel_typ in self.rel_vocab.i2w if rel_typ < -3),
            default=0)

        if self._use_fasttext:

            def _alias_path(name):
                path = Path(self._fasttext_model_path)
                return path.parent / (path.name + f'.{name}')

            # gather all entity aliases and compute fastText embeddings
            alias_dict_path = _alias_path('alias_dict.pkl')
            if alias_dict_path.exists():
                alias_dict: Dict[str, int] = loadpkl(alias_dict_path)
                loaded = True
            else:
                alias_dict = defaultdict(lambda: len(alias_dict))
                loaded = False
            if not loaded_batches:
                for dataset in self.data.values():
                    for example in dataset:
                        for idx, rel in enumerate(example.relations):  # type: ignore
                            example.relations[idx] = rel._replace(  # type: ignore
                                obj_alias=[alias_dict[s] for s in rel.obj_alias])
            if not alias_dict_path.exists():
                alias_dict = dict(alias_dict)
                savepkl(alias_dict, alias_dict_path)

            alias_vectors_path = _alias_path('alias_vectors.pt')
            if not alias_vectors_path.exists() or not loaded:
                import fastText
                ft_model = fastText.load_model(self._fasttext_model_path)
                alias_vectors = []
                alias_list = utils.reverse_map(alias_dict)
                for alias in utils.progress(alias_list,
                                            desc="Building fastText vectors",
                                            ascii=True,
                                            ncols=80):
                    vectors = [
                        ft_model.get_word_vector(w) for w in alias.split()
                    ]
                    vectors = np.sum(vectors, axis=0).tolist()
                    alias_vectors.append(vectors)
                alias_vectors = torch.tensor(alias_vectors)
                torch.save(alias_vectors, alias_vectors_path)

        if not loaded_batches and (self._exclude_entity_disamb
                                   or self._exclude_alias_disamb):
            # no need to do this if batches are loaded
            if self._exclude_entity_disamb:
                # gather training set stats
                self.entity_count_per_type = self.gather_entity_stats(
                    self.data['train'])

            for dataset in self.data.values():
                for idx in range(len(dataset)):
                    dataset[idx] = self.remove_ambiguity(
                        dataset[idx], self._exclude_entity_disamb,
                        self._exclude_alias_disamb)
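
alias_dict = defaultdict(lambda: len(alias_dict)) is the usual auto-indexing idiom: looking up an unseen alias assigns it the next free integer id, while repeated lookups reuse the existing id. utils.reverse_map is project code; it presumably inverts such a mapping into a list indexed by id. A minimal sketch of both, under that assumption:

from collections import defaultdict

alias_dict = defaultdict(lambda: len(alias_dict))
ids = [alias_dict[a] for a in ['Paris', 'London', 'Paris']]
assert ids == [0, 1, 0]            # repeated aliases reuse their id

def reverse_map(d):
    # Assumed behaviour: turn {key: index} into a list where list[index] == key.
    result = [None] * len(d)
    for key, idx in d.items():
        result[idx] = key
    return result

assert reverse_map(dict(alias_dict)) == ['Paris', 'London']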
Example #6
# Testing tables
# table-0614-640.json
# table-1225-209.json
# print_table(X[6705])

if __name__ == "__main__":
    baseline_f = pd.read_csv('../global_data/features.csv')
    tables_subset_3k = list(baseline_f['table_id'])
    tables_subset = list(
        set(tables_subset_3k + random.sample(all_tables, 20000)))
    '''
    Generating Positive X, y dataset from the tables
    '''
    X_p, y_p = data_prep_pipeline(tables_subset, '+')
    savepkl(f'./data/xp_2D_{MAX_COL_LEN}-{MAX_ROW_LEN}.pkl', X_p)
    savepkl(f'./data/yp_2D_{MAX_COL_LEN}-{MAX_ROW_LEN}.pkl', y_p)
    '''
    Generating Negative X, y dataset from the tables (1.3 x X_p)
    '''
    size = int(len(X_p) * 1.3)
    with Pool(40) as p:
        # Consume the imap iterator inside the with block so all results are
        # collected before the pool is torn down on exit.
        X_n = list(tqdm(p.imap(generate_neg_table, range(size)), total=size))
    X_n, y_n = data_prep_pipeline(X_n, '-')
    savepkl(f'./data/xn_2D_{MAX_COL_LEN}-{MAX_ROW_LEN}.pkl', X_n)
    savepkl(f'./data/yn_2D_{MAX_COL_LEN}-{MAX_ROW_LEN}.pkl', y_n)
    '''
    Generating word distribution dataframe
    '''
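
The negative-sampling step above streams results from Pool.imap through tqdm; the iterator has to be materialized with list(...) inside the with block, since the pool is terminated when the block exits. A minimal self-contained version of that pattern:

from multiprocessing import Pool
from tqdm import tqdm

def square(i):
    return i * i

if __name__ == "__main__":
    size = 1000
    with Pool(4) as p:
        # list(...) drains the imap iterator before the pool is terminated
        results = list(tqdm(p.imap(square, range(size)), total=size))
    print(len(results))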
Example #7
    X = p.map(tokenize_table, X)
    p.close()
    p.join()
    # print_table(X[0])

    # '''
    # Remove totally empty tables, generating vocab and clipping cells to max_cell_len
    # '''
    # generate_vocab(X)
    X = remove_empty_tables(X)
    # X = cell_overflow_cap(X)

    X = np.array(X)
    print(X.shape)
    # print_table(X[0])

    return X


if __name__ == "__main__":
    baseline_f = pd.read_csv('../global_data/features.csv')
    tables_subset_3k = list(baseline_f['table_id'])
    tables_subset = list(
        set(tables_subset_3k + random.sample(all_tables, 20000)))

    savepkl(f'./data/wo_strnum3.0/postive_tables_set.pkl', tables_subset)
    read_all_tables = [read_table(js)['data'] for js in tables_subset]
    X = data_prep_pipeline(read_all_tables)
    savepkl(f'./data/wo_strnum3.0/x_tokenised.pkl', X)
    # savepkl(f'./data/xp_{MAX_COL_LEN}-{MAX_ROW_LEN}.pkl', X)
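
read_table is project code that loads one stored table by its JSON file name (e.g. 'table-0614-640.json') and returns a dict containing a 'data' field. A hedged sketch under that assumption; TABLES_DIR is a hypothetical location:

import json
from pathlib import Path

TABLES_DIR = Path('../global_data/tables')   # hypothetical directory

def read_table(js):
    # Assumed behaviour: parse the table's JSON file and return the dict,
    # which includes the 'data' key used above.
    with open(TABLES_DIR / js) as f:
        return json.load(f)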