Example #1
    def __init__(self,
                 corpus_data_0,
                 corpus_data_1,
                 *,
                 params,
                 n_samples=10000000):
        self.skip_gram = [
            SkipGram(corpus_data_0.vocab_size + 1, params.emb_dim).to(GPU),
            SkipGram(corpus_data_1.vocab_size + 1, params.emb_dim).to(GPU)
        ]
        self.perm = Permutation(params.emb_dim,
                                params.p_sample_top,
                                n_units=params.p_n_units,
                                batch_norm=params.p_bn).to(GPU)
        self.sampler = [
            WordSampler(corpus_data_0.dic,
                        n_urns=n_samples,
                        alpha=params.p_sample_factor,
                        top=params.p_sample_top),
            WordSampler(corpus_data_1.dic,
                        n_urns=n_samples,
                        alpha=params.p_sample_factor,
                        top=params.p_sample_top)
        ]
        self.p_bs = params.p_bs
        self.p_sample_top = params.p_sample_top
        self.emb_dim = params.emb_dim
        self.vocab_size_0, self.vocab_size_1 = corpus_data_0.vocab_size, corpus_data_1.vocab_size
        self.perm_optimizer, self.perm_scheduler = optimizers.get_sgd_find_lr(
            self.perm.parameters(),
            lr=params.p_lr,
            wd=params.p_wd,
            momentum=params.p_momentum)
        self.entropy_loss = EntropyLoss()
Example #2
    def __init__(self, corpus_data_0, corpus_data_1, *, params, n_samples=10000000):
        self.skip_gram = [SkipGram(corpus_data_0.vocab_size + 1, params.emb_dim).to(GPU),
                          SkipGram(corpus_data_1.vocab_size + 1, params.emb_dim).to(GPU)]
        self.discriminator = Discriminator(params.emb_dim, n_layers=params.d_n_layers, n_units=params.d_n_units,
                                           drop_prob=params.d_drop_prob, drop_prob_input=params.d_drop_prob_input,
                                           leaky=params.d_leaky, batch_norm=params.d_bn).to(GPU)
        self.mapping = nn.Linear(params.emb_dim, params.emb_dim, bias=False)
        self.mapping.weight.data.copy_(torch.diag(torch.ones(params.emb_dim)))
        self.mapping = self.mapping.to(GPU)
        self.sg_optimizer, self.sg_scheduler = [], []
        for id in [0, 1]:
            optimizer, scheduler = optimizers.get_sgd_adapt(self.skip_gram[id].parameters(),
                                                            lr=params.sg_lr, mode="max")
            self.sg_optimizer.append(optimizer)
            self.sg_scheduler.append(scheduler)
        self.a_optimizer, self.a_scheduler = [], []
        for id in [0, 1]:
            optimizer, scheduler = optimizers.get_sgd_adapt(
                [{"params": self.skip_gram[id].u.parameters()}, {"params": self.skip_gram[id].v.parameters()}],
                lr=params.a_lr, mode="max")
            self.a_optimizer.append(optimizer)
            self.a_scheduler.append(scheduler)
        if params.d_optimizer == "SGD":
            self.d_optimizer, self.d_scheduler = optimizers.get_sgd_adapt(self.discriminator.parameters(),
                                                                          lr=params.d_lr, mode="max", wd=params.d_wd)

        elif params.d_optimizer == "RMSProp":
            self.d_optimizer, self.d_scheduler = optimizers.get_rmsprop_linear(self.discriminator.parameters(),
                                                                               params.n_steps,
                                                                               lr=params.d_lr, wd=params.d_wd)
        else:
            raise Exception(f"Optimizer {params.d_optimizer} not found.")
        if params.m_optimizer == "SGD":
            self.m_optimizer, self.m_scheduler = optimizers.get_sgd_adapt(self.mapping.parameters(),
                                                                          lr=params.m_lr, mode="max", wd=params.m_wd)
        elif params.m_optimizer == "RMSProp":
            self.m_optimizer, self.m_scheduler = optimizers.get_rmsprop_linear(self.mapping.parameters(),
                                                                               params.n_steps,
                                                                               lr=params.m_lr, wd=params.m_wd)
        else:
            raise Exception(f"Optimizer {params.m_optimizer} not found")
        self.m_beta = params.m_beta
        self.smooth = params.smooth
        self.loss_fn = nn.BCEWithLogitsLoss(reduction="elementwise_mean")
        self.corpus_data_queue = [
            _data_queue(corpus_data_0, n_threads=(params.n_threads + 1) // 2, n_sentences=params.n_sentences,
                        batch_size=params.sg_bs),
            _data_queue(corpus_data_1, n_threads=(params.n_threads + 1) // 2, n_sentences=params.n_sentences,
                        batch_size=params.sg_bs)
        ]
        self.sampler = [
            WordSampler(corpus_data_0.dic, n_urns=n_samples, alpha=params.a_sample_factor, top=params.a_sample_top),
            WordSampler(corpus_data_1.dic, n_urns=n_samples, alpha=params.a_sample_factor, top=params.a_sample_top)]
        self.d_bs = params.d_bs
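The mapping above is initialized to the identity and trained adversarially against the discriminator; params.m_beta most likely feeds the usual orthogonality-preserving update from Conneau et al.'s MUSE, W <- (1 + beta) * W - beta * (W W^T) W. A minimal sketch of that update, under the assumption that this is indeed what m_beta controls:

import torch
from torch import nn

def orthogonalize(mapping: nn.Linear, beta: float):
    # One MUSE-style orthogonalization step, applied after a mapping update:
    # W <- (1 + beta) * W - beta * (W W^T) W
    W = mapping.weight.data
    W.copy_((1 + beta) * W - beta * W.mm(W.t()).mm(W))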
Example #3
    def __init__(self,
                 corpus_data_0,
                 corpus_data_1,
                 *,
                 params,
                 n_samples=10000000):
        self.skip_gram = [
            SkipGram(corpus_data_0.vocab_size + 1, params.emb_dim).to(GPU),
            SkipGram(corpus_data_1.vocab_size + 1, params.emb_dim).to(GPU)
        ]
        self.perm = Permutation(params.emb_dim,
                                params.p_sample_top,
                                n_units=params.p_n_units,
                                batch_norm=params.p_bn).to(GPU)
        self.sampler = [
            WordSampler(corpus_data_0.dic,
                        n_urns=n_samples,
                        alpha=params.p_sample_factor,
                        top=params.p_sample_top),
            WordSampler(corpus_data_1.dic,
                        n_urns=n_samples,
                        alpha=params.p_sample_factor,
                        top=params.p_sample_top)
        ]
        self.p_bs = params.p_bs
        self.i_bs = params.i_bs
        self.p_sample_top = params.p_sample_top
        self.emb_dim = params.emb_dim
        self.vocab_size_0, self.vocab_size_1 = corpus_data_0.vocab_size, corpus_data_1.vocab_size
        self.perm_optimizer, self.perm_scheduler = optimizers.get_sgd_adapt(
            self.perm.parameters(),
            lr=params.p_lr,
            mode="min",
            wd=params.p_wd,
            momentum=params.p_momentum,
            factor=params.p_lr_factor,
            patience=params.p_lr_patience)
        self.entropy_loss = EntropyLoss()
        self.init_target = None
        self.init_loss_fn = nn.CrossEntropyLoss(reduction="elementwise_mean")
        self.i_sampler = [
            WordSampler(corpus_data_0.dic,
                        n_urns=n_samples,
                        alpha=params.p_sample_factor,
                        top=params.i_n_init),
            WordSampler(corpus_data_1.dic,
                        n_urns=n_samples,
                        alpha=params.p_sample_factor,
                        top=params.i_n_init)
        ]
Example #4
    def __init__(self, corpus_path, dic_path, *, max_ws, n_ns, threshold, n_negatives=10000000, shuffle=False):
        self.corpus_path = corpus_path
        meta = torch.load(f"{corpus_path}.meta.pt")
        self.n_docs = meta["n_docs"]
        self.n_tokens = meta["n_tokens"]
        self.dic = torch.load(f"{dic_path}.pt")
        self.vocab_size = len(self.dic)
        self.negative_sampler = WordSampler(self.dic, n_urns=n_negatives, alpha=0.5)
        self.max_ws = max_ws
        self.n_ns = n_ns
        self.shuffle = shuffle
        self.p_discard = get_discard_table(self.dic, self.n_tokens, threshold)

        self.corpus_block = None
        self.corpus_idx = -1
Example #5
class CorpusData(Dataset):
    def __init__(self,
                 path_data,
                 path_model,
                 *,
                 max_ws,
                 n_ns,
                 threshold,
                 n_negatives=10000000):
        with open(path_data, mode="r", encoding="utf-8") as f:
            self.n_docs = sum(1 for _ in f)
        self.model = fastText.load_model(path_model)
        self.dic = list(zip(*self.model.get_words(include_freq=True)))
        self.n_tokens = sum(freq for _, freq in self.dic)
        self.file = open(path_data, mode="r", encoding="utf-8")
        self.p_discard = get_discard_table(self.dic, self.n_tokens, threshold)
        self.negative_sampler = WordSampler(self.dic,
                                            n_urns=n_negatives,
                                            alpha=0.5)
        self.max_ws = max_ws
        self.n_ns = n_ns

    def __getitem__(self, index):
        doc = self.file.readline()
        if not doc:
            self.file.seek(0)
            doc = self.file.readline()
        doc = self.model.get_line(doc.strip())[0]
        doc = [self.model.get_word_id(w) for w in doc]
        doc = [
            w for w in doc if w != -1 and np.random.rand() >= self.p_discard[w]
        ]
        c, u_b, v_b = 0, [], []
        for i in range(len(doc)):
            u = doc[i]
            ws = np.random.randint(1, self.max_ws + 1)
            for j in range(-ws, ws + 1):
                if j != 0 and 0 <= i + j < len(doc):
                    v = torch.LongTensor(self.n_ns + 1)
                    v[0] = doc[i + j]
                    for k in range(1, self.n_ns + 1):
                        v[k] = int(self.negative_sampler.sample_neg(doc[i + j]))
                    u_b.append(u)
                    v_b.append(v)
                    c += 1
        u_b = torch.LongTensor(u_b)
        if c > 0:
            v_b = torch.stack(v_b).view(c, self.n_ns + 1)
        else:
            v_b = torch.LongTensor([]).view(c, self.n_ns + 1)
        return u_b, v_b  # u_b: Int[c] ; v_b: Int[c, 6]

    def __len__(self):
        return self.n_docs
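Because __getitem__ already returns one full batch of (center, context-plus-negatives) pairs per document, the Dataset is typically consumed with batch_size=1 and the dummy batch dimension squeezed away. A usage sketch (the corpus and fastText model paths are placeholders; num_workers=0 because the class keeps a single open file handle):

from torch.utils.data import DataLoader

dataset = CorpusData("corpus.en.txt", "cc.en.300.bin",   # placeholder paths
                     max_ws=5, n_ns=5, threshold=1e-4)
loader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)

for u_b, v_b in loader:
    # the default collate adds a leading dimension of size 1; drop it again
    u_b, v_b = u_b.squeeze(0), v_b.squeeze(0)
    # u_b: LongTensor[c], v_b: LongTensor[c, n_ns + 1]
    ...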
Example #6
    def __init__(self,
                 path_data,
                 path_model,
                 *,
                 max_ws,
                 n_ns,
                 threshold,
                 n_negatives=10000000):
        with open(path_data, mode="r", encoding="utf-8") as f:
            self.n_docs = sum(1 for _ in f)
        self.model = fastText.load_model(path_model)
        self.dic = list(zip(*self.model.get_words(include_freq=True)))
        self.n_tokens = sum(freq for _, freq in self.dic)
        self.file = open(path_data, mode="r", encoding="utf-8")
        self.p_discard = get_discard_table(self.dic, self.n_tokens, threshold)
        self.negative_sampler = WordSampler(self.dic,
                                            n_urns=n_negatives,
                                            alpha=0.5)
        self.max_ws = max_ws
        self.n_ns = n_ns
Example #7
class CorpusData(Dataset):
    def __init__(self, corpus_path, dic_path, *, max_ws, n_ns, threshold, n_negatives=10000000, shuffle=False):
        self.corpus_path = corpus_path
        meta = torch.load(f"{corpus_path}.meta.pt")
        self.n_docs = meta["n_docs"]
        self.n_tokens = meta["n_tokens"]
        self.dic = torch.load(f"{dic_path}.pt")
        self.vocab_size = len(self.dic)
        self.negative_sampler = WordSampler(self.dic, n_urns=n_negatives, alpha=0.5)
        self.max_ws = max_ws
        self.n_ns = n_ns
        self.shuffle = shuffle
        self.p_discard = get_discard_table(self.dic, self.n_tokens, threshold)

        self.corpus_block = None
        self.corpus_idx = -1

    def __getitem__(self, index):
        idx = (index // BLOCK_SIZE, index % BLOCK_SIZE)
        if idx[0] != self.corpus_idx:
            self.corpus_idx = idx[0]
            self.corpus_block = torch.load(f"{self.corpus_path}.{self.corpus_idx}.pt")
            gc.collect()
        doc = torch.IntTensor([w for w in self.corpus_block[idx[1]]
                               if w == self.vocab_size or np.random.rand() >= self.p_discard[w]])
        c, pos_u_b, pos_v_b, neg_v_b = 0, [], [], []
        for i in range(doc.shape[0]):
            pos_u = doc[i].item()
            ws = np.random.randint(1, self.max_ws + 1)
            for j in range(-ws, ws + 1):
                if j != 0 and 0 <= i + j < doc.shape[0]:
                    pos_v = doc[i + j].item()
                    neg_v = torch.LongTensor(self.n_ns)
                    for k in range(self.n_ns):
                        neg_v[k] = int(self.negative_sampler.sample_neg(pos_v))
                    pos_u_b.append(pos_u)
                    pos_v_b.append(pos_v)
                    neg_v_b.append(neg_v)
                    c += 1
        pos_u_b = torch.LongTensor(pos_u_b).view(c, 1)
        pos_v_b = torch.LongTensor(pos_v_b).view(c, 1)
        if c > 0:
            neg_v_b = torch.stack(neg_v_b).view(c, self.n_ns)
        else:
            neg_v_b = torch.LongTensor([]).view(c, self.n_ns)
        if self.shuffle:
            perm = torch.randperm(c)
            return pos_u_b[perm], pos_v_b[perm], neg_v_b[perm]
        else:
            return pos_u_b, pos_v_b, neg_v_b

    def __len__(self):
        return self.n_docs
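This variant streams a preprocessed corpus stored as fixed-size blocks of word-id documents plus a small metadata file. A sketch of a writer that produces the layout __getitem__ expects; BLOCK_SIZE and the tokenized input are assumptions, only the file naming and the meta fields come from the class above:

import torch

BLOCK_SIZE = 100000  # assumed; must match the BLOCK_SIZE used by CorpusData

def save_corpus_blocks(corpus_path, docs):
    # docs: iterable of documents, each a list of word ids
    n_docs, n_tokens, block = 0, 0, []
    for doc in docs:
        block.append(doc)
        n_docs += 1
        n_tokens += len(doc)
        if len(block) == BLOCK_SIZE:
            torch.save(block, f"{corpus_path}.{(n_docs - 1) // BLOCK_SIZE}.pt")
            block = []
    if block:
        torch.save(block, f"{corpus_path}.{(n_docs - 1) // BLOCK_SIZE}.pt")
    torch.save({"n_docs": n_docs, "n_tokens": n_tokens}, f"{corpus_path}.meta.pt")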
Example #8
    def __init__(self, params, *, n_samples=10000000):
        self.model = [
            fastText.load_model(
                os.path.join(params.dataDir, params.model_path_0)),
            fastText.load_model(
                os.path.join(params.dataDir, params.model_path_1))
        ]
        self.dic = [
            list(zip(*self.model[id].get_words(include_freq=True)))
            for id in [0, 1]
        ]
        x = [
            np.empty((params.vocab_size, params.emb_dim), dtype=np.float64)
            for _ in [0, 1]
        ]
        for id in [0, 1]:
            for i in range(params.vocab_size):
                x[id][i, :] = self.model[id].get_word_vector(
                    self.dic[id][i][0])
            x[id] = normalize_embeddings_np(x[id], params.normalize_pre)
        u0, s0, _ = scipy.linalg.svd(x[0], full_matrices=False)
        u1, s1, _ = scipy.linalg.svd(x[1], full_matrices=False)
        if params.spectral_align_pre:
            s = (s0 + s1) * 0.5
            x[0] = u0 @ np.diag(s)
            x[1] = u1 @ np.diag(s)
        else:
            x[0] = u0 @ np.diag(s0)
            x[1] = u1 @ np.diag(s1)
        self.embedding = [
            nn.Embedding.from_pretrained(torch.from_numpy(x[id]).to(
                torch.float).to(GPU),
                                         freeze=True,
                                         sparse=True) for id in [0, 1]
        ]
        self.discriminator = Discriminator(
            params.emb_dim,
            n_layers=params.d_n_layers,
            n_units=params.d_n_units,
            drop_prob=params.d_drop_prob,
            drop_prob_input=params.d_drop_prob_input,
            leaky=params.d_leaky,
            batch_norm=params.d_bn).to(GPU)
        self.mapping = Mapping(params.emb_dim).to(GPU)
        if params.d_optimizer == "SGD":
            self.d_optimizer, self.d_scheduler = optimizers.get_sgd_adapt(
                self.discriminator.parameters(),
                lr=params.d_lr,
                mode="max",
                wd=params.d_wd)

        elif params.d_optimizer == "RMSProp":
            self.d_optimizer, self.d_scheduler = optimizers.get_rmsprop_linear(
                self.discriminator.parameters(),
                params.n_steps,
                lr=params.d_lr,
                wd=params.d_wd)
        else:
            raise Exception(f"Optimizer {params.d_optimizer} not found.")
        if params.m_optimizer == "SGD":
            self.m_optimizer, self.m_scheduler = optimizers.get_sgd_adapt(
                self.mapping.parameters(),
                lr=params.m_lr,
                mode="max",
                wd=params.m_wd,
                factor=params.m_lr_decay,
                patience=params.m_lr_patience)
        elif params.m_optimizer == "RMSProp":
            self.m_optimizer, self.m_scheduler = optimizers.get_rmsprop_linear(
                self.mapping.parameters(),
                params.n_steps,
                lr=params.m_lr,
                wd=params.m_wd)
        else:
            raise Exception(f"Optimizer {params.m_optimizer} not found")
        self.m_beta = params.m_beta
        self.smooth = params.smooth
        self.wgan = params.wgan
        self.d_clip_mode = params.d_clip_mode
        if params.wgan:
            self.loss_fn = _wasserstein_distance
        else:
            self.loss_fn = nn.BCEWithLogitsLoss(reduction="elementwise_mean")
        self.sampler = [
            WordSampler(self.dic[id],
                        n_urns=n_samples,
                        alpha=params.a_sample_factor,
                        top=params.a_sample_top) for id in [0, 1]
        ]
        self.d_bs = params.d_bs
        self.d_gp = params.d_gp