Example #1: Ray Tune trainable for the Doc2Vec (SGNS) embedding model.
 def train_d2v(config):
     adjust_config()
     data = dataset()
     d2vpairs = D2VPairs(dataset=data, **config["pairs"])
     d2vemb = SGNS(d2vpairs, dataset=data, **config["model"])
     last_iter = str(config["model"]["iterations"])
     macro, acc = eval_model(d2vemb.embeddings[last_iter], dataset=data)
     tune.track.log(f1_macro=macro,
                    accuracy=acc,
                    avg_loss=d2vemb.losses[last_iter])
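The eval_model helper used here (and in all later examples) is not listed. A minimal sketch of the evaluation it is assumed to perform, namely k-fold logistic-regression node classification over the learned embeddings, returning macro-F1 and accuracy; the dataset.nodes and dataset.labels attributes are hypothetical:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold


def eval_model(embedding, dataset, folds=5):
    # Hypothetical sketch: `embedding` maps node ids to vectors and
    # `dataset` exposes the node ids and their class labels.
    X = np.array([embedding[n] for n in dataset.nodes])
    y = np.array([dataset.labels[n] for n in dataset.nodes])
    macros, accs = [], []
    for train_idx, test_idx in StratifiedKFold(n_splits=folds,
                                               shuffle=True).split(X, y):
        clf = LogisticRegression(max_iter=1000).fit(X[train_idx], y[train_idx])
        pred = clf.predict(X[test_idx])
        macros.append(f1_score(y[test_idx], pred, average="macro"))
        accs.append(accuracy_score(y[test_idx], pred))
    return float(np.mean(macros)), float(np.mean(accs))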
Example #2: Ray Tune trainable for the JCE model, trained jointly on Doc2Vec and DeepWalk pairs.
 def train_jce(config):
     adjust_config()
     data = dataset()
     d2vpairs = D2VPairs(dataset=data, **config["pairs_d2v"])
     dwpairs = DWPairs(dataset=data, **config["pairs_dw"])
     jceemb = JCE(data=[d2vpairs, dwpairs], dataset=data, **config["model"])
     last_iter = str(config["model"]["iterations"])
     macro, acc = eval_model(jceemb.embeddings[last_iter], dataset=data)
     tune.track.log(f1_macro=macro,
                    accuracy=acc,
                    avg_loss=jceemb.losses[last_iter])
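Both trainables can then be handed to Ray Tune. A minimal launch sketch, assuming the function-based tune.run API that matches the tune.track.log calls above (older Ray Tune releases); the search-space keys are hypothetical placeholders for whatever D2VPairs and SGNS actually accept:

from ray import tune

# Hypothetical search space; the concrete keys under "pairs" and "model"
# must match the keyword arguments of D2VPairs and SGNS.
search_space = {
    "pairs": {"window": tune.grid_search([5, 10])},
    "model": {
        "embedding_dimension": tune.grid_search([64, 128]),
        "iterations": 50,
    },
}

analysis = tune.run(train_d2v, config=search_space)
# train_jce is launched the same way, with "pairs_d2v" and "pairs_dw"
# entries in place of "pairs".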
Example #3: Training loop over a single pair dataset with one optimizer and scheduler.
    def train(self):
        for epoch in range(self.iterations):
            self.data.make_pairs()
            loader = DataLoader(self.data,
                                batch_size=self.batch_size,
                                shuffle=True,
                                num_workers=self.workers,
                                pin_memory=True)

            total_batches = len(loader)
            tenth = max(total_batches // 10, 1)  # guard against fewer than 10 batches
            epoch_loss = 0
            epoch_batches = 0
            avg_loss = 0

            loop = tqdm(enumerate(loader),
                        total=total_batches,
                        disable=(not config.progress))
            for i, (pos_u, pos_v, neg_v) in loop:
                epoch_batches += 1

                if self.use_cuda:
                    pos_u = pos_u.cuda()
                    pos_v = pos_v.cuda()
                    neg_v = neg_v.cuda()

                # Zero gradients
                self.optimizer.zero_grad()
                # Execute forward pass and get loss
                loss = self.model.forward(pos_u, pos_v, neg_v)
                epoch_loss += loss.detach()  # detach so the running total does not keep the autograd graph alive
                # Execute backward pass
                loss.backward()
                self.optimizer.step()

                if i % tenth == 0:
                    avg_loss = epoch_loss / epoch_batches
                    config.debug(
                        f'Epoch {epoch+1}/{self.iterations} - {int(i/total_batches * 100)}%'
                    )

                if config.progress:
                    loop.set_description(
                        f'Epoch {epoch+1}/{self.iterations}, Total Loss {epoch_loss.round()}, Avg. Loss {avg_loss.round()}'
                        + f', LR {self.get_current_lr()}')
            config.debug(f'Epoch {epoch+1}/{self.iterations} - 100%')
            loop.close()
            del loader, loop
            self.scheduler.step()
            self.report_values(epoch, epoch_loss / epoch_batches)
            if self.dataset:
                print("Evaluation: ",
                      eval_model(self.get_embedding(), self.dataset, folds=10))
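The loop above regenerates the training pairs at the start of every epoch via self.data.make_pairs() and then draws (pos_u, pos_v, neg_v) batches from a standard DataLoader. A minimal sketch of the Dataset contract this relies on; the random pair generation is only a placeholder for the real D2VPairs/DWPairs logic:

import torch
from torch.utils.data import Dataset


class PairDataset(Dataset):
    """Hypothetical skeleton of the interface the training loop expects."""

    def __init__(self, n_items=100, n_pairs=1000, n_negatives=5):
        self.n_items = n_items
        self.n_pairs = n_pairs
        self.n_negatives = n_negatives
        self.pairs = torch.empty(0, 2, dtype=torch.long)
        self.negatives = torch.empty(0, n_negatives, dtype=torch.long)

    def make_pairs(self):
        # Placeholder: draw random (center, context) ids and negatives so the
        # skeleton runs; the real classes derive pairs from the corpus/graph.
        self.pairs = torch.randint(self.n_items, (self.n_pairs, 2))
        self.negatives = torch.randint(self.n_items,
                                       (self.n_pairs, self.n_negatives))

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        pos_u, pos_v = self.pairs[idx]
        # The DataLoader collates these into the (pos_u, pos_v, neg_v)
        # batches unpacked in the loop above.
        return pos_u, pos_v, self.negatives[idx]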
Example #4: Training loop over multiple pair datasets, interleaving their batches round-robin and selecting the optimizer per algorithm.
    def train(self):
        for epoch in range(self.iterations):
            for i, data in enumerate(self.data):
                data.make_pairs(alg_index=i)
            loaders = [
                DataLoader(pairs,
                           batch_size=self.batch_sizes[i],
                           shuffle=True,
                           num_workers=self.workers,
                           pin_memory=True)
                for i, pairs in enumerate(self.data)
            ]
            batches = round_robin([iter(loader) for loader in loaders])

            total_batches = sum(len(loader) for loader in loaders)
            tenth = max(total_batches // 10, 1)  # guard against fewer than 10 batches
            epoch_loss = 0
            epoch_batches = 0
            avg_loss = 0

            loop = tqdm(enumerate(batches),
                        total=total_batches,
                        disable=(not config.progress))
            for i, (pos_u, pos_v, neg_v, alg_index) in loop:
                alg_index = alg_index.data[0]  # Collapse the [batch_size] vector to one index (the whole batch comes from a single algorithm)
                epoch_batches += 1

                if self.use_cuda:
                    pos_u = pos_u.cuda()
                    pos_v = pos_v.cuda()
                    neg_v = neg_v.cuda()
                    alg_index = alg_index.cuda()

                # Disable backward autograd for all embeddings except the one belonging to the current algorithm
                if self.disable_grad:
                    optimizer = self.optimizers[alg_index]
                    disabled_embeddings = []
                    for name, param in self.model.named_parameters():
                        if name.startswith(
                                'u_embeddings'
                        ) and not name.startswith(f'u_embeddings.{alg_index}'):
                            param.requires_grad = False
                            disabled_embeddings.append(param)
                else:
                    # for SINE (not disabling grad) we select the same optimizer for everything
                    optimizer = self.optimizers[0]

                # Zero gradients
                optimizer.zero_grad()
                # Execute forward pass and get loss
                loss = self.model.forward(pos_u, pos_v, neg_v, alg_index)
                epoch_loss += loss.detach()  # detach so the running total does not keep the autograd graph alive
                # Execute backward pass
                loss.backward()
                optimizer.step()

                # Re-enable all disabled embeddings for next batch
                if self.disable_grad:
                    for disabled_emb in disabled_embeddings:
                        disabled_emb.requires_grad = True

                if i % tenth == 0:
                    avg_loss = epoch_loss / epoch_batches
                    config.debug(
                        f'Epoch {epoch+1}/{self.iterations} - {int(i/total_batches * 100)}%'
                    )

                if config.progress:
                    loop.set_description(
                        f'Epoch {epoch+1}/{self.iterations}, Total Loss {epoch_loss.round()}, Avg. Loss {avg_loss.round()}'
                        + f', LR {self.get_current_lr()}')
            config.debug(f'Epoch {epoch+1}/{self.iterations} - 100%')
            loop.close()
            del loop, batches, loaders
            for scheduler in self.schedulers:
                scheduler.step()
            self.report_values(epoch, epoch_loss / epoch_batches)
            if self.dataset:
                print("Evaluation: ",
                      eval_model(self.get_embedding(), self.dataset, folds=10))
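The round_robin helper that interleaves batches from the per-algorithm loaders is not listed. A minimal sketch, assuming it simply cycles over the loader iterators and drops each one once it is exhausted, so that exactly sum(len(loader)) batches are yielded:

def round_robin(iterators):
    # Hypothetical sketch: yield one item from each iterator in turn,
    # removing exhausted iterators, until none are left.
    iterators = list(iterators)
    while iterators:
        alive = []
        for it in iterators:
            try:
                yield next(it)
                alive.append(it)
            except StopIteration:
                continue
        iterators = alive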
def main():

    for drop_percentage in drop_percentages:
        os.system("mkdir {}/{} 2> /dev/null".format(TIMESTAMP_CACHE_DIR,
                                                    drop_percentage))
        if 0 == drop_percentage:
            # run on Full dataset
            test_to_be_run = ["full_dataset"]
        else:
            test_to_be_run = test_cases
        for test in test_to_be_run:
            print("Performing test {} with drop_percentage = {}".format(
                test, drop_percentage))
            dataset = copy.deepcopy(complete_dataset)
            # do not perform any edge removal when running on full dataset
            if "full_dataset" != test:
                print("Removing process ...")
                remove_method = getattr(dataset, test)
                remove_method(drop_percentage)

            os.system("mkdir {}/{}/{} 2> /dev/null".format(
                TIMESTAMP_CACHE_DIR, drop_percentage, test))
            cache_dir = "{}/{}/{}".format(TIMESTAMP_CACHE_DIR, drop_percentage,
                                          test)

            params["d2v"]["model"]["cache_file"] = cache_dir + "/d2v"
            params["dw"]["model"]["cache_file"] = cache_dir + "/dw"
            params["sine"]["cache_file"] = cache_dir + "/sine"

            ################################################################
            # Saving dataset
            ################################################################
            print("Saving dataset with removed edges to "
                  "{}/missing_edges_dataset.p".format(cache_dir))
            with open("{}/missing_edges_dataset.p".format(cache_dir), "wb") as f:
                pickle.dump(dataset, f)

            ################################################################
            # Making pairs
            ################################################################
            d2vpairs = D2VPairs(dataset=dataset, **params["d2v"]["pairs"])
            dwpairs = DWPairs(dataset=dataset, **params["dw"]["pairs"])

            #############################################
            # Doc2Vec
            #############################################
            print("Running Doc2Vec ...")
            d2vmodel = SGNS(d2vpairs, **params["d2v"]["model"])
            d2vEmb = d2vmodel.embeddings[str(
                params["d2v"]["model"]["iterations"])]
            print("Eval D2V: ", eval_model(d2vEmb, dataset=dataset))

            #############################################
            # DeepWalk
            #############################################
            print("Running DeepWalk ...")
            dwmodel = SGNS(dwpairs, **params["dw"]["model"])
            dwEmb = dwmodel.embeddings[str(
                params["dw"]["model"]["iterations"])]
            print("Eval DW: ", eval_model(dwEmb, dataset=dataset))

            #############################################
            # SINE
            #############################################
            print("Running SINE ...")
            sinemodel = JCE(data=[d2vpairs, dwpairs],
                            disable_grad=False,
                            **params["sine"])
            sineEmb = sinemodel.embeddings[str(params["sine"]["iterations"])]
            print("Eval JCE (SINE): ", eval_model(sineEmb, dataset=dataset))