# Example #1
# 0
    def _load(model_name_or_path: str, load_weights: bool = False):
        """Resolve a pretrained OAG-BERT checkpoint and build its components.

        If ``model_name_or_path`` is not an existing local directory, it is
        looked up in ``PRETRAINED_MODEL_ARCHIVE_MAP``, downloaded into
        ``saved/`` and unpacked there.

        Args:
            model_name_or_path: Local checkpoint directory or the key of a
                known pretrained model.
            load_weights: When True and ``pytorch_model.bin`` exists in the
                checkpoint directory, load the weights into the model.

        Returns:
            Tuple of ``(bert_config, tokenizer, bert_model)``.

        Raises:
            KeyError: If the name is neither a local path nor a known
                pretrained-model key.
        """
        if not os.path.exists(model_name_or_path):
            # Early raise keeps the download path un-nested.
            if model_name_or_path not in PRETRAINED_MODEL_ARCHIVE_MAP:
                raise KeyError("Cannot find the pretrained model {}".format(
                    model_name_or_path))
            if not os.path.exists(f"saved/{model_name_or_path}"):
                archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[model_name_or_path]
                download_url(archive_file, "saved/",
                             f"{model_name_or_path}.zip")
                untar("saved/", f"{model_name_or_path}.zip")
            model_name_or_path = f"saved/{model_name_or_path}"

        try:
            # `with` closes the handle; the original leaked a bare open().
            with open(os.path.join(model_name_or_path, "version")) as fh:
                version = fh.readline().strip()
        except Exception:
            # Presumably older checkpoints ship without a version file.
            version = None

        with open(os.path.join(model_name_or_path, "bert_config.json")) as fh:
            bert_config = BertConfig.from_dict(json.load(fh))
        tokenizer = BertTokenizer.from_pretrained(model_name_or_path)
        # Version "2" checkpoints use the meta-information model variant.
        if version == "2":
            bert_model = OAGMetaInfoBertModel(bert_config, tokenizer)
        else:
            bert_model = OAGBertPretrainingModel(bert_config)

        model_weight_path = os.path.join(model_name_or_path,
                                         "pytorch_model.bin")
        if load_weights and os.path.exists(model_weight_path):
            bert_model.load_state_dict(torch.load(model_weight_path))

        return bert_config, tokenizer, bert_model
    def load_data(self):
        """Load the train/dev/test splits for ``self.dataset``.

        Downloads and unpacks the dataset archive on first use, then reads
        ``train.jsonl``, ``dev.jsonl`` and ``test.jsonl`` from the dataset
        directory (one JSON object per line).

        Returns:
            Tuple ``(train_data, dev_data, test_data)``, each a list of dicts.
        """
        rpath = "data/supervised_classification/" + self.dataset
        zip_name = self.dataset + ".zip"
        if not os.path.isdir(rpath):
            download_url(dataset_url_dict[self.dataset], rpath, name=zip_name)
            untar(rpath, zip_name)

        dest_dir = rpath

        def _load(name):
            # One JSON object per line; `with` closes the handle the original
            # version leaked.
            with open("%s/%s.jsonl" % (dest_dir, name)) as fh:
                return [json.loads(line.strip()) for line in fh]

        return _load("train"), _load("dev"), _load("test")
# Example #3
# 0
    def train(self, data):
        """Embed *data* with a pretrained GCC graph encoder.

        Downloads the published GCC checkpoint to ``self.load_path`` if it is
        missing, wraps *data* in the matching dataset class, and runs the
        restored (frozen) encoder over a single full-size batch.

        Returns:
            ``numpy.ndarray`` with one embedding per graph/node.
        """
        # Fetch the released checkpoint on first use.
        if not os.path.isfile(self.load_path):
            print("=> no checkpoint found at '{}'".format(self.load_path))
            url = "https://github.com/cenyk1230/gcc-data/raw/master/saved/gcc_pretrained.pth"
            directory = "/".join(self.load_path.split("/")[:-1])
            filename = self.load_path.split("/")[-1]
            download_url(url, directory, name=filename)

        print("=> loading checkpoint '{}'".format(self.load_path))
        ckpt = torch.load(self.load_path, map_location="cpu")
        print("=> loaded successfully '{}' (epoch {})".format(self.load_path, ckpt["epoch"]))
        args = ckpt["opt"]

        args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # A list signals graph-level data; anything else is node-level.
        dataset_cls = GraphClassificationDataset if isinstance(data, list) else NodeClassificationDataset
        train_dataset = dataset_cls(
            data=data,
            rw_hops=args.rw_hops,
            subgraph_size=args.subgraph_size,
            restart_prob=args.restart_prob,
            positional_embedding_size=args.positional_embedding_size,
        )
        # One batch that spans the whole dataset.
        args.batch_size = len(train_dataset)
        train_loader = torch.utils.data.DataLoader(
            dataset=train_dataset,
            batch_size=args.batch_size,
            collate_fn=batcher(),
            shuffle=False,
            num_workers=args.num_workers,
        )

        # Rebuild the encoder with the hyper-parameters saved in the checkpoint.
        model = GraphEncoder(
            positional_embedding_size=args.positional_embedding_size,
            max_node_freq=args.max_node_freq,
            max_edge_freq=args.max_edge_freq,
            max_degree=args.max_degree,
            freq_embedding_size=args.freq_embedding_size,
            degree_embedding_size=args.degree_embedding_size,
            output_dim=args.hidden_size,
            node_hidden_dim=args.hidden_size,
            edge_hidden_dim=args.hidden_size,
            num_layers=args.num_layer,
            num_step_set2set=args.set2set_iter,
            num_layer_set2set=args.set2set_lstm_layer,
            gnn_model=args.model,
            norm=args.norm,
            degree_input=True,
        ).to(args.device)

        model.load_state_dict(ckpt["model"])

        # Drop the CPU copy of the weights before inference.
        del ckpt

        emb = test_moco(train_loader, model, args)

        return emb.numpy()
# Example #4
# 0
 def download(self):
     """Fetch every raw file into ``self.raw_dir``."""
     for fname in self.raw_file_names:
         url = self.url.format(self.name, fname)
         download_url(url, self.raw_dir, name=fname)
# Example #5
# 0
 def download(self):
     """Download each raw file from the base URL into ``self.raw_dir``."""
     for fname in self.raw_file_names:
         full_url = "{}{}".format(self.url, fname)
         download_url(full_url, self.raw_dir)
# Example #6
# 0
 def download(self):
     """Fetch each raw file, pausing briefly between requests."""
     for fname in self.raw_file_names:
         download_url(self.url.format(fname), self.raw_dir, name=fname)
         # Throttle so successive requests don't hammer the server.
         time.sleep(0.5)
# Example #7
# 0
 def download(self):
     """Download the dataset archive into ``self.raw_dir`` as ``processed.zip``."""
     archive = "processed.zip"
     download_url(self.url, self.raw_dir, name=archive)
# Example #8
# 0
 def download(self):
     """Fetch each raw file from ``<url>/<lower-cased name>/<file>``."""
     # Loop-invariant: lower-case the dataset name once.
     subdir = self.name.lower()
     for fname in self.raw_file_names:
         download_url("{}/{}/{}".format(self.url, subdir, fname),
                      self.raw_dir)
# Example #9
# 0
 def download(self):
     """Download the dataset archive into ``self.raw_dir`` and unpack it."""
     fname = "{}.zip".format(self.name.lower())
     # Pass the target filename as keyword `name=` — consistent with every
     # other download_url call site in this file.
     download_url("{}{}.zip&dl=1".format(self.url, self.name.lower()),
                  self.raw_dir, name=fname)
     untar(self.raw_dir, fname)
# Example #10
# 0
 def download(self):
     """Fetch ``<name>.zip`` into ``self.raw_dir`` and extract it."""
     archive = self.name + ".zip"
     download_url(self.url, self.raw_dir, name=archive)
     untar(self.raw_dir, archive)
# Example #11
# 0
 def download(self):
     """Download only the raw files that are not already on disk."""
     for fname in self.raw_file_names:
         if os.path.exists(os.path.join(self.raw_dir, fname)):
             continue  # already fetched on a previous run
         download_url("{}/{}".format(self.url, fname), self.raw_dir)
# Example #12
# 0
 def download(self):
     """Download the ``.tgz`` dataset archive into ``self.raw_dir`` and unpack it."""
     # NOTE(review): the URL interpolates self.name while the local filename
     # is lower-cased — looks intentional but worth confirming upstream.
     fname = "{}.tgz".format(self.name.lower())
     archive_url = "{}{}.tgz&dl=1".format(base_url, self.name)
     download_url(archive_url, self.raw_dir, fname)
     untar(self.raw_dir, fname)
# Example #13
# 0
 def download(self):
     """Fetch ``<name>.zip`` into ``self.processed_dir`` and extract it there."""
     archive = self.name + '.zip'
     download_url(self.url, self.processed_dir, name=archive)
     untar(self.processed_dir, archive)
     print(f'downloaded to {self.processed_dir}')