def _load(model_name_or_path: str, load_weights: bool = False):
    # Resolve a pretrained-model name to a local directory, downloading and
    # extracting the archive into saved/ on first use.
    if not os.path.exists(model_name_or_path):
        if model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
            if not os.path.exists(f"saved/{model_name_or_path}"):
                archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[model_name_or_path]
                download_url(archive_file, "saved/", f"{model_name_or_path}.zip")
                untar("saved/", f"{model_name_or_path}.zip")
            model_name_or_path = f"saved/{model_name_or_path}"
        else:
            raise KeyError("Cannot find the pretrained model {}".format(model_name_or_path))
    # Older checkpoints ship without a version file; treat them as v1.
    try:
        with open(os.path.join(model_name_or_path, "version")) as f:
            version = f.readline().strip()
    except Exception:
        version = None
    with open(os.path.join(model_name_or_path, "bert_config.json")) as f:
        bert_config = BertConfig.from_dict(json.load(f))
    tokenizer = BertTokenizer.from_pretrained(model_name_or_path)
    if version == "2":
        bert_model = OAGMetaInfoBertModel(bert_config, tokenizer)
    else:
        bert_model = OAGBertPretrainingModel(bert_config)
    model_weight_path = os.path.join(model_name_or_path, "pytorch_model.bin")
    if load_weights and os.path.exists(model_weight_path):
        bert_model.load_state_dict(torch.load(model_weight_path))
    return bert_config, tokenizer, bert_model
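# Usage sketch for _load above. The model name is an assumption for
# illustration; any key of PRETRAINED_MODEL_ARCHIVE_MAP works:
#
#     bert_config, tokenizer, bert_model = _load("oagbert-v2", load_weights=True)
#     bert_model.eval()  # weights came from pytorch_model.bin; inference only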
def load_data(self):
    rpath = "data/supervised_classification/" + self.dataset
    zip_name = self.dataset + ".zip"
    # Fetch and extract the dataset archive on first use.
    if not os.path.isdir(rpath):
        download_url(dataset_url_dict[self.dataset], rpath, name=zip_name)
        untar(rpath, zip_name)
    dest_dir = rpath

    def _load(name):
        # Each split is a .jsonl file: one JSON-encoded example per line.
        data = []
        with open("%s/%s.jsonl" % (dest_dir, name)) as f:
            for line in f:
                data.append(json.loads(line.strip()))
        return data

    train_data, dev_data, test_data = _load("train"), _load("dev"), _load("test")
    return train_data, dev_data, test_data
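# Expected on-disk layout after load_data (inferred from the paths above):
#
#     data/supervised_classification/<dataset>/train.jsonl
#     data/supervised_classification/<dataset>/dev.jsonl
#     data/supervised_classification/<dataset>/test.jsonl
#
# Each .jsonl line holds one JSON-encoded example.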
def train(self, data):
    # Fetch the pretrained GCC checkpoint if it is not available locally.
    if not os.path.isfile(self.load_path):
        print("=> no checkpoint found at '{}'".format(self.load_path))
        url = "https://github.com/cenyk1230/gcc-data/raw/master/saved/gcc_pretrained.pth"
        path = "/".join(self.load_path.split("/")[:-1])
        name = self.load_path.split("/")[-1]
        download_url(url, path, name=name)
    print("=> loading checkpoint '{}'".format(self.load_path))
    checkpoint = torch.load(self.load_path, map_location="cpu")
    print("=> loaded successfully '{}' (epoch {})".format(self.load_path, checkpoint["epoch"]))
    # Reuse the hyperparameters stored alongside the checkpoint.
    args = checkpoint["opt"]
    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # A list of graphs means graph classification; a single graph means node classification.
    if isinstance(data, list):
        train_dataset = GraphClassificationDataset(
            data=data,
            rw_hops=args.rw_hops,
            subgraph_size=args.subgraph_size,
            restart_prob=args.restart_prob,
            positional_embedding_size=args.positional_embedding_size,
        )
    else:
        train_dataset = NodeClassificationDataset(
            data=data,
            rw_hops=args.rw_hops,
            subgraph_size=args.subgraph_size,
            restart_prob=args.restart_prob,
            positional_embedding_size=args.positional_embedding_size,
        )
    # Embed the whole dataset in a single batch, preserving input order.
    args.batch_size = len(train_dataset)
    train_loader = torch.utils.data.DataLoader(
        dataset=train_dataset,
        batch_size=args.batch_size,
        collate_fn=batcher(),
        shuffle=False,
        num_workers=args.num_workers,
    )
    # Create the GCC encoder and load the pretrained weights.
    model = GraphEncoder(
        positional_embedding_size=args.positional_embedding_size,
        max_node_freq=args.max_node_freq,
        max_edge_freq=args.max_edge_freq,
        max_degree=args.max_degree,
        freq_embedding_size=args.freq_embedding_size,
        degree_embedding_size=args.degree_embedding_size,
        output_dim=args.hidden_size,
        node_hidden_dim=args.hidden_size,
        edge_hidden_dim=args.hidden_size,
        num_layers=args.num_layer,
        num_step_set2set=args.set2set_iter,
        num_layer_set2set=args.set2set_lstm_layer,
        gnn_model=args.model,
        norm=args.norm,
        degree_input=True,
    )
    model = model.to(args.device)
    model.load_state_dict(checkpoint["model"])
    del checkpoint
    # Inference only: test_moco produces embeddings from the frozen encoder.
    emb = test_moco(train_loader, model, args)
    return emb.numpy()
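# Usage sketch for train() above (hypothetical wiring; `wrapper` stands in for
# whatever object defines this method and `graphs` for a list of CogDL graphs):
#
#     emb = wrapper.train(graphs)
#     # emb is a NumPy array with one row per subgraph instance; since
#     # batch_size == len(train_dataset), a single forward pass covers them all.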
def download(self):
    for name in self.raw_file_names:
        download_url(self.url.format(self.name, name), self.raw_dir, name=name)
def download(self): for name in self.raw_file_names: download_url("{}{}".format(self.url, name), self.raw_dir)
def download(self):
    for name in self.raw_file_names:
        download_url(self.url.format(name), self.raw_dir, name=name)
        time.sleep(0.5)  # brief pause between requests to avoid hammering the server
def download(self): download_url(self.url, self.raw_dir, name="processed.zip")
def download(self): for name in self.raw_file_names: download_url("{}/{}/{}".format(self.url, self.name.lower(), name), self.raw_dir)
def download(self): fname = "{}.zip".format(self.name.lower()) download_url("{}{}.zip&dl=1".format(self.url, self.name.lower()), self.raw_dir, fname) untar(self.raw_dir, fname)
def download(self): download_url(self.url, self.raw_dir, name=self.name + ".zip") untar(self.raw_dir, self.name + ".zip")
def download(self):
    # Skip files that are already present locally.
    for name in self.raw_file_names:
        if not os.path.exists(os.path.join(self.raw_dir, name)):
            download_url("{}/{}".format(self.url, name), self.raw_dir)
def download(self): fname = "{}.tgz".format(self.name.lower()) download_url("{}{}.tgz&dl=1".format(base_url, self.name), self.raw_dir, fname) untar(self.raw_dir, fname)
def download(self):
    filename = self.name + ".zip"
    download_url(self.url, self.processed_dir, name=filename)
    untar(self.processed_dir, filename)
    print(f"downloaded to {self.processed_dir}")
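# Every download() above leans on two shared helpers. The sketch below shows
# plausible standard-library implementations matching how they are called
# (download_url(url, folder, name=...) and untar(folder, fname)); it is an
# illustration of the assumed contract, not the library's actual code.
import os
import shutil
import urllib.request


def download_url(url, folder, name=None):
    # Save `url` into `folder`, deriving the file name from the URL unless
    # an explicit `name` is given.
    if name is None:
        name = url.rstrip("/").split("/")[-1]
    os.makedirs(folder, exist_ok=True)
    path = os.path.join(folder, name)
    with urllib.request.urlopen(url) as resp, open(path, "wb") as f:
        shutil.copyfileobj(resp, f)
    return path


def untar(folder, fname):
    # shutil.unpack_archive picks the format from the extension
    # (.zip, .tgz, .tar.gz, ...), so one helper covers both archive types used above.
    shutil.unpack_archive(os.path.join(folder, fname), folder)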