def train(self, train, valid, dataset: str):
    path = "Models/TransE/save.pt"

    # prepare the data
    self.emap = IMap()
    self.rmap = IMap()
    self.h2t = defaultdict(list)
    for h, r, t in train:
        self.emap.put(h)
        self.emap.put(t)
        self.rmap.put(r)
        self.h2t[h].append(self.emap[t])
    for h, tails in self.h2t.items():
        self.h2t[h] = torch.tensor(tails)
    train_batch = data.DataLoader(TripleData(train, self.emap, self.rmap), batch_size=self.batch_size)
    valid_batch = data.DataLoader(TripleData(valid, self.emap, self.rmap), batch_size=self.batch_size)

    # prepare the model
    self.module = TransEModule(len(self.emap), len(self.rmap), dim=self.dim).to(self.device)
    self.optimizer = optim.Adam(self.module.parameters(), lr=self.lr)

    # train it, early-stopping on the validation loss
    epoch = 1
    best_val = float("inf")
    patience = 5
    p = patience
    print(f"Early stopping with patience {patience}")
    while p > 0:
        print(f"Epoch {epoch}")
        # training
        self.module.train()
        train_it = tqdm(train_batch, desc="\tTraining", file=sys.stdout)
        self.epoch(train_it)
        # validation
        self.module.eval()
        valid_it = tqdm(valid_batch, desc="\tValidating", file=sys.stdout)
        with torch.no_grad():
            v_loss, v_pd, v_nd = self.epoch(valid_it, learn=False)
        if v_loss < best_val:
            # checkpoint the best model seen so far
            torch.save(self.module, path)
            best_val = v_loss
            p = patience
        else:
            p -= 1
        epoch += 1
    print()
    print(f"Loading best val loss = {best_val:.2f} at epoch {epoch - patience - 1}")
    # restore the best checkpoint found during training
    self.module = torch.load(path)
    self.inspect_embeddings()
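# NOTE: TripleData is defined elsewhere in the repo; the sketch below is a
# guess at its minimal shape, inferred only from how train() uses it above
# (the real class may also handle negative sampling or device placement).
# The name and body are illustrative, not the actual implementation.
from torch.utils import data

class TripleDataSketch(data.Dataset):
    """Turns (head, relation, tail) name triples into id triples."""

    def __init__(self, triples, emap, rmap):
        self.triples = triples
        self.emap = emap
        self.rmap = rmap

    def __len__(self):
        return len(self.triples)

    def __getitem__(self, i):
        h, r, t = self.triples[i]
        # look the names up in the entity/relation maps built during train()
        return self.emap[h], self.rmap[r], self.emap[t]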
def __init__(self):
    # a mapping between English names and entity ids
    self.emap = IMap()
    # a mapping between English names and relation ids
    self.rmap = IMap()
    # all known r -> t pairs
    self.r_t = defaultdict(set)
    # the Knowledge Graph
    self.kg = defaultdict(list)
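# NOTE: IMap is not defined in this file. This is a minimal sketch of what it
# is assumed to do, inferred from usage (put(), __getitem__, __len__, and an
# optional seed iterable as in IMap("abc...")); the real implementation may
# differ, e.g. by also storing the reverse id -> name mapping.
class IMapSketch:
    """Insertion-ordered mapping from names to contiguous integer ids."""

    def __init__(self, items=()):
        self.ids = {}
        for x in items:
            self.put(x)

    def put(self, x):
        # assign the next free id the first time a name is seen
        if x not in self.ids:
            self.ids[x] = len(self.ids)

    def __getitem__(self, x):
        return self.ids[x]

    def __len__(self):
        return len(self.ids)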
def plot_test_distances(dataset: str):
    limit = 10
    train, valid, test = load_dataset(dataset)

    # map entities to an id
    emap = IMap()
    for h, _, t in train:
        emap.put(h)
        emap.put(t)

    # build the kg as a sparse adjacency matrix
    kg = lil_matrix((len(emap), len(emap)), dtype=np.uint16)
    for h, _, t in train:
        kg[emap[h], emap[t]] = 1
    kg = kg.tocsr()

    # sort by head so shortest paths are computed only once per head
    test.sort(key=lambda hrt: hrt[0])
    distances = []
    _h = None
    shortest = None
    for h, _, t in tqdm(test, desc="Distances"):
        if _h != h:
            shortest = dijkstra(kg, limit=limit, indices=emap[h], return_predecessors=False)
            _h = h
        distances.append(shortest[emap[t]])

    # clip unreachable entities (distance inf) into the last bin
    distances = np.array(distances)
    distances[distances > limit] = limit + 1
    plt.hist(distances, bins=range(0, limit + 2))
    plt.axvline(distances.mean(), color="red", linestyle="dashed")
    plt.axvline(np.median(distances), color="black")
    plt.title(f"Distances of test triples in training graph in {dataset}")
    plt.xlabel("distance")
    plt.ylabel("# of nodes")
    plt.show()
def targets(data, dataset: str, min_dist=2, max_dist=3):
    # reuse the cached bad examples if a previous run saved them
    try:
        with open(f"Structures/bad_ex_{dataset}.json", "r") as f:
            return json.load(f)
    except FileNotFoundError:
        pass

    emap = IMap()
    r_t = defaultdict(set)
    h_r = defaultdict(set)
    for h, r, t in data:
        emap.put(h)
        emap.put(t)
        r_t[r].add(emap[t])
        h_r[emap[h]].add(r)

    # sparse adjacency matrix of the graph
    g = lil_matrix((len(emap), len(emap)))
    for h, r, t in data:
        g[emap[h], emap[t]] = 1
    g = g.tocsr()

    ts = []
    for i in trange(len(emap), desc="Bad examples", ncols=140):
        # every known tail of the relations entity i appears in as a head
        rel_inds = set()
        for r in h_r[i]:
            rel_inds |= r_t[r]
        # entities within [min_dist, max_dist] hops of i
        dists = dijkstra(g, directed=False, unweighted=True, indices=i,
                         return_predecessors=False, limit=max_dist)
        dists_inds = set(np.asarray((min_dist <= dists) & (dists <= max_dist)).nonzero()[0].tolist())
        # keep entities that are either nearby or plausible tails, but not both
        ts.append(list(dists_inds ^ rel_inds))

    # cache the result for later runs
    with open(f"Structures/bad_ex_{dataset}.json", "w") as f:
        json.dump(ts, f)
    return ts
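# Hypothetical usage of targets(); the dataset name is illustrative, and
# load_dataset is the same loader used by plot_test_distances() above.
train, valid, test = load_dataset("FB15k-237")
bad = targets(train, "FB15k-237")
# bad[i] holds the candidate entity ids for corrupting triples whose head is
# entity i, computed once and cached under Structures/.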
def sparse_graph(data):
    emap = IMap()
    for h, r, t in data:
        emap.put(h)
        emap.put(t)
    g = lil_matrix((len(emap), len(emap)))
    for h, r, t in data:
        g[emap[h], emap[t]] = 1
    return g.tocsr()
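# Example of querying the CSR graph built by sparse_graph(); dijkstra and
# numpy are the same imports used elsewhere in this file, and `train` is
# assumed to hold (h, r, t) triples.
from scipy.sparse.csgraph import dijkstra
import numpy as np

g = sparse_graph(train)
dists = dijkstra(g, indices=0, limit=3, return_predecessors=False)
close = np.nonzero(dists <= 3)[0]  # entity ids within 3 hops of entity 0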
def load(self, train, valid, dataset: str):
    # prepare the data
    self.emap = IMap()
    self.rmap = IMap()
    self.h2t = defaultdict(list)
    for h, r, t in train:
        self.emap.put(h)
        self.emap.put(t)
        self.rmap.put(r)
        self.h2t[h].append(self.emap[t])
    for h, tails in self.h2t.items():
        self.h2t[h] = torch.tensor(tails)

    # restore the saved module and rebuild its optimizer
    self.module = torch.load(self.path)
    self.optimizer = optim.Adam(self.module.parameters(), lr=self.lr)

    # sanity-check the restored model on the validation set
    self.module.eval()
    valid_batch = data.DataLoader(TripleData(valid, self.emap, self.rmap), batch_size=self.batch_size)
    valid_it = tqdm(valid_batch, ncols=140, desc="\tValidating", file=sys.stdout)
    with torch.no_grad():
        self.epoch(valid_it, learn=False)
    self.inspect_embeddings()
def __init__(self, depth=3, embed_size=64, lr=0.001):
    self.path = "Models/APRA/save.pt"
    if torch.cuda.is_available():
        print("Using the GPU")
        self.device = torch.device("cuda")
    else:
        print("Using the CPU")
        self.device = torch.device("cpu")
    self.depth = max(2, depth)
    self.embed_size = embed_size
    self.lr = lr
    self.batch_size = 2
    self.neg_per_pos = 5
    self.view_every = 0
    self.cmap = IMap("abcdefghijklmnopqrstuvwxyz-/'. ")
    self.loss = nn.CrossEntropyLoss(
        torch.tensor([1, self.neg_per_pos], dtype=torch.float, device=self.device))