def train(self, train, valid, dataset: str):
        path = "Models/TransE/save.pt"

        # prepare the data
        self.emap = IMap()
        self.rmap = IMap()
        self.h2t = defaultdict(list)
        for h, r, t in train:
            self.emap.put(h)
            self.emap.put(t)
            self.rmap.put(r)
            self.h2t[h].append(self.emap[t])
        for h, tails in self.h2t.items():
            self.h2t[h] = torch.tensor(tails)
        train_batch = data.DataLoader(TripleData(train, self.emap, self.rmap),
                                      batch_size=self.batch_size)
        valid_batch = data.DataLoader(TripleData(valid, self.emap, self.rmap),
                                      batch_size=self.batch_size)

        # prepare the model
        self.module = TransEModule(len(self.emap),
                                   len(self.rmap),
                                   dim=self.dim).to(self.device)
        self.optimizer = optim.Adam(self.module.parameters(), lr=self.lr)

        # train it
        epoch = 1
        best_val = float("+inf")
        patience = 5
        p = patience
        print(f"Early stopping with patience {patience}")
        while p > 0:
            print(f"Epoch {epoch}")

            # training
            self.module.train()
            train_it = tqdm(train_batch, desc="\tTraining", file=sys.stdout)
            self.epoch(train_it)

            # validation
            self.module.eval()
            valid_it = tqdm(valid_batch, desc="\tValidating", file=sys.stdout)
            with torch.no_grad():
                v_loss, v_pd, v_nd = self.epoch(valid_it, learn=False)
            if v_loss < best_val:
                torch.save(self.module, path)
                best_val = v_loss
                p = patience
            else:
                p -= 1
            epoch += 1
            print()
        print(
            f"Loading best val loss = {best_val:.2f} at epoch {epoch-patience-1}"
        )
        self.module = torch.load(path)  # reload the checkpoint with the best validation loss
        self.inspect_embeddings()
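All of these examples lean on IMap, whose definition is not shown on this page. The sketch below is a guess at its interface judging from usage (put interns a key, indexing looks up its id, len gives the vocabulary size, and the constructor accepts an iterable of initial keys, as in Example #7); the actual implementation may differ.

class IMap:
    # hypothetical stand-in: interns keys and assigns them consecutive integer ids
    def __init__(self, keys=()):
        self.ids = {}
        for key in keys:
            self.put(key)

    def put(self, key):
        # assign the next free id the first time a key is seen; ignore duplicates
        if key not in self.ids:
            self.ids[key] = len(self.ids)

    def __getitem__(self, key):
        return self.ids[key]

    def __len__(self):
        return len(self.ids)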
Example #2
 def __init__(self):
     # a mapping between english names and entity number
     self.emap = IMap()
     # a mapping between english names and relation number
     self.rmap = IMap()
     # all known r -> t
     self.r_t = defaultdict(set)
     # the Knowledge Graph
     self.kg = defaultdict(list)
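A hedged sketch of how these structures are presumably populated from (h, r, t) triples elsewhere in the class; the method name fit and the value layout of kg are assumptions, not code from this repository.

def fit(self, triples):
    for h, r, t in triples:
        self.emap.put(h)
        self.emap.put(t)
        self.rmap.put(r)
        # every tail ever observed for relation r
        self.r_t[r].add(self.emap[t])
        # adjacency-list view of the graph: head id -> (relation, tail id) pairs
        self.kg[self.emap[h]].append((r, self.emap[t]))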
Example #3
def plot_test_distances(dataset: str):
    limit = 10
    train, valid, test = load_dataset(dataset)
    # map entities to an id
    emap = IMap()
    for h, _, t in train:
        emap.put(h)
        emap.put(t)
    # build the kg
    kg = lil_matrix((len(emap), len(emap)), dtype=np.uint16)
    for h, _, t in train:
        kg[emap[h], emap[t]] = 1
    kg = kg.tocsr()
    test.sort(key=lambda hrt: hrt[0])
    distances = []
    _h = None
    shortest = None
    for h, _, t in tqdm(test, desc="Distances"):
        if _h != h:
            shortest = dijkstra(kg,
                                limit=limit,
                                indices=emap[h],
                                return_predecessors=False)
            _h = h
        distances.append(shortest[emap[t]])
    distances = np.array(distances)
    distances[distances > limit] = limit + 1
    plt.hist(distances, bins=range(0, limit + 2))
    plt.axvline(distances.mean(), color="red", linestyle="dashed")
    plt.axvline(np.median(distances), color="black")
    plt.title(f"Distances of test triples in training graph in {dataset}")
    plt.xlabel("distance")
    plt.ylabel("# of test triples")
    plt.show()
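One detail worth flagging: scipy's dijkstra returns np.inf for nodes beyond limit (or unreachable), so the clipping line folds all of those into a single overflow bin at limit + 1. A minimal check of that behaviour:

import numpy as np

d = np.array([1.0, 3.0, np.inf])
limit = 10
d[d > limit] = limit + 1   # inf > 10, so it becomes 11
print(d)                   # [ 1.  3. 11.]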
Example #4
def targets(data, dataset: str, min_dist=2, max_dist=3):
	try:
		with open(f"Structures/bad_ex_{dataset}.json", "r") as f:
			return json.load(f)
	except FileNotFoundError:
		emap = IMap()
		r_t = defaultdict(set)
		h_r = defaultdict(set)
		for h, r, t in data:
			emap.put(h)
			emap.put(t)
			r_t[r].add(emap[t])
			h_r[emap[h]].add(r)
		g = lil_matrix((len(emap), len(emap)))
		for h, r, t in data:
			g[emap[h], emap[t]] = 1
		g = g.tocsr()
		ts = []
		for i in trange(len(emap), desc="Bad examples", ncols=140):
			rel_inds = set()
			for r in h_r[i]:
				rel_inds |= r_t[r]
			dists = dijkstra(
				g, directed=False, unweighted=True, indices=i,
				return_predecessors=False, limit=max_dist
			)
			# indices whose shortest-path distance falls inside [min_dist, max_dist]
			dists_inds = set(
				np.asarray((min_dist <= dists) & (dists <= max_dist)).nonzero()[0].tolist()
			)
			# symmetric difference: keep indices that are in exactly one of the two sets
			ts.append(list(dists_inds ^ rel_inds))
		with open(f"Structures/bad_ex_{dataset}.json", "w") as f:
			json.dump(ts, f)
		return ts
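A usage sketch (the dataset name below is a placeholder): the function returns one candidate list per entity id, built from nodes whose undirected graph distance lies in [min_dist, max_dist], combined by symmetric difference with the tails reachable through that entity's own relations, and caches the result as JSON.

bad = targets(train, "some_dataset")  # "some_dataset" is a placeholder name
print(len(bad))      # one list per entity id, in emap order
print(bad[0][:10])   # first few candidate tails for entity 0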
Example #5
def sparse_graph(data):
	emap = IMap()
	for h, r, t in data:
		emap.put(h)
		emap.put(t)
	g = lil_matrix((len(emap), len(emap)))
	for h, r, t in data:
		g[emap[h], emap[t]] = 1
	return g.tocsr()
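A hedged usage sketch pairing sparse_graph with scipy's dijkstra, mirroring Example #3. Note that sparse_graph rebuilds emap internally and discards it, so a caller who needs to translate entity names into row indices has to reconstruct the same mapping.

from scipy.sparse.csgraph import dijkstra

g = sparse_graph(train)  # assumes the train triples are already loaded
dists = dijkstra(g, indices=0, limit=5, return_predecessors=False)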
Example #6
 def load(self, train, valid, dataset: str):
     # prepare the data
     self.emap = IMap()
     self.rmap = IMap()
     self.h2t = defaultdict(list)
     for h, r, t in train:
         self.emap.put(h)
         self.emap.put(t)
         self.rmap.put(r)
         self.h2t[h].append(self.emap[t])
     for h, tails in self.h2t.items():
         self.h2t[h] = torch.tensor(tails)
     self.module = torch.load(self.path)
     self.optimizer = optim.Adam(self.module.parameters(), lr=self.lr)
     valid_batch = data.DataLoader(TripleData(valid, self.emap, self.rmap),
                                   batch_size=self.batch_size)
     valid_it = tqdm(valid_batch,
                     ncols=140,
                     desc="\tValidating",
                     file=sys.stdout)
     with torch.no_grad():
         self.epoch(valid_it, learn=False)
     self.inspect_embeddings()
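Note that torch.load on a whole pickled module only works if the defining class (TransEModule here) is importable under the same name at load time. A state_dict round trip is the more robust pattern; a minimal sketch of that alternative, not what this repository does:

torch.save(self.module.state_dict(), self.path)
module = TransEModule(len(self.emap), len(self.rmap), dim=self.dim)
module.load_state_dict(torch.load(self.path))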
Example #7
 def __init__(self, depth=3, embed_size=64, lr=0.001):
     self.path = "Models/APRA/save.pt"
     if torch.cuda.is_available():
         print("Using the GPU")
         self.device = torch.device("cuda")
     else:
         print("Using the CPU")
         self.device = torch.device("cpu")
     self.depth = max(2, depth)
     self.embed_size = embed_size
     self.lr = lr
     self.batch_size = 2
     self.neg_per_pos = 5
     self.view_every = 0
     self.cmap = IMap("abcdefghijklmnopqrstuvwxyz-/'. ")
     self.loss = nn.CrossEntropyLoss(
         torch.tensor([1, self.neg_per_pos],
                      dtype=torch.float,
                      device=self.device))
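The weight tensor handed to nn.CrossEntropyLoss offsets the class imbalance implied by neg_per_pos: with five negatives per positive and weights [1, 5], a single positive contributes as much to the loss as its five negatives, assuming class 1 is the positive class (an assumption; the labels are not shown on this page).

import torch
import torch.nn as nn

# one positive (class 1) against five negatives (class 0)
logits = torch.randn(6, 2)
labels = torch.tensor([1, 0, 0, 0, 0, 0])
loss_fn = nn.CrossEntropyLoss(torch.tensor([1.0, 5.0]))
loss = loss_fn(logits, labels)  # the lone positive is weighted 5x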