Ejemplo n.º 1
0
    def pretrain_begin_iteration(self):
        """Run the parent hook, then sample triangular data for this iteration.

        One triangular sample is collected per entry of ``self._pos[0]`` and the
        resulting list is appended to ``self._neg``.  When no positive sample is
        eligible, a list of ``None`` placeholders of the same length is appended
        instead.

        Raises:
            NotImplementedError: if eligible positives exist and caching is
                enabled (only the uncached sampler is implemented here).
        """
        super(Sampler, self).pretrain_begin_iteration()

        # Positives taken from the last time slice are excluded.
        candidates = [
            pos for pos in self._pos[0] if pos[0] + 1 < self._pos_range[1]
        ]

        if not candidates:
            print(
                "No possible triangular samples, given positive range {} to {}"
                .format(self._pos_range[0], self._pos_range[1]))
            # Placeholders keep the length equal to the positives so that the
            # assertion in datagen_pos_neg.batches() still passes.
            triagdata = [None] * len(self._pos[0])
        elif self.__enable_cache:
            raise NotImplementedError()
        else:
            mapper = utils.ParMap(self.__uncached_sampler_factory(),
                                  self.__sample_uncached_monitor,
                                  njobs=gconf.njobs)
            triagdata = []
            round_no = 0
            while len(triagdata) < len(self._pos[0]):
                missing = len(self._pos[0]) - len(triagdata)

                # progress output
                print("sample round {}, target #samples {}".format(
                    round_no, missing))
                round_no += 1

                # Over-request according to the trial/success ratio observed so
                # far (plus a 0.2 margin) so that one round is more likely to
                # be enough.
                trials_per_success = float(self.__all_trial) / self.__succ_trial
                request = int(missing * (trials_per_success + 0.2))
                if request < 100:
                    # Small remainder: ask for at least 100 inputs and switch
                    # the mapper to a single job.
                    request = 100
                    mapper.njobs = 1

                # Pick a random contiguous window of candidates to try.
                start = max(0, utils.crandint(len(candidates) - request))
                stop = min(start + request, len(candidates))
                fresh = mapper.run(candidates[start:stop])

                self.__all_trial += (stop - start)
                self.__succ_trial += len(fresh)
                triagdata.extend(fresh)

            # The last round may overshoot; keep exactly one sample per positive.
            triagdata = triagdata[:len(self._pos[0])]

        self._neg.append(triagdata)  # neg, triangdata_int, triangdata_float
Ejemplo n.º 2
0
    def __make_neg(self, posdata, negdup=1):
        """Build corrupted (negative) node pairs for every positive sample.

        For each ``(k, src, tgt)`` in ``posdata``, ``negdup`` pairs are produced.
        Each pair keeps one endpoint of the positive edge and replaces the other
        one; which endpoint is replaced is decided by ``utils.crandint(2)``.
        With the cache enabled the replacement is drawn from
        ``self._rep_cache(k)[kept_node]``; otherwise vertex indices of
        ``self.dataset.gtgraphs[k]`` are drawn until
        ``self.dataset.mygraphs[k].exists`` reports no edge for the pair.

        Args:
            posdata: sized iterable of ``(k, src, tgt)`` triples.
            negdup: number of negative pairs generated per positive sample.

        Returns:
            ``np.ndarray`` of shape ``(len(posdata), 2 * negdup)``; row ``i``
            holds the flattened ``(src, tgt)`` pairs generated for
            ``posdata[i]``.  Empty ``posdata`` yields an empty integer array of
            shape ``(0, 2 * negdup)``.
        """
        # np.array([]) has shape (0,), which could never satisfy the shape
        # check below, so the empty case is answered explicitly.
        if len(posdata) == 0:
            return np.empty((0, 2 * negdup), dtype=int)

        # TODO: this is an ugly fix, try to add indexing support in mygraph
        nodenames = list(self.dataset.gtgraphs['any'].vp['name'])

        def random_vertex(step):
            # Draw a vertex index of the graph at the given step.
            return utils.crandint(self.dataset.gtgraphs[step].num_vertices())

        negdata = []
        for k, src, tgt in posdata:
            row = []
            for _ in range(negdup):
                if utils.crandint(2) == 0:  # replace source
                    if self.__enable_cache:
                        curcache = self._rep_cache(k)[tgt]
                        new_src = curcache[utils.crandint(len(curcache))]
                    else:
                        # TODO: although it is almost impossible for a node to
                        # have all edges, check this in advance
                        new_src = random_vertex(k)
                        assert not self.dataset.gtgraphs[k].is_directed()
                        # Redraw until the corrupted pair is not an existing edge.
                        while self.dataset.mygraphs[k].exists(
                                nodenames[new_src], nodenames[tgt]):
                            new_src = random_vertex(k)
                    row.extend([new_src, tgt])
                else:  # replace target
                    if self.__enable_cache:
                        curcache = self._rep_cache(k)[src]
                        new_tgt = curcache[utils.crandint(len(curcache))]
                    else:
                        new_tgt = random_vertex(k)
                        # Redraw until the corrupted pair is not an existing edge.
                        while self.dataset.mygraphs[k].exists(
                                nodenames[src], nodenames[new_tgt]):
                            new_tgt = random_vertex(k)
                    row.extend([src, new_tgt])
            negdata.append(row)

        negdata = np.array(negdata)
        assert negdata.shape == (len(posdata), 2 * negdup), "{}, {}".format(
            negdata.shape, (len(posdata), 2 * negdup))

        return negdata
Ejemplo n.º 3
0
    def __sample_one_uncached(data, nodenames, name2idx, mygraphs, localstep):
        """Sample one triangular record around a positive edge, without a cache.

        One endpoint of the edge is chosen as the "key" node by
        ``utils.crandint(2)`` (0: target, otherwise: source).  A third node is
        then drawn from the key node's out-neighbours such that it differs from
        both endpoints and has no edge to the other endpoint in the current
        graph.

        NOTE(review): the signature has no ``self``; presumably a static-method
        decorator sits outside this block -- confirm.

        Args:
            data: ``(k, src, tgt)`` triple; every entry is converted with
                ``int`` to avoid passing numpy scalar types to C extensions.
            nodenames: sequence mapping a node index to the node name used by
                the graphs.
            name2idx: mapping from a node name back to its index.
            mygraphs: indexable collection of graphs; ``mygraphs[k - localstep]``
                is the current graph and ``mygraphs[k + 1 - localstep]`` the
                next one.
            localstep: offset subtracted from ``k`` when indexing ``mygraphs``.

        Returns:
            ``(record, trycnt)``.  ``record`` is ``None`` when no valid third
            node exists, otherwise the list ``[k, key, other, third,
            exists_next, edge_key_other, edge_key_third]`` where ``exists_next``
            is ``mynextg.exists(other, third)`` and the last two entries are the
            values of ``myg.edge`` for the two edges leaving the key node.
            ``trycnt`` is the number of random redraws performed.
        """
        # convert from np types to int, to avoid problems in c extensions
        k, src, tgt = [int(d) for d in data]
        localstep = int(localstep)
        myg = mygraphs[k - localstep]
        mynextg = mygraphs[k + 1 - localstep]
        max_tries = 5

        # Both variants are mirror images of each other; only the roles of the
        # two endpoints are swapped.
        if utils.crandint(2) == 0:  # target as key point
            key, other = tgt, src
        else:  # src as key point
            key, other = src, tgt

        nbr = myg.out_neighbours(nodenames[key])

        def rejected(node):
            # A usable third node is distinct from both endpoints, adjacent to
            # the key node and not adjacent to the other endpoint.
            return (node == key or node == other
                    or not myg.exists(nodenames[key], nodenames[node])
                    or myg.exists(nodenames[other], nodenames[node]))

        trycnt = 0
        third = name2idx[nbr[utils.crandint(len(nbr))]]
        while rejected(third):
            if trycnt >= max_tries:
                break
            third = name2idx[nbr[utils.crandint(len(nbr))]]
            trycnt += 1

        if trycnt >= max_tries:
            # Random probing used up its budget (this also triggers when the
            # last redraw happened to be valid): enumerate every admissible
            # neighbour and draw uniformly among them.
            cand = [name2idx[n] for n in nbr]
            cand = [
                n for n in cand if n != other and n != key
                and not myg.exists(nodenames[n], nodenames[other])
            ]
            if not cand:
                return None, trycnt
            third = cand[utils.crandint(len(cand))]

        ret = [
            k, key, other, third,
            mynextg.exists(nodenames[other], nodenames[third]),
            myg.edge(nodenames[key], nodenames[other]),
            myg.edge(nodenames[key], nodenames[third])
        ]

        # Sanity check: three distinct nodes and both edge values positive.
        # The second comparison previously re-tested ret[5] instead of ret[6].
        assert len(set(ret[1:4])) == 3 and ret[5] > 0 and ret[6] > 0, ret
        return ret, trycnt