Example #1
0
    def next_batch(self, all_dps, name):
        """Load the next batch of datapoint JSON files and batch their trees.

        Parameters
        ----------
        all_dps : list of str
            Paths to JSON datapoint files; each file holds "C_tree",
            "L_a_tree", "L_b_tree" and "label" entries.
        name : str
            Stored under the "name" key of the returned dataset.

        Returns
        -------
        (dict, bool)
            The batched dataset and a flag that is True when this batch
            consumed the tail of ``all_dps`` (the pointer is then reset to 0).
        """
        last_batch = False
        dataset = {}
        filenames = []
        C_trees = []
        L_a_trees = []
        L_b_trees = []
        labels = []
        # Take at most batch_size datapoints starting at the read pointer.
        for dp in all_dps[self.data_pointer: min(self.data_pointer + self.batch_size, len(all_dps))]:
            with open(dp, "r") as f:
                data = json.load(f)
                filenames.append(dp)
                C_trees.append(DPu.convert_tree_to_tensors(data["C_tree"]))
                L_a_trees.append(DPu.convert_tree_to_tensors(data["L_a_tree"]))
                L_b_trees.append(DPu.convert_tree_to_tensors(data["L_b_tree"]))
                labels.append(data["label"])

        dataset["name"] = name
        dataset["size"] = len(C_trees)
        dataset["C_batch"] = batch_tree_input(C_trees)
        dataset["L_a_batch"] = batch_tree_input(L_a_trees)
        dataset["L_b_batch"] = batch_tree_input(L_b_trees)
        dataset["label_batch"] = torch.tensor(labels)
        dataset["filenames"] = filenames
        self.data_pointer += self.batch_size
        # BUG FIX: '>=' instead of '>'.  With '>', when len(all_dps) is an
        # exact multiple of batch_size the pointer lands exactly on
        # len(all_dps), last_batch stays False, and the NEXT call slices an
        # empty range and feeds empty lists to batch_tree_input.
        if self.data_pointer >= len(all_dps):
            last_batch = True
            self.data_pointer = 0
        return dataset, last_batch
Example #2
0
    def _dataset_from_dps(self, all_dps, name):
        """Read every datapoint file in ``all_dps`` and batch the trees.

        Unlike next_batch, this consumes the whole list at once (no pointer,
        no last-batch flag) and does not record filenames.
        """
        C_trees, L_a_trees, L_b_trees, labels = [], [], [], []
        for path in all_dps:
            with open(path, "r") as fh:
                dp = json.load(fh)
            C_trees.append(DPu.convert_tree_to_tensors(dp["C_tree"]))
            L_a_trees.append(DPu.convert_tree_to_tensors(dp["L_a_tree"]))
            L_b_trees.append(DPu.convert_tree_to_tensors(dp["L_b_tree"]))
            labels.append(dp["label"])

        return {
            "name": name,
            "size": len(all_dps),
            "C_batch": batch_tree_input(C_trees),
            "L_a_batch": batch_tree_input(L_a_trees),
            "L_b_batch": batch_tree_input(L_b_trees),
            "label_batch": torch.tensor(labels),
        }
Example #3
0
    def parse_and_batch_input(self, lemma, kept_lits, to_be_checked_lits):
        """Parse a lemma and batch every (kept, to-be-checked) literal pair.

        Each index in ``to_be_checked_lits`` is paired with each index in
        ``kept_lits``.  Example: with kept_lits = [0, 1, 4] and
        to_be_checked_lits = [5, 6] the two batches line up as
            kept:          [l_0, l_1, l_4, l_0, l_1, l_4]
            to-be-checked: [l_5, l_5, l_5, l_6, l_6, l_6]
        so position i of both batches forms one P(l_checked | l_kept) query.

        Returns (kept_batch, checked_batch, n_lits, lits); the batches and
        ``lits`` are None when there are no pairs to score.
        """
        lit_jsons, lits = self.parse_lemma(lemma)
        print("no of lits:", len(lit_jsons))

        kept_trees = []
        checked_trees = []
        for checked_idx in to_be_checked_lits:
            for kept_idx in kept_lits:
                kept_trees.append(
                    DPu.convert_tree_to_tensors(lit_jsons[kept_idx]["tree"]))
                checked_trees.append(
                    DPu.convert_tree_to_tensors(lit_jsons[checked_idx]["tree"]))

        # Guard clause: nothing to batch (empty kept or to-be-checked set).
        if not kept_trees:
            return None, None, len(lits), None

        return (batch_tree_input(kept_trees),
                batch_tree_input(checked_trees),
                len(lits),
                lits)
Example #4
0
    def next_batch(self, datapart, batch_size, gamma = 0.1):
        """
        datapart is either self.train_dps or self.test_dps

        Returns (dataset, last_batch); (None, True) when the pointer has run
        past the end of datapart.  When gamma > 0 and a mask is long enough,
        each 0 in the mask is flipped to 1 with probability gamma to
        encourage higher recall.
        """
        log.debug("data_pointer:{}".format(self.data_pointer))
        log.debug(len(datapart))

        input_trees = []
        labels = []
        stop = min(self.data_pointer + batch_size, len(datapart))
        for idx in range(self.data_pointer, stop):
            # each datapoint: (timestamp, original cube, inducted cube, mask)
            datapoint = datapart[idx]

            lit_jsons = [self.id2lits_json[lit_id]["lit_tree"]
                         for lit_id in datapoint["ori"]]
            input_trees.append(batch_tree_input(lit_jsons))

            mask = datapoint["mask"]
            if gamma > 0 and len(mask) > MIN_LEN_GAMMA:
                # Copy before mutating so the stored datapoint stays intact,
                # then randomly flip 0 -> 1 with probability gamma.
                noisy = mask[:]
                for pos, bit in enumerate(noisy):
                    if bit == 0 and random.random() < gamma:
                        noisy[pos] = 1
                labels.append(torch.tensor(noisy).to(self.device))
            else:
                labels.append(torch.tensor(mask).to(self.device))

        # Pointer already past the end: signal exhaustion and rewind.
        if not input_trees:
            self.data_pointer = 0
            return None, True

        dataset = {
            "size": len(input_trees),
            "input_trees": input_trees,
            "labels": labels,
        }
        self.data_pointer += batch_size
        last_batch = self.data_pointer > len(datapart)
        if last_batch:
            self.data_pointer = 0
        return dataset, last_batch
Example #5
0
    def next_batch(self, P_matrix, batch_size, negative_sampling_rate):
        """Build the next batch of literal-pair datapoints from P_matrix.

        Rows [data_pointer, data_pointer + batch_size) of P_matrix are
        consumed per call.  P_matrix[i][j] is presumably an observed score for
        the literal pair (i, j) -- TODO confirm against the producer; a
        negative entry marks a never-observed pair and is skipped.

        negative_sampling_rate == -1 disables negative sampling; otherwise
        negatives are downsampled to at most
        len(pos_samples) * negative_sampling_rate (only when
        self.threshold > 0 -- see NOTE below).

        Returns (dataset, last_batch); (None, True) when the slice produced
        no pairs, with the pointer reset to 0.
        """
        last_batch = False
        dataset = {}
        filenames = []
        L_a_trees = []
        L_b_trees = []
        labels = []
        log.debug("data_pointer:{}".format(self.data_pointer))
        log.debug(len(P_matrix))

        if negative_sampling_rate == -1:  #if not using negative sampling
            for i in range(self.data_pointer,
                           min(self.data_pointer + batch_size, len(P_matrix))):
                #at row_i
                for j in range(len(P_matrix[i])):
                    #if P_matrix[i][j]== -1, it indicates a pair that has never been observed -> skip
                    if P_matrix[i][j] < 0:
                        continue
                    # print(self.id2lits_json[i])
                    L_a_trees.append(self.id2lits_json[i]["lit_tree"])
                    L_b_trees.append(self.id2lits_json[j]["lit_tree"])
                    filenames.append((self.id2lits_json[i]["filename"],
                                      self.id2lits_json[j]["filename"]))
                    # NOTE(review): label polarity flips with the sign of
                    # threshold (> threshold is positive when threshold > 0,
                    # <= threshold is positive otherwise) -- looks intentional
                    # but verify against training code.
                    if self.threshold > 0:
                        labels.append(int(P_matrix[i][j] > self.threshold))
                    else:
                        labels.append(int(P_matrix[i][j] <= self.threshold))
        else:  #if using negative sampling
            # First pass: classify every observed pair in this row slice as a
            # positive or negative sample (stored as (i, j, label) triples).
            pos_samples = []
            neg_samples = []
            for i in range(self.data_pointer,
                           min(self.data_pointer + batch_size, len(P_matrix))):
                #at row_i
                for j in range(len(P_matrix[i])):
                    #if P_matrix[i][j]== -1, it indicates a pair that has never been observed -> skip
                    if P_matrix[i][j] < 0:
                        continue

                    # Same threshold-sign polarity convention as the
                    # no-sampling branch above.
                    if self.threshold > 0:
                        if (P_matrix[i][j] > self.threshold):
                            pos_samples.append((i, j, 1))
                        else:
                            neg_samples.append((i, j, 0))
                    else:
                        if (P_matrix[i][j] <= self.threshold):
                            pos_samples.append((i, j, 1))
                        else:
                            neg_samples.append((i, j, 0))

            # NOTE(review): downsampling of negatives happens only when
            # threshold > 0; with threshold <= 0 ALL negatives are kept even
            # though negative_sampling_rate was requested -- confirm this
            # asymmetry is intended.
            if self.threshold > 0 and len(pos_samples) > 0:
                n_neg_samples = len(pos_samples) * negative_sampling_rate
                n_neg_samples = min(len(neg_samples), n_neg_samples)
                neg_samples = random.sample(neg_samples, n_neg_samples)

            all_samples = pos_samples + neg_samples
            random.shuffle(all_samples)
            log.debug(
                "Use negative sampling. Number of datapoints for this batch:{}"
                .format(len(all_samples)))
            for (i, j, label) in all_samples:
                L_a_trees.append(self.id2lits_json[i]["lit_tree"])
                L_b_trees.append(self.id2lits_json[j]["lit_tree"])
                filenames.append((self.id2lits_json[i]["filename"],
                                  self.id2lits_json[j]["filename"]))
                labels.append(int(label))

        #return None if size = 0
        if len(L_a_trees) == 0:
            self.data_pointer = 0
            return None, True
        dataset["size"] = len(L_a_trees)
        dataset["L_a_batch"] = batch_tree_input(L_a_trees)
        dataset["L_b_batch"] = batch_tree_input(L_b_trees)
        dataset["label_batch"] = torch.tensor(labels)
        dataset["filenames"] = filenames
        self.data_pointer += batch_size
        # Rewind once the pointer runs past the matrix; the empty-batch guard
        # above covers the exact-multiple case on the following call.
        if self.data_pointer > len(P_matrix):
            last_batch = True
            self.data_pointer = 0
        return dataset, last_batch