Example #1
    def load_data(self, path):
        if ".pickle" in path:
            print("Loading data from: {}".format(path))
            data_utils.load_existing_data_loader(self, path)

            return True

        for split in self.data:
            file_name = "v4_atomic_{}.csv".format(map_name(split))

            # The first nine columns of the ATOMIC CSV hold JSON-encoded
            # lists of annotations, so decode them up front.
            df = pandas.read_csv("{}/{}".format(path, file_name), index_col=0)
            df.iloc[:, :9] = df.iloc[:, :9].apply(
                lambda col: col.apply(json.loads))

            for cat in self.categories:
                # Flatten each (event, <relation>, [targets]) row into one
                # (event, <relation>, target) tuple per target.
                attr = df[cat]
                self.data[split]["total"] += utils.zipped_flatten(
                    zip(attr.index, ["<{}>".format(cat)] * len(attr),
                        attr.values))

        if do_take_partial_dataset(self.opt.data):
            self.data["train"]["total"] = select_partial_dataset(
                self.opt.data, self.data["train"]["total"])

        return False
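The helper utils.zipped_flatten is not shown on this page. A minimal sketch of
the behavior its call sites above imply (an assumption, not the original
implementation):

    # Hypothetical stand-in for utils.zipped_flatten: each zipped element is
    # (event, relation_token, list_of_targets); the result is one flat
    # (event, relation_token, target) triple per target.
    def zipped_flatten(zipped):
        return [(event, rel, target)
                for event, rel, targets in zipped
                for target in targets]

    rows = zipped_flatten(zip(
        ["personx goes to the mall"],
        ["<xIntent>"],
        [["to buy clothes", "to shop"]],
    ))
    # rows == [("personx goes to the mall", "<xIntent>", "to buy clothes"),
    #          ("personx goes to the mall", "<xIntent>", "to shop")]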
Example #2
    def load_data(self, path):
        if ".pickle" in path:
            print("Loading data from: {}".format(path))
            data_utils.load_existing_data_loader(self, path)
            return True

        for split in self.data:
            file_name = map_name(split, self.opt.data)
            if split != "dev" or self.opt.data.devversion != "12":
                string_tuples = open("{}/{}".format(
                    path, file_name), "r").read().split("\n")
            else:
                # Only reachable when split == "dev" and devversion == "12",
                # so a plain else covers the original redundant elif.
                string_tuples = open("{}/{}".format(
                    path, "dev1.txt"), "r").read().split("\n")
            tuples = [x.split("\t") for x in string_tuples if x]

            # dev/test and train stored the exact same 11-field lowercased
            # tuple, so one comprehension replaces both branches.
            self.data[split]["total"] = [
                tuple(field.lower().strip() for field in i[:11])
                for i in tuples
            ]
        return False
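The parsing above reduces to one record per non-empty line, tab-separated and
lower-cased. The same pattern on invented data:

    raw = "PersonX\teats\tpizza\t1\nPersonY\treads\ta book\t0\n"
    string_tuples = raw.split("\n")
    tuples = [x.split("\t") for x in string_tuples if x]
    records = [tuple(f.lower().strip() for f in i) for i in tuples]
    # records == [("personx", "eats", "pizza", "1"),
    #             ("persony", "reads", "a book", "0")]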
Example #3
    def load_data(self, path):
        if ".pickle" in path:
            print("Loading data from: {}".format(path))
            data_utils.load_existing_data_loader(self, path)
            return True

        for split in self.data:
            print('We are working on split:', split)
            file_name = map_name(split, self.opt.data)

            # devversion "12" reads the dev split from dev1.txt + dev2.txt;
            # every other configuration reads a single file.
            if split != "dev" or self.opt.data.devversion != "12":
                string_tuples = open("{}/{}".format(path, file_name),
                                     "r").read().split("\n")
                tuples = [x.split("\t") for x in string_tuples if x]
            else:
                string_tuples = open("{}/{}".format(path, "dev1.txt"),
                                     "r").read().split("\n")
                tuples = [x.split("\t") for x in string_tuples if x]
                string_tuples = open("{}/{}".format(path, "dev2.txt"),
                                     "r").read().split("\n")
                tuples += [x.split("\t") for x in string_tuples if x]

            # Map the relation column to a token once, instead of repeating
            # the same comprehension in every branch.
            if self.opt.data.rel == "language":
                def make_rel(i):
                    return split_into_words[i[0]]
            elif self.opt.data.rel == "relation":
                def make_rel(i):
                    return "<{}>".format(i[0])
            else:
                continue

            if split in ["dev", "test"]:
                labeled = [(i[1].lower().strip(), make_rel(i),
                            i[2].lower().strip(), int(i[3])) for i in tuples]
                self.data[split]["total"] = labeled
                self.data[split]["positive"] = [t for t in labeled if t[3]]
                self.data[split]["negative"] = \
                    [t for t in labeled if not t[3]]
            else:
                # Training labels are kept as raw strings (no int() cast).
                self.data[split]["total"] = \
                    [(i[1].lower().strip(), make_rel(i),
                      i[2].lower().strip(), i[3]) for i in tuples]

        return False
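split_into_words is not defined on this page; from its use above it behaves
like a mapping from relation identifiers to natural-language phrasings. A
hypothetical fragment (the real mapping lives in the surrounding module and
its exact entries may differ):

    split_into_words = {
        "AtLocation": "at location",
        "CapableOf": "capable of",
        "Causes": "causes",
    }

    relation_token = split_into_words["CapableOf"]  # "capable of"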
Example #4
    def load_data(self, path):
        if ".pickle" in path:
            print("Loading data from: {}".format(path))
            data_utils.load_existing_data_loader(self, path)

            return True

        for split in self.data:
            file_name = "motiv_sent_none_{}.csv".format(map_name(split))
            print(f"read file: {file_name}")

            df = pandas.read_csv("{}/{}".format(path, file_name))
            print(f"columns: {df.columns}")
            sentences = []
            for i in range(len(df.values)):
                # Keep the story context up to and including the current
                # line, then append the character between sentence markers.
                context = df.loc[i, "context"].split('\t')[1:]
                linenum = int(df.loc[i, "linenum"])
                char = df.loc[i, "char"]
                sentences.append('|'.join(context[0:linenum + 1]) +
                                 f"</s>{char}<s>")
            targets = df["motivation"].values

            if len(self.categories) == 1:
                cat = self.categories[0]

                self.data[split]["total"] += [
                    (str(sent), str(rel), str(tar))
                    for sent, rel, tar in zip(sentences, ["<{}>".format(cat)] *
                                              len(sentences), targets)
                ]

        if do_take_partial_dataset(self.opt.data):
            self.data["train"]["total"] = select_partial_dataset(
                self.opt.data, self.data["train"]["total"])

        return False
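A worked example of the input string the loop above assembles (all values
invented):

    context = "id\tJohn woke up.\tHe was hungry.\tHe made toast.".split('\t')[1:]
    linenum = 1
    char = "John"
    sentence = '|'.join(context[0:linenum + 1]) + f"</s>{char}<s>"
    # sentence == "John woke up.|He was hungry.</s>John<s>"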
Example #5
    def load_data(self, path):
        if ".pickle" in path:
            print("Loading data from: {}".format(path))
            data_utils.load_existing_data_loader(self, path)

            return True

        for split in self.data:
            # Per-split cap on the number of generated examples.
            n_data = {"train": self.n_train,
                      "dev": self.n_dev,
                      "test": self.n_test}[split]

            # Read & load ATOMIC dataset file
            file_name = "v4_atomic_{}.csv".format(map_name(split))

            df = pandas.read_csv("{}/{}".format(path, file_name), index_col=0)
            df.iloc[:, :9] = df.iloc[:, :9].apply(
                lambda col: col.apply(json.loads))
 
            if self.comet:
                """
                For replicating original COMET settings we don't need a graph.
                """
                for cat in [item for item in self.categories
                            if 'Inverse' not in item]:
                    attr = df[cat]
                    self.data[split]["total"] += utils.zipped_flatten(zip(
                        attr.index, ["<{}>".format(cat)] * len(attr),
                        attr.values))

            elif self.pathcomet:
                """
                Replicate original COMET, but prefix every (s, r) pair with
                a path from the graph.
                """
                comet_orig = {s: {"total": []}
                              for s in ("train", "dev", "test")}

                for cat in [item for item in self.categories
                            if 'Inverse' not in item]:
                    attr = df[cat]
                    comet_orig[split]["total"] += utils.zipped_flatten(zip(
                        attr.index, ["<{}>".format(cat)] * len(attr),
                        attr.values))

                # Convert tuples into lists.
                comet_orig[split]["total"] = [
                    list(item) for item in comet_orig[split]["total"]]

                # Build a directed graph over all (subject, relation, object)
                # triples, adding an explicit inverse edge for each relation.
                G = nx.DiGraph()
                entities = set()

                for cat in [item for item in self.categories
                            if 'Inverse' not in item]:
                    attr = df[cat]
                    triples = utils.zipped_flatten(zip(
                        attr.index, ["<{}>".format(cat)] * len(attr),
                        attr.values))

                    # Add each triple and its inverse to the graph.
                    for m1, rel, m2 in triples:
                        entities.add(m1)
                        entities.add(m2)
                        G.add_node(m1, type='subj')
                        G.add_node(m2, type='obj')
                        G.add_edge(m1, m2, rel=rel)
                        G.add_edge(m2, m1, rel=rel.replace('>', 'Inverse>'))

                examples = []
                for base_subj, base_rel, base_obj in comet_orig[split]["total"]:
                    # Filters duplicate paths generated for the same triple.
                    unique_paths = set()

                    for _ in range(self.n_per_node[split]):
                        curr_node = base_subj
                        walk = data_utils.Path(curr_node)
                        # We don't want to see the target object in the
                        # input path.
                        walk.nodes.add(base_obj)

                        n_attempts = 0
                        while len(walk.walk) < self.max_path_len:
                            obj, relation, dead_end = \
                                data_utils.single_step_reverse(curr_node, G)
                            if dead_end:
                                n_attempts += 1
                                break
                            updated = walk.update(obj, relation, prepend=True)
                            if updated:
                                curr_node = obj
                            else:
                                n_attempts += 1

                            if n_attempts > 10:
                                break

                        candidate = ' '.join(
                            walk.walk + [base_rel] + [base_obj])
                        if candidate not in unique_paths:
                            assert walk.walk[-1] == base_subj
                            walk.walk.append(base_rel)
                            walk.walk.append(base_obj)
                            examples.append(walk.walk)
                            unique_paths.add(candidate)

                        if len(examples) % 500 == 0:
                            print("\nGenerated {} {} examples".format(
                                len(examples), split))
                            print(walk.walk)

                if self.add_orig[split]:
                    self.data[split]["total"] += comet_orig[split]["total"]
                    self.data[split]["total"] += examples
                else:
                    self.data[split]["total"] = examples

            else:
                """
                Graph-based path data generation.
                """
                # Build the same directed graph with inverse edges as above.
                G = nx.DiGraph()
                entities = set()

                for cat in [item for item in self.categories
                            if 'Inverse' not in item]:
                    attr = df[cat]
                    triples = utils.zipped_flatten(zip(
                        attr.index, ["<{}>".format(cat)] * len(attr),
                        attr.values))

                    for m1, rel, m2 in triples:
                        entities.add(m1)
                        entities.add(m2)
                        G.add_node(m1, type='subj')
                        G.add_node(m2, type='obj')
                        G.add_edge(m1, m2, rel=rel)
                        # Inverse relation
                        G.add_edge(m2, m1, rel=rel.replace('>', 'Inverse>'))

                examples = []
                all_nodes = list(G.nodes())
                random.shuffle(all_nodes)
                for node in all_nodes:
                    # Used for filtering out duplicate paths starting from
                    # the same start node.
                    unique_paths = set()

                    for _ in range(self.n_per_node[split]):
                        curr_node = node
                        walk = data_utils.Path(curr_node)

                        n_attempts = 0
                        while len(walk.walk) < self.max_path_len:
                            obj, relation, dead_end = \
                                data_utils.single_step(curr_node, G)
                            if dead_end:
                                n_attempts += 1
                                break
                            updated = walk.update(obj, relation)
                            if updated:
                                curr_node = obj
                            else:
                                n_attempts += 1

                            if n_attempts > 10:
                                break

                        if ' '.join(walk.walk) not in unique_paths:
                            examples.append(walk.walk)
                            unique_paths.add(' '.join(walk.walk))

                        if len(examples) % 500 == 0:
                            print("\nGenerated {} {} examples".format(
                                len(examples), split))
                            print(walk.walk)

                    if len(examples) >= n_data:
                        break

                examples = examples[:n_data]
                self.data[split]["total"] = examples
                
        if do_take_partial_dataset(self.opt.data):
            self.data["train"]["total"] = select_partial_dataset(
                self.opt.data, self.data["train"]["total"])

        return False
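data_utils.Path, data_utils.single_step, and data_utils.single_step_reverse
are not included on this page. A self-contained sketch of the same random-walk
idea over a networkx DiGraph (the step logic and all names here are
assumptions, not the original helpers; in particular, the real Path.update
apparently rejects revisited nodes, which this sketch omits):

    import random

    import networkx as nx

    def single_step(node, G):
        # Hypothetical single random step: pick a random out-edge of `node`
        # and return (neighbor, relation, dead_end), mirroring how the
        # loader above consumes the return value.
        neighbors = list(G.successors(node))
        if not neighbors:
            return None, None, True
        nxt = random.choice(neighbors)
        return nxt, G.edges[node, nxt]["rel"], False

    G = nx.DiGraph()
    G.add_edge("personx eats", "to be full", rel="<xWant>")
    G.add_edge("to be full", "personx eats", rel="<xWantInverse>")

    walk = ["personx eats"]
    for _ in range(4):
        nxt, rel, dead_end = single_step(walk[-1], G)
        if dead_end:
            break
        walk += [rel, nxt]
    print(walk)
    # e.g. ['personx eats', '<xWant>', 'to be full', '<xWantInverse>', ...]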
Example #6
    def load_data(self, path):
        if ".pickle" in path:
            data_utils.load_existing_data_loader(self, path)
            return True
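Every example shares this pickle fast path. data_utils.load_existing_data_loader
is not shown; a minimal sketch of what such a helper could look like (an
assumption about its behavior, not the actual implementation):

    import pickle

    def load_existing_data_loader(data_loader, path):
        # Hypothetical: restore a previously pickled loader and copy its
        # attributes onto the fresh instance.
        with open(path, "rb") as f:
            old = pickle.load(f)
        for attr, value in old.__dict__.items():
            setattr(data_loader, attr, value)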