コード例 #1
0
    def read_data_sources(self, folder_paths):
        """Load the data sources and semantic models of the given datasets.

        For every name in ``folder_paths`` the folder
        ``<self.data_folder>/<name>`` is scanned. Each file in its ``data``
        sub-folder is wrapped in a ``Source`` and read with the reader that
        matches the file. If a ``model`` sub-folder exists, each file in it is
        applied to the matching source. The resulting ``OrderedDict`` (data
        file name -> ``Source``) is stored in ``self.dataset_map[name]``.

        Args:
            folder_paths: Iterable of dataset folder names, relative to
                ``self.data_folder``.

        Raises:
            KeyError: If a model file cannot be matched to any loaded source.
            OSError: If a dataset's ``data`` sub-folder cannot be listed.
        """
        logging.info("Reading data sources...")
        for folder_name in folder_paths:
            folder_path = os.path.join(self.data_folder, folder_name)
            logging.info("-->folder: {}".format(folder_path))
            source_map = OrderedDict()
            data_folder_path = os.path.join(folder_path, "data")
            model_folder_path = os.path.join(folder_path, "model")
            # Loop-invariant: when the data path contains "full", every file
            # goes through read_data_from_wc_csv regardless of its extension.
            use_wc_csv = "full" in data_folder_path

            for filename in os.listdir(data_folder_path):
                # Skip any entry whose name contains ".DS"
                # (presumably macOS .DS_Store files).
                if ".DS" in filename:
                    continue
                logging.info("   ...file: {}".format(filename))
                print(filename)

                base_name, extension = os.path.splitext(filename)
                source = Source(base_name)
                file_path = os.path.join(data_folder_path, filename)

                if use_wc_csv:
                    source.read_data_from_wc_csv(file_path)
                elif extension == ".csv":
                    source.read_data_from_csv(file_path)
                elif extension == ".json":
                    source.read_data_from_json(file_path)
                elif extension == ".xml":
                    source.read_data_from_xml(file_path)
                else:
                    source.read_data_from_text_file(file_path)
                source_map[filename] = source

            if os.path.exists(model_folder_path):
                for filename in os.listdir(model_folder_path):
                    if ".DS" in filename:
                        continue

                    # Sources are keyed by data file name. First try the model
                    # file name with its last two extensions stripped, then
                    # fall back to the model file name itself. Only a missing
                    # key triggers the fallback; any other error propagates.
                    stem, extension = os.path.splitext(filename)
                    try:
                        source = source_map[os.path.splitext(stem)[0]]
                    except KeyError:
                        source = source_map[filename]

                    model_path = os.path.join(model_folder_path, filename)
                    if extension == ".json":
                        source.read_semantic_type_json(model_path)
                    else:
                        print(source)
                        source.read_semantic_type_from_gold(model_path)

            self.dataset_map[folder_name] = source_map
コード例 #2
0
    def read_data_sources(self, folder_paths):
        """Load the data sources and semantic models of the given datasets.

        For every name in ``folder_paths`` the folder ``data/datasets/<name>``
        (relative to the current working directory) is scanned. Each file in
        its ``data`` sub-folder, in sorted order, is wrapped in a ``Source``
        and read with the reader that matches the file. Columns that have a
        semantic type but no values are moved from ``source.column_map`` to
        ``source.empty_val_columns``. If a ``model`` sub-folder exists, each
        file in it is applied to the matching source. The resulting
        ``OrderedDict`` (data file name -> ``Source``) is stored in
        ``self.dataset_map[name]``.

        After each dataset, the running number of distinct semantic types and
        the running column count are printed.

        Args:
            folder_paths: Iterable of dataset folder names located under
                ``data/datasets``.

        Raises:
            KeyError: If a model file cannot be matched to any loaded source.
            OSError: If a dataset's ``data`` sub-folder cannot be listed.
        """
        semantic_type_set = set()
        attr_count = 0
        for folder_name in folder_paths:
            self.logger.debug("Read dataset: %s", folder_name)

            folder_path = "data/datasets/%s" % folder_name
            source_map = OrderedDict()
            data_folder_path = os.path.join(folder_path, "data")
            model_folder_path = os.path.join(folder_path, "model")
            # Loop-invariant: when the data path contains "full", every file
            # goes through read_data_from_wc_csv regardless of its extension.
            use_wc_csv = "full" in data_folder_path

            for filename in sorted(os.listdir(data_folder_path)):
                # Skip any entry whose name contains ".DS"
                # (presumably macOS .DS_Store files).
                if ".DS" in filename:
                    continue

                self.logger.debug("    -> read: %s", filename)

                base_name, extension = os.path.splitext(filename)
                source = Source(base_name)
                file_path = os.path.join(data_folder_path, filename)

                if use_wc_csv:
                    source.read_data_from_wc_csv(file_path)
                elif extension == ".csv":
                    source.read_data_from_csv(file_path)
                elif extension == ".json":
                    source.read_data_from_json(file_path)
                elif extension == ".xml":
                    source.read_data_from_xml(file_path)
                else:
                    source.read_data_from_text_file(file_path)
                source_map[filename] = source

                # NOTE (BINH): empty columns are removed here, mirroring the
                # code in indexer:36. Iterate over a snapshot of the keys
                # because entries are deleted during the loop.
                for key in list(source.column_map.keys()):
                    column = source.column_map[key]
                    if column.semantic_type and len(column.value_list) == 0:
                        del source.column_map[key]
                        source.empty_val_columns[key] = column
                        logging.warning("Indexer: IGNORE COLUMN `%s` in source `%s` because of empty values",
                                        column.name, source.name)

                # NOTE(review): these statistics are gathered before the model
                # files below are read, so they reflect the semantic types set
                # by the data readers - confirm this is intended.
                semantic_type_set.update(
                    column.semantic_type
                    for column in source.column_map.values())
                attr_count += len(source.column_map.values())

            if os.path.exists(model_folder_path):
                for filename in os.listdir(model_folder_path):
                    if ".DS" in filename:
                        continue

                    # Sources are keyed by data file name. First try the model
                    # file name with its last two extensions stripped, then
                    # fall back to the model file name itself. Only a missing
                    # key triggers the fallback; any other error propagates.
                    stem, extension = os.path.splitext(filename)
                    try:
                        source = source_map[os.path.splitext(stem)[0]]
                    except KeyError:
                        source = source_map[filename]

                    model_path = os.path.join(model_folder_path, filename)
                    if extension == ".json":
                        source.read_semantic_type_json(model_path)
                    else:
                        # Single-argument print() prints the same on
                        # Python 2 and Python 3.
                        print(source)
                        source.read_semantic_type_from_gold(model_path)

            self.dataset_map[folder_name] = source_map
            print(len(semantic_type_set))
            print(attr_count)
コード例 #3
0
ファイル: semantic_labeler.py プロジェクト: sgottsch/Tab2KG
    def read_data_sources(self, folder_paths):
        """Load the tables and semantic models of the given datasets.

        For every name in ``folder_paths`` the folder ``data/datasets/<name>``
        (relative to the current working directory) is scanned. Each file in
        its ``tables`` sub-folder, in sorted order, is wrapped in a ``Source``
        and read with the reader that matches the file. A ``rowNumber`` column
        is dropped if present, and columns that have a semantic type but no
        values are moved from ``source.column_map`` to
        ``source.empty_val_columns``. If a ``models`` sub-folder exists, each
        file in it is applied to the matching source. The resulting
        ``OrderedDict`` (table file name -> ``Source``) is stored in
        ``self.dataset_map[name]``.

        After each dataset, the running number of distinct semantic types and
        the running column count are printed.

        Args:
            folder_paths: Iterable of dataset folder names located under
                ``data/datasets``.

        Raises:
            KeyError: If a model file cannot be matched to any loaded source.
            OSError: If a dataset's ``tables`` sub-folder cannot be listed.
        """
        semantic_type_set = set()
        attr_count = 0
        for folder_name in folder_paths:
            self.logger.debug("Read dataset: %s", folder_name)

            folder_path = "data/datasets/%s" % folder_name
            source_map = OrderedDict()
            data_folder_path = os.path.join(folder_path, "tables")
            model_folder_path = os.path.join(folder_path, "models")
            # Loop-invariant: when the tables path contains "full", every file
            # goes through read_data_from_wc_csv regardless of its extension.
            use_wc_csv = "full" in data_folder_path

            for filename in sorted(os.listdir(data_folder_path)):
                # Skip any entry whose name contains ".DS"
                # (presumably macOS .DS_Store files).
                if ".DS" in filename:
                    continue

                self.logger.debug("    -> read: %s", filename)

                base_name, extension = os.path.splitext(filename)
                source = Source(base_name)
                file_path = os.path.join(data_folder_path, filename)

                if use_wc_csv:
                    source.read_data_from_wc_csv(file_path)
                elif extension == ".csv":
                    source.read_data_from_csv(file_path)
                elif extension == ".json":
                    source.read_data_from_json(file_path)
                elif extension == ".xml":
                    source.read_data_from_xml(file_path)
                else:
                    source.read_data_from_text_file(file_path)
                source_map[filename] = source

                # Drop the 'rowNumber' column when the reader produced one.
                if 'rowNumber' in source.column_map:
                    del source.column_map['rowNumber']

                # NOTE (BINH): empty columns are removed here, mirroring the
                # code in indexer:36. Iterate over a snapshot of the keys
                # because entries are deleted during the loop.
                for key in list(source.column_map.keys()):
                    column = source.column_map[key]
                    if column.semantic_type and len(column.value_list) == 0:
                        del source.column_map[key]
                        source.empty_val_columns[key] = column
                        logging.warning(
                            "Indexer: IGNORE COLUMN `%s` in source `%s` because of empty values",
                            column.name, source.name)

                # NOTE(review): these statistics are gathered before the model
                # files below are read, so they reflect the semantic types set
                # by the data readers - confirm this is intended.
                semantic_type_set.update(
                    column.semantic_type
                    for column in source.column_map.values())
                attr_count += len(source.column_map.values())

            if os.path.exists(model_folder_path):
                for filename in os.listdir(model_folder_path):
                    if ".DS" in filename:
                        continue

                    # Sources are keyed by table file name. First try the
                    # model file name with its last two extensions stripped,
                    # then fall back to the model file name itself. Only a
                    # missing key triggers the fallback; any other error
                    # propagates.
                    stem, extension = os.path.splitext(filename)
                    try:
                        source = source_map[os.path.splitext(stem)[0]]
                    except KeyError:
                        source = source_map[filename]

                    model_path = os.path.join(model_folder_path, filename)
                    if extension == ".json":
                        source.read_semantic_type_json(model_path)
                    else:
                        print(source)
                        source.read_semantic_type_from_gold(model_path)

            self.dataset_map[folder_name] = source_map
            print(len(semantic_type_set))
            print(attr_count)