Example #1
 def load_data(self, node_id = "", parm = 'all'):
     """
     load train data
     :param node_id:
     :param parm:
     :return:
     """
     try:
         _multi_node_flag = self.multi_node_flag
         if _multi_node_flag:
             file_path = utils.get_filepaths(self.data_store_path, 'tfrecords')
         else:
             file_path = utils.get_filepaths(self.data_store_path, 'h5')
         return file_path
     except Exception as e:
         raise Exception(e)
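The utils.get_filepaths helper itself is not shown in these examples. As a rough illustration only, here is a minimal sketch of such a helper, assuming it simply walks a directory and filters file names by extension (the project's real implementation may differ):

import os

def get_filepaths(directory, file_type=None):
    """Collect file paths under `directory`, optionally keeping only a given extension."""
    file_paths = []
    for root, _dirs, files in os.walk(directory):
        for filename in files:
            if file_type is None or filename.endswith(file_type):
                file_paths.append(os.path.join(root, filename))
    return file_paths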
Example #2
    def create_tfrecords_file(self, output_file, skip_header, df_csv_read,
                              label, label_type):
        """
        Creates a TFRecords file for the given input data and
        example transformation function
        """
        try:
            # automl temporary method
            fp_list = utils.get_filepaths(self.data_store_path,
                                          file_type='tfrecords')
            for file_path in fp_list:
                os.remove(file_path)  # Seungwoo's

            writer = tf.python_io.TFRecordWriter(output_file)
            logging.info("Creating TFRecords file at %s ...", output_file)

            CONTINUOUS_COLUMNS, CATEGORICAL_COLUMNS = self.make_continuous_category_list(
                self.data_conf["cell_feature"])
            print_row_count = 10000
            csv_dataframe = df_csv_read
            for count, row in csv_dataframe.iterrows():
                x = self.create_example_pandas(row, CONTINUOUS_COLUMNS,
                                               CATEGORICAL_COLUMNS, label,
                                               label_type)
                if (count % print_row_count == 0):
                    logging.info(
                        "###### TFRecording row count : {0}".format(count))
                writer.write(x.SerializeToString())
            writer.close()
            print("Wrote to", output_file)
        except Exception as e:
            raise e
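The create_example_pandas helper is not included above. As a hedged, self-contained sketch of the same idea, here is one way to turn a DataFrame row into a tf.train.Example and write it to a TFRecords file; the column names and the row_to_example helper are illustrative, and tf.python_io.TFRecordWriter is the TF1 spelling of today's tf.io.TFRecordWriter:

import pandas as pd
import tensorflow as tf

def row_to_example(row, continuous_cols, categorical_cols):
    """Build a tf.train.Example from one DataFrame row (illustrative only)."""
    feature = {}
    for col in continuous_cols:
        feature[col] = tf.train.Feature(
            float_list=tf.train.FloatList(value=[float(row[col])]))
    for col in categorical_cols:
        feature[col] = tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[str(row[col]).encode("utf-8")]))
    return tf.train.Example(features=tf.train.Features(feature=feature))

df = pd.DataFrame({"age": [25, 31], "job": ["clerk", "teacher"]})
with tf.io.TFRecordWriter("sample.tfrecords") as writer:
    for _, row in df.iterrows():
        writer.write(row_to_example(row, ["age"], ["job"]).SerializeToString())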
Example #3
 def load_data(self, node_id = "", parm = 'all'):
     """
     load train data
     :param node_id:
     :param parm:
     :return:
     """
     try:
         _multi_node_flag = self.multi_node_flag
         if _multi_node_flag:
             file_path = utils.get_filepaths(self.data_store_path, 'tfrecords')
         else:
             file_path = utils.get_filepaths(self.data_store_path, 'h5')
         return file_path
     except Exception as e:
         raise Exception(e)
Example #4
    def create_tfrecords_file(self, output_file, skip_header, df_csv_read, label,label_type):
        """
        Creates a TFRecords file for the given input data and
        example transformation function
        """
        try:
            # automl temporary method
            fp_list = utils.get_filepaths(self.data_store_path, file_type='tfrecords')
            for file_path in fp_list:
                os.remove(file_path)  # Seungwoo's

            writer = tf.python_io.TFRecordWriter(output_file)
            logging.info("Creating TFRecords file at %s ...", output_file)

            CONTINUOUS_COLUMNS, CATEGORICAL_COLUMNS  = self.make_continuous_category_list(self.data_conf["cell_feature"])
            print_row_count = 10000
            csv_dataframe = df_csv_read
            for count, row in csv_dataframe.iterrows():
                x = self.create_example_pandas(row, CONTINUOUS_COLUMNS, CATEGORICAL_COLUMNS, label,label_type)
                if (count % print_row_count == 0):
                    logging.info("###### TFRecording row count : {0}".format(count))
                writer.write(x.SerializeToString())
            writer.close()
            print("Wrote to", output_file)
        except Exception as e:
            raise e
Example #5
 def get_eval_node_file_list(self, conf_data):
     """ Eval Data Node 찾고, 경로를 찾아서 CSV를 읽음
         self.data_conf에 cell_feature에 넣음 
     Args:
       params:
         * _conf_data : nnid의 wf정보 
     Returns:
       None
     """
     #ToDo 두줄 지워도 될듯
     eval_data_node = [
         _i for _i, _k in conf_data.get('cls_pool').items()
         if 'evaldata' in _i
     ]
     data_conf_node_id = [
         _i for _i, _k in conf_data.get('cls_pool').items()
         if 'dataconf' in _i
     ]
     eval_data_cls = wf_data_frame(eval_data_node[0])
     #eval_source_path = eval_data_cls.source_path
     eval_source_path = self.data_src_path
     fp_list = utils.get_filepaths(eval_source_path, file_type='csv')
     for file_path in fp_list:
         df_csv_read = self.load_csv_by_pandas(file_path)
         self.data_conf = self.make_column_types(
             df_csv_read, eval_data_node[0],
             data_conf_node_id[0])  # make columns type of csv
Example #6
 def load_data(self, node_id = "", parm = 'all'):
     """
     load train data
     :param node_id:
     :param parm:
     :return:
     """
     return utils.get_filepaths(self.data_store_path)
Example #7
 def load_data(self, node_id="", parm='all'):
     """
     load train data
     :param node_id:
     :param parm:
     :return:
     """
     return utils.get_filepaths(self.data_store_path)
Example #8
    def load_data(self, node_id, parm = 'all'):
        """
        load train data
        Method changed for Multi Load, including changes due to the feeder
        :param node_id:
        :param parm:
        :return:
        """
        try:
            multi_node_flag = self.get_prev_node()[0].multi_node_flag
            data_store_path = self.get_prev_node()[0].data_store_path
            if multi_node_flag:
                return utils.get_filepaths(data_store_path, 'tfrecords')
            else:
                return utils.get_filepaths(data_store_path, 'h5')

        except Exception as e:
            raise Exception(e)
Example #9
 def load_data(self, node_id="", parm='all'):
     """
     load train data
     :param node_id:
     :param parm:
     :return:
     """
     try:
         file_path = utils.get_filepaths(self.data_store_path, 'iob')
         return file_path
     except Exception as e:
         raise Exception(e)
Example #10
    def get_eval_node_file_list(self, conf_data):
        #find the eval node
        #create it and get the path

        eval_data_node = [_i  for _i, _k in conf_data.get('cls_pool').items() if 'evaldata' in _i]
        data_conf_node_id = [_i for _i, _k in conf_data.get('cls_pool').items() if 'dataconf' in _i]
        eval_data_cls = wf_data_frame(eval_data_node[0])
        eval_source_path = eval_data_cls.source_path
        fp_list = utils.get_filepaths(eval_source_path, file_type='csv')
        for file_path in fp_list:
            df_csv_read = self.load_csv_by_pandas(file_path)
            self.data_conf = self.make_column_types(df_csv_read, eval_data_node[0],
                                                    data_conf_node_id[0])  # make columns type of csv
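Since only the keys of cls_pool are used, the two comprehensions above can iterate the dict directly. A tiny sketch with made-up node IDs:

conf_data = {"cls_pool": {"nn0001_1_evaldata_node": None, "nn0001_1_dataconf_node": None}}
eval_data_node = [k for k in conf_data.get('cls_pool') if 'evaldata' in k]
data_conf_node_id = [k for k in conf_data.get('cls_pool') if 'dataconf' in k]
print(eval_data_node, data_conf_node_id)  # ['nn0001_1_evaldata_node'] ['nn0001_1_dataconf_node']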
Example #11
    def src_local_handler(self, conf_data):
        """

        :param conf_data:
        :return:
        """
        try:
            fp_list = utils.get_filepaths(self.data_src_path)
            for file_path in fp_list :
                str_buf = self._load_local_files(file_path)
                self._save_raw_file(str_buf)
        except Exception as e:
            raise Exception(e)
Example #12
    def src_local_handler(self, conf_data):
        """

        :param conf_data:
        :return:
        """
        try:
            fp_list = utils.get_filepaths(self.data_src_path)
            for file_path in fp_list :
                str_buf = self._load_local_files(file_path)
                conv_buf = self.encode_pad(self._preprocess(str_buf, type=self.data_preprocess_type))
                self._save_hdf5(conv_buf)
        except Exception as e:
            raise Exception(e)
Example #13
    def src_local_handler(self, conf_data):
        """

        :param conf_data:
        :return:
        """
        try:
            fp_list = utils.get_filepaths(self.data_src_path)
            for file_path in fp_list:
                str_buf = self._load_local_files(file_path)
                conv_buf = self.encode_pad(
                    self._preprocess(str_buf, type=self.data_preprocess_type))
                self._save_hdf5(conv_buf)
        except Exception as e:
            raise Exception(e)
Example #14
    def get_input_data(self):
        try:

            train_list = [ self.load_data_from_h5(file_path) for file_path in utils.get_filepaths(self.data_store_path)]
            df_train = pd.DataFrame()
            result_train = df_train.append(train_list)

            test_list = [self.load_data_from_h5(file_eval_path) for file_eval_path in utils.get_filepaths(self.data_store_eval_path)]
            df_test = pd.DataFrame()
            result_test = df_test.append(test_list)
        except Exception as e:
            logging.info("NeuralNetNodeXgboost get input Exception : {0}".format(e))
            raise Exception(e)

        return result_train, result_test
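Note that DataFrame.append, used above to combine the per-file frames, was deprecated in pandas 1.4 and removed in pandas 2.0. On newer pandas the same result is obtained with pd.concat; a minimal sketch:

import pandas as pd

train_list = [pd.DataFrame({"a": [1, 2]}), pd.DataFrame({"a": [3]})]
# Equivalent of pd.DataFrame().append(train_list); ignore_index=True gives a fresh RangeIndex.
result_train = pd.concat(train_list, ignore_index=True)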
Example #15
 def create_hdf5(self, data_path, dataframe):
     """
     Create hdf5
     :param data_path:
     :return:dataframe
     """
     #todo fix
     #automl temparary method
     fp_list = utils.get_filepaths(data_path, file_type='h5')
     #for file_path in fp_list:
     #    os.remove(file_path) #승우씨것
     if len(fp_list) == 0:
         file_name = strftime("%Y-%m-%d-%H:%M:%S", gmtime()) + ".h5"
         output_path = os.path.join(data_path, file_name)
         hdf = pd.HDFStore(output_path)
         hdf.put('table1', dataframe, format='table', data_columns=True, encoding='UTF-8')
         hdf.close()
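A self-contained round trip of the pd.HDFStore call used above, assuming PyTables (the tables package) is installed; the file name is illustrative:

import pandas as pd

df = pd.DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]})
with pd.HDFStore("example.h5") as hdf:
    hdf.put('table1', df, format='table', data_columns=True)
with pd.HDFStore("example.h5") as hdf:
    restored = hdf.get('table1')
print(restored.equals(df))  # True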
Example #16
 def create_hdf5(self, data_path, dataframe):
     """
     Create hdf5
     :param data_path:
     :return:dataframe
     """
     #todo fix
     #automl temparary method
     fp_list = utils.get_filepaths(data_path, file_type='h5')
     #for file_path in fp_list:
     #    os.remove(file_path) #승우씨것
     if len(fp_list) == 0:
         file_name = strftime("%Y-%m-%d-%H:%M:%S", gmtime()) + ".h5"
         output_path = os.path.join(data_path, file_name)
         hdf = pd.HDFStore(output_path)
         hdf.put('table1', dataframe, format='table', data_columns=True, encoding='UTF-8')
         hdf.close()
Example #17
 def get_eval_node_file_list(self, conf_data):
     """ Eval Data Node 찾고, 경로를 찾아서 CSV를 읽음
         self.data_conf에 cell_feature에 넣음 
     Args:
       params:
         * _conf_data : nnid의 wf정보 
     Returns:
       None
     """
     #ToDo 두줄 지워도 될듯
     eval_data_node = [_i  for _i, _k in conf_data.get('cls_pool').items() if 'evaldata' in _i]
     data_conf_node_id = [_i for _i, _k in conf_data.get('cls_pool').items() if 'dataconf' in _i]
     eval_data_cls = wf_data_frame(eval_data_node[0])
     #eval_source_path = eval_data_cls.source_path
     eval_source_path = self.data_src_path
     fp_list = utils.get_filepaths(eval_source_path, file_type='csv')
     for file_path in fp_list:
         df_csv_read = self.load_csv_by_pandas(file_path)
         self.data_conf = self.make_column_types(df_csv_read, eval_data_node[0],
                                                 data_conf_node_id[0])   # make columns type of csv
Example #18
    def src_local_handler(self, conf_data):
        """
        Make h5 & tfrecord for multi threading

        Arguments:
            conf_data : data_source_path. etc
        """
        try:
            logging.info("Data node starting : {0}".format(conf_data['node_id']))
            fp_list = utils.get_filepaths(self.data_src_path, file_type='csv')
            _multi_node_flag = self.multi_node_flag

            eval_data = dict((_i, _k) for _i, _k in self.cls_list.items() if 'evaldata' in _i)

            try:
                #find the data conf node id
                data_conf_node_id = ''
                for _i, _k in self.cls_list.items():
                    if 'dataconf' in _i:
                        data_conf_node_id = _i
                        #needed to fetch eval category data; not needed when the Evalnode itself runs
                        if 'data_node' not in conf_data['node_id']:
                            self.get_eval_node_file_list(conf_data)

                data_dfconf_list = data_conf_node_id

                for file_path in fp_list:
                    df_csv_read = self.load_csv_by_pandas(file_path)

                    if 'dataconf' in data_dfconf_list:
                        self.data_conf = self.make_column_types(df_csv_read, conf_data['node_id'], data_conf_node_id) # make columns type of csv
                        #eval data must also be read to compute the unique values

                    #self.make_unique_value_each_column(df_csv_read,conf_data['node_id'])
                    self.create_hdf5(self.data_store_path, df_csv_read)
                    #Todo extract this into a separate function
                    #for wdnn
                    #for Wdnn there is always exactly one data_dfconf, so the logic below is safe
                    if len(data_dfconf_list) > 0:
                        #Todo 정리가능

                        _key =data_dfconf_list
                        _nnid = _key.split('_')[0]
                        _ver = _key.split('_')[1]
                        _node  = 'dataconf_node'
                        _wf_data_conf = wf_data_conf(_key)
                        if hasattr(_wf_data_conf, 'label'):
                            # label check
                            _label = _wf_data_conf.label
                            _labe_type = _wf_data_conf.label_type
                            origin_labels_list = _wf_data_conf.label_values if hasattr(_wf_data_conf,'label_values') else list() #pass an empty list when there are no label values on first input
                            compare_labels_list = self.set_dataconf_for_labels(df_csv_read,_label)
                            self.combined_label_list = utils.get_combine_label_list(origin_labels_list,compare_labels_list )
                            #merge the lists, then update the DB
                            _data_conf = dict()
                            _data_conf['label_values'] = self.combined_label_list
                            if _labe_type == 'CONTINUOUS':
                                _data_conf['label_values'] = list()
                            _wf_data_conf.put_step_source(_nnid, _ver,_node, _data_conf )

                            # make tfrecord for multi Threading
                            if _multi_node_flag:
                                skip_header = False
                                # Todo Have to remove if production
                                self.save_tfrecord(file_path, self.data_store_path, skip_header, df_csv_read,_label, _labe_type)

                    dir = self.data_src_path+"/backup"
                    if not os.path.exists(dir):
                        os.makedirs(dir)
                        #os.mkdir(self.data_src_path+"/backup")

                    file_name_bk = strftime("%Y-%m-%d-%H:%M:%S", gmtime()) + ".csvbk"
                    shutil.copy(file_path,self.data_src_path+"/backup/"+file_name_bk )
                    os.remove(file_path) #Seungwoo's
            except Exception as e:
                logging.error("Datanode making h5 or tfrecord error : {0}".format(e))
                raise Exception(e)
            logging.info("Data node end : {0}".format(conf_data['node_id']))
            return None
        except Exception as e:
            raise Exception(e)
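The backup step at the end of the loop above could be isolated into a small helper. A sketch using the same naming convention (note that ':' in the timestamp is not portable to Windows file names):

import os
import shutil
from time import gmtime, strftime

def backup_csv(file_path, src_dir):
    """Copy a processed CSV into <src_dir>/backup under a timestamped .csvbk name."""
    backup_dir = os.path.join(src_dir, "backup")
    os.makedirs(backup_dir, exist_ok=True)
    file_name_bk = strftime("%Y-%m-%d-%H:%M:%S", gmtime()) + ".csvbk"
    shutil.copy(file_path, os.path.join(backup_dir, file_name_bk))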
Example #19
    def src_local_handler(self, conf_data):
        """ Converting csv to  h5 and Tf Record
            Data Node for Data_frame
            1) Wdnn인 경우 
              Pandas를 파싱하면서 Categorical 인지 Continuous인지 구별하여 DataConf에 입력(eval data할때는 안함. DataNode 기준 )
              Category일경우 Unique값을 Dataconf에 입력
              Label type이 Categorical이면 Label의 Unique값을 DataConf입력 
              _preprocess_type에 따라 Pandas 전처리 
            2) _multi_node_flag 가 True일 경우 TfRecord까지 생성
            3) Wdnn이 아닌경우 H5만 생성
        Args:
          params:
            * conf_data : nn_info
        Returns:
          None
        Raises:

        """
        try:
            logging.info("Data node starting : {0}".format(
                conf_data['node_id']))
            fp_list = utils.get_filepaths(self.data_src_path, file_type='csv')
            _multi_node_flag = self.multi_node_flag
            _preprocess_type = self.data_preprocess_type
            #_preprocess_type = "maxabs_scale"
            _drop_duplicate = self.drop_duplicate
            dir = self.data_src_path + "/backup"  # create the backup directory
            if not os.path.exists(dir):
                os.makedirs(dir)
            #if len(_preprocess_type) > 1 :
            if _preprocess_type:
                for file_path in fp_list:
                    #Train data convert
                    df_csv_read = self.load_csv_by_pandas(file_path)
                    preprocess_path = utils.get_preprocess_path(
                        self.net_id, self.net_ver, self.node_id)
                    logging.info("preprocess_path {0}".format(preprocess_path))
                    logging.info(
                        "preprocess_file {0}".format(_preprocess_type))

                    spec = importlib.util.spec_from_file_location(
                        "data_preprocess", "/hoya_src_root/data_preprocess.py")
                    foo = importlib.util.module_from_spec(spec)
                    spec.loader.exec_module(foo)
                    _pre_df_csv_read = foo.data_preprocess_by_file(df_csv_read)
                    self.create_hdf5(self.data_store_path, _pre_df_csv_read)

                eval_fp_list = utils.get_filepaths(self.data_src_eval_path,
                                                   file_type='csv')

                for eval_file_path in eval_fp_list:
                    # Eval data convert
                    df_csv_eval_read = self.load_csv_by_pandas(eval_file_path)
                    preprocess_path = utils.get_preprocess_path(
                        self.net_id, self.net_ver, self.node_id)
                    logging.info("preprocess_path {0}".format(preprocess_path))
                    logging.info(
                        "preprocess_file {0}".format(_preprocess_type))
                    spec = importlib.util.spec_from_file_location(
                        "data_preprocess", "/hoya_src_root/data_preprocess.py")

                    foo = importlib.util.module_from_spec(spec)
                    spec.loader.exec_module(foo)
                    _pre_df_csv_eval_read = foo.data_preprocess_by_file(
                        df_csv_eval_read)
                    self.create_hdf5(self.data_store_eval_path,
                                     _pre_df_csv_eval_read)

            else:

                try:
                    data_conf_node_id = self.check_eval_node_for_wdnn(
                        conf_data)
                    data_dfconf_list = data_conf_node_id
                    for file_path in fp_list:
                        if len(data_dfconf_list) == 0:  #not WDNN
                            df_csv_read = self.load_csv_by_pandas(file_path)
                            self.create_hdf5(self.data_store_path, df_csv_read)
                        if len(data_dfconf_list) > 0:  #WDNN
                            df_csv_read = self.load_csv_by_pandas(file_path)
                            if 'dataconf' in data_dfconf_list:  #decide here whether this is a Dataconf node
                                self.data_conf = self.make_column_types(
                                    df_csv_read, conf_data['node_id'],
                                    data_conf_node_id
                                )  # make columns type of csv
                                # eval data must also be read to compute the unique values
                                # Todo need logic to treat a column as Category when eval and train dtypes differ
                            _label, _labe_type = self.make_label_values(
                                data_dfconf_list, df_csv_read
                            )  # for WDNN, put the label values into Dataconf

                            drop_dup_df_csv_read = self.make_drop_duplicate(
                                df_csv_read, _drop_duplicate, _label)
                            #_pre_df_csv_read = self.make_preprocessing_pandas(drop_dup_df_csv_read, _preprocess_type,_label )
                            #temp_preprocess_filename = strftime("%Y-%m-%d-%H:%M:%S", gmtime()) + "_pre.csvbk"
                            #_pre_df_csv_read.to_csv(self.data_src_path + "/backup/" + temp_preprocess_filename)
                            self.create_hdf5(self.data_store_path, df_csv_read)
                            if _multi_node_flag:
                                skip_header = False
                                # Todo Have to remove if production
                                self.save_tfrecord(file_path,
                                                   self.data_store_path,
                                                   skip_header, df_csv_read,
                                                   _label, _labe_type)

                        file_name_bk = strftime("%Y-%m-%d-%H:%M:%S",
                                                gmtime()) + ".csvbk"
                        shutil.copy(
                            file_path,
                            self.data_src_path + "/backup/" + file_name_bk)
                        #os.remove(file_path) #승우씨것
                except Exception as e:
                    logging.error(
                        "Datanode making h5 or tfrecord error : {0}".format(e))
                    raise Exception(e)
            logging.info("Data node end : {0}".format(conf_data['node_id']))
            return None
        except Exception as e:
            raise Exception(e)
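The importlib pattern above, which loads a user-supplied preprocessing module from /hoya_src_root/data_preprocess.py at run time, can be condensed into a helper; a sketch of the same mechanism:

import importlib.util

def load_preprocess_module(path="/hoya_src_root/data_preprocess.py"):
    """Load a module from an explicit file path and return it."""
    spec = importlib.util.spec_from_file_location("data_preprocess", path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module

# foo = load_preprocess_module()
# df = foo.data_preprocess_by_file(df)   # as called in the example above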
Example #20
    def src_local_handler(self, conf_data):
        """ Converting csv to  h5 and Tf Record
            Data Node for Data_frame
            1) Wdnn인 경우 
              Pandas를 파싱하면서 Categorical 인지 Continuous인지 구별하여 DataConf에 입력(eval data할때는 안함. DataNode 기준 )
              Category일경우 Unique값을 Dataconf에 입력
              Label type이 Categorical이면 Label의 Unique값을 DataConf입력 
              _preprocess_type에 따라 Pandas 전처리 
            2) _multi_node_flag 가 True일 경우 TfRecord까지 생성
            3) Wdnn이 아닌경우 H5만 생성
        Args:
          params:
            * conf_data : nn_info
        Returns:
          None
        Raises:

        """
        try:
            logging.info("Data node starting : {0}".format(conf_data['node_id']))
            fp_list = utils.get_filepaths(self.data_src_path, file_type='csv')
            _multi_node_flag = self.multi_node_flag
            _preprocess_type = self.data_preprocess_type
            #_preprocess_type = "maxabs_scale"
            _drop_duplicate = self.drop_duplicate
            dir = self.data_src_path + "/backup"  # create the backup directory
            if not os.path.exists(dir):
                os.makedirs(dir)
            #if len(_preprocess_type) > 1 :
            if _preprocess_type:
                for file_path in fp_list:
                    #Train data convert
                    df_csv_read = self.load_csv_by_pandas(file_path)
                    preprocess_path = utils.get_preprocess_path(self.net_id, self.net_ver, self.node_id)
                    logging.info("preprocess_path {0}".format(preprocess_path))
                    logging.info("preprocess_file {0}".format(_preprocess_type))

                    spec = importlib.util.spec_from_file_location("data_preprocess", "/hoya_src_root/data_preprocess.py")
                    foo = importlib.util.module_from_spec(spec)
                    spec.loader.exec_module(foo)
                    _pre_df_csv_read = foo.data_preprocess_by_file(df_csv_read)
                    self.create_hdf5(self.data_store_path, _pre_df_csv_read)

                eval_fp_list = utils.get_filepaths(self.data_src_eval_path, file_type='csv')

                for eval_file_path in eval_fp_list:
                    # Eval data convert
                    df_csv_eval_read = self.load_csv_by_pandas(eval_file_path)
                    preprocess_path = utils.get_preprocess_path(self.net_id, self.net_ver, self.node_id)
                    logging.info("preprocess_path {0}".format(preprocess_path))
                    logging.info("preprocess_file {0}".format(_preprocess_type))
                    spec = importlib.util.spec_from_file_location("data_preprocess",
                                                                  "/hoya_src_root/data_preprocess.py")

                    foo = importlib.util.module_from_spec(spec)
                    spec.loader.exec_module(foo)
                    _pre_df_csv_eval_read = foo.data_preprocess_by_file(df_csv_eval_read)
                    self.create_hdf5(self.data_store_eval_path, _pre_df_csv_eval_read)

            else:

                try:
                    data_conf_node_id = self.check_eval_node_for_wdnn(conf_data)
                    data_dfconf_list = data_conf_node_id
                    for file_path in fp_list:
                        if len(data_dfconf_list) == 0:  #not WDNN
                            df_csv_read = self.load_csv_by_pandas(file_path)
                            self.create_hdf5(self.data_store_path, df_csv_read)
                        if len(data_dfconf_list) > 0:  #WDNN
                            df_csv_read = self.load_csv_by_pandas(file_path)
                            if 'dataconf' in data_dfconf_list: #decide here whether this is a Dataconf node
                                self.data_conf = self.make_column_types(df_csv_read, conf_data['node_id'],
                                                                        data_conf_node_id)  # make columns type of csv
                                # eval data must also be read to compute the unique values
                                # Todo need logic to treat a column as Category when eval and train dtypes differ
                            _label,_labe_type = self.make_label_values(data_dfconf_list, df_csv_read)   # for WDNN, put the label values into Dataconf

                            drop_dup_df_csv_read = self.make_drop_duplicate(df_csv_read, _drop_duplicate,_label)
                            #_pre_df_csv_read = self.make_preprocessing_pandas(drop_dup_df_csv_read, _preprocess_type,_label )
                            #temp_preprocess_filename = strftime("%Y-%m-%d-%H:%M:%S", gmtime()) + "_pre.csvbk"
                            #_pre_df_csv_read.to_csv(self.data_src_path + "/backup/" + temp_preprocess_filename)
                            self.create_hdf5(self.data_store_path, df_csv_read)
                            if _multi_node_flag:
                                skip_header = False
                                # Todo Have to remove if production
                                self.save_tfrecord(file_path, self.data_store_path, skip_header, df_csv_read,_label, _labe_type)

                        file_name_bk = strftime("%Y-%m-%d-%H:%M:%S", gmtime()) + ".csvbk"
                        shutil.copy(file_path,self.data_src_path+"/backup/"+file_name_bk )
                        #os.remove(file_path) #승우씨것
                except Exception as e:
                    logging.error("Datanode making h5 or tfrecord error : {0}".format(e))
                    raise Exception(e)
            logging.info("Data node end : {0}".format(conf_data['node_id']))
            return None
        except Exception as e:
            raise Exception(e)
Example #21
    def src_local_handler(self, conf_data):
        """
        read data from local file system
        :param conf_data:
        :return:
        """
        try:
            # init value
            vocab_words = None
            vocab_tags = None
            vocab_chars = None

            # get word embedding model
            parm = {"type": "model", "val_1": {}, "val_2": []}
            embed_model = PredictNetW2V().run(self.word_embed_model, parm)

            # read files from source folder (handle one by one)
            fp_list = utils.get_filepaths(self.data_src_path, file_type='iob')
            if (len(fp_list) == 0):
                return None

            netconf_node = self.get_linked_next_node_with_grp('netconf')
            if (len(netconf_node) > 0):
                store_path = get_model_path(netconf_node[0].get_net_id(),
                                            netconf_node[0].get_net_ver(),
                                            netconf_node[0].get_net_node_id())

                # create dict folder for ner if not exists
                netconf_path = ''.join([store_path, '/dict/'])
                if not os.path.exists(netconf_path):
                    os.makedirs(netconf_path)

                vocab_words = self.load_vocab(''.join(
                    [netconf_path, 'words.txt']))
                vocab_tags = self.load_vocab(''.join(
                    [netconf_path, 'tags.txt']))
            else:
                return None

            for file_path in fp_list:
                # Data Generators
                dev = self.CoNLLDataset(file_path)
                train = self.CoNLLDataset(file_path)

                # get distinct vocab and chars
                vocab_words, vocab_tags = self.get_vocabs([train, dev],
                                                          vocab=vocab_words,
                                                          tags=vocab_tags)
                vocab = vocab_words & set(embed_model.wv.index2word)
                vocab.add(self.UNK)
                vocab_chars = self.get_char_vocab(train, chars=vocab_chars)

            # write dict and vectors for train
            self.write_char_embedding(vocab_chars,
                                      ''.join([netconf_path, 'char.vec']))
            self.write_vocab(vocab_chars, ''.join([netconf_path, 'chars.txt']))
            self.write_vocab(vocab, ''.join([netconf_path, 'words.txt']))
            self.write_vocab(vocab_tags, ''.join([netconf_path, 'tags.txt']))
            self.export_trimmed_glove_vectors(
                vocab, embed_model, ''.join([netconf_path, 'words.vec']))

        except Exception as e:
            raise Exception(e)
        finally:
            for file_path in fp_list:
                # move source file to store path
                str_buf = self._load_local_files(file_path)
                self._save_raw_file(str_buf)
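The vocabulary helpers referenced above (load_vocab, write_vocab) are not shown. A minimal sketch of what such helpers commonly look like, one token per line; the project's actual implementations may differ (for instance by writing word indices):

def write_vocab(vocab, filename):
    """Write one vocabulary entry per line."""
    with open(filename, "w", encoding="utf-8") as f:
        f.write("\n".join(sorted(vocab)))

def load_vocab(filename):
    """Read a vocabulary file back as a set of non-empty, stripped lines."""
    with open(filename, encoding="utf-8") as f:
        return {line.strip() for line in f if line.strip()}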
Example #22
 def load_data(self, node_id="", parm = 'all'):
     dataconf = WorkFlowDataImage().get_step_source(node_id)
     output_directory = dataconf["store_path"]
     return utils.get_filepaths(output_directory)
Example #23
 def load_data(self, node_id="", parm = 'all'):
     dataconf = WorkFlowDataImage().get_step_source(node_id)
     output_directory = dataconf["store_path"]
     return utils.get_filepaths(output_directory)