def load_data(self, node_id = "", parm = 'all'): """ load train data :param node_id: :param parm: :return: """ try: _multi_node_flag = self.multi_node_flag if _multi_node_flag == True: file_path = utils.get_filepaths(self.data_store_path, 'tfrecords') else: file_path = utils.get_filepaths(self.data_store_path, 'h5') return file_path except Exception as e: raise Exception(e)
def create_tfrecords_file(self, output_file, skip_header, df_csv_read, label, label_type):
    """Creates a TFRecords file for the given input data and
    example transformation function.
    """
    try:
        # AutoML temporary workaround: clear any stale TFRecords before
        # writing (was: "Seungwoo's code")
        fp_list = utils.get_filepaths(self.data_store_path, file_type='tfrecords')
        for file_path in fp_list:
            os.remove(file_path)

        writer = tf.python_io.TFRecordWriter(output_file)
        logging.info("Creating TFRecords file at %s ...", output_file)
        CONTINUOUS_COLUMNS, CATEGORICAL_COLUMNS = self.make_continuous_category_list(
            self.data_conf["cell_feature"])
        print_row_count = 10000
        for count, row in df_csv_read.iterrows():
            x = self.create_example_pandas(row, CONTINUOUS_COLUMNS,
                                           CATEGORICAL_COLUMNS, label, label_type)
            if count % print_row_count == 0:
                logging.info("###### TFRecording row count : {0}".format(count))
            writer.write(x.SerializeToString())
        writer.close()
        logging.info("Wrote to %s", output_file)
    except Exception as e:
        raise e
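# create_example_pandas is not shown in this section. A minimal sketch of
# what it likely builds, assuming TF 1.x and the usual mapping of continuous
# columns to float features and categorical columns to byte features; this
# is a hypothetical stand-in, not the project's actual implementation:
import tensorflow as tf

def create_example_sketch(row, continuous_cols, categorical_cols, label, label_type):
    feature = {}
    for col in continuous_cols:
        feature[col] = tf.train.Feature(
            float_list=tf.train.FloatList(value=[float(row[col])]))
    for col in categorical_cols:
        feature[col] = tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[str(row[col]).encode('utf-8')]))
    # label handling depends on label_type (CONTINUOUS vs. CATEGORICAL)
    if label_type == 'CONTINUOUS':
        feature[label] = tf.train.Feature(
            float_list=tf.train.FloatList(value=[float(row[label])]))
    else:
        feature[label] = tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[str(row[label]).encode('utf-8')]))
    return tf.train.Example(features=tf.train.Features(feature=feature))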
def get_eval_node_file_list(self, conf_data):
    """Find the eval data node, resolve its source path, read the CSV files,
    and store the resulting cell_feature info in self.data_conf.
    Args:
        conf_data : workflow info for the nnid
    Returns:
        None
    """
    # ToDo: these two lookups could probably be removed
    eval_data_node = [_i for _i, _k in conf_data.get('cls_pool').items()
                      if 'evaldata' in _i]
    data_conf_node_id = [_i for _i, _k in conf_data.get('cls_pool').items()
                         if 'dataconf' in _i]
    eval_data_cls = wf_data_frame(eval_data_node[0])
    #eval_source_path = eval_data_cls.source_path
    eval_source_path = self.data_src_path
    fp_list = utils.get_filepaths(eval_source_path, file_type='csv')
    for file_path in fp_list:
        df_csv_read = self.load_csv_by_pandas(file_path)
        # make column types of the CSV
        self.data_conf = self.make_column_types(df_csv_read, eval_data_node[0],
                                                data_conf_node_id[0])
def load_data(self, node_id = "", parm = 'all'): """ load train data :param node_id: :param parm: :return: """ return utils.get_filepaths(self.data_store_path)
def load_data(self, node_id="", parm='all'): """ load train data :param node_id: :param parm: :return: """ return utils.get_filepaths(self.data_store_path)
def load_data(self, node_id, parm='all'):
    """load train data
    Method changed to support multi load; includes feeder-related changes.
    :param node_id:
    :param parm:
    :return: list of stored file paths
    """
    try:
        prev_node = self.get_prev_node()[0]
        if prev_node.multi_node_flag:
            return utils.get_filepaths(prev_node.data_store_path, 'tfrecords')
        else:
            return utils.get_filepaths(prev_node.data_store_path, 'h5')
    except Exception as e:
        raise Exception(e)
def load_data(self, node_id="", parm='all'): """ load train data :param node_id: :param parm: :return: """ try: file_path = utils.get_filepaths(self.data_store_path, 'iob') return file_path except Exception as e: raise Exception(e)
def get_eval_node_file_list(self, conf_data):
    # find the eval data node, instantiate it, and get its source path
    eval_data_node = [_i for _i, _k in conf_data.get('cls_pool').items()
                      if 'evaldata' in _i]
    data_conf_node_id = [_i for _i, _k in conf_data.get('cls_pool').items()
                         if 'dataconf' in _i]
    eval_data_cls = wf_data_frame(eval_data_node[0])
    eval_source_path = eval_data_cls.source_path
    fp_list = utils.get_filepaths(eval_source_path, file_type='csv')
    for file_path in fp_list:
        df_csv_read = self.load_csv_by_pandas(file_path)
        # make column types of the CSV
        self.data_conf = self.make_column_types(df_csv_read, eval_data_node[0],
                                                data_conf_node_id[0])
def src_local_handler(self, conf_data):
    """read files from the local source path and store them as raw files
    :param conf_data:
    :return:
    """
    try:
        fp_list = utils.get_filepaths(self.data_src_path)
        for file_path in fp_list:
            str_buf = self._load_local_files(file_path)
            self._save_raw_file(str_buf)
    except Exception as e:
        raise Exception(e)
def src_local_handler(self, conf_data):
    """read files from the local source path, preprocess and encode them,
    and store the result as HDF5
    :param conf_data:
    :return:
    """
    try:
        fp_list = utils.get_filepaths(self.data_src_path)
        for file_path in fp_list:
            str_buf = self._load_local_files(file_path)
            conv_buf = self.encode_pad(
                self._preprocess(str_buf, type=self.data_preprocess_type))
            self._save_hdf5(conv_buf)
    except Exception as e:
        raise Exception(e)
def get_input_data(self):
    try:
        train_list = [self.load_data_from_h5(file_path)
                      for file_path in utils.get_filepaths(self.data_store_path)]
        df_train = pd.DataFrame()
        result_train = df_train.append(train_list)

        test_list = [self.load_data_from_h5(file_eval_path)
                     for file_eval_path in utils.get_filepaths(self.data_store_eval_path)]
        df_test = pd.DataFrame()
        result_test = df_test.append(test_list)
    except Exception as e:
        logging.info("NeuralNetNodeXgboost get input Exception : {0}".format(e))
        raise Exception(e)
    return result_train, result_test
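# Note: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0,
# so the pattern above only runs on older pandas. A minimal sketch of the
# equivalent concatenation on newer versions (hypothetical standalone form):
import pandas as pd

def concat_h5_frames(frames):
    # pd.concat raises on an empty list, so guard that case explicitly
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()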
def create_hdf5(self, data_path, dataframe):
    """Create hdf5
    :param data_path:
    :return: dataframe
    """
    # todo fix
    # AutoML temporary workaround
    fp_list = utils.get_filepaths(data_path, file_type='h5')
    #for file_path in fp_list:
    #    os.remove(file_path)  # was: "Seungwoo's code"
    if len(fp_list) == 0:
        file_name = strftime("%Y-%m-%d-%H:%M:%S", gmtime()) + ".h5"
        output_path = os.path.join(data_path, file_name)
        hdf = pd.HDFStore(output_path)
        hdf.put('table1', dataframe, format='table', data_columns=True,
                encoding='UTF-8')
        hdf.close()
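# A minimal sketch of reading such a store back, using the 'table1' key
# that create_hdf5 writes (the helper name below is illustrative):
import pandas as pd

def read_hdf5_table(store_path):
    # read back the single 'table1' frame written by create_hdf5;
    # equivalent to pd.HDFStore(store_path).get('table1')
    return pd.read_hdf(store_path, key='table1')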
def src_local_handler(self, conf_data):
    """Make h5 & tfrecord for multi threading
    Arguments:
        conf_data : data_source_path, etc.
    """
    try:
        logging.info("Data node starting : {0}".format(conf_data['node_id']))
        fp_list = utils.get_filepaths(self.data_src_path, file_type='csv')
        _multi_node_flag = self.multi_node_flag
        eval_data = dict((_i, _k) for _i, _k in self.cls_list.items()
                         if 'evaldata' in _i)
        try:
            # find the data conf node id
            data_conf_node_id = ''
            for _i, _k in self.cls_list.items():
                if 'dataconf' in _i:
                    data_conf_node_id = _i

            # needed to fetch the eval category data; not needed when the
            # eval node itself is running
            if 'data_node' not in conf_data['node_id']:
                self.get_eval_node_file_list(conf_data)

            data_dfconf_list = data_conf_node_id

            for file_path in fp_list:
                df_csv_read = self.load_csv_by_pandas(file_path)
                if 'dataconf' in data_dfconf_list:
                    # make column types of the CSV
                    self.data_conf = self.make_column_types(
                        df_csv_read, conf_data['node_id'], data_conf_node_id)
                    # the eval data should also be fetched to compute unique values
                    #self.make_unique_value_each_column(df_csv_read, conf_data['node_id'])
                self.create_hdf5(self.data_store_path, df_csv_read)

                # Todo: extract this into a function
                # for WDNN: exactly one data_dfconf exists, so this logic holds
                if len(data_dfconf_list) > 0:
                    # Todo: can be cleaned up
                    _key = data_dfconf_list
                    _nnid = _key.split('_')[0]
                    _ver = _key.split('_')[1]
                    _node = 'dataconf_node'
                    _wf_data_conf = wf_data_conf(_key)
                    if hasattr(_wf_data_conf, 'label'):  # label check
                        _label = _wf_data_conf.label
                        _label_type = _wf_data_conf.label_type
                        # pass an empty list if there are no label values yet
                        origin_labels_list = (_wf_data_conf.label_values
                                              if hasattr(_wf_data_conf, 'label_values')
                                              else list())
                        compare_labels_list = self.set_dataconf_for_labels(df_csv_read, _label)
                        # merge the lists, then update the DB
                        self.combined_label_list = utils.get_combine_label_list(
                            origin_labels_list, compare_labels_list)
                        _data_conf = dict()
                        _data_conf['label_values'] = self.combined_label_list
                        if _label_type == 'CONTINUOUS':
                            _data_conf['label_values'] = list()
                        _wf_data_conf.put_step_source(_nnid, _ver, _node, _data_conf)

                        # make tfrecord for multi threading
                        if _multi_node_flag:
                            skip_header = False
                            # Todo: remove before production
                            self.save_tfrecord(file_path, self.data_store_path,
                                               skip_header, df_csv_read,
                                               _label, _label_type)

                # back up the source file, then remove it (was: "Seungwoo's code")
                dir = self.data_src_path + "/backup"
                if not os.path.exists(dir):
                    os.makedirs(dir)
                file_name_bk = strftime("%Y-%m-%d-%H:%M:%S", gmtime()) + ".csvbk"
                shutil.copy(file_path, self.data_src_path + "/backup/" + file_name_bk)
                os.remove(file_path)
        except Exception as e:
            logging.error("Datanode making h5 or tfrecord error : {0}".format(e))
            raise Exception(e)
        logging.info("Data node end : {0}".format(conf_data['node_id']))
        return None
    except Exception as e:
        raise Exception(e)
def src_local_handler(self, conf_data):
    """Convert CSV to H5 and TFRecord (data node for data_frame).
    1) For WDNN, distinguish categorical vs. continuous columns while
       parsing with pandas and record them in DataConf (skipped for eval
       data; the DataNode is the reference). For categorical columns the
       unique values are stored in DataConf; if the label type is
       categorical, the label's unique values are stored as well. The
       pandas frame is preprocessed according to _preprocess_type.
    2) If _multi_node_flag is True, a TFRecord is created as well.
    3) If not WDNN, only the H5 file is created.
    Args:
        conf_data : nn_info
    Returns:
        None
    Raises:
    """
    try:
        logging.info("Data node starting : {0}".format(conf_data['node_id']))
        fp_list = utils.get_filepaths(self.data_src_path, file_type='csv')
        _multi_node_flag = self.multi_node_flag
        _preprocess_type = self.data_preprocess_type
        #_preprocess_type = "maxabs_scale"
        _drop_duplicate = self.drop_duplicate

        # create the backup directory
        dir = self.data_src_path + "/backup"
        if not os.path.exists(dir):
            os.makedirs(dir)

        #if len(_preprocess_type) > 1:
        if _preprocess_type:
            for file_path in fp_list:
                # train data convert
                df_csv_read = self.load_csv_by_pandas(file_path)
                preprocess_path = utils.get_preprocess_path(
                    self.net_id, self.net_ver, self.node_id)
                logging.info("preprocess_path {0}".format(preprocess_path))
                logging.info("preprocess_file {0}".format(_preprocess_type))
                spec = importlib.util.spec_from_file_location(
                    "data_preprocess", "/hoya_src_root/data_preprocess.py")
                foo = importlib.util.module_from_spec(spec)
                spec.loader.exec_module(foo)
                _pre_df_csv_read = foo.data_preprocess_by_file(df_csv_read)
                self.create_hdf5(self.data_store_path, _pre_df_csv_read)

            eval_fp_list = utils.get_filepaths(self.data_src_eval_path, file_type='csv')
            for eval_file_path in eval_fp_list:
                # eval data convert
                df_csv_eval_read = self.load_csv_by_pandas(eval_file_path)
                preprocess_path = utils.get_preprocess_path(
                    self.net_id, self.net_ver, self.node_id)
                logging.info("preprocess_path {0}".format(preprocess_path))
                logging.info("preprocess_file {0}".format(_preprocess_type))
                spec = importlib.util.spec_from_file_location(
                    "data_preprocess", "/hoya_src_root/data_preprocess.py")
                foo = importlib.util.module_from_spec(spec)
                spec.loader.exec_module(foo)
                _pre_df_csv_eval_read = foo.data_preprocess_by_file(df_csv_eval_read)
                self.create_hdf5(self.data_store_eval_path, _pre_df_csv_eval_read)
        else:
            try:
                data_conf_node_id = self.check_eval_node_for_wdnn(conf_data)
                data_dfconf_list = data_conf_node_id
                for file_path in fp_list:
                    if len(data_dfconf_list) == 0:
                        # not WDNN
                        df_csv_read = self.load_csv_by_pandas(file_path)
                        self.create_hdf5(self.data_store_path, df_csv_read)
                    if len(data_dfconf_list) > 0:
                        # WDNN
                        df_csv_read = self.load_csv_by_pandas(file_path)
                        if 'dataconf' in data_dfconf_list:
                            # decide here whether this is a dataconf node;
                            # make column types of the CSV
                            self.data_conf = self.make_column_types(
                                df_csv_read, conf_data['node_id'], data_conf_node_id)
                            # the eval data should also be fetched to compute unique values
                            # Todo: need logic to force Category when the eval
                            # and train data types differ
                            # for WDNN, put the label values into Dataconf
                            _label, _label_type = self.make_label_values(
                                data_dfconf_list, df_csv_read)
                            drop_dup_df_csv_read = self.make_drop_duplicate(
                                df_csv_read, _drop_duplicate, _label)
                            #_pre_df_csv_read = self.make_preprocessing_pandas(drop_dup_df_csv_read, _preprocess_type, _label)
                            #temp_preprocess_filename = strftime("%Y-%m-%d-%H:%M:%S", gmtime()) + "_pre.csvbk"
                            #_pre_df_csv_read.to_csv(self.data_src_path + "/backup/" + temp_preprocess_filename)
                            self.create_hdf5(self.data_store_path, df_csv_read)
                            if _multi_node_flag:
                                skip_header = False
                                # Todo: remove before production
                                self.save_tfrecord(file_path, self.data_store_path,
                                                   skip_header, df_csv_read,
                                                   _label, _label_type)
                    # back up the processed source file
                    file_name_bk = strftime("%Y-%m-%d-%H:%M:%S", gmtime()) + ".csvbk"
                    shutil.copy(file_path, self.data_src_path + "/backup/" + file_name_bk)
                    #os.remove(file_path)  # was: "Seungwoo's code"
            except Exception as e:
                logging.error("Datanode making h5 or tfrecord error : {0}".format(e))
                raise Exception(e)
        logging.info("Data node end : {0}".format(conf_data['node_id']))
        return None
    except Exception as e:
        raise Exception(e)
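# The handler above dynamically loads /hoya_src_root/data_preprocess.py and
# calls its data_preprocess_by_file(df). A minimal sketch of what such a
# user-supplied module could look like; only the function name and signature
# are implied by the loader, and the max-abs scaling shown is illustrative:
import pandas as pd

def data_preprocess_by_file(df):
    # example: max-abs scale the numeric columns, leave the rest untouched
    out = df.copy()
    num_cols = out.select_dtypes(include='number').columns
    # replace zero maxima with 1 to avoid division by zero
    out[num_cols] = out[num_cols] / out[num_cols].abs().max().replace(0, 1)
    return out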
def src_local_handler(self, conf_data):
    """read data from local file system
    :param conf_data:
    :return:
    """
    try:
        # init values
        vocab_words = None
        vocab_tags = None
        vocab_chars = None

        # get word embedding model
        parm = {"type": "model", "val_1": {}, "val_2": []}
        embed_model = PredictNetW2V().run(self.word_embed_model, parm)

        # read files from source folder (handle one by one)
        fp_list = utils.get_filepaths(self.data_src_path, file_type='iob')
        if len(fp_list) == 0:
            return None

        netconf_node = self.get_linked_next_node_with_grp('netconf')
        if len(netconf_node) > 0:
            store_path = get_model_path(netconf_node[0].get_net_id(),
                                        netconf_node[0].get_net_ver(),
                                        netconf_node[0].get_net_node_id())
            # create dict folder for ner if it does not exist
            netconf_path = ''.join([store_path, '/dict/'])
            if not os.path.exists(netconf_path):
                os.makedirs(netconf_path)
            vocab_words = self.load_vocab(''.join([netconf_path, 'words.txt']))
            vocab_tags = self.load_vocab(''.join([netconf_path, 'tags.txt']))
        else:
            return None

        for file_path in fp_list:
            # data generators
            dev = self.CoNLLDataset(file_path)
            train = self.CoNLLDataset(file_path)

            # get distinct vocab and chars
            vocab_words, vocab_tags = self.get_vocabs([train, dev],
                                                      vocab=vocab_words,
                                                      tags=vocab_tags)
            vocab = vocab_words & set(embed_model.wv.index2word)
            vocab.add(self.UNK)
            vocab_chars = self.get_char_vocab(train, chars=vocab_chars)

            # write dicts and vectors for train
            self.write_char_embedding(vocab_chars, ''.join([netconf_path, 'char.vec']))
            self.write_vocab(vocab_chars, ''.join([netconf_path, 'chars.txt']))
            self.write_vocab(vocab, ''.join([netconf_path, 'words.txt']))
            self.write_vocab(vocab_tags, ''.join([netconf_path, 'tags.txt']))
            self.export_trimmed_glove_vectors(vocab, embed_model,
                                              ''.join([netconf_path, 'words.vec']))
    except Exception as e:
        raise Exception(e)
    finally:
        # move source files to the store path
        for file_path in fp_list:
            str_buf = self._load_local_files(file_path)
            self._save_raw_file(str_buf)
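# write_vocab and load_vocab are not shown in this section. A minimal
# sketch of the vocab file round-trip the code above implies, using the
# common one-token-per-line convention; these are hypothetical helpers,
# not the project's actual implementations:
def write_vocab_sketch(vocab, filename):
    # one token per line; the line number becomes the token id
    with open(filename, 'w', encoding='utf-8') as f:
        f.write('\n'.join(vocab))

def load_vocab_sketch(filename):
    # invert the file back into a token -> id mapping
    with open(filename, encoding='utf-8') as f:
        return {word.strip(): idx for idx, word in enumerate(f)}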
def load_data(self, node_id="", parm = 'all'): dataconf = WorkFlowDataImage().get_step_source(node_id) output_directory = dataconf["store_path"] return utils.get_filepaths(output_directory)