def find_modify(self, index_id, operation, columns, modify_operation,
                modify_columns=(), limit=0, offset=0):
    """Updates/deletes row(s) using opened index.

    Returns number of modified rows or a list of original values in case
    ``modify_operation`` ends with ``?``.
    Raises ``ValueError`` if given data doesn't validate.

    :param integer index_id: id of opened index.
    :param string operation: logical comparison operation to use over
      ``columns``. Currently allowed operations are defined in
      :const:`~.FIND_OPERATIONS`. Only one operation is allowed per call.
    :param iterable columns: list of column values for comparison operation.
      List must be ordered in the same way as columns are defined in opened
      index.
    :param string modify_operation: modification operation (update or delete).
      Currently allowed operations are defined in
      :const:`~.MODIFY_OPERATIONS`.
    :param iterable modify_columns: list of column values for update
      operation. List must be ordered in the same way as columns are defined
      in opened index. Only usable for *update* operations.
    :param integer limit: optional limit of results to change. Default is one
      row. In case multiple rows are expected to be changed, ``limit`` must
      be set explicitly; HS won't change all found rows by default.
    :param integer offset: optional offset of rows to search for.
    :rtype: list
    """
    if operation not in self.FIND_OPERATIONS \
            or modify_operation not in self.MODIFY_OPERATIONS:
        raise ValueError('Operation is not supported.')

    if not check_columns(columns):
        raise ValueError('Columns must be a non-empty iterable.')

    if modify_operation in ('U', '+', '-', 'U?', '+?', '-?') \
            and not check_columns(modify_columns):
        raise ValueError(
            'modify_columns must be a non-empty iterable for update operations.')

    # Note: the default for ``modify_columns`` is an empty tuple rather than
    # a mutable ``[]`` to avoid the shared-mutable-default pitfall.
    query = chain(
        (str(index_id), operation, str(len(columns))),
        imap(encode, columns),
        (str(limit), str(offset), modify_operation),
        imap(encode, modify_columns),
    )

    return self._call(index_id, query, force_index=True)
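# Usage sketch for find_modify (hypothetical, not from the original source):
# `conn` stands in for a client instance exposing this method, and `index_id`
# for an id returned by an earlier open_index() call on an index covering
# two columns; values are passed as strings, as the HS protocol expects.
modified_count = conn.find_modify(index_id, '=', ['1'], 'U', ['1', 'new title'])

# With a '?'-suffixed operation the original row values come back instead
# of a modification count:
original_rows = conn.find_modify(index_id, '=', ['1'], 'U?', ['1', 'newer'])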
def find(self, index_id, operation, columns, limit=0, offset=0):
    """Finds row(s) via opened index.

    Raises ``ValueError`` if given data doesn't validate.

    :param integer index_id: id of opened index.
    :param string operation: logical comparison operation to use over
      ``columns``. Currently allowed operations are defined in
      :const:`~.FIND_OPERATIONS`. Only one operation is allowed per call.
    :param iterable columns: list of column values for comparison operation.
      List must be ordered in the same way as columns are defined in opened
      index.
    :param integer limit: optional limit of results to return. Default is
      one row. In case multiple results are expected, ``limit`` must be set
      explicitly; HS won't return all found rows by default.
    :param integer offset: optional offset of rows to search for.
    :rtype: list
    """
    if operation not in self.FIND_OPERATIONS:
        raise ValueError('Operation is not supported.')

    if not check_columns(columns):
        raise ValueError('Columns must be a non-empty iterable.')

    query = chain(
        (str(index_id), operation, str(len(columns))),
        imap(encode, columns),
        (str(limit), str(offset)),
    )

    return self._call(index_id, query, force_index=True)
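# Usage sketch for find (hypothetical, not from the original source): fetch
# up to 10 rows whose indexed column value is >= '100'. `conn` and `index_id`
# are assumed as in the find_modify sketch above; note the explicit limit,
# since HS returns a single row by default.
rows = conn.find(index_id, '>=', ['100'], limit=10)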
def _data_preprocess(clip_start_date, clip_end_date, disk_smart_df,
                     use_model_id, use_2017_fault_data, is_train):
    """Clips, cleans and labels the raw SMART data before feature engineering."""
    # optionally restrict to a single disk model and clip the date range
    if use_model_id:
        disk_smart_df = disk_smart_df[disk_smart_df.model == use_model_id]
    if clip_start_date is not None:
        disk_smart_df = disk_smart_df[disk_smart_df['dt'] >= clip_start_date]
    if clip_end_date is not None:
        disk_smart_df = disk_smart_df[disk_smart_df['dt'] <= clip_end_date]

    if use_2017_fault_data:
        fault_data_2017_path = os.path.join(conf.DATA_DIR, '2017_fault_data.h5')
        fault_2017_df = pd.read_hdf(
            fault_data_2017_path,
            columns=SELECTED_CONT_COLS + SELECTED_INDEX_COLS +
            SELECTED_CATE_COLS + SELECTED_LABEL_COLS,
        )
        disk_smart_df = pd.concat([disk_smart_df, fault_2017_df], axis=0)

    # some task-specific clean rules
    correct_column_type(disk_smart_df)
    index_cols, cate_cols, cont_cols, label_cols = check_columns(
        disk_smart_df.dtypes.to_dict())
    disk_smart_df.drop_duplicates(index_cols, keep='first', inplace=True)
    mask = (disk_smart_df[POWER_ON_HOURS_COL] != 0)
    disk_smart_df = disk_smart_df[mask]
    # disk_smart_df.dropna(subset=[POWER_ON_HOURS_COL], inplace=True)

    if is_train:
        # drop continuous columns with too few distinct values and columns
        # that are mostly NaN
        cols_with_unique_number = remove_cont_cols_with_unique_value(
            disk_smart_df, cont_cols, threshold=DROP_UNIQUE_COL_THRESHOLD)
        disk_smart_df.drop(columns=cols_with_unique_number, inplace=True)
        drop_na_cols = check_nan_value(disk_smart_df,
                                       threshold=DROP_NAN_COL_THRESHOLD)
        disk_smart_df.drop(columns=drop_na_cols, inplace=True)

    # binarize the label: any non-zero fault tag becomes FAULT_LABEL
    disk_smart_df.loc[disk_smart_df[USING_LABEL] != 0, USING_LABEL] = FAULT_LABEL

    return disk_smart_df
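# A self-contained sketch of the clipping and deduplication steps used in
# _data_preprocess, run on a toy frame; the column names follow the original
# pipeline, but the data is invented for illustration.
import pandas as pd

toy = pd.DataFrame({
    'model': [1, 1, 2],
    'serial_number': ['a', 'a', 'b'],
    'dt': pd.to_datetime(['2018-07-01', '2018-07-01', '2018-09-05']),
    'power_on_hours': [120, 120, 0],
})

# clip to a date window, as _data_preprocess does when clip dates are given
toy = toy[(toy['dt'] >= '2018-07-01') & (toy['dt'] <= '2018-08-31')]
# drop duplicated (model, serial_number, dt) rows, keeping the first one
toy = toy.drop_duplicates(['model', 'serial_number', 'dt'], keep='first')
# drop rows with a zero power-on-hours reading
toy = toy[toy['power_on_hours'] != 0]
print(toy)  # a single row survives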
def insert(self, index_id, columns):
    """Inserts a single row using opened index.

    Raises ``ValueError`` if given data doesn't validate.

    :param integer index_id: id of opened index.
    :param list columns: list of column values for insertion. List must be
      ordered in the same way as columns are defined in opened index.
    :rtype: bool
    """
    if not check_columns(columns):
        raise ValueError('Columns must be a non-empty iterable.')

    query = chain(
        (str(index_id), '+', str(len(columns))),
        imap(encode, columns),
    )
    self._call(index_id, query, force_index=True)

    return True
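# Usage sketch for insert (hypothetical, not from the original source):
# `conn` and `index_id` are assumed as in the sketches above; column values
# are passed as strings in index definition order, and True is returned on
# success.
ok = conn.insert(index_id, ['43', 'spam', 'eggs'])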
def show_csv_config(request):
    recordtype = 'collectionobjects'
    matrix = deepcopy(RECORDTYPES[recordtype][2][0])
    # convert from dict of tuples to list of tuples
    matrix = [[m] + matrix[m] for m in matrix]
    # keep only the displayed columns, then sort by the trailing row id
    matrix = [[m[i] for i in (0, 1, 3, 6, 5)] for m in matrix]
    matrix = sorted(matrix, key=lambda x: x[4])
    # labels = 'input_column,cspace_field,context_tag,data_type,check_exists,row_id,authority or vocabulary'.split(',')
    labels = 'input column,cspace field,data type,authority or vocabulary,row id'.split(',')
    columnhandling = check_columns(labels, 'none', recordtype)
    # message = "%s 'actionable' fields configured in config file." % len(matrix)
    message = "'Mappable' fields for %s" % recordtype
    rtypes = [[RECORDTYPES[r][0], r] for r in RECORDTYPES.keys()]
    return labels, matrix, message, rtypes
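# A self-contained sketch of the dict -> sorted display-matrix conversion in
# show_csv_config; the toy mapping and its 6-element field layout are
# invented for illustration.
from copy import deepcopy

toy_config = {
    'objectNumber': ['objectNumber', '', 'string', 'yes', 1, ''],
    'title': ['titleGroup/title', 'title', 'string', 'no', 2, 'vocab'],
}
rows = deepcopy(toy_config)
# prepend the key so each row is [input column, <mapping fields...>]
rows = [[k] + rows[k] for k in rows]
# select the displayed columns (key, cspace field, data type,
# authority/vocabulary, row id), then sort by the row id in position 4
rows = [[r[i] for i in (0, 1, 3, 6, 5)] for r in rows]
rows = sorted(rows, key=lambda r: r[4])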
def feature_engineering(filename='',
                        fe_save_filename='train_fe.feather',
                        is_train=True,
                        use_2017_fault_data=False,
                        clip_start_date=None,
                        clip_end_date=None,
                        pred_start_date='2018-09-01',
                        pred_end_date='2018-09-30',
                        use_model_id=None,
                        num_processes=10):
    """Runs the feature-engineering pipeline over the SMART data.

    :return: feature-engineered DataFrame; in training mode the result is
      also saved to a feather file.
    """
    logger.info('Training-data feature engineering: %s, '
                'dataset clip start date: %s, dataset clip end date: %s' %
                (is_train, clip_start_date, clip_end_date))

    # load dataset
    disk_smart_df = _load_data_into_dataframe(filename, is_train)

    # preprocess data
    disk_smart_df = _data_preprocess(clip_start_date, clip_end_date,
                                     disk_smart_df, use_model_id,
                                     use_2017_fault_data, is_train)

    # generate cate feats
    fe_df = disk_smart_df.copy(deep=True)
    del disk_smart_df
    gc.collect()

    # fe_df['model_type'] = fe_df['model'].map({1: 0, 2: 1}).astype('category')
    # binarize selected continuous columns into 0/1 categorical flags
    for col in TRANSFORM_CONT_INTO_CAT_COLS:
        fe_df[col + '_cate'] = 0
        fe_df.loc[fe_df[col] > 0, col + '_cate'] = 1
        fe_df[col + '_cate'] = fe_df[col + '_cate'].astype('category')

    # bucket power-on hours (converted to days) into categorical bins
    fe_df['power_on_hours_in_day_unit'] = fe_df[POWER_ON_HOURS_COL] // 24
    if is_train:
        fe_df['power_on_hours_in_day_unit_cate'] = pd.cut(
            fe_df['power_on_hours_in_day_unit'],
            bins=BINS_FOR_CUT_POWER_ON_HOURS_FEAT,
            labels=False)
    else:
        fe_df['power_on_hours_in_day_unit_cate'] = pd.cut(
            fe_df['power_on_hours_in_day_unit'],
            POWER_ON_HOURS_CATE_LIST,
            labels=False)
    fe_df['power_on_hours_in_day_unit_cate'] = fe_df[
        'power_on_hours_in_day_unit_cate'].astype('category')
    fe_df.drop(columns=['power_on_hours_in_day_unit'], inplace=True)

    # generate cont feats
    # error weight combination features
    _get_combination_weight(fe_df, ERR_RECORD_COLS, 'err_weight')
    _get_combination_weight(fe_df, SEEK_ERR_COLS, 'seek_err_weight')
    _get_combination_weight(fe_df, DEGRADATION_ERR_COLS,
                            'degradation_err_weight')
    fe_df.drop(columns=TRANSFORM_CONT_INTO_CAT_COLS, inplace=True)

    # sliding window feature
    group_cols = ['model', 'serial_number']
    index_cols, cate_cols, cont_cols, label_cols = check_columns(
        fe_df.dtypes.to_dict())
    if is_train:
        # keep labels aside for joining back with feature engineered data
        label_df = fe_df[index_cols + label_cols]
    fe_df = fe_df[index_cols + cate_cols + cont_cols]
    fe_df = _sliding_window(fe_df, group_cols, cont_cols, cate_cols,
                            num_processes)

    # drop the cols with too many NaNs
    if is_train:
        drop_na_cols = check_nan_value(fe_df, threshold=DROP_NAN_COL_THRESHOLD)
        fe_df.drop(columns=drop_na_cols, inplace=True)

    if is_train:
        # get the label cols back
        fe_df = _merge_label_df_and_fe_df(label_df, fe_df, index_cols)
        del label_df
        gc.collect()
        fe_df.reset_index(drop=True, inplace=True)
        save_path = os.path.join(conf.DATA_DIR, fe_save_filename)
        fe_df.to_feather(save_path)
        logger.info('Feature engineering file saved to %s' % save_path)
    else:
        # restrict test data to the prediction window
        mask = fe_df.dt >= pred_start_date
        mask &= fe_df.dt <= pred_end_date
        fe_df = fe_df[mask]
        fe_df.reset_index(drop=True, inplace=True)

    return fe_df
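# A minimal, self-contained illustration of the pd.cut bucketing used above
# for the power-on-hours feature. The bin edges here are invented; the real
# ones live in BINS_FOR_CUT_POWER_ON_HOURS_FEAT / POWER_ON_HOURS_CATE_LIST.
import pandas as pd

hours = pd.Series([12, 500, 3000, 20000, 45000])
days = hours // 24  # same day-unit conversion as the pipeline

# labels=False yields the integer bin index instead of an Interval object,
# which can then be cast to a categorical dtype as in the pipeline.
bins = [-1, 30, 365, 1095, float('inf')]  # invented edges: ~1mo / 1yr / 3yr
cate = pd.cut(days, bins=bins, labels=False).astype('category')
print(cate.tolist())  # [0, 0, 1, 2, 3]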