Esempio n. 1
    def find_modify(self, index_id, operation, columns, modify_operation,
                    modify_columns=(), limit=0, offset=0):
        """Updates/deletes row(s) using opened index.

        Returns number of modified rows or a list of original values in case
        ``modify_operation`` ends with ``?``.

        Raises ``ValueError`` if given data doesn't validate.

        :param integer index_id: id of opened index.
        :param string operation: logical comparison operation to use over ``columns``.
            Currently allowed operations are defined in :const:`~.FIND_OPERATIONS`.
            Only one operation is allowed per call.
        :param iterable columns: list of column values for comparison operation.
            List must be ordered in the same way as columns are defined in
            opened index.
        :param string modify_operation: modification operation (update or delete).
            Currently allowed operations are defined in :const:`~.MODIFY_OPERATIONS`.
        :param iterable modify_columns: list of column values for update operation.
            List must be ordered in the same way as columns are defined in
            opened index. Only usable for *update* operation.
        :param integer limit: optional limit of results to change. Default is
            one row. In case multiple rows are expected to be changed, ``limit``
            must be set explicitly, HS won't change all found rows by default.
        :param integer offset: optional offset of rows to search for.
        :rtype: list
        """
        # Both operation codes are validated against the class-level whitelists
        # before anything is encoded.
        if operation not in self.FIND_OPERATIONS \
                or modify_operation not in self.MODIFY_OPERATIONS:
            raise ValueError('Operation is not supported.')

        if not check_columns(columns):
            raise ValueError('Columns must be a non-empty iterable.')

        # New values are only mandatory for the update-style operations
        # listed here; other operations may leave ``modify_columns`` empty.
        if modify_operation in ('U', '+', '-', 'U?', '+?', '-?') \
                and not check_columns(modify_columns):
            raise ValueError(
                'Modify_columns must be a non-empty iterable for update operation')

        # Request layout: index id, comparison operator, number of key values,
        # the encoded key values, limit/offset, the modify operator and finally
        # the encoded replacement values.
        query = chain((str(index_id), operation, str(len(columns))),
                      imap(encode, columns),
                      (str(limit), str(offset), modify_operation),
                      imap(encode, modify_columns))

        response = self._call(index_id, query, force_index=True)

        return response
Esempio n. 3
    def find(self, index_id, operation, columns, limit=0, offset=0):
        """Finds row(s) via opened index.

        Raises ``ValueError`` if given data doesn't validate.

        :param integer index_id: id of opened index.
        :param string operation: logical comparison operation to use over ``columns``.
            Currently allowed operations are defined in :const:`~.FIND_OPERATIONS`.
            Only one operation is allowed per call.
        :param iterable columns: list of column values for comparison operation.
            List must be ordered in the same way as columns are defined
            in opened index.
        :param integer limit: optional limit of results to return. Default is
            one row. In case multiple results are expected, ``limit`` must be
            set explicitly, HS won't return all found rows by default.
        :param integer offset: optional offset of rows to search for.
        :rtype: list
        """
        # Reject unknown comparison operators before building the request.
        if operation not in self.FIND_OPERATIONS:
            raise ValueError('Operation is not supported.')

        if not check_columns(columns):
            raise ValueError('Columns must be a non-empty iterable.')

        # Request layout: index id, operator, number of key values, the
        # encoded key values, then limit and offset.
        query = chain((str(index_id), operation, str(len(columns))),
                      imap(encode, columns), (str(limit), str(offset)))

        response = self._call(index_id, query, force_index=True)

        return response
def _data_preprocess(
    if use_model_id:
        disk_smart_df = disk_smart_df[disk_smart_df.model == use_model_id]
    disk_smart_df = disk_smart_df[disk_smart_df['dt'] >= clip_start_date] if clip_start_date is not None \
    else disk_smart_df
    disk_smart_df = disk_smart_df[disk_smart_df['dt'] <= clip_end_date] if clip_end_date is not None \
    else disk_smart_df

    if use_2017_fault_data:
        fault_data_2017_path = os.path.join(conf.DATA_DIR,
        fault_2017_df = pd.read_hdf(
        disk_smart_df = pd.concat([disk_smart_df, fault_2017_df], axis=0)

    # some task-specific clean rules
    index_cols, cate_cols, cont_cols, label_cols = check_columns(
    disk_smart_df.drop_duplicates(index_cols, keep='first', inplace=True)
    mask = (disk_smart_df[POWER_ON_HOURS_COL] != 0)
    disk_smart_df = disk_smart_df[mask]
    #     disk_smart_df.dropna(subset=[POWER_ON_HOURS_COL], inplace=True)

    if is_train:
        cols_with_unique_number = remove_cont_cols_with_unique_value(
            disk_smart_df, cont_cols, threshold=DROP_UNIQUE_COL_THRESHOLD)
        disk_smart_df.drop(columns=cols_with_unique_number, inplace=True)
        drop_na_cols = check_nan_value(disk_smart_df,
        disk_smart_df.drop(columns=drop_na_cols, inplace=True)
        disk_smart_df.loc[disk_smart_df[USING_LABEL] != 0,
                          USING_LABEL] = FAULT_LABEL
    return disk_smart_df
Esempio n. 6
    def insert(self, index_id, columns):
        """Inserts single row using opened index.

        Raises ``ValueError`` if given data doesn't validate.

        :param integer index_id: id of opened index.
        :param list columns: list of column values for insertion. List must be
            ordered in the same way as columns are defined in opened index.
        :rtype: bool
        """
        if not check_columns(columns):
            raise ValueError('Columns must be a non-empty iterable.')

        # Request layout: index id, the ``+`` (insert) operator, number of
        # values, then the encoded values themselves.
        query = chain((str(index_id), '+', str(len(columns))),
                      imap(encode, columns))

        # The response is discarded; reaching the return means the call
        # did not raise.
        self._call(index_id, query, force_index=True)

        return True
def feature_engineering(filename='',
    """'训练数据特征工程: %s,数据集截断起始日期:%s, 数据集截断结束日期:%s' %
                (is_train, clip_start_date, clip_end_date))

    # load dataset
    disk_smart_df = _load_data_into_dataframe(filename, is_train)

    # preprocess data
    disk_smart_df = _data_preprocess(clip_start_date, clip_end_date,
                                     disk_smart_df, use_model_id,
                                     use_2017_fault_data, is_train)
    """generate cate feats"""
    fe_df = disk_smart_df.copy(deep=True)
    del disk_smart_df

    fe_df['model_type'] = fe_df['model'].map({1: 0, 2: 1}).astype('category')
        fe_df[col + '_cate'] = 0
        fe_df.loc[fe_df[col] > 0, col + '_cate'] = 1
        fe_df[col + '_cate'] = fe_df[col + '_cate'].astype('category')

    fe_df['power_on_hours_in_day_unit'] = fe_df[POWER_ON_HOURS_COL] // 24
    if is_train:
        fe_df['power_on_hours_in_day_unit_cate'] = pd.cut(
        fe_df['power_on_hours_in_day_unit_cate'] = pd.cut(
    fe_df['power_on_hours_in_day_unit_cate'] = fe_df[
    fe_df.drop(columns=['power_on_hours_in_day_unit'], inplace=True)
    """ generate cont feats"""
    # error weight combination features
    _get_combination_weight(fe_df, ERR_RECORD_COLS, 'err_weight')
    _get_combination_weight(fe_df, SEEK_ERR_COLS, 'seek_err_weight')
    _get_combination_weight(fe_df, DEGRADATION_ERR_COLS,
    fe_df.drop(columns=TRANSFORM_CONT_INTO_CAT_COLS, inplace=True)

    # sliding window feature
    group_cols = ['model', 'serial_number']
    index_cols, cate_cols, cont_cols, label_cols = check_columns(

    if is_train:
        label_df = fe_df[
            index_cols +
            label_cols]  # for further joining with feature engineered data
    fe_df = fe_df[index_cols + cate_cols + cont_cols]
    fe_df = _sliding_window(fe_df, group_cols, cont_cols, cate_cols,

    # drop the col with too many nan
    if is_train:
        drop_na_cols = check_nan_value(fe_df, threshold=DROP_NAN_COL_THRESHOLD)
        fe_df.drop(columns=drop_na_cols, inplace=True)

    if is_train:
        fe_df = _merge_label_df_and_fe_df(
            label_df, fe_df, index_cols)  # get the label cols back
        del label_df
        fe_df.reset_index(drop=True, inplace=True)
        save_path = os.path.join(conf.DATA_DIR, fe_save_filename)
        fe_df.to_feather(save_path)'特征工程文件文件已保存至%s' % save_path)
        # get the prediction duration for predict data
        mask = fe_df.dt >= pred_start_date
        mask &= fe_df.dt <= pred_end_date
        fe_df = fe_df[mask]
        fe_df.reset_index(drop=True, inplace=True)
    return fe_df