Example #1
def missing_values_table(df: pd.DataFrame) -> pd.DataFrame:
    """Количество не заполненных значений в DataFrame.

    https://habr.com/post/414613/

    :param df: pd.DataFrame:

    """
    # Total missing values
    mis_val = df.isnull().sum()
    # Percentage of missing values
    mis_val_percent = 100 * df.isnull().sum() / len(df)
    # Table with the results
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    # Rename the columns
    mis_val_table_ren_columns = \
        mis_val_table.rename(
            columns={0: 'Missing Values', 1: '% of Total Values'}
        )
    # Sort by percentage of missing values, descending
    mis_val_table_ren_columns = \
        mis_val_table_ren_columns[mis_val_table_ren_columns.iloc[:, 1] != 0]\
        .sort_values('% of Total Values', ascending=False)\
        .round(6)
    # Summary info
    print("В выбранном датафрейме " + str(df.shape[1]) + " столбцов.\n"
          "Всего " + str(mis_val_table_ren_columns.shape[0]) +
          " столбцов с неполными данными.")
    # Return the table with missing-value info
    return mis_val_table_ren_columns
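A minimal usage sketch (the toy DataFrame below is hypothetical, not part of the original example):

import numpy as np
import pandas as pd

toy = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': [1, 2, 3]})
print(missing_values_table(toy))
# only column 'a' is reported, with 1 missing value (~33.33% of rows)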
Example #2
def clean(in_data: pd.DataFrame) -> pd.DataFrame:
    assert 'question1' in in_data.columns
    assert 'question2' in in_data.columns

    print("removing nan")
    # drop rows where either question is missing
    in_data = in_data[(~in_data["question1"].isna())
                      & (~in_data["question2"].isna())]

    print("fixing contractions")
    in_data['question1'] = np.vectorize(contractions.fix)(in_data['question1'])
    in_data['question2'] = np.vectorize(contractions.fix)(in_data['question2'])

    print("fixing emoji")
    in_data['question1'] = np.vectorize(emoji.demojize)(in_data['question1'])
    in_data['question2'] = np.vectorize(emoji.demojize)(in_data['question2'])

    print("cleaning")
    in_data['question1'] = clean_sentence(in_data['question1'])
    in_data['question2'] = clean_sentence(in_data['question2'])

    # drop any rows where cleaning produced missing questions
    in_data = in_data[(~in_data["question1"].isna())
                      & (~in_data["question2"].isna())]

    in_data['question1'] = in_data['question1'].str.lower()
    in_data['question2'] = in_data['question2'].str.lower()
    return in_data
Example #3
def get_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """
    Ze zadaneho dataframu zjistete chybejici hodnoty. Vyvorte DataFrame, ktery bude obsahovat v indexu jednotlive promenne
    a ve prvnim sloupci bude promenna 'Total' obsahujici celkovy pocet chybejicich hodnot a ve druhem sloupci promenna 'Percent',
    ve ktere bude procentualni vyjadreni chybejicich hodnot vuci celkovemu poctu radku v tabulce.
    DataFrame seradte od nejvetsich po nejmensi hodnoty.
    Vrattre DataFrame chybejicich hodnot a celkovy pocet chybejicich hodnot.

    Priklad:

               |  Total  |  Percent
    "Column1"  |   34    |    76
    "Column2"  |   0     |    0

    """

    column = list(df)
    sum_of_value = df.isnull().sum()
    percent = [round(i / len(df) * 100, 2) for i in sum_of_value.values]

    df_new = pd.DataFrame(list(zip(sum_of_value.values, percent)),
                          columns=['Total', 'Percent'],
                          index=column)
    df_new = df_new.sort_values('Total', ascending=False)

    return df_new
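A quick check of the behaviour on hypothetical toy data (assuming pandas is imported as pd):

toy = pd.DataFrame({'Column1': [1, None, None], 'Column2': [1, 2, 3]})
print(get_missing_values(toy))
# Column1: Total 2, Percent 66.67; Column2: Total 0, Percent 0.0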
Example #4
def fill_na(all_data: pd.DataFrame):

    all_data_na = get_na(all_data)
    # List the columns that contain missing values
    na_col_list = all_data.isnull().sum()[
        all_data.isnull().sum() > 0].index.tolist()

    # Check the dtypes of the columns with missing values
    all_data[na_col_list].dtypes.sort_values()

    # List of columns that have missing values and are float64
    float_list = all_data[na_col_list].dtypes[all_data[na_col_list].dtypes ==
                                              "float64"].index.tolist()

    # List of columns that have missing values and are object dtype
    obj_list = all_data[na_col_list].dtypes[all_data[na_col_list].dtypes ==
                                            "object"].index.tolist()

    # For float columns, fill missing values with 0
    all_data[float_list] = all_data[float_list].fillna(0)

    # For object columns, fill missing values with "None"
    all_data[obj_list] = all_data[obj_list].fillna("None")

    # Check that all missing values have been filled
    all_data.isnull().sum()[all_data.isnull().sum() > 0]

    return all_data
Example #5
def smart_cut(df:pd.DataFrame, threshold=0, weight_col=1):
    """Drop columns and raws with many nan, minimizing the loss of informative data-points
    
    Args: 
        threshold : percentage of nan left in the dataframe
        wheight_col : importance of columns, to penalize dropping of columns more (weight>1) or less (weight<1)
    
    Returns:
        df (pd.DataFrame): clean dataframe 
    """

    while (df.isnull().sum().sum())/(df.shape[0] * df.shape[1]) > threshold:

        worst_row = df.isnull().sum(axis=1).idxmax()
        worst_row_value = df.isnull().sum(axis=1).max()

        worst_col = df.isnull().sum(axis=0).idxmax()
        worst_col_value = df.isnull().sum(axis=0).max()

        # criterion: minimize loss of valid data points
        if (df.shape[1] - worst_row_value) <= (weight_col*(df.shape[0] - worst_col_value)):
            df = df.drop(worst_row)
        else:
            df = df.drop(worst_col,axis=1)

    return df
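A hedged usage sketch (toy data is hypothetical; assumes numpy as np and pandas as pd):

toy = pd.DataFrame({'x': [1, 2, 3, 4], 'y': [np.nan, np.nan, np.nan, 4]})
cleaned = smart_cut(toy, threshold=0.0)
print(cleaned)  # rows/columns are dropped until no NaN remains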
Example #6
def missing_values_table(df: pd.DataFrame) -> pd.DataFrame:
    """
    Counts and calculates null values per column

    :param df: features's DataFrame
    :return:
    """
    # Total missing values
    mis_val = df.isnull().sum()

    # Percentage of missing values
    mis_val_percent = 100 * df.isnull().sum() / len(df)

    # Make a table with the results
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)

    # Rename the columns
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})

    # Sort the table by percentage of missing descending
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:, 1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)

    # Print some summary information
    print("Hay " + str(df.shape[1]) + " columnas.\n"
                                      "Hay " + str(mis_val_table_ren_columns.shape[0])
          + " columnas con valores nulos")

    # Return the dataframe with missing information
    return mis_val_table_ren_columns
Example #7
def order_by_nan(df:pd.DataFrame):
    """Order dataframe (row and columns) according to presence of data, i.e. more data row-columns on top-left"""

    new_index = df.isnull().sum(1).sort_values().index
    new_col_index = df.isnull().sum().sort_values().index
    df = df.reindex(new_index)
    df = df.reindex(columns=new_col_index)
    return df
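A small illustration (hypothetical data; assumes numpy as np and pandas as pd):

toy = pd.DataFrame({'a': [1, np.nan, np.nan], 'b': [1, 2, 3]})
print(order_by_nan(toy))  # complete column 'b' moves to the left, fullest rows to the top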
Example #8
def hasnulls(df: pd.DataFrame, verbose: bool = False):
    nulls = df.isnull().sum().sum()
    print_c(verbose, "Number of nulls", nulls)
    if nulls > 0:
        print_c(verbose, df.isnull().sum().sort_values(ascending=False))
        return True
    else:
        return False
Example #9
def summary_missing_data(df: pd.DataFrame,
                         lowest_proportion: float = 0.0) -> pd.DataFrame:
    total = df.isnull().sum().sort_values(ascending=False)
    percent = (df.isnull().sum() /
               df.isnull().count()).sort_values(ascending=False)
    missing_data = pd.concat([total, percent],
                             axis=1,
                             keys=['Count', 'Percent'])
    return missing_data[missing_data['Percent'] > lowest_proportion]
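Usage sketch (hypothetical data; assumes pandas as pd):

toy = pd.DataFrame({'a': [1, None, None, None], 'b': [1, 2, 3, 4]})
print(summary_missing_data(toy, lowest_proportion=0.5))
# only 'a' (75% missing) exceeds the 0.5 cut-off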
Example #10
def missing_table(df: pd.DataFrame):
    null_val = df.isnull().sum()
    percent = 100 * df.isnull().sum() / len(df)
    missing_table = pd.concat([null_val, percent], axis=1)
    missing_table_len_columns = missing_table.rename(columns={
        0: "欠損数",
        1: "%"
    })
    return missing_table_len_columns
Example #11
def display_missing_data(df: pd.DataFrame) -> pd.DataFrame:
    total = df.isnull().sum().sort_values(ascending=False)
    percent = (df.isnull().sum() /
               df.isnull().count()).sort_values(ascending=False)
    missing_data = pd.concat([total, percent],
                             axis=1,
                             keys=['Total', 'Percent'])

    return missing_data
Example #12
def check_manynull_cols(input_df: pd.DataFrame, null_rate: float = 0.3):
    """欠損の割合が一定以上の列をprintするだけ
    Args:
        input_df(pd.DataFrame): input_df
        null_rate(float): 欠損が全体の何割以上の列を表示するか
    Return:
        nothing
    """
    n = len(input_df) * null_rate
    print(input_df.isnull().sum()[input_df.isnull().sum() > n])
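Usage sketch (hypothetical data; assumes numpy as np and pandas as pd):

toy = pd.DataFrame({'a': [np.nan, np.nan, 3.0], 'b': [1, 2, 3]})
check_manynull_cols(toy, null_rate=0.5)  # prints only column 'a' (2 of 3 values missing)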
Example #13
def filter_ferc714_hourly_demand_matrix(
    df: pd.DataFrame,
    min_data: int = 100,
    min_data_fraction: float = 0.9,
) -> pd.DataFrame:
    """
    Filter incomplete years from FERC 714 hourly demand matrix.

    Nulls respondent-years with too few data and
    drops respondents with no data across all years.

    Args:
        df: FERC 714 hourly demand matrix,
          as described in :func:`load_ferc714_hourly_demand_matrix`.
        min_data: Minimum number of non-null hours in a year.
        min_data_fraction: Minimum fraction of non-null hours between the first and last
          non-null hour in a year.

    Returns:
        Hourly demand matrix `df` modified in-place.
    """
    # Identify respondent-years where data coverage is below thresholds
    has_data = ~df.isnull()
    coverage = (
        # Last timestamp with demand in year
        has_data[::-1].groupby(df.index.year[::-1]).idxmax() -
        # First timestamp with demand in year
        has_data.groupby(df.index.year).idxmax()).apply(
            lambda x: 1 + x.dt.days * 24 + x.dt.seconds / 3600, axis=1)
    fraction = has_data.groupby(df.index.year).sum() / coverage
    short = coverage.lt(min_data)
    bad = fraction.gt(0) & fraction.lt(min_data_fraction)
    # Set all values in short or bad respondent-years to null
    mask = (short | bad).loc[df.index.year]
    mask.index = df.index
    df[mask] = np.nan
    # Report nulled respondent-years
    for mask, msg in [
        (short, 'Nulled short respondent-years (below min_data)'),
        (bad, 'Nulled bad respondent-years (below min_data_fraction)'),
    ]:
        row, col = mask.values.nonzero()
        report = (pd.DataFrame({
            'id': mask.columns[col],
            'year': mask.index[row]
        }).groupby('id')['year'].apply(lambda x: np.sort(x)))
        with pd.option_context('display.max_colwidth', -1):
            logger.info(f'{msg}:\n{report}')
    # Drop respondents with no data
    blank = df.columns[df.isnull().all()].tolist()
    df.drop(columns=blank, inplace=True)
    # Report dropped respondents (with no data)
    logger.info(f'Dropped blank respondents: {blank}')
    return df
Example #14
def missing_data_ratio(df: pd.DataFrame, display=False):
    # missing data
    total = df.isnull().sum().sort_values(ascending=False)
    percent = (df.isnull().sum() /
               df.isnull().count()).sort_values(ascending=False)
    missing_data = pd.concat([total, percent],
                             axis=1,
                             keys=['Total', 'Percent'])
    if display:
        print(missing_data.head(20))
    return missing_data
Example #15
def remove_features_by_missing_data_ratio(df: pd.DataFrame,
                                          fraction: float = 0.15,
                                          missing_data=None):
    if missing_data is None:
        missing_data = missing_data_ratio(df)
    # dealing with missing data
    df = df.drop((missing_data[missing_data['Percent'] > fraction]).index, axis=1)
    # df = df.drop(df.loc[df['Electrical'].isnull()].index)
    df.isnull().sum().max(
    )  # just checking that there's no missing data missing...
    return df
Example #16
def all_valid_verification(X: pd.DataFrame, y: pd.DataFrame) -> bool:
    """
    A verification method that requires all entries in both the feature set and the label set to be non-null.
    This function depends on the specific use case.
    """
    if np.any(X.isnull()):
        return False
    elif np.any(y.isnull()):
        return False
    elif len(X) == 0:
        return False
    return True
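Usage sketch (hypothetical data; assumes numpy as np and pandas as pd):

X = pd.DataFrame({'f1': [1.0, 2.0], 'f2': [3.0, np.nan]})
y = pd.DataFrame({'label': [0, 1]})
print(all_valid_verification(X, y))  # False, because X contains a NaN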
Example #17
def kesson_table(df: pd.DataFrame) -> pd.DataFrame:
    """generate table that contains lack count and percentage
    
    Arguments:
        df {pd.DataFrame} -- source data
    
    Returns:
        pd.DataFrame -- columns={0: "欠損数", 1: "%"}
    """
    null_val = df.isnull().sum()
    percent = 100 * df.isnull().sum() / len(df)
    kesson_table = pd.concat([null_val, percent], axis=1)
    kesson_table_ren_columns = kesson_table.rename(columns={0: "欠損数", 1: "%"})
    return kesson_table_ren_columns
Example #18
def reduce_vars_corr(df: pd.DataFrame,
                     field_names: list,
                     max_num: float,
                     imputer: str = 'knnimpute'):
    num_vars = len(field_names) - 1
    print('Current vars:  {0}'.format(num_vars))
    if not max_num or max_num < 1:
        if not max_num:
            max_num = 0.5
        max_num = int(np.power(df.shape[0], max_num))

    print('Max allowed vars: {0}'.format(max_num))

    if num_vars > max_num:

        if df.isnull().any().any():
            imputed_df, field_names = impute_if_any_nulls(
                df.loc[:, field_names].astype(float))
            for n in field_names:
                df[n] = imputed_df[n]
        # Creates Correlation Matrix
        corr_matrix = df.loc[:, field_names].corr()

        max_corr = [(fld, corr_matrix.iloc[i + 1, :i].max())
                    for i, fld in reverse_enumerate(field_names[1:])]
        max_corr.sort(key=lambda tup: tup[1])

        return_x_vals = [fld for fld, corr in max_corr[:max_num]]
        print('Number of Remaining Fields: {0}'.format(len(return_x_vals)))
        print('Remaining Fields: {0}'.format(return_x_vals))
        return df, return_x_vals

    return df, field_names
Example #19
def impute_if_any_nulls(impute_df: pd.DataFrame, verbose: bool = False):
    from fancyimpute import BiScaler, NuclearNormMinimization, MatrixFactorization, IterativeSVD
    impute_names = impute_df.columns.values.tolist()
    impute_index = impute_df.index.values
    for imputer in [
            BiScaler, NuclearNormMinimization, MatrixFactorization,
            IterativeSVD
    ]:
        if impute_df.isnull().any().any():
            print(
                f'Imputation: Null values are in the DF. Running imputation using "{imputer.__name__}"'
            )
            impute_df = imputer(verbose=verbose).fit_transform(
                impute_df.values)
            impute_df = pd.DataFrame(data=impute_df,
                                     columns=impute_names,
                                     index=impute_index)
        else:
            break
    # else:
    #     print('Imputation: Unable to eliminate all NULL values from the dataframe! FIX THIS!')

    for n in impute_names.copy():
        if impute_df[n].isnull().any().any():
            print('Field [{0}] was still empty after imputation! Removing it!'.
                  format(n))
            impute_names.remove(n)

    return impute_df, impute_names
Example #20
def remove_nans(
    counterfactuals: pd.DataFrame,
    factuals: pd.DataFrame = None
) -> Union[Tuple[pd.DataFrame, pd.DataFrame], pd.DataFrame]:
    """Remove instances for which a counterfactual could not be found.

    Parameters
    ----------
    counterfactuals:
        Has to be the same shape as factuals.
    factuals:
        Has to be the same shape as counterfactuals. (optional)

    Returns
    -------
    The counterfactuals (and, if given, the factuals) with the unsuccessful rows removed.

    """
    # get indices of unsuccessful counterfactuals
    nan_idx = counterfactuals.index[counterfactuals.isnull().any(axis=1)]
    output_counterfactuals = counterfactuals.copy()
    output_counterfactuals = output_counterfactuals.drop(index=nan_idx)

    if factuals is not None:
        if factuals.shape[0] != counterfactuals.shape[0]:
            raise ValueError(
                "Counterfactuals and factuals should contain the same amount of samples"
            )
        output_factuals = factuals.copy()
        output_factuals = output_factuals.drop(index=nan_idx)
        return output_counterfactuals, output_factuals

    return output_counterfactuals
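Usage sketch (hypothetical data; assumes numpy as np and pandas as pd):

cfs = pd.DataFrame({'x': [1.0, np.nan], 'y': [2.0, 3.0]})
facts = pd.DataFrame({'x': [0.5, 0.7], 'y': [1.5, 2.5]})
cf_clean, f_clean = remove_nans(cfs, facts)  # row 1 is dropped from both frames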
Example #21
def _check_Xy(X: pd.DataFrame,
              y: pd.Series, *,
              norm_y=False) -> Tuple[pd.Series, pd.Series]:
    if np.ndim(X) == 1:
        X = pd.Series(X).to_frame()
    elif np.ndim(X) == 2:
        X = pd.DataFrame(X)

    assert X.ndim == 2
    assert np.ndim(y) == 1
    assert len(X) == len(y)

    valid = ~X.isnull().any(axis=1).values
    X = pd.Series(list(zip(*X.values[valid].T)),
                  name=tuple(X.columns)).astype('category')
    y = pd.Series(y).reset_index(drop=True)[valid]

    if is_object_dtype(y):
        y = pd.Categorical(y)

    if norm_y:
        assert is_numeric_dtype(y)
        y = (y - y.mean()) / y.std()

    return X, y
Example #22
def remove_nan(df: pd.DataFrame) -> pd.DataFrame:

    if df.isnull().values.any():
        print(f'Data not OK, removing nan values..')
        print()
        nan_values = []
        indices = list(np.arange(df.shape[1]))
        for j in range(df.shape[1]):
            nan_values.append(df[j].isnull().sum().sum())

        print(f'Before:')
        print(f"Indices:    {indices}")  #index of feature
        print(f"NaN values: {nan_values}"
              )  #number of nan values corresponding to each feature
        print()

        df = df.fillna(df.median())  #replacing nan with median

        nan_values = []
        indices = list(np.arange(df.shape[1]))
        for j in range(df.shape[1]):
            nan_values.append(df[j].isnull().sum().sum())

        print(f'After:')
        print(f"Indices:    {indices}")  #index of feature
        print(f"NaN values: {nan_values}"
              )  #number of nan values corresponding to each feature
        print()

    else:
        print(f"Data has no NaN values")

    return df
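Usage sketch (hypothetical data; note the function indexes columns by integer position, so default integer column labels are assumed):

toy = pd.DataFrame([[1.0, np.nan], [2.0, 4.0], [3.0, 6.0]])
cleaned = remove_nan(toy)  # the NaN in column 1 is replaced by that column's median (5.0)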
Example #23
def check_if_valid_data(df: pd.DataFrame) -> bool:
    # Check if dataframe is empty
    if df.empty:
        print("No songs downloaded. Finishing execution")
        return False

    # Primary Key Check
    if pd.Series(df['played_at']).is_unique:
        pass
    else:
        raise Exception("Primary Key check is violated")

    # Check for nulls
    if df.isnull().values.any():
        raise Exception("Null values found")

    # Check that all timestamps are of yesterday's date
    yesterday = datetime.datetime.now() - datetime.timedelta(days=1)
    yesterday = yesterday.replace(hour=0, minute=0, second=0, microsecond=0)

    timestamps = df["timestamp"].tolist()
    for timestamp in timestamps:
        if datetime.datetime.strptime(timestamp, '%Y-%m-%d') != yesterday:
            raise Exception(
                "At least one of the returned songs does not have a yesterday's timestamp"
            )

    return True
Example #24
def check_data(df: pd.DataFrame) -> bool:
    list_error = []
    # is it empty?
    if df.empty:
        list_error.append("Nothing downloaded. Perhaps you listened to no songs that day. Execution finished")

    # are there duplicates?
    if not pd.Series(df["played_at"]).is_unique:
        list_error.append("Primary Key is violated, hence there are duplicates in the data")

    # are there null values?
    if df.isnull().values.any():
        list_error.append("Null values found")

    # is it last 24 hours ?
    '''yesterday_ = datetime.datetime.now() - datetime.timedelta(days=1)
    yesterday_ = yesterday_.replace(hour=0, minute=0, second=0, microsecond=0)
    today_ = datetime.datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
    yesterday_list = [yesterday_, today_]
    timestamps = df["timestamp"].to_list()

    for timestamp in timestamps:
        if datetime.datetime.strptime(timestamp, "%Y-%m-%d") not in yesterday_list:
            print(yesterday_)
            list_error.append("Something wrong with timestamps")
            raise Exception("Something wrong with timestamps")'''

    if list_error:
        for error in list_error:
            print(error)
        return False
    return True
Example #25
 def _check_metadata(self, df: pd.DataFrame):
     """Check metadata LOCATIONS tab has valid format."""
     # Check columns
     cols = ["location", "source_name", "automated", "include"]
     cols_missing = [col for col in cols if col not in df.columns]
     cols_wrong = [col for col in df.columns if col not in cols]
     if cols_missing:
         raise ValueError(f"LOCATIONS missing column(s): {cols_missing}.")
     if cols_wrong:
         raise ValueError(f"LOCATIONS has invalid column(s): {cols_wrong}.")
     # Check duplicated rows
     location_counts = df.location.value_counts()
     if (location_counts > 1).any():
         locations_dup = location_counts[location_counts > 1].index.tolist()
         raise ValueError(
             f"Duplicated location(s) found in LOCATIONS. Check {locations_dup}"
         )
     if df.isnull().any(axis=None):
         raise ValueError(
             "Check LOCATIONS. Some fields missing (empty / NaNs)")
     # Ensure booleanity of columns automated, include
     if not df.automated.isin([True, False]).all():
         vals = df.automated.unique()
         raise ValueError(
             f"LOCATIONS column `automated` should only contain TRUE/FALSE. Check {vals}"
         )
     if not df.include.isin([True, False]).all():
         vals = df.include.unique()
         raise ValueError(
             f"LOCATIONS column `include` should only contain TRUE/FALSE. Check {vals}"
         )
Example #26
    def _fully_connect_outflows(self,
                                outflows_data: pd.DataFrame) -> pd.DataFrame:
        """Helper function for get_data_inputs that ensures outflows_data is fully connected."""
        # Handle sparse outflow events where a disaggregation is missing data for some time steps
        fully_connected_columns = self.data_dict["disaggregation_axes"] + [
            "compartment",
            "outflow_to",
            "time_step",
        ]
        outflows_data = (outflows_data.groupby(fully_connected_columns)
                         ["total_population"].sum().unstack(
                             level=["time_step"]))

        # Raise a warning if there are any disaggregations without outflow records for more than 25% of the time steps
        missing_event_threshold = 0.25
        number_of_missing_events = outflows_data.isnull().sum(axis=1)
        sparse_disaggregations = number_of_missing_events[
            number_of_missing_events /
            len(outflows_data.columns) > missing_event_threshold]
        if not sparse_disaggregations.empty:
            warn(
                f"Outflows data is missing for more than {missing_event_threshold * 100}% for some disaggregations:\n"
                f"{100 * sparse_disaggregations / len(outflows_data.columns)}")

        # Fill the total population with 0 and remove the multiindex for the population simulation
        return (outflows_data.fillna(0).stack("time_step").reset_index(
            name="total_population"))
Example #27
def get_data_metadata(X: DataFrame, y: Series) -> dict:
    X_raw = convert_to_raw(X)

    feature_metadata_orig = FeatureMetadata.from_df(X)
    feature_metadata_raw = FeatureMetadata.from_df(X_raw)

    num_rows, num_cols = X.shape
    num_null = X.isnull().sum().sum()

    try:
        problem_type = infer_problem_type(y, silent=True)
    except:
        # TODO: Remove, only here for legacy compatibility
        problem_type = infer_problem_type(y)
    if problem_type in ['binary', 'multiclass']:
        num_classes = len(y.unique())
    else:
        num_classes = None

    data_metadata = {
        'num_rows': num_rows,
        'num_cols': num_cols,
        'num_null': num_null,
        'num_classes': num_classes,
        'problem_type': problem_type,
        'feature_metadata': feature_metadata_orig,
        'feature_metadata_raw': feature_metadata_raw,
    }
    # TODO: class imbalance
    # TODO: has_text
    # TODO: has_special
    # TODO: memory size

    return data_metadata
Example #28
    def __init__(self, evidence_dataframe: pd.DataFrame):
        """
        L(data|xi, signal *or* noise) = L(data|signal)*xi +(1-xi)*L(data|noise)
        where xi --> p(signal)

        Parameters
        ----------

        evidence_dataframe: pandas dataframe
        """
        bilby.Likelihood.__init__(
            self,
            parameters={
                DUTY_CYCLE: None,
                GLITCH_H1_DUTY_CYCLE: None,
                GLITCH_L1_DUTY_CYCLE: None,
            },
        )
        nan_present = evidence_dataframe.isnull().values.any()
        assert not nan_present, "NaN present in the evidence dataframe!"
        self.log_evidence = evidence_dataframe[rkeys.LOG_EVIDENCE].values
        self.log_noise_evidence = evidence_dataframe[
            rkeys.LOG_NOISE_EVIDENCE].values
        self.log_glitch_H_evidence = evidence_dataframe[
            rkeys.LOG_GLITCH_H_EVIDENCE].values
        self.log_glitch_L_evidence = evidence_dataframe[
            rkeys.LOG_GLITCH_L_EVIDENCE].values
Example #29
def add_no_response_ratio(df_clean_log: pd.DataFrame,
                          df_whole_v4_dirty: pd.DataFrame):
    """
    Compute new features: no_response_ratio and invalid_ratio.
    For each interview, calculate the proportion of no-response and invalid answers.
    """
    df_clean_log = df_clean_log.drop_duplicates()
    df_invalid = df_clean_log[['question', "uuid"]].groupby("uuid").count()

    # remove all dummy columns
    for col in df_whole_v4_dirty.columns:
        if '/' in col:
            df_whole_v4_dirty = df_whole_v4_dirty.drop(col, axis=1)

    df_whole_v4_dirty['no_response_ratio'] = df_whole_v4_dirty.isnull().sum(
        axis=1)
    df_whole_v4_dirty['invalid_ratio'] = 0

    _, n_col = df_whole_v4_dirty.shape
    df_whole_v4_dirty = df_whole_v4_dirty.apply(cal_no_r_count,
                                                axis=1,
                                                df_invalid=df_invalid)
    df_whole_v4_dirty[
        'no_response_ratio'] = df_whole_v4_dirty['no_response_ratio'] / n_col
    df_whole_v4_dirty[
        'invalid_ratio'] = df_whole_v4_dirty['invalid_ratio'] / n_col
    return df_whole_v4_dirty['no_response_ratio'], df_whole_v4_dirty[
        'invalid_ratio']
Example #30
def df_missing_vals(df: pd.DataFrame) -> pd.Series:
    """
    return Series containing the number of NaNs for each column that
    contains at least one
    """
    null_counts = df.isnull().sum()
    return null_counts[null_counts > 0]
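Usage sketch (hypothetical data; assumes pandas as pd):

toy = pd.DataFrame({'a': [1, None], 'b': [1, 2]})
print(df_missing_vals(toy))  # Series with a single entry: a -> 1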
Example #31
    def input_design(self, value: pd.DataFrame):
        if not value.equals(self.input_design):
            # if the dataframe is different from the one currently displayed,
            # warn about layout changes and reset the case number and status cols
            self.layoutAboutToBeChanged.emit()

            self._input_design = value
            self.dataChanged.emit(
                self.index(0, 0),
                self.index(self.rowCount(), self.columnCount()))
            self.is_input_design_generated = False \
                if value.isnull().all(axis=None) else True
            self.input_design_changed.emit(self.is_input_design_generated)

            self.layoutChanged.emit()

            # number of experiments
            n_samp = self.app_data.doe_lhs_settings['n_samples']
            if self.app_data.doe_lhs_settings['inc_vertices']:
                n_samp += 2**len(self._input_alias)  # include vertices

            # create empty dataframes with NaN values
            self._case_num = pd.DataFrame({'case': np.arange(1, n_samp + 1)})
            self._status_sim = pd.DataFrame({'status': [''] * n_samp},
                                            dtype=object)

            # reset sampled data as well
            self.samp_data = pd.DataFrame(np.nan,
                                          index=range(n_samp),
                                          columns=self._output_alias,
                                          dtype=float)
Example #32
type(np.nan)

## create a sample data set
zip1 = zip([2, 4, 8],
            [np.nan, 5, 7],
            [np.nan, np.nan, 22])

df1 = DataFrame(zip1, columns = ['a', 'b', 'c'])
df1


## finding missing values with the pandas DataFrame method `isnull` and numpy `isnan`
## both return boolean values (True/False)

# search a whole dataframe
df1.isnull()
np.isnan(df1)

# search specific columns
cols = ['a', 'c'] # create a list of column keys
df1[cols]
df1[cols].isnull()

# also works on a series
df1['b']
df1['b'].isnull()

# pandas also has a negation of `isnull`, `notnull`
df1.isnull()
df1.notnull()
df1.isnull() == df1.notnull() # all false! perfectly opposite
Example #33
# Handling missing values in a DataFrame
#%%
import numpy as np
from numpy import arange
from pandas import DataFrame
from string import ascii_letters as letters  # string.letters is Python 2 only
d = DataFrame(arange(100.0).reshape(10, 10), columns=list(letters[:10]))
#%%
d[d%13==0]=np.nan
d[d%17==0]=np.nan
d[(d>=80) & (d<90)]=np.nan
#%%
d
#%% drop every row that contains any null value
d.dropna()
#%% only drop rows where every value is null
d.dropna(how='all')
#%% drop a row as soon as any single column is null
d.dropna(how='any')
#%% require at least 9 non-null values per row
d.dropna(thresh=9)
#%% drop along the column axis instead
d.dropna(thresh=8,axis=1)
#%% check which entries are null with the isnull method
d.isnull()
#%% fill null values
d.fillna(0)
#%% 
d.fillna(dict(zip(letters[:10],range(-1,-11,-1))))
# Note: dropna works on rows by default, while fillna with method='ffill' fills down each column by default
#%% 
d.fillna(method='ffill',limit=1)
Example #34
class DataWorker(object):
	
	
	@staticmethod
	def feat_value2int(series):
	    all_values = list(enumerate(np.unique(series)))
	    value_dict = {name : i for i,name in all_values}
	    return value_dict
	
	
	def __init__(self,data=None):
		"""
		Init DataWorker with pandas.DataFrame
		Otherwise make sure that the rdata can be transformed to DataFrame.
		"""
		if data is None:
			self.__data = DataFrame()
		elif isinstance(data, DataFrame):
			self.__data = data.copy()
		else:
			self.__data = DataFrame(data)
		
		self.__featureDict = None

		
	@property
	def featureDict(self):
		return self.__data.select_dtypes(include=['object'])
		
	@featureDict.setter
	def featureDict(self,value):
		pass
		
	@property
	def data(self):
		return self.__data
	@data.setter
	def data(self,df):
		self.__data = df
	
	def getColNamesWithNan(self):
		s = self.__data.isnull().any()
		return	s.index[s==True].tolist()
	
	def dataClean(self,transDict = None,fillna={'all':'most_frequent'},yCol = -1):
		"""
		yCol: the col you wanna predict
		fillna: 
			{columnn:method_name} dictionary
			default:{'all':'most_frequent'}
			provied functions are : 'most_frequent','mean','median','first_n_frequent,n'(where the last n is a number)
			when key =='all' : fill column which include na with the same function,
			this key is suggested to put at the end
		"""
		
		# try to map all data to numeric

		self.__data = cd.fillna(self.__data,fillna)

		if transDict == None:
			self.__featureDict
		if yCol != -1:
			self.__data = cd.change_yCol(self.__data,yCol)
		
		
	def algorithmUsing(self):
		pass

	def showFeagure(self):
		pass

	def getResult(self):
		pass
Example #35
df[0:1]                                              # to index a row use ranges
df.ix[0]                                             # .ix() indexes rows but outputs the row as a column
df.ix[1:2, 'col1'] = 666                             # to index or modify a specific cell
df3 = df.ix[['A','B','C','D','E','F'],new_columns]   # .ix() can be used to add rows and columns simultaneously
df2 = df.reindex(['A','B','C','D','E','F'])          # .reindex() to add new rows (in this case 'C')
df2 = df.reindex(columns=new_columns)                # .reindex() can also add new columns
df3['newCol'] = a_list                               # you can use lists to fill columns
df4 = df3.drop('newCol', axis = 1)                   # drop a column 
df5 = df4.rename(columns={'col1':'test1'})           # rename a column 
df6 = df5.rename(index={'A':'Alpha'})                # rename an index
dflist = df['col2'].tolist()                         # convert a column to a list
df6.index = df6.index.map(str.lower)                 # bulk convert all indexes to lower case
df7 = df6.rename(index=str.title, 
                 columns=str.title)                  # bulk convert indexes and columns to capital first letter
df > 0                                               # conditional operators create new, boolean dataframes
df.isnull()                                          # check entire dataframe for NaNs


#############################################################################################################
# 3. Sorting Dataframes
#############################################################################################################
df.sort_index()                                      # sort by index (ascending by default)
df.sort(columns = ['Col1','Col2'],
        ascending = True, inplace = True)            # sort by specified column

#############################################################################################################
# 4. Missing Values
#############################################################################################################

df5 = DataFrame([[1,2,3,np.nan],[np.nan,5,6,7],[7,np.nan,9,np.nan],[np.nan,np.nan,np.nan,np.nan]])
Example #36
    (?P<domain>[A-Z0-9.-]+)
    \.
    (?P<suffix>[A-Z]{2,4})""", flags=re.IGNORECASE|re.VERBOSE)

m = regex.match('*****@*****.**')
m.groupdict()


### vectorized string functions in pandas
data = {'Dave': '*****@*****.**', 'Steve': '*****@*****.**',
        'Rob': '*****@*****.**', 'Wes': np.nan}
data = Series(data)

data

data.isnull()

data.str.contains('gmail')

pattern

data.str.findall(pattern, flags=re.IGNORECASE)

matches = data.str.match(pattern, flags=re.IGNORECASE)
matches

matches.str.get(1)

matches.str[0]

data.str[:5]
Example #37
def create_log_features2(ids, feature_df):
    """Creates log features2

    Parameters
    ----------
    ids : data frame with id column
    feature_df : data frame with columns
                 id - id
                 log_feature - name of feature in the form "log_feature space feature's number"
                               ex. log_feature 56
                 volume - volume
    Return
    ------
    df : data frame with columns
         id - id
         max_log_feature - maximum feature's number
         min_log_feature - minimum feature's number
         median_log_feature - median feature's number
         count_log_feature - number of different features
         max_volume - maximum volume
         min_volume - minimum volume
         median - median volume
         count_volume - the same as count_log_features, consider dropping!
    """

    print "CREATING LOG FEATURES"
    start_time = time.time()

    all_df = DataFrame(ids, columns=['id'])

    feature_df['log_feature'] = feature_df['log_feature'].apply(lambda val: int(val.split()[1]))

    names = ('log_feature', 'volume')

    for name in names:
        gdf = feature_df[['id', name]].groupby('id', as_index=False)

        max_feature = gdf.max()
        max_feature.columns = ['id', 'max_' + name]
        all_df = pd.merge(all_df, max_feature, how='inner', on='id')

        min_feature = gdf.min()
        min_feature.columns = ['id', 'min_' + name]
        all_df = pd.merge(all_df, min_feature, how='inner', on='id')

        median_feature = gdf.median()
        median_feature.columns = ['id', 'median_' + name]
        all_df = pd.merge(all_df, median_feature, how='inner', on='id')

        count_feature = gdf.count()
        count_feature.columns = ['id', 'count_' + name]
        all_df = pd.merge(all_df, count_feature, how='inner', on='id')

    # check whether there are null entries
    if all_df.isnull().values.any():
        print("ERROR: there are null entries in the all_df data frame")

    elapsed_time = time.time() - start_time
    print("ALL FEATURES WERE SUCCESSFULLY CREATED. TIME ELAPSED " + str(elapsed_time) + " sec.")

    return all_df
Example #38
# QUICK TIP: you can repeat lists by multiplying!
[1,2,3]
[1,2,3]*3

# types missing data
None
np.nan
type(None)
type(np.nan)

## create a sample data set
zip1 = zip([2,4,8], [np.nan, 5, 7], [np.nan, np.nan, 22])
df1 = DataFrame(zip1, columns = ['a', 'b', 'c'])

## search for missing data using
df1.isnull() # pandas method to find missing data
np.isnan(df1) # numpy way

## subset of columns
cols = ['a', 'c']
df1[cols]
df1[cols].isnull()

## for series
df1['b'].isnull()

## find non-missing values
df1.isnull()
df1.notnull()
df1.isnull() == df1.notnull()
Example #39
# b) To enrich the information in our data, append to our
# DataFrame a DataFrame object with the following information:
new_data = {'equipo': ['Atletico de Madrid'],
            'titulos': [29],
            'socios': [48008]}

equipos = equipos.append(new_data,ignore_index=True)


# c) Create a new column 'posicion' with the following data:
posicion_values = ['13', np.nan, '3', np.nan, '5', np.nan]

equipos['posicion'] = posicion_values

# d) Show the position of the elements that are NA in our DataFrame.
# This would show the complete rows
equipos[equipos.isnull().any(axis=1)]
# This would show only the positions, understood as the indices
equipos[equipos.isnull().any(axis=1)].index

# e) Show our DataFrame without the rows that contain NA elements.
equipos.dropna()



string_data = Series(['aardvark', 'artichoke', np.nan,'avocado'])
string_data
string_data.isnull()
string_data[0] = None
string_data.isnull()