# Example no. 1
# 0
def transform_to_np_csr(
        data_numeric: pandas.DataFrame,
        data_dummies: pandas.SparseDataFrame) -> sparse.csr_matrix:
    """Combine numeric and dummy columns into one float32 CSR matrix.

    The numeric frame is converted to a sparse frame (fill value 0), joined
    with ``data_dummies``, cast to ``float32``, converted to a COO matrix and
    finally to CSR. A progress message is printed before each stage and
    ``mem_util.print_mem_usage()`` is called after it.

    :param data_numeric: frame holding the numeric columns.
    :param data_dummies: sparse frame joined onto the sparsified numeric frame.
    :return: the result of the final ``tocsr()`` conversion.
    """
    # Each entry is (progress message, conversion applied to the current value).
    stages = (
        ("converting numeric training data to sparse df",
         lambda frame: frame.to_sparse(fill_value=0)),
        ("combining sparse numeric with sparse dummies (as data frame)",
         lambda frame: frame.join(data_dummies)),
        ("converting to float",
         lambda frame: frame.astype('float32')),
        ("converting to coo matrix",
         lambda frame: frame.to_coo()),
        ("converting to csr",
         lambda matrix: matrix.tocsr()),
    )

    # A single name is rebound on every stage so that the previous
    # intermediate is no longer referenced once its successor exists.
    current = data_numeric
    for message, convert in stages:
        print(message)
        current = convert(current)
        mem_util.print_mem_usage()

    return current
# })  (stray residue from extraction; commented out so the module parses)


def _get_encoded_file_name(file_suffix: str, sep='__') -> str:
    """Return the encoded base name followed by ``sep`` and ``file_suffix``.

    The base name is read from the module-level ``encoded_file_base_name``.
    """
    pieces = (encoded_file_base_name, sep, file_suffix)
    return "{}{}{}".format(*pieces)


def _get_result_file_path(file_suffix: str, sep='__') -> str:
    """Return the path of the encoded file for ``file_suffix`` under ``result_dir``."""
    file_name = _get_encoded_file_name(file_suffix, sep)
    return os.path.join(result_dir, file_name)


def _get_temp_data_path(file_suffix: str, sep='__') -> str:
    """Return the path of the encoded file for ``file_suffix`` under ``temp_dir``."""
    file_name = _get_encoded_file_name(file_suffix, sep)
    return os.path.join(temp_dir, file_name)


# Presumably reports the process memory usage before any data is loaded
# (mem_util is defined elsewhere — confirm what it prints).
mem_util.print_mem_usage()

# Load the pickled source frame from raw_path.
# NOTE(review): unpickling can execute arbitrary code; confirm raw_path only
# ever points at trusted files.
print(f"reading in data, may take a while: {raw_path}")
data_src: DataFrame = pandas.read_pickle(raw_path)
# Optional down-sampling: only applied when sample_n is set and positive.
if sample_n is not None and sample_n > 0:
    print(f"sampling: {sample_n}")
    data_src = data_src.sample(n=sample_n)

# The return values of clean() and shape() are discarded, so both presumably
# modify data_src in place — TODO confirm against gis_pps.
print("cleaning...")
gis_pps.clean(data_src)
mem_util.print_mem_usage()

print("shaping...")
gis_pps.shape(data_src, encode_app_date_bi_weeks=True)
mem_util.print_mem_usage()