Ejemplo n.º 1
0
    raise ValueError(
        "The number of categorical and non-categorical columns don't add up")

# For each frame in turn (train first, then test): print the number of missing
# categorical cells, replace them with the literal string "NA", and print the
# count again so the effect of the fill is visible in the output.
for _frame in (trainData, testData):
    print(_frame[categoricalVariables].isna().sum().sum())
    # Item assignment updates the frame object itself, so the change is
    # visible through the trainData / testData names as well.
    _frame[categoricalVariables] = _frame[categoricalVariables].fillna("NA")
    print(_frame[categoricalVariables].isna().sum().sum())

from dataPrepFunctions import binDataByQuartiles
import tqdm as tqdm

# Bin the non-categorical columns of both frames with binDataByQuartiles
# (low / med / high encoded as 0 / 1 / 2, per the original note).
trainData = binDataByQuartiles(trainData, nonCategoricalVariables)
testData = binDataByQuartiles(testData, nonCategoricalVariables)

print(trainData.shape)

# CHECK
## Abort if binning left any null value in the training columns;
## only the training frame is inspected here.
if trainData[nonCategoricalVariables].isna().sum().sum() != 0:
    raise ValueError('Null values introduced after binning!')
print("No Null Values")

# CHECK
## Report when the binned training columns contain no null values.
## This variant only prints on success; it does not raise when nulls exist.
if not trainData[nonCategoricalVariables].isna().values.any():
    print("No Null Values")
categoricalVariables = list(trainFeature.columns[trainFeature.dtypes == object])
print('2. categorical num is {}'.format(len(categoricalVariables)))            ### 45
'''
assert 'acute_rej_epi_22' in nonCategoricalVariables


### (a) fill missing categorical values with the literal string "NA"
trainFeature[categoricalVariables] = trainFeature[categoricalVariables].fillna("NA")
### (b) fill missing non-categorical values with the column's mode (most frequent value)
for feat in nonCategoricalVariables: ## nonCategoricalVariables, ['acute_rej_epi_22']
  # Series.mode() returns a Series: several entries when values tie for most
  # frequent, none when the column is entirely null. Calling float() on that
  # Series directly fails in both cases, so pick the first entry explicitly.
  modes = trainFeature[feat].mode()
  if modes.empty:
    raise ValueError(
        "Column {!r} has no non-null values to take a mode from".format(feat))
  fill_value = float(modes.iloc[0])
  trainFeature[feat] = trainFeature[feat].fillna(fill_value)


# The non-categorical columns must be free of nulls both before and after
# binning. Here binDataByQuartiles returns a pair: the binned frame and a
# second value stored as feature_map.
assert not trainFeature[nonCategoricalVariables].isna().values.any()
trainFeature, feature_map = binDataByQuartiles(trainFeature, nonCategoricalVariables)
assert not trainFeature[nonCategoricalVariables].isna().values.any()


# Rebuild both column lists from the frame's current dtypes. Columns with the
# "category" dtype form the non-categorical list (presumably the columns that
# were just binned -- confirm against binDataByQuartiles); object-dtype
# columns form the categorical list.
nonCategoricalVariables = [
    name for name, kind in trainFeature.dtypes.items() if kind == "category"
]
print('3. noncategorical num is {}'.format(len(nonCategoricalVariables)))      ### 59
categoricalVariables = [
    name for name, kind in trainFeature.dtypes.items() if kind == object
]
print('3. categorical num is {}'.format(len(categoricalVariables)))            ### 45


def PdFrame2FileLine(DataFrame, categoricalVariables, nonCategoricalVariables, nonFeatures, separate_symbol = ';'):
  #DataFrame = trainFeature, testFeature
  N = DataFrame.shape[0]
  feat_name = []
  #total_line = []
  num_feat = len(nonFeatures) + len(nonCategoricalVariables)
Ejemplo n.º 3
0
# Train and test must expose the same categorical columns before imputing.
assert categoricalVariables == categoricalVariables2
print('categorical num is {}'.format(len(categoricalVariables)))  ### 45
assert 'acute_rej_epi_22' in nonCategoricalVariables

# Missing categorical entries become the literal string "NA" in both frames.
trainFeature[categoricalVariables] = trainFeature[categoricalVariables].fillna(
    "NA")
testFeature[categoricalVariables] = testFeature[categoricalVariables].fillna(
    "NA")
# Missing non-categorical entries are filled with the mode of the TRAINING
# column; the same value is applied to the test frame.
for feat in nonCategoricalVariables:  ## nonCategoricalVariables, ['acute_rej_epi_22']
    # Series.mode() returns a Series: several entries when values tie for most
    # frequent, none when the column is entirely null. Calling float() on that
    # Series directly fails in both cases, so pick the first entry explicitly.
    modes = trainFeature[feat].mode()
    if modes.empty:
        raise ValueError(
            "Column {!r} has no non-null values to take a mode from".format(
                feat))
    fill_value = float(modes.iloc[0])
    trainFeature[feat] = trainFeature[feat].fillna(fill_value)
    testFeature[feat] = testFeature[feat].fillna(fill_value)

# Neither frame may contain nulls in the non-categorical columns going into
# the binning step ...
assert not trainFeature[nonCategoricalVariables].isna().values.any()
assert not testFeature[nonCategoricalVariables].isna().values.any()
trainFeature = binDataByQuartiles(trainFeature, nonCategoricalVariables)
testFeature = binDataByQuartiles(testFeature, nonCategoricalVariables)
# ... nor coming out of it.
assert not trainFeature[nonCategoricalVariables].isna().values.any()
assert not testFeature[nonCategoricalVariables].isna().values.any()

# Rebuild the column lists from each frame's current dtypes and require that
# train and test agree. Columns with the "category" dtype form the
# non-categorical list (presumably the columns that were just binned --
# confirm against binDataByQuartiles); object-dtype columns form the
# categorical list.
nonCategoricalVariables = [
    name for name, kind in trainFeature.dtypes.items() if kind == "category"
]
nonCategoricalVariables2 = [
    name for name, kind in testFeature.dtypes.items() if kind == "category"
]
assert nonCategoricalVariables == nonCategoricalVariables2
print('noncategorical num is {}'.format(len(nonCategoricalVariables)))  ### 59
categoricalVariables = [
    name for name, kind in trainFeature.dtypes.items() if kind == object
]
categoricalVariables2 = [
    name for name, kind in testFeature.dtypes.items() if kind == object
]
assert categoricalVariables == categoricalVariables2
print('categorical num is {}'.format(len(categoricalVariables)))  ### 45