import pandas as pd
import numpy as np
import missingpy
from _datetime import datetime

# mask = np.loadtxt('mask_pattern.csv', delimiter=',')
# data = pd.read_csv('final_data_missing.csv')


# mask = np.loadtxt('mask_pattern_mcar.csv', delimiter=',')
# Load the dataset that still contains missing entries.
data = pd.read_csv('final_data_missing80n500.csv')

print(data)

imputer = missingpy.MissForest()


# Print the wall-clock time just before the imputation starts.
now1 = datetime.now()
current_time = now1.strftime("%H:%M:%S")
print("Starting Time =", current_time)

Data_Imputed = imputer.fit_transform(data)


# Print the wall-clock time once the imputation has finished.
now1 = datetime.now()
current_time = now1.strftime("%H:%M:%S")
print("Final Time =", current_time)

# Rebuild the frame with the input's column labels (and row index) so the
# written CSV keeps the original header instead of the positional 0..n-1
# labels that a bare pd.DataFrame(Data_Imputed) would produce.
df = pd.DataFrame(Data_Imputed, columns=data.columns, index=data.index)

df.to_csv('MissForestImputed80n500.csv', index=False)
Esempio n. 2
0
# Columns of `train` used as candidate predictors for the Lasso-based
# feature selection below.
features_train = [
    "full_sq", "metro_min_walk", "big_market_km", "workplaces_km",
    "university_km", "cafe_count_1000", "shopping_centers_km", "office_km",
    "big_church_km", "school_education_centers_top_20_raion",
    "build_count_after_1995", "cafe_count_1500_price_500", "market_count_500",
    "oil_chemistry_km", "railroad_km", "ts_km", "young_all", "work_male",
    "work_female", "ekder_all", "build_count_mix", "build_count_1971-1995",
    "build_count_1946-1970", "build_count_1921-1945", "build_count_before_1920"
]

# Work on an explicit copy: the column assignment below would otherwise be a
# chained assignment on a slice of `train` (SettingWithCopyWarning).
features_subset = train[features_train].copy()

# Mean-fill metro_min_walk up front; the remaining gaps are left to MissForest.
features_subset["metro_min_walk"] = features_subset["metro_min_walk"].fillna(
    features_subset["metro_min_walk"].mean())

rf_imp = missingpy.MissForest(criterion="mse")
filled_features = rf_imp.fit_transform(features_subset)

# Keep the source row index: the target/timestamp assignments below align on
# index labels, so a fresh 0..n-1 index would misalign rows (or yield NaN)
# whenever `train` does not carry a default RangeIndex.
data_imp = pd.DataFrame(
    filled_features,
    columns=features_subset.columns,
    index=features_subset.index,
)
data_imp["target"] = train["price_doc"]
data_imp["timestamp"] = train["timestamp"]

# Fit the Lasso on the imputed predictors only (target and timestamp removed).
# NOTE(review): Lasso is presumably sklearn.linear_model.Lasso, imported
# outside this snippet - confirm.
predictors = data_imp.drop(["timestamp", "target"], axis=1)
l_feat = Lasso(alpha=.4)
l_feat.fit(predictors, data_imp["target"])
coefs = l_feat.coef_

# Keep every predictor whose fitted coefficient is non-zero.
cols = predictors.columns
selected_features = [col for col, coef in zip(cols, coefs) if coef != 0]
Esempio n. 3
0
new_index.head(5)


# Left-join the hourly readings onto new_index by timestamp, so every row of
# new_index is kept even when no hourly reading matches it.
Hourlys = pd.merge(new_index, Hourlys, how='left', on='timestamp', sort=False, copy=True)
# Promote the timestamp column to the row index (it leaves the columns).
Hourlys = Hourlys.set_index('timestamp')

# Names of the columns that contain at least one missing value
# (input for the optional missingness plot below).
missingdata_df2 = Hourlys.columns[Hourlys.isna().any()].tolist()
# msno.matrix(Hourlys[missingdata_df2])

gc.collect()

# Fill the missing hourly values with MissForest, using the values that are
# present as predictors.
imputer = mp.MissForest()
polluted_imputed = imputer.fit_transform(Hourlys)

# Wrap the imputed values in a frame again, restoring the column labels and
# the timestamp index, then order the columns by name.
Hourlys_cols = Hourlys.columns.values
Hourlys = pd.DataFrame(polluted_imputed, index=Hourlys.index, columns=Hourlys_cols)
Hourlys = Hourlys.reindex(columns=sorted(Hourlys.columns))

gc.collect()


# group hourly data by daily averages to merge with daily pollutants
Esempio n. 4
0
import numpy as np
import missingpy

if __name__ == "__main__":
    # Read the comma-separated numeric matrix to be imputed.
    samples = np.loadtxt('data0.0_50.csv', delimiter=',')
    # Optional: restrict to the first two columns with samples[:, 0:2].
    forest = missingpy.MissForest(verbose=1)
    # Only the verbose progress output is of interest; the imputed array
    # returned by fit_transform is not kept.
    forest.fit_transform(samples)