Example #1
# Check the MUL CAT vars and NL vars, find proper processing methods and generate the related constants
from modules.analyser import Analyzer
from modules.loader import Loader
from modules.saver import Saver
from modules.preprocessor import Processor
from utils.constants import VILLE_NAME, Armoire_PICK, Int_PICK, PL_PICK
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import os

date_str = '0723'
analyzer = Analyzer(datestr=date_str)
loader = Loader(datestr=date_str)
saver = Saver(datestr=date_str)
processor = Processor(datestr=date_str)
"""
MUL CAT vars
PL: lampe_Type
INT: pan_Solde, int_Solde, int_ElemDefaut, int_TypeTnt, int_TypeEqt, pan_TypeEqt, pan_Defaut, int_Defaut

NL vars

"""
## lampe_Type: {}
## int_ElemDefaut: {'cover':['Crosse','Vasque','Enveloppe exterieure','Support','Coffret'],
## 'electricity':['Armorceur','Platine','Lampe','Câbles','Appareillage','Ballast','Protection électrique'],
## 'else':['NA','Luminaire','Armoire départ','Horloge','Alimentation générale']}
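
# A minimal, hypothetical sketch of the int_ElemDefaut regrouping described in the
# comments above: collapse the raw labels into 'cover' / 'electricity' / 'else'.
# The dict and function names are illustrative, not taken from modules.preprocessor.
ELEM_DEFAUT_GROUPS = {
    'cover': ['Crosse', 'Vasque', 'Enveloppe exterieure', 'Support', 'Coffret'],
    'electricity': ['Armorceur', 'Platine', 'Lampe', 'Câbles', 'Appareillage',
                    'Ballast', 'Protection électrique'],
    'else': ['NA', 'Luminaire', 'Armoire départ', 'Horloge', 'Alimentation générale'],
}
# Invert the dict so each raw label maps to its coarse group.
LABEL_TO_GROUP = {label: group
                  for group, labels in ELEM_DEFAUT_GROUPS.items()
                  for label in labels}

def regroup_elem_defaut(df):
    """Replace int_ElemDefaut with its coarse group; unknown labels fall back to 'else'."""
    df = df.copy()
    df['int_ElemDefaut'] = df['int_ElemDefaut'].map(LABEL_TO_GROUP).fillna('else')
    return df
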
Example #2
from modules.loader import Loader
from modules.saver import Saver
from modules.cleaner import Cleaner
from modules.analyser import Analyzer
from utils.constants import VILLE_NAME, Armoire_NAME, PL_NAME, Int_NAME, Armoire_TIME, PL_TIME, Int_TIME
import datetime as dt
import pandas as pd

save_path = 'D:\\Users\\Yuan.ZHANG\\PycharmProjects\\compa0516\\data_save'
CURRENT_TIME_AP = '2018-05-15'
CURRENT_TIME_INT = '2018_05_15'
Intfilename_lst = [
    "BDDExportInterventions-{} du 01_01_2013 au 15_05_2018.xlsx".format(
        CURRENT_TIME_INT)
]
loader = Loader(datadir="D:\\Users\\Yuan.ZHANG\\PycharmProjects\\data")

saver = Saver()
cleaner = Cleaner()
analyzer = Analyzer()

# # standardize the format of the dataframe
# for ville in VILLE_NAME:
#     # rename the dataframe,remove redundant info and save
#     data_Arm = loader.load_ArmPL(foldername=ville,filename="BDDExport_ArmoireBt_{}_{}.xlsx".format(ville,CURRENT_TIME_AP), NAME_LIST=Armoire_NAME)
#     data_PL = loader.load_ArmPL(foldername=ville,filename="BDDExport_PointLumineux_{}_{}.xlsx".format(ville,CURRENT_TIME_AP), NAME_LIST=PL_NAME)
#     data_Int = loader.load_Intervention(foldername=ville,filename_lst=Intfilename_lst, NAME_LIST=Int_NAME)
#
#     data_Arm = cleaner.rv_dupRow(data_Arm)
#     data_Arm = cleaner.rep_dur(data_Arm, Var_lst=Armoire_TIME, currtime=dt.datetime(2018, 5, 15, 0, 0, 0, 0))
#     data_PL = cleaner.rv_dupRow(data_PL)
#     data_PL = cleaner.rep_dur(data_PL, Var_lst=PL_TIME, currtime=dt.datetime(2018, 5, 15, 0, 0, 0, 0))
#     data_Int = cleaner.rv_dupRow(data_Int)
#     data_Int = cleaner.rep_dur(data_Int, Var_lst=Int_TIME, currtime=dt.datetime(2018, 5, 15, 0, 0, 0, 0))
#
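
# A hypothetical sketch of the "replace the date with the duration" step performed by
# cleaner.rep_dur in the commented loop above (the real implementation lives in
# modules.cleaner and is not shown): each timestamp column in Var_lst is replaced by
# its age in days relative to currtime.
def replace_dates_with_durations(df, Var_lst, currtime):
    df = df.copy()
    for col in Var_lst:
        dates = pd.to_datetime(df[col], errors='coerce')   # invalid dates become NaT
        df[col] = (currtime - dates).dt.days               # age in days, NaT -> NaN
    return df
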
Example #3
from utils.constants import Armoire_NAME, Armoire_ARM_CAT, Armoire_DEPART_CAT, Armoire_TIME, Armoire_ARM_DIST
from modules.loader import Loader
from modules.cleaner import Cleaner
from modules.analyser import Analyzer
import datetime as dt

loader = Loader(
    datadir=
    "/Users/zhangyuan/Documents/Workspace/StageCiteosWorkspace/data/noumea")
cleaner = Cleaner()
analyser = Analyzer()

# load the data
data_Ar = loader.load_ArmPL(
    filename="BDDExport_ArmoireBt_NOUMEA_2018-05-15.xlsx",
    NAME_LIST=Armoire_NAME)

# remove the duplicated rows and replace the date with the duration
data_Ar = cleaner.rv_dupRow(data_Ar)
data_Ar = cleaner.rep_dur(data_Ar,
                          Var_lst=Armoire_TIME,
                          currtime=dt.datetime(2018, 5, 5, 0, 0, 0, 0))

# generate the count for NAN for all the variables
analyser.gen_NAN_excel(data_Ar.iloc[:, 0:43], 'Armoire_arm', 'Armoire_arm_or')
analyser.gen_NAN_excel(data_Ar.iloc[:, 43:], 'Armoire_depart',
                       'Armoire_depart_or')

# pick the variables and regroup
data_Ar_arm = analyser.pick_Var(data=data_Ar,
                                Var_lst=Armoire_ARM_CAT + Armoire_ARM_DIST)
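
# A hypothetical sketch of what a NaN audit like analyser.gen_NAN_excel above might
# compute (the real Analyzer lives in modules.analyser and is not shown): the count
# and share of missing values per column, written out to an Excel file.
import pandas as pd

def nan_report_to_excel(df, sheet_name, out_path):
    report = pd.DataFrame({
        'n_missing': df.isna().sum(),
        'pct_missing': df.isna().mean().round(3),
    })
    report.to_excel(out_path, sheet_name=sheet_name)
    return report
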
Example #4
from utils.constants import PL_NAME, PL_TIME, PL_PL_CAT, PL_PL_DIST, PL_LAN_CAT, PL_LAN_DIST
from modules.loader import Loader
from modules.cleaner import Cleaner
from modules.analyser import Analyzer
import datetime as dt

loader = Loader(datadir="/Users/zhangyuan/Documents/Workspace/StageCiteosWorkspace/data/noumea")
cleaner = Cleaner()
analyser = Analyzer()
data_PL = loader.load_ArmPL(filename="BDDExport_PointLumineux_NOUMEA_2018-05-15.xlsx", NAME_LIST=PL_NAME)

# remove the duplicated rows and replace the date columns with durations
data_PL = cleaner.rv_dupRow(data_PL)
data_PL = cleaner.rep_dur(data_PL, Var_lst=PL_TIME, currtime=dt.datetime(2018, 5, 5, 0, 0, 0, 0))

# split the variables into the PL and LAN subsets
data_PL_PL = analyser.pick_Var(data=data_PL, Var_lst=PL_PL_CAT + PL_PL_DIST)
data_PL_LAN = analyser.pick_Var(data=data_PL, Var_lst=PL_LAN_CAT + PL_LAN_DIST)
data_PL_PL = cleaner.rv_dupRow(data_PL_PL)

# count the NaN values for each variable block
analyser.gen_NAN_excel(data_PL.iloc[:, 0:60], 'PL_PL', 'PL_PL_or')
analyser.gen_NAN_excel(data_PL.iloc[:, 60:], 'PL_LAN', 'PL_LAN_or')

# plot histograms/pies for the categorical variables
analyser.gen_histogram_Pie(data_PL_PL, 'PL_PL', Var_lst=PL_PL_CAT)
analyser.gen_histogram_Pie(data_PL_LAN, 'PL_LAN', Var_lst=PL_LAN_CAT)

# plot the distributions of the continuous variables
analyser.gen_Dist(data_PL, 'PL_PL', Var_lst=PL_PL_DIST)
analyser.gen_Dist(data_PL_LAN, 'PL_LAN', Var_lst=PL_LAN_DIST)

# NaN counts per subset after picking the variables
analyser.gen_NAN_excel(data_PL_PL, 'PL_PL', 'PL_PL')
analyser.gen_NAN_excel(data_PL_LAN, 'PL_LAN', 'PL_LAN')
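
# A hypothetical sketch of the categorical plots produced by gen_histogram_Pie above
# (the real routine is in modules.analyser): a bar chart and a pie chart of the value
# counts of each categorical variable, saved to disk under a prefix.
import matplotlib.pyplot as plt

def plot_cat_counts(df, prefix, Var_lst):
    for var in Var_lst:
        counts = df[var].value_counts()
        fig, (ax_bar, ax_pie) = plt.subplots(1, 2, figsize=(12, 5))
        counts.plot.bar(ax=ax_bar, title='{} counts'.format(var))
        counts.plot.pie(ax=ax_pie, autopct='%1.1f%%', title=var)
        fig.tight_layout()
        fig.savefig('{}_{}.png'.format(prefix, var))
        plt.close(fig)
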
Example #5
from modules.analyser import Analyzer
from modules.loader import Loader
import numpy as np

date_str = '0723'
analyzer = Analyzer(datestr=date_str)
loader = Loader(datestr=date_str)


# ArmInt_cluster = loader.load_excel(filename='ArmInt_cluster',foldername='Cluster')
# ArmInt_cluster.drop(['PanneDelai_1'], axis=1,inplace=True)
# feature_names = np.array(list(ArmInt_cluster.columns))
#
# clf = loader.load_pickle('Randomforest_Armoire')
# analyzer.plot_feature_importance(importances=clf.best_estimator_.feature_importances_,featurenames=feature_names,title='Randomforest_featureimportance_Armoire',top_n=40)
#
# clf = loader.load_pickle('GradientBoosting_Armoire')
# analyzer.plot_feature_importance(importances=clf.best_estimator_.feature_importances_,featurenames=feature_names,title='GradientBoosting_featureimportance_Armoire',top_n=40)
#



# load the clustered PL features and drop the PanneDelai_1 column so only the model features remain
PL_cluster = loader.load_excel(filename='PL_cluster', foldername='Cluster')
PL_cluster.drop(['PanneDelai_1'], axis=1, inplace=True)
feature_names = np.array(list(PL_cluster.columns))

# plot the top-40 feature importances of the tuned random forest
clf = loader.load_pickle('Randomforest_PL')
analyzer.plot_feature_importance(importances=clf.best_estimator_.feature_importances_,
                                 featurenames=feature_names,
                                 title='Randomforest_featureimportance_PL',
                                 top_n=40)

# plot the top-40 feature importances of the tuned gradient boosting model
clf = loader.load_pickle('GradientBoosting_PL')
analyzer.plot_feature_importance(importances=clf.best_estimator_.feature_importances_,
                                 featurenames=feature_names,
                                 title='GradientBoosting_featureimportance_PL',
                                 top_n=40)
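
# A hypothetical sketch of the chart drawn by analyzer.plot_feature_importance above
# (the real code is in modules.analyser): sort the importances, keep the top_n features
# and draw a horizontal bar chart with the most important feature on top.
import matplotlib.pyplot as plt

def plot_feature_importance_sketch(importances, featurenames, title, top_n=40):
    order = np.argsort(importances)[::-1][:top_n]   # indices of the top_n features
    plt.figure(figsize=(8, 0.25 * top_n + 1))
    plt.barh(featurenames[order][::-1], importances[order][::-1])
    plt.title(title)
    plt.tight_layout()
    plt.savefig('{}.png'.format(title))
    plt.close()
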
Example #6
from utils.constants import Int_NAME, Int_TIME, Int_INT_CAT, Int_INT_DIST, Int_PAN_CAT, Int_PAN_DIST
from modules.loader import Loader
from modules.cleaner import Cleaner
from modules.analyser import Analyzer
import datetime as dt

loader = Loader(datadir="/Users/zhangyuan/Documents/Workspace/StageCiteosWorkspace/data/noumea")
cleaner = Cleaner()
analyser = Analyzer()

Intfilename_lst = ["BDDExportInterventions-2018_05_15 du 01_01_2013 au 15_05_2018.xlsx"]

# load the intervention export, drop duplicated rows and replace the date columns with durations
data_Int = loader.load_Intervention(filename_lst=Intfilename_lst, NAME_LIST=Int_NAME)
data_Int = cleaner.rv_dupRow(data_Int)
data_Int = cleaner.rep_dur(data_Int, Var_lst=Int_TIME, currtime=dt.datetime(2018, 5, 15, 0, 0, 0, 0))

# split the variables into the PAN and INT subsets; the PAN subset is deduplicated on its pan_Code key
data_Int_PAN = analyser.pick_Var(data=data_Int, Var_lst=Int_PAN_CAT + Int_PAN_DIST + ['pan_Code'])
data_Int_PAN = cleaner.rv_dupRow(data=data_Int_PAN, Var_lst=['pan_Code'])
data_Int_INT = analyser.pick_Var(data=data_Int, Var_lst=Int_INT_CAT + Int_INT_DIST)
data_Int_INT = cleaner.rv_dupRow(data_Int_INT)

# count the NaN values for each variable block
analyser.gen_NAN_excel(data_Int.iloc[:, 0:23], 'Intervention_int', 'Intervention_int_or')
analyser.gen_NAN_excel(data_Int.iloc[:, 23:], 'Intervention_pan', 'Intervention_pan_or')

# plot histograms/pies for the categorical variables
analyser.gen_histogram_Pie(data_Int_INT, 'Intervention_int', Var_lst=Int_INT_CAT)
analyser.gen_histogram_Pie(data_Int_PAN, 'Intervention_pan', Var_lst=Int_PAN_CAT)

# plot the distributions of the continuous variables
analyser.gen_Dist(data_Int_INT, 'Intervention_int', Var_lst=Int_INT_DIST)
analyser.gen_Dist(data_Int_PAN, 'Intervention_pan', Var_lst=Int_PAN_DIST)

# NaN counts per subset after picking the variables
analyser.gen_NAN_excel(data_Int_INT, 'Intervention_int', 'Intervention_int')
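
# A hypothetical sketch of the deduplication done by cleaner.rv_dupRow above (the real
# Cleaner lives in modules.cleaner): with no Var_lst the fully duplicated rows are
# dropped; with Var_lst only the first row per key is kept (e.g. one row per pan_Code).
def remove_duplicate_rows(df, Var_lst=None):
    if Var_lst is None:
        return df.drop_duplicates().reset_index(drop=True)
    return df.drop_duplicates(subset=Var_lst, keep='first').reset_index(drop=True)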