def load_data():
    file_path = filedialog.askopenfilename(
        initialdir="./",
        title="Select file",
        filetypes=(("matlab files", "*.mat"), ("all files", "*.*")))
    traSetLst, tstSetLst = DPre.preprocess(
        DPre.load_data(file_path, datVar.get()), seq_len, sample_num)  # dbNam, datNam

    global traDatStmp, traLabStmp
    global tstDatStmp, tstLabStmp
    traDatStmp, traLabStmp = DPre.pack_time_stamp(
        traSetLst, fea_num, len(traSetLst))  # package train data
    tstDatStmp, tstLabStmp = DPre.pack_time_stamp(
        tstSetLst, fea_num, len(tstSetLst))  # package test data
    predBut.config(state="normal")

    global model
    # sample num: 2000, time stamp: 32 (seq_len), feature num: 2
    inputs = Input(shape=(seq_len, fea_num))  # renamed from `input`, which shadowed the builtin
    model = Bp.build_model(inputs)
    try:
        model.load_weights('./model_save/model_weight/' + datVar.get() + '.h5')
        msg.set('load success!')
    except (OSError, ValueError):  # was a bare except; missing or incompatible weight file
        msg.set('fail to load')
    label.config(text=msg.get())
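# For context: DPre.pack_time_stamp is the author's helper and its internals are
# not shown here. A minimal standalone sketch of the sliding-window packing it
# presumably performs (assumption: each series of shape (T, fea_num) is cut into
# windows of length seq_len, each labeled by the step that follows it):
import numpy as np

def pack_time_stamp_sketch(series, seq_len):
    """series: array of shape (T, fea_num) -> (windows, labels)."""
    data = np.stack([series[i:i + seq_len]
                     for i in range(len(series) - seq_len)])  # (N, seq_len, fea_num)
    labels = series[seq_len:]                                 # value after each window
    return data, labels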
def select_random_records(reevaluate=False):
    if not reevaluate:
        list_of_random_number = random.sample(range(total_number_of_records),
                                              number_of_records_to_sample)
    else:
        list_of_random_number = Utility.read_words_file_into_list(
            path_to_saved_manually_tagged_file, 0)
        list_of_random_number = [int(x) for x in list_of_random_number]
    # one working copy per output file, since each pass removes matched indices
    list_of_random_number2 = copy.deepcopy(list_of_random_number)
    list_of_random_number3 = copy.deepcopy(list_of_random_number)
    list_of_random_number4 = copy.deepcopy(list_of_random_number)

    sentences = Data_Preprocessing.Raw_Record_Sentences_Parser(
        Data_Preprocessing.path_to_Rawdata)
    with open(path_to_sampled_raw_records, "w") as sampled_raw_records:
        for c, s in enumerate(sentences):
            if c in list_of_random_number4:
                logger.info("sampled record number #%i", c)
                list_of_random_number4.remove(c)
                print(str(c) + '\t' + " ".join(s), file=sampled_raw_records)

    sentences = Utility.Utility_Sentence_Parser(Data_Preprocessing.path_to_Save_file)
    with open(path_to_sampled_preprocessed_records, "w") as sample_preprocessed_records:
        for c, s in enumerate(sentences):
            if c in list_of_random_number:
                logger.info("sampled record number #%i", c)
                list_of_random_number.remove(c)  # was the typo .rnemove(c)
                print(str(c) + '\t' + " ".join(s), file=sample_preprocessed_records)

    sentences = Utility.Utility_Sentence_Parser(
        Text_Normalization.save_folder_name + "/Normalized_Text_Stage_2.txt")
    with open(path_to_sampled_normalized_records, "w") as sample_normalized_records:
        for c, s in enumerate(sentences):
            if c in list_of_random_number2:
                logger.info("sampled record number #%i", c)
                list_of_random_number2.remove(c)
                print(str(c) + '\t' + " ".join(s), file=sample_normalized_records)

    sentences = Utility.Utility_Sentence_Parser(
        Maintenence_Action_Detection.processed_file_name)
    with open(path_to_sampled_final_records, "w") as sampled_final_records:
        for c, s in enumerate(sentences):
            if c in list_of_random_number3:
                logger.info("sampled record number #%i", c)
                list_of_random_number3.remove(c)
                print(str(c) + '\t' + " ".join(s), file=sampled_final_records)

    if not reevaluate:
        copyfile(path_to_sampled_normalized_records, path_to_file_to_be_tagged_manually)
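# Design note (not in the original): each pass above consumes its own deep copy
# of the sampled indices, which is why four copies are made. A fresh set per
# pass does the same job with O(1) membership tests; a minimal sketch:
def iter_sampled(sentences, sampled_indices):
    remaining = set(sampled_indices)  # copy-free, fast membership
    for c, s in enumerate(sentences):
        if c in remaining:
            remaining.discard(c)
            yield c, s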
def Training():  # renamed from the original misspelling "Trainig"; call site updated below
    if os.path.exists("PreProcessing Data/TrainingDataSet.csv"):
        print("Training DataSet Already Exists")
        data = pd.read_csv("PreProcessing Data/TrainingDataSet.csv")
        Y = data['Rate']  # label
        X = data.drop(columns=["Rate"], inplace=False)
    else:
        print("Starting Training Data PreProcessing")
        pre_processing = Data_Preprocessing.Pre_Processing()
        X, Y = pre_processing.PreProcessing_Trainig()

    # split each class separately so train/test keep the class proportions
    highX, highY = X[Y == 2], Y[Y == 2]
    intermediateX, intermediateY = X[Y == 1], Y[Y == 1]
    lowX, lowY = X[Y == 0], Y[Y == 0]
    HX_train, HX_test, HY_train, HY_test = train_test_split(
        highX, highY, test_size=0.2, shuffle=True)
    IX_train, IX_test, IY_train, IY_test = train_test_split(
        intermediateX, intermediateY, test_size=0.2, shuffle=True)
    LX_train, LX_test, LY_train, LY_test = train_test_split(
        lowX, lowY, test_size=0.2, shuffle=True)

    X_train = np.concatenate((HX_train, IX_train, LX_train))
    y_train = np.concatenate((HY_train, IY_train, LY_train))
    X_test = np.concatenate((HX_test, IX_test, LX_test))
    y_test = np.concatenate((HY_test, IY_test, LY_test))

    print("Start Classification Techniques")
    # X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, shuffle=True)
    np.save('modelsPCA/traindata.npy', X_train)
    np.save('modelsPCA/trainlabel.npy', y_train)
    caller(X_train, y_train, X_test, y_test, 0)
    PCA_algorithm(X_train, y_train, X_test, y_test)
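# Note (not from the original): scikit-learn can produce the same class-balanced
# split in one call via the stratify argument, e.g.:
#
#   X_train, X_test, y_train, y_test = train_test_split(
#       X, Y, test_size=0.2, shuffle=True, stratify=Y)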
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.svm import LinearSVC
from sklearn import metrics
import Data_Preprocessing
import warnings
import time

warnings.filterwarnings("ignore")

data_obj = Data_Preprocessing.process_data()
data_obj.clean_data()


class Naive_Bayes_Classifier:
    """Naive Bayes classifier, timed with Count and TF-IDF vectorizers."""

    def __init__(self):
        self.vectorizer_time = {'Count': 0, 'TFIDF': 0}
        self.classifier_time = {'Count': 0, 'TFIDF': 0}

    def Count_vectorizer_classifier(self):
        start = time.time()
        print("\n\nRunning Naive Bayes with Count Vectorizer...")
        x_train, x_test = data_obj.generate_count_vectorizer()
        y_train = data_obj.y_train
        y_test = data_obj.y_test
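        # The method is cut off above; a minimal sketch of how it presumably
        # continues (assumption: generate_count_vectorizer returns the
        # vectorized train/test document-term matrices):
        #
        #     self.vectorizer_time['Count'] = time.time() - start
        #     start = time.time()
        #     clf = MultinomialNB()
        #     clf.fit(x_train, y_train)
        #     self.classifier_time['Count'] = time.time() - start
        #     predicted = clf.predict(x_test)
        #     print("Accuracy:", metrics.accuracy_score(y_test, predicted))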
def Lemmatize_words(string_to_normalize):
    # replace each word in place with its lemma, if one is known
    for i in range(len(string_to_normalize)):
        word = string_to_normalize[i]
        if word in lemmatized_words_dictionry:
            string_to_normalize[i] = lemmatized_words_dictionry[word]
    return string_to_normalize


if __name__ == "__main__":
    Unknown_words_building()
    lemmatized_words_building()
    Bigram_building()

    # sentences = Data_Preprocessing.Sentences_Parser_2('./Input_Output_Folder/Preprocessed_Record/Cleaned_Data_1.txt')
    sentences = Data_Preprocessing.Sentences_Parser_2(Data_Preprocessing.path_to_Save_file)
    with open(path_to_normalized_stage_1_records, "w") as Normalized_Text_Stage_1:
        for i, sentence in enumerate(sentences, start=1):
            string_to_print = ' '.join(Normalize_Text_stage_1(sentence))
            print(str(i) + '\t' + string_to_print, file=Normalized_Text_Stage_1)

    sentences = Data_Preprocessing.Sentences_Parser_2(path_to_normalized_stage_1_records)
    with open(path_to_normalized_stage_1_lemmatized_records, "w") as Normalized_Text_Stage_1_lemmatized:
        for i, sentence in enumerate(sentences, start=1):
            string_to_print = ' '.join(Lemmatize_words(sentence))
            print(str(i) + '\t' + string_to_print, file=Normalized_Text_Stage_1_lemmatized)
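# Worked example (hypothetical entries; the real dictionary is built by
# lemmatized_words_building above):
#
#   lemmatized_words_dictionry = {'pumps': 'pump', 'replaced': 'replace'}
#   Lemmatize_words(['replaced', 'two', 'pumps'])  # -> ['replace', 'two', 'pump']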
import pylab
import pandas as pd
import Data_Preprocessing as dp
# mt (Connect_Image_Features) is imported in the original module; its name is not shown here

# def Create_statistics(X1, cols):

"""
-----------------------------------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------------------------------
"""

if __name__ == '__main__':
    # raw strings avoid accidental backslash escapes in Windows paths
    pathtrain = r'E:\Prospective_dataset\ClinicalDataWithImage.csv'
    path_image_data = r'E:\DCI Prediction\Data\Image_data'
    frame = pd.read_csv(pathtrain, sep=';')
    image_feats = True
    [X1, Y, cols, names] = dp.Fix_Dataset(frame, image_feats)  # False for no image features
    Features = mt.Connect_Image_Features(names, path_image_data)
    cols = pd.Index.tolist(cols)

    # important columns according to feature selection with RFC
    ind1 = cols.index('SAH_vol_ml')
    ind2 = cols.index('DIAGNOSIS_FISHER4_E1_C1_IPH')
    ind3 = cols.index('TIME_ICTUS_CTSCANDT')
    ind4 = cols.index('Age')
    ind5 = cols.index('ADMISSION_GCS_TOTAL_AMC_E1_C1')
    ind6 = cols.index('ANEURYSM_LENGTH_E1_C1_1')
    ind7 = cols.index('DIAGNOSIS_FISHER4_E1_C1_SDH')
    ind8 = cols.index('ANEURYSM_WIDTH_E1_C1_1')
    ind9 = cols.index('TREATMENT_E1_C1')
    ind10 = cols.index('ANEURYSM_LOCATION_E1_C1_1')
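# Design note (not in the original; applies equally to the near-identical block
# below): the repeated index lookups can be collected in one comprehension:
#
#   selected = ['SAH_vol_ml', 'DIAGNOSIS_FISHER4_E1_C1_IPH', 'TIME_ICTUS_CTSCANDT',
#               'Age', 'ADMISSION_GCS_TOTAL_AMC_E1_C1']
#   inds = [cols.index(c) for c in selected]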
import pandas as pd
import Data_Preprocessing as dp
import Feature_Selection as fs
# mt (Connect_Image_Features) is imported in the original module; its name is not shown here

"""
-----------------------------------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------------------------------
"""

if __name__ == '__main__':
    # pathtrain = r'E:\Prospective_dataset\DataWithFeats.csv'
    path_clin_data = r'E:\Prospective_dataset\ClinicalDataWithImage.csv'
    path_image_data = r'E:\Prospective\Data\Image_data'
    # pathtrain = r'E:\Prospective_dataset\test150.csv'
    # dummy_ranks = pd.get_dummies(df['prestige'], prefix='prestige')
    frame = pd.read_csv(path_clin_data, sep=';')
    [X1, Y, cols, names] = dp.Fix_Dataset(frame, False)  # False for no image features
    Features = mt.Connect_Image_Features(names, path_image_data)
    cols = pd.Index.tolist(cols)
    X = X1  # clinical features first

    ind1 = cols.index('SAH_vol_ml')
    ind2 = cols.index('DIAGNOSIS_FISHER4_E1_C1_IPH')
    ind3 = cols.index('TIME_ICTUS_CTSCANDT')
    ind4 = cols.index('Age')
    ind5 = cols.index('ADMISSION_GCS_TOTAL_AMC_E1_C1')
    ind6 = cols.index('ANEURYSM_LENGTH_E1_C1_1')
    ind7 = cols.index('DIAGNOSIS_FISHER4_E1_C1_SDH')
    ind8 = cols.index('ANEURYSM_WIDTH_E1_C1_1')
    ind9 = cols.index('TREATMENT_E1_C1')
            new_words[key2] = words2[key2]
        else:
            break
    Linked_Words(new_words)


def Legal_Words_Linked_Words():
    with open("../Data_Cleaning/lemmatized_word_4.txt", "r") as lemmatized_word_file:
        for line in lemmatized_word_file:
            print(line.split())


if __name__ == "__main__":
    # global_starting_word = 'reapair'
    # words_by_alphebat = dict(model.most_similar(positive=[global_starting_word], topn=4))
    # Linked_Words(words_by_alphebat)
    # Legal_Words_Linked_Words()
    # Print_Similar_Word()
    sentences = Data_Preprocessing.Sentences_Parser_2('../Data_Cleaning/Cleaned_Data_10.txt')
    d = Data_Preprocessing.Build_Frequency_Dic(sentences)
    # walk the vocabulary from most to least frequent word
    for w in sorted(d, key=d.get, reverse=True):
        print(model.most_similar(positive=[w], topn=10))
# T = 0.01
# T = 0.00
# method = 'backward'
# T = 0
path_variables = path_variables + feats_use + ".csv"
path_results = ('E:\\Mrclean\\Results_Sens\\test' + '-' + feats_use + '-'
                + label_use + '-' + method + '\\')
path_models = path_results + "\\Models"
if not os.path.exists(path_results):
    os.makedirs(path_results)
    os.makedirs(path_models)

[X, Y, cols, center, vals_mask] = pp.Fix_Dataset_csv(path_data, label_use,
                                                     feats_use, path_variables)
# data = pd.io.stata.read_stata("E:\\Mrclean\\Data\\RegistryOpenclinicacheck_core.dta")
center, range_centers = pp.Combine_Center5_10(center)
[X, cols] = pp.Encode_Variables(X, cols, vals_mask)
cols = np.array(cols)
np.save(path_results + 'cols.npy', cols)
# X = pp.Normalize_Min_Max(X)

num_feats = X.shape[1]
splits = 100
cv = 5
mean_tprr = 0.0
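# The variables above set up what is presumably a repeated cross-validation
# loop (`splits` restarts of `cv`-fold CV, accumulating a mean TPR). A minimal
# sketch of that pattern under those assumptions:
#
#   from sklearn.model_selection import StratifiedKFold
#   for split in range(splits):
#       skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=split)
#       for train_idx, test_idx in skf.split(X, Y):
#           ...  # fit on X[train_idx], evaluate TPR on X[test_idx]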
def sigmoid(z):
    """
    Compute the sigmoid of z

    Arguments:
    z -- A scalar or numpy array of any size.

    Return:
    s -- sigmoid(z)
    """
    return 1 / (1 + np.exp(-z))


y = sigmoid(pd_y)
X_train, X_test, y_train, y_test = dp.get_TrainingSet_and_Test_set(pd_X.T, y.T, 0.3)
X_train = X_train.T
X_test = X_test.T
y_train = y_train.T
y_test = y_test.T


def get_y(Row):
    # threshold the sigmoid output at 0.5
    if Row[0] > 0.5:
        return 1
    else:
        return 0


# apply per row (axis=1), then reshape; the original used axis=0 and called
# .reshape on the resulting Series, which raises on modern pandas
y = pd.DataFrame(y).apply(get_y, axis=1).values.reshape(1, m)
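# Equivalent vectorized form (note, not from the original): thresholding the
# whole array at once avoids the per-row apply entirely:
#
#   y = (np.asarray(sigmoid(pd_y)) > 0.5).astype(int).reshape(1, m)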
def stage_1():
    words = Data_Preprocessing.Words_Parser('./Processed_Data/', 'vacab_alphebat.txt')
    s = aspell.Speller('lang', 'en')
    # NOTE: the original opened the same file twice in "w" mode for both
    # auto_corrected_words handles, so one stream clobbered the other; the
    # "_2" filename below is an assumed fix.
    with open("./Processed_Data/Stage1/auto_corrected_words_v_3_2.txt", "w") as auto_corrected_words_2, \
         open("./Processed_Data/Stage1/auto_corrected_words_v_3.txt", "w") as auto_corrected_words, \
         open("./Processed_Data/Stage1/misspelled_words.txt_2_v_3", "w") as misspelled_words, \
         open("./Processed_Data/Stage1/spell_checked_words_2_v_3.txt", "w") as spell_checked_words:
        i = 1
        misspelled_words_count = 0
        auto_corrected_words_count = 0
        for word in words:
            is_word = s.check(word)
            if not is_word:
                misspelled_words_count += 1
                # keep only suggestions that occur in the corpus; the original
                # tested "> 0", which wrongly rejected a match at offset 0
                correction_candidates_list = [
                    candidate for candidate in s.suggest(word)
                    if string_in_corpus(candidate) >= 0
                ]
                correction_candidates = ' '.join(correction_candidates_list)
                most_similar_word = model.most_similar(positive=[word], topn=1)
                if (correction_candidates_list
                        and most_similar_word[0][0] == correction_candidates_list[0]):
                    # aspell's top suggestion agrees with word2vec: auto-correct
                    string_to_print = '{0}\t{1:15}\t{2:10}\t{3:50}'.format(
                        misspelled_words_count, word,
                        correction_candidates_list[0], '**********************')
                    auto_corrected_words_count += 1
                    print(string_to_print, file=auto_corrected_words)
                elif len(correction_candidates_list) == 1:
                    string_to_print = '{0}\t{1:15}\t{2:10}\t{3:50}'.format(
                        misspelled_words_count, word,
                        str(most_similar_word[0][0]), correction_candidates)
                    auto_corrected_words_count += 1
                    print(string_to_print, file=auto_corrected_words_2)
                else:
                    string_to_print = '{0}\t{1:15}\t{2:10}\t{3:40}'.format(
                        misspelled_words_count, word,
                        str(most_similar_word[0][0]), correction_candidates)
                    print(string_to_print, file=misspelled_words)
            string_to_print = '{0}\t{1:15}\t{2:6}'.format(i, word, is_word)
            i += 1
            print(string_to_print, file=spell_checked_words)
        print('Total incorrect words are ' + str(misspelled_words_count),
              file=spell_checked_words)
        # the original repeated "incorrect" here; this line counts auto-corrections
        print('Total auto-corrected words are ' + str(auto_corrected_words_count),
              file=spell_checked_words)
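# Minimal usage sketch of the aspell calls used above (grounded in the calls
# already present; return values are illustrative only):
#
#   s = aspell.Speller('lang', 'en')
#   s.check('reapair')    # falsy for a misspelling
#   s.suggest('reapair')  # candidate corrections, best first, e.g. ['repair', ...]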
import gensim
import aspell
import Data_Preprocessing

model = gensim.models.Word2Vec.load('./Data/mymodel_19_1000')
word_list = list(model.wv.vocab)
word_list.sort(key=len, reverse=False)  # shortest words first
words = sorted(word_list)               # alphabetical copy
Data_Preprocessing.write_vacab_to_txt('./Processed_Data/vacab_alphebat.txt', words)
Data_Preprocessing.write_vacab_to_txt('./Processed_Data/vacab_length.txt', word_list)

with open("./Processed_Data/Cleaned_Data_15.txt", "r") as f:
    searchlines = f.read()


def string_in_corpus(string_to_search):
    # str.find returns the match offset, or -1 when the string is absent
    return searchlines.find(string_to_search)
def run(st, data, mongocls, session_id):
    st.markdown("### Steps -")
    view_sample_data = st.checkbox("View Sample Data - ")
    if view_sample_data:
        if mongocls.get_session_info({'session_id': session_id + '_ml_df'}) is not None:
            st.dataframe(pd.DataFrame(mongocls.get_session_info(
                {'session_id': session_id + '_ml_df'})["data_dict"]).head())
        else:
            st.dataframe(data.head())

    c_load, c_session, c_restrict_columns = st.beta_columns(3)
    if c_load.button("Reload Data!!!"):
        data = get_data(data, mongocls, session_id)
    if c_session.button("Reset Session!!!"):
        mongocls.delete_session({'session_id': session_id + '_ml_df'})
    if c_restrict_columns.checkbox("Restrict Columns: "):
        restrict_columns = st.multiselect("Restrict Columns ", data.columns.tolist())
        data = data[restrict_columns]

    expander = st.beta_expander("Data Preparation:", expanded=False)
    data = get_data(data, mongocls, session_id)
    columns = data.columns.tolist()
    with expander:
        c1, c2, c3 = st.beta_columns(3)
        fs = c1.checkbox("Feature Selection")
        if fs:
            st.text("Currently under development!!!")
            # fs_option = st.selectbox("Selection Option", ['SelectKBest', 'RFE', 'PCA', 'LDA'])
        dimpute = c2.checkbox("Data Imputers")
        if dimpute:
            impute_options = st.selectbox("Impute Option", ['SimpleImputer'])
            if impute_options:
                imputer = dp.Imputers(dataframe=data, select_imputer=impute_options)
                imputer.select_imputers(imputerSelect=impute_options)
                data = imputer.fit_transform()
                data = pd.DataFrame(data, columns=columns)
                mongocls.delete_session({'session_id': session_id + '_ml_df'})
                mongocls.write_session_info(
                    {'session_id': session_id + '_ml_df', 'data_dict': data.to_dict("records")})
                st.dataframe(data.head())

    expander_enc = st.beta_expander("Data Encoding", expanded=False)
    with expander_enc:
        encode = st.checkbox("Apply Encoding")
        if encode:
            c1_encode, c2_encode, c3_encode = st.beta_columns(3)
            encoding_option = c1_encode.selectbox(
                "Select Encoder",
                ['LabelEncoder', 'OneHotEncoder', 'OrdinalEncoder', 'Binarizer',
                 'LabelBinarizer', 'MultiLabelBinarizer'])
            Y_col_encode = c2_encode.selectbox("Select Y (target) (Encoding)",
                                               data.columns.tolist())
            cat_columns = c3_encode.multiselect("Select Categorical Columns to Encode",
                                                data.columns.tolist())
            encode_btn = st.button("Encode Data!!!")
            if encode_btn:
                if len(cat_columns) == 0:
                    encode_cls = dp.Encoders(df=data, y=Y_col_encode)
                else:
                    encode_cls = dp.Encoders(df=data, y=Y_col_encode, cat_columns=cat_columns)
                encode_cls.select_encoder(encode_type=encoding_option)
                data = encode_cls.compile_encoding()
                st.dataframe(data.head())
                mongocls.delete_session({'session_id': session_id + '_ml_df'})
                mongocls.write_session_info(
                    {'session_id': session_id + '_ml_df', 'data_dict': data.to_dict("records")})

    expander_sample = st.beta_expander("Data Sampling", expanded=False)
    with expander_sample:
        c1, c2, c3, c4 = st.beta_columns(4)
        sample_options = c1.selectbox("Sampling Options", ["Over", "Under", "RandomOverSampler"])
        sampling_ratio = c2.slider('Sampling Ratio', min_value=0.1, max_value=1.0, step=0.05)
        Y_col = c3.selectbox("Select Y (target) (Sampling)", data.columns.tolist())
        if Y_col != '':
            X_cols = c4.multiselect("Select X Columns (default is all)",
                                    [col for col in data.columns.tolist() if col != Y_col])
            if len(X_cols) <= 0:
                X_cols = [col for col in data.columns.tolist() if col != Y_col]
            X_val = data[X_cols]  # the original assigned this twice
            Y_val = data[Y_col]
        else:
            st.warning("Please select Target column!!!")
        sampler_btn = st.button("Run Sampler")
        if sampler_btn:
            sample_cls = dp.Sampling(df=data[X_cols + [Y_col]], target=Y_col,
                                     sampling_option=sample_options)
            X_val, Y_val = sample_cls.run_sampler()
            data = pd.concat([X_val, Y_val], axis=1)
            st.dataframe(data.head())
            mongocls.delete_session({'session_id': session_id + '_ml_df'})
            mongocls.write_session_info(
                {'session_id': session_id + '_ml_df', 'data_dict': data.to_dict("records")})

    expander_scale = st.beta_expander("Data Scaling", expanded=False)
    with expander_scale:
        scale = st.checkbox("Apply Scaling")
        if scale:
            c1, c2, c3 = st.beta_columns(3)
            scaling_options = c1.selectbox(
                "Scaling Options",
                ["StandardScaler", "MaxAbsScaler", "MinMaxScaler", "RobustScaler",
                 "Normalizer", "PowerTransformer", "QuantileTransformer"])
            Y_col = c2.selectbox("Select Y (target) (Scaling)", data.columns.tolist())
            cat_columns = c3.multiselect("Categorical Columns",
                                         [col for col in data.columns.tolist() if col != Y_col])
            scale_btn = expander_scale.button("Scale Data!!!")
            if scale_btn:
                scaling = dp.scaling(df=data, cat_columns=cat_columns,
                                     scalar_type=scaling_options, y=Y_col)
                data = scaling.compile_scalar()
                st.dataframe(data.head())
                mongocls.delete_session({'session_id': session_id + '_ml_df'})
                mongocls.write_session_info(
                    {'session_id': session_id + '_ml_df', 'data_dict': data.to_dict("records")})

    expander_model = st.beta_expander("Model Training", expanded=False)
    with expander_model:
        c1, c2, c3 = st.beta_columns(3)
        model_type = c1.selectbox("Model Type", ['Regression', 'Classification'])
        y_col = c2.selectbox("Select Target Variable", data.columns.tolist())
        model_exe = ml_models.MLmodels(df=data, y_column=y_col, problem_type=model_type)
        select_models = c3.multiselect("Select ML models", model_exe.get_model_list())
        run_models = st.button("Run Models - ")
        model_storage = {}
        if run_models:
            for model in select_models:
                model_exe.select_model_to_run(model_select=model)
                model_storage[model] = model_exe.compile_modeling()
            train_x, train_y, test_x, test_y = model_exe.get_train_test()
            mongocls.delete_session({'session_id': session_id + '_models_ran'})
            mongocls.write_session_info(
                {'session_id': session_id + '_models_ran',
                 'models_trained': pickle.dumps(model_storage),
                 'train_test_data': pickle.dumps({
                     'train_x': train_x,
                     'train_y': train_y,
                     'test_x': test_x,
                     'test_y': test_y,
                 })})
            if len(model_storage.keys()) > 0:
                st.write("Model Run completed for the below models - ")
                st.code(model_storage)

    expander_metrics = st.beta_expander("Evaluation Metrics", expanded=False)
    with expander_metrics:
        # try:
        models_trained = pickle.loads(mongocls.get_session_info(
            {'session_id': session_id + '_models_ran'})["models_trained"])
        loaded_info = pickle.loads(mongocls.get_session_info(
            {'session_id': session_id + '_models_ran'})["train_test_data"])
        train_x = loaded_info['train_x']
        train_y = loaded_info['train_y']
        test_x = loaded_info['test_x']
        test_y = loaded_info['test_y']
        metric_selected = {}
        for model, trained_model in models_trained.items():
            st.text(model)
            metric_cls = ml_metrics.Metrics(y_test=test_y)
            metric_selected[model] = st.multiselect(
                'Select Metrics to see for the Model (' + model + ')',
                metric_cls.get_metric_list())
        metrics_btn = st.button("Click to see the metrics")
        if metrics_btn:
            for model, metrics in metric_selected.items():
                for metric in metrics:
                    metric_cls.select_metrics(metric)
                    st.write(metric)
                    st.write(metric_cls.metrics_solve(estimator=models_trained[model],
                                                      test_x=test_x))
        # except:
        #     st.warning("No Models trained yet!!!")
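# For reference (not the app's dp.Sampling, whose source is not shown here):
# the "RandomOverSampler" option presumably wraps imbalanced-learn, roughly:
#
#   from imblearn.over_sampling import RandomOverSampler
#   sampler = RandomOverSampler(sampling_strategy=sampling_ratio)
#   X_res, y_res = sampler.fit_resample(data[X_cols], data[Y_col])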
def main():
    Data_Preprocessing.main_process()
    Naive_Bayes_Classifier.data_load()
import Classification
import Classification_Testing
import Data_Preprocessing
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

print("Appstore games Classification")
# was assigned to `input`, which shadowed the builtin
choice = int(input("Press 1 To Train Models Or 2 To Test Model :: "))
if choice == 1:
    Classification.Training()  # renamed from the misspelled "Trainig"
else:
    pre_processing = Data_Preprocessing.Pre_Processing()
    X, Y = pre_processing.PreProcessing_Testing()
    print("Testing Before PCA Algorithm")
    Testing = Classification_Testing.Classification_Testing(X, Y)
    Testing.OneVsOnelinear(0)
    Testing.OneVsOne_LinearSVC(0)
    Testing.OneVsOne_ploy(0)
    Testing.OneVsOne_rbf(0)
    Testing.adaBoost(0)
    Testing.decisionTree(0)
    Testing.KNN(0)
    xtrain = np.load('modelsPCA/traindata.npy', allow_pickle=True)
    ytrain = np.load('modelsPCA/trainlabel.npy', allow_pickle=True)
    Testing.PCA_algorithm(xtrain, ytrain)