def testNetwork(self, data):
    converted_json = self.convertJsonToList(data)
    featuresObject = Features(converted_json)
    featuresObject.processFeatures()
    input_arr = featuresObject.getFeatures()
    print("in testNetwork")
    print(input_arr)
    # predict() returns an array; take the first prediction and map the
    # class code point back to a character.
    return chr(int(self.clf.predict(input_arr)[0]))
def addTrainingSetEntry(self, data, target, convert):
    if convert:
        data = self.convertJsonToList(data)  # was a bare convertJsonToList(); it is a method
    featuresObject = Features(data)
    featuresObject.processFeatures()
    feature_vec = featuresObject.getFeatures()  # renamed from `input`, which shadows the builtin
    # Append the feature vector plus the target character's code point as one CSV row.
    with open('training_data.csv', 'a') as outfile:
        outfile.write(','.join(map(str, feature_vec)))
        outfile.write(',')
        outfile.write(str(ord(target)))
        outfile.write('\n')
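# Hypothetical usage sketch (not from the original source): how the two
# methods above might be driven together. `NetworkClassifier`, the JSON file
# name, and the target character are assumptions for illustration only.
net = NetworkClassifier()
with open('sample_gesture.json') as f:
    payload = f.read()
net.addTrainingSetEntry(payload, target='a', convert=True)  # append one labelled row
print(net.testNetwork(payload))                             # predict a character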
def test_bugs_reported_prior(self):
    objs = [{'creation_ts': 86400, 'something': 1},            # t+1
            {'creation_ts': (2 * 86400) - 1, 'something': 1},  # t+1
            {'creation_ts': 2 * 86400, 'something': 1},        # t+2
            {'creation_ts': 3 * 86400, 'something': 1}]        # t+3
    f = Features()
    f.vec = DictVectorizer()
    f.matrix = f.vec.fit_transform(objs).toarray()
    res = f.bugs_within(f.bugs_between, 1)
    expected = np.array([0, 1, 1, 0])
    self.assertTrue((res == expected).all())
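# Minimal sketch (an assumption, not the project's implementation) of the
# counting semantics that makes the test above pass: for each bug, count the
# bugs whose creation_ts falls strictly inside the preceding `days`-day window.
import numpy as np

def bugs_between(timestamps, days):
    ts = np.asarray(timestamps)
    window = days * 86400
    return np.array([((ts > t - window) & (ts < t)).sum() for t in ts])

print(bugs_between([86400, 2 * 86400 - 1, 2 * 86400, 3 * 86400], 1))  # [0 1 1 0]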
def prepare_test_data():
    test_data = []
    log.write("Open test set")
    with open(TESTING_FILE, "r") as csv_file:
        csv_data = csv.DictReader(csv_file)
        for row in csv_data:
            test_data.append(row)
    log.write("Preprocess test set")
    for data in test_data:
        data["preprocessed_kalimat"] = Preprocess(data).preprocess()
    log.write("Extract features from test set")
    feature = Features(test_data)
    feature.extract_feature()
    # feature.get_trainable_dataset()
    return test_data
def initialModelTraining(user_email, data_sources, stress_model):
    # First model initialisation, based on the first 14 days of data.
    from_time = 0  # from the very beginning of data collection
    data = grpc_handler.grpc_load_user_data(
        from_ts=from_time,
        uid=user_email,
        data_sources=data_sources,
        data_src_for_sleep_detection=Features.SCREEN_ON_OFF)
    features = Features(uid=user_email, dataset=data)
    df = pd.DataFrame(features.extract_for_after_survey())

    # Preprocess and persist the result.
    df_preprocessed = stress_model.preprocessing(df)
    with open('data_result/' + str(user_email) + "_features.p", 'wb') as file:
        pickle.dump(df_preprocessed, file)

    # Normalize.
    norm_df = stress_model.normalizing("default", df_preprocessed, None)

    # Initialize the model.
    stress_model.initModel(norm_df)
def main():
    log.write("Open dataset")
    dataset = open_dataset([TRAINING_DIR + file_name for file_name in TRAINING_FILES])
    merged_dataset = []

    log.write("Resolve disagreement data")
    for k, v in dataset.items():
        if k in (TRAINING_DIR + TRAINING_FILES[2], TRAINING_DIR + TRAINING_FILES[4]):
            dataset[k] = disagreement_handling(v)
        merged_dataset += dataset[k]
    analyze_data(merged_dataset)

    log.write("Analyzing sense")
    sense_id = set()
    for datum in merged_dataset:
        sense_id.add(datum["sense"])
    xml_root = ET.parse(SENSE_FILES).getroot()
    for word in xml_root:
        for sense in word.findall("senses/sense"):
            if word.attrib["wid"].zfill(2) + sense.attrib["sid"].zfill(2) not in sense_id:
                log.write("Word `{}` with sense `{}` was not found in the training data"
                          .format(word[0].text, sense.attrib))

    log.write("Preprocessing")
    for data in merged_dataset:
        data["preprocessed_kalimat"] = Preprocess(data).preprocess()
    print(merged_dataset[0])

    log.write("Feature extraction")
    feature = Features(merged_dataset)
    feature.extract_feature()
    # feature.get_trainable_dataset()

    with open("feature.csv", "w") as csv_file:
        csv_writer = csv.writer(csv_file)
        # csv_writer.writerow(["kalimat_id", "sense", "features"])
        for data in merged_dataset:
            if "data_embedding" in data:
                # The "\ufeff" prefix keeps the BOM that csv.DictReader leaves
                # in the first header of the training CSV.
                csv_writer.writerow(
                    [data["\ufeffkalimat_id"], data["kata"], data["sense"]]
                    + list(data["data_embedding"]))

    log.write("Build Dataset")
    word_feature_mat, dummy_train, dummy_test = build_dataset(merged_dataset)
    classifier = {
        "Random Forest": RandomForestClassifier(n_estimators=1000),
        "SVM": SVC(C=10000, gamma=0.1, tol=1e-6, decision_function_shape='ovo'),
        "Neural Net": MLPClassifier(hidden_layer_sizes=2000,
                                    activation='tanh',
                                    solver='adam',
                                    tol=1e-6,
                                    learning_rate_init=0.001,
                                    max_iter=1000,
                                    early_stopping=True)
    }
    best_model = None
    best_acc = 0.0000001
    test_data = prepare_test_data()
    for model_name, model_class in classifier.items():
        log.write("Try {} :".format(model_name))
        true_count = 0
        n_data = 0
        model = model_class
        ansfile = "answers/{}_{}.csv".format(model_name, int(time.time()))
        for word in sorted(list(word_feature_mat.keys())):
            print("predicting {}".format(word))
            # Score the model on the held-out split for this word.
            model.fit(dummy_train[word][0], dummy_train[word][1])
            prediction = model.predict(dummy_test[word][0])
            n_data += len(prediction)
            for pred, true in zip(prediction, dummy_test[word][1]):
                if pred == true:
                    true_count += 1
            # Refit on all data for this word before answering the test set.
            model = model.fit(word_feature_mat[word][0], word_feature_mat[word][1])
            actual_test(test_data, model, word, ansfile)
        accuracy = 100 * true_count / n_data
        # if accuracy > best_acc:
        #     best_model = model_class
        log.write("Accuracy of {}: {} %".format(model_name, accuracy))
]
test_dataframe = pd.read_csv('MealNoMealData/mealData3.csv', names=columns)
# print(test_dataframe)
# Keep only rows with at least 4 non-NA values. (The original looped over the
# rows calling dropna() without assigning the result, which was a no-op.)
test_dataframe = test_dataframe.dropna(thresh=4, axis=0)
print("test_data")
# print(test_dataframe)
test_dataframe = test_dataframe.interpolate(method='linear',
                                            limit_direction='backward')
print(test_dataframe)
# test_dataframe = test_dataframe.dropna()
# print(test_dataframe)

s = DataSetFormation()
f = Features(4)
data = f.completefeatures(test_dataframe)
data = s.normalizeData(data)
# data = s.applyPCA(data, 3)
data["Label"] = 1
print(data)

column = [
    'fft1', 'fft2', 'fft3', 'fft4', 'velocity1', 'velocity2', 'velocity3',
    'velocity4', 'rolling1', 'rolling2', 'rolling3', 'rolling4', 'dwt1',
    'dwt2', 'dwt3', 'dwt4'
]
column_p = ['pc1', 'pc2', 'pc3']
column_v = ['velocity1', 'velocity2', 'rolling2', 'rolling1']
value = loaded_model.predict(data[column_v])
print(value)
result = loaded_model.score(data[column_v], data['Label'])
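# `loaded_model` is not defined in this snippet. A plausible sketch (an
# assumption, based on the pickling used elsewhere in this codebase) is that
# it was deserialised from a trained-model file beforehand:
import pickle

with open('trained_model.pkl', 'rb') as model_file:  # hypothetical filename
    loaded_model = pickle.load(model_file)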
    'fft1', 'fft2', 'fft3', 'fft4', 'velocity1', 'velocity2', 'velocity3',
    'velocity4', 'rolling1', 'rolling2', 'dwt1', 'dwt2', 'dwt3', 'dwt4'
]
data = pd.DataFrame(extracted_features, columns=columns)
data = data.dropna()
print(data.head())
# Standardise the feature matrix, then restore the column labels.
data = StandardScaler().fit_transform(data.values)
data = pd.DataFrame(data, columns=columns)
return data


s = DataSetFormation()
s.read_csv()
s.createFeatureMatrixCGM()
mealFeatures = Features(4)
s.mealDataFrame.to_csv("myMealData.csv")
noMealFeatures = Features(4)
s.noMealDataFrame.to_csv("myNoMealData.csv")
finalMealDataFrame = pd.read_csv("myMealData.csv")
finalNoMealDataFrame = pd.read_csv("myNoMealData.csv")

meal = mealFeatures.completefeatures(finalMealDataFrame)
print(meal)
print("Final Meal DataSet")
mealPrincipalComponentDataFrame = s.normalizeData(meal)

nomeal = noMealFeatures.completefeatures(finalNoMealDataFrame)
print(nomeal)
print("Here", mealPrincipalComponentDataFrame)
mealPrincipalComponentDataFrame['Label'] = 1
print("Final NoMeal DataSet")
noMealPrincipalComponentDataFrame = s.normalizeData(nomeal)
    'rolling4', 'expwindow1', 'expwindow2', 'expwindow3', 'expwindow4',
    'dwt1', 'dwt2', 'dwt3', 'dwt4'
]
data = pd.DataFrame(extracted_features, columns=columns)
data = data.dropna()
print(data.head())
data = MinMaxScaler().fit_transform(data.values)
data = pd.DataFrame(data, columns=columns)
self.applyPCA(data, 5, person, 'PCA')


print("""----------------------------------------|
|          Enter a Person Number          |
|                                         |
|-----------------------------------------|""")
n = input()
directoryPath = os.getcwd()
access_right = 0o777
try:
    if not os.path.isdir(directoryPath + '/Person' + str(n)):
        os.mkdir(directoryPath + '/Person' + str(n), access_right)
except OSError:
    print('Directory not created')

s = DataSetFormation(int(n))
s.plotCGMData(int(n))
b = Features(4, s.CGMData)
final_extracted_feature_matrix = b.completefeatures(int(n))
df = pd.DataFrame(final_extracted_feature_matrix)
df.to_csv('FeaturesExtracted.csv')
s.normalizeData(final_extracted_feature_matrix, n)
def prediction_task(i):
    global grpc_handler
    print("Prediction task for {} is running... ".format(prediction_times[i]))
    grpc_handler = GrpcHandler('165.246.21.202:50051', manager_id,
                               manager_email, campaign_id)
    now_time = int(datetime.datetime.now().timestamp()) * 1000
    from_time = now_time - (4 * 3600 * 1000)  # from 4 hours before now
    users_info = grpc_handler.grpc_load_user_emails()
    ema_order = i + 1
    data_sources = grpc_handler.grpc_get_data_sources_info()

    for user_email, id_day in users_info.items():
        user_id = id_day['uid']
        day_num = id_day['dayNum']
        sm = StressModel(uid=user_email, dayNo=day_num, emaNo=ema_order)

        # 0. Check the user's day number: only extract features and use the
        #    model once more than `survey_duration` (14) days of data exist.
        if day_num > survey_duration:
            # On the first day and first EMA order after the 14 days,
            # train the initial model.
            if day_num == survey_duration + 1 and ema_order == 1:
                initialModelTraining(user_email, data_sources, sm)
            else:
                # 1. Retrieve all user data between from_time and now_time
                #    from the gRPC server.
                data = grpc_handler.grpc_load_user_data(
                    from_ts=from_time,
                    uid=user_email,
                    data_sources=data_sources,
                    data_src_for_sleep_detection=Features.SCREEN_ON_OFF)

                # 2. Extract features from the retrieved data.
                with open('data_result/' + str(user_email) + "_features.p",
                          'rb') as file:
                    step1_preprocessed = pickle.load(file)
                features = Features(uid=user_email, dataset=data)
                df = pd.DataFrame(
                    features.extract_regular(start_ts=from_time,
                                             end_ts=now_time,
                                             ema_order=ema_order))

                # 3. Pre-process and normalize the extracted features.
                new_row_preprocessed = sm.preprocessing(df)
                norm_df = sm.normalizing("new", step1_preprocessed,
                                         new_row_preprocessed)

                # 4. Get the test row for this day/EMA order and load the
                #    trained model.
                new_row_for_test = norm_df[(norm_df['Day'] == day_num)
                                           & (norm_df['EMA order'] == ema_order)]
                with open('model_result/' + str(user_email) + "_model.p",
                          'rb') as file:
                    initModel = pickle.load(file)

                # 5. Make a prediction from the current features with that model.
                features = StressModel.feature_df_with_state['features'].values
                y_pred = initModel.predict(new_row_for_test[features])
                new_row_preprocessed['Stress_label'] = y_pred  # was misspelled 'Sterss_label'

                # 6. Save the current features with the predicted label:
                #    append the new pre-processed row to the stored data.
                update_df = pd.concat([
                    step1_preprocessed.reset_index(drop=True),
                    new_row_preprocessed.reset_index(drop=True)
                ])
                with open('data_result/' + str(user_email) + "_features.p",
                          'wb') as file:
                    pickle.dump(update_df, file)

                # 7. Save the prediction in the DB and return it to the gRPC
                #    server with the "STRESS_PREDICTION" data source and a
                #    "day_num ema_order prediction_value" value.
                user_all_labels = list(set(step1_preprocessed['Stress_label']))
                # getSHAP saves its results in the ModelResult table in the DB.
                model_results = list(
                    sm.getSHAP(user_all_labels, y_pred, new_row_for_test,
                               initModel))

                # Construct a message from the model results and return it to
                # the gRPC server so the user can see the prediction.
                result_data = {}
                for model_result in model_results:
                    result_data[model_result.prediction_result] = {
                        "day_num": model_result.day_num,
                        "ema_order": model_result.ema_order,
                        "accuracy": model_result.accuracy,
                        "feature_ids": model_result.feature_ids
                    }
                grpc_handler.grpc_send_user_data(
                    user_id, user_email, data_sources['STRESS_PREDICTION'],
                    now_time, result_data)

                # 8. Check the 'SELF_STRESS_REPORT' data source: if the user
                #    submitted a self report, update the stored pre-processed
                #    features with the reported stress label and retrain the
                #    model if needed.
                check_and_handle_self_report(user_email, data, sm)
    grpc_handler.grpc_close()
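# Hypothetical scheduling sketch (not in the original source): one way
# prediction_task could be run at each entry of `prediction_times`, assuming
# they are "HH:MM" strings and using the third-party `schedule` library.
import schedule
import time

for idx, at_time in enumerate(prediction_times):
    schedule.every().day.at(at_time).do(prediction_task, idx)

while True:
    schedule.run_pending()
    time.sleep(60)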
# print(final_label_df)
self.finalDataFrame = pd.concat([final_df_features, final_label_df], axis=1)
df.to_csv("New.csv")
self.finalDataFrame.to_csv("FinalDataFrame.csv")
# db_default = DBSCAN(eps=0.375, min_samples=5).fit(finalPCADataFrame)
# create a matrix to check if the vl


s = DataSetFormation()
s.read_csv()
s.createFeatureMatrixCGM()
# Create a ground-truth table of 6 clusters:
# 0, >0 to 20, 21 to 40, 41 to 60, 61 to 80, 81 to 100.
s.createGroundTruth()
mealFeatures = Features(4)
features = s.getFeatures()
mealPrincipalComponentDataFrame = s.normalizeData(features)
print(len(mealPrincipalComponentDataFrame))
s.createDBSCANClusterFromFeatures(mealPrincipalComponentDataFrame)
# s.createDBSCANClusterFromFeaturesMax()
# s.SSEMetrics()
# X_train, X_test, y_train, y_test = train_test_split(
#     self.mealPrincipalComponentDataFrame, self.carbIntakeDataFrame,
#     test_size=0.33, random_state=42)
# print(X_train)
# s.createKMeansCluster(mealPrincipalComponentDataFrame)
print(len(mealPrincipalComponentDataFrame))
# s.plotPointCluster()
s.calculateAccuracy()
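# Minimal sketch (an assumption, not DataSetFormation's actual code) of what
# createDBSCANClusterFromFeatures might do, based on the eps/min_samples
# values in the commented-out DBSCAN line above.
from sklearn.cluster import DBSCAN

def create_dbscan_cluster(feature_df):
    # Fit DBSCAN on the normalized feature matrix; labels_ holds one cluster
    # id per row, with -1 marking noise points.
    db = DBSCAN(eps=0.375, min_samples=5).fit(feature_df)
    return db.labels_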
def __call__(self, parser, namespace, values, option_string=None):
    from feature_extraction import Features
    Features.header(sys.stdout)
    parser.exit()
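# Hypothetical registration sketch (the class and flag names are assumptions):
# the __call__ above belongs to a custom argparse.Action that prints the
# feature header and exits, wired up along these lines:
parser.add_argument('--header', nargs=0, action=HeaderAction,
                    help='print the feature CSV header and exit')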