def main():
    # Load the preprocessed restaurant data and split it into training and
    # test sets
    df_business_restaurants = preprocessing.get_preprocessed_data()
    training, test = validation.test_trainsplit(df_business_restaurants)

    # Train the Naive Bayes model: per-dimension frequency counts plus the
    # prior distribution over star ratings
    dim_freq_map, prior_of_stars = trainNB(training)

    # Evaluate with review_star_rating included as a feature
    selected_columns = [
        'review_count', 'city', 'review_sentiment_rating',
        'review_star_rating', 'tip_rating', 'checkin_rating'
    ]
    accuracy, dist, offcount = testNB(test, dim_freq_map, selected_columns,
                                      prior_of_stars)
    print("With review_star_rating, rounded stars -- accuracy, dist, offcount:",
          accuracy, dist, offcount)

    # Evaluate again without review_star_rating
    selected_columns = [
        'review_count', 'city', 'review_sentiment_rating', 'tip_rating',
        'checkin_rating'
    ]
    accuracy, dist, offcount = testNB(test, dim_freq_map, selected_columns,
                                      prior_of_stars)
    print("Without review_star_rating, rounded stars -- accuracy, dist, offcount:",
          accuracy, dist, offcount)

    print("k-fold cross-validation results:",
          k_fold_crossvalidation(df_business_restaurants))
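
# k_fold_crossvalidation is called above but not shown. A minimal sketch of
# how such a routine could work, reusing the trainNB/testNB interfaces from
# main(); the shuffling, fold splitting, score averaging, and the parameter
# k are assumptions for illustration, not the project's actual code.
import numpy as np
import pandas as pd

def k_fold_crossvalidation_sketch(df, selected_columns, k=5):
    # Shuffle once, then split the frame into k roughly equal folds
    folds = np.array_split(df.sample(frac=1, random_state=0), k)
    scores = []
    for i in range(k):
        test = folds[i]
        training = pd.concat(folds[:i] + folds[i + 1:])
        dim_freq_map, prior_of_stars = trainNB(training)
        scores.append(testNB(test, dim_freq_map, selected_columns,
                             prior_of_stars))
    # Average accuracy, distance, and off-count across the k folds
    return [sum(s[j] for s in scores) / k for j in range(3)]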
def main():
    # Class-negative data: healthy controls (MS_A_1 .. MS_A_7)
    negative_data = []
    for i in range(1, 8):
        negative_data += data_parser.parse(
            'Datasets/Healthy Controls/MS_A_{}.mzml'.format(i))

    # Class-positive data: PC-diagnosed samples (MS_B_1 .. MS_B_7)
    positive_data = []
    for i in range(1, 8):
        positive_data += data_parser.parse(
            'Datasets/PC Diagnosed/MS_B_{}.mzml'.format(i))

    full_data = negative_data + positive_data

    param = []  # no optional pre-processing steps selected
    data = preprocessing.get_preprocessed_data(full_data, param)

    # train_test_model(data, param)
    cross_validate(data, param)
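
# The empty param list above runs the pipeline with no optional
# pre-processing. The GUI code later in this section builds the same kind of
# list from checkbox values, so a scripted run could enable steps directly;
# the bin count '64' below is a hypothetical value for illustration.
param = ['bl_reduction', 'smoothing', 'sfs', 'min_max',
         'data_reduction', 'number_of_bins', '64', 'peak_alignment']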
def main(): """ The main script. Runs all of the above functions. and saves in the result folder all of the accuracies and predictions for each ML algorithm. To plot any the following function --> set if_plot to True in each of the function arguments.""" # Get the preprocesed data x_dataA, y_labelA, framerate_A = get_preprocessed_data('A', N=2, factor=5) x_dataB, y_labelB, framerate_B = get_preprocessed_data('B', N=2, factor=1) # Get the testing data x_testdataA = get_test_data('A') x_testdataA = np.array([x[:len(x_dataA[0])] for x in x_testdataA]) stdS1_A, stdS1_testA, stdS2_A, stdS2_testA, meanS1_freqA, meanS2_freqA, stdS1_freqA, stdS2_freqA = set_A( x_dataA, x_testdataA, y_labelA) stdS1_B, stdS2_B, meanS1_freqB, meanS2_freqB, stdS1_freqB, stdS2_freqB = set_B( x_dataB, y_labelB) # Opted to use Multi-thread to speed process up twrv1 = ThreadWithReturnValue(target=zero_crossing, args=(x_dataA, x_dataB, x_testdataA, y_labelA, y_labelB)) twrv2 = ThreadWithReturnValue(target=signal_energy_frame, args=(x_dataA, x_dataB, x_testdataA, y_labelA, y_labelB)) twrv3 = ThreadWithReturnValue(target=entropy_of_energy, args=(x_dataA, x_dataB, x_testdataA, y_labelA, y_labelB)) twrv4 = ThreadWithReturnValue(target=frequency_domain, args=(x_dataA, x_dataB, x_testdataA)) twrv1.start() twrv2.start() twrv3.start() twrv4.start() [zero_crossingsA, zero_crossingsB, zc_testA] = twrv1.join() [energyA, energyB, ener_testA] = twrv2.join() [entropyA, entropyB, entr_testA] = twrv3.join() [X_dataA, X_dataB, X_testdataA] = twrv4.join() twrv5 = ThreadWithReturnValue(target=spectral_entropy, args=(X_dataA, X_dataB, X_testdataA, y_labelA, y_labelB)) twrv6 = ThreadWithReturnValue(target=spectral_flux, args=(X_dataA, X_dataB, X_testdataA, y_labelA, y_labelB)) twrv7 = ThreadWithReturnValue(target=spectral_centroid_frame, args=(X_dataA, X_dataB, X_testdataA, y_labelA, y_labelB, framerate_A, framerate_B)) twrv8 = ThreadWithReturnValue(target=get_mcfcc_feat, args=(x_dataA, x_dataB, x_testdataA, framerate_A, framerate_B)) twrv5.start() twrv6.start() twrv7.start() twrv8.start() [entropy_freqA, entropy_freqB, entr_freqtestA] = twrv5.join() [fluxA, fluxB, flux_testA] = twrv6.join() [centroidA, centroidB, cent_testA] = twrv7.join() [mfccA_feat, mfccB_feat, mfcctestA_feat] = twrv8.join() # features to use # zero_crossings, energy, entropy, entropy frequency, flux, spread, mfcc #fluxA, flux_testA, fluxB x_utrainA = np.column_stack( (zero_crossingsA, energyA, entropyA, entropy_freqA, fluxA, centroidA[:, 1], mfccA_feat, stdS1_A, stdS2_A, stdS1_freqA, meanS1_freqA, stdS2_freqA, meanS2_freqA)) x_utestA = np.column_stack( (zc_testA, ener_testA, entr_testA, entr_freqtestA, flux_testA, cent_testA[:, 1], mfcctestA_feat, stdS1_testA, stdS2_testA)) x_utrainB = np.column_stack( (zero_crossingsB, energyB, entropyB, entropy_freqB, fluxB, centroidB[:, 1], mfccB_feat, stdS1_B, stdS2_B, stdS1_freqB, meanS1_freqB, stdS2_freqB, meanS2_freqB)) le = preprocessing.LabelEncoder() y_utrainA = le.fit_transform( y_labelA) # 0 - artifact, 1 - extrahls, 2 - murmur, 3 - normal y_utrainB = le.fit_transform(y_labelB) # shuffle data x_trainA, y_trainA = shuffle(x_utrainA, y_utrainA, random_state=3) x_trainB, y_trainB = shuffle(x_utrainB, y_utrainB, random_state=2) # split kfold data kf = KFold(n_splits=4, shuffle=False, random_state=0) # priors of each class priorsA = [i / len(x_trainA) for i in np.bincount(y_trainA)] priorsB = [i / len(x_trainB) for i in np.bincount(y_trainB)] twrv9 = ThreadWithReturnValue(target=run_model_A, args=('GaussianNB', kf, x_trainA, 
y_trainA, priorsA)) twrv10 = ThreadWithReturnValue(target=run_model_A, args=('AdaBoostClassifier', kf, x_trainA, y_trainA, priorsA)) twrv11 = ThreadWithReturnValue(target=run_model_A, args=('SVM', kf, x_trainA, y_trainA, priorsA)) twrv12 = ThreadWithReturnValue(target=run_model_A, args=('DecisionTreeClassifier', kf, x_trainA, y_trainA, priorsA)) twrv13 = ThreadWithReturnValue(target=run_model_A, args=('RandomForestClassifier', kf, x_trainA, y_trainA, priorsA)) twrv14 = ThreadWithReturnValue(target=run_model_A, args=('GradientBoostingClassifier', kf, x_trainA, y_trainA, priorsA)) twrv15 = ThreadWithReturnValue(target=run_model_B, args=('GaussianNB', kf, x_trainB, y_trainB, priorsB)) twrv16 = ThreadWithReturnValue(target=run_model_B, args=('AdaBoostClassifier', kf, x_trainB, y_trainB, priorsB)) twrv17 = ThreadWithReturnValue(target=run_model_B, args=('SVM', kf, x_trainB, y_trainB, priorsB)) twrv18 = ThreadWithReturnValue(target=run_model_B, args=('DecisionTreeClassifier', kf, x_trainB, y_trainB, priorsB)) twrv19 = ThreadWithReturnValue(target=run_model_B, args=('RandomForestClassifier', kf, x_trainB, y_trainB, priorsB)) twrv20 = ThreadWithReturnValue(target=run_model_B, args=('GradientBoostingClassifier', kf, x_trainB, y_trainB, priorsB)) twrv9.start() twrv10.start() twrv11.start() twrv12.start() twrv13.start() twrv14.start() twrv15.start() twrv16.start() twrv17.start() twrv18.start() twrv19.start() twrv20.start() twrv9.join() twrv10.join() twrv11.join() twrv12.join() twrv13.join() twrv14.join() twrv15.join() twrv16.join() twrv17.join() twrv18.join() twrv19.join() twrv20.join()
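
# ThreadWithReturnValue is used throughout this script but defined elsewhere.
# The usage pattern (join() handing back the target's result) matches the
# common threading.Thread recipe below; a minimal sketch, not necessarily the
# project's exact implementation.
import threading

class ThreadWithReturnValue(threading.Thread):
    def __init__(self, target=None, args=(), kwargs=None):
        super().__init__(target=target, args=args, kwargs=kwargs or {})
        self._return = None

    def run(self):
        # Capture the target's return value instead of discarding it
        if self._target is not None:
            self._return = self._target(*self._args, **self._kwargs)

    def join(self, timeout=None):
        # Wait as usual, then hand back the captured result
        super().join(timeout)
        return self._return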
import classifiers_ioana as c_i
import corpus
import data_models
import preprocessing
import testing

if __name__ == "__main__":
    # Gather news from websites and separate them into json files
    # scraper.scrape_data()

    # Get the true-news and fake-news corpora
    true_news_corpus, fake_news_corpus = corpus.get_corpus()
    data_models.get_corpus_word_count(true_news_corpus, fake_news_corpus)

    # Preprocess the data and add it to json files
    # preprocessing.preprocess_data(true_news_corpus, fake_news_corpus)

    # Get the preprocessed data
    true_pre_data, fake_pre_data = preprocessing.get_preprocessed_data()
    data_models.get_processed_data_word_count(true_pre_data, fake_pre_data)

    # Merge the labeled data
    merged_labeled_data = preprocessing.merge_news(true_pre_data,
                                                   fake_pre_data)

    # Get the word frequency
    word_frequency = preprocessing.get_word_frequency(merged_labeled_data)

    # Get the vocabulary
    vocabulary = preprocessing.get_vocabulary(merged_labeled_data,
                                              word_frequency)

    # Test something
    testing.test_classifier(
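
# get_word_frequency and get_vocabulary are not shown here. A minimal sketch
# of what such helpers typically compute, assuming merged_labeled_data is a
# list of (tokens, label) records; the record layout and the min_count
# cutoff are assumptions for illustration.
from collections import Counter

def get_word_frequency_sketch(merged_labeled_data):
    word_frequency = Counter()
    for tokens, _label in merged_labeled_data:
        word_frequency.update(tokens)
    return word_frequency

def get_vocabulary_sketch(merged_labeled_data, word_frequency, min_count=2):
    # Keep only tokens frequent enough to be useful features
    return sorted(word for word, count in word_frequency.items()
                  if count >= min_count)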
def show_user_interface(window, user_choice):
    curr_spectrum = 0
    spectra = []
    plot_final = None
    final_compounds_list = ''
    prediction = ''
    confidence = ''
    classifier = None  # set once the admin has trained a model

    def update_researcher_panel(spectrum):
        # Shared by the 'proceed' handler and spectra navigation: list the
        # most abundant compounds and refresh the prediction fields for the
        # spectrum currently on display
        nonlocal final_compounds_list, prediction, confidence

        # List the 20 most abundant m/z values
        abundant_intensity = heapq.nlargest(20, spectrum[1])
        abundant_mz = [spectrum[0][i] for i in range(len(spectrum[0]))
                       if spectrum[1][i] in abundant_intensity]
        final_mz_list = [round(float(mz), 2) for mz in abundant_mz]

        # Placeholder prediction values
        prediction = 'Negative'
        import random
        confidence = str(random.randint(52, 96)) + '%'

        # Look up candidate compounds and de-duplicate, preserving order
        compound_list = chemCompoundsDB.list_chem_compounds(final_mz_list)
        formatted_compound_list = [row[0] for row in compound_list]
        formatted_compound_list = list(dict.fromkeys(formatted_compound_list))
        formatted_compound_list = '- ' + '\n\n- '.join(formatted_compound_list)
        window.FindElement('chem_compounds').Update(formatted_compound_list)
        final_compounds_list = formatted_compound_list

        window.FindElement('prediction').Update(prediction)
        window.FindElement('prediction_confidence').Update(confidence)

    def display_ms_data(spectrum):
        # Plot the spectrum and refresh the numerical data table
        nonlocal plot_final
        plot_figure = plot.plot_spectrum(spectrum[0], spectrum[1])
        plot_final = plot.draw_figure(
            window.FindElement('plot_canvas').TKCanvas, plot_figure)
        window.FindElement('ms_data_table').Update(
            make_table(spectrum[0], spectrum[1], spectrum[2])[1:])
        if user_choice == 'researcher':
            update_researcher_panel(spectrum)

    while True:  # Event loop
        main_event, main_values = window.Read()

        if main_event is None or main_event == 'Exit':
            exit_window()
            break

        if main_event == 'User\'s Manual':
            window.SetAlpha(0.92)
            user_manual()
            window.SetAlpha(1)
            continue

        # Collect the chosen pre-processing parameters
        preproc_param = []
        if main_values['bl_reduction']:
            preproc_param.append('bl_reduction')
        if main_values['smoothing']:
            preproc_param.append('smoothing')
        if main_values['sfs']:
            preproc_param.append('sfs')
        if main_values['min_max']:
            preproc_param.append('min_max')
        if main_values['z_score']:
            preproc_param.append('z_score')
        if main_values['data_reduction']:
            preproc_param.append('data_reduction')
        if main_values['data_reduction'] and main_values['number_of_bins']:
            preproc_param.append('number_of_bins')
            preproc_param.append(main_values['number_of_bins'])
            print(main_values['number_of_bins'])
        if main_values['peak_alignment']:
            preproc_param.append('peak_alignment')

        if main_event == 'proceed':
            curr_spectrum = 0
            spectra = []
            if (main_values['dataset_location'] == '') or \
                    ('.mzML' not in main_values['dataset_location']):
                sg.PopupTimed('Invalid Input!', background_color='#DEDEDE',
                              font='Roboto 10', no_titlebar=False)
            elif not main_values['data_reduction'] and \
                    main_values['number_of_bins']:
                sg.PopupTimed('Binning not enabled!',
                              background_color='#DEDEDE', font='Roboto 10',
                              no_titlebar=False)
            elif '.' in main_values['number_of_bins']:
                sg.PopupTimed('Please enter an integer!',
                              background_color='#DEDEDE', font='Roboto 10',
                              no_titlebar=False)
            else:
                # Get the dataset location and parse the data
                dataset_location = main_values['dataset_location']
                parsed_spectra = data_parser.parse(dataset_location)

                # Pre-process the MS data
                spectra, used_pa, dupli_exists = \
                    preprocessing.get_preprocessed_data(parsed_spectra,
                                                        preproc_param)

                # Inform the user about spectrum duplicates
                if used_pa and dupli_exists:
                    sg.PopupTimed('Duplicate spectrum found. Spectrum is removed.',
                                  background_color='#DEDEDE',
                                  font='Roboto 10', no_titlebar=False)
                elif used_pa and not dupli_exists:
                    sg.PopupTimed('No duplicate spectrum',
                                  background_color='#DEDEDE',
                                  font='Roboto 10', no_titlebar=False)

                # Display the plot, the numerical data, and (for
                # researchers) the compound list for the first spectrum
                display_ms_data(spectra[0])

                sg.PopupTimed('Processing Finished!',
                              background_color='#DEDEDE', font='Roboto 10',
                              no_titlebar=False)

        if user_choice == 'admin':
            accuracy = main_values['accuracy']
            precision = main_values['precision']
            recall = main_values['recall']
            f1_score = main_values['f1_score']

            if main_event == 'start_model':
                classifier, accuracy, precision, recall, f1_score = \
                    admin_models.train_test_model(spectra)
                sg.PopupTimed('Model Finished!', background_color='#DEDEDE',
                              font='Roboto 10', no_titlebar=False)
                window.FindElement('accuracy').Update(accuracy)
                window.FindElement('precision').Update(precision)
                window.FindElement('recall').Update(recall)
                window.FindElement('f1_score').Update(f1_score)

            if main_event == 'save_model':
                # Also rejects saving before a model has been trained
                if (not main_values['model_location']) or \
                        (not main_values['model_name']) or \
                        ('/' not in main_values['model_location']) or \
                        (classifier is None):
                    sg.PopupTimed('Invalid Input!',
                                  background_color='#DEDEDE',
                                  font='Roboto 10', no_titlebar=False)
                else:
                    model_location = main_values['model_location']
                    model_name = main_values['model_name']
                    admin_models.save_model(classifier, model_location,
                                            model_name)
                    sg.PopupTimed('Model Saved!', background_color='#DEDEDE',
                                  font='Roboto 10', no_titlebar=False)

        # Spectra navigation
        if spectra and (main_event == 'ms_number_go') \
                and main_values['ms_number'].isdigit() \
                and (int(main_values['ms_number']) > 0) \
                and (int(main_values['ms_number']) < len(spectra)):
            curr_spectrum = int(main_values['ms_number']) - 1
            display_ms_data(spectra[curr_spectrum])
        if spectra and (main_event == 'spectrum_prev') and \
                (curr_spectrum != 0):
            curr_spectrum -= 1
            display_ms_data(spectra[curr_spectrum])
        if spectra and (main_event == 'spectrum_next') and \
                (curr_spectrum != len(spectra) - 1):
            curr_spectrum += 1
            display_ms_data(spectra[curr_spectrum])

        if main_event == 'reset':
            curr_spectrum = 0
            spectra = []
            window.FindElement('dataset_location').Update('')
            window.FindElement('bl_reduction').Update(value=False)
            window.FindElement('smoothing').Update(value=False)
            window.FindElement('sfs').Update(value=False)
            window.FindElement('min_max').Update(value=False)
            window.FindElement('z_score').Update(value=False)
            window.FindElement('data_reduction').Update(value=False)
            window.FindElement('peak_alignment').Update(value=False)
            window.FindElement('number_of_bins').Update(value='')
            window.FindElement('plot_canvas').TKCanvas.delete('all')
            window.FindElement('ms_data_table').Update('')
            if user_choice == 'researcher':
                window.FindElement('chem_compounds').Update(value='')
                window.FindElement('prediction').Update(value='')
                window.FindElement('prediction_confidence').Update(value='')
                window.FindElement('export_location').Update(value='')
                window.FindElement('export_name').Update(value='')
                window.FindElement('ms_number').Update(value='')
            if user_choice == 'admin':
                window.FindElement('model_name').Update(value='')
                window.FindElement('model_location').Update(value='')
                window.FindElement('accuracy').Update(value='')
                window.FindElement('precision').Update(value='')
                window.FindElement('recall').Update(value='')
                window.FindElement('f1_score').Update(value='')
            continue

        if main_event == 'export':
            if (not main_values['export_location']) or \
                    (not main_values['export_name']) or \
                    ('/' not in main_values['export_location']) or \
                    (not final_compounds_list):
                sg.PopupTimed('Invalid Input!', background_color='#DEDEDE',
                              font='Roboto 10', no_titlebar=False)
            else:
                if '.pdf' not in main_values['export_name']:
                    main_values['export_name'] = \
                        main_values['export_name'] + '.pdf'
                input_file = main_values['dataset_location']
                spectrum_no = curr_spectrum + 1
                location = main_values['export_location'].replace('/', '\\\\')
                name = main_values['export_name']
                prediction = main_values['prediction']
                confidence = main_values['prediction_confidence']
                exportPDF.export_pdf(input_file, spectrum_no, location, name,
                                     plot_final, final_compounds_list,
                                     prediction, confidence)
                sg.PopupTimed('PDF Export Finished!',
                              background_color='#DEDEDE', font='Roboto 10',
                              no_titlebar=False)

    window.Close()
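
# plot.draw_figure is called above but not shown. Embedding a Matplotlib
# figure in a PySimpleGUI Tk canvas is conventionally done with
# FigureCanvasTkAgg; a minimal sketch of such a helper, which may differ
# from the project's actual implementation.
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg

def draw_figure(canvas, figure):
    # canvas is the TKCanvas handle, e.g. the one obtained via
    # window.FindElement('plot_canvas').TKCanvas
    figure_canvas_agg = FigureCanvasTkAgg(figure, canvas)
    figure_canvas_agg.draw()
    figure_canvas_agg.get_tk_widget().pack(side='top', fill='both', expand=1)
    return figure_canvas_agg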