def main():
    # Load the preprocessed restaurant data and split it into training and
    # test sets
    df_business_restaurants = preprocessing.get_preprocessed_data()
    training, test = validation.test_trainsplit(df_business_restaurants)

    # Train the Naive Bayes model: per-dimension frequency counts plus the
    # prior distribution over star ratings
    dim_freq_map, prior_of_stars = trainNB(training)

    # Evaluate with review_star_rating included as a feature
    selected_columns = [
        'review_count', 'city', 'review_sentiment_rating',
        'review_star_rating', 'tip_rating', 'checkin_rating'
    ]
    accuracy, dist, offcount = testNB(test, dim_freq_map, selected_columns,
                                      prior_of_stars)
    print("With review_star_rating, rounded stars -- accuracy, dist, offcount:",
          accuracy, dist, offcount)

    # Evaluate again without review_star_rating
    selected_columns = [
        'review_count', 'city', 'review_sentiment_rating', 'tip_rating',
        'checkin_rating'
    ]
    accuracy, dist, offcount = testNB(test, dim_freq_map, selected_columns,
                                      prior_of_stars)
    print("Without review_star_rating, rounded stars -- accuracy, dist, offcount:",
          accuracy, dist, offcount)

    print("k-fold cross-validation results:",
          k_fold_crossvalidation(df_business_restaurants))
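
# k_fold_crossvalidation is called above but not shown. A minimal sketch of
# how such a routine could work, reusing the trainNB/testNB interfaces from
# main(); the shuffling, fold splitting, score averaging, and the parameter
# k are assumptions for illustration, not the project's actual code.
import numpy as np
import pandas as pd

def k_fold_crossvalidation_sketch(df, selected_columns, k=5):
    # Shuffle once, then split the frame into k roughly equal folds
    folds = np.array_split(df.sample(frac=1, random_state=0), k)
    scores = []
    for i in range(k):
        test = folds[i]
        training = pd.concat(folds[:i] + folds[i + 1:])
        dim_freq_map, prior_of_stars = trainNB(training)
        scores.append(testNB(test, dim_freq_map, selected_columns,
                             prior_of_stars))
    # Average accuracy, distance, and off-count across the k folds
    return [sum(s[j] for s in scores) / k for j in range(3)]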
def main():
    # Class-negative data: healthy controls (MS_A_1 .. MS_A_7)
    negative_data = []
    for i in range(1, 8):
        negative_data += data_parser.parse(
            'Datasets/Healthy Controls/MS_A_{}.mzml'.format(i))

    # Class-positive data: PC-diagnosed samples (MS_B_1 .. MS_B_7)
    positive_data = []
    for i in range(1, 8):
        positive_data += data_parser.parse(
            'Datasets/PC Diagnosed/MS_B_{}.mzml'.format(i))

    full_data = negative_data + positive_data

    param = []  # no optional pre-processing steps selected
    data = preprocessing.get_preprocessed_data(full_data, param)

    # train_test_model(data, param)
    cross_validate(data, param)
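
# The empty param list above runs the pipeline with no optional
# pre-processing. The GUI code later in this section builds the same kind of
# list from checkbox values, so a scripted run could enable steps directly;
# the bin count '64' below is a hypothetical value for illustration.
param = ['bl_reduction', 'smoothing', 'sfs', 'min_max',
         'data_reduction', 'number_of_bins', '64', 'peak_alignment']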
def main(): """ The main script. Runs all of the above functions. and saves in the result folder all of the accuracies and predictions for each ML algorithm. To plot any the following function --> set if_plot to True in each of the function arguments.""" # Get the preprocesed data x_dataA, y_labelA, framerate_A = get_preprocessed_data('A', N=2, factor=5) x_dataB, y_labelB, framerate_B = get_preprocessed_data('B', N=2, factor=1) # Get the testing data x_testdataA = get_test_data('A') x_testdataA = np.array([x[:len(x_dataA[0])] for x in x_testdataA]) stdS1_A, stdS1_testA, stdS2_A, stdS2_testA, meanS1_freqA, meanS2_freqA, stdS1_freqA, stdS2_freqA = set_A( x_dataA, x_testdataA, y_labelA) stdS1_B, stdS2_B, meanS1_freqB, meanS2_freqB, stdS1_freqB, stdS2_freqB = set_B( x_dataB, y_labelB) # Opted to use Multi-thread to speed process up twrv1 = ThreadWithReturnValue(target=zero_crossing, args=(x_dataA, x_dataB, x_testdataA, y_labelA, y_labelB)) twrv2 = ThreadWithReturnValue(target=signal_energy_frame, args=(x_dataA, x_dataB, x_testdataA, y_labelA, y_labelB)) twrv3 = ThreadWithReturnValue(target=entropy_of_energy, args=(x_dataA, x_dataB, x_testdataA, y_labelA, y_labelB)) twrv4 = ThreadWithReturnValue(target=frequency_domain, args=(x_dataA, x_dataB, x_testdataA)) twrv1.start() twrv2.start() twrv3.start() twrv4.start() [zero_crossingsA, zero_crossingsB, zc_testA] = twrv1.join() [energyA, energyB, ener_testA] = twrv2.join() [entropyA, entropyB, entr_testA] = twrv3.join() [X_dataA, X_dataB, X_testdataA] = twrv4.join() twrv5 = ThreadWithReturnValue(target=spectral_entropy, args=(X_dataA, X_dataB, X_testdataA, y_labelA, y_labelB)) twrv6 = ThreadWithReturnValue(target=spectral_flux, args=(X_dataA, X_dataB, X_testdataA, y_labelA, y_labelB)) twrv7 = ThreadWithReturnValue(target=spectral_centroid_frame, args=(X_dataA, X_dataB, X_testdataA, y_labelA, y_labelB, framerate_A, framerate_B)) twrv8 = ThreadWithReturnValue(target=get_mcfcc_feat, args=(x_dataA, x_dataB, x_testdataA, framerate_A, framerate_B)) twrv5.start() twrv6.start() twrv7.start() twrv8.start() [entropy_freqA, entropy_freqB, entr_freqtestA] = twrv5.join() [fluxA, fluxB, flux_testA] = twrv6.join() [centroidA, centroidB, cent_testA] = twrv7.join() [mfccA_feat, mfccB_feat, mfcctestA_feat] = twrv8.join() # features to use # zero_crossings, energy, entropy, entropy frequency, flux, spread, mfcc #fluxA, flux_testA, fluxB x_utrainA = np.column_stack( (zero_crossingsA, energyA, entropyA, entropy_freqA, fluxA, centroidA[:, 1], mfccA_feat, stdS1_A, stdS2_A, stdS1_freqA, meanS1_freqA, stdS2_freqA, meanS2_freqA)) x_utestA = np.column_stack( (zc_testA, ener_testA, entr_testA, entr_freqtestA, flux_testA, cent_testA[:, 1], mfcctestA_feat, stdS1_testA, stdS2_testA)) x_utrainB = np.column_stack( (zero_crossingsB, energyB, entropyB, entropy_freqB, fluxB, centroidB[:, 1], mfccB_feat, stdS1_B, stdS2_B, stdS1_freqB, meanS1_freqB, stdS2_freqB, meanS2_freqB)) le = preprocessing.LabelEncoder() y_utrainA = le.fit_transform( y_labelA) # 0 - artifact, 1 - extrahls, 2 - murmur, 3 - normal y_utrainB = le.fit_transform(y_labelB) # shuffle data x_trainA, y_trainA = shuffle(x_utrainA, y_utrainA, random_state=3) x_trainB, y_trainB = shuffle(x_utrainB, y_utrainB, random_state=2) # split kfold data kf = KFold(n_splits=4, shuffle=False, random_state=0) # priors of each class priorsA = [i / len(x_trainA) for i in np.bincount(y_trainA)] priorsB = [i / len(x_trainB) for i in np.bincount(y_trainB)] twrv9 = ThreadWithReturnValue(target=run_model_A, args=('GaussianNB', kf, x_trainA, 
y_trainA, priorsA)) twrv10 = ThreadWithReturnValue(target=run_model_A, args=('AdaBoostClassifier', kf, x_trainA, y_trainA, priorsA)) twrv11 = ThreadWithReturnValue(target=run_model_A, args=('SVM', kf, x_trainA, y_trainA, priorsA)) twrv12 = ThreadWithReturnValue(target=run_model_A, args=('DecisionTreeClassifier', kf, x_trainA, y_trainA, priorsA)) twrv13 = ThreadWithReturnValue(target=run_model_A, args=('RandomForestClassifier', kf, x_trainA, y_trainA, priorsA)) twrv14 = ThreadWithReturnValue(target=run_model_A, args=('GradientBoostingClassifier', kf, x_trainA, y_trainA, priorsA)) twrv15 = ThreadWithReturnValue(target=run_model_B, args=('GaussianNB', kf, x_trainB, y_trainB, priorsB)) twrv16 = ThreadWithReturnValue(target=run_model_B, args=('AdaBoostClassifier', kf, x_trainB, y_trainB, priorsB)) twrv17 = ThreadWithReturnValue(target=run_model_B, args=('SVM', kf, x_trainB, y_trainB, priorsB)) twrv18 = ThreadWithReturnValue(target=run_model_B, args=('DecisionTreeClassifier', kf, x_trainB, y_trainB, priorsB)) twrv19 = ThreadWithReturnValue(target=run_model_B, args=('RandomForestClassifier', kf, x_trainB, y_trainB, priorsB)) twrv20 = ThreadWithReturnValue(target=run_model_B, args=('GradientBoostingClassifier', kf, x_trainB, y_trainB, priorsB)) twrv9.start() twrv10.start() twrv11.start() twrv12.start() twrv13.start() twrv14.start() twrv15.start() twrv16.start() twrv17.start() twrv18.start() twrv19.start() twrv20.start() twrv9.join() twrv10.join() twrv11.join() twrv12.join() twrv13.join() twrv14.join() twrv15.join() twrv16.join() twrv17.join() twrv18.join() twrv19.join() twrv20.join()
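
# ThreadWithReturnValue is used throughout this script but defined elsewhere.
# The usage pattern (join() handing back the target's result) matches the
# common threading.Thread recipe below; a minimal sketch, not necessarily the
# project's exact implementation.
import threading

class ThreadWithReturnValue(threading.Thread):
    def __init__(self, target=None, args=(), kwargs=None):
        super().__init__(target=target, args=args, kwargs=kwargs or {})
        self._return = None

    def run(self):
        # Capture the target's return value instead of discarding it
        if self._target is not None:
            self._return = self._target(*self._args, **self._kwargs)

    def join(self, timeout=None):
        # Wait as usual, then hand back the captured result
        super().join(timeout)
        return self._return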
import classifiers_ioana as c_i
import corpus
import data_models
import preprocessing
import testing

if __name__ == "__main__":
    # Gather news from websites and separate them into json files
    # scraper.scrape_data()

    # Get the true-news and fake-news corpora
    true_news_corpus, fake_news_corpus = corpus.get_corpus()
    data_models.get_corpus_word_count(true_news_corpus, fake_news_corpus)

    # Preprocess the data and add it to json files
    # preprocessing.preprocess_data(true_news_corpus, fake_news_corpus)

    # Get the preprocessed data
    true_pre_data, fake_pre_data = preprocessing.get_preprocessed_data()
    data_models.get_processed_data_word_count(true_pre_data, fake_pre_data)

    # Merge the labeled data
    merged_labeled_data = preprocessing.merge_news(true_pre_data,
                                                   fake_pre_data)

    # Get the word frequency
    word_frequency = preprocessing.get_word_frequency(merged_labeled_data)

    # Get the vocabulary
    vocabulary = preprocessing.get_vocabulary(merged_labeled_data,
                                              word_frequency)

    # Test something
    testing.test_classifier(
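
# get_word_frequency and get_vocabulary are not shown here. A minimal sketch
# of what such helpers typically compute, assuming merged_labeled_data is a
# list of (tokens, label) records; the record layout and the min_count
# cutoff are assumptions for illustration.
from collections import Counter

def get_word_frequency_sketch(merged_labeled_data):
    word_frequency = Counter()
    for tokens, _label in merged_labeled_data:
        word_frequency.update(tokens)
    return word_frequency

def get_vocabulary_sketch(merged_labeled_data, word_frequency, min_count=2):
    # Keep only tokens frequent enough to be useful features
    return sorted(word for word, count in word_frequency.items()
                  if count >= min_count)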
def show_user_interface(window, user_choice):
    curr_spectrum = 0
    spectra = []
    plot_final = None
    final_compounds_list = ''
    prediction = ''
    confidence = ''
    classifier = None  # set once the admin has trained a model

    def update_researcher_panel(spectrum):
        # Shared by the 'proceed' handler and spectra navigation: list the
        # most abundant compounds and refresh the prediction fields for the
        # spectrum currently on display
        nonlocal final_compounds_list, prediction, confidence

        # List the 20 most abundant m/z values
        abundant_intensity = heapq.nlargest(20, spectrum[1])
        abundant_mz = [spectrum[0][i] for i in range(len(spectrum[0]))
                       if spectrum[1][i] in abundant_intensity]
        final_mz_list = [round(float(mz), 2) for mz in abundant_mz]

        # Placeholder prediction values
        prediction = 'Negative'
        import random
        confidence = str(random.randint(52, 96)) + '%'

        # Look up candidate compounds and de-duplicate, preserving order
        compound_list = chemCompoundsDB.list_chem_compounds(final_mz_list)
        formatted_compound_list = [row[0] for row in compound_list]
        formatted_compound_list = list(dict.fromkeys(formatted_compound_list))
        formatted_compound_list = '- ' + '\n\n- '.join(formatted_compound_list)
        window.FindElement('chem_compounds').Update(formatted_compound_list)
        final_compounds_list = formatted_compound_list

        window.FindElement('prediction').Update(prediction)
        window.FindElement('prediction_confidence').Update(confidence)

    def display_ms_data(spectrum):
        # Plot the spectrum and refresh the numerical data table
        nonlocal plot_final
        plot_figure = plot.plot_spectrum(spectrum[0], spectrum[1])
        plot_final = plot.draw_figure(
            window.FindElement('plot_canvas').TKCanvas, plot_figure)
        window.FindElement('ms_data_table').Update(
            make_table(spectrum[0], spectrum[1], spectrum[2])[1:])
        if user_choice == 'researcher':
            update_researcher_panel(spectrum)

    while True:  # Event loop
        main_event, main_values = window.Read()

        if main_event is None or main_event == 'Exit':
            exit_window()
            break

        if main_event == 'User\'s Manual':
            window.SetAlpha(0.92)
            user_manual()
            window.SetAlpha(1)
            continue

        # Collect the chosen pre-processing parameters
        preproc_param = []
        if main_values['bl_reduction']:
            preproc_param.append('bl_reduction')
        if main_values['smoothing']:
            preproc_param.append('smoothing')
        if main_values['sfs']:
            preproc_param.append('sfs')
        if main_values['min_max']:
            preproc_param.append('min_max')
        if main_values['z_score']:
            preproc_param.append('z_score')
        if main_values['data_reduction']:
            preproc_param.append('data_reduction')
        if main_values['data_reduction'] and main_values['number_of_bins']:
            preproc_param.append('number_of_bins')
            preproc_param.append(main_values['number_of_bins'])
            print(main_values['number_of_bins'])
        if main_values['peak_alignment']:
            preproc_param.append('peak_alignment')

        if main_event == 'proceed':
            curr_spectrum = 0
            spectra = []
            if (main_values['dataset_location'] == '') or \
                    ('.mzML' not in main_values['dataset_location']):
                sg.PopupTimed('Invalid Input!', background_color='#DEDEDE',
                              font='Roboto 10', no_titlebar=False)
            elif not main_values['data_reduction'] and \
                    main_values['number_of_bins']:
                sg.PopupTimed('Binning not enabled!',
                              background_color='#DEDEDE', font='Roboto 10',
                              no_titlebar=False)
            elif '.' in main_values['number_of_bins']:
                sg.PopupTimed('Please enter an integer!',
                              background_color='#DEDEDE', font='Roboto 10',
                              no_titlebar=False)
            else:
                # Get the dataset location and parse the data
                dataset_location = main_values['dataset_location']
                parsed_spectra = data_parser.parse(dataset_location)

                # Pre-process the MS data
                spectra, used_pa, dupli_exists = \
                    preprocessing.get_preprocessed_data(parsed_spectra,
                                                        preproc_param)

                # Inform the user about spectrum duplicates
                if used_pa and dupli_exists:
                    sg.PopupTimed('Duplicate spectrum found. Spectrum is removed.',
                                  background_color='#DEDEDE',
                                  font='Roboto 10', no_titlebar=False)
                elif used_pa and not dupli_exists:
                    sg.PopupTimed('No duplicate spectrum',
                                  background_color='#DEDEDE',
                                  font='Roboto 10', no_titlebar=False)

                # Display the plot, the numerical data, and (for
                # researchers) the compound list for the first spectrum
                display_ms_data(spectra[0])

                sg.PopupTimed('Processing Finished!',
                              background_color='#DEDEDE', font='Roboto 10',
                              no_titlebar=False)

        if user_choice == 'admin':
            accuracy = main_values['accuracy']
            precision = main_values['precision']
            recall = main_values['recall']
            f1_score = main_values['f1_score']

            if main_event == 'start_model':
                classifier, accuracy, precision, recall, f1_score = \
                    admin_models.train_test_model(spectra)
                sg.PopupTimed('Model Finished!', background_color='#DEDEDE',
                              font='Roboto 10', no_titlebar=False)
                window.FindElement('accuracy').Update(accuracy)
                window.FindElement('precision').Update(precision)
                window.FindElement('recall').Update(recall)
                window.FindElement('f1_score').Update(f1_score)

            if main_event == 'save_model':
                # Also rejects saving before a model has been trained
                if (not main_values['model_location']) or \
                        (not main_values['model_name']) or \
                        ('/' not in main_values['model_location']) or \
                        (classifier is None):
                    sg.PopupTimed('Invalid Input!',
                                  background_color='#DEDEDE',
                                  font='Roboto 10', no_titlebar=False)
                else:
                    model_location = main_values['model_location']
                    model_name = main_values['model_name']
                    admin_models.save_model(classifier, model_location,
                                            model_name)
                    sg.PopupTimed('Model Saved!', background_color='#DEDEDE',
                                  font='Roboto 10', no_titlebar=False)

        # Spectra navigation
        if spectra and (main_event == 'ms_number_go') \
                and main_values['ms_number'].isdigit() \
                and (int(main_values['ms_number']) > 0) \
                and (int(main_values['ms_number']) < len(spectra)):
            curr_spectrum = int(main_values['ms_number']) - 1
            display_ms_data(spectra[curr_spectrum])
        if spectra and (main_event == 'spectrum_prev') and \
                (curr_spectrum != 0):
            curr_spectrum -= 1
            display_ms_data(spectra[curr_spectrum])
        if spectra and (main_event == 'spectrum_next') and \
                (curr_spectrum != len(spectra) - 1):
            curr_spectrum += 1
            display_ms_data(spectra[curr_spectrum])

        if main_event == 'reset':
            curr_spectrum = 0
            spectra = []
            window.FindElement('dataset_location').Update('')
            window.FindElement('bl_reduction').Update(value=False)
            window.FindElement('smoothing').Update(value=False)
            window.FindElement('sfs').Update(value=False)
            window.FindElement('min_max').Update(value=False)
            window.FindElement('z_score').Update(value=False)
            window.FindElement('data_reduction').Update(value=False)
            window.FindElement('peak_alignment').Update(value=False)
            window.FindElement('number_of_bins').Update(value='')
            window.FindElement('plot_canvas').TKCanvas.delete('all')
            window.FindElement('ms_data_table').Update('')
            if user_choice == 'researcher':
                window.FindElement('chem_compounds').Update(value='')
                window.FindElement('prediction').Update(value='')
                window.FindElement('prediction_confidence').Update(value='')
                window.FindElement('export_location').Update(value='')
                window.FindElement('export_name').Update(value='')
                window.FindElement('ms_number').Update(value='')
            if user_choice == 'admin':
                window.FindElement('model_name').Update(value='')
                window.FindElement('model_location').Update(value='')
                window.FindElement('accuracy').Update(value='')
                window.FindElement('precision').Update(value='')
                window.FindElement('recall').Update(value='')
                window.FindElement('f1_score').Update(value='')
            continue

        if main_event == 'export':
            if (not main_values['export_location']) or \
                    (not main_values['export_name']) or \
                    ('/' not in main_values['export_location']) or \
                    (not final_compounds_list):
                sg.PopupTimed('Invalid Input!', background_color='#DEDEDE',
                              font='Roboto 10', no_titlebar=False)
            else:
                if '.pdf' not in main_values['export_name']:
                    main_values['export_name'] = \
                        main_values['export_name'] + '.pdf'
                input_file = main_values['dataset_location']
                spectrum_no = curr_spectrum + 1
                location = main_values['export_location'].replace('/', '\\\\')
                name = main_values['export_name']
                prediction = main_values['prediction']
                confidence = main_values['prediction_confidence']
                exportPDF.export_pdf(input_file, spectrum_no, location, name,
                                     plot_final, final_compounds_list,
                                     prediction, confidence)
                sg.PopupTimed('PDF Export Finished!',
                              background_color='#DEDEDE', font='Roboto 10',
                              no_titlebar=False)

    window.Close()
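
# plot.draw_figure is called above but not shown. Embedding a Matplotlib
# figure in a PySimpleGUI Tk canvas is conventionally done with
# FigureCanvasTkAgg; a minimal sketch of such a helper, which may differ
# from the project's actual implementation.
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg

def draw_figure(canvas, figure):
    # canvas is the TKCanvas handle, e.g. the one obtained via
    # window.FindElement('plot_canvas').TKCanvas
    figure_canvas_agg = FigureCanvasTkAgg(figure, canvas)
    figure_canvas_agg.draw()
    figure_canvas_agg.get_tk_widget().pack(side='top', fill='both', expand=1)
    return figure_canvas_agg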