Example #1
    def get_data(
        self,
        dsn_database,
        dsn_hostname,
        dsn_port,
        dsn_protocol,
        dsn_uid,
        dsn_pwd,
        level,
    ):
        # Connect to DB2 with the given DSN settings, then preprocess the raw rows.
        preprocessor = preprocess.Preprocessor()
        raw_data = preprocessor.db2_connect(
            dsn_database, dsn_hostname, dsn_port, dsn_protocol, dsn_uid, dsn_pwd
        )
        data = preprocessor.data_preprocess(raw_data, level)
        return data
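
A minimal sketch of how this method might be called, assuming it lives on some loader class; the class name and every connection value below are placeholders, not taken from the original project.

    loader = DataLoader()  # hypothetical owning class
    data = loader.get_data(
        dsn_database="BLUDB",
        dsn_hostname="db2.example.com",
        dsn_port="50000",
        dsn_protocol="TCPIP",
        dsn_uid="db_user",
        dsn_pwd="db_password",
        level="daily",
    )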
Example #2
    def test(self, dataset):
        """Accepts a dataset for testing

        Calculates the probility of the test set against all categories and
        predicts the label of the test set
        """

        predictions = []
        for data in dataset:
            # Clean the test data entry
            cleaned_data = data_preprocess(data)
            # Compute the posterior probability of the entry
            post_prob = self.get_test_prob(cleaned_data)
            # Store the label of the entry into predictions
            predictions.append(self.classes[np.argmax(post_prob)])

        return np.array(predictions)
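
A short usage sketch for the method above; the classifier instance and the test data are illustrative assumptions, not part of the snippet.

    # Illustrative only: nb is assumed to be an already-trained classifier instance.
    predictions = nb.test(test_sentences)
    accuracy = np.mean(predictions == np.array(test_labels))
    print("accuracy: {:.3f}".format(accuracy))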
Example #3
    def __init__(self, filename, cleaning, max_vocab_size, update_embeds):
        # Read and clean the corpus, then convert reviews to index features.
        revs, word2idx = data_preprocess(filename, cleaning, max_vocab_size)
        data, label = feature_extraction_index(revs, word2idx)
        # Pretrained word-embedding matrix stored as plain text.
        word_emb_mat = np.loadtxt('w_emb_mat.txt')

        # data = normalization(data)
        X_train, X_dev, Y_train, Y_dev = train_test_split(data,
                                                          label,
                                                          test_size=0.2,
                                                          random_state=0)
        # print("X_train.shape: ", X_train.shape)
        self.data = X_train
        self.label = Y_train
        self.X_dev = X_dev
        self.Y_dev = Y_dev
        self.word2idx = word2idx
        self.embeddings = nn.Embedding.from_pretrained(
            torch.from_numpy(word_emb_mat), freeze=not update_embeds)
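
A sketch of how this constructor and the pretrained embeddings might be used together; the class name, file name, and the assumption that the index features form a fixed-length integer array are all illustrative.

    # Hypothetical usage; SentimentData and reviews.csv are illustrative names.
    ds = SentimentData('reviews.csv', cleaning=True, max_vocab_size=-1, update_embeds=False)
    indices = torch.tensor(ds.data[:4], dtype=torch.long)  # a small batch of index sequences
    vectors = ds.embeddings(indices)  # look up the pretrained embedding for each index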
Example #4
def transformation():
    """Do an inference on a single batch of data. In this sample server, we take data as CSV, convert
    it to a pandas data frame for internal use and then convert the predictions back to CSV (which really
    just means one prediction per line, since there's a single column.
    """
    data = None

    # Convert from CSV to pandas
    if flask.request.content_type == 'text/csv':
        data = flask.request.data.decode('utf-8')
        s = io.StringIO(data)
        data = pd.read_csv(s)
    else:
        return flask.Response(response='This predictor only supports CSV data',
                              status=415,
                              mimetype='text/plain')

    print('Invoked with {} records'.format(data.shape[0]))

    # Prep data
    prepped_data = data_preprocess(data)
    # Drop last_trip_date column
    prepped_data.drop(["last_trip_date"], axis=1, inplace=True)
    data_array = prepped_data.values
    print(data_array[0])

    # Do the prediction
    predictions = ScoringService.predict(data_array)

    # Convert from numpy back to CSV
    out = io.StringIO()
    pd.DataFrame({
        'results': predictions
    }).to_csv(out, header=False, index=False)
    result = out.getvalue()

    return flask.Response(response=result, status=200, mimetype='text/csv')
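
For context, a client could exercise this handler roughly as follows; the URL, route, and CSV columns are assumptions based on the SageMaker-style serving layout this function resembles.

    import requests

    csv_payload = "pickup_count,avg_rating,last_trip_date\n12,4.7,2015-06-01\n"
    resp = requests.post(
        "http://localhost:8080/invocations",  # hypothetical endpoint for this handler
        data=csv_payload,
        headers={"Content-Type": "text/csv"},
    )
    print(resp.status_code)
    print(resp.text)  # one prediction per line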
Example #5
    return parameters

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description='Building Interactive Intelligent Systems')
    parser.add_argument('-c','--clean', help='True to do data cleaning, default is False', action='store_true')
    parser.add_argument('-mv','--max_vocab', help='max vocab size predefined, no limit if set to -1', required=False, type=int, default=-1)
    parser.add_argument('-lr','--learning_rate', required=False, type=float, default=0.001)
    parser.add_argument('-i','--num_iter', required=False, type=int, default=1)
    parser.add_argument('-fn','--file_name', help='file name', required=False, default='myTest')
    args = vars(parser.parse_args())
    print(args)
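    # Example invocation (hypothetical; the script name and values are illustrative):
    #   python train_sentiment.py -c -mv 5000 -lr 0.01 -i 10 -fn myRun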

    print('[Read the data from twitter-sentiment-testset.csv...]')
    revs, word2idx = data_preprocess('./twitter-sentiment-testset.csv', args['clean'], int(args['max_vocab']))
    
    print('[Extract features from the read data...]')
    data, label = feature_extraction_bow(revs, word2idx)
    data = normalization(data)
    
    # shuffle data
    shuffle_idx = np.arange(len(data))
    np.random.shuffle(shuffle_idx)
    data = data[shuffle_idx]
    label = label[shuffle_idx]


    print('[Start training...]')
    X_train, X_dev, Y_train, Y_dev = train_test_split(data, label, test_size=0.2, random_state=0)
    parameters = model(X_train.T, Y_train.T, X_dev.T, Y_dev.T, args['file_name'], 
Example #6
        print('Bi-direction Matching is starting ----------')

        result = []
        t = trange(len(sentences))
        for i in t:
            if len(fmm[i]) > len(bmm[i]):  # prefer the segmentation with fewer tokens
                result.append(bmm[i])
            elif len(fmm[i]) < len(bmm[i]):
                result.append(fmm[i])
            elif fmm[i] == bmm[i]:
                result.append(fmm[i])
            else:
                # Token counts are equal but the results differ:
                # prefer the one with fewer single-character tokens.
                count_fmm = [len(s) for s in fmm[i]].count(1)
                count_bmm = [len(s) for s in bmm[i]].count(1)
                if count_fmm > count_bmm:
                    result.append(bmm[i])
                else:
                    result.append(fmm[i])

        self.result_evalutate(result)
        self.write_to_txt(result, 'result_bm.txt')
        print('Bi-direction Matching completed ----------')
        return result


if __name__ == "__main__":
    stop_words, words_set = pre.load_word_data()
    sentences, labels = pre.data_preprocess()
    fenci = Segment_Words(stop_words, words_set, labels)
    result = fenci.bi_direction_matching(sentences)
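
The selection rule in the loop above can also be written as a small standalone helper; the sketch below is for illustration only, and the function name and inputs are hypothetical rather than part of the original class.

    def choose_segmentation(fmm_tokens, bmm_tokens):
        # Prefer the segmentation with fewer tokens.
        if len(fmm_tokens) != len(bmm_tokens):
            return min(fmm_tokens, bmm_tokens, key=len)
        # Identical results: either one will do.
        if fmm_tokens == bmm_tokens:
            return fmm_tokens
        # Equal token counts but different results: prefer fewer single-character tokens.
        singles_fmm = sum(1 for tok in fmm_tokens if len(tok) == 1)
        singles_bmm = sum(1 for tok in bmm_tokens if len(tok) == 1)
        return bmm_tokens if singles_fmm > singles_bmm else fmm_tokens

    print(choose_segmentation(['研究', '生命'], ['研究生', '命']))  # ['研究', '生命']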
Example #7
# -*- coding: utf-8 -*-

import numpy as np
import pandas as pd
from preprocess import data_preprocess
from NaiveBayes import NaiveBayes

with open('a1_d3.txt', 'r') as raw_data:
    dataset = data_preprocess(raw_data)

split_dataset = np.array_split(dataset, 5)

accuracies = []
f_scores = []

for epoch in range(5):

    X_train = []
    X_test = []
    y_train = []
    y_test = []

    for i in range(5):
        if epoch == i:
            X_test = split_dataset[epoch]['Review'].values
            y_test = split_dataset[epoch]['Sentiment'].values
        else:
            X_train.append(pd.DataFrame(split_dataset[i]['Review'].values))
            y_train.append(pd.DataFrame(split_dataset[i]['Sentiment'].values))

    X_train = pd.concat(X_train)[0].values
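
The manual split above mirrors what sklearn.model_selection.KFold provides; the following sketch is shown only for comparison and assumes dataset is the DataFrame produced by data_preprocess, with Review and Sentiment columns as in the snippet.

    from sklearn.model_selection import KFold

    kf = KFold(n_splits=5)
    for train_idx, test_idx in kf.split(dataset):
        X_train = dataset.iloc[train_idx]['Review'].values
        y_train = dataset.iloc[train_idx]['Sentiment'].values
        X_test = dataset.iloc[test_idx]['Review'].values
        y_test = dataset.iloc[test_idx]['Sentiment'].values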
Example #8
def main():
    # Set seed for reproducibility
    np.random.seed(6969)

    # Preprocess
    do_smoothing = False
    do_subset = False
    do_snv = False
    do_normalize = False

    # Analysis
    inspection = 'processed'
    show_plots = True
    do_outlier_filtering = False
    do_linear_class = False
    do_nonlinear_class = False

    # Load data
    dir_path = '../datasets/indian_pines'
    data_file = 'indian_pines.mat'
    cali_file = 'calibration.mat'
    labels_file = 'indian_pines_gt.mat'
    dataloader = Dataloader(dir_path, data_file, cali_file, labels_file)

    # Create tables, resample and create test set
    X = dataloader.get_calibrated_samples()
    Y = dataloader.get_labels()
    W = dataloader.get_wave_lengths()
    X = create_table(X)
    Y = create_table(Y)
    X, Y = resample_dataset(X, Y, 4.0)
    X_train, Y_train, X_test, Y_test = create_test_set(X, Y, test_frac=0.30)

    # Smoothing, subset selection, SNV
    avg_window = 5
    # 0-38, 42-44, 48-53, 65-73, 84-86, 91-98, 120-144, 167-169, 172-220
    subset1 = np.arange(0, 38)
    subset2 = np.arange(42, 44)
    subset3 = np.arange(48, 53)
    subset4 = np.arange(65, 73)
    subset5 = np.arange(84, 86)
    subset6 = np.arange(91, 98)
    subset7 = np.arange(120, 144)
    subset8 = np.arange(167, 169)
    subset9 = np.arange(172, 220)
    subset_inds = np.concatenate((subset1, subset2, subset3, subset4, subset5,
                                  subset6, subset7, subset8, subset9))
    X_train, X_test = data_preprocess(X_train, X_test, avg_window, subset_inds,
                                      do_smoothing, do_subset, do_snv, False)
    #W = dataloader.get_wave_lengths(subset_inds)

    # Outlier detection
    if do_outlier_filtering:
        outliers = hotellings_t2(X_train,
                                 Y_train,
                                 0.05,
                                 True,
                                 False,
                                 fig_num=3,
                                 fig_size=(12, 6))
        outliers = np.take(outliers, [2])  # 95% CI: Total 45; 2
        inspect_outliers(W, X_train, outliers)
        X_train, Y_train = remove_outliers(X_train, Y_train, outliers)

    # Normalization
    if do_normalize:
        X_train = normalize(X_train)
        X_test = normalize(X_test)

    # PCA inspection
    if inspection == 'processed':
        data_inspection(W, X_train, Y_train, 17, 1, (12, 6), 'Raw Data',
                        'Wave lengths [nm]', 'Radiance [Wm^(-2)sr^(-1)]')
    elif inspection == 'pls':
        pls_inspection(X_train, Y_train, n_comps=8)
    elif inspection == 'pca':
        pca_inspection(X_train, Y_train, n_comps=8)
    elif inspection == 'kpca':
        kernel_pca_inspection(X_train, Y_train, 8, 'linear')

    # Linear classification
    if do_linear_class:
        linear_classification(X_train,
                              Y_train,
                              X_test,
                              Y_test,
                              n_folds=5,
                              n_comps_max=10,
                              threshold=0.90,
                              show_plots=show_plots,
                              fignum=2,
                              figsize=(8, 6),
                              normalize=False)

    # Non-linear classification
    if do_nonlinear_class:
        gamma_min = 1e-5  #1e-5
        gamma_max = 3e-1  #3e-1
        n_gammas = 30  #30
        gammas = np.linspace(gamma_min, gamma_max, n_gammas)
        best_gamma = svm_cross_validation(X_train,
                                          Y_train,
                                          n_folds=5,
                                          kernel='rbf',
                                          gammas=gammas,
                                          show_plots=show_plots)
        svm_classification(X_train,
                           Y_train,
                           X_test,
                           Y_test,
                           kernel='rbf',
                           gamma=best_gamma)
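
As an aside, the nine np.arange calls that build subset_inds could be generated directly from the band ranges listed in the comment; a compact equivalent sketch, assuming the same half-open index convention as above.

    band_ranges = [(0, 38), (42, 44), (48, 53), (65, 73), (84, 86),
                   (91, 98), (120, 144), (167, 169), (172, 220)]
    subset_inds = np.concatenate([np.arange(lo, hi) for lo, hi in band_ranges])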
Example #9
    def train(self, dataset, labels):
        """Accepts a dataset with the shape (l x d)
        where l is the number of classes and the labels with a shape of (l)

        Training function for the Naive Bayes Model
        Computes for the BoW for each class
        """

        self.dataset = dataset
        self.labels = labels

        if not isinstance(self.dataset, np.ndarray):
            self.dataset = np.array(self.dataset)

        if not isinstance(self.labels, np.ndarray):
            self.labels = np.array(self.labels)

        for cat_index, category in enumerate(self.classes):
            # get all data for that category
            all_cat_data = self.dataset[self.labels == category]

            # clean the gathered data
            cleaned_data = [
                data_preprocess(cat_data) for cat_data in all_cat_data
            ]

            cleaned_data = pd.DataFrame(data=cleaned_data)

            # construct the BoW for that category
            np.apply_along_axis(self.add_to_BoW, 1, cleaned_data, cat_index)

        if self.reduce:
            self.reduce_words()
        prob_classes = np.empty(self.classes.shape[0])
        all_words = []
        cat_word_counts = np.empty(self.classes.shape[0])

        for cat_index, category in enumerate(self.classes):
            # Compute the prior probability of the category, p(C)
            prob_classes[cat_index] = np.sum(self.labels == category) / float(
                self.labels.shape[0])

            # Compute the total count of all words in each category
            # count = list(self.bow_dicts[cat_index].values())
            # removed +1
            cat_word_counts[cat_index] = np.sum(
                np.array(list(
                    self.bow_dicts[cat_index].values()))) + self.alpha

            # Get all words for this category
            all_words += self.bow_dicts[cat_index].keys()

        # Construct the vocab for the training set
        self.vocab = np.unique(np.array(all_words))
        self.vocab_length = self.vocab.shape[0]

        # Get all denominators per category
        # removed + 1 from self.vocab_length
        demons = np.array([
            cat_word_counts[cat_index] + self.vocab_length + self.alpha
            for cat_index, category in enumerate(self.classes)
        ])

        # Compile the data into tuples
        self.cat_infos = [(self.bow_dicts[cat_index], prob_classes[cat_index],
                           demons[cat_index])
                          for cat_index, category in enumerate(self.classes)]

        self.cat_infos = np.array(self.cat_infos, dtype=object)
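
A minimal sketch of how this training routine might be driven end to end; the constructor arguments and the toy data are assumptions, since the class definition is not shown in the snippet.

    # Hypothetical usage; the constructor signature is assumed, not taken from the snippet.
    classes = np.array(['negative', 'positive'])
    nb = NaiveBayes(classes)
    nb.train(
        ["the food was terrible", "great value and friendly staff",
         "awful experience", "loved every minute"],
        ['negative', 'positive', 'negative', 'positive'],
    )
    print(nb.test(["the staff were great"]))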