def load_data(self, sample, x_axi_attr_index, y_axi_attr_index):
    """Load the credit-card dataset, reduce and discretize it, then simplify.

    Args:
        sample: sample selector forwarded to data_simplification.
        x_axi_attr_index: index of the attribute used for the x axis.
        y_axi_attr_index: index of the attribute used for the y axis.

    Returns:
        Whatever self.data_simplification returns for the discretized
        training split.
    """
    from Preprocessing import Preprocess
    from Postprocessing import Postprocess

    creditdata = Preprocess("default of credit card clients.xls")
    # load() is kept for its side effects; its returned raw splits were
    # never used by this method (dead bindings removed).
    creditdata.load()
    low_x_train, low_x_test, low_y_train, low_y_test = \
        creditdata.dimension_decrease()
    postp = Postprocess(low_x_train, low_x_test, low_y_train, low_y_test)
    # Only the training split of the improved data is consumed here.
    x_train, _x_test, y_train, _y_test = postp.improve_data()
    return self.data_simplification(x_train, y_train, sample,
                                    x_axi_attr_index, y_axi_attr_index)
def __init__(self):
    """Load the credit-card workbook, reduce and discretize it, then build
    the classifier/processor lists and open the execution log."""
    self.classifier = []
    self.processor = []
    self.result = []

    creditdata = Preprocess("default of credit card clients.xls")
    self.raw_X_train, self.raw_X_test, self.raw_Y_train, self.raw_Y_test = \
        creditdata.load()
    (self.low_dim_X_train, self.low_dim_X_test,
     self.low_dim_Y_train, self.low_dim_Y_test) = creditdata.dimension_decrease()

    # Discretize the dimension-reduced splits.
    self.discretizer = Postprocess(self.low_dim_X_train, self.low_dim_X_test,
                                   self.low_dim_Y_train, self.low_dim_Y_test)
    (self.discretized_X_train, self.discretized_X_test,
     self.discretized_Y_train, self.discretized_Y_test) = \
        self.discretizer.improve_data()

    self.buildclf()
    self.buildprocessor()
    # NOTE(review): the handle is kept open (append mode) for the object's
    # lifetime; nothing in this block closes it.
    self.logfile = open("execution_Log", "a")
class FeatureEngineer:
    """Combine preprocessed features, encode labels, pad samples to a fixed
    (99, 13) frame, and produce train/val/test splits."""

    def __init__(self, feature, path, train, test):
        """Create the preprocessor and pull out combined features and frames."""
        preproc = Preprocess(feature, path, train, test)
        self.preprocessor = preproc
        self.train_feat, self.test_feat = preproc.combine_data(preproc.features)
        self.df_train = preproc.df_train
        self.df_test = preproc.df_test

    def convert_labels(self):
        """Map each unique word to an integer and one-hot encode the labels.

        Returns:
            (labels, levels): the binary-encoded label matrix and the
            word -> integer dictionary used to build it.
        """
        unique_words = self.preprocessor.df_train.word.unique()
        levels = dict(zip(unique_words, range(len(self.df_train.word.unique()))))
        indices = np.array([levels[word] for word in self.df_train["word"]],
                           dtype=np.float32)
        labels = to_categorical(indices)
        return labels, levels

    def scale(self):
        """Standard-scale every feature matrix in place (train and test).

        NOTE(review): fit_transform is re-fit per sample, so each matrix is
        scaled by its own statistics — behavior kept as in the original.
        """
        scaler = StandardScaler()
        for idx, feat in enumerate(self.train_feat):
            self.train_feat[idx] = scaler.fit_transform(feat)
        for idx, feat in enumerate(self.test_feat):
            self.test_feat[idx] = scaler.fit_transform(feat)

    def remake_array(self, arr):
        """Zero-pad every sample into a common (99, 13) frame and append a
        trailing channel axis, so all samples share one shape."""
        padded = np.zeros((len(arr), 99, 13))
        for i, sample in enumerate(arr):
            for j, row in enumerate(sample):
                for k, value in enumerate(row):
                    # padded starts at zero, so plain assignment matches the
                    # original accumulate-into-zeros.
                    padded[i, j, k] = value
        return padded[:, :, :, np.newaxis]

    def define_splitting(self, feat, lbl, ratio):
        """Split features/labels with a fixed seed for reproducibility."""
        np.random.seed(37555)
        part_a, part_b, lbl_a, lbl_b = ms.train_test_split(feat, lbl,
                                                           test_size=ratio)
        return part_a, part_b, lbl_a, lbl_b

    def splitting(self):
        """Produce ((X, y) train, val, test) via a 70/15/15 two-stage split."""
        remade_data = self.remake_array(self.train_feat)
        labels, _ = self.convert_labels()
        X_train, X_rest, y_train, y_rest = self.define_splitting(
            remade_data, labels, 0.3)
        X_val, X_test, y_val, y_test = self.define_splitting(X_rest, y_rest, 0.5)
        return (X_train, y_train), (X_val, y_val), (X_test, y_test)
def trainmodel(self):
    """Fit self.c on the full discretized credit dataset and print its score.

    NOTE(review): the model is evaluated on the same data it was fitted on,
    and the value printed as "Precision" is overall (resubstitution)
    accuracy, not precision in the classification-metric sense.
    """
    prep = Preprocess("default of credit card clients.xls")
    # load() presumably populates state consumed by dimension_decrease();
    # its return value is intentionally discarded here — TODO confirm.
    prep.load()
    low_x1, low_x2, low_y1, low_y2 = prep.dimension_decrease()

    postp = Postprocess(low_x1, low_x2, low_y1, low_y2)
    disc_x1, disc_x2, disc_y1, disc_y2 = postp.improve_data()

    # Train and score on the concatenation of both splits.
    x = np.concatenate((disc_x1, disc_x2))
    y = np.concatenate((disc_y1, disc_y2))
    self.c.fit(x, y)
    predictions = self.c.predict(x)

    mislabeled = (y != predictions).sum()
    totaltest = x.shape[0]
    print(
        "Mislabeled points (%s Classification) out of a total %d points : %d"
        % ("SVC", totaltest, mislabeled))
    Precision = 1 - mislabeled / totaltest
    print("Precision of %s is %4.2f%%" % ("SVC", Precision * 100))
def __init__(self, Method=None, data=pd.DataFrame(), orig_data=pd.DataFrame(),
             log=None, test_name="", random_state=42):
    """Constructor. Necessary information from the estimator class is
    provided later in the execution by QST_sim.

    Args:
        Method: Implementation method class instance (e.g. KMeans);
            defaults to a fresh KMeans() per construction.
        data: Raw data input.
        orig_data: Original (untransformed) training data.
        log: Master Log object reference (its test_number is incremented).
        test_name: Name of the test being run.
        random_state: Seed for random number generator.
    """
    # Avoid the mutable-default-argument trap: the original evaluated
    # KMeans() once at def time, so every instance built with the default
    # shared one estimator object (and its fitted state).
    if Method is None:
        Method = KMeans()

    # Take a single timestamp so the recorded date and time cannot
    # disagree when construction straddles midnight.
    now = dt.datetime.now()
    self.execution_date_start = now.date()
    self.execution_time_start = now.time().strftime("%H.%M.%S")

    # Set test name
    self.test_name = test_name

    # Attribute user inputs to object
    self.data = data
    self.Log = log
    self.Log.test_number += 1

    # Initialize a preprocessing object
    self.Preprocess = Preprocess(self)

    # Initialize Segmentation Method Object
    self.SegMethod = SegMethod(self, Method=Method)

    # Set random state to default input
    self.random_state = random_state

    # Visualization folder name
    self.viz_folder_name = self.test_name + "_Visualizations"

    # Set original data
    self.orig_train_data = orig_data

    # Initialize all data storage variables
    self.train_data = None
    self.class_label = None
def __init__(self):
    """Create preprocessing/embedding helpers and load the stop-word list."""
    self.preprocess = Preprocess.Preprocess()
    self.embeddings = Embeddings.Embeddings()

    # Load Spanish stop words, presumably one per line — TODO confirm format.
    stop_words_path = config.data_prefix_path + 'spanish.txt'
    with open(stop_words_path, 'r') as fr:
        self.stop_words = [line.strip() for line in fr.readlines()]
def __init__(self):
    """Model configuration: helper objects plus training hyper-parameters."""
    self.preprocessor = Preprocess.Preprocessor()
    self.embedding = Embeddings()

    # Optimisation settings
    self.lr = 5e-4
    self.batch_size = 128
    self.n_epoch = 10
    self.l2_reg = 0.0004

    # Input dimensions (taken from the helpers)
    self.sentence_length = self.preprocessor.sentence_length
    self.vec_dim = self.embedding.vec_dim
    self.vocab_size = 212237

    # Network architecture
    self.filter_sizes = [2, 3]
    self.num_filters = 64
    self.num_hidden = 100
    self.num_classes = 2
def __init__(self, lang):
    """Model configuration for one language.

    Args:
        lang: language code, either 'es' or 'en'; selects the maximum
            sentence length from the preprocessor.

    Raises:
        ValueError: if lang is neither 'es' nor 'en'.
    """
    self.preprocessor = Preprocess.Preprocess()
    self.embedding = Embeddings()
    self.Feature = Feature.Feature()
    self.Powerfulwords = PowerfulWord.PowerfulWord()
    self.Graph = GraphFeature.GraphFeature()
    self.lang = lang
    if lang == 'es':
        self.sentence_length = self.preprocessor.max_es_length
    elif lang == 'en':
        self.sentence_length = self.preprocessor.max_en_length
    else:
        # Fail fast: previously an unknown language silently left
        # sentence_length unset, deferring the failure to a confusing
        # AttributeError at first use.
        raise ValueError(
            "Unsupported language: %r (expected 'es' or 'en')" % (lang,))
    self.n_folds = 10
    self.num_classes = 2
    self.eclipse = 1e-10
    self.vec_dim = self.embedding.vec_dim
    self.clip_gradients = False
    self.max_grad_norm = 5.
def __init__(self):
    """Model configuration: helper objects plus training hyper-parameters."""
    self.preprocessor = Preprocess.Preprocess()
    self.embedding = Embeddings()
    self.Feature = Feature.Feature()

    # Optimisation settings
    self.lr = 0.0004
    self.keep_prob = 0.5
    self.l2_reg = 0.04
    self.batch_size = 128
    self.n_epoch = 20
    self.eclipse = 1e-10  # small epsilon constant; name kept for compatibility

    # Input dimensions (taken from the helpers)
    self.sentence_length = self.preprocessor.max_length
    self.vec_dim = self.embedding.vec_dim
    self.num_features = 15

    # Network architecture
    self.hidden_dim = 16
    self.num_classes = 2
    self.cosine = True
    self.psize1 = 3
    self.psize2 = 3
def __init__(self, model_type="ABCNN3", clip_gradients=True):
    """Model configuration.

    Args:
        model_type: which model variant to build (default "ABCNN3").
        clip_gradients: whether gradients are clipped to max_grad_norm.
    """
    self.model_type = model_type
    self.preprocessor = Preprocess.Preprocessor()
    self.embedding = Embeddings()

    # Optimisation settings
    self.lr = 0.05
    self.batch_size = 64
    self.n_epoch = 12
    self.clip_gradients = clip_gradients
    self.max_grad_norm = 5.
    self.eclipse = 1e-9  # small epsilon constant; name kept for compatibility

    # Input dimensions (taken from the helpers)
    self.sentence_length = self.preprocessor.sentence_length
    self.vec_dim = self.embedding.vec_dim
    self.vocab_size = 212237

    # Network architecture
    self.w = 4
    self.l2_reg = 0.0004
    self.di = 50  # The number of convolution kernels
    self.num_classes = 2
    self.num_layers = 2
def main_exec(config):
    """
    Main execution line. Dispatch processes according to parameter groups.
    Multiple processes here prevent main process from consuming too much memory.
    """
    # Ensure every working directory exists before dispatching any stage.
    if not os.path.isdir(config.bdir):
        os.mkdir(config.bdir)
    if not os.path.isdir(config.weights_path):
        os.mkdir(config.weights_path)
    if not os.path.isdir(config.model_path):
        os.mkdir(config.model_path)
    if not os.path.isdir(config.cache):
        os.mkdir(config.cache)
    if not os.path.isdir(config.logdir):
        os.mkdir(config.logdir)

    if config.preprocess:
        # Fall back to the module-level img_types when no type was requested.
        if config.img_type is None:
            imgt = img_types
        else:
            imgt = config.img_type
        if config.multiprocess:
            # Run preprocessing in a child process so its memory is
            # reclaimed when the child exits.
            proc = Process(target=Preprocess.preprocess_data,
                           args=(config, imgt))
            proc.start()
            proc.join()
            if proc.exitcode != Exitcodes.ALL_GOOD:
                # NOTE(review): "enhace" typo exists in the original runtime
                # string and is deliberately left untouched here.
                print(
                    "System did not end well. Check logs or enhace verbosity level."
                )
                sys.exit(proc.exitcode)
        else:
            Preprocess.preprocess_data(config, imgt)

    if config.train:
        # Re-checked even though created above — harmless idempotent guard.
        if not os.path.isdir(config.weights_path):
            os.mkdir(config.weights_path)
        if not os.path.isdir(config.model_path):
            os.mkdir(config.model_path)
        if config.multiprocess:
            # Training child uses the 'spawn' start method and is handed the
            # cache locations explicitly.
            ctx = mp.get_context('spawn')
            cache_m = CacheManager()
            proc = ctx.Process(target=GenericTrainer.run_training,
                               args=(config, cache_m.getLocations()))
            proc.start()
            proc.join()
            if proc.exitcode != Exitcodes.ALL_GOOD:
                print(
                    "System did not end well. Check logs or enhace verbosity level."
                )
                sys.exit(proc.exitcode)
        else:
            GenericTrainer.run_training(config, None)

    if config.al:
        if not os.path.isdir(config.weights_path):
            os.mkdir(config.weights_path)
        if not os.path.isdir(config.model_path):
            os.mkdir(config.model_path)
        if config.multiprocess:
            ctx = mp.get_context('spawn')
            cache_m = CacheManager()
            proc = ctx.Process(target=ALTrainer.run_training,
                               args=(config, cache_m.getLocations()))
            proc.start()
            proc.join()
            if proc.exitcode != Exitcodes.ALL_GOOD:
                print(
                    "System did not end well. Check logs or enhace verbosity level."
                )
                sys.exit(proc.exitcode)
        else:
            # In-process path resolves the trainer class dynamically by the
            # configured strategy name from the 'Trainers' module.
            # NOTE(review): import_module's second argument is a package
            # anchor for relative imports — passing config.strategy here has
            # no effect for the absolute name 'Trainers'; verify intent.
            ts = importlib.import_module('Trainers', config.strategy)
            getattr(ts, config.strategy).run_training(config, None)

    if config.pred:
        if config.multiprocess:
            ctx = mp.get_context('spawn')
            cache_m = CacheManager()
            # NOTE(review): unlike the train/al branches this uses the plain
            # Process class, not ctx.Process, so the 'spawn' context obtained
            # above is unused here — confirm whether that is intentional.
            proc = Process(target=Predictions.run_prediction,
                           args=(config, cache_m.getLocations()))
            proc.start()
            proc.join()
            if proc.exitcode != Exitcodes.ALL_GOOD:
                print(
                    "System did not end well. Check logs or enhace verbosity level."
                )
                sys.exit(proc.exitcode)
        else:
            Predictions.run_prediction(config, None)

    if config.postproc:
        # Post-processing stage not implemented yet.
        pass

    if config.runtest:
        if config.tmode == 0:
            pass
        elif config.tmode == 1:
            # Run train test
            TrainTest.run(config)
        elif config.tmode == 2:
            DatasourcesTest.run(config)
        elif config.tmode == 3:
            PredictionTest.run(config)
        elif config.tmode == 4:
            ActiveLearningTest.run(config)

    # No stage selected at all: tell the user what the valid choices are.
    if not (config.preprocess or config.train or config.postproc
            or config.pred or config.runtest):
        print(
            "The problem begins with choice: preprocess, train, postprocess or predict"
        )
# group owed amount into different intervals if -100000 <= temp_owe < 0: self.x_test[row, 6] = -1 elif -500000 <= temp_owe < -100000: self.x_test[row, 6] = -2 elif temp_owe < -500000: self.x_test[row, 6] = -3 elif self.x_test[row, 6] == 0: continue elif 1 <= temp_owe < 100001: self.x_test[row, 6] = 1 elif 10000 <= temp_owe < 500001: self.x_test[row, 6] = 2 else: self.x_test[row, 6] = 3 def improve_data(self): self.set_age() self.set_amount() return self.x_train, self.x_test, self.y_train, self.y_test if __name__ == '__main__': a = Preprocess("default of credit card clients.xls") rx1, rx2, ry1, ry2 = a.load() x1, x2, y1, y2 = a.dimension_decrease() b = Postprocess(x1, x2, y1, y2) xd1, xd2, yd1, yd2 = b.improve_data()
def __init__(self):
    """Set embedding defaults and create the preprocessing helper."""
    self.vec_dim = 300  # embedding vector dimensionality
    self.scale = 0.1    # scaling factor; exact use not visible here — TODO confirm
    self.preprocessor = Preprocess.Preprocess()
def __init__(self):
    """Create the preprocessing and graph-feature helpers.

    NOTE(review): the attribute names "Proprocess" and "GraphFeture" look
    misspelled ("Preprocess"/"GraphFeature"), but they are part of this
    object's public surface, so they are documented rather than renamed.
    """
    self.Proprocess = Preprocess.Preprocess()
    self.GraphFeture = GraphFeature.GraphFeature()
import pandas as pd import sys from keras.models import model_from_json sys.path.append('./code/DataCleaning') sys.path.append('./code/Models') sys.path.append('./code/Evaluation') from Preprocessing import Preprocess from DataManipulation import Manipulations_Selector import Train import DataSeperator data = pd.read_csv( "/home/sultan/Desktop/bitbuket/Text-Classification-master/data/GT.csv") x = data["comment_text"] x = x.values.tolist() x = Preprocess(x) available_text_manipulation = sys.argv[1] available_class = sys.argv[2] available_model = sys.argv[3] Embed = False reshape = True if available_text_manipulation == "Embedding": Embed = True reshape = False if available_text_manipulation == "WORD2VEC": reshape = False if available_text_manipulation == "WORD2VEC_pre":
def __init__(self):
    """Create the preprocessing helper used by this object."""
    self.preprocess = Preprocess.Preprocess()
def __init__(self, feature, path, train, test):
    """Build the preprocessor and extract combined features and data frames.

    Args:
        feature: feature specification forwarded to Preprocess.
        path: data path forwarded to Preprocess.
        train: training-set specification forwarded to Preprocess.
        test: test-set specification forwarded to Preprocess.
    """
    preproc = Preprocess(feature, path, train, test)
    self.preprocessor = preproc
    self.train_feat, self.test_feat = preproc.combine_data(preproc.features)
    self.df_train = preproc.df_train
    self.df_test = preproc.df_test
def __init__(self):
    """Create the preprocessing and feature-extraction helpers."""
    self.preprocess = Preprocess.Preprocess()
    self.Feature = Feature.Feature()
def __init__(self):
    """Create preprocessing, feature, powerful-word and graph-feature helpers."""
    self.preprocessor = Preprocess.Preprocessor()
    self.Feature = Feature.Feature()
    self.Powerfulwords = PowerfulWord.PowerfulWord()
    self.Graph = GraphFeature.GraphFeature()