Example #1
def get_result_from_data(data_dir, result_dir, dp_dir):
    """
    Get result from data
    :param data_dir: the pathname of the data directory
    :param result_dir: the pathname of the result directory
    :param dp_dir: the pathname of the DataPreprocessing module directory
    :return:
    """

    # Add the DataPreprocessing module directory to the Python path
    sys.path.append(dp_dir)

    # Import the DataPreprocessing module
    import DataPreprocessing
    # Get the DataPreprocessing object
    dp = DataPreprocessing.DataPreprocessing(data_dir)

    # Match each data file with its names file
    data_names = dp.match_data_names()

    # Run the pipeline (data preprocessing, training, testing, and evaluation of the ALA classifier) for each data set in parallel
    # n_jobs=10 uses 10 worker processes (n_jobs=-1 would use all CPUs)
    # backend="multiprocessing" (the default here) keeps the parent process and the workers from sharing memory
    Parallel(n_jobs=10)(
        delayed(pipeline)(dp, data_file, names_file, result_dir)
        for data_file, names_file in data_names)
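This snippet assumes that sys is imported and that Parallel and delayed (presumably from joblib) plus a pipeline function are defined elsewhere in the same file. A minimal, hypothetical call with placeholder paths might look like this:

import sys
from joblib import Parallel, delayed  # assumed source of Parallel/delayed

# Placeholder directory paths, not taken from the original project
get_result_from_data(data_dir='data/',
                     result_dir='results/',
                     dp_dir='code/DataPreprocessing/')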
Example #2
    def getData(self, indexs, hasHeader, needHandleNegativeOneIndex, flag):
        data = []
        columns = defaultdict(list)  # each value in each column is appended to a list

        with open(self.fileName, encoding='Latin-1') as f:
            reader = csv.reader(f, delimiter=",", quoting=csv.QUOTE_NONE)  # each row is read as a list of strings
            if hasHeader == 1:  # skip the two header rows
                next(reader)
                next(reader)
            for row in reader:
                for (i, v) in enumerate(row):
                    columns[i].append(v)
        for j in indexs:
            newColumns = columns[j]
            if j in needHandleNegativeOneIndex:

                newColumns = DataPreprocessing.DataPreprocessing().handleNegativeOneV2([float(i) for i in newColumns], flag = False)
            data.append(newColumns)
        # Truncate every column to the length of the shortest one so all columns stay aligned
        minLength = min(len(column) for column in data)
        for i in range(0, len(data)):
            data[i] = data[i][0:minLength]
        return data
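Example #8 below calls this method on a CsvReader instance; a minimal call, with a placeholder file name, looks like:

# The CSV path is a placeholder; see Example #8 for the original usage
reader = CsvReader.CsvReader('gaze_data.csv')
columns = reader.getData([0, 2, 3],
                         hasHeader=1,
                         needHandleNegativeOneIndex=[2, 3],
                         flag=True)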
Example #3
    def train(self, trainDF, validateDF):
        print("+++++++++++++++++++++Training model...")
        print("Remove non trainable features...")
        self.xTrain = trainDF
        self.yTrain = trainDF[self.yColDiscrete]
        if ('relevance_int' in self.xTrain):
            self.xTrain = self.xTrain.drop('relevance_int', axis=1)

        print("OneHot encoding")
        self.xTrain = pd.get_dummies(self.xTrain, sparse=True)
        self.xTrain = scipy.sparse.csc_matrix(self.xTrain)

        fm = SGDFMClassification(n_iter=1000,
                                 rank=16,
                                 l2_reg_w=0.0005,
                                 l2_reg_V=0.0005,
                                 l2_reg=0.0005,
                                 step_size=0.01)
        self._model = OneVsRestClassifier(fm)

        self.fittedModel = self._model.fit(self.xTrain, self.yTrain)
        self.yPred = self.fittedModel.predict(self.xTrain)

        print("Converting to old labels")
        dp = DataPreprocessing.DataPreprocessing()
        self.yTrain = dp.transformNewLabelToOld(self.yTrain.values)  # .values replaces the deprecated .as_matrix()
        self.yPred = dp.transformNewLabelToOld(self.yPred)
        print("self.yTrain:", self.yTrain.shape, self.yTrain[1:50, ])
        print("self.yPred:", self.yPred.shape, self.yPred[1:100, ])

        print("MSE:", mean_squared_error(self.yTrain, self.yPred))
        print("RMSE:", sqrt(mean_squared_error(self.yTrain, self.yPred)))
        print("+++++++++++++++++++++Training completed")
Example #4
    def train(self, trainDF, validateDF):
        self._model = LogisticRegression(penalty='l2',
                                         dual=False,
                                         tol=0.0001,
                                         C=1.0,
                                         fit_intercept=True,
                                         intercept_scaling=1,
                                         class_weight=None,
                                         random_state=None,
                                         solver='sag',
                                         max_iter=10000,
                                         multi_class='multinomial',
                                         verbose=1,
                                         warm_start=False,
                                         n_jobs=-1)
        print("+++++++++++++++++++++Training model...")

        print("Remove non trainable features...")
        self.xTrain = trainDF
        self.yTrain = trainDF[self.yColDiscrete]
        # self.xValidate=validateDF
        # self.yValidate=validateDF[self.yColDiscrete]

        # self.xTrain.drop('search_term', axis=1, inplace=True)
        # self.xTrain.drop('relevance', axis=1, inplace=True)
        if ('relevance_int' in self.xTrain):
            self.xTrain = self.xTrain.drop('relevance_int', axis=1)
        # self.xTrain.drop('product_idx', axis=1, inplace=True)
        # self.xTrain.drop('Word2VecQueryExpansion', axis=1, inplace=True)

        # self.xValidate.drop('search_term', axis=1, inplace=True)
        # self.xValidate.drop('relevance', axis=1, inplace=True)
        # self.xValidate.drop('relevance_int', axis=1, inplace=True)
        # self.xValidate.drop('product_idx', axis=1, inplace=True)
        # self.xValidate.drop('Word2VecQueryExpansion', axis=1, inplace=True)

        print("+++++++++++++++++++++Training in progress")
        # print("self.xTrain:",list(self.xTrain))
        # print("self.yTrain:", list(self.yTrain))
        fittedModel = self._model.fit(self.xTrain, self.yTrain)
        self.yPred = fittedModel.predict(self.xTrain)
        # print("self.yPred:", list(self.yPred))

        print("Converting to old labels")
        dp = DataPreprocessing.DataPreprocessing()
        self.yTrain = dp.transformNewLabelToOld(self.yTrain.values)  # .values replaces the deprecated .as_matrix()
        self.yPred = dp.transformNewLabelToOld(self.yPred)
        print("self.yTrain:", self.yTrain.shape, self.yTrain[1:50, ])
        print("self.yPred:", self.yPred.shape, self.yPred[1:50, ])

        print("MSE:", mean_squared_error(self.yTrain, self.yPred))
        print("RMSE:", sqrt(mean_squared_error(self.yTrain, self.yPred)))
        # print("Accuracy:", accuracy_score(self.yTrain, self.yPred))
        # print("Precision:", precision_score(self.yTrain, self.yPred, average='micro'))
        # print("Recall:", recall_score(self.yTrain, self.yPred, average='micro'))
        # print("F1:", f1_score(self.yTrain, self.yPred, average='micro'))
        print("+++++++++++++++++++++Training completed")
Example #5
def main():
    DP = DataPreprocessing()
    UnderExposedImages, OverExposedImages = DP.ConstructDataset()
    UnderExposedYCbCrImages = [[], [], []]
    OverExposedYCbCrImages = [[], [], []]
    with tf.Session() as sess:
        for img in UnderExposedImages:
            Image_Array = img.eval()
            #Image_Array = Image_Array / 255 # normalization
            YImage, CbImage, CrImage = rgb_to_ycbcr(Image_Array)

            UnderExposedYCbCrImages[0].append(YImage)
            UnderExposedYCbCrImages[1].append(CbImage)
            UnderExposedYCbCrImages[2].append(CrImage)
            print("test")
        # Image.fromarray(np.asarray(OverExposedImages[0].eval())).show()
    return
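The rgb_to_ycbcr helper is not shown in this snippet. A minimal sketch of such a helper, assuming it takes an H x W x 3 RGB array with 0-255 values and returns the three channels separately, could use the BT.601 full-range conversion:

def rgb_to_ycbcr(rgb):
    # BT.601 full-range conversion; rgb is an H x W x 3 array of 0-255 values
    r, g, b = rgb[..., 0], rgb[..., 1], rgb[..., 2]
    y = 0.299 * r + 0.587 * g + 0.114 * b
    cb = -0.168736 * r - 0.331264 * g + 0.5 * b + 128.0
    cr = 0.5 * r - 0.418688 * g - 0.081312 * b + 128.0
    return y, cb, cr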
Example #6
def pipeline_all_datasets():
    """
    The pipeline for all data sets
    :return:
    """

    # Add the DataPreprocessing module directory to the Python path
    sys.path.append(dp_dir)

    # Import DataPreprocessing module
    import DataPreprocessing
    dp = DataPreprocessing.DataPreprocessing(data_dir)

    # Match data files with names file
    data_names = dp.match_data_names()

    # Run the pipeline for each data set via joblib (n_jobs=1 below processes the data sets one at a time)
    # backend="multiprocessing" (the default here) keeps the parent process and the workers from sharing memory
    Parallel(n_jobs=1)(
        delayed(pipeline_one_dataset)(dp, data_files, names_file)
        for data_files, names_file in data_names)
Example #7
    4. Taekwondo
    5. MLB 1870-2016



"""

import numpy as np
import math
import time

import DataPreprocessing
import EnsembleClassifiers
import ModelValidation

DP = DataPreprocessing.DataPreprocessing()
EC = EnsembleClassifiers.EnsembleClassifiers()
MV = ModelValidation.ModelValidation()

#Special Cases Pre-processing

#DP.merge_taekwondo_datasets();
#DP.preprocess_sms_dataset();


def dataset_learning(dataset, output_file, dataset_name, preprocess_time, NT,
                     F, parameters):

    txtfile = open('./Learning_Results/' + output_file + '.txt', 'wb')
    txtfile.write("\n::::::::::::::::::::::::::::")
    txtfile.write("\nRandom Forest Classification")
Example #8
    csv2 = CsvReader.CsvReader(glob(dir + 'window*.csv')[0])
    # extract gaze data
    data_gaze = csv1.getData([0, 2, 3],
                             hasHeader=1,
                             needHandleNegativeOneIndex=[2, 3],
                             flag=True)
    # Truncate all three gaze columns to the same (shorter) length when x and y differ
    if len(data_gaze[1]) != len(data_gaze[2]):
        minLen = min(len(data_gaze[1]), len(data_gaze[2]))
        data_gaze[0] = data_gaze[0][0:minLen]
        data_gaze[1] = data_gaze[1][0:minLen]
        data_gaze[2] = data_gaze[2][0:minLen]
    # assign outside fixations to (-1, -1)
    data_gaze[1], data_gaze[2] = DataPreprocessing.DataPreprocessing(
    ).correspondingXAndY(data_gaze[1], data_gaze[2])
    # read video csv
    data_video = csv2.getData([0, 1],
                              hasHeader=0,
                              needHandleNegativeOneIndex=[],
                              flag=False)
    # get absolute time for both video and gaze
    at_gaze = [Time.Time(i) for i in data_gaze[0]]
    at_video = [Time.Time(i) for i in data_video[1]]

    # interpolation to video
    start_v, end_v, data_gaze_videoTimeDomain = ConvertTimeDomainToVideo.ConvertTimeDomainToVideo(
    ).convertTimeDomain2Video(time_other=at_gaze,
                              time_vidoe=at_video,
                              collections_data=data_gaze[1:len(data_gaze)])
    if end_v == -1:
Example #9
# -*- coding: cp1251 -*-
import codecs
import os
import re

# import pandas as pd
# import sys
import null
import numpy as np
from requests.packages import chardet

# reload(sys)
import DataPreprocessing as dp
import CreateDictionary as cd
import DataProcessing as dproc
a1=dp.DataPreprocessing()
a2 = cd.CreateDictionary()
a3 = dproc.DataProcessing()
# f='D:\scientific work\InformationExtracting\processeddataex\qword-20151019144240604.xml'
# # # print a.find_first_occurrences_of_header(f,u'ТАЛОН НА ОКАЗАНИЕ ВМП ИЗ ПАК')
# # # print a.find_last_occurrence_of_header(f,u'ТАЛОН НА ОКАЗАНИЕ ВМП ИЗ ПАК')
# text =  a1.find_all_occurrences_of_header(f,u'ЭПИДАНАМНЕЗ')
# # # print text
# sent = a2.sentences_list(text)
# colloc = a2.collocations_list(sent)
# words = a2.words_list(colloc)
# # print sent
# # print colloc


# frequency lists for stationary diagnosis
Example #10
File: main.py  Project: winfish/lstm_on_uwb
p.add_argument('--hidden_size', type=int, default=2)  # RNN output size
p.add_argument('--input_size', type=int, default=4)  # RNN input size: number of UWB inputs
p.add_argument('--sequence_length', type=int, default=5)  # number of LSTM unrolling steps
p.add_argument('--output_size', type=int, default=2)  # final output size (RNN or softmax, etc.)
#FOR TEST
p.add_argument('--load_model_dir', type=str, default="model/RiTA_wo_fcn/stacked_bi_epoch_3000/model_0_00006-17700")
p.add_argument('--test_data', type=str, default='train_data_3D_zigzag.csv')
p.add_argument('--output_results', type=str, default= 'results/RiTA/stack_lstm_epoch3000_17700.csv')
###########
p.add_argument('--mode', type=str, default = "train") #train or test
args = p.parse_args()




data_parser = DataPreprocessing.DataPreprocessing(args.train_data, args.sequence_length)
data_parser.fitDataForMinMaxScaler()

X_data,Y_data = data_parser.set_data()
# data : size of data - sequence length + 1

tf.reset_default_graph()

LSTM = LSTM(args) #batch_size, dic_size, sequence_length, hidden_size, num_classes)
print(X_data.shape) #Data size / sequence length / uwb num


#terms for learning rate decay
global_step = tf.Variable(0, trainable=False)
iter = int(len(X_data)/args.batch_size)
num_total_steps = args.epoches*iter
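The snippet sets up global_step and num_total_steps for learning-rate decay but is cut off before the schedule itself. A minimal TF1-style sketch is shown below; the base rate, decay factor, and the choice of exponential decay with an Adam optimizer are assumptions, not values taken from the project:

# Hypothetical schedule: the 0.001 base rate and 0.96 decay factor are placeholders
learning_rate = tf.train.exponential_decay(0.001,
                                           global_step,
                                           decay_steps=num_total_steps // 10,
                                           decay_rate=0.96,
                                           staircase=True)
optimizer = tf.train.AdamOptimizer(learning_rate)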
Example #11
    def getQueryProductAttributeDataFrame(self,
                                          train_filename,
                                          test_filename,
                                          attribute_filename,
                                          description_filename,
                                          header=0):
        '''
        Takes in the HomeDepot CSV files and processes them into the following dataframes:
        1. train_query_df/test_query_df
            - id
            - product_uid
            - search_term
            - relevance
        2. product_df
            - product_uid
            - product_title
            - product_description

        3. attribute_df
            - product_uid
            - name
            - value

        :param train_filename:
        :param test_filename:
        :param attribute_filename:
        :param description_filename:
        :param header:
        :return: [train_query_df, product_df, attribute_df, test_query_df]
        '''
        train_query_df = pd.read_csv(train_filename,
                                     delimiter=',',
                                     low_memory=False,
                                     header=header,
                                     encoding="ISO-8859-1")
        test_query_df = pd.read_csv(test_filename,
                                    delimiter=',',
                                    low_memory=False,
                                    header=header,
                                    encoding="ISO-8859-1")
        attribute_df = pd.read_csv(
            attribute_filename, delimiter=',', low_memory=False, header=header
        )  # encoding="ISO-8859-1" is usually not applied to the attribute and description files
        description_df = pd.read_csv(description_filename,
                                     delimiter=',',
                                     low_memory=False,
                                     header=header)  #, encoding="ISO-8859-1")

        all_df = pd.concat((train_query_df, test_query_df),
                           axis=0,
                           ignore_index=True)

        train_query_df = train_query_df.drop('product_title', axis=1)
        test_query_df = test_query_df.drop('product_title', axis=1)

        product_df = pd.DataFrame()
        product_df = all_df.drop_duplicates(['product_uid'])
        product_df = product_df.drop('relevance', axis=1)
        product_df = product_df.drop('id', axis=1)
        product_df = product_df.drop('search_term', axis=1)
        product_df = pd.merge(product_df,
                              description_df,
                              how='left',
                              on='product_uid')

        dp = DataPreprocessing.DataPreprocessing()
        train_query_df = dp.transformLabels(trainDF=train_query_df,
                                            newColName='relevance_int')

        # Add an index column to train and test that points at the matching product row in product_df
        train_query_df['product_idx'] = [
            product_df[product_df['product_uid'] == uid].index
            for uid in train_query_df.product_uid
        ]
        test_query_df['product_idx'] = [
            product_df[product_df['product_uid'] == uid].index
            for uid in test_query_df.product_uid
        ]

        # print("all: ", len(all_df.product_uid))
        # print("all unique: ", len(all_df.product_uid.unique()))
        # print("description_df: ", len(description_df))
        # print("description_df unique: ", len(description_df.product_uid.unique()))
        # print("product_df: ", len(product_df))
        # print("product_df info", product_df.info())
        # print("train_query_df.info: ", train_query_df.info())

        return [train_query_df, product_df, attribute_df, test_query_df]
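A hypothetical call, assuming reader is an instance of the class that defines this method and using the usual Home Depot file names as placeholders:

# File names are placeholders for the Home Depot Kaggle data set
train_query_df, product_df, attribute_df, test_query_df = \
    reader.getQueryProductAttributeDataFrame('train.csv',
                                              'test.csv',
                                              'attributes.csv',
                                              'product_descriptions.csv',
                                              header=0)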
Example #12
    def __init__(self, id, host, port, gui=False):
        self.sensorId = id
        self.socket = Socket.MySocket(host=host, port=port)
        self.frameSize = 0.300
        self.dataPre = dataPre.DataPreprocessing()
        self.model = model.Model(gui)