class Predictor_cat5():

    def __init__(self):
        self.traits = ['OPN', 'CON', 'EXT', 'AGR', 'NEU']
        self.categories = [
            'OPENNESS', 'CONSCIENTIOUSNESS', 'EXTRAVERSION', 'AGREEABLENESS',
            'NEUROTICISM'
        ]
        self.Pre_cat = {
            trait: cat
            for (trait, cat) in zip(self.traits, self.categories)
        }
        self.models = {
            trait: pickle.load(
                open(os.getcwd() + '/model/' + trait + '_model.pkl', 'rb'))
            for trait in self.traits
        }
        self.dp = DataPrep()

    def predict(self, X, traits='All', predictions='All'):
        predictions = {}
        self.dp.transform(X)
        if traits == 'All':
            for trait in self.traits:
                pkl_model = self.models[trait]
                # trait_categories = pkl_model.predict(X, regression=False)
                # predictions[self.Pre_cat[trait]+' '] = str(trait_categories[0])
                # trait_scores = pkl_model.predict(X, regression=True).reshape(1, -1)
                # predictions[self.Pre_cat[trait]+' '] = predictions[self.Pre_cat[trait]+' ']+' '+str(round(trait_scores.flatten()[0]*10))+' % '
                trait_categories_probs = pkl_model.predict_proba(X)
                predictions[self.Pre_cat[trait] + ' '] = str(
                    trait_categories_probs[:, 1][0] * 100)
        return predictions
def __init__(self):
    self.traits = ['OPN', 'CON', 'EXT', 'AGR', 'NEU']
    self.categories = [
        'OPENNESS', 'CONSCIENTIOUSNESS', 'EXTRAVERSION', 'AGREEABLENESS',
        'NEUROTICISM'
    ]
    self.Pre_cat = {
        trait: cat
        for (trait, cat) in zip(self.traits, self.categories)
    }
    self.models = {
        trait: pickle.load(open('static/' + trait + '_model.pkl', 'rb'))
        for trait in self.traits
    }
    self.dp = DataPrep()
def run_data_prep(event, context):
    # def run_data_prep():
    #     inp = '{"reference": "4990012", "period": "201211", "survey": "066", "instance": "instanceId"}'
    #     event = json.loads(inp)
    print(event)
    dataprep = DataPrep(event)
    records = dataprep.get_qcode_resp_from_db()
    dataprep.construct_response(records)
    dataprep.construct_metadata()
    print("Attempting to invoke Wrangler Lambda with the json string: " + str(event))
    dataprep.send_data_to_wrangler()


# run_data_prep()
class Predictor_cat5():

    def __init__(self):
        """Load all regression and classification models of the cat5 set (model no. 1)."""
        self.traits = ['OPN', 'CON', 'EXT', 'AGR', 'NEU']
        self.categories = [
            'OPENNESS', 'CONSCIENTIOUSNESS', 'EXTRAVERSION', 'AGREEABLENESS',
            'NEUROTICISM'
        ]
        self.Pre_cat = {
            trait: cat
            for (trait, cat) in zip(self.traits, self.categories)
        }
        self.models = {
            trait: pickle.load(open('static/' + trait + '_model.pkl', 'rb'))
            for trait in self.traits
        }
        self.dp = DataPrep()

    def predict(self, X, traits='All', predictions='All'):
        """Take features and return predictions.

        Transforms the text into a vector and predicts class probabilities on it.
        """
        predictions = {}
        self.dp.transform(X)
        if traits == 'All':
            for trait in self.traits:
                pkl_model = self.models[trait]
                # trait_categories = pkl_model.predict(X, regression=False)
                # predictions[self.Pre_cat[trait]+' '] = str(trait_categories[0])
                # trait_scores = pkl_model.predict(X, regression=True).reshape(1, -1)
                # predictions[self.Pre_cat[trait]+' '] = predictions[self.Pre_cat[trait]+' ']+' '+str(round(trait_scores.flatten()[0]*10))+' % '
                trait_categories_probs = pkl_model.predict_proba(X)
                predictions[self.Pre_cat[trait] + ' '] = trait_categories_probs[:, 1][0] * 100
        return predictions
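# Usage sketch (an assumption for illustration, not part of the original module):
# how Predictor_cat5 might be called on raw status text. 'sample_text' is a made-up
# example; DataPrep, the pickled models, and the pickle/os imports are assumed to be
# available as in the class above.
if __name__ == '__main__':
    predictor = Predictor_cat5()
    sample_text = ['I love trying new things and meeting new people.']
    scores = predictor.predict(sample_text, traits='All')
    for category, probability in scores.items():
        print(category.strip(), probability)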
def load(self, data_path, data_regex, shards, prep_path, sparkContext):
    """
    For each section in the initializer, iterate through all files under that
    section directory and load the content of each individual file into the
    class instance.

    This method should be called after the section regex has been initialized
    and before any get_data method is called.
    """
    logger.info("Loading data...")
    self.dataPrep = DataPrep(dataURI=data_path,
                             dataRegex=data_regex,
                             shardNum=shards,
                             targetPath=prep_path,
                             sparkContext=sparkContext)

    # Load data
    if self.hadoop is True:
        self.dataPrep.loadHadoop()
    else:
        self.dataPrep.loadLocal()

    # Add data to data_list
    # If using yarn mode, local data will not be loaded
    if self.hadoop is False:
        for dirName, subdirList, fileList in os.walk(self.dataPrep.localPath()):
            for file_name in fileList:
                file_path = "%s/%s" % (str(dirName), str(file_name))
                self.data_list += self.data_format.get_data_from_file(file_path)
    else:
        aRdd = sparkContext.textFile(self.dataPrep.hadoopPath()).cache()
        tmp = aRdd.collect()
        tmpStr = ''.join(str(e) + "\n" for e in tmp)
        self.load_stringtext(textString=tmpStr)

    logger.info("Data loaded")
    return
neurons = [10, 10]
n_outputs = 1
hidden_act = act.Sigmoid()
output_act = act.Identity()

# create data using the Franke function
seed = 2034
np.random.seed(seed)
x = np.sort(np.random.uniform(0, 1, n))
y = np.sort(np.random.uniform(0, 1, n))
x, y = np.meshgrid(x, y)
z = np.ravel(f.FrankeFunction(x, y) + 0.1*np.random.randn(x.shape[0], x.shape[1]))
z = z.reshape(-1, 1)

# set up the design matrix
data = DataPrep()
X = data.design_matrix(x, y, degree=1)[:, 1:]

# split data in train and test and scale it
X_train, X_test, z_train, z_test = data.train_test_scale(X, z)

# set up the neural network
network = NeuralNetwork(X_train.shape[1], neurons, n_outputs, cost.MSE())
network.create_layers(hidden_act, output_act, seed)

# train the network
batch_size = len(X_train)//n_batches
index_array = np.arange(len(X_train))
for k in range(n_epochs):
    np.random.shuffle(index_array)
    X_minibatches = np.split(X_train[index_array], n_batches)
        return self.rfc.predict(X)

    def predict_proba(self, X, regression=False):
        X = self.tfidf.transform(X)
        if regression:
            raise ValueError('Cannot predict probabilities of a regression!')
        else:
            return self.rfc.predict_proba(X)


if __name__ == '__main__':
    traits = ['OPN', 'CON', 'EXT', 'AGR', 'NEU']
    model = Model()
    for trait in traits:
        dp = DataPrep()
        X_regression, y_regression = dp.prep_data('status', trait,
                                                  regression=True,
                                                  model_comparison=False)
        X_categorical, y_categorical = dp.prep_data('status', trait,
                                                    regression=False,
                                                    model_comparison=False)
        print('Fitting trait ' + trait + ' regression model...')
        model.fit(X_regression, y_regression, regression=True)
        print('Done!')
        print('Fitting trait ' + trait + ' categorical model...')
        model.fit(X_categorical, y_categorical, regression=False)
        print('Done!')
        with open('static/' + trait + '_model.pkl', 'wb') as f:
            # Write the fitted model to a file.
            pickle.dump(model, f)
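# Usage sketch (an assumption, not part of the original script): loading one of the
# pickled models written above and scoring new text through the probability
# interface shown in predict_proba(). The trait name and example status are
# illustrative placeholders.
with open('static/OPN_model.pkl', 'rb') as f:
    opn_model = pickle.load(f)

new_status = ['Spent the whole weekend reading about astronomy.']
probs = opn_model.predict_proba(new_status, regression=False)
print('P(high OPN) = %.3f' % probs[:, 1][0])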
class DataPool():
    """
    Data object that holds all sentences (dependency trees) and provides an
    interface for loading data from the disk and retrieving them using an index.

    Data are classified into sections when stored on disk, but we do not
    preserve such structural information: all sentences are loaded and
    "flattened" into a single list.

    The instance maintains a current_index variable, which is used to locate the
    last sentence object we have read. Calling get_next() increases it by 1, and
    calling has_next() tests this index against the total number of sentences.
    The value of the index persists across get_next() and has_next() calls, and
    is only reset to its initial value of -1 when reset() is called (manually or
    during init).
    """

    def __init__(self, fgen, data_format,
                 data_regex=None,
                 data_path=None,
                 textString=None,
                 prep_path='data/prep/',
                 shards=1,
                 sparkContext=None,
                 hadoop=False):
        """
        Initialize the data set.

        :param data_regex: the sections to be used; a regular expression that
            indicates which sections to use, e.g. (0[0-9])|(1[0-9])|(2[0-1])/.*tab
        :type data_regex: str

        :param data_path: the relative or absolute path to the 'penn-wsj-deps'
            folder (including "penn-wsj-deps")
        :type data_path: str

        :param data_format: the file that describes the file format for the type
            of data
        :type data_format: str
        """
        if isinstance(fgen, basestring):
            self.fgen = importlib.import_module('feature.' + fgen).FeatureGenerator
        else:
            self.fgen = fgen

        if isinstance(data_format, basestring):
            self.data_format = importlib.import_module(
                'data.data_format.' + data_format).DataFormat(self.fgen)
        else:
            self.data_format = data_format

        self.hadoop = hadoop
        self.reset_all()

        if textString is not None:
            self.load_stringtext(textString)

        if data_regex is not None:
            self.load(data_path=data_path,
                      data_regex=data_regex,
                      shards=shards,
                      prep_path=prep_path,
                      sparkContext=sparkContext)
        return

    def load(self, data_path, data_regex, shards, prep_path, sparkContext):
        """
        For each section in the initializer, iterate through all files under that
        section directory and load the content of each individual file into the
        class instance.

        This method should be called after the section regex has been initialized
        and before any get_data method is called.
        """
        logger.info("Loading data...")
        self.dataPrep = DataPrep(dataURI=data_path,
                                 dataRegex=data_regex,
                                 shardNum=shards,
                                 targetPath=prep_path,
                                 sparkContext=sparkContext)

        # Load data
        if self.hadoop is True:
            self.dataPrep.loadHadoop()
        else:
            self.dataPrep.loadLocal()

        # Add data to data_list
        # If using yarn mode, local data will not be loaded
        if self.hadoop is False:
            for dirName, subdirList, fileList in os.walk(self.dataPrep.localPath()):
                for file_name in fileList:
                    file_path = "%s/%s" % (str(dirName), str(file_name))
                    self.data_list += self.data_format.get_data_from_file(file_path)
        else:
            aRdd = sparkContext.textFile(self.dataPrep.hadoopPath()).cache()
            tmp = aRdd.collect()
            tmpStr = ''.join(str(e) + "\n" for e in tmp)
            self.load_stringtext(textString=tmpStr)

        logger.info("Data loaded")
        return

    def load_stringtext(self, textString):
        self.data_list += self.data_format.load_stringtext(textString)
        return

    def loadedPath(self):
        if self.dataPrep:
            if self.hadoop is True:
                return self.dataPrep.hadoopPath()
            else:
                return self.dataPrep.localPath()
        else:
            raise RuntimeError("DATAPOOL [ERROR]: Data has not been loaded by DataPrep, "
                               "cannot retrieve data path.")
        return

    def __add__(self, another_data_pool):
        if another_data_pool is None:
            return deepcopy(self)
        # if self.fgen != another_data_pool.fgen:
        #     raise RuntimeError("DATAPOOL [ERROR]: Merging dataPools do not have the same fgen")
        # if self.data_format != another_data_pool.data_format:
        #     raise RuntimeError("DATAPOOL [ERROR]: Merging dataPools do not have the same format")
        newDataPool = deepcopy(self)
        newDataPool.data_list = newDataPool.data_list + another_data_pool.data_list
        newDataPool.reset_index()
        return newDataPool

    def export(self, fileURI, sparkContext=None):
        self.data_format.export_to_file(self, fileURI, sparkContext)
        return

    def reset_all(self):
        """
        Reset the index variables and the data list.

        Restores the instance to a state in which no sentence has been read.
        """
        self.reset_index()
        self.data_list = []
        return

    def reset_index(self):
        """
        Reset the index variable to the very beginning of the sentence list.
        """
        self.current_index = -1

    def has_next_data(self):
        """
        Return True if there are still sentences that have not been read.

        This call does not advance the data pointer; get_next_data() does that.

        :return: False if we have reached the end of data_list, True otherwise
        """
        i = self.current_index + 1
        if i >= 0 and i < len(self.data_list):
            return True
        else:
            return False

    def get_next_data(self):
        """
        Return the next sentence object, which was previously read from disk files.

        This method does not perform index checking, so please make sure the
        internal index is valid by calling has_next_data() first, or an exception
        will be raised (which is definitely not what you want).
        """
        if self.has_next_data():
            self.current_index += 1
            # Log how many entries we have supplied
            if self.current_index % 1000 == 0:
                logger.debug("Data finishing %.2f%% ..." %
                             (100 * self.current_index / len(self.data_list), ))
            return self.data_list[self.current_index]
        raise IndexError("Run out of data while calling get_next_data()")

    def get_sent_num(self):
        return len(self.data_list)
no_hidden = False
seed = 2034

# load the scikit-learn digits dataset (8x8 MNIST-style images)
digits = datasets.load_digits()

# define input data and labels
dataset = digits.images
labels = digits.target.reshape(-1, 1)

# flatten the images
N = len(dataset)
dataset = dataset.reshape(N, -1)

# transform labels to one-hot vectors and split into train and test
data = DataPrep()
accuracy = cost.Accuracy()
one_hot = data.create_one_hot(N, labels)
X_train, X_test, z_train, z_test = data.train_test_split(dataset, one_hot)
z_test = np.argmax(z_test, axis=1)
batch_size = len(X_train) // n_batches

# set up the neural network
network = NeuralNetwork(X_train.shape[1], neurons, n_outputs, cost_func)

array_lambda = [0, 1e-4, 1e-3, 1e-2, 1e-1, 0.9]
array_eta = [0.001, 0.01, 0.1, 0.2, 0.5, 0.7, 1]
accuracy_heatmap = np.zeros((len(array_lambda), len(array_eta)))
index_array = np.arange(len(X_train))
class TabularDataset:
    def __init__(self, data, targets, model_type='classification'):
        self.data = data
        self.targets = targets
        self.model_type = model_type

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        current_sample = self.data[idx, :]
        current_target = self.targets[idx]
        return {
            "x": torch.tensor(current_sample, dtype=torch.long),
            "y": torch.tensor(current_target, dtype=torch.float)
            if self.model_type == 'regression'
            else torch.tensor(current_target, dtype=torch.long)
        }


if __name__ == '__main__':
    dt = pd.DataFrame({
        'category': ['a', 'b', 'c', 'a', 'a', 'c', 'd', 'e', 'c'],
        'class': ['I', 'IV', 'V', None, 'I', 'V', None, 'VII', 'V'],
        'targets': [0, 0, 0, 1, 1, 0, 1, 0, 1]
    })
    data_treat = DataPrep(data=dt, categorical_var_list=['category', 'class'])
    clean_data = data_treat.run_preprocessing(treat_na=True, label_encode=True)
    dataset = TabularDataset(data=clean_data[['category', 'class']].values,
                             targets=clean_data['targets'])
    print(dataset[8])
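# Follow-up sketch (an assumption, not part of the original snippet): because
# TabularDataset returns a dict of tensors per row, it can be batched with a
# standard torch DataLoader. The batch size here is an arbitrary illustrative choice.
from torch.utils.data import DataLoader

loader = DataLoader(dataset, batch_size=4, shuffle=True)
for batch in loader:
    # default collation stacks the per-row tensors:
    # batch["x"] -> shape (batch_size, n_features), batch["y"] -> shape (batch_size,)
    print(batch["x"].shape, batch["y"].shape)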
        if regression:
            return self.rfr.predict(X)
        else:
            return self.rfc.predict(X)

    def predict_prob(self, X, regression=False):
        X = self.tfidf.transform(X)
        if regression:
            raise ValueError('Cannot predict probabilities of a regression!')
        else:
            return self.rfc.predict_proba(X)


if __name__ == '__main__':
    traits = ['OPN', 'CON', 'EXT', 'AGR', 'NEU']
    model = Model()
    for trait in traits:
        dp = DataPrep()
        X_regression, y_regression = dp.prep_data(trait, regression=True)
        X_categorical, y_categorical = dp.prep_data(trait, regression=False)
        print('Training trait ' + trait + ' with the regression model...')
        model.fit(X_regression, y_regression, regression=True)
        print('Done!')
        print('Training trait ' + trait + ' with the categorical model...')
        model.fit(X_categorical, y_categorical, regression=False)
        print('Done!')
        with open('static/' + trait + '_model.pkl', 'wb') as f:
            # Write the model to a file.
            pickle.dump(model, f)
    print("Training finished!")