class Predictor_cat5():

    def __init__(self):
        self.traits = ['OPN', 'CON', 'EXT', 'AGR', 'NEU']
        self.categories = [
            'OPENNESS', 'CONSCIENTIOUSNESS', 'EXTRAVERSION', 'AGREEABLENESS',
            'NEUROTICISM'
        ]
        self.Pre_cat = {
            trait: cat
            for (trait, cat) in zip(self.traits, self.categories)
        }
        self.models = {
            trait: pickle.load(
                open(os.getcwd() + '/model/' + trait + '_model.pkl', 'rb'))
            for trait in self.traits
        }
        self.dp = DataPrep()

    def predict(self, X, traits='All', predictions='All'):
        predictions = {}
        self.dp.transform(X)
        if traits == 'All':
            for trait in self.traits:
                pkl_model = self.models[trait]
                # trait_categories = pkl_model.predict(X, regression=False)
                # predictions[self.Pre_cat[trait]+' '] = str(trait_categories[0])
                # trait_scores = pkl_model.predict(X, regression=True).reshape(1, -1)
                # predictions[self.Pre_cat[trait]+' '] = predictions[self.Pre_cat[trait]+' ']+' '+str(round(trait_scores.flatten()[0]*10))+' % '
                trait_categories_probs = pkl_model.predict_proba(X)
                predictions[self.Pre_cat[trait] + ' '] = str(
                    trait_categories_probs[:, 1][0] * 100)
        return predictions
def __init__(self):
    self.traits = ['OPN', 'CON', 'EXT', 'AGR', 'NEU']
    self.categories = [
        'OPENNESS', 'CONSCIENTIOUSNESS', 'EXTRAVERSION', 'AGREEABLENESS',
        'NEUROTICISM'
    ]
    self.Pre_cat = {
        trait: cat
        for (trait, cat) in zip(self.traits, self.categories)
    }
    self.models = {
        trait: pickle.load(open('static/' + trait + '_model.pkl', 'rb'))
        for trait in self.traits
    }
    self.dp = DataPrep()
def run_data_prep(event, context):
    # def run_data_prep():
    #     inp = '{"reference": "4990012", "period": "201211", "survey": "066", "instance": "instanceId"}'
    #     event = json.loads(inp)
    print(event)
    dataprep = DataPrep(event)
    records = dataprep.get_qcode_resp_from_db()
    dataprep.construct_response(records)
    dataprep.construct_metadata()
    print("Attempting to invoke Wrangler Lambda with the json string: " + str(event))
    dataprep.send_data_to_wrangler()


# run_data_prep()
class Predictor_cat5():

    def __init__(self):
        """Load all regression and classification models of the cat5 set (model no. 1)."""
        self.traits = ['OPN', 'CON', 'EXT', 'AGR', 'NEU']
        self.categories = [
            'OPENNESS', 'CONSCIENTIOUSNESS', 'EXTRAVERSION', 'AGREEABLENESS',
            'NEUROTICISM'
        ]
        self.Pre_cat = {
            trait: cat
            for (trait, cat) in zip(self.traits, self.categories)
        }
        self.models = {
            trait: pickle.load(open('static/' + trait + '_model.pkl', 'rb'))
            for trait in self.traits
        }
        self.dp = DataPrep()

    def predict(self, X, traits='All', predictions='All'):
        """Take features and return predictions.

        Transforms the text into a vector and predicts class probabilities on it.
        """
        predictions = {}
        self.dp.transform(X)
        if traits == 'All':
            for trait in self.traits:
                pkl_model = self.models[trait]
                # trait_categories = pkl_model.predict(X, regression=False)
                # predictions[self.Pre_cat[trait]+' '] = str(trait_categories[0])
                # trait_scores = pkl_model.predict(X, regression=True).reshape(1, -1)
                # predictions[self.Pre_cat[trait]+' '] = predictions[self.Pre_cat[trait]+' ']+' '+str(round(trait_scores.flatten()[0]*10))+' % '
                trait_categories_probs = pkl_model.predict_proba(X)
                predictions[self.Pre_cat[trait] + ' '] = trait_categories_probs[:, 1][0] * 100
        return predictions
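# Usage sketch (an assumption for illustration, not part of the original module):
# how Predictor_cat5 might be called on raw status text. 'sample_text' is a made-up
# example; DataPrep, the pickled models, and the pickle/os imports are assumed to be
# available as in the class above.
if __name__ == '__main__':
    predictor = Predictor_cat5()
    sample_text = ['I love trying new things and meeting new people.']
    scores = predictor.predict(sample_text, traits='All')
    for category, probability in scores.items():
        print(category.strip(), probability)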
def load(self, data_path, data_regex, shards, prep_path, sparkContext):
    """
    For each section in the initializer, iterate through all files under that
    section directory and load the content of each individual file into the
    class instance.

    This method should be called after the section regex has been initialized
    and before any get_data method is called.
    """
    logger.info("Loading data...")
    self.dataPrep = DataPrep(dataURI=data_path,
                             dataRegex=data_regex,
                             shardNum=shards,
                             targetPath=prep_path,
                             sparkContext=sparkContext)

    # Load data
    if self.hadoop is True:
        self.dataPrep.loadHadoop()
    else:
        self.dataPrep.loadLocal()

    # Add data to data_list
    # If using yarn mode, local data will not be loaded
    if self.hadoop is False:
        for dirName, subdirList, fileList in os.walk(self.dataPrep.localPath()):
            for file_name in fileList:
                file_path = "%s/%s" % (str(dirName), str(file_name))
                self.data_list += self.data_format.get_data_from_file(file_path)
    else:
        aRdd = sparkContext.textFile(self.dataPrep.hadoopPath()).cache()
        tmp = aRdd.collect()
        tmpStr = ''.join(str(e) + "\n" for e in tmp)
        self.load_stringtext(textString=tmpStr)

    logger.info("Data loaded")
    return
neurons = [10, 10]
n_outputs = 1
hidden_act = act.Sigmoid()
output_act = act.Identity()

# create data using the Franke function
seed = 2034
np.random.seed(seed)
x = np.sort(np.random.uniform(0, 1, n))
y = np.sort(np.random.uniform(0, 1, n))
x, y = np.meshgrid(x, y)
z = np.ravel(f.FrankeFunction(x, y) + 0.1*np.random.randn(x.shape[0], x.shape[1]))
z = z.reshape(-1, 1)

# set up the design matrix
data = DataPrep()
X = data.design_matrix(x, y, degree=1)[:, 1:]

# split data in train and test and scale it
X_train, X_test, z_train, z_test = data.train_test_scale(X, z)

# set up the neural network
network = NeuralNetwork(X_train.shape[1], neurons, n_outputs, cost.MSE())
network.create_layers(hidden_act, output_act, seed)

# train the network
batch_size = len(X_train)//n_batches
index_array = np.arange(len(X_train))
for k in range(n_epochs):
    np.random.shuffle(index_array)
    X_minibatches = np.split(X_train[index_array], n_batches)
        return self.rfc.predict(X)

    def predict_proba(self, X, regression=False):
        X = self.tfidf.transform(X)
        if regression:
            raise ValueError('Cannot predict probabilities of a regression!')
        else:
            return self.rfc.predict_proba(X)


if __name__ == '__main__':
    traits = ['OPN', 'CON', 'EXT', 'AGR', 'NEU']
    model = Model()
    for trait in traits:
        dp = DataPrep()
        X_regression, y_regression = dp.prep_data('status', trait,
                                                  regression=True,
                                                  model_comparison=False)
        X_categorical, y_categorical = dp.prep_data('status', trait,
                                                    regression=False,
                                                    model_comparison=False)
        print('Fitting trait ' + trait + ' regression model...')
        model.fit(X_regression, y_regression, regression=True)
        print('Done!')
        print('Fitting trait ' + trait + ' categorical model...')
        model.fit(X_categorical, y_categorical, regression=False)
        print('Done!')
        with open('static/' + trait + '_model.pkl', 'wb') as f:
            # Write the fitted model to a file.
            pickle.dump(model, f)
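# Usage sketch (an assumption, not part of the original script): loading one of the
# pickled models written above and scoring new text through the probability
# interface shown in predict_proba(). The trait name and example status are
# illustrative placeholders.
with open('static/OPN_model.pkl', 'rb') as f:
    opn_model = pickle.load(f)

new_status = ['Spent the whole weekend reading about astronomy.']
probs = opn_model.predict_proba(new_status, regression=False)
print('P(high OPN) = %.3f' % probs[:, 1][0])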
class DataPool():
    """
    Data object that holds all sentences (dependency trees) and provides an
    interface for loading data from the disk and retrieving them using an index.

    Data are classified into sections when stored on disk, but we do not
    preserve such structural information: all sentences are loaded and
    "flattened" into a single list.

    The instance maintains a current_index variable, which is used to locate the
    last sentence object we have read. Calling get_next() increases it by 1, and
    calling has_next() tests this index against the total number of sentences.
    The value of the index persists across get_next() and has_next() calls, and
    is only reset to its initial value of -1 when reset() is called (manually or
    during init).
    """

    def __init__(self, fgen, data_format,
                 data_regex=None,
                 data_path=None,
                 textString=None,
                 prep_path='data/prep/',
                 shards=1,
                 sparkContext=None,
                 hadoop=False):
        """
        Initialize the data set.

        :param data_regex: the sections to be used; a regular expression that
            indicates which sections to use, e.g. (0[0-9])|(1[0-9])|(2[0-1])/.*tab
        :type data_regex: str

        :param data_path: the relative or absolute path to the 'penn-wsj-deps'
            folder (including "penn-wsj-deps")
        :type data_path: str

        :param data_format: the file that describes the file format for the type
            of data
        :type data_format: str
        """
        if isinstance(fgen, basestring):
            self.fgen = importlib.import_module('feature.' + fgen).FeatureGenerator
        else:
            self.fgen = fgen

        if isinstance(data_format, basestring):
            self.data_format = importlib.import_module(
                'data.data_format.' + data_format).DataFormat(self.fgen)
        else:
            self.data_format = data_format

        self.hadoop = hadoop
        self.reset_all()

        if textString is not None:
            self.load_stringtext(textString)

        if data_regex is not None:
            self.load(data_path=data_path,
                      data_regex=data_regex,
                      shards=shards,
                      prep_path=prep_path,
                      sparkContext=sparkContext)
        return

    def load(self, data_path, data_regex, shards, prep_path, sparkContext):
        """
        For each section in the initializer, iterate through all files under that
        section directory and load the content of each individual file into the
        class instance.

        This method should be called after the section regex has been initialized
        and before any get_data method is called.
        """
        logger.info("Loading data...")
        self.dataPrep = DataPrep(dataURI=data_path,
                                 dataRegex=data_regex,
                                 shardNum=shards,
                                 targetPath=prep_path,
                                 sparkContext=sparkContext)

        # Load data
        if self.hadoop is True:
            self.dataPrep.loadHadoop()
        else:
            self.dataPrep.loadLocal()

        # Add data to data_list
        # If using yarn mode, local data will not be loaded
        if self.hadoop is False:
            for dirName, subdirList, fileList in os.walk(self.dataPrep.localPath()):
                for file_name in fileList:
                    file_path = "%s/%s" % (str(dirName), str(file_name))
                    self.data_list += self.data_format.get_data_from_file(file_path)
        else:
            aRdd = sparkContext.textFile(self.dataPrep.hadoopPath()).cache()
            tmp = aRdd.collect()
            tmpStr = ''.join(str(e) + "\n" for e in tmp)
            self.load_stringtext(textString=tmpStr)

        logger.info("Data loaded")
        return

    def load_stringtext(self, textString):
        self.data_list += self.data_format.load_stringtext(textString)
        return

    def loadedPath(self):
        if self.dataPrep:
            if self.hadoop is True:
                return self.dataPrep.hadoopPath()
            else:
                return self.dataPrep.localPath()
        else:
            raise RuntimeError("DATAPOOL [ERROR]: Data has not been loaded by DataPrep, "
                               "cannot retrieve data path.")
        return

    def __add__(self, another_data_pool):
        if another_data_pool is None:
            return deepcopy(self)
        # if self.fgen != another_data_pool.fgen:
        #     raise RuntimeError("DATAPOOL [ERROR]: Merging dataPools do not have the same fgen")
        # if self.data_format != another_data_pool.data_format:
        #     raise RuntimeError("DATAPOOL [ERROR]: Merging dataPools do not have the same format")
        newDataPool = deepcopy(self)
        newDataPool.data_list = newDataPool.data_list + another_data_pool.data_list
        newDataPool.reset_index()
        return newDataPool

    def export(self, fileURI, sparkContext=None):
        self.data_format.export_to_file(self, fileURI, sparkContext)
        return

    def reset_all(self):
        """
        Reset the index variables and the data list.

        Restores the instance to a state in which no sentence has been read.
        """
        self.reset_index()
        self.data_list = []
        return

    def reset_index(self):
        """
        Reset the index variable to the very beginning of the sentence list.
        """
        self.current_index = -1

    def has_next_data(self):
        """
        Return True if there are still sentences that have not been read.

        This call does not advance the data pointer; get_next_data() does that.

        :return: False if we have reached the end of data_list, True otherwise
        """
        i = self.current_index + 1
        if i >= 0 and i < len(self.data_list):
            return True
        else:
            return False

    def get_next_data(self):
        """
        Return the next sentence object, which was previously read from disk files.

        This method does not perform index checking, so please make sure the
        internal index is valid by calling has_next_data() first, or an exception
        will be raised (which is definitely not what you want).
        """
        if self.has_next_data():
            self.current_index += 1
            # Log how many entries we have supplied
            if self.current_index % 1000 == 0:
                logger.debug("Data finishing %.2f%% ..." %
                             (100 * self.current_index / len(self.data_list), ))
            return self.data_list[self.current_index]
        raise IndexError("Run out of data while calling get_next_data()")

    def get_sent_num(self):
        return len(self.data_list)
no_hidden = False
seed = 2034

# load the scikit-learn digits dataset (8x8 MNIST-style images)
digits = datasets.load_digits()

# define input data and labels
dataset = digits.images
labels = digits.target.reshape(-1, 1)

# flatten the images
N = len(dataset)
dataset = dataset.reshape(N, -1)

# transform labels to one-hot vectors and split into train and test
data = DataPrep()
accuracy = cost.Accuracy()
one_hot = data.create_one_hot(N, labels)
X_train, X_test, z_train, z_test = data.train_test_split(dataset, one_hot)
z_test = np.argmax(z_test, axis=1)
batch_size = len(X_train) // n_batches

# set up the neural network
network = NeuralNetwork(X_train.shape[1], neurons, n_outputs, cost_func)

array_lambda = [0, 1e-4, 1e-3, 1e-2, 1e-1, 0.9]
array_eta = [0.001, 0.01, 0.1, 0.2, 0.5, 0.7, 1]
accuracy_heatmap = np.zeros((len(array_lambda), len(array_eta)))
index_array = np.arange(len(X_train))
class TabularDataset:
    def __init__(self, data, targets, model_type='classification'):
        self.data = data
        self.targets = targets
        self.model_type = model_type

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        current_sample = self.data[idx, :]
        current_target = self.targets[idx]
        return {
            "x": torch.tensor(current_sample, dtype=torch.long),
            "y": torch.tensor(current_target, dtype=torch.float)
            if self.model_type == 'regression'
            else torch.tensor(current_target, dtype=torch.long)
        }


if __name__ == '__main__':
    dt = pd.DataFrame({
        'category': ['a', 'b', 'c', 'a', 'a', 'c', 'd', 'e', 'c'],
        'class': ['I', 'IV', 'V', None, 'I', 'V', None, 'VII', 'V'],
        'targets': [0, 0, 0, 1, 1, 0, 1, 0, 1]
    })
    data_treat = DataPrep(data=dt, categorical_var_list=['category', 'class'])
    clean_data = data_treat.run_preprocessing(treat_na=True, label_encode=True)
    dataset = TabularDataset(data=clean_data[['category', 'class']].values,
                             targets=clean_data['targets'])
    print(dataset[8])
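# Follow-up sketch (an assumption, not part of the original snippet): because
# TabularDataset returns a dict of tensors per row, it can be batched with a
# standard torch DataLoader. The batch size here is an arbitrary illustrative choice.
from torch.utils.data import DataLoader

loader = DataLoader(dataset, batch_size=4, shuffle=True)
for batch in loader:
    # default collation stacks the per-row tensors:
    # batch["x"] -> shape (batch_size, n_features), batch["y"] -> shape (batch_size,)
    print(batch["x"].shape, batch["y"].shape)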
        if regression:
            return self.rfr.predict(X)
        else:
            return self.rfc.predict(X)

    def predict_prob(self, X, regression=False):
        X = self.tfidf.transform(X)
        if regression:
            raise ValueError('Cannot predict probabilities of a regression!')
        else:
            return self.rfc.predict_proba(X)


if __name__ == '__main__':
    traits = ['OPN', 'CON', 'EXT', 'AGR', 'NEU']
    model = Model()
    for trait in traits:
        dp = DataPrep()
        X_regression, y_regression = dp.prep_data(trait, regression=True)
        X_categorical, y_categorical = dp.prep_data(trait, regression=False)
        print('Training trait ' + trait + ' with the regression model...')
        model.fit(X_regression, y_regression, regression=True)
        print('Done!')
        print('Training trait ' + trait + ' with the categorical model...')
        model.fit(X_categorical, y_categorical, regression=False)
        print('Done!')
        with open('static/' + trait + '_model.pkl', 'wb') as f:
            # Write the model to a file.
            pickle.dump(model, f)
    print("Training finished!")