def write_rds(df, filepath):
    """Write a pandas dataframe to an RDS file at the path specified.

    Valid locations are scratch or results. The index is stored as the first
    column because pyreadr.write_rds drops it otherwise.
    """
    df.insert(0, df.index.name, df.index, allow_duplicates=True)
    pyreadr.write_rds(filepath, df)
    # pyreadr.write_rds fails silently when permissions prevent the file write,
    # so trigger an error if our file isn't actually there.
    with open(filepath, "rb"):
        pass
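# Read-back counterpart (a minimal sketch, not part of the original module; the
# helper name read_rds and the index_col argument are illustrative).
# pyreadr.read_r returns an ordered dict of dataframes; for a single-object
# .rds file the dataframe sits under the key None, so the index column written
# by write_rds above can be restored with set_index.
def read_rds(filepath, index_col):
    result = pyreadr.read_r(filepath)
    df = result[None]
    return df.set_index(index_col)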
def write(df):
    import shutil
    import pyreadr
    from pathlib import Path
    from tempfile import TemporaryDirectory

    file = Path(__file__).parent / PARAM['trg']

    with TemporaryDirectory() as tmpdir:
        tmprds = (Path(tmpdir) / Path(PARAM['trg']).stem).with_suffix(".rds")
        pyreadr.write_rds(str(tmprds), df)

        with open(tmprds, 'rb') as fd_in:
            # import gzip
            # with gzip.open(file, 'wb') as fd_out:
            from zipfile import ZipFile, ZIP_BZIP2 as ZIP
            with ZipFile(file, 'w', compression=ZIP) as zf:
                with zf.open(tmprds.name, 'w') as fd_out:
                    shutil.copyfileobj(fd_in, fd_out)
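# Reading the archive back (a minimal sketch, assuming the layout produced by
# write() above: a zip archive containing a single <stem>.rds member; the
# helper name read() and its target argument are illustrative).
def read(target):
    import pyreadr
    from pathlib import Path
    from tempfile import TemporaryDirectory
    from zipfile import ZipFile

    file = Path(__file__).parent / target
    with TemporaryDirectory() as tmpdir:
        with ZipFile(file) as zf:
            member = zf.namelist()[0]  # the single .rds member
            extracted = zf.extract(member, path=tmpdir)
        return pyreadr.read_r(extracted)[None]  # .rds content is keyed by None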
        dtype=np.float)
else:
    market_population = np.array(list(markets_gdp_Gsnap['pop.' + YEAR][0:]),
                                 dtype=np.float)

points_nodes = list(points_gdp_Gsnap.NN.unique())

#### Loop through cells
if MA_CALC_FUNCTION == 'by_cell':
    points_gdp_Gsnap['MA'] = list(
        map(calc_MA_per_cell, list(points_gdp_Gsnap.cell_id)))

    pyreadr.write_rds(
        os.path.join(project_file_path, 'Data', 'IntermediateData',
                     'Outputs for Grid', 'DMSPOLS',
                     'MA_' + YEAR + '_constantpop' + str(CONSTANT_POPULATION) + '.Rds'),
        points_gdp_Gsnap)

#### Loop through nodes
if MA_CALC_FUNCTION == 'by_node':
    counter = 1
    points_MA_all = calc_MA_per_node(points_nodes[0])

    for points_nodes_i in points_nodes[1:]:
        points_MA_all_i = calc_MA_per_node(points_nodes_i)
        points_MA_all = points_MA_all.append(points_MA_all_i)

        counter = counter + 1
        if (counter % 10) == 0:
            print(counter)
            print(points_nodes_i)
# Penalty for the MMD distance
pars['GAMMA'] = 1
# Penalty encouraging second-order knockoffs
pars['LAMBDA'] = 1
# Decorrelation penalty hyperparameter
pars['DELTA'] = 1
# Target pairwise correlations between variables and knockoffs
pars['target_corr'] = corr_g
# Kernel widths for the MMD measure (uniform weights)
pars['alphas'] = [1., 2., 4., 8., 16., 32., 64., 128.]

# Where the machine is stored
checkpoint_name = "../models/deepmodel"

# test to exclude 51
x_train = x_train.to_numpy()

# Initialize the machine
machine = KnockoffMachine(pars, checkpoint_name)

# Train the machine
#pdb.set_trace()
machine.train(x_train)

# Generate deep knockoffs
xk_train = machine.generate(x_train)

# Save knockoffs
pyreadr.write_rds("../data/derived_data/knockoffs.rds", pd.DataFrame(xk_train))
                print('[%d, %5d] average mini batch loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 1000))
                running_loss = 0.0

print('Finished Training')

# Save trained model (before evaluation, set the model to eval mode with model.eval()).
torch.save(
    {
        "model_state_dict": net.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "epoches": epoch
    }, model_dir + model_name + ".pt")

# Load saved model
#model2 = Net(input_dim = len(data_liver[0][0]))
#savedModel = torch.load("models/myFirstModel.pt")
#model2.load_state_dict(savedModel['model_state_dict'])

# Read test data and predict scores using the trained model
test_data = dataloader_rds.tabularData("data/dataset-test_std_nocirc.rds")
net.eval()  # switch model to evaluation mode before predictions

if arch_model_type == "regression":
    predictions = pd.DataFrame(net(test_data[:][0]).detach().numpy())
elif arch_model_type == "multi_class":
    predictions = pd.DataFrame(
        torch.argmax(net(test_data[:][0]), dim=1).detach().numpy())
else:
    print("WARNING: No predictions are made!")

pyreadr.write_rds(predictions_dir + model_name + ".rds", predictions)
def convert_count_matrix2RDS(infile, outfile):
    file_df = pd.read_csv(infile, sep="\t", header=0)
    pyreadr.write_rds(outfile, file_df)
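# Example invocation (hypothetical file names, shown only to illustrate the
# expected tab-separated input and the .rds output path):
# convert_count_matrix2RDS("counts/raw_counts.tsv", "counts/raw_counts.rds")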
def trainModel(pars):
    # ----- set input data ------
    train_set = pars.train_set
    test_set = pars.test_set

    # ----- set hyperparameters ------
    torch.manual_seed(pars.random_seed)
    arch_model_type = pars.model_type
    arch_hidden_dims = flexNet.hexagon(int(pars.hidden_number),
                                       int(pars.hidden_size_max))
    arch_dropout = pars.dropout
    arch_activation_fct = pars.activation_fn
    opt_learning_rate = pars.learning_rate
    opt_momentum = .9
    mini_batch_size = int(pars.batch_size)
    epoches = int(pars.epoches)
    model_name = pars.model_name
    # --------------------------------

    # ----- load training data -----
    trainset = dataloader_rds.tabularData(data_dir + train_set)
    train_loader = torch.utils.data.DataLoader(trainset,
                                               batch_size=mini_batch_size,
                                               shuffle=True)
    # ------------------------------

    # ----- initialize network -----
    net = flexNet.FlexNet(input_dim=trainset.in_dim(),
                          hidden_dims=arch_hidden_dims,
                          model=arch_model_type,
                          dropout_rate=arch_dropout,
                          activation_fct=arch_activation_fct)
    # ------------------------------

    # ----- define loss function -----
    if arch_model_type == "regression":
        criterion = nn.MSELoss(reduction="mean")
    elif arch_model_type == "multi_class":
        criterion = nn.NLLLoss()
    else:
        raise Exception("No loss function defined. Invalid model type specified!")
    # --------------------------------

    # ----- create a stochastic gradient descent optimizer ------
    optimizer = optim.SGD(net.parameters(), lr=opt_learning_rate,
                          momentum=opt_momentum)
    # -----------------------------------------------------------

    for epoch in range(epoches):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, data in enumerate(train_loader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            if arch_model_type == "multi_class":
                labels = labels.to(dtype=torch.long).squeeze()

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            #if i % 1000 == 999:  # print every 1000 mini-batches
            #    print('[%d, %5d] average mini batch loss: %.3f' %
            #          (epoch + 1, i + 1, running_loss / 1000))
            #    running_loss = 0.0

    print('Finished Training of Model_' + model_name + " !")

    # save trained model
    torch.save({"model_state_dict": net.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "epoches": epoch},
               model_dir + "model_" + model_name + ".pt")

    # switch model to eval mode before calculating predictions
    net.eval()

    # calculate train set predictions
    trainset = dataloader_rds.tabularData(data_dir + train_set)
    if arch_model_type == "regression":
        predictions = pd.DataFrame(net(trainset[:][0]).detach().numpy())
    elif arch_model_type == "multi_class":
        predictions = pd.DataFrame(
            torch.argmax(net(trainset[:][0]), dim=1).detach().numpy())
    else:
        print("WARNING: No predictions are made for Model_" + model_name + "!")
    pyreadr.write_rds(predictions_dir + "train_" + model_name + ".rds", predictions)

    # calculate ablation set predictions
    ablationset = dataloader_rds.tabularData(data_dir + ablation_set)
    if ablationset.in_dim() == trainset.in_dim():
        if arch_model_type == "regression":
            predictions = pd.DataFrame(net(ablationset[:][0]).detach().numpy())
        elif arch_model_type == "multi_class":
            predictions = pd.DataFrame(
                torch.argmax(net(ablationset[:][0]), dim=1).detach().numpy())
        else:
            print("WARNING: No predictions are made for Model_" + model_name + "!")
        pyreadr.write_rds(predictions_dir + "ablation_" + model_name + ".rds", predictions)

    # calculate test set predictions
    testset = dataloader_rds.tabularData(data_dir + test_set)
    if arch_model_type == "regression":
        predictions = pd.DataFrame(net(testset[:][0]).detach().numpy())
    elif arch_model_type == "multi_class":
        predictions = pd.DataFrame(
            torch.argmax(net(testset[:][0]), dim=1).detach().numpy())
    else:
        print("WARNING: No predictions are made for Model_" + model_name + "!")
    pyreadr.write_rds(predictions_dir + "test_" + model_name + ".rds", predictions)

    return None
"preferred_label_fi"].values[0] if isinstance( escos.loc[escos["conceptUri"] == esco_uri, "alt_labels_fi"].values[0], list): altLabelsFi = "\n".join( escos.loc[escos["conceptUri"] == esco_uri, "alt_labels_fi"].values[0]) elif isinstance( escos.loc[escos["conceptUri"] == esco_uri, "alt_labels_fi"].values[0], str): altLabelsFi = escos.loc[escos["conceptUri"] == esco_uri, "alt_labels_fi"].values[0] else: altLabelsFi = "" row_tuple = (qualification_uri, dp_uri, esco_uri, preferredLabelEn, altLabelsEn, descriptionEn, preferredLabelFi, altLabelsFi) pairing_rows.append(row_tuple) degree_part_competences = pd.DataFrame.from_records( pairing_rows, columns=[ "qualification_uri", "unit_uri", "conceptUri", "preferredLabelEn", "altLabelsEn", "descriptionEn", "preferredLabelFi", "altLabelsFi" ]) pyreadr.write_rds(os.path.join(source_folder, "..", "data", "escos.rds"), degree_part_competences)