Example #1
import pyreadr

def write_rds(df, filepath):
    """Write a pandas dataframe to an RDS file at the given path.
    Valid locations are scratch or results.
    The index is stored as the first column because pyreadr.write_rds drops it otherwise."""
    df.insert(0, df.index.name, df.index, allow_duplicates=True)
    pyreadr.write_rds(filepath, df)
    # Pyreadr.write_rds fails silently when permissions prevent file write,
    # so trigger an error if our file isn't actually there
    with open(filepath, "rb"):
        pass
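
A minimal round-trip sketch (not part of the original source) of how this helper might be used; the dataframe, the index name "sample_id", and the "scratch/example.rds" path are illustrative assumptions. pyreadr.read_r returns an ordered dictionary, and an RDS file holds a single object stored under the key None.

import pandas as pd
import pyreadr

# Hypothetical frame with a named index; the path below is a placeholder
df = pd.DataFrame({"value": [1.0, 2.5, 3.7]},
                  index=pd.Index(["a", "b", "c"], name="sample_id"))
write_rds(df, "scratch/example.rds")

# Round-trip check: the index comes back as the first column, not as a pandas index
restored = pyreadr.read_r("scratch/example.rds")[None]
print(restored.columns.tolist())  # ['sample_id', 'value']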
Example #2
def write(df):
    import shutil
    import pyreadr
    from pathlib import Path
    from tempfile import TemporaryDirectory

    file = Path(__file__).parent / PARAM['trg']

    with TemporaryDirectory() as tmpdir:
        tmprds = (Path(tmpdir) / Path(PARAM['trg']).stem).with_suffix(".rds")
        pyreadr.write_rds(str(tmprds), df)

        with open(tmprds, 'rb') as fd_in:
            # import gzip
            # with gzip.open(file, 'wb') as fd_out:
            from zipfile import ZipFile, ZIP_BZIP2 as ZIP
            with ZipFile(file, 'w', compression=ZIP) as zf:
                with zf.open(tmprds.name, 'w') as fd_out:
                    shutil.copyfileobj(fd_in, fd_out)
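
As a usage note, here is a hedged sketch of how such an archive could be loaded back in Python, assuming (as in the snippet above) that the zip holds a single .rds member; the helper name read_zipped_rds is made up for illustration. The member is extracted to a temporary directory and the extracted path is handed to pyreadr.read_r.

def read_zipped_rds(archive_path, member_name):
    """Sketch: pull one .rds member out of the archive and load it with pyreadr."""
    import pyreadr
    from tempfile import TemporaryDirectory
    from zipfile import ZipFile

    with TemporaryDirectory() as tmpdir:
        with ZipFile(archive_path) as zf:
            extracted = zf.extract(member_name, path=tmpdir)
        # An RDS file contains a single unnamed object, returned under the key None
        return pyreadr.read_r(extracted)[None]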
Example #3
                                     dtype=float)
    else:
        market_population = np.array(list(markets_gdp_Gsnap['pop.' +
                                                            YEAR][0:]),
                                     dtype=float)

    points_nodes = list(points_gdp_Gsnap.NN.unique())

    #### Loop through cells
    if MA_CALC_FUNCTION == 'by_cell':
        points_gdp_Gsnap['MA'] = list(
            map(calc_MA_per_cell, list(points_gdp_Gsnap.cell_id)))

        pyreadr.write_rds(
            os.path.join(
                project_file_path, 'Data', 'IntermediateData',
                'Outputs for Grid', 'DMSPOLS', 'MA_' + YEAR + '_constantpop' +
                str(CONSTANT_POPULATION) + '.Rds'), points_gdp_Gsnap)

    #### Loop through nodes
    if MA_CALC_FUNCTION == 'by_node':
        counter = 1
        points_MA_all = calc_MA_per_node(points_nodes[0])
        for points_nodes_i in points_nodes[1:]:
            points_MA_all_i = calc_MA_per_node(points_nodes_i)
            points_MA_all = pd.concat([points_MA_all, points_MA_all_i])
            counter = counter + 1

            if (counter % 10) == 0:
                print(counter)
                print(points_nodes_i)
Example #4
# Penalty for the MMD distance
pars['GAMMA'] = 1
# Penalty encouraging second-order knockoffs
pars['LAMBDA'] = 1
# Decorrelation penalty hyperparameter
pars['DELTA'] = 1
# Target pairwise correlations between variables and knockoffs
pars['target_corr'] = corr_g
# Kernel widths for the MMD measure (uniform weights)
pars['alphas'] = [1., 2., 4., 8., 16., 32., 64., 128.]

# Where the machine is stored
checkpoint_name = "../models/deepmodel"

# test to exclude 51

x_train = x_train.to_numpy()

# Initialize the machine
machine = KnockoffMachine(pars, checkpoint_name)

# Train the machine
#pdb.set_trace()
machine.train(x_train)

# Generate deep knockoffs
xk_train = machine.generate(x_train)

# Save knockoffs
pyreadr.write_rds("../data/derived_data/knockoffs.rds", pd.DataFrame(xk_train))
            print('[%d, %5d] average mini batch loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 1000))
            running_loss = 0.0

print('Finished Training')

# save trained model (before evaluation, switch the model to eval mode with model.eval())
torch.save(
    {
        "model_state_dict": net.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "epoches": epoch
    }, model_dir + model_name + ".pt")

#load saved model
#model2 = Net(input_dim = len(data_liver[0][0]))
#savedModel = torch.load("models/myFirstModel.pt")
#model2.load_state_dict(savedModel['model_state_dict'])

#read test data and predict scores using the trained model
test_data = dataloader_rds.tabularData("data/dataset-test_std_nocirc.rds")
net.eval()  #switch model to evaluation mode before predictions
if arch_model_type == "regression":
    predictions = pd.DataFrame(net(test_data[:][0]).detach().numpy())
elif arch_model_type == "multi_class":
    predictions = pd.DataFrame(
        torch.argmax(net(test_data[:][0]), dim=1).detach().numpy())
else:
    print("WARNING: No predications are made!")
pyreadr.write_rds(predictions_dir + model_name + ".rds", predictions)
Example #6
import pandas as pd
import pyreadr

def convert_count_matrix2RDS(infile, outfile):
    file_df = pd.read_csv(infile, sep="\t", header=0)
    pyreadr.write_rds(outfile, file_df)
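
A trivial usage sketch; "counts.tsv" and "counts.rds" are placeholder file names, assuming a tab-separated count matrix with a header row as the function expects.

convert_count_matrix2RDS("counts.tsv", "counts.rds")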
Example #7
def trainModel(pars):
    #----- set input data      ------
    train_set = pars.train_set
    test_set = pars.test_set
    #----- set hyperparameters ------
    torch.manual_seed(pars.random_seed)
    
    arch_model_type = pars.model_type
    arch_hidden_dims = flexNet.hexagon(int(pars.hidden_number), int(pars.hidden_size_max))
    arch_dropout = pars.dropout
    arch_activation_fct = pars.activation_fn
    
    opt_learning_rate = pars.learning_rate
    opt_momentum = .9
    
    mini_batch_size = int(pars.batch_size)
    epoches = int(pars.epoches)
    
    model_name = pars.model_name
    #--------------------------------
    
    
    
    #----- load training data -----
    trainset = dataloader_rds.tabularData(data_dir + train_set)
    train_loader = torch.utils.data.DataLoader(trainset, batch_size = mini_batch_size, shuffle = True)
    #------------------------------
    
    
    
    #----- initialize network -----
    net = flexNet.FlexNet(input_dim = trainset.in_dim(),
                          hidden_dims = arch_hidden_dims,
                          model = arch_model_type, 
                          dropout_rate = arch_dropout,
                          activation_fct = arch_activation_fct)
    #------------------------------
    
    
    
    #----- define loss function -----
    if arch_model_type == "regression":
        criterion = nn.MSELoss(reduction = "mean")
    elif arch_model_type == "multi_class":
        criterion = nn.NLLLoss()
    else:
        raise Exception("No loss function defined. Invalid model type specified!")
    #--------------------------------
    
    
    
    #----- create a stochastic gradient descent optimizer ------
    optimizer = optim.SGD(net.parameters(), lr=opt_learning_rate, momentum=opt_momentum)
    #-----------------------------------------------------------
    
    
    
    for epoch in range(epoches):  # loop over the dataset multiple times
        
        running_loss = 0.0
        for i, data in enumerate(train_loader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data

            if arch_model_type == "multi_class":
                labels = labels.to(dtype=torch.long).squeeze()

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            #if i % 1000 == 999:    # print every 1000 mini-batches
            #    print('[%d, %5d] average mini batch loss: %.3f' %
            #          (epoch + 1, i + 1, running_loss / 1000))
            #    running_loss = 0.0
    
    print('Finished Training of Model_' + model_name + " !")
    
    
    
    #save trained model
    torch.save({"model_state_dict": net.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "epoches": epoch},
                model_dir + "model_" + model_name + ".pt")
            
    # switch the model to eval mode before calculating predictions
    net.eval()
    
    # calculate train set predictions
    trainset = dataloader_rds.tabularData(data_dir + train_set)
    
    if arch_model_type == "regression":
        predictions = pd.DataFrame(net(trainset[:][0]).detach().numpy())
    elif arch_model_type == "multi_class":
        predictions = pd.DataFrame(torch.argmax(net(trainset[:][0]), dim=1).detach().numpy())
    else:
        print("WARNING: No predications are made for Model_" + model_name + "!")
    
    pyreadr.write_rds(predictions_dir + "train_" + model_name + ".rds", predictions)
    
    
    
    #calc ablation set predictions
    ablationset = dataloader_rds.tabularData(data_dir + ablation_set)
    
    if ablationset.in_dim() == trainset.in_dim():
    
        if arch_model_type == "regression":
            predictions = pd.DataFrame(net(ablationset[:][0]).detach().numpy())
        elif arch_model_type == "multi_class":
            predictions = pd.DataFrame(torch.argmax(net(ablationset[:][0]), dim=1).detach().numpy())
        else:
            print("WARNING: No predications are made for Model_" + model_name + "!")
    
        pyreadr.write_rds(predictions_dir + "ablation_" + model_name + ".rds", predictions)
    
    
    #calc test set predictions
    testset = dataloader_rds.tabularData(data_dir + test_set)
    
    
    if arch_model_type == "regression":
        predictions = pd.DataFrame(net(testset[:][0]).detach().numpy())
    elif arch_model_type == "multi_class":
        predictions = pd.DataFrame(torch.argmax(net(testset[:][0]), dim=1).detach().numpy())
    else:
        print("WARNING: No predications are made for Model_" + model_name + "!")
    
    pyreadr.write_rds(predictions_dir + "test_" + model_name + ".rds", predictions)
    
    return None
Example #8
                                         "preferred_label_fi"].values[0]

            if isinstance(
                    escos.loc[escos["conceptUri"] == esco_uri,
                              "alt_labels_fi"].values[0], list):
                altLabelsFi = "\n".join(
                    escos.loc[escos["conceptUri"] == esco_uri,
                              "alt_labels_fi"].values[0])
            elif isinstance(
                    escos.loc[escos["conceptUri"] == esco_uri,
                              "alt_labels_fi"].values[0], str):
                altLabelsFi = escos.loc[escos["conceptUri"] == esco_uri,
                                        "alt_labels_fi"].values[0]
            else:
                altLabelsFi = ""

            row_tuple = (qualification_uri, dp_uri, esco_uri, preferredLabelEn,
                         altLabelsEn, descriptionEn, preferredLabelFi,
                         altLabelsFi)
            pairing_rows.append(row_tuple)

    degree_part_competences = pd.DataFrame.from_records(
        pairing_rows,
        columns=[
            "qualification_uri", "unit_uri", "conceptUri", "preferredLabelEn",
            "altLabelsEn", "descriptionEn", "preferredLabelFi", "altLabelsFi"
        ])

    pyreadr.write_rds(os.path.join(source_folder, "..", "data", "escos.rds"),
                      degree_part_competences)