Ejemplo n.º 1
0
def upload(request):
    """
    Serve the upload page and process submitted data.

    Requests that fail the `_is_logged_in` check are redirected to the login URL.
    A POST request passes `request.POST['data']` to `tools.import_data` and then
    redirects to '/'; any other method renders 'upload.html'.
    """
    if not _is_logged_in(request):
        login_url = users.create_login_url('/')
        return redirect(login_url)

    if request.method != 'POST':
        return render(request, 'upload.html')

    # The importer is only loaded when a submission actually has to be processed.
    from tools import import_data
    import_data(request.POST['data'])
    return redirect('/')
Ejemplo n.º 2
0
 def import_data(self, data_file="data_test.csv"):
     """
     Load the problem parameters from a csv file and reset the problem variables.

     The nine values returned by the module-level ``import_data(data_file)`` are
     stored on the instance, then the multiplier, production, price, setup and
     storage arrays are re-created filled with zeros.

     import_data(self,data_file="data_test.csv")
     """
     # problem parameters: unpack first, then store each value on the instance
     (time_hor, nb_obj, alpha, beta,
      cost_setup, cost_stor, cost_prod,
      cons_prod, constraint) = import_data(data_file)
     self.time_hor = time_hor
     self.nb_obj = nb_obj
     self.alpha = alpha
     self.beta = beta
     self.cost_setup = cost_setup
     self.cost_stor = cost_stor
     self.cost_prod = cost_prod
     self.cons_prod = cons_prod
     self.constraint = constraint
     # when the constraint has more than one dimension only its first entry is kept
     if len(self.constraint.shape) > 1:
         self.constraint = self.constraint[0]

     # problem variables
     self.coef = np.zeros(self.time_hor, float)  # lagrangian multiplier
     grid_shape = (self.nb_obj, self.time_hor)
     for attr_name in ('production', 'price', 'setup', 'storage'):
         setattr(self, attr_name, np.zeros(grid_shape, float))

     if self.verbose > 0:
         show_data(self)
Ejemplo n.º 3
0
class Test(unittest.TestCase):
    """
    Regression tests for the CRep algorithm on the input network `syn111.dat`.

    The data import, the YAML configuration load and the model construction are all
    executed in the class body (i.e. once, when the class is defined); the test methods
    then read the resulting class attributes through `self`.
    """
    algorithm = 'CRep'
    K = 3
    in_folder = '../data/input/'
    # NOTE: `out_folder` is not read in this class; the tests use `self.model.out_folder`.
    out_folder = '../data/output/'
    end_file = '_test'
    adj = 'syn111.dat'
    ego = 'source'
    alter = 'target'
    force_dense = False
    flag_conv = 'log'

    '''
    Import data
    '''
    network = in_folder + adj  # network complete path
    A, B, B_T, data_T_vals = tl.import_data(network, ego=ego, alter=alter, force_dense=force_dense, header=0)
    nodes = A[0].nodes()

    ''' 
    Setting to run the algorithm
    '''
    # The configuration file name is derived from `algorithm` (here 'setting_CRep.yaml').
    with open('setting_' + algorithm + '.yaml') as f:
        conf = yaml.load(f, Loader=yaml.FullLoader)
        # Replace the configured output-file suffix with the test suffix.
        conf['end_file'] = end_file

    model = CREP.CRep(N=A[0].number_of_nodes(), L=len(A), K=K, **conf)

    # Check that the imported data B carries a strictly positive total weight.
    def test_import_data(self):
        print("Start import data test\n")
        if self.force_dense:
            self.assertTrue(self.B.sum() > 0)
            print('B has ', self.B.sum(), ' total weight.')
        else:
            # In the non-dense case the weights are read from the `vals` attribute of B.
            self.assertTrue(self.B.vals.sum() > 0)
            print('B has ', self.B.vals.sum(), ' total weight.')

    # Fit the model and compare the parameters stored on disk with the fitted ones
    # and with the reference file.
    def test_running_algorithm(self):
        print("\nStart running algorithm test\n")

        _ = self.model.fit(data=self.B, data_T=self.B_T, data_T_vals=self.data_T_vals, flag_conv=self.flag_conv,
                           nodes=self.nodes)

        # `theta` is the file named after the model's end_file (presumably written by fit() -- TODO confirm);
        # `thetaGT` is the reference file named after the algorithm.
        theta = np.load(self.model.out_folder+'theta'+self.model.end_file+'.npz')
        thetaGT = np.load(self.model.out_folder+'theta_'+self.algorithm+'.npz')

        # The parameters held by the model must equal the ones stored in the output file ...
        self.assertTrue(np.array_equal(self.model.u_f, theta['u']))
        self.assertTrue(np.array_equal(self.model.v_f, theta['v']))
        self.assertTrue(np.array_equal(self.model.w_f, theta['w']))
        self.assertTrue(np.array_equal(self.model.eta_f, theta['eta']))

        # ... and the output file must equal the reference file exactly.
        self.assertTrue(np.array_equal(thetaGT['u'], theta['u']))
        self.assertTrue(np.array_equal(thetaGT['v'], theta['v']))
        self.assertTrue(np.array_equal(thetaGT['w'], theta['w']))
        self.assertTrue(np.array_equal(thetaGT['eta'], theta['eta']))
Ejemplo n.º 4
0
def main():
    """
    Clean the imported data set and write it to 'output/data_cleaned.csv'.

    Loads the data frame with `import_data(data_file)`, reads the replacement
    dictionary with `json_to_dict(json_file)`, prints it, applies it through
    `replace_value` and saves the result as a ';'-separated, UTF-8 encoded CSV.
    """
    df = import_data(data_file)
    # The dictionary can alternatively be built from the data frame:
    #    dictionary = construct_json_from_df(df,json_file)
    dico = json_to_dict(json_file)
    # Function-call form: valid on Python 3 and prints the same text on Python 2.
    print(dico)
    df = replace_value(df, dico)
    df.to_csv('output/data_cleaned.csv',
              sep=';',
              encoding='utf-8',
              float_format='%.12g')
Ejemplo n.º 5
0
class Test(unittest.TestCase):
    """
    Regression tests for the MTCOV algorithm on the `adj.csv` / `X.csv` input files.

    The data import and the model construction are executed in the class body (i.e. once,
    when the class is defined); the test methods read the resulting class attributes
    through `self`.
    """
    N = 100
    L = 4
    C = 2
    gamma = 0.5
    in_folder = '../data/input/'
    out_folder = '../data/output/test/'
    end_file = '_test'
    adj_name = 'adj.csv'
    cov_name = 'X.csv'
    ego = 'source'
    alter = 'target'
    egoX = 'Name'
    attr_name = 'Metadata'
    rseed = 107261
    N_real = 1
    undirected = False
    force_dense = True
    err = 0.1
    tolerance = 0.0001
    decision = 10
    maxit = 500
    assortative = False
    inf = 1e10
    err_max = 0.0000001
    '''
    Import data
    '''
    A, B, X, nodes = tl.import_data(in_folder,
                                    adj_name=adj_name,
                                    cov_name=cov_name,
                                    ego=ego,
                                    alter=alter,
                                    egoX=egoX,
                                    attr_name=attr_name)
    Xs = np.array(X)

    # NOTE: N and L passed to the model are derived from the imported data, not from the
    # class attributes `N` and `L` declared above.
    MTCOV = mtcov.MTCOV(
        N=A[0].number_of_nodes(),  # number of nodes
        L=len(B),  # number of layers
        C=C,  # number of communities
        Z=X.shape[1],  # number of modalities of the attribute
        gamma=gamma,  # scaling parameter gamma
        undirected=undirected,  # if True, the network is undirected
        rseed=rseed,  # random seed for the initialization
        inf=inf,  # initial value for log-likelihood and parameters
        err_max=err_max,  # minimum value for the parameters
        err=err,  # error for the initialization of W
        N_real=
        N_real,  # number of iterations with different random initialization
        tolerance=tolerance,  # tolerance parameter for convergence
        decision=decision,  # convergence parameter
        maxit=maxit,  # maximum number of EM steps before aborting
        folder=out_folder,  # path for storing the output
        end_file=end_file,  # output file suffix
        assortative=assortative  # if True, the network is assortative
    )

    # Check that the imported data B carries a strictly positive total weight.
    def test_import_data(self):
        print("Start import data test\n")
        self.assertTrue(self.B.sum() > 0)
        print('B has ', self.B.sum(), ' total weight.')

    # Fit the model and compare the parameters stored on disk with the fitted ones
    # and with the reference file.
    def test_running_algorithm(self):
        print("\nStart running algorithm test\n")

        _ = self.MTCOV.fit(data=self.B,
                           data_X=self.Xs,
                           flag_conv='log',
                           nodes=self.nodes)

        # `theta` is the file named after the model's end_file (presumably written by fit() -- TODO confirm);
        # `thetaGT` is the reference file 'theta_test_GT.npz' in the same folder.
        theta = np.load(self.MTCOV.folder + 'theta' + self.MTCOV.end_file +
                        '.npz')
        thetaGT = np.load(self.MTCOV.folder + 'theta_test_GT.npz')

        # The parameters held by the model must equal the ones stored in the output file ...
        self.assertTrue(np.array_equal(self.MTCOV.u_f, theta['u']))
        self.assertTrue(np.array_equal(self.MTCOV.v_f, theta['v']))
        self.assertTrue(np.array_equal(self.MTCOV.w_f, theta['w']))
        self.assertTrue(np.array_equal(self.MTCOV.beta_f, theta['beta']))

        # ... and the output file must equal the reference file exactly.
        self.assertTrue(np.array_equal(thetaGT['u'], theta['u']))
        self.assertTrue(np.array_equal(thetaGT['v'], theta['v']))
        self.assertTrue(np.array_equal(thetaGT['w'], theta['w']))
        self.assertTrue(np.array_equal(thetaGT['beta'], theta['beta']))
Ejemplo n.º 6
0
def main_cv():
    """
    Command-line entry point: cross-validate MTCOV for one (C, gamma) pair.

    Steps:
      1. parse the command-line options and load `setting_MTCOV.yaml`;
      2. import the network and the covariates with `tl.import_data`;
      3. for every fold, zero the masked entries, train on the remainder with
         `cvfun.train_running_model` and evaluate on the masked / unmasked entries;
      4. if `--out_results` is true, append one CSV row per fold to
         `<out_folder><dataset>_results.csv` (the header is written when the file is new).

    Raises:
        ValueError: if `--batch_size` is larger than the number of nodes.
    """

    def _str2bool(value):
        """Parse a command-line boolean (`type=bool` would turn 'False' into True)."""
        text = str(value).strip().lower()
        if text in ('true', 't', 'yes', 'y', '1'):
            return True
        if text in ('false', 'f', 'no', 'n', '0', ''):
            return False
        # argparse converts a ValueError raised here into a regular usage error.
        raise ValueError(f'Expected a boolean value, got {value!r}')

    p = ArgumentParser()
    p.add_argument('-f', '--in_folder', type=str, default='../data/input/')  # path of the input network
    p.add_argument('-j', '--adj_name', type=str, default='adj_cv.csv')  # name of the adjacency tensor
    p.add_argument('-c', '--cov_name', type=str, default='X_cv.csv')  # name of the design matrix
    p.add_argument('-o', '--ego', type=str, default='source')  # name of the source of the edge
    p.add_argument('-r', '--alter', type=str, default='target')  # name of the target of the edge
    p.add_argument('-x', '--egoX', type=str, default='Name')  # name of the column with node labels
    p.add_argument('-a', '--attr_name', type=str, default='Metadata')  # name of the attribute to consider
    p.add_argument('-C', '--C', type=int, default=2)  # number of communities
    p.add_argument('-g', '--gamma', type=float, default=0.5)  # scaling hyper parameter
    p.add_argument('-u', '--undirected', type=_str2bool, default=True)  # flag to call the undirected network
    p.add_argument('-F', '--flag_conv', type=str, choices=['log', 'deltas'], default='log')  # flag for convergence
    # p.add_argument('-d', '--force_dense', type=bool, default=False)  # flag to force a dense transformation in input
    p.add_argument('-b', '--batch_size', type=int, default=None)  # size of the batch to use to compute the likelihood
    p.add_argument('-v', '--cv_type', type=str, choices=['kfold', 'random'], default='kfold')  # type of CV to use
    p.add_argument('-NF', '--NFold', type=int, default=5)  # number of fold to perform cross-validation
    p.add_argument('-T', '--out_mask', type=int, default=False)  # flag to output the masks
    p.add_argument('-or', '--out_results', type=_str2bool, default=True)  # flag to output the results in a csv file
    args = p.parse_args()

    # setting to run the algorithm
    with open('setting_MTCOV.yaml') as f:
        conf = yaml.load(f, Loader=yaml.FullLoader)
    conf['out_folder'] = '../data/output/5-fold_cv/'
    if not os.path.exists(conf['out_folder']):
        os.makedirs(conf['out_folder'])

    # the dense representation is always used here (the CLI option above is disabled)
    force_dense = True

    # Cross validation parameters
    cv_type = args.cv_type
    NFold = args.NFold
    prng = np.random.RandomState(seed=conf['rseed'])  # set seed random number generator
    rseed = prng.randint(1000)
    out_mask = args.out_mask
    out_results = args.out_results

    # Model parameters
    C = args.C
    gamma = args.gamma

    # Set up directories
    in_folder = args.in_folder
    out_folder = conf['out_folder']

    dataset = args.adj_name.split('.')[0]

    # Import data
    A, B, X, nodes = tl.import_data(in_folder, adj_name=args.adj_name, cov_name=args.cov_name, ego=args.ego,
                                    alter=args.alter, egoX=args.egoX, attr_name=args.attr_name,
                                    undirected=args.undirected, force_dense=force_dense,
                                    noselfloop=True, verbose=True)

    Xs = np.array(X)
    valid_types = [np.ndarray, skt.dtensor, skt.sptensor]
    assert any(isinstance(B, vt) for vt in valid_types)

    if args.batch_size and args.batch_size > len(nodes):
        raise ValueError('The batch size has to be smaller than the number of nodes.')
    if len(nodes) > 1000:
        # for networks with more than 1000 nodes the 'deltas' convergence flag is forced
        args.flag_conv = 'deltas'

    print('\n### CV procedure ###')
    # one slot per column of the results file (see the header row below)
    comparison = [0 for _ in range(10)]
    comparison[0], comparison[1] = C, gamma

    # save the results
    outfile = None
    wrtr = None
    if out_results:
        out_file = out_folder+dataset+'_results.csv'
        if not os.path.isfile(out_file):  # write header
            with open(out_file, 'w') as header_file:
                header_wrtr = csv.writer(header_file, delimiter=',', quotechar='"')
                header_wrtr.writerow(['C', 'gamma', 'fold', 'rseed', 'logL', 'acc_train', 'auc_train', 'logL_test',
                                      'acc_test', 'auc_test'])
        outfile = open(out_file, 'a')
        wrtr = csv.writer(outfile, delimiter=',', quotechar='"')
        print(f'Results will be saved in: {out_file}')

    time_start = time.time()

    # everything below runs under try/finally so the results file is closed even if a fold fails
    try:
        L = B.shape[0]
        N = B.shape[1]
        assert N == X.shape[0]

        # Extract masks
        if cv_type == 'kfold':
            idxG = cvfun.shuffle_indicesG(N, L, rseed=rseed)
            idxX = cvfun.shuffle_indicesX(N, rseed=rseed)
        else:
            idxG = None
            idxX = None

        print('\nC =', C, 'gamma =', gamma, '\n')

        for fold in range(NFold):
            print('FOLD ', fold)

            # a new seed is derived for every fold
            rseed += prng.randint(500)
            comparison[2], comparison[3] = fold, rseed

            maskG, maskX = cvfun.extract_masks(N, L, idxG=idxG, idxX=idxX, cv_type=cv_type, NFold=NFold,
                                               fold=fold, rseed=rseed, out_mask=out_mask)

            # Set up training dataset: masked entries are set to zero
            B_train = B.copy()
            B_train[maskG > 0] = 0

            X_train = Xs.copy()
            X_train[maskX > 0] = 0

            conf['end_file'] = 'GT'+str(fold)+'C'+str(C)+'g'+str(gamma)+'_'+dataset

            # Run MTCOV on the training
            tic = time.time()
            U, V, W, BETA, comparison[4] = cvfun.train_running_model(B_cv=B_train, X_cv=X_train,
                                                                     flag_conv=args.flag_conv,
                                                                     C=C, Z=X.shape[1], gamma=gamma,
                                                                     undirected=args.undirected,
                                                                     nodes=nodes, batch_size=args.batch_size,
                                                                     **conf)

            # Output performance results
            if gamma != 0:
                comparison[5] = cvfun.covariates_accuracy(X, U, V, BETA, mask=np.logical_not(maskX))
                comparison[8] = cvfun.covariates_accuracy(X, U, V, BETA, mask=maskX)
            if gamma != 1:
                comparison[6] = cvfun.calculate_AUC(B, U, V, W, mask=np.logical_not(maskG))
                comparison[9] = cvfun.calculate_AUC(B, U, V, W, mask=maskG)

            comparison[7] = cvfun.loglikelihood(B, X, U, V, W, BETA, gamma, maskG=maskG, maskX=maskX)

            print("Time elapsed:", np.round(time.time() - tic, 2), " seconds.")

            if wrtr is not None:
                wrtr.writerow(comparison)
                outfile.flush()
    finally:
        if outfile is not None:
            outfile.close()

    print("\nTime elapsed:", np.round(time.time() - time_start, 2), " seconds.")
Ejemplo n.º 7
0
def sheets(username):
    """
    Run `import_data` for the user called `username`, then redirect to the spreadsheet.

    The user is the first `models.User` row whose username matches; it is handed to
    `import_data` as returned by the query.
    """
    spreadsheet_url = 'https://docs.google.com/spreadsheets/d/1M_DSqPghGoCGUzn8dciDoXeHW0lAHoDk0i90G913A5U/edit#gid=0'
    user = models.User.query.filter_by(username=username).first()
    import_data(user)
    return redirect(spreadsheet_url)
Ejemplo n.º 8
0
def main():
    """
    Command-line entry point: k-fold cross-validation of the CRep algorithm.

    Steps:
      1. parse the command-line options and load `setting_<algorithm>.yaml`;
      2. import the network with `tl.import_data`;
      3. for every fold, zero the masked entries, fit the model with `cvfun.fit_model`
         and compute the AUC / objective values on the masked and unmasked entries;
      4. if `--out_results` is true, append one CSV row per fold to
         `<out_folder><adjacency>_cv.csv` (the header is written when the file is new).
    """

    def _str2bool(value):
        """Parse a command-line boolean (`type=bool` would turn 'False' into True)."""
        text = str(value).strip().lower()
        if text in ('true', 't', 'yes', 'y', '1'):
            return True
        if text in ('false', 'f', 'no', 'n', '0', ''):
            return False
        # argparse converts a ValueError raised here into a regular usage error.
        raise ValueError(f'Expected a boolean value, got {value!r}')

    p = ArgumentParser()
    # 'CRep' (the default) is listed in `choices` so that it can also be passed explicitly;
    # the previously accepted spellings are kept.
    p.add_argument('-a',
                   '--algorithm',
                   type=str,
                   choices=['CRep', 'Crep', 'Crepnc', 'Crep0'],
                   default='CRep')  # configuration
    p.add_argument('-K', '--K', type=int, default=3)  # number of communities
    p.add_argument('-A', '--adj', type=str,
                   default='syn111.dat')  # name of the network
    p.add_argument('-f', '--in_folder', type=str,
                   default='../data/input/')  # path of the input network
    p.add_argument(
        '-o', '--out_folder', type=str,
        default='../data/output/5-fold_cv/')  # path to store outputs
    p.add_argument('-e', '--ego', type=str,
                   default='source')  # name of the source of the edge
    p.add_argument('-t', '--alter', type=str,
                   default='target')  # name of the target of the edge
    # p.add_argument('-d', '--force_dense', type=bool, default=True)  # flag to force a dense transformation in input
    p.add_argument('-F',
                   '--flag_conv',
                   type=str,
                   choices=['log', 'deltas'],
                   default='log')  # flag for convergence
    p.add_argument('-N', '--NFold', type=int,
                   default=5)  # number of fold to perform cross-validation
    p.add_argument('-m', '--out_mask', type=_str2bool,
                   default=False)  # flag to output the masks
    p.add_argument('-r', '--out_results', type=_str2bool,
                   default=True)  # flag to output the results in a csv file
    p.add_argument('-i', '--out_inference', type=_str2bool,
                   default=True)  # flag to output the inferred parameters
    args = p.parse_args()

    prng = np.random.RandomState(seed=17)  # set seed random number generator

    # Cross validation parameters and set up output directory
    NFold = args.NFold
    out_mask = args.out_mask
    out_results = args.out_results

    out_folder = args.out_folder
    if not os.path.exists(out_folder):
        os.makedirs(out_folder)

    # Model parameters
    K = args.K
    network = args.in_folder + args.adj  # network complete path
    algorithm = args.algorithm  # algorithm to use to generate the samples
    adjacency = args.adj.split('.dat')[0]  # name of the network without extension
    with open('setting_' + algorithm + '.yaml') as f:
        conf = yaml.load(f, Loader=yaml.FullLoader)
    conf['out_folder'] = out_folder
    conf['out_inference'] = args.out_inference

    # Import data
    A, B, B_T, data_T_vals = tl.import_data(network,
                                            ego=args.ego,
                                            alter=args.alter,
                                            force_dense=True,
                                            header=0)
    nodes = A[0].nodes()
    valid_types = [np.ndarray, skt.dtensor, skt.sptensor]
    assert any(isinstance(B, vt) for vt in valid_types)

    print('\n### CV procedure ###')
    # one slot per column of the results file (see the header row below)
    comparison = [0 for _ in range(11)]
    comparison[0] = K

    # save the results
    outfile = None
    wrtr = None
    if out_results:
        out_file = out_folder + adjacency + '_cv.csv'
        if not os.path.isfile(out_file):  # write header
            with open(out_file, 'w') as header_file:
                header_wrtr = csv.writer(header_file, delimiter=',', quotechar='"')
                header_wrtr.writerow([
                    'K', 'fold', 'rseed', 'eta', 'auc_train', 'auc_test',
                    'auc_cond_train', 'auc_cond_test', 'opt_func_train',
                    'opt_func_test', 'max_it'
                ])
        outfile = open(out_file, 'a')
        wrtr = csv.writer(outfile, delimiter=',', quotechar='"')
        print(f'Results will be saved in: {out_file}')

    time_start = time.time()

    # everything below runs under try/finally so the results file is closed even if a fold fails
    try:
        L = B.shape[0]
        N = B.shape[-1]

        rseed = prng.randint(1000)
        indices = cvfun.shuffle_indices_all_matrix(N, L, rseed=rseed)
        # the configured suffix is kept so that every fold can append its own tag to it
        init_end_file = conf['end_file']

        for fold in range(NFold):
            print('\nFOLD ', fold)
            comparison[1], comparison[2] = fold, rseed

            mask = cvfun.extract_mask_kfold(indices, N, fold=fold, NFold=NFold)
            if out_mask:
                outmask = out_folder + 'mask_f' + str(fold) + '_' + adjacency + '.pkl'
                print(f'Mask saved in: {outmask}')
                with open(outmask, 'wb') as f:
                    pickle.dump(np.where(mask > 0), f)

            # Set up training dataset: masked entries are set to zero
            B_train = B.copy()
            B_train[mask > 0] = 0

            # Run CRep on the training
            tic = time.time()
            conf['end_file'] = init_end_file + '_' + str(fold) + 'K' + str(K)
            u, v, w, eta, maxPSL, algo_obj = cvfun.fit_model(
                B_train,
                B_T,
                data_T_vals,
                nodes=nodes,
                N=N,
                L=L,
                K=K,
                algo=algorithm,
                flag_conv=args.flag_conv,
                **conf)

            # Output performance results
            comparison[3] = eta
            M = cvfun.calculate_expectation(u, v, w, eta=eta)
            comparison[4] = cvfun.calculate_AUC(M, B, mask=np.logical_not(mask))
            comparison[5] = cvfun.calculate_AUC(M, B, mask=mask)
            M_cond = cvfun.calculate_conditional_expectation(B, u, v, w, eta=eta)
            comparison[6] = cvfun.calculate_AUC(M_cond,
                                                B,
                                                mask=np.logical_not(mask))
            comparison[7] = cvfun.calculate_AUC(M_cond, B, mask=mask)
            comparison[9] = cvfun.calculate_opt_func(
                B, algo_obj, mask=mask, assortative=conf['assortative'])
            comparison[8] = maxPSL
            comparison[10] = algo_obj.final_it

            print(f'Time elapsed: {np.round(time.time() - tic, 2)} seconds.')

            if wrtr is not None:
                wrtr.writerow(comparison)
                outfile.flush()
    finally:
        if outfile is not None:
            outfile.close()

    print(f'\nTime elapsed: {np.round(time.time() - time_start, 2)} seconds.')
Ejemplo n.º 9
0
class Test(unittest.TestCase):
    """
    Regression test for the cross-validation routine of MTCOV on `adj_cv.csv` / `X_cv.csv`.

    The data import is executed in the class body (i.e. once, when the class is defined);
    the test method reads the resulting class attributes through `self`.
    """
    # NOTE: the class attributes `N` and `L` are not read by the test, which derives both
    # values from the shape of the imported data.
    N = 100
    L = 2
    C = 2
    gamma = 0.5
    in_folder = '../data/input/'
    out_folder = '../data/output/test/'
    end_file = '_test'
    adj_name = 'adj_cv.csv'
    cov_name = 'X_cv.csv'
    ego = 'source'
    alter = 'target'
    egoX = 'Name'
    attr_name = 'Metadata'
    rseed = 107261
    N_real = 1
    undirected = False
    force_dense = True
    flag_conv = 'log'
    err = 0.1
    tolerance = 0.0001
    decision = 10
    maxit = 500
    assortative = False
    inf = 1e10
    err_max = 0.0000001
    cv_type = 'kfold'
    NFold = 5
    '''
    Import data
    '''
    A, B, X, nodes = tl.import_data(in_folder,
                                    adj_name=adj_name,
                                    cov_name=cov_name,
                                    ego=ego,
                                    alter=alter,
                                    egoX=egoX,
                                    attr_name=attr_name)
    Xs = np.array(X)

    # For every fold: mask the data, train the model, save the inferred parameters and
    # compare them with the saved copy and with the reference file of that fold.
    def test_running_algorithm(self):
        print("\nStart running algorithm test\n")

        L = self.B.shape[0]
        N = self.B.shape[1]
        assert N == self.X.shape[0]

        # The shuffled indices are only computed for the 'kfold' scheme.
        if self.cv_type == 'kfold':
            idxG = cvfun.shuffle_indicesG(N, L, rseed=self.rseed)
            idxX = cvfun.shuffle_indicesX(N, rseed=self.rseed)
        else:
            idxG = None
            idxX = None

        for fold in range(self.NFold):

            # The seed passed to the mask extraction changes with the fold.
            ind = self.rseed + fold
            maskG, maskX = cvfun.extract_masks(self.A[0].number_of_nodes(),
                                               len(self.B),
                                               idxG=idxG,
                                               idxX=idxX,
                                               cv_type=self.cv_type,
                                               NFold=self.NFold,
                                               fold=fold,
                                               rseed=ind)
            '''
            Set up training dataset    
            '''
            # Masked entries are set to zero in copies of the data.
            B_train = self.B.copy()
            print(B_train.shape, maskG.shape)
            B_train[maskG > 0] = 0

            X_train = self.Xs.copy()
            X_train[maskX > 0] = 0

            # NOTE: the training seed is `self.rseed` for every fold (not `ind`).
            U, V, W, BETA, logL = cvfun.train_running_model(
                B_train,
                X_train,
                self.flag_conv,
                N=self.A[0].number_of_nodes(),
                L=len(self.B),
                C=self.C,
                Z=self.X.shape[1],
                gamma=self.gamma,
                undirected=self.undirected,
                cv=True,
                rseed=self.rseed,
                inf=self.inf,
                err_max=self.err_max,
                err=self.err,
                N_real=self.N_real,
                tolerance=self.tolerance,
                decision=self.decision,
                maxit=self.maxit,
                folder=self.out_folder,
                end_file=self.end_file,
                assortative=self.assortative)
            '''
            Output parameters
            '''
            outinference = self.out_folder + 'theta_cv' + str(
                fold) + 'C' + str(self.C) + 'g' + str(self.gamma)
            np.savez_compressed(outinference + '.npz',
                                u=U,
                                v=V,
                                w=W,
                                beta=BETA,
                                fold=fold)
            # To load: theta = np.load('test.npz'), e.g. print(np.array_equal(U, theta['u']))
            '''
            Load parameters
            '''
            theta = np.load(outinference + '.npz')
            # NOTE: the reference file name hard-codes 'C2g0.5_adj_cv' instead of being
            # built from `self.C`, `self.gamma` and `self.adj_name`.
            thetaGT = np.load(self.out_folder + 'thetaGT' + str(fold) +
                              'C2g0.5_adj_cv.npz')

            # The parameters just saved must be read back unchanged ...
            self.assertTrue(np.array_equal(U, theta['u']))
            self.assertTrue(np.array_equal(V, theta['v']))
            self.assertTrue(np.array_equal(W, theta['w']))
            self.assertTrue(np.array_equal(BETA, theta['beta']))

            # ... and must equal the reference file exactly.
            self.assertTrue(np.array_equal(thetaGT['u'], theta['u']))
            self.assertTrue(np.array_equal(thetaGT['v'], theta['v']))
            self.assertTrue(np.array_equal(thetaGT['w'], theta['w']))
            self.assertTrue(np.array_equal(thetaGT['beta'], theta['beta']))
Ejemplo n.º 10
0
def import_data():
    # NOTE(review): this function calls itself unconditionally and has no base case, so
    # invoking it raises RecursionError unless the global name `import_data` is rebound
    # elsewhere. Presumably the call was meant to reach a different `import_data` that this
    # definition shadows -- TODO confirm the intended target.
    import_data()
Ejemplo n.º 11
0
class Test(unittest.TestCase):
    """
    Regression test for the cross-validation routine of MTCOV, configured through
    `setting_MTCOV.yaml`.

    The configuration load, the seed generation and the data import are executed in the
    class body (i.e. once, when the class is defined); the test method reads the resulting
    class attributes through `self`.
    """
    C = 2
    gamma = 0.5
    in_folder = '../data/input/'
    adj_name = 'adj_cv.csv'
    cov_name = 'X_cv.csv'
    ego = 'source'
    alter = 'target'
    egoX = 'Name'
    attr_name = 'Metadata'
    undirected = True
    flag_conv = 'log'
    batch_size = None
    cv_type = 'kfold'
    NFold = 5
    out_mask = False

    with open('setting_MTCOV.yaml') as f:
        conf = yaml.load(f, Loader=yaml.FullLoader)
    # The initial seed is drawn from a generator seeded with the configured 'rseed'.
    prng = np.random.RandomState(seed=conf['rseed'])
    rseed = prng.randint(1000)

    # Name of the adjacency file up to the first '.'.
    dataset = adj_name.split('.')[0]

    '''
    Import data
    '''
    A, B, X, nodes = tl.import_data(in_folder, adj_name=adj_name, cov_name=cov_name, ego=ego,
                                    alter=alter, egoX=egoX, attr_name=attr_name,
                                    undirected=undirected, force_dense=True)
    Xs = np.array(X)

    # For every fold: mask the data, train the model and compare the returned parameters
    # with the output file of that fold and with the stored reference file.
    def test_running_algorithm(self):
        print("\nStart running algorithm test\n")

        L = self.B.shape[0]
        N = self.B.shape[1]
        assert N == self.X.shape[0]

        '''
        Extract masks
        '''
        # The shuffled indices are only computed for the 'kfold' scheme.
        if self.cv_type == 'kfold':
            idxG = cvfun.shuffle_indicesG(N, L, rseed=self.rseed)
            idxX = cvfun.shuffle_indicesX(N, rseed=self.rseed)
        else:
            idxG = None
            idxX = None

        for fold in range(self.NFold):

            # The augmented assignment stores the new seed on the instance; the class
            # attribute `rseed` itself is left unchanged.
            self.rseed += self.prng.randint(500)
            maskG, maskX = cvfun.extract_masks(N, L, idxG=idxG, idxX=idxX, cv_type=self.cv_type, NFold=self.NFold,
                                               fold=fold, rseed=self.rseed, out_mask=self.out_mask)

            '''
            Set up training dataset    
            '''
            # Masked entries are set to zero in copies of the data.
            B_train = self.B.copy()
            print(B_train.shape, maskG.shape)
            B_train[maskG > 0] = 0

            X_train = self.Xs.copy()
            X_train[maskX > 0] = 0

            # This writes into the dictionary shared at class level.
            self.conf['end_file'] = 'CV' + str(fold) + 'C' + str(self.C) + 'g' + str(self.gamma) + '_' + self.dataset

            U, V, W, BETA, logL = cvfun.train_running_model(B_cv=B_train, X_cv=X_train, flag_conv=self.flag_conv,
                                                            C=self.C, Z=self.X.shape[1], gamma=self.gamma,
                                                            undirected=self.undirected,
                                                            nodes=self.nodes, batch_size=self.batch_size, **self.conf)

            '''
            Load parameters
            '''
            # `theta` is the file 'theta' + end_file in the configured output folder (presumably
            # written during training -- TODO confirm); `thetaGT` is the stored reference
            # file of the same fold.
            outinference = self.conf['out_folder'] + 'thetaCV' + str(fold) + 'C' + str(self.C) + \
                           'g' + str(self.gamma) + '_' + self.dataset
            theta = np.load(outinference+'.npz')
            thetaGT = np.load('../data/output/5-fold_cv/thetaGT'+str(fold)+'C' + str(self.C) + \
                           'g' + str(self.gamma) + '_' + self.dataset + '.npz')

            # The returned parameters must equal the ones stored in the output file ...
            self.assertTrue(np.array_equal(U,theta['u']))
            self.assertTrue(np.array_equal(V,theta['v']))
            self.assertTrue(np.array_equal(W,theta['w']))
            self.assertTrue(np.array_equal(BETA,theta['beta']))

            # ... and the output file must equal the reference file exactly.
            self.assertTrue(np.array_equal(thetaGT['u'],theta['u']))
            self.assertTrue(np.array_equal(thetaGT['v'],theta['v']))
            self.assertTrue(np.array_equal(thetaGT['w'],theta['w']))
            self.assertTrue(np.array_equal(thetaGT['beta'],theta['beta']))
Ejemplo n.º 12
0
def main_cv():

    inf = 1e10
    err_max = 0.0000001
    p = ArgumentParser()
    p.add_argument('-j', '--adj_name', type=str, default='adj_cv.csv')
    p.add_argument('-c', '--cov_name', type=str, default='X_cv.csv')
    p.add_argument('-o', '--ego', type=str, default='source')
    p.add_argument('-r', '--alter', type=str, default='target')
    p.add_argument('-x', '--egoX', type=str, default='Name')
    p.add_argument('-a', '--attr_name', type=str, default='Metadata')
    p.add_argument('-C', '--C', type=int, default=2)
    p.add_argument('-g', '--gamma', type=float, default=0.5)
    p.add_argument('-u', '--undirected', type=bool, default=False)
    p.add_argument('-d', '--force_dense', type=bool, default=True)
    p.add_argument('-F',
                   '--flag_conv',
                   type=str,
                   choices=['log', 'deltas'],
                   default='log')
    p.add_argument('-z', '--rseed', type=int, default=107261)
    p.add_argument('-e', '--err', type=float, default=0.1)
    p.add_argument('-i', '--N_real', type=int, default=1)
    p.add_argument('-t', '--tolerance', type=float, default=0.0001)
    p.add_argument('-y', '--decision', type=int, default=10)
    p.add_argument('-m', '--maxit', type=int, default=500)
    p.add_argument('-E', '--end_file', type=str, default='_results.csv')
    p.add_argument('-I', '--in_folder', type=str, default='../data/input/')
    p.add_argument('-O',
                   '--out_folder',
                   type=str,
                   default='../data/output/5-fold_cv/')
    p.add_argument('-A', '--assortative', type=bool, default=False)
    p.add_argument('-v',
                   '--cv_type',
                   type=str,
                   choices=['kfold', 'random'],
                   default='kfold')
    p.add_argument('-NF', '--NFold', type=int, default=5)
    p.add_argument('-T', '--out_mask', type=int, default=False)
    p.add_argument('-W', '--out_inference', type=int, default=False)
    args = p.parse_args()
    '''
    Cross validation parameters
    '''
    cv_type = args.cv_type
    NFold = args.NFold
    rseed = args.rseed
    out_mask = args.out_mask
    out_inference = args.out_inference
    end_file = args.end_file
    '''
    Model parameters
    '''
    C = args.C
    gamma = args.gamma

    dataset = args.adj_name.split('.')[0]
    '''
    Set up output directory
    '''
    in_folder = args.in_folder
    out_folder = args.out_folder
    if not os.path.exists(out_folder):
        os.makedirs(out_folder)
    '''
    Import data
    '''
    A, B, X, nodes = tl.import_data(in_folder,
                                    adj_name=args.adj_name,
                                    cov_name=args.cov_name,
                                    ego=args.ego,
                                    alter=args.alter,
                                    egoX=args.egoX,
                                    attr_name=args.attr_name,
                                    undirected=args.undirected,
                                    force_dense=args.force_dense)

    Xs = np.array(X)
    valid_types = [np.ndarray, skt.dtensor, skt.sptensor]
    assert any(isinstance(B, vt) for vt in valid_types)

    print('\n### CV procedure ###')
    comparison = [0 for _ in range(10)]
    comparison[0], comparison[1] = C, gamma

    out_file = out_folder + dataset + end_file
    if not os.path.isfile(out_file):  # write header
        with open(out_file, 'w') as outfile:
            wrtr = csv.writer(outfile, delimiter=',', quotechar='"')
            wrtr.writerow([
                'C', 'gamma', 'fold', 'rseed', 'logL', 'acc_train',
                'auc_train', 'logL_test', 'acc_test', 'auc_test'
            ])

    time_start = time.time()

    L = B.shape[0]
    N = B.shape[1]
    assert N == X.shape[0]

    if cv_type == 'kfold':
        idxG = cvfun.shuffle_indicesG(N, L, rseed=rseed)
        idxX = cvfun.shuffle_indicesX(N, rseed=rseed)
    else:
        idxG = None
        idxX = None

    with open(out_file, 'a') as outfile:
        wrtr = csv.writer(outfile, delimiter=',', quotechar='"')
        print('Results will be saved in:', out_file)
        print('\nC =', C, 'gamma =', gamma, '\n')

        for fold in range(NFold):
            print('FOLD ', fold)

            ind = rseed + fold  # set the random seed
            comparison[2], comparison[3] = fold, ind

            maskG, maskX = cvfun.extract_masks(N,
                                               L,
                                               idxG=idxG,
                                               idxX=idxX,
                                               cv_type=cv_type,
                                               NFold=NFold,
                                               fold=fold,
                                               rseed=ind,
                                               out_mask=out_mask)
            '''
            Set up training dataset    
            '''
            B_train = B.copy()
            B_train[maskG > 0] = 0

            X_train = Xs.copy()
            X_train[maskX > 0] = 0
            '''
            Run MTCOV on the training 
            '''
            tic = time.time()

            U, V, W, BETA, comparison[4] = cvfun.train_running_model(
                B_train,
                X_train,
                args.flag_conv,
                N=A[0].number_of_nodes(),  # number of nodes
                L=len(B),  # number of layers
                C=args.C,  # number of communities
                Z=X.shape[1],  # number of modalities of the attribute
                gamma=args.gamma,  # scaling parameter gamma
                undirected=args.
                undirected,  # if True, the network is undirected
                cv=True,
                rseed=args.rseed,  # random seed for the initialization
                inf=inf,  # initial value for log-likelihood and parameters
                err_max=err_max,  # minimum value for the parameters
                err=args.err,  # error for the initialization of W
                N_real=args.
                N_real,  # number of iterations with different random initialization
                tolerance=args.
                tolerance,  # tolerance parameter for convergence
                decision=args.decision,  # convergence parameter
                maxit=args.maxit,  # maximum number of EM steps before aborting
                folder=out_folder,  # path for storing the output
                end_file='GT' + str(fold) + 'C' + str(args.C) + 'g' +
                str(args.gamma),  # output file suffix
                assortative=args.
                assortative  # if True, the network is assortative
            )
            '''
            Output parameters
            '''
            if out_inference:
                outinference = '../data/output/test/thetaGT' + str(
                    fold) + 'C' + str(C) + 'g' + str(gamma) + '_' + dataset
                np.savez_compressed(outinference + '.npz',
                                    u=U,
                                    v=V,
                                    w=W,
                                    beta=BETA)
                # To load: theta = np.load('test.npz'), e.g. print(np.array_equal(U, theta['u']))
                print('Parameters saved in: ', outinference + '.npz')
            '''
            Output performance results
            '''
            if gamma != 0:
                comparison[5] = cvfun.covariates_accuracy(
                    X, U, V, BETA, mask=np.logical_not(maskX))
                comparison[8] = cvfun.covariates_accuracy(X,
                                                          U,
                                                          V,
                                                          BETA,
                                                          mask=maskX)
            if gamma != 1:
                comparison[6] = cvfun.calculate_AUC(B,
                                                    U,
                                                    V,
                                                    W,
                                                    mask=np.logical_not(maskG))
                comparison[9] = cvfun.calculate_AUC(B, U, V, W, mask=maskG)

            comparison[7] = cvfun.loglikelihood(B,
                                                X,
                                                U,
                                                V,
                                                W,
                                                BETA,
                                                gamma,
                                                maskG=maskG,
                                                maskX=maskX)

            print("Time elapsed:", np.round(time.time() - tic, 2), " seconds.")

            wrtr.writerow(comparison)
            outfile.flush()

    print("\nTime elapsed:", np.round(time.time() - time_start, 2),
          " seconds.")
Ejemplo n.º 13
0
class Test(unittest.TestCase):
    """
    Regression tests for MTCOV.

    The data set is imported and the model is built once, at class-definition
    time; the test methods then check the imported tensor and compare a fresh
    fit against parameters stored on disk.
    """
    # model hyper-parameters
    C = 2
    gamma = 0.5

    # input / output locations and file naming
    in_folder = '../data/input/'
    out_folder = '../data/output/test/'
    end_file = '_test'
    adj_name = 'adj.csv'
    cov_name = 'X.csv'

    # column names handed to the importer
    ego = 'source'
    alter = 'target'
    egoX = 'Name'
    attr_name = 'Metadata'

    # run-time switches
    undirected = False
    flag_conv = 'log'
    force_dense = False
    batch_size = None

    # Algorithm settings are read from the working directory.
    with open('setting_MTCOV.yaml') as f:
        conf = yaml.load(f, Loader=yaml.FullLoader)

    # Import data
    A, B, X, nodes = tl.import_data(
        in_folder,
        adj_name=adj_name,
        cov_name=cov_name,
        ego=ego,
        alter=alter,
        egoX=egoX,
        attr_name=attr_name,
        undirected=undirected,
        force_dense=force_dense,
    )
    Xs = np.array(X)

    # Model under test; every entry of the settings file is forwarded as a
    # keyword argument.
    MTCOV = mtcov.MTCOV(
        N=A[0].number_of_nodes(),
        L=len(A),
        C=C,
        Z=X.shape[1],
        gamma=gamma,
        undirected=undirected,
        **conf,
    )

    def test_import_data(self):
        """Check that the imported tensor B carries a strictly positive total weight."""
        print("Start import data test\n")
        # With force_dense the sum is taken on B itself, otherwise on B.vals.
        total_weight = self.B.sum() if self.force_dense else self.B.vals.sum()
        self.assertTrue(total_weight > 0)
        print('B has ', total_weight, ' total weight.')

    def test_running_algorithm(self):
        """Fit the model and compare the parameters with the saved and reference files."""
        print("\nStart running algorithm test\n")

        model = self.MTCOV
        _ = model.fit(data=self.B,
                      data_X=self.Xs,
                      flag_conv=self.flag_conv,
                      nodes=self.nodes,
                      batch_size=self.batch_size)

        saved_path = model.out_folder + 'theta' + model.end_file + '.npz'
        theta = np.load(saved_path)
        thetaGT = np.load(model.out_folder + 'theta_test_GT.npz')

        param_keys = ('u', 'v', 'w', 'beta')

        # The in-memory results (<key>_f attributes) must equal what was
        # loaded from the saved file.
        for key in param_keys:
            in_memory = getattr(model, key + '_f')
            self.assertTrue(np.array_equal(in_memory, theta[key]))

        # The saved file must equal the stored reference parameters.
        for key in param_keys:
            self.assertTrue(np.array_equal(thetaGT[key], theta[key]))
Ejemplo n.º 14
0
# Accepted algorithm names, spelled like the 'CRep' default.
# NOTE(review): assumes setting_CRepnc.yaml and setting_CRep0.yaml follow the
# same capitalisation as setting_CRep.yaml - confirm against the repository.
_ALGORITHM_NAMES = ('CRep', 'CRepnc', 'CRep0')


def _canonical_algorithm(name):
    """Map an algorithm name to its canonical spelling, ignoring case.

    Unknown names are returned unchanged so that argparse's ``choices`` check
    reports them.
    """
    by_lower = {known.lower(): known for known in _ALGORITHM_NAMES}
    return by_lower.get(name.lower(), name)


def _parse_bool_flag(text):
    """Turn a command-line token such as 'True' or 'false' into a real bool.

    ``type=bool`` cannot be used for this: ``bool('False')`` is ``True``
    because any non-empty string is truthy.
    """
    token = str(text).strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    if token in ('false', 'f', 'no', 'n', '0', ''):
        return False
    from argparse import ArgumentTypeError
    raise ArgumentTypeError(f'expected a boolean value, got {text!r}')


def main():
    """Command-line entry point: load settings, import a network and fit CRep.

    Steps:
      1. parse the command-line options;
      2. read ``setting_<algorithm>.yaml`` from the working directory and
         write a copy of it into the configured output folder;
      3. import the network with ``tl.import_data``;
      4. build ``CREP.CRep`` and call ``fit``, printing the elapsed time.

    Raises:
        SystemExit: from argparse when an option value is invalid.
        FileNotFoundError: when the settings file does not exist.
    """
    p = ArgumentParser()
    # The name is normalised case-insensitively so that the legacy spellings
    # ('Crep', ...) and the default 'CRep' select the same settings file.
    p.add_argument('-a',
                   '--algorithm',
                   type=_canonical_algorithm,
                   choices=list(_ALGORITHM_NAMES),
                   default='CRep')  # configuration
    p.add_argument('-K', '--K', type=int, default=3)  # number of communities
    p.add_argument('-A', '--adj', type=str,
                   default='syn111.dat')  # name of the network
    p.add_argument('-f', '--in_folder', type=str,
                   default='../data/input/')  # path of the input network
    p.add_argument('-e', '--ego', type=str,
                   default='source')  # name of the source of the edge
    p.add_argument('-t', '--alter', type=str,
                   default='target')  # name of the target of the edge
    p.add_argument(
        '-d', '--force_dense', type=_parse_bool_flag,
        default=False)  # flag to force a dense transformation in input
    p.add_argument('-F',
                   '--flag_conv',
                   type=str,
                   choices=['log', 'deltas'],
                   default='log')  # flag for convergence
    args = p.parse_args()

    # setting to run the algorithm
    # NOTE: the settings file is assumed to be trusted; prefer yaml.safe_load
    # if it can ever come from an untrusted source.
    settings_name = 'setting_' + args.algorithm + '.yaml'
    with open(settings_name) as f:
        conf = yaml.load(f, Loader=yaml.FullLoader)
    os.makedirs(conf['out_folder'], exist_ok=True)
    # Keep a copy of the settings next to the results.
    with open(conf['out_folder'] + '/' + settings_name, 'w') as f:
        yaml.dump(conf, f)

    # Import data
    network = args.in_folder + args.adj  # network complete path
    A, B, B_T, data_T_vals = tl.import_data(network,
                                            ego=args.ego,
                                            alter=args.alter,
                                            force_dense=args.force_dense,
                                            header=0)
    nodes = A[0].nodes()

    # Sanity check on the container type returned by the importer.
    valid_types = (np.ndarray, skt.dtensor, skt.sptensor)
    assert isinstance(B, valid_types)

    # Run CRep
    print(f'\n### Run {args.algorithm} ###')

    time_start = time.time()
    model = CREP.CRep(N=A[0].number_of_nodes(), L=len(A), K=args.K, **conf)
    _ = model.fit(data=B,
                  data_T=B_T,
                  data_T_vals=data_T_vals,
                  flag_conv=args.flag_conv,
                  nodes=nodes)

    print(f'\nTime elapsed: {np.round(time.time() - time_start, 2)} seconds.')
Ejemplo n.º 15
0
import tools
import sys
import pandas as pd

# Script setup for generating the AMPL model/data files ('ampl/mock.mod' and
# 'ampl/mock.dat'): resolve input paths, load the data and cache the sizes
# used by the model.

# Input workbooks: first and second positional command-line arguments, with
# fallbacks to files in the working directory.
path = sys.argv[1] if len(sys.argv) > 1 else '2016.xlsx'
path2 = sys.argv[2] if len(sys.argv) > 2 else 'divisions.xlsx'
# Objective switch:
#   True:  find the minimum total number of sessions for all faculties
#   False: find the most even schedule for all
tALLfEVEN = False

# Initializations.
# Per the original author's notes (not verifiable from this section):
#   st = students DataFrame, columns ID, SID, Mj, Mn
#   ts = teachers DataFrame, columns ID, SID, DP, 1Y, 2Y, UB
# NOTE(review): import_data() is called without arguments, so `path` and
# `path2` are not forwarded to it - confirm which file it reads.
st, ts = tools.import_data()
# Output files are opened for writing here and are not closed within this
# section; presumably the code that follows writes to them and closes them.
modfile = open('ampl/mock.mod', 'w')
datfile = open('ampl/mock.dat', 'w')

dept_list = tools.get_depts(path)
div_dept, div_prof = tools.get_div(path, path2)

# Cached sizes and limits.
s_count = len(st) # number of students
t_count = len(ts) # number of teachers
d_count = 3 # number of days
i_count = 7 # number of sessions per day
depts_c = len(dept_list) # number of departments
maxpday = 4 # maximum number of sessions per day
# maxpall is only defined when the "minimum total" objective is selected.
if tALLfEVEN:
    maxpall = 12 # maximum number of sessions overall
Ejemplo n.º 16
0
class Test(unittest.TestCase):
    """
    Regression test for the 5-fold cross-validation of CRep.

    Settings and data are loaded once, at class-definition time. For every
    fold the model is fitted on a masked copy of the data and the resulting
    parameters are compared with the saved and the reference files.
    """
    algorithm = 'CRep'
    K = 3

    # input / output locations and file naming
    in_folder = '../data/input/'
    out_folder = '../data/output/5-fold_cv/'
    end_file = '_test'
    adj = 'syn111.dat'

    # column names handed to the importer
    ego = 'source'
    alter = 'target'
    # (force_dense is fixed to True directly in the import call below)

    flag_conv = 'log'
    NFold = 5
    out_mask = False
    out_results = True
    out_inference = True

    # Seeded generator, so the drawn random seed is the same on every run.
    prng = np.random.RandomState(seed=17)
    rseed = prng.randint(1000)

    # Setting to run the algorithm; the output folder is overridden so that
    # results land in the cross-validation folder.
    with open('setting_' + algorithm + '.yaml') as f:
        conf = yaml.load(f, Loader=yaml.FullLoader)
        conf['out_folder'] = out_folder

    # Import data
    network = in_folder + adj  # network complete path
    A, B, B_T, data_T_vals = tl.import_data(
        network,
        ego=ego,
        alter=alter,
        force_dense=True,
        header=0,
    )
    nodes = A[0].nodes()

    def test_running_algorithm(self):
        """Fit one model per fold and compare its parameters with the stored files."""
        print("\nStart running algorithm test\n")

        L, N = self.B.shape[0], self.B.shape[1]
        indices = cvfun.shuffle_indices_all_matrix(N, L, rseed=self.rseed)

        for fold in range(self.NFold):
            mask = cvfun.extract_mask_kfold(indices,
                                            N,
                                            fold=fold,
                                            NFold=self.NFold)

            # Set up training dataset: entries selected by the mask are zeroed.
            B_train = self.B.copy()
            print(B_train.shape, mask.shape)
            B_train[mask > 0] = 0

            # Tag shared by the output suffix and both parameter file names.
            fold_tag = f'{fold}K{self.K}'
            self.conf['end_file'] = '_' + fold_tag + self.end_file

            u, v, w, _eta, _max_psl, algo_obj = cvfun.fit_model(
                B_train,
                self.B_T,
                self.data_T_vals,
                nodes=self.nodes,
                N=N,
                L=L,
                K=self.K,
                algo=self.algorithm,
                flag_conv=self.flag_conv,
                **self.conf)

            # Load parameters: the file written by this run and the reference.
            saved_path = (self.out_folder + 'theta_' + fold_tag +
                          self.end_file + '.npz')
            reference_path = (self.out_folder + 'theta_' +
                              str(self.algorithm) + '_' + fold_tag + '.npz')
            theta = np.load(saved_path)
            thetaGT = np.load(reference_path)

            # Returned values must equal what was saved to disk.
            for key, returned in zip(('u', 'v', 'w'), (u, v, w)):
                self.assertTrue(np.array_equal(returned, theta[key]))
            self.assertTrue(np.array_equal(algo_obj.eta_f, theta['eta']))

            # Saved values must equal the reference parameters.
            for key in ('u', 'v', 'w', 'eta'):
                self.assertTrue(np.array_equal(thetaGT[key], theta[key]))