Example no. 1
def main():
    print("Staring main...")
    #define default parameters
    interactions = [(3, (0, 1)), (4, (1, 2)), (5, (2, 0))]
    thresh_mentions = 6
    neg = 2  #negative samples per positive
    rank = 5  #embedding dimensionality
    #n_train = 287044
    n_iter = 20
    # choose a dot product to use; can be one of:
    #     multilinear
    #     multilinear_square_product
    #     generalised_multilinear_dot_product
    dot_product = generalised_multilinear_dot_product
    minibatch_size = 1000
    learning_rate = 0.001
    l2 = 0.0
    eval_file_name = 'output.txt'

    # use input arguments to override the default parameters defined above
    for x in sys.argv:
        if len(x.split("__")) > 1:
            arg = x.split("__")[1]
        if "interactions" in x:
            interactions = arg
            print("Interactions not defined yet")
            #TODO
        elif "n_iter" in x:
            n_iter = int(x.split("__")[1])
        elif "thresh_mentions" in x:
            thresh_mentions = int(arg)
        elif "rank" in x:
            rank = int(arg)
        elif "n_train" in x:
            n_train = int(arg)
        elif "neg" in x:
            neg = int(arg)
        elif "minibatch" in x:
            minibatch_size = int(arg)
        elif "learning_rate" in x:
            learning_rate = float(arg)
        elif "L2" in x:
            l2 = float(arg)
        elif "eval_file_name" in x:
            eval_file_name = str(arg)

    print("Done parsing input arguments.")
    # load data & dictionaries
    train, String2Int, Int2String = dr.load_FB15K237_FB(
        'train', None, None, interactions, thresh_mentions)
    valid, String2Int, Int2String = dr.load_FB15K237_FB(
        'valid', String2Int, Int2String, interactions)
    test, String2Int, Int2String = dr.load_FB15K237_FB('test', String2Int,
                                                       Int2String,
                                                       interactions)

    # in case not all of the training data should be used
    #train = train[:n_train]
    n_train = len(train)
    values = [1.0 for x in range(n_train)]

    # sample some random negative validation tuples
    n_test = len(valid)
    valid_negative = np.random.randint(0, 1000,
                                       [n_test, 3 + len(interactions)])
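    # Each sampled negative tuple has 3 + len(interactions) columns, so it can be fed
    # through the same inputs_valid placeholder as the real validation tuples below;
    # indices are drawn uniformly from [0, 1000), a fixed choice of this script, so
    # only the first 1000 embedding entries ever appear in these corrupted tuples.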
    print("Done loading data.")

    # initialise embeddings, reserve one extra entry (the first) for unknown
    n_emb = len(Int2String.keys()) + 1
    emb0 = np.random.normal(size=(n_emb, rank)) * 0.1

    # initialise the norm scalers
    norm_scalers = np.random.normal(size=[3 + len(interactions)]) * 0.1

    # set other factorization inputs
    optim = tf.train.AdamOptimizer(learning_rate=learning_rate)

    scoring = lf.generalised_multilinear_dot_product_scorer

    # factorize the train tuples, obtain all model parameters
    # print("Starting factorisation...")
    params = lf.factorize_tuples((train, values),
                                 rank,
                                 minibatch_size=minibatch_size,
                                 emb0=emb0,
                                 n_iter=n_iter,
                                 negative_prop=neg,
                                 loss_type="logistic",
                                 tf_optim=optim,
                                 scoring=scoring,
                                 norm_scalers=norm_scalers)

    #define two placeholders to feed in train and valid data for evaluation
    inputs_train = tf.placeholder("int32", [n_train, len(train[0])])
    inputs_valid = tf.placeholder("int32", [len(valid), len(train[0])])

    # define prediction ops for train and valid set
    pred_train = dot_product(params, inputs_train)
    pred_valid = dot_product(params, inputs_valid)

    # define the data feeders
    feed1 = {inputs_train: train}
    feed2 = {inputs_valid: valid_negative}
    feed3 = {inputs_valid: valid}

    print("Start evaluation...")
    # obtain predictions for train, valid and negative validation set
    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        print("Generating predictions... training data")
        prediction_values = sigmoid(sess.run([pred_train], feed_dict=feed1)[0])
        print("Generating predictions... validation data (negative)")
        prediction_values2 = sigmoid(
            sess.run([pred_valid], feed_dict=feed2)[0])
        print("Generating predictions... validation data (positive)")
        prediction_values3 = sigmoid(
            sess.run([pred_valid], feed_dict=feed3)[0])

    # evaluation: avg prediction among the different tuple sets
    m1 = np.mean(prediction_values)
    m2 = np.mean(prediction_values2)
    m3 = np.mean(prediction_values3)
    print("Avg Pos Train Prediction", m1)
    print("Avg Random Neg Prediction", m2)
    print("Avg Pos Test Prediction", m3)

    # do batch ranking evaluation, compute MRR and HITS@10 on valid set.
    h = 20
    #h=len(valid)
    #params[0][0,:] = np.zeros([params[0].shape[1]])
    MRR, H10 = ranking_evaluation(valid[:h], prediction_values3[:h],
                                  interactions, 2, String2Int, Int2String,
                                  params, dot_product)
    print("MRR:", MRR)
    print("HITS@10:", H10)

    print("Writing results to file " + eval_file_name)
    with open(eval_file_name, 'w') as f:
        for x in sys.argv:
            f.write(x + "\n")
        f.write("---------\n")
        f.write("MRR: " + str(MRR) + "\n")
        f.write("HITS@10: " + str(H10) + "\n")
        f.write("Mean train score:" + str(m1) + "\n")
        f.write("Mean val score (neg):" + str(m2) + "\n")
        f.write("Mean val score (pos):" + str(m3) + "\n")
    print("EOF.")
Example no. 2
def test_tuples_factorization_rectangular_matrix(demo=False):
    """
    In this test, we compare the solution of the factorization given by an exact SVD with the solution given by
    the factorize_tuples function: with fully-observed matrix data and a quadratic loss, the two solutions should
    match exactly.
    :param demo: True for demo mode, where explanations are printed to the standard output. Otherwise a test is run.
    :return: Nothing
    """
    from naga.factorix.learn_factorization import factorize_tuples
    from scipy.sparse.linalg import svds
    # Create initial data
    n = 7  # number of rows
    m = 6  # number of columns
    rk0 = 4  # size of the embeddings
    rk = 4  # embedding size for the model
    noise = 1  # noise level
    oracle_init = False  # do we initialize at the exact solution?
    u0_mat = np.random.randn(n, rk0)
    v0_mat = np.random.randn(m, rk0)
    y_mat = np.random.randn(n, m) * noise + np.dot(u0_mat, v0_mat.transpose())

    # svd solution
    u1_mat, d1_vec, v1_matt = svds(y_mat, rk)
    v1_mat = v1_matt.transpose()
    d1_diag_matrix = np.zeros((rk, rk))
    for i in range(rk):
        d1_diag_matrix[i, i] = np.sqrt(d1_vec[i])
    x_mat_est1 = np.dot(np.dot(u1_mat, np.square(d1_diag_matrix)), v1_matt)
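    # d1_diag_matrix holds the square roots of the singular values, so np.square(...)
    # above restores diag(d1_vec) and x_mat_est1 is the rank-rk truncated-SVD
    # reconstruction u1_mat @ diag(d1_vec) @ v1_matt; the square-root form is reused
    # in the oracle_init branch below to split the spectrum between the row and
    # column embeddings.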

    if demo:
        print('We obtained a first exact solution by Singular Value Decomposition')
        print('The difference between the observation matrix and the estimated solution is:')
        print(np.linalg.norm(x_mat_est1-y_mat))
        print()

    # #### sgd solution ####

    # conversion to tuples
    indices = [[i, n + j] for i in range(n) for j in range(m)]
    values = [y_mat[i, j] for i in range(n) for j in range(m)]
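    # Each matrix cell (i, j) becomes a pair of embedding indices [i, n + j]: rows keep
    # their own indices and columns are shifted by n, so rows and columns share a single
    # embedding table of n + m rows (see emb0 below).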

    # initialization
    if oracle_init:
        emb0_u_re = np.dot(u1_mat[:, :2], d1_diag_matrix[:2, :2])
        emb0_u_im = np.dot(u1_mat[:, 2:], d1_diag_matrix[2:, 2:])
        emb0_v_re = np.dot(v1_mat[:, :2], d1_diag_matrix[:2, :2])
        emb0_v_im = np.dot(v1_mat[:, 2:], d1_diag_matrix[2:, 2:])
        emb0_u = 1.0 * (np.concatenate([emb0_u_im, -emb0_u_re], axis=1) + np.concatenate([emb0_u_re, emb0_u_im], axis=1))
        emb0_v = np.concatenate([emb0_v_re, emb0_v_im], axis=1)
        emb0 = np.concatenate([emb0_u, emb0_v], axis=0)
    else:  # random initialization
        emb0 = np.random.normal(size=(n + m, rk)) * 0.1
    # x_mat_init = hermitian_dot(emb0[:n], emb0[n:])
    x_mat_init = np.dot(emb0[:n], emb0[n:].T)

    # choose an optimizer (Adam seems to be the most reliable)
    # optim = tf.train.GradientDescentOptimizer(learning_rate=1.)
    optim = tf.train.AdamOptimizer(learning_rate=0.1)
    # optim = tf.train.RMSPropOptimizer(learning_rate=1., decay=0.1)
    # optim = tf.train.AdagradOptimizer(learning_rate=.1)
    # optim = tf.train.FtrlOptimizer(1.0, -0.5, l2_regularization_strength=0.0, initial_accumulator_value=1e-8)

    # choose a scoring function (for rectangular matrices, the standard or the complex dot products work the same)
    scoring = lambda inputs: hermitian_tuple_scorer(inputs, rank=rk, n_emb=n + m, emb0=emb0, symmetry_coef=(1.0, 1.0),
                                                    learn_symmetry_coef=True)
    # scoring = None

    # optimization (we specify optional parameters, but the default values would work as well)
    u2, coefs = factorize_tuples((indices, values), rk, emb0=emb0, n_iter=300, tf_optim=optim, scoring=scoring)
    # recover the matrix based on the hermitian dot product of the embeddings
    x_mat_est2_cplx = hermitian_dot(u2[:n, :], u2[n:, :].T)  # clpx2real(hermitian_dot(u2[:n], u2[n:]))
    x_mat_est2 = x_mat_est2_cplx[0] * coefs[0] + x_mat_est2_cplx[1] * coefs[1]
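    # hermitian_dot returns two components (presumably the real and imaginary parts of
    # the complex dot product); they are recombined here with the symmetry coefficients
    # learned by factorize_tuples (learn_symmetry_coef=True above).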
    if demo:
        print(x_mat_est2.shape, x_mat_init.shape)
        print('We computed an estimator by minimizing the square loss on the tuples extracted from the matrix')
        print('The difference between the initial solution and the estimated solution is:')
        print(np.linalg.norm(x_mat_est2-x_mat_init))
        print('The difference between the observations and the estimated solution is:')
        print(np.linalg.norm(y_mat-x_mat_est2))
        print('The difference between the exact solution and the estimated solution is:')
        print(np.linalg.norm(x_mat_est1-x_mat_est2))
        print('Symmetry coefficients: ', coefs)
    assert(np.linalg.norm(x_mat_est1-x_mat_est2) < 1e-3)
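
# Usage note: calling the function with demo=True runs the same computation but prints
# the intermediate reconstruction errors instead of only asserting the final tolerance,
# e.g.
#   test_tuples_factorization_rectangular_matrix(demo=True)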
Example no. 3
def test_tuples_factorization_rectangular_matrix(demo=False):
    """
    In this test, we compare the solution of the factorization given by an exact SVD with the solution given by
    the factorize_tuples function: with fully-observed matrix data and a quadratic loss, the two solutions should
    match exactly.
    :param demo: True for demo mode, where explanations are printed to the standard output. Otherwise a test is run.
    :return: Nothing
    """
    from naga.factorix.learn_factorization import factorize_tuples
    from scipy.sparse.linalg import svds
    # Create initial data
    n = 7  # number of rows
    m = 6  # number of columns
    rk0 = 4  # size of the embeddings
    rk = 4  # embedding size for the model
    noise = 1  # noise level
    oracle_init = False  # do we initialize at the exact solution?
    u0_mat = np.random.randn(n, rk0)
    v0_mat = np.random.randn(m, rk0)
    y_mat = np.random.randn(n, m) * noise + np.dot(u0_mat, v0_mat.transpose())

    # svd solution
    u1_mat, d1_vec, v1_matt = svds(y_mat, rk)
    v1_mat = v1_matt.transpose()
    d1_diag_matrix = np.zeros((rk, rk))
    for i in range(rk):
        d1_diag_matrix[i, i] = np.sqrt(d1_vec[i])
    x_mat_est1 = np.dot(np.dot(u1_mat, np.square(d1_diag_matrix)), v1_matt)

    if demo:
        print(
            'We obtained a first exact solution by Singular Value Decomposition'
        )
        print(
            'The difference between the observation matrix and the estimated solution is:'
        )
        print(np.linalg.norm(x_mat_est1 - y_mat))
        print()

    # #### sgd solution ####

    # conversion to tuples
    indices = [[i, n + j] for i in range(n) for j in range(m)]
    values = [y_mat[i, j] for i in range(n) for j in range(m)]

    # initialization
    if oracle_init:
        emb0_u_re = np.dot(u1_mat[:, :2], d1_diag_matrix[:2, :2])
        emb0_u_im = np.dot(u1_mat[:, 2:], d1_diag_matrix[2:, 2:])
        emb0_v_re = np.dot(v1_mat[:, :2], d1_diag_matrix[:2, :2])
        emb0_v_im = np.dot(v1_mat[:, 2:], d1_diag_matrix[2:, 2:])
        emb0_u = 1.0 * (np.concatenate([emb0_u_im, -emb0_u_re], axis=1) +
                        np.concatenate([emb0_u_re, emb0_u_im], axis=1))
        emb0_v = np.concatenate([emb0_v_re, emb0_v_im], axis=1)
        emb0 = np.concatenate([emb0_u, emb0_v], axis=0)
    else:  # random initialization
        emb0 = np.random.normal(size=(n + m, rk)) * 0.1
    # x_mat_init = hermitian_dot(emb0[:n], emb0[n:])
    x_mat_init = np.dot(emb0[:n], emb0[n:].T)

    # choose an optimizer (Adam seems to be the most reliable)
    # optim = tf.train.GradientDescentOptimizer(learning_rate=1.)
    optim = tf.train.AdamOptimizer(learning_rate=0.1)
    # optim = tf.train.RMSPropOptimizer(learning_rate=1., decay=0.1)
    # optim = tf.train.AdagradOptimizer(learning_rate=.1)
    # optim = tf.train.FtrlOptimizer(1.0, -0.5, l2_regularization_strength=0.0, initial_accumulator_value=1e-8)

    # choose a scoring function (for rectangular matrices, the standard or the complex dot products work the same)
    scoring = lambda inputs: hermitian_tuple_scorer(inputs,
                                                    rank=rk,
                                                    n_emb=n + m,
                                                    emb0=emb0,
                                                    symmetry_coef=(1.0, 1.0),
                                                    learn_symmetry_coef=True)
    # scoring = None

    # optimization (we specify optional parameters, but the default values would work as well)
    u2, coefs = factorize_tuples((indices, values),
                                 rk,
                                 emb0=emb0,
                                 n_iter=300,
                                 tf_optim=optim,
                                 scoring=scoring)
    # recover the matrix based on the hermitian dot product of the embeddings
    x_mat_est2_cplx = hermitian_dot(
        u2[:n, :], u2[n:, :].T)  # clpx2real(hermitian_dot(u2[:n], u2[n:]))
    x_mat_est2 = x_mat_est2_cplx[0] * coefs[0] + x_mat_est2_cplx[1] * coefs[1]
    if demo:
        print(x_mat_est2.shape, x_mat_init.shape)
        print(
            'We computed an estimator by minimizing the square loss on the tuples extracted from the matrix'
        )
        print(
            'The difference between the initial solution and the estimated solution is:'
        )
        print(np.linalg.norm(x_mat_est2 - x_mat_init))
        print(
            'The difference between the observations and the estimated solution is:'
        )
        print(np.linalg.norm(y_mat - x_mat_est2))
        print(
            'The difference between the exact solution and the estimated solution is:'
        )
        print(np.linalg.norm(x_mat_est1 - x_mat_est2))
        print('Symmetry coefficients: ', coefs)
    assert (np.linalg.norm(x_mat_est1 - x_mat_est2) < 1e-3)
Example no. 4
def main():   
    print("Staring main...")        
    #define default parameters    
    interactions = [(3,(0,1)), (4,(1,2)), (5,(2,0))]
    thresh_mentions = 6
    neg = 2          #negative samples per positive
    rank = 5          #embedding dimensionality
    #n_train = 287044
    n_iter = 20
    # choose a dot product to use; can be one of:
    #     multilinear
    #     multilinear_square_product
    #     generalised_multilinear_dot_product
    dot_product = generalised_multilinear_dot_product
    minibatch_size=1000
    learning_rate = 0.001
    l2 = 0.0
    eval_file_name = 'output.txt'
    

    # use input arguments to override the default parameters defined above
    for x in sys.argv:
        if len( x.split("__") ) > 1:
            arg = x.split("__")[1]
        if "interactions" in x:
            interactions = arg
            print("Interactions not defined yet")
            #TODO
        elif "n_iter" in x:
            n_iter = int(x.split("__")[1])
        elif "thresh_mentions" in x:
            thresh_mentions = int(arg)
        elif "rank" in x:
            rank =  int(arg)
        elif "n_train" in x:
            n_train = int(arg)
        elif "neg" in x:
            neg = int(arg)
        elif "minibatch" in x:
            minibatch_size = int(arg)
        elif "learning_rate" in x:
            learning_rate = float(arg)
        elif "L2" in x:
            l2 = float(arg)
        elif "eval_file_name" in x:
            eval_file_name = str(arg)

    
    print("Done parsing input arguments.")    
    # load data & dictionaries
    train, String2Int, Int2String = dr.load_FB15K237_FB('train', None, None,                      
                                                     interactions, thresh_mentions)  
    valid, String2Int, Int2String = dr.load_FB15K237_FB('valid', String2Int, 
                                                   Int2String, interactions)    
    test, String2Int, Int2String = dr.load_FB15K237_FB('test', String2Int, 
                                                    Int2String, interactions)
       
    # in case not all of the training data should be used
    #train = train[:n_train]
    n_train = len(train)
    values = [1.0 for x in range(n_train)]

       
    # sample some random negative validation tuples
    n_test = len(valid)
    valid_negative = np.random.randint(0, 1000 ,[n_test,3+len(interactions)])
    print("Done loading data.")
    
    # initialise embeddings, reserve one extra entry (the first) for unknown
    n_emb = len(Int2String.keys()) + 1
    emb0 = np.random.normal(size=(n_emb, rank)) * 0.1
    
    # initialise the norm scalers
    norm_scalers = np.random.normal(size = [3+len(interactions)]) * 0.1
    
    # set other factorization inputs
    optim = tf.train.AdamOptimizer(learning_rate=learning_rate)
   

    scoring = lf.generalised_multilinear_dot_product_scorer
    
    # factorize the train tuples, obtain all model parameters
    # print("Starting factorisation...")
    params = lf.factorize_tuples((train, values), rank, minibatch_size=minibatch_size,
                                 emb0=emb0, n_iter=n_iter, 
                                 negative_prop = neg, 
                                 loss_type = "logistic", 
                                 tf_optim=optim, scoring=scoring,
                                 norm_scalers = norm_scalers)
    
    #define two placeholders to feed in train and valid data for evaluation
    inputs_train = tf.placeholder("int32", [n_train, len(train[0])])
    inputs_valid = tf.placeholder("int32", [len(valid), len(train[0])])

    # define prediction ops for train and valid set    
    pred_train =  dot_product(params, inputs_train)
    pred_valid =  dot_product(params, inputs_valid)
    
    # define the data feeders
    feed1 = {inputs_train: train}
    feed2 = {inputs_valid: valid_negative}
    feed3 = {inputs_valid: valid}    
    

    print("Start evaluation...")
    # obtain predictions for train, valid and negative validation set
    with tf.Session() as sess:  
        sess.run(tf.initialize_all_variables())
        print("Generating predictions... training data")
        prediction_values = sigmoid( sess.run([pred_train], feed_dict=feed1)[0] )
        print("Generating predictions... validation data (negative)")
        prediction_values2 = sigmoid( sess.run([pred_valid], feed_dict=feed2)[0] )
        print("Generating predictions... validation data (positive)")
        prediction_values3 = sigmoid( sess.run([pred_valid], feed_dict=feed3)[0] )
        

    # evaluation: avg prediction among the different tuple sets
    m1 = np.mean(prediction_values)  
    m2 = np.mean(prediction_values2)  
    m3 = np.mean(prediction_values3)  
    print ("Avg Pos Train Prediction", m1)
    print ("Avg Random Neg Prediction", m2)
    print ("Avg Pos Test Prediction",  m3)
    
    # do batch ranking evaluation, compute MRR and HITS@10 on valid set.
    h = 20
    #h=len(valid)
    #params[0][0,:] = np.zeros([params[0].shape[1]])
    MRR, H10 = ranking_evaluation(valid[:h], prediction_values3[:h], 
                         interactions, 2, String2Int, Int2String, params, dot_product)
    print ("MRR:", MRR)
    print ("HITS@10:", H10)
    
    print("Writing results to file " + eval_file_name)
    with open(eval_file_name, 'w') as f:
        for x in sys.argv:
            f.write(x+"\n")
        f.write("---------\n")
        f.write("MRR: "+ str(MRR)+"\n")
        f.write("HITS@10: "+ str(H10)+"\n")
        f.write("Mean train score:" + str(m1)+"\n")
        f.write("Mean val score (neg):" + str(m2)+"\n")
        f.write("Mean val score (pos):" + str(m3)+"\n")
    print("EOF.")
Example no. 5
    # initialise embeddings, reserve one extra entry (the first) for unknown
    n_emb = len(Int2String.keys()) + 1
    emb0 = np.random.normal(size=(n_emb, rank)) * 0.1
    
    # initialise the norm scalers
    norm_scalers = np.random.normal(size = [3+len(interactions)]) * 0.1
    
    # set other factorization inputs
    optim = tf.train.AdamOptimizer(learning_rate=0.001)
    scoring = lf.generalised_multilinear_dot_product_scorer
    minibatch_size=1000
    
    # factorize the train tuples, obtain all model parameters
    params = lf.factorize_tuples((train, values), rank, minibatch_size=minibatch_size,
                                 emb0=emb0, n_iter=n_iter, 
                                 negative_prop = neg, 
                                 loss_type = "logistic", 
                                 tf_optim=optim, scoring=scoring,
                                 norm_scalers = norm_scalers)
    
    #define two placeholders to feed in train and valid data for evaluation
    inputs_train = tf.placeholder("int32", [n_train, len(train[0])])
    inputs_valid = tf.placeholder("int32", [len(valid), len(train[0])])

    # define prediction ops for train and valid set    
    pred_train =  dot_product(params, inputs_train)
    pred_valid =  dot_product(params, inputs_valid)
    
    # define the data feeders
    feed1 = {inputs_train: train}
    feed2 = {inputs_valid: valid_negative}
    feed3 = {inputs_valid: valid}    
Example no. 6
    emb0 = np.random.normal(size=(n_emb, rank)) * 0.1

    # initialise the norm scalers
    norm_scalers = np.random.normal(size=[3 + len(interactions)]) * 0.1

    # set other factorization inputs
    optim = tf.train.AdamOptimizer(learning_rate=0.001)
    scoring = lf.generalised_multilinear_dot_product_scorer
    minibatch_size = 1000

    # factorize the train tuples, obtain all model parameters
    params = lf.factorize_tuples((train, values),
                                 rank,
                                 minibatch_size=minibatch_size,
                                 emb0=emb0,
                                 n_iter=n_iter,
                                 negative_prop=neg,
                                 loss_type="logistic",
                                 tf_optim=optim,
                                 scoring=scoring,
                                 norm_scalers=norm_scalers)

    #define two placeholders to feed in train and valid data for evaluation
    inputs_train = tf.placeholder("int32", [n_train, len(train[0])])
    inputs_valid = tf.placeholder("int32", [len(valid), len(train[0])])

    # define prediction ops for train and valid set
    pred_train = dot_product(params, inputs_train)
    pred_valid = dot_product(params, inputs_valid)

    # define the data feeders
    feed1 = {inputs_train: train}