Esempio n. 1
0
    def evaluate(self, train, val, test, dim, area):
        """
        Scores the smoothed-memory ('s_memory') model on the test data.

        The model is fit on train + val through self._train_mfs and its scores are handed to
        self._compute_erank together with `test`.

         OUTPUT:
        --------
            1. results: <dict>   one entry, 'S_MEMORY' -> value returned by self._compute_erank
        """
        fit_data = train + val
        smoothed_scores = self._train_mfs(['s_memory'], fit_data, dim, area)[0]

        log.info('Evaluating smoothed memory')
        results = {'S_MEMORY': self._compute_erank(test, smoothed_scores)}

        self.pretty_print(results)
        return results
Esempio n. 2
0
    def get_factorized_mat(self, data, dim, area):
        """
        Factorizes `data` with sk_NMF and returns the dense reconstruction W . H.

         INPUT:
        -------
            1. data:  matrix exposing .toarray() (the factorization runs on the dense array)
            2. dim:   <int>   number of components requested from the factorizer
            3. area:  not used by this implementation

         OUTPUT:
        --------
            1. mf:    dense product of the fitted transform and the model's components_
        """
        log.info('Running sklearn NMF')
        t0 = time.time()

        # The factorizer is fed the dense array rather than the sparse representation.
        dense = data.toarray()
        nmf = sk_NMF(n_components=dim, init='random', random_state=0)
        weights = nmf.fit_transform(dense)
        reconstruction = np.dot(weights, nmf.components_)

        log.info('Factorizing took %d seconds' % (time.time() - t0))
        return reconstruction
Esempio n. 3
0
    def get_factorized_mat(self, data, dim, area):
        """
        Runs sk_NMF on the densified `data` and returns the reconstructed matrix.

        `dim` is the number of components; `area` is accepted but not used here.
        The random initialisation is seeded (random_state=0).
        """
        log.info('Running sklearn NMF')
        started_at = time.time()

        factorizer = sk_NMF(n_components=dim, init='random', random_state=0)
        # Dense input: the sparse representation is converted before fitting.
        W = factorizer.fit_transform(data.toarray())
        approx = np.dot(W, factorizer.components_)

        elapsed = time.time() - started_at
        log.info('Factorizing took %d seconds' % elapsed)
        return approx
Esempio n. 4
0
def learn_mix_mult_on_individual(alpha, mem_mult, mf_mult, val_mat, num_em_iter=10000, tol=0.00001):
    """
    Learning the mixing weights for mixture of two multinomials. Each individual learns mixing weights.

    NOTE: In order for the algorithm to work, there can be no location that can get 0 probability by both the mem_mult
    and the mf_mult. In my runs, I use MPE to estimate the mf_mult while using MLE for the mem_mult. That way the
    mf_mult has no 0 values.

     INPUT:
    -------
        1. alpha:       <float / (2, ) ndarray>   Dirichlet prior for the pi learning. If <float> is given it is treated
                                                  as a flat prior. Has to be bigger than 1.
        2. mem_mult:    <(I, L) ndarray>    each row is the multinomial parameter according to the "self" data
        3. mf_mult:     <(I, L) ndarray>    each row is the multinomial parameter according to the matrix factorization
        4. val_mat:     <(I, L) ndarray>    counts matrix to optimize on
        5. num_em_iter: <int>               number of em iterations
        6. tol:         <float>             convergence threshold

     OUTPUT:
    --------
        1. pis:  <(I, 2) ndarray>     each row is mixing weights for the i'th individual.
                                      An input with no rows yields an empty (0, 2) array.

     RAISE:
    -------
        1. ValueError (presumably raised by the underlying _learn_mix_mult routine, which is not visible here):
                a. alphas are not bigger than 1
                b. the multinomial's rows don't sum to 1
                c. There is a location with both mults 0 (see NOTE)
    """
    I = mem_mult.shape[0]
    pis = np.zeros([I, 2])

    start = time.time()
    for i in range(I):
        # The way the global em is implemented, allows me to simply call it with the i_val_data and it will only
        # compute the \pi as a function of that user.
        i_val_data = convert_sparse_to_coo(val_mat[i])

        # The learning method treats the multinomials as matrices. So I have to wrap it in an array.
        # All the rows in i_val_data are going to be 0 because I'm converting a single row vector.
        i_mem_mult = np.array([mem_mult[i]])
        i_mf_mult = np.array([mf_mult[i]])

        pis[i] = _learn_mix_mult(alpha, i_mem_mult, i_mf_mult, i_val_data, num_em_iter, tol)

    total_time = time.time() - start
    # Guard the per-user average: with no individuals there is nothing to divide by.
    per_user_time = total_time / I if I else 0.0
    log.info('Finished EM on users. Total time = %d secs -- %.3f per user' % (total_time, per_user_time))
    return pis
Esempio n. 5
0
    def get_factorized_mat(self, data, dim, area):
        """
        Returns the rank-`dim` truncated-SVD reconstruction of `data`.

        The column means are removed before calling svds and added back to the reconstruction.

         INPUT:
        -------
            1. data:  matrix exposing .toarray()
            2. dim:   <int>   number of singular values/vectors requested from svds
            3. area:  not used by this implementation

         OUTPUT:
        --------
            1. mf:    dense ndarray, u . diag(s) . v plus the column means
        """
        log.info('Running numpy svds')
        start = time.time()

        # One private working copy is enough. Integer input is promoted to float so the in-place
        # centring below is legal; floating/complex input keeps its own dtype.
        dense = data.toarray()
        if np.issubdtype(dense.dtype, np.inexact):
            work_dtype = dense.dtype
        else:
            work_dtype = np.float64
        tmp = np.array(dense, dtype=work_dtype)

        # For SVD we need to remove the mean from the data.
        m = np.mean(tmp, axis=0)
        tmp -= m

        u, s, v = svds(tmp, dim)
        H = np.dot(np.diag(s), v)
        mf = np.dot(u, H)
        mf += m
        log.info('Factorizing took %d seconds' % (time.time() - start))

        return mf
Esempio n. 6
0
    def evaluate(self, train, val, test, dim, area):
        """
        Mixes memory and popularity with per-individual weights and scores the mix on `test`.

        Both components are fit on `train` only; the mixing weights are learned on `val` by
        learn_mix_mult_on_individual with prior 1.1. The popularity scores get +0.001 before
        row-normalisation.

         OUTPUT:
        --------
            1. results: <dict>   'MEMORY+POPULARITY' -> value returned by self._compute_erank
        """
        memory_raw = self._train_mfs(['memory'], train, dim, area)[0]
        popularity_raw = self._train_mfs(['popularity'], train, dim, area)[0]

        mem_mult = normalize_mat_row(memory_raw)
        pop_mult = normalize_mat_row(popularity_raw + 0.001)

        user_pis = learn_mix_mult_on_individual(1.1, mem_mult, pop_mult, val)

        # The flat prior won't change the ranking so there's no need to add it here.
        log.info('Evaluating memory with popularity')
        results = {
            'MEMORY+POPULARITY': self._compute_erank(test, mem_mult, pop_mult, user_pis),
        }

        self.pretty_print(results)
        return results
Esempio n. 7
0
    def get_factorized_mat(self, data, dim, area):
        """
        Mean-centres `data`, takes its rank-`dim` SVD via svds and returns the reconstruction
        with the column means restored.

        `data` must expose .toarray(); `area` is accepted but not used here.
        The result is a dense ndarray.
        """
        log.info('Running numpy svds')
        start = time.time()

        # Work on a single private copy. Non-floating (e.g. integer count) input is promoted to
        # float64 so that subtracting the float column means in place does not fail.
        dense = data.toarray()
        needs_cast = not np.issubdtype(dense.dtype, np.inexact)
        tmp = np.array(dense, dtype=np.float64 if needs_cast else dense.dtype)

        # For SVD we need to remove the mean from the data.
        m = np.mean(tmp, axis=0)
        tmp -= m

        u, s, v = svds(tmp, dim)
        W = u
        H = np.dot(np.diag(s), v)
        mf = np.dot(W, H)
        mf += m
        log.info('Factorizing took %d seconds' % (time.time() - start))

        return mf
Esempio n. 8
0
def load_data(area):
    """
    Loads train, validation and test data for the given area.
    When testing, train and val should be combined (train += val).

    Each of <area>/train.csv, <area>/val.csv and <area>/test.csv is read as a comma separated table whose
    columns are used as (individual index, location index, count).

     OUTPUT:
    --------
        1. train:   <(I, L) csr_mat>    sparse counts matrix. Rows are individuals, columns are locations.
        2. val:     <(I, L) csr_mat>    sparse counts matrix. Rows are individuals, columns are locations.
        3. test:    <(I, L) csr_mat>    sparse counts matrix. Rows are individuals, columns are locations.

     RAISE:
    -------
        1.  IOError:              Area or one of the files does not exist.
    """
    root_folder = ''

    log.info('Loading all data for area %s' % area)
    train_file = join(root_folder, area, 'train.csv')
    val_file = join(root_folder, area, 'val.csv')
    test_file = join(root_folder, area, 'test.csv')

    # ndmin=2 keeps a file with a single record as a (1, 3) table so the column slicing below still works.
    train_data = np.loadtxt(train_file, delimiter=',', ndmin=2)
    val_data = np.loadtxt(val_file, delimiter=',', ndmin=2)
    test_data = np.loadtxt(test_file, delimiter=',', ndmin=2)

    # In order to create the coo_matrix we need to have the number of rows and columns in the matrix
    # All individuals will have data in train, val and test so it's enough to check how many users are in train.
    I = np.unique(train_data[:, 0]).shape[0]

    # For locations that is not the case. We need to check the maximum location across all 3.
    # loadtxt yields floats, so the dimension is converted to a real int. The +1 is because it's all 0 based.
    L = int(max(train_data[:, 1].max(), val_data[:, 1].max(), test_data[:, 1].max())) + 1

    def to_csr(table):
        # Row/column indices must be integers; the counts keep their loaded (float) dtype.
        rows = table[:, 0].astype(int)
        cols = table[:, 1].astype(int)
        return coo_matrix((table[:, 2], (rows, cols)), shape=(I, L)).tocsr()

    train = to_csr(train_data)
    val = to_csr(val_data)
    test = to_csr(test_data)

    return train, val, test
Esempio n. 9
0
    def evaluate(self, train, val, test, dim, area):
        """
        Mixes memory and popularity with one global pair of weights and scores the mix on `test`.

        Both components are fit on `train` only; the weights are learned on `val` by
        learn_mix_mult_global with prior 1.1. The popularity scores get +0.001 before
        row-normalisation.

         OUTPUT:
        --------
            1. results: <dict>   'MEMORY+POPULARITY' -> value returned by self._compute_erank
        """
        mem_scores = self._train_mfs(['memory'],train, dim, area)[0]
        popularity_scores = self._train_mfs(['popularity'],train,dim,area)[0]

        mem_mult = normalize_mat_row(mem_scores)
        popularity_mult = normalize_mat_row(popularity_scores+0.001)

        pi_mem_pop = learn_mix_mult_global(1.1, mem_mult, popularity_mult, val)
        log.info('Global mixing weight is %f and %f' % (pi_mem_pop[0],pi_mem_pop[1]))
        # Sum of the learned weights, written to stdout. The call form of print is valid on
        # Python 3 and prints the same text as the old statement form on Python 2.
        print(sum((pi_mem_pop).astype(float)))

        # The flat prior won't change the ranking so there's no need to add it here.
        log.info('Evaluating memory with popularity')
        mem_pop_erank = self._compute_erank(test, mem_mult, popularity_mult, pi_mem_pop)

        results = {'MEMORY+POPULARITY': mem_pop_erank}
        self.pretty_print(results)

        return results
Esempio n. 10
0
    def get_factorized_mat(self, data, dim, area):
        """
        Loads precomputed hierarchical Bayes NMF factors for `area` and returns their product.

        Nothing is fit here: 'htheta.tsv' and 'hbeta.tsv' are read from a fixed directory that
        depends on `area`, each is passed through self._fix_projection (with the number of rows,
        respectively columns, of `data` and with `dim`), and htheta . hbeta^T is returned.

         INPUT:
        -------
            1. data:  matrix; only its shape (I, L) is used
            2. dim:   forwarded to self._fix_projection
            3. area:  name used to build the directory path; must not be None
        """
        log.info('Loading hierarchical bayes NMF')

        start = time.time()
        I, L = data.shape

        assert area is not None

        root_dir = '/extra/disij0/data/person_mf/%s/hier_nmf' % area

        # Raw string: same characters as before, but '\s' is no longer an invalid escape sequence.
        delimiter = r'\s\s\s\s'

        htheta = fu.load_np_txt(join(root_dir, 'htheta.tsv'), delimiter=delimiter)
        htheta = self._fix_projection(htheta, I, dim)

        hbeta = fu.load_np_txt(join(root_dir, 'hbeta.tsv'), delimiter=delimiter)
        hbeta = self._fix_projection(hbeta, L, dim)

        mf = htheta.dot(hbeta.T)

        log.info('Factorizing took %d seconds' % (time.time() - start))
        return mf
Esempio n. 11
0
def load_data(area):
    """
    Loads train, validation and test data for the given area.
    When testing, train and val should be combined (train += val).

    The three files <area>/train.csv, <area>/val.csv and <area>/test.csv are comma separated; their columns are
    used as (individual index, location index, count).

     OUTPUT:
    --------
        1. train:   <(I, L) csr_mat>    sparse counts matrix. Rows are individuals, columns are locations.
        2. val:     <(I, L) csr_mat>    sparse counts matrix. Rows are individuals, columns are locations.
        3. test:    <(I, L) csr_mat>    sparse counts matrix. Rows are individuals, columns are locations.

     RAISE:
    -------
        1.  IOError:              Area or one of the files does not exist.
    """
    root_folder = ''

    log.info('Loading all data for area %s' % area)
    tables = []
    for file_name in ('train.csv', 'val.csv', 'test.csv'):
        # ndmin=2 guarantees a 2-D table even when the file holds a single record.
        tables.append(np.loadtxt(join(root_folder, area, file_name), delimiter=',', ndmin=2))
    train_data = tables[0]

    # In order to create the coo_matrix we need to have the number of rows and columns in the matrix
    # All individuals will have data in train, val and test so it's enough to check how many users are in train.
    I = np.unique(train_data[:, 0]).shape[0]

    # For locations that is not the case. We need to check the maximum location across all 3.
    # The ids were parsed as floats, so convert to int before using the value as a dimension (0 based, hence +1).
    L = int(max(table[:, 1].max() for table in tables)) + 1

    matrices = []
    for table in tables:
        # Integer index arrays; the count column keeps its loaded dtype.
        rows = table[:, 0].astype(int)
        cols = table[:, 1].astype(int)
        matrices.append(coo_matrix((table[:, 2], (rows, cols)), shape=(I, L)).tocsr())

    train, val, test = matrices
    return train, val, test
Esempio n. 12
0
    def evaluate(self, train, val, test, dim, area):
        """
        Computes self._compute_logp on `test` for three score matrices:
        popularity and memory fit on train + val, and memory fit on `test` itself
        (reported as 'GROUNDTRUTH').

         OUTPUT:
        --------
            1. results: <dict>   keys 'MEMORY', 'GROUNDTRUTH', 'POPULARITY'
        """
        fit_data = train + val

        truth = self._train_mfs(['memory'], test, dim, area)[0]
        memory = self._train_mfs(['memory'], fit_data, dim, area)[0]
        popularity = self._train_mfs(['popularity'], fit_data, dim, area)[0]

        # Evaluated in this order: popularity, memory, ground truth.
        plan = (
            ('POPULARITY', 'Evaluating popularity', popularity),
            ('MEMORY', 'Evaluating memory', memory),
            ('GROUNDTRUTH', 'Evaluating ground truth', truth),
        )
        logp = {}
        for key, message, scores in plan:
            log.info(message)
            logp[key] = self._compute_logp(test, scores)

        results = {
            'MEMORY': logp['MEMORY'],
            'GROUNDTRUTH': logp['GROUNDTRUTH'],
            'POPULARITY': logp['POPULARITY'],
        }
        self.pretty_print(results)

        return results
Esempio n. 13
0
    def get_factorized_mat(self, data, dim, area):
        """
        Returns htheta . hbeta^T built from precomputed hierarchical Bayes NMF factor files.

        The factors are read from 'htheta.tsv' and 'hbeta.tsv' under an area-specific directory
        and each goes through self._fix_projection before the product is taken. `data` is only
        used for its shape (I, L); `area` must not be None.
        """
        log.info('Loading hierarchical bayes NMF')

        start = time.time()
        I, L = data.shape

        assert area is not None

        root_dir = '/extra/disij0/data/person_mf/%s/hier_nmf' % area

        # Raw strings keep the backslashes literal without relying on an invalid escape sequence.
        htheta = fu.load_np_txt(join(root_dir, 'htheta.tsv'),
                                delimiter=r'\s\s\s\s')
        htheta = self._fix_projection(htheta, I, dim)

        hbeta = fu.load_np_txt(join(root_dir, 'hbeta.tsv'),
                               delimiter=r'\s\s\s\s')
        hbeta = self._fix_projection(hbeta, L, dim)

        mf = htheta.dot(hbeta.T)

        log.info('Factorizing took %d seconds' % (time.time() - start))
        return mf
Esempio n. 14
0
def learn_mix_mult_global(alpha, mem_mult, mf_mult, val_mat, num_em_iter=100000, tol=0.0001):
    """
    Learns one set of mixing weights for the mixture of two multinomials, shared by all users.
    Every observation in val_mat is a data point of the model.

    NOTE: For the algorithm to work no location may get 0 probability from both mem_mult and mf_mult.

     INPUT:
    -------
        1. alpha:       <float / (2, ) ndarray>   Dirichlet prior for the pi learning. A <float> is treated as a
                                                  flat prior. Has to be bigger than 1.
        2. mem_mult:    <(I, L) ndarray>    each row is the multinomial parameter according to the "self" data
        3. mf_mult:     <(I, L) ndarray>    each row is the multinomial parameter according to the matrix factorization
        4. val_mat:     <(I, L) matrix>     counts matrix to optimize on (passed through convert_sparse_to_coo)
        5. num_em_iter: <int>               number of em iterations
        6. tol:         <float>             convergence threshold

     OUTPUT:
    --------
        1. pi:   whatever _learn_mix_mult returns for the full data -- presumably the two global mixing
                 weights rather than one row per individual; TODO confirm against _learn_mix_mult.

     RAISE:
    -------
        1. ValueError (presumably from _learn_mix_mult, not visible here):
                a. alphas are not bigger than 1
                b. the multinomial's rows don't sum to 1
                c. There is a location with both mults 0 (see NOTE)
    """
    log.info('Learning global mixing weights for all points')

    # The sparse-to-coo conversion is part of the timed region.
    t0 = time.time()
    observations = convert_sparse_to_coo(val_mat)
    pi = _learn_mix_mult(alpha, mem_mult, mf_mult, observations, num_em_iter, tol)
    elapsed = time.time() - t0

    log.info('Finished EM on all data. Total time = %d secs' % elapsed)
    return pi
Esempio n. 15
0
    def evaluate(self, train, val, test, dim, area):
        """
        Computes self._compute_erank on `test` for popularity and memory (both fit on
        train + val) and for memory fit on `test` itself (reported as 'GROUNDTRUTH').

         OUTPUT:
        --------
            1. results: <dict>   keys 'MEMORY', 'GROUNDTRUTH', 'POPULARITY'
        """
        fit_data = train + val

        truth_scores = self._train_mfs(['memory'], test, dim, area)[0]
        memory_scores = self._train_mfs(['memory'], fit_data, dim, area)[0]
        pop_scores = self._train_mfs(['popularity'], fit_data, dim, area)[0]

        log.info('Evaluating popularity')
        pop_rank = self._compute_erank(test, pop_scores)

        log.info('Evaluating memory')
        memory_rank = self._compute_erank(test, memory_scores)

        log.info('Evaluating ground truth')
        truth_rank = self._compute_erank(test, truth_scores)

        results = {
            'MEMORY': memory_rank,
            'GROUNDTRUTH': truth_rank,
            'POPULARITY': pop_rank,
        }
        self.pretty_print(results)

        return results
Esempio n. 16
0
 def __init__(self):
     """Logs a one-line description when the instance is created."""
     banner = 'Evaluating logp on a train+val without smoothing'
     log.info(banner)
Esempio n. 17
0
def print_methods():
    """
    Logs the keys currently registered in _mfs_factory (the available MF methods).
    """
    method_names = list(_mfs_factory.keys())
    log.info('Available MF methods: %s' % method_names)
Esempio n. 18
0
 def __init__(self):
     """Logs a one-line description when the instance is created."""
     banner = 'Evaluating ranking on a single component'
     log.info(banner)
Esempio n. 19
0
 def __init__(self):
     """Logs a one-line description when the instance is created."""
     banner = 'Evaluating logp with global learned mixing weights'
     log.info(banner)
Esempio n. 20
0
    def evaluate(self, train, val, test, dim, area):
        """
        Mixes memory with NMF and with hierarchical Bayes NMF using per-individual weights.

        Two phases:
          1. Fit 'nmf', 'hbnmf' and 'memory' on `train`, then learn per-individual mixing
             weights on `val` (prior 1.1) for memory+NMF and memory+hb NMF.
          2. Refit the three models on train + val and score each mix on `test` with
             self._compute_erank, reusing the weights from phase 1.

         OUTPUT:
        --------
            1. results: <dict>   keys 'mem_nmf' and 'mem_hb_nmf'
        """
        log.info('Learning Memory, NMF and hb NMF mfs on train only for mixing weights optimization')
        nmf_fit, hb_fit, mem_fit = self._train_mfs(['nmf', 'hbnmf', 'memory'], train, dim, area)

        log.info('Learning mix for MEM and NMF')
        mem_mult = normalize_mat_row(mem_fit)
        # Small flat prior to avoid 0.
        nmf_mult = normalize_mat_row(nmf_fit + 0.001)
        weights_nmf = learn_mix_mult_on_individual(1.1, mem_mult, nmf_mult, val)

        log.info('Learning mix for MEM and hb NMF')
        # Small flat prior to avoid 0.
        hb_mult = normalize_mat_row(hb_fit + 0.001)
        weights_hb = learn_mix_mult_on_individual(1.1, mem_mult, hb_mult, val)

        log.info('Learning Memory NMF and hier NMF mfs on train+val for evaluation')
        full_train = train + val
        nmf_fit, hb_fit, mem_fit = self._train_mfs(['nmf', 'hbnmf', 'memory'], full_train, dim, area)

        # The flat prior won't change the ranking so there's no need to add it here.
        log.info('Evaluating memory with NMF')
        rank_nmf = self._compute_erank(test, mem_fit, nmf_fit, weights_nmf)

        log.info('Evaluating memory with hb_NMF')
        rank_hb = self._compute_erank(test, mem_fit, hb_fit, weights_hb)

        results = {'mem_nmf': rank_nmf, 'mem_hb_nmf': rank_hb}
        self.pretty_print(results)

        return results
Esempio n. 21
0
    def evaluate(self, train, val, test, dim, area):
        """
        Computes self._compute_erank on `test` for every single-component model.

        Memory, smoothed memory, SVD, NMF and hierarchical Bayes NMF are fit on train + val;
        'GROUNDTRUTH' is the memory model fit on `test` itself. The SVD/NMF pair is pretty-printed
        once on its own before the full table.

         OUTPUT:
        --------
            1. results: <dict>   keys 'MEMORY', 'SVD', 'NMF', 'HBPF', 'GROUNDTRUTH', 'S_MEMORY'
        """
        # There is no mixing weights optimization in this code.
        # Therefore the val can be added to train.
        eval_train = train + val

        gt_scores = self._train_mfs(['memory'], test, dim, area)[0]
        s_mem_scores = self._train_mfs(['s_memory'], eval_train, dim, area)[0]
        mem_scores = self._train_mfs(['memory'], eval_train, dim, area)[0]

        log.info('Evaluating memory')
        mem_erank = self._compute_erank(test, mem_scores)

        log.info('Evaluating smoothed memory')
        s_mem_erank = self._compute_erank(test, s_mem_scores)

        log.info('Evaluating ground truth')
        gt_erank = self._compute_erank(test, gt_scores)

        svd_scores, nmf_scores, hb_nmf_scores = self._train_mfs(['svd', 'nmf', 'hbnmf'], eval_train, dim, area)

        log.info('Evaluating SVD')
        svd_erank = self._compute_erank(test, svd_scores)

        log.info('Evaluating sklearn NMF')
        nmf_erank = self._compute_erank(test, nmf_scores)

        log.info('Evaluating Hierarchical Bayes NMF')
        hb_nmf_erank = self._compute_erank(test, hb_nmf_scores)
        self.pretty_print({'SVD': svd_erank, 'NMF': nmf_erank})

        # Each key appears exactly once; the dict contents are the same as before.
        results = {
            'MEMORY': mem_erank,
            'SVD': svd_erank,
            'NMF': nmf_erank,
            'HBPF': hb_nmf_erank,
            'GROUNDTRUTH': gt_erank,
            'S_MEMORY': s_mem_erank,
        }
        self.pretty_print(results)

        return results
Esempio n. 22
0
 def __init__(self):
     """Logs a one-line description when the instance is created."""
     banner = 'Evaluating ranking with individual learned mixing weights'
     log.info(banner)
Esempio n. 23
0
 def __init__(self):
     """Logs a one-line description when the instance is created."""
     banner = 'Mem and popularity learnt from training data; searching alpha on validation set'
     log.info(banner)
Esempio n. 24
0
    def evaluate(self, train, val, test, dim, area):
        """
        Compares six ways of combining the 'memory' and 'popularity' scores.

        Memory and popularity are fit on `train` only. For each method the free parameters are
        chosen on `val` (grid search scored with self._compute_logp_point, or EM via the
        learn_mix_mult_* functions) and the resulting scores are evaluated on `test` with
        self._compute_erank_logp. The methods, in execution order, are:

            Translation_JM, Translation_Dirichlet, EM global, EM indiv, S_mem, Dirichlet

        `dim` and `area` are only forwarded to self._train_mfs.

         OUTPUT:
        --------
            1. logP_p:      DataFrame, one row per test observation, one column per method
            2. logP_indiv:  DataFrame, one row per individual, one column per method
            3. mix_alpha:   DataFrame, one row per individual, one column per method

        The `results` dict holding the _compute_erank_logp outputs is pretty-printed but NOT returned.
        """
        def logP(score_mat, test):
            # Helper returning two log-probability vectors for `test` under `score_mat`.
            # It reads `I` from the enclosing scope; `I` is assigned before the first call.
            logp_p = np.zeros(int(test.sum()))
            logp_indiv = np.zeros(test.shape[0])
            test_data = coo_matrix(test)

            # Point-wise: scores normalised by the sum over the whole matrix; an entry with
            # count v fills v consecutive slots of logp_p.
            temp = score_mat / np.sum(score_mat)
            idx = 0
            for i, j, v in zip(test_data.row, test_data.col, test_data.data):
                logp_p[int(idx):int(idx + v)] = np.log(temp[i, j])
                idx += v

            # Per individual: scores normalised per row, log-probabilities weighted by the count.
            temp = normalize_mat_row(score_mat)
            for i, j, v in zip(test_data.row, test_data.col, test_data.data):
                logp_indiv[i] += v * np.log(temp[i, j])

            # Despite its name this holds the per-row totals of `test`.
            # NOTE(review): test.sum(axis=1) is re-evaluated for every i, and a row with no
            # test counts makes the division below 0/0 -- confirm that cannot happen.
            n_train = np.array([int(test.sum(axis=1)[i][0]) for i in range(I)])
            logp_indiv /= n_train

            return logp_p, logp_indiv

        # Grid used by the S_mem and Dirichlet searches further down.
        ALPHA = np.arange(0.1, 1.1, 0.1)

        mem_scores = self._train_mfs(['memory'], train, dim, area)[0]
        # The 0.0001 offset is added to every popularity score.
        popularity_scores = self._train_mfs(['popularity'], train, dim,
                                            area)[0] + 0.0001

        mem_mult = normalize_mat_row(mem_scores)
        popularity_mult = normalize_mat_row(popularity_scores)

        # N: total of the memory scores; n_train: per-row totals of `train`.
        N = int(np.sum(mem_scores))
        I, L = train.shape
        n_train = np.array([int(train.sum(axis=1)[i][0]) for i in range(I)])

        results = dict()
        headers = [
            'EM global', 'EM indiv', 'S_mem', 'Dirichlet', 'Translation_JM',
            'Translation_Dirichlet'
        ]
        logP_p = DataFrame(np.zeros((int(test.sum()), 6)), columns=headers)
        logP_indiv = DataFrame(np.zeros((I, 6)), columns=headers)
        mix_alpha = DataFrame(np.zeros((I, 6)), columns=headers)

        log.info('#####learning statistical translation model#######')
        log.info('computing sparse mutual information')

        # Co-occurrence statistics over the binarised training matrix.
        # NOTE(review): the two divisions by I are integer divisions under Python 2 unless true
        # division is enabled at module level -- confirm.
        binary = (train > 0) * 1  #I*L
        count_1d = binary.sum(axis=0)  #1*L
        count_2d = np.dot(binary.T, binary)  #L*L
        P_1d = count_1d / I  # exists zeros
        P_2d = count_2d / I
        temp = P_2d / np.outer(P_1d, P_1d)
        temp[~np.isfinite(temp)] = 1  # zero / zero = zero
        temp[temp == 0] = 1  # avoid log_zero
        PPMI = np.log2(temp)
        PPMI[PPMI < 0] = 0

        # For every location keep at most the k columns with the largest strictly positive PPMI,
        # then make sure the location itself is in its own list.
        # NOTE(review): the per-location lists can differ in length and .append() below needs
        # each idx[u] to be a Python list; this depends on how np.array treats ragged input in
        # the numpy version in use -- TODO confirm.
        k = 50
        idx = np.array([[
            j for j in np.asarray(PPMI[i].argsort().T).reshape(-1)[-k:][::-1]
            if PPMI[i, j] > 0
        ] for i in range(L)])
        for u in range(L):
            if u not in idx[u]:
                idx[u].append(u)

        # Mutual information is computed only for the selected pairs and mirrored so MI is
        # symmetric; a pair whose entry is still 0 is (re)computed.
        binary = (np.array(train.toarray()) > 0) * 1  #I*L
        MI = np.zeros((L, L))
        from sklearn import metrics
        for u in range(L):
            for w in idx[u]:
                if MI[u, w] == 0:
                    MI[u, w] = metrics.mutual_info_score(
                        None,
                        None,
                        contingency=np.histogram2d(binary[:, u], binary[:,
                                                                        w])[0])
                    MI[w, u] = MI[u, w]
        MI = normalize_mat_row(MI)
        # Non-finite entries left by the normalisation are replaced by 1 / L.
        MI[~np.isfinite(MI)] = 1 / L
        ##########and self transition probability########
        log.info(
            'gridsearching on validation set (can be optimized) with JM smoothing'
        )
        # JM variant: alpha weights the identity against MI in the translation matrix, mu weights
        # the translated memory against popularity.
        val_result = dict()
        for alpha in [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
            for mu in [0, 0.1, 0.2, 0.3, 0.4, 0.5]:
                trans = MI * (1 - alpha) + np.identity(L) * alpha
                pref = np.dot(
                    mem_mult,
                    trans)  # consider each trans[i] as a  base vector
                temp = pref * mu + popularity_mult * (1 - mu)
                val_result[(alpha, mu)] = self._compute_logp_point(val, temp)
        #####choose alpha and mu that achieves best avg. point logP
        alpha, mu = max(val_result, key=val_result.get)
        trans = MI * (1 - alpha) + np.identity(L) * alpha
        pref = np.dot(mem_mult, trans)
        stm_scores = pref * mu + popularity_mult * (1 - mu)
        log.info('Evaluating MI based translation model with JM smoothing')
        stm_result = self._compute_erank_logp(test, stm_scores)
        results['Translation_JM'] = stm_result
        log.info("self transition weight and popularity weight: %f, %f" %
                 (alpha, 1 - mu))
        #####record results and mixture parameters########
        logP_p['Translation_JM'], logP_indiv['Translation_JM'] = logP(
            stm_scores, test)
        mix_alpha['Translation_JM'] = np.zeros(I) + mu * alpha

        ##########and self transition probability########
        log.info(
            'gridsearching on validation set (can be optimized) with Dirichlet prior'
        )
        # Dirichlet variant: same translation matrix, applied to the raw memory scores, with
        # popularity added with strength mu * N / I.
        val_result = dict()
        for alpha in [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
            for mu in [0, 0.1, 0.2, 0.3, 0.4, 0.5]:
                trans = MI * (1 - alpha) + np.identity(L) * alpha
                pref = np.dot(
                    mem_scores,
                    trans)  # consider each trans[i] as a  base vector
                temp = pref + popularity_mult * mu * N / I
                val_result[(alpha, mu)] = self._compute_logp_point(val, temp)
        #####choose alpha and mu that achieves best avg. point logP
        alpha, mu = max(val_result, key=val_result.get)
        trans = MI * (1 - alpha) + np.identity(L) * alpha
        pref = np.dot(mem_scores, trans)
        stm_scores = pref + popularity_mult * mu * N / I
        log.info('Evaluating MI based translation model with Dirichlet prior')
        stm_result = self._compute_erank_logp(test, stm_scores)
        results['Translation_Dirichlet'] = stm_result
        log.info("self transition weight and prior strength: %f, %f" %
                 (alpha, mu * N / I))
        #####record results and mixture parameters########
        logP_p['Translation_Dirichlet'], logP_indiv[
            'Translation_Dirichlet'] = logP(stm_scores, test)
        mix_alpha['Translation_Dirichlet'] = n_train * alpha / (n_train +
                                                                mu * N / I)

        log.info('#############learning EM global#################')
        # One pair of mixing weights for everybody, learned on val.
        pi_mem_pop = learn_mix_mult_global(1.1, mem_mult, popularity_mult, val)
        log.info('Global mixing weight is %f and %f' %
                 (pi_mem_pop[0], pi_mem_pop[1]))
        log.info('Evaluating EM global')

        em_global_scores = pi_mem_pop[0] * mem_mult + pi_mem_pop[
            1] * popularity_mult
        EM_global_result = self._compute_erank_logp(test, em_global_scores)
        results['EM global'] = EM_global_result
        logP_p['EM global'], logP_indiv['EM global'] = logP(
            em_global_scores, test)
        mix_alpha['EM global'] = pi_mem_pop[0] + np.zeros(I)

        log.info('#############learning EM individual##############')
        # One pair of mixing weights per individual (pi_mem_pop is rebound here).
        pi_mem_pop = learn_mix_mult_on_individual(1.1, mem_mult,
                                                  popularity_mult, val)
        log.info('Evaluating EM indiv')

        # The mixed matrix is only used for logP(); the evaluation call below receives the two
        # components and the weights separately.
        em_indiv_scores = col_vector(pi_mem_pop[:, 0]) * mem_mult + col_vector(
            pi_mem_pop[:, 1]) * popularity_mult
        EM_indiv_result = self._compute_erank_logp(test, mem_mult,
                                                   popularity_mult, pi_mem_pop)
        results['EM indiv'] = EM_indiv_result
        logP_p['EM indiv'], logP_indiv['EM indiv'] = logP(
            em_indiv_scores, test)
        mix_alpha['EM indiv'] = pi_mem_pop[:, 0]

        log.info('#############learning S_memory###################')
        log.info('gridsearching on validation set')
        # Linear interpolation of the raw (unnormalised) memory and popularity scores.
        val_result = dict()
        for alpha in ALPHA:
            temp = mem_scores * alpha + popularity_scores * (1 - alpha)
            val_result[alpha] = self._compute_logp_point(val, temp)
        #####choose alpha that achieves best avg. point logP
        alpha = max(val_result, key=val_result.get)
        print('alpha:', alpha)
        s_mem_scores = mem_scores * alpha + popularity_scores * (1 - alpha)
        log.info('Evaluating smoothed memory')
        s_mem_result = self._compute_erank_logp(test, s_mem_scores)
        # NOTE(review): this key is spelled 'S_Mem' while the DataFrame column is 'S_mem'.
        results['S_Mem'] = s_mem_result

        # Same expression as the n_train computed near the top; `temp` is its mean.
        n_train = np.array([int(train.sum(axis=1)[i][0]) for i in range(I)])
        temp = n_train.mean()
        logP_p['S_mem'], logP_indiv['S_mem'] = logP(s_mem_scores, test)
        mix_alpha['S_mem'] = alpha * n_train / (alpha * n_train +
                                                (1 - alpha) * temp)

        log.info('############learning with Dirichlet prior#############')
        log.info('gridsearching on validation set')
        # Raw memory scores plus popularity scaled by alpha * N / I.
        val_result = dict()
        for alpha in ALPHA:
            temp = mem_scores + popularity_mult * alpha * N / I
            val_result[alpha] = self._compute_logp_point(val, temp)
        #####choose alpha that achieves best avg. point logP
        alpha = max(val_result, key=val_result.get)
        print('alpha:', alpha)
        dirichlet_scores = mem_scores + popularity_mult * alpha * N / I
        log.info('Evaluating with Dirichlet prior')
        dirichlet_result = self._compute_erank_logp(test, dirichlet_scores)
        results['Dirichlet'] = dirichlet_result

        logP_p['Dirichlet'], logP_indiv['Dirichlet'] = logP(
            dirichlet_scores, test)
        mix_alpha['Dirichlet'] = n_train / (n_train + alpha * N / I)

        # `results` is only printed; the three DataFrames are what the caller receives.
        self.pretty_print(results)
        return logP_p, logP_indiv, mix_alpha
Esempio n. 25
0
    def evaluate(self, train, val, test, dim, area):
        """
        Grid-searches the global mixing weight alpha between memory and popularity.

        The sweep is run twice: first with both models fit on `train`, then with both fit on
        train + val. For every alpha the mix alpha * memory + (1 - alpha) * popularity (both
        row-normalised) is scored on `val` and on `test` with self._compute_erank, and the two
        result tables are pretty-printed.

        Result keys use three decimals so that every alpha in the grid keeps its own entry
        (with two decimals 0 / 0.001 and 0.999 / 1 collapsed onto the same key).

        Nothing is returned.
        """
        ALPHA = [0, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 0.999, 1]

        def sweep(fit_data, banner):
            # Fits both components on fit_data, then scores every alpha on val and on test.
            mem_scores = self._train_mfs(['memory'], fit_data, dim, area)[0]
            popularity_scores = self._train_mfs(['popularity'], fit_data, dim, area)[0]

            mem_mult = normalize_mat_row(mem_scores)
            popularity_mult = normalize_mat_row(popularity_scores)

            log.info(banner)
            results_val = dict()
            results_test = dict()
            for alpha in ALPHA:
                label = '%.3f' % alpha
                log.info('Ranking when alpha is %s' % label)
                scores = alpha * mem_mult + (1 - alpha) * popularity_mult
                results_val[label] = self._compute_erank(val, scores)
                results_test[label] = self._compute_erank(test, scores)

            log.info('Erank on validation data')
            self.pretty_print(results_val)
            log.info('Erank on test data')
            self.pretty_print(results_test)

        sweep(train, 'Mem and popularity learnt from training data; searching alpha')
        sweep(train + val, 'Mem and popularity learnt from training and val data; searching alpha')
Esempio n. 26
0
 def __init__(self):
     """Logs a one-line description when the instance is created."""
     banner = 'Evaluating ranking with global gridsearched mixing weights'
     log.info(banner)
Esempio n. 27
0
def print_methods():
    """
    Writes the names registered in _mfs_factory to the log.
    """
    available = list(_mfs_factory.keys())
    message = 'Available MF methods: %s' % available
    log.info(message)
Esempio n. 28
0
 def __init__(self):
     """Logs a one-line description when the instance is created."""
     banner = 'Evaluating logp with global learned mixing weights'
     log.info(banner)
Esempio n. 29
0
 def __init__(self):
     """Logs a one-line description when the instance is created."""
     banner = 'Evaluating ranking on a train+val with smoothing'
     log.info(banner)
Esempio n. 30
0
 def __init__(self):
     """Logs a one-line description when the instance is created."""
     banner = 'Evaluating logp on a train+val without smoothing'
     log.info(banner)