Example #1
	def create_feat_eng_data(self, train_file, dev_file, test_file, train_out_file, dev_out_file, test_out_file):
		'''
		Feature engineered data creation using TF-IDF
		train_file: file containing (training) input features
		dev_file: file containing (dev) input features
		test_file: file containing (test) input features
		train_out_file: output file for engineered (training) features 		
		dev_out_file: output file for engineered (dev) features 		
		test_out_file: output file for engineered (test) features 		
		'''
		#create training features
		#x_mat = self.load_sparse_csr(feature_file).transpose().todense()	#feature matrix		
		#x_mat = self.load_sparse_csr(train_file).transpose().toarray()	#feature matrix		
		x_mat = self.load_sparse_csr(train_file).tocsc()		#feature matrix		
		x_mat = x_mat[:,1:]		#removing the first (all 1) feature
		#dict_terms = ast.literal_eval(open(dict_file, 'r').read())
		N = x_mat.shape[0]
		idf_arr = [math.log((N*1.0)/max(1., x_mat[:,i].nnz)) for i in range(x_mat.shape[1])]		
		idf = np.asarray(idf_arr)				
		x_mat = x_mat.tocsr()
		for i in range(0, N, 50000):
			j = min(i + 50000, N)			
			x_mat_temp = csr(x_mat[i:j,:].multiply(idf))
			if i==0:
				x_mat_new = x_mat_temp
			else:
				x_mat_new = vstack([x_mat_new, x_mat_temp])
			print(i)
		all1_row = csr(np.asarray([1] * x_mat.shape[0])).transpose()
		x_mat_new = hstack([all1_row, x_mat_new])		
		self.save_sparse_csr(train_out_file, csr(x_mat_new))	

		#create dev features
		if dev_file is not None:
			x_mat = self.load_sparse_csr(dev_file).transpose().toarray()	#feature matrix
			x_mat = x_mat[1:,:]		#removing the first (all 1) feature
			x_mat = np.multiply(x_mat.transpose(), idf)
			all1_row = np.asarray([1] * x_mat.shape[0]).transpose()
			self.save_sparse_csr(dev_out_file, csr(np.column_stack((all1_row, x_mat))))		
		
		#create test features
		if test_file is not None:
			x_mat = self.load_sparse_csr(test_file).transpose().toarray()	#feature matrix
			x_mat = x_mat[1:,:]		#removing the first (all 1) feature
			x_mat = np.multiply(x_mat.transpose(), idf)
			all1_row = np.asarray([1] * x_mat.shape[0]).transpose()
			self.save_sparse_csr(test_out_file, csr(np.column_stack((all1_row, x_mat))))	
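The per-column loop over x_mat[:,i].nnz above makes one Python-level call per feature. A minimal vectorized sketch of the same IDF computation (an addition, assuming the csr alias for scipy.sparse.csr_matrix used throughout these examples):

import numpy as np

def idf_vector(x_mat):
    # idf_i = log(N / max(1, df_i)); getnnz(axis=0) returns the
    # document frequency of every column in one call
    N = x_mat.shape[0]
    df = x_mat.getnnz(axis=0)
    return np.log(N / np.maximum(1.0, df))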
Example #2
def test_eval_sparse_dense(tmpdir, device_id):
    from cntk import Axis
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs
    from cntk.ops import times

    input_vocab_dim = label_vocab_dim = 69

    ctf_data = '''\
0	|S0 3:1 |# <s>	|S1 3:1 |# <s>
0	|S0 4:1 |# A	|S1 32:1 |# ~AH
0	|S0 5:1 |# B	|S1 36:1 |# ~B
0	|S0 4:1 |# A	|S1 31:1 |# ~AE
0	|S0 7:1 |# D	|S1 38:1 |# ~D
0	|S0 12:1 |# I	|S1 47:1 |# ~IY
0	|S0 1:1 |# </s>	|S1 1:1 |# </s>
2	|S0 60:1 |# <s>	|S1 3:1 |# <s>
2	|S0 61:1 |# A	|S1 32:1 |# ~AH
'''
    ctf_file = str(tmpdir / '2seqtest.txt')
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    mbs = MinibatchSource(CTFDeserializer(
        ctf_file,
        StreamDefs(features=StreamDef(field='S0',
                                      shape=input_vocab_dim,
                                      is_sparse=True),
                   labels=StreamDef(field='S1',
                                    shape=label_vocab_dim,
                                    is_sparse=True))),
                          randomize=False,
                          max_samples=2)

    raw_input = sequence.input_variable(shape=input_vocab_dim,
                                        sequence_axis=Axis('inputAxis'),
                                        name='raw_input',
                                        is_sparse=True)

    mb_valid = mbs.next_minibatch(minibatch_size_in_samples=100,
                                  input_map={raw_input: mbs.streams.features},
                                  device=cntk_device(device_id))

    z = times(raw_input, np.eye(input_vocab_dim))
    e_reader = z.eval(mb_valid, device=cntk_device(device_id))

    # CSR with the raw_input encoding in ctf_data
    one_hot_data = [[3, 4, 5, 4, 7, 12, 1], [60, 61]]
    data = [
        csr(np.eye(input_vocab_dim, dtype=np.float32)[d]) for d in one_hot_data
    ]
    e_csr = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a, b in zip(e_reader, e_csr)])

    # One-hot with the raw_input encoding in ctf_data
    data = Value.one_hot(one_hot_data,
                         num_classes=input_vocab_dim,
                         device=cntk_device(device_id))
    e_hot = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a, b in zip(e_reader, e_hot)])
Example #3
    def train(self, start_section, end_section):
        print('Fitting the hyperplane...')

        #Applying 'csr' to the vectors makes a sparse matrix.
        X_matrix = csr(self.getfeaturevectors(start_section, end_section))
        y_vector = np.array(self.getgsdata(start_section, end_section))

        self.hyperplane.fit(X_matrix, y_vector)
Example #4
def _get_kl(n, dx):
    rows2 = [i for i in range(n)]
    lower1rows = [i + 1 for i in range(n - 1)]
    rows = rows2 + rows2[:-1] + lower1rows
    cols = rows2 + lower1rows + rows2[:-1]
    vals = np.array([2.] * n + [-1.] * (n - 1) * 2) / dx**2
    k = csr((vals, (rows, cols)))

    vals = (np.array([2.] * n + [-1.] * (n - 1) * 2) * -1 / dx**2).tolist()
    iden = [1.] * n
    rowsid = (np.array(rows2) + n).tolist()
    i = csr((iden, (rows2, rows2)))

    l = csr(
        (vals + iden, ((np.array(rows) + n).tolist() + rows2, cols + rowsid)))

    return i, k, l
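A quick sanity check of the helper above (a sketch, not part of the original source): for n = 4 and dx = 1, k is the standard tridiagonal 1-D Laplacian, i is the 4 x 4 identity, and l packs -k and the identity into one 8 x 8 block operator.

i, k, l = _get_kl(4, 1.0)
print(k.toarray())
# [[ 2. -1.  0.  0.]
#  [-1.  2. -1.  0.]
#  [ 0. -1.  2. -1.]
#  [ 0.  0. -1.  2.]]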
Example #5
File: vpesvm.py Project: kiankd/vpe
    def train(self, start_section, end_section):
        print('Fitting the hyperplane...')

        #Applying 'csr' to the vectors makes a sparse matrix.
        X_matrix = csr(self.getfeaturevectors(start_section, end_section))
        y_vector = np.array(self.getgsdata(start_section, end_section))

        self.hyperplane.fit(X_matrix, y_vector)
Example #6
    def root_function_first_derivative_numerical(self,beta):
       
        num = beta.shape[0]

        idx = list(range(num))

        idx_diag_ends = np.array([0,num-1])

        Amatrix = csr((num,num))

        Amatrix += csr((np.ones(num-1),(idx[:-1],idx[1:])),shape=(num,num))
        Amatrix -= csr((np.ones(num-1),(idx[1:],idx[:-1])),shape=(num,num))
        Amatrix += csr((np.array([-1,1]),(idx_diag_ends,idx_diag_ends)),shape=(num,num))

        y_prime = Amatrix*root_function(beta,self.RR)
        x_prime = Amatrix*beta

        return y_prime/x_prime
Example #7
def test_eval_sparse_dense(tmpdir, device_id):
    from cntk import Axis
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs
    from cntk.ops import input_variable, times

    input_vocab_dim = label_vocab_dim = 69

    ctf_data = '''\
0	|S0 3:1 |# <s>	|S1 3:1 |# <s>
0	|S0 4:1 |# A	|S1 32:1 |# ~AH
0	|S0 5:1 |# B	|S1 36:1 |# ~B
0	|S0 4:1 |# A	|S1 31:1 |# ~AE
0	|S0 7:1 |# D	|S1 38:1 |# ~D
0	|S0 12:1 |# I	|S1 47:1 |# ~IY
0	|S0 1:1 |# </s>	|S1 1:1 |# </s>
2	|S0 60:1 |# <s>	|S1 3:1 |# <s>
2	|S0 61:1 |# A	|S1 32:1 |# ~AH
'''
    ctf_file = str(tmpdir/'2seqtest.txt')
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    mbs = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
        features  = StreamDef(field='S0', shape=input_vocab_dim,  is_sparse=True),
        labels    = StreamDef(field='S1', shape=label_vocab_dim,  is_sparse=True)
    )), randomize=False, epoch_size = 2)

    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(
        shape=input_vocab_dim, dynamic_axes=input_dynamic_axes,
        name='raw_input', is_sparse=True)

    mb_valid = mbs.next_minibatch(minibatch_size_in_samples=100,
            input_map={raw_input : mbs.streams.features},
            device=cntk_device(device_id))

    z = times(raw_input, np.eye(input_vocab_dim))
    e_reader = z.eval(mb_valid, device=cntk_device(device_id))

    # CSR with the raw_input encoding in ctf_data
    one_hot_data = [
            [3, 4, 5, 4, 7, 12, 1],
            [60, 61]
            ]
    data = [csr(np.eye(input_vocab_dim, dtype=np.float32)[d]) for d in
            one_hot_data]
    e_csr = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a,b in zip(e_reader, e_csr)])

    # One-hot with the raw_input encoding in ctf_data
    data = Value.one_hot(one_hot_data, num_classes=input_vocab_dim, device=cntk_device(device_id))
    e_hot = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a,b in zip(e_reader, e_hot)])
Example #8
    def devtest(self, start_section, end_section, verbose=False):
        print('Predicting...')

        #Applying 'csr' to the vectors makes a sparse matrix.
        predictions = self.hyperplane.predict(
            csr(self.getfeaturevectors(start_section, end_section)))
        self.compare(self.getgsdata(start_section, end_section),
                     predictions,
                     start_section,
                     verbose=verbose)
Example #9
	def predict_class(self, from_file, w_file, test_file, pred_file, x_crossval, w_crossval):
		'''
		Predicts the classes for the dev or test set
		Parameters:
			from_file: boolean value to check if data is to be read from file
			w_file: file containing the w learned from the training data for each class
			test_file: file containing the dev or test data (in csr)
			pred_file: prediction output file
			x_crossval: the testing can be done on a small cross validation set (test_file should be None)
			w_crossval: weights for cross validation
		'''		
		if from_file:
			x = self.load_sparse_csr(test_file)		#157010 x 1001
			x = csr(x[:,1:])						#removing the first (all 1) feature
			w = self.load_sparse_csr(w_file)		#5 x 1001, and without the first feature, 5 x 1000
		else:
			x = x_crossval
			w = w_crossval
		w_dot_x = x.dot(w.transpose())			#157010 x 5
		hard_pred = w_dot_x.toarray().argmax(axis=1) + 1

		w_dot_x_arr = w_dot_x.toarray()
		w_dot_x_arr[w_dot_x_arr>20] = 20
		w_dot_x_arr[w_dot_x_arr<-20] = -20
		w_dot_x = csr(w_dot_x_arr)
		w_dot_x_exp = csr(np.exp(w_dot_x.toarray()))	#157010 x 5
		w_dot_x_sum = w_dot_x_exp.sum(axis=1)			#row sum (157010 x 1)
		x_div_mat = csr(w_dot_x_exp.toarray()/w_dot_x_sum)			#157010 x 5	

		num_vectors = 5
		rating_arr = np.array([1,2,3,4,5]).transpose()
		soft_pred = x_div_mat.toarray().dot(rating_arr)

		if from_file:
			out = open(pred_file, 'w')
			for index in range(len(hard_pred)):
				out.write(str(hard_pred[index]))
				out.write(' ')
				out.write(str(soft_pred[index]))
				out.write('\n')
			out.close()
		return hard_pred, soft_pred
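The soft prediction above is the expected rating under the softmax distribution over the five classes; a small standalone sketch of that step (illustrative values, not from the original):

import numpy as np

scores = np.array([[0.2, 1.1, 0.4, -0.3, 0.0]])   # one row of w_dot_x
probs = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)
print(probs.dot(np.array([1, 2, 3, 4, 5])))       # expected star rating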
Example #10
def term_transitions(replace, DIST='damerau'):
    index2term = list(
        set([item for item in replace.keys()])
        | set([item for item in replace.values()]))
    term2index = {index2term[i]: i
                  for i in range(len(index2term))}
    rows, cols = zip(*[[term2index[item[0]], term2index[item[1]]]
                       for item in replace.items()])
    R = csr((np.ones(2 * len(rows)), (rows + cols, cols + rows)),
            dtype=bool,
            shape=(len(index2term), len(index2term)))
    labels = connected_components(R)[1]
    sorting = np.argsort(labels)
    labels_s = labels[sorting]
    _, starts = np.unique(labels_s, return_index=True)
    sizes = np.diff(starts)
    groups = [
        group for group in np.split(sorting, starts[1:]) if group.size > 1
    ]
    transition = dict()
    for group in groups:
        sum_group = float(sum([d[(index2term[index], )] for index in group]))
        max_index = None
        max_freq = 0
        for index in group:
            predict_term = index2term[index]
            predict_freq = d[(predict_term, )]
            if predict_freq > max_freq:
                max_freq = predict_freq
                max_index = index
        for index1 in group:
            given_term = index2term[index1]
            len_1 = len(given_term)
            transition[given_term] = dict()
            for index2 in [index1, max_index]:
                predict_term = index2term[index2]
                len_2 = len(predict_term)
                sim_prefix = prefix_normed(given_term, predict_term, len_1,
                                           len_2)
                sim_similar = similarity_normed(given_term, predict_term,
                                                len_1, len_2, DIST)
                transition[given_term][predict_term] = (d[
                    (predict_term, )] / sum_group) * sim_similar
                #(sim_similar+sim_prefix)/2;
            sum_sim = sum([
                transition[given_term][predict_term]
                for predict_term in transition[given_term]
            ])
            for predict_term in transition[given_term]:
                transition[given_term][predict_term] /= sum_sim
            for index2 in [index1, max_index]:
                print(given_term, '-->', index2term[index2],
                      transition[given_term][index2term[index2]])
    return transition
Example #11
def test_eval_sparse_no_seq(batch_index_data, device_id):
    dim = 10
    multiplier = 2
    for var_is_sparse in [True, False]:
        in1 = sequence.input(shape=(dim, ), is_sparse=var_is_sparse)
        z = times(in1, multiplier * np.eye(dim))
        batch = np.eye(dim)[batch_index_data]
        expected = batch * multiplier
        sparse_val = csr(batch.astype('f'))
        result = z.eval({in1: [sparse_val]}, device=cntk_device(device_id))
        assert np.allclose(result, [expected])
Example #12
def test_eval_sparse_no_seq(batch_index_data, device_id):
    dim = 10
    multiplier = 2
    in1 = input_variable(shape=(dim, ), is_sparse=True)
    z = times(in1, np.eye(dim).astype(np.float32))
    z *= multiplier
    batch = (np.eye(dim)[batch_index_data]).astype(np.float32)
    expected = batch * multiplier
    sparse_val = csr(batch)
    result = z.eval({in1: sparse_val}, device=cntk_device(device_id))
    assert np.allclose(result, [expected])
Example #13
def test_eval_sparse_no_seq(batch_index_data, device_id):
    dim = 10
    multiplier = 2
    for var_is_sparse in [True, False]:
        in1 = input_variable(shape=(dim,), is_sparse=var_is_sparse)
        z = times(in1, multiplier*np.eye(dim))
        batch = np.eye(dim)[batch_index_data]
        expected = batch * multiplier
        sparse_val = csr(batch.astype('f'))
        result = z.eval({in1: [sparse_val]}, device=cntk_device(device_id))
        assert np.allclose(result, [expected])
Example #14
def adjust_n_max(new_n_max):
    global n_max
    n_max = new_n_max
    global n_phonon
    n_phonon = np.array([i for i in range(n_max + 1)], dtype=np.float64)
    global sqrt_n
    sqrt_n = np.array([sqrt(i) for i in range(1, n_max + 1)])
    global annihilation, creation
    annihilation = csr(np.diag(sqrt_n, k=1))
    creation = csr(np.diag(sqrt_n, k=-1))
    annihilation.prune()
    creation.prune()
    global x_hat, p_hat  ### we assume \hbar = m * \omega = 1
    x_hat = sqrt(1 / 2) * (creation + annihilation)
    p_hat = 1.j * sqrt(1 / 2) * (creation - annihilation)
    x_hat.prune()
    p_hat.prune()
    global x_hat_2, p_hat_2, xp_px_hat
    x_hat_2 = x_hat.dot(x_hat)
    p_hat_2 = np.real(p_hat.dot(p_hat))
    xp_px_hat = x_hat.dot(p_hat) + p_hat.dot(x_hat)
    x_hat_2.prune()
    p_hat_2.prune()
    xp_px_hat.prune()
    global harmonic_Hamil
    harmonic_Hamil = omega * np.diag(1 / 2 + n_phonon)
    harmonic_Hamil = csr(harmonic_Hamil)
    harmonic_Hamil.prune()
    if __name__ == '__main__':
        print('n_max adjusted to {}'.format(new_n_max))
        global eigen_states
        eigen_states = []
        for i in range(new_n_max + 1):
            eigen_states.append(
                common_factor_of_1Dharmonics(i) *
                np.polynomial.hermite.hermval(
                    x.astype(np.float128, order='C'),
                    np.array([0.
                              for j in range(i)] + [1.], dtype=np.float128)))
        eigen_states = np.array(eigen_states).transpose().astype(np.float64,
                                                                 order='C')
Example #15
def construct_graph(indices, costs, N):
    """ Creates a compressed sparse row matrix of the travel costs associated with each
    node connection. The SciPy sparse row matrix function takes the following input arguments:

    :param indices: indices of the connected nodes. Each index is split into start nodes and end
    nodes for each connection. This is equivalent to the transposed indices matrix.
    :param costs: the costs associated with each node connection
    :param N: is the size of the sparse graph
    :return: a SciPy compressed sparse row matrix describing the costs associated with each node connection.
    """
    s_graph = csr((costs, (indices[:, 0], indices[:, 1])), shape=(N, N))
    return s_graph
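Example use (a sketch; scipy.sparse.csgraph is assumed for the query step): build a 3-node graph and read back shortest-path costs.

import numpy as np
from scipy.sparse.csgraph import dijkstra

indices = np.array([[0, 1], [1, 2]])   # edges 0 -> 1 and 1 -> 2
costs = np.array([4.0, 3.0])
graph = construct_graph(indices, costs, N=3)
print(dijkstra(graph, directed=True, indices=0))   # [0. 4. 7.]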
Example #16
def test_validate():
    csr = sparse.csr_matrix
    sym = DiscreteSymmetry(
        projectors=[csr(np.array([[1], [0]])),
                    csr(np.array([[0], [1]]))])
    assert sym.validate(csr(np.array([[0], [1]]))) == 'Conservation law'
    assert sym.validate(np.array([[1], [0]])) is None
    assert sym.validate(np.eye(2)) is None
    assert sym.validate(1 - np.eye(2)) == 'Conservation law'

    sym = DiscreteSymmetry(particle_hole=sparse.identity(2))
    assert sym.validate(1j * sparse.identity(2)) is None
    assert sym.validate(sparse.identity(2)) == 'Particle-hole'

    sym = DiscreteSymmetry(time_reversal=sparse.identity(2))
    assert sym.validate(sparse.identity(2)) is None
    assert sym.validate(1j * sparse.identity(2)) == 'Time reversal'

    sym = DiscreteSymmetry(chiral=csr(np.diag((1, -1))))
    assert sym.validate(np.eye(2)) == 'Chiral'
    assert sym.validate(1 - np.eye(2)) is None
Example #17
def intra_u(img, tmap, X):

    ## yet to add symmetry
    N = X.shape[0]
    kuu = 5
    X[:, 3:5] = X[:, 3:5] / 20
    alpha = tmap.ravel()

    ind = np.arange(X.shape[0])

    unk = X[(alpha > 0.1) & (alpha < 0.9)]
    unkind = ind[(alpha > 0.1) & (alpha < 0.9)]

    #nearest unknown pixels to unknown
    kdt = KDTree(unk, leaf_size=30, metric='euclidean')
    nu = kdt.query(unk, k=kuu, return_distance=False)
    unk_nbr_true_ind = unkind[nu]
    unk_nu_ind = np.asarray(
        [int(i / kuu) for i in range(nu.shape[0] * nu.shape[1])])
    unk_nu_true_ind = unkind[unk_nu_ind]

    nbr = unk[nu]
    nbr = np.swapaxes(nbr, 1, 2)
    unk = unk.reshape((unk.shape[0], unk.shape[1], 1))

    x = nbr - unk
    x = np.abs(x)
    print(x.shape)
    y = 1 - np.sum(x, axis=1)
    y[y < 0] = 0
    # print(y.shape)

    row = unk_nu_true_ind
    col = unk_nbr_true_ind.ravel()
    data = y.ravel()
    z = csr((data, (col, row)), shape=(N, N))
    w = csr((data, (row, col)), shape=(N, N))
    # z = csr((data,(col,row)),shape=(h*w,h*w))
    w = w + z
    return w
Example #18
	def input_to_features(self, feature_list, unhashed_features, hashed_features, unhashed_csr_file, hashed_csr_file, num_features):
		'''
		Converts the parsed input file to features and returns a scipy csr matrix
		Parameters:
			feature_list: list of features in order of input
			unhashed_features: top n features
			hashed_features: top m features (hashed)
			unhashed_csr_file: file where csr matrix is to be stored (unhashed)
			hashed_csr_file: file where csr matrix is to be stored (hashed)
			num_features: number of features
		'''
		row_arr = []
		col_arr_unhashed = []
		col_arr_hashed = []
		data_arr = []
		row = 0
		for text_arr in feature_list:
			#add 1 corresponding to x_0 and w_0
			row_arr.append(row)
			col_arr_unhashed.append(0)
			col_arr_hashed.append(0)
			data_arr.append(1)
			for term in text_arr:
				term = term.encode('ascii', 'ignore')
				try:					
					col_unhashed = unhashed_features[term]						
					col_hashed = hashed_features[term]						
					row_arr.append(row)
					col_arr_unhashed.append(col_unhashed+1)
					col_arr_hashed.append(col_hashed+1)
					data_arr.append(1)					
				except KeyError:
					pass
			row += 1
			if row%5000==0:
				print(row)
		mat = csr((data_arr, (row_arr, col_arr_unhashed)), shape=(row, num_features+1))
		self.save_sparse_csr(unhashed_csr_file, mat)				
		mat = csr((data_arr, (row_arr, col_arr_hashed)), shape=(row, num_features+1))
		self.save_sparse_csr(hashed_csr_file, mat)				
Example #19
 def __init__(self, factorList=None, copy=True, isLog=False):
     """Take in a list of factors and convert & store them in the internal format
   Can also accept a matrix of Ising parameters
 """
     if factorList is None:
         self.h = np.zeros(0)
         self.L = csr((0, 0))
         return
     if not isinstance(factorList[0],
                       Factor):  # not a factor list => matrix?
         L = coo(factorList)
         LL = csr(factorList)
         n = L.shape[0]
         self.h = np.array([LL[i, i] for i in range(n)])
         # extract diagonal
         self.dims = np.array([2 for i in range(n)], dtype=int)
         # all variables binary
         keep = (L.row != L.col)
         data, row, col = L.data[keep], L.row[keep], L.col[keep]
         #for j in np.where(L.row > L.col): row[j],col[j] = col[j],row[j]
         self.L = csr((data, (row, col)),
                      shape=(n, n))  # keep in csr format
         self.L = .5 * (
             self.L + self.L.T
         )  # force symmetric if not (TODO: detect zeros & overwrite?)
     else:
         n = np.max(
             [np.max(f.vars.labels) for f in factorList if len(f.vars)]) + 1
         assert np.max([np.max(f.vars.dims()) for f in factorList
                        ]) <= 2, "Variables must be binary"
         assert np.max([f.nvar for f in factorList
                        ]) <= 2, "Factors must be pairwise"
         self.dims = np.zeros((n, ), dtype=int)
         for f in factorList:
             for v in f.vars:
                 self.dims[v] = v.states
         self.h = np.zeros(n)
         self.L = csr(([], ([], [])), shape=(n, n))
         self.addFactors(factorList, isLog=isLog)
Example #20
def get_banded(a: np.ndarray, p, q):
    """
    Converts the matrix a into a banded scipy.sparse.csr_matrix object with
    lower and upper bandwidths p and q, respectively. Returns a CSR matrix
    of the same shape and banded entries as a.
    """

    for i in range(a.shape[0]):
        for j in range(a.shape[1]):
            if i > j + p or j > i + q:
                a[i, j] = 0.

    return csr(a)
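A usage sketch (not from the original source); note that get_banded writes the zeros into a in place before converting:

import numpy as np

a = np.arange(16, dtype=float).reshape(4, 4)
print(get_banded(a, p=1, q=1).toarray())
# [[ 0.  1.  0.  0.]
#  [ 4.  5.  6.  0.]
#  [ 0.  9. 10. 11.]
#  [ 0.  0. 14. 15.]]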
Example #21
def LBP(ising, maxIter=100, verbose=False):
    """Run loopy belief propagation (specialized for Ising models)
       lnZ, bel = LBP(ising, maxIter, verbose)
       lnZ : float, estimate of the log partition function
       bel : vector, bel[i] = estimated marginal probability that Xi = +1
    """
    # TODO: pass requested beliefs (like JT?), or "single", "factors", etc.
    assert isinstance(ising, Ising), "Model must be an Ising model for this version to work"
    R = ising.L.tocoo()
    row, col = R.row, R.col
    mu = csr(([], ([], [])), shape=ising.L.shape)
    L_tanh = ising.L.tanh()
    for it in range(maxIter):
        mu_sum = arr(mu.sum(0)).reshape(-1)
        #R = csr( (ising.h[row]+mu_sum[row], (row,col)), shape=ising.L.shape) - mu.T
        R = csr((ising.h[row] + mu_sum[row] - arr(mu[col, row]).reshape(-1), (row, col)), shape=ising.L.shape)
        mu = (L_tanh.multiply(R.tanh())).arctanh()
        if verbose: print("Iter " + str(it) + ": " + str(__Bethe(ising, R, mu)))

    R = csr((ising.h[row] + mu_sum[row] - arr(mu[col, row]).reshape(-1), (row, col)), shape=ising.L.shape)
    bel = 1./(1+np.exp(-2.*(arr(mu.sum(0)).reshape(-1)+ising.h)))
    lnZ = __Bethe(ising,R,mu,bel)
    return lnZ, bel
Example #22
def writeSparseMatrix(nDays):
    for i in range(nDays):
        fil="day"+"%d"%i+"PoissonParametersNonHom.txt"
        A=np.loadtxt(os.path.join("NonHomegeneousPP",fil))
        fil2="daySparse"+"%d"%i+"PoissonParametersNonHom.txt"
        f=open(os.path.join("NonHomogeneousPP2",fil2),'w')
        A=csr(A)
        temp=A.nonzero()
        temp2=np.array([A.data,temp[0],temp[1]])
        np.savetxt(f,temp2)
        f.close()

        fil="day"+"%d"%i+"ExponentialTimesNonHom.txt"
        A=np.loadtxt(os.path.join("NonHomegeneousPP",fil))
        fil2="daySparse"+"%d"%i+"ExponentialTimesNonHom.txt"
        f=open(os.path.join("NonHomogeneousPP2",fil2),'w')
        A=csr(A)
        temp=A.nonzero()
        temp2=np.array([A.data,temp[0],temp[1]])
        
        np.savetxt(f,temp2)
        f.close()
Example #23
def writeSparseMatrix(nDays):
    for i in range(nDays):
        fil = "day" + "%d" % i + "PoissonParametersNonHom.txt"
        A = np.loadtxt(os.path.join("NonHomogeneousPP2", fil))
        fil2 = "daySparse" + "%d" % i + "PoissonParametersNonHom.txt"
        f = open(os.path.join("SparseNonHomogeneousPP2", fil2), 'w')
        A = csr(A)
        temp = A.nonzero()
        temp2 = np.array([A.data, temp[0], temp[1]])
        np.savetxt(f, temp2)
        f.close()

        fil = "day" + "%d" % i + "ExponentialTimesNonHom.txt"
        A = np.loadtxt(os.path.join("NonHomogeneousPP2", fil))
        fil2 = "daySparse" + "%d" % i + "ExponentialTimesNonHom.txt"
        f = open(os.path.join("SparseNonHomogeneousPP2", fil2), 'w')
        A = csr(A)
        temp = A.nonzero()
        temp2 = np.array([A.data, temp[0], temp[1]])

        np.savetxt(f, temp2)
        f.close()
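A hedged sketch of the matching reader (readSparseMatrix is not in the original source): the writer above saves a 3 x nnz array of (data, row, col), so the matrix can be rebuilt with the (data, (row, col)) constructor. Assumes more than one stored entry, so loadtxt returns a 2-D array.

import numpy as np
from scipy.sparse import csr_matrix as csr

def readSparseMatrix(fname, shape):
    data, row, col = np.loadtxt(fname)
    return csr((data, (row.astype(int), col.astype(int))), shape=shape)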
Example #24
	def row_agg(self, mat, mapping, rowK):
		'''
		Aggregates the row (document) vectors
		Params:
			mat: original matrix
			mapping: the row mapping (to cluster centroids)
			rowK: number of document clusters
		'''			
		agg_mat = np.zeros(shape=(rowK, mat.shape[1]))		
		i = 0
		for key in mapping:						
			agg_mat[key].__iadd__(mat[i,:])
			i += 1
		return csr(agg_mat)
Example #25
def test_validate_commutator():
    symm_class = ['AI', 'AII', 'D', 'C', 'AIII', 'BDI']
    sym_dict = {
        'AI': ['Time reversal'],
        'AII': ['Time reversal'],
        'D': ['Particle-hole'],
        'C': ['Particle-hole'],
        'AIII': ['Chiral'],
        'BDI': ['Time reversal', 'Particle-hole', 'Chiral']
    }
    n = 10
    rng = 10
    for sym in symm_class:
        # Random matrix in symmetry class
        h = kwant.rmt.gaussian(n, sym, rng=rng)
        if kwant.rmt.p(sym):
            p_mat = np.array(kwant.rmt.h_p_matrix[sym])
            p_mat = csr(np.kron(np.identity(n // len(p_mat)), p_mat))
        else:
            p_mat = None
        if kwant.rmt.t(sym):
            t_mat = np.array(kwant.rmt.h_t_matrix[sym])
            t_mat = csr(np.kron(np.identity(n // len(t_mat)), t_mat))
        else:
            t_mat = None
        if kwant.rmt.c(sym):
            c_mat = csr(np.kron(np.identity(n // 2), np.diag([1, -1])))
        else:
            c_mat = None
        disc_symm = DiscreteSymmetry(particle_hole=p_mat,
                                     time_reversal=t_mat,
                                     chiral=c_mat)
        assert disc_symm.validate(h) == []
        a = random_onsite_hop(n, rng=rng)[1]
        for symmetry in disc_symm.validate(a):
            assert symmetry in sym_dict[sym]
Example #26
def test_eval_sparse_seq_0(batch_index_data, device_id):
    if cntk_device(device_id) != cpu():  # FIXME
        pytest.skip("sparse is not yet supported on GPU")
    dim = 10
    multiplier = 2
    in1 = input_variable(shape=(dim, ), is_sparse=True)
    z = times(in1, np.eye(dim).astype(np.float32))
    z *= multiplier
    batch = [(np.eye(dim)[seq_index_data]).astype(np.float32)
             for seq_index_data in batch_index_data]
    expected = batch * multiplier
    sparse_val = [csr(seq) for seq in batch]
    result = z.eval({in1: sparse_val}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b)
                   for a, b in zip(result, expected)])
Example #27
def set_diagonal(matrix, new):
    # WARNING: `new` is expected to be a sparse CSR matrix
    # (as opposed to what is expected in set_new)
    matrix.eliminate_zeros()
    new.eliminate_zeros()
    rows, cols = matrix.nonzero()
    data = matrix.data
    old = rows != cols
    rows_old, cols_old = rows[old], cols[old]
    data_old = data[old]
    rows_cols_new = new.nonzero()[0]
    data_new = new.data
    cols_, rows_ = np.concatenate([cols_old, rows_cols_new],
                                  0), np.concatenate([rows_old, rows_cols_new],
                                                     0)
    data_ = np.concatenate([data_old, data_new], 0)
    return csr((data_, (rows_, cols_)), shape=matrix.shape)
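Usage sketch: replace the diagonal of a sparse matrix while keeping its off-diagonal entries; both arguments are CSR, as the warning above requires.

import numpy as np
from scipy.sparse import csr_matrix as csr

m = csr(np.array([[1., 2.], [3., 4.]]))
d = csr(np.diag([9., 9.]))
print(set_diagonal(m, d).toarray())
# [[9. 2.]
#  [3. 9.]]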
Example #28
	def normalize_rows(self, mat, mapping):
		'''
		Normalizes the row clusters
		Parameters:
			mat: matrix
			mapping: mapping of row to cluster
		'''
		cluster_sum = [0] * mat.shape[0]
		for key in mapping:
			cluster_sum[key] += 1.
			
		for row_no in range(mat.shape[0]):
			if cluster_sum[row_no] == 0:
				cluster_sum[row_no] = 1.

		cluster_sum = np.array(cluster_sum)		
		return csr((mat.T.toarray().__mul__(1/cluster_sum)).T)
Example #29
	def kron_s(A, B, sec):
		fp = A.fparity
		ss = A.sym_sum
		sym = A.sym
		dim = sym * A.dim * B.dim
		ret = csr((dim, dim), dtype=np.complex)
		for row1 in range(sym):
			for col1 in range(sym):
				row2 = ss(sec, -row1)
				col2 = ss(sec, -col1)
				if not A._empty_(row1, col1) and not B._empty_(row2, col2):
					sign = 1 - 2*( fp[col1] * fp[ss(col2, -row2)] )
					temp = coo( sparse.kron(A.val[row1][col1], B.val[row2][col2]) * sign )
					block = A.dim * B.dim
					add = coo((temp.data, (temp.row + row1*block, temp.col + col1*block)),(dim, dim), dtype=np.complex)
					ret += add
		return ret
Example #30
	def __init__(self, sym=2, dim=1, datatype=np.float, fparity=None, sym_sum=None):

		def sym_sum_n(a, b):
			return (a+b+sym)%sym

		self.sym = sym		# number of total symmetry sectors
		self.dim = dim		# dimension of each symmetry sector
		self.datatype = datatype

		if fparity is None: self.fparity = np.zeros(sym, dtype=np.int)		# fermion parity
		else: self.fparity = fparity

		if sym_sum is None: self.sym_sum = sym_sum_n		# the rule to sum the symmetries
		else: self.sym_sum = sym_sum

		self.val = [[csr((dim,dim), dtype=datatype) for x in range(sym)] for y in range(sym)]
		self.basis = None
		self.L = 1
Example #31
	def normalize_cols(self, mat, mapping):
		'''
		Normalizes the column clusters
		Parameters:
			mat: matrix
			mapping: mapping of column to cluster
		'''
		cluster_sum = [0] * mat.shape[1]
		for key in mapping:
			cluster_sum[key] += 1.
			
		for col_no in range(mat.shape[1]):
			if cluster_sum[col_no] == 0:
				cluster_sum[col_no] = 1.

		cluster_sum = np.array(cluster_sum)		
		return csr(mat.toarray().__mul__(1/cluster_sum))
		
Example #32
def fp_fv_mod(x,y,time,m_tmp,a1_tmp,a2_tmp,D11,D22,D12,dt,dx):
	I = x.size
	J = y.size
	eye = sparse.identity((I*J),format='lil')
	#get function values
	[f1_array, f2_array] = iF.f_global(time,x,y,a1_tmp,a2_tmp)
	D11 = iF.Sigma_D11_test(time,x,y,a1_tmp,a2_tmp,m_tmp)
	D12 = iF.Sigma_D12_test(time,x,y,a1_tmp,a2_tmp,m_tmp)
	D22 = iF.Sigma_D22_test(time,x,y,a1_tmp,a2_tmp,m_tmp)
	#make matrices
	LHS = mg.add_diffusion_flux_Ometh(eye,D11,D22,D12,I,J,dx,dt)
	LHS = sparse.csr_matrix(LHS)
	RHS = mg.fp_fv_convection(time,x,a_tmp,m_tmp,dt,dx)
	#print LHS
	#print RHS
	#print ss
	#LHS = sparse.csr(sparse.eye(I)-mg.fp_fv_diffusion(time,x,a_tmp,m_tmp,dt,dx))
	#RHS = sparse.csr(sparse.eye(I)+mg.fp_fv_convection(time,x,a_tmp,m_tmp,dt,dx))
	return sparse.linalg.spsolve(LHS,RHS*m_tmp)
Example #33
 def addFactors(self, flist, copy=True, isLog=False):
   """Add a list of (binary, pairwise) factors to the model; factors are converted to Ising parameters"""
   row = np.zeros(2*len(flist), dtype=int) - 1
   col = row.copy()
   data = np.zeros(2*len(flist))
   for k,f in enumerate(flist):
     if not isLog: 
         if np.any(f.t<=0): f = f+1e-10;   # TODO: log nonzero tol
         f = f.log()
     if f.nvar == 1:
       Xi = f.vars[0]
       self.h[Xi] += .5*(f[1]-f[0])
       self.c += .5*(f[1]+f[0])
     else:
       Xi,Xj = f.vars[0],f.vars[1]
       row[2*k],col[2*k],data[2*k] = int(Xi),int(Xj), .25*(f[1,1]+f[0,0]-f[0,1]-f[1,0])
       row[2*k+1],col[2*k+1],data[2*k+1] = col[2*k],row[2*k],data[2*k]          
       #L[Xi,Xj] += .25*(f[1,1]+f[0,0]-f[0,1]-f[1,0])
       self.h[Xi] += .5*(f[1,0]-f[0,0])+data[2*k] #L[Xi,Xj]
       self.h[Xj] += .5*(f[0,1]-f[0,0])+data[2*k] #L[Xi,Xj]
       self.c += .25*(f[1,1]+f[1,0]+f[0,1]+f[0,0])
   self.L += csr((data[row>=0], (row[row>=0], col[row>=0])), shape=(self.nvar, self.nvar))
Example #34
	def file_to_csr(self, file_name):
		'''
		Converts file to list of maps where each map represents a doc vector
		'''
		f = open(file_name, 'r')				
		count = 0
		row_arr = []
		col_arr = []
		val_arr = []		
		max_col = 0
		for line in iter(lambda: f.readline().rstrip(), ''):								
			for x in line.split(' '):
				col = int(x.split(':')[0])
				if col>max_col:
					max_col = col
				col_arr.append(col)	 			
				val_arr.append(int(x.split(':')[1]))
				row_arr.append(count)			
			count += 1														
		return csr((val_arr, (row_arr, col_arr)), shape=(count, max_col+1)).tocsc()
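A usage sketch (the file format is inferred from the parser above, with obj standing in for an instance of the class): each line holds space-separated column:count pairs for one document.

with open('docs.txt', 'w') as f:
    f.write('0:1 5:2 17:1\n')
    f.write('0:1 3:4\n')

mat = obj.file_to_csr('docs.txt')
print(mat.shape)   # (2, 18): two documents, max column index 17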
Example #35
    def __init__(self,
                 coor_x,
                 coor_y,
                 val,
                 max_rat,
                 min_rat,
                 user_num,
                 movie_num,
                 U=None,
                 V=None,
                 alpha=2.0,
                 mu0=0.0,
                 D=30,
                 T=100):
        """
        coor和val是表示稀疏的评分矩阵
        coor是一个列表,每一项为一个列表,此列表第一项为用户,第二项为电影
        val是一个列表,每一项为coor中对应的评分
        """
        self.alpha = alpha
        self.mu0 = mu0
        self.D = D
        self.v0 = D
        self.beta0 = 2.0

        self.val = val
        self.N = user_num
        self.M = movie_num

        self.max_rat = max_rat
        self.min_rat = min_rat

        self.T = T
        self.R = csr((val, (coor_x, coor_y)), shape=(self.N, self.M))
        self.U = np.random.normal(size=(self.N, self.D))
        self.V = np.random.normal(size=(self.M, self.D))

        self.W0_user = np.eye(self.D)
        self.W0_item = np.eye(self.D)

        self.rmses = []
Example #36
def bandedLU(M: csr, ml, mu):
    """
    Computes standard LU decomposition of a class 'scipy.sparse.csr.csr_matrix'
    banded square matrix M with lower and upper bandwidths ml and mu, respectively.
    Returns L and U as sparse CSR matrices.
    """

    m = M.shape[0]
    u = M.copy()  # can remove to act directly on M

    # Allocating memory to store nnzl number of non-zero entries of L
    nnzl = int(m * (ml + 1) - ml * (ml + 1) / 2)
    l_row = np.zeros(nnzl).astype(np.int_)
    l_val = np.ones(nnzl).astype(M.dtype)

    for i in range(m):
        l_row[i] = i
    l_col = l_row.copy()
    count = i + 1  # counter for the next entry of L

    for k in range(m - 1):
        column_entries_ind = u.indptr[k] + (
            u.indices[u.indptr[k]:u.indptr[min(k + ml + 1, m)]]
            == k).nonzero()[0]
        for i, ind in enumerate(column_entries_ind[1:]):
            l = u.data[ind] / u.data[column_entries_ind[0]]
            l_val[count] = l
            l_col[count] = k
            l_row[count] = int(k + i + 1)
            count += 1

            b = min(mu + 1, m - k)
            u.data[ind + 1:ind + b] -= l * u.data[column_entries_ind[0] +
                                                  1:column_entries_ind[0] + b]
            u.data[ind] = 0.

    u.eliminate_zeros()
    l = csr((l_val, (l_row, l_col)))

    return l, u
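A usage sketch: factor a small tridiagonal matrix and check that the factors reproduce M (assuming the csr alias used by the other examples).

import numpy as np
from scipy.sparse import csr_matrix as csr

M = csr(np.array([[ 2., -1.,  0.],
                  [-1.,  2., -1.],
                  [ 0., -1.,  2.]]))
L, U = bandedLU(M, ml=1, mu=1)
assert np.allclose((L @ U).toarray(), M.toarray())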
Example #37
    def fractionalflow(self):

        self.fw = (self.krw * self.muo) / (self.krw * self.muo +
                                           self.kro * self.muw)

        N = self.fw.size

        one = np.ones(N - 1)

        idx = np.array(list(range(N)))

        row = np.concatenate(((idx[0], idx[-1]), idx[:-1], idx[1:]))
        col = np.concatenate(((idx[0], idx[-1]), idx[1:], idx[:-1]))

        val = np.concatenate(((-1, 1), one, -one))

        G = csr((val, (row, col)), shape=(N, N))

        fw_diff = G * self.fw
        Sw_diff = G * self.Sw

        self.fw_der = fw_diff / Sw_diff
Example #38
def generate_matrix(df, r, g_item1, g_item2):
    at1 = pd.to_numeric(df[g_item1[0]])
    op1 = g_item1[1]
    
    at2 = pd.to_numeric(df[g_item2[0]])
    op2 = g_item2[1]
    
    N = len(at1)
    matrix = np.zeros((N, N)) # Length is the number of transactions...
    
    if op1 == '>' and op2 == '>': 
        for i in range(N):
            for j in range(N):
                if i != j:
                    matrix[i,j] = Concordance_degree([at1[i], at1[j]],[at2[i], at2[j]],r)
                    
    if op1 == '>' and op2 == '<': 
        for i in range(N):
            for j in range(N):
                if i != j:
                    matrix[i,j] = Concordance_degree([at1[i], at1[j]],[at2[j], at2[i]],r)
                    
    return csr(matrix)
Example #39
 def kron_s(A, B, sec):
     fp = A.fparity
     ss = A.sym_sum
     sym = A.sym
     dim = sym * A.dim * B.dim
     ret = csr((dim, dim), dtype=np.complex)
     for row1 in range(sym):
         for col1 in range(sym):
             row2 = ss(sec, -row1)
             col2 = ss(sec, -col1)
             if not A._empty_(row1, col1) and not B._empty_(row2, col2):
                 sign = 1 - 2 * (fp[col1] * fp[ss(col2, -row2)])
                 temp = coo(
                     sparse.kron(A.val[row1][col1], B.val[row2][col2]) *
                     sign)
                 block = A.dim * B.dim
                 add = coo(
                     (temp.data,
                      (temp.row + row1 * block, temp.col + col1 * block)),
                     (dim, dim),
                     dtype=np.complex)
                 ret += add
     return ret
Example #40
def load_data(input_file, test_percentage):
    data = np.loadtxt(input_file)
    
    rating = data[:, 2]         #1-5
    number_ratings = len(rating)
    
    user_ids = data[:, 0].astype(int)
    user_ids -= 1           #convert to 0 based indexing
    movie_ids = data[:, 1].astype(int)
    movie_ids -= 1          #convert to 0 based indexing
    reviews = csr((rating, (user_ids, movie_ids)), shape = (max(user_ids) + 1, max(movie_ids) + 1))
    reviews = reviews.toarray()
    
    test_idxs = np.array(random.sample(range(number_ratings), number_ratings // test_percentage))
    
    train_reviews = np.array(reviews)
    for idx in test_idxs:
        train_reviews[user_ids[idx]][movie_ids[idx]] = 0
    
    test_reviews = np.zeros_like(reviews)
    for idx in test_idxs:
        test_reviews[user_ids[idx]][movie_ids[idx]] = reviews[user_ids[idx]][movie_ids[idx]]

    return train_reviews, test_reviews
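Input sketch (the format is inferred from the column slicing above): input_file is read with np.loadtxt, so it should hold whitespace-separated rows of user_id movie_id rating with 1-based ids, for example:

1 1 5
1 3 4
2 1 3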
Example #41
    def __init__(self, mesh, **kwargs):

        LinearSimulation.__init__(self, mesh, **kwargs)

        # Find non-zero cells
        if getattr(self, "actInd", None) is not None:
            if self.actInd.dtype == "bool":
                indices = np.where(self.actInd)[0]
            else:
                indices = self.actInd

        else:

            indices = np.asarray(range(self.mesh.nC))

        self.nC = len(indices)

        # Create active cell projector
        projection = csr(
            (np.ones(self.nC), (indices, range(self.nC))), shape=(self.mesh.nC, self.nC)
        )

        # Create vectors of nodal location for the lower and upper corners
        bsw = self.mesh.gridCC - self.mesh.h_gridded / 2.0
        tne = self.mesh.gridCC + self.mesh.h_gridded / 2.0

        xn1, xn2 = bsw[:, 0], tne[:, 0]
        yn1, yn2 = bsw[:, 1], tne[:, 1]

        self.Yn = projection.T * np.c_[mkvc(yn1), mkvc(yn2)]
        self.Xn = projection.T * np.c_[mkvc(xn1), mkvc(xn2)]

        # Allows for 2D mesh where Zn is defined by user
        if self.mesh.dim > 2:
            zn1, zn2 = bsw[:, 2], tne[:, 2]
            self.Zn = projection.T * np.c_[mkvc(zn1), mkvc(zn2)]
Example #42
def split_data(X, test_percentage=10):
    X = csr(X)
    number_ratings = len(X.indices)
    user_ids = []
    indptr = X.indptr
    for i in range(len(indptr) - 1):
        t1 = indptr[i + 1] - indptr[i]
        for j in range(t1):
            user_ids.append(i)
    movie_ids = X.indices
    test_idxs = np.array(
        random.sample(range(number_ratings), number_ratings // test_percentage))

    X = X.toarray()
    train_reviews = np.array(X)
    for idx in test_idxs:
        train_reviews[user_ids[idx]][movie_ids[idx]] = 0

    test_reviews = np.zeros_like(X)
    for idx in test_idxs:
        test_reviews[user_ids[idx]][movie_ids[idx]] = X[user_ids[idx]][
            movie_ids[idx]]

    return train_reviews, test_reviews
Example #43
def ls_featuresign_sub(A, y, AtA, Aty, gamma, xinit=None):

	L, M = A.shape

	rankA = min(A.shape[0] - 10, A.shape[1] - 10)

	# Step 1: initialize
	usexinit = False
	if xinit is None:
		x = ssp.csr_matrix(np.zeros((M, 1)))
		theta = ssp.csr_matrix(np.zeros((M, 1)))
		act = ssp.csr_matrix(np.zeros((M, 1)))
		allowZero = False
	else:
		x = ssp.csr_matrix(xinit)
		theta = ssp.csr_matrix(x)
		act = ssp.csr_matrix(np.abs(theta.toarray()))
		usexinit = True
		allowZero = True

	#[TO BE INSERTED] debug file

	fobj = 0

	ITERMAX = 1000
	optimality1 = False
	for iter in range(ITERMAX):
		act_indx0 = np.where(act.toarray().ravel() == 0)[0]
		grad = AtA.dot(ssp.csr_matrix(x)) - Aty
		theta = np.sign(x.toarray())

		optimality0 = False
		# step 2: pick the inactive coefficient with the largest gradient
		grad0 = np.abs(np.asarray(grad[act_indx0].todense()).ravel())
		mx = grad0.max() if grad0.size > 0 else None
		indx = grad0.argmax() if grad0.size > 0 else None

		if mx is not None and mx >= gamma and (iter > 1 or not usexinit):
			act[act_indx0[indx], 0] = 1
			theta[act_indx0[indx], 0] = -np.sign(grad[act_indx0[indx], 0])
			usexinit = False
		else:
			optimality0 = True
			if optimality1:
				break

		act_indx1 = np.where(act.toarray().ravel() == 1)[0]

		if act_indx1.size > rankA:
			print("warning: sparsity penalty is too small: too many coefficients are activated!")
			return

		if act_indx1.size == 0:
			if allowZero:
				allowZero = False
				continue
			return

		k = 0
		while 1:
			k += 1

			if k > ITERMAX:
				print("Maximum number of iterations reached. The solution may not be optimal.")
				return

			if act_indx1.size == 0:
				if allowZero:
					allowZero = False
					break
				return

			# step 3
			x, theta, act, act_indx1, optimality1, lsearch, fobj = compute_FS_step(x, A, y, AtA, Aty, theta, act, act_indx1, gamma)

			# step 4
			if optimality1:
				break
			if lsearch > 0:
				continue

		if iter >= ITERMAX:
			print("maximum number of iterations reached. The solution may not be optimal")

		#[add later] if 0 #check optimality

	fobj = fobj_featersign(x, A, y, AtA, Aty, gamma)

	return x, fobj
Example #44
File: vpesvm.py Project: kiankd/vpe
    def devtest(self, start_section, end_section, verbose=False):
        print('Predicting...')

        #Applying 'csr' to the vectors makes a sparse matrix.
        predictions = self.hyperplane.predict(csr(self.getfeaturevectors(start_section, end_section)))
        self.compare(self.getgsdata(start_section, end_section), predictions, start_section, verbose=verbose)
Example #45
@pytest.mark.parametrize("batch_index_data", [
    [0, 1, 6],
])
def test_eval_sparse_no_seq(batch_index_data, device_id):
    dim = 10
    multiplier = 2
    for var_is_sparse in [True, False]:
        in1 = input_variable(shape=(dim,), is_sparse=var_is_sparse)
        z = times(in1, multiplier*np.eye(dim))
        batch = np.eye(dim)[batch_index_data]
        expected = batch * multiplier
        sparse_val = csr(batch.astype('f'))
        result = z.eval({in1: [sparse_val]}, device=cntk_device(device_id))
        assert np.allclose(result, [expected])

@pytest.mark.parametrize("batch", [
    [csr([0,1,2,0])],
    [
        csr([[0, 2, 0, 7], [10, 20, 0, 0]]),
        csr([0, 0, 0, 3])
    ]
    ])
def test_eval_sparse_seq_1(batch, device_id):
    dim = 4
    multiplier = 2
    for var_is_sparse in [True, False]:
        in1 = input_variable(shape=(dim,), is_sparse=var_is_sparse)
        z = times(in1, multiplier*np.eye(dim))
        if isinstance(batch[0], list):
            expected = [np.vstack([m.todense() * multiplier for m in seq]) for seq in
                    batch]
        else:
            expected = [seq.todense() * multiplier for seq in batch]
        result = z.eval({in1: batch}, device=cntk_device(device_id))
        assert np.all([np.allclose(a, b) for a, b in zip(result, expected)])
Example #46
	def batched_gradient_descent(self, feature_file, y_file, w_final_file, num_features, alpha, batch_size, lambda_var, cv_percentage):
		'''
		Batched gradient descent function
		Parameters:
			feature_file: file containing features in sparse scipy format
			y_file: file containing ratings in same order as feature_file
			w_final_file: file storing final w as csr matrix
			num_features: number of features
			alpha: value of alpha for gradient descent 
			batch_size: size of each batch 
			lambda_var: value of lambda 
			cv_percentage: percentage of dataset to be used for cross validation
		'''
		x_mat = self.load_sparse_csr(feature_file)	#feature matrix
		x_mat = csr(x_mat[:,1:])		#removing the first (all 1) feature
		min_rating = 1000
		max_rating = 0
		y_arr = []	#y values corr to feature matrix				
		f = open(y_file, 'r')
		stars_list = ast.literal_eval(f.read())
		for rating in stars_list:
			if rating>max_rating:
				max_rating = rating
			if rating<min_rating:
				min_rating = rating
			y_arr.append(rating)		
		f.close()	
		y_mat = np.zeros((max_rating-min_rating+1, len(y_arr)))	#5 x num_input_examples
		#y_mat = csr((y_arr,(row_arr, col_arr)), shape=(1, len(col_arr)))
		col = 0
		for rating in y_arr:
			y_mat[rating-1][col] = 1
			col += 1		
		#init max_rating-min_rating+1 number of feature vectors; init with all weights as 1/n
		num_vectors = max_rating-min_rating+1
		#w_mat = csr(np.random.rand(num_vectors, num_features))		
		w_mat = np.zeros(shape = (num_vectors, num_features))
		w_mat[:] = 1./num_features
		w_mat = csr(w_mat)
		
		#parameters
		m = batch_size 	#batch size
		total_x = len(y_arr)

		#determine cross validation dataset
		cv_size = int((cv_percentage*0.01*total_x)%m) * m		
		max_start = int(total_x/m) * m - m - cv_size
		cv_start = 0
		while True:
			rand_start = random.randint(0, max_start)
			cv_start = int(rand_start/m) * m
			if cv_start <= max_start:
				break			
		
		#iterate
		i = 0
		j = 0
		old_hard_accuracy = 0.
		old_soft_accuracy = 0.
		iter_no = 1
		#gold_y = y_arr[cv_start:(cv_start+cv_size)]				
		#x_crossval = x_mat[cv_start:(cv_start+cv_size),:]	
		temp_alpha = alpha			
		while True:
			if i==cv_start:
				i = cv_start + cv_size
			if i>=total_x:				
				#check stopping condition: cross validation error
				#NOTE: soft_accuracy = rmse
				gold_y = y_arr[cv_start:(cv_start+cv_size)]				
				x_crossval = x_mat[cv_start:(cv_start+cv_size),:]								
				hard_pred, soft_pred = self.predict_class(False, None, None, None, x_crossval, w_mat)
				hard_accuracy, soft_accuracy = self.compute_cv_error(hard_pred, soft_pred, gold_y)
				print(hard_accuracy, soft_accuracy)
				if math.fabs(hard_accuracy-old_hard_accuracy)<0.01 and iter_no>10:
					print('FINAL TRAINING ACCURACY:')
					hard_pred, soft_pred = self.predict_class(False, None, None, None, x_mat, w_mat)
					hard_accuracy, rmse = self.compute_cv_error(hard_pred, soft_pred, y_arr)
					print(hard_accuracy, rmse)
					print('--------------------------')
					break
				'''
				w_sq = ((w_mat-w_mat_prev).toarray())**2
				print np.sqrt(np.sum(w_sq))	
				if np.sqrt(np.sum(w_sq)) < 0.01:
					break
				'''
				i = 0				
				old_hard_accuracy = hard_accuracy
				old_soft_accuracy = soft_accuracy							
				#get a new set of examples to be used for cross validation: random cross-validation
				cv_start = 0
				while True:
					rand_start = random.randint(0, max_start)
					cv_start = int(rand_start/m) * m
					if cv_start <= max_start:
						break		
				iter_no += 1			
				#alpha /= (iter_no**2)
				alpha /= (2**iter_no)
				#alpha = temp_alpha / (iter_no**2)
				#alpha *= 0.8
				continue

			j = min(i+m-1, total_x-1)	#total m examples in a batch
			x_batch_mat = csr(x_mat[i:j+1,:])							#m x num_features
			w_dot_x = w_mat.dot(x_batch_mat.transpose()).transpose()	#m x 5
			w_dot_x_arr = w_dot_x.toarray()
			w_dot_x_arr[w_dot_x_arr>20] = 20
			w_dot_x_arr[w_dot_x_arr<-20] = -20
			w_dot_x = csr(w_dot_x_arr)
			w_dot_x_exp = csr(np.exp(w_dot_x.toarray()))				#m x 5
			w_dot_x_sum = w_dot_x_exp.sum(axis=1)						#row sum (m x 1)
			
			x_div_mat = csr(w_dot_x_exp.toarray()/w_dot_x_sum)			#m x 5			
			y_batch_mat = csr(y_mat[:,i:j+1].transpose())				#m x 5
			sub_mat = csr(y_batch_mat.toarray() - x_div_mat.toarray())	#m x 5
			sum_by_m_mat = csr(sub_mat.transpose().dot(x_batch_mat)/m)	#5 x num_features

			w_mat_prev = w_mat
			w_mat = w_mat + alpha * (sum_by_m_mat - lambda_var * w_mat)						
			i = j + 1			

		self.save_sparse_csr(w_final_file, w_mat)
Example #47
@pytest.mark.parametrize("batch_index_data", [
    [0, 1, 6],
])
def test_eval_sparse_no_seq(batch_index_data, device_id):
    dim = 10
    multiplier = 2
    for var_is_sparse in [True, False]:
        in1 = sequence.input(shape=(dim, ), is_sparse=var_is_sparse)
        z = times(in1, multiplier * np.eye(dim))
        batch = np.eye(dim)[batch_index_data]
        expected = batch * multiplier
        sparse_val = csr(batch.astype('f'))
        result = z.eval({in1: [sparse_val]}, device=cntk_device(device_id))
        assert np.allclose(result, [expected])


@pytest.mark.parametrize("batch", [[csr(
    [0, 1, 2, 0])], [csr([[0, 2, 0, 7], [10, 20, 0, 0]]),
                     csr([0, 0, 0, 3])]])
def test_eval_sparse_seq_1(batch, device_id):
    dim = 4
    multiplier = 2
    for var_is_sparse in [True, False]:
        in1 = sequence.input(shape=(dim, ), is_sparse=var_is_sparse)
        z = times(in1, multiplier * np.eye(dim))
        if isinstance(batch[0], list):
            expected = [
                np.vstack([m.todense() * multiplier for m in seq])
                for seq in batch
            ]
        else:
            expected = [seq.todense() * multiplier for seq in batch]
        result = z.eval({in1: batch}, device=cntk_device(device_id))
        assert np.all([np.allclose(a, b) for a, b in zip(result, expected)])
Example #48
	def load_sparse_csr(self, filename):
		'''
		Loads a sparse matrix
		'''
		loader = np.load(filename)
		return csr((loader['data'], loader['indices'], loader['indptr']), shape=loader['shape'])
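Many of the examples above pair this loader with a save_sparse_csr; a minimal sketch of the matching saver, assuming the np.savez layout this loader reads back:

	def save_sparse_csr(self, filename, mat):
		'''
		Saves a sparse matrix in the layout load_sparse_csr expects
		'''
		np.savez(filename, data=mat.data, indices=mat.indices,
				 indptr=mat.indptr, shape=mat.shape)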