def gradient_descent(X, y, theta, alpha, num_iters):
    m = y.size
    J_history = np.zeros(shape=(num_iters, 1))

    for i in range(num_iters):
        # hypothesis h(x) = X * theta for every training example
        predictions = X.dot(theta).flatten()
        # error terms for each parameter (X[:, 0] is the bias column of ones)
        errors_x1 = (predictions - y) * X[:, 0]
        errors_x2 = (predictions - y) * X[:, 1]
        # simultaneous update of both parameters
        theta[0][0] = theta[0][0] - alpha * (1.0 / m) * errors_x1.sum()
        theta[1][0] = theta[1][0] - alpha * (1.0 / m) * errors_x2.sum()

        # record the cost at this step so convergence can be checked later
        J_history[i, 0] = compute_cost(X, y, theta)

    return theta, J_history
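# The two per-parameter updates above can also be written as a single vectorized step.
# Below is a minimal alternative sketch (gradient_descent_vectorized is an illustrative
# name, not part of the original code); it assumes the same shapes as above:
# X is (m, 2), y has length m, theta is (2, 1).
def gradient_descent_vectorized(X, y, theta, alpha, num_iters):
    m = y.size
    J_history = np.zeros(shape=(num_iters, 1))
    for i in range(num_iters):
        # column of errors, then gradient = X^T * errors / m
        errors = X.dot(theta) - y.reshape(m, 1)
        theta = theta - alpha * (1.0 / m) * X.T.dot(errors)
        J_history[i, 0] = compute_cost(X, y, theta)
    return theta, J_history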
#and replace the ones in the second column with the X values
#the first column stays all ones: remember, x subscript zero is the intercept term, which keeps the math convenient
x = np.ones(shape=(m, 2))
x[:, 1] = X
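#The same design matrix can also be built in one step with np.column_stack; an equivalent
#sketch (x_alt is just an illustrative name, assuming X is a 1-D array of length m):
x_alt = np.column_stack((np.ones(m), X))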
 
#we need to initialize theta with some dummy values; let's start with all zeros
theta = np.zeros(shape=(2, 1))
 
#set the number of gradient descent iterations
iterations = 50000

#set learning rate
alpha = 0.01
 
#display initial cost
print(compute_cost(x, y, theta))
 
theta, J_history = gradient_descent(x, y, theta, alpha, iterations)
 
print(theta)
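#A quick way to sanity-check the learning rate and iteration count is to plot the recorded
#cost history and confirm it keeps decreasing; a small sketch using the same pl plotting
#handle as the plot below:
pl.plot(range(iterations), J_history[:, 0])
pl.xlabel('iteration')
pl.ylabel('cost J')
pl.show()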
#Predict profits for population sizes of 35,000 and 50,000
predict1 = np.array([1, 3.5]).dot(theta).flatten()
print('For population = 35,000, we predict a profit of %f' % (predict1 * 10000))
predict2 = np.array([1, 5.0]).dot(theta).flatten()
print('For population = 50,000, we predict a profit of %f' % (predict2 * 10000))
 
#Plot the fitted regression line over the population values
result = x.dot(theta).flatten()
pl.plot(data[:, 0], result)
pl.show()
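#Optionally, the training data can be overlaid on the fitted line to judge the fit by eye;
#a small sketch assuming, as above, that data[:, 0] holds the populations and y the profits
#(both in units of 10,000):
pl.scatter(data[:, 0], y, marker='x', c='r', label='training data')
pl.plot(data[:, 0], x.dot(theta).flatten(), label='linear fit')
pl.xlabel('population (in 10,000s)')
pl.ylabel('profit (in $10,000s)')
pl.legend()
pl.show()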