Example #1
def lassoLossGradient(features, weights, true_value, tuning_parameter):
    """Computes the value of the training loss gradient (with respect to the
    weight vector) at a specific example.

    Training loss includes a lasso (L1) regularization term.

    Args:
        features (dict): A sparse vector of feature values.
        weights (dict): A sparse vector of feature weights.
        true_value (int): The true value of an example.
        tuning_parameter (double): Coefficient of the lasso regularization term.

    Returns:
        A sparse vector (dict) representing the gradient value.
    """
    # Standard squared loss
    gradient = {}
    scale = 2 * (dotProduct(features, weights) - true_value)

    # Lasso term: add the (sub)gradient of the lasso penalty, i.e. the
    # (sub)gradient of |tuning_parameter| * (1-norm of weights).
    for w in weights:
        gradient[w] = tuning_parameter * np.sign(weights[w])

    increment(gradient, scale, features)
    return gradient
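Nearly every example on this page relies on sparse-vector helpers named dotProduct and increment (usually imported from a course util module). The real helpers are not shown here; the following is a minimal sketch, assuming the behavior implied by calls such as util.increment(d1, scale, d2) and by the test in Example #30 below.

def dotProduct(d1, d2):
    """Return the dot product of two sparse vectors represented as dicts."""
    if len(d1) < len(d2):
        return dotProduct(d2, d1)
    return sum(d1.get(key, 0) * value for key, value in d2.items())

def increment(d1, scale, d2):
    """In place, add scale * d2 to d1: d1[key] += scale * d2[key] for every key in d2."""
    for key, value in d2.items():
        d1[key] = d1.get(key, 0) + scale * value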
Example #2
def learnRegression(examples, numIters, stepSize, tuning_parameter):
    """Learns linear regression weights and generates a predictor function.

    Args:
        examples: An array of training examples.
        numIters (int): Number of training iterations.
        stepSize (double): Stochastic gradient descent step size.
        tuning_parameter (double): Tuning parameter for the loss function.
        
    Returns:
        A predictor function that outputs a price (int) given a single input
        tuple.
    """
    weights = defaultdict(int)
    print ""
    for i in range(numIters):
        for x, y in examples:
            gradient = regularizationLossGradient(x, weights, y,
                                                  tuning_parameter)
            increment(weights, -stepSize, gradient)
        print "Training progress: " + str(100.0 * (i + 1) / numIters) + "%"

    def predictor(x):
        return dotProduct(x, weights)

    return predictor
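For reference, the inner loop above performs the standard stochastic gradient descent update, with eta equal to stepSize and the gradient supplied by regularizationLossGradient:

w \leftarrow w - \eta \, \nabla_w \mathrm{Loss}(x, y, w)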
Example #3
def regularizationLossGradient(features, weights, true_value,
                               tuning_parameter):
    """Computes the value of the training loss gradient (with respect to the
    weight vector) at a specific example.

    Training loss includes a ridge (L2) regularization term.

    Args:
        features (dict): A sparse vector of feature values.
        weights (dict): A sparse vector of feature weights.
        true_value (int): The true value of an example.
        tuning_parameter (double): Coefficient of the ridge regularization term.

    Returns:
        A sparse vector (dict) representing the gradient value.
    """
    # Standard squared loss
    gradient = {}
    scale = 2 * (dotProduct(features, weights) - true_value)

    # Regularization term: add the gradient of the ridge penalty, i.e. the
    # gradient of |tuning_parameter| * (2-norm of weights)^2.
    increment(gradient, tuning_parameter, weights)
    increment(gradient, scale, features)
    return gradient
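The gradient assembled here is the squared-loss term plus an L2 term. Note that the code adds tuning_parameter * weights (not twice that), which corresponds to a penalty of (tuning_parameter / 2) * (2-norm of weights)^2; the factor-of-two discrepancy with the comment is harmless, since it can be absorbed into the tuning parameter:

\nabla_w \Big[ (w \cdot \phi(x) - y)^2 + \tfrac{\lambda}{2}\,\|w\|_2^2 \Big]
    = 2\,(w \cdot \phi(x) - y)\,\phi(x) + \lambda\, w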
Example #4
def learnPredictor(trainExamples, testExamples, featureExtractor):
    weights = collections.Counter()
    def loss(w, phi, y):
        return max(1 - util.dotProduct(w, phi) * y, 0)
    
    eta = 0.1  
    numIters = 3 
    def sgradLoss(w, phi, y):
        if loss(w, phi, y) == 0:
            return collections.Counter()
        for key, value in phi.items():
            phi[key] = -1 * phi[key] * y
        return phi
    
    def predictor(x):
        if x is None:
            return -1
        if util.dotProduct(featureExtractor(x), weights) > 0:
            return 1
        else:
            return 0 

    for iteration in xrange(numIters):
        for input, output in trainExamples:
            if input is None:
                continue
            util.increment(weights, -1 * eta, sgradLoss(weights, 
                featureExtractor(input), output))
        
        if DEBUG:
            print util.evaluatePredictor(trainExamples, predictor) 
            #print util.evaluatePredictor(testExamples, predictor)
    
    return weights
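sgradLoss above is the usual hinge-loss subgradient: zero when the example is classified with margin at least 1, and -y * phi(x) otherwise.

\partial_w \, \max\big(0,\; 1 - y\,(w \cdot \phi(x))\big) =
\begin{cases}
0 & \text{if } y\,(w \cdot \phi(x)) \ge 1 \\
-\,y\,\phi(x) & \text{otherwise}
\end{cases}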
Example #5
    def POST(self):
        i = web.input()
        counter = increment("sign", 0)+1

        if i.name == "":
            logging.error("name field should not be empty")
            render = web.template.render('templates')
            return render.error()

        try:
            obj = Sign(
                count = counter,
                date  = datetime.utcnow().date(),
                name  = i.name,
                birth = date(int(i.birthyear), int(i.birthmonth), int(i.birthdate)),
                addr  = i.addr,
                phone = i.phone)
            obj.put()
        except:
            logging.error("Error happens when write to db " + str(sys.exc_info()[0]))
            render = web.template.render('templates')
            return render.error()

        counter = increment("sign", 1)
        render = web.template.render('templates')
        return render.thanks(i.name)
Example #6
def pegasos_sw(X_train, y_train, lambda_reg=1, max_it=1000, tol=1e-4):
    W = Counter()
    s = 1
    t = 1
    epoch = 1
    objective = 1e5
    objective2 = 10
    m = len(y_train)

    while abs(objective - objective2) > tol and epoch <= max_it:
        objective2 = objective
        objective = 0
        for j in range(m):
            t = t + 1
            step = 1.0 / (t * lambda_reg)  # float division (t and lambda_reg may both be ints)
            review = X_train[j]
            result = y_train[j]
            scale = -(step * lambda_reg)
            cond = result * s * util.dotProduct(W, review)

            if cond < 1:
                s = (1 + scale) * s
                util.increment(W, step * result / s, review)
            else:
                s = (1 + scale) * s

            objective += max(0, 1 - cond)

        objective = objective / m
        objective = objective + lambda_reg / 2.0 * (s**2) * util.dotProduct(W, W)
        epoch += 1

    return s, W
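The scalar s in pegasos_sw is the standard Pegasos scaling trick: the weight vector is represented as w = s * W, so the per-step shrinkage only touches the scalar. With step size eta_t = 1/(lambda * t), each example roughly performs

s \leftarrow (1 - \eta_t \lambda)\, s,
\qquad
W \leftarrow W + \frac{\eta_t\, y_j}{s}\, x_j \ \ \text{(only if } y_j\, s\,(W \cdot x_j) < 1\text{)},
\qquad
w = s\, W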
Example #7
File: hw3.py Project: pinesol/mlcs
def SparseGradChecker(loss_func,
                      gradient_loss_func,
                      x,
                      y_val,
                      theta,
                      epsilon=0.01,
                      tolerance=1e-4):
    """Question 3.2: Implement Generic Gradient Checker for Sparse Matrices.

    Check that the function gradient_loss_func returns the correct gradient for 
    the given x, y_val, and theta.

    Let d be the number of features. Here we numerically estimate the
    gradient by approximating the directional derivative in each of
    the d coordinate directions: 
    (e_1 = (1,0,0,...,0), e_2 = (0,1,0,...,0), ..., e_d = (0,...,0,1)).

    The approximation for the directional derivative of J at the point
    theta in the direction e_i is given by: 
    ( J(theta + epsilon * e_i) - J(theta - epsilon * e_i) ) / (2*epsilon).

    We then look at the Euclidean distance between the gradient
    computed using this approximation and the gradient computed by
    gradient_loss_func(x, y_val, theta).  If the Euclidean
    distance exceeds tolerance, we say the gradient is incorrect.

    Args:
        loss_func - A function that computes the loss for (x, y_val, theta).
        gradient_loss_func - A function that computes gradient for (x, y_val, theta).
        x - A single row in the design matrix, represented by a dict/Counter object. (key length = num_features)
        y_val - the label for the corresponding x_row (-1 or 1)
        theta - the parameter vector, dict/Counter object. (key length = num_features)
        epsilon - the epsilon used in approximation
        tolerance - the tolerance error
    
    Return:
        A boolean value indicating whether the gradient is correct or not.

    """
    true_gradient = gradient_loss_func(x, y_val, theta)
    approx_grad = dict.fromkeys(theta.keys(), 0.0)

    for key in theta.iterkeys():
        # Compute the approximate directional derivative in the chosen direction
        # Avoid copying since it's so slow.
        theta_key_original = theta[key]
        theta[key] += epsilon
        plus_loss = loss_func(x, y_val, theta)
        theta[key] = theta_key_original - epsilon
        minus_loss = loss_func(x, y_val, theta)
        theta[key] = theta_key_original  # restore theta
        approx_grad[key] = (plus_loss - minus_loss) / (2 * epsilon)
    util.increment(approx_grad, -1,
                   true_gradient)  # approx_grad - true_gradient
    error = math.sqrt(util.dotProduct(
        approx_grad,
        approx_grad))  # np.linalg.norm(approx_grad - true_gradient)
    if error > tolerance:
        print 'gradient doesn\'t match approximation. Error:', error
    return (error < tolerance)
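A minimal, self-contained illustration of the central-difference check above, using plain dicts and a toy squared loss. This is only meant to show the approximation; it does not call the homework's util module, and the names here are made up for the illustration.

import math

# Toy objective: J(theta) = (theta . x - y)^2 over dict-based sparse vectors.
x = {'a': 2.0, 'b': -1.0}
y_val = 3.0
theta = {'a': 0.5, 'b': 1.5}
epsilon = 0.01

def dot(d1, d2):
    return sum(d1.get(k, 0.0) * v for k, v in d2.items())

def loss(t):
    return (dot(t, x) - y_val) ** 2

# Analytic gradient: 2 * (theta . x - y) * x
analytic = {k: 2.0 * (dot(theta, x) - y_val) * v for k, v in x.items()}

# Central-difference approximation, one coordinate at a time.
approx = {}
for key in theta:
    original = theta[key]
    theta[key] = original + epsilon
    plus_loss = loss(theta)
    theta[key] = original - epsilon
    minus_loss = loss(theta)
    theta[key] = original  # restore
    approx[key] = (plus_loss - minus_loss) / (2 * epsilon)

error = math.sqrt(sum((approx[k] - analytic[k]) ** 2 for k in approx))
print(error < 1e-4)  # expected: True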
Example #8
File: hw3.py Project: pinesol/mlcs
def PegasosSubgradientLoss(x, y_val, theta, lambda_reg):
    '''Question 3.2: The Subgradient of the Pegasos Loss function.'''
    margin = y_val * util.dotProduct(theta, x)
    subgrad = theta.copy()
    util.scale(subgrad, lambda_reg)
    if margin < 1:
        util.increment(subgrad, -y_val, x)
    return subgrad
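For reference, this is the per-example subgradient of the regularized Pegasos objective (util.scale is assumed to multiply a sparse vector in place by a scalar):

J_j(\theta) = \frac{\lambda}{2}\,\|\theta\|^2 + \max\big(0,\; 1 - y_j\,(\theta \cdot x_j)\big),
\qquad
\partial J_j(\theta) =
\begin{cases}
\lambda\,\theta - y_j\, x_j & \text{if } y_j\,(\theta \cdot x_j) < 1 \\
\lambda\,\theta & \text{otherwise}
\end{cases}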
Example #9
def pegasos_grad(X,y,w,lamb):
    tmp = y*dotProduct(w,X)
    if 1-tmp > 0:
        an1 = increment({},lamb,w)
        ans = increment(an1,y,X)
    else:
        ans = increment({},lamb,w)
    return ans
Example #10
def learnBoostedRegression(examples, num_iters, step_size, num_trees):
    """Learns a linear regression model using boosted trees.

    Args:
        examples: An array of training examples.
        num_iters (int): Number of training iterations.
        step_size (int): Stochastic gradient descent step size.
        num_trees (int): Number of gradient boosting trees.

    Returns:
        A predictor function that outputs a price (int) given a single input
        tuple.
    """
    list_weights = []
    objectives = [cur[1] for cur in examples]

    filename = "boostedtree_" + str(num_trees -
                                    1) + "_" + str(cross_val_seg) + ".p"
    if num_trees > 1 and SAVE:
        (list_weights, num_trees_prev, num_iters_prev) = pickle.load(
            open(os.path.join("boostedtree_weights", filename), "rb"))

    for k in range(num_trees):
        if k >= len(list_weights):
            print ""
            print "TREE " + str(k + 1) + " OF " + str(num_trees)
            curWeights = defaultdict(int)
            for i in range(num_iters):
                for ind in range(len(examples)):
                    x = examples[ind][0]
                    gradient = regression.lassoLossGradient(
                        x, curWeights, objectives[ind], .5)
                    increment(curWeights, -step_size / (i + 1), gradient)
                if VERBOSE:
                    print "Training progress: " + str(
                        100.0 * (i + 1) / num_iters) + "%"

            list_weights.append(curWeights)
        else:
            curWeights = list_weights[k]

        for j in range(len(examples)):
            x, y = examples[j]
            objectives[j] = objectives[j] - dotProduct(x, curWeights)

        if VERBOSE: print "COMPLETE"

    if SAVE:
        filename = "boostedtree_" + str(num_trees) + "_" + str(
            cross_val_seg) + ".p"
        pickle.dump((list_weights, num_trees, num_iters),
                    open(os.path.join("boostedtree_weights", filename), "wb"))

    # Define the predictor function
    def predictor(x):
        return sum(dotProduct(x, curWeight) for curWeight in list_weights)

    return predictor
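Despite the name, each "tree" here is another linear model fit to the residuals of the ensemble built so far: after stage k is trained, the targets are replaced by the residual, and the final predictor sums the stage predictions.

r_j^{(0)} = y_j, \qquad
r_j^{(k+1)} = r_j^{(k)} - w_k \cdot \phi(x_j), \qquad
\hat{y}(x) = \sum_{k} w_k \cdot \phi(x)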
Example #11
 def update_weights_with_derivative(self, feature, weight, training_label):
    """
    If the loss function is based on the logistic function 1/(1 + e^-score), then the S.G.D. update (courtesy of the CS229 notes) is
        w_j = w_j + eta*(label - 1/(1 + e^-score))*feature_vector
    where score is the margin.
    """
    margin = self.calculate_margin(feature,weight,training_label)
    vector_weight = self.diseased_weight if training_label == 1 else self.healthy_weight
    update_coeff = self.logistic_func(margin)*training_label* vector_weight
    util.increment(weight,self.eta * update_coeff, feature)  
Example #12
 def learn(self, trainExamples):
     numIters = 10
     step = 0.0001
     for i in range(numIters):
         for feature_vec, y in trainExamples:
             score = util.dotProduct(self.weights, feature_vec)
             dloss = {}
             if score*y > 1:
                 continue
             else:
                 util.increment(dloss, -y, feature_vec)
             util.increment(self.weights, -step, dloss)
Example #13
def pegasos_fast(x, y, l):

    w = dict()
    temp_w = dict()
    t = 2
    s = 1
    temp_loss = 0
    flag = True
    while flag:
        for j in range(len(x)):
            t = t + 1
            n = 1.0/(l*t)
            s = (1-n*l)*s 
            if y[j]*(dotProduct(w, x[j])) < s:
                temp = x[j].copy()
                increment(temp, (n*y[j]-1), temp)
                increment(w,(1/s), temp)
        temp_w = w.copy()
        increment(temp_w, s-1, temp_w)
        loss_real = loss(x,y,l,temp_w)
        if abs(temp_loss - loss_real) < 10**-2:
            flag = False
        temp_loss = loss_real

    increment(w, s-1, w)
    return w
Example #14
def pegasos_SGD(X,y,lamb,num_iter):

    w = {}
    t = 1
    s = 1
    for i in range(num_iter):

        for j in range(len(X)):

            t += 1
            alpha = 1.0/(t*lamb)
            tmp = y[j] * s * dotProduct(X[j], w)
            g = l_de(tmp)
            s *= (1 - alpha * lamb)
            w = increment(w, -(alpha*y[j]*g/s), X[j])
        print "epoch "+str(i)
    return increment({},s,w)
Example #15
def train(trainingSet, subredditLabels, args):
    numIterations = 20
    eta = 0.05

    #dictionary of dictionaries (weights)
    weightDict = {}
    for label in subredditLabels:
        weightDict[label] = {}


    def gradLoss(phiX, w, y):
        score = util.dotProduct(w, phiX)
        margin = score * y
        if margin < 1:
            for name, feature in phiX.iteritems():
                phiX[name] = -1 * y * feature
            return phiX
        else:
            return 0


    for label in subredditLabels:
        trainingSet.seek(0)
        weightVector = weightDict[label]
        for i in range(numIterations):
            for example in trainingSet:
                example = json.loads(example)
                title = example['title']
                subreddit = example['subreddit']

                features = FeatureExtractor.extractFeatures(title, args)

                y = -1
                if label == subreddit:
                    y = 1

                grad = gradLoss(features, weightVector, y)

                if grad != 0:
                    util.increment(weightVector, -1 * eta, grad)
                    weightDict[label] = weightVector
                else:
                    weightDict[label] = weightVector

    return weightDict
Example #16
def update_challenges():
    """Fetch challenge list from vimgolf and update datastore."""
    logging.info('update_challenges()')

    rows = BeautifulSoup(fetch('/')).findAll('h5')
    count = increment('challenge_tasks', len(rows))
    logging.info('init challenge_tasks = %d' % count)
    for row in rows:
        handle = row.a['href'].split('/')[-1]
        taskqueue.add(url='/challenges/'+handle)
Example #17
def pegasos(X_train, y_train, lambda_reg=1, max_it=1000, tol=1e-6):
    w = Counter()
    t = 1
    epoch = 1
    objective = 1e5
    objective2 = 10
    m = len(y_train)

    while abs(objective - objective2) > tol and epoch <= max_it:
        objective2 = objective
        objective = 0
        for j in range(m):
            t = t + 1
            step = 1.0 / (t * lambda_reg)  # float division (t and lambda_reg may both be ints)
            review = X_train[j]
            result = y_train[j]
            scale = -(step * lambda_reg)
            cond = result * util.dotProduct(w, review)

            if cond < 1:
                util.increment(w, scale, w)
                util.increment(w, step * result, review)
            else:
                util.increment(w, scale, w)

            objective += max(0, 1 - cond)

        objective = objective / m
        objective = objective + lambda_reg / 2.0 * util.dotProduct(w, w)
        epoch += 1

    return w
Example #18
	def train(self, dataset, featureExtractor):
		def hingeLoss(w, features, y):
			return max(0, 1 - dotProduct(w, features) * y)

	   	def lossGradient(w, features, y):
			if hingeLoss(w, features, y) <= 0:
				return {}

			return {feature: -y * magnitude for feature, magnitude in features.iteritems()}

		weights = collections.defaultdict(int)
		for i, step in enumerate(xrange(self.numIters)):
			for example in dataset:
				input = example[0]
				output = example[1]

				features = featureExtractor(input)
				gradient = lossGradient(weights, features, output)

				increment(weights, -1 * stepSize, gradient)
		
		return weights
Example #19
	def trainCorrect(self, dataset, featureExtractor):
		def squareLoss(answerProb, y):
			return (answerProb - y) ** 2

	   	def lossGradient(answerProb, y, features):
	   		derivative = 2 * (answerProb - y)
			return {feature: derivative * magnitude for feature, magnitude in features.iteritems()}

		def getAnswerProbs(weights, questionData):
			proposedAnswers = questionData["proposedAnswers"]
			correctIndex = questionData["correctAnswerIndex"]
			answerScores = []

			for aIndex, proposed in enumerate(proposedAnswers):
				score = dotProduct(weights, featureExtractor(proposed))
				answerScores.append(score)

			return softmax(answerScores)

		weights = collections.defaultdict(int)
		for _ in xrange(self.numIters):
			error = 0
			stepSize = 0.12
			for questionData in dataset:
				yVector = [int(i == questionData["correctAnswerIndex"]) for i in xrange(4)] #[0, 1, 0, 0], where the 1 indicates which is correct
				answerProbs = getAnswerProbs(weights, questionData) #[0.2, 0.7, 0.06, 0.04] -> each indicates the likelihood
				# print answerProbs, yVector

				proposedAnswers = questionData["proposedAnswers"]
				for aIndex, proposed in enumerate(proposedAnswers):
					error += squareLoss(answerProbs[aIndex], yVector[aIndex])
					features = featureExtractor(proposed)
					gradient = lossGradient(answerProbs[aIndex], yVector[aIndex], features)
					increment(weights, -1 * stepSize, gradient)

			print "Error:", error, weights
		
		return weights
Example #20
def update_challenge(handle):
    """Fetch Leaderboard and active golfers of the specified challenge, and update datastore."""
    logging.info('update_challenge(%s)' % handle)

    soup = BeautifulSoup(fetch('challenges/' + handle))
    title = soup.findAll('h3')[1].text
    golfers = [row.text.split('@')[-1] for row in soup.findAll('h5')[-1].parent.findAll('h6')]
    record = Challenge(key_name=handle, handle=handle, title=title, active_golfers=golfers)
    record.put()
    logging.info('updated Challenge(%s, %s) with %d golfers' % (handle, title, len(golfers)))

    count = increment('challenge_tasks', -1)
    logging.info('challenge_tasks = %d' % count)
    if count == 0:
        taskqueue.add(url='/top')
Example #21
def pegasos(x, y, l):

    w = dict()
    t = 2
    temp_loss = 0
    cnt = 0
    flag = True
    for i in range(2):
        for j in range(len(x)):
            t = t + 1
            n = 1.0/(l*t)
            if y[j]*(dotProduct(w, x[j])) < 1:
                cnt = cnt +1
                temp = x[j].copy()
                increment(temp, (n*y[j]-1), temp)
                increment(w,-n*l,w)
                increment(w,1,temp)
            else:
                increment(w,-n*l,w)
    return w
Example #22
    def fit(self, X):
        self.w_ = dict()
        t = 0

        for j in range(len(X)):
            t += 1
            step_size = 1.0 / (t * self.lambda_reg)

            w_dot_x = util.dotProduct(self.w_, X[j])
            y = 1 if 1 in X[j] else -1

            if y * w_dot_x < 1:
                util.increment(self.w_, (1 - 1 / t), self.w_)
                util.increment(self.w_, step_size * y, X[j])

            else:
                util.increment(self.w_, (1 - 1 / t), self.w_)

        return self.w_
Example #23
File: hw3.py Project: pinesol/mlcs
def Pegasos(X, y, lambda_reg, max_epochs=1000, check_gradient=False):
    '''Question 4.2.
    Finds the sparse weight vector that minimizes the SVM loss function on X and y.
    '''
    print 'Running Pegasos with regularization parameter', lambda_reg
    loss_func = lambda x, y_val, theta: PegasosLoss(x, y_val, theta, lambda_reg)
    gradient_loss_func = lambda x, y_val, theta: PegasosSubgradientLoss(
        x, y_val, theta, lambda_reg)

    # Initialize theta to have zero for every word mentioned in any review
    theta = {key: 0.0 for x in X for key in x.keys()}
    t = 2  # NOTE: This normally starts at zero, but that causes a divide-by-zero error.
    weight_scalar = 1.0

    for epoch in range(max_epochs):
        #        print '--Epoch', epoch
        old_theta = theta.copy()
        for j, x in enumerate(X):
            t += 1
            eta = 1.0 / (t * lambda_reg)
            margin = y[j] * weight_scalar * util.dotProduct(theta, x)
            # NOTE that the gradient is not differentiable at 1.0, so we don't check it near there.
            if check_gradient and abs(margin - 1.0) > 0.01:
                if not SparseGradChecker(loss_func, gradient_loss_func, x,
                                         y[j], theta):
                    print 'Computed gradient doesn\'t match approximations.'
                    sys.exit(1)
                grad = gradient_loss_func(x, y[j], theta)
                util.increment(theta, -eta, grad)
            else:
                weight_scalar *= 1.0 - 1.0 / t
                if margin < 1:
                    util.increment(theta, eta * y[j] / weight_scalar, x)
        util.increment(old_theta, -1, theta)
        util.scale(old_theta, weight_scalar)
        total_change = math.sqrt(util.dotProduct(old_theta, old_theta))
        #        print '----Change from previous theta:', total_change
        if total_change < 0.01:
            break
    util.scale(theta, weight_scalar)
    return theta
Example #24
 def GET(self):
     count = increment("sign", 0)
     render = web.template.render('templates')
     signlist = [{'text':'1-1000','url':'url'}]
     return render.summary(signlist)
Example #25
def hinge_no_players(train_set, val_set, test_set, calendar):
    # hinge loss with no player features

    def feature_extractor(game, calendar):
        fv = collections.defaultdict(float)

        day = game[1]
        home = game[3]
        away = game[2]

        fv["home" + home] = 1.
        fv["away" + away] = 1.
        fv["date"] = float(day - 105) / float(88)

        # check for back to back
        year = game[0]
        if calendar[year][day - 1] != 0 and home in calendar[day - 1]:
            fv["back_to_back_home"] = 1
        if calendar[year][day - 1] != 0 and away in calendar[day - 1]:
            fv["back_to_back_away"] = 1

        # check for three in four
        home_counter = 0
        away_counter = 0
        for new_day in [day - 1, day - 2, day - 3]:
            if calendar[year][new_day] != 0 and home in calendar[new_day]:
                home_counter += 1
            if calendar[year][new_day] != 0 and away in calendar[new_day]:
                away_counter += 1
        if home_counter > 1:
            fv["three_in_four_home"] = 1
        if away_counter > 1:
            fv["three_in_four_away"] = 1

        return fv

    # training
    weights = collections.defaultdict(float)
    eta = 0.11  # 0.1?
    reg_lambda = 0.11  # revisit values 0.11: 56,55,51
    for game in train_set:

        y = game[4]
        fv = feature_extractor(game, calendar)

        hinge_loss = max(0, 1 - (util.dotProduct(weights, fv) * y))
        for key in fv:
            if hinge_loss == 0:
                fv[key] = 0
            else:
                fv[key] *= (-1 * y)
        #util.increment(weights, (eta * -1), fv)
        # now using regularization
        util.increment(fv, reg_lambda, weights)
        util.increment(weights, (eta * -1), fv)

    # prediction
    # training
    right_count = 0.
    total_count = 0.
    for game in train_set:

        y = game[4]
        fv = feature_extractor(game, calendar)
        pred = util.dotProduct(weights, fv)

        if pred >= 0 and y > 0 or pred < 0 and y < 0:
            #print y, pred, "RIGHT"
            right_count += 1
        elif pred >= 0 and y < 0 or pred < 0 and y > 0:
            #print y, pred, "WRONG"
            pass
        else:
            print pred, y
            raise ("ERROR")

        total_count += 1

    train_acc = float(right_count) / float(total_count)

    # validation
    right_count = 0.
    total_count = 0.
    for game in val_set:

        y = game[4]
        fv = feature_extractor(game, calendar)
        pred = util.dotProduct(weights, fv)

        if pred >= 0 and y > 0 or pred < 0 and y < 0:
            #print y, pred, "RIGHT"
            right_count += 1
        elif pred >= 0 and y < 0 or pred < 0 and y > 0:
            #print y, pred, "WRONG"
            pass
        else:
            print pred, y
            raise ("ERROR")

        total_count += 1

    val_acc = float(right_count) / float(total_count)

    # test
    right_count = 0.
    total_count = 0.
    for game in test_set:

        y = game[4]
        fv = feature_extractor(game, calendar)
        pred = util.dotProduct(weights, fv)

        if pred >= 0 and y > 0 or pred < 0 and y < 0:
            #print y, pred, "RIGHT"
            right_count += 1
        elif pred >= 0 and y < 0 or pred < 0 and y > 0:
            #print y, pred, "WRONG"
            pass
        else:
            print pred, y
            raise ("ERROR")

        total_count += 1

    test_acc = float(right_count) / float(total_count)

    #fout = open("hinge_no_player.csv", "a")
    #fout.write(str(train_acc)+", "+str(val_acc)+", "+str(test_acc)+", \n")
    #fout.close()
    #print "hinge no player"
    #print (str(train_acc)+", "+str(val_acc)+", "+str(test_acc)+", \n")
    return (train_acc, val_acc, test_acc)
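The training loop above folds L2 regularization into the hinge update: fv is first overwritten with the hinge subgradient, reg_lambda * weights is added to it, and the result is subtracted with step size eta, i.e.

w \leftarrow w - \eta \,\Big( \partial_w \max\big(0,\; 1 - y\,(w \cdot \phi(x))\big) + \lambda\, w \Big)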
Example #26
def hinge_projected_years_players(train_set, val_set, test_set, calendar,
                                  data):
    # hinge loss with current years player features

    py_stat_map = {}
    py_stat_map["1718"] = get_projected_years_stats(data, "1718")
    py_stat_map["1617"] = get_current_years_stats(data, "1617")
    py_stat_map["1516"] = get_current_years_stats(data, "1516")
    py_stat_map["1415"] = get_current_years_stats(data, "1415")

    def feature_extractor(game, calendar, stat_map):
        fv = collections.defaultdict(float)

        day = game[1]
        home = game[3]
        away = game[2]

        fv["home" + home] = 1.
        fv["away" + away] = 1.
        fv["date"] = float(day - 105) / float(88)

        # check for back to back
        year = game[0]
        if calendar[year][day - 1] != 0 and home in calendar[day - 1]:
            fv["back_to_back_home"] = 1
        if calendar[year][day - 1] != 0 and away in calendar[day - 1]:
            fv["back_to_back_away"] = 1

        # check for three in four
        home_counter = 0
        away_counter = 0
        for new_day in [day - 1, day - 2, day - 3]:
            if calendar[year][new_day] != 0 and home in calendar[new_day]:
                home_counter += 1
            if calendar[year][new_day] != 0 and away in calendar[new_day]:
                away_counter += 1
        if home_counter > 1:
            fv["three_in_four_home"] = 1
        if away_counter > 1:
            fv["three_in_four_away"] = 1

        # use stat map
        for feat in stat_map[year][home].keys():
            #if "WS" in feat or "VORP" in feat:
            if "VORP" in feat:
                fv["home_" + feat] = stat_map[year][home][feat]
            #fv["home_" + feat] = stat_map[year][home][feat]
        for feat in stat_map[year][away].keys():
            #if "WS" in feat or "VORP" in feat:
            if "VORP" in feat:
                fv["away_" + feat] = stat_map[year][away][feat]
            #fv["away_" + feat] = stat_map[year][away][feat]

        return fv

    # training
    weights = collections.defaultdict(float)
    eta = 0.11  # 0.1?
    reg_lambda = 0.11  # experiment with vals
    for game in train_set:

        y = game[4]
        fv = feature_extractor(game, calendar, py_stat_map)

        hinge_loss = max(0, 1 - (util.dotProduct(weights, fv) * y))
        for key in fv:
            if hinge_loss == 0:
                fv[key] = 0
            else:
                fv[key] *= (-1 * y)
        #util.increment(weights, (eta * -1), fv)
        # now using regularization
        util.increment(fv, reg_lambda, weights)
        util.increment(weights, (eta * -1), fv)

    # prediction
    # training
    right_count = 0.
    total_count = 0.
    for game in train_set:

        y = game[4]
        fv = feature_extractor(game, calendar, py_stat_map)
        pred = util.dotProduct(weights, fv)

        if pred >= 0 and y > 0 or pred < 0 and y < 0:
            #print y, pred, "RIGHT"
            right_count += 1
        elif pred >= 0 and y < 0 or pred < 0 and y > 0:
            #print y, pred, "WRONG"
            pass
        else:
            print pred, y
            raise ("ERROR")

        total_count += 1

    training_accuracy = float(right_count) / float(total_count)

    # validation
    right_count = 0.
    total_count = 0.
    for game in val_set:

        y = game[4]
        fv = feature_extractor(game, calendar, py_stat_map)
        pred = util.dotProduct(weights, fv)

        if pred >= 0 and y > 0 or pred < 0 and y < 0:
            #print y, pred, "RIGHT"
            right_count += 1
        elif pred >= 0 and y < 0 or pred < 0 and y > 0:
            #print y, pred, "WRONG"
            pass
        else:
            print pred, y
            raise ("ERROR")

        total_count += 1

    validation_accuracy = float(right_count) / float(total_count)

    # test
    right_count = 0.
    total_count = 0.
    for game in test_set:

        y = game[4]
        fv = feature_extractor(game, calendar, py_stat_map)
        pred = util.dotProduct(weights, fv)

        if pred >= 0 and y > 0 or pred < 0 and y < 0:
            #print y, pred, "RIGHT"
            right_count += 1
        elif pred >= 0 and y < 0 or pred < 0 and y > 0:
            #print y, pred, "WRONG"
            pass
        else:
            print pred, y
            raise ("ERROR")

        total_count += 1

    test_accuracy = float(right_count) / float(total_count)

    #fout = open("hinge_player.csv", "a")
    #fout.write(str(training_accuracy)+", "+str(validation_accuracy)+", "+str(test_accuracy)+", \n")
    #fout.close()
    #print "hinge player"
    #print (str(training_accuracy)+", "+str(validation_accuracy)+", "+str(test_accuracy)+", \n")
    return (training_accuracy, validation_accuracy, test_accuracy)
Example #27
def compute_oracle():
    
    stats = ['2PAPM', '2PP', '2PPM', '3PAPM', '3PAr', '3PP', '3PPM', 'ASTP', 'ASTPM', 'BLKP', 'BLKPM', 'BPM', 'DBPM', 'DRBP', 'DRBPM', 'DWSPM', 'FGAPM', 'FGP', 'FGPM', 'FTAPM', 'FTP', 'FTPM', 'FTr', 'G', 'GS', 'GSPG', 'MP', 'MPG', 'OBPM', 'ORBP', 'ORBPM', 'OWSPM', 'PER', 'PFPM', 'PPM', 'STLP', 'STLPM', 'TOVP', 'TOVPM', 'TRBP', 'TRBPM', 'TSP', 'USGP', 'VORP', 'WSP48', 'WSPM', 'eFGP']

    comparison = {}
    comparison["rookies"] = {}
    comparison["veterans"] = {}
    comparison["rookies"]["predicted"] = {}
    comparison["rookies"]["true"] = {}
    comparison["veterans"]["predicted"] = {}
    comparison["veterans"]["true"] = {}

    for stat in stats:
        comparison["rookies"]["predicted"][stat] = []
        comparison["rookies"]["true"][stat] = []
        comparison["veterans"]["predicted"][stat] = []
        comparison["veterans"]["true"][stat] = []
        

    fname = "data_-1to1.pkl"
    fin = open(os.path.dirname("/Users/dliedtka/Documents/stanford/cs221/project/files/data_generation/") + "/" + fname, "rb")
    data = pickle.load(fin)
    fin.close()

    # do veterans and rookies the same

    # build a model for each stat
    weights = {}
    for stat in stats:
        weights[stat] = collections.defaultdict(float)
        #weights[stat] = {}
    # eta, try 0.01?
    eta = 0.01

    def oracle_feature_extractor(x, stat):
        fv = collections.defaultdict(float)
        #fv = {}
        for feature in x.keys():
            if feature != stat and feature not in ["team", "position", "year"]: 
                fv[feature] = x[feature]
        return fv

    # iterate through each player, training the model for each stat (stochastic gradient descent using least squares regression)
    for player in data.keys():
        #print player
        for season in data[player]["professional"].keys():
            # don't train on what oracle will predict
            if season != "2017-18":
                for stat in stats:
                    y = data[player]["professional"][season][stat]
                    fv = oracle_feature_extractor(data[player]["professional"][season], stat)
                    # compute
                    pred = util.dotProduct(fv, weights[stat])
                    #print pred
                    # compute gradient
                    scale = -eta * 2. * (pred - y)
                    #print scale
                    #print weights[stat]
                    # add gradient to weights
                    util.increment(weights[stat], scale, fv)
                    #print weights[stat]

    #print weights

    # iterate through each player, making a prediction and storing the true value in comparison dict
    for player in data.keys():
        #print player

        if "2017-18" not in data[player]["professional"].keys():
            continue

        # veteran
        if len(data[player]["professional"].keys()) != 1:
            for value in stats:
                # predict based on weights
                fv = oracle_feature_extractor(data[player]["professional"]["2017-18"], value)
                pred = util.dotProduct(fv, weights[value])
                comparison["veterans"]["predicted"][value].append(pred)
                # retrieve true value
                comparison["veterans"]["true"][value].append(data[player]["professional"]["2017-18"][value])
        # rookie
        else:
            for value in stats:
                # predict based on weights
                fv = oracle_feature_extractor(data[player]["professional"]["2017-18"], value)
                pred = util.dotProduct(fv, weights[value])
                comparison["rookies"]["predicted"][value].append(pred)
                # retrieve true value
                comparison["rookies"]["true"][value].append(data[player]["professional"]["2017-18"][value])


    return comparison
Example #28
        print "Progress:", 1.0*i/maxIters * 100, "%"
    	loss = 0
    	means = [{} for _ in range(K)]
        val_means = [0 for _ in range(K)]
    	cluster_count = [0 for _ in range(K)]
    	prev_centroids = centroids
    	prev_assign = assign[:]

    	precomputed_quantities = [util.dotProduct(centroids[i], centroids[i]) \
                for i in range(K)]

    	#loop through the examples to assign
    	for j in range(len(examples)):
            assign[j] , dist = find_center(j, examples[j], precomputed_x,
                    precomputed_quantities, centroids)
            util.increment(means[assign[j]], 1, examples[j])
            val_means[assign[j]] += full_examples[j][1]
            cluster_count[assign[j]] += 1

            loss_list[j] = (full_examples[j][1] - centroid_vals[assign[j]])
            loss += dist
        print "LOSS: " + str(loss)
    	if assign == prev_assign:
            print loss
            return centroids, assign, loss, loss_list, centroid_vals

    	for index in range(K):
            divide(means[index], cluster_count[index])
            val_means[index] = val_means[index]/cluster_count[index]

        centroids = means
Example #29
 def update_weights_with_derivative(self,feature,weight,training_label):
    util.increment(weight,self.eta * training_label, feature) 
    util.increment(weight,-2*self.lambda_value,weight) # regularization factor
Example #30
def test3b0():
    ans = {'a': 1.1, 'b': 1, 'c': 1.1, 'd': 0.1}
    d1 = {'a': 1, 'b': 1, 'c': 1}
    d2 = {'a': 1, 'c': 1, 'd': 1}
    util.increment(d1, 0.1, d2)
    grader.require_is_equal(ans, d1)
Example #31
    if abs(msg.velocity) > MAX_LINEAR_VEL or abs(msg.omega) > MAX_ANG_VEL:
        scale = max(
            abs(msg.velocity) / MAX_LINEAR_VEL,
            abs(msg.omega) / MAX_ANG_VEL)
        msg.velocity /= scale
        msg.omega /= scale
    targetVel = msg


rospy.init_node("base_controller")
targetSub = rospy.Subscriber('/target_vel',
                             BaseCommand,
                             targetCB,
                             queue_size=1)
motorPub = rospy.Publisher('/cmd_vel', MotorCommand, queue_size=1)
rate = rospy.Rate(RATE)
while not rospy.is_shutdown():
    rate.sleep()
    t = rospy.get_rostime()
    if t - targetVel.header.stamp > rospy.Duration(COMMAND_TIMEOUT):
        targetVel.velocity = 0
        targetVel.omega = 0
        targetVel.header.stamp = t
    l_target = targetVel.velocity - (targetVel.omega * WHEEL_SEP) / 2
    r_target = targetVel.velocity + (targetVel.omega * WHEEL_SEP) / 2
    command.header.stamp = t
    command.left, dl = increment(command.left, l_target / WHEEL_RAD, WHEEL_ACC)
    command.right, dr = increment(command.right, r_target / WHEEL_RAD,
                                  WHEEL_ACC)
    motorPub.publish(command)
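Note that increment in this snippet is not the sparse-vector helper used elsewhere on this page: it appears to ramp the current wheel command toward a target at a bounded rate and return the new command together with the applied delta. The helper itself is not shown; the following is a hypothetical sketch inferred only from the call sites, and the real implementation may differ.

def increment(current, target, max_step):
    # Hypothetical slew-rate limiter: move current toward target by at most
    # max_step per cycle, returning (new_value, delta_applied).
    delta = target - current
    if delta > max_step:
        delta = max_step
    elif delta < -max_step:
        delta = -max_step
    return current + delta, delta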
Example #32
def get_veteran_comparison(data_type=2):

    veteran_comparison = {}
    veteran_comparison["training"] = {}
    veteran_comparison["validation"] = {}
    veteran_comparison["test"] = {}
    veteran_comparison["training"]["predicted"] = {}
    veteran_comparison["training"]["true"] = {}
    veteran_comparison["validation"]["predicted"] = {}
    veteran_comparison["validation"]["true"] = {}
    veteran_comparison["test"]["predicted"] = {}
    veteran_comparison["test"]["true"] = {}

    relevant_stats = [
        'DRBPM', '2PP', 'FGPM', '3PP', 'DWSPM', 'TRBP', 'PER', 'FTPM', '3PAPM',
        'DRBP', 'USGP', 'TSP', 'PFPM', 'eFGP', 'STLPM', 'DBPM', '3PPM', 'GSPG',
        'FGP', 'PPM', 'FTAPM', 'OBPM', 'TOVP', 'WSP48', 'MP', 'FTP', 'GS',
        'BLKPM', 'G', 'BPM', 'VORP', 'ORBPM', 'TRBPM', '3PAr', 'ASTP', '2PPM',
        'MPG', 'FTr', 'ORBP', 'BLKP', '2PAPM', 'STLP', 'FGAPM', 'TOVPM',
        'ASTPM', 'OWSPM', 'WSPM'
    ]
    relevant_college_stats = [
        "G", "FGP", "3PP", "FTP", "MP", "FGPM", "FGAPM", "3PPM", "3PAPM",
        "2PPM", "2PAPM", "FTPM", "FTAPM", "TRBPM", "ASTPM", "STLPM", "BLKPM",
        "TOVPM", "PPM", "MPG", "2PP"
    ]

    fnames = [
        "data.pkl", "data_removeoutliers.pkl", "data_-1to1.pkl",
        "data_-1to1_removeoutliers.pkl", "data_0to1.pkl",
        "data_0to1_removeoutliers.pkl"
    ]

    if data_type == 0 or data_type == 1:
        normalization_type = None
    elif data_type == 2 or data_type == 3:
        normalization_type = "-1to1"
    elif data_type == 4 or data_type == 5:
        normalization_type = "0to1"
    else:
        raise ("ERROR")

    # intialize as empty lists
    for value in relevant_stats:
        veteran_comparison["training"]["predicted"][value] = []
        veteran_comparison["training"]["true"][value] = []
        veteran_comparison["validation"]["predicted"][value] = []
        veteran_comparison["validation"]["true"][value] = []
        veteran_comparison["test"]["predicted"][value] = []
        veteran_comparison["test"]["true"][value] = []

    # build a model for each stat based on validation data
    weights = {}
    for stat in relevant_stats:
        weights[stat] = collections.defaultdict(float)
        #weights[stat] = {}
    # eta, try 0.01?
    eta = 0.0033
    # computed by trial and error
    reg_lambda_mapping = {}
    for stat in relevant_stats:
        if stat in [
                "BPM", "FTAPM", "FTr", "MP", "MPG", "OWSPM", "PFPM", "TOVPM",
                "VORP", "WSPM"
        ]:
            reg_lambda = 1.
        elif stat in ["GS"]:
            reg_lambda = 0.5
        elif stat in ["3PAPM", "DWSPM", "FTPM", "G", "GSPG"]:
            reg_lambda = 0.333
        elif stat in ["3PAr", "ASTPM"]:
            reg_lambda = 0.1
        elif stat in ["ASTP", "DRBP"]:
            reg_lambda = 0.0333
        elif stat in ["3PPM", "USGP"]:
            reg_lambda = 0.01
        elif stat in ["FGAPM"]:
            reg_lambda = 0.000333
        elif stat in [
                "2PAPM", "2PP", "2PPM", "3PP", "BLKP", "BLKPM", "DBPM",
                "DRBPM", "FGP", "FGPM", "FTP", "OBPM", "ORBP", "ORBPM", "PER",
                "PPM", "STLP", "STLPM", "TOVP", "TRBP", "TRBPM", "TSP",
                "WSP48", "eFGP"
        ]:
            reg_lambda = 0.
        else:
            raise Exception("ERROR")
        reg_lambda_mapping[stat] = reg_lambda

    # load validation data
    fin = open(
        os.path.dirname(
            "/Users/dliedtka/Documents/stanford/cs221/project/files/data_generation/"
        ) + "/" + fnames[data_type][:-4] + "_validation.pkl", "rb")
    val_data = pickle.load(fin)
    fin.close()

    # iterate through each player, training the model on 2nd to last seasons for each stat (stochastic gradient descent using least squares regression)
    for player in val_data.keys():
        # skip rookies
        if len(val_data[player]["professional"].keys()) == 1:
            continue
        #print player
        # use previous seasons as features, current season as y
        for season_idx in range(
                1, len(sorted(val_data[player]["professional"].keys()))):
            predicting_season = sorted(
                val_data[player]["professional"].keys())[season_idx]
            prior_season = sorted(
                val_data[player]["professional"].keys())[season_idx - 1]
            if predicting_season == "2016-17":
                continue

            fv = algorithm_feature_extractor(val_data, player, season_idx,
                                             relevant_stats,
                                             relevant_college_stats,
                                             normalization_type)
            for stat in relevant_stats:
                #print fv["lastDRBPM"]
                y = val_data[player]["professional"][predicting_season][stat]
                #print y
                # predict
                #pred = util.dotProduct(fv, weights[stat])
                # *** try making prediction last years stats plus inference
                pred = val_data[player]["professional"][prior_season][
                    stat] + util.dotProduct(fv, weights[stat])
                #print pred
                # gradient
                scale = -eta * 2. * (pred - y)
                #print scale
                #print weights[stat]["lastDRBPM"]
                # update
                # now with regularization
                util.increment(weights[stat], -eta * reg_lambda_mapping[stat], weights[stat])
                util.increment(weights[stat], scale, fv)
                #print weights[stat]["lastDRBPM"]

    # iterate through each player, predicting on training data (all season prior to 2016-17)
    for player in val_data.keys():
        # skip players that only played in 2016-17
        if len(val_data[player]["professional"].keys()
               ) == 1 and "2016-17" in val_data[player]["professional"].keys():
            continue
        #print player

        season_list = sorted(val_data[player]["professional"].keys())
        for season_idx in range(len(season_list)):
            season = season_list[season_idx]
            # skip rookie year
            if season == season_list[0]:
                continue
            prior_season = season_list[season_idx - 1]
            season_number = season_idx
            fv = algorithm_feature_extractor(val_data, player, season_number,
                                             relevant_stats,
                                             relevant_college_stats,
                                             normalization_type)
            for stat in relevant_stats:
                # predict
                pred_val = val_data[player]["professional"][prior_season][
                    stat] + util.dotProduct(fv, weights[stat])
                true_val = val_data[player]["professional"][season][stat]
                # append
                veteran_comparison["training"]["predicted"][stat].append(
                    pred_val)
                veteran_comparison["training"]["true"][stat].append(true_val)

    # iterate through each player, making predictions for 2016-17
    for player in val_data.keys():
        # skip rookies, players who didn't play this year
        if len(val_data[player]["professional"].keys(
        )) == 1 or "2016-17" not in val_data[player]["professional"].keys():
            continue
        #print player

        # use previous seasons as features, 2016-17 as y
        season = "2016-17"
        prior_season = sorted(val_data[player]["professional"].keys())[-2]
        season_number = len(val_data[player]["professional"].keys()
                            ) - 1  # 2016-17 is last season
        fv = algorithm_feature_extractor(val_data, player, season_number,
                                         relevant_stats,
                                         relevant_college_stats,
                                         normalization_type)
        for stat in relevant_stats:
            # predict
            pred_val = val_data[player]["professional"][prior_season][
                stat] + util.dotProduct(fv, weights[stat])
            true_val = val_data[player]["professional"][season][stat]
            # append
            veteran_comparison["validation"]["predicted"][stat].append(
                pred_val)
            veteran_comparison["validation"]["true"][stat].append(true_val)

    # iterate through each player, making predictions for 2017-18
    # load test data
    fin = open(
        os.path.dirname(
            "/Users/dliedtka/Documents/stanford/cs221/project/files/data_generation/"
        ) + "/" + fnames[data_type], "rb")
    test_data = pickle.load(fin)
    fin.close()
    veteran_preds = {}
    for player in test_data.keys():
        # skip rookies, players who didn't play this year
        if len(test_data[player]["professional"].keys(
        )) == 1 or "2017-18" not in test_data[player]["professional"].keys():
            continue
        #print player
        veteran_preds[player] = {}

        # use previous seasons as features, 2016-17 as y
        season = "2017-18"
        prior_season = sorted(test_data[player]["professional"].keys())[-2]
        season_number = len(test_data[player]["professional"].keys()
                            ) - 1  # 2016-17 is last season
        fv = algorithm_feature_extractor(test_data, player, season_number,
                                         relevant_stats,
                                         relevant_college_stats,
                                         normalization_type)
        for stat in relevant_stats:
            # predict
            pred_val = test_data[player]["professional"][prior_season][
                stat] + util.dotProduct(fv, weights[stat])
            true_val = test_data[player]["professional"][season][stat]
            # append
            veteran_comparison["test"]["predicted"][stat].append(pred_val)
            veteran_comparison["test"]["true"][stat].append(true_val)
            veteran_preds[player][stat] = pred_val

    fout = open("veteran_preds.pkl", "wb")
    pickle.dump(veteran_preds, fout)
    fout.close()

    return veteran_comparison
Example #33
def kmeans(full_examples, K, maxIters):
    '''
    examples: list of examples, each example is a string-to-double dict representing a sparse vector.
    K: number of desired clusters. Assume that 0 < K <= |examples|.
    maxIters: maximum number of iterations to run (you should terminate early if the algorithm converges).
    Return: (length K list of cluster centroids,
            list of assignments (i.e. if examples[i] belongs to centers[j], then assignments[i] = j)
            final reconstruction loss)
    '''
    # BEGIN_YOUR_CODE (our solution is 32 lines of code, but don't worry if you deviate from this)
    examples = [full_examples[j][0] for j in range(len(full_examples))]

    def find_center(ex_index, example, precomputed_x, precomputed_quantities,
                    centroids):
        assign = 0
        min_dist = float('inf')
        for i in range(K):
            cur_dist = precomputed_x[ex_index] - 2 * util.dotProduct(
                centroids[i], example) + precomputed_quantities[i]
            if cur_dist < min_dist:
                assign = i
                min_dist = cur_dist
        return assign, min_dist

    def divide(vec, scale):
        for k, v in vec.items():
            vec[k] = 1.0 * v / scale

    rand_list = random.sample(xrange(len(examples)), K)
    centroids = [examples[i] for i in rand_list]
    centroid_vals = [full_examples[i][1] for i in rand_list]
    assign = [0 for _ in range(len(examples))]
    loss_list = [0 for _ in range(len(examples))]
    precomputed_x = [
        util.dotProduct(examples[i], examples[i]) for i in range(len(examples))
    ]

    for i in range(maxIters):
        print "Progress:", 1.0 * i / maxIters * 100, "%"
        loss = 0
        means = [{} for _ in range(K)]
        val_means = [0 for _ in range(K)]
        cluster_count = [0 for _ in range(K)]
        prev_centroids = centroids
        prev_assign = assign[:]

        precomputed_quantities = [
            util.dotProduct(centroids[i], centroids[i]) for i in range(K)
        ]

        #loop through the examples to assign
        for j in range(len(examples)):
            assign[j], dist = find_center(j, examples[j], precomputed_x,
                                          precomputed_quantities, centroids)
            util.increment(means[assign[j]], 1, examples[j])
            val_means[assign[j]] += full_examples[j][1]
            cluster_count[assign[j]] += 1

            loss_list[j] = (full_examples[j][1] - centroid_vals[assign[j]])
            loss += dist
        print "LOSS: " + str(loss)
        if assign == prev_assign:
            print loss
            return centroids, assign, loss, loss_list, centroid_vals

        for index in range(K):
            divide(means[index], cluster_count[index])
            val_means[index] = val_means[index] / cluster_count[index]

        centroids = means
        centroid_vals = val_means

        if centroids == prev_centroids:
            print loss
            return centroids, assign, loss, loss_list, centroid_vals

    print "The reconstruction loss is:", loss
    return centroids, assign, loss, loss_list, centroid_vals
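find_center avoids building explicit difference vectors by expanding the squared Euclidean distance; precomputed_x[j] and precomputed_quantities[i] cache the two squared norms so only the cross term is computed per pair:

\|x_j - c_i\|^2 = x_j \cdot x_j \;-\; 2\, x_j \cdot c_i \;+\; c_i \cdot c_i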