Example 1
def pegasos_sw(X_train, y_train, lambda_reg=1, max_it=1000, tol=1e-4):
    W = Counter()
    s = 1
    t = 1
    epoch = 1
    objective = 1e5
    objective2 = 10
    m = len(y_train)

    while abs(objective - objective2) > tol and epoch <= max_it:
        objective2 = objective
        objective = 0
        for j in range(m):
            t = t + 1
            step = 1 / (t * lambda_reg)
            review = X_train[j]
            result = y_train[j]
            scale = -(step * lambda_reg)
            cond = result * s * util.dotProduct(W, review)

            if cond < 1:
                s = (1 + scale) * s
                util.increment(W, step * result / s, review)
            else:
                s = (1 + scale) * s

            objective += max(0, 1 - cond)

        objective = objective / m
        objective = objective + lambda_reg / 2 * (s**2) * util.dotProduct(W, W)
        epoch += 1

    return s, W
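
Most of these snippets represent feature vectors as plain dicts (or Counters) mapping feature names to values, and lean on a small util module that the listing does not show. The following is a minimal sketch of the assumed semantics, not necessarily the actual implementation behind these examples; note that a few snippets use the return value of increment, so this sketch returns the mutated dict:

def dotProduct(d1, d2):
    """Return the dot product of two sparse vectors stored as dicts."""
    if len(d1) > len(d2):
        d1, d2 = d2, d1  # iterate over the smaller dict
    return sum(v * d2.get(f, 0) for f, v in d1.items())

def increment(d1, scale, d2):
    """In place, set d1 = d1 + scale * d2, and return d1 for convenience."""
    for f, v in list(d2.items()):  # list() so increment(w, c, w) is safe
        d1[f] = d1.get(f, 0) + v * scale
    return d1

def scale(d, factor):
    """In place, multiply every entry of the sparse vector d by factor."""
    for f in d:
        d[f] *= factor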
Example 2
def loss(x,y,l,w):
    loss = (l*dotProduct(w,w))/2
    m = len(x)

    for i in range(m):
        loss = loss + (max(0, 1- y[i]*dotProduct(w,x[i])))/m
    return loss
Example 3
def pegasos(X_train, y_train, lambda_reg=1, max_it=1000, tol=1e-6):
    w = Counter()
    t = 1
    epoch = 1
    objective = 1e5
    objective2 = 10
    m = len(y_train)

    while abs(objective - objective2) > tol and epoch <= max_it:
        objective2 = objective
        objective = 0
        for j in range(m):
            t = t + 1
            step = 1 / (t * lambda_reg)
            review = X_train[j]
            result = y_train[j]
            scale = -(step * lambda_reg)
            cond = result * util.dotProduct(w, review)

            if cond < 1:
                util.increment(w, scale, w)
                util.increment(w, step * result, review)
            else:
                util.increment(w, scale, w)

            objective += max(0, 1 - cond)

        objective = objective / m
        objective = objective + lambda_reg / 2 * util.dotProduct(w, w)
        epoch += 1

    return w
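
Examples 1 and 3 minimize the same regularized hinge objective; the difference is that Example 1 stores the weights in scaled form (true weights = s * W), so the per-step shrink w <- (1 - step*lambda_reg) * w costs O(1) (update s) instead of touching every feature. A tiny equivalence check for one shrink step, assuming the dict helpers sketched above:

w = {'f': 2.0}           # direct representation (Example 3)
W, s = {'f': 2.0}, 1.0   # scaled representation (Example 1): true weights = s * W

eta, lam = 0.1, 0.5
increment(w, -(eta * lam), w)  # w <- (1 - eta*lam) * w, touches every feature
s *= 1 - eta * lam             # same shrink, O(1)
assert abs(w['f'] - s * W['f']) < 1e-12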
Example 4
  def getIntensity(self, pos):
    """Returns the appropriate intensity of the sound being played assuming intensity falls off at 1/r^2"""
    # The camera doesn't have a position, so use the position of the followed object (of the 1st camera)
    camPos = glad.renderer.cameraList[0].objectFollowed.getPos()

    r=(pos-camPos)#separation vector
    if r.isNullVector(): #if the vector is null, sound will be max anyways
      sin = 1
      cos = 1
    else:
      #calculate angles to determine where sound is coming from
      cos = dotProduct(r.getNormalized(),Vector(-1,0))
      sin = dotProduct(r.getNormalized(), Vector(0,1))
    #Calculate intensity for left and right channels
    #when sound is directly to the side have 80 percent come from that side speaker
    #hopefully this will give some directional sounds
    k = 130000 #arbitrary constant to calculate sound intensity
    if r.isNullVector():
      intensity = k #removes division by zero error
    else:
      intensity = k/r.getMagnitude()**2
    #major is the percent of the sound intensity from the side with the greater intensity
    a=0.68 #max percent of the intensity coming from one side
    major = (a*0.5)/((0.5*cos)**2+(a*sin)**2)**0.5 #equation for an ellipse
    if r[0] <= 0:
      right = major
      left = 1-major
    else:
      left = major
      right = 1-major
    right *= intensity
    left *= intensity
    if right > 1: right = 1
    if left > 1: left = 1
    return left,right
Example 5
 def kmeanspredictor(x):
     assignment = 0
     min_dist = 1000000
     for j in range(NUM_CLUSTERS):
         cur_dist = util.dotProduct(x, x) - 2 * util.dotProduct(
             centroids[j], x) + pre_computed_centroid_dots[j]
         if cur_dist < min_dist:
             assignment = j
             min_dist = cur_dist
     return centroid_vals[assignment]
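
The distance computation above uses the expansion ||x - c||^2 = x.x - 2*c.x + c.c, so each centroid's self dot product can be cached once (pre_computed_centroid_dots) and only two sparse dot products are needed per query. A small check of the identity, assuming the dict helpers sketched after Example 1:

x = {'a': 1.0, 'b': 2.0}
c = {'b': 1.0, 'c': 3.0}
direct = sum((x.get(f, 0) - c.get(f, 0)) ** 2 for f in set(x) | set(c))
expanded = dotProduct(x, x) - 2 * dotProduct(c, x) + dotProduct(c, c)
assert abs(direct - expanded) < 1e-12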
Example 6
 def predictor(x):
     # pick the nearest centroid and return that cluster's prediction
     assignment = 0
     min_dist = float('inf')
     for k in range(len(centroids)):
         cur_dist = util.dotProduct(x, x) - 2 * util.dotProduct(
             centroids[k], x) + pre_computed_centroid_dots[k]
         if cur_dist < min_dist:
             assignment = k
             min_dist = cur_dist
     return predictor_list[assignment](x)
Example 7
 def predictor(x):
     if x is None:
         return -1
     if util.dotProduct(featureExtractor(x), weights) > 0:
         return 1
     else:
         return 0
Example 8
def pegasos_fast(x, y, l):

    w = dict()
    temp_w = dict()
    t = 2
    s = 1
    temp_loss = 0
    flag = True
    while flag:
        for j in range(len(x)):
            t = t + 1
            n = 1/(l*t)
            s = (1-n*l)*s 
            if y[j] * s * dotProduct(w, x[j]) < 1:  # margin of the true weights s*w
                increment(w, n * y[j] / s, x[j])
        temp_w = w.copy()
        increment(temp_w, s-1, temp_w)
        loss_real = loss(x,y,l,temp_w)
        if abs(temp_loss - loss_real) < 10**-2:
            flag = False
        temp_loss = loss_real

    increment(w, s-1, w)
    return w
Example 9
def lassoLossGradient(features, weights, true_value, tuning_parameter):
    """Computes the value of the training loss gradient (with respect to the
    weight vector) at a specific example.

    Training loss includes a lasso (L1) regularization term.

    Args:
        features (dict): A sparse vector of feature values.
        weights (dict): A sparse vector of feature weights.
        true_value (int): The true value of an example.
        tuning_parameter (double): Coefficient of the lasso regularization term.

    Returns:
        A sparse vector (dict) representing the gradient value.
    """
    # Standard squared loss
    gradient = {}
    scale = 2 * (dotProduct(features, weights) - true_value)

    # Lasso term: the (sub)gradient of tuning_parameter * (1-norm of weights)
    # is tuning_parameter * sign(w) for each weight
    for w in weights:
        gradient[w] = tuning_parameter * np.sign(weights[w])

    increment(gradient, scale, features)
    return gradient
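
A hand-checkable example of the result, with made-up values, assuming the dict helpers sketched after Example 1 and numpy imported as np:

features = {'a': 1.0}
weights = {'a': 2.0, 'b': -3.0}
# prediction = 2.0, residual = 2.0 - 1.0 = 1.0, so scale = 2.0
g = lassoLossGradient(features, weights, true_value=1.0, tuning_parameter=0.5)
# 'a': lasso term 0.5*sign(2.0) plus squared-loss term 2.0*1.0 = 2.5
# 'b': lasso term only, 0.5*sign(-3.0) = -0.5
assert g == {'a': 2.5, 'b': -0.5}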
Example 10
def findExampleStatsFn(examples, weights, featureExtractor, examineFn):
    summ = 0
    correct = 0
    tot = 0
    summNeg = 0
    correctNeg = 0
    totNeg = 0
    for example in examples:
        prompt, response = example[0]
        if examineFn(prompt, response):
            phi = featureExtractor(example[0])
            score = dotProduct(weights, phi)
            if example[1] == 1:
                summ += score
                if score > 0:
                    correct += 1
                tot += 1
            if example[1] == -1:
                summNeg += score
                if score < 0:
                    correctNeg += 1
                totNeg += 1
    if tot > 0:
        print "Average Score (+): {0}".format(1.0*summ/tot)
        print "Average Correct (+): {0}".format(1.0*correct/tot)
    if totNeg > 0:
        print "Average Score (-): {0}".format(1.0*summNeg/totNeg)
        print "Average Correct (-): {0}".format(1.0*correctNeg/totNeg)
Example 11
File: hw3.py Project: pinesol/mlcs
def SparseGradChecker(loss_func,
                      gradient_loss_func,
                      x,
                      y_val,
                      theta,
                      epsilon=0.01,
                      tolerance=1e-4):
    """Question 3.2: Implement Generic Gradient Checker for Sparse Matrices.

    Check that the function gradient_loss_func returns the correct gradient for 
    the given x, y_val, and theta.

    Let d be the number of features. Here we numerically estimate the
    gradient by approximating the directional derivative in each of
    the d coordinate directions: 
    (e_1 = (1,0,0,...,0), e_2 = (0,1,0,...,0), ..., e_d = (0,...,0,1) 

    The approximation for the directional derivative of J at the point
    theta in the direction e_i is given by: 
    ( J(theta + epsilon * e_i) - J(theta - epsilon * e_i) ) / (2*epsilon).

    We then look at the Euclidean distance between the gradient
    computed using this approximation and the gradient computed by
    gradient_loss_func(x, y_val, theta).  If the Euclidean
    distance exceeds tolerance, we say the gradient is incorrect.

    Args:
        loss_func - A function that computes the loss for (x, y_val, theta).
        gradient_loss_func - A function that computes gradient for (x, y_val, theta).
        x - A single row in the design matrix, represented by a dict/Counter object. (key length = num_features)
        y_val - the label for the corresponding x_row (-1 or 1)
        theta - the parameter vector, dict/Counter object. (key length = num_features)
        epsilon - the epsilon used in approximation
        tolerance - the tolerance error
    
    Return:
        A boolean value indicating whether the gradient is correct or not

    """
    true_gradient = gradient_loss_func(x, y_val, theta)
    approx_grad = dict.fromkeys(theta.keys(), 0.0)

    for key in theta.iterkeys():
        # Compute the approximate directional derivative in the chosen direction
        # Avoid copying since it's so slow.
        theta_key_original = theta[key]
        theta[key] += epsilon
        plus_loss = loss_func(x, y_val, theta)
        theta[key] = theta_key_original - epsilon
        minus_loss = loss_func(x, y_val, theta)
        theta[key] = theta_key_original  # restore theta
        approx_grad[key] = (plus_loss - minus_loss) / (2 * epsilon)
    util.increment(approx_grad, -1,
                   true_gradient)  # approx_grad - true_gradient
    error = math.sqrt(util.dotProduct(
        approx_grad,
        approx_grad))  # np.linalg.norm(approx_grad - true_gradient)
    if error > tolerance:
        print 'gradient doesn\'t match approximation. Error:', error
    return (error < tolerance)
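
The same central-difference idea is easy to exercise outside this API on a loss whose gradient is known in closed form. A self-contained sketch on the squared loss, using the dict helpers from the note after Example 1 (the names here are illustrative, not from the assignment):

def sq_loss(x, y_val, theta):
    return (dotProduct(x, theta) - y_val) ** 2

def sq_loss_grad(x, y_val, theta):
    return increment({}, 2 * (dotProduct(x, theta) - y_val), x)

x, y_val, theta = {'a': 1.0, 'b': -2.0}, 0.5, {'a': 0.3, 'b': 0.1}
eps = 1e-5
for key in theta:
    original = theta[key]
    theta[key] = original + eps
    plus_loss = sq_loss(x, y_val, theta)
    theta[key] = original - eps
    minus_loss = sq_loss(x, y_val, theta)
    theta[key] = original  # restore theta
    approx = (plus_loss - minus_loss) / (2 * eps)
    assert abs(approx - sq_loss_grad(x, y_val, theta)[key]) < 1e-6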
Example 12
def regularizationLossGradient(features, weights, true_value,
                               tuning_parameter):
    """Computes the value of the training loss gradient (with respect to the
    weight vector) at a specific example.

    Training loss includes a ridge (L2) regularization term.

    Args:
        features (dict): A sparse vector of feature values.
        weights (dict): A sparse vector of feature weights.
        true_value (int): The true value of an example.
        tuning_parameter (double): Coefficient of the ridge regularization term.

    Returns:
        A sparse vector (dict) representing the gradient value.
    """
    # Standard squared loss
    gradient = {}
    scale = 2 * (dotProduct(features, weights) - true_value)

    # Regularization term: add gradient of the regularization term to the
    # scaling factor (i.e. add gradient of |tuning_parameter| *
    # (2-norm of weights)^2
    increment(gradient, tuning_parameter, weights)
    increment(gradient, scale, features)
    return gradient
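
As with the lasso version in Example 9, the ridge gradient can be verified by hand on tiny vectors (values made up; assumes the dict helpers sketched after Example 1):

features = {'a': 1.0}
weights = {'a': 2.0, 'b': -3.0}
# prediction = 2.0, residual = 1.0, so scale = 2.0
g = regularizationLossGradient(features, weights, true_value=1.0,
                               tuning_parameter=0.5)
# 'a': ridge term 0.5*2.0 plus squared-loss term 2.0*1.0 = 3.0
# 'b': ridge term only, 0.5*(-3.0) = -1.5
assert g == {'a': 3.0, 'b': -1.5}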
Example 13
def pegasos_grad(X,y,w,lamb):
    tmp = y*dotProduct(w,X)
    if 1-tmp > 0:
        an1 = increment({},lamb,w)
        ans = increment(an1,y,X)
    else:
        ans = increment({},lamb,w)
    return ans
Example 14
File: hw3.py Project: pinesol/mlcs
def PegasosSubgradientLoss(x, y_val, theta, lambda_reg):
    '''Question 3.2: The Subgradient of the Pegasos Loss function.'''
    margin = y_val * util.dotProduct(theta, x)
    subgrad = theta.copy()
    util.scale(subgrad, lambda_reg)
    if margin < 1:
        util.increment(subgrad, -y_val, x)
    return subgrad
Example 15
File: hw3.py Project: pinesol/mlcs
def PercentageWrong(X, y, theta):
    '''Question 4.3: The percentage incorrect when using theta to predict y from X.'''
    num_wrong = 0
    for i, x in enumerate(X):
        estimate_sign = np.sign(util.dotProduct(theta, x))
        if estimate_sign != y[i]:
            num_wrong += 1
    return 1.0 * num_wrong / len(y)
Example 16
def per_loss(x,y,w):
    cnt = 0
    total = len(y)
    for i in range(total):
        if np.sign(dotProduct(w, x[i])) != np.sign(y[i]):
            cnt = cnt + 1
    error = 100.0 * cnt / total  # float division even under Python 2
    return error
Example 17
def learnBoostedRegression(examples, num_iters, step_size, num_trees):
    """Learns a linear regression model using boosted trees.

    Args:
        examples: An array of training examples.
        num_iters (int): Number of training iterations.
        step_size (int): Stochastic gradient descent step size.
        num_trees (int): Number of gradient boosting trees.

    Returns:
        A predictor function that outputs a price (int) given a single input
        tuple.
    """
    list_weights = []
    objectives = [cur[1] for cur in examples]

    filename = "boostedtree_" + str(num_trees -
                                    1) + "_" + str(cross_val_seg) + ".p"
    if num_trees > 1 and SAVE:
        (list_weights, num_trees_prev, num_iters_prev) = pickle.load(
            open(os.path.join("boostedtree_weights", filename), "rb"))

    for k in range(num_trees):
        if k >= len(list_weights):
            print ""
            print "TREE " + str(k + 1) + " OF " + str(num_trees)
            curWeights = defaultdict(int)
            for i in range(num_iters):
                for ind in range(len(examples)):
                    x = examples[ind][0]
                    gradient = regression.lassoLossGradient(
                        x, curWeights, objectives[ind], .5)
                    increment(curWeights, -step_size / (i + 1), gradient)
                if VERBOSE:
                    print "Training progress: " + str(
                        100.0 * (i + 1) / num_iters) + "%"

            list_weights.append(curWeights)
        else:
            curWeights = list_weights[k]

        for j in range(len(examples)):
            x, y = examples[j]
            objectives[j] = objectives[j] - dotProduct(x, curWeights)

        if VERBOSE: print "COMPLETE"

    if SAVE:
        filename = "boostedtree_" + str(num_trees) + "_" + str(
            cross_val_seg) + ".p"
        pickle.dump((list_weights, num_trees, num_iters),
                    open(os.path.join("boostedtree_weights", filename), "wb"))

    # Define the predictor function
    def predictor(x):
        return sum(dotProduct(x, curWeight) for curWeight in list_weights)

    return predictor
Example 18
def trainAndTest():

    # Import the training and test data as numpy arrays
    train_array = util.csvAsArray('data/train_updated.csv')
    test_array = util.csvAsArray('data/test.csv')
    # Generate a list of (feature vector, value) tuples for the training data
    feature_names = util.getCsvHeaders('data/train_updated.csv')
    train_examples = []
    k_examples = []
    for i in range(len(train_array)):
        feature_count = range(len(train_array[i]) - 1)
        feature_values = [train_array[i][j] for j in feature_count]
        feature_vector = featurize(feature_values, feature_names)
        output = train_array[i][len(train_array[0]) - 1]
        train_examples.append((feature_vector, output))
        k_examples.append(feature_vector)
    # Train a k-means model on the training data and evaluate its mean
    # squared error with the test data

    random.shuffle(train_examples)
    for i in range(0, NUM_SPLITS, 2):
        startTest = i * len(train_examples) / NUM_SPLITS
        endTest = (i + 1) * len(train_examples) / NUM_SPLITS
        currentTrainExamples = train_examples[0:startTest] + train_examples[
            endTest:len(train_examples)]
        (centroids, assign, loss, loss_list,
         centroid_vals) = kmeans(currentTrainExamples, NUM_CLUSTERS, 500)

        currentBoostedExamples = [(currentTrainExamples[ind][0],
                                   loss_list[ind])
                                  for ind in range(len(currentTrainExamples))]

        boostedRegPredictor = learnBoostedRegression(currentBoostedExamples, 500, \
                0.00000000001, num_trees=NUM_B_TREES)

        pre_computed_centroid_dots = [
            util.dotProduct(centroids[ind], centroids[ind])
            for ind in range(NUM_CLUSTERS)
        ]

        def kmeanspredictor(x):
            assignment = 0
            min_dist = 1000000
            for j in range(NUM_CLUSTERS):
                cur_dist = util.dotProduct(x, x) - 2 * util.dotProduct(
                    centroids[j], x) + pre_computed_centroid_dots[j]
                if cur_dist < min_dist:
                    assignment = j
                    min_dist = cur_dist
            return centroid_vals[assignment]

        def boostedKPredictor(x):
            return kmeanspredictor(x) + boostedRegPredictor(x)

        print "leaving out the", (
            i + 1
        ), "th segment of the data, the validation error for the regression is:", util.evaluatePredictor(
            boostedKPredictor, train_examples[startTest:endTest])
Example 19
def test3c1():
    weights = {}
    for _ in range(100):
        k = ''.join(random.choice(string.ascii_lowercase) for _ in range(5))
        v = random.uniform(-1, 1)
        weights[k] = v
    data = submission.generateDataset(100, weights)
    for phi, y in data:
        grader.require_is_equal(util.dotProduct(phi, weights) >= 0, y == 1)
Example 20
 def gradLoss(phiX, w, y):
     score = util.dotProduct(w, phiX)
     margin = score * y
     if margin < 1:
         for name, feature in phiX.iteritems():
             phiX[name] = -1 * y * feature
         return phiX
     else:
         return 0
Example 21
File: hw3.py Project: pinesol/mlcs
def PlotScoresAgainstAccuracy(X_training, y_training, X_testing, y_testing,
                              lambda_reg):
    '''Question 4.5.
    Divides the training set into buckets by score, and creates a bar chart showing the accuracy of
    each bucket.
    '''
    NUM_BUCKETS = 10

    theta = Pegasos(X_training, y_training, lambda_reg)
    # Calculate the score for each row in a list
    scores = [util.dotProduct(theta, x) for x in X_testing]

    low_score = min(scores)
    high_score = max(scores)

    # f(score) -> bucket
    score_to_bucket_func = lambda score: int(
        round((NUM_BUCKETS - 1) * (score - low_score) /
              (high_score - low_score)))

    # Make a list of empty lists with NUM_BUCKETS elements
    # Each entry is a list of the indexes of X's rows that fall in the same score bucket.
    score_histogram = [[] for _ in range(NUM_BUCKETS)]
    for row_index, score in enumerate(scores):
        bucket = score_to_bucket_func(score)
        score_histogram[bucket].append(row_index)

    bucket_means = [0.0] * NUM_BUCKETS
    bucket_accuracy = [0.0] * NUM_BUCKETS

    for bucket, row_indices in enumerate(score_histogram):
        # calculate the percentage wrong loss for each bucket
        # make a scatter plot of these
        bucket_scores = [scores[row_index] for row_index in row_indices]
        bucket_score_mean = abs(np.mean(bucket_scores))
        bucket_means[bucket] = bucket_score_mean
        bucket_score_std = np.std(bucket_scores)
        #        print 'Bucket', bucket, 'ranges from', min(bucket_scores), 'to', max(bucket_scores)
        #        print 'Bucket', bucket, 'mean:', bucket_score_mean
        #        print 'Bucket', bucket, 'stdev:', bucket_score_std
        X_bucket = [X_testing[row_index] for row_index in row_indices]
        y_bucket = [y_testing[row_index] for row_index in row_indices]
        bucket_accuracy[bucket] = 100 * (
            1.0 - PercentageWrong(X_bucket, y_bucket, theta))

    fig, ax = plt.subplots()
    ax.set_xlabel('Mean Score for Bucket')
    ax.set_ylabel('Percentage Correct')
    ax.set_title('Pegasos Sentiment Analysis: Score vs. Accuracy')
    width = 0.4
    positions = range(0, len(bucket_accuracy))
    rects1 = ax.bar(positions, bucket_accuracy, width, color='b', alpha=0.8)
    plt.xticks(rotation=-45)
    ax.set_xticks([pos + width for pos in positions])
    ax.set_xticklabels(["%0.1f" % mean for mean in bucket_means])
    plt.show()
Example 22
 def find_center(ex_index, example, precomputed_x, precomputed_quantities,
                 centroids):
     assign = 0
     min_dist = float('inf')
     for i in range(K):
         cur_dist = precomputed_x[ex_index] - 2 * util.dotProduct(
             centroids[i], example) + precomputed_quantities[i]
         if cur_dist < min_dist:
             assign = i
             min_dist = cur_dist
     return assign  # index of the nearest centroid
Example 23
def getAnswerProbs(weights, questionData):
    proposedAnswers = questionData["proposedAnswers"]
    correctIndex = questionData["correctAnswerIndex"]
    answerScores = []

    for aIndex, proposed in enumerate(proposedAnswers):
        score = dotProduct(weights, featureExtractor(proposed))
        answerScores.append(score)

    return softmax(answerScores)
Example 24
File: hw3.py Project: pinesol/mlcs
def Pegasos(X, y, lambda_reg, max_epochs=1000, check_gradient=False):
    '''Question 4.2.
    Finds the sparse weight vector that minimizes the SVM loss function on X and y.
    '''
    print 'Running Pegasos with regularization parameter', lambda_reg
    loss_func = lambda x, y_val, theta: PegasosLoss(x, y_val, theta, lambda_reg)
    gradient_loss_func = lambda x, y_val, theta: PegasosSubgradientLoss(
        x, y_val, theta, lambda_reg)

    # Initialize theta to have zero for every word mentioned in any review
    theta = {key: 0.0 for x in X for key in x.keys()}
    t = 2  # NOTE: This normally starts at zero, but that causes a divide-by-zero error.
    weight_scalar = 1.0

    for epoch in range(max_epochs):
        #        print '--Epoch', epoch
        old_theta = theta.copy()
        for j, x in enumerate(X):
            t += 1
            eta = 1.0 / (t * lambda_reg)
            margin = y[j] * weight_scalar * util.dotProduct(theta, x)
            # NOTE that the gradient is not differentiable at 1.0, so we don't check it near there.
            if check_gradient and abs(margin - 1.0) > 0.01:
                if not SparseGradChecker(loss_func, gradient_loss_func, x,
                                         y[j], theta):
                    print 'Computed gradient doesn\'t match approximations.'
                    sys.exit(1)
                grad = gradient_loss_func(x, y[j], theta)
                util.increment(theta, -eta, grad)
            else:
                weight_scalar *= 1.0 - 1.0 / t
                if margin < 1:
                    util.increment(theta, eta * y[j] / weight_scalar, x)
        util.increment(old_theta, -1, theta)
        util.scale(old_theta, weight_scalar)
        total_change = math.sqrt(util.dotProduct(old_theta, old_theta))
        #        print '----Change from previous theta:', total_change
        if total_change < 0.01:
            break
    util.scale(theta, weight_scalar)
    return theta
Example 25
 def learn(self, trainExamples):
     numIters = 10
     step = 0.0001
     for i in range(numIters):
         for feature_vec, y in trainExamples:
             score = util.dotProduct(self.weights, feature_vec)
             dloss = {}
             if score*y > 1:
                 continue
             else:
                 util.increment(dloss, -y, feature_vec)
             util.increment(self.weights, -step, dloss)
Example 26
def percent_error(X, y, w):
    correct = 0
    pos = 0
    total = len(y)
    for i in range(total):
        sign_value = np.sign(dotProduct(X[i], w))
        if y[i] == sign_value:
            correct += 1
        if sign_value > 0:
            pos += 1
    print pos
    return 1-float(correct)/total
Example 27
def chooseEval(examples, weights):
    correct = 0
    for i in range(len(examples)):
        prompt = examples[i][0][0]
        response1 = examples[i][0][1]
        randomInt = random.randint(0, len(examples)-1)
        response2 = examples[randomInt][0][1]
        # Relies on random to break loop
        while response1 == response2 or (neg_restrict_bad and isBadTurn(response2)) or response1[0].caller == response2[0].caller:
            randomInt = random.randint(0, len(examples)-1)
            response2 = examples[randomInt][0][1]
        guess1 = (prompt, response1)
        phi1 = swda_feature_extractor(guess1)
        score1 = dotProduct(weights, phi1)
        guess2 = (prompt, response2)
        phi2 = swda_feature_extractor(guess2)
        score2 = dotProduct(weights, phi2)
        if(score1 > score2):
            correct = correct + 1
        if(score1 == score2):
            correct = correct + .5 
    return 1.0 * correct / len(examples)
Example 28
def test3c0():
    ans = 0.05
    grader.require_is_equal(
        ans,
        util.dotProduct({
            'Movie': 0.1,
            'is': 0.2,
            'good': 0.25
        }, {
            'Movie': 0.1,
            'is': 0.2,
            'very': -0.25,
            'bad': -0.25
        }))
Example 29
    def predict(self, stories):
        if self.args.verbose > 3:
            print 'Predicting'

        if self.args.verbose:
            print "weights are ", self.weights

        ANS_LETTERS = ['A', 'B', 'C', 'D']
        ans = []
        for story in stories:
            if self.args.verbose > 0:
                print story.name
                print formatForPrint(story.rawPassage), "\n"
                print story.rawQuestions, "\n"
                print story.rawAnswers, "\n"

            for qid in range(len(story.questions)):
                scores = []
                for aid in range(len(story.answers[qid])):
                    score = max([(util.dotProduct(self.weights, self.extractFeatures(story, sid, qid, aid)),sid) for \
                                     sid in range(len(story.passageSentences))])
                    scores.append((score, aid))
                # if question contains "n't | not", and begin
                # with "what, who, whose", select the minium score.
                s = story.rawQuestions[qid][1].strip()
                if re.search('^(who|what|whose).*(n\'t|not)',
                             s,
                             flags=re.IGNORECASE):
                    answer = min(scores)[1]
                else:
                    answer = max(scores)[1]

                ans.append(ANS_LETTERS[answer])

                if self.args.verbose > 0:
                    if answer != story.correctAnswers[qid]:
                        print 'WRONG: %s: correct answer %s, predicted answer %s, scores %s' \
                            %(story.rawQuestions[qid][0], ANS_LETTERS[story.correctAnswers[qid]], ANS_LETTERS[answer], scores)
                    else:
                        print 'RIGHT: %s: correct answer %s, predicted answer %s, scores %s' \
                            %(story.rawQuestions[qid][0], ANS_LETTERS[story.correctAnswers[qid]], ANS_LETTERS[answer], scores)

            if self.args.verbose > 0:
                print "\n"

        return ans
Example 30
def pegasos_SGD(X,y,lamb,num_iter):

    w = {}
    t = 1
    s = 1
    for i in range(num_iter):

        for j in range(len(X)):

            t += 1
            alpha = 1.0/(t*lamb)
            tmp = y[j] * s * dotProduct(X[j], w)
            g = l_de(tmp)
            s *= (1 - alpha * lamb)
            w = increment(w, -(alpha*y[j]*g/s), X[j])
        print "epoch "+str(i)
    return increment({},s,w)
Example 31
def show_incorrect_case():
    train_data,test_data = read_files()
    X_train, y_train = data_label(train_data)
    X_test, y_test = data_label(test_data)

    lamb = 0.1
    w_o= pickle.load(open("weight",'rb'))

    test_len = len(X_test)

    example = 0

    for i in range(test_len):
        w = w_o
        tmp = dotProduct(X_test[i],w)
        if (np.sign(tmp)!=y_test[i]):
            example += 1
            print "predicted_score: ",np.sign(tmp)
            print "true vote: ",y_test[i]
            dict_tmp = dotProduct_vector(X_test[i],w)
            sorted_dict = sorted(dict_tmp.items(),key=lambda x: abs(x[1]),reverse=True)
            print_list_wx = sorted_dict[:8]
            print_list_abs_wx = [(a,abs(b)) for a,b in print_list_wx]
            print_list_x = [X_test[i][x] for (x,k) in print_list_wx ]
            print_list_w = [w[x] for (x,k) in print_list_wx ]

            print "wx:"
            print print_list_wx
            print "\n"


            print "abs_wx"
            print print_list_abs_wx
            print "\n"


            print "x"
            print print_list_x
            print "\n"


            print "w"
            print print_list_w
            if example == 3:
                break
Example 32
def score_interval():
    train_data,test_data = read_files()
    X_train, y_train = data_label(train_data)
    X_test, y_test = data_label(test_data)

    lamb = 1e-1

    w = pegasos_SGD(X_train,y_train,lamb,30)

    a = []
    test_len = len(X_test)
    for i in range(test_len):
        a.append(dotProduct(X_test[i],w))
    a.sort()
    print min(a)
    print max(a)
    plt.plot(a)
    plt.show()
Example 33
def score_confidence():
    train_data,test_data = read_files()
    X_train, y_train = data_label(train_data)
    X_test, y_test = data_label(test_data)

    lamb = 1e-1

    # w = pegasos_SGD(X_train,y_train,lamb,30)
    #
    # pickle.dump(w,open("weight",'wb'))
    w = pickle.load(open("weight",'rb'))

    a = [0]*18     #correct count
    b = [0]*18     #incorrect count
    c = [0]*18     #total number
    test_len = len(X_test)

    for i in range(test_len):
        tmp = dotProduct(X_test[i],w)
        c[int(tmp+9)] += 1
        if (np.sign(tmp)==y_test[i]):
            a[int(tmp+9)] += 1
        else:
            b[int(tmp+9)] += 1

    # for j in range(18):
    #     if b[j]==0:
    #         print "interval [%s,%s] has %s points and %s are correct, the ratio is %s "%(j-9,j-8,b[j],a[j],0)
    #     else:
    #         print "interval [%s,%s] has %s points and %s are correct, the ratio is %4.2f "%(j-9,j-8,b[j],a[j],float(a[j])/b[j])

    wide = 0.35
    p1 = plt.bar(np.arange(18),a,width=wide, color='g')
    p2 = plt.bar(np.arange(18),b,width=wide, color='r', bottom=a)

    plt.ylabel("Frequency")
    plt.xlabel("Score Intervals")
    # plt.title("Score Confidence")
    X_ticks = ["[%s,%s]"%(k-9,k-8) for k in np.arange(18)]

    plt.xticks(np.arange(18)+wide/2,X_ticks,rotation=45)
    plt.legend((p1[0],p2[0]),("Correct","Incorrect"))

    plt.savefig("t45_score_confidence.png")
Example 34
def guessEval(examples, weights):
    correct = 0
    for i in range(len(examples)):
        prompt = examples[i][0][0]
        maxScore = 0
        maxResponse = examples[0][0][1]
        for j in range(len(examples)):
            response = examples[j][0][1]
            if prompt[0].caller == response[0].caller:
                continue
            guess = (prompt, response)
            phi = swda_feature_extractor(guess)
            score = dotProduct(weights, phi)
            if score > maxScore:
                maxScore = score
                maxResponse = response
        if maxResponse == examples[i][0][1]:
            correct = correct + 1
    return 1.0 * correct / len(examples)
Example 35
    def fit(self, X):
        self.w_ = dict()
        t = 0

        for j in range(len(X)):
            t += 1
            step_size = 1 / (t * self.lambda_reg)

            w_dot_x = util.dotProduct(self.w_, X[j])
            y = 1 if 1 in X[j] else -1

            if y * w_dot_x < 1:
                util.increment(self.w_, -1 / t, self.w_)  # w_ <- (1 - 1/t) * w_
                util.increment(self.w_, step_size * y, X[j])

            else:
                util.increment(self.w_, -1 / t, self.w_)  # w_ <- (1 - 1/t) * w_

        return self.w_
Example 36
    def expectimax_value(self, game, action_list, depth=1):
        #Do all of the actions in action_list
        for action in action_list:
            game = game.players[self.turn_num].do_move(game, action)

        turn_num = (self.turn_num+1) % 4
        total_action_list = []      #Stores all of the actions made so that you can undo them

        while depth > 0:
            opp = game.players[turn_num]
            opp_action_list = self.guess_opp_move(opp, game)

            #Add opponent actions with the opponent object so we can undo them
            if opp_action_list:
                for opp_action in opp_action_list:
                    game = opp.do_move(game, opp_action)
                    total_action_list.append((opp.turn_num, opp_action))
            
            turn_num = (turn_num+1) % 4
            if turn_num == self.turn_num: depth -= 1

        #Find value of your estimated future state
        expected_features = self.feature_extractor(game)
        expected_score = util.dotProduct(expected_features, self.weights)
    
        for i in range(len(game.players)):
            player = game.players[i]
            print "cities and settlements: ", player.turn_num, player.cities_and_settlements
        print "total actions list: ", total_action_list

        #Undo the moves the opponents made
        for i in range(len(total_action_list)-1, -1, -1):
            to_undo = total_action_list[i]
            opp_num, opp_action = to_undo
            print "Loop cities and settles: ",opp_num, game.players[opp_num].cities_and_settlements
            game = game.players[opp_num].undo_move(game, opp_action)

        #Undo moves the player made
        for i in range(len(action_list)-1, -1, -1):
            game = game.players[self.turn_num].undo_move(game, action_list[i])

        return expected_score
Example 37
def pegasos(x, y, l):

    w = dict()
    t = 2
    temp_loss = 0
    cnt = 0
    flag = True
    for i in range(2):
        for j in range(len(x)):
            t = t + 1
            n = 1/(l*t)
            if y[j]*(dotProduct(w, x[j])) < 1:
                cnt = cnt +1
                temp = x[j].copy()
                increment(temp, (n*y[j]-1), temp)
                increment(w,-n*l,w)
                increment(w,1,temp)
            else:
                increment(w,-n*l,w)
    return w
Example 38
    def updateWeights(self, game):
        # print game
        #Get current score
        cur_features = self.feature_extractor(game)
        target = util.dotProduct(cur_features, self.weights)
        pred = self.prevScore
        
        # print "f**k" , cur_features["Player "+ str(1) + " Settlements"]

        #If there are no previous features you can't update
        if self.prevFeatures: 

            #Update weights
            diff = pred - target
            for feature, val in self.prevFeatures.items():
                # print diff, val
                self.weights[feature] -= self.eta * diff * val
        
        self.prevFeatures = cur_features
        self.prevScore = target
Example 39
def predict(weights, testSet, args):
    correct = 0
    incorrect = 0
    total = 0
    for data in testSet:
        data = json.loads(data)
        title = data['title']
        subreddit = data['subreddit']
        features = FeatureExtractor.extractFeatures(title, args)
        maxScore = float('-inf')
        prediction = ''

        for key in weights.keys():
            weightVector = weights[key]

            score = util.dotProduct(weightVector, features)
            if score > maxScore:
                prediction = key
                maxScore = score

        if prediction == subreddit:
            correct += 1
        else:
            if args.verbose:
                try:
                    print title
                    print "predicted: " + prediction.encode('utf-8')
                    print features
                    printRelevantWeights(weights, features)
                    print "-----------------"


                except UnicodeEncodeError:
                    print "error"
            incorrect += 1
        total += 1


    print 'accuracy ' + str(float(correct) / total)
    print 'wrong ' + str(float(incorrect) / total)
Example 40
def learnPredictor(trainExamples, testExamples, featureExtractor):
    '''
    Given |trainExamples| and |testExamples| (each one is a list of (x,y)
    pairs), a |featureExtractor| to apply to x, and the number of iterations to
    train |numIters|, return the weight vector (sparse feature vector) learned.

    You should implement stochastic gradient descent.

    Note: call evaluatePredictor() on both trainExamples and testExamples
    to see how you're doing as you learn after each iteration.
    '''

    weights = {}  # feature => weight
    stepSize = 1
    numIters = 15 
    for it in range(0, numIters):
        # iterate through every training example and extract the features of x
        for x, y in trainExamples:
            phi = featureExtractor(x)
            # print phi
            # if y * score < 1 (wrong prediction) then calculate gradient loss then update weight for each feature
            margin = y*util.dotProduct(weights, phi)
            if (1-margin) > 0:
                indicator = 1
            else:
                indicator = 0
            scale = stepSize*indicator*y
            increment(weights, scale, phi)  

        # this uses the defined feature extractor to predict the classification of x
        def predictor(x):
            phi = featureExtractor(x)
            score = util.dotProduct(phi, weights)
            return 1 if score > 0 else -1

        # Print out training and test error for every iteration:
        # print 'TRAINING ERROR:', util.evaluatePredictor(trainExamples, predictor)
        # print 'TEST ERROR:', util.evaluatePredictor(testExamples, predictor)
    return weights
Example 41
def main():

    path = ""
    image_name = ""
    if len(sys.argv) > 1:
        path = sys.argv[1]
        if not os.path.exists(path):
            print "The path provided does not exist."
            return
        directories = path.split('/')
        file_name = directories[len(directories) - 1]
    else:
        print "Please supply a path to an image."
        return

    os.system("python segment.py " + path)
    segments = []
    for f in os.listdir(SEGMENTS_PATH):
        if 'temp' in f and image_name in f:  # Ways to identify segments of the given path
            segments.append(os.path.join(SEGMENTS_PATH, f))

    f = open('weights.out')  # Read in weights
    weights = eval(f.readline())

    stop_sign_flag = False
    for segment in segments:
        score = util.dotProduct(weights,
                                seg_util.segmentFeatureExtractor(segment))

        if score >= 0:  # Stop sign found
            stop_sign_flag = True
            break

    if stop_sign_flag:
        print "Stop sign detected!"
    else:
        print "No stop sign detected"
Example 42
File: hw3.py Project: pinesol/mlcs
def ErrorAnalysis(X_training, y_training, X_testing, y_testing, lambda_reg):
    '''Question 5.1.
    Prints information about the top incorrect reviews, ordered by the magnitude of their score.
    '''
    theta = Pegasos(X_training, y_training, lambda_reg)
    scores = [util.dotProduct(theta, x) for x in X_testing]

    # (index, score) pairs, sorted by the score's absolute value in descending order.
    score_indexes = sorted(
        enumerate(scores),
        reverse=True,
        key=lambda index_score_pair: abs(index_score_pair[1]))

    num_incorrect_examples = 0
    MAX_NUM_WRONG_EXAMPLES = 10
    # Print out the information about all the incorrect examples, in order of largest score.
    for row_index, score in score_indexes:
        if num_incorrect_examples >= MAX_NUM_WRONG_EXAMPLES:
            break
        y_testing_val = y_testing[row_index]
        if np.sign(score) != np.sign(y_testing_val):
            x_testing_row = X_testing[row_index]
            PrintReviewInfo(x_testing_row, y_testing_val, score, theta)
            num_incorrect_examples += 1
Example 44
def hingeLoss(w, features, y):
    return max(0, 1 - dotProduct(w, features) * y)
Example 45
 def predictor(x):
     return dotProduct(x, weights)
Example 46
def trainAndTest():
    """Defines K-means clustering and perform clustered regression.
    """
    # Import the training and test data as numpy arrays
    train_array = util.csvAsArray('data/train_updated.csv')
    test_array = util.csvAsArray('data/test.csv')

    # Generate a list of (feature vector, value) tuples for the training data
    feature_names = util.getCsvHeaders('data/train_updated.csv')

    train_examples = []
    k_examples = []

    for i in range(len(train_array)):
        feature_count = range(len(train_array[i]) - 1)
        feature_values = [train_array[i][j] for j in feature_count]
        feature_vector = util.featurize(feature_values, feature_names)
        output = train_array[i][len(train_array[0]) - 1]
        train_examples.append((feature_vector, output))

    random.shuffle(train_examples)

    for i in range(1, NUM_SPLITS, 2):
        startTest = i * len(train_examples) / NUM_SPLITS
        endTest = (i + 1) * len(train_examples) / NUM_SPLITS
        currentTrain = train_examples[0:startTest] + train_examples[
            endTest:len(train_examples)]
        currentTest = train_examples[startTest:endTest]

        # Cluster the data using k-means
        (centroids, assign, loss, loss_list,
         centroid_vals) = kmeans.kmeans(currentTrain, NUM_CENTROIDS, K_ITERS)

        # Make clusters
        cluster_list = [[] for _ in range(len(centroids))]

        for j in range(len(currentTrain)):
            cluster_list[assign[j]].append(currentTrain[j])

        # Train a regression model on the training data (by cluster)
        # and evaluate its mean squared error with the train data
        regression_error = 0
        predictor_list = []
        pre_computed_centroid_dots = [
            util.dotProduct(centroids[k], centroids[k])
            for k in range(len(centroids))
        ]

        for cluster_points in cluster_list:
            boostedRegressionPredictor = boostedtree.learnBoostedRegression(
                cluster_points, SGD_ITERS, ETA, 5, 0)
            predictor_list.append(boostedRegressionPredictor)

        def predictor(x):
            # pick the nearest centroid and return that cluster's predictor
            assignment = 0
            min_dist = float('inf')
            for k in range(len(centroids)):
                cur_dist = util.dotProduct(x, x) - 2 * util.dotProduct(
                    centroids[k], x) + pre_computed_centroid_dots[k]
                if cur_dist < min_dist:
                    assignment = k
                    min_dist = cur_dist
            return predictor_list[assignment](x)

        regression_error = boostedtree.evaluatePredictor(
            predictor, currentTest)
        #regression_error /= len(train_examples)

        # Print the results
        print ""
        print "------------------"
        print "CLUSTERED REGRESSION WITH BOOSTING"
        print "------------------"
        print "Leaving out segment: " + str(i)
        print "Number of centroids: " + str(10)
        print "Number of examples: " + str(len(train_examples))
        print "Regression MSE:     " + str(regression_error)
        print ""

    return predictor_list, centroids, regression_error
Example 47
def test3c0():
    weights = {"hello": 1, "world": 1}
    data = submission.generateDataset(5, weights)
    for datapt in data:
        grader.requireIsEqual((util.dotProduct(datapt[0], weights) >= 0),
                              (datapt[1] == 1))
Example 48
def test2c():
    weights = {'hello': 1, 'world': 1}
    data = submission.generateDataset(5, weights)
    for datapt in data:
        grader.requireIsEqual(util.dotProduct(datapt[0], weights) >= 0,
                              datapt[1] == 1)
Example 49
def test3c_0():
    weights = {"hello": 1, "world": 1}
    data = submission.generateDataset(5, weights)
    for datapt in data:
        grader.requireIsEqual((util.dotProduct(datapt[0], weights) >= 0), (datapt[1] == 1))
Example 50
#Compare data
# threeCards = {}
# for _ in range(0,3): 
# 	thisChoice = random.choice(pCard.keys())
# 	threeCards[thisChoice] = util.extract(thisChoice,pCard)

# 	modelValue = util.dotProduct(weights,threeCards[thisChoice])
# 	oracleValue = 

correctCount = 0
totalCount = 0

for key,value in testData.iteritems():
	totalCount += 1
	compareResults = []
	myChoice = (0,0)
	trueChoice = (0,0)
	for i in range(0,3):
		if value[i] > trueChoice[1]: trueChoice = (key[i],value[i])
		thisValue = []
		modelValue = util.dotProduct(weights,util.extract(key[i],pCard))
		if modelValue > myChoice[1]: myChoice = (key[i],modelValue)
		thisValue.append((key[i],value[i],modelValue))
		compareResults.append(thisValue)
	if trueChoice[0] == myChoice[0]: 
		compareResults.append((trueChoice,True))
		correctCount += 1
	else: compareResults.append(False)
	print compareResults

print correctCount/float(totalCount)
Example 51
 def getTDScore(self, features):
     return util.dotProduct(self.weights_, features)
Example 52
def pegasos_loss(X,y,w,lamb):
    ans = (lamb/2.0)*dotProduct(w,w)+max(0,1-y*dotProduct(w,X))
    return ans
Example 53
 def predict(self, x):
     print "Learned Score:" + str(util.dotProduct(self.weights, x))
     return math.copysign(1.0, util.dotProduct(self.weights, x))
Example 54
def main():

    #loading the shuffled data
    with open('data.pickle', 'rb') as f:
        review = pickle.load(f)
    

    #Splitting into training and test sets
    train, test = split(review)

    #Splitting x and y values and getting ready for training
    x_train = []
    x_test = []
    y_train = []
    y_test = []

    for i in train:
        y_train.append(i.pop())
        x_train.append(bag_of_words(i))

    for i in test:
        y_test.append(i.pop())
        x_test.append(bag_of_words(i))

    
    l = 0.5

    print("Pegasos fast")
    
    start_time = time.time()
    w1 = pegasos_fast(x_train, y_train, l)
    time1 = time.time() - start_time
    print("--- %s seconds ---" % (time1) )
    #print(len(w1))
    #print("percentage error:", per_loss(x_test,y_test,w1))


    #error analysis
    pos = []
    count = 0
    for i in range(len(y_test)):
        if np.sign(dotProduct(w1, x_test[i])) != np.sign(y_test[i]) and count<2:
            count = count +1
            pos.append(i)

    wrong1 = x_test[pos[0]].copy()
    new_wrong1 = wrong1.copy()
    abs_wrong1 = wrong1.copy()
    print("wrong 1, real:", y_test[pos[0]])
    wrong2 = x_test[pos[1]].copy()
    new_wrong2 = wrong2.copy()
    abs_wrong2 = wrong2.copy()
    print("wrong 2, real:", y_test[pos[1]])

    #multiplying weight

    for i in wrong2:
        wt = w1.get(i,0)
        abs_wrong2[i] = abs(abs_wrong2[i]*wt)
        wrong2[i] = wrong2[i]*wt


    # sort features by the absolute value of w*x, largest first
    keys = sorted(abs_wrong2, key=abs_wrong2.get, reverse=True)
    #print("name:","         absolute product","           product", "          x","            w")
    for k in keys:
        print(k, ",", abs_wrong2[k], ",", wrong2[k], ",", new_wrong2[k], ",", w1.get(k, 0), ",")
Example 55
 def loss(w, phi, y):
     return max(1 - util.dotProduct(w, phi) * y, 0)
Example 56
def printExamples(examples, weights, featureExtractor, SCORE_THRESHOLD):
    # random.jumpahead(1)
    random.shuffle(examples)
    # SCORE_THRESHOLD = .5
    print "Finding Interesting Examples..."

    def printExample(label, example, phi):
        print "FOUND: " + label
        print "Prompt"
        for utt in example[0][0]:
            print utt.text_words()
        print "Response"
        for utt in example[0][1]:
            print utt.text_words()
        for key in phi:
            if key in weights:
                print "{0}: {1}".format(key, weights[key])

    tpFound = fpFound = tnFound = fnFound = False
    for example in examples:
        prompt, response = example[0]
        if not examineCollab(prompt, response): continue
        phi = featureExtractor(example[0])
        score = dotProduct(weights, phi)
        if not tpFound and score > SCORE_THRESHOLD and example[1] == 1:
            printExample("True Positive", example, phi)
            tpFound = True
        if not fpFound and score > SCORE_THRESHOLD and example[1] == -1:
            printExample("False Positive", example, phi)
            fpFound = True
        if not tnFound and score < -SCORE_THRESHOLD and example[1] == -1:
            printExample("True Negative", example, phi)
            tnFound = True
        if not fnFound and score < -SCORE_THRESHOLD and example[1] == 1:
            printExample("False Negative", example, phi)
            fnFound = True
        if tpFound and fpFound and tnFound and fnFound:
            break