Example #1
  def cross_validate(self, dataset, num_folds):
    self.rho_list = [0.1] * 12
    self.alpha_list = [7.3476104736328125, 6.8925933837890625, 8.2190093994140625, 8.2137908935546875, 8.2814483642578125, 7.9523162841796875, 7.9453582763671875, 6.6873321533203125, 7.2606353759765625, 7.0821990966796875, 7.1634979248046875, 8.4375]

    if 1:
      dataset.createFolds(num_folds)
      for rho in GlmNet.rho_values:
        print "rho: %f" % (rho)
        best_rmsle_list = [float("inf")] * 12
        min_alpha_list = np.array([self.min_alpha] * 12)
        max_alpha_list = np.array([self.max_alpha] * 12)
        rho_list = [rho] * 12
        self.search(dataset, num_folds, self.max_depth, self.max_width, min_alpha_list, max_alpha_list, rho_list)
        
  def train(self, dataset):
    if self.debug:
      print "Training elastic net..."
    self._train(dataset, self.alpha_list, self.rho_list)
    
  def predict(self, dataset):
    features = dataset.getFeatures()
    num_samples, num_features = features.shape

    predictions = np.zeros((num_samples, 12))
    for month_ind in range(12):
      predictions[:, month_ind] = self.regressor_list[month_ind].predict(features)

    return predictions
    
LearnerBase.register(GlmNet)
Example #2
                self._train(fold_train, k)
                cur_score.addFold(fold_test.getSales(),
                                  self.predict(fold_test))

            for month_ind in range(12):
                cur_rmsle = cur_score.getRMSLE(month_ind)
                if cur_rmsle < best_rmsle_list[month_ind]:
                    best_rmsle_list[month_ind] = cur_rmsle
                    best_k_list[month_ind] = k

        self.k_list = list(best_k_list)
        if self.debug:
            print "Best k-values by month: %s" % str(self.k_list)

    def train(self, dataset):
        self._train(dataset, None, k_list=self.k_list)

    def predict(self, dataset):
        features = dataset.getFeatures()
        num_samples, num_features = features.shape

        predictions = np.zeros((num_samples, 12))
        for month_ind in range(12):
            predictions[:,
                        month_ind] = self.knn_list[month_ind].predict(features)

        return predictions


LearnerBase.register(NearestNeighbor)
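# A self-contained sketch (hypothetical helper, not the project's Score class)
# of the RMSLE metric that the per-month cross-validation above minimizes
# when it keeps the best k for each month.
import numpy as np

def rmsle(actual, predicted):
    # Root mean squared logarithmic error; both inputs must be non-negative.
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    return np.sqrt(np.mean((np.log1p(predicted) - np.log1p(actual)) ** 2))

print(rmsle([10.0, 0.0, 50.0], [12.0, 1.0, 45.0]))  # close predictions -> small RMSLE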
Example #3
    def predict(self, x):
        """ TODO fix this somehow. """
        return 0.0

    def grade(self, features, essay_set, domain, options={}):
        """Return an integer grade for each feature vector in the specified array."""

        # Read one score per line; the with-block closes the file afterwards.
        with open('output/ds.set%d.dom%d.%s.matOut' % (essay_set, domain, options['postfix']), 'r') as f:
            scores = [float(line) for line in f]

        if options['round']:
            grades = [int(round(score)) for score in scores]
        else:
            grades = [self.curve.curve(score) for score in scores]

        return np.asarray(grades)

    def set_curve(self, scores, grade_counts):
        """Set curve with histogram."""
        self.curve = Curve(scores, histogram=grade_counts)


LearnerBase.register(MatlabExample)
Example #4
                    fold_train = dataset.getTrainFold(fold_ind)
                    fold_test = dataset.getTestFold(fold_ind)
                    self.rf = RandomForestClassifier(n_estimators=n, min_split=split)
                    self.rf.fit(fold_train.getFeatures(), fold_train.getLabels())
                    fold_score = Score.Score(fold_test.getLabels(), self.predict(fold_test.getFeatures()))
                    prediction_inds = dataset.getTestFoldInds(fold_ind)
                    learner_predictions[prediction_inds] = self.predict(fold_test.getFeatures())

                cur_score = Score.Score(dataset.getLabels(), learner_predictions).getLogLoss()
                if cur_score < best_score:
                    if self.debug:
                        print "Achieved new best score %f" % cur_score
                    best_params = (n, split)
                    best_score = cur_score

        self.n_estimators, self.min_split = best_params

    def train(self, features, labels):
        if self.debug:
            print "Training random forest with n_estimators=%d, min_split=%d" % (self.n_estimators, self.min_split)
        self.rf = RandomForestClassifier(n_estimators=self.n_estimators, min_split=self.min_split)
        self.rf.fit(features, labels)

    def predict(self, features):
        num_samples, num_features = features.shape
        probs = [prob[1] for prob in self.rf.predict_proba(features)]
        return np.minimum(np.maximum(probs, 0.01 * np.ones(num_samples)), 0.99 * np.ones(num_samples))


LearnerBase.register(RandomForest)
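# A minimal sketch (synthetic labels, not part of the RandomForest learner above)
# of why predict() clips probabilities to [0.01, 0.99]: a single confident
# mistake at probability 0 or 1 blows up the log loss.
import numpy as np

def log_loss(labels, probs):
    # Binary log loss for labels in {0, 1} and probs = P(label == 1).
    labels = np.asarray(labels, dtype=float)
    probs = np.asarray(probs, dtype=float)
    return -np.mean(labels * np.log(probs) + (1 - labels) * np.log(1 - probs))

labels = np.array([1, 0, 1, 1])
raw = np.array([1.0, 0.0, 1.0, 0.0])       # last prediction is a confident mistake
clipped = np.clip(raw, 0.01, 0.99)         # same bounds as the predict() above

print(log_loss(labels, clipped))           # finite, dominated by the one mistake
# Scoring the unclipped raw probabilities would hit log(0) and be undefined.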
        print "Learning on month %d of 12 with %d samples..." %(month_ind+1, num_samples)
      
      A = np.hstack((month_features, np.ones((num_samples,1))))
      
      month_params, residues, rank, s = linalg.lstsq(A, dataset.getSalesForMonth(month_ind))
      params[:, month_ind] = month_params
    
    self.params = params
    
  def predict(self, dataset):
    features = dataset.getQuantitativeFeatures()
    
    num_samples, num_features = features.shape
    A = np.hstack((features, np.ones((num_samples,1))))
    sales = A.dot(self.params)
    return np.maximum(sales, np.zeros(sales.shape))

    
LearnerBase.register(QuantLinearRegression)
Example #6
      if cur_score < best_score:
        if self.debug:
          print "Achieved new best score %f" %cur_score
        best_k = k
        best_score = cur_score
        
    self.k = best_k
        
  def train(self, features, labels):
    self._train_with_k(features, labels, self.k)
    
  def predict(self, features):
    num_samples, num_features = features.shape
    A = np.hstack((features, np.ones((num_samples,1))))
    probs = A.dot(self.params)
    return np.minimum(np.maximum(probs, 0.01*np.ones(num_samples)), 0.99*np.ones(num_samples))

    
LearnerBase.register(RidgeRegression)
Example #7
        if self.debug:
            print "Train SuportVectorMachines with %d features..." % (
                dataset.getNumFeatures())

        self.svr_list = []

        for month_ind in range(12):
            month_features = dataset.getFeaturesForMonth(month_ind)
            if self.debug:
                num_samples = month_features.shape[0]
                print "Learning on month %d of 12 with %d samples..." % (
                    month_ind + 1, num_samples)

            svr = SVR()
            svr.fit(month_features, dataset.getSalesForMonth(month_ind))
            self.svr_list.append(svr)

    def predict(self, dataset):
        features = dataset.getFeatures()
        num_samples, num_features = features.shape

        predictions = np.zeros((num_samples, 12))
        for month_ind in range(12):
            predictions[:,
                        month_ind] = self.svr_list[month_ind].predict(features)

        return predictions


LearnerBase.register(SupportVectorMachines)
Example #8
            self.best_rmsle_list = [float("inf")] * 12

            self.min_n_estimators_list = np.array([self.min_n_estimators] * 12)
            self.max_n_estimators_list = np.array([self.max_n_estimators] * 12)
            self.search(
                dataset,
                num_folds,
                self.max_depth,
                self.max_width,
                self.min_n_estimators_list,
                self.max_n_estimators_list,
            )

    def train(self, dataset):
        if self.debug:
            print "Training GradientBoosting model with %d features..." % (dataset.getNumFeatures())
        self._train(dataset, self.n_estimators_list)

    def predict(self, dataset):
        features = dataset.getFeatures()
        num_samples, num_features = features.shape

        predictions = np.zeros((num_samples, 12))
        for month_ind in range(12):
            predictions[:, month_ind] = self.regressor_list[month_ind].predict(features)

        return predictions


LearnerBase.register(GradientBoosting)
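# A standalone sketch (synthetic data, hypothetical values) of the per-month
# regressors the GradientBoosting learner above presumably builds: one
# sklearn GradientBoostingRegressor per month, each with its own tuned
# n_estimators taken from self.n_estimators_list.
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

rng = np.random.RandomState(0)
X = rng.rand(200, 4)
y = np.sin(3 * X[:, 0]) + 0.1 * rng.randn(200)

n_estimators_list = [50, 100]              # e.g. two months' worth of tuned values
regressor_list = []
for n in n_estimators_list:
    reg = GradientBoostingRegressor(n_estimators=n)
    reg.fit(X, y)
    regressor_list.append(reg)

print(regressor_list[0].predict(X[:3]))    # per-month predictions, as in predict()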
Example #9
  def cross_validate(self, dataset, num_folds):
    pass
    
  def train(self, dataset):
    if self.debug:
      print "Train SuportVectorMachines with %d features..." %(dataset.getNumFeatures())

    self.svr_list = []
    
    for month_ind in range(12):
      month_features = dataset.getFeaturesForMonth(month_ind)
      if self.debug:
        num_samples = month_features.shape[0]
        print "Learning on month %d of 12 with %d samples..." %(month_ind+1, num_samples)

      svr = SVR()
      svr.fit(month_features, dataset.getSalesForMonth(month_ind))
      self.svr_list.append(svr)
    
  def predict(self, dataset):
    features = dataset.getFeatures()
    num_samples, num_features = features.shape

    predictions = np.zeros((num_samples, 12))
    for month_ind in range(12):
      predictions[:, month_ind] = self.svr_list[month_ind].predict(features)

    return predictions
    
LearnerBase.register(SupportVectorMachines)
Example #10
        if self.debug:
            print "Running linear regression with %d quantitative features..." % (
                num_features)

        for month_ind in range(12):
            month_features = dataset.getQuantitativeFeaturesForMonth(month_ind)
            num_samples = month_features.shape[0]

            if self.debug:
                print "Learning on month %d of 12 with %d samples..." % (
                    month_ind + 1, num_samples)

            A = np.hstack((month_features, np.ones((num_samples, 1))))

            month_params, residues, rank, s = linalg.lstsq(
                A, dataset.getSalesForMonth(month_ind))
            params[:, month_ind] = month_params

        self.params = params

    def predict(self, dataset):
        features = dataset.getQuantitativeFeatures()

        num_samples, num_features = features.shape
        A = np.hstack((features, np.ones((num_samples, 1))))
        sales = A.dot(self.params)
        return np.maximum(sales, np.zeros(sales.shape))


LearnerBase.register(QuantLinearRegression)
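# A quick standalone check (made-up data) of the bias-column trick used in the
# training loop and predict() above: appending a column of ones to the feature
# matrix lets linalg.lstsq fit an intercept alongside the per-feature weights.
import numpy as np
from numpy import linalg

rng = np.random.RandomState(0)
X = rng.rand(50, 3)                              # 50 samples, 3 features
y = X.dot(np.array([2.0, -1.0, 0.5])) + 4.0      # true intercept is 4.0

A = np.hstack((X, np.ones((X.shape[0], 1))))     # same augmentation as above
params, residues, rank, s = linalg.lstsq(A, y)

print(params)    # last entry is the fitted intercept (about 4.0)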
Example #11
            for split in RandomForest.split_values:
                print "Split: %d" % (split)
                self.min_n_estimators_list = np.array([0] * 12)
                self.max_n_estimators_list = np.array([150] * 12)

                self.search(dataset, num_folds, self.max_depth, self.max_width,
                            self.min_n_estimators_list,
                            self.max_n_estimators_list, [split] * 12)

    def train(self, dataset):
        if self.debug:
            print "Training random forest with n_estimators_list: %s, min_samples_split_list: %s" % (
                str(self.n_estimators_list), str(self.min_samples_split_list))
        self._train(dataset, self.n_estimators_list,
                    self.min_samples_split_list)

    def predict(self, dataset):
        features = dataset.getFeatures()
        num_samples, num_features = features.shape

        predictions = np.zeros((num_samples, 12))

        for month_ind in range(12):
            predictions[:, month_ind] = self.regressor_list[month_ind].predict(
                features)
        return predictions


LearnerBase.register(RandomForest)
Example #12
      print "Training SVM regression with C=%d, poly_degree=%d" %(self.C, self.poly_degree)
    self._train_with_values(dataset, poly_degree=self.poly_degree, C=self.C)
    
  def predict(self, dataset):
    assert self.svm_list is not None
    
    self._format_test_data(dataset)
    num_samples = dataset.getNumSamples()
    num_features = dataset.getNumFeatures()

    predictions = np.zeros((num_samples, 12))

    for month_ind in range(12):
      predictions[:, month_ind] = svmlight.classify(self.svm_list[month_ind], self.formatted_data)
    return predictions
    
    
LearnerBase.register(SVMRegression)
Example #13
      for fold_ind in range(num_folds):
        fold_train = dataset.getTrainFold(fold_ind)
        fold_test = dataset.getTestFold(fold_ind)
        self._train(fold_train, k)
        cur_score.addFold(fold_test.getSales(), self.predict(fold_test))
        
      for month_ind in range(12):
        cur_rmsle = cur_score.getRMSLE(month_ind)
        if cur_rmsle < best_rmsle_list[month_ind]:
          best_rmsle_list[month_ind] = cur_rmsle
          best_k_list[month_ind] = k
          
    self.k_list = list(best_k_list)  
    if self.debug:
      print "Best k-values by month: %s" %str(self.k_list)
    
  def train(self, dataset):
    self._train(dataset, None, k_list=self.k_list)
    
  def predict(self, dataset):
    features = dataset.getFeatures()
    num_samples, num_features = features.shape

    predictions = np.zeros((num_samples, 12))
    for month_ind in range(12):
      predictions[:, month_ind] = self.knn_list[month_ind].predict(features)

    return predictions
    
LearnerBase.register(NearestNeighbor)
Example #14
  def cross_validate(self, dataset, num_folds):
    pass
    
  def train(self, dataset):
    if self.debug:
      print "Train SuportVectorMachines with %d features..." %(dataset.getNumFeatures())

    self.regressor_list = []
    
    for month_ind in range(12):
      month_features = dataset.getFeaturesForMonth(month_ind)
      if self.debug:
        num_samples = month_features.shape[0]
        print "Learning on month %d of 12 with %d samples..." %(month_ind+1, num_samples)

      regressor = DecisionTreeRegressor()
      regressor.fit(month_features, dataset.getSalesForMonth(month_ind))
      self.regressor_list.append(regressor)
    
  def predict(self, dataset):
    features = dataset.getFeatures()
    num_samples, num_features = features.shape

    predictions = np.zeros((num_samples, 12))
    for month_ind in range(12):
      predictions[:, month_ind] = self.regressor_list[month_ind].predict(features)

    return predictions
    
LearnerBase.register(DecisionTree)
Example #15
        ]

        if 1:
            dataset.createFolds(num_folds)
            self.best_rmsle_list = [float("inf")] * 12

            self.min_n_estimators_list = np.array([self.min_n_estimators] * 12)
            self.max_n_estimators_list = np.array([self.max_n_estimators] * 12)
            self.search(dataset, num_folds, self.max_depth, self.max_width,
                        self.min_n_estimators_list, self.max_n_estimators_list)

    def train(self, dataset):
        if self.debug:
            print "Training GradientBoosting model with %d features..." % (
                dataset.getNumFeatures())
        self._train(dataset, self.n_estimators_list)

    def predict(self, dataset):
        features = dataset.getFeatures()
        num_samples, num_features = features.shape

        predictions = np.zeros((num_samples, 12))
        for month_ind in range(12):
            predictions[:, month_ind] = self.regressor_list[month_ind].predict(
                features)

        return predictions


LearnerBase.register(GradientBoosting)
Example #16
        print "Learning on month %d of 12 with %d samples..." %(month_ind+1, num_samples)
      
      A = np.hstack((month_features, np.ones((num_samples,1))))
      
      month_params, residues, rank, s = linalg.lstsq(A, dataset.getSalesForMonth(month_ind))
      params[:, month_ind] = month_params
    
    self.params = params
    
  def predict(self, dataset):
    features = dataset.getFeatures()
    
    num_samples, num_features = features.shape
    A = np.hstack((features, np.ones((num_samples,1))))
    sales = A.dot(self.params)
    return np.maximum(sales, np.zeros(sales.shape))

    
LearnerBase.register(LinearRegression)
Example #17
        if 1:
            dataset.createFolds(num_folds)
            for rho in GlmNet.rho_values:
                print "rho: %f" % (rho)
                best_rmsle_list = [float("inf")] * 12
                min_alpha_list = np.array([self.min_alpha] * 12)
                max_alpha_list = np.array([self.max_alpha] * 12)
                rho_list = [rho] * 12
                self.search(dataset, num_folds, self.max_depth, self.max_width,
                            min_alpha_list, max_alpha_list, rho_list)

    def train(self, dataset):
        if self.debug:
            print "Training elastic net..."
        self._train(dataset, self.alpha_list, self.rho_list)

    def predict(self, dataset):
        features = dataset.getFeatures()
        num_samples, num_features = features.shape

        predictions = np.zeros((num_samples, 12))
        for month_ind in range(12):
            predictions[:, month_ind] = self.regressor_list[month_ind].predict(
                features)

        return predictions


LearnerBase.register(GlmNet)
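# A minimal sketch (synthetic data) of what each per-month regressor above
# presumably wraps: an elastic net, here via sklearn's ElasticNet, where this
# code's "rho" plays the role of the modern l1_ratio parameter (the L1/L2 mix)
# and alpha is the overall regularization strength.
import numpy as np
from sklearn.linear_model import ElasticNet

rng = np.random.RandomState(0)
X = rng.rand(100, 5)
y = X.dot(np.array([1.0, 0.0, 0.0, 2.0, -1.0])) + 0.01 * rng.randn(100)

model = ElasticNet(alpha=0.1, l1_ratio=0.5)   # rho = 0.5 in the lists above
model.fit(X, y)
print(model.coef_)   # the L1 part tends to zero out the uninformative features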