Beispiel #1
0
def predictScores(trainFeatures,trainTargets,testFeatures,testItemIds,isRegression = False):
    """Fit an SGD model, score the test items and write their ids ranked by score.

    Args:
        trainFeatures: training feature matrix; must expose ``.shape[0]``.
        trainTargets: training targets; must support ``len()``.
        testFeatures: feature matrix of the items to score.
        testItemIds: ids aligned with the rows of ``testFeatures``; each id
            is written with ``%d`` and therefore has to be numeric.
        isRegression: when true an ``SGDRegressor`` is fitted and its raw
            predictions are the scores; otherwise an ``SGDClassifier`` with
            logistic loss is fitted and the second ``predict_proba`` column
            is used.

    Side effects:
        Writes ``avito_starter_solution.csv`` into ``dataFolder``: an ``id``
        header followed by one item id per line, highest score first.
    """
    logging.info("Feature preparation done, fitting model...")

    # NOTE(review): loss="log" and class_weight="auto" are the spellings of
    # older scikit-learn releases; newer releases renamed them -- confirm
    # against the installed version before upgrading.
    if isRegression:
        clf = SGDRegressor(penalty="l2", alpha=1e-4)
    else:
        clf = SGDClassifier(loss="log",
                            penalty="l2",
                            alpha=1e-4,
                            class_weight="auto")

    print("trainFeatures rows::"+str(trainFeatures.shape[0]))
    print("trainTargets rows::"+str(len(trainTargets)))
    clf.fit(trainFeatures,trainTargets)
    logging.info("Predicting...")
    if isRegression:
        predicted_scores = clf.predict(testFeatures)
    else:
        predicted_scores = clf.predict_proba(testFeatures).T[1]

    logging.info("Write results...")
    output_file = "avito_starter_solution.csv"
    logging.info("Writing submission to %s" % output_file)
    # The context manager closes the file even if formatting an id raises.
    with open(os.path.join(dataFolder,output_file), "w") as f:
        f.write("id\n")
        # Tuples sort by score first, then by item id; reverse=True puts the
        # highest score on top.
        for pred_score, item_id in sorted(zip(predicted_scores, testItemIds), reverse = True):
            f.write("%d\n" % (item_id))
Beispiel #2
0
def sgd(X_train, y_train, X_validate, y_validate, X_test, cw, alpha, regression=False):
  """Train an SGD model on mini-batches and return scores for ``X_test``.

  Args:
    X_train, y_train: training data; both are indexed with the batches
      produced by ``minibatch_generator`` and ``y_train`` must expose
      ``.shape[0]``.
    X_validate, y_validate: hold-out data passed to ``AUC`` for progress
      reporting.
    X_test: data to score once training is finished.
    cw: weight given to label 1 -- passed as ``class_weight={1: cw}`` to the
      classifier, or used as the sample weight of rows with ``y == 1`` for
      the regressor.
    alpha: ``alpha`` argument forwarded to the estimator.
    regression: fit ``SGDRegressor`` and score with ``predict`` when true;
      otherwise fit ``SGDClassifier(loss='log')`` and score with column 1
      of ``predict_proba``.

  Returns:
    The scores of ``X_test`` computed by the rule described above.
  """
  if regression:
    clf = SGDRegressor(alpha=alpha)
  else:
    clf = SGDClassifier(class_weight = {1:cw}, alpha=alpha, loss='log')
  # Single-argument print(...) behaves the same as the print statement on
  # Python 2 and also parses on Python 3.
  print(clf)

  def score(X):
    # One scoring rule shared by validation reporting and the final result.
    if regression:
      return clf.predict(X)
    return clf.predict_proba(X)[:,1]

  training_data_size = y_train.shape[0]
  n_iter = 3
  mb_size = 100
  iter_mb = minibatch_generator(training_data_size, mb_size = mb_size, n_iter = n_iter)
  total = 0
  # Floor division keeps the integer result Python 2 produced with "/".
  n_total_batch = n_iter*training_data_size//mb_size
  t0 = time()
  recent_auc = []
  for n_batch, batch in enumerate(iter_mb):
    x, y = X_train[batch], y_train[batch]
    if regression:
      # The regressor has no class_weight, so label-1 rows are up-weighted
      # per sample instead.
      sw = np.ones(y.shape[0])
      sw[np.where(y==1)[0]] = cw
      clf.partial_fit(x, y, sample_weight=sw)
    else:
      clf.partial_fit(x, y, classes = [1, 0])
    total += y.shape[0]
    # Progress report every 1000 batches.
    if (n_batch+1)%1000 == 0:
      print('auc:%.3f, %d samples in %ds (cw: %.2f)' %(AUC(y_validate, score(X_validate)), total, time()-t0, cw))
    # Collect the validation AUC of the final ~100 batches for the summary.
    if n_batch>n_total_batch-100:
      recent_auc.append(AUC(y_validate, score(X_validate)))
  latest_auc_avg = np.mean(recent_auc)
  print('cw=%.2f, avg auc of last %d bathes: %.3f' %(cw, len(recent_auc), latest_auc_avg))
  return score(X_test)
Beispiel #3
0
def predictCrossValidatedScore(trainFeatures,trainTargets,trainItemIds,isRegression = False):
    """Estimate model error on a random 50/50 hold-out split of the training data.

    The rows are shuffled, the first half is used to fit the model and the
    second half to validate it. The mean squared error between the
    validation targets and the predicted scores is printed and returned.

    Args:
        trainFeatures: feature matrix; must expose ``.shape[0]`` and accept
            a list of row indices as an index.
        trainTargets: targets, indexable by integer position.
        trainItemIds: not used by this function; kept for interface
            compatibility.
        isRegression: when true an ``SGDRegressor`` (L1 penalty) is fitted
            and ``predict`` supplies the scores; otherwise an
            ``SGDClassifier`` with logistic loss is fitted and the second
            ``predict_proba`` column is used.

    Returns:
        The value of ``mean_squared_error`` on the validation half.
    """
    logging.info("Feature preparation done, fitting model...")

    numRows = trainFeatures.shape[0]
    randomPermutation = random.sample(range(numRows), numRows)
    numPointsTrain = int(numRows*0.5)
    trainIdx = randomPermutation[:numPointsTrain]
    validationIdx = randomPermutation[numPointsTrain:]

    dataTrainFeatures = trainFeatures[trainIdx]
    dataValidationFeatures = trainFeatures[validationIdx]

    dataTrainTargets = [trainTargets[i] for i in trainIdx]
    dataValidationTargets = [trainTargets[i] for i in validationIdx]

    # NOTE(review): loss="log" and class_weight="auto" are the spellings of
    # older scikit-learn releases -- confirm against the installed version.
    if isRegression:
        clf = SGDRegressor(penalty="l1", alpha=1e-4)
    else:
        clf = SGDClassifier(loss="log",
                            penalty="l2",
                            alpha=1e-4,
                            class_weight="auto")

    # These counts describe the full input, not the training half.
    print("trainFeatures rows::"+str(trainFeatures.shape[0]))
    print("trainTargets rows::"+str(len(trainTargets)))
    clf.fit(dataTrainFeatures,dataTrainTargets)
    logging.info("Predicting...")
    if isRegression:
        predicted_scores = clf.predict(dataValidationFeatures)
    else:
        predicted_scores = clf.predict_proba(dataValidationFeatures).T[1]

    error = mean_squared_error(dataValidationTargets,predicted_scores)
    print("% Error:"+ str(error))
    # Hand the metric back so callers can compare runs programmatically.
    return error
# NOTE(review): this regressor is fitted and then replaced a few lines below
# without being used; kept only to preserve the original run.
sgd = SGDRegressor()
sgd.fit(x_train, y)
# print sgd.coef_

# The model actually used for prediction.
sgd = LogisticRegression()
sgd.fit(x_train, y)
# print sgd.coef_
x_train = None  # drop the reference to the training matrix

# NOTE(review): numeric_cols is passed for both column arguments -- confirm
# the second one should not be the categorical column list.
num_test_matrix, cat_test_matrix = encoding(test, numeric_cols, numeric_cols)
# One dict per row, transformed with the already fitted vectorizer.
x_cat_test_data = cat_test_matrix.T.to_dict().values()
vec_x_cat_test = vectorized.transform(x_cat_test_data)
# print vec_x_cat_test, vec_x_cat_test.shape
x_test = np.hstack((num_test_matrix, vec_x_cat_test))

prob =  sgd.predict_proba(x_test)
# print prob
# print type(prob[:,1]), prob[:,1].shape

# Emit "<id>,<second probability column>" for every test row. The row is
# printed directly so the builtin ``str`` is no longer rebound at module
# level, and the parenthesised print works on Python 2 and 3 alike.
rows, = prob[:,1].shape
for i in range(rows):
    print("%s,%s" %(ID[i], prob[i,1]))
# print sgd.predict(x_test)

# can only process categorical columns whose values are integers
# enc = preprocessing.OneHotEncoder()
#print enc.fit(cat_matrix)
# cat_matrix.fillna('NA', inplace=True)

                                                    y,
                                                    test_size=0.3,
                                                    random_state=42)

#---------------------------------------------------------------------
# Logistic regression classifier.
# NOTE(review): the original header said "with SGD solver", but
# LogisticRegression() is built with default arguments, so no solver is
# chosen here -- confirm which solver is intended.
# NOTE(review): the model is fitted on the first 100 rows of X_test/y_test,
# not on a training split -- confirm this is deliberate.
#---------------------------------------------------------------------
clf = LogisticRegression().fit(X_test[:100], y_test[:100])

#---------------------------------------------------------------------
# Visualize the samples in X together with the classifier's decision
# boundary (the 0.5 probability contour).
# NOTE(review): the original header mentioned a 1x2-axis figure, but only a
# single axes object is created below.
#---------------------------------------------------------------------

# Dense grid over [-5, 5) x [-5, 5) with step 0.01; one (x, y) pair per row.
xx, yy = np.mgrid[-5:5:.01, -5:5:.01]
grid = np.c_[xx.ravel(), yy.ravel()]
# Second predict_proba column for every grid point, reshaped back to the grid.
probs = clf.predict_proba(grid)[:, 1].reshape(xx.shape)

f, ax = plt.subplots(figsize=(8, 6))
# Only the 0.5 level is drawn, i.e. the line where both classes are equally likely.
ax.contour(xx, yy, probs, levels=[.5], cmap="Greys", vmin=0, vmax=.6)

# Scatter the first two feature columns of X, colored by label y.
ax.scatter(X[:, 0],
           X[:, 1],
           c=y,
           s=50,
           cmap="RdBu",
           vmin=-.2,
           vmax=1.2,
           edgecolor="white",
           linewidth=1)

ax.set(aspect="equal",