Example #1
import numpy as np

import data_io               # project-local I/O helpers (pairs, model, submission)
import features as fe        # project-local feature extraction (module name assumed from the `fe` alias)


def main():
    print("Reading the valid pairs")
    valid = data_io.read_valid_pairs()
    features = fe.feature_extractor()
    print("Transforming features")
    trans_valid = features.fit_transform(valid)
    trans_valid = np.nan_to_num(trans_valid)

    print("Saving Valid Features")
    data_io.save_features(trans_valid)

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions") 
    orig_predictions = classifier.predict_proba(trans_valid)
    predictions = orig_predictions[:, 2] - orig_predictions[:, 0]
    predictions = predictions.flatten()

    print("Writing predictions to file")
    data_io.write_submission(predictions)
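# --- Hypothetical sketch, not part of the project ---
# The data_io module used above is project-local and not shown in this listing.
# The sketch below illustrates what its persistence helpers could look like,
# assuming pickle-based model storage and a one-column CSV submission; the
# file paths and the submission format are assumptions, not the real layout.
import csv
import pickle


def save_model(model, path="model.pickle"):
    # Serialise the trained classifier to disk.
    with open(path, "wb") as f:
        pickle.dump(model, f)


def load_model(path="model.pickle"):
    # Load the previously saved classifier.
    with open(path, "rb") as f:
        return pickle.load(f)


def write_submission(predictions, path="submission.csv"):
    # Write one directional score per pair.
    with open(path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Target"])
        for p in predictions:
            writer.writerow([p])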
from time import time

import numpy

import data_io
import features as fe   # project-local modules; `fe` module name assumed
# combine_types, get_types and classify_catagory are project helpers defined elsewhere.


def main():
    t1 = time()
    print("Reading in the training data")
    train = data_io.read_train_pairs()
    train_info = data_io.read_train_info()
    train = combine_types(train, train_info)

    # TODO: factor this type handling into a function later
    train = get_types(train)
    target = data_io.read_train_target()

    print "Reading SUP data..."
    for i in range(1,4):
      print "SUP", str(i)
      sup = data_io.read_sup_pairs(i)
      sup_info = data_io.read_sup_info(i)
      sup = combine_types(sup, sup_info)
      sup = get_types(sup)
      sup_target = data_io.read_sup_target(i)
      train_info = train_info.append(sup_info)
      train = train.append(sup)
      target = target.append(sup_target)

    # Old train
    print "Reading old train data..."
    old_train = data_io.read_old_train_pairs()
    old_train_info = data_io.read_old_train_info()
    old_train = combine_types(old_train, old_train_info)
    old_train = get_types(old_train)
    old_target = data_io.read_old_train_target()

    train = train.append(old_train)
    target = target.append(old_target)
    # End old train

    print "Train size = ", str(train.shape)
    print("Extracting features and training model")
    feature_trans = fe.feature_extractor()
    orig_train = feature_trans.fit_transform(train)
    orig_train = numpy.nan_to_num(orig_train) 

    classifier = classify_catagory(orig_train, target.Target)
    #(both_classifier, A_classifier, B_classifier, none_classifier) = create_classifiers(orig_train, target.Target, train_info)

    print("Saving features")
    data_io.save_features(orig_train)

    print("Saving the classifier")
    #data_io.save_model( (both_classifier, A_classifier, B_classifier, none_classifier) )
    data_io.save_model(classifier) 
 
    #features = [x[0] for x in classifier.steps[0][1].features ]

    #csv_fea = csv.writer(open('features.csv','wb'))
    #imp = sorted(zip(features, classifier.steps[1][1].feature_importances_), key=lambda tup: tup[1], reverse=True)
    #for fea in imp:
    #    print fea[0], fea[1]
    #    csv_fea.writerow([fea[0],fea[1]])


    t2 = time()
    t_diff = t2 - t1
    print "Time Taken (min):", round(t_diff/60,1)
from time import time

import numpy
from sklearn.model_selection import train_test_split  # older scikit-learn exposed this in sklearn.cross_validation

import data_io
import features as fe   # project-local modules; `fe` module name assumed


def main():
    t1 = time()
    print("Reading in the training data")
    train = data_io.read_train_pairs()
    train_info = data_io.read_train_info()
    train = combine_types(train, train_info)

    # TODO: factor this type handling into a function later
    train = get_types(train)
    target = data_io.read_train_target()

    print "Reading SUP data..."
    for i in range(1,4):
      print "SUP", str(i)
      sup = data_io.read_sup_pairs(i)
      sup_info = data_io.read_sup_info(i)
      sup = combine_types(sup, sup_info)
      sup = get_types(sup)
      sup_target = data_io.read_sup_target(i)
      train = train.append(sup)
      target = target.append(sup_target)

    print "Train size = ", str(train.shape)
    print("Extracting features and training model")
    (feature_trans, classifier) = get_pipeline()
    orig_train = feature_trans.fit_transform(train)
    orig_train = numpy.nan_to_num(orig_train) 

    print("Train-test split")
    trainX, testX, trainY, testY = train_test_split(orig_train, target.Target, random_state = 1)
    print "TrainX size = ", str(trainX.shape)
    print "TestX size = ", str(testX.shape)

    print("Saving features")
    data_io.save_features(orig_train)

    classifier.fit(trainX, trainY)
    print("Saving the classifier")
    data_io.save_model(classifier)
 
    testX = numpy.nan_to_num(testX)
    print("Score on held-out test data ->", classifier.score(testX, testY))
    
    #features = [x[0] for x in classifier.steps[0][1].features ]

    #csv_fea = csv.writer(open('features.csv','wb'))
    #imp = sorted(zip(features, classifier.steps[1][1].feature_importances_), key=lambda tup: tup[1], reverse=True)
    #for fea in imp:
    #    print fea[0], fea[1]
    #    csv_fea.writerow([fea[0],fea[1]])


    # Log the per-feature importances reported by the trained classifier.
    feature_importance = classifier.feature_importances_
    with open("feature_importance.csv", "a") as logger:
        for fi in feature_importance:
            logger.write(str(fi))
            logger.write("\n")

    t2 = time()
    t_diff = t2 - t1
    print "Time Taken (min):", round(t_diff/60,1)