def make_submission():
    """Train one random forest on the training data and write a submission.

    Steps, in order:
      1. Extract features and labels from 'data/train.csv'.
      2. Fit a 50-tree RandomForestClassifier on status['OpenStatus'].
      3. Print the feature importances (largest first) and the OOB score.
      4. Extract features for the test file and predict class probabilities.
      5. Pass the probabilities through cu.cap_and_update_priors together
         with the priors of train_file and full_train_file.
      6. Write the result with cu.write_submission.

    Takes no arguments and returns None. It relies on module-level names
    that are defined outside this block: features, cu,
    RandomForestClassifier, test_file, train_file, full_train_file and
    submission_file.

    NOTE(review): a second function named make_submission is defined
    directly after this one in the file; since it comes later, it is the
    one bound to the name once the module has been loaded.
    """
    print("Reading data")
    fea, status = features.online_extract_features('data/train.csv',
                                                   limit=5e6)
    print("Training the model")
    rf = RandomForestClassifier(n_estimators=50,
                                verbose=2,
                                compute_importances=True,
                                oob_score=True,
                                #criterion='entropy',
                                n_jobs=1)

    rf.fit(fea, status['OpenStatus'])

    # Single-argument print(...) calls emit the same text under the Python 2
    # print statement and the Python 3 print function, so the whole block
    # now uses one form.
    print("Features Importance:")
    # Pair every importance with its feature name and list the largest
    # first; sorted() works whether zip() returns a list or an iterator.
    imps = sorted(zip(rf.feature_importances_, fea.keys()), reverse=True)
    print('\n'.join(str(pair) for pair in imps))
    # NOTE(review): the value printed is rf.oob_score_; scikit-learn
    # describes it as a score rather than an error, so the label may be
    # misleading -- confirm before relying on it.
    print("Generalization Error: %s" % (rf.oob_score_,))

    print("Reading test file and making predictions")
    test_features = features.online_extract_features('data/'+test_file,
                                                     train=False,
                                                     limit=1e12)[0]
    probs = rf.predict_proba(test_features)

    # Hard-wired switch: the prior update is always applied in this version.
    if True:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
def make_submission():
    """Train a two-level model and write a submission file.

    Level 1 ("Open/Rest") is trained on every training row, with the
    labels produced by applying or_binarize to status['OpenStatus'].
    Level 2 ("Not Open Split") is trained only on the rows whose binarized
    label differs from 'open', using their original OpenStatus values.

    For each test row the level-1 probabilities decide the output: when
    column 0 is larger than column 1 the row gets the fixed vector
    [1, 0, 0, 0, 0]; otherwise the level-2 probabilities are used with a
    0.0 inserted in front of them.

    Takes no arguments and returns None. It relies on module-level names
    that are defined outside this block: features, cu, np, learn, _dim,
    or_binarize, test_file, train_file, full_train_file and
    submission_file.

    NOTE(review): this definition reuses the name of the function defined
    just before it in the file and therefore replaces it.
    """
    print("Reading data")
    train_fea, status = features.online_extract_features('data/train.csv',
                                                         train=True,
                                                         limit=1e9)
    _dim(train_fea, 'fea')

    print("Training Level 1 : Open/Rest model")
    labels = status['OpenStatus']
    open_status = [or_binarize(label) for label in labels]
    # True for every training row whose binarized label is not 'open'.
    closed_mask = [binary != 'open' for binary in open_status]
    or_model = learn(train_fea, open_status)

    print("Training Level 2 : Not Open Split model")
    # Original (non-binarized) labels of the rows selected by the mask.
    not_open_status = [labels[idx]
                       for idx, is_closed in enumerate(closed_mask)
                       if is_closed]
    no_fea = train_fea[closed_mask]
    _dim(no_fea, 'no_fea')
    no_model = learn(no_fea, not_open_status)

    print("Reading test file and making predictions")
    extracted = features.online_extract_features('data/' + test_file,
                                                 train=False,
                                                 limit=1e9)
    test_features = extracted[0]
    _dim(test_features, 'test_features')
    or_probs = or_model.predict_proba(test_features)

    rows = []
    for idx, or_prob in enumerate(or_probs):
        # NOTE(review): presumably column 0 of the level-1 output is the
        # 'open' class -- this depends on the labels or_binarize returns;
        # confirm against the model's class ordering.
        if or_prob[0] > or_prob[1]:
            # NOTE(review): assumes 'open' occupies position 0 of the
            # five-value row that cu.write_submission expects -- confirm.
            rows.append(np.array([1.0, 0.0, 0.0, 0.0, 0.0]))
            continue
        # Gather this row's feature values, one per feature column, and
        # score that single sample with the level-2 model.
        sample = [test_features[col][idx] for col in test_features.keys()]
        split_probs = no_model.predict_proba(sample)
        # np.insert without an axis flattens its input, so the result is a
        # flat vector starting with 0.0.
        rows.append(np.insert(split_probs, 0, [0.0]))
    probs = np.array(rows)

    # Prior update is switched off in this version; kept for easy toggling.
    if False:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)