def main(): print("Reading the data") data = cu.get_dataframe(train_file) print("Extracting features") features.compute_features(train_file,feature_file1) print("Training the model") fea = cu.get_dataframe(feature_file1) rf = RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=-1) rf.fit(fea, data["OpenStatus"][:140323]) print("Reading test file and making predictions") features.compute_features(test_file,feature_file2) test_fea = cu.get_dataframe(feature_file2) probs = rf.predict_proba(test_fea) print("Calculating priors and updating posteriors") new_priors = cu.get_priors(full_train_file) old_priors = cu.get_priors(train_file) probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print("Saving submission to %s" % submission_file) cu.write_submission(submission_file, probs)
def classifier_predict(listname, modelname, outdir=None, n_jobs=None):
    if outdir is None:
        outdir = tempfile.mkdtemp(dir=os.curdir, prefix='out')
    elif not os.path.exists(outdir):
        tsh.makedirs(outdir)
    inputname = os.path.splitext(os.path.basename(listname))[0]
    if listname.endswith('.gz'):
        inputname = os.path.splitext(inputname)[0]
    meta, data = read_listfile(listname)
    classifier = read_classifierfile(modelname)
    feature_method = classifier['features']['meta']['feature_method']
    feature_args = meta.copy()
    # Training input_name would shadow the current one.
    del classifier['features']['meta']['input_name']
    featurename = os.path.join(outdir, inputname + '-feats.csv.gz')
    if os.path.exists(featurename):
        _, features = read_listfile(featurename)
    else:
        feature_args.update(classifier['features']['meta'])
        args, features = compute_features(feature_method, feature_args, data,
                                          input_name=inputname, n_jobs=n_jobs,
                                          output_dir=outdir)
        assert (data['id'] == features['id']).all()
        clean_args(args)
        write_listfile(featurename, features, input_name=inputname, **args)
    labels_name = classifier['meta']['truth'] + '_labels'
    labels = classifier['meta'][labels_name]
    pred = predict(classifier['classifier'], sorted(labels.keys()), features,
                   output_dir=outdir)
    write_listfile(os.path.join(outdir, inputname + '-predictions.csv.gz'), pred,
                   classifier_name=modelname, truth=classifier['meta']['truth'],
                   labels_name=labels, input_name=inputname)
def extract_features(filename):
    return joblib.load('scalers/scaler.pkl').transform(
        compute_features(filename).values.reshape(1, -1))
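# Minimal usage sketch (added for illustration): extract_features() returns a
# single scaled feature row, so it can be fed directly to a fitted scikit-learn
# estimator. The model path 'scalers/model.pkl', the helper name predict_single,
# and the input filename are assumptions, not part of the original source.
def predict_single(filename, model_path='scalers/model.pkl'):
    import joblib  # same joblib used above to load the persisted scaler
    clf = joblib.load(model_path)  # a previously fitted classifier (hypothetical path)
    return clf.predict_proba(extract_features(filename))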
def train_classifier( train_file='train/train.csv', recompute_feats=False ):
    '''
    Module that reads stackoverflow data from a .csv file,
    generates features, and trains a classifier.
    '''
    # custom variables
    DATA_DIR = "../data/"
    SUBMISSION_DIR = "../data/submission/"
    # train_file = 'train/train-sample.csv'
    label_file = 'train/train-labels.csv'
    feature_file = 'train/train-feats.csv'

    # display progress logs on stdout
    logging.basicConfig( level=logging.INFO,
                         format='%(asctime)s %(levelname)s %(message)s' )
    log = logging.getLogger(__name__)

    if recompute_feats:
        features.compute_features( train_file, feature_file, label_file )

    log.info( 'π: load features from file' )
    X = pd.io.parsers.read_csv( os.path.join( DATA_DIR, feature_file ), header=None )
    X = X.as_matrix()

    log.info( "π: encode labels" )
    labels = pd.io.parsers.read_csv( os.path.join( DATA_DIR, label_file ), header=None )['X0']
    lbl_map = { 'not a real question': 0, 'not constructive': 1, 'off topic': 2,
                'open': 3, 'too localized': 4 }  # cf. required submission format
    labels = labels.map( lbl_map )
    y = labels.values

    log.info( 'π: select features' )
    fselect = SelectPercentile( score_func=chi2, percentile=42 )  # !?
    # X = fselect.fit_transform( X, y )

    log.info( 'π: define classifiers' )
    priors = cu.get_priors( os.path.join( DATA_DIR, 'train/train.csv' ) )
    clf_lda = LDA( priors=priors )
    clf_rfc = RandomForestClassifier( n_estimators=50, verbose=2, n_jobs=-1,
                                      random_state=0, compute_importances=True,
                                      max_features=None )  #, criterion='entropy' )
    clf_gbc = GradientBoostingClassifier()

    log.info( 'π: fit Random Forest' )
    clf_rfc.fit( X, y )

    log.info( "π: compute feature ranking for RFC" )
    importances = clf_rfc.feature_importances_
    std = np.std([ tree.feature_importances_ for tree in clf_rfc.estimators_ ], axis=0 )
    indices = np.argsort( importances )[::-1]
    for f in xrange( 13 ):  # the top thirteen features
        print "%d. feature %d (%f)" % (f + 1, indices[f], importances[ indices[f] ])

    log.info( "π: standardize and normalize features" )
    standardizer = StandardScaler( copy=False ).fit( X, y )
    standardizer.transform( X, y )  # in-place
    normalizer = Normalizer( copy=False, norm='l2' ).fit( X, y )  # 'l1'
    normalizer.transform( X, y )  # in-place

    log.info( 'π: fit Linear Discriminant Analysis' )
    clf_lda.fit( X, y )
    # X = cld_lda.transform( X, y )

    log.info( 'π: fit Gradient Boosting' )
    clf_gbc.fit( X, y )

    log.info( 'π: save classifiers' )
    np.savez( SUBMISSION_DIR + 'cfy.npz', X=X, y=y, fselect=fselect,
              standardizer=standardizer, normalizer=normalizer )
    joblib.dump( clf_lda, SUBMISSION_DIR + 'clf_lda.pkl', compress=9 )
    joblib.dump( clf_rfc, SUBMISSION_DIR + 'clf_rfc.pkl', compress=9 )
    joblib.dump( clf_gbc, SUBMISSION_DIR + 'clf_gbc.pkl', compress=9 )
def doit(n, qmod, qcon, beta, T, outfile):
    G = nx.Graph()

    # Nodes that have gone through a DMC iteration and have not been burned.
    G.add_edge(1, 2)
    InPlayPool = set([1, 2])

    # Nodes that are initially isolated or have been burned.
    IsolatedPool = set()
    for i in xrange(3, n + 1):
        G.add_node(i)
        IsolatedPool.add(i)

    # Header.
    print "#Iter\tNodes\tInPlay\tIso\tEdges\tNmEval\tComps\tMIS\tDensity\tCC\tTris\tFracDeg1\tFracDeg0\tNGcc\tMGcc"

    for iter in xrange(1, T + 1):
        assert G.order() == len(InPlayPool) + len(IsolatedPool) == n  # Always n nodes.

        # If this statement passes over, some nodes will get removed from burning
        # and then in the next iteration it will go through.
        if len(IsolatedPool) != 0:

            # 0. If all nodes are isolated, start over with the dumbbell.
            if len(InPlayPool) == 0:
                u = IsolatedPool.pop()
                v = IsolatedPool.pop()
                G.add_edge(u, v)
                InPlayPool.add(u)
                InPlayPool.add(v)

            # 1. Select random node to add.
            v = IsolatedPool.pop()

            # 2. Select node to copy from.
            u = InPlayPool.pop()
            InPlayPool.add(v)
            InPlayPool.add(u)  # Hack because sets can't return+retain random element.

            # 3. Run DMC iteration.
            Delete = set()
            for neighbor in G.neighbors(u):
                assert neighbor != u
                if random.random() < qmod:  # modify the edge.
                    if random.random() < 0.5:  # delete u->neighbor.
                        Delete.add(neighbor)
                        G.add_edge(v, neighbor)
                    #else: # delete v->neighbor -- already done.
                else:  # don't modify the edge.
                    G.add_edge(v, neighbor)
                    assert v != neighbor
            for neighbor in Delete:
                G.remove_edge(u, neighbor)

            if random.random() < qcon:
                assert u != v
                assert not G.has_edge(u, v)
                G.add_edge(u, v)

        # 4. Burn and remove infected nodes.
        b = random.choice(G.nodes())  # random infected node.
        Infected = F.compute_infected_set_sir(G, b, beta)
        for b in Infected:
            if b in IsolatedPool:  # Burnt node was already isolated.
                assert len(Infected) == 1  # no one else should be infected.
                continue
            else:
                InPlayPool.remove(b)
                IsolatedPool.add(b)

                # Remove and add-back as isolated.
                G.remove_node(b)
                G.add_node(b)

        # In case a burnt node / dmc iter causes an in-play node to become isolated.
        for u in nx.isolates(G):
            if u in InPlayPool:
                assert u not in IsolatedPool
                IsolatedPool.add(u)
                InPlayPool.remove(u)

        # 6. Compute distances after burning.
        (nmeval, comps, mis, density, cc, tris, fracdeg1, fracdeg0, ngcc, mgcc) = F.compute_features(G)
        m = G.size()

        # 8. Output results.
        print "%i\t%i\t%i\t%i\t%i\t%.5f\t%i\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\t%i\t%i" % (iter, G.order(), len(InPlayPool), len(IsolatedPool), m, nmeval, comps, mis, density, cc, tris, fracdeg1, fracdeg0, ngcc, mgcc)

    # 9. Last iteration: print component sizes for final graph.
    comps = ""
    for Gc in nx.connected_component_subgraphs(G):
        comps += "%i\t" % (Gc.order())
    print "# Components\t%s" % (comps.strip())

    # Print the final network as well.
    out = open(outfile, "w")
    #TODO: write header optionally
    # out.write("#nodes=%i\n#edges=%i\n#qmod=%.1f\n#qcon=%.1f\n#beta=%.2f\n" %(n,G.size(),qmod,qcon,beta))
    # out.write("#comps=%i\n#isolates=%i\n" %(nx.number_connected_components(G),len(nx.isolates(G))))
    for u, v in G.edges_iter():
        out.write("%s\t%s\n" % (u, v))
    for u in nx.isolates(G):
        out.write("%s\t%s\n" % (u, u))
    out.close()
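# Hypothetical invocation (added for illustration; the parameter values and the
# output filename are assumptions, not from the original source): grow a
# 100-node network for 1000 DMC/burn iterations with edge-modification
# probability qmod=0.4, connection probability qcon=0.3, and SIR infection
# probability beta=0.05.
if __name__ == '__main__':
    doit(n=100, qmod=0.4, qcon=0.3, beta=0.05, T=1000, outfile='dmc_final_edges.txt')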
def process(imdb, args, validation=False):
    if validation:  # test on the validation set
        features = compute_features(imdb, args, useValSet=False)
        print('Experiment setup: training set: train, test set: val')
        clf = train_classifier(
            features[imdb.train_indices, :],  # get rows corresponding to training
            imdb.class_ids[imdb.train_indices], args)
        val_preds, val_scores = make_predictions(clf, features[imdb.val_indices, :])
        if validation:
            return get_confusion(imdb.class_ids[imdb.val_indices], val_preds)
        #show_confusion(imdb.class_ids[imdb.val_indices], val_preds)
    else:
        features = compute_features(imdb, args, useValSet=True)
        # ensure that indices haven't been accidentally modified:
        assert imdb.train_indices[0] == 0 and imdb.train_indices[-1] == 297
        assert imdb.val_indices[0] == 1 and imdb.val_indices[-1] == 298
        assert imdb.test_indices[0] == 2 and imdb.test_indices[-1] == 299
        print('Experiment setup: training set: train+val, test set: test')
        clf = train_classifier(
            features[np.hstack((imdb.train_indices, imdb.val_indices)), :],
            imdb.class_ids[np.hstack((imdb.train_indices, imdb.val_indices))],
            args)
        test_preds, test_scores = make_predictions(clf, features[imdb.test_indices, :])
        show_confusion(imdb.class_ids[imdb.test_indices], test_preds)

        # confusion matrix of images: (store their indices in imdb.test_indices)
        # find first cat and first dog:
        cat, dog = -1, -1
        for i in range(len(imdb.test_indices)):  # location in imdb.test_indices
            if cat == -1 and imdb.class_ids[imdb.test_indices[i]] == 0:
                cat = i
            if dog == -1 and imdb.class_ids[imdb.test_indices[i]] == 1:
                dog = i
        top = np.array([[cat, cat], [dog, dog]])
        for i in range(len(imdb.test_indices)):  # location in imdb.test_indices
            # cat: 0, dog: 1 (labels)
            ans = imdb.class_ids[imdb.test_indices[i]]
            pred = test_preds[i]
            score = test_scores[i]
            if ans == 0 and pred == 0:  # look for most cat-like cat
                if score > test_scores[top[0, 0]]:
                    top[0, 0] = i
            if ans == 0 and pred == 1:  # look for most dog-like cat
                if score > test_scores[top[0, 1]]:
                    top[0, 1] = i
            if ans == 1 and pred == 1:  # look for most dog-like dog
                if score > test_scores[top[1, 1]]:
                    top[1, 1] = i
            if ans == 1 and pred == 0:  # look for most cat-like dog
                if score > test_scores[top[1, 0]]:
                    top[1, 0] = i

        # show the top images side by side
        fig, axarr = plt.subplots(2, 2, figsize=(5, 5))
        for i in range(0, 2):
            for j in range(0, 2):
                img = cv2.imread(imdb.image_dir + "/" +
                                 imdb.image_names[imdb.test_indices[top[i, j]]])
                axarr[i, j].imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        fig.savefig("confusion.jpg", dpi=fig.dpi)
        plt.show()
def main():
    im_a = misc.imread(IN_PATH_A)
    im_a_p = misc.imread(IN_PATH_A_P)
    im_b = misc.imread(IN_PATH_B)

    # image single to double (float in 0 to 1 scale)
    if np.max(im_a) > 1.0:
        im_a = im_a / 255.
    if np.max(im_b) > 1.0:
        im_b = im_b / 255.
    if np.max(im_a_p) > 1.0:
        im_a_p = im_a_p / 255.

    # Remap luminance for color artistic images
    if LUM_REMAP:
        im_a, im_a_p = remap_luminance(im_a, im_a_p, im_b)

    pyramid_a = compute_gaussian_pyramid(im_a, max_level)
    pyramid_a_p = compute_gaussian_pyramid(im_a_p, max_level)
    pyramid_b = compute_gaussian_pyramid(im_b, max_level)
    pyramid_b_p = pyramid_b
    # im_b_yiq = rgb2yiq(im_b)
    # pyramid_color = compute_gaussian_pyramid(im_b_yiq, max_level)

    # Compute features of B
    features_b = concat_features(pyramid_b)

    # Build structure for ANN
    flann, flann_params, As, As_size = ann_index(pyramid_a, pyramid_a_p, max_level + 1)

    ##################################################################
    # Algorithms

    for level in range(1, len(pyramid_a)):
        print('Computing level %d of %d' % (level, len(pyramid_a) - 1))

        imh, imw = pyramid_b[level].shape[:2]
        im_out = np.nan * np.ones((imh, imw, 3))
        s = []

        for row in range(imh):
            for col in range(imw):
                px = np.array([row, col])

                # do something about B and Bp feature
                feature_b_p = compute_features(pyramid_b_p)
                small_padded = np.pad(feature_b_p[level - 1], (n_sm // 2), 'reflect')
                big_padded = np.pad(feature_b_p[level], (n_lg // 2), 'reflect')
                BBp_feature = np.hstack([
                    features_b[level][to_1d(px, imw), :],
                    extract_pixel_feature(small_padded, big_padded, px)
                ])
                assert (BBp_feature.shape == (As_size[level][1], ))

                # Find Approx Nearest Neighbor
                p_app_ix = best_approximate_match(flann[level], flann_params[level],
                                                  BBp_feature)
                Ap_imh, Ap_imw = pyramid_a_p[level].shape[:2]
                p_app = to_2d(p_app_ix, Ap_imw)

                if (len(s) < 1):
                    p = p_app
                else:
                    # Coherence match
                    p_coh = best_coherence_match(As[level], (Ap_imh, Ap_imw),
                                                 BBp_feature, s, px, imw, n_lg)
                    if np.allclose(p_coh, np.array([-1, -1])):
                        p = p_app
                    else:
                        AAp_feature_app = As[level][p_app]
                        AAp_feature_coh = As[level][p_coh]
                        d_app = norm(AAp_feature_app - BBp_feature)**2
                        d_coh = norm(AAp_feature_coh - BBp_feature)**2
                        if d_coh < d_app * (1 + (2**(level - 5) * 1)):
                            p = p_coh
                        else:
                            p = p_app

                s.append(p)
                pyramid_b_p[level][row, col] = pyramid_a_p[level][tuple(p)]

        # Save color output images
        # pyramid_b_p_yiq = rgb2yiq(pyramid_b_p[level])
        # im_out_yiq = np.dstack([pyramid_b_p_yiq[:, :, 0], pyramid_color[level][:, :, 1:]])
        color_im_out = pyramid_b_p[level]
        color_im_out = np.clip(color_im_out, 0, 1)
        misc.imsave('output/level_%d_color.jpg' % level, color_im_out)
def main(args):
    parser = argparse.ArgumentParser(
        description='Train and evaluate a model on the Cats vs. Dogs dataset')
    parser.add_argument('-d', '--dataset-dir', required=True, type=str,
                        help='Path to the dataset')
    parser.add_argument('-f', '--feature', required=True, choices=FEATURES,
                        help='Select which feature representation to use. '
                        'Choices are {' + ', '.join(FEATURES) + '}')
    parser.add_argument('-c', '--classifier', required=True, choices=CLASSIFIERS,
                        help='Select which classifier to use. '
                        'Choices are {' + ', '.join(CLASSIFIERS) + '}')
    parser.add_argument('-k', '--knn-k', default=3, type=int,
                        help='Number of neighbors for kNN classifier')
    parser.add_argument('-l', '--svm-lambda', default=1.0, type=float,
                        help='Lambda parameter for SVM')
    parser.add_argument('--tinyimage-patchdim', default=16, type=int)
    parser.add_argument('--patches-dictionarysize', default=128, type=int)
    parser.add_argument('--patches-radius', default=8, type=float)
    parser.add_argument('--patches-stride', default=12, type=int)
    parser.add_argument('--sift-dictionarysize', default=128, type=int)
    parser.add_argument('--sift-binsize', default=8, type=int,
                        help='Size of the bin in terms of number of pixels in '
                        'the image. Recall that SIFT has 4x4=16 bins.')
    parser.add_argument('--sift-stride', default=12, type=int,
                        help='Spacing between successive x (and y) coordinates '
                        'for sampling dense features.')
    args = parser.parse_args(args)

    imdb = read_dataset(args.dataset_dir)
    features = compute_features(imdb, args)
    if args.feature != 'tinyimage':
        features = normalize_features(features)

    print('Experiment setup: training set: train, test set: val')
    clf = train_classifier(features[imdb.train_indices, :],
                           imdb.class_ids[imdb.train_indices], args)
    val_preds, val_scores = make_predictions(clf, features[imdb.val_indices, :])
    show_confusion(imdb.class_ids[imdb.val_indices], val_preds)

    print('Experiment setup: training set: train+val, test set: test')
    clf = train_classifier(
        features[np.hstack((imdb.train_indices, imdb.val_indices)), :],
        imdb.class_ids[np.hstack((imdb.train_indices, imdb.val_indices))],
        args)
    test_preds, test_scores = make_predictions(clf, features[imdb.test_indices, :])
    show_confusion(imdb.class_ids[imdb.test_indices], test_preds)
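# Hypothetical command line (added for illustration; 'sift' and 'svm' are assumed
# to be members of FEATURES and CLASSIFIERS, and the script and dataset names are
# placeholders, not from the original source):
#
#   python run_experiment.py -d data/cats-vs-dogs -f sift -c svm -l 0.5 --sift-stride 8
#
# which is equivalent to calling:
#
#   main(['-d', 'data/cats-vs-dogs', '-f', 'sift', '-c', 'svm',
#         '-l', '0.5', '--sift-stride', '8'])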
def predict_class(test_file="test/public_leaderboard.csv", recompute_feats=False):
    """
    Module that predicts class probabilities for test data
    from a .csv file or precomputed feature vectors.
    """
    # custom variables
    DATA_DIR = "../data/"
    SUBMISSION_DIR = "../data/submission/"
    train_file_all = "train/train.csv"
    train_labels = "train/train-labels.csv"  # cf. label_file in train_classifier()
    test_file = "test/private_leaderboard.csv"
    feature_file = "test/private_leaderboard-feats.csv"
    output_file = "predictions.csv"

    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s %(levelname)s %(message)s")
    log = logging.getLogger(__name__)

    if recompute_feats:
        # features.compute_features( 'test/test.csv', 'test/test-feats.csv' )
        features.compute_features(test_file, feature_file)

    log.info("π: load features from file")
    X_test = pd.io.parsers.read_csv(os.path.join(DATA_DIR, feature_file), header=None)
    X_test = X_test.as_matrix()

    log.info("π: load classifier")
    npz_file = np.load(SUBMISSION_DIR + "cfy.npz")
    clf_lda = joblib.load(SUBMISSION_DIR + "clf_lda.pkl")
    clf_rfc = joblib.load(SUBMISSION_DIR + "clf_rfc.pkl")
    clf_gbc = joblib.load(SUBMISSION_DIR + "clf_gbc.pkl")

    log.info("π: load standardizer, normalizer")
    standardizer = npz_file["standardizer"].item()
    normalizer = npz_file["normalizer"].item()

    # log.info( 'π: perform feature selection' )
    # fselect = npz_file[ 'fselect' ].item()
    # X_test = fselect.transform( X_test )

    log.info("π: Random Forest predictions")
    y_rfc = clf_rfc.predict_proba(X_test)

    log.info("π: standardize and normalize test features")
    standardizer.transform(X_test)  # in-place
    normalizer.transform(X_test)  # in-place

    log.info("π: LDA and GBC class membership predictions")
    # X_test = clf_lda.transform( X_test )
    y_lda = clf_lda.predict_proba(X_test)
    y_gbc = clf_gbc.predict_proba(X_test)
    y_pred = (y_rfc + y_gbc) / 2.0

    log.info("π: calculate priors and update posteriors")
    new_priors = cu.get_priors(train_file_all)
    closed_reasons = pd.io.parsers.read_csv(os.path.join(DATA_DIR, train_labels),
                                            header=None)["X0"]
    closed_reason_counts = Counter(closed_reasons)
    reasons = sorted(closed_reason_counts.keys())
    total = len(closed_reasons)
    old_priors = [closed_reason_counts[reason] / float(total) for reason in reasons]
    y_pred = cu.cap_and_update_priors(old_priors, y_pred, new_priors, 0.001)
    y_pred = (2 * y_pred + y_lda) / 3.0

    log.info("π: write predictions to file")
    writer = csv.writer(open(os.path.join(SUBMISSION_DIR, output_file), "w"),
                        lineterminator="\n")
    writer.writerows(y_pred)
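# A possible end-to-end run for this pair of functions (added for illustration;
# the call order is implied by the shared cfy.npz / clf_*.pkl artifacts, but the
# exact driver is an assumption, not part of the original source):
#
#   train_classifier(recompute_feats=True)   # fit and persist the classifiers
#   predict_class(recompute_feats=True)      # load them and write predictions.csv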