def multistage_classify(n, s, e, folds):
    """Run k-fold cross-validation on second-stage (derived) features.

    Loads features, metafeatures and labels via ``get_data`` (with
    ``metadata=True``), builds a derived feature set with ``new_features``,
    converts the derived features and the labels with ``npa`` and hands
    them to ``kfold_crossval``.

    Args:
        n, s, e: forwarded unchanged to ``get_data``.
            NOTE(review): presumably sample size, start date and end date
            -- confirm against ``get_data``.
        folds: forwarded to ``kfold_crossval`` as its third argument.

    Returns:
        Whatever ``kfold_crossval`` returns.
    """
    raw_features, meta, targets = get_data(n, s, e, metadata=True)
    derived = new_features(raw_features, targets, meta)

    # Convert features first, then labels, matching the original call order.
    feature_matrix = npa(derived)
    label_vector = npa(targets)

    # NOTE(review): the meaning of the literal 2 is defined by
    # kfold_crossval and is not visible here -- confirm before changing.
    return kfold_crossval(feature_matrix, label_vector, folds, 2)
# v_features = (v.fit_transform(POS_features)).toarray() # transform text into count vectors v = CountVectorizer(min_df=1, max_features=2000) v_features = (v.fit_transform(features)).toarray() print 'Finished vectorizing text data' X, Y = randomize(v_features, labels) print len(X[0]) if len(X) == len(Y): print 'Data check ... OK' else: print 'Data check failed. Aborting execution' return None return kfold_crossval(X, Y, folds, 2, RF=True) def iterateMNB(n_trials, d_range, s_size, folds): C0L, C1L, WL = [], [], [] # upper and lower date limits ll = datetime.date(2012,01,01) ul = datetime.date(2013,06,01) v = CountVectorizer(min_df=1, max_features=2000) for i in range(n_trials): s = tools.randomDate(ll, ul) e = date.fromordinal(s.toordinal() + d_range) # c0, c1, w = multistage_classify(s_size, s, e, folds) c0, c1, w = classify_text(s_size, s, e, folds)