def main():
    if len(sys.argv) != 3:
        print "Usage " + sys.argv[0] + " file_train model_file"
        sys.exit(1)
    train_filename = sys.argv[1]
    model_filename = sys.argv[2]

    # First pass over the training file builds the feature index,
    # second pass vectorizes the data against that index.
    featureIndexes = processData(train_filename)
    trainFeatures, trainTargets, trainItemIds = processData(train_filename, featureIndexes)
    # joblib.dump((trainFeatures, trainTargets, trainItemIds), os.path.join(dataFolder, "train_data.pkl"))
    # trainFeatures, trainTargets, trainItemIds = joblib.load(os.path.join(dataFolder, "train_data.pkl"))
    trainTargets = np.asarray(trainTargets)
    logging.info("Feature preparation done, fitting model...")

    # Logistic-regression-style linear model trained with SGD.
    clf = SGDClassifier(loss="log", penalty="l2", alpha=1e-4, class_weight="auto")
    # clf.fit(trainFeatures, trainTargets)
    # joblib.dump(clf, model_filename)
    print utils.xvalidation(trainFeatures, trainTargets, clf)
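# utils.xvalidation is a project-local helper whose implementation is not
# shown here. The sketch below is only an illustration of what such a helper
# could look like, under the assumption that it reports a mean ROC AUC over
# stratified folds; the fold count, the metric, and the name
# xvalidation_sketch are assumptions, not taken from the project code.
from sklearn.cross_validation import StratifiedKFold  # pre-0.18 sklearn API, consistent with class_weight="auto" above
from sklearn.metrics import roc_auc_score


def xvalidation_sketch(features, targets, clf, n_folds=5):
    # Fit the classifier on each training fold and score it on the held-out fold.
    scores = []
    for train_idx, test_idx in StratifiedKFold(targets, n_folds=n_folds):
        clf.fit(features[train_idx], targets[train_idx])
        proba = clf.predict_proba(features[test_idx])[:, 1]
        scores.append(roc_auc_score(targets[test_idx], proba))
    return sum(scores) / len(scores)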
    return c_features, y  # , ids, [vectorizer]


if __name__ == '__main__':
    if len(sys.argv) != 3:
        print "Usage " + sys.argv[0] + " file_train model_file"
        sys.exit(1)
    train_filename = sys.argv[1]
    model_filename = sys.argv[2]

    # X[i], y[i] hold the feature matrix and targets for category i;
    # cross-validate a separate Multinomial Naive Bayes model per category.
    X, y = read_train_nb(train_filename)
    xv = []
    sz = []
    for i in range(10):
        sz.append(X[i].shape[0])
        nb = MultinomialNB()
        xv.append(utils.xvalidation(X[i], y[i], nb))
    total_sz = float(sum(sz))
    for i in range(10):
        print category_dict_r[i], sz[i] / total_sz, xv[i]
    # utils.xvalidation_result(train_filename, 'nb.ng2.txt', X, y, ids, nb)
    # nb.fit(X, y)
    # joblib.dump((nb, transformers), model_filename)
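    # The loop above prints, for each of the 10 categories, its share of the
    # training data and its per-category cross-validation score. As a hedged
    # follow-up sketch (not part of the original script), those two columns
    # can be combined into a single data-weighted overall score:
    weighted_score = sum(sz[i] / total_sz * xv[i] for i in range(10))
    print "weighted xvalidation score:", weighted_score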