from sklearn import pipeline import numpy as np import sys from sklearn.feature_extraction.text import TfidfTransformer from sklearn.externals import joblib from mlloutils import expand_to_vectors if len(sys.argv) < 3: print >> sys.stderr, ('Usage: python ' + sys.argv[0] + ' <training csv input> <persistent model output>') exit(0) # Grab the training data training_name = sys.argv[1] print >> sys.stderr, 'Loading training set from '+training_name training_vectors, training_target = expand_to_vectors( training_name, [6, 7, 8], 9) print "%d vectors with dimension %d" % training_vectors.shape # Normalize the sparse positive features using the TF-IDF normalizer as field # 6, 7 and 8 are word occurrences in text fields tfidf = TfidfTransformer() training_vectors = tfidf.fit_transform(training_vectors) # Shuffle the samples as SGD models assume i.i.d. training_vectors, training_target = utils.shuffle( training_vectors, training_target, random_state=0) # Create a naive classifier models = [ (linear_model.sparse.SGDClassifier(n_iter=5), {'alpha': np.logspace(-7, -4, 5)}),
#!/usr/bin/env python
# Predict labels for a test CSV using a previously persisted classifier.
#
# Usage: python <this script> <test csv input> <persistent model input>
#
# Progress messages are written to stderr. The vector-count line and the
# "id,good" CSV (one "<id>,<prediction>" row per test sample) go to stdout.
import csv
import sys

import numpy as np
import scipy.sparse as sp
from sklearn import svm
from sklearn.externals import joblib

from mlloutils import expand_to_vectors

# Both positional arguments are mandatory: sys.argv[1] is the test CSV and
# sys.argv[2] is the persisted model, so argv needs at least three entries.
# (The previous "< 2" check let a single-argument call through and then
# crashed with IndexError on sys.argv[2].)
if len(sys.argv) < 3:
    sys.stderr.write(
        'Usage: python ' + sys.argv[0]
        + ' <test csv input> <persistent model input>\n')
    # NOTE(review): exit status 0 on a usage error is kept from the original
    # so existing callers see no change -- confirm whether non-zero is wanted.
    sys.exit(0)

test_name = sys.argv[1]
model_name = sys.argv[2]

sys.stderr.write('Loading classifier from ' + model_name + '\n')
# SECURITY: joblib.load is pickle-based and can execute arbitrary code while
# loading; only point this script at model files from a trusted source.
clf = joblib.load(model_name)

sys.stderr.write('Loading test set from ' + test_name + '\n')
# The result is unpacked as (feature matrix, sample ids). The meaning of the
# positional arguments is defined by mlloutils.expand_to_vectors -- presumably
# column indices and a training flag; verify against that helper.
test_vectors, ids = expand_to_vectors(test_name, 1, 2, [6, 7, 8], 0, False)
# Parenthesised single-argument print emits the same bytes on Python 2.
print("%d vectors with dimension %d" % test_vectors.shape)

sys.stderr.write('Predicting...\n')
prediction = clf.predict(test_vectors).tolist()

# Emit the predictions as CSV, pairing each id with its predicted label.
print('id,good')
for value, label in zip(ids, prediction):
    print(str(value) + ',' + str(label))
#!/usr/bin/env python
# Predict labels for a test CSV using a previously persisted classifier.
#
# Usage: python <this script> <test csv input> <persistent model input>
#
# Progress messages are written to stderr. The vector-count line and the
# "id,good" CSV (one "<id>,<prediction>" row per test sample) go to stdout.
import csv
import sys

import numpy as np
import scipy.sparse as sp
from sklearn import svm
from sklearn.externals import joblib

from mlloutils import expand_to_vectors

# Both positional arguments are mandatory: sys.argv[1] is the test CSV and
# sys.argv[2] is the persisted model, so argv needs at least three entries.
# (The previous "< 2" check let a single-argument call through and then
# crashed with IndexError on sys.argv[2].)
if len(sys.argv) < 3:
    sys.stderr.write(
        'Usage: python ' + sys.argv[0]
        + ' <test csv input> <persistent model input>\n')
    # NOTE(review): exit status 0 on a usage error is kept from the original
    # so existing callers see no change -- confirm whether non-zero is wanted.
    sys.exit(0)

test_name = sys.argv[1]
model_name = sys.argv[2]

sys.stderr.write('Loading classifier from ' + model_name + '\n')
# SECURITY: joblib.load is pickle-based and can execute arbitrary code while
# loading; only point this script at model files from a trusted source.
clf = joblib.load(model_name)

sys.stderr.write('Loading test set from ' + test_name + '\n')
# The result is unpacked as (feature matrix, sample ids). The meaning of the
# positional arguments is defined by mlloutils.expand_to_vectors -- presumably
# feature column indices followed by the id column; verify against that helper.
test_vectors, ids = expand_to_vectors(test_name, [6, 7, 8], 0)
# Parenthesised single-argument print emits the same bytes on Python 2.
print("%d vectors with dimension %d" % test_vectors.shape)

sys.stderr.write('Predicting...\n')
prediction = clf.predict(test_vectors).tolist()

# Emit the predictions as CSV, pairing each id with its predicted label.
print('id,good')
for value, label in zip(ids, prediction):
    print(str(value) + ',' + str(label))
from sklearn import pipeline import numpy as np import sys from sklearn.feature_extraction.text import TfidfTransformer from sklearn.externals import joblib from mlloutils import expand_to_vectors if len(sys.argv) < 3: print >> sys.stderr, ('Usage: python ' + sys.argv[0] + ' <training csv input> <persistent model output>') exit(0) # Grab the training data training_name = sys.argv[1] print >> sys.stderr, 'Loading training set from '+training_name training_vectors, training_target = expand_to_vectors( training_name, 1, 2, [6, 7, 8], 9, True) print "%d vectors with dimension %d" % training_vectors.shape # Normalize the sparse positive features using the TF-IDF normalizer as field # 6, 7 and 8 are word occurrences in text fields tfidf = TfidfTransformer() training_vectors = tfidf.fit_transform(training_vectors) # Shuffle the samples as SGD models assume i.i.d. training_vectors, training_target = utils.shuffle( training_vectors, training_target, random_state=0) # Create a naive classifier models = [ (linear_model.sparse.SGDClassifier(n_iter=5), {'alpha': np.logspace(-7, -4, 5)}),