Example #1
from sklearn import linear_model, pipeline, utils
import numpy as np
import sys
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.externals import joblib
from mlloutils import expand_to_vectors

if len(sys.argv) < 3:
  print >> sys.stderr, ('Usage: python ' + sys.argv[0]
                        + ' <training csv input> <persistent model output>')
  sys.exit(1)

# Grab the training data
training_name = sys.argv[1]
print >> sys.stderr, 'Loading training set from '+training_name
training_vectors, training_target = expand_to_vectors(
    training_name, [6, 7, 8], 9)
print "%d vectors with dimension %d" % training_vectors.shape

# Normalize the sparse positive features with the TF-IDF transformer, since
# fields 6, 7 and 8 hold word occurrence counts from text fields
tfidf = TfidfTransformer()
training_vectors = tfidf.fit_transform(training_vectors)

# Shuffle the samples as SGD models assume i.i.d.
training_vectors, training_target = utils.shuffle(
    training_vectors, training_target, random_state=0)

# Create a naive classifier
models = [
    (linear_model.sparse.SGDClassifier(n_iter=5),
     {'alpha': np.logspace(-7, -4, 5)}),
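The example is truncated at the open `models` list. What follows is only a hedged sketch of how the remaining model-selection and persistence steps might continue, assuming a cross-validated grid search over each candidate's parameter grid (using the era's `sklearn.grid_search.GridSearchCV`) and `joblib.dump` to the `<persistent model output>` path named in the usage string; none of the names below come from the original script.

# Hypothetical continuation -- not part of the original example.
from sklearn.grid_search import GridSearchCV

best_model, best_score = None, 0.0
for model, param_grid in models:
  # Cross-validated grid search over this candidate's parameter grid
  search = GridSearchCV(model, param_grid, cv=3)
  search.fit(training_vectors, training_target)
  print >> sys.stderr, '%s: best params %s, score %f' % (
      model.__class__.__name__, search.best_params_, search.best_score_)
  if search.best_score_ > best_score:
    best_model, best_score = search.best_estimator_, search.best_score_

# Persist the winning model to the path given on the command line
joblib.dump(best_model, sys.argv[2])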
Example #2
#!/usr/bin/env python
import csv
from sklearn import svm
import numpy as np
import scipy.sparse as sp
import sys
from sklearn.externals import joblib
from mlloutils import expand_to_vectors

if len(sys.argv) < 3:
  print >> sys.stderr, 'Usage: python '+sys.argv[0]+' <test csv input> <persistent model input>'
  sys.exit(1)

model_name = sys.argv[2]
print >> sys.stderr, 'Loading classifier from '+model_name
clf = joblib.load(model_name)

test_name = sys.argv[1]
print >> sys.stderr, 'Loading test set from '+test_name
test_vectors, ids = expand_to_vectors(test_name, 1, 2, [6, 7, 8], 0, False)
print "%d vectors with dimension %d" % test_vectors.shape

print >> sys.stderr, 'Predicting...'
prediction_matrix = clf.predict(test_vectors)
prediction = prediction_matrix.tolist()

# Emit the predictions as CSV on stdout
print 'id,good'
for index, value in enumerate(ids):
  print str(value)+','+str(prediction[index])
Example #3
#!/usr/bin/env python
import csv
from sklearn import svm
import numpy as np
import scipy.sparse as sp
import sys
from sklearn.externals import joblib
from mlloutils import expand_to_vectors

if len(sys.argv) < 3:
  print >> sys.stderr, 'Usage: python '+sys.argv[0]+' <test csv input> <persistent model input>'
  sys.exit(1)

model_name = sys.argv[2]
print >> sys.stderr, 'Loading classifier from '+model_name
clf = joblib.load(model_name)

test_name = sys.argv[1]
print >> sys.stderr, 'Loading test set from '+test_name
test_vectors, ids = expand_to_vectors(test_name, [6, 7, 8], 0)
print "%d vectors with dimension %d" % test_vectors.shape

print >> sys.stderr, 'Predicting...'
prediction_matrix = clf.predict(test_vectors)
prediction = prediction_matrix.tolist()

# Emit the predictions as CSV on stdout
print 'id,good'
for index, value in enumerate(ids):
  print str(value)+','+str(prediction[index])
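All four examples exercise `expand_to_vectors` from the project's `mlloutils` module, whose source is not shown here. Purely as an illustration of the three-argument call pattern used in Examples 1 and 3 (CSV path, list of text-field column indices, index of the target or id column), a hypothetical implementation could look like the sketch below; the real helper may behave differently, and the six-argument calls in Examples 2 and 4 clearly take extra options not modeled here.

# Hypothetical sketch of the helper used above -- not the real mlloutils code.
import csv
import numpy as np
import scipy.sparse as sp

def expand_to_vectors(csv_path, text_columns, target_column):
  """Expand the text columns of a CSV file into a sparse bag-of-words
  matrix and return it together with the target (or id) column."""
  rows, cols, data, targets = [], [], [], []
  vocabulary = {}
  with open(csv_path, 'rb') as csv_file:
    for row_index, row in enumerate(csv.reader(csv_file)):
      targets.append(row[target_column])
      for column in text_columns:
        for word in row[column].split():
          feature = vocabulary.setdefault(word, len(vocabulary))
          rows.append(row_index)
          cols.append(feature)
          data.append(1)
  # Duplicate (row, col) entries are summed, giving per-row word counts
  matrix = sp.csr_matrix((data, (rows, cols)),
                         shape=(len(targets), len(vocabulary)))
  return matrix, np.asarray(targets)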
Example #4
from sklearn import linear_model, pipeline, utils
import numpy as np
import sys
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.externals import joblib
from mlloutils import expand_to_vectors

if len(sys.argv) < 3:
  print >> sys.stderr, ('Usage: python ' + sys.argv[0]
                        + ' <training csv input> <persistent model output>')
  sys.exit(1)

# Grab the training data
training_name = sys.argv[1]
print >> sys.stderr, 'Loading training set from '+training_name
training_vectors, training_target = expand_to_vectors(
    training_name, 1, 2, [6, 7, 8], 9, True)
print "%d vectors with dimension %d" % training_vectors.shape

# Normalize the sparse positive features with the TF-IDF transformer, since
# fields 6, 7 and 8 hold word occurrence counts from text fields
tfidf = TfidfTransformer()
training_vectors = tfidf.fit_transform(training_vectors)

# Shuffle the samples as SGD models assume i.i.d.
training_vectors, training_target = utils.shuffle(
    training_vectors, training_target, random_state=0)

# Create a naive classifier
models = [
    (linear_model.sparse.SGDClassifier(n_iter=5),
     {'alpha': np.logspace(-7, -4, 5)}),