def main(filename):
    """Estimate BaggingSVM accuracy on the comments stored in `filename`.

    The comments are turned into TF-IDF features over unigrams and bigrams
    (terms must appear in at least two documents). The classifier is then
    evaluated on five train/test splits, each holding out 30% of the data
    and seeded with the iteration number. The per-split scores and their
    mean are printed; nothing is returned.

    filename -- path handed to util.get_comments_data; its second return
                value is ignored here.
    """
    # load the raw data and build the feature matrix once, up front
    raw_labels, _, comments = util.get_comments_data(filename)
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),
        strip_accents=None,
        charset_error='ignore',
        stop_words=None,
        min_df=2,
    )
    labels = numpy.array(raw_labels)
    instances = vectorizer.fit_transform(comments)
    # fixed seed for the `random` module
    # NOTE(review): presumably consumed by BaggingSVM -- confirm
    random.seed(0)

    n_models = 9
    n_rounds = 5
    scores = []
    for round_no in xrange(n_rounds):
        print("Iteration #" + str(round_no) + "...")

        # a different random_state per round gives a different split each time
        x_train, x_test, y_train, y_test = cross_validation.train_test_split(
            instances, labels, test_size=0.3, random_state=round_no)

        # train a fresh ensemble and record its score on the held-out part
        model = BaggingSVM(n_models)
        model.fit(x_train, y_train)
        scores.append(model.score(x_test, y_test))

    print("Scores => " + str(scores))
    print("Mean => " + str(numpy.mean(scores)))
def main(infile, outfile):
    """Read comments from `infile` and write their TF-IDF features to `outfile`.

    One line is written per comment: the label formatted as a float,
    followed by space-separated ``column:value`` pairs (value with three
    decimals) for every stored entry of that comment's feature row, in
    ascending column order. Progress (the row number) is printed every
    100 rows.

    infile  -- path handed to get_comments_data; its second return value
               is not used here.
    outfile -- path of the text file to create or overwrite.

    NOTE(review): the layout resembles the svmlight/libsvm sparse text
    format, but column indices are written exactly as the vectorizer
    numbers them (starting at 0) -- confirm the consumer expects that.
    """
    print("Reading from %s..." % infile)
    labels, _, comments = get_comments_data(infile)

    print("Parsing...")
    vec = TfidfVectorizer(ngram_range=(1, 2), strip_accents=None,
                          charset_error="ignore", stop_words=None, min_df=2)
    vec.fit(comments)

    print("Processing and writing to %s..." % outfile)
    # 'with' closes the file even if a row fails part-way through; the
    # original open()/close() pair leaked the handle in that case.
    with open(outfile, 'w') as f:
        features = vec.transform(comments)
        rows = features.get_shape()[0]
        for row in xrange(rows):
            if row % 100 == 0:
                print(row)
            # column indices of the stored entries in this row, ascending
            indices = sorted(features[row].indices.tolist())
            buf = [str(float(labels[row]))]
            for col in indices:
                buf.append("%d:%.3f" % (col, features[row, col]))
            # the newline is joined like any other field, so every line
            # ends with " \n" (kept to leave the output bytes unchanged)
            buf.append("\n")
            f.write(" ".join(buf))
    print("Done!")
def main(infile, outfile):
    """Vectorize the comments in `infile` and dump them as sparse text rows.

    Each output line holds the comment's label as a float and then one
    ``column:value`` pair (three-decimal value) per stored TF-IDF entry,
    sorted by column and separated by single spaces. The current row
    number is printed every 100 rows as a progress indicator.

    infile  -- path passed to get_comments_data; the second value it
               returns is ignored.
    outfile -- destination path; opened in 'w' mode, so existing content
               is replaced.

    NOTE(review): this looks like svmlight/libsvm-style output, yet the
    column numbers are emitted unchanged from the vectorizer (first
    column is 0) -- verify against whatever reads the file.
    """
    print("Reading from %s..." % infile)
    labels, _, comments = get_comments_data(infile)

    print("Parsing...")
    vec = TfidfVectorizer(
        ngram_range=(1, 2),
        strip_accents=None,
        charset_error="ignore",
        stop_words=None,
        min_df=2,
    )
    vec.fit(comments)

    print("Processing and writing to %s..." % outfile)
    # Context manager replaces the manual open()/close() pair so the file
    # handle is released even when transforming or formatting raises.
    with open(outfile, 'w') as f:
        matrix = vec.transform(comments)
        n_rows = matrix.get_shape()[0]
        for row in xrange(n_rows):
            if row % 100 == 0:
                print(row)
            # stored column indices for this row, in ascending order
            cols = sorted(matrix[row].indices.tolist())
            fields = [str(float(labels[row]))]
            fields.extend("%d:%.3f" % (col, matrix[row, col]) for col in cols)
            # joining the newline as a field keeps the original " \n" ending
            fields.append("\n")
            f.write(" ".join(fields))
    print("Done!")