def predict(X, clf, vec, feat_obj=None):
    """
    predict()

    Featurize raw data and classify it with a trained model.

    @param X         data instances to classify (presumably (id, tweet)
                     pairs, as built by main() -- TODO confirm)
    @param clf       trained classifier
    @param vec       fitted vectorizer passed through to predict_vectorized()
    @param feat_obj  optional FeaturesWrapper; a fresh one is created when
                     the caller does not supply one
    @return          result of predict_vectorized() on the extracted features
    """
    # BUG FIX: identity comparison ('is None'), not '== None' -- equality can
    # be overridden by operand types and is the wrong test for the sentinel.
    if feat_obj is None:
        feat_obj = FeaturesWrapper()

    # Data -> features
    feats = feat_obj.extract_features(X)

    # predict
    return predict_vectorized(feats, clf, vec)
def predict( X, clf, vec, feat_obj=None ):
    """
    predict()

    Extract features from the given data and hand them to the vectorized
    prediction routine.

    @param X         data instances to classify
    @param clf       trained classifier
    @param vec       fitted vectorizer for predict_vectorized()
    @param feat_obj  optional FeaturesWrapper instance; built on demand
                     when omitted so callers can share one across calls
    @return          predict_vectorized(feats, clf, vec)
    """
    # BUG FIX: use 'is None' for the missing-argument sentinel; '== None'
    # invokes __eq__ and is not a reliable identity check.
    if feat_obj is None:
        feat_obj = FeaturesWrapper()

    # Data -> features
    feats = feat_obj.extract_features(X)

    # predict
    return predict_vectorized(feats, clf, vec)
def main():
    """
    main()

    Purpose: This program builds an SVM model for Twitter classification.

    Reads training tweets from the files given with -t, assembles
    (id, tweet) / label pairs, and trains (optionally grid-searched)
    a model that is pickled to the path given with -m.
    """
    import sys  # local import: needed for sys.exit() below

    parser = argparse.ArgumentParser()
    parser.add_argument("-t",
        dest="txt",
        help="The files that contain the training examples",
        default=os.path.join(BASE_DIR, 'data/train-cleansed-A.txt'))
    parser.add_argument("-m",
        dest="model",
        help="The file to store the pickled model",
        default=os.path.join(BASE_DIR, 'models/awesome'))
    parser.add_argument("-g",
        dest="grid",
        help="Perform Grid Search",
        action='store_true',
        default=False)

    # Parse the command line arguments
    args = parser.parse_args()
    grid = args.grid

    # Decode arguments (-t may be a glob pattern matching several files)
    txt_files = glob.glob(args.txt)
    model_path = args.model
    print(model_path)

    # Cannot train on empty list
    if not txt_files:
        print('no training files :(')
        # BUG FIX: sys.exit(), not the site-module convenience exit(), which
        # is meant for interactive sessions and may be absent under -S.
        sys.exit(1)

    # Read the data into a Note object
    notes = []
    for txt in txt_files:
        note_tmp = Note()
        note_tmp.read(txt)
        notes.append(note_tmp)

    # Get data from notes: X is (id, tweet) pairs, Y the parallel labels
    X = []
    Y = []
    for n in notes:
        X += zip(n.getIDs(), n.getTweets())
        Y += n.getLabels()

    # Build model
    feat_obj = FeaturesWrapper()
    vec, svc = train(X, Y, model_path, grid, feat_obj)
def predict_using_model(X, model_path, out_dir):
    """
    Load a pickled classifier/vectorizer pair from disk and classify X.

    @param X           data instances to classify
    @param model_path  path prefix; '<prefix>.model' and '<prefix>.dict'
                       hold the pickled classifier and vectorizer
    @param out_dir     unused in this function (kept for interface
                       compatibility with callers)
    @return            predicted labels for X
    """
    # NOTE(review): pickle.load is only safe on trusted, locally-built
    # model files -- never point this at untrusted input.
    def _load(suffix):
        with open(model_path + suffix, 'rb') as f:
            return pickle.load(f)

    clf = _load('.model')
    vec = _load('.dict')

    # Classify with a freshly built feature extractor.
    return predict(X, clf, vec, feat_obj=FeaturesWrapper())
def main():
    """
    main()

    Cross-validate the tweet classifier: split the annotated data into
    folds, train on each training split, predict on the held-out split,
    and accumulate a confusion matrix over all folds.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-t",
        dest="txt",
        help="The files that contain the training examples",
        default=os.path.join(BASE_DIR, 'data/annotated.txt'))
    parser.add_argument("-n",
        dest="length",
        help="Number of data points to use",
        default=-1)
    parser.add_argument("-f",
        dest="folds",
        help="Number of folds to partition data into",
        default=10)
    # BUG FIX: the option used type=bool, but bool('False') (any non-empty
    # string) is True, so '-g False' enabled grid search. A store_true flag
    # is what the option actually means, and matches '-r' below.
    parser.add_argument("-g",
        dest="grid",
        help="Perform Grid Search",
        action='store_true',
        default=False)
    parser.add_argument("-r",
        dest="random",
        help="Random shuffling of input data.",
        action='store_true',
        default=False)

    # Parse the command line arguments
    args = parser.parse_args()

    # Decode arguments
    txt_files = glob.glob(args.txt)
    length = int(args.length)
    num_folds = int(args.folds)

    # Get data from files
    if not txt_files:
        print('no training files :(')
        sys.exit(1)

    notes = []
    for txt in txt_files:
        note_tmp = Note()
        note_tmp.read(txt)
        notes.append(note_tmp)

    # List of all data
    X = []
    Y = []
    for n in notes:
        X += zip(n.getIDs(), n.getTweets())
        Y += n.getLabels()

    # Build confusion matrix (square, one row/column per label)
    confusion = [[0 for i in labels_map] for j in labels_map]

    # Instantiate feat obj once (it'd really slow down CV to rebuild every time)
    feat_obj = FeaturesWrapper()

    # Extract features once
    feats = train.extract_features(X, feat_obj)

    data = zip(feats, Y)
    # BUG FIX: the old code sliced data[:length] unconditionally, so the
    # default length of -1 silently DROPPED the last example instead of
    # meaning "use everything". Only truncate when a count was requested.
    if length != -1:
        data = data[:length]
    print(len(data))

    # For each held-out test set
    i = 1
    for training, testing in cv_partitions(data, num_folds=num_folds, shuffle=args.random):
        # Users like to see progress
        print('Fold: %d of %d' % (i, num_folds))
        i += 1

        # Train on non-heldout data
        # NOTE(review): args.grid is parsed but grid=False is hard-coded
        # here -- confirm whether grid search was meant to apply during CV.
        X_train = [d[0] for d in training]
        Y_train = [d[1] for d in training]
        vec, clf = train.train_vectorized(X_train, Y_train, model_path=None, grid=False)

        # Predict on held out
        X_test = [d[0] for d in testing]
        Y_test = [d[1] for d in testing]
        labels, confs = predict.predict_vectorized(X_test, clf, vec)

        # Compute confusion matrix for held_out data
        testing_confusion = evaluate.create_confusion(Y_test, labels)
        confusion = add_matrix(confusion, testing_confusion)

    # Evaluate
    evaluate.display_confusion(confusion)
def extract_features(X, feat_obj=None):
    """
    Map raw data instances to feature representations.

    @param X         data instances to featurize
    @param feat_obj  optional FeaturesWrapper; a fresh one is built when
                     the caller does not provide one
    @return          whatever feat_obj.extract_features(X) produces
    """
    # BUG FIX: 'is None' (identity), not '== None' (equality) -- the
    # correct test for a missing-argument sentinel.
    if feat_obj is None:
        feat_obj = FeaturesWrapper()
    return feat_obj.extract_features(X)