def predict(X, clf, vec, feat_obj=None):
    """
    predict()

    Purpose: Map raw examples to features, then classify with a trained model.
    """

    # Data -> features
    if feat_obj is None:
        feat_obj = FeaturesWrapper()
    feats = feat_obj.extract_features(X)

    # Predict with the fitted vectorizer and classifier
    return predict_vectorized(feats, clf, vec)
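# Usage sketch for predict(), for illustration only: the (id, tweet) pairs
# below are hypothetical and mirror how main() builds X from Note objects;
# clf and vec are assumed to come from a prior train() call.
def _predict_usage_example(clf, vec):
    X = [('1234', 'some tweet text'), ('5678', 'another tweet')]
    return predict(X, clf, vec)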
Example #3
import argparse
import glob
import os
import sys


def main():
    """
    main()

    Purpose: This program builds an SVM model for Twitter classification.
    """

    parser = argparse.ArgumentParser()

    parser.add_argument("-t",
                        dest="txt",
                        help="The files that contain the training examples",
                        default=os.path.join(BASE_DIR,
                                             'data/train-cleansed-A.txt')
                        #default = os.path.join(BASE_DIR, 'data/sample.txt')
                        )

    parser.add_argument("-m",
                        dest="model",
                        help="The file to store the pickled model",
                        default=os.path.join(BASE_DIR, 'models/awesome'))

    parser.add_argument("-g",
                        dest="grid",
                        help="Perform Grid Search",
                        action='store_true',
                        default=False)
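    # Example invocation (illustrative; the actual script name is not shown
    # in this listing):
    #     python main.py -t 'data/train-cleansed-A.txt' -m models/awesome -g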

    # Parse the command line arguments
    args = parser.parse_args()
    grid = args.grid

    # Decode arguments
    txt_files = glob.glob(args.txt)
    model_path = args.model

    print(model_path)

    # Cannot train on an empty file list
    if not txt_files:
        print('no training files :(')
        sys.exit(1)

    # Read the data into a Note object
    notes = []
    for txt in txt_files:
        note_tmp = Note()
        note_tmp.read(txt)
        notes.append(note_tmp)

    # Get data from notes
    X = []
    Y = []
    for n in notes:
        X += zip(n.getIDs(), n.getTweets())
        Y += n.getLabels()

    # Build model
    feat_obj = FeaturesWrapper()
    vec, svc = train(X, Y, model_path, grid, feat_obj)
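# train() is called above but not shown in this listing. A minimal sketch of
# its assumed behavior, inferred from predict_using_model() below (which
# unpickles '<model_path>.model' and '<model_path>.dict') and from the SVM
# mentioned in the docstring. DictVectorizer, LinearSVC, and dict-shaped
# features are assumptions here, not the project's confirmed implementation.
import pickle

from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import LinearSVC


def _train_sketch(X, Y, model_path=None, grid=False, feat_obj=None):
    if feat_obj is None:
        feat_obj = FeaturesWrapper()
    feats = feat_obj.extract_features(X)    # assumed: list of feature dicts
    vec = DictVectorizer()
    X_mat = vec.fit_transform(feats)
    svc = LinearSVC()                       # grid search omitted in this sketch
    svc.fit(X_mat, Y)
    if model_path is not None:
        with open(model_path + '.model', 'wb') as fid:
            pickle.dump(svc, fid)
        with open(model_path + '.dict', 'wb') as fid:
            pickle.dump(vec, fid)
    return vec, svc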
import pickle


def predict_using_model(X, model_path, out_dir):

    # Load the pickled classifier and feature vectorizer
    with open(model_path + '.model', 'rb') as fid:
        clf = pickle.load(fid)
    with open(model_path + '.dict', 'rb') as fid:
        vec = pickle.load(fid)
    feat_obj = FeaturesWrapper()

    # Predict
    labels = predict(X, clf, vec, feat_obj=feat_obj)
    return labels
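# Usage sketch for predict_using_model(); the tweet data is hypothetical,
# the model path is the trainer's default, and out_dir is accepted but
# unused by the function as written.
def _predict_using_model_example():
    X = [('1234', 'some tweet text')]
    return predict_using_model(X, os.path.join(BASE_DIR, 'models/awesome'),
                               out_dir=None)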
Example #5
import argparse
import glob
import sys


def main():
    """
    main()

    Purpose: Evaluate the classifier with k-fold cross validation.
    """

    parser = argparse.ArgumentParser()

    parser.add_argument("-t",
                        dest="txt",
                        help="The files that contain the training examples",
                        default=os.path.join(BASE_DIR, 'data/annotated.txt'))

    parser.add_argument("-n",
                        dest="length",
                        help="Number of data points to use",
                        default=-1)

    parser.add_argument("-f",
                        dest="folds",
                        help="Number of folds to partition data into",
                        default=10)

    parser.add_argument("-g",
                        dest="grid",
                        help="Perform Grid Search",
                        type=bool,
                        default=False)

    parser.add_argument("-r",
                        dest="random",
                        help="Random shuffling of input data.",
                        action='store_true',
                        default=False)

    # Parse the command line arguments
    args = parser.parse_args()

    # Decode arguments
    txt_files = glob.glob(args.txt)
    length = int(args.length)
    num_folds = int(args.folds)

    # Get data from files
    if not txt_files:
        print('no training files :(')
        sys.exit(1)

    notes = []
    for txt in txt_files:
        note_tmp = Note()
        note_tmp.read(txt)
        notes.append(note_tmp)

    # List of all data
    X = []
    Y = []
    for n in notes:
        X += zip(n.getIDs(), n.getTweets())
        Y += n.getLabels()

    # Build confusion matrix
    confusion = [[0 for i in labels_map] for j in labels_map]

    # Instantiate the feature extractor once (rebuilding it on every fold
    # would significantly slow down cross validation)
    feat_obj = FeaturesWrapper()

    # Extract features once; truncate only when a positive length was given
    # (slicing with the default of -1 would silently drop the last example)
    feats = train.extract_features(X, feat_obj)
    data = list(zip(feats, Y))
    if length > 0:
        data = data[:length]

    print(len(data))

    # For each held-out test set
    i = 1
    for training, testing in cv_partitions(data,
                                           num_folds=num_folds,
                                           shuffle=args.random):

        # Users like to see progress
        print('Fold: %d of %d' % (i, num_folds))
        i += 1

        # Train on non-heldout data
        X_train = [d[0] for d in training]
        Y_train = [d[1] for d in training]
        vec, clf = train.train_vectorized(X_train,
                                          Y_train,
                                          model_path=None,
                                          grid=False)

        # Predict on held out
        X_test = [d[0] for d in testing]
        Y_test = [d[1] for d in testing]
        labels, confs = predict.predict_vectorized(X_test, clf, vec)

        # Compute confusion matrix for held_out data
        testing_confusion = evaluate.create_confusion(Y_test, labels)
        confusion = add_matrix(confusion, testing_confusion)

    # Evaluate
    evaluate.display_confusion(confusion)
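# cv_partitions() and add_matrix() are used above but not shown in this
# listing. Minimal sketches of the assumed behavior: cv_partitions yields
# (training, testing) splits across num_folds folds, and add_matrix sums two
# confusion matrices elementwise. These are reconstructions, not the
# project's actual implementations.
import random


def _cv_partitions_sketch(data, num_folds=10, shuffle=False):
    data = list(data)
    if shuffle:
        random.shuffle(data)
    for fold in range(num_folds):
        testing = data[fold::num_folds]
        training = [d for i, d in enumerate(data) if i % num_folds != fold]
        yield training, testing


def _add_matrix_sketch(A, B):
    return [[a + b for a, b in zip(row_a, row_b)]
            for row_a, row_b in zip(A, B)]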
def extract_features(X, feat_obj=None):
    # Data -> features; build a default FeaturesWrapper if none was supplied
    if feat_obj is None:
        feat_obj = FeaturesWrapper()
    return feat_obj.extract_features(X)