Esempio n. 1
0
def main(argv):
    """Train a Naive Bayes tweet classifier and evaluate it.

    Parses the command-line options, loads labelled tweet features (either
    by extracting them from the Stanford Twitter CSV or by reading a
    previously written feature file), splits them 90/10 into train and test
    sets, trains the selected classifier and hands it to
    ``evaluate_features``.

    Args:
        argv: Command-line arguments without the program name. Recognised
            options:

            * ``-h`` / ``--help``: print usage and exit.
            * ``-i <path>``: path to the Stanford Twitter CSV data set.
            * ``-f <path>``: read one feature record per line from this
              file instead of parsing the CSV.
            * ``--bernoulli`` / ``--multinomial`` / ``--gaussian``: train
              the corresponding scikit-learn Naive Bayes estimator through
              NLTK's ``SklearnClassifier`` instead of NLTK's own
              ``NaiveBayesClassifier``.

    Raises:
        SystemExit: with status 2 on an invalid option, or status 0 after
            printing help.
    """
    # Initial local path for Stanford Twitter Data
    PATH = './StanfordTweetData/training.1600000.processed.noemoticon.csv'
    FEAT_PATH = './twitter_features.txt'

    # Parse command line arguments
    try:
        long_flags = ["help", "bernoulli", "multinomial", "gaussian"]
        opts, args = getopt.getopt(argv, "hi:f:", long_flags)
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    # Classifier variable. Used for training on tweet features below
    classifier = NaiveBayesClassifier
    # Remember whether -f was given so we know where to load features from.
    use_feature_file = False
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            sys.exit()
        elif opt == '-i':
            # Updates PATH to Stanford Tweet CSV data set
            if arg:
                PATH = arg
            else:
                print('Argument expected for the -i option\n')
                usage()
                sys.exit(2)
        elif opt == '-f':
            FEAT_PATH = arg
            use_feature_file = True
        elif opt in ("--bernoulli", "--multinomial", "--gaussian"):
            # getopt reports long options WITH their leading "--", so the
            # comparison must include it; without it this branch can never
            # be reached.
            #
            # This section allows you to use scikit-learn packages for
            # text classification.
            #
            # NLTK's SklearnClassifier makes the process much easier,
            # since you don't have to convert feature dictionaries to
            # numpy arrays yourself, or keep track of all known features.
            # The scikit-learn classifiers also tend to be more memory
            # efficient than the standard NLTK classifiers, due to their
            # use of sparse arrays.
            #
            # Credit to "Jacob" and his post on Streamhacker.com
            pipeline = None
            if opt == "--bernoulli":
                pipeline = Pipeline([('nb', BernoulliNB())])
            elif opt == "--multinomial":
                pipeline = Pipeline([('nb', MultinomialNB())])
            elif opt == "--gaussian":
                # NOTE(review): GaussianNB presumably needs dense input
                # while SklearnClassifier builds sparse matrices by
                # default -- confirm this combination trains before
                # relying on it.
                pipeline = Pipeline([('nb', GaussianNB())])
            classifier = SklearnClassifier(pipeline)

    # Perform tweet parsing and learning
    print("Opening CSV file...")
    print("Extracting Features...")

    if use_feature_file:
        # SECURITY: eval() executes arbitrary Python found in the feature
        # file. Only point -f at files you generated yourself.
        with open(FEAT_PATH, 'r') as tweet_feats:
            all_data = [eval(line) for line in tweet_feats]
    else:
        all_data = open_stanford_twitter_csv(PATH, feat_extractor=word_feats)

    print("CSV file opened and features extracted")
    train_set, dev_set, test_set = split_tweets(all_data, train=.9,
                                                dev=0, test=.1, shuffle=True)
    print("Data split into sets")

    # NaiveBayesClassifier.train is a classmethod that RETURNS the trained
    # model, and SklearnClassifier.train returns the fitted wrapper itself,
    # so rebinding works for both and ensures a trained instance (not the
    # bare class) is evaluated below.
    classifier = classifier.train(train_set)
    print("Classifier trained")

    print("Evaluating accuracy and other features\n")
    evaluate_features(classifier, test_set)
Esempio n. 2
0
from nltk import NaiveBayesClassifier
from import_stanford_twitter import open_stanford_twitter_csv
from feature_evaluator import evaluate_features
from feature_extractors import word_feats
from ingest_twitter import split_tweets

# Announce the load / feature-extraction phase.
print("Opening CSV file...")
print("Extracting Features...")

# Hard-coded absolute path to the tweet CSV; adjacent literals are joined
# into a single string.
path = (
    "/Users/bretts/Documents/Preparing_The_Torch"
    "/StanfordTweetData/training.1600000.processed.noemoticon.csv"
)
all_data = open_stanford_twitter_csv(path, feat_extractor=word_feats)
print("CSV file opened and features extracted")

# 90% train / 10% test with shuffling; no development share is requested.
train_set, dev_set, test_set = split_tweets(
    all_data,
    train=0.9,
    dev=0,
    test=0.1,
    shuffle=True,
)
print("Data split into sets")

# NaiveBayesClassifier.train returns the trained model.
classifier = NaiveBayesClassifier.train(train_set)
print("Classifier trained")

evaluate_features(classifier, test_set)