def main(argv):
    """Drive the Sentiment140 Doc2Vec pipeline from the command line.

    Flags:
      -h / --help      print usage and exit
      -s / --save ARG  save the trained model to path ARG
      -t / --test      evaluate the trained model after training
      -v / --verbose   verbose corpus loading

    Exits with status 2 on bad arguments; exits early if neither
    --save nor --test was requested (training alone would be wasted work).
    """
    try:
        long_flags = ["help", "save", "test", "verbose"]
        opts, args = getopt.getopt(argv, "hs:tv", long_flags)
    except getopt.GetoptError:
        # Was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; catch only argument-parsing failures.
        usage()
        sys.exit(2)

    model_name = None
    testing = False
    verbose = False
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
        elif opt in ("-s", "--save"):
            print('Saving model to %s' % arg)
            model_name = arg
        elif opt in ("-t", "--test"):
            testing = True
        elif opt in ("-v", "--verbose"):
            verbose = True

    output_format = '%(asctime)s : %(levelname)s : %(message)s'
    logging.basicConfig(format=output_format, level=logging.INFO)

    # Prevents user from running script without saving
    # or testing the model
    if not (model_name or testing):
        logging.critical("Sentiment140_Pipeline script is neither saving or testing the model built")
        sys.exit()

    logging.info("Opening CSV file...")
    all_data = sentiment140.load_data(verbose=verbose)
    model = train_d2v_model(all_data, epoch_num=10)

    # Saves memory: freezes the vectors in place; the model cannot be
    # trained further after this call.
    model.init_sims(replace=True)

    if model_name:
        model.save(model_name)

    if testing:
        test_model(model)
def main(argv):
    """Train and evaluate a Naive Bayes sentiment classifier on tweet data.

    Flags:
      -h / --help     print usage and exit
      -f PATH         load pre-extracted feature dicts from PATH instead of
                      re-extracting them from the Sentiment140 CSV
      -v              verbose mode (also configures INFO-level logging)
      --bernoulli / --multinomial / --gaussian
                      use the corresponding scikit-learn Naive Bayes variant
                      (via NLTK's SklearnClassifier) instead of NLTK's own
                      NaiveBayesClassifier

    Exits with status 2 on bad arguments.
    """
    # Initial local path for Stanford Twitter Data Features is None
    FEAT_PATH = None
    verbose = False

    # Parse command line arguments
    try:
        long_flags = ["help", "bernoulli", "multinomial", "gaussian"]
        opts, args = getopt.getopt(argv, "hf:v", long_flags)
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    # Classifier variable. Used for training on tweet features below
    classifier = NaiveBayesClassifier

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            sys.exit()
        elif opt == '-f':
            FEAT_PATH = arg
        elif opt == '-v':
            verbose = True
            output_format = '%(asctime)s : %(levelname)s : %(message)s'
            logging.basicConfig(format=output_format, level=logging.INFO)
        # BUG FIX: getopt yields long options with their leading dashes
        # (e.g. '--bernoulli'), so the previous comparison against the bare
        # names ("bernoulli", ...) could never match and this branch was
        # unreachable dead code.
        elif opt in ("--bernoulli", "--multinomial", "--gaussian"):
            '''
            This section allows you to use scikit-learn packages for
            text classification.

            NLTKs SklearnClassifier makes the process much easier,
            since you dont have to convert feature dictionaries to
            numpy arrays yourself, or keep track of all known features.
            The Scikits classifiers also tend to be more memory efficient
            than the standard NLTK classifiers, due to their use of sparse
            arrays.

            Credit to "Jacob" and his post on Steamhacker.com
            '''
            pipeline = None

            if opt == "--bernoulli":
                pipeline = Pipeline([('nb', BernoulliNB())])
            elif opt == "--multinomial":
                pipeline = Pipeline([('nb', MultinomialNB())])
            elif opt == "--gaussian":
                pipeline = Pipeline([('nb', GaussianNB())])

            classifier = SklearnClassifier(pipeline)

    # Perform tweet parsing and learning
    logging.info("Opening CSV file...")
    logging.info("Extracting Features...")

    all_data = list()

    # Checks if all_data has already been set
    if FEAT_PATH is not None:
        # `with` ensures the feature file is closed (it previously leaked).
        # SECURITY NOTE: eval() executes arbitrary code from the feature
        # file — only use -f with trusted files. Consider ast.literal_eval.
        with open(FEAT_PATH, 'r') as tweet_feats:
            all_data = [eval(line) for line in tweet_feats]
    else:
        all_data = sentiment140.load_data(feat_extractor=word_feats,
                                          verbose=verbose)

    logging.info("CSV file opened and features extracted")
    train_set, dev_set, test_set = split_data(all_data, train=.9,
                                              dev=0, test=.1, shuffle=True)

    logging.info("Data split into sets")
    classifier = classifier.train(train_set)
    logging.info("Classifier trained")

    logging.info("Evaluating accuracy and other features\n")
    test_model(classifier, test_set)