def main(arguments):
    # Enable logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        filename=LOG_FNAME,
                        level=logging.INFO)

    # Parse optional filename arguments
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-s', '--sarcastic-tweets', dest='sar_dir',
                        help="Directory of example sarcastic tweets",
                        default="../data/labeled_data/sarcastic/")
    parser.add_argument('-p', '--positive-tweets', dest='pos_dir',
                        help="Directory of example positive tweets",
                        default="../data/labeled_data/positive/")
    parser.add_argument('-n', '--negative-tweets', dest='neg_dir',
                        help="Directory of example negative tweets",
                        default="../data/labeled_data/negative/")
    parser.add_argument('-c', '--sample-count', dest='sample_count',
                        help="Max number of samples of each class",
                        default="10000")  # 10k default, ~300k max with current data
    args = parser.parse_args(arguments)

    # Set random seed
    np.random.seed(RAND_SEED)

    # Create tweet iterators
    log_print("Creating tweet iterators...")
    sar_tweets_iter = Tweets([args.sar_dir])
    pos_tweets_iter = Tweets([args.pos_dir])
    neg_tweets_iter = Tweets([args.neg_dir])
    log_print()

    # Load tweets into lists with gold-label hashtags filtered out
    log_print("Loading sarcastic tweets with gold labels filtered...")
    sar_tweets = [Tweets.filter_tags(tweet) for tweet in sar_tweets_iter]
    log_print("...loaded {} sarcastic tweets".format(len(sar_tweets)))
    log_print("Loading non-sarcastic tweets...")
    pos_tweets = [Tweets.filter_tags(tweet) for tweet in pos_tweets_iter]
    log_print("...loaded {} positive tweets...".format(len(pos_tweets)))
    neg_tweets = [Tweets.filter_tags(tweet) for tweet in neg_tweets_iter]
    log_print("...loaded {} negative tweets".format(len(neg_tweets)))

    # Select balanced samples: sarcastic vs. an even mix of positive and negative
    log_print("Selecting balanced sample sets of {} tweets per class...".format(
        args.sample_count))
    sample_count = int(args.sample_count)
    sar_tweets = resample(sar_tweets, n_samples=sample_count,
                          replace=False, random_state=1)
    pos_tweets = resample(pos_tweets, n_samples=sample_count // 2,
                          replace=False, random_state=2)
    neg_tweets = resample(neg_tweets, n_samples=sample_count // 2,
                          replace=False, random_state=3)
    non_tweets = pos_tweets + neg_tweets
    log_print()

    # Shuffle tweets and labels together
    log_print("Shuffle all tweets...")
    sar_labels = [1 for _ in sar_tweets]
    non_labels = [0 for _ in non_tweets]
    tweets = np.append(sar_tweets, non_tweets)
    labels = np.append(sar_labels, non_labels)
    tweets, labels = shuffle(tweets, labels, random_state=4)
    log_print()

    # Write training, dev, and test sets to output files
    log_print("Writing training, dev, and test sets to files...")
    output_gen = (n for n in zip(tweets, labels))  # generator of (tweet, label) tuples
    with open(OUTFNAME_FORMAT.format("test"), "w+") as f:
        for tweet, label in itertools.islice(output_gen, sample_count // 10):
            f.write("{}\t{}\n".format(label, ' '.join(tweet)))
    with open(OUTFNAME_FORMAT.format("dev"), "w+") as f:
        for tweet, label in itertools.islice(output_gen, sample_count // 10):
            f.write("{}\t{}\n".format(label, ' '.join(tweet)))
    with open(OUTFNAME_FORMAT.format("train"), "w+") as f:
        for tweet, label in output_gen:  # remaining tweets go to training
            f.write("{}\t{}\n".format(label, ' '.join(tweet)))
    log_print("...training, dev, and test sets written to files {}, {}, and {}".format(
        OUTFNAME_FORMAT.format("train"),
        OUTFNAME_FORMAT.format("dev"),
        OUTFNAME_FORMAT.format("test")))
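

# The main() above relies on module-level imports, constants (LOG_FNAME, RAND_SEED,
# OUTFNAME_FORMAT), a log_print() helper, and the project-specific Tweets iterator
# class, none of which appear in this excerpt. The scaffolding below is a minimal
# sketch of what they might look like; every name and value here is an assumption,
# not taken from the original source, and Tweets itself is not sketched.
import argparse
import itertools
import logging
import sys

import numpy as np
from sklearn.utils import resample, shuffle

LOG_FNAME = "build_sarcasm_dataset.log"                 # assumed log file name
RAND_SEED = 42                                          # assumed global seed
OUTFNAME_FORMAT = "../data/split_data/sarcasm_{}.tsv"   # assumed output path template


def log_print(message=""):
    """Echo a progress message to stdout and to the log file."""
    print(message)
    logging.info(message)


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))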
def main(pos_dir, neg_dir, sar_dir, random_seed):
    np.random.seed(random_seed)

    # Create tweet iterators
    update("Creating tweet iterators...")
    pos_tweets_iter = Tweets([pos_dir])
    neg_tweets_iter = Tweets([neg_dir])
    sar_tweets_iter = Tweets([sar_dir])
    update()

    # Load tweets into lists (gold-label hashtags filtered out) and shuffle each class
    update("Loading positive tweets...")
    pos_tweets = [
        ' '.join(Tweets.filter_tags(tweet)) for tweet in pos_tweets_iter
    ]
    pos_tweets = shuffle(pos_tweets)
    update()
    update("Loading negative tweets...")
    neg_tweets = [
        ' '.join(Tweets.filter_tags(tweet)) for tweet in neg_tweets_iter
    ]
    neg_tweets = shuffle(neg_tweets)
    update()
    update("Loading sarcastic tweets...")
    sar_tweets = [
        ' '.join(Tweets.filter_tags(tweet)) for tweet in sar_tweets_iter
    ]
    sar_tweets = shuffle(sar_tweets)
    update()

    # Save sarcasm data: all sarcastic tweets vs. an even mix of positive and negative
    update("Saving sarcasm data...")
    count = len(sar_tweets)
    print("len pos_tweets before take = {}".format(len(pos_tweets)))
    non_sar_tweets = take(pos_tweets, count // 2) + take(neg_tweets, count // 2)
    print("len pos_tweets after take = {}".format(len(pos_tweets)))
    sar_labels = [1 for _ in sar_tweets]
    non_sar_labels = [0 for _ in non_sar_tweets]
    sarcasm_data = np.append(sar_tweets, non_sar_tweets)
    sarcasm_labels = np.append(sar_labels, non_sar_labels)
    sarcasm_data, sarcasm_labels = shuffle(sarcasm_data, sarcasm_labels)
    size = len(sarcasm_data)
    # 80/10/10 train/dev/test split
    train = slice(0, int(0.8 * size))
    dev = slice(int(0.8 * size), int(0.9 * size))
    test = slice(int(0.9 * size), size)
    sarcasm_dump = {
        "train": (sarcasm_data[train], sarcasm_labels[train]),
        "dev": (sarcasm_data[dev], sarcasm_labels[dev]),
        "test": (sarcasm_data[test], sarcasm_labels[test])
    }
    with open(os.path.join(SPLIT_DATA_DIR, "sarcasm.pkl"), 'wb') as f:
        pickle.dump(sarcasm_dump, f)
    update()

    # Save sentiment data: balanced positive vs. negative
    update("Saving sentiment data...")
    count = min(len(pos_tweets), len(neg_tweets))
    pos_tweets = pos_tweets[:count]
    neg_tweets = neg_tweets[:count]
    pos_labels = [1 for _ in pos_tweets]
    neg_labels = [0 for _ in neg_tweets]
    sentiment_data = np.append(pos_tweets, neg_tweets)
    sentiment_labels = np.append(pos_labels, neg_labels)
    sentiment_data, sentiment_labels = shuffle(sentiment_data, sentiment_labels)
    size = len(sentiment_data)
    # 80/10/10 train/dev/test split
    train = slice(0, int(0.8 * size))
    dev = slice(int(0.8 * size), int(0.9 * size))
    test = slice(int(0.9 * size), size)
    sentiment_dump = {
        "train": (sentiment_data[train], sentiment_labels[train]),
        "dev": (sentiment_data[dev], sentiment_labels[dev]),
        "test": (sentiment_data[test], sentiment_labels[test])
    }
    with open(os.path.join(SPLIT_DATA_DIR, "sentiment.pkl"), 'wb') as f:
        pickle.dump(sentiment_dump, f)
    update()
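

# This main() assumes an update() progress helper, a take() helper, a SPLIT_DATA_DIR
# constant, and the project-specific Tweets class, all defined elsewhere. The sketch
# below is one plausible version of that scaffolding; every definition is an assumption.
# In particular, take() is sketched here as a non-destructive prefix slice; the debug
# prints above suggest the original author was checking whether it mutates its input,
# so the real helper may remove the taken items from the source list instead.
import os
import pickle

import numpy as np
from sklearn.utils import shuffle

SPLIT_DATA_DIR = "../data/split_data/"  # assumed output directory


def update(message=""):
    """Print a progress message (a blank call just prints an empty separator line)."""
    print(message)


def take(items, n):
    """Return the first n items as a new list without modifying the source."""
    return list(items[:n])


if __name__ == "__main__":
    # Assumed invocation; the original may wire these arguments up via argparse instead.
    main(pos_dir="../data/labeled_data/positive/",
         neg_dir="../data/labeled_data/negative/",
         sar_dir="../data/labeled_data/sarcastic/",
         random_seed=42)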
def main(arguments):
    # Parse optional filename arguments
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-p', '--positive-tweets', dest='pos_dir',
                        help="Directory of example positive tweets",
                        default="../data/labeled_data/positive/")
    parser.add_argument('-n', '--negative-tweets', dest='neg_dir',
                        help="Directory of example negative tweets",
                        default="../data/labeled_data/negative/")
    parser.add_argument('-c', '--sample-count', dest='sample_count',
                        help="Max number of samples of each sentiment",
                        default="800000")
    args = parser.parse_args(arguments)

    # Create tweet iterators
    update("Creating tweet iterators...")
    pos_tweets_iter = Tweets([args.pos_dir])
    neg_tweets_iter = Tweets([args.neg_dir])
    update()

    # Load tweets into lists with gold-label hashtags filtered out
    update("Loading positive tweets...")
    pos_tweets = [' '.join(Tweets.filter_tags(tweet))
                  for tweet in pos_tweets_iter]
    update()
    update("Loading negative tweets...")
    neg_tweets = [' '.join(Tweets.filter_tags(tweet))
                  for tweet in neg_tweets_iter]
    update()

    # Select balanced sample sets of each sentiment
    update("Selecting balanced sample sets...")
    sample_count = int(args.sample_count)
    pos_tweets = resample(pos_tweets, n_samples=sample_count,
                          replace=False, random_state=1)
    neg_tweets = resample(neg_tweets, n_samples=sample_count,
                          replace=False, random_state=2)
    update()

    # Shuffle tweets and split into training, dev, and test sets
    update("Shuffle tweets and split into training, dev, and test sets...")
    pos_labels = [1 for _ in pos_tweets]
    neg_labels = [0 for _ in neg_tweets]
    tweets = np.append(pos_tweets, neg_tweets)
    labels = np.append(pos_labels, neg_labels)
    tweets, labels = shuffle(tweets, labels, random_state=2)
    size = len(labels)
    # 80/10/10 train/dev/test split
    train = slice(0, int(0.8 * size))
    dev = slice(int(0.8 * size), int(0.9 * size))
    test = slice(int(0.9 * size), size)
    update()
    print()

    # Build pipeline and run grid search
    print("Performing grid search...")
    pipeline = Pipeline([('vect', CountVectorizer()),
                         # ('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB())])
    parameters = {
        # TODO: check which parameters actually affect use in sarcasm detection
        'vect__tokenizer': [tokenizer],
        'vect__stop_words': [None],
        'vect__binary': [False],
        'vect__ngram_range': [(1, 5)],
        # 'tfidf__norm': [None, 'l1', 'l2'],
        # 'tfidf__use_idf': [True, False],
        # 'tfidf__smooth_idf': [True, False],
        # 'tfidf__sublinear_tf': [True, False],
        'clf__alpha': [1.0],  # check range, these are guesses
        'clf__fit_prior': [False],  # not sure what the distribution in sarcasm data is
    }
    clf_pipe = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    clf_pipe.fit(tweets[train], labels[train])
    print("Done in %0.3fs" % (time() - t0))
    print()

    # Print grid search results
    print("Best score: %0.3f" % clf_pipe.best_score_)
    print("Best parameters set:")
    best_parameters = clf_pipe.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    print()

    # Evaluate classifier on the held-out test set
    vect = clf_pipe.best_estimator_.named_steps['vect']
    # tfidf = clf_pipe.best_estimator_.named_steps['tfidf']
    clf = clf_pipe.best_estimator_.named_steps['clf']
    predicted = clf_pipe.predict(tweets[test])
    print("Classifier Evaluation:")
    print(metrics.classification_report(labels[test], predicted,
                                        target_names=["-", "+"]))

    # Save classifier
    with open(MODEL_FNAME, 'wb') as f:
        pickle.dump(clf_pipe, f)
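

# As with the scripts above, this main() depends on module-level imports, a tokenizer
# callable passed to CountVectorizer, an update() helper, and a MODEL_FNAME constant
# that are defined elsewhere in the file. The scaffolding below is a minimal sketch
# under those assumptions; names, paths, and the whitespace tokenizer are illustrative
# placeholders rather than the project's actual definitions.
import argparse
import pickle
import sys
from pprint import pprint
from time import time

import numpy as np
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.utils import resample, shuffle

MODEL_FNAME = "../models/sentiment_nb.pkl"  # assumed output path for the fitted pipeline


def tokenizer(text):
    """Placeholder tokenizer: plain whitespace split; the project likely uses its own."""
    return text.split()


def update(message=""):
    """Print a progress message (a blank call just prints an empty separator line)."""
    print(message)


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))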