def main(arguments):
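    # Assumed context (not shown in this snippet): module-level constants
    # LOG_FNAME, RAND_SEED, and OUTFNAME_FORMAT, the Tweets corpus iterator
    # class, the log_print helper, and imports of logging, argparse,
    # itertools, numpy as np, and sklearn.utils' resample and shuffle.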

    # enable logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        filename=LOG_FNAME,
                        level=logging.INFO)

    # parse optional filename arguments
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-s',
                        '--sarcastic-tweets',
                        dest='sar_dir',
                        help="Directory of example sarcastic tweets",
                        default="../data/labeled_data/sarcastic/")
    parser.add_argument('-p',
                        '--positive-tweets',
                        dest='pos_dir',
                        help="Directory of example positive tweets",
                        default="../data/labeled_data/positive/")
    parser.add_argument('-n',
                        '--negative-tweets',
                        dest='neg_dir',
                        help="Directory of example negative tweets",
                        default="../data/labeled_data/negative/")
    parser.add_argument(
        '-c',
        '--sample-count',
        dest='sample_count',
        help="Max number of samples of each class",
        default="10000")  # 10k default, ~300k max with current data

    args = parser.parse_args(arguments)

    # set random seed
    np.random.seed(RAND_SEED)

    # create tweets iterators
    log_print("Creating tweet iterators...")
    sar_tweets_iter = Tweets([args.sar_dir])
    pos_tweets_iter = Tweets([args.pos_dir])
    neg_tweets_iter = Tweets([args.neg_dir])
    log_print()

    # load tweets into lists with the gold-label hashtags filtered out
    log_print("Loading sarcastic tweets with gold labels filtered...")
    sar_tweets = [Tweets.filter_tags(tweet) for tweet in sar_tweets_iter]
    log_print("...loaded {} sarcastic tweets".format(len(sar_tweets)))

    log_print("Loading non-sarcastic tweets...")
    pos_tweets = [Tweets.filter_tags(tweet)
                  for tweet in pos_tweets_iter]  # filter gold label hashtags
    log_print("...loaded {} positive tweets...".format(len(pos_tweets)))
    neg_tweets = [Tweets.filter_tags(tweet) for tweet in neg_tweets_iter]
    log_print("...loaded {} negative tweets".format(len(neg_tweets)))

    log_print(
        "Selecting balanced sample sets of {} tweets per class...".format(
            args.sample_count))
    sample_count = int(args.sample_count)
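    # Draw sample_count sarcastic tweets, and sample_count // 2 tweets of each
    # non-sarcastic polarity, so the combined non-sarcastic class matches the
    # sarcastic class in size.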
    sar_tweets = resample(sar_tweets,
                          n_samples=sample_count,
                          replace=False,
                          random_state=1)
    pos_tweets = resample(pos_tweets,
                          n_samples=sample_count // 2,
                          replace=False,
                          random_state=2)
    neg_tweets = resample(neg_tweets,
                          n_samples=sample_count // 2,
                          replace=False,
                          random_state=3)
    non_tweets = pos_tweets + neg_tweets
    log_print()

    # label, combine, and shuffle all tweets
    log_print("Shuffle all tweets...")
    sar_labels = [1 for _ in sar_tweets]
    non_labels = [0 for _ in non_tweets]

    # plain list concatenation keeps the variable-length token lists intact
    # (np.append would try to build a ragged array from them)
    tweets = sar_tweets + non_tweets
    labels = sar_labels + non_labels

    tweets, labels = shuffle(tweets, labels, random_state=4)
    log_print()

    # write to output files
    log_print("Writing training, dev, and test sets to files...")
    output_gen = iter(zip(tweets, labels))  # iterator of (tweet, label) pairs
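    # islice advances the shared iterator, so the first sample_count // 10
    # (tweet, label) pairs are written to the test file, the next
    # sample_count // 10 to dev, and everything remaining to train.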
    with open(OUTFNAME_FORMAT.format("test"), "w+") as f:
        for tweet, label in itertools.islice(output_gen, sample_count // 10):
            f.write("{}\t{}\n".format(label, ' '.join(tweet)))
    with open(OUTFNAME_FORMAT.format("dev"), "w+") as f:
        for tweet, label in itertools.islice(output_gen, sample_count // 10):
            f.write("{}\t{}\n".format(label, ' '.join(tweet)))
    with open(OUTFNAME_FORMAT.format("train"), "w+") as f:
        for tweet, label in output_gen:
            f.write("{}\t{}\n".format(label, ' '.join(tweet)))

    log_print(
        "...training, dev, and test sets written to files {}, {}, and {}".
        format(OUTFNAME_FORMAT.format("train"), OUTFNAME_FORMAT.format("dev"),
               OUTFNAME_FORMAT.format("test")))
Example #2
def main(pos_dir, neg_dir, sar_dir, random_seed):
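    # Assumed context (not shown in this snippet): the module-level constant
    # SPLIT_DATA_DIR, the Tweets iterator class, the update and take helpers,
    # and imports of os, pickle, numpy as np, and sklearn.utils.shuffle.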
    np.random.seed(random_seed)

    # Create tweets iterators
    update("Creating tweet iterators...")
    pos_tweets_iter = Tweets([pos_dir])
    neg_tweets_iter = Tweets([neg_dir])
    sar_tweets_iter = Tweets([sar_dir])
    update()

    # Load tweets into lists and shuffle
    update("Loading positive tweets...")
    pos_tweets = [
        ' '.join(Tweets.filter_tags(tweet)) for tweet in pos_tweets_iter
    ]
    pos_tweets = shuffle(pos_tweets)
    update()

    update("Loading negative tweets...")
    neg_tweets = [
        ' '.join(Tweets.filter_tags(tweet)) for tweet in neg_tweets_iter
    ]
    neg_tweets = shuffle(neg_tweets)
    update()

    update("Loading sarcastic tweets...")
    sar_tweets = [
        ' '.join(Tweets.filter_tags(tweet)) for tweet in sar_tweets_iter
    ]
    sar_tweets = shuffle(sar_tweets)
    update()

    # Save sarcasm data
    update("Saving sarcasm data...")
    count = len(sar_tweets)
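    # take(seq, n) is assumed to be a module-level helper returning the first
    # n items of seq without mutating it (the prints below sanity-check that).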
    print("len pos_tweets before take = {}".format(len(pos_tweets)))
    non_sar_tweets = take(pos_tweets, count // 2) + take(
        neg_tweets, count // 2)
    print("len pos_tweets after take = {}".format(len(pos_tweets)))
    sar_labels = [1 for _ in sar_tweets]
    non_sar_labels = [0 for _ in non_sar_tweets]

    sarcasm_data = np.append(sar_tweets, non_sar_tweets)
    sarcasm_labels = np.append(sar_labels, non_sar_labels)

    sarcasm_data, sarcasm_labels = shuffle(sarcasm_data, sarcasm_labels)

    size = len(sarcasm_data)
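    # 80/10/10 train/dev/test split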
    train = slice(0, int(0.8 * size))
    dev = slice(int(0.8 * size), int(0.9 * size))
    test = slice(int(0.9 * size), size)

    sarcasm_dump = {
        "train": (sarcasm_data[train], sarcasm_labels[train]),
        "dev": (sarcasm_data[dev], sarcasm_labels[dev]),
        "test": (sarcasm_data[test], sarcasm_labels[test])
    }

    with open(os.path.join(SPLIT_DATA_DIR, "sarcasm.pkl"), 'wb') as f:
        pickle.dump(sarcasm_dump, f)
    update()

    # Save sentiment data
    update("Saving sentiment data...")
    count = min(len(pos_tweets), len(neg_tweets))
    pos_tweets = pos_tweets[:count]
    neg_tweets = neg_tweets[:count]
    pos_labels = [1 for _ in pos_tweets]
    neg_labels = [0 for _ in neg_tweets]

    sentiment_data = np.append(pos_tweets, neg_tweets)
    sentiment_labels = np.append(pos_labels, neg_labels)

    sentiment_data, sentiment_labels = shuffle(sentiment_data,
                                               sentiment_labels)

    size = len(sentiment_data)
    train = slice(0, int(0.8 * size))
    dev = slice(int(0.8 * size), int(0.9 * size))
    test = slice(int(0.9 * size), size)

    sentiment_dump = {
        "train": (sentiment_data[train], sentiment_labels[train]),
        "dev": (sentiment_data[dev], sentiment_labels[dev]),
        "test": (sentiment_data[test], sentiment_labels[test])
    }

    with open(os.path.join(SPLIT_DATA_DIR, "sentiment.pkl"), 'wb') as f:
        pickle.dump(sentiment_dump, f)
    update()
Example #3
def main(arguments):
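    # Assumed context (not shown in this snippet): the module-level constant
    # MODEL_FNAME, the Tweets iterator class, the update helper, a tokenizer
    # function for CountVectorizer, and imports of argparse, pickle,
    # numpy as np, pprint, time, and the scikit-learn pipeline, naive Bayes,
    # grid search, metrics, resample, and shuffle utilities.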

    # Parse optional filename arguments
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-p', '--positive-tweets', dest='pos_dir',
                        help="Directory of example positive tweets",
                        default="../data/labeled_data/positive/")
    parser.add_argument('-n', '--negative-tweets', dest='neg_dir',
                        help="Directory of example negative tweets",
                        default="../data/labeled_data/negative/")
    parser.add_argument('-c', '--sample-count', dest='sample_count',
                        help="Max number of samples of each sentiment",
                        default="800000")

    args = parser.parse_args(arguments)

    # Create Tweets Iterators
    update("Creating tweet iterators...")
    pos_tweets_iter = Tweets([args.pos_dir])
    neg_tweets_iter = Tweets([args.neg_dir])
    update()

    # Load tweets into lists
    update("Loading positive tweets...")
    pos_tweets = [' '.join(Tweets.filter_tags(tweet)) for tweet in pos_tweets_iter]
    update()

    update("Loading negative tweets...")
    neg_tweets = [' '.join(Tweets.filter_tags(tweet)) for tweet in neg_tweets_iter]
    update()

    update("Selecting balanced sample sets...")
    sample_count = int(args.sample_count)
    pos_tweets = resample(pos_tweets, n_samples=sample_count,
                          replace=False, random_state=1)
    neg_tweets = resample(neg_tweets, n_samples=sample_count,
                          replace=False, random_state=2)
    update()

    # Shuffle tweets and split into training, dev, and test
    update("Shuffle tweets and split into training, dev, and test sets...")
    pos_labels = [1 for _ in pos_tweets]
    neg_labels = [0 for _ in neg_tweets]

    tweets = np.append(pos_tweets, neg_tweets)
    labels = np.append(pos_labels, neg_labels)

    tweets, labels = shuffle(tweets, labels, random_state=2)
    size = len(labels)
    train = slice(0, int(0.8 * size))
    dev = slice(int(0.8 * size), int(0.9 * size))
    test = slice(int(0.9 * size), size)
    update()
    print()

    # Build Pipeline
    print("Performing grid search...")
    pipeline = Pipeline([('vect', CountVectorizer()),
                         #('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB())])

    parameters = {  # TODO: check which parameters actually affect sarcasm detection
            'vect__tokenizer': [tokenizer],
            'vect__stop_words': [None],
            'vect__binary': [False],
            'vect__ngram_range': [(1, 5)],
            #'tfidf__norm': [None, 'l1', 'l2'],
            #'tfidf__use_idf': [True, False],
            #'tfidf__smooth_idf': [True, False],
            #'tfidf__sublinear_tf': [True, False],
            'clf__alpha': [1.0], # check range, these are guesses
            'clf__fit_prior': [False], # not sure what the distribution in sarcasm data is
    }
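    # Each parameter list above holds a single value, so this grid search
    # currently fits only one candidate per CV fold; add more values to the
    # lists to search over alternative settings.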

    clf_pipe = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    clf_pipe.fit(tweets[train], labels[train])
    print("Done in %0.3fs" % (time() - t0))
    print()

    # Print grid search results
    print("Best score: %0.3f" % clf_pipe.best_score_)
    print("Best parameters set:")
    best_parameters = clf_pipe.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    print()

    # Evaluate classifier
    vect  = clf_pipe.best_estimator_.named_steps['vect']
    #tfidf = clf_pipe.best_estimator_.named_steps['tfidf']
    clf   = clf_pipe.best_estimator_.named_steps['clf']
    predicted = clf_pipe.predict(tweets[test])

    print("Classifier Evaluation:")
    print(metrics.classification_report(labels[test], predicted,
                                        target_names=["-", "+"]))

    # save classifier
    with open(MODEL_FNAME, 'wb') as f:
        pickle.dump(clf_pipe, f)