Example #1
import numpy as np
from nltk.tokenize import TweetTokenizer
from nltk.classify import NaiveBayesClassifier
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import (mark_negation, extract_unigram_feats,
                                 extract_bigram_feats)
from sklearn.model_selection import StratifiedKFold


def main():
    x, y = load_datasets(["../datasets/sentiment_uci/yelp_labelled.txt"])

    stopwords = set()
    with open('../stopwords.txt', 'r') as f:
        for w in f:
            stopwords.add(w.strip())

    tok = TweetTokenizer()

    x = [remove_stopwords(tok.tokenize(s.lower()), stopwords) for s in x]
    x = np.array(x)

    accumulate = dict()
    folds = 10
    # scikit-learn >= 0.18 API: configure the splitter, then iterate over .split(x, y)
    for train_idx, test_idx in StratifiedKFold(n_splits=folds,
                                               shuffle=True).split(x, y):
        train_x, train_y = x[train_idx], y[train_idx]
        test_x, test_y = x[test_idx], y[test_idx]

        # train_x = [remove_stopwords(tok.tokenize(s), stopwords) for s in train_x]
        # test_x = [remove_stopwords(tok.tokenize(s), stopwords) for s in test_x]

        train_docs = [(sent, label) for sent, label in zip(train_x, train_y)]
        test_docs = [(sent, label) for sent, label in zip(test_x, test_y)]

        cls = SentimentAnalyzer()

        # train
        words_with_neg = cls.all_words([mark_negation(a) for a in train_x])
        unigram_feats = cls.unigram_word_feats(words_with_neg)
        bigram_feats = cls.bigram_collocation_feats(train_x)

        cls.add_feat_extractor(extract_unigram_feats,
                               unigrams=unigram_feats,
                               handle_negation=True)
        cls.add_feat_extractor(extract_bigram_feats, bigrams=bigram_feats)

        training_set = cls.apply_features(train_docs, labeled=True)

        cls.train(NaiveBayesClassifier.train, training_set)

        # test & evaluate
        test_set = cls.apply_features(test_docs)

        for key, value in sorted(cls.evaluate(test_set).items()):
            print('\t{0}: {1}'.format(key, value))
            accumulate.setdefault(key, 0.0)
            accumulate[key] += value

    print("Averages")
    for key, value in sorted(accumulate.items()):
        print('\tAverage {0}: {1}'.format(key, value / folds))
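The helpers load_datasets and remove_stopwords are not shown in this example. A minimal sketch of what they might look like, assuming the tab-separated sentence/label format of the UCI sentiment-labelled sets (both helpers are hypothetical reconstructions, not the original code):

import numpy as np

def load_datasets(paths):
    # Hypothetical helper: each file holds "sentence<TAB>label" lines, the format
    # of the UCI sentiment-labelled-sentences files such as yelp_labelled.txt.
    sentences, labels = [], []
    for path in paths:
        with open(path, encoding="utf-8") as f:
            for line in f:
                line = line.rstrip("\n")
                if not line:
                    continue
                sent, label = line.rsplit("\t", 1)
                sentences.append(sent)
                labels.append(int(label))
    return sentences, np.array(labels)

def remove_stopwords(tokens, stopwords):
    # Hypothetical helper: drop stopword tokens from an already tokenized sentence.
    return [t for t in tokens if t not in stopwords]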
Example #2
from nltk.classify import NaiveBayesClassifier
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import extract_unigram_feats, extract_bigram_feats


def run_sa_twitt(train, test):
    a = SentimentAnalyzer()
    tr = NaiveBayesClassifier.train
    all_words = [word for word in a.all_words(train)]
    # Add simple unigram word features
    unigram_feats = a.unigram_word_feats(all_words, top_n=1000)
    a.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Add bigram collocation features
    bigram_collocs_feats = a.bigram_collocation_feats(
        [tweet[0] for tweet in train], top_n=100, min_freq=12)
    a.add_feat_extractor(extract_bigram_feats, bigrams=bigram_collocs_feats)

    tr_set = a.apply_features(train)
    test_set = a.apply_features(test)

    #Training
    clf = a.train(tr, tr_set)
    res = a.evaluate(test_set)

    print(res)
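SentimentAnalyzer expects documents as (token_list, label) pairs, so a call to run_sa_twitt might be prepared like this. This is a hypothetical sketch using NLTK's twitter_samples corpus; the corpus choice and the 80/20 split are assumptions, not part of the original example:

from nltk.tokenize import TweetTokenizer
from nltk.corpus import twitter_samples  # requires nltk.download('twitter_samples')

tok = TweetTokenizer(preserve_case=False)
pos = [(tok.tokenize(t), 'pos') for t in twitter_samples.strings('positive_tweets.json')]
neg = [(tok.tokenize(t), 'neg') for t in twitter_samples.strings('negative_tweets.json')]

# roughly 80/20 train/test split with balanced classes (5000 tweets per label)
train = pos[:4000] + neg[:4000]
test = pos[4000:] + neg[4000:]
run_sa_twitt(train, test)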
Example #3
def demo_tweets(trainer, n_instances=None, output=None):
    """
    Train and test a Naive Bayes classifier on 10000 tweets, tokenized using
    TweetTokenizer.
    Features are composed of:
        - 1000 most frequent unigrams
        - 100 top bigrams (using BigramAssocMeasures.pmi)

    :param trainer: `train` method of a classifier.
    :param n_instances: the number of total tweets that have to be used for
        training and testing. Tweets will be equally split between positive and
        negative.
    :param output: the output file where results have to be reported.
    """
    from nltk.tokenize import TweetTokenizer
    from nltk.sentiment import SentimentAnalyzer
    from nltk.corpus import twitter_samples, stopwords

    # Different customizations for the TweetTokenizer
    tokenizer = TweetTokenizer(preserve_case=False)
    # tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True)
    # tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True)

    if n_instances is not None:
        n_instances = int(n_instances / 2)

    fields = ['id', 'text']
    positive_json = twitter_samples.abspath("positive_tweets.json")
    positive_csv = 'positive_tweets.csv'
    json2csv_preprocess(positive_json, positive_csv, fields, limit=n_instances)

    negative_json = twitter_samples.abspath("negative_tweets.json")
    negative_csv = 'negative_tweets.csv'
    json2csv_preprocess(negative_json, negative_csv, fields, limit=n_instances)

    neg_docs = parse_tweets_set(negative_csv,
                                label='neg',
                                word_tokenizer=tokenizer)
    pos_docs = parse_tweets_set(positive_csv,
                                label='pos',
                                word_tokenizer=tokenizer)

    # We split positive and negative instances separately to keep a balanced,
    # uniform class distribution in both the train and test sets.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_tweets = train_pos_docs + train_neg_docs
    testing_tweets = test_pos_docs + test_neg_docs

    sentim_analyzer = SentimentAnalyzer()
    # stopwords = stopwords.words('english')
    # all_words = [word for word in sentim_analyzer.all_words(training_tweets) if word.lower() not in stopwords]
    all_words = [word for word in sentim_analyzer.all_words(training_tweets)]

    # Add simple unigram word features
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words, top_n=1000)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats,
                                       unigrams=unigram_feats)

    # Add bigram collocation features
    bigram_collocs_feats = sentim_analyzer.bigram_collocation_feats(
        [tweet[0] for tweet in training_tweets], top_n=100, min_freq=12)
    sentim_analyzer.add_feat_extractor(extract_bigram_feats,
                                       bigrams=bigram_collocs_feats)

    training_set = sentim_analyzer.apply_features(training_tweets)
    test_set = sentim_analyzer.apply_features(testing_tweets)

    classifier = sentim_analyzer.train(trainer, training_set)
    # classifier = sentim_analyzer.train(trainer, training_set, max_iter=4)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print(
            'Your classifier does not provide a show_most_informative_features() method.'
        )
    results = sentim_analyzer.evaluate(test_set)

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(output,
                        Dataset='labeled_tweets',
                        Classifier=type(classifier).__name__,
                        Tokenizer=tokenizer.__class__.__name__,
                        Feats=extr,
                        Results=results,
                        Instances=n_instances)
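A typical call to demo_tweets, sketched from the docstring above; the trainer, instance count, and output file name are illustrative choices, not part of the example:

from nltk.classify import NaiveBayesClassifier

# train on 1000 positive + 1000 negative tweets and write a markdown report
demo_tweets(NaiveBayesClassifier.train, n_instances=2000, output='tweets_results.md')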
Example #4
# Fragment from a larger script: `result` (the list of labeled, tokenized tweets) and
# `trainer` (e.g. NaiveBayesClassifier.train) are defined earlier and are not shown here.
from nltk.corpus import stopwords
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import (split_train_test, extract_unigram_feats,
                                 extract_bigram_feats)

training_tweets, testing_tweets = split_train_test(result)
# x_train, x_test, y_train, y_test = train_test_split(result['tweet'], result['senti'], test_size=0.20, random_state=0)

sentim_analyzer = SentimentAnalyzer()

stopwords = stopwords.words('english')
all_words = [word for word in sentim_analyzer.all_words(training_tweets) if word.lower() not in stopwords]

print(all_words)

# Add simple unigram word features
unigram_feats = sentim_analyzer.unigram_word_feats(all_words, top_n=1000)
sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

# Add bigram collocation features
bigram_collocs_feats = sentim_analyzer.bigram_collocation_feats(
    [tweet[0] for tweet in training_tweets], top_n=100, min_freq=12)
sentim_analyzer.add_feat_extractor(extract_bigram_feats, bigrams=bigram_collocs_feats)

training_set = sentim_analyzer.apply_features(training_tweets)
test_set = sentim_analyzer.apply_features(testing_tweets)

classifier = sentim_analyzer.train(trainer, training_set)
# classifier = sentim_analyzer.train(trainer, training_set, max_iter=4)
try:
    classifier.show_most_informative_features()
except AttributeError:
    print('Your classifier does not provide a show_most_informative_features() method.')
results = sentim_analyzer.evaluate(test_set)
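The fragment stops after evaluate(); printing the collected metrics could look like this (a small illustrative addition, not part of the original script):

for key, value in sorted(results.items()):
    print('{0}: {1}'.format(key, value))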


Example #5
def demo_tweets(trainer, n_instances=None, output=None):
    """
    Train and test a Naive Bayes classifier on 10000 tweets, tokenized using
    TweetTokenizer.
    Features are composed of:
        - 1000 most frequent unigrams
        - 100 top bigrams (using BigramAssocMeasures.pmi)

    :param trainer: `train` method of a classifier.
    :param n_instances: the number of total tweets that have to be used for
        training and testing. Tweets will be equally split between positive and
        negative.
    :param output: the output file where results have to be reported.
    """
    from nltk.tokenize import TweetTokenizer
    from nltk.sentiment import SentimentAnalyzer
    from nltk.corpus import twitter_samples, stopwords

    # Different customizations for the TweetTokenizer
    tokenizer = TweetTokenizer(preserve_case=False)
    # tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True)
    # tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True)

    if n_instances is not None:
        n_instances = int(n_instances/2)

    fields = ['id', 'text']
    positive_json = twitter_samples.abspath("positive_tweets.json")
    positive_csv = 'positive_tweets.csv'
    json2csv_preprocess(positive_json, positive_csv, fields, limit=n_instances)

    negative_json = twitter_samples.abspath("negative_tweets.json")
    negative_csv = 'negative_tweets.csv'
    json2csv_preprocess(negative_json, negative_csv, fields, limit=n_instances)

    neg_docs = parse_tweets_set(negative_csv, label='neg', word_tokenizer=tokenizer)
    pos_docs = parse_tweets_set(positive_csv, label='pos', word_tokenizer=tokenizer)

    # We split positive and negative instances separately to keep a balanced,
    # uniform class distribution in both the train and test sets.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_tweets = train_pos_docs+train_neg_docs
    testing_tweets = test_pos_docs+test_neg_docs

    sentim_analyzer = SentimentAnalyzer()
    # stopwords = stopwords.words('english')
    # all_words = [word for word in sentim_analyzer.all_words(training_tweets) if word.lower() not in stopwords]
    all_words = [word for word in sentim_analyzer.all_words(training_tweets)]

    # Add simple unigram word features
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words, top_n=1000)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Add bigram collocation features
    bigram_collocs_feats = sentim_analyzer.bigram_collocation_feats([tweet[0] for tweet in training_tweets],
        top_n=100, min_freq=12)
    sentim_analyzer.add_feat_extractor(extract_bigram_feats, bigrams=bigram_collocs_feats)

    training_set = sentim_analyzer.apply_features(training_tweets)
    test_set = sentim_analyzer.apply_features(testing_tweets)

    classifier = sentim_analyzer.train(trainer, training_set)
    # classifier = sentim_analyzer.train(trainer, training_set, max_iter=4)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print('Your classifier does not provide a show_most_informative_features() method.')
    results = sentim_analyzer.evaluate(test_set)

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(output, Dataset='labeled_tweets', Classifier=type(classifier).__name__,
                        Tokenizer=tokenizer.__class__.__name__, Feats=extr,
                        Results=results, Instances=n_instances)
Example #6
from random import shuffle

from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import extract_unigram_feats, extract_bigram_feats

# Project-local helpers (get_train_test, delete_stop_words, count_tags,
# tokenize_set, get_y) are defined elsewhere in the same repository.


def train_classifier(classifier, num_of_tweets, gram, lang, lemmas):
    sentim_analyzer = SentimentAnalyzer()

    print("num_of_tweets, gram, lang, lemmas_bool:")
    print(num_of_tweets, gram, lang, lemmas)

    training = []
    testing = []
    if lang == "rus":
        training = get_train_test("train.csv")
        testing = get_train_test("test.csv")
    if lang == "ger":
        training = get_train_test("train_de.csv")
        testing = get_train_test("test_de.csv")

    data = training + testing

    def removeStopWords(item):
        item[0] = delete_stop_words(lang, item[0])
        return item

    data_neg = []
    data_pos = []
    for i in data:
        if i[1] == 'neg':
            data_neg.append(i)
        if i[1] == 'pos':
            data_pos.append(i)

    data_even = []
    for i in range(len(data_neg)):
        data_even.append(data_neg[i])
        data_even.append(data_pos[i])

    training_data = data_even[:num_of_tweets]

    dict_1 = {
        "Accuracy": 0,
        "Precision [pos]": 0,
        "Recall [pos]": 0,
        "F-measure [pos]": 0,
        "Precision [neg]": 0,
        "Recall [neg]": 0,
        "F-measure [neg]": 0,
    }
    vocab = 0
    unigram = 0
    bigram = 0

    for i in range(5):

        # hold out the i-th fifth as the test fold and train on the remaining four fifths
        fold_size = len(training_data) // 5
        test = training_data[fold_size * i:fold_size * (i + 1)]
        train = training_data[:fold_size * i] + training_data[fold_size * (i + 1):]

        train = list(map(removeStopWords, train))
        test = list(map(removeStopWords, test))

        # print(train)

        shuffle(train)
        shuffle(test)

        print("len(train+test):")
        print(len(train) + len(test))

        print("train: pos, neg:")
        print(count_tags(train))

        print("test: pos, neg:")
        print(count_tags(test))

        vocabulary = sentim_analyzer.all_words(tokenize_set(train, lemmas, lang))
        print("vocab len:")
        print(len(vocabulary))
        vocab += len(vocabulary)
        # print("vocabulary[0]:")
        # print(vocabulary[0])

        if gram == "unigram":
            unigram_features = sentim_analyzer.unigram_word_feats(vocabulary)
            print("unigram feats len:")
            print(len(unigram_features))
            unigram += len(unigram_features)
            # print("unigram_features[0]:")
            # print(unigram_features[0])

            sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_features)
        if gram == "bigram":
            bigram_features = sentim_analyzer.bigram_collocation_feats(tokenize_set(train, lemmas, lang))

            print("bigram feats len:")
            print(len(bigram_features))
            bigram += len(bigram_features)
            # print("bigram_features[0]:")
            # print(bigram_features[0])
            # print("bigram_features[5]:")
            # print(bigram_features[5])
            sentim_analyzer.add_feat_extractor(extract_bigram_feats, bigrams=bigram_features)

        _train_X = sentim_analyzer.apply_features(tokenize_set(train, lemmas, lang), labeled=False)
        _train_Y = get_y(train)

        _test_X = sentim_analyzer.apply_features(tokenize_set(test, lemmas, lang), labeled=False)
        _test_Y = get_y(test)
        sentim_analyzer.train(classifier.train, list(zip(_train_X, _train_Y)))
        scores = sentim_analyzer.evaluate(list(zip(_test_X, _test_Y)))
        print(scores)
        # accumulate every metric across the five folds
        for key in dict_1:
            dict_1[key] += scores.get(key)

    print("Accuracy:")
    print(dict_1.get('Accuracy') / 5)
    print("Precision [pos]:")
    print(dict_1.get('Precision [pos]') / 5)
    print("Precision [neg]:")
    print(dict_1.get('Precision [neg]') / 5)
    print("F-measure [pos]:")
    print(dict_1.get('F-measure [pos]') / 5)
    print("F-measure [neg]:")
    print(dict_1.get('F-measure [neg]') / 5)
    print("Recall [pos]:")
    print(dict_1.get('Recall [pos]') / 5)
    print("Recall [neg]:")
    print(dict_1.get('Recall [neg]') / 5)
    print("vocab length: ")
    print(vocab / 5)
    if gram == "bigram":
        print("bigram features:")
        print(bigram / 5)
    if gram == "unigram":
        print("unigram features:")
        print(unigram / 5)
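A possible invocation of train_classifier, assuming the train/test CSV files referenced above are available; the parameter values below are illustrative guesses, not values from the original code:

from nltk.classify import NaiveBayesClassifier

# five evaluation rounds on 2000 balanced German tweets with unigram features and lemmatization
train_classifier(NaiveBayesClassifier, num_of_tweets=2000, gram="unigram", lang="ger", lemmas=True)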