Example #1
def getTrainingAndTestData(tweets, K, k, method, feature_set):

    add_ngram_feat = feature_set.get('ngram', 1)
    add_negtn_feat = feature_set.get('negtn', False)

    from functools import wraps
    import nltk, re, sys  # used below; in the full project these are module-level imports
    import preprocessing

    procTweets = [ (preprocessing.processAll(text, subject=subj, query=quer), sent)    \
                        for (text, sent, subj, quer) in tweets]

    stemmer = nltk.stem.PorterStemmer()

    all_tweets = []  #DATADICT: all_tweets =   [ (words, sentiment), ... ]
    for (text, sentiment) in procTweets:
        words = [word if(word[0:2]=='__') else word.lower() \
                    for word in text.split() \
                    if len(word) >= 3]
        words = [stemmer.stem(w)
                 for w in words]  #DATADICT: words = [ 'word1', 'word2', ... ]
        all_tweets.append((words, sentiment))

    # train_tweets = all_tweets[:int(len(all_tweets)*ratio)]      #DATADICT: train_tweets = [ (words, sentiment), ... ]
    # test_tweets  = all_tweets[int(len(all_tweets)*ratio):]      #DATADICT: test_tweets  = [ (words, sentiment), ... ]
    train_tweets = [x for i, x in enumerate(all_tweets) if i % K != k]
    test_tweets = [x for i, x in enumerate(all_tweets) if i % K == k]

    unigrams_fd = nltk.FreqDist()
    if add_ngram_feat > 1:
        n_grams_fd = nltk.FreqDist()

    for (words, sentiment) in train_tweets:
        words_uni = words
        unigrams_fd.update(words)

        if add_ngram_feat >= 2:
            words_bi = [','.join(map(str, bg)) for bg in nltk.bigrams(words)]
            n_grams_fd.update(words_bi)

        if add_ngram_feat >= 3:
            words_tri = [','.join(map(str, tg)) for tg in nltk.trigrams(words)]
            n_grams_fd.update(words_tri)

    sys.stderr.write('\nlen( unigrams ) = ' + str(len(unigrams_fd.keys())))

    #unigrams_sorted = nltk.FreqDist(unigrams).keys()
    unigrams_sorted = unigrams_fd.keys()
    #bigrams_sorted = nltk.FreqDist(bigrams).keys()
    #trigrams_sorted = nltk.FreqDist(trigrams).keys()
    if add_ngram_feat > 1:
        sys.stderr.write('\nlen( n_grams ) = ' + str(len(n_grams_fd)))
        ngrams_sorted = [ng for (ng, v) in n_grams_fd.items() if v > 1]  # 'ng' avoids rebinding the fold index k
        sys.stderr.write('\nlen( ngrams_sorted ) = ' + str(len(ngrams_sorted)))

    def get_word_features(words):
        bag = {}
        words_uni = ['has(%s)' % ug for ug in words]

        if (add_ngram_feat >= 2):
            words_bi = [
                'has(%s)' % ','.join(map(str, bg))
                for bg in nltk.bigrams(words)
            ]
        else:
            words_bi = []

        if (add_ngram_feat >= 3):
            words_tri = [
                'has(%s)' % ','.join(map(str, tg))
                for tg in nltk.trigrams(words)
            ]
        else:
            words_tri = []

        for f in words_uni + words_bi + words_tri:
            bag[f] = 1

        #bag = collections.Counter(words_uni+words_bi+words_tri)
        return bag

    negtn_regex = re.compile(
        r"""(?:
        ^(?:never|no|nothing|nowhere|noone|none|not|
            havent|hasnt|hadnt|cant|couldnt|shouldnt|
            wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint
        )$
    )
    |
    n't
    """, re.X)

    def get_negation_features(words):
        INF = 0.0
        negtn = [bool(negtn_regex.search(w)) for w in words]

        left = [0.0] * len(words)
        prev = 0.0
        for i in range(0, len(words)):
            if (negtn[i]):
                prev = 1.0
            left[i] = prev
            prev = max(0.0, prev - 0.1)

        right = [0.0] * len(words)
        prev = 0.0
        for i in reversed(range(0, len(words))):
            if (negtn[i]):
                prev = 1.0
            right[i] = prev
            prev = max(0.0, prev - 0.1)

        return dict(
            zip(['neg_l(' + w + ')'
                 for w in words] + ['neg_r(' + w + ')' for w in words],
                left + right))

    # count how many times a function is called
    # http://stackoverflow.com/questions/13512391/to-count-no-times-a-function-is-called
    def counter(func):
        @wraps(func)
        def tmp(*args, **kwargs):
            tmp.count += 1
            return func(*args, **kwargs)

        tmp.count = 0
        return tmp

    @counter  #http://stackoverflow.com/questions/13512391/to-count-no-times-a-function-is-called
    def extract_features(words):
        features = {}

        word_features = get_word_features(words)
        features.update(word_features)

        if add_negtn_feat:
            negation_features = get_negation_features(words)
            features.update(negation_features)

        sys.stderr.write('\rfeatures extracted for ' +
                         str(extract_features.count) + ' tweets')
        return features

    extract_features.count = 0

    if ('1step' == method):
        # Apply NLTK's Lazy Map
        v_train = nltk.classify.apply_features(extract_features, train_tweets)
        v_test = nltk.classify.apply_features(extract_features, test_tweets)
        return (v_train, v_test)

    elif ('2step' == method):
        isObj = lambda sent: sent in ['neg', 'pos']
        makeObj = lambda sent: 'obj' if isObj(sent) else sent

        train_tweets_obj = [(words, makeObj(sent))
                            for (words, sent) in train_tweets]
        test_tweets_obj = [(words, makeObj(sent))
                           for (words, sent) in test_tweets]

        train_tweets_sen = [(words, sent) for (words, sent) in train_tweets
                            if isObj(sent)]
        test_tweets_sen = [(words, sent) for (words, sent) in test_tweets
                           if isObj(sent)]

        v_train_obj = nltk.classify.apply_features(extract_features,
                                                   train_tweets_obj)
        v_train_sen = nltk.classify.apply_features(extract_features,
                                                   train_tweets_sen)
        v_test_obj = nltk.classify.apply_features(extract_features,
                                                  test_tweets_obj)
        v_test_sen = nltk.classify.apply_features(extract_features,
                                                  test_tweets_sen)

        test_truth = [sent for (words, sent) in test_tweets]

        return (v_train_obj, v_train_sen, v_test_obj, v_test_sen, test_truth)

    else:
        return nltk.classify.apply_features(extract_features, all_tweets)
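
A minimal usage sketch for one cross-validation fold, assuming `tweets` is the list of (text, sentiment, subject, query) tuples the preprocessing step expects; the fold parameters and the Naive Bayes classifier are illustrative choices, not part of the example above:

# hypothetical driver for a single fold
feature_set = {'ngram': 2, 'negtn': True}
(v_train, v_test) = getTrainingAndTestData(tweets, 10, 0, '1step', feature_set)
classifier = nltk.NaiveBayesClassifier.train(v_train)      # any NLTK classifier accepting (features, label) pairs works
accuracy = nltk.classify.accuracy(classifier, v_test)
sys.stderr.write('\nfold accuracy = %0.4f' % accuracy)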
Example #2
def preprocessingStats(tweets, fileprefix=''):

    if (len(fileprefix) > 0 and '_' != fileprefix[0]):
        directory = os.path.dirname(fileprefix)
        if not os.path.exists(directory):
            os.makedirs(directory)
        print 'writing to', fileprefix + '_stats.txt'
        realstdout = sys.stdout
        sys.stdout = open(fileprefix + '_stats.txt', 'w')

    ###########################################################################

    print 'for', len(tweets), 'tweets:'

    print '###########################################################################'

    printFeaturesStats(tweets)

    print '###########################################################################'

    printAllRecuctionStats(tweets)

    print '###########################################################################'

    procTweets = [ (preprocessing.processAll(text, subject=subj, query=quer), sent)    \
                        for (text, sent, subj, quer) in tweets]
    tweetsArr = []
    for (text, sentiment) in procTweets:
        words = [word if(word[0:2]=='__') else word.lower() \
                        for word in text.split() \
                        if ( (len(word) >= 3) ) ]
        tweetsArr.append([words, sentiment])
    unigrams_fd = nltk.FreqDist()
    bigrams_fd = nltk.FreqDist()
    trigrams_fd = nltk.FreqDist()
    for (words, sentiment) in tweetsArr:
        words_bi = [','.join(map(str, bg)) for bg in nltk.bigrams(words)]
        words_tri = [','.join(map(str, tg)) for tg in nltk.trigrams(words)]
        unigrams_fd.update(words)
        bigrams_fd.update(words_bi)
        trigrams_fd.update(words_tri)

    print 'Unigrams Distribution'
    printFreqDistCSV(unigrams_fd, filename=fileprefix + '_1grams.csv')
    if (len(fileprefix) > 0 and '_' != fileprefix[0]):
        pylab.show = lambda: pylab.savefig(fileprefix + '_1grams.pdf')
    unigrams_fd.plot(50, cumulative=True)
    pylab.close()

    print 'Bigrams Distribution'
    printFreqDistCSV(bigrams_fd, filename=fileprefix + '_2grams.csv')
    if (len(fileprefix) > 0 and '_' != fileprefix[0]):
        pylab.show = lambda: pylab.savefig(fileprefix + '_2grams.pdf')
    bigrams_fd.plot(50, cumulative=True)
    pylab.close()

    print 'Trigrams Distribution'
    printFreqDistCSV(trigrams_fd, filename=fileprefix + '_3grams.csv')
    if (len(fileprefix) > 0 and '_' != fileprefix[0]):
        pylab.show = lambda: pylab.savefig(fileprefix + '_3grams.pdf')
    trigrams_fd.plot(50, cumulative=True)
    pylab.close()

    if (len(fileprefix) > 0 and '_' != fileprefix[0]):
        pylab.show = lambda: pylab.savefig(fileprefix + '_ngrams.pdf')
    unigrams_fd.plot(50, cumulative=True)
    bigrams_fd.plot(50, cumulative=True)
    trigrams_fd.plot(50, cumulative=True)
    pylab.close()

    if (len(fileprefix) > 0 and '_' != fileprefix[0]):
        sys.stdout.close()
        sys.stdout = realstdout
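
The `pylab.show = lambda: pylab.savefig(...)` trick works because, in the NLTK versions this code targets, `FreqDist.plot()` calls `pylab.show()` internally; overriding it redirects the figure to a file instead of opening a window. A small self-contained sketch of the same idea (file name is illustrative):

import nltk, pylab

fd = nltk.FreqDist('the quick brown fox jumps over the lazy dog the fox'.split())
_real_show = pylab.show                          # keep the original so it can be restored
pylab.show = lambda: pylab.savefig('freqdist_demo.pdf')
fd.plot(10, cumulative=True)                     # plot() calls pylab.show(), which now saves instead
pylab.show = _real_show
pylab.close()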
Example #3
import preprocessing as p  # loading the preprocessing module
import heuristics as h
# heuristics bundles the corpora used in preprocessing: emoticons, sentiwords,
# the spell-checker and the acronym dictionary
import csv

with open('finaldataset.csv', 'rb') as inp:
    all = []
    reader = csv.reader(inp)
    cnt = 0
    for row in reader:
        mod = []
        mod.append(cnt)
        cnt += 1
        mod.append(row[2])
        psen = p.processAll(row[1])  # processAll, from the preprocessing module,
        # cleans and normalizes the raw tweet text
        for item in psen.split():
            if item.lower() in h.NEGATE:
                psen = psen.replace(item, '__NEG')
        mod.append(psen)
        all.append(mod)
print len(all)

# write the processed data to "data.csv"
with open('data.csv', 'wb') as op:
    writer = csv.writer(op, lineterminator='\n')
    writer.writerows(all)
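
To illustrate the tagging loop: assuming `h.NEGATE` is a collection of negation words (the real list lives in the heuristics module and is not shown here), each matching token is rewritten to the placeholder `__NEG`:

# hypothetical stand-in for h.NEGATE, only to show the replacement behaviour
NEGATE = set(['not', 'never', 'no'])

psen = 'I would never buy this again, it is not worth it'
for item in psen.split():
    if item.lower() in NEGATE:
        psen = psen.replace(item, '__NEG')
print psen  # I would __NEG buy this again, it is __NEG worth it
# caveat: str.replace also rewrites the word when it appears inside a longer token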
Example #4
def preprocessingStats( tweets, fileprefix='' ):

    if( len(fileprefix)>0 and '_'!=fileprefix[0] ):
        directory = os.path.dirname(fileprefix)
        if not os.path.exists(directory):
            os.makedirs(directory)
        print 'writing to', fileprefix+'_stats.txt'
        realstdout = sys.stdout
        sys.stdout = open( fileprefix+'_stats.txt' , 'w')

    ###########################################################################  

    print 'for', len(tweets), 'tweets:'

    print '###########################################################################'

    printFeaturesStats( tweets )

    print '###########################################################################'

    printAllRecuctionStats( tweets )

    print '###########################################################################'

    procTweets = [ (preprocessing.processAll(text, subject=subj, query=quer), sent)    \
                        for (text, sent, subj, quer) in tweets]
    tweetsArr = []
    for (text, sentiment) in procTweets:
        words = [word if(word[0:2]=='__') else word.lower() \
                        for word in text.split() \
                        if ( (len(word) >= 3) ) ]
        tweetsArr.append([words, sentiment])
    unigrams_fd = nltk.FreqDist()
    bigrams_fd = nltk.FreqDist()
    trigrams_fd = nltk.FreqDist()
    for (words, sentiment) in tweetsArr:
        words_bi = [ ','.join(map(str,bg)) for bg in nltk.bigrams(words) ]
        words_tri  = [ ','.join(map(str,tg)) for tg in nltk.trigrams(words) ]
        unigrams_fd.update( words )
        bigrams_fd.update( words_bi )
        trigrams_fd.update( words_tri )

    print 'Unigrams Distribution'
    printFreqDistCSV(unigrams_fd, filename=fileprefix+'_1grams.csv')
    if( len(fileprefix)>0 and '_'!=fileprefix[0] ):
        pylab.show = lambda : pylab.savefig(fileprefix+'_1grams.pdf')
    unigrams_fd.plot(50, cumulative=True)
    pylab.close()

    print 'Bigrams Distribution'
    printFreqDistCSV(bigrams_fd, filename=fileprefix+'_2grams.csv')
    if( len(fileprefix)>0 and '_'!=fileprefix[0] ):
        pylab.show = lambda : pylab.savefig(fileprefix+'_2grams.pdf')
    bigrams_fd.plot(50, cumulative=True)
    pylab.close()

    print 'Trigrams Distribution'
    printFreqDistCSV(trigrams_fd, filename=fileprefix+'_3grams.csv')
    if( len(fileprefix)>0 and '_'!=fileprefix[0] ):
        pylab.show = lambda : pylab.savefig(fileprefix+'_3grams.pdf')
    trigrams_fd.plot(50, cumulative=True)
    pylab.close()

    if( len(fileprefix)>0 and '_'!=fileprefix[0] ):
        pylab.show = lambda : pylab.savefig(fileprefix+'_ngrams.pdf')
    unigrams_fd.plot(50, cumulative=True)
    bigrams_fd.plot(50, cumulative=True)
    trigrams_fd.plot(50, cumulative=True)
    pylab.close()    

    if( len(fileprefix)>0 and '_'!=fileprefix[0] ):
        sys.stdout.close()
        sys.stdout = realstdout
Example #5
def getTrainingAndTestData(tweets, K, k, method, feature_set):

    add_ngram_feat = feature_set.get('ngram', 1)
    add_negtn_feat = feature_set.get('negtn', False)

    procTweets = [ (preprocessing.processAll(text, subject=subj, query=quer), sent)    \
                        for (text, sent, subj, quer) in tweets]
    # see http://www.nltk.org/howto/stem.html for the Porter stemmer
    stemmer = nltk.stem.PorterStemmer()

    all_tweets = []  #DATADICT: all_tweets =   [ (words, sentiment), ... ]
    for (text, sentiment) in procTweets:
        words = [
            word if (word[0:2] == '__') else word.lower()
            for word in text.split() if len(word) >= 3
        ]
        words = [stemmer.stem(w)
                 for w in words]  #DATADICT: words = [ 'word1', 'word2', ... ]
        all_tweets.append((words, sentiment))

    train_tweets = [x for i, x in enumerate(all_tweets) if i % K != k]
    test_tweets = [x for i, x in enumerate(all_tweets) if i % K == k]

    unigrams_fd = nltk.FreqDist()
    if add_ngram_feat > 1:
        n_grams_fd = nltk.FreqDist()

    for (words, sentiment) in train_tweets:
        words_uni = words
        unigrams_fd.update(words)

        if add_ngram_feat >= 2:
            words_bi = [','.join(map(str, bg)) for bg in nltk.bigrams(words)]
            n_grams_fd.update(words_bi)

        if add_ngram_feat >= 3:
            words_tri = [','.join(map(str, tg)) for tg in nltk.trigrams(words)]
            n_grams_fd.update(words_tri)

    sys.stderr.write('\nlen( unigrams ) = ' + str(len(unigrams_fd.keys())))

    #unigrams_sorted = nltk.FreqDist(unigrams).keys()
    p = []
    q = []
    for i, x in unigrams_fd.most_common(200):
        p.append(i)
        q.append(x)
    draw_result(p, q, "Words", "frequency", "Unigrams", "Unigrams_" + str(k))
    #bigrams_sorted = nltk.FreqDist(bigrams).keys()
    #trigrams_sorted = nltk.FreqDist(trigrams).keys()

    if add_ngram_feat > 1:
        sys.stderr.write('\nlen( n_grams ) = ' + str(len(n_grams_fd)))
        #p.setText('\nlen( n_grams ) = '+str(len( n_grams_fd ))+'\n')
        ngrams_sorted = [ng for (ng, v) in n_grams_fd.items() if v > 1]  # 'ng', not 'k': a leaked comprehension variable would clobber the fold index k used below
        p1 = []
        q1 = []
        for i, x in n_grams_fd.most_common(200):
            p1.append(i)
            q1.append(x)
        sys.stderr.write('\nlen( ngrams_sorted ) = ' + str(len(ngrams_sorted)))
        draw_result(p1, q1, "Words", "frequency", "bigrams",
                    "bigrams_" + str(k))
        #p.setText( '\nlen( ngrams_sorted ) = '+str(len( ngrams_sorted ))+'\n')

    ####################################################################################
    def get_word_features(words):
        bag = {}
        words_uni = ['has(%s)' % ug for ug in words]

        if (add_ngram_feat >= 2):
            words_bi = [
                'has(%s)' % ','.join(map(str, bg))
                for bg in nltk.bigrams(words)
            ]
        else:
            words_bi = []

        if (add_ngram_feat >= 3):
            words_tri = [
                'has(%s)' % ','.join(map(str, tg))
                for tg in nltk.trigrams(words)
            ]
        else:
            words_tri = []

        for f in words_uni + words_bi + words_tri:
            bag[f] = 1

        #bag = collections.Counter(words_uni+words_bi+words_tri)
        return bag

    #https://docs.python.org/3/library/re.html#re.X
    negtn_regex = re.compile(
        r"""(?:
        ^(?:never|no|nothing|nowhere|noone|none|not|
            havent|hasnt|hadnt|cant|couldnt|shouldnt|
            wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint
        )$
    )
    |
    n't
    """, re.X)

    def get_negation_features(words):
        INF = 0.0
        negtn = [bool(negtn_regex.search(w)) for w in words]

        left = [0.0] * len(words)
        prev = 0.0
        for i in range(0, len(words)):
            if (negtn[i]):
                prev = 1.0
            left[i] = prev
            prev = max(0.0, prev - 0.1)

        right = [0.0] * len(words)
        prev = 0.0
        for i in reversed(range(0, len(words))):
            if (negtn[i]):
                prev = 1.0
            right[i] = prev
            prev = max(0.0, prev - 0.1)

        return dict(
            zip(['neg_l(' + w + ')'
                 for w in words] + ['neg_r(' + w + ')' for w in words],
                left + right))
# e.g. for words ["saicharan", "pavan"]: {'neg_l(saicharan)': 1.0, 'neg_l(pavan)': 0.0, ...} plus the matching neg_r(...) entries

#http://stackoverflow.com/questions/13512391/to-count-no-times-a-function-is-called

    def counter(func):
        @wraps(func)
        def tmp(*args, **kwargs):
            tmp.count += 1
            return func(*args, **kwargs)

        tmp.count = 0
        return tmp

    #http://stackoverflow.com/questions/13512391/to-count-no-times-a-function-is-called
    @counter
    def extract_features(words):
        features = {}
        # building a dict of features also removes duplicates (keys are unique)
        word_features = get_word_features(words)
        features.update(word_features)
        if add_negtn_feat:
            negation_features = get_negation_features(words)
            features.update(negation_features)

        sys.stderr.write('\rfeatures extracted for ' +
                         str(extract_features.count) + ' tweets')
        #p.setText( '\rfeatures extracted for ' + str(extract_features.count) + ' tweets'+'\n')
        return features

    extract_features.count = 0
    ####################################################################################
    if ('1step' == method):
        # Apply NLTK's Lazy Map
        v_train = nltk.classify.apply_features(extract_features, train_tweets)
        v_test = nltk.classify.apply_features(extract_features, test_tweets)
        return (v_train, v_test)

    elif ('2step' == method):
        isObj = lambda sent: sent in ['neg', 'pos']
        makeObj = lambda sent: 'obj' if isObj(sent) else sent

        train_tweets_obj = [(words, makeObj(sent))
                            for (words, sent) in train_tweets]
        test_tweets_obj = [(words, makeObj(sent))
                           for (words, sent) in test_tweets]

        train_tweets_sen = [(words, sent) for (words, sent) in train_tweets
                            if isObj(sent)]
        test_tweets_sen = [(words, sent) for (words, sent) in test_tweets
                           if isObj(sent)]

        v_train_obj = nltk.classify.apply_features(extract_features,
                                                   train_tweets_obj)
        v_train_sen = nltk.classify.apply_features(extract_features,
                                                   train_tweets_sen)
        v_test_obj = nltk.classify.apply_features(extract_features,
                                                  test_tweets_obj)
        v_test_sen = nltk.classify.apply_features(extract_features,
                                                  test_tweets_sen)

        test_truth = [sent for (words, sent) in test_tweets]

        return (v_train_obj, v_train_sen, v_test_obj, v_test_sen, test_truth)

    else:
        return nltk.classify.apply_features(extract_features, all_tweets)
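
`draw_result` is defined elsewhere in this project; a plausible sketch with the same call signature (x values, y values, axis labels, title, output name), assuming matplotlib, might look like this:

import matplotlib
matplotlib.use('Agg')                      # render to files, no display needed
import matplotlib.pyplot as plt

def draw_result(x, y, xlabel, ylabel, title, filename):
    """Bar-plot the top-N token frequencies and save the figure."""
    plt.figure(figsize=(12, 4))
    plt.bar(range(len(x)), y)
    plt.xticks(range(len(x)), x, rotation=90, fontsize=4)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.tight_layout()
    plt.savefig(filename + '.png')
    plt.close()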
Example #6
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
import preprocessing as p
import heuristics as h
import csv, sys, re, nltk  # re is needed by the negation regex below

with open('test/testdataset.csv', 'rb') as inp:
    all = []
    reader = csv.reader(inp)
    cnt = 0
    for row in reader:
        mod = []
        mod.append(cnt)
        cnt += 1
        #mod.append(row[2])
        psen = p.processAll(row[1])
        for item in psen.split():
            if item.lower() in h.NEGATE:
                psen = psen.replace(item, '__NEG')
        mod.append(psen)
        all.append(mod)
print len(all)
#print all[:5]
with open('test/data.csv', 'wb') as op:
    writer = csv.writer(op, lineterminator='\n')
    writer.writerows(all)
#with open('data.csv', 'rb') as ip:
#    reader=csv.reader(ip)
#    for item in reader:
#        print item
#sys.exit()
def getClassifyData(tweets):
    add_ngram_feat = 2
    add_negtn_feat = 1

    from functools import wraps
    import preprocessing
    procTweets=[]
    for tweet in tweets:
        procTweet=preprocessing.processAll(tweet, subject="", query="")
        procTweets.append(procTweet)

    stemmer = nltk.stem.PorterStemmer()

    all_tweets_pre = []  # DATADICT: all_tweets =   [ (words, sentiment), ... ]
    for text  in procTweets:
        words = [word if (word[0:2] == '__') else word.lower() \
                 for word in text.split() \
                 if len(word) >= 3]
        words = [stemmer.stem(w) for w in words]  # DATADICT: words = [ 'word1', 'word2', ... ]
        all_tweets_pre.append(words)

    unigrams_fd = nltk.FreqDist()
    if add_ngram_feat > 1:
        n_grams_fd = nltk.FreqDist()

    for (words) in all_tweets_pre:
        words_uni = words
        unigrams_fd.update(words)

        if add_ngram_feat >= 2:
            words_bi = [','.join(map(str, bg)) for bg in nltk.bigrams(words)]
            n_grams_fd.update(words_bi)

        if add_ngram_feat >= 3:
            words_tri = [','.join(map(str, tg)) for tg in nltk.trigrams(words)]
            n_grams_fd.update(words_tri)


    if add_ngram_feat > 1:
        sys.stderr.write('\nlen( n_grams ) = ' + str(len(n_grams_fd)))
        ngrams_sorted = [k for (k, v) in n_grams_fd.items() if v > 1]
        sys.stderr.write('\nlen( ngrams_sorted ) = ' + str(len(ngrams_sorted)))

    def get_word_features(words):
        bag = {}
        words_uni = ['has(%s)' % ug for ug in words]

        if (add_ngram_feat >= 2):
            words_bi = ['has(%s)' % ','.join(map(str, bg)) for bg in nltk.bigrams(words)]
        else:
            words_bi = []

        if (add_ngram_feat >= 3):
            words_tri = ['has(%s)' % ','.join(map(str, tg)) for tg in nltk.trigrams(words)]
        else:
            words_tri = []

        for f in words_uni + words_bi + words_tri:
            bag[f] = 1

        # bag = collections.Counter(words_uni+words_bi+words_tri)
        return bag

    negtn_regex = re.compile(r"""(?:
        ^(?:never|no|nothing|nowhere|noone|none|not|
            havent|hasnt|hadnt|cant|couldnt|shouldnt|
            wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint
        )$
    )
    |
    n't
    """, re.X)

    def get_negation_features(words):
        INF = 0.0
        negtn = [bool(negtn_regex.search(w)) for w in words]

        left = [0.0] * len(words)
        prev = 0.0
        for i in range(0, len(words)):
            if (negtn[i]):
                prev = 1.0
            left[i] = prev
            prev = max(0.0, prev - 0.1)

        right = [0.0] * len(words)
        prev = 0.0
        for i in reversed(range(0, len(words))):
            if (negtn[i]):
                prev = 1.0
            right[i] = prev
            prev = max(0.0, prev - 0.1)

        return dict(zip(
            ['neg_l(' + w + ')' for w in words] + ['neg_r(' + w + ')' for w in words],
            left + right))

    def extract_features(words):
        features = {}

        word_features = get_word_features(words)
        features.update(word_features)

        if add_negtn_feat:
            negation_features = get_negation_features(words)
            features.update(negation_features)

        #sys.stderr.write('\rfeatures extracted for ' + str(extract_features.count) + ' tweets')
        return features

    extract_features.count = 0

    v_all=[]
    for tweet_pre in all_tweets_pre:
        v_all.append(extract_features(tweet_pre))
    return (v_all)
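
A hedged usage sketch: the returned list of feature dicts can be scored by a classifier trained earlier on apply_features output (the `classifier` object below is assumed to come from such a run, e.g. nltk.NaiveBayesClassifier.train):

raw_tweets = ['I love this phone', 'worst battery life ever, do not buy']
feature_vectors = getClassifyData(raw_tweets)
labels = [classifier.classify(fv) for fv in feature_vectors]  # classifier trained elsewhere
for tweet, label in zip(raw_tweets, labels):
    print tweet, '->', label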
Example #8
def getTrainingAndTestData(tweets, K, k, method, feature_set):

    add_ngram_feat = feature_set.get('ngram', 1)
    add_negtn_feat = feature_set.get('negtn', False)


    from functools import wraps
    import nltk, re, sys  # used below; in the full project these are module-level imports
    import preprocessing

    procTweets = [ (preprocessing.processAll(text, subject=subj, query=quer), sent)    \
                        for (text, sent, subj, quer) in tweets]

    

    stemmer = nltk.stem.PorterStemmer()

    all_tweets = []                                             #DATADICT: all_tweets =   [ (words, sentiment), ... ]
    for (text, sentiment) in procTweets:
        words = [word if(word[0:2]=='__') else word.lower() \
                    for word in text.split() \
                    if len(word) >= 3]
        words = [stemmer.stem(w) for w in words]                #DATADICT: words = [ 'word1', 'word2', ... ]
        all_tweets.append((words, sentiment))

    # train_tweets = all_tweets[:int(len(all_tweets)*ratio)]      #DATADICT: train_tweets = [ (words, sentiment), ... ]
    # test_tweets  = all_tweets[int(len(all_tweets)*ratio):]      #DATADICT: test_tweets  = [ (words, sentiment), ... ]
    train_tweets = [x for i,x in enumerate(all_tweets) if i % K !=k]
    test_tweets  = [x for i,x in enumerate(all_tweets) if i % K ==k]

    unigrams_fd = nltk.FreqDist()
    if add_ngram_feat > 1 :
        n_grams_fd = nltk.FreqDist()

    for( words, sentiment ) in train_tweets:
        words_uni = words
        unigrams_fd.update(words)

        if add_ngram_feat>=2 :
            words_bi  = [ ','.join(map(str,bg)) for bg in nltk.bigrams(words) ]
            n_grams_fd.update( words_bi )

        if add_ngram_feat>=3 :
            words_tri  = [ ','.join(map(str,tg)) for tg in nltk.trigrams(words) ]
            n_grams_fd.update( words_tri )

    sys.stderr.write( '\nlen( unigrams ) = '+str(len( unigrams_fd.keys() )) )

    #unigrams_sorted = nltk.FreqDist(unigrams).keys()
    unigrams_sorted = unigrams_fd.keys()
    #bigrams_sorted = nltk.FreqDist(bigrams).keys()
    #trigrams_sorted = nltk.FreqDist(trigrams).keys()
    if add_ngram_feat > 1 :
        sys.stderr.write( '\nlen( n_grams ) = '+str(len( n_grams_fd )) )
        ngrams_sorted = [ ng for (ng,v) in n_grams_fd.items() if v>1]  # 'ng' avoids rebinding the fold index k
        sys.stderr.write( '\nlen( ngrams_sorted ) = '+str(len( ngrams_sorted )) )

    def get_word_features(words):
        bag = {}
        words_uni = [ 'has(%s)'% ug for ug in words ]

        if( add_ngram_feat>=2 ):
            words_bi  = [ 'has(%s)'% ','.join(map(str,bg)) for bg in nltk.bigrams(words) ]
        else:
            words_bi  = []

        if( add_ngram_feat>=3 ):
            words_tri = [ 'has(%s)'% ','.join(map(str,tg)) for tg in nltk.trigrams(words) ]
        else:
            words_tri = []

        for f in words_uni+words_bi+words_tri:
            bag[f] = 1

        #bag = collections.Counter(words_uni+words_bi+words_tri)
        return bag

    negtn_regex = re.compile( r"""(?:
        ^(?:never|no|nothing|nowhere|noone|none|not|
            havent|hasnt|hadnt|cant|couldnt|shouldnt|
            wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint
        )$
    )
    |
    n't
    """, re.X)

    def get_negation_features(words):
        INF = 0.0
        negtn = [ bool(negtn_regex.search(w)) for w in words ]
    
        left = [0.0] * len(words)
        prev = 0.0
        for i in range(0,len(words)):
            if( negtn[i] ):
                prev = 1.0
            left[i] = prev
            prev = max( 0.0, prev-0.1)
    
        right = [0.0] * len(words)
        prev = 0.0
        for i in reversed(range(0,len(words))):
            if( negtn[i] ):
                prev = 1.0
            right[i] = prev
            prev = max( 0.0, prev-0.1)
    
        return dict( zip(
                        ['neg_l('+w+')' for w in  words] + ['neg_r('+w+')' for w in  words],
                        left + right ) )
    
    def counter(func):  #http://stackoverflow.com/questions/13512391/to-count-no-times-a-function-is-called
        @wraps(func)
        def tmp(*args, **kwargs):
            tmp.count += 1
            return func(*args, **kwargs)
        tmp.count = 0
        return tmp

    @counter    #http://stackoverflow.com/questions/13512391/to-count-no-times-a-function-is-called
    def extract_features(words):
        features = {}

        word_features = get_word_features(words)
        features.update( word_features )

        if add_negtn_feat :
            negation_features = get_negation_features(words)
            features.update( negation_features )
 
        sys.stderr.write( '\rfeatures extracted for ' + str(extract_features.count) + ' tweets' )
        return features

    extract_features.count = 0

    
    if( '1step' == method ):
        # Apply NLTK's Lazy Map
        v_train = nltk.classify.apply_features(extract_features,train_tweets)
        v_test  = nltk.classify.apply_features(extract_features,test_tweets)
        return (v_train, v_test)

    elif( '2step' == method ):
        isObj   = lambda sent: sent in ['neg','pos']
        makeObj = lambda sent: 'obj' if isObj(sent) else sent
        
        train_tweets_obj = [ (words, makeObj(sent)) for (words, sent) in train_tweets ]
        test_tweets_obj  = [ (words, makeObj(sent)) for (words, sent) in test_tweets ]

        train_tweets_sen = [ (words, sent) for (words, sent) in train_tweets if isObj(sent) ]
        test_tweets_sen  = [ (words, sent) for (words, sent) in test_tweets if isObj(sent) ]

        v_train_obj = nltk.classify.apply_features(extract_features,train_tweets_obj)
        v_train_sen = nltk.classify.apply_features(extract_features,train_tweets_sen)
        v_test_obj  = nltk.classify.apply_features(extract_features,test_tweets_obj)
        v_test_sen  = nltk.classify.apply_features(extract_features,test_tweets_sen)

        test_truth = [ sent for (words, sent) in test_tweets ]

        return (v_train_obj,v_train_sen,v_test_obj,v_test_sen,test_truth)

    else:
        return nltk.classify.apply_features(extract_features,all_tweets)
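
With method='2step' the returned tuple feeds a two-stage pipeline: makeObj collapses 'pos'/'neg' into a single class, so the first classifier separates that class from the remaining labels, and the second decides 'pos' vs. 'neg' within it. A minimal sketch, assuming NLTK's NaiveBayesClassifier for both stages and illustrative fold parameters:

(v_train_obj, v_train_sen,
 v_test_obj, v_test_sen, test_truth) = getTrainingAndTestData(
        tweets, 10, 0, '2step', {'ngram': 1, 'negtn': False})

clf_obj = nltk.NaiveBayesClassifier.train(v_train_obj)   # stage 1: collapsed class vs. the rest
clf_sen = nltk.NaiveBayesClassifier.train(v_train_sen)   # stage 2: 'pos' vs. 'neg'

def predict(features):
    first = clf_obj.classify(features)
    return clf_sen.classify(features) if first == 'obj' else first

predicted = [predict(feats) for (feats, label) in v_test_obj]
accuracy = sum(p == t for p, t in zip(predicted, test_truth)) / float(len(test_truth))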
Example #9
def preprocessingStats(tweets, fileprefix=''):
    #print(tweets)
    #print("\n \n \n \n")

    if (len(fileprefix) > 0 and '_' != fileprefix[0]):
        directory = os.path.dirname(fileprefix)
        if not os.path.exists(directory):
            os.makedirs(directory)
        print('writing to', fileprefix + '_stats.txt')
        # redirect stdout so everything printed below goes into the stats file
        realstdout = sys.stdout
        sys.stdout = open(fileprefix + '_stats.txt', 'w')

    ###########################################################################

    print('for', len(tweets), 'tweets:')

    print(
        '###########################################################################'
    )

    printFeaturesStats(tweets)

    print(
        '###########################################################################'
    )

    printAllRecuctionStats(tweets)

    print(
        '###########################################################################'
    )


    procTweets = [ (preprocessing.processAll(text, subject=subj, query=quer), sent)    \
                        for (text, sent, subj, quer) in tweets]
    #print(procTweets)
    tweetsArr = []
    # drop words shorter than 3 characters and lowercase the rest (tokens starting with '__' keep their case)
    for (text, sentiment) in procTweets:
        words = [word if(word[0:2]=='__') else word.lower() \
                        for word in text.split() \
                        if ( (len(word) >= 3) ) ]
        tweetsArr.append([words, sentiment])
    #print(tweetsArr)

    unigrams_fd = nltk.FreqDist()
    bigrams_fd = nltk.FreqDist()
    trigrams_fd = nltk.FreqDist()
    #print(nltk.bigrams(words))
    for (words, sentiment) in tweetsArr:
        words_bi = [','.join(map(str, bg)) for bg in nltk.bigrams(words)]
        #print(words_bi)
        words_tri = [','.join(map(str, tg)) for tg in nltk.trigrams(words)]
        #print(words_tri)
        # see http://www.nltk.org/howto/probability.html for FreqDist.update
        unigrams_fd.update(words)
        bigrams_fd.update(words_bi)
        trigrams_fd.update(words_tri)

    print('Unigrams Distribution')
    printFreqDistCSV(unigrams_fd, filename=fileprefix + '_1grams.csv')
    if (len(fileprefix) > 0 and '_' != fileprefix[0]):
        pylab.show = lambda: pylab.savefig(fileprefix + '_1grams.pdf')
    unigrams_fd.plot(50, cumulative=True)
    pylab.close()

    print('Bigrams Distribution')
    printFreqDistCSV(bigrams_fd, filename=fileprefix + '_2grams.csv')
    if (len(fileprefix) > 0 and '_' != fileprefix[0]):
        pylab.show = lambda: pylab.savefig(fileprefix + '_2grams.pdf')
    bigrams_fd.plot(50, cumulative=True)
    pylab.close()

    print('Trigrams Distribution')
    printFreqDistCSV(trigrams_fd, filename=fileprefix + '_3grams.csv')
    if (len(fileprefix) > 0 and '_' != fileprefix[0]):
        pylab.show = lambda: pylab.savefig(fileprefix + '_3grams.pdf')
    trigrams_fd.plot(50, cumulative=True)
    pylab.close()

    if (len(fileprefix) > 0 and '_' != fileprefix[0]):
        sys.stdout.close()
        sys.stdout = realstdout
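
The manual stdout swap above works, but the file handle leaks if an exception is raised before it is restored. Since this example already uses print() as a function, contextlib.redirect_stdout (Python 3.4+) is a tidier equivalent; a small sketch with a hypothetical wrapper name:

import contextlib

def stats_to_file(tweets, fileprefix):
    # everything printed inside the with-block goes to the stats file,
    # and sys.stdout is restored automatically afterwards
    with open(fileprefix + '_stats.txt', 'w') as out:
        with contextlib.redirect_stdout(out):
            print('for', len(tweets), 'tweets:')
            # ... remaining statistics calls go here ...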
Example #10
df = pd.read_csv('tw_bought.csv', header=0)  # read tw_bought.csv, which contains 18000 tweets
#tweet_ids=list()
#tweet_ids2=list()
#tweet_ids3=list()
X = df.as_matrix(columns=df.columns[0:7])  # take the first seven columns as a matrix

for x in X.tolist():
    R=''.join(x[6]) # tweet column data
    id1=int(x[0]) # contain userid
    d1=''.join(x[1]) # date column data
    d1=d1+' '
    t1=''.join(x[2]) # time column data
    dt=d1+t1
    struct_Time = time.mktime(time.strptime(dt, '%Y-%m-%d %H:%M'))  # parse the combined date and time into a Unix timestamp
    #print (R)
    t= preprocessing.processAll(R) # remove all special characters from tweet and convert them into readable words
    print(t, ',', id1, ',', struct_Time)
    print(t,',',id1,',',struct_Time,file=corp_txt) # write to output file as well

    #corp_txt.write(pr_txt) # write to output file

df1 = pd.read_csv('tw_main.csv', header=0)  # read tw_main.csv
#tweet_ids=list()
#tweet_ids2=list()
#tweet_ids3=list()
X1 = df1.as_matrix(columns=df1.columns[0:3]) # read only 3 columns

for x1 in X1.tolist():
    R1=''.join(x1[1]) # contain tweets
    id2=int(x1[0]) # contain user id
    dt2 = ''.join(x1[2])  # combined date-and-time string
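
DataFrame.as_matrix was deprecated and later removed from pandas, so on current versions the same column slice is better taken with .to_numpy() (or .values). A small equivalent of the first read, assuming the same CSV layout:

import pandas as pd

df = pd.read_csv('tw_bought.csv', header=0)
X = df[df.columns[0:7]].to_numpy()   # same rows/columns as the old df.as_matrix(columns=df.columns[0:7])
rows = X.tolist()                    # iterate over these exactly as the original loop does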