# Feature extraction and K-fold split. (re, sys and nltk are used throughout this listing.)
import re
import sys

import nltk


def getTrainingAndTestData(tweets, K, k, method, feature_set):

    add_ngram_feat = feature_set.get('ngram', 1)
    add_negtn_feat = feature_set.get('negtn', False)

    from functools import wraps
    import preprocessing

    procTweets = [(preprocessing.processAll(text, subject=subj, query=quer), sent)
                  for (text, sent, subj, quer) in tweets]

    stemmer = nltk.stem.PorterStemmer()

    all_tweets = []  # DATADICT: all_tweets = [ (words, sentiment), ... ]
    for (text, sentiment) in procTweets:
        words = [word if (word[0:2] == '__') else word.lower()
                 for word in text.split()
                 if len(word) >= 3]
        words = [stemmer.stem(w) for w in words]  # DATADICT: words = [ 'word1', 'word2', ... ]
        all_tweets.append((words, sentiment))

    # train_tweets = all_tweets[:int(len(all_tweets)*ratio)]  # DATADICT: train_tweets = [ (words, sentiment), ... ]
    # test_tweets  = all_tweets[int(len(all_tweets)*ratio):]  # DATADICT: test_tweets  = [ (words, sentiment), ... ]
    # fold k of K: every K-th tweet goes to the test set, the rest to training
    train_tweets = [x for i, x in enumerate(all_tweets) if i % K != k]
    test_tweets = [x for i, x in enumerate(all_tweets) if i % K == k]

    unigrams_fd = nltk.FreqDist()
    if add_ngram_feat > 1:
        n_grams_fd = nltk.FreqDist()

    for (words, sentiment) in train_tweets:
        words_uni = words
        unigrams_fd.update(words)

        if add_ngram_feat >= 2:
            words_bi = [','.join(map(str, bg)) for bg in nltk.bigrams(words)]
            n_grams_fd.update(words_bi)

        if add_ngram_feat >= 3:
            words_tri = [','.join(map(str, tg)) for tg in nltk.trigrams(words)]
            n_grams_fd.update(words_tri)

    sys.stderr.write('\nlen( unigrams ) = ' + str(len(unigrams_fd.keys())))

    # unigrams_sorted = nltk.FreqDist(unigrams).keys()
    unigrams_sorted = unigrams_fd.keys()
    # bigrams_sorted = nltk.FreqDist(bigrams).keys()
    # trigrams_sorted = nltk.FreqDist(trigrams).keys()
    if add_ngram_feat > 1:
        sys.stderr.write('\nlen( n_grams ) = ' + str(len(n_grams_fd)))
        # keep only n-grams seen more than once
        ngrams_sorted = [k for (k, v) in n_grams_fd.items() if v > 1]
        sys.stderr.write('\nlen( ngrams_sorted ) = ' + str(len(ngrams_sorted)))

    def get_word_features(words):
        bag = {}
        words_uni = ['has(%s)' % ug for ug in words]

        if add_ngram_feat >= 2:
            words_bi = ['has(%s)' % ','.join(map(str, bg)) for bg in nltk.bigrams(words)]
        else:
            words_bi = []

        if add_ngram_feat >= 3:
            words_tri = ['has(%s)' % ','.join(map(str, tg)) for tg in nltk.trigrams(words)]
        else:
            words_tri = []

        for f in words_uni + words_bi + words_tri:
            bag[f] = 1
        # bag = collections.Counter(words_uni+words_bi+words_tri)
        return bag

    negtn_regex = re.compile(r"""(?:
        ^(?:never|no|nothing|nowhere|noone|none|not|
            havent|hasnt|hadnt|cant|couldnt|shouldnt|
            wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint
        )$
    )
    |
    n't""", re.X)

    def get_negation_features(words):
        INF = 0.0
        negtn = [bool(negtn_regex.search(w)) for w in words]

        # decaying "negation influence" moving left-to-right from each negation word
        left = [0.0] * len(words)
        prev = 0.0
        for i in range(0, len(words)):
            if negtn[i]:
                prev = 1.0
            left[i] = prev
            prev = max(0.0, prev - 0.1)

        # and the same influence moving right-to-left
        right = [0.0] * len(words)
        prev = 0.0
        for i in reversed(range(0, len(words))):
            if negtn[i]:
                prev = 1.0
            right[i] = prev
            prev = max(0.0, prev - 0.1)

        return dict(zip(['neg_l(' + w + ')' for w in words] +
                        ['neg_r(' + w + ')' for w in words],
                        left + right))

    def counter(func):  # http://stackoverflow.com/questions/13512391/to-count-no-times-a-function-is-called
        @wraps(func)
        def tmp(*args, **kwargs):
            tmp.count += 1
            return func(*args, **kwargs)
        tmp.count = 0
        return tmp

    @counter  # http://stackoverflow.com/questions/13512391/to-count-no-times-a-function-is-called
    def extract_features(words):
        features = {}

        word_features = get_word_features(words)
        features.update(word_features)

        if add_negtn_feat:
            negation_features = get_negation_features(words)
            features.update(negation_features)

        sys.stderr.write('\rfeatures extracted for ' + str(extract_features.count) + ' tweets')
        return features

    extract_features.count = 0

    if '1step' == method:
        # Apply NLTK's Lazy Map
        v_train = nltk.classify.apply_features(extract_features, train_tweets)
        v_test = nltk.classify.apply_features(extract_features, test_tweets)
        return (v_train, v_test)

    elif '2step' == method:
        isObj = lambda sent: sent in ['neg', 'pos']
        makeObj = lambda sent: 'obj' if isObj(sent) else sent

        train_tweets_obj = [(words, makeObj(sent)) for (words, sent) in train_tweets]
        test_tweets_obj = [(words, makeObj(sent)) for (words, sent) in test_tweets]

        train_tweets_sen = [(words, sent) for (words, sent) in train_tweets if isObj(sent)]
        test_tweets_sen = [(words, sent) for (words, sent) in test_tweets if isObj(sent)]

        v_train_obj = nltk.classify.apply_features(extract_features, train_tweets_obj)
        v_train_sen = nltk.classify.apply_features(extract_features, train_tweets_sen)
        v_test_obj = nltk.classify.apply_features(extract_features, test_tweets_obj)
        v_test_sen = nltk.classify.apply_features(extract_features, test_tweets_sen)

        test_truth = [sent for (words, sent) in test_tweets]

        return (v_train_obj, v_train_sen, v_test_obj, v_test_sen, test_truth)

    else:
        return nltk.classify.apply_features(extract_features, all_tweets)
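A minimal driver sketch for the function above, assuming the '1step' method and an NLTK Naive Bayes classifier; the helper name crossValidate and the example feature_set values are illustrative, not part of the original module.

def crossValidate(tweets, K=5, feature_set={'ngram': 2, 'negtn': True}):
    # Illustrative only: run K-fold cross-validation over getTrainingAndTestData.
    accuracies = []
    for k in range(K):
        # fold k: tweets with index % K == k form the test set
        (v_train, v_test) = getTrainingAndTestData(tweets, K, k, '1step', feature_set)
        classifier = nltk.classify.NaiveBayesClassifier.train(v_train)
        accuracies.append(nltk.classify.accuracy(classifier, v_test))
    return sum(accuracies) / len(accuracies)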
# N-gram distribution statistics. printFeaturesStats, printAllRecuctionStats and
# printFreqDistCSV are defined elsewhere in the same module.
import os
import sys

import nltk
import pylab

import preprocessing


def preprocessingStats(tweets, fileprefix=''):

    if len(fileprefix) > 0 and '_' != fileprefix[0]:
        directory = os.path.dirname(fileprefix)
        if not os.path.exists(directory):
            os.makedirs(directory)
        print 'writing to', fileprefix + '_stats.txt'
        realstdout = sys.stdout
        sys.stdout = open(fileprefix + '_stats.txt', 'w')

    ###########################################################################

    print 'for', len(tweets), 'tweets:'

    print '###########################################################################'
    printFeaturesStats(tweets)

    print '###########################################################################'
    printAllRecuctionStats(tweets)

    print '###########################################################################'

    procTweets = [(preprocessing.processAll(text, subject=subj, query=quer), sent)
                  for (text, sent, subj, quer) in tweets]

    tweetsArr = []
    for (text, sentiment) in procTweets:
        words = [word if (word[0:2] == '__') else word.lower()
                 for word in text.split()
                 if len(word) >= 3]
        tweetsArr.append([words, sentiment])

    unigrams_fd = nltk.FreqDist()
    bigrams_fd = nltk.FreqDist()
    trigrams_fd = nltk.FreqDist()

    for (words, sentiment) in tweetsArr:
        words_bi = [','.join(map(str, bg)) for bg in nltk.bigrams(words)]
        words_tri = [','.join(map(str, tg)) for tg in nltk.trigrams(words)]

        unigrams_fd.update(words)
        bigrams_fd.update(words_bi)
        trigrams_fd.update(words_tri)

    print 'Unigrams Distribution'
    printFreqDistCSV(unigrams_fd, filename=fileprefix + '_1grams.csv')
    if len(fileprefix) > 0 and '_' != fileprefix[0]:
        pylab.show = lambda: pylab.savefig(fileprefix + '_1grams.pdf')
    unigrams_fd.plot(50, cumulative=True)
    pylab.close()

    print 'Bigrams Distribution'
    printFreqDistCSV(bigrams_fd, filename=fileprefix + '_2grams.csv')
    if len(fileprefix) > 0 and '_' != fileprefix[0]:
        pylab.show = lambda: pylab.savefig(fileprefix + '_2grams.pdf')
    bigrams_fd.plot(50, cumulative=True)
    pylab.close()

    print 'Trigrams Distribution'
    printFreqDistCSV(trigrams_fd, filename=fileprefix + '_3grams.csv')
    if len(fileprefix) > 0 and '_' != fileprefix[0]:
        pylab.show = lambda: pylab.savefig(fileprefix + '_3grams.pdf')
    trigrams_fd.plot(50, cumulative=True)
    pylab.close()

    if len(fileprefix) > 0 and '_' != fileprefix[0]:
        pylab.show = lambda: pylab.savefig(fileprefix + '_ngrams.pdf')
    unigrams_fd.plot(50, cumulative=True)
    bigrams_fd.plot(50, cumulative=True)
    trigrams_fd.plot(50, cumulative=True)
    pylab.close()

    if len(fileprefix) > 0 and '_' != fileprefix[0]:
        sys.stdout.close()
        sys.stdout = realstdout
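printFreqDistCSV is called above but defined elsewhere in the module. As a hedged sketch of what such a helper could look like (the original implementation may differ), it prints a summary of an nltk.FreqDist and optionally dumps token/count pairs to a CSV file:

def printFreqDistCSV(fd, filename=''):
    # Sketch only: report the distribution and write it as token,count rows.
    import csv
    print 'Total samples:', fd.N(), ', distinct tokens:', len(fd)
    if len(filename) > 0:
        with open(filename, 'wb') as out:
            writer = csv.writer(out, lineterminator='\n')
            for (token, count) in fd.most_common():
                writer.writerow([token, count])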
import csv

import preprocessing as p  # loading the preprocessing module
import heuristics as h     # heuristics contains the corpora of emoticons, sentiwords,
                           # the spellchecker and the acronym dictionary

with open('finaldataset.csv', 'rb') as inp:
    all = []
    reader = csv.reader(inp)
    cnt = 0
    for row in reader:
        mod = []
        mod.append(cnt)
        cnt += 1
        mod.append(row[2])
        # processAll is the function in the "preprocessing" module which removes
        # special characters and converts the raw tweet into normalised text
        psen = p.processAll(row[1])
        # mark negation words from the heuristics NEGATE list with the __NEG placeholder
        for item in psen.split():
            if item.lower() in h.NEGATE:
                psen = psen.replace(item, '__NEG')
        mod.append(psen)
        all.append(mod)

print len(all)

# write the processed data to "data.csv"
with open('data.csv', 'wb') as op:
    writer = csv.writer(op, lineterminator='\n')
    writer.writerows(all)
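For illustration, the negation-marking loop above replaces any token found in h.NEGATE with the placeholder __NEG. A minimal standalone demonstration, with a toy NEGATE set standing in for the real list from the heuristics module:

# Toy demonstration of the __NEG substitution (NEGATE here is a stand-in for heuristics.NEGATE).
NEGATE = set(['not', 'never', "don't"])
psen = "i do not like rainy days"
for item in psen.split():
    if item.lower() in NEGATE:
        psen = psen.replace(item, '__NEG')
print psen   # -> "i do __NEG like rainy days"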
# Variant of getTrainingAndTestData that also plots the most frequent n-grams per fold.
# draw_result is a plotting helper defined elsewhere in this project; a hedged sketch follows this listing.
import re
import sys
from functools import wraps

import nltk

import preprocessing


def getTrainingAndTestData(tweets, K, k, method, feature_set):

    add_ngram_feat = feature_set.get('ngram', 1)
    add_negtn_feat = feature_set.get('negtn', False)

    procTweets = [(preprocessing.processAll(text, subject=subj, query=quer), sent)
                  for (text, sent, subj, quer) in tweets]

    # refer this http://www.nltk.org/howto/stem.html
    stemmer = nltk.stem.PorterStemmer()

    all_tweets = []  # DATADICT: all_tweets = [ (words, sentiment), ... ]
    for (text, sentiment) in procTweets:
        words = [word if (word[0:2] == '__') else word.lower()
                 for word in text.split()
                 if len(word) >= 3]
        words = [stemmer.stem(w) for w in words]  # DATADICT: words = [ 'word1', 'word2', ... ]
        all_tweets.append((words, sentiment))

    train_tweets = [x for i, x in enumerate(all_tweets) if i % K != k]
    test_tweets = [x for i, x in enumerate(all_tweets) if i % K == k]

    unigrams_fd = nltk.FreqDist()
    if add_ngram_feat > 1:
        n_grams_fd = nltk.FreqDist()

    for (words, sentiment) in train_tweets:
        words_uni = words
        unigrams_fd.update(words)

        if add_ngram_feat >= 2:
            words_bi = [','.join(map(str, bg)) for bg in nltk.bigrams(words)]
            n_grams_fd.update(words_bi)

        if add_ngram_feat >= 3:
            words_tri = [','.join(map(str, tg)) for tg in nltk.trigrams(words)]
            n_grams_fd.update(words_tri)

    sys.stderr.write('\nlen( unigrams ) = ' + str(len(unigrams_fd.keys())))

    # unigrams_sorted = nltk.FreqDist(unigrams).keys()
    # plot the 200 most frequent unigrams for this fold
    p = []
    q = []
    for i, x in unigrams_fd.most_common(200):
        p.append(i)
        q.append(x)
    draw_result(p, q, "Words", "frequency", "Unigrams", "Unigrams_" + str(k))

    # bigrams_sorted = nltk.FreqDist(bigrams).keys()
    # trigrams_sorted = nltk.FreqDist(trigrams).keys()
    if add_ngram_feat > 1:
        sys.stderr.write('\nlen( n_grams ) = ' + str(len(n_grams_fd)))
        # p.setText('\nlen( n_grams ) = ' + str(len(n_grams_fd)) + '\n')
        # use a different loop variable so the fold index k is not shadowed below
        ngrams_sorted = [key for (key, v) in n_grams_fd.items() if v > 1]
        p1 = []
        q1 = []
        for i, x in n_grams_fd.most_common(200):
            p1.append(i)
            q1.append(x)
        sys.stderr.write('\nlen( ngrams_sorted ) = ' + str(len(ngrams_sorted)))
        draw_result(p1, q1, "Words", "frequency", "bigrams", "bigrams_" + str(k))
        # p.setText('\nlen( ngrams_sorted ) = ' + str(len(ngrams_sorted)) + '\n')

    ####################################################################################

    def get_word_features(words):
        bag = {}
        words_uni = ['has(%s)' % ug for ug in words]

        if add_ngram_feat >= 2:
            words_bi = ['has(%s)' % ','.join(map(str, bg)) for bg in nltk.bigrams(words)]
        else:
            words_bi = []

        if add_ngram_feat >= 3:
            words_tri = ['has(%s)' % ','.join(map(str, tg)) for tg in nltk.trigrams(words)]
        else:
            words_tri = []

        for f in words_uni + words_bi + words_tri:
            bag[f] = 1
        # bag = collections.Counter(words_uni+words_bi+words_tri)
        return bag

    # https://docs.python.org/3/library/re.html#re.X
    negtn_regex = re.compile(r"""(?:
        ^(?:never|no|nothing|nowhere|noone|none|not|
            havent|hasnt|hadnt|cant|couldnt|shouldnt|
            wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint
        )$
    )
    |
    n't""", re.X)

    def get_negation_features(words):
        INF = 0.0
        negtn = [bool(negtn_regex.search(w)) for w in words]

        left = [0.0] * len(words)
        prev = 0.0
        for i in range(0, len(words)):
            if negtn[i]:
                prev = 1.0
            left[i] = prev
            prev = max(0.0, prev - 0.1)

        right = [0.0] * len(words)
        prev = 0.0
        for i in reversed(range(0, len(words))):
            if negtn[i]:
                prev = 1.0
            right[i] = prev
            prev = max(0.0, prev - 0.1)

        # e.g. {'neg_l(word1)': 1.0, 'neg_r(word1)': 0.0, ...}
        return dict(zip(['neg_l(' + w + ')' for w in words] +
                        ['neg_r(' + w + ')' for w in words],
                        left + right))

    # http://stackoverflow.com/questions/13512391/to-count-no-times-a-function-is-called
    def counter(func):
        @wraps(func)
        def tmp(*args, **kwargs):
            tmp.count += 1
            return func(*args, **kwargs)
        tmp.count = 0
        return tmp

    # http://stackoverflow.com/questions/13512391/to-count-no-times-a-function-is-called
    @counter
    def extract_features(words):
        features = {}

        # the feature dict also removes duplicate features
        word_features = get_word_features(words)
        features.update(word_features)

        if add_negtn_feat:
            negation_features = get_negation_features(words)
            features.update(negation_features)

        sys.stderr.write('\rfeatures extracted for ' + str(extract_features.count) + ' tweets')
        # p.setText('\rfeatures extracted for ' + str(extract_features.count) + ' tweets' + '\n')
        return features

    extract_features.count = 0

    ####################################################################################

    if '1step' == method:
        # Apply NLTK's Lazy Map
        v_train = nltk.classify.apply_features(extract_features, train_tweets)
        v_test = nltk.classify.apply_features(extract_features, test_tweets)
        return (v_train, v_test)

    elif '2step' == method:
        isObj = lambda sent: sent in ['neg', 'pos']
        makeObj = lambda sent: 'obj' if isObj(sent) else sent

        train_tweets_obj = [(words, makeObj(sent)) for (words, sent) in train_tweets]
        test_tweets_obj = [(words, makeObj(sent)) for (words, sent) in test_tweets]

        train_tweets_sen = [(words, sent) for (words, sent) in train_tweets if isObj(sent)]
        test_tweets_sen = [(words, sent) for (words, sent) in test_tweets if isObj(sent)]

        v_train_obj = nltk.classify.apply_features(extract_features, train_tweets_obj)
        v_train_sen = nltk.classify.apply_features(extract_features, train_tweets_sen)
        v_test_obj = nltk.classify.apply_features(extract_features, test_tweets_obj)
        v_test_sen = nltk.classify.apply_features(extract_features, test_tweets_sen)

        test_truth = [sent for (words, sent) in test_tweets]

        return (v_train_obj, v_train_sen, v_test_obj, v_test_sen, test_truth)

    else:
        return nltk.classify.apply_features(extract_features, all_tweets)
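The listing above calls draw_result, which is defined elsewhere in the project. A hedged, purely illustrative sketch of a matplotlib bar-chart helper with the same argument order (labels, values, axis names, title, output name); the real helper may render and save its plots differently:

import matplotlib.pyplot as plt

def draw_result(labels, values, xlabel, ylabel, title, filename):
    # Sketch only: bar-plot the given frequencies and save the figure to <filename>.png
    plt.figure(figsize=(12, 4))
    plt.bar(range(len(values)), values)
    plt.xticks(range(len(labels)), labels, rotation=90, fontsize=4)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.tight_layout()
    plt.savefig(filename + '.png')
    plt.close()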
#!/usr/bin/env python2
# -*- coding: utf-8 -*-

import preprocessing as p
import heuristics as h
import csv, sys, nltk

with open('test/testdataset.csv', 'rb') as inp:
    all = []
    reader = csv.reader(inp)
    cnt = 0
    for row in reader:
        mod = []
        mod.append(cnt)
        cnt += 1
        # mod.append(row[2])
        psen = p.processAll(row[1])
        for item in psen.split():
            if item.lower() in h.NEGATE:
                psen = psen.replace(item, '__NEG')
        mod.append(psen)
        all.append(mod)

print len(all)
# print all[:5]

with open('test/data.csv', 'wb') as op:
    writer = csv.writer(op, lineterminator='\n')
    writer.writerows(all)

# with open('data.csv', 'rb') as ip:
#     reader = csv.reader(ip)
#     for item in reader:
#         print item
# sys.exit()
# Feature extraction for unlabelled tweets (classification time, no train/test split).
import re
import sys

import nltk


def getClassifyData(tweets):
    add_ngram_feat = 2
    add_negtn_feat = 1

    from functools import wraps
    import preprocessing

    procTweets = []
    for tweet in tweets:
        procTweet = preprocessing.processAll(tweet, subject="", query="")
        procTweets.append(procTweet)

    stemmer = nltk.stem.PorterStemmer()

    all_tweets_pre = []  # DATADICT: all_tweets_pre = [ words, ... ]
    for text in procTweets:
        words = [word if (word[0:2] == '__') else word.lower()
                 for word in text.split()
                 if len(word) >= 3]
        words = [stemmer.stem(w) for w in words]  # DATADICT: words = [ 'word1', 'word2', ... ]
        all_tweets_pre.append(words)

    unigrams_fd = nltk.FreqDist()
    if add_ngram_feat > 1:
        n_grams_fd = nltk.FreqDist()

    for words in all_tweets_pre:
        words_uni = words
        unigrams_fd.update(words)

        if add_ngram_feat >= 2:
            words_bi = [','.join(map(str, bg)) for bg in nltk.bigrams(words)]
            n_grams_fd.update(words_bi)

        if add_ngram_feat >= 3:
            words_tri = [','.join(map(str, tg)) for tg in nltk.trigrams(words)]
            n_grams_fd.update(words_tri)

    if add_ngram_feat > 1:
        sys.stderr.write('\nlen( n_grams ) = ' + str(len(n_grams_fd)))
        ngrams_sorted = [k for (k, v) in n_grams_fd.items() if v > 1]
        sys.stderr.write('\nlen( ngrams_sorted ) = ' + str(len(ngrams_sorted)))

    def get_word_features(words):
        bag = {}
        words_uni = ['has(%s)' % ug for ug in words]

        if add_ngram_feat >= 2:
            words_bi = ['has(%s)' % ','.join(map(str, bg)) for bg in nltk.bigrams(words)]
        else:
            words_bi = []

        if add_ngram_feat >= 3:
            words_tri = ['has(%s)' % ','.join(map(str, tg)) for tg in nltk.trigrams(words)]
        else:
            words_tri = []

        for f in words_uni + words_bi + words_tri:
            bag[f] = 1
        # bag = collections.Counter(words_uni+words_bi+words_tri)
        return bag

    negtn_regex = re.compile(r"""(?:
        ^(?:never|no|nothing|nowhere|noone|none|not|
            havent|hasnt|hadnt|cant|couldnt|shouldnt|
            wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint
        )$
    )
    |
    n't""", re.X)

    def get_negation_features(words):
        INF = 0.0
        negtn = [bool(negtn_regex.search(w)) for w in words]

        left = [0.0] * len(words)
        prev = 0.0
        for i in range(0, len(words)):
            if negtn[i]:
                prev = 1.0
            left[i] = prev
            prev = max(0.0, prev - 0.1)

        right = [0.0] * len(words)
        prev = 0.0
        for i in reversed(range(0, len(words))):
            if negtn[i]:
                prev = 1.0
            right[i] = prev
            prev = max(0.0, prev - 0.1)

        return dict(zip(['neg_l(' + w + ')' for w in words] +
                        ['neg_r(' + w + ')' for w in words],
                        left + right))

    def extract_features(words):
        features = {}

        word_features = get_word_features(words)
        features.update(word_features)

        if add_negtn_feat:
            negation_features = get_negation_features(words)
            features.update(negation_features)

        # sys.stderr.write('\rfeatures extracted for ' + str(extract_features.count) + ' tweets')
        return features

    extract_features.count = 0

    v_all = []
    for tweet_pre in all_tweets_pre:
        v_all.append(extract_features(tweet_pre))
    return v_all
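A minimal usage sketch for getClassifyData, assuming a classifier already trained with the featuresets produced by getTrainingAndTestData; the helper name classifyTweets and the sample input are illustrative only.

def classifyTweets(tweets, classifier):
    # Illustrative only: label raw tweet strings with a previously trained NLTK classifier.
    v_all = getClassifyData(tweets)
    return [classifier.classify(features) for features in v_all]

# e.g. labels = classifyTweets(["I don't like mondays"], trained_classifier)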
# Python 3 variant of the n-gram statistics routine.
import os
import sys

import nltk
import pylab

import preprocessing


def preprocessingStats(tweets, fileprefix=''):

    if len(fileprefix) > 0 and '_' != fileprefix[0]:
        directory = os.path.dirname(fileprefix)
        if not os.path.exists(directory):
            os.makedirs(directory)
        print('writing to', fileprefix + '_stats.txt')
        # redirect stdout so everything printed below goes to the stats file
        realstdout = sys.stdout
        sys.stdout = open(fileprefix + '_stats.txt', 'w')

    ###########################################################################

    print('for', len(tweets), 'tweets:')

    print('###########################################################################')
    printFeaturesStats(tweets)

    print('###########################################################################')
    printAllRecuctionStats(tweets)

    print('###########################################################################')

    procTweets = [(preprocessing.processAll(text, subject=subj, query=quer), sent)
                  for (text, sent, subj, quer) in tweets]

    # drop words shorter than 3 characters and lower-case all tokens except the __-prefixed markers
    tweetsArr = []
    for (text, sentiment) in procTweets:
        words = [word if (word[0:2] == '__') else word.lower()
                 for word in text.split()
                 if len(word) >= 3]
        tweetsArr.append([words, sentiment])

    unigrams_fd = nltk.FreqDist()
    bigrams_fd = nltk.FreqDist()
    trigrams_fd = nltk.FreqDist()

    for (words, sentiment) in tweetsArr:
        words_bi = [','.join(map(str, bg)) for bg in nltk.bigrams(words)]
        words_tri = [','.join(map(str, tg)) for tg in nltk.trigrams(words)]
        # refer this for update(): http://www.nltk.org/howto/probability.html
        unigrams_fd.update(words)
        bigrams_fd.update(words_bi)
        trigrams_fd.update(words_tri)

    print('Unigrams Distribution')
    printFreqDistCSV(unigrams_fd, filename=fileprefix + '_1grams.csv')
    if len(fileprefix) > 0 and '_' != fileprefix[0]:
        pylab.show = lambda: pylab.savefig(fileprefix + '_1grams.pdf')
    unigrams_fd.plot(50, cumulative=True)
    pylab.close()

    print('Bigrams Distribution')
    printFreqDistCSV(bigrams_fd, filename=fileprefix + '_2grams.csv')
    if len(fileprefix) > 0 and '_' != fileprefix[0]:
        pylab.show = lambda: pylab.savefig(fileprefix + '_2grams.pdf')
    bigrams_fd.plot(50, cumulative=True)
    pylab.close()

    print('Trigrams Distribution')
    printFreqDistCSV(trigrams_fd, filename=fileprefix + '_3grams.csv')
    if len(fileprefix) > 0 and '_' != fileprefix[0]:
        pylab.show = lambda: pylab.savefig(fileprefix + '_3grams.pdf')
    trigrams_fd.plot(50, cumulative=True)
    pylab.close()

    if len(fileprefix) > 0 and '_' != fileprefix[0]:
        sys.stdout.close()
        sys.stdout = realstdout
import time

import pandas as pd

import preprocessing

# read tw_bought.csv, which contains 18,000 tweets
df = pd.read_csv('tw_bought.csv', header=0)

# tweet_ids = list()
# tweet_ids2 = list()
# tweet_ids3 = list()

X = df.as_matrix(columns=df.columns[0:7])  # take the first seven columns as a matrix

for x in X.tolist():
    R = ''.join(x[6])    # tweet text column
    id1 = int(x[0])      # user id
    d1 = ''.join(x[1])   # date column
    d1 = d1 + ' '
    t1 = ''.join(x[2])   # time column
    dt = d1 + t1
    # combine date and time into a single timestamp
    struct_Time = time.mktime(time.strptime(dt, '%Y-%m-%d %H:%M'))
    # print(R)
    # processAll removes special characters from the tweet and normalises it into readable words
    t = preprocessing.processAll(R)
    print(t, ',', id1, ',', struct_Time)
    # corp_txt is an output file handle opened earlier in the script
    print(t, ',', id1, ',', struct_Time, file=corp_txt)  # write to the output file as well
    # corp_txt.write(pr_txt)

df1 = pd.read_csv('tw_main.csv', header=0)  # read tw_main.csv

# tweet_ids = list()
# tweet_ids2 = list()
# tweet_ids3 = list()

X1 = df1.as_matrix(columns=df1.columns[0:3])  # read only the first three columns

for x1 in X1.tolist():
    R1 = ''.join(x1[1])   # tweet text
    id2 = int(x1[0])      # user id
    dt2 = ''.join(x1[2])  # combined date-time field