def on_data(self, data):
    # Stream callback: clean each incoming English tweet and append it to an
    # output file that is rotated every hour.
    # Requires json, re, time, datetime and codecs at module level, plus the
    # normalize_sentence/remove/tok helpers and the url/username/hashtags
    # patterns defined elsewhere in the project.
    global positive_count, negative_count
    global start_time, current_time
    global start_date_time, current_date_time
    global f
    tweets = json.loads(data)
    if 'text' in tweets and tweets:
        s = tweets['text']
        if not s.isspace():
            language = tweets['lang']
            if language == 'en':
                s = s.encode('ascii', 'ignore')
                print "original Tweet : " + s
                s = re.sub('\s+', ' ', s).strip()
                if s:
                    # Normalise, strip stop words, then drop URLs, usernames
                    # and hashtags before tokenising.
                    s = normalize_sentence(s)
                    s = remove.remove(s)
                    s = re.sub(url, '', s)
                    s = re.sub(username, '', s)
                    s = re.sub(hashtags, '', s)
                    tokenized = tok.tokenize(s)
                    s = ''
                    for token in tokenized:
                        token = remove.remove(token)
                        s = s + token + ' '
                    s = re.sub('\s+', ' ', s).strip()
                    s = remove.remove(s)
                    print "normalised Tweet : " + s
                    ## table = string.maketrans("", "")
                    ## s = s.translate(table, string.punctuation)
                    if not s.isspace():
                        f.write(s + '\n')
                    # Rotate the output file once an hour.
                    current_time = time.time()
                    elapsed_time = current_time - start_time
                    if elapsed_time > 3600:
                        f.close()
                        start_date_time = datetime.datetime.now().strftime('%Y-%m-%d %H-%M-%S')
                        filename = 'live_tweets_' + str(start_date_time) + '.txt'
                        f = codecs.open(filename, 'w', 'utf-8')
                        start_time = time.time()
    return True
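# Illustrative only: how an on_data handler like the one above is usually
# attached to the Twitter stream. This sketch assumes tweepy 3.x; the
# MyStreamListener class name, the credential placeholders and the tracked
# keyword are not taken from the original code.
import tweepy

class MyStreamListener(tweepy.StreamListener):
    def on_data(self, data):
        # the cleaning / hourly file-rotation logic shown above goes here
        return True

    def on_error(self, status_code):
        # returning False disconnects the stream when Twitter reports an error
        return False

auth = tweepy.OAuthHandler('CONSUMER_KEY', 'CONSUMER_SECRET')
auth.set_access_token('ACCESS_TOKEN', 'ACCESS_TOKEN_SECRET')
stream = tweepy.Stream(auth, MyStreamListener())
stream.filter(languages=['en'], track=['news'])  # example keyword filter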
def on_data(self, data):
    # Stream callback: normalise each incoming English tweet, stopping after
    # 100000 tweets have been received.
    # Requires json and re at module level, plus the normalize_sentence/remove/tok
    # helpers and the url/username/hashtags patterns defined elsewhere.
    global i
    i = i + 1
    print i
    if i == 100000:
        exit(i)
    tweets = json.loads(data)
    s = tweets['text']
    language = tweets['lang']
    if language == 'en':
        s = s.encode('ascii', 'ignore')
        print "original Tweet : " + s
        s = re.sub('\s+', ' ', s).strip()
        s = normalize_sentence(s)
        ## print "normalized Tweet : " + s
        s = remove.remove(s)
        ## print "stop words removed Tweet : " + s
        s = re.sub(url, '', s)
        # s = re.sub(hashtags, '', s)
        s = re.sub(username, '', s)
        ## print "url hashtags usernames removed Tweet : " + s
        tokenized = tok.tokenize(s)
        ## print "Tokens in Tweet : "
        ## print "\n".join(tokenized)
        s = ''
        for token in tokenized:
            token = remove.remove(token)
            s = s + token + ' '
        s = re.sub('\s+', ' ', s).strip()
        ## f.write(s + '\n')
        s = remove.remove(s)
        ## print "Final Tweet : " + s
        ## if 'retweeted_status' in tweets:
        ##     retweet_count = tweets['retweeted_status']['retweet_count']
        ## else:
        ##     retweet_count = 0
        ## writer.writerow((s, retweet_count))
        ## print Stored_classifier.classify(extract_features(s.split()))
    return True
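# Illustrative only: one possible definition of the url / username / hashtags
# patterns and the tok tokenizer that the preprocessing above relies on. The
# original project defines these elsewhere; the exact expressions here are an
# assumption.
import re
from nltk.tokenize import TweetTokenizer

url = re.compile(r'(https?://\S+|www\.\S+)')   # hyperlinks
username = re.compile(r'@\w+')                 # @mentions
hashtags = re.compile(r'#\w+')                 # hashtag markers
tok = TweetTokenizer()                         # tweet-aware tokenizer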
# Classify a labelled test set with the stored classifier.
# normalize_sentence, remove, tok, url, hashtags, username and
# extract_features are defined elsewhere in the project.
import pickle
import csv
import re
import codecs

# Load the trained classifier and its word-feature list.
pkl_file = open('classifier_1Lac.pkl', 'rb')
Stored_classifier = pickle.load(pkl_file)
## f1 = open('classified_tweets.txt', 'w')
pkl_file = open('word_feature_1Lac.pkl', 'rb')
word_features = pickle.load(pkl_file)
print "classifier loaded"

f1 = codecs.open('classified_test_tweets.txt', 'w', 'utf-8')
writer = csv.writer(f1, delimiter='\t')
print "file opened for writing"

i = 0  # progress counter
with open('testtweets.txt', 'rb') as csvfile:
    spamreader = csv.reader(csvfile, dialect="excel-tab")
    for (s, labelled_sentiment) in spamreader:
        i = i + 1
        if i % 16 == 0:
            print i
        # Apply the same normalisation pipeline used while streaming.
        s = re.sub('\s+', ' ', s).strip()
        s = normalize_sentence(s)
        s = remove.remove(s)
        s = re.sub(url, '', s)
        s = re.sub(hashtags, '', s)
        s = re.sub(username, '', s)
        tokenized = tok.tokenize(s)
        s = ''
        for token in tokenized:
            token = remove.remove(token)
            s = s + token + ' '
        s = re.sub('\s+', ' ', s).strip()
        s = remove.remove(s)
        classified_sentiment = Stored_classifier.classify(extract_features(s.split()))
        print s + '\n'
        print classified_sentiment
# Variant of the script above: classify a batch of 200 tweets and write the
# predictions to a tab-separated file.
# pkl_file is assumed to already point at the opened classifier pickle.
Stored_classifier = pickle.load(pkl_file)
## f1 = open('classified_tweets.txt', 'w')
pkl_file = open('word_features.pkl', 'rb')
word_features = pickle.load(pkl_file)
print "classifier loaded"

f1 = codecs.open('classified_tweets.txt', 'w', 'utf-8')
writer = csv.writer(f1, delimiter='\t')
print "file opened for writing"

i = 0  # progress counter
with open('200tweets.txt', 'rb') as csvfile:
    spamreader = csv.reader(csvfile, dialect="excel-tab")
    for (s, labelled_sentiment) in spamreader:
        print i
        i = i + 1
        ## s = s.decode("utf8")
        s = re.sub('\s+', ' ', s).strip()
        s = normalize_sentence(s)
        s = remove.remove(s)
        s = re.sub(url, '', s)
        ## s = re.sub(hashtags, '', s)
        s = re.sub(username, '', s)
        tokenized = tok.tokenize(s)
        s = ''
        for token in tokenized:
            token = remove.remove(token)
            s = s + token + ' '
        s = re.sub('\s+', ' ', s).strip()
        s = remove.remove(s)
        classified_sentiment = Stored_classifier.classify(extract_features(s.split()))
        print s + '\n'
        print classified_sentiment
        writer.writerow((s, classified_sentiment))
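# Illustrative only: the classify() calls above expect an extract_features
# helper built around the pickled word_features list. A common NLTK-style
# definition (an assumption, not reproduced from the original code) is:
def extract_features(document):
    # flag, for every known feature word, whether it occurs in the tweet tokens
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

# e.g. with word_features = ['good', 'bad'], the tokens ['good', 'day'] map to
# {'contains(good)': True, 'contains(bad)': False}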