    def on_data(self, data):

        global positive_count
        global negative_count
        global start_time
        global current_time
        global start_date_time
        global current_date_time
        global f

        tweets = json.loads(data)
        if tweets and 'text' in tweets:  # ignore stream messages without a text field
            s = tweets['text']
            if not s.isspace():
                language = tweets['lang']
                if language == 'en':
                    s = s.encode('ascii', 'ignore')
                    print "original Tweet           : " + s
                    s = re.sub(r'\s+', ' ', s).strip()
                    if s:
                        s = normalize_sentence(s)
                        s = remove.remove(s)
                        s = re.sub(url, '', s)
                        s = re.sub(username, '', s)
                        s = re.sub(hashtags, '', s)
                        tokenized = tok.tokenize(s)
                        s = ''
                        for token in tokenized:
                            token = remove.remove(token)
                            s = s + token + ' '
                        s = re.sub(r'\s+', ' ', s).strip()
                        s = remove.remove(s)
                        print "normalised Tweet         : " + s
                        ##                    table = string.maketrans("","")
                        ##                    s=s.translate(table, string.punctuation)
                        if s and not s.isspace():  # skip tweets that normalise to nothing
                            f.write(s + '\n')
                            current_time = time.time()
                            elapsed_time = current_time - start_time
                            if elapsed_time > 3600:  # rotate the output file every hour
                                f.close()
                                start_date_time = datetime.datetime.now(
                                ).strftime('%Y-%m-%d %H-%M-%S')
                                filename = 'live_tweets_' + str(
                                    start_date_time) + '.txt'
                                f = codecs.open(filename, 'w', 'utf-8')
                                start_time = time.time()
        return True
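
# --- Hedged usage sketch (not in the original source) ---
# on_data above is written as a Tweepy stream-listener callback.  Assuming the
# classic (pre-4.0) Tweepy API, a hypothetical TweetListener class containing
# the method above, and placeholder credentials, it could be wired up like so:
import tweepy

auth = tweepy.OAuthHandler('CONSUMER_KEY', 'CONSUMER_SECRET')
auth.set_access_token('ACCESS_TOKEN', 'ACCESS_TOKEN_SECRET')

listener = TweetListener()              # assumed subclass of tweepy.StreamListener
stream = tweepy.Stream(auth, listener)  # tweepy < 4.0 constructor signature
stream.filter(track=['keyword'], languages=['en'])  # feeds raw JSON into on_data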
Example #3
    def on_data(self, data):
        global i
        i = i + 1
        print i
        if i == 100000:  # stop streaming after 100000 tweets
            exit(i)
        tweets = json.loads(data)
        s = tweets['text']
        language = tweets['lang']
        if language == 'en':
            s = s.encode('ascii', 'ignore')
            ##          print "======================================================================"
            print "original Tweet           : " + s
            s = re.sub(r'\s+', ' ', s).strip()
            s = normalize_sentence(s)
            ##            print "normalized Tweet         : " + s
            s = remove.remove(s)
            ##            print "stop words removed Tweet : " + s

            s = re.sub(url, '', s)
            #            s = re.sub(hashtags,'',s)
            s = re.sub(username, '', s)
            ##            print "url  hashtags usernames removed Tweet : " + s

            tokenized = tok.tokenize(s)
            ##            print "Tokens in Tweet : "
            ##            print "\n".join(tokenized)
            s = ''
            for token in tokenized:
                token = remove.remove(token)
                s = s + token + ' '
            s = re.sub(r'\s+', ' ', s).strip()
            ##            f.write(s+'\n');
            s = remove.remove(s)
            ##            print "Final Tweet : " + s
            ##            if 'retweeted_status' in tweets:
            ##                retweet_count = tweets['retweeted_status']['retweet_count']
            ##            else:
            ##                retweet_count=0

            ##writer.writerow((s,retweet_count))
            ##            print Stored_classifier.classify(extract_features(s.split()))

            return True
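
# --- Hedged sketch of the helpers used above (not defined in this excerpt) ---
# The handlers rely on url, username, hashtags, tok, remove and
# normalize_sentence, none of which are shown here.  One plausible set of
# definitions, assuming NLTK's tweet tokenizer and English stop-word list
# (requires nltk.download('stopwords')), might look like this:
import re
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

url = re.compile(r'https?://\S+|www\.\S+')   # links
username = re.compile(r'@\w+')               # @mentions
hashtags = re.compile(r'#\w+')               # #hashtags

tok = TweetTokenizer()                       # tweet-aware tokenizer


class StopWordRemover(object):
    """Stand-in for the 'remove' helper: drops English stop words."""

    def __init__(self):
        self.stop = set(stopwords.words('english'))

    def remove(self, text):
        return ' '.join(w for w in text.split() if w.lower() not in self.stop)


remove = StopWordRemover()


def normalize_sentence(s):
    """Lower-case and squeeze characters repeated 3+ times (e.g. 'soooo' -> 'soo')."""
    s = s.lower()
    return re.sub(r'(.)\1{2,}', r'\1\1', s)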
pkl_file = open('classifier_1Lac.pkl', 'rb')
Stored_classifier = pickle.load(pkl_file)
##f1 = open('classified_tweets.txt', 'w')
pkl_file = open('word_feature_1Lac.pkl', 'rb')
word_features = pickle.load(pkl_file)
print "classifier loaded"
f1 = codecs.open('classified_test_tweets.txt', 'w', 'utf-8')
writer = csv.writer(f1, delimiter='\t')
print "file opened for writing"
with open('testtweets.txt', 'rb') as csvfile:
    spamreader = csv.reader(csvfile, dialect="excel-tab")
    for (s, labelled_sentiment) in spamreader:
        i = i + 1  # classify every 16th test tweet
        if i % 16 == 0:
            print i
            s = re.sub(r'\s+', ' ', s).strip()
            s = normalize_sentence(s)
            s = remove.remove(s)
            s = re.sub(url, '', s)
            s = re.sub(hashtags, '', s)
            s = re.sub(username, '', s)
            tokenized = tok.tokenize(s)
            s = ''
            for token in tokenized:
                token = remove.remove(token)
                s = s + token + ' '
            s = re.sub(r'\s+', ' ', s).strip()
            s = remove.remove(s)
            classified_sentiment = Stored_classifier.classify(
                extract_features(s.split()))
            print s + '\n'
            print classified_sentiment
            writer.writerow((s, classified_sentiment))  # record the predicted label
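
# --- Hedged sketch of extract_features (not defined in this excerpt) ---
# Stored_classifier.classify(extract_features(s.split())) follows the usual
# NLTK NaiveBayesClassifier pattern; assuming word_features is the pickled
# list of known words, the feature extractor would look roughly like this:
def extract_features(document):
    """Bag-of-words features: contains(word) -> True/False for each known word."""
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features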
Stored_classifier = pickle.load(pkl_file)  # pkl_file holding the trained classifier must already be open here
##f1 = open('classified_tweets.txt', 'w')
pkl_file = open('word_features.pkl', 'rb')
word_features = pickle.load(pkl_file)
print "classifier loaded"
f1 = codecs.open('classified_tweets.txt', 'w', 'utf-8')
writer = csv.writer(f1, delimiter='\t')
print "file opened for writing"
with open('200tweets.txt', 'rb') as csvfile:
    spamreader = csv.reader(csvfile, dialect="excel-tab")
    for (s, labelled_sentiment) in spamreader:
        print i
        i = i + 1
##        s=s.decode("utf8")
        s = re.sub(r'\s+', ' ', s).strip()
        s = normalize_sentence(s)
        s = remove.remove(s)
        s = re.sub(url, '', s)
##        s = re.sub(hashtags,'',s)
        s = re.sub(username, '', s)
        tokenized = tok.tokenize(s)
        s = ''
        for token in tokenized:
            token = remove.remove(token)
            s = s + token + ' '
        s = re.sub(r'\s+', ' ', s).strip()
        s = remove.remove(s)
        classified_sentiment = Stored_classifier.classify(extract_features(s.split()))
        print s + '\n'
        print classified_sentiment
        writer.writerow((s, classified_sentiment))