def getTweets(topics, PATH_TO_RAW_DATA, MAX_TWEETS):
    auth = tweepy.OAuthHandler(API_KEY, API_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
    # If requests go through a proxy-based server, pass proxy="http://user:password@host:port/"
    api = tweepy.API(auth, proxy="http://*****:*****@HOSTNAME:PORT/")
    for topic in topics:
        print(topic)
        outFile = PATH_TO_RAW_DATA + '/' + topic + ".txt"
        ctr = 0
        while ctr < MAX_TWEETS:
            data = tweepy.Cursor(api.search, q=topic, lang='en').items(500)
            try:
                tweets = [tweet.text.lower().encode('utf-8') for tweet in data]
                ctr = ctr + len(tweets)
                util.listTotxt(outFile, tweets, "a+")
            except tweepy.error.TweepError:
                # Rate limit restriction: only a fixed number of search requests are allowed per 15-min window
                print("Waiting for 15 mins : Rate Limit Restriction " + str(ctr) + " Tweets Extracted\n")
                time.sleep(60 * 15)
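# Illustrative usage (not part of the original script): getTweets assumes that API_KEY,
# API_SECRET, ACCESS_TOKEN and ACCESS_TOKEN_SECRET are defined at module level with the
# Twitter app credentials, and appends up to MAX_TWEETS raw tweets per topic to
# <PATH_TO_RAW_DATA>/<topic>.txt. The topic names and path below are placeholders only.
#
#   topics = ["demonetisation", "budget2017"]   # hypothetical topic list
#   getTweets(topics, "data/raw", MAX_TWEETS=5000)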
def Clustering(outfile, tweets, n_clusters, topic, k):
    np.random.seed(0)
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(tweets)
    model = KMeans(n_clusters, init='k-means++', max_iter=500, n_init=20)
    model.fit(X)
    # Check for empty clusters, if any
    Tweetclusters = list(model.predict(X))
    nonempty = [i for i in range(0, n_clusters) if Tweetclusters.count(i) != 0]
    empty = list(set(range(0, n_clusters)) - set(nonempty))
    print("Empty Clusters : " + str(topic) + " TOP " + str(k) + " KMEANS " + str(n_clusters) + " " + str(empty))
    # Write tweet-wise cluster labels to file
    outfile1 = os.path.dirname(outfile) + "/TWEET_CLUSTER_" + str(topic) + "_TOP_" + str(k) + "_KMEANS_" + str(n_clusters) + ".txt"
    util.listTotxt(outfile1, Tweetclusters, "w+")
    # Get the top-ranked tweets from each cluster: k tweets overall, k/n_clusters per cluster
    ind = int(k) / int(n_clusters)
    TopTweet, ClusterAllIndex, AllTopTweet = [], [], []
    for i in range(n_clusters):
        ClusterTweets, ClusterIndex, TopClusterTweet = [], [], []
        for j in range(0, len(Tweetclusters)):
            if Tweetclusters[j] == i:
                ClusterTweets.append(tweets[j])
                ClusterIndex.append(j)
        ClusterAllIndex.append(ClusterIndex)
        TopClusterTweet = rank_by_val(ClusterTweets, topic, ind)
        TopTweet.append(TopClusterTweet)
        # rank_by_val returns indices within ClusterTweets; map them back to indices into `tweets`
        AllTopTweet.extend([ClusterIndex[t] for t in TopClusterTweet])
    outfile1 = os.path.dirname(outfile) + "/INDEX_" + str(topic) + "_TOP_" + str(k) + "_KMEANS_" + str(n_clusters) + ".txt"
    util.listTotxt(outfile1, ClusterAllIndex, "w+")
    with open(outfile, "w+") as f:
        for i in range(len(AllTopTweet)):
            j = AllTopTweet[i]
            f.write(str(tweets[j]).encode("utf-8") + "\n")
    outfile1 = os.path.dirname(outfile) + "/CLUSTER_TWEETS_" + str(topic) + "_TOP_" + str(k) + "_KMEANS_" + str(n_clusters) + ".txt"
    util.listTocsv(outfile1, TopTweet, "w+")
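# Illustrative usage (placeholder values): Clustering writes the per-tweet cluster labels,
# the per-cluster tweet indices, and the top-ranked tweets of each cluster next to `outfile`.
# rank_by_val is assumed (consistent with its use in ranker below) to return the indices of
# the top int(k)/n_clusters tweets within each cluster.
#
#   outfile = "results/GST_TOP_50_KMEANS_10.txt"   # hypothetical path
#   Clustering(outfile, tweets, n_clusters=10, topic="GST", k=50)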
def process(inPath, outPath, topics):
    for topic in topics:
        inFile = inPath + '/' + topic + ".csv"
        tweets = util.csvTolist(inFile)
        tweets = [str(tweet).strip("[']") for tweet in tweets]
        print("No. of Tweets extracted " + str(topic) + "\t\t\t" + str(len(tweets)))
        tweets = make_lowercase(tweets)
        tweets = remove_repetition(tweets)
        tweets = remove_newline(tweets)
        tweets = if_not_topic(tweets, topic.lower())
        # POS-tag the tweets; the result has the form
        # [[[tw1_token1, postag, confidence], [tw1_token2, postag, confidence], ...], [[tw2_token1, postag, confidence], ...], ...]
        pos_tweets = tagger.runtagger_parse(tweets)
        tweets = common_except_url(pos_tweets)
        pos_tweets = tagger.runtagger_parse(tweets)
        print("No. of Tweets after cleaning : " + str(topic) + "\t\t\t" + str(len(tweets)))
        outFile = outPath + '/data_' + topic + ".txt"
        util.listTotxt(outFile, tweets, "w+")
        outFile = outPath + '/POS_' + topic + ".csv"
        util.listTocsv(outFile, pos_tweets, "w+")
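# The cleaning helpers used above (make_lowercase, remove_repetition, remove_newline,
# if_not_topic, common_except_url) and the TweetNLP POS-tagger wrapper `tagger` are defined
# elsewhere in this project. A minimal sketch of what the simpler helpers might look like,
# shown for illustration only -- the actual implementations may differ:
#
#   def make_lowercase(tweets):
#       return [tweet.lower() for tweet in tweets]
#
#   def remove_newline(tweets):
#       return [tweet.replace("\n", " ").replace("\r", " ") for tweet in tweets]
#
#   def if_not_topic(tweets, topic):
#       # drop tweets that do not mention the topic term at all
#       return [tweet for tweet in tweets if topic in tweet]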
def GreedyNormal(outfile, TweetTokens, aspects, tweets, limit):
    TopInd, ctr = [], []
    left = [aspect for aspect in aspects]
    # For every tweet, record [aspects it covers, its index, number of aspects covered]
    for i in range(0, len(TweetTokens)):
        x = [intersect(TweetTokens[i], left), i]
        x.append(len(x[0]))
        ctr.append(x)
    ctr = sorted(ctr, key=lambda x: int(x[2]), reverse=True)
    # Stop when all aspects are covered, the tweet limit is reached, or no tweets remain
    while len(left) > 0 and len(TopInd) < limit and len(ctr) > 0:
        TweetAsp = ctr[0][0]      # aspects covered by the best remaining tweet
        TopInd.append(ctr[0][1])  # index of the tweet selected
        # Remove the covered aspects, since they no longer need to be covered by other tweets
        left = [token for token in left if token not in TweetAsp]
        ctr.remove(ctr[0])
        # Re-score the remaining tweets against the aspects still uncovered
        for i in range(0, len(ctr)):
            ctr[i][0] = intersect(ctr[i][0], left)
            ctr[i][2] = len(ctr[i][0])
        ctr = sorted(ctr, key=lambda x: int(x[2]), reverse=True)
    results = [tweets[i] for i in TopInd]
    util.listTotxt(outfile, results, "w+")
    return results
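# GreedyNormal is a greedy set-cover style selection: at each step it picks the tweet that
# covers the most still-uncovered aspects, marks those aspects as covered, and repeats.
# `intersect` is defined elsewhere in this project; a minimal definition consistent with how
# it is used here (an assumption, not the original code) would be:
#
#   def intersect(tokens, aspects):
#       return [token for token in tokens if token in aspects]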
def ranker(rfile, data, topic, ind):
    # rank_by_val returns the indices of the top `ind` items in `data` for this topic
    TopIndex = rank_by_val(data, topic, ind)
    TopData = [data[j].encode("utf-8") for j in TopIndex]
    util.listTotxt(rfile, TopData, "w+")
    return TopData
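# Illustrative call (placeholder path): select the 50 highest-ranked tweets for a topic and
# write them to rfile.
#
#   top = ranker("results/GST_TOP_50_RANKED.txt", tweets, "GST", 50)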
topk = "TOP_" + str(ki) CosineSimilarityVSM = [] method = "ALL_TWEETS" outPath = PATH_TO_RESULTS + "/" + topic + "/" + topk + "/" + method util.createFilePath(outPath) val = measures.entropy(tweets) print(topic + topk + method + " Entropy : " + str(val)) method = "RANDOM_TWEETS" outPath = PATH_TO_RESULTS + "/" + topic + "/" + topk + "/" + method util.createFilePath(outPath) results = tweets[0:ki] rfile = outPath + "/" + topic + "_" + topk + "_" + method + ".txt" print rfile util.listTotxt(rfile, results, "w+") val = measures.entropy(results) print(topic + " " + topk + method + " Entropy : " + str(val)) measures.get_ParaphraseSim(tweets, rfile, outPath, topic, ki) CosineSimilarityVSM.append(measures.get_VSMsim(rfile, tweets, results)) outFile = outPath + "/" + topic + "_" + topk + "_" + method + "_VSMSimilarityMatrix.csv" measures.writeCosineSimMatrix(outFile, tweets, results) ind = outPath.rfind("/") outFile = outPath[ 0: ind] + "/" + topic + "_" + topk + "_" + method + "_Doc2vecSimilarityMatrix.csv" measures.writeDoc2vecSimMatrix(outFile, tweets, results, True) # UNSUPERVISED CLUSTERING USING KMEANS n_clusters = [5, 10, 25, 50, 100]
topk="TOP_"+str(ki) CosineSimilarityVSM=[] method="ALL_TWEETS" outPath=PATH_TO_RESULTS+"/"+topic+"/"+topk+"/"+method util.createFilePath(outPath) val=measures.entropy(tweets) print(topic+topk+method+" Entropy : "+str(val)) method="RANDOM_TWEETS" outPath=PATH_TO_RESULTS+"/"+topic+"/"+topk+"/"+method util.createFilePath(outPath) results=tweets[0:ki] rfile=outPath+"/"+topic+"_"+topk+"_"+method+".txt" print rfile util.listTotxt(rfile,results,"w+") val=measures.entropy(results) print(topic+" "+topk+method+" Entropy : "+str(val)) measures.get_ParaphraseSim(tweets,rfile,outPath,topic,ki) CosineSimilarityVSM.append(measures.get_VSMsim(rfile,tweets,results)) outFile=outPath+"/"+topic+"_"+topk+"_"+method+"_VSMSimilarityMatrix.csv" measures.writeCosineSimMatrix(outFile,tweets,results) ind=outPath.rfind("/") outFile=outPath[0:ind]+"/"+topic+"_"+topk+"_"+method+"_Doc2vecSimilarityMatrix.csv" measures.writeDoc2vecSimMatrix(outFile,tweets,results,True) # UNSUPERVISED CLUSTERING USING KMEANS n_clusters=[5,10,25,50,100] for n in n_clusters : if ki>=n :