Example #1
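This example clusters topic-specific tweets: Clustering builds TF-IDF vectors, runs k-means, reports any empty clusters, and writes per-tweet cluster labels, per-cluster index lists, and the top-ranked tweets of each cluster to disk; process reads each topic's tweets from CSV, cleans and POS-tags them, and writes the results.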
import os

import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

# util, tagger, rank_by_val and the cleaning helpers (make_lowercase,
# remove_repetition, remove_newline, if_not_topic, common_except_url)
# are project-local modules/functions assumed to be importable here.

def Clustering(outfile, tweets, n_clusters, topic, k):
	np.random.seed(0)
	vectorizer = TfidfVectorizer(stop_words='english')
	X = vectorizer.fit_transform(tweets)
	model = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=500, n_init=20)
	model.fit(X)
	
	# Check for empty clusters, if any
	Tweetclusters = list(model.predict(X))
	nonempty = [i for i in range(n_clusters) if Tweetclusters.count(i) > 0]
	empty = sorted(set(range(n_clusters)) - set(nonempty))
	print("Empty Clusters: " + str(topic) + " TOP " + str(k) + " KMEANS " + str(n_clusters) + " " + str(empty))

	# Write per-tweet cluster labels to file
	outfile1 = os.path.dirname(outfile) + "/TWEET_CLUSTER_" + str(topic) + "_TOP_" + str(k) + "_KMEANS_" + str(n_clusters) + ".txt"
	util.listTotxt(outfile1, Tweetclusters, "w+")
	
	# Get the top-ranked tweets from each cluster (roughly k/n_clusters per cluster)
	ind = int(k) // int(n_clusters)  # integer division; plain '/' would yield a float in Python 3
	TopTweet, ClusterAllIndex, AllTopTweet = [], [], []
	for i in range(n_clusters):
		ClusterTweets, ClusterIndex = [], []
		for j in range(len(Tweetclusters)):
			if Tweetclusters[j] == i:
				ClusterTweets.append(tweets[j])
				ClusterIndex.append(j)
		ClusterAllIndex.append(ClusterIndex)
		TopClusterTweet = rank_by_val(ClusterTweets, topic, ind)
		TopTweet.append(TopClusterTweet)
		AllTopTweet.extend(TopClusterTweet)
	outfile1=os.path.dirname(outfile)+"/INDEX_"+str(topic)+"_TOP_"+str(k)+"_KMEANS_"+str(n_clusters)+".txt"
	util.listTotxt(outfile1,ClusterAllIndex,"w+")
	
	# Write the selected top tweets (AllTopTweet holds indices into tweets);
	# writing text to a UTF-8 file avoids the bytes+str mix of the old .encode() call
	with open(outfile, "w+", encoding="utf-8") as f:
		for j in AllTopTweet:
			f.write(str(tweets[j]) + "\n")

	outfile1=os.path.dirname(outfile)+"/CLUSTER_TWEETS_"+str(topic)+"_TOP_"+str(k)+"_KMEANS_"+str(n_clusters)+".txt"
	util.listTocsv(outfile1,TopTweet,"w+")
def process(inPath,outPath,topics) :
	for topic in topics:
		inFile = inPath + '/' + topic + ".csv"
		tweets = util.csvTolist(inFile)
		tweets = [str(tweet).strip("[']") for tweet in tweets]  # strip list-literal wrapping like ["..."] from each row

		print("No. of Tweets extracted for " + str(topic) + "\t\t\t" + str(len(tweets)))
		tweets = make_lowercase(tweets)
		tweets = remove_repetition(tweets)
		tweets = remove_newline(tweets)
		tweets = if_not_topic(tweets, topic.lower())

		# POS-tag tweets; result is one list per tweet of [token, pos_tag, confidence] triples,
		# e.g. [[[tw1_token1, postag, conf], [tw1_token2, postag, conf]], [[tw2_token1, postag, conf]]]
		pos_tweets = tagger.runtagger_parse(tweets)
		tweets = common_except_url(pos_tweets)
		pos_tweets = tagger.runtagger_parse(tweets)  # re-tag the cleaned tweets
		
		print("No. of Tweets after cleaning :"+str(topic)+"\t\t\t"+str(len(tweets)))
		
		outFile=outPath+'/data_'+topic+".txt" 
		util.listTotxt(outFile,tweets,"w+") 
		outFile=outPath+'/POS_'+topic+".csv" 
		util.listTocsv(outFile,pos_tweets,"w+")
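A minimal driver sketch follows. The paths, topic name, and the k/n_clusters values (20 and 5) are illustrative assumptions, not part of the original example:

if __name__ == "__main__":
	topics = ["python"]  # hypothetical topic list
	process("data/raw", "data/processed", topics)

	for topic in topics:
		# Read back the cleaned tweets that process() wrote out
		with open("data/processed/data_" + topic + ".txt", encoding="utf-8") as f:
			tweets = [line.rstrip("\n") for line in f]
		# Cluster into 5 groups and keep ~20 top tweets overall (20 // 5 = 4 per cluster)
		Clustering("data/processed/TOP_" + topic + ".txt", tweets, 5, topic, 20)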