# Example no. 1
0
def ClusterQuestions(args):
    """Cluster the questions in a text file with k-means and save the model.

    Steps performed, in order:

    1. Load the question lookup table from the pickle file ``Question_Map``.
    2. Call ``getDump`` on the input file, then read the input file; every
       stripped line is treated as one "title" whose first
       whitespace-separated token is a key into the lookup table.
    3. Load the cluster count (``ClusterCount``) and the tf-idf table
       (``dumpFile``) from their pickle files.
    4. Vectorise every non-empty title with ``vectorspaced`` and cluster the
       vectors with ``KMeansClusterer`` using ``cosine_distance``.
    5. Draw a histogram of the cluster assignments.
    6. Group the looked-up questions by cluster id and pickle
       ``{"cl": clusterer, "cls": assignments, "QuestMap": groups}`` to
       ``kMeansModel.km``.
    7. Show the histogram with ``plt.show()``.

    Args:
        args: argv-style sequence. ``args[1]`` (optional) overrides the input
            file name, default ``'SuperQuestionSet_PosTagged.txt'``;
            ``args[2]`` (optional) overrides the tf-idf dump file name,
            default ``"tf_idf.dump"``.

    Returns:
        None. Results are written to ``kMeansModel.km`` and displayed.

    Raises:
        IOError/OSError: if any of the input files cannot be opened.
        KeyError: if the first token of a title is not a key of the
            ``Question_Map`` table.
    """
    # SECURITY NOTE: pickle.load can execute arbitrary code embedded in the
    # file. "Question_Map", "ClusterCount" and the tf-idf dump must only ever
    # come from a trusted source.
    # Pickle files are opened in binary mode: text mode corrupts binary
    # pickles on Windows and is rejected outright by Python 3.
    with open("Question_Map", "rb") as qm:
        qs_dict = pickle.load(qm)

    filename = 'SuperQuestionSet_PosTagged.txt'
    dumpFile = "tf_idf.dump"
    if len(args) >= 2:
        filename = args[1]
    if len(args) >= 3:
        dumpFile = args[2]

    # NOTE(review): getDump only receives the input file name. Presumably it
    # (re)generates the tf-idf dump, but whether it honours a custom
    # ``dumpFile`` cannot be confirmed from this function -- verify.
    getDump(filename)

    # Read everything up front so the file is not held open while clustering.
    with open(filename) as title_file:
        job_titles = [line.strip() for line in title_file]

    words = get_words(job_titles)

    with open("ClusterCount", "rb") as cc:
        no_of_clusters = pickle.load(cc)
    with open(dumpFile, "rb") as inf:
        dict_tf_idf = pickle.load(inf)

    cluster = KMeansClusterer(no_of_clusters, cosine_distance,
                              repeats=10, avoid_empty_clusters=True)

    # print(...) with a single argument produces the same output on
    # Python 2 and Python 3.
    print("Generating clusters")

    # Blank lines are not clustered. Keep each surviving title together with
    # its ORIGINAL line index (vectorspaced is given that index) so that the
    # cluster assignments can later be paired with exactly the titles that
    # produced them.
    indexed_titles = [(i, title) for i, title in enumerate(job_titles) if title]
    vectors = [vectorspaced(title, i, words, dict_tf_idf)
               for i, title in indexed_titles]
    clusters = cluster.cluster(vectors, True)

    print("Generating graph")
    chart = plt.figure()
    splot = chart.add_subplot(111)
    # One histogram bin per cluster mean.
    cols = len(cluster.means())
    splot.hist(clusters, cols, color='blue', alpha=0.6)

    # Group the looked-up questions by cluster id. Pairing against the
    # filtered titles (not the raw line list) keeps assignments aligned even
    # when the input contains blank lines, and guarantees split() yields at
    # least one token.
    QuestMap = {}
    clustered_titles = [title for _, title in indexed_titles]
    for cluster_id, title in sorted(zip(clusters, clustered_titles)):
        question_key = title.split()[0]
        QuestMap.setdefault(cluster_id, []).append(qs_dict[question_key])

    clusObj = {"cl": cluster, "cls": clusters, "QuestMap": QuestMap}
    with open("kMeansModel.km", 'wb') as f:
        pickle.dump(clusObj, f)

    # Blocks until the plot window is closed (for interactive backends), so
    # the model is saved first.
    plt.show()