Example #1
import csv
from collections import defaultdict

import apr           # project module: association rule mining
import wordSelector   # project module: word selection and binary feature construction


def exp2():
    # read CSV
    dataset = "stars_data.csv"
    # dataset = "starsPN.csv"
    datasetFObj = open(dataset)
    datasetFReader = csv.DictReader(datasetFObj, delimiter=",")
    rows = list(datasetFReader)
    print "dataset size:" + str(len(rows))
    # construct dict from data
    datasetDict = defaultdict(list)
    for row in rows:
        datasetDict["text"].append(row["text"])
        datasetDict["classLabel"].append(row["stars"])
    # select top words
    bagOfWords, topTenWords = wordSelector.selectWords(datasetDict["text"])
    # construct binary features
    featureSet = wordSelector.constructBinaryFeatures(datasetDict, bagOfWords)
    # json.dump(bagOfWords, open("bagOfWords.txt",'w'))
    # mine association rules based on confidence
    rules = apr.MineAssociationRules(featureSet, bagOfWords, 0.03, 0.05, 2)
    print "sorted rules: "
    # print rules
    # print top 30 rules
    for i, ruleItem in enumerate(rules):
        rule = ruleItem[0]
        chisq = ruleItem[1][0]
        pVal = ruleItem[1][1]
        support = ruleItem[1][2]
        if i == 30:
            break
        # build antecedent and consequent:
        antecedent = rule[0]
        conseq = rule[1]
        # the antecedent/consequent is either a single word index (int) or a tuple of word indices
        la = 1 if isinstance(antecedent, int) else len(antecedent)
        lc = 1 if isinstance(conseq, int) else len(conseq)
        if la == 1 and lc == 1:
            ruleStr = "IF " + bagOfWords[antecedent] + " THEN " + bagOfWords[conseq]
        elif la == 1 and lc == 2:
            ruleStr = "IF " + bagOfWords[antecedent] + " THEN " + bagOfWords[conseq[0]] + " AND " + bagOfWords[conseq[1]]
        elif la == 2 and lc == 1:
            ruleStr = "IF " + bagOfWords[antecedent[0]] + " AND " + bagOfWords[antecedent[1]] + " THEN " + bagOfWords[conseq]
        else:
            continue
        print ruleStr + ", support: " + str(support) + ", interestingness: " + str(chisq) + ", p-val: " + str(pVal)
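wordSelector and apr are project-specific modules whose internals are not shown in this listing. As a rough, hypothetical sketch of how a chi-square "interestingness" score and p-value for a single one-word rule could be computed from the binary feature matrix, the helper below uses scipy.stats.chi2_contingency; the name ruleChiSquare, the row-per-document/column-per-word layout of featureSet, and the support definition are assumptions, not the actual apr API.

import numpy as np
from scipy.stats import chi2_contingency

def ruleChiSquare(featureSet, antecedentIdx, consequentIdx):
    # featureSet is assumed to be a 2-D 0/1 array: one row per document, one column per word
    X = np.asarray(featureSet)
    a = X[:, antecedentIdx].astype(bool)
    c = X[:, consequentIdx].astype(bool)
    # 2x2 contingency table over (antecedent present, consequent present)
    table = np.array([[np.sum(a & c),  np.sum(a & ~c)],
                      [np.sum(~a & c), np.sum(~a & ~c)]])
    chisq, pVal, dof, expected = chi2_contingency(table)
    # support taken here as the fraction of documents containing both words; apr may define it differently
    support = float(np.sum(a & c)) / len(X)
    return chisq, pVal, support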
Example #2
def exp0():
	#read CSV
	dataset = "stars_data.csv"
	datasetFObj = open(dataset)
	datasetFReader = csv.DictReader(datasetFObj, delimiter=',')
	rows = list(datasetFReader)
	print "dataset size:"+str(len(rows))
	#construct dict from data
	datasetDict = defaultdict(list)
	for row in rows:
		datasetDict['text'].append(row['text'])
		datasetDict['classLabel'].append(row['stars'])
	#select top words	
	bagOfWords,topTenWords = wordSelector.selectWords(datasetDict['text'])
	json.dump(bagOfWords, open("bagOfWords.txt",'w'))
	
	#prepare 100 topics from the files written by experiment 1 (standard KMeans)
	kmWPClusters = json.load(open("wordClustersStdKM_WP_50.txt"))
	kmWNPClusters = json.load(open("wordClustersStdKM_WNP_50.txt"))
	kmAllClusters = defaultdict(list)
	for topicLabel in kmWPClusters:
		intLabel = int(float(topicLabel))
		kmAllClusters[intLabel] = kmWPClusters[topicLabel]

	for topicLabel in kmWNPClusters:
		#offset the WNP topic labels by 50 so they do not collide with the WP labels
		intLabel = int(float(topicLabel)) + 50
		kmAllClusters[intLabel] = kmWNPClusters[topicLabel]
	json.dump(kmAllClusters, open("allTopics_KM.txt","w"))
	
	#prepare 100 topics from the files written by experiment 1 (spherical KMeans)
	skmWPClusters = json.load(open("wordClustersSP_KM_WP_50.txt"))
	skmWNPClusters = json.load(open("wordClustersSP_KM_WNP_50.txt"))
	skmAllClusters = defaultdict(list)
	for topicLabel in skmWPClusters:
		intLabel = int(float(topicLabel))
		skmAllClusters[intLabel] = skmWPClusters[topicLabel]

	for topicLabel in skmWNPClusters:
		#offset the WNP topic labels by 50, as above
		intLabel = int(float(topicLabel)) + 50
		skmAllClusters[intLabel] = skmWNPClusters[topicLabel]
	json.dump(skmAllClusters, open("allTopics_SKM.txt","w"))
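Both merge blocks in exp0 follow the same pattern: re-key each 50-cluster file with integer labels and offset the second file's labels by 50 so the combined dictionary holds 100 distinct topics. A small helper along these lines (the name mergeClusterFiles is hypothetical, not part of the project code) captures that pattern for any number of cluster files:

import json

def mergeClusterFiles(fileNames, outName):
	# merge several {topicLabel: [words]} JSON files into one dict with non-overlapping integer keys
	merged = {}
	offset = 0
	for fname in fileNames:
		clusters = json.load(open(fname))
		for topicLabel, words in clusters.items():
			merged[int(float(topicLabel)) + offset] = words
		offset += len(clusters)  # 50 per file in exp0
	json.dump(merged, open(outName, "w"))
	return merged

# the standard-KMeans merge above would then reduce to:
# mergeClusterFiles(["wordClustersStdKM_WP_50.txt", "wordClustersStdKM_WNP_50.txt"], "allTopics_KM.txt")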
Example #3
def exp1(clusteringMethod):
	#read CSV
	dataset = "stars_data.csv"
	datasetFObj = open(dataset)
	datasetFReader = csv.DictReader(datasetFObj, delimiter=',')
	rows = list(datasetFReader)
	#construct dict from data
	datasetDict = defaultdict(list)
	for row in rows:
		datasetDict['text'].append(row['text'])
		datasetDict['classLabel'].append(row['stars'])
	#select top words	
	bagOfWords,topTenWords = wordSelector.selectWords(datasetDict['text'])
	#print bagOfWords
	#construct features for KMeans using selected words, to select topics for NBC
	WP,WNP = constructFeatresForKMeans(datasetDict, 1, bagOfWords)
	
	clusterSizes = [10,20,50,100,200]
	resultsStdKMWP = defaultdict(list)
	resultsStdKMWNP = defaultdict(list)
	for size in clusterSizes:
		if clusteringMethod == 1:
			#WP
			centroids, clusterLabels, clusterScore, sumOfDistance = clusterWithRandomRestarts(WP, size)
			resultsStdKMWP['size'].append(size)
			resultsStdKMWP['cluster score'].append(clusterScore)
			clusters = defaultdict(list)
			for i,label in enumerate(clusterLabels):
				clusters[label].append(bagOfWords[i])
			#write word clustering for each size into a file:
			fname = "wordClustersStdKM_WP_"+str(size)+".txt"
			json.dump(clusters, open(fname,'w'))
			#WNP
			centroids, clusterLabels, clusterScore, sumOfDistance = clusterWithRandomRestarts(WNP, size)			
			resultsStdKMWNP['size'].append(size)
			resultsStdKMWNP['cluster score'].append(clusterScore)
			clusters = defaultdict(list)
			for i,label in enumerate(clusterLabels):
				clusters[label].append(bagOfWords[i])	
			#write word clustering for each size into a file:
			fname = "wordClustersStdKM_WNP_"+str(size)+".txt"
			json.dump(clusters, open(fname,'w'))

		elif clusteringMethod == 2:
			#WP
			centroids, clusterLabels, clusterScore, sumOfDistance = spclusterWithRandomRestarts(WP, size)
			resultsStdKMWP['size'].append(size)
			resultsStdKMWP['cluster score'].append(clusterScore)
			clusters = defaultdict(list)
			for i,label in enumerate(clusterLabels):
				clusters[label].append(bagOfWords[i])
			#write word clustering for each size into a file:
			fname = "wordClustersSP_KM_WP_"+str(size)+".txt"
			json.dump(clusters, open(fname,'w'))
		
			#WNP
			centroids, clusterLabels, clusterScore, sumOfDistance = spclusterWithRandomRestarts(WNP, size)
			resultsStdKMWNP['size'].append(size)
			resultsStdKMWNP['cluster score'].append(clusterScore)
			clusters = defaultdict(list)
			for i,label in enumerate(clusterLabels):
				clusters[label].append(bagOfWords[i])	
			#write word clustering for each size into a file:
			fname = "wordClustersSP_KM_WNP_"+str(size)+".txt"
			json.dump(clusters, open(fname,'w'))
	
	rfName = "results_WP_"+str(clusteringMethod)+".txt"
	rfName2 = "results_WNP_"+str(clusteringMethod)+".txt"
	json.dump(resultsStdKMWP, open(rfName,'w'))
	json.dump(resultsStdKMWNP, open(rfName2,'w'))
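clusterWithRandomRestarts and spclusterWithRandomRestarts are defined elsewhere in the project, so only their call sites appear above. Below is a minimal sketch of what the standard variant could look like on top of scikit-learn: KMeans already keeps the best of n_init random initializations, and the spherical variant is approximated here by L2-normalizing the feature rows before clustering. The Sketch-suffixed names, the restarts parameter, and the exact meaning of clusterScore/sumOfDistance are assumptions rather than the project's actual implementation, but the return value mirrors the 4-tuple unpacked in exp1.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize

def clusterWithRandomRestartsSketch(features, k, restarts=10):
	# standard k-means with several random restarts; sklearn keeps the best run (lowest inertia)
	X = np.asarray(features, dtype=float)
	km = KMeans(n_clusters=k, n_init=restarts, init="random").fit(X)
	# centroids, per-point cluster labels, a score (negative inertia), and the sum of squared distances
	return km.cluster_centers_, km.labels_, km.score(X), km.inertia_

def spclusterWithRandomRestartsSketch(features, k, restarts=10):
	# rough spherical k-means: L2-normalize the rows, then run standard k-means
	X = normalize(np.asarray(features, dtype=float))
	return clusterWithRandomRestartsSketch(X, k, restarts)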