def _formatItemset(itemset, bagOfWords):
    # An itemset is either a single word index (int) or a sequence of
    # indices; render it as "wordA" / "wordA AND wordB".
    if isinstance(itemset, int):
        itemset = (itemset,)
    return " AND ".join(bagOfWords[idx] for idx in itemset)


def exp2():
    """Mine association rules over binary word features of the stars dataset
    and print the top 30 rules with their support, chi-square interestingness
    and p-value."""
    # read CSV
    dataset = "stars_data.csv"
    # dataset = "starsPN.csv"
    with open(dataset) as datasetFObj:  # fix: close the dataset file when done
        rows = list(csv.DictReader(datasetFObj, delimiter=","))
    print("dataset size:" + str(len(rows)))
    # construct dict from data
    datasetDict = defaultdict(list)
    for row in rows:
        datasetDict["text"].append(row["text"])
        datasetDict["classLabel"].append(row["stars"])
    # select top words
    bagOfWords, topTenWords = wordSelector.selectWords(datasetDict["text"])
    # construct binary features
    featureSet = wordSelector.constructBinaryFeatures(datasetDict, bagOfWords)
    # json.dump(bagOfWords, open("bagOfWords.txt",'w'))
    # mine association rules based on confidence
    rules = apr.MineAssociationRules(featureSet, bagOfWords, 0.03, 0.05, 2)
    print("sorted rules: ")
    # print top 30 rules
    for i, ruleItem in enumerate(rules):
        if i == 30:
            break
        rule = ruleItem[0]
        chisq = ruleItem[1][0]
        pVal = ruleItem[1][1]
        support = ruleItem[1][2]
        # build antecedent and consequence
        antecedent = rule[0]
        conseq = rule[1]
        # fix: the original duplicated the formatting for the (1,1), (1,2)
        # and (2,1) antecedent/consequent size combinations and silently
        # skipped any other combination (e.g. two words on each side);
        # the shared formatter handles all sizes uniformly.
        print("IF " + _formatItemset(antecedent, bagOfWords)
              + " THEN " + _formatItemset(conseq, bagOfWords)
              + ", support: " + str(support)
              + ", interestingness: " + str(chisq)
              + ", p-val: " + str(pVal))
def _mergeTopicFiles(wpFname, wnpFname, outFname, offset=50):
    # Merge two word-cluster JSON files (WP and WNP halves) into one topic
    # dict: WP cluster labels keep their integer value, WNP labels are
    # shifted by `offset` so the two label spaces do not collide.
    with open(wpFname) as f:
        wpClusters = json.load(f)
    with open(wnpFname) as f:
        wnpClusters = json.load(f)
    allClusters = defaultdict(list)
    for topicLabel in wpClusters:
        allClusters[int(float(topicLabel))] = wpClusters[topicLabel]
    for topicLabel in wnpClusters:
        allClusters[int(float(topicLabel)) + offset] = wnpClusters[topicLabel]
    with open(outFname, "w") as out:  # fix: close the output file
        json.dump(allClusters, out)


def exp0():
    """Dump the selected bag of words and combine the 50-cluster WP/WNP
    outputs of experiment 1 into 100-topic files, for both the standard and
    the spherical KMeans runs."""
    # read CSV
    dataset = "stars_data.csv"
    with open(dataset) as datasetFObj:  # fix: close the dataset file when done
        rows = list(csv.DictReader(datasetFObj, delimiter=','))
    print("dataset size:" + str(len(rows)))
    # construct dict from data
    datasetDict = defaultdict(list)
    for row in rows:
        datasetDict['text'].append(row['text'])
        datasetDict['classLabel'].append(row['stars'])
    # select top words
    bagOfWords, topTenWords = wordSelector.selectWords(datasetDict['text'])
    with open("bagOfWords.txt", 'w') as f:  # fix: close the output file
        json.dump(bagOfWords, f)
    # fix: the StandardKMeans and SphericalKMeans merge sections were
    # copy-paste duplicates; both now go through the shared helper above.
    # prepare 100 topics from files written by experiment 1: StandardKMeans
    _mergeTopicFiles("wordClustersStdKM_WP_50.txt",
                     "wordClustersStdKM_WNP_50.txt",
                     "allTopics_KM.txt")
    # prepare 100 topics from files written by experiment 1: SphericalKMeans
    _mergeTopicFiles("wordClustersSP_KM_WP_50.txt",
                     "wordClustersSP_KM_WNP_50.txt",
                     "allTopics_SKM.txt")
def _clusterAndDump(features, size, clusterFn, fnamePrefix, results, bagOfWords):
    # Run one clustering pass with `clusterFn`, record the size and score in
    # `results`, and write each cluster's word membership to
    # "<fnamePrefix><size>.txt".
    centroids, clusterLabels, clusterScore, sumOfDistance = clusterFn(features, size)
    results['size'].append(size)
    results['cluster score'].append(clusterScore)
    clusters = defaultdict(list)
    for i, label in enumerate(clusterLabels):
        clusters[label].append(bagOfWords[i])
    # write word clustering for each size into a file
    with open(fnamePrefix + str(size) + ".txt", 'w') as f:  # fix: close the file
        json.dump(clusters, f)


def exp1(clusteringMethod):
    """Cluster the selected words with standard (clusteringMethod == 1) or
    spherical (clusteringMethod == 2) KMeans for several cluster counts and
    write per-size word clusters plus overall score summaries to disk."""
    # read CSV
    dataset = "stars_data.csv"
    with open(dataset) as datasetFObj:  # fix: close the dataset file when done
        rows = list(csv.DictReader(datasetFObj, delimiter=','))
    # construct dict from data
    datasetDict = defaultdict(list)
    for row in rows:
        datasetDict['text'].append(row['text'])
        datasetDict['classLabel'].append(row['stars'])
    # select top words
    bagOfWords, topTenWords = wordSelector.selectWords(datasetDict['text'])
    # construct features for KMeans using selected words, to select topics for NBC
    WP, WNP = constructFeatresForKMeans(datasetDict, 1, bagOfWords)
    clusterSizes = [10, 20, 50, 100, 200]
    resultsStdKMWP = defaultdict(list)
    resultsStdKMWNP = defaultdict(list)
    # fix: the four near-identical cluster/record/dump sections are collapsed
    # into the shared helper above; behavior and output filenames are unchanged.
    for size in clusterSizes:
        if clusteringMethod == 1:
            _clusterAndDump(WP, size, clusterWithRandomRestarts,
                            "wordClustersStdKM_WP_", resultsStdKMWP, bagOfWords)
            _clusterAndDump(WNP, size, clusterWithRandomRestarts,
                            "wordClustersStdKM_WNP_", resultsStdKMWNP, bagOfWords)
        elif clusteringMethod == 2:
            _clusterAndDump(WP, size, spclusterWithRandomRestarts,
                            "wordClustersSP_KM_WP_", resultsStdKMWP, bagOfWords)
            _clusterAndDump(WNP, size, spclusterWithRandomRestarts,
                            "wordClustersSP_KM_WNP_", resultsStdKMWNP, bagOfWords)
    with open("results_WP_" + str(clusteringMethod) + ".txt", 'w') as f:
        json.dump(resultsStdKMWP, f)
    with open("results_WNP_" + str(clusteringMethod) + ".txt", 'w') as f:
        json.dump(resultsStdKMWNP, f)