def main():
    """Analyze 'example.train.csv' and write a text report to 'analyszedata.out'.

    While the analysis runs, ``sys.stdout`` is redirected to the report file,
    so everything printed by this function and by the helpers it calls lands
    in that file.  The function then shows two matplotlib plots:

    1. number of features per distinct-value count, and
    2. the running (cumulative) total of that number.

    ``sys.stdout`` is always restored and the report file always closed,
    even when one of the analysis steps raises.
    """
    ### redirecting stdout
    orig_stdout = sys.stdout
    # open() rather than file(): file() only exists in Python 2.
    f = open('analyszedata.out', 'w')
    sys.stdout = f
    try:
        ### Load file
        dataFile = 'example.train.csv'
        data = util.loadCsv(dataFile)
        # Drop the last column (countTargetValues reads that column as the
        # target), keeping only the remaining columns for analysis.
        data = np.asarray(data)[:, :-1]

        ### Get and output feature types
        analayzeFeatureType(data)

        ### Get and output value counts for each feature
        getValueCountsAll(data)
        # NOTE(review): distinctValCntFeatureMap is not assigned in this
        # function; presumably a module-level dict populated by
        # getValueCountsAll -- confirm.
        print("\n%s The number of distinct values and the corresponding feature IDs." % util.RESULT)
        print(distinctValCntFeatureMap)

        distinctValCntFeatureCntMap = dict()
        for distinctCnt, featureIds in distinctValCntFeatureMap.items():
            distinctValCntFeatureCntMap[distinctCnt] = len(featureIds)
        print("\n%s The number of distinct values and # of features." % util.RESULT)
        print(distinctValCntFeatureCntMap)

        # plot
        # Start from empty lists: pre-filling them with zeros and then
        # appending would add one bogus (0, 0) point per real data point.
        x = []
        singley = []
        accumy = []
        accumValue = 0
        # Walk the keys in ascending order so the x axis is monotonic and
        # the running total is a genuine cumulative count.
        for distinctCnt in sorted(distinctValCntFeatureCntMap):
            featureCnt = distinctValCntFeatureCntMap[distinctCnt]
            x.append(distinctCnt)
            singley.append(featureCnt)
            accumValue += featureCnt
            accumy.append(accumValue)

        plt.plot(x, singley)
        plt.xlabel('# of distinct values in a feature')
        plt.ylabel('# of features')
        plt.show()

        plt.plot(x, accumy)
        plt.xlabel('# of distinct values in a feature')
        plt.ylabel('accumalated # of features')
        plt.show()
    finally:
        ### redirecting stdout
        # Undo the redirection before closing, so nothing is ever written
        # to a closed file object.
        sys.stdout = orig_stdout
        f.close()


def countTargetValues(dataFile):
    """Print how many rows of *dataFile* have a non-zero target value.

    The target is taken from the last column of the data returned by
    ``util.loadCsv(dataFile)``; each entry is converted with ``int`` before
    the non-zero entries are counted.

    Args:
        dataFile: path handed to ``util.loadCsv``.
    """
    # Convert before slicing: main() has to wrap the loadCsv result in
    # np.asarray before using 2-D indexing, so a plain list is assumed
    # possible here as well.  np.asarray is a no-op for an existing ndarray.
    data = np.asarray(util.loadCsv(dataFile))
    targets = data[:, -1]
    # A list (not map()) so the result is a real sequence on Python 3 too.
    targets = [int(value) for value in targets]
    print(np.count_nonzero(np.asarray(targets)))