files.sort()
for i in xrange(len(files)):
    filename = files[i]
    data = np.loadtxt(filename)
    pwo = removeoutliers(data)
    postfix = filename[-4:]
    fileout = filename[:-4] + "_wo_outliers" + postfix
    np.savetxt(os.path.basename(fileout), pwo)
    print "wo outliers saved: %s" % os.path.basename(fileout)
    # Calculate avg and std for the last file; use that information
    # to convert the z-scores back to the original scale later.
    if i == len(files) - 1:
        writeoutStdAndAvg(pwo)
    zscoredata = Utils.zscore(pwo)
    fileout = filename[:-4] + "_zscore_wo_outliers" + postfix
    np.savetxt(os.path.basename(fileout), zscoredata)
    print "zscore wo outliers saved: %s" % os.path.basename(fileout)
    if i == 0:
        # Store the points from the first file, for centroid generation.
        print zscoredata
        firstpoints = zscoredata
        k = 3  # 3 features
        initialc = np.array(random.sample(zscoredata, k))  # k distinct random rows
        fileoutcentroids = filename[:-4] + "_zscore_wo_outliers.centroids" + postfix
        np.savetxt(os.path.basename(fileoutcentroids), initialc)
        print "centroids saved: %s" % os.path.basename(fileoutcentroids)
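# writeoutStdAndAvg is called above but its implementation is not shown.
# A minimal sketch, assuming it saves the per-column mean and standard
# deviation so the z-scores can be converted back to the original scale
# later; the output name "std_avg.dat" is a hypothetical choice, not
# from the original (uses numpy as np, as elsewhere in this script):
def writeoutStdAndAvg(points):
    avg = np.mean(points, axis=0)  # per-feature mean
    std = np.std(points, axis=0)   # per-feature standard deviation
    np.savetxt("std_avg.dat", np.vstack((avg, std)))  # row 0: avg, row 1: std
    print "avg/std saved: std_avg.dat"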
def removeoutliers(points):
    print "total number of points: %s" % len(points)
    tmpzscoredata = Utils.zscore(points)  # z-scores used only to find outliers
    po = points[~(np.abs(tmpzscoredata) > 3).any(1)]  # drop rows with any |z| > 3
    print "removed: %s" % (len(points) - len(po))
    return po
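# Utils.zscore is used throughout but not shown. A minimal sketch of a
# column-wise z-score, which is what the outlier filter above assumes
# (one z-score per feature per row; uses numpy as np, as above):
def zscore(points):
    # (x - mean) / std, computed per column via broadcasting
    return (points - np.mean(points, axis=0)) / np.std(points, axis=0)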
for k, v in d.iteritems():
    # filebyvalues.write('%s %s\n' % (str(k), ' '.join(map(str, v))))
    filebyvalues.write('%s\n' % ' '.join(map(str, v)))
    # i += 1
    # if i == 10:
    #     break
print 'done.'

print 'All done!'

# Normalize files and pick random distinct initial centroids
files = glob.glob(filespath + '.dat')
for filename in files:
    data = np.loadtxt(filename)
    zscoredata = Utils.zscore(data)
    postfix = filename[-4:]
    fileout = filename[:-4] + '_zscore' + postfix
    np.savetxt(fileout, zscoredata)
    k = 3  # 3 features
    centroids = Utils.getInitialMeans(zscoredata, k)
    fileoutcentroids = filename[:-4] + '_zscore.centroids' + postfix
    np.savetxt(fileoutcentroids, centroids)
    print 'zscore saved: %s' % fileout
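# Utils.getInitialMeans is not shown either. Given the comment above
# ("pick random distinct initial centroids"), a minimal sketch that
# samples k distinct rows, mirroring the random.sample call used for
# the first file earlier in this script:
def getInitialMeans(points, k):
    return np.array(random.sample(points, k))  # k distinct random rows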