def distance_pruner(zlst): #Prunes the cluster tree (zlst) using the distance between cluster centroids
    if args['v'] > -1:
        sys.stderr.write("\n#Pruning data based on Euclidean distance between clusters.\n")
        print zlst, " #Old zlst before distprune"
    import dist
    z0 = zlst[0]
    pairs = []
    #Pair up clusters whose representative rows in z0 are closer than the 0.3 distance cutoff
    for _i, i in enumerate(data[z0]):
        for _j, j in enumerate(data[z0]):
            if i != j:
                if dist.dist(i, j, z0, indep, nump) < 0.3:
                    pair = ['__' + str(_i + 1), '__' + str(_j + 1)]
                    if pair not in pairs and pair[::-1] not in pairs:
                        pairs.append(pair)

    def repaired(pairs): #transitively merge pairs that share a cluster
        for i in pairs:
            for j in pairs:
                if i != j:
                    if i[0] in j or i[1] in j:
                        pairs = [list(set(i + j))] + \
                                [k for k in pairs if k not in [i, j]]
                        return repaired(pairs)
        return pairs

    repairs = repaired(pairs)
    ps = []
    for i in repairs:
        ps += i
    for i in zlst[1:]: #clusters untouched by the merge survive as singleton groups
        if i not in ps:
            repairs.append([i])
    temp_row = {}
    for ind, p in enumerate(repairs):
        temp_row[ind] = []
        for i in p:
            temp_row[ind] += data[i]
    col = colname[z0]
    for Z in zlst:
        reader.removeTable(Z)
    zlst = [None]
    #Rebuild one table per merged group, renumbering the cluster id column
    for ind, value in enumerate(temp_row.values()):
        Z = '__' + str(ind + 1)
        reader.makeTable(col, Z)
        for r in value:
            reader.addRow(r[:len(r) - 1] + [ind], Z)
        zlst.append(Z)
    xy_lib.buildzero(zlst, '', args['e'])
    if args['v'] > -1:
        print zlst, " #New zlst after distprune"
    return zlst

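#A minimal, self-contained sketch (not called by the pipeline) of the grouping
#behaviour that repaired() above relies on: any two pairs sharing a cluster
#name collapse into one group, transitively. The '__<n>' names below are
#hypothetical stand-ins for the labels built from row indices in data[z0].
def _demo_pair_merge():
    def merge(pairs):
        for i in pairs:
            for j in pairs:
                if i != j and (i[0] in j or i[1] in j):
                    return merge([list(set(i + j))] +
                                 [k for k in pairs if k not in [i, j]])
        return pairs
    groups = merge([['__1', '__2'], ['__2', '__3'], ['__5', '__6']])
    print groups  #e.g. [['__1', '__3', '__2'], ['__5', '__6']]; order inside a group varies
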
def discrete1(table, ntable, bins, b, label):
    #Copy table into ntable, replacing numeric values with their bin labels
    for d in range(len(table.data[0])):
        a = []
        for k in range(len(table.data)):
            val = table.data[k][d]
            if val != '?':
                if k in table.num:
                    k = table.num.index(k)
                    val = label(k, float(val), bins, b, table)
            a += [str(val)]
        reader.addRow(a, ntable)

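#A hedged sketch of the label() callback that discrete1 expects: it receives
#the numeric-column position k, the raw float value, the bin count, the extra
#parameter b (unused here) and the table. The per-column ranges table.min and
#table.max are hypothetical attribute names used only to keep the sketch
#self-contained; substitute whatever range bookkeeping the real table carries.
def _equal_width_label(k, val, bins, b, table):
    lo, hi = table.min[k], table.max[k]  #hypothetical per-column range
    if hi == lo:
        return 0
    width = (hi - lo) / float(bins)
    return min(int((val - lo) / width), bins - 1)  #bin index in 0..bins-1
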
def attrtable(table, attrlst):
    lst, name, row = [], [], []
    for s in range(len(table.name)):
        if table.name[s][1:] in attrlst or table.name[s] in attrlst:
            lst += [s]
    lst += [table.klass[0]]
    name = [table.name[i] for i in lst]
    ntable = tablestr.Table()
    reader.makeTable(name, ntable)
    for s in range(len(table.data[0])):
        row = [table.data[k][s] for k in lst]
        reader.addRow(row, ntable)
    return ntable

def xval(start, stop, rows, table, f):
    rmax = len(rows)
    r = 0
    train = reader.makeTable(table.header)
    test = reader.makeTable(table.header)
    while r < rmax:
        d = rows[r]
        r += 1
        #r is 1-based here, so rows in positions start..stop (inclusive) feed the training table
        if r >= start and r <= stop:
            reader.addRow(d, train)
        else:
            reader.addRow(d, test)
    print test
    return f.zeror(train.klass.expected, test.klass)

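#A hedged sketch of a fold driver around the five-argument xval() above
#(assumed to be the one in scope). It splits the row positions into n_folds
#equal 1-based windows, since xval() increments r before comparing against
#start/stop; n_folds and scores are illustrative names.
def _xval_driver(rows, table, f, n_folds=10):
    scores = []
    size = len(rows) / n_folds  #Python 2 integer division
    for fold in range(n_folds):
        start = fold * size + 1
        stop = (fold + 1) * size
        scores.append(xval(start, stop, rows, table, f))
    return scores
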
def makeNewTable(has, c1, table, tile, centable):
    c1 = c1 * 100
    z = table.name.index('$_ZZ')
    newtable = tablestr.Table()
    reader.makeTable(table.name, newtable)
    for one in range(len(has)):
        d = has[one]
        row1 = [table.data[s][d] for s in range(len(table.data))]
        row1[z] = str(c1)
        reader.addRow(row1, newtable)
    centers = tablestr.centroid(newtable) #centers[0] is mu or mode
    centers[0][z] = str(c1)
    reader.addRow(centers[0], centable[0])
    centable[c1 / 100] = newtable

def widen(table, x, y):
    adds = table.name[:]
    adds += ['$_XX']
    adds += ['$_YY']
    adds += ['$_Hell']
    adds += ['$_ZZ']
    ntable = tablestr.Table()
    reader.makeTable(adds, ntable)
    for s in range(len(table.data[0])):
        row = [table.data[k][s] for k in range(len(table.data))]
        #tmp = row[:]
        row += [x[s]]
        row += [y[s]]
        row += [table.data[table.klass[0]][s]]
        row += [str(0)]
        reader.addRow(row, ntable)
    return ntable

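#A hedged usage sketch for widen(): $_XX and $_YY take per-row coordinates
#(typically from a 2-D projection of the table), $_Hell is seeded with the
#class value and $_ZZ starts at "0". The row-index coordinates below are a
#trivial stand-in just to keep the sketch self-contained; any two lists with
#one value per row will do.
def _demo_widen(table):
    nrows = len(table.data[0])
    x = [float(i) for i in range(nrows)]  #stand-in x coordinates
    y = [float(i) for i in range(nrows)]  #stand-in y coordinates
    return widen(table, x, y)
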
def mutate(conds, wcluster, appender): #mutates wcluster wrt conds
    temp_data = []
    for c in conds:
        ind = colname[wcluster].index(c[0])
        for d in data[wcluster]:
            le = c[1]
            if le:
                if d[ind] <= c[2]:
                    if d not in temp_data:
                        temp_data.append(d)
            else:
                if d[ind] > c[2]:
                    if d not in temp_data:
                        temp_data.append(d)
    wced = wcluster + appender
    reader.makeTable(colname[wcluster], wced)
    for r in temp_data:
        reader.addRow(r, wced)
    return wced

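#A hedged usage sketch for mutate(): each condition is a (column name,
#le-flag, threshold) triple; a truthy flag keeps rows whose value is <= the
#threshold, otherwise rows whose value is > the threshold. The column names
#and cut points below are hypothetical.
def _demo_mutate(wcluster):
    conds = [('$age', True, 30.0), ('$salary', False, 50000.0)]  #hypothetical conditions
    return mutate(conds, wcluster, '_mutated')
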
def xval(start, stop, rows, tables):
    testT = tablestr.Table()
    trainT = tablestr.Table()
    reader.makeTable(tables.name, testT)
    reader.makeTable(tables.name, trainT)
    for r in range(len(rows)):
        d = rows[r]
        a = []
        for j in range(len(tables.order)):
            a.append(tables.data[j][d])
        if r >= start and r < stop: #belonging to testing data set
            reader.addRow(a, testT)
        else:
            reader.addRow(a, trainT)
    testT = reader.klasses(testT)
    trainT = reader.klasses(trainT)
    tables = {}
    tables['train'] = trainT
    tables['test'] = testT
    return tables

def tshortener(z, zlst, colname, data, dep, indep, patt=1.0, discretize=True):
    #Infogain-based pruning of columns, optionally followed by discretization
    class Bucket: #one bucket per column, holding its split pairs of data
        def __init__(self, name):
            self.pairs = [] #unsorted row pairs
            self.name = name
            self.wsum = 0
            self.dinds = {} #sorted split indices
            self.lo = {}
            self.hi = {}
        def addpairs(self, pairs):
            self.pairs.append(pairs)
        def addwsum(self, wsum):
            self.wsum = wsum
        def __repr__(self):
            s = 'n: ' + str(self.name) + ":"
            s += ' l: ' + str(len(self.pairs))
            s += ' e: ' + str(self.wsum) + '\n'
            return s
    from globfile import buckets
    outcols = []
    for key, value in buckets.items():
        buckets[key] = None
    #Gather (value, cluster id) pairs per independent column across all clusters
    for Z in zlst[1:]:
        for c in indep[Z]:
            if c == 'C_id':
                continue
            if c not in buckets.keys():
                buckets[c] = Bucket(c)
            elif buckets[c] is None:
                buckets[c] = Bucket(c)
            ind = colname[Z].index(c)
            cind = colname[Z].index('C_id')
            for r in data[Z]:
                buckets[c].addpairs((r[ind], str(r[cind])))
        reader.removeTable(Z)
    buckets = weighted_entropies(buckets)
    #Keep the patt fraction of columns with the lowest weighted entropy
    vals = buckets.values()[:]
    vals.sort(key=lambda x: x.wsum, reverse=False)
    for i in range(0, int(len(vals) * patt)):
        outcols.append(vals[i].name)
    zshort = 'shortenedz'
    outcols = [i for i in colname[z] if i in outcols]
    print outcols, "#infogained"
    #Convert outcols to discrete attributes
    if discretize:
        outcols = [c[1:] for c in outcols]
        print outcols, "#discretized"
    reader.makeTable(outcols + dep[z], zshort)
    for r in data[z]:
        temp = []
        for i, c in enumerate(colname[z]):
            if discretize:
                if c[1:] in outcols or c in dep[z]:
                    temp.append(r[i])
            else:
                if c in outcols + dep[z]:
                    temp.append(r[i])
        reader.addRow(temp, zshort)
    if discretize:
        discretizer(zshort, buckets)
    for Z in zlst:
        reader.removeTable(Z)
    #discretizer(zshort,buckets)
    return zshort

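#An illustrative stand-in (not the repository's weighted_entropies) for the
#quantity the Buckets are ranked on: a size-weighted sum of cluster-id
#entropies after splitting one column's (value, cluster id) pairs at a cut
#point. The median cut below is only to keep the sketch short; numeric column
#values are assumed.
def _demo_weighted_entropy(pairs):
    import math
    def entropy(labels):
        total = float(len(labels))
        counts = {}
        for l in labels:
            counts[l] = counts.get(l, 0) + 1
        return -sum((n / total) * math.log(n / total, 2) for n in counts.values())
    pairs = sorted(pairs)  #sort by column value
    cut = len(pairs) / 2   #Python 2 integer division; median split
    lo = [cid for _, cid in pairs[:cut]]
    hi = [cid for _, cid in pairs[cut:]]
    n = float(len(pairs))
    return (len(lo) / n) * entropy(lo) + (len(hi) / n) * entropy(hi)
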