def generateVantagePointsWithPattern(options):
    """Build the 'major' and 'minor' pattern vantage points and save both sets.

    For each dimension the symbols found in the first ``numberOfData`` rows
    are ranked by how often they occur.  The k-th major vantage point takes
    the k-th most frequent symbol of every dimension; the k-th minor vantage
    point takes the k-th entry counted from the least frequent end.

    Args:
        options: mapping providing 'numberOfData', 'numberOfDimension',
            'numberOfVP', 'numberOfAlphabet' and 'typeOfVP'.  It is also
            handed to ``utils.getDataFileName`` to locate the data file.

    Raises:
        IndexError: if a dimension holds fewer than ``numberOfVP`` distinct
            symbols.
    """
    numberOfData = options['numberOfData']
    dim = options['numberOfDimension']
    numberOfVP = options['numberOfVP']
    cardinality = options['numberOfAlphabet']
    # Looked up but not used: the output names are fixed to 'major'/'minor'.
    typeOfVP = options['typeOfVP']
    datas = utils.getDataInFile(utils.getDataFileName(options))

    majorPattern = []
    minorPattern = []
    for _ in xrange(numberOfVP):
        majorPattern.append([])
        minorPattern.append([])

    for column in xrange(dim):
        # The tally holds "occurrences - 1"; only the ordering is used below.
        tally = {}
        for rowIndex in xrange(numberOfData):
            symbol = datas[rowIndex][column]
            tally[symbol] = tally.get(symbol, -1) + 1
        ranked = sorted(tally.items(), key=lambda item: item[1], reverse=True)
        for rank in xrange(numberOfVP):
            majorPattern[rank].append(ranked[rank][0])
            minorPattern[rank].append(ranked[-1 - rank][0])

    fileNameTemplate = 'vp/vp_%d_%d_%d_%s.txt'
    utils.writeDataToFile(
        fileNameTemplate % (dim, numberOfVP, cardinality, 'major'),
        majorPattern)
    utils.writeDataToFile(
        fileNameTemplate % (dim, numberOfVP, cardinality, 'minor'),
        minorPattern)
def generateVantagePointsWithManyAlgorithm(options):
    """Greedily choose vantage points from the data rows and save them.

    The first data row seeds the selection.  Each round tentatively adds
    every row that is not already a vantage point, scores the tentative set
    with ``calculateMany`` and keeps the row with the highest score (the
    earliest such row on ties).

    Args:
        options: mapping providing 'numberOfData', 'numberOfDimension',
            'numberOfVP', 'numberOfAlphabet' and 'typeOfVP'.  It is also
            handed to ``utils.getDataFileName`` to locate the data file.
    """
    numberOfData = options['numberOfData']
    dim = options['numberOfDimension']
    numberOfVP = options['numberOfVP']
    cardinality = options['numberOfAlphabet']
    typeOfVP = options['typeOfVP']
    datas = utils.getDataInFile(utils.getDataFileName(options))

    vps = [datas[0]]
    while len(vps) < numberOfVP:
        print(len(vps))
        bestScore, bestIndex = -1, -1
        for index in xrange(len(datas)):
            candidate = datas[index]
            if any(candidate == chosen for chosen in vps):
                continue
            score = calculateMany(vps + [candidate])
            if score > bestScore:
                bestScore, bestIndex = score, index
        # NOTE: when no row beat the initial score, bestIndex is still -1 and
        # the last data row is appended.
        vps.append(datas[bestIndex])

    utils.writeDataToFile(
        'vp/vp_%d_%d_%d_%s.txt' % (dim, numberOfVP, cardinality, typeOfVP),
        vps)
def generateVantagePointsWithManyAlgorithm(options):
    """Pick vantage points one at a time by maximising ``calculateMany``.

    Selection starts from the first data row.  In every round each row that
    does not equal an already chosen vantage point is tried as the next
    one; the row whose tentative set scores strictly highest wins.  The
    final list is written to the 'vp/' file named after the options.

    Args:
        options: mapping providing 'numberOfData', 'numberOfDimension',
            'numberOfVP', 'numberOfAlphabet' and 'typeOfVP'.  It is also
            handed to ``utils.getDataFileName`` to locate the data file.
    """
    numberOfData = options['numberOfData']
    dim = options['numberOfDimension']
    numberOfVP = options['numberOfVP']
    cardinality = options['numberOfAlphabet']
    typeOfVP = options['typeOfVP']
    datas = utils.getDataInFile(utils.getDataFileName(options))

    def isChosen(row, chosen):
        # True when the row equals one of the vantage points picked so far.
        for vp in chosen:
            if row == vp:
                return True
        return False

    vps = [datas[0]]
    while len(vps) < numberOfVP:
        print(len(vps))
        unchosen = [i for i in xrange(len(datas))
                    if not isChosen(datas[i], vps)]
        ans, ansDataIndex = -1, -1
        for i in unchosen:
            cur = calculateMany(vps + [datas[i]])
            if cur > ans:
                ans, ansDataIndex = cur, i
        # With no winning candidate ansDataIndex stays -1, i.e. the last row.
        vps.append(datas[ansDataIndex])

    outputName = 'vp/vp_%d_%d_%d_%s.txt' % (
        dim, numberOfVP, cardinality, typeOfVP)
    utils.writeDataToFile(outputName, vps)
def convertNDDSToCDS(options):
    """Print the mean nearest-neighbour GEH distance of the first 1000 rows.

    A per-dimension weight ``1 - frequency`` is derived for every symbol in
    the data file.  ``geh(a, b)`` adds 1.0 for each position where the rows
    differ and ``weight / dim`` where they agree, then divides the sum by
    ``dim``.  For each of the first 1000 rows the smallest ``geh`` value to
    any other of those rows is taken, and the average of these minima is
    printed.  The query and vantage-point files are read as well, but only
    their lengths are printed.

    Args:
        options: mapping providing 'numberOfData', 'numberOfDimension',
            'distribution', 'numberOfAlphabet', 'numberOfVP' and 'typeOfVP'.
    """
    size = options['numberOfData']
    dim = options['numberOfDimension']
    distribution = options['distribution']
    cardinality = options['numberOfAlphabet']
    numberOfVP = options['numberOfVP']
    typeOfVP = options['typeOfVP']

    dataKey = (size, dim, distribution, cardinality)
    dataFileName = 'data/data_%d_%d_%s_%d.txt' % dataKey
    queryFileName = 'query/query_%d_%d_%s_%d.txt' % dataKey
    vpFileName = 'vp/vp_%d_%d_%d_%s.txt' % (
        dim, numberOfVP, cardinality, typeOfVP)

    datas = utils.getDataInFile(dataFileName)
    querys = utils.readDataFromFile(queryFileName)
    vps = utils.readDataFromFile(vpFileName)
    print('%s %s %s' % (len(datas), len(querys), len(vps)))

    # weights[column][symbol] = 1 - (share of rows holding symbol in column)
    totalRows = float(len(datas))
    weights = []
    for column in xrange(dim):
        occurrences = {}
        for row in datas:
            symbol = row[column]
            occurrences[symbol] = occurrences.get(symbol, 0) + 1
        weights.append(dict(
            (symbol, 1.0 - float(count) / totalRows)
            for symbol, count in occurrences.items()))

    def geh(a, b):
        total = 0.0
        for position in xrange(len(a)):
            if a[position] == b[position]:
                total += weights[position][a[position]] / float(dim)
            else:
                total += 1.0
        return total / float(dim)

    sample = datas[:1000]
    average = 0.0
    for i, row in enumerate(sample):
        # Sentinel larger than any geh value; it survives only when the
        # sample holds a single row.
        nearest = 98765432
        for j, other in enumerate(sample):
            if i != j:
                nearest = min(nearest, geh(row, other))
        average += nearest
    average /= len(sample)
    print(average)
def generateRandomVP(options):
    """Draw ``numberOfVP`` vantage points at random from the data rows.

    Rows are drawn independently, so the same row may be picked more than
    once.  The result is written to the 'vp/' file named after the options.

    Args:
        options: mapping providing 'numberOfData', 'numberOfDimension',
            'numberOfVP', 'numberOfAlphabet' and 'typeOfVP'.  It is also
            handed to ``utils.getDataFileName`` to locate the data file.
    """
    numberOfData = options['numberOfData']
    dim = options['numberOfDimension']
    numberOfVP = options['numberOfVP']
    cardinality = options['numberOfAlphabet']
    typeOfVP = options['typeOfVP']
    datas = utils.getDataInFile(utils.getDataFileName(options))

    # randrange excludes its upper bound.  randint(0, numberOfData) includes
    # it and could return numberOfData itself, which lies outside the valid
    # index range 0..numberOfData-1.
    vps = [datas[random.randrange(0, numberOfData)]
           for _ in xrange(numberOfVP)]

    utils.writeDataToFile(
        'vp/vp_%d_%d_%d_%s.txt' % (dim, numberOfVP, cardinality, typeOfVP),
        vps)
def generateVantagePointsWithManyAlgorithm(options):
    """Select vantage points while limiting how often each distance recurs.

    The seed vantage point is the major pattern (the top-ranked symbol of
    every dimension).  ``distanceUsage[k]`` counts how many accepted pairs
    lie at ``utils.hammingDistance`` k.  A data row is accepted when it
    equals no current vantage point and none of its distances to them is
    already used more than ``threshold`` times.  The threshold grows by one
    after every full pass over the data.

    Args:
        options: mapping providing 'numberOfData', 'numberOfDimension',
            'numberOfVP', 'numberOfAlphabet' and 'typeOfVP'.  It is also
            handed to ``utils.getDataFileName`` to locate the data file.
    """
    numberOfData = options['numberOfData']
    dim = options['numberOfDimension']
    numberOfVP = options['numberOfVP']
    cardinality = options['numberOfAlphabet']
    typeOfVP = options['typeOfVP']
    datas = utils.getDataInFile(utils.getDataFileName(options))

    # Only slot 0 is ever filled; it collects the top symbol per dimension.
    majorPattern = [[] for _ in xrange(numberOfVP + 1)]
    for column in xrange(dim):
        # The tally holds "occurrences - 1"; only the ordering matters.
        tally = {}
        for rowIndex in xrange(numberOfData):
            symbol = datas[rowIndex][column]
            tally[symbol] = tally.get(symbol, -1) + 1
        ranked = sorted(tally.items(), key=lambda item: item[1], reverse=True)
        majorPattern[0].append(ranked[0][0])

    vps = [majorPattern[0]]
    distanceUsage = [0] * (dim + 1)
    threshold = 0
    while len(vps) < numberOfVP:
        print(len(vps))
        # NOTE: the scan does not stop once numberOfVP points are chosen, so
        # a pass may leave more than numberOfVP vantage points.
        for row in datas:
            distances = [utils.hammingDistance(row, vp) for vp in vps]
            overused = any(distanceUsage[dist] > threshold
                           for dist in distances)
            duplicate = any(row == vp for vp in vps)
            if overused or duplicate:
                continue
            for dist in distances:
                distanceUsage[dist] += 1
            vps.append(row)
        threshold += 1

    utils.writeDataToFile(
        'vp/vp_%d_%d_%d_%s.txt' % (dim, numberOfVP, cardinality, typeOfVP),
        vps)
def getMajorPattern():
    """Estimate the major pattern from 100 randomly sampled data rows.

    ``options`` is not a parameter; it is read from the enclosing scope.
    One hundred row indices are drawn (with repetition allowed) and, for
    every dimension, the symbol ranked first among the sampled rows is
    appended to the result.

    Returns:
        The concatenation of the top-ranked symbol of each dimension.
    """
    numberOfData = options['numberOfData']
    dim = options['numberOfDimension']
    numberOfVP = options['numberOfVP']
    cardinality = options['numberOfAlphabet']
    typeOfVP = options['typeOfVP']
    datas = utils.getDataInFile(utils.getDataFileName(options))

    sampledRows = []
    for _ in xrange(100):
        sampledRows.append(random.randint(0, numberOfData - 1))

    topSymbols = []
    for column in xrange(dim):
        # The tally holds "occurrences - 1"; only the ordering matters.
        tally = {}
        for rowIndex in sampledRows:
            symbol = datas[rowIndex][column]
            tally[symbol] = tally.get(symbol, -1) + 1
        ranked = sorted(tally.items(), key=lambda item: item[1], reverse=True)
        topSymbols.append(ranked[0][0])
    return ''.join(topSymbols)
def generateVantagePointsWithManyAlgorithm(options):
    """Greedy vantage-point selection seeded with the major pattern.

    The seed is built from the top-ranked symbol of every dimension.  Each
    round then tries every data row that equals no current vantage point,
    scores the tentative set with ``calculateMany`` and keeps the row with
    the strictly highest score.

    Args:
        options: mapping providing 'numberOfData', 'numberOfDimension',
            'numberOfVP', 'numberOfAlphabet' and 'typeOfVP'.  It is also
            handed to ``utils.getDataFileName`` to locate the data file.
    """
    numberOfData = options['numberOfData']
    dim = options['numberOfDimension']
    numberOfVP = options['numberOfVP']
    cardinality = options['numberOfAlphabet']
    typeOfVP = options['typeOfVP']
    datas = utils.getDataInFile(utils.getDataFileName(options))

    # Only slot 0 is ever filled; it collects the top symbol per dimension.
    majorPattern = [[] for _ in xrange(numberOfVP + 1)]
    for column in xrange(dim):
        # The tally holds "occurrences - 1"; only the ordering matters.
        tally = {}
        for rowIndex in xrange(numberOfData):
            symbol = datas[rowIndex][column]
            tally[symbol] = tally.get(symbol, -1) + 1
        ranked = sorted(tally.items(), key=lambda item: item[1], reverse=True)
        majorPattern[0].append(ranked[0][0])

    vps = [majorPattern[0]]
    while len(vps) < numberOfVP:
        print(len(vps))
        bestScore, bestIndex = -1, -1
        for index, candidate in enumerate(datas):
            if any(candidate == chosen for chosen in vps):
                continue
            score = calculateMany(vps + [candidate])
            if score > bestScore:
                bestScore, bestIndex = score, index
        # With no winning candidate bestIndex stays -1, i.e. the last row.
        vps.append(datas[bestIndex])

    utils.writeDataToFile(
        'vp/vp_%d_%d_%d_%s.txt' % (dim, numberOfVP, cardinality, typeOfVP),
        vps)
def convertNDDSToCDS(options):
    """Re-express data and query rows as distances to the vantage points.

    Every data row and every query row is replaced by the list of
    ``utils.hammingDistance`` values to each vantage point, in vantage-point
    order.  The converted data is written first, then the converted
    queries; the two output file names come from ``utils``.

    Args:
        options: mapping providing 'numberOfData', 'numberOfDimension',
            'distribution', 'numberOfAlphabet', 'numberOfVP' and 'typeOfVP'.
            It is also handed to ``utils.getCDSDataFileName`` and
            ``utils.getCDSQueryFileName``.
    """
    size = options['numberOfData']
    dim = options['numberOfDimension']
    distribution = options['distribution']
    cardinality = options['numberOfAlphabet']
    numberOfVP = options['numberOfVP']
    typeOfVP = options['typeOfVP']

    dataKey = (size, dim, distribution, cardinality)
    dataFileName = 'data/data_%d_%d_%s_%d.txt' % dataKey
    queryFileName = 'query/query_%d_%d_%s_%d.txt' % dataKey
    vpFileName = 'vp/vp_%d_%d_%d_%s.txt' % (
        dim, numberOfVP, cardinality, typeOfVP)
    cdsDataFileName = utils.getCDSDataFileName(options)
    cdsQueryFileName = utils.getCDSQueryFileName(options)

    datas = utils.getDataInFile(dataFileName)
    querys = utils.readDataFromFile(queryFileName)
    vps = utils.readDataFromFile(vpFileName)
    print('%s %s %s' % (len(datas), len(querys), len(vps)))

    cdsDatas = [[utils.hammingDistance(row, vp) for vp in vps]
                for row in datas]
    utils.writeDataToFile(cdsDataFileName, cdsDatas)

    cdsQuerys = [[utils.hammingDistance(query, vp) for vp in vps]
                 for query in querys]
    utils.writeDataToFile(cdsQueryFileName, cdsQuerys)

    print('%s %s' % (cdsDataFileName, cdsQueryFileName))
def generateVantagePointsWithPattern(options):
    """Write the frequency-ranked 'major' and 'minor' vantage-point files.

    The symbols of each dimension (taken from the first ``numberOfData``
    rows) are ordered from most to least frequent.  Vantage point k of the
    major set uses entry k of every ordering; vantage point k of the minor
    set uses entry ``-1 - k``, i.e. counts from the rare end.

    Args:
        options: mapping providing 'numberOfData', 'numberOfDimension',
            'numberOfVP', 'numberOfAlphabet' and 'typeOfVP'.  It is also
            handed to ``utils.getDataFileName`` to locate the data file.

    Raises:
        IndexError: if a dimension holds fewer than ``numberOfVP`` distinct
            symbols.
    """
    numberOfData = options['numberOfData']
    dim = options['numberOfDimension']
    numberOfVP = options['numberOfVP']
    cardinality = options['numberOfAlphabet']
    # Looked up but not used: the output names are fixed to 'major'/'minor'.
    typeOfVP = options['typeOfVP']
    datas = utils.getDataInFile(utils.getDataFileName(options))

    def rankSymbols(column):
        # Symbols of one column, most frequent first.  The tally starts at 0
        # on first sight, so it is "occurrences - 1"; only the order is used.
        tally = {}
        for rowIndex in xrange(numberOfData):
            symbol = datas[rowIndex][column]
            if symbol not in tally:
                tally[symbol] = 0
            else:
                tally[symbol] += 1
        return sorted(tally.items(), key=lambda pair: pair[1], reverse=True)

    rankedColumns = [rankSymbols(column) for column in xrange(dim)]
    majorPattern = [[ranked[k][0] for ranked in rankedColumns]
                    for k in xrange(numberOfVP)]
    minorPattern = [[ranked[-1 - k][0] for ranked in rankedColumns]
                    for k in xrange(numberOfVP)]

    for label, pattern in (('major', majorPattern), ('minor', minorPattern)):
        utils.writeDataToFile(
            'vp/vp_%d_%d_%d_%s.txt' % (dim, numberOfVP, cardinality, label),
            pattern)
def convertNDDSToCDS(options):
    """Convert the data and query files into vantage-point distance vectors.

    Each row becomes ``[utils.hammingDistance(row, vp) for vp in vps]``.
    The converted data rows are written to the file named by
    ``utils.getCDSDataFileName`` and the converted query rows to the file
    named by ``utils.getCDSQueryFileName``.

    Args:
        options: mapping providing 'numberOfData', 'numberOfDimension',
            'distribution', 'numberOfAlphabet', 'numberOfVP' and 'typeOfVP'.
    """
    size = options['numberOfData']
    dim = options['numberOfDimension']
    distribution = options['distribution']
    cardinality = options['numberOfAlphabet']
    numberOfVP = options['numberOfVP']
    typeOfVP = options['typeOfVP']

    dataFileName = 'data/data_%d_%d_%s_%d.txt' % (
        size, dim, distribution, cardinality)
    queryFileName = 'query/query_%d_%d_%s_%d.txt' % (
        size, dim, distribution, cardinality)
    vpFileName = 'vp/vp_%d_%d_%d_%s.txt' % (
        dim, numberOfVP, cardinality, typeOfVP)
    cdsDataFileName = utils.getCDSDataFileName(options)
    cdsQueryFileName = utils.getCDSQueryFileName(options)

    datas = utils.getDataInFile(dataFileName)
    querys = utils.readDataFromFile(queryFileName)
    vps = utils.readDataFromFile(vpFileName)
    print('%s %s %s' % (len(datas), len(querys), len(vps)))

    def toDistanceRows(rows):
        # One output row per input row, one distance per vantage point.
        converted = []
        for row in rows:
            distances = []
            for vp in vps:
                distances.append(utils.hammingDistance(row, vp))
            converted.append(distances)
        return converted

    utils.writeDataToFile(cdsDataFileName, toDistanceRows(datas))
    utils.writeDataToFile(cdsQueryFileName, toDistanceRows(querys))
    print('%s %s' % (cdsDataFileName, cdsQueryFileName))
if __name__ == '__main__':
    # Translate every data file under data/ into its 'ndt_data' counterpart:
    # each symbol is replaced by its entry in the alphabet dictionary and
    # followed by a single space, one row per line.
    for outputDirectory in ('ndt_data', 'ndt_query'):
        utils.createDirectory(outputDirectory)
    dictionary = makeDictionaryKeyIsAlphabet()
    dataFileNames = glob.glob('data/*.txt')
    for dataFileName in dataFileNames:
        print(dataFileName)

        # The file stem is split on '_'; fields 1..4 are reused verbatim in
        # the query and output file names.
        onlyFileName = dataFileName.split('.')[0].split('/')[1]
        nameFields = onlyFileName.split('_')
        size = nameFields[1]
        dim = nameFields[2]
        vptype = nameFields[3]
        cardinality = nameFields[4]
        nameKey = (size, dim, vptype, cardinality)

        queryFileName = 'query/query_%s_%s_%s_%s.txt' % nameKey
        datas = utils.getDataInFile(dataFileName)
        querys = utils.readDataFromFile(queryFileName)
        ndtDataFileName = 'ndt_data/data_%s_%s_%s_%s.txt' % nameKey
        ndtQueryFileName = 'ndt_query/query_%s_%s_%s_%s.txt' % nameKey

        if not os.path.exists(ndtDataFileName):
            with open(ndtDataFileName, 'w') as fp:
                for row in datas:
                    for symbol in row:
                        fp.write(str(dictionary[symbol]) + ' ')
                    fp.write('\n')
        else:
            print('%s is exists' % ndtDataFileName)

        # NOTE(review): the query output is only checked for existence here;
        # nothing in this block writes it even though querys was loaded.
        if os.path.exists(ndtQueryFileName):
            print('%s is exsts' % ndtQueryFileName)
def convertNDDSToCDS(options):
    """Convert data and query rows into GEH distances to the vantage points.

    For every dimension a weight ``1 - frequency`` is computed for each
    symbol occurring in the data file.  ``geh(a, b)`` adds 1.0 for every
    position where the two rows differ and ``weight / dim`` where they
    agree, then divides the sum by ``dim``.  Each data row and each query
    row is replaced by its list of ``geh`` values to every vantage point.
    The results go to 'cds_data/..._geh.txt' and 'cds_query/..._geh.txt'.

    Args:
        options: mapping providing 'numberOfData', 'numberOfDimension',
            'distribution', 'numberOfAlphabet', 'numberOfVP' and 'typeOfVP'.
    """
    size = options['numberOfData']
    dim = options['numberOfDimension']
    distribution = options['distribution']
    cardinality = options['numberOfAlphabet']
    numberOfVP = options['numberOfVP']
    typeOfVP = options['typeOfVP']

    dataFileName = 'data/data_%d_%d_%s_%d.txt' % (
        size, dim, distribution, cardinality)
    queryFileName = 'query/query_%d_%d_%s_%d.txt' % (
        size, dim, distribution, cardinality)
    vpFileName = 'vp/vp_%d_%d_%d_%s.txt' % (
        dim, numberOfVP, cardinality, typeOfVP)
    cdsDataFileName = 'cds_data/data_%d_%d_%d_%s_%d_%sgeh.txt' % (
        size, dim, numberOfVP, distribution, cardinality, typeOfVP)
    cdsQueryFileName = 'cds_query/query_%d_%d_%d_%s_%d_%sgeh.txt' % (
        size, dim, numberOfVP, distribution, cardinality, typeOfVP)

    datas = utils.getDataInFile(dataFileName)
    querys = utils.readDataFromFile(queryFileName)
    vps = utils.readDataFromFile(vpFileName)
    print('%s %s %s' % (len(datas), len(querys), len(vps)))

    # d[j][symbol] first counts occurrences of symbol in column j of the
    # data, then is turned into the weight 1 - (count / number of rows).
    d = [{} for _ in xrange(dim)]
    for row in datas:
        for j in xrange(dim):
            d[j][row[j]] = d[j].get(row[j], 0) + 1
    totalRows = float(len(datas))
    for j in xrange(dim):
        for key in d[j]:
            d[j][key] = 1.0 - float(d[j][key]) / totalRows

    def geh(a, b):
        ret = 0.0
        for i in xrange(len(a)):
            if a[i] != b[i]:
                ret += 1.0
            else:
                # Queries and vantage points may agree on a symbol that
                # never occurs in this column of the data file.  Its
                # frequency is 0, so its weight is 1.0 (a plain lookup
                # would raise KeyError here).
                ret += d[i].get(a[i], 1.0) / float(dim)
        return ret / float(dim)

    cdsDatas = [[geh(row, vp) for vp in vps] for row in datas]
    utils.writeDataToFile(cdsDataFileName, cdsDatas)

    cdsQuerys = [[geh(query, vp) for vp in vps] for query in querys]
    utils.writeDataToFile(cdsQueryFileName, cdsQuerys)

    print('%s %s' % (cdsDataFileName, cdsQueryFileName))
def generateVantagePointsWithHybridAlgorithm(options):
    """Build vantage points from the major pattern plus generated points.

    The first vantage point is the major pattern (the top-ranked symbol of
    every dimension).  ``d[k]`` counts the pairs of chosen vantage points
    whose ``utils.hammingDistance`` is k.  In each round the smallest
    distance i with ``d[i] == 0`` is located; one candidate per existing
    vantage point is produced by ``generateVpWithDist`` and the candidate
    whose tentative set scores best under ``calculateMany`` is kept.  When
    no distance is unused, a random data row is appended instead.

    Args:
        options: mapping providing 'numberOfData', 'numberOfDimension',
            'numberOfVP', 'numberOfAlphabet' and 'typeOfVP'.  It is also
            handed to ``utils.getDataFileName`` to locate the data file.
    """
    numberOfData = options['numberOfData']
    dim = options['numberOfDimension']
    numberOfVP = options['numberOfVP']
    cardinality = options['numberOfAlphabet']
    typeOfVP = options['typeOfVP']
    datas = utils.getDataInFile(utils.getDataFileName(options))
    # Only majorPattern[0] is ever filled (the inner loop below runs k = 0).
    majorPattern = [[] for i in xrange(numberOfVP + 1)]
    for j in xrange(dim):
        # Per-dimension tally; it starts at 0 on first sight, so it holds
        # "occurrences - 1".  Only the resulting ordering is used.
        d = {}
        for i in xrange(numberOfData):
            if datas[i][j] in d:
                d[datas[i][j]] += 1
            else:
                d[datas[i][j]] = 0
        d = sorted(d.items(), key=lambda x: x[1], reverse=True)
        for k in xrange(1):
            majorPattern[k].append(d[k][0])
    vps = []
    vps.append(majorPattern[0])
    # From here on d is reused as the distance histogram described above.
    d = [0 for i in xrange(dim + 1)]
    # one_pass starts False and is never set True, so the first branch of
    # the loop below is not executed.
    one_pass = False
    while len(vps) < numberOfVP:
        print len(vps)
        ans, ansDataIndex = -1, -1
        if one_pass:
            for i in xrange(len(datas)):
                # ok == True means "skip this row".
                ok = False
                for j in xrange(len(vps)):
                    dist = utils.hammingDistance(datas[i], vps[j])
                    if d[dist] > 1:
                        ok = True
                    if datas[i] == vps[j]:
                        ok = True
                if ok:
                    continue
                for j in xrange(len(vps)):
                    dist = utils.hammingDistance(datas[i], vps[j])
                    d[dist] += 1
                vps.append(datas[i])
            one_pass = False
        else:
            change = False
            for i in xrange(dim + 1):
                if d[i] == 0:
                    change = True
                    ans, ans_vp = -1, ''
                    fx = 987654321
                    for j in xrange(len(vps)):
                        # presumably a point at distance i from vps[j] --
                        # TODO confirm against generateVpWithDist
                        cur_vp = generateVpWithDist(dim, cardinality, vps[j], i)
                        # Score the set with the candidate appended, then
                        # drop it again at the end of the iteration.
                        vps.append(cur_vp)
                        cur, dists = calculateMany(vps)
                        if cur > ans:
                            ans, ans_vp = cur, cur_vp
                            fx = max(dists)
                        elif cur == ans:
                            # Tie on the score: prefer the smaller max(dists).
                            if max(dists) < fx:
                                ans, ans_vp = cur, cur_vp
                                fx = max(dists)
                        vps = vps[:-1]
                    for j in xrange(len(vps)):
                        dist = utils.hammingDistance(vps[j], ans_vp)
                        d[dist] += 1
                    vps.append(ans_vp)
                    # Only the first unused distance is handled per round.
                    break
            if not change:
                # Every distance is already used: fall back to a random data
                # row.  The histogram d is not updated for this row.
                vps.append(datas[random.randrange(0, numberOfData)])
    utils.writeDataToFile(
        'vp/vp_%d_%d_%d_%s.txt' % (dim, numberOfVP, cardinality, typeOfVP), vps)
def generateVantagePointsWithManyAlgorithm(options):
    """Select vantage points under a distance-usage threshold.

    The major pattern (top-ranked symbol of every dimension) seeds the
    list.  ``d[k]`` counts the accepted pairs at ``utils.hammingDistance``
    k.  A data row is accepted when it equals no current vantage point and
    none of its distances to them is already used more than ``threshold``
    times.  When a full pass accepts nothing, the vantage point reported by
    ``getWorstVP`` is removed and the scan is repeated.

    NOTE(review): the visible code ends by printing ``len(vps)`` and ``d``;
    nothing is written or returned here -- confirm the block is complete.

    Args:
        options: mapping providing 'numberOfData', 'numberOfDimension',
            'numberOfVP', 'numberOfAlphabet' and 'typeOfVP'.  It is also
            handed to ``utils.getDataFileName`` to locate the data file.
    """
    numberOfData = options['numberOfData']
    dim = options['numberOfDimension']
    numberOfVP = options['numberOfVP']
    cardinality = options['numberOfAlphabet']
    typeOfVP = options['typeOfVP']
    datas = utils.getDataInFile(utils.getDataFileName(options))
    threshold = 2
    majorPattern = []
    for j in xrange(dim):
        # Per-dimension tally; it starts at 0 on first sight, so it holds
        # "occurrences - 1".  Only the resulting ordering is used.
        d = {}
        for i in xrange(numberOfData):
            if datas[i][j] in d:
                d[datas[i][j]] += 1
            else:
                d[datas[i][j]] = 0
        d = sorted(d.items(), key=lambda x: x[1], reverse=True)
        majorPattern.append(d[0][0])
    vps = []
    vps.append(majorPattern)
    # From here on d is reused as the distance histogram described above.
    d = [0 for i in xrange(dim + 1)]
    # Rows equal to the seed are marked so they are never picked again.
    isSelected = [False for i in xrange(len(datas))]
    for i in xrange(len(datas)):
        if datas[i] == majorPattern:
            isSelected[i] = True
    notChangedCount = 0
    while len(vps) < numberOfVP:
        print len(vps)
        changed = False
        for i in xrange(len(datas)):
            if isSelected[i]:
                continue
            # is_pass == True means "skip this row".
            is_pass = False
            for j in xrange(len(vps)):
                if datas[i] == vps[j]:
                    is_pass = True
                dist = utils.hammingDistance(datas[i], vps[j])
                if d[dist] > threshold:
                    is_pass = True
            if is_pass:
                continue
            for j in xrange(len(vps)):
                dist = utils.hammingDistance(datas[i], vps[j])
                d[dist] += 1
            vps.append(datas[i])
            isSelected[i] = True
            changed = True
        if not changed:
            print 'not changed so pop worst (%d)' % notChangedCount
            worstIdx = getWorstVP(vps)
            # Take the removed point's pair distances out of the histogram.
            for j in xrange(len(vps)):
                if j == worstIdx:
                    continue
                dist = utils.hammingDistance(vps[j], vps[worstIdx])
                d[dist] -= 1
            nextVPS = popDataAtIndex(vps, worstIdx)
            vps = nextVPS
            # NOTE(review): isSelected is not cleared for the removed point.
            notChangedCount += 1
            if notChangedCount > numberOfVP / 2:
                notChangedCount = 0
                # NOTE(review): notChangedCount was reset to 0 on the line
                # above, so xrange(-1) is empty and this loop never runs.
                for k in xrange(notChangedCount - 1):
                    worstIdx = getWorstVP(vps)
                    for j in xrange(len(vps)):
                        if j == worstIdx:
                            continue
                        dist = utils.hammingDistance(vps[j], vps[worstIdx])
                        d[dist] -= 1
                    nextVPS = popDataAtIndex(vps, worstIdx)
                    vps = nextVPS
    # NOTE(review): the source line carries no indentation; these two prints
    # are assumed to run once, after the loop -- confirm.
    print len(vps)
    print d