Exemple #1
0
def generateVantagePointsWithPattern(options):
    """Write frequency-ranked 'major' and 'minor' vantage points to disk.

    For each dimension the symbols seen in the first ``numberOfData`` rows are
    ranked from most to least frequent.  Major pattern k takes the symbol
    ranked k-th from the top in every dimension; minor pattern k takes the
    symbol ranked k-th from the bottom.  The two pattern lists are written to
    ``vp/vp_<dim>_<numberOfVP>_<cardinality>_major.txt`` and the matching
    ``..._minor.txt`` file.

    Args:
        options: dict with the keys 'numberOfData', 'numberOfDimension',
            'numberOfVP', 'numberOfAlphabet' and 'typeOfVP'; it is also
            handed to ``utils.getDataFileName`` to locate the input file.

    Raises:
        IndexError: if a dimension holds fewer than ``numberOfVP`` distinct
            symbols.
    """
    rowCount = options['numberOfData']
    dim = options['numberOfDimension']
    vpCount = options['numberOfVP']
    cardinality = options['numberOfAlphabet']
    unusedVpType = options['typeOfVP']  # read but not used in this function
    rows = utils.getDataInFile(utils.getDataFileName(options))

    majors = [[] for _ in xrange(vpCount)]
    minors = [[] for _ in xrange(vpCount)]
    for col in xrange(dim):
        # Counts are zero-based (the first sighting stores 0); only their
        # relative order matters for the ranking below.
        seen = {}
        for r in xrange(rowCount):
            symbol = rows[r][col]
            seen[symbol] = seen[symbol] + 1 if symbol in seen else 0
        ranked = sorted(seen.items(), key=lambda kv: kv[1], reverse=True)
        for k in xrange(vpCount):
            majors[k].append(ranked[k][0])
            minors[k].append(ranked[-1 - k][0])

    for label, patterns in (('major', majors), ('minor', minors)):
        utils.writeDataToFile(
            'vp/vp_%d_%d_%d_%s.txt' % (dim, vpCount, cardinality, label),
            patterns)
def generateVantagePointsWithManyAlgorithm(options):
    """Greedily choose vantage points from the data rows and save them.

    The first data row is always chosen.  After that, each round tries every
    row not already chosen, scores the vantage-point list with that row
    appended via ``calculateMany``, and keeps the best-scoring row (the
    earliest one on ties).  The result is written to
    ``vp/vp_<dim>_<numberOfVP>_<cardinality>_<typeOfVP>.txt``.

    Args:
        options: dict with the keys 'numberOfData', 'numberOfDimension',
            'numberOfVP', 'numberOfAlphabet' and 'typeOfVP'; it is also
            handed to ``utils.getDataFileName`` to locate the input file.
    """
    unusedRowCount = options['numberOfData']  # read but not used below
    dim = options['numberOfDimension']
    vpCount = options['numberOfVP']
    cardinality = options['numberOfAlphabet']
    vpType = options['typeOfVP']
    rows = utils.getDataInFile(utils.getDataFileName(options))

    chosen = [rows[0]]
    while len(chosen) < vpCount:
        print(len(chosen))
        bestScore, bestIndex = -1, -1
        for index, candidate in enumerate(rows):
            if any(candidate == vp for vp in chosen):
                continue
            # Score the list with the candidate tentatively appended.
            chosen.append(candidate)
            score = calculateMany(chosen)
            chosen.pop()
            if score > bestScore:
                bestScore, bestIndex = score, index
        # bestIndex is still -1 (the last row) if no candidate was scored.
        chosen.append(rows[bestIndex])

    outputName = 'vp/vp_%d_%d_%d_%s.txt' % (dim, vpCount, cardinality, vpType)
    utils.writeDataToFile(outputName, chosen)
def generateVantagePointsWithManyAlgorithm(options):
    """Pick ``numberOfVP`` vantage points greedily and write them to a file.

    Selection starts with the first data row.  Every further point is the
    data row (not yet selected) that gives the highest ``calculateMany``
    score when added to the current selection; the earliest row wins ties.
    Output file: ``vp/vp_<dim>_<numberOfVP>_<cardinality>_<typeOfVP>.txt``.

    Args:
        options: dict with the keys 'numberOfData', 'numberOfDimension',
            'numberOfVP', 'numberOfAlphabet' and 'typeOfVP'; it is also
            handed to ``utils.getDataFileName`` to locate the input file.
    """
    totalRows = options['numberOfData']  # read but not used below
    dim = options['numberOfDimension']
    wanted = options['numberOfVP']
    cardinality = options['numberOfAlphabet']
    vpType = options['typeOfVP']
    rows = utils.getDataInFile(utils.getDataFileName(options))

    picked = [rows[0]]
    while len(picked) < wanted:
        print(len(picked))
        topScore = -1
        topIndex = -1
        for index in xrange(len(rows)):
            row = rows[index]
            if row in picked:
                continue
            # Evaluate the selection as if this row had been added.
            picked.append(row)
            score = calculateMany(picked)
            del picked[-1]
            if score > topScore:
                topScore = score
                topIndex = index
        # With no scored candidate topIndex stays -1, i.e. the last row.
        picked.append(rows[topIndex])
    utils.writeDataToFile(
        'vp/vp_%d_%d_%d_%s.txt' % (dim, wanted, cardinality, vpType), picked)
Exemple #4
0
def convertNDDSToCDS(options):
    """Print the mean nearest-neighbour ``geh`` distance of a data sample.

    Loads the data, query and vantage-point files named by ``options`` and
    prints their lengths.  Per-dimension symbol weights
    (1 - relative frequency) are derived from the whole data set; the
    function then prints the average, over the first 1000 data rows at most,
    of the smallest ``geh`` distance from each row to any other sampled row.
    Nothing is written to disk.

    Args:
        options: dict with the keys 'numberOfData', 'numberOfDimension',
            'distribution', 'numberOfAlphabet', 'numberOfVP' and 'typeOfVP'.
    """
    size = options['numberOfData']
    dim = options['numberOfDimension']
    distribution = options['distribution']
    cardinality = options['numberOfAlphabet']
    numberOfVP = options['numberOfVP']
    typeOfVP = options['typeOfVP']

    dataSpec = (size, dim, distribution, cardinality)
    dataFileName = 'data/data_%d_%d_%s_%d.txt' % dataSpec
    queryFileName = 'query/query_%d_%d_%s_%d.txt' % dataSpec
    vpFileName = 'vp/vp_%d_%d_%d_%s.txt' % (dim, numberOfVP, cardinality,
                                            typeOfVP)

    rows = utils.getDataInFile(dataFileName)
    queries = utils.readDataFromFile(queryFileName)
    vps = utils.readDataFromFile(vpFileName)
    print('%d %d %d' % (len(rows), len(queries), len(vps)))

    # weights[col][symbol] = 1 - (share of rows holding symbol in column col)
    rowTotal = float(len(rows))
    weights = []
    for col in xrange(dim):
        counts = {}
        for row in rows:
            counts[row[col]] = counts.get(row[col], 0) + 1
        weights.append(dict(
            (symbol, 1.0 - float(n) / rowTotal)
            for symbol, n in counts.items()))

    def geh(a, b):
        # A mismatch costs 1; a match costs the symbol's weight divided by dim.
        total = 0.0
        for pos, symbol in enumerate(a):
            if symbol == b[pos]:
                total += weights[pos][symbol] / float(dim)
            else:
                total += 1.0
        return total / float(dim)

    sample = rows[:1000]
    nearestSum = 0.0
    for i, row in enumerate(sample):
        nearest = 98765432  # sentinel; stays if the row has no neighbour
        for j, other in enumerate(sample):
            if i == j:
                continue
            nearest = min(nearest, geh(row, other))
        nearestSum += nearest
    nearestSum /= len(sample)
    print(nearestSum)
def generateRandomVP(options):
    """Choose ``numberOfVP`` random data rows as vantage points and save them.

    Rows are drawn independently, so the same row may be chosen more than
    once.  The result is written to
    ``vp/vp_<dim>_<numberOfVP>_<cardinality>_<typeOfVP>.txt``.

    Args:
        options: dict with the keys 'numberOfData', 'numberOfDimension',
            'numberOfVP', 'numberOfAlphabet' and 'typeOfVP'; it is also
            handed to ``utils.getDataFileName`` to locate the input file.
    """
    numberOfData = options['numberOfData']
    dim = options['numberOfDimension']
    numberOfVP = options['numberOfVP']
    cardinality = options['numberOfAlphabet']
    typeOfVP = options['typeOfVP']
    datas = utils.getDataInFile(utils.getDataFileName(options))

    # randrange excludes its upper bound, so the index is always in
    # 0 .. numberOfData - 1.  The previous randint(0, numberOfData) could
    # return numberOfData itself and index one past the last row.
    vps = [datas[random.randrange(numberOfData)] for _ in xrange(numberOfVP)]

    utils.writeDataToFile(
        'vp/vp_%d_%d_%d_%s.txt' % (dim, numberOfVP, cardinality, typeOfVP),
        vps)
Exemple #6
0
def generateVantagePointsWithManyAlgorithm(options):
    """Select vantage points using a distance histogram and save them.

    The seed point is the most frequent symbol of each dimension.  The data
    rows are then scanned repeatedly; a row is accepted when it differs from
    every chosen point and no Hamming distance from it to a chosen point
    already occurs more than ``threshold`` times in the histogram.  The
    threshold starts at 0 and grows by one after each full scan.  Output
    file: ``vp/vp_<dim>_<numberOfVP>_<cardinality>_<typeOfVP>.txt``.

    NOTE: the size check happens only between scans, so a single scan can
    accept more rows than ``numberOfVP`` asks for.

    Args:
        options: dict with the keys 'numberOfData', 'numberOfDimension',
            'numberOfVP', 'numberOfAlphabet' and 'typeOfVP'; it is also
            handed to ``utils.getDataFileName`` to locate the input file.
    """
    rowCount = options['numberOfData']
    dim = options['numberOfDimension']
    vpCount = options['numberOfVP']
    cardinality = options['numberOfAlphabet']
    vpType = options['typeOfVP']
    rows = utils.getDataInFile(utils.getDataFileName(options))

    # Seed: top-ranked symbol per dimension (counts are zero-based, which
    # does not affect the ranking).
    seed = []
    for col in xrange(dim):
        counts = {}
        for r in xrange(rowCount):
            symbol = rows[r][col]
            counts[symbol] = counts[symbol] + 1 if symbol in counts else 0
        ranked = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
        seed.append(ranked[0][0])

    chosen = [seed]
    histogram = [0] * (dim + 1)  # histogram[k]: recorded pairs at distance k
    threshold = 0
    while len(chosen) < vpCount:
        print(len(chosen))
        for row in rows:
            distances = [utils.hammingDistance(row, vp) for vp in chosen]
            # chosen is never empty, so max() always has an argument.
            crowded = max(histogram[dist] for dist in distances) > threshold
            if crowded or any(row == vp for vp in chosen):
                continue
            for dist in distances:
                histogram[dist] += 1
            chosen.append(row)
        threshold += 1

    utils.writeDataToFile(
        'vp/vp_%d_%d_%d_%s.txt' % (dim, vpCount, cardinality, vpType),
        chosen)
Exemple #7
0
def getMajorPattern():
    """Return the major pattern estimated from 100 randomly sampled rows.

    Reads its configuration from the ``options`` name of the enclosing
    scope (it is not a parameter).  100 row indices are drawn with
    replacement; for every dimension the most frequent symbol among the
    sampled rows is taken, and the symbols are concatenated into one string.
    """
    numberOfData = options['numberOfData']
    dim = options['numberOfDimension']
    unusedVpCount = options['numberOfVP']          # read but not used
    unusedCardinality = options['numberOfAlphabet']  # read but not used
    unusedVpType = options['typeOfVP']             # read but not used
    rows = utils.getDataInFile(utils.getDataFileName(options))

    sampled = [random.randint(0, numberOfData - 1) for _ in xrange(100)]
    symbols = []
    for col in xrange(dim):
        tally = {}
        for index in sampled:
            symbol = rows[index][col]
            tally[symbol] = tally[symbol] + 1 if symbol in tally else 0
        # max() keeps the first of equally frequent symbols, matching a
        # stable descending sort.
        winner = max(tally.items(), key=lambda kv: kv[1])
        symbols.append(winner[0])
    return ''.join(symbols)
Exemple #8
0
def generateVantagePointsWithManyAlgorithm(options):
    """Greedy vantage-point selection seeded with the major pattern.

    The seed point is the most frequent symbol of each dimension.  Each
    round then adds the data row (different from every chosen point) that
    yields the highest ``calculateMany`` score for the enlarged list; the
    earliest row wins ties.  Output file:
    ``vp/vp_<dim>_<numberOfVP>_<cardinality>_<typeOfVP>.txt``.

    Args:
        options: dict with the keys 'numberOfData', 'numberOfDimension',
            'numberOfVP', 'numberOfAlphabet' and 'typeOfVP'; it is also
            handed to ``utils.getDataFileName`` to locate the input file.
    """
    rowCount = options['numberOfData']
    dim = options['numberOfDimension']
    vpCount = options['numberOfVP']
    cardinality = options['numberOfAlphabet']
    vpType = options['typeOfVP']
    rows = utils.getDataInFile(utils.getDataFileName(options))

    # Seed: top-ranked symbol per dimension (zero-based counts keep the
    # same ranking as true counts).
    seed = []
    for col in xrange(dim):
        freq = {}
        for r in xrange(rowCount):
            symbol = rows[r][col]
            if symbol not in freq:
                freq[symbol] = 0
            else:
                freq[symbol] += 1
        ordered = sorted(freq.items(), key=lambda kv: kv[1], reverse=True)
        seed.append(ordered[0][0])

    chosen = [seed]
    while len(chosen) < vpCount:
        print(len(chosen))
        bestScore, bestIndex = -1, -1
        for index, candidate in enumerate(rows):
            if any(candidate == vp for vp in chosen):
                continue
            # Score the list with the candidate tentatively appended.
            chosen.append(candidate)
            score = calculateMany(chosen)
            chosen.pop()
            if score > bestScore:
                bestScore, bestIndex = score, index
        # bestIndex is still -1 (the last row) if no candidate was scored.
        chosen.append(rows[bestIndex])

    utils.writeDataToFile(
        'vp/vp_%d_%d_%d_%s.txt' % (dim, vpCount, cardinality, vpType),
        chosen)
Exemple #9
0
def convertNDDSToCDS(options):
    """Convert data and query rows to Hamming-distance vectors and save them.

    Each data row and each query row is replaced by the list of its Hamming
    distances to every vantage point.  The converted data is written to the
    path returned by ``utils.getCDSDataFileName(options)`` and the converted
    queries to ``utils.getCDSQueryFileName(options)``; both paths are printed
    at the end.

    Args:
        options: dict with the keys 'numberOfData', 'numberOfDimension',
            'distribution', 'numberOfAlphabet', 'numberOfVP' and 'typeOfVP'.
    """
    size = options['numberOfData']
    dim = options['numberOfDimension']
    distribution = options['distribution']
    cardinality = options['numberOfAlphabet']
    numberOfVP = options['numberOfVP']
    typeOfVP = options['typeOfVP']

    sourceSpec = (size, dim, distribution, cardinality)
    dataFileName = 'data/data_%d_%d_%s_%d.txt' % sourceSpec
    queryFileName = 'query/query_%d_%d_%s_%d.txt' % sourceSpec
    vpFileName = 'vp/vp_%d_%d_%d_%s.txt' % (dim, numberOfVP, cardinality,
                                            typeOfVP)
    cdsDataFileName = utils.getCDSDataFileName(options)
    cdsQueryFileName = utils.getCDSQueryFileName(options)

    rows = utils.getDataInFile(dataFileName)
    queries = utils.readDataFromFile(queryFileName)
    vps = utils.readDataFromFile(vpFileName)
    print('%d %d %d' % (len(rows), len(queries), len(vps)))

    def toDistanceVector(point):
        # One Hamming distance per vantage point, in vantage-point order.
        return [utils.hammingDistance(point, vp) for vp in vps]

    utils.writeDataToFile(cdsDataFileName,
                          [toDistanceVector(row) for row in rows])
    utils.writeDataToFile(cdsQueryFileName,
                          [toDistanceVector(query) for query in queries])
    print('%s %s' % (cdsDataFileName, cdsQueryFileName))
def generateVantagePointsWithPattern(options):
    """Create the 'major' and 'minor' vantage-point files from symbol ranks.

    Symbols of every dimension are ranked by how often they occur in the
    first ``numberOfData`` rows.  Pattern k of the major set uses the k-th
    most frequent symbol of each dimension, pattern k of the minor set the
    k-th least frequent one.  Both sets are written under ``vp/`` with the
    suffixes ``major`` and ``minor``.

    Args:
        options: dict with the keys 'numberOfData', 'numberOfDimension',
            'numberOfVP', 'numberOfAlphabet' and 'typeOfVP'; it is also
            handed to ``utils.getDataFileName`` to locate the input file.

    Raises:
        IndexError: if a dimension holds fewer than ``numberOfVP`` distinct
            symbols.
    """
    rowCount = options['numberOfData']
    dim = options['numberOfDimension']
    vpCount = options['numberOfVP']
    cardinality = options['numberOfAlphabet']
    unusedVpType = options['typeOfVP']  # read but not used in this function
    rows = utils.getDataInFile(utils.getDataFileName(options))

    def rankedSymbols(col):
        # Symbols of one column, most frequent first.  Counts start at 0 on
        # the first sighting; the offset does not change the order.
        counts = {}
        for r in xrange(rowCount):
            symbol = rows[r][col]
            if symbol in counts:
                counts[symbol] += 1
            else:
                counts[symbol] = 0
        pairs = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
        return [symbol for symbol, _ in pairs]

    majorPattern = [[] for _ in xrange(vpCount)]
    minorPattern = [[] for _ in xrange(vpCount)]
    for col in xrange(dim):
        ranking = rankedSymbols(col)
        for k in xrange(vpCount):
            majorPattern[k].append(ranking[k])
            minorPattern[k].append(ranking[-1 - k])

    majorName = 'vp/vp_%d_%d_%d_%s.txt' % (dim, vpCount, cardinality, 'major')
    utils.writeDataToFile(majorName, majorPattern)
    minorName = 'vp/vp_%d_%d_%d_%s.txt' % (dim, vpCount, cardinality, 'minor')
    utils.writeDataToFile(minorName, minorPattern)
Exemple #11
0
def convertNDDSToCDS(options):
    """Write Hamming-distance representations of the data and query sets.

    Every data row and query row is mapped to a list holding its Hamming
    distance to each vantage point.  The converted sets are written to the
    paths given by ``utils.getCDSDataFileName(options)`` and
    ``utils.getCDSQueryFileName(options)``, and those paths are printed.

    Args:
        options: dict with the keys 'numberOfData', 'numberOfDimension',
            'distribution', 'numberOfAlphabet', 'numberOfVP' and 'typeOfVP'.
    """
    size = options['numberOfData']
    dim = options['numberOfDimension']
    distribution = options['distribution']
    cardinality = options['numberOfAlphabet']
    numberOfVP = options['numberOfVP']
    typeOfVP = options['typeOfVP']

    dataFileName = 'data/data_%d_%d_%s_%d.txt' % (
        size, dim, distribution, cardinality)
    queryFileName = 'query/query_%d_%d_%s_%d.txt' % (
        size, dim, distribution, cardinality)
    vpFileName = 'vp/vp_%d_%d_%d_%s.txt' % (
        dim, numberOfVP, cardinality, typeOfVP)
    cdsDataFileName = utils.getCDSDataFileName(options)
    cdsQueryFileName = utils.getCDSQueryFileName(options)

    dataRows = utils.getDataInFile(dataFileName)
    queryRows = utils.readDataFromFile(queryFileName)
    vantagePoints = utils.readDataFromFile(vpFileName)
    print('%d %d %d' % (len(dataRows), len(queryRows), len(vantagePoints)))

    # Data first, then queries: each set is converted and written in turn.
    for targetName, sourceRows in ((cdsDataFileName, dataRows),
                                   (cdsQueryFileName, queryRows)):
        converted = []
        for point in sourceRows:
            converted.append(
                [utils.hammingDistance(point, vp) for vp in vantagePoints])
        utils.writeDataToFile(targetName, converted)
    print('%s %s' % (cdsDataFileName, cdsQueryFileName))
Exemple #12
0
def _writeEncodedRows(fileName, rows, dictionary):
    # Write one line per row: each symbol's dictionary code followed by a
    # single space, then a newline.
    with open(fileName, 'w') as fp:
        for row in rows:
            fp.write(''.join(str(dictionary[symbol]) + ' ' for symbol in row))
            fp.write('\n')


# Script entry point: for every data file under data/, write a numerically
# encoded copy of the data set to ndt_data/ and of its query set to
# ndt_query/.  Files that already exist are left untouched.
if __name__ == '__main__':
    utils.createDirectory('ndt_data')
    utils.createDirectory('ndt_query')
    dictionary = makeDictionaryKeyIsAlphabet()

    for dataFileName in glob.glob('data/*.txt'):
        print(dataFileName)
        # File names look like data/data_<size>_<dim>_<type>_<cardinality>.txt
        nameParts = dataFileName.split('.')[0].split('/')[1].split('_')
        size = nameParts[1]
        dim = nameParts[2]
        vptype = nameParts[3]
        cardinality = nameParts[4]
        spec = (size, dim, vptype, cardinality)

        queryFileName = 'query/query_%s_%s_%s_%s.txt' % spec
        datas = utils.getDataInFile(dataFileName)
        querys = utils.readDataFromFile(queryFileName)

        ndtDataFileName = 'ndt_data/data_%s_%s_%s_%s.txt' % spec
        ndtQueryFileName = 'ndt_query/query_%s_%s_%s_%s.txt' % spec

        if os.path.exists(ndtDataFileName):
            print('%s is exists' % ndtDataFileName)
        else:
            _writeEncodedRows(ndtDataFileName, datas, dictionary)
        if os.path.exists(ndtQueryFileName):
            print('%s is exsts' % ndtQueryFileName)
        else:
            # Previously the queries were loaded and the target path checked,
            # but the encoded query file was never written.
            _writeEncodedRows(ndtQueryFileName, querys, dictionary)
def convertNDDSToCDS(options):
    """Convert data and query rows to ``geh`` distance vectors and save them.

    Per-dimension symbol weights (1 - relative frequency) are derived from
    the data set.  Every data row and query row is then replaced by its
    ``geh`` distance to each vantage point.  Results go to
    ``cds_data/data_..._<typeOfVP>geh.txt`` and
    ``cds_query/query_..._<typeOfVP>geh.txt``; both paths are printed.

    Args:
        options: dict with the keys 'numberOfData', 'numberOfDimension',
            'distribution', 'numberOfAlphabet', 'numberOfVP' and 'typeOfVP'.

    Raises:
        KeyError: if a row matches a vantage point on a symbol that never
            occurs in that dimension of the data set.
    """
    size = options['numberOfData']
    dim = options['numberOfDimension']
    distribution = options['distribution']
    cardinality = options['numberOfAlphabet']
    numberOfVP = options['numberOfVP']
    typeOfVP = options['typeOfVP']

    sourceSpec = (size, dim, distribution, cardinality)
    dataFileName = 'data/data_%d_%d_%s_%d.txt' % sourceSpec
    queryFileName = 'query/query_%d_%d_%s_%d.txt' % sourceSpec
    vpFileName = 'vp/vp_%d_%d_%d_%s.txt' % (dim, numberOfVP, cardinality,
                                            typeOfVP)
    targetSpec = (size, dim, numberOfVP, distribution, cardinality, typeOfVP)
    cdsDataFileName = 'cds_data/data_%d_%d_%d_%s_%d_%sgeh.txt' % targetSpec
    cdsQueryFileName = 'cds_query/query_%d_%d_%d_%s_%d_%sgeh.txt' % targetSpec

    rows = utils.getDataInFile(dataFileName)
    queries = utils.readDataFromFile(queryFileName)
    vps = utils.readDataFromFile(vpFileName)
    print('%d %d %d' % (len(rows), len(queries), len(vps)))

    # weights[col][symbol] = 1 - (share of rows holding symbol in column col)
    rowTotal = float(len(rows))
    weights = []
    for col in xrange(dim):
        counts = {}
        for row in rows:
            counts[row[col]] = counts.get(row[col], 0) + 1
        weights.append(dict(
            (symbol, 1.0 - float(n) / rowTotal)
            for symbol, n in counts.items()))

    def geh(a, b):
        # A mismatch costs 1; a match costs the symbol's weight divided by dim.
        score = 0.0
        for pos in xrange(len(a)):
            if a[pos] == b[pos]:
                score += weights[pos][a[pos]] / float(dim)
            else:
                score += 1.0
        return score / float(dim)

    cdsDatas = [[geh(row, vp) for vp in vps] for row in rows]
    utils.writeDataToFile(cdsDataFileName, cdsDatas)

    cdsQuerys = [[geh(query, vp) for vp in vps] for query in queries]
    utils.writeDataToFile(cdsQueryFileName, cdsQuerys)
    print('%s %s' % (cdsDataFileName, cdsQueryFileName))
Exemple #14
0
def generateVantagePointsWithHybridAlgorithm(options):
    """Build vantage points from generated candidates and save them.

    The first vantage point is the most frequent symbol of each dimension.
    Further points are added one per loop iteration until ``numberOfVP`` are
    collected: a candidate is generated from every current vantage point for
    the smallest distance not yet recorded in the histogram ``d``, and the
    candidate with the best ``calculateMany`` result is kept.  When every
    histogram slot is non-zero, a random data row is appended instead.  The
    result is written to
    ``vp/vp_<dim>_<numberOfVP>_<cardinality>_<typeOfVP>.txt``.

    Args:
        options: dict with the keys 'numberOfData', 'numberOfDimension',
            'numberOfVP', 'numberOfAlphabet' and 'typeOfVP'; it is also
            handed to ``utils.getDataFileName`` to locate the input file.
    """
    numberOfData = options['numberOfData']
    dim = options['numberOfDimension']
    numberOfVP = options['numberOfVP']
    cardinality = options['numberOfAlphabet']
    typeOfVP = options['typeOfVP']
    datas = utils.getDataInFile(utils.getDataFileName(options))

    # Only majorPattern[0] is ever filled (k ranges over xrange(1)); the
    # remaining numberOfVP inner lists stay empty.
    majorPattern = [[] for i in xrange(numberOfVP + 1)]
    for j in xrange(dim):
        # Per-dimension symbol counts.  The first sighting stores 0, so the
        # counts are one less than the true frequency; the ranking is the same.
        d = {}
        for i in xrange(numberOfData):
            if datas[i][j] in d:
                d[datas[i][j]] += 1
            else:
                d[datas[i][j]] = 0
        d = sorted(d.items(), key=lambda x: x[1], reverse=True)
        for k in xrange(1):
            majorPattern[k].append(d[k][0])

    vps = []
    vps.append(majorPattern[0])
    # d is reused as a histogram: d[k] counts recorded vantage-point pairs
    # at Hamming distance k (0 .. dim).
    d = [0 for i in xrange(dim + 1)]
    # NOTE(review): one_pass starts False and is only ever assigned False, so
    # the `if one_pass:` branch below never runs.
    one_pass = False
    while len(vps) < numberOfVP:
        print len(vps)
        ans, ansDataIndex = -1, -1
        if one_pass:
            # Accept every data row that differs from all vantage points and
            # whose distances to them all fall in histogram slots with count <= 1.
            for i in xrange(len(datas)):
                ok = False
                for j in xrange(len(vps)):
                    dist = utils.hammingDistance(datas[i], vps[j])
                    if d[dist] > 1:
                        ok = True
                    if datas[i] == vps[j]:
                        ok = True
                if ok:
                    continue
                for j in xrange(len(vps)):
                    dist = utils.hammingDistance(datas[i], vps[j])
                    d[dist] += 1
                vps.append(datas[i])
            one_pass = False
        else:
            change = False
            # Handle only the smallest distance i whose histogram slot is
            # still empty (note the break at the end of the if-body).
            for i in xrange(dim + 1):
                if d[i] == 0:
                    change = True
                    ans, ans_vp = -1, ''
                    fx = 987654321
                    for j in xrange(len(vps)):
                        # presumably a point at Hamming distance i from
                        # vps[j] -- TODO confirm against generateVpWithDist
                        cur_vp = generateVpWithDist(dim, cardinality, vps[j],
                                                    i)
                        # Evaluate the list with the candidate temporarily
                        # appended; calculateMany returns a pair here.
                        vps.append(cur_vp)
                        cur, dists = calculateMany(vps)
                        # Keep the highest cur; on equal cur prefer the
                        # candidate with the smaller max(dists).
                        if cur > ans:
                            ans, ans_vp = cur, cur_vp
                            fx = max(dists)
                        elif cur == ans:
                            if max(dists) < fx:
                                ans, ans_vp = cur, cur_vp
                                fx = max(dists)
                        vps = vps[:-1]
                    # Record the winner's distances to the current points,
                    # then add it.
                    for j in xrange(len(vps)):
                        dist = utils.hammingDistance(vps[j], ans_vp)
                        d[dist] += 1
                    vps.append(ans_vp)
                    break
            if not change:
                # Every slot is non-zero: fall back to a random data row.
                # The histogram d is not updated for this row.
                vps.append(datas[random.randrange(0, numberOfData)])

    utils.writeDataToFile(
        'vp/vp_%d_%d_%d_%s.txt' % (dim, numberOfVP, cardinality, typeOfVP),
        vps)
def generateVantagePointsWithManyAlgorithm(options):
    numberOfData = options['numberOfData']
    dim = options['numberOfDimension']
    numberOfVP = options['numberOfVP']
    cardinality = options['numberOfAlphabet']
    typeOfVP = options['typeOfVP']
    datas = utils.getDataInFile(utils.getDataFileName(options))
    threshold = 2

    majorPattern = []
    for j in xrange(dim):
        d = {}
        for i in xrange(numberOfData):
            if datas[i][j] in d:
                d[datas[i][j]] += 1
            else:
                d[datas[i][j]] = 0
        d = sorted(d.items(), key=lambda x: x[1], reverse=True)
        majorPattern.append(d[0][0])

    vps = []
    vps.append(majorPattern)
    d = [0 for i in xrange(dim + 1)]
    isSelected = [False for i in xrange(len(datas))]
    for i in xrange(len(datas)):
        if datas[i] == majorPattern:
            isSelected[i] = True
    notChangedCount = 0
    while len(vps) < numberOfVP:
        print len(vps)
        changed = False
        for i in xrange(len(datas)):
            if isSelected[i]:
                continue
            is_pass = False
            for j in xrange(len(vps)):
                if datas[i] == vps[j]:
                    is_pass = True
                dist = utils.hammingDistance(datas[i], vps[j])
                if d[dist] > threshold:
                    is_pass = True
            if is_pass:
                continue
            for j in xrange(len(vps)):
                dist = utils.hammingDistance(datas[i], vps[j])
                d[dist] += 1
            vps.append(datas[i])
            isSelected[i] = True
            changed = True
        if not changed:
            print 'not changed so pop worst (%d)' % notChangedCount
            worstIdx = getWorstVP(vps)
            for j in xrange(len(vps)):
                if j == worstIdx: continue
                dist = utils.hammingDistance(vps[j], vps[worstIdx])
                d[dist] -= 1
            nextVPS = popDataAtIndex(vps, worstIdx)
            vps = nextVPS
            notChangedCount += 1
            if notChangedCount > numberOfVP / 2:
                notChangedCount = 0
            for k in xrange(notChangedCount - 1):
                worstIdx = getWorstVP(vps)
                for j in xrange(len(vps)):
                    if j == worstIdx: continue
                    dist = utils.hammingDistance(vps[j], vps[worstIdx])
                    d[dist] -= 1
                nextVPS = popDataAtIndex(vps, worstIdx)
                vps = nextVPS

        print len(vps)
        print d