Example 1
# Imports used by this snippet (Bunch and task are assumed to be helpers
# provided by the surrounding package).
import os
import shutil
import tempfile
import time
import uuid
from os.path import join as osp

import bottleneck as bn
import h5py
import numpy as np
import psutil
from h5py import h5s
from mpi4py import MPI


def aff_cluster(Sfn,
                conv_iter=15,
                max_iter=2000,
                damping=0.95,
                mpi=None,
                verbose=False,
                debug=False,
                *args,
                **kwargs):
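    """Affinity propagation clustering of the precomputed similarity matrix
    stored in the 'cluster' dataset of the HDF5 file Sfn, parallelized over
    MPI ranks. Writes cluster labels to 'aff_labels' and exemplar indices to
    'aff_centers' in the same file."""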

    comm, NPROCS, rank = mpi

    NPROCS_LOCAL = int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'])

    #Init storage for matrices
    #Get file name
    #Open matrix file in parallel mode
    SSf = h5py.File(Sfn, 'r+', driver='mpio', comm=comm)
    SSf.atomic = True
    #Open table with data for clusterization
    SS = SSf['cluster']
    SSs = SS.id.get_space()

    params = {
        'N': 0,
        'l': 0,
        'll': 0,
        'TMfn': '',
        'disk': False,
        'preference': 0.0
    }

    P = Bunch(params)

    ft = np.float32

    if rank == 0:

        N, N1 = SS.shape

        if N != N1:
            raise ValueError("S must be a square array \
                (shape=%s)" % repr((N, N1)))
        else:
            P.N = N

        try:
            preference = SS.attrs['preference']
        except KeyError:
            raise ValueError('Unable to get preference from cluster matrix')

        if max_iter <= 0:
            raise ValueError('max_iter must be > 0')

        if not 0 < conv_iter < max_iter:
            raise ValueError('conv_iter must lie in '
                             'interval between 0 and max_iter')

        if damping < 0.5 or damping >= 1:
            raise ValueError('damping must lie in interval between 0.5 and 1')

        print('#' * 10, 'Main params', '#' * 10)
        print('preference: %.3f' % preference)
        print('damping: %.3f' % damping)
        print('conv_iter: %d' % conv_iter)
        print('max_iter: %d' % max_iter)
        print('#' * 31)

        P.TMbfn = str(uuid.uuid1())
        P.TMfn = P.TMbfn + '.hdf5'

        # Magic 4 to fit MPI.Gather
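        # (the int8 exemplar flags are gathered with the 4-byte MPI.INT
        # datatype below, so N must be divisible by 4 * NPROCS)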
        r = N % (NPROCS * 4)
        N -= r
        l = N // NPROCS
        if r > 0:
            print('Truncating matrix to %sx%s to fit on %d procs'
                  % (N, N, NPROCS))
        P.N = N

        # Fit to memory
        MEM = psutil.virtual_memory().available / NPROCS_LOCAL
        # MEM = 500 * 10 ** 6
        ts = np.dtype(ft).itemsize * N  # itemsize is in bytes
        ts *= 8 * 1.1  # Allocate memory for e, tE, and ...
        # MEM -= ts  # ----
        tl = int(MEM // ts)  # Allocate memory for tS, tA, tR....

        def adjust_cache(tl, l):
            while float(l) % float(tl) > 0:
                tl -= 1
            return tl

        if tl < l:
            P.disk = True
            try:
                cache = 0
                #                cache = int(sys.argv[1])
                #                print sys.argv[1]
                assert cache < l
            except (AssertionError, IndexError, ValueError):
                cache = tl
                #print 'Wrong cache settings, set cache to %d' % tl
            tl = adjust_cache(tl, l)
            P.l = l
            P.ll = tl
        else:
            P.l = l
            P.ll = l

        if verbose:
            print "Available memory per process: %.2fG" % (MEM / 10.0**9)
            print "Memory per row: %.2fM" % (ts / 10.0**6)
            print "Estimated memory per process: %.2fG" \
                % (ts * P.ll / 10.0 ** 9)
            print 'Cache size is %d of %d' % (P.ll, P.l)

    P = comm.bcast(P)

    N = P.N
    l = P.l
    ll = P.ll

    ms = h5s.create_simple((ll, N))
    ms_l = h5s.create_simple((N, ))

    tb, te = task(N, NPROCS, rank)

    tS = np.ndarray((ll, N), dtype=ft)
    tSl = np.ndarray((N, ), dtype=ft)

    disk = P.disk

    if disk is True:
        TMLfd = tempfile.mkdtemp()
        TMLfn = osp(TMLfd, P.TMbfn + '_' + str(rank) + '.hdf5')
        TMLf = h5py.File(TMLfn, 'w')
        TMLf.atomic = True

        S = TMLf.create_dataset('S', (l, N), dtype=ft)
        Ss = S.id.get_space()

    #Copy input data and
    #place preference on diagonal
    z = -np.finfo(ft).max

    for i in range(tb, te, ll):
        SSs.select_hyperslab((i, 0), (ll, N))
        SS.id.read(ms, SSs, tS)

        if disk is True:
            Ss.select_hyperslab((i - tb, 0), (ll, N))
            S.id.write(ms, Ss, tS)

    if disk is True:
        R = TMLf.create_dataset('R', (l, N), dtype=ft)
        Rs = R.id.get_space()

    tRold = np.zeros((ll, N), dtype=ft)
    tR = np.zeros((ll, N), dtype=ft)
    tdR = np.zeros((l, ), dtype=ft)

    #Shared storage
    TMf = h5py.File(P.TMfn, 'w', driver='mpio', comm=comm)
    TMf.atomic = True

    Rp = TMf.create_dataset('Rp', (N, N), dtype=ft)
    Rps = Rp.id.get_space()

    tRp = np.ndarray((ll, N), dtype=ft)
    tRpa = np.ndarray((N, ll), dtype=ft)

    A = TMf.create_dataset('A', (N, N), dtype=ft)
    As = A.id.get_space()

    tAS = np.ndarray((ll, N), dtype=ft)
    tAold = np.ndarray((N, ll), dtype=ft)
    tA = np.ndarray((N, ll), dtype=ft)
    tdA = np.ndarray((l, ), dtype=ft)

    e = np.ndarray((N, conv_iter), dtype=np.int8)
    tE = np.ndarray((N, ), dtype=np.int8)
    ttE = np.ndarray((l, ), dtype=np.int8)

    converged = False
    cK = 0
    K = 0
    ind = np.arange(ll)

    for it in range(max_iter):
        if rank == 0:
            if verbose is True:
                print('=' * 10 + 'It %d' % it + '=' * 10)
                tit = time.time()
        # Compute responsibilities
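        # r(i,k) <- s(i,k) - max_{k' != k} [a(i,k') + s(i,k')], blended with
        # the previous iteration's responsibilities via the damping factor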
        for i in range(tb, te, ll):
            if disk is True:
                il = i - tb
                Ss.select_hyperslab((il, 0), (ll, N))
                S.id.read(ms, Ss, tS)
                #tS = S[i, :]
                Rs.select_hyperslab((il, 0), (ll, N))
                R.id.read(ms, Rs, tRold)
            else:
                tRold = tR.copy()

            As.select_hyperslab((i, 0), (ll, N))
            A.id.read(ms, As, tAS)
            #tAS = A[i, :]
            tAS += tS
            #tRold = R[i, :]

            tI = bn.nanargmax(tAS, axis=1)
            tY = tAS[ind, tI]
            tAS[ind, tI[ind]] = z
            tY2 = bn.nanmax(tAS, axis=1)

            tR = tS - tY[:, np.newaxis]
            tR[ind, tI[ind]] = tS[ind, tI[ind]] - tY2[ind]
            tR = (1 - damping) * tR + damping * tRold

            tRp = np.maximum(tR, 0)

            for il in range(ll):
                tRp[il, i + il] = tR[il, i + il]
                tdR[i - tb + il] = tR[il, i + il]

            if disk is True:
                R.id.write(ms, Rs, tR)
                #R[i, :] = tR

            Rps.select_hyperslab((i, 0), (ll, N))
            Rp.id.write(ms, Rps, tRp)

            #Rp[i, :] = tRp
        if rank == 0:
            if verbose is True:
                teit1 = time.time()
                print('R T %s' % (teit1 - tit))

        comm.Barrier()

        # Compute availabilities
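        # a(i,k) <- min(0, r(k,k) + sum_{i' not in {i,k}} max(0, r(i',k))),
        # while the self-availability a(k,k) keeps the full sum of positive
        # responsibilities; damped like the responsibilities above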
        for j in range(tb, te, ll):

            As.select_hyperslab((0, j), (N, ll))

            if disk is True:
                A.id.read(ms, As, tAold)
            else:
                tAold = tA.copy()

            Rps.select_hyperslab((0, j), (N, ll))
            Rp.id.read(ms, Rps, tRpa)
            #tRp = Rp[:, j]

            tA = bn.nansum(tRpa, axis=0)[np.newaxis, :] - tRpa
            for jl in range(ll):
                tdA[j - tb + jl] = tA[j + jl, jl]

            tA = np.minimum(tA, 0)

            for jl in range(ll):
                tA[j + jl, jl] = tdA[j - tb + jl]

            tA *= (1 - damping)
            tA += damping * tAold

            for jl in range(ll):
                tdA[j - tb + jl] = tA[j + jl, jl]

            A.id.write(ms, As, tA)

        if rank == 0:
            if verbose is True:
                teit2 = time.time()
                print('A T %s' % (teit2 - teit1))

        ttE = np.array(((tdA + tdR) > 0), dtype=np.int8)

        if NPROCS > 1:
            comm.Gather([ttE, MPI.INT], [tE, MPI.INT])
            comm.Bcast([tE, MPI.INT])
        else:
            tE = ttE
        e[:, it % conv_iter] = tE
        pK = K
        K = bn.nansum(tE)

        if rank == 0:
            if verbose is True:
                teit = time.time()
                cc = ''
                if K == pK:
                    cK += 1
                    if cK > 1:
                        cc = ' Conv %d of %d' % (cK, conv_iter)
                else:
                    cK = 0

                print('Total K %d T %s%s' % (K, teit - tit, cc))

        if it >= conv_iter:
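            # e holds the exemplar indicator for the last conv_iter
            # iterations; converged once every point's indicator has stayed
            # constant (all ones or all zeros) over that window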

            if rank == 0:
                se = bn.nansum(e, axis=1)
                converged = (bn.nansum((se == conv_iter) + (se == 0)) == N)

                converged = bool(converged) and (K > 0)
                if converged and verbose:
                    print("Converged after %d iterations." % it)

            converged = comm.bcast(converged, root=0)

        if converged is True:
            break

    if not converged and verbose and rank == 0:
        print("Failed to converge after %d iterations." % (max_iter))

    if K > 0:

        I = np.nonzero(e[:, 0])[0]
        C = np.zeros((N, ), dtype=np.int_)
        tC = np.zeros((l, ), dtype=np.int_)

        for i in range(l):
            if disk is True:
                Ss.select_hyperslab((i, 0), (1, N))
                S.id.read(ms_l, Ss, tSl)
            else:
                tSl = tS[i]

            tC[i] = bn.nanargmax(tSl[I])

        comm.Gather([tC, MPI.INT], [C, MPI.INT])

        if rank == 0:
            C[I] = np.arange(K)

        comm.Bcast([C, MPI.INT])

        for k in range(K):
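            # Within cluster k, promote the member whose total similarity to
            # the other cluster members is largest to be the exemplar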
            ii = np.where(C == k)[0]
            tN = ii.shape[0]

            tI = np.zeros((tN, ), dtype=np.float32)
            ttI = np.zeros((tN, ), dtype=np.float32)
            tttI = np.zeros((tN, ), dtype=np.float32)
            ms_k = h5s.create_simple((tN, ))

            j = rank
            while j < tN:
                ind = [(ii[i], ii[j]) for i in range(tN)]
                SSs.select_elements(ind)
                SS.id.read(ms_k, SSs, tttI)

                ttI[j] = bn.nansum(tttI)
                j += NPROCS

            comm.Reduce([ttI, MPI.FLOAT], [tI, MPI.FLOAT])

            if rank == 0:
                I[k] = ii[bn.nanargmax(tI)]

        I.sort()
        comm.Bcast([I, MPI.INT])

        for i in range(l):
            if disk is True:
                Ss.select_hyperslab((i, 0), (1, N))
                S.id.read(ms_l, Ss, tSl)
            else:
                tSl = tS[i]

            tC[i] = bn.nanargmax(tSl[I])

        comm.Gather([tC, MPI.INT], [C, MPI.INT])

        if rank == 0:
            C[I] = np.arange(K)

    else:
        if rank == 0:
            I = np.zeros(())
            C = np.zeros(())

    #Cleanup
    SSf.close()
    TMf.close()

    if disk is True:
        TMLf.close()
        shutil.rmtree(TMLfd)

    comm.Barrier()

    if rank == 0:

        os.remove(P.TMfn)

        if verbose:
            print('APN: %d' % K)

        if I.size and C.size:

            Sf = h5py.File(Sfn, 'r+', driver='sec2')

            if 'aff_labels' in Sf.keys():
                del Sf['aff_labels']

            LM = Sf.require_dataset('aff_labels', shape=C.shape, dtype=np.int_)
            LM[:] = C[:]

            if 'aff_centers' in Sf.keys():
                del Sf['aff_centers']

            CM = Sf.require_dataset('aff_centers', shape=I.shape, dtype=np.int_)
            CM[:] = I[:]
            Sf.close()
Example 2
from os.path import join as osp
import time

# Installed packages.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

########################################################################
# FILES

# Define mnist files. These were decompressed with 'gzip -d <file>'
MNIST_DIR = 'mnist'
TRAIN_IMAGES_FILE = osp(MNIST_DIR, 'train-images-idx3-ubyte')
TRAIN_LABELS_FILE = osp(MNIST_DIR, 'train-labels-idx1-ubyte')
TEST_IMAGES_FILE = osp(MNIST_DIR, 't10k-images-idx3-ubyte')
TEST_LABELS_FILE = osp(MNIST_DIR, 't10k-labels-idx1-ubyte')

TRAIN_FILE_CHAMBERLAND = osp(MNIST_DIR, 'mnist_train.csv')
TEST_FILE_CHAMBERLAND = osp(MNIST_DIR, 'mnist_test.csv')

# Output directory.
OUT_DIR = osp('..', '..', 'Challenges', '2Submissions', 'team1')

# Define output files.
LR_PRED_FILE = osp(OUT_DIR, '2challenge_logreg.csv')
LR_COEFF_FILE = osp(OUT_DIR, '2challenge_logreg_vectors.csv')
KNN_PRED_FILE = osp(OUT_DIR, '2challenge_knn.csv')
########################################################################
Example 3
#! /usr/bin/env python
import os
from flask import Flask, redirect, render_template, request, session
import yaml
import uuid
import zipfile
import urllib
#from urlparse import  urlparse as urlparse
from mendeley import Mendeley
from mendeley.session import MendeleySession
from syncrm import cli
from argparse import Namespace
from os.path import join as osp
from os.path import dirname as dirname
with open(osp(dirname(__file__), 'config.yml'), 'r') as f:
    config = yaml.safe_load(f)

REDIRECT_URI = 'http://*****:*****'

# (Flask app configuration and Mendeley OAuth setup were redacted in the
# source snippet; a plain Flask app object is assumed below.)
app = Flask(__name__)


@app.route('/')
def home():
    if 'token' in session:
        return redirect('/listDocuments')
ms = h5s.create_simple((ll, N))
ms_l = h5s.create_simple((N,))
ms_e = h5s.create_simple((1,))


tb, te = task(rank, l)

tS = np.ndarray((ll, N), dtype=ft)
tSl = np.ndarray((N,), dtype=ft)
tdS = np.ndarray((1,), dtype=ft)

disk = P.disk

if disk is True:
    TMLfd = tempfile.mkdtemp()
    TMLfn = osp(TMLfd, P.TMfn + '_' + str(rank) + '.hdf5')
    TMLf = h5py.File(TMLfn, 'w')

    S = TMLf.create_dataset('S', (l, N), dtype=ft)
    Ss = S.id.get_space()

#Copy input data and
#place preference on diagonal
preference = P.preference
random_state = np.random.RandomState(0)
x = np.finfo(ft).eps
y = np.finfo(ft).tiny * 100
z = - np.finfo(ft).max

for i in range(tb, te, ll):
    SSs.select_hyperslab((i, 0), (ll, N))