Example #1
0
class TransitionalModel(object):
    """Transition model built from an observation sequence and refined by node splitting.

    The constructor loads a training ``ObservationSequence`` and zero or more
    validation sequences, derives the initial transition matrix ``A`` and
    emission matrix ``B`` from the training transitions, pre-computes the
    candidate node splits and wraps everything in a ``HiddenMarkovModel``.
    ``optimize`` then alternates Baum-Welch training with node splits.

    Diagnostic output is written to stdout.  Every ``print`` call takes a
    single pre-formatted string so that it behaves identically whether
    ``print`` is a statement (Python 2) or a function.
    """

    # When true, validation sequences are restricted to the state space and
    # transitions admitted by the training set.
    _FILTER_TRANS = True

    # A predecessor row is a split candidate only if its smallest inner
    # product with the remaining rows does not exceed this value.
    _MAX_SPLIT_INNER = 0.2

    def __init__(self, trainpath, validpaths, max_trans=0, vnodes=False, hypotest=None, saveplots=False):
        """Load the datasets and build the initial HMM.

        Args:
            trainpath: non-empty path of the training sequence; forwarded to
                ``ObservationSequence(loadpath=...)``.
            validpaths: list of non-empty paths of validation sequences.
                Sequences that turn out to be empty are skipped.
            max_trans: non-negative int forwarded to ``ObservationSequence``.
            vnodes: bool forwarded to ``ObservationSequence``; also selects
                the ``'vnode'`` model name instead of ``'base'``.
            hypotest: if truthy, forwarded to
                ``hypotest_admissible_statespace_transitions`` and the training
                set is reloaded restricted to the returned state space and
                transitions; ``'+hypo'`` is appended to the model name.
            saveplots: if true, a plot directory derived from ``trainpath`` is
                created and handed to the HMM plotting calls.

        Raises:
            AssertionError: on arguments of the wrong type or value.
            OSError: from ``os.mkdir`` when the plot directory cannot be
                created (for instance because it already exists).
        """
        assert isinstance(trainpath, str)
        assert len(trainpath)
        assert isinstance(validpaths, list)
        for path in validpaths:
            assert isinstance(path, str)
            assert len(path)
        assert isinstance(max_trans, int)
        assert max_trans >= 0
        assert isinstance(vnodes, bool)
        assert isinstance(saveplots, bool)

        self._modname = 'vnode' if vnodes else 'base'
        if hypotest:
            self._modname += '+hypo'

        self._savepathbase = None
        if saveplots:
            self._savepathbase = "./%s-plots-%s-maxtrans%d/" % (trainpath, self._modname, max_trans)
            os.mkdir(self._savepathbase)

        # Load the training set; with a hypothesis test, reload it restricted
        # to the state space / transitions the test admits.
        self._trainset = ObservationSequence(loadpath=trainpath, max_trans=max_trans, vnodes=vnodes)
        if hypotest:
            statespace, transitions = self._trainset.hypotest_admissible_statespace_transitions(hypotest)
            self._trainset = ObservationSequence(loadpath=trainpath, max_trans=max_trans, vnodes=vnodes,
                                                 admissible_statespace=statespace,
                                                 admissible_transitions=transitions)

        statespace, transitions = self._trainset.admissible_statespace_transitions()

        if not self._trainset.outseq_len():
            print("warning, TransitionModel.__init__, empty trainset at path %s" % trainpath)

        self._validsets = self._load_validsets(validpaths, max_trans, vnodes, statespace, transitions)

        # Initial set of states and set of output symbols.
        self._init_states = sorted(self._trainset.states())
        self._N = len(self._init_states)
        self._symbols = sorted(self._trainset.symbols())
        self._M = len(self._symbols)

        # Extended alphabet: training symbols plus, when the validation sets
        # are not filtered, any symbol that only appears in a validation set.
        self._symbols_ext = copy.copy(self._symbols)
        if not self._FILTER_TRANS:
            self._extend_symbols_from_validsets()
        self._M_ext = len(self._symbols_ext)

        symbol2cam_enc = self._build_camera_maps()

        # Initial A, B matrices.
        A, B = self._build_initial_model()

        # Candidate node splits (stored in self._pending_splits).
        self._compute_node_splits()

        # Encode the datasets (symbols -> integer indices into _symbols_ext).
        self._trainset_enc = [self._symbol_enc(observ) for observ in self._trainset.outseq()]
        self._validsets_enc = [
            [self._symbol_enc(observ) for observ in dataset.outseq()]
            for dataset in self._validsets
        ]

        self.print_info()

        # Create the HMM model with node splitting.
        self._hmm = HiddenMarkovModel(A, B, copy.copy(self._init_states),
                                      self._trainset_enc,
                                      validsets=self._validsets_enc,
                                      symnames=copy.copy(self._symbols_ext),
                                      symbol2cam=symbol2cam_enc)

        self._transition_graph_threshold = 0.01
        self._hmm.plot_transition_graph(0., savepathbase=self._savepathbase, savetag='coverage-model')

    def _load_validsets(self, validpaths, max_trans, vnodes, statespace, transitions):
        """Load the validation sequences and return the non-empty ones."""
        validsets = []
        for validpath in validpaths:
            if self._FILTER_TRANS:
                dataset = ObservationSequence(loadpath=validpath, max_trans=max_trans, vnodes=vnodes,
                                              admissible_statespace=statespace,
                                              admissible_transitions=transitions)
            else:
                dataset = ObservationSequence(loadpath=validpath, max_trans=max_trans, vnodes=vnodes)

            if not dataset.outseq_len():
                print("TransitionModel, skipping empty validation dataset at path %s" % validpath)
                continue
            validsets.append(dataset)
        return validsets

    def _extend_symbols_from_validsets(self):
        """Append to ``_symbols_ext`` the validation symbols missing from it."""
        symbols = set()
        for dataset in self._validsets:
            symbols.update(dataset.symbols())

        print("symbols %s" % (symbols,))
        for sym in symbols:
            if sym not in self._symbols_ext:
                print("sym %s" % (sym,))
                print("TransitionalModel: appending sym=\"%s\" in valset but not in trainset" % sym)
                self._symbols_ext.append(sym)

    def _build_camera_maps(self):
        """Return a dict mapping each encoded symbol to the integer ids it names.

        A symbol is split on ``'-'``; each token is stripped of leading and
        trailing characters in ``'abcdef'`` and converted with ``int``.  The
        ids are presumably camera identifiers (hence "cam") -- confirm against
        ``HiddenMarkovModel``, which receives the result as ``symbol2cam``.

        Raises:
            ValueError: if a stripped token is not an integer literal.
        """
        print("self._symbols_ext %s" % (self._symbols_ext,))
        camsymbols = {}     # id -> symbols mentioning that id (diagnostics only)
        symbol2cam = {}     # symbol -> ids, in the order they appear in the symbol
        for sym in self._symbols_ext:
            print("sym =  %s" % (sym,))
            for token in sym.split('-'):
                print("  _id =  %s" % (token,))
                camid = int(token.strip('abcdef'))
                if camid not in camsymbols:
                    print("new cam id %s" % (token,))
                    camsymbols[camid] = set()
                camsymbols[camid].add(sym)
                symbol2cam.setdefault(sym, []).append(camid)

        for camid in list(camsymbols):
            camsymbols[camid] = sorted(camsymbols[camid])

        symbol2cam_enc = {}
        for sym, camids in symbol2cam.items():
            symbol2cam_enc[self._symbol_enc(sym)] = camids

        print("camsymbols= %s" % (camsymbols,))
        print("symbol2cam= %s" % (symbol2cam,))
        print("symbol2cam_enc= %s" % (symbol2cam_enc,))
        print("symbols_ext %s" % (self._symbols_ext,))
        return symbol2cam_enc

    def optimize(self, nu=0.01):
        """Train the HMM, applying every pending node split in turn.

        Baum-Welch is run until it reports completion, then, while splits are
        pending, one split is applied and Baum-Welch is run again.

        Args:
            nu: forwarded unchanged to ``HiddenMarkovModel.baumwelch``.
        """
        # Plot coverage.
        self._hmm.plot_transition_graph(self._transition_graph_threshold,
                                        savepathbase=self._savepathbase, savetag='coverage-model')

        # Run BW before the first node split.
        self._run_baumwelch(nu)

        while True:
            keys = sorted(self._pending_splits.keys())
            print("\n%d node-splits left" % len(keys))
            if not keys:
                break

            key = keys[0]
            splits = sorted(self._pending_splits[key], key=lambda split_data: split_data[0])
            assert len(splits)

            # NOTE(review): pop() takes the candidate with the LARGEST inner
            # product first; all candidates of the node are consumed anyway.
            split = splits.pop()
            if splits:
                self._pending_splits[key] = splits
            else:
                self._pending_splits.pop(key)

            # Intended procedure, for each current node:
            #   look at its backward star and at its forward star,
            #   build the rectangular transition matrix,
            #   identify the row most orthogonal to all the others;
            #   if one is found, split the node, rebalancing the probabilities.
            # A split is (inner, bsym, bstar_syms, fstar_syms, rates).
            _inner, bsym, bstar_syms, fstar_syms, rates = split
            assert numpy.isclose(rates.sum(), 1)

            state = self._state_enc(key)
            bstate = self._state_enc(bsym)
            bstar = [self._state_enc(sym) for sym in bstar_syms]
            fstar = [self._state_enc(sym) for sym in fstar_syms]

            self._hmm.split_state(state, bstate, bstar, fstar, rates)
            self._hmm.plot_transition_graph(self._transition_graph_threshold,
                                            savepathbase=self._savepathbase, savetag='node-splitting')

            self._run_baumwelch(nu)

    def _run_baumwelch(self, nu):
        """Call ``baumwelch(nu)`` until it returns a true value, plotting after each call."""
        while True:
            done = self._hmm.baumwelch(nu)
            self._hmm.plot_perf(savepathbase=self._savepathbase)
            self._hmm.plot_transition_graph(self._transition_graph_threshold,
                                            savepathbase=self._savepathbase)
            if done:
                break

    def _symbol_enc(self, sym):
        """Return the index of ``sym`` in the extended symbol list."""
        assert sym in self._symbols_ext
        return self._symbols_ext.index(sym)

    def _state_enc(self, state):
        """Return the index of ``state`` in the list of initial states."""
        assert state in self._init_states
        return self._init_states.index(state)

    def name(self):
        """Return the model name (``'base'`` or ``'vnode'``, optionally with ``'+hypo'``)."""
        return self._modname

    def nr_training_transitions(self):
        """Return the number of transitions in the training sequence (length - 1, at least 0)."""
        return max(0, self._trainset.outseq_len() - 1)

    def nr_validation_transitions(self, setid):
        """Return the number of transitions in the ``setid``-th retained validation set.

        Args:
            setid: index into the list of non-empty validation sets.

        Raises:
            AssertionError: if ``setid`` is not an int within range.
        """
        assert isinstance(setid, int)
        assert setid >= 0
        # _validsets is a plain list: bound the index by its length.
        assert setid < len(self._validsets)
        return max(0, self._validsets[setid].outseq_len() - 1)

    def _build_initial_model(self):
        """Build the initial matrices from the training transition counts.

        Returns:
            ``(A, B)``: ``A`` is ``N x N`` with ``A[src, dest]`` the observed
            fraction of transitions leaving ``src`` that reach ``dest``; rows
            that do not sum to one are replaced by a uniform distribution.
            ``B`` is ``M x N`` with a single 1 per state column, on the row of
            the symbol equal to the state, or on the row of symbol ``'0'`` for
            states that are not symbols.
        """
        dataset = self._trainset
        nr_states = self._N
        A = numpy.zeros((nr_states, nr_states))

        for src in self._init_states:
            nr_from = dataset.count_transitions_from(src)
            assert nr_from >= 0
            if nr_from == 0:
                continue

            src_idx = self._state_enc(src)
            for dest in self._init_states:
                nr_to = dataset.count_transitions_from_to(src, dest)
                assert nr_to >= 0
                if nr_to == 0:
                    continue
                # Force true division: with integer counts a plain "/" floors
                # to 0 on Python 2 and wipes out the estimated probabilities.
                A[src_idx, self._state_enc(dest)] = float(nr_to) / nr_from

        for i in range(nr_states):
            # A node may have no outgoing transitions, in which case its row
            # of A does not sum to one: use a uniform distribution instead.
            if not numpy.isclose(A[i, :].sum(), 1.):
                A[i, :] = numpy.ones(nr_states) / float(nr_states)

        B = numpy.zeros((self._M, nr_states))
        for state in self._init_states:
            if state in self._symbols:
                B[self._symbol_enc(state), self._state_enc(state)] = 1.
            else:
                # this must be one of the vnull states
                assert '~0~' in state
                B[self._symbol_enc('0'), self._state_enc(state)] = 1.

        return A, B

    def _compute_node_splits(self):
        """Compute the candidate node splits and store them in ``_pending_splits``.

        For every state with at least two predecessors and two successors a
        table is built with one row per predecessor and one column per
        successor, filled with the triplet counts and row-normalised to unit
        Euclidean norm.  A predecessor row becomes a split candidate when its
        smallest inner product with the following rows is at most
        ``_MAX_SPLIT_INNER``.

        ``_pending_splits`` maps a state to a list of tuples
        ``(inner, bstate, bstar_states, fstar_states, rates)`` where ``rates``
        is the candidate row rescaled to sum to one.
        """
        triplets = self._trainset.state_triplets_dict()
        fstar = self._trainset.state_fstar_dict()
        bstar = self._trainset.state_bstar_dict()

        splits = {}     # computed splits keyed by the id of the split node

        for state in set(self._trainset.states()):
            if state not in fstar or state not in bstar:
                # this can happen for symbols in the first and last triplet
                continue

            fstar_states = sorted(fstar[state])
            bstar_states = sorted(bstar[state])
            nr_fstar_states = len(fstar_states)
            nr_bstar_states = len(bstar_states)
            if nr_fstar_states < 2 or nr_bstar_states < 2:
                # the 'transition rate' table must be at least 2x2
                continue

            # Build the table, normalising each row to unit norm once filled.
            mat = numpy.zeros((nr_bstar_states, nr_fstar_states))
            for i, bstate in enumerate(bstar_states):
                for j, fstate in enumerate(fstar_states):
                    mat[i, j] = triplets.get((bstate, state, fstate), 0.)
                norm = numpy.linalg.norm(mat[i, :])
                if norm > 0.:
                    mat[i, :] /= norm

            # Rows without any observed triplet carry no evidence: dividing
            # them by their zero norm would only inject NaNs, so they are
            # kept at zero and excluded from the comparisons below.
            empty_rows = ~mat.any(axis=1)

            # Evaluate the inner products.
            matt = mat.transpose()
            for i in range(nr_bstar_states - 1):
                if empty_rows[i]:
                    continue
                bstate = bstar_states[i]
                row = mat[i, :]

                # All inner products are computed at once and only the
                # smallest is kept.  Pairs already evaluated in previous
                # iterations are skipped by setting them to 1.
                inners = row.dot(matt)
                inners[0:i] = 1.
                inners[empty_rows] = 1.
                inner = inners[numpy.argmin(inners)]
                if inner > self._MAX_SPLIT_INNER:
                    continue

                entry = (inner, bstate, bstar_states, fstar_states, row / row.sum())
                splits.setdefault(state, []).append(entry)

        nr_splits = 0
        print("\nNode splitting:")
        for state, state_splits in splits.items():
            nr_splits += len(state_splits)
            print("  Nr. of splits for node %s: %d" % (state, len(state_splits)))

        print("  Tot. nr. of splits %d" % nr_splits)
        self._pending_splits = splits

    def print_info(self):
        """Print the indexed training symbols and initial states to stdout."""
        print("transitional model dump: fill me")
        print("Symbols: ")
        for i, sym in enumerate(self._symbols):
            print("   %s %s" % (i, sym))
        print("States: ")
        for i, state in enumerate(self._init_states):
            print("   %s %s" % (i, state))