Example #1
def get_data(seed=123456, nDocTotal=32, T=1000, **kwargs):
    ''' Generate several data sequences, returned as a bnpy data-object

    Args
    -------
    seed : integer seed for random number generator,
          used for actually *generating* the data
    nDocTotal : total number of sequences to generate
    T : number of observations in each sequence

    Returns
    -------
    Data : bnpy GroupXData object, with nObsTotal observations
    '''
    fullX, fullZ, doc_range = get_X(seed, nDocTotal, T)
    X = np.vstack(fullX)
    Z = np.asarray(fullZ)

    nUsedStates = len(np.unique(Z))
    if nUsedStates < K:
        print('WARNING: NOT ALL TRUE STATES USED IN GENERATED DATA')

    Data = GroupXData(X=X, doc_range=doc_range, TrueZ=Z)
    Data.name = get_short_name()
    Data.summary = get_data_info()
    return Data
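These loaders all share the doc_range convention: entry n of doc_range is the row of the stacked matrix X where sequence n starts, and the final entry equals the total number of observations. A minimal sketch of that convention, with made-up values:

import numpy as np

# Illustrative values only: three sequences packed into one matrix.
X = np.random.randn(10, 2)           # 10 observations, 2 dimensions
doc_range = np.array([0, 4, 7, 10])  # sequences occupy rows [0:4), [4:7), [7:10)
for n in range(len(doc_range) - 1):
    start, stop = doc_range[n], doc_range[n + 1]
    print('sequence %d has %d observations' % (n, stop - start))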
Example #2
def get_data(meetingNum=1, **kwargs):
    ''' Load data for specified single sequence.

    Args
    ----
    meetingNum : int
        Identifies which sequence out of the 21 possible to use.
        Must be valid number in range [1,2,3, ... 21].

    Returns
    -------
    Data : GroupXData
        holding only the data for a single sequence.
    '''
    if meetingNum <= 0 or meetingNum > len(fileNames):
        raise ValueError('Bad value for meetingNum: %s' % (meetingNum))

    fName = fileNames[meetingNum - 1].replace(suffix, '')
    matfilepath = os.path.join(datasetdir, 'rawData', 'speakerDiarizationData',
                               fName)

    if not os.path.isfile(matfilepath):
        raise ValueError('CANNOT FIND SPEAKDIAR DATASET MAT FILE:\n' +
                         matfilepath)

    Data = GroupXData.read_from_mat(matfilepath)
    Data.summary = \
        'Pre-processed audio data from NIST file %s (meeting %d / 21)' \
        % (fName, meetingNum)
    Data.name = 'SpeakerDiar' + str(meetingNum)

    Data.fileNames = [fName]
    return Data
Example #3
def get_data(**kwargs):
    ''' Returns data from audio tracks
    '''

    if os.path.exists(DATAFILE_MAT):
        Data = GroupXData.LoadFromFile(DATAFILE_MAT)
    else:
        obs = []
        doc_range = [0]
        count = 0
        with h5py.File('../tracks.h5', 'r') as tracks:
            for track, grp in ProgressBar(tracks.items()):
                if 'gfccs' not in grp:
                    continue
                data = grp['gfccs']
                count += data.shape[0]
                doc_range.append(count)
                obs.append(data[()].astype(np.float64))  # h5py: .value is deprecated
        X = np.vstack(obs)
        Data = GroupXData(X=X, doc_range=doc_range)
        Data.save_to_mat(DATAFILE_MAT)
    Data.name = 'AudioCorpus'
    Data.summary = 'Audio Corpus. obs=10.5M docs=559'

    return Data
Example #4
def get_data(**kwargs):
    Data = GroupXData.read_mat(matfilepath)
    Data.summary = get_data_info()
    Data.name = get_short_name()
    # Verify that true state space is indexed starting at 0, not 1
    # Violating this can cause bugs in the alignment code
    assert Data.TrueParams['Z'].min() == 0
    assert Data.TrueParams['Z'].max() == 11
    return Data
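The assertions encode the zero-based labeling convention: true states must run from 0 to K-1 (here K = 12), because the alignment code indexes arrays by state label. If a .mat file stores MATLAB-style 1-based labels, a hedged helper like this (hypothetical, not part of the source) could normalize them first:

import numpy as np

def zero_index_labels(Z):
    # Shift 1-based labels down to the 0-based convention the
    # assertions above require; leave already-0-based labels alone.
    Z = np.asarray(Z)
    return Z - 1 if Z.min() == 1 else Z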
Example #5
def get_data(seed=8675309, nDocTotal=52, T=800, **kwargs):
    '''
      Args
      -------
      seed : integer seed for random number generator,
              used for actually *generating* the data
      nDocTotal : total number of sequences to generate
      T : number of observations in each sequence

      Returns
      -------
        Data : bnpy XData object, with nObsTotal observations
    '''
    X, Xprev, TrueZ, doc_range = genToyData(
        seed=seed, nDocTotal=nDocTotal, T=T)
    Data = GroupXData(X=X, TrueZ=TrueZ, Xprev=Xprev, doc_range=doc_range)
    Data.name = get_short_name()
    Data.summary = get_data_info()
    return Data
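Autoregressive datasets like this one pair each row of X with the previous observation in Xprev. The real construction lives inside genToyData; a sketch of one common convention (repeat the first observation of each sequence, since it has no predecessor):

import numpy as np

def make_xprev(X, doc_range):
    # Shift each sequence by one step so Xprev[t] holds X[t-1];
    # the first row of a sequence is paired with itself.
    Xprev = np.empty_like(X)
    for n in range(len(doc_range) - 1):
        start, stop = doc_range[n], doc_range[n + 1]
        Xprev[start] = X[start]
        Xprev[start + 1:stop] = X[start:stop - 1]
    return Xprev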
Example #6
def get_data(nDocTotal=200,
             nObsPerDoc=300,
             nLetterPerDoc=3,
             seed=0,
             dstart=0,
             **kwargs):
    ''' Generate data as GroupXData object

    Guarantees that each letter is used at least once every 26 docs.
    '''
    nLetters = 26
    PRNG = np.random.RandomState(seed)
    # Letters decay in probability from A to Z
    LetterProbs = np.ones(nLetters)
    for i in range(1, nLetters):
        LetterProbs[i] = 0.95 * LetterProbs[i - 1]
    LetterProbs /= LetterProbs.sum()

    X = np.zeros((nDocTotal * nObsPerDoc, 64))
    TrueZ = np.zeros(nDocTotal * nObsPerDoc)
    doc_range = np.zeros(nDocTotal + 1, dtype=np.int32)
    for d in range(nDocTotal):
        start_d = d * nObsPerDoc
        doc_range[d] = start_d
        doc_range[d + 1] = start_d + nObsPerDoc

        # Select subset of letters to appear in current document
        mustIncludeLetter = (dstart + d) % 26
        chosenLetters = PRNG.choice(nLetters,
                                    size=nLetterPerDoc,
                                    p=LetterProbs,
                                    replace=False)
        # Ensure the required letter appears, replacing the last
        # choice if it was not already selected
        if mustIncludeLetter not in chosenLetters:
            chosenLetters[-1] = mustIncludeLetter
        lProbs_d = LetterProbs[chosenLetters]
        lProbs_d /= lProbs_d.sum()
        nObsPerChoice = PRNG.multinomial(nObsPerDoc, lProbs_d)
        assert nObsPerChoice.sum() == nObsPerDoc
        start = start_d
        for i in range(nLetterPerDoc):
            TrueZ[start:(start + nObsPerChoice[i])] = chosenLetters[i]
            Lcovmat = letter2covmat(chr(CHRSTART + chosenLetters[i]))
            X[start:(start + nObsPerChoice[i])] = PRNG.multivariate_normal(
                np.zeros(64), Lcovmat, size=nObsPerChoice[i])
            start += nObsPerChoice[i]
    for i in range(nLetters):
        print(chr(CHRSTART + i), np.sum(TrueZ == i))
    return GroupXData(X=X, TrueZ=TrueZ, doc_range=doc_range)
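The loop that fills LetterProbs is a geometric decay, LetterProbs[i] = 0.95 ** i before normalization, so the same distribution can be built in two vectorized lines (equivalent sketch):

import numpy as np

LetterProbs = 0.95 ** np.arange(26)  # same decay the loop computes
LetterProbs /= LetterProbs.sum()     # normalize to a probability vector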
Example #7
def get_data(seed=DEFAULT_SEED, T=DEFAULT_LEN, **kwargs):
    ''' Generate toy data sequences, returned as a bnpy data-object

      Args
      -------
      seed : integer seed for random number generator,
              used for actually *generating* the data
      T : int number of observations in each sequence

      Returns
      -------
      Data : bnpy GroupXData object, with nObsTotal observations
    '''
    X, Xprev, Z, doc_range = get_X(seed, T)

    nUsedStates = len(np.unique(Z))
    if nUsedStates < K:
        print('WARNING: NOT ALL TRUE STATES USED IN GENERATED DATA')

    Data = GroupXData(X=X, Xprev=Xprev, doc_range=doc_range, TrueZ=Z)
    Data.name = get_short_name()
    Data.summary = get_data_info()
    return Data
Example #8
    def loadDataForBatch(self, batchID):
        ''' Load the data assigned to a particular batch

        Returns
        -------
        Dchunk : bnpy.data.DataObj subclass
        '''
        dpath = self.datafileList[batchID]
        if dpath.endswith('.ldac'):
            return BagOfWordsData.LoadFromFile_ldac(dpath, **self.DataInfo)
        elif self.dataset_type == 'GroupXData':
            return GroupXData.LoadFromFile(dpath, **self.DataInfo)
        else:
            return XData.read_file(dpath, **self.DataInfo)
Example #9
def get_data(seed=86758, seqLens=(3000, 3000, 3000, 3000, 500), **kwargs):
    ''' Generate several data sequences, returned as a bnpy data-object

    Args
    -------
    seed : integer seed for random number generator,
          used for actually *generating* the data
    seqLens : tuple of ints, number of observations in each sequence

    Returns
    -------
    Data : bnpy GroupXData object, with nObsTotal observations
    '''
    fullX, fullZ, seqIndices = get_X(seed, seqLens)
    X = np.vstack(fullX)
    Z = np.asarray(fullZ)
    doc_range = np.asarray(seqIndices)

    Data = GroupXData(X=X, doc_range=doc_range,
                      TrueZ=Z)
    Data.name = get_short_name()
    Data.summary = get_data_info()
    return Data
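Given per-sequence lengths like the seqLens default above, the boundary array is just a cumulative sum with a leading zero, which is presumably what get_X returns (sketch):

import numpy as np

seqLens = (3000, 3000, 3000, 3000, 500)
doc_range = np.hstack([0, np.cumsum(seqLens)])
# -> array([0, 3000, 6000, 9000, 12000, 12500])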
Example #10
def loadDataForSlice(filepath='', dataset_type='', **kwargs):
    """ Return data object loaded from specific file.

    Keyword args
    ------------
    workerID
    nWorkers
    """
    if filepath.endswith('.ldac'):
        return BagOfWordsData.LoadFromFile_ldac(filepath, **kwargs)
    else:
        if dataset_type == 'GroupXData':
            return GroupXData.LoadFromFile(filepath, **kwargs)
        else:
            return XData.LoadFromFile(filepath, **kwargs)
Example #11
def MakeGroupData(seed, nDoc, nObsPerDoc):
    ''' Make a GroupXData object
    '''
    PRNG = np.random.RandomState(seed)
    Pi = PRNG.dirichlet(gamma * np.ones(K), size=nDoc)
    XList = list()
    ZList = list()
    for d in range(nDoc):
        Npercomp = PRNG.multinomial(nObsPerDoc, Pi[d])
        for k in range(K):
            if Npercomp[k] < 1:
                continue
            Xcur_k = _sample_data_from_comp(k, Npercomp[k], PRNG)
            XList.append(Xcur_k)
            ZList.append(k * np.ones(Npercomp[k]))

    doc_range = np.arange(0, nDoc * nObsPerDoc + 1, nObsPerDoc)
    X = np.vstack(XList)
    TrueZ = np.hstack(ZList)
    return GroupXData(X=X, doc_range=doc_range, TrueZ=TrueZ)
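Because every document here has exactly nObsPerDoc observations, doc_range reduces to an arange with step nObsPerDoc. A few hedged sanity checks on the invariants GroupXData expects from these arrays (illustrative helper, not part of the source):

import numpy as np

def check_group_arrays(X, doc_range, TrueZ=None):
    # Boundaries start at 0, strictly increase (no empty documents),
    # and cover every row of X exactly once.
    assert doc_range[0] == 0
    assert np.all(np.diff(doc_range) > 0)
    assert doc_range[-1] == X.shape[0]
    if TrueZ is not None:
        assert TrueZ.shape[0] == X.shape[0]  # one label per observation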
Example #12
# print("right")
# x_eoc, x_prev_eoc, z_eoc, doc_range_eoc = read_data(path_right, z_value=0)
# print("straight")
# x_straight, x_prev_straight , z_straight , doc_range_straight = read_data(path_straight,doc_range=doc_range_eoc[-1],z_value=10)
# print("left")
# x_left, x_prev_left, z_left , doc_range_left = read_data(path_left,doc_range=doc_range_straight[-1],z_value=55)

# x = np.vstack((x_eoc,x_straight,x_left))
# x_prev = np.vstack((x_prev_eoc,x_prev_straight, x_prev_left))
# z = np.hstack((z_eoc, z_straight, z_left))
# doc_range = np.hstack((doc_range_eoc[:-1], doc_range_straight[:-1], doc_range_left))

print("total trajectories: ", doc_range.shape[0] - 1)

dataset = GroupXData(X=x, doc_range=doc_range, Xprev=x_prev)  #, TrueZ=z

# output_path_starter = '/media/ng/Vetnari/nakul_old_thinkpad/corl_all_data/all_cleaned_bags/new_outputs/'
output_path_starter = '/home/ng/workspace/corl_2019_all_code/output/'

###############################################################################
#
# Setup: Initialization hyperparameters
# -------------------------------------

init_kwargs = dict(
    K=20,
    initname='randexamples',
)

alg_kwargs = dict(
Example #13
def generateDataset(**kwargs):
    for key in Defaults:
        if key not in kwargs:
            kwargs[key] = Defaults[key]
    phi = makePhi(**kwargs)
    transPi = makePi(**kwargs)
    PRNG = np.random.RandomState(kwargs['seed'])

    nSeq = kwargs['nDocTotal']
    T_in = kwargs['T']

    if isinstance(T_in, str):
        Tvals = [int(T) for T in T_in.split(',')]
    else:
        Tvals = [T_in]

    if len(Tvals) == 1:
        seqLens = Tvals[0] * np.ones(nSeq, dtype=np.int32)
    elif len(Tvals) < nSeq:
        seqLens = np.tile(Tvals, nSeq)[:nSeq]
    else:
        seqLens = np.asarray(Tvals, dtype=np.int32)[:nSeq]

    doc_range = np.hstack([0, np.cumsum(seqLens)])
    N = doc_range[-1]
    allX = np.zeros((N, D))
    allZ = np.zeros(N, dtype=np.int32)

    startStates = [bgStateID, fgStateID]
    states0toKm1 = np.arange(K)
    # Each iteration generates one time-series/sequence
    # with starting state deterministically rotating among all states
    for i in range(nSeq):
        start = doc_range[i]
        stop = doc_range[i + 1]

        T = stop - start
        Z = np.zeros(T, dtype=np.int32)
        X = np.zeros((T, D))
        nConsec = 0

        Z[0] = startStates[i % len(startStates)]
        X[0] = PRNG.rand(D) < phi[Z[0]]
        for t in range(1, T):
            if nConsec > kwargs['maxTConsec']:
                # Force transition if we've gone on too long
                transPi_t = transPi[Z[t - 1]].copy()
                transPi_t[Z[t - 1]] = 0
                transPi_t /= transPi_t.sum()
            else:
                transPi_t = transPi[Z[t - 1]]
            Z[t] = PRNG.choice(states0toKm1, p=transPi_t)
            X[t] = PRNG.rand(D) < phi[Z[t]]
            if Z[t] == Z[t - 1]:
                nConsec += 1
            else:
                nConsec = 0
        allZ[start:stop] = Z
        allX[start:stop] = X

    TrueParams = dict()
    TrueParams['beta'] = np.mean(transPi, axis=0)
    TrueParams['phi'] = phi
    TrueParams['Z'] = allZ
    TrueParams['K'] = K
    return GroupXData(allX, doc_range=doc_range, TrueParams=TrueParams)
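The forced-transition branch above zeroes the self-transition probability and renormalizes the row, guaranteeing a state change after maxTConsec repeats. In isolation, with a made-up row:

import numpy as np

row = np.array([0.7, 0.2, 0.1])  # hypothetical transition row for state 0
row_forced = row.copy()
row_forced[0] = 0.0              # forbid staying in state 0
row_forced /= row_forced.sum()   # -> array([0., 2/3, 1/3])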
Example #14
 def train_image_specific_topics(self,
                                 y,
                                 sigma,
                                 Niter=50,
                                 Kfresh=100,
                                 pixelMask=None):
     print('Training %d image-specific clusters...' % Kfresh)
     D, patchSize, GP = self.D, int(np.sqrt(self.D)), self.GP
     # gather fully observable patches
     if pixelMask is None:  # gray-scale image denoising
         v = im2col(y, patchSize)
     else:  # color image inpainting
         C = 3
         patchMask = np.logical_not(
             np.any(im2col(pixelMask, patchSize), axis=0))
         v = np.hstack([
             im2col(y[:, :, c], patchSize)[:, patchMask]
             for c in range(C)
         ])
     v -= np.mean(v, axis=0)
     v = v.T
     testData = GroupXData(X=v, doc_range=[0, len(v)], nDocTotal=1)
     testData.name = 'test_image_patches'
     # set up hyper-parameters and run Bregman k-means
     cached_B_name = 'models/HDP/B.mat'
     xBar = loadmat(cached_B_name)['Cov']
     xBar2 = loadmat(cached_B_name)['Cov2']
     tmp0 = (np.diag(xBar) + sigma**2)**2
     tmp1 = np.diag(xBar2) + 6 * np.diag(xBar) * sigma**2 + 3 * sigma**4
     nu = D + 3 + 2 * np.sum(tmp0) / np.sum(tmp1 - tmp0)
     B = (nu - D - 1) * (xBar + sigma**2 * np.eye(D))
     obsModel = ZeroMeanGaussObsModel(D=D,
                                      min_covar=1e-8,
                                      inferType='memoVB',
                                      B=B,
                                      nu=nu)
     Z, Mu, Lscores = runKMeans_BregmanDiv(testData.X,
                                           Kfresh,
                                           obsModel,
                                           Niter=Niter,
                                           assert_monotonic=False)
     Korig = self.K
     Kall = np.max(Z) + Korig + 1
     Kfresh = Kall - Korig
     Z += Korig
     # load SuffStats of training images
     trainSS = loadSuffStatBag('models/HDP/SS.dump')
     trainSS.insertEmptyComps(Kfresh)
     # construct SuffStats of the test image
     DocTopicCount = np.bincount(Z, minlength=int(Kall)).reshape((1, Kall))
     DocTopicCount = np.array(DocTopicCount, dtype=np.float64)
     resp = np.zeros((len(Z), Kall))
     resp[np.arange(len(Z)), Z] = 1.0
     testLP = dict(resp=resp, DocTopicCount=DocTopicCount)
     alphaPi0 = np.hstack(
         (GP.alphaPi0, GP.alphaPi0Rem / (Kfresh + 1) * np.ones(Kfresh)))
     alphaPi0Rem = GP.alphaPi0Rem / (Kfresh + 1)
     testLP = updateLPGivenDocTopicCount(testLP, DocTopicCount, alphaPi0,
                                         alphaPi0Rem)
     testSS = self.patchModel.get_global_suff_stats(
         testData, testLP, doPrecompEntropy=1, doTrackTruncationGrowth=1)
     xxT = np.zeros((Kall, D, D))
     for k in range(Korig, Kall):
         idx = Z == k
         tmp = np.einsum('nd,ne->de', v[idx], v[idx])
         tmp -= testSS.N[k] * sigma**2 * np.eye(D)
         val, vec = np.linalg.eig(tmp)
         val[val < EPS] = EPS
         xxT[k] = np.dot(vec, np.dot(np.diag(val), vec.T))
     testSS.setField('xxT', xxT, dims=('K', 'D', 'D'))
     testSS.setUIDs(trainSS.uids)
     # combine training and test SS; update model parameters
     combinedSS = trainSS + testSS
     self.patchModel.update_global_params(combinedSS)
     self.calcGlobalParams()
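The prior construction near the end follows Inverse-Wishart moment matching: if Sigma ~ IW(nu, B) with nu > D + 1, then E[Sigma] = B / (nu - D - 1), so setting B = (nu - D - 1) * (xBar + sigma^2 * I) centers the prior on the noise-corrected empirical covariance. A tiny numeric check of that identity (illustrative values):

import numpy as np

D = 4
nu = D + 3.0
Sigma_target = np.eye(D)         # covariance the prior mean should hit
B = (nu - D - 1) * Sigma_target  # same construction as in the code above
assert np.allclose(B / (nu - D - 1), Sigma_target)  # IW mean identity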
Example #15
    with open(pickle_path, "wb") as f:
        pickle.dump(dict_obj_to_save, f)
else:
    with open(pickle_path, "rb") as f:
        dict_obj_to_save = pickle.load(f)
        list_of_empty_arrays = dict_obj_to_save['0']
        list_of_action_indices = dict_obj_to_save['1']
        file_names_list = dict_obj_to_save['2']
        list_of_full_data = dict_obj_to_save['3']
        x = dict_obj_to_save['4']
        x_prev = dict_obj_to_save['5']
        z = dict_obj_to_save['6']
        doc_range = dict_obj_to_save['7']

# for MOVO, use these params!
dataset = GroupXData(X=x, doc_range=doc_range, Xprev=x_prev)  # , TrueZ=z




list_of_old_skill_indices = []
list_of_new_skill_indices = []
list_of_skills = []
list_of_next_skills = []

filter_length = 50
state_window = 5

for trajectory_of_interest in range(doc_range.shape[0]-1):
    # trajectory_of_interest = 13
    print("-----------------"+str(trajectory_of_interest)+"----------------")
Example #16
# print("right")
# x_eoc, x_prev_eoc, z_eoc, doc_range_eoc = read_data(path_right, z_value=0)
# print("straight")
# x_straight, x_prev_straight , z_straight , doc_range_straight = read_data(path_straight,doc_range=doc_range_eoc[-1],z_value=10)
# print("left")
# x_left, x_prev_left, z_left , doc_range_left = read_data(path_left,doc_range=doc_range_straight[-1],z_value=55)

# x = np.vstack((x_eoc,x_straight,x_left))
# x_prev = np.vstack((x_prev_eoc,x_prev_straight, x_prev_left))
# z = np.hstack((z_eoc, z_straight, z_left))
# doc_range = np.hstack((doc_range_eoc[:-1], doc_range_straight[:-1], doc_range_left))

print("total trajectories: ", doc_range.shape[0] - 1)

dataset = GroupXData(X=x[:, -5:-2],
                     doc_range=doc_range,
                     Xprev=x_prev[:, -5:-2])  #, TrueZ=z

output_path_starter = '/media/ng/7ccf8f98-7ab8-498b-b405-54df784c3191/ng/workspace/bayesian_changepoint_detection/outputs/'

###############################################################################
#
# Setup: Initialization hyperparameters
# -------------------------------------

init_kwargs = dict(
    K=20,
    initname='randexamples',
)

alg_kwargs = dict(
Example #17
def get_data(**kwargs):
    Data = GroupXData.read_from_mat(matfilepath)
    Data.summary = get_data_info()
    Data.name = get_short_name()
    return Data
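Any of these get_data loaders can feed bnpy training directly. A hedged end-to-end sketch, assuming the bnpy.run interface from the project's documentation (model and algorithm names here are illustrative choices, not taken from these examples):

import bnpy

Data = get_data()
trained_model, info_dict = bnpy.run(
    Data, 'HDPHMM', 'DiagGauss', 'memoVB',
    K=20, initname='randexamples', nLap=50)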