Example #1
def trainer(z, split=3500, pre_train=True):
    """
    Trainer function for a LBL model
    """
    # Unpack some stuff
    ngrams = z['ngrams']
    labels = z['labels']
    instances = z['instances']
    word_dict = z['word_dict']
    index_dict = z['index_dict']
    context = z['context']
    vocabsize = len(z['word_dict'])

    # Load word embeddings
    if pre_train:
        embed_map = lm_tools.load_embeddings()
    else:
        embed_map = None

    # Initialize the network
    net = lbl.LBL(name='lbl',
                  loc='models/lbl.pkl',
                  seed=1234,
                  criteria='validation_pp',
                  k=5,
                  V=vocabsize,
                  K=50,
                  context=context,
                  batchsize=20,
                  maxepoch=100,
                  eta_t=0.2,
                  gamma_r=1e-4,
                  gamma_c=1e-5,
                  f=0.998,
                  p_i=0.5,
                  p_f=0.9,
                  T=20.0,
                  verbose=1)

    # Break up the data for training and validation
    inds = np.arange(len(ngrams))  # np.arange(5) = [0,1,2,3,4]
    prng = RandomState(net.seed)
    prng.shuffle(inds)

    # get the size of validation set
    ngramsV = [ngrams[i] for i in inds[-split:]]
    flat_ngramsV = [item for sublist in ngramsV for item in sublist]
    instance_split = len(flat_ngramsV)

    inds = np.arange(len(instances))
    prng = RandomState(net.seed)
    prng.shuffle(inds)

    X = instances[inds[:-instance_split]]
    V = instances[inds[-instance_split:]]
    Y = labels[inds[:-instance_split]]
    VY = labels[inds[-instance_split:]]

    # Train the network
    net.train(X, Y, V, VY, index_dict, word_dict, embed_map)
Example #2
def make_ratings(n_users, n_items, min_rating_per_user, max_rating_per_user,
                 rating_choices, seed=None, shuffle=True):
    """Randomly generate a (user_id, item_id, rating) array

    Returns
    -------
        ndarray with shape (n_samples, 3)

    """
    if not (isinstance(rating_choices, list) or
            isinstance(rating_choices, tuple)):
        raise ValueError("'rating_choices' must be a list or tuple")
    if min_rating_per_user < 0 or min_rating_per_user >= n_items:
        raise ValueError("invalid 'min_rating_per_user' invalid")
    if (min_rating_per_user > max_rating_per_user) or \
       (max_rating_per_user >= n_items):
        raise ValueError("invalid 'max_rating_per_user' invalid")

    rs = RandomState(seed=seed)
    user_arrs = []
    for user_id in xrange(n_users):
        item_count = rs.randint(min_rating_per_user, max_rating_per_user)
        item_ids = rs.choice(n_items, item_count, replace=False)
        ratings = rs.choice(rating_choices, item_count)
        arr = np.stack(
            [np.repeat(user_id, item_count), item_ids, ratings], axis=1)
        user_arrs.append(arr)

    ratings = np.array(np.vstack(user_arrs))
    ratings[:, 2] = ratings[:, 2].astype('float')
    if shuffle:
        rs.shuffle(ratings)
    return ratings
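A small usage sketch (the call values are made up, not from the source), assuming the make_ratings function above is in scope together with numpy as np and RandomState; note the function is written for Python 2 (xrange), so under Python 3 replace xrange with range. With a fixed seed the generated array is reproducible:

ratings = make_ratings(n_users=3, n_items=10,
                       min_rating_per_user=2, max_rating_per_user=5,
                       rating_choices=[1, 2, 3, 4, 5],
                       seed=0, shuffle=True)
# ratings has shape (n_samples, 3) with columns (user_id, item_id, rating)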
Example #3
def split(dataset, test_size=0.5, random_state=None):
    if random_state is None:
        random_state = np.random.randint(0, 999999)
    nb = dataset.X.shape[0]
    nb_test = int(nb * test_size)
    nb_train = nb - nb_test
    rng = RandomState(random_state)
    indices = np.arange(0, nb)
    rng.shuffle(indices)
    indices_train = indices[0:nb_train]
    indices_test = indices[nb_train:]

    X = dataset.X[indices_train]
    if hasattr(dataset, 'y') and dataset.y is not None:
        y = dataset.y[indices_train]
    else:
        y = None
    dataset_train = Manual(X, y)
    if hasattr(dataset, "img_dim"):
        dataset_train.img_dim = dataset.img_dim
    if hasattr(dataset, "output_dim"):
        dataset_train.output_dim = dataset.output_dim

    X = dataset.X[indices_test]
    if hasattr(dataset, 'y') and dataset.y is not None:
        y = dataset.y[indices_test]
    else:
        y = None
    dataset_test = Manual(X, y)
    if hasattr(dataset, "img_dim"):
        dataset_test.img_dim = dataset.img_dim
    if hasattr(dataset, "output_dim"):
        dataset_test.output_dim = dataset.output_dim
    return dataset_train, dataset_test
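The core of the function above is the seeded shuffled-index split. A minimal self-contained sketch of just that pattern, using plain numpy arrays in place of the dataset/Manual wrapper (shapes, names, and the seed here are made up for illustration):

import numpy as np
from numpy.random import RandomState

X = np.arange(20).reshape(10, 2)      # toy features
y = np.arange(10)                     # toy labels

rng = RandomState(42)                 # fixed seed -> reproducible split
indices = np.arange(X.shape[0])
rng.shuffle(indices)                  # in-place permutation of the index array

nb_test = int(X.shape[0] * 0.5)
nb_train = X.shape[0] - nb_test
train_idx, test_idx = indices[:nb_train], indices[nb_train:]
X_train, y_train = X[train_idx], y[train_idx]
X_test, y_test = X[test_idx], y[test_idx]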
Example #4
    def train(self, X, XY, V, VY, count_dict, word_dict, embed_map):
        """
        Trains the LBL
        """
        self.start = self.seed
        self.init_params(embed_map, count_dict, XY)
        inds = np.arange(len(X))
        numbatches = len(inds) / self.batchsize
        curr = 1e20
        counter = 0
        target=None
        num = 15000

        # Main loop
        stop.display_phase(1)
        for epoch in range(self.maxepoch):
            self.epoch = epoch
            tic = time.time()
            prng = RandomState(self.seed + epoch + 1)
            prng.shuffle(inds)
            for minibatch in range(numbatches):

                batchX = X[inds[minibatch::numbatches]]
                batchY = XY[inds[minibatch::numbatches]]
            
                (words, acts, preds) = self.forward(batchX)
                self.backward(batchY, preds, acts, words, batchX)
                self.update_params(batchX)

            self.update_hyperparams()
            toc = time.time()

            # Results and stopping criteria
            obj = self.compute_obj(X[:num], XY[:num])
            obj_val = self.compute_obj(V[:num], VY[:num])

            if self.verbose > 0:
                stop.display_results(epoch, toc-tic, obj, obj_val)
            (curr, counter) = stop.update_result(curr, obj_val, counter)
            if counter == 0:
                stop.save_model(self, self.loc)
                stopping_target = obj

            if stop.criteria_complete(self, epoch, curr, obj, counter, 
                self.k, obj_val, target):
                if self.criteria == 'maxepoch':
                    break
                elif self.criteria == 'validation_pp':
                    self = stop.load_model(self.loc)
                    counter = 0
                    X = np.r_[X, V]
                    XY = vstack([XY, VY]).tocsr()
                    self.criteria = 'll_train_heldout'
                    target = stopping_target   #obj
                    stop.display_phase(2)
                    inds = range(X.shape[0])
                    prng.shuffle(inds)
                    numbatches = len(inds) / self.batchsize
                elif self.criteria == 'll_train_heldout':
                    break
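The epoch loop above uses two idioms worth noting: reseeding with RandomState(seed + epoch + 1) gives a different but reproducible shuffle every epoch, and the strided slice inds[minibatch::numbatches] deals the shuffled indices out into minibatches. A tiny standalone illustration (the sizes and seed are arbitrary, not from the source):

import numpy as np
from numpy.random import RandomState

seed, batchsize = 1234, 4
X = np.arange(20)
inds = np.arange(len(X))
numbatches = len(inds) // batchsize
for epoch in range(2):
    prng = RandomState(seed + epoch + 1)         # a different but reproducible shuffle each epoch
    prng.shuffle(inds)
    for minibatch in range(numbatches):
        batch = X[inds[minibatch::numbatches]]   # every numbatches-th shuffled index -> one minibatch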
Example #5
def shuffle_data(X, L, seed=1234):
    """
    Shuffle the data
    """
    prng = RandomState(seed)
    inds = np.arange(len(X))
    prng.shuffle(inds)
    X = [X[i] for i in inds]
    L = L[inds]
    return (X, L)    
Example #6
def txt2im(net, z, txt, k=5, search=100, seed=1234):
    """
    Given text query txt, retrieve the top-k images from z['IM']
    For speed, only searches over a random subset of 'search' images
    """
    inds = np.arange(len(z['IM']))
    prng = RandomState(seed)
    prng.shuffle(inds)  
    ims = lm_tools.txt2im(net, txt, z['IM'][inds[:search]], z['word_dict'], k=k)
    return inds[ims]
Example #7
def train_test_split(X, y, test_size, random_state):
    indices = numpy.arange(len(X))
    prng = RandomState(random_state)
    prng.shuffle(indices)
    e_ind = int(round(len(X) * test_size))
    training_idx = indices[e_ind:len(X)]
    test_idx = indices[0:e_ind]
    X_training, X_test = X[training_idx, :], X[test_idx, :]
    y_training, y_test = y[training_idx, :], y[test_idx, :]
    rVal = [X_training, X_test, y_training, y_test]
    return rVal
Example #8
def shuffle(seed=None, *args):
	"""
	Shuffles the given lists in parallel.
	"""
	indices = range(len(args[0]))
	prng = RandomState(seed)	
	prng.shuffle(indices)
	shuffled = []
	for i,lst in enumerate(args):
		shuffled.append([lst[k] for k in indices])
	return shuffled
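For reference, a small standalone sketch of the same parallel-shuffle idea (the function name is mine); under Python 3 the index sequence needs to be materialized with list(), since RandomState.shuffle cannot shuffle a range object in place:

from numpy.random import RandomState

def shuffle_parallel(seed, *lists):
    """Shuffle several equal-length lists with the same permutation."""
    indices = list(range(len(lists[0])))
    prng = RandomState(seed)
    prng.shuffle(indices)
    return [[lst[k] for k in indices] for lst in lists]

words, tags = shuffle_parallel(0, ['a', 'b', 'c'], [1, 2, 3])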
Example #9
    def _iter_fast(self, ds, batch_size, start=None, end=None,
            shuffle=True, seed=None):
        # create random seed
        prng1 = None
        prng2 = _dummy_shuffle
        if shuffle:
            if seed is None:
                seed = get_random_magic_seed()
            prng1 = RandomState(seed)
            prng2 = RandomState(seed)

        batches = create_batch(ds.shape[0], batch_size, start, end, prng1)
        prng2.shuffle(batches)
        for i, j in batches:
            data = ds[i:j]
            yield self._normalizer(data[prng2.permutation(data.shape[0])])
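One property the iterator above leans on: two RandomState instances constructed from the same seed emit identical random streams, so a single seed value makes both the batch layout (prng1) and the shuffling (prng2) reproducible. A tiny standalone check:

import numpy as np
from numpy.random import RandomState

seed = 1234
prng1 = RandomState(seed)
prng2 = RandomState(seed)
assert np.array_equal(prng1.permutation(10), prng2.permutation(10))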
Example #10
def main():
    from numpy.random import RandomState
    rng = RandomState(1337)

    # get samples
    os.chdir(data)
    samples=glob("*.zip")

    # shuffle or sort
    # samples.sort()
    rng.shuffle(samples)

    # remove previously unzipped files
    for i in set(glob("*")).difference(samples): shutil.rmtree(i) 

    print len(samples), "samples found"

    #start preprocessing
    gather_stats(samples)
Example #11
def lhcSample(bounds, N, seed=None):
    """
    Perform latin hypercube sampling.
    
    @param bounds:  sequence of [min, max] bounds for the space
    @param N:       number of samples
    
    @return: list of samples points (represented as arrays)
    """
    rs = RandomState(seed)
    samp = []
    for bmin, bmax in bounds:
        if bmin == bmax:
            dsamp = array([bmin] * N)
        else:
            dsamp = (bmax - bmin) * rs.rand(N) / N + arange(bmin, bmax, (bmax - bmin) / N)
        rs.shuffle(dsamp)
        samp.append(dsamp)

    return list(vstack(samp).T)
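A self-contained sketch of the same stratified-then-shuffled idea using only numpy (the function and variable names here are mine, not from the source): each dimension gets exactly one uniform draw per bin, and an independent shuffle per dimension randomizes how the dimensions pair up:

import numpy as np
from numpy.random import RandomState

def lhc_sample(bounds, n, seed=None):
    rs = RandomState(seed)
    cols = []
    for bmin, bmax in bounds:
        edges = np.linspace(bmin, bmax, n, endpoint=False)  # left edge of each of n equal bins
        col = edges + rs.rand(n) * (bmax - bmin) / n        # one uniform draw inside each bin
        rs.shuffle(col)                                     # shuffle per dimension to decouple the pairing
        cols.append(col)
    return np.vstack(cols).T                                # shape (n, len(bounds)), one sample per row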
Example #12
def load_pairs(dataset,im_type,step_size=[]):
    offset=0
    max_difference=0.2
    random_state=42
    dsRawData=load_data(dataset,offset,max_difference)
    dir_list=dsRawData[0]
    data_y=dsRawData[1]

    X_Pairs=[]
    for s in step_size:
        tmp_X_train,tmp_y_train,tmp_overlaps_train=prepare_data(s,dir_list,data_y)

        if(len(X_Pairs)==0):
            X_Pairs=tmp_X_train
        else:
            X_Pairs=numpy.concatenate((X_Pairs,tmp_X_train))

    X_Pairs=numpy.asarray(X_Pairs)
    prng = RandomState(random_state)
    prng.shuffle(X_Pairs)

    return X_Pairs
Example #13
def permute_rows(m, prng=None):
    """
    Permute the rows of a matrix in-place

    Parameters
    ----------
    m : array-like
        A 2-d array
    prng : RandomState instance or None, optional (default=None)
        If RandomState instance, prng is the pseudorandom number generator;
        If None, the pseudorandom number generator is the RandomState
        instance used by `np.random`.

    Returns
    -------
    None
        Original matrix is permuted in-place; nothing is returned
    """
    if prng is None:
        prng = RandomState()

    for row in m:
        prng.shuffle(row)
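A short usage sketch, assuming the permute_rows helper above is in scope (the array contents are made up); passing an explicit seeded RandomState makes the independent per-row permutations reproducible:

import numpy as np
from numpy.random import RandomState

m = np.arange(12).reshape(3, 4)
permute_rows(m, prng=RandomState(0))   # each row of m is shuffled in-place, independently
print(m)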
Example #14
    def test_end2end_known_test_data(self):
        if rs.app.config['RUN_TESTS']:
            # training/test data and output files
            #label_file = '../data/input/SDS_PV2_combined/SDS_PV2_class_labels.txt'
            label_file = rs.app.config['LABEL_FILE']
            #self.report_path = '../data/input/SDS_PV2_combined/reports'
            #self.report_path = rs.app.config['TEXT_REPORT_DIR']
            label_data = pd.read_csv(label_file)
            key_start = int(rs.app.config['REGION_COL_START'])
            key_stop = int(rs.app.config['REGION_COL_STOP'])+1
            region_keys = label_data.columns[key_start:key_stop]
            # set the numpy random seed so results are reproducible
            randstate = RandomState(987654321)

            # partition the data
            pos_cases, neg_cases = wrangle.partion(label_data['doc_norm']==1, label_data, ratios=[0.8,0.2])
            test_mask = np.concatenate((pos_cases[1], neg_cases[1]))
            randstate.shuffle(test_mask)
            test_labels = label_data.iloc[test_mask]
            #report_path = '../data/input/SDS_PV2_combined/reports'
            report_path = rs.app.config['TEXT_REPORT_DIR']
            test_reports = [self.load_report('{0}/{1}.txt'.format(report_path, pid)) for pid in test_labels['pid']]
            min_acc = float(rs.app.config['MIN_ACCURACY'])

            # send reports individually as multiple requests
            accuracy = [0,0,0,0]
            region_labels = ['inner','middle', 'outer', 'mastoid']
            for idx, tpl in enumerate(zip(test_labels['pid'],test_reports)):
                data = {tpl[0]:tpl[1]}
                request_body = json.dumps(data)
                rv = self.app.post('/classify',data=request_body, content_type='application/json')
                rdata = json.loads(rv.data)
                for jdx,label in enumerate(region_labels):
                    act = test_labels[label].iloc[idx]
                    pred = rdata[tpl[0]][jdx]
                    if act == pred:
                        accuracy[jdx] += 1

            accuracy = [v/float(len(test_labels)) for v in accuracy]
            for v in accuracy:
                self.assertGreater(v, min_acc, 'Failed accuracy on individual post test {0}'.format(accuracy))

            # send reports in one batch request
            accuracy = [0,0,0,0]
            region_labels = ['inner','middle', 'outer', 'mastoid']
            data = {}
            for idx, tpl in enumerate(zip(test_labels['pid'],test_reports)):
                data[tpl[0]]=tpl[1]
            request_body = json.dumps(data)
            rv = self.app.post('/classify',data=request_body, content_type='application/json')
            rdata = json.loads(rv.data)
            for idx,pid in enumerate(test_labels['pid']):
                for jdx,label in enumerate(region_labels):
                    act = test_labels[label].iloc[idx]
                    pred = rdata[pid][jdx]
                    if act == pred:
                        accuracy[jdx] += 1

            accuracy = [v/float(len(test_labels)) for v in accuracy]
            for v in accuracy:
                self.assertGreater(v, min_acc, 'Failed accuracy on batch post test {0}'.format(accuracy))
Example #15
    os.mkdir(save_path + "/" + model_name)
    with open(save_path + '/' + model_name + "/arguments.txt", "w") as f:
        f.write(str(args))

    prng = RandomState(random_state)

    lexicon = get_lexicon()

    classes = {j: i for i, j in enumerate(lexicon)}
    inverse_classes = {v: k for k, v in classes.items()}
    print(" [INFO] %s" % classes)

    if mjsynth:
        train = open(os.path.join(path, training_fname), "r").readlines()
        train = parse_mjsynth(path, train)
        prng.shuffle(train)

        val = np.array(open(os.path.join(path, val_fname), "r").readlines())
        val = parse_mjsynth(path, val)

    else:
        train = [
            os.path.join(dp, f) for dp, dn, filenames in os.walk(path)
            for f in filenames if re.search('png|jpeg|jpg', f)
        ]
        prng.shuffle(train)

        length = len(train)
        train, val = (train[:int(length * train_portion)],
                      train[int(length * train_portion):])
Example #16
def tabu_search(core, z, neighbordict, numP, w, floor, floor_variable, lockSoln, lockflag, lockVar, maxfailures=50, maxiterations=15):
        
    ##Pseudo constants
    pid = mp.current_process()._identity[0]
    tabu_list = deque(maxlen=sharedupdate[2][core])#What is this core's tabu list length? 
    maxiterations *= cores #Test synchronize cores to exit at the same time.
        
    maxfailures += int(maxfailures*uniform(-1.1, 1.2))  # James et al. 2007
    
    def _tabu_check(tabu_list, neighbor, region, old_membership):
        if tabu_list:#If we have a deque with contents
            for tabu_region in(tabu_list):
                if neighbor == tabu_region[0]:
                    #print neighbor, tabu_region[0]
                    if region == tabu_region[1] and old_membership == tabu_region[2]:
                        return False    
    

    def _diversify_soln(core_soln_column, neighbordict,z, floor_variable,lockSoln):
        #Initialize a local swap space to store n best diversified soln - these do not need to be better 
        div_soln_space = np.empty(sharedSoln.shape)
        div_soln_space[:] = np.inf
        div_variance = np.array([sharedVar[:,core_soln_column]] * sharedVar.shape[1]).T
        workingSoln = np.copy(sharedSoln[0:,core_soln_column])            
        workingVar = np.copy(sharedVar[:,core_soln_column])

        #Iterate through the regions and check all moves, store the 4 best.
        for region in np.unique(workingSoln[1:]):
            members = np.where(workingSoln == region)[0]
            neighbors = []
            for member in members:
                candidates = neighbordict[member-1]#neighbordict is 0 based, member is 1 based
                candidates = [candidate for candidate in candidates if candidate not in members]
                candidates = [candidate for candidate in candidates if candidate not in neighbors]
                neighbors.extend(candidates)
            candidates = []
            
            #Iterate through the neighbors
            for neighbor in neighbors:
                neighborSoln = np.copy(workingSoln[1:]) #Pull a copy of the local working version
                old_membership = neighborSoln[neighbor]#Track where we started to check_floor
                
                neighborSoln[neighbor] = region #Move the neighbor into the new region in the copy
                
                # Check the floor constraint for both affected regions
                if not np.sum(floor_variable[neighborSoln == old_membership]) >= floor:
                    continue
                if not np.sum(floor_variable[neighborSoln == region]) >= floor:
                    continue

                #Check contiguity of the swap
                block = np.where(workingSoln[1:] == neighbor)[0].tolist()
                if not check_contiguity(neighbordict, block, neighbor):
                    continue                
                
                #Compute the local variance
                varcopy = np.copy(workingVar)
                varcopy[old_membership] = np.var(z[neighborSoln == old_membership])
                varcopy[region] = np.var(z[neighborSoln == region])
                varcopy[0] = np.sum(varcopy[1:])           
                
                if np.any(div_soln_space[0] == np.inf):
                    column = np.where(np.isinf(div_soln_space[0]) == True)[0][0]
                    div_soln_space[1:,column] = neighborSoln[:]
                    div_soln_space[0:,column][0] = varcopy[0]
                    div_variance[:,column] = varcopy
                else:
                    column = np.argmax(div_soln_space[0])
                    div_soln_space[1:,column] = neighborSoln[:]
                    div_soln_space[0:,column][0] = varcopy[0]
                    div_variance[:,column] = varcopy
        #Write one of the neighbor perturbations to the shared memory space to work on.
        valid = np.where(div_soln_space[0] != np.inf)[0]
        #print sharedVar[:,core_soln_column]
        try:
            selection = randint(0,len(valid)-1)
            with lockSoln:
                sharedSoln[:,core_soln_column] = div_soln_space[:,selection]
            with lockVar:
                sharedVar[:,core_soln_column] = div_variance[:,selection]
                #print sharedVar[:,core_soln_column]
        except:
            pass
            #print div_soln_space[0]
            print "Attempt to diversify failed."
 
    ##This shows that we are operating asynchronously.   
    #if core ==2:
        #time.sleep(5)
    
    while sum(sharedupdate[1]) < maxiterations:
        core_soln_column = (core + sharedupdate[1][core])%len(sharedupdate[1])
        
        if sharedupdate[0][core_soln_column] == False:
            _diversify_soln(core_soln_column, neighbordict,z, floor_variable,lockSoln) #Li, et. al (in press - P-Compact_Regions)

        #print "ProcessID %i is processing soln column %i in iteration %i."%(pid, core_soln_column,sharedupdate[1][core]) #Uncomment to see that cores move around the search space    
        failures = 0 #The total local failure counter
        local_best_variance = sharedSoln[:,core_soln_column][0]
        workingSoln = np.copy(sharedSoln[:,core_soln_column])    
        workingVar = np.copy(sharedVar[:,core_soln_column])
        
        while failures <= maxfailures:
            #Select a random starting point in the search space.
            nr = np.unique(workingSoln[1:]) #This is 0 based, ie. region 0 - region 31
            regionIDs = nr
            changed_regions = np.ones(len(nr))
            randstate = RandomState(pid) #To 'unsync' the cores we need to instantiate a random class with a unique seed.
            randstate.shuffle(regionIDs) #shuffle the regions so we start with a random region
            changed_regions[:] = 0
            swap_flag = False #Flag to stop looping prior to max iterations if we are not improving.
            
            #Iterate through the regions, checking potential swaps
            for region in regionIDs:
                members = np.where(workingSoln == region)[0] #get the members of the region
                #Get the neighbors to the members.  Grab only those that could change.
                neighbors = []
                for member in members:
                    candidates = neighbordict[member-1]#neighbordict is 0 based, member is 1 based
                    candidates = [candidate for candidate in candidates if candidate not in members]
                    candidates = [candidate for candidate in candidates if candidate not in neighbors]
                    neighbors.extend(candidates)
                candidates = []
                                
                #Iterate through the neighbors
                for neighbor in neighbors:
                    neighborSoln = np.copy(workingSoln[1:]) #Pull a copy of the local working version
                    old_membership = neighborSoln[neighbor]#Track where we started to check_floor
                    # For whatever reason the candidates block adds other units already in the region, i.e. we test moving from region 1 to region 1...
                    '''TODO: Check the candidates code above, something is wrong with it... testing more than necessary'''
                    if old_membership == region:
                        continue
                    
                    #Check the tabu list
                    tabu_move_check = _tabu_check(tabu_list, neighbor, region, old_membership)
                    if tabu_move_check is not None:
                        continue
                    
                    neighborSoln[neighbor] = region #Move the neighbor into the new region in the copy

                    # Check the floor constraint for both affected regions
                    if not np.sum(floor_variable[neighborSoln == old_membership]) >= floor:
                        continue
                    if not np.sum(floor_variable[neighborSoln == region]) >= floor:
                        continue

                    #Compute the local variance
                    varcopy = np.copy(workingVar)
                    varcopy[old_membership] = np.var(z[neighborSoln== old_membership])
                    varcopy[region] = np.var(z[neighborSoln == region])
                    varcopy[0] = np.sum(varcopy[1:])
                    
                    # Check the locally computed variance against the current working best.
                    if varcopy[0] >= workingSoln[0]:
                        continue
                    
                    #Check contiguity of the swap
                    block = np.where(workingSoln[1:] == neighbor)[0].tolist()
                    if not check_contiguity(neighbordict, block, neighbor):
                        #print "Failed contiguity check"
                        continue
                    
                    #After this we have passed all tests, a check of all possible swaps has yielded a better one.
                    swap_flag = True
                    workingSoln[0] = varcopy[0]
                    workingSoln[1:] = neighborSoln[:]
                    workingVar = varcopy
                    tabu_list.appendleft((neighbor,old_membership,region))
                    
                if swap_flag == False:
                    failures += 1
        
        with lockSoln and lockflag:
            sharedupdate[0][core_soln_column] = 0
            sharedSoln[:,core_soln_column]
            if workingSoln[0] < sharedSoln[:,core_soln_column][0]:
                sharedSoln[:,core_soln_column] = workingSoln[:]
                sharedupdate[0][core_soln_column] = 1 #Set the update flag to true
                if not workingSoln[0] < sharedSoln[0].any():
                    results = set_half_to_best(len(sharedSoln[0]))
                    with lockVar:
                        sharedVar[:,core_soln_column] = workingVar
                        for result in results:
                            sharedVar[:,result] = workingVar
        #Increment the core iteration counter    
        sharedupdate[1][core] += 1
Example #17
def mantel_pval(X,
                Y,
                kX="Euclidian",
                kY="Delta",
                sigmaX=None,
                sigmaY=None,
                N_samp=500,
                random_seed=None,
                return_boots=False):
    """ Calculates the Mantel test 
    A faster method for calculating the p-values
    X: Array of locations
    Y: Array of observations 
    
    kX: method for calculating the X matrix
    kY: method for calculating the Y matrix
    sigmaX: param for the Gaussian kernel
    sigmaY: param for the Gaussian kernel
    N_samp: Number of samples for bootstrap 
    random_seed: for calculating p values
    return_boots: return bootstrap values?
    """
    prng = RandomState(random_seed)

    # Calculate two distance matrices
    if not sigmaX and kX == "Gaussian":
        sigmaX = HSIC.getSigmaGaussian(X, X, 200)
    if not sigmaY and kY == "Gaussian":
        sigmaY = HSIC.getSigmaGaussian(Y, Y, 200)

    # Calculate two distance matrices
    A = matrix_mantel(X, kX, sigmaX)
    B = matrix_mantel(Y, kY, sigmaY)

    # Just pearson correlation
    A_minus_mean = A - A.mean()
    B_mean = B.mean()
    B_minus_mean = B - B_mean
    lright = math.sqrt((A_minus_mean**2).sum())
    lleft = math.sqrt(((B_minus_mean)**2).sum())
    top = A_minus_mean.dot(B_minus_mean)
    mantel_val = top / (lright * lleft)

    # Calculating the p-value
    pval = 1.0
    Yrand = np.copy(Y)
    boots = []
    for i in xrange(N_samp):
        prng.shuffle(Yrand)
        B_tmp = matrix_mantel(Yrand, kY, sigmaY)

        if return_boots:
            boots.append(A_minus_mean.dot(B_tmp - B_mean))
        if A_minus_mean.dot(B_tmp - B_mean
                            ) >= top:  #can use B_mean instead of B_tmp.mean()
            pval += 1

    pval /= N_samp + 1

    if return_boots:
        return mantel_val, pval, boots
    else:
        return mantel_val, pval
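Stripped of the kernel/HSIC machinery, the p-value loop above is an ordinary permutation test: shuffle one variable, recompute the statistic, and count how often the shuffled statistic matches or beats the observed one. A minimal standalone sketch for a Pearson correlation (the function and variable names are mine):

import numpy as np
from numpy.random import RandomState

def perm_test_corr(x, y, n_perm=500, seed=None):
    prng = RandomState(seed)
    observed = np.corrcoef(x, y)[0, 1]
    y_perm = np.copy(y)
    count = 1.0                          # +1 smoothing, as in mantel_pval above
    for _ in range(n_perm):
        prng.shuffle(y_perm)             # permute one variable in place
        if np.corrcoef(x, y_perm)[0, 1] >= observed:
            count += 1
    return observed, count / (n_perm + 1)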
Example #18
    def _iter_slow(self, batch_size=128, start=None, end=None,
                   shuffle=True, seed=None, mode=0):
        # ====== Set random seed ====== #
        all_ds = self._data[:]
        prng1 = None
        prng2 = _dummy_shuffle
        if shuffle:
            if seed is None:
                seed = get_random_magic_seed()
            prng1 = RandomState(seed)
            prng2 = RandomState(seed)

        all_size = [i.shape[0] for i in all_ds]
        n_dataset = len(all_ds)

        # ====== Calculate batch_size ====== #
        if mode == 1: # equal
            s = sum(all_size)
            all_batch_size = [int(round(batch_size * i / s)) for i in all_size]
            for i in xrange(len(all_batch_size)):
                if all_batch_size[i] == 0: all_batch_size[i] += 1
            if sum(all_batch_size) > batch_size: # 0.5% -> round up, too much
                for i in xrange(len(all_batch_size)):
                    if all_batch_size[i] > 1:
                        all_batch_size[i] -= 1
                        break
            all_upsample = [None] * len(all_size)
        elif mode == 2 or mode == 3: # upsampling and downsampling
            maxsize = int(max(all_size)) if mode == 2 else int(min(all_size))
            all_batch_size = [int(batch_size / n_dataset) for i in xrange(n_dataset)]
            for i in xrange(batch_size - sum(all_batch_size)): # not enough
                all_batch_size[i] += 1
            all_upsample = [maxsize for i in xrange(n_dataset)]
        else: # sequential
            all_batch_size = [batch_size]
            all_upsample = [None]
            all_size = [sum(all_size)]
        # ====== Create all block and batches ====== #
        # [ ((idx1, batch1), (idx2, batch2), ...), # batch 1
        #   ((idx1, batch1), (idx2, batch2), ...), # batch 2
        #   ... ]
        all_block_batch = []
        # contain [block_batches1, block_batches2, ...]
        tmp_block_batch = []
        for n, batchsize, upsample in zip(all_size, all_batch_size, all_upsample):
            tmp_block_batch.append(
                create_batch(n, batchsize, start, end, prng1, upsample))
        # ====== Distribute block and batches ====== #
        if mode == 1 or mode == 2 or mode == 3:
            for i in zip_longest(*tmp_block_batch):
                all_block_batch.append([(k, v) for k, v in enumerate(i) if v is not None])
        else:
            all_size = [i.shape[0] for i in all_ds]
            all_idx = []
            for i, j in enumerate(all_size):
                all_idx += [(i, k) for k in xrange(j)] # (ds_idx, index)
            all_idx = [all_idx[i[0]:i[1]] for i in tmp_block_batch[0]]
            # complex algorithm for connecting the batches across different datasets
            for i in all_idx:
                tmp = []
                idx = i[0][0] # i[0][0]: ds_index
                start = i[0][1] # i[0][1]: index
                end = start
                for j in i[1:]: # detect change in index
                    if idx != j[0]:
                        tmp.append((idx, (start, end + 1)))
                        idx = j[0]
                        start = j[1]
                    end = j[1]
                tmp.append((idx, (start, end + 1)))
                all_block_batch.append(tmp)
        prng2.shuffle(all_block_batch)
        # print if you want debug
        # for _ in all_block_batch:
        #     for i, j in _:
        #         print('ds:', i, '  batch:', j)
        #     print('===== End =====')
        # ====== return iteration ====== #
        for _ in all_block_batch: # each _ is a block
            batches = np.concatenate(
                [all_ds[i][j[0]:j[1]] for i, j in _], axis=0)
            batches = batches[prng2.permutation(batches.shape[0])]
            yield self._normalizer(batches)
Example #19
def raw_noise_2d(positions, pType):
    """2D Raw Simplex noise."""
    seq = numpy.arange(0, 256, dtype=int)
    prng = RandomState(pType)
    prng.shuffle(seq)
    perm = numpy.zeros(512, dtype=int)
    perm[0:256] = seq
    perm[256::] = seq

    grad3 = numpy.array([[1, 1, 0], [-1, 1, 0], [1, -1, 0], [-1, -1, 0],
                         [1, 0, 1], [-1, 0, 1], [1, 0, -1], [-1, 0, -1],
                         [0, 1, 1], [0, -1, 1], [0, 1, -1], [0, -1, -1]])

    nValues = positions.shape[0]

    """2D Raw Simplex noise."""
    # Noise contributions from the three corners
    corners = numpy.zeros((3, 2, nValues))

    # Skew the input space to determine which simplex cell we're in
    F2 = 0.5 * (math.sqrt(3.0) - 1.0)

    # Hairy skew factor for 2D
    s = numpy.sum(positions, axis=1) * F2
    ij = (positions.T + s).astype(int)
    G2 = (3.0 - math.sqrt(3.0)) / 6.0
    t0 = numpy.sum(ij, axis=0) * G2
    # Unskew the cell origin back to (x,y) space
    XY = ij - t0
    # The x,y distances from the cell origin
    corners[0, :, :] = positions.T - XY

    i1 = corners[0, 0, :] > corners[0, 1, :]
    j1 = ~i1
    # A step of (1,0) in (i,j) means a step of (1-c,-c) in (x,y), and
    # a step of (0,1) in (i,j) means a step of (-c,1-c) in (x,y), where
    corners[1, 0, :] = corners[0, 0, :] - i1 + G2  # Offsets for middle corner in (x,y) unskewed coords
    corners[1, 1, :] = corners[0, 1, :] - j1 + G2
    corners[2, :, :] = corners[0, :, :] - 1.0 + 2.0 * G2  # Offsets for last corner in (x,y) unskewed coords
    # Work out the hashed gradient indices of the three simplex corners
    ij &= 255

    gi = numpy.zeros((3, nValues), dtype=int)
    gi[0, :] = perm[ij[0, :] + perm[ij[1, :]]] % 12
    gi[1, :] = perm[ij[0, :] + i1 + perm[ij[1, :] + j1]] % 12
    gi[2, :] = perm[ij[0, :] + 1 + perm[ij[1, :] + 1]] % 12

    n = numpy.zeros((3, nValues), dtype=float)
    # Calculate the contribution from the three corners    
    temp = corners * corners
    t = .5 - temp[:, 0, :] - temp[:, 1, :]
    m = t >= 0
    t *= t
    t *= t
    grad = grad3[gi]
    # print t
    n[0, m[0, :]] = t[0, m[0, :]] * dot2d(grad[0, m[0, :], :], corners[0, :, m[0, :]])
    n[1, m[1, :]] = t[1, m[1, :]] * dot2d(grad[1, m[1, :], :], corners[1, :, m[1, :]])
    n[2, m[2, :]] = t[2, m[2, :]] * dot2d(grad[2, m[2, :], :], corners[2, :, m[2, :]])

    # Add contributions from each corner to get the final noise value.
    # The result is scaled to return values in the interval [-1,1].
    result = (70.0 * numpy.sum(n, axis=0))
    return (1 + result) * .5
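The opening lines of the function above are the standard simplex-noise setup: a seeded shuffle of 0..255 yields a reproducible permutation table, which is then tiled to 512 entries so that lookups like perm[i + perm[j]] stay in bounds after ij is masked with 255. A minimal standalone version of just that table construction (the seed value is arbitrary):

import numpy
from numpy.random import RandomState

seq = numpy.arange(0, 256, dtype=int)
RandomState(12345).shuffle(seq)          # the seed plays the role of pType above
perm = numpy.concatenate([seq, seq])     # 512 entries, so indices up to 255 + 255 are valid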
Example #20
        nblists = info.get_supported_nblist()
        access = info.get_supported_access()
    print "Supported elements:", elements
    print "Supported neighborlist methods:", nblists
    print "Supported access methods:", access
    if len(access) > 1:
        access = ['loca', 'iter']
    else:
        access = [None]
    if len(elements) == 1:
        main = elements[0]
        other = None
        state = reference_states[atomic_numbers[main]]
    else:
        elements = list(elements)
        rnd.shuffle(elements)
        for i in range(len(elements)):
            main = elements[i]
            other = elements[i-1]
            state = reference_states[atomic_numbers[main]]
            if state['symmetry'] in known_states:
                break
    if state['symmetry'] not in known_states:
        print "Cannot simulate %s, reference state '%s' not supported" % (main, state['symmetry'])
        print "SKIPPING MODEL!"
        continue

    init_atoms = bulk(main, orthorhombic=True).repeat((7,7,7))
    r = init_atoms.get_positions()
    r += rnd.normal(0.0, 0.1, r.shape)
    init_atoms.set_positions(r)
Example #21
def train(dim_word=256, # word vector dimensionality
          ctx_dim=512, # context vector dimensionality
          dim=256, # the number of LSTM_Old units
          margin=0.2, # margin for pairwise ranking loss. Should be (0,1] if use_norm is on
          use_norm=True, # whether to L2norm vectors prior to loss
          use_ctx_mean=False, # whether to initialize decoder to annotation means
          use_last=False, #Only use last hidden state for ranking
          n_layers_att=1,
          n_layers_init=1, # This isn't useful if use_ctx_mean=False
          patience=10,
          max_epochs=5000,
          dispFreq=1,
          decay_c=0.,
          alpha_c=1.,
          lrate=0.01,
          selector=True,
          n_words=23461,
          maxlen=100, # maximum length of the description
          optimizer='adam',
          batch_size = 64,
          valid_batch_size = 128,
          saveto='/ais/gobi3/u/rkiros/flickr8k/rank_models/lstm_toy.npz',
          validFreq=200,
          total_queries=5000, # total number of queries
          n_queries=50, # number of queries to validate on, resampled each time
          saveFreq=200, # save the parameters after every saveFreq updates
          sampleFreq=200, # generate some samples after every sampleFreq updates
          dataset='flickr8k',
          dictionary=None, # word dictionary
          use_dropout=False,
          use_dropout_lstm=False,
          reload_=False):

    # Model options
    print alpha_c
    model_options = locals().copy()
    model_options = validate_options(model_options)

    # reload options
    if reload_ and os.path.exists(saveto):
        print "Reloading options"
        with open('%s.pkl'%saveto, 'rb') as f:
            model_options = pkl.load(f)

    print 'Loading data'
    load_data, prepare_data = get_dataset(dataset)
    train, valid, test, worddict = load_data()

    # Invert the dictionary, add special tokens
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    print 'Building model'
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(saveto):
        print "Reloading model"
        params = load_params(saveto, params)

    tparams = init_tparams(params)

    trng, use_noise, \
          inps, alphas, \
          alphas_contrast, cost, \
          opts_out = \
          build_model(tparams, model_options)

    print 'Building ranker'
    trng_r, use_noise_r, \
            inps_r, alphas_r, \
            scores, opts_out_r = \
            build_ranker(tparams, model_options)

    # before any regularizer
    print 'Building functions'
    f_log_probs = theano.function(inps, -cost, profile=False)
    f_ranker = theano.function(inps_r, scores, profile=False)

    print 'Regularization'
    cost = cost.mean()
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    if alpha_c > 0.:
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * ((1.-alphas.sum(0))**2).sum(0).mean()
        alpha_reg_contrast = alpha_c * ((1.-alphas_contrast.sum(0))**2).sum(0).mean()
        cost += alpha_reg
        cost += alpha_reg_contrast

    # gradient computation
    print 'Computing gradients'
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)

    print 'Optimization'

    train_iter = HomogeneousData(train, batch_size=batch_size, maxlen=maxlen)

    if valid:
        kf_valid = KFold(len(valid[1]), n_folds=len(valid[1])/valid_batch_size, shuffle=False)
    if test:
        kf_test = KFold(len(test[1]), n_folds=len(test[1])/valid_batch_size, shuffle=False)

    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        history_errs = numpy.load(saveto)['history_errs'].tolist()
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0])/batch_size
    if saveFreq == -1:
        saveFreq = len(train[0])/batch_size

    uidx = 0
    estop = False
    for eidx in xrange(max_epochs):
        n_samples = 0

        print 'Epoch ', eidx

        for caps in train_iter:
            n_samples += len(caps)
            uidx += 1
            use_noise.set_value(1.)

            pd_start = time.time()
            x, mask, ctx = prepare_data(caps,
                                        train[1],
                                        worddict,
                                        maxlen=maxlen,
                                        n_words=n_words)
            pd_duration = time.time() - pd_start

            # Get some contrastive images
            prng = RandomState(eidx + n_samples)
            inds = numpy.arange(len(train[1]))
            prng.shuffle(inds)
            contrast_ctx = numpy.zeros((len(caps), train[1][0].shape[1])).astype('float32')
            for cidx in range(len(caps)):
                contrast_ctx[cidx,:] = numpy.array(train[1][inds[cidx]].todense())
            contrast_ctx = contrast_ctx.reshape([contrast_ctx.shape[0], 14*14, 512])

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                continue

            ud_start = time.time()
            cost = f_grad_shared(x, mask, ctx, contrast_ctx)
            f_update(lrate)
            ud_duration = time.time() - ud_start

            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'PD ', pd_duration, 'UD ', ud_duration

            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',

                if best_p != None:
                    params = copy.copy(best_p)
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl'%saveto, 'wb'))
                print 'Done'

            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                train_err = 0
                valid_err = 0
                test_err = 0

                if valid:

                    queries = numpy.arange(total_queries)
                    prng.shuffle(queries)
                    (r1, r5, r10, r25, r50, r100, medr) = recallK(f_ranker, model_options, worddict, prepare_data, valid, kf_valid, queries[:n_queries], verbose=False)
                    print "Recall@(1,5,10,25,50,100): %.1f, %.1f, %.1f, %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, r25, r50, r100, medr)

                    #TODO: Not sure if this is the best choice, maybe explore alternatives
                    valid_err = medr
                    history_errs.append([valid_err, 1e20])

                # Use the median rank to decide when to stop
                if uidx == 0 or valid_err <= numpy.array(history_errs)[:,0].min():
                    best_p = unzip(tparams)
                    bad_counter = 0
                if eidx > patience and valid_err >= numpy.array(history_errs)[:,0].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

        print 'Seen %d samples'%n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    train_err = 0
    valid_err = 0
    test_err = 0

    queries = numpy.arange(total_queries)
    prng.shuffle(queries)
    (r1, r5, r10, r25, r50, r100, medr) = recallK(f_ranker, model_options, worddict, prepare_data, valid, kf_valid, queries[:n_queries], verbose=False)
    print "Recall@(1,5,10,25,50,100): %.1f, %.1f, %.1f, %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, r25, r50, r100, medr)
    valid_err = medr

    params = copy.copy(best_p)
    numpy.savez(saveto, zipped_params=best_p, train_err=train_err,
                valid_err=valid_err, test_err=test_err, history_errs=history_errs,
                **params)
Example #22
def trainer(train, dev, # training and development tuples
            dim=1000, # embedding dimensionality
            dim_im=4096, # image dimensionality
            dim_s=4800, # sentence dimensionality
            margin=0.2, # margin for pairwise ranking
            ncon=50, # number of contrastive terms
            max_epochs=15,
            lrate=0.01, # not needed with Adam
            dispFreq=10,
            optimizer='adam',
            batch_size = 100,
            valid_batch_size = 100,
            saveto='/ais/gobi3/u/rkiros/ssg/models/cocorank1000_combine.npz',
            validFreq=500,
            saveFreq=500,
            reload_=False):

    # Model options
    model_options = {}
    model_options['dim'] = dim
    model_options['dim_im'] = dim_im
    model_options['dim_s'] = dim_s
    model_options['margin'] = margin
    model_options['ncon'] = ncon
    model_options['max_epochs'] = max_epochs
    model_options['lrate'] = lrate
    model_options['dispFreq'] = dispFreq
    model_options['optimizer'] = optimizer
    model_options['batch_size'] = batch_size
    model_options['valid_batch_size'] = valid_batch_size
    model_options['saveto'] = saveto
    model_options['validFreq'] = validFreq
    model_options['saveFreq'] = saveFreq
    model_options['reload_'] = reload_

    model_options = validate_options(model_options)
    print model_options

    # reload options
    if reload_ and os.path.exists(saveto):
        print "Reloading options"
        with open('%s.pkl'%saveto, 'rb') as f:
            model_options = pkl.load(f)

    print 'Building model'
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(saveto):
        print "Reloading model"
        params = load_params(saveto, params)

    tparams = init_tparams(params)

    inps, cost = build_model(tparams, model_options)

    print 'Building encoder'
    inps_e, lim, ls = build_encoder(tparams, model_options)

    print 'Building functions'
    f_cost = theano.function(inps, -cost, profile=False)
    f_emb = theano.function(inps_e, [lim, ls], profile=False)

    # gradient computation
    print 'Computing gradients'
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)

    print 'Optimization'

    uidx = 0
    estop = False
    start = 1234
    seed = 1234
    inds = numpy.arange(len(train[0]))
    numbatches = len(inds) / batch_size
    curr = 0
    counter = 0
    target=None
    history_errs = []

    # Main loop
    for eidx in range(max_epochs):
        tic = time.time()
        prng = RandomState(seed - eidx - 1)
        prng.shuffle(inds)

        for minibatch in range(numbatches):

            uidx += 1
            conprng_im = RandomState(seed + uidx + 1)
            conprng_s = RandomState(2*seed + uidx + 1)

            im = train[1][inds[minibatch::numbatches]]
            s = train[2][inds[minibatch::numbatches]]

            cinds_im = conprng_im.random_integers(low=0, high=len(train[0])-1, size=ncon * len(im))
            cinds_s = conprng_s.random_integers(low=0, high=len(train[0])-1, size=ncon * len(s))
            cim = train[1][cinds_im]
            cs = train[2][cinds_s]

            ud_start = time.time()
            cost = f_grad_shared(im, s, cim, cs)
            f_update(lrate)
            ud_duration = time.time() - ud_start

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud_duration

            if numpy.mod(uidx, validFreq) == 0:

                print 'Computing ranks...'
                lim, ls = f_emb(dev[1], dev[2])
                (r1, r5, r10, medr) = i2t(lim, ls)
                print "Image to text: %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr)
                (r1i, r5i, r10i, medri) = t2i(lim, ls)
                print "Text to image: %.1f, %.1f, %.1f, %.1f" % (r1i, r5i, r10i, medri)

                currscore = r1 + r5 + r10 + r1i + r5i + r10i
                if currscore > curr:
                    curr = currscore

                    # Save model
                    print 'Saving...',
                    params = unzip(tparams)
                    numpy.savez(saveto, history_errs=history_errs, **params)
                    pkl.dump(model_options, open('%s.pkl'%saveto, 'wb'))
                    print 'Done'
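The contrastive sampling inside the loop follows the same seeding discipline: every update builds its own deterministically seeded generators (seed + uidx + 1 and 2*seed + uidx + 1), so the negative image and sentence indices are reproducible without carrying generator state across updates. A stripped-down sketch (the sizes are arbitrary; randint is used here as the non-deprecated equivalent of random_integers, with an exclusive upper bound):

from numpy.random import RandomState

seed, uidx, ncon, n_train, batch = 1234, 7, 50, 1000, 100
conprng_im = RandomState(seed + uidx + 1)
conprng_s = RandomState(2 * seed + uidx + 1)
cinds_im = conprng_im.randint(0, n_train, size=ncon * batch)   # contrastive image indices
cinds_s = conprng_s.randint(0, n_train, size=ncon * batch)     # contrastive sentence indices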
Example #23
    def train(self, X, indX, XY, V, indV, VY, IM, count_dict, word_dict, embed_map):
        """
        Trains the LBL
        """
        self.start = self.seed
        self.init_params(embed_map, count_dict, XY)
        inds = np.arange(len(X))
        numbatches = len(inds) / self.batchsize
        curr = 1e20
        counter = 0
        target=None
        num = 15000

        x = T.matrix('x', dtype='int32')
        y = T.matrix('y')
        im = T.matrix('im')
        lr = T.scalar('lr')
        mom = T.scalar('mom')
        (words, acts, IF, preds) = self.forward(x, im)
        obj_T = self.compute_obj(x, im, y)
        compute_obj_T = theano.function([x, im, y], obj_T)
        train_batch = theano.function([x, im, y, lr, mom], obj_T, 
                                      updates=self.update_params(obj_T, x, lr, mom), 
                                      on_unused_input='warn')

        log_file = open("train_valid_err.txt", 'w')

        # Main loop
        stop.display_phase(1)
        for epoch in range(self.maxepoch):
            self.epoch = epoch
            tic = time.time()
            prng = RandomState(self.seed + epoch + 1)
            prng.shuffle(inds)
            obj = 0.0
            for minibatch in range(numbatches):
                batchX = X[inds[minibatch::numbatches]].astype(np.int32)
                batchY = XY[inds[minibatch::numbatches]].toarray().astype(theano.config.floatX)
                batchindX = indX[inds[minibatch::numbatches]].astype(np.int32).flatten()
                batchIm = IM[batchindX].astype(theano.config.floatX)
                
                obj += train_batch(batchX, batchIm, batchY, self.eta_t, self.p_t)

            self.update_hyperparams()

            toc = time.time()
            # Results and stopping criteria
            obj_val = compute_obj_T(V[:num].astype(np.int32), 
                                  IM[indV[:num].astype(int).flatten()].astype(theano.config.floatX), 
                                  VY[:num].toarray().astype(theano.config.floatX))

            log_file.write('{} {}\n'.format(obj, obj_val))

            if self.verbose > 0:
                stop.display_results(epoch, toc-tic, obj, obj_val)
            (curr, counter) = stop.update_result(curr, obj_val, counter)
            if counter == 0:
                stop.save_model_theano(self, self.loc)
                stopping_target = obj

            if stop.criteria_complete(self, epoch, curr, obj, counter, 
                self.k, obj_val, target):
                if self.criteria == 'maxepoch':
                    break
                elif self.criteria == 'validation_pp':
                    stop.load_model_theano(self, self.loc)

                    counter = 0
                    X = np.r_[X, V]
                    XY = vstack([XY, VY]).tocsr()
                    indX = np.r_[indX, indV]
                    self.criteria = 'll_train_heldout'
                    target = stopping_target   #obj
                    stop.display_phase(2)
                    inds = range(X.shape[0])
                    prng.shuffle(inds)
                    numbatches = len(inds) / self.batchsize
                elif self.criteria == 'll_train_heldout':
                    break

        log_file.close()
Example #24
class CntWindowTrialIterator(object):
    """Cut out windows for several predictions from a continous dataset
     with a trial marker y signal.

    Parameters
    ----------

    Returns
    -------

    """
    def __init__(self, batch_size, input_time_length, n_sample_preds,
            check_preds_smaller_trial_len=True):
        self.batch_size = batch_size
        self.input_time_length = input_time_length
        self.n_sample_preds = n_sample_preds
        self.check_preds_smaller_trial_len = check_preds_smaller_trial_len
        self.rng = RandomState(328774)
        
    def reset_rng(self):
        self.rng = RandomState(328774)

    def get_batches(self, dataset, shuffle):
        i_trial_starts, i_trial_ends = compute_trial_start_end_samples(
            dataset.y, check_trial_lengths_equal=False,
            input_time_length=self.input_time_length)
        if self.check_preds_smaller_trial_len:
            self.check_trial_bounds(i_trial_starts, i_trial_ends)
        start_end_blocks_per_trial = self.compute_start_end_block_inds(
            i_trial_starts, i_trial_ends)

        topo = dataset.get_topological_view()
        y = dataset.y

        return self.yield_block_batches(topo, y, start_end_blocks_per_trial, shuffle=shuffle)

    def check_trial_bounds(self, i_trial_starts, i_trial_ends):
        for start, end in zip(i_trial_starts, i_trial_ends):
            assert end - start + 1 >= self.n_sample_preds, (
                "Trial should be longer or equal than number of sample preds, "
                "Trial length: {:d}, sample preds {:d}...".
                format(end - start + 1, self.n_sample_preds))

    def compute_start_end_block_inds(self, i_trial_starts, i_trial_ends):
        # create start stop indices for all batches still 2d trial -> start stop
        start_end_blocks_per_trial = []
        for i_trial in xrange(len(i_trial_starts)):
            trial_start = i_trial_starts[i_trial]
            trial_end = i_trial_ends[i_trial]
            start_end_blocks = get_start_end_blocks_for_trial(trial_start,
                trial_end, self.input_time_length, self.n_sample_preds)
        
            if self.check_preds_smaller_trial_len:
                # check that block is correct, all predicted samples should be the trial samples
                all_predicted_samples = [range(start_end[1] - self.n_sample_preds + 1, 
                    start_end[1]+1) for start_end in start_end_blocks]
                # this check takes about 50 ms in performance test
                # whereas loop itself takes only 5 ms.. deactivate it if not necessary
                assert np.array_equal(range(i_trial_starts[i_trial], i_trial_ends[i_trial] + 1), 
                               np.unique(np.concatenate(all_predicted_samples)))

            start_end_blocks_per_trial.append(start_end_blocks)
        return start_end_blocks_per_trial

    def yield_block_batches(self, topo, y, start_end_blocks_per_trial, shuffle):
        start_end_blocks_flat = np.concatenate(start_end_blocks_per_trial)
        if shuffle:
            self.rng.shuffle(start_end_blocks_flat)

        for i_block in xrange(0, len(start_end_blocks_flat), self.batch_size):
            i_block_stop = min(i_block + self.batch_size, len(start_end_blocks_flat))
            start_end_blocks = start_end_blocks_flat[i_block:i_block_stop]
            batch = create_batch(topo,y, start_end_blocks, self.n_sample_preds)
            yield batch
Example #25
def tabu_search(core, z, neighbordict,numP,w,floor_variable,lockSoln, lockflag, maxfailures=15,maxiterations=10):
    ##Pseudo constants
    pid = mp.current_process()._identity[0]
    tabu_list = deque(maxlen=sharedupdate[2][core])#What is this core's tabu list length? 
    maxiterations *= cores #Test synchronize cores to exit at the same time.
        
    maxfailures += int(maxfailures*uniform(-1.1, 1.2))  # James et al. 2007
    
    def _tabu_check(tabu_list, neighbor, region, old_membership):
        if tabu_list:#If we have a deque with contents
            for tabu_region in(tabu_list):
                if neighbor == tabu_region[0]:
                    #print neighbor, tabu_region[0]
                    if region == tabu_region[1] and old_membership == tabu_region[2]:
                        return False    
    

    def _diversify_soln(core_soln_column, neighbordict,z, floor_variable,lockSoln):
        '''
        The goal of this function is to diversify a soln that is not improving.
        What about the possibility of diversifying a good answer away from 'the soln'?
        I do not think that should be an issue - we make a randomized greedy swap.  Is one enough?
        
        Rationale: This is a randomized greedy swap (GRASP), where we store the best n permutations and then randomly select the one we will use.  Originally in Li et al. (in press).  We need to test different values of n to see what the impact is.
        '''
        #print "Diversifying: ", sharedSoln[0]

        #Initialize a local swap space to store n best diversified soln - these do not need to be better 
        div_soln_space = np.ndarray(sharedSoln.shape)
        div_soln_space[:] = float("inf")
        workingcopy = np.copy(sharedSoln[0:,core_soln_column])            

        #Iterate through the regions and check all moves, store the 4 best.
        for region in np.unique(workingcopy[1:]):
            members = np.where(workingcopy == region)[0]
            neighbors = []
            for member in members:
                candidates = neighbordict[member-1]#neighbordict is 0 based, member is 1 based
                candidates = [candidate for candidate in candidates if candidate not in members]
                candidates = [candidate for candidate in candidates if candidate not in neighbors]
                neighbors.extend(candidates)
            candidates = []
            
            #Iterate through the neighbors
            for neighbor in neighbors:
                neighborcopy = np.copy(workingcopy[:]) #Pull a copy of the local working version
                old_membership = neighborcopy[neighbor]#Track where we started to check_floor
                
                neighborcopy[neighbor] = region #Move the neighbor into the new region in the copy
                
                #Here we start to check the swap and see if it is better
                swap_var = objective_function_vec(neighborcopy[1:],z)#Variance of the new swap
                if (swap_var < div_soln_space[0]).any():  # better than at least one stored candidate (all slots start at inf)
                    block = np.where(workingcopy[1:] == neighbor)[0]#A list of the members in a region.
                    block=block.tolist() #For current contiguity check
                    if check_contiguity(neighbordict, block, neighbor):#Check contiguity
                        if check_floor(np.where(neighborcopy[1:,]==region)[0], floor_variable, w) and check_floor(np.where(neighborcopy[1:,]==old_membership)[0],floor_variable,w):
                            
                            neighborcopy[0] = swap_var
                            if not np.isinf(div_soln_space[0]).any():  # no empty (inf) slots left: replace the worst stored candidate
                                div_soln_space[:,np.argmax(div_soln_space[0])] = neighborcopy[:]
                            else:
                                div_soln_space[:,np.argmin(div_soln_space[0])] = neighborcopy[:]
                        else:
                            del neighborcopy
                            #print "Swap failed due to floor_check."
                    else:
                        del neighborcopy
                        #print "Swap failed due to contiguity."
        #It is possible that the perturbation will not generate enough soln to fill the space, 
        # so we need to ignore those columns with variance = infinity.
        
        #Write one of the neighbor perturbations to the shared memory space to work on.
        valid = np.where(div_soln_space[0] != np.inf)[0]
        try:
            selection = valid[randint(0, len(valid) - 1)]  # pick one of the valid (non-inf) columns
            with lockSoln:
                sharedSoln[:,core_soln_column] = div_soln_space[:,selection]
                print "Diversified to:", sharedSoln[0]
        except:
            print div_soln_space[0]
            print "Attempt to diversify failed."
            

        
    ##This shows that we are operating asynchronously.   
    #if core ==2:
        #time.sleep(5)
    
    while sum(sharedupdate[1]) < maxiterations:
        core_soln_column = (core + sharedupdate[1][core])%len(sharedupdate[1]) #This iterates the cores around the search space.
        
        #Check for diversification here and diversify if necessary...
        if sharedupdate[0][core_soln_column] == False:
            _diversify_soln(core_soln_column, neighbordict, z, floor_variable, lockSoln)  # Li et al. (in press - P-Compact_Regions)

        #print "ProcessID %i is processing soln column %i in iteration %i."%(pid, core_soln_column,sharedupdate[1][core]) #Uncomment to see that cores move around the search space    
        failures = 0  # counts passes of the inner loop that fail to make an improving swap
        #What are the current best solutions local to this core?
        local_best_variance = sharedSoln[:,core_soln_column][0]
        workingSoln = np.copy(sharedSoln[:,core_soln_column])    
        
        while failures <= maxfailures:  # allow at most maxfailures passes without an improving swap
      
            #Select a random starting point in the search space.
            nr = np.unique(workingSoln[1:])  # 0 based, i.e. region 0 - region 31
            regionIDs = nr
            changed_regions = np.ones(len(nr))
            randstate = RandomState(pid) #To 'unsync' the cores we need to instantiate a random class with a unique seed.
            randstate.shuffle(regionIDs) #shuffle the regions so we start with a random region
            changed_regions[:] = 0
            swap_flag = False #Flag to stop looping prior to max iterations if we are not improving.
            
            #Iterate through the regions, checking potential swaps
            for region in regionIDs:
                members = np.where(workingSoln == region)[0] #get the members of the region
                #print region, members
                #Get the neighbors to the members.  Grab only those that could change.
                neighbors = []
                for member in members:
                    candidates = neighbordict[member-1]#neighbordict is 0 based, member is 1 based
                    candidates = [candidate for candidate in candidates if candidate not in members]
                    candidates = [candidate for candidate in candidates if candidate not in neighbors]
                    neighbors.extend(candidates)
                candidates = []
                
                #Iterate through the neighbors
                for neighbor in neighbors:
                    neighborSoln = np.copy(workingSoln[:]) #Pull a copy of the local working version
                    old_membership = neighborSoln[neighbor]#Track where we started to check_floor
                    
                    tabu_move_check = _tabu_check(tabu_list, neighbor, region, old_membership)
                    if tabu_move_check is not None:  # False means the move would reverse a tabu move
                        break
                    
                    neighborSoln[neighbor] = region #Move the neighbor into the new region in the copy
                    
                    #Here we start to check the swap and see if it is better
                    swap_var = objective_function_vec(neighborSoln[1:],z)#Variance of the new swap
                    if swap_var <= local_best_variance:
                        block = np.where(workingSoln[1:] == neighbor)[0]#A list of the members in a region.
                        block=block.tolist() #For current contiguity check
                        if check_contiguity(neighbordict, block, neighbor):#Check contiguity
                            if check_floor(np.where(neighborSoln[1:,]==region)[0], floor_variable, w) and check_floor(np.where(neighborSoln[1:,]==old_membership)[0],floor_variable,w):  # What about the floor of the region losing the member in the original code?
                                #print "Swap made on core %i.  Objective function improved from %f to %f." %(pid, swap_var, local_best_variance)
                                local_best_variance = swap_var#Set the new local best to the swap. We have made a swap that betters the objective function.
                                neighborSoln[0] = swap_var
                                workingSoln[:] = neighborSoln[:]
                                swap_flag = True #We made a swap
                                tabu_list.appendleft((neighbor,old_membership,region))#tuple(polygon_id, oldgroup,newgroup)
                            else:
                                del neighborSoln
                                #print "Swap failed due to floor_check."
                        else:
                            del neighborSoln
                            #print "Swap failed due to contiguity."
                    
            if swap_flag == False:
                #print "Failed to make any swap, incrementing the fail counter."
                failures += 1
            
        #print workingSoln, len(np.unique(workingSoln[1:]))    
        
        with lockflag:
            sharedupdate[0][core_soln_column] = 0 #Set the update flag to false
            #print "Locking update flag to set to false"
        
        with lockSoln:  # guards the column of the shared soln we are writing; released at the end of the with block
            if workingSoln[0] < sharedSoln[:, core_soln_column][0]:
                sharedSoln[:, core_soln_column] = workingSoln
                #print "Better soln loaded into sharedSoln: %f." %(workingSoln[0])
                sharedupdate[0][core_soln_column] = 1 #Set the update flag to true
                if (workingSoln[0] <= sharedSoln[0]).all():  # new global best across all columns
                    set_half_to_best(len(sharedSoln[0]))
                    #print "Setting half the soln to new global best. ", sharedSoln[0]
        #Increment the core iteration counter    
        sharedupdate[1][core] += 1
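The tabu bookkeeping in this example relies on collections.deque with a fixed maxlen, so the oldest recorded move falls off automatically, and a move is rejected when it would exactly reverse a recorded one. A minimal sketch of that pattern, with a hypothetical move encoded as a (unit, old_region, new_region) tuple:

from collections import deque

tabu_list = deque(maxlen=3)  # only the 3 most recent moves are tabu

def is_tabu(move, tabu_list):
    # A move is tabu if it sends a unit back to the region it just left.
    unit, old_region, new_region = move
    return any(unit == t_unit and new_region == t_old and old_region == t_new
               for t_unit, t_old, t_new in tabu_list)

tabu_list.appendleft((7, 2, 5))       # unit 7 moved from region 2 to region 5
print(is_tabu((7, 5, 2), tabu_list))  # True: this would undo the recorded move
print(is_tabu((7, 2, 5), tabu_list))  # False: repeating the move is allowed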
Beispiel #26
0
def trainer(z, split=3500, pre_train=True):
    """
    Trainer function for an MLBLF model
    """
    # Unpack some stuff
    ngrams = z['ngrams']
    labels = z['labels']
    instances = z['instances']
    word_dict = z['word_dict']
    index_dict = z['index_dict']
    context = z['context']
    vocabsize = len(z['word_dict'])
    im = z['IM']
    index = z['index']

    # Load word embeddings
    if pre_train:
        embed_map = lm_tools.load_embeddings()
    else:
        embed_map = None

    # Initialize the network
    net = mlblf.MLBLF(name='mlblf',
                      loc='models/mlblf.pkl',    # where to store the model file
                      seed=1234,                 # used to initialize the model parameters
                      criteria='validation_pp',  # the stopping criterion
                      k=5,                       # the window size used for validation
                      V=vocabsize,               # the size of the vocabulary
                      K=50,                      # the dim of the word representations
                      D=im.shape[1],             # the dim of the image features
                      h=256,                     # the dim of an intermediate layer on the image channel
                      factors=50,                # the number of factors
                      context=context,
                      batchsize=20,
                      maxepoch=100,
                      eta_t=0.02,
                      gamma_r=1e-4,
                      gamma_c=1e-5,
                      f=0.998,
                      p_i=0.5,
                      p_f=0.9,
                      T=20.0,
                      verbose=1)

    # Break up the data for training and validation
    inds = np.arange(len(ngrams))
    prng = RandomState(net.seed)
    prng.shuffle(inds)

    ngramsV = [ngrams[i] for i in inds[-split:]]
    flat_ngramsV = [item for sublist in ngramsV for item in sublist]
    instance_split = len(flat_ngramsV)

    inds = np.arange(len(instances))
    prng = RandomState(net.seed)
    prng.shuffle(inds)

    X = instances[inds[:-instance_split]]
    V = instances[inds[-instance_split:]]
    Y = labels[inds[:-instance_split]]
    VY = labels[inds[-instance_split:]]
    indX = index[inds[:-instance_split]]
    indV = index[inds[-instance_split:]]

    # Train the network
    net.train(X, indX, Y, V, indV, VY, im, index_dict, word_dict, embed_map)
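The trainer above builds its validation fold by shuffling an index array with a RandomState seeded from net.seed, so the split is reproducible across runs and the instances and labels stay aligned. A minimal sketch of that shuffle-and-slice split, using toy arrays in place of the real instances and labels:

import numpy as np
from numpy.random import RandomState

def seeded_split(instances, labels, n_valid, seed=1234):
    # Shuffle one shared index array so instances and labels stay aligned,
    # then take the last n_valid entries as the validation fold.
    inds = np.arange(len(instances))
    prng = RandomState(seed)
    prng.shuffle(inds)
    X, Y = instances[inds[:-n_valid]], labels[inds[:-n_valid]]
    V, VY = instances[inds[-n_valid:]], labels[inds[-n_valid:]]
    return X, Y, V, VY

X, Y, V, VY = seeded_split(np.arange(10), np.arange(10) * 2, n_valid=3)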
Beispiel #27
0
def trainer(train,
            valid,
            test,
            n_chars=33,
            img_w=128,
            max_len=27,
            feature_maps=100,
            filter_hs=[2, 3, 4],
            max_epochs=20,
            gamma=10,
            ncon=100,
            lrate=0.0002,
            batch_size=100,
            dispFreq=10,
            validFreq=10,
            saveto='example.npz'):
    """ train, valid, test : datasets
        n_chars : vocabulary size
        img_w : character embedding dimension.
        max_len : the maximum length of a sentence
        feature_maps : the number of feature maps we used
        filter_hs : the filter window sizes we used
        max_epochs : the maximum number of epochs to run
        gamma : hyper-parameter used in the ranking loss
        ncon : the number of negative samples used for each positive sample
        lrate : learning rate
        batch_size : batch size during training
        dispFreq : display the training progress to stdout every N updates
        validFreq : compute the validation rank score every N updates
        saveto : where to save the result
    """
    global ctr
    img_h = max_len + 2 * (filter_hs[-1] - 1)

    model_options = {}
    model_options['n_chars'] = n_chars
    model_options['img_w'] = img_w
    model_options['img_h'] = img_h
    model_options['feature_maps'] = feature_maps
    model_options['filter_hs'] = filter_hs
    model_options['max_epochs'] = max_epochs
    model_options['gamma'] = gamma
    model_options['ncon'] = ncon
    model_options['lrate'] = lrate
    model_options['batch_size'] = batch_size
    model_options['dispFreq'] = dispFreq
    model_options['validFreq'] = validFreq
    model_options['saveto'] = saveto

    logger.info('Model options {}'.format(model_options))

    logger.info('Building model...')

    filter_w = img_w
    filter_shapes = []
    pool_sizes = []
    for filter_h in filter_hs:
        filter_shapes.append((feature_maps, 1, filter_h, filter_w))
        pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1))

    model_options['filter_shapes'] = filter_shapes
    model_options['pool_sizes'] = pool_sizes

    params = init_params(model_options)
    tparams = init_tparams(params)

    use_noise, inps, cost = build_model(tparams, model_options)

    logger.info('Building encoder...')
    inps_e, feat_x, feat_y = build_encoder(tparams, model_options)

    logger.info('Building functions...')
    f_emb = theano.function(inps_e, [feat_x, feat_y], name='f_emb')

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = Adam(tparams, cost, inps, lr)

    logger.info('Training model...')

    uidx = 0
    seed = 1234
    curr = 0
    history_errs = []

    valid_x = prepare_data(valid[0], max_len, n_chars, filter_hs[-1])
    valid_y = prepare_data(valid[1], max_len, n_chars, filter_hs[-1])

    test_x = prepare_data(test[0], max_len, n_chars, filter_hs[-1])
    test_y = prepare_data(test[1], max_len, n_chars, filter_hs[-1])

    zero_vec_tensor = tensor.vector()
    zero_vec = np.zeros(img_w).astype(theano.config.floatX)
    set_zero = theano.function([zero_vec_tensor],
                               updates=[(tparams['Wemb'],
                                         tensor.set_subtensor(
                                             tparams['Wemb'][n_chars - 1, :],
                                             zero_vec_tensor))])

    # Main loop
    for eidx in range(max_epochs):
        print("epoch {} ".format(eidx))

        prng = RandomState(seed - eidx - 1)

        trainA = train[0]
        trainB = train[1]

        num_samples = len(trainA)

        inds = np.arange(num_samples)
        prng.shuffle(inds)
        numbatches = len(inds) // batch_size  # integer division: number of minibatches
        for minibatch in range(numbatches):
            print("minibatch : ", minibatch)

            use_noise.set_value(0.)
            uidx += 1
            conprng = RandomState(seed + uidx + 1)

            x = [trainA[seq] for seq in inds[minibatch::numbatches]]
            y = [trainB[seq] for seq in inds[minibatch::numbatches]]

            cinds = conprng.random_integers(low=0,
                                            high=num_samples - 1,
                                            size=ncon * len(x))
            cy = [trainB[seq] for seq in cinds]

            x = prepare_data(x, max_len, n_chars, filter_hs[-1])
            y = prepare_data(y, max_len, n_chars, filter_hs[-1])
            cy = prepare_data(cy, max_len, n_chars, filter_hs[-1])

            feats_x, feats_y = f_emb(x, y)
            (r1, r3, r10, medr, meanr, h_meanr) = rank(feats_x, feats_y)

            cost = f_grad_shared(x, y, cy)
            print("cost {},r {}".format(cost, r1))
            f_update(lrate)

            xdata.append(ctr)
            ctr = ctr + 1
            ydata.append(cost)
            y2data.append(r1)

            lines.set_xdata(xdata)
            lines.set_ydata(ydata)
            lines2.set_xdata(xdata)
            lines2.set_ydata(y2data)
            # Need both of these in order to rescale
            ax[0].relim()
            ax[0].autoscale_view()
            ax[1].relim()
            ax[1].autoscale_view()
            # We need to draw *and* flush
            figure.canvas.draw()
            figure.canvas.flush_events()

            # reset the special (last) token embedding to zero so it is effectively not updated.
            set_zero(zero_vec)

            if np.mod(uidx, dispFreq) == 0:
                logger.info('Epoch {} Update {} Cost {}'.format(
                    eidx, uidx, cost))

            if np.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                logger.info('Computing ranks...')

                # valid_y,slocs = shuffle_valid(valid_y)

                feats_x, feats_y = f_emb(valid_x, valid_y)
                # (r1, r3, r10, medr, meanr, h_meanr) = rank_valid(feats_x, feats_y,slocs)
                (r1, r3, r10, medr, meanr, h_meanr) = rank(feats_x, feats_y)

                x3data.append(ctr)
                y3data.append(r1)

                lines3.set_xdata(x3data)
                lines3.set_ydata(y3data)

                ax[2].relim()
                ax[2].autoscale_view()
                # We need to draw *and* flush
                figure.canvas.draw()
                figure.canvas.flush_events()

                history_errs.append([r1, r3, r10, medr, meanr, h_meanr])

                logger.info('Valid Rank:{}, {}, {}, {},{},{}'.format(
                    r1, r3, r10, medr, meanr, h_meanr))
                print('Valid Rank:{}, {}, {}, {},{},{}'.format(
                    r1, r3, r10, medr, meanr, h_meanr))

                currscore = r1 + r3 + r10
                if currscore > curr:
                    curr = currscore
                    logger.info('Saving...')
                    params = unzip(tparams)
                    np.savez(saveto, history_errs=history_errs, **params)
                    logger.info('Done...')

    use_noise.set_value(0.)
    zipp(params, tparams)
    logger.info('Final results...')

    feats_x, feats_y = f_emb(valid_x, valid_y)
    (r1, r3, r10, medr, meanr, h_meanr) = rank(feats_x, feats_y)
    logger.info('Valid Rank:{}, {}, {}, {},{},{}'.format(
        r1, r3, r10, medr, meanr, h_meanr))

    feats_x, feats_y = f_emb(test_x, test_y)
    (r1, r3, r10, medr, meanr, h_meanr) = rank(feats_x, feats_y)
    logger.info('Test Rank:{}, {}, {}, {},{},{}'.format(
        r1, r3, r10, medr, meanr, h_meanr))

    # np.savez("./cnn_feats.npz", feats_x=feats_x, feats_y=feats_y)

    return (r1, r3, r10, medr, meanr, h_meanr)
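The contrastive step above draws ncon random negatives for every positive pair from a RandomState reseeded with the update counter uidx, so the negative stream is reproducible and independent of the epoch-level shuffle. A minimal sketch of that sampling step, with hypothetical sizes and randint in place of the deprecated random_integers:

from numpy.random import RandomState

def sample_negatives(n_samples, batch_size, ncon, seed, uidx):
    # One fresh RandomState per update keeps the sampling deterministic
    # without touching the epoch-level shuffling RNG.
    conprng = RandomState(seed + uidx + 1)
    return conprng.randint(0, n_samples, size=ncon * batch_size)

neg_ids = sample_negatives(n_samples=5000, batch_size=100, ncon=100,
                           seed=1234, uidx=1)
# neg_ids has shape (100 * 100,) and would index into trainB above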
Beispiel #28
0
def main():
    print args
    print

    accuracies = defaultdict(lambda: [])

    ora_accu = defaultdict(lambda: [])
    ora_cm = defaultdict(lambda: [])
    lbl_dit = defaultdict(lambda: [])
    oracle_accuracies = []

    aucs = defaultdict(lambda: [])

    x_axis = defaultdict(lambda: [])

    vct = TfidfVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=False, ngram_range=(1, 1),
                          token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())

    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = None
    if args.train == "20news":
        categories = [['alt.atheism', 'talk.religion.misc'],
                      ['comp.graphics', 'comp.windows.x'],
                      ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                      ['rec.sport.baseball', 'sci.crypt']]
        categories=categories[2]
    elif args.train == "webkb":
        categories = ['student','faculty']
    elif args.train == "arxiv":
        categories = [['cs.AI','cs.LG'],
                      ['physics.comp-ph','physics.data-an']]
        categories=categories[0]

    min_size = 10

    args.fixk = None

    data, vct = load_from_file(args.train, [categories], args.fixk, min_size, vct, raw=True)
    print data.train.target_names
    print "Vectorizer:", vct
    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))

    parameters = parse_parameters_mat(args.cost_model)

    print "Cost Parameters %s" % parameters

    cost_model = set_cost_model(args.cost_function, parameters=parameters)
    print "\nCost Model: %s" % cost_model.__class__.__name__

    ### SENTENCE TRANSFORMATION
    if args.train == "twitter":
        sent_detector = TwitterSentenceTokenizer()
    else:
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    ## replace <br> with "." so it is recognized as the end of a sentence
    data.train.data = clean_html(data.train.data)
    data.test.data = clean_html(data.test.data)

    print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data), data.test.target.shape[0]))
    ## Get the features of the sentence dataset

    ## create splits of data: pool, test, oracle, sentences
    expert_data = Bunch()
    if not args.fulloracle:
        train_test_data = Bunch()

        expert_data.sentence, train_test_data.pool = split_data(data.train)
        expert_data.oracle, train_test_data.test = split_data(data.test)

        data.train.data = train_test_data.pool.train.data
        data.train.target = train_test_data.pool.train.target

        data.test.data = train_test_data.test.train.data
        data.test.target = train_test_data.test.train.target

    ## convert document to matrix
    data.train.bow = vct.fit_transform(data.train.data)
    data.test.bow = vct.transform(data.test.data)

    #### EXPERT CLASSIFIER: ORACLE
    print("Training Oracle expert")
    exp_clf = set_classifier(args.classifier, parameter=args.expert_penalty)

    if not args.fulloracle:
        print "Training expert documents:%s" % len(expert_data.oracle.train.data)
        labels, sent_train = split_data_sentences(expert_data.oracle.train, sent_detector, vct, limit=args.limit)

        expert_data.oracle.train.data = sent_train
        expert_data.oracle.train.target = np.array(labels)
        expert_data.oracle.train.bow = vct.transform(expert_data.oracle.train.data)

        exp_clf.fit(expert_data.oracle.train.bow, expert_data.oracle.train.target)
    else:
        # expert_data.data = np.concatenate((data.train.data, data.test.data))
        # expert_data.target = np.concatenate((data.train.target, data.test.target))
        expert_data.data =data.train.data
        expert_data.target = data.train.target
        expert_data.target_names = data.train.target_names
        labels, sent_train = split_data_sentences(expert_data, sent_detector, vct, limit=args.limit)
        expert_data.bow = vct.transform(sent_train)
        expert_data.target = labels
        expert_data.data = sent_train
        exp_clf.fit(expert_data.bow, expert_data.target)


    if "neutral" in args.expert:
        expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold,
                                             cost_function=cost_model.cost_function)
    elif "true" in args.expert:
        expert = baseexpert.TrueOracleExpert(cost_function=cost_model.cost_function)
    elif "pred" in args.expert:
        expert = baseexpert.PredictingExpert(exp_clf,  #threshold=args.neutral_threshold,
                                             cost_function=cost_model.cost_function)
    elif "human" in args.expert:
        expert = baseexpert.HumanExpert(", ".join(["{}={}".format(a,b) for a,b in enumerate(data.train.target_names)])+"? > ")
    else:
        raise Exception("We need an expert!")
    print "Training expert documents:%s" % len(sent_train)
    print "\nExpert: %s " % expert

    #### EXPERT CLASSIFIER: SENTENCES
    print("Training sentence expert")

    sent_clf = None
    if args.cheating:
        labels, sent_train = split_data_sentences(expert_data.sentence.train, sent_detector, vct, limit=args.limit)

        expert_data.sentence.train.data = sent_train
        expert_data.sentence.train.target = np.array(labels)
        expert_data.sentence.train.bow = vct.transform(expert_data.sentence.train.data)
        sent_clf = set_classifier(args.classifier, parameter=args.expert_penalty)
        sent_clf.fit(expert_data.sentence.train.bow, expert_data.sentence.train.target)

    #### STUDENT CLASSIFIER
    clf = set_classifier(args.classifier, parameter=args.expert_penalty)


    print "\nStudent Classifier: %s" % clf
    print "\nSentence Classifier: %s" % sent_clf
    print "\nExpert Oracle Classifier: %s" % exp_clf
    print "Penalty:", exp_clf.C
    print "Oracle "
    #### ACTIVE LEARNING SETTINGS
    step_size = args.step_size
    bootstrap_size = args.bootstrap
    evaluation_points = 200

    print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(step_size, bootstrap_size,
                                                                                          evaluation_points, args.fixk,
                                                                                          min_size))
    print ("Anytime active learning experiment - use objective function to pick data")
    t0 = time.time()
    tac = []
    tau = []
    ### experiment starts
    for t in range(args.trials):

        trial_accu = []

        trial_aucs = []

        print "*" * 60
        print "Trial: %s" % t

        student = get_student(clf, cost_model, sent_clf, sent_detector, vct)
        student.human_mode = args.expert == 'human'
        print "\nStudent: %s " % student

        train_indices = []
        neutral_data = []  # save the xik vectors
        train_x = []
        train_y = []
        neu_x = []  # data to train the classifier
        neu_y = np.array([])

        pool = Bunch()
        pool.data = data.train.bow.tocsr()  # full words, for training
        pool.text = data.train.data
        pool.target = data.train.target
        pool.predicted = []
        pool.remaining = range(pool.data.shape[0]) # indices of the pool
        rand = RandomState(t * 1234)
        rand.shuffle(pool.remaining)
        pool.offset = 0


        bootstrapped = False
        current_cost = 0
        iteration = 0
        query_index = None
        query_size = None
        oracle_answers = 0
        calibrated=args.calibrate
        while 0 < student.budget and len(pool.remaining) > pool.offset and iteration <= args.maxiter:
            util = []

            if not bootstrapped:
                query_index = pool.remaining[:bootstrap_size]
                bootstrapped = True
                query = pool.data[query_index]

                print
            else:
                # if not calibrated:
                #     chosen = student.pick_next(pool=pool, step_size=step_size)
                # else:
                #     chosen = student.pick_next_cal(pool=pool, step_size=step_size)
                chosen = student.pick_next(pool=pool, step_size=step_size)

                query_index = [x for x, y in chosen]  # document id of chosen instances
                query = [y for x, y in chosen]  # sentence of the document

                query_size = [1] * len(query_index)

            ground_truth = pool.target[query_index]

            if iteration == 0:  ## bootstrap uses ground truth
                labels = ground_truth
                spent = [0] * len(ground_truth)  ## bootstrap cost is ignored
            else:
                # print "ask labels"
                if isinstance(expert, baseexpert.HumanExpert):
                    labels = expert.label_instances(query, ground_truth)
                    # raise Exception("Oops, this is not ready, yet.")
                else:
                    labels = expert.label_instances(query, ground_truth)
                spent = expert.estimate_instances(query_size)

            ### accumulate the cost of the query
            query_cost = np.array(spent).sum()
            current_cost += query_cost

            useful_answers = np.array([[x, y] for x, y in zip(query_index, labels) if y is not None])

            neutral_answers = np.array([[x, z] for x, y, z in zip(query_index, labels, query_size) if y is None]) \
                if iteration != 0 else np.array([])

            ## add data recent acquired to train
            if useful_answers.shape[0] != 0:
                train_indices.extend(useful_answers[:, 0])

                # add labels to training
                train_x = pool.data[train_indices]  # # train with all the words

                # update labels with the expert labels
                train_y.extend(useful_answers[:, 1])

            neu_x, neu_y, neutral_data = update_sentence(neutral_data, neu_x, neu_y, labels, query_index, pool, vct)  # update sentence student classifier data

            if neu_y.shape[0] != neu_x.shape[0]:
                raise Exception("Training data corrupted!")
            if train_x.shape[0] != len(train_y):
                raise Exception("Training data corrupted!")

            # remove labels from pool
            pool.offset = len(train_indices)

            # retrain the model
            current_model = student.train_all(train_x, train_y, neu_x, neu_y)

            # evaluate and save results
            y_probas = current_model.predict_proba(data.test.bow)

            auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1])

            pred_y = current_model.classes_[np.argmax(y_probas, axis=1)]

            correct_labels = np.sum(np.array(ground_truth) == np.array(labels).reshape(len(labels)))

            accu = metrics.accuracy_score(data.test.target, pred_y)
            if not student.human_mode:
                print ("TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tGT:{5}\tneu:{6}\t{7}\tND:{8}\tTD:{9}\t ora_accu:{10}".format(
                    len(train_indices),
                    accu,
                    auc, query_cost,
                    current_cost,
                    ground_truth,
                    len(neutral_answers), neu_y.shape[0], neu_y.sum(), np.sum(train_y), correct_labels))

            ## the results should be based on the cost of the labeling
            if iteration > 0:  # bootstrap iteration

                student.budget -= query_cost  ## Bootstrap doesn't count
                # oracle accuracy (from queries)
                oracle_answers += correct_labels
                x_axis_range = current_cost
                x_axis[x_axis_range].append(current_cost)
                ## save results
                accuracies[x_axis_range].append(accu)
                aucs[x_axis_range].append(auc)
                # ora_accu[x_axis_range].append(1. * correct_labels/len(ground_truth))
                ora_accu[x_axis_range].append(1. * correct_labels)
                ora_cm[x_axis_range].append(metrics.confusion_matrix(ground_truth, labels, labels=np.unique(train_y)))
                lbl_dit[x_axis_range].append(np.sum(train_y))
                # partial trial results
                trial_accu.append([x_axis_range, accu])
                trial_aucs.append([x_axis_range, auc])
                # oracle_accuracies[x_axis_range].append(oracle_answers)
            iteration += 1
            # end of budget loop

        tac.append(trial_accu)
        tau.append(trial_aucs)
        oracle_accuracies.append(1.*oracle_answers / (len(train_indices)-bootstrap_size))
        print "Trial: {}, Oracle right answers: {}, Iteration: {}, Labels:{}, ACCU-OR:{}".format(t, oracle_answers,
                 iteration, len(train_indices)-bootstrap_size,1.*oracle_answers / (len(train_indices)-bootstrap_size))
        #end trial loop
    if "uniform" not in args.cost_function:
        accuracies = extrapolate_trials(tac, cost_25=parameters[1][1], step_size=args.step_size)
        aucs = extrapolate_trials(tau, cost_25=parameters[1][1], step_size=args.step_size)
    print "\nAverage oracle accuracy: ", np.array(oracle_accuracies).mean()
    print("Elapsed time %.3f" % (time.time() - t0))
    cheating = "CHEATING" if args.cheating else "NOCHEAT"
    print_extrapolated_results(accuracies, aucs, file_name=args.train+"-"+cheating+"-"+args.prefix+"-"+args.classifier+"-"+args.student)
    oracle_accuracy(ora_accu, file_name=args.train+"-"+cheating+"-"+args.prefix+"-"+args.classifier+"-"+args.student, cm=ora_cm, num_trials=args.trials)
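Each trial above shuffles the pool of document indices with RandomState(t * 1234), so re-running trial t reproduces the same bootstrap set and query order. A minimal sketch of that per-trial pool setup, assuming plain integer document ids:

from numpy.random import RandomState

def init_pool(n_docs, trial, bootstrap_size):
    # Seed from the trial number: re-running trial t reproduces its pool order.
    rand = RandomState(trial * 1234)
    remaining = list(range(n_docs))
    rand.shuffle(remaining)
    bootstrap_ids = remaining[:bootstrap_size]  # labeled with ground truth
    return remaining, bootstrap_ids

remaining, bootstrap_ids = init_pool(n_docs=1000, trial=0, bootstrap_size=50)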
Beispiel #29
0
class PMF(ModelBase):
    """Probabilistic Matrix Factorization
    """

    def __init__(self, n_user, n_item, n_feature, batch_size=1e5, epsilon=50.0,
                 momentum=0.8, seed=None, reg=1e-2, converge=1e-5,
                 max_rating=None, min_rating=None):

        super(PMF, self).__init__()
        self.n_user = n_user
        self.n_item = n_item
        self.n_feature = n_feature

        self.random_state = RandomState(seed)

        # batch size
        self.batch_size = batch_size

        # learning rate
        self.epsilon = float(epsilon)
        self.momentum = float(momentum)
        # regularization parameter
        self.reg = reg
        self.converge = converge
        self.max_rating = float(max_rating) \
            if max_rating is not None else max_rating
        self.min_rating = float(min_rating) \
            if min_rating is not None else min_rating

        # data state
        self.mean_rating_ = None
        # user/item features
        self.user_features_ = 0.1 * self.random_state.rand(n_user, n_feature)
        self.item_features_ = 0.1 * self.random_state.rand(n_item, n_feature)

    def fit(self, ratings, n_iters=50):

        check_ratings(ratings, self.n_user, self.n_item,
                      self.max_rating, self.min_rating)

        self.mean_rating_ = np.mean(ratings[:, 2])
        last_rmse = None
        batch_num = int(np.ceil(float(ratings.shape[0]) / self.batch_size))
        logger.debug("batch count = %d", batch_num)

        # momentum
        u_feature_mom = np.zeros((self.n_user, self.n_feature))
        i_feature_mom = np.zeros((self.n_item, self.n_feature))
        # gradient
        u_feature_grads = np.zeros((self.n_user, self.n_feature))
        i_feature_grads = np.zeros((self.n_item, self.n_feature))
        for iteration in xrange(n_iters):
            logger.debug("iteration %d...", iteration)

            self.random_state.shuffle(ratings)

            for batch in xrange(batch_num):
                start_idx = int(batch * self.batch_size)
                end_idx = int((batch + 1) * self.batch_size)
                data = ratings[start_idx:end_idx]

                # compute gradient
                u_features = self.user_features_.take(
                    data.take(0, axis=1), axis=0)
                i_features = self.item_features_.take(
                    data.take(1, axis=1), axis=0)
                preds = np.sum(u_features * i_features, 1)
                errs = preds - (data.take(2, axis=1) - self.mean_rating_)
                err_mat = np.tile(2 * errs, (self.n_feature, 1)).T
                u_grads = i_features * err_mat + self.reg * u_features
                i_grads = u_features * err_mat + self.reg * i_features

                u_feature_grads.fill(0.0)
                i_feature_grads.fill(0.0)
                for i in xrange(data.shape[0]):
                    row = data.take(i, axis=0)
                    u_feature_grads[row[0], :] += u_grads.take(i, axis=0)
                    i_feature_grads[row[1], :] += i_grads.take(i, axis=0)

                # update momentum
                u_feature_mom = (self.momentum * u_feature_mom) + \
                    ((self.epsilon / data.shape[0]) * u_feature_grads)
                i_feature_mom = (self.momentum * i_feature_mom) + \
                    ((self.epsilon / data.shape[0]) * i_feature_grads)

                # update latent variables
                self.user_features_ -= u_feature_mom
                self.item_features_ -= i_feature_mom

            # compute RMSE
            train_preds = self.predict(ratings[:, :2])
            train_rmse = RMSE(train_preds, ratings[:, 2])
            logger.info("iter: %d, train RMSE: %.6f", iteration, train_rmse)

            # stop when converge
            if last_rmse and abs(train_rmse - last_rmse) < self.converge:
                logger.info('converges at iteration %d. stop.', iteration)
                break
            else:
                last_rmse = train_rmse
        return self

    def predict(self, data):

        if self.mean_rating_ is None:
            raise NotFittedError()

        u_features = self.user_features_.take(data.take(0, axis=1), axis=0)
        i_features = self.item_features_.take(data.take(1, axis=1), axis=0)
        preds = np.sum(u_features * i_features, 1) + self.mean_rating_

        if self.max_rating is not None:
            preds[preds > self.max_rating] = self.max_rating

        if self.min_rating is not None:
            preds[preds < self.min_rating] = self.min_rating
        return preds
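PMF.fit above reshuffles the ratings array in place at the start of every iteration and then walks it in fixed-size slices. A minimal sketch of that epoch/mini-batch loop, assuming an integer batch size and a toy (n_samples, 3) array of (user, item, rating) rows:

import numpy as np
from numpy.random import RandomState

def iter_epochs(ratings, batch_size, n_iters, seed=0):
    # Shuffle the rows in place each iteration, then yield contiguous mini-batches.
    rs = RandomState(seed)
    batch_num = int(np.ceil(float(ratings.shape[0]) / batch_size))
    for iteration in range(n_iters):
        rs.shuffle(ratings)
        for batch in range(batch_num):
            yield iteration, ratings[batch * batch_size:(batch + 1) * batch_size]

ratings = np.arange(21).reshape(7, 3)
for it, batch in iter_epochs(ratings, batch_size=3, n_iters=2):
    pass  # each batch holds at most 3 (user, item, rating) rows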
Beispiel #30
0
    # load test set data - the same set used for the ML tests
    seed = 987654321
    # set the numpy random seed so results are reproducible
    rs = RandomState(seed)

    # set common path variables
    label_file = './data/input/SDS_PV2_combined/SDS_PV2_class_labels.txt'

    # read data
    label_data = pd.read_csv(label_file)

    # partition the data
    pos_cases, neg_cases = wrangle.partion(label_data['doc_norm']==1, label_data, ratios=[0.8,0.2])
    train_mask = np.concatenate((pos_cases[0], neg_cases[0]))
    test_mask = np.concatenate((pos_cases[1], neg_cases[1]))
    rs.shuffle(train_mask)
    rs.shuffle(test_mask)
    train_labels = label_data.iloc[train_mask]
    test_labels = label_data.iloc[test_mask]
    # read in the text reports
    train_reports = [load_report('{0}/{1}_fi.txt'.format(report_path, pid)) for pid in train_labels['pid']]
    test_reports = [load_report('{0}/{1}_fi.txt'.format(report_path, pid)) for pid in test_labels['pid']]

    #import keywords
    keywords = {}
    with open(keyword_file, 'r') as f:
        key = ""
        for line in f.readlines():
            if line.startswith("#"):
                key = line[1:].strip('\n')
            else:
                # assumed completion - the original listing is truncated here;
                # non-header lines are collected under the current section key
                keywords.setdefault(key, []).append(line.strip('\n'))
    else:
        raise Exception('verify failed: %s' % file_path)
    return file_path

# load or download MovieLens 1M dataset
rating_file = ml_1m_download(ML_1M_FOLDER, file_size=ML_1M_ZIP_SIZE)
ratings = load_movielens_1m_ratings(rating_file)
n_user = max(ratings[:, 0])
n_item = max(ratings[:, 1])

# shift user_id & movie_id by 1. let user_id & movie_id start from 0
ratings[:, (0, 1)] -= 1

# split data to training & testing
train_pct = 0.9
rand_state.shuffle(ratings)
train_size = int(train_pct * ratings.shape[0])
train = ratings[:train_size]
validation = ratings[train_size:]

# models settings
n_feature = 10
eval_iters = 10
print("n_user: %d, n_item: %d, n_feature: %d, training size: %d, validation size: %d" % (
    n_user, n_item, n_feature, train.shape[0], validation.shape[0]))
als = ALS(n_user=n_user, n_item=n_item, n_feature=n_feature,
          reg=5e-2, max_rating=5., min_rating=1., seed=0)

als.fit(train, n_iters=eval_iters)
train_preds = als.predict(train[:, :2])
train_rmse = RMSE(train_preds, train[:, 2])
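The MovieLens fragment above relies on a rand_state defined earlier in the original script. A minimal sketch of the shuffle-and-slice split it performs, with an assumed seed and a toy ratings array:

import numpy as np
from numpy.random import RandomState

rand_state = RandomState(0)             # assumed seed; the original definition is not shown

ratings = np.arange(30).reshape(10, 3)  # stand-in for the (user, item, rating) rows
train_pct = 0.9
rand_state.shuffle(ratings)             # shuffle the rows in place before splitting
train_size = int(train_pct * ratings.shape[0])
train = ratings[:train_size]
validation = ratings[train_size:]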