def trainer(z, split=3500, pre_train=True):
    """ Trainer function for a LBL model """
    # Unpack some stuff
    ngrams = z['ngrams']
    labels = z['labels']
    instances = z['instances']
    word_dict = z['word_dict']
    index_dict = z['index_dict']
    context = z['context']
    vocabsize = len(z['word_dict'])

    # Load word embeddings
    if pre_train:
        embed_map = lm_tools.load_embeddings()
    else:
        embed_map = None

    # Initialize the network
    net = lbl.LBL(name='lbl',
                  loc='models/lbl.pkl',
                  seed=1234,
                  criteria='validation_pp',
                  k=5,
                  V=vocabsize,
                  K=50,
                  context=context,
                  batchsize=20,
                  maxepoch=100,
                  eta_t=0.2,
                  gamma_r=1e-4,
                  gamma_c=1e-5,
                  f=0.998,
                  p_i=0.5,
                  p_f=0.9,
                  T=20.0,
                  verbose=1)

    # Break up the data for training and validation
    inds = np.arange(len(ngrams))  # np.arange(5) = [0,1,2,3,4]
    prng = RandomState(net.seed)
    prng.shuffle(inds)

    # Get the size of the validation set
    ngramsV = [ngrams[i] for i in inds[-split:]]
    flat_ngramsV = [item for sublist in ngramsV for item in sublist]
    instance_split = len(flat_ngramsV)

    inds = np.arange(len(instances))
    prng = RandomState(net.seed)
    prng.shuffle(inds)
    X = instances[inds[:-instance_split]]
    V = instances[inds[-instance_split:]]
    Y = labels[inds[:-instance_split]]
    VY = labels[inds[-instance_split:]]

    # Train the network
    net.train(X, Y, V, VY, index_dict, word_dict, embed_map)
def make_ratings(n_users, n_items, min_rating_per_user, max_rating_per_user,
                 rating_choices, seed=None, shuffle=True):
    """Randomly generate a (user_id, item_id, rating) array

    Returns
    -------
    ndarray with shape (n_samples, 3)
    """
    if not (isinstance(rating_choices, list) or isinstance(rating_choices, tuple)):
        raise ValueError("'rating_choices' must be a list or tuple")
    if min_rating_per_user < 0 or min_rating_per_user >= n_items:
        raise ValueError("invalid 'min_rating_per_user'")
    if (min_rating_per_user > max_rating_per_user) or \
            (max_rating_per_user >= n_items):
        raise ValueError("invalid 'max_rating_per_user'")

    rs = RandomState(seed=seed)
    user_arrs = []
    for user_id in xrange(n_users):
        item_count = rs.randint(min_rating_per_user, max_rating_per_user)
        item_ids = rs.choice(n_items, item_count, replace=False)
        ratings = rs.choice(rating_choices, item_count)
        arr = np.stack(
            [np.repeat(user_id, item_count), item_ids, ratings], axis=1)
        user_arrs.append(arr)

    ratings = np.array(np.vstack(user_arrs))
    ratings[:, 2] = ratings[:, 2].astype('float')
    if shuffle:
        rs.shuffle(ratings)
    return ratings
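# Hedged usage sketch (not part of the original source): a minimal call to
# make_ratings() above with small, made-up parameter values. Only numpy and
# numpy.random.RandomState (already used by the function) are required.
demo_ratings = make_ratings(n_users=5, n_items=10,
                            min_rating_per_user=2, max_rating_per_user=5,
                            rating_choices=[1, 2, 3, 4, 5], seed=0)
print(demo_ratings.shape)  # (n_samples, 3): columns are user_id, item_id, rating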
def split(dataset, test_size=0.5, random_state=None):
    if random_state is None:
        random_state = np.random.randint(0, 999999)
    nb = dataset.X.shape[0]
    nb_test = int(nb * test_size)
    nb_train = nb - nb_test
    rng = RandomState(random_state)
    indices = np.arange(0, nb)
    rng.shuffle(indices)
    indices_train = indices[0:nb_train]
    indices_test = indices[nb_train:]

    X = dataset.X[indices_train]
    if hasattr(dataset, 'y') and dataset.y is not None:
        y = dataset.y[indices_train]
    else:
        y = None
    dataset_train = Manual(X, y)
    if hasattr(dataset, "img_dim"):
        dataset_train.img_dim = dataset.img_dim
    if hasattr(dataset, "output_dim"):
        dataset_train.output_dim = dataset.output_dim

    X = dataset.X[indices_test]
    if hasattr(dataset, 'y') and dataset.y is not None:
        y = dataset.y[indices_test]
    else:
        y = None
    dataset_test = Manual(X, y)
    if hasattr(dataset, "img_dim"):
        dataset_test.img_dim = dataset.img_dim
    if hasattr(dataset, "output_dim"):
        dataset_test.output_dim = dataset.output_dim

    return dataset_train, dataset_test
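# Hedged usage sketch (not part of the original source): assumes the Manual
# dataset wrapper used above simply stores X (and optionally y) as attributes.
demo_ds = Manual(np.random.rand(100, 5), np.arange(100))
demo_train, demo_test = split(demo_ds, test_size=0.25, random_state=0)
print(demo_train.X.shape)  # e.g. (75, 5); demo_test.X holds the other 25 rows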
def train(self, X, XY, V, VY, count_dict, word_dict, embed_map):
    """ Trains the LBL """
    self.start = self.seed
    self.init_params(embed_map, count_dict, XY)
    inds = np.arange(len(X))
    numbatches = len(inds) / self.batchsize
    curr = 1e20
    counter = 0
    target = None
    num = 15000

    # Main loop
    stop.display_phase(1)
    for epoch in range(self.maxepoch):
        self.epoch = epoch
        tic = time.time()
        prng = RandomState(self.seed + epoch + 1)
        prng.shuffle(inds)
        for minibatch in range(numbatches):
            batchX = X[inds[minibatch::numbatches]]
            batchY = XY[inds[minibatch::numbatches]]
            (words, acts, preds) = self.forward(batchX)
            self.backward(batchY, preds, acts, words, batchX)
            self.update_params(batchX)
        self.update_hyperparams()
        toc = time.time()

        # Results and stopping criteria
        obj = self.compute_obj(X[:num], XY[:num])
        obj_val = self.compute_obj(V[:num], VY[:num])
        if self.verbose > 0:
            stop.display_results(epoch, toc - tic, obj, obj_val)
        (curr, counter) = stop.update_result(curr, obj_val, counter)
        if counter == 0:
            stop.save_model(self, self.loc)
            stopping_target = obj

        if stop.criteria_complete(self, epoch, curr, obj, counter,
                                  self.k, obj_val, target):
            if self.criteria == 'maxepoch':
                break
            elif self.criteria == 'validation_pp':
                self = stop.load_model(self.loc)
                counter = 0
                X = np.r_[X, V]
                XY = vstack([XY, VY]).tocsr()
                self.criteria = 'll_train_heldout'
                target = stopping_target  # obj
                stop.display_phase(2)
                inds = range(X.shape[0])
                prng.shuffle(inds)
                numbatches = len(inds) / self.batchsize
            elif self.criteria == 'll_train_heldout':
                break
def shuffle_data(X, L, seed=1234):
    """ Shuffle the data """
    prng = RandomState(seed)
    inds = np.arange(len(X))
    prng.shuffle(inds)
    X = [X[i] for i in inds]
    L = L[inds]
    return (X, L)
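# Hedged usage sketch (not part of the original source): shuffle_data() keeps a
# list of samples and a parallel numpy label array aligned under one seed.
X_demo = ['a', 'b', 'c', 'd']
L_demo = np.array([10, 11, 12, 13])
X_shuf, L_shuf = shuffle_data(X_demo, L_demo, seed=1234)
# the same permutation is applied to both X_shuf and L_shuf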
def txt2im(net, z, txt, k=5, search=100, seed=1234):
    """
    Given text query txt, retrieve the top-k images from z['IM']
    For speed, only searches over a random subset of 'search' images
    """
    inds = np.arange(len(z['IM']))
    prng = RandomState(seed)
    prng.shuffle(inds)
    ims = lm_tools.txt2im(net, txt, z['IM'][inds[:search]], z['word_dict'], k=k)
    return inds[ims]
def train_test_split(X, y, test_size, random_state):
    indices = numpy.arange(len(X))
    prng = RandomState(random_state)
    prng.shuffle(indices)
    e_ind = int(round(len(X) * test_size))
    training_idx = indices[e_ind:len(X)]
    test_idx = indices[0:e_ind]
    X_training, X_test = X[training_idx, :], X[test_idx, :]
    y_training, y_test = y[training_idx, :], y[test_idx, :]
    rVal = [X_training, X_test, y_training, y_test]
    return rVal
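# Hedged usage sketch (not part of the original source): train_test_split()
# above expects 2-D X and y (it indexes with [idx, :]); the values are made up.
X_demo = numpy.arange(20).reshape(10, 2)
y_demo = numpy.arange(10).reshape(10, 1)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, test_size=0.3, random_state=42)
# 3 of the 10 rows go to the test split, the remaining 7 to training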
def shuffle(seed=None, *args):
    """ Shuffles the given lists in parallel. """
    indices = range(len(args[0]))
    prng = RandomState(seed)
    prng.shuffle(indices)
    shuffled = []
    for lst in args:
        shuffled.append([lst[k] for k in indices])
    return shuffled
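# Hedged usage sketch (not part of the original source): shuffle() takes the
# seed first, then any number of equal-length lists to permute in parallel.
a_shuf, b_shuf = shuffle(1234, [1, 2, 3, 4], ['w', 'x', 'y', 'z'])
# a_shuf and b_shuf are reordered with the same permutation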
def _iter_fast(self, ds, batch_size, start=None, end=None,
               shuffle=True, seed=None):
    # create random seed
    prng1 = None
    prng2 = _dummy_shuffle
    if shuffle:
        if seed is None:
            seed = get_random_magic_seed()
        prng1 = RandomState(seed)
        prng2 = RandomState(seed)

    batches = create_batch(ds.shape[0], batch_size, start, end, prng1)
    prng2.shuffle(batches)

    for i, j in batches:
        data = ds[i:j]
        yield self._normalizer(data[prng2.permutation(data.shape[0])])
def main():
    from numpy.random import RandomState
    rng = RandomState(1337)

    # get samples
    os.chdir(data)
    samples = glob("*.zip")

    # shuffle or sort
    # samples.sort()
    rng.shuffle(samples)

    # remove previously unzipped files
    for i in set(glob("*")).difference(samples):
        shutil.rmtree(i)

    print len(samples), "samples found"

    # start preprocessing
    gather_stats(samples)
def lhcSample(bounds, N, seed=None):
    """
    Perform latin hypercube sampling.

    @param bounds: sequence of [min, max] bounds for the space
    @param N: number of samples
    @return: list of sample points (represented as arrays)
    """
    rs = RandomState(seed)
    samp = []
    for bmin, bmax in bounds:
        if bmin == bmax:
            dsamp = array([bmin] * N)
        else:
            dsamp = (bmax - bmin) * rs.rand(N) / N + arange(bmin, bmax, (bmax - bmin) / N)
        rs.shuffle(dsamp)
        samp.append(dsamp)

    return list(vstack(samp).T)
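# Hedged usage sketch (not part of the original source): a two-dimensional
# Latin hypercube sample; assumes numpy's array/arange/vstack are imported at
# module level as lhcSample() above requires.
demo_bounds = [[0.0, 1.0], [10.0, 20.0]]
demo_points = lhcSample(demo_bounds, 5, seed=42)
# 5 points, each an array of length 2, one draw per stratum along each axis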
def load_pairs(dataset, im_type, step_size=[]):
    offset = 0
    max_difference = 0.2
    random_state = 42
    dsRawData = load_data(dataset, offset, max_difference)
    dir_list = dsRawData[0]
    data_y = dsRawData[1]
    X_Pairs = []
    for s in step_size:
        tmp_X_train, tmp_y_train, tmp_overlaps_train = prepare_data(s, dir_list, data_y)
        if len(X_Pairs) == 0:
            X_Pairs = tmp_X_train
        else:
            X_Pairs = numpy.concatenate((X_Pairs, tmp_X_train))
    X_Pairs = numpy.asarray(X_Pairs)
    prng = RandomState(random_state)
    prng.shuffle(X_Pairs)
    return X_Pairs
def permute_rows(m, prng=None):
    """
    Permute the rows of a matrix in-place

    Parameters
    ----------
    m : array-like
        A 2-d array
    prng : RandomState instance or None, optional (default=None)
        If RandomState instance, prng is the pseudorandom number generator;
        If None, the pseudorandom number generator is the RandomState
        instance used by `np.random`.

    Returns
    -------
    None
        Original matrix is permuted in-place, nothing returned
    """
    if prng is None:
        prng = RandomState()
    for row in m:
        prng.shuffle(row)
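# Hedged usage sketch (not part of the original source): permute_rows() shuffles
# the entries within each row of the matrix independently, in place.
m_demo = np.arange(12).reshape(3, 4)
permute_rows(m_demo, RandomState(0))
print(m_demo)  # each row holds the same values as before, reordered per row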
def test_end2end_known_test_data(self): if rs.app.config['RUN_TESTS']: # training/test data and output files #label_file = '../data/input/SDS_PV2_combined/SDS_PV2_class_labels.txt' label_file = rs.app.config['LABEL_FILE'] #self.report_path = '../data/input/SDS_PV2_combined/reports' #self.report_path = rs.app.config['TEXT_REPORT_DIR'] label_data = pd.read_csv(label_file) key_start = int(rs.app.config['REGION_COL_START']) key_stop = int(rs.app.config['REGION_COL_STOP'])+1 region_keys = label_data.columns[key_start:key_stop] # set the numpy random seed so results are reproducible randstate = RandomState(987654321) # partition the data pos_cases, neg_cases = wrangle.partion(label_data['doc_norm']==1, label_data, ratios=[0.8,0.2]) test_mask = np.concatenate((pos_cases[1], neg_cases[1])) randstate.shuffle(test_mask) test_labels = label_data.iloc[test_mask] #report_path = '../data/input/SDS_PV2_combined/reports' report_path = rs.app.config['TEXT_REPORT_DIR'] test_reports = [self.load_report('{0}/{1}.txt'.format(report_path, pid)) for pid in test_labels['pid']] min_acc = float(rs.app.config['MIN_ACCURACY']) # send reports individually as multiple requests accuracy = [0,0,0,0] region_labels = ['inner','middle', 'outer', 'mastoid'] for idx, tpl in enumerate(zip(test_labels['pid'],test_reports)): data = {tpl[0]:tpl[1]} request_body = json.dumps(data) rv = self.app.post('/classify',data=request_body, content_type='application/json') rdata = json.loads(rv.data) for jdx,label in enumerate(region_labels): act = test_labels[label].iloc[idx] pred = rdata[tpl[0]][jdx] if act == pred: accuracy[jdx] += 1 accuracy = [v/float(len(test_labels)) for v in accuracy] for v in accuracy: self.assertGreater(v, min_acc, 'Failed accuracy on individual post test {0}'.format(accuracy)) #send reports in one batch requiest accuracy = [0,0,0,0] region_labels = ['inner','middle', 'outer', 'mastoid'] data = {} for idx, tpl in enumerate(zip(test_labels['pid'],test_reports)): data[tpl[0]]=tpl[1] request_body = json.dumps(data) rv = self.app.post('/classify',data=request_body, content_type='application/json') rdata = json.loads(rv.data) for idx,pid in enumerate(test_labels['pid']): for jdx,label in enumerate(region_labels): act = test_labels[label].iloc[idx] pred = rdata[pid][jdx] if act == pred: accuracy[jdx] += 1 accuracy = [v/float(len(test_labels)) for v in accuracy] for v in accuracy: self.assertGreater(v, min_acc, 'Failed accuracy on batch post test {0}'.format(accuracy))
os.mkdir(save_path + "/" + model_name)
with open(save_path + '/' + model_name + "/arguments.txt", "w") as f:
    f.write(str(args))

prng = RandomState(random_state)

lexicon = get_lexicon()
classes = {j: i for i, j in enumerate(lexicon)}
inverse_classes = {v: k for k, v in classes.items()}
print(" [INFO] %s" % classes)

if mjsynth:
    train = open(os.path.join(path, training_fname), "r").readlines()
    train = parse_mjsynth(path, train)
    prng.shuffle(train)
    val = np.array(open(os.path.join(path, val_fname), "r").readlines())
    val = parse_mjsynth(path, val)
else:
    train = [
        os.path.join(dp, f)
        for dp, dn, filenames in os.walk(path)
        for f in filenames if re.search('png|jpeg|jpg', f)
    ]
    prng.shuffle(train)
    length = len(train)
    train, val = train[:int(length * train_portion)], train[int(length * train_portion):]
def tabu_search(core, z, neighbordict,numP,w,floor, floor_variable,lockSoln, lockflag,lockVar, maxfailures=50,maxiterations=15): ##Pseudo constants pid = mp.current_process()._identity[0] tabu_list = deque(maxlen=sharedupdate[2][core])#What is this core's tabu list length? maxiterations *= cores #Test synchronize cores to exit at the same time. maxfailures += int(maxfailures*uniform(-1.1, 1.2))#James et. al 2007 def _tabu_check(tabu_list, neighbor, region, old_membership): if tabu_list:#If we have a deque with contents for tabu_region in(tabu_list): if neighbor == tabu_region[0]: #print neighbor, tabu_region[0] if region == tabu_region[1] and old_membership == tabu_region[2]: return False def _diversify_soln(core_soln_column, neighbordict,z, floor_variable,lockSoln): #Initialize a local swap space to store n best diversified soln - these do not need to be better div_soln_space = np.empty(sharedSoln.shape) div_soln_space[:] = np.inf div_variance = np.array([sharedVar[:,core_soln_column]] * sharedVar.shape[1]).T workingSoln = np.copy(sharedSoln[0:,core_soln_column]) workingVar = np.copy(sharedVar[:,core_soln_column]) #Iterate through the regions and check all moves, store the 4 best. for region in np.unique(workingSoln[1:]): members = np.where(workingSoln == region)[0] neighbors = [] for member in members: candidates = neighbordict[member-1]#neighbordict is 0 based, member is 1 based candidates = [candidate for candidate in candidates if candidate not in members] candidates = [candidate for candidate in candidates if candidate not in neighbors] neighbors.extend(candidates) candidates = [] #Iterate through the neighbors for neighbor in neighbors: neighborSoln = np.copy(workingSoln[1:]) #Pull a copy of the local working version old_membership = neighborSoln[neighbor]#Track where we started to check_floor neighborSoln[neighbor] = region #Move the neighbor into the new region in the copy #Check the floor floor_check = True if not np.sum(floor_variable[neighborSoln == old_membership]) >= floor: continue if not np.sum(floor_variable[neighborSoln == region]) >= floor: continue #Check contiguity of the swap block = np.where(workingSoln[1:] == neighbor)[0].tolist() if not check_contiguity(neighbordict, block, neighbor): continue #Compute the local variance varcopy = np.copy(workingVar) varcopy[old_membership] = np.var(z[neighborSoln == old_membership]) varcopy[region] = np.var(z[neighborSoln == region]) varcopy[0] = np.sum(varcopy[1:]) if np.any(div_soln_space[0] == np.inf): column = np.where(np.isinf(div_soln_space[0]) == True)[0][0] div_soln_space[1:,column] = neighborSoln[:] div_soln_space[0:,column][0] = varcopy[0] div_variance[:,column] = varcopy else: column = np.argmax(div_soln_space[0]) div_soln_space[1:,column] = neighborSoln[:] div_soln_space[0:,column][0] = varcopy[0] div_variance[:,column] = varcopy #Write one of the neighbor perturbations to the shared memory space to work on. valid = np.where(div_soln_space[0] != np.inf)[0] #print sharedVar[:,core_soln_column] try: selection = randint(0,len(valid)-1) with lockSoln: sharedSoln[:,core_soln_column] = div_soln_space[:,selection] with lockVar: sharedVar[:,core_soln_column] = div_variance[:,selection] #print sharedVar[:,core_soln_column] except: pass #print div_soln_space[0] print "Attempt to diversify failed." ##This shows that we are operating asynchronously. 
#if core ==2: #time.sleep(5) while sum(sharedupdate[1]) < maxiterations: core_soln_column = (core + sharedupdate[1][core])%len(sharedupdate[1]) if sharedupdate[0][core_soln_column] == False: _diversify_soln(core_soln_column, neighbordict,z, floor_variable,lockSoln) #Li, et. al (in press - P-Compact_Regions) #print "ProcessID %i is processing soln column %i in iteration %i."%(pid, core_soln_column,sharedupdate[1][core]) #Uncomment to see that cores move around the search space failures = 0 #The total local failure counter local_best_variance = sharedSoln[:,core_soln_column][0] workingSoln = np.copy(sharedSoln[:,core_soln_column]) workingVar = np.copy(sharedVar[:,core_soln_column]) while failures <= maxfailures: #Select a random starting point in the search space. nr = np.unique(workingSoln[1:]) #This is 0 based, ie. region 0 - region 31 regionIDs = nr changed_regions = np.ones(len(nr)) randstate = RandomState(pid) #To 'unsync' the cores we need to instantiate a random class with a unique seed. randstate.shuffle(regionIDs) #shuffle the regions so we start with a random region changed_regions[:] = 0 swap_flag = False #Flag to stop looping prior to max iterations if we are not improving. #Iterate through the regions, checking potential swaps for region in regionIDs: members = np.where(workingSoln == region)[0] #get the members of the region #Get the neighbors to the members. Grab only those that could change. neighbors = [] for member in members: candidates = neighbordict[member-1]#neighbordict is 0 based, member is 1 based candidates = [candidate for candidate in candidates if candidate not in members] candidates = [candidate for candidate in candidates if candidate not in neighbors] neighbors.extend(candidates) candidates = [] #Iterate through the neighbors for neighbor in neighbors: neighborSoln = np.copy(workingSoln[1:]) #Pull a copy of the local working version old_membership = neighborSoln[neighbor]#Track where we started to check_floor #For whatever reason candidates block is adding other units in the region, ie. we test moving from region 1 to region 1... '''ToDO: Check the candidates code above, something is wrong with it...testing more than necessary''' if old_membership == region: continue #Check the tabu list tabu_move_check = _tabu_check(tabu_list, neighbor, region, old_membership) if tabu_move_check is not None: continue neighborSoln[neighbor] = region #Move the neighbor into the new region in the copy #Check the floor floor_check = True if not np.sum(floor_variable[neighborSoln == old_membership]) >= floor: continue if not np.sum(floor_variable[neighborSoln == region]) >= floor: continue #Compute the local variance varcopy = np.copy(workingVar) varcopy[old_membership] = np.var(z[neighborSoln== old_membership]) varcopy[region] = np.var(z[neighborSoln == region]) varcopy[0] = np.sum(varcopy[1:]) #Check the locally compute varaince against current working best. if varcopy[0] >= workingSoln[0]: continue #Check contiguity of the swap block = np.where(workingSoln[1:] == neighbor)[0].tolist() if not check_contiguity(neighbordict, block, neighbor): #print "Failed contiguity check" continue #After this we have passed all tests, a check of all possible swaps has yielded a better one. 
swap_flag = True workingSoln[0] = varcopy[0] workingSoln[1:] = neighborSoln[:] workingVar = varcopy tabu_list.appendleft((neighbor,old_membership,region)) if swap_flag == False: failures += 1 with lockSoln and lockflag: sharedupdate[0][core_soln_column] = 0 sharedSoln[:,core_soln_column] if workingSoln[0] < sharedSoln[:,core_soln_column][0]: sharedSoln[:,core_soln_column] = workingSoln[:] sharedupdate[0][core_soln_column] = 1 #Set the update flag to true if not workingSoln[0] < sharedSoln[0].any(): results = set_half_to_best(len(sharedSoln[0])) with lockVar: sharedVar[:,core_soln_column] = workingVar for result in results: sharedVar[:,result] = workingVar #Increment the core iteration counter sharedupdate[1][core] += 1
def mantel_pval(X, Y, kX="Euclidian", kY="Delta", sigmaX=None, sigmaY=None,
                N_samp=500, random_seed=None, return_boots=False):
    """
    Calculates the Mantel test
    A faster method for calculating the p-values

    X: Array of locations
    Y: Array of observations
    kX: method for calculating the X matrix
    kY: method for calculating the Y matrix
    sigmaX: param for the Gaussian kernel
    sigmaY: param for the Gaussian kernel
    N_samp: Number of samples for bootstrap
    random_seed: for calculating p values
    return_boots: return bootstrap values?
    """
    prng = RandomState(random_seed)

    # Estimate Gaussian kernel widths if needed
    if not sigmaX and kX == "Gaussian":
        sigmaX = HSIC.getSigmaGaussian(X, X, 200)
    if not sigmaY and kY == "Gaussian":
        sigmaY = HSIC.getSigmaGaussian(Y, Y, 200)

    # Calculate two distance matrices
    A = matrix_mantel(X, kX, sigmaX)
    B = matrix_mantel(Y, kY, sigmaY)

    # Just pearson correlation
    A_minus_mean = A - A.mean()
    B_mean = B.mean()
    B_minus_mean = B - B_mean
    lright = math.sqrt((A_minus_mean**2).sum())
    lleft = math.sqrt(((B_minus_mean)**2).sum())
    top = A_minus_mean.dot(B_minus_mean)
    mantel_val = top / (lright * lleft)

    # Calculating the p-value
    pval = 1.0
    Yrand = np.copy(Y)
    boots = []
    for i in xrange(N_samp):
        prng.shuffle(Yrand)
        B_tmp = matrix_mantel(Yrand, kY, sigmaY)
        if return_boots:
            boots.append(A_minus_mean.dot(B_tmp - B_mean))
        if A_minus_mean.dot(B_tmp - B_mean) >= top:  # can use B_mean instead of B_tmp.mean()
            pval += 1
    pval /= N_samp + 1

    if return_boots:
        return mantel_val, pval, boots
    else:
        return mantel_val, pval
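# Hedged usage sketch (not part of the original source): assumes the
# matrix_mantel() helper and HSIC module referenced above are importable, and
# that the default "Euclidian"/"Delta" kernel names accept toy X and Y arrays.
X_demo = np.random.RandomState(0).rand(30, 2)
Y_demo = np.random.RandomState(1).randint(0, 2, size=30)
mantel_val_demo, pval_demo = mantel_pval(X_demo, Y_demo, N_samp=100, random_seed=0)
# mantel_val_demo is the Mantel correlation; pval_demo its permutation p-value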
def _iter_slow(self, batch_size=128, start=None, end=None, shuffle=True, seed=None, mode=0): # ====== Set random seed ====== # all_ds = self._data[:] prng1 = None prng2 = _dummy_shuffle if shuffle: if seed is None: seed = get_random_magic_seed() prng1 = RandomState(seed) prng2 = RandomState(seed) all_size = [i.shape[0] for i in all_ds] n_dataset = len(all_ds) # ====== Calculate batch_size ====== # if mode == 1: # equal s = sum(all_size) all_batch_size = [int(round(batch_size * i / s)) for i in all_size] for i in xrange(len(all_batch_size)): if all_batch_size[i] == 0: all_batch_size[i] += 1 if sum(all_batch_size) > batch_size: # 0.5% -> round up, too much for i in xrange(len(all_batch_size)): if all_batch_size[i] > 1: all_batch_size[i] -= 1 break all_upsample = [None] * len(all_size) elif mode == 2 or mode == 3: # upsampling and downsampling maxsize = int(max(all_size)) if mode == 2 else int(min(all_size)) all_batch_size = [int(batch_size / n_dataset) for i in xrange(n_dataset)] for i in xrange(batch_size - sum(all_batch_size)): # not enough all_batch_size[i] += 1 all_upsample = [maxsize for i in xrange(n_dataset)] else: # sequential all_batch_size = [batch_size] all_upsample = [None] all_size = [sum(all_size)] # ====== Create all block and batches ====== # # [ ((idx1, batch1), (idx2, batch2), ...), # batch 1 # ((idx1, batch1), (idx2, batch2), ...), # batch 2 # ... ] all_block_batch = [] # contain [block_batches1, block_batches2, ...] tmp_block_batch = [] for n, batchsize, upsample in zip(all_size, all_batch_size, all_upsample): tmp_block_batch.append( create_batch(n, batchsize, start, end, prng1, upsample)) # ====== Distribute block and batches ====== # if mode == 1 or mode == 2 or mode == 3: for i in zip_longest(*tmp_block_batch): all_block_batch.append([(k, v) for k, v in enumerate(i) if v is not None]) else: all_size = [i.shape[0] for i in all_ds] all_idx = [] for i, j in enumerate(all_size): all_idx += [(i, k) for k in xrange(j)] # (ds_idx, index) all_idx = [all_idx[i[0]:i[1]] for i in tmp_block_batch[0]] # complex algorithm to connecting the batch with different dataset for i in all_idx: tmp = [] idx = i[0][0] # i[0][0]: ds_index start = i[0][1] # i[0][1]: index end = start for j in i[1:]: # detect change in index if idx != j[0]: tmp.append((idx, (start, end + 1))) idx = j[0] start = j[1] end = j[1] tmp.append((idx, (start, end + 1))) all_block_batch.append(tmp) prng2.shuffle(all_block_batch) # print if you want debug # for _ in all_block_batch: # for i, j in _: # print('ds:', i, ' batch:', j) # print('===== End =====') # ====== return iteration ====== # for _ in all_block_batch: # each _ is a block batches = np.concatenate( [all_ds[i][j[0]:j[1]] for i, j in _], axis=0) batches = batches[prng2.permutation(batches.shape[0])] yield self._normalizer(batches)
def raw_noise_2d(positions, pType):
    """2D Raw Simplex noise."""
    seq = numpy.arange(0, 256, dtype=int)
    prng = RandomState(pType)
    prng.shuffle(seq)
    perm = numpy.zeros(512, dtype=int)
    perm[0:256] = seq
    perm[256::] = seq
    grad3 = numpy.array([[1, 1, 0], [-1, 1, 0], [1, -1, 0], [-1, -1, 0],
                         [1, 0, 1], [-1, 0, 1], [1, 0, -1], [-1, 0, -1],
                         [0, 1, 1], [0, -1, 1], [0, 1, -1], [0, -1, -1]])
    nValues = positions.shape[0]

    # Noise contributions from the three corners
    corners = numpy.zeros((3, 2, nValues))

    # Skew the input space to determine which simplex cell we're in
    F2 = 0.5 * (math.sqrt(3.0) - 1.0)  # Hairy skew factor for 2D
    s = numpy.sum(positions, axis=1) * F2
    ij = (positions.T + s).astype(int)

    G2 = (3.0 - math.sqrt(3.0)) / 6.0
    t0 = numpy.sum(ij, axis=0) * G2
    # Unskew the cell origin back to (x,y) space
    XY = ij - t0
    # The x,y distances from the cell origin
    corners[0, :, :] = positions.T - XY

    i1 = corners[0, 0, :] > corners[0, 1, :]
    j1 = ~i1

    # A step of (1,0) in (i,j) means a step of (1-c,-c) in (x,y), and
    # a step of (0,1) in (i,j) means a step of (-c,1-c) in (x,y), where
    corners[1, 0, :] = corners[0, 0, :] - i1 + G2  # Offsets for middle corner in (x,y) unskewed coords
    corners[1, 1, :] = corners[0, 1, :] - j1 + G2
    corners[2, :, :] = corners[0, :, :] - 1.0 + 2.0 * G2  # Offsets for last corner in (x,y) unskewed coords

    # Work out the hashed gradient indices of the three simplex corners
    ij &= 255
    gi = numpy.zeros((3, nValues), dtype=int)
    gi[0, :] = perm[ij[0, :] + perm[ij[1, :]]] % 12
    gi[1, :] = perm[ij[0, :] + i1 + perm[ij[1, :] + j1]] % 12
    gi[2, :] = perm[ij[0, :] + 1 + perm[ij[1, :] + 1]] % 12

    n = numpy.zeros((3, nValues), dtype=float)

    # Calculate the contribution from the three corners
    temp = corners * corners
    t = .5 - temp[:, 0, :] - temp[:, 1, :]
    m = t >= 0
    t *= t
    t *= t
    grad = grad3[gi]
    # print t
    n[0, m[0, :]] = t[0, m[0, :]] * dot2d(grad[0, m[0, :], :], corners[0, :, m[0, :]])
    n[1, m[1, :]] = t[1, m[1, :]] * dot2d(grad[1, m[1, :], :], corners[1, :, m[1, :]])
    n[2, m[2, :]] = t[2, m[2, :]] * dot2d(grad[2, m[2, :], :], corners[2, :, m[2, :]])

    # Add contributions from each corner to get the final noise value.
    # The result is scaled to return values in the interval [-1,1].
    result = (70.0 * numpy.sum(n, axis=0))
    return (1 + result) * .5
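# Hedged usage sketch (not part of the original source): assumes the dot2d()
# helper used above is available; evaluates the noise at a few made-up 2-D points.
pts_demo = numpy.array([[0.1, 0.3], [1.7, 2.2], [5.5, 0.9]])
noise_demo = raw_noise_2d(pts_demo, pType=7)
# noise_demo lies in [0, 1]: the raw simplex output rescaled from [-1, 1]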
nblists = info.get_supported_nblist() access = info.get_supported_access() print "Supported elements:", elements print "Supported neighborlist methods:", nblists print "Supported access methods:", access if len(access) > 1: access = ['loca', 'iter'] else: access = [None] if len(elements) == 1: main = elements[0] other = None state = reference_states[atomic_numbers[main]] else: elements = list(elements) rnd.shuffle(elements) for i in range(len(elements)): main = elements[i] other = elements[i-1] state = reference_states[atomic_numbers[main]] if state['symmetry'] in known_states: break if state['symmetry'] not in known_states: print "Cannot simulate %s, reference state '%s' not supported" % (main, state['symmetry']) print "SKIPPING MODEL!" continue init_atoms = bulk(main, orthorhombic=True).repeat((7,7,7)) r = init_atoms.get_positions() r += rnd.normal(0.0, 0.1, r.shape) init_atoms.set_positions(r)
def train(dim_word=256, # word vector dimensionality ctx_dim=512, # context vector dimensionality dim=256, # the number of LSTM_Old units margin=0.2, # margin for pairwise ranking loss. Should be (0,1] if use_norm is on use_norm=True, # whether to L2norm vectors prior to loss use_ctx_mean=False, # whether to initialze decoder to annotation means use_last=False, #Only use last hidden state for ranking n_layers_att=1, n_layers_init=1, # This isn't useful if use_ctx_mean=False patience=10, max_epochs=5000, dispFreq=1, decay_c=0., alpha_c=1., lrate=0.01, selector=True, n_words=23461, maxlen=100, # maximum length of the description optimizer='adam', batch_size = 64, valid_batch_size = 128, saveto='/ais/gobi3/u/rkiros/flickr8k/rank_models/lstm_toy.npz', validFreq=200, total_queries=5000, # total number of queries n_queries=50, # number of queries to validate on, resampled each time saveFreq=200, # save the parameters after every saveFreq updates sampleFreq=200, # generate some samples after every sampleFreq updates dataset='flickr8k', dictionary=None, # word dictionary use_dropout=False, use_dropout_lstm=False, reload_=False): # Model options print alpha_c model_options = locals().copy() model_options = validate_options(model_options) # reload options if reload_ and os.path.exists(saveto): print "Reloading options" with open('%s.pkl'%saveto, 'rb') as f: models_options = pkl.load(f) print 'Loading data' load_data, prepare_data = get_dataset(dataset) train, valid, test, worddict = load_data() # Invert the dictionary, add special tokens word_idict = dict() for kk, vv in worddict.iteritems(): word_idict[vv] = kk word_idict[0] = '<eos>' word_idict[1] = 'UNK' print 'Building model' params = init_params(model_options) # reload parameters if reload_ and os.path.exists(saveto): print "Reloading model" params = load_params(saveto, params) tparams = init_tparams(params) trng, use_noise, \ inps, alphas, \ alphas_contrast, cost, \ opts_out = \ build_model(tparams, model_options) print 'Building ranker' trng_r, use_noise_r, \ inps_r, alphas_r, \ scores, opts_out_r = \ build_ranker(tparams, model_options) # before any regularizer print 'Building functions' f_log_probs = theano.function(inps, -cost, profile=False) f_ranker = theano.function(inps_r, scores, profile=False) print 'Regularization' cost = cost.mean() if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. 
for kk, vv in tparams.iteritems(): weight_decay += (vv ** 2).sum() weight_decay *= decay_c cost += weight_decay if alpha_c > 0.: alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c') alpha_reg = alpha_c * ((1.-alphas.sum(0))**2).sum(0).mean() alpha_reg_contrast = alpha_c * ((1.-alphas_contrast.sum(0))**2).sum(0).mean() cost += alpha_reg cost += alpha_reg_contrast # gradient computation print 'Computing gradients' grads = tensor.grad(cost, wrt=itemlist(tparams)) lr = tensor.scalar(name='lr') f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) print 'Optimization' train_iter = HomogeneousData(train, batch_size=batch_size, maxlen=maxlen) if valid: kf_valid = KFold(len(valid[1]), n_folds=len(valid[1])/valid_batch_size, shuffle=False) if test: kf_test = KFold(len(test[1]), n_folds=len(test[1])/valid_batch_size, shuffle=False) history_errs = [] # reload history if reload_ and os.path.exists(saveto): history_errs = numpy.load(saveto)['history_errs'].tolist() best_p = None bad_count = 0 if validFreq == -1: validFreq = len(train[0])/batch_size if saveFreq == -1: saveFreq = len(train[0])/batch_size uidx = 0 estop = False for eidx in xrange(max_epochs): n_samples = 0 print 'Epoch ', eidx for caps in train_iter: n_samples += len(caps) uidx += 1 use_noise.set_value(1.) pd_start = time.time() x, mask, ctx = prepare_data(caps, train[1], worddict, maxlen=maxlen, n_words=n_words) pd_duration = time.time() - pd_start # Get some contrastive images prng = RandomState(eidx + n_samples) inds = numpy.arange(len(train[1])) prng.shuffle(inds) contrast_ctx = numpy.zeros((len(caps), train[1][0].shape[1])).astype('float32') for cidx in range(len(caps)): contrast_ctx[cidx,:] = numpy.array(train[1][inds[cidx]].todense()) contrast_ctx = contrast_ctx.reshape([contrast_ctx.shape[0], 14*14, 512]) if x == None: print 'Minibatch with zero sample under length ', maxlen continue ud_start = time.time() cost = f_grad_shared(x, mask, ctx, contrast_ctx) f_update(lrate) ud_duration = time.time() - ud_start if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected' return 1., 1., 1. if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'PD ', pd_duration, 'UD ', ud_duration if numpy.mod(uidx, saveFreq) == 0: print 'Saving...', if best_p != None: params = copy.copy(best_p) else: params = unzip(tparams) numpy.savez(saveto, history_errs=history_errs, **params) pkl.dump(model_options, open('%s.pkl'%saveto, 'wb')) print 'Done' if numpy.mod(uidx, validFreq) == 0: use_noise.set_value(0.) train_err = 0 valid_err = 0 test_err = 0 if valid: queries = numpy.arange(total_queries) prng.shuffle(queries) (r1, r5, r10, r25, r50, r100, medr) = recallK(f_ranker, model_options, worddict, prepare_data, valid, kf_valid, queries[:n_queries], verbose=False) print "Recall@(1,5,10,25,50,100): %.1f, %.1f, %.1f, %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, r25, r50, r100, medr) #TODO: Not sure if this is the best choice, maybe explore alternatives valid_err = medr history_errs.append([valid_err, 1e20]) # Use the median rank to decide when to stop if uidx == 0 or valid_err <= numpy.array(history_errs)[:,0].min(): best_p = unzip(tparams) bad_counter = 0 if eidx > patience and valid_err >= numpy.array(history_errs)[:,0].min(): bad_counter += 1 if bad_counter > patience: print 'Early Stop!' estop = True break print 'Seen %d samples'%n_samples if estop: break if best_p is not None: zipp(best_p, tparams) use_noise.set_value(0.) 
train_err = 0 valid_err = 0 test_err = 0 queries = numpy.arange(total_queries) prng.shuffle(queries) (r1, r5, r10, r25, r50, r100, medr) = recallK(f_ranker, model_options, worddict, prepare_data, valid, kf_valid, queries[:n_queries], verbose=False) print "Recall@(1,5,10,25,50,100): %.1f, %.1f, %.1f, %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, r25, r50, r100, medr) valid_err = medr params = copy.copy(best_p) numpy.savez(saveto, zipped_params=best_p, train_err=train_err, valid_err=valid_err, test_err=test_err, history_errs=history_errs, **params)
def trainer(train, dev, # training and development tuples dim=1000, # embedding dimensionality dim_im=4096, # image dimensionality dim_s=4800, # sentence dimensionality margin=0.2, # margin for pairwise ranking ncon=50, # number of contrastive terms max_epochs=15, lrate=0.01, # not needed with Adam dispFreq=10, optimizer='adam', batch_size = 100, valid_batch_size = 100, saveto='/ais/gobi3/u/rkiros/ssg/models/cocorank1000_combine.npz', validFreq=500, saveFreq=500, reload_=False): # Model options model_options = {} model_options['dim'] = dim model_options['dim_im'] = dim_im model_options['dim_s'] = dim_s model_options['margin'] = margin model_options['ncon'] = ncon model_options['max_epochs'] = max_epochs model_options['lrate'] = lrate model_options['dispFreq'] = dispFreq model_options['optimizer'] = optimizer model_options['batch_size'] = batch_size model_options['valid_batch_size'] = valid_batch_size model_options['saveto'] = saveto model_options['validFreq'] = validFreq model_options['saveFreq'] = saveFreq model_options['reload_'] = reload_ model_options = validate_options(model_options) print model_options # reload options if reload_ and os.path.exists(saveto): print "Reloading options" with open('%s.pkl'%saveto, 'rb') as f: model_options = pkl.load(f) print 'Building model' params = init_params(model_options) # reload parameters if reload_ and os.path.exists(saveto): print "Reloading model" params = load_params(saveto, params) tparams = init_tparams(params) inps, cost = build_model(tparams, model_options) print 'Building encoder' inps_e, lim, ls = build_encoder(tparams, model_options) print 'Building functions' f_cost = theano.function(inps, -cost, profile=False) f_emb = theano.function(inps_e, [lim, ls], profile=False) # gradient computation print 'Computing gradients' grads = tensor.grad(cost, wrt=itemlist(tparams)) lr = tensor.scalar(name='lr') f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) print 'Optimization' uidx = 0 estop = False start = 1234 seed = 1234 inds = numpy.arange(len(train[0])) numbatches = len(inds) / batch_size curr = 0 counter = 0 target=None history_errs = [] # Main loop for eidx in range(max_epochs): tic = time.time() prng = RandomState(seed - eidx - 1) prng.shuffle(inds) for minibatch in range(numbatches): uidx += 1 conprng_im = RandomState(seed + uidx + 1) conprng_s = RandomState(2*seed + uidx + 1) im = train[1][inds[minibatch::numbatches]] s = train[2][inds[minibatch::numbatches]] cinds_im = conprng_im.random_integers(low=0, high=len(train[0])-1, size=ncon * len(im)) cinds_s = conprng_s.random_integers(low=0, high=len(train[0])-1, size=ncon * len(s)) cim = train[1][cinds_im] cs = train[2][cinds_s] ud_start = time.time() cost = f_grad_shared(im, s, cim, cs) f_update(lrate) ud_duration = time.time() - ud_start if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud_duration if numpy.mod(uidx, validFreq) == 0: print 'Computing ranks...' lim, ls = f_emb(dev[1], dev[2]) (r1, r5, r10, medr) = i2t(lim, ls) print "Image to text: %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr) (r1i, r5i, r10i, medri) = t2i(lim, ls) print "Text to image: %.1f, %.1f, %.1f, %.1f" % (r1i, r5i, r10i, medri) currscore = r1 + r5 + r10 + r1i + r5i + r10i if currscore > curr: curr = currscore # Save model print 'Saving...', params = unzip(tparams) numpy.savez(saveto, history_errs=history_errs, **params) pkl.dump(model_options, open('%s.pkl'%saveto, 'wb')) print 'Done'
def train(self, X, indX, XY, V, indV, VY, IM, count_dict, word_dict, embed_map): """ Trains the LBL """ self.start = self.seed self.init_params(embed_map, count_dict, XY) inds = np.arange(len(X)) numbatches = len(inds) / self.batchsize curr = 1e20 counter = 0 target=None num = 15000 x = T.matrix('x', dtype='int32') y = T.matrix('y') im = T.matrix('im') lr = T.scalar('lr') mom = T.scalar('mom') (words, acts, IF, preds) = self.forward(x, im) obj_T = self.compute_obj(x, im, y) compute_obj_T = theano.function([x, im, y], obj_T) train_batch = theano.function([x, im, y, lr, mom], obj_T, updates=self.update_params(obj_T, x, lr, mom), on_unused_input='warn') log_file = open("train_valid_err.txt", 'w') # Main loop stop.display_phase(1) for epoch in range(self.maxepoch): self.epoch = epoch tic = time.time() prng = RandomState(self.seed + epoch + 1) prng.shuffle(inds) obj = 0.0 for minibatch in range(numbatches): batchX = X[inds[minibatch::numbatches]].astype(np.int32) batchY = XY[inds[minibatch::numbatches]].toarray().astype(theano.config.floatX) batchindX = indX[inds[minibatch::numbatches]].astype(np.int32).flatten() batchIm = IM[batchindX].astype(theano.config.floatX) obj += train_batch(batchX, batchIm, batchY, self.eta_t, self.p_t) self.update_hyperparams() toc = time.time() # Results and stopping criteria obj_val = compute_obj_T(V[:num].astype(np.int32), IM[indV[:num].astype(int).flatten()].astype(theano.config.floatX), VY[:num].toarray().astype(theano.config.floatX)) log_file.write('{} {}\n'.format(obj, obj_val)) if self.verbose > 0: stop.display_results(epoch, toc-tic, obj, obj_val) (curr, counter) = stop.update_result(curr, obj_val, counter) if counter == 0: stop.save_model_theano(self, self.loc) stopping_target = obj if stop.criteria_complete(self, epoch, curr, obj, counter, self.k, obj_val, target): if self.criteria == 'maxepoch': break elif self.criteria == 'validation_pp': stop.load_model_theano(self, self.loc) counter = 0 X = np.r_[X, V] XY = vstack([XY, VY]).tocsr() indX = np.r_[indX, indV] self.criteria = 'll_train_heldout' target = stopping_target #obj stop.display_phase(2) inds = range(X.shape[0]) prng.shuffle(inds) numbatches = len(inds) / self.batchsize elif self.criteria == 'll_train_heldout': break log_file.close()
class CntWindowTrialIterator(object):
    """Cut out windows for several predictions from a continuous dataset
    with a trial marker y signal.

    Parameters
    ----------

    Returns
    -------
    """
    def __init__(self, batch_size, input_time_length, n_sample_preds,
                 check_preds_smaller_trial_len=True):
        self.batch_size = batch_size
        self.input_time_length = input_time_length
        self.n_sample_preds = n_sample_preds
        self.check_preds_smaller_trial_len = check_preds_smaller_trial_len
        self.rng = RandomState(328774)

    def reset_rng(self):
        self.rng = RandomState(328774)

    def get_batches(self, dataset, shuffle):
        i_trial_starts, i_trial_ends = compute_trial_start_end_samples(
            dataset.y, check_trial_lengths_equal=False,
            input_time_length=self.input_time_length)
        if self.check_preds_smaller_trial_len:
            self.check_trial_bounds(i_trial_starts, i_trial_ends)
        start_end_blocks_per_trial = self.compute_start_end_block_inds(
            i_trial_starts, i_trial_ends)
        topo = dataset.get_topological_view()
        y = dataset.y
        return self.yield_block_batches(topo, y, start_end_blocks_per_trial,
                                        shuffle=shuffle)

    def check_trial_bounds(self, i_trial_starts, i_trial_ends):
        for start, end in zip(i_trial_starts, i_trial_ends):
            assert end - start + 1 >= self.n_sample_preds, (
                "Trial should be longer or equal than number of sample preds, "
                "Trial length: {:d}, sample preds {:d}...".format(
                    end - start + 1, self.n_sample_preds))

    def compute_start_end_block_inds(self, i_trial_starts, i_trial_ends):
        # create start stop indices for all batches still 2d trial -> start stop
        start_end_blocks_per_trial = []
        for i_trial in xrange(len(i_trial_starts)):
            trial_start = i_trial_starts[i_trial]
            trial_end = i_trial_ends[i_trial]
            start_end_blocks = get_start_end_blocks_for_trial(
                trial_start, trial_end, self.input_time_length,
                self.n_sample_preds)
            if self.check_preds_smaller_trial_len:
                # check that block is correct, all predicted samples should be the trial samples
                all_predicted_samples = [
                    range(start_end[1] - self.n_sample_preds + 1, start_end[1] + 1)
                    for start_end in start_end_blocks]
                # this check takes about 50 ms in performance test
                # whereas loop itself takes only 5 ms.. deactivate it if not necessary
                assert np.array_equal(
                    range(i_trial_starts[i_trial], i_trial_ends[i_trial] + 1),
                    np.unique(np.concatenate(all_predicted_samples)))
            start_end_blocks_per_trial.append(start_end_blocks)
        return start_end_blocks_per_trial

    def yield_block_batches(self, topo, y, start_end_blocks_per_trial, shuffle):
        start_end_blocks_flat = np.concatenate(start_end_blocks_per_trial)
        if shuffle:
            self.rng.shuffle(start_end_blocks_flat)
        for i_block in xrange(0, len(start_end_blocks_flat), self.batch_size):
            i_block_stop = min(i_block + self.batch_size,
                               len(start_end_blocks_flat))
            start_end_blocks = start_end_blocks_flat[i_block:i_block_stop]
            batch = create_batch(topo, y, start_end_blocks, self.n_sample_preds)
            yield batch
def tabu_search(core, z, neighbordict,numP,w,floor_variable,lockSoln, lockflag, maxfailures=15,maxiterations=10): ##Pseudo constants pid = mp.current_process()._identity[0] tabu_list = deque(maxlen=sharedupdate[2][core])#What is this core's tabu list length? maxiterations *= cores #Test synchronize cores to exit at the same time. maxfailures += int(maxfailures*uniform(-1.1, 1.2))#James et. al 2007 def _tabu_check(tabu_list, neighbor, region, old_membership): if tabu_list:#If we have a deque with contents for tabu_region in(tabu_list): if neighbor == tabu_region[0]: #print neighbor, tabu_region[0] if region == tabu_region[1] and old_membership == tabu_region[2]: return False def _diversify_soln(core_soln_column, neighbordict,z, floor_variable,lockSoln): ''' The goal of this function is to diversify a soln that is not improving. What about the possability of diversifying a good answer away from 'the soln?' I do no think that that should be an issue - we make a randomized greedy swap. Is one enough? Rationale: This is a randomized Greedy swap (GRASP), where we store the best n permutations and then randomly select the one we will use. Originally in Li et. al (in press). We need to test different values of n to see what the impact is. ''' #print "Diversifying: ", sharedSoln[0] #Initialize a local swap space to store n best diversified soln - these do not need to be better div_soln_space = np.ndarray(sharedSoln.shape) div_soln_space[:] = float("inf") workingcopy = np.copy(sharedSoln[0:,core_soln_column]) #Iterate through the regions and check all moves, store the 4 best. for region in np.unique(workingcopy[1:]): members = np.where(workingcopy == region)[0] neighbors = [] for member in members: candidates = neighbordict[member-1]#neighbordict is 0 based, member is 1 based candidates = [candidate for candidate in candidates if candidate not in members] candidates = [candidate for candidate in candidates if candidate not in neighbors] neighbors.extend(candidates) candidates = [] #Iterate through the neighbors for neighbor in neighbors: neighborcopy = np.copy(workingcopy[:]) #Pull a copy of the local working version old_membership = neighborcopy[neighbor]#Track where we started to check_floor neighborcopy[neighbor] = region #Move the neighbor into the new region in the copy #Here we start to check the swap and see if it is better swap_var = objective_function_vec(neighborcopy[1:],z)#Variance of the new swap if not swap_var < div_soln_space[0].any(): block = np.where(workingcopy[1:] == neighbor)[0]#A list of the members in a region. block=block.tolist() #For current contiguity check if check_contiguity(neighbordict, block, neighbor):#Check contiguity if check_floor(np.where(neighborcopy[1:,]==region)[0], floor_variable, w) and check_floor(np.where(neighborcopy[1:,]==old_membership)[0],floor_variable,w): neighborcopy[0] = swap_var if not np.isinf(div_soln_space[0].any()): div_soln_space[:,np.argmax(div_soln_space[0])] = neighborcopy[:] else: div_soln_space[:,np.argmin(div_soln_space[0])] = neighborcopy[:] else: del neighborcopy #print "Swap failed due to floor_check." else: del neighborcopy #print "Swap failed due to contiguity." #It is possible that the perturbation will not generate enough soln to fill the space, # so we need to ignore those columns with variance = infinity. #Write one of the neighbor perturbations to the shared memory space to work on. 
valid = np.where(div_soln_space[0] != np.inf)[0] try: selection = randint(0,len(valid)-1) with lockSoln: sharedSoln[:,core_soln_column] = div_soln_space[:,selection] print "Diversified to:", sharedSoln[0] except: print div_soln_space[0] print "Attempt to diversify failed." ##This shows that we are operating asynchronously. #if core ==2: #time.sleep(5) while sum(sharedupdate[1]) < maxiterations: core_soln_column = (core + sharedupdate[1][core])%len(sharedupdate[1]) #This iterates the cores around the search space. #Check for diversification here and diversify if necessary... if sharedupdate[0][core_soln_column] == False: _diversify_soln(core_soln_column, neighbordict,z, floor_variable,lockSoln) #Li, et. al (in press - P-Compact_Regions) #print "ProcessID %i is processing soln column %i in iteration %i."%(pid, core_soln_column,sharedupdate[1][core]) #Uncomment to see that cores move around the search space failures = 0 #The total iteration counter #What are the current best solutions local to this core? local_best_variance = sharedSoln[:,core_soln_column][0] workingSoln = np.copy(sharedSoln[:,core_soln_column]) while failures <= maxfailures:#How many total iterations can the core make #Select a random starting point in the search space. nr = np.unique(workingSoln[1:]) #This is 0 based, ie. region 0 - region 31 regionIDs = nr changed_regions = np.ones(len(nr)) randstate = RandomState(pid) #To 'unsync' the cores we need to instantiate a random class with a unique seed. randstate.shuffle(regionIDs) #shuffle the regions so we start with a random region changed_regions[:] = 0 swap_flag = False #Flag to stop looping prior to max iterations if we are not improving. #Iterate through the regions, checking potential swaps for region in regionIDs: members = np.where(workingSoln == region)[0] #get the members of the region #print region, members #Get the neighbors to the members. Grab only those that could change. neighbors = [] for member in members: candidates = neighbordict[member-1]#neighbordict is 0 based, member is 1 based candidates = [candidate for candidate in candidates if candidate not in members] candidates = [candidate for candidate in candidates if candidate not in neighbors] neighbors.extend(candidates) candidates = [] #Iterate through the neighbors for neighbor in neighbors: neighborSoln = np.copy(workingSoln[:]) #Pull a copy of the local working version old_membership = neighborSoln[neighbor]#Track where we started to check_floor tabu_move_check =_tabu_check(tabu_list, neighbor, region, old_membership) if tabu_move_check is not None: break neighborSoln[neighbor] = region #Move the neighbor into the new region in the copy #Here we start to check the swap and see if it is better swap_var = objective_function_vec(neighborSoln[1:],z)#Variance of the new swap if swap_var <= local_best_variance: block = np.where(workingSoln[1:] == neighbor)[0]#A list of the members in a region. block=block.tolist() #For current contiguity check if check_contiguity(neighbordict, block, neighbor):#Check contiguity if check_floor(np.where(neighborSoln[1:,]==region)[0], floor_variable, w) and check_floor(np.where(neighborSoln[1:,]==old_membership)[0],floor_variable,w):#What about the floor of the region loosing the member in the original code? #print "Swap made on core %i. Objective function improved from %f to %f." %(pid, swap_var, local_best_variance) local_best_variance = swap_var#Set the new local best to the swap. We have made a swap that betters the objective function. 
neighborSoln[0] = swap_var workingSoln[:] = neighborSoln[:] swap_flag = True #We made a swap tabu_list.appendleft((neighbor,old_membership,region))#tuple(polygon_id, oldgroup,newgroup) else: del neighborSoln #print "Swap failed due to floor_check." else: del neighborSoln #print "Swap failed due to contiguity." if swap_flag == False: #print "Failed to make any swap, incrementing the fail counter." failures += 1 #print workingSoln, len(np.unique(workingSoln[1:])) with lockflag: sharedupdate[0][core_soln_column] = 0 #Set the update flag to false #print "Locking update flag to set to false" with lockSoln:#The lock is released at the end of the with statement sharedSoln[:,core_soln_column]#Lock the column of the shared soln we are using. if workingSoln[0] < sharedSoln[:,core_soln_column][0]: sharedSoln[:,core_soln_column] sharedSoln[:,core_soln_column] = workingSoln #print "Better soln loaded into sharedSoln: %f." %(workingSoln[0]) sharedupdate[0][core_soln_column] = 1 #Set the update flag to true if not workingSoln[0] < sharedSoln[0].any(): set_half_to_best(len(sharedSoln[0])) #print "Setting half the soln to new global best. ", sharedSoln[0] #Increment the core iteration counter sharedupdate[1][core] += 1
def trainer(z, split=3500, pre_train=True):
    """ Trainer function for a MLBLF model """
    # Unpack some stuff
    ngrams = z['ngrams']
    labels = z['labels']
    instances = z['instances']
    word_dict = z['word_dict']
    index_dict = z['index_dict']
    context = z['context']
    vocabsize = len(z['word_dict'])
    im = z['IM']
    index = z['index']

    # Load word embeddings
    if pre_train:
        embed_map = lm_tools.load_embeddings()
    else:
        embed_map = None

    # Initialize the network
    net = mlblf.MLBLF(name='mlblf',
                      loc='models/mlblf.pkl',    # where to store the model file
                      seed=1234,                 # used for initializing the model parameters
                      criteria='validation_pp',  # the stopping criteria
                      k=5,                       # the window size used for validation
                      V=vocabsize,               # the size of the vocabulary
                      K=50,                      # the dim of the word representations
                      D=im.shape[1],             # the dim of the image features
                      h=256,                     # dim of an intermediate layer on the image channel
                      factors=50,                # number of factors
                      context=context,
                      batchsize=20,
                      maxepoch=100,
                      eta_t=0.02,
                      gamma_r=1e-4,
                      gamma_c=1e-5,
                      f=0.998,
                      p_i=0.5,
                      p_f=0.9,
                      T=20.0,
                      verbose=1)

    # Break up the data for training and validation
    inds = np.arange(len(ngrams))
    prng = RandomState(net.seed)
    prng.shuffle(inds)
    ngramsV = [ngrams[i] for i in inds[-split:]]
    flat_ngramsV = [item for sublist in ngramsV for item in sublist]
    instance_split = len(flat_ngramsV)

    inds = np.arange(len(instances))
    prng = RandomState(net.seed)
    prng.shuffle(inds)
    X = instances[inds[:-instance_split]]
    V = instances[inds[-instance_split:]]
    Y = labels[inds[:-instance_split]]
    VY = labels[inds[-instance_split:]]
    indX = index[inds[:-instance_split]]
    indV = index[inds[-instance_split:]]

    # Train the network
    net.train(X, indX, Y, V, indV, VY, im, index_dict, word_dict, embed_map)
def trainer(train, valid, test, n_chars=33, img_w=128, max_len=27, feature_maps=100, filter_hs=[2, 3, 4], max_epochs=20, gamma=10, ncon=100, lrate=0.0002, batch_size=100, dispFreq=10, validFreq=10, saveto='example.npz'): """ train, valid, test : datasets n_chars : vocabulary size img_w : character embedding dimension. max_len : the maximum length of a sentence feature_maps : the number of feature maps we used filter_hs: the filter window sizes we used max_epochs : The maximum number of epoch to run gamma: hyper-parameter using in ranking ncon: the number of negative samples we used for each postive sample lrate : learning rate batch_size : batch size during training dispFreq : Display to stdout the training progress every N updates validFreq : Compute the validation rank score after this number of update. saveto: where to save the result. """ global ctr img_h = max_len + 2 * (filter_hs[-1] - 1) model_options = {} model_options['n_chars'] = n_chars model_options['img_w'] = img_w model_options['img_h'] = img_h model_options['feature_maps'] = feature_maps model_options['filter_hs'] = filter_hs model_options['max_epochs'] = max_epochs model_options['gamma'] = gamma model_options['ncon'] = ncon model_options['lrate'] = lrate model_options['batch_size'] = batch_size model_options['dispFreq'] = dispFreq model_options['validFreq'] = validFreq model_options['saveto'] = saveto logger.info('Model options {}'.format(model_options)) logger.info('Building model...') filter_w = img_w filter_shapes = [] pool_sizes = [] for filter_h in filter_hs: filter_shapes.append((feature_maps, 1, filter_h, filter_w)) pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1)) model_options['filter_shapes'] = filter_shapes model_options['pool_sizes'] = pool_sizes params = init_params(model_options) tparams = init_tparams(params) use_noise, inps, cost = build_model(tparams, model_options) logger.info('Building encoder...') inps_e, feat_x, feat_y = build_encoder(tparams, model_options) logger.info('Building functions...') f_emb = theano.function(inps_e, [feat_x, feat_y], name='f_emb') lr = tensor.scalar(name='lr') f_grad_shared, f_update = Adam(tparams, cost, inps, lr) logger.info('Training model...') uidx = 0 seed = 1234 curr = 0 history_errs = [] valid_x = prepare_data(valid[0], max_len, n_chars, filter_hs[-1]) valid_y = prepare_data(valid[1], max_len, n_chars, filter_hs[-1]) test_x = prepare_data(test[0], max_len, n_chars, filter_hs[-1]) test_y = prepare_data(test[1], max_len, n_chars, filter_hs[-1]) zero_vec_tensor = tensor.vector() zero_vec = np.zeros(img_w).astype(theano.config.floatX) set_zero = theano.function([zero_vec_tensor], updates=[(tparams['Wemb'], tensor.set_subtensor( tparams['Wemb'][n_chars - 1, :], zero_vec_tensor))]) # Main loop for eidx in range(max_epochs): print("epoch {} ".format(eidx)) prng = RandomState(seed - eidx - 1) trainA = train[0] trainB = train[1] num_samples = len(trainA) inds = np.arange(num_samples) prng.shuffle(inds) numbatches = len(inds) / batch_size for minibatch in range(numbatches): print("minibatch : ", minibatch) use_noise.set_value(0.) 
uidx += 1 conprng = RandomState(seed + uidx + 1) x = [trainA[seq] for seq in inds[minibatch::numbatches]] y = [trainB[seq] for seq in inds[minibatch::numbatches]] cinds = conprng.random_integers(low=0, high=num_samples - 1, size=ncon * len(x)) cy = [trainB[seq] for seq in cinds] x = prepare_data(x, max_len, n_chars, filter_hs[-1]) y = prepare_data(y, max_len, n_chars, filter_hs[-1]) cy = prepare_data(cy, max_len, n_chars, filter_hs[-1]) feats_x, feats_y = f_emb(x, y) (r1, r3, r10, medr, meanr, h_meanr) = rank(feats_x, feats_y) cost = f_grad_shared(x, y, cy) print("cost {},r {}".format(cost, r1)) f_update(lrate) xdata.append(ctr) ctr = ctr + 1 ydata.append(cost) y2data.append(r1) lines.set_xdata(xdata) lines.set_ydata(ydata) lines2.set_xdata(xdata) lines2.set_ydata(y2data) # Need both of these in order to rescale ax[0].relim() ax[0].autoscale_view() ax[1].relim() ax[1].autoscale_view() # We need to draw *and* flush figure.canvas.draw() figure.canvas.flush_events() # the special token does not need to update. set_zero(zero_vec) if np.mod(uidx, dispFreq) == 0: logger.info('Epoch {} Update {} Cost {}'.format( eidx, uidx, cost)) if np.mod(uidx, validFreq) == 0: use_noise.set_value(0.) logger.info('Computing ranks...') # valid_y,slocs = shuffle_valid(valid_y) feats_x, feats_y = f_emb(valid_x, valid_y) # (r1, r3, r10, medr, meanr, h_meanr) = rank_valid(feats_x, feats_y,slocs) (r1, r3, r10, medr, meanr, h_meanr) = rank(feats_x, feats_y) x3data.append(ctr) y3data.append(r1) lines3.set_xdata(x3data) lines3.set_ydata(y3data) ax[2].relim() ax[2].autoscale_view() # We need to draw *and* flush figure.canvas.draw() figure.canvas.flush_events() history_errs.append([r1, r3, r10, medr, meanr, h_meanr]) logger.info('Valid Rank:{}, {}, {}, {},{},{}'.format( r1, r3, r10, medr, meanr, h_meanr)) print('Valid Rank:{}, {}, {}, {},{},{}'.format( r1, r3, r10, medr, meanr, h_meanr)) currscore = r1 + r3 + r10 if currscore > curr: curr = currscore logger.info('Saving...') params = unzip(tparams) np.savez(saveto, history_errs=history_errs, **params) logger.info('Done...') use_noise.set_value(0.) zipp(params, tparams) logger.info('Final results...') feats_x, feats_y = f_emb(valid_x, valid_y) (r1, r3, r10, medr, meanr, h_meanr) = rank(feats_x, feats_y) logger.info('Valid Rank:{}, {}, {}, {},{},{}'.format( r1, r3, r10, medr, meanr, h_meanr)) feats_x, feats_y = f_emb(test_x, test_y) (r1, r3, r10, medr, meanr, h_meanr) = rank(feats_x, feats_y) logger.info('Test Rank:{}, {}, {}, {},{},{}'.format( r1, r3, r10, medr, meanr, h_meanr)) # np.savez("./cnn_feats.npz", feats_x=feats_x, feats_y=feats_y) return (r1, r3, r10, medr, meanr, h_meanr)
def main():
    print args
    print

    accuracies = defaultdict(lambda: [])
    ora_accu = defaultdict(lambda: [])
    ora_cm = defaultdict(lambda: [])
    lbl_dit = defaultdict(lambda: [])
    oracle_accuracies = []
    aucs = defaultdict(lambda: [])
    x_axis = defaultdict(lambda: [])

    vct = TfidfVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=False,
                          ngram_range=(1, 1), token_pattern='\\b\\w+\\b',
                          tokenizer=StemTokenizer())

    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper:
    # http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = None
    if args.train == "20news":
        categories = [['alt.atheism', 'talk.religion.misc'],
                      ['comp.graphics', 'comp.windows.x'],
                      ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                      ['rec.sport.baseball', 'sci.crypt']]
        categories = categories[2]
    elif args.train == "webkb":
        categories = ['student', 'faculty']
    elif args.train == "arxiv":
        categories = [['cs.AI', 'cs.LG'],
                      ['physics.comp-ph', 'physics.data-an']]
        categories = categories[0]

    min_size = 10
    args.fixk = None

    data, vct = load_from_file(args.train, [categories], args.fixk, min_size, vct, raw=True)

    print data.train.target_names
    print "Vectorizer:", vct
    print("Data %s" % args.train)
    print("Data size %s" % len(data.train.data))

    parameters = parse_parameters_mat(args.cost_model)
    print "Cost Parameters %s" % parameters

    cost_model = set_cost_model(args.cost_function, parameters=parameters)
    print "\nCost Model: %s" % cost_model.__class__.__name__

    ### SENTENCE TRANSFORMATION
    if args.train == "twitter":
        sent_detector = TwitterSentenceTokenizer()
    else:
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    ## replace <br> with "." so it is recognized as the end of a sentence
    data.train.data = clean_html(data.train.data)
    data.test.data = clean_html(data.test.data)

    print("Train:{}, Test:{}, {}".format(len(data.train.data), len(data.test.data),
                                         data.test.target.shape[0]))

    ## get the features of the sentence dataset
    ## create splits of data: pool, test, oracle, sentences
    expert_data = Bunch()
    if not args.fulloracle:
        train_test_data = Bunch()
        expert_data.sentence, train_test_data.pool = split_data(data.train)
        expert_data.oracle, train_test_data.test = split_data(data.test)

        data.train.data = train_test_data.pool.train.data
        data.train.target = train_test_data.pool.train.target
        data.test.data = train_test_data.test.train.data
        data.test.target = train_test_data.test.train.target

    ## convert documents to matrices
    data.train.bow = vct.fit_transform(data.train.data)
    data.test.bow = vct.transform(data.test.data)

    #### EXPERT CLASSIFIER: ORACLE
    print("Training Oracle expert")
    exp_clf = set_classifier(args.classifier, parameter=args.expert_penalty)
    if not args.fulloracle:
        print "Training expert documents:%s" % len(expert_data.oracle.train.data)
        labels, sent_train = split_data_sentences(expert_data.oracle.train, sent_detector,
                                                  vct, limit=args.limit)
        expert_data.oracle.train.data = sent_train
        expert_data.oracle.train.target = np.array(labels)
        expert_data.oracle.train.bow = vct.transform(expert_data.oracle.train.data)
        exp_clf.fit(expert_data.oracle.train.bow, expert_data.oracle.train.target)
    else:
        # expert_data.data = np.concatenate((data.train.data, data.test.data))
        # expert_data.target = np.concatenate((data.train.target, data.test.target))
        expert_data.data = data.train.data
        expert_data.target = data.train.target
        expert_data.target_names = data.train.target_names
        labels, sent_train = split_data_sentences(expert_data, sent_detector,
                                                  vct, limit=args.limit)
        expert_data.bow = vct.transform(sent_train)
        expert_data.target = labels
        expert_data.data = sent_train
        exp_clf.fit(expert_data.bow, expert_data.target)

    if "neutral" in args.expert:
        expert = baseexpert.NeutralityExpert(exp_clf, threshold=args.neutral_threshold,
                                             cost_function=cost_model.cost_function)
    elif "true" in args.expert:
        expert = baseexpert.TrueOracleExpert(cost_function=cost_model.cost_function)
    elif "pred" in args.expert:
        expert = baseexpert.PredictingExpert(exp_clf,
                                             # threshold=args.neutral_threshold,
                                             cost_function=cost_model.cost_function)
    elif "human" in args.expert:
        expert = baseexpert.HumanExpert(", ".join(["{}={}".format(a, b)
                                                   for a, b in enumerate(data.train.target_names)]) + "? > ")
    else:
        raise Exception("We need an expert!")

    print "Training expert documents:%s" % len(sent_train)
    print "\nExpert: %s " % expert

    #### EXPERT CLASSIFIER: SENTENCES
    print("Training sentence expert")
    sent_clf = None
    if args.cheating:
        labels, sent_train = split_data_sentences(expert_data.sentence.train, sent_detector,
                                                  vct, limit=args.limit)
        expert_data.sentence.train.data = sent_train
        expert_data.sentence.train.target = np.array(labels)
        expert_data.sentence.train.bow = vct.transform(expert_data.sentence.train.data)
        sent_clf = set_classifier(args.classifier, parameter=args.expert_penalty)
        sent_clf.fit(expert_data.sentence.train.bow, expert_data.sentence.train.target)

    #### STUDENT CLASSIFIER
    clf = set_classifier(args.classifier, parameter=args.expert_penalty)
    print "\nStudent Classifier: %s" % clf
    print "\nSentence Classifier: %s" % sent_clf
    print "\nExpert Oracle Classifier: %s" % exp_clf
    print "Penalty:", exp_clf.C
    print "Oracle "

    #### ACTIVE LEARNING SETTINGS
    step_size = args.step_size
    bootstrap_size = args.bootstrap
    evaluation_points = 200

    print("\nExperiment: step={0}, BT={1}, plot points={2}, fixk:{3}, minsize:{4}".format(
        step_size, bootstrap_size, evaluation_points, args.fixk, min_size))
    print("Anytime active learning experiment - use objective function to pick data")

    t0 = time.time()
    tac = []
    tau = []

    ### experiment starts
    for t in range(args.trials):
        trial_accu = []
        trial_aucs = []

        print "*" * 60
        print "Trial: %s" % t

        student = get_student(clf, cost_model, sent_clf, sent_detector, vct)
        student.human_mode = args.expert == 'human'
        print "\nStudent: %s " % student

        train_indices = []
        neutral_data = []  # save the x_ik vectors
        train_x = []
        train_y = []
        neu_x = []  # data to train the classifier
        neu_y = np.array([])

        pool = Bunch()
        pool.data = data.train.bow.tocsr()  # full words, for training
        pool.text = data.train.data
        pool.target = data.train.target
        pool.predicted = []
        pool.remaining = range(pool.data.shape[0])  # indices of the pool

        rand = RandomState(t * 1234)
        rand.shuffle(pool.remaining)
        pool.offset = 0

        bootstrapped = False
        current_cost = 0
        iteration = 0
        query_index = None
        query_size = None
        oracle_answers = 0
        calibrated = args.calibrate

        while 0 < student.budget and len(pool.remaining) > pool.offset and iteration <= args.maxiter:
            util = []

            if not bootstrapped:
                query_index = pool.remaining[:bootstrap_size]
                bootstrapped = True
                query = pool.data[query_index]
                print
            else:
                # if not calibrated:
                #     chosen = student.pick_next(pool=pool, step_size=step_size)
                # else:
                #     chosen = student.pick_next_cal(pool=pool, step_size=step_size)
                chosen = student.pick_next(pool=pool, step_size=step_size)
                query_index = [x for x, y in chosen]  # document id of chosen instances
                query = [y for x, y in chosen]  # sentence of the document

            query_size = [1] * len(query_index)
            ground_truth = pool.target[query_index]

            if iteration == 0:  ## bootstrap uses ground truth
                labels = ground_truth
                spent = [0] * len(ground_truth)  ## bootstrap cost is ignored
            else:
                # print "ask labels"
                if isinstance(expert, baseexpert.HumanExpert):
                    labels = expert.label_instances(query, ground_truth)
                    # raise Exception("Oops, this is not ready, yet.")
                else:
                    labels = expert.label_instances(query, ground_truth)
                spent = expert.estimate_instances(query_size)

            ### accumulate the cost of the query
            query_cost = np.array(spent).sum()
            current_cost += query_cost

            useful_answers = np.array([[x, y] for x, y in zip(query_index, labels) if y is not None])
            neutral_answers = np.array([[x, z] for x, y, z in zip(query_index, labels, query_size) if y is None]) \
                if iteration != 0 else np.array([])

            ## add recently acquired data to the training set
            if useful_answers.shape[0] != 0:
                train_indices.extend(useful_answers[:, 0])
                train_x = pool.data[train_indices]  # train with all the words
                # update labels with the expert labels
                train_y.extend(useful_answers[:, 1])

            # update sentence student classifier data
            neu_x, neu_y, neutral_data = update_sentence(neutral_data, neu_x, neu_y, labels,
                                                         query_index, pool, vct)

            if neu_y.shape[0] != neu_x.shape[0]:
                raise Exception("Training data corrupted!")
            if train_x.shape[0] != len(train_y):
                raise Exception("Training data corrupted!")

            # remove labels from pool
            pool.offset = len(train_indices)

            # retrain the model
            current_model = student.train_all(train_x, train_y, neu_x, neu_y)

            # evaluate and save results
            y_probas = current_model.predict_proba(data.test.bow)
            auc = metrics.roc_auc_score(data.test.target, y_probas[:, 1])
            pred_y = current_model.classes_[np.argmax(y_probas, axis=1)]
            correct_labels = np.sum(np.array(ground_truth) == np.array(labels).reshape(len(labels)))
            accu = metrics.accuracy_score(data.test.target, pred_y)

            if not student.human_mode:
                print("TS:{0}\tAccu:{1:.3f}\tAUC:{2:.3f}\tCost:{3:.2f}\tCumm:{4:.2f}\tGT:{5}\tneu:{6}\t{7}\tND:{8}\tTD:{9}\t ora_accu:{10}".format(
                    len(train_indices), accu, auc, query_cost, current_cost, ground_truth,
                    len(neutral_answers), neu_y.shape[0], neu_y.sum(), np.sum(train_y), correct_labels))

            ## the results should be based on the cost of the labeling
            if iteration > 0:  # skip the bootstrap iteration
                student.budget -= query_cost  ## bootstrap doesn't count

                # oracle accuracy (from queries)
                oracle_answers += correct_labels

                x_axis_range = current_cost
                x_axis[x_axis_range].append(current_cost)

                ## save results
                accuracies[x_axis_range].append(accu)
                aucs[x_axis_range].append(auc)
                # ora_accu[x_axis_range].append(1. * correct_labels / len(ground_truth))
                ora_accu[x_axis_range].append(1. * correct_labels)
                ora_cm[x_axis_range].append(metrics.confusion_matrix(ground_truth, labels,
                                                                     labels=np.unique(train_y)))
                lbl_dit[x_axis_range].append(np.sum(train_y))

                # partial trial results
                trial_accu.append([x_axis_range, accu])
                trial_aucs.append([x_axis_range, auc])
                # oracle_accuracies[x_axis_range].append(oracle_answers)

            iteration += 1
        # end of budget loop

        tac.append(trial_accu)
        tau.append(trial_aucs)
        oracle_accuracies.append(1. * oracle_answers / (len(train_indices) - bootstrap_size))
        print "Trial: {}, Oracle right answers: {}, Iteration: {}, Labels:{}, ACCU-OR:{}".format(
            t, oracle_answers, iteration, len(train_indices) - bootstrap_size,
            1. * oracle_answers / (len(train_indices) - bootstrap_size))
    # end trial loop

    if args.cost_function not in "uniform":
        accuracies = extrapolate_trials(tac, cost_25=parameters[1][1], step_size=args.step_size)
        aucs = extrapolate_trials(tau, cost_25=parameters[1][1], step_size=args.step_size)

    print "\nAverage oracle accuracy: ", np.array(oracle_accuracies).mean()
    print("Elapsed time %.3f" % (time.time() - t0))

    cheating = "CHEATING" if args.cheating else "NOCHEAT"
    print_extrapolated_results(accuracies, aucs,
                               file_name=args.train + "-" + cheating + "-" + args.prefix + "-" +
                               args.classifier + "-" + args.student)
    oracle_accuracy(ora_accu,
                    file_name=args.train + "-" + cheating + "-" + args.prefix + "-" +
                    args.classifier + "-" + args.student,
                    cm=ora_cm, num_trials=args.trials)
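
# In main() above, every metric is keyed by cumulative labeling cost, so accuracies[cost]
# and aucs[cost] each hold one value per trial that reached that cost. Below is a minimal,
# self-contained sketch of collapsing such a mapping into a learning curve; average_curve
# is an illustrative helper and not part of the original code (the experiment itself uses
# extrapolate_trials / print_extrapolated_results for this step).
from collections import defaultdict
import numpy as np

def average_curve(metric_by_cost):
    """Collapse {cost: [value per trial]} into a sorted (cost, mean, std) curve."""
    curve = []
    for cost in sorted(metric_by_cost):
        vals = np.asarray(metric_by_cost[cost], dtype=float)
        curve.append((cost, vals.mean(), vals.std()))
    return curve

# toy data shaped like the accuracies/aucs defaultdicts above
accuracies_demo = defaultdict(list)
accuracies_demo[10.0].extend([0.61, 0.58])
accuracies_demo[20.0].extend([0.70, 0.66])
for cost, mean, std in average_curve(accuracies_demo):
    print("cost={:.1f} acc={:.3f} +/- {:.3f}".format(cost, mean, std))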
class PMF(ModelBase):
    """Probabilistic Matrix Factorization"""

    def __init__(self, n_user, n_item, n_feature, batch_size=1e5, epsilon=50.0,
                 momentum=0.8, seed=None, reg=1e-2, converge=1e-5,
                 max_rating=None, min_rating=None):
        super(PMF, self).__init__()

        self.n_user = n_user
        self.n_item = n_item
        self.n_feature = n_feature
        self.random_state = RandomState(seed)

        # batch size
        self.batch_size = batch_size
        # learning rate
        self.epsilon = float(epsilon)
        self.momentum = float(momentum)
        # regularization parameter
        self.reg = reg
        self.converge = converge
        self.max_rating = float(max_rating) \
            if max_rating is not None else max_rating
        self.min_rating = float(min_rating) \
            if min_rating is not None else min_rating

        # data state
        self.mean_rating_ = None
        # user/item features
        self.user_features_ = 0.1 * self.random_state.rand(n_user, n_feature)
        self.item_features_ = 0.1 * self.random_state.rand(n_item, n_feature)

    def fit(self, ratings, n_iters=50):
        check_ratings(ratings, self.n_user, self.n_item,
                      self.max_rating, self.min_rating)

        self.mean_rating_ = np.mean(ratings[:, 2])
        last_rmse = None
        # cast before dividing so an integer batch_size cannot truncate the count
        batch_num = int(np.ceil(float(ratings.shape[0]) / self.batch_size))
        logger.debug("batch count = %d", batch_num)

        # momentum
        u_feature_mom = np.zeros((self.n_user, self.n_feature))
        i_feature_mom = np.zeros((self.n_item, self.n_feature))
        # gradient
        u_feature_grads = np.zeros((self.n_user, self.n_feature))
        i_feature_grads = np.zeros((self.n_item, self.n_feature))

        for iteration in xrange(n_iters):
            logger.debug("iteration %d...", iteration)

            self.random_state.shuffle(ratings)

            for batch in xrange(batch_num):
                start_idx = int(batch * self.batch_size)
                end_idx = int((batch + 1) * self.batch_size)
                data = ratings[start_idx:end_idx]

                # compute gradient
                u_features = self.user_features_.take(
                    data.take(0, axis=1), axis=0)
                i_features = self.item_features_.take(
                    data.take(1, axis=1), axis=0)
                preds = np.sum(u_features * i_features, 1)
                errs = preds - (data.take(2, axis=1) - self.mean_rating_)
                err_mat = np.tile(2 * errs, (self.n_feature, 1)).T

                u_grads = i_features * err_mat + self.reg * u_features
                i_grads = u_features * err_mat + self.reg * i_features

                u_feature_grads.fill(0.0)
                i_feature_grads.fill(0.0)
                for i in xrange(data.shape[0]):
                    row = data.take(i, axis=0)
                    u_feature_grads[row[0], :] += u_grads.take(i, axis=0)
                    i_feature_grads[row[1], :] += i_grads.take(i, axis=0)

                # update momentum
                u_feature_mom = (self.momentum * u_feature_mom) + \
                    ((self.epsilon / data.shape[0]) * u_feature_grads)
                i_feature_mom = (self.momentum * i_feature_mom) + \
                    ((self.epsilon / data.shape[0]) * i_feature_grads)

                # update latent variables
                self.user_features_ -= u_feature_mom
                self.item_features_ -= i_feature_mom

            # compute RMSE
            train_preds = self.predict(ratings[:, :2])
            train_rmse = RMSE(train_preds, ratings[:, 2])
            logger.info("iter: %d, train RMSE: %.6f", iteration, train_rmse)

            # stop when converged
            if last_rmse and abs(train_rmse - last_rmse) < self.converge:
                logger.info('converges at iteration %d. stop.', iteration)
                break
            else:
                last_rmse = train_rmse
        return self

    def predict(self, data):
        # explicit None check: a mean rating of 0.0 is a valid fitted state
        if self.mean_rating_ is None:
            raise NotFittedError()

        u_features = self.user_features_.take(data.take(0, axis=1), axis=0)
        i_features = self.item_features_.take(data.take(1, axis=1), axis=0)
        preds = np.sum(u_features * i_features, 1) + self.mean_rating_

        if self.max_rating:
            preds[preds > self.max_rating] = self.max_rating
        if self.min_rating:
            preds[preds < self.min_rating] = self.min_rating
        return preds
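
# A minimal usage sketch for PMF on synthetic (user_id, item_id, rating) rows. The sizes
# and hyper-parameters below are arbitrary; check_ratings and RMSE are assumed to come
# from the surrounding module, as they do inside PMF.fit above.
import numpy as np
from numpy.random import RandomState

rs = RandomState(0)
n_user, n_item, n_obs = 50, 40, 500
ratings = np.column_stack([rs.randint(0, n_user, n_obs),   # user ids
                           rs.randint(0, n_item, n_obs),   # item ids
                           rs.randint(1, 6, n_obs)])       # ratings in [1, 5]

pmf = PMF(n_user=n_user, n_item=n_item, n_feature=8,
          batch_size=200, epsilon=20.0, seed=0, max_rating=5., min_rating=1.)
pmf.fit(ratings, n_iters=10)
print("train RMSE: %.6f" % RMSE(pmf.predict(ratings[:, :2]), ratings[:, 2]))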
# load test set data - same set used for ML tests
seed = 987654321

# set the numpy random seed so results are reproducible
rs = RandomState(seed)

# set common path variables
label_file = './data/input/SDS_PV2_combined/SDS_PV2_class_labels.txt'

# read data
label_data = pd.read_csv(label_file)

# partition the data
pos_cases, neg_cases = wrangle.partion(label_data['doc_norm'] == 1, label_data,
                                       ratios=[0.8, 0.2])
train_mask = np.concatenate((pos_cases[0], neg_cases[0]))
test_mask = np.concatenate((pos_cases[1], neg_cases[1]))
rs.shuffle(train_mask)
rs.shuffle(test_mask)

train_labels = label_data.iloc[train_mask]
test_labels = label_data.iloc[test_mask]

# read in the text reports
train_reports = [load_report('{0}/{1}_fi.txt'.format(report_path, pid))
                 for pid in train_labels['pid']]
test_reports = [load_report('{0}/{1}_fi.txt'.format(report_path, pid))
                for pid in test_labels['pid']]

# import keywords
keywords = {}
with open(keyword_file, 'r') as f:
    key = ""
    for line in f.readlines():
        if line.startswith("#"):
            key = line[1:].strip('\n')
        else:
    else:
        raise Exception('verify failed: %s' % file_path)
    return file_path


# load or download MovieLens 1M dataset
rating_file = ml_1m_download(ML_1M_FOLDER, file_size=ML_1M_ZIP_SIZE)
ratings = load_movielens_1m_ratings(rating_file)
n_user = max(ratings[:, 0])
n_item = max(ratings[:, 1])

# shift user_id & movie_id by 1. let user_id & movie_id start from 0
ratings[:, (0, 1)] -= 1

# split data to training & testing
train_pct = 0.9
rand_state.shuffle(ratings)
train_size = int(train_pct * ratings.shape[0])
train = ratings[:train_size]
validation = ratings[train_size:]

# model settings
n_feature = 10
eval_iters = 10

print("n_user: %d, n_item: %d, n_feature: %d, training size: %d, validation size: %d" % (
    n_user, n_item, n_feature, train.shape[0], validation.shape[0]))

als = ALS(n_user=n_user, n_item=n_item, n_feature=n_feature,
          reg=5e-2, max_rating=5., min_rating=1., seed=0)

als.fit(train, n_iters=eval_iters)
train_preds = als.predict(train[:, :2])
train_rmse = RMSE(train_preds, train[:, 2])
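
# The snippet above only reports training error; scoring the held-out split is symmetric.
# A short follow-up sketch using the names already defined above (als, validation,
# train_rmse, RMSE):
val_preds = als.predict(validation[:, :2])
val_rmse = RMSE(val_preds, validation[:, 2])
print("train RMSE: %.6f, validation RMSE: %.6f" % (train_rmse, val_rmse))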