Example #1
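		# split the original training file (args.input) into args.n_splits smaller files;
		# users are ranked by their BST values (descending) and assigned to files by rank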
		sorted_values = list(bst.values(reverse=True))
		sorted_users  = [x[0] for x in sorted_values]	
		print("[spliting %d users into #files: %d]" % (len(sorted_users),args.n_splits))
		out_files = []	
		out_path, ext = os.path.splitext(args.input) 	
		for i in range(args.n_splits):		
			fname = "%s%d%s" % (out_path,i+1,ext)		
			print("   > %s" % fname)
			f = open(fname,"w")
			out_files.append(f)
		tf.seek(0)
		out_log =  [[] for x in range(args.n_splits)]
		print("[processing users]")
		partition_size = math.floor(len(sorted_users)*1.0/args.n_splits)
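		# each output file covers roughly partition_size consecutive user ranks;
		# the clamps below absorb rounding at the edges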
		for x in stPickle.s_load(tf):
			user, train, _, _, _ = x
			user_rank = sorted_users.index(user)		
			fnumber   = int(math.floor(user_rank*1.0/partition_size)) 		
			if fnumber < 0: fnumber = 0		
			if fnumber > args.n_splits-1: fnumber = args.n_splits-1	
			print("   > user: %s | #train: %d | rank: %d | fnum: %d" % (user, len(train), user_rank, fnumber))
			out_file = out_files[fnumber]
			stPickle.s_dump_elt(x, out_file)		
			out_log[fnumber].append(len(train))
		print("[avg #docs: ]")
		for i in range(len(out_log)): print("   >file %d: %.3f " % (i,np.mean(out_log[i])))			
		print("[removing original training file: %s]" % args.input)
		os.remove(args.input)
	print("error: n_splits should be at least 2")	
Example #2
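			# presumably inside a loop over the current user's documents: when the user
			# index changes, flush the previous user's accumulated data to disk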
			elif u_idx != prev_user:						
				#after accumulating all documents for current user, shuffle and write them to disk			
				assert len(prev_user_data) == len(prev_neg_samples)
				#shuffle the data			
				shuf_idx = np.arange(len(prev_user_data))
				rng.shuffle(shuf_idx)
				prev_user_data = [prev_user_data[i] for i in shuf_idx]
				prev_neg_samples = [prev_neg_samples[i] for i in shuf_idx]
				split = int(len(prev_user_data)*.9)
				train = prev_user_data[:split]
				test  = prev_user_data[split:]	
				neg_samples = prev_neg_samples[:split]
				#each training instance consists of:
				#[user_name, train docs, test docs, negative samples] 			
				stPickle.s_dump_elt([prev_user, train, test, neg_samples ], f_train)				
				prev_user_data = []				
				prev_neg_samples = []							
			elif j == n_docs-1:			
				#can't forget the very last message
				prev_user_data.append(msg_idx)				
				prev_neg_samples.append(negative_samples)
				#shuffle the data			
				shuf_idx = np.arange(len(prev_user_data))
				rng.shuffle(shuf_idx)
				prev_user_data   = [prev_user_data[i] for i in shuf_idx]
				prev_neg_samples = [prev_neg_samples[i] for i in shuf_idx]		
				#split
				split = int(len(prev_user_data)*.9)				
				train = prev_user_data[:split]
				test  = prev_user_data[split:]				
    new_training_data = open(tmp_data_path,"w")    
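    # each instance is streamed into this temporary file with its negative samples
    # filled in; the temporary file replaces args.input at the end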

    for instance in training_data:        
        neg_samples = []
        for msg in instance[TRAIN_IDX]:            
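            # presumably draws args.neg_samples negative word ids for every token in msg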
            neg_samples += [sampler.sample((len(msg),args.neg_samples))]
        instance[NEG_SAMPLES_IDX] = neg_samples                  
        stPickle.s_dump_elt(instance,new_training_data)   

    #replace the training data file with the new augmented one    
    os.remove(args.input)
    os.rename(tmp_data_path, args.input)
    tend = time.time() - t0
    print "\n[runtime: %d minutes (%.2f secs)]" % ((tend/60),tend)    



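For reference, a self-contained sketch of the per-user shuffle and 90/10 split that Example #2 performs before dumping each instance (function and variable names are illustrative, not from the original script):

import numpy as np

def split_user_docs(docs, neg_samples, rng, train_frac=0.9):
    # shuffle the documents and their negative samples with the same permutation
    shuf_idx = np.arange(len(docs))
    rng.shuffle(shuf_idx)
    docs = [docs[i] for i in shuf_idx]
    neg_samples = [neg_samples[i] for i in shuf_idx]
    # the first train_frac of the shuffled documents go to train, the rest to test;
    # only the training documents keep their negative samples
    split = int(len(docs) * train_frac)
    return docs[:split], docs[split:], neg_samples[:split]

rng = np.random.RandomState(1234)
train, test, train_neg = split_user_docs(list(range(20)), list(range(20)), rng)
print(len(train), len(test))  # 18 2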