if args.n_splits > 1:
    # rank users by document count (bst keeps them sorted) and split the
    # training file into args.n_splits roughly balanced parts
    sorted_values = list(bst.values(reverse=True))
    sorted_users = [x[0] for x in sorted_values]
    print("[splitting %d users into #files: %d]" % (len(sorted_users), args.n_splits))
    # open one output file per split, named after the input file
    out_files = []
    out_path, ext = os.path.splitext(args.input)
    for i in range(args.n_splits):
        fname = "%s%d%s" % (out_path, i + 1, ext)
        print(" > %s" % fname)
        f = open(fname, "w")
        out_files.append(f)
    tf.seek(0)
    out_log = [[] for x in range(args.n_splits)]
    print("[processing users]")
    partition_size = math.floor(len(sorted_users) * 1.0 / args.n_splits)
    for x in stPickle.s_load(tf):
        user, train, _, _, _ = x
        # a user's rank in the sorted list decides which file it goes to
        user_rank = sorted_users.index(user)
        fnumber = int(math.floor(user_rank * 1.0 / partition_size))
        # clamp to a valid file index (see the sketch below)
        if fnumber < 0:
            fnumber = 0
        if fnumber > args.n_splits - 1:
            fnumber = args.n_splits - 1
        print(" > user: %s | #train: %d | rank: %d | fnum: %d"
              % (user, len(train), user_rank, fnumber))
        out_file = out_files[fnumber]
        stPickle.s_dump_elt(x, out_file)
        out_log[fnumber].append(len(train))
    # close the output files so buffered writes are flushed
    for f in out_files:
        f.close()
    # report the average number of documents per output file
    print("[avg #docs:]")
    for i in range(len(out_log)):
        print(" > file %d: %.3f" % (i, np.mean(out_log[i])))
    print("[removing original training file: %s]" % args.input)
    os.remove(args.input)
else:
    print("error: n_splits should be at least 2")
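# Note on the clamp above: when len(sorted_users) is not divisible by
# args.n_splits, floor division maps the last few user ranks past the
# final partition. A standalone sketch of the mapping (hypothetical
# helper file_for_rank, not part of the script):

import math

def file_for_rank(user_rank, n_users, n_splits):
    # size of each partition of the rank-sorted user list
    partition_size = math.floor(n_users * 1.0 / n_splits)
    fnumber = int(math.floor(user_rank * 1.0 / partition_size))
    # clamp: e.g. with 10 users over 3 files, rank 9 would map to file 3
    return max(0, min(fnumber, n_splits - 1))

# [file_for_rank(r, 10, 3) for r in range(10)] -> [0, 0, 0, 1, 1, 1, 2, 2, 2, 2]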
elif u_idx != prev_user:
    # after accumulating all documents for the current user,
    # shuffle them and write the instance to disk
    assert len(prev_user_data) == len(prev_neg_samples)
    # shuffle the data
    shuf_idx = np.arange(len(prev_user_data))
    rng.shuffle(shuf_idx)
    prev_user_data = [prev_user_data[i] for i in shuf_idx]
    prev_neg_samples = [prev_neg_samples[i] for i in shuf_idx]
    # 90/10 train/test split
    split = int(len(prev_user_data) * .9)
    train = prev_user_data[:split]
    test = prev_user_data[split:]
    neg_samples = prev_neg_samples[:split]
    # each training instance consists of:
    # [user_name, train docs, test docs, negative samples]
    stPickle.s_dump_elt([prev_user, train, test, neg_samples], f_train)
    prev_user_data = []
    prev_neg_samples = []
elif j == n_docs - 1:
    # can't forget the very last message
    prev_user_data.append(msg_idx)
    prev_neg_samples.append(negative_samples)
    # shuffle the data
    shuf_idx = np.arange(len(prev_user_data))
    rng.shuffle(shuf_idx)
    prev_user_data = [prev_user_data[i] for i in shuf_idx]
    prev_neg_samples = [prev_neg_samples[i] for i in shuf_idx]
    # 90/10 train/test split
    split = int(len(prev_user_data) * .9)
    train = prev_user_data[:split]
    test = prev_user_data[split:]
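# stPickle is assumed here to be a streaming-pickle helper: instances are
# written and read back one element at a time, so the full dataset never
# has to fit in memory. A minimal sketch of that interface, assuming it
# simply chains pickle.dump/pickle.load on a single file handle (the
# actual module may differ; on Python 3 the files must be opened in
# binary mode):

import pickle

def s_dump_elt(elt, f):
    # append one pickled element to an open file
    pickle.dump(elt, f)

def s_load(f):
    # lazily yield elements until the end of the file is reached
    while True:
        try:
            yield pickle.load(f)
        except EOFError:
            return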
new_training_data = open(tmp_data_path, "w")
# draw all negative samples for a message in a single vectorized call
# (this replaces an older, much slower per-token multinomial loop)
for instance in training_data:
    neg_samples = []
    for msg in instance[TRAIN_IDX]:
        neg_samples += [sampler.sample((len(msg), args.neg_samples))]
    instance[NEG_SAMPLES_IDX] = neg_samples
    stPickle.s_dump_elt(instance, new_training_data)
new_training_data.close()
# replace the training data file with the new augmented one
os.remove(args.input)
os.rename(tmp_data_path, args.input)
tend = time.time() - t0
print("\n[runtime: %d minutes (%.2f secs)]" % (tend / 60, tend))
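# sampler.sample((len(msg), args.neg_samples)) draws a whole block of
# negative word indices in one vectorized call, which is what makes this
# pass fast. One common way to get O(1) draws from a fixed unigram
# distribution is Walker's alias method; a sketch under that assumption
# (hypothetical class name AliasSampler; the repo's sampler may be
# implemented differently):

import numpy as np

class AliasSampler(object):

    def __init__(self, probs, rng=None):
        # precompute the alias tables for a fixed discrete distribution
        probs = np.asarray(probs, dtype=np.float64)
        probs = probs / probs.sum()
        n = len(probs)
        self.rng = rng if rng is not None else np.random.RandomState()
        self.prob = np.zeros(n)
        self.alias = np.zeros(n, dtype=np.int64)
        scaled = probs * n
        # split indices into those below/above the uniform weight 1/n
        small = [i for i in range(n) if scaled[i] < 1.0]
        large = [i for i in range(n) if scaled[i] >= 1.0]
        while small and large:
            s, l = small.pop(), large.pop()
            self.prob[s] = scaled[s]
            self.alias[s] = l
            # the large entry donates mass to fill the small one
            scaled[l] -= (1.0 - scaled[s])
            (small if scaled[l] < 1.0 else large).append(l)
        for i in large + small:
            self.prob[i] = 1.0

    def sample(self, shape):
        # pick a random column, then keep it or jump to its alias
        idx = self.rng.randint(0, len(self.prob), size=shape)
        keep = self.rng.random_sample(shape) < self.prob[idx]
        return np.where(keep, idx, self.alias[idx])

# usage, assuming a unigram_distribution over the vocabulary:
#   sampler = AliasSampler(unigram_distribution)
#   sampler.sample((len(msg), args.neg_samples))  # (tokens x neg) indices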