# Imports used by the loaders below (repo helpers such as load_users,
# load_texts, load_extracted_ubuntu, load_extracted_cornell_movie,
# process_vocabs, process_texts and the SATED_* path constants are defined
# elsewhere in this module).
from collections import Counter, defaultdict
from itertools import chain

import numpy as np


def load_train_users_heldout_data(train_users, src_vocabs, trg_vocabs, user_data_ratio=0.5):
    src_users = load_users(SATED_TRAIN_USER)
    train_src_texts = load_texts(SATED_TRAIN_ENG)
    train_trg_texts = load_texts(SATED_TRAIN_FR)

    # collect the sentence pairs belonging to the requested training users
    user_src_texts = defaultdict(list)
    user_trg_texts = defaultdict(list)
    for u, s, t in zip(src_users, train_src_texts, train_trg_texts):
        if u in train_users:
            user_src_texts[u].append(s)
            user_trg_texts[u].append(t)

    assert 0. < user_data_ratio < 1.
    # keep only the held-out tail of each user's data; the training loader
    # keeps the complementary head texts[:cutoff]
    for u in user_src_texts:
        cutoff = int(len(user_src_texts[u]) * user_data_ratio)
        user_src_texts[u] = user_src_texts[u][cutoff:]
        user_trg_texts[u] = user_trg_texts[u][cutoff:]

    for u in train_users:
        process_texts(user_src_texts[u], src_vocabs)
        process_texts(user_trg_texts[u], trg_vocabs)

    return user_src_texts, user_trg_texts
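
# A minimal sketch (illustrative only, not part of the pipeline): with
# user_data_ratio=0.5, load_cornell_movie_by_user keeps the head texts[:cutoff]
# of each user's data for training, while load_train_users_heldout_data returns
# the complementary tail texts[cutoff:]. _demo_heldout_split is a hypothetical
# helper with toy data.
def _demo_heldout_split(user_data_ratio=0.5):
    texts = [['a'], ['b'], ['c'], ['d']]
    cutoff = int(len(texts) * user_data_ratio)  # cutoff == 2 here
    train_half = texts[:cutoff]    # what the training loader keeps
    heldout_half = texts[cutoff:]  # what the held-out loader returns
    return train_half, heldout_half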
def load_shadow_user_data(train_users, num_users=100, num_words=10000, seed=12345):
    src_users = load_users(SATED_TRAIN_USER)
    train_src_texts = load_texts(SATED_TRAIN_ENG)
    train_trg_texts = load_texts(SATED_TRAIN_FR)

    # shuffle users deterministically so the shadow pool is reproducible
    user_counter = Counter(src_users)
    all_users = [tup[0] for tup in user_counter.most_common()]
    np.random.seed(seed)
    np.random.shuffle(all_users)
    np.random.seed(None)

    # the attacker's shadow pool is disjoint from the target model's users;
    # shadow test users are the pool members not used for shadow training
    attacker_users = all_users[num_users * 2:num_users * 4]
    test_users = np.setdiff1d(attacker_users, train_users)
    print(len(train_users), len(test_users))

    user_src_texts = defaultdict(list)
    user_trg_texts = defaultdict(list)
    test_user_src_texts = defaultdict(list)
    test_user_trg_texts = defaultdict(list)

    for u, s, t in zip(src_users, train_src_texts, train_trg_texts):
        if u in train_users:
            user_src_texts[u].append(s)
            user_trg_texts[u].append(t)
        if u in test_users:
            test_user_src_texts[u].append(s)
            test_user_trg_texts[u].append(t)

    # first pass: build top-num_words vocabularies from the shadow training texts
    src_words = []
    trg_words = []
    for u in train_users:
        src_words += list(chain(*user_src_texts[u]))
        trg_words += list(chain(*user_trg_texts[u]))

    src_vocabs = process_vocabs(src_words, num_words)
    trg_vocabs = process_vocabs(trg_words, num_words)

    # rewrite out-of-vocabulary tokens in both shadow train and test texts
    for u in train_users:
        process_texts(user_src_texts[u], src_vocabs)
        process_texts(user_trg_texts[u], trg_vocabs)

    for u in test_users:
        process_texts(test_user_src_texts[u], src_vocabs)
        process_texts(test_user_trg_texts[u], trg_vocabs)

    # second pass: rebuild the full vocabularies over the rewritten texts
    src_words = []
    trg_words = []
    for u in train_users:
        src_words += list(chain(*user_src_texts[u]))
        trg_words += list(chain(*user_trg_texts[u]))

    src_vocabs = process_vocabs(src_words, None)
    trg_vocabs = process_vocabs(trg_words, None)

    return user_src_texts, user_trg_texts, test_user_src_texts, test_user_trg_texts, src_vocabs, trg_vocabs
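
# Sketch of the user partitioning above (assumes the same slicing convention;
# _demo_user_partition is a hypothetical helper, not repo code): the shadow
# pool all_users[2n:4n] is disjoint from the target model's users
# all_users[:n], and np.setdiff1d yields shadow test users that were not
# drawn for shadow training.
def _demo_user_partition(num_users=3):
    all_users = np.arange(num_users * 4)
    shadow_pool = all_users[num_users * 2:num_users * 4]
    shadow_train = np.random.choice(shadow_pool, size=num_users, replace=False)
    shadow_test = np.setdiff1d(shadow_pool, shadow_train)  # pool minus train
    return shadow_train, shadow_test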
def load_ubuntu_by_user(num_users=200, num_words=5000, num_data_per_user=200, test_size=5000):
    # load twice as many user blocks as needed, plus a flat test set
    src_texts, trg_texts = load_extracted_ubuntu(num_users * num_data_per_user * 2 + test_size)
    test_src_texts = src_texts[-test_size:]
    test_trg_texts = trg_texts[-test_size:]
    src_texts = src_texts[:-test_size]
    trg_texts = trg_texts[:-test_size]

    # sample half of the candidate user blocks as training users
    all_users = np.arange(num_users * 2)
    np.random.seed(None)
    train_users = np.random.choice(all_users, size=num_users, replace=False)

    user_src_texts = defaultdict(list)
    user_trg_texts = defaultdict(list)
    for u in train_users:
        user_src_texts[u] = src_texts[u * num_data_per_user:(u + 1) * num_data_per_user]
        user_trg_texts[u] = trg_texts[u * num_data_per_user:(u + 1) * num_data_per_user]

    # first pass: build top-num_words vocabularies from the training texts
    src_words = []
    trg_words = []
    for u in train_users:
        src_words += list(chain(*user_src_texts[u]))
        trg_words += list(chain(*user_trg_texts[u]))

    src_vocabs = process_vocabs(src_words, num_words)
    trg_vocabs = process_vocabs(trg_words, num_words)

    # rewrite out-of-vocabulary tokens in the train and test texts
    for u in train_users:
        process_texts(user_src_texts[u], src_vocabs)
        process_texts(user_trg_texts[u], trg_vocabs)

    process_texts(test_src_texts, src_vocabs)
    process_texts(test_trg_texts, trg_vocabs)

    # second pass: rebuild the full vocabularies over the rewritten texts
    src_words = []
    trg_words = []
    for u in train_users:
        src_words += list(chain(*user_src_texts[u]))
        trg_words += list(chain(*user_trg_texts[u]))

    src_vocabs = process_vocabs(src_words, None)
    trg_vocabs = process_vocabs(trg_words, None)

    # no separate dev set: the test pair is returned twice to match the
    # (dev, test) return shape of load_cornell_movie_by_user
    return user_src_texts, user_trg_texts, test_src_texts, test_trg_texts, test_src_texts, test_trg_texts, \
        src_vocabs, trg_vocabs


# if __name__ == '__main__':
#     # load_cornell_movie_by_user(num_users=200, sample_user=False, user_data_ratio=0.5)
#     load_ubuntu_by_user()
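
# Usage sketch for the Ubuntu loader (argument values are illustrative):
#
#   user_src, user_trg, dev_src, dev_trg, test_src, test_trg, src_vocabs, trg_vocabs = \
#       load_ubuntu_by_user(num_users=200, num_words=5000)
#
# Here dev_src/dev_trg and test_src/test_trg alias the same lists, since the
# loader returns its single test split twice.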
def load_cornell_movie_by_user(num_users=100, num_words=5000, test_on_user=False, sample_user=False,
                               min_count=20, user_data_ratio=0.):
    train_data, dev_data, test_data = load_extracted_cornell_movie(dev_size=5000, test_size=5000)
    train_src_texts, train_trg_texts, src_users, _ = train_data
    dev_src_texts, dev_trg_texts = dev_data
    test_src_texts, test_trg_texts = test_data

    # keep only users with at least min_count sentence pairs
    user_counter = Counter(src_users)
    all_users = np.asarray([tup[0] for tup in user_counter.most_common() if tup[1] >= min_count])
    print('Loaded {} users'.format(len(all_users)))

    # deterministic shuffle so the train/test/shadow user splits are reproducible
    np.random.seed(12345)
    np.random.shuffle(all_users)
    np.random.seed(None)

    train_users = set(all_users[:num_users])
    test_users = all_users[num_users:num_users * 2]

    if sample_user:
        # shadow mode: draw the training users from a disjoint attacker pool instead
        attacker_users = all_users[num_users * 2:num_users * 4]
        np.random.seed(None)
        train_users = np.random.choice(attacker_users, size=num_users, replace=False)
        print(train_users[:10])

    user_src_texts = defaultdict(list)
    user_trg_texts = defaultdict(list)
    test_user_src_texts = defaultdict(list)
    test_user_trg_texts = defaultdict(list)

    for u, s, t in zip(src_users, train_src_texts, train_trg_texts):
        if u in train_users:
            user_src_texts[u].append(s)
            user_trg_texts[u].append(t)
        if test_on_user and u in test_users:
            test_user_src_texts[u].append(s)
            test_user_trg_texts[u].append(t)

    if 0. < user_data_ratio < 1.:
        # hold out a fraction of each user's data for later testing; keep the
        # head here, load_train_users_heldout_data returns the complementary tail
        for u in user_src_texts:
            cutoff = int(len(user_src_texts[u]) * user_data_ratio)
            user_src_texts[u] = user_src_texts[u][:cutoff]
            user_trg_texts[u] = user_trg_texts[u][:cutoff]

    # first pass: build top-num_words vocabularies from the training texts
    src_words = []
    trg_words = []
    for u in train_users:
        src_words += list(chain(*user_src_texts[u]))
        trg_words += list(chain(*user_trg_texts[u]))

    src_vocabs = process_vocabs(src_words, num_words)
    trg_vocabs = process_vocabs(trg_words, num_words)

    # rewrite out-of-vocabulary tokens in all splits
    for u in train_users:
        process_texts(user_src_texts[u], src_vocabs)
        process_texts(user_trg_texts[u], trg_vocabs)

    if test_on_user:
        for u in test_users:
            process_texts(test_user_src_texts[u], src_vocabs)
            process_texts(test_user_trg_texts[u], trg_vocabs)

    process_texts(dev_src_texts, src_vocabs)
    process_texts(dev_trg_texts, trg_vocabs)
    process_texts(test_src_texts, src_vocabs)
    process_texts(test_trg_texts, trg_vocabs)

    # second pass: rebuild the full vocabularies over the rewritten texts
    src_words = []
    trg_words = []
    for u in train_users:
        src_words += list(chain(*user_src_texts[u]))
        trg_words += list(chain(*user_trg_texts[u]))

    src_vocabs = process_vocabs(src_words, None)
    trg_vocabs = process_vocabs(trg_words, None)

    if test_on_user:
        return user_src_texts, user_trg_texts, test_user_src_texts, test_user_trg_texts, src_vocabs, trg_vocabs
    else:
        return user_src_texts, user_trg_texts, dev_src_texts, dev_trg_texts, test_src_texts, test_trg_texts, \
            src_vocabs, trg_vocabs
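
# Usage sketch (hedged; argument values are illustrative). The return shape
# depends on test_on_user:
#
#   # per-user dicts for membership evaluation (6-tuple)
#   u_src, u_trg, test_u_src, test_u_trg, src_vocabs, trg_vocabs = \
#       load_cornell_movie_by_user(num_users=100, test_on_user=True)
#
#   # flat dev/test splits for standard training (8-tuple)
#   u_src, u_trg, dev_src, dev_trg, test_src, test_trg, src_vocabs, trg_vocabs = \
#       load_cornell_movie_by_user(num_users=100)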
def load_cornell_movie(num_words=10000):
    train_data, dev_data, test_data = load_extracted_cornell_movie()
    train_src_texts, train_trg_texts, _, _ = train_data
    dev_src_texts, dev_trg_texts = dev_data
    test_src_texts, test_trg_texts = test_data

    # first pass: top-num_words vocabularies from the training texts
    src_words = list(chain(*train_src_texts))
    trg_words = list(chain(*train_trg_texts))
    src_vocabs = process_vocabs(src_words, num_words)
    trg_vocabs = process_vocabs(trg_words, num_words)

    # rewrite out-of-vocabulary tokens in all splits
    process_texts(train_src_texts, src_vocabs)
    process_texts(train_trg_texts, trg_vocabs)
    process_texts(dev_src_texts, src_vocabs)
    process_texts(dev_trg_texts, trg_vocabs)
    process_texts(test_src_texts, src_vocabs)
    process_texts(test_trg_texts, trg_vocabs)

    # second pass: rebuild the full vocabularies over the rewritten texts
    src_words = list(chain(*train_src_texts))
    trg_words = list(chain(*train_trg_texts))
    src_vocabs = process_vocabs(src_words, None)
    trg_vocabs = process_vocabs(trg_words, None)

    return train_src_texts, train_trg_texts, dev_src_texts, dev_trg_texts, test_src_texts, test_trg_texts, \
        src_vocabs, trg_vocabs
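
# Every loader above uses the same two-pass vocabulary pattern. The exact
# behavior of process_vocabs/process_texts lives elsewhere in this repo; the
# assumption, sketched with a stand-in below, is that the first pass keeps the
# num_words most frequent tokens, process_texts rewrites out-of-vocabulary
# tokens in place (e.g. to an <unk> symbol), and the second pass
# (num_words=None) rebuilds the full token map over the rewritten texts so
# that <unk> itself receives an index. _demo_two_pass_vocab is hypothetical.
def _demo_two_pass_vocab(texts, num_words=2):
    counts = Counter(w for sent in texts for w in sent)
    kept = {w for w, _ in counts.most_common(num_words)}       # pass 1: top-k tokens
    for sent in texts:
        sent[:] = [w if w in kept else '<unk>' for w in sent]  # rewrite OOV in place
    words = sorted(set(w for sent in texts for w in sent))     # pass 2: full map
    return {w: i for i, w in enumerate(words)}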