Example #1
from collections import defaultdict


# load_users, load_texts, process_texts and the SATED_* path constants are
# helpers defined elsewhere in this module.
def load_train_users_heldout_data(train_users,
                                  src_vocabs,
                                  trg_vocabs,
                                  user_data_ratio=0.5):
    """Load the held-out tail of each given user's SATED training data.

    For each user in train_users, the first int(n * user_data_ratio) sentence
    pairs are dropped; the remainder is kept and processed against the given
    src/trg vocabularies.
    """
    src_users = load_users(SATED_TRAIN_USER)
    train_src_texts = load_texts(SATED_TRAIN_ENG)
    train_trg_texts = load_texts(SATED_TRAIN_FR)

    user_src_texts = defaultdict(list)
    user_trg_texts = defaultdict(list)

    for u, s, t in zip(src_users, train_src_texts, train_trg_texts):
        if u in train_users:
            user_src_texts[u].append(s)
            user_trg_texts[u].append(t)

    assert 0. < user_data_ratio < 1.
    # Hold out the tail of each user's data: drop the first user_data_ratio
    # share and keep the remainder.
    for u in user_src_texts:
        cutoff = int(len(user_src_texts[u]) * user_data_ratio)
        user_src_texts[u] = user_src_texts[u][cutoff:]
        user_trg_texts[u] = user_trg_texts[u][cutoff:]

    for u in train_users:
        process_texts(user_src_texts[u], src_vocabs)
        process_texts(user_trg_texts[u], trg_vocabs)

    return user_src_texts, user_trg_texts
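
A minimal usage sketch, assuming the SATED files referenced above are in place and that train_users, src_vocabs and trg_vocabs come from an earlier call such as load_shadow_user_data in Example #2 below:

heldout_src, heldout_trg = load_train_users_heldout_data(train_users,
                                                         src_vocabs,
                                                         trg_vocabs,
                                                         user_data_ratio=0.5)
# Report how many held-out sentence pairs a few users keep.
for u in list(heldout_src)[:5]:
    print(u, len(heldout_src[u]), len(heldout_trg[u]))
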
Example #2
from collections import Counter, defaultdict
from itertools import chain

import numpy as np


def load_shadow_user_data(train_users,
                          num_users=100,
                          num_words=10000,
                          seed=12345):
    """Build per-user shadow data from the SATED training split.

    train_users are the shadow 'member' users; test_users are the remaining
    users drawn from the same attacker pool. Texts are processed against a
    num_words vocabulary built from the member users' data.
    """
    src_users = load_users(SATED_TRAIN_USER)
    train_src_texts = load_texts(SATED_TRAIN_ENG)
    train_trg_texts = load_texts(SATED_TRAIN_FR)

    user_counter = Counter(src_users)
    all_users = [tup[0] for tup in user_counter.most_common()]
    np.random.seed(seed)
    np.random.shuffle(all_users)
    np.random.seed(None)

    attacker_users = all_users[num_users * 2:num_users * 4]
    test_users = np.setdiff1d(attacker_users, train_users)
    print(len(train_users), len(test_users))

    user_src_texts = defaultdict(list)
    user_trg_texts = defaultdict(list)

    test_user_src_texts = defaultdict(list)
    test_user_trg_texts = defaultdict(list)

    for u, s, t in zip(src_users, train_src_texts, train_trg_texts):
        if u in train_users:
            user_src_texts[u].append(s)
            user_trg_texts[u].append(t)
        if u in test_users:
            test_user_src_texts[u].append(s)
            test_user_trg_texts[u].append(t)

    src_words = []
    trg_words = []
    for u in train_users:
        src_words += list(chain(*user_src_texts[u]))
        trg_words += list(chain(*user_trg_texts[u]))

    src_vocabs = process_vocabs(src_words, num_words)
    trg_vocabs = process_vocabs(trg_words, num_words)

    for u in train_users:
        process_texts(user_src_texts[u], src_vocabs)
        process_texts(user_trg_texts[u], trg_vocabs)

    for u in test_users:
        process_texts(test_user_src_texts[u], src_vocabs)
        process_texts(test_user_trg_texts[u], trg_vocabs)

    src_words = []
    trg_words = []

    for u in train_users:
        src_words += list(chain(*user_src_texts[u]))
        trg_words += list(chain(*user_trg_texts[u]))

    src_vocabs = process_vocabs(src_words, None)
    trg_vocabs = process_vocabs(trg_words, None)

    return user_src_texts, user_trg_texts, test_user_src_texts, test_user_trg_texts, src_vocabs, trg_vocabs
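
A sketch of inspecting the shadow split this function produces; train_users is assumed to be a set of SATED user ids chosen by the caller (e.g. sampled from the same attacker pool):

(user_src, user_trg,
 test_user_src, test_user_trg,
 src_vocabs, trg_vocabs) = load_shadow_user_data(train_users,
                                                 num_users=100,
                                                 num_words=10000)

print('shadow member users    :', len(user_src))
print('shadow non-member users:', len(test_user_src))
print('pairs for one member   :', len(next(iter(user_src.values()))))
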
def load_ubuntu_by_user(num_users=200,
                        num_words=5000,
                        num_data_per_user=200,
                        test_size=5000):
    """Load num_data_per_user Ubuntu dialogue pairs for each of num_users
    randomly chosen users, plus a shared test split, with vocabularies built
    from the chosen users' data."""
    src_texts, trg_texts = load_extracted_ubuntu(num_users *
                                                 num_data_per_user * 2 +
                                                 test_size)
    test_src_texts = src_texts[-test_size:]
    test_trg_texts = trg_texts[-test_size:]
    src_texts = src_texts[:-test_size]
    trg_texts = trg_texts[:-test_size]

    all_users = np.arange(num_users * 2)
    np.random.seed(None)
    train_users = np.random.choice(all_users, size=num_users, replace=False)

    user_src_texts = defaultdict(list)
    user_trg_texts = defaultdict(list)

    for u in train_users:
        user_src_texts[u] = src_texts[u * num_data_per_user:(u + 1) *
                                      num_data_per_user]
        user_trg_texts[u] = trg_texts[u * num_data_per_user:(u + 1) *
                                      num_data_per_user]

    src_words = []
    trg_words = []
    for u in train_users:
        src_words += list(chain(*user_src_texts[u]))
        trg_words += list(chain(*user_trg_texts[u]))

    src_vocabs = process_vocabs(src_words, num_words)
    trg_vocabs = process_vocabs(trg_words, num_words)

    for u in train_users:
        process_texts(user_src_texts[u], src_vocabs)
        process_texts(user_trg_texts[u], trg_vocabs)

    process_texts(test_src_texts, src_vocabs)
    process_texts(test_trg_texts, trg_vocabs)

    src_words = []
    trg_words = []
    for u in train_users:
        src_words += list(chain(*user_src_texts[u]))
        trg_words += list(chain(*user_trg_texts[u]))

    src_vocabs = process_vocabs(src_words, None)
    trg_vocabs = process_vocabs(trg_words, None)

    # There is no separate dev split here: the test split is returned twice,
    # keeping the return shape aligned with load_cornell_movie_by_user below.
    return (user_src_texts, user_trg_texts,
            test_src_texts, test_trg_texts,
            test_src_texts, test_trg_texts,
            src_vocabs, trg_vocabs)
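
A minimal usage sketch, assuming the extracted Ubuntu data and the module's helpers (load_extracted_ubuntu, process_vocabs, process_texts) are available; flattening the per-user dicts into parallel lists is illustrative, not part of the function:

(user_src, user_trg,
 dev_src, dev_trg,
 test_src, test_trg,
 src_vocabs, trg_vocabs) = load_ubuntu_by_user(num_users=200)

# dev_* and test_* are the same split here (see the return above).
train_src, train_trg = [], []
for u in user_src:
    train_src.extend(user_src[u])
    train_trg.extend(user_trg[u])
print(len(train_src), 'training pairs from', len(user_src), 'users')
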


# if __name__ == '__main__':
#     # load_cornell_movie_by_user(num_users=200, sample_user=False, user_data_ratio=0.5)
#     load_ubuntu_by_user()
def load_cornell_movie_by_user(num_users=100,
                               num_words=5000,
                               test_on_user=False,
                               sample_user=False,
                               min_count=20,
                               user_data_ratio=0.):
    """Load Cornell Movie dialogue data grouped by speaker.

    Returns per-user train dicts and either per-user test dicts
    (test_on_user=True) or the flat dev/test splits, plus the vocabularies
    built from the selected users.
    """
    train_data, dev_data, test_data = load_extracted_cornell_movie(
        dev_size=5000, test_size=5000)
    train_src_texts, train_trg_texts, src_users, _ = train_data
    dev_src_texts, dev_trg_texts = dev_data
    test_src_texts, test_trg_texts = test_data

    user_counter = Counter(src_users)
    all_users = np.asarray(
        [tup[0] for tup in user_counter.most_common() if tup[1] >= min_count])
    print('Loaded {} users'.format(len(all_users)))

    np.random.seed(12345)
    np.random.shuffle(all_users)
    np.random.seed(None)

    train_users = set(all_users[:num_users])
    test_users = all_users[num_users:num_users * 2]

    if sample_user:
        attacker_users = all_users[num_users * 2:num_users * 4]
        np.random.seed(None)
        train_users = np.random.choice(attacker_users,
                                       size=num_users,
                                       replace=False)
        print(train_users[:10])

    user_src_texts = defaultdict(list)
    user_trg_texts = defaultdict(list)

    test_user_src_texts = defaultdict(list)
    test_user_trg_texts = defaultdict(list)

    for u, s, t in zip(src_users, train_src_texts, train_trg_texts):
        if u in train_users:
            user_src_texts[u].append(s)
            user_trg_texts[u].append(t)
        if test_on_user and u in test_users:
            test_user_src_texts[u].append(s)
            test_user_trg_texts[u].append(t)

    if 0. < user_data_ratio < 1.:
        # Hold out a fraction of each user's data: keep only the first
        # user_data_ratio share for training.
        for u in user_src_texts:
            cutoff = int(len(user_src_texts[u]) * user_data_ratio)
            user_src_texts[u] = user_src_texts[u][:cutoff]
            user_trg_texts[u] = user_trg_texts[u][:cutoff]

    src_words = []
    trg_words = []
    for u in train_users:
        src_words += list(chain(*user_src_texts[u]))
        trg_words += list(chain(*user_trg_texts[u]))

    src_vocabs = process_vocabs(src_words, num_words)
    trg_vocabs = process_vocabs(trg_words, num_words)

    for u in train_users:
        process_texts(user_src_texts[u], src_vocabs)
        process_texts(user_trg_texts[u], trg_vocabs)

    if test_on_user:
        for u in test_users:
            process_texts(test_user_src_texts[u], src_vocabs)
            process_texts(test_user_trg_texts[u], trg_vocabs)

    process_texts(dev_src_texts, src_vocabs)
    process_texts(dev_trg_texts, trg_vocabs)

    process_texts(test_src_texts, src_vocabs)
    process_texts(test_trg_texts, trg_vocabs)

    src_words = []
    trg_words = []

    for u in train_users:
        src_words += list(chain(*user_src_texts[u]))
        trg_words += list(chain(*user_trg_texts[u]))

    src_vocabs = process_vocabs(src_words, None)
    trg_vocabs = process_vocabs(trg_words, None)

    if test_on_user:
        return user_src_texts, user_trg_texts, test_user_src_texts, test_user_trg_texts, src_vocabs, trg_vocabs
    else:
        return user_src_texts, user_trg_texts, dev_src_texts, dev_trg_texts, test_src_texts, test_trg_texts, \
               src_vocabs, trg_vocabs
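
The two return modes of load_cornell_movie_by_user differ in what the middle slots carry; a minimal sketch of both calls, assuming the extracted Cornell Movie files exist:

# Per-user evaluation: train-user and test-user dicts plus vocabularies.
(u_src, u_trg,
 test_u_src, test_u_trg,
 src_vocabs, trg_vocabs) = load_cornell_movie_by_user(num_users=100,
                                                      test_on_user=True)

# Standard split: per-user train dicts plus flat dev/test lists.
(u_src, u_trg,
 dev_src, dev_trg,
 test_src, test_trg,
 src_vocabs, trg_vocabs) = load_cornell_movie_by_user(num_users=100,
                                                      test_on_user=False)
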
def load_cornell_movie(num_words=10000):
    """Load the flat Cornell Movie train/dev/test splits and the
    vocabularies built from the training texts."""
    train_data, dev_data, test_data = load_extracted_cornell_movie()
    train_src_texts, train_trg_texts, _, _ = train_data
    dev_src_texts, dev_trg_texts = dev_data
    test_src_texts, test_trg_texts = test_data

    src_words = list(chain(*train_src_texts))
    trg_words = list(chain(*train_trg_texts))

    src_vocabs = process_vocabs(src_words, num_words)
    trg_vocabs = process_vocabs(trg_words, num_words)

    process_texts(train_src_texts, src_vocabs)
    process_texts(train_trg_texts, trg_vocabs)

    process_texts(dev_src_texts, src_vocabs)
    process_texts(dev_trg_texts, trg_vocabs)

    process_texts(test_src_texts, src_vocabs)
    process_texts(test_trg_texts, trg_vocabs)

    src_words = list(chain(*train_src_texts))
    trg_words = list(chain(*train_trg_texts))

    src_vocabs = process_vocabs(src_words, None)
    trg_vocabs = process_vocabs(trg_words, None)
    return train_src_texts, train_trg_texts, dev_src_texts, dev_trg_texts, test_src_texts, test_trg_texts, \
           src_vocabs, trg_vocabs
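
And a sketch of the non-user-level loader, checking the split sizes it returns (data files assumed present):

(train_src, train_trg,
 dev_src, dev_trg,
 test_src, test_trg,
 src_vocabs, trg_vocabs) = load_cornell_movie(num_words=10000)

for name, split in [('train', train_src), ('dev', dev_src), ('test', test_src)]:
    print(name, len(split), 'sentence pairs')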