Example 1
def read_indices(dataset):
    if dataset == "youtube":
        indices_filepath = (get_package_path() +
                            "/data_folder/uniform_data/youtube/data_splits.txt")
    elif dataset == "reddit":
        indices_filepath = (get_package_path() +
                            "/data_folder/uniform_data/reddit/data_splits.txt")
    else:
        raise ValueError("Invalid dataset.")

    with open(indices_filepath, "r") as fp:
        file_row = next(fp)

        clean_row = file_row.strip().split("\t")

        train_size = int(clean_row[0])
        val_size = int(clean_row[1])
        test_size = int(clean_row[2])

        indices = np.empty(train_size + val_size + test_size, dtype=np.int32)

        i = 0
        for file_row in fp:
            clean_row = file_row.strip()
            indices[i] = int(clean_row)

            i += 1

        train = indices[:train_size]
        val = indices[train_size:train_size + val_size]
        test = indices[train_size + val_size:]

    return train, val, test
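The file format that read_indices expects follows directly from the code above: a tab-separated header with the three split sizes, then one shuffled item index per line. A minimal sketch that produces such a file (the path and sizes are illustrative, not the project's):

import numpy as np

# Write a splits file in the layout read_indices parses.
indices = np.random.permutation(10)

with open("/tmp/data_splits.txt", "w") as fp:
    fp.write("5\t3\t2\n")  # train_size, val_size, test_size
    for index in indices:
        fp.write(repr(int(index)) + "\n")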
Example 2
def form_item_to_popularity(platform):
    folder = get_package_path() + "/data_folder/anonymized_data/" + platform
    output_file_path = (get_package_path() + "/data_folder/anonymized_data/" +
                        platform + "/item_to_popularity.txt")

    ####################################################################################################################
    # Extraction functions.
    ####################################################################################################################
    # Kept for symmetry with form_item_to_user (Example 10); this function
    # only calls anonymized_extract.calculate_targets directly.
    extraction_functions = {
        "comment_generator": anonymized_extract.comment_generator,
        "extract_comment_name": anonymized_extract.extract_comment_name,
        "extract_parent_comment_name":
            anonymized_extract.extract_parent_comment_name,
        "extract_lifetime": anonymized_extract.extract_lifetime,
        "extract_user_name": anonymized_extract.extract_user_name,
        "calculate_targets": anonymized_extract.calculate_targets,
        "anonymous_coward_name": "0",
    }

    ####################################################################################################################
    # Iterate over all videos.
    ####################################################################################################################
    input_file_path = folder + "/anonymized_data.txt"

    counter = 0

    fp = open(output_file_path, "w")

    document_gen = anonymized_extract.document_generator(input_file_path)
    for document in document_gen:
        if counter % 50 == 0:
            print(input_file_path, counter)

        targets = anonymized_extract.calculate_targets(document)

        fp.write(
            repr(targets["comments"]) + "\t" + repr(targets["users"]) + "\t" +
            repr(targets["score_wilson"]) + "\t" +
            repr(targets["controversiality_wilson"]) + "\n")
        counter += 1

    fp.close()
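Each output line holds the four popularity targets of one item, tab-separated. A hedged sketch of parsing such a line back (the values are made up):

# One line: comments, users, score_wilson, controversiality_wilson.
line = "12\t7\t0.83\t0.12\n"  # illustrative values
comments, users, score_wilson, controversiality_wilson = (
    float(value) for value in line.strip().split("\t"))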
Example 3
def read_weights(dataset):
    if dataset == "youtube":
        base_model_filepath = (get_package_path() +
                               "/data_folder/models/youtube_model.pkl")
    elif dataset == "reddit":
        base_model_filepath = (get_package_path() +
                               "/data_folder/models/reddit_model.pkl")
    else:
        raise ValueError("Invalid dataset.")

    file_path_list = [base_model_filepath + "." + repr(i) for i in range(3)]

    # Only the first of the three pickled parts is loaded; params[0] holds
    # the user embeddings. cPickle is the Python 2 pickle module.
    with open(file_path_list[0], "rb") as fin:
        params = cPickle.load(fin)

    user_embeddings = params[0]

    return user_embeddings
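A short usage sketch; that rows of the matrix correspond to anonymized user ids is our assumption, not something the snippet states:

user_embeddings = read_weights("reddit")

# Assumption: one row per anonymized user id.
user_vector = user_embeddings[42]
print(user_vector.shape)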
Example 4
__author__ = "Georgios Rizos ([email protected])"

from thread2vec.preprocessing.anonymize_datasets import anonymize_reddit_dataset, anonymize_youtube_dataset
from thread2vec.preprocessing.anonymize_datasets import form_item_to_user, form_item_to_popularity
from thread2vec.representation.utility import get_data
from thread2vec.preprocessing.handcrafted import calculate_reddit_features, calculate_youtube_features
from thread2vec.common import get_package_path

if "__main__" == __name__:
    # Anonymize raw data
    anonymize_reddit_dataset(
        get_package_path() + "/data_folder/raw_data/reddit",
        get_package_path() + "/data_folder/anonymized_data/reddit")
    anonymize_youtube_dataset(
        get_package_path() + "/data_folder/raw_data/youtube",
        get_package_path() + "/data_folder/anonymized_data/youtube")

    # Form item to responding users arrays for different time scales.
    for scale in ["post", "min", "hour", "day", "week", "inf"]:
        form_item_to_user("youtube", scale)
        form_item_to_user("reddit", scale)

    # Extract label values from raw data.
    form_item_to_popularity("youtube")
    form_item_to_popularity("reddit")

    # Calculate engineered features.
    calculate_reddit_features()
    calculate_youtube_features()

    # Store data splits.
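The script breaks off after the last comment. Based on the splits that get_data returns (Example 6) and the file layout that read_indices parses (Example 1), the missing step could look like the sketch below; treat it as a reconstruction, not the author's code.

import numpy as np

for dataset in ["youtube", "reddit"]:
    data = get_data(dataset, "week")
    train, val, test = data["data_splits"]

    with open(get_package_path() + "/data_folder/uniform_data/" +
              dataset + "/data_splits.txt", "w") as fp:
        # Header with the three split sizes, then one index per line.
        fp.write("\t".join(repr(a.size) for a in (train, val, test)) + "\n")
        for index in np.concatenate((train, val, test)):
            fp.write(repr(int(index)) + "\n")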
Example 5
def calculate_youtube_features():
    input_file_path = (get_package_path() +
                       "/data_folder/anonymized_data/youtube/anonymized_data.txt")

    ####################################################################################################################
    # Iterate over all videos.
    ####################################################################################################################
    graph_generator = form_graphs([input_file_path],
                                  item_id_set=set(range(411288)))

    features_generator = extract_features(graph_generator, "youtube")

    youtube_feature_name_list = sorted(
        get_handcrafted_feature_names("YouTube"))
    number_of_youtube_features = len(youtube_feature_name_list)

    number_of_items = 411288  # TODO: Make this readable.

    features_post = np.empty((number_of_items, number_of_youtube_features),
                             dtype=np.float32)
    features_minute = np.empty((number_of_items, number_of_youtube_features),
                               dtype=np.float32)
    features_hour = np.empty((number_of_items, number_of_youtube_features),
                             dtype=np.float32)
    features_day = np.empty((number_of_items, number_of_youtube_features),
                            dtype=np.float32)
    features_week = np.empty((number_of_items, number_of_youtube_features),
                             dtype=np.float32)
    features_inf = np.empty((number_of_items, number_of_youtube_features),
                            dtype=np.float32)

    features_dict = {
        0: features_post,
        1: features_minute,
        2: features_hour,
        3: features_day,
        4: features_week,
        5: features_inf,
    }

    counter = 0
    for features in features_generator:
        for s, snapshot in enumerate(features["snapshots"]):
            snapshot_features = snapshot["features"]

            for f, feature_name in enumerate(youtube_feature_name_list):
                features_dict[s][counter, f] = np.float32(
                    snapshot_features[feature_name])

        # Items with fewer than six snapshots: fill the remaining time scales
        # with the features of the last observed snapshot.
        if s < 5:
            for s_extra in range(s + 1, 6):
                for f, feature_name in enumerate(youtube_feature_name_list):
                    features_dict[s_extra][counter, f] = np.float32(
                        snapshot_features[feature_name])

        counter += 1

    output_folder = get_package_path() + "/data_folder/anonymized_data/youtube"
    np.save(output_folder + "/features_post", features_post)
    np.save(output_folder + "/features_minute", features_minute)
    np.save(output_folder + "/features_hour", features_hour)
    np.save(output_folder + "/features_day", features_day)
    np.save(output_folder + "/features_week", features_week)
    np.save(output_folder + "/features_inf", features_inf)
Example 6
def get_data(dataset, scale):
    if dataset == "youtube":
        item_to_userset_filepath = (
            get_package_path() +
            "/data_folder/anonymized_data/youtube/item_to_userset_" + scale + ".txt")
        anonymize_user_filepath = (
            get_package_path() +
            "/data_folder/anonymized_data/youtube/anonymize_user_" + scale + ".txt")
        popularity_filepath = (
            get_package_path() +
            "/data_folder/anonymized_data/youtube/item_to_popularity.txt")
        anonymous_coward_name = "0"
        top_users = 200001
        total_number_of_items = 516995
    elif dataset == "reddit":
        item_to_userset_filepath = (
            get_package_path() +
            "/data_folder/anonymized_data/reddit/item_to_userset_" + scale + ".txt")
        anonymize_user_filepath = (
            get_package_path() +
            "/data_folder/anonymized_data/reddit/anonymize_user_" + scale + ".txt")
        popularity_filepath = (
            get_package_path() +
            "/data_folder/anonymized_data/reddit/item_to_popularity.txt")
        anonymous_coward_name = "0"
        top_users = 20000
        total_number_of_items = 35844
    else:
        raise ValueError("Invalid dataset.")

    # Read popularity values.
    bad_popularity_items = list()
    popularity_matrix = np.empty((total_number_of_items, 4), dtype=np.float32)
    with open(popularity_filepath, "r") as fp:
        file_row = next(fp)

        item_counter = 0
        for file_row in fp:
            clean_row = file_row.strip().split("\t")

            if clean_row[0] == "None":
                popularity_matrix[item_counter, :] = np.nan
                bad_popularity_items.append(item_counter)
            else:
                popularity_matrix[item_counter, :] = [
                    float(value) for value in clean_row[:4]
                ]

            item_counter += 1
    bad_popularity_items = np.array(bad_popularity_items, dtype=np.int32)

    # Read user anonymizer.
    anonymize_user = dict()
    with open(anonymize_user_filepath, "r") as fp:
        for file_row in fp:
            clean_row = file_row.strip().split("\t")

            anonymize_user[clean_row[0]] = int(clean_row[1])
    total_number_of_users = len(anonymize_user)

    true_anonymize_user = copy.copy(anonymize_user)

    user_list = [None] * total_number_of_users

    for k, v in anonymize_user.items():
        user_list[v] = k

    anonymous_coward_within_discussion = anonymize_user[anonymous_coward_name]

    # Read item to userset.
    item_to_user_row = list()
    item_to_user_col = list()

    item_to_user_matrix = spsp.csc_matrix(
        (total_number_of_items, total_number_of_users), dtype=np.int32)

    with open(item_to_userset_filepath, "r") as fp:
        counter = 0
        for file_row in fp:
            clean_row = file_row.strip().split("\t")

            for user in clean_row[1:]:
                item_to_user_row.append(int(clean_row[0]))
                item_to_user_col.append(int(user))

            counter += 1
            if counter % 10000 == 0:
                item_to_user_row = np.array(item_to_user_row, dtype=np.int32)
                item_to_user_col = np.array(item_to_user_col, dtype=np.int32)
                item_to_user_data = np.ones_like(item_to_user_row,
                                                 dtype=np.int32)

                item_to_user_matrix_to_add = spsp.coo_matrix(
                    (item_to_user_data, (item_to_user_row, item_to_user_col)),
                    shape=(total_number_of_items, total_number_of_users))

                item_to_user_matrix_to_add = spsp.csc_matrix(
                    item_to_user_matrix_to_add)
                item_to_user_matrix = item_to_user_matrix + item_to_user_matrix_to_add

                item_to_user_row = list()
                item_to_user_col = list()

    item_to_user_row = np.array(item_to_user_row, dtype=np.int32)
    item_to_user_col = np.array(item_to_user_col, dtype=np.int32)
    item_to_user_data = np.ones_like(item_to_user_row, dtype=np.int32)

    item_to_user_matrix_to_add = spsp.coo_matrix(
        (item_to_user_data, (item_to_user_row, item_to_user_col)),
        shape=(total_number_of_items, total_number_of_users))

    item_to_user_matrix_to_add = spsp.csc_matrix(item_to_user_matrix_to_add)
    item_to_user_matrix = item_to_user_matrix + item_to_user_matrix_to_add

    if top_users is not None:
        user_to_item_distribution = item_to_user_matrix.sum(axis=0)

        user_indices_sorted = np.empty(top_users, dtype=np.int32)
        user_indices_sorted_to_add = np.argsort(user_to_item_distribution)[
            0, -top_users:]
        user_indices_sorted[:] = user_indices_sorted_to_add

        user_indices_sorted = user_indices_sorted[
            user_indices_sorted != anonymous_coward_within_discussion]

        user_indices_sorted_set = set(list(user_indices_sorted))

        filtered_item_to_user_matrix = item_to_user_matrix[:,
                                                           user_indices_sorted]

        new_user_list = list()
        new_anonymize_user = dict()
        counter = 0
        for user in user_list:
            if anonymize_user[user] in user_indices_sorted_set:
                new_user_list.append(user)
                new_anonymize_user[user] = counter
                counter += 1
        user_list = new_user_list
        anonymize_user = new_anonymize_user

    else:
        top_users = total_number_of_users
        user_to_item_distribution = np.empty(top_users, dtype=np.int32)
        user_to_item_distribution[:] = item_to_user_matrix.sum(axis=0)[0, :]

        user_indices_sorted = np.arange(user_to_item_distribution.size,
                                        dtype=np.int32)
        user_indices_sorted = user_indices_sorted[
            user_to_item_distribution > 1]

        user_indices_sorted = user_indices_sorted[
            user_indices_sorted != anonymous_coward_within_discussion]

        user_indices_sorted_set = set(list(user_indices_sorted))

        filtered_item_to_user_matrix = item_to_user_matrix[:,
                                                           user_indices_sorted]

        new_user_list = list()
        new_anonymize_user = dict()
        counter = 0
        for user in user_list:
            if anonymize_user[user] in user_indices_sorted_set:
                new_user_list.append(user)
                new_anonymize_user[user] = counter
                counter += 1
        user_list = new_user_list
        anonymize_user = new_anonymize_user

    # item_to_user_distribution = filtered_item_to_user_matrix.sum(axis=1)
    # item_to_user_distribution = item_to_user_distribution[item_to_user_distribution > 1]

    item_to_user_distribution = np.empty(total_number_of_items, dtype=np.int32)
    item_to_user_distribution[:] = filtered_item_to_user_matrix.sum(
        axis=1)[:, 0].transpose()

    item_indices_sorted = np.arange(total_number_of_items, dtype=np.int32)
    item_indices_sorted = item_indices_sorted[item_to_user_distribution > 0]

    item_indices_sorted = np.setdiff1d(item_indices_sorted,
                                       bad_popularity_items)

    filtered_item_to_user_matrix = spsp.csr_matrix(
        filtered_item_to_user_matrix)
    filtered_item_to_user_matrix = filtered_item_to_user_matrix[
        item_indices_sorted, :]

    popularity_matrix = popularity_matrix[item_indices_sorted, :]

    user_to_item_distribution = np.empty(len(anonymize_user), dtype=np.int32)
    user_to_item_distribution[:] = filtered_item_to_user_matrix.sum(
        axis=0)[0, :]

    user_indices_sorted = np.arange(user_to_item_distribution.size,
                                    dtype=np.int32)
    user_indices_sorted = user_indices_sorted[user_to_item_distribution > 0]

    user_indices_sorted = user_indices_sorted[
        user_indices_sorted != anonymous_coward_within_discussion]

    user_indices_sorted_set = set(list(user_indices_sorted))

    filtered_item_to_user_matrix = filtered_item_to_user_matrix[:,
                                                                user_indices_sorted]

    new_user_list = list()
    new_anonymize_user = dict()
    counter = 0
    for user in user_list:
        if anonymize_user[user] in user_indices_sorted_set:
            new_user_list.append(user)
            new_anonymize_user[user] = counter
            counter += 1
    user_list = new_user_list
    anonymize_user = new_anonymize_user

    true_user_id_to_user_id = dict()
    for user in user_list:
        k = true_anonymize_user[user]
        v = anonymize_user[user]
        true_user_id_to_user_id[k] = v

    index_1 = int(np.ceil(filtered_item_to_user_matrix.shape[0] * 0.5))
    index_2 = int(np.ceil(filtered_item_to_user_matrix.shape[0] * 0.75))

    index_permutation = np.random.permutation(
        np.arange(filtered_item_to_user_matrix.shape[0], dtype=np.int32))

    train = index_permutation[:index_1]
    val = index_permutation[index_1:index_2]
    test = index_permutation[index_2:]

    data_splits = (train, val, test)

    data = dict()
    data["filtered_item_to_user_matrix"] = filtered_item_to_user_matrix
    data["popularity_matrix"] = popularity_matrix
    data["item_indices_sorted"] = item_indices_sorted
    data["anonymize_user"] = anonymize_user
    data["true_user_id_to_user_id"] = true_user_id_to_user_id
    data["user_list"] = user_list
    data["number_of_items"] = filtered_item_to_user_matrix.shape[0]
    data["number_of_users"] = filtered_item_to_user_matrix.shape[1]
    data["data_splits"] = data_splits

    return data
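A sketch of consuming the returned dictionary; every key used below is set by the function above:

data = get_data("reddit", "week")

X = data["filtered_item_to_user_matrix"]  # sparse, items x retained users
y = data["popularity_matrix"]             # four target columns per item
train, val, test = data["data_splits"]

print(data["number_of_items"], data["number_of_users"])
print(X[train, :].shape, y[train, :].shape)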
Example 7
def handcrafted_features_versus_aggregation_comparison(dataset, vlad_clusters):
    method_names = list()
    results_list = list()

    train, val, test = read_indices(dataset)

    train = np.append(train, val)

    data = get_data(dataset, "week")

    y = data["popularity_matrix"]
    y_train = y[train, 2]
    y_test = y[test, 2]

    handcrafted_parameters = ["hour", "day", "week", "final"]

    X = make_features_vlad(
        dataset,
        number_of_vlad_clusters=vlad_clusters,
        filtered_item_to_user_matrix=data["filtered_item_to_user_matrix"],
        user_id_set=set(list(data["true_user_id_to_user_id"].values())),
        do_power_norm=True,
        do_l2_norm=True)

    for star in handcrafted_parameters:
        handcrafted_features = np.load(get_package_path() +
                                       "/data_folder/uniform_data/" + dataset +
                                       "/features_" + star + ".npy")

        method_names.append(star)
        X_train = handcrafted_features[train, :]
        X_test = handcrafted_features[test, :]

        model = LinearRegression().fit(X_train, y_train)

        y_pred = model.predict(X_test)

        loss = np.mean(np.power(y_pred - y_test, 2))

        print(loss)
        results_list.append(loss)

        method_names.append(star + "vlad" + repr(vlad_clusters))
        # X accumulates: from here on it holds the VLAD features plus the
        # handcrafted features of every scale processed so far.
        X = np.hstack([X, handcrafted_features])

        X_train = X[train, :]
        X_test = X[test, :]

        model = LinearRegression().fit(X_train, y_train)

        y_pred = model.predict(X_test)

        loss = np.mean(np.power(y_pred - y_test, 2))

        print(loss)
        results_list.append(loss)

    with open(
            get_package_path() + "/data_folder/uniform_data/" + dataset +
            "/handcrafted_benchmark.txt", "w") as fp:
        for name, loss in zip(method_names, results_list):
            fp.write(name + "\t" + repr(loss) + "\n")

    method_names = np.array(method_names)
    results_list = np.array(results_list)

    indices_sorted = np.argsort(results_list)
    print(method_names[indices_sorted])
    print(results_list[indices_sorted])
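Because X is reassigned with np.hstack inside the loop, the feature matrix grows by one handcrafted block per time scale. A tiny shape check makes the stacking direction explicit:

import numpy as np

vlad = np.zeros((4, 6))  # items x VLAD dimensions (illustrative sizes)
hour = np.zeros((4, 3))  # items x handcrafted features at one scale

X = np.hstack([vlad, hour])
print(X.shape)  # (4, 9): feature blocks are concatenated column-wise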
Example 8
def mean_versus_vlad_aggregation(dataset):
    method_names = list()
    vlad_parameters = [2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30]
    results_list = list()

    train, val, test = read_indices(dataset)

    train = np.append(train, val)

    data = get_data(dataset, "week")

    y = data["popularity_matrix"]
    y_train = y[train, 1]
    y_test = y[test, 1]

    ####################################################################################################################
    # Mean
    ####################################################################################################################
    method_name = "mean"

    method_names.append(method_name)
    X = make_features_mean(
        dataset,
        filtered_item_to_user_matrix=data["filtered_item_to_user_matrix"],
        user_id_set=set(list(data["true_user_id_to_user_id"].values())),
        do_power_norm=False,
        do_l2_norm=False)

    X_train = X[train, :]
    X_test = X[test, :]

    model = LinearRegression().fit(X_train, y_train)

    y_pred = model.predict(X_test)

    loss = np.mean(np.power(y_pred - y_test, 2))

    print(loss)
    results_list.append(loss)

    method_names.append(method_name + "_pnorm")
    X = make_features_mean(
        dataset,
        filtered_item_to_user_matrix=data["filtered_item_to_user_matrix"],
        user_id_set=set(list(data["true_user_id_to_user_id"].values())),
        do_power_norm=True,
        do_l2_norm=False)

    X_train = X[train, :]
    X_test = X[test, :]

    model = LinearRegression().fit(X_train, y_train)

    y_pred = model.predict(X_test)

    loss = np.mean(np.power(y_pred - y_test, 2))

    print(loss)
    results_list.append(loss)

    method_names.append(method_name + "_l2norm")
    X = make_features_mean(
        dataset,
        filtered_item_to_user_matrix=data["filtered_item_to_user_matrix"],
        user_id_set=set(list(data["true_user_id_to_user_id"].values())),
        do_power_norm=False,
        do_l2_norm=True)

    X_train = X[train, :]
    X_test = X[test, :]

    model = LinearRegression().fit(X_train, y_train)

    y_pred = model.predict(X_test)

    loss = np.mean(np.power(y_pred - y_test, 2))

    print(loss)
    results_list.append(loss)

    method_names.append(method_name + "_allnorm")
    X = make_features_mean(
        dataset,
        filtered_item_to_user_matrix=data["filtered_item_to_user_matrix"],
        user_id_set=set(list(data["true_user_id_to_user_id"].values())),
        do_power_norm=True,
        do_l2_norm=True)

    X_train = X[train, :]
    X_test = X[test, :]

    model = LinearRegression().fit(X_train, y_train)

    y_pred = model.predict(X_test)

    loss = np.mean(np.power(y_pred - y_test, 2))

    print(loss)
    results_list.append(loss)

    ####################################################################################################################
    # VLAD
    ####################################################################################################################
    method_name = "vlad"

    for vlad_clusters in vlad_parameters:

        method_names.append(method_name + repr(vlad_clusters))
        X = make_features_vlad(
            dataset,
            number_of_vlad_clusters=vlad_clusters,
            filtered_item_to_user_matrix=data["filtered_item_to_user_matrix"],
            user_id_set=set(list(data["true_user_id_to_user_id"].values())),
            do_power_norm=False,
            do_l2_norm=False)

        X_train = X[train, :]
        X_test = X[test, :]

        model = LinearRegression().fit(X_train, y_train)

        y_pred = model.predict(X_test)

        loss = np.mean(np.power(y_pred - y_test, 2))

        print(loss)
        results_list.append(loss)

        method_names.append(method_name + repr(vlad_clusters) + "_pnorm")
        X = make_features_vlad(
            dataset,
            number_of_vlad_clusters=vlad_clusters,
            filtered_item_to_user_matrix=data["filtered_item_to_user_matrix"],
            user_id_set=set(list(data["true_user_id_to_user_id"].values())),
            do_power_norm=True,
            do_l2_norm=False)

        X_train = X[train, :]
        X_test = X[test, :]

        model = LinearRegression().fit(X_train, y_train)

        y_pred = model.predict(X_test)

        loss = np.mean(np.power(y_pred - y_test, 2))

        print(loss)
        results_list.append(loss)

        method_names.append(method_name + repr(vlad_clusters) + "_l2norm")
        X = make_features_vlad(
            dataset,
            number_of_vlad_clusters=vlad_clusters,
            filtered_item_to_user_matrix=data["filtered_item_to_user_matrix"],
            user_id_set=set(list(data["true_user_id_to_user_id"].values())),
            do_power_norm=False,
            do_l2_norm=True)

        X_train = X[train, :]
        X_test = X[test, :]

        model = LinearRegression().fit(X_train, y_train)

        y_pred = model.predict(X_test)

        loss = np.mean(np.power(y_pred - y_test, 2))

        print(loss)
        results_list.append(loss)

        method_names.append(method_name + repr(vlad_clusters) + "_allnorm")
        X = make_features_vlad(
            dataset,
            number_of_vlad_clusters=vlad_clusters,
            filtered_item_to_user_matrix=data["filtered_item_to_user_matrix"],
            user_id_set=set(list(data["true_user_id_to_user_id"].values())),
            do_power_norm=True,
            do_l2_norm=True)

        X_train = X[train, :]
        X_test = X[test, :]

        model = LinearRegression().fit(X_train, y_train)

        y_pred = model.predict(X_test)

        loss = np.mean(np.power(y_pred - y_test, 2))

        print(loss)
        results_list.append(loss)

    with open(
            get_package_path() + "/data_folder/uniform_data/" + dataset +
            "/aggregation_benchmark_user.txt", "w") as fp:
        for name, loss in zip(method_names, results_list):
            fp.write(name + "\t" + repr(loss) + "\n")

    method_names = np.array(method_names)
    results_list = np.array(results_list)

    indices_sorted = np.argsort(results_list)
    print(method_names[indices_sorted])
    print(results_list[indices_sorted])
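Both the mean and the VLAD sections repeat one fit-predict-score block. A condensed helper capturing that pattern (a sketch, not part of the project API):

def evaluate_features(X, y_train, y_test, train, test):
    # Fit a linear model on the training rows and return the test MSE,
    # exactly as the repeated blocks above do.
    model = LinearRegression().fit(X[train, :], y_train)
    y_pred = model.predict(X[test, :])
    return np.mean(np.power(y_pred - y_test, 2))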
Example 9
__author__ = "Georgios Rizos ([email protected])"

from thread2vec.common import get_package_path
from thread2vec.representation.neural_embedding import Thread2Vec
from thread2vec.representation.utility import get_data

if __name__ == "__main__":
    data_folder = get_package_path() + "/data_folder"

    ####################################################################################################################
    # Run Reddit experiment.
    ####################################################################################################################
    batch_size = 64
    negative_samples = 0.4
    embedding_size = 64
    window_size = 500
    learning_rate = 1e-3
    dropout = 0.2

    dataset = "reddit"

    data = get_data(dataset, "week")
    print("Read data.")

    async_batch_size = 1000

    shuffle = True

    user_user_iterations_number = None

    number_of_vlad_clusters = 50
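The script ends before a model is built, and the Thread2Vec constructor signature is not shown here, so this sketch only inspects the loaded data, using keys that get_data (Example 6) actually returns:

    print("items:", data["number_of_items"])
    print("users:", data["number_of_users"])

    train, val, test = data["data_splits"]
    print("split sizes:", train.size, val.size, test.size)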
Example 10
def form_item_to_user(platform, time_scale):
    folder = get_package_path() + "/data_folder/anonymized_data/" + platform
    output_file_path = (get_package_path() + "/data_folder/anonymized_data/" +
                        platform + "/item_to_userset_" + time_scale + ".txt")
    anonymize_user_file_path = (get_package_path() +
                                "/data_folder/anonymized_data/" + platform +
                                "/anonymize_user_" + time_scale + ".txt")

    time_scale_in_seconds = {
        "post": 0.0,
        "min": 60.0,
        "hour": 3600.0,
        "day": 86400.0,
        "week": 604800.0,  # 7 * 86400
        "inf": sys.maxsize,
    }

    ####################################################################################################################
    # Extraction functions.
    ####################################################################################################################
    extraction_functions = {
        "comment_generator": anonymized_extract.comment_generator,
        "extract_comment_name": anonymized_extract.extract_comment_name,
        "extract_parent_comment_name":
            anonymized_extract.extract_parent_comment_name,
        "extract_lifetime": anonymized_extract.extract_lifetime,
        "extract_user_name": anonymized_extract.extract_user_name,
        "calculate_targets": anonymized_extract.calculate_targets,
        "anonymous_coward_name": "0",
    }

    ####################################################################################################################
    # Iterate over all videos.
    ####################################################################################################################
    input_file_path = folder + "/anonymized_data.txt"

    anonymize_user = dict()

    counter = 0

    fp = open(output_file_path, "w")

    document_gen = anonymized_extract.document_generator(input_file_path)
    for document in document_gen:
        if counter % 50 == 0:
            print(input_file_path, counter)

        user_set = set()

        ################################################################################################################
        # Within discussion anonymization.
        ################################################################################################################
        comment_gen = extraction_functions["comment_generator"](document)
        comment_list = [comment for comment in comment_gen]

        initial_post = comment_list[0]
        initial_timestamp = extraction_functions["extract_lifetime"](
            initial_post)

        for comment in comment_list:
            lifetime = extraction_functions["extract_lifetime"](
                comment) - initial_timestamp
            if lifetime > time_scale_in_seconds[time_scale]:
                continue

            user_name = extraction_functions["extract_user_name"](comment)
            user_id = anonymize_user.setdefault(user_name, len(anonymize_user))

            user_set.add(user_id)

        fp.write(repr(counter) + "\t" +
                 "\t".join(sorted(repr(u) for u in user_set)) + "\n")
        counter += 1

    fp.close()
    with open(anonymize_user_file_path, "w") as fp:
        for k, v in anonymize_user.items():
            fp.write(k + "\t" + repr(v) + "\n")
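For reference, one line of item_to_userset_<scale>.txt as written above carries the item counter followed by the sorted ids of every user who responded within the time scale. A parse sketch with made-up values:

line = "3\t0\t12\t45\n"  # illustrative
fields = line.strip().split("\t")
item_id = int(fields[0])
user_ids = [int(user) for user in fields[1:]]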