import numpy as np

# NOTE (assumption): h5_open, h5load_from, h5_close, get_kth_row,
# get_target_value and get_k_based_on_lifetime are project-local helpers
# (thin wrappers around pandas.HDFStore and DataFrame row selection); they
# are assumed to be importable from the surrounding package.


def load_dataset_k(dataset_k_path,
                   feature_osn_name_list,
                   branching_feature_names_list_dict,
                   usergraph_feature_names_list_dict,
                   temporal_feature_names_list_dict):
    """Load the k-observed feature matrices and the utility arrays for each
    OSN from a single HDF5 store."""
    dataset_k = dict()
    X_k_min_dict = dict()
    X_t_next_dict = dict()

    index = dict()

    h5_store = h5_open(dataset_k_path)

    for osn_name in feature_osn_name_list:
        dataset_k[osn_name] = dict()

        df = h5load_from(
            h5_store, "/data/" + osn_name +
            "/X_branching")[branching_feature_names_list_dict[osn_name]]
        index[osn_name] = list(df.index)

        dataset_k[osn_name]["X_branching"] = df.values
        dataset_k[osn_name]["X_usergraph"] = h5load_from(
            h5_store, "/data/" + osn_name +
            "/X_usergraph")[usergraph_feature_names_list_dict[osn_name]].values
        dataset_k[osn_name]["X_temporal"] = h5load_from(
            h5_store, "/data/" + osn_name +
            "/X_temporal")[temporal_feature_names_list_dict[osn_name]].values

        data_frame = h5load_from(h5_store,
                                 "/data/" + osn_name + "/utility_arrays")
        X_k_min_dict[osn_name] = data_frame["X_k_min_array"].values
        X_t_next_dict[osn_name] = data_frame["X_t_next_array"].values

    h5_close(h5_store)

    return dataset_k, X_k_min_dict, X_t_next_dict, index
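

# A minimal usage sketch for load_dataset_k. The store path and the feature
# names below are hypothetical placeholders, not values from the original
# project's configuration.
def _example_load_dataset_k():
    feature_names = {"reddit": ["comment_count", "max_depth"]}  # assumed
    dataset_k, X_k_min_dict, X_t_next_dict, index = load_dataset_k(
        "dataset_k_3.h5",  # assumed file name
        feature_osn_name_list=["reddit"],
        branching_feature_names_list_dict=feature_names,
        usergraph_feature_names_list_dict=feature_names,
        temporal_feature_names_list_dict=feature_names)
    # Each matrix has one row per post and one column per selected feature.
    return dataset_k["reddit"]["X_branching"].shape

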
def load_dataset_full(dataset_full_path, target_osn_name,
                      feature_osn_name_list, target_name_list,
                      branching_feature_names_list_dict,
                      usergraph_feature_names_list_dict,
                      temporal_feature_names_list_dict):
    """Load the fully-observed feature matrices for each OSN and the raw
    prediction targets for the target OSN from a single HDF5 store."""
    dataset_full = dict()
    dataset_full[target_osn_name] = dict()

    index = dict()

    h5_store = h5_open(dataset_full_path)

    for osn_name in feature_osn_name_list:
        # Feature OSNs other than the target also need their own sub-dict.
        if osn_name not in dataset_full:
            dataset_full[osn_name] = dict()

        df = h5load_from(
            h5_store, "/data/" + osn_name +
            "/X_branching")[branching_feature_names_list_dict[osn_name]]
        index[osn_name] = list(df.index)
        dataset_full[osn_name]["X_branching"] = df.values
        dataset_full[osn_name]["X_usergraph"] = h5load_from(
            h5_store, "/data/" + osn_name +
            "/X_usergraph")[usergraph_feature_names_list_dict[osn_name]].values
        dataset_full[osn_name]["X_temporal"] = h5load_from(
            h5_store, "/data/" + osn_name +
            "/X_temporal")[temporal_feature_names_list_dict[osn_name]].values

    data_frame = h5load_from(h5_store, "/data/" + target_osn_name + "/y_raw")
    dataset_full[target_osn_name]["y_raw"] = dict()
    for target_name in target_name_list:
        dataset_full[target_osn_name]["y_raw"][target_name] = data_frame[
            target_name].values

    h5_close(h5_store)

    return dataset_full, index
def fill_X_handcrafted_full_and_y_raw(
        dataset_full, h5_store_files, h5_keys, offset, osn_name, target_list,
        branching_feature_names_list_dict, usergraph_feature_names_list_dict,
        temporal_feature_names_list_dict, number_of_branching_features_dict,
        number_of_usergraph_features_dict, number_of_temporal_features_dict):
    """Fill the fully-observed feature matrices and raw targets in-place,
    reading the last (k == -1) snapshot of each post's handcrafted-feature
    frame."""
    for d, h5_key in enumerate(h5_keys):
        handcrafted_features_data_frame = h5load_from(h5_store_files[1],
                                                      h5_key)

        kth_row = get_kth_row(handcrafted_features_data_frame, -1,
                              branching_feature_names_list_dict[osn_name])
        dataset_full[osn_name]["X_branching"][
            offset + d, :number_of_branching_features_dict[osn_name]] = kth_row

        kth_row = get_kth_row(handcrafted_features_data_frame, -1,
                              usergraph_feature_names_list_dict[osn_name])
        dataset_full[osn_name]["X_usergraph"][
            offset + d, :number_of_usergraph_features_dict[osn_name]] = kth_row

        kth_row = get_kth_row(handcrafted_features_data_frame, -1,
                              temporal_feature_names_list_dict[osn_name])
        dataset_full[osn_name]["X_temporal"][
            offset + d, :number_of_temporal_features_dict[osn_name]] = kth_row

        for target_name in target_list:
            dataset_full[osn_name]["y_raw"][target_name][
                offset + d] = get_target_value(handcrafted_features_data_frame,
                                               target_name)
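

# get_kth_row and get_target_value are project-local helpers defined
# elsewhere. A plausible minimal reconstruction, assuming each handcrafted
# feature frame holds one row per observed-comment snapshot and one scalar
# column per feature/target (an assumption, not the actual implementation):
def _get_kth_row_sketch(data_frame, k, feature_names):
    # k == -1 selects the last, i.e. fully observed, snapshot.
    return data_frame[feature_names].iloc[k].values


def _get_target_value_sketch(data_frame, target_name):
    # Targets are read from the final snapshot of the discussion.
    return data_frame[target_name].iloc[-1]

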
def get_all_comment_lifetimes(h5_stores_and_keys, osn_focus):
    """Collect the lifetime (delay after the post) of every comment across
    all stored discussions."""
    all_comment_lifetimes_list = list()
    extend_comment_lifetimes = all_comment_lifetimes_list.extend
    for h5_store_files, h5_keys in h5_stores_and_keys:
        for h5_key in h5_keys[osn_focus]:
            timestamps_data_frame = h5load_from(h5_store_files[0], h5_key)

            timestamps_col = timestamps_data_frame["timestamp"]

            # Row 0 is the post itself; the remaining rows are its comments.
            extend_comment_lifetimes(timestamps_col.iloc[1:] -
                                     timestamps_col.iloc[0])

    return all_comment_lifetimes_list
def get_all_post_lifetimes(h5_stores_and_keys, osn_focus):
    """Collect each post's lifetime, defined as the delay of (roughly) its
    99th-percentile comment after the post itself."""
    all_post_lifetimes_list = list()
    append_post_lifetime = all_post_lifetimes_list.append
    for h5_store_files, h5_keys in h5_stores_and_keys:
        for h5_key in h5_keys[osn_focus]:
            timestamps_data_frame = h5load_from(h5_store_files[0], h5_key)

            timestamps_col = timestamps_data_frame["timestamp"]

            # A lone timestamp means the post received no comments.
            if timestamps_col.size == 1:
                index = 0
            else:
                index = int(np.ceil(0.99 * (timestamps_col.size - 1)))

            append_post_lifetime(timestamps_col.iloc[index] - timestamps_col.iloc[0])

    return all_post_lifetimes_list
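

# A quick worked example of the two lifetime notions above, on a toy
# timestamp series (hypothetical epoch seconds; row 0 is the post itself,
# later rows are its comments):
def _example_lifetimes():
    import pandas as pd
    timestamps = pd.Series([100.0, 160.0, 220.0, 400.0])
    # Comment lifetimes: each comment's delay after the post -> [60, 120, 300]
    comment_lifetimes = (timestamps.iloc[1:] - timestamps.iloc[0]).tolist()
    # Post lifetime: delay of the comment at index ceil(0.99 * (n - 1)),
    # i.e. roughly the 99th-percentile comment -> 400 - 100 = 300
    index = int(np.ceil(0.99 * (timestamps.size - 1)))
    post_lifetime = timestamps.iloc[index] - timestamps.iloc[0]
    return comment_lifetimes, post_lifetime

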
def calculate_k_based_on_lifetime(dataset_k, h5_store_files, h5_keys, offset,
                                  k, X_k_min_dict, X_t_next_dict, osn_name):
    """Update, in-place, the number of observed comments and the next-comment
    lifetime for each post, given the lifetime threshold k."""
    number_of_keys = len(h5_keys["post"])

    for d in range(number_of_keys):
        timestamps_data_frame = h5load_from(h5_store_files[0],
                                            h5_keys["post"][d])

        # A NaN next-lifetime marks a post that is already fully observed.
        if np.isnan(X_t_next_dict[osn_name][offset + d]):
            continue

        observed_comments, next_lifetime = get_k_based_on_lifetime(
            timestamps_data_frame,
            k,
            min_k=X_k_min_dict[osn_name][offset + d],
            max_k=-1)

        X_k_min_dict[osn_name][offset + d] = observed_comments
        X_t_next_dict[osn_name][offset + d] = next_lifetime
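

# get_k_based_on_lifetime is defined elsewhere in the project. Judging only
# from how its results are used above, it plausibly counts the comments
# observed within lifetime threshold k and reports when the next comment
# arrives. A hedged sketch (an assumption, not the actual implementation;
# min_k and max_k, which bound the search in the real helper, are ignored):
def _get_k_based_on_lifetime_sketch(timestamps_data_frame, k,
                                    min_k=0, max_k=-1):
    timestamps_col = timestamps_data_frame["timestamp"]
    lifetimes = timestamps_col - timestamps_col.iloc[0]
    observed_comments = int((lifetimes.iloc[1:] <= k).sum())
    if observed_comments + 1 < lifetimes.size:
        next_lifetime = lifetimes.iloc[observed_comments + 1]
    else:
        next_lifetime = np.nan  # no further comments to observe
    return observed_comments, next_lifetime

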
def fill_X_handcrafted_k_actual(dataset_k, h5_store_files, h5_keys, offset, k,
                                X_k_min_dict, X_t_next_dict,
                                branching_feature_names_list,
                                usergraph_feature_names_list,
                                temporal_feature_names_list, osn_name):
    """Fill the k-observed feature matrices in-place, reading for each post
    the snapshot row that corresponds to its observed comment count."""
    for d, h5_key in enumerate(h5_keys):
        # A count of -1 marks a post with no valid snapshot at this k.
        if X_k_min_dict[osn_name][offset + d] == -1:
            dataset_k[osn_name]["X_branching"][offset + d, :] = np.nan
            dataset_k[osn_name]["X_usergraph"][offset + d, :] = np.nan
            dataset_k[osn_name]["X_temporal"][offset + d, :] = np.nan
            continue

        handcrafted_features_data_frame = h5load_from(h5_store_files[1],
                                                      h5_key)

        kth_row = get_kth_row(handcrafted_features_data_frame,
                              X_k_min_dict[osn_name][offset + d],
                              branching_feature_names_list)
        dataset_k[osn_name]["X_branching"][offset + d, :] = kth_row

        kth_row = get_kth_row(handcrafted_features_data_frame,
                              X_k_min_dict[osn_name][offset + d],
                              usergraph_feature_names_list)
        dataset_k[osn_name]["X_usergraph"][offset + d, :] = kth_row

        kth_row = get_kth_row(handcrafted_features_data_frame,
                              X_k_min_dict[osn_name][offset + d],
                              temporal_feature_names_list)
        dataset_k[osn_name]["X_temporal"][offset + d, :] = kth_row
def load_k_evaluation_measures(store_path, number_of_folds=10):
    """Load the per-fold evaluation measures (Kendall's tau, p-value, MSE,
    top-k Jaccard and feature importances) from the results HDF5 store."""
    h5_store = h5_open(store_path + "results.h5")

    kendall_tau_keys = [
        "/data/kendall_tau/fold" + str(fold_index)
        for fold_index in range(number_of_folds)
    ]
    p_value_keys = [
        "/data/p_value/fold" + str(fold_index)
        for fold_index in range(number_of_folds)
    ]
    mse_keys = [
        "/data/mse/fold" + str(fold_index)
        for fold_index in range(number_of_folds)
    ]
    jaccard_keys = [
        "/data/top_k_jaccard/fold" + str(fold_index)
        for fold_index in range(number_of_folds)
    ]
    feature_importances_keys = [
        "/data/feature_importances/fold" + str(fold_index)
        for fold_index in range(number_of_folds)
    ]

    if (len(kendall_tau_keys) != len(p_value_keys)) or \
            (len(kendall_tau_keys) != len(feature_importances_keys)):
        raise RuntimeError("Fold number differs across evaluation measures.")

    number_of_folds = len(feature_importances_keys)
    data_frame = h5load_from(h5_store, feature_importances_keys[0])
    k_list = data_frame.index
    number_of_samples = k_list.size
    feature_names_list = data_frame.columns
    number_of_features = len(feature_names_list)

    kendall_tau_array = np.empty((number_of_samples, number_of_folds),
                                 dtype=np.float64)
    p_value_array = np.empty((number_of_samples, number_of_folds),
                             dtype=np.float64)

    mean_square_error = np.empty((number_of_samples, number_of_folds),
                                 dtype=np.float64)

    top_k_jaccard = np.empty((number_of_samples, number_of_folds),
                             dtype=np.float64)

    feature_importances_array = np.empty(
        (number_of_samples, number_of_folds, number_of_features),
        dtype=np.float64)

    for f in range(number_of_folds):
        kendall_tau_key = kendall_tau_keys[f]
        p_value_key = p_value_keys[f]
        mse_key = mse_keys[f]
        jaccard_key = jaccard_keys[f]
        feature_importances_key = feature_importances_keys[f]

        kendall_tau_data_frame = h5load_from(h5_store, kendall_tau_key)
        p_value_data_frame = h5load_from(h5_store, p_value_key)
        mse_data_frame = h5load_from(h5_store, mse_key)
        jaccard_data_frame = h5load_from(h5_store, jaccard_key)
        feature_importances_data_frame = h5load_from(h5_store,
                                                     feature_importances_key)

        kendall_tau_array[:, f] = np.squeeze(kendall_tau_data_frame.values)
        p_value_array[:, f] = np.squeeze(p_value_data_frame.values)
        mean_square_error[:, f] = np.squeeze(mse_data_frame.values)
        top_k_jaccard[:, f] = np.squeeze(jaccard_data_frame.values)
        try:
            feature_importances_array[:, f, :] = np.squeeze(
                feature_importances_data_frame.values)
        except ValueError:
            # np.squeeze also drops wanted singleton axes (e.g. a single k
            # or a single feature); fall back to the unsqueezed values.
            feature_importances_array[:, f, :] = \
                feature_importances_data_frame.values

    h5_close(h5_store)

    k_evaluation_measures = (kendall_tau_array, p_value_array,
                             mean_square_error, top_k_jaccard,
                             feature_importances_array)

    return k_list, k_evaluation_measures, feature_names_list
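

# A minimal sketch of consuming the returned measures, e.g. averaging the
# Kendall's tau scores across folds for each k (the store path below is a
# hypothetical placeholder):
def _example_summarize_k_measures(store_path="results/"):
    k_list, k_evaluation_measures, feature_names_list = \
        load_k_evaluation_measures(store_path)
    kendall_tau_array = k_evaluation_measures[0]  # shape: (n_k, n_folds)
    mean_tau = kendall_tau_array.mean(axis=1)
    std_tau = kendall_tau_array.std(axis=1)
    return dict(zip(k_list, zip(mean_tau, std_tau)))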