def extracter(customer, period, method=3, cutoff=20):
    """
    Generates a baseline of percentages for each action for each group per user.

    :param customer: customer identifier
    :type customer: int
    :param period: period identifier
    :type period: int
    :param method: which difference method to use (forwarded to difference_maker)
    :type method: int
    :param cutoff: if for every group the percentage of users that use the action is lower than the cutoff, the
                   action is removed
    :type cutoff: int
    :return: None; the baseline table is written to a "baseline_<method>" file
    :rtype: None
    """
    path = sup.PatternPath(customer, period)
    route = path.expertise_file()
    group_df = pd.read_csv(route)
    group_df.columns = ["Username", "Group"]
    log_df = reader.parsed_filter(reader.read_parsed(path.log_file()))
    # log_df.drop("Src", 1, inplace=True)
    log_df_mpr = mpr_merger(log_df)
    df_merged = log_df_mpr.merge(group_df, on=["Username"])
    count = df_merged.groupby("Group")["Username"].nunique()
    result, method, cutoff = difference_maker(df_merged, method, cutoff)
    result.index = result.rename(
        index=lambda x: x + "-[" + str(count[x]) + "]").index
    with open(path.destination("baseline_" + method), "w") as out_f:
        out_f.write(result.to_string(index=True))
        out_f.close()
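

# `mpr_merger` is defined elsewhere in the project; the sketch below is a
# hypothetical stand-in, assuming it collapses the Method, Path and Resource
# columns of the parsed log into a single "MPR" action column (mirroring
# miner.merge_path_method_resource used further down) while keeping the
# Username and Src columns intact.
def mpr_merger_sketch(log_df):
    """Hypothetical stand-in for mpr_merger (assumed behaviour)."""
    df = log_df.copy()
    # Concatenate the three request components into one action label.
    df["MPR"] = df["Method"] + "_" + df["Path"] + "_" + df["Resource"]
    return df.drop(columns=["Method", "Path", "Resource"])

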
def group_extractor(customer, period, method=3, cutoff=20, algorithm="ACL"):
    """
    Generates a file that shows the percentage of users in a group that made an action.
    :param customer: customer identifier
    :type customer: int
    :param period: period identifier
    :type period: int
    :param method: which difference method to use (forwarded to difference_maker)
    :type method: int
    :param cutoff: if for every group the percentage of users that use the action is lower than the cutoff, the
                   action is removed
    :type cutoff: int
    :param algorithm: name of the grouping algorithm, for example "ACL"
    :type algorithm: str
    :return: None; the table is written to a file derived from path.get_filename()
    :rtype: None
    """
    path = sup.GroupPatternPath(customer, period, algorithm)
    grouping_route = path.grouping_file()
    grouping_df = pd.read_csv(grouping_route)
    expertise_df = pd.read_csv(path.expertise_file())
    expertise_df.columns = ["Username", "Group"]
    log_df = parsed_filter(reader.read_parsed(path.log_file()))
    # log_df.drop("Src", 1, inplace=True)
    log_df_mpr = mpr_merger(log_df)
    df_merged = log_df_mpr.merge(grouping_df, on=["Username"])
    count = df_merged.groupby("Group")["Username"].nunique()
    result, method, cutoff = difference_maker(df_merged, method, cutoff)
    result.index = result.rename(
        index=lambda x: str(x) + "-[" + str(count[x]) + "]").index

    with open(path.destination(path.get_filename() + method), "w") as out_f:
        out_f.write(result.to_string(index=True))
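

# `difference_maker` is also defined elsewhere; the sketch below captures only
# the contract assumed by the callers in this file: a table with one row per
# group and one column per action, holding the percentage of users in the
# group that performed the action, plus a string name for the method and the
# (possibly adjusted) cutoff. This is an assumption, not the real
# implementation.
def difference_maker_sketch(df_merged, method, cutoff):
    """Hypothetical stand-in for difference_maker (assumed behaviour)."""
    # Count, per group, how many distinct users performed each action.
    used = (df_merged.drop_duplicates(["Group", "Username", "MPR"])
            .assign(flag=1)
            .pivot_table(index="Group", columns="MPR", values="flag",
                         aggfunc="sum", fill_value=0))
    group_sizes = df_merged.groupby("Group")["Username"].nunique()
    result = used.div(group_sizes, axis=0) * 100
    # Drop actions whose percentage stays below the cutoff in every group.
    result = result.loc[:, (result >= cutoff).any(axis=0)]
    return result, "method" + str(method), cutoff

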
def interest_checker(customer, period, expertise, resource_type, resource):
    """
    Return a dataframe containing only the requests for a specific resource made by users with a given expertise.
    :param customer: customer identifier
    :type customer: int
    :param period: period identifier
    :type period: int
    :param expertise: for example "Caretaker"
    :type expertise: str
    :param resource_type: name of the log column to filter on, for example "Src"
    :type resource_type: str
    :param resource: for example "Dossier"
    :type resource: str
    :return: the filtered requests
    :rtype: dataframe
    """
    pd.set_option('display.width', 1000)
    path = sup.InterestPatternPath(customer, period)
    group_df = pd.read_csv(path.expertise_file())
    group_df.columns = ["Username", "Group"]
    print(path.log_file())
    log_df = reader.read_parsed(path.log_file())
    log_df_mpr = mpr_merger(log_df)
    selected_expertise = group_df.loc[group_df["Group"] == expertise]
    merged_df = pd.merge(log_df_mpr,
                         selected_expertise,
                         how="inner",
                         on=["Username"])
    selected_resource = merged_df.loc[merged_df[resource_type] == resource]
    return selected_resource
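

# Example usage of interest_checker (hypothetical customer/period ids; the
# expertise and resource values mirror the docstring examples):
#
#     dossier_requests = interest_checker(1, 42, "Caretaker", "Src", "Dossier")
#     print(dossier_requests.head())

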
def interest_finder(customer, period, method=4, cutoff=0.5):
    """
    Checks for actions that only a small percentage of users in a group perform, starting from the baseline.
    :param customer: customer identifier
    :type customer: int
    :param period: period identifier
    :type period: int
    :param method: which difference method to use (forwarded to difference_maker)
    :type method: int
    :param cutoff: upper bound on the per-group percentage for an action to count as an interest
    :type cutoff: float
    :return: None; the baseline and the matching requests are written to files
    :rtype: None
    """
    pd.set_option('display.width', 1000)
    path = sup.PatternPath(customer, period)
    route = path.expertise_file()
    group_df = pd.read_csv(route)
    group_df.columns = ["Username", "Group"]
    log_df = reader.read_parsed(path.log_file())
    log_df_mpr = mpr_merger(log_df)
    print("done with merging")
    log_df_mpr_filtered = parsed_filter(log_df_mpr.copy())
    df_merged = log_df_mpr_filtered.merge(group_df, on=["Username"])
    log_df_mpr_filtered = None  # release the filtered frame; it is no longer needed
    count = df_merged.groupby("Group")["Username"].nunique()
    print("count: ", count)
    result, method, cutoff = difference_maker(df_merged, method, cutoff)
    df_merged = None  # release the merged frame
    with open(path.destination("baseline_" + method), "w") as out_f:
        out_f.write(result.to_string(index=True))
        out_f.close()
    with open(path.destination("baseline_" + method + "_finds"),
              "w") as out_f2:
        for col in result.columns.values:
            # Determine which log column the action value came from.
            if log_df_mpr["Src"].isin([col]).any():
                c_name = "Src"
            elif log_df_mpr["MPR"].isin([col]).any():
                c_name = "MPR"
            else:
                continue  # the value occurs in neither column; skip it
            # Use the cutoff returned by difference_maker rather than a
            # hard-coded 0.5 (which merely duplicated the default).
            interest = result[((result[col] > 0) & (result[col] <= cutoff))][col]
            for group in interest.index.values:
                selected_expertise = group_df.loc[group_df["Group"] == group]
                selected_merge = pd.merge(log_df_mpr,
                                          selected_expertise,
                                          how="inner",
                                          on=["Username"])
                selected_resource = selected_merge.loc[selected_merge[c_name]
                                                       == col]
                out_f2.write("Group: " + group + "/Resource: " + col +
                             "/Value: " + str(interest[group]) + "\n")
                out_f2.write(
                    selected_resource.to_string(index=False, header=False) +
                    "\n\n")
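

# Example usage of interest_finder (hypothetical ids; writes the baseline and
# a "baseline_<method>_finds" report via path.destination):
#
#     interest_finder(1, 42, method=4, cutoff=0.5)

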
def figure_extracter(customer, period, method=3, cutoff=20):
    """
    Generates a bar chart of the requests made by users per group:
      - x-axis: the expertise group
      - y-axis: the percentage of users that made the request
      - each color represents a type of request
    :param customer: customer identifier
    :type customer: int
    :param period: period identifier
    :type period: int
    :param method: which difference method to use (forwarded to difference_maker)
    :type method: int
    :param cutoff: forwarded to difference_maker
    :type cutoff: int
    :return: None; the chart is displayed with matplotlib
    :rtype: None
    """
    path = sup.PatternPath(customer, period)
    expertise_df = pd.read_csv(path.expertise_file())
    expertise_df.columns = ["Username", "Group"]
    log_df = parsed_filter(reader.read_parsed(path.log_file()))
    log_df = log_df[log_df.Src != "HubDeployment"]
    log_df_mpr = mpr_merger(log_df)
    df_merged = log_df_mpr.merge(expertise_df, on=["Username"])
    count = df_merged.groupby("Group")["Username"].nunique()
    print(count)
    result, method, cutoff = difference_maker(df_merged, method, cutoff)
    keep_mask = count > 20  # only plot groups with more than 20 users
    result = result[keep_mask]
    print(result)
    result.index = result.rename(
        index=lambda x: x + "-[" + str(count[x]) + "]").index
    import matplotlib.pyplot as plt  # local import keeps matplotlib optional for the other helpers
    fig, ax = plt.subplots(1)
    fig.autofmt_xdate()
    result.plot(kind="bar", ax=ax)
    plt.show()
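

# Example usage of figure_extracter (hypothetical ids; opens an interactive
# matplotlib window; swap plt.show() for fig.savefig(...) to save the chart
# instead):
#
#     figure_extracter(1, 42, method=3, cutoff=20)

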
def similarity_finder(path, labels, users, info_df=None):
    """
    Finds similarity between users within each cluster and writes a per-group report.
    :param path: path helper that locates the log and expertise files
    :type path: path
    :param labels: cluster label per user
    :type labels: list
    :param users: dataframe with a "users" column, aligned with labels
    :type users: dataframe
    :param info_df: clustering metadata ("Branching factor", "Threshold", "Group-amount")
    :type info_df: dataframe
    :return: None; one report file is written per selected group
    :rtype: None
    """
    import Support.reader as reader
    bg_name = "test"
    if info_df is not None:
        bg_name = "bf:" + str(info_df["Branching factor"].values[0]) + "_T:" + str(
            info_df["Threshold"].values[0]) + "G-a:" +\
                  str(info_df["Group-amount"].values[0])
    uni_cl = np.unique(labels, return_counts=True)
    selected_nums = np.where(np.logical_and(uni_cl[1] >= 10, uni_cl[1] <= 50))[0]
    user_data = {}
    group_data = {}
    for i in range(len(np.unique(labels))):
        group_data[i] = []
    for i in range(len(labels)):
        # .get_value() was removed from pandas; .at is the replacement.
        user_data[users.at[i, "users"]] = labels[i]
        group_data[labels[i]].append(users.at[i, "users"])

    df = reader.read_parsed(path.log_file())
    expertise_df = pd.read_csv(path.expertise_file())
    expertise_df.columns = ["Username", "Group"]
    for group in selected_nums:
        selected_num = group
        selected_users = group_data[selected_num]
        print("Amount of selected users:", selected_users)
        selected_expertise_df = expertise_df.loc[expertise_df["Username"].isin(selected_users)]
        count_selected_expertise_users = len(selected_expertise_df.index)
        selected_expertise_df = selected_expertise_df["Group"].value_counts(normalize=True)*100
        gdf = df.loc[df["Username"].isin(selected_users)]
        count = gdf[(gdf['Path'].str.contains("client", case=False) |
                     gdf['Resource'].str.contains("client", case=False))].nunique()
        gdf = gdf.drop("Id", 1)
        gdf = gdf.drop("Date", 1)
        gdf = miner.merge_path_method_resource(gdf)
        column_names = gdf.columns.values.tolist()
        column_names.remove('Username')
        result_df = pd.get_dummies(gdf, columns=column_names, prefix="", prefix_sep="") \
            .groupby(gdf["Username"])
        result_df_sum = result_df.sum()
        result_df_agg = result_df.agg("max").drop(columns="Username")
        if len(result_df_sum.index) != count_selected_expertise_users:
            print("users in group: ", len(result_df_sum.index))
            print("users in expertise_group: ", count_selected_expertise_users)
            print("Missing " + str(len(result_df_sum.index) - count_selected_expertise_users) + " users!")
        with open(path.similarity_destination("txt", bg_name+"_similarities", str(group)), "w") as out_f:
            out_f.write("Group_size="+str(len(result_df_sum.index))+"\n")
            out_f.write("Amount of ClientIds: " + str(count["Id"])+"\n")
            out_f.write("Expertise_division:\n")
            out_f.write(selected_expertise_df.to_string()+"\n")
            out_f.write("Group_summary:\n")
            out_f.write(result_df_agg.sum().apply(lambda x: (int(x)/len(result_df_sum.index))*100).
                        sort_values(ascending=False).to_string())
            out_f.write("\n\nUser_information\n")
            for index, row in result_df_sum.iterrows():
                out_f.write("user: "******"\n")
                out_f.write(row.iloc[row.nonzero()].sort_values(ascending=False).to_string()+"\n\n")
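

# Example wiring for similarity_finder -- a sketch only. The "Branching
# factor" and "Threshold" fields of info_df suggest the labels come from a
# BIRCH clustering; `features`, `usernames` and the construction of `users`
# below are hypothetical placeholders for whatever the caller prepared:
#
#     from sklearn.cluster import Birch
#
#     birch = Birch(branching_factor=50, threshold=0.5, n_clusters=None)
#     labels = birch.fit_predict(features)
#     users = pd.DataFrame({"users": usernames})
#     info_df = pd.DataFrame({"Branching factor": [50], "Threshold": [0.5],
#                             "Group-amount": [len(np.unique(labels))]})
#     similarity_finder(path, labels, users, info_df=info_df)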