def extracter(customer, period, method=3, cutoff=20):
    """Generate a baseline of percentages for each action for each group per user.

    Writes the baseline table to ``path.destination("baseline_" + method)``.

    :param customer: customer identifier used to locate the data files
    :type customer: int
    :param period: period identifier used to locate the data files
    :type period: int
    :param method: selector forwarded to ``difference_maker``
    :type method: int
    :param cutoff: if for every group the percentage of users that uses the
        action is lower than the cutoff, the action is removed
    :type cutoff: int
    :return: None; the result is written to disk
    :rtype: None
    """
    path = sup.PatternPath(customer, period)
    group_df = pd.read_csv(path.expertise_file())
    group_df.columns = ["Username", "Group"]
    log_df = reader.parsed_filter(reader.read_parsed(path.log_file()))
    log_df_mpr = mpr_merger(log_df)
    df_merged = log_df_mpr.merge(group_df, on=["Username"])
    # Distinct users per group, appended to each row label below.
    count = df_merged.groupby("Group")["Username"].nunique()
    # difference_maker may normalize method/cutoff; use the returned values.
    result, method, cutoff = difference_maker(df_merged, method, cutoff)
    # str(x) guards against non-string group labels, consistent with
    # group_extractor.
    result.index = result.rename(
        index=lambda x: str(x) + "-[" + str(count[x]) + "]").index
    # The with-statement closes the file; no explicit close() is needed.
    with open(path.destination("baseline_" + method), "w") as out_f:
        out_f.write(result.to_string(index=True))
def group_extractor(customer, period, method=3, cutoff=20, algorithm="ACL"):
    """Generate a file showing the percentage of users in a group that made an action.

    Writes the table to ``path.destination(path.get_filename() + method)``.

    :param customer: customer identifier used to locate the data files
    :type customer: int
    :param period: period identifier used to locate the data files
    :type period: int
    :param method: selector forwarded to ``difference_maker``
    :type method: int
    :param cutoff: if for every group the percentage of users that uses the
        action is lower than the cutoff, the action is removed
    :type cutoff: int
    :param algorithm: grouping algorithm name used to pick the grouping file
    :type algorithm: str
    :return: None; the result is written to disk
    :rtype: None
    """
    path = sup.GroupPatternPath(customer, period, algorithm)
    grouping_df = pd.read_csv(path.grouping_file())
    # expertise_df is loaded for parity with sibling functions; only the
    # grouping file is merged below.
    expertise_df = pd.read_csv(path.expertise_file())
    expertise_df.columns = ["Username", "Group"]
    log_df = parsed_filter(reader.read_parsed(path.log_file()))
    log_df_mpr = mpr_merger(log_df)
    df_merged = log_df_mpr.merge(grouping_df, on=["Username"])
    # Distinct users per group, appended to each row label below.
    count = df_merged.groupby("Group")["Username"].nunique()
    result, method, cutoff = difference_maker(df_merged, method, cutoff)
    result.index = result.rename(
        index=lambda x: str(x) + "-[" + str(count[x]) + "]").index
    # The with-statement closes the file; no explicit close() is needed.
    with open(path.destination(path.get_filename() + method), "w") as out_f:
        out_f.write(result.to_string(index=True))
def interest_checker(customer, period, expertise, resource_type, resource):
    """Return a dataframe consisting only of requests of a certain variable for a specific expertise.

    :param customer: customer identifier used to locate the data files
    :type customer: int
    :param period: period identifier used to locate the data files
    :type period: int
    :param expertise: for example "Caretaker"
    :type expertise: str
    :param resource_type: for example "SRC"
    :type resource_type: str
    :param resource: for example "Dossier"
    :type resource: str
    :return: rows of the merged log restricted to the given expertise and
        whose ``resource_type`` column equals ``resource``
    :rtype: dataframe
    """
    pd.set_option('display.width', 1000)
    path = sup.InterestPatternPath(customer, period)
    # Debug trace of the log file being processed (originally printed twice;
    # once is sufficient).
    print(path.log_file())
    group_df = pd.read_csv(path.expertise_file())
    group_df.columns = ["Username", "Group"]
    log_df = reader.read_parsed(path.log_file())
    log_df_mpr = mpr_merger(log_df)
    # Keep only users belonging to the requested expertise.
    selected_expertise = group_df.loc[group_df["Group"] == expertise]
    merged_df = pd.merge(log_df_mpr, selected_expertise, how="inner",
                         on=["Username"])
    selected_resource = merged_df.loc[merged_df[resource_type] == resource]
    return selected_resource
def interest_finder(customer, period, method=4, cutoff=0.5):
    """Check for actions that only a small percentage of users make in a baseline.

    Writes two files: the full baseline table, and a "finds" file listing
    every (group, resource) pair whose baseline value is in ``(0, cutoff]``
    together with the matching raw log rows.

    :param customer: customer identifier used to locate the data files
    :type customer: int
    :param period: period identifier used to locate the data files
    :type period: int
    :param method: selector forwarded to ``difference_maker``
    :type method: int
    :param cutoff: upper bound on the baseline value for an action to be
        considered "interesting"
    :type cutoff: float
    :return: None; results are written to disk
    :rtype: None
    """
    pd.set_option('display.width', 1000)
    path = sup.PatternPath(customer, period)
    group_df = pd.read_csv(path.expertise_file())
    group_df.columns = ["Username", "Group"]
    log_df = reader.read_parsed(path.log_file())
    log_df_mpr = mpr_merger(log_df)
    print("done with merging")
    log_df_mpr_filtered = parsed_filter(log_df_mpr.copy())
    df_merged = log_df_mpr_filtered.merge(group_df, on=["Username"])
    log_df_mpr_filtered = None  # release the intermediate frame
    count = df_merged.groupby("Group")["Username"].nunique()
    print("count: ", count)
    # difference_maker may normalize method/cutoff; use the returned values.
    result, method, cutoff = difference_maker(df_merged, method, cutoff)
    df_merged = None  # release the merged frame before the second pass
    with open(path.destination("baseline_" + method), "w") as out_f:
        out_f.write(result.to_string(index=True))
    with open(path.destination("baseline_" + method + "_finds"), "w") as out_f2:
        for col in result.columns.values:
            # Determine which log column this baseline column came from.
            if log_df_mpr["Src"].isin([col]).any():
                c_name = "Src"
            elif log_df_mpr["MPR"].isin([col]).any():
                c_name = "MPR"
            else:
                # Previously c_name stayed unbound here and raised a
                # NameError; skip columns not present in either log column.
                continue
            # Use the cutoff parameter instead of the former hard-coded 0.5.
            interest = result[((result[col] > 0) & (result[col] <= cutoff))][col]
            for group in interest.index.values:
                selected_expertise = group_df.loc[group_df["Group"] == group]
                selected_merge = pd.merge(log_df_mpr, selected_expertise,
                                          how="inner", on=["Username"])
                selected_resource = selected_merge.loc[
                    selected_merge[c_name] == col]
                out_f2.write("Group: " + group + "/Resource: " + col +
                             "/Value: " + str(interest[group]) + "\n")
                out_f2.write(
                    selected_resource.to_string(index=False, header=False) +
                    "\n\n")
def figure_extracter(customer, period, method=3, cutoff=20, min_group_size=20):
    """Generate a bar chart of requests that are made by users per group.

    x-axis is the expertise, y-axis is the percentage of users that made the
    request; each color represents a type of request.

    :param customer: customer identifier used to locate the data files
    :type customer: int
    :param period: period identifier used to locate the data files
    :type period: int
    :param method: selector forwarded to ``difference_maker``
    :type method: int
    :param cutoff: forwarded to ``difference_maker``
    :type cutoff: int
    :param min_group_size: groups with this many users or fewer are dropped
        from the chart (was a hard-coded 20)
    :type min_group_size: int
    :return: None; shows the chart interactively
    :rtype: None
    """
    path = sup.PatternPath(customer, period)
    expertise_df = pd.read_csv(path.expertise_file())
    expertise_df.columns = ["Username", "Group"]
    log_df = parsed_filter(reader.read_parsed(path.log_file()))
    # HubDeployment traffic is machine-generated noise for this chart.
    log_df = log_df[log_df.Src != "HubDeployment"]
    log_df_mpr = mpr_merger(log_df)
    df_merged = log_df_mpr.merge(expertise_df, on=["Username"])
    count = df_merged.groupby("Group")["Username"].nunique()
    print(count)
    result, method, cutoff = difference_maker(df_merged, method, cutoff)
    # Drop small groups so the chart stays readable.
    result = result[count > min_group_size]
    print(result)
    result.index = result.rename(
        index=lambda x: str(x) + "-[" + str(count[x]) + "]").index
    # Imported locally so the module does not require matplotlib unless
    # plotting is actually requested.
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots(1)
    fig.autofmt_xdate()
    result.plot(kind="bar", ax=ax)
    plt.show()
def similarity_finder(path, labels, users, info_df=None):
    """Find similarity between users within clustering groups.

    For every cluster of size 10-50, writes a text report containing the
    group size, the count of distinct client ids touched, the expertise
    division, a group summary (percentage of users performing each action)
    and a per-user breakdown.

    :param path: project path object providing ``log_file``,
        ``expertise_file`` and ``similarity_destination``
    :type path: path
    :param labels: cluster label per user (parallel to ``users``)
    :type labels: list
    :param users: dataframe with a "users" column, indexed 0..n-1
    :type users: list
    :param info_df: optional clustering metadata used to build the file name
    :type info_df: dataframe
    :return: None; reports are written to disk
    :rtype: None
    """
    import Support.reader as reader
    bg_name = "test"
    if info_df is not None:
        bg_name = "bf:" + str(info_df["Branching factor"].values[0]) + "_T:" + str(
            info_df["Threshold"].values[0]) + "G-a:" + \
            str(info_df["Group-amount"].values[0])
    # Only report on clusters with between 10 and 50 members.
    uni_cl = np.unique(labels, return_counts=True)
    selected_nums = np.where(np.logical_and(uni_cl[1] >= 10, uni_cl[1] <= 50))[0]
    user_data = {}
    group_data = {}
    for i in range(len(np.unique(labels))):
        group_data[i] = []
    # NOTE: get_value is deprecated in modern pandas; kept to match the
    # pandas version this file targets.
    for i in range(len(labels)):
        user_data[users.get_value(i, "users")] = labels[i]
        group_data[labels[i]].append(users.get_value(i, "users"))
    df = reader.read_parsed(path.log_file())
    expertise_df = pd.read_csv(path.expertise_file())
    expertise_df.columns = ["Username", "Group"]
    for group in selected_nums:
        selected_num = group
        selected_users = group_data[selected_num]
        # Print the count, not the full user list (was printing the list).
        print("Amount of selected users:", len(selected_users))
        selected_expertise_df = expertise_df.loc[
            expertise_df["Username"].isin(selected_users)]
        count_selected_expertise_users = len(selected_expertise_df.index)
        selected_expertise_df = selected_expertise_df["Group"].value_counts(
            normalize=True) * 100
        gdf = df.loc[df["Username"].isin(selected_users)]
        # Count distinct values in rows whose Path or Resource mentions
        # "client" (case-insensitive); count["Id"] is used below.
        count = gdf[(gdf['Path'].str.contains("client", case=False) |
                     gdf['Resource'].str.contains("client", case=False))].nunique()
        gdf = gdf.drop("Id", 1)
        gdf = gdf.drop("Date", 1)
        gdf = miner.merge_path_method_resource(gdf)
        column_names = gdf.columns.values.tolist()
        column_names.remove('Username')
        # One-hot encode every action column, then group per user.
        result_df = pd.get_dummies(gdf, columns=column_names, prefix="",
                                   prefix_sep="").groupby(gdf["Username"])
        result_df_sum = result_df.sum()
        result_df_agg = result_df.agg("max").drop("Username", 1)
        if len(result_df_sum.index) != count_selected_expertise_users:
            print("users in group: ", len(result_df_sum.index))
            print("users in expertise_group: ", count_selected_expertise_users)
            print("Missing " + str(len(result_df_sum.index) -
                                    count_selected_expertise_users) + " users!")
        with open(path.similarity_destination(
                "txt", bg_name + "_similarities", str(group)), "w") as out_f:
            out_f.write("Group_size=" + str(len(result_df_sum.index)) + "\n")
            out_f.write("Amount of ClientIds: " + str(count["Id"]) + "\n")
            out_f.write("Expertise_division:\n")
            out_f.write(selected_expertise_df.to_string() + "\n")
            out_f.write("Group_summary:\n")
            out_f.write(result_df_agg.sum().apply(
                lambda x: (int(x) / len(result_df_sum.index)) * 100).
                sort_values(ascending=False).to_string())
            out_f.write("\n\nUser_information\n")
            for index, row in result_df_sum.iterrows():
                # NOTE(review): the original line was garbled/redacted
                # ('"user: "******"\n"'); restored to the evident intent of
                # writing the username (the groupby index) — confirm.
                out_f.write("user: " + str(index) + "\n")
                out_f.write(row.iloc[row.nonzero()].sort_values(
                    ascending=False).to_string() + "\n\n")