def prepare_rank_feature(transactions, ds):
    """Add a PageRank feature column to ``ds`` in place.

    A graph is built from ``transactions`` via ``session_transition_graph``
    and ranked with ``nx.pagerank``. Every row of ``ds`` whose ``item_id``
    matches exactly one row of the module-level ``items`` frame, and whose
    item ``url`` is a key of the PageRank result, receives that rank in the
    ``PAGE_RANK_FEATURE`` column. All other rows keep the default ``0.0``.

    Args:
        transactions: Input passed unchanged to ``session_transition_graph``
            (presumably the session/transaction log -- confirm with caller).
        ds: DataFrame with an ``item_id`` column; it is modified in place.

    Returns:
        None. The result is the new/overwritten ``PAGE_RANK_FEATURE`` column.
    """
    g = session_transition_graph(transactions)
    page_ranks = nx.pagerank(g)
    print(len(page_ranks))

    # Start every row at 0.0 so rows without a rank have a defined value.
    ds.loc[:, PAGE_RANK_FEATURE] = pd.Series(np.zeros(len(ds)), index=ds.index)

    # Build the item_id -> url lookup once instead of scanning `items` for
    # every row. Only ids that occur exactly once are kept, which mirrors the
    # "exactly one matching item row" rule.
    unique_items = items.drop_duplicates(subset='item_id', keep=False)
    url_by_item_id = dict(zip(unique_items.item_id, unique_items.url))

    # itertuples() puts the index label at position 0, so columns shift by 1.
    item_idx = ds.columns.get_loc('item_id') + 1
    for t in ds.itertuples():
        item_url = url_by_item_id.get(t[item_idx])
        if item_url is not None and item_url in page_ranks:
            # Cell assignment must go through .loc: `ds[row, col] = ...`
            # would create a new column keyed by the tuple (row, col).
            ds.loc[t[0], PAGE_RANK_FEATURE] = page_ranks[item_url]
# Group the raw log by user and count how many log rows each user has.
user_sessions = log.groupby('user_id')
user_session_lengths = [
    len(row_labels) for row_labels in user_sessions.groups.values()
]

# session length -> number of users with exactly that many log rows.
user_session_lengths_distribution = Counter(user_session_lengths)
user_df = pd.DataFrame(
    list(user_session_lengths_distribution.items()),
    columns=['session_length', 'user_cnt'],
)

# Plot distribution of counts for session lengths
# plt.hist(user_df.session_length, list(range(1, 200)), weights=user_df.user_cnt); plt.show()

G = session_transition_graph(log)
# Sanity checks; the expected sizes are presumably specific to the dataset
# this script was written against -- update them if the input log changes.
assert G.number_of_nodes() == 14457
assert G.number_of_edges() == 27315

# Plot graph of user sessions parcours
# pos = nx.spring_layout(G); nx.draw_networkx(G, pos, with_labels=False, node_size=1); plt.show()

print("degree_assortativity_coefficient %2.2f" % nx.degree_assortativity_coefficient(G))
print("degree_pearson_correlation_coefficient %2.2f" % nx.degree_pearson_correlation_coefficient(G))

assert not nx.is_connected(G)
assert nx.number_connected_components(G) == 171

# Edge-count histogram over connected components.
# nx.connected_component_subgraphs was removed in networkx 2.4; building the
# subgraphs from nx.connected_components works on old and new versions alike.
counter = Counter(
    G.subgraph(component).number_of_edges()
    for component in nx.connected_components(G)
)