def contribution_percentage(result=None):
    """Summarise, per year, how much developers at each k-component level contribute.

    Parameters
    ----------
    result : dict, optional
        Connectivity results keyed by year; each value must expose a
        ``'k_components'`` mapping of level -> list of components, where
        element ``[1]`` of a component is a set of nodes.  When omitted the
        pickle referenced by ``connectivity_file`` is loaded.

    Returns
    -------
    dict
        ``{year: {key: 5-tuple}}``.  For each level ``k`` (2 .. the largest
        level seen in any year) the tuple is ``(developers, % of developers,
        weighted degree, % of weighted degree, weighted degree per
        developer)``.  The ``'total'`` key holds the same layout for all
        developers of the year, but with ``1`` (not 100) in the two share
        slots.  Levels that exist for the year but contain no developer are
        reported on stdout and get no entry.
    """
    if result is None:
        result = utils.load_result_pkl(connectivity_file)

    # Highest k observed across every year, so all years share the same keys.
    top_level = max(flatten([result[year]['k_components'].keys()
                             for year in result]))

    by_year = {}
    for G in networks_by_year():
        year = G.graph['year']
        year_row = {}
        by_year[year] = year_row

        # Developers are the nodes flagged bipartite == 1.
        developers = set(node for node, attrs in G.nodes(data=True)
                         if attrs['bipartite'] == 1)
        total_weight = float(
            sum(G.degree(developers, weight='weight').values()))
        n_developers = float(len(developers))
        year_row['total'] = (n_developers, 1, total_weight, 1,
                             total_weight / n_developers)

        components = result[year]['k_components']
        for k in range(2, top_level + 1):
            if k not in components:
                # Level never reached this year: explicit all-zero entry.
                year_row[k] = (0, 0, 0, 0, 0)
                continue

            members = set.union(*[comp[1] for comp in components[k]])
            level_devs = members & developers
            if not level_devs:
                print("No developers at level {0} in year {1}".format(k, year))
                continue

            head_count = len(level_devs)
            level_weight = sum(
                G.degree(level_devs, weight='weight').values())
            year_row[k] = (
                head_count,
                (head_count / n_developers) * 100,
                level_weight,
                (level_weight / total_weight) * 100,
                level_weight / float(head_count),
            )
    return by_year
def get_developers_top_connectivity_by_year(G, year, connectivity=None):
    """Return the developers of ``G`` that belong to the highest k-component level of ``year``.

    Parameters
    ----------
    G : graph
        Network whose ``nodes(data=True)`` yields ``(node, attrs)`` pairs;
        nodes with ``attrs['bipartite'] == 1`` are treated as developers.
    year : hashable
        Key into ``connectivity``.
    connectivity : dict, optional
        ``{year: {'k_components': {k: [component, ...]}}}`` where element
        ``[1]`` of each component is the collection of its nodes.  Loaded
        from ``connectivity_file`` when omitted.

    Returns
    -------
    set
        Developers found in any component at the largest ``k`` for that
        year.  Empty when the year has no k-components at all or when the
        top level lists no components (previously these cases raised).

    Raises
    ------
    KeyError
        If ``year`` is not present in ``connectivity``.
    """
    if connectivity is None:
        connectivity = utils.load_result_pkl(connectivity_file)

    kcomponents = connectivity[year]['k_components']
    if not kcomponents:
        # max() of an empty mapping would raise ValueError.
        return set()

    max_k = max(kcomponents)
    # set().union(...) tolerates an empty component list, unlike
    # set.union(*[]), which needs at least one argument.
    top_nodes = set().union(*[c[1] for c in kcomponents[max_k]])

    all_devs = {n for n, d in G.nodes(data=True) if d['bipartite'] == 1}
    return top_nodes & all_devs
def get_layouts(project_name, kind):
    """Load the pickled layouts for a project and a time slicing.

    Parameters
    ----------
    project_name : str
        Either ``'python'`` or ``'debian'``.
    kind : str
        Either ``'years'`` or ``'releases'``.

    Returns
    -------
    object
        Whatever ``utils.load_result_pkl`` returns for the matching
        ``<project>_layouts_<kind>_file`` path.

    Raises
    ------
    Exception
        For an unknown project name, or for an unknown kind on a known
        project.  The project name is validated first.
    """
    if project_name not in ('python', 'debian'):
        raise Exception('Unknown project name {}'.format(project_name))
    if kind not in ('years', 'releases'):
        raise Exception('Unknown kind {}'.format(kind))

    yearly = kind == 'years'
    # Only the selected module-level path is ever looked up.
    if project_name == 'python':
        pickle_path = (python_layouts_years_file if yearly
                       else python_layouts_releases_file)
    else:
        pickle_path = (debian_layouts_years_file if yearly
                       else debian_layouts_releases_file)
    return utils.load_result_pkl(pickle_path)
def get_structural_cohesion_results(project_name, kind):
    """Load the pickled connectivity results for a project and a time slicing.

    Parameters
    ----------
    project_name : str
        Either ``'python'`` or ``'debian'``.
    kind : str
        Either ``'years'`` or ``'releases'``.

    Returns
    -------
    object
        Whatever ``utils.load_result_pkl`` returns for the matching
        ``<project>_connectivity_<kind>_file`` path.

    Raises
    ------
    Exception
        For an unknown project name, or for an unknown kind on a known
        project.  The project name is validated first.
    """
    if project_name not in ('python', 'debian'):
        raise Exception('Unknown project name {}'.format(project_name))
    if kind not in ('years', 'releases'):
        raise Exception('Unknown kind {}'.format(kind))

    yearly = kind == 'years'
    # Only the selected module-level path is ever looked up.
    if project_name == 'python':
        pickle_path = (python_connectivity_years_file if yearly
                       else python_connectivity_releases_file)
    else:
        pickle_path = (debian_connectivity_years_file if yearly
                       else debian_connectivity_releases_file)
    return utils.load_result_pkl(pickle_path)
def get_all_developers_top_connectivity(devs_by_year=None, connectivity=None):
    """Collect every developer that was ever in a year's highest k-component level.

    Parameters
    ----------
    devs_by_year : dict, optional
        ``{year: collection of developers}``.  Obtained from
        ``get_developers_by_years()`` when omitted.
    connectivity : dict, optional
        ``{year: {'k_components': {k: [component, ...]}}}`` where element
        ``[1]`` of each component is the collection of its nodes.  Loaded
        from ``connectivity_file`` when omitted.

    Returns
    -------
    set
        Nodes that appear in a top-level component of some year and are a
        developer in *any* year of ``devs_by_year``.  Empty inputs, years
        without k-components and empty top levels contribute nothing
        (previously they raised).
    """
    if devs_by_year is None:
        devs_by_year = get_developers_by_years()
    if connectivity is None:
        connectivity = utils.load_result_pkl(connectivity_file)

    # set().union(...) copes with zero years; set.union(*[]) does not.
    all_devs = set().union(*devs_by_year.values())

    top_devs = set()
    for year in connectivity:
        kcomponents = connectivity[year]['k_components']
        if not kcomponents:
            # Nothing to take a maximum over for this year.
            continue
        max_k = max(kcomponents)
        for component in kcomponents[max_k]:
            top_devs.update(all_devs.intersection(component[1]))
    return top_devs
def write_developer_contrib_df(fname='data/developer_contributions_df.csv'):
    """Write a CSV with one row per (year, developer) of contribution metrics.

    The first network yielded by ``networks_by_year()`` is discarded and the
    remaining ones are paired, in order, with the years 1992..2014.  For each
    developer of each year the row combines PEP authorship counts, weighted
    degrees of the developer in that year's network, k-component numbers,
    tenure, normalised centralities and bipartite clustering measures.

    Parameters
    ----------
    fname : str
        Destination path.  Opened in ``'wb'`` mode and developer names are
        UTF-8 encoded before being handed to ``csv.writer``.

    Returns
    -------
    None
    """
    def flag(key, container):
        # 1 when key is a member of container, 0 otherwise.
        return int(key in container)

    def count_for(key, table):
        # Stored count for key, defaulting to 0 when absent.
        return table[key] if key in table else 0

    header = [
        'id', 'year', 'dev',
        'has_written_peps', 'has_written_acc_peps', 'is_delegate',
        'peps_this_year', 'total_peps',
        'accepted_peps_year', 'total_accepted_peps',
        'degree',
        'contributions_sc', 'contributions_edits',
        'contributions_added', 'contributions_deleted',
        'collaborators', 'knum', 'aknum', 'top', 'top2', 'tenure',
        'betweenness', 'closeness', 'degree_cent',
        'file_mean_degree', 'clus_sq', 'clus_dot', 'clus_red',
    ]

    ids = utils.UniqueIdGenerator()
    peps = [p for p in get_peps() if p.created is not None]
    connectivity = utils.load_result_pkl(connectivity_file)
    centrality = utils.load_result_pkl(centrality_file)

    yearly_networks = networks_by_year()
    next(yearly_networks)  # first network is deliberately left out
    networks = list(yearly_networks)
    devs_by_year = get_developers_by_years(networks=networks)

    with open(fname, 'wb') as handle:
        writer = csv.writer(handle)
        writer.writerow(header)

        for year, G in zip(range(1992, 2015), networks):
            print("Analyzing {}".format(G.name))

            # Per-year lookups shared by every developer row.
            delegates = get_delegates_by_year(year, peps=peps)
            written_now = peps_by_developer_that_year(year, peps=peps)
            written_so_far = peps_by_developer_until_year(year, peps=peps)
            accepted_now = accepted_peps_by_developer_that_year(
                year, peps=peps)
            accepted_so_far = accepted_peps_by_developer_until_year(
                year, peps=peps)
            top = get_developers_top_connectivity_by_year(
                G, year, connectivity=connectivity)
            top2 = get_developers_top_connectivity_by_year_new(
                G, year, connectivity=connectivity)
            developers = devs_by_year[year]
            tenure = compute_tenure_by_year(year, networks=networks)
            k_num = connectivity[year]['k_num']
            year_centrality = centrality[year]
            betweenness = normalize(year_centrality['bet'])
            closeness = normalize(year_centrality['clos'])
            degree_cent = normalize(year_centrality['deg'])
            clus_sq = nx.square_clustering(G)
            clus_dot = bp.clustering(G)
            clus_red = bp.node_redundancy(G)

            for dev in developers:
                nbrs = G[dev]
                n_nbrs = len(nbrs)
                row = [
                    ids[dev],
                    year,
                    dev.encode('utf8'),
                    flag(dev, written_so_far),    # has written at least one PEP
                    flag(dev, accepted_so_far),   # has written at least one accepted PEP
                    flag(dev, delegates),         # has been BDFL delegate
                    count_for(dev, written_now),      # PEPs written this year
                    count_for(dev, written_so_far),   # PEPs written until this year
                    count_for(dev, accepted_now),     # PEPs accepted this year
                    count_for(dev, accepted_so_far),  # total PEPs accepted
                    n_nbrs,                           # unweighted degree
                    G.degree(dev, weight='weight'),   # lines of code added plus deleted
                    G.degree(dev, weight='edits'),    # number of file edits
                    G.degree(dev, weight='added'),    # lines of code added
                    G.degree(dev, weight='deleted'),  # lines of code removed
                    second_order_nbrs(G, dev),        # second order neighbors
                    k_num[dev][0],                    # k-component number
                    k_num[dev][1],                    # average k-component number
                    flag(dev, top),                   # top connectivity level
                    flag(dev, top2),                  # top 2 connectivity level
                    tenure[dev],
                    betweenness[dev],
                    closeness[dev],
                    degree_cent[dev],
                    # mean degree of the developer's neighbours
                    sum(len(G[n]) for n in nbrs) / float(n_nbrs),
                    clus_sq[dev],
                    clus_dot[dev],
                    clus_red[dev],
                ]
                writer.writerow(row)
def build_survival_data_frame(fname=survival_file):
    """Write the developer survival table, one row per (period, developer), as CSV.

    The yearly networks are loaded, the first one (1991) is dropped and 1992
    becomes the starting network.  The loop then walks 1993..2014: at each
    step it writes one row for every developer of the *previous* year, using
    the previous year's network, connectivity, centrality and PEP data, and
    marks in ``status`` whether the developer is absent (1) or present (0)
    in ``get_all_remaining_devs(devs, years[i:])``.

    Parameters
    ----------
    fname : str
        Destination path.  Opened in ``'wb'`` mode and developer names are
        UTF-8 encoded before being handed to ``csv.writer``.
        NOTE(review): binary mode plus manual encoding is the Python 2 csv
        idiom; it would need text mode with ``newline=''`` under Python 3.

    Returns
    -------
    None

    Notes
    -----
    * Closeness is read from ``centrality[year]['clos']``; it used to be
      read from the ``'bet'`` entry, duplicating the betweenness column.
    * A developer missing from the year's ``k_num`` table gets NaN in
      ``biconnected``, ``knum`` and ``aknum`` alike (``biconnected`` used to
      raise ``KeyError`` while the other two already defaulted to NaN).
    * The header spelling ``'colaborators'`` is kept as is because it is the
      emitted column name.
    """
    nan = float('nan')
    ids = utils.UniqueIdGenerator()
    connectivity = utils.load_result_pkl(connectivity_file)
    centrality = utils.load_result_pkl(centrality_file)
    peps = [pep for pep in get_peps() if pep.created is not None]
    networks = list(networks_by_year())
    devs = get_developers_by_years(networks=networks)
    networks.pop(0)  # skip 1991
    G_start = networks.pop(0)  # start with 1992
    devs_start = set(n for n, d in G_start.nodes(data=True)
                     if d['bipartite'] == 1)
    years = range(1993, 2015)
    with open(fname, 'wb') as f:
        out = csv.writer(f)
        out.writerow([
            'id', 'dev', 'period', 'rstart', 'rstop', 'status',
            'has_written_peps', 'has_written_acc_peps',
            'peps_this_year', 'total_peps',
            'accepted_peps_year', 'total_accepted_peps',
            'biconnected', 'top', 'tenure', 'colaborators',
            'knum', 'aknum', 'clus_sq', 'clus_dot', 'clus_red',
            'degree', 'contributions', 'dcentrality',
            'betweenness', 'closeness',
        ])
        previous_devs = devs_start
        previous_year = 1992
        previous_G = G_start
        for i, (year, G) in enumerate(zip(years, networks)):
            print("processing year {}".format(previous_year))
            these_devs = devs[year]
            remaining_devs = get_all_remaining_devs(devs, years[i:])
            top_devs = get_developers_top_connectivity(
                connectivity[previous_year]['k_components'], previous_devs)
            tenure = compute_tenure_by_year(previous_year)
            k_num = connectivity[previous_year]['k_num']
            bet = normalize(centrality[previous_year]['bet'])
            clos = normalize(centrality[previous_year]['clos'])
            deg = normalize(centrality[previous_year]['deg'])
            # Computed once per year (it was evaluated twice before).
            clus_sq = nx.square_clustering(previous_G)
            clus_dot = bp.clustering(previous_G)
            clus_red = bp.node_redundancy(previous_G)
            peps_this_year = peps_by_developer_that_year(
                previous_year, peps=peps)
            peps_until_year = peps_by_developer_until_year(
                previous_year, peps=peps)
            acc_peps_this_year = accepted_peps_by_developer_that_year(
                previous_year, peps=peps)
            acc_peps_until_year = accepted_peps_by_developer_until_year(
                previous_year, peps=peps)
            for dev in previous_devs:
                k_entry = k_num.get(dev)
                if k_entry is None:
                    # Unknown connectivity: keep all three columns missing
                    # instead of crashing or guessing a flag.
                    knum = aknum = biconnected = nan
                else:
                    knum, aknum = k_entry[0], k_entry[1]
                    biconnected = 0 if knum < 2 else 1
                out.writerow([
                    ids[dev],  # developer numerical ID
                    dev.encode('utf8'),  # developer name
                    i + 1,  # period
                    i,  # start
                    i + 1,  # stop
                    0 if dev in remaining_devs else 1,  # status (censored)
                    1 if dev in peps_until_year else 0,  # has written at least a pep
                    1 if dev in acc_peps_until_year else 0,  # has written at least an acc. pep
                    peps_this_year[dev] if dev in peps_this_year else 0,  # peps written this year
                    peps_until_year[dev] if dev in peps_until_year else 0,  # peps written until this year
                    acc_peps_this_year[dev] if dev in acc_peps_this_year else 0,  # peps acc. this year
                    acc_peps_until_year[dev] if dev in acc_peps_until_year else 0,  # total peps acc.
                    biconnected,  # biconnected
                    0 if dev not in top_devs else 1,  # member of the top connectivity level
                    tenure[dev],  # tenure in years
                    second_order_nbrs(previous_G, dev),  # collaborators
                    knum,  # knum
                    aknum,  # aknum
                    clus_sq.get(dev, nan),
                    clus_dot.get(dev, nan),
                    clus_red.get(dev, nan),
                    previous_G.degree(dev),  # degree
                    previous_G.degree(dev, weight='weight'),  # contributions
                    deg.get(dev, nan),
                    bet.get(dev, nan),
                    clos.get(dev, nan),
                ])
            previous_devs = these_devs
            previous_year = year
            previous_G = G