def main():
    # Read all researchers
    cols = ["scopus_id", "label"]
    df = pd.read_csv(PERSON_FILE, usecols=cols)[cols]
    df['scopus_id'] = df['scopus_id'].fillna(df['label'])
    df = df.drop_duplicates(subset="scopus_id").set_index("scopus_id")
    df.index = df.index.map(lambda x: str(int(x)) if isinstance(x, float) else x)
    # Skip persons that already have estimates
    try:
        collected = pd.read_csv(TARGET_FILE, index_col=0, na_values="",
                                keep_default_na=False)
        collected = collected[collected.index.isin(df.index.tolist())]
        collected = collected.dropna(subset=["gender"])
        df = df.drop(collected.index, errors='ignore')
    except FileNotFoundError:
        collected = pd.DataFrame()
    # Prepare names
    df["first"] = df.apply(get_firstname, axis=1)
    before = df.shape[0]
    df = df.dropna(subset=["first"])
    name_invalid = before - df.shape[0]
    if name_invalid:
        print(f">>> Dropping {name_invalid:,} researchers w/o valid name")
    # Get gender estimates
    estimates = {}
    total = df['first'].nunique()
    print(f">>> Searching for {total} new names...")
    for name in tqdm(df["first"].unique()):
        try:
            resp = genderize.Genderize().get([name])
            estimates[name] = resp[0]
        except Exception:  # Daily quota exceeded
            print("... Quota exceeded, try again tomorrow")
            break
    # Write out
    if estimates:
        new = pd.DataFrame(estimates).T
        new["count"] = new["count"].astype(float)
        df = df.join(new, how="right", on="first")
        df = df[["count", "gender", "name", "probability"]]
        collected = pd.concat([collected, df]).sort_index()
        nans = collected["gender"] == ""
        collected.loc[nans, ["count", "gender", "probability"]] = nan
        collected.to_csv(TARGET_FILE, index_label="ID")
    # Statistics
    print(">>> Distribution of gender:")
    print(collected["gender"].value_counts())
    n_missing = collected["gender"].isna().sum() + name_invalid
    write_stats({"N_of_researcher_nogender": n_missing})
    share = n_missing / (float(collected.shape[0]) + name_invalid)
    print(f">>> No estimates for {n_missing:,} out of {collected.shape[0]:,} "
          f"({share:.2%}) researchers w/ valid names")
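# The get_firstname() helper used above is defined elsewhere in the project.
# The sketch below is only an illustration of what it could look like, under
# the assumption (not confirmed by the source) that the "label" column holds
# names formatted as "Lastname, Firstname M.".
def get_firstname_sketch(row):
    """Return the researcher's first name, or None if it cannot be parsed."""
    try:
        first = row["label"].split(",")[1].strip().split(" ")[0]
    except (AttributeError, IndexError, KeyError):
        return None  # Missing or unparseable label
    if len(first.rstrip(".")) < 2:
        return None  # Bare initials are useless for genderize.io
    return first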
def main():
    auth = pd.DataFrame(columns=['index', 'centrality'])
    com = pd.DataFrame(columns=['index', 'centrality'])
    global_auth = pd.DataFrame()
    global_com = pd.DataFrame()
    print(">>> Now working on:")
    for file in sorted(glob(NETWORK_FOLDER + "*.gexf")):
        # Read in
        n_id = basename(splitext(file)[0])
        year = n_id[:4]
        print("...", n_id)
        H = nx.read_gexf(file)
        G = giant(H)
        # Clustering of random network
        avg_degree = sum(dict(G.degree()).values())/nx.number_of_nodes(G)
        exp_clustering = avg_degree/nx.number_of_nodes(G)
        print(f"    expected clustering of random network: {exp_clustering:,}")
        # Compute centralities
        new = compute_centralities(H, G)
        for col in ["eigenvector", "betweenness"]:
            new[col + "_rank"] = new[col].rank(method="min", ascending=False)
        # Global measures
        s = global_analysis(H, G)
        rho = spearmanr(new["betweenness"], new["eigenvector"],
                        nan_policy='omit')
        s['rho'] = f"{rho[0]:.2f}{p_to_stars(rho[1])}"
        # Add to DataFrame
        new = (new.reset_index()
                  .melt(id_vars=['index'], var_name='centrality',
                        value_name=year))
        if n_id.endswith('auth'):
            auth = auth.merge(new, "outer", on=['index', 'centrality'])
            global_auth[year] = s
        elif n_id.endswith('com'):
            com = com.merge(new, "outer", on=['index', 'centrality'])
            global_com[year] = s
        # Statistics
        ident = "_".join([n_id[5:], year_name(year, -2)])
        stats = {f"N_of_nodes_{ident}": nx.number_of_nodes(H),
                 f"N_of_nodes_{ident}_giant": nx.number_of_nodes(G)}
        write_stats(stats)
    # Write out
    t = [('Overall', k) for k in ['Size', 'Links', 'Avg. clustering',
                                  'Components']]
    t.extend([('Giant', k) for k in ['Size', 'Density', 'Avg. path length',
                                     'Diameter', 'rho']])
    networks = [('auth', auth, global_auth), ('com', com, global_com)]
    for label, df1, df2 in networks:
        # Centralities
        df1 = df1.sort_values(['index', 'centrality']).set_index('index')
        fname = f"{TARGET_FOLDER}yearly_centr_{label}.csv"
        df1.to_csv(fname, index_label="node", encoding="utf8")
        # Global statistics
        df2 = df2.T
        df2['Avg. path length'] = df2['Avg. path length'].astype(float).round(2)
        df2.columns = pd.MultiIndex.from_tuples(t)
        fname = f"{OUTPUT_FOLDER}Tables/network_{label}.tex"
        df2.to_latex(fname, multicolumn_format='c',
                     column_format='lrrrr|rrrrr')
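# giant() and p_to_stars() are project helpers not shown in this file.
# Plausible sketches follow, assuming the yearly networks are undirected
# (the actual implementations may differ):
def giant_sketch(H):
    """Return a copy of the subgraph induced by the largest component."""
    nodes = max(nx.connected_components(H), key=len)
    return H.subgraph(nodes).copy()


def p_to_stars_sketch(p, thresholds=(0.1, 0.05, 0.01)):
    """Return significance stars for a p-value, e.g. 0.003 -> '***'."""
    return "*" * sum(p < t for t in thresholds)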
def main():
    # Read acknowledgements
    acks = read_ack_file()
    drops = ['prev', 'misc', 'order', 'ra', 'ind', 'data', 'editor', 'ref']
    acks = acks.drop(drops, axis=1)
    # Count JEL codes
    for col in ['jel', 'jel3']:
        acks[col] = acks[col].apply(clean_jel_codes)
    acks['num_jel'] = (acks['jel'].str.len().fillna(0)
                       + acks['jel3'].str.len().fillna(0))
    acks['num_jel'] = acks['num_jel'].astype("int8")
    # Merge with Scopus
    acks.index = acks['title'].apply(standardize)
    scopus_df = pd.read_csv(SCOPUS_FILE, index_col=0, encoding="utf8")
    df = (scopus_df.drop(["title", "journal", "year"], axis=1)
                   .join(acks, how="inner"))
    df.index.name = "simple_title"
    # Count informal collaboration
    ack_cols = ['coms', 'con', 'sem']
    for col in ack_cols:
        df['num_' + col] = df[col].apply(count)
    df["num_coms"] = df["num_coms"].replace(0, np.nan)
    # Mean values of informal collaboration
    count_cols = ["num_" + c for c in ack_cols]
    temp = df[count_cols + ["top"]]
    grouped = temp.groupby("top")[count_cols].agg(["sum", "count"])
    print("Average intensive margin of inf. collab. by journal class:")
    for c in count_cols:
        print(c)
        print(grouped[c]["sum"] / grouped[c]["count"])
    # Set to 0 for papers with acknowledgement
    papers_with = df[count_cols].fillna(0).sum(axis=1) > 0
    df['with'] = papers_with.astype(int)
    df.loc[papers_with, count_cols] = df.loc[papers_with, count_cols].fillna(0)
    # Add metrics for authors and commenters
    df = df.reset_index().set_index(["simple_title", "year"])
    dtypes = {'scopus_id': 'str', 'year': 'uint16'}
    metrics = pd.read_csv(METRICS_FILE, encoding="utf8", dtype=dtypes)
    metrics = metrics.drop(columns=['yearly_pubs', 'yearly_wpubs'])
    metrics["cumcites"] = metrics.groupby("scopus_id")["yearly_cites"].cumsum()
    metrics['year'] = metrics['year'] + 1  # Use previous year's values
    auth_metrics = aggregate(df, metrics, "auth")
    coms_metrics = aggregate(df, metrics, "coms")
    del metrics
    # Add centralities for authors and commenters
    files = sorted(glob(CENTR_FOLDER + "*.csv"))
    centr = pd.concat([read_centrality_file(f) for f in files], axis=0,
                      sort=True)
    centr = custom_pivot(centr, id_var='node', var_name='year',
                         unstack_by='centrality')
    centr = centr.rename(columns={"node": "scopus_id"})
    centr['year'] = centr['year'].astype('uint16') + 1  # Previous year's values
    for netw in ("com", "auth"):
        centr[netw + "_giant"] = (~centr[netw + '_eigenvector'].isnull()) * 1
    fill_cols = [c for c in centr if "rank" not in c]
    centr[fill_cols] = centr[fill_cols].fillna(0)
    auth_centr = aggregate(df, centr, "auth")
    coms_centr = aggregate(df, centr, "coms")
    # Combine and fill missings
    df = df.reset_index(level=1)
    df = pd.concat([df, auth_metrics, coms_metrics, auth_centr, coms_centr],
                   axis=1, sort=True)
    for c in df.columns:
        if "giant" in c:
            df[c] = df[c].clip(upper=1)
    fill_cols = list(coms_metrics.columns) + list(coms_centr.columns)
    df.loc[papers_with, fill_cols] = df.loc[papers_with, fill_cols].fillna(0)
    # Write out
    drops = ['title', 'auth', 'coms', 'sem', 'con', 'jel', 'jel3']
    df = df.drop(drops, axis=1)
    df.to_csv(TARGET_FILE, index_label="title")
    # Analyze JEL codes
    acks['jel_cat'] = acks.apply(get_jel_categories, axis=1)
    temp = explode(acks, col="jel_cat")
    dummies = pd.get_dummies(temp.set_index("title"))
    dummies = dummies.groupby(dummies.index).sum()
    print(">>> Distribution of papers to JEL categories:\n",
          dummies.sum(axis=0))
    print(">>> Shares of papers with either G or E:")
    print(pd.crosstab(dummies["jel_cat_G"], dummies["jel_cat_E"],
                      margins=True, normalize=True))
    # Statistics
    jel_counter = {v: acks[v].notnull().sum() for v in ('jel', 'jel3')}
    s = {'N_of_JEL_all': jel_counter['jel'] + jel_counter['jel3'],
         'N_of_JEL_added': jel_counter['jel3']}
    write_stats(s)
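# count() is a small helper defined elsewhere; it measures the intensive
# margin of informal collaboration per paper. A minimal sketch, assuming
# (not confirmed) that each acknowledgement cell holds a list of names or NaN:
def count_sketch(cell):
    """Return the number of items in a list-like cell, or NaN if missing."""
    if isinstance(cell, (list, tuple, set)):
        return len(cell)
    return np.nan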
def main():
    # Read acknowledgements and networks
    acks = read_ack_file()
    G = defaultdict(lambda: nx.Graph())
    for fname in glob(NETWORK_FOLDER + '*.gexf'):
        net_type = splitext(basename(fname))[0][5:]
        H = nx.read_gexf(fname).to_undirected()
        G[net_type] = nx.compose(G[net_type], H)
    # Dictionary mapping authors to their commenters
    auth_com_map = {n: [] for n in G['auth'].nodes()}
    for row in acks.itertuples(index=False):
        for author in row.auth:
            auth_com_map[author].extend(row.coms)
    # Reciprocity among coauthors
    acks['r_auth'] = acks[['auth', 'coms']].apply(
        lambda s: realized_reciprocity(s, ('auth', 'coms'), G=G['auth']),
        axis=1)
    acks['r_auth_p'] = acks['coms'].apply(
        lambda s: potential_reciprocity(s, G['auth']))
    # Reciprocity among commenters
    acks['r_com'] = acks[['auth', 'coms']].apply(
        lambda s: realized_reciprocity(s, mapping=auth_com_map), axis=1)
    acks['r_com_p'] = acks[['auth', 'coms']].apply(
        lambda s: potential_reciprocity(s, G['auth']), axis=1)
    # Read affiliation information
    affs = read_affiliations()
    # Reciprocity among colleagues
    acks = acks.set_index("title")
    auth_aff = group_affiliations("auth", acks, affs, year_correction=1)
    com_aff = group_affiliations("coms", acks, affs, year_correction=1)
    both_aff = pd.concat([auth_aff, com_aff], axis=1).reset_index()
    both_aff = both_aff.dropna(subset=["coms"])
    both_aff["auth"] = both_aff["auth"].str.replace("-", "|")
    for c in ("auth", "coms"):
        both_aff[c] = both_aff[c].str.strip("|").str.strip("-").str.split("|")
    both_aff["com_coll"] = both_aff.apply(count_coll_com, axis=1)
    both_aff = both_aff.dropna(subset=["com_coll"])
    print(">>> Distribution of the number of commenters that are colleagues")
    print(both_aff["com_coll"].value_counts()/both_aff.shape[0])
    acks = acks.merge(both_aff.drop(["year", "auth", "coms"], axis=1),
                      "left", on="title")
    acks["r_coll"] = acks["com_coll"].fillna(0) > 0
    acks["r_coll_p"] = acks["com_coll"].notnull()
    # Statistics
    stats = {'reci_auth_real': acks['r_auth'].sum(),
             'reci_auth_pot': acks['r_auth_p'].sum(),
             'reci_com_real': acks['r_com'].sum(),
             'reci_com_pot': acks['r_com_p'].sum(),
             'reci_coll_real': acks['r_coll'].sum(),
             'reci_coll_pot': acks['r_coll_p'].sum(),
             'reci_any_real': sum(acks[['r_auth', 'r_com', 'r_coll']].any(axis=1)),
             'reci_any_pot': sum(acks[['r_auth_p', 'r_com_p', 'r_coll_p']].any(axis=1))}
    write_stats(stats)
    print(">>> Papers with commenting co-authors: "
          f"{stats['reci_auth_real']:,} (of {stats['reci_auth_pot']:,})")
    print(">>> Papers with authors commenting on their commenters' "
          f"work: {stats['reci_com_real']:,} (of {stats['reci_com_pot']:,})")
    print(">>> Papers with commenters that are colleagues: "
          f"{stats['reci_coll_real']:,} (of {stats['reci_coll_pot']:,})")
    print(">>> Papers with either form of reciprocity: "
          f"{stats['reci_any_real']:,} (of {stats['reci_any_pot']:,})")
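# realized_reciprocity() and potential_reciprocity() are project helpers whose
# implementations are not shown here. The sketches below illustrate one
# possible reading of the co-author variant (a paper counts as reciprocal when
# a commenter has co-authored with one of the authors); this is an assumption,
# not the project's confirmed definition:
def realized_reciprocity_sketch(row, G):
    """Return True if any commenter is a co-authorship neighbor of any author."""
    return any(G.has_edge(author, commenter)
               for author in row["auth"] for commenter in row["coms"])


def potential_reciprocity_sketch(commenters, G):
    """Return True if reciprocity is possible, i.e. some commenter appears in G."""
    return any(commenter in G for commenter in commenters)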
def make_barchart(df):
    """Plot a horizontal stacked bar showing the number of pure authors,
    commenting authors and pure commenters.
    """
    # Count
    authors = set()
    for f in glob(NETWORKS_FOLDER + "*auth.gexf"):
        authors.update(nx.read_gexf(f).nodes())
    commenters = set()
    for f in glob(NETWORKS_FOLDER + "*com.gexf"):
        commenters.update(nx.read_gexf(f).nodes())
    # Prepare
    df['scopus_id'] = df['scopus_id'].astype(str)
    pure_com = commenters - authors
    pure_auth = set(df[df['com_out_degree'].fillna(0) == 0]['scopus_id'].unique())
    com_auth = commenters - pure_auth - pure_com
    print(f">>> {len(pure_auth):,} pure authors "
          f"({sum(x.isdigit() for x in pure_auth):,} w/ Scopus ID); "
          f"{len(pure_com):,} pure commenters "
          f"({sum(x.isdigit() for x in pure_com):,} w/ Scopus ID); "
          f"{len(com_auth):,} mixed types "
          f"({sum(x.isdigit() for x in com_auth):,} w/ Scopus ID)")
    out = pd.DataFrame(data=[len(pure_auth), len(com_auth), len(pure_com)],
                       index=['pure_auth', 'com_auth', 'pure_com'],
                       columns=['persons'])
    # Plot
    fig, ax = plt.subplots(figsize=(25, 4))
    out.T.plot(kind='barh', stacked=True, legend=False, ax=ax,
               colormap='PiYG', alpha=0.7)
    ax.axes.get_xaxis().set_visible(False)
    ax.axes.get_yaxis().set_visible(False)
    # Hatched area for commenting authors
    ax.patches[1].set(hatch="O", facecolor=ax.patches[0].get_facecolor(),
                      edgecolor=ax.patches[2].get_facecolor(), lw=0)
    # Add labels inside
    for p in ax.patches:
        ax.annotate(f"{int(p.get_width()):,}", fontsize=40,
                    xy=(p.get_x() + p.get_width()/3.1, -0.05))
    # Add bracket outside (set width manually)
    auth_cent = (len(authors)/out.sum())/2 - 0.01
    bbox = {"boxstyle": 'square', "fc": 'white'}
    arrowprops = {"arrowstyle": '-[, widthB=8.35, lengthB=1',
                  "lw": 2.0, "color": "black"}
    ax.annotate('Authors', xy=(auth_cent, 0.90), xytext=(auth_cent, 0.9),
                xycoords='axes fraction', ha='center', va='bottom',
                bbox=bbox, fontsize=35, arrowprops=arrowprops)
    com_cent = (len(commenters)/out.sum())/2 + auth_cent - 0.054
    arrowprops.update({"arrowstyle": '-[, widthB=12.73, lengthB=1'})
    ax.annotate('Commenters', xy=(com_cent, 0.10), xytext=(com_cent, 0),
                xycoords='axes fraction', ha='center', va='bottom',
                bbox=bbox, fontsize=35, arrowprops=arrowprops)
    # Save
    sns.despine(ax=None, top=True, right=True, left=True, bottom=True)
    fname = OUTPUT_FOLDER + "Figures/barh_persons.pdf"
    fig.savefig(fname, bbox_inches="tight")
    fname = OUTPUT_FOLDER + "Figures/barh_persons.png"
    fig.savefig(fname, bbox_inches="tight")
    plt.clf()
    # Write stats
    stats = {'N_of_Authors_pure': len(pure_auth),
             'N_of_Commenters_pure': len(pure_com),
             'N_of_Authors_commenting': len(com_auth)}
    write_stats(stats)
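# write_stats() is shared across all scripts but not defined in this file.
# A minimal sketch, assuming a hypothetical key-value CSV target (the real
# helper may instead write LaTeX macros or use a different file format):
def write_stats_sketch(stats, fname="stats_sketch.csv"):
    """Merge the given {name: value} pairs into a key-value CSV file."""
    try:
        existing = pd.read_csv(fname, index_col=0)["value"].to_dict()
    except FileNotFoundError:
        existing = {}
    existing.update(stats)
    out = pd.Series(existing, name="value").sort_index()
    out.to_csv(fname, index_label="statistic")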