def get_clustered_repositories(input_path):
    """Collect the clustered CSV files found under ``input_path``.

    The directory tree is walked recursively.  A file is selected when its
    extension is exactly ``.csv`` and the rest of its name ends with
    ``CLUSTERED_FILENAME_POSFIX``.

    Returns a ``(filenames, repositories)`` tuple of parallel lists: the
    joined path of every selected file, and the value ``get_repo_name``
    returns for that file's bare name.
    """
    paths = []
    repo_names = []
    for folder, _subdirs, entries in os.walk(input_path):
        for entry in entries:
            stem, extension = os.path.splitext(entry)
            if extension != ".csv":
                continue
            if not stem.endswith(CLUSTERED_FILENAME_POSFIX):
                continue
            paths.append(os.path.join(folder, entry))
            repo_names.append(get_repo_name(entry))
    return paths, repo_names
def plot_clustered(input_path, filenames, repositories):
    """Scatter-plot per-cluster publication counts against tool counts.

    Every file in ``filenames`` is passed to ``get_pub_tool_count``; each
    cluster key it reports becomes one point (x: publications, y: tools).
    The colour is chosen from the file's position in ``filenames`` and the
    marker from the cluster's position within that file.  Both axes are
    logarithmic.  Two legends are drawn, one for repositories and one for
    clusters (the cluster legend is built from the first file only).

    The figure is written to ``plot_pub_tool_clustered.png`` inside
    ``input_path``, replacing any existing file of that name.

    ``repositories`` is accepted but not used by this function.
    """
    _, ax = set_plot_style(1, 1)
    x_peak = 0
    y_peak = 0
    repo_handles = {}
    cluster_handles = {}

    for repo_idx, filename in enumerate(filenames):
        _, pubs_by_cluster, _, tools_by_cluster = get_pub_tool_count(filename)
        cluster_count = len(pubs_by_cluster.keys())

        for cluster_idx, key in enumerate(pubs_by_cluster):
            x_peak = max(x_peak, pubs_by_cluster[key])
            y_peak = max(y_peak, tools_by_cluster[key])
            point = ax.scatter(
                pubs_by_cluster[key], tools_by_cluster[key],
                marker=get_marker(cluster_idx), color=get_color(repo_idx),
                alpha=0.5, s=80)

            # The first point of each file represents it in the
            # repositories legend.
            if cluster_idx == 0:
                repo_handles[get_repo_name(filename)] = point

            # Only the first file contributes entries to the clusters legend.
            if repo_idx == 0:
                cluster_handles[get_cluster_label(cluster_count, key)] = point

    # The default range of plt when `s` is set in the `scatter`
    # method does not keep all the points in the canvas; so their
    # values are overridden.
    ax.set_ylim(bottom=0.5, top=y_peak + (y_peak * 0.5))
    ax.set_xlim(left=0.5, right=x_peak + (x_peak * 0.5))

    ax.set_yscale('log')
    ax.set_xscale('log')

    ax.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter('%d'))
    ax.xaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter('%d'))

    ax.set_xlabel("\nPublications Count")
    ax.set_ylabel("Tools Count\n")

    # The first legend has to be re-added through `add_artist`, otherwise
    # the second `legend` call would replace it.
    repo_legend = ax.legend(
        repo_handles.values(), repo_handles.keys(),
        scatterpoints=1, loc='lower right', ncol=2, title="Repositories")
    ax.add_artist(repo_legend)
    ax.legend(
        cluster_handles.values(), cluster_handles.keys(),
        scatterpoints=1, loc='upper left', ncol=2, title="Clusters")

    image_file = os.path.join(input_path, 'plot_pub_tool_clustered.png')
    if os.path.isfile(image_file):
        os.remove(image_file)
    plt.savefig(image_file, bbox_inches='tight')
    plt.close()
def run(input_path, plot_density):
    """Plot the citation distribution of every non-clustered CSV file.

    ``input_path`` is walked recursively; files whose extension is ``.csv``
    and whose name does not end with ``CLUSTERED_FILENAME_POSFIX`` are
    selected.  Each selected file is read as a tab-separated table and gets
    one column of axes showing its pre- and post-citation values (the
    maximum of every vector returned by ``get_vectors``, passed through
    ``aggregate(..., 0, 500)``).

    When ``plot_density`` is truthy the y label is "Probability", otherwise
    "Count"; only the first column receives the label.  The figure is saved
    as ``citations_distribution.png`` inside ``input_path``, replacing any
    existing file of that name.
    """
    csv_paths = []
    for folder, _subdirs, entries in os.walk(input_path):
        for entry in entries:
            stem, extension = os.path.splitext(entry)
            if extension == ".csv" and not stem.endswith(CLUSTERED_FILENAME_POSFIX):
                csv_paths.append(os.path.join(folder, entry))

    fig, axes = set_plot_style(1, len(csv_paths), 3, 16)
    ylabel = "Probability" if plot_density else "Count"

    for column, csv_path in enumerate(csv_paths):
        tools = pd.read_csv(csv_path, header=0, sep='\t')
        _, pre_vectors, post_vectors, _, _, _, _ = get_vectors(tools)

        pre_citations = aggregate(
            [np.max(vector) for vector in pre_vectors], 0, 500)
        post_citations = aggregate(
            [np.max(vector) for vector in post_vectors], 0, 500)

        plot(axes[column], pre_citations, post_citations, plot_density,
             get_repo_name(csv_path), ylabel if column == 0 else None)

    # A single figure-level legend is built from the last column's artists.
    handles, labels = axes[-1].get_legend_handles_labels()
    fig.legend(handles, labels, loc='center', bbox_to_anchor=(0.410, 0.04),
               ncol=2, framealpha=0.0)

    image_file = os.path.join(input_path, 'citations_distribution.png')
    if os.path.isfile(image_file):
        os.remove(image_file)
    plt.savefig(image_file, bbox_inches='tight')
    plt.close()
def get_pubs_count(input_path):
    """Count the members of every cluster in each clustered CSV file.

    ``input_path`` is walked recursively; files whose extension is ``.csv``
    and whose name ends with ``CLUSTERED_FILENAME_POSFIX`` are loaded with
    ``get_clusters``.

    Returns a ``(counts, repos, cluster_count)`` tuple:

    * ``counts`` maps cluster key -> {bare file name -> size of that group};
    * ``repos`` lists ``get_repo_name(file name)`` in visiting order;
    * ``cluster_count`` is the number of groups in the last file processed
      (``0`` when no file matched).
    """
    sizes_by_cluster = {}
    repo_names = []
    cluster_count = 0
    for folder, _subdirs, entries in os.walk(input_path):
        for entry in entries:
            stem, extension = os.path.splitext(entry)
            if extension != ".csv":
                continue
            if not stem.endswith(CLUSTERED_FILENAME_POSFIX):
                continue

            repo_names.append(get_repo_name(entry))
            clusters = get_clusters(os.path.join(folder, entry))
            cluster_count = len(clusters.groups)
            for key in clusters.groups:
                per_file = sizes_by_cluster.setdefault(key, {})
                per_file[entry] = len(clusters.groups[key])
    return sizes_by_cluster, repo_names, cluster_count
def plot(input_path, filenames, repositories):
    """Plot one bubble per repository: publications vs. tools, sized by citations.

    For every file in ``filenames``:

    * ``get_pub_tool_count`` supplies the publication count (x) and the tool
      count (y);
    * the file is read as a tab-separated table and ``get_citations_count``
      supplies the pre- and post-publication citation sequences;
    * the marker size is twice the number of citations per publication;
    * the point is annotated with the repository name and the three values,
      offset by ``OFFSETS[repo_name]``;
    * a short summary is printed to stdout.

    Both axes use a base-2 logarithmic scale starting at 128.  The figure is
    saved as ``plot_pub_tool.png`` inside ``input_path``, replacing any
    existing file of that name.

    ``repositories`` is accepted but not used by this function.

    Note: the publication count is used as a divisor, so a file reporting
    zero publications is not supported.
    """
    _, ax = set_plot_style(1, 1)
    max_x = 0
    max_y = 0

    for i, filename in enumerate(filenames):
        repo_color = get_color(i)
        c_pubs, _, c_tools, _ = get_pub_tool_count(filename)
        max_x = max(max_x, c_pubs)
        max_y = max(max_y, c_tools)

        tools = pd.read_csv(filename, header=0, sep='\t')
        pre_citations, post_citations = get_citations_count(tools)
        total_citations = sum(pre_citations) + sum(post_citations)
        citations_per_pub = total_citations / c_pubs

        # The marker size is doubled so the bubbles are easier to see; the
        # annotation below reports the undoubled value.
        ax.scatter(c_pubs, c_tools, color=repo_color, alpha=0.5,
                   s=citations_per_pub * 2)

        repo_name = get_repo_name(filename)
        z_str = '{0:.1f}'.format(citations_per_pub)
        ax.annotate(
            f"{repo_name}\n({c_pubs}, {c_tools}, {z_str})",
            xy=(c_pubs, c_tools),
            color=repo_color,
            textcoords="offset points",
            xytext=OFFSETS[repo_name],
            ha='center',
        )

        print(repo_name)
        print(f"\tpubs:\t{c_pubs}")
        print(f"\ttools:\t{c_tools}")
        print(f"\tcitations:\t{total_citations}")

    # The default range of plt when `s` is set in the `scatter`
    # method does not keep all the points in the canvas; so their
    # values are overridden.
    ax.set_ylim(bottom=128, top=max_y + (max_y * 0.5))
    ax.set_xlim(left=128, right=max_x + (max_x * 0.5))

    # Matplotlib 3.3 introduced the axis-independent `base` keyword and
    # Matplotlib 3.5 removed the per-axis `basex`/`basey` spellings.  Try the
    # current name first and fall back for releases that only know the
    # legacy names (they reject the unknown keyword).
    try:
        ax.set_xscale('log', base=2)
        ax.set_yscale('log', base=2)
    except (TypeError, ValueError):
        ax.set_xscale('log', basex=2)
        ax.set_yscale('log', basey=2)

    ax.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter('%d'))
    ax.xaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter('%d'))

    ax.set_xlabel("\nPublications Count")
    ax.set_ylabel("Tools Count\n")

    image_file = os.path.join(input_path, 'plot_pub_tool.png')
    if os.path.isfile(image_file):
        os.remove(image_file)
    plt.savefig(image_file, bbox_inches='tight')
    plt.close()