def get_pub_tool_count(filename):
    """
    Count the distinct publications and tools in each cluster of the given
    repository file.

    The clusters are obtained from ``get_clusters(filename)``; the result is
    used through its ``groups`` / ``get_group`` interface.

    Returns a 4-tuple:
        (sum of the per-cluster publication counts,
         {cluster key: number of distinct publication IDs},
         sum of the per-cluster tool counts,
         {cluster key: number of distinct tool names}).

    The two totals are plain sums of the per-cluster counts, so a value that
    occurs in several clusters contributes once per cluster.
    """
    grouped = get_clusters(filename)

    pubs_per_cluster = {}
    tools_per_cluster = {}
    for cluster_key in grouped.groups:
        seen_pubs = set()
        seen_tools = set()
        for _, record in grouped.get_group(cluster_key).iterrows():
            seen_pubs.add(record.get(PUBLICATION_ID_COLUMN))
            # A row may list several tools joined by TOOLS_SEPARATOR.
            seen_tools.update(
                record.get(TOOLS_COLUMN).split(TOOLS_SEPARATOR))
        pubs_per_cluster[cluster_key] = len(seen_pubs)
        tools_per_cluster[cluster_key] = len(seen_tools)

    return (
        sum(pubs_per_cluster.values()),
        pubs_per_cluster,
        sum(tools_per_cluster.values()),
        tools_per_cluster,
    )
def run(input_path, cluster_count):
    """
    Cluster the raw CSV files under ``input_path`` and write the plots and
    reports derived from them.

    Steps:
      1. (Re)create the clustering stats report in ``input_path`` with only
         its header row (the data rows are presumably appended by
         ``cluster`` -- confirm against its implementation).
      2. For every ``.csv`` file whose name does NOT end with
         ``CLUSTERED_FILENAME_POSFIX``, call ``cluster`` and hand its
         results to ``plot`` on that file's own row of axes.
      3. Save the figure as ``dendrogram-and-elbow.png`` in ``input_path``.
      4. For every ``.csv`` file whose name DOES end with
         ``CLUSTERED_FILENAME_POSFIX``, write the per-cluster values
         returned by ``get_avg_pre_post`` to
         ``clustered_avg_before_after.txt`` in ``input_path``.

    Args:
        input_path: Directory that is walked recursively for CSV files and
            that receives all output files.
        cluster_count: Forwarded unchanged to ``cluster``.
    """
    fig, ax = set_plot_style()

    cluster_stats_filename = os.path.join(
        input_path, CLUSTERING_STATS_REPORT_FILENAME)
    # "w" truncates any report left over from a previous run, so the file
    # starts out containing just the column headers.
    with open(cluster_stats_filename, "w") as f:
        f.write(
            "Filename\t"
            "Auto-determined Cluster Count\t"
            "Auto-determined Dendrogram Cut Height\t"
            "Auto-determined Cluster Silhouette Score\t"
            "Manually-set Cluster Count\t"
            "Manually-set Dendrogram Cut Height\t"
            "Manually-set Cluster Silhouette Score\n")

    plot_row = 0
    col_counter = 0
    for root, _dirs, filenames in os.walk(input_path):
        for filename in filenames:
            stem, extension = os.path.splitext(filename)
            if extension != ".csv" or \
                    stem.endswith(CLUSTERED_FILENAME_POSFIX):
                continue
            col_counter += 1
            # The boolean flag is true only for the fourth processed file;
            # NOTE(review): its meaning is defined by plot() -- confirm.
            plot(
                ax[plot_row],
                stem,
                col_counter == 4,
                *cluster(root, filename, cluster_count))
            plot_row += 1

    image_file = os.path.join(input_path, 'dendrogram-and-elbow.png')
    if os.path.isfile(image_file):
        os.remove(image_file)
    plt.savefig(image_file, bbox_inches='tight')
    plt.close()

    # Most of the code below is duplicate, it can be greatly simplified by
    # methods from other scripts.
    clustered_paths = []
    for root, _dirs, filenames in os.walk(input_path):
        for filename in filenames:
            stem, extension = os.path.splitext(filename)
            if extension == ".csv" and \
                    stem.endswith(CLUSTERED_FILENAME_POSFIX):
                clustered_paths.append(os.path.join(root, filename))

    # Written to input_path like every other output of this function. The
    # path used to be built from the os.walk loop variable, which placed the
    # file in whichever directory happened to be walked last.
    avgs_filename = os.path.join(
        input_path, "clustered_avg_before_after.txt")
    with open(avgs_filename, "w") as f:
        f.write("Repository\tCluster\tAverage Before\tAverage After\n")
        for clustered_path in clustered_paths:
            clusters = get_clusters(clustered_path)
            for k in clusters.groups:
                avg_pre, avg_post = get_avg_pre_post(clusters.get_group(k))
                f.write(
                    f"{get_repo_name(clustered_path)}\t{k}\t"
                    f"{avg_pre}\t{avg_post}\n")
def get_pubs_count(input_path):
    """
    Collect the size of every cluster in every clustered CSV file found
    under ``input_path`` (searched recursively).

    A file is considered only if its extension is ``.csv`` and its name
    (without extension) ends with ``CLUSTERED_FILENAME_POSFIX``.

    Returns a 3-tuple:
        counts: ``{cluster key: {filename: number of entries in the group}}``
            (keyed by the bare filename, not the repository name).
        repos: ``get_repo_name(filename)`` for each matching file, in the
            order the files were visited.
        cluster_count: number of groups in the LAST matching file visited,
            or 0 when no file matched.
    """
    per_cluster_counts = {}
    repo_names = []
    cluster_count = 0

    for folder, _subdirs, names in os.walk(input_path):
        for name in names:
            stem, extension = os.path.splitext(name)
            if extension != ".csv":
                continue
            if not stem.endswith(CLUSTERED_FILENAME_POSFIX):
                continue

            repo_names.append(get_repo_name(name))
            grouped = get_clusters(os.path.join(folder, name))
            cluster_count = len(grouped.groups)
            for key in grouped.groups:
                sizes_by_file = per_cluster_counts.setdefault(key, {})
                sizes_by_file[name] = len(grouped.groups[key])

    return per_cluster_counts, repo_names, cluster_count
def run(input_path, plot_density):
    """
    Plot the citation-growth distributions of every clustered CSV file found
    under ``input_path`` (searched recursively) and save three figures into
    ``input_path``:

      * ``gain_scores_clustered.png`` -- one row per file, one column per
        cluster, drawn with ``plot``.
      * ``gain_scores.png`` -- one panel per file over all of its rows,
        drawn with ``plot``.
      * ``gain_scores_sns.png`` -- same layout as above, drawn with
        ``plot2``.

    A file is used only if its extension is ``.csv`` and its name (without
    extension) ends with ``CLUSTERED_FILENAME_POSFIX``. When no such file
    exists, a notice is printed and nothing is written.

    Args:
        input_path: Directory to search and to write the figures to.
        plot_density: Forwarded to the plotting helpers; also selects the
            y-axis label ("Probability" when truthy, otherwise "Count").
    """
    # Keep the directory with each filename so files located in different
    # sub-directories are later opened from where they actually live.
    files = []
    for root, _dirs, filenames in os.walk(input_path):
        for filename in filenames:
            stem, extension = os.path.splitext(filename)
            if extension == ".csv" and \
                    stem.endswith(CLUSTERED_FILENAME_POSFIX):
                files.append((root, filename))

    if not files:
        print(f">>> No clustered CSV files found under: {input_path}")
        return

    x_axis_label = "\n Citation Growth"
    y_axis_label = ("Probability\n" if plot_density else "Count\n")

    # The grid is sized from the first file; every file is assumed to have
    # the same number of clusters -- TODO confirm.
    first_dir, first_name = files[0]
    cluster_count = len(
        get_clusters(os.path.join(first_dir, first_name)).groups)

    _fig, ax = set_plot_style(len(files), cluster_count)
    last_row = len(files) - 1
    for row, (directory, filename) in enumerate(files):
        print(f">>> Processing file: {filename}")
        filename_without_extension = os.path.splitext(filename)[0]
        repository_name = filename_without_extension.replace(
            CLUSTERED_FILENAME_POSFIX, "")
        clusters = get_clusters(os.path.join(directory, filename))
        keys, mappings = get_sorted_clusters(clusters)
        for col, key in enumerate(keys):
            print(f"\t- Processing cluster {col}")
            header = get_cluster_label(cluster_count, col)
            growthes = get_growthes(clusters.get_group(mappings[key]))
            plot(
                # With a single cluster, ax is indexed by row only.
                ax[row] if cluster_count == 1 else ax[row][col],
                filename_without_extension,
                growthes,
                # Column headers only on the top row.
                header=header if row == 0 else None,
                # X-axis label only on the bottom row. (This used to compare
                # the row index with the number of clusters, which matches
                # the bottom row only by coincidence.)
                x_axis_label=x_axis_label if row == last_row else None,
                # Repository name / y-axis label only on the first column.
                y_axis_label=f"{repository_name} \n \n {y_axis_label}"
                if col == 0 else None,
                plot_density=plot_density)

    _save_current_figure(
        os.path.join(input_path, 'gain_scores_clustered.png'))

    _fig, ax = set_plot_style(1, len(files), fig_height=3, fig_width=16)
    for col, (directory, filename) in enumerate(files):
        print(f">>> Processing file: {filename}")
        filename_without_extension = os.path.splitext(filename)[0]
        repository_name = filename_without_extension.replace(
            CLUSTERED_FILENAME_POSFIX, "")
        tools = pd.read_csv(
            os.path.join(directory, filename), header=0, sep='\t')
        growthes = get_growthes(tools)
        plot(
            ax[col],
            filename_without_extension,
            growthes,
            header=repository_name,
            x_axis_label=x_axis_label,
            y_axis_label=f"\n {y_axis_label}" if col == 0 else None,
            plot_density=plot_density)

    _save_current_figure(os.path.join(input_path, 'gain_scores.png'))

    _fig, ax = set_plot_style(1, len(files), fig_height=3, fig_width=16)
    for col, (directory, filename) in enumerate(files):
        filename_without_extension = os.path.splitext(filename)[0]
        repository_name = filename_without_extension.replace(
            CLUSTERED_FILENAME_POSFIX, "")
        tools = pd.read_csv(
            os.path.join(directory, filename), header=0, sep='\t')
        growthes = get_growthes(tools)
        plot2(
            ax[col],
            growthes,
            header=repository_name,
            x_axis_label=x_axis_label,
            y_axis_label=f"\n {y_axis_label}" if col == 0 else None,
            plot_density=plot_density)

    _save_current_figure(os.path.join(input_path, 'gain_scores_sns.png'))


def _save_current_figure(image_file):
    """Save the current matplotlib figure to ``image_file`` and close it."""
    # Remove a stale image from a previous run before writing the new one.
    if os.path.isfile(image_file):
        os.remove(image_file)
    plt.savefig(image_file, bbox_inches='tight')
    plt.close()