Exemple #1
0
def get_pub_tool_count(filename):
    """
    Counts the unique publications and unique tool names in every
    cluster of the given repository filename.

    Returns a 4-tuple of:
      - the sum of the per-cluster unique publication counts,
      - a dictionary mapping each cluster key to its unique
        publication count,
      - the sum of the per-cluster unique tool counts,
      - a dictionary mapping each cluster key to its unique
        tool count.

    Uniqueness is evaluated per cluster, hence a publication or tool
    that appears in two clusters contributes twice to the sums.
    """
    clusters = get_clusters(filename)

    pubs_per_cluster = {}
    tools_per_cluster = {}

    for key in clusters.groups:
        seen_pubs = pubs_per_cluster.setdefault(key, set())
        seen_tools = tools_per_cluster.setdefault(key, set())
        for _, row in clusters.get_group(key).iterrows():
            seen_pubs.add(row.get(PUBLICATION_ID_COLUMN))

            # A single row may list several tools in one cell.
            names = row.get(TOOLS_COLUMN).split(TOOLS_SEPARATOR)
            seen_tools.update(names)

    cluster_pubs_count = {
        key: len(members) for key, members in pubs_per_cluster.items()}
    cluster_tools_count = {
        key: len(members) for key, members in tools_per_cluster.items()}

    total_pubs = sum(cluster_pubs_count.values())
    total_tools = sum(cluster_tools_count.values())
    return total_pubs, cluster_pubs_count, total_tools, cluster_tools_count
Exemple #2
0
def run(input_path, cluster_count):
    """
    Clusters every not-yet-clustered CSV file found under the given
    path, plots the result of each clustering, and writes the
    following files in ``input_path``:

      - the clustering stats report (its name is given by
        CLUSTERING_STATS_REPORT_FILENAME); only the header line is
        written here,
      - dendrogram-and-elbow.png, the figure holding one plot per
        clustered file,
      - clustered_avg_before_after.txt, one line per repository and
        cluster with the values returned by get_avg_pre_post.

    Any previous version of these files is replaced.

    :param input_path: directory that is searched recursively for
        CSV files.
    :param cluster_count: forwarded unchanged to ``cluster``.
    """
    _, ax = set_plot_style()

    cluster_stats_filename = os.path.join(
        input_path, CLUSTERING_STATS_REPORT_FILENAME)
    # Write column's headers. Opening in "w" mode discards the report
    # of any previous run.
    # NOTE(review): presumably `cluster` appends the data rows to this
    # file -- confirm against its implementation.
    with open(cluster_stats_filename, "w") as f:
        f.write(
            "Filename\t"
            "Auto-determined Cluster Count\t"
            "Auto-determined Dendrogram Cut Height\t"
            "Auto-determined Cluster Silhouette Score\t"
            "Manually-set Cluster Count\t"
            "Manually-set Dendrogram Cut Height\t"
            "Manually-set Cluster Silhouette Score\n")

    plot_row = 0
    for root, _, filenames in os.walk(input_path):
        for filename in filenames:
            name, extension = os.path.splitext(filename)
            if extension != ".csv" or \
               name.endswith(CLUSTERED_FILENAME_POSFIX):
                continue

            # The third argument is true only for the fourth file
            # that is plotted.
            plot(
                ax[plot_row], name,
                plot_row == 3,
                *cluster(root, filename, cluster_count))
            plot_row += 1

    image_file = os.path.join(input_path, 'dendrogram-and-elbow.png')
    if os.path.isfile(image_file):
        os.remove(image_file)
    plt.savefig(image_file, bbox_inches='tight')
    plt.close()

    # Most of the code below is duplicate, it can be greatly simplified by
    # methods from other scripts.
    #
    # This is a separate walk on purpose: it collects the clustered
    # files, which are skipped by the walk above.
    clustered_files = []
    for root, _, filenames in os.walk(input_path):
        for filename in filenames:
            name, extension = os.path.splitext(filename)
            if extension == ".csv" and \
               name.endswith(CLUSTERED_FILENAME_POSFIX):
                clustered_files.append(os.path.join(root, filename))

    # The report goes next to the other outputs, in input_path; it
    # must not depend on which directory happened to be walked last.
    avgs_filename = os.path.join(
        input_path, "clustered_avg_before_after.txt")
    with open(avgs_filename, "w") as f:
        f.write("Repository\tCluster\tAverage Before\tAverage After\n")
        for clustered_file in clustered_files:
            clusters = get_clusters(clustered_file)
            repo_name = get_repo_name(clustered_file)
            for k in clusters.groups:
                avg_pre, avg_post = get_avg_pre_post(clusters.get_group(k))
                f.write(f"{repo_name}\t{k}\t{avg_pre}\t{avg_post}\n")
def get_pubs_count(input_path):
    """
    Collects the size of every cluster in each clustered CSV file
    found (recursively) under the given path.

    Returns a 3-tuple of:
      - a dictionary mapping each cluster key to a dictionary of
        filename -> size of that cluster in the file,
      - the list of repository names (as returned by get_repo_name),
        in the order the files were visited,
      - the number of clusters in the last file that was processed,
        or 0 if no clustered file was found.
    """
    counts = {}
    repos = []
    cluster_count = 0

    for root, _, filenames in os.walk(input_path):
        for filename in filenames:
            stem, extension = os.path.splitext(filename)
            if extension != ".csv" or \
               not stem.endswith(CLUSTERED_FILENAME_POSFIX):
                continue

            repos.append(get_repo_name(filename))
            groups = get_clusters(os.path.join(root, filename)).groups
            cluster_count = len(groups)
            for key in groups:
                counts.setdefault(key, {})[filename] = len(groups[key])

    return counts, repos, cluster_count
Exemple #4
0
def run(input_path, plot_density):
    """
    Plots the citation growth of every clustered CSV file found
    (recursively) under the given path, and saves three figures in
    ``input_path``:

      - gain_scores_clustered.png: a grid with one row per file and
        one column per cluster, drawn with ``plot``,
      - gain_scores.png: one panel per file, drawn with ``plot``,
      - gain_scores_sns.png: one panel per file, drawn with ``plot2``.

    Any previous version of these images is replaced. If no clustered
    file is found, a notice is printed and nothing is plotted.

    :param input_path: directory that is searched for clustered files
        and that receives the images.
    :param plot_density: forwarded to the plotting functions; it also
        selects the y-axis label ("Probability" when true, "Count"
        otherwise).
    """
    files = _find_clustered_files(input_path)
    if not files:
        # The grid below is sized from the first file, hence there is
        # nothing sensible to draw without at least one file.
        print(f">>> No clustered files found in: {input_path}")
        return

    x_axis_label = "\n Citation Growth"
    y_axis_label = "Probability\n" if plot_density else "Count\n"

    # The number of columns of the grid is taken from the first file.
    clusters = get_clusters(os.path.join(*files[0]))
    cluster_count = len(clusters.groups)

    _, ax = set_plot_style(len(files), cluster_count)
    last_row = len(files) - 1
    for row, (directory, filename) in enumerate(files):
        print(f">>> Processing file: {filename}")
        filename_without_extension = os.path.splitext(filename)[0]
        repository_name = filename_without_extension.replace(
            CLUSTERED_FILENAME_POSFIX, "")
        clusters = get_clusters(os.path.join(directory, filename))

        keys, mappings = get_sorted_clusters(clusters)
        for col, key in enumerate(keys):
            print(f"\t- Processing cluster {col}")
            header = get_cluster_label(cluster_count, col)
            growthes = get_growthes(clusters.get_group(mappings[key]))
            plot(
                ax[row] if cluster_count == 1 else ax[row][col],
                filename_without_extension,
                growthes,
                # Column headers only on the first row, the x-axis
                # label only on the last row, and the y-axis label
                # only on the first column.
                header=header if row == 0 else None,
                x_axis_label=x_axis_label if row == last_row else None,
                y_axis_label=f"{repository_name} \n \n {y_axis_label}" if col == 0 else None,
                plot_density=plot_density)

    _save_figure(os.path.join(input_path, 'gain_scores_clustered.png'))

    _, ax = set_plot_style(1, len(files), fig_height=3, fig_width=16)
    for col, (directory, filename) in enumerate(files):
        print(f">>> Processing file: {filename}")
        filename_without_extension = os.path.splitext(filename)[0]
        repository_name = filename_without_extension.replace(
            CLUSTERED_FILENAME_POSFIX, "")

        tools = pd.read_csv(
            os.path.join(directory, filename), header=0, sep='\t')
        growthes = get_growthes(tools)
        plot(
            ax[col],
            filename_without_extension,
            growthes,
            header=repository_name,
            x_axis_label=x_axis_label,
            y_axis_label=f"\n {y_axis_label}" if col == 0 else None,
            plot_density=plot_density)

    _save_figure(os.path.join(input_path, 'gain_scores.png'))

    _, ax = set_plot_style(1, len(files), fig_height=3, fig_width=16)
    for col, (directory, filename) in enumerate(files):
        filename_without_extension = os.path.splitext(filename)[0]
        repository_name = filename_without_extension.replace(
            CLUSTERED_FILENAME_POSFIX, "")

        tools = pd.read_csv(
            os.path.join(directory, filename), header=0, sep='\t')
        growthes = get_growthes(tools)
        plot2(
            ax[col],
            growthes,
            header=repository_name,
            x_axis_label=x_axis_label,
            y_axis_label=f"\n {y_axis_label}" if col == 0 else None,
            plot_density=plot_density)

    _save_figure(os.path.join(input_path, 'gain_scores_sns.png'))


def _find_clustered_files(input_path):
    """
    Returns the (directory, filename) pair of every clustered CSV
    file found recursively under the given path.
    """
    found = []
    for root, _, filenames in os.walk(input_path):
        for filename in filenames:
            stem, extension = os.path.splitext(filename)
            if extension == ".csv" and \
               stem.endswith(CLUSTERED_FILENAME_POSFIX):
                # Keep the directory with the filename, so that the
                # file can be opened regardless of where it was found.
                found.append((root, filename))
    return found


def _save_figure(image_file):
    """
    Saves the current pyplot figure to the given file, replacing any
    existing file, and closes the figure.
    """
    if os.path.isfile(image_file):
        os.remove(image_file)
    plt.savefig(image_file, bbox_inches='tight')
    plt.close()