Exemple #1
0
def get_clustered_repositories(input_path):
    """Collect the clustered CSV files found under ``input_path``.

    The directory tree is walked recursively. A file qualifies when its
    extension is exactly ``.csv`` and the remainder of its name ends with
    ``CLUSTERED_FILENAME_POSFIX``.

    Args:
        input_path: Root directory to search.

    Returns:
        A ``(filenames, repositories)`` tuple of parallel lists: the path of
        each qualifying file (joined with the directory it was found in) and
        the value ``get_repo_name`` returns for the bare file name.
    """
    matched_paths = []
    repo_names = []
    for folder, _, entries in os.walk(input_path):
        for entry in entries:
            stem, extension = os.path.splitext(entry)
            # The extension is tested first, so the suffix constant is only
            # consulted for CSV files.
            if extension != ".csv" or not stem.endswith(CLUSTERED_FILENAME_POSFIX):
                continue
            matched_paths.append(os.path.join(folder, entry))
            repo_names.append(get_repo_name(entry))

    return matched_paths, repo_names
Exemple #2
0
def plot_clustered(input_path, filenames, repositories):
    """Draw the per-cluster publications-vs-tools scatter plot and save it.

    For every file, ``get_pub_tool_count`` supplies per-cluster publication
    and tool counts. Each cluster becomes one point: the colour is chosen by
    the file's position in ``filenames`` and the marker by the cluster's
    position within that file. Both axes are logarithmic.

    Two legends are drawn: "Repositories" (one entry per file, using the
    first point plotted for it) and "Clusters" (built from the first file
    only).

    Args:
        input_path: Directory in which ``plot_pub_tool_clustered.png`` is
            written; an existing file of that name is removed first.
        filenames: Paths of the clustered files to plot.
        repositories: Not used by this function.
    """
    _, ax = set_plot_style(1, 1)
    x_peak = 0
    y_peak = 0
    repo_handles = {}
    cluster_handles = {}
    for file_idx, filename in enumerate(filenames):
        _, ck_pubs, _, ck_tools = get_pub_tool_count(filename)
        cluster_count = len(ck_pubs.keys())
        for cluster_idx, k in enumerate(ck_pubs):
            pubs = ck_pubs[k]
            tools = ck_tools[k]
            x_peak = max(x_peak, pubs)
            y_peak = max(y_peak, tools)
            point = ax.scatter(
                pubs,
                tools,
                marker=get_marker(cluster_idx),
                color=get_color(file_idx),
                alpha=0.5,
                s=80,
            )

            # The first point of each file stands in for its repository.
            if cluster_idx == 0:
                repo_handles[get_repo_name(filename)] = point

            # Cluster legend entries are taken from the first file only.
            if file_idx == 0:
                cluster_handles[get_cluster_label(cluster_count, k)] = point

    # The default range of plt when `s` is set in the `scatter` method does
    # not keep all the points in the canvas, so the limits are set by hand.
    ax.set_ylim(bottom=0.5, top=y_peak + (y_peak * 0.5))
    ax.set_xlim(left=0.5, right=x_peak + (x_peak * 0.5))

    ax.set_yscale('log')
    ax.set_xscale('log')
    ax.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter('%d'))
    ax.xaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter('%d'))

    ax.set_xlabel("\nPublications Count")
    ax.set_ylabel("Tools Count\n")

    # The first legend has to be re-attached with `add_artist`; otherwise the
    # second `legend` call would replace it.
    repo_legend = ax.legend(
        repo_handles.values(),
        repo_handles.keys(),
        scatterpoints=1,
        loc='lower right',
        ncol=2,
        title="Repositories",
    )
    ax.add_artist(repo_legend)
    ax.legend(
        cluster_handles.values(),
        cluster_handles.keys(),
        scatterpoints=1,
        loc='upper left',
        ncol=2,
        title="Clusters",
    )

    output_png = os.path.join(input_path, 'plot_pub_tool_clustered.png')
    if os.path.isfile(output_png):
        os.remove(output_png)
    plt.savefig(output_png, bbox_inches='tight')
    plt.close()
Exemple #3
0
def run(input_path, plot_density):
    """Plot pre-/post-citation distributions, one subplot per input CSV.

    Every ``.csv`` file under ``input_path`` whose name (without extension)
    does NOT end with ``CLUSTERED_FILENAME_POSFIX`` is read as a
    tab-separated table. For each file the maximum of every pre- and
    post-citation vector returned by ``get_vectors`` is taken, the values
    are passed through ``aggregate(..., 0, 500)``, and the result is drawn
    with the ``plot`` helper on that file's subplot. A shared legend is
    built from the last subplot and the figure is saved as
    ``citations_distribution.png`` inside ``input_path``.

    Args:
        input_path: Directory searched recursively for input files; also
            the directory the image is written to (an existing image of
            the same name is removed first).
        plot_density: Forwarded to ``plot``; when truthy the y-axis label
            is "Probability", otherwise "Count".
    """
    csv_paths = []
    for root, _, filenames in os.walk(input_path):
        for filename in filenames:
            stem, extension = os.path.splitext(filename)
            if extension == ".csv" and not stem.endswith(CLUSTERED_FILENAME_POSFIX):
                csv_paths.append(os.path.join(root, filename))

    fig, axes = set_plot_style(1, len(csv_paths), 3, 16)
    # Other callers in this file use the result of `set_plot_style(1, 1)` as
    # a single Axes, so with exactly one input file `axes` is presumably not
    # indexable. Normalise to a flat array so indexing works for any count.
    axes = np.ravel(axes)

    ylabel = "Probability" if plot_density else "Count"

    for column, path in enumerate(csv_paths):
        tools = pd.read_csv(path, header=0, sep='\t')
        _, pre_citations_vectors, post_citations_vectors, _, _, _, _ = get_vectors(
            tools)

        pre_citations = aggregate(
            [np.max(vector) for vector in pre_citations_vectors], 0, 500)
        post_citations = aggregate(
            [np.max(vector) for vector in post_citations_vectors], 0, 500)

        # Only the left-most subplot carries the y-axis label.
        plot(axes[column], pre_citations, post_citations, plot_density,
             get_repo_name(path), ylabel if column == 0 else None)

    handles, labels = axes[-1].get_legend_handles_labels()
    fig.legend(handles,
               labels,
               loc='center',
               bbox_to_anchor=(0.410, 0.04),
               ncol=2,
               framealpha=0.0)

    image_file = os.path.join(input_path, 'citations_distribution.png')
    if os.path.isfile(image_file):
        os.remove(image_file)
    plt.savefig(image_file, bbox_inches='tight')
    plt.close()
def get_pubs_count(input_path):
    """Count cluster members for every clustered CSV under ``input_path``.

    A file qualifies when its extension is ``.csv`` and the remainder of its
    name ends with ``CLUSTERED_FILENAME_POSFIX``. Each qualifying file is
    handed to ``get_clusters`` and the ``groups`` mapping of the result is
    inspected.

    Args:
        input_path: Root directory to search recursively.

    Returns:
        A ``(counts, repos, cluster_count)`` tuple where ``counts`` maps each
        group key to a ``{file name: group size}`` dictionary, ``repos``
        holds ``get_repo_name`` of every qualifying file name in visiting
        order, and ``cluster_count`` is the number of groups in the last
        file processed (0 when no file qualified).
    """
    per_cluster = {}
    repo_names = []
    n_clusters = 0
    for folder, _, entries in os.walk(input_path):
        for entry in entries:
            stem, extension = os.path.splitext(entry)
            if extension != ".csv" or not stem.endswith(CLUSTERED_FILENAME_POSFIX):
                continue

            repo_names.append(get_repo_name(entry))
            groups = get_clusters(os.path.join(folder, entry)).groups
            n_clusters = len(groups)
            for key in groups:
                # Sizes are keyed by the bare file name, not the repo name.
                per_cluster.setdefault(key, {})[entry] = len(groups[key])

    return per_cluster, repo_names, n_clusters
Exemple #5
0
def plot(input_path, filenames, repositories):
    """Scatter-plot publications against tools, one bubble per file.

    For every file, ``get_pub_tool_count`` supplies the publication and tool
    totals and ``get_citations_count`` the pre-/post-citation counts. The
    bubble is placed at ``(publications, tools)`` and its area is twice the
    number of citations per publication. Each bubble is annotated with the
    repository name and the three values, and the same figures are printed
    to stdout. Both axes use a base-2 logarithmic scale starting at 128.

    Args:
        input_path: Directory in which ``plot_pub_tool.png`` is written; an
            existing file of that name is removed first.
        filenames: Paths of the tab-separated files to plot.
        repositories: Not used by this function.

    Raises:
        KeyError: If ``OFFSETS`` has no entry for a repository name.

    Note:
        A file whose publication count is zero is not handled; the
        citations-per-publication division is performed unguarded.
    """
    _, ax = set_plot_style(1, 1)
    max_x = 0
    max_y = 0
    for index, filename in enumerate(filenames):
        repo_color = get_color(index)
        c_pubs, _, c_tools, _ = get_pub_tool_count(filename)
        max_x = max(max_x, c_pubs)
        max_y = max(max_y, c_tools)

        tools = pd.read_csv(filename, header=0, sep='\t')
        pre_citations, post_citations = get_citations_count(tools)
        total_citations = sum(pre_citations) + sum(post_citations)

        # Doubled so the bubble is a bit bigger and easier to see on the
        # plot; the annotation below undoes the doubling.
        z = (total_citations / c_pubs) * 2

        ax.scatter(c_pubs, c_tools, color=repo_color, alpha=0.5, s=z)

        repo_name = get_repo_name(filename)
        ax.annotate(
            f"{repo_name}\n({c_pubs}, {c_tools}, {z / 2.0:.1f})",
            xy=(c_pubs, c_tools),
            color=repo_color,
            textcoords="offset points",
            xytext=OFFSETS[repo_name],
            ha='center',
        )

        print(repo_name)
        print(f"\tpubs:\t{c_pubs}")
        print(f"\ttools:\t{c_tools}")
        print(f"\tcitations:\t{total_citations}")

    # The default range of plt when `s` is set in the `scatter` method does
    # not keep all the points in the canvas, so the limits are set by hand.
    ax.set_ylim(bottom=128, top=max_y + (max_y * 0.5))
    ax.set_xlim(left=128, right=max_x + (max_x * 0.5))

    try:
        # Current Matplotlib spells the keyword `base` for both axes; the
        # per-axis `basex`/`basey` names were deprecated and later removed.
        ax.set_xscale('log', base=2)
        ax.set_yscale('log', base=2)
    except (TypeError, ValueError):
        # Older Matplotlib releases only know the per-axis keyword names.
        ax.set_xscale('log', basex=2)
        ax.set_yscale('log', basey=2)
    ax.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter('%d'))
    ax.xaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter('%d'))

    ax.set_xlabel("\nPublications Count")
    ax.set_ylabel("Tools Count\n")

    image_file = os.path.join(input_path, 'plot_pub_tool.png')
    if os.path.isfile(image_file):
        os.remove(image_file)
    plt.savefig(image_file, bbox_inches='tight')
    plt.close()