Example #1
def cluster_diffs(concepts,
                  data,
                  graph_location,
                  file_length_map,
                  occurrence_matrix,
                  file_index_map,
                  times,
                  edges_kept=None,
                  use_file_dist=True,
                  use_call_distance=True,
                  use_data=True,
                  use_namespace=True,
                  use_change_coupling=True):
    """
    :param concepts: The number of concepts we wish to segment
    :param data: The initial diff-regions segmentation, each its own group
    :param graph_location: The location of the dot file representing the deltaPDG of the file
    :param file_length_map: A map between filename and file line count
    :param occurrence_matrix: The matrix mapping commits to files and vice versa
    :param file_index_map: The map between filenames and occurrence_matrix indices
    :param times: How many times to repeat the clustering when averaging its runtime
    :param edges_kept: If not None, only deltaPDG edges of these kinds are kept before clustering
    :param use_file_dist: Whether to enable the file-distance voter
    :param use_call_distance: Whether to enable the call-graph-distance voter
    :param use_data: Whether to enable the data-dependency voter
    :param use_namespace: Whether to enable the namespace-distance voter
    :param use_change_coupling: Whether to enable the change-coupling voter
    :return: The proposed clustering of diff_regions and the mean time taken per run
    """
    deltaPDG = obj_dict_to_networkx(read_graph_from_dot(graph_location))
    if edges_kept is not None:
        deltaPDG = remove_all_except(deltaPDG, edges_kept)
    context = get_context_from_nxgraph(deltaPDG)
    voters = [
        file_distance(file_length_map) if use_file_dist else None,
        call_graph_distance(deltaPDG, context) if use_call_distance else None,
        data_dependency(deltaPDG) if use_data else None,
        namespace_distance(deltaPDG, context) if use_namespace else None,
        change_coupling(occurrence_matrix, file_index_map)
        if use_change_coupling else None,
    ]
    voters = [v for v in voters if v is not None]

    n = len(data)

    t0 = time.process_time()
    for i in range(times):
        affinity, args = generate_empty_affinity(n, voters)
        with ThreadPool(processes=min(os.cpu_count() - 1, 6)) as wp:
            # Each work item unpacks as (voter, affinity_index, (i, j)); apply the voter
            # to the pair of diff-regions and key the result by its affinity index.
            for k, value in wp.imap_unordered(
                    lambda job: (job[1], job[0](data[job[-1][0]], data[job[-1][1]])),
                    args):
                affinity[k] += value

        labels = cluster_from_voter_affinity(affinity, concepts)
    t1 = time.process_time()
    time_ = (t1 - t0) / times

    return labels, time_
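A minimal driver sketch for cluster_diffs follows. Everything in it is hypothetical: the paths, the synthetic occurrence matrix, the file line counts, and the load_initial_diff_regions helper are placeholders chosen only to match how cluster_diffs consumes its arguments.

import numpy as np

# Hypothetical driver for cluster_diffs; all names and paths below are placeholders.
file_length_map = {'Core.cs': 1200, 'Gui.cs': 430}            # assumed line counts
file_index_map = {'Core.cs': 0, 'Gui.cs': 1}                   # column index per file
occurrence_matrix = np.array([[1, 0],                          # commit 0 touched Core.cs
                              [1, 1]])                         # commit 1 touched both files
diff_regions = load_initial_diff_regions('./out/gui.cs/Core.cs.dot')  # assumed helper

labels, mean_time = cluster_diffs(concepts=2,
                                  data=diff_regions,
                                  graph_location='./out/gui.cs/Core.cs.dot',
                                  file_length_map=file_length_map,
                                  occurrence_matrix=occurrence_matrix,
                                  file_index_map=file_index_map,
                                  times=3)
print(labels, mean_time)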
Example #2
            def worker(work):
                for data_point_name in tqdm(work, leave=False):
                    concepts = int(
                        os.path.basename(os.path.dirname(data_point_name)))
                    data_point_name = os.path.basename(
                        os.path.dirname(os.path.dirname(data_point_name)))
                    try:
                        file_lens = file_len_map[data_point_name]
                        graph_location = os.path.join('.', 'data',
                                                      'corpora_clean',
                                                      repository_name,
                                                      data_point_name,
                                                      str(concepts),
                                                      'merged.dot')
                        deltaPDG = obj_dict_to_networkx(
                            read_graph_from_dot(graph_location))
                        context = get_context_from_nxgraph(deltaPDG)

                        try:
                            _, truth = list(
                                zip(*[(n, d['community'])
                                      for n, d in deltaPDG.nodes(data=True)
                                      if 'color' in d.keys() and d['color'] !=
                                      'orange' and 'community' in d.keys()]))
                        except ValueError:
                            # No ground-truth community labels in this graph; skip this data point
                            continue

                        labels, time_ = graph_cluster_diffs(
                            deltaPDG, context, concepts, file_lens,
                            occurrence_matrix, file_index_map, times_)
                        truth = np.asarray(truth).astype(int)
                        labels = np.asarray(labels).astype(int)
                        acc, overlap = evaluate(labels,
                                                truth,
                                                q=max(concepts,
                                                      np.max(labels) + 1))
                        with open(
                                os.path.join('out', repository_name,
                                             out_name + '.csv'), 'a') as f:
                            f.write(data_point_name + ',' + str(concepts) +
                                    ',' + str(acc) + ',' + str(overlap) + ',' +
                                    str(time_) + '\n')
                    except FileNotFoundError:
                        pass
                    except KeyError:
                        with open(
                                os.path.join('out', repository_name,
                                             out_name + '.csv'), 'a') as f:
                            f.write(data_point_name + ',' + str(concepts) +
                                    ',' + str(float('nan')) + ',' +
                                    str(float('nan')) + ',' + str(0.0) + '\n')
Example #3
    def __call__(self, filename):
        from sys import platform
        if platform == "linux" or platform == "linux2":
            # linux
            generate_a_pdg = subprocess.Popen([self.location, '.', '.' + filename.replace('/', '\\')],
                                              bufsize=1, cwd=self.repository_location)
            generate_a_pdg.wait()
        elif platform == "win32":
            # Windows...
            generate_a_pdg = subprocess.Popen([self.location, '.', '.' + filename.replace('/', '\\')], bufsize=1,
                                              cwd=self.repository_location)
            generate_a_pdg.wait()

        try:
            shutil.move(os.path.join(self.repository_location, 'pdg.dot'),
                        os.path.join(self.target_location, self.target_filename))
        except FileNotFoundError:
            with open(os.path.join(self.target_location, self.target_filename), 'w') as f:
                f.write('digraph "extractedGraph"{\n}\n')

        try:
            # shutil.move(os.path.join(self.repository_location, 'nameflows.json'),
            # os.path.join(self.target_location, 'nameflows_' + self.target_filename.split('.')[0] + '.json'))
            with open(os.path.join(self.repository_location, 'nameflows.json'), encoding='utf-8-sig') as json_data:
                nameflow_data = json.loads(json_data.read())

            # Normalise the nameflow json and, if it parsed, add nameflow edges to the PDG
            if nameflow_data is not None:
                for node in nameflow_data['nodes']:
                    file, line = node['Location'].split(' : ')
                    node['Location'] = (file[len(self.repository_location):]
                                        if self.repository_location in file
                                        else file,
                                        line)
                    node['Infile'] = \
                        os.path.normcase(os.path.normpath(filename)) == os.path.normcase(os.path.normpath(file[1:]))

                # json.loads maps JSON null to None; replace missing relation lists with empty ones
                nameflow_data['relations'] = [[] if v is None else v for v in nameflow_data['relations']]

                # And add nameflow edges
                apdg = obj_dict_to_networkx(read_graph_from_dot(os.path.join(self.target_location, self.target_filename)))
                apdg = add_nameflow_edges(nameflow_data, apdg)
                nx.drawing.nx_pydot.write_dot(apdg, os.path.join(self.target_location, self.target_filename))

        except FileNotFoundError:
            # No file, nothing to add
            pass
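The __call__ above reads self.location, self.repository_location, self.target_location and self.target_filename; a minimal sketch of a wrapper class exposing those attributes is shown below. The class and constructor names are assumptions, only the attribute names come from the method body.

class PdgGenerator:  # hypothetical wrapper; only the attributes used by __call__ are set
    def __init__(self, extractor_location, repository_location, target_location, target_filename):
        self.location = extractor_location            # path to the PDG-extraction executable
        self.repository_location = repository_location
        self.target_location = target_location
        self.target_filename = target_filename

# Example (paths are placeholders):
# generate = PdgGenerator('./extractor.exe', './repo', './out/gui.cs', 'Core.cs.dot')
# generate('Core.cs')   # runs the extractor, then moves pdg.dot to ./out/gui.cs/Core.cs.dot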
Example #4
    def worker(work):
        for graph_location in tqdm(work, leave=False):
            chain = os.path.basename(
                os.path.dirname(os.path.dirname(graph_location)))
            q = int(os.path.basename(os.path.dirname(graph_location)))
            graph = obj_dict_to_networkx(read_graph_from_dot(graph_location))

            t0 = time.process_time()
            for i in range(times):
                DU_chains = extract_DU_chains_from_delta(graph)
                closure = closure_of_DU_on_diff(DU_chains)
            t1 = time.process_time()
            time_ = (t1 - t0) / times

            truth = list()
            label = list()
            for node, data in graph.nodes(data=True):
                if 'color' in data.keys():
                    if 'community' in data.keys():
                        truth.append(int(data['community']))
                    else:
                        truth.append(0)

                    try:
                        label.append(int(closure.nodes[node]['prediction']))
                    except KeyError:
                        label.append(-1)
            nx.drawing.nx_pydot.write_dot(closure,
                                          graph_location[:-4] + '_closure.dot')
            truth = np.asarray(truth)
            label = np.asarray(label)
            acc, overlap = evaluate(truth[label > -1],
                                    label[label > -1],
                                    q=max(q, np.max(label) + 1) if len(label) > 0 else q)
            cover = len(label[label > -1]) / len(label) if len(label) > 0 else 0.0
            with open('./out/%s/du_results_raw.csv' % repository_name,
                      'a') as f:
                f.write(chain + ',' + str(q) + ',' + str(acc) + ',' +
                        str(overlap) + ',' + str(cover) + ',' + str(time_) +
                        '\n')
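worker expects a chunk of graph locations; a hedged dispatch sketch is shown below. The glob pattern, the thread count, and the even chunking are assumptions, not part of the example above.

import glob
from threading import Thread

# Hypothetical dispatch: shard the graph locations across a few worker threads.
locations = glob.glob('./data/corpora_clean/%s/*/*/merged.dot' % repository_name)
n_workers = 4
chunks = [locations[i::n_workers] for i in range(n_workers)]
threads = [Thread(target=worker, args=(chunk,)) for chunk in chunks]
for t in threads:
    t.start()
for t in threads:
    t.join()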
Example #5
def worker(all_graph_locations, corpus_name):
    for graph_location in all_graph_locations:
        data_point_name = os.path.basename(
            os.path.dirname(os.path.dirname(graph_location)))
        if os.path.exists(
                os.path.join('.', 'data', 'corpora_clean', corpus_name,
                             data_point_name)):
            print('[Scan and clean] Skipping %s as it exists' % data_point_name)
            continue  # already cleaned; move on to the next graph location
        print('[Scan and clean] Cleaning data-point %s' % data_point_name)

        try:
            graph = obj_dict_to_networkx(read_graph_from_dot(graph_location))
        except (TypeError, ValueError):
            continue

        # Get actual number of communities
        communities = set()
        for node, data in list(graph.nodes(data=True)):
            if 'community' in data.keys():
                communities.add(data['community'])
            if 'color' in data.keys() and 'community' not in data.keys():
                communities.add('0')
                graph.nodes[node]['community'] = '0'
        communities = sorted(list(communities))

        nr_concepts = str(len(communities))

        if len(communities) > 0:
            # Normalise labels
            for node, data in list(graph.nodes(data=True)):
                if 'community' in data.keys():
                    graph.nodes[node]['community'] = communities.index(
                        data['community'])

            output_path = os.path.join('.', 'data', 'corpora_clean',
                                       corpus_name, data_point_name,
                                       nr_concepts, 'merged.dot')
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
            nx.drawing.nx_pydot.write_dot(graph, output_path)
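A hedged sketch of how this worker might be invoked; the glob pattern over the raw corpus and the corpus name are assumptions.

import glob

# Hypothetical invocation: gather raw merged graphs and clean them into corpora_clean.
corpus_name = 'gui.cs'  # placeholder corpus/repository name
all_graph_locations = sorted(glob.glob('./data/corpora_raw/%s/*/*/merged.dot' % corpus_name))
worker(all_graph_locations, corpus_name)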
Example #6
def merge_deltas_for_a_commit(graph_locations):
    # We will use the file attribute to track original files so that diff intersection can be made to work
    original_file = os.path.basename(graph_locations[0])
    # We will take the first graph as a base and add the rest onto it
    graph = obj_dict_to_networkx(read_graph_from_dot(graph_locations[0]))
    contexts = get_context_from_nxgraph(graph)
    output = graph.copy()
    graph_locations = graph_locations[1:]
    for i, graph_location in enumerate(graph_locations):
        next_graph = obj_dict_to_networkx(read_graph_from_dot(graph_location))
        next_contexts = get_context_from_nxgraph(next_graph)
        # First find the contexts that exist in both
        mappable_contexts = list()
        for next_context, current_context in itertools.product(
                set(next_contexts.values()), set(contexts.values())):
            if next_context == current_context and next_context != 'lambda expression':
                mappable_contexts.append(current_context)
                break

        copied_nodes = list()
        mapped_nodes = list()
        # And copy over all of the nodes into the merged representation
        for context in mappable_contexts:
            current_entry, current_exit = find_entry_and_exit(context, graph)
            other_entry, other_exit = find_entry_and_exit(context, next_graph)

            if current_entry is not None and other_entry is not None:
                mapped_nodes.append((str(current_entry), str(other_entry)))
            if current_exit is not None and other_exit is not None:
                mapped_nodes.append((str(current_exit), str(other_exit)))

            other_nodes = [
                n for n in next_graph.nodes(data=True)
                if n[0] not in [other_entry, other_exit]
                and 'cluster' in n[1].keys() and n[1]['cluster'] == context
            ]
            if current_entry is None and other_entry is not None:
                other_nodes.append((other_entry, next_graph.nodes[other_entry]))
            if current_exit is None and other_exit is not None:
                other_nodes.append((other_exit, next_graph.nodes[other_exit]))

            if len(other_nodes) > 0:
                if current_entry is not None and 'file' not in graph.nodes[
                        current_entry].keys():
                    graph.nodes[current_entry]['file'] = os.path.basename(
                        graph_location[:-len('.dot')])
                if current_exit is not None and 'file' not in graph.nodes[
                        current_exit].keys():
                    graph.nodes[current_exit]['file'] = os.path.basename(
                        graph_location[:-len('.dot')])

            for copy_node, data in other_nodes:
                data['file'] = os.path.basename(graph_location[:-len('.dot')])
                output.add_node('m%d_' % i + copy_node[1:], **data)
                copied_nodes.append(('m%d_' % i + copy_node[1:], copy_node))

        # Now we copy over all of the contexts that did not map/exist in the merged representation
        for other_context in [
                c for c in set(next_contexts.values())
                if c not in mappable_contexts
        ]:
            other_entry, other_exit = find_entry_and_exit(
                other_context, next_graph)
            other_nodes = [
                n for n in next_graph.nodes(data=True)
                if n[0] not in [other_entry, other_exit] and 'cluster' in
                n[1].keys() and n[1]['cluster'] == other_context
            ]
            # For aesthetic reasons make sure to copy entry first and exit last
            if other_entry is not None:
                other_nodes = [(other_entry, next_graph.nodes[other_entry])] + other_nodes
            if other_exit is not None:
                other_nodes.append((other_exit, next_graph.nodes[other_exit]))
            for copy_node, data in other_nodes:
                data['file'] = os.path.basename(graph_location[:-len('.dot')])
                output.add_node('m%d_' % i + copy_node[1:], **data)
                copied_nodes.append(('m%d_' % i + copy_node[1:], copy_node))

        # Finally we copy over all of the nodes w/o a context
        for copy_node, data in [
                n for n in next_graph.nodes(data=True)
                if n[0] not in next_contexts.keys()
        ]:
            data['file'] = os.path.basename(graph_location[:-len('.dot')])
            output.add_node('m%d_' % i + copy_node[1:], **data)
            copied_nodes.append(('m%d_' % i + copy_node[1:], copy_node))

        # We move over the edges making sure we properly map the ends
        reverse_map = {v: u for u, v in copied_nodes + mapped_nodes}
        for copied_node, original_node in copied_nodes:
            for s, t, k in next_graph.edges(nbunch=[original_node], keys=True):
                try:
                    if s in reverse_map.keys() and t in reverse_map.keys():
                        if output.has_node(reverse_map[s]) and output.has_node(
                                reverse_map[t]):
                            output.add_edge(reverse_map[s],
                                            reverse_map[t],
                                            key=k,
                                            **next_graph[s][t][k])
                except KeyError:
                    pass

    # And finally we mark the original file nodes (in the merged graph that we return)
    for node, _ in [
            n for n in output.nodes(data=True) if 'file' not in n[1].keys()
    ]:
        output.nodes[node]['file'] = original_file

    return output
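A driver sketch for merge_deltas_for_a_commit; the per-commit directory layout and the output location are assumptions for illustration.

import glob
import networkx as nx

# Hypothetical driver: merge the per-file deltaPDGs of one commit into a single graph.
graph_locations = sorted(glob.glob('./out/gui.cs/commit_abc123/*.dot'))  # assumed layout
if graph_locations:
    merged = merge_deltas_for_a_commit(graph_locations)
    nx.drawing.nx_pydot.write_dot(merged, './out/gui.cs/commit_abc123/merged.dot')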
Example #7
            target] if target in name_alias_for_edges.keys() else target
        try:
            if output.has_node(output_source) and output.has_node(
                    output_target):
                output.add_edge(output_source,
                                output_target,
                                key=key,
                                **graph[source][target][key])
        except KeyError:
            # This should never happen!
            pass

    # Add change and community data back in
    for n in output.nodes:
        if 'color' in graph.nodes[n].keys():
            output.nodes[n]['color'] = graph.nodes[n]['color']
        if 'community' in graph.nodes[n].keys():
            output.nodes[n]['community'] = graph.nodes[n]['community']

    return output


if __name__ == '__main__':
    from deltaPDG.Util.pygraph_util import read_graph_from_dot, obj_dict_to_networkx

    graph = obj_dict_to_networkx(
        read_graph_from_dot('./out/gui.cs/Core.cs.dot'))
    compressed = compress_delta(graph)
    nx.drawing.nx_pydot.write_dot(compressed,
                                  './out/gui.cs/compressed_Core.cs.dot')
Example #8
    def worker(work):
        for graph_location in tqdm(work, leave=False):
            chain = os.path.basename(
                os.path.dirname(os.path.dirname(graph_location)))
            q = int(os.path.basename(os.path.dirname(graph_location)))
            graph = obj_dict_to_networkx(read_graph_from_dot(graph_location))
            graph = remove_all_except(graph, edges_kept)

            if len(graph.nodes) == 0:
                continue

            t0 = time.perf_counter()
            for i in range(times):
                seeds, list_of_graphs = deltaPDG_to_list_of_Graphs(
                    graph, khop_k=k_hop)
                wl_subtree = GraphKernel(kernel=[{"name": "weisfeiler_lehman", "n_iter": 10},
                                                 {"name": "subtree_wl"}],
                                         normalize=True)
                if len(list_of_graphs) > 0:
                    similarities = defaultdict(lambda: (0, 0.0))
                    for g1, g2 in itertools.combinations(list_of_graphs, 2):
                        # The graph has to be converted to {Graph, Node_Labels, Edge_Labels}
                        wl_subtree.fit([
                            graph_to_grakel(g1, with_data, with_call,
                                            with_name)
                        ])
                        similarity = wl_subtree.transform([
                            graph_to_grakel(g2, with_data, with_call,
                                            with_name)
                        ])[0][0]
                        similarities[(list_of_graphs.index(g1),
                                      list_of_graphs.index(g2))] = similarity

                    n = len(list_of_graphs)
                    affinity = np.zeros(
                        shape=(scipy.special.comb(n, 2, exact=True), ))
                    args = list(enumerate(itertools.combinations(range(n), 2)))
                    with ThreadPool(processes=min(os.cpu_count() - 1, 1)) as wp:
                        for k, value in wp.imap_unordered(
                                lambda arg: (arg[0], similarities[(arg[-1][0], arg[-1][1])]),
                                args):
                            affinity[k] += (1 - value)  # affinity is a distance, so use (1 - similarity)

                    cluster = AgglomerativeClustering(n_clusters=None,
                                                      distance_threshold=0.5,
                                                      affinity='precomputed',
                                                      linkage='complete')
                    if len(affinity) < 2:
                        if len(affinity) == 1:
                            labels = np.asarray([0, 0]) if affinity[0] <= 0.5 else np.asarray([0, 1])
                        else:
                            labels = np.asarray([0])
                    else:
                        labels = cluster.fit_predict(
                            scipy.spatial.distance.squareform(affinity))
                else:
                    labels = None
            t1 = time.perf_counter()
            time_ = (t1 - t0) / times

            truth = list()
            label = list()
            for node, data in graph.nodes(data=True):
                if 'color' in data.keys():
                    if 'community' in data.keys():
                        truth.append(int(data['community']))
                        i = seeds.index(node) if node in seeds else -1

                        if labels is not None and i != -1:
                            data['label'] = '%d: ' % labels[i] + data['label']
                            label.append(labels[i])
                            graph.add_node(node, **data)
                        else:
                            data['label'] = '-1: ' + data['label']
                            label.append(-1)
                            graph.add_node(node, **data)

            nx.drawing.nx_pydot.write_dot(
                graph, graph_location[:-4] + '_output_wl_%d.dot' % k_hop)

            truth = np.asarray(truth)
            label = np.asarray(label)
            acc, overlap = evaluate(truth[label > -1],
                                    label[label > -1],
                                    q=1 if len(label) == 0 else np.max(label) + 1)
            with open('./out/%s/wl_%s_%d_results_%s.csv' %
                      (repository_name, edges_kept, k_hop, suffix), 'a') as f:
                f.write(chain + ',' + str(q) + ',' + str(acc) + ',' +
                        str(overlap) + ',' + str(time_) + '\n')
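graph_to_grakel itself is not shown above; the in-code comment only says each graph has to be converted to {Graph, Node_Labels, Edge_Labels}. A minimal sketch of what such a conversion could look like with grakel's Graph class follows; the attribute names ('label', 'style') and the fallback values are assumptions, and the real helper also honours the with_data/with_call/with_name flags, which this sketch omits.

from grakel import Graph

def graph_to_grakel_sketch(nx_graph):
    # Hypothetical stand-in for graph_to_grakel: networkx graph -> grakel Graph.
    # Edges become an edge dictionary, node labels come from the dot 'label'
    # attribute, and edge labels fall back to the dot 'style' attribute.
    edges = {(u, v): 1.0 for u, v in nx_graph.edges()}
    node_labels = {n: d.get('label', '') for n, d in nx_graph.nodes(data=True)}
    edge_labels = {(u, v): d.get('style', 'edge') for u, v, d in nx_graph.edges(data=True)}
    return Graph(edges, node_labels=node_labels, edge_labels=edge_labels)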