Exemple #1
0
def cluster(clustering_error):
    """
    Cluster the loaded pandas and R functions and persist the outcome.

    Functions are loaded from PD_FUNCTIONS_PATH and R_FUNCTIONS_PATH, handed
    to a RepresentativeClusterer configured with execution_distance, and the
    returned clusters are pickled. A markdown report with size statistics is
    written next to the pickle. All outputs live in a sub-folder of
    BASE_CLUSTER_FOLDER named after the clustering error formatted with
    two decimals.

    :param clustering_error: Value forwarded to the clusterer as
      ``clustering_error``; also used to name the output folder.
    :return: None
    """
    all_functions = load_functions(PD_FUNCTIONS_PATH, "pandas")
    all_functions.update(load_functions(R_FUNCTIONS_PATH, "R"))

    stem = "clusters"
    out_dir = os.path.join(BASE_CLUSTER_FOLDER, "%0.02f" % clustering_error)
    cache.mkdir(out_dir)

    def out_path(suffix):
        # Every artefact shares the same stem and differs only by extension.
        return os.path.join(out_dir, stem + suffix)

    txt_path = out_path(".txt")
    pkl_path = out_path(".pkl")
    report_path = out_path(".md")

    rep_clusterer = RepresentativeClusterer(
        all_functions.values(), distance_function=execution_distance)
    clusters = rep_clusterer.cluster(
        txt_path, skip_singles=True, clustering_error=clustering_error)
    cache.save_pickle(pkl_path, clusters)

    # NOTE(review): the cluster count below includes a -1 label if one is
    # present, while the sizes exclude it -- confirm this is intended.
    cluster_count = len(clusters)
    cluster_sizes = [
        len(members) for label, members in clusters.items() if label != -1
    ]
    clustered_total = sum(cluster_sizes)

    report = "".join([
        "## Cluster sizes\n",
        "* Number of clusters: %d\n" % cluster_count,
        "* Number of functions clustered: %d\n" % clustered_total,
        "* Number of functions not clustered: %d\n\n" % (
            len(all_functions) - clustered_total),
        "## REPORT\n",
        stat.Stat(cluster_sizes).report(),
    ])
    cache.write_file(report_path, report)
Exemple #2
0
def export_runner_ast(xl_path):
    """
    Export the four "d_ast" similar-difference tables into one Excel workbook.

    The workbook is created at ``props.EXPORT_HOME/xl_path`` using the
    xlsxwriter engine. ``export_similar_differences`` is invoked once per
    (first threshold, second threshold, label) combination listed below, in
    that order, all sharing the same writer.

    The writer is used as a context manager so that it is closed (and the
    workbook flushed) exactly once, even if one of the exports raises. This
    replaces the former ``writer.save()`` + ``writer.close()`` pair:
    ``ExcelWriter.save()`` is deprecated/removed in recent pandas and
    ``close()`` already saves.

    :param xl_path: File name (relative to ``props.EXPORT_HOME``) of the
      workbook to write.
    :return: None
    """
    cache.mkdir(props.EXPORT_HOME)
    # (threshold, threshold, label) triples forwarded verbatim; the labels
    # are presumably used as sheet names -- verify against
    # export_similar_differences.
    exports = (
        (0.9, -9, "HighSim-HighSyn"),
        (0.9, 22, "HighSim-LowSyn"),
        (-0.1, -9, "LowSim-HighSyn"),
        (-0.1, 22, "LowSim-LowSyn"),
    )
    workbook_path = os.path.join(props.EXPORT_HOME, xl_path)
    with pd.ExcelWriter(workbook_path, engine='xlsxwriter') as writer:
        for first, second, label in exports:
            export_similar_differences(first, second, writer, label, "d_ast")
Exemple #3
0
def remove_overlapping_clusters(dataset, language="java_python"):
    """
    Drop overlapping functions from each cluster and save what remains.

    Clusters are read from ``<META_RESULTS_FOLDER>/<dataset>/clusters/
    <language>.pkl``. Within each cluster (label -1 and single-function
    clusters are skipped) functions are visited in order; a function that
    ``overlaps`` an already kept one is not added, but replaces that kept
    function in place when ``is_more_succinct`` says so. Only the first
    overlapping kept function is considered. Clusters left with more than one
    function are pickled, saved to the DB and to a text file, and summarised
    in a markdown report under the ``non_overlapping`` sub-folder.

    :param dataset: Name of dataset
    :param language: Base name of the clusters pickle, eg. java_python
    :return: None
    """
    # TODO: Think about how to remove syntactic equivalence
    store = mongo_store.FunctionStore(dataset)
    clusters_dir = os.path.join(properties.META_RESULTS_FOLDER, dataset,
                                "clusters")
    clusters = cache.load_pickle(
        os.path.join(clusters_dir, "%s.pkl" % language))

    pruned_clusters = {}
    for label, functions in clusters.items():
        if label == -1 or len(functions) == 1:
            continue
        survivors = []
        meta_by_name = {}
        for candidate in functions:
            candidate_meta = store.load_metadata({"name": candidate.base_name})
            meta_by_name[candidate.base_name] = candidate_meta
            for idx, kept in enumerate(survivors):
                kept_meta = meta_by_name[kept.base_name]
                if not overlaps(candidate_meta, kept_meta):
                    continue
                # First overlap decides: keep whichever is more succinct.
                if is_more_succinct(candidate_meta, kept_meta):
                    survivors[idx] = candidate
                break
            else:
                # No overlap with anything kept so far (or nothing kept yet).
                survivors.append(candidate)
        if len(survivors) > 1:
            pruned_clusters[label] = survivors

    write_folder = os.path.join(clusters_dir, "non_overlapping")
    cache.mkdir(write_folder)
    txt_path = os.path.join(write_folder, "%s.txt" % language)
    pkl_path = os.path.join(write_folder, "%s.pkl" % language)
    report_path = os.path.join(write_folder, "%s.md" % language)

    cache.save_pickle(pkl_path, pruned_clusters)
    clusterer.save_clusters_to_db(dataset, pruned_clusters, "non_overlapping")
    clusterer.save_clusters_to_txt(pruned_clusters, txt_path)

    cluster_sizes = [
        len(members) for label, members in pruned_clusters.items()
        if label != -1
    ]
    report = "".join([
        "## Cluster sizes\n",
        "* Number of clusters: %d\n" % len(pruned_clusters),
        "* Number of functions clustered: %d\n" % sum(cluster_sizes),
        "## REPORT\n",
        Stat(cluster_sizes).report(),
    ])
    cache.write_file(report_path, report)
Exemple #4
0
def save_only_target_functions(dataset, mixed_file_base_name, target_language):
    """
    Save only the clusters made up entirely of target-language functions.

    For every sub-folder of ``<clusters folder>/cluster_testing`` the pickle
    ``<mixed_file_base_name>.pkl`` is loaded, and the clusters whose functions
    all have ``source == target_language`` are written to
    ``only_<target_language>.txt`` and ``only_<target_language>.pkl`` in that
    same sub-folder. Label -1 and single-function clusters are ignored.

    :param dataset: Name of dataset
    :param mixed_file_base_name: Type of language eg. java_python
    :param target_language: Target Language
    :return: None
    """
    clusters_base_folder = os.path.join(lib.get_clusters_folder(dataset),
                                        "cluster_testing")
    folder_names = sorted(
        cache.list_dir(clusters_base_folder, is_absolute=False))
    for folder in folder_names:
        LOGGER.info("Processing '%s' ..." % folder)
        folder_path = os.path.join(clusters_base_folder, folder)
        cache.mkdir(folder_path)
        base_clusters = cache.load_pickle(
            os.path.join(folder_path, "%s.pkl" % mixed_file_base_name))

        target_clusters = {}
        for label, functions in base_clusters.items():
            if label == -1 or len(functions) == 1:
                continue
            is_target = [func.source == target_language for func in functions]
            # At least one target function and no function from elsewhere.
            if any(is_target) and all(is_target):
                target_clusters[label] = functions
        LOGGER.info("For folder = %s, # of '%s' clusters = %d" %
                    (folder, target_language, len(target_clusters)))

        dump_lines = []
        for label, functions in target_clusters.items():
            dump_lines.append("\n\n****** Cluster %d ******" % label)
            dump_lines.extend(func.body for func in functions)
        cache.write_file(
            os.path.join(folder_path, "only_%s.txt" % target_language),
            "\n".join(dump_lines))
        cache.save_pickle(
            os.path.join(folder_path, "only_%s.pkl" % target_language),
            target_clusters)
Exemple #5
0
def save_only_mixed_clusters(dataset, mixed_file_base_name):
    """
    Save only the clusters whose functions come from more than one source.

    For every sub-folder of ``<clusters folder>/cluster_testing`` the pickle
    ``<mixed_file_base_name>.pkl`` is loaded, and the clusters containing
    functions with at least two distinct ``source`` values are written to
    ``only_mixed.txt`` and ``only_mixed.pkl`` in that same sub-folder.
    Label -1 and single-function clusters are ignored.

    :param dataset: Name of dataset
    :param mixed_file_base_name: Type of language eg. java_python
    :return: None
    """
    clusters_base_folder = os.path.join(lib.get_clusters_folder(dataset),
                                        "cluster_testing")
    folder_names = sorted(
        cache.list_dir(clusters_base_folder, is_absolute=False))
    for folder in folder_names:
        LOGGER.info("Processing '%s' ..." % folder)
        folder_path = os.path.join(clusters_base_folder, folder)
        cache.mkdir(folder_path)
        base_clusters = cache.load_pickle(
            os.path.join(folder_path, "%s.pkl" % mixed_file_base_name))

        mixed_clusters = {}
        for label, functions in base_clusters.items():
            if label == -1 or len(functions) == 1:
                continue
            distinct_sources = {func.source for func in functions}
            if len(distinct_sources) > 1:
                mixed_clusters[label] = functions
        LOGGER.info("For folder = %s, # of mixed clusters = %d" %
                    (folder, len(mixed_clusters)))

        dump_lines = []
        for label, functions in mixed_clusters.items():
            dump_lines.append("\n\n****** Cluster %d ******" % label)
            dump_lines.extend(func.body for func in functions)
        cache.write_file(os.path.join(folder_path, "only_mixed.txt"),
                         "\n".join(dump_lines))
        cache.save_pickle(os.path.join(folder_path, "only_mixed.pkl"),
                          mixed_clusters)