Code example #1
def random_testing(dataset, language="java_python", n_folds=10):
    LOGGER.info("Random testing with '%d' number of folds" % n_folds)
    folds = get_cross_val(dataset, n_folds, as_dict=True)
    base_folder = lib.get_clusters_folder(dataset)
    LOGGER.info("Loading pickle ...")
    cluster_path = get_cluster_path(dataset, language)
    clusters = cache.load_pickle(cluster_path)
    for index, fold in enumerate(folds):
        file_name = os.path.join(base_folder, "random_testing",
                                 "fold_%d" % index, "distances.pkl")
        cluster_distances = {}
        for label, functions in clusters.items():
            if label == -1:
                continue
            similarity_map = defaultdict(dict)
            for i in range(len(functions) - 1):
                for j in range(i + 1, len(functions)):
                    assert i != j
                    f_i, f_j = functions[i], functions[j]
                    distance = clusterer.execution_distance(
                        fold[f_i.name], fold[f_j.name])
                    similarity_map[f_i.name][f_j.name] = distance
                    similarity_map[f_j.name][f_i.name] = distance
            cluster_distances[label] = similarity_map
        cache.save_pickle(file_name, cluster_distances)
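A minimal usage sketch for random_testing follows; the dataset name "codejam" is an assumption (it appears only in the commented-out check in example #2), and the clusters pickle for the requested language pair is assumed to already exist at the path returned by get_cluster_path (example #6).

# Hypothetical invocation: "codejam" is an assumed dataset name, and the
# java_python clusters pickle is assumed to have been built beforehand
# (e.g. by compute_similarity in example #2).
random_testing("codejam", language="java_python", n_folds=10)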
Code example #2
File: similarity.py Project: Suvodeep90/SLACC
def compute_similarity(dataset, language=None, functions=None, base_folder=None, file_name=None,
                       skip_singles=False, update_clone_meta=False, clustering_error=0.01, cluster_suffix="base"):
  if not functions:
    if language == "java":
      functions = load_functions(dataset, update_clone_meta=update_clone_meta)
    elif language == "python":
      functions = load_py_functions(dataset)
    elif language == "java_python":
      functions = load_functions(dataset, update_clone_meta=update_clone_meta) + load_py_functions(dataset)
    else:
      raise RuntimeError("Invalid language: %s" % language)
    # if dataset not in ["codejam", "introclass"]:
    #   raise RuntimeError("Invalid dataset: %s" % dataset)
  LOGGER.info("Clustering ... ")
  if file_name is None:
    file_name = language or "clusters"
    LOGGER.warning("A @file_name is not provided. Reverting file name to '%s'" % file_name)
  if base_folder is None:
    base_folder = lib.get_clusters_folder(dataset)
  clusters_txt_file = os.path.join(base_folder, "%s.txt" % file_name)
  clusters_pkl_file = os.path.join(base_folder, "%s.pkl" % file_name)
  clusters_report_file = os.path.join(base_folder, "%s.md" % file_name)
  clusters = get_clusterer()(functions).cluster(clusters_txt_file, skip_singles=skip_singles, clustering_error=clustering_error)
  cache.save_pickle(clusters_pkl_file, clusters)
  clusterer.save_clusters_to_db(dataset, clusters, cluster_suffix)
  n_clusters = len(clusters)
  sizes = [len(cluster_funcs) for label, cluster_funcs in clusters.items() if label != -1]
  meta_data = "## Cluster sizes\n"
  meta_data += "* Number of clusters: %d\n" % n_clusters
  meta_data += "* Number of functions clustered: %d\n" % sum(sizes)
  meta_data += "* Number of functions not clustered: %d\n\n" % (len(functions) - sum(sizes))
  meta_data += "## REPORT\n"
  meta_data += Stat(sizes).report()
  cache.write_file(clusters_report_file, meta_data)
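For reference, a hedged sketch of a direct call to compute_similarity, mirroring how cluster_testing (example #4) invokes it; the dataset name "codejam" is assumed, and only parameters from the signature above are used.

# Assumed dataset name; functions are loaded inside compute_similarity
# because the functions argument is left at its default of None.
compute_similarity("codejam", language="java_python",
                   skip_singles=True, clustering_error=0.01,
                   cluster_suffix="base")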
Code example #3
File: analyze.py Project: nischalshrestha/CodeSeer
def save_only_mixed_clusters(dataset, mixed_file_base_name):
  """
  Save only clusters whose functions come from more than one source language
  :param dataset: Name of dataset
  :param mixed_file_base_name: Base name of the mixed-language clusters file, e.g. java_python
  :return:
  """
  clusters_base_folder = os.path.join(lib.get_clusters_folder(dataset), "cluster_testing")
  for folder in sorted(cache.list_dir(clusters_base_folder, is_absolute=False)):
    LOGGER.info("Processing '%s' ..." % folder)
    folder_path = os.path.join(clusters_base_folder, folder)
    base_clusters_file = os.path.join(folder_path, "%s.pkl" % mixed_file_base_name)
    base_clusters = cache.load_pickle(base_clusters_file)
    mixed_clusters = {}
    for label, functions in base_clusters.items():
      if label == -1 or len(functions) == 1: continue
      sources = set()
      for func in functions:
        sources.add(func.source)
      if len(sources) > 1:
        mixed_clusters[label] = functions
    LOGGER.info("For folder = %s, # of mixed clusters = %d" % (folder, len(mixed_clusters)))
    file_path = os.path.join(folder_path, "only_mixed.txt")
    pkl_path = os.path.join(folder_path, "only_mixed.pkl")
    file_contents = []
    for label, functions in mixed_clusters.items():
      file_contents.append("\n\n****** Cluster %d ******" % label)
      for func in functions:
        file_contents.append(func.body)
    cache.write_file(file_path, "\n".join(file_contents))
    cache.save_pickle(pkl_path, mixed_clusters)
Code example #4
File: analyze.py Project: nischalshrestha/CodeSeer
def cluster_testing(dataset, language="java_python"):
  LOGGER.info("Testing different cluster sizes for dataset '%s' and language '%s'" % (dataset, language))
  functions = similarity.load_functions(dataset) + similarity.load_py_functions(dataset)
  errors = [0.01, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30]
  base_folder = os.path.join(lib.get_clusters_folder(dataset), "cluster_testing")
  for clustering_error in errors:
    result_folder = os.path.join(base_folder, "eps_%0.2f" % clustering_error)
    similarity.compute_similarity(dataset, language, functions=functions, base_folder=result_folder,
                                  clustering_error=clustering_error)
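A usage sketch, assuming "codejam" as the dataset name; each clustering_error value writes its own set of results into a separate eps_* subfolder, which the functions in examples #3 and #5 then post-process.

# "codejam" is an assumed dataset name; results land in
# <clusters folder>/cluster_testing/eps_0.01 ... eps_0.30.
cluster_testing("codejam", language="java_python")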
Code example #5
def save_only_target_functions(dataset, mixed_file_base_name, target_language):
    """
  Save only java functions from a mixture of java and python clusters
  :param dataset: Name of dataset
  :param mixed_file_base_name: Type of language eg. java_python
  :param target_language: Target Language
  :return:
  """
    clusters_base_folder = os.path.join(lib.get_clusters_folder(dataset),
                                        "cluster_testing")
    for folder in sorted(
            cache.list_dir(clusters_base_folder, is_absolute=False)):
        LOGGER.info("Processing '%s' ..." % folder)
        folder_path = os.path.join(clusters_base_folder, folder)
        cache.mkdir(folder_path)
        base_clusters_file = os.path.join(folder_path,
                                          "%s.pkl" % mixed_file_base_name)
        base_clusters = cache.load_pickle(base_clusters_file)
        target_clusters = {}
        for label, functions in base_clusters.items():
            if label == -1 or len(functions) == 1: continue
            contains_target = False
            contains_other = False
            for func in functions:
                if func.source == target_language:
                    contains_target = True
                else:
                    contains_other = True
            if contains_target and not contains_other:
                target_clusters[label] = functions
        LOGGER.info("For folder = %s, # of '%s' clusters = %d" %
                    (folder, target_language, len(target_clusters)))
        file_path = os.path.join(folder_path, "only_%s.txt" % target_language)
        pkl_path = os.path.join(folder_path, "only_%s.pkl" % target_language)
        file_contents = []
        for label, functions in target_clusters.items():
            file_contents.append("\n\n****** Cluster %d ******" % label)
            for func in functions:
                file_contents.append(func.body)
        cache.write_file(file_path, "\n".join(file_contents))
        cache.save_pickle(pkl_path, target_clusters)
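A usage sketch with an assumed dataset name; the base name "java_python" matches the pickle that compute_similarity writes into each eps_* folder during cluster_testing (example #4).

# Assumed dataset name; keeps only clusters whose functions all come from java.
save_only_target_functions("codejam", "java_python", "java")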
Code example #6
def get_cluster_path(dataset, language):
    return os.path.join(lib.get_clusters_folder(dataset), "%s.pkl" % language)
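A usage sketch, again with an assumed dataset name; the returned path is the pickle that random_testing (example #1) loads.

# Assumed dataset name; resolves to <clusters folder for "codejam">/java_python.pkl
cluster_path = get_cluster_path("codejam", "java_python")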
Code example #7
File: clusterer.py Project: Suvodeep90/SLACC
def _transfer_clusters(dataset):
    file_name = os.path.join(lib.get_clusters_folder(dataset), "java.pkl")
    clusters = load_clusters_from_pkl(file_name)
    save_clusters_to_db(dataset, clusters, "base")