Esempio n. 1
0
def cluster_testing(dataset, language="java_python"):
  """
  Run the similarity computation once for each clustering error threshold.

  The functions of `dataset` are loaded a single time (the result of
  `similarity.load_functions` concatenated with the result of
  `similarity.load_py_functions`) and reused for every threshold. Each run
  writes into its own "eps_<threshold>" sub-folder of
  "<clusters folder>/cluster_testing".

  :param dataset: Dataset identifier, forwarded to the loaders, to
    `lib.get_clusters_folder` and to `similarity.compute_similarity`.
  :param language: Language key forwarded to `similarity.compute_similarity`.
  :return: None
  """
  LOGGER.info("Testing different cluster sizes for dataset '%s' and language '%s'" % (dataset, language))
  loaded = similarity.load_functions(dataset)
  py_loaded = similarity.load_py_functions(dataset)
  all_functions = loaded + py_loaded
  testing_root = os.path.join(lib.get_clusters_folder(dataset), "cluster_testing")
  # Thresholds tried, in order; each one also names its result folder.
  for eps in (0.01, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30):
    similarity.compute_similarity(
        dataset, language,
        functions=all_functions,
        base_folder=os.path.join(testing_root, "eps_%0.2f" % eps),
        clustering_error=eps,
    )
Esempio n. 2
0
def get_cross_val(dataset, n_folds, as_dict=False):
  """
  Split the test outputs of every loaded function into `n_folds` contiguous folds.

  The functions are loaded with `is_test=True` from both
  `similarity.load_functions` and `similarity.load_py_functions`. The fold
  size is `len(outputs.returns) // n_folds`, measured on the first loaded
  function, so any trailing samples beyond `fold_size * n_folds` are not
  placed in any fold.
  NOTE(review): this assumes every function has at least as many outputs as
  the first one - confirm against the loaders.

  :param dataset: Dataset identifier forwarded to the loaders.
  :param n_folds: Number of folds to create.
  :param as_dict: If True each fold is a dict mapping `clone.name` to the
    clone; otherwise each fold is a list of clones.
  :return: List with one fold per index in `range(n_folds)`. Each fold holds
    one deep clone of every function, whose `outputs` is
    `outputs.subset(start, end)` of that same function.
  """
  functions = similarity.load_functions(dataset, is_test=True) + similarity.load_py_functions(dataset, is_test=True)
  if not functions:
    # Nothing was loaded, so there is no output to size the folds from;
    # return empty folds rather than failing on the first-element lookup.
    return [{} if as_dict else [] for _ in range(n_folds)]
  all_outputs = [func.outputs for func in functions]
  fold_size = len(all_outputs[0].returns) // n_folds
  folds = []
  for i in range(n_folds):
    fold = {} if as_dict else []
    start, end = i * fold_size, (i + 1) * fold_size
    # Pair each function with its own outputs. Indexing `all_outputs` by the
    # fold number would give every clone in a fold the same function's
    # outputs and overflow when n_folds exceeds the number of functions.
    for func, outputs in zip(functions, all_outputs):
      clone = func.deep_clone()
      clone.outputs = outputs.subset(start, end)
      if as_dict:
        fold[clone.name] = clone
      else:
        fold.append(clone)
    folds.append(fold)
  return folds