Example #1
def random_testing(dataset, language="java_python", n_folds=10):
    LOGGER.info("Random testing with '%d' number of folds" % n_folds)
    folds = get_cross_val(dataset, n_folds, as_dict=True)
    base_folder = lib.get_clusters_folder(dataset)
    LOGGER.info("Loading pickle ...")
    cluster_path = get_cluster_path(dataset, language)
    clusters = cache.load_pickle(cluster_path)
    for index, fold in enumerate(folds):
        file_name = os.path.join(base_folder, "random_testing",
                                 "fold_%d" % index, "distances.pkl")
        cluster_distances = {}
        for label, functions in clusters.items():
            if label == -1:
                continue
            similarity_map = defaultdict(dict)
            for i in range(len(functions) - 1):
                for j in range(i + 1, len(functions)):
                    assert i != j
                    f_i, f_j = functions[i], functions[j]
                    distance = clusterer.execution_distance(
                        fold[f_i.name], fold[f_j.name])
                    similarity_map[f_i.name][f_j.name] = distance
                    similarity_map[f_j.name][f_i.name] = distance
            cluster_distances[label] = similarity_map
        cache.save_pickle(file_name, cluster_distances)
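
The nested i/j loops above build a symmetric distance map for every pair of functions in a cluster. Below is a minimal, self-contained sketch of that pairwise pattern; the names and the toy distance function are illustrative stand-ins for the repo's clusterer.execution_distance and fold data:

from collections import defaultdict

def pairwise_distances(names, executions, distance):
    # Fill a symmetric map: similarity_map[a][b] == similarity_map[b][a].
    similarity_map = defaultdict(dict)
    for i in range(len(names) - 1):
        for j in range(i + 1, len(names)):
            a, b = names[i], names[j]
            d = distance(executions[a], executions[b])
            similarity_map[a][b] = d
            similarity_map[b][a] = d
    return similarity_map

# Toy usage: integer "executions" compared by absolute difference.
dists = pairwise_distances(["f", "g", "h"], {"f": 1, "g": 4, "h": 6},
                           lambda x, y: abs(x - y))
assert dists["f"]["g"] == dists["g"]["f"] == 3
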
Example #2
def compute_similarity(dataset, language=None, functions=None, base_folder=None, file_name=None,
                       skip_singles=False, update_clone_meta=False, clustering_error=0.01, cluster_suffix="base"):
  if not functions:
    if language == "java":
      functions = load_functions(dataset, update_clone_meta=update_clone_meta)
    elif language == "python":
      functions = load_py_functions(dataset)
    elif language == "java_python":
      functions = load_functions(dataset, update_clone_meta=update_clone_meta) + load_py_functions(dataset)
    else:
      raise RuntimeError("Invalid language: %s" % language)
    # if dataset not in ["codejam", "introclass"]:
    #   raise RuntimeError("Invalid dataset: %s" % dataset)
  LOGGER.info("Clustering ... ")
  if file_name is None:
    file_name = language or "clusters"
    LOGGER.warning("A @file_name is not provided. Reverting file name to '%s'" % file_name)
  if base_folder is None:
    base_folder = lib.get_clusters_folder(dataset)
  clusters_txt_file = os.path.join(base_folder, "%s.txt" % file_name)
  clusters_pkl_file = os.path.join(base_folder, "%s.pkl" % file_name)
  clusters_report_file = os.path.join(base_folder, "%s.md" % file_name)
  clusters = get_clusterer()(functions).cluster(clusters_txt_file, skip_singles=skip_singles, clustering_error=clustering_error)
  cache.save_pickle(clusters_pkl_file, clusters)
  clusterer.save_clusters_to_db(dataset, clusters, cluster_suffix)
  n_clusters = len(clusters)
  sizes = [len(cluster_funcs) for label, cluster_funcs in clusters.items() if label != -1]
  meta_data = "## Cluster sizes\n"
  meta_data += "* Number of clusters: %d\n" % n_clusters
  meta_data += "* Number of functions clustered: %d\n" % sum(sizes)
  meta_data += "* Number of functions not clustered: %d\n\n" % (len(functions) - sum(sizes))
  meta_data += "## REPORT\n"
  meta_data += Stat(sizes).report()
  cache.write_file(clusters_report_file, meta_data)
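
Every clustering example here ends with the same reporting step: collect the sizes of the non-noise clusters (label -1 is the unclustered bucket) and render a short markdown summary. A self-contained sketch of that step, with the repo-internal Stat(...).report() replaced by a simple mean:

def cluster_report(clusters, n_functions):
    # Label -1 holds unclustered functions and is excluded from the sizes.
    sizes = [len(funcs) for label, funcs in clusters.items() if label != -1]
    report = "## Cluster sizes\n"
    report += "* Number of clusters: %d\n" % len(clusters)
    report += "* Number of functions clustered: %d\n" % sum(sizes)
    report += "* Number of functions not clustered: %d\n\n" % (n_functions - sum(sizes))
    report += "## REPORT\n"
    report += "* Mean cluster size: %.2f\n" % (sum(sizes) / max(len(sizes), 1))
    return report

print(cluster_report({-1: ["a"], 0: ["b", "c"], 1: ["d", "e", "f"]}, 6))
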
Example #3
def save_only_mixed_clusters(dataset, mixed_file_base_name):
  """
  Save only the clusters that mix functions from more than one source language
  :param dataset: Name of dataset
  :param mixed_file_base_name: Base name of the mixed-language clusters file, e.g. java_python
  :return:
  """
  clusters_base_folder = os.path.join(lib.get_clusters_folder(dataset), "cluster_testing")
  for folder in sorted(cache.list_dir(clusters_base_folder, is_absolute=False)):
    LOGGER.info("Processing '%s' ..." % folder)
    folder_path = os.path.join(clusters_base_folder, folder)
    base_clusters_file = os.path.join(folder_path, "%s.pkl" % mixed_file_base_name)
    base_clusters = cache.load_pickle(base_clusters_file)
    mixed_clusters = {}
    for label, functions in base_clusters.items():
      if label == -1 or len(functions) == 1:
        continue
      sources = set()
      for func in functions:
        sources.add(func.source)
      if len(sources) > 1:
        mixed_clusters[label] = functions
    LOGGER.info("For folder = %s, # of mixed clusters = %d" % (folder, len(mixed_clusters)))
    file_path = os.path.join(folder_path, "only_mixed.txt")
    pkl_path = os.path.join(folder_path, "only_mixed.pkl")
    file_contents = []
    for label, functions in mixed_clusters.items():
      file_contents.append("\n\n****** Cluster %d ******" % label)
      for func in functions:
        file_contents.append(func.body)
    cache.write_file(file_path, "\n".join(file_contents))
    cache.save_pickle(pkl_path, mixed_clusters)
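
The filter above keeps a cluster only when its functions come from more than one source. The same test as a compact, self-contained function (Function here is a stand-in for the repo's function objects):

from collections import namedtuple

Function = namedtuple("Function", ["name", "source", "body"])

def only_mixed(clusters):
    # Keep multi-member clusters whose members span more than one source.
    return {label: funcs for label, funcs in clusters.items()
            if label != -1 and len(funcs) > 1
            and len({f.source for f in funcs}) > 1}

clusters = {0: [Function("a", "java", "..."), Function("b", "python", "...")],
            1: [Function("c", "java", "..."), Function("d", "java", "...")]}
assert list(only_mixed(clusters)) == [0]
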
Example #4
def fetch_statements(language,
                     force=False,
                     do_save=False,
                     limit=None,
                     as_list=False):
    pkl_file = get_executed_stmts_pkl(language)
    if not force and cache.file_exists(pkl_file):
        LOGGER.info("Retrieving existing '%s' statements!" % language)
        if as_list:
            # dict.values() is a view in Python 3; materialize it when the
            # caller asked for a list.
            return list(cache.load_pickle(pkl_file).values())
        return cache.load_pickle(pkl_file)
    LOGGER.info("Reprocessing '%s' statements!" % language)
    store = mongo_driver.MongoStore(props.DATASET)
    stmts = {}
    mongo_stmts = store.load_stmts(language=language,
                                   is_valid=True,
                                   has_output=True,
                                   limit=limit).items()
    n_stmts = len(mongo_stmts)
    for i, (key, mongo_stmt) in enumerate(mongo_stmts):
        LOGGER.info("Processing %d / %d .... " % (i + 1, n_stmts))
        stmt = Statement(mongo_id=mongo_stmt["_id"],
                         snippet=mongo_stmt["snippet"],
                         variables=mongo_stmt["variables"],
                         language=language,
                         outputs=format_outputs(mongo_stmt["outputs"]))
        stmts[stmt.mongo_id] = stmt
    if do_save:
        LOGGER.info("Saving statements .... ")
        cache.save_pickle(pkl_file, stmts)
    if as_list:
        return list(stmts.values())
    return stmts
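
fetch_statements is a cache-or-rebuild pattern: serve the pickle when it exists (unless force), otherwise rebuild from the store and optionally save. A stripped-down, standard-library-only sketch of the same pattern; build_fn and the path are illustrative:

import os
import pickle

def cached(pkl_file, build_fn, force=False, do_save=False):
    # Serve the cached pickle unless a rebuild is forced.
    if not force and os.path.exists(pkl_file):
        with open(pkl_file, "rb") as f:
            return pickle.load(f)
    data = build_fn()
    if do_save:
        with open(pkl_file, "wb") as f:
            pickle.dump(data, f)
    return data

stmts = cached("/tmp/stmts.pkl", lambda: {"id1": "x = 1"}, do_save=True)
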
Example #5
def cluster(clustering_error):
    functions = load_functions(PD_FUNCTIONS_PATH, "pandas")
    functions.update(load_functions(R_FUNCTIONS_PATH, "R"))
    file_name = "clusters"
    folder = os.path.join(BASE_CLUSTER_FOLDER, "%0.02f" % clustering_error)
    cache.mkdir(folder)
    clusters_txt_file = os.path.join(folder, "%s.txt" % file_name)
    clusters_pkl_file = os.path.join(folder, "%s.pkl" % file_name)
    clusters_report_file = os.path.join(folder, "%s.md" % file_name)
    clusterer = RepresentativeClusterer(functions.values(),
                                        distance_function=execution_distance)
    clusters = clusterer.cluster(clusters_txt_file,
                                 skip_singles=True,
                                 clustering_error=clustering_error)
    cache.save_pickle(clusters_pkl_file, clusters)
    n_clusters = len(clusters)
    sizes = [
        len(cluster_funcs) for label, cluster_funcs in clusters.items()
        if label != -1
    ]
    meta_data = "## Cluster sizes\n"
    meta_data += "* Number of clusters: %d\n" % n_clusters
    meta_data += "* Number of functions clustered: %d\n" % sum(sizes)
    meta_data += "* Number of functions not clustered: %d\n\n" % (
        len(functions) - sum(sizes))
    meta_data += "## REPORT\n"
    meta_data += stat.Stat(sizes).report()
    cache.write_file(clusters_report_file, meta_data)
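
Unlike Example #2, this variant writes each run into a folder named after the clustering error, so a sweep over several error values keeps its outputs separate. A small sketch of that layout (the base path is illustrative):

import os

BASE_CLUSTER_FOLDER = "/tmp/clusters"

def run_folder(clustering_error):
    # One folder per error value, e.g. /tmp/clusters/0.01
    folder = os.path.join(BASE_CLUSTER_FOLDER, "%0.02f" % clustering_error)
    os.makedirs(folder, exist_ok=True)
    return folder

for err in (0.01, 0.05, 0.10):
    print(run_folder(err))
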
Example #6
def remove_overlapping_clusters(dataset, language="java_python"):
    # TODO: Think about how to remove syntactic equivalence
    store = mongo_store.FunctionStore(dataset)
    base_file = os.path.join(properties.META_RESULTS_FOLDER, dataset,
                             "clusters", "%s.pkl" % language)
    clusters = cache.load_pickle(base_file)
    non_overlapping_clusters = {}
    for label, functions in clusters.items():
        if label == -1 or len(functions) == 1:
            continue
        non_overlapping_funcs = []
        metas = {}
        for func in functions:
            meta = store.load_metadata({"name": func.base_name})
            metas[func.base_name] = meta
            if len(non_overlapping_funcs) == 0:
                non_overlapping_funcs.append(func)
                continue
            is_non_overlapping_funcs_updated = False
            for i, existing_func in enumerate(non_overlapping_funcs):
                existing_meta = metas[existing_func.base_name]
                if overlaps(meta, existing_meta):
                    is_non_overlapping_funcs_updated = True
                    if is_more_succinct(meta, existing_meta):
                        non_overlapping_funcs[i] = func
                    break
            if not is_non_overlapping_funcs_updated:
                non_overlapping_funcs.append(func)
        if len(non_overlapping_funcs) > 1:
            non_overlapping_clusters[label] = non_overlapping_funcs
    write_folder = os.path.join(properties.META_RESULTS_FOLDER, dataset,
                                "clusters", "non_overlapping")
    cache.mkdir(write_folder)
    clusters_txt_file = os.path.join(write_folder, "%s.txt" % language)
    clusters_pkl_file = os.path.join(write_folder, "%s.pkl" % language)
    clusters_report_file = os.path.join(write_folder, "%s.md" % language)
    cache.save_pickle(clusters_pkl_file, non_overlapping_clusters)
    clusterer.save_clusters_to_db(dataset, non_overlapping_clusters,
                                  "non_overlapping")
    clusterer.save_clusters_to_txt(non_overlapping_clusters, clusters_txt_file)
    sizes = [
        len(cluster_funcs)
        for label, cluster_funcs in non_overlapping_clusters.items()
        if label != -1
    ]
    meta_data = "## Cluster sizes\n"
    meta_data += "* Number of clusters: %d\n" % len(non_overlapping_clusters)
    meta_data += "* Number of functions clustered: %d\n" % sum(sizes)
    meta_data += "## REPORT\n"
    meta_data += Stat(sizes).report()
    cache.write_file(clusters_report_file, meta_data)
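
The inner loop is a greedy de-duplication: each function is checked against the survivors so far, and on overlap only the more succinct of the two is kept. The same idea in self-contained form, with intervals standing in for function metadata and illustrative overlap/better predicates:

def remove_overlaps(items, overlaps, better):
    survivors = []
    for item in items:
        for i, kept in enumerate(survivors):
            if overlaps(item, kept):
                # On overlap, keep whichever of the pair is "better".
                if better(item, kept):
                    survivors[i] = item
                break
        else:  # no overlap with any survivor
            survivors.append(item)
    return survivors

# Toy metadata: intervals overlap when they intersect; shorter is "better".
result = remove_overlaps([(0, 5), (3, 8), (10, 12)],
                         overlaps=lambda a, b: a[0] < b[1] and b[0] < a[1],
                         better=lambda a, b: a[1] - a[0] < b[1] - b[0])
assert result == [(0, 5), (10, 12)]
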
Example #7
def save_only_target_functions(dataset, mixed_file_base_name, target_language):
    """
  Save only java functions from a mixture of java and python clusters
  :param dataset: Name of dataset
  :param mixed_file_base_name: Type of language eg. java_python
  :param target_language: Target Language
  :return:
  """
    clusters_base_folder = os.path.join(lib.get_clusters_folder(dataset),
                                        "cluster_testing")
    for folder in sorted(
            cache.list_dir(clusters_base_folder, is_absolute=False)):
        LOGGER.info("Processing '%s' ..." % folder)
        folder_path = os.path.join(clusters_base_folder, folder)
        cache.mkdir(folder_path)
        base_clusters_file = os.path.join(folder_path,
                                          "%s.pkl" % mixed_file_base_name)
        base_clusters = cache.load_pickle(base_clusters_file)
        target_clusters = {}
        for label, functions in base_clusters.items():
            if label == -1 or len(functions) == 1:
                continue
            contains_target = False
            contains_other = False
            for func in functions:
                if func.source == target_language:
                    contains_target = True
                else:
                    contains_other = True
            if contains_target and not contains_other:
                target_clusters[label] = functions
        LOGGER.info("For folder = %s, # of '%s' clusters = %d" %
                    (folder, target_language, len(target_clusters)))
        file_path = os.path.join(folder_path, "only_%s.txt" % target_language)
        pkl_path = os.path.join(folder_path, "only_%s.pkl" % target_language)
        file_contents = []
        for label, functions in target_clusters.items():
            file_contents.append("\n\n****** Cluster %d ******" % label)
            for func in functions:
                file_contents.append(func.body)
        cache.write_file(file_path, "\n".join(file_contents))
        cache.save_pickle(pkl_path, target_clusters)
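
The contains_target/contains_other flags reduce to an all() test: keep a cluster only when every member's source is the target language. A compact, self-contained equivalent:

def only_target(clusters, target_language):
    # Keep multi-member clusters made up entirely of target_language functions.
    return {label: funcs for label, funcs in clusters.items()
            if label != -1 and len(funcs) > 1
            and all(f.source == target_language for f in funcs)}

class Func:
    def __init__(self, source):
        self.source = source

assert list(only_target({0: [Func("java"), Func("java")],
                         1: [Func("java"), Func("python")]}, "java")) == [0]
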
Example #8
def save_function(func_data):
    saved_funcs = cache.load_pickle(FUNCTION_STORE)
    if not saved_funcs:
        saved_funcs = {}
    saved_funcs[func_data["name"]] = func_data
    cache.save_pickle(FUNCTION_STORE, saved_funcs)
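
save_function is a read-modify-write over one pickle: load the whole store (falling back to an empty dict), update a single key, write everything back. A standalone equivalent with plain pickle; the store path is illustrative:

import os
import pickle

FUNCTION_STORE = "/tmp/function_store.pkl"

def save_function(func_data):
    # Load-or-initialize, update one entry, then rewrite the whole store.
    saved_funcs = {}
    if os.path.exists(FUNCTION_STORE):
        with open(FUNCTION_STORE, "rb") as f:
            saved_funcs = pickle.load(f)
    saved_funcs[func_data["name"]] = func_data
    with open(FUNCTION_STORE, "wb") as f:
        pickle.dump(saved_funcs, f)

save_function({"name": "add", "body": "def add(a, b): return a + b"})
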
Example #9
def store_args(key, args):
    data_store = cache.load_pickle(STORE_PATH)
    if not data_store:
        data_store = {}
    data_store[key] = args
    cache.save_pickle(STORE_PATH, data_store)
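
store_args repeats the pattern from Example #8 with an arbitrary key, so the two helpers could share one upsert routine. A possible consolidation (upsert_pickle is a hypothetical name, not from the source repo); note that neither version is safe under concurrent writers, since the whole store is rewritten on every call:

import os
import pickle

def upsert_pickle(path, key, value):
    # Shared helper covering Examples #8 and #9: one keyed upsert per call.
    store = {}
    if os.path.exists(path):
        with open(path, "rb") as f:
            store = pickle.load(f)
    store[key] = value
    with open(path, "wb") as f:
        pickle.dump(store, f)

upsert_pickle("/tmp/args_store.pkl", "run_1", {"n_folds": 10})
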