def crawl_link(url, key):
  """Download the zip archive at `url` and extract its Python files under
  <PYTHON_PROJECTS_HOME>/<DATASET>/<key>/<username>."""
  LOGGER.info("Fetching URL: %s" % url)
  parsed_url = urlparse.urlparse(url)
  user_name = re.sub('[^a-zA-Z]+', '',
                     urlparse.parse_qs(parsed_url.query)['username'][0])
  user_name_path = os.path.join(properties.PYTHON_PROJECTS_HOME, DATASET, key,
                                user_name)
  cache.mk_package(user_name_path)
  file_handle, _ = urllib.urlretrieve(url)
  zip_file_object = zipfile.ZipFile(file_handle, 'r')
  for file_name in zip_file_object.namelist():
    f = zip_file_object.open(file_name)
    file_path = os.path.join(user_name_path, file_name)
    file_content = f.read()
    cache.write_file(file_path, file_content)
    if not cache.is_valid_python_file(file_path):
      LOGGER.info("Invalid Python File: %s. Deleting it .... " % file_path)
      cache.delete_file(file_path)
  if len(cache.list_files(user_name_path, False, False,
                          ignores=["__init__.py"])) == 0:
    LOGGER.info("Folder '%s' contains no python file. Deleting it ..." %
                user_name_path)
    cache.delete_folder(user_name_path)
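# Illustration only (not part of the crawler): how `crawl_link` derives the user
# name from a submission URL. A minimal sketch assuming Python 2, to match the
# urlparse/urllib calls above; the sample URL and query parameters are made up.
def _example_username_extraction():
  import re
  import urlparse
  url = "http://example.com/download?username=alice_42&lang=py"
  parsed_url = urlparse.urlparse(url)
  raw_name = urlparse.parse_qs(parsed_url.query)['username'][0]  # "alice_42"
  return re.sub('[^a-zA-Z]+', '', raw_name)  # non-letters stripped -> "alice"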
def connected_components(dataset, base_folder):
  """Build a markdown report of connected components (functionalities) for each
  epsilon CSV under the dataset's meta-results folder."""
  base_folder_path = os.path.join(properties.META_RESULTS_FOLDER, dataset,
                                  base_folder)
  contents = ["# Epsilons and methods"]
  for file_path in sorted(cache.list_files(base_folder_path, is_absolute=True)):
    if not file_path.endswith(".csv"):
      continue
    # e.g. a file named "<prefix>_0_01.csv" yields epsilon "0.01".
    epsilon = cache.get_file_name(file_path).split(".")[0].split(
        "_", 1)[1].replace("_", ".")
    print(file_path)
    graph = networkx.Graph()
    contents.append("## eps = %s" % epsilon)
    with open(file_path) as csv_file:
      csv_reader = csv.reader(csv_file, delimiter=",")
      next(csv_reader, None)  # skip the header row
      for row in csv_reader:
        graph.add_edge(row[0], row[1])
    n_clusters = networkx.number_connected_components(graph)
    contents.append("#### \\# Functionalities = %d" % n_clusters)
    contents.append("```")
    for i, component in enumerate(networkx.connected_components(graph)):
      contents.append("%d: %s" % (i, ",\n\t".join(component)))
    # Alternative rendering with numerically sorted components:
    # for i, component in enumerate(networkx.connected_components(graph)):
    #   contents.append("%d: %s" % (i, ",".join(map(str, sorted(map(int, component))))))
    contents.append("```")
    LOGGER.info("For epsilon = %s, # clusters = %d" % (epsilon, n_clusters))
  write_file = os.path.join(base_folder_path, "components.md")
  cache.write_file(write_file, "\n".join(contents))
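# Illustration only: how each epsilon CSV is reduced to a functionality count.
# A minimal sketch assuming every data row holds one edge (a pair of related
# method identifiers); the toy edges below are made up.
def _example_component_count():
  import networkx
  graph = networkx.Graph()
  for left, right in [("f1", "f2"), ("f2", "f3"), ("f4", "f5")]:
    graph.add_edge(left, right)
  # Two clusters: {f1, f2, f3} and {f4, f5}.
  return networkx.number_connected_components(graph)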
def extract_metadata_for_folder(dataset, problem_id=None):
  """Collect metadata for every executable generated function in the dataset
  (or a single problem) and persist it through the function store."""
  sys.path.append(properties.PYTHON_PROJECTS_HOME)
  function_store = get_function_store(dataset)
  root_folder = os.path.join(properties.PYTHON_PROJECTS_HOME, dataset)
  if problem_id:
    root_folder = os.path.join(root_folder, problem_id)
  for file_path in cache.list_files(root_folder, check_nest=True,
                                    is_absolute=True):
    file_name = cache.get_file_name(file_path)
    if not file_name.startswith(a_consts.GENERATED_PREFIX):
      continue
    LOGGER.info("Processing '%s' ..." % helper.get_simple_name(file_path))
    for func in helper.get_generated_functions(file_path):
      function_name = func.__name__
      valid, func_key = is_executable_function(dataset, func, False)
      print(function_name, func_key, valid)
      if valid:
        meta_data = {
            "name": function_name,
            "body": inspect.getsource(func),
            "inputKey": func_key,
            "filePath": file_path
        }
        function_store.save_py_metadata(meta_data)
  sys.path.remove(properties.PYTHON_PROJECTS_HOME)
def load_functions(self):
  functions = []
  results_folder = lib.get_dataset_functions_results_folder(self.dataset)
  for json_file in cache.list_files(results_folder, check_nest=True,
                                    is_absolute=True):
    if not json_file.endswith(".json"):
      continue
    functions += self.__load_functions_for_class(json_file)
  return functions
def execute(dataset, root_folder):
  for file_path in cache.list_files(root_folder, check_nest=True,
                                    is_absolute=True):
    file_name = cache.get_file_name(file_path)
    if file_name == "__init__" or file_name.startswith(
        a_consts.GENERATED_PREFIX):
      continue
    get_meta_for_file(dataset, file_path)
def export_methods(dataset, problem):
  root_folder = os.path.join(properties.PYTHON_PROJECTS_HOME, dataset, problem)
  for file_path in cache.list_files(root_folder, check_nest=True,
                                    is_absolute=True):
    file_name = cache.get_file_name(file_path)
    if file_name == "__init__" or file_name.startswith(
        a_consts.GENERATED_PREFIX):
      continue
    generate.generate_for_file(dataset, file_path)
def execute_problem(dataset, problem_id=None):
  root_folder = os.path.join(properties.PYTHON_PROJECTS_HOME, dataset)
  if problem_id:
    root_folder = os.path.join(root_folder, problem_id)
  for file_path in cache.list_files(root_folder, check_nest=True,
                                    is_absolute=True):
    if not cache.get_file_name(file_path).startswith(
        a_consts.GENERATED_PREFIX):
      continue
    LOGGER.info("Processing '%s'" % helper.get_simple_name(file_path))
    execute_file(dataset, file_path)
def get_valid_functions_from_folder(dataset, problem_id=None):
  total_valid_functions = 0
  accessed_keys = set()
  root_folder = os.path.join(properties.PYTHON_PROJECTS_HOME, dataset)
  if problem_id:
    root_folder = os.path.join(root_folder, problem_id)
  for file_path in cache.list_files(root_folder, check_nest=True,
                                    is_absolute=True):
    file_name = cache.get_file_name(file_path)
    if not file_name.startswith(a_consts.GENERATED_PREFIX):
      continue
    LOGGER.info("Processing '%s'" % helper.get_simple_name(file_path))
    valid_keys, n_generated_functions = get_valid_function_keys_from_file(
        dataset, file_path)
    LOGGER.info("Valid Functions: %d / %d\n" %
                (len(valid_keys), n_generated_functions))
    accessed_keys.update(valid_keys)
    total_valid_functions += len(valid_keys)
  LOGGER.info("Total valid functions: %d" % total_valid_functions)
  print(accessed_keys)