def _process(kwargs):
    """Start one SynThread and one AcrLog worker and wait for both to finish.

    Args:
        kwargs: Mapping providing the keys ``"mac"``, ``"log"``,
            ``"syn_para"``, ``"acr_para"``, ``"img_para"``, ``"ip_tv"`` and
            ``"stat_cfg"``.  The ``"log"`` value must have a ``"path"``
            entry, the ``"acr_para"`` value a ``queue_cap`` attribute and
            the ``"ip_tv"`` value ``url`` and ``local`` attributes.

    Raises:
        KeyError: If one of the required keys is missing from ``kwargs``.

    Errors raised while starting or joining the workers are logged together
    with their traceback and are not re-raised, so the final "end" message
    is still written.
    """
    pid = os.getpid()
    mac = kwargs["mac"]
    log = kwargs["log"]
    syn_para = kwargs["syn_para"]
    acr_para = kwargs["acr_para"]
    img_para = kwargs["img_para"]
    ip_tv_para = kwargs["ip_tv"]
    stat_para = kwargs["stat_cfg"]

    # Queue shared by the two workers below.
    acr_queue = aioqueue.Queue(max_size=acr_para.queue_cap)

    # The log directory has to exist before logging is configured.
    mkdir(log["path"])
    log_init(log)
    ip_tv_init(ip_tv_para.url, ip_tv_para.local)

    threads = [
        SynThread(
            mac=mac,
            syn_para=syn_para,
            acr_para=acr_para,
            img_para=img_para,
            acr_queue=acr_queue,
            stat_para=stat_para,
        ),
        AcrLog(
            acr_para=acr_para,
            acr_queue=acr_queue,
            stat_para=stat_para,
        ),
    ]

    def _signal(sig, frame):
        """Ask every worker to close when a termination signal arrives."""
        logging.info(f"{os.getpid()} recv signal {sig} {frame}")
        for w in threads:
            w.close()

    # Route the usual termination signals to the shutdown handler.
    signal.signal(signal.SIGINT, _signal)
    signal.signal(signal.SIGTERM, _signal)
    signal.signal(signal.SIGABRT, _signal)

    logging.info(f"process {pid} start... ")
    try:
        for task in threads:
            task.start()
        for task in threads:
            task.join()
    except Exception:
        # A bare ``except:`` would also trap SystemExit/KeyboardInterrupt and
        # hide the cause; logging.exception records the traceback instead.
        logging.exception(f"process {pid} over... ")
    logging.info(f"process {pid} end... ")
def main():
    """Run the binary classification pipeline and return its results.

    Logging is initialised to ``classifier_training_.txt`` inside the
    ``log`` folder of ``RESULTS_DIR_PATH``.  The refactorings for ``Level``
    and the models are then built and handed, together with ``DATASETS``,
    to a ``BinaryClassificationPipeline``.

    Returns:
        Whatever ``BinaryClassificationPipeline.run()`` returns.
    """
    log_file = path.join(RESULTS_DIR_PATH, "log", "classifier_training_.txt")
    log_init(log_file)
    log("ML4Refactoring: Binary classification")

    # Refactorings are built before the models, same order as before.
    refactorings = build_refactorings(Level)

    # Run models
    models = build_models()
    return BinaryClassificationPipeline(models, refactorings, DATASETS).run()
if not path.exists(fig_path_box): combined_stable_metrics = pd.DataFrame() for level in STABLE_LEVELS: for k in STABLE_Ks: stable_metrics = get_metrics_stable_level_unique_metrics(level, k, metrics, samples=35000) stable_metrics['K'] = k stable_metrics = pd.melt(stable_metrics, id_vars="K", var_name="Metric", value_vars=metrics, value_name="values") stable_metrics["Metric"] = stable_metrics["Metric"].apply(lambda x: f"{x} {str(level)}") combined_stable_metrics = combined_stable_metrics.append(stable_metrics) # plot line_plot_seaborn(combined_stable_metrics, title, fig_path_box, xticks=STABLE_Ks, yticks=yticks, scale="log", custom_palette=custom_palette) else: log(f"--Skipped plot at {fig_path_box}, because it already exists.") log_init(f"results/Distribution/class_metrics_distribution_{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.txt") start_time = time.time() Path(path.dirname("results/Distribution/Class_Metrics/K/")).mkdir(parents=True, exist_ok=True) custom_palette = {"classCbo Level.Class":"red", "classCbo Level.Method":"brown", "classCbo Level.Variable":"orangered", "classCbo Level.Field":"maroon", "classTCC Level.Class":"green", "classTCC Level.Method":"olive", "classTCC Level.Variable":"lime", "classTCC Level.Field":"yellowgreen", "classWmc Level.Class":"blue", "classWmc Level.Method":"navy", "classWmc Level.Variable":"cyan", "classWmc Level.Field":"dodgerblue"} level_merged_stable_k("Distribution/Class_Metrics/K", metrics=CLASS_METRICS_REDUCED_Fields, yticks=[1, 2.5, 3.5, 5, 7.5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 75, 90, 100, 125, 150, 200, 250, 300, 350], title=f"Class Metrics: Stable K's", file_descriptor=f"Class_Metrics_Reduced_K", custom_palette=custom_palette) custom_palette = {"classNumberOfMethods Level.Class":"red", "classNumberOfMethods Level.Method":"brown", "classNumberOfMethods Level.Variable":"orangered", "classNumberOfMethods Level.Field":"maroon", "classNumberOfPublicFields Level.Class":"green", "classNumberOfPublicFields 
Level.Method":"olive", "classNumberOfPublicFields Level.Variable":"lime", "classNumberOfPublicFields Level.Field":"yellowgreen", "classStringLiteralsQty Level.Class":"blue", "classStringLiteralsQty Level.Method":"navy", "classStringLiteralsQty Level.Variable":"cyan", "classStringLiteralsQty Level.Field":"dodgerblue", "classUniqueWordsQty Level.Class":"black", "classUniqueWordsQty Level.Method":"grey", "classUniqueWordsQty Level.Variable":"lightgrey", "classUniqueWordsQty Level.Field":"snow",
from configs import DATASETS, Level, VALIDATION_DATASETS
from db.QueryBuilder import get_all_level_stable, get_level_refactorings_count, get_level_refactorings
from db.DBConnector import execute_query
from utils.log import log_init, log_close, log
import time

# Cache warm-up script: issues every query once for each dataset and level
# and logs what was found.
log_init()
log('Begin cache warm-up')
start_time = time.time()

for dataset in (DATASETS + VALIDATION_DATASETS):
    log("\n**** dataset: " + dataset)

    for level in Level:
        # The query builders take the numeric value of the level.
        level_id = int(level)

        # Instances that were not refactored at this level.
        log(f"-- non refactored instances for {level!s}")
        non_refactored = execute_query(get_all_level_stable(level_id, dataset))
        log(f"{len(non_refactored)!s} non-refactored instances were found for level: {level!s}")

        # Refactoring types of this level together with their counts.
        log(f"-- {level!s} refactoring types with count")
        refactorings = execute_query(get_level_refactorings_count(level_id, dataset))
        log(refactorings.to_string())

        # Run the per-refactoring query once for every type listed above;
        # the result itself is not used any further here.
        for refactoring_name in refactorings['refactoring']:
            refactoring_instances = execute_query(
                get_level_refactorings(level_id, refactoring_name, dataset))

log(f"Cache warm-up took {(time.time() - start_time)!s} seconds.")
log_close()
[db_ids_val]) formatted_results = format_results_single_run( DATASET, refactoring_name, ["test set github"], model_name, val_scores["f1_score"], val_scores["precision"], val_scores["recall"], val_scores['accuracy'], val_scores['tn'], val_scores['fp'], val_scores['fn'], val_scores['tp'], val_scores["permutation_importance"], trained_model, features, json.dumps(trained_model.get_params())) save_validation_results(model_name, val_results[0], "test set github", formatted_results) return formatted_results, trained_model.get_params() # Start log_init( f"{SAVE_DIRECTORY}classifier_evaluation_test-set_{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.txt" ) log('Begin classifier evaluation') refactorings = build_refactorings(Level) for model_name in [ "LogisticRegressionRefactoringModel", "RandomForestRefactoringModel" ]: evaluation_path = f"{SAVE_DIRECTORY}test_set_evaluation{model_name}.xlsx" params_path = f"{SAVE_DIRECTORY}{model_name}_parameter.xlsx" if not os.path.exists(evaluation_path): evaluation_results = pd.DataFrame() parameter_sets = pd.DataFrame() for refactoring in refactorings: refactoring_name = refactoring.name()
# global path variable local_base_path = '/data/snapshots/' remote_base_path = '/data/snapshots/' if __name__ == '__main__': logging.info('--------------begin perfom all application-------- ') file = open(sys.argv[1], 'r', encoding='utf-8') # file = open('collect_ip.json', 'r', encoding='utf-8') ci_array = json.load(file) # read log config file file_log = open(sys.argv[2], 'r', encoding='utf-8') # file_log = open('loggin_conf.json', 'r', encoding='utf-8') ci_array_log = json.load(file_log) log_init(ci_array_log['logging']) pool = mp.Pool(processes=5) # process pool p_work = partial(transition, remote_base_path, local_base_path) # perform rsync file function for item in ci_array: ''' Multiple processes perform synchronization tasks ''' try: if not check_ssh(host=item.get("ip"), user=item.get("account"), port=item.get("port"), passwd=item.get("pwd"), dest_path="/data/snapshots/"): logging.error('SSH connect faild!') exit(-1) pool.map(p_work, (item, ))
plt.plot(x, y) plt.ylabel(f"Fraction unique classes") plt.xlabel("commit threshold") plt.title(f"blabla") plt.ylim(ymin=1) fig_path = f"results/StableInstances/test.png" plt.savefig(fig_path) def stable_instance_statistics(): query = "SELECT * FROM stable_unique_classes_all_level;" data = execute_query(query) commitThresholds = data["level" == 1]["commitThreshold"] plot_x_y(commitThresholds, data["unique_class_files_fraction"]) data = data.groupby("level") return data log_init(f"") log('Begin Statistics') start_time = time.time() Path(path.dirname("results/StableInstances/")).mkdir(parents=True, exist_ok=True) stable_instance_statistics() log(f"Processing statistics took {time.time() - start_time:.2f} seconds.") log_close() exit()
def plot_frequency(frequency_data, metric, var_name, level):
    """Melt ``frequency_data`` to long form and draw it as a line plot.

    Args:
        frequency_data: Table passed to ``pd.melt``; must contain the column
            named by ``metric``.
        metric: Column kept as identifier during the melt and used as the
            x axis of the plot.
        var_name: Name given to the melted variable column; also used as
            the hue of the plot.
        level: Forwarded unchanged to ``line_plot_seaborn``.
    """
    # Every column except ``metric`` becomes a (var_name, "Frequency") pair.
    frequency_data_melt = pd.melt(frequency_data, id_vars=metric, var_name=var_name, value_name="Frequency")
    line_plot_seaborn(frequency_data_melt, x=metric, y="Frequency", hue=var_name, level=level)


# The log file name carries a timestamp.
log_init(
    f"results/Evolution/statistics_{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.txt"
)
log('Begin Statistics')
start_time = time.time()
# Create the output directory (and any missing parents) if it is absent.
Path(path.dirname(SAVE_DIR)).mkdir(parents=True, exist_ok=True)
for metric in METRICS:
    for level in REFACTORING_LEVELS:
        # Only the first 101 rows of the frequency table are used.
        frequency_data = get_frequency_data_refactorings(level, metric).head(101)
        plot_frequency(frequency_data, metric, "refactoring", level)
        plot_cdf(frequency_data, metric, "refactoring", level)
    for level in STABLE_LEVELS:
        # NOTE(review): this loop body presumably continues beyond the
        # visible excerpt - confirm against the full file.
        frequency_data_stable = get_frequency_data_stable(level, metric).head(101)
from utils.log import log_init, log_close, log import time import datetime from os import path """ The amount of samples for refactorings and non-refactorings in the database is enormous, thus caching the relevant data on your local machine can speed up the machine learning process. This class fetches the training data for refactoring instances and non-refactoring instances, as configured, from the database and stores the results of the queries in cache files. Note: In order to use this feature, ensure in the USE_CACHE is enabled in the config. """ log_init(path.join(CACHE_DIR_PATH, "results", f"warm-up_cache_{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.txt")) log('Begin cache warm-up') start_time = time.time() for dataset in DATASETS + VALIDATION_DATASETS: for level in [Level.Class, Level.Method, Level.Variable, Level.Field, Level.Other]: log(f"-- non refactored instances for {level} for dataset: {dataset}") for k in [15, 20, 25, 30, 35, 40, 45, 50, 60, 70, 80, 90, 100]: log(f"---- non refactored instances with k {k} for {level} for dataset: {dataset}") non_refactored = execute_query(get_level_stable(int(level), k, dataset)) log(str(len(non_refactored)) + " non-refactored instances were found for level: " + str(level)) log(f"-- {level} refactoring types with count for dataset: {dataset}") refactorings = execute_query(get_level_refactorings_count(int(level), dataset)) log(refactorings.to_string()) for refactoring_name in LEVEL_MAP[level]:
import copy from os import path from pathlib import Path import joblib from binary_classification import run from configs import RESULTS_DIR_PATH from utils import date_utils from utils.log import log_close, log_init if __name__ == "__main__": log_init( path.join( RESULTS_DIR_PATH, "log", f"classifier_training_{date_utils.windows_path_friendly_now()}.txt" )) projects = [ 'MaintenanceAPI', 'AgreementPreferencesAPI', 'AgreementsOverviewNLAPI', 'mobile_backend', 'EnrollmentAPI', 'mobile-components', 'paymentsapi', 'mobile_tools', 'security-proxy', 'authentication-api', 'registration-api', 'ExperienceComponents',
# -*- coding: utf-8 -*-
# @Time    : 2020/2/25 13:49
# @Author  : zbs
# @Site    :
# @File    : reptile_main.py
# @Software: PyCharm
import sys
import json
import os
# from common.syn_data import Reptile
from common.syn_tjj import Reptile
from utils.log import log_init

if __name__ == '__main__':
    # The first command-line argument is the path of the JSON config file.
    with open(sys.argv[1], 'r', encoding='utf-8') as cfg_fh:
        config = json.load(cfg_fh)

    # Join the configured log path onto the directory holding this script.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    log_cfg = config['logging']
    log_cfg['path'] = os.path.join(script_dir, log_cfg['path'])
    log_init(log_cfg)

    # Build the Reptile from the MongoDB and network settings, then run it.
    mongo_cfg = config["mongodb"]
    reptile = Reptile(mongo_cfg["address"], mongo_cfg["port"], config["net_address"])
    reptile.run()
plot_refactoring_metrics( metrics, level, "test_set_github", hue, yticks=[ 0.1, 0.15, 0.25, 0.5, 0.75, 1, 1.5, 2.0, 2.5, 5, 6, 7.5, 10, 15, 20, 25, 50, 75, 100, 125, 150 ], metrics=PROCESS_METRICS_FIELDS, title=f"Process- and Ownership Metrics: {title} at {level.name}", file_descriptor=f"{file_descriptor}_Process_Ownership_Metrics") log_init( f"{SAVE_DIRECTORY}evaluation_{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.txt" ) start_time = time.time() # data import and preparation evaluation_data, prediction_data = import_evaluation(INPUT_DIRECTORY) false_negatives_metrics_all = pd.DataFrame() false_positives_metrics_all = pd.DataFrame() true_negatives_metrics_all = pd.DataFrame() true_positives_metrics_all = pd.DataFrame() for index, row in prediction_data.iterrows(): refactoring_name = row["refactoring_name"] level = get_refactoring_level(refactoring_name) false_negatives_metrics_all = false_negatives_metrics_all.append( extract_false_negatives(row, "test set github", level,
statistics.to_csv(statistics_path, index=False, header=True) log(f"Collected all statistics for {str(level)} and stored them at: {statistics_path}." ) else: statistics = statistics.append(pd.read_csv(statistics_path), ignore_index=True) grouped = statistics.groupby(["metric", "level"], as_index=False).mean() excel_path = f"{save_dir}{file_descriptor}_{dataset}.xlsx" grouped.to_excel(excel_path, index=False) return statistics SAVE_DIR = f"results/Distribution/Statistics/" log_init( f"{SAVE_DIR}feature_statistics_{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.txt" ) start_time = time.time() Path(path.dirname(SAVE_DIR)).mkdir(parents=True, exist_ok=True) with pd.option_context('display.max_rows', None, 'display.max_columns', None): # for metric_description, metrics in METRIC_SETS: # statistics = pd.DataFrame() # metrics_data = pd.DataFrame() # for metric in metrics: # metrics = get_last_refactored_instance_all([metric], REFACTORING_SAMPLES * 5) # statistics_metric = compute_statistics(metrics, Level.NONE, metric, extra_field="all") # statistics = statistics.append(statistics_metric, ignore_index=True) # metrics_data = metrics_data.append(metrics) # log(f"Extract {metric}") #