def stable_instance_statistics():
    """Load the stable-unique-classes statistics, plot the level-1 fraction
    curve, and return the data grouped by level.

    Returns:
        pandas GroupBy object of the full result set, grouped by "level".
    """
    query = "SELECT * FROM stable_unique_classes_all_level;"
    data = execute_query(query)
    # BUG FIX: the original `data["level" == 1]` evaluated the constant
    # expression `"level" == 1` (always False) and indexed the frame with it,
    # raising a KeyError. The intent is a boolean-mask filter on the column.
    level_one = data[data["level"] == 1]
    # Plot x and y from the same filtered frame so their lengths match.
    plot_x_y(level_one["commitThreshold"], level_one["unique_class_files_fraction"])
    return data.groupby("level")
def query_raw(table_name: str, metrics, descriptor: str):
    """Fetch the raw values of `metrics` from `table_name`, with a CSV cache.

    The result is cached at results/Metrics/{table}_{descriptor}_raw.csv; when
    that file already exists the database is not queried at all.

    Returns:
        pandas DataFrame with one column per requested metric.
    """
    file_path = Path(f"results/Metrics/{table_name}_{descriptor}_raw.csv")
    file_path.parent.mkdir(parents=True, exist_ok=True)
    # Cache hit: just reload the stored CSV.
    if file_path.exists():
        return pd.read_csv(file_path)
    # Cache miss: build the SELECT list ("m1, m2, ...") and run the query.
    select_list = ', '.join(str(metric) for metric in metrics)
    dataframe = execute_query(f"SELECT {select_list} FROM {table_name}")
    dataframe.to_csv(file_path, index=False)
    log(f"Got the raw data from {table_name} for these metrics: {metrics}.")
    return dataframe
def get_non_refactored_instances(self, datasets: Iterable[str]):
    """Fetch all non-refactored (stable) instances at this refactoring's
    level, e.g. Level 2 for the refactoring "Extract Method".

    Parameter:
        datasets (Iterable[str]): restrict the stable instances to these
            datasets; an empty iterable applies no filter.
    """
    query = get_level_stable(int(self._level), self._commit_threshold, datasets)
    return execute_query(query)
def get_refactored_instances(self, datasets: Iterable[str] = (), projects=()):
    """Fetch all refactoring instances for this refactoring, e.g. for the
    refactoring "Extract Method".

    Parameters:
        datasets (Iterable[str]): restrict the instances to these datasets;
            an empty iterable applies no filter.
        projects: accepted for interface compatibility; not used by the
            underlying query (NOTE(review): confirm whether it was ever meant
            to filter by project).

    Fix: the defaults were mutable lists (`datasets=[]`, `projects=[]`) — the
    classic shared-mutable-default pitfall. Immutable tuples are drop-in
    compatible for callers that pass lists or rely on the default.
    """
    return execute_query(
        get_level_refactorings(int(self._level), self._name, datasets))
def query_avg(table_name: str, function: str, metrics, descriptor: str, group: bool):
    """Aggregate `metrics` from `table_name` with the SQL aggregate `function`
    (e.g. AVG), with a CSV cache.

    The result is cached at results/Metrics/{table}_{function}_{descriptor}.csv;
    when that file already exists the database is not queried. When `group` is
    True the aggregation is computed per level (GROUP BY level).

    Returns:
        pandas DataFrame with one aggregated column per requested metric.
    """
    file_path = f"results/Metrics/{table_name}_{function}_{descriptor}.csv"
    Path(path.dirname(file_path)).mkdir(parents=True, exist_ok=True)
    # Cache hit: just reload the stored CSV.
    if path.exists(file_path):
        return pd.read_csv(file_path)
    # Cache miss: build e.g. `AVG(metric) AS "metric"` for every metric.
    select_list = ', '.join(f"{function}({metric}) AS \"{metric}\"" for metric in metrics)
    query = f"SELECT {select_list} FROM {table_name}"
    if group:
        query += " group by level"
    dataframe = execute_query(query)
    dataframe.to_csv(file_path, index=False)
    log(f"Got the data from {table_name} for these metrics: {metrics} for the aggregate function: {function}.")
    return dataframe
def retrieve_columns(sql_query, columns, samples=-1):
    """Fetch `columns` of `sql_query`'s result, preferring an on-disk CSV cache.

    Parameters:
        sql_query (str): query to run; its SHA-1 hash names the cache file.
        columns: subset of result columns to return.
        samples (int): if >= 0 and the result has at least that many rows,
            return a random sample of `samples` rows; otherwise return all rows.

    Returns:
        pandas DataFrame restricted to `columns` (and possibly sampled).
    """
    # The query text itself is the cache key.
    query_hash = hashlib.sha1(sql_query.encode()).hexdigest()
    cache_dir = path.join(CACHE_DIR_PATH, "_cache")
    file_path = path.join(cache_dir, f"{query_hash}.csv")
    if path.exists(file_path):
        data = pd.read_csv(file_path, usecols=columns)
    else:
        # FIX: the original miss path returned the raw query result without
        # caching it, without the column restriction, and without sampling —
        # inconsistent with the cache-hit path. Populate the cache and apply
        # the same post-processing on both paths.
        from os import makedirs
        data = execute_query(sql_query)
        makedirs(cache_dir, exist_ok=True)
        data.to_csv(file_path, index=False)
        data = data[columns]
    # A negative `samples` disables sampling; sampling more rows than exist
    # would raise, so fall back to the full frame in that case.
    if samples < 0 or len(data) < samples:
        return data
    return data.sample(samples)
from configs import DATASETS, Level, VALIDATION_DATASETS
from db.QueryBuilder import get_all_level_stable, get_level_refactorings_count, get_level_refactorings
from db.DBConnector import execute_query
from utils.log import log_init, log_close, log
import time

# Cache warm-up script: issue every dataset/level/refactoring query once so
# later experiment runs hit cached results instead of the database.
log_init()
log('Begin cache warm-up')
start_time = time.time()
for dataset in (DATASETS + VALIDATION_DATASETS):
    log("\n**** dataset: " + dataset)
    for level in Level:
        # Stable (non-refactored) instances at this level.
        log("-- non refactored instances for " + str(level))
        non_refactored = execute_query(
            get_all_level_stable(int(level), dataset))
        log(
            str(len(non_refactored)) +
            " non-refactored instances were found for level: " + str(level))

        # Per-refactoring-type counts, then each type's full instance set.
        log("-- " + str(level) + " refactoring types with count")
        refactorings = execute_query(
            get_level_refactorings_count(int(level), dataset))
        log(refactorings.to_string())
        for refactoring_name in refactorings['refactoring']:
            # Result intentionally discarded: the query runs purely to warm
            # the cache (the original bound it to an unused local).
            execute_query(
                get_level_refactorings(int(level), refactoring_name, dataset))

log('Cache warm-up took %s seconds.' % (time.time() - start_time))
log_close()
def get_non_refactored_instances(self, dataset):
    """Fetch every stable (non-refactored) instance at this refactoring's
    level, filtered to `dataset`."""
    query = get_all_level_stable(int(self._level), dataset)
    return execute_query(query)
def get_refactored_instances(self, dataset):
    """Fetch all instances of this refactoring, filtered to `dataset`."""
    query = get_level_refactorings(int(self._level), self._name, dataset)
    return execute_query(query)