def get_extractors(self, data_types, extract_bugs, version):
    """Pick the extractors that can produce the requested data types.

    Args:
        data_types: Mutable collection of requested data-type values. When
            ``extract_bugs`` is truthy, ``"bugged"`` and ``"bugged_methods"``
            are added to it in place.
        extract_bugs: When falsy, every extractor whose class name contains
            ``"bugged"`` (case-insensitive) is skipped.
        version: Version identifier forwarded to ``DataBuilder`` and to
            ``Extractor.get_all_extractors``.

    Returns:
        A ``(builder, extractors)`` tuple: the ``DataBuilder`` that received
        the matching data types, and the set of extractors that offer at
        least one requested data type.
    """
    builder = DataBuilder(self.project, version)
    if extract_bugs:
        for bug_type in ("bugged", "bugged_methods"):
            data_types.add(bug_type)

    selected = set()
    for candidate in Extractor.get_all_extractors(self.project, version):
        if not extract_bugs:
            class_name = candidate.__class__.__name__.lower()
            if "bugged" in class_name:
                continue

        matched = [
            data_type
            for data_type in candidate.data_types
            if data_type.value in data_types
        ]
        if matched:
            selected.add(candidate)
        # The builder is notified for every extractor that was not skipped,
        # even when none of its data types were requested (empty list).
        builder.add_data_types(matched)

    return builder, selected
def build_dataset(version, project):
    """Build the class-level Designite smells dataset for one version.

    Args:
        version: Version identifier passed to ``DataBuilder`` and used in
            log messages.
        project: Project object passed to ``DataBuilder``; its ``github()``
            result is used in log messages.

    Returns:
        The class-level frame produced by ``DataBuilder.build()``, or
        ``None`` when the frame is empty or any step raises (the failure is
        logged to the ``failure`` and ``failure_verbose`` loggers).
    """
    log = logging.getLogger(__name__)
    ok_log = logging.getLogger("success")
    fail_log = logging.getLogger("failure")
    fail_verbose_log = logging.getLogger("failure_verbose")

    try:
        builder = DataBuilder(project, version)
        for data_name in (
            DataNameEnum.ImperativeAbstraction,
            DataNameEnum.MultifacetedAbstraction,
            DataNameEnum.UnnecessaryAbstraction,
            DataNameEnum.UnutilizedAbstraction,
            DataNameEnum.DeficientEncapsulation,
            DataNameEnum.UnexploitedEncapsulation,
            DataNameEnum.BrokenModularization,
            DataNameEnum.Cyclic_DependentModularization,
            DataNameEnum.InsufficientModularization,
            DataNameEnum.Hub_likeModularization,
            DataNameEnum.BrokenHierarchy,
            DataNameEnum.CyclicHierarchy,
            DataNameEnum.DeepHierarchy,
            DataNameEnum.MissingHierarchy,
            DataNameEnum.MultipathHierarchy,
            DataNameEnum.RebelliousHierarchy,
            DataNameEnum.WideHierarchy,
            DataNameEnum.Bugged,
        ):
            builder.append(data_name)

        log.info(
            "{0} | {1} | building dataset".format(project.github(), version))
        classes_df, _ = builder.build()

        # An empty frame is treated as a failure and handled by the
        # except-branch below.
        if classes_df.empty:
            raise Exception("Designite smells dataset is empty.")

        ok_log.info(
            "{0} | {1} | succeeded building dataset".format(
                project.github(), version))
        return classes_df
    except Exception:
        fail_log.error(
            "{0} | {1} | (exception) failed building dataset".format(
                project.github(), version))
        # .exception() also records the active traceback.
        fail_verbose_log.exception(
            "{0} | {1} | failed building dataset".format(
                project.github(), version))
        return None
def extract_features_to_version(self, classes_data, method_data, version):
    """Extract all features for ``version`` and write them out as CSV files.

    Runs every extractor returned by ``Extractor.get_all_extractors``,
    builds class- and method-level frames with every ``DataNameEnum`` member,
    writes several intermediate CSV snapshots, merges the aggregated method
    features into the class frame and writes the final per-version CSVs.

    Args:
        classes_data: Directory that receives ``<version>.csv`` with the
            merged class-level data.
        method_data: Directory that receives ``<version>.csv`` with the
            method-level data (without the File/Class/Method columns).
        version: Version identifier; also used as the CSV file stem.

    Returns:
        A ``(classes_df, methods_df)`` tuple holding the frames that were
        written to ``classes_data`` and ``method_data``.
    """
    def save_csv(frame, *path_parts):
        # Every CSV in this method uses ';' as separator and omits the index.
        frame.to_csv(os.path.join(*path_parts), index=False, sep=';')

    for extractor in Extractor.get_all_extractors(self.project, version):
        extractor.extract()

    builder = DataBuilder(self.project, version)
    for data_name in DataNameEnum:
        builder.append(data_name)
    classes_df, methods_df = builder.build()

    caching_root = Config().config['CACHING']['RepositoryData']
    intermediate_name = Config().config['VERSION_METRICS']['Intermediate']
    intermediate_dir = Config.get_work_dir_path(
        os.path.join(caching_root, intermediate_name, self.project.github()))

    classes_intermediate_dir = os.path.join(intermediate_dir, "classes")
    methods_intermediate_dir = os.path.join(intermediate_dir, "methods")
    for directory in (classes_intermediate_dir, methods_intermediate_dir):
        Path(directory).mkdir(parents=True, exist_ok=True)

    # Raw snapshots, exactly as produced by the builder.
    csv_name = version + ".csv"
    save_csv(classes_df, classes_intermediate_dir, csv_name)
    save_csv(methods_df, methods_intermediate_dir, csv_name)

    methods_df = self.fillna(methods_df)
    aggregated_methods_df = self.aggrate_methods_df(methods_df)
    classes_df.dropna(inplace=True)
    save_csv(classes_df, intermediate_dir, "classes_df.csv")
    save_csv(aggregated_methods_df, intermediate_dir,
             "aggregated_methods_df.csv")

    # Merge on (File, Class) when both frames carry a Class column,
    # otherwise fall back to File only.
    both_have_class = ('Class' in classes_df.columns
                       and 'Class' in aggregated_methods_df.columns)
    merge_keys = ['File', 'Class'] if both_have_class else ['File']
    classes_df = classes_df.merge(
        aggregated_methods_df, on=merge_keys, how='outer')
    save_csv(classes_df, intermediate_dir, "classes_df_afterMerge.csv")

    classes_df = self.fillna(classes_df)
    save_csv(classes_df, classes_data, csv_name)

    # Strip the identifying columns from the method-level output; columns
    # that are absent are silently skipped.
    for key_column in ('File', 'Class', 'Method'):
        methods_df = methods_df.drop(key_column, axis=1, errors='ignore')
    save_csv(methods_df, method_data, csv_name)

    return classes_df, methods_df
def build_dataset(version, project):
    """Build the class-level Designite-smells + traditional-metrics dataset.

    Appends the Designite smell names, the code-metric names and ``Bugged``
    to a ``DataBuilder``, builds class- and method-level frames, aggregates
    the method-level features per ``(File, Class)`` and outer-merges them
    into the class-level frame.

    Args:
        version: Version identifier passed to ``DataBuilder`` and used in
            log messages.
        project: Project object passed to ``DataBuilder``; its ``github()``
            result is used in log messages.

    Returns:
        The merged dataset with rows containing missing values removed, or
        ``None`` when the class-level frame is empty or any step raises
        (the failure is logged to the ``failure`` and ``failure_verbose``
        loggers).
    """
    general_log = logging.getLogger(__name__)
    success_log = logging.getLogger("success")
    failure_log = logging.getLogger("failure")
    failure_verbose_log = logging.getLogger("failure_verbose")

    def mean_or_union(rows):
        """Aggregate one grouped column: ``any`` for bools, mean otherwise."""
        if rows.dtypes.name == 'bool':
            return any(rows)
        try:
            return np.mean(rows)
        except TypeError:
            # Retry after an integer cast for values np.mean rejects.
            rows = rows.astype('int64')
            return np.mean(rows)

    try:
        db = DataBuilder(project, version)
        for data_name in (
            DataNameEnum.ImperativeAbstraction,
            DataNameEnum.MultifacetedAbstraction,
            DataNameEnum.UnnecessaryAbstraction,
            DataNameEnum.UnutilizedAbstraction,
            DataNameEnum.DeficientEncapsulation,
            DataNameEnum.UnexploitedEncapsulation,
            DataNameEnum.BrokenModularization,
            DataNameEnum.Cyclic_DependentModularization,
            DataNameEnum.InsufficientModularization,
            DataNameEnum.Hub_likeModularization,
            DataNameEnum.BrokenHierarchy,
            DataNameEnum.CyclicHierarchy,
            DataNameEnum.DeepHierarchy,
            DataNameEnum.MissingHierarchy,
            DataNameEnum.MultipathHierarchy,
            DataNameEnum.RebelliousHierarchy,
            DataNameEnum.WideHierarchy,
            DataNameEnum.CBO,
            DataNameEnum.WMC_CK,
            DataNameEnum.RFC,
            DataNameEnum.LOCMethod_CK,
            DataNameEnum.Returns,
            DataNameEnum.NumberOfVariables,
            DataNameEnum.NumberOfParameters_CK,
            DataNameEnum.NumberOfLoops,
            DataNameEnum.NumberOfComparisons,
            DataNameEnum.NumberOfTryCatch,
            DataNameEnum.NumberOfParenthesizedExps,
            DataNameEnum.NumberOfStringLiterals,
            DataNameEnum.NumberOfNumbers,
            DataNameEnum.NumberOfAssignments,
            DataNameEnum.NumberOfMathOperations,
            DataNameEnum.MaxNumberOfNestedBlocks,
            DataNameEnum.NumberOfAnonymousClasses,
            DataNameEnum.NumberOfInnerClasses,
            DataNameEnum.NumberOfLambdas,
            DataNameEnum.NumberOfUniqueWords,
            DataNameEnum.NumberOfModifiers,
            DataNameEnum.NumberOfLogStatements,
            DataNameEnum.NumberOfFields,
            DataNameEnum.NumberOfPublicFields,
            DataNameEnum.NumberOfMethods_Designite,
            DataNameEnum.NumberOfPublicMethods_Designite,
            DataNameEnum.NumberOfChildren,
            DataNameEnum.DepthOfInheritance,
            DataNameEnum.LOCClass,
            DataNameEnum.LCOM,
            DataNameEnum.FANIN,
            DataNameEnum.FANOUT,
            DataNameEnum.TotalNumberOfOperators,
            DataNameEnum.NumberOfDistinctOperators,
            DataNameEnum.TotalNumberOfOperands,
            DataNameEnum.NumberOfDistinctOperands,
            DataNameEnum.Length,
            DataNameEnum.Vocabulary,
            DataNameEnum.Volume,
            DataNameEnum.Difficulty,
            DataNameEnum.Effort,
            DataNameEnum.NCSSForThisFile,
            DataNameEnum.NestedIfElseDepth,
            DataNameEnum.BooleanExpressionComplexity,
            DataNameEnum.CyclomaticComplexity,
            DataNameEnum.NCSSForThisMethod,
            DataNameEnum.NPathComplexity,
            DataNameEnum.ThrowsCount,
            DataNameEnum.NCSSForThisClass,
            DataNameEnum.ExecutableStatementCount,
            DataNameEnum.MethodLength,
            DataNameEnum.FileLength,
            DataNameEnum.NumberOfMethods_Checkstyle,
            DataNameEnum.NumberOfPublicMethods_Checkstyle,
            DataNameEnum.ClassFanOutComplexity,
            DataNameEnum.ClassDataAbstractionCoupling,
            DataNameEnum.Bugged,
        ):
            db.append(data_name)

        general_log.info("{0} | {1} | building dataset".format(
            project.github(), version))
        classes_df, methods_df = db.build()

        # An empty class-level frame is a failure; the except-branch below
        # logs it. The message names the data this function actually
        # requests (no Fowler smells are appended here).
        if classes_df.empty:
            raise Exception(
                "Designite smells and traditional metrics dataset is empty.")

        # file-class conversion leaves rows without values(nan) - drop them
        classes_df.dropna(inplace=True)

        # Missing feature values become 0; missing identifiers become the
        # literal string 'nan'.
        values = {feature: 0 for feature in methods_df.columns}
        values.update(dict(zip(('File', 'Class', 'Method'), ['nan'] * 3)))
        methods_df.fillna(value=values, inplace=True)
        methods_df.dropna(inplace=True)

        # NOTE(review): the slice skips the first three columns, presumably
        # File, Class and Method - confirm the column order DataBuilder
        # guarantees.
        aggregation_fns = {
            feature: mean_or_union
            for feature in list(methods_df.columns)[3:]
        }
        aggregated_methods_df = methods_df.groupby(
            ['File', 'Class']).aggregate(aggregation_fns).reset_index()

        dataset = classes_df.merge(
            aggregated_methods_df, on=['File', 'Class'], how='outer')
        dataset.dropna(inplace=True)

        # Report success only once the dataset is fully assembled, so a
        # failure in the aggregation/merge steps cannot follow a logged
        # success for the same run.
        success_log.info("{0} | {1} | succeeded building dataset".format(
            project.github(), version))
        return dataset
    except Exception:
        failure_log.error(
            "{0} | {1} | (exception) failed building dataset".format(
                project.github(), version))
        # .exception() also records the active traceback.
        failure_verbose_log.exception(
            "{0} | {1} | failed building dataset".format(
                project.github(), version))
        return None
def build_dataset(version, project):
    """Build the class-level Fowler + Designite smells dataset for a version.

    Appends the Fowler smell names, the Designite smell names and ``Bugged``
    to a ``DataBuilder``, builds class- and method-level frames, collapses
    the method-level smells per ``(File, Class)`` with ``any`` and
    outer-merges them into the class-level frame.

    Args:
        version: Version identifier passed to ``DataBuilder`` and used in
            log messages.
        project: Project object passed to ``DataBuilder``; its ``github()``
            result is used in log messages.

    Returns:
        The merged dataset with missing values replaced by ``False``, or
        ``None`` when the class-level frame is empty or any step raises
        (the failure is logged to the ``failure`` and ``failure_verbose``
        loggers).
    """
    general_log = logging.getLogger(__name__)
    success_log = logging.getLogger("success")
    failure_log = logging.getLogger("failure")
    failure_verbose_log = logging.getLogger("failure_verbose")

    def union_smell(value):
        """Return True when any method in the group exhibits the smell."""
        return any(value)

    try:
        db = DataBuilder(project, version)
        for data_name in (
            DataNameEnum.GodClass,
            DataNameEnum.ClassDataShouldBePrivate,
            DataNameEnum.ComplexClass,
            DataNameEnum.LazyClass,
            DataNameEnum.RefusedBequest,
            DataNameEnum.SpaghettiCode,
            DataNameEnum.SpeculativeGenerality,
            DataNameEnum.DataClass,
            DataNameEnum.BrainClass,
            DataNameEnum.LargeClass,
            DataNameEnum.SwissArmyKnife,
            DataNameEnum.AntiSingleton,
            DataNameEnum.FeatureEnvy,
            DataNameEnum.LongMethod_Organic,
            DataNameEnum.LongParameterList_Organic,
            DataNameEnum.MessageChain,
            DataNameEnum.DispersedCoupling,
            DataNameEnum.IntensiveCoupling,
            DataNameEnum.ShotgunSurgery,
            DataNameEnum.BrainMethod,
            DataNameEnum.ImperativeAbstraction,
            DataNameEnum.MultifacetedAbstraction,
            DataNameEnum.UnnecessaryAbstraction,
            DataNameEnum.UnutilizedAbstraction,
            DataNameEnum.DeficientEncapsulation,
            DataNameEnum.UnexploitedEncapsulation,
            DataNameEnum.BrokenModularization,
            DataNameEnum.Cyclic_DependentModularization,
            DataNameEnum.InsufficientModularization,
            DataNameEnum.Hub_likeModularization,
            DataNameEnum.BrokenHierarchy,
            DataNameEnum.CyclicHierarchy,
            DataNameEnum.DeepHierarchy,
            DataNameEnum.MissingHierarchy,
            DataNameEnum.MultipathHierarchy,
            DataNameEnum.RebelliousHierarchy,
            DataNameEnum.WideHierarchy,
            DataNameEnum.Bugged,
        ):
            db.append(data_name)

        general_log.info("{0} | {1} | building dataset".format(
            project.github(), version))
        classes_df, methods_df = db.build()

        # An empty class-level frame is a failure; the except-branch below
        # logs it.
        if classes_df.empty:
            raise Exception("Fowler smells dataset is empty.")

        # NOTE(review): the slice skips the first three columns, presumably
        # File, Class and Method - confirm the column order DataBuilder
        # guarantees.
        aggregation_fns = {
            feature: union_smell
            for feature in list(methods_df.columns)[3:]
        }
        aggregated_methods_df = methods_df.groupby(
            ['File', 'Class']).aggregate(aggregation_fns).reset_index()

        classes_df.dropna(inplace=True)
        dataset = classes_df.merge(
            aggregated_methods_df, on=['File', 'Class'], how='outer')
        # Rows present on only one side of the outer merge get False for
        # the columns they are missing.
        dataset.fillna(False, inplace=True)

        # Report success only once the dataset is fully assembled, so a
        # failure in the aggregation/merge steps cannot follow a logged
        # success for the same run.
        success_log.info("{0} | {1} | succeeded building dataset".format(
            project.github(), version))
        return dataset
    except Exception:
        failure_log.error(
            "{0} | {1} | (exception) failed building dataset".format(
                project.github(), version))
        # .exception() also records the active traceback.
        failure_verbose_log.exception(
            "{0} | {1} | failed building dataset".format(
                project.github(), version))
        return None
def get_designite_builder(project, version):
    """Return a ``DataBuilder`` loaded with the Designite smells and Bugged.

    Args:
        project: Project object passed to ``DataBuilder``.
        version: Version identifier passed to ``DataBuilder``.

    Returns:
        The ``DataBuilder`` after all data names below were appended, in
        the listed order.
    """
    builder = DataBuilder(project, version)
    for data_name in (
        DataName.ImperativeAbstraction,
        DataName.MultifacetedAbstraction,
        DataName.UnnecessaryAbstraction,
        DataName.UnutilizedAbstraction,
        DataName.DeficientEncapsulation,
        DataName.UnexploitedEncapsulation,
        DataName.BrokenModularization,
        DataName.Cyclic_DependentModularization,
        DataName.InsufficientModularization,
        DataName.Hub_likeModularization,
        DataName.BrokenHierarchy,
        DataName.CyclicHierarchy,
        DataName.DeepHierarchy,
        DataName.MissingHierarchy,
        DataName.MultipathHierarchy,
        DataName.RebelliousHierarchy,
        DataName.WideHierarchy,
        DataName.Bugged,
    ):
        builder.append(data_name)
    return builder
def get_traditional_builder(project, version):
    """Return a ``DataBuilder`` loaded with the traditional metrics and Bugged.

    Args:
        project: Project object passed to ``DataBuilder``.
        version: Version identifier passed to ``DataBuilder``.

    Returns:
        The ``DataBuilder`` after all data names below were appended, in
        the listed order.
    """
    builder = DataBuilder(project, version)
    for data_name in (
        DataName.CBO,
        DataName.WMC_CK,
        DataName.RFC,
        DataName.LOCMethod_CK,
        DataName.Returns,
        DataName.NumberOfVariables,
        DataName.NumberOfParameters_CK,
        DataName.NumberOfLoops,
        DataName.NumberOfComparisons,
        DataName.NumberOfTryCatch,
        DataName.NumberOfParenthesizedExps,
        DataName.NumberOfStringLiterals,
        DataName.NumberOfNumbers,
        DataName.NumberOfAssignments,
        DataName.NumberOfMathOperations,
        DataName.MaxNumberOfNestedBlocks,
        DataName.NumberOfAnonymousClasses,
        DataName.NumberOfInnerClasses,
        DataName.NumberOfLambdas,
        DataName.NumberOfUniqueWords,
        DataName.NumberOfModifiers,
        DataName.NumberOfLogStatements,
        DataName.NumberOfFields,
        DataName.NumberOfPublicFields,
        DataName.NumberOfMethods_Designite,
        DataName.NumberOfPublicMethods_Designite,
        DataName.NumberOfChildren,
        DataName.DepthOfInheritance,
        DataName.LOCClass,
        DataName.LCOM,
        DataName.FANIN,
        DataName.FANOUT,
        DataName.TotalNumberOfOperators,
        DataName.NumberOfDistinctOperators,
        DataName.TotalNumberOfOperands,
        DataName.NumberOfDistinctOperands,
        DataName.Length,
        DataName.Vocabulary,
        DataName.Volume,
        DataName.Difficulty,
        DataName.Effort,
        DataName.NCSSForThisFile,
        DataName.NestedIfElseDepth,
        DataName.BooleanExpressionComplexity,
        DataName.CyclomaticComplexity,
        DataName.NCSSForThisMethod,
        DataName.NPathComplexity,
        DataName.ThrowsCount,
        DataName.NCSSForThisClass,
        DataName.ExecutableStatementCount,
        DataName.MethodLength,
        DataName.FileLength,
        DataName.NumberOfMethods_Checkstyle,
        DataName.NumberOfPublicMethods_Checkstyle,
        DataName.ClassFanOutComplexity,
        DataName.ClassDataAbstractionCoupling,
        DataName.Bugged,
    ):
        builder.append(data_name)
    return builder
def get_designite_fowler_traditional_builder(project, version):
    """Return a ``DataBuilder`` loaded with every smell and metric group.

    The Designite smells are appended first, then the Fowler smells, then
    the traditional metrics, and finally ``Bugged``.

    Args:
        project: Project object passed to ``DataBuilder``.
        version: Version identifier passed to ``DataBuilder``.

    Returns:
        The ``DataBuilder`` after all data names below were appended, in
        the listed order.
    """
    builder = DataBuilder(project, version)
    for data_name in (
        # Designite smells
        DataName.ImperativeAbstraction,
        DataName.MultifacetedAbstraction,
        DataName.UnnecessaryAbstraction,
        DataName.UnutilizedAbstraction,
        DataName.DeficientEncapsulation,
        DataName.UnexploitedEncapsulation,
        DataName.BrokenModularization,
        DataName.Cyclic_DependentModularization,
        DataName.InsufficientModularization,
        DataName.Hub_likeModularization,
        DataName.BrokenHierarchy,
        DataName.CyclicHierarchy,
        DataName.DeepHierarchy,
        DataName.MissingHierarchy,
        DataName.MultipathHierarchy,
        DataName.RebelliousHierarchy,
        DataName.WideHierarchy,
        # Fowler smells
        DataName.GodClass,
        DataName.ClassDataShouldBePrivate,
        DataName.ComplexClass,
        DataName.LazyClass,
        DataName.RefusedBequest,
        DataName.SpaghettiCode,
        DataName.SpeculativeGenerality,
        DataName.DataClass,
        DataName.BrainClass,
        DataName.LargeClass,
        DataName.SwissArmyKnife,
        DataName.AntiSingleton,
        DataName.FeatureEnvy,
        DataName.LongMethod_Organic,
        DataName.LongParameterList_Organic,
        DataName.MessageChain,
        DataName.DispersedCoupling,
        DataName.IntensiveCoupling,
        DataName.ShotgunSurgery,
        DataName.BrainMethod,
        # Traditional metrics
        DataName.CBO,
        DataName.WMC_CK,
        DataName.RFC,
        DataName.LOCMethod_CK,
        DataName.Returns,
        DataName.NumberOfVariables,
        DataName.NumberOfParameters_CK,
        DataName.NumberOfLoops,
        DataName.NumberOfComparisons,
        DataName.NumberOfTryCatch,
        DataName.NumberOfParenthesizedExps,
        DataName.NumberOfStringLiterals,
        DataName.NumberOfNumbers,
        DataName.NumberOfAssignments,
        DataName.NumberOfMathOperations,
        DataName.MaxNumberOfNestedBlocks,
        DataName.NumberOfAnonymousClasses,
        DataName.NumberOfInnerClasses,
        DataName.NumberOfLambdas,
        DataName.NumberOfUniqueWords,
        DataName.NumberOfModifiers,
        DataName.NumberOfLogStatements,
        DataName.NumberOfFields,
        DataName.NumberOfPublicFields,
        DataName.NumberOfMethods_Designite,
        DataName.NumberOfPublicMethods_Designite,
        DataName.NumberOfChildren,
        DataName.DepthOfInheritance,
        DataName.LOCClass,
        DataName.LCOM,
        DataName.FANIN,
        DataName.FANOUT,
        DataName.TotalNumberOfOperators,
        DataName.NumberOfDistinctOperators,
        DataName.TotalNumberOfOperands,
        DataName.NumberOfDistinctOperands,
        DataName.Length,
        DataName.Vocabulary,
        DataName.Volume,
        DataName.Difficulty,
        DataName.Effort,
        DataName.NCSSForThisFile,
        DataName.NestedIfElseDepth,
        DataName.BooleanExpressionComplexity,
        DataName.CyclomaticComplexity,
        DataName.NCSSForThisMethod,
        DataName.NPathComplexity,
        DataName.ThrowsCount,
        DataName.NCSSForThisClass,
        DataName.ExecutableStatementCount,
        DataName.MethodLength,
        DataName.FileLength,
        DataName.NumberOfMethods_Checkstyle,
        DataName.NumberOfPublicMethods_Checkstyle,
        DataName.ClassFanOutComplexity,
        DataName.ClassDataAbstractionCoupling,
        # Label
        DataName.Bugged,
    ):
        builder.append(data_name)
    return builder
def get_fowler_builder(project, version):
    """Return a ``DataBuilder`` loaded with the Fowler smells and Bugged.

    Args:
        project: Project object passed to ``DataBuilder``.
        version: Version identifier passed to ``DataBuilder``.

    Returns:
        The ``DataBuilder`` after all data names below were appended, in
        the listed order.
    """
    builder = DataBuilder(project, version)
    for data_name in (
        DataName.GodClass,
        DataName.ClassDataShouldBePrivate,
        DataName.ComplexClass,
        DataName.LazyClass,
        DataName.RefusedBequest,
        DataName.SpaghettiCode,
        DataName.SpeculativeGenerality,
        DataName.DataClass,
        DataName.BrainClass,
        DataName.LargeClass,
        DataName.SwissArmyKnife,
        DataName.AntiSingleton,
        DataName.FeatureEnvy,
        DataName.LongMethod_Organic,
        DataName.LongParameterList_Organic,
        DataName.MessageChain,
        DataName.DispersedCoupling,
        DataName.IntensiveCoupling,
        DataName.ShotgunSurgery,
        DataName.BrainMethod,
        DataName.Bugged,
    ):
        builder.append(data_name)
    return builder
def get_designite_fowler_builder(project, version):
    """Return a ``DataBuilder`` loaded with Fowler and Designite smells.

    The Fowler smells are appended first, then the Designite smells, and
    finally ``Bugged``.

    Args:
        project: Project object passed to ``DataBuilder``.
        version: Version identifier passed to ``DataBuilder``.

    Returns:
        The ``DataBuilder`` after all data names below were appended, in
        the listed order.
    """
    builder = DataBuilder(project, version)
    for data_name in (
        # Fowler smells
        DataName.GodClass,
        DataName.ClassDataShouldBePrivate,
        DataName.ComplexClass,
        DataName.LazyClass,
        DataName.RefusedBequest,
        DataName.SpaghettiCode,
        DataName.SpeculativeGenerality,
        DataName.DataClass,
        DataName.BrainClass,
        DataName.LargeClass,
        DataName.SwissArmyKnife,
        DataName.AntiSingleton,
        DataName.FeatureEnvy,
        DataName.LongMethod_Organic,
        DataName.LongParameterList_Organic,
        DataName.MessageChain,
        DataName.DispersedCoupling,
        DataName.IntensiveCoupling,
        DataName.ShotgunSurgery,
        DataName.BrainMethod,
        # Designite smells
        DataName.ImperativeAbstraction,
        DataName.MultifacetedAbstraction,
        DataName.UnnecessaryAbstraction,
        DataName.UnutilizedAbstraction,
        DataName.DeficientEncapsulation,
        DataName.UnexploitedEncapsulation,
        DataName.BrokenModularization,
        DataName.Cyclic_DependentModularization,
        DataName.InsufficientModularization,
        DataName.Hub_likeModularization,
        DataName.BrokenHierarchy,
        DataName.CyclicHierarchy,
        DataName.DeepHierarchy,
        DataName.MissingHierarchy,
        DataName.MultipathHierarchy,
        DataName.RebelliousHierarchy,
        DataName.WideHierarchy,
        # Label
        DataName.Bugged,
    ):
        builder.append(data_name)
    return builder