def run(self,
            counting_conditions: counting_condition_dict = default_cc_dict,
            clones: str = ".*\/clones.*",
            average_calculation: bool = False,
            poly_postprocessing: bool = True,
            exp_postprocessing: bool = False,
            extra_include_paths: path_list = ()):
        conditions = list(counting_conditions.keys())
        weightings = list(counting_conditions.values())

        self.debug("The following counting conditions and weightings will be "
                   "used as starting values:")
        self.debug_weightings(conditions, weightings)

        old = None
        while old != weightings:
            old = weightings.copy()
            weightings = self.optimize_weightings(
                conditions,
                weightings,
                clones,
                average_calculation=average_calculation,
                poly_postprocessing=poly_postprocessing,
                exp_postprocessing=exp_postprocessing,
                extra_include_paths=collect_dirs(list(extra_include_paths)))
            self.debug_weightings(conditions, weightings)
            if old != weightings:
                self.debug("Continuing investigation.")
Example #2
def get_count_matrices(count_vector_creator, filenames, progress_callback,
                       base_path, extra_include_paths):  # pragma nt: no cover
    """
    Retrieves matrices holding count vectors for all variables of all
    functions in the given files.

    :param count_vector_creator: An object with a get_vectors_for_file method
                                 taking a filename as argument.
    :param filenames:            The files to create count vectors for.
    :param progress_callback:    A function that is called after each
                                 processed file with the progress percentage
                                 (a float) as its only argument.
    :param base_path:            A path whose parent directory is searched
                                 recursively for additional include
                                 directories.
    :param extra_include_paths:  A list containing additional include paths.
    :return:                     A dict with (file, line, function) tuples as
                                 keys and, as values, dicts mapping variable
                                 names to count vector objects.
    """
    result = {}
    maxlen = len(filenames)
    include_paths = collect_dirs([os.path.dirname(base_path) + '/**'])
    include_paths += extra_include_paths

    for i, filename in enumerate(filenames):
        progress_callback(100 * (i / maxlen))
        count_dict = count_vector_creator.get_vectors_for_file(
            filename, include_paths)
        for function in count_dict:
            if not exclude_function(count_dict[function]):
                result[(filename, function[0],
                        function[1])] = count_dict[function]

    return result
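A usage sketch for the function above. The creator below is a tiny stand-in that only honours the documented get_vectors_for_file interface so the layout of the returned dict becomes visible; it assumes the module context of the code above (os, collect_dirs, exclude_function) is available, and every file name and count is made up:

class FakeCountVectorCreator:
    # Stand-in: maps a filename to {(line, function_name): {variable: counts}}.
    def get_vectors_for_file(self, filename, include_paths):
        return {(1, 'main()'): {'x': [2, 0], 'y': [1, 1]}}

matrices = get_count_matrices(
    FakeCountVectorCreator(),
    ['a.c', 'b.c'],
    lambda percent: print('{:.0f}%...'.format(percent)),
    'project/.coafile',  # base_path; its parent directory is globbed
    [])                  # extra_include_paths

for (filename, line, function), variables in matrices.items():
    print(filename, line, function, sorted(variables))
# e.g. a.c 1 main() ['x', 'y']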
Example #3
 def test_dir_collection(self):
     self.assertEqual(
         sorted(collect_dirs([os.path.join(self.collectors_test_dir,
                                           "**")])),
         sorted([os.path.join(self.collectors_test_dir, "bears"),
             os.path.join(self.collectors_test_dir, "bears", "__pycache__"),
             os.path.join(self.collectors_test_dir, "others"),
             os.path.join(self.collectors_test_dir, "others", "c_files"),
             os.path.join(self.collectors_test_dir, "others", "py_files"),
             self.collectors_test_dir]))
Example #4
 def test_dir_collection(self):
     self.assertEqual(
         sorted(i for i in
                collect_dirs([os.path.join(self.collectors_test_dir,
                                           "**")])
                if "__pycache__" not in i),
         sorted([os.path.normcase(os.path.join(
             self.collectors_test_dir, "bears")),
             os.path.normcase(os.path.join(self.collectors_test_dir,
                                           "others")),
             os.path.normcase(os.path.join(self.collectors_test_dir,
                                           "others",
                                           "c_files")),
             os.path.normcase(os.path.join(self.collectors_test_dir,
                                           "others",
                                           "py_files")),
             os.path.normcase(self.collectors_test_dir+os.sep)]))
Example #5
 def test_dir_string_collection(self):
     self.assertEqual(
         sorted(i for i in collect_dirs(
             os.path.join(self.collectors_test_dir, "**"))
                if "__pycache__" not in i),
         sorted([
             os.path.normcase(
                 os.path.join(self.collectors_test_dir, "bears")),
             os.path.normcase(
                 os.path.join(self.collectors_test_dir, "others")),
             os.path.normcase(
                 os.path.join(self.collectors_test_dir, "others",
                              "c_files")),
             os.path.normcase(
                 os.path.join(self.collectors_test_dir, "others",
                              "py_files")),
             os.path.normcase(self.collectors_test_dir + os.sep)
         ]))
Example #6
 def test_dir_string_collection(self):
     self.assertEqual(
         sorted(collect_dirs(os.path.join(self.collectors_test_dir,
                                          "**"))),
         sorted([os.path.normcase(os.path.join(
             self.collectors_test_dir, "bears")),
             os.path.normcase(os.path.join(self.collectors_test_dir,
                                           "bears",
                                           "__pycache__")),
             os.path.normcase(os.path.join(self.collectors_test_dir,
                                           "others")),
             os.path.normcase(os.path.join(self.collectors_test_dir,
                                           "others",
                                           "c_files")),
             os.path.normcase(os.path.join(self.collectors_test_dir,
                                           "others",
                                           "py_files")),
             os.path.normcase(self.collectors_test_dir+os.sep)]))
Example #7
 def test_dir_string_collection(self):
     self.assertEqual(
         sorted(i for i in
                collect_dirs(os.path.join(self.collectors_test_dir,
                                          '**'))
                if '__pycache__' not in i),
         sorted([os.path.normcase(os.path.join(
             self.collectors_test_dir, 'bears')),
             os.path.normcase(os.path.join(self.collectors_test_dir,
                                           'bears_local_global')),
             os.path.normcase(os.path.join(self.collectors_test_dir,
                                           'others')),
             os.path.normcase(os.path.join(self.collectors_test_dir,
                                           'others',
                                           'c_files')),
             os.path.normcase(os.path.join(self.collectors_test_dir,
                                           'others',
                                           'py_files')),
             os.path.normcase(self.collectors_test_dir)]))
Example #9
 def test_dir_collection(self):
     self.assertEqual(
         sorted(collect_dirs([os.path.join(self.collectors_test_dir,
                                           "**")])),
         sorted([
             os.path.normcase(
                 os.path.join(self.collectors_test_dir, "bears")),
             os.path.normcase(
                 os.path.join(self.collectors_test_dir, "bears",
                              "__pycache__")),
             os.path.normcase(
                 os.path.join(self.collectors_test_dir, "others")),
             os.path.normcase(
                 os.path.join(self.collectors_test_dir, "others",
                              "c_files")),
             os.path.normcase(
                 os.path.join(self.collectors_test_dir, "others",
                              "py_files")),
             os.path.normcase(self.collectors_test_dir + os.sep)
         ]))
Example #10
 def test_ignored(self):
     self.assertEqual(
         sorted(i for i in collect_dirs(
             [os.path.join(self.collectors_test_dir, "**")], [
                 os.path.normcase(
                     os.path.join(self.collectors_test_dir, "others",
                                  "py_files"))
             ]) if "__pycache__" not in i),
         sorted([
             os.path.normcase(
                 os.path.join(self.collectors_test_dir, "bears")),
             os.path.normcase(
                 os.path.join(self.collectors_test_dir,
                              "bears_local_global")),
             os.path.normcase(
                 os.path.join(self.collectors_test_dir, "others")),
             os.path.normcase(
                 os.path.join(self.collectors_test_dir, "others",
                              "c_files")),
             os.path.normcase(self.collectors_test_dir)
         ]))
Example #11
    def test_ignored(self):
        self.assertEqual(
            sorted(i for i in
                   collect_dirs([os.path.join(self.collectors_test_dir,
                                              "**")],
                                [os.path.normcase(os.path.join(
                                    self.collectors_test_dir,
                                    "others",
                                    "py_files"))])
                   if "__pycache__" not in i),

            sorted([os.path.normcase(os.path.join(
                self.collectors_test_dir, "bears")),
                os.path.normcase(os.path.join(self.collectors_test_dir,
                                              "bears_local_global")),
                os.path.normcase(os.path.join(self.collectors_test_dir,
                                              "others")),
                os.path.normcase(os.path.join(self.collectors_test_dir,
                                              "others",
                                              "c_files")),
                os.path.normcase(self.collectors_test_dir)]))
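The tests above exercise two behaviours of collect_dirs: expanding '**' patterns to the directories below them and skipping the paths listed in the optional second argument. A rough, standard-library-only approximation of that behaviour for illustration (this is not coala's implementation):

import os
from glob import glob

def collect_dirs_sketch(patterns, ignored=()):
    # Expand each glob pattern and keep the directories that are not ignored.
    if isinstance(patterns, str):
        patterns = [patterns]
    ignored = {os.path.normcase(path) for path in ignored}
    found = set()
    for pattern in patterns:
        for path in glob(pattern, recursive=True):
            if os.path.isdir(path) and os.path.normcase(path) not in ignored:
                found.add(os.path.normcase(path))
    return sorted(found)

# e.g. collect_dirs_sketch([os.path.join('project', '**')],
#                          [os.path.join('project', 'build')])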
def get_count_matrices(count_vector_creator,
                       filenames,
                       progress_callback,
                       base_path,
                       extra_include_paths):
    """
    Retrieves matrices holding count vectors for all variables of all
    functions in the given files.

    :param count_vector_creator: An object with a get_vectors_for_file method
                                 taking a filename as argument.
    :param filenames:            The files to create count vectors for.
    :param progress_callback:    A function that is called after each
                                 processed file with the progress percentage
                                 (a float) as its only argument.
    :param base_path:            A path whose parent directory is searched
                                 recursively for additional include
                                 directories.
    :param extra_include_paths:  A list containing additional include paths.
    :return:                     A dict with (file, line, function) tuples as
                                 keys and, as values, dicts mapping variable
                                 names to count vector objects.
    """
    result = {}
    maxlen = len(filenames)
    include_paths = collect_dirs([os.path.dirname(base_path) + "/**"])
    include_paths += extra_include_paths

    for i, filename in enumerate(filenames):
        progress_callback(100*(i/maxlen))
        count_dict = count_vector_creator.get_vectors_for_file(filename,
                                                               include_paths)
        for function in count_dict:
            if not exclude_function(count_dict[function]):
                result[(filename,
                        function[0],
                        function[1])] = count_dict[function]

    return result
    def run(self,
            counting_conditions: counting_condition_dict = default_cc_dict,
            average_calculation: bool = False,
            poly_postprocessing: bool = True,
            exp_postprocessing: bool = False,
            extra_include_paths: path_list = ()):
        '''
        Retrieves similarities for code clone detection. Those can be reused in
        another bear to produce results.

        Postprocessing may be done because small functions are less likely to
        be clones at the same difference value than big functions, which may
        provide a better refactoring opportunity for the user.

        :param counting_conditions: A comma-separated list of counting
                                    conditions. Possible values are: used,
                                    returned, is_condition, in_condition,
                                    in_second_level_condition,
                                    in_third_level_condition, is_assignee,
                                    is_assigner, loop_content,
                                    second_level_loop_content,
                                    third_level_loop_content, is_param,
                                    in_sum, in_product, in_binary_operation,
                                    member_accessed.
                                    A weighting can be assigned to each
                                    condition by providing a dict value;
                                    e.g. weighting used half as much as the
                                    other conditions would simply be:
                                    "used: 0.5, is_assignee". Weightings
                                    default to 1 if unset.
        :param average_calculation: If set to true, the difference calculation
                                    function will take the average of all
                                    variable differences as the difference;
                                    otherwise it will normalize the function
                                    as a whole and thus weight variables
                                    depending on their size.
        :param poly_postprocessing: If set to true, the difference value of big
                                    function pairs will be reduced using a
                                    polynomial approach.
        :param extra_include_paths: A list containing additional include paths.
        :param exp_postprocessing:  If set to true, the difference value of big
                                    function pairs will be reduced using an
                                    exponential approach.
        '''
        self.debug("Using the following counting conditions:")
        for key, val in counting_conditions.items():
            self.debug(" *", key.__name__, "(weighting: {})".format(val))

        self.debug("Creating count matrices...")
        count_matrices = get_count_matrices(
            ClangCountVectorCreator(list(counting_conditions.keys()),
                                    list(counting_conditions.values())),
            list(self.file_dict.keys()),
            lambda prog: self.debug("{:2.4f}%...".format(prog)),
            self.section["files"].origin,
            collect_dirs(extra_include_paths))

        self.debug("Calculating differences...")

        differences = []
        function_count = len(count_matrices)
        # That's n choose 2, hardcoded to simplify the calculation
        combination_length = function_count * (function_count-1) / 2
        partial_get_difference = functools.partial(
            get_difference,
            count_matrices=count_matrices,
            average_calculation=average_calculation,
            poly_postprocessing=poly_postprocessing,
            exp_postprocessing=exp_postprocessing)

        for i, elem in enumerate(
                map(partial_get_difference,
                    [(f1, f2) for f1, f2 in combinations(count_matrices, 2)])):
            if i % 50 == 0:
                self.debug("{:2.4f}%...".format(100*i/combination_length))
            differences.append(elem)

        yield HiddenResult(self, differences)
        yield HiddenResult(self, count_matrices)
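A sketch of how a dependent bear might consume the first HiddenResult yielded above. The element layout (a pair of function keys plus their difference value) and the threshold are assumptions made for illustration, not taken from the code above:

def report_probable_clones(differences, max_difference=0.2):
    # Assumed layout: ((function_1, function_2), difference),
    # where a lower difference means more similar functions.
    for (function_1, function_2), difference in differences:
        if difference < max_difference:
            print('Probable clone pair ({:.3f}): {} <-> {}'.format(
                difference, function_1, function_2))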
Example #14
 def test_dir_invalid(self):
     self.assertEqual(collect_dirs(["invalid_path"]), [])
Example #15
 def test_dir_invalid(self):
     self.assertEqual(collect_dirs(['invalid_path']), [])
    def run(self,
            counting_conditions: counting_condition_dict = default_cc_dict,
            average_calculation: bool = False,
            poly_postprocessing: bool = True,
            exp_postprocessing: bool = False,
            extra_include_paths: path_list = ()):
        '''
        Retrieves similarities for code clone detection. Those can be reused in
        another bear to produce results.

        Postprocessing may be done because small functions are less likely to
        be clones at the same difference value than big functions, which may
        provide a better refactoring opportunity for the user.

        :param counting_conditions: A comma-separated list of counting
                                    conditions. Possible values are: used,
                                    returned, is_condition, in_condition,
                                    in_second_level_condition,
                                    in_third_level_condition, is_assignee,
                                    is_assigner, loop_content,
                                    second_level_loop_content,
                                    third_level_loop_content, is_param,
                                    in_sum, in_product, in_binary_operation,
                                    member_accessed.
                                    A weighting can be assigned to each
                                    condition by providing a dict value;
                                    e.g. weighting used half as much as the
                                    other conditions would simply be:
                                    "used: 0.5, is_assignee". Weightings
                                    default to 1 if unset.
        :param average_calculation: If set to true, the difference calculation
                                    function will take the average of all
                                    variable differences as the difference;
                                    otherwise it will normalize the function
                                    as a whole and thus weight variables
                                    depending on their size.
        :param poly_postprocessing: If set to true, the difference value of big
                                    function pairs will be reduced using a
                                    polynomial approach.
        :param extra_include_paths: A list containing additional include paths.
        :param exp_postprocessing:  If set to true, the difference value of big
                                    function pairs will be reduced using an
                                    exponential approach.
        '''
        self.debug("Using the following counting conditions:")
        for key, val in counting_conditions.items():
            self.debug(" *", key.__name__, "(weighting: {})".format(val))

        self.debug("Creating count matrices...")
        count_matrices = get_count_matrices(
            ClangCountVectorCreator(list(counting_conditions.keys()),
                                    list(counting_conditions.values())),
            list(self.file_dict.keys()),
            lambda prog: self.debug("{:2.4f}%...".format(prog)),
            self.section["files"].origin, collect_dirs(extra_include_paths))

        self.debug("Calculating differences...")

        differences = []
        function_count = len(count_matrices)
        # That's n choose 2, hardcoded to simplify the calculation
        combination_length = function_count * (function_count - 1) / 2
        partial_get_difference = functools.partial(
            get_difference,
            count_matrices=count_matrices,
            average_calculation=average_calculation,
            poly_postprocessing=poly_postprocessing,
            exp_postprocessing=exp_postprocessing)

        for i, elem in enumerate(
                map(partial_get_difference,
                    [(f1, f2) for f1, f2 in combinations(count_matrices, 2)])):
            if i % 50 == 0:
                self.debug("{:2.4f}%...".format(100 * i / combination_length))
            differences.append(elem)

        yield HiddenResult(self, differences)
        yield HiddenResult(self, count_matrices)
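The counting_conditions docstring above describes a comma-separated syntax such as "used: 0.5, is_assignee", with weightings defaulting to 1 when omitted. A self-contained sketch of parsing that syntax into a name-to-weighting dict (coala resolves the names to condition functions; here they stay plain strings):

def parse_counting_conditions(setting):
    # "used: 0.5, is_assignee"  ->  {'used': 0.5, 'is_assignee': 1.0}
    weightings = {}
    for entry in setting.split(','):
        name, _, weight = entry.partition(':')
        weightings[name.strip()] = float(weight) if weight.strip() else 1.0
    return weightings

print(parse_counting_conditions('used: 0.5, is_assignee'))
# {'used': 0.5, 'is_assignee': 1.0}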