예제 #1
0
 def __init__(self, split_percent, dataset: Dataset, seed):
     """Initialize the identifier-based data creator.

     Delegates split bookkeeping to the base class, then installs the
     tokenizer/preprocessor pipeline used for identifier trace links.
     """
     super(IdentifierDataCreator, self).__init__(
         split_percent, dataset, seed)
     self._code_tokenizer = JavaCodeASTTokenizer()
     # Identifiers are camel-cased; split them and drop non-letter tokens.
     identifier_steps = [CamelCaseSplitter(True), NonLetterFilter()]
     self._code_preprocessor = Preprocessor(identifier_steps)
     self._tracelink_type = TraceLinkType.identifier_tracelinks
예제 #2
0
    def __init__(
            self,
            preprocessor=Preprocessor(),
            wordemb_creator=MockWordEmbeddingCreator(),
            tokenizer=JavaCodeASTTokenizer(None, None),
            preprocessed_token_output_directory=PREPROCESSED_CODE_OUTPUT_DIR):
        """Forward all configuration to the base embedding creator;
        this subclass adds no state of its own."""
        super(MethodNameSentenceEmbeddingCreator, self).__init__(
            preprocessor, wordemb_creator, tokenizer,
            preprocessed_token_output_directory)
예제 #3
0
 def __init__(
         self,
         preprocessor=Preprocessor(),
         wordemb_creator=MockWordEmbeddingCreator(),
         tokenizer=JavaCodeASTTokenizer(None, None),
         preprocessed_token_output_directory=PREPROCESSED_CODE_OUTPUT_DIR):
     """Identifier embedding variant restricted to methods."""
     super(IdentifierEmbeddingOnlyMethods, self).__init__(
         preprocessor, wordemb_creator, tokenizer,
         preprocessed_token_output_directory)
     # Methods only: exclude the class-name contribution.
     self._with_class_name = False
예제 #4
0
    def __init__(
            self,
            preprocessor=Preprocessor(),
            wordemb_creator=MockWordEmbeddingCreator(),
            tokenizer=JavaCodeASTTokenizer(None, None),
            preprocessed_token_output_directory=PREPROCESSED_CODE_OUTPUT_DIR):
        """Identifier embedding variant that also includes attributes."""
        super(IdentifierEmbeddingWithAttribute, self).__init__(
            preprocessor, wordemb_creator, tokenizer,
            preprocessed_token_output_directory)
        # Enable the attribute contribution on top of the base configuration.
        self._with_attribute = True
예제 #5
0
def rename_eanci_code_files_and_solution_matrix():
    """Rename EANCI code files after their first classifier and rewrite the
    solution matrix so its code entries match the new file names.

    Side effects: renames files on disk inside the EANCI code folder and
    overwrites the solution matrix file. Run once; repeated runs will fail
    to find the already-renamed files.
    """
    dataset = EANCINoTrans()  # hoisted: one dataset instance instead of many
    tok = JavaCodeASTTokenizer(dataset, WordTokenizer(dataset, True))
    code_files = FileUtil.get_files_in_directory(dataset.code_folder(), True)
    code_file_to_class_name_map = {}
    old_sol_matrix = FileUtil.read_xml_format_solution_matrix(
        dataset.folder() / "answer_req_code.xml")
    for code_file in code_files:
        code_file_representation = tok.tokenize(code_file)
        # The first classifier in the file determines the new file name.
        class_name = code_file_representation.classifiers[0].get_original_name()
        new_file_name = class_name + ".java"
        old_file_name = FileUtil.get_filename_from_path(code_file)
        code_file_to_class_name_map[
            FileUtil.get_filename_without_extension__from_path(old_file_name)] = new_file_name
        os.rename(code_file, dataset.code_folder() / new_file_name)

    renamed_solution_links = []
    for old_req_name, old_code_name in old_sol_matrix.get_all_trace_links():
        renamed_solution_links.append(
            f"{old_req_name}.txt: {code_file_to_class_name_map[old_code_name]}")
    # BUG FIX: the original call was missing its closing parentheses,
    # which made the whole file a syntax error.
    FileUtil.write_file(dataset.EANCI_SOLUTION_MATRIX_PATH,
                        "\n".join(renamed_solution_links))
#rename_eanci_code_files_and_solution_matrix()
        
예제 #6
0
 def __init__(
         self,
         preprocessor=Preprocessor(),
         wordemb_creator=MockWordEmbeddingCreator(),
         tokenizer=JavaCodeASTTokenizer(None, None),
         preprocessed_token_output_directory=PREPROCESSED_CODE_OUTPUT_DIR):
     """Identifier embedding variant that additionally maps method
     comments to the class level."""
     super(IdentifierEmbeddingCreatorWithMethodCommentToClass, self).__init__(
         preprocessor, wordemb_creator, tokenizer,
         preprocessed_token_output_directory)
     # Enabled contributions for this variant.
     self._with_class_name = True
     self._with_method = True
     self._with_method_comment_to_class = True
     self._with_class_name_to_method = True
예제 #7
0
 def __init__(
         self,
         preprocessor=Preprocessor(),
         wordemb_creator=MockWordEmbeddingCreator(),
         tokenizer=JavaCodeASTTokenizer(None, None),
         preprocessed_token_output_directory=PREPROCESSED_CODE_OUTPUT_DIR):
     """Base code embedding creator; detects Italian comment handling
     from the word embedding creator type."""
     super(CodeEmbeddingCreator, self).__init__(
         preprocessor, wordemb_creator, tokenizer,
         preprocessed_token_output_directory)
     self._is_ital_identifier = False
     # Italian comment mode is implied by the aligned eng/ital embeddings.
     self._is_ital_comm = isinstance(
         wordemb_creator, FastTextAlignedEngItalEmbeddingCreator)
예제 #8
0
    def __init__(
            self,
            precalculated_weights_file,
            preprocessor=Preprocessor(),
            wordemb_creator=MockWordEmbeddingCreator(),
            tokenizer=JavaCodeASTTokenizer(None, None),
            preprocessed_token_output_directory=PREPROCESSED_CODE_OUTPUT_DIR):
        """Identifier embedding creator that weights terms with
        precalculated TF-IDF values.

        Args:
            precalculated_weights_file: path to a file holding precalculated
                TF-IDF weights; a falsy value skips loading (a note is logged).
        """
        super(TFIDFIdentifierEmbeddingCreator,
              self).__init__(preprocessor, wordemb_creator, tokenizer,
                             preprocessed_token_output_directory)

        # ROBUSTNESS: always define the attribute so later reads cannot
        # raise AttributeError when no weights file was supplied (the
        # original only assigned it in the else branch).
        self._tf_idf_data = None
        if not precalculated_weights_file:
            log.info("No precalculated weights file read")
        else:
            self._tf_idf_data = TFIDFData(precalculated_weights_file)
예제 #9
0
    def __init__(
            self,
            preprocessor=Preprocessor(),
            wordemb_creator=MockWordEmbeddingCreator(),
            tokenizer=JavaCodeASTTokenizer(None, None),
            preprocessed_token_output_directory=PREPROCESSED_CODE_OUTPUT_DIR):
        """Identifier embedding variant limited to class names and class
        comments; method and attribute contributions are switched off."""
        super(IdentifierEmbeddingOnlyClassNameAndComment, self).__init__(
            preprocessor, wordemb_creator, tokenizer,
            preprocessed_token_output_directory)
        # Enabled: class comment contribution.
        self._with_class_comment = True
        # Disabled: everything method-related.
        self._with_method = False
        self._with_class_name_to_method = False
        # Disabled: everything attribute-related.
        self._with_attribute = False
        self._with_attribute_comment_to_attr = False
        self._with_attribute_comment_to_class = False
예제 #10
0
 def __init__(self, split_percent, dataset: Dataset, seed):
     """Base setup for train/test data creators.

     Builds the requirement and code tokenization/preprocessing pipelines
     and the bookkeeping containers for the chosen (training) and
     remaining (test) splits.
     """
     self._req_tokenizer = SentenceTokenizer()
     self._req_preprocessor = Preprocessor([
         Separator(True),
         CamelCaseSplitter(True),
         NonLetterFilter(),
         DuplicateWhiteSpaceFilter(),
         AddFullStop(),
     ])
     self._code_tokenizer = JavaCodeASTTokenizer(SentenceTokenizer())
     self._code_preprocessor = Preprocessor([
         JavaDocFilter(),
         Separator(True),
         CamelCaseSplitter(True),
         NonLetterFilter(),
         DuplicateWhiteSpaceFilter(),
         AddFullStop(),
     ])
     # File names (without path and extension) per split.
     self._chosen_req_filenames = set()      # training-set requirement files
     self._remaining_req_filenames = set()   # test-set requirement files
     self._chosen_code_filenames = set()     # training-set code files
     self._remaining_code_filenames = set()  # test-set code files
     # Valid trace links between the files of each split.
     self._chosen_trace_matrix = SolutionTraceMatrix()
     self._remaining_trace_matrix = SolutionTraceMatrix()
     # All requirement/code files of the project (e.g. etour).
     self._all_req_files = FileUtil.get_files_in_directory(
         dataset.req_folder())
     self._all_code_files = FileUtil.get_files_in_directory(
         dataset.code_folder())
     self._split_percent = split_percent  # percentage of chosen file data
     self._dataset = dataset
     self._seed = seed
     # Complete solution matrix of the project.
     self._solution_matrix = dataset.solution_matrix()
     # Set by non-abstract subclass constructors.
     self._tracelink_type = None
예제 #11
0
    def __init__(
            self,
            preprocessor=Preprocessor(),
            wordemb_creator=MockWordEmbeddingCreator(),
            tokenizer=JavaCodeASTTokenizer(None, None),
            preprocessed_token_output_directory=PREPROCESSED_CODE_OUTPUT_DIR):
        """Configure the identifier embedding flags, then initialize the
        base embedding creator.

        NOTE(review): the flags are assigned *before* the super() call,
        preserving the original statement order.
        """
        # Enabled contributions.
        self._with_class_name = True
        self._with_method = True
        self._with_class_name_to_method = True
        # Disabled contributions.
        self._with_super_classifier = False
        self._with_class_comment = False
        self._with_attribute = False
        self._with_attribute_comment_to_attr = False
        self._with_attribute_comment_to_class = False
        self._with_method_comment_to_method = False
        self._with_method_comment_to_class = False
        self._with_method_body_to_method = False
        self._with_method_body_to_class = False
        self._with_inner_classifier = False
        # Function that maps multiple vectors to one.
        self._average_function = Util.create_averaged_vector

        super(IdentifierEmbeddingCreator, self).__init__(
            preprocessor, wordemb_creator, tokenizer,
            preprocessed_token_output_directory)
예제 #12
0
            ):
                if not implemented_classifier in classifier_to_file_map:
                    log.info(
                        f"SKIP: Unknown super classifier (probably not part of {dataset.name()}): {implemented_classifier}"
                    )
                    continue
                file_of_super_classifier = classifier_to_file_map[
                    implemented_classifier]
                super_classes.add(file_of_super_classifier)
                # Add sub class relation from super class' perspective
                if file_of_super_classifier in implements_graph:
                    implements_graph[file_of_super_classifier][1].add(
                        code_file_representation.file_name)
                else:
                    implements_graph[file_of_super_classifier] = (set(), {
                        code_file_representation.file_name
                    })

        if code_file_representation.file_name in implements_graph:
            implements_graph[code_file_representation.file_name][0].update(
                super_classes)
        else:
            implements_graph[code_file_representation.file_name] = (
                super_classes, set())

    FileUtil.write_dict_to_json(output_file, implements_graph)


#generate_inheritance_graph(Etour308(), JavaCodeASTTokenizer())
# Script entry: precompute the implements graph for the Etour308 dataset.
generate_implements_graph(Etour308(), JavaCodeASTTokenizer())
예제 #13
0
# Module-level logger named after this module, per logging convention.
log = logging.getLogger(__name__)


def generate_classifer_to_file_map(dataset, tokenizer, output_file=None):
    """Precalculate a classifier-name -> code-file map and save it as JSON.

    Load the json with FileUtil.read_dict_from_json().

    Args:
        dataset: project dataset whose code folder is scanned.
        tokenizer: must produce a CodeFileRepresentation per code file.
        output_file: target JSON path; defaults to
            Paths.classifier_to_file_map_filename(dataset).

    NOTE: the function name keeps its historical misspelling ("classifer")
    because external callers rely on it.
    """
    if not output_file:
        output_file = Paths.classifier_to_file_map_filename(dataset)

    classifier_to_file_map = {}
    for file in FileUtil.get_files_in_directory(dataset.code_folder()):
        code_file_representation = tokenizer.tokenize(file)
        assert isinstance(
            code_file_representation, CodeFileRepresentation
        ), "use an appropriate tokenizer to generate a CodeFileRepresentation"
        file_name = FileUtil.get_filename_from_path(file)  # with extension
        for classifier in code_file_representation.classifiers:
            classifier_name = classifier.get_original_name()
            if classifier_name in classifier_to_file_map:
                # BUG FIX: the original logged and indexed with
                # classifier.name although the map is keyed by
                # get_original_name(), which could raise KeyError here.
                log.info(
                    f"Duplicate classifier name: {classifier_name} -> {file_name} overwrites {classifier_name} -> {classifier_to_file_map[classifier_name]}"
                )
            classifier_to_file_map[classifier_name] = file_name
    FileUtil.write_dict_to_json(output_file, classifier_to_file_map)


# Script entry: build and persist the classifier -> file map for Etour308.
generate_classifer_to_file_map(Etour308(), JavaCodeASTTokenizer())