def __init__(self, split_percent, dataset: Dataset, seed):
    """Data creator that produces identifier-level trace links.

    Replaces the base class' code tokenizer/preprocessor with an
    identifier-oriented pipeline (camel-case splitting + letter filtering)
    and tags the created links as identifier trace links.
    """
    super(IdentifierDataCreator, self).__init__(split_percent, dataset, seed)
    identifier_pipeline = [CamelCaseSplitter(True), NonLetterFilter()]
    self._code_preprocessor = Preprocessor(identifier_pipeline)
    self._code_tokenizer = JavaCodeASTTokenizer()
    self._tracelink_type = TraceLinkType.identifier_tracelinks
def __init__(self,
             preprocessor=None,
             wordemb_creator=None,
             tokenizer=None,
             preprocessed_token_output_directory=PREPROCESSED_CODE_OUTPUT_DIR):
    """Embedding creator that derives sentence embeddings from method names.

    Omitted arguments default to Preprocessor(), MockWordEmbeddingCreator()
    and JavaCodeASTTokenizer(None, None) respectively.
    """
    # Fix: the original defaults were mutable objects created once at
    # function-definition time and therefore shared by every instance
    # constructed without explicit arguments.
    if preprocessor is None:
        preprocessor = Preprocessor()
    if wordemb_creator is None:
        wordemb_creator = MockWordEmbeddingCreator()
    if tokenizer is None:
        tokenizer = JavaCodeASTTokenizer(None, None)
    super(MethodNameSentenceEmbeddingCreator, self).__init__(
        preprocessor, wordemb_creator, tokenizer,
        preprocessed_token_output_directory)
def __init__(self,
             preprocessor=None,
             wordemb_creator=None,
             tokenizer=None,
             preprocessed_token_output_directory=PREPROCESSED_CODE_OUTPUT_DIR):
    """Identifier embedding creator restricted to methods only.

    Disables the class-name contribution inherited from the parent creator.
    Omitted arguments default to Preprocessor(), MockWordEmbeddingCreator()
    and JavaCodeASTTokenizer(None, None) respectively.
    """
    # Fix: the original defaults were mutable objects created once at
    # function-definition time and shared across all instances.
    if preprocessor is None:
        preprocessor = Preprocessor()
    if wordemb_creator is None:
        wordemb_creator = MockWordEmbeddingCreator()
    if tokenizer is None:
        tokenizer = JavaCodeASTTokenizer(None, None)
    super(IdentifierEmbeddingOnlyMethods, self).__init__(
        preprocessor, wordemb_creator, tokenizer,
        preprocessed_token_output_directory)
    self._with_class_name = False
def __init__(self,
             preprocessor=None,
             wordemb_creator=None,
             tokenizer=None,
             preprocessed_token_output_directory=PREPROCESSED_CODE_OUTPUT_DIR):
    """Identifier embedding creator that additionally includes attributes.

    Omitted arguments default to Preprocessor(), MockWordEmbeddingCreator()
    and JavaCodeASTTokenizer(None, None) respectively.
    """
    # Fix: the original defaults were mutable objects created once at
    # function-definition time and shared across all instances.
    if preprocessor is None:
        preprocessor = Preprocessor()
    if wordemb_creator is None:
        wordemb_creator = MockWordEmbeddingCreator()
    if tokenizer is None:
        tokenizer = JavaCodeASTTokenizer(None, None)
    super(IdentifierEmbeddingWithAttribute, self).__init__(
        preprocessor, wordemb_creator, tokenizer,
        preprocessed_token_output_directory)
    self._with_attribute = True
def rename_eanci_code_files_and_solution_matrix():
    """Rename EANCI code files to <ClassName>.java and rewrite the solution matrix.

    Each code file is renamed after the original name of its first classifier;
    the old solution matrix is then rewritten so its code-file entries refer to
    the new names. Side effects: renames files on disk and writes the new
    solution matrix file.
    """
    tok = JavaCodeASTTokenizer(EANCINoTrans(),
                               WordTokenizer(EANCINoTrans(), True))
    code_files = FileUtil.get_files_in_directory(EANCINoTrans().code_folder(),
                                                 True)
    code_file_to_class_name_map = {}
    old_sol_matrix = FileUtil.read_xml_format_solution_matrix(
        EANCINoTrans().folder() / "answer_req_code.xml")
    for code_file in code_files:
        code_file_representation = tok.tokenize(code_file)
        # The new file name is the original name of the first classifier.
        class_name = code_file_representation.classifiers[0].get_original_name()
        new_file_name = class_name + ".java"
        old_file_name = FileUtil.get_filename_from_path(code_file)
        code_file_to_class_name_map[
            FileUtil.get_filename_without_extension__from_path(
                old_file_name)] = new_file_name
        os.rename(code_file, EANCINoTrans().code_folder() / new_file_name)
    renamed_solution_links = []
    for old_req_name, old_code_name in old_sol_matrix.get_all_trace_links():
        renamed_solution_links.append(
            f"{old_req_name}.txt: {code_file_to_class_name_map[old_code_name]}")
    # Bug fix: the original call was missing its closing parentheses,
    # which made the file a syntax error.
    FileUtil.write_file(EANCINoTrans().EANCI_SOLUTION_MATRIX_PATH,
                        "\n".join(renamed_solution_links))


#rename_eanci_code_files_and_solution_matrix()
def __init__(self,
             preprocessor=None,
             wordemb_creator=None,
             tokenizer=None,
             preprocessed_token_output_directory=PREPROCESSED_CODE_OUTPUT_DIR):
    """Identifier embedding creator that maps method comments to the class.

    Enables class names, methods, method-comment-to-class and
    class-name-to-method contributions. Omitted arguments default to
    Preprocessor(), MockWordEmbeddingCreator() and
    JavaCodeASTTokenizer(None, None) respectively.
    """
    # Fix: the original defaults were mutable objects created once at
    # function-definition time and shared across all instances.
    if preprocessor is None:
        preprocessor = Preprocessor()
    if wordemb_creator is None:
        wordemb_creator = MockWordEmbeddingCreator()
    if tokenizer is None:
        tokenizer = JavaCodeASTTokenizer(None, None)
    super(IdentifierEmbeddingCreatorWithMethodCommentToClass, self).__init__(
        preprocessor, wordemb_creator, tokenizer,
        preprocessed_token_output_directory)
    self._with_class_name = True
    self._with_method = True
    self._with_method_comment_to_class = True
    self._with_class_name_to_method = True
def __init__(self,
             preprocessor=None,
             wordemb_creator=None,
             tokenizer=None,
             preprocessed_token_output_directory=PREPROCESSED_CODE_OUTPUT_DIR):
    """Base embedding creator for code artifacts.

    Omitted arguments default to Preprocessor(), MockWordEmbeddingCreator()
    and JavaCodeASTTokenizer(None, None) respectively.
    """
    # Fix: the original defaults were mutable objects created once at
    # function-definition time and shared across all instances.
    if preprocessor is None:
        preprocessor = Preprocessor()
    if wordemb_creator is None:
        wordemb_creator = MockWordEmbeddingCreator()
    if tokenizer is None:
        tokenizer = JavaCodeASTTokenizer(None, None)
    super(CodeEmbeddingCreator, self).__init__(
        preprocessor, wordemb_creator, tokenizer,
        preprocessed_token_output_directory)
    self._is_ital_identifier = False
    self._is_ital_comm = False
    # Comments are treated as Italian when an aligned Eng/Ital word
    # embedding creator is used.
    if isinstance(wordemb_creator, FastTextAlignedEngItalEmbeddingCreator):
        self._is_ital_comm = True
def __init__(self,
             precalculated_weights_file,
             preprocessor=None,
             wordemb_creator=None,
             tokenizer=None,
             preprocessed_token_output_directory=PREPROCESSED_CODE_OUTPUT_DIR):
    """Identifier embedding creator weighting tokens with TF-IDF.

    :param precalculated_weights_file: file with precalculated TF-IDF
        weights; if falsy, no weights are loaded.
    Omitted keyword arguments default to Preprocessor(),
    MockWordEmbeddingCreator() and JavaCodeASTTokenizer(None, None).
    """
    # Fix: the original defaults were mutable objects created once at
    # function-definition time and shared across all instances.
    if preprocessor is None:
        preprocessor = Preprocessor()
    if wordemb_creator is None:
        wordemb_creator = MockWordEmbeddingCreator()
    if tokenizer is None:
        tokenizer = JavaCodeASTTokenizer(None, None)
    super(TFIDFIdentifierEmbeddingCreator, self).__init__(
        preprocessor, wordemb_creator, tokenizer,
        preprocessed_token_output_directory)
    if not precalculated_weights_file:
        # NOTE(review): _tf_idf_data stays unset in this branch; later
        # accesses would raise AttributeError — confirm callers handle this.
        log.info("No precalculated weights file read")
    else:
        self._tf_idf_data = TFIDFData(precalculated_weights_file)
def __init__(self,
             preprocessor=None,
             wordemb_creator=None,
             tokenizer=None,
             preprocessed_token_output_directory=PREPROCESSED_CODE_OUTPUT_DIR):
    """Identifier embedding creator using only class names and class comments.

    Disables method- and attribute-related contributions. Omitted arguments
    default to Preprocessor(), MockWordEmbeddingCreator() and
    JavaCodeASTTokenizer(None, None) respectively.
    """
    # Fix: the original defaults were mutable objects created once at
    # function-definition time and shared across all instances.
    if preprocessor is None:
        preprocessor = Preprocessor()
    if wordemb_creator is None:
        wordemb_creator = MockWordEmbeddingCreator()
    if tokenizer is None:
        tokenizer = JavaCodeASTTokenizer(None, None)
    super(IdentifierEmbeddingOnlyClassNameAndComment, self).__init__(
        preprocessor, wordemb_creator, tokenizer,
        preprocessed_token_output_directory)
    self._with_class_comment = True
    self._with_method = False
    self._with_class_name_to_method = False
    self._with_attribute = False
    self._with_attribute_comment_to_attr = False
    self._with_attribute_comment_to_class = False
def __init__(self, split_percent, dataset: Dataset, seed):
    """Base constructor for train/test data creators.

    Splits a dataset's requirement and code files into a "chosen"
    (training) and a "remaining" (test) partition; subclasses set
    self._tracelink_type (see final assignment below).

    :param split_percent: percentage of files assigned to the chosen set
    :param dataset: dataset providing req/code folders and solution matrix
    :param seed: random seed (presumably for the split — TODO confirm
        where it is consumed)
    """
    self._req_tokenizer = SentenceTokenizer()
    self._req_preprocessor = Preprocessor([
        Separator(True),
        CamelCaseSplitter(True),
        NonLetterFilter(),
        DuplicateWhiteSpaceFilter(),
        AddFullStop()
    ])
    self._code_tokenizer = JavaCodeASTTokenizer(SentenceTokenizer())
    self._code_preprocessor = Preprocessor([
        JavaDocFilter(),
        Separator(True),
        CamelCaseSplitter(True),
        NonLetterFilter(),
        DuplicateWhiteSpaceFilter(),
        AddFullStop()
    ])
    # Req file names (without path and extension) for the training set.
    self._chosen_req_filenames = set()
    # Req file names (without path and extension) for the test set.
    self._remaining_req_filenames = set()
    # Code file names (without path and extension) for the training set.
    self._chosen_code_filenames = set()
    # Code file names (without path and extension) for the test set.
    # (The original comment said "req" here — copy-paste slip.)
    self._remaining_code_filenames = set()
    # Valid trace links between chosen code and req files (training set).
    self._chosen_trace_matrix = SolutionTraceMatrix()
    # Valid trace links between remaining code and req files (test set).
    self._remaining_trace_matrix = SolutionTraceMatrix()
    # All req files of a project (e.g. etour).
    self._all_req_files = FileUtil.get_files_in_directory(
        dataset.req_folder())
    # All code files of a project.
    self._all_code_files = FileUtil.get_files_in_directory(
        dataset.code_folder())
    self._split_percent = split_percent  # percentage of chosen file data
    self._dataset = dataset
    self._seed = seed
    # Complete solution matrix of a project (e.g. etour).
    self._solution_matrix = dataset.solution_matrix()
    # Set this in non-abstract sub class constructors.
    self._tracelink_type = None
def __init__(self,
             preprocessor=None,
             wordemb_creator=None,
             tokenizer=None,
             preprocessed_token_output_directory=PREPROCESSED_CODE_OUTPUT_DIR):
    """Embedding creator operating on the identifiers of a code file.

    The _with_* flags choose which classifier parts (names, comments,
    attributes, methods, ...) contribute to the embeddings; subclasses
    toggle them in their own constructors. Omitted arguments default to
    Preprocessor(), MockWordEmbeddingCreator() and
    JavaCodeASTTokenizer(None, None) respectively.
    """
    # Fix: the original defaults were mutable objects created once at
    # function-definition time and shared across all instances.
    if preprocessor is None:
        preprocessor = Preprocessor()
    if wordemb_creator is None:
        wordemb_creator = MockWordEmbeddingCreator()
    if tokenizer is None:
        tokenizer = JavaCodeASTTokenizer(None, None)
    self._with_class_name = True
    self._with_super_classifier = False
    self._with_class_comment = False
    self._with_attribute = False
    self._with_attribute_comment_to_attr = False
    self._with_attribute_comment_to_class = False
    self._with_method = True
    self._with_method_comment_to_method = False
    self._with_method_comment_to_class = False
    self._with_method_body_to_method = False
    self._with_method_body_to_class = False
    self._with_class_name_to_method = True
    self._with_inner_classifier = False
    # Function that maps multiple vectors to one.
    self._average_function = Util.create_averaged_vector
    super(IdentifierEmbeddingCreator, self).__init__(
        preprocessor, wordemb_creator, tokenizer,
        preprocessed_token_output_directory)
): if not implemented_classifier in classifier_to_file_map: log.info( f"SKIP: Unknown super classifier (probably not part of {dataset.name()}): {implemented_classifier}" ) continue file_of_super_classifier = classifier_to_file_map[ implemented_classifier] super_classes.add(file_of_super_classifier) # Add sub class relation from super class' perspective if file_of_super_classifier in implements_graph: implements_graph[file_of_super_classifier][1].add( code_file_representation.file_name) else: implements_graph[file_of_super_classifier] = (set(), { code_file_representation.file_name }) if code_file_representation.file_name in implements_graph: implements_graph[code_file_representation.file_name][0].update( super_classes) else: implements_graph[code_file_representation.file_name] = ( super_classes, set()) FileUtil.write_dict_to_json(output_file, implements_graph) #generate_inheritance_graph(Etour308(), JavaCodeASTTokenizer()) generate_implements_graph(Etour308(), JavaCodeASTTokenizer())
log = logging.getLogger(__name__)


def generate_classifer_to_file_map(dataset, tokenizer, output_file=None):
    """Precalculates a classifier -> file map and saves it to a json.

    Load the json with FileUtil.read_dict_from_json()

    :param dataset: dataset whose code folder is scanned
    :param tokenizer: tokenizer producing a CodeFileRepresentation per file
    :param output_file: target json path; defaults to
        Paths.classifier_to_file_map_filename(dataset)
    """
    if not output_file:
        output_file = Paths.classifier_to_file_map_filename(dataset)
    classifier_to_file_map = {}
    for file in FileUtil.get_files_in_directory(dataset.code_folder()):
        code_file_representation = tokenizer.tokenize(file)
        assert isinstance(
            code_file_representation, CodeFileRepresentation
        ), "use an appropriate tokenizer to generate a CodeFileRepresentation"
        file_name = FileUtil.get_filename_from_path(file)  # with extension
        for classifier in code_file_representation.classifiers:
            original_name = classifier.get_original_name()
            if original_name in classifier_to_file_map:
                # Bug fix: the original log line used classifier.name and
                # indexed the map with classifier.name, although the map is
                # keyed by get_original_name() — that lookup could raise
                # KeyError whenever the two names differ.
                log.info(
                    f"Duplicate classifier name: {original_name} -> {file_name} overwrites {original_name} -> {classifier_to_file_map[original_name]}"
                )
            classifier_to_file_map[original_name] = file_name
    FileUtil.write_dict_to_json(output_file, classifier_to_file_map)


generate_classifer_to_file_map(Etour308(), JavaCodeASTTokenizer())