def extract_features(self, x):
    """
    prepares all features and returns them as a list of arrays

    Reads the 'comment', 'functional_type' and 'code' columns of x.

    NOTE: the 'comment' column of x is overwritten in place with its
    stop-word-free version, so the object passed by the caller is modified.

    :param x: table with 'comment', 'code' and 'functional_type' columns
              (presumably a pandas DataFrame -- confirm against callers)
    :return: list [length_data, stopwords_num,
             code_comment_similarity_cosine, functional_types, comment]
    """
    # Word count and stop-word count are taken from the raw comment text,
    # i.e. before the stop words are stripped further down.
    length_data = x['comment'].apply(lambda c: len(c.split())).to_numpy()
    stopwords_num = x['comment'].apply(
        FeatureHelper.get_stop_words_num).to_numpy()
    functional_types = x['functional_type'].to_numpy()

    # From here on the 'comment' column holds the stop-word-free text.
    x['comment'] = x['comment'].apply(DataProcesser.remove_stopwords)

    # Only the TF-IDF cosine score is part of the returned feature set, so
    # it is the only similarity that is computed (row-wise apply is costly).
    code_comment_similarity_cosine = x.apply(
        lambda row: TextSimilarity.get_similarity_score(
            s1=DataProcesser.preprocess(row['comment']),
            s2=DataProcesser.preprocess(row['code']),
            type='COSINE_TFIDF'),
        axis=1).to_numpy()

    comment = x['comment'].to_numpy()

    return [
        length_data,
        stopwords_num,
        code_comment_similarity_cosine,
        functional_types,
        comment,
    ]
def test_extract_snake_case(self):
    """ extract_snake_case turns underscores into single spaces """
    processor = DataProcesser()
    cases = (
        # single underscore
        ('snake_case', 'snake case'),
        # test multiple snake case
        ('snake_case_oh_yea', 'snake case oh yea'),
    )
    for raw, wanted in cases:
        self.assertEqual(wanted, processor.extract_snake_case(raw))
def test_extract_camel_case(self):
    """ extract_camel_case inserts a space between camel-case words """
    processor = DataProcesser()
    cases = (
        # single camel-case boundary
        ('CamelCase', 'Camel Case'),
        # test multiple camel case
        ('CamelCaseCamelCase', 'Camel Case Camel Case'),
        # test non-camel case
        ('Camel Case', 'Camel Case'),
    )
    for raw, wanted in cases:
        self.assertEqual(wanted, processor.extract_camel_case(raw))
def test_remove_java_tags(self):
    """ remove_java_tags drops Java doc tags and leaves other text alone """
    processor = DataProcesser()
    cases = (
        # test basic tag removal
        ('@author this', 'this'),
        # test tag removal based on regexp
        ('{@link szdsdzsdz} this', 'this'),
        # test no tag removal
        ('as this', 'as this'),
    )
    for text, wanted in cases:
        # leading whitespace left behind by the removal is not under test
        cleaned = processor.remove_java_tags(text).lstrip()
        self.assertEqual(wanted, cleaned)
def is_line_java_keyword(code_line: str) -> bool:
    """
    returns True if the line, once its special characters are removed,
    is equal to one of the entries of JAVA_KEYWORDS
    """
    cleaned = DataProcesser.remove_special_characters(code_line)
    return any(cleaned == keyword for keyword in JAVA_KEYWORDS)
def is_invalid_code(code_line: str) -> bool:
    """
    returns True if the code line is empty, is a comment, contains only
    a Java tag, only a Java keyword or only a special character
    """
    # Comment and tag detection are evaluated on the raw line, before any
    # characters are stripped from it.
    comment_hit = is_line_comment(code_line)
    tag_hit = is_line_java_tag(code_line)

    # The remaining checks work on the line without special characters
    # and without blanks.
    stripped = DataProcesser.remove_special_characters(code_line)
    stripped = stripped.replace(" ", "")

    if stripped.isspace():
        return True
    return (
        comment_hit
        or tag_hit
        or is_line_java_keyword(stripped)
        or is_only_special_char(stripped)
    )
def is_only_special_char(code_line: str) -> bool:
    """
    returns True if nothing is left of the line after its blanks are
    replaced by commas and its special characters are removed
    """
    # NOTE(review): blanks are turned into commas first -- presumably so
    # that remove_special_characters strips them as well; confirm against
    # DataProcesser.remove_special_characters.
    code_line = code_line.replace(" ", ",")
    code_line = DataProcesser.remove_special_characters(code_line)
    return code_line == ''
def test_preprocess(self):
    """ preprocessing '@Override this' is expected to give an empty string """
    processed = DataProcesser().preprocess('@Override this')
    self.assertEqual('', processed)