def tokenize_java(s, keep_comments=False):
    """Tokenize Java source code into a list of token strings.

    String literals and comments are normalized through ``process_string``
    (with the JAVA_CHAR2TOKEN / JAVA_TOKEN2CHAR maps); comments that
    normalize to the empty string are dropped; every other token keeps its
    raw lexer value.

    Args:
        s: Java source code (must be ``str``).
        keep_comments: forwarded to the lexer; when True, comment tokens
            are emitted.

    Returns:
        List of token strings, or ``[]`` when tokenization fails for any
        reason (including non-``str`` input, since the assert is inside the
        try block).
    """
    try:
        tokens = []
        assert isinstance(s, str)
        # NOTE(review): r'\r' is the two-character sequence backslash + 'r',
        # NOT a carriage return ('\r'). This strips escaped "\r" text, not CR
        # line endings — confirm this is intentional before changing it.
        s = s.replace(r'\r', '')
        tokens_generator = javalang_tok.tokenize(
            s, keep_comments=keep_comments)
        for token in tokens_generator:
            if isinstance(token, javalang_tok.String):
                tokens.append(process_string(
                    token.value, JAVA_CHAR2TOKEN, JAVA_TOKEN2CHAR, False))
            elif isinstance(token, javalang_tok.Comment):
                com = process_string(
                    token.value, JAVA_CHAR2TOKEN, JAVA_TOKEN2CHAR, True)
                # Skip comments that are empty after normalization.
                if len(com) > 0:
                    tokens.append(com)
            else:
                tokens.append(token.value)
        return tokens
    except KeyboardInterrupt:
        # Never swallow user interrupts (consistent with detokenize_java).
        raise
    except Exception:
        # Best-effort contract: unparseable input yields an empty list.
        return []
def detokenize_java(s):
    """Reconstruct Java source text from a token sequence.

    Inverse of ``tokenize_java``: maps sentinel tokens (ENDCOM, SPACETOKEN,
    braces, semicolons, comment markers) back to Java syntax, re-indents the
    result with ``indent_lines``, then restores the newline/tab/space
    placeholders inside string and comment literals.

    Args:
        s: a token string, or a list of tokens (joined with single spaces).

    Returns:
        The detokenized (pretty-printed) Java source as a string.
    """
    assert isinstance(s, str) or isinstance(s, list)
    if isinstance(s, list):
        s = ' '.join(s)
    s = s.replace('ENDCOM', 'NEW_LINE')
    s = s.replace('▁', 'SPACETOKEN')
    # Order matters below: brace contexts that must NOT trigger the generic
    # '{' / '}' newline rules ('} "', '" {', '} ;', '} ,') are rewritten to
    # temporary markers (CB_/OB_/CB_COLON/CB_COMA) first, and only undone
    # after the lines have been split and re-indented.
    s = s.replace('} "', 'CB_ "')
    s = s.replace('" {', '" OB_')
    s = s.replace('*/ ', '*/ NEW_LINE')
    s = s.replace('} ;', 'CB_COLON NEW_LINE')
    s = s.replace('} ,', 'CB_COMA')
    s = s.replace('}', 'CB_ NEW_LINE')
    s = s.replace('{', 'OB_ NEW_LINE')
    s = s.replace(';', '; NEW_LINE')
    lines = re.split('NEW_LINE', s)
    untok_s = indent_lines(lines)
    untok_s = untok_s.replace('CB_COLON', '};').replace('CB_COMA', '},').replace(
        'CB_', '}').replace('OB_', '{')
    # Re-fuse shift/generic operators that tokenization split apart
    # (longest sequences first so '>>>' is not rebuilt as '>>' + '>').
    untok_s = untok_s.replace('> > >', '>>>').replace('<< <', '<<<')
    untok_s = untok_s.replace('> >', '>>').replace('< <', '<<')
    try:
        # Re-lex the result so placeholder restoration only touches the
        # inside of string and comment tokens, not surrounding code.
        tokens_generator = javalang_tok.tokenize(untok_s, keep_comments=True)
        for token in tokens_generator:
            if isinstance(token, javalang_tok.String) or isinstance(
                    token, javalang_tok.Comment):
                token_ = token.value.replace('STRNEWLINE', '\n').replace(
                    'TABSYMBOL', '\t').replace(' ', '').replace('SPACETOKEN', ' ')
                untok_s = untok_s.replace(token.value, token_)
    except KeyboardInterrupt:
        raise
    except Exception:
        # Placeholder restoration is best-effort; on lexer failure, return
        # the text as re-indented so far.
        pass
    return untok_s