def tokenize(self, text):
    # Tokenize `text` into CuBERT sentences (lists of subword tokens), then
    # flatten them into a single token list.
    subtokenized_sentences = (
        code_to_subtokenized_sentences.code_to_cubert_sentences(
            code=text,
            initial_tokenizer=self.code_tokenizer,
            subword_tokenizer=self.subword_tokenizer))
    return list(itertools.chain(*subtokenized_sentences))
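# Usage sketch for `tokenize` (the owning class is not shown here;
# `CuBertWrapper` and its constructor argument are hypothetical stand-ins for
# whatever sets `self.code_tokenizer` and `self.subword_tokenizer`):
#
#   wrapper = CuBertWrapper('vocab.txt')
#   tokens = wrapper.tokenize('def add(a, b):\n    return a + b\n')
#   # `code_to_cubert_sentences` returns a list of sentences, each a list of
#   # subword tokens; `itertools.chain` flattens them into one token list.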
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    # The value of the `TokenizerEnum` is a `CuBertTokenizer` subclass.
    tokenizer = FLAGS.tokenizer.value()
    subword_tokenizer = text_encoder.SubwordTextEncoder(
        FLAGS.vocabulary_filepath)

    with open(FLAGS.input_filepath, 'r') as input_file:
        code = input_file.read()

    print('#' * 80)
    print('Original Code')
    print('#' * 80)
    print(code)

    subtokenized_sentences = (
        code_to_subtokenized_sentences.code_to_cubert_sentences(
            code=code,
            initial_tokenizer=tokenizer,
            subword_tokenizer=subword_tokenizer))

    print('#' * 80)
    print('CuBERT Sentences')
    print('#' * 80)
    print(subtokenized_sentences)

    with open(FLAGS.output_filepath, 'wt') as output_file:
        output_file.write(json.dumps(subtokenized_sentences, indent=2))
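# Standard absl entry point (assumed from the `app`/`FLAGS` usage above). An
# example invocation, with hypothetical script and file names; the flag names
# match the FLAGS read in `main`:
#
#   python tokenize_code.py \
#       --vocabulary_filepath=vocab.txt \
#       --input_filepath=example.py \
#       --output_filepath=example_tokens.json
if __name__ == '__main__':
    app.run(main)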
def _tokenize_classes(self, project_dir: Path, project_out_dir: Path,
                      project_name: str):
    classes_out_dir = project_out_dir / 'classes'
    classes_out_dir.mkdir(parents=True)
    cs_tokens = {}
    with open(project_dir / 'classes.csv') as classes_file:
        classes_reader = csv.reader(classes_file)
        next(classes_reader)  # Skip the CSV header row.
        for c_id, c_name, c_path, c_offset in classes_reader:
            c_body = read_class(project_dir / project_name / c_path,
                                int(c_offset))
            c_tokens = code_to_cubert_sentences(
                code=c_body,
                initial_tokenizer=self.tokenizer,
                subword_tokenizer=self.sub_word_tokenizer)
            cs_tokens[c_id] = c_tokens
            with open(classes_out_dir / f'{c_name}.json',
                      'w') as class_out_file:
                json.dump(c_tokens, class_out_file)
    return cs_tokens
def _tokenize_methods(self, project_dir: Path, project_out_dir: Path,
                      project_name: str):
    methods_out_dir = project_out_dir / 'methods'
    methods_out_dir.mkdir(parents=True)
    ms_tokenized = []
    with open(project_dir / 'methods.csv') as methods_file:
        methods_reader = csv.reader(methods_file)
        next(methods_reader)  # Skip the CSV header row.
        for m_id, m_name, m_path, m_offset, m_src_class, _ in methods_reader:
            m_body = read_method(project_dir / project_name / m_path,
                                 int(m_offset))
            m_tokens = code_to_cubert_sentences(
                code=m_body,
                initial_tokenizer=self.tokenizer,
                subword_tokenizer=self.sub_word_tokenizer)
            ms_tokenized.append((m_name, m_src_class, m_tokens))
            with (methods_out_dir / f'{m_name}.json').open('w') as method_out_file:
                json.dump(m_tokens, method_out_file)
    return ms_tokenized
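# Usage sketch for the two helpers above (hypothetical driver call; the
# expected layout follows the reads in those helpers:
# <project_dir>/classes.csv, <project_dir>/methods.csv, and source files under
# <project_dir>/<project_name>/):
#
#   cs_tokens = self._tokenize_classes(project_dir, project_out_dir, name)
#   ms_tokenized = self._tokenize_methods(project_dir, project_out_dir, name)
#   # Per-class token JSON lands in <project_out_dir>/classes/, per-method
#   # token JSON in <project_out_dir>/methods/.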
def test_code_to_sentences(self):
    sentences = code_to_subtokenized_sentences.code_to_cubert_sentences(
        self._CODE, self.tokenizer, self.subword_text_encoder)
    self.assertEqual([self._WORDPIECE_SUBTOKENS], sentences)