def tokenize(self, text):
    # Tokenize the source code into CuBERT sentences, then flatten them into
    # a single list of subword tokens.
    subtokenized_sentences = (
        code_to_subtokenized_sentences.code_to_cubert_sentences(
            code=text,
            initial_tokenizer=self.code_tokenizer,
            subword_tokenizer=self.subword_tokenizer))
    return list(itertools.chain(*subtokenized_sentences))
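
This method assumes the enclosing class already holds a code_tokenizer (a CuBERT first-pass tokenizer) and a subword_tokenizer (a wordpiece vocabulary encoder). Below is a minimal sketch of how such a wrapper might be wired up, assuming the google-research cubert package and tensor2tensor are importable; the class name CuBertSubwordTokenizer and the vocabulary argument are illustrative, not taken from the original code.

import itertools

from tensor2tensor.data_generators import text_encoder

from cubert import code_to_subtokenized_sentences
from cubert import python_tokenizer


class CuBertSubwordTokenizer:
    """Hypothetical wrapper pairing a CuBERT code tokenizer with a subword vocabulary."""

    def __init__(self, vocabulary_filepath):
        # First-pass tokenizer that understands Python syntax.
        self.code_tokenizer = python_tokenizer.PythonTokenizer()
        # Second-pass wordpiece tokenizer built from a CuBERT vocabulary file.
        self.subword_tokenizer = text_encoder.SubwordTextEncoder(
            vocabulary_filepath)

    def tokenize(self, text):
        # Same flattening logic as the example above: list of sentences
        # (each a list of subtokens) chained into one flat token list.
        sentences = code_to_subtokenized_sentences.code_to_cubert_sentences(
            code=text,
            initial_tokenizer=self.code_tokenizer,
            subword_tokenizer=self.subword_tokenizer)
        return list(itertools.chain(*sentences))
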
Example #2
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    # The value of the `TokenizerEnum` is a `CuBertTokenizer` subclass.
    tokenizer = FLAGS.tokenizer.value()
    subword_tokenizer = text_encoder.SubwordTextEncoder(
        FLAGS.vocabulary_filepath)

    with open(FLAGS.input_filepath, 'r') as input_file:
        code = input_file.read()
        print('#' * 80)
        print('Original Code')
        print('#' * 80)
        print(code)

    subtokenized_sentences = (
        code_to_subtokenized_sentences.code_to_cubert_sentences(
            code=code,
            initial_tokenizer=tokenizer,
            subword_tokenizer=subword_tokenizer))
    print('#' * 80)
    print('CuBERT Sentences')
    print('#' * 80)
    print(subtokenized_sentences)

    with open(FLAGS.output_filepath, 'wt') as output_file:
        output_file.write(json.dumps(subtokenized_sentences, indent=2))
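
The main function above references several absl flags and module imports defined elsewhere in the script. The following is a hedged sketch of what those surrounding definitions could look like; the TokenizerEnum member shown here is illustrative rather than the repository's actual enumeration.

import enum
import json

from absl import app
from absl import flags

from tensor2tensor.data_generators import text_encoder

from cubert import code_to_subtokenized_sentences
from cubert import python_tokenizer

FLAGS = flags.FLAGS


class TokenizerEnum(enum.Enum):
    # Illustrative member; the real script may enumerate more languages.
    # Each member's value is a `CuBertTokenizer` subclass, as noted above.
    PYTHON = python_tokenizer.PythonTokenizer


flags.DEFINE_enum_class('tokenizer', TokenizerEnum.PYTHON, TokenizerEnum,
                        'The tokenizer to use for the first tokenization pass.')
flags.DEFINE_string('vocabulary_filepath', None,
                    'Path to the CuBERT subword vocabulary file.')
flags.DEFINE_string('input_filepath', None,
                    'Path to the source file to tokenize.')
flags.DEFINE_string('output_filepath', None,
                    'Where to write the JSON list of CuBERT sentences.')


if __name__ == '__main__':
    app.run(main)
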
Example #3
def _tokenize_classes(self, project_dir: Path, project_out_dir: Path,
                      project_name: str):
    classes_out_dir = project_out_dir / 'classes'
    classes_out_dir.mkdir(parents=True)
    cs_tokens = {}
    with open(project_dir / 'classes.csv') as classes_file:
        classes_reader = csv.reader(classes_file)
        # Skip the CSV header row.
        next(classes_reader)
        for c_id, c_name, c_path, c_offset in classes_reader:
            # Extract the class body from its source file at the given offset.
            c_body = read_class(project_dir / project_name / c_path,
                                int(c_offset))
            c_tokens = code_to_cubert_sentences(
                code=c_body,
                initial_tokenizer=self.tokenizer,
                subword_tokenizer=self.sub_word_tokenizer)
            cs_tokens[c_id] = c_tokens
            # Persist the tokenized class alongside the in-memory result.
            with open(classes_out_dir / f'{c_name}.json',
                      'w') as class_out_file:
                json.dump(c_tokens, class_out_file)
    return cs_tokens
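
The loop above expects classes.csv to begin with a header row followed by four columns: a class id, a class name, a path relative to the project sources, and an offset passed to read_class (a helper from the surrounding project, not shown here). A purely illustrative sketch of producing such a file with the csv module; the column names and sample row are assumptions.

import csv

# Hypothetical layout of classes.csv; the real file is produced by the
# project's own extraction step.
with open('classes.csv', 'w', newline='') as classes_file:
    writer = csv.writer(classes_file)
    writer.writerow(['id', 'name', 'path', 'offset'])  # header row, skipped by the reader
    writer.writerow(['1', 'ExampleClass', 'src/example.py', '0'])
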
Example #4
def _tokenize_methods(self, project_dir: Path, project_out_dir: Path,
                      project_name: str):
    methods_out_dir = project_out_dir / 'methods'
    methods_out_dir.mkdir(parents=True)
    ms_tokenized = []
    with open(project_dir / 'methods.csv') as methods_file:
        methods_reader = csv.reader(methods_file)
        # Skip the CSV header row.
        next(methods_reader)
        for m_id, m_name, m_path, m_offset, m_src_class, _ in methods_reader:
            # Extract the method body from its source file at the given offset.
            m_body = read_method(project_dir / project_name / m_path,
                                 int(m_offset))
            m_tokens = code_to_cubert_sentences(
                code=m_body,
                initial_tokenizer=self.tokenizer,
                subword_tokenizer=self.sub_word_tokenizer)
            ms_tokenized.append((m_name, m_src_class, m_tokens))
            # Persist the tokenized method alongside the in-memory result.
            with (methods_out_dir /
                  f'{m_name}.json').open('w') as method_out_file:
                json.dump(m_tokens, method_out_file)
    return ms_tokenized
Example #5
def test_code_to_sentences(self):
    sentences = code_to_subtokenized_sentences.code_to_cubert_sentences(
        self._CODE, self.tokenizer, self.subword_text_encoder)
    self.assertEqual([self._WORDPIECE_SUBTOKENS], sentences)
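
This test relies on fixtures (self.tokenizer, self.subword_text_encoder, self._CODE, self._WORDPIECE_SUBTOKENS) created elsewhere in the test case. A hedged sketch of how the two tokenizers could be constructed in setUp, mirroring Example #2; the vocabulary path is a placeholder, and the expected wordpiece subtokens depend entirely on that vocabulary, so they are left to the real fixture.

from absl.testing import absltest

from tensor2tensor.data_generators import text_encoder

from cubert import python_tokenizer


class CodeToSubtokenizedSentencesTest(absltest.TestCase):

    def setUp(self):
        super().setUp()
        # First-pass tokenizer for Python source.
        self.tokenizer = python_tokenizer.PythonTokenizer()
        # Placeholder path; a real test points at an actual vocabulary file.
        self.subword_text_encoder = text_encoder.SubwordTextEncoder(
            'testdata/vocab.txt')
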