def doWork(model_name="16K-full", model_directory="../bpe/",
           dataset_name='methodDataset.jsonl'):
    """Build a method dataset by BPE-encoding all source files of a corpus.

    Loads the trained BPE model, constructs an encoder from its vocabulary
    and merge pairs, prepares a fresh method dataset in the model's data
    source directory, processes every source file found there, and finishes
    (flushes/closes) the dataset.

    Parameters
    ----------
    model_name : str
        Name of the trained BPE model to load (default: "16K-full").
    model_directory : str
        Directory containing the BPE model files (default: "../bpe/").
    dataset_name : str
        File name of the JSONL dataset to create (default: 'methodDataset.jsonl').
    """
    model = BPEModel(model_name, model_directory)
    model.load_hparams()

    # The corpus location is taken from the model's hyper-parameters.
    dataset_directory = model.get_data_source_path()

    # TODO: anonymous inner classes won't be recognized as
    #       ClassDeclaration; a ClassCreator body may also contain
    #       method declarations.

    model_vocabulary = model.load_tokens()
    model_bpe_data = model.load_bpe_pairs()

    encoder = SimpleBPEEncoder(model_vocabulary, model_bpe_data)

    method_dataset = MethodDataset(dataset_name=dataset_name)
    method_dataset.prepareNewDataset(dataset_directory)

    # Crawl the directory and process each source file into the dataset.
    process_all_source_files(dataset_directory, encoder, method_dataset)

    method_dataset.finish()
def doWork(bpe_model_name, bpe_directory, input_dataset_path,
           dataset_directory='D:\\Downloads\\Big-Code-excerpt\\',
           dataset_name='NextLineTranslationDataset.jsonl'):
    """Build a next-line translation dataset from an existing input dataset.

    Loads the named BPE model, builds an encoder from its vocabulary and
    merge pairs, prepares a fresh translation dataset, processes every line
    of the input dataset into it, and finishes (flushes/closes) the dataset.

    Parameters
    ----------
    bpe_model_name : str
        Name of the trained BPE model to load.
    bpe_directory : str
        Directory containing the BPE model files.
    input_dataset_path : str
        Path of the dataset whose lines are translated/encoded.
    dataset_directory : str
        Output directory for the new dataset (previously hard-coded; now a
        defaulted parameter so callers can override it).
    dataset_name : str
        File name of the JSONL translation dataset to create.
    """
    model = BPEModel(bpe_model_name, bpe_directory)
    model.load_hparams()

    model_vocabulary = model.load_tokens()
    model_bpe_data = model.load_bpe_pairs()

    # For encoding the class names.
    bpe_encoder = SimpleBPEEncoder(model_vocabulary, model_bpe_data)

    translation_dataset = TranslationDataset(dataset_name=dataset_name)
    translation_dataset.prepareNewDataset(dataset_directory=dataset_directory)

    process_all_lines_in_next_line_dataset(input_dataset_path, bpe_encoder, translation_dataset)

    translation_dataset.finish()
# NOTE(review): the statements up to the first `pass` appear to be the tail
# of an enclosing function whose `def` lies outside this excerpt — the names
# `model`, `time_at_start`, `emitted_tokens`, `get_emitted_bpe_list`,
# `run_me` and the `time_after_*` timestamps are presumably bound earlier in
# that file. Indentation is reconstructed; confirm against the full source.

# Timestamp taken after the BPE dictionary has been built.
time_after_buildingDict = datetime.datetime.now()
print("time after building dictionary: " + str(time_after_buildingDict))

# Persist the trained model artifacts: the emitted BPE merge pairs and the
# emitted token vocabulary.
model.save_bpe_pairs(get_emitted_bpe_list())
model.save_tokens(emitted_tokens)

# Final timing report. Intermediate timestamps may be None when the
# corresponding phase did not run, hence the guards.
print("===[ The End ]===")
print("time at start: " + str(time_at_start))
if time_after_walkingfiles is not None:
    print("time after walking files: " + str(time_after_walkingfiles))
if time_after_aggregating_statistics is not None:
    print("time after aggregating words: " + str(time_after_aggregating_statistics))
if time_after_splitting_leastFrequentWords is not None:
    print("time after splitting least frequent words: " + str(time_after_splitting_leastFrequentWords))
print("time after building dictionary: " + str(time_after_buildingDict))
pass

if __name__ == '__main__':
    # Known model configurations:
    # "1K-datapoint", "10K-excerpt", "16K-excerpt", "50K-full", "100K-full"
    # model = BPEModel("1K-datapoint")
    model = BPEModel("16K-full")
    # model = BPEModel("16K-excerpt")
    model.load_hparams()
    run_me(model)
@author: Maxim Gansert, Mindscan
'''
import numpy as np
import tensorflow as tf

from com.github.c2nes.javalang import tokenizer as tokenizer
from de.mindscan.fluentgenesis.bpe.bpe_model import BPEModel
from de.mindscan.fluentgenesis.bpe.bpe_encoder_decoder import SimpleBPEEncoder

# Special token ids.
# NOTE(review): PAD and UNK share id 0 — presumably unknown tokens are
# mapped onto the padding id; confirm this is intentional for the model.
PAD = 0
UNK = 0

# Module-level setup (runs at import time): load the trained 16K BPE model
# and its hyper-parameters from the sibling bpe directory.
bpemodel = BPEModel("16K-full", "../bpe/")
bpemodel.load_hparams()

# Location of the corpus the model was trained on.
dataset_directory = bpemodel.get_data_source_path()

bpemodel_vocabulary = bpemodel.load_tokens()
bpemodel_bpe_data = bpemodel.load_bpe_pairs()

# Reserve id 0 in the vocabulary for the padding symbol.
bpemodel_vocabulary['<PAD>'] = 0

encoder = SimpleBPEEncoder(bpemodel_vocabulary, bpemodel_bpe_data)

# Vocabulary size including the added '<PAD>' entry.
MODEL_VOCABULARY_LENGTH = len(bpemodel_vocabulary)

# Relative path to the trained prediction-model checkpoint.
checkpoint = '../../../../../data/checkpoints/20200516_1750/predict_m'
SOFTWARE.

@author: Maxim Gansert, Mindscan
'''
import tensorflow as tf

## BPE ENCODER PART
from de.mindscan.fluentgenesis.bpe.bpe_model import BPEModel
from de.mindscan.fluentgenesis.bpe.bpe_encoder_decoder import SimpleBPEEncoder

# Reserved special-symbol ids appended to the 16K BPE vocabulary.
SYMBOL_PAD = 0
SYMBOL_START = 16273
SYMBOL_EOS = 16274

# Module-level setup (runs at import time): load the trained 16K BPE model,
# its hyper-parameters, token vocabulary and merge pairs.
bpemodel = BPEModel("16K-full", "../bpe/")
bpemodel.load_hparams()
bpemodel_vocabulary = bpemodel.load_tokens()
bpemodel_bpe_data = bpemodel.load_bpe_pairs()

# padding
bpemodel_vocabulary['<PAD>'] = SYMBOL_PAD
# start symbol
bpemodel_vocabulary['<START>'] = SYMBOL_START
# end of sentence
bpemodel_vocabulary['<EOS>'] = SYMBOL_EOS

bpe_encoder = SimpleBPEEncoder(bpemodel_vocabulary, bpemodel_bpe_data)

# Vocabulary size including the three added special symbols.
MODEL_VOCABULARY_LENGTH = len(bpemodel_vocabulary)