Code example #1
def doWork():
    model = BPEModel("16K-full", "../bpe/")
    model.load_hparams()
    
    # dataset_directory = 'D:\\Downloads\\Big-Code-full\\'
    dataset_directory = model.get_data_source_path()
    
    # only one class in compilation unit
    # some_source_filename = 'java_projects\\Algorithms\\src\\org\\rekdev\\trees\\BinaryTreeNode.java'
    
    # has multiple classes parallel in one compilation unit
    # some_source_filename = 'java_projects\\CSSMin\\CSSMin.java'
    
    # nested classes
    # some_source_filename = 'java_projects\\cvs-plugin\\src\\main\\java\\hudson\\scm\\CVSChangeLogSet.java'

    # inner and/or anonymous classes
    # TODO: anonymous inner classes won't be recognized as ClassDeclaration / a ClassCreator body can also contain method declarations
    # some_source_filename = 'java_projects\\emf\\plugins\\org.eclipse.emf.codegen\\src\\org\\eclipse\\emf\\codegen\\CodeGen.java'
    
    model_vocabulary = model.load_tokens()
    model_bpe_data = model.load_bpe_pairs()
    
    encoder = SimpleBPEEncoder(model_vocabulary, model_bpe_data)
    
    method_dataset = MethodDataset(dataset_name='methodDataset.jsonl')
    method_dataset.prepareNewDataset(dataset_directory)
    
    # now crawl the directory and process each file...
    process_all_source_files(dataset_directory, encoder, method_dataset)
    
    method_dataset.finish()
    pass
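
# A minimal sketch (an assumption, not the original implementation) of the
# directory crawl mentioned above: walk the dataset directory and yield every
# Java source file; process_all_source_files would then parse each file,
# BPE-encode its methods and append them to the method dataset. The name
# iter_java_source_files is hypothetical.
import os

def iter_java_source_files(dataset_directory):
    for root, _dirs, files in os.walk(dataset_directory):
        for filename in files:
            if filename.endswith('.java'):
                yield os.path.join(root, filename)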
Code example #2
def doWork(bpe_model_name, bpe_directory, input_dataset_path):
    model = BPEModel(bpe_model_name, bpe_directory)
    model.load_hparams()

    model_vocabulary = model.load_tokens()
    model_bpe_data = model.load_bpe_pairs()

    # for encoding the class names.
    bpe_encoder = SimpleBPEEncoder(model_vocabulary, model_bpe_data)

    # TODO: extract these constants to the args/argparser.
    translation_dataset = TranslationDataset(
        dataset_name='NextLineTranslationDataset.jsonl')
    translation_dataset.prepareNewDataset(
        dataset_directory='D:\\Downloads\\Big-Code-excerpt\\')

    process_all_lines_in_next_line_dataset(input_dataset_path, bpe_encoder,
                                           translation_dataset)

    translation_dataset.finish()

    pass
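
# A possible command-line entry point (a sketch, not part of the original
# source) addressing the TODO above: it exposes the hard-coded values as
# arguments. The flag names and defaults are assumptions that mirror the
# values used elsewhere in these examples.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--bpe-model-name', default='16K-full')
    parser.add_argument('--bpe-directory', default='../bpe/')
    parser.add_argument('--input-dataset-path', required=True)
    args = parser.parse_args()

    doWork(args.bpe_model_name, args.bpe_directory, args.input_dataset_path)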
Code example #3
    time_after_buildingDict = datetime.datetime.now()
    print("time after building dictionary: " + str(time_after_buildingDict))

    model.save_bpe_pairs(get_emitted_bpe_list())
    model.save_tokens(emitted_tokens)

    print("===[ The End ]===")
    print("time at start: " + str(time_at_start))
    if time_after_walkingfiles is not None:
        print("time after walking files: " + str(time_after_walkingfiles))
    if time_after_aggregating_statistics is not None:
        print("time after aggregating words: " +
              str(time_after_aggregating_statistics))
    if time_after_splitting_leastFrequentWords is not None:
        print("time after splitting least frequent words: " +
              str(time_after_splitting_leastFrequentWords))
    print("time after building dictionary: " + str(time_after_buildingDict))

    pass


if __name__ == '__main__':
    # "1K-datapoint", "10K-excerpt", "16K-excerpt", "50K-full", "100K-full"
    # model = BPEModel("1K-datapoint")
    model = BPEModel("16K-full")
    # model = BPEModel("16K-excerpt")
    model.load_hparams()

    run_me(model)
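
# A small helper sketch (an assumption, not in the original): the summary
# above prints absolute timestamps; printing the deltas between consecutive
# stages makes the cost of each stage easier to read. Subtracting two
# datetime values yields a timedelta, so str() gives a readable duration.
def print_elapsed(label, start, end):
    if start is not None and end is not None:
        print(label + ": " + str(end - start))

# e.g.: print_elapsed("walking files took", time_at_start, time_after_walkingfiles)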
Code example #4
@author: Maxim Gansert, Mindscan
'''

import numpy as np
import tensorflow as tf

from com.github.c2nes.javalang import tokenizer as tokenizer

from de.mindscan.fluentgenesis.bpe.bpe_model import BPEModel
from de.mindscan.fluentgenesis.bpe.bpe_encoder_decoder import SimpleBPEEncoder

PAD = 0
UNK = 0

bpemodel = BPEModel("16K-full", "../bpe/")
bpemodel.load_hparams()

dataset_directory = bpemodel.get_data_source_path()

bpemodel_vocabulary = bpemodel.load_tokens()
bpemodel_bpe_data = bpemodel.load_bpe_pairs()

bpemodel_vocabulary['<PAD>'] = 0

encoder = SimpleBPEEncoder(bpemodel_vocabulary, bpemodel_bpe_data)

MODEL_VOCABULARY_LENGTH = len(bpemodel_vocabulary)

checkpoint = '../../../../../data/checkpoints/20200516_1750/predict_m'
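
# A hypothetical helper (not in the original excerpt): right-pad a batch of
# BPE-encoded token-id sequences with the PAD id defined above, producing a
# rectangular np.int32 array that a restored checkpoint could consume.
def pad_batch(sequences, max_len):
    batch = np.full((len(sequences), max_len), PAD, dtype=np.int32)
    for i, sequence in enumerate(sequences):
        length = min(len(sequence), max_len)
        batch[i, :length] = sequence[:length]
    return batch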
Code example #5
SOFTWARE.

@author: Maxim Gansert, Mindscan
'''
import tensorflow as tf


## BPE ENCODER PART
from de.mindscan.fluentgenesis.bpe.bpe_model import BPEModel
from de.mindscan.fluentgenesis.bpe.bpe_encoder_decoder import SimpleBPEEncoder

SYMBOL_PAD = 0
SYMBOL_START = 16273
SYMBOL_EOS = 16274

bpemodel = BPEModel("16K-full", "../bpe/")
bpemodel.load_hparams()
bpemodel_vocabulary = bpemodel.load_tokens()
bpemodel_bpe_data = bpemodel.load_bpe_pairs()

# padding
bpemodel_vocabulary['<PAD>'] = SYMBOL_PAD
# start symbol
bpemodel_vocabulary['<START>'] = SYMBOL_START
# end of sentence
bpemodel_vocabulary['<EOS>'] = SYMBOL_EOS

bpe_encoder = SimpleBPEEncoder(bpemodel_vocabulary, bpemodel_bpe_data)

MODEL_VOCABULARY_LENGTH = len(bpemodel_vocabulary)
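
# A minimal sketch (an assumption, not part of the original source): frame an
# encoded sequence with the <START>/<EOS> ids defined above and right-pad it
# with <PAD> to a fixed length, the usual preparation for a translation-style
# model. The function name frame_and_pad is hypothetical.
def frame_and_pad(token_ids, max_len):
    framed = [SYMBOL_START] + list(token_ids) + [SYMBOL_EOS]
    if len(framed) > max_len:
        # truncate but keep the end-of-sentence marker in place
        framed = framed[:max_len - 1] + [SYMBOL_EOS]
    return framed + [SYMBOL_PAD] * (max_len - len(framed))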