Example #1
def _cat_and_remove(tgt_filename, num_workers):
    with file_io.open(tgt_filename, 'w') as writer:
        for idx in range(num_workers):
            src_filename = tgt_filename + str(idx)
            with file_io.open(src_filename, 'r') as reader:
                PathManager.copyfileobj(reader, writer)
            PathManager.rm(src_filename)
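
A minimal usage sketch of the shard-merge pattern above (file names and worker count are hypothetical; only `file_io`, `PathManager`, and the `_cat_and_remove` helper shown here are assumed):

# each worker is assumed to have written its own shard: out.jsonl0 ... out.jsonl3
num_workers = 4
for idx in range(num_workers):
    with file_io.open('out.jsonl' + str(idx), 'w') as writer:
        print(f'record from worker {idx}', file=writer)

# concatenate the shards into out.jsonl and delete them afterwards
_cat_and_remove('out.jsonl', num_workers)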
Example #2
def _save(self, f, kv_iterator):
    if isinstance(f, str):
        PathManager.mkdir(os.path.dirname(f))
        with file_io.open(f, "w") as fd:
            return self.save(fd)
    for k, v in kv_iterator:
        print(json_io.json_dumps([k, v]), file=f)
Example #3
    def docstring_tokens_fn(filename,
                            dest_filename,
                            idx,
                            start=0,
                            end=-1,
                            *args):
        """code => raw_ast"""
        kwargs = args[0][0]  # canot feed dict parameters in multi-processing

        dest_filename = dest_filename + str(idx)
        PathManager.mkdir(os.path.dirname(dest_filename))
        with file_io.open(filename,
                          "r") as reader, file_io.open(dest_filename,
                                                       'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                docstring_tokens = json_io.json_loads(line)
                if docstring_tokens:
                    docstring_tokens = [
                        token for token in docstring_tokens \
                        if not (re.match(r'[\-|\*|\=|\~]{2,}', token) or re.match(r'<.*?>', token))
                    ]
                    if not all(
                            str.isascii(token) for token in docstring_tokens):
                        docstring_tokens = None
                    if (docstring_tokens is
                            None) or not (3 < len(docstring_tokens) <= 50):
                        docstring_tokens = None
                else:
                    docstring_tokens = None
                print(json_io.json_dumps(docstring_tokens), file=writer)
                line = safe_readline(reader)
Example #4
def save_lang_dict():
    src_file = PathManager.expanduser("~/clcdsa/astbert/data-mmap/lang.jsonl")
    lang_dict = Dictionary.load(src_file)
    tgt_file = os.path.join(args['preprocess']['destdir'], 'lang.jsonl')
    PathManager.mkdir(os.path.dirname(tgt_file))
    lang_dict.save(tgt_file)
    return lang_dict
Example #5
def main():
    from dataset.py150 import (RAW_DIR, ATTRIBUTES_DIR, )
    from ncc.utils.path_manager import PathManager

    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', type=str, default=RAW_DIR)
    parser.add_argument('--output_dir', type=str, default=ATTRIBUTES_DIR)

    parser.add_argument('--valid_p', type=float, default=0.2)
    parser.add_argument('--max_path_length', type=int, default=8)
    parser.add_argument('--max_path_width', type=int, default=2)
    parser.add_argument('--use_method_name', type=bool, default=True)
    parser.add_argument('--use_nums', type=bool, default=True)
    parser.add_argument('--n_jobs', type=int, default=multiprocessing.cpu_count())
    parser.add_argument('--seed', type=int, default=239)
    args = parser.parse_args()

    np.random.seed(args.seed)

    data_dir = Path(args.data_dir)
    trains = __collect_asts(data_dir / 'python100k_train.json')
    evals = __collect_asts(data_dir / 'python50k_eval.json')

    train, valid = sklearn_model_selection.train_test_split(
        trains,
        test_size=args.valid_p,
    )
    test = evals

    output_dir = Path(args.output_dir)
    PathManager.mkdir(output_dir)
    for split_name, split in zip(('train', 'valid', 'test'), (train, valid, test)):
        output_file = output_dir / f'{split_name}.method_path'
        __collect_all_and_save(split, args, output_file)
Example #6
    def code_tokens_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        """code => raw_ast"""
        kwargs = args[0][0]  # canot feed dict parameters in multi-processing

        dest_filename = dest_filename + str(idx)
        PathManager.mkdir(os.path.dirname(dest_filename))
        with file_io.open(filename,
                          "r") as reader, file_io.open(dest_filename,
                                                       'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                code_tokens = json_io.json_loads(line)
                if code_tokens:
                    # filter comments in code_tokens, e.g. //***\n /* */\n
                    code_tokens = [token for token in code_tokens
                                   if not (str.startswith(token, '//') or str.startswith(token, '#') or \
                                           (str.startswith(token, '/*') and str.endswith(token, '*/')))
                                   ]

                    if not all(str.isascii(token) for token in code_tokens):
                        code_tokens = None
                    if code_tokens is None or len(code_tokens) < 1:
                        code_tokens = None
                else:
                    code_tokens = None

                print(json_io.json_dumps(code_tokens), file=writer)
                line = safe_readline(reader)
Example #7
    def make_all(lang, vocab):
        for l in os.listdir(args['preprocess']['trainpref'].split('*')[0]):
            # copy shared dict into each languages
            out_dir = os.path.join(args['preprocess']['destdir'], l)
            PathManager.mkdir(out_dir)
            dst_dict = os.path.join(out_dir, f'{lang}.dict.jsonl')
            PathManager.copy(dict_path(lang), dst_dict)

            if args['preprocess']['trainpref']:
                out_file = os.path.join(out_dir, f"train.{lang}")
                make_dataset(vocab,
                             args['preprocess']['trainpref'].replace('*', l),
                             "train",
                             lang,
                             out_file=out_file,
                             num_workers=args['preprocess']['workers'])
            if args['preprocess']['validpref']:
                out_file = os.path.join(out_dir, f"valid.{lang}")
                make_dataset(vocab,
                             args['preprocess']['validpref'].replace('*', l),
                             'valid',
                             lang,
                             out_file=out_file,
                             num_workers=args['preprocess']['workers'])
            if args['preprocess']['testpref']:
                out_file = os.path.join(out_dir, f"test.{lang}")
                make_dataset(vocab,
                             args['preprocess']['testpref'].replace('*', l),
                             'test',
                             lang,
                             out_file=out_file,
                             num_workers=args['preprocess']['workers'])
Example #8
def make_dataset(vocab, aux_dict, input_prefix, output_prefix, lang, max_path_num, num_workers=1):
    if args['preprocess']['dataset_impl'] == "raw":
        raise NotImplementedError
    else:
        in_file = file_name(input_prefix, lang)
        out_file = dest_path(output_prefix, lang)
        PathManager.mkdir(os.path.dirname(out_file))
        make_binary_dataset(vocab, aux_dict, in_file, out_file, lang, max_path_num, num_workers)
Example #9
def _concate(_tgt_filename, num_workers, tgt_filename):
    src_filenames = [
        _tgt_filename + str(idx) for idx in range(num_workers)
    ]
    with file_io.open(tgt_filename, 'w') as writer:
        for _src_fl in src_filenames:
            with file_io.open(_src_fl, 'r') as reader:
                shutil.copyfileobj(reader, writer)
            PathManager.rm(_src_fl)
Example #10
def recursive_expanduser(obj):
    if isinstance(obj, dict):
        for key, value in obj.items():
            obj[key] = recursive_expanduser(value)
    elif isinstance(obj, str) and obj.startswith('~/'):
        obj = PathManager.expanduser(obj)
    elif isinstance(obj, list):
        for i, val in enumerate(obj):
            if isinstance(val, str) and val.startswith('~/'):
                obj[i] = PathManager.expanduser(val)
    return obj
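
A small illustration of the recursion above on a nested config (values are hypothetical; only the `recursive_expanduser` function defined in this example is assumed):

config = {
    'destdir': '~/clcdsa/flatten',         # string starting with '~/' -> expanded via PathManager.expanduser
    'langs': ['~/clcdsa/java', 'python'],  # list entries starting with '~/' are expanded, others kept
    'workers': 8,                          # non-string values are returned unchanged
}
config = recursive_expanduser(config)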
Example #11
def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
    if args['preprocess']['dataset_impl'] == "raw":
        in_file = file_name(input_prefix, lang)
        out_dir = args['preprocess']['destdir']
        PathManager.mkdir(out_dir)
        LOGGER.info('Copying {} into {}'.format(in_file, out_dir))
        shutil.copy(src=in_file, dst=args['preprocess']['destdir'])
    else:
        in_file = file_name(input_prefix, lang)
        out_file = dest_path(output_prefix, lang)
        PathManager.mkdir(os.path.dirname(out_file))
        make_binary_dataset(vocab, in_file, out_file, num_workers)
Example #12
def main(args):
    # task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = TransformersDictionary.from_pretrained('microsoft/codebert-base', do_lower_case=False)

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src and tgt formats simultaneously
    def parse_source_input(code):
        code_tokens = vocab.tokenize(code)
        # truncating
        code_tokens = code_tokens[:config.MAX_SOURCE_LENGTH - 2]
        source_tokens = [vocab.cls_token] + code_tokens + [vocab.sep_token]
        source_ids = vocab.convert_tokens_to_ids(source_tokens)
        source_size = len(source_tokens)
        source_mask = [1] * source_size
        padding_length = config.MAX_SOURCE_LENGTH - len(source_ids)
        source_ids += [vocab.pad()] * padding_length
        source_mask += [0] * padding_length
        return [source_ids, source_mask, source_size]

    def parse_target_input(code):
        target_tokens = vocab.tokenize(code)[:config.MAX_TARGET_LENGTH - 2]
        target_tokens = [vocab.cls_token] + target_tokens + [vocab.sep_token]
        target_ids = vocab.convert_tokens_to_ids(target_tokens)
        target_size = len(target_ids)
        target_mask = [1] * target_size
        padding_length = config.MAX_TARGET_LENGTH - len(target_ids)
        target_ids += [vocab.pad_token_id] * padding_length
        target_mask += [0] * padding_length
        return [target_ids, target_mask, target_size]

    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess']['tgt_lang']
    for lang, mode in itertools.product([src_lang, tgt_lang], MODES):
        src_file = args['preprocess'][f'{mode}pref'].replace('*', lang) + ".code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.pkl")
        PathManager.mkdir(os.path.dirname(dst_file))
        with file_io.open(src_file, 'r') as reader:
            keys = ['code', 'src_tokens', 'src_masks', 'src_sizes', 'tgt_tokens', 'tgt_masks', 'tgt_sizes']
            data = {key: [] for key in keys}
            for line in reader:
                src_code = json_io.json_loads(line)
                # src_code = SPACE_SPLITTER.sub(" ", line)
                # source_ids, source_mask
                src_line = parse_source_input(src_code)
                # target_ids, target_mask
                tgt_line = parse_target_input(src_code)
                for key, src in zip(keys, [src_code] + src_line + tgt_line):
                    data[key].append(src)
            file_io.open(dst_file, mode='wb', data=data)
Example #13
    def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            # TODO: parse the json into a txt file, one traversal per line; this should be parallelized.
            """
            Because only one thread is allowed to write the output file, we use multi-processing
            to process the data, merge the workers' results into a block, and then dump that block.
            """
            def _func(line):
                line = py150_util.separate_dps(
                    json_io.json_loads(line.strip()),
                    args['preprocess']['n_ctx'])
                line = [
                    py150_util.get_dfs(ast) + [ext] for ast, ext in line
                    if len(ast) > 1
                ]
                # line = [json.dumps([py150_utils.get_dfs(ast), ext]) for ast, ext in line if len(ast) > 1]
                return line

            with PPool() as thread_pool:
                with file_io.open(file_name(input_prefix, lang), 'r') as f, \
                    file_io.open(dest_path(output_prefix, lang), 'w') as fout:

                    def _write(result):
                        for res in itertools.chain(*result):
                            print(json_io.json_dumps(res), file=fout)

                    batch_data = []
                    for line in f:
                        batch_data.append(line)
                        if len(batch_data) >= MAX_BATCH_SIZE:
                            result = thread_pool.feed(_func,
                                                      batch_data,
                                                      one_params=True)
                            _write(result)
                            del batch_data
                            batch_data = []

                    if len(batch_data) > 0:
                        result = thread_pool.feed(_func,
                                                  batch_data,
                                                  one_params=True)
                        _write(result)
                        del batch_data
        else:
            if lang == 'code_types':
                in_file = file_name(input_prefix, 'ast')
            else:
                in_file = file_name(input_prefix, lang)
            out_file = dest_path(output_prefix, lang)
            PathManager.mkdir(os.path.dirname(out_file))
            make_binary_dataset(vocab, in_file, out_file, lang, num_workers)
Example #14
def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
    if args['preprocess']['dataset_impl'] == "raw":
        raise NotImplementedError
    else:
        languages = [
            os.path.basename(d)
            for d in PathManager.ls(os.path.dirname(input_prefix))
        ]
        for l in languages:
            in_file = file_name(input_prefix, lang)
            in_file = str.replace(in_file, '*', l)
            out_file = dest_path(os.path.join(l, output_prefix), lang)
            PathManager.mkdir(os.path.dirname(out_file))
            make_binary_dataset(vocab, in_file, out_file, num_workers)
Example #15
    def docstring_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        """code => raw_ast"""
        kwargs = args[0][0]  # canot feed dict parameters in multi-processing

        dest_filename = dest_filename + str(idx)
        PathManager.mkdir(os.path.dirname(dest_filename))
        with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                docstring = json_io.json_loads(line)
                print(json_io.json_dumps(docstring), file=writer)
                line = safe_readline(reader)
Example #16
def cast_code(raw_code_file, refined_code_file, dst_file):
    with file_io.open(raw_code_file, 'r') as raw_reader:
        raw_codes = {}
        for line in raw_reader:
            raw_code = line
            raw_code = raw_code[raw_code.find('def '):]
            func_name = raw_code[:raw_code.find('(')][4:].strip()
            raw_codes[func_name] = line.rstrip('\n')

    PathManager.mkdir(os.path.dirname(dst_file))
    with file_io.open(refined_code_file,
                      'r') as refined_reader, file_io.open(dst_file,
                                                           'w') as writer:
        for line in refined_reader:
            func_name = line[line.find('def '):].split()[1]
            raw_code = raw_codes[func_name]
            print(raw_code, file=writer)
Example #17
def load_raw_data(data_dir, load_keys):
    raw_data = {}
    for mode in constants.MODES:
        for key in load_keys:
            mode_data_dir = os.path.join(data_dir, key, '{}.*'.format(mode))
            jsonl_gz_files = PathManager.ls(mode_data_dir)
            raw_data[mode] = list(load_jsonl_gzs(jsonl_gz_files))
    return raw_data
Example #18
def flatten_attrs(raw_file, flatten_dir, lang, attrs):
    def _get_file_info(filename):
        """get mode and file index from file name"""
        filename = os.path.split(filename)[-1]
        mode = filename[:str.rfind(filename, '.jsonl')]
        return mode

    mode = _get_file_info(raw_file)
    attr_writers = {}
    for attr in attrs:
        attr_file = os.path.join(flatten_dir, lang, f'{mode}.{attr}')
        PathManager.mkdir(os.path.dirname(attr_file))
        attr_writers[attr] = file_io.open(attr_file, 'w')
    print('raw_file: ', raw_file)
    with file_io.open(raw_file, 'r') as reader:
        for line in reader:
            code_snippet = json_io.json_loads(line)
            for attr, info in code_snippet.items():
                if attr in attr_writers:
                    print(json_io.json_dumps(info), file=attr_writers[attr])
Example #19
def flatten(raw_dir, lang, mode, flatten_dir, attrs, num_cores):
    """flatten attributes of raw data"""
    LOGGER.info('Cast attributes({}) of {}-{} dataset'.format(
        attrs, lang, mode))
    with Pool(num_cores) as mpool:
        result = [
            mpool.apply_async(flatten_attrs,
                              (raw_file, flatten_dir, lang, mode, set(attrs)))
            for raw_file in PathManager.ls(
                os.path.join(raw_dir, lang, mode, '*.jsonl.gz'))
        ]
        result = [res.get() for res in result]
Example #20
def flatten_attrs(raw_file, flatten_dir, lang, mode, attrs):
    def _get_file_info(filename):
        """get mode and file index from file name"""
        filename = os.path.split(filename)[-1]
        filename = filename[:str.rfind(filename, '.jsonl.gz')]
        _, _, idx = filename.split('_')
        return idx

    idx = _get_file_info(raw_file)
    attr_writers = {}
    for attr in attrs:
        attr_dir = os.path.join(flatten_dir, lang, mode, attr)
        PathManager.mkdir(attr_dir)
        attr_file = os.path.join(attr_dir, '{}.jsonl'.format(idx))
        attr_writers[attr] = file_io.open(attr_file, 'w')

    with file_io.open(raw_file, 'r') as reader:
        for line in reader:
            code_snippet = json_io.json_loads(line)
            for attr, info in code_snippet.items():
                if attr in attr_writers:
                    print(json_io.json_dumps(info), file=attr_writers[attr])
Example #21
def merge_attr_files(flatten_dir, lang, mode, attrs):
    """shell cat"""
    def _merge_files(src_files, tgt_file):
        with file_io.open(tgt_file, 'w') as writer:
            for src_fl in src_files:
                with file_io.open(src_fl, 'r') as reader:
                    shutil.copyfileobj(reader, writer)

    def _get_file_idx(filename):
        filename = os.path.split(filename)[-1]
        idx = int(filename[:str.rfind(filename, '.json')])
        return idx

    for attr in attrs:
        attr_files = PathManager.ls(
            os.path.join(flatten_dir, lang, mode, attr, '*.jsonl'))
        attr_files = sorted(attr_files, key=_get_file_idx)
        assert len(attr_files) > 0, RuntimeError(
            'Attribute({}) files do not exist.'.format(attr))
        dest_file = os.path.join(flatten_dir, lang, '{}.{}'.format(mode, attr))
        _merge_files(attr_files, dest_file)
    PathManager.rm(os.path.join(flatten_dir, lang, mode))
Example #22
def initialize_from_checkpoint(args, model):
    if args['checkpoint'].get('init_checkpoint', False) and PathManager.exists(args['checkpoint']['init_checkpoint']):
        with open(args['checkpoint']['init_checkpoint'], 'rb') as reader:
            state = torch.load(reader)
            pretrained_params = state['model']
            del state
        init_params = model.state_dict()
        for module_name, module_param in pretrained_params.items():
            if module_name in init_params:
                if init_params[module_name].data.size() == module_param.data.size():
                    init_params[module_name].data.copy_(module_param.data)
                else:
                    # embedding
                    token_num = module_param.size(0)
                    # init token embedding
                    init_params[module_name].data[:token_num, ...].copy_(module_param.data[:token_num, ...])
        LOGGER.info(f"Restore parameters from {args['checkpoint']['init_checkpoint']}.")
    else:
        LOGGER.info(f"{args['checkpoint']['init_checkpoint']} does not exist.")
Example #23
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'],
                                              args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = spm.SentencePieceProcessor()
    vocab.load(SPM_VOCAB_FILE)

    def save_dict():
        src_file = os.path.join(os.path.dirname(SPM_VOCAB_FILE), 'dict.txt')
        tgt_file = os.path.join(args['preprocess']['destdir'], 'dict.jsonl')
        # Dictionary.text_to_jsonl(src_file, tgt_file)
        vocab = Dictionary()
        with file_io.open(src_file, 'r') as reader:
            for line in reader:
                token, num = line.strip().split()
                vocab.add_symbol(token, eval(num))
        vocab.save(tgt_file)
        return vocab

    dictionary = save_dict()

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src and tgt formats simultaneously
    lang = args['preprocess']['lang']
    for mode in MODES:
        file = f"{args['preprocess'][f'{mode}pref']}.code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang,
                                f"{mode}.code")
        PathManager.mkdir(os.path.dirname(dst_file))
        dataset = indexed_dataset.make_builder(f"{dst_file}_tokens.mmap",
                                               impl='mmap',
                                               vocab_size=len(vocab))
        PathManager.mkdir(os.path.dirname(dst_file))
        with file_io.open(file, 'r') as reader:
            data = {'code': []}
            for line in reader:
                line = json_io.json_loads(line)
                code = SPACE_SPLITTER.sub(" ", line)
                data['code'].append(code)
                code_tokens = vocab.encode(code, out_type=str)
                code_tokens = torch.IntTensor(
                    [dictionary.index(token) for token in code_tokens])
                # code_tokens = torch.IntTensor(vocab.encode_as_ids(code))
                dataset.add_item(code_tokens)
            dataset.finalize(f"{dst_file}_tokens.idx")
            # proj indices
            # cp id
            data['proj_indices'] = [1] * len(data['code'])
            file_io.open(f"{dst_file}.pkl", mode='wb', data=data)
Example #24
def xfg(src_dir, languages, dst_dir):
    xfg_src_files = PathManager.ls(os.path.join(src_dir, "kernels_ir", '*.ll'))

    filenames = []
    ir_data = []
    for filename in xfg_src_files:
        filenames.append(os.path.basename(filename)[:-3])
        with open(filename, 'r') as reader:
            lines = reader.read().splitlines()
        ir_data.append(lines)
    # convert list to dict
    filenames = {name: idx for idx, name in enumerate(filenames)}

    processed_data, _ = inst2vec_preprocess.preprocess(ir_data)
    processed_data, _ = task_utils.inline_struct_types_txt(
        processed_data, ir_data)
    processed_data = task_utils.abstract_statements_from_identifiers_txt(
        processed_data)

    for idx, lines in enumerate(processed_data):
        processed_data[idx] = [
            line for line in lines if
            not re.match(r'((?:<label>:)?(<LABEL>):|; <label>:<LABEL>)', line)
        ]

    for lang in languages:
        raw_file = os.path.join(src_dir, f'{lang}.csv')
        # read raw csv file to load corresponding benchmarks
        data_frame = pd.read_csv(raw_file)
        benchmarks = data_frame["benchmark"].values.tolist()
        datasets = data_frame["dataset"].values.tolist()
        del data_frame

        # write
        dst_file = os.path.join(dst_dir, lang, f'train.xfg')
        with open(dst_file, 'w') as writer:
            for idx, (bm, ds) in enumerate(zip(benchmarks, datasets)):
                if bm[:3] == "npb":
                    bm += f'_{ds}'
                xfg = processed_data[filenames[bm]]
                print(json_io.json_dumps(xfg), file=writer)
Example #25
    def __init__(self, SO_FILE, LANGUAGE, to_lower=False, operators_file=None):
        self.parser = Parser()
        if not PathManager.exists(SO_FILE):
            LOGGER.warning(
                f"{SO_FILE} does not exist, automatically downloading TreeSitter parse file {LANGUAGE}.so."
            )
            from ncc.hub.tree_sitter.download import download
            download(LANGUAGE)

        if LANGUAGE == 'csharp':
            LANGUAGE = 'c_sharp'
        self.parser.set_language(Language(SO_FILE, LANGUAGE))
        self.LANGUAGE = LANGUAGE
        self.to_lower = to_lower

        if operators_file is None:
            operators_file = os.path.join(os.path.dirname(__file__),
                                          'operators.json')
        with open(operators_file, 'r') as reader:
            self.operators = json_io.json_load(reader)
Example #26
def load_model_ensemble_and_task(filenames,
                                 arg_overrides=None,
                                 task=None,
                                 strict=True,
                                 suffix=''):
    from ncc import tasks

    ensemble = []
    for filename in filenames:
        filename = filename.replace(".pt", suffix + ".pt")
        if not PathManager.exists(filename):
            raise IOError("Model file not found: {}".format(filename))
        state = load_checkpoint_to_cpu(filename, arg_overrides)

        args = state["args"]
        if task is None:
            task = tasks.setup_task(args)

        # build model for ensemble
        model = task.build_model(args)
        model.load_state_dict(state["model"], strict=strict, args=args)
        ensemble.append(model)
    return ensemble, args, task
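
A hedged usage sketch for the loader above (the checkpoint path is hypothetical; it assumes the saved `args` are sufficient for `ncc` to set up the task and build the model, as the function itself does):

# load a single checkpoint; args and task are restored from the saved state
models, args, task = load_model_ensemble_and_task(['checkpoints/checkpoint_best.pt'])
model = models[0]
model.eval()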
Example #27
def single_main(args, init_distributed=False):
    assert args['dataset']['max_tokens'] is not None or args['dataset']['max_sentences'] is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'
    metrics.reset()

    # 0. Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args['common']['cpu']:
        torch.cuda.set_device(args['distributed_training']['device_id'])
    set_seed.set_seed(args['common']['seed'])
    if init_distributed:
        args['distributed_training'][
            'distributed_rank'] = distributed_utils.distributed_init(args)

    # Verify checkpoint directory
    if distributed_utils.is_master(args):
        save_dir = args['checkpoint']['save_dir']
        checkpoint_utils.verify_checkpoint_directory(save_dir)
        PathManager.rm(os.path.join(
            save_dir, '*.pt'))  # this code will remove pre-trained models

    # 1. Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # 2. Load valid dataset (we load training data below, based on the latest checkpoint)
    task.load_dataset(args['dataset']['valid_subset'], combine=False, epoch=1)

    # 3. Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    LOGGER.info(model)
    LOGGER.info('model {}, criterion {}'.format(args['model']['arch'],
                                                criterion.__class__.__name__))
    LOGGER.info('num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # 4. Build trainer
    trainer = Trainer(args, task, model, criterion)
    LOGGER.info('training on {} GPUs'.format(
        args['distributed_training']['distributed_world_size']))
    LOGGER.info(
        'max tokens per GPU = {} and max sentences per GPU = {}'.format(
            args['dataset']['max_tokens'],
            args['dataset']['max_sentences'],
        ))

    # 5. Load the latest checkpoint if one is available and restore the corresponding train iterator
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args,
                                                              trainer,
                                                              combine=False)

    # 6. Train until the learning rate gets too small
    max_epoch = args['optimization']['max_epoch'] or math.inf
    max_update = args['optimization']['max_update'] or math.inf
    lr = trainer.get_lr()
    train_meter = meters.StopwatchMeter()
    train_meter.start()
    valid_subsets = args['dataset']['valid_subset'].split(',')
    while (lr > args['optimization']['min_lr']
           and epoch_itr.next_epoch_idx <= max_epoch
           and trainer.get_num_updates() < max_update):
        # train for one epoch
        train(args, trainer, task, epoch_itr)

        if not args['dataset']['disable_validation'] and epoch_itr.epoch % args[
                'dataset']['validate_interval'] == 0:
            valid_losses = validate(args, trainer, task, epoch_itr,
                                    valid_subsets)
        else:
            valid_losses = [None]

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args['checkpoint']['save_interval'] == 0:
            checkpoint_utils.save_checkpoint(args, trainer, epoch_itr,
                                             valid_losses[0])

        # early stop
        if should_stop_early(args, valid_losses[0]):
            LOGGER.info(
                'early stop since valid performance hasn\'t improved for last {} runs'
                .format(args['checkpoint']['patience']))
            break

        epoch_itr = trainer.get_train_iterator(
            epoch_itr.next_epoch_idx,
            combine=False,  # TODO to be checked
            # sharded data: get train iterator for next epoch
            load_dataset=(os.pathsep in args['task']['data']),
        )

    train_meter.stop()
    LOGGER.info('done training in {:.1f} seconds'.format(train_meter.sum))
Example #28
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'],
                                              args['preprocess']['task']))
    os.makedirs(args['preprocess']['destdir'], exist_ok=True)

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'],
                             ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'],
                             ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'],
                            file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    if not args['preprocess']['srcdict'] and os.path.exists(
            dict_path(args['preprocess']['source_lang'])):
        raise FileExistsError(dict_path(args['preprocess']['source_lang']))

    if args['preprocess']['only_train']:
        LOGGER.info('Generating dictionaries with Train data files.')
    else:
        LOGGER.info(
            'Generating dictionaries with Train/Validation data files.')

    if args['preprocess']['srcdict']:
        src_dict = task.load_dictionary(args['preprocess']['srcdict'])
    else:
        assert args['preprocess'][
            'trainpref'], "--trainpref must be set if --srcdict is not specified"

        filenames = PathManager.ls(
            train_path(args['preprocess']['source_lang']))
        if not args['preprocess']['only_train']:
            filenames.extend(
                PathManager.ls(valid_path(args['preprocess']['source_lang'])))
        src_dict = task.build_dictionary(
            filenames,
            tokenize_func=tokenization.json_tokenizer,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['threshold'],
            nwords=args['preprocess']['nwordssrc'],
            padding_factor=args['preprocess']['padding_factor'],
            bos=None,
            eos=None,
        )

    src_dict.save(dict_path(
        args['preprocess']['source_lang']))  # save spm dict to ncc.dictionary
    # copy shared dict into each language's data directory
    for d in PathManager.ls(os.path.dirname(args['preprocess']['trainpref'])):
        lang = os.path.basename(d)
        src_dict.save(
            os.path.join(args['preprocess']['destdir'], lang,
                         f"{args['preprocess']['source_lang']}.dict.jsonl"))

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab, input_file, output_file, num_workers):
        """make binary dataset"""
        # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # save un-recorded tokens

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the file into chunks
        # with multi-processing, workers 1..N-1 handle the 2nd through the last chunk
        # e.g. 1.txt with 10 processes: p0 gets offsets 0-99, p1 gets 100-199, ...
        offsets = file_io.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, vocab, prefix,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()
        # process the first chunk in the main process; if multi-processing is unavailable, this covers the whole file
        # p0 -> 0,end
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(
            ds_file,
            impl=args['preprocess']['dataset_impl'],
            vocab_size=len(vocab))

        def consumer(data, _):
            ds.add_item(data)

        merge_result(
            Binarizer.binarize_seperate(
                input_file,
                vocab,
                consumer,
                tokenize=string2tokens,
                offset=0,
                end=offsets[1],
                append_eos=False,
            ))
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))

        LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            # attr,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        ))

    def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            raise NotImplementedError
        else:
            languages = [
                os.path.basename(d)
                for d in PathManager.ls(os.path.dirname(input_prefix))
            ]
            for l in languages:
                in_file = file_name(input_prefix, lang)
                in_file = str.replace(in_file, '*', l)
                out_file = dest_path(os.path.join(l, output_prefix), lang)
                PathManager.mkdir(os.path.dirname(out_file))
                make_binary_dataset(vocab, in_file, out_file, num_workers)

    def make_all(lang, vocab):
        if args['preprocess']['trainpref']:
            make_dataset(vocab,
                         args['preprocess']['trainpref'],
                         "train",
                         lang,
                         num_workers=args['preprocess']['workers'])
        if args['preprocess']['validpref']:
            for k, validpref in enumerate(
                    args['preprocess']['validpref'].split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(vocab,
                             validpref,
                             outprefix,
                             lang,
                             num_workers=args['preprocess']['workers'])
        if args['preprocess']['testpref']:
            for k, testpref in enumerate(
                    args['preprocess']['testpref'].split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(vocab,
                             testpref,
                             outprefix,
                             lang,
                             num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], src_dict)
Example #29
def save_checkpoint(args, trainer, epoch_itr, val_loss):
    from ncc import meters
    from ncc.utils import distributed_utils
    prev_best = getattr(save_checkpoint, "best", val_loss)
    if val_loss is not None:
        best_function = max if args['checkpoint'][
            'maximize_best_checkpoint_metric'] else min
        save_checkpoint.best = best_function(val_loss, prev_best)

    if args['checkpoint']['no_save'] or not distributed_utils.is_master(args):
        return

    def is_better(a, b):
        return a >= b if args['checkpoint'][
            'maximize_best_checkpoint_metric'] else a <= b

    write_timer = meters.StopwatchMeter()
    write_timer.start()

    epoch = epoch_itr.epoch
    end_of_epoch = epoch_itr.end_of_epoch()
    updates = trainer.get_num_updates()

    checkpoint_conds = collections.OrderedDict()
    checkpoint_conds["checkpoint{}.pt".format(epoch)] = (
        end_of_epoch and not args['checkpoint']['no_epoch_checkpoints']
        and epoch % args['checkpoint']['save_interval'] == 0)
    checkpoint_conds["checkpoint_{}_{}.pt".format(epoch, updates)] = (
        not end_of_epoch and args['checkpoint']['save_interval_updates'] > 0
        and updates % args['checkpoint']['save_interval_updates'] == 0)
    checkpoint_conds["checkpoint_best.pt"] = val_loss is not None and (
        not hasattr(save_checkpoint, "best")
        or is_better(val_loss, save_checkpoint.best))
    if val_loss is not None and args['checkpoint']['keep_best_checkpoints'] > 0:
        checkpoint_conds["checkpoint.best_{}_{:.2f}.pt".format(
            args['checkpoint']['best_checkpoint_metric'],
            val_loss)] = (not hasattr(save_checkpoint, "best")
                          or is_better(val_loss, save_checkpoint.best))
    checkpoint_conds[
        "checkpoint_last.pt"] = not args['checkpoint']['no_last_checkpoints']

    extra_state = {
        "train_iterator": epoch_itr.state_dict(),
        "val_loss": val_loss
    }
    if hasattr(save_checkpoint, "best"):
        extra_state.update({"best": save_checkpoint.best})

    checkpoints = [
        os.path.join(args['checkpoint']['save_dir'], fn)
        for fn, cond in checkpoint_conds.items() if cond
    ]
    if len(checkpoints) > 0:
        trainer.save_checkpoint(checkpoints[0], extra_state)
        for cp in checkpoints[1:]:
            PathManager.copy(checkpoints[0], cp)

        write_timer.stop()
        LOGGER.info(
            "saved checkpoint {} (epoch {} @ {} updates, score {}) (writing took {:.6f} seconds)"
            .format(checkpoints[0], epoch, updates, val_loss, write_timer.sum))

    if not end_of_epoch and args['checkpoint']['keep_interval_updates'] > 0:
        # remove old checkpoints; checkpoints are sorted in descending order
        checkpoints = checkpoint_paths(args['checkpoint']['save_dir'],
                                       pattern=r"checkpoint_\d+_(\d+)\.pt")
        for old_chk in checkpoints[
                args['checkpoint']['keep_interval_updates']:]:
            if os.path.lexists(old_chk):
                os.remove(old_chk)

    if args['checkpoint']['keep_last_epochs'] > 0:
        # remove old epoch checkpoints; checkpoints are sorted in descending order
        checkpoints = checkpoint_paths(args['checkpoint']['save_dir'],
                                       pattern=r"checkpoint(\d+)\.pt")
        for old_chk in checkpoints[args['checkpoint']['keep_last_epochs']:]:
            if os.path.lexists(old_chk):
                os.remove(old_chk)

    if args['checkpoint']['keep_best_checkpoints'] > 0:
        # only keep the best N checkpoints according to validation metric
        checkpoints = checkpoint_paths(
            args['checkpoint']['save_dir'],
            pattern=r"checkpoint\.best_{}_(\d+\.?\d*)\.pt".format(
                args['checkpoint']['best_checkpoint_metric']))
        if not args['checkpoint']['maximize_best_checkpoint_metric']:
            checkpoints = checkpoints[::-1]
        for old_chk in checkpoints[
                args['checkpoint']['keep_best_checkpoints']:]:
            if os.path.lexists(old_chk):
                os.remove(old_chk)
Example #30
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])

    vocab = spm.SentencePieceProcessor()
    vocab.load(SPM_VOCAB_FILE)

    def save_token_dict():
        src_file = os.path.join(os.path.dirname(SPM_VOCAB_FILE), 'dict.txt')
        tgt_file = os.path.join(args['preprocess']['destdir'], 'dict.jsonl')
        # Dictionary.text_to_jsonl(src_file, tgt_file)
        vocab = Dictionary()
        with file_io.open(src_file, 'r') as reader:
            for line in reader:
                token, num = line.strip().split()
                vocab.add_symbol(token, eval(num))
        vocab.save(tgt_file)
        return vocab

    token_dict = save_token_dict()

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src and tgt formats simultaneously
    num_workers = args['preprocess']['workers']
    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess']['tgt_lang']

    for lang, mode in itertools.product([src_lang, tgt_lang], MODES):
        src_file = args['preprocess'][f'{mode}pref'].replace('*', lang) + ".code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code_tokens")
        PathManager.mkdir(os.path.dirname(dst_file))

        offsets = find_offsets(src_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(dst_file, worker_id)
                pool.apply_async(
                    binarize,
                    (
                        args,
                        src_file,
                        prefix,
                        vocab,
                        token_dict,
                        offsets[worker_id],
                        offsets[worker_id + 1]
                    ),
                )
            pool.close()

        ds = indexed_dataset.make_builder(f"{dst_file}.mmap", impl='mmap', vocab_size=len(vocab))
        end = offsets[1]

        with file_io.open(src_file, 'r') as reader:
            reader.seek(0)
            line = file_io.safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                line = json_io.json_loads(line)
                code_tokens = vocab.encode(line, out_type=str)
                code_tokens = torch.IntTensor([token_dict.index(token) for token in code_tokens])
                ds.add_item(code_tokens)
                line = reader.readline()

        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(dst_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize(f"{dst_file}.idx")