Code example #1
File: spm_tokenize.py  Project: CGCL-codes/naturalcc
def tokenization(
    in_file,
    out_file,
    lang,
    attr,
    start=0,
    end=-1,
):
    with file_io.open(in_file, "r") as reader, \
            file_io.open(out_file, 'w') as writer:
        reader.seek(start)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line).strip()

            if lang == 'python' and attr == 'code':
                tokens = python_code_tokenize(line)
                line = ' '.join(tokens).strip()

            if attr == 'code':
                line = normalize_program(line, remove_eol=True)
            else:
                line = normalize_docstring(line,
                                           remove_eol=True,
                                           remove_url=True)

            line = line.strip()
            tokens = tokenizer.encode_as_pieces(line)
            print(json_io.json_dumps(tokens), file=writer)
            line = file_io.safe_readline(reader)
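
A driver sketch, not part of spm_tokenize.py: it assumes `tokenization` above is importable and that `start`/`end` are line-aligned byte offsets (first offset 0, last offset equal to the file size), so several workers process disjoint chunks and write their own shards.

import os
from multiprocessing import Pool

def run_parallel(in_file, out_file, lang, attr, num_workers=4):
    # compute line-aligned chunk boundaries: offsets[0] == 0, offsets[-1] == file size
    size = os.path.getsize(in_file)
    chunk = size // num_workers
    offsets = [0]
    with open(in_file, 'rb') as f:
        for i in range(1, num_workers):
            f.seek(chunk * i)
            f.readline()  # advance to the start of the next full line
            offsets.append(f.tell())
    offsets.append(size)
    # one output shard per worker; worker i reads [offsets[i], offsets[i + 1])
    jobs = [(in_file, f"{out_file}{i}", lang, attr, offsets[i], offsets[i + 1])
            for i in range(num_workers)]
    with Pool(num_workers) as pool:
        pool.starmap(tokenization, jobs)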
Code example #2
    def binary_ast_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        kwargs = args[0][0]  # cannot pass dict parameters through multi-processing

        dest_filename = dest_filename + str(idx)
        with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                ast = json_io.json_loads(line)
                if ast:
                    try:
                        ast = util_ast.value2children(ast)
                        ast = util_ast.remove_root_with_uni_child(ast)
                        root_idx = util_ast.get_root_idx(ast)
                        ast = util_ast.delete_node_with_uni_child(ast, idx=root_idx)
                        root_idx = util_ast.get_root_idx(ast)
                        bin_ast = util_ast.binarize_tree(ast, idx=root_idx)  # to binary ast tree
                        root_idx = util_ast.get_root_idx(ast)
                        bin_ast = util_ast.reset_indices(bin_ast, root_idx)  # reset node indices
                        bin_ast = util_ast.pad_leaf_node(bin_ast, MAX_SUB_TOKEN_LEN)
                    except RecursionError:
                        LOGGER.error('RecursionError, ignore this tree')
                        bin_ast = None
                    except Exception as err:
                        LOGGER.error(err)
                        bin_ast = None
                else:
                    bin_ast = None
                print(json_io.json_dumps(bin_ast), file=writer)
                line = safe_readline(reader)
Code example #3
    def docstring_tokens_fn(filename,
                            dest_filename,
                            idx,
                            start=0,
                            end=-1,
                            *args):
        """code => raw_ast"""
        kwargs = args[0][0]  # canot feed dict parameters in multi-processing

        dest_filename = dest_filename + str(idx)
        PathManager.mkdir(os.path.dirname(dest_filename))
        with file_io.open(filename, "r") as reader, \
                file_io.open(dest_filename, 'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                docstring_tokens = json_io.json_loads(line)
                if docstring_tokens:
                    docstring_tokens = [
                        token for token in docstring_tokens \
                        if not (re.match(r'[\-|\*|\=|\~]{2,}', token) or re.match(r'<.*?>', token))
                    ]
                    if not all(
                            str.isascii(token) for token in docstring_tokens):
                        docstring_tokens = None
                    if docstring_tokens is None or not (3 < len(docstring_tokens) <= 50):
                        docstring_tokens = None
                else:
                    docstring_tokens = None
                print(json_io.json_dumps(docstring_tokens), file=writer)
                line = safe_readline(reader)
Code example #4
    def raw_ast_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        """code => raw_ast"""
        kwargs = args[0][0]  # cannot pass dict parameters through multi-processing
        lang = kwargs.get('lang')
        so_dir = kwargs.get('so_dir')

        so_filename = os.path.join(os.path.expanduser(so_dir),
                                   '{}.so'.format(lang))
        parser = TreeSitterASTParser(so_filename, lang)
        dest_filename = dest_filename + str(idx)
        with file_io.open(filename, "r") as reader, \
                file_io.open(dest_filename, 'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                code = json_io.json_loads(line)
                if code:
                    raw_ast = parser.parse_raw_ast(code)
                else:
                    raw_ast = None
                print(json_io.json_dumps(raw_ast), file=writer)
                line = safe_readline(reader)
Code example #5
 def _cat_and_remove(tgt_filename, num_workers):
     with file_io.open(tgt_filename, 'w') as writer:
         for idx in range(num_workers):
             src_filename = tgt_filename + str(idx)
             with file_io.open(src_filename, 'r') as reader:
                 PathManager.copyfileobj(reader, writer)
             PathManager.rm(src_filename)
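
A usage note with assumed filenames: if worker i wrote its shard to tgt_filename + str(i), this call merges the shards back in worker order and deletes them.

# hypothetical call; 'train.ast0' .. 'train.ast3' must already exist
_cat_and_remove('train.ast', num_workers=4)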
Code example #6
    def path_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        kwargs = args[0][0]  # cannot pass dict parameters through multi-processing

        dest_filename_terminals, dest_filename = dest_filename + '.terminals' + str(idx), dest_filename + str(idx)
        with file_io.open(filename, "r") as reader, file_io.open(dest_filename_terminals, 'w') as writer_terminals, \
            file_io.open(dest_filename, 'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                ast = json_io.json_loads(line)
                if ast:
                    paths = util_path.ast_to_path(ast, MAX_PATH=PATH_NUM)
                    if paths is None:
                        paths = [[None] * 3] * PATH_NUM
                    else:
                        # duplicate existing paths until there are PATH_NUM of them
                        if len(paths) < PATH_NUM:
                            supply_ids = list(range(len(paths))) * ((PATH_NUM - len(paths)) // len(paths)) \
                                         + random.sample(range(len(paths)), ((PATH_NUM - len(paths)) % len(paths)))
                            paths.extend([paths[idx] for idx in supply_ids])
                    random.shuffle(paths)
                    assert len(paths) == PATH_NUM
                    head, body, tail = zip(*paths)
                else:
                    head, body, tail = [None] * PATH_NUM, [None] * PATH_NUM, [None] * PATH_NUM
                # terminals
                for terminal in itertools.chain(*zip(head, tail)):
                    print(json_io.json_dumps(terminal), file=writer_terminals)
                # path
                for b in body:
                    print(json_io.json_dumps(b), file=writer)
                line = safe_readline(reader)
Code example #7
    def code_tokens_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        """code => raw_ast"""
        kwargs = args[0][0]  # canot feed dict parameters in multi-processing

        dest_filename = dest_filename + str(idx)
        PathManager.mkdir(os.path.dirname(dest_filename))
        with file_io.open(filename, "r") as reader, \
                file_io.open(dest_filename, 'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                code_tokens = json_io.json_loads(line)
                if code_tokens:
                    # filter comments out of code_tokens, e.g. //***\n or /* ... */
                    code_tokens = [token for token in code_tokens
                                   if not (str.startswith(token, '//') or str.startswith(token, '#') or \
                                           (str.startswith(token, '/*') and str.endswith(token, '*/')))
                                   ]

                    if not all(str.isascii(token) for token in code_tokens):
                        code_tokens = None
                    if code_tokens is None or len(code_tokens) < 1:
                        code_tokens = None
                else:
                    code_tokens = None

                print(json_io.json_dumps(code_tokens), file=writer)
                line = safe_readline(reader)
Code example #8
def flatten_attrs(raw_file, flatten_dir, mode, attrs, start=0, end=-1):
    attr_writers = {}
    for attr in attrs:
        attr_file = os.path.join(flatten_dir, '{}.{}'.format(mode, attr))
        os.makedirs(os.path.dirname(attr_file), exist_ok=True)
        attr_writers[attr] = file_io.open(attr_file, 'w')

    with file_io.open(raw_file, 'r') as reader:
        reader.seek(start)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            filename = os.path.join(os.path.dirname(raw_file), line.strip())
            # tokens, types = parse_file(filename)
            try:
                tokens, types = parse_file(filename)
                # replace None with [PAD] for type dictionary build
                types = [PAD if t is None else t for t in types]
            except Exception as err:
                # print(err)
                # print(f'parsing {filename} error')
                line = file_io.safe_readline(reader)
                continue
            print(json_io.json_dumps(tokens), file=attr_writers['code_tokens'])
            print(json_io.json_dumps(types), file=attr_writers['code_types'])
            line = file_io.safe_readline(reader)
Code example #9
 def _concate(_tgt_filename, num_workers, tgt_filename):
     src_filenames = [
         _tgt_filename + str(idx) for idx in range(num_workers)
     ]
     with file_io.open(tgt_filename, 'w') as writer:
         for _src_fl in src_filenames:
             with file_io.open(_src_fl, 'r') as reader:
                 shutil.copyfileobj(reader, writer)
             PathManager.rm(_src_fl)
Code example #10
 def find_func_offsets(filename, offsets):
     func_filename = filename[:str.rfind(filename, '.')] + '.func_name'
     count = 1
     func_offsets = [0 for _ in range(len(offsets))]
     with file_io.open(filename, "r", encoding="utf-8") as f, \
         file_io.open(func_filename, "r", encoding="utf-8") as func:
         line, _ = f.readline(), func.readline()
         while line:
             if f.tell() == offsets[count]:
                 func_offsets[count] = func.tell()
                 count += 1
             line, _ = f.readline(), func.readline()
     return func_offsets
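
A hedged usage sketch: it assumes the fairseq-style convention in which `offsets` holds line-aligned byte offsets into `filename`, with offsets[0] == 0 and the last entry equal to the file size; under that assumption the function returns the matching offsets into the sibling '.func_name' file so both files can be split at the same logical lines.

# hypothetical call: `offsets` would come from whatever chunking helper split 'train.code'
func_offsets = find_func_offsets('train.code', offsets)
# func_offsets[i] is the position in 'train.func_name' aligned with offsets[i]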
Code example #11
    def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            # TODO: parse json to a txt file, one traversal per line; please help parallelize it.
            """
            Because only one thread is allowed to write the file, we use multi-processing to process the data,
            merge the per-worker results into a block, and then dump that block.
            """
            def _func(line):
                line = py150_util.separate_dps(
                    json_io.json_loads(line.strip()),
                    args['preprocess']['n_ctx'])
                line = [
                    py150_util.get_dfs(ast) + [ext] for ast, ext in line
                    if len(ast) > 1
                ]
                # line = [json.dumps([py150_utils.get_dfs(ast), ext]) for ast, ext in line if len(ast) > 1]
                return line

            with PPool() as thread_pool:
                with file_io.open(file_name(input_prefix, lang), 'r') as f, \
                    file_io.open(dest_path(output_prefix, lang), 'w') as fout:

                    def _write(result):
                        for res in itertools.chain(*result):
                            print(json_io.json_dumps(res), file=fout)

                    batch_data = []
                    for line in f:
                        batch_data.append(line)
                        if len(batch_data) >= MAX_BATCH_SIZE:
                            result = thread_pool.feed(_func,
                                                      batch_data,
                                                      one_params=True)
                            _write(result)
                            del batch_data
                            batch_data = []

                    if len(batch_data) > 0:
                        result = thread_pool.feed(_func,
                                                  batch_data,
                                                  one_params=True)
                        _write(result)
                        del batch_data
        else:
            if lang == 'code_types':
                in_file = file_name(input_prefix, 'ast')
            else:
                in_file = file_name(input_prefix, lang)
            out_file = dest_path(output_prefix, lang)
            PathManager.mkdir(os.path.dirname(out_file))
            make_binary_dataset(vocab, in_file, out_file, lang, num_workers)
Code example #12
File: preprocess.py  Project: CGCL-codes/naturalcc
def main(args):
    # task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = TransformersDictionary.from_pretrained('microsoft/codebert-base', do_lower_case=False)

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    def parse_source_input(code):
        code_tokens = vocab.tokenize(code)
        # truncating
        code_tokens = code_tokens[:config.MAX_SOURCE_LENGTH - 2]
        source_tokens = [vocab.cls_token] + code_tokens + [vocab.sep_token]
        source_ids = vocab.convert_tokens_to_ids(source_tokens)
        source_size = len(source_tokens)
        source_mask = [1] * source_size
        padding_length = config.MAX_SOURCE_LENGTH - len(source_ids)
        source_ids += [vocab.pad()] * padding_length
        source_mask += [0] * padding_length
        return [source_ids, source_mask, source_size]

    def parse_target_input(code):
        target_tokens = vocab.tokenize(code)[:config.MAX_TARGET_LENGTH - 2]
        target_tokens = [vocab.cls_token] + target_tokens + [vocab.sep_token]
        target_ids = vocab.convert_tokens_to_ids(target_tokens)
        target_size = len(target_ids)
        target_mask = [1] * target_size
        padding_length = config.MAX_TARGET_LENGTH - len(target_ids)
        target_ids += [vocab.pad_token_id] * padding_length
        target_mask += [0] * padding_length
        return [target_ids, target_mask, target_size]

    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess']['tgt_lang']
    for lang, mode in itertools.product([src_lang, tgt_lang], MODES):
        src_file = args['preprocess'][f'{mode}pref'].replace('*', lang) + ".code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.pkl")
        PathManager.mkdir(os.path.dirname(dst_file))
        with file_io.open(src_file, 'r') as reader:
            keys = ['code', 'src_tokens', 'src_masks', 'src_sizes', 'tgt_tokens', 'tgt_masks', 'tgt_sizes']
            data = {key: [] for key in keys}
            for line in reader:
                src_code = json_io.json_loads(line)
                # src_code = SPACE_SPLITTER.sub(" ", line)
                # source_ids, source_mask
                src_line = parse_source_input(src_code)
                # target_ids, target_mask
                tgt_line = parse_target_input(src_code)
                for key, src in zip(keys, [src_code] + src_line + tgt_line):
                    data[key].append(src)
            file_io.open(dst_file, mode='wb', data=data)
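
For orientation, the same source-side encode/truncate/pad pattern sketched with the Hugging Face tokenizer directly; this is illustrative only: the original goes through naturalcc's TransformersDictionary wrapper, and MAX_SOURCE_LENGTH here is an assumed constant rather than config.MAX_SOURCE_LENGTH.

from transformers import AutoTokenizer

MAX_SOURCE_LENGTH = 256  # assumed value
tok = AutoTokenizer.from_pretrained('microsoft/codebert-base')

def encode_source(code):
    # <s> code_tokens </s>, truncated to MAX_SOURCE_LENGTH, then padded along with an attention mask
    ids = tok.encode(code, truncation=True, max_length=MAX_SOURCE_LENGTH)
    mask = [1] * len(ids) + [0] * (MAX_SOURCE_LENGTH - len(ids))
    ids = ids + [tok.pad_token_id] * (MAX_SOURCE_LENGTH - len(ids))
    return ids, mask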
Code example #13
File: preprocess.py  Project: CGCL-codes/naturalcc
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'],
                                              args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = spm.SentencePieceProcessor()
    vocab.load(SPM_VOCAB_FILE)

    def save_dict():
        src_file = os.path.join(os.path.dirname(SPM_VOCAB_FILE), 'dict.txt')
        tgt_file = os.path.join(args['preprocess']['destdir'], 'dict.jsonl')
        # Dictionary.text_to_jsonl(src_file, tgt_file)
        vocab = Dictionary()
        with file_io.open(src_file, 'r') as reader:
            for line in reader:
                token, num = line.strip().split()
                vocab.add_symbol(token, eval(num))
        vocab.save(tgt_file)
        return vocab

    dictionary = save_dict()

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    lang = args['preprocess']['lang']
    for mode in MODES:
        file = f"{args['preprocess'][f'{mode}pref']}.code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang,
                                f"{mode}.code")
        PathManager.mkdir(os.path.dirname(dst_file))
        dataset = indexed_dataset.make_builder(f"{dst_file}_tokens.mmap",
                                               impl='mmap',
                                               vocab_size=len(vocab))
        PathManager.mkdir(os.path.dirname(dst_file))
        with file_io.open(file, 'r') as reader:
            data = {'code': []}
            for line in reader:
                line = json_io.json_loads(line)
                code = SPACE_SPLITTER.sub(" ", line)
                data['code'].append(code)
                code_tokens = vocab.encode(code, out_type=str)
                code_tokens = torch.IntTensor(
                    [dictionary.index(token) for token in code_tokens])
                # code_tokens = torch.IntTensor(vocab.encode_as_ids(code))
                dataset.add_item(code_tokens)
            dataset.finalize(f"{dst_file}_tokens.idx")
            # proj indices
            # cp id
            data['proj_indices'] = [1] * len(data['code'])
            file_io.open(f"{dst_file}.pkl", mode='wb', data=data)
Code example #14
    def docstring_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        """code => raw_ast"""
        kwargs = args[0][0]  # canot feed dict parameters in multi-processing

        dest_filename = dest_filename + str(idx)
        PathManager.mkdir(os.path.dirname(dest_filename))
        with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                docstring = json_io.json_loads(line)
                print(json_io.json_dumps(docstring), file=writer)
                line = safe_readline(reader)
Code example #15
    def ast_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        kwargs = args[0][0]  # cannot pass dict parameters through multi-processing
        parser = CodeParser(SO_FILE=os.path.join(kwargs['so_dir'], f"{kwargs['lang']}.so"), LANGUAGE=kwargs['lang'])

        dest_filename = f"{dest_filename}{idx}"
        with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                line = json_io.json_loads(line)
                ast = parser.parse_raw_ast(code=line, MAX_AST_SIZE=99999999999, append_index=True)
                print(json_io.json_dumps(ast), file=writer)
                line = safe_readline(reader)
Code example #16
 def _add_tok_to_dictionary_single_worker(
     filename: str,
     tokenize: Any,
     eos_word: Optional[str],
     worker_id: int = 0,
     num_workers: int = 1,
 ) -> Counter:
     counter = Counter()
     with file_io.open(filename, "r") as f:
         size = os.fstat(f.fileno()).st_size
         chunk_size = size // num_workers
         offset = worker_id * chunk_size
         end = offset + chunk_size
         f.seek(offset)
         if offset > 0:
             safe_readline(f)  # drop first incomplete line
         line = f.readline()
         while line:
             tokens = tokenize(line)
             counter.update(tokens)
             if eos_word is not None:
                 counter.update([eos_word])
             if f.tell() > end:
                 break
             line = f.readline()
     return counter
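
A sketch of a possible driver, not from the source: it assumes _add_tok_to_dictionary_single_worker is importable as a module-level function; each worker returns one Counter, and the Counters are merged afterwards.

from collections import Counter
from multiprocessing import Pool

def count_tokens(filename, tokenize, eos_word, num_workers=4):
    # `tokenize` must be picklable (a module-level function) for multiprocessing
    with Pool(num_workers) as pool:
        counters = pool.starmap(
            _add_tok_to_dictionary_single_worker,
            [(filename, tokenize, eos_word, wid, num_workers)
             for wid in range(num_workers)])
    merged = Counter()
    for c in counters:
        merged.update(c)
    return merged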
Code example #17
    def func_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        kwargs = args[0][0]  # cannot pass dict parameters through multi-processing

        dest_filename = dest_filename + str(idx)
        with file_io.open(filename, "r") as reader, \
                file_io.open(dest_filename, 'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                func_name = json_io.json_loads(line)
                func = func_name.split('.')[-1]
                print(json_io.json_dumps(func), file=writer)
                line = safe_readline(reader)
Code example #18
 def _save(self, f, kv_iterator):
     if isinstance(f, str):
         PathManager.mkdir(os.path.dirname(f))
         with file_io.open(f, "w") as fd:
             return self.save(fd)
     for k, v in kv_iterator:
         print(json_io.json_dumps([k, v]), file=f)
Code example #19
        def __init__(self, path):
            with file_io.open(path, 'rb') as stream:
                magic_test = stream.read(9)
                assert self._HDR_MAGIC == magic_test, (
                    'Index file doesn\'t match expected format. '
                    'Make sure that --dataset-impl is configured properly.')
                version = struct.unpack('<Q', stream.read(8))
                assert (1, ) == version

                dtype_code, = struct.unpack('<B', stream.read(1))
                self._dtype = dtypes[dtype_code]
                self._dtype_size = self._dtype().itemsize

                self._len = struct.unpack('<Q', stream.read(8))[0]
                offset = stream.tell()

            _warmup_mmap_file(path)

            self._bin_buffer_mmap = np.memmap(path, mode='r', order='C')
            self._bin_buffer = memoryview(self._bin_buffer_mmap)
            self._sizes = np.frombuffer(self._bin_buffer,
                                        dtype=np.int32,
                                        count=self._len,
                                        offset=offset)
            self._pointers = np.frombuffer(self._bin_buffer,
                                           dtype=np.int64,
                                           count=self._len,
                                           offset=offset + self._sizes.nbytes)
Code example #20
    def traversal_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        kwargs = args[0][0]  # cannot pass dict parameters through multi-processing

        dest_filename = dest_filename + str(idx)
        with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                ast = json_io.json_loads(line)
                if ast:
                    ast_traversal = util_traversal.get_dfs(ast)
                else:
                    ast_traversal = None
                print(json_io.json_dumps(ast_traversal), file=writer)
                line = safe_readline(reader)
Code example #21
    def path_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        kwargs = args[0][0]  # cannot pass dict parameters through multi-processing

        dest_filename = dest_filename + str(idx)
        with file_io.open(filename, "r") as reader, \
                file_io.open(dest_filename, 'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                ast = json_io.json_loads(line)
                if ast:
                    paths = util_path.ast_to_path(ast)
                    print(json_io.json_dumps(paths), file=writer)
                line = safe_readline(reader)
Code example #22
 def read_data(self, path):
     with file_io.open(index_file_path(path), mode='rb') as stream:
         magic_test = stream.read(8)
         assert self._HDR_MAGIC == magic_test, (
             'Index file doesn\'t match expected format. '
             'Make sure that --dataset-impl is configured properly.')
         buffer = stream.read()
         self._data = np.frombuffer(buffer, dtype=self._dtype)
Code example #23
def cast_code(raw_code_file, refined_code_file, dst_file):
    with file_io.open(raw_code_file, 'r') as raw_reader:
        raw_codes = {}
        for line in raw_reader:
            raw_code = line
            raw_code = raw_code[raw_code.find('def '):]
            func_name = raw_code[:raw_code.find('(')][4:].strip()
            raw_codes[func_name] = line.rstrip('\n')

    PathManager.mkdir(os.path.dirname(dst_file))
    with file_io.open(refined_code_file, 'r') as refined_reader, \
            file_io.open(dst_file, 'w') as writer:
        for line in refined_reader:
            func_name = line[line.find('def '):].split()[1]
            raw_code = raw_codes[func_name]
            print(raw_code, file=writer)
Code example #24
    def dfs_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        kwargs = args[0][0]  # cannot pass dict parameters through multi-processing

        dest_filename = f"{dest_filename}{idx}"
        with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                ast = json_io.json_loads(line)
                if ast is not None:
                    dfs, _ = ast_to_dfs(ast)
                else:
                    dfs = None
                print(json_io.json_dumps(dfs), file=writer)
                line = safe_readline(reader)
Code example #25
                def __enter__(self):
                    """for with open. this init method"""
                    self._file = file_io.open(path, 'wb')

                    self._file.write(cls._HDR_MAGIC)  # self-defined format
                    self._file.write(struct.pack('<Q', 1))  # version number, 8 bytes
                    self._file.write(struct.pack('<B', code(dtype)))  # data type code, 1 byte

                    return self
Code example #26
File: preprocess.py  Project: CGCL-codes/naturalcc
 def save_token_dict():
     src_file = os.path.join(os.path.dirname(SPM_VOCAB_FILE), 'dict.txt')
     tgt_file = os.path.join(args['preprocess']['destdir'], 'dict.jsonl')
     # Dictionary.text_to_jsonl(src_file, tgt_file)
     vocab = Dictionary()
     with file_io.open(src_file, 'r') as reader:
         for line in reader:
             token, num = line.strip().split()
             vocab.add_symbol(token, eval(num))
     vocab.save(tgt_file)
     return vocab
Code example #27
 def read_data(self, path, dictionary, tokenizer):
     with file_io.open(path, 'r', encoding='utf-8') as f:
         for line in f:
             self.lines.append(line.strip('\n'))
             tokens = dictionary.encode_line(
                 line, tokenizer, add_if_not_exist=False,
                 append_eos=self.append_eos, reverse_order=self.reverse_order,
             ).long()
             self.tokens_list.append(tokens)
             self.sizes.append(len(tokens))
     self.sizes = np.array(self.sizes)
Code example #28
    def sbtao_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        kwargs = args[0][0]  # cannot pass dict parameters through multi-processing

        dest_filename = dest_filename + str(idx)
        with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                ast = json_io.json_loads(line)
                if ast:
                    ast = util_ast.value2children(ast)
                    padded_ast = util_ast.pad_leaf_node(ast, MAX_SUB_TOKEN_LEN)
                    root_idx = util_ast.get_root_idx(padded_ast)
                    sbt = util_ast.build_sbtao_tree(padded_ast, idx=root_idx)
                else:
                    sbt = None
                print(json_io.json_dumps(sbt), file=writer)
                line = safe_readline(reader)
Code example #29
def flatten(raw_file, dst_dir, mode):
    """flatten attributes of raw data"""
    data_frame = pd.read_csv(raw_file)
    attrs = data_frame.columns.values.tolist()[1:-1]
    LOGGER.info('Cast attributes({}) of OpenCL-{} dataset'.format(attrs, lang))
    for attr in attrs:
        dst_file = os.path.join(dst_dir, f"{mode}.{attr}")
        data = getattr(data_frame, attr).values.tolist()
        with file_io.open(dst_file, 'w') as writer:
            for line in data:
                print(json_io.json_dumps(line), file=writer)
Code example #30
def flatten_attrs(raw_file, flatten_dir, lang, attrs):
    def _get_file_info(filename):
        """get mode and file index from file name"""
        filename = os.path.split(filename)[-1]
        mode = filename[:str.rfind(filename, '.jsonl')]
        return mode

    mode = _get_file_info(raw_file)
    attr_writers = {}
    for attr in attrs:
        attr_file = os.path.join(flatten_dir, lang, f'{mode}.{attr}')
        PathManager.mkdir(os.path.dirname(attr_file))
        attr_writers[attr] = file_io.open(attr_file, 'w')
    print('raw_file: ', raw_file)
    with file_io.open(raw_file, 'r') as reader:
        for line in reader:
            code_snippet = json_io.json_loads(line)
            for attr, info in code_snippet.items():
                if attr in attr_writers:
                    print(json_io.json_dumps(info), file=attr_writers[attr])