Example #1
def cli_main():
    import argparse
    parser = argparse.ArgumentParser(
        description=
        "Downloading/Decompressing CodeSearchNet dataset(s) or Tree-Sitter Library(ies)"
    )
    parser.add_argument("--yaml_file",
                        "-f",
                        type=str,
                        help="load {language}.yml for train",
                        default='config/csn_feng/ruby')
    parser.add_argument(
        '--out_file',
        '-o',
        type=str,
        help='output generated file',
        default=None,
    )
    args = parser.parse_args()
    yaml_file = os.path.join(os.path.dirname(__file__),
                             f"{args.yaml_file}.yml")
    out_file = None if args.out_file is None else recursive_expanduser(
        args.out_file)
    LOGGER.info('Load arguments in {}'.format(yaml_file))
    args = load_yaml(yaml_file)
    LOGGER.info(args)
    main(args, out_file)
Example #2
def cli_main():
    # modal_path = '~/.ncc/demo/summarization/neural_transformer/python_wan.pt'
    modal_path = '~/.ncc/demo/summarization/seq2seq/python_wan.pt'
    code = "def positional(max_positional_args):\n\tdef positional_decorator(wrapped):\n\t\t@functools.wraps(wrapped)\n\t\tdef positional_wrapper(*args, **kwargs):\n\t\t\tif (len(args) > max_positional_args):\n\t\t\t\tplural_s = ''\n\t\t\t\tif (max_positional_args != 1):\n\t\t\t\t\tplural_s = 's'\n\t\t\t\tmessage = ('%s()\ttakes\tat\tmost\t%d\tpositional\targument%s\t(%d\tgiven)' % (wrapped.__name__, max_positional_args, plural_s, len(args)))\n\t\t\t\tif (positional_parameters_enforcement == POSITIONAL_EXCEPTION):\n\t\t\t\t\traise TypeError(message)\n\t\t\t\telif (positional_parameters_enforcement == POSITIONAL_WARNING):\n\t\t\t\t\tlogger.warning(message)\n\t\t\t\telse:\n\t\t\t\t\tpass\n\t\t\treturn wrapped(*args, **kwargs)\n\t\treturn positional_wrapper\n\tif isinstance(max_positional_args, six.integer_types):\n\t\treturn positional_decorator\n\telse:\n\t\t(args, _, _, defaults) = inspect.getargspec(max_positional_args)\n\t\treturn positional((len(args) - len(defaults)))(max_positional_args)"
    # ground truth: "a decorator to declare that only the first n arguments my be positional ."

    # modal_path = '~/.ncc/demo/completion/seqrnn/py150.pt'
    # code = "body_content = self._serialize.body(parameters, 'ServicePrincipalCreateParameters')\nrequest = self._client.post(url, query_parameters)\nresponse = self._client.send( request, header_parameters, body_content, operation_config)"
    # ground truth: "(request, header_parameters, body_content, **operation_config)"

    import argparse
    parser = argparse.ArgumentParser(description="Command Interface")
    parser.add_argument("--model",
                        "-m",
                        type=str,
                        help="pytorch model path",
                        default=modal_path)
    parser.add_argument("--input",
                        "-i",
                        type=str,
                        help="model input",
                        default=code)
    args = parser.parse_args()
    args.model = os.path.expanduser(args.model)

    model_output = main(args.model, args.input)
    LOGGER.info(model_output)
Example #3
def load_langpair_dataset(
    data_path, split,
    src, src_dict,
    tgt, tgt_dict,
    dataset_impl,

    left_pad_source, left_pad_target,
    max_source_positions, max_target_positions,
    prepend_bos=False, load_alignments=False,
    truncate_source=False, append_source_id=False,
    truncate_target=False,
    append_eos_to_target=False,
    portion=None,
):
    src_path = os.path.join(data_path, '{}.{}'.format(split, src))
    src_dataset = _load_dataset(path=src_path, impl=dataset_impl, dict=src_dict)

    if portion is not None and split == 'train':
        LOGGER.info('set {}.{} portion to {}'.format(split, src, portion))
        src_dataset = PortionDataset(src_dataset, portion)

    tgt_path = os.path.join(data_path, '{}.{}'.format(split, tgt))
    tgt_dataset = _load_dataset(path=tgt_path, impl=dataset_impl, dict=tgt_dict)
    if truncate_target:
        LOGGER.info('truncate {}.{} to {}'.format(split, tgt, max_target_positions))
        tgt_dataset = TruncateDataset(tgt_dataset, max_target_positions)

    if prepend_bos:
        assert hasattr(src_dict, "bos_index") and hasattr(tgt_dict, "bos_index")
        src_dataset = PrependTokenDataset(src_dataset, src_dict.bos())
        if tgt_dataset is not None:
            tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos())

    eos = None
    if append_source_id:
        if tgt_dataset is not None:
            tgt_dataset = AppendTokenDataset(tgt_dataset, tgt_dict.index('[{}]'.format(tgt)))
        eos = tgt_dict.index('[{}]'.format(tgt))

    if portion is not None and split == 'train':
        LOGGER.info('set {}.{} portion to {}'.format(split, tgt, portion))
        tgt_dataset = PortionDataset(tgt_dataset, portion)

    tgt_dataset_sizes = tgt_dataset.sizes if tgt_dataset is not None else None

    LOGGER.info('loaded {} examples from: {}'.format(len(src_dataset), src_path))
    LOGGER.info('loaded {} examples from: {}'.format(len(tgt_dataset), tgt_path))
    return GraphLanguagePairDataset(
        src_dataset, src_dataset.sizes, src_dict,
        tgt_dataset, tgt_dataset_sizes, tgt_dict,
        left_pad_source=left_pad_source,
        left_pad_target=left_pad_target,
        max_source_positions=max_source_positions,
        max_target_positions=max_target_positions,
        align_dataset=None, eos=eos,
        remove_eos_from_source=True,
        append_eos_to_target=append_eos_to_target,
        shuffle=True,

    )
Example #4
 def build_dataset(args: Dict, src_dicts: Dict[str, Dictionary],
                   tgt_dict: Dictionary):
     """build dataset for modal"""
     for modality, src_dict in src_dicts.items():
         LOGGER.info('Building dataset for {}'.format(modality))
         for lang, data_prefs in args['preprocess']['dataprefs'].items():
             make_all(modality, src_dict, lang, data_prefs)
Example #5
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        paths = utils.split_paths(args['task']['data'])
        assert len(paths) > 0

        dict = args['task'].get('dict', None)
        dict_type = args['task'].get('dict_type', None)
        if dict is None and dict_type is None:
            # load dictionaries
            src_dict = cls.load_dictionary(
                os.path.join(
                    paths[0],
                    '{}.dict.jsonl'.format(args['task']['source_lang'])))
            tgt_dict = cls.load_dictionary(
                os.path.join(
                    paths[0],
                    '{}.dict.jsonl'.format(args['task']['target_lang'])))
            assert src_dict.pad() == tgt_dict.pad()
            assert src_dict.eos() == tgt_dict.eos()
            assert src_dict.unk() == tgt_dict.unk()
            LOGGER.info('[{}] dictionary: {} types'.format(
                args['task']['source_lang'], len(src_dict)))
            LOGGER.info('[{}] dictionary: {} types'.format(
                args['task']['target_lang'], len(tgt_dict)))
        else:
            raise NotImplementedError
        return cls(args, src_dict, tgt_dict)
Example #6
    def setup_task(cls, args, **kwargs):
        """Setup the task.
        """
        # paths = args.data.split(':')
        paths = utils.split_paths(args['task']['data'])
        assert len(paths) > 0
        dictionary = Dictionary.load(os.path.join(paths[0], 'dict.jsonl'))

        data_path = paths[0]
        if args['task']['langs'] is None:
            languages = sorted([
                name for name in os.listdir(data_path)
                if os.path.isdir(os.path.join(data_path, name))
            ])
        else:
            languages = args['task']['langs']  # .split(',')

        if args['task']['add_lang_token']:
            for lang in languages:
                dictionary.add_symbol('[{}]'.format(lang))

        LOGGER.info("Loading dictionary: {} types".format(len(dictionary)))
        # if not hasattr(args, 'shuffle_instance'):
        #     args.shuffle_instance = False
        return cls(args, dictionary)
Example #7
    def _inference_with_bleu(self, generator, sample, model):
        import sacrebleu

        def decode(toks, escape_unk=False):
            s = self.tgt_dict.string(
                toks.int().cpu(),
                self.args['task']['eval_bleu_remove_bpe'],
                escape_unk=escape_unk,
            )
            if self.tokenizer:
                s = self.tokenizer.decode(s)
            return s

        gen_out = self.inference_step(generator, [model], sample, None)
        hyps, refs = [], []
        for i in range(len(gen_out)):
            hyps.append(decode(gen_out[i][0]['tokens']))
            refs.append(
                decode(
                    utils.strip_pad(sample['target'][i], self.tgt_dict.pad()),
                    escape_unk=True,  # don't count <unk> as matches to the hypo
                ))
        if self.args['task']['eval_bleu_print_samples']:
            LOGGER.info('example hypothesis: ' + hyps[0])
            LOGGER.info('example reference: ' + refs[0])
        # tokenize = sacrebleu.DEFAULT_TOKENIZER if not self.args['task']['eval_tokenized_bleu'] else 'none'
        # return sacrebleu.corpus_bleu(hyps, [refs], tokenize=tokenize)
        if self.args['task']['eval_tokenized_bleu']:
            return sacrebleu.corpus_bleu(hyps, [refs], tokenize='none')
        else:
            return sacrebleu.corpus_bleu(hyps, [refs])
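
For reference, sacrebleu.corpus_bleu expects a list of hypothesis strings plus a list of reference streams and returns an object with a .score field. A minimal, self-contained call (the hypothesis/reference strings below are illustrative only):

# Toy sacrebleu usage; hyps and refs are illustrative strings, not task output.
import sacrebleu

hyps = ["return the sum of two numbers"]
refs = ["returns the sum of two numbers"]
# one reference stream per hypothesis; tokenize='none' skips sacrebleu's own tokenization
bleu = sacrebleu.corpus_bleu(hyps, [refs], tokenize='none')
print(bleu.score)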
Example #8
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        paths = utils.split_paths(args['task']['data'])
        assert len(paths) > 0

        share_dict = args['task'].get('share_dict', False)
        if share_dict:
            src_dict = tgt_dict = cls.load_dictionary(
                os.path.join(paths[0], "dict.jsonl"))
        else:
            # load dictionaries
            src_dict = cls.load_dictionary(
                os.path.join(paths[0],
                             f"{args['task']['source_lang']}.dict.jsonl"))
            tgt_dict = cls.load_dictionary(
                os.path.join(paths[0],
                             f"{args['task']['target_lang']}.dict.jsonl"))
            assert src_dict.pad() == tgt_dict.pad()
            assert src_dict.eos() == tgt_dict.eos()
            assert src_dict.unk() == tgt_dict.unk()
            LOGGER.info('[{}] dictionary: {} types'.format(
                args['task']['source_lang'], len(src_dict)))
            LOGGER.info('[{}] dictionary: {} types'.format(
                args['task']['target_lang'], len(tgt_dict)))
        return cls(args, src_dict, tgt_dict)
Example #9
def load_tokens_dataset(
    data_path, split, src, src_dict, tgt, tgt_dict, dataset_impl,
    max_source_positions=None, max_target_positions=None, max_positions=None,
    append_source_eos=False, append_target_eos=False,
    shuffle=False,
):
    src_path = os.path.join(data_path, '{}.{}'.format(split, src))
    src_dataset = _load_dataset(src_path, dataset_impl)
    if max_source_positions is not None:
        src_dataset = TruncateDataset(src_dataset, max_source_positions)
    LOGGER.info('loaded {} examples from: {}'.format(len(src_dataset), src_path))

    tgt_path = os.path.join(data_path, '{}.{}'.format(split, tgt))
    tgt_dataset = _load_dataset(tgt_path, dataset_impl)
    if max_target_positions is not None:
        tgt_dataset = TruncateDataset(tgt_dataset, max_target_positions)
    LOGGER.info('loaded {} examples from: {}'.format(len(tgt_dataset), tgt_path))

    return BertDataset(
        src_dataset, src_dataset.sizes, src_dict,
        tgt_dataset, tgt_dataset.sizes, tgt_dict,
        max_source_positions=max_source_positions, max_target_positions=max_target_positions,
        max_positions=max_positions,
        append_source_eos=append_source_eos, append_target_eos=append_target_eos,
        shuffle=shuffle,
    )
Example #10
def cli_main():
    import argparse
    parser = argparse.ArgumentParser(
        description=
        "Downloading/Decompressing CodeSearchNet dataset(s) or Tree-Sitter Library(ies)"
    )
    parser.add_argument("--yaml_file",
                        "-f",
                        type=str,
                        help="load {yaml_file}.yml for train",
                        default='config/python_wan/python')
    parser.add_argument(
        '--out_file',
        '-o',
        type=str,
        help='output generated file',
        default=None,
    )
    args = parser.parse_args()
    yaml_file = os.path.join(os.path.dirname(__file__),
                             '{}.yml'.format(args.yaml_file))
    out_file = args.out_file
    if out_file:
        dirname = os.path.dirname(out_file)
        os.makedirs(dirname, exist_ok=True)
    LOGGER.info('Load arguments in {}, output generated sentences at {} (if None, predictions are not recorded).' \
                .format(yaml_file, out_file))
    args = load_yaml(yaml_file)
    LOGGER.info(args)

    torch.cuda.set_device(args['distributed_training']['device_id'])
    main(args, out_file)
Example #11
 def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
     if args['preprocess']['dataset_impl'] == "raw":
         in_file = file_name(input_prefix, lang)
         out_dir = args['preprocess']['destdir']
         os.makedirs(out_dir, exist_ok=True)
         LOGGER.info('Copying {} into {}'.format(in_file, out_dir))
         shutil.copy(src=in_file, dst=args['preprocess']['destdir'])
     else:
         in_file = file_name(input_prefix, lang)
         out_file = dest_path(output_prefix, lang)
         os.makedirs(os.path.dirname(out_file), exist_ok=True)
         offsets = find_offsets(in_file, num_workers)
         with Pool(num_workers) as mpool:
             results = [
                 mpool.apply_async(
                     build_dgl_graph,
                     (vocab, in_file, f'{out_file}{worker_id}.mmap',
                      offsets[worker_id], offsets[worker_id + 1]),
                 ) for worker_id in range(num_workers)
             ]
             results = [res.get() for res in results]
         graph_batch = []
         for worker_id in range(num_workers):
             sub_file = f'{out_file}{worker_id}.mmap'
             glist, _ = load_graphs(sub_file)
             graph_batch.extend(glist)
             os.remove(sub_file)
         save_graphs(f'{out_file}.mmap', graph_batch)
Example #12
    def make_binary_dataset(vocab: Dictionary, input_file, output_file,
                            attr: str, num_workers: int):
        """make binary dataset"""
        LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # save un-recorded tokens

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the file into chunks
        # with multi-processing, workers 1..N-1 handle the 2nd to the last chunk
        # e.g. 1.txt with 10 workers: p0 gets offsets 0-99, p1 gets 100-199, ...
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, vocab, prefix, attr,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()
        # process the first chunk in this process; without multi-processing, process the whole file
        # p0 -> offsets 0..end
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(
            ds_file,
            impl=args['preprocess']['dataset_impl'],
            vocab_size=len(vocab))
        merge_result(
            Binarizer.binarize_bpe(input_file,
                                   vocab,
                                   lambda t: ds.add_item(t),
                                   offset=0,
                                   end=offsets[1]))
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge the sub-processes' index and data files into the final files, then delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))

        LOGGER.info(
            "[{}] {}: {} sents, {} tokens, BPE no replaced token".format(
                attr,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
            ))
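
The worker/callback structure above follows the standard multiprocessing pattern: apply_async dispatches chunks and a callback folds each worker's statistics into shared counters. A stripped-down sketch of just that pattern, with a hypothetical count_tokens stand-in for the binarizer:

# Minimal Pool/apply_async/callback sketch; count_tokens is a hypothetical worker.
from multiprocessing import Pool

def count_tokens(lines):
    return {"nseq": len(lines), "ntok": sum(len(line.split()) for line in lines)}

def run(chunks, num_workers):
    totals = [0, 0]  # [nseq, ntok]

    def merge_result(worker_result):
        totals[0] += worker_result["nseq"]
        totals[1] += worker_result["ntok"]

    with Pool(processes=num_workers) as pool:
        for chunk in chunks:
            pool.apply_async(count_tokens, (chunk,), callback=merge_result)
        pool.close()
        pool.join()
    return totals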
Example #13
 def setup_task(cls, args, **kwargs):
     paths = utils.split_paths(args['task']['data'])
     assert len(paths) > 0
     dictionary = cls.load_dictionary(
         os.path.join(paths[0],
                      'dict.{}.json'.format(args['task']['source_lang'])))
     LOGGER.info('dictionary: {} types'.format(len(dictionary)))
     return cls(args, dictionary)
Example #14
 def setup_task(cls, args, **kwargs):
     """Setup the task.
     """
     dictionary = Dictionary.load(os.path.join(args.data, 'dict.txt'))
     LOGGER.info('dictionary: {} types'.format(len(dictionary)))
     if not hasattr(args, 'shuffle_instance'):
         args.shuffle_instance = False
     return cls(args, dictionary)
Example #15
 def setup_task(cls, args, **kwargs):
     paths = utils.split_paths(args['task']['data'])
     assert len(paths) > 0
     # dictionary = cls.load_dictionary(os.path.join(paths[0], 'codesearchnet_ruby.dict.txt'))
     dictionary = cls.load_dictionary(
         os.path.join(paths[0], 'csnjs_8k_9995p_unigram_url.dict.txt'))
     # dictionary = cls.load_dictionary(args['dataset']['srcdict'])
     LOGGER.info('dictionary: {} types'.format(len(dictionary)))
     return cls(args, dictionary)
Example #16
    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        paths = utils.split_paths(self.args.data)
        assert len(paths) > 0
        data_path = paths[(epoch - 1) % len(paths)]
        split_path = os.path.join(data_path, split)

        dataset = data_utils.load_indexed_dataset(
            split_path,
            self.dictionary,
            self.args.dataset_impl,
            combine=combine,
        )
        if dataset is None:
            raise FileNotFoundError('Dataset not found: {} ({})'.format(
                split, split_path))

        dataset = StripTokenDataset(dataset, self.dictionary.eos())

        # create continuous blocks of tokens
        dataset = TokenBlockDataset(
            dataset,
            dataset.sizes,
            self.args.tokens_per_sample - 2,  # one less for <s> and one for </s>
            pad=self.dictionary.pad(),
            eos=self.dictionary.eos(),
            break_mode=self.args.sample_break_mode,
            document_sep_len=0)

        # prepend beginning-of-sentence token (<s>, equiv. to [CLS] in BERT)
        dataset = PrependTokenDataset(dataset, self.source_dictionary.bos())
        dataset = AppendTokenDataset(dataset, self.source_dictionary.eos())

        mask_whole_words = get_whole_word_mask(self.args, self.source_dictionary) \
            if self.args.mask_length != 'subword' else None

        self.datasets[split] = DenoisingDataset(
            dataset,
            dataset.sizes,
            self.dictionary,
            self.mask_idx,
            mask_whole_words,
            shuffle=self.args.shuffle_instance,
            seed=self.seed,
            args=self.args)
        LOGGER.info(
            "Split: {0}, Loaded {1} samples of denoising_dataset".format(
                split,
                len(self.datasets[split]),
            ))
Example #17
 def __init__(self, args, params):
     super().__init__(args)
     fused_adam_cls = get_fused_adam_class()
     use_fused_adam = (not args['optimization']['adam']['use_old_adam']
                       and fused_adam_cls is not None
                       and torch.cuda.is_available())
     if use_fused_adam:
         LOGGER.info('using FusedAdam')
         self._optimizer = fused_adam_cls(params, **self.optimizer_config)
     else:
         self._optimizer = Adam(params, **self.optimizer_config)
Example #18
def flatten(raw_file, dst_dir, mode):
    """flatten attributes of raw data"""
    data_frame = pd.read_csv(raw_file)
    attrs = data_frame.columns.values.tolist()[1:-1]
    LOGGER.info('Cast attributes({}) of OpenCL-{} dataset'.format(attrs, lang))
    for attr in attrs:
        dst_file = os.path.join(dst_dir, f"{mode}.{attr}")
        data = getattr(data_frame, attr).values.tolist()
        with file_io.open(dst_file, 'w') as writer:
            for line in data:
                print(json_io.json_dumps(line), file=writer)
Example #19
 def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
     if args['preprocess']['dataset_impl'] == "raw":
         in_file = file_name(input_prefix, lang)
         out_dir = args['preprocess']['destdir']
         os.makedirs(out_dir, exist_ok=True)
         LOGGER.info('Copying {} into {}'.format(in_file, out_dir))
         shutil.copy(src=in_file, dst=args['preprocess']['destdir'])
     else:
         in_file = file_name(input_prefix, lang)
         out_file = dest_path(output_prefix, lang)
         os.makedirs(os.path.dirname(out_file), exist_ok=True)
         make_binary_dataset(vocab, in_file, out_file, num_workers)
Example #20
def flatten(raw_dir, lang, mode, flatten_dir, attrs, num_cores):
    """flatten attributes of raw data"""
    LOGGER.info('Cast attributes({}) of {}-{} dataset'.format(
        attrs, lang, mode))
    with Pool(num_cores) as mpool:
        result = [
            mpool.apply_async(flatten_attrs,
                              (raw_file, flatten_dir, lang, mode, set(attrs)))
            for raw_file in PathManager.ls(
                os.path.join(raw_dir, lang, mode, '*.jsonl.gz'))
        ]
        result = [res.get() for res in result]
Example #21
def download(name):
    if name in TREE_SITTER_SO_FILE_ARCHIVE_MAP:
        url = TREE_SITTER_SO_FILE_ARCHIVE_MAP[name]
        LOGGER.info(f"Download {name}.so from {url}")
        gdown.download(url=url,
                       output=os.path.join(__TREE_SITTER_LIBS_DIR__,
                                           f"{name}.so"))
    else:
        raise FileNotFoundError(
            f"{name}.so has not been uploaded to the server. Please build {name}.so with "
            f"{os.path.dirname(__file__)}/build_so.py"
        )
Example #22
def load_langpair_dataset(
    data_path,
    split,
    src,
    src_dict,
    tgt,
    tgt_dict,
    dataset_impl,
    left_pad_source,
    max_source_positions,
    src_aux=None,
):
    # load source dataset
    src_path = os.path.join(data_path, '{}.{}'.format(split, src))
    src_dataset = _load_dataset(path=src_path,
                                impl=dataset_impl,
                                dict=src_dict)
    src_dataset = TruncateDataset(src_dataset,
                                  truncation_length=max_source_positions,
                                  truncate_prefix=0)

    # load target dataset
    tgt_path = os.path.join(data_path, '{}.{}'.format(split, tgt))
    tgt_dataset = _load_dataset(path=tgt_path,
                                impl=dataset_impl,
                                dict=tgt_dict)

    # load auxiliary dataset
    aux_datasets = OrderedDict()
    for aux in src_aux:
        aux_path = os.path.join(data_path, '{}.{}'.format(split, aux))
        with open(aux_path, 'rb') as reader:
            aux_datasets[aux] = pickle.load(reader)

    tgt_dataset_sizes = tgt_dataset.sizes if tgt_dataset is not None else None

    LOGGER.info('loaded {} examples from: {}'.format(len(src_dataset),
                                                     src_path))
    LOGGER.info('loaded {} examples from: {}'.format(len(tgt_dataset),
                                                     tgt_path))
    return LanguagePairDataset(
        src_dataset,
        src_dataset.sizes,
        src_dict,
        src_aux=aux_datasets,
        tgt=tgt_dataset,
        tgt_sizes=tgt_dataset_sizes,
        tgt_dict=tgt_dict,
        left_pad_source=left_pad_source,
        max_source_positions=max_source_positions,
        shuffle=(split == 'train'),
    )
Example #23
def spm_train(input: str,
              model_prefix: str,
              vocab_size: int,
              character_coverage=0.9995,
              model_type='unigram',
              special_symbols=None):
    special_symbols = ','.join(special_symbols)
    command = f"--input={input} --model_prefix={model_prefix} --vocab_size={vocab_size} " \
              f"--character_coverage={character_coverage} --model_type={model_type} --unk_piece=[UNK] " \
              f"--pad_piece=[PAD] --user_defined_symbols={special_symbols} --hard_vocab_limit=false"
    LOGGER.info(command)
    # exit()
    spm.SentencePieceTrainer.Train(command)
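
Training writes {model_prefix}.model and {model_prefix}.vocab; a minimal round trip with the resulting model (the path below is a placeholder) could look like:

# Illustrative SentencePiece round trip; 'bpe_model.model' is a placeholder path.
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load('bpe_model.model')
pieces = sp.encode('def add(a, b): return a + b', out_type=str)  # subword pieces
ids = sp.encode('def add(a, b): return a + b', out_type=int)     # piece ids
print(pieces, ids)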
Example #24
def cli_main():
    import argparse
    parser = argparse.ArgumentParser(
        description="Downloading/Decompressing CodeSearchNet dataset(s) or Tree-Sitter Library(ies)")
    parser.add_argument(
        "--yaml_file", "-f", help="load {yaml_file}.yml for train", type=str,
    )
    args = parser.parse_args()
    yaml_file = os.path.join(os.path.dirname(__file__), '{}.yml'.format(args.yaml_file))
    LOGGER.info('Load arguments in {}'.format(yaml_file))
    args = load_yaml(yaml_file)
    LOGGER.info(args)
    main(args)
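
load_yaml here is the repository's own helper; a rough stand-in using plain PyYAML (an assumption, not the actual implementation) would be:

# Rough stand-in for load_yaml, assuming a plain PyYAML dependency;
# the real helper may add includes or path expansion.
import yaml

def load_yaml_sketch(yaml_file):
    with open(yaml_file, 'r') as reader:
        return yaml.safe_load(reader)  # parses the config into nested dicts/lists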
Example #25
def main(args):
    # task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = TransformersDictionary.from_pretrained('microsoft/codebert-base', do_lower_case=False)

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    def parse_source_input(code):
        code_tokens = vocab.tokenize(code)
        # truncating
        code_tokens = code_tokens[:config.MAX_SOURCE_LENGTH - 2]
        source_tokens = [vocab.cls_token] + code_tokens + [vocab.sep_token]
        source_ids = vocab.convert_tokens_to_ids(source_tokens)
        source_size = len(source_tokens)
        source_mask = [1] * source_size
        padding_length = config.MAX_SOURCE_LENGTH - len(source_ids)
        source_ids += [vocab.pad()] * padding_length
        source_mask += [0] * padding_length
        return [source_ids, source_mask, source_size]

    def parse_target_input(code):
        target_tokens = vocab.tokenize(code)[:config.MAX_TARGET_LENGTH - 2]
        target_tokens = [vocab.cls_token] + target_tokens + [vocab.sep_token]
        target_ids = vocab.convert_tokens_to_ids(target_tokens)
        target_size = len(target_ids)
        target_mask = [1] * target_size
        padding_length = config.MAX_TARGET_LENGTH - len(target_ids)
        target_ids += [vocab.pad_token_id] * padding_length
        target_mask += [0] * padding_length
        return [target_ids, target_mask, target_size]

    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess']['tgt_lang']
    for lang, mode in itertools.product([src_lang, tgt_lang], MODES):
        src_file = args['preprocess'][f'{mode}pref'].replace('*', lang) + ".code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.pkl")
        PathManager.mkdir(os.path.dirname(dst_file))
        with file_io.open(src_file, 'r') as reader:
            keys = ['code', 'src_tokens', 'src_masks', 'src_sizes', 'tgt_tokens', 'tgt_masks', 'tgt_sizes']
            data = {key: [] for key in keys}
            for line in reader:
                src_code = json_io.json_loads(line)
                # src_code = SPACE_SPLITTER.sub(" ", line)
                # source_ids, source_mask
                src_line = parse_source_input(src_code)
                # target_ids, target_mask
                tgt_line = parse_target_input(src_code)
                for key, src in zip(keys, [src_code] + src_line + tgt_line):
                    data[key].append(src)
            file_io.open(dst_file, mode='wb', data=data)
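
The truncate/CLS/SEP/pad recipe in parse_source_input can be reproduced with the plain Hugging Face tokenizer instead of the TransformersDictionary wrapper; a rough sketch, where MAX_SOURCE_LENGTH = 256 is an arbitrary illustrative value:

# Rough equivalent of parse_source_input using transformers directly;
# MAX_SOURCE_LENGTH is an arbitrary value for illustration.
from transformers import RobertaTokenizer

MAX_SOURCE_LENGTH = 256
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base')

def parse_source_input_sketch(code):
    code_tokens = tokenizer.tokenize(code)[:MAX_SOURCE_LENGTH - 2]
    source_tokens = [tokenizer.cls_token] + code_tokens + [tokenizer.sep_token]
    source_ids = tokenizer.convert_tokens_to_ids(source_tokens)
    source_mask = [1] * len(source_ids)
    padding_length = MAX_SOURCE_LENGTH - len(source_ids)
    source_ids += [tokenizer.pad_token_id] * padding_length
    source_mask += [0] * padding_length
    return source_ids, source_mask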
Example #26
 def setup_task(cls, args, **kwargs):
     paths = utils.split_paths(args['task']['data'])
     assert len(paths) > 0
     # load dictionaries
     src_dicts = OrderedDict()
     for lang in args['task']['source_langs']:
         src_dicts[lang] = cls.load_dictionary(os.path.join(paths[0], '{}.dict.json'.format(lang)))
         LOGGER.info('[{}] dictionary: {} types'.format(lang, len(src_dicts[lang]) if lang != 'edges' else 0))
     tgt_dicts = OrderedDict()
     for lang in args['task']['target_langs']:
         tgt_dicts[lang] = cls.load_dictionary(os.path.join(paths[0], '{}.dict.json'.format(lang)))
         LOGGER.info('[{}] dictionary: {} types'.format(lang, len(tgt_dicts[lang])))
     return cls(args, src_dicts, tgt_dicts)
Example #27
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'],
                                              args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = spm.SentencePieceProcessor()
    vocab.load(SPM_VOCAB_FILE)

    def save_dict():
        src_file = os.path.join(os.path.dirname(SPM_VOCAB_FILE), 'dict.txt')
        tgt_file = os.path.join(args['preprocess']['destdir'], 'dict.jsonl')
        # Dictionary.text_to_jsonl(src_file, tgt_file)
        vocab = Dictionary()
        with file_io.open(src_file, 'r') as reader:
            for line in reader:
                token, num = line.strip().split()
                vocab.add_symbol(token, eval(num))
        vocab.save(tgt_file)
        return vocab

    dictionary = save_dict()

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    lang = args['preprocess']['lang']
    for mode in MODES:
        file = f"{args['preprocess'][f'{mode}pref']}.code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang,
                                f"{mode}.code")
        PathManager.mkdir(os.path.dirname(dst_file))
        dataset = indexed_dataset.make_builder(f"{dst_file}_tokens.mmap",
                                               impl='mmap',
                                               vocab_size=len(vocab))
        PathManager.mkdir(os.path.dirname(dst_file))
        with file_io.open(file, 'r') as reader:
            data = {'code': []}
            for line in reader:
                line = json_io.json_loads(line)
                code = SPACE_SPLITTER.sub(" ", line)
                data['code'].append(code)
                code_tokens = vocab.encode(code, out_type=str)
                code_tokens = torch.IntTensor(
                    [dictionary.index(token) for token in code_tokens])
                # code_tokens = torch.IntTensor(vocab.encode_as_ids(code))
                dataset.add_item(code_tokens)
            dataset.finalize(f"{dst_file}_tokens.idx")
            # proj indices
            # cp id
            data['proj_indices'] = [1] * len(data['code'])
            file_io.open(f"{dst_file}.pkl", mode='wb', data=data)
Example #28
 def clip_grad_norm(self, max_norm, aggregate_norm_fn=None):
     """Clips gradient norm."""
     self.scaler.unscale_(self.optimizer)
     grad_norm = self.fp32_optimizer.clip_grad_norm(max_norm,
                                                    aggregate_norm_fn)
     if not torch.isfinite(grad_norm).all():
         new_loss_scale = self.next_loss_scale
         if new_loss_scale <= self.min_loss_scale:
             raise FloatingPointError((
                 "AMP: Minimum loss scale reached ({}). Your loss is probably exploding. "
                 "Try restarting training or use fp32. {}").format(
                     self.min_loss_scale, new_loss_scale))
         else:
             LOGGER.info("AMP: overflow detected, setting scale to "
                         f"to {new_loss_scale}")
     return grad_norm
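
Outside the repository's optimizer wrappers, the same unscale-then-clip step can be written against torch.cuda.amp directly; a minimal sketch in which model, optimizer and loss are placeholders:

# Minimal AMP clipping sketch with torch.cuda.amp.GradScaler; names are placeholders.
import torch

scaler = torch.cuda.amp.GradScaler()

def backward_clip_step(model, optimizer, loss, max_norm):
    scaler.scale(loss).backward()
    scaler.unscale_(optimizer)  # bring gradients back to their true (fp32) scale
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
    scaler.step(optimizer)      # the step is skipped if gradients are not finite
    scaler.update()             # the loss scale is lowered after an overflow
    return grad_norm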
Example #29
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """

        paths = utils.split_paths(args['task']['data'])
        assert len(paths) > 0
        # load dictionaries
        dictionary = cls.load_dictionary(
            os.path.join(paths[0],
                         '{}.dict.json'.format(args['task']['target_lang'])))
        LOGGER.info('[{}] dictionary: {} types'.format(
            args['task']['target_lang'], len(dictionary)))
        return cls(args, dictionary)
Example #30
def download(name):
    if name in BPE_MODEL_ARCHIVE_MAP:
        url = BPE_MODEL_ARCHIVE_MAP[name]
        LOGGER.info(f"Download {name} BPE model from {url}")
        out_file = os.path.join(__BPE_DIR__, f"{name}.tar.gz")
        gdown.download(url=url, output=out_file)
        try:
            with tarfile.open(out_file) as reader:
                reader.extractall(__BPE_DIR__)
            os.remove(out_file)
        except tarfile.ExtractError as err:
            LOGGER.error(__BPE_DIR__)
            LOGGER.warning(f"{name}.tar.gz is corrupted, please contact us.")
    else:
        raise FileExistsError(f"No {name}.tar.gz in the server. Please build your own BPE models. " \
                              f"Once they are built, you can upload them into the server.")