Example #1
    def __init__(self,
                 vocab_file,
                 max_len=None,
                 do_basic_tokenize=True,
                 never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
        """Constructs a BertTokenizer.

        Args:
          vocab_file: Path to a one-wordpiece-per-line vocabulary file
          do_basic_tokenize: Whether to do basic tokenization before wordpiece.
          max_len: An artificial maximum length to truncate tokenized sequences to;
                         Effective maximum length is always the minimum of this
                         value (if specified) and the underlying BERT model's
                         sequence length.
          never_split: List of tokens which will never be split during tokenization.
                         Only has an effect when do_basic_tokenize=True
        """
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
                .format(vocab_file))
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict([
            (ids, tok) for tok, ids in self.vocab.items()
        ])
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
        self.max_len = max_len if max_len is not None else int(1e12)
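For context, the load_vocab call in these examples reads a plain one-wordpiece-per-line file into an ordered token-to-index mapping; the following is a minimal sketch of such a helper (the packaged implementation may differ in details):

import collections

def load_vocab(vocab_file):
    # Map each wordpiece (one per line) to its line index, preserving file order.
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        for index, line in enumerate(reader):
            vocab[line.strip()] = index
    return vocab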
Example #2
    def __init__(self, min_char: int, vocab_file: str, lower: bool,
                 add_sentence_boundary: bool, add_word_boundary: bool,
                 use_cuda: bool):
        super(WordPieceBatch,
              self).__init__(min_char=min_char,
                             lower=lower,
                             add_sentence_boundary=add_sentence_boundary,
                             add_word_boundary=add_word_boundary,
                             use_cuda=use_cuda)
        self.vocab = load_vocab(vocab_file=vocab_file)
        self.tokenizer = WordpieceTokenizer(vocab=self.vocab)
Example #3
    @classmethod
    def from_config(cls, config: Config):
        basic_tokenizer = create_component(
            ComponentType.TOKENIZER, config.basic_tokenizer
        )
        vocab = load_vocab(config.wordpiece_vocab_path)
        wordpiece_tokenizer = WordpieceTokenizer(vocab=vocab)
        return cls(vocab, basic_tokenizer, wordpiece_tokenizer)
Example #4
    def test_wordpiece_tokenizer(self):
        vocab_tokens = [
            "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
            "runn", "##ing"
        ]

        vocab = {}
        for (i, token) in enumerate(vocab_tokens):
            vocab[token] = i
        tokenizer = WordpieceTokenizer(vocab=vocab)

        self.assertListEqual(tokenizer.tokenize(""), [])

        self.assertListEqual(tokenizer.tokenize("unwanted running"),
                             ["un", "##want", "##ed", "runn", "##ing"])

        self.assertListEqual(tokenizer.tokenize("unwantedX running"),
                             ["[UNK]", "runn", "##ing"])
Example #5
    def _wordpiece(self, text, unit="text"):
        """
        ex) Hello World -> ['Hello', 'World'] -> ['He', '##llo', 'Wo', '##rld']
        """
        if self.subword_tokenizer is None:
            vocab_path = self.data_handler.read(self.config["vocab_path"], return_path=True)
            vocab = load_vocab(vocab_path)
            self.subword_tokenizer = WordpieceTokenizer(vocab)

        tokens = []

        if unit == "word":
            for sub_token in self.subword_tokenizer.tokenize(text):
                tokens.append(sub_token)
        else:
            for token in self.word_tokenizer.tokenize(text):
                for sub_token in self.subword_tokenizer.tokenize(token):
                    tokens.append(sub_token)

        return tokens
Example #6
class SubwordTokenizer(Tokenizer):
    """
    Subword Tokenizer

    text -> [word tokens] -> [[sub word tokens], ...]

    * Args:
        name: tokenizer name [wordpiece]
    """

    def __init__(self, name, word_tokenizer, config={}):
        super(SubwordTokenizer, self).__init__(name, f"subword-{name}+{word_tokenizer.cache_name}")
        self.data_handler = DataHandler(CachePath.VOCAB)
        self.config = config
        self.word_tokenizer = word_tokenizer
        self.subword_tokenizer = None

    """ Tokenizers """

    def _wordpiece(self, text, unit="text"):
        """
        ex) Hello World -> ['Hello', 'World'] -> ['He', '##llo', 'Wo', '##rld']
        """
        if self.subword_tokenizer is None:
            vocab_path = self.data_handler.read(self.config["vocab_path"], return_path=True)
            vocab = load_vocab(vocab_path)
            self.subword_tokenizer = WordpieceTokenizer(vocab)

        tokens = []

        if unit == "word":
            for sub_token in self.subword_tokenizer.tokenize(text):
                tokens.append(sub_token)
        else:
            for token in self.word_tokenizer.tokenize(text):
                for sub_token in self.subword_tokenizer.tokenize(token):
                    tokens.append(sub_token)

        return tokens
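A small end-to-end illustration of the two-stage flow implemented above, with a hypothetical vocabulary and a plain whitespace split standing in for the configured word tokenizer:

from pytorch_pretrained_bert.tokenization import WordpieceTokenizer

vocab = {"He": 0, "##llo": 1, "Wo": 2, "##rld": 3, "[UNK]": 4}
subword_tokenizer = WordpieceTokenizer(vocab)
word_tokens = "Hello World".split()                                 # ['Hello', 'World']
sub_tokens = [s for w in word_tokens for s in subword_tokenizer.tokenize(w)]
print(sub_tokens)                                                   # ['He', '##llo', 'Wo', '##rld']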
Example #7
class BertLabelTokenizer:
    """Runs end-to-end tokenization: punctuation splitting + wordpiece"""
    def __init__(self,
                 vocab_file,
                 max_len=None,
                 do_basic_tokenize=True,
                 never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
        """Constructs a BertTokenizer.

        Args:
          vocab_file: Path to a one-wordpiece-per-line vocabulary file
          do_basic_tokenize: Whether to do basic tokenization before wordpiece.
          max_len: An artificial maximum length to truncate tokenized sequences to;
                         Effective maximum length is always the minimum of this
                         value (if specified) and the underlying BERT model's
                         sequence length.
          never_split: List of tokens which will never be split during tokenization.
                         Only has an effect when do_basic_tokenize=True
        """
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
                .format(vocab_file))
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict([
            (ids, tok) for tok, ids in self.vocab.items()
        ])
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
        self.max_len = max_len if max_len is not None else int(1e12)

    def tokenize(self, text):
        split_tokens = []
        token_begin_mask = []
        for token in text:
            wordpieces = self.wordpiece_tokenizer.tokenize(token)
            if len(wordpieces) > 0:
                for sub_token in wordpieces:
                    split_tokens.append(sub_token)
                token_begin_mask += [1] + [0] * (len(wordpieces) - 1)
        return split_tokens, token_begin_mask

    def tokenize_labels(self, text, labels):
        split_tokens = []
        split_labels = []
        token_begin_mask = []
        for token, label in zip(text, labels):
            wordpieces = self.wordpiece_tokenizer.tokenize(token)
            if len(wordpieces) > 0:
                for sub_token in wordpieces:
                    split_tokens.append(sub_token)
                split_labels += [label] + ["X"] * (len(wordpieces) - 1)
                token_begin_mask += [1] + [0] * (len(wordpieces) - 1)
        return split_tokens, split_labels, token_begin_mask

    def convert_tokens_to_ids(self, tokens):
        """Converts a sequence of tokens into ids using the vocab."""
        ids = []
        for token in tokens:
            ids.append(self.vocab[token])
        if len(ids) > self.max_len:
            logger.warning(
                "Token indices sequence length is longer than the specified maximum "
                " sequence length for this BERT model ({} > {}). Running this"
                " sequence through BERT will result in indexing errors".format(
                    len(ids), self.max_len))
        return ids

    def convert_ids_to_tokens(self, ids):
        """Converts a sequence of ids in wordpiece tokens using the vocab."""
        tokens = []
        for i in ids:
            tokens.append(self.ids_to_tokens[i])
        return tokens

    @classmethod
    def from_pretrained(cls,
                        pretrained_model_name_or_path,
                        cache_dir=None,
                        *inputs,
                        **kwargs):
        """
        Instantiate a PreTrainedBertModel from a pre-trained model file.
        Download and cache the pre-trained model file if needed.
        """
        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[
                pretrained_model_name_or_path]
        else:
            vocab_file = pretrained_model_name_or_path
        if os.path.isdir(vocab_file):
            vocab_file = os.path.join(vocab_file, VOCAB_NAME)
        # redirect to the cache, if necessary
        try:
            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
        except EnvironmentError:
            logger.error(
                "Model name '{}' was not found in model name list ({}). "
                "We assumed '{}' was a path or url but couldn't find any file "
                "associated to this path or url.".format(
                    pretrained_model_name_or_path,
                    ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
                    vocab_file))
            return None
        if resolved_vocab_file == vocab_file:
            logger.info("loading vocabulary file {}".format(vocab_file))
        else:
            logger.info("loading vocabulary file {} from cache at {}".format(
                vocab_file, resolved_vocab_file))
        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
            # if we're using a pretrained model, ensure the tokenizer won't index sequences longer
            # than the number of positional embeddings
            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[
                pretrained_model_name_or_path]
            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
        # Instantiate tokenizer.
        tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
        return tokenizer
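As an illustration, with the toy vocabulary from Example #4 written to a file, tokenize_labels spreads each original label over a word's first piece and fills continuation pieces with "X" (the file name and labels below are hypothetical):

tokenizer = BertLabelTokenizer("toy_vocab.txt")  # hypothetical path: toy vocab, one token per line
tokens, labels, mask = tokenizer.tokenize_labels(["unwanted", "running"], ["B", "O"])
# tokens -> ['un', '##want', '##ed', 'runn', '##ing']
# labels -> ['B', 'X', 'X', 'O', 'X']
# mask   -> [1, 0, 0, 1, 0]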
Example #8
def main(param2val):

    # params
    params = Params.from_param2val(param2val)
    print(params, flush=True)

    #  paths
    project_path = Path(param2val['project_path'])
    save_path = Path(param2val['save_path'])
    srl_eval_path = project_path / 'perl' / 'srl-eval.pl'
    data_path_mlm = project_path / 'data' / 'training' / f'{params.corpus_name}_mlm.txt'
    data_path_train_srl = project_path / 'data' / 'training' / f'{params.corpus_name}_no-dev_srl.txt'
    data_path_devel_srl = project_path / 'data' / 'training' / f'human-based-2018_srl.txt'
    data_path_test_srl = project_path / 'data' / 'training' / f'human-based-2008_srl.txt'
    childes_vocab_path = project_path / 'data' / f'{params.corpus_name}_vocab.txt'
    google_vocab_path = project_path / 'data' / 'bert-base-cased.txt'  # to get word pieces

    # word-piece tokenizer - defines input vocabulary
    vocab = load_vocab(childes_vocab_path, google_vocab_path,
                       params.vocab_size)
    # TODO testing google vocab with wordpieces

    assert vocab['[PAD]'] == 0  # AllenNLP expects this
    assert vocab['[UNK]'] == 1  # AllenNLP expects this
    assert vocab['[CLS]'] == 2
    assert vocab['[SEP]'] == 3
    assert vocab['[MASK]'] == 4
    wordpiece_tokenizer = WordpieceTokenizer(vocab)
    print(f'Number of types in vocab={len(vocab):,}')

    # load utterances for MLM task
    utterances = load_utterances_from_file(data_path_mlm)
    train_utterances, devel_utterances, test_utterances = split(utterances)

    # load propositions for SRL task
    propositions = load_propositions_from_file(data_path_train_srl)
    train_propositions, devel_propositions, test_propositions = split(
        propositions)
    if data_path_devel_srl.is_file():  # use human-annotated data as devel split
        print(f'Using {data_path_devel_srl.name} as SRL devel split')
        devel_propositions = load_propositions_from_file(data_path_devel_srl)
    if data_path_test_srl.is_file():  # use human-annotated data as test split
        print(f'Using {data_path_test_srl.name} as SRL test split')
        test_propositions = load_propositions_from_file(data_path_test_srl)

    # converters handle conversion from text to instances
    converter_mlm = ConverterMLM(params, wordpiece_tokenizer)
    converter_srl = ConverterSRL(params, wordpiece_tokenizer)

    # get output_vocab
    # note: Allen NLP vocab holds labels, wordpiece_tokenizer.vocab holds input tokens
    # what from_instances() does:
    # 1. it iterates over all instances, and all fields, and all token indexers
    # 2. the token indexer is used to update vocabulary count, skipping words whose text_id is already set
    # 3. a PADDING and MASK symbol are added to 'tokens' namespace resulting in vocab size of 2
    # input tokens are not indexed, as they are already indexed by bert tokenizer vocab.
    # this ensures that the model is built with inputs for all vocab words,
    # such that words that occur only in LM or SRL task can still be input

    # make instances once - this allows iterating multiple times (required when num_epochs > 1)
    train_instances_mlm = converter_mlm.make_instances(train_utterances)
    devel_instances_mlm = converter_mlm.make_instances(devel_utterances)
    test_instances_mlm = converter_mlm.make_instances(test_utterances)
    train_instances_srl = converter_srl.make_instances(train_propositions)
    devel_instances_srl = converter_srl.make_instances(devel_propositions)
    test_instances_srl = converter_srl.make_instances(test_propositions)
    all_instances_mlm = chain(train_instances_mlm, devel_instances_mlm,
                              test_instances_mlm)
    all_instances_srl = chain(train_instances_srl, devel_instances_srl,
                              test_instances_srl)

    # make vocab from all instances
    output_vocab_mlm = Vocabulary.from_instances(all_instances_mlm)
    output_vocab_srl = Vocabulary.from_instances(all_instances_srl)
    # print(f'mlm vocab size={output_vocab_mlm.get_vocab_size()}')  # contain just 2 tokens
    # print(f'srl vocab size={output_vocab_srl.get_vocab_size()}')  # contain just 2 tokens
    assert output_vocab_mlm.get_vocab_size(
        'tokens') == output_vocab_srl.get_vocab_size('tokens')

    # BERT
    print('Preparing Multi-task BERT...')
    input_vocab_size = len(converter_mlm.wordpiece_tokenizer.vocab)
    bert_config = BertConfig(
        vocab_size_or_config_json_file=input_vocab_size,  # was 32K
        hidden_size=params.hidden_size,  # was 768
        num_hidden_layers=params.num_layers,  # was 12
        num_attention_heads=params.num_attention_heads,  # was 12
        intermediate_size=params.intermediate_size)  # was 3072
    bert_model = BertModel(config=bert_config)
    # Multi-tasking BERT
    mt_bert = MTBert(vocab_mlm=output_vocab_mlm,
                     vocab_srl=output_vocab_srl,
                     bert_model=bert_model,
                     embedding_dropout=params.embedding_dropout)
    mt_bert.cuda()
    num_params = sum(p.numel() for p in mt_bert.parameters()
                     if p.requires_grad)
    print('Number of model parameters: {:,}'.format(num_params), flush=True)

    # optimizers
    optimizer_mlm = BertAdam(params=mt_bert.parameters(), lr=params.lr)
    optimizer_srl = BertAdam(params=mt_bert.parameters(), lr=params.lr)
    move_optimizer_to_cuda(optimizer_mlm)
    move_optimizer_to_cuda(optimizer_srl)

    # batching
    bucket_batcher_mlm = BucketIterator(batch_size=params.batch_size,
                                        sorting_keys=[('tokens', "num_tokens")
                                                      ])
    bucket_batcher_mlm.index_with(output_vocab_mlm)
    bucket_batcher_srl = BucketIterator(batch_size=params.batch_size,
                                        sorting_keys=[('tokens', "num_tokens")
                                                      ])
    bucket_batcher_srl.index_with(output_vocab_srl)

    # big batcher to speed evaluation - 1024 is too big
    bucket_batcher_mlm_large = BucketIterator(batch_size=512,
                                              sorting_keys=[('tokens',
                                                             "num_tokens")])
    bucket_batcher_srl_large = BucketIterator(batch_size=512,
                                              sorting_keys=[('tokens',
                                                             "num_tokens")])
    bucket_batcher_mlm_large.index_with(output_vocab_mlm)
    bucket_batcher_srl_large.index_with(output_vocab_srl)

    # init performance collection
    name2col = {
        'devel_pps': [],
        'devel_f1s': [],
    }

    # init
    eval_steps = []
    train_start = time.time()
    loss_mlm = None
    no_mlm_batches = False
    step = 0

    # generators
    train_generator_mlm = bucket_batcher_mlm(train_instances_mlm,
                                             num_epochs=params.num_mlm_epochs)
    train_generator_srl = bucket_batcher_srl(
        train_instances_srl, num_epochs=None)  # infinite generator
    num_train_mlm_batches = bucket_batcher_mlm.get_num_batches(
        train_instances_mlm)
    if params.srl_interleaved:
        max_step = num_train_mlm_batches
    else:
        max_step = num_train_mlm_batches * 2
    print(f'Will stop training at step={max_step:,}')

    while step < max_step:

        # TRAINING
        if step != 0:  # otherwise evaluation at step 0 is influenced by training on one batch
            mt_bert.train()

            # masked language modeling task
            try:
                batch_mlm = next(train_generator_mlm)
            except StopIteration:
                if params.srl_interleaved:
                    break
                else:
                    no_mlm_batches = True
            else:
                loss_mlm = mt_bert.train_on_batch('mlm', batch_mlm,
                                                  optimizer_mlm)

            # semantic role labeling task
            if params.srl_interleaved:
                if random.random() < params.srl_probability:
                    batch_srl = next(train_generator_srl)
                    mt_bert.train_on_batch('srl', batch_srl, optimizer_srl)
            elif no_mlm_batches:
                batch_srl = next(train_generator_srl)
                mt_bert.train_on_batch('srl', batch_srl, optimizer_srl)

        # EVALUATION
        if step % config.Eval.interval == 0:
            mt_bert.eval()
            eval_steps.append(step)

            # evaluate perplexity
            devel_generator_mlm = bucket_batcher_mlm_large(devel_instances_mlm,
                                                           num_epochs=1)
            devel_pp = evaluate_model_on_pp(mt_bert, devel_generator_mlm)
            name2col['devel_pps'].append(devel_pp)
            print(f'devel-pp={devel_pp}', flush=True)

            # test sentences
            if config.Eval.test_sentences:
                test_generator_mlm = bucket_batcher_mlm_large(
                    test_instances_mlm, num_epochs=1)
                out_path = save_path / f'test_split_mlm_results_{step}.txt'
                predict_masked_sentences(mt_bert, test_generator_mlm, out_path)

            # probing - test sentences for specific syntactic tasks
            for name in config.Eval.probing_names:
                # prepare data
                probing_data_path_mlm = project_path / 'data' / 'probing' / f'{name}.txt'
                if not probing_data_path_mlm.exists():
                    print(f'WARNING: {probing_data_path_mlm} does not exist')
                    continue
                probing_utterances_mlm = load_utterances_from_file(
                    probing_data_path_mlm)
                # check that probing words are in vocab
                for u in probing_utterances_mlm:
                    # print(u)
                    for w in u:
                        if w == '[MASK]':
                            continue  # not in output vocab
                        # print(w)
                        assert output_vocab_mlm.get_token_index(
                            w, namespace='labels'), w
                # probing + save results to text
                probing_instances_mlm = converter_mlm.make_probing_instances(
                    probing_utterances_mlm)
                probing_generator_mlm = bucket_batcher_mlm(
                    probing_instances_mlm, num_epochs=1)
                out_path = save_path / f'probing_{name}_results_{step}.txt'
                predict_masked_sentences(mt_bert,
                                         probing_generator_mlm,
                                         out_path,
                                         print_gold=False,
                                         verbose=True)

            # evaluate devel f1
            devel_generator_srl = bucket_batcher_srl_large(devel_instances_srl,
                                                           num_epochs=1)
            devel_f1 = evaluate_model_on_f1(mt_bert, srl_eval_path,
                                            devel_generator_srl)

            name2col['devel_f1s'].append(devel_f1)
            print(f'devel-f1={devel_f1}', flush=True)

            # console
            min_elapsed = (time.time() - train_start) // 60
            pp = torch.exp(loss_mlm) if loss_mlm is not None else np.nan
            print(
                f'step {step:<6,}: pp={pp :2.4f} total minutes elapsed={min_elapsed:<3}',
                flush=True)

        # only increment step once in each iteration of the loop, otherwise evaluation may never happen
        step += 1

    # evaluate train perplexity
    if config.Eval.train_split:
        generator_mlm = bucket_batcher_mlm_large(train_instances_mlm,
                                                 num_epochs=1)
        train_pp = evaluate_model_on_pp(mt_bert, generator_mlm)
    else:
        train_pp = np.nan
    print(f'train-pp={train_pp}', flush=True)

    # evaluate train f1
    if config.Eval.train_split:
        generator_srl = bucket_batcher_srl_large(train_instances_srl,
                                                 num_epochs=1)
        train_f1 = evaluate_model_on_f1(mt_bert,
                                        srl_eval_path,
                                        generator_srl,
                                        print_tag_metrics=True)
    else:
        train_f1 = np.nan
    print(f'train-f1={train_f1}', flush=True)

    # test sentences
    if config.Eval.test_sentences:
        test_generator_mlm = bucket_batcher_mlm(test_instances_mlm,
                                                num_epochs=1)
        out_path = save_path / f'test_split_mlm_results_{step}.txt'
        predict_masked_sentences(mt_bert, test_generator_mlm, out_path)

    # probing - test sentences for specific syntactic tasks
    for name in config.Eval.probing_names:
        # prepare data
        probing_data_path_mlm = project_path / 'data' / 'probing' / f'{name}.txt'
        if not probing_data_path_mlm.exists():
            print(f'WARNING: {probing_data_path_mlm} does not exist')
            continue
        probing_utterances_mlm = load_utterances_from_file(
            probing_data_path_mlm)
        probing_instances_mlm = converter_mlm.make_probing_instances(
            probing_utterances_mlm)
        # batch and do inference
        probing_generator_mlm = bucket_batcher_mlm(probing_instances_mlm,
                                                   num_epochs=1)
        out_path = save_path / f'probing_{name}_results_{step}.txt'
        predict_masked_sentences(mt_bert,
                                 probing_generator_mlm,
                                 out_path,
                                 print_gold=False,
                                 verbose=True)

    # put train-pp and train-f1 into pandas Series
    s1 = pd.Series([train_pp], index=[eval_steps[-1]])
    s1.name = 'train_pp'
    s2 = pd.Series([train_f1], index=[eval_steps[-1]])
    s2.name = 'train_f1'

    # return performance as pandas Series
    series_list = [s1, s2]
    for name, col in name2col.items():
        print(f'Making pandas series with name={name} and length={len(col)}')
        s = pd.Series(col, index=eval_steps)
        s.name = name
        series_list.append(s)

    return series_list
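The perplexity printed in the training loop above is simply the exponentiated MLM cross-entropy; a self-contained illustration with a made-up loss value:

import torch

loss_mlm = torch.tensor(2.31)     # hypothetical average masked-LM cross-entropy
pp = torch.exp(loss_mlm).item()   # the value reported as pp=... in the console line
print(f'pp={pp:2.4f}')            # pp=10.0744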
Example #9
def test_WordpieceTokenizer():
    model = WordpieceTokenizer(
        tokenization.load_vocab(
            os.path.join(model_dir, "bert-base-cased-vocab.txt")))
    print(model.tokenize("decomposition deoomposition"))
Example #10
class WordPieceBatch(CharacterBatch, SpecialTokens):
    def __init__(self, min_char: int, vocab_file: str, lower: bool,
                 add_sentence_boundary: bool, add_word_boundary: bool,
                 use_cuda: bool):
        super(WordPieceBatch,
              self).__init__(min_char=min_char,
                             lower=lower,
                             add_sentence_boundary=add_sentence_boundary,
                             add_word_boundary=add_word_boundary,
                             use_cuda=use_cuda)
        self.vocab = load_vocab(vocab_file=vocab_file)
        self.tokenizer = WordpieceTokenizer(vocab=self.vocab)

    def create_one_batch(self, raw_dataset: List[List[str]]):
        batch_size = len(raw_dataset)
        seq_len = max([len(input_) for input_ in raw_dataset])
        if self.add_sentence_boundary:
            seq_len += 2

        sub_tokens = []
        for raw_data in raw_dataset:
            item = []
            for token in raw_data:
                if self.lower:
                    token = token.lower()
                item.append(self.tokenizer.tokenize(token))
            sub_tokens.append(item)
        max_char_len = max(
            [len(token) for item in sub_tokens for token in item])
        max_char_len = max(max_char_len, self.min_char)
        if self.add_word_boundary:
            max_char_len += 2

        batch = torch.LongTensor(batch_size, seq_len,
                                 max_char_len).fill_(self.pad_id)
        lengths = torch.LongTensor(batch_size, seq_len).fill_(1)
        for i, item in enumerate(sub_tokens):
            if self.add_sentence_boundary:
                item = [self.bos] + item + [self.eos]
            for j, token in enumerate(item):
                if self.add_sentence_boundary and (token == self.bos
                                                   or token == self.eos):
                    if self.add_word_boundary:
                        lengths[i, j] = 3
                        batch[i, j, 0] = self.mapping.get(self.bow)
                        batch[i, j, 1] = self.mapping.get(token)
                        batch[i, j, 2] = self.mapping.get(self.eow)
                    else:
                        lengths[i, j] = 1
                        batch[i, j, 0] = self.mapping.get(token)
                else:
                    if self.add_word_boundary:
                        lengths[i, j] = len(token) + 2
                        batch[i, j, 0] = self.mapping.get(self.bow)
                        for k, sub_token in enumerate(token):
                            batch[i, j, k + 1] = self.mapping.get(
                                sub_token, self.oov_id)
                        batch[i, j,
                              len(token) + 1] = self.mapping.get(self.eow)
                    else:
                        lengths[i, j] = len(token)
                        for k, sub_token in enumerate(token):
                            batch[i, j, k] = self.mapping.get(
                                sub_token, self.oov_id)

        if self.use_cuda:
            batch = batch.cuda()
            lengths = lengths.cuda()
        return batch, lengths

    def create_dict_from_dataset(self, raw_dataset: List[List[str]]):
        n_entries = 0
        for raw_data in raw_dataset:
            for token in raw_data:
                if self.lower:
                    token = token.lower()
                for sub_token in self.tokenizer.tokenize(token):
                    if sub_token not in self.mapping:
                        self.mapping[sub_token] = len(self.mapping)
                        n_entries += 1
        logger.info('+ loaded {0} entries from input'.format(n_entries))
        logger.info('+ current number of entries in mapping is: {0}'.format(
            len(self.mapping)))
Example #11
from collections import defaultdict
import sys
import argparse
from utilities import *

from pytorch_pretrained_bert.tokenization import BertTokenizer, WordpieceTokenizer

parser = argparse.ArgumentParser(description='Dataset Settings')
ps = parser.add_argument
ps('--dataset', dest='dataset')
args = parser.parse_args()
args.bert_model = 'bert-base-uncased'
args.do_lower_case = 'True'

tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
wordpiece_tokenizer = WordpieceTokenizer(vocab=tokenizer.vocab)


dataset = 'data/{}'.format(args.dataset)

#from utilities import *

def flatten(l):
    return [item for sublist in l for item in sublist]

def load_reviews(fp):
    with open('./{}/{}.json'.format(dataset, fp),'r') as f:
        data = json.load(f)
    return data

def load_set(fp):
Example #12
from pytorch_pretrained_bert.tokenization import BertTokenizer, WordpieceTokenizer

parser = argparse.ArgumentParser(description='Dataset Settings')
ps = parser.add_argument
ps('--dataset', dest='dataset')  # category
args = parser.parse_args()
args.bert_model = 'bert-base-uncased'
args.do_lower_case = 'True'

#main = ''
in_fp = './data/{}/{}_filter_flat_positive.large.json'.format(
    args.dataset, args.dataset)

tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                          do_lower_case=args.do_lower_case)
wordpiece_tokenizer = WordpieceTokenizer(vocab=tokenizer.vocab)

limit = 99999999999
min_reviews = 5
max_words = 1500

user_count = Counter()
item_count = Counter()
users = defaultdict(list)
items = defaultdict(list)

pairs = set()
interactions = defaultdict(list)

print("Building count dictionary first to save memory...")
#with gzip.open(in_fp, 'r') as f: