Example #1
class BertXNLIDataset(Dataset):
    def __init__(self, directory, prefix, vocab_file, max_length: int = 512):
        super().__init__()
        self.max_length = max_length
        with open(os.path.join(directory, 'xnli_' + prefix), 'r') as f:
            lines = f.readlines()
        self.lines = lines
        self.tokenizer = BertWordPieceTokenizer(vocab_file)
        self.label_map = {
            "entailment": 0,
            "neutral": 1,
            "contradiction": 2,
            "contradictory": 2
        }

    def __len__(self):
        return len(self.lines)

    def __getitem__(self, idx):
        line = self.lines[idx]
        first, second, third = line.strip().split('\t', 2)
        first_input_ids = self.tokenizer.encode(first,
                                                add_special_tokens=False).ids
        second_input_ids = self.tokenizer.encode(second,
                                                 add_special_tokens=False).ids
        label = self.label_map[third]
        # 101, 102 and 103 are the [CLS], [SEP] and [MASK] ids in the standard BERT vocab
        input_ids = first_input_ids + [103] + second_input_ids
        if len(input_ids) > self.max_length - 2:
            input_ids = input_ids[:self.max_length - 2]
        # convert list to tensor
        input_ids = torch.LongTensor([101] + input_ids + [102])
        label = torch.LongTensor([int(label)])
        return input_ids, label
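
A minimal usage sketch for the dataset above (not part of the original snippet): the directory, prefix, and vocab-file paths are placeholders, and a small collate function pads the variable-length input_ids so a DataLoader can batch them.

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def collate_xnli(batch):
    # batch is a list of (input_ids, label) pairs returned by __getitem__ above
    input_ids, labels = zip(*batch)
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    return input_ids, torch.cat(labels)

dataset = BertXNLIDataset(directory="data/xnli", prefix="train",
                          vocab_file="bert-base-chinese-vocab.txt")  # placeholder paths
loader = DataLoader(dataset, batch_size=16, collate_fn=collate_xnli)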
Example #2
    def __init__(self,
                 data_path: str,
                 tokenizer: BertWordPieceTokenizer,
                 max_seq_length: int,
                 vocab_file_name: str):
        # For caching
        data_dirname = os.path.dirname(os.path.abspath(data_path))
        split = os.path.basename(os.path.abspath(data_path))

        # Process data
        self.tokenizer = tokenizer
        cached_path = os.path.join(data_dirname, "{}_{}_response_selection_cached".format(split, vocab_file_name))
        if not os.path.exists(cached_path):
            self.examples = []
            data = json.load(open(data_path))
            for example in tqdm(data, desc="Preprocessing"):
                context = ' <turn> '.join([e['utterance'] for e in example['messages-so-far']])
                context = ' '.join(context.split()[-max_seq_length:])
                encoded_context = tokenizer.encode(context)

                response = example['options-for-correct-answers'][0]['utterance']
                response = ' '.join(response.split()[-max_seq_length:])
                encoded_response = tokenizer.encode(response)

                candidates = [
                    ' '.join(e['utterance'].split()[-max_seq_length:])
                    for e in example['options-for-next']
                ]
                encoded_candidates = [tokenizer.encode(cand) for cand in candidates]

                correct_id = example['options-for-correct-answers'][0]['candidate-id']
                correct_ind = [
                    i for i,e in enumerate(example['options-for-next'])
                    if e['candidate-id'] == correct_id
                ]

                candidate_inputs = [{
                    "input_ids": np.array(cand.ids),
                    "attention_mask": np.array(cand.attention_mask),
                    "token_type_ids": np.array(cand.type_ids)
                } for cand in encoded_candidates]

                self.examples.append({
                    "ctx_input_ids": np.array(encoded_context.ids),
                    "ctx_attention_mask": np.array(encoded_context.attention_mask),
                    "ctx_token_type_ids": np.array(encoded_context.type_ids),
                    "rsp_input_ids": np.array(encoded_response.ids),
                    "rsp_attention_mask": np.array(encoded_response.attention_mask),
                    "rsp_token_type_ids": np.array(encoded_response.type_ids),
                    "candidates": candidate_inputs,
                    "correct_candidate": correct_ind
                })
            with open(cached_path, "wb") as f:
                pickle.dump(self.examples, f)
        else:
            LOGGER.info("Loading from cached path: {}".format(cached_path))
            with open(cached_path, "rb") as f:
                self.examples = pickle.load(f)
Example #3
class CheckerDecoder:
    def __init__(self, model_dir):
        self.detector = DetectorModel(os.path.join(model_dir, 'detector'))
        self.corrector = CorrectorModel(os.path.join(model_dir, 'corrector'))
        self.tokenizer = BertWordPieceTokenizer(
            os.path.join(model_dir, 'vocab.txt'))
        mask_id = self.tokenizer.encode('[MASK]').ids[1:-1]
        assert len(mask_id) == 1
        self.mask_id = mask_id[0]

    def predict(self, text, suggest=False, k=5, max_k=200):
        tokenized = self.tokenizer.encode(text)
        if len(tokenized.tokens) > MAX_LEN:
            raise ValueError('The text is too long (>512) to process')
        token_ids = tokenized.ids
        segment_ids = tokenized.type_ids
        mapping = rematch(tokenized.offsets)
        token_ids, segment_ids = np.array([token_ids]), np.array([segment_ids])
        probas = self.detector.predict(token_ids, segment_ids)[0][0]
        incorrect_ids = np.where(probas > 0.5)[0]
        token_ids[0, incorrect_ids] = self.mask_id

        if not suggest:
            ret = []
            for i in incorrect_ids:
                ret.append((i - 1, tokenized.tokens[i]))
            return ret

        probas = self.corrector.predict(token_ids, segment_ids)[0][0]
        sorted_probas, sort_indexs = topK(probas, max_k)
        ret = {}
        for i in incorrect_ids:
            if i == 0 or i == len(tokenized.tokens) - 1:
                continue
            current_token = text[mapping[i][0]:mapping[i][-1] + 1]
            current_pinyin = ' '.join(xmnlp.pinyin(current_token))
            cands = []
            for proba, token in zip(
                    sorted_probas[i],
                    self.tokenizer.decode(sort_indexs[i]).split()):
                pinyin = ' '.join(xmnlp.pinyin(token))
                score = 0
                if current_pinyin == pinyin:
                    score = 1
                cands.append((token, proba + score))
            cands.sort(key=lambda x: x[1], reverse=True)
            ret[(i - 1, current_token)] = cands[:k]
        return dict(ret)
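
A short usage sketch for the decoder above; the model directory is a placeholder, and DetectorModel, CorrectorModel, rematch, topK, MAX_LEN and xmnlp come from the surrounding project and are not shown in the snippet.

decoder = CheckerDecoder("models/checker")                     # placeholder path
print(decoder.predict("这是一个测试句子"))                       # list of (position, suspect token)
print(decoder.predict("这是一个测试句子", suggest=True, k=5))    # dict of correction candidates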
Example #4
    def test_train_from_iterator(self):
        text = ["A first sentence", "Another sentence", "And a last one"]
        tokenizer = BertWordPieceTokenizer()
        tokenizer.train_from_iterator(text, show_progress=False)

        output = tokenizer.encode("A sentence")
        assert output.tokens == ["a", "sentence"]
Example #5
 def _get_word_tokens_len(word: str,
                          tokenizer: BertWordPieceTokenizer) -> int:
     return sum(
         map(
             lambda token: 1 if token not in SPECIAL_TOKENS else 0,
             tokenizer.encode(word).tokens,
         ))
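
The helper above filters out special tokens because encode() adds [CLS] and [SEP] by default; SPECIAL_TOKENS is not shown in the snippet, so a plausible definition and a usage example are sketched here with a placeholder vocab path.

from tokenizers import BertWordPieceTokenizer

SPECIAL_TOKENS = {"[CLS]", "[SEP]", "[PAD]", "[UNK]", "[MASK]"}  # assumed definition

tokenizer = BertWordPieceTokenizer("bert-base-uncased-vocab.txt")  # placeholder vocab path
# number of WordPiece pieces the word splits into, excluding [CLS]/[SEP]
print(_get_word_tokens_len("tokenization", tokenizer))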
Example #6
def inf(text, model):
    # Hard-coded fallback mapping; overwritten below by the pickled mappings
    class2names = {
        "DESC": "DESCRIPTION",
        "ENTY": "ENTITY",
        "ABBR": "ABBREVIATION",
        "HUM": "HUMAN",
        "NUM": "NUMERIC",
        "LOC": "LOCATION"
    }

    class2names = load_pickle('class2names.pkl')
    subclass2names = load_pickle('subclass2names.pkl')
    idx2class = load_pickle('idx2class.pkl')
    idx2subclass = load_pickle('idx2subclass.pkl')

    tokenizer = BertWordPieceTokenizer('bert-word-piece-custom-wikitext-vocab-10k-vocab.txt',
                                       lowercase=True, strip_accents=True)

    tokens = torch.FloatTensor(tokenizer.encode(text).ids).unsqueeze(0).to('cpu')
    cls_, subcls = model(tokens)
    clsIdx = cls_.max(1)[-1].item()
    subclsIdx = subcls.max(1)[-1].item()

    return {
        "class": class2names[idx2class[clsIdx]],
        "subclass": subclass2names[idx2subclass[subclsIdx]]
    }
Example #7
 def _build_bert_inputs(self):
     vocab_path = os.path.join(os.environ["GOOGLE_BERT_PATH"], "uncased_L-6_H-768_A-12", "vocab.txt")
     tokenizer = BertWordPieceTokenizer(vocab_path)
     encoding = tokenizer.encode("我爱NLP")
     input_ids = tf.constant([encoding.ids], dtype=tf.int32, shape=(1, len(encoding.ids)))
     segment_ids = tf.constant([encoding.type_ids], dtype=tf.int32, shape=(1, len(encoding.type_ids)))
     attention_mask = tf.constant([encoding.attention_mask], dtype=tf.int32, shape=(1, len(encoding.attention_mask)))
     return input_ids, segment_ids, attention_mask
Example #8
def tokenize(sentence):
	# Instantiate a BERT tokenizer
	WordPiece = BertWordPieceTokenizer(bertLargeUncased)
	WordPieceEncoder = WordPiece.encode(sentence)
	# Print the ids, tokens and offsets
	print(WordPieceEncoder.ids)
	print(WordPieceEncoder.tokens)
	print(WordPieceEncoder.offsets)
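
A variant of the function above (an assumption, not part of the original): building the tokenizer once at module level avoids re-reading the vocab file on every call. bertLargeUncased is assumed to be the path to a vocab.txt file.

from tokenizers import BertWordPieceTokenizer

bertLargeUncased = "bert-large-uncased-vocab.txt"  # placeholder path
_word_piece = BertWordPieceTokenizer(bertLargeUncased)

def tokenize_cached(sentence):
    # Reuse the module-level tokenizer and return ids, tokens and offsets
    encoding = _word_piece.encode(sentence)
    return encoding.ids, encoding.tokens, encoding.offsets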
Example #9
    def __init__(self, tokenizer: AutoTokenizer, file_path: str, args):
        print(file_path)
        assert os.path.isfile(file_path)

        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            directory, args.bert_model_type + "_cached_mlm_" + filename)

        if os.path.exists(cached_features_file):
            print("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, "rb") as handle:
                self.samples = torch.load(handle)
        else:
            print("Creating features from dataset file at %s", directory)

            # Get the faster tokenizer from tokenizers package
            tokenizer.save_vocabulary(vocab_path='.')
            fast_tokenizer = BertWordPieceTokenizer("vocab.txt",
                                                    lowercase=args.lowercase)
            fast_tokenizer.enable_truncation(tokenizer.max_len)
            fast_tokenizer.enable_padding(max_length=tokenizer.max_len,
                                          pad_token=tokenizer.pad_token)

            self.samples = []

            # Load data over here
            df = pd.read_json(file_path)
            print('SQUAD data: ')

            for _, row in tqdm(df.iterrows(), total=df.shape[0]):
                for paragraph in row['data']['paragraphs']:
                    context = paragraph['context']
                    for qa_pair in paragraph['qas']:
                        question = qa_pair['question']

                        batch = fast_tokenizer.encode(question, context)
                        self.samples.append({
                            'input_ids':
                            batch.ids,
                            'attention_mask':
                            batch.attention_mask
                        })

                        for encoding in batch.overflowing:
                            self.samples.append({
                                'input_ids':
                                encoding.ids,
                                'attention_mask':
                                encoding.attention_mask
                            })

            df = None

            print("Saving features into cached file: ", cached_features_file)
            with open(cached_features_file, "wb") as handle:
                torch.save(self.samples,
                           handle,
                           pickle_protocol=pickle.HIGHEST_PROTOCOL)
Example #10
def main(args):
    print(args)
    if args['train']:
        tokenizer = BertWordPieceTokenizer(
            clean_text=True,
            handle_chinese_chars=True,
            strip_accents=True,  # Must be False if cased model
            lowercase=True,
            wordpieces_prefix="##"
        )

        tokenizer.train(
            files=['/data2/BERT/data/naver_news/news_3_preprocessed/naver_news.txt'],
            limit_alphabet=6000,
            vocab_size=32000
        )

        print(tokenizer.save_model("../BertWordPieceTokenizer_32000"))

    elif args['test']:
        test_str = '나는 워드피스 토크나이저를 써요. 성능이 좋은지 테스트 해보려 합니다.'

        print("=========== tokenizer ===========")
        tokenizer = BertWordPieceTokenizer("../BertWordPieceTokenizer_32000/vocab.txt")
        print(tokenizer)
        encoded_str = tokenizer.encode(test_str)
        print('encoding: ', encoded_str.ids)
        decoded_str = tokenizer.decode(encoded_str.ids)
        print(decoded_str)

        print("=========== BertTokenizer ===========")
        tokenizer = BertTokenizer("../BertWordPieceTokenizer_32000/vocab.txt")
        print(tokenizer)
        encoded_str = tokenizer.encode(test_str)
        print('encoding: ', encoded_str)
        decoded_str = tokenizer.decode(encoded_str)
        print(decoded_str)

        print("=========== BertTokenizer2 ===========")
        tokenizer = BertTokenizer.from_pretrained("../BertWordPieceTokenizer_32000")
        print(tokenizer)
        encoded_str = tokenizer.encode(test_str)
        print('encoding: ', encoded_str)
        decoded_str = tokenizer.decode(encoded_str)
        print(decoded_str)
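
The main function above indexes args like a dict with 'train' and 'test' keys, so a minimal, assumed entry point that produces such a dict is sketched here.

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--train", action="store_true")
    parser.add_argument("--test", action="store_true")
    main(vars(parser.parse_args()))  # vars() turns the Namespace into a dict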
Example #11
    def generate_custom_vocab(self):

        try:
            tokenizer = None
            # root dir path check and generate
            if not os.path.isdir(self.vocab_root_dir):
                os.makedirs(self.vocab_root_dir, exist_ok=True)

            # generate models directory
            self.vocab_dir = '/BERT_TRAINING_VOCAB_' + self.getCurrent_time()[2] + '/'
            os.makedirs(self.vocab_root_dir + self.vocab_dir, exist_ok=True)

            user_defined_symbols = ['[BOS]', '[EOS]', '[UNK]', '[UNK1]', '[UNK2]', '[UNK3]', '[UNK4]', '[UNK5]',
                                    '[UNK6]', '[UNK7]', '[UNK8]', '[UNK9]']
            unused_token_num = 200
            unused_list = ['[unused{}]'.format(n) for n in range(unused_token_num)]
            user_defined_symbols = user_defined_symbols + unused_list

            if self.tokenizer_type == 'word':
                # if lowercase is False, strip_accents must also be set to False
                tokenizer = BertWordPieceTokenizer(strip_accents=False,
                                                   lowercase=True,
                                                   clean_text=True,
                                                   handle_chinese_chars=True,
                                                   wordpieces_prefix="##"
                                                   )

            # when selected 'base' going to use bert-base-uncased tokenizer... close function

            # training vocab start
            corpus_file = [self.corpus_path]
            vocab_size = 32000
            limit_alphabet = 6000
            min_frequency = 3
            tokenizer.train(files=corpus_file,
                            vocab_size=vocab_size,
                            special_tokens=user_defined_symbols,
                            min_frequency=min_frequency,  # minimum number of occurrences for a token, 3
                            limit_alphabet=limit_alphabet,  # must be commented out when training a ByteLevelBPETokenizer
                            show_progress=True)

            self.setPrint('Custom Tokenizer Training is completed')

            sentence = '전화 통화가 정상적으로 안됨.'
            output = tokenizer.encode(sentence)
            self.setPrint('Tokenizer test sentence: {}'.format(sentence))
            self.setPrint('Tokenizer analysis result\n=>idx: {}\n=>tokens: {}\n=>offset: {}\n=>decode: {}\n'.
                          format(output.ids, output.tokens, output.offsets, tokenizer.decode(output.ids)))

            # save tokenizer
            tokenizer.save_model(self.vocab_root_dir + self.vocab_dir)

        except:
            self.setPrint('Error: {}. {}, line: {}'.format(sys.exc_info()[0],
                                                           sys.exc_info()[1],
                                                           sys.exc_info()[2].tb_lineno))
Example #12
class BERTTokenizer(Tokenizer):
    def __init__(self):
        super(BERTTokenizer, self).__init__()
        self.tokenizer = BertWordPieceTokenizer(DATA_DIR / "vocab" /
                                                "bert.txt",
                                                lowercase=True)

    def tokenize(self, text):
        seg_result = self.tokenizer.encode(text).tokens
        return seg_result
Example #13
 def _build_model_inputs(self):
     vocab_path = os.path.join(os.environ["GOOGLE_BERT_PATH"], "uncased_L-6_H-768_A-12", "vocab.txt")
     tokenizer = BertWordPieceTokenizer(vocab_path)
     encoding = tokenizer.encode("I love NLP, Neural [MASK] Processing is amazing!")
     logging.info("   ids: %s", encoding.ids)
     logging.info("tokens: %s", encoding.tokens)
     input_ids = tf.constant([encoding.ids], dtype=tf.int32, shape=(1, len(encoding.ids)))
     segment_ids = tf.constant([encoding.type_ids], dtype=tf.int32, shape=(1, len(encoding.type_ids)))
     attention_mask = tf.constant([encoding.attention_mask], dtype=tf.int32, shape=(1, len(encoding.attention_mask)))
     return input_ids, segment_ids, attention_mask
Example #14
    def test_basic_encode(self, bert_files):
        tokenizer = BertWordPieceTokenizer(bert_files["vocab"])

        # Encode with special tokens by default
        output = tokenizer.encode("My name is John", "pair")
        assert output.ids == [101, 2026, 2171, 2003, 2198, 102, 3940, 102]
        assert output.tokens == [
            "[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"
        ]
        assert output.offsets == [(0, 0), (0, 2), (3, 7), (8, 10), (11, 15),
                                  (0, 0), (0, 4), (0, 0)]
        assert output.type_ids == [0, 0, 0, 0, 0, 0, 1, 1]

        # Can encode without the special tokens
        output = tokenizer.encode("My name is John",
                                  "pair",
                                  add_special_tokens=False)
        assert output.ids == [2026, 2171, 2003, 2198, 3940]
        assert output.tokens == ["my", "name", "is", "john", "pair"]
        assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15), (0, 4)]
        assert output.type_ids == [0, 0, 0, 0, 1]
Example #15
    def __init__(self,
                 data_path: str,
                 tokenizer: BertWordPieceTokenizer,
                 max_seq_length: int,
                 vocab_file_name: str):
        # For caching
        data_dirname = os.path.dirname(os.path.abspath(data_path))
        split = os.path.basename(os.path.abspath(data_path))

        # Slot categories
        slot_vocab_path = os.path.join(os.path.dirname(data_path), "vocab.slot")
        slot_names = [e.strip() for e in open(slot_vocab_path).readlines()]
        slot_names.insert(0, "[PAD]")
        self.slot_label_to_idx = dict((label, idx) for idx, label in enumerate(slot_names))
        self.slot_idx_to_label = {idx: label for label, idx in self.slot_label_to_idx.items()}

        # Intent categories
        intent_vocab_path = os.path.join(data_dirname, "vocab.intent")
        intent_names = [e.strip() for e in open(intent_vocab_path).readlines()]
        self.intent_label_to_idx = dict((label, idx) for idx, label in enumerate(intent_names))
        self.intent_idx_to_label = {idx: label for label, idx in self.intent_label_to_idx.items()}

        # Process data
        self.tokenizer = tokenizer
        cached_path = os.path.join(data_dirname, "{}_{}_top_cached".format(split, vocab_file_name))
        if not os.path.exists(cached_path):
            self.examples = []
            data = [e.strip() for e in open(data_path).readlines()]
            for example in tqdm(data):
                example, intent = example.split(" <=> ")
                text = " ".join([e.split(":")[0] for e in example.split()])
                slots = " ".join([e.split(":")[1] for e in example.split()])
                encoded = tokenizer.encode(text)
                encoded_slot_labels = self.encode_token_labels([text], [slots],
                                                               len(encoded.ids),
                                                               tokenizer,
                                                               self.slot_label_to_idx,
                                                               max_seq_length)
                self.examples.append({
                    "input_ids": np.array(encoded.ids)[-max_seq_length:],
                    "attention_mask": np.array(encoded.attention_mask)[-max_seq_length:],
                    "token_type_ids": np.array(encoded.type_ids)[-max_seq_length:],
                    "slot_labels": encoded_slot_labels[-max_seq_length:],
                    "intent_label": self.intent_label_to_idx[intent],
                    "ind": len(self.examples),
                })
            with open(cached_path, "wb") as f:
                pickle.dump(self.examples, f)
        else:
            LOGGER.info("Loading from cached path: {}".format(cached_path))
            with open(cached_path, "rb") as f:
                self.examples = pickle.load(f)
Example #16
def tokenizer(file_name, model, is_file=False, remove_punc=False):
    # Kurdish tokenizer
    """Given a list of sentences in Kurdish, return the tokenized one as text with spaces between tokens """
    if is_file:
        with open(file_name, "r") as f:
            text = f.read().split("\n")

    else:
        text = file_name

    models = {
        "wordpiece":
        'tokenization_models/ckb-wordpiece_all_False_50000-vocab.txt',
        "bpe": 'tokenization_models/ckb_bpe_50k.model',
        "unigram": 'tokenization_models/ckb_unigram_50k.model',
        "WordPunct": ""
    }

    tokenized_text = list()

    if model == "wordpiece":
        WordPiece = BertWordPieceTokenizer(models[model],
                                           strip_accents=False,
                                           clean_text=False,
                                           lowercase=False)
        for sentence in text:
            WordPieceEncoder = WordPiece.encode(sentence)
            sentence_tokenized = " ".join(WordPieceEncoder.tokens)

            for token in ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "##"]:
                sentence_tokenized = sentence_tokenized.replace(token, " ")

            tokenized_text.append(" ".join(sentence_tokenized.split()))

    elif model == "WordPunct":
        for sentence in text:
            tokenized_text.append(" ".join(
                WordPunctTokenizer().tokenize(sentence)))

    else:
        sp = spm.SentencePieceProcessor()
        sp.Load(models[model])

        for sentence in text:
            # print(" ".join( sp.EncodeAsPieces(sentence)).replace("▁", "") )
            tokenized_text.append(" ".join(
                sp.EncodeAsPieces(sentence)).replace("▁", ""))

    if remove_punc:
        return remove_punctuation("\n".join(tokenized_text))
    else:
        return "\n".join(tokenized_text)
Example #17
    def __init__(
        self,
        data_path: str,
        tokenizer: BertWordPieceTokenizer,
        max_seq_length: int,
        vocab_file_name: str,
    ):
        # For caching
        data_dirname = os.path.dirname(os.path.abspath(data_path))
        split = os.path.basename(os.path.abspath(data_path))

        # Intent categories
        intent_vocab_path = os.path.join(data_dirname, "categories.json")
        intent_names = json.load(open(intent_vocab_path))
        self.intent_label_to_idx = dict(
            (label, idx) for idx, label in enumerate(intent_names))
        self.intent_idx_to_label = {
            idx: label
            for label, idx in self.intent_label_to_idx.items()
        }

        # Process data
        self.tokenizer = tokenizer
        cached_path = os.path.join(
            data_dirname, "{}_{}_intent_cached".format(split, vocab_file_name))

        if not os.path.exists(cached_path):
            self.examples = []
            reader = csv.reader(open(data_path))
            next(reader, None)  # skip the csv header row
            out = []
            for utt, intent in tqdm(reader):
                encoded = tokenizer.encode(utt)

                self.examples.append({
                    "input_ids":
                    np.array(encoded.ids)[-max_seq_length:],
                    "attention_mask":
                    np.array(encoded.attention_mask)[-max_seq_length:],
                    "token_type_ids":
                    np.array(encoded.type_ids)[-max_seq_length:],
                    "intent_label":
                    self.intent_label_to_idx[intent],
                    "ind":
                    len(self.examples),
                })
            with open(cached_path, "wb") as f:
                pickle.dump(self.examples, f)
        else:
            LOGGER.info("Loading from cached path: {}".format(cached_path))
            with open(cached_path, "rb") as f:
                self.examples = pickle.load(f)
Example #18
    def preprocess(self, max_len):
        utterance = self.utterance
        context = self.context
        label = self.label
        max_len = int(max_len)

        tokenizer = BertWordPieceTokenizer("bert_base_uncased/vocab.txt", lowercase=True)

        # Tokenize context
        tokenized_context = tokenizer.encode(context)

        # Tokenize utterance
        tokenized_utterance = tokenizer.encode(utterance)

        # Create inputs
        input_ids = tokenized_context.ids + tokenized_utterance.ids[1:]

        token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(
            tokenized_utterance.ids[1:]
        )
        attention_mask = [1] * len(input_ids)

        # Pad and create attention masks.
        # Skip if truncation is needed
        padding_length = max_len - len(input_ids)
        if padding_length > 0:  # pad
            input_ids = input_ids + ([0] * padding_length)
            attention_mask = attention_mask + ([0] * padding_length)
            token_type_ids = token_type_ids + ([0] * padding_length)
        elif padding_length < 0:  # skip
            input_ids = input_ids[:max_len]
            attention_mask = attention_mask[:max_len]
            token_type_ids = token_type_ids[:max_len]
        self.input_ids = input_ids
        self.token_type_ids = token_type_ids
        self.attention_mask = attention_mask
        self.label = label
        self.context_token_to_char = tokenized_context.offsets
Example #19
    def __init__(self, json_data_file, labels, vocab_dict, n_tokens):
        """
        labels is an object of class Labels()
        """
        WordPiece = BertWordPieceTokenizer(
            "bert-base-uncased-vocab.txt",
            lowercase=True,
            add_special_tokens=False,
            sep_token="",
            cls_token="",
        )
        self.x = []
        self.y = []

        self.similarity = torch.zeros((labels.n_labels, labels.n_labels))

        vocab = set()
        vocab.update(["UNK"])
        for l in tqdm(open(json_data_file)):
            d = json.loads(l)
            WordPieceEncoder = WordPiece.encode(d["text"])
            tokens = WordPieceEncoder.tokens
            self.x.append(tokens)
            vocab.update(tokens)
            self.y.append(labels.multihot(d["label"]))

            li = [labels.stoi[l] for l in d["label"]]
            for i in li:
                for j in li:
                    self.similarity[i, j] += 1
                    self.similarity[j, i] += 1

        self.similarity /= len(self.x)

        self.vocab = {tok: i for i, tok in enumerate(vocab)}

        if vocab_dict != None:
            self.vocab = vocab_dict

        for idx in tqdm(range(len(self.x))):
            self.x[idx] = [
                self.vocab[i] if i in self.vocab else self.vocab["UNK"]
                for i in self.x[idx]
            ][:n_tokens]
            if len(self.x[idx]) < n_tokens:
                self.x[idx] += [self.vocab["UNK"]] * n_tokens
                self.x[idx] = self.x[idx][:n_tokens]

        self.len = len(self.x)
Example #20
 def _build_bert_inputs(self):
     vocab_path = os.path.join(BASE_DIR, 'bert_uncased_L-6_H-768_A-12',
                               'vocab.txt')
     tokenizer = BertWordPieceTokenizer(vocab_path)
     encoding = tokenizer.encode('我爱NLP')
     input_ids = tf.constant([encoding.ids],
                             dtype=tf.int32,
                             shape=(1, len(encoding.ids)))
     segment_ids = tf.constant([encoding.type_ids],
                               dtype=tf.int32,
                               shape=(1, len(encoding.type_ids)))
     attention_mask = tf.constant([encoding.attention_mask],
                                  dtype=tf.int32,
                                  shape=(1, len(encoding.attention_mask)))
     return input_ids, segment_ids, attention_mask
Example #21
def test_tokenizer(text, model):
    # # Encode and decode
    # WordPiece = BertWordPieceTokenizer(model, lowercase=False)
    WordPiece = BertWordPieceTokenizer(model,
                                       strip_accents=True,
                                       clean_text=False,
                                       lowercase=False)
    WordPieceEncoder = WordPiece.encode(text)
    # print(WordPieceEncoder)

    # print(WordPieceEncoder.ids)
    # print(WordPieceEncoder.tokens)
    # print(WordPieceEncoder.offsets)

    return " ".join(WordPieceEncoder.tokens)
Example #22
class BertTokenizer:
    def __init__(self, pretrained_name: str = "bert-base-cased-vocab.txt"):
        self.tokenizer = BertWordPieceTokenizer(pretrained_name,
                                                lowercase=False)

    def tokenize(self, s: str, offset: int = 0) -> List[Token]:
        output = self.tokenizer.encode(s)
        result = []
        n = len(output.tokens)
        for i, (bpe, pos, token_id) in enumerate(
                zip(output.tokens, output.offsets, output.ids)):
            if i == 0 or i == n - 1:
                continue
            result.append(Token(bpe, idx=offset + pos[0], text_id=token_id))
        return result
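
The Token class used above is not shown in the snippet (it is likely a library type such as AllenNLP's Token); a minimal stand-in with the same constructor shape plus a usage example is sketched here, with a placeholder vocab path.

from dataclasses import dataclass

@dataclass
class Token:
    text: str
    idx: int = 0
    text_id: int = 0

tokenizer = BertTokenizer("bert-base-cased-vocab.txt")  # placeholder vocab path
for token in tokenizer.tokenize("Hello World"):
    print(token.text, token.idx, token.text_id)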
Example #23
    def __init__(self,
                 data_path: str,
                 tokenizer: BertWordPieceTokenizer,
                 max_seq_length: int,
                 vocab_file_name: str):
        # For caching
        data_dirname = os.path.dirname(os.path.abspath(data_path))
        split = os.path.basename(os.path.abspath(data_path))

        # Slot categories
        slot_vocab_path = os.path.join(os.path.dirname(data_path), "slots.json")
        slot_names = json.load(open(slot_vocab_path))
        slot_names.insert(0, "[PAD]")
        self.slot_label_to_idx = dict((label, idx) for idx, label in enumerate(slot_names))
        self.slot_idx_to_label = {idx: label for label, idx in self.slot_label_to_idx.items()}

        # Process data
        self.tokenizer = tokenizer
        cached_path = os.path.join(data_dirname, "{}_{}_slots_cached".format(split, vocab_file_name))
        texts = []
        slotss = []
        if not os.path.exists(cached_path):
            self.examples = []
            data = json.load(open(data_path))
            for example in tqdm(data):
                for text, slots in self.parse_example(example, max_seq_length):
                    encoded = tokenizer.encode(text)
                    encoded_slot_labels = self.encode_token_labels([text], [slots],
                                                                   len(encoded.ids),
                                                                   tokenizer,
                                                                   self.slot_label_to_idx,
                                                                   max_seq_length)
                    self.examples.append({
                        "input_ids": np.array(encoded.ids)[-max_seq_length:],
                        "attention_mask": np.array(encoded.attention_mask)[-max_seq_length:],
                        "token_type_ids": np.array(encoded.type_ids)[-max_seq_length:],
                        "slot_labels": encoded_slot_labels[-max_seq_length:]
                    })
                    texts.append(text)
                    slotss.append(slots)
            with open(cached_path, "wb") as f:
                pickle.dump(self.examples, f)
        else:
            LOGGER.info("Loading from cached path: {}".format(cached_path))
            with open(cached_path, "rb") as f:
                self.examples = pickle.load(f)
Example #24
class Tokenizer:
    def __init__(self, bert_model = "bert-base-uncased"):
        self.bert_model = bert_model
        self.vocabulary_path = "{}-vocab.txt".format(bert_model)
        print("Vocabulary for BERT model {}: {}".format(bert_model, self.vocabulary_path))
        self.tokenizer = BertWordPieceTokenizer(self.vocabulary_path)
    
    def encode(self, plain_text: list, max_length=100):
        """Use encode_plus instead?
        """
        token_ids = np.zeros(shape=(len(plain_text), max_length), dtype=np.int32)

        for i, text in enumerate(plain_text):
            encoded = self.tokenizer.encode(text)
            token_ids[i, 0:len(encoded)] = encoded.ids
        attention_masks = (token_ids != 0).astype(np.int32)
        return {"input_ids": token_ids, "attention_masks": attention_masks}
Example #25
class BERT16SDataset(Dataset):
	"""
	A torch Dataset class that loads 16S data from a TSV file and encodes it for BERT.
	:param vocab_path: str, path to the pre-trained bert tokenizer vocab file.
	:param data_path: str, path to the 16S data file.
	:param block_size: int, maximal BERT input length (an encoded sample will be padded to this length if too short)
	:param max_word_length: int, the maximal word length the tokenizer can encode.
	"""
	def __init__(self, vocab_path: str, data_path: str, block_size=512, max_word_length=100):

		assert os.path.isfile(data_path)
		assert os.path.isfile(vocab_path)

		_logger.info(f"Loading BERT tokenizer using vocab file {vocab_path}")
		self.tokenizer = BertWordPieceTokenizer(
			vocab_path,
			handle_chinese_chars=False,
			lowercase=False)
		self.tokenizer.enable_truncation(block_size)
		self.tokenizer.enable_padding(max_length=block_size)

		_logger.info(f"Loading 16S dataset file at {data_path}...")
		self._16s_corpus_df = pd.read_csv(data_path, sep='\t')
		_logger.info(f"16S corpus is of shape {self._16s_corpus_df.shape}")

		self.samples = self._16s_corpus_df.seq.values.tolist()
		self.max_word_length = max_word_length

	def __len__(self):
		return len(self._16s_corpus_df)

	def __getitem__(self, i):
		sample = self._split_sequence_by_max_word_length(self.samples[i])
		tokens = self.tokenizer.encode(sample)
		return torch.tensor(tokens.ids, dtype=torch.long)

	def _split_sequence_by_max_word_length(self, seq):
		"""
		split a 16S sequence (~1K long usually) into white-spaces separated chunks that the tokenizer can encode.
		:param seq: str, 16S sequence
		:return: str
		"""
		chunks = [seq[i: i + self.max_word_length] for i in range(0, len(seq), self.max_word_length)]
		return ' '.join(chunks)
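
A minimal usage sketch with placeholder paths; because truncation and padding to block_size are enabled in the constructor, every item is a fixed-length tensor and the default DataLoader collate function can batch them directly.

from torch.utils.data import DataLoader

dataset = BERT16SDataset(vocab_path="16s_vocab.txt",      # placeholder paths
                         data_path="16s_corpus.tsv",
                         block_size=512)
loader = DataLoader(dataset, batch_size=8, shuffle=True)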
Example #26
class BertTokenizer(Transformer):
    @timer
    def setup(self, stopwords=None, punct=None, lower=True, strip=True):
        self.tokenizer = BertWordPieceTokenizer(VOCAB_FILE, lowercase=lower)
        self.punct = punct or set(string.punctuation)
        self.stopwords = stopwords or set(sw.words("english"))

    def process_single(self, document):
        tokenized_text = self.tokenizer.encode(document)
        for token in tokenized_text.tokens:
            # If stopword, ignore token and continue
            if token in self.stopwords or token == "[CLS]" or token == "[SEP]":
                continue

            # If punctuation, ignore token and continue
            if all(char in self.punct for char in token):
                continue

            yield token
Example #27
class Tokenizer:
    def __init__(self, lang):
        """
        A Tokenizer class to load and train a custom tokenizer
        Using the Hugging Face tokenization library for the same
        """
        self.tokenizer_dir = r"data/{}".format(lang)
        if not os.path.exists(self.tokenizer_dir):
            os.mkdir(self.tokenizer_dir)
        self.vocab = self.tokenizer_dir + "/vocab.txt"
        if os.path.exists(self.vocab):
            print("Initialized tokenizer using cached vocab file {}".format(self.vocab))
            self.tokenizer = BertWordPieceTokenizer(vocab_file=self.vocab)
        else:
            self.tokenizer = BertWordPieceTokenizer()

        self.tokenizer.enable_padding(max_length=MAX_LENGTH)
        self.tokenizer.enable_truncation(max_length=MAX_LENGTH)

    def train_tokenizer(self, sentences):
        """
        Train a tokenizer with a list of sentences
        """

        if not os.path.exists(self.vocab):
            print("Training tokenizer for {}".format(self.tokenizer_dir))
            # The Hugging Face trainer only accepts files, so write the sentences to a temporary file
            with open(self.tokenizer_dir + "/data.txt", "w+", encoding="utf-8") as f:
                [f.write(i + "\n") for i in sentences]
            self.tokenizer.train([self.tokenizer_dir + "/data.txt"])
            self.tokenizer.save(self.tokenizer_dir)
            print("Trained a tokenizer with vocab size {}".format(self.tokenizer.get_vocab_size()))

            # Removing the temp file
            os.remove(self.tokenizer_dir + "/data.txt")

    def encode(self, decoded):
        return self.tokenizer.encode(decoded)

    def decode(self, encoded):
        return self.tokenizer.decode_batch(encoded)
Example #28
class SentimentModel:
    def __init__(self, model_dir):
        # load session and graph
        self.sess = tf.Session(graph=tf.Graph())
        tf.saved_model.loader.load(self.sess, ['serve'], export_dir=model_dir)
        self.tokenizer = BertWordPieceTokenizer(os.path.join(model_dir, 'vocab.txt'))
        self.tokenizer.enable_truncation(max_length=MAX_LEN)

    def predict(self, text):
        tokenized = self.tokenizer.encode(text)
        token_ids = tokenized.ids
        segment_ids = tokenized.type_ids
        token_ids, segment_ids = np.array([token_ids]), np.array([segment_ids])
        # placeholder
        input_token = self.sess.graph.get_tensor_by_name('Input-Token:0')
        input_segment = self.sess.graph.get_tensor_by_name('Input-Segment:0')
        output = self.sess.graph.get_tensor_by_name('label/Softmax:0')

        probas = self.sess.run([output], feed_dict={input_token: token_ids,
                                                    input_segment: segment_ids})
        return tuple(probas[0][0].tolist())
Example #29
def sentences_to_paragraphs(
        sentences: List[str], max_paragraph_len: int,
        tokenizer: BertWordPieceTokenizer) -> List[Paragraph]:
    parag_start, parag_end, paragraphs = 0, 0, []
    hold_parag, hold_parag_len = '', 0
    for sentence in sentences:
        encoding = tokenizer.encode(sentence)
        # [1:-1] to exclude [CLS] and [SEP]
        clipped_offsets = encoding.offsets[1:-1][:max_paragraph_len]
        (sent_start, _), (_, sent_end) = clipped_offsets[0], clipped_offsets[-1]
        clipped_sentence = sentence[sent_start:sent_end]
        n_tokens = len(clipped_offsets)
        if (hold_parag_len + n_tokens) > max_paragraph_len:
            paragraphs.append(Paragraph(parag_start, parag_end, hold_parag))
            hold_parag, hold_parag_len = '', 0
            parag_start = parag_end
        hold_parag, hold_parag_len = hold_parag + clipped_sentence, hold_parag_len + n_tokens
        parag_end += len(sentence)
    if hold_parag:
        paragraphs.append(Paragraph(parag_start, parag_end, hold_parag))
    return paragraphs
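
Paragraph is not defined in the snippet; in the original it would have to exist before the function, since it appears in the type hints. A plausible NamedTuple stand-in and a short usage example are sketched here with a placeholder vocab path.

from typing import NamedTuple
from tokenizers import BertWordPieceTokenizer

class Paragraph(NamedTuple):
    start: int
    end: int
    text: str

tokenizer = BertWordPieceTokenizer("bert-base-uncased-vocab.txt")  # placeholder path
sentences = ["First sentence. ", "Second sentence. ", "A third, final sentence."]
for p in sentences_to_paragraphs(sentences, max_paragraph_len=8, tokenizer=tokenizer):
    print(p.start, p.end, repr(p.text))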
Example #30
class BertChnSentCorpDataset(Dataset):
    def __init__(self, directory, prefix, vocab_file, max_length: int = 512):
        super().__init__()
        self.max_length = max_length
        with open(os.path.join(directory, prefix + '.tsv'), 'r') as f:
            lines = f.readlines()
        self.lines = lines[1:]
        self.tokenizer = BertWordPieceTokenizer(vocab_file)

    def __len__(self):
        return len(self.lines)

    def __getitem__(self, idx):
        line = self.lines[idx]
        label, sentence = line.split('\t', 1)
        input_ids = self.tokenizer.encode(sentence,
                                          add_special_tokens=False).ids
        if len(input_ids) > self.max_length - 2:
            input_ids = input_ids[:self.max_length - 2]
        # convert list to tensor
        input_ids = torch.LongTensor([101] + input_ids + [102])
        label = torch.LongTensor([int(label)])
        return input_ids, label