Code Example #1
File: tensorizers.py Project: omargamal510/pytext
    def numberize(self, row):
        """Tokenize, look up in vocabulary."""
        seq = []

        if self.add_bol_token:
            bol = EOL if self.use_eol_token_for_bol else BOL
            tokens, _, _ = self._lookup_tokens(
                pre_tokenized=[Token(bol, -1, -1)])
            seq.append(tokens)

        for raw_text in row[self.column]:
            tokens, _, _ = self._lookup_tokens(raw_text)
            seq.append(tokens)

        if self.add_eol_token:
            tokens, _, _ = self._lookup_tokens(
                pre_tokenized=[Token(EOL, -1, -1)])
            seq.append(tokens)

        max_len = max(len(sentence) for sentence in seq)
        for sentence in seq:
            pad_len = max_len - len(sentence)
            if pad_len:
                sentence += [self.vocab.get_pad_index()] * pad_len
        return seq, len(seq)
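The padding step above brings every sentence in the batch up to the length of the longest one before returning the list together with the sentence count. A minimal, self-contained sketch of just that step (the pad index 0 and the token-id lists are made-up values, not pytext internals):

def pad_sentences(seq, pad_index=0):
    # Pad each token-id list in place to the length of the longest one.
    max_len = max(len(sentence) for sentence in seq)
    for sentence in seq:
        sentence += [pad_index] * (max_len - len(sentence))
    return seq, len(seq)

# Hypothetical token ids for three sentences of different lengths.
print(pad_sentences([[5, 7], [3], [9, 2, 4]]))
# -> ([[5, 7, 0], [3, 0], [9, 2, 4]], 3)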
Code Example #2
File: tensorizers.py Project: slbinilkumar/pytext
    def _process(self, row, raw_token_output):
        sentence_process_fn = (
            self._tokenize if raw_token_output else self._lookup_tokens
        )
        pad_token = (
            self.vocab.pad_token if raw_token_output else self.vocab.get_pad_index()
        )
        seq = []

        if self.add_bol_token:
            bol = EOL if self.use_eol_token_for_bol else BOL
            tokens, _, _ = sentence_process_fn(pre_tokenized=[Token(bol, -1, -1)])
            seq.append(list(tokens))

        for raw_text in row[self.column]:
            tokens, _, _ = sentence_process_fn(raw_text)
            seq.append(list(tokens))

        if self.add_eol_token:
            tokens, _, _ = sentence_process_fn(pre_tokenized=[Token(EOL, -1, -1)])
            seq.append(list(tokens))

        max_len = max(len(sentence) for sentence in seq)
        for sentence in seq:
            pad_len = max_len - len(sentence)
            if pad_len:
                sentence += [pad_token] * pad_len
        return seq, len(seq)
Code Example #3
def tokenize(
    text: str = None,
    pre_tokenized: List[Token] = None,
    tokenizer: Tokenizer = None,
    bos_token: Optional[str] = None,
    eos_token: Optional[str] = None,
    pad_token: str = PAD,
    use_eos_token_for_bos: bool = False,
    max_seq_len: int = 2**30,
):
    tokenized = (pre_tokenized
                 or tokenizer.tokenize(text)[:max_seq_len -
                                             (bos_token is not None) -
                                             (eos_token is not None)])
    if bos_token:
        if use_eos_token_for_bos:
            bos_token = eos_token
        tokenized = [Token(bos_token, -1, -1)] + tokenized
    if eos_token:
        tokenized.append(Token(eos_token, -1, -1))
    if not tokenized:
        tokenized = [Token(pad_token, -1, -1)]

    tokenized_texts, start_idx, end_idx = zip(*((t.value, t.start, t.end)
                                                for t in tokenized))
    return tokenized_texts, start_idx, end_idx
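A hedged usage sketch for this standalone helper, assuming the tokenize function above (and the names its signature refers to) is already in scope. The Token namedtuple and the whitespace tokenizer below are simplified stand-ins for pytext's own classes, and the special-token strings are placeholders, not pytext's actual constants:

from collections import namedtuple

Token = namedtuple("Token", ["value", "start", "end"])  # stand-in for pytext's Token

class WhitespaceTokenizer:
    # Minimal stand-in for a pytext Tokenizer: split on whitespace, track offsets.
    def tokenize(self, text):
        tokens, pos = [], 0
        for piece in text.split():
            start = text.index(piece, pos)
            tokens.append(Token(piece, start, start + len(piece)))
            pos = start + len(piece)
        return tokens

texts, starts, ends = tokenize(
    text="hello world",
    tokenizer=WhitespaceTokenizer(),
    bos_token="__BEGIN_OF_SENTENCE__",  # placeholder special tokens
    eos_token="__END_OF_SENTENCE__",
)
# texts  -> ('__BEGIN_OF_SENTENCE__', 'hello', 'world', '__END_OF_SENTENCE__')
# starts/ends are -1 for the injected BOS/EOS tokens, character offsets otherwise.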
Code Example #4
File: tensorizers.py Project: jz3707/pytext
    def _lookup_tokens(self, text):
        tokenized = self.tokenizer.tokenize(text)[: self.max_seq_len]
        if self.add_bos_token:
            bos = EOS if self.use_eos_token_for_bos else BOS
            tokenized = [Token(bos, -1, -1)] + tokenized
        if self.add_eos_token:
            tokenized.append(Token(EOS, -1, -1))
        tokenized_texts, start_idx, end_idx = zip(
            *((t.value, t.start, t.end) for t in tokenized)
        )
        tokens = self.vocab.lookup_all(tokenized_texts)
        return tokens, start_idx, end_idx
Code Example #5
File: tensorizers.py Project: leolorenzoluis/pytext
    def _tokenize(self, text=None, pre_tokenized=None):
        tokenized = pre_tokenized or self.tokenizer.tokenize(text)[: self.max_seq_len]
        if self.add_bos_token:
            bos = EOS if self.use_eos_token_for_bos else BOS
            tokenized = [Token(bos, -1, -1)] + tokenized
        if self.add_eos_token:
            tokenized.append(Token(EOS, -1, -1))
        if not tokenized:
            tokenized = [Token(PAD, -1, -1)]

        tokenized_texts, start_idx, end_idx = zip(
            *((t.value, t.start, t.end) for t in tokenized)
        )
        return tokenized_texts, start_idx, end_idx
Code Example #6
File: tensorizers.py Project: sailfish009/pytext
    def numberize(self, row):
        """Convert text to bytes, pad batch."""
        tokens = self.tokenizer.tokenize(row[self.text_column])[: self.max_seq_len]
        if self.add_bos_token:
            bos = EOS if self.use_eos_token_for_bos else BOS
            tokens = [Token(bos, -1, -1)] + tokens
        if self.add_eos_token:
            tokens.append(Token(EOS, -1, -1))

        if not tokens:
            tokens = [Token(PAD, -1, -1)]
        bytes = [self._numberize_token(token)[: self.max_byte_len] for token in tokens]
        token_lengths = len(tokens)
        byte_lengths = [len(token_bytes) for token_bytes in bytes]
        return bytes, token_lengths, byte_lengths
Code Example #7
File: tensorizers.py Project: leolorenzoluis/pytext
    def numberize(self, row):
        """Convert text to bytes, pad batch."""
        tokens = self.tokenizer.tokenize(row[self.text_column])[: self.max_seq_len]
        if not tokens:
            tokens = [Token(PAD, -1, -1)]
        bytes = [self._numberize_token(token)[: self.max_byte_len] for token in tokens]
        token_lengths = len(tokens)
        byte_lengths = [len(token_bytes) for token_bytes in bytes]
        return bytes, token_lengths, byte_lengths
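The _numberize_token helper these byte tensorizers call is not shown in the snippets. A plausible, hypothetical sketch of a byte-level numberization consistent with how it is used above (UTF-8 bytes of the token text, truncated to max_byte_len); the actual pytext implementation may differ:

def numberize_token_bytes(token_value, max_byte_len=15):
    # Hypothetical: represent a token as the integer values of its UTF-8 bytes.
    return list(token_value.encode("utf-8"))[:max_byte_len]

print(numberize_token_bytes("héllo"))
# -> [104, 195, 169, 108, 108, 111]  (the accented character occupies two bytes)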
Code Example #8
def tokenize(
    text: str = None,
    pre_tokenized: List[Token] = None,
    tokenizer: Tokenizer = None,
    add_bos_token: bool = False,
    add_eos_token: bool = False,
    use_eos_token_for_bos: bool = False,
    max_seq_len: int = 2**30,
):
    tokenized = (pre_tokenized
                 or tokenizer.tokenize(text)[:max_seq_len - add_bos_token -
                                             add_eos_token])
    if add_bos_token:
        bos = EOS if use_eos_token_for_bos else BOS
        tokenized = [Token(bos, -1, -1)] + tokenized
    if add_eos_token:
        tokenized.append(Token(EOS, -1, -1))
    if not tokenized:
        tokenized = [Token(PAD, -1, -1)]

    tokenized_texts, start_idx, end_idx = zip(*((t.value, t.start, t.end)
                                                for t in tokenized))
    return tokenized_texts, start_idx, end_idx