Example #1
def str_join(tokens, sep=' ', compat=True):
    """Concats :attr:`tokens` along the last dimension with intervening
    occurrences of :attr:`sep`.

    Args:
        tokens: An `n`-D numpy array or (possibly nested) list of `str`.
        sep (str): The string intervening between the tokens.
        compat (bool): Whether to convert tokens into `unicode` (Python 2)
            or `str` (Python 3).

    Returns:
        An `(n-1)`-D numpy array (or list) of `str`.
    """
    def _recur_join(s):
        if len(s) == 0:
            return ''
        elif is_str(s[0]):
            return sep.join(s)
        else:
            s_ = [_recur_join(si) for si in s]
            return _maybe_list_to_array(s_, s)

    if compat:
        tokens = compat_as_text(tokens)

    str_ = _recur_join(tokens)

    return str_
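
The helpers is_str and _maybe_list_to_array used above are not part of this listing. A minimal sketch of plausible implementations, assuming the list-or-numpy-array convention these functions follow (names and behavior are assumptions, not texar's exact internals):

import numpy as np

def is_str(x):
    # Sketch: True for Python 3 text strings.
    return isinstance(x, str)

def _maybe_list_to_array(list_, like):
    # Sketch: keep list/tuple containers; otherwise fall back to a
    # numpy array, mirroring the container type of `like`.
    if isinstance(like, (list, tuple)):
        return type(like)(list_)
    return np.asarray(list_)

# Usage sketch:
# str_join([['a', 'b'], ['c', 'd']])  ->  ['a b', 'c d']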
Example #2
def strip_bos(str_, bos_token='<BOS>', compat=True):
    """Remove the leading BOS token.

    Assumes tokens in the strings are separated with the space character.

    Args:
        str\_: A `str`, or an `n`-D numpy array or (possibly nested)
            list of `str`.
        bos_token (str): The BOS token. Default is '<BOS>' as defined in
            :class:`~texar.data.SpecialTokens`.BOS
        compat (bool): Whether to convert tokens into `unicode` (Python 2)
            or `str` (Python 3).

    Returns:
        Strings of the same structure/shape as :attr:`str_`.
    """
    def _recur_strip(s):
        if is_str(s):
            return ' '.join(s.strip().split()).replace(bos_token + ' ', '')
        else:
            s_ = [_recur_strip(si) for si in s]
            return _maybe_list_to_array(s_, s)

    if compat:
        str_ = compat_as_text(str_)

    strp_str = _recur_strip(str_)

    return strp_str
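
A quick usage sketch with the default BOS token:

print(strip_bos('<BOS> a sentence'))           # 'a sentence'
print(strip_bos(['<BOS> x y', '<BOS> z w']))   # ['x y', 'z w']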
Example #3
def strip_token(str_, token, is_token_list=False, compat=True):
    """Returns a copy of strings with leading and trailing tokens removed.

    Note that besides :attr:`token`, all leading and trailing whitespace
    characters are also removed.

    If :attr:`is_token_list` is False, then the function assumes tokens in
    :attr:`str_` are separated with whitespace characters.

    Args:
        str\_: A `str`, or an `n`-D numpy array or (possibly nested)
            list of `str`.
        token (str): The token to strip, e.g., the '<PAD>' token defined in
            :class:`~texar.data.SpecialTokens`.PAD
        is_token_list (bool): Whether each sentence in :attr:`str_` is a list
            of tokens. If False, each sentence in :attr:`str_` is assumed to
            contain tokens separated with the space character.
        compat (bool): Whether to convert tokens into `unicode` (Python 2)
            or `str` (Python 3).

    Returns:
        The stripped strings of the same structure/shape as :attr:`str_`.

    Example:

        .. code-block:: python

            str_ = '<PAD> a sentence <PAD> <PAD>  '
            str_stripped = strip_token(str_, '<PAD>')
            # str_stripped == 'a sentence'

            str_ = ['<PAD>', 'a', 'sentence', '<PAD>', '<PAD>', '', '']
            str_stripped = strip_token(str_, '<PAD>', is_token_list=True)
            # str_stripped == ['a', 'sentence']
    """
    def _recur_strip(s):
        if is_str(s):
            if token == "":
                return ' '.join(s.strip().split())
            else:
                return (' '.join(s.strip().split())
                        .replace(' ' + token, '').replace(token + ' ', ''))
        else:
            s_ = [_recur_strip(si) for si in s]
            return _maybe_list_to_array(s_, s)

    s = str_

    if compat:
        s = compat_as_text(s)

    if is_token_list:
        s = str_join(s, compat=False)

    strp_str = _recur_strip(s)

    if is_token_list:
        strp_str = _recur_split(strp_str, str_)

    return strp_str
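
_recur_split is referenced above but not included in this listing. A plausible sketch that reverses str_join and restores the original nesting (an assumption, not necessarily the library's exact code):

def _recur_split(s, original):
    # Split joined strings back into token lists, mirroring the
    # container types of the original (pre-join) input.
    if is_str(s):
        return _maybe_list_to_array(s.split(), original)
    return _maybe_list_to_array([_recur_split(si, original) for si in s], s)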
Example #4
def strip_token(str_, token, compat=True):
    """Returns a copy of the strings with leading and trailing tokens
    removed.

    Assumes tokens in the strings are separated with the space character.

    Args:
        str\_: A `str`, or an `n`-D numpy array or (possibly nested)
            list of `str`.
        token (str): The token to strip, e.g., the '<PAD>' token defined in
            :class:`~texar.data.SpecialTokens`.PAD
        compat (bool): Whether to convert tokens into `unicode` (Python 2)
            or `str` (Python 3).

    Returns:
        The stripped strings of the same structure/shape as :attr:`str_`.
    """
    def _recur_strip(s):
        if is_str(s):
            return (' '.join(s.strip().split())
                    .replace(' ' + token, '').replace(token + ' ', ''))
        else:
            s_ = [_recur_strip(si) for si in s]
            return _maybe_list_to_array(s_, s)

    if compat:
        str_ = compat_as_text(str_)

    strp_str = _recur_strip(str_)

    return strp_str
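
The stripping preserves nesting, as in this illustrative sketch:

str_ = [['<PAD> hello <PAD>', 'world <PAD>'],
        ['<PAD> foo', 'bar']]
print(strip_token(str_, '<PAD>'))
# [['hello', 'world'], ['foo', 'bar']]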
Example #5
def map_ids_to_strs(ids,
                    vocab,
                    join=True,
                    strip_pad='<PAD>',
                    strip_bos='<BOS>',
                    strip_eos='<EOS>',
                    compat=True):
    """Transforms indexes to strings by id-token mapping, token concat, token
    stripping, etc.

    Args:
        ids: An n-D numpy array or (possibly nested) list of `int` indexes.
        vocab: An instance of :class:`~texar.data.Vocab`.
        join (bool): Whether to concatenate the tokens along the last
            dimension of :attr:`ids` into strings separated with the space
            character.
        strip_pad (str): The PAD token to strip from the strings (i.e., remove
            the leading and trailing PAD tokens of the strings). Default
            is '<PAD>' as defined in
            :class:`~texar.data.vocabulary.SpecialTokens`.`PAD`.
            Set to `None` or `False` to disable the stripping.
        strip_bos (str): The BOS token to strip from the strings (i.e., remove
            the leading BOS tokens of the strings).
            Default is '<BOS>' as defined in
            :class:`~texar.data.vocabulary.SpecialTokens`.`BOS`.
            Set to `None` or `False` to disable the stripping.
        strip_eos (str): The EOS token to strip from the strings (i.e., remove
            the EOS tokens and all subsequent tokens of the strings).
            Default is '<EOS>' as defined in
            :class:`~texar.data.vocabulary.SpecialTokens`.`EOS`.
            Set to `None` or `False` to disable the stripping.
        compat (bool): Whether to convert tokens into `unicode` (Python 2)
            or `str` (Python 3).

    Returns:
        If :attr:`join` is True, returns a `(n-1)`-D numpy array (or list) of
        concatenated strings. If :attr:`join` is False, returns an `n`-D numpy
        array (or list) of str tokens.
    """
    tokens = vocab.map_ids_to_tokens_py(ids)

    if compat:
        tokens = compat_as_text(tokens)

    str_ = str_join(tokens, compat=False)

    str_ = strip_special_tokens(str_,
                                strip_pad=strip_pad,
                                strip_bos=strip_bos,
                                strip_eos=strip_eos,
                                compat=False)

    def _recur_split(s):
        if is_str(s):
            return _maybe_list_to_array(s.split(), str_)
        else:
            s_ = [_recur_split(si) for si in s]
            return _maybe_list_to_array(s_, s)

    if join:
        return str_
    else:
        return _recur_split(str_)
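
compat_as_text, used by every function here, is also absent from this listing. A minimal sketch assuming it recursively decodes bytes to str (texar's real helper also covers Python 2 unicode):

import numpy as np

def compat_as_text(x):
    # Sketch: recursively decode bytes to str, preserving structure.
    if isinstance(x, bytes):
        return x.decode('utf-8')
    if isinstance(x, (list, tuple)):
        return type(x)(compat_as_text(v) for v in x)
    if isinstance(x, np.ndarray):
        return np.asarray([compat_as_text(v) for v in x.tolist()])
    return x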
Example #6
def strip_special_tokens(str_,
                         strip_pad='<PAD>',
                         strip_bos='<BOS>',
                         strip_eos='<EOS>',
                         compat=True):
    """Removes special tokens of strings, including:

        - Removes EOS and all subsequent tokens
        - Removes leading and trailing PAD tokens
        - Removes leading BOS tokens

    This is a joint function of :func:`strip_eos`, :func:`strip_pad`, and
    :func:`strip_bos`.

    Args:
        str\_: A `str`, or an `n`-D numpy array or (possibly nested)
            list of `str`.
        strip_pad (str): The PAD token to strip from the strings (i.e., remove
            the leading and trailing PAD tokens of the strings). Default
            is '<PAD>' as defined in
            :class:`~texar.data.SpecialTokens`.PAD.
            Set to `None` or `False` to disable the stripping.
        strip_bos (str): The BOS token to strip from the strings (i.e., remove
            the leading BOS tokens of the strings).
            Default is '<BOS>' as defined in
            :class:`~texar.data.SpecialTokens`.BOS.
            Set to `None` or `False` to disable the stripping.
        strip_eos (str): The EOS token to strip from the strings (i.e., remove
            the EOS tokens and all subsequent tokens of the strings).
            Default is '<EOS>' as defined in
            :class:`~texar.data.SpecialTokens`.EOS.
            Set to `None` or `False` to disable the stripping.
        compat (bool): Whether to convert tokens into `unicode` (Python 2)
            or `str` (Python 3).

    Returns:
        Strings of the same shape as :attr:`str_` with special tokens stripped.
    """
    if compat:
        str_ = compat_as_text(str_)

    if strip_eos is not None and strip_eos is not False:
        str_ = _strip_eos_(str_, strip_eos, compat=False)

    if strip_pad is not None and strip_pad is not False:
        str_ = strip_token(str_, strip_pad, compat=False)

    if strip_bos is not None and strip_bos is not False:
        str_ = _strip_bos_(str_, strip_bos, compat=False)

    return str_
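
An illustrative call chaining the three strip steps:

str_ = '<BOS> a sentence <EOS> <PAD> <PAD>'
print(strip_special_tokens(str_))   # 'a sentence'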
Example #7
def strip_bos(str_, bos_token='<BOS>', is_token_list=False, compat=True):
    """Remove all leading BOS tokens.

    Note that besides :attr:`bos_token`, all leading and trailing whitespace
    characters are also removed.

    If :attr:`is_token_list` is False, then the function assumes tokens in
    :attr:`str_` are separated with whitespace characters.

    Args:
        str\_: A `str`, or an `n`-D numpy array or (possibly nested)
            list of `str`.
        bos_token (str): The BOS token. Default is '<BOS>' as defined in
            :class:`~texar.data.SpecialTokens`.BOS
        is_token_list (bool): Whether each sentence in :attr:`str_` is a list
            of tokens. If False, each sentence in :attr:`str_` is assumed to
            contain tokens separated with the space character.
        compat (bool): Whether to convert tokens into `unicode` (Python 2)
            or `str` (Python 3).

    Returns:
        Strings of the same structure/shape as :attr:`str_`.
    """
    def _recur_strip(s):
        if is_str(s):
            if bos_token == '':
                return ' '.join(s.strip().split())
            else:
            return ' '.join(s.strip().split()).replace(bos_token + ' ', '')
        else:
            s_ = [_recur_strip(si) for si in s]
            return _maybe_list_to_array(s_, s)

    s = str_

    if compat:
        s = compat_as_text(s)

    if is_token_list:
        s = str_join(s, compat=False)

    strp_str = _recur_strip(s)

    if is_token_list:
        strp_str = _recur_split(strp_str, str_)

    return strp_str
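
With is_token_list=True, the input is joined, stripped, and split back:

tokens = [['<BOS>', 'a', 'sentence'], ['<BOS>', 'more', 'text']]
print(strip_bos(tokens, is_token_list=True))
# [['a', 'sentence'], ['more', 'text']]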
Example #8
def strip_eos(str_, eos_token='<EOS>', is_token_list=False, compat=True):
    """Remove the EOS token and all subsequent tokens.

    If :attr:`is_token_list` is False, then the function assumes tokens in
    :attr:`str_` are separated with whitespace characters.

    Args:
        str\_: A `str`, or an `n`-D numpy array or (possibly nested)
            list of `str`.
        eos_token (str): The EOS token. Default is '<EOS>' as defined in
            :class:`~texar.data.SpecialTokens`.EOS
        is_token_list (bool): Whether each sentence in :attr:`str_` is a list
            of tokens. If False, each sentence in :attr:`str_` is assumed to
            contain tokens separated with the space character.
        compat (bool): Whether to convert tokens into `unicode` (Python 2)
            or `str` (Python 3).

    Returns:
        Strings of the same structure/shape as :attr:`str_`.
    """
    def _recur_strip(s):
        if is_str(s):
            s_tokens = s.split()
            if eos_token in s_tokens:
                return ' '.join(s_tokens[:s_tokens.index(eos_token)])
            else:
                return s
        else:
            s_ = [_recur_strip(si) for si in s]
            return _maybe_list_to_array(s_, s)

    s = str_

    if compat:
        s = compat_as_text(s)

    if is_token_list:
        s = str_join(s, compat=False)

    strp_str = _recur_strip(s)

    if is_token_list:
        strp_str = _recur_split(strp_str, str_)

    return strp_str
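
An illustrative call on a token list:

print(strip_eos(['a', 'sentence', '<EOS>', '<PAD>'], is_token_list=True))
# ['a', 'sentence']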
Example #9
def strip_eos(str_, eos_token='<EOS>', compat=True):
    """Remove the EOS token and all subsequent tokens.

    Assumes tokens in the strings are separated with the space character.

    Args:
        str\_: A `str`, or an `n`-D numpy array or (possibly nested)
            list of `str`.
        eos_token (str): The EOS token. Default is '<EOS>' as defined in
            :class:`~texar.data.vocabulary.SpecialTokens`.`EOS`
        compat (bool): Whether to convert tokens into `unicode` (Python 2)
            or `str` (Python 3).

    Returns:
        Strings of the same structure/shape as :attr:`str_`.
    """
    def _recur_strip(s):
        if is_str(s):
            s_tokens = s.split()
            if eos_token in s_tokens:
                return ' '.join(s_tokens[:s_tokens.index(eos_token)])
            else:
                return s
        else:
            s_ = [_recur_strip(si) for si in s]
            return _maybe_list_to_array(s_, s)

    if compat:
        str_ = compat_as_text(str_)

    strp_str = _recur_strip(str_)

    return strp_str
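
The string form of the same operation:

print(strip_eos('a sentence <EOS> <PAD> <PAD>'))   # 'a sentence'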
Example #10
def map_ids_to_strs(ids,
                    vocab,
                    join=True,
                    strip_pad='<PAD>',
                    strip_bos='<BOS>',
                    strip_eos='<EOS>',
                    compat=True):
    """Transforms `int` indexes to strings by id-token mapping, token concat,
    and special token stripping, etc.

    Args:
        ids: An n-D numpy array or (possibly nested) list of `int` indexes.
        vocab: An instance of :class:`~texar.data.Vocab`.
        join (bool): Whether to concatenate the tokens along the last
            dimension into strings separated with the space character.
        strip_pad (str): The PAD token to strip from the strings (i.e., remove
            the leading and trailing PAD tokens of the strings). Default
            is '<PAD>' as defined in
            :class:`~texar.data.SpecialTokens`.PAD.
            Set to `None` or `False` to disable the stripping.
        strip_bos (str): The BOS token to strip from the strings (i.e., remove
            the leading BOS tokens of the strings).
            Default is '<BOS>' as defined in
            :class:`~texar.data.SpecialTokens`.BOS.
            Set to `None` or `False` to disable the stripping.
        strip_eos (str): The EOS token to strip from the strings (i.e., remove
            the EOS tokens and all subsequent tokens of the strings).
            Default is '<EOS>' as defined in
            :class:`~texar.data.SpecialTokens`.EOS.
            Set to `None` or `False` to disable the stripping.
        compat (bool): Whether to convert tokens into `unicode` (Python 2)
            or `str` (Python 3).

    Returns:
        If :attr:`join` is True, returns a `(n-1)`-D numpy array (or list) of
        concatenated strings. If :attr:`join` is False, returns an `n`-D numpy
        array (or list) of str tokens.

    Example:

        .. code-block:: python

            text_ids = [[1, 9, 6, 2, 0, 0], [1, 28, 7, 8, 2, 0]]

            text = map_ids_to_strs(text_ids, data.vocab)
            # text == ['a sentence', 'parsed from ids']

            text = map_ids_to_strs(
                text_ids, data.vocab, join=False,
                strip_pad=None, strip_bos=None, strip_eos=None)
            # text == [['<BOS>', 'a', 'sentence', '<EOS>', '<PAD>', '<PAD>'],
            #          ['<BOS>', 'parsed', 'from', 'ids', '<EOS>', '<PAD>']]
    """
    tokens = vocab.map_ids_to_tokens_py(ids)

    if compat:
        tokens = compat_as_text(tokens)

    str_ = str_join(tokens, compat=False)

    str_ = strip_special_tokens(str_,
                                strip_pad=strip_pad,
                                strip_bos=strip_bos,
                                strip_eos=strip_eos,
                                compat=False)

    def _recur_split(s):
        if is_str(s):
            return _maybe_list_to_array(s.split(), str_)
        else:
            s_ = [_recur_split(si) for si in s]
            return _maybe_list_to_array(s_, s)

    if join:
        return str_
    else:
        return _recur_split(str_)
Example #11
def corpus_bleu(list_of_references: List[List[MaybeList[str]]],
                hypotheses: List[MaybeList[str]],
                max_order: int = 4,
                lowercase: bool = False,
                smooth: bool = False,
                use_bp: bool = True,
                return_all: bool = False) -> MaybeList[float]:
    r"""Computes corpus-level BLEU score.

    Args:
        list_of_references: A list of lists of references for each hypothesis.
            Each reference can be either a list of string tokens, or a string
            of tokenized text with tokens separated by whitespace.
            The list can also be a numpy array.
        hypotheses: A list of hypothesis sentences.
            Each hypothesis can be either a list of string tokens, or a
            string of tokenized text with tokens separated by whitespace.
            The list can also be a numpy array.
        lowercase (bool): If `True`, lowercase reference and hypothesis
            tokens.
        max_order (int): Maximum n-gram order to use when computing
            BLEU score.
        smooth (bool): Whether or not to apply `(Lin et al. 2004)` smoothing.
        use_bp (bool): Whether to apply brevity penalty.
        return_all (bool): If `True`, returns BLEU and all
            n-gram precisions.

    Returns:
        If :attr:`return_all` is `False` (default), returns a ``float32``
        BLEU score.

        If :attr:`return_all` is `True`, returns a list of
        ``float32`` scores: ``[BLEU] + n-gram precisions``,
        which is of length :attr:`max_order` + 1.
    """
    list_of_references = compat_as_text(list_of_references)
    hypotheses = compat_as_text(hypotheses)

    matches_by_order = [0] * max_order
    possible_matches_by_order = [0] * max_order
    reference_length = 0
    hypothesis_length = 0

    for (references, hypothesis) in zip(list_of_references, hypotheses):
        reference_length += min(len(r) for r in references)
        hypothesis_length += len(hypothesis)

        merged_ref_ngram_counts: Counter[Tuple[str, ...]] = \
            collections.Counter()
        for reference in references:
            reference = _maybe_str_to_list(reference)
            if lowercase:
                reference = _lowercase(reference)
            merged_ref_ngram_counts |= _get_ngrams(reference, max_order)

        hypothesis = _maybe_str_to_list(hypothesis)
        if lowercase:
            hypothesis = _lowercase(hypothesis)
        hypothesis_ngram_counts = _get_ngrams(hypothesis, max_order)

        overlap = hypothesis_ngram_counts & merged_ref_ngram_counts
        for ngram in overlap:
            matches_by_order[len(ngram) - 1] += overlap[ngram]
        for order in range(1, max_order + 1):
            possible_matches = len(hypothesis) - order + 1
            if possible_matches > 0:
                possible_matches_by_order[order - 1] += possible_matches

    precisions = [0.0] * max_order
    for i in range(0, max_order):
        if smooth:
            precisions[i] = ((matches_by_order[i] + 1.) /
                             (possible_matches_by_order[i] + 1.))
        else:
            if possible_matches_by_order[i] > 0:
                precisions[i] = (float(matches_by_order[i]) /
                                 possible_matches_by_order[i])
            else:
                precisions[i] = 0.0

    if min(precisions) > 0:
        p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
        geo_mean = math.exp(p_log_sum)
    else:
        geo_mean = 0

    if use_bp:
        ratio = float(hypothesis_length) / reference_length
        if ratio > 1.0:
            bp = 1.
        else:
            if abs(ratio) < 1e-8:
                bp = 0.
            else:
                bp = math.exp(1 - 1. / ratio)
    else:
        bp = 1.

    bleu = geo_mean * bp

    if return_all:
        return [bleu * 100] + [p * 100 for p in precisions]
    else:
        return bleu * 100
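
_get_ngrams, _maybe_str_to_list, and _lowercase are assumed above. A minimal sketch following the standard corpus-BLEU n-gram counting scheme (plausible implementations, not necessarily texar's exact code):

import collections

def _maybe_str_to_list(s):
    # Sketch: tokenize on whitespace if given a string.
    return s.split() if isinstance(s, str) else s

def _lowercase(tokens):
    # Sketch: lowercase every token.
    return [t.lower() for t in tokens]

def _get_ngrams(segment, max_order):
    # Count all n-grams of orders 1..max_order in a token sequence.
    ngram_counts = collections.Counter()
    for order in range(1, max_order + 1):
        for i in range(len(segment) - order + 1):
            ngram_counts[tuple(segment[i:i + order])] += 1
    return ngram_counts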
Example #12
def corpus_bleu_moses(list_of_references,
                      hypotheses,
                      lowercase=False,
                      return_all=False):
    """Calculates corpus-level BLEU score using the
    **MOSES multi-bleu.perl** script.

    Args:
        list_of_references: A list of lists of references for each hypothesis.
            Each reference can be either a string, or a list of string tokens.
            The list can also be a numpy array.
        hypotheses: A list of hypothesis sentences.
            Each hypothesis can be either a string, or a list of string
            tokens. The list can also be a numpy array.
        lowercase (bool): If `True`, pass the "-lc" flag to the multi-bleu
            script.
        return_all (bool): If `True`, returns BLEU and all n-gram precisions.

    Returns:
        If :attr:`return_all` is `False` (default), returns a float32
        BLEU score.

        If :attr:`return_all` is `True`, returns a list of 5 float32 scores:
        `[BLEU, 1-gram precision, ..., 4-gram precision]`.
    """
    list_of_references = compat_as_text(list_of_references)
    hypotheses = compat_as_text(hypotheses)

    if np.size(hypotheses) == 0:
        return np.float32(0.)  # pylint: disable=no-member

    # Get multi-bleu.perl
    cur_dir = os.path.dirname(os.path.realpath(__file__))
    multi_bleu_path = os.path.abspath(
        os.path.join(cur_dir, "..", "..", "bin", "utils", "multi-bleu.perl"))

    # Create a temporary folder containing hypothesis and reference files
    result_path = tempfile.mkdtemp()
    # Create hypothesis file
    hfile_path = os.path.join(result_path, 'hyp')
    hyps = [_maybe_list_to_str(h) for h in hypotheses]
    with open(hfile_path, 'w', encoding='utf-8') as hfile:
        text = "\n".join(hyps)
        hfile.write(text)
        hfile.write("\n")
    # Create reference files
    max_nrefs = max([len(refs) for refs in list_of_references])
    rfile_path = os.path.join(result_path, 'ref')
    for rid in range(max_nrefs):
        with open(rfile_path + '%d' % rid, 'w', encoding='utf-8') as rfile:
            for refs in list_of_references:
                if rid < len(refs):
                    ref = _maybe_list_to_str(refs[rid])
                    rfile.write(ref + "\n")
                else:
                    rfile.write("\n")

    # Calculate BLEU
    multi_bleu_cmd = [multi_bleu_path]
    if lowercase:
        multi_bleu_cmd += ["-lc"]
    multi_bleu_cmd += [rfile_path]
    with open(hfile_path, "r") as hyp_input:
        try:
            multi_bleu_ret = subprocess.check_output(multi_bleu_cmd,
                                                     stdin=hyp_input,
                                                     stderr=subprocess.STDOUT)
            multi_bleu_ret = multi_bleu_ret.decode("utf-8")
            bleu_score = _parse_multi_bleu_ret(multi_bleu_ret, return_all)
        except subprocess.CalledProcessError as error:
            if error.output is not None:
                tf.logging.warning(
                    "multi-bleu.perl returned non-zero exit code")
                tf.logging.warning(error.output)
            if return_all:
                bleu_score = [np.float32(0.0)] * 5
            else:
                bleu_score = np.float32(0.0)

    shutil.rmtree(result_path)

    return np.float32(bleu_score)
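
Illustrative usage (assumes the bundled multi-bleu.perl exists at the path resolved above):

refs = [['the cat sat on the mat'], ['a quick brown fox']]
hyps = ['the cat sat on the mat', 'a quick brown fox jumps']
print(corpus_bleu_moses(refs, hyps))   # corpus BLEU on a 0-100 scale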
Example #13
def strip_special_tokens(str_, strip_pad='<PAD>', strip_bos='<BOS>',
                         strip_eos='<EOS>', is_token_list=False, compat=True):
    """Removes special tokens in strings, including:

        - Removes EOS and all subsequent tokens
        - Removes leading and trailing PAD tokens
        - Removes leading BOS tokens

    Note that besides the special tokens, all leading and trailing whitespace
    characters are also removed.

    This is a joint function of :func:`strip_eos`, :func:`strip_pad`, and
    :func:`strip_bos`.

    Args:
        str\_: A `str`, or an `n`-D numpy array or (possibly nested)
            list of `str`.
        strip_pad (str): The PAD token to strip from the strings (i.e., remove
            the leading and trailing PAD tokens of the strings). Default
            is '<PAD>' as defined in
            :class:`~texar.data.SpecialTokens`.PAD.
            Set to `None` or `False` to disable the stripping.
        strip_bos (str): The BOS token to strip from the strings (i.e., remove
            the leading BOS tokens of the strings).
            Default is '<BOS>' as defined in
            :class:`~texar.data.SpecialTokens`.BOS.
            Set to `None` or `False` to disable the stripping.
        strip_eos (str): The EOS token to strip from the strings (i.e., remove
            the EOS tokens and all subsequent tokens of the strings).
            Default is '<EOS>' as defined in
            :class:`~texar.data.SpecialTokens`.EOS.
            Set to `None` or `False` to disable the stripping.
        is_token_list (bool): Whether each sentence in :attr:`str_` is a list
            of tokens. If False, each sentence in :attr:`str_` is assumed to
            contain tokens separated with the space character.
        compat (bool): Whether to convert tokens into `unicode` (Python 2)
            or `str` (Python 3).

    Returns:
        Strings of the same shape as :attr:`str_` with special tokens stripped.
    """
    s = str_

    if compat:
        s = compat_as_text(s)

    if is_token_list:
        s = str_join(s, compat=False)

    if strip_eos is not None and strip_eos is not False:
        s = _strip_eos_(s, strip_eos, is_token_list=False, compat=False)

    if strip_pad is not None and strip_pad is not False:
        s = strip_token(s, strip_pad, is_token_list=False, compat=False)

    if strip_bos is not None and strip_bos is not False:
        s = _strip_bos_(s, strip_bos, is_token_list=False, compat=False)

    if is_token_list:
        s = _recur_split(s, str_)

    return s
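
An illustrative round trip with is_token_list=True:

tokens = [['<BOS>', 'a', 'sentence', '<EOS>', '<PAD>'],
          ['<BOS>', 'more', 'text', '<EOS>', '<PAD>']]
print(strip_special_tokens(tokens, is_token_list=True))
# [['a', 'sentence'], ['more', 'text']]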