import tokenization  # BERT's tokenization.py


def customize_tokenizer(text, do_lower_case=False):
  # BasicTokenizer is instantiated only for its _is_chinese_char helper.
  tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
  temp_x = ""
  text = tokenization.convert_to_unicode(text)
  for c in text:
    # Surround every Chinese character, punctuation mark, whitespace, and
    # control character with spaces so that split() isolates it as a token.
    if (tokenizer._is_chinese_char(ord(c)) or tokenization._is_punctuation(c)
        or tokenization._is_whitespace(c) or tokenization._is_control(c)):
      temp_x += " " + c + " "
    else:
      temp_x += c
  if do_lower_case:
    temp_x = temp_x.lower()
  return temp_x.split()  # so we end up with a list of tokens here
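
# Hedged usage sketch (assumes BERT's tokenization.py is importable; the
# _demo name is hypothetical): every CJK character and punctuation mark
# becomes its own token, while contiguous ASCII letters stay together.
def _demo_customize_tokenizer():
  tokens = customize_tokenizer(u"早上好, world!")
  assert tokens == [u"早", u"上", u"好", u",", u"world", u"!"]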
def _joinTokens_orig(self, example):
  tokens = []
  for t0i, token0 in enumerate(example.tokens0):
    if token0.startswith("##"):
      # Glue a "##" subword piece back onto the preceding word: pop the
      # separator space token(s) before it, unless the last real token ends
      # in punctuation.
      while len(tokens) > 0 and tokens[-1] == " " and not (
          len(tokens) > 1 and tokenizationOrig._is_punctuation(tokens[-2][-1])):
        tokens.pop()
      token0 = token0[2:]
    tokens.append(token0)
  text = "".join(tokens)
  return text
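
# Hedged sketch of what _joinTokens_orig does. The `example` container and
# the explicit u" " separator tokens in tokens0 are assumptions here, implied
# by the tokens[-1] == " " check above:
#
#   example.tokens0 = [u"play", u" ", u"##ing", u" ", u"chess"]
#   self._joinTokens_orig(example)  # -> u"playing chess"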
def test_is_punctuation(self):
  self.assertTrue(tokenization._is_punctuation(u"-"))
  self.assertTrue(tokenization._is_punctuation(u"$"))
  self.assertTrue(tokenization._is_punctuation(u"`"))
  self.assertTrue(tokenization._is_punctuation(u"."))
  self.assertFalse(tokenization._is_punctuation(u"A"))
  self.assertFalse(tokenization._is_punctuation(u" "))
def customize_tokenizer(text, do_lower_case=True):
  # Variant of customize_tokenizer above: uses a module-level
  # _is_chinese_char helper instead of instantiating a BasicTokenizer,
  # and lowercases by default.
  temp_x = ""
  text = tokenization.convert_to_unicode(text)
  for c in text:
    if (_is_chinese_char(ord(c)) or tokenization._is_punctuation(c)
        or tokenization._is_whitespace(c) or tokenization._is_control(c)):
      temp_x += " " + c + " "
    else:
      temp_x += c
  if do_lower_case:
    temp_x = temp_x.lower()
  return temp_x.split()
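
# Hedged usage sketch for this variant (the module-level _is_chinese_char is
# assumed to match BasicTokenizer._is_chinese_char; the _demo name is
# hypothetical): with do_lower_case=True the ASCII word is lowercased after
# the spacing pass.
def _demo_customize_tokenizer_lower():
  tokens = customize_tokenizer(u"Hello, 世界")
  assert tokens == [u"hello", u",", u"世", u"界"]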
def _is_chinese_or_punctuation(ch):
  # _is_chinese_char expects a Unicode code point (note the ord() at the
  # call sites above), while _is_punctuation expects the character itself,
  # so the two checks need different arguments.
  return _is_chinese_char(ord(ch)) or _is_punctuation(ch)
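
# Quick sanity checks for the fixed helper (illustrative only; assumes the
# BERT definitions of _is_chinese_char and _is_punctuation; the _demo name
# is hypothetical):
def _demo_is_chinese_or_punctuation():
  assert _is_chinese_or_punctuation(u"中")     # CJK ideograph
  assert _is_chinese_or_punctuation(u",")     # ASCII punctuation
  assert not _is_chinese_or_punctuation(u"a")  # plain Latin letter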