Esempio n. 1
0
def setter(a):
    """Report whether *a* consists mostly of non-letter, non-CJK characters.

    Each character's Unicode name is looked up with ``uc.charname``; a
    character counts as a hit when its name contains neither ``"CJK"`` nor
    ``"LETTER"``.  The smoothed hit ratio is
    ``(hits + 0.001) / (len(a) + 0.002)``.

    The previous version returned from inside the loop and reset the
    counter on every pass, so only the first character was ever examined
    and an empty input fell through and returned ``None``.

    Args:
        a: Sized iterable of single characters (typically a ``str``).

    Returns:
        ``True`` when the smoothed ratio exceeds 0.8, otherwise ``False``
        (including for empty input, where the ratio is 0.5).
    """
    smoothing = 0.001
    hits = smoothing
    for ch in a:
        name = uc.charname(ch)
        if "CJK" not in name and "LETTER" not in name:
            hits += 1
    # The denominator carries twice the smoothing term, which also keeps
    # the division defined for empty input.
    return (hits / (len(a) + smoothing * 2)) > 0.8
Esempio n. 2
0
def alt(a):
    """Report whether *a* contains both a "LEFT" and a "RIGHT" character.

    Every character's Unicode name is looked up with ``uc.charname``.  A
    name containing ``"LEFT"`` marks the left side as seen; otherwise a
    name containing ``"RIGHT"`` marks the right side as seen.  A name that
    contains both words only counts for the left side.

    Args:
        a: Iterable of single characters.

    Returns:
        ``True`` when both sides were seen, otherwise ``False``.
    """
    seen_left = False
    seen_right = False
    for ch in a:
        name = uc.charname(ch)
        if "LEFT" in name:
            seen_left = True
        elif "RIGHT" in name:
            seen_right = True
    return seen_left and seen_right
Esempio n. 3
0
def revEnv(a):
    """Strip trailing non-Latin characters from *a*.

    Scans from the end of *a* until it meets a character whose Unicode
    name (per ``uc.charname``) contains ``"LATIN"`` and returns everything
    up to and including that character.

    The previous version popped items by a rising index while iterating
    over the reversed list, which skipped every other element and ran off
    the end of the list; the resulting ``IndexError`` was swallowed by a
    bare ``except`` and turned into ``None``.

    Args:
        a: Sequence of single characters (a ``str`` or a list of them).

    Returns:
        The joined string without its trailing non-Latin run (``""`` when
        no Latin character is present), or ``None`` when a name lookup
        fails.
    """
    chars = list(a)
    end = len(chars)
    try:
        while end > 0 and "LATIN" not in uc.charname(chars[end - 1]):
            end -= 1
        return "".join(chars[:end])
    except Exception:
        # Best-effort contract kept from the original: a failed lookup or
        # join yields None instead of propagating.
        return None
Esempio n. 4
0
def pairFinder(a, b):
  """Report whether the grouped names of *a* and *b* differ in exactly one slot.

  For each character the Unicode name is passed through ``grouper``; of the
  items it yields, those whose first element is truthy are kept and reduced
  to their second element.  The two resulting lists must have the same
  length and differ at exactly one position.

  Args:
    a: First character.
    b: Second character.

  Returns:
    ``True`` for equal-length lists with exactly one mismatch, else ``False``.
  """
  # NOTE (from the original author): "nope. remove these things."
  def kept_parts(ch):
    return [item[1] for item in grouper(uc.charname(ch)) if item[0]]

  first, second = kept_parts(a), kept_parts(b)
  if len(first) != len(second):
    return False
  mismatches = 0
  for left, right in zip(first, second):
    if left != right:
      mismatches += 1
  return mismatches == 1
Esempio n. 5
0
def getBatch(a):
    """Return the Unicode character name of every item in *a*, in order.

    Args:
        a: Iterable of single characters.

    Returns:
        A list with one ``uc.charname`` result per item.
    """
    names = []
    for ch in a:
        names.append(uc.charname(ch))
    return names
Esempio n. 6
0
def probe(a):
    """Report whether every entry of *a* contains the substring "CJK".

    Args:
        a: Iterable of strings (Unicode character names at the call site).

    Returns:
        ``True`` when no entry lacks ``"CJK"`` (also for empty input),
        otherwise ``False``.
    """
    return all("CJK" in name for name in a)


def latin(a):
    """Report whether non-fullwidth Latin names dominate *a*.

    Args:
        a: Sized iterable of Unicode character-name strings.

    Returns:
        ``True`` when the share of names containing ``"LATIN"`` but not
        ``"FULLWIDTH"``, plus a 0.1 allowance, exceeds 0.7.  Empty input
        returns ``False`` instead of raising ``ZeroDivisionError``.
    """
    total = len(a)
    if total == 0:
        # Nothing to measure; the unguarded ratio would divide by zero.
        return False
    matches = 0
    for name in a:
        if "LATIN" in name and "FULLWIDTH" not in name:
            matches += 1
    return 0.7 < (matches / total + 0.1)


# Map every character of a string to its Unicode character name.
u0 = lambda y: [unicode_charnames.charname(x) for x in y]
# Accumulators for the two buckets that are stored at the end.
drx = {"control": [], "stopword": []}
for gf in range(len(fg)):
    # getShit is not defined in this fragment; its result is iterated item
    # by item below.  NOTE(review): presumably a list of strings -- confirm.
    u = getShit(fg[gf])
    for u1 in u:
        ux = u0(u1)
        if probe(ux) or latin(ux):
            # Items accepted by probe() or latin() are kept whole.
            drx["stopword"].append(u1)
        else:
            # Everything else is split and stored element by element.
            for y in u1:
                drx["control"].append(y)
# NOTE(review): the builtin hash() raises TypeError for a list, so `hash`
# is presumably rebound to a project helper elsewhere -- confirm.
drx["stopword"] = hash(drx["stopword"])
drx["control"] = hash(drx["control"] + drm)
storeAList(drx)
print(drx)
Esempio n. 7
0
# import unicode_charnames as uc
import jieba
# use stack?
from getFromPickleR import returnAList
import unicode_charnames as uc
# import wordninja as wj
# Non-empty stopword entries loaded from the pickled store.
xfz = [entry for entry in returnAList()['stopword'] if len(entry) > 0]
# Entries whose first character has "CJK" in its Unicode name ...
xf0 = [entry for entry in xfz if "CJK" in uc.charname(entry[0])]
# ... and the remaining entries.
xf1 = [entry for entry in xfz if entry not in xf0]

def getBatch(a):
  """Return the Unicode character name of every item in *a*, in order."""
  names = []
  for ch in a:
    names.append(uc.charname(ch))
  return names

def checkMe(a):
  """Report whether *a* has more than six items that are all the same value.

  Args:
    a: Sized iterable of hashable items.

  Returns:
    ``True`` when ``len(a) > 6`` and *a* holds exactly one distinct value,
    otherwise ``False``.
  """
  return len(a) > 6 and len(set(a)) == 1

def checker(a):
  """Return 1 when "DIGIT" occurs in *a*, otherwise 0."""
  return 1 if "DIGIT" in a else 0

# def wrapper(a,b):
#   # j=lambda x: [z for y in x for z in y]
#   if b==True:
#     return list(jieba.cut(a))
#   else:
#     return wj.split(a)
Esempio n. 8
0
    def test_charname(self):
        """Check charname() against a table of (character, expected name) pairs."""
        e_acute = "LATIN CAPITAL LETTER E WITH ACUTE"
        cases = [
            # Three spellings of U+00C9 must all resolve to the same name.
            ("É", e_acute),
            ("\u00C9", e_acute),
            (chr(0xC9), e_acute),
            # Names that end in the code point's hexadecimal value.
            ("\u3400", "CJK UNIFIED IDEOGRAPH-3400"),
            ("\U0003134A", "CJK UNIFIED IDEOGRAPH-3134A"),
            ("\uF900", "CJK COMPATIBILITY IDEOGRAPH-F900"),
            ("\U00017000", "TANGUT IDEOGRAPH-17000"),
            ("\U0001B170", "NUSHU CHARACTER-1B170"),
            ("\U00018CD5", "KHITAN SMALL SCRIPT CHARACTER-18CD5"),
            # Code points that are reported with a bracketed label.
            ("\u0000", "<control-0000>"),
            ("\uF8FF", "<private-use-F8FF>"),
            ("\uD800", "<surrogate-D800>"),
            ("\U0010FFFF", "<noncharacter-10FFFF>"),
        ]
        for char, expected in cases:
            self.assertEqual(charname(char), expected)
Esempio n. 9
0
import unicode_charnames
# Should we take the predefined groups into account?
# Are there any missing alphabets or components?
# Hidden candidates are everywhere.
def getShit(a):
  """Return the whole text content of the file at path *a*.

  The file is opened in text mode with the platform's default encoding.
  """
  with open(a, "r") as handle:
    content = handle.read()
  return content
# Read the sample log.  NOTE(review): the backslash-separated path looks
# Windows-specific -- confirm before running elsewhere.
u=getShit("example\\0.log")
# Look up the Unicode character name of every character that was read.
u0=[unicode_charnames.charname(x) for x in u]
# Print the names, one per line.
for u1 in u0:
    print(u1)
    # Open question from the author: how can this actually be done?