Example 1
def jatokenize(content):
    ret_list = []
    lines = tagger.parse(content).split('\n')
    for line in lines:
        if line == "EOS":
            break
        line = line.split('\t')
        word = line[2]

        try:
            jtype = unicodedata.name(word[0])
        except (ValueError, IndexError):  # unnamed code point or empty word
            continue
        # Ignore single-character words that are not kanji
        # (the same goes for 'ー' and '*')
        if len(word) == 1 and jtype[0:4] != 'CJK ':
            continue
        # Ignore two-character hiragana words
        if (len(word) == 2 and jtype[0:4] == 'HIRA'
                and unicodedata.name(word[1])[0:4] == 'HIRA'):
            continue
        if jtype[0:4] == 'LATI':
            continue
        if word.isdigit():
            continue
        if (line[3][:2] == '名詞' or line[3][:2] == '動詞'
                or line[3][:2] == '副詞' or line[3][:3] == '形容詞'):
            ofs.write("%s " % word)
            ret_list.append(word.encode('utf8'))
    ofs.write("\n")
    return ret_list
Example 2
def codepoint_simple(arg):
    arg = arg.upper()

    r_label = re.compile('\\b' + arg.replace(' ', '.*\\b') + '\\b')

    results = []
    for cp in xrange(0xFFFF):
        u = unichr(cp)
        try:
            name = unicodedata.name(u)
        except ValueError:
            continue

        if r_label.search(name):
            results.append((len(name), u, cp, name))
    if not results:
        r_label = re.compile('\\b' + arg.replace(' ', '.*\\b'))
        for cp in xrange(0xFFFF):
            u = unichr(cp)
            try:
                name = unicodedata.name(u)
            except ValueError:
                continue

            if r_label.search(name):
                results.append((len(name), u, cp, name))

    if not results:
        return None

    length, u, cp, name = sorted(results)[0]
    return about(u, cp, name)
Example 3
 def test_cjk(self):
     import sys
     import unicodedata
     cases = ((0x3400, 0x4DB5),
              (0x4E00, 0x9FA5))
     if unicodedata.unidata_version >= "4.1":
         cases = ((0x3400, 0x4DB5),
                  (0x4E00, 0x9FBB),
                  (0x20000, 0x2A6D6))
     for first, last in cases:
         # Test at and inside the boundary
         for i in (first, first + 1, last - 1, last):
             charname = 'CJK UNIFIED IDEOGRAPH-%X'%i
             char = ('\\U%08X' % i).decode('unicode-escape')
             assert unicodedata.name(char) == charname
             assert unicodedata.lookup(charname) == char
         # Test outside the boundary
         for i in first - 1, last + 1:
             charname = 'CJK UNIFIED IDEOGRAPH-%X'%i
             char = ('\\U%08X' % i).decode('unicode-escape')
             try:
                 unicodedata.name(char)
             except ValueError, e:
                 assert e.message == 'no such name'
             raises(KeyError, unicodedata.lookup, charname)
Example 4
 def format(self, stream, args):
     char = unicode(args.next())
     if len(char) != 1:
         raise TypeError("expected single character")
     if self.atsign:
         if char in python_escapes:
             stream.write('"\\%s"' % python_escapes[char])
         else:
             try:
                 stream.write('u"\\N{%s}"' % unicodedata.name(char))
             except ValueError:
                 stream.write(repr(char))
     else:
         if unicodedata.category(char).startswith("C"):
             try:
                 stream.write(unicodedata.name(char))
             except ValueError:
                 code = ord(char)
                 if code in ascii_control_chars:
                     i = 1 if self.colon else 0
                     stream.write(ascii_control_chars[code][i])
                 else:
                     raise FormatError("unprintable character")
         else:
             stream.write(char)
Example 5
 def _do_write(fname, variable, version, date, table):
     print("writing {} ..".format(fname))
     import unicodedata
     import datetime
     import string
     utc_now = datetime.datetime.now(tz=datetime.timezone.utc)
     INDENT = 4
     with open(fname, 'w') as fp:
         fp.write("# Generated: {iso_utc}\n"
                  "# Source: {version}\n"
                  "# Date: {date}\n"
                  "{variable} = (".format(iso_utc=utc_now.isoformat(),
                                          version=version,
                                          date=date,
                                          variable=variable))
         for start, end in table:
             ucs_start, ucs_end = chr(start), chr(end)  # chr, not unichr: the function already requires Python 3 (datetime.timezone)
             hex_start, hex_end = ('0x{0:04x}'.format(start),
                                   '0x{0:04x}'.format(end))
             try:
                 name_start = string.capwords(unicodedata.name(ucs_start))
             except ValueError:
                 name_start = u''
             try:
                 name_end = string.capwords(unicodedata.name(ucs_end))
             except ValueError:
                 name_end = u''
             fp.write('\n' + (' ' * INDENT))
             fp.write('({0}, {1},),'.format(hex_start, hex_end))
             fp.write('  # {0:24s}..{1}'.format(
                 name_start[:24].rstrip() or '(nil)',
                 name_end[:24].rstrip()))
         fp.write('\n)\n')
     print("complete.")
Example 6
 def data(self, index, role ):
     global UC_CAT_EXPAND, COL_ALIGNMENT, COL_TOOLTIPS
     (char, count) = self.chardata.get_tuple(index.row())
     if role == Qt.DisplayRole : # request for actual data
         if 0 == index.column():
             return char
         elif 1 == index.column():
             return '0x{0:04x}'.format(ord(char))
         elif 2 == index.column():
             return count
         elif 3 == index.column():
             if char in C.NAMED_ENTITIES :
                 return '&' + C.NAMED_ENTITIES[char] + ';'
             else:
                 return '&#{0:d};'.format(ord(char))
         elif 4 == index.column():
             return UC_CAT_EXPAND[unicodedata.category(char).lower()]
         else: # assuming column is 5, unicode name
             return unicodedata.name(char,'no name?').title()
     elif (role == Qt.TextAlignmentRole) :
         return COL_ALIGNMENT[index.column()]
     elif (role == Qt.ToolTipRole) or (role == Qt.StatusTipRole) :
         if index.column() < 5 :
             return COL_TOOLTIPS[index.column()]
         # For column 5, the tooltip is the name string, because a narrow
         # column may not expose the entire name any other way.
         return unicodedata.name(char,'no name?').title()
     # Sorry, we don't support other roles
     return None
Example 7
def showdict(data, indent):
    first=True
    for key in sorted(data.keys()):
        value=data[key]
        if first:
            first=False
        else:
            print
        print " "*max(indent,0) + "("+key,
        # Sneaky trick: we don't want to go newline-indent over and
        # over for long sequences, i.e. cases where there is only
        # one possible follower.  So we skip the newlines in those
        # cases, and tell the next-lower iteration not to do the whole
        # indent thing by passing a negative indent.  We don't just
        # pass 0 or 1 because if another iteration *further down*
        # turns out not to be an only case, it will need to know
        # the right indent to pass along.  So a case like 
        # R-O-{CK|LL}, the O is unique after the R, so no linefeed,
        # but then the {C|L} are not unique after the O.
        if type(value)==dict:
            if len(value)>1:
                print ""
                showdict(value, abs(indent)+4),
            else:
                showdict(value, -(abs(indent)+4)),
        else:
            print "    "+value.encode('utf-8'),
            if "-n" in sys.argv:
                try:
                    print unicodedata.name(value),
                except:
                    pass
        print ")",
Example 8
def extractKeyword(text,word_class=["名詞","形容詞"]):
	tmp = splitTag(text) # extract hashtags first
	text = tmp[0]
	keywords = tmp[1]
	tagger = MeCab.Tagger('-Ochasen')
	node = tagger.parseToNode(text.encode('utf-8'))
	while node:
		try:
			if node.feature.split(',')[0] in word_class:
			#print node.surface
				uniname = node.surface.decode('utf-8')[0] # first character of the word; keep only digits, hiragana, katakana, kanji, and Latin letters
				uname = unicodedata.name(uniname)
				if uname.startswith(("HIRAGANA", "KATAKANA", "HALFWIDTH KATAKANA", "CJK", "LATIN", "DIGIT")):
					term = node.surface.replace('*','*') # swap for full-width variants, like the two lines below
					term = term.replace('"','”')
					term = term.replace("'","’")
					keywords.append(term.decode('utf-8'))
					#print node.surface.decode('utf-8')
		except Exception as e:
			print "-"*10
			print "エラー(MeCab)"
			print node.surface
			print str(type(e))
			print str(e.args)
			print e.message
			print str(e)
			print "-"*10
		node = node.next
	return keywords
Example 9
def safe_path(origtitle):
    title = safe_path_component(ftfy(origtitle))
    
    if len(title) == 0:
        title = origtitle = u'_'

    if title.startswith(u'-') or title.startswith(u'.'):
        title = u'_' + title
    try:
        charname = safe_path_component(unicodedata.name(origtitle[0]))
    except ValueError:
        charname = u'UNKNOWN'
    category = charname.split('_')[0]

    # some ridiculous stuff to give every article a unique name that can be
    # stored on multiple file systems and tab-completed
    if len(origtitle) == 1:
        pieces = [u'single_character', category, charname + '.json']
    else:
        try:
            charname2 = safe_path_component(unicodedata.name(origtitle[1]))
        except ValueError:
            charname2 = u'UNKNOWN'
        text_to_encode = unicodedata.normalize("NFKD", safe_path_component(title[:64]))
        finalpart = text_to_encode.encode('punycode').rstrip('-')
        pieces = [charname, charname2, finalpart + '.json']
    path = u'/'.join(pieces)
    return path
Example 10
def report_code_points(char_class, code_point_list, text=''):
    '''Report all code points which have been added to or removed from a
    character class.
    '''
    for code_point in sorted(code_point_list):
        if isinstance(code_point, int):
            print('%(char_class)s: %(text)s: %(char)s %(code_point)s %(name)s'
                  %{'text': text,
                    'char': chr(code_point),
                    'char_class': char_class,
                    'code_point': hex(code_point),
                    'name': unicodedata.name(chr(code_point), 'name unknown')})
        else:
            print(('%(char_class)s: %(text)s: '
                   + '%(char0)s → %(char1)s '
                   + '%(code_point0)s → %(code_point1)s '
                   + '%(name0)s → %(name1)s') %{
                'text': text,
                'char_class': char_class,
                'char0': chr(code_point[0]),
                'code_point0': hex(code_point[0]),
                'name0': unicodedata.name(chr(code_point[0]), 'name unknown'),
                'char1': chr(code_point[1]),
                'code_point1': hex(code_point[1]),
                'name1': unicodedata.name(chr(code_point[1]), 'name unknown')
            })
Example 11
def get_unicode_str(size=10, max_char=0xFFFF, onlyNormalized=False, includeUnexisting=False):
    '''
    Generates a valid (for the current OS) Unicode file name.
    Note: if includeUnexisting==True, the files may fail to synchronize.
    '''
    if platform.system() == "Windows":
        # Unicode characters 1 through 31, as well as quote ("), less than (<), greater than (>), pipe (|), backspace (\b), null (\0) and tab (\t).
        exclude = string.punctuation + u"\t" +  u''.join([unichr(x) for x in range(0, 32)])
    else:
        # I guess it mainly depends on fs type
        #exclude = u"/" + u"." + u''.join([unichr(x) for x in range(0, 1)])
        exclude = u"/" + u"." + u''.join([unichr(x) for x in range(0, 32)])


    name = u""
    while len(name) < size:
        c = unichr(random.randint(0, max_char))
        if c not in exclude:
            try:
                if not includeUnexisting:
                    unicodedata.name(c) # raises ValueError for an unassigned code point
                if onlyNormalized:
                    name = name + unicodedata.normalize('NFC',c) #only normalized chars
                else:
                    name = name + c
            except ValueError:
                pass
    return name
Example 12
    def test_cjk(self):
        import sys
        import unicodedata

        cases = ((0x3400, 0x4DB5), (0x4E00, 0x9FA5))
        if unicodedata.unidata_version >= "5":  # don't know the exact limit
            cases = ((0x3400, 0x4DB5), (0x4E00, 0x9FCB), (0x20000, 0x2A6D6), (0x2A700, 0x2B734))
        elif unicodedata.unidata_version >= "4.1":
            cases = ((0x3400, 0x4DB5), (0x4E00, 0x9FBB), (0x20000, 0x2A6D6))
        for first, last in cases:
            # Test at and inside the boundary
            for i in (first, first + 1, last - 1, last):
                charname = "CJK UNIFIED IDEOGRAPH-%X" % i
                char = ("\\U%08X" % i).decode("unicode-escape")
                assert unicodedata.name(char) == charname
                assert unicodedata.lookup(charname) == char
            # Test outside the boundary
            for i in first - 1, last + 1:
                charname = "CJK UNIFIED IDEOGRAPH-%X" % i
                char = ("\\U%08X" % i).decode("unicode-escape")
                try:
                    unicodedata.name(char)
                except ValueError, e:
                    assert e.message == "no such name"
                raises(KeyError, unicodedata.lookup, charname)
Example 13
 def test_cjk(self):
     import sys
     if sys.maxunicode < 0x10ffff:
         skip("requires a 'wide' python build.")
     import unicodedata
     cases = ((0x3400, 0x4DB5),
              (0x4E00, 0x9FA5))
     if unicodedata.unidata_version >= "4.1":
         cases = ((0x3400, 0x4DB5),
                  (0x4E00, 0x9FBB),
                  (0x20000, 0x2A6D6))
     for first, last in cases:
         # Test at and inside the boundary
         for i in (first, first + 1, last - 1, last):
             charname = 'CJK UNIFIED IDEOGRAPH-%X'%i
             assert unicodedata.name(unichr(i)) == charname
             assert unicodedata.lookup(charname) == unichr(i)
         # Test outside the boundary
         for i in first - 1, last + 1:
             charname = 'CJK UNIFIED IDEOGRAPH-%X'%i
             try:
                 unicodedata.name(unichr(i))
             except ValueError:
                 pass
             raises(KeyError, unicodedata.lookup, charname)
Example 14
def clean_Ustring_fromU(string):
    from unicodedata import name, normalize
    gClean = ''
    for ch in u''.join(string.decode('utf-8', 'ignore')):
        try:
            if name(ch).startswith('LATIN') or name(ch) == 'SPACE':
                gClean = gClean + ch
            else: # replace non-Latin characters with spaces
                gClean = gClean + ' '
        except ValueError: # the character has no name in the Unicode database
            gClean = gClean + ' '
    
    try: # Trying different cases for bad input documents.
        normalized_string = normalize('NFKC', gClean.lower())
    except TypeError:
        #sys.stderr.write('Bad formed string at the first attempt\n')
        try:
            range_error = 999
            normalized_string = normalize('NFKC', gClean[0:range_error].lower()) # write up to 999 characters if available
        except TypeError:
            #sys.stderr.write('\nThe wrong string at the second attempt: before %s words' % range_error)
            try:
                range_error = 99
                normalized_string = normalize('NFKC', gClean[0:range_error].lower())
            except TypeError:
                #sys.stderr.write('\nThe wrong string at the third attempt: before %s words' % range_error)
                try:
                    range_error = 49
                    normalized_string = normalize('NFKC', gClean[0:range_error].lower())
                except TypeError:
                    #sys.stderr.write('\nIt was not possible to form the output file after three attempts. Fatally bad file')
                    normalized_string = '# Fatally bad File\n'
    return normalized_string.split() # Return the normalized document as a list of tokens.
Example 15
def main():

    # get files 
    files = []
    for i in range(1,29):
        if i < 26:
            files.append("db/Minna_no_nihongo_1.%02d.txt" % i)
        else: 
            files.append("db/Minna_no_nihongo_2.%02d.txt" % i)

    # get words from files
    words = get_words_from_files(files)


    # add words to network
    G=nx.Graph()
    for w in words:
        G.add_node(w)
        G.node[w]['furigana'] = words[w]['furigana']
        G.node[w]['meaning'] = words[w]['meaning']
        G.node[w]['chapter'] = words[w]['chapter']

    # to make statistics
    nbins, dmin, dmax = 20, 0, 1
    hist, edges = np.histogram([0], bins=nbins, range=(dmin, dmax))

    # adding edges
    words = G.nodes()
    print("Total number of words: ",len(words))

    for word1, word2 in itertools.combinations(words,2):
        for w1 in word1:
            for w2 in word2:
                if "CJK UNIFIED" in ud.name(w1) and "CJK UNIFIED" in ud.name(w2):
                    f1, f2 = fingerprint[w1], fingerprint[w2]  # 'fingerprint' is assumed to be defined elsewhere in the project
                    match = SequenceMatcher(None, f1, f2 , autojunk=True)
                    ratio = match.ratio()

                    # add data to histogram
                    new_hist, edges = np.histogram(ratio, bins=nbins, range=(dmin, dmax))
                    hist += new_hist
                    
                    if ratio > 0.8:                      
                        # G.add_edge(word1, word2, weight=5*ratio-4) # 0.8 - 1 --> 0 - 1
                        G.add_edge(word1, word2, weight=4*ratio-3.2) # 0.8 - 1 --> 0 - 0.8
                        break

    # plot data
    score = 0.5*(edges[1:] + edges[:-1])
    plt.plot(score, hist)
    plt.xlabel("score")
    plt.ylabel("histogram")
    plt.show()


    G = sorted(nx.connected_component_subgraphs(G), key = len, reverse=True)

    print("Total number of words connected: ", len(G[0].nodes()))
    nx.write_graphml(G[0], "kanjis.graphml", encoding='utf-8', prettyprint=True)
Example 16
 def is_in_alphabet(self, uchr, alphabet):
     if self.no_memory:
         return not uchr.isalpha() or alphabet in ud.name(uchr)
     try: 
         return self.alphabet_letters[alphabet][uchr]
     except KeyError:
         return self.alphabet_letters[alphabet].setdefault(
             uchr, alphabet in ud.name(uchr))
Example 17
def codePointToCharacter(cp):
    if cp <= maxSupportedCodePoint():
        char = unichr(cp)
        if cp > 0xFF:
            # unicodedata has no names for control characters, so skip the
            # name test for the low (Latin-1) range at least
            unicodedata.name(char)  # raises ValueError for unnamed code points
        return char
Example 18
def symbolEq(s1,s2):
	if len(s1) != len(s2): return False
	if len(s1) == 0: return False
	try:
		x1 = unicodedata.name(u'%s' % s1.decode('utf-8'))
		x2 = unicodedata.name(u'%s' % s2.decode('utf-8'))
	except:
		return False
	return x1 == x2
Example 19
	def endElement(self,name):
		if name=='ar':
			self.in_arContent=False
			
			#Now store the entry:
			try:
				
				ch=self.normalize_whitespace(self.article)
				if not len(ch):return
				
				lidx=ch.index('[')
				ridx=ch.index(']')
				self.romanization=ch[lidx+1:ridx]
				split_romanization=string.split(self.romanization,u' ',199)
				self.translation=ch[ridx+1:]
				
				cjktraditional=[]
				cjksimplified=[]
				for cidx in range(len(self.splitkey[0])):
					cjktraditional.append(unicodedata.name(self.splitkey[0][cidx]))
					cjksimplified.append(unicodedata.name(self.splitkey[1][cidx]))
					
				#print self.romanization,self.translation
				entry={
					'traditional':self.splitkey[0],		#uchar string
					'simplified':self.splitkey[1],		#uchar string
					'cjktraditional':cjktraditional,
					'cjksimplified':cjksimplified,
					'romanization':split_romanization,	#list of morphemes
					'frequencies':[],					#filled by post-process with romanized morpheme frequencies
					'translation':self.translation,
				}

				if self.dict.has_key(entry['traditional']):
					#print 'already have: ',`entry['traditional']`,entry['romanization']#fontset more likely to have traditional, if any
					#print 'proof:',self.dict[entry['traditional']]['romanization']
					pass
				else:self.dict[entry['traditional']]=entry
				
				
				#Add to distro:
				for item in entry['traditional']:
					try:self.dist[item]+=1
					except:self.dist[item]=1
				
				
				if math.fmod(len(self.dict.keys()),100)==0:
					msglist=[
						"Words  :%6d"%(len(self.dict.keys())),
						"Symbols:%6d"%(len(self.dist.keys()))
					]
					self.progress_message(msglist)
				
			except Exception,e:
				if DEBUG:print e
				
			self.article=u''
Example 20
    def is_unicode(self, char):
        # http://docs.python.org/2/library/unicodedata.html
        is_unicode = True
        try:
            unicodedata.name(unicode(char))
        except ValueError:
            is_unicode = False

        return is_unicode
Example 21
    def test_parse_rand_utf16(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (1000, 1, 'cA', 120),
            (1000, 1, 'cG', 120),
            (1000, 1, 'cH', 120),
            ]

        print "What about messages to log (INFO) about unmatched quotes (before eol)"
        # got this ..trying to avoid for now
        # Exception: rjson error in parse: Argument 'source_key' error: Parser setup appears to be broken, got AUTO

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', header=0,
                hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=60)
        
            print "inspect:", h2o.dump_json(inspect)
            numRows = inspect['numRows']
            self.assertEqual(numRows, rowCount, msg='Wrong numRows: %s %s' % (numRows, rowCount))
            numCols = inspect['numCols']
            self.assertEqual(numCols, colCount, msg='Wrong numCols: %s %s' % (numCols, colCount))

            for k in range(colCount):
                naCnt = inspect['cols'][k]['naCnt']
                self.assertEqual(0, naCnt, msg='col %s naCnt %d should be 0' % (k, naCnt))

                stype = inspect['cols'][k]['type']
                self.assertEqual("Enum", stype, msg='col %s type %s should be Enum' % (k, stype))

        #**************************
        # for background knowledge; (print info)
        import unicodedata
        # u = unichr(233) + unichr(0x0bf2) + unichr(3972) + unichr(6000) + unichr(13231)
        # left and right single quotes
        u = unichr(0x201c) + unichr(0x201d)
        # preferred apostrophe (right single quote)
        u = unichr(0x2019) 
        u = unichr(0x2018) + unichr(6000) + unichr(0x2019)
        # grave and acute?
        # u = unichr(0x60) + unichr(0xb4)
        # don't do this. grave with apostrophe http://www.cl.cam.ac.uk/~mgk25/ucs/quotes.html
        # u = unichr(0x60) + unichr(0x27)

        for i, c in enumerate(u):
            print i, '%04x' % ord(c), unicodedata.category(c),
            print unicodedata.name(c)
Example 22
def unicode(ctx, pline, userdata):
    args = pline.trailing.split()[1:]
    if not args:
        ctx.command('/say No symbol given.')
        return

    if args[0].startswith('U+'):
        codepoint = int(args[0][2:], 16)
        char = unichr(codepoint)
        try:
            name = unicodedata.name(char)
        except ValueError:
            name = 'n/a'
        if codepoint < 32:
            char = '-'
        ctx.command(u'/say {}, Name: {}'.format(char, name))
        return

    reststr = ' '.join(args)
    if all(char in string.ascii_uppercase + string.digits + ' -'
           for char in reststr):
        try:
            char = unicodedata.lookup(reststr.strip())
        except KeyError:
            pass
        else:
            codepoint = ord(char)
            ctx.command(u'/say {}, Codepoint: U+{:X}'.format(char, codepoint))
            return

    symbol = args[0].decode(ctx.encoding)
    nfc_symbol = unicodedata.normalize(u'NFC', symbol)
    if len(nfc_symbol) > 1:
        ctx.command('/say Too many symbols.')
        return
    try:
        name = unicodedata.name(nfc_symbol)
    except TypeError:
        ctx.command('/say Unknown character or invalid input.')
        return
    except ValueError:
        name = 'n/a'
    nfd_symbol = unicodedata.normalize(u'NFD', symbol)
    category = unicodedata.category(symbol)
    codepoint = ord(nfc_symbol)
    outstr = u'Codepoint: U+{:X}, Name: {}, Category: {}.'.format(codepoint, name, category)
    if len(nfd_symbol) > len(nfc_symbol):
        outstr += u' (Compose: '
        slist = []
        for char in nfd_symbol:
            codepoint = ord(char)
            slist.append(
                u'U+{:X}'.format(codepoint))
        outstr += u', '.join(slist) + ')'
    ctx.command(u'/say {}'.format(outstr))
Example 23
def weight_for_leven_edits(wordFrom, wordTo, edits, weight_rules, max_weight, debug=False):
    if (debug):
        print
        print
        print "Weight Analysis"
        print "word in: ", wordFrom
        dump(wordFrom)
        print
        print "word to: ", wordTo
        dump(wordTo)
    cumulative_weight = 0
    for edit in edits:
        edit_weight = 0
        if (debug):
            print edit
        (command, char_num_in_word_one, char_num_in_word_two) = edit
        if (char_num_in_word_one > (len(wordFrom) - 1)):
            char_in_word_one = ''
        else:
            char_in_word_one = wordFrom[char_num_in_word_one]
        if (char_num_in_word_two > (len(wordTo) - 1)):
            char_in_word_two = ''
        else:
            char_in_word_two = wordTo[char_num_in_word_two]
        if (debug):
            print '\t', command
            if char_in_word_one:
                print '\t', unicodedata.name(char_in_word_one)
            else:
                print '\tx'
            if char_in_word_two:
                print '\t', unicodedata.name(char_in_word_two)
            else:
                print '\tx'
        if (command == 'replace'):
            edit_weight = 10
        elif (command == 'delete'):
            edit_weight = 15
        elif (command == 'insert'):
            edit_weight = 18
        else:
            raise ValueError('unknown Levenshtein edit operation: ' + command)
        for weight_rule in weight_rules:
            if (weight_rule[0] == command) and (weight_rule[1] == '*' or char_in_word_one in weight_rule[1]) and (weight_rule[2] == '*' or char_in_word_two in weight_rule[2]):
                if (debug):
                    print '\t weight rule applied:'
                    print '\t', weight_rule
                edit_weight = weight_rule[3]
                break
        if (debug):
            print '\tweight: ', edit_weight
        cumulative_weight += edit_weight
        if (cumulative_weight >= max_weight):
            break
    return cumulative_weight
Example 24
def breakdown_into_validwords(sentence):
    """
    与えられた文章(文字列)を形態素解析してリスト返却します
    - 入出力例
    -   IN:  "今日はいい天気ですね"
    -   OUT: ['今日', '天気']
    """
    ret_list = []

    if sentence == '' or not(isinstance(sentence, str)):
        return ret_list

    sentence = sentence.replace("\n", "")

    model = MeCab.Model_create("-Ochasen -d mecab-ipadic-neologd")
    tagger = model.createTagger()
    lines = tagger.parse(sentence).split('\n')
    for line in lines:
        if line == "EOS":
            break
        line = line.split('\t')
        word = line[2]

        # Exclude sentences containing obscene words
        if word in ['ちんちん', 'ちんこ', 'キンタマ', 'きんたま', '痴漢']:
            return []
        # TODO: build the exclusion list properly
        if word in ['今日', '俺', '私', '僕', '人', '思う', 'ちゃう',
                '何', '行く', 'もらう', 'られる', 'くれる', 'すぎる']:
            continue
        try:
            jtype = unicodedata.name(word[0])
        except (ValueError, IndexError):
            continue
        # Ignore single-character words that are not kanji
        # (the same goes for 'ー' and '*')
        if len(word) == 1 and jtype[0:4] != 'CJK ':
            continue
        # Ignore two-character hiragana words
        if (len(word) == 2 and jtype[0:4] == 'HIRA'
                and unicodedata.name(word[1])[0:4] == 'HIRA'):
            continue
        # Ignore hiragana words containing the long-vowel mark
        if jtype[0:4] == 'HIRA' and 'ー' in word:
            continue
        if jtype[0:4] == 'LATI':
            continue
        if word.isdigit():
            continue
        if (line[3][:2] == '名詞' or line[3][:2] == '動詞'
                or line[3][:2] == '副詞' or line[3][:3] == '形容詞'):
            ret_list.append(word)
            # print(word)

    return ret_list
Example 25
def memo_to_fn(x):
    bits = []
    print x
    for l in x:
        if "LETTER" in unicodedata.name(l):
            bits.append(l)
        elif "DIGIT" in unicodedata.name(l):
            bits.append(l)
        elif bits and bits[-1] != "_":  # guard: bits may still be empty here
            bits.append("_")
    return "".join(bits)
Example 26
 def is_accepted_char(uchar):
     if unicodedata.name(uchar).startswith('CJK'):
         return True
     if unicodedata.name(uchar).startswith('LATIN'):
         return True
     if unicodedata.name(uchar).startswith('SPACE'):
         return True
     if unicodedata.name(uchar).startswith('DIGIT'):
         return True
     if unicodedata.name(uchar).startswith('GREEK'):
         return True
     return False
Example 27
def print_unicode_entry(n):
    u = get_unicode_using_unicode_escape(n)
    print '{:8d} {:8x}'.format(n, n),
    print u.encode('utf8'), unicodedata.category(u),
    try:
        print unicodedata.name(u),
    except ValueError:
        print 'unicodedata has no name defined',
    try:
        print unicodedata.digit(u)
    except ValueError:
        print 'unicodedata has no numeric value'
Example 28
def giphy_me():
  terms = ircmsg.partition('giphy me ')[2]
  # translate from emoji to actual words
  if (terms.encode('utf-8')[:1]==b'\xf0'):
    terms = uni.name(terms.encode('utf-8')[:4].decode('utf-8'))
  if (terms.encode('utf-8')[:1]==b'\xe2'):
    terms = uni.name(terms.encode('utf-8')[:3].decode('utf-8'))
  if (terms == "blerg" or terms == "blergh"):
    terms = "30rock"
  terms = terms.lower()
  print("searching giphy for "+terms)
  sendmsg(current_channel, search_gifs(terms))
Example 29
def is_cyrillic(text):
    count_cyrillic = 0
    count_latin = 0
    for c in text:
        try:
            if "CYRILLIC" in unicodedata.name(c):
                count_cyrillic += 1
            if "LATIN" in unicodedata.name(c):
                count_latin += 1
        except ValueError:  # unnamed code point
            pass
    return count_cyrillic >= .3 * (count_latin + count_cyrillic)
Example 30
	def _strip_noise_bytes(self, obj, replace='_'):
		'''Make sure there aren't any random weird chars that don't belong to any alphabet.
			Only ASCII non-letters are allowed, as fancy symbols don't seem to work well with curses.'''
		if not isinstance(obj, str): obj = str(obj)
		obj_ucs = list()
		for uc in obj:
			try:
				unicodedata.name(uc)
				if unicodedata.category(uc) != 'Ll': uc.encode('ascii')
			except (ValueError, UnicodeEncodeError):
				if replace: obj_ucs.append(replace)
			else: obj_ucs.append(uc)
		return ''.join(obj_ucs)
Example 31
s = '100'

print(s.isdecimal())

s = '0xF'
print(s.isdecimal())

s = '10.55'
print(s.isdecimal())

s = ''
print(s.isdecimal())

s = '1٠2𝟜'  # U+0660, U+1D7DC
print(s.isdecimal())
print(int(s))

import unicodedata

count = 0
for codepoint in range(2**16):
    ch = chr(codepoint)
    if ch.isdecimal():
        print(u'{:04x}: {} ({})'.format(codepoint, ch,
                                        unicodedata.name(ch, 'UNNAMED')))
        count = count + 1
print(f'Total Number of Decimal Unicode Characters = {count}')
Example 32
# BEGIN NUMERICS_DEMO
import unicodedata
import re

re_digit = re.compile(r'\d')

sample = '1\xbc\xb2\u0969\u136b\u216b\u2466\u2480\u3285'

for char in sample:
    print(
        'U+%04x' % ord(char),  # <1>
        char.center(6),  # <2>
        're_dig' if re_digit.match(char) else '-',  # <3>
        'isdig' if char.isdigit() else '-',  # <4>
        'isnum' if char.isnumeric() else '-',  # <5>
        format(unicodedata.numeric(char), '5.2f'),  # <6>
        unicodedata.name(char),  # <7>
        sep='\t')
# END NUMERICS_DEMO
Example 33
async def search(q: str):  # <7>
    chars = app.state.index.search(q)
    return ({'char': c, 'name': name(c)} for c in chars)  # <8>
Example 34
 def test_named_sequences_names_in_pua_range(self):
     # We are storing named seq in the PUA 15, but their names shouldn't leak
     for cp in range(0xf0100, 0xf0fff):
         with self.assertRaises(ValueError) as cm:
             unicodedata.name(chr(cp))
         self.assertEqual(str(cm.exception), 'no such name')
Example 35
class ucp:
    name = unicodedata.name(u'क')
    category = unicodedata.category(u'क')
    medial = 'ka'
    final = 'k'
Example 36
def extract_currency(text_list, left_chars=20, right_chars=20):
    """Return a summary dictionary about currency symbols in ``text_list``

    Get a summary of the number of currency symbols, their frequency,
    the top ones, and more.

    :param list text_list: A list of text strings.
    :param int left_chars: The number of characters to extract, to the
        left of the symbol when getting :attr:`surrounding_text`
    :param int right_chars: The number of characters to extract, to the
        right of the symbol when getting :attr:`surrounding_text`
    :returns summary: A dictionary with various stats about currencies

    >>> posts = ['today ₿1 is around $4k', 'and ₿ in £ & €?', 'no idea']
    >>> currency_summary = extract_currency(posts)
    >>> currency_summary.keys()
    dict_keys(['currency_symbols', 'currency_symbols_flat',
    'currency_symbol_counts', 'currency_symbol_freq',
    'top_currency_symbols', 'overview', 'currency_symbol_names'])

    >>> currency_summary['currency_symbols']
    [['₿', '$'], ['₿', '£', '€'], []]

    A simple extract of currencies from each of the posts. An empty list if
    none exist

    >>> currency_summary['currency_symbols_flat']
    ['₿', '$', '₿', '£', '€']

    All currency symbols in one flat list.

    >>> currency_summary['currency_symbol_counts']
    [2, 3, 0]

    The count of currency symbols per post.

    >>> currency_summary['currency_symbol_freq']
    [(0, 1), (2, 1), (3, 1)]

    Shows how many posts had 0, 1, 2, 3, etc. currency symbols
    (number_of_symbols, count)

    >>> currency_summary['top_currency_symbols']
    [('₿', 2), ('$', 1), ('£', 1), ('€', 1)]

    >>> currency_summary['currency_symbol_names']
    [['bitcoin sign', 'dollar sign'], ['bitcoin sign', 'pound sign',
    'euro sign'], []]

    >>> currency_summary['surrounding_text']
    [['today ₿1 is around $4k'], ['and ₿ in £ & €?'], []]

    >>> extract_currency(posts, 5, 5)['surrounding_text']
    [['oday ₿1 is ', 'ound $4k'], ['and ₿ in £', ' & €?'], []]

    >>> extract_currency(posts, 0, 3)['surrounding_text']
    [['₿1 i', '$4k'], ['₿ in', '£ & ', '€?'], []]

    >>> currency_summary['overview']
    {'num_posts': 3,
    'num_currency_symbols': 5,
    'currency_symbols_per_post': 1.6666666666666667,
    'unique_currency_symbols': 4}
    """
    summary = extract(text_list, CURRENCY, 'currency_symbol')
    summary['currency_symbol_names'] = [[name(c).lower() for c in x] if x
                                        else [] for x in
                                        summary['currency_symbols']]
    surrounding_text_regex = re.compile(r'.{0,' + str(left_chars) + '}' +
                                        CURRENCY_RAW +
                                        r'.{0,' + str(right_chars) + '}')
    summary['surrounding_text'] = [surrounding_text_regex.findall(text)
                                   for text in text_list]
    return summary
Example 37
s = 'Rod'
print('\nRed String:', s)
print('Type:', type(s), '\tLength:', len(s))

s = s.encode('utf-8')
print('\nEncoded String:', s)
print('Type:', type(s), '\tLength:', len(s))

s = s.decode('utf-8')
print('\nDecoded String:', s)
print('Type', type(s), '\tLength:', len(s))

import unicodedata
for i in range(len(s)):
    print(s[i], unicodedata.name(s[i]), sep=':')

s = b'Gr\xc3\xb6n'
print('\nGreen String:', s.decode('utf-8'))

s = 'Gr\N{LATIN SMALL LETTER O WITH DIAERESIS}n'
print('Green String:', s)
Example 38
	def show_lut(self, platform_name, layout_name, debug_print):
		layout = self.layouts[platform_name][layout_name]
		sorted_keys = sorted(layout)
		table = []
		glyphs = ""
		table.append(["Glyph", "Unicode", "HID code", "modifier+isocode", "modifier+scancode", "Description"])
		for key in sorted_keys:
			mod, keycode, isocode, hidcode, deadkey = layout[key]
			try:
				des = unicodedata.name(chr(key))
			except:
				des = "No Data"
			table.append([chr(key), key, f"{hidcode:#0{4}x}", "+".join(mod) + " " + str(isocode),"+".join(mod) + " " + str(keycode), des])
			glyphs+=chr(key)
		if debug_print:
			for row in table:
				print("{0: >10} {1: >10} {2: >10} {3: >20} {4: >20} {5: >40}".format(*row))			
			print("\r\nAll glyphs:\r\n" + ''.join(sorted(glyphs)))
			
		# Glyphs generated by transforms
		transforms = self.transforms[platform_name][layout_name] 
		sorted_transforms = sorted(transforms)
		glyphs_from_transforms = ""
		for key in sorted_transforms:
			glyphs_from_transforms += key
			
		# Generate raw HID code + modifier to glyph mapping
		hid_to_glyph_lut = {}
		modifier_map = {	'ctrlL':	0x00,
							'shiftL':	KEY_SHIFT,
							'shift':	KEY_SHIFT,
							'atlL':		KEY_RIGHT_ALT,
							'opt':		0x00,
							'cmd':		0x00,
							'ctrlR':	0x00,
							'shiftR':	KEY_SHIFT,
							'altR':		KEY_RIGHT_ALT,
							'cmdR':		0x00}	
		for key in sorted_keys:
			mod, keycode, isocode, hidcode, deadkey = layout[key]
			modifier_mask = 0x00
			for modifier in mod:
				modifier_mask |= modifier_map[modifier]
			hid_to_glyph_lut[chr(key)] = [[modifier_mask,hidcode]]
		#print(hid_to_glyph_lut)
		
		# Part below is to compare with mooltipass mini storage
		mini_lut_array_bin = []
		if debug_print:
			print("\r\nMooltipass Mini Old LUT:")
		mini_modifier_map = {	'ctrlL':	0x00,
								'shiftL':	0x80,
								'shift':	0x80,
								'atlL':		0x40,
								'opt':		0x00,
								'cmd':		0x00,
								'ctrlR':	0x00,
								'shiftR':	0x80,
								'altR':		0x40,
								'cmdR':		0x00}	
		mini_lut = ""
		for key in sorted_keys:
			mod, keycode, isocode, hidcode, deadkey = layout[key]
			modifier_mask = 0x00
			for modifier in mod:
				modifier_mask |= mini_modifier_map[modifier]
			# Europe key hack
			if hidcode == 0x64:
				hidcode = 0x03
			# Apply modifier mask
			hidcode |= modifier_mask
			mini_lut += f"{hidcode:#0{4}x} "
			mini_lut_array_bin.append(hidcode)
		if debug_print:
			print(mini_lut)
		
		# Return dictionary
		return {"mini_lut_bin": mini_lut_array_bin, "covered_glyphs":glyphs, "hid_to_glyph_lut":hid_to_glyph_lut, "glyphs_from_transforms":glyphs_from_transforms, "transforms":transforms}
Example 39
# -*- coding: utf-8 -*-
# Nola
"""
    大小写折叠:
        把所有文本变成小写,再做其他转换。使用str.casefold()方法python3.3新增
        对于只包含lantin1字符的字符串,此方法相当于str.lower()
        唯有两个例外:
            微符号'µ'会变成小写的希腊字母“μ”;德语 Eszett(“sharp s”,ß)会变成“ss”
"""
from unicodedata import name

micro = 'µ'
print(name(micro))
micro_cf = micro.casefold()
print(name(micro_cf))
print(micro, micro_cf)
print('{:-^30}'.format('-'))
eszett = 'ß'
print(name(eszett))
eszett_cf = eszett.casefold()
print(eszett, eszett_cf)
Example 40
s1.add('Melon')

# a frozenset does not support add()
# s5.add('Melon')

print('EX6-1 -', s1, type(s1))
print('EX6-2 -', s2, type(s2))
print('EX6-3 -', s3, type(s3))
print('EX6-4 -', s4, type(s4))
print('EX6-5 -', s5, type(s5))

# declaration optimization: literal vs constructor
from dis import dis

print('EX6-5 -')
print(dis('{10}'))

print('EX6-6 -')
print(dis('set([10])'))

print()
print()

# Set comprehension
from unicodedata import name

print('EX7-1 -')

print({name(chr(i), '') for i in range(0, 256)})

exit()
Example 41
def extract_exclamations(text_list):
    """Return a summary dictionary about exclamation (mark)s in ``text_list``

    Get a summary of the number of exclamation marks, their frequency,
    the top ones, as well the exclamations written/said.

    :param list text_list: A list of text strings.
    :returns summary: A dictionary with various stats about exclamations

    >>> posts = ['Who are you!', 'What is this!', 'No exclamation here?']
    >>> exclamation_summary = extract_exclamations(posts)
    >>> exclamation_summary.keys()
    dict_keys(['exclamation_marks', 'exclamation_marks_flat',
    'exclamation_mark_counts', 'exclamation_mark_freq',
    'top_exclamation_marks', 'overview', 'exclamation_mark_names',
    'exclamation_text'])

    >>> exclamation_summary['exclamation_marks']
    [['!'], ['!'], []]

    A simple extract of exclamation marks from each of the posts. An empty
    list if none exist

    >>> exclamation_summary['exclamation_marks_flat']
    ['!', '!']

    All exclamation marks in one flat list.

    >>> exclamation_summary['exclamation_mark_counts']
    [1, 1, 0]

    The count of exclamation marks per post.

    >>> exclamation_summary['exclamation_mark_freq']
    [(0, 1), (1, 2)]

    Shows how many posts had 0, 1, 2, 3, etc. exclamation marks
    (number_of_symbols, count)

    >>> exclamation_summary['top_exclamation_marks']
    [('!', 2)]

    Might be interesting if you have different types of exclamation marks

    >>> exclamation_summary['exclamation_mark_names']
    [['exclamation mark'], ['exclamation mark'], []]

    >>> exclamation_summary['overview']
    {'num_posts': 3,
    'num_exclamation_marks': 2,
    'exclamation_marks_per_post': 0.6666666666666666,
    'unique_exclamation_marks': 1}

    >>> posts2 = ["don't go there!", 'مرحبا. لا تذهب!', '¡Hola! ¿cómo estás?',
    ... 'a few different exclamation marks! make sure you see them!']

    >>> exclamation_summary = extract_exclamations(posts2)

    >>> exclamation_summary['exclamation_marks']
    [['!'], ['!'], ['¡', '!'], ['!', '!']]
    # might be displayed in opposite order due to RTL exclamation mark
    A simple extract of exclamation marks from each of the posts.
    An empty list if none exist

    >>> exclamation_summary['exclamation_marks_flat']
    ['!', '!', '¡', '!', '!', '!']

    All exclamation marks in one flat list.

    >>> exclamation_summary['exclamation_mark_counts']
    [1, 1, 2, 2]

    The count of exclamation marks per post.

    >>> exclamation_summary['exclamation_mark_freq']
    [(1, 2), (2, 2)]

    Shows how many posts had 0, 1, 2, 3, etc. exclamation marks
    (number_of_symbols, count)

    >>> exclamation_summary['top_exclamation_marks']
    [('!', 5), ('¡', 1)]

    Might be interesting if you have different types of exclamation marks

    >>> exclamation_summary['exclamation_mark_names']
    [['exclamation mark'], ['exclamation mark'],
    ['inverted exclamation mark', 'exclamation mark'],
    ['exclamation mark', 'exclamation mark']]

    >>> exclamation_summary['overview']
    {'num_posts': 4,
    'num_exclamation_marks': 6,
    'exclamation_marks_per_post': 1.5,
    'unique_exclamation_marks': 4}
    """
    summary = extract(text_list, EXCLAMATION_MARK, key_name='exclamation_mark')
    summary['exclamation_mark_names'] = [[name(c).lower() for c in x] if x
                                         else [] for x in
                                         summary['exclamation_marks']]
    summary['exclamation_text'] = [EXCLAMATION.findall(text)
                                   for text in text_list]
    return summary
Example 42
def isKanji(unichar):
    try:
        return unicodedata.name(unichar).find('CJK UNIFIED IDEOGRAPH') >= 0
    except ValueError:
        # a control character
        return False
Example 43
from unicodedata import name

print({chr(i) for i in range(32, 256) if 'SIGN' in name(chr(i), '')})
print({name(chr(i), '') for i in range(32, 256) if 'SIGN' in name(chr(i), '')})
Example 44
def getCharName(ch):
	try:
		return unicodedata.name(ch).title()
	except ValueError:
		return ''
Example 45
    numbers = array.array('h', [-2, -1, 0, 1, 2])
    octets = bytes(numbers)
    print(octets)

    octets = b'Montr\xe9al'
    print(octets.decode('cp1252'))
    print(octets.decode('iso8859_7'))
    print(octets.decode('koi8_r'))
    print(octets.decode('utf_8', errors='replace'))

    for expression in expressions.split():
        value = eval(expression)
        print(expression.rjust(30), '->', repr(value))

    ohm = '\u2126'
    print(name(ohm))
    ohm_c = normalize('NFC', ohm)
    print(name(ohm_c))
    print(ohm == ohm_c)
    print(normalize('NFC', ohm) == normalize('NFC', ohm_c))

    print(nfc_equal('A', 'a'))
    print(fold_equal('A', 'a'))

    for char in sample:
        print('U+%04x' % ord(char),
              char.center(6),
              're_dig' if re_digit.match(char) else '-',
              'isdig' if char.isdigit() else '-',
              'isnum' if char.isnumeric() else '-',
              format(numeric(char), '5.2f'),
              name(char),
              sep='\t')
Example 46
def _combining_class(cp):
    v = unicodedata.combining(unichr(cp))
    if v == 0:
        if not unicodedata.name(unichr(cp)):
            raise ValueError("Unknown character in unicodedata")
    return v
Example 47
def judge_lang(text):
    for ch in text:
        word = unicodedata.name(ch)
        if "CJK UNIFIED" in word or "HIRAGANA" in word or "KATAKANA" in word:
            return("Japanese")  # Japanese if it contains kanji, hiragana, or katakana
    return("English")  # otherwise English
Example 48
def unicode_test(value):
    name = unicodedata.name(value)
    value2 = unicodedata.lookup(name)
    print('value="%s", name="%s", value2="%s"' % (value, name, value2))
Example 49
def unicode2desc(u):
    return(unicodedata.name(u))
Example 50
def unicode_test(value):
    name = unicodedata.name(value)
    value2 = unicodedata.lookup(name)
    print('value="%s", name="%s", value2="%s"' % (value, name, value2))


unicode_test('A')
unicode_test('$')
unicode_test('\u00a2')
unicode_test('\u20ac')
unicode_test('\u2603')

place = 'café'
print(place)

print(unicodedata.name('\u00e9'))
# print(unicodedata.lookup('E WITH ACUTE, LATIN SMALL LETTER'))
print(unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE'))

place = 'caf\u00e9'
print(place)

place = 'caf\N{LATIN SMALL LETTER E WITH ACUTE}'
print(place)

u_umlaut = '\N{LATIN SMALL LETTER U WITH DIAERESIS}'
print(u_umlaut)

drink = 'Gew' + u_umlaut + 'rztraminer'
print('Now I can finally have my', drink, 'in a', place)
Example 51
 def test_ascii_letters(self):
     for char in "".join(map(chr, range(ord("a"), ord("z")))):
         name = "LATIN SMALL LETTER %s" % char.upper()
         code = unicodedata.lookup(name)
         self.assertEqual(unicodedata.name(code), name)
Example 52
 def name(self):
     """Return unicodedata.name."""
     try:
         return unicodedata.name(self.c)
     except (ValueError, TypeError):  # unnamed, or not a single character
         return ''
Example 53
 def test_bmp_characters(self):
     for code in range(0x10000):
         char = chr(code)
         name = unicodedata.name(char, None)
         if name is not None:
             self.assertEqual(unicodedata.lookup(name), char)
Example 54
 def to_string(c):
     digit = format(ord(c), "x")
     name = unicodedata.name(c, "Name not found.")
     return fmt.format(digit, name, c)
Example 55
async def on_raw_reaction_add(payload):
    channel = client.get_channel(payload.channel_id)
    user = client.get_user(payload.user_id)
    emoji = payload.emoji.name
    try:
        message = await channel.fetch_message(payload.message_id)
    except AttributeError:
        return
    if user == client.user:
        pass
    elif message.author == client.user:
        title = message.embeds[0].title
        if "99% Bets's bets" in title:
            try:
                string = unicodedata.name(emoji[0]).split(" ")[1].lower()
                page = nums.index(string) - 1
                embed, pages = discord_api.list_bets(page, "99% Bets", False)
                await message.edit(embed=embed)
            except Exception as error:
                pass
        elif " as a bin" in title:
            try:
                string = unicodedata.name(emoji[0]).split(" ")[1].lower()
                page = nums.index(string) - 1
                embed, pages = discord_api.related_markets_bin(message.embeds[0].title.split('"')[1], page)
                await message.edit(embed=embed)
            except:
                pass
        elif " in the title" in title:
            try:
                string = unicodedata.name(emoji[0]).split(" ")[1].lower()
                page = nums.index(string) - 1
                embed, pages = discord_api.related_markets_title(message.embeds[0].title.split('"')[1], page)
                await message.edit(embed=embed)
            except:
                pass
        elif "Leaderboard" in title:
            try:
                string = unicodedata.name(emoji[0]).split(" ")[1].lower()
                page = nums.index(string) - 1
                embed, pages = discord_api.leaderboard(page)
                await message.edit(embed=embed)
            except Exception as error:
                pass
        elif "'s bets" in title:
            try:
                string = unicodedata.name(emoji[0]).split(" ")[1].lower()
                page = nums.index(string) - 1
                embed, pages = discord_api.list_bets(page, title.split("'")[0], True)
                await message.edit(embed=embed)
            except Exception as error:
                pass

        if unicodedata.name(emoji[0]) == "CLOCKWISE RIGHTWARDS AND LEFTWARDS OPEN CIRCLE ARROWS":
            if "99% Bets's bets" in title:
                page = int(message.embeds[0].description.split("```")[1].split("\n")[0]) - 1
                embed, pages = discord_api.list_bets(page, "99% Bets", False)
                if embed.description != message.embeds[0].description:
                    await message.edit(embed=embed)
            elif " as a bin" in title:
                page = int(message.embeds[0].description.split(" ")[-3]) - 1
                embed, pages = discord_api.related_markets_bin(message.embeds[0].title.split('"')[1], page)
                if embed.description != message.embeds[0].description:
                    await message.edit(embed=embed)
            elif " in the title" in title:
                page = int(message.embeds[0].description.split(" ")[-3]) - 1
                embed, pages = discord_api.related_markets_title(message.embeds[0].title.split('"')[1], page)
                if embed.description != message.embeds[0].description:
                    await message.edit(embed=embed)
            elif "markets with negative risk" in title:
                hidden = message.embeds[0].description.split("```")[1].split("\n")[0]
                num = None if hidden == "None" else int(hidden)
                embed = discord_api.risk_all(num)
                if embed.description != message.embeds[0].description:
                    await message.edit(embed=embed)
            elif "Market risk for" in title:
                hidden = message.embeds[0].description.split("```")[1].split("\n")[0].split(".")
                num = None if hidden[1] == "None" else int(hidden[1])
                mini = True if hidden[2] == "True" else False
                embed = discord_api.risk_market(hidden[0], num, mini)
                if embed.description != message.embeds[0].description:
                    await message.edit(embed=embed)
            elif "Orderbook for " in title:
                hidden = message.embeds[0].description.split("```")[1].split("\n")[0]
                embed = discord_api.orderbook(hidden)
                if embed.description != message.embeds[0].description:
                    await message.edit(embed=embed)
            elif "Market bins for " in title:
                hidden = message.embeds[0].description.split("```")[1].split("\n")[0].replace(' ', '.')
                embed = discord_api.bins(hidden)
                if embed.description != message.embeds[0].description:
                    await message.edit(embed=embed)
    color = "White" if color == WHITE else "Black"
    name = {
        DRAUGHT: "Draught",
        PAWN: "ChessPawn",
        ROOK: "ChessRook",
        KNIGHT: "ChessKnight",
        BISHOP: "ChessBishop",
        KING: "ChessKing",
        QUEEN: "ChessQueen"
    }[kind]
    return globals()[color + name]()


class Piece(str):

    __slots__ = ()


for code in itertools.chain((0x26C0, 0x26C2), range(0x2654, 0x2660)):
    char = chr(code)
    name = unicodedata.name(char).title().replace(" ", "")
    if name.endswith("sMan"):
        name = name[:-4]
    new = (lambda char: lambda Class: Piece.__new__(Class, char))(char)
    new.__name__ = "__new__"
    Class = type(name, (Piece, ), dict(__slots__=(), __new__=new))
    globals()[name] = Class

if __name__ == "__main__":
    main()
Example 57
        .withColumnRenamed("Website Address","site")\
        .select("id","desc","active","cat","site")

extradesc = polione.join(politwo.withColumnRenamed("Description","desc2"), "BN/Registration Number", 'full')\
        .withColumnRenamed("BN/Registration Number","id")\
        .withColumnRenamed("Description","desc1")\
        .select("id","desc1","desc2")

datardd = data.join(extradesc, "id", "left")\
        .rdd\
        .map(lambda x: x.asDict())\
        .map(lambda x: {
            **x,
            "d": "".join(
                ch if unicodedata.name(ch).startswith(
                    ('LATIN', 'DIGIT', 'SPACE', 'APOSTROPHE')
                ) else " " \
                for ch in " ".join(
                    [x['desc'] if x['desc'] else "",
                     x['desc1'] if x['desc1'] else "",
                     x['desc2'] if x['desc2'] else ""]
                )
            )
        })\
        .map(lambda x: {
            **x,
            'd': " ".join(w.lower() if w.lower() not in stopwords else "" for w in x['d'].split())
        })\
        .map(lambda x: {**x, 'd': " ".join(re.sub(r'.*\d.*', '', w) for w in x['d'].split())})\
        .map(lambda x: {**x, 'd': " ".join(re.sub(r'.*\'.*', '', w) for w in x['d'].split())})\
        .map(lambda x: {**x, 'd': re.sub(r' +',' ',x['d']).strip()})\
Example 58
def is_latin(uchr):
    try: return latin_letters[uchr]
    except KeyError:
        return latin_letters.setdefault(uchr, 'LATIN' in ud.name(uchr))
Example 59
def extract_questions(text_list):
    """Return a summary dictionary about question(mark)s in ``text_list``

    Get a summary of the number of question marks, their frequency,
    the top ones, as well the questions asked.

    :param list text_list: A list of text strings.
    :returns summary: A dictionary with various stats about questions

    >>> posts = ['How are you?', 'What is this?', 'No question Here!']
    >>> question_summary = extract_questions(posts)
    >>> question_summary.keys()
    dict_keys(['question_marks', 'question_marks_flat',
    'question_mark_counts', 'question_mark_freq', 'top_question_marks',
    'overview', 'question_mark_names', 'question_text'])

    >>> question_summary['question_marks']
    [['?'], ['?'], []]

    A simple extract of question marks from each of the posts. An empty
    list if none exist

    >>> question_summary['question_marks_flat']
    ['?', '?']

    All question marks in one flat list.

    >>> question_summary['question_mark_counts']
    [1, 1, 0]

    The count of question marks per post.

    >>> question_summary['question_mark_freq']
    [(0, 1), (1, 2)]

    Shows how many posts had 0, 1, 2, 3, etc. question marks
    (number_of_symbols, count)

    >>> question_summary['top_question_marks']
    [('?', 2)]

    Might be interesting if you have different types of question marks
    (Arabic, Spanish, Greek, Armenian, or other)

    >>> question_summary['question_mark_names']
    [['question mark'], ['question mark'], []]

    >>> question_summary['overview']
    {'num_posts': 3,
    'num_question_marks': 2,
    'question_marks_per_post': 0.6666666666666666,
    'unique_question_marks': 1}

    >>> posts2 = ['Πώς είσαι;', 'مرحباً. كيف حالك؟', 'Hola, ¿cómo estás?',
    ... 'Can you see the new questions? Did you notice the different marks?']

    >>> question_summary = extract_questions(posts2)

    >>> question_summary['question_marks']
    [[';'], ['؟'], ['¿', '?'], ['?', '?']]
    # might be displayed in opposite order due to RTL question mark
    A simple extract of question marks from each of the posts. An empty list if
    none exist

    >>> question_summary['question_marks_flat']
    [';', '؟', '¿', '?', '?', '?']

    All question marks in one flat list.

    >>> question_summary['question_mark_counts']
    [1, 1, 2, 2]

    The count of question marks per post.

    >>> question_summary['question_mark_freq']
    [(1, 2), (2, 2)]

    Shows how many posts had 0, 1, 2, 3, etc. question marks
    (number_of_symbols, count)

    >>> question_summary['top_question_marks']
    [('?', 3), (';', 1), ('؟', 1), ('¿', 1)]

    Might be interesting if you have different types of question marks
    (Arabic, Spanish, Greek, Armenian, or other)

    >>> question_summary['question_mark_names']
    [['greek question mark'], ['arabic question mark'],
    ['inverted question mark', 'question mark'],
    ['question mark', 'question mark']]
    # correct order

    >>> question_summary['overview']
    {'num_posts': 4,
    'num_question_marks': 6,
    'question_marks_per_post': 1.5,
    'unique_question_marks': 4}
    """
    summary = extract(text_list, QUESTION_MARK, key_name='question_mark')
    summary['question_mark_names'] = [[name(c).lower() for c in x] if x
                                      else [] for x in
                                      summary['question_marks']]
    summary['question_text'] = [QUESTION.findall(text)
                                for text in text_list]
    return summary
Example 60
import unicodedata

u = chr(233) + chr(0x0bf2) + chr(3972) + chr(6000) + chr(13231)

for i, c in enumerate(u):
    print(i, '%04x' % ord(c), unicodedata.category(c), end=" ")
    print(unicodedata.name(c))

# Get the numeric value of the second character
print(unicodedata.numeric(u[1]))