def jatokenize(content):
    ret_list = []
    lines = tagger.parse(content).split('\n')
    for line in lines:
        if line == "EOS":
            break
        line = line.split('\t')
        word = line[2]
        try:
            jtype = unicodedata.name(word[0])
        except:
            continue
        # Ignore single-character words that are not kanji,
        # e.g. 'ー' and '*'
        if len(word) == 1 and jtype[0:4] != 'CJK ':
            continue
        # Ignore two-character hiragana words
        if (len(word) == 2 and jtype[0:4] == 'HIRA'
                and unicodedata.name(word[1])[0:4] == 'HIRA'):
            continue
        if jtype[0:4] == 'LATI':
            continue
        if word.isdigit():
            continue
        # Keep nouns (名詞), verbs (動詞), adverbs (副詞), and adjectives (形容詞)
        if (line[3][:2] == '名詞' or line[3][:2] == '動詞' or
                line[3][:2] == '副詞' or line[3][:3] == '形容詞'):
            ofs.write("%s " % word)
            ret_list.append(word.encode('utf8'))
    ofs.write("\n")
    return ret_list

def codepoint_simple(arg):
    arg = arg.upper()
    r_label = re.compile('\\b' + arg.replace(' ', '.*\\b') + '\\b')
    results = []
    for cp in xrange(0xFFFF):
        u = unichr(cp)
        try:
            name = unicodedata.name(u)
        except ValueError:
            continue
        if r_label.search(name):
            results.append((len(name), u, cp, name))
    if not results:
        # Retry without requiring a word boundary at the end
        r_label = re.compile('\\b' + arg.replace(' ', '.*\\b'))
        for cp in xrange(0xFFFF):
            u = unichr(cp)
            try:
                name = unicodedata.name(u)
            except ValueError:
                continue
            if r_label.search(name):
                results.append((len(name), u, cp, name))
        if not results:
            return None
    length, u, cp, name = sorted(results)[0]
    return about(u, cp, name)

def test_cjk(self):
    import sys
    import unicodedata
    cases = ((0x3400, 0x4DB5), (0x4E00, 0x9FA5))
    if unicodedata.unidata_version >= "4.1":
        cases = ((0x3400, 0x4DB5), (0x4E00, 0x9FBB), (0x20000, 0x2A6D6))
    for first, last in cases:
        # Test at and inside the boundary
        for i in (first, first + 1, last - 1, last):
            charname = 'CJK UNIFIED IDEOGRAPH-%X' % i
            char = ('\\U%08X' % i).decode('unicode-escape')
            assert unicodedata.name(char) == charname
            assert unicodedata.lookup(charname) == char
        # Test outside the boundary
        for i in first - 1, last + 1:
            charname = 'CJK UNIFIED IDEOGRAPH-%X' % i
            char = ('\\U%08X' % i).decode('unicode-escape')
            try:
                unicodedata.name(char)
            except ValueError, e:
                assert e.message == 'no such name'
            raises(KeyError, unicodedata.lookup, charname)

def format(self, stream, args):
    char = unicode(args.next())
    if len(char) != 1:
        raise TypeError("expected single character")
    if self.atsign:
        if char in python_escapes:
            stream.write('"\\%s"' % python_escapes[char])
        else:
            try:
                stream.write('u"\\N{%s}"' % unicodedata.name(char))
            except ValueError:
                stream.write(repr(char))
    else:
        if unicodedata.category(char).startswith("C"):
            try:
                stream.write(unicodedata.name(char))
            except ValueError:
                code = ord(char)
                if code in ascii_control_chars:
                    i = 1 if self.colon else 0
                    stream.write(ascii_control_chars[code][i])
                else:
                    raise FormatError("unprintable character")
        else:
            stream.write(char)

def _do_write(fname, variable, version, date, table):
    print("writing {} ..".format(fname))
    import unicodedata
    import datetime
    import string
    utc_now = datetime.datetime.now(tz=datetime.timezone.utc)
    INDENT = 4
    with open(fname, 'w') as fp:
        fp.write("# Generated: {iso_utc}\n"
                 "# Source: {version}\n"
                 "# Date: {date}\n"
                 "{variable} = (".format(iso_utc=utc_now.isoformat(),
                                         version=version,
                                         date=date,
                                         variable=variable))
        for start, end in table:
            # chr(), not unichr(): datetime.timezone.utc above implies Python 3
            ucs_start, ucs_end = chr(start), chr(end)
            hex_start, hex_end = ('0x{0:04x}'.format(start),
                                  '0x{0:04x}'.format(end))
            try:
                name_start = string.capwords(unicodedata.name(ucs_start))
            except ValueError:
                name_start = u''
            try:
                name_end = string.capwords(unicodedata.name(ucs_end))
            except ValueError:
                name_end = u''
            fp.write('\n' + (' ' * INDENT))
            fp.write('({0}, {1},),'.format(hex_start, hex_end))
            fp.write(' # {0:24s}..{1}'.format(
                name_start[:24].rstrip() or '(nil)',
                name_end[:24].rstrip()))
        fp.write('\n)\n')
    print("complete.")

def data(self, index, role):
    global UC_CAT_EXPAND, COL_ALIGNMENT, COL_TOOLTIPS
    (char, count) = self.chardata.get_tuple(index.row())
    if role == Qt.DisplayRole:  # request for actual data
        if 0 == index.column():
            return char
        elif 1 == index.column():
            return '0x{0:04x}'.format(ord(char))
        elif 2 == index.column():
            return count
        elif 3 == index.column():
            if char in C.NAMED_ENTITIES:
                return '&' + C.NAMED_ENTITIES[char] + ';'
            else:
                return '&#{0:d};'.format(ord(char))
        elif 4 == index.column():
            return UC_CAT_EXPAND[unicodedata.category(char).lower()]
        else:  # assuming column is 5, unicode name
            return unicodedata.name(char, 'no name?').title()
    elif role == Qt.TextAlignmentRole:
        return COL_ALIGNMENT[index.column()]
    elif (role == Qt.ToolTipRole) or (role == Qt.StatusTipRole):
        if index.column() < 5:
            return COL_TOOLTIPS[index.column()]
        # For column 5, the tooltip is the name string, because a narrow
        # column may not expose the entire name any other way.
        return unicodedata.name(char, 'no name?').title()
    # Sorry, we don't support other roles
    return None

def showdict(data, indent):
    first = True
    for key in sorted(data.keys()):
        value = data[key]
        if first:
            first = False
        else:
            print
        print " " * max(indent, 0) + "(" + key,
        # Sneaky trick: we don't want to go newline-indent over and
        # over for long sequences, i.e. cases where there is only
        # one possible follower. So we skip the newlines in those
        # cases, and tell the next-lower iteration not to do the whole
        # indent thing by passing a negative indent. We don't just
        # pass 0 or 1 because if another iteration *further down*
        # turns out not to be an only case, it will need to know
        # the right indent to pass along. So in a case like
        # R-O-{CK|LL}, the O is unique after the R, so no linefeed,
        # but then the {C|L} are not unique after the O.
        if type(value) == dict:
            if len(value) > 1:
                print ""
                showdict(value, abs(indent) + 4),
            else:
                showdict(value, -(abs(indent) + 4)),
        else:
            print " " + value.encode('utf-8'),
            if "-n" in sys.argv:
                try:
                    print unicodedata.name(value),
                except:
                    pass
        print ")",

def extractKeyword(text, word_class=["名詞", "形容詞"]):  # nouns and adjectives
    tmp = splitTag(text)  # extract hashtags first
    text = tmp[0]
    keywords = tmp[1]
    tagger = MeCab.Tagger('-Ochasen')
    node = tagger.parseToNode(text.encode('utf-8'))
    while node:
        try:
            if node.feature.split(',')[0] in word_class:
                #print node.surface
                # First character of the noun; only words starting with a
                # digit, hiragana, katakana, kanji, or Latin letter are
                # added to keywords
                uniname = node.surface.decode('utf-8')[0]
                if (unicodedata.name(uniname)[0:8] == "HIRAGANA") or \
                   (unicodedata.name(uniname)[0:8] == "KATAKANA") or \
                   (unicodedata.name(uniname)[0:18] == "HALFWIDTH KATAKANA") or \
                   (unicodedata.name(uniname)[0:3] == "CJK") or \
                   (unicodedata.name(uniname)[0:5] == "LATIN") or \
                   (unicodedata.name(uniname)[0:5] == "DIGIT"):
                    term = node.surface.replace('*', '*')
                    term = term.replace('"', '”')
                    term = term.replace("'", "’")
                    keywords.append(term.decode('utf-8'))
                    #print node.surface.decode('utf-8')
        except Exception as e:
            print "-" * 10
            print "Error (MeCab)"
            print node.surface
            print str(type(e))
            print str(e.args)
            print e.message
            print str(e)
            print "-" * 10
        node = node.next
    return keywords

def safe_path(origtitle):
    title = safe_path_component(ftfy(origtitle))
    if len(title) == 0:
        title = origtitle = u'_'
    if title.startswith(u'-') or title.startswith(u'.'):
        title = u'_' + title
    try:
        charname = safe_path_component(unicodedata.name(origtitle[0]))
    except ValueError:
        charname = u'UNKNOWN'
    category = charname.split('_')[0]
    # some ridiculous stuff to give every article a unique name that can be
    # stored on multiple file systems and tab-completed
    if len(origtitle) == 1:
        pieces = [u'single_character', category, charname + '.json']
    else:
        try:
            charname2 = safe_path_component(unicodedata.name(origtitle[1]))
        except ValueError:
            charname2 = u'UNKNOWN'
        text_to_encode = unicodedata.normalize("NFKD",
                                               safe_path_component(title[:64]))
        finalpart = text_to_encode.encode('punycode').rstrip('-')
        pieces = [charname, charname2, finalpart + '.json']
    path = u'/'.join(pieces)
    return path

def report_code_points(char_class, code_point_list, text=''):
    '''Report all code points which have been added to or removed from a
    character class.
    '''
    for code_point in sorted(code_point_list):
        if type(code_point) == type(int()):
            print('%(char_class)s: %(text)s: %(char)s %(code_point)s %(name)s'
                  % {'text': text,
                     'char': chr(code_point),
                     'char_class': char_class,
                     'code_point': hex(code_point),
                     'name': unicodedata.name(chr(code_point), 'name unknown')})
        else:
            print(('%(char_class)s: %(text)s: '
                   + '%(char0)s → %(char1)s '
                   + '%(code_point0)s → %(code_point1)s '
                   + '%(name0)s → %(name1)s')
                  % {'text': text,
                     'char_class': char_class,
                     'char0': chr(code_point[0]),
                     'code_point0': hex(code_point[0]),
                     'name0': unicodedata.name(chr(code_point[0]), 'name unknown'),
                     'char1': chr(code_point[1]),
                     'code_point1': hex(code_point[1]),
                     'name1': unicodedata.name(chr(code_point[1]), 'name unknown')})

def get_unicode_str(size=10, max_char=0xFFFF, onlyNormalized=False,
                    includeUnexisting=False):
    '''Generates a Unicode file name that is valid for the current OS.

    Notice: if includeUnexisting == True, it is possible that files don't
    get synchronized.
    '''
    if platform.system() == "Windows":
        # Unicode characters 1 through 31, as well as quote ("), less than (<),
        # greater than (>), pipe (|), backspace (\b), null (\0) and tab (\t).
        exclude = string.punctuation + u"\t" + \
            u''.join([unichr(x) for x in range(0, 32)])
    else:
        # I guess it mainly depends on the fs type
        #exclude = u"/" + u"." + u''.join([unichr(x) for x in range(0, 1)])
        exclude = u"/" + u"." + u''.join([unichr(x) for x in range(0, 32)])
    name = u""
    while len(name) < size:
        c = unichr(random.randint(0, max_char))
        if c not in exclude:
            try:
                if not includeUnexisting:
                    # raises ValueError for an unassigned code point
                    unicodedata.name(c)
                if onlyNormalized:
                    name = name + unicodedata.normalize('NFC', c)  # only normalized chars
                else:
                    name = name + c
            except ValueError:
                pass
    return name

def test_cjk(self):
    import sys
    import unicodedata
    cases = ((0x3400, 0x4DB5), (0x4E00, 0x9FA5))
    if unicodedata.unidata_version >= "5":    # don't know the exact limit
        cases = ((0x3400, 0x4DB5), (0x4E00, 0x9FCB),
                 (0x20000, 0x2A6D6), (0x2A700, 0x2B734))
    elif unicodedata.unidata_version >= "4.1":
        cases = ((0x3400, 0x4DB5), (0x4E00, 0x9FBB), (0x20000, 0x2A6D6))
    for first, last in cases:
        # Test at and inside the boundary
        for i in (first, first + 1, last - 1, last):
            charname = "CJK UNIFIED IDEOGRAPH-%X" % i
            char = ("\\U%08X" % i).decode("unicode-escape")
            assert unicodedata.name(char) == charname
            assert unicodedata.lookup(charname) == char
        # Test outside the boundary
        for i in first - 1, last + 1:
            charname = "CJK UNIFIED IDEOGRAPH-%X" % i
            char = ("\\U%08X" % i).decode("unicode-escape")
            try:
                unicodedata.name(char)
            except ValueError, e:
                assert e.message == "no such name"
            raises(KeyError, unicodedata.lookup, charname)

def test_cjk(self):
    import sys
    if sys.maxunicode < 0x10ffff:
        skip("requires a 'wide' python build.")
    import unicodedata
    cases = ((0x3400, 0x4DB5), (0x4E00, 0x9FA5))
    if unicodedata.unidata_version >= "4.1":
        cases = ((0x3400, 0x4DB5), (0x4E00, 0x9FBB), (0x20000, 0x2A6D6))
    for first, last in cases:
        # Test at and inside the boundary
        for i in (first, first + 1, last - 1, last):
            charname = 'CJK UNIFIED IDEOGRAPH-%X' % i
            assert unicodedata.name(unichr(i)) == charname
            assert unicodedata.lookup(charname) == unichr(i)
        # Test outside the boundary
        for i in first - 1, last + 1:
            charname = 'CJK UNIFIED IDEOGRAPH-%X' % i
            try:
                unicodedata.name(unichr(i))
            except ValueError:
                pass
            raises(KeyError, unicodedata.lookup, charname)

def clean_Ustring_fromU(string):
    from unicodedata import name, normalize
    gClean = ''
    for ch in u''.join(string.decode('utf-8', 'ignore')):
        try:
            if name(ch).startswith('LATIN') or name(ch) == 'SPACE':
                gClean = gClean + ch
            else:  # Replace non-Latin characters with spaces
                gClean = gClean + ' '
        except ValueError:  # 'ch' has no name in the Unicode database
            gClean = gClean + ' '
    # Try progressively shorter prefixes for badly formed input documents.
    try:
        normalized_string = normalize('NFKC', gClean.lower())
    except TypeError:
        #sys.stderr.write('Badly formed string at the first attempt\n')
        try:
            range_error = 999
            # Up to 999 characters are written, if available.
            normalized_string = normalize('NFKC', gClean[0:range_error].lower())
        except TypeError:
            #sys.stderr.write('\nThe wrong string at the second attempt: before %s words' % range_error)
            try:
                range_error = 99
                normalized_string = normalize('NFKC', gClean[0:range_error].lower())
            except TypeError:
                #sys.stderr.write('\nThe wrong string at the third attempt: before %s words' % range_error)
                try:
                    range_error = 49
                    normalized_string = normalize('NFKC', gClean[0:range_error].lower())
                except TypeError:
                    #sys.stderr.write('\nIt was not possible to form the output file after three attempts. Fatally bad file')
                    normalized_string = '# Fatally bad File\n'
                    pass
    return normalized_string.split()  # Return the normalized document as a token list.

def main():
    # get files
    files = []
    for i in range(1, 29):
        if i < 26:
            files.append("db/Minna_no_nihongo_1.%02d.txt" % i)
        else:
            files.append("db/Minna_no_nihongo_2.%02d.txt" % i)

    # get words from files
    words = get_words_from_files(files)

    # add words to network
    G = nx.Graph()
    for w in words:
        G.add_node(w)
        G.node[w]['furigana'] = words[w]['furigana']
        G.node[w]['meaning'] = words[w]['meaning']
        G.node[w]['chapter'] = words[w]['chapter']

    # to make statistics
    nbins, dmin, dmax = 20, 0, 1
    hist, edges = np.histogram([0], bins=nbins, range=(dmin, dmax))

    # adding edges
    words = G.nodes()
    print("Total number of words: ", len(words))
    for word1, word2 in itertools.combinations(words, 2):
        for w1 in word1:
            for w2 in word2:
                if "CJK UNIFIED" in ud.name(w1) and "CJK UNIFIED" in ud.name(w2):
                    f1, f2 = fingerprint[w1], fingerprint[w2]
                    match = SequenceMatcher(None, f1, f2, autojunk=True)
                    ratio = match.ratio()
                    # add data to histogram
                    new_hist, edges = np.histogram(ratio, bins=nbins,
                                                   range=(dmin, dmax))
                    hist += new_hist
                    if ratio > 0.8:
                        # G.add_edge(word1, word2, weight=5*ratio-4)  # 0.8 - 1 --> 0 - 1
                        G.add_edge(word1, word2, weight=4 * ratio - 3.2)  # 0.8 - 1 --> 0 - 0.8
                        break

    # plot data
    score = 0.5 * (edges[1:] + edges[:-1])
    plt.plot(score, hist)
    plt.xlabel("score")
    plt.ylabel("histogram")
    plt.show()

    G = sorted(nx.connected_component_subgraphs(G), key=len, reverse=True)
    print("Total number of words connected: ", len(G[0].nodes()))
    nx.write_graphml(G[0], "kanjis.graphml", encoding='utf-8', prettyprint=True)

def is_in_alphabet(self, uchr, alphabet):
    if self.no_memory:
        return not uchr.isalpha() or alphabet in ud.name(uchr)
    try:
        return self.alphabet_letters[alphabet][uchr]
    except KeyError:
        return self.alphabet_letters[alphabet].setdefault(
            uchr, alphabet in ud.name(uchr))

def codePointToCharacter(cp):
    if cp <= maxSupportedCodePoint():
        char = unichr(cp)
        if cp > 0xFF:
            # unicodedata doesn't have names for control characters,
            # so skip the name test on the ASCII ones at least
            try:
                unicodedata.name(char)
            except Exception, e:
                raise e
        return char

def symbolEq(s1, s2):
    if len(s1) != len(s2):
        return False
    if len(s1) == 0:
        return False
    try:
        x1 = unicodedata.name(u'%s' % s1.decode('utf-8'))
        x2 = unicodedata.name(u'%s' % s2.decode('utf-8'))
    except:
        return False
    return x1 == x2

def endElement(self, name):
    if name == 'ar':
        self.in_arContent = False
        # Now store the entry:
        try:
            ch = self.normalize_whitespace(self.article)
            if not len(ch):
                return
            lidx = ch.index('[')
            ridx = ch.index(']')
            self.romanization = ch[lidx + 1:ridx]
            split_romanization = string.split(self.romanization, u' ', 199)
            self.translation = ch[ridx + 1:]
            cjktraditional = []
            cjksimplified = []
            for cidx in range(len(self.splitkey[0])):
                cjktraditional.append(unicodedata.name(self.splitkey[0][cidx]))
                cjksimplified.append(unicodedata.name(self.splitkey[1][cidx]))
            #print self.romanization, self.translation
            entry = {
                'traditional': self.splitkey[0],     # uchar string
                'simplified': self.splitkey[1],      # uchar string
                'cjktraditional': cjktraditional,
                'cjksimplified': cjksimplified,
                'romanization': split_romanization,  # list of morphemes
                'frequencies': [],  # filled by post-process with romanized morpheme frequencies
                'translation': self.translation,
            }
            if self.dict.has_key(entry['traditional']):
                # fontset more likely to have traditional, if any
                #print 'already have: ', `entry['traditional']`, entry['romanization']
                #print 'proof:', self.dict[entry['traditional']]['romanization']
                pass
            else:
                self.dict[entry['traditional']] = entry
            # Add to distro:
            for item in entry['traditional']:
                try:
                    self.dist[item] += 1
                except:
                    self.dist[item] = 1
            if math.fmod(len(self.dict.keys()), 100) == 0:
                msglist = [
                    "Words :%6d" % (len(self.dict.keys())),
                    "Symbols:%6d" % (len(self.dist.keys())),
                ]
                self.progress_message(msglist)
        except Exception, e:
            if DEBUG: print e
        self.article = u''

def is_unicode(self, char):
    # http://docs.python.org/2/library/unicodedata.html
    is_unicode = True
    try:
        unicodedata.name(unicode(char))
    except ValueError:
        is_unicode = False
    return is_unicode

def test_parse_rand_utf16(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        (1000, 1, 'cA', 120),
        (1000, 1, 'cG', 120),
        (1000, 1, 'cH', 120),
    ]
    print "What about messages to log (INFO) about unmatched quotes (before eol)"
    # got this ..trying to avoid for now
    # Exception: rjson error in parse: Argument 'source_key' error: Parser setup appears to be broken, got AUTO
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "\nCreating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
        parseResult = h2i.import_parse(path=csvPathname, schema='put', header=0,
                                       hex_key=hex_key, timeoutSecs=timeoutSecs,
                                       doSummary=False)
        print "Parse result['destination_key']:", parseResult['destination_key']
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=60)
        print "inspect:", h2o.dump_json(inspect)
        numRows = inspect['numRows']
        self.assertEqual(numRows, rowCount, msg='Wrong numRows: %s %s' % (numRows, rowCount))
        numCols = inspect['numCols']
        self.assertEqual(numCols, colCount, msg='Wrong numCols: %s %s' % (numCols, colCount))
        for k in range(colCount):
            naCnt = inspect['cols'][k]['naCnt']
            self.assertEqual(0, naCnt, msg='col %s naCnt %d should be 0' % (k, naCnt))
            stype = inspect['cols'][k]['type']
            self.assertEqual("Enum", stype, msg='col %s type %s should be Enum' % (k, stype))

    #**************************
    # for background knowledge; (print info)
    import unicodedata
    # u = unichr(233) + unichr(0x0bf2) + unichr(3972) + unichr(6000) + unichr(13231)
    # left and right single quotes
    u = unichr(0x201c) + unichr(0x201d)
    # preferred apostrophe (right single quote)
    u = unichr(0x2019)
    u = unichr(0x2018) + unichr(6000) + unichr(0x2019)
    # grave and acute?
    # u = unichr(0x60) + unichr(0xb4)
    # don't do this. grave with apostrophe http://www.cl.cam.ac.uk/~mgk25/ucs/quotes.html
    # u = unichr(0x60) + unichr(0x27)
    for i, c in enumerate(u):
        print i, '%04x' % ord(c), unicodedata.category(c),
        print unicodedata.name(c)

def unicode(ctx, pline, userdata):
    args = pline.trailing.split()[1:]
    if not args:
        ctx.command('/say No symbol given.')
        return
    if args[0].startswith('U+'):
        codepoint = int(args[0][2:], 16)
        char = unichr(codepoint)
        try:
            name = unicodedata.name(char)
        except ValueError:
            name = 'n/a'
        if codepoint < 32:
            char = '-'
        ctx.command(u'/say {}, Name: {}'.format(char, name))
        return
    reststr = ' '.join(args)
    if all(char in string.ascii_uppercase + string.digits + ' -'
           for char in reststr):
        try:
            char = unicodedata.lookup(reststr.strip())
        except KeyError:
            pass
        else:
            codepoint = ord(char)
            ctx.command(u'/say {}, Codepoint: U+{:X}'.format(char, codepoint))
            return
    symbol = args[0].decode(ctx.encoding)
    nfc_symbol = unicodedata.normalize(u'NFC', symbol)
    if len(nfc_symbol) > 1:
        ctx.command('/say Too many symbols.')
        return
    try:
        name = unicodedata.name(nfc_symbol)
    except TypeError:
        ctx.command('/say Unknown character or invalid input.')
        return
    except ValueError:
        name = 'n/a'
    nfd_symbol = unicodedata.normalize(u'NFD', symbol)
    category = unicodedata.category(symbol)
    codepoint = ord(nfc_symbol)
    outstr = u'Codepoint: U+{:X}, Name: {}, Category: {}.'.format(codepoint, name, category)
    if len(nfd_symbol) > len(nfc_symbol):
        outstr += u' (Compose: '
        slist = []
        for char in nfd_symbol:
            codepoint = ord(char)
            slist.append(u'U+{:X}'.format(codepoint))
        outstr += u', '.join(slist) + ')'
    ctx.command(u'/say {}'.format(outstr))

def weight_for_leven_edits(wordFrom, wordTo, edits, weight_rules, max_weight, debug=False):
    if debug:
        print
        print
        print "Weight Analysis"
        print "word in: ", wordFrom
        dump(wordFrom)
        print
        print "word to: ", wordTo
        dump(wordTo)
    cumulative_weight = 0
    for edit in edits:
        edit_weight = 0
        if debug:
            print edit
        (command, char_num_in_word_one, char_num_in_word_two) = edit
        if char_num_in_word_one > (len(wordFrom) - 1):
            char_in_word_one = ''
        else:
            char_in_word_one = wordFrom[char_num_in_word_one]
        if char_num_in_word_two > (len(wordTo) - 1):
            char_in_word_two = ''
        else:
            char_in_word_two = wordTo[char_num_in_word_two]
        if debug:
            print '\t', command
            if char_in_word_one:
                print '\t', unicodedata.name(char_in_word_one)
            else:
                print '\tx'
            if char_in_word_two:
                print '\t', unicodedata.name(char_in_word_two)
            else:
                print '\tx'
        if command == 'replace':
            edit_weight = 10
        elif command == 'delete':
            edit_weight = 15
        elif command == 'insert':
            edit_weight = 18
        else:
            raise ValueError('unknown Levenshtein edit operation: ' + command)
        for weight_rule in weight_rules:
            if (weight_rule[0] == command) and \
               (weight_rule[1] == '*' or char_in_word_one in weight_rule[1]) and \
               (weight_rule[2] == '*' or char_in_word_two in weight_rule[2]):
                if debug:
                    print '\t weight rule applied:'
                    print '\t', weight_rule
                edit_weight = weight_rule[3]
                break
        if debug:
            print '\tweight: ', edit_weight
        cumulative_weight += edit_weight
        if cumulative_weight >= max_weight:
            break
    return cumulative_weight

def breakdown_into_validwords(sentence):
    """
    Morphologically analyzes the given sentence (string) and returns a list.
    - Example
      - IN: "今日はいい天気ですね"
      - OUT: ['今日', '天気']
    """
    ret_list = []
    if sentence == '' or not isinstance(sentence, str):
        return ret_list
    sentence = sentence.replace("\n", "")
    model = MeCab.Model_create("-Ochasen -d mecab-ipadic-neologd")
    tagger = model.createTagger()
    lines = tagger.parse(sentence).split('\n')
    for line in lines:
        if line == "EOS":
            break
        line = line.split('\t')
        word = line[2]
        # Exclude sentences containing obscene words
        if word in ['ちんちん', 'ちんこ', 'キンタマ', 'きんたま', '痴漢']:
            return []
        # TODO: build a proper exclusion list
        if word in ['今日', '俺', '私', '僕', '人', '思う', 'ちゃう', '何',
                    '行く', 'もらう', 'られる', 'くれる', 'すぎる']:
            continue
        try:
            jtype = unicodedata.name(word[0])
        except:
            continue
        # Ignore single-character words that are not kanji,
        # e.g. 'ー' and '*'
        if len(word) == 1 and jtype[0:4] != 'CJK ':
            continue
        # Ignore two-character hiragana words
        if (len(word) == 2 and jtype[0:4] == 'HIRA'
                and unicodedata.name(word[1])[0:4] == 'HIRA'):
            continue
        # Ignore hiragana words containing a long-vowel mark
        if jtype[0:4] == 'HIRA' and 'ー' in word:
            continue
        if jtype[0:4] == 'LATI':
            continue
        if word.isdigit():
            continue
        # Keep nouns (名詞), verbs (動詞), adverbs (副詞), and adjectives (形容詞)
        if (line[3][:2] == '名詞' or line[3][:2] == '動詞' or
                line[3][:2] == '副詞' or line[3][:3] == '形容詞'):
            ret_list.append(word)
            # print(word)
    return ret_list

def memo_to_fn(x):
    bits = []
    print x
    for l in x:
        if "LETTER" in unicodedata.name(l):
            bits.append(l)
        elif "DIGIT" in unicodedata.name(l):
            bits.append(l)
        elif not bits or bits[-1] != "_":
            # the `not bits` guard avoids indexing an empty list
            # when the memo starts with a symbol
            bits.append("_")
    return "".join(bits)

def is_accepted_char(uchar):
    if unicodedata.name(uchar).startswith('CJK'):
        return True
    if unicodedata.name(uchar).startswith('LATIN'):
        return True
    if unicodedata.name(uchar).startswith('SPACE'):
        return True
    if unicodedata.name(uchar).startswith('DIGIT'):
        return True
    if unicodedata.name(uchar).startswith('GREEK'):
        return True
    return False

def print_unicode_entry(n):
    u = get_unicode_using_unicode_escape(n)
    print '{:8d} {:8x}'.format(n, n),
    print u.encode('utf8'), unicodedata.category(u),
    try:
        print unicodedata.name(u),
    except:
        print 'unicodedata has no name defined',
    try:
        print unicodedata.digit(u)
    except:
        print 'unicodedata has no numeric value'

def giphy_me():
    terms = ircmsg.partition('giphy me ')[2]
    # translate from emoji to actual words
    if terms.encode('utf-8')[:1] == b'\xf0':
        terms = uni.name(terms.encode('utf-8')[:4].decode('utf-8'))
    if terms.encode('utf-8')[:1] == b'\xe2':
        terms = uni.name(terms.encode('utf-8')[:3].decode('utf-8'))
    if terms == "blerg" or terms == "blergh":
        terms = "30rock"
    terms = terms.lower()
    print("searching giphy for " + terms)
    sendmsg(current_channel, search_gifs(terms))

def is_cyrillic(text):
    count_cyrillic = 0
    count_latin = 0
    for c in text:
        try:
            if "CYRILLIC" in unicodedata.name(c):
                count_cyrillic += 1
            if "LATIN" in unicodedata.name(c):
                count_latin += 1
        except:
            pass
    return count_cyrillic >= .3 * (count_latin + count_cyrillic)

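# A hedged usage sketch for is_cyrillic above (hypothetical inputs).
# The 0.3 threshold makes mixed Cyrillic/Latin text count as Cyrillic
# early; note an empty string trivially returns True since 0 >= 0.
import unicodedata

print(is_cyrillic('Привет world'))  # True: 6 Cyrillic vs 5 Latin letters
print(is_cyrillic('hello'))         # False: no Cyrillic characters at all
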
def _strip_noise_bytes(self, obj, replace='_'):
    '''Make sure there aren't any random weird chars that don't belong to
    any alphabet. Only ASCII non-letters are allowed, as fancy symbols
    don't seem to work well with curses.'''
    if not isinstance(obj, str):
        obj = str(obj)
    obj_ucs = list()
    for uc in obj:
        try:
            unicodedata.name(uc)
            if unicodedata.category(uc) != 'Ll':
                uc.encode('ascii')
        except (ValueError, UnicodeEncodeError):
            if replace:
                obj_ucs.append(replace)
        else:
            obj_ucs.append(uc)
    return ''.join(obj_ucs)

s = '100'
print(s.isdecimal())

s = '0xF'
print(s.isdecimal())

s = '10.55'
print(s.isdecimal())

s = ''
print(s.isdecimal())

s = '1٠2𝟜'  # U+0660, U+1D7DC
print(s.isdecimal())
print(int(s))

import unicodedata

count = 0
for codepoint in range(2**16):
    ch = chr(codepoint)
    if ch.isdecimal():
        print(u'{:04x}: {} ({})'.format(codepoint, ch,
                                        unicodedata.name(ch, 'UNNAMED')))
        count = count + 1
print(f'Total Number of Decimal Unicode Characters = {count}')

# BEGIN NUMERICS_DEMO
import unicodedata
import re

re_digit = re.compile(r'\d')

sample = '1\xbc\xb2\u0969\u136b\u216b\u2466\u2480\u3285'

for char in sample:
    print('U+%04x' % ord(char),                       # <1>
          char.center(6),                             # <2>
          're_dig' if re_digit.match(char) else '-',  # <3>
          'isdig' if char.isdigit() else '-',         # <4>
          'isnum' if char.isnumeric() else '-',       # <5>
          format(unicodedata.numeric(char), '5.2f'),  # <6>
          unicodedata.name(char),                     # <7>
          sep='\t')
# END NUMERICS_DEMO

async def search(q: str):  # <7>
    chars = app.state.index.search(q)
    return ({'char': c, 'name': name(c)} for c in chars)  # <8>

def test_named_sequences_names_in_pua_range(self):
    # Named sequences are stored in the plane 15 PUA,
    # but their names shouldn't leak
    for cp in range(0xf0100, 0xf0fff):
        with self.assertRaises(ValueError) as cm:
            unicodedata.name(chr(cp))
        self.assertEqual(str(cm.exception), 'no such name')

class ucp:
    name = unicodedata.name(u'क')
    category = unicodedata.category(u'क')
    medial = 'ka'
    final = 'k'

def extract_currency(text_list, left_chars=20, right_chars=20):
    """Return a summary dictionary about currency symbols in ``text_list``

    Get a summary of the number of currency symbols, their frequency,
    the top ones, and more.

    :param list text_list: A list of text strings.
    :param int left_chars: The number of characters to extract, to the left
                           of the symbol when getting :attr:`surrounding_text`
    :param int right_chars: The number of characters to extract, to the right
                            of the symbol when getting :attr:`surrounding_text`
    :returns summary: A dictionary with various stats about currencies

    >>> posts = ['today ₿1 is around $4k', 'and ₿ in £ & €?', 'no idea']
    >>> currency_summary = extract_currency(posts)
    >>> currency_summary.keys()
    dict_keys(['currency_symbols', 'currency_symbols_flat',
    'currency_symbol_counts', 'currency_symbol_freq',
    'top_currency_symbols', 'overview', 'currency_symbol_names'])

    >>> currency_summary['currency_symbols']
    [['₿', '$'], ['₿', '£', '€'], []]

    A simple extract of currencies from each of the posts.
    An empty list if none exist.

    >>> currency_summary['currency_symbols_flat']
    ['₿', '$', '₿', '£', '€']

    All currency symbols in one flat list.

    >>> currency_summary['currency_symbol_counts']
    [2, 3, 0]

    The count of currency symbols per post.

    >>> currency_summary['currency_symbol_freq']
    [(0, 1), (2, 1), (3, 1)]

    Shows how many posts had 0, 1, 2, 3, etc. currency symbols
    (number_of_symbols, count)

    >>> currency_summary['top_currency_symbols']
    [('₿', 2), ('$', 1), ('£', 1), ('€', 1)]

    >>> currency_summary['currency_symbol_names']
    [['bitcoin sign', 'dollar sign'], ['bitcoin sign', 'pound sign',
    'euro sign'], []]

    >>> currency_summary['surrounding_text']
    [['today ₿1 is around $4k'], ['and ₿ in £ & €?'], []]

    >>> extract_currency(posts, 5, 5)['surrounding_text']
    [['oday ₿1 is ', 'ound $4k'], ['and ₿ in £', ' & €?'], []]

    >>> extract_currency(posts, 0, 3)['surrounding_text']
    [['₿1 i', '$4k'], ['₿ in', '£ & ', '€?'], []]

    >>> currency_summary['overview']
    {'num_posts': 3, 'num_currency_symbols': 5,
    'currency_symbols_per_post': 1.6666666666666667,
    'unique_currency_symbols': 4}
    """
    summary = extract(text_list, CURRENCY, 'currency_symbol')
    summary['currency_symbol_names'] = [[name(c).lower() for c in x] if x else []
                                        for x in summary['currency_symbols']]
    surrounding_text_regex = re.compile(r'.{0,' + str(left_chars) + '}' +
                                        CURRENCY_RAW +
                                        r'.{0,' + str(right_chars) + '}')
    summary['surrounding_text'] = [surrounding_text_regex.findall(text)
                                   for text in text_list]
    return summary

s = 'Rod'
print('\nRed String:', s)
print('Type:', type(s), '\tLength:', len(s))

s = s.encode('utf-8')
print('\nEncoded String:', s)
print('Type:', type(s), '\tLength:', len(s))

s = s.decode('utf-8')
print('\nDecoded String:', s)
print('Type', type(s), '\tLength:', len(s))

import unicodedata
for i in range(len(s)):
    print(s[i], unicodedata.name(s[i]), sep=':')

s = b'Gr\xc3\xb6n'
print('\nGreen String:', s.decode('utf-8'))

s = 'Gr\N{LATIN SMALL LETTER O WITH DIAERESIS}n'
print('Green String:', s)

def show_lut(self, platform_name, layout_name, debug_print):
    layout = self.layouts[platform_name][layout_name]
    sorted_keys = sorted(layout)
    table = []
    glyphs = ""
    table.append(["Glyph", "Unicode", "HID code", "modifier+isocode",
                  "modifier+scancode", "Description"])
    for key in sorted_keys:
        mod, keycode, isocode, hidcode, deadkey = layout[key]
        try:
            des = unicodedata.name(chr(key))
        except:
            des = "No Data"
        table.append([chr(key), key, f"{hidcode:#0{4}x}",
                      "+".join(mod) + " " + str(isocode),
                      "+".join(mod) + " " + str(keycode), des])
        glyphs += chr(key)
    if debug_print:
        for row in table:
            print("{0: >10} {1: >10} {2: >10} {3: >20} {4: >20} {5: >40}".format(*row))
        print("\r\nAll glyphs:\r\n" + ''.join(sorted(glyphs)))

    # Glyphs generated by transforms
    transforms = self.transforms[platform_name][layout_name]
    sorted_transforms = sorted(transforms)
    glyphs_from_transforms = ""
    for key in sorted_transforms:
        glyphs_from_transforms += key

    # Generate raw HID code + modifier to glyph mapping
    hid_to_glyph_lut = {}
    modifier_map = {'ctrlL': 0x00, 'shiftL': KEY_SHIFT, 'shift': KEY_SHIFT,
                    'atlL': KEY_RIGHT_ALT, 'opt': 0x00, 'cmd': 0x00,
                    'ctrlR': 0x00, 'shiftR': KEY_SHIFT,
                    'altR': KEY_RIGHT_ALT, 'cmdR': 0x00}
    for key in sorted_keys:
        mod, keycode, isocode, hidcode, deadkey = layout[key]
        modifier_mask = 0x00
        for modifier in mod:
            modifier_mask |= modifier_map[modifier]
        hid_to_glyph_lut[chr(key)] = [[modifier_mask, hidcode]]
    #print(hid_to_glyph_lut)

    # Part below is to compare with mooltipass mini storage
    mini_lut_array_bin = []
    if debug_print:
        print("\r\nMooltipass Mini Old LUT:")
    mini_modifier_map = {'ctrlL': 0x00, 'shiftL': 0x80, 'shift': 0x80,
                         'atlL': 0x40, 'opt': 0x00, 'cmd': 0x00,
                         'ctrlR': 0x00, 'shiftR': 0x80, 'altR': 0x40,
                         'cmdR': 0x00}
    mini_lut = ""
    for key in sorted_keys:
        mod, keycode, isocode, hidcode, deadkey = layout[key]
        modifier_mask = 0x00
        for modifier in mod:
            modifier_mask |= mini_modifier_map[modifier]
        # Europe key hack
        if hidcode == 0x64:
            hidcode = 0x03
        # Apply modifier mask
        hidcode |= modifier_mask
        mini_lut += f"{hidcode:#0{4}x} "
        mini_lut_array_bin.append(hidcode)
    if debug_print:
        print(mini_lut)

    # Return dictionary
    return {"mini_lut_bin": mini_lut_array_bin,
            "covered_glyphs": glyphs,
            "hid_to_glyph_lut": hid_to_glyph_lut,
            "glyphs_from_transforms": glyphs_from_transforms,
            "transforms": transforms}

# -*- coding: utf-8 -*-
# Nola
"""
Case folding: lowercase all text, then apply some extra mappings.
Uses str.casefold(), new in Python 3.3.
For strings containing only latin1 characters this is equivalent to
str.lower(), with two exceptions: the micro sign 'µ' becomes the
lowercase Greek letter 'μ', and the German eszett ('sharp s', ß)
becomes 'ss'.
"""
from unicodedata import name

micro = 'µ'
print(name(micro))
micro_cf = micro.casefold()
print(name(micro_cf))
print(micro, micro_cf)

print('{:-^30}'.format('-'))

eszett = 'ß'
print(name(eszett))
eszett_cf = eszett.casefold()
# name(eszett_cf) would raise TypeError: 'ss' is two characters
print(eszett, eszett_cf)

s1.add('Melon')

# frozenset does not support add()
# s5.add('Melon')

print('EX6-1 -', s1, type(s1))
print('EX6-2 -', s2, type(s2))
print('EX6-3 -', s3, type(s3))
print('EX6-4 -', s4, type(s4))
print('EX6-5 -', s5, type(s5))

# Which declaration compiles to better bytecode?
from dis import dis

print('EX6-5 -')
print(dis('{10}'))

print('EX6-6 -')
print(dis('set([10])'))

print()
print()

# Set comprehension
from unicodedata import name

print('EX7-1 -')
print({name(chr(i), '') for i in range(0, 256)})

exit()

def extract_exclamations(text_list):
    """Return a summary dictionary about exclamation (mark)s in ``text_list``

    Get a summary of the number of exclamation marks, their frequency,
    the top ones, as well as the exclamations written/said.

    :param list text_list: A list of text strings.
    :returns summary: A dictionary with various stats about exclamations

    >>> posts = ['Who are you!', 'What is this!', 'No exclamation here?']
    >>> exclamation_summary = extract_exclamations(posts)
    >>> exclamation_summary.keys()
    dict_keys(['exclamation_marks', 'exclamation_marks_flat',
    'exclamation_mark_counts', 'exclamation_mark_freq',
    'top_exclamation_marks', 'overview', 'exclamation_mark_names',
    'exclamation_text'])

    >>> exclamation_summary['exclamation_marks']
    [['!'], ['!'], []]

    A simple extract of exclamation marks from each of the posts.
    An empty list if none exist.

    >>> exclamation_summary['exclamation_marks_flat']
    ['!', '!']

    All exclamation marks in one flat list.

    >>> exclamation_summary['exclamation_mark_counts']
    [1, 1, 0]

    The count of exclamation marks per post.

    >>> exclamation_summary['exclamation_mark_freq']
    [(0, 1), (1, 2)]

    Shows how many posts had 0, 1, 2, 3, etc. exclamation marks
    (number_of_symbols, count)

    >>> exclamation_summary['top_exclamation_marks']
    [('!', 2)]

    Might be interesting if you have different types of exclamation marks

    >>> exclamation_summary['exclamation_mark_names']
    [['exclamation mark'], ['exclamation mark'], []]

    >>> exclamation_summary['overview']
    {'num_posts': 3, 'num_exclamation_marks': 2,
    'exclamation_marks_per_post': 0.6666666666666666,
    'unique_exclamation_marks': 1}

    >>> posts2 = ["don't go there!", 'مرحبا. لا تذهب!', '¡Hola! ¿cómo estás?',
    ...           'a few different exclamation marks! make sure you see them!']
    >>> exclamation_summary = extract_exclamations(posts2)
    >>> exclamation_summary['exclamation_marks']
    [['!'], ['!'], ['¡', '!'], ['!', '!']]
    # might be displayed in opposite order due to RTL exclamation mark

    A simple extract of exclamation marks from each of the posts.
    An empty list if none exist.

    >>> exclamation_summary['exclamation_marks_flat']
    ['!', '!', '¡', '!', '!', '!']

    All exclamation marks in one flat list.

    >>> exclamation_summary['exclamation_mark_counts']
    [1, 1, 2, 2]

    The count of exclamation marks per post.

    >>> exclamation_summary['exclamation_mark_freq']
    [(1, 2), (2, 2)]

    Shows how many posts had 0, 1, 2, 3, etc. exclamation marks
    (number_of_symbols, count)

    >>> exclamation_summary['top_exclamation_marks']
    [('!', 5), ('¡', 1)]

    Might be interesting if you have different types of exclamation marks

    >>> exclamation_summary['exclamation_mark_names']
    [['exclamation mark'], ['exclamation mark'],
    ['inverted exclamation mark', 'exclamation mark'],
    ['exclamation mark', 'exclamation mark']]

    >>> exclamation_summary['overview']
    {'num_posts': 4, 'num_exclamation_marks': 6,
    'exclamation_marks_per_post': 1.5, 'unique_exclamation_marks': 4}
    """
    summary = extract(text_list, EXCLAMATION_MARK, key_name='exclamation_mark')
    summary['exclamation_mark_names'] = [[name(c).lower() for c in x] if x else []
                                         for x in summary['exclamation_marks']]
    summary['exclamation_text'] = [EXCLAMATION.findall(text) for text in text_list]
    return summary

def isKanji(unichar):
    try:
        return unicodedata.name(unichar).find('CJK UNIFIED IDEOGRAPH') >= 0
    except ValueError:
        # a control character
        return False

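# A minimal usage sketch for isKanji above (assuming Python 3, where every
# str element is a Unicode character; the import mirrors what the snippet
# relies on at module level):
import unicodedata

print(isKanji('\u6f22'))  # True: 'CJK UNIFIED IDEOGRAPH-6F22' (漢)
print(isKanji('\u3042'))  # False: 'HIRAGANA LETTER A' (あ)
print(isKanji('\x00'))    # False: control characters have no name
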
from unicodedata import name

print({chr(i) for i in range(32, 256) if 'SIGN' in name(chr(i), '')})
print({name(chr(i), '') for i in range(32, 256) if 'SIGN' in name(chr(i), '')})

def getCharName(ch):
    try:
        return unicodedata.name(ch).title()
    except ValueError:
        return ''

# This excerpt stitches together several demos in the style of Fluent
# Python's Unicode chapter; nfc_equal and fold_equal match the book's
# normeq.py helpers, and `expressions`, `sample`, and `re_digit` are
# defined earlier in the original script. Imports added so the names
# used below resolve.
import array
from unicodedata import name, normalize, numeric

numbers = array.array('h', [-2, -1, 0, 1, 2])
octets = bytes(numbers)
print(octets)

octets = b'Montr\xe9al'
print(octets.decode('cp1252'))
print(octets.decode('iso8859_7'))
print(octets.decode('koi8_r'))
print(octets.decode('utf_8', errors='replace'))

for expression in expressions.split():
    value = eval(expression)
    print(expression.rjust(30), '->', repr(value))

ohm = '\u2126'
print(name(ohm))
ohm_c = normalize('NFC', ohm)
print(name(ohm_c))
print(ohm == ohm_c)
print(normalize('NFC', ohm) == normalize('NFC', ohm_c))

print(nfc_equal('A', 'a'))
print(fold_equal('A', 'a'))

for char in sample:
    print('U+%04x' % ord(char),
          char.center(6),
          're_dig' if re_digit.match(char) else '-',
          'isdig' if char.isdigit() else '-',
          'isnum' if char.isnumeric() else '-',
          format(numeric(char), '5.2f'),

def _combining_class(cp):
    v = unicodedata.combining(unichr(cp))
    if v == 0:
        if not unicodedata.name(unichr(cp)):
            raise ValueError("Unknown character in unicodedata")
    return v

def judge_lang(text):
    for ch in text:
        word = unicodedata.name(ch)
        # Japanese if it contains kanji, hiragana, or katakana
        if "CJK UNIFIED" in word or "HIRAGANA" in word or "KATAKANA" in word:
            return "Japanese"
    # English if it contains none of them
    return "English"

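# A hedged usage sketch for judge_lang above (hypothetical inputs). Note
# that unicodedata.name() raises ValueError for characters without a name,
# such as '\n', so real callers may want to pre-filter control characters:
import unicodedata

print(judge_lang('こんにちは world'))  # Japanese: first char is HIRAGANA LETTER KO
print(judge_lang('hello world'))       # English: only LATIN and SPACE names
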
def unicode_test(value):
    name = unicodedata.name(value)
    value2 = unicodedata.lookup(name)
    print('value="%s", name="%s", value2="%s"' % (value, name, value2))

def unicode2desc(u):
    return unicodedata.name(u)

def unicode_test(value):
    name = unicodedata.name(value)
    value2 = unicodedata.lookup(name)
    print('value="%s", name="%s", value2="%s"' % (value, name, value2))

unicode_test('A')
unicode_test('$')
unicode_test('\u00a2')
unicode_test('\u20ac')
unicode_test('\u2603')

place = 'café'
print(place)
print(unicodedata.name('\u00e9'))
# print(unicodedata.lookup('E WITH ACUTE, LATIN SMALL LETTER'))
print(unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE'))

place = 'caf\u00e9'
print(place)
place = 'caf\N{LATIN SMALL LETTER E WITH ACUTE}'
print(place)

u_umlaut = '\N{LATIN SMALL LETTER U WITH DIAERESIS}'
print(u_umlaut)
drink = 'Gew' + u_umlaut + 'rztraminer'
print('Now I can finally have my', drink, 'in a', place)

def test_ascii_letters(self):
    # range end is ord("z") + 1 so that 'z' itself is tested
    for char in "".join(map(chr, range(ord("a"), ord("z") + 1))):
        name = "LATIN SMALL LETTER %s" % char.upper()
        code = unicodedata.lookup(name)
        self.assertEqual(unicodedata.name(code), name)

def name(self):
    """Return unicodedata.name."""
    try:
        return unicodedata.name(self.c)
    except:
        return ''

def test_bmp_characters(self):
    for code in range(0x10000):
        char = chr(code)
        name = unicodedata.name(char, None)
        if name is not None:
            self.assertEqual(unicodedata.lookup(name), char)

def to_string(c):
    digit = format(ord(c), "x")
    name = unicodedata.name(c, "Name not found.")
    return fmt.format(digit, name, c)

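# `fmt` is a module-level format string the snippet above assumes but does
# not show; a plausible (hypothetical) definition and usage:
import unicodedata

fmt = '{:>6}  {:<40} {}'  # hex codepoint, character name, the character

print(to_string('A'))  # -> hex code, name, and glyph on one line
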
async def on_raw_reaction_add(payload):
    channel = client.get_channel(payload.channel_id)
    user = client.get_user(payload.user_id)
    emoji = payload.emoji.name
    try:
        message = await channel.fetch_message(payload.message_id)
    except AttributeError:
        return
    if user == client.user:
        pass
    elif message.author == client.user:
        title = message.embeds[0].title
        if "99% Bets's bets" in title:
            try:
                string = unicodedata.name(emoji[0]).split(" ")[1].lower()
                page = nums.index(string) - 1
                embed, pages = discord_api.list_bets(page, "99% Bets", False)
                await message.edit(embed=embed)
            except Exception as error:
                pass
        elif " as a bin" in title:
            try:
                string = unicodedata.name(emoji[0]).split(" ")[1].lower()
                page = nums.index(string) - 1
                embed, pages = discord_api.related_markets_bin(
                    message.embeds[0].title.split('"')[1], page)
                await message.edit(embed=embed)
            except:
                pass
        elif " in the title" in title:
            try:
                string = unicodedata.name(emoji[0]).split(" ")[1].lower()
                page = nums.index(string) - 1
                embed, pages = discord_api.related_markets_title(
                    message.embeds[0].title.split('"')[1], page)
                await message.edit(embed=embed)
            except:
                pass
        elif "Leaderboard" in title:
            try:
                string = unicodedata.name(emoji[0]).split(" ")[1].lower()
                page = nums.index(string) - 1
                embed, pages = discord_api.leaderboard(page)
                await message.edit(embed=embed)
            except Exception as error:
                pass
        elif "'s bets" in title:
            try:
                string = unicodedata.name(emoji[0]).split(" ")[1].lower()
                page = nums.index(string) - 1
                embed, pages = discord_api.list_bets(page, title.split("'")[0], True)
                await message.edit(embed=embed)
            except Exception as error:
                pass
        if unicodedata.name(emoji[0]) == "CLOCKWISE RIGHTWARDS AND LEFTWARDS OPEN CIRCLE ARROWS":
            if "99% Bets's bets" in title:
                page = int(message.embeds[0].description.split("```")[1].split("\n")[0]) - 1
                embed, pages = discord_api.list_bets(page, "99% Bets", False)
                if embed.description != message.embeds[0].description:
                    await message.edit(embed=embed)
            elif " as a bin" in title:
                page = int(message.embeds[0].description.split(" ")[-3]) - 1
                embed, pages = discord_api.related_markets_bin(
                    message.embeds[0].title.split('"')[1], page)
                if embed.description != message.embeds[0].description:
                    await message.edit(embed=embed)
            elif " in the title" in title:
                page = int(message.embeds[0].description.split(" ")[-3]) - 1
                embed, pages = discord_api.related_markets_title(
                    message.embeds[0].title.split('"')[1], page)
                if embed.description != message.embeds[0].description:
                    await message.edit(embed=embed)
            elif "markets with negative risk" in title:
                hidden = message.embeds[0].description.split("```")[1].split("\n")[0]
                num = None if hidden == "None" else int(hidden)
                embed = discord_api.risk_all(num)
                if embed.description != message.embeds[0].description:
                    await message.edit(embed=embed)
            elif "Market risk for" in title:
                hidden = message.embeds[0].description.split("```")[1].split("\n")[0].split(".")
                num = None if hidden[1] == "None" else int(hidden[1])
                mini = True if hidden[2] == "True" else False
                embed = discord_api.risk_market(hidden[0], num, mini)
                if embed.description != message.embeds[0].description:
                    await message.edit(embed=embed)
            elif "Orderbook for " in title:
                hidden = message.embeds[0].description.split("```")[1].split("\n")[0]
                embed = discord_api.orderbook(hidden)
                if embed.description != message.embeds[0].description:
                    await message.edit(embed=embed)
            elif "Market bins for " in title:
                hidden = message.embeds[0].description.split("```")[1].split("\n")[0].replace(' ', '.')
                embed = discord_api.bins(hidden)
                if embed.description != message.embeds[0].description:
                    await message.edit(embed=embed)

color = "White" if color == WHITE else "Black" name = { DRAUGHT: "Draught", PAWN: "ChessPawn", ROOK: "ChessRook", KNIGHT: "ChessKnight", BISHOP: "ChessBishop", KING: "ChessKing", QUEEN: "ChessQueen" }[kind] return globals()[color + name]() class Piece(str): __slots__ = () for code in itertools.chain((0x26C0, 0x26C2), range(0x2654, 0x2660)): char = chr(code) name = unicodedata.name(char).title().replace(" ", "") if name.endswith("sMan"): name = name[:-4] new = (lambda char: lambda Class: Piece.__new__(Class, char))(char) new.__name__ = "__new__" Class = type(name, (Piece, ), dict(__slots__=(), __new__=new)) globals()[name] = Class if __name__ == "__main__": main()
.withColumnRenamed("Website Address","site")\ .select("id","desc","active","cat","site") extradesc = polione.join(politwo.withColumnRenamed("Description","desc2"), "BN/Registration Number", 'full')\ .withColumnRenamed("BN/Registration Number","id")\ .withColumnRenamed("Description","desc1")\ .select("id","desc1","desc2") datardd = data.join(extradesc, "id", "left")\ .rdd\ .map(lambda x: x.asDict())\ .map(lambda x: { **x, "d": "".join( ch if unicodedata.name(ch).startswith( ('LATIN', 'DIGIT', 'SPACE', 'APOSTROPHE') ) else " " \ for ch in " ".join( [x['desc'] if x['desc'] else "", x['desc1'] if x['desc1'] else "", x['desc2'] if x['desc2'] else ""] ) ) })\ .map(lambda x: { **x, 'd': " ".join(w.lower() if w.lower() not in stopwords else "" for w in x['d'].split()) })\ .map(lambda x: {**x, 'd': " ".join(re.sub(r'.*\d.*', '', w) for w in x['d'].split())})\ .map(lambda x: {**x, 'd': " ".join(re.sub(r'.*\'.*', '', w) for w in x['d'].split())})\ .map(lambda x: {**x, 'd': re.sub(r' +',' ',x['d']).strip()})\
def is_latin(uchr):
    try:
        return latin_letters[uchr]
    except KeyError:
        return latin_letters.setdefault(uchr, 'LATIN' in ud.name(uchr))

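# Sketch of the module-level state is_latin above assumes (the cache dict
# and the `ud` alias are implied but not shown by the snippet):
import unicodedata as ud

latin_letters = {}  # memoization cache: character -> bool

print(is_latin('a'))  # True, and ('a' -> True) is now cached
print(is_latin('ж'))  # False: 'CYRILLIC SMALL LETTER ZHE'
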
def extract_questions(text_list):
    """Return a summary dictionary about question (mark)s in ``text_list``

    Get a summary of the number of question marks, their frequency,
    the top ones, as well as the questions asked.

    :param list text_list: A list of text strings.
    :returns summary: A dictionary with various stats about questions

    >>> posts = ['How are you?', 'What is this?', 'No question Here!']
    >>> question_summary = extract_questions(posts)
    >>> question_summary.keys()
    dict_keys(['question_marks', 'question_marks_flat',
    'question_mark_counts', 'question_mark_freq', 'top_question_marks',
    'overview', 'question_mark_names', 'question_text'])

    >>> question_summary['question_marks']
    [['?'], ['?'], []]

    A simple extract of question marks from each of the posts.
    An empty list if none exist.

    >>> question_summary['question_marks_flat']
    ['?', '?']

    All question marks in one flat list.

    >>> question_summary['question_mark_counts']
    [1, 1, 0]

    The count of question marks per post.

    >>> question_summary['question_mark_freq']
    [(0, 1), (1, 2)]

    Shows how many posts had 0, 1, 2, 3, etc. question marks
    (number_of_symbols, count)

    >>> question_summary['top_question_marks']
    [('?', 2)]

    Might be interesting if you have different types of question marks
    (Arabic, Spanish, Greek, Armenian, or other)

    >>> question_summary['question_mark_names']
    [['question mark'], ['question mark'], []]

    >>> question_summary['overview']
    {'num_posts': 3, 'num_question_marks': 2,
    'question_marks_per_post': 0.6666666666666666,
    'unique_question_marks': 1}

    >>> posts2 = ['Πώς είσαι;', 'مرحباً. كيف حالك؟', 'Hola, ¿cómo estás?',
    ...           'Can you see the new questions? Did you notice the different marks?']
    >>> question_summary = extract_questions(posts2)
    >>> question_summary['question_marks']
    [[';'], ['؟'], ['¿', '?'], ['?', '?']]
    # might be displayed in opposite order due to RTL question mark

    A simple extract of question marks from each of the posts.
    An empty list if none exist.

    >>> question_summary['question_marks_flat']
    [';', '؟', '¿', '?', '?', '?']

    All question marks in one flat list.

    >>> question_summary['question_mark_counts']
    [1, 1, 2, 2]

    The count of question marks per post.

    >>> question_summary['question_mark_freq']
    [(1, 2), (2, 2)]

    Shows how many posts had 0, 1, 2, 3, etc. question marks
    (number_of_symbols, count)

    >>> question_summary['top_question_marks']
    [('?', 3), (';', 1), ('؟', 1), ('¿', 1)]

    Might be interesting if you have different types of question marks
    (Arabic, Spanish, Greek, Armenian, or other)

    >>> question_summary['question_mark_names']
    [['greek question mark'], ['arabic question mark'],
    ['inverted question mark', 'question mark'],
    ['question mark', 'question mark']]  # correct order

    >>> question_summary['overview']
    {'num_posts': 4, 'num_question_marks': 6,
    'question_marks_per_post': 1.5, 'unique_question_marks': 4}
    """
    summary = extract(text_list, QUESTION_MARK, key_name='question_mark')
    summary['question_mark_names'] = [[name(c).lower() for c in x] if x else []
                                      for x in summary['question_marks']]
    summary['question_text'] = [QUESTION.findall(text) for text in text_list]
    return summary

import unicodedata

u = chr(233) + chr(0x0bf2) + chr(3972) + chr(6000) + chr(13231)
for i, c in enumerate(u):
    print(i, '%04x' % ord(c), unicodedata.category(c), end=" ")
    print(unicodedata.name(c))

# Get numeric value of second character
print(unicodedata.numeric(u[1]))