def distance(s1: str, s2: str):
    """Return the Levenshtein edit distance between two Unicode strings.

    Unlike a plain call to levenshtein(), both inputs are first normalized
    to NFC and then split into grapheme clusters, so the distance is counted
    in grapheme clusters rather than in code points.

    :param s1: first string
    :param s2: second string
    :return: whatever levenshtein() returns for the two cluster sequences
    """
    cluster_sequences = [
        list(grapheme_clusters(unicodedata.normalize('NFC', text)))
        for text in (s1, s2)
    ]
    return levenshtein(*cluster_sequences)
def editops(word1, word2):
    """
    Return the sequence of edit operations that turns word1 into word2.

    Both words are NFC-normalized and split into grapheme clusters first, so
    the indices in the result refer to _grapheme clusters_, not characters!
    """

    def nfc_clusters(word):
        # NFC first, so equivalent compositions yield the same clusters.
        return list(grapheme_clusters(unicodedata.normalize('NFC', word)))

    return seq_editops(nfc_clusters(word1), nfc_clusters(word2))
def test_align():
    """
    Check character-wise alignment that keeps the segment id of every item.

    The tricky part is that the alignment must operate on grapheme clusters
    rather than on Python characters.
    """
    first_segments = [("s0", "foo"), ("s1", "bar"), ("s2", "batzinga")]
    second_segments = [
        ("x0", "foo"),
        ("x1", "bar"),
        # extra .
        ("x2", "."),
        # deletion + different grapheme cluster, m̃ also is two Python characters
        ("x3", "bazim̃ga"),
    ]
    test1 = ExtractedText(
        None,
        [ExtractedText(seg_id, None, None, seg_text)
         for seg_id, seg_text in first_segments],
        " ",
        None,
    )
    test2 = ExtractedText(
        None,
        [ExtractedText(seg_id, None, None, seg_text)
         for seg_id, seg_text in second_segments],
        " ",
        None,
    )

    alignment = []
    left_pos = 0
    right_pos = 0
    aligned_pairs = seq_align(grapheme_clusters(test1.text),
                              grapheme_clusters(test2.text))
    for left, right in aligned_pairs:
        left_id = None
        right_id = None
        if left is not None:
            left_id = test1.segment_id_for_pos(left_pos)
            # Positions advance by the cluster's length in Python characters.
            left_pos += len(left)
        if right is not None:
            right_id = test2.segment_id_for_pos(right_pos)
            right_pos += len(right)
        alignment.append(AlignmentElement(left, right, left_id, right_id))

    print("test1: {}".format(test1.text))
    print("test2: {}".format(test2.text))

    expected = {
        0: ("f", "f", "s0", "x0"),
        8: (None, ".", None, "x2"),
        12: ("t", None, "s2", None),
        15: ("n", "m̃", "s2", "x3"),
    }
    for index, element in expected.items():
        assert alignment[index] == element
def test_align():
    """
    Align two texts item by item and verify the segment ids that come along.

    Alignment is expected to work on grapheme clusters, not on Python
    characters, which is what makes this test interesting.
    """
    test1 = ExtractedText(
        None,
        [
            ExtractedText('s0', None, None, 'foo'),
            ExtractedText('s1', None, None, 'bar'),
            ExtractedText('s2', None, None, 'batzinga'),
        ],
        ' ',
        None,
    )
    test2 = ExtractedText(
        None,
        [
            ExtractedText('x0', None, None, 'foo'),
            ExtractedText('x1', None, None, 'bar'),
            # extra .
            ExtractedText('x2', None, None, '.'),
            # deletion + different grapheme cluster, m̃ also is two Python characters
            ExtractedText('x3', None, None, 'bazim̃ga'),
        ],
        ' ',
        None,
    )

    def step(doc, pos, cluster):
        """Return (segment id, next position) for one side of an aligned pair."""
        if cluster is None:
            # A gap on this side: no id, and the position does not move.
            return None, pos
        return doc.segment_id_for_pos(pos), pos + len(cluster)

    alignment = []
    left_pos = right_pos = 0
    for left, right in seq_align(grapheme_clusters(test1.text),
                                 grapheme_clusters(test2.text)):
        left_id, left_pos = step(test1, left_pos, left)
        right_id, right_pos = step(test2, right_pos, right)
        alignment.append(AlignmentElement(left, right, left_id, right_id))

    print('test1: {}'.format(test1.text))
    print('test2: {}'.format(test2.text))

    assert alignment[0] == ('f', 'f', 's0', 'x0')
    assert alignment[8] == (None, '.', None, 'x2')
    assert alignment[12] == ('t', None, 's2', None)
    assert alignment[15] == ('n', 'm̃', 's2', 'x3')
def character_error_rate(reference, compared):
    """Return the edit distance divided by the reference length.

    The length is the number of grapheme clusters in the NFC-normalized
    reference.

    :param reference: the reference text
    :param compared: the text compared against the reference
    :return: 0 when the distance is 0, ``float('inf')`` when the reference
        has no grapheme clusters but the distance is non-zero, otherwise
        distance / length
    """
    edit_distance = distance(reference, compared)
    if edit_distance == 0:
        # Identical texts: skip the length computation entirely.
        return 0

    normalized = unicodedata.normalize('NFC', reference)
    reference_length = sum(1 for _ in grapheme_clusters(normalized))
    if reference_length:
        return edit_distance / reference_length
    return float('inf')
def test_character_error_rate_between_page_files():
    """Compare the CER of two PAGE files against the known number of edits."""
    # In the fake OCR file, we changed 2 characters and replaced a fi ligature
    # with fi. The fi ligature does not count.
    gt_path = os.path.join(data_dir, 'test-gt.page2018.xml')
    ocr_path = os.path.join(data_dir, 'test-fake-ocr.page2018.xml')

    gt = page_text(ET.parse(gt_path))
    ocr = page_text(ET.parse(ocr_path))

    # The reference length is counted in grapheme clusters.
    gt_len = sum(1 for _ in grapheme_clusters(gt))
    expected_cer = 2 / gt_len

    assert character_error_rate(gt, ocr) == expected_cer
def grapheme_len(text):
    r"""Count the grapheme clusters in `text`.

    This is the length of `text` as it appears when printed, e.g. a base
    letter followed by a combining accent counts once::

        >>> s = 'A\u0302'
        >>> len(s)
        2
        >>> grapheme_len(s)
        1
    """
    return sum(1 for _ in grapheme_clusters(text))
def character_error_rate_n(reference, compared) -> Tuple[float, int]:
    """
    Compute the character error rate together with the reference length.

    The length is the number of grapheme clusters in the NFC-normalized
    reference.

    :return: character error rate and length of the reference
    """
    edit_distance = distance(reference, compared)
    normalized = unicodedata.normalize('NFC', reference)
    reference_length = sum(1 for _ in grapheme_clusters(normalized))

    if edit_distance == 0:
        rate = 0
    elif reference_length == 0:
        # Non-zero distance against an empty reference.
        rate = float('inf')
    else:
        rate = edit_distance / reference_length
    return rate, reference_length
def decode(d, spacing=''):
    """Decodes an emoji string to binary data

    The first emoji found selects the table version (its position in
    VERSION_EMOJI); every following emoji contributes 10 bits, taken from
    its position in EMOJI[version].

    Args:
        d: String to be decoded
        spacing: A character that was placed between each emoji. Default is
            none.

    Returns:
        Binary data as a long
    """
    # Block sizes of unicode change between py2 and 3
    if (sys.version_info > (3, 0)) or platform.system() == 'Linux':
        block_size = 2
    else:
        block_size = 4
    unit_size = block_size // 2

    result = 0
    decode_version = None
    version_seen = False

    clusters = list(gc.grapheme_clusters(d))
    for cluster in clusters:
        # spacing characters carry no data
        if cluster == spacing:
            continue

        # walk the cluster one block at a time
        for start in range(0, len(cluster), block_size):
            block = cluster[start:start + block_size]
            units = [
                block[offset:offset + unit_size]
                for offset in range(0, len(block), unit_size)
            ]
            # key of the emoji: its codepoints as 4-digit upper-case hex,
            # joined with dashes
            key = '-'.join(
                format(cp.ord(unit), 'x').zfill(4).upper() for unit in units)

            if not version_seen:
                # the very first emoji is the version number
                decode_version = VERSION_EMOJI.index(key)
                version_seen = True
                continue

            # do a version check
            if decode_version is None:
                raise ValueError("No version information was found")
            # decode the emoji through the map and push it onto the result
            result = (result << 10) + EMOJI[decode_version].index(key)

    return result
def get_char_tensor(self, X):
    """Encode every word of every batch row as a padded list of vocab indices.

    Each row of `X` is turned into a sentence with `self.tensortosent`, split
    on whitespace, and every word is broken into characters
    (`self.char_level` true) or grapheme clusters (otherwise). The pieces are
    looked up in the matching field vocabulary of `self.dataloader`, and all
    index lists are right-padded with 0 to a common length, which is at least
    `max(self.conv_filter_sizes)`.

    :param X: batch whose first dimension is iterated row by row
    :return: tensor built with `torch.from_numpy` from the padded indices
    """
    encoded_rows = []
    longest = 0

    for row in range(X.shape[0]):
        words = self.tensortosent(X[row]).split()

        if self.char_level:
            # character-level: one index per Python character
            pieces_per_word = [list(word) for word in words]
            encoded = [
                [self.dataloader.char_field.vocab.stoi[piece] for piece in pieces]
                for pieces in pieces_per_word
            ]
        else:
            # grapheme-level: one index per grapheme cluster
            pieces_per_word = [list(grapheme_clusters(word)) for word in words]
            encoded = [
                [self.dataloader.graph_field.vocab.stoi[piece] for piece in pieces]
                for pieces in pieces_per_word
            ]

        longest = max(longest, max(map(len, encoded)))
        encoded_rows.append(encoded)

    # Words must be at least as long as the widest convolution filter.
    longest = max(longest, max(self.conv_filter_sizes))

    # Right-pad every word with zeros so all of them have equal size.
    padded = [
        [indices + [0] * (longest - len(indices)) for indices in encoded]
        for encoded in encoded_rows
    ]
    return torch.from_numpy(np.array(padded))
def _grapheme_len(text, fail_with_zero=False):
    r"""Count the grapheme clusters in `text`.

    This is the length of `text` as it appears when printed, e.g. a base
    letter followed by a combining accent counts once::

        >>> s = 'A\u0302'
        >>> len(s)
        2
        >>> _grapheme_len(s)
        1

    When `fail_with_zero` is true, a TypeError raised while counting (e.g.
    because `text` is not a string) is turned into a result of 0; otherwise
    the TypeError propagates.
    """
    try:
        count = sum(1 for _ in grapheme_clusters(text))
    except TypeError:
        if not fail_with_zero:
            raise
        return 0
    return count
def get_strs_and_emojis_for_text(text, emoji_size=(38, 38)):
    """Split `text` into an ordered list of text pieces and emoji images.

    The text is walked grapheme cluster by grapheme cluster. A cluster for
    which `get_emoji_im_for_unicode` yields an image is represented by that
    image; any other cluster is kept as its text. A cluster whose image
    filename is listed in `EmojiFilesCache.TONE_MODIFIER_MAP` is treated as a
    tone modifier: if the previous element is an image loaded from a
    ``*.0.png`` file, that element is replaced by the tone-specific variant
    loaded from `EMOJI_FILES_PATH`.

    Args:
        text: the text to split.
        emoji_size: (width, height) bound passed to the emoji lookup and used
            when scaling a tone-specific variant.

    Returns:
        A list with one entry per grapheme cluster (minus merged modifiers),
        each entry being either the cluster's text or an image object.
    """
    # NOTE(review): the original ended with a pass that merged consecutive
    # text entries into `result`, but the function returned `vals`, so that
    # pass was dead code (and relied on the Python-2-only `basestring`). It
    # is removed here; the returned value is unchanged.
    vals = []
    for txt in gc.grapheme_clusters(text):
        image_file, image_filename = get_emoji_im_for_unicode(txt, emoji_size)
        modifier = EmojiFilesCache.TONE_MODIFIER_MAP.get(image_filename)

        # A modifier can only be merged into a preceding image. Plain text
        # has no `filename`, and reading it used to raise AttributeError.
        if modifier and vals and hasattr(vals[-1], 'filename'):
            filename = os.path.basename(vals[-1].filename)
            if filename.endswith(".0.png"):
                # Swap the neutral ".0" variant for the tone-specific file.
                filename = filename[:-len(".0.png")]
                filename = "%s.%s.png" % (filename, modifier)
                filepath = os.path.join(EMOJI_FILES_PATH, filename)
                emoji_im = Image.open(filepath)
                emoji_im.thumbnail(emoji_size, Image.ANTIALIAS)
                vals[-1] = emoji_im
            else:
                vals.append(image_file)
        elif image_file:
            vals.append(image_file)
        else:
            vals.append(txt)

    return vals
import io
import json

from uniseg.graphemecluster import grapheme_clusters

# Compare the size of all post texts measured in code points (len) with the
# size measured in grapheme clusters. Written to run unchanged on Python 2
# and Python 3: no print statement, no dict.iteritems().

# io.open decodes explicitly as UTF-8 instead of relying on the locale.
with io.open('posts.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

total = 0
grapheme_total = 0
for post in data.values():
    for field in ('selftext', 'title'):
        text = post['data'][field]
        grapheme_total += sum(1 for _ in grapheme_clusters(text))
        total += len(text)

# Same output as the former `print total, grapheme_total`.
print('%d %d' % (total, grapheme_total))
def __init__(self, config, k):
    """Load the tagging datasets found under `config.root_path`/`k` and build all vocabularies.

    Reads `train.txt`, `val.txt` and `test.txt` (tab separated) with
    `SequenceTaggingDataset.splits`, collects the character and grapheme
    cluster inventories of every token in all three splits, and builds the
    text, label, character, grapheme and (optionally) POS vocabularies.

    :param config: object providing `root_path`, `batch_size`, `device`,
        `use_pos`, `emb_dir`, `emb_file`, `char_pretrained`,
        `char_emb_file`, `graph_emb_file` and `verbose`
    :param k: sub-directory of `config.root_path` holding the data files
    """
    self.root_path = os.path.join(config.root_path, k)
    self.batch_size = config.batch_size
    self.device = config.device
    self.use_pos = config.use_pos

    self.txt_field = data.Field(tokenize=list, use_vocab=True,
                                unk_token='<unk>', batch_first=True)
    self.label_field = data.Field(unk_token=None, batch_first=True)
    self.char_field = data.Field(unk_token='<unk>', sequential=False)
    self.graph_field = data.Field(unk_token='<unk>', sequential=False)

    if config.use_pos:
        self.pos_field = data.Field(unk_token=None, batch_first=True)
        self.fields = (('TEXT', self.txt_field),
                       ('POS', self.pos_field),
                       ('LABEL', self.label_field))
    else:
        self.fields = (('TEXT', self.txt_field),
                       ('LABEL', self.label_field))

    self.train_ds, self.val_ds, self.test_ds = SequenceTaggingDataset.splits(
        path=self.root_path,
        fields=self.fields,
        separator='\t',
        train='train.txt',
        validation='val.txt',
        test='test.txt')

    # Collect the distinct characters and grapheme clusters directly into
    # sets rather than growing lists with one entry per occurrence.
    chars = set()
    graphemes = set()
    all_examples = (self.train_ds.examples + self.test_ds.examples
                    + self.val_ds.examples)
    for example in all_examples:
        for token in example.TEXT:
            chars.update(token)
            graphemes.update(grapheme_clusters(token))
    self.char_list = sorted(chars)
    self.graph_list = sorted(graphemes)

    self.embedding_dir = config.emb_dir
    self.vec = vocab.Vectors(name=config.emb_file, cache=self.embedding_dir)

    self.txt_field.build_vocab(self.train_ds, self.test_ds, self.val_ds,
                               max_size=None, vectors=self.vec)
    self.label_field.build_vocab(self.train_ds.LABEL, self.test_ds.LABEL,
                                 self.val_ds.LABEL)

    # The character/grapheme vocabularies are built exactly once here. The
    # previous extra unconditional build was overwritten by this one anyway.
    if config.char_pretrained:
        self.char_vec = vocab.Vectors(name=config.char_emb_file,
                                      cache=self.embedding_dir)
        self.graph_vec = vocab.Vectors(name=config.graph_emb_file,
                                       cache=self.embedding_dir)
        self.char_field.build_vocab(self.char_list, vectors=self.char_vec)
        self.graph_field.build_vocab(self.graph_list, vectors=self.graph_vec)
    else:
        self.char_field.build_vocab(self.char_list)
        self.graph_field.build_vocab(self.graph_list)

    self.vocab_size = len(self.txt_field.vocab)
    self.tagset_size = len(self.label_field.vocab)
    self.char_vocab_size = len(self.char_field.vocab)
    self.graph_vocab_size = len(self.graph_field.vocab)

    self.weights = self.txt_field.vocab.vectors
    self.char_weights = self.char_field.vocab.vectors
    self.graph_weights = self.graph_field.vocab.vectors

    if config.use_pos:
        self.pos_field.build_vocab(self.train_ds.POS, self.test_ds.POS,
                                   self.val_ds.POS)
        # Because len(pos) = 56 and len(pos_field.vocab) = 55
        self.pos_size = len(self.pos_field.vocab) + 2
        self.pos_one_hot = np.eye(self.pos_size)
        self.one_hot_weight = torch.from_numpy(self.pos_one_hot).float()

    if config.verbose:
        self.print_stat()
def split_sentence(sentence):
    """Return the grapheme clusters of `sentence`, dropping the long ones.

    Clusters longer than 4 (as measured by `len`) are left out; the order
    of the remaining clusters is preserved.
    """
    max_cluster_len = 4
    kept = []
    for cluster in grapheme_clusters(sentence):
        if len(cluster) > max_cluster_len:
            continue
        kept.append(cluster)
    return kept
def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
    """Render a two-column HTML diff of two aligned sequences.

    The inputs are aligned with `seq_align`. Every aligned item is rendered
    on its side of the report; items that differ are wrapped in a
    ``<span>`` carrying the classes ``{css_prefix}diff{k} diff`` (k being
    the index of the aligned pair), and a missing item is shown as `none`
    with the extra class ``ellipsis``.

    :param gt_in: an ExtractedText, or an already split sequence of items
    :param ocr_in: same kind of object as `gt_in`
    :param css_prefix: prefix of the per-pair CSS class
    :param joiner: string emitted in front of every rendered item
    :param none: HTML shown in place of a missing item
    :return: the HTML fragment with both columns
    :raises TypeError: if `gt_in` is an ExtractedText but `ocr_in` is not
    """

    def format_thing(t, css_classes=None, id_=None):
        """Render a single item, optionally wrapped in a classed span."""
        if t is None:
            html_t = none
            # css_classes may still be None here; `None += str` used to
            # raise TypeError.
            css_classes = (css_classes or '') + ' ellipsis'
        elif t == '\n':
            html_t = '<br>'
        else:
            html_t = escape(t)

        if not css_classes:
            return '{html_t}'.format(html_t=html_t)

        html_custom_attrs = ""
        # Set Bootstrap tooltip to the segment id
        if id_:
            html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_)
        return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(
            css_classes=css_classes,
            html_t=html_t,
            html_custom_attrs=html_custom_attrs)

    with_segments = isinstance(gt_in, ExtractedText)
    if with_segments:
        if not isinstance(ocr_in, ExtractedText):
            raise TypeError()
        # XXX splitting should be done in ExtractedText
        gt_things = list(grapheme_clusters(gt_in.text))
        ocr_things = list(grapheme_clusters(ocr_in.text))
    else:
        gt_things = gt_in
        ocr_things = ocr_in

    # Collect the pieces and join once instead of growing two strings with
    # `+=` on every aligned pair.
    gt_parts = []
    ocr_parts = []
    g_pos = 0
    o_pos = 0
    for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)):
        css_classes = None
        gt_id = None
        ocr_id = None
        if g != o:
            css_classes = '{css_prefix}diff{k} diff'.format(
                css_prefix=css_prefix, k=k)
            if with_segments:
                # Deletions and inserts only produce one id + None, UI must
                # support this, i.e. display for the one id produced
                if g is not None:
                    gt_id = gt_in.segment_id_for_pos(g_pos)
                if o is not None:
                    ocr_id = ocr_in.segment_id_for_pos(o_pos)

        gt_parts.append(joiner + format_thing(g, css_classes, gt_id))
        ocr_parts.append(joiner + format_thing(o, css_classes, ocr_id))

        # Positions advance by the item's length on the side it exists.
        if g is not None:
            g_pos += len(g)
        if o is not None:
            o_pos += len(o)

    gtx = ''.join(gt_parts)
    ocrx = ''.join(ocr_parts)
    return ''' <div class="row"> <div class="col-md-6 gt">{}</div> <div class="col-md-6 ocr">{}</div> </div> '''.format(gtx, ocrx)
def editops(word1, word2):
    """Return the edit operations between the grapheme clusters of two words.

    Both words are NFC-normalized before being split into clusters.
    """
    # XXX Note that this returns indices to the _grapheme clusters_, not characters!
    clusters1, clusters2 = (
        list(grapheme_clusters(unicodedata.normalize('NFC', word)))
        for word in (word1, word2)
    )
    return seq_editops(clusters1, clusters2)
def extract_emojis(str):
    """Return the grapheme clusters of `str` as a list, in order.

    NOTE(review): the parameter shadows the builtin `str`; the name is kept
    because callers may pass it by keyword.
    """
    clusters = gc.grapheme_clusters(str)
    return [cluster for cluster in clusters]
print datetime.datetime.now(), "Fetching Messages from file" with open(LOCAL_FILE_NAME) as f: data = json.load(f) filt = filter(lambda x: x['time'] > "2018-12-09 00:00:00", data) print datetime.datetime.now(), "Extracting Emojis" results = [] for msg in filt: if msg['message'] == None: continue done = [] emjs = [x.encode('utf-8') for x in gc.grapheme_clusters(msg['message'])] for aaa in emjs: if len(aaa) < 4: continue a = aaa m = json.dumps({"k": a}) # print(len(a),json.loads(m)["k"]) # done += [json.loads(m)["k"].encode("utf-8")] emjs = done results += [{"emojis": emjs, "time": msg['time'], "who": msg['from']}] print datetime.datetime.now(), "People people frequencies" people = {} for t in results: # ADD PERSON IF NOT IN CACHE
import io
import json

from uniseg.graphemecluster import grapheme_clusters

# Print every distinct grapheme cluster found in the post texts, next to its
# repr. Written to run unchanged on Python 2 and Python 3: no print
# statement, no dict.iteritems().

# io.open decodes explicitly as UTF-8 instead of relying on the locale.
with io.open('posts.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

clusters = set()
for post in data.values():
    clusters.update(grapheme_clusters(post['data']['selftext']))
    clusters.update(grapheme_clusters(post['data']['title']))

for cluster in clusters:
    # Same layout as the former three chained print statements:
    # repr, space, ': ', space, cluster.
    print('%s :  %s' % (repr(cluster), cluster))