Example #1
def distance(s1: str, s2: str):
    """Compute the Levenshtein edit distance between two Unicode strings

    Note that this is different from levenshtein() as this function knows about Unicode normalization and grapheme
    clusters. This should be the correct way to compare two Unicode strings.
    """
    seq1 = list(grapheme_clusters(unicodedata.normalize('NFC', s1)))
    seq2 = list(grapheme_clusters(unicodedata.normalize('NFC', s2)))
    return levenshtein(seq1, seq2)
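The NFC normalization and grapheme-cluster segmentation are what make this comparison robust. A minimal sketch of the intended behavior, assuming the distance() function above with uniseg's grapheme_clusters in scope:

# Precomposed 'Â' (U+00C2) and decomposed 'A' + U+0302 are canonically
# equivalent, so they compare as identical:
assert distance('\u00c2', 'A\u0302') == 0

# Replacing 'n' with 'm̃' (m + combining tilde) counts as one
# grapheme-cluster substitution, not a substitution plus an insertion:
assert distance('Schlyn', 'Schlym\u0303') == 1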
Example #2
def editops(word1, word2):
    """
    Return sequence of edit operations transforming one string to another.

    Note that this returns indices to the _grapheme clusters_, not characters!
    """
    word1 = list(grapheme_clusters(unicodedata.normalize('NFC', word1)))
    word2 = list(grapheme_clusters(unicodedata.normalize('NFC', word2)))
    return seq_editops(word1, word2)
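For illustration, here is roughly what the cluster-indexed result looks like, assuming seq_editops() yields (operation, source_index, target_index) tuples in the usual Levenshtein editops style:

ops = editops('Schly\u00f1', 'Schlym\u0303')
# With the assumption above, ops is a single substitution at grapheme-cluster
# index 5, even though 'm̃' spans two code points: [('replace', 5, 5)]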
Example #3
def test_align():
    """
    Test aligning by character while retaining segment id info

    The difficulty here is that aligning should work on grapheme clusters,
    not Python characters.
    """

    test1 = ExtractedText(
        None,
        [
            ExtractedText("s0", None, None, "foo"),
            ExtractedText("s1", None, None, "bar"),
            ExtractedText("s2", None, None, "batzinga"),
        ],
        " ",
        None,
    )
    test2 = ExtractedText(
        None,
        [
            ExtractedText("x0", None, None, "foo"),
            ExtractedText("x1", None, None, "bar"),
            # extra .
            ExtractedText("x2", None, None, "."),
            # deletion + a different grapheme cluster; m̃ is also two Python characters
            ExtractedText("x3", None, None, "bazim̃ga"),
        ],
        " ",
        None,
    )

    left_pos = 0
    right_pos = 0
    alignment = []
    for left, right in seq_align(grapheme_clusters(test1.text),
                                 grapheme_clusters(test2.text)):
        left_id = test1.segment_id_for_pos(
            left_pos) if left is not None else None
        right_id = test2.segment_id_for_pos(
            right_pos) if right is not None else None
        el = AlignmentElement(left, right, left_id, right_id)
        alignment.append(el)
        if left is not None:
            left_pos += len(left)
        if right is not None:
            right_pos += len(right)

    print("test1: {}".format(test1.text))
    print("test2: {}".format(test2.text))

    assert alignment[0] == ("f", "f", "s0", "x0")
    assert alignment[8] == (None, ".", None, "x2")
    assert alignment[12] == ("t", None, "s2", None)
    assert alignment[15] == ("n", "m̃", "s2", "x3")
Example #5
def character_error_rate(reference, compared):
    """Compute the character error rate (CER) on grapheme clusters."""
    d = distance(reference, compared)
    if d == 0:
        return 0

    n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference))))
    if n == 0:
        return float('inf')

    return d / n
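In other words, CER = d / n, where d is the grapheme-cluster edit distance and n the grapheme-cluster length of the reference; rates above 1 are possible when the compared text is much longer than the reference. A quick sketch using the functions above:

# One substitution against a six-cluster reference:
assert character_error_rate('Schly\u00f1', 'Schlym\u0303') == 1 / 6

# Empty reference but non-empty comparison text:
assert character_error_rate('', 'x') == float('inf')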
Example #6
def test_character_error_rate_between_page_files():
    # In the fake OCR file, we changed 2 characters and replaced a ﬁ ligature with fi.
    # The ﬁ ligature does not count.
    gt = page_text(ET.parse(os.path.join(data_dir, 'test-gt.page2018.xml')))
    ocr = page_text(
        ET.parse(os.path.join(data_dir, 'test-fake-ocr.page2018.xml')))

    gt_len = len(list(grapheme_clusters(gt)))
    expected_cer = 2 / gt_len

    assert character_error_rate(gt, ocr) == expected_cer
Example #7
def grapheme_len(text):
    """Number of graphemes in `text`

    This is the length of the `text` when printed::
        >>> s = 'A\u0302'  # 'Â' as two code points: A + combining circumflex
        >>> len(s)
        2
        >>> grapheme_len(s)
        1
    """
    return len(list(grapheme_clusters(text)))
Example #8
def character_error_rate_n(reference, compared) -> Tuple[float, int]:
    """
    Compute character error rate.

    :return: character error rate and length of the reference
    """
    d = distance(reference, compared)
    n = len(list(grapheme_clusters(unicodedata.normalize('NFC', reference))))

    if d == 0:
        return 0, n
    if n == 0:
        return float('inf'), n
    return d / n, n
Example #9
def decode(d, spacing=''):
    """Decodes an emoji string to binary data

    Args:
        d: String to be decoded
        spacing: A character that was placed between each emoji. Default is none.

    Returns:
        Binary data as an integer
    """
    # start the return data
    r = 0
    decode_version = None
    i = 0
    # for each grapheme cluster found in the string
    for v in gc.grapheme_clusters(d):
        # if v is a spacing character, ignore it
        if v == spacing:
            continue

        # block sizes of unicode data change between py2 and py3
        if (sys.version_info > (3, 0)) or platform.system() == 'Linux':
            block_size = 2
        else:
            block_size = 4
        # walk the cluster in block_size chunks (2 emoji or less each)
        for dcode in [
                v[k:k + block_size] for k in range(0, len(v), block_size)
        ]:
            # join multiple codepoints together (2 bytes each)
            p = '-'.join(
                format(cp.ord(x), 'x').zfill(4).upper() for x in [
                    dcode[j:j + (block_size // 2)]
                    for j in range(0, len(dcode), block_size // 2)
                ])

            # the first emoji carries the version number
            if i == 0:
                decode_version = VERSION_EMOJI.index(p)
                i += 1
            else:
                # do a version check
                if decode_version is None:
                    raise ValueError("No version information was found")

                # decode the emoji through the map and push it onto the result
                r = (r << 10) + EMOJI[decode_version].index(p)

    # return the data
    return r
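The payload packing itself is simple: after the version marker, each emoji contributes its 10-bit index in the version's EMOJI table. A minimal round-trip sketch with hypothetical indices, not the real EMOJI map:

def pack(indices):
    # mirror of the decode loop above: 10 bits per symbol
    r = 0
    for idx in indices:
        r = (r << 10) + idx
    return r

def unpack(r, count):
    # peel the 10-bit groups back off, least significant first
    out = []
    for _ in range(count):
        out.append(r & 0x3FF)
        r >>= 10
    return list(reversed(out))

assert unpack(pack([3, 1023, 0, 42]), 4) == [3, 1023, 0, 42]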
Example #10
    def get_char_tensor(self, X):
        word_int = []
        length = 0

        # Go through each tensor in each batch
        for b in range(0, X.shape[0]):
            each_X = X[b]
            char_int = []
            w = []

            # For character-level
            if self.char_level:
                # Get all the characters
                w += (list(x) for x in self.tensortosent(each_X).split())

                # Get all the index of those characters
                char_int += ([
                    self.dataloader.char_field.vocab.stoi[c] for c in each
                ] for each in w)

            # For grapheme-level
            else:
                w += (list(grapheme_clusters(x))
                      for x in self.tensortosent(each_X).split())
                char_int += ([
                    self.dataloader.graph_field.vocab.stoi[c] for c in each
                ] for each in w)

            if length < max(map(len, char_int)):
                length = max(map(len, char_int))

            word_int.append(char_int)

        # Never pad shorter than the widest convolution filter
        length = max(length, max(self.conv_filter_sizes))

        # Make each tensor equal in size
        X_char = np.array([[xi + [0] * (length - len(xi)) for xi in each]
                           for each in word_int])

        # Convert to tensor from numpy array
        X_char = torch.from_numpy(X_char)

        return X_char
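The padding rule is the part worth noting: every word is zero-padded to the length of the longest word in the batch, but never shorter than the widest convolution filter, so the character CNN always has a full window. A tiny sketch with made-up sizes:

import numpy as np

word_int = [[[5, 2], [7, 1, 9, 4]]]   # one sentence with two words
conv_filter_sizes = [2, 3, 5]         # hypothetical filter widths

length = max(len(w) for sent in word_int for w in sent)  # 4
length = max(length, max(conv_filter_sizes))             # 5 (filter floor)

X_char = np.array([[w + [0] * (length - len(w)) for w in sent]
                   for sent in word_int])
# X_char.shape == (1, 2, 5); rows are [5, 2, 0, 0, 0] and [7, 1, 9, 4, 0]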
Example #11
def _grapheme_len(text, fail_with_zero=False):
    """Number of graphemes in `text`.

    This is the length of the `text` when printed::

        >>> s = 'A\u0302'  # 'Â' as two code points: A + combining circumflex
        >>> len(s)
        2
        >>> _grapheme_len(s)
        1

    If `fail_with_zero` is true, return 0 if `text` is not a string,
    instead of throwing a TypeError.
    """
    try:
        return len(list(grapheme_clusters(text)))
    except TypeError:
        if fail_with_zero:
            return 0
        raise
Example #12
def get_strs_and_emojis_for_text(text, emoji_size=(38, 38)):
    # Get ordered text to write
    vals = []
    for txt in gc.grapheme_clusters(text):
        image_file, image_filename = get_emoji_im_for_unicode(txt, emoji_size)

        modifier = EmojiFilesCache.TONE_MODIFIER_MAP.get(image_filename)
        if modifier and len(vals) >= 1:
            last_emoji = vals[-1]
            filename = os.path.basename(last_emoji.filename)
            if filename.endswith(".0.png"):
                filename = filename[:-len(".0.png")]
                filename = "%s.%s.png" % (filename, modifier)
                filepath = os.path.join(EMOJI_FILES_PATH, filename)
                emoji_im = Image.open(filepath)
                emoji_im.thumbnail(emoji_size, Image.LANCZOS)  # ANTIALIAS was removed in Pillow 10
                vals[-1] = emoji_im
            else:
                vals.append(image_file)
        else:
            if image_file:
                vals.append(image_file)
            else:
                vals.append(txt)

    # Collapse text into contiguous strings punctuated by images
    result = []
    prev_idx = 0
    for idx, val in enumerate(vals):
        if not isinstance(val, str):
            res = ''.join(vals[prev_idx:idx])
            if res:
                result.append(res)
            result.append(val)
            prev_idx = idx + 1

    # Don't drop any text trailing the last image
    res = ''.join(vals[prev_idx:])
    if res:
        result.append(res)

    return result
Example #13
import json
from uniseg.graphemecluster import grapheme_clusters

with open('posts.json', 'r') as f:
    data = json.load(f)

total = 0
grapheme_total = 0

for _, post in data.items():
    grapheme_total += len(list(grapheme_clusters(post['data']['selftext'])))
    grapheme_total += len(list(grapheme_clusters(post['data']['title'])))
    total += len(post['data']['selftext'])
    total += len(post['data']['title'])

print(total, grapheme_total)
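The gap between the two totals comes from multi-code-point clusters. A quick illustration of what the script measures:

from uniseg.graphemecluster import grapheme_clusters

s = 'm\u0303a'                           # 'm̃a': m + combining tilde + a
print(len(s))                            # 3 code points
print(len(list(grapheme_clusters(s))))   # 2 grapheme clusters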
Example #14
    def __init__(self, config, k):
        self.root_path = os.path.join(config.root_path, k)
        self.batch_size = config.batch_size
        self.device = config.device
        self.use_pos = config.use_pos

        self.txt_field = data.Field(tokenize=list,
                                    use_vocab=True,
                                    unk_token='<unk>',
                                    batch_first=True)
        self.label_field = data.Field(unk_token=None, batch_first=True)
        self.char_field = data.Field(unk_token='<unk>', sequential=False)
        self.graph_field = data.Field(unk_token='<unk>', sequential=False)

        self.fields = (('TEXT', self.txt_field), ('LABEL', self.label_field))

        if config.use_pos:
            self.pos_field = data.Field(unk_token=None, batch_first=True)
            self.fields = (('TEXT', self.txt_field), ('POS', self.pos_field),
                           ('LABEL', self.label_field))

        self.train_ds, self.val_ds, self.test_ds = SequenceTaggingDataset.splits(
            path=self.root_path,
            fields=self.fields,
            separator='\t',
            train='train.txt',
            validation='val.txt',
            test='test.txt')

        self.char_list = []
        self.graph_list = []
        for each in self.train_ds.examples + self.test_ds.examples + self.val_ds.examples:
            for x in each.TEXT:
                self.char_list += list(x)
                self.graph_list += list(grapheme_clusters(x))
        self.char_list = list(set(self.char_list))
        self.graph_list = list(set(self.graph_list))

        self.graph_list.sort()
        self.char_list.sort()

        self.char_field.build_vocab(self.char_list)
        self.graph_field.build_vocab(self.graph_list)

        self.embedding_dir = config.emb_dir
        self.vec = vocab.Vectors(name=config.emb_file,
                                 cache=self.embedding_dir)

        self.txt_field.build_vocab(self.train_ds,
                                   self.test_ds,
                                   self.val_ds,
                                   max_size=None,
                                   vectors=self.vec)
        self.label_field.build_vocab(self.train_ds.LABEL, self.test_ds.LABEL,
                                     self.val_ds.LABEL)

        if config.char_pretrained:
            self.char_vec = vocab.Vectors(name=config.char_emb_file,
                                          cache=self.embedding_dir)
            self.graph_vec = vocab.Vectors(name=config.graph_emb_file,
                                           cache=self.embedding_dir)

            self.char_field.build_vocab(self.char_list, vectors=self.char_vec)
            self.graph_field.build_vocab(self.graph_list,
                                         vectors=self.graph_vec)
        else:
            self.char_field.build_vocab(self.char_list)
            self.graph_field.build_vocab(self.graph_list)

        self.vocab_size = len(self.txt_field.vocab)
        self.tagset_size = len(self.label_field.vocab)
        self.char_vocab_size = len(self.char_field.vocab)
        self.graph_vocab_size = len(self.graph_field.vocab)

        self.weights = self.txt_field.vocab.vectors
        self.char_weights = self.char_field.vocab.vectors
        self.graph_weights = self.graph_field.vocab.vectors

        if config.use_pos:
            self.pos_field.build_vocab(self.train_ds.POS, self.test_ds.POS,
                                       self.val_ds.POS)
            # Because len(pos) = 56 and len(pos_field.vocab) = 55
            self.pos_size = len(self.pos_field.vocab) + 2
            self.pos_one_hot = np.eye(self.pos_size)
            self.one_hot_weight = torch.from_numpy(self.pos_one_hot).float()

        if config.verbose:
            self.print_stat()
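The reason for keeping separate char and grapheme vocabularies is that one grapheme can span several Python characters, so the two index spaces differ. A quick sketch of that split for a single token:

from uniseg.graphemecluster import grapheme_clusters

token = 'm\u0303a'                            # 'm̃a'
print(sorted(set(token)))                     # three chars: the combining tilde is separate
print(sorted(set(grapheme_clusters(token))))  # two clusters: ['a', 'm̃']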
Example #15
def split_sentence(sentence):
    # Split into grapheme clusters, dropping any cluster longer than four
    # code points (e.g. long emoji ZWJ sequences).
    clusters = grapheme_clusters(sentence)
    return [cluster for cluster in clusters if len(cluster) <= 4]
Example #16
def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
    gtx = ''
    ocrx = ''

    def format_thing(t, css_classes=None, id_=None):
        if t is None:
            html_t = none
            css_classes = (css_classes or '') + ' ellipsis'
        elif t == '\n':
            html_t = '<br>'
        else:
            html_t = escape(t)

        html_custom_attrs = ""

        # Set Bootstrap tooltip to the segment id
        if id_:
            html_custom_attrs += 'data-toggle="tooltip" title="{}"'.format(id_)

        if css_classes:
            return '<span class="{css_classes}" {html_custom_attrs}>{html_t}</span>'.format(
                css_classes=css_classes,
                html_t=html_t,
                html_custom_attrs=html_custom_attrs)
        else:
            return '{html_t}'.format(html_t=html_t)

    if isinstance(gt_in, ExtractedText):
        if not isinstance(ocr_in, ExtractedText):
            raise TypeError('gt_in and ocr_in must be of the same type')
        # XXX splitting should be done in ExtractedText
        gt_things = list(grapheme_clusters(gt_in.text))
        ocr_things = list(grapheme_clusters(ocr_in.text))
    else:
        gt_things = gt_in
        ocr_things = ocr_in

    g_pos = 0
    o_pos = 0
    for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)):
        css_classes = None
        gt_id = None
        ocr_id = None
        if g != o:
            css_classes = '{css_prefix}diff{k} diff'.format(
                css_prefix=css_prefix, k=k)
            if isinstance(gt_in, ExtractedText):
                gt_id = gt_in.segment_id_for_pos(
                    g_pos) if g is not None else None
                ocr_id = ocr_in.segment_id_for_pos(
                    o_pos) if o is not None else None
                # Deletions and inserts only produce one id + None, UI must
                # support this, i.e. display for the one id produced

        gtx += joiner + format_thing(g, css_classes, gt_id)
        ocrx += joiner + format_thing(o, css_classes, ocr_id)

        if g is not None:
            g_pos += len(g)
        if o is not None:
            o_pos += len(o)

    return \
        '''
        <div class="row">
           <div class="col-md-6 gt">{}</div>
           <div class="col-md-6 ocr">{}</div>
        </div>
        '''.format(gtx, ocrx)
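When the inputs are plain sequences instead of ExtractedText, the grapheme splitting is skipped and the sequences are aligned as-is. A hedged usage sketch:

html = gen_diff_report(list('foo'), list('fo'),
                       css_prefix='l', joiner='', none='\u00b7')
# The deleted 'o' should appear on the OCR side as the '·' placeholder,
# wrapped in a span with classes like "ldiff2 diff ellipsis".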
Example #17
def editops(word1, word2):
    # XXX Note that this returns indices to the _grapheme clusters_, not characters!
    word1 = list(grapheme_clusters(unicodedata.normalize('NFC', word1)))
    word2 = list(grapheme_clusters(unicodedata.normalize('NFC', word2)))
    return seq_editops(word1, word2)
Example #18
def extract_emojis(text):
    return list(gc.grapheme_clusters(text))
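Despite its name, this returns every grapheme cluster of the input, not only emoji. A small sketch, assuming the usual import uniseg.graphemecluster as gc:

# Combining sequences stay together; plain letters come back one by one:
assert extract_emojis('hi m\u0303') == ['h', 'i', ' ', 'm\u0303']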
Example #19
print(datetime.datetime.now(), "Fetching Messages from file")

with open(LOCAL_FILE_NAME) as f:
    data = json.load(f)

filt = filter(lambda x: x['time'] > "2018-12-09 00:00:00", data)

print(datetime.datetime.now(), "Extracting Emojis")

results = []
for msg in filt:
    if msg['message'] is None:
        continue

    # keep only clusters that need 4+ bytes in UTF-8 (emoji and other
    # astral-plane clusters)
    emjs = [x for x in gc.grapheme_clusters(msg['message'])
            if len(x.encode('utf-8')) >= 4]
    results += [{"emojis": emjs, "time": msg['time'], "who": msg['from']}]

print(datetime.datetime.now(), "Computing people frequencies")

people = {}
for t in results:
    # ADD PERSON IF NOT IN CACHE
Example #20
import json
from uniseg.graphemecluster import grapheme_clusters

with open('posts.json', 'r') as f:
    data = json.load(f)

clusters = set()

for _, post in data.items():
    clusters.update(grapheme_clusters(post['data']['selftext']))
    clusters.update(grapheme_clusters(post['data']['title']))

for cluster in clusters:
    print(repr(cluster), ':', cluster)