Ejemplo n.º 1
0
# coding: utf8
from __future__ import unicode_literals

from spacy.tokens import Doc, Span, Token
from spacy.matcher import PhraseMatcher
from emoji import UNICODE_EMOJI

from .about import __version__

# Lookup table derived from the "emoji" package data: every space character is
# stripped from each key so that multi-character emoji are stored as a single
# unbroken string. The values are carried over unchanged.
EMOJI = dict(
    (symbol.replace(' ', ''), value) for symbol, value in UNICODE_EMOJI.items()
)


class Emoji(object):
    """spaCy v2.0 pipeline component for adding emoji meta data to `Doc` objects.
    Detects emoji consisting of one or more unicode characters, and can
    optionally merge multi-char emoji (combined pictures, emoji with skin tone
    modifiers) into one token. Emoji are matched using spaCy's `PhraseMatcher`,
    and looked up in the data table provided by the "emoji" package:
    https://github.com/carpedm20/emoji

    USAGE:
        >>> import spacy
        >>> from spacymoji import Emoji
        >>> nlp = spacy.load('en')
        >>> emoji = Emoji(nlp)
        >>> nlp.add_pipe(emoji, first=True)
        >>> doc = nlp(u"This is a test 😻 👍🏿")
        >>> assert doc._.has_emoji == True
        >>> assert doc[2:5]._.has_emoji == True
        >>> assert doc[0]._.is_emoji == False
Ejemplo n.º 2
0
from typing import Dict, Optional, Tuple, List, Union
from spacy.tokens import Doc, Span, Token
from spacy.matcher import PhraseMatcher
from spacy.language import Language
from spacy.util import filter_spans
from emoji import UNICODE_EMOJI

# Re-key the "emoji" package table with all space characters removed, so that
# multi-character emoji are stored as one contiguous string. Values are kept
# exactly as provided.
EMOJI = {
    symbol.replace(" ", ""): value
    for symbol, value in UNICODE_EMOJI.items()
}

# Default value of the `attrs` setting: a fixed 4-tuple of attribute names.
DEFAULT_ATTRS = (
    "has_emoji",
    "is_emoji",
    "emoji_desc",
    "emoji",
)

# Defaults handed to the "emoji" factory registration as `default_config`.
# The keys mirror the keyword parameters of `create_emoji`. Note that `lookup`
# defaults to an empty dict here, while the function signature uses None.
DEFAULT_CONFIG = dict(
    merge_spans=True,
    lookup={},
    pattern_id="EMOJI",
    attrs=DEFAULT_ATTRS,
    force_extension=True,
)


@Language.factory("emoji", default_config=DEFAULT_CONFIG)
def create_emoji(
    nlp: Language,
    name: str,
    merge_spans: bool = True,
    lookup: Optional[Dict[str, str]] = None,
    pattern_id: str = "EMOJI",
    attrs: Tuple[str, str, str, str] = DEFAULT_ATTRS,
    force_extension: bool = True,
):