# coding: utf8 from __future__ import unicode_literals from spacy.tokens import Doc, Span, Token from spacy.matcher import PhraseMatcher from emoji import UNICODE_EMOJI from .about import __version__ # make sure multi-character emoji don't contain whitespace EMOJI = {e.replace(' ', ''): t for e, t in UNICODE_EMOJI.items()} class Emoji(object): """spaCy v2.0 pipeline component for adding emoji meta data to `Doc` objects. Detects emoji consisting of one or more unicode characters, and can optionally merge multi-char emoji (combined pictures, emoji with skin tone modifiers) into one token. Emoji are matched using spaCy's `PhraseMatcher`, and looked up in the data table provided by the "emoji" package: https://github.com/carpedm20/emoji USAGE: >>> import spacy >>> from spacymoji import Emoji >>> nlp = spacy.load('en') >>> emoji = Emoji(nlp) >>> nlp.add_pipe(emoji, first=True) >>> doc = nlp(u"This is a test 😻 👍🏿") >>> assert doc._.has_emoji == True >>> assert doc[2:5]._.has_emoji == True >>> assert doc[0]._.is_emoji == False
from typing import Dict, Optional, Tuple, List, Union from spacy.tokens import Doc, Span, Token from spacy.matcher import PhraseMatcher from spacy.language import Language from spacy.util import filter_spans from emoji import UNICODE_EMOJI # Make sure multi-character emoji don't contain whitespace EMOJI = {e.replace(" ", ""): t for e, t in UNICODE_EMOJI.items()} DEFAULT_ATTRS = ("has_emoji", "is_emoji", "emoji_desc", "emoji") DEFAULT_CONFIG = { "merge_spans": True, "lookup": {}, "pattern_id": "EMOJI", "attrs": DEFAULT_ATTRS, "force_extension": True, } @Language.factory("emoji", default_config=DEFAULT_CONFIG) def create_emoji( nlp: Language, name: str, merge_spans: bool = True, lookup: Optional[Dict[str, str]] = None, pattern_id: str = "EMOJI", attrs: Tuple[str, str, str, str] = DEFAULT_ATTRS, force_extension: bool = True, ):