def __init__(self, language="es"): """ Init method :param language: input language """ self.__stemmer = snowballstemmer.stemmer("spanish") Token.set_extension("stem", default="", force=True)
def __init__(self):
    super().__init__()
    if not Doc.has_extension(self.name):
        Doc.set_extension(self.name, default=[])
    if not Token.has_extension('is_lexical'):
        Token.set_extension('is_lexical', default=False)
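For context, a pipeline component that registers extensions like this usually also implements `__call__`; the sketch below is a hedged illustration only, and the lexical test it uses is a placeholder rather than the original project's logic.

# Illustrative __call__ for a component that registers Doc/Token extensions.
# The is_alpha/is_stop check is a placeholder for the real lexicality test.
def __call__(self, doc):
    matches = []
    for token in doc:
        if token.is_alpha and not token.is_stop:
            token._.is_lexical = True      # per-token flag registered in __init__
            matches.append(token)
    doc._.set(self.name, matches)          # Doc-level extension named after the component
    return doc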
def enable_spacy_extensions():
    """Enables custom extensions for spaCy for dealing with citations."""
    Token.set_extension('is_in_text_citation', default=False, force=True)
    Span.set_extension('tokens_without_citations', getter=get_span_tokens_without_citations, force=True)
    Span.set_extension('text_without_citations', getter=get_span_text_without_citations, force=True)
    Span.set_extension('text_with_ws_without_citations', getter=get_span_text_with_ws_wo_cites, force=True)
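A minimal sketch of how two of these getters might look, assuming the `is_in_text_citation` flag has already been set on citation tokens elsewhere; the function names match the ones registered above, but the bodies are illustrative, not the project's actual implementation.

# Hypothetical getter sketches: drop tokens flagged as in-text citations.
def get_span_tokens_without_citations(span):
    return [token for token in span if not token._.is_in_text_citation]


def get_span_text_without_citations(span):
    # Rebuild the text from the surviving tokens (whitespace is approximate).
    return ' '.join(token.text for token in span if not token._.is_in_text_citation)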
def __init__(self, nlp, name, source: str = None, domain: str = None):
    Token.set_extension(BabelnetAnnotator.__FIELD, default=None, force=True)
    self.__lang = nlp.lang
    self.__bn_lang = bn.Language.fromISO(nlp.lang)
    self.__source = source
    self.__domain = domain
    self.__bn_domain = None
    if domain:
        self.__bn_domain = bn.BabelDomain.valueOfName(domain)
    self.__bn_source = None
    if source:
        self.__bn_source = getattr(bn.BabelSenseSource, source)
def text_to_instance(self, source_string: str, target_string: str = None,
                     paragraph_id: str = None, turn_id: int = 0) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ
    tokenized_source = self._tokenizer.tokenize(source_string)
    tokenized_source.insert(0, Token(START_SYMBOL))
    tokenized_source.append(Token(END_SYMBOL))
    source_field = TextField(tokenized_source, self._token_indexers)
    if target_string is not None:
        tokenized_target = self._tokenizer.tokenize(target_string)
        tokenized_target.insert(0, Token(START_SYMBOL))
        tokenized_target.append(Token(END_SYMBOL))
        target_field = TextField(tokenized_target, self._token_indexers)
        # When a target is provided, include it in the instance alongside the source.
        return Instance({"source_tokens": source_field, "target_tokens": target_field})
    else:
        return Instance({"source_tokens": source_field})
def __init__(self, language: str = "es"): """ Init method :param language: language of the annotation """ self.__sentiment_words = load_dict(language, "sentiment_words.csv") self.__boosters = load_dict(language, "boosters.csv") self.__negations = load_dict(language, "negations.csv") Span.set_extension("sentiment_weight", default=0.0, force=True) Token.set_extension("sentiment_weight", default=0.0, force=True) Token.set_extension("negation_weight", default=1.0, force=True) Token.set_extension("booster_weight", default=0.0, force=True)
def __init__(self, nlp, name):
    self.__lang = nlp.lang
    Token.set_extension(WordnetAnnotator.__FIELD, default=None, force=True)
    load_wordnet_domains()
def _set_spacy_extensions(self):
    def synset_getter(token):
        if not Disambiguator._wn:
            from nltk.corpus import wordnet as wn
            Disambiguator._wn = wn
        else:
            wn = Disambiguator._wn
        offset = token._.offset
        if offset:
            return wn.synset_from_pos_and_offset(offset[-1], int(offset[3:-1]))
        else:
            return None

    def bnid_getter(token):
        return babelnet_map.get(token._.offset)

    from spacy.tokens import Doc, Token
    Doc.set_extension('lang', default='en')
    Token.set_extension('lemma_preset_', default=None)
    Token.set_extension('pos_preset_', default=None)
    Token.set_extension('lemma_preset_else_spacy', getter=lambda t: t._.lemma_preset_ or t.lemma_)
    Token.set_extension('pos_preset_else_spacy', getter=lambda t: t._.pos_preset_ or t.pos_)
    Token.set_extension('offset', default=None)
    Token.set_extension('synset', getter=synset_getter)
    Token.set_extension('bnid', getter=bnid_getter)
    Token.set_extension('disambiguator_internals', default=None)
def __init__(self, lang: str = 'es'):
    Token.set_extension(WordnetAnnotator.__FIELD, default=None, force=True)
    load_wordnet_domains()
    self.__lang = lang
import spacy
from spacy.tokens.token import Token

nlp = spacy.load('en_core_web_lg')

input_file = '../../../tasks/02-structural-linguistics/examiner-headlines.txt'
output_file = './2.1-correct_headlines.txt'
big_letter_tag = ['NN', 'PRP', 'VB', 'JJ', 'RB']

Token.set_extension('need_capitalize', default=False)


def _check_token_needs_capitalizing(token):
    pos_big_letter = any(token.tag_.startswith(tag) for tag in big_letter_tag)
    # distinguish prepositions from subordinate conjunctions
    pos_conjunction = token.pos_ == 'ADP' and len(list(token.children)) == 0
    # hyphen is handled separately
    return pos_big_letter or pos_conjunction


def mark_doc(line: str):
    doc = nlp(line)
    # first word capitalization
    doc[0]._.set('need_capitalize', True)
    for token in doc[1:-1]:
        if token.text != '-':
            needs_capitalization = _check_token_needs_capitalizing(token)
            if needs_capitalization:
                token._.set('need_capitalize', True)
    return doc
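A hedged continuation showing how the `need_capitalize` flags could be turned into a corrected headline; the helper below and the sample call are illustrative additions, not part of the original script.

# Illustrative use of the flags set by mark_doc: capitalize flagged tokens and
# preserve each token's trailing whitespace when rebuilding the headline.
def correct_headline(line: str) -> str:
    doc = mark_doc(line)
    pieces = []
    for token in doc:
        text = token.text.capitalize() if token._.need_capitalize else token.text
        pieces.append(text + token.whitespace_)
    return ''.join(pieces)


# Example with a hypothetical headline:
# correct_headline("the cat sat on the mat")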
import spacy
from blingfire import text_to_sentences
from spacy.tokens import Span, Doc
from spacy.tokens.token import Token

from data_model import span_to_dict
from relationships_resolver import SimpleResolutionResolver, VicinityResolutionResolver

Span.set_extension('id', default=None, force=True)
Span.set_extension('links', default=[], force=True)
Span.set_extension('linkable', default=False, force=True)
Span.set_extension('bounding_boxes', default=[], force=True)
Span.set_extension('formattedText', default="", force=True)

Token.set_extension('id', default=None, force=True)
Token.set_extension('links', default=[], force=True)
Token.set_extension('linkable', default=False, force=True)
Token.set_extension('bounding_boxes', default=[], force=True)
Token.set_extension('formattedText', default="", force=True)


def decode(response):
    try:
        return response.json()
    except ValueError as e:
        return "Error: " + str(e)


def entities_classes():
    return [
def __init__(self, nlp, name, K=10, xnet='wordnet'):
    self.__lang = nlp.lang
    self.__K = K
    Token.set_extension(self.__FIELD, default=None, force=True)
    Ares.load_data(nlp.lang, xnet)
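For annotator classes like the ones above, wiring them into a spaCy v3 pipeline typically goes through a component factory; the sketch below is an assumption for illustration, and the component name, class name, and config keys are hypothetical.

# Illustrative registration only: wrap an annotator class as a spaCy v3 component.
from spacy.language import Language

@Language.factory("ares_annotator", default_config={"K": 10, "xnet": "wordnet"})
def create_ares_annotator(nlp, name, K, xnet):
    return AresAnnotator(nlp, name, K=K, xnet=xnet)  # hypothetical class name

# nlp = spacy.load("en_core_web_sm")
# nlp.add_pipe("ares_annotator", last=True)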