def run(self, input: List(Flags())) -> Flags():
    result = {}
    for d in input:
        result.update(d)
    return result
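# Behaviour note (inferred from the body above, not stated in the original
# source): merging uses plain dict.update semantics, so when two flag dicts
# share a key the later one in the input list wins, e.g.
#
#   [{"len": 3, "is_upper": True}, {"len": 5}]  ->  {"len": 5, "is_upper": True}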
def __init__(
    self,
    extractors: Distinct(
        algorithm(Word(), Flags()), exceptions=["MultipleFeatureExtractor"]
    ),
    merger: algorithm(List(Flags()), Flags()),
):
    self.extractors = extractors
    self.merger = merger
def _build_pipeline():
    builder = build_pipelines(
        input=Tuple(List(List(Flags())), List(List(Category()))),
        output=List(List(Category())),
        registry=find_classes(include="CRF"),
    )
    return builder.sample(sampler=Sampler(random_state=0))
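# Usage sketch (assumptions flagged inline): `_build_pipeline()` returns the
# pipeline sampled from the CRF-restricted space. Assuming the sampled
# pipeline object exposes a `run` method that accepts the declared input
# type, training/inference would look roughly like the following; `X` (a list
# of sentences as lists of flag dicts) and `y` (a list of sentences as lists
# of categories) are placeholder names, not part of the original source.
#
#   pipeline = _build_pipeline()
#   predictions = pipeline.run((X, y))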
def __init__(
    self,
    tokenizer: algorithm(Document(), List(Sentence())),
    feature_extractor: algorithm(Sentence(), Flags()),
):
    self.tokenizer = tokenizer
    self.feature_extractor = feature_extractor
def __init__(
    self,
    tokenizer: algorithm(Sentence(), List(Word())),
    feature_extractor: algorithm(Word(), Flags()),
    include_text: Boolean(),
):
    self.tokenizer = tokenizer
    self.feature_extractor = feature_extractor
    self.include_text = include_text
def run(self, input: Sentence()) -> Flags():
    tokens = self.tokenizer.run(input)
    flags = [self.feature_extractor.run(w) for w in tokens]

    if self.include_text:
        # Prefix every feature with the token text so that identical
        # features coming from different tokens remain distinguishable.
        return {
            f"{w}|{f}": v
            for w, flag in zip(tokens, flags)
            for f, v in flag.items()
        }
    else:
        return {f: v for flag in flags for f, v in flag.items()}
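# Illustration (example values only): with `include_text=True` the sentence
# "Gato negro" might yield {"Gato|is_title": True, "negro|is_title": False},
# whereas with `include_text=False` all tokens share one key namespace, so a
# later token's value overwrites an earlier one on key collisions.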
def run(self, input: Sentence()) -> Tuple(List(Word()), List(Flags())):
    tokenized = self.nlp(input)
    tokens = []
    flags = []

    for token in tokenized:
        token_flags = {}
        if self.extract_lemma:
            token_flags["lemma"] = token.lemma_
        if self.extract_pos_tag:
            token_flags["pos"] = token.pos_
            # The fine-grained tag may encode morphology as "Key=Value"
            # pairs separated by "|"; split them into individual flags.
            for kv in token.tag_.split("|"):
                kv = kv.split("=")
                if len(kv) == 2:
                    token_flags["tag_" + kv[0]] = kv[1]
                else:
                    token_flags["tag_" + kv[0]] = True
        if self.extract_dep:
            token_flags["dep"] = token.dep_
        if self.extract_entity:
            token_flags["ent_type"] = token.ent_type_
            token_flags["ent_kb_id"] = token.ent_kb_id_
        if self.extract_details:
            token_flags["is_alpha"] = token.is_alpha
            token_flags["is_ascii"] = token.is_ascii
            token_flags["is_digit"] = token.is_digit
            token_flags["is_lower"] = token.is_lower
            token_flags["is_upper"] = token.is_upper
            token_flags["is_title"] = token.is_title
            token_flags["is_punct"] = token.is_punct
            token_flags["is_left_punct"] = token.is_left_punct
            token_flags["is_right_punct"] = token.is_right_punct
            token_flags["is_space"] = token.is_space
            token_flags["is_bracket"] = token.is_bracket
            token_flags["is_quote"] = token.is_quote
            token_flags["is_currency"] = token.is_currency
            token_flags["like_url"] = token.like_url
            token_flags["like_num"] = token.like_num
            token_flags["like_email"] = token.like_email
            token_flags["is_oov"] = token.is_oov
            token_flags["is_stop"] = token.is_stop
        if self.extract_sentiment:
            token_flags["sentiment"] = token.sentiment

        tokens.append(token.text)
        flags.append(token_flags)

    return tokens, flags
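# Illustration of the tag parsing above (no spaCy required): a fine-grained
# tag string such as "Gender=Masc|Number=Sing" produces
# {"tag_Gender": "Masc", "tag_Number": "Sing"}, while a plain tag such as
# "NN" produces {"tag_NN": True}.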
def run(self, input: Word(domain='general', language='spanish')) -> Flags():
    """Flag whether the word returns any results when searched on Wikipedia."""
    return dict(in_wikipedia=bool(wikipedia.search(input)))
def run(self, input: Word()) -> Flags():
    r_exp = self._regex()
    # `full` requires the whole word to match the pattern; otherwise any
    # substring match is enough.
    b = re.fullmatch(r_exp, input) if self.full else re.search(r_exp, input)
    return {f"is_{self._name}_regex": bool(b)}
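# Quick illustration of the distinction (standard `re` behaviour): for the
# pattern r"\d+" and the word "abc123", re.search finds "123" so the flag is
# True, while re.fullmatch returns None so the flag is False; for the word
# "123" both calls match and the flag is True either way.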
def __init__(
    self,
    tokenizer: algorithm(Sentence(), List(Word())),
    token_feature_extractor: algorithm(Word(), Flags()),
    # token_sentence_encoder: algorithm(Word(), )
):
    self.tokenizer = tokenizer
    self.token_feature_extractor = token_feature_extractor
def run(self, input: Document()) -> List(Flags()):
    tokens = self.tokenizer.run(input)
    flags = [self.feature_extractor.run(w) for w in tokens]
    return flags
def run(self, input: Word()) -> Flags():
    flags = [extractor.run(input) for extractor in self.extractors]
    return self.merger.run(flags)
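# Composition sketch (hypothetical wiring; `FlagsMerger`, `regex_extractor`
# and `wikipedia_extractor` are placeholder names, and in practice these
# instances are typically built by the pipeline search rather than by hand):
#
#   extractor = MultipleFeatureExtractor(
#       extractors=[regex_extractor, wikipedia_extractor],
#       merger=FlagsMerger(),
#   )
#   extractor.run("Madrid")
#   # -> e.g. {"is_url_regex": False, "in_wikipedia": True}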
def run(
    self, input: Tuple(List(List(Flags())), List(List(Category())))
) -> List(List(Category())):
    return SklearnEstimator.run(self, input)
def run(self, input: List(Flags())) -> MatrixContinuousDense():
    return super().run(input)
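# Note on the vectorisation step (assumption: the superclass wraps something
# like sklearn's DictVectorizer, which is the standard way to turn a list of
# flag dicts into a dense feature matrix):
#
#   from sklearn.feature_extraction import DictVectorizer
#   DictVectorizer(sparse=False).fit_transform(
#       [{"is_upper": True, "len": 3}, {"like_num": True, "len": 5}]
#   )
#   # -> a dense matrix with one column per distinct flag name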