Example No. 1
    def run(self, input: List(Flags())) -> Flags():
        # Merge every flag dictionary into one; on duplicate keys,
        # later entries overwrite earlier ones.
        result = {}

        for d in input:
            result.update(d)

        return result
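A minimal, library-free sketch of that merge behavior; plain dicts stand in for Flags() instances, and the flag names and values are invented for illustration:

    flag_sets = [
        {"is_upper": False, "length": 5},
        {"is_stop": True, "length": 6},  # "length" collides with the first dict
    ]

    merged = {}
    for d in flag_sets:
        merged.update(d)

    print(merged)  # {'is_upper': False, 'length': 6, 'is_stop': True}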
Example No. 2
 def __init__(
     self,
     extractors: Distinct(
         algorithm(Word(), Flags()), exceptions=["MultipleFeatureExtractor"]
     ),
     merger: algorithm(List(Flags()), Flags()),
 ):
     self.extractors = extractors
     self.merger = merger
Example No. 3
def _build_pipeline():
    builder = build_pipelines(input=Tuple(List(List(Flags())),
                                          List(List(Category()))),
                              output=List(List(Category())),
                              registry=find_classes(include="CRF"))

    return builder.sample(sampler=Sampler(random_state=0))
Example No. 4
 def __init__(
     self,
     tokenizer: algorithm(Document(), List(Sentence())),
     feature_extractor: algorithm(Sentence(), Flags()),
 ):
     self.tokenizer = tokenizer
     self.feature_extractor = feature_extractor
Example No. 5
 def __init__(
     self,
     tokenizer: algorithm(Sentence(), List(Word())),
     feature_extractor: algorithm(Word(), Flags()),
     include_text: Boolean(),
 ):
     self.tokenizer = tokenizer
     self.feature_extractor = feature_extractor
     self.include_text = include_text
Example No. 6
    def run(self, input: Sentence()) -> Flags():
        tokens = self.tokenizer.run(input)
        flags = [self.feature_extractor.run(w) for w in tokens]

        if self.include_text:
            # Prefix each feature with its token so that identical
            # features extracted from different words stay distinct.
            return {
                f"{w}|{f}": v for w, flag in zip(tokens, flags) for f, v in flag.items()
            }
        else:
            return {f: v for flag in flags for f, v in flag.items()}
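A library-free illustration of the two output shapes; the tokens and flag values are invented for the example:

    tokens = ["cats", "run"]
    flags = [{"is_plural": True}, {"is_verb": True}]

    # include_text=True: keys become "token|feature"
    print({f"{w}|{f}": v for w, flag in zip(tokens, flags) for f, v in flag.items()})
    # {'cats|is_plural': True, 'run|is_verb': True}

    # include_text=False: bare feature names; collisions would overwrite
    print({f: v for flag in flags for f, v in flag.items()})
    # {'is_plural': True, 'is_verb': True}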
Example No. 7
    def run(self, input: Sentence()) -> Tuple(List(Word()), List(Flags())):
        tokenized = self.nlp(input)

        tokens = []
        flags = []

        for token in tokenized:
            token_flags = {}
            if self.extract_lemma:
                token_flags["lemma"] = token.lemma_
            if self.extract_pos_tag:
                token_flags["pos"] = token.pos_

                # Some models pack morphology into token.tag_ as
                # "Key=Value" pairs separated by "|"; bare keys
                # become boolean flags.
                for kv in token.tag_.split("|"):
                    kv = kv.split("=")
                    if len(kv) == 2:
                        token_flags["tag_" + kv[0]] = kv[1]
                    else:
                        token_flags["tag_" + kv[0]] = True

            if self.extract_dep:
                token_flags["dep"] = token.dep_
            if self.extract_entity:
                token_flags["ent_type"] = token.ent_type_
                token_flags["ent_kb_id"] = token.ent_kb_id_
            if self.extract_details:
                token_flags["is_alpha"] = token.is_alpha
                token_flags["is_ascii"] = token.is_ascii
                token_flags["is_digit"] = token.is_digit
                token_flags["is_lower"] = token.is_lower
                token_flags["is_upper"] = token.is_upper
                token_flags["is_title"] = token.is_title
                token_flags["is_punct"] = token.is_punct
                token_flags["is_left_punct"] = token.is_left_punct
                token_flags["is_right_punct"] = token.is_right_punct
                token_flags["is_space"] = token.is_space
                token_flags["is_bracket"] = token.is_bracket
                token_flags["is_quote"] = token.is_quote
                token_flags["is_currency"] = token.is_currency
                token_flags["like_url"] = token.like_url
                token_flags["like_num"] = token.like_num
                token_flags["like_email"] = token.like_email
                token_flags["is_oov"] = token.is_oov
                token_flags["is_stop"] = token.is_stop
            if self.extract_sentiment:
                token_flags["sentiment"] = token.sentiment

            tokens.append(token.text)
            flags.append(token_flags)

        return tokens, flags
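The tag_ parsing above can be exercised without spaCy; the concrete tag string below is an invented example following the "Key=Value|Key=Value" convention the loop expects:

    tag = "Case=Nom|Number=Sing|Foreign"
    token_flags = {}
    for kv in tag.split("|"):
        kv = kv.split("=")
        if len(kv) == 2:
            token_flags["tag_" + kv[0]] = kv[1]
        else:
            token_flags["tag_" + kv[0]] = True

    print(token_flags)
    # {'tag_Case': 'Nom', 'tag_Number': 'Sing', 'tag_Foreign': True}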
Example No. 8
 def run(
     self, input: Word(domain='general', language='spanish')) -> Flags():
     """Flag whether the input word yields any Wikipedia search results."""
     return dict(in_wikipedia=bool(wikipedia.search(input)))
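For reference, wikipedia.search from the wikipedia PyPI package returns a list of matching page titles, so bool() flags whether anything was found (requires network access; the query word is an arbitrary example):

    import wikipedia

    print(bool(wikipedia.search("Cervantes")))  # True if any page title matched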
Example No. 9
 def run(self, input: Word()) -> Flags():
     r_exp = self._regex()
     # full=True requires the whole word to match; otherwise any substring counts.
     match = re.fullmatch(r_exp, input) if self.full else re.search(r_exp, input)
     return {f"is_{self._name}_regex": bool(match)}
Example No. 10
 def __init__(self,
     tokenizer: algorithm(Sentence(), List(Word())),
     token_feature_extractor: algorithm(Word(), Flags()),
     # token_sentence_encoder: algorithm(Word(), )
 ):
     self.tokenizer = tokenizer
     self.token_feature_extractor = token_feature_extractor
Example No. 11
 def run(self, input: Document()) -> List(Flags()):
     tokens = self.tokenizer.run(input)
     flags = [self.feature_extractor.run(w) for w in tokens]
     return flags
Example No. 12
 def run(self, input: Word()) -> Flags():
     flags = [extractor.run(input) for extractor in self.extractors]
     return self.merger.run(flags)
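How Examples 1 and 12 compose, sketched with plain functions standing in for algorithm instances; the two extractors and the merge function are invented stand-ins:

    def is_upper(word):
        return {"is_upper": word.isupper()}

    def length(word):
        return {"length": len(word)}

    def merge(flag_list):
        merged = {}
        for d in flag_list:
            merged.update(d)
        return merged

    print(merge([extractor("NLP") for extractor in (is_upper, length)]))
    # {'is_upper': True, 'length': 3}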
Example No. 13
 def run(
     self, input: Tuple(List(List(Flags())), List(List(Category())))
 ) -> List(List(Category())):
     return SklearnEstimator.run(self, input)
Example No. 14
 def run(self, input: List(Flags())) -> MatrixContinuousDense():
     return super().run(input)