def test_simple_pipeline_graph():
    graph = build_pipeline_graph(
        input=MatrixContinuousDense(),
        output=MatrixContinuousDense(),
        registry=[ExactAlgorithm, HigherInputAlgorithm, LowerOutputAlgorithm],
    ).graph
    assert_graph(graph, 3, 3, 6)

    graph = build_pipeline_graph(
        input=List(Text()),
        output=Document(),
        registry=[
            WordToWordAlgorithm,
            TextToWordAlgorithm,
            WordToWordListAlgorithm,
            WordListToSentenceAlgorithm,
            WordListToSentenceAlgorithm,
            SentenceListToDocumentAlgorithm,
            TextListToDocumentAlgorithm,
        ],
    ).graph
    assert_graph(graph, 2, 2, 12)

    graph = build_pipeline_graph(
        input=List(Word()),
        output=Document(),
        registry=[
            WordToWordAlgorithm,
            TextToWordAlgorithm,
            WordToWordListAlgorithm,
            WordListToSentenceAlgorithm,
            WordListToSentenceAlgorithm,
            SentenceListToDocumentAlgorithm,
            TextListToDocumentAlgorithm,
        ],
    ).graph
    assert_graph(graph, 2, 1, 10)
def run(self, input: Sentence()) -> Tuple(List(Word()), List(Flags())):
    tokenized = self.nlp(input)
    tokens = []
    flags = []

    for token in tokenized:
        token_flags = {}
        if self.extract_lemma:
            token_flags["lemma"] = token.lemma_
        if self.extract_pos_tag:
            token_flags["pos"] = token.pos_

            for kv in token.tag_.split("|"):
                kv = kv.split("=")
                if len(kv) == 2:
                    token_flags["tag_" + kv[0]] = kv[1]
                else:
                    token_flags["tag_" + kv[0]] = True

        if self.extract_dep:
            token_flags["dep"] = token.dep_
        if self.extract_entity:
            token_flags["ent_type"] = token.ent_type_
            token_flags["ent_kb_id"] = token.ent_kb_id_
        if self.extract_details:
            token_flags["is_alpha"] = token.is_alpha
            token_flags["is_ascii"] = token.is_ascii
            token_flags["is_digit"] = token.is_digit
            token_flags["is_lower"] = token.is_lower
            token_flags["is_upper"] = token.is_upper
            token_flags["is_title"] = token.is_title
            token_flags["is_punct"] = token.is_punct
            token_flags["is_left_punct"] = token.is_left_punct
            token_flags["is_right_punct"] = token.is_right_punct
            token_flags["is_space"] = token.is_space
            token_flags["is_bracket"] = token.is_bracket
            token_flags["is_quote"] = token.is_quote
            token_flags["is_currency"] = token.is_currency
            token_flags["like_url"] = token.like_url
            token_flags["like_num"] = token.like_num
            token_flags["like_email"] = token.like_email
            token_flags["is_oov"] = token.is_oov
            token_flags["is_stop"] = token.is_stop
        if self.extract_sentiment:
            token_flags["sentiment"] = token.sentiment

        tokens.append(token.text)
        flags.append(token_flags)

    return tokens, flags
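# Illustrative sketch (not part of the original source): how the `tag_` parsing above
# turns a pipe-separated morphological tag string into individual flags. The tag string
# below is a made-up example.
tag = "Case=Nom|Gender=Fem|Number=Sing"
token_flags = {}
for kv in tag.split("|"):
    kv = kv.split("=")
    if len(kv) == 2:
        token_flags["tag_" + kv[0]] = kv[1]
    else:
        token_flags["tag_" + kv[0]] = True
print(token_flags)  # {'tag_Case': 'Nom', 'tag_Gender': 'Fem', 'tag_Number': 'Sing'}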
def test_meta_pipeline_graph():
    # Test List algorithm generation
    build_pipeline_graph(
        input=List(Word()),
        output=List(Word()),
        registry=[WordToWordAlgorithm],
    )

    # Test Tuple breakdown feature
    build_pipeline_graph(
        input=Tuple(Word(), Matrix()),
        output=Text(),
        registry=[WordToWordAlgorithm],
    )

    # Test Tuple breakdown feature and List algorithm generation
    build_pipeline_graph(
        input=Tuple(List(Word()), Matrix()),
        output=List(Word()),
        registry=[WordToWordAlgorithm],
    )
def __init__(
    self,
    lowercase: Boolean(),
    stopwords_remove: Boolean(),
    binary: Boolean(),
    inner_tokenizer: algorithm(Sentence(), List(Word())),
    inner_stemmer: algorithm(Word(), Stem()),
    inner_stopwords: algorithm(List(Word()), List(Word())),
):
    self.stopwords_remove = stopwords_remove
    self.inner_tokenizer = inner_tokenizer
    self.inner_stemmer = inner_stemmer
    self.inner_stopwords = inner_stopwords

    SklearnTransformer.__init__(self)
    _CountVectorizer.__init__(self, lowercase=lowercase, binary=binary)
def run(self, input: List(Flags())) -> Flags():
    result = {}
    for d in input:
        result.update(d)
    return result
def __init__(
    self,
    tokenizer: algorithm(Document(), List(Sentence())),
    feature_extractor: algorithm(Sentence(), Flags()),
):
    self.tokenizer = tokenizer
    self.feature_extractor = feature_extractor
def __init__(
    self,
    tokenizer: algorithm(Sentence(), List(Word())),
    feature_extractor: algorithm(Word(), Flags()),
    include_text: Boolean(),
):
    self.tokenizer = tokenizer
    self.feature_extractor = feature_extractor
    self.include_text = include_text
def __init__(
    self,
    extractors: Distinct(
        algorithm(Word(), Flags()), exceptions=["MultipleFeatureExtractor"]
    ),
    merger: algorithm(List(Flags()), Flags()),
):
    self.extractors = extractors
    self.merger = merger
def run(self, input: List(ContinuousVector())) -> ContinuousVector():
    input = np.vstack(input)

    if self.mode == "mean":
        return input.mean(axis=1)
    elif self.mode == "max":
        return input.max(axis=1)

    raise ValueError("Invalid mode: %s" % self.mode)
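# Illustrative sketch (not part of the original source): what the aggregation above
# computes on a small list of vectors. The array values are made up for demonstration.
import numpy as np

vectors = [np.array([1.0, 2.0, 3.0]), np.array([4.0, 5.0, 6.0])]
stacked = np.vstack(vectors)   # shape (2, 3): one row per input vector
print(stacked.mean(axis=1))    # per-row mean -> array([2., 5.])
print(stacked.max(axis=1))     # per-row max  -> array([3., 6.])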
def _build_pipeline():
    builder = build_pipelines(
        input=Tuple(List(List(Flags())), List(List(Category()))),
        output=List(List(Category())),
        registry=find_classes(include="CRF"),
    )
    return builder.sample(sampler=Sampler(random_state=0))
def test_save_load_list():
    ListClass = build_composite_list(
        input_type=Tuple(MatrixContinuousDense(), CategoricalVector()),
        output_type=List(CategoricalVector()),
    )
    algorithm = ListClass(DummyAlgorithm)

    fp = BytesIO()
    Pickler(fp).dump(algorithm)
    fp.seek(0)

    algorithm2 = Unpickler(fp).load()
    assert repr(algorithm) == repr(algorithm2)
def run(self, input: List(MatrixContinuousDense())) -> Tensor3():
    return np.vstack([np.expand_dims(m, axis=0) for m in input])
def run(self, input: List(ContinuousVector())) -> MatrixContinuousDense():
    return np.vstack(input)
def run(
    self, input: Tuple(Sentence(), List(Tuple(Entity(), Entity(), Category())))
) -> Tuple(List(Vector()), CategoricalVector()):
    pass
def __init__(
    self,
    tokenizer: algorithm(Sentence(), List(Word())),
    token_feature_extractor: algorithm(Word(), Flags()),
    # token_sentence_encoder: algorithm(Word(), )
):
    pass
def run(
    self, input: Tuple(Sentence(), List(Entity()))
) -> Tuple(List(Word()), List(Postag())):
    pass
def run(
    self, input: Tuple(List(List(Flags())), List(List(Category())))
) -> List(List(Category())):
    return SklearnEstimator.run(self, input)
def run(self, input: List(Sentence())) -> MatrixContinuousSparse():
    return SklearnTransformer.run(self, input)
def run(self, input: List(Word())) -> List(Vector()):
    pass
def run(
    self, input: Tuple(List(MatrixContinuousDense()), List(List(Postag())))
) -> List(List(Postag())):
    return super().run(input)
for dataset in [
    dorothea,
    german_credit,
    gisette,
    shuttle,
    wine_quality,
    yeast,
]:
    X, y, *_ = dataset.load()
    run_automl(X, y, name=dataset.__name__)

for dataset in [movie_reviews, haha]:
    X, y, *_ = dataset.load()
    run_automl(
        X, y, name=dataset.__name__, input=List(Sentence()), output=CategoricalVector()
    )

for dataset in [meddocan]:
    X, _, y, _ = dataset.load()
    run_automl(
        X, y, name=dataset.__name__, input=List(List(Word())), output=List(List(Postag()))
    )

for dataset in [cifar10]:
    X, y, *_ = dataset.load()
    run_automl(X, y, name=dataset.__name__,
def run(self, input: List(Text())) -> Document():
    pass
parser.add_argument("--token", default=None) parser.add_argument("--channel", default=None) args = parser.parse_args() print(args) # ## Experimentation # Instantiate the classifier. # Note that the input and output types here are defined to match the problem statement, # i.e., entity recognition. classifier = AutoML( search_algorithm=PESearch, input=List(List(Word())), output=List(List(Postag())), search_iterations=args.iterations, score_metric=meddocan.F1_beta, cross_validation_steps=1, search_kwargs=dict( pop_size=args.popsize, search_timeout=args.global_timeout, evaluation_timeout=args.timeout, memory_limit=args.memory * 1024 ** 3, ), ) # This custom logger is used for debugging purposes, to be able later to recover # the best pipelines and all the errors encountered in the experimentation process.
def run(self, input: List(Flags())) -> MatrixContinuousDense():
    return super().run(input)
def run(self, input: Sentence()) -> List(Word()):
    pass
def run(self, input: Document()) -> List(Flags()):
    tokens = self.tokenizer.run(input)
    # Invoke the feature extractor through `run`, consistently with the tokenizer,
    # and return the collected flags (the original bare `return` dropped them).
    flags = [self.feature_extractor.run(w) for w in tokens]
    return flags
def run(self, input: List(Vector())) -> Matrix():
    pass
def __init__(self, tokenizer: algorithm(Sentence(), List(Word()))):
    self.tokenizer = tokenizer
# The next line will print all the algorithms that AutoGOAL found
# in the `contrib` library, i.e., anything that could be potentially used
# to solve an AutoML problem.

for cls in find_classes():
    print("Using: %s" % cls.__name__)

# ## Experimentation

# Instantiate the classifier.
# Note that the input and output types here are defined to match the problem statement,
# i.e., text classification.

classifier = AutoML(
    search_algorithm=PESearch,
    input=List(Sentence()),
    output=CategoricalVector(),
    search_iterations=args.iterations,
    score_metric=f1_score,
    search_kwargs=dict(
        pop_size=args.popsize,
        search_timeout=args.global_timeout,
        evaluation_timeout=args.timeout,
        memory_limit=args.memory * 1024 ** 3,
    ),
)

# This custom logger is used for debugging purposes, to be able later to recover
# the best pipelines and all the errors encountered in the experimentation process.
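# Usage sketch (an assumption, not taken from the original script): once the dataset is
# loaded, the classifier is typically trained and evaluated through the fit/score
# interface; `X_train`, `y_train`, `X_test`, and `y_test` are hypothetical names.
# classifier.fit(X_train, y_train)
# print(classifier.score(X_test, y_test))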
def run(self, input: List(Sentence())) -> Document():
    pass