def is_transformer(cls, verbose=False):
    """Determine if `cls` corresponds to something that resembles an sklearn
    general transformer. If True, returns the valid (input, output) types.

    Examples:

    >>> from sklearn.feature_extraction.text import CountVectorizer
    >>> is_transformer(CountVectorizer)
    (True, (List(Sentence()), MatrixContinuousSparse()))
    >>> from sklearn.decomposition import PCA
    >>> is_transformer(PCA)
    (True, (MatrixContinuousDense(), MatrixContinuousDense()))

    """
    if not is_algorithm(cls, verbose=verbose):
        return False, None

    allowed_inputs = set()
    allowed_outputs = set()

    for input_type in [
        kb.MatrixContinuousDense(),
        kb.MatrixContinuousSparse(),
        kb.List(kb.Sentence()),
    ]:
        for output_type in [
            kb.MatrixContinuousDense(),
            kb.MatrixContinuousSparse(),
            kb.List(kb.Sentence()),
        ]:
            try:
                X = DATA_TYPE_EXAMPLES[input_type]

                clf = cls()
                X = clf.fit_transform(X)

                assert is_data_type(X, output_type)

                allowed_inputs.add(input_type)
                allowed_outputs.add(output_type)
            except Exception as e:
                if verbose:
                    warnings.warn(str(e))

    # A transformer must map every accepted input to exactly one output type.
    if len(allowed_outputs) != 1:
        return False, None

    inputs = combine_types(*allowed_inputs)

    if allowed_inputs:
        return True, (inputs, list(allowed_outputs)[0])
    else:
        return False, None
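# A minimal sketch of driving the probe above over a whole module to collect
# every transformer-like class; `find_transformers` is a hypothetical helper,
# not part of this module.
def find_transformers(module, verbose=False):
    """Yield (cls, input_type, output_type) for each class that passes the probe."""
    import inspect

    for _, cls in inspect.getmembers(module, inspect.isclass):
        ok, types = is_transformer(cls, verbose=verbose)
        if ok:
            yield (cls, *types)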
def is_tagger(cls, verbose=False):
    """Determine if `cls` corresponds to something that resembles an nltk
    pos tagger. If True, returns the valid (input, output) types.

    Examples:

    >>> from nltk.tag import AffixTagger
    >>> from sklearn.linear_model import LogisticRegression
    >>> is_tagger(AffixTagger)
    (True, (List(List(Word())), List(List(Postag()))))
    >>> is_tagger(LogisticRegression)
    (False, None)

    """
    if not _is_tagger(cls, verbose=verbose):
        return False, None

    inputs = []
    output = kb.List(kb.List(kb.Postag()))

    for input_type in [kb.List(kb.List(kb.Word()))]:
        try:
            X = DATA_TYPE_EXAMPLES[input_type]
            # Pair every word with itself as a dummy tag: only the *shape*
            # of the training data matters for the probe.
            X_train = [[(word, word) for word in sentence] for sentence in X]

            tagger = cls(train=X_train)
            y = tagger.tag_sents(X)

            assert DATA_RESOLVERS[output](y)
            inputs.append(input_type)
        except Exception as e:
            if verbose:
                warnings.warn(str(e))

    inputs = combine_types(*inputs)

    if inputs:
        return True, (inputs, output)

    # Fall back to the more specific probes before giving up.
    is_ptt = is_pretrained_tagger(cls, verbose)
    if is_ptt[0]:
        return is_ptt

    is_ckr = is_chunker(cls, verbose)
    if is_ckr[0]:
        return is_ckr

    return False, None
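# A minimal usage sketch (assuming nltk is installed). `UnigramTagger`
# accepts the `train=` keyword the probe relies on, so it should pass:
#
#     >>> from nltk.tag import UnigramTagger
#     >>> is_tagger(UnigramTagger)  # doctest: +SKIP
#     (True, (List(List(Word())), List(List(Postag()))))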
def is_chunker(cls, verbose=False):
    """Determine if `cls` corresponds to something that resembles an nltk
    chunker. If True, returns the valid (input, output) types.

    Examples:

    >>> from nltk.chunk.named_entity import NEChunkParserTagger
    >>> from nltk.tokenize import PunktSentenceTokenizer
    >>> is_chunker(NEChunkParserTagger)
    (True, (List(List(Tuple(Word(), Word()))), List(List(Tuple(Tuple(Word(), Word()), Word())))))
    >>> is_chunker(PunktSentenceTokenizer)
    (False, None)

    """
    if not _is_tagger(cls, verbose=verbose):
        return False, None

    inputs = []
    output = kb.List(kb.List(kb.Chunktag()))

    for input_type in [kb.List(kb.List(kb.Postag()))]:
        try:
            X = DATA_TYPE_EXAMPLES[input_type]
            # Reuse each postag as a dummy chunk label: only the *shape*
            # of the training data matters for the probe.
            X_train = [
                [((word, postag), postag) for word, postag in sentence]
                for sentence in X
            ]

            chunker = cls(train=X_train)
            y = chunker.tag_sents(X)

            assert DATA_RESOLVERS[output](y)
            inputs.append(input_type)
        except Exception as e:
            if verbose:
                warnings.warn(str(e))

    inputs = combine_types(*inputs)

    if inputs:
        return True, (inputs, output)
    else:
        return False, None
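# For reference, the shape transformation the chunker probe applies to its
# training data, shown on a single POS-tagged sentence (hypothetical values):
#
#     >>> sent = [("lorem", "NN"), ("ipsum", "VB")]
#     >>> [((w, t), t) for w, t in sent]
#     [(('lorem', 'NN'), 'NN'), (('ipsum', 'VB'), 'VB')]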
def is_pretrained_tagger(cls, verbose=False):
    """Determine if `cls` corresponds to something that resembles a
    pretrained nltk pos tagger. If True, returns the valid (input, output)
    types.

    Examples:

    >>> from nltk.tag import AffixTagger
    >>> from nltk.tag.perceptron import PerceptronTagger
    >>> is_pretrained_tagger(PerceptronTagger)
    (True, (List(Word()), List(Tuple(Word(), Word()))))
    >>> is_pretrained_tagger(AffixTagger)
    (False, None)

    """
    if not _is_tagger(cls, verbose=verbose):
        return False, None

    inputs = []
    output = kb.List(kb.Postag())

    for input_type in [kb.List(kb.Word())]:
        try:
            X = DATA_TYPE_EXAMPLES[input_type]

            # A pretrained tagger needs no training data: it must tag
            # out of the box.
            tagger = cls()
            y = tagger.tag(X)

            assert DATA_RESOLVERS[output](y)
            inputs.append(input_type)
        except Exception as e:
            if verbose:
                warnings.warn(str(e))

    inputs = combine_types(*inputs)

    if inputs:
        return True, (inputs, output)
    else:
        return False, None
def is_sent_tokenizer(cls, verbose=False):
    """Determine if `cls` corresponds to something that resembles an nltk
    sentence tokenizer. If True, returns the valid (input, output) types.

    Examples:

    >>> from sklearn.linear_model import LogisticRegression
    >>> from nltk.tokenize import PunktSentenceTokenizer
    >>> is_sent_tokenizer(PunktSentenceTokenizer)
    (True, (Document(), List(Sentence())))
    >>> is_sent_tokenizer(LogisticRegression)
    (False, None)

    """
    if not _is_sent_tokenizer(cls, verbose=verbose):
        return False, None

    inputs = []
    output = kb.List(kb.Sentence())

    for input_type in [kb.Document()]:
        try:
            X = DATA_TYPE_EXAMPLES[input_type]

            tokenizer = cls()
            y = tokenizer.tokenize(X)

            assert DATA_RESOLVERS[output](y)
            inputs.append(input_type)
        except Exception as e:
            if verbose:
                warnings.warn(str(e))

    inputs = combine_types(*inputs)

    if inputs:
        return True, (inputs, output)
    else:
        return False, None
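# A minimal end-to-end sketch (assuming nltk data is available): a class that
# passes this probe can be instantiated with no arguments and used directly
# on a raw document.
#
#     >>> from nltk.tokenize import PunktSentenceTokenizer
#     >>> PunktSentenceTokenizer().tokenize("Lorem ipsum. Dolor sit amet.")  # doctest: +SKIP
#     ['Lorem ipsum.', 'Dolor sit amet.']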
        obj = set(obj)
        return len(obj) > 0.1 * original_length and all(
            isinstance(x, str) for x in obj
        )
    except Exception:
        return False


from autogoal import kb


DATA_RESOLVERS = {
    kb.MatrixContinuousDense(): is_matrix_continuous_dense,
    kb.MatrixContinuousSparse(): is_matrix_continuous_sparse,
    kb.CategoricalVector(): is_categorical,
    kb.ContinuousVector(): is_continuous,
    kb.List(kb.Sentence()): is_string_list,
}

DATA_TYPE_EXAMPLES = {
    kb.MatrixContinuousDense(): np.random.rand(10, 10),
    kb.MatrixContinuousSparse(): sp.rand(10, 10),
    kb.CategoricalVector(): np.asarray(["A"] * 5 + ["B"] * 5),
    kb.ContinuousVector(): np.random.rand(10),
    kb.DiscreteVector(): np.random.randint(0, 10, (10,), dtype=int),
    kb.List(kb.Sentence()): ["abc bcd def feg geh hij jkl lmn nop pqr"] * 10,
}


def is_algorithm(cls, verbose=False):
    if hasattr(cls, "fit") and hasattr(cls, "predict"):
        return "estimator"
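# A minimal sketch of how DATA_TYPE_EXAMPLES and DATA_RESOLVERS cooperate in
# every probe: the example value for the input type is fed to the candidate,
# and the resolver predicate for the output type validates the result.
# `probe_one` is a hypothetical helper, not part of this module.
def probe_one(obj, input_type, output_type):
    """Return True if `obj.fit_transform` maps `input_type` to `output_type`."""
    X = DATA_TYPE_EXAMPLES[input_type]
    try:
        y = obj.fit_transform(X)
    except Exception:
        return False
    return DATA_RESOLVERS[output_type](y)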
    @abc.abstractmethod
    def transform(self, X, y=None):
        pass


GENERATION_RULES = dict(
    LatentDirichletAllocation=dict(ignore_params={"evaluate_every"}),
    RadiusNeighborsClassifier=dict(ignore=True),
    KNeighborsTransformer=dict(ignore_params={"metric"}),
    RadiusNeighborsTransformer=dict(ignore_params={"metric"}),
    LocalOutlierFactor=dict(ignore_params={"metric"}),
    RadiusNeighborsRegressor=dict(ignore_params={"metric"}),
    LabelBinarizer=dict(
        ignore_params={"neg_label", "pos_label"},
        input_annotation=kb.List(kb.Category()),
    ),
    HashingVectorizer=dict(
        ignore_params={"token_pattern", "analyzer", "input", "decode_error"}
    ),
    SpectralBiclustering=dict(ignore_params={"n_components", "n_init"}),
    SpectralCoclustering=dict(ignore_params={"n_init"}),
    KMeans=dict(ignore_params={"n_init"}),
    MiniBatchKMeans=dict(ignore_params={"batch_size", "n_init"}),
    DictionaryLearning=dict(ignore=True),
    MiniBatchDictionaryLearning=dict(ignore=True),
    LassoLars=dict(ignore_params={"alpha"}),
    TheilSenRegressor=dict(ignore_params={"max_subpopulation"}),
    TSNE=dict(ignore=True, ignore_params={"perplexity"}),
)
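# A minimal sketch of how GENERATION_RULES might be consumed when wrapping a
# class: ignored classes are skipped outright and ignored constructor
# parameters are dropped. `apply_rules` is a hypothetical helper, not part of
# this module.
def apply_rules(cls, params):
    """Filter a dict of constructor arguments through the rules for `cls`."""
    rules = GENERATION_RULES.get(cls.__name__, {})
    if rules.get("ignore", False):
        return None
    ignored = set(rules.get("ignore_params", ()))
    return {k: v for k, v in params.items() if k not in ignored}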
import warnings
import inspect
import re

import numpy as np
import scipy.sparse as sp

from autogoal import kb
from autogoal.contrib.sklearn._utils import (
    is_matrix_continuous_dense,
    is_matrix_continuous_sparse,
    is_categorical,
    is_continuous,
    is_string_list,
)

DATA_TYPE_EXAMPLES = {
    # (str, str): a tagged token
    kb.Postag(): ("lorem", "ipsum"),
    # [(str, str), ...]: a list of tagged tokens
    kb.List(kb.Postag()): [("lorem", "ipsum")] * 10,
    # [[(str, str), ...], ...]: a list of tagged sentences
    kb.List(kb.List(kb.Postag())): [[("lorem", "ipsum")] * 2],
    # ((str, str), str): an IOB-tagged token
    kb.Chunktag(): (("lorem", "ipsum"), "ipsum"),
    # [((str, str), str), ...]: a list of IOB-tagged tokens
    kb.List(kb.Chunktag()): [(("lorem", "ipsum"), "ipsum")] * 10,
    # [[((str, str), str), ...], ...]: a list of IOB-tagged sentences
    kb.List(kb.List(kb.Chunktag())): [[(("lorem", "ipsum"), "ipsum")] * 2],
    kb.Stem(): "ips",
    kb.Word(): "ipsum",