def is_transformer(cls, verbose=False):
    """Determine if `cls` corresponds to something that resembles an sklearn general transformer.

    If True, returns the valid (input, output) types.

    Each candidate input example is passed through `cls().fit_transform` and the
    result is matched against each candidate output type. `cls` is accepted only
    when the inputs that work all lead to exactly one output type and those
    inputs can be merged into a single type by `combine_types`.

    Args:
        cls: Class to inspect; it is instantiated with no arguments.
        verbose: If True, exceptions raised while probing are reported through
            `warnings.warn`; otherwise they are dropped.

    Returns:
        `(True, (input_type, output_type))` if `cls` behaves like a
        transformer, `(False, None)` otherwise.

    Examples:

    >>> from sklearn.feature_extraction.text import CountVectorizer
    >>> is_transformer(CountVectorizer)
    (True, (List(Sentence()), MatrixContinuousSparse()))
    >>> from sklearn.decomposition.pca import PCA
    >>> is_transformer(PCA)
    (True, (MatrixContinuousDense(), MatrixContinuousDense()))

    """
    if not is_algorithm(cls, verbose=verbose):
        return False, None

    # The same types are tried both as inputs and as outputs.
    candidate_types = [
        kb.MatrixContinuousDense(),
        kb.MatrixContinuousSparse(),
        kb.List(kb.Sentence()),
    ]

    allowed_inputs = set()
    allowed_outputs = set()

    for input_type in candidate_types:
        try:
            # The transformation does not depend on the output type being
            # tested, so fit once per input instead of once per pair.
            transformed = cls().fit_transform(DATA_TYPE_EXAMPLES[input_type])
        except Exception as e:
            if verbose:
                warnings.warn(str(e))
            continue

        for output_type in candidate_types:
            try:
                # Explicit check rather than `assert`: assertions are removed
                # under `python -O`, which would accept every output type.
                matches = is_data_type(transformed, output_type)
            except Exception as e:
                if verbose:
                    warnings.warn(str(e))
                continue

            if matches:
                allowed_inputs.add(input_type)
                allowed_outputs.add(output_type)

    # Zero outputs: nothing worked. Several outputs: ambiguous signature.
    if len(allowed_outputs) != 1:
        return False, None

    inputs = combine_types(*allowed_inputs)
    if inputs is None:
        # The working inputs cannot be described by a single type.
        return False, None

    (output_type,) = allowed_outputs
    return True, (inputs, output_type)
def combine_types(*types):
    """Collapse several data types into a single one, when possible.

    A lone type is returned unchanged. The pair made of the dense and the
    sparse continuous matrix types collapses into `kb.MatrixContinuous()`.
    Every other combination, including no types at all, yields `None`.
    """
    if len(types) == 1:
        (only_type,) = types
        return only_type

    distinct = set(types)
    dense_and_sparse = {kb.MatrixContinuousDense(), kb.MatrixContinuousSparse()}

    return kb.MatrixContinuous() if distinct == dense_and_sparse else None
def is_clusterer(cls, verbose=False):
    """Determine if `cls` corresponds to something that resembles an nltk clusterer.

    If True, returns the valid (input, output) types.

    For each candidate matrix type, `cls` is instantiated with no arguments,
    `cluster` is called on the example matrix and `classify` on each of its
    rows; the input type is kept when the resulting labels pass
    `is_categorical`.

    Args:
        cls: Class to inspect.
        verbose: If True, exceptions raised while probing are reported through
            `warnings.warn`; otherwise they are dropped.

    Returns:
        `(True, (input_type, kb.CategoricalVector()))` if at least one input
        type works and the working ones can be merged by `combine_types`,
        `(False, None)` otherwise.

    Examples:

    >>> from sklearn.linear_model import LogisticRegression
    >>> from nltk.cluster import GAAClusterer
    >>> is_clusterer(GAAClusterer)
    (True, (MatrixContinuousDense(), CategoricalVector()))
    >>> is_clusterer(LogisticRegression)
    (False, None)

    """
    if not _is_clusterer(cls, verbose=verbose):
        return False, None

    inputs = []

    for input_type in [kb.MatrixContinuousDense(), kb.MatrixContinuousSparse()]:
        try:
            X = DATA_TYPE_EXAMPLES[input_type]

            clusterer = cls()
            clusterer.cluster(X)
            y = [clusterer.classify(x) for x in X]

            # Raise explicitly instead of using `assert`: assertions are
            # removed under `python -O`, which would accept any output.
            if not is_categorical(y):
                raise TypeError(
                    "{!r} did not produce categorical labels for {!r}".format(
                        cls, input_type
                    )
                )

            inputs.append(input_type)
        except Exception as e:
            if verbose:
                warnings.warn(str(e))

    inputs = combine_types(*inputs)

    if inputs:
        return True, (inputs, kb.CategoricalVector())

    return False, None
def is_clusterer(cls, verbose=False):
    """Determine if `cls` corresponds to something that resembles an sklearn clustering algorithm.

    If True, returns the valid (input, output) types.

    For each candidate matrix type, `cls` is instantiated with no arguments
    and `fit_predict` is called on the example matrix; the input type is kept
    when the predicted labels pass `is_discrete`.

    Args:
        cls: Class to inspect.
        verbose: If True, exceptions raised while probing are reported through
            `warnings.warn`; otherwise they are dropped.

    Returns:
        `(True, (input_type, kb.DiscreteVector()))` if at least one input type
        works and the working ones can be merged by `combine_types`,
        `(False, None)` otherwise.

    Examples:

    >>> from sklearn.linear_model import LogisticRegression, LinearRegression
    >>> is_clusterer(LogisticRegression)
    (False, None)
    >>> is_clusterer(LinearRegression)
    (False, None)
    >>> from sklearn.cluster import KMeans
    >>> is_clusterer(KMeans)
    (True, (MatrixContinuous(), DiscreteVector()))

    """
    if not is_algorithm(cls, verbose=verbose):
        return False, None

    inputs = []

    for input_type in [kb.MatrixContinuousDense(), kb.MatrixContinuousSparse()]:
        try:
            X = DATA_TYPE_EXAMPLES[input_type]

            clf = cls()
            y = clf.fit_predict(X)

            # Raise explicitly instead of using `assert`: assertions are
            # removed under `python -O`, which would accept any output.
            if not is_discrete(y):
                raise TypeError(
                    "{!r} did not produce discrete labels for {!r}".format(
                        cls, input_type
                    )
                )

            inputs.append(input_type)
        except Exception as e:
            if verbose:
                warnings.warn(str(e))

    inputs = combine_types(*inputs)

    if inputs:
        return True, (inputs, kb.DiscreteVector())

    return False, None
assert len(obj.shape) == 1 original_length = len(obj) obj = set(obj) return len(obj) > 0.1 * original_length and all( isinstance(x, str) for x in obj) except: return False from autogoal import kb DATA_RESOLVERS = { kb.MatrixContinuousDense(): is_matrix_continuous_dense, kb.MatrixContinuousSparse(): is_matrix_continuous_sparse, kb.CategoricalVector(): is_categorical, kb.ContinuousVector(): is_continuous, kb.List(kb.Sentence()): is_string_list, } DATA_TYPE_EXAMPLES = { kb.MatrixContinuousDense(): np.random.rand(10, 10), kb.MatrixContinuousSparse(): sp.rand(10, 10), kb.CategoricalVector(): np.asarray(["A"] * 5 + ["B"] * 5), kb.ContinuousVector(): np.random.rand(10), kb.DiscreteVector(): np.random.randint(0, 10, (10, ), dtype=int), kb.List(kb.Sentence()): ["abc bcd def feg geh hij jkl lmn nop pqr"] * 10, }
kb.List(kb.Chunktag()): [(("lorem", "ipsum"), "ipsum")] * 10, # [((str, str), str), ((str, str), str)] List of IOB Tagged token kb.List(kb.List(kb.Chunktag())): [ [(("lorem", "ipsum"), "ipsum")] * 2 ], # [[((str, str), str), ((str, str), str)], [((str, str), str), ((str, str), str)]] List of IOB Tagged Sentences kb.Stem(): "ips", kb.Word(): "ipsum", kb.Sentence(): "It is the best of all movies.", kb.Document(): "It is the best of all movies. I actually love that action scene.", kb.MatrixContinuousDense(): np.random.rand(10, 10), kb.MatrixContinuousSparse(): sp.rand(10, 10), kb.CategoricalVector(): np.asarray(["A"] * 5 + ["B"] * 5), kb.ContinuousVector(): np.random.rand(10), kb.DiscreteVector(): np.random.randint(0, 10, (10, ), dtype=int), kb.List(kb.Word()): ["ipsu", "lorem"], kb.List(kb.Document()): ["abc ipsu lorem say hello", "ipsum lorem", "abc"] * 2, kb.List(kb.List(kb.Stem())): [["abc", "ipsu", "lorem"] * 10], kb.List(kb.List(kb.Word())): [["abc", "ipsu", "lorem"] * 10], kb.List(kb.List(kb.Sentence())): [["abc a sentence lorem"], ["ipsum lorem"], ["abc"]] }