def __repr__(self, N_CHAR_MAX=700): # N_CHAR_MAX is the (approximate) maximum number of non-blank # characters to render. We pass it as an optional parameter to ease # the tests. from sklearn.utils._pprint import _EstimatorPrettyPrinter N_MAX_ELEMENTS_TO_SHOW = 30 # number of elements to show in sequences # use ellipsis for sequences with a lot of elements pp = _EstimatorPrettyPrinter( compact=True, indent=1, indent_at_name=True, n_max_elements_to_show=N_MAX_ELEMENTS_TO_SHOW) repr_ = pp.pformat(self) # Use bruteforce ellipsis when there are a lot of non-blank characters n_nonblank = len(''.join(repr_.split())) if n_nonblank > N_CHAR_MAX: lim = N_CHAR_MAX // 2 # apprx number of chars to keep on both ends regex = r'^(\s*\S){%d}' % lim # The regex '^(\s*\S){%d}' % n # matches from the start of the string until the nth non-blank # character: # - ^ matches the start of string # - (pattern){n} matches n repetitions of pattern # - \s*\S matches a non-blank char following zero or more blanks left_lim = re.match(regex, repr_).end() right_lim = re.match(regex, repr_[::-1]).end() if '\n' in repr_[left_lim:-right_lim]: # The left side and right side aren't on the same line. # To avoid weird cuts, e.g.: # categoric...ore', # we need to start the right side with an appropriate newline # character so that it renders properly as: # categoric... # handle_unknown='ignore', # so we add [^\n]*\n which matches until the next \n regex += r'[^\n]*\n' right_lim = re.match(regex, repr_[::-1]).end() ellipsis = '...' if left_lim + len(ellipsis) < len(repr_) - right_lim: # Only add ellipsis if it results in a shorter repr repr_ = repr_[:left_lim] + '...' + repr_[-right_lim:] return repr_
def test_n_max_elements_to_show(): n_max_elements_to_show = 30 pp = _EstimatorPrettyPrinter(compact=True, indent=1, indent_at_name=True, n_max_elements_to_show=n_max_elements_to_show) # No ellipsis vocabulary = {i: i for i in range(n_max_elements_to_show)} vectorizer = CountVectorizer(vocabulary=vocabulary) expected = r""" CountVectorizer(analyzer='word', binary=False, decode_error='strict', dtype=<class 'numpy.int64'>, encoding='utf-8', input='content', lowercase=True, max_df=1.0, max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None, stop_words=None, strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary={0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29})""" expected = expected[1:] # remove first \n assert pp.pformat(vectorizer) == expected # Now with ellipsis vocabulary = {i: i for i in range(n_max_elements_to_show + 1)} vectorizer = CountVectorizer(vocabulary=vocabulary) expected = r""" CountVectorizer(analyzer='word', binary=False, decode_error='strict', dtype=<class 'numpy.int64'>, encoding='utf-8', input='content', lowercase=True, max_df=1.0, max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None, stop_words=None, strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary={0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, ...})""" expected = expected[1:] # remove first \n assert pp.pformat(vectorizer) == expected # Also test with lists param_grid = {'C': list(range(n_max_elements_to_show))} gs = GridSearchCV(SVC(), param_grid) expected = """ GridSearchCV(cv='warn', error_score='raise-deprecating', estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto_deprecated', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False), iid='warn', n_jobs=None, param_grid={'C': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]}, pre_dispatch='2*n_jobs', refit=True, return_train_score=False, scoring=None, verbose=0)""" expected = expected[1:] # remove first \n assert pp.pformat(gs) == expected # Now with ellipsis param_grid = {'C': list(range(n_max_elements_to_show + 1))} gs = GridSearchCV(SVC(), param_grid) expected = """ GridSearchCV(cv='warn', error_score='raise-deprecating', estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto_deprecated', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False), iid='warn', n_jobs=None, param_grid={'C': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, ...]}, pre_dispatch='2*n_jobs', refit=True, return_train_score=False, scoring=None, verbose=0)""" expected = expected[1:] # remove first \n assert pp.pformat(gs) == expected
def test_gridsearch_pipeline(): # render a pipeline inside a gridsearch pp = _EstimatorPrettyPrinter(compact=True, indent=1, indent_at_name=True) pipeline = Pipeline([('reduce_dim', PCA()), ('classify', LinearSVC())]) N_FEATURES_OPTIONS = [2, 4, 8] C_OPTIONS = [1, 10, 100, 1000] param_grid = [{ 'reduce_dim': [PCA(iterated_power=7), NMF()], 'reduce_dim__n_components': N_FEATURES_OPTIONS, 'classify__C': C_OPTIONS }, { 'reduce_dim': [SelectKBest(chi2)], 'reduce_dim__k': N_FEATURES_OPTIONS, 'classify__C': C_OPTIONS }] gspipline = GridSearchCV(pipeline, cv=3, n_jobs=1, param_grid=param_grid) expected = """ GridSearchCV(cv=3, error_score='raise-deprecating', estimator=Pipeline(memory=None, steps=[('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None, svd_solver='auto', tol=0.0, whiten=False)), ('classify', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True, intercept_scaling=1, loss='squared_hinge', max_iter=1000, multi_class='ovr', penalty='l2', random_state=None, tol=0.0001, verbose=0))]), iid='warn', n_jobs=1, param_grid=[{'classify__C': [1, 10, 100, 1000], 'reduce_dim': [PCA(copy=True, iterated_power=7, n_components=None, random_state=None, svd_solver='auto', tol=0.0, whiten=False), NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200, n_components=None, random_state=None, shuffle=False, solver='cd', tol=0.0001, verbose=0)], 'reduce_dim__n_components': [2, 4, 8]}, {'classify__C': [1, 10, 100, 1000], 'reduce_dim': [SelectKBest(k=10, score_func=<function chi2 at some_address>)], 'reduce_dim__k': [2, 4, 8]}], pre_dispatch='2*n_jobs', refit=True, return_train_score=False, scoring=None, verbose=0)""" expected = expected[1:] # remove first \n repr_ = pp.pformat(gspipline) # Remove address of '<function chi2 at 0x.....>' for reproducibility repr_ = re.sub('function chi2 at 0x.*>', 'function chi2 at some_address>', repr_) assert repr_ == expected
def test_gridsearch_pipeline(print_changed_only_false): # render a pipeline inside a gridsearch pp = _EstimatorPrettyPrinter(compact=True, indent=1, indent_at_name=True) pipeline = Pipeline([("reduce_dim", PCA()), ("classify", SVC())]) N_FEATURES_OPTIONS = [2, 4, 8] C_OPTIONS = [1, 10, 100, 1000] param_grid = [ { "reduce_dim": [PCA(iterated_power=7), NMF()], "reduce_dim__n_components": N_FEATURES_OPTIONS, "classify__C": C_OPTIONS, }, { "reduce_dim": [SelectKBest(chi2)], "reduce_dim__k": N_FEATURES_OPTIONS, "classify__C": C_OPTIONS, }, ] gspipline = GridSearchCV(pipeline, cv=3, n_jobs=1, param_grid=param_grid) expected = """ GridSearchCV(cv=3, error_score='raise-deprecating', estimator=Pipeline(memory=None, steps=[('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None, svd_solver='auto', tol=0.0, whiten=False)), ('classify', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto_deprecated', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False))]), iid='warn', n_jobs=1, param_grid=[{'classify__C': [1, 10, 100, 1000], 'reduce_dim': [PCA(copy=True, iterated_power=7, n_components=None, random_state=None, svd_solver='auto', tol=0.0, whiten=False), NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200, n_components=None, random_state=None, shuffle=False, solver='cd', tol=0.0001, verbose=0)], 'reduce_dim__n_components': [2, 4, 8]}, {'classify__C': [1, 10, 100, 1000], 'reduce_dim': [SelectKBest(k=10, score_func=<function chi2 at some_address>)], 'reduce_dim__k': [2, 4, 8]}], pre_dispatch='2*n_jobs', refit=True, return_train_score=False, scoring=None, verbose=0)""" expected = expected[1:] # remove first \n repr_ = pp.pformat(gspipline) # Remove address of '<function chi2 at 0x.....>' for reproducibility repr_ = re.sub("function chi2 at 0x.*>", "function chi2 at some_address>", repr_) assert repr_ == expected
def test_n_max_elements_to_show(): n_max_elements_to_show = 30 pp = _EstimatorPrettyPrinter( compact=True, indent=1, indent_at_name=True, n_max_elements_to_show=n_max_elements_to_show ) # No ellipsis vocabulary = {i: i for i in range(n_max_elements_to_show)} vectorizer = CountVectorizer(vocabulary=vocabulary) expected = r""" CountVectorizer(analyzer='word', binary=False, decode_error='strict', dtype=<class 'numpy.int64'>, encoding='utf-8', input='content', lowercase=True, max_df=1.0, max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None, stop_words=None, strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary={0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29})""" expected = expected[1:] # remove first \n assert pp.pformat(vectorizer) == expected # Now with ellipsis vocabulary = {i: i for i in range(n_max_elements_to_show + 1)} vectorizer = CountVectorizer(vocabulary=vocabulary) expected = r""" CountVectorizer(analyzer='word', binary=False, decode_error='strict', dtype=<class 'numpy.int64'>, encoding='utf-8', input='content', lowercase=True, max_df=1.0, max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None, stop_words=None, strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary={0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, ...})""" expected = expected[1:] # remove first \n assert pp.pformat(vectorizer) == expected # Also test with lists param_grid = {'C': list(range(n_max_elements_to_show))} gs = GridSearchCV(SVC(), param_grid) expected = """ GridSearchCV(cv='warn', error_score='raise-deprecating', estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto_deprecated', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False), iid='warn', n_jobs=None, param_grid={'C': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]}, pre_dispatch='2*n_jobs', refit=True, return_train_score=False, scoring=None, verbose=0)""" expected = expected[1:] # remove first \n assert pp.pformat(gs) == expected # Now with ellipsis param_grid = {'C': list(range(n_max_elements_to_show + 1))} gs = GridSearchCV(SVC(), param_grid) expected = """ GridSearchCV(cv='warn', error_score='raise-deprecating', estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto_deprecated', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False), iid='warn', n_jobs=None, param_grid={'C': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, ...]}, pre_dispatch='2*n_jobs', refit=True, return_train_score=False, scoring=None, verbose=0)""" expected = expected[1:] # remove first \n assert pp.pformat(gs) == expected
def test_gridsearch_pipeline(): # render a pipeline inside a gridsearch pp = _EstimatorPrettyPrinter(compact=True, indent=1, indent_at_name=True) pipeline = Pipeline([ ('reduce_dim', PCA()), ('classify', LinearSVC()) ]) N_FEATURES_OPTIONS = [2, 4, 8] C_OPTIONS = [1, 10, 100, 1000] param_grid = [ { 'reduce_dim': [PCA(iterated_power=7), NMF()], 'reduce_dim__n_components': N_FEATURES_OPTIONS, 'classify__C': C_OPTIONS }, { 'reduce_dim': [SelectKBest(chi2)], 'reduce_dim__k': N_FEATURES_OPTIONS, 'classify__C': C_OPTIONS } ] gspipline = GridSearchCV(pipeline, cv=3, n_jobs=1, param_grid=param_grid) expected = """ GridSearchCV(cv=3, error_score='raise-deprecating', estimator=Pipeline(memory=None, steps=[('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None, svd_solver='auto', tol=0.0, whiten=False)), ('classify', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True, intercept_scaling=1, loss='squared_hinge', max_iter=1000, multi_class='ovr', penalty='l2', random_state=None, tol=0.0001, verbose=0))]), iid='warn', n_jobs=1, param_grid=[{'classify__C': [1, 10, 100, 1000], 'reduce_dim': [PCA(copy=True, iterated_power=7, n_components=None, random_state=None, svd_solver='auto', tol=0.0, whiten=False), NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200, n_components=None, random_state=None, shuffle=False, solver='cd', tol=0.0001, verbose=0)], 'reduce_dim__n_components': [2, 4, 8]}, {'classify__C': [1, 10, 100, 1000], 'reduce_dim': [SelectKBest(k=10, score_func=<function chi2 at some_address>)], 'reduce_dim__k': [2, 4, 8]}], pre_dispatch='2*n_jobs', refit=True, return_train_score=False, scoring=None, verbose=0)""" expected = expected[1:] # remove first \n repr_ = pp.pformat(gspipline) # Remove address of '<function chi2 at 0x.....>' for reproducibility repr_ = re.sub('function chi2 at 0x.*>', 'function chi2 at some_address>', repr_) assert repr_ == expected