def get_features_extr(features_str_list, verbose=1):
    '''
    Returns a feature union object containing all the feature extractors
    referenced in the features_str_list.
    '''
    features_str_list = features_str_list.split("+")
    feat_extr_list = []
    # final feature extractor name
    feat_extr_union_name = ""

    if verbose:
        print("Starting loading features extractor ... ")

    # load each features vectorizer and build the union
    # the name of each sub extractor is the final estimator
    for feat_extr_str in features_str_list:
        feat_extr = load_features_extr(feat_extr_str, verbose)
        feat_extr_pipe_name = feat_extr[-1][0]
        feat_extr_pipe = get_pipeline(
            features_extr=feat_extr,
            classifier=None,
            verbose=verbose > 2)
        feat_extr_list.append((feat_extr_pipe_name, feat_extr_pipe))
        feat_extr_union_name += "+" + feat_extr_pipe_name

    feat_extr_union_name = feat_extr_union_name[1:]
    feat_extr_union = FeatureUnion(feat_extr_list)
    res = (feat_extr_union_name, feat_extr_union)

    if verbose:
        print("features extractor loaded : " + feat_extr_union_name + "\n")
    return res
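# A minimal, self-contained sketch of the pattern get_features_extr builds:
# each named sub-pipeline goes into a scikit-learn FeatureUnion whose outputs
# are concatenated column-wise. The two vectorizers below are illustrative
# assumptions, not the extractors this project actually loads.
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

union = FeatureUnion([
    ("word-counts", CountVectorizer(ngram_range=(1, 2))),
    ("tfidf", TfidfVectorizer()),
])
features = union.fit_transform(["a first tweet", "a second tweet"])
print(features.shape)  # rows = documents, columns = both feature spaces combined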
def get_prepared_data(payload):
    # num_attribs = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
    #                'total_bedrooms', 'population', 'households', 'median_income']
    full_pipeline = get_pipeline()
    prepared_data = full_pipeline.transform(payload)
    return prepared_data
def set_pipeline(self):
    Pipeline = get_pipeline(self.dataset, self.config.suffix)
    pipe = Pipeline(self.output_size, self.c_dim, self.real_batch_size,
                    os.path.join(self.data_dir, self.dataset),
                    with_labels=False, format=self.format,
                    timer=self.timer, sample_dir=self.sample_dir)
    self.image_batch = pipe.connect()
    print(self.format)
    if self.format == 'NCHW':
        self.images_NHWC = tf.transpose(self.image_batch, [0, 2, 3, 1])
    else:
        self.images_NHWC = self.image_batch
    self.pipe = pipe
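# Self-contained check of the layout conversion above (an illustrative sketch,
# not part of the original pipeline): transposing with perm [0, 2, 3, 1] maps
# NCHW (batch, channels, height, width) tensors to NHWC.
import tensorflow as tf

x_nchw = tf.zeros([8, 3, 64, 64])
x_nhwc = tf.transpose(x_nchw, [0, 2, 3, 1])
print(x_nhwc.shape)  # (8, 64, 64, 3)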
import boto3
import sagemaker

region = boto3.Session().region_name
role = sagemaker.get_execution_role()
default_bucket = sagemaker.session.Session().default_bucket()

# Change these to reflect your project/business name, or to separate the
# ModelPackageGroup/Pipeline from the rest of your team's resources
model_package_group_name = "sagemaker-group-insurance"
pipeline_name = "sagemaker-pipeline-insurance"

print(role)

from pipeline import get_pipeline

pipeline = get_pipeline(
    region=region,
    role=role,
    default_bucket=default_bucket,
    model_package_group_name=model_package_group_name,
    pipeline_name=pipeline_name,
)

pipeline.upsert(role_arn=role)
execution = pipeline.start()
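# Optional follow-up, continuing the snippet above: block until the run
# finishes, then print its status and per-step results. wait(), describe() and
# list_steps() are standard methods on the execution handle returned by
# pipeline.start().
execution.wait()
print(execution.describe()["PipelineExecutionStatus"])
for step in execution.list_steps():
    print(step["StepName"], step["StepStatus"])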
def create_pipeline():
    """Create pipeline."""
    pl.get_pipeline(PIPELINE_FILE)
def get_pipeline():
    """Get the pipeline object."""
    pip = pl.get_pipeline(PIPELINE_FILE)
    pip.loglev = 'debug'
    return pip
def optimize(options):
    '''
    Optimizes the given classifier and/or features extractor on a specified
    list of parameters.
    Will proceed as follows :
        - loads the dataset
        - builds the corpus
        - loads the parameters for tuning
        - loads the classifiers
        - loads the features extractors
        - builds the execution pipelines
        - trains and compares the different classifiers on the corpus
        - outputs the best set of parameters found
    '''

    #--------------------------------------------------------------------------
    # Check basic requirements
    if not (options["label-type"]):
        abort_clean("Label type not specified", "expected 'v' or 'g'")

    if not (options["hyper-parameters"]):
        abort_clean("Hyper parameters not specified")

    if not (options["aggregation"]):
        abort_clean("Aggregation strategy not specified")

    #--------------------------------------------------------------------------
    # Load the tweets in one language for variety or gender classification
    Authors = parse_tweets_from_dir(
        input_dir=options["input-dir"],
        output_dir=options["processed-tweets-dir"],
        label=True,
        aggregation=options["aggregation"],
        verbosity_level=options["verbosity"])

    if not (Authors):
        abort_clean("Tweets loading failed")

    #--------------------------------------------------------------------------
    # Load the optimization parameters
    try:
        params = load_config(options["hyper-parameters"])
    except:
        abort_clean("Configuration couldn't be loaded",
                    "given path: " + options["hyper-parameters"])

    #--------------------------------------------------------------------------
    # Load the classifier
    t0 = time()
    classifier = get_classifier(
        classifier_str=params["classifier-call"],
        config=None,
        verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Load the features extractors
    features_extr = get_features_extr(
        features_str_list=params["features-extractr-call"],
        verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Build the execution pipeline
    pipeline = get_pipeline(
        features_extr=features_extr,
        classifier=classifier,
        verbose=options["verbosity"])

    # Set the classifier and the parameters to be tuned
    tuning_parameters = get_opt_parameters(params)
    scores = params["scores"]

    if options["verbosity"]:
        print("Starting the optimization process ...")

    # Launch the tuning of hyper parameters
    for score in scores:
        print("Tuning hyper-parameters for %s" % score)

        optimize_corpus = build_corpus(
            authors=Authors,
            label_type=options["label-type"],
            verbosity=options["verbosity"])

        clf_optimizer = GridSearchCV(
            estimator=pipeline,
            param_grid=tuning_parameters,
            scoring='%s_macro' % score,
            fit_params=None,
            n_jobs=-1,
            pre_dispatch='2*n_jobs',
            iid=True,
            cv=None,
            refit=True,
            verbose=options["verbosity"],
            error_score='raise',
            return_train_score=True)

        # Start optimisation
        clf_optimizer.fit(optimize_corpus["tweets"], optimize_corpus["labels"])

        if options["verbosity"]:
            print("Best parameters set found on development set:")
            best_parameters = clf_optimizer.best_params_
            for param_name in sorted(best_parameters.keys()):
                print("\t%s: %r" % (param_name, best_parameters[param_name]))
            print()

        if options["verbosity"] > 1:
            print("Grid scores on development set:")
            means = clf_optimizer.cv_results_['mean_test_score']
            stds = clf_optimizer.cv_results_['std_test_score']
            for mean, std, params in zip(means, stds,
                                         clf_optimizer.cv_results_['params']):
                print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

        # saving results
        save_optimisation_results(
            grid=clf_optimizer,
            output_dir=options["output-dir"],
            score=score,
            verbose=options["verbosity"])
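# Hypothetical shape of the configuration optimize() loads (illustrative
# values, not this project's actual schema): the function reads
# "classifier-call", "features-extractr-call" and "scores" directly, and
# get_opt_parameters() is expected to turn the remaining entries into the
# GridSearchCV param_grid.
params = {
    "classifier-call": "svm",            # name of the classifier to load (placeholder)
    "features-extractr-call": "tfidf",   # "+"-separated feature extractors (placeholder)
    "scores": ["f1", "precision"],       # one GridSearchCV run per score, scored as "<score>_macro"
    # ... remaining entries describe the hyper-parameter grid ...
}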
def train(options):
    '''
    Trains a specified classifier on a specified dataset using specified
    feature extractors.
    Will proceed as follows :
        - loads the dataset
        - builds the corpus
        - loads the classifier
        - loads the features extractor
        - builds the execution pipeline
        - trains the classifier on the corpus
        - cross-validates the resulting model [optional]
        - saves the resulting model [optional]
    '''

    #--------------------------------------------------------------------------
    # Check basic requirements
    if not (options["label-type"]):
        abort_clean("Labels not specified", "expected 'l', 'g' or 'v'")

    if not (options["features"]) and not (options["gensim"]):
        abort_clean("Features not specified")

    if not (options["classifier"]):
        abort_clean("Classifier not specified")

    if not (options["aggregation"]):
        abort_clean("Aggregation strategy not specified")

    #--------------------------------------------------------------------------
    # Load the tweets in one language for variety or gender classification
    Authors = parse_tweets_from_dir(
        input_dir=options["input-dir"],
        output_dir=options["processed-tweets-dir"],
        label=True,
        aggregation=options["aggregation"],
        verbosity_level=options["verbosity"])

    if not (Authors):
        abort_clean("Tweets loading failed")

    #--------------------------------------------------------------------------
    # Load the classifier
    t0 = time()
    classifier = get_classifier(
        classifier_str=options["classifier"][0],
        config=None,
        verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Load the features extractors
    features_extr = None
    if not (options["gensim"]):
        features_extr = get_features_extr(
            features_str_list=options["features"][0],
            verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Build the execution pipeline
    pipeline = get_pipeline(
        features_extr=features_extr,
        classifier=classifier,
        verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # Train the execution pipeline

    # train and cross validate results
    if (options["cross-validation"]):
        if (options["verbosity"]):
            print("Model Training with cross validation\n")

        if options["gensim"]:
            model, pipeline, scores = train_model_gensim_cross_validation(
                authors=Authors,
                label_type=options["label-type"],
                pipeline=pipeline,
                config=options["hyper-parameters"],
                token_level=options["token-level"],
                verbose=options["verbosity"])
        else:
            pipeline, scores = train_model_cross_validation(
                authors=Authors,
                label_type=options["label-type"],
                pipeline=pipeline,
                verbose=options["verbosity"])

        if options["verbosity"]:
            print_scores(scores)

        if options["output-dir"]:
            if options["gensim"]:
                filename = str("doc2vec" +
                               "-siz_" + str(model[0].vector_size) +
                               "-win_" + str(model[0].window) +
                               "-cnt_" + str(model[0].min_count) +
                               get_classifier_name(classifier))
            else:
                filename = str(get_features_extr_name(features_extr) +
                               "+" + get_classifier_name(classifier))
            save_scores(
                scores=scores,
                output_dir=options["output-dir"],
                filename=filename,
                verbose=options["verbosity"])

    # train without validation --> output-dir required
    else:
        if options["verbosity"]:
            print("Model Training without cross validation\n")

        if not (options["output-dir"]):
            abort_clean("No output directory specified.",
                        "Training without persisting is not allowed")

        train_corpus = build_corpus(
            authors=Authors,
            label_type=options["label-type"],
            verbosity=options["verbosity"])

        pipeline = train_model(
            corpus=train_corpus,
            pipeline=pipeline,
            verbose=options["verbosity"])

        #----------------------------------------------------------------------
        # Save the resulting model
        if options["gensim"]:
            filename = "doc2vec+" + get_classifier_name(classifier)
        else:
            filename = str(get_features_extr_name(features_extr) +
                           "+" + get_classifier_name(classifier))

        save_model(
            pipeline=pipeline,
            output_dir=options["output-dir"],
            filename=filename,
            verbose=options["verbosity"])

    #--------------------------------------------------------------------------
    # End Execution
    if options["verbosity"]:
        print("Training task complete in " + str(round(time() - t0)) + " s")
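# Hypothetical options dictionary for train(): every key shown is one the
# function reads, but the concrete values are placeholders, not taken from
# this project.
options = {
    "label-type": "v",                  # 'l', 'g' or 'v'
    "features": ["tfidf"],              # ignored when "gensim" is set
    "gensim": False,
    "classifier": ["svm"],
    "aggregation": 100,                 # aggregation strategy (placeholder value)
    "input-dir": "data/tweets/",
    "processed-tweets-dir": "data/processed/",
    "output-dir": "out/",
    "cross-validation": True,
    "hyper-parameters": None,           # only used on the gensim path
    "token-level": True,                # only used on the gensim path
    "verbosity": 1,
}
train(options)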
def compare(options):
    '''
    Compares a set of specified classifiers on a specified dataset using
    specified features.
    Will proceed as follows :
        - loads the dataset
        - builds the corpus
        - loads the classifiers
        - loads the features extractors
        - builds the execution pipelines
        - trains the different classifiers on the corpus
        - saves the scores obtained by each classifier on each set of features
    '''

    #--------------------------------------------------------------------------
    # Check basic requirements
    if not (options["label-type"]):
        abort_clean("Label type not specified", "expected 'l', 'g' or 'v'")

    if not (options["features"]):
        abort_clean("Features not specified")

    if not (options["classifier"]):
        abort_clean("Classifier not specified")

    #--------------------------------------------------------------------------
    # Load the tweets
    Authors = parse_tweets_from_dir(
        input_dir=options["input-dir"],
        output_dir=options["processed-tweets-dir"],
        label=True,
        aggregation=options["aggregation"],
        verbosity_level=options["verbosity"])

    if not (Authors):
        abort_clean("Tweets loading failed")

    #--------------------------------------------------------------------------
    # Load the classifiers
    classifier_str_list = []
    if isinstance(options["classifier"], list):
        classifier_str_list = options["classifier"]
    else:
        classifier_str_list = [options["classifier"]]

    classifiers = [get_classifier(classifier_str=clf, config=None, verbose=False)
                   for clf in classifier_str_list]

    if options["verbosity"]:
        print("Classifiers Loaded: ")
        for clf in classifiers:
            print(" - '" + clf[0] + "'")
        print()

    #--------------------------------------------------------------------------
    # Load the features extractors
    extractors_str_list = options["features"]
    extractors = [get_features_extr(features_str_list=extr, verbose=False)
                  for extr in extractors_str_list]

    if options["verbosity"]:
        print("Features extractors Loaded: ")
        for extrs in extractors:
            print(" - '" + extrs[0] + "'")
        print()

    #--------------------------------------------------------------------------
    # Prepare the result supports (indexed [extractor][classifier])
    F1_micro = [[0 for x in classifiers] for y in extractors]
    F1_macro = [[0 for x in classifiers] for y in extractors]
    Time_train = [[0 for x in classifiers] for y in extractors]

    output_dir = options["output-dir"]
    individual_scores_dir = output_dir + "indiv_scores/"
    create_dir(individual_scores_dir)

    #--------------------------------------------------------------------------
    # Start the model comparison
    t0 = time()
    total_iteration = len(classifiers) * len(extractors)

    if options["verbosity"]:
        print("Starting model comparisons")

    # Loop over each features-extractor/classifier pair
    for idx_extr, extr in enumerate(extractors):
        extr_name = get_features_extr_name(extr)

        for idx_clf, clf in enumerate(classifiers):
            clf_name = get_classifier_name(clf)

            if options["verbosity"]:
                iteration_number = idx_extr * len(classifiers) + idx_clf + 1
                print("Iteration : " + str(iteration_number) + "/" +
                      str(total_iteration))
                print("Testing : Features: " + extr_name +
                      " | Classifier: " + clf_name)

            t0_step = time()

            # Build pipeline
            pipeline = get_pipeline(
                features_extr=extr,
                classifier=clf,
                verbose=False)

            # Start training + cross validation
            try:
                model, step_scores = train_model_cross_validation(
                    authors=Authors,
                    label_type=options["label-type"],
                    pipeline=pipeline,
                    verbose=False)
            except:
                print("some error occurred - the extracted features and the "
                      "classifier are probably incompatible\n")
                continue

            if options["verbosity"]:
                print("Training complete in " + str(round(time() - t0_step)) +
                      " seconds")
                print_scores(step_scores)
                print()

            # Save scores
            save_scores(
                scores=step_scores,
                output_dir=individual_scores_dir,
                filename=extr_name + "+" + clf_name,
                verbose=False)

            F1_micro[idx_extr][idx_clf] = step_scores["mean_score_micro"]
            F1_macro[idx_extr][idx_clf] = step_scores["mean_score_macro"]
            Time_train[idx_extr][idx_clf] = round(time() - t0_step)

    # Save final micro and macro measures and execution time
    save_comparison_table(F1_micro, extractors, classifiers,
                          output_dir + "micro.csv")
    save_comparison_table(F1_macro, extractors, classifiers,
                          output_dir + "macro.csv")
    save_comparison_table(Time_train, extractors, classifiers,
                          output_dir + "time.csv")

    if options["verbosity"]:
        print("Comparison task complete in " + str(round(time() - t0)) + " s")