def run_atl(train_ds, test_ds, lbr, model, qs, quota, n_init_labeled):
    """Run ``quota`` query/label/retrain rounds and track the scores.

    Every round asks ``qs`` for an instance id, obtains its label from
    ``lbr``, writes the label into ``train_ds``, retrains ``model`` on
    ``train_ds`` and scores the model on both ``train_ds`` and ``test_ds``.

    Parameters
    ----------
    train_ds : dataset that is queried, updated and trained on.
    test_ds : dataset used for the out-of-sample scores.
    lbr : labeler; ``lbr.label(features)`` returns the label of an instance.
    model : learner exposing ``train``, ``score`` and ``predict``.
    qs : query strategy; ``qs.make_query()`` returns the id to label next.
    quota : number of rounds to run.
    n_init_labeled : accepted for interface compatibility; not used here.

    Returns
    -------
    tuple
        ``(E_in, E_in_f1, E_out, E_out_f1, model, runt)`` where the first
        four hold one value per round (``model.score`` / binary F1 on the
        training set, ``model.score`` / binary F1 on the test set) and
        ``runt`` is the wall-clock runtime in seconds.
    """
    started_at = time.time()
    E_in, E_in_f1, E_out, E_out_f1 = [], [], [], []
    queried_labels = []

    total_rounds = quota
    sup.printProgressBar(0,
                         total_rounds,
                         prefix='Progress:',
                         suffix='Complete',
                         length=50)

    for round_no in range(1, quota + 1):
        # Pick the next instance and fetch its label from the labeler.
        query_id = qs.make_query()
        new_label = lbr.label(train_ds.data[query_id][0])
        train_ds.update(query_id, new_label)
        queried_labels.append(new_label)

        model.train(train_ds)

        # In-sample scores on everything labeled so far.
        X_train_current, y_train_current = train_ds.format_sklearn()
        train_score = model.score(train_ds)
        E_in = np.append(E_in, train_score)
        train_f1 = f1_score(y_train_current,
                            model.predict(X_train_current),
                            pos_label=1,
                            average='binary',
                            sample_weight=None)
        E_in_f1 = np.append(E_in_f1, train_f1)

        # Out-of-sample scores on the held-out test set.
        X_test, y_test = test_ds.format_sklearn()
        test_score = model.score(test_ds)
        E_out = np.append(E_out, test_score)
        _, _, f1score, _ = precision_recall_fscore_support(
            y_test, model.predict(X_test), average='binary')
        E_out_f1 = np.append(E_out_f1, f1score)

        sup.printProgressBar(round_no,
                             total_rounds,
                             prefix='Progress:',
                             suffix='Complete',
                             length=50)

    runt = time.time() - started_at
    print('Runtime: {:.2f} seconds'.format(runt))

    return E_in, E_in_f1, E_out, E_out_f1, model, runt
def calcDomainRelatednessCVinDict(candsets,
                                  all_features,
                                  dense_features_dict=None,
                                  cv=5,
                                  metric='phi'):
    """Collect ``calcDomainRelatednessCV`` results for related candset pairs.

    A pair of candidate-set names is considered when the first or the second
    ``'_'``-separated component of the first name also occurs among the
    components of the second name.

    Parameters
    ----------
    candsets : dict mapping candidate-set names (``'<a>_<b>'``) to the sets.
    all_features : feature list used for the ``'all'`` result.
    dense_features_dict : optional dict of feature lists, keyed by the
        sorted, de-duplicated name components of a pair joined with ``'_'``.
        When given, a ``'dense'`` result is stored as well.
    cv, metric : passed through to ``calcDomainRelatednessCV``.

    Returns
    -------
    dict
        ``{(name_a, name_b): {'all': ..., 'dense': ...}}``; ``'dense'`` is
        only present when ``dense_features_dict`` is given.
    """
    related_pairs = []
    for pair in itertools.combinations(candsets, 2):
        left_parts = pair[0].split('_')
        right_parts = pair[1].split('_')
        if left_parts[0] in right_parts or left_parts[1] in right_parts:
            related_pairs.append(pair)

    n_pairs = len(related_pairs)
    sup.printProgressBar(0,
                         n_pairs,
                         prefix='Progress:',
                         suffix='Complete',
                         length=50)

    d = {}
    for done, pair in enumerate(related_pairs, start=1):
        left_name, right_name = pair
        d[pair] = {
            'all':
            calcDomainRelatednessCV(candsets[left_name], candsets[right_name],
                                    all_features, cv, metric)
        }

        if dense_features_dict is not None:
            # Key is built from the distinct name components of both sets.
            name_parts = left_name.split('_') + right_name.split('_')
            dense_feature_key = '_'.join(sorted(set(name_parts)))
            d[pair]['dense'] = calcDomainRelatednessCV(
                candsets[left_name], candsets[right_name],
                dense_features_dict[dense_feature_key], cv, metric)

        sup.printProgressBar(done,
                             n_pairs,
                             prefix='Progress:',
                             suffix='Complete',
                             length=50)

    return d
def run_weighted_atl(train_ds, test_ds, lbr, model, qs, quota):
    """Run active transfer learning with (optionally weighted) source data.

    Every iteration queries an instance id from ``qs``, obtains its label
    from ``lbr``, stores it in ``train_ds`` (and corrects the transferred
    label when it differs), retrains ``model`` and scores it on the current
    training data and on ``test_ds``.

    Training depends on ``model.model.warm_start``:

    * warm start, iteration 0: train on the source training data only.
    * warm start, later iterations with both classes in ``train_ds._y``:
      grow the ensemble by two estimators and train on the queried target
      instances only (``no_weights=True``).
    * otherwise (including estimators without a ``warm_start`` attribute):
      retrain on source plus the labeled target instances, with weight 1
      for every target instance.

    Parameters
    ----------
    train_ds : training dataset holding source data and transfer labels.
    test_ds : target test dataset.
    lbr : labeler; ``lbr.label(features)`` returns the label of an instance.
    model : learner wrapping an estimator in ``model.model``.
    qs : query strategy; ``qs.make_query()`` returns the id to label next.
    quota : number of iterations.

    Returns
    -------
    tuple
        ``(E_in, E_in_f1, E_out, E_out_f1, E_out_P, E_out_R, model, runt,
        share_of_corrected_labeles, model_pred_prob, model_feature_import,
        model_depth_tree)``. The ``E_*`` entries hold one value per
        iteration; the three ``model_*`` lists hold snapshots taken after
        the first warm-start iteration and after the last iteration.
    """
    start_time = time.time()

    E_in, E_in_f1, E_out, E_out_f1 = [], [], [], []
    E_out_P, E_out_R = [], []
    model_pred_prob, model_feature_import, model_depth_tree = [], [], []
    corrected_labels = []

    X_test, y_test = test_ds.format_sklearn()

    l = quota
    sup.printProgressBar(0,
                         l,
                         prefix='Progress:',
                         suffix='Complete',
                         length=50)

    for i in range(quota):
        ask_id = qs.make_query()
        # Oracle label for the queried instance.
        lb = lbr.label(train_ds.data[ask_id][0])
        train_ds.update(ask_id, lb)

        # Track whether the oracle label corrects the transferred label.
        corrected = 0
        if train_ds._y_transfer_labels[ask_id] != lb:
            corrected = 1
            train_ds.update_transfer_labels(ask_id, lb)
        corrected_labels.append(corrected)

        # Estimators without ``warm_start`` take the full-retrain path below
        # instead of failing with an AttributeError.
        warm_start = getattr(model.model, 'warm_start', False)
        record_start_state = False

        if (warm_start and 1 in train_ds._y and 0 in train_ds._y and i > 0):
            # For RF with warm_start=True and i > 0 two trees are added to
            # the forest; they are trained only on the target instances
            # queried so far. For i == 1 that is just the two bootstrapped
            # target instances (one negative, one positive).
            model.model.n_estimators += 2
            model.train(train_ds, no_weights=True)
            X_train_current, y_train_current = train_ds.format_sklearn(
                no_weights=True)
        elif warm_start and i == 0:
            # For RF with warm_start=True the first iteration trains the
            # initial forest on the source instances. If the dataset was
            # initialized with importance weights they are used as sample
            # weights; otherwise training happens without domain adaptation.
            model.train(train_ds.get_source_training_data())
            X_train_current, y_train_current = train_ds.format_sklearn(
                no_weights=True)
            record_start_state = True
        else:
            # Without warm start the model is always retrained on the source
            # instances plus the currently labeled target instances.
            X_source, y_source, sample_weights = (
                train_ds.get_source_training_data().format_sklearn())
            X_target_current, y_target_current = train_ds.format_sklearn(
                no_weights=True)
            X_train_current = np.vstack([X_source, X_target_current])
            y_train_current = np.append(y_source, y_target_current)
            # Every target instance gets a weight of 1.
            sample_weights = np.concatenate(
                [sample_weights, [1] * (y_target_current.shape[0])])
            model.train(
                le.AWTLDataset(X_train_current, y_train_current,
                               sample_weights))

        # Training score on the data selected by the branch above.
        E_in = np.append(
            E_in, model.score(Dataset(X=X_train_current, y=y_train_current)))
        E_in_f1 = np.append(
            E_in_f1,
            f1_score(y_train_current,
                     model.predict(X_train_current),
                     pos_label=1,
                     average='binary'))

        if record_start_state:
            # Snapshot of the initial model: predicted probabilities on the
            # target test set, feature importances and tree depths.
            model_pred_prob.append(model.predict_proba(X_test))
            model_feature_import.append(model.feature_importances_())
            model_depth_tree.append(model.get_trees_max_depth())

        # Test score for this iteration.
        E_out = np.append(E_out, model.score(test_ds))
        prec, recall, f1score, _ = precision_recall_fscore_support(
            y_test, model.predict(X_test), average='binary')
        # Append before reporting so that index -1 is the current iteration.
        E_out_f1 = np.append(E_out_f1, f1score)
        E_out_P = np.append(E_out_P, prec)
        E_out_R = np.append(E_out_R, recall)

        if i == quota - 1:
            model_pred_prob.append(model.predict_proba(X_test))
            model_feature_import.append(model.feature_importances_())
            model_depth_tree.append(model.get_trees_max_depth())
            print(
                'Last iteration Performance on Target Test Set: F1 {:.2f}; Prec {:.2f}; Recall {:.2f}'
                .format(E_out_f1[-1], E_out_P[-1], E_out_R[-1]))
            # Index -1 instead of 1: without a warm-start snapshot the list
            # holds a single entry and both values refer to the final model.
            print(
                'Average depth of trees at start (iteration 0): {} at last iteration {}'
                .format(np.mean(model_depth_tree[0]),
                        np.mean(model_depth_tree[-1])))

        sup.printProgressBar(i + 1,
                             l,
                             prefix='Progress:',
                             suffix='Complete',
                             length=50)

    # No iterations means no corrections; avoid dividing by zero.
    share_of_corrected_labeles = (sum(corrected_labels) /
                                  quota if quota else 0.0)
    runt = time.time() - start_time
    print('Runtime: {:.2f} seconds'.format(runt))
    print('Corrected labels from transfer: {}'.format(sum(corrected_labels)))

    return (E_in, E_in_f1, E_out, E_out_f1, E_out_P, E_out_R, model, runt,
            share_of_corrected_labeles, model_pred_prob,
            model_feature_import, model_depth_tree)
def _tl_dict_first_crossing(transfer_scores, target_scores, x_instances):
    """Return the x value at the first sign change of transfer - target.

    Returns ``np.nan`` when the sign of the difference never changes.
    """
    difference = np.array(transfer_scores) - np.array(target_scores)
    crossings = np.argwhere(np.diff(np.sign(difference))).flatten()
    if crossings.size == 0:
        return np.nan
    return x_instances[crossings[0]]


def _tl_dict_summary(transfer_scores, target_scores, x_instances, n):
    """Build the result entry stored per estimator for one direction."""
    return {
        'transfer_avg_result': sum(transfer_scores) / len(x_instances),
        'target_max_result': max(target_scores),
        'x_target_exceed': _tl_dict_first_crossing(transfer_scores,
                                                   target_scores,
                                                   x_instances),
        'y_transfer_results': transfer_scores,
        'y_target_results': target_scores,
        'n_runs': n,
    }


def performTLFromDict(candsets,
                      candsets_train,
                      candsets_test,
                      estimators,
                      all_features,
                      dense_features_dict=None,
                      da_weighting=None,
                      n=10):
    """Run the transfer-learning experiment for all related candset pairs.

    ***IMPORTANT*** -> Very time consuming. Results should be saved to disk
    with saveTLResultsToJSON() so the experiments need not be repeated.

    For every pair of candidate sets that share a name component, every
    estimator and every target training size in ``x_instances``,
    ``getF1SourceTargetFixedAvg`` is called in both directions (a -> b and
    b -> a), once with ``all_features`` and, if ``dense_features_dict`` is
    given, once with the dense features of the pair.

    Parameters
    ----------
    candsets : dict with all candidate sets (potential correspondences),
        keyed by names of the form ``'<a>_<b>'``.
    candsets_train : dict with the training set of each candidate set.
    candsets_test : dict with the test set of each candidate set.
    estimators : dict of sklearn estimators, e.g.
        ``{'logreg': LogisticRegression(), ...}``.
    all_features : list with all features.
    dense_features_dict : optional dict with the list of dense features per
        pair. The key is built from the distinct name components of both
        sets in alphabetical order joined by ``'_'``; e.g. for source
        ``ban_half`` and target ``wor_half`` the key is ``'ban_half_wor'``.
    da_weighting : domain-adaptation weighting passed through to
        ``getF1SourceTargetFixedAvg``; ``None`` is stored under the key
        ``'no_weighting'``.
    n : number of random samples the results are averaged over, passed
        through to ``getF1SourceTargetFixedAvg``. Large values are very
        expensive. Default: 10.

    Returns
    -------
    dict
        ``d[(source, target)][feature_set][weighting][estimator]`` with
        ``feature_set`` in ``{'all', 'dense'}``. Each entry holds
        ``'transfer_avg_result'``, ``'target_max_result'``,
        ``'x_target_exceed'`` (x at the first sign change of transfer minus
        target score, NaN if none), ``'y_transfer_results'``,
        ``'y_target_results'`` and ``'n_runs'``.
    """
    x_instances = [
        10, 14, 20, 24, 28, 32, 38, 44, 50, 60, 70, 80, 90, 100, 120, 140,
        160, 180, 200, 300, 500
    ]
    weighting_key = 'no_weighting' if da_weighting is None else da_weighting
    feature_sets = ['all'] if dense_features_dict is None else [
        'all', 'dense'
    ]

    d = {}
    combinations = []
    for combo in itertools.combinations(candsets, 2):
        first_parts = combo[0].split('_')
        second_parts = combo[1].split('_')
        if first_parts[0] in second_parts or first_parts[1] in second_parts:
            combinations.append(combo)

    l = len(combinations)
    sup.printProgressBar(0,
                         l,
                         prefix='Progress:',
                         suffix='Complete',
                         length=50)

    for i, combo in enumerate(combinations):
        # Both transfer directions: a -> b, then b -> a.
        directions = [combo, combo[::-1]]
        dense_feature_key = '_'.join(
            sorted(set(combo[0].split('_') + combo[1].split('_'))))

        for clf in estimators:
            # scores[feature_set][direction] -> (transfer F1s, target F1s)
            scores = {
                feature_set: {direction: ([], [])
                              for direction in directions}
                for feature_set in feature_sets
            }

            for x in x_instances:
                # Call order per x is kept (all a->b, all b->a, dense a->b,
                # dense b->a) so any random sampling in the helper sees the
                # same sequence of calls as before.
                for feature_set in feature_sets:
                    if feature_set == 'all':
                        features = all_features
                    else:
                        features = dense_features_dict[dense_feature_key]
                    for direction in directions:
                        source_name, target_name = direction
                        res = getF1SourceTargetFixedAvg(
                            candsets[source_name], candsets[target_name],
                            candsets_train[target_name],
                            candsets_test[target_name], estimators[clf],
                            features, da_weighting, x, n)
                        transfer_scores, target_scores = scores[feature_set][
                            direction]
                        transfer_scores.append(res[0])
                        target_scores.append(res[1])

            for feature_set in feature_sets:
                for direction in directions:
                    transfer_scores, target_scores = scores[feature_set][
                        direction]
                    per_weighting = d.setdefault(direction, {}).setdefault(
                        feature_set, {}).setdefault(weighting_key, {})
                    per_weighting[clf] = _tl_dict_summary(
                        transfer_scores, target_scores, x_instances, n)

        sup.printProgressBar(i + 1,
                             l,
                             prefix='Progress:',
                             suffix='Complete',
                             length=50)

    return d
def _single_tl_first_crossing(transfer_scores, target_scores, x_instances):
    """Return the x value at the first sign change of transfer - target.

    Returns ``np.nan`` when the sign of the difference never changes.
    """
    difference = np.array(transfer_scores) - np.array(target_scores)
    crossings = np.argwhere(np.diff(np.sign(difference))).flatten()
    if crossings.size == 0:
        return np.nan
    return x_instances[crossings[0]]


def _single_tl_summary(transfer_scores, target_scores, x_instances, n):
    """Build the result entry stored per estimator for one direction."""
    return {
        'transfer_avg_result': sum(transfer_scores) / len(x_instances),
        'target_max_result': max(target_scores),
        'x_target_exceed': _single_tl_first_crossing(transfer_scores,
                                                     target_scores,
                                                     x_instances),
        'y_transfer_results': transfer_scores,
        'y_target_results': target_scores,
        'n_runs': n,
    }


def performSingleTLExp(source,
                       target,
                       source_target_name,
                       target_train,
                       target_test,
                       estimators,
                       features,
                       da_weighting=None,
                       n=10,
                       switch_roles=True,
                       source_train=None,
                       source_test=None):
    """Backup function to perform a single transfer-learning experiment.

    For every estimator and every target training size in ``x_instances``
    ``getF1SourceTargetFixedAvg`` is called for source -> target and, when
    ``switch_roles`` is true, also for target -> source.

    Parameters
    ----------
    source, target : candidate sets of the source and the target domain.
    source_target_name : result key of the source -> target direction, of
        the form ``'<source>_<target>'``.
    target_train, target_test : training and test set of ``target``.
    estimators : dict of sklearn estimators keyed by name.
    features : list of features to use.
    da_weighting : domain-adaptation weighting passed through to
        ``getF1SourceTargetFixedAvg``; ``None`` is stored under the key
        ``'no_weighting'``.
    n : number of runs, passed through to ``getF1SourceTargetFixedAvg``.
    switch_roles : if true, the reversed direction is evaluated as well.
    source_train, source_test : training and test set of ``source``, used
        as the target-side splits of the reversed direction. If either is
        omitted, the reversed direction falls back to ``target_train`` /
        ``target_test`` (the previous behavior) and a warning is issued,
        because those splits belong to the other domain.

    Returns
    -------
    dict
        ``d[name][weighting][estimator]`` with the same entry layout as in
        ``performTLFromDict``.

    NOTE(review): the reversed key is built from the first two
    ``'_'``-separated components of ``source_target_name`` only; confirm
    that callers never pass names with more than two components.
    """
    x_instances = [
        10, 14, 20, 24, 28, 32, 38, 44, 50, 60, 70, 80, 90, 100, 120, 140,
        160, 180, 200, 300, 500
    ]
    weighting_key = 'no_weighting' if da_weighting is None else da_weighting

    d = {}
    l = len(estimators)
    sup.printProgressBar(0,
                         l,
                         prefix='Progress:',
                         suffix='Complete',
                         length=50)

    if switch_roles:
        name_parts = source_target_name.split('_')
        target_source_name = '{}_{}'.format(name_parts[1], name_parts[0])

        if source_train is None or source_test is None:
            import warnings
            warnings.warn(
                'performSingleTLExp: switch_roles=True without source_train/'
                'source_test; the reversed experiment falls back to '
                'target_train/target_test.', RuntimeWarning)
        # When the roles are switched ``source`` becomes the target, so its
        # own train/test splits are the ones to evaluate against.
        reverse_train = target_train if source_train is None else source_train
        reverse_test = target_test if source_test is None else source_test

    for i, clf in enumerate(estimators):
        a_transfer_results, a_target_results = [], []
        b_transfer_results, b_target_results = [], []

        for x in x_instances:
            # transfer from a (source) to b (target)
            res = getF1SourceTargetFixedAvg(source, target, target_train,
                                            target_test, estimators[clf],
                                            features, da_weighting, x, n)
            a_transfer_results.append(res[0])
            a_target_results.append(res[1])

            if switch_roles:
                # transfer from b (target) to a (source)
                res = getF1SourceTargetFixedAvg(target, source, reverse_train,
                                                reverse_test, estimators[clf],
                                                features, da_weighting, x, n)
                b_transfer_results.append(res[0])
                b_target_results.append(res[1])

        d.setdefault(source_target_name,
                     {}).setdefault(weighting_key, {})[clf] = (
                         _single_tl_summary(a_transfer_results,
                                            a_target_results, x_instances, n))
        if switch_roles:
            d.setdefault(target_source_name,
                         {}).setdefault(weighting_key, {})[clf] = (
                             _single_tl_summary(b_transfer_results,
                                                b_target_results, x_instances,
                                                n))

        sup.printProgressBar(i + 1,
                             l,
                             prefix='Progress:',
                             suffix='Complete',
                             length=50)

    return d
def run_al(train_ds, test_ds, lbr, model, qs, quota, n_init_labeled):
    """Run active learning from an initially labeled set and track scores.

    The model is first trained on ``train_ds`` as given, then
    ``quota - n_init_labeled`` query/label/retrain rounds follow. Each round
    asks ``qs`` for an instance id, labels it via ``lbr``, updates
    ``train_ds``, retrains and scores the model on training and test data.

    Parameters
    ----------
    train_ds : dataset that is queried, updated and trained on.
    test_ds : dataset used for the out-of-sample scores.
    lbr : labeler; ``lbr.label(features)`` returns the label of an instance.
    model : learner; ``model.name`` of ``'dt'`` or ``'rf'`` selects which
        tree-depth accessor is recorded.
    qs : query strategy; ``qs.make_query()`` returns the id to label next.
    quota : total label budget including the initially labeled instances.
    n_init_labeled : number of initially labeled instances.

    Returns
    -------
    tuple
        ``(E_in, E_in_f1, E_out, E_out_f1, E_out_P, E_out_R, model, runt,
        model_pred_prob, model_feature_import, model_depth_tree)``. The
        ``model_*`` lists hold a snapshot of the initial model and one of
        the model after the last round.

    NOTE(review): ``E_in``, ``E_in_f1``, ``E_out_f1``, ``E_out_P`` and
    ``E_out_R`` start with ``n_init_labeled`` zeros, ``E_out`` does not, so
    it is shorter than the others -- confirm whether that is intended.
    """
    began = time.time()

    E_in, E_in_f1, E_out = [], [], []
    E_out_f1, E_out_P, E_out_R = [], [], []
    model_pred_prob, model_feature_import, model_depth_tree = [], [], []
    queried_labels = []

    def record_model_state():
        # Predicted probabilities on the test set, feature importances and,
        # for tree models, the tree depth(s).
        model_pred_prob.append(model.predict_proba(X_test))
        model_feature_import.append(model.feature_importances_())
        if model.name == 'dt':
            model_depth_tree.append(model.get_tree_max_depth())
        if model.name == 'rf':
            model_depth_tree.append(model.get_trees_max_depth())

    # One leading zero per initially labeled instance.
    for _ in range(n_init_labeled):
        E_in = np.append(E_in, 0.0)
        E_in_f1 = np.append(E_in_f1, 0.0)
        E_out_f1 = np.append(E_out_f1, 0.0)
        E_out_P = np.append(E_out_P, 0.0)
        E_out_R = np.append(E_out_R, 0.0)

    X_test, y_test = test_ds.format_sklearn()

    # Model trained on the initially labeled set only.
    model.train(train_ds)
    record_model_state()

    n_rounds = quota - n_init_labeled
    sup.printProgressBar(0,
                         n_rounds,
                         prefix='Progress:',
                         suffix='Complete',
                         length=50)

    for step in range(n_rounds):
        query_id = qs.make_query()
        new_label = lbr.label(train_ds.data[query_id][0])
        train_ds.update(query_id, new_label)
        queried_labels.append(new_label)

        model.train(train_ds)

        X_train_current, y_train_current = train_ds.format_sklearn()
        E_in = np.append(E_in, model.score(train_ds))
        train_f1 = f1_score(y_train_current,
                            model.predict(X_train_current),
                            pos_label=1,
                            average='binary',
                            sample_weight=None)
        E_in_f1 = np.append(E_in_f1, train_f1)

        E_out = np.append(E_out, model.score(test_ds))
        prec, recall, f1score, _ = precision_recall_fscore_support(
            y_test, model.predict(X_test), average='binary')

        is_last_round = step == n_rounds - 1
        if is_last_round:
            record_model_state()

        E_out_f1 = np.append(E_out_f1, f1score)
        E_out_P = np.append(E_out_P, prec)
        E_out_R = np.append(E_out_R, recall)

        sup.printProgressBar(step + 1,
                             n_rounds,
                             prefix='Progress:',
                             suffix='Complete',
                             length=50)

    runt = time.time() - began
    print('Runtime: {:.2f} seconds'.format(runt))

    return (E_in, E_in_f1, E_out, E_out_f1, E_out_P, E_out_R, model, runt,
            model_pred_prob, model_feature_import, model_depth_tree)
def returnSuperBMsInDict(candsets_train,
                         candsets_test,
                         estimators,
                         features,
                         progress_bar=True):
    """Evaluate every estimator on every candidate set's hold-out test set.

    For each candidate set and each estimator,
    ``returnPassiveLearningResultsHoldoutSet`` is called with the set's
    training set, its test set and ``features``; the returned score and
    model parameters are stored.

    Parameters
    ----------
    candsets_train : dict of all training sets.
    candsets_test : dict of all test sets, with the same keys.
    estimators : dict of sklearn estimators used to train a classifier,
        e.g. ``{'logreg': LogisticRegression(), 'dectree': DecisionTree()}``.
    features : list of features that shall be used.
    progress_bar : whether a progress bar is printed. Default: True.

    Returns
    -------
    dict
        ``d[candset][estimator] = {'f1': ..., 'model_params': ...}`` where
        ``candset`` is the candidate-set key formatted as a string.
    """
    d = {}

    if progress_bar:
        l = len(candsets_train.keys())
        sup.printProgressBar(0,
                             l,
                             prefix='Progress:',
                             suffix='Complete',
                             length=50)

    for i, candset in enumerate(candsets_train):
        # The same formatted key is used for lookup and storage.
        key = '{}'.format(candset)
        for clf in estimators:
            # The helper returns (score, params). Both are unpacked whether
            # or not the progress bar is shown.
            res, params = returnPassiveLearningResultsHoldoutSet(
                estimators[clf], candsets_train[candset],
                candsets_test[candset], features)
            d.setdefault(key, {})[clf] = {'f1': res, 'model_params': params}

        if progress_bar:
            sup.printProgressBar(i + 1,
                                 l,
                                 prefix='Progress:',
                                 suffix='Complete',
                                 length=50)

    return d