def main(on_switch=False):
    """Run the data-preparation steps when ``on_switch`` is truthy.

    Nothing happens when ``on_switch`` is falsy. Otherwise a
    ``DataPreparationNameEthnicityProject`` is created, its
    ``data_preparation_steps()`` are executed, and ``hf.done_alert()`` is
    called if the object's control panel has a truthy ``'done_switch'``.
    """
    if not on_switch:
        return

    preparer = DataPreparationNameEthnicityProject()
    preparer.data_preparation_steps()

    # The alert flag is read from the object's own control panel.
    alert_requested = preparer.control_panel['done_switch']
    if alert_requested:
        hf.done_alert()
Example #2
0
def main(on_switch=False):
    """Run the feature-generation steps when ``on_switch`` is truthy.

    Nothing happens when ``on_switch`` is falsy. Otherwise a
    ``FeatureCreationNameEthnicityProject`` is created, its
    ``feature_generation_steps()`` are executed, and ``hf.done_alert()`` is
    called if ``local_control_panel['done_switch']`` is truthy.
    """
    if not on_switch:
        return

    feature_builder = FeatureCreationNameEthnicityProject()
    feature_builder.feature_generation_steps()

    # Unlike the object itself, the alert flag comes from a name defined
    # outside this function.
    alert_requested = local_control_panel['done_switch']
    if alert_requested:
        hf.done_alert()
Example #3
0
def main(on_switch=False):
    """Run the preconfigured binary classifiers on test data.

    When ``on_switch`` is truthy, one
    ``ml_pipeline.MachineLearningNameEthnicityProjectBinaryClass`` is built
    and its ``machine_learning_steps()`` executed for every combination
    selected by the switches and lists configured at the top of the body.
    Every run sets ``ml_process_on_test_data_switch`` to True and leaves
    feature selection and cross validation switched off.

    Args:
        on_switch: Master switch; no pipeline is run when it is falsy.

    Note:
        ``local_control_panel['done_switch']`` is checked regardless of
        ``on_switch``, so the done alert can fire even when nothing was run.
    """
    if on_switch:
        save_switch = False  # WARNING: Will overwrite existing if True
        run_on_subsampled_data = True
        run_on_full_data = False
        run_on_unfeatured_data = False
        run_on_featured_data = True

        # Eg, full list >> [1, 5, 10, 50, 100, 500]
        nk_list = [5]
        # Eg, Full list >> ['dummy', 'sex_only', 'name_basic_only',
        #   'name_substring_only', 'name_numeric_only', 'name_metaphone_only',
        #   'name_all', 'loc_basic_only', 'loc_sep_entity_only',
        #   'loc_substring_only', 'loc_all', 'name_all_loc_all']
        feature_set_list = ['name_all_loc_all_reduced']
        ml_algo_dict = {
            'LR': {
                'clf':
                LogisticRegression(tol=3.359818286283781e-05,
                                   solver='liblinear',
                                   penalty='l1',
                                   max_iter=4000,
                                   class_weight=None,
                                   C=4.281332398719396)
            },
            'SVC_LINEAR': {
                'clf': CalibratedClassifierCV(LinearSVC(penalty='l2', C=0.01)),
            },
            'NB': {
                'clf': BernoulliNB(),
            },
        }
        # Eg, Full list >> ['ab', 'fn', 'metis', 'inuit', 'ch', 'ja', 'en', 'fr', 'ir', 'it', 'rus', 'sc', 'others']
        target_set_list = ['ch', 'ab']

        def run_pipeline(nk, use_featured_df, feature_set, algo_key, algo_val,
                         target_label):
            """Build the control panel for one combination and run it."""
            print('>> Current time:', datetime.datetime.now())
            pipeline = ml_pipeline.MachineLearningNameEthnicityProjectBinaryClass(
                control_panel={
                    'save_result_switch': save_switch,  # WARNING: Will overwrite existing
                    'use_subsampled_df_switch': False,  # WARNING: Switch to False in production
                    'use_subsampled_df_nk': nk,
                    'use_featured_df_switch': use_featured_df,
                    'use_feature_set': feature_set,
                    'feature_selection_switch': False,
                    'cross_validation_switch': False,
                    'ml_process_on_test_data_switch': True,
                    'ml_process_on_training_data_switch': False,
                    'ml_process_on_ext_data_switch': False,
                    'ml_algo': [algo_key, algo_val],
                    'ml_algo_param_grid': None,
                    'binary_target_label': target_label,
                    'eval_score': None,
                    'random_state': 888,
                })
            pipeline.machine_learning_steps()

        if run_on_subsampled_data:
            # Loop through subsampling n set with unfeatured data
            if run_on_unfeatured_data:
                for nk in nk_list:
                    for algo_key, algo_val in ml_algo_dict.items():
                        for target_label in target_set_list:
                            run_pipeline(nk, False, [], algo_key, algo_val,
                                         target_label)

            # Loop through feature set and subsampling n set
            if run_on_featured_data:
                for feature_set in feature_set_list:
                    for nk in nk_list:
                        for algo_key, algo_val in ml_algo_dict.items():
                            for target_label in target_set_list:
                                run_pipeline(nk, True, feature_set, algo_key,
                                             algo_val, target_label)

        if run_on_full_data:
            # Run once per algorithm/target using the unfeatured, full dataset.
            # The feature set is not used here, so feature_set_list is not
            # iterated (doing so would only repeat identical runs).
            if run_on_unfeatured_data:
                for algo_key, algo_val in ml_algo_dict.items():
                    for target_label in target_set_list:
                        run_pipeline('none', False, [], algo_key, algo_val,
                                     target_label)

            # Run once per feature set using the featured, full dataset
            if run_on_featured_data:
                for feature_set in feature_set_list:
                    for algo_key, algo_val in ml_algo_dict.items():
                        for target_label in target_set_list:
                            run_pipeline([], True, feature_set, algo_key,
                                         algo_val, target_label)

    if local_control_panel['done_switch']:
        hf.done_alert()
def main(on_switch=False):
    """Run the binary-class pipeline with feature selection switched on.

    When ``on_switch`` is truthy, one
    ``ml_pipeline.MachineLearningNameEthnicityProjectBinaryClass`` is built
    and its ``machine_learning_steps()`` executed for every combination
    selected by the switches and lists configured at the top of the body.
    Every run sets ``feature_selection_switch`` to True, passes the
    algorithm through ``ml_algo_param_grid`` and leaves cross validation and
    all ``ml_process_on_*`` switches off.

    Args:
        on_switch: Master switch; no pipeline is run when it is falsy.

    Note:
        ``local_control_panel['done_switch']`` is checked regardless of
        ``on_switch``, so the done alert can fire even when nothing was run.
    """
    if on_switch:
        save_switch = False
        run_on_subsampled_data = False
        run_on_full_data = True
        run_on_unfeatured_data = False
        run_on_featured_data = True

        # Eg, full list >> [1, 5, 10, 50, 100, 500]
        nk_list = [500]
        # Eg, Full list >> ['dummy', 'sex_only', 'name_basic_only',
        #   'name_substring_only', 'name_numeric_only', 'name_metaphone_only',
        #   'name_all', 'loc_basic_only', 'loc_sep_entity_only',
        #   'loc_substring_only', 'loc_all', 'name_all_loc_all']
        feature_set_list = ['dummy', 'name_all', 'name_all_loc_all_reduced']
        eval_score_list = ['macro f1 score']
        target_label_list = ['ab', 'fn', 'metis', 'inuit', 'ch', 'ja', 'fr']
        ml_algo_param_dict = {
            'LR': {
                'clf': LogisticRegression(),
                'param': {  # empty since we're not doing grid-search here
                },
            },
        }

        def run_pipeline(nk, use_featured_df, feature_set, algo_key, algo_val,
                         target_label, eval_score):
            """Build the control panel for one combination and run it."""
            pipeline = ml_pipeline.MachineLearningNameEthnicityProjectBinaryClass(
                control_panel={
                    'save_result_switch': save_switch,  # WARNING: Will overwrite existing
                    'use_subsampled_df_switch': False,  # WARNING: Switch to False in production
                    'use_subsampled_df_nk': nk,
                    'use_featured_df_switch': use_featured_df,
                    'use_feature_set': feature_set,
                    'feature_selection_switch': True,
                    'cross_validation_switch': False,
                    'ml_process_on_test_data_switch': False,
                    'ml_process_on_training_data_switch': False,
                    'ml_process_on_ext_data_switch': False,
                    'ml_algo': None,
                    'ml_algo_param_grid': [algo_key, algo_val],
                    'binary_target_label': target_label,
                    'eval_score': eval_score,
                    'random_state': 888,
                })
            pipeline.machine_learning_steps()

        if run_on_subsampled_data:
            # Loop through subsampling n set with unfeatured data
            if run_on_unfeatured_data:
                for nk in nk_list:
                    for algo_key, algo_val in ml_algo_param_dict.items():
                        for target_label in target_label_list:
                            for eval_score in eval_score_list:
                                run_pipeline(nk, False, 'none', algo_key,
                                             algo_val, target_label, eval_score)

            # Loop through feature set and subsampling n set
            if run_on_featured_data:
                for feature_set in feature_set_list:
                    for nk in nk_list:
                        for algo_key, algo_val in ml_algo_param_dict.items():
                            for target_label in target_label_list:
                                for eval_score in eval_score_list:
                                    run_pipeline(nk, True, feature_set,
                                                 algo_key, algo_val,
                                                 target_label, eval_score)

        if run_on_full_data:
            # Run once per algorithm/target/score using the unfeatured, full
            # dataset. The feature set is not used here, so feature_set_list
            # is not iterated (doing so would only repeat identical runs).
            if run_on_unfeatured_data:
                for algo_key, algo_val in ml_algo_param_dict.items():
                    for target_label in target_label_list:
                        for eval_score in eval_score_list:
                            run_pipeline('none', False, 'none', algo_key,
                                         algo_val, target_label, eval_score)

            # Run once per feature set using the featured, full dataset
            if run_on_featured_data:
                for feature_set in feature_set_list:
                    for algo_key, algo_val in ml_algo_param_dict.items():
                        for target_label in target_label_list:
                            for eval_score in eval_score_list:
                                run_pipeline('none', True, feature_set,
                                             algo_key, algo_val, target_label,
                                             eval_score)

    if local_control_panel['done_switch']:
        hf.done_alert()
def main(on_switch=False):
    """Run cross validation over parameter grids for the binary classifiers.

    When ``on_switch`` is truthy, one
    ``ml_pipeline.MachineLearningNameEthnicityProjectBinaryClass`` is built
    and its ``machine_learning_steps()`` executed for every combination
    selected by the switches and lists configured at the top of the body,
    repeated ``cv_repeat`` times with ``cross_validation_repeat`` counting
    from 1. Every run sets ``cross_validation_switch`` to True and passes
    the classifier plus its parameter grid through ``ml_algo_param_grid``.

    Args:
        on_switch: Master switch; no pipeline is run when it is falsy.

    Note:
        ``local_control_panel['done_switch']`` is checked regardless of
        ``on_switch``, so the done alert can fire even when nothing was run.
    """
    if on_switch:
        save_switch = False
        run_on_subsampled_data = True
        run_on_full_data = False
        run_on_unfeatured_data = False
        run_on_featured_data = True

        # Eg, full list >> [1, 5, 10, 50, 100, 500]
        nk_list = [5]
        cv_repeat = 1
        # Eg, Full list >> ['dummy', 'sex_only', 'name_basic_only',
        #   'name_substring_only', 'name_numeric_only', 'name_metaphone_only',
        #   'name_all', 'loc_basic_only', 'loc_sep_entity_only',
        #   'loc_substring_only', 'loc_all', 'name_all_loc_all',
        #   'name_all_loc_all_reduced']
        feature_set_list = ['name_all']
        eval_score_list = ['macro f1 score']
        # Eg, Full list >> ['ab', 'fn', 'metis', 'inuit', 'ch', 'ja', 'en', 'fr', 'ir', 'it', 'rus', 'sc', 'others']
        target_label_list = ['fn']
        ml_algo_param_dict = {
            'LR_V1': {
                'clf': LogisticRegression(),
                'param': {
                    'logisticregression__solver': ['liblinear'],
                    'logisticregression__penalty': ['l1', 'l2'],
                    'logisticregression__C': np.logspace(-4, 4, 20),
                    'logisticregression__tol': np.logspace(-5, 5, 20),
                    'logisticregression__class_weight': [None, 'balanced'],
                    'logisticregression__max_iter': [50, 1000, 4000, 20000],
                },
            },

            'LR_V2': {
                'clf': LogisticRegression(),
                'param': {
                    'logisticregression__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
                    'logisticregression__penalty': ['none', 'l2'],
                    'logisticregression__C': np.logspace(-4, 4, 20),
                    'logisticregression__tol': np.logspace(-5, 5, 20),
                    'logisticregression__class_weight': [None, 'balanced'],
                    'logisticregression__max_iter': [50, 1000, 4000, 20000],
                },
            },

            'SVC_LINEAR': {
                'clf': LinearSVC(),
                'param': {
                    'linearsvc__penalty': ['l2'],
                    'linearsvc__loss': ['hinge', 'squared_hinge'],
                    'linearsvc__C': np.logspace(-4, 4, 20),
                    'linearsvc__tol': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
                    'linearsvc__class_weight': [None, 'balanced'],
                    'linearsvc__max_iter': [50, 1000, 4000, 20000],
                },
            },

            'SVC_NONLINEAR': {
                'clf': SVC(),
                'param': {
                    'svc__kernel': ['poly', 'rbf', 'sigmoid'],
                    'svc__C': np.logspace(-4, 4, 20),
                    'svc__tol': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
                    'svc__class_weight': [None, 'balanced'],
                    'svc__decision_function_shape': ['ovo', 'ovr'],
                    'svc__max_iter': [50, 1000, 4000, 20000],
                },
            },

            'NB': {
                'clf': BernoulliNB(),
                'param': {
                    'bernoullinb__alpha': np.logspace(-4, 4, 20),
                    'bernoullinb__binarize': [None, 0, .2, .4, .6, .8, 1],
                    'bernoullinb__fit_prior': [True, False],
                },
            },
        }

        def run_pipeline(nk, use_featured_df, feature_set, target_label,
                         algo_key, algo_val, eval_score, repeat):
            """Build the control panel for one combination and run it."""
            print('>> Current time:', datetime.datetime.now())
            pipeline = ml_pipeline.MachineLearningNameEthnicityProjectBinaryClass(
                control_panel={
                    'save_result_switch': save_switch,  # WARNING: Will overwrite existing
                    'use_subsampled_df_switch': False,  # WARNING: Switch to False in production
                    'use_subsampled_df_nk': nk,
                    'use_featured_df_switch': use_featured_df,
                    'use_feature_set': feature_set,
                    'feature_selection_switch': False,
                    'cross_validation_switch': True,
                    'cross_validation_repeat': repeat,
                    'ml_process_on_test_data_switch': False,
                    'ml_process_on_training_data_switch': False,
                    'ml_process_on_ext_data_switch': False,
                    'ml_algo': None,
                    'ml_algo_param_grid': [algo_key, algo_val],
                    'binary_target_label': target_label,
                    'eval_score': eval_score,
                    'random_state': 888,
                })
            pipeline.machine_learning_steps()

        if run_on_subsampled_data:
            # Loop through subsampling n set with unfeatured data
            if run_on_unfeatured_data:
                for nk in nk_list:
                    for target_label in target_label_list:
                        for algo_key, algo_val in ml_algo_param_dict.items():
                            for eval_score in eval_score_list:
                                for i in range(1, cv_repeat + 1):
                                    run_pipeline(nk, False, [], target_label,
                                                 algo_key, algo_val,
                                                 eval_score, i)

            # Loop through feature set and subsampling n set
            if run_on_featured_data:
                for feature_set in feature_set_list:
                    for nk in nk_list:
                        for target_label in target_label_list:
                            for algo_key, algo_val in ml_algo_param_dict.items():
                                for eval_score in eval_score_list:
                                    for i in range(1, cv_repeat + 1):
                                        run_pipeline(nk, True, feature_set,
                                                     target_label, algo_key,
                                                     algo_val, eval_score, i)

        if run_on_full_data:
            # Run on the unfeatured, full dataset. The feature set is not
            # used here, so feature_set_list is not iterated (doing so would
            # only repeat identical runs).
            if run_on_unfeatured_data:
                for target_label in target_label_list:
                    for algo_key, algo_val in ml_algo_param_dict.items():
                        for eval_score in eval_score_list:
                            for i in range(1, cv_repeat + 1):
                                run_pipeline('none', False, [], target_label,
                                             algo_key, algo_val, eval_score, i)

            # Run once per feature set using the featured, full dataset
            if run_on_featured_data:
                for feature_set in feature_set_list:
                    for target_label in target_label_list:
                        for algo_key, algo_val in ml_algo_param_dict.items():
                            for eval_score in eval_score_list:
                                for i in range(1, cv_repeat + 1):
                                    run_pipeline([], True, feature_set,
                                                 target_label, algo_key,
                                                 algo_val, eval_score, i)

    if local_control_panel['done_switch']:
        hf.done_alert()
def main(on_switch=False):
    """Run the preconfigured multi-class classifiers on test data.

    When ``on_switch`` is truthy, one
    ``ml_pipeline.MachineLearningNameEthnicityProjectMultiClass`` is built
    and its ``machine_learning_steps()`` executed for every combination
    selected by the switches and lists configured at the top of the body.
    Every run sets ``ml_process_on_test_data_switch`` to True, uses
    ``'ETHNICITY_RECAT'`` as ``label_varname`` and leaves feature selection
    and cross validation switched off.

    Args:
        on_switch: Master switch; no pipeline is run when it is falsy.

    Note:
        ``local_control_panel['done_switch']`` is checked regardless of
        ``on_switch``, so the done alert can fire even when nothing was run.
    """
    if on_switch:
        save_switch = False
        run_on_subsampled_data = False
        run_on_full_data = True
        run_on_unfeatured_data = False
        run_on_featured_data = True

        # Eg, full list >> [5, 50, 500]
        nk_list = [5]
        # Eg, ['dummy', 'sex_only', 'name_all', 'name_all_loc_all', 'name_all_loc_all_reduced']
        feature_set_list = ['name_all_loc_all_reduced']
        ml_algo_dict = {
            'LR': {
                'clf': LogisticRegression(
                    tol=0.01438449888287663,
                    solver='liblinear',
                    penalty='l2',
                    multi_class='ovr',
                    max_iter=50,
                    class_weight=None,
                    C=10000.0)
            },
            'SVC': {
                'clf': OneVsRestClassifier(LinearSVC(
                    tol=0.0001,
                    penalty='l2',
                    multi_class='ovr',
                    max_iter=50,
                    loss='squared_hinge',
                    class_weight='balanced',
                    C=0.08858667904100823)),
            },
            'NB': {
                'clf': BernoulliNB(
                    fit_prior=True,
                    binarize=0.6,
                    alpha=0.00026366508987303583)
            },
        }

        def run_pipeline(use_subsampled_df, nk, use_featured_df, feature_set,
                         algo_key, algo_val):
            """Build the control panel for one combination and run it."""
            pipeline = ml_pipeline.MachineLearningNameEthnicityProjectMultiClass(
                control_panel={
                    'save_result_switch': save_switch,  # WARNING: Will overwrite existing
                    'use_subsampled_df_switch': use_subsampled_df,
                    'use_subsampled_df_nk': nk,
                    'use_featured_df_switch': use_featured_df,
                    'use_feature_set': feature_set,
                    'feature_selection_switch': False,
                    'cross_validation_switch': False,
                    'cross_validation_repeat': None,
                    'ml_process_on_test_data_switch': True,
                    'ml_process_on_training_data_switch': False,
                    'ml_process_on_ext_data_switch': False,
                    'ml_algo': [algo_key, algo_val],
                    'ml_algo_param_grid': None,
                    'eval_score': None,
                    'label_varname': 'ETHNICITY_RECAT',
                    'random_state': 888,
                })
            pipeline.machine_learning_steps()

        if run_on_subsampled_data:
            # Loop through subsampling n set with unfeatured data
            if run_on_unfeatured_data:
                for nk in nk_list:
                    for algo_key, algo_val in ml_algo_dict.items():
                        run_pipeline(True, nk, False, [], algo_key, algo_val)

            # Loop through feature set and subsampling n set
            if run_on_featured_data:
                for feature_set in feature_set_list:
                    for nk in nk_list:
                        for algo_key, algo_val in ml_algo_dict.items():
                            run_pipeline(True, nk, True, feature_set,
                                         algo_key, algo_val)

        if run_on_full_data:
            # Run once per algorithm using the unfeatured, full dataset.
            # The featured-df switch is off here, matching the subsampled
            # unfeatured branch above (no feature set is supplied).
            if run_on_unfeatured_data:
                for algo_key, algo_val in ml_algo_dict.items():
                    run_pipeline(False, [], False, [], algo_key, algo_val)

            # Run once per feature set using the featured, full dataset
            if run_on_featured_data:
                for feature_set in feature_set_list:
                    for algo_key, algo_val in ml_algo_dict.items():
                        run_pipeline(False, [], True, feature_set, algo_key,
                                     algo_val)

    if local_control_panel['done_switch']:
        hf.done_alert()
Example #7
0
def _cv_control_panel(save_switch, subsampled, nk, featured, feature_set,
		cv_round, algo_key, algo_val, eval_score):
	"""Build the control-panel dict for one cross-validation run.

	Only the arguments below vary between runs; every other entry is a
	fixed value shared by all runs launched from ``main``.

	Args:
		save_switch: Value for 'save_result_switch'.
		subsampled: Value for 'use_subsampled_df_switch'.
		nk: Value for 'use_subsampled_df_nk'; ``None`` yields a fresh ``[]``.
		featured: Value for 'use_featured_df_switch'.
		feature_set: Value for 'use_feature_set'; ``None`` yields a fresh ``[]``.
		cv_round: 1-based repeat counter stored as 'cross_validation_repeat'.
		algo_key: Name of the algorithm entry (first item of 'ml_algo_param_grid').
		algo_val: Dict holding the estimator ('clf') and its grid ('param').
		eval_score: Value for 'eval_score'.

	Returns:
		A new dict suitable for the ``control_panel`` keyword argument.
	"""
	return {
		'save_result_switch': save_switch,  # WARNING: Will overwrite existing
		'use_subsampled_df_switch': subsampled,
		# A new empty list per call, so runs never share a mutable placeholder.
		'use_subsampled_df_nk': [] if nk is None else nk,
		'use_featured_df_switch': featured,
		'use_feature_set': [] if feature_set is None else feature_set,
		'feature_selection_switch': False,
		'cross_validation_switch': True,
		'cross_validation_repeat': cv_round,
		'ml_process_on_test_data_switch': False,
		'ml_process_on_ext_data_switch': False,
		'ml_process_on_training_data_switch': False,
		'ml_algo': None,
		'ml_algo_param_grid': [algo_key, algo_val],
		'eval_score': eval_score,
		'label_varname': 'ETHNICITY_RECAT',
		'random_state': 888,
	}


def main(on_switch=False):
	"""Launch cross-validation runs of the multi-class ML pipeline.

	When ``on_switch`` is truthy, one pipeline object is created and its
	``machine_learning_steps()`` is called for every combination of
	feature set, subsample size, algorithm/parameter grid, evaluation score
	and repeat index that the local switches below enable.  The pipeline
	class itself is defined elsewhere; presumably it performs a
	hyper-parameter search over each 'param' grid -- confirm there.

	Args:
		on_switch: If falsy, no pipeline run is started.

	Note:
		The final done-alert check is evaluated whether or not ``on_switch``
		is set.
	"""
	if on_switch:
		from itertools import product

		save_switch = False  # WARNING: Will overwrite existing results if True
		run_on_subsampled_data = False
		run_on_full_data = True
		run_on_unfeatured_data = False
		run_on_featured_data = True

		# Eg, full list >> [5, 50, 500]
		nk_list = [5]
		cv_repeat = 1
		# Eg = ['name_all', 'name_all_loc_all', 'name_all_loc_all_reduced']
		feature_set_list = ['name_all_loc_all_reduced']
		# Eg = ['accuracy', 'macro f1 score', 'macro precision', 'macro recall']
		eval_score_list = ['macro f1 score']

		# Each entry pairs an estimator ('clf') with its parameter grid ('param').
		# The '<name>__' key prefixes look like lower-cased estimator class
		# names (scikit-learn pipeline step naming) -- TODO confirm in ml_pipeline.
		ml_algo_param_dict = {
			# liblinear supports both l1 and l2 penalties.
			'LR_V1': {
				'clf': LogisticRegression(),
				'param': {
					'logisticregression__solver': ['liblinear'],
					'logisticregression__penalty': ['l1', 'l2'],
					'logisticregression__C': np.logspace(-4, 4, 20),
					'logisticregression__tol': np.logspace(-5, 5, 20),
					'logisticregression__class_weight': [None, 'balanced'],
					'logisticregression__multi_class': ['ovr', 'auto'],
					'logisticregression__max_iter': [50, 1000, 4000, 20000],
				},
			},
			# Remaining solvers, searched with l2 or no penalty.
			'LR_V2': {
				'clf': LogisticRegression(),
				'param': {
					'logisticregression__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
					'logisticregression__penalty': ['none', 'l2'],
					'logisticregression__C': np.logspace(-4, 4, 20),
					'logisticregression__tol': np.logspace(-5, 5, 20),
					'logisticregression__class_weight': [None, 'balanced'],
					'logisticregression__multi_class': ['ovr', 'multinomial', 'auto'],
					'logisticregression__max_iter': [50, 1000, 4000, 20000],
				},
			},
			'SVC_LINEAR': {
				'clf': OneVsRestClassifier(LinearSVC()),
				'param': {
					'onevsrestclassifier__estimator__penalty': ['l2'],
					'onevsrestclassifier__estimator__loss': ['hinge', 'squared_hinge'],
					'onevsrestclassifier__estimator__C': np.logspace(-4, 4, 20),
					'onevsrestclassifier__estimator__tol': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
					'onevsrestclassifier__estimator__class_weight': [None, 'balanced'],
					'onevsrestclassifier__estimator__multi_class': ['ovr', 'crammer_singer'],
					'onevsrestclassifier__estimator__max_iter': [50, 1000, 4000, 20000],
				},
			},
			'SVC_NONLINEAR': {
				'clf': OneVsRestClassifier(SVC()),
				'param': {
					'onevsrestclassifier__estimator__kernel': ['poly', 'rbf', 'sigmoid'],
					'onevsrestclassifier__estimator__C': np.logspace(-4, 4, 20),
					'onevsrestclassifier__estimator__tol': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
					'onevsrestclassifier__estimator__class_weight': [None, 'balanced'],
					'onevsrestclassifier__estimator__decision_function_shape': ['ovo', 'ovr'],
					'onevsrestclassifier__estimator__max_iter': [50, 1000, 4000, 20000],
				},
			},
			'NB': {
				'clf': BernoulliNB(),
				'param': {
					'bernoullinb__alpha': np.logspace(-4, 4, 20),
					'bernoullinb__binarize': [None, 0, .2, .4, .6, .8, 1],
					'bernoullinb__fit_prior': [True, False],
				},
			},
		}

		def run_grid(feature_sets, nks, subsampled, featured):
			"""Run the pipeline once per combination of the given axes.

			Iteration order (outermost to innermost): feature set, nk,
			algorithm, eval score, repeat index.  A ``None`` entry in
			``feature_sets`` / ``nks`` stands for "not applicable" and is
			passed to the pipeline as an empty list.
			"""
			combos = product(
				feature_sets,
				nks,
				ml_algo_param_dict.items(),
				eval_score_list,
				range(1, cv_repeat + 1),
			)
			for feature_set, nk, (algo_key, algo_val), eval_score, cv_round in combos:
				obj = ml_pipeline.MachineLearningNameEthnicityProjectMultiClass(
					control_panel=_cv_control_panel(
						save_switch, subsampled, nk, featured, feature_set,
						cv_round, algo_key, algo_val, eval_score))
				obj.machine_learning_steps()

		if run_on_subsampled_data:
			# Loop through subsampling n set with unfeatured data
			if run_on_unfeatured_data:
				run_grid([None], nk_list, subsampled=True, featured=False)

			# Loop through feature set and subsampling n set
			if run_on_featured_data:
				run_grid(feature_set_list, nk_list, subsampled=True, featured=True)

		if run_on_full_data:
			# Run once using unfeatured, full dataset
			# NOTE(review): this run keeps 'use_featured_df_switch' True although
			# it is labelled unfeatured (the subsampled unfeatured run uses
			# False).  Preserved as found -- confirm against the pipeline.
			if run_on_unfeatured_data:
				run_grid([None], [None], subsampled=False, featured=True)

			# Run once using featured, full dataset
			if run_on_featured_data:
				run_grid(feature_set_list, [None], subsampled=False, featured=True)

	# Checked regardless of on_switch, so the alert can fire even when no run was started.
	if local_control_panel['done_switch']:
		hf.done_alert()