def cyto_ai_training_pipeline_log_df(self, data_df):

        data_processing_obj = DataProcessing()
        sampling_obj = DataSampling()
        sys_obj = SysConfig()
        train_obj = DataTrainning()

        test_set_ratio = sys_obj.get_test_set_ratio()
        y_key = sys_obj.get_y_key()

        df_output_file = sys_obj.get_df_output_file()
        print(data_df)
        #### data preprocessing
        print("start cast_all_to_numeric.")
        data_df = data_processing_obj.cast_all_to_numeric(data_df)
        print(data_df)
        print("end cast_all_to_numeric.")

        ###
        data_df.to_csv(df_output_file, index=False)
        print("DF output file = %s" % (df_output_file))

        return data_df
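    # Hypothetical usage sketch (class name follows the unit tests further
    # below; the CSV path is illustrative, not from this repo):
    #
    #   train_obj = CytoOATrainingMain()
    #   data_df = pd.read_csv('cnv_features.csv')
    #   data_df = train_obj.cyto_ai_training_pipeline_log_df(data_df)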
 def _callback(mongo_client, **kwargs):
     tb = mongo_client[user_def.USER_DATABASE][user_def.USER_INFOS_TB]
     qcols = {"_id":0}
     for v in cols: 
         if not user_def.has_user_infos_col(v):
             raise UserMongoDAOException('Unknown user infos column "%s"' % v)
         qcols[v] = 1
     
     # pymongo: Cursor.count() was removed in PyMongo 4.x; fetch the single
     # matching document directly with find_one() instead.
     dc = tb.find_one({"uid": uid}, qcols)
     if dc is None:
         return None

     # Python 3: dict.has_key() was removed; use the "in" operator instead.
     if "logo_url" in dc and dc["logo_url"]:
         dc["logo_url"] = SysConfig.current().gen_file_url(type_defines.USER_LOGO_FILE, dc["logo_url"])

     return dc
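 # Note: `uid` and `cols` are free variables expected to be bound in the
 # enclosing scope; this callback is presumably handed a live client by a
 # DAO helper. A minimal standalone equivalent, assuming plain pymongo:
 #
 #   client = pymongo.MongoClient()
 #   doc = client[user_def.USER_DATABASE][user_def.USER_INFOS_TB].find_one(
 #       {"uid": uid}, {"_id": 0, "logo_url": 1})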
    def cyto_cnn_training_pipeline(self, data_ary, test_ary):
        # data_processing_obj = DataProcessing()
        sampling_obj = DataSampling()
        sys_obj = SysConfig()
        train_obj = DataTrainning()
        process_obj = DataProcessing()

        TEST_VALIDATION_SPLIT = 0.33

        # model_output_dir = sys_obj.get_model_output_dir()

        #### data preprocessing
        np.random.shuffle(data_ary)
        train_set_x, train_set_y = sampling_obj.get_x_y_from_data_ary(data_ary)
        print('train set shape', train_set_x.shape)
        train_set_x = train_set_x.reshape(train_set_x.shape[0], 432, 1220, 1)

        np.random.shuffle(test_ary)
        test_X_ary, test_Y_ary = sampling_obj.get_x_y_from_data_ary(test_ary)
        test_X_ary = test_X_ary.reshape(test_X_ary.shape[0], 432, 1220, 1)
        (test_set_x, valid_set_x, test_set_y,
         valid_set_y) = train_test_split(test_X_ary,
                                         test_Y_ary,
                                         test_size=TEST_VALIDATION_SPLIT)
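        # Note: cnn_apply() below receives the full test arrays
        # (test_X_ary/test_Y_ary); the split's test_set_x/test_set_y are
        # currently unused and kept only for reference.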
        # train_set_x, train_set_y = sampling_obj.binary_ary_data_over_sampling(train_set_x, train_set_y)

        ### sampling

        my_model = train_obj.cnn_apply(train_set_x, train_set_y, valid_set_x,
                                       valid_set_y, test_X_ary, test_Y_ary)

        # test_score = my_model.evaluate(test_set_x, test_set_y)

        # test_set_diseas_score = my_model.score(test_set_disease_x, test_set_disease_y)
        # test_set_normal_score = my_model.score(test_set_normal_x, test_set_normal_y)

        # print("\ntest data set, %s: %.2f%%" % (my_model.metrics_names[1], test_score[1]*100))

        # print("Test score:", test_score[0])
        # print('Test accuracy:', test_score[1])

        return my_model
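    # Usage sketch, mirroring the commented-out test further below (the .npy
    # filename comes from that test code):
    #
    #   data_ary = np.load('tif_ary_(1220x432)_0625.npy')
    #   train_ary, test_ary = train_test_split(data_ary, test_size=0.3)
    #   model = train_obj.cyto_cnn_training_pipeline(train_ary, test_ary)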
class UnitTestSeqTools(unittest.TestCase):

	def setUp(self):
		# print("In setUp ...")
		self.data_obj = CytoOADataMain()
		self.env_obj = SysConfig()
		self.train_obj = CytoOATrainingMain()
		self.process_obj = DataProcessing()

	def tearDown(self):
		# print("In tearDown ...")
		pass

	# def test_cyto_keras_training_pipeline(self):
	# 	print("In test_cyto_keras_training_pipeline ... ")

	# 	### probe bind directory
	# 	# # data_dir = '/home/ryan/src_dir/CytoCloudR/tmp/gene/output'
	# 	# # data_dir = '/home/ryan/src_dir/CytoCloudR/tmp/gene/test_output'
	# 	# data_dir = self.env_obj.get_cnv_output_dir()

	# 	### report excel
	# 	# outcome_file = '/home/ryan/src_dir/CytoOA_AI/data/Cyto_Report_summary2.xlsx'
	# 	# outcome_file = self.env_obj.get_outcome_file()

	# 	## read data
	# 	# cnv_df = self.data_obj.build_cnv_training_data(data_dir, outcome_file)
	# 	# print(cnv_df)

	# 	# print(cnv_df.describe())
	# 	# print(cnv_df.duplicated())
	# 	### DNN
	# 	# self.train_obj.cyto_dnn_balance_training_pipeline(cnv_df)
	
	# def test_cyto_tif_training_pipeline(self):
	# 	# # read tif path and cnv_outcome
	# 	# img_data_dir = self.env_obj.get_tif_data_dir()
	# 	# img_path_df = self.data_obj.get_tif_data_table(img_data_dir, 'local_tiff_match_file_list.txt')

	# 	# with open('data_df.pickle','rb') as file: # pickle for fast testing
	# 	# 	cnv_df = pickle.load(file)
	# 	# img_path_df = pd.merge(left=img_path_df, right=cnv_df.loc[:, ['Array_ID', 'cnv_outcome']]) ## img_path_df holds the X paths and the Y labels
		
	# 	# print(img_path_df.head())
	# 	# print(img_path_df.shape)
	# 	# # # read tif pixel value
	# 	# data_ary = self.data_obj.get_img_ary(img_path_df)
	# 	# np.save('tif_ary_(1220x432)_0625.npy', data_ary)
	# 	data_ary = np.load('tif_ary_(1220x432)_0625.npy') # for fast testing 'tif_ary_(610x216)_0615.npy'
	# 	(train_data_ary, test_data_ary) = train_test_split(data_ary, test_size=0.3)
	# 	# augmented_data = self.process_obj.img_balance_augmentation(train_data_ary)
	# 	# data = data[:50]

	# 	### CNN
	# 	self.train_obj.cyto_cnn_training_pipeline(train_data_ary, test_data_ary)

	# 	# ### Vgg-net
	
	def test_cyto_R_CBS_training_pipeline(self):
		#	Read CBS plot and cnv_outcome
		img_data_dir = self.env_obj.get_tif_data_dir()
		img_path_df = self.data_obj.get_tif_data_table(img_data_dir, 'cbs_all_file_list.txt')
		print("CBS all data is loaded. \n")
		with open('data_df.pickle','rb') as file: # pickle for fast testing
			cnv_df = pickle.load(file)
		img_path_df = pd.merge(left=img_path_df, right=cnv_df.loc[:, ['Array_ID', 'cnv_outcome']]) ## img_path_df holds the X paths and the Y labels
		
		print('Shape of the raw data records:', img_path_df.shape, '\n')

		data_ary = self.data_obj.get_img_ary(img_path_df)
		np.save('CBS_ary_(1150x400)_0801.npy', data_ary)
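		# (A training step analogous to the tif pipeline above would
		# typically follow here, e.g.:
		#   train_ary, test_ary = train_test_split(data_ary, test_size=0.3)
		#   self.train_obj.cyto_cnn_training_pipeline(train_ary, test_ary))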
 def __init__(self):
     self.sys_obj = SysConfig()
    def cyto_ai_training_pipeline(self, data_df):

        data_processing_obj = DataProcessing()
        sampling_obj = DataSampling()
        sys_obj = SysConfig()
        train_obj = DataTrainning()

        test_set_ratio = sys_obj.get_test_set_ratio()
        y_key = sys_obj.get_y_key()

        print(data_df)
        #### data preprocessing
        print("start cast_all_to_numeric.")
        data_df = data_processing_obj.cast_all_to_numeric(data_df)
        print("end cast_all_to_numeric.")
        # print("########################")
        # print(data_df)

        ### convert to category
        data_df[y_key] = data_df[y_key].astype('category')

        ### sampling
        (train_set,
         test_set) = sampling_obj.sk_sampling(data_df, test_set_ratio)

        # print("######################## train_set")
        # print(train_set)

        # print("######################## test_set")
        # print(test_set)

        #### temp data
        test_set_bak = test_set.copy()
        # test_set_bak[y_key] = test_set_bak[y_key].astype('int')

        ### get x,y
        (train_set_x,
         train_set_y) = sampling_obj.get_x_y_from_dataframe(train_set, y_key)
        (test_set_x,
         test_set_y) = sampling_obj.get_x_y_from_dataframe(test_set, y_key)

        # print("######################## train_set_x")
        # print(train_set_x)
        # print("######################## train_set_y")
        # print(train_set_y)

        # print("######################## test_set_x")
        # print(test_set_x)
        # print("######################## test_set_y")
        # print(test_set_y)

        ### separate by disease
        test_set_disease = test_set_bak.loc[test_set_bak[y_key] == 1, :]
        test_set_normal = test_set_bak.loc[test_set_bak[y_key] == 0, :]

        (test_set_disease_x,
         test_set_disease_y) = sampling_obj.get_x_y_from_dataframe(
             test_set_disease, y_key)
        (test_set_normal_x,
         test_set_normal_y) = sampling_obj.get_x_y_from_dataframe(
             test_set_normal, y_key)

        # print("######################## test_set_disease")
        # print(test_set_disease)
        # print("######################## test_set_disease_x")
        # print(test_set_disease_x)
        # print("######################## test_set_disease_y")
        # print(test_set_disease_y)

        # print("######################## test_set_normal")
        # print(test_set_normal)
        # print("######################## test_set_normal_x")
        # print(test_set_normal_x)
        # print("######################## test_set_normal_y")
        # print(test_set_normal_y)

        test_set_disease_len = len(test_set_disease_y)
        test_set_normal_len = len(test_set_normal_y)

        print(
            "######################## test_set_disease_len, test_set_normal_len"
        )
        print("{}, {}".format(test_set_disease_len, test_set_normal_len))
        # ### feature list
        # # feature_dict = {}
        # # feature_dict['numeric'] = list(train_set_x.columns.values)

        ### training the model
        my_model = train_obj.SKRandomForest_Category(train_set_x, train_set_y)

        test_score = my_model.score(test_set_x, test_set_y)
        test_set_disease_score = my_model.score(test_set_disease_x,
                                                test_set_disease_y)
        test_set_normal_score = my_model.score(test_set_normal_x,
                                               test_set_normal_y)

        print("Test score = {}".format(test_score))
        print("test_set_disease_score = {}".format(test_set_disease_score))
        print("test_set_normal_score = {}".format(test_set_normal_score))

        return data_df
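    # For reference, a minimal sketch of what SKRandomForest_Category is
    # assumed to wrap (plain scikit-learn; hyperparameters are illustrative,
    # not taken from this repo):
    #
    #   from sklearn.ensemble import RandomForestClassifier
    #   def SKRandomForest_Category(train_x, train_y):
    #       model = RandomForestClassifier(n_estimators=100)
    #       model.fit(train_x, train_y)
    #       return model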
    def cyto_ai_balance_training_pipeline(self, data_df):

        data_processing_obj = DataProcessing()
        sampling_obj = DataSampling()
        sys_obj = SysConfig()
        train_obj = DataTrainning()

        test_set_ratio = sys_obj.get_test_set_ratio()
        y_key = sys_obj.get_y_key()

        model_output_dir = sys_obj.get_model_output_dir()
        log_file = sys_obj.get_log_file()

        model_threshold = 200
        model_count = 1
        print(data_df)
        #### data preprocessing
        print("start cast_all_to_numeric.")
        data_df = data_processing_obj.cast_all_to_numeric(data_df)
        print("end cast_all_to_numeric.")
        # print("########################")
        # print(data_df)

        ### convert to category
        data_df[y_key] = data_df[y_key].astype('category')

        ### sampling

        disease_df = data_df.loc[data_df[y_key] == 1, :]
        normal_df = data_df.loc[data_df[y_key] == 0, :]

        print("### disease_df")
        print(disease_df)

        print("### normal_df")
        print(normal_df)

        print(len(disease_df), len(normal_df))
        # # log_file = '/app/data/model/RF_3000_log.txt'
        fh_writer = open(log_file, 'w')
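        # Train (model_threshold - 1) models, each on a freshly re-balanced
        # sample drawn below; per-model overall/disease/normal accuracies are
        # appended to the log file and each model is persisted with joblib.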

        while model_count < model_threshold:
            # (train_set, train_label, test_set, test_label) = sampling_obj.category2_sampling_pipeline(normal_df, disease_df,y_key, test_set_ratio)
            (train_set,
             test_set) = sampling_obj.category2_simple_sampling_pipeline(
                 normal_df, disease_df, y_key, test_set_ratio)

            print("###############################")
            print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            print(model_count)
            # print("######################## train_set")
            # print(train_set)

            # print("######################## test_set")
            # print(test_set)

            # #### temp data
            test_set_bak = test_set.copy()
            # # test_set_bak[y_key] = test_set_bak[y_key].astype('int')

            ### get x,y
            (train_set_x, train_set_y) = sampling_obj.get_x_y_from_dataframe(
                train_set, y_key)
            (test_set_x, test_set_y) = sampling_obj.get_x_y_from_dataframe(
                test_set, y_key)

            # print("######################## train_set_x")
            # print(train_set_x)
            # print("######################## train_set_y")
            # print(train_set_y)

            # print("######################## test_set_x")
            # print(test_set_x)
            # print("######################## test_set_y")
            # print(test_set_y)

            ### separate by disease
            test_set_disease = test_set_bak.loc[test_set_bak[y_key] == 1, :]
            test_set_normal = test_set_bak.loc[test_set_bak[y_key] == 0, :]

            (test_set_disease_x,
             test_set_disease_y) = sampling_obj.get_x_y_from_dataframe(
                 test_set_disease, y_key)
            (test_set_normal_x,
             test_set_normal_y) = sampling_obj.get_x_y_from_dataframe(
                 test_set_normal, y_key)

            # print("######################## test_set_disease")
            # print(test_set_disease)
            # print("######################## test_set_disease_x")
            # print(test_set_disease_x)
            # print("######################## test_set_disease_y")
            # print(test_set_disease_y)

            # print("######################## test_set_normal")
            # print(test_set_normal)
            # print("######################## test_set_normal_x")
            # print(test_set_normal_x)
            # print("######################## test_set_normal_y")
            # print(test_set_normal_y)

            test_set_disease_len = len(test_set_disease_y)
            test_set_normal_len = len(test_set_normal_y)

            print(
                "######################## test_set_disease_len, test_set_normal_len"
            )
            print("{}, {}".format(test_set_disease_len, test_set_normal_len))
            # ### feature list
            # # feature_dict = {}
            # # feature_dict['numeric'] = list(train_set_x.columns.values)

            ### training the model
            my_model = train_obj.SKRandomForest_Category(
                train_set_x, train_set_y)

            test_score = my_model.score(test_set_x, test_set_y)
            test_set_disease_score = my_model.score(test_set_disease_x,
                                                    test_set_disease_y)
            test_set_normal_score = my_model.score(test_set_normal_x,
                                                   test_set_normal_y)

            print("Test score = {}".format(test_score))
            print("test_set_disease_score = {}".format(test_set_disease_score))
            print("test_set_normal_score = {}".format(test_set_normal_score))

            ### log file
            fh_writer.write("{}\t{}\t{}\n".format(test_score,
                                                  test_set_disease_score,
                                                  test_set_normal_score))

            ### model
            model_file = model_output_dir + str(model_count) + '.pkl'
            joblib.dump(my_model, model_file)
            model_count += 1
        fh_writer.close()

        return data_df
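    # Each persisted model can later be restored with joblib, e.g.
    # (the model index 42 is hypothetical):
    #
    #   model = joblib.load(model_output_dir + '42.pkl')
    #   score = model.score(test_set_x, test_set_y)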
    def cyto_xgboost_balance_training_pipeline(self, data_df):

        data_processing_obj = DataProcessing()
        sampling_obj = DataSampling()
        sys_obj = SysConfig()
        train_obj = DataTrainning()

        test_set_ratio = sys_obj.get_test_set_ratio()
        y_key = sys_obj.get_y_key()

        model_output_dir = sys_obj.get_model_output_dir()
        log_file = sys_obj.get_log_file()

        model_threshold = 200
        model_count = 1
        print(data_df)
        #### data preprocessing
        print("start cast_all_to_numeric.")
        data_df = data_processing_obj.cast_all_to_numeric(data_df)
        print("end cast_all_to_numeric.")
        # print("########################")
        # print(data_df)

        ### convert to category
        # data_df[y_key] = data_df[y_key].astype('category')

        ### sampling

        disease_df = data_df.loc[data_df[y_key] == 1, :]
        normal_df = data_df.loc[data_df[y_key] == 0, :]

        # # log_file = '/app/data/model/RF_3000_log.txt'
        # fh_writer = open(log_file, 'w')

        # while model_count < model_threshold:
        # (train_set, train_label, test_set, test_label) = sampling_obj.category2_sampling_pipeline(normal_df, disease_df,y_key, test_set_ratio)
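        # Note: unlike cyto_ai_balance_training_pipeline above, only a single
        # balanced sample is drawn here; the per-model while loop is disabled.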
        (train_set,
         test_set) = sampling_obj.category2_simple_sampling_pipeline(
             normal_df, disease_df, y_key, test_set_ratio)

        print("###############################")
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        print(model_count)
        # print("######################## train_set")
        # print(train_set)

        # print("######################## test_set")
        # print(test_set)

        # #### temp data
        test_set_bak = test_set.copy()
        # # test_set_bak[y_key] = test_set_bak[y_key].astype('int')

        ### get x,y
        (train_set_x,
         train_set_y) = sampling_obj.get_x_y_from_dataframe(train_set, y_key)
        (test_set_x,
         test_set_y) = sampling_obj.get_x_y_from_dataframe(test_set, y_key)

        # print("######################## train_set_x")
        # print(train_set_x)
        # print("######################## train_set_y")
        # print(train_set_y)

        # print("######################## test_set_x")
        # print(test_set_x)
        # print("######################## test_set_y")
        # print(test_set_y)

        ### separate by disease
        test_set_disease = test_set_bak.loc[test_set_bak[y_key] == 1, :]
        test_set_normal = test_set_bak.loc[test_set_bak[y_key] == 0, :]

        (test_set_disease_x,
         test_set_disease_y) = sampling_obj.get_x_y_from_dataframe(
             test_set_disease, y_key)
        (test_set_normal_x,
         test_set_normal_y) = sampling_obj.get_x_y_from_dataframe(
             test_set_normal, y_key)

        # print("######################## test_set_disease")
        # print(test_set_disease)
        # print("######################## test_set_disease_x")
        # print(test_set_disease_x)
        # print("######################## test_set_disease_y")
        # print(test_set_disease_y)

        # print("######################## test_set_normal")
        # print(test_set_normal)
        # print("######################## test_set_normal_x")
        # print(test_set_normal_x)
        # print("######################## test_set_normal_y")
        # print(test_set_normal_y)

        test_set_disease_len = len(test_set_disease_y)
        test_set_normal_len = len(test_set_normal_y)

        print(
            "######################## test_set_disease_len, test_set_normal_len"
        )
        print("{}, {}".format(test_set_disease_len, test_set_normal_len))
        # ### feature list
        # # feature_dict = {}
        # # feature_dict['numeric'] = list(train_set_x.columns.values)

        ### training the model
        my_model = train_obj.xgboot_training(train_set_x, train_set_y,
                                             test_set_x, test_set_y)

        # # test_set_x = preprocessing.scale(test_set_x)
        # # test_set_x = scaler.scale(test_set_x)
        # test_set_x = scaler.transform(test_set_x)

        # # test_set_y = to_categorical(test_set_y)
        # test_score = my_model.evaluate(test_set_x, test_set_y)
        # # test_set_diseas_score = my_model.score(test_set_disease_x, test_set_disease_y)
        # # test_set_normal_score = my_model.score(test_set_normal_x, test_set_normal_y)

        # print("\ntest data set, %s: %.2f%%" % (my_model.metrics_names[1], test_score[1]*100))

        # print("Test score:", test_score[0])
        # print('Test accuracy:', test_score[1])

        return data_df
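    # A minimal sketch of what xgboot_training (sic, as named in
    # DataTrainning) is assumed to wrap, using XGBoost's scikit-learn API;
    # parameters are illustrative, not taken from this repo:
    #
    #   from xgboost import XGBClassifier
    #   def xgboot_training(train_x, train_y, test_x, test_y):
    #       model = XGBClassifier(n_estimators=100)
    #       model.fit(train_x, train_y, eval_set=[(test_x, test_y)])
    #       return model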