def process_data(self, data_folder: str, log_data_obj: dict) -> CoreDataFrame:
    """Read the dataset described by *log_data_obj* out of *data_folder*.

    Parameters
    ----------
    data_folder : str
        Root folder that contains the data files.
    log_data_obj : dict
        Log metadata. Must contain the keys "log_name", "type",
        "location_type" and "folder"; a missing key raises KeyError.

    Returns
    -------
    CoreDataFrame
        The dataframe wrapper loaded by the DatasetSession.  (Previously
        the frame was only printed and discarded; returning it is
        backward compatible for callers that ignored the None result.)
    """
    # Lazy %-style args avoid building the message when the level is off.
    logging.warning("Processing Data for : %s", data_folder)

    # Required metadata keys -- each lookup fails fast with KeyError.
    log_name = log_data_obj["log_name"]  # read for validation; unused below
    log_type = log_data_obj["type"]
    location_type = log_data_obj["location_type"]
    folder = log_data_obj["folder"]

    dataset_session = DatasetSession(log_type)

    # STEP 1: check for new datasets from the folder directory.
    # Compare against the shared enum instead of the hard-coded "csv"
    # literal, for consistency with the other process_data implementation.
    if log_type == DataSourceFileType.CSV.value:
        # invoke datasetsession to read the csv
        dataset_session.read_csv(data_folder, folder, location_type)

    print("isinstance(dataset_session.dataset, Dataset): "
          + str(isinstance(dataset_session.dataset, Dataset)))
    dataset_size: Tuple = dataset_session.get_size()
    logging.info("Dataset Session size: %s", dataset_size)

    # Fetch the actual dataframe, keep the original diagnostic prints,
    # and hand the frame back to the caller.
    print("======GET DATAFRAME ======")
    core_frame = dataset_session.get_dataset().get_dataframe()
    print(core_frame.data.shape)
    return core_frame
def extract_users(dataset_session: DatasetSession, log_metadata_obj: dict) -> List:
    """Return the sorted unique user ids found in the session's dataframe.

    Parameters
    ----------
    dataset_session : DatasetSession
        Session whose loaded Dataset wraps the pandas dataframe.
    log_metadata_obj : dict
        Metadata; the "id_feature" key names the id column (KeyError if
        absent).

    Returns
    -------
    numpy array of unique, sorted id values.  Missing values are mapped
    to the literal string "NA" before deduplication.
    """
    # Unwrap: session -> Dataset -> CoreDataFrame -> pandas DataFrame.
    log_file_dataframe: pd.DataFrame = (
        dataset_session.get_dataset().get_dataframe().data
    )
    logging.warning("execute(): dataframe shape: %s", log_file_dataframe.shape)

    logging.info(
        "ExtractAllUsersCSV: extract_users log_file_data.columns: - %s",
        log_file_dataframe.columns)
    logging.info(
        "ExtractAllUsersCSV: extract_users log_metadata_obj: - %s",
        log_metadata_obj)

    # Look the id column up once (the original indexed the frame twice).
    id_column: pd.Series = log_file_dataframe[log_metadata_obj["id_feature"]]
    logging.info(
        "ExtractAllUsersCSV, extract_users, id_column, len of column: %s",
        len(id_column))

    # np.unique both deduplicates and sorts.
    user_set: List = np.unique(id_column.fillna("NA"))
    logging.info(
        "ExtractAllUsersCSV, extract_users, user_set len of column: %s",
        len(user_set))
    # Was logging.error(): dumping the result set is a debug trace, not an
    # error condition.
    logging.debug(user_set)
    return user_set
def process_data(self, data_folder: str, log_data_obj: dict) -> CoreDataFrame:
    """Load the dataset described by *log_data_obj* from *data_folder*
    and return its CoreDataFrame.

    Raises KeyError when a required metadata key ("log_name", "type",
    "location_type", "folder", "id_feature") is absent.
    """
    logging.warning("Processing Data for : " + str(data_folder))

    # Pull the required metadata fields up front; a missing key fails fast.
    log_name = log_data_obj["log_name"]
    log_type = log_data_obj["type"]
    location_type = log_data_obj["location_type"]
    folder = log_data_obj["folder"]
    id_feature = log_data_obj["id_feature"]

    session = DatasetSession(log_type)

    # Only CSV sources are handled here; read dataset, if any new.
    if log_type == DataSourceFileType.CSV.value:
        session.read_csv(data_folder, folder, location_type)

    loaded = isinstance(session.dataset, Dataset)
    print("isinstance(dataset_session.dataset, Dataset): " + str(loaded))

    size: Tuple = session.get_size()
    logging.info("Dataset Session size: " + str(size))

    # Hand the wrapped dataframe back to the caller.
    print("======GET DATAFRAME ======")
    return session.get_dataset().get_dataframe()