def extract_users(dataset_session: DatasetSession, log_metadata_obj: dict) -> List:
    ############## TESTS
    # get dataset
    log_file_dataset: Dataset = dataset_session.get_csv_dataset()
    # get core dataframe
    log_file_core_dataframe: CoreDataFrame = log_file_dataset.get_dataframe()
    # get data frame (.data)
    log_file_dataframe: pd.DataFrame = log_file_core_dataframe.data
    # test: get shape
    log_file_shape: Tuple = log_file_dataframe.shape
    logging.warning("execute(): dataframe shape: " + str(log_file_shape))
    ############

    logging.info("ExtractAllUsersCSV: extract_users log_file_data.columns: - " +
                 str(log_file_dataframe.columns))
    logging.info("ExtractAllUsersCSV: extract_users log_metadata_obj: - " +
                 str(log_metadata_obj))

    # column holding the user/entity identifier, as named by the metadata
    id_column: pd.Series = log_file_dataframe[log_metadata_obj["id_feature"]]
    logging.info("ExtractAllUsersCSV, extract_users, id_column, len of column: " +
                 str(len(id_column)))

    # unique identifiers; missing values are bucketed under "NA" before deduplication
    user_set: List = np.unique(
        log_file_dataframe[log_metadata_obj["id_feature"]].fillna("NA"))
    logging.info("ExtractAllUsersCSV, extract_users, user_set len of column: " +
                 str(len(user_set)))
    logging.error(user_set)
    return user_set
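# --- hedged usage sketch (added example, not part of the original code) ---
# Shows how extract_users might be driven once a CSV-backed DatasetSession
# has been loaded. DatasetSession and the "id_feature" metadata key are taken
# from the snippets in this file; the column name "user_id" is an
# illustrative assumption only.
def example_extract_users_usage(dataset_session: DatasetSession) -> List:
    assumed_metadata = {"id_feature": "user_id"}  # assumed id column name
    users = extract_users(dataset_session, assumed_metadata)
    logging.info("example usage: " + str(len(users)) + " unique users extracted")
    return users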
def process_data(self, data_folder: str, log_data_obj: dict) -> DatasetSession:
    logging.warning("Processing Data for : " + str(data_folder))

    log_name = log_data_obj["log_name"]
    log_type = log_data_obj["type"]
    delimiter = log_data_obj["delimiter"]
    location_type = log_data_obj["location_type"]
    folder = log_data_obj["folder"]
    id_feature = log_data_obj["id_feature"]

    dataset_session: DatasetSession = DatasetSession(log_type)

    # read dataset, if any new
    if log_type == DataSourceFileType.CSV.value:
        # invoke datasetsession to read the csv
        dataset_session.read_csv(data_folder, folder, location_type, delimiter)
        # load
        print("isinstance(dataset_session.dataset, Dataset): " +
              str(isinstance(dataset_session.dataset, Dataset)))
        dataset_size: Tuple = dataset_session.get_size()
        logging.info("Dataset Session size: " + str(dataset_size))
    elif log_type == DataSourceFileType.FLAT.value:
        pass
    elif log_type == DataSourceFileType.PARQUET.value:
        pass
    elif log_type == DataSourceFileType.JSON.value:
        pass

    return dataset_session
def process_data(self, data_folder: str, log_data_obj: dict) -> DatasetSession:
    logging.warning("Processing Data for : " + str(data_folder))

    log_name = log_data_obj["log_name"]
    log_type = log_data_obj["type"]
    location_type = log_data_obj["location_type"]
    folder = log_data_obj["folder"]
    id_feature = log_data_obj["id_feature"]

    dataset_session: DatasetSession = DatasetSession(log_type)

    # read dataset, if any new
    if log_type == DataSourceFileType.CSV.value:
        # invoke datasetsession to read the csv
        dataset_session.read_csv(data_folder, folder, location_type)
        # load
        print("isinstance(dataset_session.dataset, Dataset): " +
              str(isinstance(dataset_session.dataset, Dataset)))
        dataset_size: Tuple = dataset_session.get_size()
        logging.info("Dataset Session size: " + str(dataset_size))

    # fetch actual dataframe to return
    print("======GET DATAFRAME ======")
    # return dataset_session.get_dataset().get_dataframe()
    return dataset_session
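# --- hedged usage sketch (added example, not part of the original code) ---
# Illustrates the shape of the log_data_obj dict that the process_data
# variants above unpack ("log_name", "type", "delimiter", "location_type",
# "folder", "id_feature"). All concrete values are illustrative assumptions;
# DataSourceFileType.CSV.value is taken from the branch checks above.
example_log_data_obj: dict = {
    "log_name": "proxy_log",              # assumed log name
    "type": DataSourceFileType.CSV.value,
    "delimiter": ",",                     # assumed delimiter
    "location_type": "disk",
    "folder": "proxy",
    "id_feature": "user_id",              # assumed id column
}
# e.g. session = pipeline.process_data("../test_datasets/toy_1", example_log_data_obj)
# where `pipeline` is an instance of the class defining process_data (class name not shown here)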
def __init__(self, file_path: str, sep: str, header: int,
             error_bad_lines: bool, warn_bad_lines: bool):
    # read the toy CSV fixture through a disk-backed DatasetSession and wrap it
    dataset_session: DatasetSession = DatasetSession("disk")
    dataset_session.read_csv("../test_datasets/toy_1", "proxy", "disk", sep)
    self.data: DataFrame = CoreDataFrame(dataset_session.dataset.dataframe)
def process_data(self, data_folder: str, log_data_obj: dict):
    logging.warning("Processing Data for : " + str(data_folder))

    log_name = log_data_obj["log_name"]
    log_type = log_data_obj["type"]
    location_type = log_data_obj["location_type"]
    folder = log_data_obj["folder"]

    dataset_session = DatasetSession(log_type)

    '''
    STEP1: check for new datasets from folder directory
    '''
    # read dataset, if any new
    if log_type == "csv":
        # invoke datasetsession to read the csv
        dataset_session.read_csv(data_folder, folder, location_type)
        # load
        print("isinstance(dataset_session.dataset, Dataset): " +
              str(isinstance(dataset_session.dataset, Dataset)))
        dataset_size: Tuple = dataset_session.get_size()
        logging.info("Dataset Session size: " + str(dataset_size))

    # fetch actual dataframe
    print("======GET DATAFRAME ======")
    print(dataset_session.get_dataset().get_dataframe().data.shape)
def __init__(self, file_path: str, file: str, sep: str, header: int,
             error_bad_lines: bool, warn_bad_lines: bool):
    dataset_session: DatasetSession = DatasetSession("disk")
    file_location: str = ''.join([file_path, file])
    dataset_session.read_csv("../test_datasets/toy_1", "proxy", "disk", sep)
    # assign data
    self.data: DataFrame = CoreDataFrame(
        dataset_session.csv_dataset.dataframe)
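# --- hedged usage sketch (added example, not part of the original code) ---
# The two constructors above load the toy CSV fixture through a disk-backed
# DatasetSession and expose it as self.data. The class name CSVDataSource
# below is a hypothetical placeholder (the enclosing class name is not shown
# in these snippets); the arguments mirror the pandas-style parameters in the
# signature.
# reader = CSVDataSource("../test_datasets/toy_1/", "proxy.csv", ",", 0, True, True)
# print(reader.data)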
def process_data(self, data_folder: str, log_data_obj: dict):
    logging.warning("Processing Data for : " + str(data_folder))

    log_name = log_data_obj["log_name"]
    log_type = log_data_obj["type"]
    location_type = log_data_obj["location_type"]
    folder = log_data_obj["folder"]

    dataset_session = DatasetSession(log_type)

    '''
    STEP1: check for new datasets from folder directory
    '''
    # read dataset, if any new
    if log_type == "csv":
        # invoke datasetsession to read the csv
        dataset_session.read_csv(data_folder, folder, location_type)
        # load
        print("isinstance(dataset_session.dataset, Dataset): " +
              str(isinstance(dataset_session.dataset, Dataset)))
        dataset_size: Tuple = dataset_session.get_size()
        logging.warning("Dataset Session size: " + str(dataset_size))
def __init__(self, host: str, query: str):
    # pull the Elasticsearch index into the session and wrap it as a CoreDataFrame
    dataset_session: DatasetSession = DatasetSession("es")
    dataset_session.read_es_index(host, query)
    self.data: DataFrame = CoreDataFrame(
        dataset_session.es_dataset.dataframe)
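# --- hedged usage sketch (added example, not part of the original code) ---
# The constructor above pulls an Elasticsearch index into a CoreDataFrame via
# DatasetSession.read_es_index(host, query). The class name ESDataSource, the
# host URL, and the query string below are illustrative assumptions only; the
# exact query format expected by read_es_index is not shown in this file.
# es_reader = ESDataSource("http://localhost:9200", '{"query": {"match_all": {}}}')
# print(es_reader.data)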