Example 1
    def extract_users(dataset_session: DatasetSession,
                      log_metadata_obj: dict) -> List:
        # --- debug checks: fetch the dataset and verify its shape ---
        # get dataset
        log_file_dataset: Dataset = dataset_session.get_csv_dataset()
        # get the core dataframe wrapper
        log_file_core_dataframe: CoreDataFrame = log_file_dataset.get_dataframe()
        # get the underlying pandas frame (.data)
        log_file_dataframe: pd.DataFrame = log_file_core_dataframe.data
        # inspect the shape
        log_file_shape: Tuple = log_file_dataframe.shape
        logging.warning("execute(): dataframe shape: %s", log_file_shape)

        logging.info(
            "ExtractAllUsersCSV: extract_users log_file_data.columns: - %s",
            log_file_dataframe.columns)
        logging.info("ExtractAllUsersCSV: extract_users log_metadata_obj: - %s",
                     log_metadata_obj)

        # column holding the user identifier, named by the metadata object
        id_column: pd.Series = log_file_dataframe[
            log_metadata_obj["id_feature"]]
        logging.info(
            "ExtractAllUsersCSV, extract_users, id_column, len of column: %d",
            len(id_column))
        # unique user ids; fill missing values first so NaNs collapse into "NA"
        user_set: List = np.unique(
            log_file_dataframe[log_metadata_obj["id_feature"]].fillna(
                "NA")).tolist()
        logging.info(
            "ExtractAllUsersCSV, extract_users, user_set len of column: %d",
            len(user_set))
        logging.debug("extract_users: user_set: %s", user_set)
        return user_set
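
A minimal usage sketch for this extractor. The session setup, the "/data/logs" path, the "proxy" folder, the "disk" location and the "user_id" column name are all assumptions, not shown in the original snippet:

    # hypothetical invocation -- paths, folder, location and id column are assumed
    session = DatasetSession("csv")
    session.read_csv("/data/logs", "proxy", "disk", ",")
    metadata = {"id_feature": "user_id"}  # assumed id column name
    users = extract_users(session, metadata)
    print(len(users))
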
Example 2
    def process_data(self, data_folder: str,
                     log_data_obj: dict) -> DatasetSession:

        logging.warning("Processing Data for : " + str(data_folder))

        log_name = log_data_obj["log_name"]
        log_type = log_data_obj["type"]
        delimiter = log_data_obj["delimiter"]
        location_type = log_data_obj["location_type"]
        folder = log_data_obj["folder"]
        id_feature = log_data_obj["id_feature"]

        dataset_session: DatasetSession = DatasetSession(log_type)

        # read dataset, if any new
        if log_type == DataSourceFileType.CSV.value:

            # delegate to the session to read the csv
            dataset_session.read_csv(data_folder, folder, location_type,
                                     delimiter)  # load
            logging.debug("isinstance(dataset_session.dataset, Dataset): %s",
                          isinstance(dataset_session.dataset, Dataset))
            dataset_size: Tuple = dataset_session.get_size()
            logging.info("Dataset Session size: %s", dataset_size)

        elif log_type == DataSourceFileType.FLAT.value:
            pass  # flat-file loading not implemented yet
        elif log_type == DataSourceFileType.PARQUET.value:
            pass  # parquet loading not implemented yet
        elif log_type == DataSourceFileType.JSON.value:
            pass  # json loading not implemented yet

        return dataset_session
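
A hedged sketch of the log_data_obj this method expects. The keys mirror the lookups above; every value is an assumed placeholder, and processor stands in for whatever object owns process_data:

    # hypothetical configuration -- keys match the lookups in process_data
    log_data_obj = {
        "log_name": "proxy_log",   # assumed
        "type": "csv",             # assumed value behind DataSourceFileType.CSV
        "delimiter": ",",          # assumed
        "location_type": "disk",   # assumed
        "folder": "proxy",         # assumed
        "id_feature": "user_id",   # assumed
    }
    session = processor.process_data("/data/logs", log_data_obj)
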
Example 3
    def process_data(self, data_folder: str,
                     log_data_obj: dict) -> DatasetSession:

        logging.warning("Processing Data for: %s", data_folder)

        log_name = log_data_obj["log_name"]
        log_type = log_data_obj["type"]
        location_type = log_data_obj["location_type"]
        folder = log_data_obj["folder"]
        id_feature = log_data_obj["id_feature"]

        dataset_session: DatasetSession = DatasetSession(log_type)

        # read dataset, if any new
        if log_type == DataSourceFileType.CSV.value:
            # delegate to the session to read the csv
            dataset_session.read_csv(data_folder, folder,
                                     location_type)  # load
            logging.debug("isinstance(dataset_session.dataset, Dataset): %s",
                          isinstance(dataset_session.dataset, Dataset))
            dataset_size: Tuple = dataset_session.get_size()
            logging.info("Dataset Session size: %s", dataset_size)

        # the dataframe itself could be returned instead (see the commented line);
        # the session is returned so the caller controls extraction
        #return dataset_session.get_dataset().get_dataframe()
        return dataset_session
Example 4
    def __init__(self, file_path: str, sep: str, header: int,
                 error_bad_lines: bool, warn_bad_lines: bool):

        dataset_session: DatasetSession = DatasetSession("disk")

        # note: path, folder and location are hard-coded test-fixture values;
        # file_path, header, error_bad_lines and warn_bad_lines are unused here
        dataset_session.read_csv("../test_datasets/toy_1", "proxy", "disk",
                                 sep)

        self.data: CoreDataFrame = CoreDataFrame(
            dataset_session.dataset.dataframe)
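
A hedged instantiation sketch; the snippet shows only the constructor, so the class name CSVData below is hypothetical:

    # hypothetical usage -- the class name CSVData is assumed
    frame = CSVData("../test_datasets/toy_1", sep=",", header=0,
                    error_bad_lines=False, warn_bad_lines=True)
    print(frame.data)  # CoreDataFrame wrapping the loaded csv
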
Example 5
    def process_data(self, data_folder: str, log_data_obj: dict):

        logging.warning("Processing Data for: %s", data_folder)

        log_name = log_data_obj["log_name"]
        log_type = log_data_obj["type"]
        location_type = log_data_obj["location_type"]
        folder = log_data_obj["folder"]

        dataset_session = DatasetSession(log_type)

        # STEP 1: check for new datasets in the folder directory
        # read dataset, if any new
        if log_type == "csv":
            # delegate to the session to read the csv
            dataset_session.read_csv(data_folder, folder,
                                     location_type)  # load
            logging.debug("isinstance(dataset_session.dataset, Dataset): %s",
                          isinstance(dataset_session.dataset, Dataset))
            dataset_size: Tuple = dataset_session.get_size()
            logging.info("Dataset Session size: %s", dataset_size)

            # fetch the actual dataframe and inspect its shape
            logging.debug(
                "dataframe shape: %s",
                dataset_session.get_dataset().get_dataframe().data.shape)
Example 6
    def __init__(self, file_path: str, file: str, sep: str, header: int,
                 error_bad_lines: bool, warn_bad_lines: bool):

        dataset_session: DatasetSession = DatasetSession("disk")

        # full path of the target file; note it is currently unused below,
        # where a hard-coded test-fixture path is read instead
        file_location: str = ''.join([file_path, file])

        dataset_session.read_csv("../test_datasets/toy_1", "proxy", "disk",
                                 sep)

        # assign data, wrapped in the CoreDataFrame abstraction
        self.data: CoreDataFrame = CoreDataFrame(
            dataset_session.csv_dataset.dataframe)
Example 7
    def process_data(self, data_folder: str, log_data_obj: dict):

        logging.warning("Processing Data for: %s", data_folder)

        log_name = log_data_obj["log_name"]
        log_type = log_data_obj["type"]
        location_type = log_data_obj["location_type"]
        folder = log_data_obj["folder"]

        dataset_session = DatasetSession(log_type)

        # STEP 1: check for new datasets in the folder directory
        # read dataset, if any new
        if log_type == "csv":
            dataset_session.read_csv(data_folder, folder,
                                     location_type)  # load
            logging.debug("isinstance(dataset_session.dataset, Dataset): %s",
                          isinstance(dataset_session.dataset, Dataset))
            dataset_size: Tuple = dataset_session.get_size()
            logging.warning("Dataset Session size: %s", dataset_size)
Example 8
    def __init__(self, host: str, query: str):

        dataset_session: DatasetSession = DatasetSession("es")
        # read the elasticsearch index and wrap the result
        dataset_session.read_es_index(host, query)
        self.data: CoreDataFrame = CoreDataFrame(
            dataset_session.es_dataset.dataframe)
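
A hedged instantiation sketch for the Elasticsearch-backed variant; the class name ESData, the host URL and the query body are all assumed values:

    # hypothetical usage -- ESData, the host and the query are assumptions
    reader = ESData(host="http://localhost:9200",
                    query={"query": {"match_all": {}}})
    print(reader.data)  # CoreDataFrame built from the index contents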