Beispiel #1
0
    def load_data(self, path, dataset_name='', batch_start_dt=None, batch_end_dt=None):
        """
        Build a date-filtered data frame over an Elasticsearch index.

        The index is opened with ``ed.read_es`` and restricted with a
        ``range`` query on the dataset's date column.

        :param path: String path to index in es
        :param dataset_name: String name of the dataset. 'Incident' filters
            on ``self.inc_dt_column`` starting at the batch start; 'Change'
            filters on ``self.chg_dt_column`` starting ``self.window_size``
            days before the batch start. Any other value leaves both the
            column name and the lower bound as empty strings.
        :param batch_start_dt: Start of the batch window; when falsy,
            ``self.batch_start_dt`` is used instead
        :param batch_end_dt: End of the batch window (inclusive upper bound
            of the filter); when falsy, ``self.batch_end_dt`` is used instead
        :return: Tuple ``(df, dt_col)``: the filtered frame returned by
            ``es_query`` and the name of the date column used in the filter
        """
        self.log.info('Loading {} ids from {}'.format(dataset_name, path))

        # Fall back to the instance-level batch window for any bound that
        # was not supplied (same falsy check as before: None, '', 0, ...).
        batch_start_dt = batch_start_dt or self.batch_start_dt
        batch_end_dt = batch_end_dt or self.batch_end_dt

        # Defaults used when dataset_name is neither 'Incident' nor 'Change'.
        # NOTE(review): the range filter is then built on an empty field
        # name - confirm whether callers rely on this or it should raise.
        start_dt = ''
        dt_col = ''

        if dataset_name == 'Incident':
            start_dt = batch_start_dt
            dt_col = self.inc_dt_column
        elif dataset_name == 'Change':
            # Elasticsearch date math: "<anchor>||-<N>d" moves the lower
            # bound N days before the anchor date.
            start_dt = "{}||-{}d".format(batch_start_dt, self.window_size)
            dt_col = self.chg_dt_column

        date_filter = {
            "range": {
                dt_col: {
                    "gte": start_dt,
                    "lte": batch_end_dt,
                },
            },
        }
        df = ed.read_es(self.es, path).es_query(date_filter)

        return (df, dt_col)
Beispiel #2
0
    def test_all_formats(self):
        """Check every date-format column of the time index against pandas.

        For each ``(column, format)`` pair in ``self.time_formats`` the
        timestamps in ``self.times`` are parsed, re-rendered with that
        format and parsed back by pandas using the same format. The
        resulting series must equal the column of the same name read
        through ``ed.read_es``.
        """
        eland_frame = ed.read_es(ES_TEST_CLIENT, self.time_index_name)

        for column, fmt in self.time_formats.items():
            expected_values = []
            for raw in self.times:
                # Round-trip through the target format so the expected
                # value only keeps what that format can represent.
                parsed = datetime.strptime(raw, "%Y-%m-%dT%H:%M:%S.%f%z")
                rendered = parsed.strftime(fmt)
                expected_values.append(pd.to_datetime(rendered, format=fmt))

            actual = eland_frame[column]

            # Index labels are the row positions rendered as strings.
            labels = list(map(str, range(len(self.times))))
            expected = pd.Series(expected_values, index=labels, name=column)

            assert_pandas_eland_series_equal(expected, actual)
Beispiel #3
0
# Absolute path of the directory containing this module.
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

# Create pandas and eland data frames
from eland.tests import (
    ES_TEST_CLIENT,
    FLIGHTS_DF_FILE_NAME,
    FLIGHTS_INDEX_NAME,
    FLIGHTS_SMALL_INDEX_NAME,
    ECOMMERCE_DF_FILE_NAME,
    ECOMMERCE_INDEX_NAME,
)

# Flights: pandas frame loaded from the JSON file, plus a frame read from
# the flights index through ed.read_es.
_pd_flights = pd.read_json(FLIGHTS_DF_FILE_NAME).sort_index()
_pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"])
_pd_flights.index = _pd_flights.index.map(str)  # make index 'object' not int
_ed_flights = ed.read_es(ES_TEST_CLIENT, FLIGHTS_INDEX_NAME)

# Small flights variant: the first 48 rows of the pandas frame.
# NOTE(review): presumably the small index holds those same 48 documents -
# confirm against the index setup.
_pd_flights_small = _pd_flights.head(48)
_ed_flights_small = ed.read_es(ES_TEST_CLIENT, FLIGHTS_SMALL_INDEX_NAME)

# Ecommerce: same pattern as flights, with extra date handling.
_pd_ecommerce = pd.read_json(ECOMMERCE_DF_FILE_NAME).sort_index()
_pd_ecommerce["order_date"] = pd.to_datetime(_pd_ecommerce["order_date"])
# Converted element by element via apply rather than on the whole column.
_pd_ecommerce["products.created_on"] = _pd_ecommerce[
    "products.created_on"].apply(lambda x: pd.to_datetime(x))
# Add an all-None "customer_birth_date" column at position 2.
_pd_ecommerce.insert(2, "customer_birth_date", None)
_pd_ecommerce.index = _pd_ecommerce.index.map(
    str)  # make index 'object' not int
# NOTE(review): astype returns a new Series and the result is discarded
# here, so the column's dtype is not changed by this line. Confirm whether
# an assignment back to the column was intended before changing it, since
# dependent tests may be calibrated against the current dtype.
_pd_ecommerce["customer_birth_date"].astype("datetime64")
_ed_ecommerce = ed.read_es(ES_TEST_CLIENT, ECOMMERCE_INDEX_NAME)