def load_data(self, path, dataset_name='', batch_start_dt=None, batch_end_dt=None):
    """
    Read an Elasticsearch index and restrict it to the batch date range.

    :param path: String path to index in es
    :param dataset_name: String name of the dataset. 'Incident' and 'Change'
        select the date column and the lower bound of the range filter; the
        name is also used for logging
    :param batch_start_dt: Start of the batch; falls back to
        ``self.batch_start_dt`` when falsy
    :param batch_end_dt: End of the batch; falls back to
        ``self.batch_end_dt`` when falsy
    :return: Tuple ``(df, dt_col)`` - the frame returned by ``ed.read_es``
        with the range query applied, and the name of the date column that
        was filtered on
    """
    self.log.info('Loading {} ids from {}'.format(dataset_name, path))

    if not batch_start_dt:
        batch_start_dt = self.batch_start_dt
    if not batch_end_dt:
        batch_end_dt = self.batch_end_dt

    # Set start of batch date filter.
    # NOTE(review): any other dataset_name (including the default '') leaves
    # both values empty, so the range query below targets an empty field name.
    start_dt = ''
    dt_col = ''
    if dataset_name == 'Incident':
        start_dt = batch_start_dt
        dt_col = self.inc_dt_column
    elif dataset_name == 'Change':
        # Date-math expression: batch start minus ``window_size`` days.
        start_dt = "{}||-{}d".format(batch_start_dt, self.window_size)
        dt_col = self.chg_dt_column

    df = ed.read_es(self.es, path)
    df = df.es_query({
        "range": {
            dt_col: {
                "gte": start_dt,
                "lte": batch_end_dt
            }
        }
    })
    return (df, dt_col)
def test_all_formats(self):
    """Compare every date-format column of the time index with pandas.

    For each entry in ``self.time_formats`` the reference timestamps in
    ``self.times`` are re-rendered with that format, parsed back by pandas,
    and the resulting Series is compared with the matching column read
    through ``ed.read_es``.
    """
    ed_df = ed.read_es(ES_TEST_CLIENT, self.time_index_name)

    # The expected index is the row position rendered as a string.
    expected_index = list(map(str, range(len(self.times))))

    for format_name, time_format in self.time_formats.items():
        expected_times = []
        for raw in self.times:
            parsed = datetime.strptime(raw, "%Y-%m-%dT%H:%M:%S.%f%z")
            rendered = parsed.strftime(time_format)
            expected_times.append(pd.to_datetime(rendered, format=time_format))

        ed_series = ed_df[format_name]
        pd_series = pd.Series(
            expected_times, index=expected_index, name=format_name)

        assert_pandas_eland_series_equal(pd_series, ed_series)
# Directory containing this file.
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

# Test client plus the fixture file names and index names used below.
from eland.tests import (
    ES_TEST_CLIENT,
    FLIGHTS_DF_FILE_NAME,
    FLIGHTS_INDEX_NAME,
    FLIGHTS_SMALL_INDEX_NAME,
    ECOMMERCE_DF_FILE_NAME,
    ECOMMERCE_INDEX_NAME,
)

# Create pandas and eland data frames
_pd_flights = pd.read_json(FLIGHTS_DF_FILE_NAME).sort_index()
_pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"])
_pd_flights.index = _pd_flights.index.map(str)  # make index 'object' not int
_ed_flights = ed.read_es(ES_TEST_CLIENT, FLIGHTS_INDEX_NAME)

# The small pandas frame is the first 48 rows of the full flights frame.
_pd_flights_small = _pd_flights.head(48)
_ed_flights_small = ed.read_es(ES_TEST_CLIENT, FLIGHTS_SMALL_INDEX_NAME)

_pd_ecommerce = pd.read_json(ECOMMERCE_DF_FILE_NAME).sort_index()
_pd_ecommerce["order_date"] = pd.to_datetime(_pd_ecommerce["order_date"])
_pd_ecommerce["products.created_on"] = _pd_ecommerce[
    "products.created_on"].apply(pd.to_datetime)
# Add an all-None "customer_birth_date" column at position 2.
_pd_ecommerce.insert(2, "customer_birth_date", None)
_pd_ecommerce.index = _pd_ecommerce.index.map(
    str)  # make index 'object' not int
# astype() returns a new Series rather than converting in place, so the result
# has to be assigned back for the column to actually become datetime-typed.
# The explicit [ns] unit is used because recent pandas releases reject the
# unit-less "datetime64" spelling.
_pd_ecommerce["customer_birth_date"] = _pd_ecommerce[
    "customer_birth_date"].astype("datetime64[ns]")
_ed_ecommerce = ed.read_es(ES_TEST_CLIENT, ECOMMERCE_INDEX_NAME)