def test_empty_info(self):
    """Smoke-test DataFrame.info() against an index with no documents.

    Creates an index whose mapping declares ten float fields but holds no
    documents, builds an eland DataFrame over it, and verifies that
    info() runs without raising.
    """
    # Build the mapping directly with a dict comprehension instead of a
    # loop-and-assign; range(10) instead of the redundant range(0, 10).
    mapping = {
        "mappings": {
            "properties": {
                f"field_name_{i}": {"type": "float"} for i in range(10)
            }
        }
    }

    # ignore=[400, 404]: the index may not exist (404) or the request may
    # otherwise be rejected (400) — either way a stale index is cleared.
    ES_TEST_CLIENT.indices.delete(index="empty_index", ignore=[400, 404])
    ES_TEST_CLIENT.indices.create(index="empty_index", body=mapping)

    ed_df = ed.DataFrame(ES_TEST_CLIENT, "empty_index")
    ed_df.info()

    # Clean up the temporary index.
    ES_TEST_CLIENT.indices.delete(index="empty_index")
def test_all_formats(self):
    """Check that every configured time format round-trips through eland.

    For each named format in ``self.time_formats``, re-render the fixture
    timestamps in that format, parse them back with pandas, and assert the
    eland series for that column equals the expected pandas series.
    """
    ed_df = ed.DataFrame(ES_TEST_CLIENT, self.time_index_name)

    # Hoisted out of the loop: the string index is the same every pass.
    str_index = [str(i) for i in range(len(self.times))]

    # Iterate items() instead of keys() + repeated dict lookups.
    for format_name, fmt in self.time_formats.items():
        times = [
            pd.to_datetime(
                datetime.strptime(dt, "%Y-%m-%dT%H:%M:%S.%f%z").strftime(fmt),
                format=fmt,
            )
            for dt in self.times
        ]

        pd_series = pd.Series(times, index=str_index, name=format_name)
        assert_pandas_eland_series_equal(pd_series, ed_df[format_name])
def test_end_to_end(self):
    """Tests an end-to-end workflow with all components."""
    doc_count = 100
    user_count = 10
    query_cap = 5

    resources.prepare(ES_TEST_CLIENT)

    def _ingest(event):
        # Route each simulated event through the ingest pipeline.
        return ES_TEST_CLIENT.index(
            index=resources.INDEX, pipeline=resources.INDEX, body=event)

    simulate.generate_events(
        doc_count, user_count, query_cap, _ingest, with_progress=True)
    ES_TEST_CLIENT.indices.refresh(resources.INDEX)
    resources.start_transforms(ES_TEST_CLIENT, resources.TRANSFORM_NAMES)

    # if any of the mechanics above fail, we won't reach this point, which
    # is a good integration test in-and-of-itself
    index_size = ES_TEST_CLIENT.count(index=METRICS_INDEX)['count']
    self.assertGreaterEqual(index_size, user_count)
    self.assertLessEqual(index_size, user_count * query_cap)

    # make some invariant assertions based on aggregate statistics of the data
    # when things break, these statistics go to 0
    metrics_df = ed.DataFrame(
        es_client=ES_TEST_CLIENT, es_index_pattern=METRICS_INDEX)
    click_cols = [
        col for col in metrics_df.columns
        if col.startswith('metrics.clicks.')
    ]
    stats = metrics_df[click_cols].describe().loc[
        ['count', 'mean', 'std', 'max']]
    self.assertFalse(stats.eq(0.0).any().any())
def test_init(self):
    """Exercise DataFrame construction.

    Incomplete argument combinations must raise ValueError; complete
    ones must build successfully.
    """
    # Neither client nor index pattern supplied.
    with pytest.raises(ValueError):
        ed.DataFrame()

    # Client supplied without an index pattern.
    with pytest.raises(ValueError):
        ed.DataFrame(es_client=ES_TEST_CLIENT)

    # Index pattern supplied without a client.
    with pytest.raises(ValueError):
        ed.DataFrame(es_index_pattern=FLIGHTS_INDEX_NAME)

    # Valid constructions: positional, keyword, and via a pre-built
    # query compiler.
    ed.DataFrame(ES_TEST_CLIENT, FLIGHTS_INDEX_NAME)
    ed.DataFrame(es_client=ES_TEST_CLIENT,
                 es_index_pattern=FLIGHTS_INDEX_NAME)
    ed.DataFrame(_query_compiler=QueryCompiler(
        client=ES_TEST_CLIENT, index_pattern=FLIGHTS_INDEX_NAME))
# We need to connect to our Elasticsearch first; for that, refer to the
# reference link provided in the README.md for an explanation.
import elasticsearch
import eland as ed
from elasticsearch import Elasticsearch

es = Elasticsearch(
    ['host_server_name'],
    http_auth=('YOUR_USERNAME', 'YOUR_PASSWORD'),
    scheme="https",
    port=443,
)
# Verify the cluster is reachable.
es.ping()

# Get the names of all available indices on Elasticsearch as output.
# (get_alias("*") returns a dict keyed by index name.)
for name in es.indices.get_alias("*"):
    print(name)

# Choose the index whose data you are interested in.
df = ed.DataFrame(es, es_index_pattern="NAME_OF_INDEX_PATTERN")

# Print the data in the chosen index in the form of an eland dataframe.
df
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

# Create pandas and eland data frames
from tests import (
    ECOMMERCE_DF_FILE_NAME,
    ECOMMERCE_INDEX_NAME,
    ES_TEST_CLIENT,
    FLIGHTS_DF_FILE_NAME,
    FLIGHTS_INDEX_NAME,
    FLIGHTS_SMALL_INDEX_NAME,
)

_pd_flights = pd.read_json(FLIGHTS_DF_FILE_NAME).sort_index()
_pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"])
_pd_flights.index = _pd_flights.index.map(str)  # make index 'object' not int
_ed_flights = ed.DataFrame(ES_TEST_CLIENT, FLIGHTS_INDEX_NAME)

_pd_flights_small = _pd_flights.head(48)
_ed_flights_small = ed.DataFrame(ES_TEST_CLIENT, FLIGHTS_SMALL_INDEX_NAME)

_pd_ecommerce = pd.read_json(ECOMMERCE_DF_FILE_NAME).sort_index()
_pd_ecommerce["order_date"] = pd.to_datetime(_pd_ecommerce["order_date"])
_pd_ecommerce["products.created_on"] = _pd_ecommerce[
    "products.created_on"].apply(lambda x: pd.to_datetime(x))
_pd_ecommerce.insert(2, "customer_birth_date", None)
_pd_ecommerce.index = _pd_ecommerce.index.map(
    str)  # make index 'object' not int
# BUG FIX: Series.astype() returns a *new* Series; the original call
# discarded the result, leaving the column as object dtype. Assign the
# result back, with an explicit [ns] unit for forward compatibility.
_pd_ecommerce["customer_birth_date"] = _pd_ecommerce[
    "customer_birth_date"].astype("datetime64[ns]")
_ed_ecommerce = ed.DataFrame(ES_TEST_CLIENT, ECOMMERCE_INDEX_NAME)
import click from datetime import time from pathlib import Path from elasticsearch import Elasticsearch import typing import eland import pandas as pd import geopandas as gpd es = Elasticsearch() geodf = eland.DataFrame(es, "sd_geo") def get_closest_lat_long(point, threshold=100): q = { "size": 1, "query": { "bool": { "must": [ { "wildcard": { "street.keyword": f"{point.address_road_primary}*", }, }, ], "filter": { "range": { "number": { "gte": point.address_number_primary - threshold, "lte": point.address_number_primary + threshold,
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Authors: Madeline McCombe <*****@*****.**>
##############################################################################
# import packages
import eland as ed

##############################################################################
# change these to match your ELK IP/index name
ip_address = '127.0.0.1'
index_name = 'gtp-*'

# accessing index in lab ELK stack
events = ed.DataFrame(ip_address, index_name)

# pull info about the index (like memory usage)
print(events.info(), '\n')

# find the number of entries in the index
entry_count = events.shape[0]
print('There are', entry_count, 'entries in this index.\n')

# See the most common event codes
top_codes = events['event_code'].value_counts()
print('The top 10 event_code/count pairs are: \n', top_codes, '\n')
import elasticsearch
import eland as ed
from elasticsearch import Elasticsearch

# Connect to a remote Elastic Cloud instance.
es = Elasticsearch(
    ['host_server_name'],
    http_auth=('YOUR_USERNAME', 'YOUR_PASSWORD'),
    scheme="https",
    port=443,
)
# Equivalent one-liner form:
# es = Elasticsearch(['https://*****:*****@HOST_SERVER_NAME:PORT'])

# Load the index data into an eland dataframe.
df = ed.DataFrame(es, es_index_pattern="NAME_OF_INDEX_PATTERN")

# Confirm that Elasticsearch responds.
es.ping()
df

# For connecting to Elasticsearch on your localhost.
df = ed.DataFrame("localhost:9200", es_index_pattern="NAME_OF_INDEX_PATTERN")
df
# Refer to README.md file for detailed code and execution steps.
import elasticsearch
import eland as ed
from elasticsearch import Elasticsearch

es = Elasticsearch(
    ['host_server_name'],
    http_auth=('YOUR_USERNAME', 'YOUR_PASSWORD'),
    scheme="https",
    port=443,
)

# Create an index in Elasticsearch.
# BUG FIX: Elasticsearch index names must be lowercase; the original
# "My_First_Index" was rejected with a 400 error that ignore=400 then
# silently swallowed, so the index was never created.
es.indices.create(index="my_first_index", ignore=400)  # 400: already exists

# Check or fetch the created index.
df = ed.DataFrame(es, es_index_pattern="mydata")
df

# Delete the index; also ignore 404 in case it does not exist.
es.indices.delete(index="my_first_index", ignore=[400, 404])