Ejemplo n.º 1
0
    def test_empty_info(self):
        """Check that ``info()`` runs on an index that has fields but no documents.

        A throw-away index with ten float fields is created, wrapped in an
        eland DataFrame, inspected with ``info()`` and then removed again.
        """
        index_name = "empty_index"
        mapping = {
            "mappings": {
                "properties": {
                    "field_name_" + str(i): {"type": "float"} for i in range(10)
                }
            }
        }

        # Drop any leftover from an earlier run; 400/404 responses are ignored
        # so a missing index is not treated as an error.
        ES_TEST_CLIENT.indices.delete(index=index_name, ignore=[400, 404])
        ES_TEST_CLIENT.indices.create(index=index_name, body=mapping)

        try:
            ed_df = ed.DataFrame(ES_TEST_CLIENT, index_name)
            ed_df.info()
        finally:
            # Remove the index even when the DataFrame construction or info()
            # raises, so a failing test does not leave it behind.
            ES_TEST_CLIENT.indices.delete(index=index_name)
Ejemplo n.º 2
0
    def test_all_formats(self):
        """Compare each time-format column of the eland frame with pandas.

        For every entry in ``self.time_formats`` the reference timestamps in
        ``self.times`` are rendered with that format, parsed back by pandas,
        and the resulting Series is compared with the eland column of the
        same name.
        """
        ed_df = ed.DataFrame(ES_TEST_CLIENT, self.time_index_name)
        # Expected index labels: the string form of each row position.
        expected_index = [str(position) for position, _ in enumerate(self.times)]

        for format_name, time_format in self.time_formats.items():
            expected_times = []
            for raw_time in self.times:
                parsed = datetime.strptime(raw_time, "%Y-%m-%dT%H:%M:%S.%f%z")
                # Render with the column's format, then parse it back with
                # the same format.
                rendered = parsed.strftime(time_format)
                expected_times.append(pd.to_datetime(rendered, format=time_format))

            ed_series = ed_df[format_name]
            pd_series = pd.Series(
                expected_times, index=expected_index, name=format_name
            )

            assert_pandas_eland_series_equal(pd_series, ed_series)
Ejemplo n.º 3
0
    def test_end_to_end(self):
        """Run the complete workflow once and sanity-check the metrics index."""
        num_documents, num_users, max_queries = 100, 10, 5

        def index_event(event):
            # Index one generated event, using resources.INDEX as both the
            # target index and the pipeline name.
            return ES_TEST_CLIENT.index(
                index=resources.INDEX, pipeline=resources.INDEX, body=event
            )

        resources.prepare(ES_TEST_CLIENT)
        simulate.generate_events(
            num_documents,
            num_users,
            max_queries,
            index_event,
            with_progress=True,
        )
        ES_TEST_CLIENT.indices.refresh(resources.INDEX)
        resources.start_transforms(ES_TEST_CLIENT, resources.TRANSFORM_NAMES)

        # Getting this far without an exception is already a useful
        # integration check of the steps above.
        count_response = ES_TEST_CLIENT.count(index=METRICS_INDEX)
        index_size = count_response['count']
        self.assertGreaterEqual(index_size, num_users)
        self.assertLessEqual(index_size, num_users * max_queries)

        # Invariant checks on aggregate statistics of the click metrics:
        # when something breaks, these statistics drop to 0.
        metrics_df = ed.DataFrame(
            es_client=ES_TEST_CLIENT, es_index_pattern=METRICS_INDEX
        )
        click_columns = [
            name for name in metrics_df.columns
            if name.startswith('metrics.clicks.')
        ]
        summary = metrics_df[click_columns].describe()
        non_zero_properties = summary.loc[['count', 'mean', 'std', 'max']]
        contains_zero = non_zero_properties.eq(0.0).any().any()
        self.assertFalse(contains_zero)
Ejemplo n.º 4
0
    def test_init(self):
        """Exercise the rejected and accepted ``ed.DataFrame`` constructor forms."""
        # No arguments, client only, or index pattern only: each must raise
        # ValueError.
        incomplete_kwargs = (
            {},
            {"es_client": ES_TEST_CLIENT},
            {"es_index_pattern": FLIGHTS_INDEX_NAME},
        )
        for kwargs in incomplete_kwargs:
            with pytest.raises(ValueError):
                ed.DataFrame(**kwargs)

        # Accepted: client plus index pattern, positionally and by keyword.
        ed.DataFrame(ES_TEST_CLIENT, FLIGHTS_INDEX_NAME)
        ed.DataFrame(es_client=ES_TEST_CLIENT, es_index_pattern=FLIGHTS_INDEX_NAME)

        # Accepted: an already constructed query compiler.
        compiler = QueryCompiler(
            client=ES_TEST_CLIENT, index_pattern=FLIGHTS_INDEX_NAME
        )
        ed.DataFrame(_query_compiler=compiler)
Ejemplo n.º 5
0
# We need to connect to Elasticsearch first; for that, refer to the reference link provided in README.md for an explanation.

import elasticsearch
import eland as ed
from elasticsearch import Elasticsearch

# Client for the cluster; replace the placeholder host name and credentials
# with your own.
es = Elasticsearch(
    ['host_server_name'],
    http_auth=('YOUR_USERNAME', 'YOUR_PASSWORD'),
    scheme="https",
    port=443,
)

# Check that the cluster responds. The result is not stored; it is only
# displayed when this runs in an interactive session or notebook.
es.ping()

# Print the names of all available indices on Elasticsearch
rec = es.indices.get_alias("*")
for Name in rec:
    print(Name)

# Choose the index pattern whose data you are interested in
df = ed.DataFrame(es, es_index_pattern="NAME_OF_INDEX_PATTERN")

# Bare expression: displays the eland DataFrame in an interactive session or
# notebook; it produces no output when the file is run as a plain script.
df
Ejemplo n.º 6
0
# Absolute path of the directory that contains this file.
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

# Create pandas and eland data frames
from tests import (
    ECOMMERCE_DF_FILE_NAME,
    ECOMMERCE_INDEX_NAME,
    ES_TEST_CLIENT,
    FLIGHTS_DF_FILE_NAME,
    FLIGHTS_INDEX_NAME,
    FLIGHTS_SMALL_INDEX_NAME,
)

# Flights: pandas frame read from the JSON file, eland frame backed by the
# Elasticsearch index.
_pd_flights = pd.read_json(FLIGHTS_DF_FILE_NAME).sort_index()
_pd_flights["timestamp"] = pd.to_datetime(_pd_flights["timestamp"])
_pd_flights.index = _pd_flights.index.map(str)  # make index 'object' not int
_ed_flights = ed.DataFrame(ES_TEST_CLIENT, FLIGHTS_INDEX_NAME)

# Small variant: the first 48 rows of the pandas frame, paired with the
# small flights index.
_pd_flights_small = _pd_flights.head(48)
_ed_flights_small = ed.DataFrame(ES_TEST_CLIENT, FLIGHTS_SMALL_INDEX_NAME)

# E-commerce: same pairing of a pandas frame and an eland frame.
_pd_ecommerce = pd.read_json(ECOMMERCE_DF_FILE_NAME).sort_index()
_pd_ecommerce["order_date"] = pd.to_datetime(_pd_ecommerce["order_date"])
_pd_ecommerce["products.created_on"] = _pd_ecommerce[
    "products.created_on"].apply(pd.to_datetime)
# Add an all-None "customer_birth_date" column at position 2.
_pd_ecommerce.insert(2, "customer_birth_date", None)
_pd_ecommerce.index = _pd_ecommerce.index.map(
    str)  # make index 'object' not int
# The dtype carries an explicit unit because pandas 2.x raises TypeError for
# the unit-less "datetime64".
# NOTE(review): astype() returns a new Series and the result is discarded, so
# the stored column keeps its dtype. Assigning it back would change the dtype
# seen by code outside this view -- confirm before doing so.
_pd_ecommerce["customer_birth_date"].astype("datetime64[ns]")
_ed_ecommerce = ed.DataFrame(ES_TEST_CLIENT, ECOMMERCE_INDEX_NAME)

import click
from datetime import time
from pathlib import Path
from elasticsearch import Elasticsearch
import typing
import eland
import pandas as pd
import geopandas as gpd


# Elasticsearch client created without arguments, i.e. with the library's
# default connection settings.
es = Elasticsearch()
# eland DataFrame over the "sd_geo" index pattern.
geodf = eland.DataFrame(es, "sd_geo")


def get_closest_lat_long(point, threshold=100):
    q = {
        "size": 1,
        "query": {
            "bool": {
                "must": [
                    {
                        "wildcard": {
                            "street.keyword": f"{point.address_road_primary}*",
                            },
                        },
                    ],
                "filter": {
                    "range": {
                        "number": {
                            "gte": point.address_number_primary - threshold,
                            "lte": point.address_number_primary + threshold,
Ejemplo n.º 8
0
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# Authors: Madeline McCombe <*****@*****.**>

##############################################################################
# import packages
import eland as ed

##############################################################################

# change these to match your ELK IP/index name
ip_address = '127.0.0.1'
index_name = 'gtp-*'


# accessing index in lab ELK stack
ed_df = ed.DataFrame(ip_address, index_name)

# pull info about the index (like memory usage)
# info() writes its report itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
ed_df.info()
print()

# find the number of entries in the index
print('There are', ed_df.shape[0], 'entries in this index.\n')

# See the most common event codes
print('The top 10 event_code/count pairs are: \n', ed_df['event_code'].value_counts(), '\n')

Ejemplo n.º 9
0
import elasticsearch
import eland as ed
from elasticsearch import Elasticsearch

# Connecting to a remote Elastic Cloud instance; replace the placeholder host
# name and credentials with your own.
es = Elasticsearch(
    ['host_server_name'],
    http_auth=('YOUR_USERNAME', 'YOUR_PASSWORD'),
    scheme="https",
    port=443,
)

# You can also use this one-liner
#es = Elasticsearch(['https://*****:*****@HOST_SERVER_NAME:PORT'])

# Create an eland DataFrame over the chosen index pattern
df = ed.DataFrame(es, es_index_pattern="NAME_OF_INDEX_PATTERN")

# Check whether Elasticsearch responds. The result is not stored; it is only
# displayed in an interactive session or notebook.
es.ping()

# Bare expression: displays the DataFrame in an interactive session or notebook
df

# Connecting to an Elasticsearch instance on localhost: here the host string
# is passed in place of a client object.
df = ed.DataFrame("localhost:9200", es_index_pattern="NAME_OF_INDEX_PATTERN")
df
Ejemplo n.º 10
0
# Refer to the README.md file for detailed code and execution steps
import elasticsearch
import eland as ed
from elasticsearch import Elasticsearch

# Client for the cluster; replace the placeholder host name and credentials
# with your own.
es = Elasticsearch(['host_server_name'],
                   http_auth=('YOUR_USERNAME', 'YOUR_PASSWORD'),
                   scheme="https",
                   port=443)

# Index names must be lowercase: Elasticsearch answers 400 for a name that
# contains capitals, and ignore=400 below would hide that failure.
INDEX_NAME = "my_first_index"

# Create the index; ignore=400 tolerates the error returned when it already
# exists.
es.indices.create(index=INDEX_NAME, ignore=400)

# Fetch the index that was just created
df = ed.DataFrame(es, es_index_pattern=INDEX_NAME)
# Bare expression: displays the DataFrame in an interactive session or notebook
df

# Delete the index; 404 is ignored as well so a missing index is not an error.
es.indices.delete(index=INDEX_NAME, ignore=[400, 404])