Example 1
import logging
import time

from espandas import Espandas

logger = logging.getLogger(__name__)


def saveElasticS(df,
                 esindex_name='transform_index',
                 estype_name='transform_type'):
    logger.info("Saving main Transform model to Elasticsearch...")
    start_total_time = time.time()

    elastic_host = "https://9db53c7bb4f5be2d856033a9aeb6e5a5.us-central1.gcp.cloud.es.io"
    elastic_username = "******"
    elastic_port = 9243
    elastic_password = "******"

    INDEX = esindex_name
    TYPE = estype_name
    df['indexId'] = (df.index + 100).astype(str)
    esp = Espandas(hosts=[elastic_host],
                   port=elastic_port,
                   http_auth=(elastic_username, elastic_password))
    logger.info("Bulk insert into ElasticSearch, chunksize=%d, time_out: %d" %
                (20000, 60))
    logger.info("ElasticSearch host: %s", elastic_host)
    logger.info("ElasticSearch port: %s", elastic_port)
    logger.info(esp.es_write(df, INDEX, TYPE, chunksize=20000, rto=60))

    end_total_time = time.time() - start_total_time
    logger.info(
        'Finished bulk insert to Elasticsearch, time taken: %.5f s, total entities: %d'
        % (end_total_time, len(df)))
    return
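A minimal usage sketch for the function above; the DataFrame contents and column names are illustrative assumptions (the real credentials are redacted in the source):

import pandas as pd

# Hypothetical input: any DataFrame works, since saveElasticS() derives
# the 'indexId' column that espandas uses as the unique document id.
df = pd.DataFrame({'topic_name': ['politics', 'sports'], 'num': [120, 85]})
saveElasticS(df, esindex_name='transform_index', estype_name='transform_type')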
Example 2
def import_enriched_data(data, type, index=State.tenant):
    elastic_hosts = [
        "http://{ip}:{port}/".format(ip=State.ELASTIC_IP,
                                     port=State.ELASTIC_PORT)
    ]
    esp = Espandas(hosts=elastic_hosts, verify_certs=True)
    esp.es_write(data, index, type)
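A hedged usage sketch, assuming State holds a reachable Elasticsearch address and the DataFrame already carries the indexId column that espandas expects by default:

import pandas as pd

# Hypothetical data and index/type names.
data = pd.DataFrame({'indexId': ['1', '2'], 'score': [0.9, 0.4]})
import_enriched_data(data, type='enriched_type', index='enriched_index')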
Example 3
                              click_topic_is_general as is_general,
                              click_topic_count as num,
                              click_topic_name as topic_name
                            FROM `kumparan-data.topic_recommender.click_distribution_hourly` CDH
                          """

    if G <= 0:
        logger.error("G cannot be smaller than or equal to 0...")
        sys.exit()

    # ~~~ create elastic index for fallback data ~~~
    logger.info("ElasticSearch host: %s", elastic_host)
    logger.info("ElasticSearch port: %s", elastic_port)

    es = Elasticsearch([elastic_host], port=elastic_port)
    esp = Espandas(hosts=[elastic_host], port=elastic_port)

    logger.info("Checking topicrecommendation_transform_fallback_index availability...")
    index_name = "topicrecommendation_transform_fallback_index"
    is_index_exist = es.indices.exists(index=index_name)

    request_body_fb = {
        "settings" : {
            "number_of_shards": 2,
            "number_of_replicas": 1
        },
        "mappings" : {
            "topicrecommendation_transform_fallback_type" : {
                "properties" : {
                    "topid": {"type": "keyword" },
                    "topic_name": {"type": "text" },
Example 4
def pd_elastic(df):
    # espandas uses the 'indexId' column as the unique document id
    df['indexId'] = df.index.astype(str)
    index_name = input('index name: ')
    type_name = input('type name: ')
    esp = Espandas()
    esp.es_write(df, index_name, type_name)
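Usage is interactive, since pd_elastic() prompts for the index and type names; a sketch with an illustrative DataFrame:

import pandas as pd

# Hypothetical frame; index and type names are typed at the prompt.
frame = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
pd_elastic(frame)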
Example 5
def test_es_client():
    """
    Insert a DataFrame and test that it is correctly extracted
    """
    # Only run this test if the index does not already exist
    # and can be created and deleted
    if test_es():
        try:
            print('Connection to ElasticSearch established: testing write and read.')
            es = Elasticsearch()
            es.indices.create(INDEX)

            esp = Espandas()
            esp.es_write(df, INDEX, TYPE)
            k = list(df['indexId'].astype('str'))
            res = esp.es_read(k, INDEX, TYPE)

            # The returned DataFrame should match the original
            assert res.shape == df.shape
            assert np.all(res.index == df.index)
            assert np.all(res.columns == df.columns)
            assert np.all(res == df)

            # Bogus keys should not match anything
            res = esp.es_read(['bar'], INDEX, TYPE)
            assert res is None
            num_sample = 3
            present = list(df.sample(num_sample)['indexId'].astype('str'))
            present.append('bar')
            res = esp.es_read(present, INDEX, TYPE)
            assert res.shape[0] == num_sample

            # Test for invalid inputs
            # Input must be a DataFrame
            with pytest.raises(ValueError):
                esp.es_write('foobar', INDEX, TYPE)
            # uid_name must exist in the DataFrame
            with pytest.raises(ValueError):
                esp.es_write(df, INDEX, TYPE, uid_name='foo_index')

            # Values in uid_name must be unique
            df2 = df.copy()
            # .ix was removed in pandas; use label-based .loc instead
            df2.loc[df2.index[0], 'indexId'] = df.loc[df.index[1], 'indexId']
            with pytest.raises(ValueError):
                esp.es_write(df2, INDEX, TYPE)
        finally:
            # Cleanup
            es.indices.delete(INDEX)
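The test relies on module-level fixtures (df, INDEX, TYPE, test_es) defined elsewhere in the test module; a plausible minimal setup, with every name and shape an assumption rather than the project's actual fixture:

import numpy as np
import pandas as pd

# Hypothetical fixtures mirroring what test_es_client() expects.
INDEX = 'espandas_test_index'
TYPE = 'espandas_test_type'
df = pd.DataFrame(np.random.rand(100, 3), columns=['a', 'b', 'c'])
df['indexId'] = (df.index + 100).astype(str)  # unique string ids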
Example 6
from elasticsearch import Elasticsearch
from espandas import Espandas


def reset_es_index(host_url,
                   port,
                   index_type='risky_ids',
                   index_name='most_recent_risky_ids'):
    # request body: basic index settings
    settings = {"settings": {"number_of_shards": 1, "number_of_replicas": 0}}
    # candidate field mappings, keyed by index_type
    maps = {}
    maps['risky_ids'] = {
        "properties": {
            'mobileId': {
                "type": "keyword"
            },
            'earliestContactTime': {
                "type": "date"
            },
            "status": {
                "type": "integer"
            },
        }
    }
    maps['new_risky_ids'] = {
        "properties": {
            'mobileId': {
                "type": "keyword"
            },
            'updateAt': {
                "type": "date"
            },
            "oldStatus": {
                "type": "integer"
            },
            "newStatus": {
                "type": "integer"
            }
        }
    }
    maps['red_zones'] = {
        "properties": {
            "mobileId": {
                "type": "keyword"
            },
            "maxTime": {
                "type": "date"
            },
            "minTime": {
                "type": "date"
            },
            "location": {
                "type": "geo_point"
            },
            "delta": {
                "type": "integer"
            }
        }
    }

    # connect to elastic search
    es = Elasticsearch([host_url + ':' + port], timeout=600)
    # Recreate the index if it already exists, then apply the mapping
    if es.indices.exists(index_name):
        es.indices.delete(index=index_name)
    es.indices.create(index=index_name, body=settings)
    es.indices.put_mapping(index=index_name, body=maps[index_type])
    return Espandas(es)
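A hedged usage sketch; host and port are placeholders, and it assumes this project's Espandas build accepts a ready Elasticsearch client as shown on the return line above:

# Hypothetical call: rebuild the risky-ids index on a local cluster.
esp = reset_es_index('http://localhost', '9200',
                     index_type='risky_ids',
                     index_name='most_recent_risky_ids')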
Example 7
import logging
import sys

import pandas as pd
from elasticsearch import Elasticsearch
from espandas import Espandas
from memory_profiler import LogFile

logger = logging.getLogger(__name__)
sys.stdout = LogFile('memory_profile6_log', reportIncrementFlag=False)
# ~~~~~~~~~~~~~


elastic_host = "https://9db53c7bb4f5be2d856033a9aeb6e5a5.us-central1.gcp.cloud.es.io"
elastic_username = "******"
elastic_port = 9243
elastic_password = "******"

logger.info("ElasticSearch host: %s", elastic_host)
logger.info("ElasticSearch port: %s", elastic_port)
es = Elasticsearch([elastic_host], port=elastic_port,
                   http_auth=(elastic_username, elastic_password))
esp = Espandas(hosts=[elastic_host], port=elastic_port,
               http_auth=(elastic_username, elastic_password))

pd.set_option('display.width', 1000)

def humanbytes(B):
    'Return the given bytes as a human friendly KB, MB, GB, or TB string'
    B = float(int(B))
    KB = float(1024)
    MB = float(KB ** 2) # 1,048,576
    GB = float(KB ** 3) # 1,073,741,824
    TB = float(KB ** 4) # 1,099,511,627,776
    if B < KB:
        return '{0} {1}'.format(B, 'Byte' if B == 1 else 'Bytes')
    elif KB <= B < MB:
        return '{0:.2f} KB'.format(B/KB)
    elif MB <= B < GB: