def saveElasticS(df, esindex_name='transform_index', estype_name='transform_type'):
    logger.info("Saving main Transform model to Elasticsearch...")
    start_total_time = time.time()
    elastic_host = "https://9db53c7bb4f5be2d856033a9aeb6e5a5.us-central1.gcp.cloud.es.io"
    elastic_username = "******"
    elastic_port = 9243
    elastic_password = "******"
    INDEX = esindex_name
    TYPE = estype_name
    # use a shifted copy of the DataFrame index as the document id column
    df['indexId'] = (df.index + 100).astype(str)
    esp = Espandas(hosts=[elastic_host], port=elastic_port,
                   http_auth=(elastic_username, elastic_password))
    logger.info("Bulk insert into ElasticSearch, chunksize=%d, time_out: %d" % (20000, 60))
    logger.info("ElasticSearch host: %s", elastic_host)
    logger.info("ElasticSearch port: %s", elastic_port)
    logger.info(esp.es_write(df, INDEX, TYPE, chunksize=20000, rto=60))
    end_total_time = time.time() - start_total_time
    logger.info('Finished bulk insert to Elasticsearch, time taken: %.5f with total entities: %d'
                % (end_total_time, len(df)))
    return
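
# Hedged usage sketch (not part of the original source): saveElasticS only needs a
# DataFrame, since the index and type names default to 'transform_index' and
# 'transform_type'. The sample frame, its columns, and the alternative index name
# below are illustrative assumptions.
import pandas as pd

sample_df = pd.DataFrame({'topic_name': ['politics', 'sports'], 'score': [0.8, 0.6]})
saveElasticS(sample_df)                                      # default index/type names
saveElasticS(sample_df, esindex_name='transform_index_v2')   # or override the index name
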
def import_enriched_data(data, type, index=State.tenant):
    elastic_hosts = [
        "http://{ip}:{port}/".format(ip=State.ELASTIC_IP, port=State.ELASTIC_PORT)
    ]
    esp = Espandas(hosts=elastic_hosts, verify_certs=True)
    esp.es_write(data, index, type)
    click_topic_is_general as is_general,
    click_topic_count as num,
    click_topic_name as topic_name
FROM `kumparan-data.topic_recommender.click_distribution_hourly` CDH
"""

if G <= 0:
    logger.debug("G cannot be smaller than or equal to 0...")
    sys.exit()

# ~~~ create elastic index for fallback data ~~~
logger.info("ElasticSearch host: %s", elastic_host)
logger.info("ElasticSearch port: %s", elastic_port)
es = Elasticsearch([elastic_host], port=elastic_port)
esp = Espandas(hosts=[elastic_host], port=elastic_port)

logger.info("Checking topicrecommendation_transform_fallback_index availability...")
index_name = "topicrecommendation_transform_fallback_index"
is_index_exist = es.indices.exists(index=index_name)
request_body_fb = {
    "settings": {
        "number_of_shards": 2,
        "number_of_replicas": 1
    },
    "mappings": {
        "topicrecommendation_transform_fallback_type": {
            "properties": {
                "topid": {"type": "keyword"},
                "topic_name": {"type": "text"},
def pd_elastic(df):
    # use the DataFrame index as the 'indexId' document id column expected by Espandas
    df['indexId'] = df.index.astype(str)
    Index = input('index name: ')
    Type = input('type name: ')
    esp = Espandas()
    esp.es_write(df, Index, Type)
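
# Hedged usage sketch (illustrative only, assuming pandas is imported as pd):
# pd_elastic prompts on stdin for the index and type names, so it is meant to be
# run interactively. The sample DataFrame below is an assumption.
sample = pd.DataFrame({'name': ['alice', 'bob'], 'age': [30, 25]})
pd_elastic(sample)  # asks for 'index name: ' and 'type name: ' before writing
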
def test_es_client():
    """ Insert a DataFrame and test that it is correctly extracted """
    # Only run this test if the index does not already exist
    # and can be created and deleted
    if test_es():
        try:
            print('Connection to ElasticSearch established: testing write and read.')
            es = Elasticsearch()
            es.indices.create(INDEX)
            esp = Espandas()
            esp.es_write(df, INDEX, TYPE)
            k = list(df['indexId'].astype('str'))
            res = esp.es_read(k, INDEX, TYPE)
            # The returned DataFrame should match the original
            assert res.shape == df.shape
            assert np.all(res.index == df.index)
            assert np.all(res.columns == df.columns)
            assert np.all(res == df)
            # Bogus keys should not match anything
            res = esp.es_read(['bar'], INDEX, TYPE)
            assert res is None
            num_sample = 3
            present = list(df.sample(num_sample)['indexId'].astype('str'))
            present.append('bar')
            res = esp.es_read(present, INDEX, TYPE)
            assert res.shape[0] == num_sample
            # Test for invalid inputs
            # Input must be a DataFrame
            with pytest.raises(ValueError):
                esp.es_write('foobar', INDEX, TYPE)
            # uid_name must exist in the DataFrame
            with pytest.raises(ValueError):
                esp.es_write(df, INDEX, TYPE, uid_name='foo_index')
            # Values in uid_name must be unique
            df2 = df.copy()
            df2.loc[df2.index[0], 'indexId'] = df.loc[df.index[1], 'indexId']
            with pytest.raises(ValueError):
                esp.es_write(df2, INDEX, TYPE)
        finally:
            # Cleanup
            es.indices.delete(INDEX)
def reset_es_index(host_url, port, index_type='risky_ids', index_name='most_recent_risky_ids'):
    # request body used to create the index
    settings = {"settings": {"number_of_shards": 1, "number_of_replicas": 0}}
    # mappings for each supported index type
    maps = {}
    maps['risky_ids'] = {
        "properties": {
            "mobileId": {"type": "keyword"},
            "earliestContactTime": {"type": "date"},
            "status": {"type": "integer"},
        }
    }
    maps['new_risky_ids'] = {
        "properties": {
            "mobileId": {"type": "keyword"},
            "updateAt": {"type": "date"},
            "oldStatus": {"type": "integer"},
            "newStatus": {"type": "integer"}
        }
    }
    maps['red_zones'] = {
        "properties": {
            "mobileId": {"type": "keyword"},
            "maxTime": {"type": "date"},
            "minTime": {"type": "date"},
            "location": {"type": "geo_point"},
            "delta": {"type": "integer"}
        }
    }
    # connect to Elasticsearch
    es = Elasticsearch([host_url + ':' + port], timeout=600)
    # recreate the index if it already exists, then apply the requested mapping
    if es.indices.exists(index_name):
        es.indices.delete(index=index_name)
    es.indices.create(index=index_name, body=settings)
    es.indices.put_mapping(index=index_name, body=maps[index_type])
    return Espandas(es)
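
# Hedged usage sketch (illustrative only): reset_es_index returns an Espandas wrapper,
# which could then be used for a bulk write. The host, port, document ids, and sample
# records below are assumptions; es_write is called the same way as in the other
# snippets here, and pandas is assumed to be importable as pd.
import pandas as pd

esp = reset_es_index('http://localhost', '9200')
risky = pd.DataFrame({'indexId': ['1', '2'],
                      'mobileId': ['a1', 'b2'],
                      'earliestContactTime': ['2020-05-01T10:00:00', '2020-05-02T11:30:00'],
                      'status': [1, 2]})
esp.es_write(risky, 'most_recent_risky_ids', 'risky_ids')
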
from memory_profiler import LogFile
import sys

sys.stdout = LogFile('memory_profile6_log', reportIncrementFlag=False)

# ~~~~~~~~~~~~~
elastic_host = "https://9db53c7bb4f5be2d856033a9aeb6e5a5.us-central1.gcp.cloud.es.io"
elastic_username = "******"
elastic_port = 9243
elastic_password = "******"
logger.info("ElasticSearch host: %s", elastic_host)
logger.info("ElasticSearch port: %s", elastic_port)
es = Elasticsearch([elastic_host], port=elastic_port,
                   http_auth=(elastic_username, elastic_password))
esp = Espandas(hosts=[elastic_host], port=elastic_port,
               http_auth=(elastic_username, elastic_password))
pd.set_option('display.width', 1000)


def humanbytes(B):
    'Return the given bytes as a human friendly KB, MB, GB, or TB string'
    B = float(int(B))
    KB = float(1024)
    MB = float(KB ** 2)  # 1,048,576
    GB = float(KB ** 3)  # 1,073,741,824
    TB = float(KB ** 4)  # 1,099,511,627,776
    if B < KB:
        return '{0} {1}'.format(B, 'Bytes' if 0 == B or B > 1 else 'Byte')
    elif KB <= B < MB:
        return '{0:.2f} KB'.format(B / KB)
    elif MB <= B < GB: