def get_all_ml_files(cls, es_client: Elasticsearch) -> dict:
    """Get all scripts, pipelines, and models which start with ml_*."""
    pipelines = IngestClient(es_client).get_pipeline()
    scripts = es_client.cluster.state()['metadata']['stored_scripts']
    models = MlClient(es_client).get_trained_models()['trained_model_configs']
    manifests = get_ml_model_manifests_by_model_id()

    files = {
        'pipeline': {n: s for n, s in pipelines.items() if n.lower().startswith('ml_')},
        'script': {n: s for n, s in scripts.items() if n.lower().startswith('ml_')},
        'model': {
            m['model_id']: {'model': m, 'release': manifests[m['model_id']]}
            for m in models if m['model_id'] in manifests
        },
    }
    return files
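A minimal usage sketch for the helper above; the enclosing class name (MlManager) and the client setup are assumptions for illustration, not part of the original snippet:

# Hypothetical usage of get_all_ml_files; MlManager and the connection
# details are illustrative assumptions.
from elasticsearch import Elasticsearch

es = Elasticsearch(['http://localhost:9200'])
files = MlManager.get_all_ml_files(es_client=es)
for file_type, entries in files.items():
    print(file_type, sorted(entries))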
def add_attachment_pipeline(self):
    pipeline = {
        "description": "Extract attachment information encoded in Base64 with UTF-8 charset",
        "processors": [{
            "attachment": {
                "field": "attachment"
            }
        }]
    }
    ingest_client = IngestClient(self.client)
    if self.is_new:
        ingest_client.put_pipeline('attachment', pipeline)
def createPreprocessor(self): dropPipeline = { "description": "drop old room", "processors": [{ "drop": { "if": """ //Получаем текущую дату из параметра в формате ISO-8601 ZonedDateTime zdt = ZonedDateTime.parse(ctx.bazis_update_date); long millisDateTime = zdt.toInstant().toEpochMilli(); ZonedDateTime nowDate = ZonedDateTime.ofInstant(Instant.ofEpochMilli(millisDateTime), ZoneId.of("Z")); //Получаем end_date ZonedDateTime endDateZDT = ZonedDateTime.parse(ctx.end_date + "T00:00:00Z"); long millisDateTimeEndDate = endDateZDT.toInstant().toEpochMilli(); ZonedDateTime endDate = ZonedDateTime.ofInstant(Instant.ofEpochMilli(millisDateTimeEndDate), ZoneId.of("Z")); // Сравниваем даты return endDate.isBefore(nowDate) """ } }] } IngestClient(ES).put_pipeline(id=self.PIPELINE, body=dropPipeline)
def _createPipeline(self):
    ic = IngestClient(self.conn)
    self.pipeline_id = 'monthlyprocessor'
    pipeline_body = {
        "description": "monthly date-time index naming",
        "processors": [{
            "date_index_name": {
                "field": "@timestamp",
                "index_name_prefix": "{{ _index}}-",
                "date_rounding": "M",
            }
        }]
    }
    ic.put_pipeline(id=self.pipeline_id, body=pipeline_body)
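To preview which index name the date_index_name processor will produce, the pipeline can be run through the simulate API. A sketch, assuming a pre-configured client `es` (elasticsearch-py 7.x style):

# Sketch: simulate the 'monthlyprocessor' pipeline to preview the
# generated index name; 'es' is an assumed pre-configured client.
from elasticsearch.client import IngestClient

result = IngestClient(es).simulate(
    id='monthlyprocessor',
    body={'docs': [{'_index': 'metrics',
                    '_source': {'@timestamp': '2021-04-15T12:00:00Z'}}]},
)
# the simulated document's _index holds a date-math expression rounded to the month
print(result['docs'][0]['doc']['_index'])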
def setUpClass(cls):
    if cls._overridden_settings:
        cls._cls_overridden_context = override_settings(**cls._overridden_settings)
        cls._cls_overridden_context.enable()

    connections.configure(**settings.ELASTICSEARCH_CONNECTIONS)
    cls.es_client = cls._get_client()
    IngestClient(cls.es_client).put_pipeline(
        id='ingest_attachment',
        body={
            'description': "Extract attachment information",
            'processors': [
                {
                    "attachment": {
                        "field": "data",
                        "indexed_chars": "-1"
                    }
                },
                {
                    "remove": {
                        "field": "data"
                    }
                }
            ]
        })
    super().setUpClass()
def putPipelines():
    conn = get_connection()
    client = IngestClient(conn)
    client.put_pipeline(
        id='rename_structure_unit_description',
        body={
            'description': "Rename field _source.description to _source.desc",
            'processors': [
                {
                    "rename": {
                        "field": "_source.description",
                        "target_field": "_source.desc",
                    },
                },
            ]
        })
def __init__(self, db, sm_config=None):
    self.sm_config = sm_config or SMConfig.get_conf()
    self._es: Elasticsearch = init_es_conn(self.sm_config['elasticsearch'])
    self._ingest: IngestClient = IngestClient(self._es)
    self._db = db
    self._ds_locker = DBMutex(self.sm_config['db'])
    self.index = self.sm_config['elasticsearch']['index']
    self._get_mol_by_formula_dict_cache = dict()
def remove_ml_scripts_pipelines(cls, es_client: Elasticsearch, ml_type: List[str]) -> dict:
    """Remove all ML script and pipeline files."""
    results = dict(script={}, pipeline={})
    ingest_client = IngestClient(es_client)

    files = cls.get_all_ml_files(es_client=es_client)
    for file_type, data in files.items():
        for name in list(data):
            this_type = name.split('_')[1].lower()
            if this_type not in ml_type:
                continue
            if file_type == 'script':
                results[file_type][name] = es_client.delete_script(name)
            elif file_type == 'pipeline':
                results[file_type][name] = ingest_client.delete_pipeline(name)

    return results
def _create_ingest_pipeline(self) -> None:
    """
    Create an ingest pipeline that extracts file content and makes it
    available for search.
    """
    p = IngestClient(self.es)
    # TODO - G.M - 2019-05-31 - check if possible to set specific analyzer for
    # attachment content parameters. Goal:
    # allow ngram or lang specific indexing for "in file search"
    p.put_pipeline(
        id="attachment",
        body={
            "description": "Extract attachment information",
            "processors": [{
                "attachment": {
                    "field": "file"
                }
            }],
        },
    )
def check_model_files(ctx):
    """Check ML model files on an Elasticsearch instance."""
    from elasticsearch.client import IngestClient, MlClient

    from .misc import get_ml_model_manifests_by_model_id

    es_client: Elasticsearch = ctx.obj['es']
    ml_client = MlClient(es_client)
    ingest_client = IngestClient(es_client)

    def safe_get(func, arg):
        try:
            return func(arg)
        except elasticsearch.NotFoundError:
            return None

    models = [
        m for m in ml_client.get_trained_models().get('trained_model_configs', [])
        if m['created_by'] != '_xpack'
    ]

    if models:
        if len([m for m in models if m['model_id'].startswith('dga_')]) > 1:
            click.secho('Multiple DGA models detected! It is not recommended to run '
                        'more than one DGA model at a time', fg='yellow')

        manifests = get_ml_model_manifests_by_model_id()

        click.echo(f'DGA Model{"s" if len(models) > 1 else ""} found:')
        for model in models:
            manifest = manifests.get(model['model_id'])
            click.echo(f' - {model["model_id"]}, associated release: '
                       f'{manifest.html_url if manifest else None}')
    else:
        click.echo('No DGA Models found')

    support_files = {
        'create_script': safe_get(es_client.get_script, 'dga_ngrams_create'),
        'delete_script': safe_get(es_client.get_script, 'dga_ngrams_transform_delete'),
        'enrich_pipeline': safe_get(ingest_client.get_pipeline, 'dns_enrich_pipeline'),
        'inference_pipeline': safe_get(ingest_client.get_pipeline,
                                       'dns_dga_inference_enrich_pipeline')
    }

    click.echo('Support Files:')
    for support_file, results in support_files.items():
        click.echo(f' - {support_file}: {"found" if results else "not found"}')
def installPipelines():
    conn = get_connection()
    client = IngestClient(conn)
    client.put_pipeline(
        id='ingest_attachment',
        body={
            'description': "Extract attachment information",
            'processors': [
                {
                    "attachment": {
                        "field": "data",
                        "indexed_chars": "-1"
                    }
                },
                {
                    "remove": {
                        "field": "data"
                    }
                }
            ]
        })
    client.put_pipeline(
        id='add_timestamp',
        body={
            'description': "Adds an index_date timestamp",
            'processors': [
                {
                    "set": {
                        "field": "index_date",
                        "value": "{{_ingest.timestamp}}",
                    },
                },
            ]
        })
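Once installed, documents can be routed through a pipeline by name at index time. A minimal sketch, assuming a configured client `es` and an illustrative index name:

# Sketch: index a base64-encoded file through the 'ingest_attachment'
# pipeline; 'es' and the 'documents' index are illustrative assumptions.
import base64

data = base64.b64encode(b'Hello attachment').decode('utf-8')
es.index(index='documents', pipeline='ingest_attachment', body={'data': data})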
def remove_dga_model(ctx, model_id, force, es_client: Elasticsearch = None,
                     ml_client: MlClient = None, ingest_client: IngestClient = None):
    """Remove ML DGA files."""
    from elasticsearch.client import IngestClient, MlClient

    es_client = es_client or ctx.obj['es']
    ml_client = ml_client or MlClient(es_client)
    ingest_client = ingest_client or IngestClient(es_client)

    def safe_delete(func, fid, verbose=True):
        try:
            func(fid)
        except elasticsearch.NotFoundError:
            return False
        if verbose:
            click.echo(f' - {fid} deleted')
        return True

    model_exists = False
    if not force:
        existing_models = ml_client.get_trained_models()
        model_exists = model_id in [
            m['model_id'] for m in existing_models.get('trained_model_configs', [])
        ]

    if model_exists or force:
        if model_exists:
            click.secho('[-] Existing model detected - deleting files', fg='yellow')

        deleted = [
            safe_delete(ingest_client.delete_pipeline, 'dns_dga_inference_enrich_pipeline'),
            safe_delete(ingest_client.delete_pipeline, 'dns_enrich_pipeline'),
            # f'{model_id}_dga_ngrams_transform_delete'
            safe_delete(es_client.delete_script, 'dga_ngrams_transform_delete'),
            # f'{model_id}_dga_ngrams_create'
            safe_delete(es_client.delete_script, 'dga_ngrams_create'),
            safe_delete(ml_client.delete_trained_model, model_id)
        ]

        if not any(deleted):
            click.echo('No files deleted')
    else:
        click.echo(f'Model: {model_id} not found')
class Pipeline:
    """
    A pipeline is a definition of a series of processors that are to be
    executed in the same order as they are declared.
    (https://www.elastic.co/guide/en/elasticsearch/reference/current/pipeline.html)

    Parameters
    ----------
    client: elasticsearch.Elasticsearch
        An Elasticsearch client.
    name: str
        Name for the pipeline.
    pipeline_handler: :obj:`PipelineHandler`
        Object that contains the JSON definition of the pipeline.

    Attributes
    ----------
    client: str
    name: str
    pipeline_handler: :obj:`PipelineHandler`
    """

    def __init__(self, client, name, pipeline_handler):
        self._pipeline_handler = pipeline_handler
        self._name = name
        self.ingest_client = IngestClient(client)

    def create_pipeline(self, params=None):
        """
        Create the Elasticsearch pipeline with the processors specified in
        the pipeline_handler JSON.
        """
        try:
            self.ingest_client.put_pipeline(
                self._name, load_json(self._pipeline_handler._json))
        except Exception as e:
            raise e
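A usage sketch for the class above; the PipelineHandler construction is hypothetical since its constructor is not shown in this snippet:

# Hypothetical usage of Pipeline; the PipelineHandler constructor and the
# pipeline JSON path are assumptions for illustration.
from elasticsearch import Elasticsearch

es = Elasticsearch(['http://localhost:9200'])
handler = PipelineHandler('pipelines/lowercase_user.json')  # hypothetical constructor
Pipeline(es, 'lowercase_user', handler).create_pipeline()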
def createPreprocessor(self): dropPipeline = { "description": "drop not actulal addresses", "processors": [{ "drop": { "if": "ctx.curr_status != '0' " } }, { "drop": { "if": "ctx.act_status != '1'" } }, { "drop": { "if": "ctx.live_status != '1'" } }] } IngestClient(ES).put_pipeline(id=PIPELINE, body=dropPipeline)
class IngestConnector:
    def __init__(self, pipeline_id: str = "pdf_content", field: str = "data",
                 pipeline_description: str = "Extracting info from pdf content"):
        self.pipeline_id: str = pipeline_id
        self.index_name: str = pipeline_id + "_index"
        self.field: str = field
        self.pipeline_description: str = pipeline_description
        self.ingest_client = IngestClient(current_app.elasticsearch)

    def create_pipeline(self):
        self.ingest_client.put_pipeline(
            id=self.pipeline_id,
            body={
                'description': self.pipeline_description,
                'processors': [{
                    "attachment": {
                        "field": self.field
                    }
                }]
            })

    def delete_pipeline(self):
        self.ingest_client.delete_pipeline(id=self.pipeline_id)

    def get_pipeline(self):
        return self.ingest_client.get_pipeline(id=self.pipeline_id)

    def add_to_index(self, id_: int, content: str, content_page: int, content_paragraph: int):
        current_app.elasticsearch.index(
            index=self.index_name,
            id=id_,
            pipeline=self.pipeline_id,
            body={
                self.field: base64.b64encode(content.encode("utf-8")).decode("utf-8"),
                "content_page": content_page,
                "content_paragraph": content_paragraph,
            })

    def remove_from_index(self, id_: int):
        current_app.elasticsearch.delete(index=self.index_name, id=id_)

    def api_search(self, query: str):
        return current_app.elasticsearch.search(
            index=self.index_name,
            body={"query": {"match": {"attachment.content": query}}})

    def search(self, query: str):
        search = self.api_search(query)
        ids = [int(hit['_id']) for hit in search['hits']['hits']]
        if len(ids) == 0:
            return None
        when = [(id_, i) for i, id_ in enumerate(ids)]
        res = (KnowledgePdfContent.query
               .filter(KnowledgePdfContent.id.in_(ids))
               .order_by(db.case(when, value=KnowledgePdfContent.id))
               .all())
        return res[0] if len(res) > 0 else None
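A sketch of how the connector might be driven from within a Flask application context; the app object and its elasticsearch attribute are assumed to exist:

# Sketch: exercise IngestConnector inside an application context;
# 'app' is an assumed Flask app exposing an 'elasticsearch' client.
with app.app_context():
    connector = IngestConnector()
    connector.create_pipeline()
    connector.add_to_index(1, 'searchable pdf text', content_page=3, content_paragraph=2)
    match = connector.search('searchable')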
import elasticsearch
from elasticsearch import Elasticsearch
from elasticsearch.client import IngestClient
from elasticsearch_dsl import (Document, InnerDoc, Date, Integer, Long, Text,
                               Object, GeoPoint, Keyword, Boolean, connections)
from django.conf import settings

from blog.models import Article

ELASTICSEARCH_ENABLED = hasattr(settings, 'ELASTICSEARCH_DSL')

if ELASTICSEARCH_ENABLED:
    connections.create_connection(
        hosts=[settings.ELASTICSEARCH_DSL['default']['hosts']])
    es = Elasticsearch(settings.ELASTICSEARCH_DSL['default']['hosts'])
    c = IngestClient(es)
    try:
        c.get_pipeline('geoip')
    except elasticsearch.exceptions.NotFoundError:
        c.put_pipeline('geoip', body='''{
            "description" : "Add geoip info",
            "processors" : [
                {
                    "geoip" : {
                        "field" : "ip"
                    }
                }
            ]
        }''')
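With the pipeline in place, any document carrying an `ip` field can be enriched at index time. A minimal sketch reusing the `es` client from above; the index name is illustrative:

# Sketch: route a document through the 'geoip' pipeline; the 'visits'
# index is an illustrative assumption.
es.index(index='visits', pipeline='geoip', body={'ip': '8.8.8.8'})
# enriched documents gain a 'geoip' object (country_iso_code, location, ...)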
def setup_dga_model(ctx, model_tag, repo, model_dir, overwrite):
    """Upload ML DGA model and dependencies and enrich DNS data."""
    import io
    import requests
    import shutil
    import zipfile

    es_client: Elasticsearch = ctx.obj['es']
    client_info = es_client.info()
    license_client = LicenseClient(es_client)

    if license_client.get()['license']['type'].lower() not in ('platinum', 'enterprise'):
        client_error('You must have a platinum or enterprise subscription in order to '
                     'use these ML features')

    # download files if necessary
    if not model_dir:
        if not model_tag:
            client_error('model-tag or model-dir required to download model files')

        click.echo(f'Downloading artifact: {model_tag}')

        release_url = f'https://api.github.com/repos/{repo}/releases/tags/{model_tag}'
        release = requests.get(release_url)
        release.raise_for_status()
        assets = [a for a in release.json()['assets']
                  if a['name'].startswith('ML-DGA') and a['name'].endswith('.zip')]

        if len(assets) != 1:
            client_error(f'Malformed release: expected 1 match ML-DGA zip, found: {len(assets)}!')

        zipped_url = assets[0]['browser_download_url']
        zipped = requests.get(zipped_url)
        z = zipfile.ZipFile(io.BytesIO(zipped.content))

        dga_dir = get_path('ML-models', 'DGA')
        model_dir = os.path.join(dga_dir, model_tag)
        os.makedirs(dga_dir, exist_ok=True)
        shutil.rmtree(model_dir, ignore_errors=True)
        z.extractall(dga_dir)
        click.echo(f'files saved to {model_dir}')

        # read files as needed
        z.close()

    def get_model_filename(pattern):
        paths = list(Path(model_dir).glob(pattern))
        if not paths:
            client_error(f'{model_dir} missing files matching the pattern: {pattern}')
        if len(paths) > 1:
            client_error(f'{model_dir} contains multiple files matching the pattern: {pattern}')
        return paths[0]

    @contextmanager
    def open_model_file(name):
        pattern = expected_ml_dga_patterns[name]
        with open(get_model_filename(pattern), 'r') as f:
            yield json.load(f)

    model_id, _ = os.path.basename(
        get_model_filename('dga_*_model.json')).rsplit('_', maxsplit=1)

    click.echo(f'Setting up DGA model: "{model_id}" on {client_info["name"]} '
               f'({client_info["version"]["number"]})')

    # upload model
    ml_client = MlClient(es_client)
    ingest_client = IngestClient(es_client)

    existing_models = ml_client.get_trained_models()
    if model_id in [m['model_id'] for m in existing_models.get('trained_model_configs', [])]:
        if overwrite:
            ctx.invoke(remove_dga_model, model_id=model_id, es_client=es_client,
                       ml_client=ml_client, ingest_client=ingest_client, force=True)
        else:
            client_error(f'Model: {model_id} already exists on stack! '
                         f'Try --overwrite to force the upload')

    click.secho('[+] Uploading model (may take a while)')

    with open_model_file('model') as model_file:
        try:
            ml_client.put_trained_model(model_id=model_id, body=model_file)
        except elasticsearch.ConnectionTimeout:
            msg = ('Connection timeout, try increasing timeout using '
                   '`es --timeout <secs> experimental setup_dga_model`.')
            client_error(msg)

    # install scripts
    click.secho('[+] Uploading painless scripts')

    with open_model_file('dga_ngrams_create') as painless_install:
        # f'{model_id}_dga_ngrams_create'
        es_client.put_script(id='dga_ngrams_create', body=painless_install)

    with open_model_file('dga_ngrams_transform_delete') as painless_delete:
        # f'{model_id}_dga_ngrams_transform_delete'
        es_client.put_script(id='dga_ngrams_transform_delete', body=painless_delete)

    # Install ingest pipelines
    click.secho('[+] Uploading pipelines')

    def _build_es_script_error(err, pipeline_file):
        error = err.info['error']
        cause = error['caused_by']
        error_msg = [
            f'Script error while uploading {pipeline_file}: {cause["type"]} - {cause["reason"]}',
            ' '.join(f'{k}: {v}' for k, v in error['position'].items()),
            '\n'.join(error['script_stack'])
        ]
        return click.style('\n'.join(error_msg), fg='red')

    with open_model_file('dns_enrich_pipeline') as ingest_pipeline1:
        try:
            ingest_client.put_pipeline(id='dns_enrich_pipeline', body=ingest_pipeline1)
        except elasticsearch.RequestError as e:
            if e.error == 'script_exception':
                client_error(_build_es_script_error(e, 'ingest_pipeline1'), e, ctx=ctx)
            else:
                raise

    with open_model_file('dns_dga_inference_enrich_pipeline') as ingest_pipeline2:
        try:
            ingest_client.put_pipeline(id='dns_dga_inference_enrich_pipeline',
                                       body=ingest_pipeline2)
        except elasticsearch.RequestError as e:
            if e.error == 'script_exception':
                client_error(_build_es_script_error(e, 'ingest_pipeline2'), e, ctx=ctx)
            else:
                raise

    click.echo('Ensure that you have updated your packetbeat.yml config file.')
    click.echo(' - reference: ML_DGA.md #2-update-packetbeat-configuration')
    click.echo('Associated rules and jobs can be found under ML-experimental-detections '
               'releases in the repo')
    click.echo('To upload rules, run: kibana upload-rule <ml-rule.toml>')
    click.echo('To upload ML jobs, run: es experimental upload-ml-job <ml-job.json>')
def createElasticSearchIngestPipeline(self):
    esIngestClient = IngestClient(self.client)

    self.constructLanguagePipeline(
        esIngestClient,
        'title_language_detector',
        'Work title language detection',
        field='title.'
    )

    self.constructLanguagePipeline(
        esIngestClient,
        'alt_title_language_detector',
        'Work alt_title language detection',
        prefix='_ingest._value.'
    )

    self.constructLanguagePipeline(
        esIngestClient,
        'edition_title_language_detector',
        'Edition title language detection',
        prefix='_ingest._value.',
        field='title.'
    )

    self.constructLanguagePipeline(
        esIngestClient,
        'edition_sub_title_language_detector',
        'Edition subtitle language detection',
        prefix='_ingest._value.',
        field='sub_title.'
    )

    self.constructLanguagePipeline(
        esIngestClient,
        'subject_heading_language_detector',
        'Subject heading language detection',
        prefix='_ingest._value.',
        field='heading.'
    )

    esIngestClient.put_pipeline(
        id='foreach_alt_title_language_detector',
        body={
            'description': 'loop for parsing alt_titles',
            'processors': [
                {
                    'foreach': {
                        'field': 'alt_titles',
                        'processor': {
                            'pipeline': {
                                'name': 'alt_title_language_detector',
                            }
                        }
                    }
                }
            ]
        }
    )

    esIngestClient.put_pipeline(
        id='edition_language_detector',
        body={
            'description': 'loop for parsing edition fields',
            'processors': [
                {
                    'pipeline': {
                        'name': 'edition_title_language_detector',
                        'ignore_failure': True
                    }
                },
                {
                    'pipeline': {
                        'name': 'edition_sub_title_language_detector',
                        'ignore_failure': True
                    }
                }
            ]
        }
    )

    esIngestClient.put_pipeline(
        id='language_detector',
        body={
            'description': 'Full language processing',
            'processors': [
                {
                    'pipeline': {
                        'name': 'title_language_detector',
                        'ignore_failure': True
                    }
                },
                {
                    'pipeline': {
                        'name': 'foreach_alt_title_language_detector',
                        'ignore_failure': True
                    }
                },
                {
                    'foreach': {
                        'field': 'editions',
                        'processor': {
                            'pipeline': {
                                'name': 'edition_language_detector',
                                'ignore_failure': True
                            }
                        }
                    }
                },
                {
                    'foreach': {
                        'field': 'subjects',
                        'ignore_missing': True,
                        'processor': {
                            'pipeline': {
                                'name': 'subject_heading_language_detector',
                                'ignore_failure': True
                            }
                        }
                    }
                }
            ]
        }
    )
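The composed top-level pipeline can then be referenced at index time. A sketch; the client, the index name, and the document shape are illustrative assumptions (the actual record layout depends on constructLanguagePipeline, which is not shown here):

# Sketch: send a record through the composed 'language_detector' pipeline;
# 'es', the 'works' index, and the document fields are assumptions.
es.index(index='works', pipeline='language_detector', body={
    'title': 'La sombra del viento',
    'alt_titles': [{'title': 'The Shadow of the Wind'}],
    'editions': [{'title': 'La sombra del viento', 'sub_title': 'novela'}],
    'subjects': [{'heading': 'Fiction'}],
})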
"field_map": {} } }] }, "docs": [{ "_source": { "sl": 4.2, "sw": 3.9, "pl": 1.9, "pw": 0.4 } }] } # simulate ingest pipeline IngestClient.simulate(es, body) # In[ ]: # Lets include an english name to convert the predicted value back to english flower name # set up an enrich index # create docs mapping serialized values -> english name mapping_index_name = model_id + '_mapping' mapping_docs = [] now = datetime.now() for pos, name in enumerate(labels.categories): mapping_docs.append({ "_index": mapping_index_name,
def deletePipelines():
    conn = get_connection()
    client = IngestClient(conn)
    client.delete_pipeline(id='rename_structure_unit_description')
def ingest_client(self) -> IngestClient:
    return IngestClient(self.es_client)
import json
from datetime import datetime

import requests
from elasticsearch import Elasticsearch
from elasticsearch.client import IndicesClient
from elasticsearch.client import IngestClient

res = requests.get('http://localhost:9200')
print(res)

# connect to our cluster
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
es_index = IndicesClient(es)
ex_indices = IngestClient(es)

r = requests.get('http://localhost:9200')
i = 1
if r.status_code == 200:
    doc1 = {
        "description": "Extract attachment information from arrays",
        "processors": [{
            "foreach": {
                "field": "attachments",
                "processor": {
                    "attachment": {
                        "target_field": "_ingest._value.attachment",
                        "field": "_ingest._value.data"
                    }
                }
            ]
            ds_doc['config']['database_ids'] = moldb_ids
            update_es_dataset(ds_doc, moldb_name_id_map)
            update_es_annotations(ds_doc, moldb_name_id_map_rev)
            update_db_dataset(ds_doc)
        except Exception as e:
            logger.warning(f'Failed to migrate dataset {ds_doc["id"]}: {e}')
            failed_datasets.append((ds_doc['id'], e))

    if failed_datasets:
        print('FAILED DATASETS:')
        for ds, err in failed_datasets:
            print(ds, err)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Migrate moldb names -> moldb ids')
    parser.add_argument('--config', default='conf/config.json')
    parser.add_argument('--where', help='SQL WHERE statement')
    parser.add_argument('--ds-ids', help='Dataset ids, comma separated list')
    args = parser.parse_args()

    with GlobalInit(args.config) as sm_config:
        es: Elasticsearch = init_es_conn(sm_config['elasticsearch'])
        ingest: IngestClient = IngestClient(es)
        ds_ids = args.ds_ids.split(',') if args.ds_ids else None
        migrate_moldbs(args.where, ds_ids)
import json

import pymssql
import requests
from elasticsearch import Elasticsearch
from elasticsearch.client import IndicesClient
from elasticsearch.client import IngestClient

conn = pymssql.connect(server="40.71.86.193", port=1433, user="******",
                       password="******", database="testCaseFinal")
cursor = conn.cursor()

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
es_index = IndicesClient(es)
ex_indices = IngestClient(es)

table_name = 'Attachment_sfdc'
query = "SELECT Body FROM Attachment_sfdc "
var = cursor.execute(query)
data_tup_list = cursor.fetchall()
# print(data_tup_list)

r = requests.get('http://localhost:9200')
i = 1
if r.status_code == 200:
    '''
    doc1 = {
        "description" : "Extract attachment information from arrays",
        "processors" : [
def make_pipelines():
    with open(os.path.join(pipelines_dir, "pipelines.json")) as file:
        pipelines = json.load(file)

    ing_client = IngestClient(client)
    for key in pipelines.keys():
        print("Creating {0}: {1}".format(key, ing_client.put_pipeline(key, pipelines[key])))
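make_pipelines expects pipelines.json to map each pipeline id to its body. A hypothetical example of that shape, shown as a Python literal and reusing the add_timestamp pipeline from the installPipelines snippet above:

# Hypothetical contents of pipelines.json: each key is a pipeline id,
# each value a valid pipeline body.
pipelines_json = {
    "add_timestamp": {
        "description": "Adds an index_date timestamp",
        "processors": [
            {"set": {"field": "index_date", "value": "{{_ingest.timestamp}}"}}
        ]
    }
}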