Example #1
    def get_all_ml_files(cls, es_client: Elasticsearch) -> dict:
        """Get all scripts, pipelines, and models which start with ml_*."""
        pipelines = IngestClient(es_client).get_pipeline()
        scripts = es_client.cluster.state()['metadata']['stored_scripts']
        models = MlClient(
            es_client).get_trained_models()['trained_model_configs']
        manifests = get_ml_model_manifests_by_model_id()

        files = {
            'pipeline': {
                n: s
                for n, s in pipelines.items() if n.lower().startswith('ml_')
            },
            'script':
            {n: s
             for n, s in scripts.items() if n.lower().startswith('ml_')},
            'model': {
                m['model_id']: {
                    'model': m,
                    'release': manifests[m['model_id']]
                }
                for m in models if m['model_id'] in manifests
            },
        }
        return files
Example #2
    def __init__(
            self,
            pipeline_id: str = "pdf_content",
            field: str = "data",
            pipeline_description: str = "Extracting info from pdf content"):
        self.pipeline_id: str = pipeline_id
        self.index_name: str = pipeline_id + "_index"
        self.field: str = field
        self.pipeline_description: str = pipeline_description

        self.ingest_client = IngestClient(current_app.elasticsearch)
Example #3
    def add_attachment_pipeline(self):
        pipeline = {
            "description":
            "Extract attachment information encoded in Base64 with UTF-8 charset",
            "processors": [{
                "attachment": {
                    "field": "attachment"
                }
            }]
        }

        ingest_client = IngestClient(self.client)
        if self.is_new:
            ingest_client.put_pipeline('attachment', pipeline)
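A hedged usage sketch for the pipeline above (the index name and payload are illustrative, and the attachment processor additionally requires the ingest-attachment plugin on the cluster): documents are sent through the pipeline by name, with Base64-encoded bytes in the field the processor reads.

import base64

from elasticsearch import Elasticsearch

es = Elasticsearch()
payload = base64.b64encode(b"plain text or PDF bytes").decode("utf-8")
# The attachment processor decodes the Base64 value and adds an "attachment" object
# with the extracted content and metadata.
es.index(index="documents", pipeline="attachment", body={"attachment": payload})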
Example #4
    def createPreprocessor(self):
        dropPipeline = {
            "description":
            "drop old room",
            "processors": [{
                "drop": {
                    "if":
                    """
        // Get the current date from the parameter, in ISO-8601 format
        ZonedDateTime zdt = ZonedDateTime.parse(ctx.bazis_update_date);
        long millisDateTime = zdt.toInstant().toEpochMilli();
        ZonedDateTime nowDate =
        ZonedDateTime.ofInstant(Instant.ofEpochMilli(millisDateTime), ZoneId.of("Z"));

        // Get end_date
        ZonedDateTime endDateZDT = ZonedDateTime.parse(ctx.end_date + "T00:00:00Z");
        long millisDateTimeEndDate = endDateZDT.toInstant().toEpochMilli();
        ZonedDateTime endDate =
        ZonedDateTime.ofInstant(Instant.ofEpochMilli(millisDateTimeEndDate), ZoneId.of("Z"));

        // Compare the dates
        return endDate.isBefore(nowDate)
        """
                }
            }]
        }
        IngestClient(ES).put_pipeline(id=self.PIPELINE, body=dropPipeline)
Example #5
 def _createPipeline(self):
     ic = IngestClient(self.conn)
     self.pipeline_id = 'monthlyprocessor'
     pipeline_body = {
         "description": "monthly date-time index naming",
         "processors": [
             {
                 "date_index_name": {
                     "field": "@timestamp",
                     "index_name_prefix": "{{ _index}}-",
                     "date_rounding": "M",
                 }
             }
         ]
     }
     ic.put_pipeline(id=self.pipeline_id, body=pipeline_body)
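A hedged sketch of how the pipeline above behaves (client setup and field values are illustrative): a document indexed through monthlyprocessor does not land in the named index itself; the date_index_name processor rewrites the target to a month-rounded index derived from @timestamp.

from elasticsearch import Elasticsearch

es = Elasticsearch()
# With "M" rounding and the default date format, the document is routed to an index
# such as "metrics-2021-04-01" rather than "metrics".
es.index(index="metrics", pipeline="monthlyprocessor",
         body={"@timestamp": "2021-04-25T12:00:00Z", "value": 1})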
Example #6
    def setUpClass(cls):
        if cls._overridden_settings:
            cls._cls_overridden_context = override_settings(
                **cls._overridden_settings)
            cls._cls_overridden_context.enable()

        connections.configure(**settings.ELASTICSEARCH_CONNECTIONS)
        cls.es_client = cls._get_client()

        IngestClient(cls.es_client).put_pipeline(
            id='ingest_attachment',
            body={
                'description':
                "Extract attachment information",
                'processors': [{
                    "attachment": {
                        "field": "data",
                        "indexed_chars": "-1"
                    },
                    "remove": {
                        "field": "data"
                    }
                }]
            })

        super().setUpClass()
Example #7
def putPipelines():
    conn = get_connection()
    client = IngestClient(conn)
    client.put_pipeline(id='rename_structure_unit_description',
                        body={
                            'description':
                            "Rename field _source.description to _source.desc",
                            'processors': [
                                {
                                    "rename": {
                                        "field": "_source.description",
                                        "target_field": "_source.desc",
                                    },
                                },
                            ]
                        })
Example #8
 def __init__(self, db, sm_config=None):
     self.sm_config = sm_config or SMConfig.get_conf()
     self._es: Elasticsearch = init_es_conn(self.sm_config['elasticsearch'])
     self._ingest: IngestClient = IngestClient(self._es)
     self._db = db
     self._ds_locker = DBMutex(self.sm_config['db'])
     self.index = self.sm_config['elasticsearch']['index']
     self._get_mol_by_formula_dict_cache = dict()
Example #9
    def remove_ml_scripts_pipelines(cls, es_client: Elasticsearch,
                                    ml_type: List[str]) -> dict:
        """Remove all ML script and pipeline files."""
        results = dict(script={}, pipeline={})
        ingest_client = IngestClient(es_client)

        files = cls.get_all_ml_files(es_client=es_client)
        for file_type, data in files.items():
            for name in list(data):
                this_type = name.split('_')[1].lower()
                if this_type not in ml_type:
                    continue
                if file_type == 'script':
                    results[file_type][name] = es_client.delete_script(name)
                elif file_type == 'pipeline':
                    results[file_type][name] = ingest_client.delete_pipeline(
                        name)

        return results
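A hedged invocation sketch; the owning class name MlUtils is hypothetical, since only the classmethods appear in these examples, and each ml_type entry corresponds to the second token of an ml_* file name.

from elasticsearch import Elasticsearch

es = Elasticsearch()
# MlUtils is a hypothetical stand-in for the class that defines these classmethods.
removed = MlUtils.remove_ml_scripts_pipelines(es_client=es, ml_type=['dga'])
print(list(removed['pipeline']), list(removed['script']))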
Example #10
 def _create_ingest_pipeline(self) -> None:
     """
     Create an ingest pipeline to extract file content and make it searchable.
     """
     p = IngestClient(self.es)
     # TODO - G.M - 2019-05-31 - check if possible to set specific analyzer for
     # attachment content parameters. Goal :
     # allow ngram or lang specific indexing for "in file search"
     p.put_pipeline(
         id="attachment",
         body={
             "description": "Extract attachment information",
             "processors": [{
                 "attachment": {
                     "field": "file"
                 }
             }],
         },
     )
Example #11
def check_model_files(ctx):
    """Check ML model files on an elasticsearch instance."""
    from elasticsearch.client import IngestClient, MlClient
    from .misc import get_ml_model_manifests_by_model_id

    es_client: Elasticsearch = ctx.obj['es']
    ml_client = MlClient(es_client)
    ingest_client = IngestClient(es_client)

    def safe_get(func, arg):
        try:
            return func(arg)
        except elasticsearch.NotFoundError:
            return None

    models = [
        m for m in ml_client.get_trained_models().get(
            'trained_model_configs', []) if m['created_by'] != '_xpack'
    ]

    if models:
        if len([m for m in models if m['model_id'].startswith('dga_')]) > 1:
            click.secho(
                'Multiple DGA models detected! It is not recommended to run more than one DGA model at a time',
                fg='yellow')

        manifests = get_ml_model_manifests_by_model_id()

        click.echo(f'DGA Model{"s" if len(models) > 1 else ""} found:')
        for model in models:
            manifest = manifests.get(model['model_id'])
            click.echo(
                f'    - {model["model_id"]}, associated release: {manifest.html_url if manifest else None}'
            )
    else:
        click.echo('No DGA Models found')

    support_files = {
        'create_script':
        safe_get(es_client.get_script, 'dga_ngrams_create'),
        'delete_script':
        safe_get(es_client.get_script, 'dga_ngrams_transform_delete'),
        'enrich_pipeline':
        safe_get(ingest_client.get_pipeline, 'dns_enrich_pipeline'),
        'inference_pipeline':
        safe_get(ingest_client.get_pipeline,
                 'dns_dga_inference_enrich_pipeline')
    }

    click.echo('Support Files:')
    for support_file, results in support_files.items():
        click.echo(
            f'    - {support_file}: {"found" if results else "not found"}')
Example #12
def installPipelines():
    conn = get_connection()
    client = IngestClient(conn)
    client.put_pipeline(id='ingest_attachment',
                        body={
                            'description':
                            "Extract attachment information",
                            'processors': [{
                                "attachment": {
                                    "field": "data",
                                    "indexed_chars": "-1"
                                },
                                "remove": {
                                    "field": "data"
                                }
                            }]
                        })
    client.put_pipeline(id='add_timestamp',
                        body={
                            'description':
                            "Adds an index_date timestamp",
                            'processors': [
                                {
                                    "set": {
                                        "field": "index_date",
                                        "value": "{{_ingest.timestamp}}",
                                    },
                                },
                            ]
                        })
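A hedged verification sketch (the sample document is illustrative): the ingest simulate API can preview what the add_timestamp pipeline produces before any real documents are indexed.

client = IngestClient(get_connection())
preview = client.simulate(id='add_timestamp',
                          body={'docs': [{'_source': {'title': 'example'}}]})
# The simulated _source should now include the generated index_date field.
print(preview['docs'][0]['doc']['_source'])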
Example #13
def remove_dga_model(ctx,
                     model_id,
                     force,
                     es_client: Elasticsearch = None,
                     ml_client: MlClient = None,
                     ingest_client: IngestClient = None):
    """Remove ML DGA files."""
    from elasticsearch.client import IngestClient, MlClient

    es_client = es_client or ctx.obj['es']
    ml_client = ml_client or MlClient(es_client)
    ingest_client = ingest_client or IngestClient(es_client)

    def safe_delete(func, fid, verbose=True):
        try:
            func(fid)
        except elasticsearch.NotFoundError:
            return False
        if verbose:
            click.echo(f' - {fid} deleted')
        return True

    model_exists = False
    if not force:
        existing_models = ml_client.get_trained_models()
        model_exists = model_id in [
            m['model_id']
            for m in existing_models.get('trained_model_configs', [])
        ]

    if model_exists or force:
        if model_exists:
            click.secho('[-] Existing model detected - deleting files',
                        fg='yellow')

        deleted = [
            safe_delete(ingest_client.delete_pipeline,
                        'dns_dga_inference_enrich_pipeline'),
            safe_delete(ingest_client.delete_pipeline, 'dns_enrich_pipeline'),
            safe_delete(es_client.delete_script,
                        'dga_ngrams_transform_delete'),
            # f'{model_id}_dga_ngrams_transform_delete'
            safe_delete(es_client.delete_script, 'dga_ngrams_create'),
            # f'{model_id}_dga_ngrams_create'
            safe_delete(ml_client.delete_trained_model, model_id)
        ]

        if not any(deleted):
            click.echo('No files deleted')
    else:
        click.echo(f'Model: {model_id} not found')
Example #14
class Pipeline:
    """
    A pipeline is a definition of a series of processors that
    are to be executed in the same order as they are declared.
    (https://www.elastic.co/guide/en/elasticsearch/reference/current/pipeline.html)

    Parameters
    ----------

    client: elasticsearch.Elasticsearch
        an Elasticsearch client.
    name: str
        name of the pipeline.
    pipeline_handler: :obj:`PipelineHandler`
        object that holds the JSON definition of the pipeline.

    Attributes
    ----------

    client: elasticsearch.Elasticsearch
    name: str
    pipeline_handler: :obj:`PipelineHandler`
    """
    def __init__(self, client, name, pipeline_handler):
        self._pipeline_handler = pipeline_handler
        self._name = name
        self.ingest_client = IngestClient(client)

    def create_pipeline(self, params=None):
        """
        Create the Elasticsearch pipeline with the processors
        specified in the pipeline_handler JSON.
        """
        try:
            self.ingest_client.put_pipeline(
                self._name, load_json(self._pipeline_handler._json))
        except Exception:
            raise
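A minimal usage sketch, assuming PipelineHandler wraps a path to a pipeline definition in its _json attribute and that load_json reads it into a dict; the constructor arguments and file path shown here are hypothetical.

from elasticsearch import Elasticsearch

es = Elasticsearch()
handler = PipelineHandler('pipelines/attachment.json')  # hypothetical constructor
attachment_pipeline = Pipeline(es, 'attachment', handler)
attachment_pipeline.create_pipeline()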
Example #15
 def createPreprocessor(self):
     dropPipeline = {
         "description":
         "drop not actulal addresses",
         "processors": [{
             "drop": {
                 "if": "ctx.curr_status  != '0' "
             }
         }, {
             "drop": {
                 "if": "ctx.act_status  != '1'"
             }
         }, {
             "drop": {
                 "if": "ctx.live_status  != '1'"
             }
         }]
     }
     IngestClient(ES).put_pipeline(id=PIPELINE, body=dropPipeline)
Example #16
class IngestConnector:
    def __init__(
            self,
            pipeline_id: str = "pdf_content",
            field: str = "data",
            pipeline_description: str = "Extracting info from pdf content"):
        self.pipeline_id: str = pipeline_id
        self.index_name: str = pipeline_id + "_index"
        self.field: str = field
        self.pipeline_description: str = pipeline_description

        self.ingest_client = IngestClient(current_app.elasticsearch)

    def create_pipeline(self):
        self.ingest_client.put_pipeline(id=self.pipeline_id,
                                        body={
                                            'description':
                                            self.pipeline_description,
                                            'processors': [{
                                                "attachment": {
                                                    "field": self.field
                                                }
                                            }]
                                        })

    def delete_pipeline(self):
        self.ingest_client.delete_pipeline(id=self.pipeline_id)

    def get_pipeline(self):
        return self.ingest_client.get_pipeline(id=self.pipeline_id)

    def add_to_index(self, id_: int, content: str, content_page: int,
                     content_paragraph: int):
        current_app.elasticsearch.index(
            index=self.index_name,
            id=id_,
            pipeline=self.pipeline_id,
            body={
                self.field:
                base64.b64encode(content.encode("utf-8")).decode("utf-8"),
                "content_page":
                content_page,
                "content_paragraph":
                content_paragraph,
            })

    def remove_from_index(self, id_: int):
        current_app.elasticsearch.delete(index=self.index_name, id=id_)

    def api_search(self, query: str):
        return current_app.elasticsearch.search(
            index=self.index_name,
            body={"query": {
                "match": {
                    "attachment.content": query
                }
            }})

    def search(self, query: str):
        search = self.api_search(query)

        ids = [int(hit['_id']) for hit in search['hits']['hits']]

        if len(ids) == 0:
            return None

        # preserve the Elasticsearch ranking when ordering the database results
        when = [(id_, pos) for pos, id_ in enumerate(ids)]

        res = KnowledgePdfContent.query.filter(
            KnowledgePdfContent.id.in_(ids)).order_by(
                db.case(when, value=KnowledgePdfContent.id)).all()
        return res[0] if len(res) > 0 else None
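A hedged usage sketch (it must run inside the Flask application context that provides current_app.elasticsearch; the ids and text are illustrative):

connector = IngestConnector()
connector.create_pipeline()
connector.add_to_index(id_=1, content="Searchable paragraph from a PDF",
                       content_page=3, content_paragraph=2)
# Returns the best-matching KnowledgePdfContent row, or None if nothing matched.
match = connector.search("searchable")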
Example #17
from blog.models import Article

import elasticsearch
from elasticsearch_dsl import Document, InnerDoc, Date, Integer, Long, Text, Object, GeoPoint, Keyword, Boolean
from elasticsearch_dsl import connections

from django.conf import settings

ELASTICSEARCH_ENABLED = hasattr(settings, 'ELASTICSEARCH_DSL')

if ELASTICSEARCH_ENABLED:
    connections.create_connection(
        hosts=[settings.ELASTICSEARCH_DSL['default']['hosts']])
    from elasticsearch import Elasticsearch

    es = Elasticsearch(settings.ELASTICSEARCH_DSL['default']['hosts'])
    from elasticsearch.client import IngestClient

    c = IngestClient(es)
    try:
        c.get_pipeline('geoip')
    except elasticsearch.exceptions.NotFoundError:
        c.put_pipeline('geoip',
                       body='''{
              "description" : "Add geoip info",
              "processors" : [
                {
                  "geoip" : {
                    "field" : "ip"
                  }
                }
              ]
            }''')
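A hedged sketch of how the pipeline is then used (the index name and IP are illustrative; the geoip processor relies on the bundled GeoIP database): indexing a document through the geoip pipeline adds a geoip object derived from the ip field.

es.index(index='article-views', pipeline='geoip',
         body={'ip': '8.8.8.8', 'article': 'hello-world'})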
Example #18
def setup_dga_model(ctx, model_tag, repo, model_dir, overwrite):
    """Upload ML DGA model and dependencies and enrich DNS data."""
    import io
    import requests
    import shutil
    import zipfile

    es_client: Elasticsearch = ctx.obj['es']
    client_info = es_client.info()
    license_client = LicenseClient(es_client)

    if license_client.get()['license']['type'].lower() not in ('platinum',
                                                               'enterprise'):
        client_error(
            'You must have a platinum or enterprise subscription in order to use these ML features'
        )

    # download files if necessary
    if not model_dir:
        if not model_tag:
            client_error(
                'model-tag or model-dir required to download model files')

        click.echo(f'Downloading artifact: {model_tag}')

        release_url = f'https://api.github.com/repos/{repo}/releases/tags/{model_tag}'
        release = requests.get(release_url)
        release.raise_for_status()
        assets = [
            a for a in release.json()['assets']
            if a['name'].startswith('ML-DGA') and a['name'].endswith('.zip')
        ]

        if len(assets) != 1:
            client_error(
                f'Malformed release: expected exactly one ML-DGA zip asset, found: {len(assets)}!'
            )

        zipped_url = assets[0]['browser_download_url']
        zipped = requests.get(zipped_url)
        z = zipfile.ZipFile(io.BytesIO(zipped.content))

        dga_dir = get_path('ML-models', 'DGA')
        model_dir = os.path.join(dga_dir, model_tag)
        os.makedirs(dga_dir, exist_ok=True)
        shutil.rmtree(model_dir, ignore_errors=True)
        z.extractall(dga_dir)
        click.echo(f'files saved to {model_dir}')

        # read files as needed
        z.close()

    def get_model_filename(pattern):
        paths = list(Path(model_dir).glob(pattern))
        if not paths:
            client_error(
                f'{model_dir} missing files matching the pattern: {pattern}')
        if len(paths) > 1:
            client_error(
                f'{model_dir} contains multiple files matching the pattern: {pattern}'
            )

        return paths[0]

    @contextmanager
    def open_model_file(name):
        pattern = expected_ml_dga_patterns[name]
        with open(get_model_filename(pattern), 'r') as f:
            yield json.load(f)

    model_id, _ = os.path.basename(
        get_model_filename('dga_*_model.json')).rsplit('_', maxsplit=1)

    click.echo(
        f'Setting up DGA model: "{model_id}" on {client_info["name"]} ({client_info["version"]["number"]})'
    )

    # upload model
    ml_client = MlClient(es_client)
    ingest_client = IngestClient(es_client)

    existing_models = ml_client.get_trained_models()
    if model_id in [
            m['model_id']
            for m in existing_models.get('trained_model_configs', [])
    ]:
        if overwrite:
            ctx.invoke(remove_dga_model,
                       model_id=model_id,
                       es_client=es_client,
                       ml_client=ml_client,
                       ingest_client=ingest_client,
                       force=True)
        else:
            client_error(
                f'Model: {model_id} already exists on stack! Try --overwrite to force the upload'
            )

    click.secho('[+] Uploading model (may take a while)')

    with open_model_file('model') as model_file:
        try:
            ml_client.put_trained_model(model_id=model_id, body=model_file)
        except elasticsearch.ConnectionTimeout:
            msg = 'Connection timeout, try increasing timeout using `es --timeout <secs> experimental setup_dga_model`.'
            client_error(msg)

    # install scripts
    click.secho('[+] Uploading painless scripts')

    with open_model_file('dga_ngrams_create') as painless_install:
        es_client.put_script(id='dga_ngrams_create', body=painless_install)
        # f'{model_id}_dga_ngrams_create'

    with open_model_file('dga_ngrams_transform_delete') as painless_delete:
        es_client.put_script(id='dga_ngrams_transform_delete',
                             body=painless_delete)
        # f'{model_id}_dga_ngrams_transform_delete'

    # Install ingest pipelines
    click.secho('[+] Uploading pipelines')

    def _build_es_script_error(err, pipeline_file):
        error = err.info['error']
        cause = error['caused_by']

        error_msg = [
            f'Script error while uploading {pipeline_file}: {cause["type"]} - {cause["reason"]}',
            ' '.join(f'{k}: {v}' for k, v in error['position'].items()),
            '\n'.join(error['script_stack'])
        ]

        return click.style('\n'.join(error_msg), fg='red')

    with open_model_file('dns_enrich_pipeline') as ingest_pipeline1:
        try:
            ingest_client.put_pipeline(id='dns_enrich_pipeline',
                                       body=ingest_pipeline1)
        except elasticsearch.RequestError as e:
            if e.error == 'script_exception':
                client_error(_build_es_script_error(e, 'ingest_pipeline1'),
                             e,
                             ctx=ctx)
            else:
                raise

    with open_model_file(
            'dns_dga_inference_enrich_pipeline') as ingest_pipeline2:
        try:
            ingest_client.put_pipeline(id='dns_dga_inference_enrich_pipeline',
                                       body=ingest_pipeline2)
        except elasticsearch.RequestError as e:
            if e.error == 'script_exception':
                client_error(_build_es_script_error(e, 'ingest_pipeline2'),
                             e,
                             ctx=ctx)
            else:
                raise

    click.echo('Ensure that you have updated your packetbeat.yml config file.')
    click.echo('    - reference: ML_DGA.md #2-update-packetbeat-configuration')
    click.echo(
        'Associated rules and jobs can be found under ML-experimental-detections releases in the repo'
    )
    click.echo('To upload rules, run: kibana upload-rule <ml-rule.toml>')
    click.echo(
        'To upload ML jobs, run: es experimental upload-ml-job <ml-job.json>')
Example #19
    def createElasticSearchIngestPipeline(self):
        esIngestClient = IngestClient(self.client)

        self.constructLanguagePipeline(
            esIngestClient, 'title_language_detector', 'Work title language detection',
            field='title.'
        )

        self.constructLanguagePipeline(
            esIngestClient, 'alt_title_language_detector', 'Work alt_title language detection',
            prefix='_ingest._value.'
        )

        self.constructLanguagePipeline(
            esIngestClient, 'edition_title_language_detector', 'Edition title language detection',
            prefix='_ingest._value.',
            field='title.'
        )

        self.constructLanguagePipeline(
            esIngestClient, 'edition_sub_title_language_detector', 'Edition subtitle language detection',
            prefix='_ingest._value.',
            field='sub_title.'
        )

        self.constructLanguagePipeline(
            esIngestClient, 'subject_heading_language_detector', 'Subject heading language detection',
            prefix='_ingest._value.',
            field='heading.'
        )

        esIngestClient.put_pipeline(
            id='foreach_alt_title_language_detector',
            body={
                'description': 'loop for parsing alt_titles',
                'processors': [
                    {
                        'foreach': {
                            'field': 'alt_titles',
                            'processor': {
                                'pipeline': {
                                    'name': 'alt_title_language_detector',
                                }
                            }
                        }
                    }
                ]
            }
        )

        esIngestClient.put_pipeline(
            id='edition_language_detector',
            body={
                'description': 'loop for parsing edition fields',
                'processors': [
                    {
                        'pipeline': {
                            'name': 'edition_title_language_detector',
                            'ignore_failure': True
                        }
                    },
                    {
                        'pipeline': {
                            'name': 'edition_sub_title_language_detector',
                            'ignore_failure': True
                        }
                    }
                ]
            }
        )

        esIngestClient.put_pipeline(
            id='language_detector',
            body={
                'description': 'Full language processing',
                'processors': [
                    {
                        'pipeline': {
                            'name': 'title_language_detector',
                            'ignore_failure': True
                        }
                    },
                    {
                        'pipeline': {
                            'name': 'foreach_alt_title_language_detector',
                            'ignore_failure': True
                        }
                    },
                    {
                        'foreach': {
                            'field': 'editions',
                            'processor': {
                                'pipeline': {
                                    'name': 'edition_language_detector',
                                    'ignore_failure': True
                                }
                            }
                        }
                    },
                    {
                        'foreach': {
                            'field': 'subjects',
                            'ignore_missing': True,
                            'processor': {
                                'pipeline': {
                                    'name': 'subject_heading_language_detector',
                                    'ignore_failure': True
                                }
                            }
                        }
                    }
                ]
            }
        )
Example #20
                "field_map": {}
            }
        }]
    },
    "docs": [{
        "_source": {
            "sl": 4.2,
            "sw": 3.9,
            "pl": 1.9,
            "pw": 0.4
        }
    }]
}

# simulate ingest pipeline
IngestClient(es).simulate(body=body)

# In[ ]:

# Lets include an english name to convert the predicted value back to english flower name

# set up an enrich index

# create docs mapping serialized values -> english name
mapping_index_name = model_id + '_mapping'
mapping_docs = []
now = datetime.now()

for pos, name in enumerate(labels.categories):
    mapping_docs.append({
        "_index": mapping_index_name,
Example #21
def deletePipelines():
    conn = get_connection()
    client = IngestClient(conn)
    client.delete_pipeline(id='rename_structure_unit_description')
Example #22
 def ingest_client(self) -> IngestClient:
     return IngestClient(self.es_client)
Example #23
 def __init__(self, client, name, pipeline_handler):
     self._pipeline_handler = pipeline_handler
     self._name = name
     self.ingest_client = IngestClient(client)
Example #24
import requests
from datetime import datetime
res = requests.get('http://localhost:9200')
print(res)

#connect to our cluster
from elasticsearch import Elasticsearch
from elasticsearch.client import IndicesClient
from elasticsearch.client import IngestClient

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
es_index = IndicesClient(es)
ex_indices = IngestClient(es)

import json
r = requests.get('http://localhost:9200')
i = 1
if r.status_code == 200:

    doc1 = {
        "description":
        "Extract attachment information from arrays",
        "processors": [{
            "foreach": {
                "field": "attachments",
                "processor": {
                    "attachment": {
                        "target_field": "_ingest._value.attachment",
                        "field": "_ingest._value.data"
                    }
                }
            }
        }]
    }

Example #25
            ds_doc['config']['database_ids'] = moldb_ids

            update_es_dataset(ds_doc, moldb_name_id_map)
            update_es_annotations(ds_doc, moldb_name_id_map_rev)
            update_db_dataset(ds_doc)
        except Exception as e:
            logger.warning(f'Failed to migrate dataset {ds_doc["id"]}: {e}')
            failed_datasets.append((ds_doc['id'], e))

    if failed_datasets:
        print('FAILED DATASETS:')
        for ds, err in failed_datasets:
            print(ds, err)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Migrate moldb names -> moldb ids')
    parser.add_argument('--config', default='conf/config.json')
    parser.add_argument('--where', help='SQL WHERE statement')
    parser.add_argument('--ds-ids', help='Dataset ids, comma separated list')
    args = parser.parse_args()

    with GlobalInit(args.config) as sm_config:
        es: Elasticsearch = init_es_conn(sm_config['elasticsearch'])
        ingest: IngestClient = IngestClient(es)

        ds_ids = args.ds_ids.split(',') if args.ds_ids else None
        migrate_moldbs(args.where, ds_ids)
Example #26
import json

import pymssql
import requests
from elasticsearch import Elasticsearch
from elasticsearch.client import IndicesClient
from elasticsearch.client import IngestClient

conn = pymssql.connect(server="40.71.86.193",
                       port=1433,
                       user="******",
                       password="******",
                       database="testCaseFinal")

cursor = conn.cursor()

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
es_index = IndicesClient(es)
ex_indices = IngestClient(es)

table_name = 'Attachment_sfdc'
query = "SELECT  Body FROM Attachment_sfdc "
var = cursor.execute(query)
data_tup_list = cursor.fetchall()
#print(data_tup_list)

r = requests.get('http://localhost:9200')
i = 1
if r.status_code == 200:
    '''
	doc1 = {

			"description" : "Extract attachment information from arrays",
			  "processors" : [
Example #27
def make_pipelines():
    with open(os.path.join(pipelines_dir, "pipelines.json")) as file:
        pipelines = json.load(file)
        ing_client = IngestClient(client)
        for key in pipelines.keys():
            print("Creating {0} created {1}".format(key,ing_client.put_pipeline(key, pipelines[key])))