except Exception as e:
    raise Exception("Unable to connect to Mongo: %s" % e)

if cfg.elasticsearch.get("enabled"):
    try:
        import elasticsearch
    except ImportError:
        raise Exception(
            "ElasticSearch is enabled but not installed, aborting!")

    hosts = []
    for host in cfg.elasticsearch.get("hosts", "127.0.0.1:9200").split(","):
        if host.strip():
            hosts.append(host.strip())

    ELASTIC = elasticsearch.Elasticsearch(hosts)
    ELASTIC_INDEX = cfg.elasticsearch.get("index", "cuckoo")
else:
    ELASTIC = None

MOLOCH_ENABLED = cfg.moloch.get("enabled")
MOLOCH_HOST = cfg.moloch.get("host")
MOLOCH_INSECURE = cfg.moloch.get("insecure")

# In case we have VPNs enabled we need to initialize through the following
# two methods as they verify the interaction with VPNs as well as gather
# which VPNs are available (for representation upon File/URL submission).
from lib.cuckoo.core.startup import init_rooter, init_routing

init_rooter()
init_routing()

#!/usr/bin/python
import elasticsearch
from elasticsearch_dsl import Search, A, Q
import dateutil.parser as parser

# logging.basicConfig(level=logging.WARN)

es = elasticsearch.Elasticsearch(
    ['https://gracc.opensciencegrid.org/q'],
    timeout=300, use_ssl=True, verify_certs=False)

osg_summary_index = 'gracc.osg.summary'

s = Search(using=es, index=osg_summary_index)
s = s.query("match", OIM_FQDN="osg.alice.ornl.gov")
s = s.query(Q("range", EndTime={"gte": "now-1M", "lt": "now"}))
s.aggs.bucket('jobs_per_day', 'date_histogram', field='EndTime', interval='day')\
    .metric('njobs', 'sum', field='Njobs')

response = s.execute()
print(response)

# iterate over the per-day buckets of the date_histogram aggregation
for tag in response.aggregations.jobs_per_day.buckets:
    print(tag.key, tag.njobs.value)

def main() -> None:
    """
    Run this command like:

        python -m fatcat_scholar.issue_db
    """

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    subparsers = parser.add_subparsers()

    parser.add_argument(
        "--db-file",
        help="sqlite3 database file to open",
        default=settings.SCHOLAR_ISSUEDB_PATH,
        type=str,
    )

    sub = subparsers.add_parser("init_db", help="create sqlite3 output file and tables")
    sub.set_defaults(func="init_db")

    sub = subparsers.add_parser(
        "load_pubs", help="update container-level stats from JSON file"
    )
    sub.set_defaults(func="load_pubs")
    sub.add_argument(
        "json_file",
        help="collection-level metadata, as JSON-lines",
        nargs="?",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )

    sub = subparsers.add_parser(
        "load_issues", help="update item-level stats from JSON file"
    )
    sub.set_defaults(func="load_issues")
    sub.add_argument(
        "json_file",
        help="item-level metadata, as JSON-lines",
        nargs="?",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )

    sub = subparsers.add_parser(
        "load_counts", help="update volume-level stats from elasticsearch endpoint"
    )
    sub.set_defaults(func="load_counts")

    args = parser.parse_args()
    if not args.__dict__.get("func"):
        parser.print_help(file=sys.stderr)
        sys.exit(-1)

    idb = IssueDB(args.db_file)
    api_conf = fatcat_openapi_client.Configuration()
    api_conf.host = settings.FATCAT_API_HOST
    api = fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(api_conf))
    es_client = elasticsearch.Elasticsearch(settings.ELASTICSEARCH_FATCAT_BASE)

    if args.func == "load_pubs":
        idb.load_pubs(args.json_file, api)
    elif args.func == "load_issues":
        idb.load_issues(args.json_file, es_client)
    elif args.func == "load_counts":
        idb.load_counts(es_client)
    else:
        func = getattr(idb, args.func)
        func()

import elasticsearch
from elasticsearch_dsl import Search
import pathlib

INDEX_NAME = 'index-2'
ELASTIC_HOST = 'http://localhost:9231/'

client = elasticsearch.Elasticsearch(hosts=[ELASTIC_HOST])

data_1 = {
    'id': 1,
    'name': 'Python Arabia',
    'tag': 'Python',
}
data_2 = {
    'id': 2,
    'name': 'Java for learn',
    'tag': 'Java',
}
data_3 = {
    'id': 2,
    'name': 'Elasticsearch',
    'tag': pathlib.Path('es.txt').read_text()
}

# add_data_1 = client.index(index=INDEX_NAME, body=data_1)
# print(add_data_1)
# add_data_2 = client.index(index=INDEX_NAME, body=data_2)

from pymongo import MongoClient
from elasticsearch import helpers
from bson import json_util
import json
import elasticsearch
import collections

# elasticsearch client
es = elasticsearch.Elasticsearch([{'host': '192.168.0.106', 'port': 9200}])

# connection to mongoDB
connection = MongoClient('192.168.0.58', 10080)

# getting collection
db = connection.pubmed.pubmed_data_total_simplified

objectId = []
j = 1
a = db.find()
for j in range(db.count()):
    A = a.next()
    x = str(A['_id'])
    if (j < 552301):
        j = j + 1
        continue
    if (j > 1000000):
        print j, 'first break'
        break
    objectId.append(x)
    j = j + 1

import argparse

import elasticsearch

parser = argparse.ArgumentParser(description='Process some integers.')
parser.add_argument('--host', dest='host', default=None,
                    help='Input URL or IPv4 for elasticsearch')
args = parser.parse_args()

print("Host provided:", args.host)
print("Running on elasticsearch-py version:", elasticsearch.__versionstr__)

if args.host:
    host = str(args.host)
    try:
        inputES = elasticsearch.Elasticsearch(host)
        print("Elasticsearch server details:", inputES.info())
        input_mappings = inputES.indices.get_mapping()
        count = 0
        for k, v in input_mappings.items():
            if len(v['mappings'].keys()) > 1:
                count += 1
                print("Index: ", k)
                print("Mappings", list(v['mappings'].keys()))
                print("\n")
        print("\nMultiple Mapping Indices Count:", count)
    except elasticsearch.ConnectionError as e:
        print(e)

def test_es_connect():
    es = elasticsearch.Elasticsearch(ES_URL)
    assert es.ping()
    assert es.indices.exists(index=Stage.ES_INDEX)

"""
elasticsearch search module
"""
import os
from typing import Dict, List

import elasticsearch as es

from config import config
from embedding import get_embedding

es_url = os.getenv("ES_URL")
es_user = os.getenv("ES_USER")
es_password = os.getenv("ES_PASSWORD")
index_name = os.getenv("INDEX_NAME")

es_auth = (es_user, es_password)
es_handler = es.Elasticsearch(es_url, port=443, http_auth=es_auth,
                              use_ssl=True, verify_certs=False)


def get_similar_documents(query: str, count: int) -> List[Dict[str, str]]:
    """
    get similar documents
    """
    index = config.get_es_index()
    embedded_query = get_embedding(query)
    knn_query = {"size": count,
                 "query": {"knn": {"embedding": {"vector": embedded_query, "k": count}}}}
    results = es_handler.search(index=index, body=knn_query)["hits"]["hits"]
    documents = []
    for res in results:
        doc = {"id": res["_id"], "score": res["_score"]}
        # collect the hit and return the full list
        documents.append(doc)
    return documents

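# Hypothetical usage of get_similar_documents(); a minimal sketch only, assuming
# the index above maps an "embedding" knn vector field and that get_embedding()
# returns a vector of the matching dimension. The query string and result fields
# printed here are illustrative, not part of the original module.
if __name__ == "__main__":
    for hit in get_similar_documents("open source search engines", count=5):
        print(hit["id"], hit["score"])
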
## Jesus M. Gonzalez-Barahona <*****@*****.**>
##

import email.utils

from perceval.backends.core.mbox import MBox
import elasticsearch

from pprint import pprint

# uri (label) for the mailing list to analyze
mbox_uri = 'http://mail-archives.apache.org/mod_mbox/httpd-announce/'
# directory for letting Perceval know where mbox archives are
# you need to have the archives to analyze there before running the script
mbox_dir = 'archives'

# ElasticSearch instance (url)
es = elasticsearch.Elasticsearch(['http://localhost:9200/'])

# Create the 'messages' index in ElasticSearch
try:
    es.indices.create('messages')
except elasticsearch.exceptions.RequestError:
    # the index already existed: delete it and create it again
    es.indices.delete('messages')
    es.indices.create('messages')

# create a mbox object, using mbox_uri as label, mbox_dir as directory to scan
repo = MBox(uri=mbox_uri, dirpath=mbox_dir)

# Fetch all messages as an iterator, and iterate it uploading to ElasticSearch
print('Analyzing mbox archives...')

# fetch all messages as an iterator
for message in repo.fetch():
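    # NOTE: the loop body is not part of the original snippet; what follows is a
    # minimal sketch of one way to index each message, assuming the Perceval item
    # layout (message headers under message['data']) and the 'messages' index
    # created above.
    summary = {
        'from': message['data'].get('From'),
        'subject': message['data'].get('Subject'),
        'date': message['data'].get('Date'),
    }
    # upload the per-message summary document to the 'messages' index
    es.index(index='messages', doc_type='summary', body=summary)
    pprint(summary)
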
def __init__(self, **kwargs):
    self._logger = logging.getLogger(__file__)
    self.client = elasticsearch.Elasticsearch(**kwargs)

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from flask import Flask, request, jsonify
from flask_cors import CORS, cross_origin
import elasticsearch
import json

app = Flask(__name__)
CORS(app)

es = elasticsearch.Elasticsearch(
    ['http://*****:*****@elasticsearch:8080'])


@app.route("/", methods=["GET"])
def alldata():
    res = es.search(index="booze", body={"query": {"match_all": {}}})
    return jsonify(res)


@app.route("/brandsComplete", methods=['GET'])
def brandsComplete():
    """Return unique brand names"""
    res = es.search(index="booze", body={
        "size": 0,
        "aggs": {
            "unique_brands": {
                "terms": {
                    "field": "brand_name.keyword",
def __init__(self, elastic_urls=['127.0.0.1:9200'], sqlite_db=None):
    self.elastic_urls = elastic_urls
    self.es_client = elasticsearch.Elasticsearch(self.elastic_urls)
    self.engine = create_engine(sqlite_db)

import sys
import time
from pprint import pprint

import elasticsearch

endpoint = sys.argv[1]
work_node = sys.argv[2]
pattern = sys.argv[3]
tgt_num_of_shards = int(sys.argv[4])
tgt_num_of_replicas = int(sys.argv[5])

(host, port) = endpoint.split(':')

es = elasticsearch.Elasticsearch(
    [
        {'host': host, 'port': port, 'timeout': 310}
    ]
)

indices = es.cat.indices(index=pattern, format="json")
open_indices = []
closed_indices = []
for index in indices:
    if index['index'].startswith('shrunk-'):
        continue
    elif index['status'] == 'close':
        closed_indices.append(index['index'])
    else:
        if int(index['pri']) == tgt_num_of_shards:
import os
import json
import string, random, tempfile
import time
from click import testing as clicktest
from mock import patch, Mock
import unittest

import elasticsearch

from . import CuratorTestCase
from . import testvars as testvars

import logging
logger = logging.getLogger(__name__)

host, port = os.environ.get('TEST_ES_SERVER', 'localhost:9200').split(':')
port = int(port) if port else 9200

global_client = elasticsearch.Elasticsearch(host=host, port=port)

EMPTY710ROUTING = {
    'allocation': {
        'include': {
            '_tier_preference': 'data_content'
        }
    }
}

delete_count_pattern = ('---\n'
                        'actions:\n'
                        '  1:\n'
                        '    description: "Delete indices as filtered"\n'
                        '    action: delete_indices\n'
                        '    options:\n'
import elasticsearch

es = elasticsearch.Elasticsearch()  # use default of localhost, port 9200

es.index(index='posts', doc_type='blog', id=2, body={
    'author': 'Benjamin Pollack',
    'blog': 'bitquabit',
    'title': 'Having Fun: Python and Elasticsearch',
    'topics': ['elasticsearch', 'python', 'parseltongue'],
    'awesomeness': 0.7
})
es.index(index='posts', doc_type='blog', id=3, body={
    'author': 'Benjamin Pollack',
    'blog': 'bitquabit',
    'title': 'How to Write Clickbait Titles About Git Being Awful Compared to Mercurial',
    'topics': ['mercurial', 'git', 'flamewars', 'hidden messages'],
    'awesomeness': 0.95
})

print(es.search(index='posts', q='author:"Benjamin Pollack"'))

import os
import logging

import elasticsearch
import requests
from selenium import webdriver
from fake_useragent import UserAgent
from moviepy.editor import VideoFileClip

from env import config
from qiniu import Qiniu
from utils.util import CheloExtendedLogger, count_str, query_cut, del_item_from_list
from utils.tool import obj2int, makedirs
from utils.video_filter import video_filter

requests.packages.urllib3.disable_warnings()

ua = UserAgent()
_qiniu = Qiniu()
base_url = os.path.split(os.path.realpath(__file__))[0]
es_client = elasticsearch.Elasticsearch(config.es_addr, timeout=30)

logging.setLoggerClass(CheloExtendedLogger)
douyin_logger = logging.getLogger("douyin_sync")

CHROME_DRIVER = os.path.join(base_url, 'utils/chromedriver')


class DouYin(object):
    ua = ua.random

    def __init__(self):
        option = webdriver.ChromeOptions()
        option.add_argument('--headless')
        option.add_argument('disable-infobars')
        option.add_argument('start-maximized')
        option.add_argument('--no-sandbox')
        option.add_argument('--no-zygote')

def __init__(self, index=None, doc_type=None):
    self.es = elasticsearch.Elasticsearch()
    self.esClient = elasticsearch.client.IndicesClient(self.es)

'''
Indexes dataset.json in the elastic search server
'''
import elasticsearch
import json

es = elasticsearch.Elasticsearch()

print 'Indexing the data to the Elastic Search Server.. Would take some time'

with open("dataset.json") as f:
    data = json.load(f)

num_rec = len(data['images'])
new_d = [{} for _ in xrange(num_rec)]

for _ in xrange(num_rec):
    new_d[_]['imgurl'] = data['images'][_]['filename']
    new_d[_]['description'] = data['images'][_]['sentences'][0]['raw']

for i in xrange(num_rec):
    es.index(index="desearch", doc_type="json", id=i, body={
        'imgurl': new_d[i]['imgurl'],
        'description': new_d[i]['description'],
        'idnum': i
    })

print 'Done!'

#!/usr/bin/env python
# -*- coding:utf8 -*-

import pdb

# connect to our cluster
import elasticsearch
es = elasticsearch.Elasticsearch([{'host': '127.0.0.1', 'port': 9200}])

# make sure ES is up and running
import requests
res = requests.get('http://127.0.0.1:9200')
print(res.content)

# let's iterate over swapi people documents and index them
import json

print "index!!!...."
i = 1
while i < 3:
    res = requests.get('http://swapi.co/api/people/' + str(i))
    if res.status_code == 200:
        source = json.loads(res.content)
        es.index(index='sw', doc_type='people', id=i, body=source)
        print 'index %d success! \n' % i
    i += 1
i = i - 1

import pprint

import elasticsearch

own_key = 'Get key from https://openweathermap.org/api'
news_key = 'Get key from https://newsapi.org/'

##### MAPPINGS ######
mapping = {
    "settings": {
        "max_docvalue_fields_search": 200,
        "number_of_shards": 1,
        "number_of_replicas": 0
    }
}

##### CREATE INDEX ######
try:
    es = elasticsearch.Elasticsearch(host, port=port, http_auth=(username, password))
except Exception as e:
    print(e)
    pprint.pprint("--------------")

try:
    es.indices.create(index=index_name, body=mapping, ignore=400)
except Exception as e:
    print(e)
    pprint.pprint("--------------")


####### HELPER FUNCTIONS #######
def reset():
    forecast_day["temp_min"] = 999

import elasticsearch as es
import elasticsearch_dsl as es_dsl
import numpy as np
import pandas as pd

import sys
import os

if '__file__' in vars():
    project_path = os.path.abspath(
        os.path.join(__file__, os.path.pardir, os.path.pardir, os.path.pardir))
    print('\n Adding path: ', project_path)
    sys.path.append(project_path)

con = es.Elasticsearch('192.168.1.66')

# Get the source labels
# Get the counts per source labels
# Filter per source label..

# Get "databases" .. or indices
res_indices = list(con.indices.get_alias('*').keys())

for i_ind in res_indices:
    print('Database: ', i_ind)
    temp = es_dsl.Search(using=con, index=i_ind)
    temp.query('match')
    max_count = temp.count()
    print('Size: ', max_count)

"""
@author: fangyucheng
"""

import json
import time

import elasticsearch
import elasticsearch.helpers

hosts = '192.168.17.11'
port = 80
user_id = 'fangyucheng'
password = '******'
http_auth = (user_id, password)

es = elasticsearch.Elasticsearch(hosts=hosts, port=port, http_auth=http_auth)

task_list = []
result_list = []

es_scan = elasticsearch.helpers.scan(es, index='target_releasers')
for line in es_scan:
    task_list.append(line)

print('the length of releaser is %s' % len(task_list))

bulk_all_body = ''
count = 0
for line in task_list:
    releaser_info = line['_source']
    platform = releaser_info['platform']

def get_elasticsearch_connection():
    es = elasticsearch.Elasticsearch([{
        'host': os.getenv('ELASTICSEARCH_HOST'),
        'port': os.getenv('ELASTICSEARCH_PORT')
    }])
    return es

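# Hypothetical usage of get_elasticsearch_connection(); a minimal sketch only.
# The environment variable values and the ping() check below are illustrative
# assumptions, not part of the original helper.
if __name__ == "__main__":
    import os
    os.environ.setdefault('ELASTICSEARCH_HOST', 'localhost')
    os.environ.setdefault('ELASTICSEARCH_PORT', '9200')

    conn = get_elasticsearch_connection()
    print(conn.ping())
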
def __init__(self, host, port):
    self.connection = elasticsearch.Elasticsearch(
        ['http://{esHost}:{esPort}'.format(esHost=host, esPort=port)],
        timeout=3000)

class FulltextHits(BaseModel):
    query_type: str
    count_returned: int
    count_found: int
    offset: int
    limit: int
    deep_page_limit: int
    query_time_ms: int
    query_wall_time_ms: int
    results: List[Any]


# global sync client connection
es_client = elasticsearch.Elasticsearch(settings.ELASTICSEARCH_QUERY_BASE, timeout=25.0)


def transform_es_results(resp: Response) -> List[dict]:
    # convert from ES objects to python dicts
    results = []
    for h in resp:
        r = h._d_
        # print(h.meta._d_)
        r["_highlights"] = []
        if "highlight" in dir(h.meta):
            highlights = h.meta.highlight._d_
            for k in highlights:
                r["_highlights"] += highlights[k]
        r["_collapsed"] = []
        r["_collapsed_count"] = 0

def index(log_obj=logger):
    user = request.args.get('user')
    # log_obj = logging.getLogger('service_logger')
    credentials = os.path.join(direc, 'Q_DSaaS_v1-40635af5d3e8.json')
    log_obj.info('Starting logger')

    # Data Sets [REPLACE BY HDFS File Access]
    image_path = os.path.join(direc, 'images/' + user)

    # Modules
    vision = google_vision(path_to_discovery_file=credentials, log_obj=log_obj)

    # Requests and Indexing
    response = vision.get_response(image_path)
    # new_db = es_instance.get_es()  # indexed elastic_search

    output = {}
    if 'logoAnnotations' in response['responses'][0]:
        output['Google_Vision'] = response['responses'][0]['logoAnnotations'][0]['description']
    else:
        output['Google_Vision'] = 'null'

    if 'textAnnotations' in response['responses'][0]:
        output['OCR'] = response['responses'][0]['textAnnotations'][0]['description']
        ocr = re.sub('[^A-Za-z0-9.//]', " ", output['OCR'])
        output['OCR'] = ocr
    else:
        output['OCR'] = 'null'

    # Test queries for elastic search
    res = elasticsearch.Elasticsearch(hosts='localhost:9200')
    log_obj.info('ES connection made')

    brand_search = res.search(index='brands',
                              body={'query': {'match': {'brand': output['OCR']}}})['hits']['hits']
    tag_search = res.search(index='tags',
                            body={"query": {"multi_match": {"query": output['OCR'],
                                                            "fields": "tagline",
                                                            "fuzziness": "AUTO"}}})['hits']['hits']

    brand_1, brand_2, brand_3 = brand_search[0]['_source']['brand'], brand_search[1]['_source']['brand'], brand_search[2]['_source']['brand']
    log_obj.info('Retrieved top 3 brands from ES')
    tag_1, tag_2, tag_3 = tag_search[0]['_source']['brand'], tag_search[1]['_source']['brand'], tag_search[2]['_source']['brand']
    log_obj.info('Retrieved top 3 tags from ES')

    output['Facebook URL'] = extract.get_facebook(output['OCR'])
    output['Website URL'] = extract.get_website_url(output['OCR'])

    if output['Google_Vision'] == 'null':
        prediction = rules.rule_ocr_tag(brand_1, brand_2, brand_3, tag_1, tag_2, tag_3)
        output['Brand Prediction'] = prediction
        log_obj.info('Brand-Tag Rule')
        # output['rule'] = 'third'
        return jsonify(output)

    prediction = rules.rule_tag_logo(tag_1, tag_2, tag_3, output['Google_Vision'])
    if prediction != 'null':
        output['Brand Prediction'] = rules.rule_tag_logo(tag_1, tag_2, tag_3, output['Google_Vision'])
        log_obj.info('Tag-Logo Rule')
        # output['rule'] = 'second'
        return jsonify(output)
    else:
        output['Brand Prediction'] = rules.rule_brand_logo(brand_1, brand_2, brand_3, output['Google_Vision'])
        log_obj.info('Brand-Logo Rule')
        # output['rule'] = 'first'
        return jsonify(output)

from flask import Flask
from flask import request
from flask import render_template
from flask import jsonify
from stormpath.client import Client as StormpathClient
import elasticsearch

from alliterativeanimals import get_name

import settings

# Flask
app = Flask(__name__)

# Elastic Search
es = elasticsearch.Elasticsearch([settings.ELASTICSEARCH_URL])
INDEX = "main"
DOC_TYPE = "idea"
"""
es.search(index="main", body={"query":{"fuzzy":{"_all":"run"}}});
"""

# Stormpath
stormpath_client = StormpathClient(
    api_key_id=settings.STORMPATH_API_KEY_ID,
    api_key_secret=settings.STORMPATH_API_KEY_SECRET)
stormpath_application = stormpath_client.applications.search(
    settings.STORMPATH_APPLICATION_NAME)[0]

# STATIC FILES

def main(argv):
    global chatty
    start = ""
    end = ""
    duration = ""
    bisection = 0
    bisection_max = 10
    min_step = datetime.timedelta(minutes=5)

    try:
        opts, args = getopt.getopt(
            argv, "hqs:d:t:",
            ["help", "quiet", "start=", "duration=", "steps="])
    except getopt.GetoptError:
        eprint('Error: Unrecognized option!')
        eprint('categorization.py [-h] [-q] [-s <start> -d <duration>] [-t <steps>]')
        eprint('Use -h for help.')
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print('categorization.py [-h] [-s <start> -d <duration>]')
            print(' -h: print this text')
            print(' -s <start>: start date and time in elasticsearch time format')
            print(' -d <duration>: duration format <number>[mhdw]')
            print(' -t <steps>: steps format <number>[mhdw]')
            print(' If Elasticsearch query isn\'t responded in time, step size will be automatically adjusted by bisection.')
            print(' -q: no output except errors')
            print(' --help: same as -h')
            print(' --start <start>: same as -s')
            print(' --duration <duration>: same as -d')
            print(' --steps <steps>: same as -t')
            print(' --quiet: same as -q')
            print('')
            print(' If start and duration are omitted, the last 24 hours will be used.')
            sys.exit(0)
        elif opt in ("-q", "--quiet"):
            chatty = False
        elif opt in ("-s", "--start"):
            try:
                start_dt = datetime.datetime.strptime(arg, "%Y-%m-%dT%H:%M")
                start = arg
            except ValueError:
                eprint('Error: Invalid option start!')
                eprint('categorization.py [-h] [-q] [-s <start> -d <duration>] [-t <steps>]')
                eprint('<start> format: yyyy-mm-ddThh:mm')
                eprint('Use -h for help.')
                sys.exit(2)
        elif opt in ("-d", "--duration"):
            match = re.match("(\d+)([mhdw])$", arg)
            if match:
                (x, c) = match.groups()
                try:
                    y = int(x)
                except:
                    eprint('Error: Invalid option duration!')
                    eprint('categorization.py [-h] [-q] [-s <start> -d <duration>] [-t <steps>]')
                    eprint('<duration> format: <number>[mhdw]')
                    eprint('Use -h for help.')
                    sys.exit(2)
                if c == 'm':
                    delta = datetime.timedelta(minutes=y)
                elif c == 'h':
                    delta = datetime.timedelta(hours=y)
                elif c == 'd':
                    delta = datetime.timedelta(days=y)
                elif c == 'w':
                    delta = datetime.timedelta(days=(7 * y))
                else:
                    eprint('Error: Invalid option duration!')
                    eprint('categorization.py [-h] [-q] [-s <start> -d <duration>] [-t <steps>]')
                    eprint('<duration> format: <number>[mhdw]')
                    eprint('Use -h for help.')
                    sys.exit(2)
                duration = arg
            else:
                eprint('Error: Invalid option duration!')
                eprint('categorization.py [-h] [-q] [-s <start> -d <duration>] [-t <steps>]')
                eprint('<duration> format: <number>[mhdw]')
                eprint('Use -h for help.')
                sys.exit(2)
        elif opt in ("-t", "--steps"):
            match = re.match("(\d+)([mhdw])$", arg)
            if match:
                (x, c) = match.groups()
                try:
                    y = int(x)
                except:
                    eprint('Error: Invalid option steps!')
                    eprint('categorization.py [-h] [-q] [-s <start> -d <duration>] [-t <steps>]')
                    eprint('<duration> format: <number>[mhdw]')
                    eprint('Use -h for help.')
                    sys.exit(2)
                if c == 'm':
                    step = datetime.timedelta(minutes=y)
                elif c == 'h':
                    step = datetime.timedelta(hours=y)
                elif c == 'd':
                    step = datetime.timedelta(days=y)
                elif c == 'w':
                    step = datetime.timedelta(days=(7 * y))
                else:
                    eprint('Error: Invalid option steps!')
                    eprint('categorization.py [-h] [-q] [-s <start> -d <duration>] [-t <steps>]')
                    eprint('<duration> format: <number>[mhdw]')
                    eprint('Use -h for help.')
                    sys.exit(2)
            else:
                eprint('Error: Invalid option steps!')
                eprint('categorization.py [-h] [-q] [-s <start> -d <duration>] [-t <steps>]')
                eprint('<steps> format: <number>[mhdw]')
                eprint('Use -h for help.')
                sys.exit(2)

    if ((start == "") and (duration == "")):
        end_dt = datetime.datetime.now()
        end = end_dt.strftime("%Y-%m-%dT%H:%M")
        delta = datetime.timedelta(days=1)
        start_dt = end_dt - delta
        start = start_dt.strftime("%Y-%m-%dT%H:%M")
    elif ((start == "") or (duration == "")):
        eprint('Error: Invalid option combination!')
        eprint('Start and duration must both be specified or omitted.')
        eprint('categorization.py [-h] [-q] [-s <start> -d <duration>] [-t <steps>]')
        eprint('Use -h for help.')
        sys.exit(2)
    else:
        end_dt = start_dt + delta
        end = end_dt.strftime("%Y-%m-%dT%H:%M")

    if ('step' not in vars()):
        step = delta
    if (end_dt - start_dt) < step:
        step = end_dt - start_dt

    es = elasticsearch.Elasticsearch(nodes)
    elasticsearch_version = get_elasticsearch_major_version(es)

    axes = [
        'entropy_peer_sessions', 'entropy_peer_packets', 'entropy_peer_bytes',
        'entropy_sport_sessions', 'entropy_sport_packets', 'entropy_sport_bytes',
        'entropy_dport_sessions', 'entropy_dport_packets', 'entropy_dport_bytes'
    ]

    host = {}
    peer = {}
    sport = {}
    dport = {}
    '''
    During data generation slight differences in timing may cause some hosts
    not to be listed in all dictionaries. These hosts will be ignored.
    '''
    ignore_hosts = []

    sets = [
        ['peer (1)', peer, "src", "SourceAddress", "dst", "DestinationAddress"],
        ['peer (2)', peer, "dst", "DestinationAddress", "src", "SourceAddress"],
        ['sport (1)', sport, "src", "SourceAddress", "sport", "SourcePort"],
        ['sport (2)', sport, "dst", "DestinationAddress", "dport", "DestinationPort"],
        ['dport (1)', dport, "src", "SourceAddress", "dport", "DestinationPort"],
        ['dport (2)', dport, "dst", "DestinationAddress", "sport", "SourcePort"]
    ]

    for s in sets:
        moving_start_dt = start_dt
        moving_end_dt = min(start_dt + step, end_dt)
        while True:
            try:
                start = moving_start_dt.strftime("%Y-%m-%dT%H:%M:%S")
                end = moving_end_dt.strftime("%Y-%m-%dT%H:%M:%S")
                print_or_quiet('%s Fetching %s data ... %s - %s' %
                               (strftime('%H:%M:%S'), s[0], start, end))
                query_into_dictionary(es, elasticsearch_version, start, end,
                                      host, s[1], s[2], s[3], s[4], s[5])
                moving_start_dt = moving_end_dt
                moving_end_dt += step
                if moving_start_dt >= end_dt:
                    break
                if moving_end_dt > end_dt:
                    moving_end_dt = end_dt
            except elasticsearch.exceptions.ConnectionTimeout as esect:
                bisection += 1
                if bisection <= bisection_max:
                    step = step // 2
                    if step < min_step:
                        eprint('Elasticsearch Connection Timeout. Minimum timeframe reached. Exiting ...')
                        sys.exit(3)
                    eprint('Elasticsearch Connection Timeout. Halving step size.')
                    moving_end_dt = moving_start_dt + step
                else:
                    eprint('%d. time Elasticsearch Connection Timeout. Exiting ...' % bisection)
                    sys.exit(3)

    print_or_quiet('%s Calculating sums ...' % (strftime('%H:%M:%S')))
    calculate_sums(host, 'peer', peer, ignore_hosts)
    calculate_sums(host, 'sport', sport, ignore_hosts)
    calculate_sums(host, 'dport', dport, ignore_hosts)

    print_or_quiet('%s Removing incomplete hosts ...' % (strftime('%H:%M:%S')))
    for h in ignore_hosts:
        try:
            del host[h]
        except KeyError:
            # host may appear more than once
            pass

    print_or_quiet('%s Calculating entropy ...' % (strftime('%H:%M:%S')))
    calculate_entropy(host, [[peer, 'peer'], [dport, 'dport'], [sport, 'sport']])

    print_or_quiet('%s Removing dictionaries ...' % (strftime('%H:%M:%S')))
    peer.clear()
    sport.clear()
    dport.clear()

    if (len(host) == 0):
        eprint("No data found. Exiting ...")
        sys.exit(1)

    print_or_quiet('%s Creating sample set ...' % (strftime('%H:%M:%S')))
    labels = host.keys()
    npa = create_np(host, axes)
    n_samples, n_features = npa.shape
    print_or_quiet(' samples: %d features: %d' % (n_samples, n_features))

    #########################
    # MeanShift
    #########################
    print_or_quiet('%s Calculating bandwidth ...' % (strftime('%H:%M:%S')))
    bandwidth = estimate_bandwidth(npa, quantile=bandwidth_quantile,
                                   n_samples=bandwidth_n_samples,
                                   random_state=bandwidth_random_state,
                                   n_jobs=bandwidth_n_jobs)
    if (bandwidth == 0.00000000):
        eprint('Useless bandwidth. Exiting ....')
        sys.exit(3)

    print_or_quiet('%s Calculating MeanShift ...' % (strftime('%H:%M:%S')))
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=meanshift_bin_seeding,
                   cluster_all=meanshift_cluster_all, n_jobs=meanshift_n_jobs)
    ms.fit(npa, npa.shape)
    n_clusters = len(numpy.unique(ms.labels_))

    print_or_quiet('%s Getting predictions ...' % (strftime('%H:%M:%S')))
    prediction = ms.predict(npa)
    ip = [y['ip_address'] for y in host.values()]
    ip_prediction = zip(ip, prediction)
    meanshift_output = {}
    for (ip, prediction) in ip_prediction:
        meanshift_output.setdefault(prediction, list()).append(ip)

    print_or_quiet('%s Writing MeanShift output file ...' % (strftime('%H:%M:%S')))
    with open(meanshift_outputfile, 'w') as fp:
        json.dump(
            {str(key): value for key, value in meanshift_output.iteritems()},
            fp)

    #########################
    # KMeans
    #########################
    print_or_quiet('%s Calculating KMeans ...' % (strftime('%H:%M:%S')))
    km = KMeans(n_clusters=kmeans_n_clusters)
    km.fit(npa, npa.shape)

    print_or_quiet('%s Getting predictions ...' % (strftime('%H:%M:%S')))
    prediction = km.predict(npa)
    ip = [y['ip_address'] for y in host.values()]
    ip_prediction = zip(ip, prediction)
    kmeans_output = {}
    for (ip, prediction) in ip_prediction:
        kmeans_output.setdefault(prediction, list()).append(ip)

    print_or_quiet('%s Writing KMeans output file ...' % (strftime('%H:%M:%S')))
    with open(kmeans_outputfile, 'w') as fp:
        json.dump(
            {str(key): value for key, value in kmeans_output.iteritems()},
            fp)

    #########################
    # AgglomerativeClustering
    #########################
    print_or_quiet('%s Calculating Agglomerative Clustering ...' % (strftime('%H:%M:%S')))
    ac = AgglomerativeClustering(n_clusters=agglomerative_n_clusters,
                                 affinity=agglomerative_affinity,
                                 linkage=agglomerative_linkage)
    prediction = ac.fit_predict(npa)
    ip = [y['ip_address'] for y in host.values()]
    ip_prediction = zip(ip, prediction)
    agglomerative_output = {}
    for (ip, prediction) in ip_prediction:
        agglomerative_output.setdefault(prediction, list()).append(ip)

    print_or_quiet('%s Writing Agglomerative Clustering output file ...' % (strftime('%H:%M:%S')))
    with open(agglomerative_outputfile, 'w') as fp:
        json.dump(
            {
                str(key): value
                for key, value in agglomerative_output.iteritems()
            }, fp)

    #########################
    # DBSCAN
    #########################
    print_or_quiet('%s Calculating DBSCAN ...' % (strftime('%H:%M:%S')))
    db = DBSCAN(eps=dbscan_eps, min_samples=dbscan_min_samples)
    prediction = db.fit_predict(npa)
    ip = [y['ip_address'] for y in host.values()]
    ip_prediction = zip(ip, prediction)
    dbscan_output = {}
    for (ip, prediction) in ip_prediction:
        dbscan_output.setdefault(prediction, list()).append(ip)

    print_or_quiet('%s Writing DBSCAN output file ...' % (strftime('%H:%M:%S')))
    with open(dbscan_outputfile, 'w') as fp:
        json.dump(
            {str(key): value for key, value in dbscan_output.iteritems()},
            fp)

def __init__(self, host_port, index_name=None, index_type=None):
    self.index_name = index_name
    self.index_type = index_type
    self.es = elasticsearch.Elasticsearch(host_port, timeout=3600)

def _connect_elastic(self, address):
    try:
        _es = elasticsearch.Elasticsearch([address])
        return _es
    except Exception as e:
        raise e