Example #1
# (the snippet begins mid try/except: a MongoDB connection attempt precedes this block)
except Exception as e:
    raise Exception("Unable to connect to Mongo: %s" % e)

if cfg.elasticsearch.get("enabled"):
    try:
        import elasticsearch
    except ImportError:
        raise Exception(
            "ElasticSearch is enabled but not installed, aborting!")

    hosts = []
    for host in cfg.elasticsearch.get("hosts", "127.0.0.1:9200").split(","):
        if host.strip():
            hosts.append(host.strip())

    ELASTIC = elasticsearch.Elasticsearch(hosts)
    ELASTIC_INDEX = cfg.elasticsearch.get("index", "cuckoo")
else:
    ELASTIC = None

MOLOCH_ENABLED = cfg.moloch.get("enabled")
MOLOCH_HOST = cfg.moloch.get("host")
MOLOCH_INSECURE = cfg.moloch.get("insecure")

# If VPNs are enabled we need to initialize them through the following two
# methods, as they verify the interaction with the VPNs and gather which
# VPNs are available (for display upon File/URL submission).
from lib.cuckoo.core.startup import init_rooter, init_routing

init_rooter()
init_routing()
#!/usr/bin/python

import elasticsearch
from elasticsearch_dsl import Search, A, Q
import dateutil.parser as parser

#logging.basicConfig(level=logging.WARN)
es = elasticsearch.Elasticsearch(['https://gracc.opensciencegrid.org/q'],
                                 timeout=300,
                                 use_ssl=True,
                                 verify_certs=False)
osg_summary_index = 'gracc.osg.summary'

s = Search(using=es, index=osg_summary_index)

s = s.query("match", OIM_FQDN="osg.alice.ornl.gov")
s = s.query(Q("range", EndTime={"gte": "now-1M", "lt": "now"}))
s.aggs.bucket('jobs_per_day', 'date_histogram', field='EndTime', interval='day')\
    .metric('njobs', 'sum', field='Njobs')
response = s.execute()
print(response)

for bucket in response.aggregations.jobs_per_day.buckets:
    print(bucket.key, bucket.njobs.value)
Example #3
def main() -> None:
    """
    Run this command like:

        python -m fatcat_scholar.issue_db
    """

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    subparsers = parser.add_subparsers()

    parser.add_argument(
        "--db-file",
        help="sqlite3 database file to open",
        default=settings.SCHOLAR_ISSUEDB_PATH,
        type=str,
    )

    sub = subparsers.add_parser("init_db", help="create sqlite3 output file and tables")
    sub.set_defaults(func="init_db")

    sub = subparsers.add_parser(
        "load_pubs", help="update container-level stats from JSON file"
    )
    sub.set_defaults(func="load_pubs")
    sub.add_argument(
        "json_file",
        help="collection-level metadata, as JSON-lines",
        nargs="?",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )

    sub = subparsers.add_parser(
        "load_issues", help="update item-level stats from JSON file"
    )
    sub.set_defaults(func="load_issues")
    sub.add_argument(
        "json_file",
        help="item-level metadata, as JSON-lines",
        nargs="?",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )

    sub = subparsers.add_parser(
        "load_counts", help="update volume-level stats from elasticsearch endpoint"
    )
    sub.set_defaults(func="load_counts")

    args = parser.parse_args()
    if not args.__dict__.get("func"):
        parser.print_help(file=sys.stderr)
        sys.exit(-1)

    idb = IssueDB(args.db_file)
    api_conf = fatcat_openapi_client.Configuration()
    api_conf.host = settings.FATCAT_API_HOST
    api = fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(api_conf))
    es_client = elasticsearch.Elasticsearch(settings.ELASTICSEARCH_FATCAT_BASE)

    if args.func == "load_pubs":
        idb.load_pubs(args.json_file, api)
    elif args.func == "load_issues":
        idb.load_issues(args.json_file, es_client)
    elif args.func == "load_counts":
        idb.load_counts(es_client)
    else:
        func = getattr(idb, args.func)
        func()
Example #4
import elasticsearch
from elasticsearch_dsl import Search
import pathlib

INDEX_NAME = 'index-2'

ELASTIC_HOST = 'http://localhost:9231/'

client = elasticsearch.Elasticsearch(hosts=[ELASTIC_HOST])

data_1 = {
    'id': 1,
    'name': 'Python Arabia',
    'tag': 'Python',
}

data_2 = {
    'id': 2,
    'name': 'Java for learn',
    'tag': 'Java',
}

data_3 = {
    'id': 3,
    'name': 'Elasticsearch',
    'tag': pathlib.Path('es.txt').read_text()
}

# add_data_1 = client.index(index= INDEX_NAME, body= data_1)
# print(add_data_1)
# add_data_2 = client.index(index= INDEX_NAME, body= data_2)
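The example imports Search from elasticsearch_dsl but stops before using it. A minimal, hypothetical continuation (reusing the client, index name and documents defined above; refresh=True only to make the demo immediately searchable) could index the sample data and query it back:

# Hypothetical continuation, not part of the original example.
for doc in (data_1, data_2, data_3):
    client.index(index=INDEX_NAME, body=doc, refresh=True)

s = Search(using=client, index=INDEX_NAME).query("match", tag="Python")
for hit in s.execute():
    print(hit.name, hit.tag)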
from pymongo import MongoClient
from elasticsearch import helpers
from bson import json_util
import json
import elasticsearch
import collections

# elasticsearch client
es = elasticsearch.Elasticsearch([{'host': '192.168.0.106', 'port': 9200}])

# connection to mongoDB
connection = MongoClient('192.168.0.58',10080)

# getting collection
db = connection.pubmed.pubmed_data_total_simplified

objectId = []
j= 1
a = db.find()
for j in range(db.count()):
    A = a.next()
    x = str(A['_id'])
    if(j<552301):
        j=j+1
        continue
    if(j>1000000):
        print(j, 'first break')
        break
    objectId.append(x)
    j=j+1
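elasticsearch.helpers and bson.json_util are imported above, but the snippet is cut off before any documents reach Elasticsearch. A rough sketch of the likely next step (the target index name and field handling are assumptions):

# Hypothetical continuation: bulk-index the selected Mongo documents.
from bson import ObjectId

def generate_actions():
    for oid in objectId:
        doc = db.find_one({'_id': ObjectId(oid)})
        # convert BSON-specific types (ObjectId, dates) to JSON-safe values
        source = json.loads(json_util.dumps(doc))
        source.pop('_id', None)
        # pre-7.x clusters may also need a '_type' entry here
        yield {'_index': 'pubmed', '_id': oid, '_source': source}

helpers.bulk(es, generate_actions())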
import argparse
import elasticsearch

parser = argparse.ArgumentParser(
    description='Inspect Elasticsearch indices for multiple mapping types.')

parser.add_argument('--host',
                    dest='host',
                    default=None,
                    help='Input URL or IPv4 for elasticsearch')

args = parser.parse_args()
print("Host provided:", args.host)

print("Running on elasticsearch-py version:", elasticsearch.__versionstr__)

if args.host:
    host = str(args.host)
    try:
        inputES = elasticsearch.Elasticsearch(host)
        print("Elasticsearch server details:", inputES.info())
        input_mappings = inputES.indices.get_mapping()
        count = 0
        for k, v in input_mappings.items():
            if len(v['mappings'].keys()) > 1:
                count += 1
                print("Index: ", k)
                print("Mappings", list(v['mappings'].keys()))
                print("\n")

        print("\nMutlitple Mapping Indices Count:", count)
    except elasticsearch.ConnectionError as e:
        print(e)
def test_es_connect():
    es = elasticsearch.Elasticsearch(ES_URL)
    assert es.ping()
    assert es.indices.exists(index=Stage.ES_INDEX)
Example #8
"""
elasticsearch search module
"""
import os
from typing import Dict, List

import elasticsearch as es
from config import config
from embedding import get_embedding

es_url = os.getenv("ES_URL")
es_user = os.getenv("ES_USER")
es_password = os.getenv("ES_PASSWORD")
index_name = os.getenv("INDEX_NAME")

es_auth = (es_user, es_password)
es_handler = es.Elasticsearch(es_url, port=443, http_auth=es_auth, use_ssl=True, verify_certs=False)


def get_similar_documents(query: str, count: int) -> List[Dict[str, str]]:
    """
    get similar documents
    """
    index = config.get_es_index()
    embedded_query = get_embedding(query)
    knn_query = {"size": count, "query": {"knn": {"embedding": {"vector": embedded_query, "k": count}}}}

    results = es_handler.search(index=index, body=knn_query)["hits"]["hits"]

    documents = []
    for res in results:
        doc = {"id": res["_id"], "score": res["_score"]}
##   Jesus M. Gonzalez-Barahona <*****@*****.**>
##

import email.utils

from perceval.backends.core.mbox import MBox
import elasticsearch
from pprint import pprint

# uri (label) for the mailing list to analyze
mbox_uri = 'http://mail-archives.apache.org/mod_mbox/httpd-announce/'
# directory telling Perceval where the mbox archives are;
# you need to have the archives to analyze there before running the script
mbox_dir = 'archives'
# ElasticSearch instance (url)
es = elasticsearch.Elasticsearch(['http://localhost:9200/'])

# Create the 'messages' index in ElasticSearch
try:
    es.indices.create('messages')
except elasticsearch.exceptions.RequestError:
    es.indices.delete('messages')
    es.indices.create('messages')

# create a mbox object, using mbox_uri as label, mbox_dir as directory to scan
repo = MBox(uri=mbox_uri, dirpath=mbox_dir)

# Fetch all messages as an iterator, and iterate it uploading to ElasticSearch
print('Analyzing mbox archives...')
# fetch all messages as an iterator
for message in repo.fetch():
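    # -- hypothetical loop body; the original snippet is cut off here --
    # index a small summary of each message; field and doc_type choices are assumptions
    summary = {
        'from': message['data'].get('From'),
        'subject': message['data'].get('Subject'),
        'date': message['data'].get('Date'),
    }
    es.index(index='messages', doc_type='summary', body=summary)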
 def __init__(self, **kwargs):
     self._logger = logging.getLogger(__file__)
     self.client = elasticsearch.Elasticsearch(**kwargs)
Example #11
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from flask import Flask, request, jsonify
from flask_cors import CORS, cross_origin
import elasticsearch
import json

app = Flask(__name__)
CORS(app)

es = elasticsearch.Elasticsearch(
    ['http://*****:*****@elasticsearch:8080'])


@app.route("/", methods=["GET"])
def alldata():
    res = es.search(index="booze", body={"query": {"match_all": {}}})
    return jsonify(res)


@app.route("/brandsComplete", methods=['GET'])
def brandsComplete():
    """Return unique brand names"""
    res = es.search(index="booze",
                    body={
                        "size": 0,
                        "aggs": {
                            "unique_brands": {
                                "terms": {
                                    "field": "brand_name.keyword",
Example #12
 def __init__(self, elastic_urls=['127.0.0.1:9200'], sqlite_db=None):
     self.elastic_urls = elastic_urls
     self.es_client = elasticsearch.Elasticsearch(self.elastic_urls)
     self.engine = create_engine(sqlite_db)
Example #13
import sys
import time
from pprint import pprint

import elasticsearch

endpoint = sys.argv[1]
work_node = sys.argv[2]
pattern = sys.argv[3]
tgt_num_of_shards = int(sys.argv[4])
tgt_num_of_replicas = int(sys.argv[5])

(host, port) = endpoint.split(':')

es = elasticsearch.Elasticsearch(
  [
    {'host': host, 'port': port, 'timeout': 310}
  ]
)

indices = es.cat.indices(index=pattern, format="json")

open_indices = []
closed_indices = []

for index in indices:
  if index['index'].startswith('shrunk-'):
    continue
  elif index['status'] == 'close':
    closed_indices.append(index['index'])
  else:
    if int(index['pri']) == tgt_num_of_shards:
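      # -- hypothetical completion; the original snippet is cut off here --
      # already at the target shard count, so nothing to shrink
      continue
    open_indices.append(index['index'])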
Example #14
import os
import json
import string, random, tempfile
import time
import unittest

import elasticsearch
from click import testing as clicktest
from mock import patch, Mock
from . import CuratorTestCase
from . import testvars as testvars

import logging
logger = logging.getLogger(__name__)

host, port = os.environ.get('TEST_ES_SERVER', 'localhost:9200').split(':')
port = int(port) if port else 9200

global_client = elasticsearch.Elasticsearch(host=host, port=port)

EMPTY710ROUTING = {
    'allocation': {
        'include': {
            '_tier_preference': 'data_content'
        }
    }
}

delete_count_pattern = ('---\n'
                        'actions:\n'
                        '  1:\n'
                        '    description: "Delete indices as filtered"\n'
                        '    action: delete_indices\n'
                        '    options:\n'
import elasticsearch
es = elasticsearch.Elasticsearch()  # use default of localhost, port 9200
es.index(index='posts', doc_type='blog', id=2, body={
    'author': 'Benjamin Pollack',
    'blog': 'bitquabit',
    'title': 'Having Fun: Python and Elasticsearch',
    'topics': ['elasticsearch', 'python', 'parseltongue'],
    'awesomeness': 0.7
})
es.index(index='posts', doc_type='blog', id=3, body={
    'author': 'Benjamin Pollack',
    'blog': 'bitquabit',
    'title': 'How to Write Clickbait Titles About Git Being Awful Compared to Mercurial',
    'topics': ['mercurial', 'git', 'flamewars', 'hidden messages'],
    'awesomeness': 0.95
})
print(es.search(index='posts', q='author:"Benjamin Pollack"'))
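On Elasticsearch 7 and later, mapping types were removed, so the doc_type argument is no longer accepted. A rough 7.x-style equivalent of the calls above (reusing the same client) simply drops it:

# Hypothetical 7.x-style version of the indexing call above; no doc_type.
es.index(index='posts', id=2, body={
    'author': 'Benjamin Pollack',
    'blog': 'bitquabit',
    'title': 'Having Fun: Python and Elasticsearch',
    'topics': ['elasticsearch', 'python', 'parseltongue'],
    'awesomeness': 0.7
})
print(es.search(index='posts', q='author:"Benjamin Pollack"'))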
import os
import logging

import requests
import elasticsearch
from selenium import webdriver
from fake_useragent import UserAgent
from moviepy.editor import VideoFileClip

from env import config
from qiniu import Qiniu
from utils.util import CheloExtendedLogger, count_str, query_cut, del_item_from_list
from utils.tool import obj2int, makedirs
from utils.video_filter import video_filter

requests.packages.urllib3.disable_warnings()

ua = UserAgent()
_qiniu = Qiniu()
base_url = os.path.split(os.path.realpath(__file__))[0]
es_client = elasticsearch.Elasticsearch(config.es_addr, timeout=30)
logging.setLoggerClass(CheloExtendedLogger)
douyin_logger = logging.getLogger("douyin_sync")
CHROME_DRIVER = os.path.join(base_url, 'utils/chromedriver')


class DouYin(object):
    ua = ua.random

    def __init__(self):
        option = webdriver.ChromeOptions()
        option.add_argument('--headless')
        option.add_argument('disable-infobars')
        option.add_argument('start-maximized')
        option.add_argument('--no-sandbox')
        option.add_argument('--no-zygote')
 def __init__(self, index=None, doc_type=None):
     self.es = elasticsearch.Elasticsearch()
     self.esClient = elasticsearch.client.IndicesClient(self.es)
''' 
Indexes dataset.json in the elastic search server
'''
import elasticsearch
import json
es = elasticsearch.Elasticsearch()

print('Indexing the data to the Elastic Search server... this may take some time')
with open("dataset.json") as f:
    data = json.load(f)
num_rec = len(data['images'])
new_d = [{} for _ in range(num_rec)]

for _ in range(num_rec):
    new_d[_]['imgurl'] = data['images'][_]['filename']
    new_d[_]['description'] = data['images'][_]['sentences'][0]['raw']
for i in range(num_rec):
    es.index(index="desearch", doc_type="json", id=i, body={
        'imgurl': new_d[i]['imgurl'],
        'description': new_d[i]['description'],
        'idnum': i
    })
print('Done!')
Example #19
#!/usr/bin/env python
#-*- coding:utf8 -*-

import pdb


#connect to our cluster
import elasticsearch
es = elasticsearch.Elasticsearch([{'host': '127.0.0.1', 'port': 9200}])


# make sure ES is up and running
import requests
res = requests.get('http://127.0.0.1:9200')
print(res.content)


#let's iterate over swapi people documents and index them
import json

print "index!!!...."
i = 1
while i < 3:
  res = requests.get('http://swapi.co/api/people/'+ str(i))
  if res.status_code == 200:
    source = json.loads(res.content)
    es.index(index='sw', doc_type='people', id=i, body=source)
    print('index %d success! \n' % i)
  i += 1

i = i-1
own_key = 'Get key from https://openweathermap.org/api'
news_key = 'Get key from https://newsapi.org/'

##### MAPPINGS ######
mapping = {
    "settings": {
        "max_docvalue_fields_search": 200,
        "number_of_shards": 1,
        "number_of_replicas": 0
    }
}

##### CREATE INDEX ######
try:
    es = elasticsearch.Elasticsearch(host,
                                     port=port,
                                     http_auth=(username, password))
except Exception as e:
    print(e)
    pprint.pprint("--------------")
try:
    es.indices.create(index=index_name, body=mapping, ignore=400)
except Exception as e:
    print(e)
    pprint.pprint("--------------")

####### HELPER FUNCTIONS #######


def reset():
    forecast_day["temp_min"] = 999
Example #21
import elasticsearch as es
import elasticsearch_dsl as es_dsl

import numpy as np
import pandas as pd

import sys
import os

if '__file__' in vars():
    project_path = os.path.abspath(
        os.path.join(__file__, os.path.pardir, os.path.pardir, os.path.pardir))
    print('\n Adding path: ', project_path)
    sys.path.append(project_path)

con = es.Elasticsearch('192.168.1.66')

# Get the source labels
# Get the counts per source labels
# Filter per source label..

# Get "databases" .. or indices
res_indices = list(con.indices.get_alias('*').keys())

for i_ind in res_indices:
    print('Database: ', i_ind)
    temp = es_dsl.Search(using=con, index=i_ind)

    # Search.query() returns a copy, so the original bare call had no effect;
    # match_all is the default query, so counting the unfiltered search is enough
    max_count = temp.count()
    print('Size: ', max_count)
"""
@author: fangyucheng
"""

import json
import time
import elasticsearch
import elasticsearch.helpers

hosts = '192.168.17.11'
port = 80
user_id = 'fangyucheng'
password = '******'
http_auth = (user_id, password)

es = elasticsearch.Elasticsearch(hosts=hosts, port=port, http_auth=http_auth)

task_list = []
result_list = []

es_scan = elasticsearch.helpers.scan(es, index='target_releasers')

for line in es_scan:
    task_list.append(line)
print('the number of releasers is %s' % len(task_list))

bulk_all_body = ''
count = 0
for line in task_list:
    releaser_info = line['_source']
    platform = releaser_info['platform']
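    # -- hypothetical continuation; the original snippet is cut off here --
    # build a newline-delimited bulk body; the target index name and batch size are assumptions
    action = {'index': {'_index': 'releaser_tasks'}}
    bulk_all_body += json.dumps(action) + '\n' + json.dumps(releaser_info) + '\n'
    count += 1
    if count % 1000 == 0:
        es.bulk(body=bulk_all_body)
        bulk_all_body = ''
        print('%s documents sent' % count)

if bulk_all_body:
    es.bulk(body=bulk_all_body)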
def get_elasticsearch_connection():
    es = elasticsearch.Elasticsearch([{
        'host': os.getenv('ELASTICSEARCH_HOST'),
        'port': os.getenv('ELASTICSEARCH_PORT')
    }])
    return es
Example #24
 def __init__(self, host, port):
     self.connection = elasticsearch.Elasticsearch(
         ['http://{esHost}:{esPort}'.format(esHost=host, esPort=port)],
         timeout=3000)
Example #25

class FulltextHits(BaseModel):
    query_type: str
    count_returned: int
    count_found: int
    offset: int
    limit: int
    deep_page_limit: int
    query_time_ms: int
    query_wall_time_ms: int
    results: List[Any]


# global sync client connection
es_client = elasticsearch.Elasticsearch(settings.ELASTICSEARCH_QUERY_BASE,
                                        timeout=25.0)


def transform_es_results(resp: Response) -> List[dict]:
    # convert from ES objects to python dicts
    results = []
    for h in resp:
        r = h._d_
        # print(h.meta._d_)
        r["_highlights"] = []
        if "highlight" in dir(h.meta):
            highlights = h.meta.highlight._d_
            for k in highlights:
                r["_highlights"] += highlights[k]
        r["_collapsed"] = []
        r["_collapsed_count"] = 0
Example #26
def index(log_obj=logger):
    user = request.args.get('user')
    #log_obj = logging.getLogger('service_logger')
    credentials = os.path.join(direc, 'Q_DSaaS_v1-40635af5d3e8.json')
    log_obj.info('Starting logger')
    #Data Sets [REPLACE BY HDFS File Access]
    image_path = os.path.join(direc, 'images/' + user)
    
    #Modules
    vision = google_vision(path_to_discovery_file=credentials, log_obj=log_obj)

    #Requests and Indexing
    response = vision.get_response(image_path)

    #new_db = es_instance.get_es() #indexed elastic_search

    output = {}

    if 'logoAnnotations' in response['responses'][0]:
        output['Google_Vision'] = response['responses'][0]['logoAnnotations'][0]['description']
    else:
        output['Google_Vision'] = 'null'

    if 'textAnnotations' in response['responses'][0]:
        output['OCR'] = response['responses'][0]['textAnnotations'][0]['description']
        ocr = re.sub('[^A-Za-z0-9.//]', " ", output['OCR'])
        output['OCR'] = ocr
    else:
        output['OCR'] = 'null'

    #Test queries for elastic search
    res = elasticsearch.Elasticsearch(hosts='localhost:9200')
    log_obj.info('ES connection made')
    brand_search = res.search(index='brands', body={'query': {'match':{'brand': output['OCR']}}})['hits']['hits']
    tag_search = res.search(index='tags', body={"query": { "multi_match": {"query" : output['OCR'], "fields" : "tagline", "fuzziness": "AUTO"}}})['hits']['hits']

    brand_1, brand_2, brand_3 = brand_search[0]['_source']['brand'], brand_search[1]['_source']['brand'], brand_search[2]['_source']['brand']
    log_obj.info('Retrieved top 3 brands from ES')
    tag_1, tag_2, tag_3 = tag_search[0]['_source']['brand'], tag_search[1]['_source']['brand'], tag_search[2]['_source']['brand']
    log_obj.info('Retrieved top 3 tags from ES')

    output['Facebook URL'] = extract.get_facebook(output['OCR'])
    output['Website URL'] = extract.get_website_url(output['OCR'])
   

    if output['Google_Vision'] == 'null':
        prediction = rules.rule_ocr_tag(brand_1, brand_2, brand_3, tag_1, tag_2, tag_3)
        output['Brand Prediction'] = prediction
        log_obj.info('Brand-Tag Rule')
        #output['rule'] = 'third'
        return jsonify(output)

    prediction = rules.rule_tag_logo(tag_1, tag_2, tag_3, output['Google_Vision'])
    
    if prediction != 'null':
        output['Brand Prediction'] = rules.rule_tag_logo(tag_1, tag_2, tag_3, output['Google_Vision'])
        log_obj.info('Tag-Logo Rule')
        #output['rule'] = 'second'
        return jsonify(output)
    else:
        output['Brand Prediction'] = rules.rule_brand_logo(brand_1, brand_2, brand_3, output['Google_Vision'])
        log_obj.info('Brand-Logo Rule')
        #output['rule'] = 'first'
        return jsonify(output)
Example #27
from flask import Flask
from flask import request
from flask import render_template
from flask import jsonify
from stormpath.client import Client as StormpathClient
import elasticsearch

from alliterativeanimals import get_name
import settings

# Flask
app = Flask(__name__)

# Elastic Search
es = elasticsearch.Elasticsearch([settings.ELASTICSEARCH_URL])
INDEX = "main"
DOC_TYPE = "idea"
"""
es.search(index="main", body={"query":{"fuzzy":{"_all":"run"}}});
"""

# Stormpath
stormpath_client = StormpathClient(
    api_key_id=settings.STORMPATH_API_KEY_ID,
    api_key_secret=settings.STORMPATH_API_KEY_SECRET)
stormpath_application = stormpath_client.applications.search(
    settings.STORMPATH_APPLICATION_NAME)[0]

# STATIC FILES
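The example stops before any routes are defined. A minimal, hypothetical route using the client above and the fuzzy query shown in the docstring (the route path and response shape are assumptions) might look like:

# Hypothetical route; not part of the original application.
@app.route("/ideas/search", methods=["GET"])
def search_ideas():
    q = request.args.get("q", "")
    res = es.search(index=INDEX, body={"query": {"fuzzy": {"_all": q}}})
    return jsonify(res["hits"]["hits"])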
def main(argv):
    global chatty

    start = ""
    end = ""
    duration = ""

    bisection = 0
    bisection_max = 10
    min_step = datetime.timedelta(minutes=5)

    try:
        opts, args = getopt.getopt(
            argv, "hqs:d:t:",
            ["help", "quiet", "start=", "duration=", "steps="])
    except getopt.GetoptError:
        eprint('Error: Unrecognized option!')
        eprint(
            'categorization.py [-h] [-q] [-s <start> -d <duration>] [-t <steps>]'
        )
        eprint('Use -h for help.')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print('categorization.py [-h] [-s <start> -d <duration>]')
            print('  -h:                    print this text')
            print(
                '  -s <start>:            start date and time in elasticsearch time format'
            )
            print('  -d <duration>:         duration format <number>[mhdw]')
            print('  -t <steps>:            steps format <number>[mhdw]')
            print(
                '                         If the Elasticsearch query isn\'t answered in time, the step size will be automatically adjusted by bisection.'
            )
            print('  -q:                    no output except errors')
            print('  --help:                same as -h')
            print('  --start <start>:       same as -s')
            print('  --duration <duration>: same as -d')
            print('  --steps <steps>:       same as -t')
            print('  --quiet:               same as -q')
            print('')
            print(
                ' If start and duration are omitted, the last 24 hours will be used.'
            )
            sys.exit(0)
        elif opt in ("-q", "--quiet"):
            chatty = False
        elif opt in ("-s", "--start"):
            try:
                start_dt = datetime.datetime.strptime(arg, "%Y-%m-%dT%H:%M")
                start = arg
            except ValueError:
                eprint('Error: Invalid option start!')
                eprint(
                    'categorization.py [-h] [-q] [-s <start> -d <duration>] [-t <steps>]'
                )
                eprint('<start> format: yyyy-mm-ddThh:mm')
                eprint('Use -h for help.')
                sys.exit(2)
        elif opt in ("-d", "--duration"):
            match = re.match(r"(\d+)([mhdw])$", arg)
            if match:
                (x, c) = match.groups()
                try:
                    y = int(x)
                except:
                    eprint('Error: Invalid option duration!')
                    eprint(
                        'categorization.py [-h] [-q] [-s <start> -d <duration>] [-t <steps>]'
                    )
                    eprint('<duration> format: <number>[mhdw]')
                    eprint('Use -h for help.')
                    sys.exit(2)
                if c == 'm':
                    delta = datetime.timedelta(minutes=y)
                elif c == 'h':
                    delta = datetime.timedelta(hours=y)
                elif c == 'd':
                    delta = datetime.timedelta(days=y)
                elif c == 'w':
                    delta = datetime.timedelta(days=(7 * y))
                else:
                    eprint('Error: Invalid option duration!')
                    eprint(
                        'categorization.py [-h] [-q] [-s <start> -d <duration>] [-t <steps>]'
                    )
                    eprint('<duration> format: <number>[mhdw]')
                    eprint('Use -h for help.')
                    sys.exit(2)
                duration = arg
            else:
                eprint('Error: Invalid option duration!')
                eprint(
                    'categorization.py [-h] [-q] [-s <start> -d <duration>] [-t <steps>]'
                )
                eprint('<duration> format: <number>[mhdw]')
                eprint('Use -h for help.')
                sys.exit(2)
        elif opt in ("-t", "--steps"):
            match = re.match(r"(\d+)([mhdw])$", arg)
            if match:
                (x, c) = match.groups()
                try:
                    y = int(x)
                except:
                    eprint('Error: Invalid option steps!')
                    eprint(
                        'categorization.py [-h] [-q] [-s <start> -d <duration>] [-t <steps>]'
                    )
                    eprint('<steps> format: <number>[mhdw]')
                    eprint('Use -h for help.')
                    sys.exit(2)
                if c == 'm':
                    step = datetime.timedelta(minutes=y)
                elif c == 'h':
                    step = datetime.timedelta(hours=y)
                elif c == 'd':
                    step = datetime.timedelta(days=y)
                elif c == 'w':
                    step = datetime.timedelta(days=(7 * y))
                else:
                    eprint('Error: Invalid option steps!')
                    eprint(
                        'categorization.py [-h] [-q] [-s <start> -d <duration>] [-t <steps>]'
                    )
                    eprint('<steps> format: <number>[mhdw]')
                    eprint('Use -h for help.')
                    sys.exit(2)
            else:
                eprint('Error: Invalid option steps!')
                eprint(
                    'categorization.py [-h] [-q] [-s <start> -d <duration>] [-t <steps>]'
                )
                eprint('<steps> format: <number>[mhdw]')
                eprint('Use -h for help.')
                sys.exit(2)

    if ((start == "") and (duration == "")):
        end_dt = datetime.datetime.now()
        end = end_dt.strftime("%Y-%m-%dT%H:%M")
        delta = datetime.timedelta(days=1)
        start_dt = end_dt - delta
        start = start_dt.strftime("%Y-%m-%dT%H:%M")
    elif ((start == "") or (duration == "")):
        eprint('Error: Invalid option combination!')
        eprint('Start and duration must both be specified or omitted.')
        eprint(
            'categorization.py [-h] [-q] [-s <start> -d <duration>] [-t <steps>]'
        )
        eprint('Use -h for help.')
        sys.exit(2)
    else:
        end_dt = start_dt + delta
        end = end_dt.strftime("%Y-%m-%dT%H:%M")

    if ('step' not in vars()):
        step = delta

    if (end_dt - start_dt) < step:
        step = end_dt - start_dt

    es = elasticsearch.Elasticsearch(nodes)

    elasticsearch_version = get_elasticsearch_major_version(es)

    axes = [
        'entropy_peer_sessions', 'entropy_peer_packets', 'entropy_peer_bytes',
        'entropy_sport_sessions', 'entropy_sport_packets',
        'entropy_sport_bytes', 'entropy_dport_sessions',
        'entropy_dport_packets', 'entropy_dport_bytes'
    ]

    host = {}
    peer = {}
    sport = {}
    dport = {}
    '''
    During data generation slight differences in timing
    may cause some hosts not to be listed in all dictionaries.
    These hosts will be ignored.
    '''
    ignore_hosts = []

    sets = [[
        'peer (1)', peer, "src", "SourceAddress", "dst", "DestinationAddress"
    ], ['peer (2)', peer, "dst", "DestinationAddress", "src", "SourceAddress"
        ], ['sport (1)', sport, "src", "SourceAddress", "sport", "SourcePort"],
            [
                'sport (2)', sport, "dst", "DestinationAddress", "dport",
                "DestinationPort"
            ],
            [
                'dport (1)', dport, "src", "SourceAddress", "dport",
                "DestinationPort"
            ],
            [
                'dport (2)', dport, "dst", "DestinationAddress", "sport",
                "SourcePort"
            ]]

    for s in sets:
        moving_start_dt = start_dt
        moving_end_dt = min(start_dt + step, end_dt)
        while True:
            try:
                start = moving_start_dt.strftime("%Y-%m-%dT%H:%M:%S")
                end = moving_end_dt.strftime("%Y-%m-%dT%H:%M:%S")
                print_or_quiet('%s Fetching %s data ... %s - %s' %
                               (strftime('%H:%M:%S'), s[0], start, end))
                query_into_dictionary(es, elasticsearch_version, start, end,
                                      host, s[1], s[2], s[3], s[4], s[5])
                moving_start_dt = moving_end_dt
                moving_end_dt += step
                if moving_start_dt >= end_dt:
                    break
                if moving_end_dt > end_dt:
                    moving_end_dt = end_dt
            except elasticsearch.exceptions.ConnectionTimeout as esect:
                bisection += 1
                if bisection <= bisection_max:
                    step = step // 2
                    if step < min_step:
                        eprint(
                            'Elasticsearch Connection Timeout. Minimum timeframe reached. Exiting ...'
                        )
                        sys.exit(3)
                    eprint(
                        'Elasticsearch Connection Timeout. Halving step size.')
                    moving_end_dt = moving_start_dt + step
                else:
                    eprint(
                        '%d. time Elasticsearch Connection Timeout. Exiting ...'
                        % bisection)
                    sys.exit(3)

    print_or_quiet('%s Calculating sums ...' % (strftime('%H:%M:%S')))
    calculate_sums(host, 'peer', peer, ignore_hosts)
    calculate_sums(host, 'sport', sport, ignore_hosts)
    calculate_sums(host, 'dport', dport, ignore_hosts)

    print_or_quiet('%s Removing incomplete hosts ...' % (strftime('%H:%M:%S')))
    for h in ignore_hosts:
        try:
            del host[h]
        except KeyError:
            # host may appear more than once
            pass

    print_or_quiet('%s Calculating entropy ...' % (strftime('%H:%M:%S')))
    calculate_entropy(host,
                      [[peer, 'peer'], [dport, 'dport'], [sport, 'sport']])

    print_or_quiet('%s Removing dictionaries ...' % (strftime('%H:%M:%S')))
    peer.clear()
    sport.clear()
    dport.clear()

    if (len(host) == 0):
        eprint("No data found. Exiting ...")
        sys.exit(1)

    print_or_quiet('%s Creating sample set ...' % (strftime('%H:%M:%S')))
    labels = host.keys()
    npa = create_np(host, axes)
    n_samples, n_features = npa.shape
    print_or_quiet('          samples: %d features:%d' %
                   (n_samples, n_features))

    #########################
    # MeanShift
    #########################
    print_or_quiet('%s Calculating bandwidth ...' % (strftime('%H:%M:%S')))
    bandwidth = estimate_bandwidth(npa,
                                   quantile=bandwidth_quantile,
                                   n_samples=bandwidth_n_samples,
                                   random_state=bandwidth_random_state,
                                   n_jobs=bandwidth_n_jobs)

    if (bandwidth == 0.00000000):
        eprint('Useless bandwidth. Exiting ....')
        sys.exit(3)

    print_or_quiet('%s Calculating MeanShift ...' % (strftime('%H:%M:%S')))
    ms = MeanShift(bandwidth=bandwidth,
                   bin_seeding=meanshift_bin_seeding,
                   cluster_all=meanshift_cluster_all,
                   n_jobs=meanshift_n_jobs)
    ms.fit(npa, npa.shape)
    n_clusters = len(numpy.unique(ms.labels_))

    print_or_quiet('%s Getting predictions ...' % (strftime('%H:%M:%S')))
    prediction = ms.predict(npa)
    ip = [y['ip_address'] for y in host.values()]
    ip_prediction = zip(ip, prediction)

    meanshift_output = {}
    for (ip, prediction) in ip_prediction:
        meanshift_output.setdefault(prediction, list()).append(ip)

    print_or_quiet('%s Writing MeanShift output file ...' %
                   (strftime('%H:%M:%S')))
    with open(meanshift_outputfile, 'w') as fp:
        json.dump(
            {str(key): value
             for key, value in meanshift_output.items()}, fp)

    #########################
    # KMeans
    #########################
    print_or_quiet('%s Calculating KMeans ...' % (strftime('%H:%M:%S')))
    km = KMeans(n_clusters=kmeans_n_clusters)
    km.fit(npa, npa.shape)

    print_or_quiet('%s Getting predictions ...' % (strftime('%H:%M:%S')))
    prediction = km.predict(npa)
    ip = [y['ip_address'] for y in host.values()]
    ip_prediction = zip(ip, prediction)

    kmeans_output = {}
    for (ip, prediction) in ip_prediction:
        kmeans_output.setdefault(prediction, list()).append(ip)

    print_or_quiet('%s Writing KMeans output file ...' %
                   (strftime('%H:%M:%S')))
    with open(kmeans_outputfile, 'w') as fp:
        json.dump(
            {str(key): value
             for key, value in kmeans_output.items()}, fp)

    #########################
    # AgglomerativeClustering
    #########################
    print_or_quiet('%s Calculating Agglomerative Clustering ...' %
                   (strftime('%H:%M:%S')))
    ac = AgglomerativeClustering(n_clusters=agglomerative_n_clusters,
                                 affinity=agglomerative_affinity,
                                 linkage=agglomerative_linkage)

    prediction = ac.fit_predict(npa)
    ip = [y['ip_address'] for y in host.values()]
    ip_prediction = zip(ip, prediction)

    agglomerative_output = {}
    for (ip, prediction) in ip_prediction:
        agglomerative_output.setdefault(prediction, list()).append(ip)

    print_or_quiet('%s Writing Agglomerative Clustering output file ...' %
                   (strftime('%H:%M:%S')))
    with open(agglomerative_outputfile, 'w') as fp:
        json.dump(
            {
                str(key): value
                for key, value in agglomerative_output.items()
            }, fp)

    #########################
    # DBSCAN
    #########################
    print_or_quiet('%s Calculating DBSCAN ...' % (strftime('%H:%M:%S')))
    db = DBSCAN(eps=dbscan_eps, min_samples=dbscan_min_samples)

    prediction = db.fit_predict(npa)
    ip = [y['ip_address'] for y in host.values()]
    ip_prediction = zip(ip, prediction)

    dbscan_output = {}
    for (ip, prediction) in ip_prediction:
        dbscan_output.setdefault(prediction, list()).append(ip)

    print_or_quiet('%s Writing DBSCAN output file ...' %
                   (strftime('%H:%M:%S')))
    with open(dbscan_outputfile, 'w') as fp:
        json.dump(
            {str(key): value
             for key, value in dbscan_output.items()}, fp)
Example #29
 def __init__(self, host_port, index_name=None, index_type=None):
     self.index_name = index_name
     self.index_type = index_type
     self.es = elasticsearch.Elasticsearch(host_port, timeout=3600)
Example #30
 def _connect_elastic(self, address):
     try:
         # note: the client constructor does not contact the cluster, so a bad
         # address typically only surfaces on the first request (e.g. ping())
         _es = elasticsearch.Elasticsearch([address])
         return _es
     except Exception as e:
         raise e