Esempi in Python per Elasticsearch.count, esempi in Python per elasticsearch.Elasticsearch.count

Esempio n. 1

0

Mostra file

File: ESQueryExecution.py Progetto: mayankkejriwal/pycharm-projects-ubuntu

def count_records_in_ebola_data():
    """Print the number of documents in the ``ebola_data`` index.

    Connects to the hard-coded Elasticsearch endpoint using HTTP basic auth,
    issues a count request with a match-all query and prints the ``count``
    field of the response.

    :return: None
    """
    # NOTE(review): endpoint and credentials are hard-coded in this function;
    # consider moving them to configuration.
    client = Elasticsearch(
        "http://52.7.75.159:9020/",
        connection_class=RequestsHttpConnection,
        http_auth=('lorelei', 'thorthor'),
    )
    body = {'query': ESQueryBuilders.build_match_all_query()}
    response = client.count(index='ebola_data', body=body)
    print(response['count'])

Esempio n. 2

0

Mostra file

File: upgrade-tests.py Progetto: 10061100/elasticsearch

def new_es_instance(num_nodes, http_port, timeout=30):
    """Return a client once ``num_nodes`` local nodes form a usable cluster.

    Makes up to ``timeout`` attempts, sleeping one second after each failed
    one. Each attempt connects to 127.0.0.1 on ports ``http_port`` through
    ``http_port + num_nodes - 1``, waits for the cluster health call to
    report ``num_nodes`` nodes and issues a count request.

    :param num_nodes: number of nodes expected to join the cluster
    :param http_port: HTTP port of the first node
    :param timeout: maximum number of attempts (roughly seconds)
    :return: the connected ``Elasticsearch`` client
    :raises AssertionError: if no attempt succeeded
    """
    logging.info("Waiting for %s nodes to join the cluster", num_nodes)
    hosts = [{"host": "127.0.0.1", "port": http_port + x} for x in range(num_nodes)]
    for _ in range(timeout):
        # TODO(simonw): ask Honza if there is a better way to do this?
        try:
            es = Elasticsearch(hosts)
            es.cluster.health(wait_for_nodes=num_nodes)
            es.count()  # can we actually search or do we get a 503? -- anyway retry
            return es
        except (ConnectionError, TransportError):
            # Deliberate best-effort: the node is not ready yet, retry below.
            pass
        time.sleep(1)
    # Raise explicitly instead of `assert False`: assert statements are removed
    # under `python -O`, which would make this function silently return None.
    raise AssertionError(
        "Timed out waiting for %s nodes for %s seconds" % (num_nodes, timeout))

Esempio n. 3

0

Mostra file

File: create-bwc-index.py Progetto: pombredanne/elasticsearch

def create_client(http_port, timeout=30):
    """Return a client once the local node answers requests.

    Makes up to ``timeout`` attempts, sleeping one second after each failed
    one. Each attempt connects to 127.0.0.1:``http_port``, waits for the
    cluster health call to report one node and issues a count request.

    :param http_port: HTTP port of the node
    :param timeout: maximum number of attempts (roughly seconds)
    :return: the connected ``Elasticsearch`` client
    :raises AssertionError: if no attempt succeeded
    """
    logging.info("Waiting for node to startup")
    for _ in range(timeout):
        # TODO: ask Honza if there is a better way to do this?
        try:
            client = Elasticsearch([{"host": "127.0.0.1", "port": http_port}])
            client.cluster.health(wait_for_nodes=1)
            client.count()  # can we actually search or do we get a 503? -- anyway retry
            return client
        except (ConnectionError, TransportError):
            # Deliberate best-effort: the node is not ready yet, retry below.
            pass
        time.sleep(1)
    # Raise explicitly instead of `assert False`: assert statements are removed
    # under `python -O`, which would make this function silently return None.
    raise AssertionError("Timed out waiting for node for %s seconds" % timeout)

Esempio n. 4

0

Mostra file

File: create_bwc_index.py Progetto: xushjie1987/elasticsearch-v2.1.1

def create_client(http_port=DEFAULT_HTTP_TCP_PORT, timeout=30):
  """Return a client once the local node answers requests.

  Makes up to ``timeout`` attempts, sleeping one second after each failed
  one. Each attempt connects to localhost:``http_port``, waits for the
  cluster health call to report one node and issues a count request.

  :param http_port: HTTP port of the node
  :param timeout: maximum number of attempts (roughly seconds)
  :return: the connected ``Elasticsearch`` client
  :raises AssertionError: if no attempt succeeded
  """
  logging.info('Waiting for node to startup')
  for _ in range(timeout):
    # TODO: ask Honza if there is a better way to do this?
    try:
      client = Elasticsearch([{'host': 'localhost', 'port': http_port}])
      client.cluster.health(wait_for_nodes=1)
      client.count() # can we actually search or do we get a 503? -- anyway retry
      return client
    except (ConnectionError, TransportError):
      # Deliberate best-effort: the node is not ready yet, retry below.
      pass
    time.sleep(1)
  # Raise explicitly instead of `assert False`: assert statements are removed
  # under `python -O`, which would make this function silently return None.
  raise AssertionError('Timed out waiting for node for %s seconds' % timeout)

Esempio n. 5

0

Mostra file

File: reindex.py Progetto: cyrillk/elastic-reindex

def main():
    """Report source/target document counts around an optional reindex.

    Parses the command line, prints the ``apply`` flag, prints the counts of
    the source and target indices, reindexes source into target when the flag
    is set, and prints both counts again.
    """
    args = parse_args()
    should_apply = args.apply
    print(should_apply)

    client = Elasticsearch([{'host': host}])

    print_count("Source [before]", client.count(index=source))
    print_count("Target [before]", client.count(index=target))

    # Only touch the target index when the caller asked for it explicitly.
    if args.apply:
        reindex(client, source, target, chunk_size=5000, scroll='30m')

    print_count("Source [after]", client.count(index=source))
    print_count("Target [after]", client.count(index=target))

Esempio n. 6

0

Mostra file

File: ElasticSearchLib.py Progetto: GLMeece/robotframework-elasticsearch

    def es_count(self,p_host,p_port,p_index,p_query=None):
        """
        === Returns the Number of Documents That Match a Query ===
        
        The result is the response from elastic search. The value is in the "count" field of the response.

        - ``p_host`` - Elasticsearch server
        - ``p_port`` - Port of the es server
        - ``p_index`` - Name of the index to query
        - ``p_query`` - Query to run

        | ${res} = | Es Count | localhost | 9200 | myIndex |  {"query": {"query_string": {"query": "searched value"}}} |

        ``${res}`` contains the number of docs

        Fails with an ``AssertionError`` when the client cannot be created
        (including a non-numeric port) or when the count request fails.
        """

        # Es client
        try:
            param = [{'host':p_host,'port':int(p_port)}]
            es = Elasticsearch(param)
        except Exception:
            # Format the message with `%`; passing the values as extra
            # arguments left the placeholders unexpanded. `%s` is used for
            # the port so a non-numeric port does not raise a second error here.
            raise AssertionError("Connection error on %s:%s" % (p_host, p_port))

        try:
            result = es.count(index=p_index, body=p_query)
        except Exception:
            raise AssertionError("Count error on %s:%s/%s for query : %s" % (p_host, p_port, p_index, p_query))

        return result['count']

Esempio n. 7

0

Mostra file

File: test_logstash.py Progetto: qubell/eggshell-white

 def _client_test_case(self, instance):
     """Check that the logger service indexed this instance's messages.

     Selects the environment services whose application name equals
     ``self.name``, reads the Elasticsearch host from the first one's
     ``logger.logger-server`` return value, and then, against today's (UTC)
     ``logstash-YYYY.MM.DD`` index:

     * asserts that at least two documents have ``instId == instance.id``;
     * runs a match-all search and asserts that each returned record has the
       expected ``@message`` and contains every expected field.

     A ``TransportError`` from Elasticsearch fails the test.
     """
     # Fields every stored log record is expected to carry.
     expected_keys = [
         "@severity",
         "@timestamp",
         "filename",
         "instId",
         "jobId",
         "stepId",
         "stepname",
         "host",
         "@message",
     ]
     try:
         loggers = [
             inst
             for inst in instance.environment.services
             if instance.organization.application(inst.applicationId).name == self.name
         ]
         host = loggers[0].returnValues["logger.logger-server"]
         es = Elasticsearch([{"host": host}])
         index_name = "logstash-" + datetime.utcnow().strftime("%Y.%m.%d")
         # count() returns a response dict; the number is under "count".
         # Comparing the dict itself to 2 made the assertion meaningless.
         records_count = es.count(
             index=index_name, body={"query": {"term": {"instId": instance.id}}}
         )["count"]
         self.assertTrue(records_count >= 2, "Expected at least two messages in index, got %s" % records_count)
         records = es.search(index=index_name, body={"query": {"match_all": {}}})["hits"]["hits"]
         for record in records:
             source = record["_source"]
             self.assertEqual(source["@message"], "Hello from execrun!")
             for key in expected_keys:
                 self.assertIn(
                     key, source, "Message saved to elasticsearch should contain field %s" % key
                 )
     except TransportError as e:
         self.fail("Can not retrieve count of log messages: %s %s" % (e.status_code, e.error))

Esempio n. 8

0

Mostra file

File: ElasticSearchLib.py Progetto: ravineon/robotframework-elasticsearch

    def es_count(self,p_host,p_port,p_index,p_query=None):
        """
        Returns the number of documents that match a query
        The result is the response from elastic search. The value is in the "count" field of the response.

        {p_host}   Elasticsearch server\n
        {p_port}   Port of the es server\n
        {p_index}  Name of the index to query\n
        {p_query}  Query to run\n

        | ${res} = | es count | localhost | 9200 | myIndex |  {"query":{"query_string":{"query": "searched value"}}} |

        ${res} contains the number of docs

        Fails with an AssertionError when the client cannot be created
        (including a non-numeric port) or when the count request fails.
        """

        # Es client
        try:
            param = [{'host':p_host,'port':int(p_port)}]
            es = Elasticsearch(param)
        except Exception:
            # Format the message with `%`; passing the values as extra
            # arguments left the placeholders unexpanded. `%s` is used for
            # the port so a non-numeric port does not raise a second error here.
            raise AssertionError("Connexion error on %s:%s" % (p_host, p_port))

        try:
            result = es.count(index=p_index, body=p_query)
        except Exception:
            raise AssertionError("Count error on %s:%s/%s for query : %s" % (p_host, p_port, p_index, p_query))

        return result['count']

Esempio n. 9

0

Mostra file

File: eshits2csv.py Progetto: andreapalaia/es-utils

def query_and_dump_reults(args):
    """Dump the documents matching a query to a CSV file.

    Reads ``hostname``, ``port``, ``index``, ``query``, ``doc_type``,
    ``target`` and ``fields`` from ``args``. A missing query defaults to
    match-all and a missing target to ``output.csv``. The hit count is fetched
    first to size the progress bar, then all hits are scanned and one row per
    hit is written with the comma-separated ``fields`` as header.
    """
    es = Elasticsearch([args.hostname + ':' + str(args.port)])

    query = args.query if args.query is not None else '{"query":{"match_all":{}}}'
    doc_type = args.doc_type  # may be None
    target = args.target if args.target is not None else "output.csv"

    nhits = es.count(index=args.index, body=query)['count']
    bar = progressbar.ProgressBar(max_value=nhits)

    hits = helpers.scan(es, index=args.index, query=query, doc_type=doc_type)
    fields = args.fields.split(',')
    with open(target, 'w') as csvfile:
        datawriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
        datawriter.writerow(fields)
        for counter, hit in enumerate(hits, 1):
            source = hit['_source']
            datawriter.writerow([get_var(source, field) for field in fields])
            bar.update(counter)
        bar.finish()

Esempio n. 10

0

Mostra file

File: ESio.py Progetto: GalakFayyar/TabordNG

    def count(self, p_index, p_query=None):
        """Gets the number of docs for a query

            p_index:    elasticsearch index where to query
            p_query:    the query to process (defaults to an empty body)

            return the number of docs from the index p_index and the query p_query

            Exits the process with EXIT_IO_ERROR if the client cannot be
            created; re-raises the underlying exception if the count fails.
        """
        # Build a fresh dict per call instead of sharing one mutable default.
        if p_query is None:
            p_query = {}

        # Defined before the try so the error handler can always log it.
        param = [{'host': self.host, 'port': self.port}]
        try:
            es = Elasticsearch(param)
            logger.info('Connected to ES Server: %s', json.dumps(param))
        except Exception as e:
            logger.error('Connection failed to ES Server : %s', json.dumps(param))
            logger.error(e)
            sys.exit(EXIT_IO_ERROR)

        try:
            result = es.count(index=p_index, body=p_query)
            logger.info('Count the number of items from %s for the query %s', p_index, p_query)
        except Exception as e:
            logger.error('Error querying the index %s with query %s', p_index, p_query)
            logger.error(e)
            # Previously execution fell through to `return result['count']`
            # with `result` unbound, masking the real error.
            raise

        return result['count']

Esempio n. 11

0

Mostra file

File: es_aws.py Progetto: jacksli/pythonscripts

 def get(self,indexs,logdir="/root",datadir="/opt"):
     """Export an index to a file and upload it when it has grown.

     The last exported document count is kept in ``logdir/<indexs>``. If the
     current count is not larger than that value, nothing is done and False
     is returned. Otherwise the new count is stored, the index is read in
     pages of 10 documents whose ``_source`` is appended (one per line) to
     ``datadir/<indexs>``, and that file is uploaded through ``awsglacier``
     and then removed. Elasticsearch errors are reported on stdout.

     :param indexs: name of the index to export
     :param logdir: directory holding the per-index count file
     :param datadir: directory for the temporary export file
     :return: False when there is nothing new to export, otherwise None
     """
     es=Elasticsearch(self.host)
     try:
         count=int(es.count(index=indexs)["count"])
         logfile=logdir+"/"+indexs
         if os.path.isfile(logfile):
             # `with` closes the handle even if the stored value is malformed.
             with open(logfile,"r") as log_fh:
                 value=int(log_fh.readline())
             if count<=value:
                 return False
         with open(logfile,"w") as log_fh:
             log_fh.write(str(count))
         # Floor division keeps the page count an int on Python 2 and 3.
         num=count//10
         datafile=datadir+"/"+indexs
         for page in range(num+1):
             rs=es.search(index=indexs,from_=page*10,size=10)
             with open(datafile,"a") as data_fh:
                 for doc in rs["hits"]["hits"]:
                     data_fh.write(str(doc["_source"])+"\n")
         if os.path.isfile(datafile):
             glacier=awsglacier(self.region,self.access,self.secret)
             glacier.uploadfile(datafile)
             os.remove(datafile)
     except ElasticsearchException:
         print("elasticsearch exceptiont")

Esempio n. 12

0

Mostra file

File: id_getter.py Progetto: cemkoc/GorReplay_and_Log

def get_ids_with_response_status(status):
    """Return the ids of documents in the ``gor`` index with a response status.

    Prints the total number of documents in the index, counts the documents
    whose ``Resp_Status`` matches ``status``, then fetches that many hits
    (ids only) in a single search and returns their ``_id`` values.

    :param status: response status to match; converted with ``str``
    :return: list of document ids
    """
    es = Elasticsearch(["http://localhost:9200"])

    total_num_docs = es.count(index="gor", body={"query": {"match_all": {}}})['count']
    print("The total number of docs is: " + str(total_num_docs))

    # Number of documents in the gor index matching the status query. A
    # variant that also excluded "octet-stream" Resp_Content-Type responses
    # was disabled in the original code.
    status_query = {"bool": {"must": {"match": {"Resp_Status": str(status)}}}}
    filtered_num = es.count(index="gor", body={"query": status_query})['count']

    # Ask for every matching hit at once; only the ids are needed.
    search_body = {"query": status_query, "size": int(filtered_num), "fields": ["_id"]}
    res = es.search(index="gor", doc_type="RequestResponse", body=search_body, request_timeout=300)
    print(str(filtered_num) + " documents in the index have response status of "+ str(status) + "...")

    return [hit['_id'] for hit in res['hits']['hits']]

Esempio n. 13

0

Mostra file

File: elastic.py Progetto: haleystorm/nets-pipeline

class ElasticClient:
    """Thin wrapper around an ``Elasticsearch`` client used by the pipeline."""

    def __init__(self, host: str, port: int):
        """Connect to ``host``:``port`` and log the server version and name.

        If Elasticsearch raises, the error is logged and the process exits.
        """
        try:
            self.es = Elasticsearch(hosts=[
                {'host': host,
                 'port': port}])
            info = self.es.info()
            logger.info("Connected to Elasticsearch v. %s, name: %s" % (info['version']['number'], info['name']))

        except ElasticsearchException as e:
            # The message needs a placeholder for the exception; without one
            # the logging module fails to format the record and the cause is
            # not logged.
            logger.error("Elasticsearch is not available. %s", e)
            # NOTE(review): exits with status 0 even though startup failed;
            # kept as-is because callers may rely on it - confirm.
            exit(0)

    def get_articles(self, index, doctype, batch_size):
        """Return up to ``batch_size`` hits that have no ``status`` field."""
        query = '{"query": { "bool": { "must_not": { "exists": { "field": "status" }}}}}'
        result = self.es.search(index=index, doc_type=doctype, size=batch_size, body=query)
        articles = result.get('hits').get('hits')
        return articles if articles is not None else []

    def count(self, index):
        """Return the number of documents in ``index``."""
        return self.es.count(index=index)['count']

    def info(self):
        """Return the server info response."""
        return self.es.info()

    def check_url(self, url: str, auth_index: str):
        """
        Private function to check if a URL appears in the database.

        Parameters
        ----------

        url: URL for the news stories to be scraped.

        auth_index: es index (also used as the document type)

        Returns
        -------

        found: Boolean.
                Indicates whether or not a URL was found in the database.
        """
        # size=0 and terminate_after=1: only whether a match exists matters.
        response = self.es.search(index=auth_index, doc_type=auth_index, body={
            "query":
                {
                    "match_phrase": {
                        "url": url
                    }
                }
        }, size=0, terminate_after=1, ignore_unavailable=True)

        return response["hits"]["total"] > 0

    def persist(self, index, doctype, payload):
        """Index ``payload`` as a new document."""
        self.es.index(index=index, doc_type=doctype, body=payload)

    def update(self, index, doctype, doc_id, payload):
        """Send ``payload`` as an update request body for document ``doc_id``."""
        self.es.update(index=index, doc_type=doctype, id=doc_id, body=payload)

Esempio n. 14

0

Mostra file

File: es-reindex.py Progetto: charmon79/PythonTest

def CheckStatus():
	"""Print copy progress and set ``finished`` once the counts match.

	Reads the module-level ``url``, ``source``, ``target`` and ``start_time``
	and sets the module-level ``finished`` flag to True when the target index
	holds as many documents as the source index.
	"""
	global start_time
	global finished
	es = Elasticsearch([url])
	source_total = es.count(index=source)['count']
	target_total = es.count(index=target)['count']
	elapsed = int(time.time() - start_time)
	print("copied "+str(target_total)+" of "+str(source_total)+" ( elapsed: "+str(elapsed)+" sec. )")
	# Compare the document counts only. The full count responses also carry
	# shard statistics, which can differ between two indices even when the
	# counts are equal, so comparing the raw responses could never finish.
	if source_total == target_total:
		finished = True

Esempio n. 15

0

Mostra file

File: cruncher_e2e.py Progetto: blagarde/riotscrape

 def test_from_redis_to_elasticsearch(self):
     '''
     Run the game cruncher, wait two seconds, then check that the users
     index contains exactly 10 documents.
     '''
     GameCruncher().crunch()
     sleep(2)
     response = Elasticsearch(ES_NODES).count(index=RIOT_USERS_INDEX)
     self.assertEqual(10, response['count'])

Esempio n. 16

0

Mostra file

File: es_mon.py Progetto: rackerlabs/heat-cloud-monitoring-plugins

def main():
    """Print a metric line with the number of recent documents in an index.

    Parses connection and query options from the command line, counts the
    documents whose ``--field`` value is ``>=`` the ``--range`` expression and
    prints ``metric item_count int <count>``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-H', '--host', default='localhost',
                        help='Elasticsearch host')
    # type=int keeps a port given on the command line consistent with the
    # integer default.
    parser.add_argument('-P', '--port', default=9200, type=int,
                        help='Elasticsearch '
                        'HTTP port')
    parser.add_argument('-s', '--ssl', default=False, action='store_true',
                        help='Use SSL for connection')
    parser.add_argument('-u', '--username', help='HTTP auth username')
    parser.add_argument('-p', '--password', help='HTTP auth password')
    parser.add_argument('-U', '--url_prefix', default='', help='URL prefix '
                        'for HTTP requests')
    parser.add_argument('-i', '--index', default='_all', help='Index that '
                        'should be searched. Default: _all')
    parser.add_argument('-f', '--field', default='@timestamp', help='Field the '
                        'range should be bound to. Default: @timestamp')
    parser.add_argument('-r', '--range', default='now-1h', help='Start time to '
                        'search back for entries. Default: now-1h')
    args = parser.parse_args()

    node = {
        'host': args.host,
        'port': args.port,
        'url_prefix': args.url_prefix,
        'use_ssl': args.ssl,
    }
    # Send credentials only when a username was supplied; previously the
    # literal string "None:None" was sent when no credentials were given.
    if args.username is not None:
        password = args.password if args.password is not None else ''
        node['http_auth'] = '{}:{}'.format(args.username, password)

    es = Elasticsearch([node])
    search_filter = {
        'query': {
            'filtered': {
                'filter': {
                    'range': {
                        args.field: {
                            'gte': args.range
                        }
                    }
                }
            }
        }
    }
    count = es.count(index=args.index, body=search_filter)
    print('metric item_count int {}'.format(count['count']))

Esempio n. 17

0

Mostra file

File: MappingAnalyses.py Progetto: mayankkejriwal/pycharm-projects-ubuntu

    def docs_density_frequency_statistics(elasticsearch_host, index_name, doc_type, output_file = None):
        """
        We will vary the 'density' (the percentage of exists queries that should match) and print out the CUMULATIVE
        frequency at each density point. Interpret the results with care.

        Because of timeout issues, this function has to be called in lags at times. Change the 'range' line
        in the function to start from where you left off the last time something crashes.
        :param elasticsearch_host: the elasticsearch host
        :param index_name: the name of the elasticsearch index
        :param doc_type: the type in the index
        :param output_file: if given, the density -> count mapping is written there as one JSON line;
        otherwise it is pretty-printed
        :return: None
        """
        webpage_properties = MappingAnalyses._get_list_of_all_webpage_properties()
        should = [TableFunctions.build_constant_score_exists_clause(prop)
                  for prop in webpage_properties]
        bool_query = BuildCompoundESQueries.BuildCompoundESQueries.build_bool_arbitrary(should = should)
        cumul_freq_dict = dict() # key is the 'density' in percent, value is total number of docs retrieved.
        es = Elasticsearch(elasticsearch_host)
        for i in range(0, 101, 5):
            bool_query['bool']['minimum_should_match'] = str(i) + '%'
            count = es.count(index=index_name, doc_type=doc_type, body={'query': bool_query})['count']
            print(str(i)+'\t'+str(count))
            cumul_freq_dict[i] = count
        # The 0% entry is replaced by the match-all count.
        match_all = {'query': TableFunctions.build_match_all_query()}
        cumul_freq_dict[0] = es.count(index=index_name, doc_type=doc_type, body=match_all)['count']
        if output_file:
            # `with` closes the file even if serialisation fails.
            with codecs.open(output_file, 'w', 'utf-8') as out:
                json.dump(cumul_freq_dict, out)
                out.write('\n')
        else:
            pprint.PrettyPrinter(indent=4).pprint(cumul_freq_dict)

Esempio n. 18

0

Mostra file

File: ads_app_count.py Progetto: hellowendy/Insight_Data_Science

def main():
    """Build a 360 x 150 table of AD_SHOW event counts and save it.

    For every column value (matched against ``ai``, 0-149) and row value
    (matched against ``cr``, 0-359) one count request is issued against the
    ``events-2015.05.*`` indices for ``et == AD_SHOW`` events, excluding
    ``fr == 'true'``. The table is written as a tab-separated file.
    """
    es = Elasticsearch([{'host': eslogin.host, 'port': eslogin.port}],
                       http_auth=(eslogin.user, eslogin.password))

    columns = range(150)
    index = range(360)
    df = pd.DataFrame(index=index, columns=columns).fillna(0)

    for col in columns:
        for ind in index:
            must = [
                {'match': {'ai': col}},
                {'match': {'cr': ind}},
                {'match': {'et': 'AD_SHOW'}},
            ]
            must_not = [{'match': {'fr': 'true'}}]
            body = {'query': {'bool': {'must': must, 'must_not': must_not}}}
            df.loc[ind, col] = es.count(index="events-2015.05.*", body=body)['count']

    df.to_csv("../data/ad_show_5_2015.tab", sep='\t')

Esempio n. 19

0

Mostra file

File: tasks.py Progetto: CenterForOpenScience/SHARE

def count_es(es_url, es_index, min_date, max_date):
    """Count ``creativeworks`` documents created within a date range.

    :param es_url: Elasticsearch URL; falls back to ``settings.ELASTICSEARCH['URL']`` when falsy
    :param es_index: index name; falls back to ``settings.ELASTICSEARCH['INDEX']`` when falsy
    :param min_date: inclusive lower bound (must provide ``isoformat()``)
    :param max_date: inclusive upper bound (must provide ``isoformat()``)
    :return: the ``count`` field of the response
    """
    url = es_url or settings.ELASTICSEARCH['URL']
    es_client = Elasticsearch(url, retry_on_timeout=True, timeout=settings.ELASTICSEARCH['TIMEOUT'])

    index = es_index or settings.ELASTICSEARCH['INDEX']
    created_between = {'gte': min_date.isoformat(), 'lte': max_date.isoformat()}
    body = {'query': {'range': {'date_created': created_between}}}

    response = es_client.count(index=index, doc_type='creativeworks', body=body)
    return response['count']

Esempio n. 20

0

Mostra file

File: esthroughput.py Progetto: anuragkh/sysbench

def main(argv):
  """Run the Elasticsearch benchmark with options parsed from ``argv``.

  Parses the options, fetches the document count of the index, builds one
  BenchmarkThread (with its own query list) per requested thread, starts
  them all and waits for them to finish.

  Exits with status 2 on an invalid option and with the default status
  after printing the usage line for ``-h``.

  :param argv: command-line arguments without the program name
  """
  es_server = 'localhost'
  query_file = ''
  index = 'bench'
  doc_type = 'data'
  bench_type = 'search'
  num_threads = 1
  help_msg = 'esbench.py -e <es-server> -q <queries> -i <index> -t <doc-type> -b <bench-type> -n <num-threads>'
  try:
    # 'n:' and 'es-server=' were missing, so -n was rejected as unknown and
    # --es-server could not take a value although both are handled below.
    opts, args = getopt.getopt(argv, 'he:q:i:t:b:n:',
                               ['es-server=', 'queries=', 'index=', 'type=', 'benchtype=', 'numthreads='])
  except getopt.GetoptError:
    print(help_msg)
    sys.exit(2)
  for opt, arg in opts:
    if opt == '-h':
      print(help_msg)
      sys.exit()
    elif opt in ('-e', '--es-server'):
      es_server = arg
    elif opt in ('-q', '--queries'):
      query_file = arg
    elif opt in ('-i', '--index'):
      index = arg
    elif opt in ('-t', '--type'):
      doc_type = arg
    elif opt in ('-b', '--benchtype'):
      bench_type = arg
    elif opt in ('-n', '--numthreads'):
      num_threads = int(arg)

  # This client is only used to read the record count; it is dropped before
  # the benchmark threads are created.
  es = Elasticsearch(hosts=['http://%s:9200' % es_server], timeout=600)
  count = es.count(index=index)['count']
  del es

  threads = []
  print('[Main Thread] Initializing %d threads...' % num_threads)
  for i in range(num_threads):
    queries = load_queries(bench_type=bench_type, query_file=query_file, record_count=count)
    thread = BenchmarkThread(thread_id=i, bench_type=bench_type, es_server=es_server, index=index, doc_type=doc_type,
                             queries=queries)
    threads.append(thread)

  print('[Main Thread] Starting threads...')
  for thread in threads:
    thread.start()

  print('[Main Thread] Waiting for threads to join...')
  for thread in threads:
    thread.join()

Esempio n. 21

0

Mostra file

File: index.py Progetto: mgax/hambar109

class Index(object):
    """Wrapper around one Elasticsearch index holding ``mof`` documents."""

    def __init__(self, name=None):
        """Create the wrapper with a default-configured client."""
        self.name = name
        self.doc_type = 'mof'
        self.es = Elasticsearch()

    def init_app(self, app):
        """Take the index name from the app's ``ES_INDEX`` config entry."""
        self.name = app.config['ES_INDEX']

    def initialize(self):
        """(Re)create the index, dropping an existing one of the same name."""
        existing = self.es.indices.get_aliases()
        if self.name in existing:
            print("deleting old index")
            self.drop()
        self.es.indices.create(index=self.name, body=ES_INDEX_BODY)

    def drop(self):
        """Delete the index."""
        self.es.indices.delete(self.name)

    def add(self, doc_id, data):
        """Index one document under ``doc_id`` and refresh the index."""
        self.es.index(
            index=self.name,
            doc_type=self.doc_type,
            id=doc_id,
            body=data,
        )
        self.es.indices.refresh(self.name)

    def bulk_add(self, documents):
        """Bulk-index ``(doc_id, data)`` pairs and refresh the index."""
        actions = (
            {
                '_id': doc_id,
                '_index': self.name,
                '_type': self.doc_type,
                '_source': data,
            }
            for doc_id, data in documents
        )
        helpers.bulk_index(client=self.es, docs=actions, raise_on_error=True)
        self.es.indices.refresh(self.name)

    def count(self):
        """Return the number of documents in the index."""
        response = self.es.count(index=self.name)
        return response['count']

    def search(self, query):
        """Run ``query`` against the index and return the raw response."""
        body = {'query': query}
        return self.es.search(index=self.name, body=body)

Esempio n. 22

0

Mostra file

File: resultdb.py Progetto: yangjy15033/pyspider

class ResultDB(BaseResultDB):
    """Result store that keeps task results in an Elasticsearch index.

    The index is named by ``database``; each project is used as the document
    type and the task id as the document id.
    """
    collection_prefix = ''

    def __init__(self, url, database='resultdb'):
        """Connect with a default-configured client.

        NOTE(review): ``url`` is accepted but not used by this constructor.
        """
        self.conn = Elasticsearch()
        self.database = database

    def _parse(self, data):
        """Return the stored document (``_source``) of a search hit."""
        return data["_source"]

    def _stringify(self, data):
        """JSON-encode the ``result`` entry of ``data`` in place, if present."""
        if 'result' in data:
            data['result'] = json.dumps(data['result'])
        return data

    def save(self, project, taskid, url, result):
        """Index one result document stamped with the current time."""
        document = {
            'taskid': taskid,
            'url': url,
            'result': result,
            'updatetime': time.time(),
        }
        return self.conn.index(index=self.database, doc_type=project, id=taskid, body=document)

    def select(self, project, fields=None, offset=0, limit=0):
        """Return a page of stored documents; ``limit`` 0 means 10."""
        if limit == 0:
            limit = 10
        response = self.conn.search(index=self.database, doc_type=project, fields=fields,
                                    _source=True, from_=offset, size=limit)
        return [self._parse(hit) for hit in response["hits"]["hits"]]

    def count(self, project):
        """Return the number of stored results for ``project``."""
        response = self.conn.count(index=self.database, doc_type=project)
        return response['count']

    def get(self, project, taskid, fields=None):
        """Return the stored document for ``taskid``; ``fields`` is not used."""
        return self.conn.get_source(index=self.database, doc_type=project, id=taskid)

Esempio n. 23

0

Mostra file

File: es.py Progetto: laco/python-nimoy

class ElasticsearchBackend(BaseBackend):
    """Backend that stores each schema in an index of the same name.

    The schema name is used both as index and as document type. Document ids
    are the primary-key values joined with ``_n_``.
    """

    def __init__(self, conn):
        """Create the client from ``conn.options['elasticsearch']``.

        :raises ImportError: if the elasticsearch library is not available
        """
        if Elasticsearch is None:
            raise ImportError("Plz. install elasticsearch library for ElasticsearchBackend.")
        self._es = Elasticsearch(**conn.options.get('elasticsearch', {}))
        super().__init__(conn)

    def _gen_es_id_for_data(self, schema_name, _data):
        """Build the document id from the primary-key values in ``_data``."""
        key_names = self._conn.schema.get_primary_key(schema_name)
        return '_n_'.join(str(_data[key]) for key in key_names)

    def _gen_es_id_for_id(self, _id):
        """Build the document id from a caller-supplied id.

        Strings and other scalars are returned unchanged; tuples and lists
        are joined with ``_n_``.
        """
        if isinstance(_id, (tuple, list)):
            # Convert each part with str(), as _gen_es_id_for_data does, so
            # composite keys with non-string parts do not raise TypeError and
            # map to the same id that was used when the item was stored.
            return '_n_'.join(str(part) for part in _id)
        return _id

    def put_item(self, schema_name, _data, overwrite=False):
        """Store ``_data``; with ``overwrite`` False the op type is ``create``.

        :return: True if the response carries a positive ``_version``
        """
        op_type = 'create' if not overwrite else 'index'
        result = self._es.index(index=schema_name, doc_type=schema_name, id=self._gen_es_id_for_data(schema_name, _data), body=_data, op_type=op_type)
        return result.get('_version', 0) > 0

    def get_item(self, schema_name, _id):
        """Return the stored document for ``_id``.

        :raises ItemNotFound: if Elasticsearch reports the document missing
        """
        try:
            result = self._es.get(index=schema_name, doc_type=schema_name, id=self._gen_es_id_for_id(_id))
        except NotFoundError:
            raise ItemNotFound("Item not found for id {} in {}.".format(_id, schema_name))
        return result['_source']

    def delete_item(self, schema_name, _id):
        """Delete the document for ``_id``; return the response's ``found`` flag."""
        result = self._es.delete(index=schema_name, doc_type=schema_name, id=self._gen_es_id_for_id(_id))
        return result['found'] is True

    def query(self, schema_name, _w, limit=10):
        """Alias for :meth:`scan`."""
        return self.scan(schema_name, _w, limit)

    def scan(self, schema_name, _w, limit=10):
        """Return up to ``limit`` documents matching the condition ``_w``."""
        query = elastic_parse_wt(_w, {})
        query["size"] = limit
        result = self._es.search(index=schema_name, doc_type=schema_name, body=query)
        return [hit['_source'] for hit in result["hits"]["hits"]]

    def query_count(self, schema_name, _w):
        """Return the number of documents matching the condition ``_w``."""
        query = elastic_parse_wt(_w, {})
        result = self._es.count(index=schema_name, doc_type=schema_name, body=query)
        return result.get('count', 0)

Esempio n. 24

0

Mostra file

File: views.py Progetto: moinfar/ResearchGate-Analyser

def indexing_status_page(request, id):
    """Render the indexing progress of a crawl as HTML or JSON.

    The percentage is the number of indexed documents relative to the crawl's
    successful crawls, with a minimum of 1; it is 0 when it cannot be
    computed. With ``?type=JSON`` a JSON payload is returned, otherwise the
    ``indexing_status.html`` template is rendered.
    """
    es = Elasticsearch()
    crawl_info = CrawlInfo.objects.get(id=id)
    try:
        es.indices.refresh(index="index-%d" % crawl_info.id)
        indexed = es.count("index-%d" % crawl_info.id, crawl_info.type).get('count')
        percentage = max(1, int(indexed * 100 / crawl_info.successful_crawls))
    except Exception:
        # Any failure (missing index, zero successful crawls, ...) is
        # reported as no progress.
        percentage = 0

    wants_json = request.GET.get('type', 'HTML') == 'JSON'
    if not wants_json:
        return render(request, 'indexing_status.html', {'percent': percentage})

    payload = {'status': 'OK', 'percent': percentage}
    result = json.dumps(payload, ensure_ascii=False, encoding='utf8')
    return HttpResponse(result, content_type='application/json; charset=utf-8')

Esempio n. 25

0

Mostra file

File: generate_edges.py Progetto: Sotera/pst-extraction

def export_edges(index, file, qs='*'):
    """Write one JSON edge per line for every sender/receiver pair.

    Searches the ``emails`` documents of ``index`` that match the query
    string ``qs`` and have a ``senders`` field, then writes
    ``{"from": <first sender>, "to": <receiver>}`` for each address found in
    the ``tos``, ``ccs`` and ``bccs`` fields of each hit.

    :param index: name of the index to read
    :param file: path of the output file (overwritten)
    :param qs: query-string expression selecting the emails
    """
    es = Elasticsearch()
    body = {
        "query" : {
            "bool":{
                "must":[
                    {
                        "query_string" : { "query" : qs }
                    },
                    {
                        "filtered": {
                            "query": {"bool":{"must":[{"match_all":{}}]}},
                            "filter": {
                                "bool": {
                                    "must": [ { "exists": { "field": "senders"}}],
                                    "should" :[
                                        { "exists": { "field": "tos"}},
                                        { "exists": { "field": "ccs"}},
                                        { "exists": { "field": "bccs"}}
                                    ]
                                }
                            }
                        }
                    }
                ]
            }
        },
        "sort":  {}
    }

    def rcvrs(fields):
        """Return all receiver addresses of one hit."""
        return fields.get("tos", []) + fields.get("ccs", []) + fields.get("bccs", [])

    count = es.count(index=index, doc_type="emails", body=body)["count"]
    # TODO add batch processing
    addrs = es.search(index=index, doc_type="emails", size=count, from_=0, fields=["senders", "tos", "ccs", "bccs"], body=body)

    # A flat comprehension replaces reduce(operator.add, ...): it yields an
    # empty list instead of raising TypeError when there are no hits, and it
    # avoids repeated list concatenation.
    edges = [
        {"from": hit["fields"]["senders"][0], "to": rcvr}
        for hit in addrs["hits"]["hits"]
        for rcvr in rcvrs(hit["fields"])
    ]

    # `with` closes the file even if a write fails.
    with open(file, "w") as text_file:
        for edge in edges:
            text_file.write(json.dumps(edge) + "\n")

Esempio n. 26

0

Mostra file

File: datastore.py Progetto: snehitp/reference-server

class DatastoreConnection:
    """Facade over the Elasticsearch client and the patient/vocabulary managers."""

    def __init__(self):
        """Create the client and the managers that share this connection."""
        self._es = Elasticsearch()
        self._patients = PatientManager(self)
        self._vocabularies = VocabularyManager(self)

    def index_patients(self, filename):
        """Index the patients found in ``filename``."""
        manager = self._patients
        return manager.index(filename)

    def index_hpo(self, filename):
        """Index ``filename`` into the ``hpo`` vocabulary using ``OBOParser``."""
        manager = self._vocabularies
        return manager.index(index='hpo', filename=filename, Parser=OBOParser)

    def index_genes(self, filename):
        """Index ``filename`` into the ``genes`` vocabulary using ``GeneParser``."""
        manager = self._vocabularies
        return manager.index(index='genes', filename=filename, Parser=GeneParser)

    def get_vocabulary_term(self, id, index='_all'):
        """Look up a vocabulary term by id, by default across all indices."""
        manager = self._vocabularies
        return manager.get_term(id, index=index)

    def find_similar_patients(self, patient, n=5):
        """Return the n most similar patients to the given query api.Patient"""
        manager = self._patients
        return manager.find_similar_patients(patient=patient, n=n)

    def search(self, *args, **kwargs):
        """Forward to the client's ``search`` method."""
        client = self._es
        return client.search(*args, **kwargs)

    def bulk(self, *args, **kwargs):
        """Forward to the client's ``bulk`` method."""
        client = self._es
        return client.bulk(*args, **kwargs)

    def index(self, *args, **kwargs):
        """Forward to the client's ``index`` method."""
        client = self._es
        return client.index(*args, **kwargs)

    def count(self, *args, **kwargs):
        """Forward to the client's ``count`` method."""
        client = self._es
        return client.count(*args, **kwargs)

    @property
    def indices(self):
        """The client's ``indices`` attribute."""
        client = self._es
        return client.indices

Esempio n. 27

0

Mostra file

File: es.py Progetto: CaptainAL/Spyder

def from_elasticsearch(host, index, query, port=9200, pagination=100):
    """ Create Bag from Elasticsearch Query

    The number of matching documents is fetched once with ``count`` and the
    result set is split into partitions of ``pagination`` documents; each
    partition is a task that calls ``get_results`` with its own
    ``from``/``size`` window.

    Args:
        host: Elasticsearch host name.
        index: Index to query.
        query: Query clause, placed under the ``"query"`` key of the body.
        port: Elasticsearch HTTP port.
        pagination: Number of documents per partition.

    Returns:
        A ``Bag`` with one partition per page of results.

    >>> b = from_elasticsearch(host='hostname', index='reddit',
    ...                        query={"match": {'body':'Python'}})
    """
    es = Elasticsearch([{'host': host, 'port': port}])
    count = es.count(index=index, body={'query': query})['count']

    # Integer ceiling division: unlike ``ceil(count / pagination)`` this does
    # not lose the final partial page when ``/`` is integer division
    # (Python 2 without ``from __future__ import division``).
    npartitions = -(-count // pagination)
    name = 'elasticsearch' + next(tokens)

    dsk = dict()
    for i in range(npartitions):
        kwargs = {'index': index, 'body': {'query': query,
                                           'from': pagination*i,
                                           'size': pagination}}
        dsk[(name, i)] = (get_results, es, kwargs)

    return Bag(dsk, name, npartitions)

Esempio n. 28

0

Mostra file

File: ads_app_count.py Progetto: hellowendy/Insight_Data_Science

def main():
    """Count AD_SHOW events per (ai, cr) pair and dump the grid to a TSV file.

    One ``count`` request is issued for every combination of 150 ``ai``
    values (columns) and 360 ``cr`` values (rows) against the
    ``events-2015.05.*`` indices; documents matching ``fr == 'true'`` are
    excluded.
    """
    hosts = [{'host': eslogin.host, 'port': eslogin.port}]
    es = Elasticsearch(hosts, http_auth=(eslogin.user, eslogin.password))

    columns = range(150)
    index = range(360)
    df = pd.DataFrame(index=index, columns=columns).fillna(0)

    def ad_show_query(ai, cr):
        # Request body for a single grid cell.
        required = [
            {'match': {'ai': ai}},
            {'match': {'cr': cr}},
            {'match': {'et': 'AD_SHOW'}},
        ]
        excluded = [{'match': {'fr': 'true'}}]
        return {'query': {'bool': {'must': required, 'must_not': excluded}}}

    for col in columns:
        for ind in index:
            response = es.count(index="events-2015.05.*",
                                body=ad_show_query(col, ind))
            df.loc[ind, col] = response['count']

    df.to_csv("../data/ad_show_5_2015.tab", sep='\t')

Esempio n. 29

0

Mostra file

def main():
    """Report per-index document counts, then optionally search or delete.

    Filters are assembled from the ``program``, ``fleet`` and ``message``
    options. For every index matching ``opts.index_pattern`` the match count
    is printed; after that either the hits are printed (``opts.query > 0``)
    or the matching documents are deleted and the index force-merged
    (``opts.delete``).
    """
    opts, _args = parse_opts()

    host_config = {'host': opts.es_host, 'port': opts.es_port}
    client = Elasticsearch([host_config], timeout=1200, retry_on_timeout=True)

    cluster_name = client.info().get('cluster_name')
    print('Cluster: {}'.format(cluster_name))

    index_names = client.indices.get(index=opts.index_pattern).keys()

    # Each option that is set contributes one mandatory clause.
    clauses = []
    if opts.program:
        clauses.append({'term': {'program': opts.program}})
    if opts.fleet:
        clauses.append({'term': {'fleet': opts.fleet}})
    if opts.message:
        clauses.append({'match_phrase': {'message': opts.message}})

    # No filters means no request body at all.
    body = {'query': {'bool': {'must': clauses}}} if clauses else None

    for index in index_names:
        matched = client.count(index=index, body=body).get('count')
        print('{:22} count: {:6}'.format(index, matched))

        if opts.query > 0:
            found = client.search(index=index, body=body)
            print_logs(found['hits']['hits'])
            continue

        if opts.delete:
            deletion = client.delete_by_query(index=index, body=body)
            merge = client.indices.forcemerge(
                index=index,
                params={'only_expunge_deletes': 'true'},
            )
            print('{:22} Deleted: {:10} Failed: {}'.format(
                index, deletion['deleted'], merge['_shards']['failed']))

Esempio n. 30

0

Mostra file

File: es_client.py Progetto: syyscn/kbqa

class ElasticSearchClient(object):
    """Small convenience wrapper around a single-host Elasticsearch client."""

    def __init__(self, host, port):
        self.host = host
        self.port = port
        self.connect()

    def connect(self):
        """Create the client for ``self.host``/``self.port`` and store it on ``self.es``."""
        node = {'host': self.host, 'port': self.port}
        self.es = Elasticsearch(hosts=[node])

    def count(self, index):
        """Return the client's ``count`` response for ``index``.

        :param index: index name
        :return: the raw response of the count request
        """
        response = self.es.count(index=index)
        return response

    def delete(self, index, doc_type, id):
        """Delete one specific document from ``index``.

        :param index: index name
        :param doc_type: document type
        :param id: document id
        :return: None
        """
        self.es.delete(index=index, doc_type=doc_type, id=id)

    def get(self, index, id):
        """Return the client's ``get`` response for document ``id`` in ``index``."""
        document = self.es.get(index=index, id=id)
        return document

    def search(self, index, doc_type, constraint, size=20):
        """Run a ``match`` query and return the ``_source`` of every hit.

        Any exception is printed and swallowed, in which case the method
        returns None.
        """
        try:
            response = self.es.search(index=index,
                                      doc_type=doc_type,
                                      body={"query": {"match": constraint}},
                                      size=size)
            return [hit["_source"] for hit in response['hits']['hits']]
        except Exception as err:
            print(err)

Esempio n. 31

0

Mostra file

class Queries(object):
    """Sanity checks comparing an ingested log against its Elasticsearch indices."""

    def __init__(self, log_data, filter_type):
        # Fixed 70 second pause before the client is created and queried.
        time.sleep(70)
        self.es_client = Elasticsearch()
        self.__all_documents_count(log_data, filter_type)

    def __all_documents_count(self, log_data, filter_type):
        """Report whether ``<filter_type>-*`` holds as many documents as ``log_data`` has items."""
        response = self.es_client.count(index=filter_type + '-*')
        indexed = response['count']

        if indexed != len(log_data):
            Status.show(
                'The {} index data count {} differs from the ingested log, please check the filters'
                .format(filter_type, indexed), False)
            return

        Status.show(
            'The created Elasticserach index {} has the same amount of items {} as the {}.log'
            .format(filter_type, indexed, filter_type), True)

    def __document_time(self):
        """Placeholder; does nothing."""
        pass

Esempio n. 32

0

Mostra file

File: views.py Progetto: wsyx123/logPlatform

    def testdata(self, eshosts, indexname, query_body, count_query_body,
                 highligth):
        """Collect count, mapping keys, formatted hits and chart data for an index.

        Returns a dict with ``status`` True and the keys ``logkey``,
        ``logtotal``, ``logdata`` and ``echart``. If the search raises
        ``RequestError`` a dict with ``status`` False and a ``msg`` is
        returned instead.
        """
        client = Elasticsearch(eshosts)
        total = client.count(indexname, body=count_query_body)
        mapping = client.indices.get_mapping(index=indexname)

        try:
            response = client.search(indexname, body=query_body)
        except RequestError:
            return {'status': False, 'msg': '有不存在的field被当作条件查询'}

        # The search response must contain a 'groupDate' aggregation.
        buckets = response['aggregations']['groupDate']['buckets']
        chart = self.generate_echart_data(buckets)
        rows = self.format_logdata(response['hits']['hits'], highligth)
        keys = self.get_key(mapping)

        return {
            'logkey': keys,
            'logtotal': total,
            'status': True,
            'logdata': rows,
            'echart': chart
        }

Esempio n. 33

0

Mostra file

    def run(self):
        """Write the ``path`` of every matching 'disthene' document to the output.

        Documents whose ``path`` starts with ``self.monitoring_key`` are
        fetched page by page (``self.chunk_size`` per request). Paging stops
        once the offset exceeds 5000.
        """
        output = self.output().open('w')
        elastic = Elasticsearch(hosts=self.hosts)
        query = '{"query": {"prefix": {"path": "%s"}}}' % self.monitoring_key
        count = elastic.count(index="disthene", body=query)['count']
        limit = 5000

        pages = int(count / self.chunk_size) + 1
        offset = 0
        for _ in range(pages):
            page = elastic.search(index="disthene",
                                  body=query,
                                  size=self.chunk_size,
                                  from_=offset,
                                  stored_fields="path")
            for hit in page["hits"]["hits"]:
                line = "{}\n".format(hit["_source"]["path"])
                output.write(line)

            offset += self.chunk_size
            if offset > limit:
                break
        output.close()

Esempio n. 34

0

Mostra file

File: export_es_data.py Progetto: zpeng1989/neo4j-server

def export():
    """Dump every document of the 'script_data' index to a bulk-format file.

    The output contains two lines per document: a bulk ``index`` action line
    (target index plus a sequential ``_id``) followed by the document source
    encoded as JSON.

    Compared with the previous implementation this version:
      * exports the last hit as well (the loop used to stop one short),
      * opens the file once and closes it reliably instead of reopening it
        in ``'w'`` mode on every iteration,
      * writes the source with ``json.dumps`` rather than ``str(dict)``,
        which produced a Python repr instead of JSON.
    """
    import json  # local import: only this function needs it

    es = Elasticsearch(["ubuntu3:9200"])
    index_name = "script_data"
    type_name = "script"
    target_index_name = "script_data"
    file_name = "/home/hadoop/search_text.json"

    # Ask for as many hits as there are documents so one search returns all.
    # NOTE(review): this relies on the index's maximum result window being
    # at least ``count`` -- confirm for large indices.
    count = es.count(index=index_name, doc_type=type_name)['count']
    body = {"size": count}
    data = es.search(index=index_name, doc_type=type_name,
                     body=body)['hits']['hits']

    with codecs.open(file_name, 'w', encoding="utf-8") as out:
        for i, hit in enumerate(data):
            action = "{\"index\":{\"_index\":\"" + target_index_name + \
                     "\",\"_id\":" + str(i) + "}}\n"
            out.write(action)
            out.write(json.dumps(hit['_source']) + "\n")

Esempio n. 35

0

Mostra file

File: search.py Progetto: xmuthad/elasticsearch-output

class Search(object):
    """Query helpers built on a compressed-HTTP Elasticsearch client."""

    def __init__(self):
        self.es = Elasticsearch(hosts, http_compress=True)

    def multi_get(self):
        """Return every hit of a match_all query on ``log-2018.03.21``.

        Results are fetched through ``msearch`` in pages of ``LIMIT``
        documents. The first response supplies the total, and the remaining
        pages are requested with increasing ``from`` offsets.

        Fixes over the previous implementation: follow-up pages are requested
        through ``self.es`` (the bare name ``es`` was undefined), each page
        body carries real ``from``/``size`` numbers instead of the literal
        text ``LIMIT*(1+i)``, hits of later pages are appended to the result
        (the list used to be appended to itself), and the header/body lines
        are produced with ``json.dumps`` so they are valid JSON.

        Returns:
            list: the hit dictionaries of all pages, in request order.
        """
        import json  # local import: only this method needs it

        index = ["log-2018.03.21"]
        header = json.dumps({"index": index})

        def fetch(offset):
            # msearch takes newline-delimited JSON: header line, body line.
            query = json.dumps({"query": {"match_all": {}},
                                "from": offset,
                                "size": LIMIT})
            body = header + "\n" + query + "\n"
            res = self.es.msearch(body, doc_type='message')
            return res['responses'][0]['hits']

        first = fetch(0)
        total = first['total']
        if isinstance(total, dict):
            # Some server versions report the total as {"value": n, ...}.
            total = total.get('value', 0)

        hits = list(first['hits'])
        for offset in range(LIMIT, total, LIMIT):
            hits.extend(fetch(offset)['hits'])
        return hits

    def _count(self, index=None, item=None, value=None):
        """Return how many documents in ``index`` have field ``item`` equal to ``value``.

        The comparison is an Elasticsearch ``term`` query.
        """
        body = {
            "query": {
                "term": {
                    item: value,
                }
            }
        }
        res = self.es.count(index=index, body=body)
        return res['count']

Esempio n. 36

0

Mostra file

 def get(self, pid, record, **kwargs):
     """Handle GET request: respond with the page-view count for ``pid``.

     Counts documents in ``ES_INDEX`` whose ``id_bibrec`` matches
     ``pid.pid_value`` and whose ``_type`` matches ``events.pageviews``.
     The count (0 when the response is empty) is returned as JSON with
     status 200.
     """
     client = Elasticsearch(CFG_ELASTICSEARCH_SEARCH_HOST)

     must_clauses = [
         {"match": {"id_bibrec": pid.pid_value}},
         {"match": {"_type": "events.pageviews"}},
     ]
     query = {"query": {"bool": {"must": must_clauses}}}

     results = client.count(index=ES_INDEX, body=query)
     page_views = results.get('count', 0) if results else 0
     return make_response(jsonify(page_views), 200)

Esempio n. 37

0

Mostra file

def es_get_all_ips(str_existing_index):
    """Return the distinct ``ip`` values stored in the given Elasticsearch index.

    A ``terms`` aggregation on the ``ip`` field is used, sized to the index's
    document count. The number of IPs found is printed and ``ask_continue()``
    is called before the list is returned.
    """
    es = Elasticsearch(([{'host': get_es_cluster_ip()}]))
    doc_total = es.count(index=str_existing_index)['count']

    # size 0 suppresses the hits; only the aggregation buckets are needed.
    terms_agg = {"terms": {"field": "ip", "size": doc_total}}
    request = {"size": 0, "aggs": {"all_ip": terms_agg}}
    res = es.search(index=str_existing_index, body=request)

    list_ips = [bucket['key']
                for bucket in res['aggregations']['all_ip']['buckets']]

    print('Found ' + str(len(list_ips)) + ' IPs in Elasticsearch index ' +
          str_existing_index)
    ask_continue()
    return list_ips

Esempio n. 38

0

Mostra file

File: glances_elasticsearch.py Progetto: 4sp1r3/glances

    def init(self):
        """Init the connection to the ES server.

        Returns None when the export is disabled, otherwise the client.
        Exits the process with status 2 if the client cannot be created.
        If counting the index fails, the index is created.
        """
        if not self.export_enable:
            return None

        try:
            es = Elasticsearch(hosts=['{0}:{1}'.format(self.host, self.port)])
        except Exception as e:
            logger.critical("Cannot connect to ElasticSearch server %s:%s (%s)" % (self.host, self.port, e))
            sys.exit(2)
        # Only reached when the client was built successfully.
        logger.info("Connected to the ElasticSearch server %s:%s" % (self.host, self.port))

        try:
            index_count = es.count(index=self.index)['count']
        except Exception:
            # Counting failed: treat the index as missing and create it.
            es.indices.create(self.index)
            return es

        logger.info("There is already %s entries in the ElasticSearch %s index" % (index_count, self.index))
        return es

Esempio n. 39

0

Mostra file

File: glances_elasticsearch.py Progetto: yashodhank/glances

    def init(self):
        """Init the connection to the ES server.

        Returns None when the export is disabled, otherwise the client.
        Exits the process with status 2 if the client cannot be created.
        If counting the index fails, the index is created.
        """
        if not self.export_enable:
            return None

        address = '{}:{}'.format(self.host, self.port)
        try:
            es = Elasticsearch(hosts=[address])
        except Exception as e:
            logger.critical("Cannot connect to ElasticSearch server %s:%s (%s)" % (self.host, self.port, e))
            sys.exit(2)
        # Only reached when the client was built successfully.
        logger.info("Connected to the ElasticSearch server %s:%s" % (self.host, self.port))

        index_exists = True
        try:
            index_count = es.count(index=self.index)['count']
        except Exception:
            # Counting failed: treat the index as missing and create it.
            index_exists = False
            es.indices.create(self.index)

        if index_exists:
            logger.info("There is already %s entries in the ElasticSearch %s index" % (index_count, self.index))

        return es

Esempio n. 40

0

Mostra file

def get_count(ids, ip):
    """Return how many 'blogposts' documents belong to the given blog sites.

    Args:
        ids: comma-separated blogsite ids; spaces are stripped before the
            string is split.
        ip: host of the Elasticsearch node.

    Returns:
        The ``count`` value of the response, converted to ``int``.
    """
    client = Elasticsearch([{'host': ip}])

    blogsite_ids = ids.replace(' ', '').split(',')
    terms_clause = {"terms": {"blogsite_id": blogsite_ids, "boost": 1}}
    request = {"query": {"bool": {"must": [terms_clause]}}}

    response = client.count(index="blogposts", body=request)
    return int(response['count'])

Esempio n. 41

0

Mostra file

File: elastic_search.py Progetto: 54skyray/skynet

class ElkTask():
    """Checks run against the ELK cluster's Elasticsearch endpoint."""

    def __init__(self):
        elk_addr = "http://123.56.9.150:9200/"
        self.es = Elasticsearch(elk_addr)
        # Health request with a 10 second timeout, issued at construction.
        self.es.cluster.health(request_timeout=10)

    def do_user_agent_reg(self, threshold, reg_string):
        """Tell whether enough RTR log entries have a matching user agent.

        Counts documents whose ``logs.http_user_agent.raw`` matches the
        regular expression ``reg_string`` and whose ``logs.source_type.raw``
        is ``RTR``.

        Args:
            threshold: minimum number of matching documents.
            reg_string: regular expression applied to the user agent.

        Returns:
            True when the count is at least ``threshold``; False when it is
            lower or when the request fails (the error is printed).
        """
        try:
            res = self.es.count(body={
                "query": {
                    "filtered": {
                        "query": {
                            "regexp": {
                                "logs.http_user_agent.raw": "%s" % (reg_string)
                            }
                        },
                        "filter": {"term": {"logs.source_type.raw": "RTR"}}
                    }
                }
            })
            return res["count"] >= threshold
        # ``except Exception, e`` is Python-2-only syntax; ``as`` works on both.
        except Exception as e:
            print(e)
            return False

Esempio n. 42

0

Mostra file

File: check_es_doc_duplication.py Progetto: lenjonemcse/usaspending-api

 def handle_with_partitioning(self, **options):
     """Scan an index for duplicated ``_id`` values using partitioned queries.

     The document count is divided by ``partition_size`` to get the number
     of partitions; ``count_duplication_by_partitions`` is mapped over the
     partition numbers on a thread pool of ``parallelism`` workers
     (optionally only the first ``stop_after`` partitions). A summary of
     ``self._duplicated_doc_ids`` is logged at the end.
     """
     self._es_client_config = {
         "hosts": options["es_hostname"],
         "timeout": options["es_timeout"]
     }
     self._partition_size = options["partition_size"]
     es = Elasticsearch(**self._es_client_config)
     self._index = options["index"]
     parallelism = options["parallelism"]

     doc_count = es.count(index=options["index"])["count"]
     _log.info(f"Found {doc_count:,} docs in index {self._index}")

     self._num_partitions = ceil(doc_count / self._partition_size)
     _log.info(
         f"Running a total of {self._num_partitions:,} agg queries, "
         f"each returning up to {self._partition_size:,} buckets "
         f"that capture the degree of duplication of a duplicated _id. "
         f"Queries will be distributed among {parallelism} parallel threads."
     )

     with ThreadPool(parallelism) as pool:
         # A truthy "stop_after" option caps how many partitions are scanned.
         num_partitions = options.get("stop_after") or self._num_partitions
         pool.map(self.count_duplication_by_partitions,
                  range(0, num_partitions))

     if not self._duplicated_doc_ids:
         _log.info("No duplicate documents with the same _id field found.")
         return

     duped_id_count = len(self._duplicated_doc_ids)
     max_dupe = max(self._duplicated_doc_ids.values())
     ranked = sorted(self._duplicated_doc_ids.values())

     def percentile(pct):
         # Value at the requested percentile of the sorted counts.
         return ranked[int(ceil((duped_id_count * pct) / 100)) - 1]

     p75 = percentile(75)
     p95 = percentile(95)
     _log.warning(
         f"Found {len(self._duplicated_doc_ids):,} _ids with more than one doc in the index. "
         f"Max duplication (p100) = {max_dupe}; p95 = {p95}; p75 = {p75}"
     )

Esempio n. 43

0

Mostra file

def es_log_count_search(idx, key, value, interval="1h"):
    '''
    Print how many recent documents in an index match a field value.

    :param idx: index name
    :param key: field to query
    :param value: value the field has to match
    :param interval: how far back to look, appended to "now-" (e.g. "1h")
    :return: None; the count is printed
    '''
    es = Elasticsearch(['ops-es.00joy.com'], scheme='https', port=443)

    field_match = {"match": {key: value}}
    # Only documents whose @timestamp is newer than now-<interval>.
    recent_only = {"range": {"@timestamp": {"gt": "now-" + interval}}}

    body = {
        "query": {
            "bool": {
                "must": [field_match],
                "filter": [recent_only]
            }
        }
    }
    ret = es.count(index=idx, body=body)
    print(ret['count'])

Esempio n. 44

0

Mostra file

def connectDB(esIndex, nodes, rootLogger):
    '''
    Function to connect to Elasticsearch DB. Uses default parameters.

    The connection is verified by counting the documents in ``esIndex``.
    On any Elasticsearch error the problem is logged and the process exits
    with status 500.

    Args:
        esIndex (str): Elasticsearch index of concern
        nodes (list): list of string values of node information; e.g. ['127.0.0.1:9200', '127.0.0.2:9200']
        rootLogger (obj): reference of rootLogger object

    Returns:
        obj: elasticsearch object reference
    '''
    es = Elasticsearch(list(nodes))
    try:
        # Get no. of documents
        numDocs = es.count(index=esIndex, body={"query": {"match_all": {}}})['count']
        rootLogger.info(f'Connection successful. Number of documents found: {numDocs}')
        return es
    except ConnectionError:
        rootLogger.error('Error talking to ES DB. Check if DB is started up.')
        sys.exit(500)
    except ElasticsearchException:
        # The message needs a %s placeholder: logging formats lazily with the
        # extra arguments, and without a placeholder the record cannot be
        # formatted and the exception type is lost.
        rootLogger.error("Unexpected error: %s", sys.exc_info()[0])
        sys.exit(500)

Esempio n. 45

0

Mostra file

def get_length():
    """Print the count response for debris or annotated cells in 'image_cells'.

    A document matches when ``isDebris`` matches "true" OR its
    ``annotation`` term is not "null".
    """
    es = Elasticsearch(port=9211)
    index = "image_cells"

    is_debris = {"match": {"isDebris": "true"}}
    has_annotation = {
        "bool": {"must_not": [{"term": {"annotation": "null"}}]}
    }
    query = {"query": {"bool": {"should": [is_debris, has_annotation]}}}

    res = es.count(index=index, body=query)
    print(res)

Esempio n. 46

0

Mostra file

def elastic_processor(user, mapping, is_save):
    """Process an Elasticsearch source in parallel, one slice per worker.

    The source is described by ``mapping['source']``: ``path`` is
    ``"<host>/<port>"`` and ``iterator`` is the index name. The index's
    documents are split into roughly one slice per CPU and each slice is
    handed to ``elastic_processing`` in its own worker process.

    Args:
        user: dict holding at least ``'username'``.
        mapping: mapping definition; run through ``pre_processing`` first.
        is_save: passed through unchanged to every worker.

    Returns:
        list: one result per slice, in slice order; an empty list when the
        index holds no documents.

    Raises:
        Exception: "Invalid source path" when the source cannot be reached
            or counted.
    """
    username = user['username']
    mapping = pre_processing(username=username, mapping=mapping)
    path = mapping['source']['path'].split("/")
    try:
        es = Elasticsearch([{'host': path[0], 'port': path[1]}])
        count = es.count(index=mapping['source']['iterator'])
        count = count['count']
    except Exception:
        # ``except Exception`` instead of a bare except, so KeyboardInterrupt
        # and SystemExit are no longer converted into a source-path error.
        raise Exception("Invalid source path")

    if not count:
        # Nothing to process; previously this crashed on a zero range step
        # and a zero-sized pool.
        return []

    # Integer ceiling division, identical to ceil(count / cpus) for ints.
    number_record_in_file = -(-count // multiprocessing.cpu_count())
    array = [
        (mapping, i, number_record_in_file, user, is_save, path[0], path[1],
         mapping['source']['iterator'])
        for i in range(0, count, number_record_in_file)
    ]

    p = Pool(len(array))
    logging.warning(array)
    try:
        data_to_save = p.map(elastic_processing, array)
    finally:
        # Release the workers even when a slice raised.
        p.close()
        p.join()
    return data_to_save

Esempio n. 47

0

Mostra file

File: es.py Progetto: jackyops/examples

class ElasticSearchClass(object):
    """Small convenience wrapper around an authenticated Elasticsearch client."""

    def __init__(self, host, port, user, passwrod):
        self.host = host
        self.port = port
        self.user = user
        self.password = passwrod
        self.connect()

    def connect(self):
        """Create the client from the stored host, port and credentials."""
        node = {'host': self.host, 'port': self.port}
        credentials = (self.user, self.password)
        self.es = Elasticsearch(hosts=[node], http_auth=credentials)

    def count(self, indexname):
        """Return the client's ``count`` response for ``indexname``.

        :param indexname: index name
        :return: the raw response of the count request
        """
        response = self.es.count(index=indexname)
        return response

    def delete(self, indexname, doc_type, id):
        """Delete one specific document from ``indexname``.

        :param indexname: index name
        :param doc_type: document type
        :param id: document id
        :return: None
        """
        self.es.delete(index=indexname, doc_type=doc_type, id=id)

    def get(self, indexname, id):
        """Return the client's ``get`` response for document ``id``."""
        document = self.es.get(index=indexname, id=id)
        return document

    def search(self, indexname, size=10):
        """Return up to ``size`` hits sorted by ``@timestamp`` descending.

        Any exception is printed and swallowed, in which case the method
        returns None.
        """
        try:
            response = self.es.search(index=indexname, size=size, sort="@timestamp:desc")
        except Exception as err:
            print(err)
        else:
            return response

Esempio n. 48

0

Mostra file

File: elastic.py Progetto: reddevillz/timesketch

class ElasticsearchDataStore(object):
    """Implements the datastore."""

    # Number of events to queue up when bulk inserting events.
    DEFAULT_FLUSH_INTERVAL = 1000
    DEFAULT_SIZE = 100
    DEFAULT_LIMIT = DEFAULT_SIZE  # Max events to return
    DEFAULT_FROM = 0
    DEFAULT_STREAM_LIMIT = 5000  # Max events to return when streaming results

    def __init__(self, host='127.0.0.1', port=9200):
        """Create a Elasticsearch client.

        Args:
            host: Elasticsearch host.
            port: Elasticsearch port.
        """
        super(ElasticsearchDataStore, self).__init__()
        node = {'host': host, 'port': port}
        self.client = Elasticsearch([node])
        # Import bookkeeping: a counter and the list of pending events.
        self.import_counter = Counter()
        self.import_events = []

    @staticmethod
    def _build_labels_query(sketch_id, labels):
        """Build Elasticsearch query for Timesketch labels.

        Args:
            sketch_id: Integer of sketch primary key.
            labels: List of label names.

        Returns:
            Elasticsearch query as a dictionary.
        """
        label_query = {'bool': {'should': [], 'minimum_should_match': 1}}

        for label in labels:
            nested_query = {
                'nested': {
                    'query': {
                        'bool': {
                            'must': [{
                                'term': {
                                    'timesketch_label.name': label
                                }
                            }, {
                                'term': {
                                    'timesketch_label.sketch_id': sketch_id
                                }
                            }]
                        }
                    },
                    'path': 'timesketch_label'
                }
            }
            label_query['bool']['should'].append(nested_query)
        return label_query

    @staticmethod
    def _build_events_query(events):
        """Build Elasticsearch query for one or more document ids.

        Args:
            events: List of Elasticsearch document IDs.

        Returns:
            Elasticsearch query as a dictionary.
        """
        events_list = [event['event_id'] for event in events]
        query_dict = {'query': {'ids': {'values': events_list}}}
        return query_dict

    def build_query(self,
                    sketch_id,
                    query_string,
                    query_filter,
                    query_dsl=None,
                    aggregations=None):
        """Build Elasticsearch DSL query.

        Three mutually exclusive paths, checked in this order:

        1. ``query_dsl`` is given: it is parsed as JSON, any top-level
           ``aggregations`` key is removed and the result is returned as-is
           (no filters, paging or sorting are added).
        2. ``query_filter['events']`` is set: an ids query for those events
           is returned.
        3. Otherwise a bool query is assembled from ``query_string`` and the
           entries of ``query_filter``.

        Args:
            sketch_id: Integer of sketch primary key
            query_string: Query string
            query_filter: Dictionary containing filters to apply
            query_dsl: JSON-encoded string containing an Elasticsearch DSL
                query (it is passed to ``json.loads``)
            aggregations: Dict of Elasticsearch aggregations

        Returns:
            Elasticsearch DSL query as a dictionary
        """

        if query_dsl:
            query_dsl = json.loads(query_dsl)
            # Remove any aggregation coming from user supplied Query DSL.
            # We have no way to display this data in a good way today.
            if query_dsl.get('aggregations', None):
                del query_dsl['aggregations']
            return query_dsl

        if query_filter.get('events', None):
            events = query_filter['events']
            return self._build_events_query(events)

        # Skeleton of the generated query; the clause lists are filled below.
        query_dsl = {
            'query': {
                'bool': {
                    'must': [],
                    'must_not': [],
                    'filter': []
                }
            }
        }

        # TODO: Remove when old UI has been deprecated.
        # The 'star' filter overrides the query string with '*' and requires
        # the '__ts_star' label.
        if query_filter.get('star', None):
            label_query = self._build_labels_query(sketch_id, ['__ts_star'])
            query_string = '*'
            query_dsl['query']['bool']['must'].append(label_query)

        # TODO: Remove when old UI has been deprecated.
        # 'time_end' is read by direct indexing, so it has to be present
        # whenever 'time_start' is set.
        if query_filter.get('time_start', None):
            query_dsl['query']['bool']['filter'] = [{
                'bool': {
                    'should': [{
                        'range': {
                            'datetime': {
                                'gte': query_filter['time_start'],
                                'lte': query_filter['time_end']
                            }
                        }
                    }]
                }
            }]

        if query_string:
            query_dsl['query']['bool']['must'].append(
                {'query_string': {
                    'query': query_string
                }})

        # New UI filters
        if query_filter.get('chips', None):
            labels = []
            # These are the same list objects that live inside query_dsl, so
            # appending to them extends the query in place.
            must_filters = query_dsl['query']['bool']['must']
            must_not_filters = query_dsl['query']['bool']['must_not']
            datetime_ranges = {
                'bool': {
                    'should': [],
                    'minimum_should_match': 1
                }
            }

            for chip in query_filter['chips']:
                if chip['type'] == 'label':
                    labels.append(chip['value'])

                elif chip['type'] == 'term':
                    term_filter = {
                        'match_phrase': {
                            '{}'.format(chip['field']): {
                                'query': "{}".format(chip['value'])
                            }
                        }
                    }

                    # Any operator other than 'must'/'must_not' drops the chip.
                    if chip['operator'] == 'must':
                        must_filters.append(term_filter)

                    elif chip['operator'] == 'must_not':
                        must_not_filters.append(term_filter)

                elif chip['type'] == 'datetime_range':
                    # The chip value is "<start>,<end>".
                    start = chip['value'].split(',')[0]
                    end = chip['value'].split(',')[1]
                    range_filter = {
                        'range': {
                            'datetime': {
                                'gte': start,
                                'lte': end
                            }
                        }
                    }
                    datetime_ranges['bool']['should'].append(range_filter)

            # Both clauses are appended whenever any chip is present, even if
            # no label or datetime_range chip contributed to them.
            label_filter = self._build_labels_query(sketch_id, labels)
            must_filters.append(label_filter)
            must_filters.append(datetime_ranges)

        # Pagination
        # Only truthy values are copied, so a 'from' of 0 is left unset.
        if query_filter.get('from', None):
            query_dsl['from'] = query_filter['from']

        # Number of events to return
        if query_filter.get('size', None):
            query_dsl['size'] = query_filter['size']

        # Make sure we are sorting.
        # query_dsl was built above without a 'sort' key, so the generated
        # query is sorted on 'datetime' ('asc' unless 'order' is given).
        if not query_dsl.get('sort', None):
            query_dsl['sort'] = {'datetime': query_filter.get('order', 'asc')}

        # Add any pre defined aggregations
        if aggregations:
            # post_filter happens after aggregation so we need to move the
            # filter to the query instead.
            # NOTE(review): nothing in this method sets 'post_filter' on the
            # generated query -- confirm whether this branch is still needed.
            if query_dsl.get('post_filter', None):
                query_dsl['query']['bool']['filter'] = query_dsl['post_filter']
                query_dsl.pop('post_filter', None)
            query_dsl['aggregations'] = aggregations

        return query_dsl

    def search(self,
               sketch_id,
               query_string,
               query_filter,
               query_dsl,
               indices,
               count=False,
               aggregations=None,
               return_fields=None,
               enable_scroll=False):
        """Search ElasticSearch. This will take a query string from the UI
        together with a filter definition. Based on this it will execute the
        search request on ElasticSearch and get result back.

        Args:
            sketch_id: Integer of sketch primary key
            query_string: Query string
            query_filter: Dictionary containing filters to apply
            query_dsl: Dictionary containing Elasticsearch DSL query
            indices: List of indices to query
            count: Boolean indicating if we should only return result count
            aggregations: Dict of Elasticsearch aggregations
            return_fields: List of fields to return
            enable_scroll: If Elasticsearch scroll API should be used

        Returns:
            Set of event documents in JSON format
        """

        scroll_timeout = None
        if enable_scroll:
            scroll_timeout = '1m'  # Default to 1 minute scroll timeout

        # Exit early if we have no indices to query
        if not indices:
            return {'hits': {'hits': [], 'total': 0}, 'took': 0}

        # Check if we have specific events to fetch and get indices.
        if query_filter.get('events', None):
            indices = {
                event['index']
                for event in query_filter['events']
                if event['index'] in indices
            }

        query_dsl = self.build_query(sketch_id, query_string, query_filter,
                                     query_dsl, aggregations)

        # Default search type for elasticsearch is query_then_fetch.
        search_type = 'query_then_fetch'

        # Only return how many documents matches the query.
        if count:
            del query_dsl['sort']
            count_result = self.client.count(body=query_dsl,
                                             index=list(indices))
            return count_result.get('count', 0)

        if not return_fields:
            # Suppress the lint error because elasticsearch-py adds parameters
            # to the function with a decorator and this makes pylint sad.
            # pylint: disable=unexpected-keyword-arg
            return self.client.search(body=query_dsl,
                                      index=list(indices),
                                      search_type=search_type,
                                      scroll=scroll_timeout)

        # The argument " _source_include" changed to "_source_includes" in
        # ES version 7. This check add support for both version 6 and 7 clients.
        # pylint: disable=unexpected-keyword-arg
        if self.version.startswith('6'):
            _search_result = self.client.search(body=query_dsl,
                                                index=list(indices),
                                                search_type=search_type,
                                                _source_include=return_fields,
                                                scroll=scroll_timeout)
        else:
            _search_result = self.client.search(body=query_dsl,
                                                index=list(indices),
                                                search_type=search_type,
                                                _source_includes=return_fields,
                                                scroll=scroll_timeout)

        return _search_result

    def search_stream(self,
                      sketch_id=None,
                      query_string=None,
                      query_filter=None,
                      query_dsl=None,
                      indices=None,
                      return_fields=None,
                      enable_scroll=True):
        """Search ElasticSearch and stream the matching events one by one.

        The first page comes from self.search(); when scrolling is enabled
        the remaining pages are fetched with the scroll API until a page
        comes back empty.

        Args:
            sketch_id: Integer of sketch primary key
            query_string: Query string
            query_filter: Dictionary containing filters to apply. Missing
                'size' and 'terminate_after' entries are filled in (in
                place) with DEFAULT_STREAM_LIMIT. None is treated as an
                empty filter.
            query_dsl: Dictionary containing Elasticsearch DSL query
            indices: List of indices to query
            return_fields: List of fields to return
            enable_scroll: Boolean determining whether scrolling is enabled.

        Returns:
            Generator of event documents in JSON format
        """
        # The signature allows query_filter to be omitted, so do not call
        # .get() on None.
        if query_filter is None:
            query_filter = {}

        if not query_filter.get('size'):
            query_filter['size'] = self.DEFAULT_STREAM_LIMIT

        if not query_filter.get('terminate_after'):
            query_filter['terminate_after'] = self.DEFAULT_STREAM_LIMIT

        result = self.search(sketch_id=sketch_id,
                             query_string=query_string,
                             query_dsl=query_dsl,
                             query_filter=query_filter,
                             indices=indices,
                             return_fields=return_fields,
                             enable_scroll=enable_scroll)

        # search() exits early with a bare empty result (no '_scroll_id')
        # when there are no indices to query, so the scroll id is optional.
        scroll_id = result.get('_scroll_id') if enable_scroll else None
        if scroll_id is None:
            scroll_size = 0
        else:
            scroll_size = result['hits']['total']
            # Elasticsearch version 7.x returns total hits as a dictionary.
            # TODO: Refactor when version 6.x has been deprecated.
            if isinstance(scroll_size, dict):
                scroll_size = scroll_size.get('value', 0)

        for event in result['hits']['hits']:
            yield event

        # Keep scrolling until a page comes back without any hits.
        while scroll_size > 0:
            # pylint: disable=unexpected-keyword-arg
            result = self.client.scroll(scroll_id=scroll_id, scroll='5m')
            scroll_id = result['_scroll_id']
            page = result['hits']['hits']
            scroll_size = len(page)
            for event in page:
                yield event

    def get_event(self, searchindex_id, event_id):
        """Fetch a single event document, without its timesketch_label field.

        A NotFoundError raised while fetching aborts the request with
        HTTP_STATUS_CODE_NOT_FOUND.

        Args:
            searchindex_id: String of ElasticSearch index id
            event_id: String of ElasticSearch event id

        Returns:
            Event document in JSON format
        """
        try:
            # The keyword that excludes source fields is spelled
            # "_source_exclude" when the version starts with '6' and
            # "_source_excludes" otherwise.
            if self.version.startswith('6'):
                exclude_keyword = '_source_exclude'
            else:
                exclude_keyword = '_source_excludes'

            request = {
                'index': searchindex_id,
                'id': event_id,
                'doc_type': '_all',
                exclude_keyword: ['timesketch_label'],
            }
            # elasticsearch-py adds parameters to the function with a
            # decorator, which pylint cannot follow.
            # pylint: disable=unexpected-keyword-arg
            return self.client.get(**request)

        except NotFoundError:
            abort(HTTP_STATUS_CODE_NOT_FOUND)

    def count(self, indices):
        """Return how many documents the given indices hold.

        Args:
            indices: List of indices.

        Returns:
            Number of documents. 0 when no indices are given, when the
            count request raises NotFoundError or RequestError, or when
            the response carries no 'count' entry.
        """
        if indices:
            try:
                response = self.client.count(index=indices)
            except (NotFoundError, RequestError):
                es_logger.error('Unable to count indexes (index not found)',
                                exc_info=True)
            else:
                return response.get('count', 0)
        return 0

    def set_label(self,
                  searchindex_id,
                  event_id,
                  event_type,
                  sketch_id,
                  user_id,
                  label,
                  toggle=False,
                  single_update=True):
        """Add (or toggle) a label on an event in the datastore.

        Args:
            searchindex_id: String of ElasticSearch index id
            event_id: String of ElasticSearch event id
            event_type: String of ElasticSearch document type
            sketch_id: Integer of sketch primary key
            user_id: Integer of user primary key
            label: String with the name of the label
            toggle: Optional boolean value if the label should be toggled
                (add/remove) instead of added. The default is False.
            single_update: Boolean if the label should be indexed
                immediately. When False nothing is sent to Elasticsearch
                and the script is returned instead. The default is True.

        Returns:
            None if this is a single update, otherwise a dict with the
            'source', 'lang' and 'params' of the painless script.
        """
        # Elasticsearch painless script: add by default, toggle on request.
        script_source = ADD_LABEL_SCRIPT
        if toggle:
            script_source = TOGGLE_LABEL_SCRIPT

        script_params = {
            'timesketch_label': {
                'name': str(label),
                'user_id': user_id,
                'sketch_id': sketch_id
            }
        }

        if not single_update:
            return {
                'source': script_source,
                'lang': 'painless',
                'params': script_params
            }

        current = self.client.get(index=searchindex_id,
                                  id=event_id,
                                  doc_type='_all')
        try:
            current['_source']['timesketch_label']
        except KeyError:
            # The document has no label field yet: write an empty list
            # before the script update below runs.
            self.client.update(index=searchindex_id,
                               doc_type=event_type,
                               id=event_id,
                               body={'doc': {'timesketch_label': []}})

        self.client.update(index=searchindex_id,
                           id=event_id,
                           doc_type=event_type,
                           body={
                               'script': {
                                   'lang': 'painless',
                                   'source': script_source,
                                   'params': script_params
                               }
                           })

        return None

    def create_index(self, index_name=None, doc_type='generic_event'):
        """Create index with Timesketch settings.

        The index is only created when it does not exist yet. The mapping
        declares 'timesketch_label' as nested and 'datetime' as a date.

        Args:
            index_name: Name of the index. Default is a UUID generated for
                each call.
            doc_type: Name of the document type. Default is generic_event.

        Returns:
            Index name in string format.
            Document type in string format.

        Raises:
            RuntimeError: If the backend cannot be reached while creating
                the index.
        """
        # A uuid4().hex default in the signature is evaluated only once,
        # when the function is defined, which would hand the same name to
        # every caller. Generate it per call instead.
        if index_name is None:
            index_name = uuid4().hex

        _document_mapping = {
            'properties': {
                'timesketch_label': {
                    'type': 'nested'
                },
                'datetime': {
                    'type': 'date'
                }
            }
        }

        # Version 6.x expects the mapping nested under the document type.
        # TODO: Remove when we deprecate Elasticsearch version 6.x
        if self.version.startswith('6'):
            _document_mapping = {doc_type: _document_mapping}

        if not self.client.indices.exists(index_name):
            try:
                self.client.indices.create(
                    index=index_name, body={'mappings': _document_mapping})
            except ConnectionError:
                raise RuntimeError('Unable to connect to Timesketch backend.')
            except RequestError:
                # The create request was rejected; log whether the index
                # exists now and carry on.
                index_exists = self.client.indices.exists(index_name)
                es_logger.warning(
                    'Attempting to create an index that already exists '
                    '({0:s} - {1:s})'.format(index_name, str(index_exists)))

        # We want to return unicode here to keep SQLalchemy happy.
        if six.PY2:
            if not isinstance(index_name, six.text_type):
                index_name = codecs.decode(index_name, 'utf-8')

            if not isinstance(doc_type, six.text_type):
                doc_type = codecs.decode(doc_type, 'utf-8')

        return index_name, doc_type

    def delete_index(self, index_name):
        """Delete an Elasticsearch index if it exists.

        Args:
            index_name: Name of the index to delete.

        Raises:
            RuntimeError: If the delete request fails with a
                ConnectionError.
        """
        index_api = self.client.indices
        # Nothing to do for an index that is not there.
        if not index_api.exists(index_name):
            return

        try:
            index_api.delete(index=index_name)
        except ConnectionError as e:
            message = 'Unable to connect to Timesketch backend: {}'.format(e)
            raise RuntimeError(message)

    def import_event(self,
                     index_name,
                     event_type,
                     event=None,
                     event_id=None,
                     flush_interval=DEFAULT_FLUSH_INTERVAL):
        """Queue an event for bulk indexing, or flush what is queued.

        With an event, a bulk header and the event are appended to the
        queue, and the queue is sent whenever the event counter reaches a
        multiple of flush_interval. Without an event, whatever is still
        queued is sent.

        Args:
            flush_interval: Number of events to queue up before indexing
            index_name: Name of the index in Elasticsearch
            event_type: Type of event (e.g. plaso_event)
            event: Event dictionary. Its keys and byte-string values are
                decoded to text in place.
            event_id: Event Elasticsearch ID. When given, the event is
                queued as an update of that document instead of a new one.

        Returns:
            The number of events queued so far.
        """
        if not event:
            # Import the remaining events in the queue.
            if self.import_events:
                try:
                    self.client.bulk(body=self.import_events)
                except (ConnectionTimeout, socket.timeout):
                    # TODO: Add a retry here.
                    es_logger.error('Unable to add events', exc_info=True)
                else:
                    # Reset the queue so a later flush does not index the
                    # same events a second time.
                    self.import_events = []
            return self.import_counter['events']

        # Iterate over a snapshot: re-keying the dict while iterating it
        # directly raises RuntimeError when a decoded key is new to it.
        for key, value in list(event.items()):
            text_key = key
            if not isinstance(key, six.text_type):
                text_key = codecs.decode(key, 'utf8')
                # Replace the undecoded key instead of keeping both.
                del event[key]

            # Make sure we have decoded strings in the event dict.
            if isinstance(value, six.binary_type):
                value = codecs.decode(value, 'utf8')

            event[text_key] = value

        # Header needed by Elasticsearch when bulk inserting.
        header = {
            'index': {
                '_index': index_name,
            }
        }
        update_header = {'update': {'_index': index_name, '_id': event_id}}

        # TODO: Remove when we deprecate Elasticsearch version 6.x
        if self.version.startswith('6'):
            header['index']['_type'] = event_type
            update_header['update']['_type'] = event_type

        if event_id:
            # Event has "lang" defined if there is a script used for import.
            if event.get('lang'):
                event = {'script': event}
            else:
                event = {'doc': event}
            header = update_header

        self.import_events.append(header)
        self.import_events.append(event)
        self.import_counter['events'] += 1

        if self.import_counter['events'] % int(flush_interval) == 0:
            try:
                self.client.bulk(body=self.import_events)
            except (ConnectionTimeout, socket.timeout):
                # TODO: Add a retry here.
                es_logger.error('Unable to add events', exc_info=True)
            # The batch is dropped whether or not the request succeeded.
            self.import_events = []

        return self.import_counter['events']

    def flush_queued_events(self):
        """Send every queued event to Elasticsearch in one bulk request.

        Does nothing when the queue is empty. The queue is reset once the
        bulk request has returned, so flushing again does not index the
        same events twice. An exception from the bulk request propagates
        and leaves the queue as it was.
        """
        if self.import_events:
            self.client.bulk(body=self.import_events)
            self.import_events = []

    @property
    def version(self):
        """Get Elasticsearch version.

        Asks the client for its info on every access and reads the
        'number' entry of the 'version' section.

        Returns:
          Version number as a string.
        """
        cluster_info = self.client.info()
        return cluster_info.get('version').get('number')

Esempio n. 49

0

Mostra file

File: es.py Progetto: zhengkunwang223/jms-storage-sdk

class ESStorage(LogStorage):
    """Command-log storage backed by an Elasticsearch index."""

    def __init__(self, config):
        """Create the Elasticsearch client from a config mapping.

        Args:
            config: Mapping read for "HOSTS" (passed to the client as
                hosts), "OTHER" (extra keyword arguments for the client),
                "INDEX" (default 'jumpserver') and "DOC_TYPE" (default
                'command_store').
        """
        hosts = config.get("HOSTS")
        kwargs = config.get("OTHER", {})
        self.index = config.get("INDEX") or 'jumpserver'
        self.doc_type = config.get("DOC_TYPE") or 'command_store'
        self.es = Elasticsearch(hosts=hosts, **kwargs)

    @staticmethod
    def make_data(command):
        """Build the document stored for one command.

        Copies the user, asset, system_user, input, output, session and
        timestamp entries of the command and adds a "date" entry: the
        timestamp converted to a UTC datetime.
        """
        data = dict(user=command["user"],
                    asset=command["asset"],
                    system_user=command["system_user"],
                    input=command["input"],
                    output=command["output"],
                    session=command["session"],
                    timestamp=command["timestamp"])
        data["date"] = datetime.fromtimestamp(command['timestamp'],
                                              tz=pytz.UTC)
        return data

    def bulk_save(self, command_set, raise_on_error=True):
        """Store several commands with one bulk helper call.

        Args:
            command_set: Iterable of command mappings (see make_data).
            raise_on_error: Forwarded to the bulk helper.

        Returns:
            Whatever the bulk helper returns.
        """
        actions = []
        for command in command_set:
            data = dict(
                _index=self.index,
                _type=self.doc_type,
                _source=self.make_data(command),
            )
            actions.append(data)
        return bulk(self.es,
                    actions,
                    index=self.index,
                    raise_on_error=raise_on_error)

    def save(self, command):
        """
        Save a command to the datastore.
        """
        data = self.make_data(command)
        return self.es.index(index=self.index,
                             doc_type=self.doc_type,
                             body=data)

    @staticmethod
    def get_query_body(match=None, exact=None, date_from=None, date_to=None):
        """Build the search body for a time window plus optional criteria.

        Args:
            match: Mapping of field -> value, added as "match" clauses
                under bool/must.
            exact: Mapping of field -> value, added as "term" clauses
                under bool/filter.
            date_from: Start of the window; defaults to 7 days before
                date_to.
            date_to: End of the window; defaults to now.

        Returns:
            Dict with a bool "query" that range-filters "timestamp" and a
            "sort" on "timestamp" in descending order.
        """
        if date_to is None:
            date_to = datetime.now()
        if date_from is None:
            date_from = date_to - timedelta(days=7)

        time_from = date_from.timestamp()
        time_to = date_to.timestamp()

        body = {
            "query": {
                "bool": {
                    "must": [],
                    "filter": [{
                        "range": {
                            "timestamp": {
                                "gte": time_from,
                                "lte": time_to,
                            }
                        }
                    }]
                }
            },
            "sort": {
                "timestamp": {
                    "order": "desc"
                }
            }
        }
        if match:
            for k, v in match.items():
                body["query"]["bool"]["must"].append({"match": {k: v}})
        if exact:
            for k, v in exact.items():
                body["query"]["bool"]["filter"].append({"term": {k: v}})
        return body

    @staticmethod
    def _build_criteria(user=None,
                        asset=None,
                        system_user=None,
                        input=None,
                        session=None):
        """Split search criteria into (match, exact) mappings.

        user, asset and system_user are compared exactly; session and
        input are matched. Criteria with a falsy value are left out.
        """
        exact = {}
        if user:
            exact["user"] = user
        if asset:
            exact["asset"] = asset
        if system_user:
            exact["system_user"] = system_user

        match = {}
        if session:
            match["session"] = session
        if input:
            match["input"] = input
        return match, exact

    def filter(self,
               date_from=None,
               date_to=None,
               user=None,
               asset=None,
               system_user=None,
               input=None,
               session=None):
        """Search the stored commands.

        Returns:
            The "hits" section of the search response.
        """
        match, exact = self._build_criteria(user=user,
                                            asset=asset,
                                            system_user=system_user,
                                            input=input,
                                            session=session)
        body = self.get_query_body(match, exact, date_from, date_to)
        data = self.es.search(index=self.index,
                              doc_type=self.doc_type,
                              body=body)
        return data["hits"]

    def count(self,
              date_from=None,
              date_to=None,
              user=None,
              asset=None,
              system_user=None,
              input=None,
              session=None):
        """Count the stored commands matching the same criteria as filter().

        Returns:
            The "count" value of the count response.
        """
        match, exact = self._build_criteria(user=user,
                                            asset=asset,
                                            system_user=system_user,
                                            input=input,
                                            session=session)
        body = self.get_query_body(match, exact, date_from, date_to)
        # The count request takes the query only, so the sort is removed.
        del body["sort"]
        # Restrict the count to the index and type that filter() searches;
        # without them the request is not limited to this store's index.
        data = self.es.count(index=self.index,
                             doc_type=self.doc_type,
                             body=body)
        return data["count"]

    def __getattr__(self, item):
        # Only called when normal lookup fails: fall back to the client.
        # If "es" itself is missing (instance created without __init__),
        # looking it up here again would recurse without end.
        if item == 'es':
            raise AttributeError(item)
        return getattr(self.es, item)

    def all(self):
        """Return all data (not supported by this backend)."""
        raise NotImplementedError("Not support")

    def ping(self):
        """Return the client's ping result, or False if pinging raises."""
        try:
            return self.es.ping()
        except Exception:
            return False

Esempio n. 50

0

Mostra file

File: ElasticSearchUtility.py Progetto: Tirthraj93/ML-Spam-Classifier

class ElasticSearchUtility:
    """
    class to communicate with ElasticSearch
    """

    def __init__(self):
        self.es = Elasticsearch(hosts=[ES_HOST], timeout=750)

    def _scroll_pages(self, **search_kwargs):
        """
        Run a scrolled search and yield the hits of every non-empty page

        :param search_kwargs: keyword arguments for the initial search
        :return: generator of hit lists, one per page
        """
        page = self.es.search(scroll='10m', **search_kwargs)
        hits = page['hits']['hits']
        # an empty page marks the end of the scroll
        while hits:
            yield hits
            page = self.es.scroll(scroll_id=page['_scroll_id'], scroll='10m')
            hits = page['hits']['hits']

    def index_exists(self, index_name):
        """
        Check whether an index exists

        :param index_name: name of the index
        :return: result of the indices.exists call
        """
        return self.es.indices.exists(index_name)

    def create_index(self, index_name, body):
        """
        Created a new index. If it already exists, deletes that first.

        :param index_name: index to create
        :param body: index creation body
        """
        if self.es.indices.exists(index_name):
            print("deleting '%s' index..." % index_name)
            res = self.es.indices.delete(index=index_name)
            print(" response: '%s'" % res)

        print("creating '%s' index..." % index_name)
        res = self.es.indices.create(index=index_name, body=body)
        print(" response: '%s'" % res)

    def get_doc_count(self, index_name, doc_type):
        """
        Get total number of documents in a given index

        :param index_name: name of the index
        :param doc_type: type of the document
        :return: total number of documents
        """
        return self.es.count(index_name, doc_type)["count"]

    def store_index(self, index, doc_type, source_list, init_id):
        """
        Store all data in source list as a unique document in given ElasticSearch index-type

        :param index: name of the index
        :param doc_type: type of the document
        :param source_list: list of document source to insert into given index-type
        :param init_id: initial id for the document
        """

        bulk_actions = []
        doc_id = init_id

        for source in source_list:
            data_body = ElasticSearchUtility.__index_data_body(index, doc_type, doc_id, source["_source"])
            bulk_actions.append(data_body)
            doc_id += 1

        # single-argument print: same output on Python 2 and 3
        print('inserting -  %d' % len(bulk_actions))
        helpers.bulk(self.es, bulk_actions)

    def get_all_terms(self, index, doc_type, doc_id, field):
        """
        Get all terms for given field of given index-doc_type-doc_id

        :param index: name of the index
        :param doc_type: type of the document
        :param doc_id: id of the document
        :param field: field to get term vectors of
        :return: list of all terms for given document
        """

        term_vector = self.es.termvectors(index, doc_type, id=doc_id, field_statistics=False,
                                          fields=[field], offsets=False, positions=False)

        # the per-field data sits under "term_vectors" in the response,
        # as in get_sparse_tf_features
        all_terms = list(term_vector["term_vectors"][field]["terms"].keys())

        return all_terms

    def get_all_ids(self, index_name, doc_type, query_body):
        """
        Returns all ids of given index for given query

        :param index_name: Name of the index
        :param doc_type: Type of the document
        :param query_body: search query
        :return: List of ids of entire index
        """

        print('getting all ids...')

        id_list = []
        size = 0

        for hits_list in self._scroll_pages(index=index_name, doc_type=doc_type, size=10000,
                                            fields=['_id'], body=query_body):
            id_list.extend(hit['_id'] for hit in hits_list)
            size += len(hits_list)
            print('scrolled -  %s' % size)

        return id_list

    def single_feature_matrix(self, index, doc_type, field, feature):
        """
        Fetch all documents containing given feature along with its tf as a score from
        ElasticSearch in format {id: tf}

        :param index: name of the index
        :param doc_type: type of the document
        :param field: the field to extract features from
        :param feature: the feature to extract
        :return: the dictionary of the format {id: tf}
        """

        out_dict = dict()

        # NOTE(review): the term query is fixed to "body_shingles" while the
        # script receives `field` - confirm this is intended
        query_body = {
            "query": {
                "function_score": {
                    "query": {
                        "term": {
                            "body_shingles": {
                                "value": feature
                            }
                        }
                    },
                    "functions": [
                        {
                            "script_score": {
                                "script": {
                                    "file": "getFeatureValue",
                                    "params": {
                                        "term": feature,
                                        "field": field
                                    }
                                }
                            }
                        }
                    ],
                    "boost_mode": "replace"
                }
            }
        }

        for hits_list in self._scroll_pages(index=index, doc_type=doc_type, size=10000,
                                            body=query_body, fields=["stream_id"]):
            for hit in hits_list:
                out_dict[hit["_id"]] = hit["_score"]

        return out_dict

    @staticmethod
    def __index_data_body(index, doc_type, doc_id, source):
        """
        Create index data body for insertion based on given parameters

        :param index: name of the index
        :param doc_type: type of the document
        :param doc_id: unique id for index source
        :param source: data source
        :return: index data to insert
        """

        index_data = {
            "_index": index,
            "_type": doc_type,
            "_id": doc_id,
            "_source": source
        }

        return index_data

    @staticmethod
    def get_match_query(field, value):
        """
        creates match query body for given field and value

        :param field: document field
        :param value: value for the field
        :return: the query body
        """
        query_body = {
            "query": {
                "match": {
                    field: value
                }
            }
        }

        return query_body

    def get_field_values(self, index, doc_type, field):
        """
        Get dictionary of id:field_value for given index-type and field

        :param index: name of the index
        :param doc_type: type of the document
        :param field: field to get value of
        :return: id:field_value dictionary
        """

        out_dict = dict()

        query_body = {
            "query": {
                "match_all": {}
            }
        }

        for hits_list in self._scroll_pages(index=index, doc_type=doc_type, size=10000,
                                            body=query_body, fields=[field]):
            for hit in hits_list:
                # keep the first value returned for the field
                out_dict[hit["_id"]] = hit["fields"][field][0]

        return out_dict

    def get_all_grams(self, index, doc_type, field, unigrams=False):
        """
        Get all unique grams from entire index for given field if unigrams is False,
        otherwise get only unigrams

        :param index: name of the index
        :param doc_type: type of the document
        :param field: name of the index field
        :param unigrams: use the "getUnigrams" script instead of "getGrams"
        :return: the set of all grams
        """

        print('Getting all grams...')

        grams = set()

        if unigrams:
            script_file = "getUnigrams"
        else:
            script_file = "getGrams"

        query_body = {
            "script_fields": {
                "grams": {
                    "script": {
                        "file": script_file,
                        "params": {
                            "field": field
                        }
                    }
                }
            }
        }

        size = 0
        for hits_list in self._scroll_pages(index=index, doc_type=doc_type, size=1000,
                                            body=query_body):
            for hit in hits_list:
                try:
                    field_value = hit["fields"]["grams"]
                except KeyError:
                    # hit without a "grams" script field
                    continue
                grams.update(value.encode('UTF8') for value in field_value)

            size += len(hits_list)
            print(size)

        return grams

    def get_sparse_tf_features(self, index, doc_type, field, doc_id, terms_to_include=None):
        """
        Get an dictionary of format {term1:tf1, term2:tf2, ...} for all terms in given field of given index's doc_id
        where tf id greater than 0

        :param index: name of the index
        :param doc_type: type of the document
        :param field: field to get terms of
        :param doc_id: index document id
        :param terms_to_include: optional container of terms; when given, only these terms are kept
        :return: dictionary of the format {term1:tf1, term2:tf2, ...} for all single-word terms having tf > 0
        """

        out_dict = dict()

        # POST trec_spam/documents/1/_termvector?field_statistics=false
        # &positions=false&offsets=false
        # &fields=body_shingles
        response = self.es.termvectors(index, doc_type, doc_id, fields=[field],
                                       field_statistics=False, positions=False, offsets=False)

        try:
            terms = response["term_vectors"][field]["terms"]
        except KeyError:
            return dict()

        for term in terms:
            tf = terms[term]["term_freq"]

            # only single-word terms that actually occur
            if tf <= 0 or len(term.split(' ')) != 1:
                continue

            try:
                decoded_term = str(term)
            except UnicodeError:
                # a term str() cannot convert is skipped
                continue

            if terms_to_include is None or decoded_term in terms_to_include:
                out_dict[decoded_term] = tf

        return out_dict

    def get_field_values_for_docs(self, index, doc_type, field, docs_list):
        """
        Get value of given field for all given docs in given order

        :param index: name of the index
        :param doc_type: type of the document
        :param field: field to retrieve value of
        :param docs_list: list of documents for which values are to be retrieved
        :return: list of values of given field for their corresponding docs
        """

        values_list = []

        for doc in docs_list:
            response = self.es.get(index, doc, doc_type=doc_type, fields=[field])
            value = str(response["fields"][field][0])
            values_list.append(value)

        return values_list

Esempio n. 51

0

Mostra file

class ElasticsearchDataStore(datastore.DataStore):
    """Datastore implementation backed by Elasticsearch.

    Wraps an ``Elasticsearch`` client and provides query building, searching,
    single event retrieval, document counting, event labelling, index
    creation and queued bulk import of events.
    """

    def __init__(self, host=u'127.0.0.1', port=9200):
        """Create a Elasticsearch client.

        Args:
            host: Hostname or IP address of the Elasticsearch node.
            port: Port of the Elasticsearch node.
        """
        super(ElasticsearchDataStore, self).__init__()
        self.client = Elasticsearch([{u'host': host, u'port': port}])
        # Counts the events handed to import_event() under the key 'events'.
        self.import_counter = Counter()
        # Pending bulk payload: alternating action header / event document.
        self.import_events = []

    @staticmethod
    def _build_label_query(sketch_id, label_name):
        """Build Elasticsearch query for Timesketch labels.

        Args:
            sketch_id: Integer of sketch primary key.
            label_name: Name of the label to search for.

        Returns:
            Elasticsearch query as a dictionary.
        """
        query_dict = {
            u'query': {
                u'filtered': {
                    u'filter': {
                        u'nested': {
                            u'filter': {
                                u'bool': {
                                    u'must': [{
                                        u'term': {
                                            u'timesketch_label.name':
                                            label_name
                                        }
                                    }, {
                                        u'term': {
                                            u'timesketch_label.sketch_id':
                                            sketch_id
                                        }
                                    }]
                                }
                            },
                            u'path': u'timesketch_label'
                        }
                    }
                }
            }
        }
        return query_dict

    @staticmethod
    def _build_events_query(events):
        """Build Elasticsearch query for one or more document ids.

        Args:
            events: List of dictionaries, each holding the Elasticsearch
                document ID under the key 'event_id'.

        Returns:
            Elasticsearch query as a dictionary.
        """
        events_list = [event[u'event_id'] for event in events]
        query_dict = {u'query': {u'ids': {u'values': events_list}}}
        return query_dict

    @staticmethod
    def _build_field_aggregator(field_name):
        """Build Elasticsearch query for aggregation based on field.

        Args:
            field_name: Field to aggregate.

        Returns:
            Elasticsearch aggregation as a dictionary.
        """
        field_aggregation = {
            u'field_aggregation': {
                u'terms': {
                    u'field': field_name,
                    u'size': 0
                }
            }
        }
        return field_aggregation

    def build_query(self,
                    sketch_id,
                    query_string,
                    query_filter,
                    query_dsl,
                    aggregations=None):
        """Build Elasticsearch DSL query.

        When no query DSL is supplied, one is derived from the filter: a
        label query for starred events, an ids query for explicit events, or
        a query_string query otherwise. A supplied query DSL is parsed from
        its JSON string form. In all cases a sort order and aggregations are
        attached before returning.

        Args:
            sketch_id: Integer of sketch primary key
            query_string: Query string
            query_filter: Dictionary containing filters to apply
            query_dsl: JSON string containing Elasticsearch DSL query
            aggregations: Dict of Elasticsearch aggregations

        Returns:
            Elasticsearch DSL query as a dictionary
        """
        if not query_dsl:
            if query_filter.get(u'star', None):
                query_dsl = self._build_label_query(sketch_id, u'__ts_star')

            # An explicit list of events takes precedence over the star query.
            if query_filter.get(u'events', None):
                events = query_filter[u'events']
                query_dsl = self._build_events_query(events)

            if not query_dsl:
                query_dsl = {
                    u'query': {
                        u'filtered': {
                            u'query': {
                                u'query_string': {
                                    u'query': query_string
                                }
                            }
                        }
                    }
                }
            # NOTE(review): the ids query built for 'events' has no 'filtered'
            # key, so combining 'events' with 'time_start' raises KeyError
            # here; for the star query this assignment replaces the label
            # filter. Confirm callers never combine these filters.
            if query_filter.get(u'time_start', None):
                query_dsl[u'query'][u'filtered'][u'filter'] = {
                    u'range': {
                        u'datetime': {
                            u'gte': query_filter[u'time_start'],
                            u'lte': query_filter[u'time_end']
                        }
                    }
                }
            if query_filter.get(u'exclude', None):
                query_dsl[u'filter'] = {
                    u'not': {
                        u'terms': {
                            u'data_type': query_filter[u'exclude']
                        }
                    }
                }
        else:
            query_dsl = json.loads(query_dsl)

        # Make sure we are sorting.
        if not query_dsl.get(u'sort', None):
            query_dsl[u'sort'] = {
                u'datetime': query_filter.get(u'order', u'asc')
            }

        # Remove any aggregation coming from user supplied Query DSL. We have
        # no way to display this data in a good way today.
        # TODO: Revisit this and figure out if we can display the data.
        if query_dsl.get(u'aggregations', None):
            del query_dsl[u'aggregations']

        # Add any pre defined aggregations
        data_type_aggregation = self._build_field_aggregator(u'data_type')
        if aggregations:
            # Aggregations that are not a dict are silently ignored.
            if isinstance(aggregations, dict):
                if query_filter.get(u'exclude', None):
                    aggregations = {
                        u'exclude': {
                            u'filter': {
                                u'not': {
                                    u'terms': {
                                        u'field_aggregation':
                                        query_filter[u'exclude']
                                    }
                                }
                            },
                            u'aggregations': aggregations
                        },
                        u'data_type':
                        data_type_aggregation[u'field_aggregation']
                    }
                query_dsl[u'aggregations'] = aggregations
        else:
            query_dsl[u'aggregations'] = data_type_aggregation

        return query_dsl

    def search(self,
               sketch_id,
               query_string,
               query_filter,
               query_dsl,
               indices,
               aggregations=None,
               return_results=True):
        """Search ElasticSearch. This will take a query string from the UI
        together with a filter definition. Based on this it will execute the
        search request on ElasticSearch and get result back.

        Args:
            sketch_id: Integer of sketch primary key
            query_string: Query string
            query_filter: Dictionary containing filters to apply
            query_dsl: JSON string containing Elasticsearch DSL query
            indices: List of indices to query
            aggregations: Dict of Elasticsearch aggregations
            return_results: Boolean indicating if results should be returned

        Returns:
            The response of the Elasticsearch search call, or an empty result
            dictionary when there are no indices to query.
        """
        # Limit the number of returned documents.
        default_limit = 500  # Maximum events to return
        limit_results = query_filter.get(u'limit', default_limit)

        # Exit early if we have no indices to query
        if not indices:
            return {u'hits': {u'hits': [], u'total': 0}, u'took': 0}

        # Check if we have specific events to fetch and get indices.
        if query_filter.get(u'events', None):
            indices = {event[u'index'] for event in query_filter[u'events']}

        query_dsl = self.build_query(sketch_id, query_string, query_filter,
                                     query_dsl, aggregations)

        # Default search type for elasticsearch is query_then_fetch.
        search_type = u'query_then_fetch'
        if not return_results:
            search_type = u'count'

        # Suppress the lint error because elasticsearch-py adds parameters
        # to the function with a decorator and this makes pylint sad.
        # pylint: disable=unexpected-keyword-arg
        return self.client.search(body=query_dsl,
                                  index=list(indices),
                                  size=limit_results,
                                  search_type=search_type,
                                  _source_include=[
                                      u'datetime', u'timestamp', u'message',
                                      u'timestamp_desc', u'timesketch_label',
                                      u'tag'
                                  ])

    def get_event(self, searchindex_id, event_id):
        """Get one event from the datastore.

        Aborts the request with HTTP_STATUS_CODE_NOT_FOUND when the client
        raises NotFoundError.

        Args:
            searchindex_id: String of ElasticSearch index id
            event_id: String of ElasticSearch event id

        Returns:
            Event document in JSON format
        """
        try:
            # Suppress the lint error because elasticsearch-py adds parameters
            # to the function with a decorator and this makes pylint sad.
            # pylint: disable=unexpected-keyword-arg
            return self.client.get(index=searchindex_id,
                                   id=event_id,
                                   _source_exclude=[u'timesketch_label'])
        except NotFoundError:
            abort(HTTP_STATUS_CODE_NOT_FOUND)

    def count(self, indices):
        """Count number of documents.

        Args:
            indices: List of indices.

        Returns:
            Number of documents; 0 when no indices are given.
        """
        if not indices:
            return 0
        result = self.client.count(index=indices)
        return result.get(u'count', 0)

    def set_label(self,
                  searchindex_id,
                  event_id,
                  event_type,
                  sketch_id,
                  user_id,
                  label,
                  toggle=False):
        """Set label on event in the datastore.

        Args:
            searchindex_id: String of ElasticSearch index id
            event_id: String of ElasticSearch event id
            event_type: String of ElasticSearch document type
            sketch_id: Integer of sketch primary key
            user_id: Integer of user primary key
            label: String with the name of the label
            toggle: Optional boolean value if the label should be toggled
            (add/remove). The default is False.
        """
        doc = self.client.get(index=searchindex_id, id=event_id)
        try:
            doc[u'_source'][u'timesketch_label']
        except KeyError:
            # The event has no label list yet: create an empty one first so
            # the update script below has a field to work on.
            # pylint: disable=redefined-variable-type
            doc = {u'doc': {u'timesketch_label': []}}
            self.client.update(index=searchindex_id,
                               doc_type=event_type,
                               id=event_id,
                               body=doc)

        # Choose the correct script.
        script_name = u'add_label'
        if toggle:
            script_name = u'toggle_label'
        script = {
            u'script': {
                u'file': script_name,
                u'params': {
                    u'timesketch_label': {
                        u'name': str(label),
                        u'user_id': user_id,
                        u'sketch_id': sketch_id
                    }
                }
            }
        }
        self.client.update(index=searchindex_id,
                           id=event_id,
                           doc_type=event_type,
                           body=script)

    def create_index(self, index_name=None, doc_type=u'generic_event'):
        """Create index with Timesketch settings.

        Args:
            index_name: Name of the index. When omitted (None) a new UUID is
                generated for every call.
            doc_type: Name of the document type. Default is generic_event.

        Returns:
            Index name in string format.
            Document type in string format.

        Raises:
            RuntimeError: If the index cannot be created because of a
                ConnectionError.
        """
        # A default of uuid4().hex in the signature would be evaluated only
        # once, at definition time, making every default call reuse the same
        # index name. Generate the name per call instead.
        if index_name is None:
            index_name = uuid4().hex

        _document_mapping = {
            doc_type: {
                u'properties': {
                    u'timesketch_label': {
                        u'type': u'nested'
                    }
                }
            }
        }

        if not self.client.indices.exists(index_name):
            try:
                self.client.indices.create(
                    index=index_name, body={u'mappings': _document_mapping})
            except ConnectionError:
                raise RuntimeError(u'Unable to connect to Timesketch backend.')
        # We want to return unicode here to keep SQLalchemy happy.
        index_name = unicode(index_name.decode(encoding=u'utf-8'))
        doc_type = unicode(doc_type.decode(encoding=u'utf-8'))
        return index_name, doc_type

    def import_event(self, flush_interval, index_name, event_type, event=None):
        """Add event to Elasticsearch.

        Events are queued and sent in bulk every ``flush_interval`` events.
        Calling the method without an event flushes whatever is still queued.

        Args:
            flush_interval: Number of events to queue up before indexing
            index_name: Name of the index in Elasticsearch
            event_type: Type of event (e.g. plaso_event)
            event: Event dictionary

        Returns:
            Number of events passed in for import so far.
        """
        if event:
            # Make sure we have decoded strings in the event dict.
            event = {
                k.decode(u'utf8'): v.decode(u'utf8')
                for k, v in event.items()
            }

            # Header needed by Elasticsearch when bulk inserting.
            self.import_events.append(
                {u'index': {
                    u'_index': index_name,
                    u'_type': event_type
                }})
            self.import_events.append(event)
            self.import_counter[u'events'] += 1
            if self.import_counter[u'events'] % int(flush_interval) == 0:
                self.client.bulk(index=index_name,
                                 doc_type=event_type,
                                 body=self.import_events)
                self.import_events = []
        else:
            if self.import_events:
                self.client.bulk(index=index_name,
                                 doc_type=event_type,
                                 body=self.import_events)
                # Empty the queue so a later flush does not index the same
                # events a second time.
                self.import_events = []

        return self.import_counter[u'events']

Esempio n. 52

0

Mostra file

File: site_assignement.py Progetto: ATLAS-Analytics/NetworkWeatherService

                index=rec["_index"], doc_type=rec["_type"], id=rec["_id"], body={"doc": {"srcSite": sS, "destSite": dS}}
            )
        print "records:", recs, "\t remaining:", q.qsize(), "\ttotal rec:", totr
        q.task_done()


print "make sure we are connected right."
import requests

res = requests.get("http://cl-analytics.mwt2.org:9200")
print (res.content)

es = Elasticsearch([{"host": "cl-analytics.mwt2.org", "port": 9200}])

print "documents to look into:"
print es.count(index=ind)

usrc = {"size": 0, "aggregations": {"unique_vals": {"terms": {"field": "@message.src", "size": 1000}}}}
udest = {"size": 0, "aggregations": {"unique_vals": {"terms": {"field": "@message.dest", "size": 1000}}}}
usrcs = []
udests = []

res = es.search(index=ind, body=usrc, size=10000)
for tag in res["aggregations"]["unique_vals"]["buckets"]:
    usrcs.append(tag["key"])

res = es.search(index=ind, body=udest, size=10000)
for tag in res["aggregations"]["unique_vals"]["buckets"]:
    udests.append(tag["key"])

print "unique sources: ", len(usrcs)

Esempio n. 53

0

Mostra file

File: es_load.py Progetto: rockgarden/python_demo

        "--path",
        action="store",
        default=None,
        help=
        "Path to git repo. Commits used as data to load into Elasticsearch. (Default: None",
    )

    args = parser.parse_args()

    # instantiate es client, connects to localhost:9200 by default
    es = Elasticsearch(args.host)

    # we load the repo and all commits
    load_repo(es, path=args.path)

    # run the bulk operations
    success, _ = bulk(es, UPDATES, index="git")
    print("Performed %d actions" % success)

    # we can now make docs visible for searching
    es.indices.refresh(index="git")

    # now we can retrieve the documents
    initial_commit = es.get(index="git",
                            id="20fbba1230cabbc0f4644f917c6c2be52b8a63e8")
    print("%s: %s" %
          (initial_commit["_id"], initial_commit["_source"]["committed_date"]))

    # and now we can count the documents
    print(es.count(index="git")["count"], "documents in index")

Esempio n. 54

0

Mostra file

class Elastic(object):
    """Convenience wrapper around a single Elasticsearch index.

    Bundles index management (create/delete, settings, similarity), document
    indexing and retrieval, simple field search, and term/field statistics
    derived from term vectors and field stats.
    """
    FIELD_CATCHALL = "catchall"
    DOC_TYPE = "doc"  # we don't make use of types
    SIMILARITY = "sim"  # we always use this similarity
    ANALYZER_STOP_STEM = "english"
    ANALYZER_STOP = "stop_en"

    def __init__(self, index_name):
        """Connects to the hosts in ELASTIC_HOSTS and binds to the given index.

        :param index_name: name of the index all operations are run against
        """
        self.__es = Elasticsearch(hosts=ELASTIC_HOSTS)
        self.__index_name = index_name

    @staticmethod
    def analyzed_field(analyzer=ANALYZER_STOP):
        """Returns the mapping for analyzed fields.

        Prints an error and terminates the process (exit status 0) when the
        analyzer name is not one of the supported ones.

        :param analyzer: name of the analyzer; valid options: [ANALYZER_STOP, ANALYZER_STOP_STEM]
        """
        if analyzer not in {Elastic.ANALYZER_STOP, Elastic.ANALYZER_STOP_STEM}:
            print("Error: Analyzer", analyzer, "is not valid.")
            exit(0)
        return {
            "type": "string",
            "term_vector": "with_positions_offsets",
            "analyzer": analyzer,
            "similarity": Elastic.SIMILARITY
        }

    @staticmethod
    def notanalyzed_field():
        """Returns the mapping for not-analyzed fields."""
        return {
            "type": "string",
            "index": "not_analyzed",
            "similarity": Elastic.SIMILARITY
        }

    def __gen_similarity(self, model="BM25", params=None):
        """Gets the custom similarity function.

        :param model: name of the similarity model
        :param params: optional dictionary of model parameters (not modified)
        :return: dictionary {SIMILARITY: {<params>, "type": model}}
        """
        # Work on a copy: writing "type" into the argument would alter the
        # caller's dictionary (or a shared default) behind its back.
        similarity = dict(params) if params else {}
        similarity["type"] = model
        return {Elastic.SIMILARITY: similarity}

    def __gen_analyzers(self):
        """Gets custom analyzers.
        We include customized analyzers in the index setting, a field may or may not use it.
        """
        analyzer = {"type": "standard", "stopwords": "_english_"}
        analyzers = {"analyzer": {Elastic.ANALYZER_STOP: analyzer}}
        return analyzers

    def analyze_query(self, query, analyzer=ANALYZER_STOP):
        """Analyzes the query.

        :param query: raw query
        :param analyzer: name of analyzer
        :return: analyzed tokens joined by single spaces, in position order
        """
        tokens = self.__es.indices.analyze(index=self.__index_name,
                                           body=query,
                                           analyzer=analyzer)["tokens"]
        ordered = sorted(tokens, key=lambda x: x["position"])
        return " ".join(t["token"] for t in ordered)

    def get_mapping(self):
        """Returns mapping definition for the index."""
        mapping = self.__es.indices.get_mapping(index=self.__index_name,
                                                doc_type=self.DOC_TYPE)
        return mapping[self.__index_name]["mappings"][
            self.DOC_TYPE]["properties"]

    def get_settings(self):
        """Returns index settings."""
        return self.__es.indices.get_settings(
            index=self.__index_name)[self.__index_name]["settings"]["index"]

    def __update_settings(self, settings):
        """Updates the index settings.

        The index is closed before the settings are written and reopened and
        refreshed afterwards.
        """
        self.__es.indices.close(index=self.__index_name)
        self.__es.indices.put_settings(index=self.__index_name, body=settings)
        self.__es.indices.open(index=self.__index_name)
        self.__es.indices.refresh(index=self.__index_name)

    def update_similarity(self, model="BM25", params=None):
        """Updates the similarity function "sim", which is fixed for all index fields.

         The method and param should match elastic settings:
         https://www.elastic.co/guide/en/elasticsearch/reference/2.3/index-modules-similarity.html

        :param model: name of the elastic model
        :param params: dictionary of params based on elastic (not modified)
        """
        old_similarity = self.get_settings()["similarity"]
        new_similarity = self.__gen_similarity(model, params)
        # We only update the similarity if it is different from the old one.
        # this avoids unnecessary closing of the index
        if old_similarity != new_similarity:
            self.__update_settings({"similarity": new_similarity})

    def delete_index(self):
        """Deletes an index."""
        self.__es.indices.delete(index=self.__index_name)
        print("Index <" + self.__index_name + "> has been deleted.")

    def create_index(self, mappings, force=False):
        """Creates index (if it doesn't exist).

        :param mappings: field mappings
        :param force: forces index creation (overwrites if already exists)
        """
        if self.__es.indices.exists(self.__index_name):
            if force:
                self.delete_index()
            else:
                print("Index already exists. No changes were made.")
                return

        # sets general elastic settings; copied so that the module-level
        # ELASTIC_SETTINGS is not modified by the assignments below
        body = dict(ELASTIC_SETTINGS)

        # sets the global index settings
        # number of shards should be always set to 1; otherwise the stats would not be correct
        body["settings"] = {
            "analysis": self.__gen_analyzers(),
            "similarity": self.__gen_similarity(),
            "index": {
                "number_of_shards": 1,
                "number_of_replicas": 0
            },
        }

        # sets the field mappings
        body["mappings"] = {self.DOC_TYPE: {"properties": mappings}}

        # creates the index
        self.__es.indices.create(index=self.__index_name, body=body)
        print("New index <" + self.__index_name + "> is created.")

    def add_docs_bulk(self, docs):
        """Adds a set of documents to the index in a bulk.

        Nothing is sent when ``docs`` is empty.

        :param docs: dictionary {doc_id: doc}
        """
        actions = [
            {
                "_index": self.__index_name,
                "_type": self.DOC_TYPE,
                "_id": doc_id,
                "_source": doc
            }
            for doc_id, doc in docs.items()
        ]

        if actions:
            helpers.bulk(self.__es, actions)

    def add_doc(self, doc_id, contents):
        """Adds a document with the specified contents to the index.

        :param doc_id: document ID
        :param contents: content of document
        """
        self.__es.index(index=self.__index_name,
                        doc_type=self.DOC_TYPE,
                        id=doc_id,
                        body=contents)

    def get_doc(self, doc_id, fields=None, source=True):
        """Gets a document from the index based on its ID.

        :param doc_id: document ID
        :param fields: list of fields to return (default: all)
        :param source: return document source as well (default: yes)
        """
        return self.__es.get(index=self.__index_name,
                             doc_type=self.DOC_TYPE,
                             id=doc_id,
                             fields=fields,
                             _source=source)

    def search(self, query, field, num=100, fields_return="", start=0):
        """Searches in a given field using the similarity method configured in the index for that field.

        :param query: query string
        :param field: field to search in
        :param num: number of hits to return (default: 100)
        :param fields_return: additional document fields to be returned
        :param start: starting offset (default: 0)
        :return: dictionary of document IDs with scores
        """
        hits = self.__es.search(index=self.__index_name,
                                q=query,
                                df=field,
                                _source=False,
                                size=num,
                                fields=fields_return,
                                from_=start)["hits"]["hits"]
        return {hit["_id"]: hit["_score"] for hit in hits}

    def get_field_stats(self, field):
        """Returns stats of the given field."""
        return self.__es.field_stats(
            index=self.__index_name,
            fields=[field])["indices"]["_all"]["fields"][field]

    def get_fields(self):
        """Returns name of fields in the index."""
        return list(self.get_mapping().keys())

    # =========================================
    # ================= Stats =================
    # =========================================
    def __get_termvector(self, doc_id, field, term_stats=False):
        """Returns a term vector for a given document field, including global field and term statistics.
        Term stats can have a serious performance impact; should be set to true only if it is needed!

        :param doc_id: document ID
        :param field: field name
        :param term_stats: whether term statistics are requested (default: no)
        :return: the "terms" dictionary of the field; empty if not present
        """
        tv = self.__es.termvectors(index=self.__index_name,
                                   doc_type=self.DOC_TYPE,
                                   id=doc_id,
                                   fields=field,
                                   term_statistics=term_stats)
        return tv.get("term_vectors", {}).get(field, {}).get("terms", {})

    def __get_coll_termvector(self, term, field):
        """Returns a term vector containing collection stats of a term.

        The stats are read from the term vector of the first document that
        matches the term; an empty dictionary is returned if none matches.
        """
        hits = self.search(term, field, num=1)
        if not hits:
            return {}
        doc_id = next(iter(hits))
        if not doc_id:
            return {}
        return self.__get_termvector(doc_id, field, term_stats=True)

    def num_docs(self):
        """Returns the number of documents in the index."""
        return self.__es.count(index=self.__index_name,
                               doc_type=self.DOC_TYPE)["count"]

    def num_fields(self):
        """Returns number of fields in the index."""
        return len(self.get_mapping())

    def doc_count(self, field):
        """Returns number of documents with at least one term for the given field."""
        return self.get_field_stats(field)["doc_count"]

    def coll_length(self, field):
        """Returns length of field in the collection."""
        return self.get_field_stats(field)["sum_total_term_freq"]

    def avg_len(self, field):
        """Returns average length of a field in the collection."""
        return self.coll_length(field) / self.doc_count(field)

    def doc_length(self, doc_id, field):
        """Returns length of a field in a document."""
        return sum(self.term_freqs(doc_id, field).values())

    def doc_freq(self, term, field):
        """Returns document frequency for the given term and field."""
        tv = self.__get_coll_termvector(term, field)
        return tv.get(term, {}).get("doc_freq", 0)

    def coll_term_freq(self, term, field):
        """ Returns collection term frequency for the given field."""
        tv = self.__get_coll_termvector(term, field)
        return tv.get(term, {}).get("ttf", 0)

    def term_freqs(self, doc_id, field):
        """Returns term frequencies for a given document and field.

        :return dictionary of terms with their frequencies; {term: freq, ...}
        """
        tv = self.__get_termvector(doc_id, field)
        return {term: val["term_freq"] for term, val in tv.items()}

    def term_freq(self, doc_id, field, term):
        """Returns frequency of a term in a given document and field."""
        return self.term_freqs(doc_id, field).get(term, 0)

Esempio n. 55

0

Mostra file

File: load.py Progetto: tylerjharden/elasticsearch-py

    load_repo(es, path=args.path)

    # run the bulk operations
    success, _ = bulk(es, REPO_ACTIONS, index='git', raise_on_error=True)
    print('Performed %d actions' % success)

    # now we can retrieve the documents
    es_repo = es.get(index='git', doc_type='repos', id='elasticsearch')
    print('%s: %s' % (es_repo['_id'], es_repo['_source']['description']))

    # update - add java to es tags
    es.update(
        index='git',
        doc_type='repos',
        id='elasticsearch',
        body={
          "script": {
            "inline" : "ctx._source.tags.add(params.tag)",
            "params" : {
              "tag" : "java"
            }
          }
        }
    )

    # refresh to make the documents available for search
    es.indices.refresh(index='git')

    # and now we can count the documents
    print(es.count(index='git')['count'], 'documents in index')

Esempio n. 56

0

Mostra file

File: revalidate.py Progetto: TimKettenacker/CdhHodgepodge

# read in all zip files from depots
# NOTE: `file` and `zip` shadow builtins; the names are kept because parts of
# the script outside this excerpt may still refer to them.
os.chdir('C:\\Users\\Administrator\\Documents')
file = pandas.read_csv('Depot_PLZ_Zuordnung_2016_12.csv', sep=';', converters={'depot': str, 'postcode': str})
frag1 = '{"query": { "match": { "zip":"'
frag2 = '"}}}'

total_count = []
zipCollector = []
# Running sum of the per-zip record counts collected so far.
records_fetched = 0

# query all zips in file and count records respectively
for zip in file['postcode']:
    zipCollector.append(zip)
    match = frag1 + zip + frag2
    # choose valid index name
    res = es.count(index="customer-d01", body=match)
    total_count.append(res['count'])
    records_fetched += res['count']
    # Stop once the accumulated number of records exceeds the budget.
    # Comparing the list itself to an int is always True on Python 2 (the
    # loop would stop after the first zip) and a TypeError on Python 3.
    if records_fetched > 400000:
        break

# write collected zips to delta.csv to be picked up by cdh
print("Requested amount of zips has been fetched. Writing to file path now. Starting revalidation process.")
zipCollector = pandas.DataFrame(zipCollector)
zipCollector.to_csv(path_or_buf="C:\\uniserv\\cdh\\temp\\plzdelta\\delta.csv", index=False, encoding="UTF-8", header=False)
# slice data to contain everything from last cutoff and write to wd
# (rows after the first occurrence of the last zip that was queried)
crtrow = file[file['postcode'] == zip].index.tolist()
out_df = file[(crtrow[0]+1):len(file)]
path = os.getcwd() + "\\Depot_PLZ_Zuordnung_2016_12.csv"
out_df.to_csv(path_or_buf=path, index=False, encoding="UTF-8", header=False, sep=";")
# invoke shell commands to call revalidation and update-histnames
os.chdir("C:\\Uniserv\\cdh\\tools")

Esempio n. 57

0

Mostra file

File: view.py Progetto: MiscCoding/gsp_web

def getTopBoard():
    """Collect the dashboard "top board" counters and return them as JSON.

    Sources, in the order they are queried:

    * ``dashboard.topboardQuery`` through ``db_session`` (per-code counts,
      split into today's rows and all other rows),
    * row counts of ``Rules_BlackList`` (``source=750``) and ``Rules_CNC``,
    * Elasticsearch ``count``/``search`` calls on the ``gsp*`` indices.

    :return: JSON string (``json.dumps``) of the collected counters.
    """
    results = db_session.execute(dashboard.topboardQuery)

    total = 0
    before_total = 0

    # These two aggregate counters are never computed here (the queries that
    # fed them are disabled), so they are always reported as 0.
    totalTodayUriAnalysisCount = 0
    totalTodayMaliciousFileCount = 0

    # Blacklist rule count from the relational database
    totalMaliciousFileCountRDBMS = Rules_BlackList.query.filter_by(
        source=750).count()

    # CNC url count from the relational database
    totalMaliciousUrlCountRDBMS = Rules_CNC.query.count()

    # One client for every Elasticsearch call below. The port is cast to int
    # for all calls (originally only the later clients did so).
    es = Elasticsearch([{
        'host': app.config['ELASTICSEARCH_URI'],
        'port': int(app.config['ELASTICSEARCH_PORT'])
    }])

    def count_today(kind, source):
        # Number of documents matching today's query for one kind/source.
        body = todayURLFileCount(kind, source)
        res = es.count(index="gsp*", doc_type="analysis_results", body=body)
        return res['count']

    def count_yesterday(kind):
        # Total hits of yesterday's analysis query for one kind.
        body = dashboard.yesterdayUrlFileAnalysis(request, kind)
        res = es.search(index="gsp*", doc_type="analysis_results", body=body)
        return int(res['hits']['total'])

    totalTodayUriAnalysisCountNPC = count_today("uri", "NPC")
    totalTodayUriAnalysisCountIMAS = count_today("uri", "IMAS")
    totalTodayMaliciousFileCountNPC = count_today("file", "NPC")
    totalTodayMaliciousFileCountIMAS = count_today("file", "IMAS")
    totalTodayMaliciousFileCountZombieZero = count_today("file", "zombie zero")

    totalYesterdayMaliciousUrlCount = count_yesterday("uri")
    totalYesterdayMaliciousFileCount = count_yesterday("file")

    result = {
        'spread': 0,
        'cnc': 0,
        'bcode': 0,
        'before_spread': 0,
        'before_cnc': 0,
        'before_bcode': 0,
        'link': 0,
        'before_link': 0,
        'uri': 0,
        'before_uri': 0,
        'file': 0,
        'before_file': 0,
        'totalTodayUriAnalysisCount': totalTodayUriAnalysisCount,
        'totalTodayUriAnalysisCountNPC': totalTodayUriAnalysisCountNPC,
        'totalTodayUriAnalysisCountIMAS': totalTodayUriAnalysisCountIMAS,
        'totalTodayMaliciousFileCount': totalTodayMaliciousFileCount,
        'totalTodayMaliciousFileCountNPC': totalTodayMaliciousFileCountNPC,
        'totalTodayMaliciousFileCountIMAS': totalTodayMaliciousFileCountIMAS,
        'totalTodayMaliciousFileCountZombieZero':
        totalTodayMaliciousFileCountZombieZero,
        'totalMaliciousUrlQuery': 0,
        'totalYesterdayMaliciousUrlCount': totalYesterdayMaliciousUrlCount,
        'totalYesterdayMaliciousFileCount': totalYesterdayMaliciousFileCount,
    }

    # region db query
    code_to_key = {"003": 'spread', "001": 'cnc', "-": 'bcode'}
    today = datetime.datetime.now().strftime("%Y-%m-%d")
    for _row in results:
        is_today = _row['date'] == today
        key = code_to_key.get(_row['Code'])
        if key is not None:
            result[key if is_today else 'before_' + key] = _row['count']
        # Every row counts towards the total of its day. The original only
        # added "-" rows to before_total because the statement was nested one
        # level too deep, unlike the matching `total +=` for today's rows.
        if is_today:
            total += _row['count']
        else:
            before_total += _row['count']
    # endregion db query

    # region es query
    bucket_to_key = {
        "link_dna_tuple5": 'link',
        "url_jobs": 'uri',
        "url_crawleds": 'file',
    }

    def collect_buckets(range_from, range_to, prefix):
        # Store the doc_count of each known bucket under prefix + key and
        # return the sum of the stored counts.
        query = dashboard.topboardEsQuery(range_from, range_to)
        # Querying "gsp*" is a temporary workaround for the url_crawlds index
        # issue (18-03-06).
        res = es.search(index="gsp*", body=query, request_timeout=30)
        subtotal = 0
        for bucket in res['aggregations']['types']['buckets']:
            key = bucket_to_key.get(bucket['key'])
            if key is not None:
                result[prefix + key] = bucket['doc_count']
                subtotal += bucket['doc_count']
        return subtotal

    total += collect_buckets("now-1d/d", "now/d", '')
    before_total += collect_buckets("now-2d/d", "now-1d/d", 'before_')
    # endregion es query

    result['totalMaliciousUrlCount'] = totalMaliciousUrlCountRDBMS

    # 'cnc' reports the blacklist row count, overriding the value read from
    # the topboard query above.
    result['cnc'] = totalMaliciousFileCountRDBMS
    # NOTE(review): hard-coded value kept from the original - confirm whether
    # this should come from data (note the key is 'cnc_before', not
    # 'before_cnc').
    result['cnc_before'] = 13

    result['total'] = total
    result['before_total'] = before_total

    return json.dumps(result)

Esempio n. 58

0

Mostra file

File: ESearch.py Progetto: Srihari231092/LyricsAnalysis

class ESearch():
    """Helper around an Elasticsearch connection for a single article index.

    Bundles index creation, bulk upload of pandas dataframes, counting and
    de-duplication of documents that share the same ``URL`` value.
    """

    def __init__(self):
        """
        Initialize class parameters
        """
        # Connection object; set by connect_to_es()
        self._es = None
        self._index_name = "article_data"
        # Source field whose value identifies duplicate documents
        self._hash_field = "URL"
        # Maps md5 digest of the hash field -> list of document ids
        self._dict_of_duplicate_docs = {}

    def connect_to_es(self, host_name=ELASTIC_SEARCH_ENDPOINT):
        """
        Establishes a connection to the Elastic search server.
        If the server is pingable, returns the connection object.
        Else returns None (``self._es`` is still set in that case).
        :param host_name: host passed to the Elasticsearch client
        :return: connection-object or None
        """
        self._es = Elasticsearch(hosts=[host_name], timeout=60)
        # Ping the connection to check if it's alive
        if self._es.ping():
            return self._es
        return None

    def index_exists(self, index_name=None):
        """
        Checks whether an index exists.
        :param index_name: index to check; falls back to the configured index
        :return: result of ``indices.exists``
        """
        if not index_name:
            index_name = self._index_name
        return self._es.indices.exists(index_name)

    def _make_mapping(self):
        """
        Saves the field mapping into the configured index.
        Failures are printed, not raised.
        :return:
        """
        m = Mapping()
        # add fields
        m.field('Title', 'text')
        m.field('Text', 'text')
        m.field('Publish_Date',
                'date')  # date type complicates matters across websites
        m.field('URL', 'text')
        m.field('Scrape_Date',
                'date')  # date type complicates matters across websites
        m.field('Source', 'text')
        m.field('Search_Keyword', 'text')  # save list as text?
        m.field('SE_Is_Risk', 'boolean')
        m.field('GP_Is_Risk', 'boolean')
        m.field('RG_Is_Risk', 'boolean')
        m.field('SE_Risk_Rating', 'float')
        m.field('GP_Risk_Rating', 'float')
        m.field('RG_Risk_Rating', 'float')
        m.field('SE_SnP_Open', 'float')
        m.field('SE_SnP_Close', 'float')
        m.field('SE_AbbV_Open', 'float')
        m.field('SE_AbbV_Close', 'float')
        m.field('SE_XBI_Open', 'float')
        m.field('SE_XBI_Close', 'float')
        m.field('SE_SnP_Open_Plus1', 'float')
        m.field('SE_SnP_Close_Plus1', 'float')
        m.field('SE_AbbV_Open_Plus1', 'float')
        m.field('SE_AbbV_Close_Plus1', 'float')
        m.field('SE_XBI_Open_Plus1', 'float')
        m.field('SE_XBI_Close_Plus1', 'float')
        m.field('SE_SentimentScore', 'float')
        m.field('SE_SentimentPolarity', 'float')
        m.field('CompositeScore', 'float')
        m.field('RG_FDA_Warning', 'boolean')
        m.field('GP_SentimentScore', 'float')
        m.field('GP_SentimentPolarity', 'float')
        m.field('GP_Location', 'text')
        m.field('GP_Country', 'text')
        m.field('Article_references', 'float')
        m.field('Is_source_type_RG', 'boolean')
        m.field('Is_source_type_SE', 'boolean')
        m.field('Is_source_type_GP', 'boolean')

        # save the mapping into the configured index
        try:
            m.save(self._index_name)
        except Exception as e:
            print("Could not save schema!", e)

    def create_index(self):
        """
        Creates the index if it doesn't exist
        :return: 0 if the index exists or was created, -1 on error
        """
        # create the index if it doesn't exist
        if not self.index_exists():
            try:
                # NOTE(review): `index` is not defined in this class; it is
                # presumably a module-level index object - confirm.
                index.create()
                self._make_mapping()
                print("Index was created :", index.exists())
            except Exception as e:
                print("~~~Index exists error")
                print(e)
                return -1
        else:
            print("Index already exists", self._index_name)
        return 0

    def get_index_mapping(self):
        """
        Retrieves the index mapping
        :return: Index mapping JSON object if success, -1 if error
        """
        try:
            return self._es.indices.get_mapping(index=self._index_name)
        except Exception as e:
            print("~~~Get index mapping error")
            print(e)
            return -1

    def get_count(self, search_obj=None):
        """
        Counts documents in the configured index.
        :param search_obj: optional query body passed to ``count``
        :return: the raw ``count`` response
        """
        return self._es.count(index=self._index_name, body=search_obj)

    def upload_dataframe(self, df):
        """
        Uploads a dataframe into the index
        :param df: Dataframe (pandas)
        :return: 0 if success, -1 if failure
        """
        def rec_to_actions(df):
            # Bulk format: one action line followed by one document line.
            for record in df.to_dict(orient="records"):
                yield ('{ "index" : { "_index" : "%s", "_type" : "%s" }}' %
                       (self._index_name, "_doc"))
                yield (json.dumps(record, default=int))

        if not self.index_exists():
            print("!!!INDEX DOES NOT EXIST -- RETURNING!!!")
            return -1

        try:
            # make the bulk call, and get a response
            response = self._es.bulk(rec_to_actions(df))  # return a dict
            if not response["errors"]:
                print("Records uploaded")
            else:
                print("Could not upload data ")
                print(response)
                return -1
        except Exception as e:
            print("\nERROR:", e)
            return -1

        return 0

    # Process documents returned by the current search/scroll
    def _populate_dict_of_duplicate_docs(self, hits):
        """
        Groups the ids of ``hits`` by the md5 digest of their hash field.
        :param hits: iterable of search hits with ``_id`` and ``_source``
        """
        for item in hits:
            combined_key = str(item['_source'][self._hash_field])
            hashval = hashlib.md5(combined_key.encode('utf-8')).digest()

            # A new hashval creates a fresh list; either way the id is
            # appended to the list stored for that hashval.
            self._dict_of_duplicate_docs.setdefault(hashval,
                                                    []).append(item["_id"])

    # Loop over all documents in the index, and populate the
    # dict_of_duplicate_docs data structure.
    def _scroll_over_all_docs(self):
        """
        Scrolls over every document of the index and rebuilds the
        hash -> ids table from scratch.
        """
        # Start from an empty table. Without the reset a second run would
        # list every surviving id twice, so it would look like a duplicate
        # of itself and be deleted.
        self._dict_of_duplicate_docs = {}

        data = self._es.search(index=self._index_name,
                               scroll='1m',
                               body={"query": {
                                   "match_all": {}
                               }})

        # Get the scroll ID
        sid = data['_scroll_id']
        scroll_size = len(data['hits']['hits'])

        # Before scroll, process current batch of hits
        self._populate_dict_of_duplicate_docs(data['hits']['hits'])

        while scroll_size > 0:
            data = self._es.scroll(scroll_id=sid, scroll='2m')

            # Process current batch of hits
            self._populate_dict_of_duplicate_docs(data['hits']['hits'])

            # Update the scroll ID
            sid = data['_scroll_id']

            # Get the number of results that returned in the last scroll
            scroll_size = len(data['hits']['hits'])

    def _loop_over_hashes_and_remove_duplicates(self):
        """
        Deletes all but the first document of every group of documents that
        share the same URL. Prints the URLs of the inspected groups and
        stops deleting at the first delete error.
        """
        urls_to_delete = []
        ids_to_delete = []
        # Search through the hash of doc values to see if any
        # duplicate hashes have been found
        for hashval, array_of_ids in self._dict_of_duplicate_docs.items():
            if len(array_of_ids) <= 1:
                continue
            # Get the documents that have mapped to the current hashval
            matching_docs = self._es.mget(index=self._index_name,
                                          body={"ids": array_of_ids})
            # Check if the URLs are truly the same URLs
            dict_url_ids = {}
            for doc in matching_docs['docs']:
                if "_source" not in doc:
                    # The id no longer resolves to a document; skip it
                    # instead of failing on the missing source.
                    continue
                dict_url_ids.setdefault(doc["_source"].get("URL"),
                                        []).append(doc["_id"])
            for url, ids in dict_url_ids.items():
                urls_to_delete.append(url)
                # keep the first id of every URL, delete the rest
                ids_to_delete.extend(ids[1:])

        for u in urls_to_delete:
            print(u)

        for idd in ids_to_delete:
            try:
                self._es.delete(index=self._index_name, id=idd)
            except Exception as e:
                print(e)
                break

    def remove_duplicates(self):
        """
        Scans the whole index and removes documents with duplicate URLs.
        """
        self._scroll_over_all_docs()
        self._loop_over_hashes_and_remove_duplicates()

Esempio n. 59

0

Mostra file

File: es_connector.py Progetto: FirasOdeh/MABED

class Es_connector:
    def __init__(self,
                 host='localhost',
                 port=9200,
                 user='******',
                 password='******',
                 timeout=1000,
                 index="test2",
                 doc_type="tweet"):
        """Store the connection settings and create the Elasticsearch client.

        :param host: host handed to the client
        :param port: port handed to the client
        :param user: user name for HTTP authentication
        :param password: password for HTTP authentication
        :param timeout: timeout handed to the client
        :param index: index used by all helper methods
        :param doc_type: document type used by all helper methods
        """
        # Query defaults used by the helper methods
        self.size = 500
        self.body = {"query": {"match_all": {}}}
        self.result = []

        # Connection and target configuration
        self.host, self.port = host, port
        self.user, self.password = user, password
        self.timeout = timeout
        self.index, self.doc_type = index, doc_type

        # Elasticsearch client (plain HTTP, basic auth)
        credentials = (self.user, self.password)
        self.es = Elasticsearch([self.host],
                                http_auth=credentials,
                                port=self.port,
                                timeout=self.timeout,
                                use_ssl=False)

    # def search(self, query):
    #     res = self.es.search(
    #         index=self.index,
    #         doc_type=self.doc_type,
    #         body={"query": query},
    #         size=self.size,
    #     )
    #     if res['hits']['total']>0:
    #         print("Got %d Hits:" % res['hits']['total'])
    #     return res

    def search(self, query):
        res = self.es.search(
            index=self.index,
            doc_type=self.doc_type,
            body=query,
            size=self.size,
        )
        return res

    def search_size(self, query, size=500):
        res = self.es.search(
            index=self.index,
            doc_type=self.doc_type,
            body=query,
            size=size,
        )
        return res

    def count(self, query):
        res = self.es.count(index=self.index,
                            doc_type=self.doc_type,
                            body=query)
        return res

    def post(self, query):
        res = self.es.index(index=self.index,
                            doc_type=self.doc_type,
                            body=query)
        return res

    def update_field(self, id, field, value):
        res = self.es.update(index=self.index,
                             doc_type=self.doc_type,
                             id=id,
                             body={"doc": {
                                 field: value
                             }})
        if res['result'] == "updated":
            return res
        else:
            return False

    def update(self, id, query):
        res = self.es.update(index=self.index,
                             doc_type=self.doc_type,
                             id=id,
                             body=query)
        if res['result'] == "updated":
            return res
        else:
            return False

    def delete(self, id):
        res = self.es.delete(index=self.index, doc_type=self.doc_type, id=id)
        if res['result'] == "deleted":
            return res
        else:
            return False

    def get(self, id):
        res = self.es.get(index=self.index, doc_type=self.doc_type, id=id)
        if res['found'] == True:
            # print(res)
            return res
        else:
            return False

    def bigSearch(self, query):
        res = []

        # Process hits here
        def process_hits(hits, results):
            for item in hits:
                results.append(item)
            return results

        # Check index exists
        if not self.es.indices.exists(index=self.index):
            # print("Index " + self.index + " not exists")
            exit()

        # Init scroll by search
        data = self.es.search(
            index=self.index,
            doc_type=self.doc_type,
            scroll='15m',
            size=self.size,
            body=query,
        )

        # Get the scroll ID
        sid = data['_scroll_id']
        scroll_size = len(data['hits']['hits'])

        # Before scroll, process current batch of hits
        res = process_hits(data['hits']['hits'], res)

        while scroll_size > 0:
            "Scrolling..."
            data = self.es.scroll(scroll_id=sid, scroll='15m')

            # Process current batch of hits
            res = process_hits(data['hits']['hits'], res)

            # Update the scroll ID
            sid = data['_scroll_id']

            # Get the number of results that returned in the last scroll
            scroll_size = len(data['hits']['hits'])

        return res

    def init_paginatedSearch(self, query):
        res = []

        # Process hits here
        def process_hits(hits, results):
            for item in hits:
                results.append(item)
            return results

        # Check index exists
        if not self.es.indices.exists(index=self.index):
            # print("Index " + self.index + " not exists")
            exit()

        # Init scroll by search
        data = self.es.search(
            index=self.index,
            doc_type=self.doc_type,
            scroll='15m',
            size=self.size,
            body=query,
        )

        # Get the scroll ID
        sid = data['_scroll_id']
        scroll_size = len(data['hits']['hits'])

        # Before scroll, process current batch of hits
        res = process_hits(data['hits']['hits'], res)
        total = data['hits']['total']
        scroll_size = total - scroll_size

        return {
            "results": res,
            "sid": sid,
            "scroll_size": scroll_size,
            "total": total
        }

    def loop_paginatedSearch(self, sid, scroll_size):
        res = []

        # Process hits here
        def process_hits(hits, results):
            for item in hits:
                results.append(item)
            return results

        if scroll_size > 0:
            data = self.es.scroll(scroll_id=sid, scroll='15m')
            # Process current batch of hits
            res = process_hits(data['hits']['hits'], res)
            # Update the scroll ID
            sid = data['_scroll_id']
            # Get the number of results that returned in the last scroll
            scroll_size = len(data['hits']['hits'])

        return {"results": res, "sid": sid, "scroll_size": scroll_size}

    def getTweets(self):
        # Process hits here
        def process_hits(hits):
            for item in hits:
                self.result.append(item)

        # Check index exists
        if not self.es.indices.exists(index=self.index):
            # print("Index " + self.index + " not exists")
            exit()

        body = self.body
        body = {
            "_source": ["text", "timestamp_ms", "imagesCluster"],
            "query": {
                "match_all": {}
            }
        }

        # Init scroll by search
        data = self.es.search(index=self.index,
                              doc_type=self.doc_type,
                              scroll='15m',
                              size=self.size,
                              body=body)

        # Get the scroll ID
        sid = data['_scroll_id']
        scroll_size = len(data['hits']['hits'])

        # Before scroll, process current batch of hits
        process_hits(data['hits']['hits'])

        while scroll_size > 0:
            "Scrolling..."
            data = self.es.scroll(scroll_id=sid, scroll='15m')

            # Process current batch of hits
            process_hits(data['hits']['hits'])

            # Update the scroll ID
            sid = data['_scroll_id']

            # Get the number of results that returned in the last scroll
            scroll_size = len(data['hits']['hits'])

        text = self.result[0]['_source']['text']
        date = self.result[0]['_source']['timestamp_ms']
        return self.result

    def getFilteredTweets(self, session, status):
        # Process hits here
        def process_hits(hits):
            for item in hits:
                self.result.append(item)

        # Check index exists
        if not self.es.indices.exists(index=self.index):
            # print("Index " + self.index + " not exists")
            exit()

        session = 'session_' + session
        body = self.body
        body = {
            "_source": ["text", "timestamp_ms", "imagesCluster"],
            "query": {
                "terms": {
                    session: status
                }
            }
        }

        # Init scroll by search
        data = self.es.search(index=self.index,
                              doc_type=self.doc_type,
                              scroll='15m',
                              size=self.size,
                              body=body)

        # Get the scroll ID
        sid = data['_scroll_id']
        scroll_size = len(data['hits']['hits'])

        # Before scroll, process current batch of hits
        process_hits(data['hits']['hits'])

        while scroll_size > 0:
            "Scrolling..."
            data = self.es.scroll(scroll_id=sid, scroll='15m')

            # Process current batch of hits
            process_hits(data['hits']['hits'])

            # Update the scroll ID
            sid = data['_scroll_id']

            # Get the number of results that returned in the last scroll
            scroll_size = len(data['hits']['hits'])

        # text = self.result[0]['_source']['text']
        # date = self.result[0]['_source']['timestamp_ms']
        return self.result

    def update_all(self, field, value):
        # Process hits here
        def process_hits(hits):
            for item in hits:
                self.update_field(item['_id'], field, value)

        # Check index exists
        if not self.es.indices.exists(index=self.index):
            # print("Index " + self.index + " not exists")
            exit()

        # Init scroll by search
        data = self.es.search(index=self.index,
                              doc_type=self.doc_type,
                              scroll='15m',
                              size=self.size,
                              body=self.body)

        # Get the scroll ID
        sid = data['_scroll_id']
        scroll_size = len(data['hits']['hits'])

        # Before scroll, process current batch of hits
        # print(data['hits']['total'])
        process_hits(data['hits']['hits'])

        while scroll_size > 0:
            "Scrolling..."
            data = self.es.scroll(scroll_id=sid, scroll='15m')

            # Process current batch of hits
            process_hits(data['hits']['hits'])

            # Update the scroll ID
            sid = data['_scroll_id']

            # Get the number of results that returned in the last scroll
            scroll_size = len(data['hits']['hits'])
        return True

    def update_query(self, query, field, value):
        # Process hits here
        def process_hits(hits):
            for item in hits:
                self.update_field(item['_id'], field, value)

        # Check index exists
        if not self.es.indices.exists(index=self.index):
            # print("Index " + self.index + " not exists")
            exit()

        # Init scroll by search
        data = self.es.search(index=self.index,
                              doc_type=self.doc_type,
                              scroll='15m',
                              size=self.size,
                              body=query)

        # Get the scroll ID
        sid = data['_scroll_id']
        scroll_size = len(data['hits']['hits'])

        # Before scroll, process current batch of hits
        # print(data['hits']['total'])
        process_hits(data['hits']['hits'])

        while scroll_size > 0:
            "Scrolling..."
            data = self.es.scroll(scroll_id=sid, scroll='15m')

            # Process current batch of hits
            process_hits(data['hits']['hits'])

            # Update the scroll ID
            sid = data['_scroll_id']

            # Get the number of results that returned in the last scroll
            scroll_size = len(data['hits']['hits'])
        return True

    def remove_field_all(self, field):
        # Process hits here
        def process_hits(hits):
            for item in hits:
                item['_source'].pop(field, None)
                up = self.update(
                    item['_id'],
                    {"script": "ctx._source.remove(\"" + field + "\")"})
                # print(item['_id'])
                # print(up)

        # Check index exists
        if not self.es.indices.exists(index=self.index):
            # print("Index " + self.index + " not exists")
            return False

        # Init scroll by search
        data = self.es.search(index=self.index,
                              doc_type=self.doc_type,
                              scroll='15m',
                              size=self.size,
                              body=self.body)

        # Get the scroll ID
        sid = data['_scroll_id']
        scroll_size = len(data['hits']['hits'])

        # Before scroll, process current batch of hits
        # print(data['hits']['total'])
        process_hits(data['hits']['hits'])

        while scroll_size > 0:
            "Scrolling..."
            data = self.es.scroll(scroll_id=sid, scroll='15m')

            # Process current batch of hits
            process_hits(data['hits']['hits'])

            # Update the scroll ID
            sid = data['_scroll_id']

            # Get the number of results that returned in the last scroll
            scroll_size = len(data['hits']['hits'])
        return True

    def initMABED(self):
        """Load every document of ``self.index`` into ``self.result``.

        Runs a ``match_all`` search limited to the ``text``,
        ``timestamp_ms`` and ``imagesCluster`` source fields, then follows
        the scroll cursor until a page comes back empty. Every raw hit is
        appended to ``self.result``.

        Terminates the process via ``exit()`` when the index does not exist.

        :return: ``self.result``; it is returned unchanged when the index
            holds no documents.
        """
        # Check index exists
        if not self.es.indices.exists(index=self.index):
            exit()

        body = {
            "_source": ["text", "timestamp_ms", "imagesCluster"],
            "query": {
                "match_all": {}
            }
        }

        # Init scroll by search
        data = self.es.search(index=self.index,
                              doc_type=self.doc_type,
                              scroll='15m',
                              size=self.size,
                              body=body)

        # Get the scroll ID and keep the first batch of hits
        sid = data['_scroll_id']
        hits = data['hits']['hits']
        self.result.extend(hits)

        # An empty page marks the end of the scroll
        while hits:
            data = self.es.scroll(scroll_id=sid, scroll='15m')
            hits = data['hits']['hits']
            self.result.extend(hits)

            # Update the scroll ID
            sid = data['_scroll_id']

        # No peeking at self.result[0] here: the index may be empty.
        return self.result

    def tokenize(self, text, stopwords):
        """Split ``text`` into lower-cased, filtered word tokens.

        The text is lower-cased, ellipses (``...``, ``…``, ``..``) are
        removed and the remainder is split on whitespace. A raw token is
        kept only if it is longer than three characters, is not in
        ``stopwords``, contains neither ``http`` nor ``cluster``, and has at
        least one ASCII letter. Surrounding punctuation is stripped from the
        tokens that are kept, after the filters have run.

        :param text: the document to tokenize
        :param stopwords: container of tokens to discard
        :return: list of tokens in their original order
        """
        cleaned = text.lower()
        for ellipsis in ("...", "…", ".."):
            cleaned = cleaned.replace(ellipsis, "")

        kept = []
        for raw in cleaned.split():
            # filters run on the raw token, before punctuation is trimmed
            if len(raw) <= 3 or raw in stopwords:
                continue
            if 'http' in raw or 'cluster' in raw:
                continue
            if not re.search('[a-zA-Z]', raw):
                continue
            kept.append(raw.strip(string.punctuation))
        return kept

    def range_tweets(self, start, end, stopwords_file_path, words, count):
        """Find the terms closest to ``words`` in tweets from a time window.

        Tweets whose ``@timestamp`` lies strictly between ``start`` and
        ``end`` are scrolled out of ``self.index``, tokenized with
        :meth:`tokenize` and used to train a Word2Vec model, which is then
        saved under ``models/``. When a model file for the same words and
        the same window already exists it is loaded instead and
        Elasticsearch is not queried.

        Terminates the process via ``exit()`` when the index does not exist.

        :param start: exclusive lower bound, sent as ``str(start)``
        :param end: exclusive upper bound, sent as ``str(end)``
        :param stopwords_file_path: passed to ``utils.load_stopwords``
        :param words: query string matched against the ``text`` field; its
            tokens are also the positive examples of the similarity lookup
        :param count: ``topn`` passed to ``model.most_similar``
        :return: whatever ``model.most_similar`` returns
        """
        # load stop-words
        stopwords = utils.load_stopwords(stopwords_file_path)

        # Check index exists
        if not self.es.indices.exists(index=self.index):
            print("Index " + self.index + " not exists")
            exit()

        body = {
            "query": {
                "bool": {
                    "should": {
                        "match": {
                            "text": {
                                "query": words
                            }
                        }
                    },
                    "filter": {
                        "range": {
                            "@timestamp": {
                                "gt": str(start),
                                "lt": str(end)
                            }
                        }
                    }
                }
            }
        }
        print(body)

        def alnum(value):
            # keep only characters that are safe in a file name
            return "".join(ch for ch in str(value) if ch.isalnum())

        # The cache key must include the window: keyed on the words alone, a
        # model trained on another window (or on the whole index) would be
        # reused for this one.
        filepath = ("models/" + words.replace(" ", "").replace(",", "") +
                    "_" + alnum(start) + "_" + alnum(end) + ".model")
        modelfile = Path(filepath)
        if modelfile.is_file():
            model = gensim.models.Word2Vec.load(filepath)
        else:
            # Init scroll by search
            data = self.es.search(index=self.index,
                                  doc_type=self.doc_type,
                                  scroll='2m',
                                  size=self.size,
                                  body=body)

            # Get the scroll ID and tokenize the first batch of hits
            sid = data['_scroll_id']
            hits = data['hits']['hits']
            tweets = [
                self.tokenize(hit['_source']['text'], stopwords)
                for hit in hits
            ]

            # An empty page marks the end of the scroll
            while hits:
                data = self.es.scroll(scroll_id=sid, scroll='2m')
                hits = data['hits']['hits']

                # extend in place; rebuilding the list per page is quadratic
                tweets.extend(
                    self.tokenize(hit['_source']['text'], stopwords)
                    for hit in hits)

                # Update the scroll ID
                sid = data['_scroll_id']

            model = gensim.models.Word2Vec(tweets,
                                           min_count=1,
                                           workers=1,
                                           negative=20)
            model.save(filepath)

        pwords = self.tokenize(words, stopwords)
        print("pwords")
        print(pwords)
        context = model.most_similar(positive=pwords, topn=count)

        return context

    # =======================================================
    # =======================================================

    def bigTweetTextSearch(self, query):
        """Collect the ``text`` of every tweet matching ``query``.

        Opens a 15-minute scroll on ``self.index`` and follows it until a
        page comes back empty. Terminates the process via ``exit()`` when
        the index does not exist.

        :param query: request body handed to ``self.es.search``
        :return: list of ``_source.text`` values in the order received
        """
        # Bail out when the index is missing
        if not self.es.indices.exists(index=self.index):
            exit()

        # First page: opens the scroll
        page = self.es.search(
            index=self.index,
            doc_type=self.doc_type,
            scroll='15m',
            size=self.size,
            body=query,
        )
        cursor = page['_scroll_id']
        batch = page['hits']['hits']
        texts = [hit['_source']['text'] for hit in batch]

        # Keep pulling pages until one arrives empty
        while len(batch) > 0:
            page = self.es.scroll(scroll_id=cursor, scroll='15m')
            batch = page['hits']['hits']
            texts.extend(hit['_source']['text'] for hit in batch)
            cursor = page['_scroll_id']

        return texts

    def bigSearchMean(self, query):
        """Return the mean ``_score`` of every hit matching ``query``.

        Opens a 15-minute scroll on ``self.index`` and follows it until a
        page comes back empty, summing scores and counting hits on the way.
        Terminates the process via ``exit()`` when the index does not exist.

        :param query: request body handed to ``self.es.search``
        :return: sum of scores divided by the number of hits, or ``0.0``
            when the query matches nothing
        """
        # Check index exists
        if not self.es.indices.exists(index=self.index):
            exit()

        # Init scroll by search
        data = self.es.search(
            index=self.index,
            doc_type=self.doc_type,
            scroll='15m',
            size=self.size,
            body=query,
        )

        # Get the scroll ID and account for the first batch of hits
        sid = data['_scroll_id']
        hits = data['hits']['hits']
        score_sum = 0
        count = 0
        for item in hits:
            score_sum = score_sum + item['_score']
        count = count + len(hits)

        # An empty page marks the end of the scroll
        while hits:
            data = self.es.scroll(scroll_id=sid, scroll='15m')
            hits = data['hits']['hits']
            for item in hits:
                score_sum = score_sum + item['_score']
            count = count + len(hits)

            # Update the scroll ID
            sid = data['_scroll_id']

        # A query without hits has no mean; avoid dividing by zero.
        if count == 0:
            return 0.0
        return score_sum / count

    def bigSearchSSE(self, query, mean):
        """Return the sum of squared errors of hit scores around ``mean``.

        Opens a 15-minute scroll on ``self.index`` and follows it until a
        page comes back empty, adding ``(_score - mean) ** 2`` for every
        hit. Terminates the process via ``exit()`` when the index does not
        exist.

        :param query: request body handed to ``self.es.search``
        :param mean: value the scores are compared against
        :return: the accumulated sum of squared errors; ``0`` when the
            query matches nothing
        """
        # Check index exists
        if not self.es.indices.exists(index=self.index):
            exit()

        # Init scroll by search
        data = self.es.search(
            index=self.index,
            doc_type=self.doc_type,
            scroll='15m',
            size=self.size,
            body=query,
        )

        # Get the scroll ID and account for the first batch of hits
        sid = data['_scroll_id']
        hits = data['hits']['hits']
        sse = 0
        for item in hits:
            # accumulate: assigning here would keep only the last hit's error
            sse = sse + (item['_score'] - mean)**2

        # An empty page marks the end of the scroll
        while hits:
            data = self.es.scroll(scroll_id=sid, scroll='15m')
            hits = data['hits']['hits']
            for item in hits:
                sse = sse + (item['_score'] - mean)**2

            # Update the scroll ID
            sid = data['_scroll_id']

        return sse

    def w2v_tweets(self, stopwords_file_path, words, count):
        """Find the terms closest to ``words`` in the tweets matching them.

        Tweets matching ``words`` on the ``text`` field are scrolled out of
        ``self.index``, tokenized with :meth:`tokenize` and used to train a
        Word2Vec model, which is then saved under ``models/``. When a model
        file for the same words already exists it is loaded instead and
        Elasticsearch is not queried.

        Terminates the process via ``exit()`` when the index does not exist.

        :param stopwords_file_path: passed to ``utils.load_stopwords``
        :param words: query string; its tokens are also the positive
            examples of the similarity lookup
        :param count: ``topn`` passed to ``model.most_similar``
        :return: whatever ``model.most_similar`` returns
        """
        stopwords = utils.load_stopwords(stopwords_file_path)

        def tokenize_batch(batch):
            # one token list per hit, in arrival order
            return [
                self.tokenize(hit['_source']['text'], stopwords)
                for hit in batch
            ]

        # Bail out when the index is missing
        if not self.es.indices.exists(index=self.index):
            print("Index " + self.index + " not exists")
            exit()

        match_clause = {"match": {"text": {"query": words}}}
        body = {"query": {"bool": {"should": match_clause}}}
        print(body)

        # One model file per distinct words string (spaces and commas removed)
        model_key = words.replace(" ", "").replace(",", "")
        filepath = "models/" + model_key + ".model"
        if Path(filepath).is_file():
            model = gensim.models.Word2Vec.load(filepath)
        else:
            # First page: opens the scroll
            page = self.es.search(index=self.index,
                                  doc_type=self.doc_type,
                                  scroll='2m',
                                  size=self.size,
                                  body=body)
            cursor = page['_scroll_id']
            batch = page['hits']['hits']
            tweets = tokenize_batch(batch)

            # Keep pulling pages until one arrives empty
            while len(batch) > 0:
                page = self.es.scroll(scroll_id=cursor, scroll='2m')
                batch = page['hits']['hits']
                tweets.extend(tokenize_batch(batch))
                cursor = page['_scroll_id']

            model = gensim.models.Word2Vec(tweets,
                                           min_count=1,
                                           workers=10,
                                           negative=20)
            model.save(filepath)

        pwords = self.tokenize(words, stopwords)
        print("pwords")
        print(pwords)
        return model.most_similar(positive=pwords, topn=count)

Esempio n. 60

0

Mostra file

File: ElasticSearchClass.py Progetto: heirish/machine-learning-portfolio

class ElasticSearchClass(object):
    def __init__(self, host, port, user=None, pwd=None):
        """Create the Elasticsearch client for a single node.

        :param host: host name or address of the node
        :param port: HTTP port of the node
        :param user: user name; used only when ``pwd`` is given as well
        :param pwd: password; used only when ``user`` is given as well
        """
        self.host = host
        self.port = port
        node = {'host': self.host, 'port': self.port}
        if user is None or pwd is None:
            # incomplete credentials: connect without authentication
            self.es = Elasticsearch(hosts=[node])
        else:
            self.es = Elasticsearch(hosts=[node], http_auth=(user, pwd))

    def isValid(self):
        """Report whether the Elasticsearch cluster answers a ping.

        :return: ``True`` when ``self.es.ping()`` reports success, ``False``
            when it reports failure or raises
        """
        try:
            # ping() signals an unreachable cluster by returning False
            # rather than raising, so its result has to be propagated.
            return bool(self.es.ping())
        except Exception:
            return False

    def count(self, indexName):
        """Count the documents stored in an index.

        :param indexName: name of the index to count
        :return: the response of ``self.es.count`` for that index
        """
        response = self.es.count(index=indexName)
        return response

    def delete(self, indexName, docType, id):
        """Delete one specific document from an index.

        :param indexName: name of the index holding the document
        :param docType: document type
        :param id: identifier of the document to remove
        :return: ``None``
        """
        self.es.delete(id=id, doc_type=docType, index=indexName)

    def get(self, indexName, docType, id):
        """Fetch one document by identifier.

        :param indexName: name of the index holding the document
        :param docType: document type
        :param id: identifier of the document
        :return: the response of ``self.es.get``
        """
        document = self.es.get(id=id, doc_type=docType, index=indexName)
        return document

    def search(self, indexName, size=10):
        """Return the most recent documents of an index.

        Results are sorted by ``@timestamp`` in descending order. Any
        exception raised by the client is printed and swallowed.

        :param indexName: name of the index to search
        :param size: maximum number of hits to request
        :return: the response of ``self.es.search``, or ``None`` when the
            call raised
        """
        try:
            response = self.es.search(index=indexName,
                                      size=size,
                                      sort="@timestamp:desc")
        except Exception as err:
            print(err)
            return None
        return response

    def createIndex(self, indexName, body):
        """Create an index, dropping any existing index of the same name.

        :param indexName: name of the index to (re)create
        :param body: request body handed to ``indices.create``
        :return: ``None``
        """
        index_api = self.es.indices
        try:
            index_api.delete(index=indexName)
        except elasticsearch.NotFoundError:
            # nothing to drop: the index did not exist yet
            pass
        index_api.create(index=indexName, body=body)

    def indexDocument(self, indexName, docType, body, docId=None):
        """Index a single document.

        :param indexName: name of the target index
        :param docType: document type
        :param body: the document itself
        :param docId: explicit identifier; when ``None`` no ``id`` is sent
        :return: ``None``
        """
        request = {'index': indexName, 'doc_type': docType, 'body': body}
        if docId is not None:
            request['id'] = docId
        self.es.index(**request)

    #https://github.com/elastic/elasticsearch-py/issues/508
    def bulkIndexDocument(self, actions):
        """Send a batch of actions through the ``bulk`` helper.

        :param actions: iterable of actions handed to ``bulk``
        :return: the first element of the pair returned by ``bulk``
        """
        succeeded, _errors = bulk(self.es, actions)
        return succeeded

    def moreLikeThis(self,
                     indexName,
                     docType,
                     id,
                     mltFields,
                     search_size=2,
                     min_term_freq=1,
                     min_doc_freq=1):
        """Run a ``more_like_this`` query seeded with one stored document.

        :param indexName: index of the seed document
        :param docType: type of the seed document
        :param id: identifier of the seed document
        :param mltFields: value of the query's ``fields`` entry
        :param search_size: value of the request's ``size`` entry
        :param min_term_freq: value of the query's ``min_term_freq`` entry
        :param min_doc_freq: value of the query's ``min_doc_freq`` entry
        :return: the response of ``self.es.search``
        """
        seed_document = {
            "_index": indexName,
            "_type": docType,
            "_id": id
        }
        mlt_clause = {
            "fields": mltFields,
            "like": [seed_document],
            "min_term_freq": min_term_freq,
            "min_doc_freq": min_doc_freq
        }
        request = {
            "size": search_size,
            "query": {
                "more_like_this": mlt_clause
            }
        }
        return self.es.search(body=request)

    def termVector(self, indexName, docType, id):
        return self.es.termvectors(indexName, docType, id)
        '''