def search(self, args, es_client=client):
        search = Search(using=es_client, index=SearchableEvent.meta.index)

        if args.get('name'):
            search = search.query('fuzzy', name=args['name'])
            search = search.highlight('name')

        if args.get('description'):
            search = search.query('match', description=args['description'])
            search = search.highlight('description')

        if args.get('location-name'):
            search = search.query('fuzzy', location_name=args['location-name'])
            search = search.highlight('location_name')

        if args.get('organizer-name'):
            search = search.query(
                'fuzzy', organizer_name=args['organizer-name'])
            search = search.highlight('organizer_name')

        if args.get('organizer-description'):
            search = search.query(
                'fuzzy', organizer_description=args['organizer-description'])
            search = search.highlight('organizer_description')

        return [to_dict(r) for r in search.execute()]
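
Chained .query() calls like the ones in search() above are AND-ed into a single bool/must query by elasticsearch-dsl. A minimal sketch of the equivalent explicit form, assuming the same `client` and SearchableEvent index (the field values are illustrative only):

from elasticsearch_dsl import Q, Search

s = Search(using=client, index=SearchableEvent.meta.index)
s = s.query(Q('fuzzy', name='pycon') & Q('match', description='python conference'))
s = s.highlight('name').highlight('description')
# s.to_dict() shows the combined {"bool": {"must": [...]}} request body before executing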
Example #2
    def process(self, start_time: datetime, end_time: datetime, input: DataFrame):
        logger.debug('Start: %s  End: %s  Log: index=%s fields=%s' % (start_time.isoformat(), end_time.isoformat(), str(self.indices), str(self.fields)))

        search = Search(using=self.client, index=self.indices[0])
        search = search.filter(Range(** {'@timestamp': {'gte': start_time.isoformat(), 'lte': end_time.isoformat()}}))

        for k,v in self.fields.items():
            if isinstance(v, list):
                for sv in v:
                    search = search.query("match", **{k:sv})

            else:
                search = search.query("match", **{k:v})

        logger.debug('ES Query: %s' % str(search.to_dict()))
        response = search.execute()

        logger.debug('Results: success:%d failed:%d hits:%d' % (response._shards.successful, response._shards.failed, len(response.hits)))

        for hit in response:
            # filter out the meta key and flatten the values
            row = {k: str(hit[k]) for k in hit if k != 'meta'}

            logger.debug(row)
            input = input.append(row, ignore_index=True)

        return input
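
pandas removed DataFrame.append() in 2.0, so on newer pandas the per-hit rows built in process() would typically be collected first and concatenated once; a sketch under that assumption, reusing the names from the method above:

import pandas as pd

rows = [{k: str(hit[k]) for k in hit if k != 'meta'} for hit in response]
input = pd.concat([input, pd.DataFrame(rows)], ignore_index=True)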
  def searchTweets(keyword, latlondist):
    # Credentials required to reach the Twitter data backend; fail fast if they are missing
    if TwitterHelper.AWS_ACCESS_KEY is None:
      raise KeyError("Please set the AWS_ACCESS_KEY env. variable")

    if TwitterHelper.AWS_SECRET_KEY is None:
      raise KeyError("Please set the AWS_SECRET_KEY env. variable")

    s = Search()
    if latlondist is not None:
      locJson = json.loads(latlondist)
      s = s.query({"filtered" : {"query" : {"match_all" : {}}, "filter" : {"geo_distance" : {"distance" : locJson['dist'], "location" : {"lat" : locJson['lat'], "lon" : locJson['lon']}}}}})

    if keyword is not None:
      q = Q("match_phrase", text = keyword)
      s = s.query(q)
    
    scanResp = None
    scanResp = helpers.scan(client = TwitterHelper.ES, query = s.to_dict(), scroll = "1m", index = "tweets", timeout = "1m")

    arr = []
    for resp in scanResp:
      hit = resp['_source']
      d = {}
      d['name'] = hit['name']
      d['text'] = hit['text']
      d['sentiment'] = hit['sentiment']
      d['lat'] = hit['location']['lat']
      d['lon'] = hit['location']['lon']
      arr.append(d)
    allD = {}
    allD['tweets'] = arr
    mapInput = json.dumps(allD)
    return mapInput
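
The raw helpers.scan() call above can also be expressed with the DSL's own scan() helper, which drives the scroll API for you; a sketch assuming TwitterHelper.ES is the Elasticsearch client and "elasticsearch" is an illustrative keyword:

s = Search(using=TwitterHelper.ES, index="tweets").query("match_phrase", text="elasticsearch")
for hit in s.scan():
    print(hit.name, hit.sentiment)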
Example #4
	def get(self, request):
		form = SearchForm(request.GET)

		ctx = {
			"form": form
		}

		if form.is_valid():
			name_query = form.cleaned_data.get("name")
			if name_query:
				s = Search(index="daintree").query("match", name=name_query)
			else:
				s = Search(index="daintree")

			min_price = form.cleaned_data.get("min_price")
			max_price = form.cleaned_data.get("max_price")
			if min_price is not None or max_price is not None:
				price_query = dict()
				if min_price is not None:
					price_query["gte"] = min_price
				if max_price is not None:
					price_query["lte"] = max_price
				s = s.query("range", price=price_query)

			s.aggs.bucket("categories", "terms", field="category")

			if request.GET.get("category"):
				s = s.query("match", category=request.GET["category"])

			result = s.execute()

			ctx["products"] = result.hits

			category_aggregations = list()
			for bucket in result.aggregations.categories.buckets:
				category_name = bucket.key
				doc_count = bucket.doc_count

				category_url_params = request.GET.copy()
				category_url_params["category"] = category_name
				category_url = "{}?{}".format(reverse("home"), category_url_params.urlencode())

				category_aggregations.append({
					"name": category_name,
					"doc_count": doc_count,
					"url": category_url
				})
			ctx["category_aggs"] = category_aggregations

		if "category" in request.GET:
			remove_category_search_params = request.GET.copy()
			del remove_category_search_params["category"]
			remove_category_url = "{}?{}".format(reverse("home"), remove_category_search_params.urlencode())
			ctx["remove_category_url"] = remove_category_url

		return render(request, "home.html", ctx)
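
Since the price constraint only narrows the result set, it could equally be applied as a non-scoring filter so relevance is driven by the name match alone; a small sketch of that variant, reusing price_query from the view above:

s = s.filter("range", price=price_query)
# renders as {"bool": {"filter": [{"range": {"price": {...}}}]}} alongside the match query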
Example #5
    def get_unique_terms(self, field_name, min_docs=5):
        assert isinstance(self.search_obj, Search)

        # define a bucket aggregation and metrics inside:

        self.search_obj.aggs.bucket('tokens', 'terms', field=field_name, size=20)
        s = Search(self.es).index(self.index_name)
        s = s.query('match_all')  # query() returns a clone, so reassign it
        s.aggs.bucket('myaggs', 'terms', field=field_name, size=0, min_doc_count=min_docs)

        res = {}
        for i in s.execute().aggregations.myaggs.buckets:
            res[i['key']] = i['doc_count']
        return res
Example #6
 def update_sentiments(self):
     # WatsonException is raised by the SDK on analysis failures and is caught below
     from watson_developer_cloud import ToneAnalyzerV3Beta, WatsonException
     tone_analyzer = ToneAnalyzerV3Beta(username='******',
                                        password='******',
                                        version='2016-02-11')
     client = connections.get_connection()
     search = Search(using=client, index='articles', doc_type='article')
     q = Q('bool', must=[Q('missing', field='watson_analyzed')])
     search = search.query(q)
     counter = 0
     for result in search.scan():
         doc = Article.get(result.meta.id)
         try:
             analysis = tone_analyzer.tone(text=doc.body)
             tone_categories = analysis['document_tone']['tone_categories']
             emotion_tones = list(filter(lambda x: x['category_id'] == 'emotion_tone', tone_categories))[0]
             doc.tone = {}
             for tone in emotion_tones['tones']:
                 doc.tone[tone['tone_id']] = tone['score']
             doc.watson_success = True
         except WatsonException:
             continue
         finally:
             doc.watson_analyzed = True
             doc.save()
             counter += 1
         print(counter)
     if counter == 0:
         raise RealError()
Example #7
 def query_articles(self, query, prefs):
     client = connections.get_connection()
     search = Search(using=client, index='articles')
     q = Q('bool', must=[Q('exists', field='watson_analyzed'),
                         Q('match', watson_success=True),
                         Q('match', body=query)])
     search = search.query(q)[:100]
     documents = []
     # iterating the sliced Search executes the request once and yields at most 100 hits
     for hit in search:
         if '#' not in hit.url and '?' not in hit.url:
             documents.append({
                 'id': hit.meta.id,
                 'title': hit.title,
                 'body': hit.body,
                 'url': hit.url,
                 'score': hit.meta.score,
                 'tone': dict(
                     joy=hit.tone.joy,
                     fear=hit.tone.fear,
                     sadness=hit.tone.sadness,
                     disgust=hit.tone.disgust,
                     anger=hit.tone.anger
                 ),
                 'top_image': hit.top_image
             })
     if len(documents) < 10:
         return documents
     else:
         return select_k_and_sort(documents, prefs)
Example #8
    def search(self, **params):
        index = params.get('index', self.index)
        search = Search(using=self.client, index=index)

        page = params.get('page', None)
        per_page = params.get('per_page', None)
        if page and per_page:
            # paginate by slicing: the offset is a document count, not the page number,
            # and slicing avoids reaching into the private _extra dict
            offset = (page - 1) * per_page
            search = search[offset:offset + per_page]

        sort = params.get('sort', None)
        if sort and sort.replace('-', '') in ['created_at', 'level']:
            search = search.sort(sort)

        date_filter = self._filter_by_date_interval(params)
        if date_filter:
            search = search.filter(date_filter)

        level = params.get('group_by', None)
        if level:
            search = search.query('match', level=level)

        hits = search.execute()

        format = params.get('format', 'object')
        if format == 'dict':
            return self._to_dict(hits)
        else:
            return self._to_logs(hits)
Example #9
def gracc_query_apel(year, month):
    index = osg_summary_index
    starttime = datetime.datetime(year, month, 1)
    onemonth = dateutil.relativedelta.relativedelta(months=1)
    endtime = starttime + onemonth
    s = Search(using=es, index=index)
    s = s.query('bool',
        filter=[
            Q('range', EndTime={'gte': starttime, 'lt': endtime })
          & Q('terms', VOName=vo_list)
          & ( Q('term', ResourceType='Batch')
            | ( Q('term', ResourceType='Payload')
              & Q('term', Grid='Local') )
            )
        ]
    )

    bkt = s.aggs
    bkt = bkt.bucket('Cores', 'terms', size=MAXSZ, field='Processors')
    bkt = bkt.bucket('VO',    'terms', size=MAXSZ, field='VOName')
    bkt = bkt.bucket('DN',    'terms', size=MAXSZ, field='DN')
    bkt = bkt.bucket('Site',  'terms', size=MAXSZ, missing=MISSING, field='OIM_ResourceGroup')
    #bkt = bkt.bucket('Site', 'terms', size=MAXSZ, field='SiteName')
    #bkt = bkt.bucket('Site', 'terms', size=MAXSZ, field='WLCGAccountingName')
    add_bkt_metrics(bkt)

    bkt = bkt.bucket('SiteName',  'terms', size=MAXSZ, field='SiteName')

    add_bkt_metrics(bkt)

    response = s.execute()
    return response
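
Because gracc_query_apel() only needs the aggregation buckets, the hits themselves can be suppressed; a sketch of that tweak and of walking the nested response, using the bucket names defined above:

s = s[:0]                      # "size": 0, aggregations only
response = s.execute()
for cores in response.aggregations.Cores.buckets:
    for vo in cores.VO.buckets:
        print(cores.key, vo.key, vo.doc_count)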
Example #10
def reverse():
    try:
        lon = float(request.args.get('lon'))
        lat = float(request.args.get('lat'))
    except (TypeError, ValueError):
        lon = lat = None

    # compare against None so 0.0 is still accepted as a valid coordinate
    if lat is None or lon is None:
        abort(400, "missing 'lon' or 'lat': /?lon=2.0984&lat=48.0938")

    s = Search(es).index(INDEX).query(MatchAll()).extra(size=1).sort({
        "_geo_distance": {
            "coordinate": {
                "lat": lat,
                "lon": lon
            },
            "order": "asc"
        }})
    _type = request.args.get('type', None)
    if _type:
        s = s.query({'match': {'type': _type}})
    results = s.execute()
    if len(results.hits) < 1:
        notfound.debug('reverse: lat: {}, lon: {}, type: {}'.format(
            lat, lon, _type))

    debug = 'debug' in request.args
    data = to_geo_json(results, debug=debug)
    data = json.dumps(data, indent=4 if debug else None)
    response = Response(data, mimetype='application/json')
    cors(response)
    return response
	def GetAuditData(self, case, child_id, data_type, start=None, length=None, str_query=None, sort=None, order=None):
		q = ['w32registryraw', 'filedownloadhistory', 'urlhistory', 'timeline', 'w32apifiles', 'w32rawfiles', 'w32eventlogs']

		if data_type in q:
			query = search_queries.GetGeneratorQuery(data_type, str_query, case, child_id, start, length, sort, order)
		else:
			s = Search()
			s = s[0:1000]
			t = Q('query_string', default_field="ComputerName.raw", query=child_id) & Q('query_string', default_field="CaseInfo.case_name", query=case)
			query = s.query(t).filter('term', AuditType__Generator=data_type)

		try:
			r = requests.post(self.es_host + ":" + self.es_port + self.index + self.type_audit_type + '/_search', data=json.dumps(query.to_dict()), auth=(self.elastic_user, self.elastic_pass), verify=False)
		except ConnectionError as e:
			ret = {"connection_error": e.args[0]}
			return ret

		data = []

		try:
			for x in r.json()['hits']['hits']:
				data.append(x)
		except KeyError:
			return data

		return data
def gracc_query_jobs(es, index, starttime, endtime, interval, offset=None):
    s = Search(using=es, index=index)

    s = s.query('bool',
            filter=[
             Q('range', EndTime={'gte': starttime, 'lt': endtime })
          &  Q('term',  ResourceType='Batch')
          & ~Q('terms', SiteName=['NONE', 'Generic', 'Obsolete'])
          & ~Q('terms', VOName=['Unknown', 'unknown', 'other'])
        ]
    )

    if offset is None:
        extra = {}
    else:
        extra = {'offset': "-%ds" % offset}

    curBucket = s.aggs.bucket('EndTime', 'date_histogram',
                              field='EndTime', interval=interval, **extra)

    curBucket = curBucket.metric('CoreHours', 'sum', field='CoreHours')
    curBucket = curBucket.metric('Records', 'sum', field='Count')

    response = s.execute()
    return response
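
A sketch of reading back the histogram built by gracc_query_jobs(), assuming the aggregation and metric names defined above:

for tbucket in response.aggregations.EndTime.buckets:
    print(tbucket.key_as_string, tbucket.CoreHours.value, tbucket.Records.value)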
Example #13
def search():
    q = request.args.get('q')
    #resp = es.search(index='hoe', doc_type='record', q=q, body=aggs)
    #logging.info(q)

    s = Search(using=es, index='hoe', doc_type='record')
    s.aggs.bucket('library_place', 'terms', field='library-place')
    s.aggs.bucket('type', 'terms', field='type')
    s.aggs.bucket('genre', 'terms', field='genre')
    s.aggs.bucket('keywords', 'terms', field='keywords.label')
    s.aggs.bucket('author', 'terms', field='author.literal')
    s.query = Q('multi_match', query=q, fields=['_all'])
    filters = []
    if 'filter' in request.args:
        filters = request.args.getlist('filter')
        logging.info(filters)
        for filter_arg in filters:
            cat, val = filter_arg.split(':')
            cat = cat.replace('_', '-')
            filter_dict = {cat: val}
            logging.info(cat)
            # accumulate each term filter instead of overwriting it on every pass
            s = s.filter('term', **filter_dict)
    #if request.args
    resp = s.execute()
    #logging.info(resp)
    #logging.info(resp.aggregations.per_category.buckets)
    return render_template('resultlist.html', records=resp.to_dict().get('hits'), facets=resp.aggregations.to_dict(), header=q, query=q, filters=filters)
	def BuildAuditAggs(self, child_id, parent_id):
		s = Search()
		s = s[0]
		t = Q('query_string', default_field="CaseInfo.case_name", query=parent_id) & Q('match', ComputerName=child_id)
		aggs_generator = A('terms', field='AuditType.Generator', size=0)

		s.aggs.bucket('datatypes', aggs_generator)
		query = s.query(t)

		try:
			r = requests.post(self.es_host + ":" + self.es_port + self.index + self.type_audit_type + '/_search', data=json.dumps(query.to_dict()), auth=(self.elastic_user, self.elastic_pass), verify=False)
		except ConnectionError as e:
			ret = {"connection_error": e.args[0]}
			return ret

		data = []
		exclude = ['w32processes-memory', 'stateagentinspector', 'w32disks']

		for y in r.json()['aggregations']['datatypes']['buckets']:
			if not y['key'] in exclude:
				data.append({
						"id": y['key'], "parent": child_id, "text": y['key'], "type": "audit", "a_attr": {"href": "#" + y['key'] + '/' + parent_id + "/" + child_id }
					})

		return data
Example #15
 def categories(self):
     s = Search(
         using=docstore._get_connection(settings.DOCSTORE_HOSTS),
         index=settings.DOCSTORE_INDEX,
         doc_type='articles'
     ).fields([
         'title', 'title_sort', 'categories',
     ])[0:docstore.MAX_SIZE]
     if not settings.MEDIAWIKI_SHOW_UNPUBLISHED:
         s = s.query('match', published=True)
     response = s.execute()
     pages = []
     for hit in response:
         page = Page()
         page.url_title = hit.title[0]
         page.title = hit.title[0]
         page.title_sort = hit.title_sort[0]
         page.categories = hit.get('categories', [])
         pages.append(page)
     articles = sorted(pages, key=lambda page: page.title_sort)
     categories = {}
     for page in articles:
         for category in page.categories:
             # exclude internal editorial categories
             if category not in settings.MEDIAWIKI_HIDDEN_CATEGORIES:
                 if category not in categories.keys():
                     categories[category] = []
                 # pages already sorted so category lists will be sorted
                 if page not in categories[category]:
                     categories[category].append(page)
     return categories
Example #16
    def get(self, request, *args, **kwargs):
        q = request.GET.get('q')

        # Make search.
        queries = [
            query.Q('match', slug=self._phrase(q)),  # Slug.
            query.Q('match', type=self._phrase(q)),  # Type.
            query.Q('match', search_names=self._phrase(q)),  # Name.
            query.Q('prefix', carrier=q),  # Shelf carrier.
            query.Q('term', region=q)  # Shelf region.
        ]
        sq = query.Bool(should=queries)

        # Search.
        res = {'apps': [], 'brands': [], 'collections': [], 'shelves': []}
        es = Search(using=FeedItemIndexer.get_es(),
                    index=self.get_feed_element_index())
        feed_elements = es.query(sq).execute().hits
        if not feed_elements:
            return response.Response(res, status=status.HTTP_404_NOT_FOUND)

        # Deserialize.
        ctx = {'app_map': self.get_apps(request,
                                        self.get_app_ids_all(feed_elements)),
               'request': request}
        for feed_element in feed_elements:
            item_type = feed_element.item_type
            serializer = self.SERIALIZERS[item_type]
            data = serializer(feed_element, context=ctx).data
            res[self.PLURAL_TYPES[item_type]].append(data)

        # Return.
        return response.Response(res, status=status.HTTP_200_OK)
    def search(self, doc_type, query=""):
        """
        Execute search query and retrieve results

        :param doc_type: Type in ElasticSearch
        :param query: search query
        :return: list with results
        """
        results = []
        if type(query) in [str, unicode] and type(doc_type) == DocTypeMeta:
            q = Q("multi_match",
                  query=query.lower(),
                  fields=["title"])

            s = Search()
            s = s.using(self.client)
            s = s.index(self.index_name)
            s = s.doc_type(doc_type)
            s = s.query(q)
            print "search query: " + str(s.to_dict())

            response = s.execute()

            for resp in response:
                results.append(resp)
        return results
	def GetAuditDataMain(self, data):
		s = Search()
		s = s[0:1000]
		s = s.highlight('*')
		s = s.highlight_options(require_field_match=False)
		t = Q('query_string', query=data) & ~Q('query_string', default_field="AuditType.Generator", query="stateagentinspector") & ~Q('query_string', default_field="AuditType.Generator", query="w32processes-tree")

		query = s.query(t)

		try:
			r = requests.post(self.es_host + ":" + self.es_port + self.index + self.type_audit_type + '/_search', data=json.dumps(query.to_dict()), auth=(self.elastic_user, self.elastic_pass), verify=False)
		except ConnectionError as e:
			ret = {"connection_error": e.args[0]}
			return ret

		data = []

		try:
			for x in r.json()['hits']['hits']:
				for y, v in x['highlight'].iteritems():
					data.append({
							"doc_id": x['_id'],
							"endpoint": x['_parent'],
							"audittype": x['_source']['AuditType']['Generator'],
							"field": y,
							"response": v
						})
		except KeyError:
			pass

		return data
Example #19
	def BuildRootTree(self):
		s = Search()
		t = Q('has_parent', type='hostname', query=Q('query_string', query="*"))
		aggs = A('terms', field='AuditType.Generator', size=16)

		s.aggs.bucket('datatypes', aggs)
		query = s.query(t)

		try:
			r = requests.post(self.es_host + self.es_port + self.index + self.type_audit_type + '/_search', data=json.dumps(query.to_dict()))
		except ConnectionError as e:
			ret = {"connection_error": e.args[0]}
			return ret

		data = [{
			"id": "stackable", "parent": "#", "text": "Stackable Data"
		}]

		i = ['w32services', 'w32tasks', 'w32scripting-persistence', 'w32prefetch', 'w32network-dns', 'urlhistory']

		for x in r.json()['aggregations']['datatypes']['buckets']:
			if x['key'] in i:
				data.append({
					"id": x['key'], "parent": "stackable", "text": x['key'], "children": True
				})

		return data
	def BuildRootTree(self):
		s = Search()
		t = Q('query_string', query="*")
		aggs_casenum = A('terms', field="CaseInfo.case_name", size=0)

		s.aggs.bucket('casenum', aggs_casenum)
		query = s.query(t)

		try:
			r = requests.post(self.es_host + ":" + self.es_port + self.index + self.type_audit_type + '/_search', data=json.dumps(query.to_dict()), auth=(self.elastic_user, self.elastic_pass), verify=False)
		except ConnectionError as e:
			ret = {"connection_error": e.args[0]}
			return ret

		data = [{
			"id": "current_inv", "parent": "#", "text": "Current Investigations", "type": "root"
		}, {
			"id": "comp_inv", "parent": "#", "text": "Completed Investigations", "type": "root"
		}]

		for x in r.json()['aggregations']['casenum']['buckets']:
			data.append({
				"id": x['key'], "parent": "current_inv", "text": x['key'], "children": True, "type": "case"
			})

		return data
def GetAuditGenerator(endpoints):
	audit_type = ''
	
	q = []
	a = []

	for k, v in endpoints.iteritems():
		q.append(k)
		for x in v:
			if v not in a:
				a.append(v)

	joined = ' OR '.join([x for x in a[0]])
	generator = q[0]
	
	s = Search()
	s = s[0]

	# All four branches below differed only in the field aggregated on, so map
	# generator -> aggregation field once and build the query a single time.
	generator_fields = {
		'w32scripting-persistence': 'Record.Path.raw',
		'w32prefetch': 'Record.ApplicationFileName.raw',
		'w32network-dns': 'Record.RecordName.raw',
	}
	agg_field = generator_fields.get(generator, 'Record.Name.raw')

	aggs_gen = A('terms', field=agg_field, size=0)
	aggs_endpoint = A('terms', field="ComputerName.raw", size=0)
	s.aggs.bucket('generator', aggs_gen).bucket('endpoint', aggs_endpoint)
	t = Q('query_string', default_field="ComputerName.raw", query=joined)
	query = s.query(t).filter('term', AuditType__Generator=generator)

	return query.to_dict()
    def search(self, query: str, filters: dict=None, only_this_type: bool=True, **kwargs: dict) -> list:
        """performs a search against elasticsearch and then pulls the corresponding data from the db

        :param query: query terms to search by
        :param filters: named (attribute, value) filters to limit the query results
        :param only_this_type: if True, restrict results to this indexer's doc type
        :param kwargs: additional search keyword arguments
        :return: a list of model instances, sorted by and annotated with their Elasticsearch score
        """
        # build base search object
        s = Search(using=self.indexer.es).index(self.indexer.index_name)
        if only_this_type:
            s = s.doc_type(self.indexer.doc_type_name)

        # build query
        s = s.query('match', _all=query)

        # add filter
        if filters is not None:
            for attr, value in filters.items():
                s = s.filter(F({'term': {attr: value}}))

        # execute query
        res = s.execute()

        # build up django query
        results = {}
        for hit in res:
            # get the model
            dj_type = hit._meta.doc_type
            model = get_model(dj_type)

            # get the pk
            pk_name = model._meta.pk.name
            pk = getattr(hit, pk_name)

            # get the score
            score = hit._meta.score

            # add to mapping
            results.setdefault(model, {})
            results[model][pk] = score

        # get queryset
        querysets = []
        for model, pk_score in results.items():
            qs = model.objects.filter(pk__in=pk_score.keys())
            querysets += list(qs)

        # attach scores to instances
        for instance in querysets:
            score = results[type(instance)][instance.pk]
            instance._meta.es_score = score

        # order by score
        querysets = sorted(querysets, key=lambda i: i._meta.es_score, reverse=True)

        # return
        return querysets
Example #23
 def get_item(self, identifier):
     s = Search(using=self.client)
     s = s.query("match", **{"_id": identifier})
     response = s.execute()
     if response.hits.total != 1:
         return None
     return ElasticSearchRDFRecord.get_rdf_records_from_query(
         query=s,
         response=response)[0]
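
The _id lookup in get_item() can also be written with the dedicated ids query, which avoids treating _id as a regular match field; a sketch assuming the same self.client:

s = Search(using=self.client).query("ids", values=[identifier])
response = s.execute()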
Example #24
    def query(self):
        search_obj = Search()
        for f in self.filters:
            search_obj = search_obj.filter(f)

        for q in self.queries:
            search_obj = search_obj.query(q)

        return search_obj.to_dict()
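
A hypothetical usage sketch for the builder above (the `builder` instance and its filter/query values are illustrative): filters land under bool/filter and queries under bool/must in the rendered body.

from elasticsearch_dsl import Q

builder.filters = [Q('term', status='active')]
builder.queries = [Q('match', title='elasticsearch')]
body = builder.query()
# -> {'query': {'bool': {'filter': [{'term': {'status': 'active'}}],
#                        'must': [{'match': {'title': 'elasticsearch'}}]}}}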
Example #25
def get_articles_by_iid(iid, page_from=0, page_size=1000):

    search = Search(index=INDEX).query("match", issue_iid=iid)
    search = search.query("match", _type="article")

    # slice by offset and size: the slice end is page_from + page_size
    search = search[page_from:page_from + page_size]
    search_response = search.execute()

    return search_response
Example #26
def get_issue_by_iid(iid):
    search = Search(index=INDEX).query("match", iid=iid)
    search = search.query("match", _type="issue")
    search_response = search.execute()

    if search_response.success() and search_response.hits.total > 0:
        issue = search_response[0]
        return issue
    else:
        return None
Example #27
def get_article_by_aid(aid):
    search = Search(index=INDEX).query("match", aid=aid)
    search = search.query("match", _type="article")
    search_response = search.execute()

    if search_response.success() and search_response.hits.total > 0:
        article = search_response[0]
        return article
    else:
        return None
def GetGeneratorQuery(case, endpoint_id, start, length, str_query, sort, order):
	s = Search()
	s = s[int(start):int(length)+int(start)]
	s = s.fields([	"Record.Path",
				    "Record.Url",
				    "Record.SourceUrl",
				    "Record.TlnTime",
				    "Record.File.Accessed",
				    "Record.File.Modified",
				    "Record.File.Changed",
				    "AuditType.Generator"
				])

	order_dict = {
		"0": "TlnTime"
	}

	if str_query == "":
		_sort = {
			"Record.{0}".format(order_dict[str(sort)]): {
				"order": order
			}
		}

		t = Q('query_string', default_field="Record.TlnTime", query="*") & Q('match', ComputerName=endpoint_id) & ~Q('match', AuditType__Generator="w32processes-memory") & ~Q('match', AuditType__Generator="w32useraccounts")
		query = s.query(t).filter('term', CaseInfo__case_name=case).sort(_sort)

	else:
		_sort = {
			"Record.{0}".format(order_dict[str(sort)]): {
				"order": order
			}
		}
		
		t = Q('query_string', default_field="Record.TlnTime", query="*") & Q('match', ComputerName=endpoint_id) & ~Q('match', AuditType__Generator="w32processes-memory") & ~Q('match', AuditType__Generator="w32useraccounts") & Q('query_string', fields=[
					"Record.Path",
				    "Record.Url",
				    "Record.SourceUrl",
				    "AuditType.Generator"], query="{0}*".format(str_query))
		query = s.query(t).filter('term', CaseInfo__case_name=case).sort(_sort)

	return query.to_dict()
Example #29
 def convert_filters_to_query(self, filters):
     s = Search(using=self.client)
     spec = filters.get("dataset__spec", None)
     modified_from = filters.get('modified__gt', None)
     modified_until = filters.get('modified__lt', None)
     if spec and not self.spec:
         self.spec = spec
     if self.spec:
         s = s.query("match", **{'system.spec.raw': self.spec})
     if self.query:
         if 'query' in self.query:
             s = s.query(self.query.get('query'))
         if 'filter' in self.query:
             s = s.query(self.query.get('filter'))
     if modified_from:
         s = s.filter("range", **{"system.modified_at": {"gte": modified_from}})
     if modified_until:
         s = s.filter("range", **{"system.modified_at": {"lte": modified_until}})
     s = s.sort({"system.modified_at": {"order": "asc"}})
     return s[self.cursor: self.get_next_cursor()]
Example #30
 def get_dataset_list(self):
     s = Search(using=self.client)
     datasets = A("terms", field="delving_spec.raw")
     if self.query:
         s = s.filter(self.query.get('filter'))
     elif self.spec:
         s = s.query("match", **{'system.spec.raw': self.spec})
     s.aggs.bucket("dataset-list", datasets)
     response = s.execute()
     specs = response.aggregations['dataset-list'].buckets
     return [self.ESDataSet(spec.key, None, None, spec.doc_count, None) for spec in specs]
Example #31
def save_forensic_report_to_elasticsearch(forensic_report,
                                          index_suffix=None,
                                          monthly_indexes=False,
                                          number_of_shards=1,
                                          number_of_replicas=1):
    """
        Saves a parsed DMARC forensic report to Elasticsearch

        Args:
            forensic_report (OrderedDict): A parsed forensic report
            index_suffix (str): The suffix of the name of the index to save to
            monthly_indexes (bool): Use monthly indexes instead of daily
                                    indexes
            number_of_shards (int): The number of shards to use in the index
            number_of_replicas (int): The number of replicas to use in the
                                      index

        Raises:
            AlreadySaved

        """
    logger.debug("Saving forensic report to Elasticsearch")
    forensic_report = forensic_report.copy()
    sample_date = None
    if forensic_report["parsed_sample"]["date"] is not None:
        sample_date = forensic_report["parsed_sample"]["date"]
        sample_date = human_timestamp_to_datetime(sample_date)
    original_headers = forensic_report["parsed_sample"]["headers"]
    headers = OrderedDict()
    for original_header in original_headers:
        headers[original_header.lower()] = original_headers[original_header]

    arrival_date_human = forensic_report["arrival_date_utc"]
    arrival_date = human_timestamp_to_datetime(arrival_date_human)

    search = Search(index="dmarc_forensic*")
    arrival_query = {"match": {"arrival_date": arrival_date}}
    q = Q(arrival_query)

    from_ = None
    to_ = None
    subject = None
    if "from" in headers:
        from_ = headers["from"]
        from_query = {"match_phrase": {"sample.headers.from": from_}}
        q = q & Q(from_query)
    if "to" in headers:
        to_ = headers["to"]
        to_query = {"match_phrase": {"sample.headers.to": to_}}
        q = q & Q(to_query)
    if "subject" in headers:
        subject = headers["subject"]
        subject_query = {"match_phrase": {"sample.headers.subject": subject}}
        q = q & Q(subject_query)

    search.query = q
    existing = search.execute()

    if len(existing) > 0:
        raise AlreadySaved("A forensic sample to {0} from {1} "
                           "with a subject of {2} and arrival date of {3} "
                           "already exists in "
                           "Elasticsearch".format(to_, from_, subject,
                                                  arrival_date_human))

    parsed_sample = forensic_report["parsed_sample"]
    sample = _ForensicSampleDoc(
        raw=forensic_report["sample"],
        headers=headers,
        headers_only=forensic_report["sample_headers_only"],
        date=sample_date,
        subject=forensic_report["parsed_sample"]["subject"],
        filename_safe_subject=parsed_sample["filename_safe_subject"],
        body=forensic_report["parsed_sample"]["body"])

    for address in forensic_report["parsed_sample"]["to"]:
        sample.add_to(display_name=address["display_name"],
                      address=address["address"])
    for address in forensic_report["parsed_sample"]["reply_to"]:
        sample.add_reply_to(display_name=address["display_name"],
                            address=address["address"])
    for address in forensic_report["parsed_sample"]["cc"]:
        sample.add_cc(display_name=address["display_name"],
                      address=address["address"])
    for address in forensic_report["parsed_sample"]["bcc"]:
        sample.add_bcc(display_name=address["display_name"],
                       address=address["address"])
    for attachment in forensic_report["parsed_sample"]["attachments"]:
        sample.add_attachment(filename=attachment["filename"],
                              content_type=attachment["mail_content_type"],
                              sha256=attachment["sha256"])
    try:
        forensic_doc = _ForensicReportDoc(
            feedback_type=forensic_report["feedback_type"],
            user_agent=forensic_report["user_agent"],
            version=forensic_report["version"],
            original_mail_from=forensic_report["original_mail_from"],
            arrival_date=arrival_date,
            domain=forensic_report["reported_domain"],
            original_envelope_id=forensic_report["original_envelope_id"],
            authentication_results=forensic_report["authentication_results"],
            delivery_results=forensic_report["delivery_result"],
            source_ip_address=forensic_report["source"]["ip_address"],
            source_country=forensic_report["source"]["country"],
            source_reverse_dns=forensic_report["source"]["reverse_dns"],
            source_base_domain=forensic_report["source"]["base_domain"],
            authentication_mechanisms=forensic_report[
                "authentication_mechanisms"],
            auth_failure=forensic_report["auth_failure"],
            dkim_domain=forensic_report["dkim_domain"],
            original_rcpt_to=forensic_report["original_rcpt_to"],
            sample=sample)

        index = "dmarc_forensic"
        if index_suffix:
            index = "{0}_{1}".format(index, index_suffix)
        if monthly_indexes:
            index_date = arrival_date.strftime("%Y-%m")
        else:
            index_date = arrival_date.strftime("%Y-%m-%d")
        index = "{0}-{1}".format(index, index_date)
        index_settings = dict(number_of_shards=number_of_shards,
                              number_of_replicas=number_of_replicas)
        create_indexes([index], index_settings)
        forensic_doc.meta.index = index
        try:
            forensic_doc.save()
        except Exception as e:
            raise ElasticsearchError("Elasticsearch error: {0}".format(
                e.__str__()))
    except KeyError as e:
        raise InvalidForensicReport(
            "Forensic report missing required field: {0}".format(e.__str__()))
Example #32
def user_last_interaction(userid):
    s = Search().extra(size=1)

    s = s.query("match", user_id=userid).sort("-datetime")

    return s
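
A usage sketch, assuming a default connection has been registered with connections.create_connection(): the helper only builds the query, so the caller executes it.

last = user_last_interaction("user-42").execute()
if last.hits:
    print(last.hits[0].datetime)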
Example #33
def example10():
    """
    DSL objects for common entities instead of dict/json.
    All importable from elasticsearch_dsl
    """
    from elasticsearch_dsl import Q, Search
    """
    Straightforward mapping to json - kwargs are translated into keys into json.
    You can use the to_dict() method to see the result json.
    """

    q = Q("terms", tags=["python", "search"])
    q.to_dict()
    """
    All objects can also be constructed using the raw dict.
    """

    q = Q({"terms": {"tags": ["python", "search"]}})
    q.to_dict()
    """
    Query objects support logical operators which result in bool queries
    """
    q = q | Q("match", title="python")
    q.to_dict()
    """
    DSL objects also allow for attribute access instead of ['key']
    """
    q.minimum_should_match = 2
    q.minimum_should_match
    q.to_dict()

    from datetime import date

    q = q & Q("range", **{"@timestamp": {"lt": date(2019, 1, 1)}})
    q.to_dict()
    """
    Configuration is global so no client needs to be passed around.
    """
    from elasticsearch_dsl import connections
    """
    Default connection used where no other connection specified. Any configuration
    methods just pass all parameters to the underlying elasticsearch-py client.
    """
    connections.create_connection(hosts=["localhost"])
    """
    Optionally specify an alias for the connection in case of multiple connections.
    """
    connections.create_connection("prod", hosts=["localhost"])
    s = Search(using="prod")
    s.count()
    """
    You can always just pass in your own client instance
    """
    from elasticsearch import Elasticsearch

    s = Search(using=Elasticsearch())
    s.count()
    """
    Any method on Search returns a clone so you need to always assign it back to
    the same variable.
    """
    s = Search()
    s = s.params(q="fix")
    """
    Multiple queries are combined together using the AND operator
    """
    s = Search()
    s = s.query("match", description="fix")
    s = s.query("match", author="Honza")
    """
    Filter shortcut to use {bool: {filter: []}}
    """
    s = s.filter("range", committed_date={"lt": date(2016, 1, 1)})
    s.to_dict()
    """
    Exclude as a wrapper around must_not, use __ instead of dots for convenience.
    """
    s = s.exclude("term", committer__name__keyword="Honza Král")
    """
    Search is executed when iterated on or when .execute() is called.
    """
    for hit in s:
        """
        Hit class offers direct access to fields and via .meta any other properties
        on the returned hit (_id, _seq_no, ...)
        """
        print(f"{hit.meta.id[:6]} ({hit.author.name}): {hit.description[:50]}")
    """
    Aggregations are implemented in place to allow for chaining
    """
    s = Search(index="git")
    s.aggs.bucket("tags", "terms", field="terms").metric(
        "lines", "sum",
        field="stats.lines").metric("authors",
                                    "cardinality",
                                    field="author.name.keyword")
    r = s.execute()
    """
    Or modify aggregation in place
    """
    s.aggs["tags"].bucket("months",
                          "date_histogram",
                          field="committed_date",
                          interval="month")
    """
    Analysis
    """

    from elasticsearch_dsl import analyzer, token_filter

    a = analyzer(
        "file_analyzer",
        tokenizer="path_hierarchy",
        filter=[
            "lowercase",
            token_filter(
                "split_ext",
                "pattern_capture",
                preserve_original=True,
                patterns=[r"^([^\.]+)"],
            ),
        ],
    )

    a.simulate("test/integration/search.py")
    """
    """

    from elasticsearch_dsl import Document, Text, Keyword, InnerDoc, Date, Nested

    class FileDiff(InnerDoc):
        filename = Text(analyzer=a)
        patch = Text()

    class Commit(Document):
        description = Text()
        committed_date = Date()
        author = Text(fields={"keyword": Keyword()})

        files = Nested(FileDiff)

        def subject(self):
            return self.description.split("\n", 1)[0][:80]

        class Index:
            name = "git*"
            settings = {"number_of_replicas": 0}

    """
    Create the index
    """

    Commit.init(index="git-v2")
    """
    Search now returns Commit objects
    """
    for c in Commit.search():
        print(f"{c.meta.id}: {c.subject()}")
Example #34
    def GetAuditData(self, case, child_id, data_type, start=None, length=None, str_query=None, sort=None, order=None):
        q = [
            'w32registryraw', 'filedownloadhistory', 'urlhistory', 'timeline',
            'w32apifiles', 'w32rawfiles', 'w32eventlogs'
        ]

        if data_type in q:
            query = search_queries.GetGeneratorQuery(data_type, str_query,
                                                     case, child_id, start,
                                                     length, sort, order)
        else:
            s = Search()
            s = s[0:1000]
            t = Q('query_string',
                  default_field="ComputerName.raw",
                  query=child_id) & Q('query_string',
                                      default_field="CaseInfo.case_name",
                                      query=case)
            query = s.query(t).filter('term', AuditType__Generator=data_type)

        try:
            r = requests.post(self.es_host + ":" + self.es_port + self.index +
                              self.type_audit_type + '/_search',
                              data=json.dumps(query.to_dict()),
                              auth=(self.elastic_user, self.elastic_pass),
                              verify=False)
        except ConnectionError as e:
            ret = {"connection_error": e.args[0]}
            return ret

        data = []

        try:
            for x in r.json()['hits']['hits']:
                data.append(x)
        except KeyError:
            return data

        return data
Example #35
    def search_day(self, qterm, score_metric='perc', **kwargs):
        """
        Searches in the elasticsearch index for irc messages, grouped by day and channel.

        Uses the elasticsearch aggregation function to build following aggregation levels of the documents:

        - A: filter (day/channel) -> B: group by day (day-bucket) -> C: group by channel (channel-bucket)

        ----

        The channel-buckets are sorted by the 99th percentile of their contained document scores. (This means that 1%
        of the documents in a channel-bucket score higher than that bucket's 99th percentile.
        In comparison to sum or avg, the 99th percentile has the advantage that strongly matching documents
        in the channel-bucket are weighted more heavily, while many lower document scores count less or are even ignored.)
        For each day the highest perc-score of all channel-buckets on that day is remembered as ``max_score_day``.
        The day-buckets are then sorted by this highest perc-score ``max_score_day``.

        Important: This case describes the behaviour with a ``score_metric`` == 'perc'. If ``score_metric`` is changed,
        the behaviour is the same, except another metric is used.

        Definition: a document is one log-message

        :param score_metric: ``str`` Which metric to use for calculating channel-bucket score.
            This metric will also be used for sorting these buckets.

            - 'perc' 99-percentile of documents in channel-bucket. -> High-matching documents are valued higher
            - 'sum' sum of all document scores in channel-bucket -> All documents equal, many medium-matching documents
                may "eat-up" high-matching ones.
            - 'max' highest document score in channel-bucket as channel-bucket score -> Returns the day and channel with
                the highest matching log-message. Other messages on that day in that channel will be ignored.
        :param qterm: ``str`` Query-string to find
        :param \**kwargs:
                See below

            :Keyword Arguments:
                * *date_gte* (``datetime``) --
                  Filter, from: only messages newer than this date
                * *date_lte* (``datetime``) --
                  Filter, to: only messages older than this date
                * *date_sliding* (``str``) --
                  Filter sliding window, only messages of the past XX-hours/days/years... e.g. '-1d/d','-5y/y' --
                  See: https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#date-math
                * *date_sliding_type* (``str``) --
                  Valid date-type: e.g. y M d
                * *use_sliding_value* (``bool``) --
                  True: Only respect date_sliding and date_sliding_type.
                  False: only respect fix date: date_gte and date_lte
                * *number_results* (``int``) --
                  Number of total results to return
                * *sort_field* (``str``) --
                  By which field should results be sorted e.g. date, _score, username
                * *sort_dir* (``str``) --
                  In which direction results should be sorted:
                  '+': ascending
                  '-': descending
        :return:
        """
        number_results = 50
        number_top_hits = 5

        # Get arguments
        date_gte = None  # '2010-01-31T22:28:14+0300'  # from
        date_lte = 'now'  # ''2012-09-20T17:41:14+0900' # 'now'  # to
        date_sliding_value = ''
        date_sliding_type = ''
        use_sliding_value = True
        sort_field = '_score'
        sort_dir = '-'
        for key, value in kwargs.items():
            if key == 'date_gte':
                date_gte = ('{:' + dementor_constants.JSON_DATETIME_FORMAT +
                            '}').format(value)
            if key == 'date_lte':
                date_lte = ('{:' + dementor_constants.JSON_DATETIME_FORMAT +
                            '}').format(value)
            if key == 'use_sliding_value':
                use_sliding_value = value
            if key == 'date_sliding_value':
                date_sliding_value = value
            if key == 'date_sliding_type':
                date_sliding_type = value
            if key == 'number_results':
                number_results = value
            if key == 'sort_field':
                sort_field = value
            if key == 'sort_dir':
                sort_dir = value

        # Get specific query arguments
        filter_channel = ''
        for key, value in kwargs.items():
            if key == 'filter_channel':
                filter_channel = value

        # Prepare query
        s = DslSearch(using=self._es, index=self._index_prefix.format('*'))
        s = s[0:0]  # don't return other results, only aggregation

        # Search-Query
        s = s.query(self._get_query(qterm))

        # Prepare score-metric and corresponding order-field for buckets and documents
        percentiles_percents = 99
        percentiles_percents_field = '99.0'
        percentiles_percents_field_order = elastic_constants.IRC_DAY_ORDER_FIELD[
            'perc']
        score_order_field = elastic_constants.IRC_DAY_ORDER_FIELD[score_metric]

        # Prepare aggregate-query:
        # Aggregation levels: A (date/channel filtered) -> B (bucket days) -> C (bucket channel)

        # A date/channel filtered
        filters = []
        # Date
        if use_sliding_value and date_sliding_value != '' and date_sliding_type != '':
            filters.append({
                'range': {
                    '@timestamp': {
                        'gte':
                        'now-{0}{1}'.format(date_sliding_value,
                                            date_sliding_type),
                        'lte':
                        'now'
                    }
                }
            })
        elif date_gte is not None:
            filters.append(
                {'range': {
                    '@timestamp': {
                        'gte': date_gte,
                        'lte': date_lte
                    }
                }})

        # Channel
        if filter_channel != '':
            filters.append({'term': {'channel.keyword': filter_channel}})

        a_log_filtered = A('filter', Q('bool', must=filters))

        # B bucket days
        b_bucket_days = A('date_histogram',
                          field='@timestamp',
                          interval='day',
                          format='yyyy-MM-dd',
                          min_doc_count=1,
                          order={'max_score_day': 'desc'})

        # C bucket channels
        c_bucket_channels = A('terms',
                              field='channel.keyword',
                              min_doc_count=1,
                              order={score_order_field: 'desc'})
        c_bucket_channels = c_bucket_channels \
            .metric('max_date', 'max', field='@timestamp') \
            .metric('sum_score_channel', 'sum', script={'inline': '_score', 'lang': 'painless'}) \
            .metric('max_score_channel', 'max', script={'inline': '_score', 'lang': 'painless'}) \
            .metric('percentiles_score_channel', 'percentiles', percents=[percentiles_percents],
                    script={'inline': '_score', 'lang': 'painless'}) \
            .metric('top_msg_hits', 'top_hits', size=number_top_hits,
                    highlight={'fields': {'msg': {}, 'username': {}, 'channel': {}}},
                    sort=[{'_score': {'order': 'desc'}}],
                    **{'_source': {
                        'includes': ['channel', 'username', '@timestamp', 'msg']}})

        # Stack aggregations Main -> A -> B -> C (reversed order)
        b_bucket_days.bucket('logs_per_channel', c_bucket_channels)
        b_bucket_days.metric('max_score_day', 'max',
                             field=score_order_field)  # Add metric
        a_log_filtered.bucket('logs_per_day', b_bucket_days)
        s.aggs.bucket('logs_filtered', a_log_filtered)

        # Execute query
        response = s.execute()

        # Flatten days-channels buckets (see: http://stackoverflow.com/a/952952/2003325)
        bucket_days = response.aggregations.logs_filtered.logs_per_day.buckets
        bucket_channel_flat = [
            item for sub in bucket_days
            for item in sub.logs_per_channel.buckets
        ]

        # Sort flattened buckets (one bucket is a channel per day)
        if sort_field == 'channel.keyword':

            def sort_lambda(bucket_channel):
                return bucket_channel['key']
        elif sort_field == '_score' and score_metric == 'perc':

            def sort_lambda(bucket_channel):
                return bucket_channel.percentiles_score_channel.values[
                    percentiles_percents_field]
        elif sort_field == '_score':  # '@timestamp' or 'sum_score_channel' or 'max_score_channel'

            def sort_lambda(bucket_channel):
                return bucket_channel[score_order_field].value
        elif sort_field == '@timestamp':

            def sort_lambda(bucket_channel):
                return bucket_channel['max_date'].value

        sort_dir = 'desc' if sort_dir == '-' else 'asc'

        bucket_channel_flat_sorted = sorted(bucket_channel_flat,
                                            key=sort_lambda,
                                            reverse=(sort_dir == 'desc'))

        # Limit result-size
        number_results_buckets = int(number_results / 3)
        bucket_channel_flat_sorted = bucket_channel_flat_sorted[
            0:number_results_buckets]

        # Get hits to display from flattened buckets
        hit_list = []
        for channel_bucket in bucket_channel_flat_sorted:
            for hit in channel_bucket.top_msg_hits.hits.hits:
                if score_order_field == percentiles_percents_field_order:
                    score = channel_bucket.percentiles_score_channel.values[
                        percentiles_percents_field]
                else:
                    score = channel_bucket[score_order_field].value
                hit.meta = {'score': score, 'highlight': {}}
                hit_src = hit['_source']
                hit.sent = dateutil.parser.parse(hit_src['@timestamp'])
                hit.day_raw = '{:%Y-%m-%d}'.format(hit.sent)
                hit.timestamp_raw = hit_src['@timestamp']
                hit.username = hit_src.username
                hit.channel = hit_src.channel
                hit.msg = hit_src.msg
                if hasattr(hit, 'highlight'):
                    hit.meta.highlight = copy.deepcopy(hit.highlight)
                hit.meta.id = hit['_id']
            hit_list[len(
                hit_list
            ):] = channel_bucket.top_msg_hits.hits.hits  # create hits list

        return hit_list
Example #36
def get_elastic_container_histogram_legacy(ident) -> List:
    """
    Fetches a stacked histogram of {year, in_ia}. This is for the older style
    of coverage graph (SVG or JSON export). This function is DEPRECATED and
    should be removed in the near future.

    Filters to the past 500 years (at most), or about 1000 values.

    Returns a list of tuples:
        (year, in_ia, count)
    """

    search = Search(using=app.es_client,
                    index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
    search = search.query(
        'bool',
        must=[
            Q("range",
              release_year={
                  "gte": datetime.datetime.today().year - 499,
                  "lte": datetime.datetime.today().year,
              }),
        ],
        filter=[
            Q("bool",
              minimum_should_match=1,
              should=[
                  Q("match", container_id=ident),
              ]),
        ],
    )
    search.aggs.bucket(
        'year_in_ia',
        'composite',
        size=1000,
        sources=[
            {
                "year": {
                    "histogram": {
                        "field": "release_year",
                        "interval": 1,
                    },
                }
            },
            {
                "in_ia": {
                    "terms": {
                        "field": "in_ia",
                    },
                }
            },
        ],
    )
    search = search[:0]

    search = search.params(request_cache='true')
    resp = wrap_es_execution(search)

    buckets = resp.aggregations.year_in_ia.buckets
    vals = [(int(h['key']['year']), h['key']['in_ia'], h['doc_count'])
            for h in buckets]
    vals = sorted(vals)
    return vals
Example #37
def get_elastic_entity_stats() -> dict:
    """
    TODO: files, filesets, webcaptures (no schema yet)

    Returns dict:
        changelog: {latest: {index, datetime}}
        release: {total, refs_total}
        papers: {total, in_web, in_oa, in_kbart, in_web_not_kbart}
    """

    stats = {}

    # release totals
    search = Search(using=app.es_client,
                    index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
    search.aggs.bucket(
        'release_ref_count',
        'sum',
        field='ref_count',
    )
    search = search[:0]  # pylint: disable=unsubscriptable-object

    search = search.params(request_cache=True)
    resp = wrap_es_execution(search)

    stats['release'] = {
        "total": int(resp.hits.total),
        "refs_total": int(resp.aggregations.release_ref_count.value),
    }

    # paper counts
    search = Search(using=app.es_client,
                    index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
    search = search.query(
        'terms',
        release_type=[
            "article-journal",
            "paper-conference",
            # "chapter",
            # "thesis",
        ],
    )
    search.aggs.bucket('paper_like',
                       'filters',
                       filters={
                           "in_web": {
                               "term": {
                                   "in_web": "true"
                               }
                           },
                           "is_oa": {
                               "term": {
                                   "is_oa": "true"
                               }
                           },
                           "in_kbart": {
                               "term": {
                                   "in_kbart": "true"
                               }
                           },
                           "in_web_not_kbart": {
                               "bool": {
                                   "filter": [
                                       {
                                           "term": {
                                               "in_web": "true"
                                           }
                                       },
                                       {
                                           "term": {
                                               "in_kbart": "false"
                                           }
                                       },
                                   ]
                               }
                           },
                       })
    search = search[:0]

    search = search.params(request_cache=True)
    resp = wrap_es_execution(search)
    buckets = resp.aggregations.paper_like.buckets
    stats['papers'] = {
        'total': resp.hits.total,
        'in_web': buckets.in_web.doc_count,
        'is_oa': buckets.is_oa.doc_count,
        'in_kbart': buckets.in_kbart.doc_count,
        'in_web_not_kbart': buckets.in_web_not_kbart.doc_count,
    }

    # container counts
    search = Search(using=app.es_client,
                    index=app.config['ELASTICSEARCH_CONTAINER_INDEX'])
    search.aggs.bucket(
        'release_ref_count',
        'sum',
        field='ref_count',
    )
    search = search[:0]  # pylint: disable=unsubscriptable-object

    search = search.params(request_cache=True)
    resp = wrap_es_execution(search)
    stats['container'] = {
        "total": resp.hits.total,
    }

    return stats
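The wrap_es_execution() helper used throughout these examples is not shown; a minimal hypothetical sketch, assuming it only executes the query and normalizes low-level errors:

from elasticsearch_dsl import Search

def wrap_es_execution(search: Search):
    # Hypothetical helper: execute the query and re-raise transport/query
    # errors as a single application-level exception type.
    try:
        return search.execute()
    except Exception as e:
        raise RuntimeError("Elasticsearch query failed: {}".format(e))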
Example #38
0
def search():
    query = request.args.get('q')
    judge = request.args.get('judge')
    category = request.args.get('category')
    acts = request.args.get('acts')
    date_from = request.args.get('from')
    date_to = request.args.get('to')

    start = request.args.get('pagenum')
    if start:
        start = int(start) * 20
    else:
        start = 0
    verdict_tokens = []
    legal_tokens = []
    judge_name_tokens = []
    other_tokens = []
    judge2 = ""

    if query is not None:

        duration, verdict_tokens, legal_tokens, judge_name_tokens, other_tokens = parse_query(
            query)
        legal_other = legal_tokens + other_tokens
        verdict = ""
        for i in judge_name_tokens:
            if i is not None:
                judge2 += " " + i

        for i in verdict_tokens:
            if i is not None:
                verdict += " " + i

        print(legal_other)

        file_object = open('things.txt', "w+")
        # file_object1 = open('Word2.txt', "w+")

        for i in range(0, len(legal_other)):
            t = legal_other[i]
            t = t + '\n'
            print(t)
            file_object.write(t)

        t = 'EXIT'
        t = t + '\n'
        file_object.write(t)
        file_object.close()
        # file_object11 = open('things.txt', "r")
        subprocess.Popen("./distance vectors.bin < things.txt > Word2.txt",
                         shell=True)
        # call("["./distance" , "vectors.bin"],  stdin = file_object11, stdout = file_object1)

        # file_object1.close()

        with open('Word2.txt', 'r') as f:
            strin = f.read()

        words = re.sub('[^a-zA-Z0-9\n]', ',', strin)
        words = list([x for x in set(words.split(',')) if x != query])
        print(words)
        new_str = query + ' ' + ' '.join(words[:min(10, len(words))])

        # with open('Word2.txt', 'r') as f:
        #     t = f.readlines()
        #     t1 = t[0]

        # queryNew = ''
        # for i in range(0, len(t1)-2):
        #     queryNew = queryNew + t1[i]

        print('this :: ' + new_str)

    judge = judge if judge is not None else judge2
    s = Search(using=client)
    allfields = [
        'content', 'summary', 'judge', 'acts', 'title', 'verdict', 'keywords',
        'appeal', 'verdict', 'subject'
    ]
    should = []
    if (query is not None):
        q_base = Q('multi_match',
                   query=new_str,
                   fuzziness="1",
                   prefix_length=3,
                   fields=allfields)
        should.append(q_base)
    if (judge is not None):
        if (len(judge) > 0):
            q_judge = Q('multi_match', query=judge, fields=['judge'])
            should.append(q_judge)
    if (acts is not None):
        q_acts = Q('multi_match', query=acts, fields=['acts'])
        should.append(q_acts)
    if (category is not None):
        q_category = Q('multi_match', query=category, fields=['subject'])
        should.append(q_category)
    if (date_from is not None and date_to is not None):
        q_date = Q('range',
                   date={
                       'gte': date_from,
                       'lte': date_to,
                       'format': "yyyy/MM/dd"
                   })
        should.append(q_date)
    # if(date_from is None and date_to is not None):
    #     q_date = Q('range',fields=['date'],gte="1940/01/01",lte=date_to,format="yyyy/MM/dd")
    #     should.append(q_date)
    # if(date_from is not None and date_to is not None):
    #     q_date = Q('range',fields=['date'],gte=date_from,lte="now",format="yyyy/MM/dd")
    #     should.append(q_date)
    q = Q('bool', should=should, minimum_should_match=len(should))
    print(q)
    s = s.query(q)
    count = s.count()
    end = start + 20
    response = s[start:min(end, count)].execute()
    response = response.to_dict()
    result = {}

    # global query_
    # query_ = query
    # query_ = query_.strip().split()
    # query_ = m.infer_vector(query_, alpha=start_alpha, steps=infer_epoch)

    for i in range(len(response['hits']['hits'])):
        resp = response['hits']['hits'][i]["_source"]
        resp['score'] = response['hits']['hits'][i]["_score"]
        result[str(i)] = resp
        resp.pop('content')
    result['count'] = count
    # result_ = result.values()
    # result_ = sorted(result_, key=functools.cmp_to_key(compare))
    # result = {}
    # for i in range(len(response['hits']['hits'])):
    #     result[str(i)]=result_[i]

    return json.dumps(result)
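A standalone sketch of the trick used above: setting minimum_should_match to the number of should clauses makes the bool query behave like a conjunction of whichever clauses were actually added.

from elasticsearch_dsl import Q

clauses = [Q('multi_match', query='negligence', fields=['content']),
           Q('multi_match', query='Smith', fields=['judge'])]
q = Q('bool', should=clauses, minimum_should_match=len(clauses))
# Effectively equivalent: every clause must match and clause scores are summed.
q_must = Q('bool', must=clauses)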
Example #39
0
def save_aggregate_report_to_elasticsearch(aggregate_report,
                                           index_suffix=None,
                                           monthly_indexes=False,
                                           number_of_shards=1,
                                           number_of_replicas=1):
    """
    Saves a parsed DMARC aggregate report to ElasticSearch

    Args:
        aggregate_report (OrderedDict): A parsed DMARC aggregate report
        index_suffix (str): The suffix of the name of the index to save to
        monthly_indexes (bool): Use monthly indexes instead of daily indexes
        number_of_shards (int): The number of shards to use in the index
        number_of_replicas (int): The number of replicas to use in the index

    Raises:
            AlreadySaved
    """
    logger.debug("Saving aggregate report to Elasticsearch")
    aggregate_report = aggregate_report.copy()
    metadata = aggregate_report["report_metadata"]
    org_name = metadata["org_name"]
    report_id = metadata["report_id"]
    domain = aggregate_report["policy_published"]["domain"]
    begin_date = human_timestamp_to_datetime(metadata["begin_date"])
    end_date = human_timestamp_to_datetime(metadata["end_date"])
    begin_date_human = begin_date.strftime("%Y-%m-%d %H:%M:%S")
    end_date_human = end_date.strftime("%Y-%m-%d %H:%M:%S")
    if monthly_indexes:
        index_date = begin_date.strftime("%Y-%m")
    else:
        index_date = begin_date.strftime("%Y-%m-%d")
    aggregate_report["begin_date"] = begin_date
    aggregate_report["end_date"] = end_date
    date_range = [aggregate_report["begin_date"], aggregate_report["end_date"]]

    org_name_query = Q(dict(match_phrase=dict(org_name=org_name)))
    report_id_query = Q(dict(match_phrase=dict(report_id=report_id)))
    domain_query = Q(dict(match_phrase={"published_policy.domain": domain}))
    begin_date_query = Q(dict(match=dict(date_range=begin_date)))
    end_date_query = Q(dict(match=dict(date_range=end_date)))

    search = Search(index="dmarc_aggregate*")
    query = org_name_query & report_id_query & domain_query
    query = query & begin_date_query & end_date_query
    search.query = query

    existing = search.execute()
    if len(existing) > 0:
        raise AlreadySaved("An aggregate report ID {0} from {1} about {2} "
                           "with a date range of {3} UTC to {4} UTC already "
                           "exists in "
                           "Elasticsearch".format(report_id, org_name, domain,
                                                  begin_date_human,
                                                  end_date_human))
    published_policy = _PublishedPolicy(
        domain=aggregate_report["policy_published"]["domain"],
        adkim=aggregate_report["policy_published"]["adkim"],
        aspf=aggregate_report["policy_published"]["aspf"],
        p=aggregate_report["policy_published"]["p"],
        sp=aggregate_report["policy_published"]["sp"],
        pct=aggregate_report["policy_published"]["pct"],
        fo=aggregate_report["policy_published"]["fo"])

    for record in aggregate_report["records"]:
        agg_doc = _AggregateReportDoc(
            xml_schema=aggregate_report["xml_schema"],
            org_name=metadata["org_name"],
            org_email=metadata["org_email"],
            org_extra_contact_info=metadata["org_extra_contact_info"],
            report_id=metadata["report_id"],
            date_range=date_range,
            date_begin=aggregate_report["begin_date"],
            date_end=aggregate_report["end_date"],
            errors=metadata["errors"],
            published_policy=published_policy,
            source_ip_address=record["source"]["ip_address"],
            source_country=record["source"]["country"],
            source_reverse_dns=record["source"]["reverse_dns"],
            source_base_domain=record["source"]["base_domain"],
            message_count=record["count"],
            disposition=record["policy_evaluated"]["disposition"],
            dkim_aligned=record["policy_evaluated"]["dkim"] is not None
            and record["policy_evaluated"]["dkim"].lower() == "pass",
            spf_aligned=record["policy_evaluated"]["spf"] is not None
            and record["policy_evaluated"]["spf"].lower() == "pass",
            header_from=record["identifiers"]["header_from"],
            envelope_from=record["identifiers"]["envelope_from"],
            envelope_to=record["identifiers"]["envelope_to"])

        for override in record["policy_evaluated"]["policy_override_reasons"]:
            agg_doc.add_policy_override(type_=override["type"],
                                        comment=override["comment"])

        for dkim_result in record["auth_results"]["dkim"]:
            agg_doc.add_dkim_result(domain=dkim_result["domain"],
                                    selector=dkim_result["selector"],
                                    result=dkim_result["result"])

        for spf_result in record["auth_results"]["spf"]:
            agg_doc.add_spf_result(domain=spf_result["domain"],
                                   scope=spf_result["scope"],
                                   result=spf_result["result"])

        index = "dmarc_aggregate"
        if index_suffix:
            index = "{0}_{1}".format(index, index_suffix)
        index = "{0}-{1}".format(index, index_date)
        index_settings = dict(number_of_shards=number_of_shards,
                              number_of_replicas=number_of_replicas)
        create_indexes([index], index_settings)
        agg_doc.meta.index = index

        try:
            agg_doc.save()
        except Exception as e:
            raise ElasticsearchError("Elasticsearch error: {0}".format(
                e.__str__()))
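A standalone sketch (hypothetical field values) of how the & operator used in the duplicate check above combines Q objects into a single bool query whose clauses must all match:

from elasticsearch_dsl import Q

q = (Q("match_phrase", org_name="example.org")
     & Q("match_phrase", report_id="abc123"))
print(q.to_dict())
# {'bool': {'must': [{'match_phrase': {'org_name': 'example.org'}},
#                    {'match_phrase': {'report_id': 'abc123'}}]}}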
Example #40
0
    nrounds = args.nrounds
    alpha = args.alpha
    beta = args.beta
    R = args.R

    try:
        client = Elasticsearch()
        s = Search(using=client, index=index)

        if query is not None:
            for i in range(0, nrounds):

                q = Q('query_string', query=query[0])
                for i in range(1, len(query)):
                    q &= Q('query_string', query=query[i])
                s = s.query(q)
                response = s[0:nhits].execute()

                print("QUERY:")
                print(query)

                # Convert the query into a dictionary
                query_dict = queryToDict(query)

                sumDocs = {}

                # computation of the documents
                #print("------------------- COMPUTING THE DOCUMENTS -------------")
                for r in response:  # only returns a specific number of results
                    file_tw = toTFIDF(client, index, r.meta.id)  # tf-idf
                    sumDocs = {
Example #41
0
 def _delete(_by_filter):
     search = Search(index=self._index, using=self._es_client)
     search = search.query(_by_filter)
     return search.delete()
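Usage sketch (hypothetical index and filter) for the delete-by-query pattern above; Search.delete() sends the assembled query to the _delete_by_query API.

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q

es = Elasticsearch()
resp = Search(using=es, index="my-index").query(Q("term", status="obsolete")).delete()
print(resp.deleted)  # number of documents removed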
Example #42
0
import sys
import os

#logging.basicConfig(level=logging.WARN)
#es = elasticsearch.Elasticsearch(
#        ['https://gracc.opensciencegrid.org/q'],
#        timeout=300, use_ssl=True, verify_certs=False)

es = elasticsearch.Elasticsearch(['localhost:9200'], timeout=300)

osg_raw_index = 'gracc.osg.raw-*'

s = Search(using=es, index=osg_raw_index)

# Match the records by ProbeName and processors = 0.
s = s.query("match", ProbeName="htcondor-ce:hosted-ce18.grid.uchicago.edu")
s = s.query("match", Processors=0)
s = s.filter('range', EndTime={'from': 'now-12M', 'to': 'now'})
response = s.execute()

print "Query took %i milliseconds" % response.took

print "Query got %i hits" % response.hits.total

#update_id = "8c5816978fee6fc17718bcf81350d1f4"
#print "About to update record with id: %s" % update_id
#es.update(index="gracc.osg.raw3-2017.07", doc_type='JobUsageRecord', id=update_id, body={'doc': {'VOName': 'UserSchool2017'}})
update_buffer = []
for hit in s.scan():
    # Calculate the new CoreHours (cores = 1):
    core_hours = hit.WallDuration / 3600.0
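The snippet above is truncated. A hedged sketch (not the original code) of how such an accumulated update buffer is commonly flushed, reusing the index name and record id from the comments above, with the bulk helper:

from elasticsearch import Elasticsearch, helpers

es = Elasticsearch(['localhost:9200'], timeout=300)
update_buffer = [
    {
        "_op_type": "update",
        "_index": "gracc.osg.raw3-2017.07",
        "_id": "8c5816978fee6fc17718bcf81350d1f4",
        "doc": {"Processors": 1, "CoreHours": 1.0},
    },
]
helpers.bulk(es, update_buffer)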
Example #43
0
r = s.execute()

# for b in r.aggregations.percents.values:
#     print(b, r.aggregations.percents.values[b])

s = Search(using=ES_CLIENT,
           index=f"{ES_INDEX_DOCUMENT_EVAL}_{tm}_{criterion_id}")
q1 = Q("range",
       value={
           "gt": r.aggregations.percents.values[f'{high_threshold}.0'],
       })
q2 = Q("range",
       value={
           "lt": r.aggregations.percents.values[f'{low_threshold}.0'],
       })
s = s.query(q1 | q2)
s = s.query({
    "function_score": {
        "functions": [{
            "random_score": {
                "seed": "iivtiicthelyon1488"
            }
        }],
    }
})
s = s.source(('document_es_id', 'value'))[:2000]
r = s.execute()

document_eval_dict = dict((hit.document_es_id, hit.value) for hit in r)

s = Search(using=ES_CLIENT, index=ES_INDEX_DOCUMENT)
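An alternative sketch: the raw function_score dict passed to s.query() above can also be expressed with Q and SF objects.

from elasticsearch_dsl import Q
from elasticsearch_dsl.query import SF

random_order = Q(
    "function_score",
    functions=[SF("random_score", seed="iivtiicthelyon1488")],
)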
Example #44
0
def es_get_all(date_from, date_to, filters="", options=""):
    client = Elasticsearch(['192.168.129.132'])
    s = Search(using=client, index='logs-*')

    if date_from and date_to:
        datef, timef = date_from.split(" ")
        datetimef = str(datef) + "T" + str(timef) + ".000Z"
        datet, timet = date_to.split(" ")
        datetimet = str(datet) + "T" + str(timet) + ".000Z"
        s = s.query('bool',
                    filter=[
                        Q('range',
                          log_ingest_timestamp={
                              'gte': datetimef,
                              'lt': datetimet
                          })
                    ])
    if filters:
        for e in filters:
            if "=" == filters[e]["operator"]:
                f = []
                options = filters[e]["text"].split(",")
                for option in options:
                    if len(f) == 0:
                        f.append(Q("match", **{filters[e]["element"]: option}))
                    else:
                        f[0] = f[0] | Q("match", **
                                        {filters[e]["element"]: option})
                s = s.query('bool', filter=f)

            elif "!=" == filters[e]["operator"]:
                f = []
                options = filters[e]["text"].split(",")
                for option in options:
                    if len(f) == 0:
                        f.append(
                            ~Q("match", **{filters[e]["element"]: option}))
                    else:
                        f[0] = f[0] | Q("match", **
                                        {filters[e]["element"]: option})
                s = s.query('bool', filter=f)
    total = s.count()
    s = s[0:total]
    response = s.execute()

    events = []
    for hit in response:
        event = {}
        j = hit.to_dict()
        if "powershell" in options and "Microsoft-Windows-PowerShell/Operational" in j["log_name"] or\
                "sysmon" in options and "Sysmon" in j["log_name"]:
            event["event_id"] = j["event_id"]
            event["log_name"] = j[
                "log_name"]  # "Microsoft-Windows-Sysmon/Operational"
            event["computer_name"] = j["host_name"]
            event["event_data"] = {}
            if "Sysmon" in j["log_name"]:
                #print("entra en sysmon elastic")
                if "z_original_message" in j:
                    lines = str(j["z_original_message"]).splitlines()
                    for line in lines:
                        elements = line.split(": ")
                        key = elements[0]
                        if len(elements) > 1:
                            value = elements[1]
                        else:
                            value = ""
                        event["event_data"][key] = value
                    events.append(event)
            elif "Microsoft-Windows-PowerShell/Operational" in j["log_name"]:
                try:
                    event["event_data"]["log_ingest_timestamp"] = j[
                        "log_ingest_timestamp"]
                    if "powershell" in j:
                        if "host" in j["powershell"]:
                            if "application" in j["powershell"]["host"]:
                                decrypted = base64_in_application(
                                    j["powershell"]["host"]["application"])
                                event["event_data"]["application"] = j[
                                    "powershell"]["host"]["application"]
                                event["event_data"]["param"] = decrypted
                        elif "scriptblock" in j["powershell"]:
                            if "text" in j["powershell"]["scriptblock"]:
                                event["event_data"]["application"] = j[
                                    "powershell"]["scriptblock"]["text"]
                                event["event_data"]["param"] = ""
                    elif "param1" in j:
                        event["event_data"]["param"] = ""
                        event["event_data"]["application"] = j["param1"]
                    elif "param2" in j:
                        event["event_data"]["param"] = ""
                        event["event_data"]["application"] = j["param2"]
                    events.append(event)
                except Exception as e:
                    print("Eception: {}, Event: {}".format(e, j))
    return events
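A standalone sketch (hypothetical field and values) of the OR-chaining used in the filter loops above, written as a single bool/should filter instead of repeatedly OR-ing into f[0]:

from elasticsearch_dsl import Q

values = ["4624", "4688"]
q_any = Q("bool",
          should=[Q("match", event_id=v) for v in values],
          minimum_should_match=1)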
Example #45
0
def do_release_search(query: ReleaseQuery,
                      deep_page_limit: int = 2000) -> SearchHits:

    search = Search(using=app.es_client,
                    index=app.config['ELASTICSEARCH_RELEASE_INDEX'])

    # availability filters
    if query.fulltext_only:
        search = search.filter("term", in_ia=True)

    # Below, we combine several queries to improve scoring.

    # this query use the fancy built-in query string parser
    basic_biblio = Q(
        "query_string",
        query=query.q,
        default_operator="AND",
        analyze_wildcard=True,
        allow_leading_wildcard=False,
        lenient=True,
        fields=[
            "title^2",
            "biblio",
        ],
    )
    has_fulltext = Q("term", in_ia=True)
    poor_metadata = Q(
        "bool",
        should=[
            # if these fields aren't set, metadata is poor. The more that do
            # not exist, the stronger the signal.
            Q("bool", must_not=Q("exists", field="title")),
            Q("bool", must_not=Q("exists", field="release_year")),
            Q("bool", must_not=Q("exists", field="release_type")),
            Q("bool", must_not=Q("exists", field="release_stage")),
            Q("bool", must_not=Q("exists", field="container_id")),
        ],
    )

    search = search.query(
        "boosting",
        positive=Q(
            "bool",
            must=basic_biblio,
            should=[has_fulltext],
        ),
        negative=poor_metadata,
        negative_boost=0.5,
    )

    # Sanity checks
    limit = min((int(query.limit or 25), 100))
    offset = max((int(query.offset or 0), 0))
    if offset > deep_page_limit:
        # Avoid deep paging problem.
        offset = deep_page_limit

    search = search[offset:(offset + limit)]

    resp = wrap_es_execution(search)
    results = results_to_dict(resp)

    for h in results:
        # Ensure 'contrib_names' is a list, not a single string
        if type(h['contrib_names']) is not list:
            h['contrib_names'] = [
                h['contrib_names'],
            ]
        h['contrib_names'] = [
            name.encode('utf8', 'ignore').decode('utf8')
            for name in h['contrib_names']
        ]

    return SearchHits(
        count_returned=len(results),
        count_found=int(resp.hits.total),
        offset=offset,
        limit=limit,
        deep_page_limit=deep_page_limit,
        query_time_ms=int(resp.took),
        results=results,
    )
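results_to_dict() is referenced above but not shown; a minimal hypothetical sketch, assuming it simply flattens each Hit into a plain dict:

def results_to_dict(response):
    # Hypothetical helper: convert elasticsearch_dsl Hit objects to dicts.
    return [hit.to_dict() for hit in response]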
Example #46
0
def search(search_params, index, page_size, ip, request,
           filter_dead, page=1) -> Tuple[List[Hit], int, int]:
    """
    Given a set of keywords and an optional set of filters, perform a ranked
    paginated search.

    :param search_params: Search parameters. See
     :class: `ImageSearchQueryStringSerializer`.
    :param index: The Elasticsearch index to search (e.g. 'image')
    :param page_size: The number of results to return per page.
    :param ip: The user's hashed IP. Hashed IPs are used to anonymously but
    uniquely identify users exclusively for ensuring query consistency across
    Elasticsearch shards.
    :param request: Django's request object.
    :param filter_dead: Whether dead links should be removed.
    :param page: The results page number.
    :return: Tuple with a List of Hits from elasticsearch, the total count of
    pages and results.
    """
    s = Search(index=index)
    # Add requested filters.
    if 'li' in search_params.data:
        s = _filter_licenses(s, search_params.data['li'])
    elif 'lt' in search_params.data:
        s = _filter_licenses(s, search_params.data['lt'])

    if 'provider' in search_params.data:
        provider_filters = []
        for provider in search_params.data['provider'].split(','):
            provider_filters.append(Q('term', provider=provider))
        s = s.filter('bool', should=provider_filters, minimum_should_match=1)
    if 'extension' in search_params.data:
        extension = search_params.data['extension']
        extension_filter = Q('term', extension=extension)
        s = s.filter('bool', should=extension_filter, minimum_should_match=1)

    # It is sometimes desirable to hide content providers from the catalog
    # without scrubbing them from the database or reindexing.
    filter_cache_key = 'filtered_providers'
    filtered_providers = cache.get(key=filter_cache_key)
    if not filtered_providers:
        filtered_providers = ContentProvider.objects\
            .filter(filter_content=True)\
            .values('provider_identifier')
        cache.set(
            key=filter_cache_key,
            timeout=CACHE_TIMEOUT,
            value=filtered_providers
        )
    for filtered in filtered_providers:
        s = s.exclude('match', provider=filtered['provider_identifier'])

    # Search either by generic multimatch or by "advanced search" with
    # individual field-level queries specified.
    search_fields = ['tags.name', 'title', 'description']
    if 'q' in search_params.data:
        query = _quote_escape(search_params.data['q'])
        s = s.query(
            'query_string',
            query=query,
            fields=search_fields,
            type='most_fields'
        )
    else:
        if 'creator' in search_params.data:
            creator = _quote_escape(search_params.data['creator'])
            s = s.query(
                'query_string', query=creator, default_field='creator'
            )
        if 'title' in search_params.data:
            title = _quote_escape(search_params.data['title'])
            s = s.query(
                'query_string', query=title, default_field='title'
            )
        if 'tags' in search_params.data:
            tags = _quote_escape(search_params.data['tags'])
            s = s.query(
                'query_string',
                default_field='tags.name',
                query=tags
            )

    # Use highlighting to determine which fields contribute to the selection of
    # top results.
    s = s.highlight(*search_fields)
    s = s.highlight_options(order='score')
    # Search objects are immutable; reassign so track_scores takes effect.
    s = s.extra(track_scores=True)
    # Route users to the same Elasticsearch worker node to reduce
    # pagination inconsistencies and increase cache hits.
    s = s.params(preference=str(ip))
    # Paginate
    start, end = _get_query_slice(s, page_size, page, filter_dead)
    s = s[start:end]
    search_response = s.execute()
    results = _post_process_results(
        s,
        start,
        end,
        page_size,
        search_response,
        request,
        filter_dead
    )

    result_count, page_count = _get_result_and_page_count(
        search_response,
        results,
        page_size
    )

    return results, page_count, result_count
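_quote_escape() is one of several helpers assumed above; a hypothetical sketch, assuming its only job is to keep unbalanced quotes in user input from breaking the query_string parser:

def _quote_escape(query: str) -> str:
    # Hypothetical helper: escape double quotes when they are unbalanced so
    # query_string parsing does not fail.
    if query.count('"') % 2 == 1:
        return query.replace('"', '\\"')
    return query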
Example #47
0
def get_elastic_container_stats(ident, issnl=None):
    """
    Returns dict:
        ident
        issnl (optional)
        total
        in_web
        in_kbart
        preserved
    """

    search = Search(using=app.es_client,
                    index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
    search = search.query(
        'term',
        container_id=ident,
    )
    search.aggs.bucket(
        'container_stats',
        'filters',
        filters={
            "in_web": {
                "term": {
                    "in_web": True
                },
            },
            "in_kbart": {
                "term": {
                    "in_kbart": True
                },
            },
            "is_preserved": {
                "term": {
                    "is_preserved": True
                },
            },
        },
    )
    search.aggs.bucket(
        'preservation',
        'terms',
        field='preservation',
        missing='_unknown',
    )
    search.aggs.bucket(
        'release_type',
        'terms',
        field='release_type',
        missing='_unknown',
    )

    search = search[:0]

    search = search.params(request_cache=True)
    resp = wrap_es_execution(search)

    container_stats = resp.aggregations.container_stats.buckets
    preservation_bucket = agg_to_dict(resp.aggregations.preservation)
    preservation_bucket['total'] = resp.hits.total
    for k in ('bright', 'dark', 'shadows_only', 'none'):
        if k not in preservation_bucket:
            preservation_bucket[k] = 0
    if app.config['FATCAT_MERGE_SHADOW_PRESERVATION']:
        preservation_bucket['none'] += preservation_bucket['shadows_only']
        preservation_bucket['shadows_only'] = 0
    release_type_bucket = agg_to_dict(resp.aggregations.release_type)
    stats = {
        'ident': ident,
        'issnl': issnl,
        'total': resp.hits.total,
        'in_web': container_stats['in_web']['doc_count'],
        'in_kbart': container_stats['in_kbart']['doc_count'],
        'is_preserved': container_stats['is_preserved']['doc_count'],
        'preservation': preservation_bucket,
        'release_type': release_type_bucket,
    }

    return stats
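agg_to_dict() is referenced above but not defined here; a minimal hypothetical sketch for a terms aggregation result:

def agg_to_dict(agg) -> dict:
    # Hypothetical helper: map each terms bucket key to its doc_count.
    return {bucket.key: bucket.doc_count for bucket in agg.buckets}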
Example #48
0
def more_like_this(obj,
                   fields,
                   max_query_terms=25,
                   min_term_freq=2,
                   min_doc_freq=5,
                   max_doc_freq=0,
                   query=None):
    """More like this.

    https://www.elastic.co/guide/en/elasticsearch/reference/current/
    query-dsl-mlt-query.html

    :param obj: Django model instance for which similar objects shall be found.
    :param fields: Fields to search in.
    :param max_query_terms:
    :param min_term_freq:
    :param min_doc_freq:
    :param max_doc_freq:
    :param query: Q query
    :type obj: Instance of `django.db.models.Model` (sub-classed) model.
    :type fields: list
    :type max_query_terms: int
    :type min_term_freq: int
    :type min_doc_freq: int
    :type max_doc_freq: int
    :type query: elasticsearch_dsl.query.Q
    :return: List of objects.
    :rtype: elasticsearch_dsl.search.Search

    Example:

        >>> from django_elasticsearch_dsl_drf.helpers import more_like_this
        >>> from books.models import Book
        >>> book = Book.objects.first()
        >>> similar_books = more_like_this(
        >>>     book,
        >>>     ['title', 'description', 'summary']
        >>> )
    """
    _index, _mapping = get_index_and_mapping_for_model(obj._meta.model)
    if _index is None:
        return None

    _client = connections.get_connection()
    _search = Search(using=_client, index=_index)

    if query is not None:
        _search = _search.query(query)

    kwargs = {}

    if max_query_terms is not None:
        kwargs['max_query_terms'] = max_query_terms

    if min_term_freq is not None:
        kwargs['min_term_freq'] = min_term_freq

    if min_doc_freq is not None:
        kwargs['min_doc_freq'] = min_doc_freq

    if max_doc_freq is not None:
        kwargs['max_doc_freq'] = max_doc_freq

    _like_options = {
        '_id': "{}".format(obj.pk),
        '_index': "{}".format(_index),
    }
    if not ELASTICSEARCH_GTE_7_0:
        _like_options.update({'_type': "{}".format(_mapping)})

    return _search.query(
        MoreLikeThis(fields=fields, like=_like_options, **kwargs))
Example #49
0
def get_elastic_preservation_by_year(query) -> List[dict]:
    """
    Fetches a stacked histogram of {year, preservation}.

    Preservation has 4 potential values; this function filters to the past 250
    years (at most), or about 1000 values.

    Returns a list of dicts, sorted by year, with keys/values like:

        {year (int), bright (int), dark (int), shadows_only (int), none (int)}
    """

    search = Search(using=app.es_client,
                    index=app.config['ELASTICSEARCH_RELEASE_INDEX'])
    if query.q not in [None, "*"]:
        search = search.query(
            "query_string",
            query=query.q,
            default_operator="AND",
            analyze_wildcard=True,
            allow_leading_wildcard=False,
            lenient=True,
            fields=[
                "biblio",
            ],
        )
    if query.container_id:
        search = search.filter(
            "term",
            container_id=query.container_id,
        )
    search = search.filter(
        "range",
        release_year={
            "gte": datetime.datetime.today().year - 249,
            "lte": datetime.datetime.today().year,
        },
    )

    search.aggs.bucket(
        'year_preservation',
        'composite',
        size=1500,
        sources=[
            {
                "year": {
                    "histogram": {
                        "field": "release_year",
                        "interval": 1,
                    },
                }
            },
            {
                "preservation": {
                    "terms": {
                        "field": "preservation",
                    },
                }
            },
        ],
    )
    search = search[:0]
    search = search.params(request_cache='true')
    resp = wrap_es_execution(search)

    buckets = resp.aggregations.year_preservation.buckets
    year_nums = set([int(h['key']['year']) for h in buckets])
    year_dicts = dict()
    if year_nums:
        for num in range(min(year_nums), max(year_nums) + 1):
            year_dicts[num] = dict(year=num,
                                   bright=0,
                                   dark=0,
                                   shadows_only=0,
                                   none=0)
        for row in buckets:
            year_dicts[int(
                row['key']['year'])][row['key']['preservation']] = int(
                    row['doc_count'])
    if app.config['FATCAT_MERGE_SHADOW_PRESERVATION']:
        for k in year_dicts.keys():
            year_dicts[k]['none'] += year_dicts[k]['shadows_only']
            year_dicts[k]['shadows_only'] = 0
    return sorted(year_dicts.values(), key=lambda x: x['year'])
Example #50
0
 def search(self, query, field, client):
     q = Q("multi_match", query=query, fields=[field], operator="and", tie_breaker=1, type="most_fields")
     s = Search(using=client)
     s = s.query(q)
     return s.execute().to_dict()["hits"]["hits"]
Example #51
0
    def search(self, qterm, **kwargs):
        r"""Searches in the elasticsearch index for the mail

            :param qterm:
                Query-string
            :type qterm: ``str``
            :param \**kwargs:
                See below

            :Keyword Arguments:
                * *date_gte* (``datetime``) --
                  Filter, From: only emails greater than
                * *date_lte* (``datetime``) --
                  Filter, To: only emails less than
                * *date_sliding* (``str``) --
                  Filter sliding window, only emails of the past XX-hours/days/years... e.g. '-1d/d','-5y/y' --
                  See: https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#date-math
                * *date_sliding_type* (``str``) --
                  Valid date-type: e.g. y M d
                * *use_sliding_value* (``bool``) --
                  True: Only respect date_sliding and date_sliding_type.
                  False: only respect fix date: date_gte and date_lte
                * *number_results* (``int``) --
                  Number of total results to return
                * *sort_field* (``str``) --
                  By which field should results be sorted e.g. date, _score, fromEmail.keyword
                * *sort_dir* (``str``) --
                  In Which direction should results be sorted
                  '+': ascending
                  '-': descending
            :return: ``DslSearch Response``

            """

        number_results = 10

        # Get arguments
        date_gte = None  # '2010-01-31T22:28:14+0300'  # from
        date_lte = 'now'  # ''2012-09-20T17:41:14+0900' # 'now'  # to
        date_sliding_value = ''
        date_sliding_type = ''
        use_sliding_value = True
        sort_field = '_score'
        sort_dir = '-'
        for key, value in kwargs.items():
            if key == 'date_gte':
                date_gte = ('{:' + dementor_constants.JSON_DATETIME_FORMAT +
                            '}').format(value)
            if key == 'date_lte':
                date_lte = ('{:' + dementor_constants.JSON_DATETIME_FORMAT +
                            '}').format(value)
            if key == 'use_sliding_value':
                use_sliding_value = value
            if key == 'date_sliding_value':
                date_sliding_value = value
            if key == 'date_sliding_type':
                date_sliding_type = value
            if key == 'number_results':
                number_results = value
            if key == 'sort_field':
                sort_field = value
            if key == 'sort_dir':
                sort_dir = value

        # Prepare query
        s = DslSearch(using=self._es, index=self._index_prefix.format('*'))

        # Filter date
        date_field_name = self.get_date_field_name()
        if use_sliding_value & (date_sliding_value != '') & (date_sliding_type
                                                             != ''):
            s = s.query(
                'bool',
                filter=[
                    Range(
                        **{
                            date_field_name: {
                                'gte':
                                'now-{0}{1}'.format(date_sliding_value,
                                                    date_sliding_type)
                            }
                        })
                ])
            # s = s.filter('range', date={'gte': 'now-{0}{1}'.format(date_sliding_value, date_sliding_type)})
        elif date_gte is not None:
            s = s.query(
                'bool',
                filter=[
                    Range(
                        **
                        {date_field_name: {
                            'lte': date_lte,
                            'gte': date_gte
                        }})
                ])
            # s = s.filter('range', date={'lte': date_lte, 'gte': date_gte})

        # Add query-specific fields
        s = self.add_query_fields(s, qterm, **kwargs)

        s = s.sort(
            ''.join((sort_dir, sort_field)),
            '-_score',
        )

        # Number of results
        s = s[0:number_results]

        # Execute
        response = s.execute()
        response_altered = self.alter_response(response)
        return response_altered
Example #52
0
    def find(self, query, client):
        q = Q("match", _id=query)
        s = Search(using=client)
        s = s.query(q)

        return s.execute().to_dict()["hits"]["hits"]
Example #53
0
def dataset_prepare(**kwargs):
    import os
    import itertools
    import shutil
    import artm
    import datetime
    from elasticsearch_dsl import Search, Q
    from dags.bigartm.services.cleaners import txt_writer

    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT, ES_INDEX_TOPIC_MODELLING
    from mainapp.models_user import TopicGroup

    import logging
    es_logger = logging.getLogger('elasticsearch')
    es_logger.setLevel(logging.ERROR)

    # Recreate index object
    try:
        index = init_tm_index(**kwargs)
    except TMNotFoundException:
        return 1

    lc = artm.messages.ConfigureLoggingArgs()
    lib = artm.wrapper.LibArtm(logging_config=lc)
    lc.minloglevel = 3  # 0 = INFO, 1 = WARNING, 2 = ERROR, 3 = FATAL
    lib.ArtmConfigureLogging(lc)
    perform_actualize = 'perform_actualize' in kwargs
    fast = 'fast' in kwargs
    name = kwargs['name']
    name_translit = kwargs['name_translit']
    corpus = kwargs['corpus']
    if type(corpus) != list:
        corpus = [corpus]
    corpus_datetime_ignore = kwargs.get('corpus_datetime_ignore', [])
    source = kwargs['source']
    datetime_from = kwargs['datetime_from']
    datetime_to = kwargs['datetime_to']
    group_id = kwargs['group_id']
    topic_weight_threshold = kwargs['topic_weight_threshold']
    topic_doc = kwargs['topic_doc']
    uniq_topic_doc = kwargs['uniq_topic_doc']
    temp_folder = kwargs['temp_folder']
    text_field = kwargs['text_field']
    is_dynamic = 'is_dynamic' in kwargs and kwargs['is_dynamic']

    # Extract
    s = Search(using=ES_CLIENT, index=ES_INDEX_DOCUMENT).filter("terms", corpus=corpus) \
                                                        .filter('exists', field=text_field)
    q_from = Q()
    q_to = Q()
    if source:
        s = s.filter("term", **{"source": source})
    if datetime_from:
        q_from = Q("range", datetime={"gte": datetime_from})
    if datetime_to and not perform_actualize:
        q_to = Q("range", datetime={"lte": datetime_to})
    q = (q_from & q_to)
    for corpus_to_ignore in corpus_datetime_ignore:
        q = q | (~Q('exists', field="datetime") & Q("term", corpus=corpus_to_ignore))
    s = s.query(q)
    s = s.source(["id", "text", text_field, "title", "source", "num_views", "num_comments", "datetime", "corpus"])[:50_000_000]

    group_document_es_ids = None
    print("!!! group_id", group_id) # TODO Remove prints
    if group_id:
        group = TopicGroup.objects.get(id=group_id)
        topic_ids = [t.topic_id for t in group.topics.all()]
        if not topic_ids:
            return "Group is empty"
        topic_modelling_name = group.topic_modelling_name
        st = Search(using=ES_CLIENT, index=f"{topic_doc}_{topic_modelling_name}") \
                 .filter("terms", **{"topic_id": topic_ids}) \
                 .filter("range", topic_weight={"gte": topic_weight_threshold}) \
                 .filter("range", datetime={"gte": datetime.date(2000, 1, 1)}) \
                 .source(('document_es_id'))[:5000000]
        print("!!!", f"{topic_doc}_{topic_modelling_name}", topic_ids, topic_weight_threshold)
        r = st.scan()
        group_document_es_ids = set([doc.document_es_id for doc in r])
        print(len(group_document_es_ids))

    # Exclude document already in TM if actualizing
    ids_to_skip = None
    if perform_actualize:
        std = Search(using=ES_CLIENT, index=f"{uniq_topic_doc}_{name}").source(['document_es_id'])[:50_000_000]
        ids_to_skip = set((doc.document_es_id for doc in std.scan()))
        print("!!!", "Skipping", len(ids_to_skip))

    print("!!!", "Potential docs", s.count())
    formated_data = document_scanner(s, text_field, corpus, ids_to_skip, group_document_es_ids)

    try:
        peek_doc = next(formated_data)
    except Exception as e:
        print("!!! No docs", e)
        peek_doc = False
    # if perform_actualize and peek_doc == False:
    #     return f"No documents to actualize"

    data_folder = os.path.join("/big_data/", temp_folder)

    try:
        os.mkdir(data_folder)
    except:
        pass

    if is_dynamic:
        data_folder = os.path.join(data_folder,
                                   f"bigartm_formated_data_{name if not name_translit else name_translit}{'_actualize' if perform_actualize else ''}{'_fast' if fast else ''}_{datetime_from.date()}_{datetime_to.date()}")
    else:
        data_folder = os.path.join(data_folder,
                                   f"bigartm_formated_data_{name if not name_translit else name_translit}{'_actualize' if perform_actualize else ''}{'_fast' if fast else ''}_{datetime_from}_{datetime_to}")
    try:
        shutil.rmtree(data_folder, ignore_errors=True)
        os.mkdir(data_folder)
    except:
        pass

    print("!!!", f"Writing documents")
    txt_writer(data=itertools.chain([peek_doc], formated_data), filename=os.path.join(data_folder, f"bigartm_formated_data.txt"))
    artm.BatchVectorizer(data_path=os.path.join(data_folder, f"bigartm_formated_data.txt"),
                         data_format="vowpal_wabbit",
                         target_folder=os.path.join(data_folder, "batches"))
    return f"index.number_of_document={index.number_of_documents}"
Example #54
0
def find_match(queue, clf):
    # connection to target and reference database
    client = Elasticsearch(timeout=200, port=ref_index_port)
    csxdb = mysql.connector.connect(user='******',
                                    password='******',
                                    host='csxstaging01',
                                    database='citeseerx2',
                                    charset='utf8',
                                    use_unicode=True)
    CSXcursor = csxdb.cursor(dictionary=True)
    CSXauthorCursor = csxdb.cursor(dictionary=True)
    REFdb = mysql.connector.connect(user='******',
                                    password='******',
                                    host='csxstaging01',
                                    database='wos2017_12',
                                    charset='utf8',
                                    use_unicode=True)
    REFcursor = REFdb.cursor(dictionary=True)

    while (True):
        if queue.empty():
            break
        try:
            csxID = queue.get()
            if csxID is None:
                queue.task_done()
                break
            CSXcursor.execute(cmd_paper % (csxID))
            CSXPaper = CSXcursor.fetchone()
            if CSXPaper is None:
                queue.task_done()
                continue
            CSXauthorCursor.execute(cmd_author % (csxID))
            CSXauthors = CSXauthorCursor.fetchall()
            s = Search(using=client, index=ref_index)
            if CSXPaper['title'] is None or len(CSXPaper['title']) < 20:
                if len(CSXauthors) > 0 and CSXauthors[0][
                        'lname'] is not None and CSXPaper['year'] is not None:
                    s.query = Q('bool',
                                should=[
                                    Q('match', year=CSXPaper['year']),
                                    Q('match', authors=CSXauthors[0]['lname'])
                                ])
                else:
                    if CSXPaper['abstract'] is not None:
                        s = s.query("match", abstract=CSXPaper['abstract'])
                    else:
                        queue.task_done()
                        continue
            else:
                s = s.query("match", title=CSXPaper['title'])
            response = s.execute()
            for hit in response:
                REFcursor.execute(cmd_REFpaper % (hit['id']))
                REFpaper = REFcursor.fetchone()
                REFcursor.execute(cmd_REFauthor % (hit['id']))
                REFauthors = REFcursor.fetchall()
                features = SimilarityProfile.calcFeatureVector(
                    REFpaper, REFauthors, CSXPaper, CSXauthors)
                label = clf.predict([features])
                if label == 1:
                    with open("results.txt", "a") as g:
                        fcntl.flock(g, fcntl.LOCK_EX)
                        g.write(csxID + '\t' + hit['id'] + '\n')
                        fcntl.flock(g, fcntl.LOCK_UN)
                    break
            queue.task_done()
        except Exception:
            queue.task_done()
            print("-" * 60)
            print(csxID)
            print(traceback.format_exc())
            print(sys.exc_info()[0])
            print("-" * 60)
Example #55
0
import random
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

es = Elasticsearch()
s = Search(using=es)
dictmatch = {"doctors.location": "Jayanagar"}
hits = s.query("match", **dictmatch).extra(from_=5, size=1).execute()

hits = hits.to_dict()
for hit in hits['hits']['hits']:
    print(str(hit['_source']['doctors']))
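An equivalent sketch: slicing a Search sets from/size, so the .extra(from_=5, size=1) call above can also be written as a slice.

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

es = Elasticsearch()
s = Search(using=es).query("match", **{"doctors.location": "Jayanagar"})
hits = s[5:6].execute().to_dict()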
Example #56
0
    def search(self,
               lucene,
               index="*",
               doctype="doc",
               fields=None,
               date_field="@timestamp",
               days=None,
               start_time=None,
               end_time=None):
        '''
        Search Elastic and return the results as a list of dicts.

        lucene: A string containing the Elastic search (e.g., 'item:5282 AND color:red')
        index: A string containing the index name to search, or an index name pattern
               if you want to search multiple indices (e.g., 'myindex' or 'myindex-*')
        doctype: The document type you are interested in.
        fields: A string containing a comma-separated list of field names to return.
                The default is to return all fields, but using this list you can
                select only certain fields, which may make things a bit faster.
        date_field: The name of the field used for date/time comparison.
        days: Search the past X days. If provided, this supersedes both start_time
              and end_time.
        start_time: A datetime() object representing the start of the search
                    window. If used without end_time, the end of the search
                    window is the current time.
        end_time: A datetime() object representing the end of the search window.
                  If used without start_time, the search start will be the earliest
                  time in the index.
        '''

        s = Search(using=self.es_conn, index=index, doc_type=doctype)

        s = s.query("query_string", query=lucene)

        if fields:
            s = s.source(fields.split(','))

        # Add timestamp filters, if provided.  Days takes precendence over
        # use of either/both of start_time and end_time.
        # Note the weird unpacked dictionary syntax in the call to s.filter().
        # We have to do it this way because Python has an issue naming things
        # with "@" in them, but the default timestamp field in many ES servers is
        # "@timestamp".
        # ref:  https://github.com/elastic/elasticsearch-dsl-py/blob/master/docs/search_dsl.rst
        if days:
            end = datetime.now()
            start = end - timedelta(days=days)
            s = s.filter('range', **{date_field: {"gte": start, "lte": end}})
        elif start_time and not end_time:
            s = s.filter('range', **{date_field: {"gte": start_time}})
        elif end_time and not start_time:
            s = s.filter('range', **{date_field: {"lte": end_time}})
        elif start_time and end_time:
            s = s.filter('range',
                         **{date_field: {
                             "gte": start_time,
                             "lte": end_time
                         }})

        # execute the search
        results = s.scan()

        for hit in results:
            yield hit.to_dict()
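A standalone sketch of the keyword-unpacking trick described in the comment above: field names containing "@" cannot be written as Python keyword arguments, so the range filter is passed through an unpacked dict.

from datetime import datetime, timedelta
from elasticsearch_dsl import Search

s = Search(index="logs-*")
end = datetime.now()
start = end - timedelta(days=7)
s = s.filter("range", **{"@timestamp": {"gte": start, "lte": end}})
print(s.to_dict())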
Example #57
0
def results(page):
    global tmp_text
    global tmp_title
    global tmp_star
    global tmp_director
    global tmp_language
    global tmp_location
    global tmp_time
    global tmp_categories
    global tmp_country
    global tmp_min
    global tmp_max
    global gresults

    # convert the <page> parameter in url to integer.
    if type(page) is not int:
        page = int(page)
    # if the method of request is post (for initial query), store query in local global variables
    # if the method of request is get (for "next" results), extract query contents from client's global variables
    if request.method == 'POST':
        text_query = request.form['query']
        star_query = request.form['starring']
        director_query = request.form['director']
        language_query = request.form['language']
        location_query = request.form['location']
        time_query = request.form['time']
        categories_query = request.form['categories']
        country_query = request.form['country']
        mintime_query = request.form['mintime']

        if len(mintime_query) == 0:
            mintime = 0
        else:
            if mintime_query.replace('.', '', 1).isdigit():
                mintime = float(mintime_query)
            else:
                return render_template('error_page.html')

        maxtime_query = request.form['maxtime']
        if len(maxtime_query) == 0:
            maxtime = 99999
        else:
            if maxtime_query.replace('.', '', 1).isdigit():
                maxtime = float(maxtime_query)
            else:
                return render_template('error_page.html')

        # update global variable template data
        tmp_text = text_query
        tmp_star = star_query
        tmp_director = director_query
        tmp_language = language_query
        tmp_location = location_query
        tmp_time = time_query
        tmp_categories = categories_query
        tmp_country = country_query
        tmp_min = mintime
        tmp_max = maxtime
    else:
        # use the current values stored in global variables.
        text_query = tmp_text
        star_query = tmp_star
        director_query = tmp_director
        language_query = tmp_language
        location_query = tmp_location
        time_query = tmp_time
        categories_query = tmp_categories
        country_query = tmp_country
        mintime = tmp_min

        if tmp_min > 0:
            mintime_query = tmp_min
        else:
            mintime_query = ""
        maxtime = tmp_max
        if tmp_max < 99999:
            maxtime_query = tmp_max
        else:
            maxtime_query = ""

    # store query values to display in search boxes in UI
    shows = {}
    shows['text'] = text_query
    shows['starring'] = star_query
    shows['director'] = director_query
    shows['language'] = language_query
    shows['location'] = location_query
    shows['time'] = time_query
    shows['categories'] = categories_query
    shows['maxtime'] = maxtime_query
    shows['mintime'] = mintime_query

    # Create a search object to query our index
    search = Search(index='sample_film_index')

    # Build up your elasticsearch query in piecemeal fashion based on the user's parameters passed in.
    # The search API is "chainable".
    # Each call to search.query method adds criteria to our growing elasticsearch query.
    # You will change this section based on how you want to process the query data input into your interface.

    # search for runtime using a range query
    s = search.query('range', runtime={'gte': mintime, 'lte': maxtime})

    # Conjunctive search over multiple fields (title and text) using the text_query passed in
    if len(text_query) > 0:
        s = s.query('multi_match',
                    query=text_query,
                    type='cross_fields',
                    fields=['title', 'text'],
                    operator='and')
        response = s.execute()
        if len(response) == 0:
            s = search.query('range', runtime={'gte': mintime, 'lte': maxtime})
            s = s.query('multi_match',
                        query=text_query,
                        type='cross_fields',
                        fields=['title^4', 'text'],
                        operator='or')
        phrase = re.findall(r'"(.*?)"', text_query)
        if len(phrase) != 0:
            s = s.query(Q('match_phrase', text=phrase[0]))

    # search for matching stars
    # You should support multiple values (list)
    if len(star_query) > 0:
        s = s.query('match', starring=star_query)
    if len(director_query) > 0:
        s = s.query('match', director=director_query)
    if len(language_query) > 0:
        s = s.query('match', language=language_query)
    if len(location_query) > 0:
        s = s.query('match', location=location_query)
    if len(time_query) > 0:
        s = s.query('match', time=time_query)
    if len(categories_query) > 0:
        s = s.query('match', categories=categories_query)
    if len(country_query) > 0:
        s = s.query('match', country=country_query)

    # highlight
    s = s.highlight_options(pre_tags='<mark>', post_tags='</mark>')
    # s = s.highlight('text', fragment_size=999999999, number_of_fragments=1)
    # s = s.highlight('title', fragment_size=999999999, number_of_fragments=1)
    for key in shows:
        s = s.highlight(key, fragment_size=999999999, number_of_fragments=1)

    # determine the subset of results to display (based on current <page> value)
    start = 0 + (page - 1) * 10
    end = 10 + (page - 1) * 10

    # execute search and return results in specified range.
    response = s[start:end].execute()

    # insert data into response
    resultList = {}
    display_fields = ['title', 'starring', 'runtime', 'director', 'location',
                      'time', 'language', 'categories', 'country', 'text']
    for hit in response.hits:
        result = {'score': hit.meta.score}

        # prefer the highlighted fragment when one exists,
        # otherwise fall back to the stored field value
        for field in display_fields:
            if 'highlight' in hit.meta and field in hit.meta.highlight:
                result[field] = hit.meta.highlight[field][0]
            else:
                result[field] = hit[field]

        resultList[hit.meta.id] = result

    # make the result list available globally
    gresults = resultList

    # get the total number of matching results
    result_num = response.hits.total

    # if results were found, render them; otherwise build a list of 'not found' messages
    if result_num > 0:
        return render_template('page_SERP.html',
                               results=resultList,
                               res_num=result_num,
                               page_num=page,
                               queries=shows)
    else:
        message = []
        if len(text_query) > 0:
            message.append('Unknown search term: ' + text_query)
        if len(star_query) > 0:
            message.append('Cannot find star: ' + star_query)
        if len(time_query) > 0:
            message.append('Cannot find time: ' + time_query)
        if len(director_query) > 0:
            message.append('Cannot find director: ' + director_query)
        if len(location_query) > 0:
            message.append('Cannot find location: ' + location_query)
        if len(language_query) > 0:
            message.append('Cannot find language: ' + language_query)
        if len(categories_query) > 0:
            message.append('Cannot find categories: ' + categories_query)
        if len(country_query) > 0:
            message.append('Cannot find country: ' + country_query)

        if len(mintime_query) > 0 and len(maxtime_query) > 0:
            message.append(
                'Cannot find running time between {} mins and {} mins'.format(
                    mintime_query, maxtime_query))
        elif len(mintime_query) > 0:
            message.append(
                'Cannot find running time greater than {} mins'.format(
                    mintime_query))
        elif len(maxtime_query) > 0:
            message.append('Cannot find running time less than {} mins'.format(
                maxtime_query))

        return render_template('page_SERP.html',
                               results=message,
                               res_num=result_num,
                               page_num=page,
                               queries=shows)
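A minimal sketch of the paging-and-highlighting pattern used above, written as a standalone helper. The index name ('movies'), field weights, and page size are illustrative assumptions rather than values confirmed by the example.

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search


def paged_search(es_client, text_query, page, per_page=10):
    # build the text query; 'movies' is a hypothetical index name
    s = Search(using=es_client, index='movies')
    s = s.query('multi_match',
                query=text_query,
                type='cross_fields',
                fields=['title^4', 'text'],
                operator='and')

    # wrap matches in <mark> tags and return each field as a single fragment
    s = s.highlight_options(pre_tags='<mark>', post_tags='</mark>')
    s = s.highlight('title', fragment_size=999999999, number_of_fragments=1)
    s = s.highlight('text', fragment_size=999999999, number_of_fragments=1)

    # slice the Search object so only the current page is fetched
    start = (page - 1) * per_page
    response = s[start:start + per_page].execute()

    results = {}
    for hit in response.hits:
        title = (hit.meta.highlight.title[0]
                 if 'highlight' in hit.meta and 'title' in hit.meta.highlight
                 else hit.title)
        results[hit.meta.id] = {'score': hit.meta.score, 'title': title}
    return results


# Usage sketch (assumes a reachable cluster):
# paged_search(Elasticsearch(), 'space opera', page=2)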
Example #58
0
    def get_bucket_indexes(self,
                           corpusId,
                           bucketsIds: List[str] = [],
                           bucketNames: List[str] = [],
                           docTypes=["default"]):
        """
        Internal method to get indexes to search from

        :param bucketsIds:
        :param bucketNames:
        :param docTypes:        Get indexes for specific docTypes. If empty will use all docTypes.
        :return: {searchIndices: "<all indexes separted by comma>", indexByBucketId : {index1 : bucketId1, index2 : bucketId1 ...}}
        """
        logger = logging.getLogger(__name__)

        es = get_es_conn()
        s = Search(using=es, index=self.bucketBindingIndex)
        terms = {}

        if not corpusId:
            logger.info("Invalid search corpusId: '{0}'".format(corpusId))
            raise InvalidSearchParameterException(
                "Invalid search corpusId: '{0}'".format(corpusId))
        else:
            terms["corpusId"] = corpusId

        # Filter Ids
        if bucketsIds:
            for id in bucketsIds:
                if "*" in id:
                    logger.info("Invalid search bucketIds: {0}".format(id))
                    raise InvalidSearchParameterException(
                        "Invalid search bucketIds: {0}".format(id))

        # find all matching buckets.
        if bucketNames and bucketsIds:
            s.query = Q('bool',
                        must=[Q('term', corpusId=terms["corpusId"])],
                        should=[
                            Q("term", bucketId=bucketsIds),
                            Q("match", name=bucketNames)
                        ],
                        minimum_should_match=1)
        elif bucketNames:
            s.query = Q('bool',
                        must=[Q('term', corpusId=terms["corpusId"])],
                        should=[Q("match", name=bucketNames)],
                        minimum_should_match=1)
        elif bucketsIds:
            s.query = Q('bool',
                        must=[Q('term', corpusId=terms["corpusId"])],
                        should=[Q("term", bucketId=bucketsIds)],
                        minimum_should_match=1)
        else:
            s.query = Q('bool', must=[Q('term', corpusId=terms["corpusId"])])

        bucketInfo = s.execute()
        indexByBucketId = {}
        searchIndices = []

        # TODO: exception for bucket not allowed.
        for info in bucketInfo:
            bucket = self.get_bucket(corpusId, info.meta.id)
            strIndexes = bucket.dd.get_indices(docTypes)
            searchIndices.append(strIndexes)
            indices = strIndexes.replace('*', "").split(',')
            for index in indices:
                indexByBucketId[index] = bucket.id

        return {
            "searchIndices": searchIndices.join(","),
            "indexByBucketId": indexByBucketId
        }
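The bool must/should construction above can be checked without a cluster by printing the query body the Search object would send. A minimal sketch, assuming field names (corpusId, bucketId, name) that mirror the example and a hypothetical index name:

import json

from elasticsearch_dsl import Q, Search

s = Search(index='bucket-bindings')  # hypothetical index name
s.query = Q('bool',
            must=[Q('term', corpusId='corpus-1')],
            should=[Q('terms', bucketId=['b1', 'b2']),
                    Q('match', name='reports')],
            minimum_should_match=1)

# Inspect the JSON body that would be sent to Elasticsearch.
print(json.dumps(s.to_dict(), indent=2))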
Example #59
0
def calc(result):
    #xx = datetime.datetime.utcnow()
    #print 'x: ', xx
    #result['level1']['start'] = datetime.datetime.now().strftime("%B %d %Y, %X")
    #result['level1']['start'] = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.000Z")

    pue = dict()
    # Constants

    #pue['N1'] = 1000 / 1000
    #pue['N2'] = 1000 / 1000
    #pue['N3'] = 710 / 1000
    #pue['N4'] = 1700 / 1000
    ##pue['N6'] = 0
    #pue['N8'] = 500 / 1000
    #pue['N9'] = 1600 / 1000

    pue['N1'] = 1000
    pue['N2'] = 1000
    pue['N3'] = 710
    pue['N4'] = 1700
    ##pue['N6'] = 0
    pue['N8'] = 500
    pue['N9'] = 1600

    result['level1']['start'] = datetime.datetime.utcnow().strftime(
        "%Y-%m-%dT%H:%M:%S")
    result['level2']['start'] = datetime.datetime.utcnow().strftime(
        "%Y-%m-%dT%H:%M:%S")
    for x in i:
        indx = x + '*'
        ##print 'index', indx
        for eskey in i[x]:
            ##timespan = i[x][eskey]
            ##print 'key', eskey
            ##print 'value', i[x][eskey]
            (valueField, scale, variable, source) = i[x][eskey].split('|')
            if variable not in pue:
                pue[variable] = 0
                #print("clears pue")
            k = eskey.split('|')
            s = Search(using=esdb, index=indx)
            for j in k:
                (subkey, subvalue) = j.split(':')
                s = s.query("term", **{subkey: subvalue})
                ##print 'subkey', subkey
                ##print 'subvalue', subvalue
            ##s = s.query('range', **{'@timestamp':{'gte': '2018-07-01T00:00:00.000Z', 'lt':'2018-08-01T00:00:00.000Z'}})
            s = s.query('range',
                        **{'@timestamp': {
                            'gte': 'now-30m',
                            'lt': 'now'
                        }})
            s = s.sort('-@timestamp')
            #s = s.aggs.metric('power_sum', 'sum', field=valueField)
            s = s[0:1]

            #print s.to_dict()
            response = s.execute()

            #print 'Total %d hits found.' % response.hits.total
            if response.hits.total != 0:
                for commit in response:
                    #      print commit.to_dict()
                    pue[variable] += commit['data']['datum'] * float(scale)
            #      ##print commit.to_dict()
            #      for n in k:
            #         (sk, sv) = n.split(':')
            #         if sk.find('.') != -1:
            #            (psk, ssk) = sk.split('.')
            #            ##print 'key: ', psk
            #            ##print 'ha', commit[psk][ssk]
            #         ##else:
            #            ##print 'key: ', sk
            #            ##print 'value: ', sv
            #            ##print 'ha', commit[sk]
            #      v = response.aggregations.power_sum
            #      pue[variable] += ( v['value'] / response.hits.total )
            #      print("Processing %s" % variable)
            else:
                ##print s.to_dict()
                if 'missing' not in result['level1']:
                    result['level1']['missing'] = [variable]
                    result['level2']['missing'] = [variable]
                else:
                    result['level1']['missing'].append(variable)
                    result['level2']['missing'].append(variable)

                if 'missing-meters' not in result['level2']:
                    result['level1']['missing-meters'] = [source]
                    result['level2']['missing-meters'] = [source]
                else:
                    result['level1']['missing-meters'].append(source)
                    result['level2']['missing-meters'].append(source)
                #print 'No Value for: ', variable, ' ', source

    pue['N7'] = pue['N7p'] - pue['N7pp']
    pue['N10pp'] = pue['N10p'] - pue['N10']
    pue['D'] = pue['D1'] + pue['D2']
    pue['E'] = pue['E1'] + pue['E2']
    pue['F'] = pue['F1'] + pue['F2']
    if (pue['B1'] + pue['B2'] + pue['C1'] + pue['C2'] + pue['D1'] + pue['D2'] +
            pue['E1'] + pue['E2'] + pue['F1'] + pue['F2']) == 0:
        lineLoss = 0
    else:
        lineLoss = (pue['A1'] + pue['A2']) / (
            pue['B1'] + pue['B2'] + pue['C1'] + pue['C2'] + pue['D1'] +
            pue['D2'] + pue['E1'] + pue['E2'] + pue['F1'] + pue['F2'])
    if pue['Bp'] == 0:
        txLoss590 = 0
    else:
        txLoss590 = (pue['B1'] + pue['B2']) / pue['Bp']
    if pue['Cp'] == 0:
        txLoss596 = 0
    else:
        txLoss596 = (pue['C1'] + pue['C2']) / pue['Cp']

    #numm1 = ( ( pue['N1'] + pue['N2'] + pue['N3'] + pue['N4'] + pue['N5'] + pue['N6'] + pue['N7'] + pue['N8'] + pue['N9'] - pue['N7p'] + (pue['ND1-1'] + pue['ND1-2'] + pue['ND1-3'] + pue['ND1-4'] + pue['ND1-5'] + pue['ND1-6']) / 1000 ) * txLoss590 + ( pue['Cp'] - pue['N10pp'] - pue['N11pp'] ) * txLoss596 + pue['D'] + pue['E'] + pue['F'] ) * lineLoss
    #demon1 = (pue['ND1-1'] + pue['ND1-2'] + pue['ND1-3'] + pue['ND1-4'] + pue['ND1-5'] + pue['ND1-6']) / 1000 - pue['N7p'] + pue['Dp'] + pue['Ep'] + pue['Fp']

    numm1 = ((pue['N1'] + pue['N2'] + pue['N3'] + pue['N4'] + pue['N5'] +
              pue['N6'] + pue['N7'] + pue['N8'] + pue['N9'] - pue['N7p'] +
              pue['ND1-1'] + pue['ND1-2'] + pue['ND1-3'] + pue['ND1-4'] +
              pue['ND1-5'] + pue['ND1-6']) * txLoss590 +
             (pue['Cp'] - pue['N10pp'] - pue['N11pp']) * txLoss596 + pue['D'] +
             pue['E'] + pue['F']) * lineLoss
    demon1 = (pue['ND1-1'] + pue['ND1-2'] + pue['ND1-3'] + pue['ND1-4'] +
              pue['ND1-5'] +
              pue['ND1-6']) - pue['N7p'] + pue['Dp'] + pue['Ep'] + pue['Fp']

    numm2 = ((pue['N1'] + pue['N2'] + pue['N3'] + pue['N4'] + pue['N5'] +
              pue['N7'] + pue['N6'] + pue['N8'] + pue['N9'] + pue['ND2-1'] +
              pue['ND2-2'] + pue['ND2-3'] + pue['ND2-4'] + pue['ND2-5'] +
              pue['ND2-6'] + pue['ND2-7'] + pue['ND2-8'] + pue['ND2-9'] +
              pue['ND2-10'] + pue['ND2-11'] + pue['ND2-12'] + pue['ND2-13'] +
              pue['ND2-14'] + pue['ND2-15'] + pue['ND2-16'] + pue['ND2-17'] +
              pue['ND2-18']) * txLoss590 +
             (pue['Cp'] - pue['N10pp'] - pue['N11pp']) * txLoss596 + pue['D'] +
             pue['E'] + pue['F']) * lineLoss

    demon2 = pue['ND2-1'] + pue['ND2-2'] + pue['ND2-3'] + pue['ND2-4'] + pue[
        'ND2-5'] + pue['ND2-6'] + pue['ND2-7'] + pue['ND2-8'] + pue[
            'ND2-9'] + pue['ND2-10'] + pue['ND2-11'] + pue['ND2-12'] + pue[
                'ND2-13'] + pue['ND2-14'] + pue['ND2-15'] + pue[
                    'ND2-16'] + pue['ND2-17'] + pue['ND2-18'] + pue[
                        'Dp'] + pue['Ep'] + pue['Fp']

    if demon1 == 0:
        p1 = 0
    else:
        p1 = numm1 / demon1
    if demon2 == 0:
        p2 = 0
    else:
        p2 = numm2 / demon2

    result['level1']['pue'] = p1
    result['level2']['pue'] = p2
    result['level1']['end'] = datetime.datetime.utcnow().strftime(
        "%Y-%m-%dT%H:%M:%S")
    result['level2']['end'] = datetime.datetime.utcnow().strftime(
        "%Y-%m-%dT%H:%M:%S")
Example #60
0
def search_elastic(term='',
                   user=None,
                   sort='id',
                   order='desc',
                   category='0_0',
                   quality_filter='0',
                   page=1,
                   rss=False,
                   admin=False,
                   logged_in_user=None,
                   per_page=75,
                   max_search_results=1000):
    # This function can easily be memcached now

    es_client = Elasticsearch()

    es_sort_keys = {
        'id': 'id',
        'size': 'filesize',
        # 'name': 'display_name',  # This is slow and buggy
        'seeders': 'seed_count',
        'leechers': 'leech_count',
        'downloads': 'download_count'
    }

    sort_ = sort.lower()
    if sort_ not in es_sort_keys:
        flask.abort(400)

    es_sort = es_sort_keys[sort_]

    order_keys = {'desc': 'desc', 'asc': 'asc'}

    order_ = order.lower()
    if order_ not in order_keys:
        flask.abort(400)

    # Only allow ID, desc if RSS
    if rss:
        es_sort = es_sort_keys['id']
        order = 'desc'

    # funky, es sort is default asc, prefixed by '-' if desc
    if 'desc' == order:
        es_sort = '-' + es_sort

    # Quality filter
    quality_keys = [
        '0',  # Show all
        '1',  # No remakes
        '2',  # Only trusted
        '3'  # Only completed
    ]

    if quality_filter.lower() not in quality_keys:
        flask.abort(400)

    quality_filter = int(quality_filter)

    # Category filter
    main_category = None
    sub_category = None
    main_cat_id = 0
    sub_cat_id = 0
    if category:
        cat_match = re.match(r'^(\d+)_(\d+)$', category)
        if not cat_match:
            flask.abort(400)

        main_cat_id = int(cat_match.group(1))
        sub_cat_id = int(cat_match.group(2))

        if main_cat_id > 0:
            if sub_cat_id > 0:
                sub_category = models.SubCategory.by_category_ids(
                    main_cat_id, sub_cat_id)
                if not sub_category:
                    flask.abort(400)
            else:
                main_category = models.MainCategory.by_id(main_cat_id)
                if not main_category:
                    flask.abort(400)

    # This might be useless since we validate users
    # before coming into this method, but just to be safe...
    if user:
        user = models.User.by_id(user)
        if not user:
            flask.abort(404)
        user = user.id

    same_user = False
    if logged_in_user:
        same_user = user == logged_in_user.id

    s = Search(using=es_client,
               index=app.config.get('ES_INDEX_NAME'))  # todo, sukebei prefix

    # Apply search term
    if term:
        s = s.query('simple_query_string',
                    analyzer='my_search_analyzer',
                    default_operator="AND",
                    query=term)

    # User view (/user/username)
    if user:
        s = s.filter('term', uploader_id=user)

        if not admin:
            # Hide all DELETED torrents if regular user
            s = s.filter('term', deleted=False)
            # If logged in user is not the same as the user being viewed,
            # show only torrents that aren't hidden or anonymous.
            #
            # If logged in user is the same as the user being viewed,
            # show all torrents including hidden and anonymous ones.
            #
            # On RSS pages in user view, show only torrents that
            # aren't hidden or anonymous no matter what
            if not same_user or rss:
                s = s.filter('term', hidden=False)
                s = s.filter('term', anonymous=False)
    # General view (homepage, general search view)
    else:
        if not admin:
            # Hide all DELETED torrents if regular user
            s = s.filter('term', deleted=False)
            # If logged in, show all torrents that aren't hidden unless they belong to you
            # On RSS pages, show all public torrents and nothing more.
            if logged_in_user and not rss:
                hiddenFilter = Q('term', hidden=False)
                userFilter = Q('term', uploader_id=logged_in_user.id)
                combinedFilter = hiddenFilter | userFilter
                s = s.filter('bool', filter=[combinedFilter])
            else:
                s = s.filter('term', hidden=False)

    if main_category:
        s = s.filter('term', main_category_id=main_cat_id)
    elif sub_category:
        s = s.filter('term', main_category_id=main_cat_id)
        s = s.filter('term', sub_category_id=sub_cat_id)

    if quality_filter == 0:
        pass
    elif quality_filter == 1:
        s = s.filter('term', remake=False)
    elif quality_filter == 2:
        s = s.filter('term', trusted=True)
    elif quality_filter == 3:
        s = s.filter('term', complete=True)

    # Apply sort
    s = s.sort(es_sort)

    # Only show first RESULTS_PER_PAGE items for RSS
    if rss:
        s = s[0:per_page]
    else:
        max_page = min(page,
                       int(math.ceil(max_search_results / float(per_page))))
        from_idx = (max_page - 1) * per_page
        to_idx = min(max_search_results, max_page * per_page)
        s = s[from_idx:to_idx]

    highlight = app.config.get('ENABLE_ELASTIC_SEARCH_HIGHLIGHT')
    if highlight:
        s = s.highlight_options(tags_schema='styled')
        s = s.highlight("display_name")

    # Return query, uncomment print line to debug query
    # from pprint import pprint
    # print(json.dumps(s.to_dict()))
    return s.execute()
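Two ideas from the function above generalize well: combining filters so a logged-in user sees public results plus their own hidden ones, and clamping pagination to a hard result cap. A minimal sketch under those assumptions (index and field names are illustrative):

import math

from elasticsearch_dsl import Q, Search


def build_listing(uploader_id, page, per_page=75, max_results=1000):
    s = Search(index='torrents')  # hypothetical index name

    # visible if public (hidden=False) OR uploaded by the logged-in user
    visible = Q('term', hidden=False) | Q('term', uploader_id=uploader_id)
    s = s.filter('bool', filter=[visible])
    s = s.filter('term', deleted=False)

    # never page past max_results, mirroring the clamp used above
    max_page = min(page, int(math.ceil(max_results / float(per_page))))
    from_idx = (max_page - 1) * per_page
    to_idx = min(max_results, max_page * per_page)
    return s[from_idx:to_idx]


# Usage sketch: inspect the generated body without touching a cluster.
# print(build_listing(uploader_id=42, page=3).to_dict())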