def document_field(field):
    """
    The default ``field_factory`` method for converting Django field
    instances to ``elasticsearch_dsl.Field`` instances.

    Auto-created fields (primary keys, for example) and one-to-many fields
    (reverse FK relationships) are skipped.
    """
    if field.auto_created or field.one_to_many:
        return None
    if field.many_to_many:
        return RawMultiString
    defaults = {
        models.DateField: dsl.Date(),
        models.DateTimeField: dsl.Date(),
        models.IntegerField: dsl.Long(),
        models.PositiveIntegerField: dsl.Long(),
        models.BooleanField: dsl.Boolean(),
        models.NullBooleanField: dsl.Boolean(),
        # models.SlugField: dsl.String(index='not_analyzed'),
        models.SlugField: dsl.Text(index='not_analyzed'),
        models.DecimalField: dsl.Double(),
        models.FloatField: dsl.Float(),
    }
    return defaults.get(field.__class__, RawString)
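# A minimal usage sketch for the factory above. The `Article` model is an
# assumption for illustration; `document_field`, `RawString`, and
# `RawMultiString` come from the surrounding module.
from django.db import models

class Article(models.Model):
    title = models.SlugField()
    published = models.DateTimeField()
    views = models.PositiveIntegerField()

mapping = {}
for field in Article._meta.get_fields():
    es_field = document_field(field)
    if es_field is not None:  # auto-created / reverse FK fields are skipped
        mapping[field.name] = es_field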
class TrainingJob(elasticsearch_dsl.Document):
    id = elasticsearch_dsl.Integer()
    schema_version = elasticsearch_dsl.Integer()
    job_name = elasticsearch_dsl.Keyword()
    author = elasticsearch_dsl.Keyword()
    created_at = elasticsearch_dsl.Date()
    ended_at = elasticsearch_dsl.Date()
    params = elasticsearch_dsl.Text()
    raw_log = elasticsearch_dsl.Text()
    model_url = elasticsearch_dsl.Text()

    # Metrics
    epochs = elasticsearch_dsl.Integer()
    train_acc = elasticsearch_dsl.Float()
    final_val_acc = elasticsearch_dsl.Float()
    best_val_acc = elasticsearch_dsl.Float()
    final_val_loss = elasticsearch_dsl.Float()
    best_val_loss = elasticsearch_dsl.Float()
    final_val_sensitivity = elasticsearch_dsl.Float()
    best_val_sensitivity = elasticsearch_dsl.Float()
    final_val_specificity = elasticsearch_dsl.Float()
    best_val_specificity = elasticsearch_dsl.Float()
    final_val_auc = elasticsearch_dsl.Float()
    best_val_auc = elasticsearch_dsl.Float()

    # Params
    batch_size = elasticsearch_dsl.Integer()
    val_split = elasticsearch_dsl.Float()
    seed = elasticsearch_dsl.Integer()
    rotation_range = elasticsearch_dsl.Float()
    width_shift_range = elasticsearch_dsl.Float()
    height_shift_range = elasticsearch_dsl.Float()
    shear_range = elasticsearch_dsl.Float()
    zoom_range = elasticsearch_dsl.Keyword()
    horizontal_flip = elasticsearch_dsl.Boolean()
    vertical_flip = elasticsearch_dsl.Boolean()
    dropout_rate1 = elasticsearch_dsl.Float()
    dropout_rate2 = elasticsearch_dsl.Float()
    data_dir = elasticsearch_dsl.Keyword()
    gcs_url = elasticsearch_dsl.Keyword()
    mip_thickness = elasticsearch_dsl.Integer()
    height_offset = elasticsearch_dsl.Integer()
    pixel_value_range = elasticsearch_dsl.Keyword()

    # We need to keep a list of params for the parser because
    # we can't use traditional approaches to get the class attrs.
    params_to_parse = [
        'batch_size', 'val_split', 'seed', 'rotation_range',
        'width_shift_range', 'height_shift_range', 'shear_range',
        'zoom_range', 'horizontal_flip', 'vertical_flip',
        'dropout_rate1', 'dropout_rate2', 'data_dir', 'gcs_url',
        'mip_thickness', 'height_offset', 'pixel_value_range',
    ]

    class Index:
        name = TRAINING_JOBS
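# Why params_to_parse exists: with recent elasticsearch_dsl versions the
# Document metaclass pops field declarations off the class and folds them
# into the mapping, so they are no longer plain class attributes. A quick
# illustration (``_doc_type`` is private API, shown here only to make the
# point):
hasattr(TrainingJob, 'batch_size')        # False -- moved into the mapping
TrainingJob._doc_type.mapping.to_dict()   # {'properties': {'batch_size': ...}}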
def decorator(cls):
    print("setup_schema: " + cls.__name__.lower())
    #
    # Create an Elastic model from the schema.
    #
    # There are two special keys you can use in addition to the standard
    # cerberus syntax:
    #   "elastic":     any Elastic DSL field __init__ kwargs; they are
    #                  handed raw to the field's __init__
    #   "elastictype": a more specific elasticsearch_dsl type definition
    #                  (e.g. Text instead of string)
    # Both special keys are removed from the schema at the end of this
    # decorator.
    #
    # Now set the right Elastic type for each field in the doc.
    import elasticsearch_dsl

    for elem in cls.schema.keys():
        # the raw field __init__ parameters dict
        elastic = cls.schema[elem].get("elastic", {})
        if cls.schema[elem]["type"] == "integer":
            setattr(cls, elem, elasticsearch_dsl.Integer(**elastic))
        elif cls.schema[elem]["type"] == "float":
            setattr(cls, elem, elasticsearch_dsl.Float(**elastic))
        elif cls.schema[elem]["type"] == "string":
            setattr(cls, elem, elasticsearch_dsl.Text(**elastic))
        elif cls.schema[elem]["type"] == "bool":
            setattr(cls, elem, elasticsearch_dsl.Boolean(**elastic))
        elif cls.schema[elem]["type"] == "date":
            setattr(cls, elem, elasticsearch_dsl.Date(**elastic))
        elif cls.schema[elem]["type"] == "datetime":
            setattr(cls, elem, elasticsearch_dsl.Date(**elastic))
        elif cls.schema[elem]["type"] == "number":
            setattr(cls, elem, elasticsearch_dsl.Integer(**elastic))
        elif cls.schema[elem]["type"] == "binary":
            setattr(cls, elem, elasticsearch_dsl.Byte(**elastic))
        elif cls.schema[elem]["type"] == "list":
            setattr(cls, elem, elasticsearch_dsl.Keyword(**elastic))
        else:
            raise Exception("Wrong datatype in schema: " + cls.schema[elem]["type"])
        # remove the (raw) elastic key(s) from the schema
        cls.schema[elem].pop("elastic", None)
        cls.schema[elem].pop("elastictype", None)

    return cls
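# A hedged usage sketch for the decorator above: the class and its
# cerberus-style schema are illustrative, not from the original source.
class JobModel:
    schema = {
        "title": {"type": "string", "elastic": {"analyzer": "standard"}},
        "retries": {"type": "integer"},
        "submitted": {"type": "datetime"},
    }

JobModel = decorator(JobModel)
# JobModel.title is now a Text field, JobModel.retries an Integer, and
# JobModel.submitted a Date; the "elastic" keys are gone from JobModel.schema.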
class BaseDocument(indices.BaseDocument):
    """Base search document."""

    id = dsl.Integer()  # pylint: disable=invalid-name
    slug = Slug()
    version = dsl.Keyword()
    name = Name()
    created = dsl.Date()
    modified = dsl.Date()
    contributor_id = dsl.Integer()
    contributor_name = User()
    # We use a separate field for contributor sorting because we use an
    # entirely different value for it (the display name).
    contributor_sort = dsl.Keyword()
    owner_ids = dsl.Integer(multi=True)
    owner_names = User(multi=True)
def test_aggregate_data_schema():
    """Verify the behaviour of the ingress.utils.aggregate_data_schema function."""
    class Base:  # noqa
        data_schema = {}

    class Sub1(Base):  # noqa
        data_schema = {'Sub1': True}

    class Sub2(Base):  # noqa
        data_schema = {'Sub2': True}

    class SubSub1(Sub2):  # noqa
        data_schema = {'SubSub1': True}

    class SubSub2(Sub2):  # noqa
        data_schema = {'SubSub2': True}

    aggregated_schema = iu.aggregate_data_schema(Base, include_defaults=True)
    assert aggregated_schema == {
        'Sub1': True,
        'Sub2': True,
        'SubSub1': True,
        'SubSub2': True,
        '_raw': es.Object(dynamic=True),
        'timestamp': es.Date(),
    }
class Node(es.DocType):
    """Elastic document describing a node."""

    node_type = es.Keyword()
    objectID = es.Keyword()
    name = es.Text(
        fielddata=True,
        analyzer=autocomplete,
    )
    user = es.Object(
        fields={
            'id': es.Keyword(),
            'name': es.Text(
                fielddata=True,
                analyzer=autocomplete),
        }
    )
    description = es.Text()
    is_free = es.Boolean()
    project = es.Object(
        fields={
            'id': es.Keyword(),
            'name': es.Keyword(),
            'url': es.Keyword(),
        }
    )
    media = es.Keyword()
    picture = es.Keyword()
    tags = es.Keyword(multi=True)
    license_notes = es.Text()
    created_at = es.Date()
    updated_at = es.Date()

    class Meta:
        index = 'nodes'
class EmbeddingIndex(es.Document):
    corpus = es.Keyword()
    number_of_documents = es.Integer()
    is_ready = es.Boolean()
    name = es.Keyword()
    description = es.Text()
    datetime_created = es.Date()
    datetime_finished = es.Date()

    by_unit = es.Keyword()  # Token/Word/Sentence/Text
    algorithm = es.Keyword()
    pooling = es.Keyword()
    meta_parameters = es.Object()

    class Index:
        name = ES_INDEX_EMBEDDING
        using = ES_CLIENT
def doc_field(type):
    defaults = {
        'date': dsl.Date(),
        'integer': dsl.Long(),
        'boolean': dsl.Boolean(),
        'double': dsl.Double(),
        'float': dsl.Float(),
    }
    return defaults.get(type, RawString)
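# For example (RawString is the module's fallback field, as in the source):
assert isinstance(doc_field('integer'), dsl.Long)
unknown = doc_field('geo_shape')  # falls back to RawString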
class DataDocument(BaseDocument):
    """Document for data search."""

    started = dsl.Date()
    finished = dsl.Date()
    status = dsl.Keyword()
    process = dsl.Integer()
    process_type = ProcessType()
    # Keep backward compatibility.
    type = ProcessType()
    process_name = Name()
    tags = dsl.Keyword(multi=True)
    collection = dsl.Integer()
    entity = dsl.Integer()

    class Index:
        """Meta class for data search document."""

        name = 'data'
class GroupDocument(esd.DocType):
    date = esd.Date()
    aperture = esd.Float()
    exposure = esd.Float()
    focal_length = esd.Float()
    focal_length_35 = esd.Float()
    iso = esd.Integer()
    model = esd.String(index='not_analyzed')
    # analyzer=esd.analyzer('keyword', tokenizer="keyword", filter=['lowercase', ])
    lens = esd.String(index='not_analyzed')
    path = esd.String(index='not_analyzed')
    dirname = esd.String(index='not_analyzed')
    basename = esd.String(index='not_analyzed')
class Dictionary(es.Document):
    corpus = es.Keyword()
    name = es.Keyword()
    description = es.Text()
    datetime = es.Date()
    number_of_documents = es.Integer()
    is_ready = es.Boolean()

    class Index:
        name = ES_INDEX_DICTIONARY_INDEX
        using = ES_CLIENT
class TopicDocument(es.Document):
    topic_id = es.Keyword()
    topic_weight = es.Float()
    document_es_id = es.Keyword()
    datetime = es.Date()
    document_source = es.Keyword()
    document_corpus = es.Keyword()
    document_num_views = es.Integer()
    document_num_comments = es.Integer()

    class Index:
        name = ES_INDEX_TOPIC_DOCUMENT  # f"{ES_INDEX_TOPIC_DOCUMENT}_{tm}"
        using = ES_CLIENT

        settings = {
            "number_of_shards": 3,
            "number_of_replicas": 1,
            "max_result_window": 5000000,
        }
        settings_dynamic = {
            "number_of_shards": 2,
            "number_of_replicas": 1,
            "max_result_window": 5000000,
        }
        mappings = {
            "properties": {
                "datetime": {"type": "date"},
                "document_es_id": {"type": "keyword"},
                "document_source": {"type": "keyword"},
                "document_corpus": {"type": "keyword"},
                "document_num_views": {"type": "long"},
                "document_num_comments": {"type": "long"},
                "topic_id": {"type": "keyword"},
                "topic_weight": {"type": "float"},
            }
        }
class DataDocument(BaseDocument):
    """Document for data search."""

    started = dsl.Date()
    finished = dsl.Date()
    status = dsl.Keyword()
    process = dsl.Integer()
    process_type = ProcessType()
    # Keep backward compatibility.
    type = ProcessType()  # pylint: disable=invalid-name
    process_name = Name()
    tags = dsl.Keyword(multi=True)
    collection = dsl.Integer(multi=True)
    parents = dsl.Integer(multi=True)
    children = dsl.Integer(multi=True)
    entity = dsl.Integer(multi=True)

    class Meta:
        """Meta class for data search document."""

        index = 'data'
class TestSearchDocument(BaseDocument):
    id = dsl.Integer()
    name = dsl.Text(fielddata=True)
    num = dsl.Integer()
    date = dsl.Date()
    json = dsl.Object()

    field_name = Name()
    field_process_type = ProcessType()
    none_test = dsl.Integer()

    class Index:
        name = "test_search"
class DataDocType(es.DocType):
    """Elasticsearch test model"""

    first_name = es.Keyword()
    last_name = es.Keyword()
    city = es.Text()
    skills = es.Keyword()
    birthday = es.Date()
    is_active = es.Boolean()
    score = es.Integer()
    description = es.Text()

    class Meta:
        index = 'test'
class Mappings(es.Document):
    threshold = es.Keyword()
    meta_dtm_name = es.Keyword()
    topic_modelling_first = es.Keyword()
    topic_modelling_second = es.Keyword()
    topic_modelling_first_from = es.Date()
    topic_modelling_second_to = es.Date()
    mappings_dict = es.Text()
    scores_list = es.Keyword()
    delta_words_dict = es.Text()
    delta_count_dict = es.Text()

    class Index:
        name = ES_INDEX_MAPPINGS
        using = ES_CLIENT

        settings = {
            "index.mapping.total_fields.limit": 5000,
            "number_of_shards": 1,
            "number_of_replicas": 1,
        }
        mappings = {
            "properties": {
                "threshold": {"type": "keyword"},
                "meta_dtm_name": {"type": "keyword"},
                "topic_modelling_first_from": {"type": "date"},
                "topic_modelling_second_to": {"type": "date"},
            },
        }
class META_DTM(es.Document):
    meta_name = es.Keyword()
    volume_days = es.Float()
    delta_days = es.Float()
    reset_index = es.Boolean()
    from_date = es.Date()
    to_date = es.Date()

    class Index:
        name = ES_INDEX_META_DTM
        using = ES_CLIENT

        settings = {
            "number_of_shards": 1,
            "number_of_replicas": 1,
        }
        mappings = {
            "properties": {
                "meta_name": {"type": "keyword"},
                "volume_days": {"type": "float"},
                "delta_days": {"type": "float"},
                "reset_index": {"type": "boolean"},
                "from_date": {"type": "date"},
                "to_date": {"type": "date"},
            },
        }
class TestSearchDocument(BaseDocument):  # pylint: disable=no-member
    id = dsl.Integer()  # pylint: disable=invalid-name
    name = dsl.Text(fielddata=True)
    num = dsl.Integer()
    date = dsl.Date()
    json = dsl.Object()

    field_name = Name()
    field_process_type = ProcessType()
    none_test = dsl.Integer()

    class Meta:
        index = 'test_search'
class TopicModellingIndex(es.Document):
    corpus = es.Keyword()
    source = es.Keyword()
    number_of_documents = es.Integer()
    is_ready = es.Boolean()
    has_topic_info = es.Boolean()
    name = es.Keyword()
    description = es.Text()
    datetime_created = es.Date()
    datetime_finished = es.Date()

    datetime_from = es.Date()
    datetime_to = es.Date()

    algorithm = es.Keyword()
    number_of_topics = es.Integer()
    hierarchical = es.Boolean()
    meta_parameters = es.Object()

    perplexity = es.Float()
    purity = es.Float()
    contrast = es.Float()
    coherence = es.Float()

    tau_smooth_sparse_theta = es.Float()
    tau_smooth_sparse_phi = es.Float()
    tau_decorrelator_phi = es.Float()
    tau_coherence_phi = es.Float()

    topics = es.Nested(Topic)

    is_actualizable = es.Boolean()

    class Index:
        name = ES_INDEX_TOPIC_MODELLING
        using = ES_CLIENT
class DataDocType(es.Document):
    """Elasticsearch test model"""

    first_name = es.Keyword()
    last_name = es.Keyword()
    city = es.Text()
    skills = es.Keyword()
    birthday = es.Date()
    is_active = es.Boolean()
    score = es.Integer()
    location = es.GeoPoint()
    description = es.Text()

    class Index:
        name = 'test'
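# A minimal usage sketch for the test model above, assuming a reachable
# cluster; the connection setup and sample values are illustrative, not
# part of the original snippet.
from elasticsearch_dsl import connections

connections.create_connection(hosts=['localhost:9200'])

DataDocType.init()  # create the index and mapping

doc = DataDocType(first_name='Ada', last_name='Lovelace',
                  location={'lat': 51.5, 'lon': -0.1})
doc.save()

hits = DataDocType.search().query('match', first_name='Ada').execute()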
class AWSIdNameMapping(dsl.DocType):
    class Meta:
        index = 'awsidnamemapping'

    key = dsl.String(index='not_analyzed')
    rid = dsl.String(index='not_analyzed')
    name = dsl.String(index='not_analyzed')
    date = dsl.Date(format='date_optional_time||epoch_millis')

    @classmethod
    def get_id_name_mapping(cls, key):
        s = cls.search()
        s = s.query('match', key=key).sort('-date')
        res = {}
        for hit in s.scan():
            if hit.rid not in res:
                res[hit.rid] = hit.name
        return res
class InfoRiegoRecord(dsl.DocType):
    code = dsl.String()
    location = dsl.String()
    date = dsl.Date()
    rain = dsl.Float()
    temperature = dsl.Float()
    rel_humidity = dsl.Float()
    radiation = dsl.Float()
    wind_speed = dsl.Float()
    wind_direction = dsl.Float()
    lat_lon = dsl.GeoPoint(lat_lon=True)
    station_height = dsl.Integer()

    def save(self, **kwargs):
        return super(InfoRiegoRecord, self).save(**kwargs)

    class Meta:
        index = 'inforiego'
class DocumentLocation(es.Document):
    document_es_id = es.Keyword()
    document_datetime = es.Date()
    document_source = es.Keyword()
    location_name = es.Keyword()
    location_level = es.Keyword()
    location_weight = es.Float()
    location_id = es.Keyword()

    class Index:
        name = ES_INDEX_DOCUMENT_LOCATION  # !!! f"{ES_INDEX_DOCUMENT_EVAL}_{tm}_{criterion.id}"
        using = ES_CLIENT

        settings = {
            "number_of_shards": 3,
            "number_of_replicas": 1,
            "max_result_window": 5000000,
        }
        mappings = {
            "properties": {
                "document_datetime": {"type": "date"},
                "document_es_id": {"type": "keyword"},
                "document_source": {"type": "keyword"},
                "location_level": {"type": "keyword"},
                "location_name": {"type": "keyword"},
                "location_weight": {"type": "float"},
                "location_id": {"type": "keyword"},
            }
        }
class PhotoDocument(esd.DocType):
    date = esd.Date()
    aperture = esd.Float()
    exposure = esd.Float()
    focal_length = esd.Float()
    focal_length_35 = esd.Float()
    iso = esd.Integer()
    size = esd.Integer()
    model = esd.String(index='not_analyzed')
    # analyzer=esd.analyzer('keyword', tokenizer="keyword", filter=['lowercase', ])
    model_ci = esd.String(analyzer=esd.analyzer(
        'keyword', tokenizer="keyword", filter=['lowercase', ]))
    lens = esd.String(index='not_analyzed')
    lens_ci = esd.String(analyzer=esd.analyzer(
        'keyword', tokenizer="keyword", filter=['lowercase', ]))
    path = esd.String(index='not_analyzed')
    dirname = esd.String(index='not_analyzed')
    basename = esd.String(index='not_analyzed')

    def extended_dict(self):
        dct = self.to_dict()
        dct["id"] = self.meta.id
        return dct
def aggregate_data_schema(
    base_class: Type,
    include_defaults: bool = True,
) -> Dict[str, Any]:
    """Iterate through imported plugins and create an ingress mapping to process the data with."""
    mapping: Dict = {}
    for subclass in find_subclasses(base_class):
        subclass_data_schema = None
        try:
            subclass_data_schema = getattr(subclass, 'data_schema')
        except AttributeError:
            continue

        if subclass_data_schema:
            mapping.update(subclass_data_schema)

    if include_defaults:
        mapping['_raw'] = es.Object(dynamic=True)
        mapping['timestamp'] = es.Date()

    return mapping
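# The helper `find_subclasses` is referenced but not shown. A minimal sketch,
# under the assumption that it walks the subclass tree recursively (the test
# above requires that indirect subclasses like SubSub1 are visited too):
from typing import Iterator, Type

def find_subclasses(base_class: Type) -> Iterator[Type]:
    """Yield every direct and indirect subclass of ``base_class``."""
    for subclass in base_class.__subclasses__():
        yield subclass
        yield from find_subclasses(subclass)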
class DocumentEval(es.Document):
    value = es.Float()
    document_es_id = es.Keyword()
    document_datetime = es.Date()
    document_source = es.Keyword()
    topic_ids_top = es.Keyword()
    topic_ids_bottom = es.Keyword()

    class Index:
        name = ES_INDEX_DOCUMENT_EVAL  # !!! f"{ES_INDEX_DOCUMENT_EVAL}_{tm}_{criterion.id}{_neg}{_m4a}{_m4a_class}"
        using = ES_CLIENT

        settings = {
            "number_of_shards": 3,
            "number_of_replicas": 1,
            "max_result_window": 5000000,
        }
        mappings = {
            "properties": {
                "document_datetime": {"type": "date"},
                "document_es_id": {"type": "keyword"},
                "document_source": {"type": "keyword"},
                "value": {"type": "float"},
                "topic_ids_top": {"type": "keyword"},
                "topic_ids_bottom": {"type": "keyword"},
            }
        }
class Job(es.DocType):
    class Meta:
        index = 'jobs'
        doc_type = 'job-offer'

    french_elision = es.token_filter(
        'french_elision',
        type='elision',
        articles_case=True,
        articles=[
            'l', 'm', 't', 'qu', 'n', 's', 'j', 'd', 'c',
            'jusqu', 'quoiqu', 'lorsqu', 'puisqu',
        ])

    french_stopwords = es.token_filter(
        'french_stopwords', type='stop', stopwords='_french_')

    # Do not include this filter if keywords is empty
    french_keywords = es.token_filter(
        'french_keywords', type='keyword_marker', keywords=[])

    french_stemmer = es.token_filter(
        'french_stemmer', type='stemmer', language='light_french')

    french_analyzer = es.analyzer(
        'french_analyzer',
        tokenizer='standard',
        filter=[
            'lowercase',
            'asciifolding',
            french_elision,
            french_stopwords,
            # french_keywords,
            french_stemmer,
        ],
        char_filter=['html_strip'])

    technologies_tokenizer = es.tokenizer(
        'comma_tokenizer', type='pattern', pattern=' |,|, ')

    technologies_synonyms_filter = es.token_filter(
        'technologies_synonyms',
        type='synonym',
        synonyms=[
            'c => c_language',
            'c++, cpp => cpp_language',
            'c/c++, c/cpp => c_language',
            'c/c++, c/cpp => cpp_language',
            'c#, c♯, csharp => csharp_language',
            'f#, f♯, fsharp => fsharp_language',
            'c#, c♯, csharp => dotnet',
            'f#, f♯, fsharp => dotnet',
            '.net => dotnet',
        ])

    technologies_analyzer = es.analyzer(
        'technologies_analyzer',
        tokenizer=technologies_tokenizer,
        filter=['lowercase', 'asciifolding', technologies_synonyms_filter])

    company_name_analyzer = es.analyzer(
        'company_name_analyzer',
        tokenizer='standard',
        filter=['lowercase', 'asciifolding'])

    id = es.Integer()
    url = es.String(index='no')
    source = es.String(index='not_analyzed')
    title = es.String(
        analyzer=french_analyzer,
        fields={'technologies': es.String(analyzer=technologies_analyzer)})
    description = es.String(
        analyzer=french_analyzer,
        fields={'technologies': es.String(analyzer=technologies_analyzer)})
    company = es.String(analyzer=company_name_analyzer)
    company_url = es.String(index='no')
    address = es.String(analyzer=french_analyzer)
    address_is_valid = es.Boolean()
    tags = es.Nested(
        doc_class=Tag,
        properties=dict(tag=es.String(index='not_analyzed'),
                        weight=es.Integer()))
    publication_datetime = es.Date()
    publication_datetime_is_fake = es.Boolean()
    crawl_datetime = es.Date()
    geolocation = es.GeoPoint()
    geolocation_is_valid = es.Boolean()

    def __init__(self, meta=None, **kwargs):
        super(Job, self).__init__(meta, **kwargs)
        self._doc_type.index = compute_index_name(self.index)

    @property
    def index(self):
        return self._doc_type.index

    @property
    def doc_type(self):
        return self._doc_type.name

    @property
    def published(self):
        return format_date(self.publication_datetime, locale='FR_fr')

    @property
    def published_in_days(self):
        delta = datetime.now() - self.publication_datetime  # TODO: bugfix
        return format_timedelta(delta, granularity='day', locale='en_US')

    @property
    def alltags(self):
        tags = []
        if self.tags:
            for tag in self.tags:
                if tag['tag'] not in condition_tags:
                    tags.append(Tag2(tag['tag'], tag['weight']))
        return tags

    @property
    def condition_tags(self):
        tags = []
        if self.tags:
            for tag in self.tags:
                if tag['tag'] in condition_tags:
                    tag = Tag2(tag['tag'], tag['weight'],
                               Tag2.get_css(tag['tag']))
                    tags.append(tag)
        return tags
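# A hedged usage sketch: creating the index applies the custom analyzers
# declared above, after which the synonym-normalized subfield matches
# "c++" and "cpp" the same way. The connection setup is an assumption,
# not part of the original snippet.
from elasticsearch_dsl import connections

connections.create_connection(hosts=['localhost:9200'])
Job.init()

s = Job.search().query('match', title__technologies='c++')
for hit in s.execute():
    print(hit.title)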
class ResponseDocType(FjordDocType):
    id = es_dsl.Integer()
    happy = es_dsl.Boolean()
    api = es_dsl.Integer()
    url = es_dsl.String(index='not_analyzed')
    url_domain = es_dsl.String(index='not_analyzed')
    has_email = es_dsl.Boolean()
    description = es_dsl.String(analyzer='snowball')
    category = es_dsl.String(index='not_analyzed')
    description_bigrams = es_dsl.String(index='not_analyzed')
    description_terms = es_dsl.String(analyzer='standard')
    user_agent = es_dsl.String(index='not_analyzed')
    product = es_dsl.String(index='not_analyzed')
    channel = es_dsl.String(index='not_analyzed')
    version = es_dsl.String(index='not_analyzed')
    browser = es_dsl.String(index='not_analyzed')
    browser_version = es_dsl.String(index='not_analyzed')
    platform = es_dsl.String(index='not_analyzed')
    locale = es_dsl.String(index='not_analyzed')
    country = es_dsl.String(index='not_analyzed')
    device = es_dsl.String(index='not_analyzed')
    manufacturer = es_dsl.String(index='not_analyzed')
    source = es_dsl.String(index='not_analyzed')
    campaign = es_dsl.String(index='not_analyzed')
    source_campaign = es_dsl.String(index='not_analyzed')
    organic = es_dsl.Boolean()
    created = es_dsl.Date()

    docs = ResponseDocTypeManager()

    class Meta:
        pass

    def mlt(self):
        """Returns a search with a morelikethis query for docs like this"""
        # Short responses tend to not repeat any words, so then MLT
        # returns nothing. This fixes that by setting min_term_freq to
        # 1. Longer responses tend to repeat important words, so we can
        # set min_term_freq to 2.
        num_words = len(self.description.split(' '))
        if num_words > 40:
            min_term_freq = 2
        else:
            min_term_freq = 1

        s = self.search()
        if self.product:
            s = s.filter('term', product=self.product)
        if self.platform:
            s = s.filter('term', platform=self.platform)

        s = s.query('more_like_this',
                    fields=['description'],
                    docs=[{
                        '_index': get_index_name(),
                        '_type': self._doc_type.name,
                        '_id': self.id
                    }],
                    min_term_freq=min_term_freq,
                    stop_words=list(ANALYSIS_STOPWORDS))
        return s

    @classmethod
    def get_model(cls):
        return Response

    @classmethod
    def public_fields(cls):
        """Fields that can be publicly-visible

        .. Note:: Do NOT include fields that have PII in them.

        """
        return (
            'id', 'happy', 'api', 'url_domain', 'has_email', 'description',
            'category', 'description_bigrams', 'user_agent', 'product',
            'version', 'platform', 'locale', 'source', 'campaign', 'organic',
            'created')

    @property
    def truncated_description(self):
        """Shorten feedback for dashboard view."""
        return smart_truncate(self.description, length=500)

    @classmethod
    def extract_doc(cls, resp, with_id=True):
        """Converts a Response to a dict of values

        This can be used with ``ResponseDocType.from_obj()`` to create a
        ``ResponseDocType`` object or it can be used for indexing.

        :arg resp: a Response object
        :arg with_id: whether or not to include the ``_id`` value--include
            it when you're bulk indexing

        :returns: a dict

        """
        doc = {
            'id': resp.id,
            'happy': resp.happy,
            'api': resp.api,
            'url': resp.url,
            'url_domain': resp.url_domain,
            'has_email': bool(resp.user_email),
            'description': resp.description,
            'user_agent': resp.user_agent,
            'product': resp.product,
            'channel': resp.channel,
            'version': resp.version,
            'browser': resp.browser,
            'browser_version': resp.browser_version,
            'platform': resp.platform,
            'locale': resp.locale,
            'country': resp.country,
            'device': resp.device,
            'manufacturer': resp.manufacturer,
            'source': resp.source,
            'campaign': resp.campaign,
            'source_campaign': '::'.join([
                (resp.source or '--'),
                (resp.campaign or '--')
            ]),
            'organic': (not resp.campaign),
            'created': resp.created,
        }

        # We only compute bigrams for english because the analysis
        # uses English stopwords, stemmers, ...
        if resp.locale.startswith(u'en') and resp.description:
            doc['description_bigrams'] = compute_grams(resp.description)
        else:
            doc['description_bigrams'] = []

        if with_id:
            doc['_id'] = doc['id']
        return doc
class AWSDetailedLineitem(dsl.DocType):
    class Meta:
        index = 'awsdetailedlineitem'

    availability_zone = dsl.String(index='not_analyzed')
    cost = dsl.Double()
    un_blended_cost = dsl.Double()
    item_description = dsl.String(index='not_analyzed')
    linked_account_id = dsl.String(index='not_analyzed')
    operation = dsl.String()
    payer_account_id = dsl.String(index='not_analyzed')
    pricing_plan_id = dsl.Long()
    product_name = dsl.String(index='not_analyzed')
    rate = dsl.Double()
    un_blended_rate = dsl.Double()
    rate_id = dsl.Long()
    record_id = dsl.String(index='not_analyzed')
    reserved_instance = dsl.Boolean()
    resource_id = dsl.String(index='not_analyzed')
    subscription_id = dsl.Long()
    tag = dsl.Object(
        properties={
            'key': dsl.String(index='not_analyzed'),
            'value': dsl.String(index='not_analyzed'),
        })
    usage_end_date = dsl.Date(format='strict_date_optional_time||epoch_millis')
    usage_quantity = dsl.Double()
    usage_start_date = dsl.Date(format='strict_date_optional_time||epoch_millis')
    usage_type = dsl.String(index='not_analyzed')

    @classmethod
    @with_cache(ttl=3600 * 3, worker_refresh=True)
    def keys_has_data(cls, keys, date_from=None, date_to=None):
        date_to = date_to or datetime.utcnow()
        s = cls.search()
        s = s.filter('terms',
                     linked_account_id=keys if isinstance(keys, list) else [keys])
        if date_from:
            s = s.filter('range', usage_start_date={
                'from': date_from.isoformat(),
                'to': date_to.isoformat()
            })
        res = client.search(index='awsdetailedlineitem', body=s.to_dict(),
                            size=0, request_timeout=60)
        return res['hits']['total'] > 0

    @classmethod
    @with_cache(is_json=False, ret=lambda x: datetime.strptime(x, "%Y-%m-%d"))
    def get_first_date(cls, keys):
        s = cls.search()
        s = s.filter('terms',
                     linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.sort('usage_start_date')
        res = client.search(index='awsdetailedlineitem', body=s.to_dict(),
                            size=1, request_timeout=60)
        if res['hits']['total'] == 0:
            return
        return res['hits']['hits'][0]['_source']['usage_start_date'].split('T')[0]

    @classmethod
    @with_cache(is_json=False, ret=lambda x: datetime.strptime(x, "%Y-%m-%d"))
    def get_last_date(cls, keys, limit=None):
        s = cls.search()
        s = s.filter('terms',
                     linked_account_id=keys if isinstance(keys, list) else [keys])
        if limit:
            s = s.filter('range', usage_start_date={'to': limit.isoformat()})
        s = s.sort('-usage_start_date')
        res = client.search(index='awsdetailedlineitem', body=s.to_dict(),
                            size=1, request_timeout=60)
        if res['hits']['total'] == 0:
            return
        return res['hits']['hits'][0]['_source']['usage_start_date'].split('T')[0]

    @classmethod
    def get_first_to_now_date(cls, keys):
        def from_date_to_today(d):
            now = datetime.utcnow()
            while d < now:
                yield d
                d += relativedelta(months=1)
        return list(from_date_to_today(cls.get_first_date(keys)))

    @classmethod
    def get_first_to_last_date(cls, keys):
        def from_date_to_last(d):
            last = cls.get_last_date(keys)
            while d < last:
                yield d
                d += relativedelta(months=1)
        return list(from_date_to_last(cls.get_first_date(keys)))

    @classmethod
    @with_cache(6 * 3600)
    def get_available_tags(cls, keys, only_with_data=None, product_name=None):
        s = cls.search()
        s = s.filter('terms',
                     linked_account_id=keys if isinstance(keys, list) else [keys])
        if product_name:
            s = s.filter('term', product_name=product_name)
        s.aggs.bucket('tag_key', 'terms', field='tag.key')
        res = client.search(index='awsdetailedlineitem', body=s.to_dict(),
                            size=0, request_timeout=60)
        tags = []
        for tag in res['aggregations']['tag_key']['buckets']:
            if tag['key'].startswith('user:'):
                name = tag['key'].split(':')[1]
                if not only_with_data \
                        or name in AWSStat.latest_hourly_cpu_usage_by_tag(only_with_data)['tags'] \
                        or name in AWSStat.latest_daily_cpu_usage_by_tag(only_with_data)['tags']:
                    tags.append(name)
        tags.sort()
        return dict(tags=tags)

    @classmethod
    @with_cache(ttl=6 * 3600)
    def get_cost_by_tag(cls, keys, tag, date_from=None, date_to=None):
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            day=calendar.monthrange(date_from.year, date_from.month)[1],
            hour=23, minute=59, second=59, microsecond=999999)
        s = cls.search()
        s = s.filter('terms',
                     linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('term', **{'tag.key': 'user:{}'.format(tag)})
        s = s.filter('range', usage_start_date={
            'from': date_from.isoformat(),
            'to': date_to.isoformat()
        })
        s.aggs.bucket('total_cost', 'sum', field='cost')
        agg = s.aggs.bucket('tag_value', 'terms', field='tag.value',
                            size=0x7FFFFFFF)
        agg.bucket('cost', 'sum', field='cost')
        res = client.search(index='awsdetailedlineitem', body=s.to_dict(),
                            size=0, request_timeout=60)
        tags = [{
            'tag_value': tag['key'],
            'cost': tag['cost']['value'],
        } for tag in res['aggregations']['tag_value']['buckets']]
        return dict(tags=tags,
                    total_cost=res['aggregations']['total_cost']['value'])

    @classmethod
    @with_cache(ttl=6 * 3600)
    def get_cost(cls, keys, date_from, date_to=None):
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            hour=23, minute=59, second=59, microsecond=999999)
        s = cls.search()
        s = s.filter('terms',
                     linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('range', usage_start_date={
            'from': date_from.isoformat(),
            'to': date_to.isoformat()
        })
        s.aggs.bucket('total_cost', 'sum', field='cost')
        res = client.search(index='awsdetailedlineitem', body=s.to_dict(),
                            size=0, request_timeout=60)
        return dict(total_cost=res['aggregations']['total_cost']['value'])

    @classmethod
    @with_cache()
    def get_monthly_cost_by_tag(cls, keys, tag, date_from=None, date_to=None):
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            day=calendar.monthrange(date_from.year, date_from.month)[1],
            hour=23, minute=59, second=59, microsecond=999999)
        s = cls.search()
        s = s.filter('terms',
                     linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('term', **{'tag.key': 'user:{}'.format(tag)})
        s = s.filter('range', usage_start_date={
            'from': date_from.isoformat(),
            'to': date_to.isoformat()
        })
        agg = s.aggs.bucket('intervals', 'date_histogram',
                            field='usage_start_date', interval='month',
                            min_doc_count=1)
        agg.bucket('total_cost', 'sum', field='cost')
        agg = agg.bucket('tag_value', 'terms', field='tag.value',
                         size=0x7FFFFFFF)
        agg.bucket('cost', 'sum', field='cost')
        res = client.search(index='awsdetailedlineitem', body=s.to_dict(),
                            size=0, request_timeout=60)
        months = [{
            'month': interval['key_as_string'].split('T')[0][:-3],
            'tags': [{
                'tag_value': tag['key'],
                'cost': tag['cost']['value'],
            } for tag in interval['tag_value']['buckets']],
            'total_cost': interval['total_cost']['value'],
        } for interval in res['aggregations']['intervals']['buckets']]
        return dict(months=months)

    @classmethod
    @with_cache()
    def get_cost_by_product(cls, key, date_from=None, date_to=None,
                            without_discount=False, only_discount=False,
                            size=0x7FFFFFFF):
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            day=calendar.monthrange(date_from.year, date_from.month)[1],
            hour=23, minute=59, second=59, microsecond=999999)
        s = cls.search()
        s = s.filter('term', linked_account_id=key)
        s = s.filter('range', usage_start_date={
            'from': date_from.isoformat(),
            'to': date_to.isoformat()
        })
        if without_discount:
            s = s.query('bool', filter=[
                ~dsl.Q('term', item_description='PAR_APN_ProgramFee_2500')
            ])
        if only_discount:
            s = s.filter('term', item_description='PAR_APN_ProgramFee_2500')
        agg = s.aggs.bucket('products', 'terms', field='product_name',
                            order={'cost': 'desc'}, size=size)
        agg.bucket('cost', 'sum', field='cost')
        s = s.query('bool', filter=[~dsl.Q('term', cost=0)])
        res = client.search(index='awsdetailedlineitem', body=s.to_dict(),
                            size=0, request_timeout=60)
        products = [{
            'product': SHORT_NAMES.get(product['key'], product['key']),
            'cost': product['cost']['value'],
        } for product in res['aggregations']['products']['buckets']]
        return dict(products=products)

    @classmethod
    @with_cache()
    def get_cost_by_region(cls, keys, tagged=False, byaccount=False,
                           date_from=None, date_to=None, size=0x7FFFFFFF):
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            day=calendar.monthrange(date_from.year, date_from.month)[1],
            hour=23, minute=59, second=59, microsecond=999999)
        s = cls.search()
        s = s.filter('terms',
                     linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('range', usage_start_date={
            'from': date_from.isoformat(),
            'to': date_to.isoformat()
        })
        agg = s.aggs
        if byaccount:
            agg = agg.bucket('accounts', 'terms', field='linked_account_id')
        agg = agg.bucket('intervals', 'date_histogram',
                         field='usage_start_date', interval='month',
                         min_doc_count=1)
        agg = agg.bucket('regions', 'terms', field='availability_zone',
                         size=size)
        agg.bucket('cost', 'sum', field='cost')
        if tagged:
            agg = agg.bucket('tags', 'terms', field='tag.value')
            agg.bucket('cost', 'sum', field='cost')
        res = client.search(index='awsdetailedlineitem', body=s.to_dict(),
                            size=0)
        return res['aggregations']

    @classmethod
    @with_cache()
    def get_monthly_cost(cls, keys, date_from=None, date_to=None,
                         size=0x7FFFFFFF):
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            day=calendar.monthrange(date_from.year, date_from.month)[1],
            hour=23, minute=59, second=59, microsecond=999999)
        s = cls.search()
        s = s.filter('terms',
                     linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('range', usage_start_date={
            'from': date_from.isoformat(),
            'to': date_to.isoformat()
        })
        agg = s.aggs.bucket('intervals', 'date_histogram',
                            field='usage_start_date', interval='month',
                            min_doc_count=1)
        agg.bucket('cost', 'sum', field='cost')
        res = client.search(index='awsdetailedlineitem', body=s.to_dict(),
                            size=0, request_timeout=60)
        res = [{
            'month': interval['key_as_string'].split('T')[0],
            'total_cost': interval['cost']['value'],
        } for interval in res['aggregations']['intervals']['buckets']]
        return dict(months=res)

    @classmethod
    @with_cache()
    def get_monthly_cost_by_product(cls, keys, tagged=False,
                                    date_from=None, date_to=None,
                                    size=0x7FFFFFFF):
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            day=calendar.monthrange(date_from.year, date_from.month)[1],
            hour=23, minute=59, second=59, microsecond=999999)
        s = cls.search()
        s = s.filter('terms',
                     linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('range', usage_start_date={
            'from': date_from.isoformat(),
            'to': date_to.isoformat()
        })
        agg = s.aggs.bucket('intervals', 'date_histogram',
                            field='usage_start_date', interval='month',
                            min_doc_count=1)
        agg = agg.bucket('products', 'terms', field='product_name', size=size)
        agg.bucket('cost', 'sum', field='cost')
        if tagged:
            agg = agg.bucket('tags', 'terms', field='tag.value')
            agg.bucket('cost', 'sum', field='cost')
        s = s.query('bool', filter=[~dsl.Q('term', cost=0)])
        res = client.search(index='awsdetailedlineitem', body=s.to_dict(),
                            size=0, request_timeout=60)

        def tagged_cost(bucket, total):
            total_tag = 0.0
            for tag in bucket:
                total_tag += tag['cost']['value']
                yield (tag['key'], tag['cost']['value'])
            if total != total_tag:
                yield ('untagged', total - total_tag)

        res = [{
            'month': interval['key_as_string'].split('T')[0],
            'products': [{
                'product': SHORT_NAMES.get(product['key'], product['key']),
                'cost': product['cost']['value'],
                'tags': [{
                    'name': tag[0],
                    'cost': tag[1],
                } for tag in tagged_cost(product['tags']['buckets'],
                                         product['cost']['value'])],
            } for product in interval['products']['buckets']] if tagged else [{
                'product': SHORT_NAMES.get(product['key'], product['key']),
                'cost': product['cost']['value'],
            } for product in interval['products']['buckets']]
        } for interval in res['aggregations']['intervals']['buckets']]
        return dict(months=res)

    @classmethod
    @with_cache(ttl=4 * 3600)
    def get_daily_cost_by_product(cls, keys, date_from=None, date_to=None,
                                  size=0x7FFFFFFF):
        date_from = date_from or datetime.utcnow().replace(
            hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            hour=23, minute=59, second=59, microsecond=999999)
        s = cls.search()
        s = s.filter('terms',
                     linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('range', usage_start_date={
            'from': date_from.isoformat(),
            'to': date_to.isoformat()
        })
        agg = s.aggs.bucket('intervals', 'date_histogram',
                            field='usage_start_date', interval='day',
                            min_doc_count=1)
        agg = agg.bucket('products', 'terms', field='product_name', size=size)
        agg.metric('cost', 'sum', field='cost')
        s = s.query('bool', filter=[~dsl.Q('term', cost=0)])
        res = client.search(index='awsdetailedlineitem', body=s.to_dict(),
                            size=0, request_timeout=60)
        res = [{
            'day': interval['key_as_string'].split('T')[0],
            'products': [{
                'product': SHORT_NAMES.get(product['key'], product['key']),
                'cost': product['cost']['value'],
            } for product in interval['products']['buckets']]
        } for interval in res['aggregations']['intervals']['buckets']]
        return dict(days=res)

    @classmethod
    @with_cache(ttl=24 * 3600)
    def get_yearly_cost_by_product(cls, keys, date_from=None, date_to=None,
                                   size=0x7FFFFFFF):
        date_from = date_from or datetime.utcnow().replace(
            month=1, day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            month=12, day=31, hour=23, minute=59, second=59,
            microsecond=999999)
        s = cls.search()
        s = s.filter('terms',
                     linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('range', usage_start_date={
            'from': date_from.isoformat(),
            'to': date_to.isoformat()
        })
        agg = s.aggs.bucket('intervals', 'date_histogram',
                            field='usage_start_date', interval='year',
                            min_doc_count=1)
        agg = agg.bucket('products', 'terms', field='product_name', size=size)
        agg.metric('cost', 'sum', field='cost')
        s = s.query('bool', filter=[~dsl.Q('term', cost=0)])
        res = client.search(index='awsdetailedlineitem', body=s.to_dict(),
                            size=0, request_timeout=60)
        res = [{
            'year': interval['key_as_string'][:4],
            'products': [{
                'product': SHORT_NAMES.get(product['key'], product['key']),
                'cost': product['cost']['value'],
            } for product in interval['products']['buckets']]
        } for interval in res['aggregations']['intervals']['buckets']]
        return dict(years=res)

    @classmethod
    @with_cache()
    def get_cost_by_resource(cls, keys, date_from=None, date_to=None,
                             search=None):
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            day=calendar.monthrange(date_from.year, date_from.month)[1],
            hour=23, minute=59, second=59, microsecond=999999)
        s = cls.search()
        s = s.filter('terms',
                     linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('range', usage_start_date={
            'from': date_from.isoformat(),
            'to': date_to.isoformat()
        })
        if search:
            s = s.query('wildcard', resource_id='*{}*'.format(search))
        agg = s.aggs.bucket('resources', 'terms', field='resource_id',
                            order={'cost': 'desc'}, size=0x7FFFFFFF)
        agg.bucket('cost', 'sum', field='cost')
        res = client.search(index='awsdetailedlineitem', body=s.to_dict(),
                            size=0, request_timeout=60)
        resources = [{
            'resource': resource['key'],
            'cost': resource['cost']['value'],
        } for resource in res['aggregations']['resources']['buckets']]
        return resources

    @classmethod
    def get_monthly_cost_by_resource(cls, resource_ids, date_from=None,
                                     date_to=None):
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            day=calendar.monthrange(date_from.year, date_from.month)[1],
            hour=23, minute=59, second=59, microsecond=999999)
        if resource_ids:
            s = cls.search()
            s = s.filter('range', usage_start_date={
                'from': date_from.isoformat(),
                'to': date_to.isoformat()
            })
            s = s.filter('terms', resource_id=list(resource_ids))
            agg = s.aggs.bucket('months', 'date_histogram',
                                field='usage_start_date', interval='month',
                                min_doc_count=1)
            agg.metric('cost', 'sum', field='cost')
            r = client.search('awsdetailedlineitem', body=s.to_dict(),
                              size=0, request_timeout=60)
            return {
                e['key_as_string']: e['cost']['value']
                for e in r['aggregations']['months']['buckets']
            }
        else:
            return {}

    @classmethod
    @with_cache()
    def get_lambda_usage(cls, keys, date_from=None, date_to=None):
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            day=calendar.monthrange(date_from.year, date_from.month)[1],
            hour=23, minute=59, second=59, microsecond=999999)
        s = cls.search()
        s = s.filter('terms',
                     linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('term', product_name='AWS Lambda')
        s = s.filter('range', usage_start_date={
            'from': date_from.isoformat(),
            'to': date_to.isoformat()
        })
        agg = s.aggs.bucket('resources', 'terms', field='resource_id',
                            size=0x7FFFFFFF)
        agg.metric('cost', 'avg', field='cost')
        agg = agg.bucket('types', 'terms', field='usage_type',
                         size=0x7FFFFFFF)
        agg.metric('quantity', 'sum', field='usage_quantity')
        agg = agg.bucket('descriptions', 'terms', field='item_description',
                         size=0x7FFFFFFF)
        agg.metric('quantity', 'sum', field='usage_quantity')
        res = client.search(index='awsdetailedlineitem', body=s.to_dict(),
                            size=0, request_timeout=60)
        # return res

        def _lambda_usage_regb(buckets, endswith):
            for b in buckets:
                if b['key'].endswith(endswith):
                    return b['quantity']['value']

        usages = [{
            'rid': usage['key'],
            'name': usage['key'].split(':')[-1],
            'requests': _lambda_usage_regb(usage['types']['buckets'],
                                           '-Request'),
            'gb_seconds': _lambda_usage_regb(usage['types']['buckets'],
                                             '-Lambda-GB-Second'),
            'cost': usage['cost']['value'],
            'raw_cost': lambdapricing.get_raw_cost([
                x['descriptions']['buckets'] for x in usage['types']['buckets']
            ]),
        } for usage in res['aggregations']['resources']['buckets']]
        return usages

    @classmethod
    @with_cache()
    def get_s3_bandwidth_costs(cls, key, date_from=None, date_to=None):
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            day=calendar.monthrange(date_from.year, date_from.month)[1],
            hour=23, minute=59, second=59, microsecond=999999)
        s = cls.search()
        s = s.filter('term', linked_account_id=key)
        s = s.filter('term', product_name='Amazon Simple Storage Service')
        s = s.filter('range', usage_start_date={
            'from': date_from.isoformat(),
            'to': date_to.isoformat()
        })
        agg = s.aggs.bucket('types', 'terms', field='usage_type',
                            size=0x7FFFFFFF)
        agg.metric('cost', 'sum', field='cost')
        agg.metric('gb', 'sum', field='usage_quantity')
        res = client.search(index='awsdetailedlineitem', body=s.to_dict(),
                            size=0, request_timeout=60)
        transfers = [{
            'type': transfer['key'],
            'quantity': transfer['gb']['value'],
            'cost': transfer['cost']['value'],
        } for transfer in res['aggregations']['types']['buckets']]
        return transfers

    @classmethod
    @with_cache()
    def get_ec2_bandwidth_costs(cls, key, date_from=None, date_to=None):
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            day=calendar.monthrange(date_from.year, date_from.month)[1],
            hour=23, minute=59, second=59, microsecond=999999)
        s = cls.search()
        s = s.filter('term', linked_account_id=key)
        s = s.filter('term', product_name='Amazon Elastic Compute Cloud')
        s = s.filter('range', usage_start_date={
            'from': date_from.isoformat(),
            'to': date_to.isoformat()
        })
        agg = s.aggs.bucket('types', 'terms', field='usage_type',
                            size=0x7FFFFFFF)
        agg.metric('cost', 'sum', field='cost')
        agg.metric('gb', 'sum', field='usage_quantity')
        res = client.search(index='awsdetailedlineitem', body=s.to_dict(),
                            size=0, request_timeout=60)
        transfers = [{
            'type': transfer['key'],
            'quantity': transfer['gb']['value'],
            'cost': transfer['cost']['value'],
        } for transfer in res['aggregations']['types']['buckets']]
        return transfers

    @classmethod
    def get_ec2_daily_cost(cls, key):
        s = cls.search()
        s = s.filter('term', linked_account_id=key)
        s = s.filter('term', product_name='Amazon Elastic Compute Cloud')
        agg = s.aggs.bucket('intervals', 'date_histogram',
                            field='usage_start_date', interval='day',
                            min_doc_count=1)
        agg.metric('cost', 'sum', field='cost')
        res = client.search(index='awsdetailedlineitem', body=s.to_dict(),
                            size=0, request_timeout=60)
        for interval in res['aggregations']['intervals']['buckets']:
            yield interval['key_as_string'].split('T')[0], interval['cost']['value']

    @classmethod
    @with_cache()
    def get_elb_usage_a_day(cls, keys, date_from=None, date_to=None):
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            day=calendar.monthrange(date_from.year, date_from.month)[1],
            hour=23, minute=59, second=59, microsecond=999999)
        gib = Fraction(2**30)
        s = cls.search()
        s = s.filter('terms',
                     linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('range', usage_start_date={
            'from': date_from.isoformat(),
            'to': date_to.isoformat()
        })
        s = s.filter("prefix", resource_id="arn:aws:elasticloadbalancing")
        s = s.sort({"usage_start_date": {"order": "desc"}})
        agg = s.aggs.bucket('rid', 'terms', field='resource_id',
                            size=0x7FFFFFFF)
        agg.metric('cost', 'sum', field='cost')
        agg = agg.bucket('types', 'terms', field='usage_type',
                         size=0x7FFFFFFF)
        agg.metric('quantity', 'sum', field='usage_quantity')
        res = client.search(index='awsdetailedlineitem', body=s.to_dict(),
                            size=0, request_timeout=60)
        elbs = [{
            'rid': elb['key'],
            'cost': elb['cost']['value'] / (date_to - date_from).days,
            'hours': float(sum([
                x['quantity']['value']
                for x in elb['types']['buckets']
                if x['key'].endswith('LoadBalancerUsage')
            ]) / (date_to - date_from).days),
            'bytes': float((sum([
                x['quantity']['value']
                for x in elb['types']['buckets']
                if x['key'].endswith('Bytes')
            ]) * gib) / (date_to - date_from).days),
        } for elb in res['aggregations']['rid']['buckets']]
        return elbs

    @classmethod
    @with_cache()
    def get_instance_type(cls, keys, date_from=None, date_to=None):
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            day=calendar.monthrange(date_from.year, date_from.month)[1],
            hour=23, minute=59, second=59, microsecond=999999)
        s = cls.search()
        s = s.extra(_source=[
            'usage_start_date', 'usage_type', 'availability_zone',
            'resource_id'
        ])
        s = s.filter('terms',
                     linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('range', usage_start_date={
            'from': date_from.isoformat(),
            'to': date_to.isoformat()
        })
        s = s.filter("term", product_name='Amazon Elastic Compute Cloud')
        s = s.query('wildcard', usage_type='*BoxUsage:*')
        s = s.filter('exists', field='resource_id')
        s = s.sort({"usage_start_date": {"order": "desc"}})
        res = client.search(index='awsdetailedlineitem', body=s.to_dict(),
                            size=10000, request_timeout=60)

        def cut_region_name(s):
            return s[:-1] if s[-1].isalpha() else s

        types = []
        refs = {}

        def add_in_types(type, rid):
            ref_tuple = (type['hour'], type['instance'], type['region'])
            if ref_tuple in refs:
                refs[ref_tuple]['rids'].append(rid)
                refs[ref_tuple]['ridCount'] += 1
                return
            type['rids'] = [rid]
            types.append(type)
            refs[ref_tuple] = types[-1]

        for r in res['hits']['hits']:
            elem = {
                'hour': r['_source']['usage_start_date'],
                'instance': r['_source']['usage_type'].split(':')[1],
                'region': cut_region_name(r['_source']['availability_zone'])
                          if 'availability_zone' in r['_source']
                          else 'unknown',
                'ridCount': 1,
            }
            add_in_types(elem, r['_source']['resource_id'])
        return types

    @classmethod
    @with_cache()
    def get_instance_hour(cls, keys, date_from=None, date_to=None,
                          min_hour=None):
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            day=calendar.monthrange(date_from.year, date_from.month)[1],
            hour=23, minute=59, second=59, microsecond=999999)
        s = cls.search()
        s = s.filter('terms',
                     linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('range', usage_start_date={
            'from': date_from.isoformat(),
            'to': date_to.isoformat()
        })
        s = s.filter("term", product_name='Amazon Elastic Compute Cloud')
        s = s.filter('prefix', resource_id='i-')
        s = s.query('wildcard', usage_type='*BoxUsage*')
        agg = s.aggs.bucket('resource_id', 'terms', field='resource_id',
                            size=0x7FFFFFFF)
        agg.bucket('days', 'date_histogram', field='usage_start_date',
                   interval='day', min_doc_count=1)
        res = client.search(index='awsdetailedlineitem', body=s.to_dict(),
                            size=0, request_timeout=60)
        instance_list = []
        for instance in res['aggregations']['resource_id']['buckets']:
            tmp_hours = []
            for day in instance['days']['buckets']:
                tmp_hours.append(day['doc_count'])
            avg_hours = sum(tmp_hours) / float(len(tmp_hours))
            if not min_hour or avg_hours >= min_hour:
                instance_list.append(dict(id=instance['key'], hours=avg_hours))
        return sorted(instance_list, key=lambda x: x['hours'], reverse=True)

    @classmethod
    @with_cache()
    def get_s3_buckets_per_tag(cls, keys):
        def _check_if_in_list(dict_list, value, key):
            return next((item for item in dict_list if item[key] == value), None)

        def _parse_tag_keys_results(res):
            bucket_tagged = []
            for bucket_tag_key in res['aggregations']['tag_key']['buckets']:
                buff_tag_key = _check_if_in_list(bucket_tagged,
                                                 bucket_tag_key['key'],
                                                 'tag_key')
                if buff_tag_key is None:
                    buff_tag_key = {
                        "tag_key": bucket_tag_key['key'],
                        "tag_value": []
                    }
                buff_tag_key = _parse_tag_values_results(bucket_tag_key,
                                                         buff_tag_key)
                bucket_tagged.append(buff_tag_key)
            return bucket_tagged

        def _parse_tag_values_results(bucket_tag_key, buff_tag_key):
            for bucket_tag_value in bucket_tag_key['tag_value']['buckets']:
                buff_tag_value = _check_if_in_list(buff_tag_key['tag_value'],
                                                   bucket_tag_value['key'],
                                                   'tag_value')
                if buff_tag_value is None:
                    buff_tag_value = {
                        "tag_value": bucket_tag_value['key'],
                        "s3_buckets": []
                    }
                buff_tag_value = _parse_buckets_results(buff_tag_value,
                                                        bucket_tag_value)
                buff_tag_key['tag_value'].append(buff_tag_value)
            return buff_tag_key

        def _parse_buckets_results(buff_tag_value, bucket_tag_value):
            for bucket_resource_id in bucket_tag_value['ressource_id']['buckets']:
                buff_bucket_resource_id = _check_if_in_list(
                    buff_tag_value['s3_buckets'],
                    bucket_resource_id['key'],
                    'bucket_name')
                if buff_bucket_resource_id is None:
                    buff_bucket_resource_id = {
                        "bucket_name": bucket_resource_id['key'],
                        "account_id": bucket_resource_id['account_id']['buckets'][0]['key']
                    }
                buff_tag_value['s3_buckets'].append(buff_bucket_resource_id)
            return buff_tag_value

        s = cls.search()
        s = s.filter('terms',
                     linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('term', product_name='Amazon Simple Storage Service')
        s = s.query('exists', field="tag")
        s = s.query('wildcard', item_description="*storage*")
        agg = s.aggs.bucket('tag_key', 'terms', field="tag.key")
        agg = agg.bucket('tag_value', 'terms', field='tag.value')
        agg.bucket('ressource_id', 'terms', field='resource_id').bucket(
            'account_id', 'terms', field='linked_account_id')
        res = client.search(index='awsdetailedlineitem', body=s.to_dict(),
                            size=0, request_timeout=60)
        '''
        bucket_tagged structure:
        [{
            "tag_key": "KEY",          # Unique in list
            "tag_value": [{
                "tag_value": "VALUE",  # Unique in list
                "s3_buckets": [{
                    "bucket_name": "BUCKET_NAME",
                    "account_id": "ACCOUNT_ID"
                }, {...}]
            }, {...}]
        }, {...}]
        '''
        bucket_tagged = _parse_tag_keys_results(res)
        return bucket_tagged

    @classmethod
    @with_cache()
    def get_s3_bandwidth_info_and_cost_per_name(cls, key, bucket_resource_ids,
                                                date_from=None, date_to=None):
        date_from = date_from or (datetime.utcnow() - relativedelta(month=1)).replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            day=calendar.monthrange(date_from.year, date_from.month)[1],
            hour=23, minute=59, second=59, microsecond=999999)
        s = cls.search()
        s = s.filter('term', linked_account_id=key)
        s = s.filter('term', product_name='Amazon Simple Storage Service')
        s = s.filter('terms',
                     resource_id=bucket_resource_ids
                     if isinstance(bucket_resource_ids, list)
                     else [bucket_resource_ids])
        s = s.filter('range', usage_start_date={
            'from': date_from.isoformat(),
            'to': date_to.isoformat()
        })
        s = s.filter('wildcard', usage_type="*Bytes")
        agg = s.aggs.bucket('bucket_name', 'terms', field='resource_id',
                            size=0x7FFFFFFF)
        agg.metric('cost', 'sum', field='cost')
        agg = agg.bucket('transfer_type', 'terms', field='usage_type')
        agg.metric('data', 'sum', field='usage_quantity')
        res = client.search(index='awsdetailedlineitem', body=s.to_dict(),
                            size=0, request_timeout=60)
        data = [{
            "bucket_name": bucket['key'],
            "cost": bucket['cost']['value'],
            "transfer_stats": [{
                "type": transfer_stat['key'],
                "data": transfer_stat['data']['value']
            } for transfer_stat in bucket['transfer_type']['buckets']]
        } for bucket in res['aggregations']['bucket_name']['buckets']]
        return data
class AWSMetric(dsl.DocType):
    class Meta:
        index = 'awsmetric'

    key = dsl.String(index='not_analyzed')
    resource = dsl.String(index='not_analyzed')
    metric = dsl.String(index='not_analyzed')
    time = dsl.Date(format='date_optional_time||epoch_millis')
    period = dsl.Integer()
    value = dsl.Double()

    @classmethod
    def underutilized_resources(cls, keys, timespan=timedelta(days=30)):
        keys = any_key_to_string_array(keys)
        s = cls.search()
        s = s.filter('range',
                     time={'gt': (datetime.utcnow() - timespan).isoformat()})
        s = s.filter('term', metric='AWS/EC2:CPUUtilization:Maximum')
        s = s.filter('terms', key=keys)
        agg = s.aggs.bucket('resources', 'terms', field='resource', size=300)
        agg.metric('percentiles', 'percentile_ranks', field='value',
                   values=[20, 50])
        res = client.search(index='awsmetric', body=s.to_dict(), size=0,
                            request_timeout=60)
        resources = []
        for resource in res['aggregations']['resources']['buckets']:
            if resource['percentiles']['values']['20.0'] == 100:
                res_region, res_id = resource['key'].split('/')
                resources.append(
                    dict(type='EC2 Instance',
                         id=res_id,
                         region=res_region,
                         underutilized=['CPU usage under 20%']))
        return dict(resources=resources)

    @classmethod
    def hourly_cpu_usage(cls, keys, resources=None):
        s = cls.search()
        if isinstance(keys, basestring):
            keys = [keys]
        elif not isinstance(keys, list):
            keys = list(keys)
        assert all(isinstance(key, basestring) for key in keys)
        s = s.filter('terms', key=keys)
        if resources:
            s = s.filter('terms', resource=resources)
        s = s.filter('term', metric='AWS/EC2:CPUUtilization:Maximum')
        agg = s.aggs.bucket('intervals', 'date_histogram', field='time',
                            interval='hour', min_doc_count=1)
        agg.metric('utilization', 'avg', field='value')
        res = client.search(index='awsmetric', body=s.to_dict(), size=0,
                            request_timeout=60)
        tmp_hours = defaultdict(list)
        for interval in res['aggregations']['intervals']['buckets']:
            interval_hour = interval['key_as_string'].split('T')[1].split(':')[0]
            tmp_hours[interval_hour].append(interval['utilization']['value'])
        hours = OrderedDict(
            zip(["{:02d}".format(x) for x in range(0, 24)],
                itertools.repeat(0)))
        for hour, values in tmp_hours.iteritems():
            hours[hour] = sum(values) / len(values)
        if not tmp_hours:
            return None
        return [
            dict(hour=hour, cpu=float(cpu)) for hour, cpu in hours.iteritems()
        ]

    @classmethod
    def days_of_the_week_cpu_usage(cls, keys, resources=None):
        s = cls.search()
        if isinstance(keys, basestring):
            keys = [keys]
        elif not isinstance(keys, list):
            keys = list(keys)
        assert all(isinstance(key, basestring) for key in keys)
        s = s.filter('terms', key=keys)
        if resources:
            s = s.filter('terms', resource=resources)
        s = s.filter('term', metric='AWS/EC2:CPUUtilization:Maximum')
        agg = s.aggs.bucket('intervals', 'date_histogram', field='time',
                            interval='day', min_doc_count=1)
        agg.metric('utilization', 'avg', field='value')
        res = client.search(index='awsmetric', body=s.to_dict(), size=0,
                            request_timeout=60)
        tmp_days_of_the_week = defaultdict(list)
        for interval in res['aggregations']['intervals']['buckets']:
            weekday = datetime.strptime(
                interval['key_as_string'].split('T')[0],
                '%Y-%m-%d').date().weekday()
            tmp_days_of_the_week[weekday].append(
                interval['utilization']['value'])
        days = OrderedDict(zip(range(0, 7), itertools.repeat(0)))
        for weekday, values in tmp_days_of_the_week.iteritems():
            days[weekday] = sum(values) / len(values)
        if not tmp_days_of_the_week:
            return None
        return [
            dict(day=calendar.day_name[weekday], cpu=float(cpu))
            for weekday, cpu in days.iteritems()
        ]

    @classmethod
    def daily_cpu_utilization(cls, key):
        s = cls.search()
        s = s.filter('term', key=key)
        s = s.filter('term', metric='AWS/EC2:CPUUtilization:Maximum')
        agg = s.aggs.bucket('intervals', 'date_histogram', field='time',
                            interval='day', min_doc_count=1)
        agg.metric('utilization', 'avg', field='value')
        res = client.search(index='awsmetric', body=s.to_dict(), size=0,
                            request_timeout=60)
        for interval in res['aggregations']['intervals']['buckets']:
            yield interval['key_as_string'].split('T')[0], interval['utilization']['value']

    @classmethod
    def get_cpu_usage(cls, key, timespan=timedelta(days=30)):
        s = cls.search()
        s = s.filter('range',
                     time={'gt': (datetime.utcnow() - timespan).isoformat()})
        s = s.filter('term', metric='AWS/EC2:CPUUtilization:Maximum')
        s = s.filter('term', key=key)
        agg = s.aggs.bucket('resources', 'terms', field='resource', size=300)
        agg.metric('utilization', 'avg', field='value')
        res = client.search(index='awsmetric', body=s.to_dict(), size=0,
                            request_timeout=60)
        for resource in res['aggregations']['resources']['buckets']:
            yield resource['key'], resource['utilization']['value']

    @classmethod
    def get_instance_read_iops_usage(cls, key, timespan=timedelta(days=30)):
        s = cls.search()
        s = s.filter('range',
                     time={'gt': (datetime.utcnow() - timespan).isoformat()})
        s = s.filter('term', metric='AWS/EC2:DiskReadOps:Average')
        s = s.filter('term', key=key)
        agg = s.aggs.bucket('resources', 'terms', field='resource', size=300)
        agg.metric('utilization', 'avg', field='value')
        res = client.search(index='awsmetric', body=s.to_dict(), size=0,
                            request_timeout=60)
        for resource in res['aggregations']['resources']['buckets']:
            yield resource['key'], resource['utilization']['value']

    @classmethod
    def get_instance_write_iops_usage(cls, key, timespan=timedelta(days=30)):
        s = cls.search()
        s = s.filter('range',
                     time={'gt': (datetime.utcnow() - timespan).isoformat()})
        s = s.filter('term', metric='AWS/EC2:DiskWriteOps:Average')
        s = s.filter('term', key=key)
        agg = s.aggs.bucket('resources', 'terms', field='resource', size=300)
        agg.metric('utilization', 'avg', field='value')
        res = client.search(index='awsmetric', body=s.to_dict(), size=0,
                            request_timeout=60)
        for resource in res['aggregations']['resources']['buckets']:
            yield resource['key'], resource['utilization']['value']

    @classmethod
    def get_instance_read_bytes_usage(cls, key, timespan=timedelta(days=30)):
        s = cls.search()
        s = s.filter('range',
                     time={'gt': (datetime.utcnow() - timespan).isoformat()})
        s = s.filter('term', metric='AWS/EBS:DiskReadBytes:Average')
        s = s.filter('term', key=key)
        agg = s.aggs.bucket('resources', 'terms', field='resource', size=300)
        agg.metric('utilization', 'avg', field='value')
        res = client.search(index='awsmetric', body=s.to_dict(), size=0,
                            request_timeout=60)
        for resource in res['aggregations']['resources']['buckets']:
            yield resource['key'], resource['utilization']['value']

    @classmethod
    def get_instance_write_bytes_usage(cls, key, timespan=timedelta(days=30)):
        s = cls.search()
        s = s.filter('range',
                     time={'gt': (datetime.utcnow() - timespan).isoformat()})
        s = s.filter('term', metric='AWS/EBS:DiskWriteBytes:Average')
        s = s.filter('term', key=key)
        agg = s.aggs.bucket('resources', 'terms', field='resource', size=300)
        agg.metric('utilization', 'avg', field='value')
        res = client.search(index='awsmetric', body=s.to_dict(), size=0,
                            request_timeout=60)
        for resource in res['aggregations']['resources']['buckets']:
            yield resource['key'], resource['utilization']['value']

    @classmethod
    def get_volume_read_iops_usage(cls, key, timespan=timedelta(days=30)):
        s = cls.search()
        s = s.filter('range',
                     time={'gt': (datetime.utcnow() - timespan).isoformat()})
        s = s.filter('term', metric='AWS/EBS:VolumeReadOps:Average')
        s = s.filter('term', key=key)
        agg = s.aggs.bucket('resources', 'terms', field='resource', size=300)
        agg.metric('utilization', 'avg', field='value')
        res = client.search(index='awsmetric', body=s.to_dict(), size=0,
                            request_timeout=60)
        for resource in res['aggregations']['resources']['buckets']:
            yield resource['key'], resource['utilization']['value']

    @classmethod
    def get_volume_write_iops_usage(cls, key, timespan=timedelta(days=30)):
        s = cls.search()
        s = s.filter('range',
                     time={'gt': (datetime.utcnow() - timespan).isoformat()})
        s = s.filter('term', metric='AWS/EBS:VolumeWriteOps:Average')
        s = s.filter('term', key=key)
        agg = s.aggs.bucket('resources', 'terms', field='resource', size=300)
        agg.metric('utilization', 'avg', field='value')
        res = client.search(index='awsmetric', body=s.to_dict(), size=0,
                            request_timeout=60)
        for resource in res['aggregations']['resources']['buckets']:
            yield resource['key'], resource['utilization']['value']

    @classmethod
    def get_volume_read_bytes_usage(cls, key, timespan=timedelta(days=30)):
        s = cls.search()
        s = s.filter('range',
                     time={'gt': (datetime.utcnow() - timespan).isoformat()})
        s = s.filter('term', metric='AWS/EBS:VolumeReadBytes:Average')
        s = s.filter('term', key=key)
        agg = s.aggs.bucket('resources', 'terms', field='resource', size=300)
        agg.metric('utilization', 'avg', field='value')
        res = client.search(index='awsmetric', body=s.to_dict(), size=0,
                            request_timeout=60)
        for resource in res['aggregations']['resources']['buckets']:
            yield resource['key'], resource['utilization']['value']

    @classmethod
    def get_volume_write_bytes_usage(cls, key, timespan=timedelta(days=30)):
        s = cls.search()
        s = s.filter('range',
                     time={'gt': (datetime.utcnow() - timespan).isoformat()})
        s = s.filter('term', metric='AWS/EBS:VolumeWriteBytes:Average')
        s = s.filter('term', key=key)
        agg = s.aggs.bucket('resources', 'terms', field='resource', size=300)
        agg.metric('utilization', 'avg', field='value')
        res = client.search(index='awsmetric', body=s.to_dict(), size=0,
                            request_timeout=60)
        for resource in res['aggregations']['resources']['buckets']:
            yield resource['key'], resource['utilization']['value']

    @classmethod
    def get_network_in_usage(cls, key, timespan=timedelta(days=30)):
        s = cls.search()
        s = s.filter('range',
                     time={'gt': (datetime.utcnow() - timespan).isoformat()})
        s = s.filter('term', metric='AWS/EC2:NetworkIn:Average')
        s = s.filter('term', key=key)
        agg = s.aggs.bucket('resources', 'terms', field='resource', size=300)
        agg.metric('utilization', 'avg', field='value')
        res = client.search(index='awsmetric', body=s.to_dict(), size=0,
                            request_timeout=60)
        for resource in res['aggregations']['resources']['buckets']:
            yield resource['key'], resource['utilization']['value']

    @classmethod
    def get_network_out_usage(cls, key, timespan=timedelta(days=30)):
        s = cls.search()
        s = s.filter('range',
                     time={'gt': (datetime.utcnow() - timespan).isoformat()})
        s = s.filter('term', metric='AWS/EC2:NetworkOut:Average')
        s = s.filter('term', key=key)
        agg = s.aggs.bucket('resources', 'terms', field='resource', size=300)
        agg.metric('utilization', 'avg', field='value')
        res = client.search(index='awsmetric', body=s.to_dict(), size=0,
                            request_timeout=60)
        for resource in res['aggregations']['resources']['buckets']:
            yield resource['key'], resource['utilization']['value']

    @classmethod
    def get_s3_space_usage(cls, key, timespan=timedelta(days=30)):
        s = cls.search()
        s = s.filter('range',
                     time={'gt': (datetime.utcnow() - timespan).isoformat()})
        s = s.filter('term', metric='AWS/S3:BucketSizeBytes:Average')
        s = s.filter('term', key=key)
        agg = s.aggs.bucket('resources', 'terms', field='resource', size=300)
        agg.metric('utilization', 'avg', field='value')
        res = client.search(index='awsmetric', body=s.to_dict(), size=0,
                            request_timeout=60)
        for resource in res['aggregations']['resources']['buckets']:
            yield resource['key'], resource['utilization']['value']
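# A hedged usage sketch: most of the metric accessors above are generators
# yielding (resource, value) pairs, so they collect naturally into a dict.
# The account key is an illustrative placeholder.
cpu_by_resource = dict(AWSMetric.get_cpu_usage('123456789012'))
underused = AWSMetric.underutilized_resources(['123456789012'])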