Example #1
0
class TrainingJob(elasticsearch_dsl.Document):
    """Elasticsearch document describing a single training job.

    The fields fall into three groups: job bookkeeping (name, author,
    timestamps, raw log, model URL), result metrics, and the parameters the
    job was run with. Documents are stored in the index named by
    ``TRAINING_JOBS``.
    """

    # Job bookkeeping
    id = elasticsearch_dsl.Integer()
    schema_version = elasticsearch_dsl.Integer()
    job_name = elasticsearch_dsl.Keyword()
    author = elasticsearch_dsl.Keyword()
    created_at = elasticsearch_dsl.Date()
    ended_at = elasticsearch_dsl.Date()
    params = elasticsearch_dsl.Text()
    raw_log = elasticsearch_dsl.Text()
    model_url = elasticsearch_dsl.Text()

    # Metrics
    epochs = elasticsearch_dsl.Integer()
    train_acc = elasticsearch_dsl.Float()
    final_val_acc = elasticsearch_dsl.Float()
    best_val_acc = elasticsearch_dsl.Float()
    final_val_loss = elasticsearch_dsl.Float()
    best_val_loss = elasticsearch_dsl.Float()
    final_val_sensitivity = elasticsearch_dsl.Float()
    best_val_sensitivity = elasticsearch_dsl.Float()
    final_val_specificity = elasticsearch_dsl.Float()
    best_val_specificity = elasticsearch_dsl.Float()
    final_val_auc = elasticsearch_dsl.Float()
    best_val_auc = elasticsearch_dsl.Float()

    # Params
    batch_size = elasticsearch_dsl.Integer()
    val_split = elasticsearch_dsl.Float()
    seed = elasticsearch_dsl.Integer()

    # Image augmentation settings. Note that zoom_range is a Keyword, unlike
    # the other *_range fields, which are Floats.
    rotation_range = elasticsearch_dsl.Float()
    width_shift_range = elasticsearch_dsl.Float()
    height_shift_range = elasticsearch_dsl.Float()
    shear_range = elasticsearch_dsl.Float()
    zoom_range = elasticsearch_dsl.Keyword()
    horizontal_flip = elasticsearch_dsl.Boolean()
    vertical_flip = elasticsearch_dsl.Boolean()

    dropout_rate1 = elasticsearch_dsl.Float()
    dropout_rate2 = elasticsearch_dsl.Float()

    data_dir = elasticsearch_dsl.Keyword()
    gcs_url = elasticsearch_dsl.Keyword()

    mip_thickness = elasticsearch_dsl.Integer()
    height_offset = elasticsearch_dsl.Integer()
    pixel_value_range = elasticsearch_dsl.Keyword()

    # We need to keep a list of params for the parser because
    # we can't use traditional approaches to get the class attrs.
    # The names below are exactly the fields declared in the "Params"
    # section above; keep the two in sync when adding a parameter.
    params_to_parse = [
        'batch_size', 'val_split', 'seed', 'rotation_range',
        'width_shift_range', 'height_shift_range', 'shear_range', 'zoom_range',
        'horizontal_flip', 'vertical_flip', 'dropout_rate1', 'dropout_rate2',
        'data_dir', 'gcs_url', 'mip_thickness', 'height_offset',
        'pixel_value_range'
    ]

    class Index:
        # Index name comes from the TRAINING_JOBS constant defined elsewhere.
        name = TRAINING_JOBS
Example #2
0
def document_field(field):
    """
    The default ``field_factory`` method for converting Django field instances to ``elasticsearch_dsl.Field`` instances.
    Auto-created fields (primary keys, for example) and one-to-many fields (reverse FK relationships) are skipped.

    :param field: Django model field instance to convert.
    :returns: ``None`` for skipped fields, ``RawMultiString`` for
        many-to-many fields, the matching ``dsl`` field for the Django field
        classes listed in ``defaults``, and ``RawString`` for anything else.
    """
    if field.auto_created or field.one_to_many:
        return None
    if field.many_to_many:
        return RawMultiString
    # The lookup is on the exact field class, so subclasses that are not
    # listed here fall through to ``RawString``.
    defaults = {
        models.DateField: dsl.Date(),
        models.DateTimeField: dsl.Date(),
        models.IntegerField: dsl.Long(),
        models.PositiveIntegerField: dsl.Long(),
        models.BooleanField: dsl.Boolean(),
        models.NullBooleanField: dsl.Boolean(),
        # Slugs must be matched verbatim. The legacy spelling was
        # ``String(index='not_analyzed')``; ``text`` fields are always
        # analyzed and their ``index`` option is a boolean, so the
        # equivalent modern mapping is ``keyword``.
        models.SlugField: dsl.Keyword(),
        models.DecimalField: dsl.Double(),
        models.FloatField: dsl.Float(),
    }
    return defaults.get(field.__class__, RawString)
Example #3
0
def doc_field(type):
    """Return the ``dsl`` field for a type name, or ``RawString`` if unknown.

    Recognised names are ``'date'``, ``'integer'``, ``'boolean'``,
    ``'double'`` and ``'float'``; a fresh field instance is returned for
    each of them.
    """
    field_classes = {
        'date': dsl.Date,
        'integer': dsl.Long,
        'boolean': dsl.Boolean,
        'double': dsl.Double,
        'float': dsl.Float,
    }
    field_class = field_classes.get(type)
    if field_class is None:
        return RawString
    return field_class()
Example #4
0
class GameSummary(elasticsearch_dsl.Document):
    """Game search model.

    Stored in the ``games`` index. ``players`` is an object field whose
    structure is described by ``PlayersInGame``.
    """

    id = elasticsearch_dsl.Text()
    name = elasticsearch_dsl.Text()
    isPublic = elasticsearch_dsl.Boolean()
    players = elasticsearch_dsl.Object(PlayersInGame)

    class Index:  # pylint: disable=missing-class-docstring
        name = "games"
Example #5
0
class EntityDocument(CollectionDocument):
    """Document for entity search.

    Extends ``CollectionDocument`` with the two entity-specific fields
    declared below.
    """

    descriptor_completed = dsl.Boolean()
    # Declared with multi=True, i.e. the field holds a list of integers.
    # Presumably these are collection ids -- TODO confirm against the indexer.
    collections = dsl.Integer(multi=True)

    class Meta:
        """Meta class for entity search document."""

        index = 'entity'
Example #6
0
class Dictionary(es.Document):
    """Elasticsearch document holding the metadata of a dictionary.

    Stored in the index named by ``ES_INDEX_DICTIONARY_INDEX`` and accessed
    through ``ES_CLIENT``.
    """

    corpus = es.Keyword()
    name = es.Keyword()
    description = es.Text()
    datetime = es.Date()
    number_of_documents = es.Integer()

    # Readiness flag; what sets it is not visible in this class.
    is_ready = es.Boolean()

    class Index:
        name = ES_INDEX_DICTIONARY_INDEX
        using = ES_CLIENT
class DataDocType(es.DocType):
    """Elasticsearch test model.

    Uses the older ``DocType`` base class with a ``Meta`` inner class; the
    documents live in the ``test`` index.
    """
    first_name = es.Keyword()
    last_name = es.Keyword()
    city = es.Text()
    skills = es.Keyword()
    birthday = es.Date()
    is_active = es.Boolean()
    score = es.Integer()
    description = es.Text()

    class Meta:
        index = 'test'
Example #8
0
        def decorator(cls):
            """Create elasticsearch_dsl fields on ``cls`` from ``cls.schema``.

            ``cls.schema`` maps each field name to a cerberus-style rule dict.
            Two special keys may be used in addition to the standard cerberus
            syntax:

            * ``"elastic"``: kwargs handed raw to the elasticsearch_dsl field
              constructor.
            * ``"elastictype"``: a more specific elasticsearch_dsl type
              definition (Text instead of string). NOTE(review): this key is
              only removed below, it is never used to pick the field class --
              confirm whether that is intended.

            Both special keys are removed from the schema before returning.

            :param cls: class carrying a ``schema`` attribute.
            :returns: the same class, with one field attribute set per schema key.
            :raises KeyError: if a rule dict has no ``"type"`` entry.
            :raises Exception: if the ``"type"`` value is not supported.
            """
            print("setup_schema:" + cls.__name__.lower())

            import elasticsearch_dsl

            # Schema type name -> elasticsearch_dsl field class. "boolean" is
            # the name cerberus itself uses; "bool" is kept so existing
            # schemas that relied on it keep working.
            field_classes = {
                "integer": elasticsearch_dsl.Integer,
                "float": elasticsearch_dsl.Float,
                "string": elasticsearch_dsl.Text,
                "bool": elasticsearch_dsl.Boolean,
                "boolean": elasticsearch_dsl.Boolean,
                "date": elasticsearch_dsl.Date,
                "datetime": elasticsearch_dsl.Date,
                "number": elasticsearch_dsl.Integer,
                "binary": elasticsearch_dsl.Byte,
                "list": elasticsearch_dsl.Keyword,
            }

            for elem, rules in cls.schema.items():
                # the raw field __init__ parameters dict
                elastic = rules.get("elastic", {})
                type_name = rules["type"]
                # Only strings can be looked up; anything else (for example a
                # list of types) is unsupported, not a lookup error.
                field_class = None
                if isinstance(type_name, str):
                    field_class = field_classes.get(type_name)
                if field_class is None:
                    raise Exception(
                        "Wrong Datatype in schema: {!r} for field {!r}".format(
                            type_name, elem))
                setattr(cls, elem, field_class(**elastic))
                # remove the schema (raw) elastic key(s)
                rules.pop("elastic", None)
                rules.pop("elastictype", None)

            return cls
Example #9
0
class GeoCoding(PluginBase):
    """Plugin that tries to attach location data to a tweet."""

    data_schema = {
        'geotagged': es.Boolean(),
        'location': es.Object(Location),
        'coordinates': es.GeoPoint(),
    }

    def __init__(self, *args, **kwargs) -> None:
        """Build the Carmen resolver and location encoder, then init super."""
        # Carmen's default setup emits several warnings; silence them for
        # the duration of the setup only.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            self.geotagger = get_resolver(
                options={'place': {'allow_unknown_locations': True}})
            self.geotagger.load_locations()
            self.location_resolver = LocationEncoder()

        super().__init__(*args, **kwargs)  # type: ignore

    def process_tweet(self, tweet_json: Dict[str, Any]) -> Dict[str, Any]:
        """
        Try to resolve a location for the tweet and record the outcome.

        ``geotagged`` is always written. ``location`` is added whenever the
        resolver returns something, and ``coordinates`` (with ``geotagged``
        set to True) only when that location carries both a latitude and a
        longitude. The same dict that was passed in is returned.
        """
        LOG.debug('Attempting to geotag tweet')
        resolved = self.geotagger.resolve_tweet(tweet_json['_raw'])

        tweet_json['geotagged'] = False
        if not resolved:
            return tweet_json

        LOG.debug('  This tweet includes location information')
        location = self.location_resolver.default(resolved[1])
        tweet_json['location'] = location

        has_coordinates = 'latitude' in location and 'longitude' in location
        if has_coordinates:
            tweet_json['coordinates'] = {
                'lat': location['latitude'],
                'lon': location['longitude'],
            }
            tweet_json['geotagged'] = True
            LOG.debug('Geotagging completed!')

        return tweet_json
Example #10
0
class DataDocType(es.Document):
    """Elasticsearch test model.

    ``Document``-based model with an ``Index`` inner class; the documents
    live in the ``test`` index.
    """
    first_name = es.Keyword()
    last_name = es.Keyword()
    city = es.Text()
    skills = es.Keyword()
    birthday = es.Date()
    is_active = es.Boolean()
    score = es.Integer()
    location = es.GeoPoint()
    description = es.Text()


    class Index:
        name = 'test'
Example #11
0
class TopicModellingIndex(es.Document):
    """Elasticsearch document describing one topic-modelling run.

    Holds descriptive metadata, the date range and algorithm settings, the
    quality metrics, the ``tau_*`` coefficients and the nested ``Topic``
    entries. Stored in the index named by ``ES_INDEX_TOPIC_MODELLING`` and
    accessed through ``ES_CLIENT``.
    """

    # Descriptive metadata and status flags
    corpus = es.Keyword()
    source = es.Keyword()
    number_of_documents = es.Integer()
    is_ready = es.Boolean()
    has_topic_info = es.Boolean()
    name = es.Keyword()
    description = es.Text()
    datetime_created = es.Date()
    datetime_finished = es.Date()

    # Date range; presumably of the documents covered -- TODO confirm
    datetime_from = es.Date()
    datetime_to = es.Date()

    # Algorithm configuration; meta_parameters is a free-form object
    algorithm = es.Keyword()
    number_of_topics = es.Integer()
    hierarchical = es.Boolean()
    meta_parameters = es.Object()

    # Quality metrics
    perplexity = es.Float()
    purity = es.Float()
    contrast = es.Float()
    coherence = es.Float()

    # tau_* coefficients; presumably regularizer weights -- TODO confirm
    tau_smooth_sparse_theta = es.Float()
    tau_smooth_sparse_phi = es.Float()
    tau_decorrelator_phi = es.Float()
    tau_coherence_phi = es.Float()

    # Nested documents described by the Topic class
    topics = es.Nested(Topic)

    is_actualizable = es.Boolean()

    class Index:
        name = ES_INDEX_TOPIC_MODELLING
        using = ES_CLIENT
Example #12
0
class Text(es.InnerDoc):
    """Simple Elasticsearch DSL mapping of the text data this plugin will return.

    This is an ``InnerDoc``, so it is meant to be embedded in another
    document rather than indexed on its own.
    """

    full_text = es.Text()
    # pattern_* and vader_* are float scores; presumably produced by the
    # Pattern and VADER sentiment libraries -- TODO confirm in the plugin.
    pattern_polarity = es.Float()
    pattern_subjectivity = es.Float()
    short_text = es.Text()
    translated = es.Text()
    truncated = es.Boolean()
    tweet_length = es.Integer()
    vader_compound = es.Float()
    vader_compound_inverted = es.Float()
    vader_negative = es.Float()
    vader_neutral = es.Float()
    vader_positive = es.Float()
Example #13
0
class BaseDocument(dsl.DocType):
    """Base document class to build ElasticSearch documents.

    This is standard ``elasticsearch-dsl`` ``DocType`` class with
    already added fields for handling permissions.

    The two ``*_with_permissions`` fields are multi-valued keywords.

    """

    #: list of user ids with view permission on the object
    users_with_permissions = dsl.Keyword(multi=True)

    #: list of group ids with view permission on the object
    groups_with_permissions = dsl.Keyword(multi=True)

    #: identifies if object has public view permission assigned
    public_permission = dsl.Boolean()
Example #14
0
class Node(es.DocType):
    """
    Elastic document describing a node.

    Documents are stored in the ``nodes`` index. ``name`` and ``user.name``
    use the ``autocomplete`` analyzer with fielddata enabled.
    """

    node_type = es.Keyword()

    objectID = es.Keyword()

    name = es.Text(
        fielddata=True,
        analyzer=autocomplete
    )

    # NOTE(review): sub-fields of an Object are normally passed as
    # ``properties=``; ``fields=`` is the multi-field option. Confirm this
    # produces the intended mapping with the elasticsearch_dsl version in use.
    user = es.Object(
        fields={
            'id': es.Keyword(),
            'name': es.Text(
                fielddata=True,
                analyzer=autocomplete)
        }
    )

    description = es.Text()

    is_free = es.Boolean()

    # Same ``fields=`` usage as ``user`` above; the same review note applies.
    project = es.Object(
        fields={
            'id': es.Keyword(),
            'name': es.Keyword(),
            'url': es.Keyword(),
        }
    )

    media = es.Keyword()

    picture = es.Keyword()

    # Multi-valued keyword field
    tags = es.Keyword(multi=True)
    license_notes = es.Text()

    created_at = es.Date()
    updated_at = es.Date()

    class Meta:
        index = 'nodes'
Example #15
0
class EmbeddingIndex(es.Document):
    """Elasticsearch document describing one embedding run.

    Stored in the index named by ``ES_INDEX_EMBEDDING`` and accessed through
    ``ES_CLIENT``.
    """

    # Descriptive metadata and status
    corpus = es.Keyword()
    number_of_documents = es.Integer()
    is_ready = es.Boolean()
    name = es.Keyword()
    description = es.Text()
    datetime_created = es.Date()
    datetime_finished = es.Date()

    # Embedding configuration; meta_parameters is a free-form object
    by_unit = es.Keyword()  # Token/Word/Sentence/Text
    algorithm = es.Keyword()
    pooling = es.Keyword()
    meta_parameters = es.Object()

    class Index:
        name = ES_INDEX_EMBEDDING
        using = ES_CLIENT
Example #16
0
class KadastraalSubject(es.DocType):
    """Elasticsearch document for a cadastral subject (kadastraal subject).

    Stored in the index configured under
    ``settings.ELASTIC_INDICES['BRK_SUBJECT']``.
    """

    # Name: analyzed text with a verbatim ``raw`` sub-field and an ``ngram``
    # sub-field that uses different analyzers at index and search time.
    naam = es.Text(
        analyzer=analyzers.naam,
        fields={
            'raw': es.Keyword(),
            'ngram': es.Text(
                analyzer=analyzers.kad_sbj_naam,
                search_analyzer=analyzers.kad_obj_aanduiding_keyword)})

    # Boolean flag: "natural person" (natuurlijk persoon)
    natuurlijk_persoon = es.Boolean()
    # Surname (geslachtsnaam), analyzed with the same analyzer as ``naam``
    geslachtsnaam = es.Text(analyzer=analyzers.naam)
    order = es.Integer()

    subtype = es.Keyword()
    _display = es.Keyword()

    class Index:
        name = settings.ELASTIC_INDICES['BRK_SUBJECT']
Example #17
0
class META_DTM(es.Document):
    """Elasticsearch document holding the metadata fields declared below.

    Stored in the index named by ``ES_INDEX_META_DTM`` and accessed through
    ``ES_CLIENT``; the index is configured with 1 shard and 1 replica.
    """

    meta_name = es.Keyword()
    volume_days = es.Float()
    delta_days = es.Float()
    reset_index = es.Boolean()
    from_date = es.Date()
    to_date = es.Date()

    class Index:
        name = ES_INDEX_META_DTM
        using = ES_CLIENT

        settings = {
            "number_of_shards": 1,
            "number_of_replicas": 1,
        }
        # NOTE(review): this dict repeats the field types already declared on
        # the class above. Confirm that elasticsearch_dsl actually reads an
        # ``Index.mappings`` attribute; if not, it is dead configuration.
        mappings = {
            "properties": {
                "meta_name": {
                    "type": "keyword",
                },
                "volume_days": {
                    "type": "float",
                },
                "delta_days": {
                    "type": "float",
                },
                "reset_index": {
                    "type": "boolean",
                },
                "from_date": {
                    "type": "date"
                },
                "to_date": {
                    "type": "date"
                }
            },
        }
Example #18
0
class Inschrijving(es.Document):
    """
    Elastic data of 'vestigingen' or 'mac'
    from handelsregister

    Stored in the index configured under
    ``settings.ELASTIC_INDICES['DS_HR_INDEX']`` with doc type ``vestiging``.
    """
    maatschappelijke_activiteit_id = es.Keyword()
    vestiging_id = es.Keyword()

    dataset = es.Keyword()

    kvk_nummer = es.Keyword()
    handelsnaam = es.Keyword()
    datum_aanvang = es.Date()
    eigenaar_naam = es.Keyword()
    eigenaar_id = es.Keyword()
    non_mailing = es.Boolean()

    aantal_werkzame_personen = es.Integer()
    rechtsvorm = es.Keyword()

    # Address information
    # Visiting address (bezoekadres_*)
    bezoekadres_volledig_adres = es.Keyword()
    bezoekadres_correctie = es.Boolean()
    bezoekadres_afgeschermd = es.Boolean()
    bezoekadres_openbare_ruimte = es.Keyword()
    bezoekadres_huisnummer = es.Integer()
    bezoekadres_huisletter = es.Keyword()
    bezoekadres_huisnummertoevoeging = es.Keyword()
    bezoekadres_postcode = es.Keyword()
    bezoekadres_plaats = es.Keyword()

    # Area codes and names for the visiting address
    bezoekadres_buurt_code = es.Keyword()
    bezoekadres_buurt_naam = es.Keyword()
    bezoekadres_buurtcombinatie_code = es.Keyword()
    bezoekadres_buurtcombinatie_naam = es.Keyword()
    bezoekadres_ggw_code = es.Keyword()
    bezoekadres_ggw_naam = es.Keyword()
    bezoekadres_gsg_naam = es.Keyword()
    bezoekadres_stadsdeel_code = es.Keyword()
    bezoekadres_stadsdeel_naam = es.Keyword()

    # Postal address (postadres_*)
    postadres_volledig_adres = es.Keyword()
    postadres_correctie = es.Boolean()
    postadres_afgeschermd = es.Boolean()
    postadres_openbare_ruimte = es.Keyword()
    postadres_huisnummer = es.Integer()
    postadres_huisletter = es.Keyword()
    postadres_huisnummertoevoeging = es.Keyword()
    postadres_postcode = es.Keyword()
    postadres_plaats = es.Keyword()

    # And the bag numid
    bag_numid = es.Keyword()
    # NOTE(review): chained assignment -- both names are bound to the same
    # Keyword instance, so the document gets two fields. Confirm that
    # ``identificatie`` is meant to be a separate field and not a leftover.
    adresseerbaar_object_id = identificatie = es.Keyword()
    centroid = es.GeoPoint()

    # Categories
    hoofdcategorie = es.Keyword(multi=True)
    subcategorie = es.Keyword(multi=True)

    # SBI codes
    sbi_code = es.Text(
        multi=True,
        fielddata=True,
        analyzer=autocomplete,
    )

    sbi_omschrijving = es.Keyword(multi=True)

    sbi_l1 = es.Keyword(multi=True)
    sbi_l2 = es.Keyword(multi=True)
    sbi_l3 = es.Keyword(multi=True)
    sbi_l4 = es.Keyword(multi=True)
    sbi_l5 = es.Keyword(multi=True)

    # special legal status (bijzondere rechtstoestand)

    # status = es.Keyword()

    bijzondere_rechtstoestand = es.Keyword()

    # NOTE(review): ``doc_type`` is declared on both Meta and Index;
    # presumably kept for compatibility across elasticsearch_dsl versions --
    # confirm which one the installed version reads.
    class Meta:
        all = es.MetaField(enabled=False)
        doc_type = 'vestiging'

    class Index:
        doc_type = 'vestiging'
        name = settings.ELASTIC_INDICES['DS_HR_INDEX']
Example #19
0
class Job(es.DocType):
    """Job offer document with French-language and technology analyzers.

    The analysis chain (token filters, tokenizer, analyzers) is declared
    first because the field declarations further down refer to it.
    """

    class Meta:
        index = 'jobs'
        doc_type = 'job-offer'

    # --- French analysis chain -------------------------------------------
    french_elision = es.token_filter(
        'french_elision',
        type='elision',
        articles_case=True,
        articles=[
            'l', 'm', 't', 'qu', 'n', 's', 'j',
            'd', 'c', 'jusqu', 'quoiqu', 'lorsqu',
            'puisqu'
        ])

    french_stopwords = es.token_filter(
        'french_stopwords', type='stop', stopwords='_french_')

    # Do not include this filter if keywords is empty
    french_keywords = es.token_filter(
        'french_keywords', type='keyword_marker', keywords=[])

    french_stemmer = es.token_filter(
        'french_stemmer', type='stemmer', language='light_french')

    french_analyzer = es.analyzer(
        'french_analyzer',
        tokenizer='standard',
        filter=[
            'lowercase',
            'asciifolding',
            french_elision,
            french_stopwords,
            # french_keywords,
            french_stemmer
        ],
        char_filter=['html_strip'])

    # --- Technology-name analysis chain ----------------------------------
    technologies_tokenizer = es.tokenizer(
        'comma_tokenizer', type='pattern', pattern=' |,|, ')

    technologies_synonyms_filter = es.token_filter(
        'technologies_synonyms',
        type='synonym',
        synonyms=[
            'c => c_language', 'c++, cpp => cpp_language',
            'c/c++, c/cpp => c_language', 'c/c++, c/cpp => cpp_language',
            'c#, c♯, csharp => csharp_language',
            'f#, f♯, fsharp => fsharp_language', 'c#, c♯, csharp => dotnet',
            'f#, f♯, fsharp => dotnet', '.net => dotnet'
        ])

    technologies_analyzer = es.analyzer(
        'technologies_analyzer',
        tokenizer=technologies_tokenizer,
        filter=['lowercase', 'asciifolding', technologies_synonyms_filter])

    company_name_analyzer = es.analyzer(
        'company_name_analyzer',
        tokenizer='standard',
        filter=['lowercase', 'asciifolding'])

    # --- Document fields --------------------------------------------------
    id = es.Integer()

    url = es.String(index='no')
    source = es.String(index='not_analyzed')

    # title and description are analyzed as French text and additionally
    # through the technologies analyzer in a ``technologies`` sub-field.
    title = es.String(
        analyzer=french_analyzer,
        fields={'technologies': es.String(analyzer=technologies_analyzer)})

    description = es.String(
        analyzer=french_analyzer,
        fields={'technologies': es.String(analyzer=technologies_analyzer)})

    company = es.String(analyzer=company_name_analyzer)

    company_url = es.String(index='no')

    address = es.String(analyzer=french_analyzer)
    address_is_valid = es.Boolean()

    tags = es.Nested(
        doc_class=Tag,
        properties=dict(tag=es.String(index='not_analyzed'),
                        weight=es.Integer()))

    publication_datetime = es.Date()
    publication_datetime_is_fake = es.Boolean()

    crawl_datetime = es.Date()

    geolocation = es.GeoPoint()
    geolocation_is_valid = es.Boolean()

    def __init__(self, meta=None, **kwargs):
        """Initialise the document and rewrite the doc type's index name.

        The current index name is passed through ``compute_index_name`` and
        stored back on ``self._doc_type``.
        """
        super(Job, self).__init__(meta, **kwargs)
        self._doc_type.index = compute_index_name(self.index)

    @property
    def index(self):
        """Index name currently stored on the doc type."""
        return self._doc_type.index

    @property
    def doc_type(self):
        """Name of the doc type."""
        return self._doc_type.name

    @property
    def published(self):
        """Publication datetime formatted with ``format_date``."""
        return format_date(self.publication_datetime, locale='FR_fr')

    @property
    def published_in_days(self):
        """Time since publication, formatted at day granularity."""
        # TODO: bugfix
        return format_timedelta(
            datetime.now() - self.publication_datetime,
            granularity='day',
            locale='en_US')

    @property
    def alltags(self):
        """``Tag2`` objects for tags absent from the global ``condition_tags``."""
        if not self.tags:
            return []
        return [
            Tag2(entry['tag'], entry['weight'])
            for entry in self.tags
            if entry['tag'] not in condition_tags
        ]

    @property
    def condition_tags(self):
        """``Tag2`` objects, with CSS, for tags in the global ``condition_tags``.

        Inside this method the name ``condition_tags`` resolves to the
        module-level collection, not to this property.
        """
        if not self.tags:
            return []
        return [
            Tag2(entry['tag'], entry['weight'], Tag2.get_css(entry['tag']))
            for entry in self.tags
            if entry['tag'] in condition_tags
        ]
Example #20
0
class DictionaryWord(es.Document):
    """Elasticsearch document for a single word entry of a dictionary.

    Carries the word and its normal form, a set of boolean flags, absolute
    and relative frequencies, and capitalisation ratios. Stored in the index
    named by ``ES_INDEX_DICTIONARY_WORD`` (3 shards, 1 replica) and accessed
    through ``ES_CLIENT``.
    """

    dictionary = es.Keyword()
    word = es.Keyword()
    word_normal = es.Keyword()

    # Word properties
    is_in_pymorphy2_dict = es.Boolean()
    is_multiple_normals_in_pymorphy2 = es.Boolean()
    is_stop_word = es.Boolean()
    is_latin = es.Boolean()
    is_kazakh = es.Boolean()
    n_gram_len = es.Integer()
    pos_tag = es.Keyword()
    word_len = es.Integer()

    # Absolute frequencies
    word_frequency = es.Integer()
    word_normal_frequency = es.Integer()
    document_frequency = es.Integer()
    document_normal_frequency = es.Integer()

    # Relative frequencies
    word_frequency_relative = es.Float()
    word_normal_frequency_relative = es.Float()
    document_frequency_relative = es.Float()
    document_normal_frequency_relative = es.Float()

    # Capitalisation ratios
    word_first_capital_ratio = es.Float()
    word_normal_first_capital_ratio = es.Float()

    class Index:
        name = ES_INDEX_DICTIONARY_WORD  # f"{ES_INDEX_DICTIONARY_WORD}_{name}{_temp}"
        using = ES_CLIENT

        settings = {
            "number_of_shards": 3,
            "number_of_replicas": 1,
        }

        # NOTE(review): this dict repeats the field types already declared on
        # the class above. Confirm that elasticsearch_dsl actually reads an
        # ``Index.mappings`` attribute; if not, it is dead configuration.
        mappings = {
            "properties": {
                "dictionary": {
                    "type": "keyword",
                },
                "word": {
                    "type": "keyword",
                },
                "word_normal": {
                    "type": "keyword",
                },
                "is_in_pymorphy2_dict": {
                    "type": "boolean",
                },
                "is_multiple_normals_in_pymorphy2": {
                    "type": "boolean",
                },
                "is_stop_word": {
                    "type": "boolean",
                },
                "is_latin": {
                    "type": "boolean",
                },
                "is_kazakh": {
                    "type": "boolean",
                },
                "n_gram_len": {
                    "type": "integer",
                },
                "pos_tag": {
                    "type": "keyword",
                },
                "word_len": {
                    "type": "integer",
                },
                "word_frequency": {
                    "type": "integer",
                },
                "word_normal_frequency": {
                    "type": "integer",
                },
                "document_frequency": {
                    "type": "integer",
                },
                "document_normal_frequency": {
                    "type": "integer",
                },
                "word_frequency_relative": {
                    "type": "float",
                },
                "word_normal_frequency_relative": {
                    "type": "float",
                },
                "document_frequency_relative": {
                    "type": "float",
                },
                "document_normal_frequency_relative": {
                    "type": "float",
                },
                "word_first_capital_ratio": {
                    "type": "float",
                },
                "word_normal_first_capital_ratio": {
                    "type": "float",
                },
            },
        }
Example #21
0
class Nummeraanduiding(es.DocType):
    """
    All bag objects should have one or more addresses

    A nummeraanduiding (number designation), colloquially also called an
    address, is a designation of a verblijfsobject (residence object),
    standplaats (pitch) or ligplaats (berth), assigned as such by the
    competent municipal body.

    [Stelselpedia](http://www.amsterdam.nl/stelselpedia/bag-index/catalogus-bag/objectklasse-2/)
    """
    # Street name variants. Each is analyzed text with a verbatim ``raw``
    # sub-field and an ``ngram_edge`` sub-field that is indexed with the
    # autocomplete analyzer and searched with the standard analyzer.
    straatnaam = es.Text(analyzer=analyzers.adres,
                         fields={
                             'raw':
                             es.Keyword(),
                             'ngram_edge':
                             es.Text(analyzer=analyzers.autocomplete,
                                     search_analyzer='standard')
                         })

    straatnaam_keyword = es.Keyword()

    straatnaam_nen = es.Text(analyzer=analyzers.adres,
                             fields={
                                 'raw':
                                 es.Keyword(),
                                 'ngram_edge':
                                 es.Text(analyzer=analyzers.autocomplete,
                                         search_analyzer='standard')
                             })

    straatnaam_nen_keyword = es.Keyword()

    # Unlike the other street name variants, this one also carries a
    # lowercase-normalized ``keyword`` sub-field.
    straatnaam_ptt = es.Text(analyzer=analyzers.adres,
                             fields={
                                 'raw':
                                 es.Keyword(),
                                 'ngram_edge':
                                 es.Text(analyzer=analyzers.autocomplete,
                                         search_analyzer='standard'),
                                 'keyword':
                                 es.Keyword(normalizer=analyzers.lowercase),
                             })

    straatnaam_ptt_keyword = es.Keyword()

    adres = es.Text(analyzer=analyzers.adres,
                    fields={
                        'raw':
                        es.Keyword(),
                        'ngram_edge':
                        es.Text(analyzer=analyzers.autocomplete,
                                search_analyzer='standard'),
                    })

    # comp_address* fields share one shape: analyzed text with ``raw`` and
    # ``ngram`` sub-fields (note ``ngram`` here versus ``ngram_edge`` above).
    comp_address = es.Text(analyzer=analyzers.adres,
                           fields={
                               'raw':
                               es.Keyword(),
                               'ngram':
                               es.Text(analyzer=analyzers.autocomplete,
                                       search_analyzer='standard')
                           })
    comp_address_nen = es.Text(analyzer=analyzers.adres,
                               fields={
                                   'raw':
                                   es.Keyword(),
                                   'ngram':
                                   es.Text(analyzer=analyzers.autocomplete,
                                           search_analyzer='standard')
                               })
    comp_address_ptt = es.Text(analyzer=analyzers.adres,
                               fields={
                                   'raw':
                                   es.Keyword(),
                                   'ngram':
                                   es.Text(analyzer=analyzers.autocomplete,
                                           search_analyzer='standard')
                               })
    comp_address_pcode = es.Text(analyzer=analyzers.adres,
                                 fields={
                                     'raw':
                                     es.Keyword(),
                                     'ngram':
                                     es.Text(analyzer=analyzers.autocomplete,
                                             search_analyzer='standard')
                                 })

    # House number: integer with an analyzed ``variation`` text sub-field
    huisnummer = es.Integer(
        fields={'variation': es.Text(analyzer=analyzers.huisnummer)})

    # House number addition (toevoeging)
    toevoeging = es.Text(analyzer=analyzers.toevoeging,
                         fields={'keyword': es.Keyword()})

    # to return official bag fields
    bag_toevoeging = es.Keyword()
    bag_huisletter = es.Keyword()
    woonplaats = es.Keyword()

    # Sub-fields come from ``postcode_fields``, defined elsewhere
    postcode = es.Text(
        analyzer=analyzers.postcode,
        fields=postcode_fields,
    )

    order = es.Integer()

    # Boolean flag: "main address" (hoofdadres)
    hoofdadres = es.Boolean()
    # Nested status: lowercase-normalized code plus free-text description
    status = es.Nested(
        properties={
            'code': es.Keyword(normalizer=analyzers.lowercase),
            'omschrijving': es.Text()
        })

    vbo_status = es.Nested(
        properties={
            'code': es.Keyword(normalizer=analyzers.lowercase),
            'omschrijving': es.Text()
        })

    subtype = es.Keyword()
    _display = es.Keyword()

    # Identifier fields: autocomplete-analyzed text with a verbatim ``raw``
    # sub-field and a ``nozero`` sub-field using the nozero analyzer.
    landelijk_id = es.Text(analyzer=analyzers.autocomplete,
                           fields={
                               'raw': es.Keyword(),
                               'nozero': es.Text(analyzer=analyzers.nozero)
                           })
    adresseerbaar_object_id = es.Text(  # Is landelijk_id for related verblijfsobject, ligplaats or standplaats
        analyzer=analyzers.autocomplete,
        fields={
            'raw': es.Keyword(),
            'nozero': es.Text(analyzer=analyzers.nozero)
        })

    class Index:
        name = settings.ELASTIC_INDICES['NUMMERAANDUIDING']
Example #22
0
class ResponseDocType(FjordDocType):
    """Search document type built from ``Response`` objects.

    The class attributes declare the mapping; ``extract_doc()`` produces the
    dict of values that is indexed for a given ``Response``.
    """
    id = es_dsl.Integer()
    happy = es_dsl.Boolean()
    api = es_dsl.Integer()
    url = es_dsl.String(index='not_analyzed')
    url_domain = es_dsl.String(index='not_analyzed')
    has_email = es_dsl.Boolean()
    description = es_dsl.String(analyzer='snowball')
    category = es_dsl.String(index='not_analyzed')
    description_bigrams = es_dsl.String(index='not_analyzed')
    description_terms = es_dsl.String(analyzer='standard')
    user_agent = es_dsl.String(index='not_analyzed')
    product = es_dsl.String(index='not_analyzed')
    channel = es_dsl.String(index='not_analyzed')
    version = es_dsl.String(index='not_analyzed')
    browser = es_dsl.String(index='not_analyzed')
    browser_version = es_dsl.String(index='not_analyzed')
    platform = es_dsl.String(index='not_analyzed')
    locale = es_dsl.String(index='not_analyzed')
    country = es_dsl.String(index='not_analyzed')
    device = es_dsl.String(index='not_analyzed')
    manufacturer = es_dsl.String(index='not_analyzed')
    source = es_dsl.String(index='not_analyzed')
    campaign = es_dsl.String(index='not_analyzed')
    # extract_doc() emits the key 'source_campaign' ("<source>::<campaign>").
    # Declaring it here keeps that value as a single not_analyzed term rather
    # than leaving the field to dynamic mapping.
    # NOTE(review): an index that already holds a dynamically mapped
    # 'source_campaign' field presumably needs a reindex -- confirm.
    source_campaign = es_dsl.String(index='not_analyzed')
    # Misspelled legacy declaration; nothing in extract_doc() fills it. Kept
    # so existing references to the attribute keep working.
    souce_campaign = es_dsl.String(index='not_analyzed')
    organic = es_dsl.Boolean()
    created = es_dsl.Date()

    docs = ResponseDocTypeManager()

    class Meta:
        pass

    def mlt(self):
        """Returns a search with a morelikethis query for docs like this"""
        # Short responses tend to not repeat any words, so then MLT
        # returns nothing. This fixes that by setting min_term_freq to
        # 1. Longer responses tend to repeat important words, so we can
        # set min_term_freq to 2.
        num_words = len(self.description.split(' '))
        if num_words > 40:
            min_term_freq = 2
        else:
            min_term_freq = 1

        # Only look for similar docs within the same product/platform when
        # this doc has those values set.
        s = self.search()
        if self.product:
            s = s.filter('term', product=self.product)
        if self.platform:
            s = s.filter('term', platform=self.platform)

        s = s.query('more_like_this',
                    fields=['description'],
                    docs=[{
                        '_index': get_index_name(),
                        '_type': self._doc_type.name,
                        '_id': self.id
                    }],
                    min_term_freq=min_term_freq,
                    stop_words=list(ANALYSIS_STOPWORDS))
        return s

    @classmethod
    def get_model(cls):
        """Returns the model class this document type is built from"""
        return Response

    @classmethod
    def public_fields(cls):
        """Fields that can be publicly-visible

        .. Note::

           Do NOT include fields that have PII in them.

        """
        return ('id', 'happy', 'api', 'url_domain', 'has_email', 'description',
                'category', 'description_bigrams', 'user_agent', 'product',
                'version', 'platform', 'locale', 'source', 'campaign',
                'organic', 'created')

    @property
    def truncated_description(self):
        """Shorten feedback for dashboard view."""
        return smart_truncate(self.description, length=500)

    @classmethod
    def extract_doc(cls, resp, with_id=True):
        """Converts a Response to a dict of values

        This can be used with ``ResponseDocType.from_obj()`` to create a
        ``ResponseDocType`` object or it can be used for indexing.

        :arg resp: a Response object
        :arg with_id: whether or not to include the ``_id`` value--include
            it when you're bulk indexing

        :returns: a dict

        """
        doc = {
            'id': resp.id,
            'happy': resp.happy,
            'api': resp.api,
            'url': resp.url,
            'url_domain': resp.url_domain,
            # Only expose whether an email was given, never the address.
            'has_email': bool(resp.user_email),
            'description': resp.description,
            'user_agent': resp.user_agent,
            'product': resp.product,
            'channel': resp.channel,
            'version': resp.version,
            'browser': resp.browser,
            'browser_version': resp.browser_version,
            'platform': resp.platform,
            'locale': resp.locale,
            'country': resp.country,
            'device': resp.device,
            'manufacturer': resp.manufacturer,
            'source': resp.source,
            'campaign': resp.campaign,
            # Missing halves are replaced with '--' so the joined value
            # always has both parts.
            'source_campaign':
            '::'.join([(resp.source or '--'), (resp.campaign or '--')]),
            # A response without a campaign counts as organic.
            'organic': (not resp.campaign),
            'created': resp.created
        }

        # We only compute bigrams for english because the analysis
        # uses English stopwords, stemmers, ...
        if resp.locale.startswith(u'en') and resp.description:
            doc['description_bigrams'] = compute_grams(resp.description)
        else:
            doc['description_bigrams'] = []

        if with_id:
            doc['_id'] = doc['id']
        return doc
class AWSDetailedLineitem(dsl.DocType):
    """Document type for records stored in the ``awsdetailedlineitem`` index.

    The attributes below declare the field mapping. The classmethods of this
    class run filtered searches and aggregations against the same index.
    """

    class Meta:
        # Index this document type is bound to.
        index = 'awsdetailedlineitem'

    # String fields declared with index='not_analyzed' are the ones the
    # queries in this class match with term/terms filters and terms
    # aggregations (linked_account_id, product_name, resource_id, ...).
    availability_zone = dsl.String(index='not_analyzed')
    cost = dsl.Double()
    un_blended_cost = dsl.Double()
    item_description = dsl.String(index='not_analyzed')
    linked_account_id = dsl.String(index='not_analyzed')
    # The only string field declared without index='not_analyzed'.
    operation = dsl.String()
    payer_account_id = dsl.String(index='not_analyzed')
    pricing_plan_id = dsl.Long()
    product_name = dsl.String(index='not_analyzed')
    rate = dsl.Double()
    un_blended_rate = dsl.Double()
    rate_id = dsl.Long()
    record_id = dsl.String(index='not_analyzed')
    reserved_instance = dsl.Boolean()
    resource_id = dsl.String(index='not_analyzed')
    subscription_id = dsl.Long()
    # Key/value pair; queries in this class address it as 'tag.key' and
    # 'tag.value'.
    tag = dsl.Object(
        properties={
            'key': dsl.String(index='not_analyzed'),
            'value': dsl.String(index='not_analyzed')
        })
    usage_end_date = dsl.Date(format='strict_date_optional_time||epoch_millis')
    usage_quantity = dsl.Double()
    # Date field used by every range filter and date_histogram in this class.
    usage_start_date = dsl.Date(
        format='strict_date_optional_time||epoch_millis')
    usage_type = dsl.String(index='not_analyzed')

    @classmethod
    @with_cache(ttl=3600 * 3, worker_refresh=True)
    def keys_has_data(cls, keys, date_from=None, date_to=None):
        """Tell whether any line item exists for the given account key(s).

        :arg keys: a linked account id, or a list of them
        :arg date_from: optional lower bound on ``usage_start_date``; the
            date range is only applied when this is given
        :arg date_to: optional upper bound, defaulting to the current UTC
            time; unused when ``date_from`` is not given

        :returns: True when the search reports at least one hit

        """
        if not date_to:
            date_to = datetime.utcnow()
        account_ids = keys if isinstance(keys, list) else [keys]
        query = cls.search().filter('terms', linked_account_id=account_ids)
        if date_from:
            window = {
                'from': date_from.isoformat(),
                'to': date_to.isoformat(),
            }
            query = query.filter('range', usage_start_date=window)
        # size=0: only the hit count is needed, not the documents.
        response = client.search(index='awsdetailedlineitem',
                                 body=query.to_dict(),
                                 size=0,
                                 request_timeout=60)
        hit_count = response['hits']['total']
        return hit_count > 0

    @classmethod
    @with_cache(is_json=False, ret=lambda x: datetime.strptime(x, "%Y-%m-%d"))
    def get_first_date(cls, keys):
        """Find the earliest ``usage_start_date`` for the account key(s).

        :arg keys: a linked account id, or a list of them

        :returns: the part of the earliest ``usage_start_date`` that
            precedes the ``T``, or None when the search has no hits

        """
        account_ids = keys if isinstance(keys, list) else [keys]
        query = cls.search().filter('terms', linked_account_id=account_ids)
        # Ascending sort puts the oldest line item first; one hit is enough.
        query = query.sort('usage_start_date')
        response = client.search(index='awsdetailedlineitem',
                                 body=query.to_dict(),
                                 size=1,
                                 request_timeout=60)
        hits = response['hits']
        if hits['total'] == 0:
            return None
        oldest = hits['hits'][0]['_source']['usage_start_date']
        return oldest.split('T')[0]

    @classmethod
    @with_cache(is_json=False, ret=lambda x: datetime.strptime(x, "%Y-%m-%d"))
    def get_last_date(cls, keys, limit=None):
        """Find the latest ``usage_start_date`` for the account key(s).

        :arg keys: a linked account id, or a list of them
        :arg limit: optional upper bound; when given, only line items whose
            ``usage_start_date`` is within ``limit.isoformat()`` are searched

        :returns: the part of the latest ``usage_start_date`` that precedes
            the ``T``, or None when the search has no hits

        """
        account_ids = keys if isinstance(keys, list) else [keys]
        query = cls.search().filter('terms', linked_account_id=account_ids)
        if limit:
            upper_bound = {'to': limit.isoformat()}
            query = query.filter('range', usage_start_date=upper_bound)
        # Descending sort puts the newest line item first; one hit is enough.
        query = query.sort('-usage_start_date')
        response = client.search(index='awsdetailedlineitem',
                                 body=query.to_dict(),
                                 size=1,
                                 request_timeout=60)
        hits = response['hits']
        if hits['total'] == 0:
            return None
        newest = hits['hits'][0]['_source']['usage_start_date']
        return newest.split('T')[0]

    @classmethod
    def get_first_to_now_date(cls, keys):
        """List month-spaced dates from the first recorded date until now.

        :arg keys: a linked account id, or a list of them

        :returns: a list starting at ``get_first_date(keys)`` and advancing
            one month at a time while the value is earlier than the current
            UTC time; an empty list when there is no first date

        """
        first = cls.get_first_date(keys)
        if first is None:
            # get_first_date() yields None when the account has no line
            # items; comparing None with a datetime below would raise
            # TypeError, so report "no dates" instead.
            return []

        now = datetime.utcnow()
        dates = []
        d = first
        while d < now:
            dates.append(d)
            d += relativedelta(months=1)
        return dates

    @classmethod
    def get_first_to_last_date(cls, keys):
        """List month-spaced dates from the first to the last recorded date.

        :arg keys: a linked account id, or a list of them

        :returns: a list starting at ``get_first_date(keys)`` and advancing
            one month at a time while the value is strictly earlier than
            ``get_last_date(keys)``; an empty list when either date is
            missing

        """
        first = cls.get_first_date(keys)
        if first is None:
            # No line items at all: nothing to enumerate, and comparing
            # None with a datetime below would raise TypeError.
            return []
        last = cls.get_last_date(keys)
        if last is None:
            return []

        dates = []
        d = first
        # Strict comparison: the last date itself is not included.
        while d < last:
            dates.append(d)
            d += relativedelta(months=1)
        return dates

    @classmethod
    @with_cache(6 * 3600)
    def get_available_tags(cls, keys, only_with_data=None, product_name=None):
        """List the user tag names found on the account key(s)' line items.

        :arg keys: a linked account id, or a list of them
        :arg only_with_data: when truthy, it is passed to
            ``AWSStat.latest_hourly_cpu_usage_by_tag`` and
            ``AWSStat.latest_daily_cpu_usage_by_tag`` and a tag is kept only
            if its name appears under ``'tags'`` in either result
        :arg product_name: optional product name to restrict the search to

        :returns: ``dict(tags=[...])`` with the tag names sorted

        """
        s = cls.search()
        s = s.filter(
            'terms',
            linked_account_id=keys if isinstance(keys, list) else [keys])
        if product_name:
            s = s.filter('term', product_name=product_name)
        s.aggs.bucket('tag_key', 'terms', field='tag.key')
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)

        tags = []
        for tag in res['aggregations']['tag_key']['buckets']:
            # Only keys carrying the 'user:' prefix are reported.
            if not tag['key'].startswith('user:'):
                continue
            # The name is the segment right after the 'user:' prefix.
            name = tag['key'].split(':')[1]
            if (not only_with_data
                    or name in AWSStat.latest_hourly_cpu_usage_by_tag(
                        only_with_data)['tags']
                    or name in AWSStat.latest_daily_cpu_usage_by_tag(
                        only_with_data)['tags']):
                tags.append(name)
        tags.sort()
        return dict(tags=tags)

    @classmethod
    @with_cache(ttl=6 * 3600)
    def get_cost_by_tag(cls, keys, tag, date_from=None, date_to=None):
        """Sum the cost per value of one user tag over a date range.

        :arg keys: a linked account id, or a list of them
        :arg tag: tag name; matched against ``tag.key`` as ``user:<tag>``
        :arg date_from: start of the range; defaults to the first instant
            of the current UTC month
        :arg date_to: end of the range; defaults to the last microsecond of
            ``date_from``'s month

        :returns: a dict with ``tags`` (list of ``tag_value``/``cost``
            dicts) and ``total_cost``

        """
        if not date_from:
            date_from = datetime.utcnow().replace(
                day=1, hour=0, minute=0, second=0, microsecond=0)
        if not date_to:
            last_day = calendar.monthrange(date_from.year, date_from.month)[1]
            date_to = date_from.replace(day=last_day,
                                        hour=23,
                                        minute=59,
                                        second=59,
                                        microsecond=999999)

        account_ids = keys if isinstance(keys, list) else [keys]
        window = {
            'from': date_from.isoformat(),
            'to': date_to.isoformat(),
        }
        query = cls.search()
        query = query.filter('terms', linked_account_id=account_ids)
        query = query.filter('term', **{'tag.key': 'user:{}'.format(tag)})
        query = query.filter('range', usage_start_date=window)

        # One overall sum, plus one sum per distinct tag value.
        query.aggs.bucket('total_cost', 'sum', field='cost')
        per_value = query.aggs.bucket('tag_value',
                                      'terms',
                                      field='tag.value',
                                      size=0x7FFFFFFF)
        per_value.bucket('cost', 'sum', field='cost')

        response = client.search(index='awsdetailedlineitem',
                                 body=query.to_dict(),
                                 size=0,
                                 request_timeout=60)
        aggregations = response['aggregations']

        tags = []
        for bucket in aggregations['tag_value']['buckets']:
            tags.append({
                'tag_value': bucket['key'],
                'cost': bucket['cost']['value'],
            })
        return dict(tags=tags,
                    total_cost=aggregations['total_cost']['value'])

    @classmethod
    @with_cache(ttl=6 * 3600)
    def get_cost(cls, keys, date_from, date_to=None):
        """Sum the cost of the account key(s)' line items over a date range.

        :arg keys: a linked account id, or a list of them
        :arg date_from: start of the range; a falsy value falls back to the
            first instant of the current UTC month
        :arg date_to: end of the range; defaults to the last microsecond of
            ``date_from``'s day

        :returns: ``dict(total_cost=...)``

        """
        if not date_from:
            date_from = datetime.utcnow().replace(
                day=1, hour=0, minute=0, second=0, microsecond=0)
        if not date_to:
            date_to = date_from.replace(
                hour=23, minute=59, second=59, microsecond=999999)

        account_ids = keys if isinstance(keys, list) else [keys]
        window = {
            'from': date_from.isoformat(),
            'to': date_to.isoformat(),
        }
        query = cls.search()
        query = query.filter('terms', linked_account_id=account_ids)
        query = query.filter('range', usage_start_date=window)
        query.aggs.bucket('total_cost', 'sum', field='cost')

        response = client.search(index='awsdetailedlineitem',
                                 body=query.to_dict(),
                                 size=0,
                                 request_timeout=60)
        total = response['aggregations']['total_cost']['value']
        return dict(total_cost=total)

    @classmethod
    @with_cache()
    def get_monthly_cost_by_tag(cls, keys, tag, date_from=None, date_to=None):
        """Sum the cost per month and per value of one user tag.

        :arg keys: a linked account id, or a list of them
        :arg tag: tag name; matched against ``tag.key`` as ``user:<tag>``
        :arg date_from: start of the range; defaults to the first instant
            of the current UTC month
        :arg date_to: end of the range; defaults to the last microsecond of
            ``date_from``'s month

        :returns: ``dict(months=[...])``; each entry has ``month``,
            ``tags`` (list of ``tag_value``/``cost`` dicts) and
            ``total_cost``

        """
        if not date_from:
            date_from = datetime.utcnow().replace(
                day=1, hour=0, minute=0, second=0, microsecond=0)
        if not date_to:
            last_day = calendar.monthrange(date_from.year, date_from.month)[1]
            date_to = date_from.replace(day=last_day,
                                        hour=23,
                                        minute=59,
                                        second=59,
                                        microsecond=999999)

        account_ids = keys if isinstance(keys, list) else [keys]
        window = {
            'from': date_from.isoformat(),
            'to': date_to.isoformat(),
        }
        query = cls.search()
        query = query.filter('terms', linked_account_id=account_ids)
        query = query.filter('term', **{'tag.key': 'user:{}'.format(tag)})
        query = query.filter('range', usage_start_date=window)

        # Month buckets, each holding its own total and a per-value split.
        per_month = query.aggs.bucket('intervals',
                                      'date_histogram',
                                      field='usage_start_date',
                                      interval='month',
                                      min_doc_count=1)
        per_month.bucket('total_cost', 'sum', field='cost')
        per_value = per_month.bucket('tag_value',
                                     'terms',
                                     field='tag.value',
                                     size=0x7FFFFFFF)
        per_value.bucket('cost', 'sum', field='cost')

        response = client.search(index='awsdetailedlineitem',
                                 body=query.to_dict(),
                                 size=0,
                                 request_timeout=60)

        months = []
        for interval in response['aggregations']['intervals']['buckets']:
            value_costs = []
            for bucket in interval['tag_value']['buckets']:
                value_costs.append({
                    'tag_value': bucket['key'],
                    'cost': bucket['cost']['value'],
                })
            # Date part of the bucket key with its last three characters
            # dropped (the "-DD" of a YYYY-MM-DD date).
            month_label = interval['key_as_string'].split('T')[0][:-3]
            months.append({
                'month': month_label,
                'tags': value_costs,
                'total_cost': interval['total_cost']['value'],
            })
        return dict(months=months)

    @classmethod
    @with_cache()
    def get_cost_by_product(cls,
                            key,
                            date_from=None,
                            date_to=None,
                            without_discount=False,
                            only_discount=False,
                            size=0x7FFFFFFF):
        """Sum the cost per product for one account over a date range.

        :arg key: a single linked account id
        :arg date_from: start of the range; defaults to the first instant
            of the current UTC month
        :arg date_to: end of the range; defaults to the last microsecond of
            ``date_from``'s month
        :arg without_discount: exclude line items whose description is
            ``PAR_APN_ProgramFee_2500``
        :arg only_discount: keep only line items with that description
        :arg size: maximum number of product buckets requested

        :returns: ``dict(products=[...])``; each entry has ``product`` and
            ``cost``, ordered by descending cost

        """
        if not date_from:
            date_from = datetime.utcnow().replace(
                day=1, hour=0, minute=0, second=0, microsecond=0)
        if not date_to:
            last_day = calendar.monthrange(date_from.year, date_from.month)[1]
            date_to = date_from.replace(day=last_day,
                                        hour=23,
                                        minute=59,
                                        second=59,
                                        microsecond=999999)

        window = {
            'from': date_from.isoformat(),
            'to': date_to.isoformat(),
        }
        query = cls.search()
        query = query.filter('term', linked_account_id=key)
        query = query.filter('range', usage_start_date=window)
        if without_discount:
            not_program_fee = ~dsl.Q('term',
                                     item_description='PAR_APN_ProgramFee_2500')
            query = query.query('bool', filter=[not_program_fee])
        if only_discount:
            query = query.filter('term',
                                 item_description='PAR_APN_ProgramFee_2500')

        per_product = query.aggs.bucket('products',
                                        'terms',
                                        field='product_name',
                                        order={'cost': 'desc'},
                                        size=size)
        per_product.bucket('cost', 'sum', field='cost')

        # Line items with a cost of exactly 0 are left out.
        non_zero_cost = ~dsl.Q('term', cost=0)
        query = query.query('bool', filter=[non_zero_cost])

        response = client.search(index='awsdetailedlineitem',
                                 body=query.to_dict(),
                                 size=0,
                                 request_timeout=60)

        products = []
        for bucket in response['aggregations']['products']['buckets']:
            products.append({
                # Use the short display name when one is registered.
                'product': SHORT_NAMES.get(bucket['key'], bucket['key']),
                'cost': bucket['cost']['value'],
            })
        return dict(products=products)

    @classmethod
    @with_cache()
    def get_cost_by_region(cls,
                           keys,
                           tagged=False,
                           byaccount=False,
                           date_from=None,
                           date_to=None,
                           size=0x7FFFFFFF):
        """Aggregate monthly cost per availability zone over a date range.

        :arg keys: a linked account id, or a list of them
        :arg tagged: additionally split each region by ``tag.value``
        :arg byaccount: wrap the whole aggregation in a per-account
            ``accounts`` bucket
        :arg date_from: start of the range; defaults to the first instant
            of the current UTC month
        :arg date_to: end of the range; defaults to the last microsecond of
            ``date_from``'s month
        :arg size: maximum number of region buckets requested

        :returns: the raw ``aggregations`` section of the search response

        """
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            day=calendar.monthrange(date_from.year, date_from.month)[1],
            hour=23,
            minute=59,
            second=59,
            microsecond=999999)
        s = cls.search()
        s = s.filter(
            'terms',
            linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('range',
                     usage_start_date={
                         'from': date_from.isoformat(),
                         'to': date_to.isoformat()
                     })

        # Nesting order: [accounts ->] intervals -> regions [-> tags], with
        # a cost sum at the regions level and, when tagged, the tags level.
        agg = s.aggs
        if byaccount:
            agg = agg.bucket('accounts', 'terms', field='linked_account_id')
        agg = agg.bucket('intervals',
                         'date_histogram',
                         field='usage_start_date',
                         interval='month',
                         min_doc_count=1)
        agg = agg.bucket('regions',
                         'terms',
                         field='availability_zone',
                         size=size)
        agg.bucket('cost', 'sum', field='cost')
        if tagged:
            agg = agg.bucket('tags', 'terms', field='tag.value')
            agg.bucket('cost', 'sum', field='cost')
        # Same 60 second request timeout as the other aggregation queries of
        # this class; this call previously relied on the client default.
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)

        return res['aggregations']

    @classmethod
    @with_cache()
    def get_monthly_cost(cls,
                         keys,
                         date_from=None,
                         date_to=None,
                         size=0x7FFFFFFF):
        """Sum the cost per month for the account key(s) over a date range.

        :arg keys: a linked account id, or a list of them
        :arg date_from: start of the range; defaults to the first instant
            of the current UTC month
        :arg date_to: end of the range; defaults to the last microsecond of
            ``date_from``'s month
        :arg size: accepted but not used by this query

        :returns: ``dict(months=[...])``; each entry has ``month`` and
            ``total_cost``

        """
        if not date_from:
            date_from = datetime.utcnow().replace(
                day=1, hour=0, minute=0, second=0, microsecond=0)
        if not date_to:
            last_day = calendar.monthrange(date_from.year, date_from.month)[1]
            date_to = date_from.replace(day=last_day,
                                        hour=23,
                                        minute=59,
                                        second=59,
                                        microsecond=999999)

        account_ids = keys if isinstance(keys, list) else [keys]
        window = {
            'from': date_from.isoformat(),
            'to': date_to.isoformat(),
        }
        query = cls.search()
        query = query.filter('terms', linked_account_id=account_ids)
        query = query.filter('range', usage_start_date=window)

        per_month = query.aggs.bucket('intervals',
                                      'date_histogram',
                                      field='usage_start_date',
                                      interval='month',
                                      min_doc_count=1)
        per_month.bucket('cost', 'sum', field='cost')

        response = client.search(index='awsdetailedlineitem',
                                 body=query.to_dict(),
                                 size=0,
                                 request_timeout=60)

        months = []
        for interval in response['aggregations']['intervals']['buckets']:
            months.append({
                'month': interval['key_as_string'].split('T')[0],
                'total_cost': interval['cost']['value'],
            })
        return dict(months=months)

    @classmethod
    @with_cache()
    def get_monthly_cost_by_product(cls,
                                    keys,
                                    tagged=False,
                                    date_from=None,
                                    date_to=None,
                                    size=0x7FFFFFFF):
        """Sum the cost per month and product, optionally split by tag value.

        :arg keys: a linked account id, or a list of them
        :arg tagged: when true, each product entry also gets a ``tags``
            list breaking its cost down by ``tag.value``
        :arg date_from: start of the range; defaults to the first instant
            of the current UTC month
        :arg date_to: end of the range; defaults to the last microsecond of
            ``date_from``'s month
        :arg size: maximum number of product buckets requested per month

        :returns: ``dict(months=[...])``; each entry has ``month`` and
            ``products`` (dicts with ``product``, ``cost`` and, when
            ``tagged``, ``tags``)

        """
        if not date_from:
            date_from = datetime.utcnow().replace(
                day=1, hour=0, minute=0, second=0, microsecond=0)
        if not date_to:
            last_day = calendar.monthrange(date_from.year, date_from.month)[1]
            date_to = date_from.replace(day=last_day,
                                        hour=23,
                                        minute=59,
                                        second=59,
                                        microsecond=999999)

        account_ids = keys if isinstance(keys, list) else [keys]
        window = {
            'from': date_from.isoformat(),
            'to': date_to.isoformat(),
        }
        query = cls.search()
        query = query.filter('terms', linked_account_id=account_ids)
        query = query.filter('range', usage_start_date=window)

        per_month = query.aggs.bucket('intervals',
                                      'date_histogram',
                                      field='usage_start_date',
                                      interval='month',
                                      min_doc_count=1)
        per_product = per_month.bucket('products',
                                       'terms',
                                       field='product_name',
                                       size=size)
        per_product.bucket('cost', 'sum', field='cost')
        if tagged:
            per_tag = per_product.bucket('tags', 'terms', field='tag.value')
            per_tag.bucket('cost', 'sum', field='cost')

        # Line items with a cost of exactly 0 are left out.
        query = query.query('bool', filter=[~dsl.Q('term', cost=0)])
        response = client.search(index='awsdetailedlineitem',
                                 body=query.to_dict(),
                                 size=0,
                                 request_timeout=60)

        def tag_breakdown(tag_buckets, product_total):
            # Per-tag costs, plus an 'untagged' entry for whatever part of
            # the product total the tag buckets do not account for.
            entries = []
            tagged_sum = 0.0
            for tag_bucket in tag_buckets:
                tag_cost = tag_bucket['cost']['value']
                tagged_sum += tag_cost
                entries.append({
                    'name': tag_bucket['key'],
                    'cost': tag_cost,
                })
            if product_total != tagged_sum:
                entries.append({
                    'name': 'untagged',
                    'cost': product_total - tagged_sum,
                })
            return entries

        months = []
        for interval in response['aggregations']['intervals']['buckets']:
            products = []
            for product in interval['products']['buckets']:
                entry = {
                    'product': SHORT_NAMES.get(product['key'],
                                               product['key']),
                    'cost': product['cost']['value'],
                }
                if tagged:
                    entry['tags'] = tag_breakdown(product['tags']['buckets'],
                                                  product['cost']['value'])
                products.append(entry)
            months.append({
                'month': interval['key_as_string'].split('T')[0],
                'products': products,
            })
        return dict(months=months)

    @classmethod
    @with_cache(ttl=4 * 3600)
    def get_daily_cost_by_product(cls,
                                  keys,
                                  date_from=None,
                                  date_to=None,
                                  size=0x7FFFFFFF):
        """Sum the cost per day and product over a date range.

        :arg keys: a linked account id, or a list of them
        :arg date_from: start of the range; defaults to midnight of the
            current UTC day
        :arg date_to: end of the range; defaults to the last microsecond of
            ``date_from``'s day
        :arg size: maximum number of product buckets requested per day

        :returns: ``dict(days=[...])``; each entry has ``day`` and
            ``products`` (list of ``product``/``cost`` dicts)

        """
        if not date_from:
            date_from = datetime.utcnow().replace(
                hour=0, minute=0, second=0, microsecond=0)
        if not date_to:
            date_to = date_from.replace(
                hour=23, minute=59, second=59, microsecond=999999)

        account_ids = keys if isinstance(keys, list) else [keys]
        window = {
            'from': date_from.isoformat(),
            'to': date_to.isoformat(),
        }
        query = cls.search()
        query = query.filter('terms', linked_account_id=account_ids)
        query = query.filter('range', usage_start_date=window)

        per_day = query.aggs.bucket('intervals',
                                    'date_histogram',
                                    field='usage_start_date',
                                    interval='day',
                                    min_doc_count=1)
        per_product = per_day.bucket('products',
                                     'terms',
                                     field='product_name',
                                     size=size)
        per_product.metric('cost', 'sum', field='cost')

        # Line items with a cost of exactly 0 are left out.
        query = query.query('bool', filter=[~dsl.Q('term', cost=0)])
        response = client.search(index='awsdetailedlineitem',
                                 body=query.to_dict(),
                                 size=0,
                                 request_timeout=60)

        days = []
        for interval in response['aggregations']['intervals']['buckets']:
            products = []
            for product in interval['products']['buckets']:
                products.append({
                    'product': SHORT_NAMES.get(product['key'],
                                               product['key']),
                    'cost': product['cost']['value'],
                })
            days.append({
                'day': interval['key_as_string'].split('T')[0],
                'products': products,
            })
        return dict(days=days)

    @classmethod
    @with_cache(ttl=24 * 3600)
    def get_yearly_cost_by_product(cls,
                                   keys,
                                   date_from=None,
                                   date_to=None,
                                   size=0x7FFFFFFF):
        """Sum the cost per year and product over a date range.

        :arg keys: a linked account id, or a list of them
        :arg date_from: start of the range; defaults to the first instant
            of the current UTC year
        :arg date_to: end of the range; defaults to the last microsecond of
            December 31 in ``date_from``'s year
        :arg size: maximum number of product buckets requested per year

        :returns: ``dict(years=[...])``; each entry has ``year`` (first
            four characters of the bucket key) and ``products`` (list of
            ``product``/``cost`` dicts)

        """
        if not date_from:
            date_from = datetime.utcnow().replace(
                month=1, day=1, hour=0, minute=0, second=0, microsecond=0)
        if not date_to:
            date_to = date_from.replace(month=12,
                                        day=31,
                                        hour=23,
                                        minute=59,
                                        second=59,
                                        microsecond=999999)

        account_ids = keys if isinstance(keys, list) else [keys]
        window = {
            'from': date_from.isoformat(),
            'to': date_to.isoformat(),
        }
        query = cls.search()
        query = query.filter('terms', linked_account_id=account_ids)
        query = query.filter('range', usage_start_date=window)

        per_year = query.aggs.bucket('intervals',
                                     'date_histogram',
                                     field='usage_start_date',
                                     interval='year',
                                     min_doc_count=1)
        per_product = per_year.bucket('products',
                                      'terms',
                                      field='product_name',
                                      size=size)
        per_product.metric('cost', 'sum', field='cost')

        # Line items with a cost of exactly 0 are left out.
        query = query.query('bool', filter=[~dsl.Q('term', cost=0)])
        response = client.search(index='awsdetailedlineitem',
                                 body=query.to_dict(),
                                 size=0,
                                 request_timeout=60)

        years = []
        for interval in response['aggregations']['intervals']['buckets']:
            products = []
            for product in interval['products']['buckets']:
                products.append({
                    'product': SHORT_NAMES.get(product['key'],
                                               product['key']),
                    'cost': product['cost']['value'],
                })
            years.append({
                'year': interval['key_as_string'][:4],
                'products': products,
            })
        return dict(years=years)

    @classmethod
    @with_cache()
    def get_cost_by_resource(cls,
                             keys,
                             date_from=None,
                             date_to=None,
                             search=None):
        """Sum the cost per resource id over a date range.

        :arg keys: a linked account id, or a list of them
        :arg date_from: start of the range; defaults to the first instant
            of the current UTC month
        :arg date_to: end of the range; defaults to the last microsecond of
            ``date_from``'s month
        :arg search: optional text; when given, only resource ids matching
            the wildcard pattern ``*<search>*`` are considered

        :returns: a list of ``resource``/``cost`` dicts, ordered by
            descending cost

        """
        if not date_from:
            date_from = datetime.utcnow().replace(
                day=1, hour=0, minute=0, second=0, microsecond=0)
        if not date_to:
            last_day = calendar.monthrange(date_from.year, date_from.month)[1]
            date_to = date_from.replace(day=last_day,
                                        hour=23,
                                        minute=59,
                                        second=59,
                                        microsecond=999999)

        account_ids = keys if isinstance(keys, list) else [keys]
        window = {
            'from': date_from.isoformat(),
            'to': date_to.isoformat(),
        }
        query = cls.search()
        query = query.filter('terms', linked_account_id=account_ids)
        query = query.filter('range', usage_start_date=window)
        if search:
            pattern = '*{}*'.format(search)
            query = query.query('wildcard', resource_id=pattern)

        per_resource = query.aggs.bucket('resources',
                                         'terms',
                                         field='resource_id',
                                         order={'cost': 'desc'},
                                         size=0x7FFFFFFF)
        per_resource.bucket('cost', 'sum', field='cost')

        response = client.search(index='awsdetailedlineitem',
                                 body=query.to_dict(),
                                 size=0,
                                 request_timeout=60)

        resources = []
        for bucket in response['aggregations']['resources']['buckets']:
            resources.append({
                'resource': bucket['key'],
                'cost': bucket['cost']['value'],
            })
        return resources

    @classmethod
    def get_monthly_cost_by_resource(cls,
                                     resource_ids,
                                     date_from=None,
                                     date_to=None):
        """Sum the cost per month for a set of resource ids.

        :arg resource_ids: iterable of resource ids; a falsy value
            short-circuits to an empty result without querying
        :arg date_from: start of the range; defaults to the first instant
            of the current UTC month
        :arg date_to: end of the range; defaults to the last microsecond of
            ``date_from``'s month

        :returns: a dict mapping each month bucket's ``key_as_string`` to
            its summed cost

        """
        if not date_from:
            date_from = datetime.utcnow().replace(
                day=1, hour=0, minute=0, second=0, microsecond=0)
        if not date_to:
            last_day = calendar.monthrange(date_from.year, date_from.month)[1]
            date_to = date_from.replace(day=last_day,
                                        hour=23,
                                        minute=59,
                                        second=59,
                                        microsecond=999999)

        # Nothing to look up: skip the search entirely.
        if not resource_ids:
            return {}

        window = {
            'from': date_from.isoformat(),
            'to': date_to.isoformat(),
        }
        query = cls.search()
        query = query.filter('range', usage_start_date=window)
        query = query.filter('terms', resource_id=list(resource_ids))

        per_month = query.aggs.bucket('months',
                                      'date_histogram',
                                      field='usage_start_date',
                                      interval='month',
                                      min_doc_count=1)
        per_month.metric('cost', 'sum', field='cost')

        response = client.search('awsdetailedlineitem',
                                 body=query.to_dict(),
                                 size=0,
                                 request_timeout=60)

        monthly_costs = {}
        for bucket in response['aggregations']['months']['buckets']:
            monthly_costs[bucket['key_as_string']] = bucket['cost']['value']
        return monthly_costs

    @classmethod
    @with_cache()
    def get_lambda_usage(cls, keys, date_from=None, date_to=None):
        """Summarise AWS Lambda usage per resource for one or more accounts.

        Args:
            keys: A linked account id, or a list of them.
            date_from: Start of the usage window; defaults to the first
                instant of the current UTC month.
            date_to: End of the usage window; defaults to the last
                microsecond of ``date_from``'s month.

        Returns:
            A list with one dict per ``resource_id`` bucket, holding
            ``rid``, ``name`` (the text after the last ``:`` of the
            resource id), ``requests``, ``gb_seconds``, ``cost`` (the
            average of the ``cost`` field) and ``raw_cost`` (computed by
            ``lambdapricing.get_raw_cost``).  ``requests`` and
            ``gb_seconds`` are ``None`` when no usage type with the
            expected suffix is present.
        """
        if not date_from:
            date_from = datetime.utcnow().replace(
                day=1, hour=0, minute=0, second=0, microsecond=0)
        if not date_to:
            last_day = calendar.monthrange(date_from.year,
                                           date_from.month)[1]
            date_to = date_from.replace(day=last_day,
                                        hour=23,
                                        minute=59,
                                        second=59,
                                        microsecond=999999)

        account_ids = keys if isinstance(keys, list) else [keys]
        max_buckets = 0x7FFFFFFF

        search = cls.search()
        search = search.filter('terms', linked_account_id=account_ids)
        search = search.filter('term', product_name='AWS Lambda')
        search = search.filter('range',
                               usage_start_date={
                                   'from': date_from.isoformat(),
                                   'to': date_to.isoformat()
                               })

        # Nested aggregation: resource -> usage type -> item description.
        per_resource = search.aggs.bucket('resources',
                                          'terms',
                                          field='resource_id',
                                          size=max_buckets)
        per_resource.metric('cost', 'avg', field='cost')
        per_type = per_resource.bucket('types',
                                       'terms',
                                       field='usage_type',
                                       size=max_buckets)
        per_type.metric('quantity', 'sum', field='usage_quantity')
        per_description = per_type.bucket('descriptions',
                                          'terms',
                                          field='item_description',
                                          size=max_buckets)
        per_description.metric('quantity', 'sum', field='usage_quantity')

        response = client.search(index='awsdetailedlineitem',
                                 body=search.to_dict(),
                                 size=0,
                                 request_timeout=60)

        def _quantity_for_suffix(type_buckets, suffix):
            # Quantity of the first usage type whose key ends with `suffix`,
            # or None when there is no such usage type.
            return next((entry['quantity']['value']
                         for entry in type_buckets
                         if entry['key'].endswith(suffix)), None)

        usages = []
        for resource in response['aggregations']['resources']['buckets']:
            rid = resource['key']
            type_buckets = resource['types']['buckets']
            usages.append({
                'rid': rid,
                'name': rid.split(':')[-1],
                'requests': _quantity_for_suffix(type_buckets, '-Request'),
                'gb_seconds': _quantity_for_suffix(type_buckets,
                                                   '-Lambda-GB-Second'),
                'cost': resource['cost']['value'],
                'raw_cost': lambdapricing.get_raw_cost([
                    entry['descriptions']['buckets']
                    for entry in type_buckets
                ]),
            })
        return usages

    @classmethod
    @with_cache()
    def get_s3_bandwidth_costs(cls, key, date_from=None, date_to=None):
        """Aggregate S3 line items of one account by usage type.

        Args:
            key: The linked account id to report on.
            date_from: Start of the usage window; defaults to the first
                instant of the current UTC month.
            date_to: End of the usage window; defaults to the last
                microsecond of ``date_from``'s month.

        Returns:
            A list of dicts, one per ``usage_type`` bucket, with ``type``,
            ``quantity`` (sum of ``usage_quantity``) and ``cost`` (sum of
            ``cost``).
        """
        if not date_from:
            date_from = datetime.utcnow().replace(
                day=1, hour=0, minute=0, second=0, microsecond=0)
        if not date_to:
            month_length = calendar.monthrange(date_from.year,
                                               date_from.month)[1]
            date_to = date_from.replace(day=month_length,
                                        hour=23,
                                        minute=59,
                                        second=59,
                                        microsecond=999999)

        window = {'from': date_from.isoformat(), 'to': date_to.isoformat()}
        search = cls.search()
        search = search.filter('term', linked_account_id=key)
        search = search.filter('term',
                               product_name='Amazon Simple Storage Service')
        search = search.filter('range', usage_start_date=window)

        by_type = search.aggs.bucket('types',
                                     'terms',
                                     field='usage_type',
                                     size=0x7FFFFFFF)
        by_type.metric('cost', 'sum', field='cost')
        by_type.metric('gb', 'sum', field='usage_quantity')

        response = client.search(index='awsdetailedlineitem',
                                 body=search.to_dict(),
                                 size=0,
                                 request_timeout=60)

        transfers = []
        for bucket in response['aggregations']['types']['buckets']:
            transfers.append({
                'type': bucket['key'],
                'quantity': bucket['gb']['value'],
                'cost': bucket['cost']['value'],
            })
        return transfers

    @classmethod
    @with_cache()
    def get_ec2_bandwidth_costs(cls, key, date_from=None, date_to=None):
        """Aggregate EC2 line items of one account by usage type.

        Args:
            key: The linked account id to report on.
            date_from: Start of the usage window; defaults to the first
                instant of the current UTC month.
            date_to: End of the usage window; defaults to the last
                microsecond of ``date_from``'s month.

        Returns:
            A list of dicts, one per ``usage_type`` bucket, with ``type``,
            ``quantity`` (sum of ``usage_quantity``) and ``cost`` (sum of
            ``cost``).
        """
        if date_from is None or not date_from:
            date_from = datetime.utcnow().replace(
                day=1, hour=0, minute=0, second=0, microsecond=0)
        if date_to is None or not date_to:
            _, days_in_month = calendar.monthrange(date_from.year,
                                                   date_from.month)
            date_to = date_from.replace(day=days_in_month,
                                        hour=23,
                                        minute=59,
                                        second=59,
                                        microsecond=999999)

        query = (cls.search()
                 .filter('term', linked_account_id=key)
                 .filter('term', product_name='Amazon Elastic Compute Cloud')
                 .filter('range',
                         usage_start_date={
                             'from': date_from.isoformat(),
                             'to': date_to.isoformat()
                         }))

        usage_types = query.aggs.bucket('types',
                                        'terms',
                                        field='usage_type',
                                        size=0x7FFFFFFF)
        usage_types.metric('cost', 'sum', field='cost')
        usage_types.metric('gb', 'sum', field='usage_quantity')

        result = client.search(index='awsdetailedlineitem',
                               body=query.to_dict(),
                               size=0,
                               request_timeout=60)

        type_buckets = result['aggregations']['types']['buckets']
        return [{
            'type': entry['key'],
            'quantity': entry['gb']['value'],
            'cost': entry['cost']['value'],
        } for entry in type_buckets]

    @classmethod
    def get_ec2_daily_cost(cls, key):
        """Yield the summed EC2 cost of one account for each day with data.

        This is a generator: the search is only executed once iteration
        starts.

        Args:
            key: The linked account id to report on.

        Yields:
            ``(day, cost)`` tuples, where ``day`` is the part of the
            histogram bucket's ``key_as_string`` before the first ``T`` and
            ``cost`` is the sum of the ``cost`` field for that day.
        """
        query = cls.search()
        query = query.filter('term', linked_account_id=key)
        query = query.filter('term',
                             product_name='Amazon Elastic Compute Cloud')

        # One bucket per day; days without any document are omitted.
        per_day = query.aggs.bucket('intervals',
                                    'date_histogram',
                                    field='usage_start_date',
                                    interval='day',
                                    min_doc_count=1)
        per_day.metric('cost', 'sum', field='cost')

        response = client.search(index='awsdetailedlineitem',
                                 body=query.to_dict(),
                                 size=0,
                                 request_timeout=60)

        for bucket in response['aggregations']['intervals']['buckets']:
            day = bucket['key_as_string'].partition('T')[0]
            yield day, bucket['cost']['value']

    @classmethod
    @with_cache()
    def get_elb_usage_a_day(cls, keys, date_from=None, date_to=None):
        """Return per-day average cost and usage for each load balancer.

        Args:
            keys: A linked account id, or a list of them.
            date_from: Start of the usage window; defaults to the first
                instant of the current UTC month.
            date_to: End of the usage window; defaults to the last
                microsecond of ``date_from``'s month.

        Returns:
            A list of dicts, one per ``resource_id`` bucket, with ``rid``,
            ``cost``, ``hours`` (usage types ending in
            ``LoadBalancerUsage``) and ``bytes`` (usage types ending in
            ``Bytes``, multiplied by 2**30), each divided by the number of
            whole days in the window.
        """
        date_from = date_from or datetime.utcnow().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            day=calendar.monthrange(date_from.year, date_from.month)[1],
            hour=23,
            minute=59,
            second=59,
            microsecond=999999)

        # `timedelta.days` truncates towards whole days, so a window shorter
        # than 24 hours would give 0 and make every division below raise
        # ZeroDivisionError.  Treat such a window as a single day.
        # NOTE(review): with the default window the difference is one
        # microsecond short of a full month, so `.days` is the month length
        # minus one -- confirm whether that divisor is intended.
        days = max((date_to - date_from).days, 1)

        gib = Fraction(2**30)
        s = cls.search()
        s = s.filter(
            'terms',
            linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('range',
                     usage_start_date={
                         'from': date_from.isoformat(),
                         'to': date_to.isoformat()
                     })
        s = s.filter("prefix", resource_id="arn:aws:elasticloadbalancing")
        s = s.sort({"usage_start_date": {"order": "desc"}})
        agg = s.aggs.bucket('rid',
                            'terms',
                            field='resource_id',
                            size=0x7FFFFFFF)
        agg.metric('cost', 'sum', field='cost')
        agg = agg.bucket('types', 'terms', field='usage_type', size=0x7FFFFFFF)
        agg.metric('quantity', 'sum', field='usage_quantity')
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)

        def _total_quantity(type_buckets, suffix):
            # Sum the quantities of all usage types whose key ends with
            # `suffix`.
            return sum(bucket['quantity']['value'] for bucket in type_buckets
                       if bucket['key'].endswith(suffix))

        elbs = [{
            'rid':
            elb['key'],
            'cost':
            elb['cost']['value'] / days,
            'hours':
            float(
                _total_quantity(elb['types']['buckets'], 'LoadBalancerUsage') /
                days),
            'bytes':
            float(
                (_total_quantity(elb['types']['buckets'], 'Bytes') * gib) /
                days),
        } for elb in res['aggregations']['rid']['buckets']]
        return elbs

    @classmethod
    @with_cache()
    def get_instance_type(cls, keys, date_from=None, date_to=None):
        """Group EC2 ``BoxUsage`` line items by hour, instance type and region.

        Args:
            keys: A linked account id, or a list of them.
            date_from: Start of the usage window; defaults to the first
                instant of the current UTC month.
            date_to: End of the usage window; defaults to the last
                microsecond of ``date_from``'s month.

        Returns:
            A list of dicts in first-seen order, each with ``hour``,
            ``instance``, ``region``, ``ridCount`` and ``rids`` (the
            resource ids that share that hour/instance/region).  The search
            requests at most 10000 hits, newest ``usage_start_date`` first.
        """
        if not date_from:
            date_from = datetime.utcnow().replace(
                day=1, hour=0, minute=0, second=0, microsecond=0)
        if not date_to:
            last_day = calendar.monthrange(date_from.year,
                                           date_from.month)[1]
            date_to = date_from.replace(day=last_day,
                                        hour=23,
                                        minute=59,
                                        second=59,
                                        microsecond=999999)

        account_ids = keys if isinstance(keys, list) else [keys]
        search = cls.search()
        search = search.extra(_source=[
            'usage_start_date', 'usage_type', 'availability_zone',
            'resource_id'
        ])
        search = search.filter('terms', linked_account_id=account_ids)
        search = search.filter('range',
                               usage_start_date={
                                   'from': date_from.isoformat(),
                                   'to': date_to.isoformat()
                               })
        search = search.filter("term",
                               product_name='Amazon Elastic Compute Cloud')
        search = search.query('wildcard', usage_type='*BoxUsage:*')
        search = search.filter('exists', field='resource_id')
        search = search.sort({"usage_start_date": {"order": "desc"}})
        response = client.search(index='awsdetailedlineitem',
                                 body=search.to_dict(),
                                 size=10000,
                                 request_timeout=60)

        def _strip_zone_letter(zone):
            # Drop a trailing alphabetic character from the zone name.
            return zone[:-1] if zone[-1].isalpha() else zone

        grouped = []
        by_group = {}
        for hit in response['hits']['hits']:
            source = hit['_source']
            hour = source['usage_start_date']
            instance = source['usage_type'].split(':')[1]
            if 'availability_zone' in source:
                region = _strip_zone_letter(source['availability_zone'])
            else:
                region = 'unknown'
            rid = source['resource_id']

            group_key = (hour, instance, region)
            if group_key in by_group:
                # Already seen this combination: just record the extra id.
                known = by_group[group_key]
                known['rids'].append(rid)
                known['ridCount'] += 1
                continue

            entry = {
                'hour': hour,
                'instance': instance,
                'region': region,
                'ridCount': 1,
                'rids': [rid],
            }
            grouped.append(entry)
            by_group[group_key] = entry
        return grouped

    @classmethod
    @with_cache()
    def get_instance_hour(cls,
                          keys,
                          date_from=None,
                          date_to=None,
                          min_hour=None):
        """Rank EC2 instances by their average daily ``BoxUsage`` line items.

        Args:
            keys: A linked account id, or a list of them.
            date_from: Start of the usage window; defaults to the first
                instant of the current UTC month.
            date_to: End of the usage window; defaults to the last
                microsecond of ``date_from``'s month.
            min_hour: When truthy, instances whose average is below this
                value are left out.

        Returns:
            A list of ``{'id': ..., 'hours': ...}`` dicts sorted by
            ``hours`` in descending order, where ``hours`` is the mean
            document count over the days that have at least one document.
        """
        if not date_from:
            date_from = datetime.utcnow().replace(
                day=1, hour=0, minute=0, second=0, microsecond=0)
        if not date_to:
            last_day = calendar.monthrange(date_from.year,
                                           date_from.month)[1]
            date_to = date_from.replace(day=last_day,
                                        hour=23,
                                        minute=59,
                                        second=59,
                                        microsecond=999999)

        account_ids = keys if isinstance(keys, list) else [keys]
        query = cls.search()
        query = query.filter('terms', linked_account_id=account_ids)
        query = query.filter('range',
                             usage_start_date={
                                 'from': date_from.isoformat(),
                                 'to': date_to.isoformat()
                             })
        query = query.filter("term",
                             product_name='Amazon Elastic Compute Cloud')
        query = query.filter('prefix', resource_id='i-')
        query = query.query('wildcard', usage_type='*BoxUsage*')

        per_instance = query.aggs.bucket('resource_id',
                                         'terms',
                                         field='resource_id',
                                         size=0x7FFFFFFF)
        per_instance.bucket('days',
                            'date_histogram',
                            field='usage_start_date',
                            interval='day',
                            min_doc_count=1)
        response = client.search(index='awsdetailedlineitem',
                                 body=query.to_dict(),
                                 size=0,
                                 request_timeout=60)

        ranked = []
        for bucket in response['aggregations']['resource_id']['buckets']:
            daily_counts = [
                day['doc_count'] for day in bucket['days']['buckets']
            ]
            mean_hours = sum(daily_counts) / float(len(daily_counts))
            if not min_hour or mean_hours >= min_hour:
                ranked.append({'id': bucket['key'], 'hours': mean_hours})
        ranked.sort(key=lambda entry: entry['hours'], reverse=True)
        return ranked

    @classmethod
    @with_cache()
    def get_s3_buckets_per_tag(cls, keys):
        """List S3 buckets grouped by tag key and tag value.

        Args:
            keys: A linked account id, or a list of them.

        Returns:
            A list shaped as follows::

                [{
                    "tag_key": "KEY",            # unique in the list
                    "tag_value": [{
                        "tag_value": "VALUE",    # unique in the list
                        "s3_buckets": [{
                            "bucket_name": "BUCKET_NAME",
                            "account_id": "ACCOUNT_ID"
                        }, {...}]
                    }, {...}]
                }, {...}]

        NOTE(review): the terms aggregations below pass no ``size``, so the
        server-side default bucket limit applies -- confirm that truncating
        the tag keys, values and buckets this way is intended.
        """
        def _check_if_in_list(dict_list, value, key):
            # First dict of `dict_list` whose `key` entry equals `value`,
            # or None when there is none.
            return next((item for item in dict_list if item[key] == value),
                        None)

        def _parse_buckets_results(buff_tag_value, bucket_tag_value):
            # Add every S3 bucket of one tag value, skipping names that are
            # already listed.
            for bucket_resource_id in bucket_tag_value['ressource_id'][
                    'buckets']:
                known = _check_if_in_list(buff_tag_value['s3_buckets'],
                                          bucket_resource_id['key'],
                                          'bucket_name')
                if known is None:
                    buff_tag_value['s3_buckets'].append({
                        "bucket_name":
                        bucket_resource_id['key'],
                        "account_id":
                        bucket_resource_id['account_id']['buckets'][0]['key']
                    })
            return buff_tag_value

        def _parse_tag_values_results(bucket_tag_key, buff_tag_key):
            # Merge every tag value of one tag key into `buff_tag_key`.
            for bucket_tag_value in bucket_tag_key['tag_value']['buckets']:
                buff_tag_value = _check_if_in_list(buff_tag_key['tag_value'],
                                                   bucket_tag_value['key'],
                                                   'tag_value')
                if buff_tag_value is None:
                    buff_tag_value = {
                        "tag_value": bucket_tag_value['key'],
                        "s3_buckets": []
                    }
                    # Append only new entries; an entry that was found is
                    # already in the list and must not be duplicated.
                    buff_tag_key['tag_value'].append(buff_tag_value)
                _parse_buckets_results(buff_tag_value, bucket_tag_value)
            return buff_tag_key

        def _parse_tag_keys_results(res):
            # Build the top-level list, one entry per distinct tag key.
            bucket_tagged = []
            for bucket_tag_key in res['aggregations']['tag_key']['buckets']:
                buff_tag_key = _check_if_in_list(bucket_tagged,
                                                 bucket_tag_key['key'],
                                                 'tag_key')
                if buff_tag_key is None:
                    buff_tag_key = {
                        "tag_key": bucket_tag_key['key'],
                        "tag_value": []
                    }
                    bucket_tagged.append(buff_tag_key)
                _parse_tag_values_results(bucket_tag_key, buff_tag_key)
            return bucket_tagged

        s = cls.search()
        s = s.filter(
            'terms',
            linked_account_id=keys if isinstance(keys, list) else [keys])
        s = s.filter('term', product_name='Amazon Simple Storage Service')
        s = s.query('exists', field="tag")
        s = s.query('wildcard', item_description="*storage*")
        # tag key -> tag value -> resource id -> owning account id.
        agg = s.aggs.bucket('tag_key', 'terms', field="tag.key")
        agg = agg.bucket('tag_value', 'terms', field='tag.value')
        agg.bucket('ressource_id', 'terms',
                   field='resource_id').bucket('account_id',
                                               'terms',
                                               field='linked_account_id')
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)

        return _parse_tag_keys_results(res)

    @classmethod
    @with_cache()
    def get_s3_bandwidth_info_and_cost_per_name(cls,
                                                key,
                                                bucket_resource_ids,
                                                date_from=None,
                                                date_to=None):
        """Return cost and transfer statistics for the given S3 buckets.

        Args:
            key: The linked account id to report on.
            bucket_resource_ids: A bucket resource id, or a list of them.
            date_from: Start of the usage window; defaults to the first
                instant of the previous UTC month.
            date_to: End of the usage window; defaults to the last
                microsecond of ``date_from``'s month.

        Returns:
            A list of dicts, one per ``resource_id`` bucket, with
            ``bucket_name``, ``cost`` (sum of ``cost``) and
            ``transfer_stats``, a list of ``{"type", "data"}`` dicts giving
            the summed ``usage_quantity`` per usage type ending in
            ``Bytes``.
        """
        # `months=1` (plural) steps back one month.  The singular `month=1`
        # is the absolute form and would pin the month to January instead.
        date_from = date_from or (datetime.utcnow() - relativedelta(
            months=1)).replace(day=1, hour=0, minute=0, second=0, microsecond=0)
        date_to = date_to or date_from.replace(
            day=calendar.monthrange(date_from.year, date_from.month)[1],
            hour=23,
            minute=59,
            second=59,
            microsecond=999999)
        s = cls.search()
        s = s.filter('term', linked_account_id=key)
        s = s.filter('term', product_name='Amazon Simple Storage Service')
        s = s.filter('terms',
                     resource_id=bucket_resource_ids if isinstance(
                         bucket_resource_ids, list) else [bucket_resource_ids])
        s = s.filter('range',
                     usage_start_date={
                         'from': date_from.isoformat(),
                         'to': date_to.isoformat()
                     })
        s = s.filter('wildcard', usage_type="*Bytes")
        agg = s.aggs.bucket('bucket_name',
                            'terms',
                            field='resource_id',
                            size=0x7FFFFFFF)
        agg.metric('cost', 'sum', field='cost')
        agg = agg.bucket('transfer_type', 'terms', field='usage_type')
        agg.metric('data', 'sum', field='usage_quantity')
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)
        data = [{
            "bucket_name":
            bucket['key'],
            "cost":
            bucket['cost']['value'],
            "transfer_stats": [{
                "type": transfer_stat['key'],
                "data": transfer_stat['data']['value']
            } for transfer_stat in bucket['transfer_type']['buckets']]
        } for bucket in res['aggregations']['bucket_name']['buckets']]
        return data
Example #24
0
class Company(es.DocType):
    """Elasticsearch document mapping for a company.

    Declares the custom analyzers used by the text fields (French text,
    technology lists, company names) followed by the fields themselves.
    """

    class Meta:
        index = 'companies'
        doc_type = 'company'

    # Strips elided French articles/prefixes from the start of tokens.
    french_elision = es.token_filter(
        'french_elision',
        type='elision',
        articles_case=True,
        articles=[
            'l', 'm', 't', 'qu', 'n', 's',
            'j', 'd', 'c', 'jusqu', 'quoiqu',
            'lorsqu', 'puisqu'
        ]
    )

    # Uses the predefined `_french_` stop word list.
    french_stopwords = es.token_filter(
        'french_stopwords',
        type='stop',
        stopwords='_french_'
    )

    # Do not include this filter if keywords is empty
    french_keywords = es.token_filter(
        'french_keywords',
        type='keyword_marker',
        keywords=[]
    )

    french_stemmer = es.token_filter(
        'french_stemmer',
        type='stemmer',
        language='light_french'
    )

    # Analyzer for free French text: HTML is stripped, then tokens are
    # lowercased, ASCII-folded, de-elided, stop-word filtered and stemmed.
    french_analyzer = es.analyzer(
        'french_analyzer',
        tokenizer='standard',
        filter=[
            'lowercase',
            'asciifolding',
            french_elision,
            french_stopwords,
            # french_keywords,
            french_stemmer
        ],
        char_filter=['html_strip']
    )

    # Splits technology lists on spaces and commas.
    technologies_tokenizer = es.tokenizer(
        'comma_tokenizer',
        type='pattern',
        pattern=' |,|, '
    )

    # Maps the various spellings of a language name onto canonical tokens.
    technologies_synonyms_filter = es.token_filter(
        'technologies_synonyms',
        type='synonym',
        synonyms=[
            'c => c_language',
            'c++, cpp => cpp_language',
            'c/c++, c/cpp => c_language',
            'c/c++, c/cpp => cpp_language',
            'c#, c♯, csharp => csharp_language',
            'f#, f♯, fsharp => fsharp_language',
            'c#, c♯, csharp => dotnet',
            'f#, f♯, fsharp => dotnet',
            '.net => dotnet'
        ]
    )

    technologies_analyzer = es.analyzer(
        'technologies_analyzer',
        tokenizer=technologies_tokenizer,
        filter=[
            'lowercase',
            'asciifolding',
            technologies_synonyms_filter
        ]
    )

    # NOTE(review): defined here but not referenced by any field in this
    # class (`name` uses `french_analyzer`) -- confirm whether it is used
    # elsewhere.
    company_name_analyzer = es.analyzer(
        'company_name_analyzer',
        tokenizer='standard',
        filter=[
            'lowercase',
            'asciifolding'
        ]
    )

    # Fields declared with index='no' are stored but not searchable.
    id = es.String(index='no')

    name = es.String(analyzer=french_analyzer)

    # The description is additionally indexed with the technologies
    # analyzer as the `description.technologies` sub-field.
    description = es.String(
        analyzer=french_analyzer,
        fields={
            'technologies': es.String(analyzer=technologies_analyzer)
        }
    )

    technologies = es.String(analyzer=technologies_analyzer)

    url = es.String(index='no')
    logo_url = es.String(index='no')

    address = es.String(analyzer=french_analyzer)
    address_is_valid = es.Boolean()

    email = es.String(index='no')
    phone = es.String(index='no')

    geolocation = es.GeoPoint()
    geolocation_is_valid = es.Boolean()

    def __init__(self, meta=None, **kwargs):
        """Initialise the document and rewrite its index name.

        The index stored on ``self._doc_type`` is replaced by the result of
        ``compute_index_name`` applied to the current index name.
        """
        super(Company, self).__init__(meta, **kwargs)
        # NOTE(review): if `_doc_type` is shared between instances, the name
        # is re-computed from its previous result on every instantiation --
        # confirm `compute_index_name` tolerates that.
        self._doc_type.index = compute_index_name(self.index)

    @property
    def index(self):
        """Index name currently stored on the document type."""
        return self._doc_type.index

    @property
    def doc_type(self):
        """Name of the document type."""
        return self._doc_type.name
Example #25
0
class Inschrijving(es.DocType):
    """Elasticsearch document mapping stored in the ``HR`` index.

    Most text fields carry a ``raw`` keyword sub-field for exact matching;
    the name and address fields also carry an ``ngram`` sub-field indexed
    with the autocomplete analyzer and searched with the standard analyzer.
    """

    _display = es.Keyword()

    _kvk_display = es.Keyword()

    doctype = es.Keyword()

    # Indexed for autocomplete, with exact (`raw`) and `nozero` variants.
    kvk_nummer = es.Text(analyzer=analyzers.autocomplete,
                         fields={
                             'raw': es.Keyword(),
                             'nozero': es.Text(analyzer=analyzers.nozero)
                         })

    # Same variants as `kvk_nummer`, plus an integer sub-field.
    vestigingsnummer = es.Text(analyzer=analyzers.autocomplete,
                               fields={
                                   'raw': es.Keyword(),
                                   'nozero':
                                   es.Text(analyzer=analyzers.nozero),
                                   'int': es.Integer()
                               })

    hoofdvestiging = es.Boolean()

    # Nested objects, each with a code and a description (`omschrijving`).
    sbi = es.Nested(
        properties={
            'code':
            es.Text(analyzer=analyzers.autocomplete,
                    fields={'raw': es.Keyword()}),
            'omschrijving':
            es.Text(),
        })

    naam = es.Text(analyzer=analyzers.adres,
                   fields={
                       'raw':
                       es.Keyword(),
                       'ngram':
                       es.Text(analyzer=analyzers.autocomplete,
                               search_analyzer='standard')
                   })

    # Nested objects whose `naam` is mapped like the top-level `naam`.
    handelsnamen = es.Nested(
        properties={
            'naam':
            es.Text(analyzer=analyzers.adres,
                    fields={
                        'raw':
                        es.Keyword(),
                        'ngram':
                        es.Text(analyzer=analyzers.autocomplete,
                                search_analyzer='standard')
                    })
        })

    postadres = es.Text(analyzer=analyzers.adres,
                        fields={
                            'raw':
                            es.Keyword(),
                            'ngram':
                            es.Text(analyzer=analyzers.autocomplete,
                                    search_analyzer='standard')
                        })

    bezoekadres = es.Text(analyzer=analyzers.adres,
                          fields={
                              'raw':
                              es.Keyword(),
                              'ngram':
                              es.Text(analyzer=analyzers.autocomplete,
                                      search_analyzer='standard')
                          })

    bezoekadres_correctie = es.Boolean()

    # main establishment ("hoofdvestiging")

    centroid = es.GeoPoint()

    class Index:
        # Index name is taken from the project settings.
        name = settings.ELASTIC_INDICES['HR']
Example #26
0
class ValidationJob(elasticsearch_dsl.Document):
    """
    Elasticsearch document describing one validation job.

    Holds the job's identifying metadata, the metrics recorded for it
    (purported, averaged and best values) and the parameters the job ran
    with. The index name comes from ``VALIDATION_JOBS``.

    TODO: Can this be merged with TrainingJob, with a common
        parent object?
    """
    # Job metadata
    id = elasticsearch_dsl.Integer()
    schema_version = elasticsearch_dsl.Integer()
    job_name = elasticsearch_dsl.Keyword()
    author = elasticsearch_dsl.Keyword()
    created_at = elasticsearch_dsl.Date()
    params = elasticsearch_dsl.Text()
    raw_log = elasticsearch_dsl.Text()

    # Metrics
    purported_acc = elasticsearch_dsl.Float()
    purported_loss = elasticsearch_dsl.Float()
    purported_sensitivity = elasticsearch_dsl.Float()

    avg_test_acc = elasticsearch_dsl.Float()
    avg_test_loss = elasticsearch_dsl.Float()
    avg_test_sensitivity = elasticsearch_dsl.Float()
    avg_test_specificity = elasticsearch_dsl.Float()
    avg_test_true_pos = elasticsearch_dsl.Float()
    avg_test_false_neg = elasticsearch_dsl.Float()
    avg_test_auc = elasticsearch_dsl.Float()

    best_test_acc = elasticsearch_dsl.Float()
    best_test_loss = elasticsearch_dsl.Float()
    best_test_sensitivity = elasticsearch_dsl.Float()
    best_test_specificity = elasticsearch_dsl.Float()
    best_test_true_pos = elasticsearch_dsl.Float()
    best_test_false_neg = elasticsearch_dsl.Float()
    best_test_auc = elasticsearch_dsl.Float()
    best_end_val_acc = elasticsearch_dsl.Float()
    best_end_val_loss = elasticsearch_dsl.Float()
    best_max_val_acc = elasticsearch_dsl.Float()
    best_max_val_loss = elasticsearch_dsl.Float()

    # Params
    batch_size = elasticsearch_dsl.Integer()
    val_split = elasticsearch_dsl.Float()
    seed = elasticsearch_dsl.Integer()

    rotation_range = elasticsearch_dsl.Float()
    width_shift_range = elasticsearch_dsl.Float()
    height_shift_range = elasticsearch_dsl.Float()
    shear_range = elasticsearch_dsl.Float()
    zoom_range = elasticsearch_dsl.Keyword()
    horizontal_flip = elasticsearch_dsl.Boolean()
    vertical_flip = elasticsearch_dsl.Boolean()

    dropout_rate1 = elasticsearch_dsl.Float()
    dropout_rate2 = elasticsearch_dsl.Float()

    data_dir = elasticsearch_dsl.Keyword()
    gcs_url = elasticsearch_dsl.Keyword()

    mip_thickness = elasticsearch_dsl.Integer()
    height_offset = elasticsearch_dsl.Integer()
    pixel_value_range = elasticsearch_dsl.Keyword()

    # We need to keep a list of params for the parser because
    # we can't use traditional approaches to get the class attrs.
    # Every name listed here matches a field declared under "Params" above.
    params_to_parse = [
        'batch_size', 'val_split', 'seed', 'rotation_range',
        'width_shift_range', 'height_shift_range', 'shear_range', 'zoom_range',
        'horizontal_flip', 'vertical_flip', 'dropout_rate1', 'dropout_rate2',
        'data_dir', 'gcs_url', 'mip_thickness', 'height_offset',
        'pixel_value_range'
    ]

    class Index:
        name = VALIDATION_JOBS