Example #1
    def sanity_check_new_index(self, attempt, document, new_index_name, previous_record_count):
        """ Ensure that we do not point to an index that looks like it has missing data. """
        current_record_count = self.get_record_count(document)
        percentage_change = self.percentage_change(current_record_count, previous_record_count)

        # Verify there was not a big shift in record count
        record_count_is_sane = percentage_change < settings.INDEX_SIZE_CHANGE_THRESHOLD

        # Spot check a known-flaky field type to detect VAN-391
        aggregation_type = Mapping.from_es(new_index_name)['aggregation_key'].name
        record_count_is_sane = record_count_is_sane and aggregation_type == 'keyword'

        if not record_count_is_sane:
            conn = get_connection()
            alternate_current_record_count = conn.search({"query": {"match_all": {}}}, index=new_index_name).get(
                'hits', {}).get('total', {}).get('value', 0)
            message = '''
        Sanity check failed for attempt #{0}.
        Index name: {1}
        Percentage change: {2}
        Previous record count: {3}
        Base record count: {4}
        Search record count: {5}
        Aggregation key type: {6}
                '''.format(
                attempt,
                new_index_name,
                '{:.0f}%'.format(percentage_change * 100),
                previous_record_count,
                current_record_count,
                alternate_current_record_count,
                aggregation_type,
            )
            logger.info(message)
            logger.info('...sleeping for 5 seconds...')
            time.sleep(5)
        else:
            message = '''
        Sanity check passed for attempt #{0}.
        Index name: {1}
        Percentage change: {2}
        Previous record count: {3}
        Current record count: {4}
                '''.format(
                attempt,
                new_index_name,
                '{:.0f}%'.format(percentage_change * 100),
                previous_record_count,
                current_record_count
            )
            logger.info(message)

        index_info_string = (
            'The previous index contained [{}] records. '
            'The new index contains [{}] records, a [{:.2f}%] change.'.format(
                previous_record_count, current_record_count, percentage_change * 100
            )
        )

        return record_count_is_sane, index_info_string
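The method depends on a `percentage_change` helper that is not shown in the snippet; a minimal sketch of what it might look like, assuming it returns the absolute relative change as a fraction (hypothetical, not necessarily the project's actual implementation):

    def percentage_change(self, current, previous):
        # Hypothetical helper: absolute relative change as a fraction of the
        # previous count, guarding against a previously empty index.
        if previous == 0:
            return float('inf') if current else 0.0
        return abs(current - previous) / float(previous)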
Example #2
def view_mappings(index='buddyupevents', doc_type='event'):
    """Return a Mapping of mappings.

    Usage: explore.view_mappings().to_dict()
    """
    m = Mapping.from_es(index, doc_type)
    return m
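A hedged usage sketch, assuming a default connection is already registered and the index above exists:

props = view_mappings().to_dict()['event']['properties']
for name, spec in props.items():
    print(name, spec.get('type'))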
Example #3
def main():
    nltk.download('vader_lexicon')

    # Prepare index mappings
    mapping = Mapping(DOC_TYPE)
    mapping.field('centroid', GeoPoint())
    mapping.field('timestamp_ms', Date())
    mapping.save(TARGET_INDEX)

    try:
        # API Documentation
        # https://developer.twitter.com/en/docs/tweets/filter-realtime/api-reference/post-statuses-filter
        streaming_api = TweetStreamer(CONSUMER_KEY, CONSUMER_SECRET,
                                      ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

        # Select bounding box here: http://boundingbox.klokantech.com
        mk_ltn_nham = '-1.0282,51.8575,-0.3249,52.2864'  # Milton Keynes + Luton + N'hampton
        uk = '-11.21,50.08,1.56,58.98'  # UK
        us_can = '-126.95,24.7,-59.68,50.01'  # US + Canada
        eu_nafr = '-30.2,26.5,52.9,71.0'  # Europe + North Africa

        # Keywords are expressed as a comma-separated list
        terms = 'gdpr'

        # Disclaimer 1: Twitter Streaming API cannot filter by terms AND location!
        # Disclaimer 2: The API returns an incredibly small subset of tweets...
        # streaming_api.statuses.filter(track=terms)
        streaming_api.statuses.filter(locations=uk)
    except ConnectionError as err:
        LOGGER.error('Connection error! %s', err)
Example #4
def sale_mapping():
    m = Mapping(SALE_DOC_TYPE)

    m.meta('dynamic', 'strict')

    m.field('id', 'integer')
    m.field('shop_id', 'integer')
    m.field('client_id', 'keyword')
    m.field('timestamp', 'date')
    m.field('price', 'integer')
    m.field('payment_type', 'keyword')

    products = Nested(
        properties={
            'product_id': 'keyword',
            'product_type': 'keyword',
            'product_price': 'integer',
            'parts': Nested(
                properties={
                    'part_id': 'keyword',
                    'warehouse_id': 'keyword',
                    'part_price': 'integer',
                }),
        })
    m.field('products', products)

    return m
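The function only builds the mapping object; a hedged sketch of persisting it, assuming `SALE_DOC_TYPE` is defined and a default connection has been registered (the index name 'sales' is illustrative):

from elasticsearch_dsl.connections import create_connection

create_connection(hosts=['localhost'])  # register the default connection
sale_mapping().save('sales')            # push the mapping, creating the index if needed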
Example #5
def _create_mapping(conn):
    """
    Actually create the mapping, including deleting it if it's there
    so we can create it.
    """

    # Delete the mapping if an older version exists.

    if conn.indices.exists_type(index=INDEX_NAME, doc_type=DOC_TYPE):
        conn.indices.delete_mapping(index=INDEX_NAME, doc_type=DOC_TYPE)

    mapping = Mapping(DOC_TYPE)
    mapping.field("id", "integer")
    mapping.field("course", "string", index="not_analyzed")
    mapping.field("description_path", "string", index="no")
    mapping.field("description", "string", index="analyzed")
    mapping.field("preview_url", "string", index="no")
    mapping.field("repository", "string", index="not_analyzed")
    mapping.field("resource_type", "string", index="not_analyzed")
    mapping.field("content_xml", "string", index="no")
    mapping.field("content_stripped", "string", index="analyzed")
    mapping.field("run", "string", index="not_analyzed")
    mapping.field("titlesort", "string", index="not_analyzed")
    mapping.field("title", "string", index="analyzed")

    mapping.field("xa_avg_grade", "float")
    mapping.field("xa_histogram_grade", "float")
    mapping.field("xa_nr_attempts", "integer")
    mapping.field("xa_nr_views", "integer")

    mapping.save(INDEX_NAME)
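The `string` / `not_analyzed` / `index='no'` options above are pre-Elasticsearch-5.x syntax; a hedged sketch of the modern equivalents (`keyword` replaces not-analyzed strings, `text` replaces analyzed ones, and `index=False` replaces `index='no'`):

    mapping = Mapping(DOC_TYPE)
    mapping.field("id", "integer")
    mapping.field("course", "keyword")                 # was string / not_analyzed
    mapping.field("description", "text")               # was string / analyzed
    mapping.field("content_xml", "text", index=False)  # was string / index='no'
    mapping.save(INDEX_NAME)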
Example #6
def create_sequence_index(index_name='', start=''):
    es = configM.elastic(mode=index_name)
    sequence_index = Index("sequence", using=es)
    if sequence_index.exists():
        logging.debug('sequence id %s already exists' % index_name)

    else:
        logging.debug('create sequence id %s starting at %s' % (index_name, start or 0))
        sequence_index.settings(
            number_of_shards=1,
            number_of_replicas=0
        )
        sequence_index.create()

        m = Mapping("sequence")
        m.meta("_all", enabled=False)
        m.meta("_source", enabled=False)
        m.save("sequence", using=es)

    if start:
        tasks = ('{"index": {"_index": "sequence", "_type": "sequence", "_id": "%s", "version": "%s", "version_type": "external"}}\n{}\n' %
                 (index_name, start))
        result = es.bulk(body=tasks)
        logging.debug('sequence id starting at %s: %s' % (start, result))
        return result
Example #7
def create_mapping():
    print "Creating mapping..."

    m = Mapping('movie')

    m.field('name', 'text')
    m.field('plot', 'text')
    m.field('genres', 'text')
    m.field('director', 'text')
    m.field('keywords', 'text')
    m.field('awards', 'text')
    m.field('stars', 'text')
    m.field('duration', 'text')
    m.field('actors', 'text')
    m.field('creators', 'text')
    m.field('description', 'text')

    m.field('ratingValue', 'float')
    m.field('ratingCount', 'integer')

    m.field('language', 'keyword')
    m.field('country', 'keyword')

    m.field('releaseDate', 'date')

    m.save('imdb')
Example #8
def index_table(df, index, conn, dtypes=None, streaming=True, chunksize=1000):
    """Index a pandas DataFrame into ES as a mapping.

    Parameters
    ----------
        df : pandas.DataFrame
            The dataframe to be indexed
        index : str
            Name of the index in which to insert the table.
        conn : elasticsearch.client.Elasticsearch
            The ES client / connection to use.
        dtypes : dict, optional
            Mapping from column names to ES field types.
            Leaving it empty (default) is not recommended,
            since it will affect the quality of search results.
        streaming : bool
            Whether to use the streaming ES bulk API. Useful for large datasets.
        chunksize : int
            The chunksize to use when streaming,
            ignored if streaming is False or chunksize > len(df).
    Returns
    -------
        tuple
            A 2-tuple containing (number of records successfully indexed, number of failures).
            Ideally this should be (len(df), 0)

    Example
    -------
    >>> import pandas as pd
    >>> from elasticsearch_dsl import field
    >>> from elasticsearch_dsl.connections import create_connection
    >>> df = pd.read_csv('iris.csv')
    >>> dtypes = {'Petal Length': field.Double(),
    ...           'Sepal Width': field.Double(),
    ...           'Species': field.Keyword()}
    >>> conn = create_connection(hosts=['localhost'])
    >>> index_table(df, 'iris', conn, dtypes)
    (150, 0)

    """
    m = Mapping()
    if dtypes is None:
        warn((
            'Attempting to find ES types for the dataframe. This may affect search results.'
            ' Consider manually mapping the types to ES field types.'))
        dtypes = _find_es_types(df)
    for c, estype in dtypes.items():
        m.field(c, estype)
    m.save(index)

    def _actions():
        for i, r in df.iterrows():
            yield {'_index': index, '_source': r.to_dict()}

    if chunksize < df.shape[0] and streaming:
        status = [c for c in streaming_bulk(conn, _actions(), chunksize)]
        n_success = sum([r[0] for r in status])
        return n_success, df.shape[0] - n_success
    return bulk(conn, _actions(), stats_only=True)
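`_find_es_types` is referenced but not shown; a minimal sketch, assuming it guesses an elasticsearch_dsl field object from each column's NumPy dtype kind (hypothetical helper):

from elasticsearch_dsl import field

def _find_es_types(df):
    # Hypothetical: map NumPy dtype kinds onto ES field types,
    # falling back to Text for objects/strings.
    kinds = {'i': field.Long(), 'u': field.Long(), 'f': field.Double(),
             'b': field.Boolean(), 'M': field.Date()}
    return {col: kinds.get(df[col].dtype.kind, field.Text()) for col in df.columns}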
Example #9
    def mapping(self, index, document):
        """ This method looks for the mapping in an index for a given document type

        :param index: Elasticsearch index
        :param document: type of document

        :returns: dictionary with the mapping
        """

        mapping = Mapping.from_es(index, document, using=self.es)
        return mapping.to_dict()[document]['properties']
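A hedged usage sketch, where `backend` stands in for an instance of the enclosing class with a configured `self.es` client (index and doc-type names are illustrative):

properties = backend.mapping('git', 'items')
for field_name, spec in properties.items():
    print(field_name, spec.get('type'))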
Example #10
 def create_mapping(cls, user_id):
     """Create elasticsearch mapping object for an user."""
     m = Mapping(cls.doc_type)
     m.meta('_all', enabled=True)
     m.field(
         'attachments',
         Nested(doc_class=IndexedMessageAttachment, include_in_all=True))
     m.field('body_html', 'text')
     m.field('body_plain', 'text')
     m.field('date', 'date')
     m.field('date_delete', 'date')
     m.field('date_insert', 'date')
     m.field('discussion_id', 'keyword')
     m.field(
         'external_references',
         Nested(doc_class=IndexedExternalReferences, include_in_all=True))
     m.field('identities',
             Nested(doc_class=IndexedIdentity, include_in_all=True))
     m.field('importance_level', 'short')
     m.field('is_answered', 'boolean')
     m.field('is_draft', 'boolean')
     m.field('is_unread', 'boolean')
     m.field('message_id', 'keyword')
     m.field('parent_id', 'keyword')
     participants = Nested(doc_class=IndexedParticipant,
                           include_in_all=True,
                           properties={
                               "address": 'keyword',
                               "contact_id": 'keyword',
                               "label": 'text',
                               "protocol": 'keyword',
                               "type": 'keyword'
                           })
     m.field('participants', participants)
     m.field('privacy_features', Nested(include_in_all=True))
     pi = Nested(doc_class=PIIndexModel,
                 include_in_all=True,
                 properties={
                     "technic": "integer",
                     "comportment": "integer",
                     "context": "integer",
                     "version": "integer",
                     "date_update": "date"
                 })
     m.field("pi", pi)
     m.field('raw_msg_id', "keyword")
     m.field('subject', 'text')
     m.field('tags',
             Nested(doc_class=IndexedResourceTag, include_in_all=True))
     m.field('type', 'keyword')
     m.save(using=cls.client(), index=user_id)
     return m
Example #11
    def _make_mapping(self):
        """
        Creates the index with the correct mapping
        :return:
        """
        m = Mapping()
        # add fields
        m.field('Title', 'text')
        m.field('Text', 'text')
        m.field('Publish_Date', 'date')  # date type complicates matters across websites
        m.field('URL', 'text')
        m.field('Scrape_Date', 'date')  # date type complicates matters across websites
        m.field('Source', 'text')
        m.field('Search_Keyword', 'text')  # save list as text?
        m.field('SE_Is_Risk', 'boolean')
        m.field('GP_Is_Risk', 'boolean')
        m.field('RG_Is_Risk', 'boolean')
        m.field('SE_Risk_Rating', 'float')
        m.field('GP_Risk_Rating', 'float')
        m.field('RG_Risk_Rating', 'float')
        m.field('SE_SnP_Open', 'float')
        m.field('SE_SnP_Close', 'float')
        m.field('SE_AbbV_Open', 'float')
        m.field('SE_AbbV_Close', 'float')
        m.field('SE_XBI_Open', 'float')
        m.field('SE_XBI_Close', 'float')
        m.field('SE_SnP_Open_Plus1', 'float')
        m.field('SE_SnP_Close_Plus1', 'float')
        m.field('SE_AbbV_Open_Plus1', 'float')
        m.field('SE_AbbV_Close_Plus1', 'float')
        m.field('SE_XBI_Open_Plus1', 'float')
        m.field('SE_XBI_Close_Plus1', 'float')
        m.field('SE_SentimentScore', 'float')
        m.field('SE_SentimentPolarity', 'float')
        m.field('CompositeScore', 'float')
        m.field('RG_FDA_Warning', 'boolean')
        m.field('GP_SentimentScore', 'float')
        m.field('GP_SentimentPolarity', 'float')
        m.field('GP_Location', 'text')
        m.field('GP_Country', 'text')
        m.field('Article_references', 'float')
        m.field('Is_source_type_RG', 'boolean')
        m.field('Is_source_type_SE', 'boolean')
        m.field('Is_source_type_GP', 'boolean')

        # save the mapping into the configured index
        try:
            m.save(self._index_name)
        except Exception as e:
            print("Could not save schema!", e)
Example #12
    def mapping_es(es, es_index):

        mapping = Mapping('items')
        mapping.field('author_name', String(index='not_analyzed'))
        mapping.field('first_commit', Date())
        mapping.field('last_commit', Date())
        mapping.field('commits', 'integer')
        mapping.field('author_org_name', String(index='not_analyzed'))
        mapping.field('repo_name', String(index='not_analyzed'))
        mapping.field('project', String(index='not_analyzed'))
        mapping.field('uuid', String(index='not_analyzed'))
        print("Uploading mapping to ElasticSearch")
        mapping.save(es_index, using=es)
Example #13
def traffic_mapping():
    m = Mapping(TRAFFIC_DOC_TYPE)

    m.meta('dynamic', 'strict')

    m.field('id', 'integer')
    m.field('shop_id', 'integer')
    m.field('timestamp', 'date')
    m.field('duration', 'integer')
    m.field('incoming_traffic', 'integer')
    m.field('outgoing_traffic', 'integer')

    return m
Example #14
    def get_mapping(cls):
        m = Mapping(cls.get_doc_type())
        m.meta('dynamic', 'strict')

        nested_properties = {}
        fields_to_nest = []

        # First pass: treat "standard" fields and gather info for nested fields
        for field in cls._fields:
            if isinstance(field, NestedField):
                nested_properties[field.key] = (field, {})

            if field.parent:
                # Fields to nest, we'll deal with it later
                fields_to_nest.append(field)
            else:
                # "Standard" field, we're good to go
                m.field(field.storage_field, field.type)

        # We deal with nested fields now
        for field in fields_to_nest:
            # Sanity check
            if field.parent not in nested_properties:
                raise Exception(
                    'Nested field {} needs to be defined in {}'.format(
                        field.parent, str(cls)))

            _, properties = nested_properties[field.parent]
            properties[field.storage_field] = field.type

        # A pass to resolve deeply nested fields (nested within nested)
        # FIXME: Not confident this works for all nested configurations :o
        for field, properties in nested_properties.values():
            if not field.parent:
                # First level nested field, ignore for now
                continue

            _, parent_properties = nested_properties[field.parent]
            parent_properties[field.key] = Nested(properties=properties)

        # Final pass for first level nested fields
        for field, properties in nested_properties.values():
            if field.parent:
                # Already dealt with deeply nested fields
                continue

            m.field(field.key, Nested(properties=properties))

        return m
Example #15
    def add_mapping_to_index(self, lang_code, lang_analyzer, delete_old_index=False, kuromoji_synonyms=None):
        """
        Add or update the mail/irc mapping on the ES index, create or update the required analyzers, and add fields.

        :param lang_code: ``str`` Language of index e.g. 'ja'
        :param lang_analyzer: ``str`` Name of analyzer for language e.g. 'kuromoji', 'standard' etc.
        :param delete_old_index: ``bool`` Delete the index if it exists. Default: False = update the existing index (close, update, reopen)
        :param kuromoji_synonyms: ``list`` Synonyms for the kuromoji Japanese analyzer.
            Old synonyms are kept if this list is empty and the index is not deleted
        :return: None
        """
        if kuromoji_synonyms is None:
            kuromoji_synonyms = []
        analyzer_lang = helpers.get_analyzer(lang_analyzer, delete_old_index=delete_old_index,
                                             user_dictionary_file=self._user_dictionary_file,
                                             synonyms=kuromoji_synonyms)
        analyzer_case_insensitive_sort = analysis.analyzer('case_insensitive_sort',
                                                           tokenizer=analysis.tokenizer('keyword'),
                                                           filter=['lowercase'])
        mapping = Mapping(self._type_name)
        reopen_index = False
        index_name = self._index_prefix.format(lang_code)
        if self._es.indices.exists(index=index_name):
            if delete_old_index:
                self._es.indices.delete(index=index_name, ignore=[400, 404])
            else:
                self._es.indices.close(index=index_name)
                reopen_index = True
                mapping = Mapping.from_es(index_name, self._type_name, using=self._es)  # Get existing index from server

        self.add_mapping_fields(mapping, analyzer_lang, analyzer_case_insensitive_sort)

        mapping.save(index_name, using=self._es)  # Insert or update

        if reopen_index:
            self._es.indices.open(index=index_name)
Example #16
    def _create_index(self):
        dt = datetime.utcnow()
        dt = dt.strftime('%Y.%m')
        es = connections.get_connection()
        if not es.indices.exists('indicators-{}'.format(dt)):
            index = Index('indicators-{}'.format(dt))
            index.aliases(live={})
            index.doc_type(Indicator)
            index.create()

            m = Mapping('indicator')
            m.field('indicator_ipv4', 'ip')
            m.field('indicator_ipv4_mask', 'integer')
            m.save('indicators-{}'.format(dt))
        return 'indicators-{}'.format(dt)
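Because each monthly index is aliased to `live`, searches can target the alias instead of a dated index name; a hedged usage sketch, assuming a default connection is registered:

from elasticsearch_dsl import Search

# Query all monthly indicator indices through the shared 'live' alias.
s = Search(index='live').query('term', indicator_ipv4_mask=32)
response = s.execute()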
Example #17
def write_user_mapping(index='buddyupusers', doc_type='user'):
    """Write the `user` mapping for buddy up."""
    m = Mapping(doc_type)

    public = Object()
    public.field('first_name', 'string', index='not_analyzed')
    public.field('last_name', 'string', index='not_analyzed')
    public.field('signed_up_at', 'date')
    m.field('public', public)

    groups = Nested()
    groups.field('creator', 'string', index='not_analyzed')
    groups.field('group_id', 'string', index='not_analyzed')
    groups.field('school_id', 'string', index='not_analyzed')
    groups.field('subject', 'string', index='not_analyzed')
    groups.field('subject_code', 'string', index='not_analyzed')
    groups.field('subject_icon', 'string', index='not_analyzed')
    groups.field('start', 'date')
    groups.field('end', 'date')
    m.field('groups', groups)

    private = Object()
    m.field('private', private)

    internal = Object()
    m.field('internal', internal)

    schools = Nested()
    m.field('schools', schools)

    classes = Nested()
    classes.field('course_id', 'string', index='not_analyzed')
    classes.field('id', 'string', index='not_analyzed')
    classes.field('school_id', 'string', index='not_analyzed')
    classes.field('subject_icon', 'string', index='not_analyzed')
    m.field('classes', classes)

    buddies = Nested()
    buddies.field('user_id', 'string', index='not_analyzed')
    buddies.field('first_name', 'string', index='not_analyzed')
    buddies.field('last_name', 'string', index='not_analyzed')
    m.field('buddies', buddies)

    buddies_outgoing = Nested()
    buddies_outgoing.field('user_id', 'string', index='not_analyzed')
    m.field('buddies_outgoing', buddies_outgoing)

    m.save(index)
Example #18
    def _create_index(self):
        # https://github.com/csirtgadgets/massive-octo-spice/blob/develop/elasticsearch/observables.json
        # http://elasticsearch-py.readthedocs.org/en/master/api.html#elasticsearch.Elasticsearch.bulk
        dt = datetime.utcnow()
        dt = dt.strftime('%Y.%m')
        es = connections.get_connection()
        if not es.indices.exists('indicators-{}'.format(dt)):
            index = Index('indicators-{}'.format(dt))
            index.aliases(live={})
            index.doc_type(Indicator)
            index.create()

            m = Mapping('indicator')
            m.field('indicator_ipv4', 'ip')
            m.field('indicator_ipv4_mask', 'integer')
            m.save('indicators-{}'.format(dt))
        return 'indicators-{}'.format(dt)
Example #19
def test_doc_type_can_be_set():
    i = Index('i', doc_type='t')
    m = Mapping('t')
    m.field('title', Text())
    i.mapping(m)

    assert {
        'mappings': {
            't': {
                'properties': {
                    'title': {
                        'type': 'text'
                    }
                }
            }
        }
    } == i.to_dict()
Example #20
def _create_index():
    # https://github.com/csirtgadgets/massive-octo-spice/blob/develop/elasticsearch/observables.json
    # http://elasticsearch-py.readthedocs.org/en/master/api.html#elasticsearch.Elasticsearch.bulk
    idx = _current_index()
    es = connections.get_connection()
    if not es.indices.exists(idx):
        index = Index(idx)
        index.aliases(live={})
        index.doc_type(Indicator)
        index.create()

        m = Mapping('indicator')
        m.field('indicator_ipv4', 'ip')
        m.field('indicator_ipv4_mask', 'integer')
        m.field('lasttime', 'date')
        m.save(idx)
    return idx
Example #21
def _get_es_facets():
    """Returns a dict from UI facet name to Elasticsearch facet object."""
    using = Elasticsearch(app.app.config['ELASTICSEARCH_URL'])
    try:
        mapping = Mapping.from_es(app.app.config['INDEX_NAME'],
                                  'type',
                                  using=using).to_dict()
    except TransportError as e:
        if 'index_not_found_exception' in e.error:
            app.app.logger.error('Index %s not found at %s' %
                                 (app.app.config['INDEX_NAME'],
                                  app.app.config['ELASTICSEARCH_URL']))
        raise e

    config_path = os.path.join(app.app.config['DATASET_CONFIG_DIR'], 'ui.json')
    facets_config = _parse_json_file(config_path)['facets']

    # Preserve order, so facets are returned in same order as the config file.
    facets = OrderedDict()

    for facet_config in facets_config:
        field_name = facet_config['elasticsearch_field_name']
        if field_name not in mapping['type']['properties']:
            raise ValueError(
                'elasticsearch_field_name %s not found in Elasticsearch index %s'
                % (field_name, app.app.config['INDEX_NAME']))
        field_type = mapping['type']['properties'][field_name]['type']
        ui_facet_name = facet_config['ui_facet_name']
        if field_type == 'text':
            # Use ".keyword" because we want aggregation on keyword field, not
            # term field. See
            # https://www.elastic.co/guide/en/elasticsearch/reference/6.2/fielddata.html#before-enabling-fielddata
            facets[ui_facet_name] = TermsFacet(field=field_name + '.keyword',
                                               size=20)
        else:
            # Assume numeric type.
            # TODO: Handle other types.
            # TODO: Automatically figure out bucket intervals. Unfortunately
            # Elasticsearch won't do this for us
            # (https://github.com/elastic/elasticsearch/issues/9572). Make the
            # ranges easy to read (10-19,20-29 instead of 10-17,18-25).
            facets[ui_facet_name] = HistogramFacet(field=field_name,
                                                   interval=10)
    app.app.logger.info('Elasticsearch facets: %s' % facets)
    return facets
Example #22
    def get_es_mapping(self):
        """Setup mapping (data scheme).

        .. note::
            You will probably want to change the analyzer and boost value.
            Also consider the ``index='not_analyzed'`` option to improve performance.

        See https://elasticsearch-dsl.readthedocs.io/en/latest/persistence.html#mappings

        .. attention::
            You *may* want to override this method (otherwise ES chooses the mapping by itself).

        :return: mapping object
        :rtype: elasticsearch_dsl.Mapping
        """

        es_mapping = Mapping(self.get_es_document_type())
        return es_mapping
Example #23
def write_event_mapping(index='buddyupevents', doc_type='event'):
    """Write the `event` mapping for an ES index and doc type.

    http://elasticsearch-dsl.readthedocs.org/en/latest/persistence.html#mappings
    """
    m = Mapping(doc_type)
    data = Object()  # elasticsearch_dsl field object
    data.field('password', 'string', index='no', include_in_all=False, store=False)
    data.field('first_name', 'string', index='not_analyzed')
    data.field('last_name', 'string', index='not_analyzed')
    data.field('accepted_by', 'string', index='not_analyzed')
    data.field('accepted_by_last_name', 'string', index='not_analyzed')
    data.field('accepted_by_first_name', 'string', index='not_analyzed')
    data.field('requested_by', 'string', index='not_analyzed')
    data.field('requested_by_last_name', 'string', index='not_analyzed')
    data.field('requested_by_first_name', 'string', index='not_analyzed')
    data.field('recipient', 'string', index='not_analyzed')
    data.field('recipient_first_name', 'string', index='not_analyzed')
    data.field('recipient_last_name', 'string', index='not_analyzed')
    data.field('sender', 'string', index='not_analyzed')
    data.field('sender_first_name', 'string', index='not_analyzed')
    data.field('sender_last_name', 'string', index='not_analyzed')
    data.field('email', 'string', index='not_analyzed')
    data.field('sent_at', 'date')
    data.field('start', 'date')
    data.field('end', 'date')

    device = Object()
    device.field('manufacturer', 'string', index='not_analyzed')
    device.field('model', 'string', index='not_analyzed')
    device.field('platform', 'string', index='not_analyzed')
    data.field('device', device)
    m.field('data', data)

    m.field('created_at', 'date')
    m.field('first_name', 'string', index='not_analyzed')
    m.field('last_name', 'string', index='not_analyzed')
    m.field('id', 'string', index='not_analyzed')
    m.field('creator', 'string', index='not_analyzed')
    m.field('involved', 'string', index='not_analyzed')
    m.field('profile_pic_url_tiny', 'string', index='not_analyzed')
    m.field('type', 'string', index='not_analyzed')
    m.save(index)
Example #24
    def handle(self, *args, **options):
        from elasticsearch import Elasticsearch
        from elasticsearch_dsl import Index, Mapping
        ELASTICSEARCH_HOSTS = ['localhost']
        ELASTICSEARCH_INDEX = 'kindle2'

        es = Elasticsearch(ELASTICSEARCH_HOSTS)
        newindex = Index(ELASTICSEARCH_INDEX, using=es)
        if newindex.exists():
            exit('index already exists, choose a new name')
        mp = Mapping()
        mp.field('title', 'text')
        mp.field('creator', 'text')
        mp.field('publisher', 'text')
        mp.field('iclass', 'text')
        mp.field('isbn', 'text')
        mp.field('asin', 'keyword')

        newindex.mapping(mp)
        newindex.create()
Example #25
    def construct_model_class(self, entity_cls):
        """Return a fully-baked Model class for a given Entity class"""
        model_cls = None

        # Return the model class if it was already seen/decorated
        if entity_cls.meta_.schema_name in self._model_classes:
            model_cls = self._model_classes[entity_cls.meta_.schema_name]
        else:
            from protean.core.model import ModelMeta

            meta_ = ModelMeta()
            meta_.entity_cls = entity_cls

            # Construct Inner Index class with options
            options = {}
            options["name"] = self.derive_schema_name(entity_cls)
            if "SETTINGS" in self.conn_info and self.conn_info["SETTINGS"]:
                options["settings"] = self.conn_info["SETTINGS"]

            index_cls = type("Index", (object, ), options)

            attrs = {"meta_": meta_, "Index": index_cls}

            # FIXME Ensure the custom model attributes are constructed properly
            model_cls = type(entity_cls.__name__ + "Model",
                             (ElasticsearchModel, ), attrs)

            # Create Dynamic Mapping and associate with index
            # FIXME Expand to all types of fields
            id_field_name = id_field(entity_cls).field_name
            m = Mapping()
            m.field(id_field_name, Keyword())

            model_cls._index.mapping(m)

            # Memoize the constructed model class
            self._model_classes[entity_cls.meta_.schema_name] = model_cls

        # Set Entity Class as a class level attribute for the Model, to be able to reference later.
        return model_cls
Example #26
File: utils.py Project: olabi/lore
def ensure_vocabulary_mappings(term_info):
    """
    Ensure the mapping is properly set in Elasticsearch to always do exact
    matches on taxonomy terms. Accepts the output of get_resource_terms.

    Calling this function during indexing means that vocabularies do not
    need to be added to the mapping in advance. This deals with the fact
    that vocabularies can be added on-the-fly without having to play around
    with extra signals.

    Args:
        term_info (dict): Details of terms for a group of LearningResources.
    """
    if len(term_info) == 0:
        return

    get_conn()  # We don't need the return value; just for it to exist.

    # Retrieve current mapping from Elasticsearch.
    mapping = Mapping.from_es(index=INDEX_NAME, doc_type=DOC_TYPE)

    # Get the field names from the mapping.
    existing_vocabs = set(mapping.to_dict()["learningresource"]["properties"])

    # Get all the taxonomy names from the data.
    vocab_ids = set()
    for vocab_terms in term_info.values():
        for vocab_id in vocab_terms.keys():
            vocab_ids.add(vocab_id)
    updated = False
    # Add vocabulary to mapping if necessary.
    for vocab_id in vocab_ids:
        vocab_key = make_vocab_key(vocab_id)
        if vocab_key in existing_vocabs:
            continue
        mapping.field(vocab_key, "string", index="not_analyzed")
        updated = True
    if updated:
        mapping.save(INDEX_NAME)
        refresh_index()
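`make_vocab_key` is referenced but not shown; a minimal sketch, assuming it simply namespaces the vocabulary id so dynamically added fields cannot collide with built-in ones (hypothetical helper):

def make_vocab_key(vocab_id):
    # Hypothetical helper: prefix vocabulary ids to build safe field names.
    return "vocab_{0}".format(vocab_id)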
Example #27
    def __init__(self, name, bases, attrs):
        meta = attrs.pop('Meta', None)
        self.meta = meta
        self.index = getattr(meta, 'index', None)
        self.doc_type = getattr(meta, 'doc_type', None)
        self._using = getattr(meta, 'using', None)
        self.mapping = getattr(meta, 'mapping', Mapping(self.doc_type))
        self.serializer = getattr(meta, 'serializer', None)

        for name, value in list(attrs.items()):
            if isinstance(value, Field):
                self.mapping.field(name, value)
                del attrs[name]

        for name in dir(meta):
            if isinstance(getattr(meta, name, None), MetaField):
                params = getattr(meta, name)
                self.mapping.meta(name, *params.args, **params.kwargs)

        for b in bases:
            if hasattr(b, '_doc_type') and hasattr(b._doc_type, 'mapping'):
                self.mapping.update(b._doc_type.mapping, update_only=True)
                self._using = self._using or b._doc_type._using
                self.index = self.index or b._doc_type.index
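The metaclass pulls `MetaField` attributes off `Meta`; a minimal sketch of that container, assuming it simply records the arguments to forward to `mapping.meta()` (this mirrors, but is not guaranteed to match, the library's own class):

class MetaField(object):
    # Container for arguments forwarded to Mapping.meta().
    def __init__(self, *args, **kwargs):
        self.args, self.kwargs = args, kwargs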
Example #28
 class Meta:
     mapping = Mapping('my_d')
     mapping.meta('_all', enabled=False)
Example #29
 class Meta:
     mapping = Mapping('doc')
Example #30
def test_mapping(setup_es):
    """Test the ES mapping for an order."""
    mapping = Mapping.from_es(OrderSearchApp.es_model.get_write_index(),
                              OrderSearchApp.name)

    assert mapping.to_dict() == {
        'order': {
            'dynamic': 'false',
            'properties': {
                'assignees': {
                    'include_in_parent': True,
                    'properties': {
                        'dit_team': {
                            'include_in_parent': True,
                            'properties': {
                                'id': {
                                    'type': 'keyword',
                                },
                                'name': {
                                    'analyzer': 'lowercase_keyword_analyzer',
                                    'fielddata': True,
                                    'type': 'text',
                                },
                            },
                            'type': 'nested',
                        },
                        'first_name': {
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                        'id': {
                            'type': 'keyword',
                        },
                        'last_name': {
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                        'name': {
                            'copy_to': ['assignees.name_trigram'],
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                        'name_trigram': {
                            'analyzer': 'trigram_analyzer',
                            'type': 'text',
                        },
                    },
                    'type': 'nested',
                },
                'billing_address_1': {
                    'type': 'text',
                },
                'billing_address_2': {
                    'type': 'text',
                },
                'billing_address_country': {
                    'include_in_parent': True,
                    'properties': {
                        'id': {
                            'type': 'keyword',
                        },
                        'name': {
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                    },
                    'type': 'nested',
                },
                'billing_address_county': {
                    'analyzer': 'lowercase_keyword_analyzer',
                    'fielddata': True,
                    'type': 'text',
                },
                'billing_address_postcode': {
                    'type': 'text',
                },
                'billing_address_town': {
                    'analyzer': 'lowercase_keyword_analyzer',
                    'fielddata': True,
                    'type': 'text',
                },
                'billing_contact_name': {
                    'type': 'text',
                },
                'billing_company_name': {
                    'type': 'text',
                },
                'billing_email': {
                    'analyzer': 'lowercase_keyword_analyzer',
                    'fielddata': True,
                    'type': 'text',
                },
                'billing_phone': {
                    'analyzer': 'lowercase_keyword_analyzer',
                    'fielddata': True,
                    'type': 'text',
                },
                'cancellation_reason': {
                    'include_in_parent': True,
                    'properties': {
                        'id': {
                            'type': 'keyword',
                        },
                        'name': {
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                    },
                    'type': 'nested',
                },
                'cancelled_by': {
                    'include_in_parent': True,
                    'properties': {
                        'first_name': {
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                        'id': {
                            'type': 'keyword',
                        },
                        'last_name': {
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                        'name': {
                            'copy_to': ['cancelled_by.name_trigram'],
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                        'name_trigram': {
                            'analyzer': 'trigram_analyzer',
                            'type': 'text',
                        },
                    },
                    'type': 'nested',
                },
                'cancelled_on': {
                    'type': 'date',
                },
                'company': {
                    'include_in_parent': True,
                    'properties': {
                        'id': {
                            'type': 'keyword',
                        },
                        'name': {
                            'copy_to': ['company.name_trigram'],
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                        'name_trigram': {
                            'analyzer': 'trigram_analyzer',
                            'type': 'text',
                        },
                        'trading_name': {
                            'copy_to': ['company.trading_name_trigram'],
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                        'trading_name_trigram': {
                            'analyzer': 'trigram_analyzer',
                            'type': 'text',
                        },
                    },
                    'type': 'nested',
                },
                'completed_by': {
                    'include_in_parent': True,
                    'properties': {
                        'first_name': {
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                        'id': {
                            'type': 'keyword',
                        },
                        'last_name': {
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                        'name': {
                            'copy_to': ['completed_by.name_trigram'],
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                        'name_trigram': {
                            'analyzer': 'trigram_analyzer',
                            'type': 'text',
                        },
                    },
                    'type': 'nested',
                },
                'completed_on': {
                    'type': 'date',
                },
                'contact': {
                    'include_in_parent': True,
                    'properties': {
                        'first_name': {
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                        'id': {
                            'type': 'keyword',
                        },
                        'last_name': {
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                        'name': {
                            'copy_to': ['contact.name_trigram'],
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                        'name_trigram': {
                            'analyzer': 'trigram_analyzer',
                            'type': 'text',
                        },
                    },
                    'type': 'nested',
                },
                'contact_email': {
                    'analyzer': 'lowercase_keyword_analyzer',
                    'fielddata': True,
                    'type': 'text',
                },
                'contact_phone': {
                    'type': 'keyword',
                },
                'contacts_not_to_approach': {
                    'type': 'text',
                },
                'created_by': {
                    'include_in_parent': True,
                    'properties': {
                        'first_name': {
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                        'id': {
                            'type': 'keyword',
                        },
                        'last_name': {
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                        'name': {
                            'copy_to': ['created_by.name_trigram'],
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                        'name_trigram': {
                            'analyzer': 'trigram_analyzer',
                            'type': 'text',
                        },
                        'dit_team': {
                            'include_in_parent': True,
                            'properties': {
                                'id': {
                                    'type': 'keyword',
                                },
                                'name': {
                                    'analyzer': 'lowercase_keyword_analyzer',
                                    'fielddata': True,
                                    'type': 'text',
                                },
                            },
                            'type': 'nested',
                        },
                    },
                    'type': 'nested',
                },
                'created_on': {
                    'type': 'date',
                },
                'delivery_date': {
                    'type': 'date',
                },
                'description': {
                    'analyzer': 'english_analyzer',
                    'type': 'text',
                },
                'discount_value': {
                    'index': False,
                    'type': 'integer',
                },
                'existing_agents': {
                    'index': False,
                    'type': 'text',
                },
                'further_info': {
                    'type': 'text',
                },
                'id': {
                    'type': 'keyword',
                },
                'modified_on': {
                    'type': 'date',
                },
                'net_cost': {
                    'index': False,
                    'type': 'integer',
                },
                'paid_on': {
                    'type': 'date',
                },
                'payment_due_date': {
                    'type': 'date',
                },
                'po_number': {
                    'index': False,
                    'type': 'keyword',
                },
                'primary_market': {
                    'include_in_parent': True,
                    'properties': {
                        'id': {
                            'type': 'keyword',
                        },
                        'name': {
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                    },
                    'type': 'nested',
                },
                'reference': {
                    'copy_to': ['reference_trigram'],
                    'analyzer': 'lowercase_keyword_analyzer',
                    'fielddata': True,
                    'type': 'text',
                },
                'reference_trigram': {
                    'analyzer': 'trigram_analyzer',
                    'type': 'text',
                },
                'sector': {
                    'include_in_parent': True,
                    'properties': {
                        'id': {
                            'type': 'keyword',
                        },
                        'name': {
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                        'ancestors': {
                            'include_in_parent': True,
                            'properties': {
                                'id': {
                                    'type': 'keyword',
                                },
                            },
                            'type': 'nested',
                        },
                    },
                    'type': 'nested',
                },
                'uk_region': {
                    'include_in_parent': True,
                    'properties': {
                        'id': {
                            'type': 'keyword',
                        },
                        'name': {
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                    },
                    'type': 'nested',
                },
                'service_types': {
                    'include_in_parent': True,
                    'properties': {
                        'id': {
                            'type': 'keyword',
                        },
                        'name': {
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                    },
                    'type': 'nested',
                },
                'status': {
                    'analyzer': 'lowercase_keyword_analyzer',
                    'fielddata': True,
                    'type': 'text',
                },
                'subscribers': {
                    'include_in_parent': True,
                    'properties': {
                        'dit_team': {
                            'include_in_parent': True,
                            'properties': {
                                'id': {
                                    'type': 'keyword',
                                },
                                'name': {
                                    'analyzer': 'lowercase_keyword_analyzer',
                                    'fielddata': True,
                                    'type': 'text',
                                },
                            },
                            'type': 'nested',
                        },
                        'first_name': {
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                        'id': {
                            'type': 'keyword',
                        },
                        'last_name': {
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                        'name': {
                            'copy_to': ['subscribers.name_trigram'],
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                        'name_trigram': {
                            'analyzer': 'trigram_analyzer',
                            'type': 'text',
                        },
                    },
                    'type': 'nested',
                },
                'subtotal_cost': {
                    'copy_to': ['subtotal_cost_string'],
                    'type': 'integer',
                },
                'subtotal_cost_string': {
                    'type': 'keyword',
                },
                'total_cost': {
                    'copy_to': ['total_cost_string'],
                    'type': 'integer',
                },
                'total_cost_string': {
                    'type': 'keyword',
                },
                'vat_cost': {
                    'index': False,
                    'type': 'integer',
                },
                'vat_number': {
                    'index': False,
                    'type': 'keyword',
                },
                'vat_status': {
                    'index': False,
                    'type': 'keyword',
                },
                'vat_verified': {
                    'index': False,
                    'type': 'boolean',
                },
            },
        },
    }