def sanity_check_new_index(self, attempt, document, new_index_name, previous_record_count):
    """ Ensure that we do not point to an index that looks like it has missing data. """
    current_record_count = self.get_record_count(document)
    percentage_change = self.percentage_change(current_record_count, previous_record_count)

    # Verify there was not a big shift in record count
    record_count_is_sane = percentage_change < settings.INDEX_SIZE_CHANGE_THRESHOLD

    # Spot check a known-flaky field type to detect VAN-391
    aggregation_type = Mapping.from_es(new_index_name)['aggregation_key'].name
    record_count_is_sane = record_count_is_sane and aggregation_type == 'keyword'

    if not record_count_is_sane:
        conn = get_connection()
        alternate_current_record_count = conn.search(
            {"query": {"match_all": {}}}, index=new_index_name
        ).get('hits', {}).get('total', {}).get('value', 0)
        message = '''
        Sanity check failed for attempt #{0}.
        Index name: {1}
        Percentage change: {2}
        Previous record count: {3}
        Base record count: {4}
        Search record count: {5}
        Aggregation key type: {6}
        '''.format(
            attempt,
            new_index_name,
            str(int(round(percentage_change * 100, 0))) + '%',
            previous_record_count,
            current_record_count,
            alternate_current_record_count,
            aggregation_type,
        )
        logger.info(message)
        logger.info('...sleeping for 5 seconds...')
        time.sleep(5)
    else:
        message = '''
        Sanity check passed for attempt #{0}.
        Index name: {1}
        Percentage change: {2}
        Previous record count: {3}
        Current record count: {4}
        '''.format(
            attempt,
            new_index_name,
            str(int(round(percentage_change * 100, 0))) + '%',
            previous_record_count,
            current_record_count
        )
        logger.info(message)

    index_info_string = (
        'The previous index contained [{}] records. '
        'The new index contains [{}] records, a [{:.2f}%] change.'.format(
            previous_record_count, current_record_count, percentage_change * 100
        )
    )

    return record_count_is_sane, index_info_string
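The check above leans on a `percentage_change` helper that is not shown here. A minimal sketch of what it plausibly computes, given that the result is compared against a threshold and rendered as a percentage (this is an assumption, not the original implementation):

# Hypothetical helper, inferred from how the result is used above:
# returns the absolute relative change as a fraction (0.05 == 5%).
def percentage_change(self, current, previous):
    if previous == 0:
        return float('inf') if current else 0.0
    return abs(current - previous) / float(previous)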
def view_mappings(index='buddyupevents', doc_type='event'):
    """Return a Mapping of mappings.

    Usage:
    explore.view_mappings().to_dict()
    """
    m = Mapping.from_es(index, doc_type)
    return m
def main():
    nltk.download('vader_lexicon')

    # Prepare index mappings
    mapping = Mapping(DOC_TYPE)
    mapping.field('centroid', GeoPoint())
    mapping.field('timestamp_ms', Date())
    mapping.save(TARGET_INDEX)

    try:
        # API Documentation
        # https://developer.twitter.com/en/docs/tweets/filter-realtime/api-reference/post-statuses-filter
        streaming_api = TweetStreamer(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

        # Select bounding box here: http://boundingbox.klokantech.com
        mk_ltn_nham = '-1.0282,51.8575,-0.3249,52.2864'  # Milton Keynes + Luton + N'hampton
        uk = '-11.21,50.08,1.56,58.98'  # UK
        us_can = '-126.95,24.7,-59.68,50.01'  # US + Canada
        eu_nafr = '-30.2,26.5,52.9,71.0'  # Europe + north africa

        # Keywords are expressed as a comma-separated list
        terms = 'gdpr'

        # Disclaimer 1: Twitter Streaming API cannot filter by terms AND location!
        # Disclaimer 2: The API returns an incredibly small subset of tweets...
        # streaming_api.statuses.filter(track=terms)
        streaming_api.statuses.filter(locations=uk)
    except ConnectionError as err:
        LOGGER.error('Connection error! %s', err)
def sale_mapping():
    m = Mapping(SALE_DOC_TYPE)
    m.meta('dynamic', 'strict')
    m.field('id', 'integer')
    m.field('shop_id', 'integer')
    m.field('client_id', 'keyword')
    m.field('timestamp', 'date')
    m.field('price', 'integer')
    m.field('payment_type', 'keyword')
    products = Nested(
        properties={
            'product_id': 'keyword',
            'product_type': 'keyword',
            'product_price': 'integer',
            'parts': Nested(
                properties={
                    'part_id': 'keyword',
                    'warehouse_id': 'keyword',
                    'part_price': 'integer',
                }),
        })
    m.field('products', products)
    return m
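A short usage sketch for the mapping above. The index name 'sales' and the default connection are assumptions; note that nested fields have to be queried with a `nested` query to match within a single product object:

# Hypothetical usage of sale_mapping(); 'sales' is an assumed index name and a
# default connection (connections.create_connection) is assumed to exist.
from elasticsearch_dsl import Search

m = sale_mapping()
m.save('sales')  # create or update the mapping on the 'sales' index

# Nested fields are matched per-object with a nested query:
s = Search(index='sales').query(
    'nested',
    path='products',
    query={'match': {'products.product_type': 'subscription'}},
)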
def _create_mapping(conn):
    """
    Actually create the mapping, including deleting it if it's
    there so we can create it.
    """
    # Delete the mapping if an older version exists.
    if conn.indices.exists_type(index=INDEX_NAME, doc_type=DOC_TYPE):
        conn.indices.delete_mapping(index=INDEX_NAME, doc_type=DOC_TYPE)
    mapping = Mapping(DOC_TYPE)
    mapping.field("id", "integer")
    mapping.field("course", "string", index="not_analyzed")
    mapping.field("description_path", "string", index="no")
    mapping.field("description", "string", index="analyzed")
    mapping.field("preview_url", "string", index="no")
    mapping.field("repository", "string", index="not_analyzed")
    mapping.field("resource_type", "string", index="not_analyzed")
    mapping.field("content_xml", "string", index="no")
    mapping.field("content_stripped", "string", index="analyzed")
    mapping.field("run", "string", index="not_analyzed")
    mapping.field("titlesort", "string", index="not_analyzed")
    mapping.field("title", "string", index="analyzed")
    mapping.field("xa_avg_grade", "float")
    mapping.field("xa_histogram_grade", "float")
    mapping.field("xa_nr_attempts", "integer")
    mapping.field("xa_nr_views", "integer")
    mapping.save(INDEX_NAME)
def create_sequence_index(index_name='', start=''):
    es = configM.elastic(mode=index_name)
    sequence_index = Index("sequence", using=es)
    if sequence_index.exists():
        logging.debug('sequence id %s already exists' % index_name)
    else:
        logging.debug('create sequence id %s starting at %s' % (index_name, start or 0))
        sequence_index.settings(
            number_of_shards=1,
            number_of_replicas=0
        )
        sequence_index.create()
        m = Mapping("sequence")
        m.meta("_all", enabled=False)
        m.meta("_source", enabled=False)
        m.save("sequence", using=es)
    if start:
        tasks = ('{"index": {"_index": "sequence", "_type": "sequence", "_id": "%s", '
                 '"version": "%s", "version_type": "external"}}\n{}\n' % (index_name, start))
        result = es.bulk(body=tasks)
        logging.debug('sequence id starting at %s: %s' % (start, result))
        return result
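The bulk request above seeds a counter document whose `_version` is the sequence value; external versioning lets it start at an arbitrary number. A hedged sketch of how the next id could then be drawn from that counter (a hypothetical helper, not part of the original code): each re-index of the empty document bumps `_version` by one.

# Hypothetical helper built on the same trick: indexing the (empty) counter
# document again increments its _version, which becomes the next id.
def next_sequence_id(es, index_name):
    result = es.index(index='sequence', doc_type='sequence', id=index_name, body={})
    return result['_version']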
def create_mapping():
    print("Creating mapping...")
    m = Mapping('movie')
    m.field('name', 'text')
    m.field('plot', 'text')
    m.field('genres', 'text')
    m.field('director', 'text')
    m.field('keywords', 'text')
    m.field('awards', 'text')
    m.field('stars', 'text')
    m.field('duration', 'text')
    m.field('actors', 'text')
    m.field('creators', 'text')
    m.field('description', 'text')
    m.field('ratingValue', 'float')
    m.field('ratingCount', 'integer')
    m.field('language', 'keyword')
    m.field('country', 'keyword')
    m.field('releaseDate', 'date')
    m.save('imdb')
def index_table(df, index, conn, dtypes=None, streaming=True, chunksize=1000):
    """Index a pandas DataFrame into ES as a mapping.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe to be indexed
    index : str
        Name of the index in which to insert the table.
    conn : elasticsearch.client.Elasticsearch
        The ES client / connection to use.
    dtypes : dict, optional
        Mapping of column names to ES field types. Leaving it empty (default)
        is not recommended, since it will affect the quality of search results.
    streaming : bool
        Whether to use the streaming ES bulk API. Useful for large datasets.
    chunksize : int
        The chunksize to use when streaming, ignored if streaming is False
        or chunksize > len(df).

    Returns
    -------
    tuple
        A 2-tuple containing (number of records successfully indexed,
        number of failures). Ideally this should be (len(df), 0)

    Example
    -------
    >>> import pandas as pd
    >>> from elasticsearch_dsl import field
    >>> from elasticsearch_dsl.connections import create_connection
    >>> df = pd.read_csv('iris.csv')
    >>> dtypes = {'Petal Length': field.Double(),
    ...           'Sepal Width': field.Double(),
    ...           'Species': field.Keyword()}
    >>> conn = create_connection(hosts=['localhost'])
    >>> index_table(df, 'iris', conn, dtypes)
    (150, 0)
    """
    m = Mapping()
    if dtypes is None:
        warn('Attempting to find ES types for the dataframe. This may affect search results.'
             ' Consider manually mapping the types to ES field types.')
        dtypes = _find_es_types(df)
    for c, estype in dtypes.items():
        m.field(c, estype)
    m.save(index)

    def _actions():
        for i, r in df.iterrows():
            yield {'_index': index, '_source': r.to_dict()}

    if chunksize < df.shape[0] and streaming:
        status = [c for c in streaming_bulk(conn, _actions(), chunksize)]
        n_success = sum(r[0] for r in status)
        return n_success, df.shape[0] - n_success
    return bulk(conn, _actions(), stats_only=True)
def mapping(self, index, document):
    """
    This method looks for the mapping in an index for a given document type

    :param index: Elasticsearch index
    :param document: type of document
    :returns: dictionary with the mapping
    """
    mapping = Mapping.from_es(index, document, using=self.es)
    return mapping.to_dict()[document]['properties']
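A brief usage sketch; the owning object and the index/doc-type names here are hypothetical:

# Hypothetical call; 'backend' owns the method above, and 'sales' / 'sale'
# are assumed index and doc-type names.
props = backend.mapping('sales', 'sale')
# props is keyed by field name, e.g.:
# {'client_id': {'type': 'keyword'}, 'price': {'type': 'integer'}, ...}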
def create_mapping(cls, user_id):
    """Create elasticsearch mapping object for a user."""
    m = Mapping(cls.doc_type)
    m.meta('_all', enabled=True)
    m.field('attachments',
            Nested(doc_class=IndexedMessageAttachment, include_in_all=True))
    m.field('body_html', 'text')
    m.field('body_plain', 'text')
    m.field('date', 'date')
    m.field('date_delete', 'date')
    m.field('date_insert', 'date')
    m.field('discussion_id', 'keyword')
    m.field('external_references',
            Nested(doc_class=IndexedExternalReferences, include_in_all=True))
    m.field('identities', Nested(doc_class=IndexedIdentity, include_in_all=True))
    m.field('importance_level', 'short')
    m.field('is_answered', 'boolean')
    m.field('is_draft', 'boolean')
    m.field('is_unread', 'boolean')
    m.field('message_id', 'keyword')
    m.field('parent_id', 'keyword')
    participants = Nested(doc_class=IndexedParticipant, include_in_all=True,
                          properties={
                              "address": 'keyword',
                              "contact_id": 'keyword',
                              "label": 'text',
                              "protocol": 'keyword',
                              "type": 'keyword'
                          })
    m.field('participants', participants)
    m.field('privacy_features', Nested(include_in_all=True))
    pi = Nested(doc_class=PIIndexModel, include_in_all=True,
                properties={
                    "technic": "integer",
                    "comportment": "integer",
                    "context": "integer",
                    "version": "integer",
                    "date_update": "date"
                })
    m.field("pi", pi)
    m.field('raw_msg_id', "keyword")
    m.field('subject', 'text')
    m.field('tags', Nested(doc_class=IndexedResourceTag, include_in_all=True))
    m.field('type', 'keyword')
    m.save(using=cls.client(), index=user_id)
    return m
def _make_mapping(self):
    """
    Creates the index with the correct mapping
    :return:
    """
    m = Mapping()
    # add fields
    m.field('Title', 'text')
    m.field('Text', 'text')
    m.field('Publish_Date', 'date')  # date type complicates matters across websites
    m.field('URL', 'text')
    m.field('Scrape_Date', 'date')  # date type complicates matters across websites
    m.field('Source', 'text')
    m.field('Search_Keyword', 'text')  # save list as text?
    m.field('SE_Is_Risk', 'boolean')
    m.field('GP_Is_Risk', 'boolean')
    m.field('RG_Is_Risk', 'boolean')
    m.field('SE_Risk_Rating', 'float')
    m.field('GP_Risk_Rating', 'float')
    m.field('RG_Risk_Rating', 'float')
    m.field('SE_SnP_Open', 'float')
    m.field('SE_SnP_Close', 'float')
    m.field('SE_AbbV_Open', 'float')
    m.field('SE_AbbV_Close', 'float')
    m.field('SE_XBI_Open', 'float')
    m.field('SE_XBI_Close', 'float')
    m.field('SE_SnP_Open_Plus1', 'float')
    m.field('SE_SnP_Close_Plus1', 'float')
    m.field('SE_AbbV_Open_Plus1', 'float')
    m.field('SE_AbbV_Close_Plus1', 'float')
    m.field('SE_XBI_Open_Plus1', 'float')
    m.field('SE_XBI_Close_Plus1', 'float')
    m.field('SE_SentimentScore', 'float')
    m.field('SE_SentimentPolarity', 'float')
    m.field('CompositeScore', 'float')
    m.field('RG_FDA_Warning', 'boolean')
    m.field('GP_SentimentScore', 'float')
    m.field('GP_SentimentPolarity', 'float')
    m.field('GP_Location', 'text')
    m.field('GP_Country', 'text')
    m.field('Article_references', 'float')
    m.field('Is_source_type_RG', 'boolean')
    m.field('Is_source_type_SE', 'boolean')
    m.field('Is_source_type_GP', 'boolean')
    # save the mapping into the configured index
    try:
        m.save(self._index_name)
    except Exception as e:
        print("Could not save schema!", e)
def mapping_es(es, es_index):
    mapping = Mapping('items')
    mapping.field('author_name', String(index='not_analyzed'))
    mapping.field('first_commit', Date())
    mapping.field('last_commit', Date())
    mapping.field('commits', 'integer')
    mapping.field('author_org_name', String(index='not_analyzed'))
    mapping.field('repo_name', String(index='not_analyzed'))
    mapping.field('project', String(index='not_analyzed'))
    mapping.field('uuid', String(index='not_analyzed'))
    print("Uploading mapping to ElasticSearch")
    mapping.save(es_index, using=es)
def traffic_mapping():
    m = Mapping(TRAFFIC_DOC_TYPE)
    m.meta('dynamic', 'strict')
    m.field('id', 'integer')
    m.field('shop_id', 'integer')
    m.field('timestamp', 'date')
    m.field('duration', 'integer')
    m.field('incoming_traffic', 'integer')
    m.field('outgoing_traffic', 'integer')
    return m
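Like sale_mapping above, this mapping sets `dynamic` to `strict`, so Elasticsearch rejects documents containing unmapped fields instead of silently extending the mapping. A hypothetical illustration (the index name 'traffic' and the `es` client are assumptions):

# With 'dynamic': 'strict' saved on the index, an unmapped field raises a 400.
from elasticsearch.exceptions import RequestError

try:
    es.index(index='traffic', doc_type=TRAFFIC_DOC_TYPE,
             body={'id': 1, 'unexpected_field': 'oops'})
except RequestError as err:
    print('strict mapping rejected the document:', err)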
def get_mapping(cls):
    m = Mapping(cls.get_doc_type())
    m.meta('dynamic', 'strict')

    nested_properties = {}
    fields_to_nest = []

    # First pass: treat "standard" fields and gather info for nested fields
    for field in cls._fields:
        if isinstance(field, NestedField):
            nested_properties[field.key] = (field, {})
        if field.parent:
            # Fields to nest, we'll deal with it later
            fields_to_nest.append(field)
        else:
            # "Standard" field, we're good to go
            m.field(field.storage_field, field.type)

    # We deal with nested fields now
    for field in fields_to_nest:
        # Sanity check
        if field.parent not in nested_properties:
            raise Exception(
                'Nested field {} needs to be defined in {}'.format(
                    field.parent, str(cls)))
        _, properties = nested_properties[field.parent]
        properties[field.storage_field] = field.type

    # A first pass for deeply nested fields
    # FIXME: Not confident this works for all nested configurations :o
    for field, properties in nested_properties.values():
        if not field.parent:
            # First level nested field, ignore for now
            continue
        _, parent_properties = nested_properties[field.parent]
        parent_properties[field.key] = Nested(properties=properties)

    # Final pass for first level nested fields
    for field, properties in nested_properties.values():
        if field.parent:
            # Already dealt with deeply nested fields
            continue
        m.field(field.key, Nested(properties=properties))

    return m
def add_mapping_to_index(self, lang_code, lang_analyzer, delete_old_index=False, kuromoji_synonyms=None):
    """
    Add or update mail/irc-mapping to EL-index, create/update required analyzers and add fields.

    :param lang_code: ``str`` Language of index e.g. 'ja'
    :param lang_analyzer: ``str`` Name of analyzer for language e.g. 'kuromoji', 'standard' etc.
    :param delete_old_index: ``bool`` Delete index if existing?
        Default: False = Update existing index (Close, Update, Open)
    :param kuromoji_synonyms: ``list`` Synonyms for kuromoji Japanese analyzer.
        Keep old synonyms if synonyms list empty and index not deleted
    :return: None
    """
    if kuromoji_synonyms is None:
        kuromoji_synonyms = []
    analyzer_lang = helpers.get_analyzer(lang_analyzer, delete_old_index=delete_old_index,
                                         user_dictionary_file=self._user_dictionary_file,
                                         synonyms=kuromoji_synonyms)
    analyzer_case_insensitive_sort = analysis.analyzer('case_insensitive_sort',
                                                       tokenizer=analysis.tokenizer('keyword'),
                                                       filter=['lowercase'])

    mapping = Mapping(self._type_name)
    reopen_index = False
    index_name = self._index_prefix.format(lang_code)
    if self._es.indices.exists(index=index_name):
        if delete_old_index:
            self._es.indices.delete(index=index_name, ignore=[400, 404])
        else:
            self._es.indices.close(index=index_name)
            reopen_index = True
            # Get existing index from server
            mapping = Mapping.from_es(index_name, self._type_name, using=self._es)

    self.add_mapping_fields(mapping, analyzer_lang, analyzer_case_insensitive_sort)
    mapping.save(index_name, using=self._es)  # Insert or update

    if reopen_index:
        self._es.indices.open(index=index_name)
def _create_index(self):
    dt = datetime.utcnow()
    dt = dt.strftime('%Y.%m')
    es = connections.get_connection()
    if not es.indices.exists('indicators-{}'.format(dt)):
        index = Index('indicators-{}'.format(dt))
        index.aliases(live={})
        index.doc_type(Indicator)
        index.create()

        m = Mapping('indicator')
        m.field('indicator_ipv4', 'ip')
        m.field('indicator_ipv4_mask', 'integer')
        m.save('indicators-{}'.format(dt))
    return 'indicators-{}'.format(dt)
def write_user_mapping(index='buddyupusers', doc_type='user'):
    """Write the `user` mapping for buddy up."""
    m = Mapping(doc_type)

    public = Object()
    public.field('first_name', 'string', index='not_analyzed')
    public.field('last_name', 'string', index='not_analyzed')
    public.field('signed_up_at', 'date')
    m.field('public', public)

    groups = Nested()
    groups.field('creator', 'string', index='not_analyzed')
    groups.field('group_id', 'string', index='not_analyzed')
    groups.field('school_id', 'string', index='not_analyzed')
    groups.field('subject', 'string', index='not_analyzed')
    groups.field('subject_code', 'string', index='not_analyzed')
    groups.field('subject_icon', 'string', index='not_analyzed')
    groups.field('start', 'date')
    groups.field('end', 'date')
    m.field('groups', groups)

    private = Object()
    m.field('private', private)

    internal = Object()
    m.field('internal', internal)

    schools = Nested()
    m.field('schools', schools)

    classes = Nested()
    classes.field('course_id', 'string', index='not_analyzed')
    classes.field('id', 'string', index='not_analyzed')
    classes.field('school_id', 'string', index='not_analyzed')
    classes.field('subject_icon', 'string', index='not_analyzed')
    m.field('classes', classes)

    buddies = Nested()
    buddies.field('user_id', 'string', index='not_analyzed')
    buddies.field('first_name', 'string', index='not_analyzed')
    buddies.field('last_name', 'string', index='not_analyzed')
    m.field('buddies', buddies)

    buddies_outgoing = Nested()
    buddies_outgoing.field('user_id', 'string', index='not_analyzed')
    m.field('buddies_outgoing', buddies_outgoing)

    m.save(index)
def _create_index(self):
    # https://github.com/csirtgadgets/massive-octo-spice/blob/develop/elasticsearch/observables.json
    # http://elasticsearch-py.readthedocs.org/en/master/api.html#elasticsearch.Elasticsearch.bulk
    dt = datetime.utcnow()
    dt = dt.strftime('%Y.%m')
    es = connections.get_connection()
    if not es.indices.exists('indicators-{}'.format(dt)):
        index = Index('indicators-{}'.format(dt))
        index.aliases(live={})
        index.doc_type(Indicator)
        index.create()

        m = Mapping('indicator')
        m.field('indicator_ipv4', 'ip')
        m.field('indicator_ipv4_mask', 'integer')
        m.save('indicators-{}'.format(dt))
    return 'indicators-{}'.format(dt)
def test_doc_type_can_be_set():
    i = Index('i', doc_type='t')
    m = Mapping('t')
    m.field('title', Text())
    i.mapping(m)

    assert {
        'mappings': {
            't': {
                'properties': {
                    'title': {'type': 'text'}
                }
            }
        }
    } == i.to_dict()
def _create_index():
    # https://github.com/csirtgadgets/massive-octo-spice/blob/develop/elasticsearch/observables.json
    # http://elasticsearch-py.readthedocs.org/en/master/api.html#elasticsearch.Elasticsearch.bulk
    idx = _current_index()
    es = connections.get_connection()
    if not es.indices.exists(idx):
        index = Index(idx)
        index.aliases(live={})
        index.doc_type(Indicator)
        index.create()

        m = Mapping('indicator')
        m.field('indicator_ipv4', 'ip')
        m.field('indicator_ipv4_mask', 'integer')
        m.field('lasttime', 'date')
        m.save(idx)
    return idx
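Since each dated index is attached to the `live` alias when it is created, searches can target the alias instead of recomputing the current index name. A minimal sketch under that assumption, using the fields mapped above:

# Minimal sketch: query through the 'live' alias rather than the dated index.
from elasticsearch_dsl import Search
from elasticsearch_dsl.connections import connections

s = Search(using=connections.get_connection(), index='live')
s = s.filter('term', indicator_ipv4_mask=24)
response = s.execute()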
def _get_es_facets():
    """Returns a dict from UI facet name to Elasticsearch facet object."""
    using = Elasticsearch(app.app.config['ELASTICSEARCH_URL'])
    try:
        mapping = Mapping.from_es(app.app.config['INDEX_NAME'], 'type', using=using).to_dict()
    except TransportError as e:
        if 'index_not_found_exception' in e.error:
            app.app.logger.error('Index %s not found at %s' %
                                 (app.app.config['INDEX_NAME'], app.app.config['ELASTICSEARCH_URL']))
        raise e

    config_path = os.path.join(app.app.config['DATASET_CONFIG_DIR'], 'ui.json')
    facets_config = _parse_json_file(config_path)['facets']
    # Preserve order, so facets are returned in same order as the config file.
    facets = OrderedDict()
    for facet_config in facets_config:
        field_name = facet_config['elasticsearch_field_name']
        if field_name not in mapping['type']['properties']:
            raise ValueError(
                'elasticsearch_field_name %s not found in Elasticsearch index %s'
                % (field_name, app.app.config['INDEX_NAME']))
        field_type = mapping['type']['properties'][field_name]['type']
        ui_facet_name = facet_config['ui_facet_name']
        if field_type == 'text':
            # Use ".keyword" because we want aggregation on keyword field, not
            # term field. See
            # https://www.elastic.co/guide/en/elasticsearch/reference/6.2/fielddata.html#before-enabling-fielddata
            facets[ui_facet_name] = TermsFacet(field=field_name + '.keyword', size=20)
        else:
            # Assume numeric type.
            # TODO: Handle other types.
            # TODO: Automatically figure out bucket intervals. Unfortunately
            # Elasticsearch won't do this for us
            # (https://github.com/elastic/elasticsearch/issues/9572). Make the
            # ranges easy to read (10-19,20-29 instead of 10-17,18-25).
            facets[ui_facet_name] = HistogramFacet(field=field_name, interval=10)
    app.app.logger.info('Elasticsearch facets: %s' % facets)
    return facets
def get_es_mapping(self):
    """Setup mapping (data scheme).

    .. note::
        You will probably want to change the analyzer and boost value.
        Also consider the ``index='not_analyzed'`` option to improve
        performances.

        See https://elasticsearch-dsl.readthedocs.io/en/latest/persistence.html#mappings

    .. attention::
        You *may* want to override this method (otherwise ES chooses the
        mapping by itself).

    :return: mapping object
    :rtype: elasticsearch_dsl.Mapping
    """
    es_mapping = Mapping(self.get_es_document_type())
    return es_mapping
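A sketch of the kind of override the docstring suggests; the field names and boost value below are hypothetical, not part of the original class:

# Hypothetical subclass override: declare explicit fields instead of
# letting Elasticsearch infer the mapping dynamically.
def get_es_mapping(self):
    es_mapping = Mapping(self.get_es_document_type())
    es_mapping.field('title', 'text', boost=2.0)
    es_mapping.field('tags', 'keyword')
    es_mapping.field('published_on', 'date')
    return es_mapping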
def write_event_mapping(index='buddyupevents', doc_type='event'):
    """Write the `event` mapping for an ES index and doc type.

    http://elasticsearch-dsl.readthedocs.org/en/latest/persistence.html#mappings
    """
    m = Mapping(doc_type)

    data = Object()  # elasticsearch_dsl field object
    data.field('password', 'string', index='no', include_in_all=False, store=False)
    data.field('first_name', 'string', index='not_analyzed')
    data.field('last_name', 'string', index='not_analyzed')
    data.field('accepted_by', 'string', index='not_analyzed')
    data.field('accepted_by_last_name', 'string', index='not_analyzed')
    data.field('accepted_by_first_name', 'string', index='not_analyzed')
    data.field('requested_by', 'string', index='not_analyzed')
    data.field('requested_by_last_name', 'string', index='not_analyzed')
    data.field('requested_by_first_name', 'string', index='not_analyzed')
    data.field('recipient', 'string', index='not_analyzed')
    data.field('recipient_first_name', 'string', index='not_analyzed')
    data.field('recipient_last_name', 'string', index='not_analyzed')
    data.field('sender', 'string', index='not_analyzed')
    data.field('sender_first_name', 'string', index='not_analyzed')
    data.field('sender_last_name', 'string', index='not_analyzed')
    data.field('email', 'string', index='not_analyzed')
    data.field('sent_at', 'date')
    data.field('start', 'date')
    data.field('end', 'date')

    device = Object()
    device.field('manufacturer', 'string', index='not_analyzed')
    device.field('model', 'string', index='not_analyzed')
    device.field('platform', 'string', index='not_analyzed')
    data.field('device', device)

    m.field('data', data)
    m.field('created_at', 'date')
    m.field('first_name', 'string', index='not_analyzed')
    m.field('last_name', 'string', index='not_analyzed')
    m.field('id', 'string', index='not_analyzed')
    m.field('creator', 'string', index='not_analyzed')
    m.field('involved', 'string', index='not_analyzed')
    m.field('profile_pic_url_tiny', 'string', index='not_analyzed')
    m.field('type', 'string', index='not_analyzed')
    m.save(index)
def handle(self, *args, **options):
    from elasticsearch import Elasticsearch
    from elasticsearch_dsl import Index, Mapping

    ELASTICSEARCH_HOSTS = ['localhost']
    ELASTICSEARCH_INDEX = 'kindle2'

    es = Elasticsearch(ELASTICSEARCH_HOSTS)
    newindex = Index(ELASTICSEARCH_INDEX, using=es)
    if newindex.exists():
        exit('index already exists, choose a new name')
    mp = Mapping()
    mp.field('title', 'text')
    mp.field('creator', 'text')
    mp.field('publisher', 'text')
    mp.field('iclass', 'text')
    mp.field('isbn', 'text')
    mp.field('asin', 'keyword')
    newindex.mapping(mp)
    newindex.create()
def construct_model_class(self, entity_cls):
    """Return a fully-baked Model class for a given Entity class"""
    model_cls = None

    # Return the model class if it was already seen/decorated
    if entity_cls.meta_.schema_name in self._model_classes:
        model_cls = self._model_classes[entity_cls.meta_.schema_name]
    else:
        from protean.core.model import ModelMeta

        meta_ = ModelMeta()
        meta_.entity_cls = entity_cls

        # Construct Inner Index class with options
        options = {}
        options["name"] = self.derive_schema_name(entity_cls)
        if "SETTINGS" in self.conn_info and self.conn_info["SETTINGS"]:
            options["settings"] = self.conn_info["SETTINGS"]
        index_cls = type("Index", (object,), options)

        attrs = {"meta_": meta_, "Index": index_cls}
        # FIXME Ensure the custom model attributes are constructed properly
        model_cls = type(entity_cls.__name__ + "Model", (ElasticsearchModel,), attrs)

        # Create Dynamic Mapping and associate with index
        # FIXME Expand to all types of fields
        id_field_name = id_field(entity_cls).field_name
        m = Mapping()
        m.field(id_field_name, Keyword())
        model_cls._index.mapping(m)

        # Memoize the constructed model class
        self._model_classes[entity_cls.meta_.schema_name] = model_cls

    # Set Entity Class as a class level attribute for the Model, to be able to reference later.
    return model_cls
def ensure_vocabulary_mappings(term_info):
    """
    Ensure the mapping is properly set in Elasticsearch to always do exact
    matches on taxonomy terms. Accepts the output of get_resource_terms.

    Calling this function during indexing means that vocabularies do not need
    to be added to the mapping in advance. This deals with the fact that
    vocabularies can be added on-the-fly without having to play around with
    extra signals.

    Args:
        term_info (dict): Details of terms for a group of LearningResources.
    """
    if len(term_info) == 0:
        return

    get_conn()  # We don't need the return value; just for it to exist.

    # Retrieve current mapping from Elasticsearch.
    mapping = Mapping.from_es(index=INDEX_NAME, doc_type=DOC_TYPE)

    # Get the field names from the mapping.
    existing_vocabs = set(mapping.to_dict()["learningresource"]["properties"])

    # Get all the taxonomy names from the data.
    vocab_ids = set()
    for vocab_terms in term_info.values():
        for vocab_id in vocab_terms.keys():
            vocab_ids.add(vocab_id)

    updated = False
    # Add vocabulary to mapping if necessary.
    for vocab_id in vocab_ids:
        vocab_key = make_vocab_key(vocab_id)
        if vocab_key in existing_vocabs:
            continue
        mapping.field(vocab_key, "string", index="not_analyzed")
        updated = True

    if updated:
        mapping.save(INDEX_NAME)
        refresh_index()
def __init__(self, name, bases, attrs):
    meta = attrs.pop('Meta', None)
    self.meta = meta
    self.index = getattr(meta, 'index', None)
    self.doc_type = getattr(meta, 'doc_type', None)
    self._using = getattr(meta, 'using', None)
    self.mapping = getattr(meta, 'mapping', Mapping(self.doc_type))
    self.serializer = getattr(meta, 'serializer', None)

    for name, value in list(attrs.items()):
        if isinstance(value, Field):
            self.mapping.field(name, value)
            del attrs[name]

    for name in dir(meta):
        if isinstance(getattr(meta, name, None), MetaField):
            params = getattr(meta, name)
            self.mapping.meta(name, *params.args, **params.kwargs)

    for b in bases:
        if hasattr(b, '_doc_type') and hasattr(b._doc_type, 'mapping'):
            self.mapping.update(b._doc_type.mapping, update_only=True)
            self._using = self._using or b._doc_type._using
            self.index = self.index or b._doc_type.index
class Meta:
    mapping = Mapping('my_d')
    mapping.meta('_all', enabled=False)
class Meta:
    mapping = Mapping('doc')
def test_mapping(setup_es):
    """Test the ES mapping for an order."""
    mapping = Mapping.from_es(OrderSearchApp.es_model.get_write_index(), OrderSearchApp.name)
    assert mapping.to_dict() == {
        'order': {
            'dynamic': 'false',
            'properties': {
                'assignees': {
                    'include_in_parent': True,
                    'properties': {
                        'dit_team': {
                            'include_in_parent': True,
                            'properties': {
                                'id': {'type': 'keyword'},
                                'name': {'analyzer': 'lowercase_keyword_analyzer', 'fielddata': True, 'type': 'text'},
                            },
                            'type': 'nested',
                        },
                        'first_name': {'analyzer': 'lowercase_keyword_analyzer', 'fielddata': True, 'type': 'text'},
                        'id': {'type': 'keyword'},
                        'last_name': {'analyzer': 'lowercase_keyword_analyzer', 'fielddata': True, 'type': 'text'},
                        'name': {
                            'copy_to': ['assignees.name_trigram'],
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                        'name_trigram': {'analyzer': 'trigram_analyzer', 'type': 'text'},
                    },
                    'type': 'nested',
                },
                'billing_address_1': {'type': 'text'},
                'billing_address_2': {'type': 'text'},
                'billing_address_country': {
                    'include_in_parent': True,
                    'properties': {
                        'id': {'type': 'keyword'},
                        'name': {'analyzer': 'lowercase_keyword_analyzer', 'fielddata': True, 'type': 'text'},
                    },
                    'type': 'nested',
                },
                'billing_address_county': {'analyzer': 'lowercase_keyword_analyzer', 'fielddata': True, 'type': 'text'},
                'billing_address_postcode': {'type': 'text'},
                'billing_address_town': {'analyzer': 'lowercase_keyword_analyzer', 'fielddata': True, 'type': 'text'},
                'billing_contact_name': {'type': 'text'},
                'billing_company_name': {'type': 'text'},
                'billing_email': {'analyzer': 'lowercase_keyword_analyzer', 'fielddata': True, 'type': 'text'},
                'billing_phone': {'analyzer': 'lowercase_keyword_analyzer', 'fielddata': True, 'type': 'text'},
                'cancellation_reason': {
                    'include_in_parent': True,
                    'properties': {
                        'id': {'type': 'keyword'},
                        'name': {'analyzer': 'lowercase_keyword_analyzer', 'fielddata': True, 'type': 'text'},
                    },
                    'type': 'nested',
                },
                'cancelled_by': {
                    'include_in_parent': True,
                    'properties': {
                        'first_name': {'analyzer': 'lowercase_keyword_analyzer', 'fielddata': True, 'type': 'text'},
                        'id': {'type': 'keyword'},
                        'last_name': {'analyzer': 'lowercase_keyword_analyzer', 'fielddata': True, 'type': 'text'},
                        'name': {
                            'copy_to': ['cancelled_by.name_trigram'],
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                        'name_trigram': {'analyzer': 'trigram_analyzer', 'type': 'text'},
                    },
                    'type': 'nested',
                },
                'cancelled_on': {'type': 'date'},
                'company': {
                    'include_in_parent': True,
                    'properties': {
                        'id': {'type': 'keyword'},
                        'name': {
                            'copy_to': ['company.name_trigram'],
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                        'name_trigram': {'analyzer': 'trigram_analyzer', 'type': 'text'},
                        'trading_name': {
                            'copy_to': ['company.trading_name_trigram'],
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                        'trading_name_trigram': {'analyzer': 'trigram_analyzer', 'type': 'text'},
                    },
                    'type': 'nested',
                },
                'completed_by': {
                    'include_in_parent': True,
                    'properties': {
                        'first_name': {'analyzer': 'lowercase_keyword_analyzer', 'fielddata': True, 'type': 'text'},
                        'id': {'type': 'keyword'},
                        'last_name': {'analyzer': 'lowercase_keyword_analyzer', 'fielddata': True, 'type': 'text'},
                        'name': {
                            'copy_to': ['completed_by.name_trigram'],
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                        'name_trigram': {'analyzer': 'trigram_analyzer', 'type': 'text'},
                    },
                    'type': 'nested',
                },
                'completed_on': {'type': 'date'},
                'contact': {
                    'include_in_parent': True,
                    'properties': {
                        'first_name': {'analyzer': 'lowercase_keyword_analyzer', 'fielddata': True, 'type': 'text'},
                        'id': {'type': 'keyword'},
                        'last_name': {'analyzer': 'lowercase_keyword_analyzer', 'fielddata': True, 'type': 'text'},
                        'name': {
                            'copy_to': ['contact.name_trigram'],
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                        'name_trigram': {'analyzer': 'trigram_analyzer', 'type': 'text'},
                    },
                    'type': 'nested',
                },
                'contact_email': {'analyzer': 'lowercase_keyword_analyzer', 'fielddata': True, 'type': 'text'},
                'contact_phone': {'type': 'keyword'},
                'contacts_not_to_approach': {'type': 'text'},
                'created_by': {
                    'include_in_parent': True,
                    'properties': {
                        'first_name': {'analyzer': 'lowercase_keyword_analyzer', 'fielddata': True, 'type': 'text'},
                        'id': {'type': 'keyword'},
                        'last_name': {'analyzer': 'lowercase_keyword_analyzer', 'fielddata': True, 'type': 'text'},
                        'name': {
                            'copy_to': ['created_by.name_trigram'],
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                        'name_trigram': {'analyzer': 'trigram_analyzer', 'type': 'text'},
                        'dit_team': {
                            'include_in_parent': True,
                            'properties': {
                                'id': {'type': 'keyword'},
                                'name': {'analyzer': 'lowercase_keyword_analyzer', 'fielddata': True, 'type': 'text'},
                            },
                            'type': 'nested',
                        },
                    },
                    'type': 'nested',
                },
                'created_on': {'type': 'date'},
                'delivery_date': {'type': 'date'},
                'description': {'analyzer': 'english_analyzer', 'type': 'text'},
                'discount_value': {'index': False, 'type': 'integer'},
                'existing_agents': {'index': False, 'type': 'text'},
                'further_info': {'type': 'text'},
                'id': {'type': 'keyword'},
                'modified_on': {'type': 'date'},
                'net_cost': {'index': False, 'type': 'integer'},
                'paid_on': {'type': 'date'},
                'payment_due_date': {'type': 'date'},
                'po_number': {'index': False, 'type': 'keyword'},
                'primary_market': {
                    'include_in_parent': True,
                    'properties': {
                        'id': {'type': 'keyword'},
                        'name': {'analyzer': 'lowercase_keyword_analyzer', 'fielddata': True, 'type': 'text'},
                    },
                    'type': 'nested',
                },
                'reference': {
                    'copy_to': ['reference_trigram'],
                    'analyzer': 'lowercase_keyword_analyzer',
                    'fielddata': True,
                    'type': 'text',
                },
                'reference_trigram': {'analyzer': 'trigram_analyzer', 'type': 'text'},
                'sector': {
                    'include_in_parent': True,
                    'properties': {
                        'id': {'type': 'keyword'},
                        'name': {'analyzer': 'lowercase_keyword_analyzer', 'fielddata': True, 'type': 'text'},
                        'ancestors': {
                            'include_in_parent': True,
                            'properties': {
                                'id': {'type': 'keyword'},
                            },
                            'type': 'nested',
                        },
                    },
                    'type': 'nested',
                },
                'uk_region': {
                    'include_in_parent': True,
                    'properties': {
                        'id': {'type': 'keyword'},
                        'name': {'analyzer': 'lowercase_keyword_analyzer', 'fielddata': True, 'type': 'text'},
                    },
                    'type': 'nested',
                },
                'service_types': {
                    'include_in_parent': True,
                    'properties': {
                        'id': {'type': 'keyword'},
                        'name': {'analyzer': 'lowercase_keyword_analyzer', 'fielddata': True, 'type': 'text'},
                    },
                    'type': 'nested',
                },
                'status': {'analyzer': 'lowercase_keyword_analyzer', 'fielddata': True, 'type': 'text'},
                'subscribers': {
                    'include_in_parent': True,
                    'properties': {
                        'dit_team': {
                            'include_in_parent': True,
                            'properties': {
                                'id': {'type': 'keyword'},
                                'name': {'analyzer': 'lowercase_keyword_analyzer', 'fielddata': True, 'type': 'text'},
                            },
                            'type': 'nested',
                        },
                        'first_name': {'analyzer': 'lowercase_keyword_analyzer', 'fielddata': True, 'type': 'text'},
                        'id': {'type': 'keyword'},
                        'last_name': {'analyzer': 'lowercase_keyword_analyzer', 'fielddata': True, 'type': 'text'},
                        'name': {
                            'copy_to': ['subscribers.name_trigram'],
                            'analyzer': 'lowercase_keyword_analyzer',
                            'fielddata': True,
                            'type': 'text',
                        },
                        'name_trigram': {'analyzer': 'trigram_analyzer', 'type': 'text'},
                    },
                    'type': 'nested',
                },
                'subtotal_cost': {'copy_to': ['subtotal_cost_string'], 'type': 'integer'},
                'subtotal_cost_string': {'type': 'keyword'},
                'total_cost': {'copy_to': ['total_cost_string'], 'type': 'integer'},
                'total_cost_string': {'type': 'keyword'},
                'vat_cost': {'index': False, 'type': 'integer'},
                'vat_number': {'index': False, 'type': 'keyword'},
                'vat_status': {'index': False, 'type': 'keyword'},
                'vat_verified': {'index': False, 'type': 'boolean'},
            },
        },
    }