class ForumDocument(SumoDocument):
    """
    ES document for forum posts. Thread information is duplicated across all posts in that thread.
    """

    # De-normalized from the post's thread (resolved in get_field_value).
    thread_title = field.Text()
    thread_forum_id = field.Keyword()
    thread_created = field.Date()
    thread_creator_id = field.Keyword()
    thread_is_locked = field.Boolean()
    thread_is_sticky = field.Boolean()

    # Taken directly from the post itself.
    content = field.Text()
    author_id = field.Keyword()
    created = field.Date()
    updated = field.Date()
    updated_by_id = field.Keyword()

    class Index:
        name = config.FORUM_INDEX_NAME
        using = config.DEFAULT_ES7_CONNECTION

    def get_field_value(self, field, instance, *args):
        """Resolve ``thread_*`` fields against the post's thread, everything else against the post."""
        prefix = "thread_"
        if field.startswith(prefix):
            field = field[len(prefix):]
            instance = instance.thread
        return super().get_field_value(field, instance, *args)

    @classmethod
    def get_model(cls):
        return Post

    @classmethod
    def get_queryset(cls):
        # Join the thread in the same query so indexing doesn't do a per-post lookup.
        return Post.objects.select_related("thread")
class Ssn_trace(InnerDoc):
    """Inner document holding the result of an SSN trace."""

    # presumably: validity / deceased flags reported by the trace provider — confirm
    is_valid = field.Boolean()
    is_deceased = field.Boolean()
    # Exact-match (keyword) field for the SSN value itself.
    ssn = field.Keyword()
    # Free-text, human-readable explanation of the trace result.
    human_message = field.Text()
    # Nested issuance details (see Ssn_issued for its fields).
    issued = field.Object(Ssn_issued)
def test_boolean_deserialization():
    """Boolean fields deserialize common falsy and truthy inputs correctly."""
    boolean_field = field.Boolean()
    # These should all come back falsy.
    for falsy in ("false", False, "", 0):
        assert not boolean_field.deserialize(falsy)
    # And these truthy.
    for truthy in (True, "true", 1):
        assert boolean_field.deserialize(truthy)
def _schema2doc_map(self):
    """Build the schema-type-name -> ES document field mapping.

    Each entry is wrapped in a CustomObject holding the typed value ('val')
    alongside its keyword representation ('repr').
    """
    plain_fields = {
        'integer': dsl_field.Long(),
        'number': dsl_field.ScaledFloat(scaling_factor=100),
        'string': dsl_field.Text(analyzer=polish_analyzer, fields={
            'raw': dsl_field.Text(),
            'keyword': dsl_field.Keyword(),
        }),
        'any': dsl_field.Text(analyzer=polish_analyzer, fields={
            'raw': dsl_field.Text(),
            'keyword': dsl_field.Keyword(),
        }),
        'boolean': dsl_field.Boolean(),
        # time/date/datetime keep the raw text plus a parsed Date sub-field.
        'time': dsl_field.Text(fields={
            'text': dsl_field.Text(),
            'time': dsl_field.Date(format=constance_config.TIME_FORMATS),
        }),
        'duration': dsl_field.DateRange(),
        'default': dsl_field.Text(),
        'date': dsl_field.Text(fields={
            'text': dsl_field.Text(),
            'date': dsl_field.Date(format=constance_config.DATE_FORMATS),
        }),
        'datetime': dsl_field.Text(fields={
            'text': dsl_field.Text(),
            'datetime': dsl_field.Date(format=constance_config.DATE_FORMATS),
        }),
    }
    return {
        type_name: CustomObject(properties={
            'val': inner_field,
            'repr': dsl_field.Keyword(),
        })
        for type_name, inner_field in plain_fields.items()
    }
class ForumDocument(SumoDocument):
    """
    ES document for forum posts. Thread information is duplicated across all posts in that thread.
    """

    # Thread-level fields, de-normalized onto every post (see get_field_value).
    thread_title = field.Text()
    thread_forum_id = field.Keyword()
    forum_slug = field.Keyword()
    thread_id = field.Keyword()
    thread_created = field.Date()
    thread_creator_id = field.Keyword()
    thread_is_locked = field.Boolean()
    thread_is_sticky = field.Boolean()

    # Post-level fields.
    content = field.Text()
    author_id = field.Keyword()
    created = field.Date()
    updated = field.Date()
    updated_by_id = field.Keyword()

    class Index:
        pass

    def prepare_forum_slug(self, instance):
        # The slug lives on the thread's forum, not on the post.
        return instance.thread.forum.slug

    def get_field_value(self, field, instance, *args):
        """Resolve ``thread_*`` fields against the post's thread, everything else against the post."""
        prefix = "thread_"
        if field.startswith(prefix):
            instance = instance.thread
            field = field[len(prefix):]
        return super().get_field_value(field, instance, *args)

    @classmethod
    def get_model(cls):
        return Post

    @classmethod
    def get_queryset(cls):
        # Pull the related thread and forum rows up front to avoid per-post queries.
        return Post.objects.prefetch_related("thread", "thread__forum")
class CompanyDocType(DocType):
    """Elasticsearch document type for a company profile (mapping only)."""

    # Dates serialized with an explicit format (FormattedDate is project-defined).
    date_of_creation = FormattedDate(date_format='%Y-%m-%d')
    description = field.Text()
    employees = field.Text()
    facebook_url = field.Text()
    # presumably the DB primary key of the company record — confirm against indexer
    pk = field.Integer()
    keywords = field.Text()
    linkedin_url = field.Text()
    logo = field.Text()
    has_single_sector = field.Boolean()
    modified = FormattedDate(date_format='%Y-%m-%dT%H:%M:%S.%fZ')
    name = field.Text()
    number = field.Text()
    # multi=True: list-valued fields.
    sectors = field.Text(multi=True)
    sectors_label = field.Text(multi=True)
    slug = field.Text()
    summary = field.Text()
    twitter_url = field.Text()
    website = field.Text()
    # Case studies embedded as nested sub-documents.
    supplier_case_studies = field.Nested(
        properties={
            'pk': field.Integer(),
            'title': field.Text(),
            'short_summary': field.Text(),
            'description': field.Text(),
            'sector': field.Text(),
            'keywords': field.Text(),
            'image_one_caption': field.Text(),
            'image_two_caption': field.Text(),
            'image_three_caption': field.Text(),
            'testimonial': field.Text(),
            'slug': field.Text(),
        })

    class Meta:
        # ES index this document type lives in.
        index = 'company'
POS: 'NOUN', 'OP': "+" }]), ('pnp', [{ POS: "ADJ", "OP": "*" }, { POS: "PROPN", "OP": "+" }])] PD2ES_TYPES = { np.dtype('O'): field.Text(), np.dtype('int64'): field.Integer(), np.dtype('float64'): field.Double(), np.dtype('<M8[ns]'): field.Date(), np.dtype('bool'): field.Boolean() } def load_spacy_model(): nlp = spacy.load('en_core_web_lg') nlp.tokenizer.infix_finditer = re.compile(r'[~\-_]').finditer return nlp def make_matcher(nlp, patterns=PATTERNS): matcher = spacy.matcher.Matcher(nlp.vocab) for _id, pattern in PATTERNS: matcher.add(_id, None, pattern) return matcher
class AnswerDocument(QuestionDocument):
    """
    ES document for Answers. Every Answer in DB gets an AnswerDocument in ES.

    Child class to QuestionDocument, with fields here un-prefixed.

    This document defines the answer-specific fields which are included in an
    AnswerDocument in addition to the de-normalized fields of an Answer's Question
    which are defined in QuestionDocument.

    Since QuestionDocument and AnswerDocument are stored in the same index, ES sees
    QuestionDocuments and AnswerDocuments the same, just with some documents missing
    certain fields.

    Enables aggregations on answers, such as when creating contribution metrics, and
    enables searching within an AAQ thread, or on Answer-specific properties like
    being a solution.
    """

    creator_id = field.Keyword()
    created = field.Date()
    content = SumoLocaleAwareTextField(term_vector="with_positions_offsets")
    updated = field.Date()
    updated_by_id = field.Keyword()

    num_helpful_votes = field.Integer()
    num_unhelpful_votes = field.Integer()

    is_solution = field.Boolean()

    @classmethod
    def prepare(cls, instance, **kwargs):
        """Override super method to exclude certain docs."""
        # Add a discard field in the document if the following conditions are met
        # Answer document is spam
        if instance.is_spam or instance.question.is_spam:
            instance.es_discard_doc = "unindex_me"
        obj = super().prepare(instance, **kwargs)
        # add a prefix to the id so we don't clash with QuestionDocuments
        obj.meta.id = "a_{}".format(obj.meta.id)
        return obj

    def prepare_is_solution(self, instance):
        # An answer is the solution iff its id equals its question's solution_id.
        solution_id = instance.question.solution_id
        return solution_id is not None and solution_id == instance.id

    def prepare_locale(self, instance):
        # Answers inherit the locale of their question.
        return instance.question.locale

    def prepare_num_helpful_votes(self, instance):
        # Prefer the db-annotated count (set in get_queryset) over a per-instance query.
        if hasattr(instance, "es_num_helpful_votes"):
            return instance.es_num_helpful_votes
        return instance.num_helpful_votes

    def prepare_num_unhelpful_votes(self, instance):
        # Prefer the db-annotated count (set in get_queryset) over a per-instance query.
        if hasattr(instance, "es_num_unhelpful_votes"):
            return instance.es_num_unhelpful_votes
        return instance.num_unhelpful_votes

    def prepare_answer_content(self, instance):
        # clear answer_content field from QuestionDocument,
        # as we don't need the content of sibling answers in an AnswerDocument
        return None

    def get_field_value(self, field, instance, *args):
        # "question_" fields come from the answer's question; the prefix itself
        # is stripped by QuestionDocument.get_field_value.
        if field.startswith("question_"):
            instance = instance.question
        return super().get_field_value(field, instance, *args)

    def to_action(self, *args, **kwargs):
        # if the id is un-prefixed, add it
        if not str(self.meta.id).startswith("a_"):
            self.meta.id = f"a_{self.meta.id}"
        return super().to_action(*args, **kwargs)

    @classmethod
    def get(cls, id, **kwargs):
        # if the id is un-prefixed, add it
        if not str(id).startswith("a_"):
            id = f"a_{id}"
        return super().get(id, **kwargs)

    @classmethod
    def get_model(cls):
        return Answer

    @classmethod
    def get_queryset(cls):
        return (
            Answer.objects
            # prefetch each answer's question,
            # applying the same optimizations as in the QuestionDocument
            .prefetch_related(
                Prefetch("question", queryset=QuestionDocument.get_queryset()))
            # count votes in db to improve performance
            .annotate(
                es_num_helpful_votes=Count("votes", filter=Q(votes__helpful=True)),
                es_num_unhelpful_votes=Count("votes", filter=Q(votes__helpful=False)),
            ))
class QuestionDocument(SumoDocument):
    """
    ES document for Questions. Every Question in DB gets a QuestionDocument in ES.

    Parent class to AnswerDocument, with most fields here prefixed with "question_".

    This document defines the question-specific fields (most of) which are de-normalized
    in the AnswerDocument. Since QuestionDocument and AnswerDocument are stored in the
    same index, ES sees QuestionDocuments and AnswerDocuments the same, just with some
    documents missing certain fields.

    Enables searching for AAQ threads as a unit.
    """

    question_id = field.Keyword()

    question_title = SumoLocaleAwareTextField()
    question_creator_id = field.Keyword()
    question_content = SumoLocaleAwareTextField(
        term_vector="with_positions_offsets")

    question_created = field.Date()
    question_updated = field.Date()
    question_updated_by_id = field.Keyword()
    question_has_solution = field.Boolean()
    question_is_locked = field.Boolean()
    question_is_archived = field.Boolean()

    question_product_id = field.Keyword()
    question_topic_id = field.Keyword()

    question_taken_by_id = field.Keyword()
    question_taken_until = field.Date()

    question_tag_ids = field.Keyword(multi=True)
    question_num_votes = field.Integer()

    # store answer content to optimise searching for AAQ threads as a unit
    answer_content = SumoLocaleAwareTextField(
        multi=True, term_vector="with_positions_offsets")

    locale = field.Keyword()

    class Index:
        name = config.QUESTION_INDEX_NAME
        using = config.DEFAULT_ES7_CONNECTION

    @classmethod
    def prepare(cls, instance):
        """Override super method to exclude certain docs."""
        # Add a discard field in the document if the following conditions are met
        # Question document is spam
        if instance.is_spam:
            instance.es_discard_doc = "unindex_me"
        return super(QuestionDocument, cls).prepare(instance)

    def prepare_question_tag_ids(self, instance):
        return [tag.id for tag in instance.tags.all()]

    def prepare_question_has_solution(self, instance):
        return instance.solution_id is not None

    def prepare_question_num_votes(self, instance):
        # Prefer the db-annotated count (set in get_queryset) over a per-instance query.
        if hasattr(instance, "es_question_num_votes"):
            return instance.es_question_num_votes
        return instance.num_votes

    def prepare_answer_content(self, instance):
        return [
            answer.content
            for answer in (
                # when bulk indexing use answer queryset prefetched in `get_queryset` method
                # this is to avoid running an extra query for each question in the chunk
                instance.es_question_answers_not_spam
                if hasattr(instance, "es_question_answers_not_spam")
                # fallback if non-spam answers haven't been prefetched
                else instance.answers.filter(is_spam=False))
        ]

    def get_field_value(self, field, *args):
        # Strip the "question_" prefix so values are read from the plain attribute.
        if field.startswith("question_"):
            field = field[len("question_"):]
        return super().get_field_value(field, *args)

    @classmethod
    def get_model(cls):
        return Question

    @classmethod
    def get_queryset(cls):
        return (
            Question.objects
            # prefetch answers which aren't spam to avoid extra queries when iterating over them
            .prefetch_related(
                Prefetch(
                    "answers",
                    queryset=Answer.objects.filter(is_spam=False),
                    to_attr="es_question_answers_not_spam",
                ))
            # prefetch tags to avoid extra queries when iterating over them
            .prefetch_related("tags")
            # count votes in db to improve performance
            .annotate(es_question_num_votes=Count("votes")))
# Maps tabular-schema type names to the Elasticsearch DSL field used to index
# values of that type.
_schema2doc_map = {
    # NOTE(review): 'integer' is indexed as Float rather than an integer
    # field — confirm the loss of integer precision is intentional.
    'integer': dsl_field.Float(),
    'number': dsl_field.ScaledFloat(scaling_factor=100),
    'string': dsl_field.Text(
        analyzer=polish_analyzer,
        fields={
            'raw': dsl_field.Text(),
        }
    ),
    'any': dsl_field.Text(
        analyzer=polish_analyzer,
        fields={
            'raw': dsl_field.Text(),
        }
    ),
    'boolean': dsl_field.Boolean(),
    'date': dsl_field.Date(),
    'datetime': dsl_field.Date(),
    # NOTE(review): bare times are indexed as full Date values — verify format
    # handling at index time.
    'time': dsl_field.Date()
}

# Maps the same schema type names to the API-layer field classes used when
# serializing records back out.
_schema_to_api_field = {
    'integer': api_fields.Number,
    'number': api_fields.Number,
    'string': api_fields.String,
    'any': api_fields.String,
    'boolean': api_fields.Boolean,
    'date': api_fields.DateTime,
    'datetime': api_fields.DateTime,
    'time': api_fields.Time
}
class ShpData(IndexedData):
    """Indexed representation of a shapefile (geo) resource."""

    _type = 'geo'

    # dBASE (.dbf) field-type code -> ES DSL field used to index values.
    _schema2doc_map = {
        'C': dsl_field.Text(
            analyzer=polish_analyzer,
            fields={
                'raw': dsl_field.Text(),
                'keyword': dsl_field.Keyword(),
            },
        ),
        'D': dsl_field.Date(),
        'N': dsl_field.ScaledFloat(scaling_factor=100),
        'L': dsl_field.Boolean(),
        '@': dsl_field.Date(),
        'I': dsl_field.Long(),
        '+': dsl_field.Long(),
        'F': dsl_field.Float(),
        'O': dsl_field.Double(),
    }

    # dBASE field-type code -> API serialization field class.
    _schema_to_api_field = {
        'C': api_fields.String,
        'D': api_fields.DateTime,
        'N': api_fields.Number,
        'L': api_fields.Boolean,
        '@': api_fields.DateTime,
        'I': api_fields.Number,
        '+': api_fields.Number,
        'F': api_fields.Number,
        'O': api_fields.Number,
    }

    # dBASE field-type code -> human-readable type name for get_schema().
    _schema_long_names = {
        'C': 'string',
        'D': 'datetime',
        'N': 'number',
        'L': 'boolean',
        '@': 'datetime',
        'I': 'integer',
        '+': 'integer',
        'F': 'number',
        'O': 'number',
    }

    # Lazily-populated caches (see the `source` and `schema` properties).
    _source = None
    _schema = None
    _transformer = None

    def __init__(self, resource, from_table_index=False):
        super().__init__(resource)
        self.from_table_index = from_table_index

    @property
    def has_geo_data(self):
        # Shapefile resources always carry geometry.
        return True

    @property
    def is_chartable(self):
        # Chartable when there is more than one column and at least one numeric one.
        fields = self.schema
        return len(fields) > 1 and any(
            (field.type in ('N', 'I', '+', 'F', 'O') for field in fields))

    @property
    def source(self):
        # Lazily extract the archive and open the first .shp file found;
        # also prepares the coordinate transformer for the extracted files.
        if not self._source:
            with ArchiveReader(self.resource.main_file.path) as extracted:
                shp_path = next(
                    iter(f for f in extracted if f.endswith('.shp')))
                self._source = shapefile.Reader(shp_path)
                self._transformer = ShapeTransformer(extracted)
        return self._source

    def get_schema(self, **kwargs):
        """Return a table-schema dict describing the shapefile's attribute columns.

        With use_aliases=True, column names are mapped through reversed_headers_map.
        """
        use_aliases = kwargs.get('use_aliases', False)
        headers = self.reversed_headers_map
        return {
            'fields': [{
                'name': headers[item.name] if use_aliases else item.name,
                'type': self._schema_long_names[item.type],
                'format': 'default'
            } for item in self.schema]
        }

    @property
    def schema(self):
        # Skips the first entry of source.fields (presumably pyshp's deletion
        # flag — confirm).
        if not self._schema:
            self._schema = [
                DBSchemaField(*_f) for _f in self.source.fields[1:]
            ]
        return self._schema

    def prepare_doc(self):
        """Dynamically build the ES Document class for this resource's index."""
        # Fixed fields present on every row document.
        _fields = {
            'shape': dsl_field.GeoShape(),
            'point': dsl_field.GeoPoint(),
            'shape_type': dsl_field.Integer(),
            'label': dsl_field.Text(),
            'resource': dsl_field.Nested(
                properties={
                    'id': dsl_field.Integer(),
                    'title': dsl_field.Text(analyzer=polish_analyzer,
                                            fields={'raw': dsl_field.Keyword()})
                }),
            'updated_at': dsl_field.Date(),
            'row_no': dsl_field.Long()
        }
        # One colN field per supported schema column; _map remembers the
        # colN -> original header name aliasing.
        _map = {}
        for idx, _f in enumerate(self.schema, 1):
            if _f.type not in self._schema2doc_map:
                continue
            alias_name = _f.name
            field_name = f'col{idx}'
            _field = self._schema2doc_map[_f.type]
            _map[field_name] = alias_name
            _fields[field_name] = _field
        # NOTE(review): the Index class is created with `type` as its base —
        # confirm this is what the ES DSL expects here.
        _fields['Index'] = type('Index', (type, ), {'name': self.idx_name})
        doc = type(self.idx_name, (Document, ), _fields)
        # Stash the column aliasing in the mapping meta for later lookup.
        doc._doc_type.mapping._meta['_meta'] = {'headers': _map}
        return doc

    def get_api_fields(self):
        """Return API field instances keyed by the aliased column name."""
        record_fields = {}
        for f in self.schema:
            field_name = self.reversed_headers_map[f.name]
            field_cls = self._schema_to_api_field[f.type]
            record_fields[field_name] = field_cls(is_tabular_data_field=True)
        return record_fields

    @staticmethod
    def _get_row_id(row):
        # Deterministic id derived from the row's values (each truncated to
        # 10k chars to bound the hashed payload).
        return str(
            uuid.uuid5(uuid.NAMESPACE_DNS,
                       '+|+'.join(str(i)[:10000] for i in row)))

    def _docs_iter(self, doc):
        """Yield one ES document (of class `doc`) per shape record in the source."""
        for row_no, sr in enumerate(self.source.shapeRecords(), 1):
            geojson = self._transformer.transform(sr.shape)
            v = {
                'shape': geojson,
                'updated_at': datetime.now(),
                'row_no': row_no,
                'resource': {
                    'id': self.resource.id,
                    'title': self.resource.title
                },
            }
            for i, val in enumerate(sr.record, 1):
                # Empty byte-strings from the .dbf are treated as nulls.
                v[f'col{i}'] = val if val != b'' else None
            v['shape_type'] = sr.shape.shapeType
            v['point'] = median_point(geojson)
            # Optional human-readable label, configured in the resource's
            # tabular data schema under 'geo'/'label'.
            tds = self.resource.tabular_data_schema
            if tds is not None and 'geo' in tds and 'label' in tds['geo']:
                v['label'] = sr.record[tds['geo']['label'].get('col_name')]
            d = doc(**v)
            d.meta.id = self._get_row_id(sr.record)
            yield d
def __init__(self, *args, **kwargs):
    """Extend the base field's properties with contributor sub-fields."""
    super(ContributorField, self).__init__(*args, **kwargs)
    # not_analyzed keeps the username matchable only as an exact term
    # (legacy Elasticsearch 2.x mapping syntax).
    self.properties['username'] = field.String(index='not_analyzed')
    self.properties['is_freelance'] = field.Boolean()
class CompanyDocument(Document):
    """Elasticsearch document for a company profile.

    `wildcard`, `casestudy_wildcard` and `keyword_wildcard` are synthetic
    search fields populated via copy_to from the other fields below; they are
    never written to directly.
    """

    wildcard = field.Text(analyzer=american_english_analyzer)
    casestudy_wildcard = field.Text(analyzer=american_english_analyzer)
    keyword_wildcard = field.Keyword()

    case_study_count = field.Integer()
    date_of_creation = field.Date(index=False)
    description = field.Text(
        copy_to='wildcard', analyzer=american_english_analyzer
    )
    has_description = field.Boolean()
    # index=False, store=True: retrievable from the doc but not searchable.
    employees = field.Keyword(index=False, store=True)
    facebook_url = field.Keyword(index=False, store=True)
    pk = field.Integer(index=False)
    keywords = field.Text(copy_to='wildcard')
    linkedin_url = field.Keyword(index=False, store=True)
    logo = field.Keyword(index=False, store=True)
    has_single_sector = field.Boolean()
    modified = field.Date(index=False)
    # ordering_name receives a copy of `name` for keyword-based sorting.
    ordering_name = field.Keyword()
    name = field.Text(copy_to=['wildcard', 'ordering_name'])
    number = field.Keyword(copy_to='keyword_wildcard',)
    sectors = field.Keyword(multi=True, copy_to='keyword_wildcard', store=True)
    sectors_label = field.Keyword(
        multi=True, copy_to='keyword_wildcard', store=True
    )
    expertise_industries = field.Keyword(
        multi=True, copy_to='keyword_wildcard', store=True
    )
    expertise_regions = field.Keyword(
        multi=True, copy_to='keyword_wildcard', store=True
    )
    expertise_languages = field.Keyword(
        multi=True, copy_to='keyword_wildcard', store=True
    )
    expertise_countries = field.Keyword(
        multi=True, copy_to='keyword_wildcard', store=True
    )
    # Represents Dict as it's the primitive datatype for this field
    expertise_products_services = field.Object()
    expertise_products_services_labels = field.Keyword(
        multi=True, copy_to='keyword_wildcard', store=True
    )
    expertise_labels = field.Keyword(
        multi=True, copy_to='keyword_wildcard', store=True
    )
    slug = field.Keyword(copy_to='keyword_wildcard', store=True)
    summary = field.Text(
        copy_to='wildcard', analyzer=american_english_analyzer
    )
    twitter_url = field.Keyword(index=False, store=True)
    website = field.Keyword(copy_to='keyword_wildcard', store=True)
    # Case studies embedded as nested sub-documents; their text fields feed
    # casestudy_wildcard, keyword fields feed keyword_wildcard.
    supplier_case_studies = field.Nested(
        properties={
            'pk': field.Integer(index=False),
            'title': field.Text(copy_to='casestudy_wildcard'),
            'short_summary': field.Text(copy_to='casestudy_wildcard'),
            'description': field.Text(copy_to='casestudy_wildcard'),
            'sector': field.Keyword(copy_to='keyword_wildcard', store=True),
            'keywords': field.Text(copy_to='casestudy_wildcard'),
            'image_one_caption': field.Text(copy_to='casestudy_wildcard'),
            'image_two_caption': field.Text(copy_to='casestudy_wildcard'),
            'image_three_caption': field.Text(copy_to='casestudy_wildcard'),
            'testimonial': field.Text(copy_to='casestudy_wildcard'),
            'website': field.Keyword(copy_to='casestudy_wildcard', store=True),
            'slug': field.Keyword(copy_to='keyword_wildcard', store=True),
            'testimonial_name': field.Keyword(
                copy_to='casestudy_wildcard', store=True
            ),
            'testimonial_company': field.Text(copy_to='casestudy_wildcard'),
            'testimonial_job_title': field.Text(copy_to='casestudy_wildcard'),
        }
    )
    is_showcase_company = field.Boolean()
    is_published_investment_support_directory = field.Boolean()
    is_published_find_a_supplier = field.Boolean()

    class Meta:
        index = settings.ELASTICSEARCH_COMPANY_INDEX_ALIAS
class QuestionDocument(SumoDocument):
    """
    ES document for Questions. Every Question in DB gets a QuestionDocument in ES.

    Parent class to AnswerDocument, with most fields here prefixed with "question_".

    This document defines the question-specific fields (most of) which are de-normalized
    in the AnswerDocument. Since QuestionDocument and AnswerDocument are stored in the
    same index, ES sees QuestionDocuments and AnswerDocuments the same, just with some
    documents missing certain fields.

    Enables searching for AAQ threads as a unit.
    """

    question_id = field.Keyword()

    question_title = SumoLocaleAwareTextField()
    question_creator_id = field.Keyword()
    question_content = SumoLocaleAwareTextField(
        term_vector="with_positions_offsets")

    question_created = field.Date()
    question_updated = field.Date()
    question_updated_by_id = field.Keyword()
    question_has_solution = field.Boolean()
    question_is_locked = field.Boolean()
    question_is_archived = field.Boolean()

    # Spam state is indexed here (unlike variants that discard spam docs).
    question_is_spam = field.Boolean()
    question_marked_as_spam = field.Date()
    question_marked_as_spam_by_id = field.Keyword()

    question_product_id = field.Keyword()
    question_topic_id = field.Keyword()

    question_taken_by_id = field.Keyword()
    question_taken_until = field.Date()

    question_tag_ids = field.Keyword(multi=True)
    question_num_votes = field.Integer()

    # store answer content to optimise searching for AAQ threads as a unit
    answer_content = SumoLocaleAwareTextField(
        multi=True, term_vector="with_positions_offsets")

    locale = field.Keyword()

    class Index:
        name = config.QUESTION_INDEX_NAME
        using = config.DEFAULT_ES7_CONNECTION

    def prepare_question_tag_ids(self, instance):
        return [tag.id for tag in instance.tags.all()]

    def prepare_question_has_solution(self, instance):
        return instance.solution_id is not None

    def prepare_question_num_votes(self, instance):
        # Prefer the db-annotated count (set in get_queryset) over a per-instance query.
        if hasattr(instance, "es_question_num_votes"):
            return instance.es_question_num_votes
        return instance.num_votes

    def prepare_answer_content(self, instance):
        # NOTE(review): includes every answer, spam or not — confirm this is
        # intentional given that spam state is indexed on the question here.
        return [answer.content for answer
                in instance.answers.all()]

    def get_field_value(self, field, *args):
        # Strip the "question_" prefix so values are read from the plain attribute.
        if field.startswith("question_"):
            field = field[len("question_"):]
        return super().get_field_value(field, *args)

    @classmethod
    def get_model(cls):
        return Question

    @classmethod
    def get_queryset(cls):
        return (Question.objects.prefetch_related("answers")
                # prefetch tags to avoid extra queries when iterating over them
                .prefetch_related("tags")
                # count votes in db to improve performance
                .annotate(es_question_num_votes=Count("votes")))