def serialize_preprint(preprint, category):
    """Return the Elasticsearch document representing ``preprint``.

    :param preprint: Preprint model instance to index
    :param category: search category string stored on the document
    :return: dict ready to be sent to Elasticsearch
    """
    # Fold the title down to plain ASCII so it can be matched without diacritics.
    try:
        title_text = six.u(preprint.title)
    except TypeError:
        title_text = preprint.title
    ascii_title = unicodedata.normalize('NFKD', title_text).encode('ascii', 'ignore')

    # Visible contributors in display order; inactive users get no profile URL.
    contributor_rows = (
        preprint._contributors
        .filter(preprintcontributor__visible=True)
        .order_by('preprintcontributor___order')
        .values('fullname', 'guids___id', 'is_active')
    )
    contributors = []
    for row in contributor_rows:
        profile_url = '/{}/'.format(row['guids___id']) if row['is_active'] else None
        contributors.append({'fullname': row['fullname'], 'url': profile_url})

    return {
        'id': preprint._id,
        'contributors': contributors,
        'title': preprint.title,
        'normalized_title': ascii_title,
        'category': category,
        'public': preprint.is_public,
        'published': preprint.verified_publishable,
        'is_retracted': preprint.is_retracted,
        'tags': list(preprint.tags.filter(system=False).values_list('name', flat=True)),
        'description': preprint.description,
        'url': preprint.url,
        'date_created': preprint.created,
        'license': serialize_node_license_record(preprint.license),
        'boost': 2,  # More relevant than a registration
        'extra_search_terms': clean_splitters(preprint.title),
    }
def serialize_preprint(preprint, category):
    """Serialize a preprint into the dict shape stored in the search index."""
    # Title normalization: decompose accents, then drop any non-ASCII bytes.
    try:
        raw_title = six.u(preprint.title)
    except TypeError:
        raw_title = preprint.title
    normalized_title = unicodedata.normalize('NFKD', raw_title).encode('ascii', 'ignore')

    visible_contributors = preprint._contributors.filter(
        preprintcontributor__visible=True,
    ).order_by('preprintcontributor___order').values('fullname', 'guids___id', 'is_active')

    doc = {
        'id': preprint._id,
        'contributors': [
            {
                'fullname': entry['fullname'],
                'url': '/{}/'.format(entry['guids___id']) if entry['is_active'] else None,
            }
            for entry in visible_contributors
        ],
        'title': preprint.title,
        'normalized_title': normalized_title,
        'category': category,
        'public': preprint.is_public,
        'published': preprint.verified_publishable,
        'is_retracted': preprint.is_retracted,
        'tags': list(preprint.tags.filter(system=False).values_list('name', flat=True)),
        'description': preprint.description,
        'url': preprint.url,
        'date_created': preprint.created,
        'license': serialize_node_license_record(preprint.license),
        'boost': 2,  # More relevant than a registration
        'extra_search_terms': clean_splitters(preprint.title),
    }
    return doc
def update_file(file_, index=None, delete=False):
    """Index (or remove) the search document for a single file.

    :param file_: BaseFileNode to index; ``file_.target`` is the owning object
    :param index: Elasticsearch index name; defaults to the module-level INDEX
    :param delete: force removal of the document when True
    """
    index = index or INDEX
    target = file_.target
    # QA content stays out of search: blocklisted tags on the file or its
    # target, or blocklisted substrings in the target title.  The or-chain
    # short-circuits so later queries only run when earlier checks pass.
    file_node_is_qa = bool(
        set(settings.DO_NOT_INDEX_LIST['tags']).intersection(file_.tags.all().values_list('name', flat=True))
    ) or bool(
        set(settings.DO_NOT_INDEX_LIST['tags']).intersection(target.tags.all().values_list('name', flat=True))
    ) or any(fragment in target.title for fragment in settings.DO_NOT_INDEX_LIST['titles'])

    # TODO: Can remove 'not file_.name' if we remove all base file nodes with name=None
    should_remove = (
        not file_.name or not target.is_public or delete
        or target.is_deleted or target.archiving or file_node_is_qa
    )
    if should_remove:
        client().delete(index=index, doc_type='file', id=file_._id, refresh=True, ignore=[404])
        return

    # URLs are assembled by hand so this can run outside a Flask request
    # context (e.g. inside a celery task).
    file_deep_url = '/{target_id}/files/{provider}{path}/'.format(
        target_id=target._id,
        provider=file_.provider,
        path=file_.path,
    )
    if target.is_quickfiles:
        node_url = '/{user_id}/quickfiles/'.format(user_id=target.creator._id)
    else:
        node_url = '/{target_id}/'.format(target_id=target._id)

    file_guid = file_.get_guid(create=False)
    guid_url = '/{file_guid}/'.format(file_guid=file_guid._id) if file_guid else None

    file_doc = {
        'id': file_._id,
        'deep_url': file_deep_url,
        'guid_url': guid_url,
        'tags': list(file_.tags.filter(system=False).values_list('name', flat=True)),
        'name': file_.name,
        'category': 'file',
        'node_url': node_url,
        'node_title': getattr(target, 'title', None),
        'parent_id': target.parent_node._id if getattr(target, 'parent_node', None) else None,
        'is_registration': getattr(target, 'is_registration', False),
        'is_retracted': getattr(target, 'is_retracted', False),
        'extra_search_terms': clean_splitters(file_.name),
    }
    client().index(index=index, doc_type='file', body=file_doc, id=file_._id, refresh=True)
def update_file(file_, index=None, delete=False):
    """Add or remove the search document for ``file_`` on its parent node."""
    index = index or INDEX
    node = file_.node
    # TODO: Can remove 'not file_.name' if we remove all base file nodes with name=None
    if not file_.name or not node.is_public or delete or node.is_deleted or node.archiving:
        client().delete(index=index, doc_type='file', id=file_._id, refresh=True, ignore=[404])
        return

    # Build URLs by hand so this works outside a Flask request context
    # (e.g. in a celery task).
    file_deep_url = '/{node_id}/files/{provider}{path}/'.format(
        node_id=node._id,
        provider=file_.provider,
        path=file_.path,
    )
    node_url = '/{node_id}/'.format(node_id=node._id)

    file_guid = file_.get_guid(create=False)
    guid_url = '/{file_guid}/'.format(file_guid=file_guid._id) if file_guid else None

    file_doc = {
        'id': file_._id,
        'deep_url': file_deep_url,
        'guid_url': guid_url,
        'tags': list(file_.tags.filter(system=False).values_list('name', flat=True)),
        'name': file_.name,
        'category': 'file',
        'node_url': node_url,
        'node_title': node.title,
        'parent_id': node.parent_node._id if node.parent_node else None,
        'is_registration': node.is_registration,
        'is_retracted': node.is_retracted,
        'extra_search_terms': clean_splitters(file_.name),
    }
    client().index(index=index, doc_type='file', body=file_doc, id=file_._id, refresh=True)
def serialize_node(node, category):
    """Build the Elasticsearch document for a node (project/component/registration)."""
    from website.addons.wiki.model import NodeWikiPage

    parent_id = node.parent_id
    # Fold the title to ASCII for diacritic-insensitive matching.
    try:
        title_unicode = six.u(node.title)
    except TypeError:
        title_unicode = node.title
    normalized_title = unicodedata.normalize('NFKD', title_unicode).encode('ascii', 'ignore')

    doc = {
        'id': node._id,
        'contributors': [
            {
                'fullname': person.fullname,
                'url': person.profile_url if person.is_active else None,
            }
            for person in node.visible_contributors
            if person is not None
        ],
        'title': node.title,
        'normalized_title': normalized_title,
        'category': category,
        'public': node.is_public,
        'tags': [tag._id for tag in node.tags if tag],
        'description': node.description,
        'url': node.url,
        'is_registration': node.is_registration,
        'is_pending_registration': node.is_pending_registration,
        'is_retracted': node.is_retracted,
        'is_pending_retraction': node.is_pending_retraction,
        'embargo_end_date': node.embargo_end_date.strftime('%A, %b. %d, %Y') if node.embargo_end_date else False,
        'is_pending_embargo': node.is_pending_embargo,
        'registered_date': node.registered_date,
        'wikis': {},
        'parent_id': parent_id,
        'date_created': node.date_created,
        'license': serialize_node_license_record(node.license),
        'affiliated_institutions': [inst.name for inst in node.affiliated_institutions],
        'boost': int(not node.is_registration) + 1,  # This is for making registered projects less relevant
        'extra_search_terms': clean_splitters(node.title),
    }
    # Wiki text is only exposed for non-retracted nodes.
    if not node.is_retracted:
        for page_id in node.wiki_pages_current.values():
            wiki = NodeWikiPage.load(page_id)
            doc['wikis'][wiki.page_name] = wiki.raw_text(node)
    return doc
def serialize_node(node, category):
    """Serialize a node into the dict shape stored in the search index."""
    NodeWikiPage = apps.get_model('addons_wiki.NodeWikiPage')

    parent_id = node.parent_id
    # Title normalization: strip accents and non-ASCII characters.
    try:
        raw_title = six.u(node.title)
    except TypeError:
        raw_title = node.title
    normalized_title = unicodedata.normalize('NFKD', raw_title).encode('ascii', 'ignore')

    # Visible contributors in display order; inactive users get no URL.
    visible_rows = (
        node._contributors
        .filter(contributor__visible=True)
        .order_by('contributor___order')
        .values('fullname', 'guids___id', 'is_active')
    )
    contributors = []
    for row in visible_rows:
        contributors.append({
            'fullname': row['fullname'],
            'url': '/{}/'.format(row['guids___id']) if row['is_active'] else None,
        })

    doc = {
        'id': node._id,
        'contributors': contributors,
        'title': node.title,
        'normalized_title': normalized_title,
        'category': category,
        'public': node.is_public,
        'tags': list(node.tags.filter(system=False).values_list('name', flat=True)),
        'description': node.description,
        'url': node.url,
        'is_registration': node.is_registration,
        'is_pending_registration': node.is_pending_registration,
        'is_retracted': node.is_retracted,
        'is_pending_retraction': node.is_pending_retraction,
        'embargo_end_date': node.embargo_end_date.strftime('%A, %b. %d, %Y') if node.embargo_end_date else False,
        'is_pending_embargo': node.is_pending_embargo,
        'registered_date': node.registered_date,
        'wikis': {},
        'parent_id': parent_id,
        'date_created': node.date_created,
        'license': serialize_node_license_record(node.license),
        'affiliated_institutions': list(node.affiliated_institutions.values_list('name', flat=True)),
        'boost': int(not node.is_registration) + 1,  # This is for making registered projects less relevant
        'extra_search_terms': clean_splitters(node.title),
        'preprint_url': node.preprint_url,
    }
    if not node.is_retracted:
        for wiki in NodeWikiPage.objects.filter(guids___id__in=node.wiki_pages_current.values()):
            # '.' is not allowed in field names in ES2
            doc['wikis'][wiki.page_name.replace('.', ' ')] = wiki.raw_text(node)
    return doc
def serialize_node(node, category):
    """Return the Elasticsearch document for ``node`` under ``category``."""
    NodeWikiPage = apps.get_model('addons_wiki.NodeWikiPage')

    parent_id = node.parent_id
    # Decompose accents and discard anything outside ASCII for the
    # normalized_title field.
    try:
        unicode_title = six.u(node.title)
    except TypeError:
        unicode_title = node.title
    ascii_title = unicodedata.normalize('NFKD', unicode_title).encode('ascii', 'ignore')

    contributor_values = node._contributors.filter(
        contributor__visible=True,
    ).order_by('contributor___order').values('fullname', 'guids___id', 'is_active')

    elastic_document = {
        'id': node._id,
        'contributors': [
            {
                'fullname': item['fullname'],
                'url': '/{}/'.format(item['guids___id']) if item['is_active'] else None,
            }
            for item in contributor_values
        ],
        'title': node.title,
        'normalized_title': ascii_title,
        'category': category,
        'public': node.is_public,
        'tags': list(node.tags.filter(system=False).values_list('name', flat=True)),
        'description': node.description,
        'url': node.url,
        'is_registration': node.is_registration,
        'is_pending_registration': node.is_pending_registration,
        'is_retracted': node.is_retracted,
        'is_pending_retraction': node.is_pending_retraction,
        'embargo_end_date': node.embargo_end_date.strftime('%A, %b. %d, %Y') if node.embargo_end_date else False,
        'is_pending_embargo': node.is_pending_embargo,
        'registered_date': node.registered_date,
        'wikis': {},
        'parent_id': parent_id,
        'date_created': node.date_created,
        'license': serialize_node_license_record(node.license),
        'affiliated_institutions': list(node.affiliated_institutions.values_list('name', flat=True)),
        'boost': int(not node.is_registration) + 1,  # This is for making registered projects less relevant
        'extra_search_terms': clean_splitters(node.title),
        'preprint_url': node.preprint_url,
    }
    # Retracted nodes expose no wiki content.
    if not node.is_retracted:
        current_pages = node.wiki_pages_current.values()
        for wiki in NodeWikiPage.objects.filter(guids___id__in=current_pages):
            # '.' is not allowed in field names in ES2
            field_name = wiki.page_name.replace('.', ' ')
            elastic_document['wikis'][field_name] = wiki.raw_text(node)
    return elastic_document
def update_file(file_, index=None, delete=False):
    """Create, refresh, or delete the search entry for a file node."""
    index = index or INDEX
    # TODO: Can remove 'not file_.name' if we remove all base file nodes with name=None
    unindexable = (
        not file_.name
        or not file_.node.is_public
        or delete
        or file_.node.is_deleted
        or file_.node.archiving
    )
    if unindexable:
        client().delete(index=index, doc_type='file', id=file_._id, refresh=True, ignore=[404])
        return

    # URLs are constructed manually so this function can run outside a Flask
    # request context (e.g. in a celery task).
    file_deep_url = '/{node_id}/files/{provider}{path}/'.format(
        node_id=file_.node._id,
        provider=file_.provider,
        path=file_.path,
    )
    node_url = '/{node_id}/'.format(node_id=file_.node._id)

    guid_url = None
    file_guid = file_.get_guid(create=False)
    if file_guid:
        guid_url = '/{file_guid}/'.format(file_guid=file_guid._id)

    document = {
        'id': file_._id,
        'deep_url': file_deep_url,
        'guid_url': guid_url,
        'tags': list(file_.tags.filter(system=False).values_list('name', flat=True)),
        'name': file_.name,
        'category': 'file',
        'node_url': node_url,
        'node_title': file_.node.title,
        'parent_id': file_.node.parent_node._id if file_.node.parent_node else None,
        'is_registration': file_.node.is_registration,
        'is_retracted': file_.node.is_retracted,
        'extra_search_terms': clean_splitters(file_.name),
    }
    client().index(index=index, doc_type='file', body=document, id=file_._id, refresh=True)
def serialize_group(group, category):
    """Serialize an OSF group into its Elasticsearch document.

    :param group: the group model instance to index
    :param category: search category string stored on the document
    :return: dict ready to be sent to Elasticsearch
    """
    # ASCII-fold the group name for the normalized_title field.
    try:
        name_text = six.u(group.name)
    except TypeError:
        name_text = group.name
    normalized_title = unicodedata.normalize('NFKD', name_text).encode('ascii', 'ignore')

    def _user_entry(row):
        # Inactive users are listed by name only, without a profile URL.
        return {
            'fullname': row['fullname'],
            'url': '/{}/'.format(row['guids___id']) if row['is_active'] else None,
        }

    return {
        'id': group._id,
        'members': [
            _user_entry(row)
            for row in group.members_only.values('fullname', 'guids___id', 'is_active')
        ],
        'managers': [
            _user_entry(row)
            for row in group.managers.values('fullname', 'guids___id', 'is_active')
        ],
        'title': group.name,
        'normalized_title': normalized_title,
        'category': category,
        'url': group.url,
        'date_created': group.created,
        'boost': 2,  # More relevant than a registration
        'extra_search_terms': clean_splitters(group.name),
    }
def update_file(file_, index=None, delete=False):
    """Sync a file's document in Elasticsearch, deleting it when unindexable.

    Files are removed from the index when unnamed, private, deleted, archiving,
    QA-flagged, or spam; a preprint's file is additionally removed unless it is
    the published preprint's primary file.
    """
    index = index or INDEX
    target = file_.target
    # TODO: Can remove 'not file_.name' if we remove all base file nodes with name=None
    # QA content stays out of search; the or-chain short-circuits so later
    # queries only run when earlier checks come back empty.
    file_node_is_qa = bool(
        set(settings.DO_NOT_INDEX_LIST['tags']).intersection(file_.tags.all().values_list('name', flat=True))
    ) or bool(
        set(settings.DO_NOT_INDEX_LIST['tags']).intersection(target.tags.all().values_list('name', flat=True))
    ) or any(piece in target.title for piece in settings.DO_NOT_INDEX_LIST['titles'])

    if (not file_.name or not target.is_public or delete or file_node_is_qa
            or getattr(target, 'is_deleted', False)
            or getattr(target, 'archiving', False)
            or target.is_spam
            or (target.spam_status == SpamStatus.FLAGGED and settings.SPAM_FLAGGED_REMOVE_FROM_SEARCH)):
        client().delete(index=index, doc_type='file', id=file_._id, refresh=True, ignore=[404])
        return

    targets_preprint = isinstance(target, Preprint)
    if targets_preprint:
        # Only the published preprint's primary file is searchable.
        if (not getattr(target, 'verified_publishable', False)
                or target.primary_file != file_
                or target.is_spam
                or (target.spam_status == SpamStatus.FLAGGED and settings.SPAM_FLAGGED_REMOVE_FROM_SEARCH)):
            client().delete(index=index, doc_type='file', id=file_._id, refresh=True, ignore=[404])
            return

    # URLs are built by hand so this can run outside a Flask request context
    # (e.g. inside a celery task).
    file_deep_url = '/{target_id}/files/{provider}{path}/'.format(
        target_id=target._id,
        provider=file_.provider,
        path=file_.path,
    )
    if getattr(target, 'is_quickfiles', None):
        node_url = '/{user_id}/quickfiles/'.format(user_id=target.creator._id)
    else:
        node_url = '/{target_id}/'.format(target_id=target._id)

    file_guid = file_.get_guid(create=False)
    guid_url = '/{file_guid}/'.format(file_guid=file_guid._id) if file_guid else None

    # No file URLs for preprint files: the file detail page just reroutes to
    # the preprint detail page.
    file_doc = {
        'id': file_._id,
        'deep_url': None if targets_preprint else file_deep_url,
        'guid_url': None if targets_preprint else guid_url,
        'tags': list(file_.tags.filter(system=False).values_list('name', flat=True)),
        'name': file_.name,
        'category': 'file',
        'node_url': node_url,
        'node_title': getattr(target, 'title', None),
        'parent_id': target.parent_node._id if getattr(target, 'parent_node', None) else None,
        'is_registration': getattr(target, 'is_registration', False),
        'is_retracted': getattr(target, 'is_retracted', False),
        'extra_search_terms': clean_splitters(file_.name),
    }
    client().index(index=index, doc_type='file', body=file_doc, id=file_._id, refresh=True)
def update_file(file_, index=None, delete=False):
    """Update the 'file' document for ``file_`` in the given search index."""
    index = index or INDEX
    target = file_.target
    # TODO: Can remove 'not file_.name' if we remove all base file nodes with name=None
    # QA filtering: blocklisted tags on either object, or blocklisted title
    # substrings.  The or-chain short-circuits between the DB queries.
    file_node_is_qa = bool(
        set(settings.DO_NOT_INDEX_LIST['tags']).intersection(file_.tags.all().values_list('name', flat=True))
    ) or bool(
        set(settings.DO_NOT_INDEX_LIST['tags']).intersection(target.tags.all().values_list('name', flat=True))
    ) or any(s in target.title for s in settings.DO_NOT_INDEX_LIST['titles'])

    if not file_.name or not target.is_public or delete or target.is_deleted or target.archiving or file_node_is_qa:
        client().delete(index=index, doc_type='file', id=file_._id, refresh=True, ignore=[404])
        return

    # Manual URL construction keeps this usable outside a Flask request
    # context (e.g. in a celery task).
    file_deep_url = '/{target_id}/files/{provider}{path}/'.format(
        target_id=target._id,
        provider=file_.provider,
        path=file_.path,
    )
    node_url = (
        '/{user_id}/quickfiles/'.format(user_id=target.creator._id)
        if target.is_quickfiles
        else '/{target_id}/'.format(target_id=target._id)
    )

    guid_url = None
    file_guid = file_.get_guid(create=False)
    if file_guid:
        guid_url = '/{file_guid}/'.format(file_guid=file_guid._id)

    document = {
        'id': file_._id,
        'deep_url': file_deep_url,
        'guid_url': guid_url,
        'tags': list(file_.tags.filter(system=False).values_list('name', flat=True)),
        'name': file_.name,
        'category': 'file',
        'node_url': node_url,
        'node_title': getattr(target, 'title', None),
        'parent_id': target.parent_node._id if getattr(target, 'parent_node', None) else None,
        'is_registration': getattr(target, 'is_registration', False),
        'is_retracted': getattr(target, 'is_retracted', False),
        'extra_search_terms': clean_splitters(file_.name),
    }
    client().index(index=index, doc_type='file', body=document, id=file_._id, refresh=True)
def update_file(file_, index=None, delete=False):
    """Keep Elasticsearch in sync for one file: index it, or drop it.

    A file is dropped from the index when it is unnamed, private, deleted,
    archiving, QA-tagged, or spam-flagged; preprint files are dropped unless
    they are the published preprint's primary file.
    """
    index = index or INDEX
    target = file_.target

    def _drop_from_index():
        # Removal is idempotent; a missing document (404) is ignored.
        client().delete(index=index, doc_type='file', id=file_._id, refresh=True, ignore=[404])

    # TODO: Can remove 'not file_.name' if we remove all base file nodes with name=None
    # The or-chain short-circuits, so the target-tag query and title scan only
    # run when the preceding checks find nothing.
    file_node_is_qa = bool(
        set(settings.DO_NOT_INDEX_LIST['tags']).intersection(file_.tags.all().values_list('name', flat=True))
    ) or bool(
        set(settings.DO_NOT_INDEX_LIST['tags']).intersection(target.tags.all().values_list('name', flat=True))
    ) or any(substring in target.title for substring in settings.DO_NOT_INDEX_LIST['titles'])

    if (not file_.name or not target.is_public or delete or file_node_is_qa
            or getattr(target, 'is_deleted', False)
            or getattr(target, 'archiving', False)
            or target.is_spam
            or (target.spam_status == SpamStatus.FLAGGED and settings.SPAM_FLAGGED_REMOVE_FROM_SEARCH)):
        _drop_from_index()
        return

    is_preprint_file = isinstance(target, Preprint)
    if is_preprint_file:
        if (not getattr(target, 'verified_publishable', False)
                or target.primary_file != file_
                or target.is_spam
                or (target.spam_status == SpamStatus.FLAGGED and settings.SPAM_FLAGGED_REMOVE_FROM_SEARCH)):
            _drop_from_index()
            return

    # URLs are assembled manually so this function works outside a Flask
    # request context (e.g. in a celery task).
    file_deep_url = '/{target_id}/files/{provider}{path}/'.format(
        target_id=target._id,
        provider=file_.provider,
        path=file_.path,
    )
    if getattr(target, 'is_quickfiles', None):
        node_url = '/{user_id}/quickfiles/'.format(user_id=target.creator._id)
    else:
        node_url = '/{target_id}/'.format(target_id=target._id)

    guid_url = None
    file_guid = file_.get_guid(create=False)
    if file_guid:
        guid_url = '/{file_guid}/'.format(file_guid=file_guid._id)

    # Preprint files carry no file URLs: the file detail page reroutes to the
    # preprint detail page instead.
    file_doc = {
        'id': file_._id,
        'deep_url': None if is_preprint_file else file_deep_url,
        'guid_url': None if is_preprint_file else guid_url,
        'tags': list(file_.tags.filter(system=False).values_list('name', flat=True)),
        'name': file_.name,
        'category': 'file',
        'node_url': node_url,
        'node_title': getattr(target, 'title', None),
        'parent_id': target.parent_node._id if getattr(target, 'parent_node', None) else None,
        'is_registration': getattr(target, 'is_registration', False),
        'is_retracted': getattr(target, 'is_retracted', False),
        'extra_search_terms': clean_splitters(file_.name),
    }
    client().index(index=index, doc_type='file', body=file_doc, id=file_._id, refresh=True)
def serialize_node(node, category):
    """Return the search document for ``node`` filed under ``category``."""
    from website.addons.wiki.model import NodeWikiPage

    parent_id = node.parent_id
    # Normalize the title to ASCII so accents do not affect matching.
    try:
        unicode_title = six.u(node.title)
    except TypeError:
        unicode_title = node.title
    ascii_title = unicodedata.normalize('NFKD', unicode_title).encode('ascii', 'ignore')

    contributors = []
    for person in node.visible_contributors:
        if person is None:
            continue
        contributors.append({
            'fullname': person.fullname,
            'url': person.profile_url if person.is_active else None,
        })

    elastic_document = {
        'id': node._id,
        'contributors': contributors,
        'title': node.title,
        'normalized_title': ascii_title,
        'category': category,
        'public': node.is_public,
        'tags': [tag._id for tag in node.tags if tag],
        'description': node.description,
        'url': node.url,
        'is_registration': node.is_registration,
        'is_pending_registration': node.is_pending_registration,
        'is_retracted': node.is_retracted,
        'is_pending_retraction': node.is_pending_retraction,
        'embargo_end_date': node.embargo_end_date.strftime('%A, %b. %d, %Y') if node.embargo_end_date else False,
        'is_pending_embargo': node.is_pending_embargo,
        'registered_date': node.registered_date,
        'wikis': {},
        'parent_id': parent_id,
        'date_created': node.date_created,
        'license': serialize_node_license_record(node.license),
        'affiliated_institutions': [inst.name for inst in node.affiliated_institutions],
        'boost': int(not node.is_registration) + 1,  # This is for making registered projects less relevant
        'extra_search_terms': clean_splitters(node.title),
    }
    # Only non-retracted nodes expose wiki text in search.
    if not node.is_retracted:
        pages = (NodeWikiPage.load(page_id) for page_id in node.wiki_pages_current.values())
        for wiki in pages:
            elastic_document['wikis'][wiki.page_name] = wiki.raw_text(node)
    return elastic_document