Esempio n. 1
0
def serialize_preprint(preprint, category):
    """Assemble the search document (a plain dict) for ``preprint``.

    :param preprint: object providing the attributes read below (``_id``, ``title``, ``tags``, ...)
    :param category: value stored unchanged under the ``category`` key
    :return: dict with the keys built at the end of this function
    """
    title = preprint.title

    # Use the raw title whenever six.u() rejects it with a TypeError.
    try:
        title_text = six.u(title)
    except TypeError:
        title_text = title
    # NFKD-normalize, then drop every character that has no ASCII encoding.
    ascii_title = unicodedata.normalize('NFKD', title_text).encode('ascii', 'ignore')

    # Only visible contributors, in their stored order; a url is emitted for active users only.
    contributor_rows = (
        preprint._contributors
        .filter(preprintcontributor__visible=True)
        .order_by('preprintcontributor___order')
        .values('fullname', 'guids___id', 'is_active')
    )
    contributors = []
    for row in contributor_rows:
        profile_url = '/{}/'.format(row['guids___id']) if row['is_active'] else None
        contributors.append({'fullname': row['fullname'], 'url': profile_url})

    return {
        'id': preprint._id,
        'contributors': contributors,
        'title': title,
        'normalized_title': ascii_title,
        'category': category,
        'public': preprint.is_public,
        'published': preprint.verified_publishable,
        'is_retracted': preprint.is_retracted,
        'tags': list(preprint.tags.filter(system=False).values_list('name', flat=True)),
        'description': preprint.description,
        'url': preprint.url,
        'date_created': preprint.created,
        'license': serialize_node_license_record(preprint.license),
        'boost': 2,  # More relevant than a registration
        'extra_search_terms': clean_splitters(title),
    }
Esempio n. 2
0
def serialize_preprint(preprint, category):
    """Return the dict that describes ``preprint`` as a search document.

    :param preprint: object whose ``_id``, ``title``, ``tags``, ``license`` etc. are read below
    :param category: copied as-is into the document's ``category`` field
    :return: the assembled document dict
    """

    def contributor_entry(row):
        # Inactive users keep their name but get no url.
        if row['is_active']:
            link = '/{}/'.format(row['guids___id'])
        else:
            link = None
        return {'fullname': row['fullname'], 'url': link}

    raw_title = preprint.title
    try:
        decoded_title = six.u(raw_title)
    except TypeError:
        # six.u() refused the value; normalize the title exactly as stored.
        decoded_title = raw_title
    decomposed_title = unicodedata.normalize('NFKD', decoded_title)
    normalized_title = decomposed_title.encode('ascii', 'ignore')

    visible = preprint._contributors.filter(preprintcontributor__visible=True)
    ordered = visible.order_by('preprintcontributor___order')
    rows = ordered.values('fullname', 'guids___id', 'is_active')

    tag_names = preprint.tags.filter(system=False).values_list('name', flat=True)

    document = {
        'id': preprint._id,
        'contributors': [contributor_entry(row) for row in rows],
        'title': raw_title,
        'normalized_title': normalized_title,
        'category': category,
        'public': preprint.is_public,
        'published': preprint.verified_publishable,
        'is_retracted': preprint.is_retracted,
        'tags': list(tag_names),
        'description': preprint.description,
        'url': preprint.url,
        'date_created': preprint.created,
        'license': serialize_node_license_record(preprint.license),
        'boost': 2,  # More relevant than a registration
        'extra_search_terms': clean_splitters(raw_title),
    }
    return document
Esempio n. 3
0
def update_file(file_, index=None, delete=False):
    """Index ``file_`` for search, or delete its document when it must not be searchable.

    The ``file`` document is deleted (``ignore=[404]`` is passed to the delete
    call) and nothing is indexed when any of these hold:

    * the file has no name, or ``delete`` is true;
    * the target is not public, is deleted, or is archiving;
    * a tag of the file or of the target appears in
      ``settings.DO_NOT_INDEX_LIST['tags']``, or the target title contains
      one of ``settings.DO_NOT_INDEX_LIST['titles']``.

    Otherwise the document is (re)indexed under the file's ``_id``.

    :param file_: file object to index; ``file_.target`` supplies the project-level fields
    :param index: index name; the module-level ``INDEX`` is used when falsy
    :param delete: when true, always delete the document instead of indexing
    """
    index = index or INDEX
    target = file_.target

    def matches_do_not_index_list():
        # Deferred on purpose: the tag lookups only run when the cheap flag
        # checks below have not already decided that the document must go.
        blocked_tags = set(settings.DO_NOT_INDEX_LIST['tags'])
        if blocked_tags.intersection(file_.tags.all().values_list('name', flat=True)):
            return True
        if blocked_tags.intersection(target.tags.all().values_list('name', flat=True)):
            return True
        # 'node_title' below tolerates a target without a title, so the title
        # screen must not fail on such a target either.
        title = getattr(target, 'title', None) or ''
        return any(substring in title for substring in settings.DO_NOT_INDEX_LIST['titles'])

    # TODO: Can remove 'not file_.name' if we remove all base file nodes with name=None
    if (
        not file_.name
        or not target.is_public
        or delete
        or target.is_deleted
        or target.archiving
        or matches_do_not_index_list()
    ):
        client().delete(
            index=index,
            doc_type='file',
            id=file_._id,
            refresh=True,
            ignore=[404]
        )
        return

    # We build URLs manually here so that this function can be
    # run outside of a Flask request context (e.g. in a celery task)
    file_deep_url = '/{target_id}/files/{provider}{path}/'.format(
        target_id=target._id,
        provider=file_.provider,
        path=file_.path,
    )
    if target.is_quickfiles:
        node_url = '/{user_id}/quickfiles/'.format(user_id=target.creator._id)
    else:
        node_url = '/{target_id}/'.format(target_id=target._id)

    # Only link to the file's guid when one already exists (create=False).
    guid_url = None
    file_guid = file_.get_guid(create=False)
    if file_guid:
        guid_url = '/{file_guid}/'.format(file_guid=file_guid._id)
    file_doc = {
        'id': file_._id,
        'deep_url': file_deep_url,
        'guid_url': guid_url,
        'tags': list(file_.tags.filter(system=False).values_list('name', flat=True)),
        'name': file_.name,
        'category': 'file',
        'node_url': node_url,
        'node_title': getattr(target, 'title', None),
        'parent_id': target.parent_node._id if getattr(target, 'parent_node', None) else None,
        'is_registration': getattr(target, 'is_registration', False),
        'is_retracted': getattr(target, 'is_retracted', False),
        'extra_search_terms': clean_splitters(file_.name),
    }

    client().index(
        index=index,
        doc_type='file',
        body=file_doc,
        id=file_._id,
        refresh=True
    )
Esempio n. 4
0
def update_file(file_, index=None, delete=False):
    """Index ``file_`` as a ``file`` document, or remove that document.

    The document is removed when the file has no name, ``delete`` is true, or
    the file's node is not public, is deleted, or is archiving; otherwise it
    is (re)indexed under the file's ``_id``.

    :param file_: file object; ``file_.node`` supplies the node-level fields
    :param index: index name; ``INDEX`` is used when this is falsy
    :param delete: remove the document instead of indexing it
    """
    index = index or INDEX

    # TODO: Can remove 'not file_.name' if we remove all base file nodes with name=None
    should_remove = (
        not file_.name
        or not file_.node.is_public
        or delete
        or file_.node.is_deleted
        or file_.node.archiving
    )
    if should_remove:
        client().delete(index=index, doc_type='file', id=file_._id, refresh=True, ignore=[404])
        return

    node = file_.node
    node_id = node._id

    # We build URLs manually here so that this function can be
    # run outside of a Flask request context (e.g. in a celery task)
    file_deep_url = '/{node_id}/files/{provider}{path}/'.format(
        node_id=node_id,
        provider=file_.provider,
        path=file_.path,
    )
    node_url = '/{node_id}/'.format(node_id=node_id)

    # A guid url is only emitted when the file already has a guid.
    file_guid = file_.get_guid(create=False)
    guid_url = '/{file_guid}/'.format(file_guid=file_guid._id) if file_guid else None

    parent = node.parent_node
    file_doc = {
        'id': file_._id,
        'deep_url': file_deep_url,
        'guid_url': guid_url,
        'tags': list(file_.tags.filter(system=False).values_list('name', flat=True)),
        'name': file_.name,
        'category': 'file',
        'node_url': node_url,
        'node_title': node.title,
        'parent_id': parent._id if parent else None,
        'is_registration': node.is_registration,
        'is_retracted': node.is_retracted,
        'extra_search_terms': clean_splitters(file_.name),
    }

    client().index(index=index, doc_type='file', body=file_doc, id=file_._id, refresh=True)
Esempio n. 5
0
def serialize_node(node, category):
    """Assemble the search document (a plain dict) for ``node``.

    The ``wikis`` mapping (page name -> raw text) is only filled in when the
    node is not retracted.

    :param node: object providing the attributes read below (``_id``, ``title``, ``tags``, ...)
    :param category: value stored unchanged under the ``category`` key
    :return: the assembled document dict
    """
    from website.addons.wiki.model import NodeWikiPage

    parent_id = node.parent_id
    title = node.title

    # Use the raw title whenever six.u() rejects it with a TypeError.
    try:
        title_text = six.u(title)
    except TypeError:
        title_text = title
    # NFKD-normalize, then drop every character that has no ASCII encoding.
    ascii_title = unicodedata.normalize('NFKD', title_text).encode('ascii', 'ignore')

    # None entries are skipped; a url is emitted for active users only.
    contributors = []
    for user in node.visible_contributors:
        if user is None:
            continue
        contributors.append({
            'fullname': user.fullname,
            'url': user.profile_url if user.is_active else None,
        })

    embargo_end = node.embargo_end_date

    document = {
        'id': node._id,
        'contributors': contributors,
        'title': title,
        'normalized_title': ascii_title,
        'category': category,
        'public': node.is_public,
        'tags': [tag._id for tag in node.tags if tag],
        'description': node.description,
        'url': node.url,
        'is_registration': node.is_registration,
        'is_pending_registration': node.is_pending_registration,
        'is_retracted': node.is_retracted,
        'is_pending_retraction': node.is_pending_retraction,
        'embargo_end_date': embargo_end.strftime('%A, %b. %d, %Y') if embargo_end else False,
        'is_pending_embargo': node.is_pending_embargo,
        'registered_date': node.registered_date,
        'wikis': {},
        'parent_id': parent_id,
        'date_created': node.date_created,
        'license': serialize_node_license_record(node.license),
        'affiliated_institutions': [inst.name for inst in node.affiliated_institutions],
        'boost': int(not node.is_registration) + 1,  # This is for making registered projects less relevant
        'extra_search_terms': clean_splitters(title),
    }

    if not node.is_retracted:
        # Load every current wiki page up front, then record its raw text by page name.
        current_pages = list(map(NodeWikiPage.load, node.wiki_pages_current.values()))
        wikis = document['wikis']
        for page in current_pages:
            wikis[page.page_name] = page.raw_text(node)

    return document
Esempio n. 6
0
def serialize_node(node, category):
    """Assemble the search document (a plain dict) for ``node``.

    The ``wikis`` mapping is only filled in when the node is not retracted;
    dots in wiki page names are replaced by spaces when used as keys.

    :param node: object providing the attributes read below (``_id``, ``title``, ``tags``, ...)
    :param category: value stored unchanged under the ``category`` key
    :return: the assembled document dict
    """
    NodeWikiPage = apps.get_model('addons_wiki.NodeWikiPage')

    parent_id = node.parent_id
    title = node.title

    # Use the raw title whenever six.u() rejects it with a TypeError.
    try:
        title_text = six.u(title)
    except TypeError:
        title_text = title
    # NFKD-normalize, then drop every character that has no ASCII encoding.
    ascii_title = unicodedata.normalize('NFKD', title_text).encode('ascii', 'ignore')

    # Only visible contributors, in their stored order; a url is emitted for active users only.
    contributor_rows = (
        node._contributors
        .filter(contributor__visible=True)
        .order_by('contributor___order')
        .values('fullname', 'guids___id', 'is_active')
    )
    contributors = []
    for row in contributor_rows:
        profile_url = '/{}/'.format(row['guids___id']) if row['is_active'] else None
        contributors.append({'fullname': row['fullname'], 'url': profile_url})

    embargo_end = node.embargo_end_date

    document = {
        'id': node._id,
        'contributors': contributors,
        'title': title,
        'normalized_title': ascii_title,
        'category': category,
        'public': node.is_public,
        'tags': list(node.tags.filter(system=False).values_list('name', flat=True)),
        'description': node.description,
        'url': node.url,
        'is_registration': node.is_registration,
        'is_pending_registration': node.is_pending_registration,
        'is_retracted': node.is_retracted,
        'is_pending_retraction': node.is_pending_retraction,
        'embargo_end_date': embargo_end.strftime('%A, %b. %d, %Y') if embargo_end else False,
        'is_pending_embargo': node.is_pending_embargo,
        'registered_date': node.registered_date,
        'wikis': {},
        'parent_id': parent_id,
        'date_created': node.date_created,
        'license': serialize_node_license_record(node.license),
        'affiliated_institutions': list(node.affiliated_institutions.values_list('name', flat=True)),
        'boost': int(not node.is_registration) + 1,  # This is for making registered projects less relevant
        'extra_search_terms': clean_splitters(title),
        'preprint_url': node.preprint_url,
    }

    if not node.is_retracted:
        current_pages = NodeWikiPage.objects.filter(guids___id__in=node.wiki_pages_current.values())
        wikis = document['wikis']
        for page in current_pages:
            raw_text = page.raw_text(node)
            # '.' is not allowed in field names in ES2
            wikis[page.page_name.replace('.', ' ')] = raw_text

    return document
Esempio n. 7
0
def serialize_node(node, category):
    """Return the dict that describes ``node`` as a search document.

    Wiki contents are included (keyed by page name with ``.`` replaced by a
    space) unless the node is retracted.

    :param node: object whose ``_id``, ``title``, ``tags``, ``license`` etc. are read below
    :param category: copied as-is into the document's ``category`` field
    :return: the assembled document dict
    """
    NodeWikiPage = apps.get_model('addons_wiki.NodeWikiPage')

    def contributor_entry(row):
        # Inactive users keep their name but get no url.
        if row['is_active']:
            link = '/{}/'.format(row['guids___id'])
        else:
            link = None
        return {'fullname': row['fullname'], 'url': link}

    parent_id = node.parent_id
    raw_title = node.title

    try:
        decoded_title = six.u(raw_title)
    except TypeError:
        # six.u() refused the value; normalize the title exactly as stored.
        decoded_title = raw_title
    decomposed_title = unicodedata.normalize('NFKD', decoded_title)
    normalized_title = decomposed_title.encode('ascii', 'ignore')

    visible = node._contributors.filter(contributor__visible=True)
    rows = visible.order_by('contributor___order').values('fullname', 'guids___id', 'is_active')

    tag_names = node.tags.filter(system=False).values_list('name', flat=True)
    institution_names = node.affiliated_institutions.values_list('name', flat=True)

    embargo_end = node.embargo_end_date
    if embargo_end:
        embargo_label = embargo_end.strftime('%A, %b. %d, %Y')
    else:
        embargo_label = False

    result = {
        'id': node._id,
        'contributors': [contributor_entry(row) for row in rows],
        'title': raw_title,
        'normalized_title': normalized_title,
        'category': category,
        'public': node.is_public,
        'tags': list(tag_names),
        'description': node.description,
        'url': node.url,
        'is_registration': node.is_registration,
        'is_pending_registration': node.is_pending_registration,
        'is_retracted': node.is_retracted,
        'is_pending_retraction': node.is_pending_retraction,
        'embargo_end_date': embargo_label,
        'is_pending_embargo': node.is_pending_embargo,
        'registered_date': node.registered_date,
        'wikis': {},
        'parent_id': parent_id,
        'date_created': node.date_created,
        'license': serialize_node_license_record(node.license),
        'affiliated_institutions': list(institution_names),
        'boost': int(not node.is_registration) + 1,  # This is for making registered projects less relevant
        'extra_search_terms': clean_splitters(raw_title),
        'preprint_url': node.preprint_url,
    }

    if node.is_retracted:
        return result

    wiki_texts = result['wikis']
    for wiki in NodeWikiPage.objects.filter(guids___id__in=node.wiki_pages_current.values()):
        text = wiki.raw_text(node)
        # '.' is not allowed in field names in ES2
        wiki_texts[wiki.page_name.replace('.', ' ')] = text

    return result
Esempio n. 8
0
def update_file(file_, index=None, delete=False):
    """Keep the ``file`` search document for ``file_`` up to date.

    Deletes the document when ``delete`` is true, the file is unnamed, or its
    node is not public / is deleted / is archiving. In every other case the
    document is rebuilt and indexed under the file's ``_id``.

    :param file_: file object; node-level fields come from ``file_.node``
    :param index: target index name, defaulting to ``INDEX`` when falsy
    :param delete: force deletion of the document
    """
    index = index or INDEX

    # TODO: Can remove 'not file_.name' if we remove all base file nodes with name=None
    if not file_.name or not file_.node.is_public or delete or file_.node.is_deleted or file_.node.archiving:
        client().delete(index=index, doc_type='file', id=file_._id, refresh=True, ignore=[404])
        return

    owner = file_.node

    # We build URLs manually here so that this function can be
    # run outside of a Flask request context (e.g. in a celery task)
    deep_url = '/{node_id}/files/{provider}{path}/'.format(
        node_id=owner._id,
        provider=file_.provider,
        path=file_.path,
    )
    owner_url = '/{node_id}/'.format(node_id=owner._id)

    # Stays None unless the file already has a guid.
    guid_url = None
    guid = file_.get_guid(create=False)
    if guid:
        guid_url = '/{file_guid}/'.format(file_guid=guid._id)

    parent_node = owner.parent_node
    if parent_node:
        parent_id = parent_node._id
    else:
        parent_id = None

    visible_tags = file_.tags.filter(system=False).values_list('name', flat=True)

    body = {
        'id': file_._id,
        'deep_url': deep_url,
        'guid_url': guid_url,
        'tags': list(visible_tags),
        'name': file_.name,
        'category': 'file',
        'node_url': owner_url,
        'node_title': owner.title,
        'parent_id': parent_id,
        'is_registration': owner.is_registration,
        'is_retracted': owner.is_retracted,
        'extra_search_terms': clean_splitters(file_.name),
    }

    client().index(index=index, doc_type='file', body=body, id=file_._id, refresh=True)
Esempio n. 9
0
def serialize_group(group, category):
    """Assemble the search document (a plain dict) for ``group``.

    :param group: object providing ``_id``, ``name``, ``url``, ``created``,
        ``members_only`` and ``managers``
    :param category: value stored unchanged under the ``category`` key
    :return: the assembled document dict
    """

    def as_entries(users):
        # One {'fullname', 'url'} dict per user row; url is None for inactive users.
        entries = []
        for row in users.values('fullname', 'guids___id', 'is_active'):
            profile_url = '/{}/'.format(row['guids___id']) if row['is_active'] else None
            entries.append({'fullname': row['fullname'], 'url': profile_url})
        return entries

    name = group.name

    # Use the raw name whenever six.u() rejects it with a TypeError.
    try:
        name_text = six.u(name)
    except TypeError:
        name_text = name
    # NFKD-normalize, then drop every character that has no ASCII encoding.
    ascii_name = unicodedata.normalize('NFKD', name_text).encode('ascii', 'ignore')

    return {
        'id': group._id,
        'members': as_entries(group.members_only),
        'managers': as_entries(group.managers),
        'title': name,
        'normalized_title': ascii_name,
        'category': category,
        'url': group.url,
        'date_created': group.created,
        'boost': 2,  # More relevant than a registration
        'extra_search_terms': clean_splitters(name),
    }
Esempio n. 10
0
def update_file(file_, index=None, delete=False):
    """Index ``file_`` for search, or delete its document when it must not be searchable.

    The ``file`` document is deleted (``ignore=[404]`` is passed to the delete
    call) and nothing is indexed when any of these hold:

    * the file has no name, or ``delete`` is true;
    * the target is not public, is deleted, is archiving, is spam, or is
      flagged as spam while ``settings.SPAM_FLAGGED_REMOVE_FROM_SEARCH`` is set;
    * a tag of the file or of the target appears in
      ``settings.DO_NOT_INDEX_LIST['tags']``, or the target title contains
      one of ``settings.DO_NOT_INDEX_LIST['titles']``;
    * the target is a ``Preprint`` that is not ``verified_publishable`` or
      whose ``primary_file`` is not this file.

    Otherwise the document is (re)indexed under the file's ``_id``.

    :param file_: file object to index; ``file_.target`` supplies the project-level fields
    :param index: index name; the module-level ``INDEX`` is used when falsy
    :param delete: when true, always delete the document instead of indexing
    """
    index = index or INDEX
    target = file_.target
    is_preprint = isinstance(target, Preprint)

    def matches_do_not_index_list():
        # Deferred on purpose: the tag lookups only run when the cheap flag
        # checks below have not already decided that the document must go.
        blocked_tags = set(settings.DO_NOT_INDEX_LIST['tags'])
        if blocked_tags.intersection(file_.tags.all().values_list('name', flat=True)):
            return True
        if blocked_tags.intersection(target.tags.all().values_list('name', flat=True)):
            return True
        # 'node_title' below tolerates a target without a title, so the title
        # screen must not fail on such a target either.
        title = getattr(target, 'title', None) or ''
        return any(substring in title for substring in settings.DO_NOT_INDEX_LIST['titles'])

    # TODO: Can remove 'not file_.name' if we remove all base file nodes with name=None
    should_remove = (
        not file_.name
        or not target.is_public
        or delete
        or getattr(target, 'is_deleted', False)
        or getattr(target, 'archiving', False)
        or target.is_spam
        or (target.spam_status == SpamStatus.FLAGGED and settings.SPAM_FLAGGED_REMOVE_FROM_SEARCH)
        or matches_do_not_index_list()
    )
    # Spam status has already been screened above, so a preprint only needs
    # the additional "publishable and primary file" check.
    if not should_remove and is_preprint:
        should_remove = (
            not getattr(target, 'verified_publishable', False)
            or target.primary_file != file_
        )

    if should_remove:
        client().delete(index=index,
                        doc_type='file',
                        id=file_._id,
                        refresh=True,
                        ignore=[404])
        return

    # We build URLs manually here so that this function can be
    # run outside of a Flask request context (e.g. in a celery task)
    file_deep_url = '/{target_id}/files/{provider}{path}/'.format(
        target_id=target._id,
        provider=file_.provider,
        path=file_.path,
    )
    if getattr(target, 'is_quickfiles', None):
        node_url = '/{user_id}/quickfiles/'.format(user_id=target.creator._id)
    else:
        node_url = '/{target_id}/'.format(target_id=target._id)

    # Only link to the file's guid when one already exists (create=False).
    guid_url = None
    file_guid = file_.get_guid(create=False)
    if file_guid:
        guid_url = '/{file_guid}/'.format(file_guid=file_guid._id)
    # File URL's not provided for preprint files, because the File Detail Page will
    # just reroute to preprints detail
    file_doc = {
        'id': file_._id,
        'deep_url': None if is_preprint else file_deep_url,
        'guid_url': None if is_preprint else guid_url,
        'tags': list(file_.tags.filter(system=False).values_list('name', flat=True)),
        'name': file_.name,
        'category': 'file',
        'node_url': node_url,
        'node_title': getattr(target, 'title', None),
        'parent_id': target.parent_node._id if getattr(target, 'parent_node', None) else None,
        'is_registration': getattr(target, 'is_registration', False),
        'is_retracted': getattr(target, 'is_retracted', False),
        'extra_search_terms': clean_splitters(file_.name),
    }

    client().index(index=index,
                   doc_type='file',
                   body=file_doc,
                   id=file_._id,
                   refresh=True)
Esempio n. 11
0
def update_file(file_, index=None, delete=False):
    """Index ``file_`` for search, or delete its document when it must not be searchable.

    The ``file`` document is deleted (``ignore=[404]`` is passed to the delete
    call) and nothing is indexed when any of these hold:

    * the file has no name, or ``delete`` is true;
    * the target is not public, is deleted, or is archiving;
    * a tag of the file or of the target appears in
      ``settings.DO_NOT_INDEX_LIST['tags']``, or the target title contains
      one of ``settings.DO_NOT_INDEX_LIST['titles']``.

    Otherwise the document is (re)indexed under the file's ``_id``.

    :param file_: file object to index; ``file_.target`` supplies the project-level fields
    :param index: index name; the module-level ``INDEX`` is used when falsy
    :param delete: when true, always delete the document instead of indexing
    """
    index = index or INDEX
    target = file_.target

    def matches_do_not_index_list():
        # Deferred on purpose: the tag lookups only run when the cheap flag
        # checks below have not already decided that the document must go.
        blocked_tags = set(settings.DO_NOT_INDEX_LIST['tags'])
        if blocked_tags.intersection(file_.tags.all().values_list('name', flat=True)):
            return True
        if blocked_tags.intersection(target.tags.all().values_list('name', flat=True)):
            return True
        # 'node_title' below tolerates a target without a title, so the title
        # screen must not fail on such a target either.
        title = getattr(target, 'title', None) or ''
        return any(substring in title for substring in settings.DO_NOT_INDEX_LIST['titles'])

    # TODO: Can remove 'not file_.name' if we remove all base file nodes with name=None
    if (
        not file_.name
        or not target.is_public
        or delete
        or target.is_deleted
        or target.archiving
        or matches_do_not_index_list()
    ):
        client().delete(index=index,
                        doc_type='file',
                        id=file_._id,
                        refresh=True,
                        ignore=[404])
        return

    # We build URLs manually here so that this function can be
    # run outside of a Flask request context (e.g. in a celery task)
    file_deep_url = '/{target_id}/files/{provider}{path}/'.format(
        target_id=target._id,
        provider=file_.provider,
        path=file_.path,
    )
    if target.is_quickfiles:
        node_url = '/{user_id}/quickfiles/'.format(user_id=target.creator._id)
    else:
        node_url = '/{target_id}/'.format(target_id=target._id)

    # Only link to the file's guid when one already exists (create=False).
    guid_url = None
    file_guid = file_.get_guid(create=False)
    if file_guid:
        guid_url = '/{file_guid}/'.format(file_guid=file_guid._id)
    file_doc = {
        'id': file_._id,
        'deep_url': file_deep_url,
        'guid_url': guid_url,
        'tags': list(file_.tags.filter(system=False).values_list('name', flat=True)),
        'name': file_.name,
        'category': 'file',
        'node_url': node_url,
        'node_title': getattr(target, 'title', None),
        'parent_id': target.parent_node._id if getattr(target, 'parent_node', None) else None,
        'is_registration': getattr(target, 'is_registration', False),
        'is_retracted': getattr(target, 'is_retracted', False),
        'extra_search_terms': clean_splitters(file_.name),
    }

    client().index(index=index,
                   doc_type='file',
                   body=file_doc,
                   id=file_._id,
                   refresh=True)
Esempio n. 12
0
def update_file(file_, index=None, delete=False):
    """Index ``file_`` for search, or delete its document when it must not be searchable.

    The ``file`` document is deleted (``ignore=[404]`` is passed to the delete
    call) and nothing is indexed when any of these hold:

    * the file has no name, or ``delete`` is true;
    * the target is not public, is deleted, is archiving, is spam, or is
      flagged as spam while ``settings.SPAM_FLAGGED_REMOVE_FROM_SEARCH`` is set;
    * a tag of the file or of the target appears in
      ``settings.DO_NOT_INDEX_LIST['tags']``, or the target title contains
      one of ``settings.DO_NOT_INDEX_LIST['titles']``;
    * the target is a ``Preprint`` that is not ``verified_publishable`` or
      whose ``primary_file`` is not this file.

    Otherwise the document is (re)indexed under the file's ``_id``.

    :param file_: file object to index; ``file_.target`` supplies the project-level fields
    :param index: index name; the module-level ``INDEX`` is used when falsy
    :param delete: when true, always delete the document instead of indexing
    """
    index = index or INDEX
    target = file_.target
    is_preprint = isinstance(target, Preprint)

    def matches_do_not_index_list():
        # Deferred on purpose: the tag lookups only run when the cheap flag
        # checks below have not already decided that the document must go.
        blocked_tags = set(settings.DO_NOT_INDEX_LIST['tags'])
        if blocked_tags.intersection(file_.tags.all().values_list('name', flat=True)):
            return True
        if blocked_tags.intersection(target.tags.all().values_list('name', flat=True)):
            return True
        # 'node_title' below tolerates a target without a title, so the title
        # screen must not fail on such a target either.
        title = getattr(target, 'title', None) or ''
        return any(substring in title for substring in settings.DO_NOT_INDEX_LIST['titles'])

    # TODO: Can remove 'not file_.name' if we remove all base file nodes with name=None
    should_remove = (
        not file_.name
        or not target.is_public
        or delete
        or getattr(target, 'is_deleted', False)
        or getattr(target, 'archiving', False)
        or target.is_spam
        or (target.spam_status == SpamStatus.FLAGGED and settings.SPAM_FLAGGED_REMOVE_FROM_SEARCH)
        or matches_do_not_index_list()
    )
    # Spam status has already been screened above, so a preprint only needs
    # the additional "publishable and primary file" check.
    if not should_remove and is_preprint:
        should_remove = (
            not getattr(target, 'verified_publishable', False)
            or target.primary_file != file_
        )

    if should_remove:
        client().delete(
            index=index,
            doc_type='file',
            id=file_._id,
            refresh=True,
            ignore=[404]
        )
        return

    # We build URLs manually here so that this function can be
    # run outside of a Flask request context (e.g. in a celery task)
    file_deep_url = '/{target_id}/files/{provider}{path}/'.format(
        target_id=target._id,
        provider=file_.provider,
        path=file_.path,
    )
    if getattr(target, 'is_quickfiles', None):
        node_url = '/{user_id}/quickfiles/'.format(user_id=target.creator._id)
    else:
        node_url = '/{target_id}/'.format(target_id=target._id)

    # Only link to the file's guid when one already exists (create=False).
    guid_url = None
    file_guid = file_.get_guid(create=False)
    if file_guid:
        guid_url = '/{file_guid}/'.format(file_guid=file_guid._id)
    # File URL's not provided for preprint files, because the File Detail Page will
    # just reroute to preprints detail
    file_doc = {
        'id': file_._id,
        'deep_url': None if is_preprint else file_deep_url,
        'guid_url': None if is_preprint else guid_url,
        'tags': list(file_.tags.filter(system=False).values_list('name', flat=True)),
        'name': file_.name,
        'category': 'file',
        'node_url': node_url,
        'node_title': getattr(target, 'title', None),
        'parent_id': target.parent_node._id if getattr(target, 'parent_node', None) else None,
        'is_registration': getattr(target, 'is_registration', False),
        'is_retracted': getattr(target, 'is_retracted', False),
        'extra_search_terms': clean_splitters(file_.name),
    }

    client().index(
        index=index,
        doc_type='file',
        body=file_doc,
        id=file_._id,
        refresh=True
    )
Esempio n. 13
0
def serialize_node(node, category):
    """Return the dict that describes ``node`` as a search document.

    Wiki contents (page name -> raw text) are included unless the node is
    retracted.

    :param node: object whose ``_id``, ``title``, ``tags``, ``license`` etc. are read below
    :param category: copied as-is into the document's ``category`` field
    :return: the assembled document dict
    """
    from website.addons.wiki.model import NodeWikiPage

    def contributor_entry(user):
        # Inactive users keep their name but get no url.
        return {
            'fullname': user.fullname,
            'url': user.profile_url if user.is_active else None,
        }

    parent_id = node.parent_id
    raw_title = node.title

    try:
        decoded_title = six.u(raw_title)
    except TypeError:
        # six.u() refused the value; normalize the title exactly as stored.
        decoded_title = raw_title
    decomposed_title = unicodedata.normalize('NFKD', decoded_title)
    normalized_title = decomposed_title.encode('ascii', 'ignore')

    embargo_end = node.embargo_end_date
    if embargo_end:
        embargo_label = embargo_end.strftime('%A, %b. %d, %Y')
    else:
        embargo_label = False

    result = {
        'id': node._id,
        'contributors': [
            contributor_entry(user)
            for user in node.visible_contributors
            if user is not None
        ],
        'title': raw_title,
        'normalized_title': normalized_title,
        'category': category,
        'public': node.is_public,
        'tags': [tag._id for tag in node.tags if tag],
        'description': node.description,
        'url': node.url,
        'is_registration': node.is_registration,
        'is_pending_registration': node.is_pending_registration,
        'is_retracted': node.is_retracted,
        'is_pending_retraction': node.is_pending_retraction,
        'embargo_end_date': embargo_label,
        'is_pending_embargo': node.is_pending_embargo,
        'registered_date': node.registered_date,
        'wikis': {},
        'parent_id': parent_id,
        'date_created': node.date_created,
        'license': serialize_node_license_record(node.license),
        'affiliated_institutions': [inst.name for inst in node.affiliated_institutions],
        'boost': int(not node.is_registration) + 1,  # This is for making registered projects less relevant
        'extra_search_terms': clean_splitters(raw_title),
    }

    if node.is_retracted:
        return result

    # Every current wiki page is loaded first; only then are the texts collected.
    loaded_pages = [NodeWikiPage.load(wiki_id) for wiki_id in node.wiki_pages_current.values()]
    wiki_texts = result['wikis']
    for wiki in loaded_pages:
        wiki_texts[wiki.page_name] = wiki.raw_text(node)

    return result