Example #1
def update_preprint(preprint, index=None, bulk=False, async_update=False):
    from addons.osfstorage.models import OsfStorageFile
    index = index or INDEX
    for file_ in paginated(
            OsfStorageFile,
            Q(target_content_type=ContentType.objects.get_for_model(
                type(preprint)),
              target_object_id=preprint.id)):
        update_file(file_, index=index)

    is_qa_preprint = bool(
        set(settings.DO_NOT_INDEX_LIST['tags']).intersection(
            preprint.tags.all().values_list('name', flat=True))) or any(
                substring in preprint.title
                for substring in settings.DO_NOT_INDEX_LIST['titles'])
    if not preprint.verified_publishable or preprint.is_spam or (
            preprint.spam_status == SpamStatus.FLAGGED
            and settings.SPAM_FLAGGED_REMOVE_FROM_SEARCH) or is_qa_preprint:
        delete_doc(preprint._id, preprint, category='preprint', index=index)
    else:
        category = 'preprint'
        elastic_document = serialize_preprint(preprint, category)
        if bulk:
            return elastic_document
        else:
            client().index(index=index,
                           doc_type=category,
                           id=preprint._id,
                           body=elastic_document,
                           refresh=True)
Example #2
    def get_events(self, date):
        """ Get all node logs from a given date for a 24 hour period,
        ending at the date given.
        """
        super(UserDomainEvents, self).get_events(date)

        # In the end, turn the date back into a datetime at midnight for queries
        date = datetime(date.year, date.month, date.day).replace(tzinfo=pytz.UTC)

        logger.info('Gathering user domains between {} and {}'.format(
            date, (date + timedelta(days=1)).isoformat()
        ))
        user_query = (Q(date_confirmed__lt=date + timedelta(days=1)) &
                      Q(date_confirmed__gte=date) &
                      Q(username__isnull=False))
        users = paginated(OSFUser, query=user_query)
        user_domain_events = []
        for user in users:
            user_date = user.date_confirmed.replace(tzinfo=pytz.UTC)
            event = {
                'keen': {'timestamp': user_date.isoformat()},
                'date': user_date.isoformat(),
                'domain': user.username.split('@')[-1]
            }
            user_domain_events.append(event)

        logger.info('User domains collected. {} users and their email domains.'.format(len(user_domain_events)))
        return user_domain_events
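Every snippet here pages through query results with the OSF `paginated` helper. A minimal sketch of what such a helper does, assuming Django querysets (the real OSF implementation may differ in signature and details):

def paginated(model, query=None, increment=200, each=True, include=None):
    """Sketch only: yield model instances (or whole pages) in fixed-size slices."""
    queryset = model.objects.filter(query) if query is not None else model.objects.all()
    if include:
        # Prefetch related objects to avoid per-row queries (assumption).
        queryset = queryset.prefetch_related(*include)
    queryset = queryset.order_by('pk')  # stable ordering so slices do not overlap
    total = queryset.count()
    for offset in range(0, total, increment):
        page = queryset[offset:offset + increment]
        if each:
            for item in page:
                yield item
        else:
            yield page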
Example #3
    def get_events(self, date):
        """ Get all node logs from a given date for a 24 hour period,
        ending at the date given.
        """
        super(UserDomainEvents, self).get_events(date)

        # In the end, turn the date back into a datetime at midnight for queries
        date = datetime(date.year, date.month,
                        date.day).replace(tzinfo=pytz.UTC)

        logger.info('Gathering user domains between {} and {}'.format(
            date, (date + timedelta(1)).isoformat()))
        user_query = (Q('date_confirmed', 'lt', date + timedelta(1))
                      & Q('date_confirmed', 'gte', date)
                      & Q('username', 'ne', None))
        users = paginated(OSFUser, query=user_query)
        user_domain_events = []
        for user in users:
            user_date = user.date_confirmed.replace(tzinfo=pytz.UTC)
            event = {
                'keen': {
                    'timestamp': user_date.isoformat()
                },
                'date': user_date.isoformat(),
                'domain': user.username.split('@')[-1]
            }
            user_domain_events.append(event)

        logger.info(
            'User domains collected. {} users and their email domains.'.format(
                len(user_domain_events)))
        return user_domain_events
Example #4
    def get_events(self, date):
        """ Get all node logs from a given date for a 24 hour period,
        ending at the date given.
        """
        super(NodeLogEvents, self).get_events(date)

        # In the end, turn the date back into a datetime at midnight for queries
        date = datetime(date.year, date.month, date.day).replace(tzinfo=pytz.UTC)

        logger.info('Gathering node logs between {} and {}'.format(
            date, (date + timedelta(1)).isoformat()
        ))

        node_log_query = Q('date', 'lt', date + timedelta(1)) & Q('date', 'gte', date)

        node_logs = paginated(NodeLog, query=node_log_query)
        node_log_events = []
        for node_log in node_logs:
            log_date = node_log.date.replace(tzinfo=pytz.UTC)
            event = {
                'keen': {'timestamp': log_date.isoformat()},
                'date': log_date.isoformat(),
                'action': node_log.action
            }

            if node_log.user:
                event.update({'user_id': node_log.user._id})

            node_log_events.append(event)

        logger.info('NodeLogs counted. {} NodeLogs.'.format(len(node_log_events)))
        return node_log_events
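The two-step date handling above (a midnight datetime plus a one-day delta) defines a half-open 24-hour window. A small worked example:

from datetime import date, datetime, timedelta
import pytz

d = date(2020, 5, 1)
start = datetime(d.year, d.month, d.day).replace(tzinfo=pytz.UTC)
end = start + timedelta(days=1)
# The queries above select rows with start <= date < end, i.e. from
# 2020-05-01T00:00:00+00:00 inclusive to 2020-05-02T00:00:00+00:00 exclusive.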
Example #5
def update_node(node, index=None, bulk=False, async_update=False):
    from addons.osfstorage.models import OsfStorageFile
    index = index or INDEX
    for file_ in paginated(
            OsfStorageFile,
            Q(target_content_type=ContentType.objects.get_for_model(
                type(node)),
              target_object_id=node.id)):
        update_file(file_, index=index)

    is_qa_node = bool(
        set(settings.DO_NOT_INDEX_LIST['tags']).intersection(
            node.tags.all().values_list('name', flat=True))) or any(
                substring in node.title
                for substring in settings.DO_NOT_INDEX_LIST['titles'])
    if node.is_deleted or not node.is_public or node.archiving or node.is_spam or (
            node.spam_status == SpamStatus.FLAGGED
            and settings.SPAM_FLAGGED_REMOVE_FROM_SEARCH
    ) or node.is_quickfiles or is_qa_node:
        delete_doc(node._id, node, index=index)
    else:
        category = get_doctype_from_node(node)
        elastic_document = serialize_node(node, category)
        if bulk:
            return elastic_document
        else:
            client().index(index=index,
                           doc_type=category,
                           id=node._id,
                           body=elastic_document,
                           refresh=True)
Example #6
    def get_events(self, date=None):
        super(AddonSnapshot, self).get_events(date)

        counts = []
        addons_available = {
            addon.short_name: addon for addon in ADDONS_AVAILABLE
        }

        for short_name, addon in addons_available.items():

            has_external_account = hasattr(addon.models.get('nodesettings'),
                                           'external_account')

            connected_count = 0
            deleted_count = 0
            disconnected_count = 0
            node_settings_model = addon.models.get('nodesettings')
            if node_settings_model:
                for node_settings in paginated(node_settings_model):
                    if node_settings.owner and not node_settings.owner.all_tags.filter(
                            name='old_node_collection', system=True).exists():
                        connected_count += 1
                deleted_count = addon.models['nodesettings'].objects.filter(
                    deleted=True).count() if addon.models.get(
                        'nodesettings') else 0
                if has_external_account:
                    disconnected_count = addon.models[
                        'nodesettings'].objects.filter(
                            external_account__isnull=True, deleted=False
                        ).count() if addon.models.get('nodesettings') else 0
                else:
                    if addon.models.get('nodesettings'):
                        for nsm in addon.models['nodesettings'].objects.filter(
                                deleted=False):
                            if nsm.configured and not nsm.complete:
                                disconnected_count += 1
            total = connected_count + deleted_count + disconnected_count
            usage_counts = get_enabled_authorized_linked(
                addon.models.get('usersettings'), has_external_account,
                addon.short_name)

            counts.append({
                'provider': {
                    'name': short_name
                },
                'users': usage_counts,
                'nodes': {
                    'total': total,
                    'connected': connected_count,
                    'deleted': deleted_count,
                    'disconnected': disconnected_count
                }
            })

            logger.info(
                '{} counted. Users with a linked node: {}, Total connected nodes: {}.'
                .format(addon.short_name, usage_counts['linked'], total))
        return counts
Example #7
    def get_events(self, date):
        super(UserSummary, self).get_events(date)

        # Convert to a datetime at midnight for queries and the timestamp
        timestamp_datetime = datetime(date.year, date.month,
                                      date.day).replace(tzinfo=pytz.UTC)
        query_datetime = timestamp_datetime + timedelta(1)

        active_user_query = (Q('is_registered', 'eq', True)
                             & Q('password', 'ne', None)
                             & Q('merged_by', 'eq', None)
                             & Q('date_disabled', 'eq', None)
                             & Q('date_confirmed', 'ne', None)
                             & Q('date_confirmed', 'lt', query_datetime))

        active_users = 0
        depth_users = 0
        profile_edited = 0
        user_pages = paginated(OSFUser, query=active_user_query)
        for user in user_pages:
            active_users += 1
            log_count = count_user_logs(user)
            if log_count >= LOG_THRESHOLD:
                depth_users += 1
            if user.social or user.schools or user.jobs:
                profile_edited += 1

        counts = {
            'keen': {
                'timestamp': timestamp_datetime.isoformat()
            },
            'status': {
                'active': active_users,
                'depth': depth_users,
                'unconfirmed': OSFUser.find(
                    Q('date_registered', 'lt', query_datetime)
                    & Q('date_confirmed', 'eq', None)).count(),
                'deactivated': OSFUser.find(
                    Q('date_disabled', 'ne', None)
                    & Q('date_disabled', 'lt', query_datetime)).count(),
                'merged': OSFUser.find(
                    Q('date_registered', 'lt', query_datetime)
                    & Q('merged_by', 'ne', None)).count(),
                'profile_edited': profile_edited
            }
        }
        logger.info(
            'Users counted. Active: {}, Depth: {}, Unconfirmed: {}, Deactivated: {}, Merged: {}, Profile Edited: {}'
            .format(counts['status']['active'], counts['status']['depth'],
                    counts['status']['unconfirmed'],
                    counts['status']['deactivated'],
                    counts['status']['merged'],
                    counts['status']['profile_edited']))
        return [counts]
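This version uses the older MODM-style Q('field', 'operator', value) syntax; Examples #2, #18, and #24 express the same filters as Django-ORM keyword lookups:

from datetime import datetime
from django.db.models import Q

query_datetime = datetime(2020, 5, 2)  # illustrative value
# MODM style (Examples #3 and #7):  Q('date_confirmed', 'lt', query_datetime)
# Django-ORM equivalent (Example #18):
unconfirmed_before = Q(date_confirmed__lt=query_datetime)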
Example #8
def main(dry=True):
    count = 0
    for node in paginated(AbstractNode, increment=1000):
        true_root = node.get_root()
        if not node.root or node.root.id != true_root.id:
            count += 1
            logger.info('Setting root for node {} to {}'.format(node._id, true_root._id))
            if not dry:
                AbstractNode.objects.filter(id=node.id).update(root=true_root)
    logger.info('Finished migrating {} nodes'.format(count))
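A hedged sketch of wiring this up as a dry-run-by-default script; the flag parsing is an assumption, not the actual OSF entry point:

if __name__ == '__main__':
    import sys
    # Hypothetical flag: pass '--no-dry' to actually write the root updates.
    main(dry='--no-dry' not in sys.argv)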
Example #9
def get_enabled_authorized_linked(user_settings_list, has_external_account,
                                  short_name):
    """ Gather the number of users who have at least one node in each of the stages for an addon

    :param user_settings_list: list of user_settings for a particular addon
    :param has_external_account: whether the addon's node settings carry an external_account; determines how node settings are loaded
    :param short_name: short name of addon to get correct node_settings
    :return:  dict with number of users that have at least one project at each stage
    """
    from addons.forward.models import NodeSettings as ForwardNodeSettings

    num_enabled = 0  # number of users w/ 1+ addon account connected
    num_authorized = 0  # number of users w/ 1+ addon account connected to 1+ node
    num_linked = 0  # number of users w/ 1+ addon account connected to 1+ node and configured

    # osfstorage and wiki don't have user_settings, so always assume they're enabled, authorized, linked
    if short_name == 'osfstorage' or short_name == 'wiki':
        num_enabled = num_authorized = num_linked = OSFUser.objects.filter(
            is_registered=True,
            password__isnull=False,
            merged_by__isnull=True,
            date_disabled__isnull=True,
            date_confirmed__isnull=False).count()

    elif short_name == 'forward':
        num_enabled = num_authorized = ForwardNodeSettings.objects.count()
        num_linked = ForwardNodeSettings.objects.filter(
            url__isnull=False).count()

    else:
        for user_settings in paginated(user_settings_list):
            node_settings_list = []
            if has_external_account:
                if user_settings.has_auth:
                    num_enabled += 1
                    node_settings_list = [
                        AbstractNode.load(guid).get_addon(short_name)
                        for guid in user_settings.oauth_grants.keys()
                    ]
            else:
                num_enabled += 1
                node_settings_list = [
                    AbstractNode.load(guid).get_addon(short_name)
                    for guid in user_settings.nodes_authorized
                ]
            if any([ns.has_auth for ns in node_settings_list if ns]):
                num_authorized += 1
                if any([(ns.complete and ns.configured)
                        for ns in node_settings_list if ns]):
                    num_linked += 1
    return {
        'enabled': num_enabled,
        'authorized': num_authorized,
        'linked': num_linked
    }
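Example #6 above shows the call site; the returned dict plugs directly into the per-addon counts:

usage_counts = get_enabled_authorized_linked(
    addon.models.get('usersettings'),  # the addon's user settings model
    has_external_account,              # whether node settings carry an external_account
    addon.short_name,
)
# usage_counts == {'enabled': <int>, 'authorized': <int>, 'linked': <int>}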
Example #10
def main(dry=True):
    count = 0
    for node in paginated(AbstractNode, increment=1000):
        true_root = node.get_root()
        if not node.root or node.root.id != true_root.id:
            count += 1
            logger.info('Setting root for node {} to {}'.format(
                node._id, true_root._id))
            if not dry:
                AbstractNode.objects.filter(id=node.id).update(root=true_root)
    logger.info('Finished migrating {} nodes'.format(count))
Example #11
def migrate_users(index):
    logger.info('Migrating users to index: {}'.format(index))
    n_migr = 0
    n_iter = 0
    users = paginated(OSFUser, query=None, each=True)
    for user in users:
        if user.is_active:
            search.update_user(user, index=index)
            n_migr += 1
        n_iter += 1

    logger.info('Users iterated: {0}\nUsers migrated: {1}'.format(
        n_iter, n_migr))
Example #12
    def get_events(self, date=None):
        super(AddonSnapshot, self).get_events(date)

        counts = []
        addons_available = {addon.short_name: addon for addon in ADDONS_AVAILABLE}

        for short_name, addon in addons_available.items():

            has_external_account = hasattr(addon.models.get('nodesettings'), 'external_account')

            connected_count = 0
            deleted_count = 0
            disconnected_count = 0
            node_settings_model = addon.models.get('nodesettings')
            if node_settings_model:
                for node_settings in paginated(node_settings_model):
                    if node_settings.owner and not node_settings.owner.all_tags.filter(name='old_node_collection', system=True).exists():
                        connected_count += 1
                deleted_count = addon.models['nodesettings'].objects.filter(deleted=True).count() if addon.models.get('nodesettings') else 0
                if has_external_account:
                    disconnected_count = addon.models['nodesettings'].objects.filter(external_account__isnull=True, deleted=False).count() if addon.models.get('nodesettings') else 0
                else:
                    if addon.models.get('nodesettings'):
                        for nsm in addon.models['nodesettings'].objects.filter(deleted=False):
                            if nsm.configured and not nsm.complete:
                                disconnected_count += 1
            total = connected_count + deleted_count + disconnected_count
            usage_counts = get_enabled_authorized_linked(addon.models.get('usersettings'), has_external_account, addon.short_name)

            counts.append({
                'provider': {
                    'name': short_name
                },
                'users': usage_counts,
                'nodes': {
                    'total': total,
                    'connected': connected_count,
                    'deleted': deleted_count,
                    'disconnected': disconnected_count
                }
            })

            logger.info(
                '{} counted. Users with a linked node: {}, Total connected nodes: {}.'.format(
                    addon.short_name,
                    usage_counts['linked'],
                    total
                )
            )
        return counts
Example #13
def update_preprint(preprint, index=None, bulk=False, async_update=False):
    from addons.osfstorage.models import OsfStorageFile
    index = index or INDEX
    for file_ in paginated(OsfStorageFile, Q(target_content_type=ContentType.objects.get_for_model(type(preprint)), target_object_id=preprint.id)):
        update_file(file_, index=index)

    is_qa_preprint = bool(set(settings.DO_NOT_INDEX_LIST['tags']).intersection(preprint.tags.all().values_list('name', flat=True))) or any(substring in preprint.title for substring in settings.DO_NOT_INDEX_LIST['titles'])
    if not preprint.verified_publishable or preprint.is_spam or (preprint.spam_status == SpamStatus.FLAGGED and settings.SPAM_FLAGGED_REMOVE_FROM_SEARCH) or is_qa_preprint:
        delete_doc(preprint._id, preprint, category='preprint', index=index)
    else:
        category = 'preprint'
        elastic_document = serialize_preprint(preprint, category)
        if bulk:
            return elastic_document
        else:
            client().index(index=index, doc_type=category, id=preprint._id, body=elastic_document, refresh=True)
Example #14
def update_node(node, index=None, bulk=False, async_update=False):
    from addons.osfstorage.models import OsfStorageFile
    index = index or INDEX
    for file_ in paginated(OsfStorageFile, Q(target_content_type=ContentType.objects.get_for_model(type(node)), target_object_id=node.id)):
        update_file(file_, index=index)

    is_qa_node = bool(set(settings.DO_NOT_INDEX_LIST['tags']).intersection(node.tags.all().values_list('name', flat=True))) or any(substring in node.title for substring in settings.DO_NOT_INDEX_LIST['titles'])
    if node.is_deleted or not node.is_public or node.archiving or node.is_spam or (node.spam_status == SpamStatus.FLAGGED and settings.SPAM_FLAGGED_REMOVE_FROM_SEARCH) or node.is_quickfiles or is_qa_node:
        delete_doc(node._id, node, index=index)
    else:
        category = get_doctype_from_node(node)
        elastic_document = serialize_node(node, category)
        if bulk:
            return elastic_document
        else:
            client().index(index=index, doc_type=category, id=node._id, body=elastic_document, refresh=True)
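The bulk=True branch returns the serialized document instead of indexing it immediately. A hedged sketch of batching those documents; that the delete path yields None, and filtering it out, are assumptions:

# Sketch only: collect documents for a later bulk index call.
serialized = (update_node(n, index=index, bulk=True) for n in nodes)
docs = [d for d in serialized if d is not None]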
Example #15
def get_enabled_authorized_linked(user_settings_list, has_external_account, short_name):
    """ Gather the number of users who have at least one node in each of the stages for an addon

    :param user_settings_list: list of user_settings for a particular addon
    :param has_external_account: whether the addon's node settings carry an external_account; determines how node settings are loaded
    :param short_name: short name of addon to get correct node_settings
    :return:  dict with number of users that have at least one project at each stage
    """
    from addons.forward.models import NodeSettings as ForwardNodeSettings

    num_enabled = 0  # number of users w/ 1+ addon account connected
    num_authorized = 0  # number of users w/ 1+ addon account connected to 1+ node
    num_linked = 0  # number of users w/ 1+ addon account connected to 1+ node and configured

    # osfstorage and wiki don't have user_settings, so always assume they're enabled, authorized, linked
    if short_name == 'osfstorage' or short_name == 'wiki':
        num_enabled = num_authorized = num_linked = OSFUser.objects.filter(
            is_registered=True,
            password__isnull=False,
            merged_by__isnull=True,
            date_disabled__isnull=True,
            date_confirmed__isnull=False
        ).count()

    elif short_name == 'forward':
        num_enabled = num_authorized = ForwardNodeSettings.objects.count()
        num_linked = ForwardNodeSettings.objects.filter(url__isnull=False).count()

    else:
        for user_settings in paginated(user_settings_list):
            node_settings_list = []
            if has_external_account:
                if user_settings.has_auth:
                    num_enabled += 1
                    node_settings_list = [AbstractNode.load(guid).get_addon(short_name) for guid in user_settings.oauth_grants.keys()]
            else:
                num_enabled += 1
                node_settings_list = [AbstractNode.load(guid).get_addon(short_name) for guid in user_settings.nodes_authorized]
            if any([ns.has_auth for ns in node_settings_list if ns]):
                num_authorized += 1
                if any([(ns.complete and ns.configured) for ns in node_settings_list if ns]):
                    num_linked += 1
    return {
        'enabled': num_enabled,
        'authorized': num_authorized,
        'linked': num_linked
    }
Example #16
def remove_search_index(dry_run=True):
    tag_query = Q()
    title_query = Q()
    for tag in DO_NOT_INDEX_LIST['tags']:
        tag_query |= Q(tags__name=tag)

    for title in DO_NOT_INDEX_LIST['titles']:
        title_query |= Q(title__contains=title)

    increment = 20
    nodes = paginated(AbstractNode, query=Q(is_public=True) & (tag_query | title_query), increment=increment, each=True)
    if dry_run:
        logger.warning('Dry run mode.')
        for node in nodes:
            logger.info('Removing {} with title \'{}\' from search index and SHARE.'.format(node._id, node.title))
    else:
        for node in nodes:
            update_node(node, bulk=False, async_update=True)
            update_node_share(node)
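The OR-accumulation in remove_search_index starts from an empty Q(), which acts as an identity when combined, so each loop builds (term1 OR term2 OR ...). A minimal illustration with hypothetical list contents:

from django.db.models import Q

DO_NOT_INDEX_LIST = {'tags': ['qatest'], 'titles': ['Bulk stress']}  # illustrative values

tag_query = Q()
for tag in DO_NOT_INDEX_LIST['tags']:
    tag_query |= Q(tags__name=tag)
# tag_query now matches any node tagged with one of the listed names.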
Example #17
def migrate_nodes(index, query=None):
    logger.info('Migrating nodes to index: {}'.format(index))
    node_query = Q(is_public=True, is_deleted=False)
    if query:
        node_query = query & node_query
    total = AbstractNode.objects.filter(node_query).count()
    increment = 100
    total_pages = (total // increment) + 1
    pages = paginated(AbstractNode,
                      query=node_query,
                      increment=increment,
                      each=False,
                      include=['contributor__user__guids'])

    for page_number, page in enumerate(pages):
        logger.info('Updating page {} / {}'.format(page_number + 1,
                                                   total_pages))
        AbstractNode.bulk_update_search(page, index=index)

    logger.info('Nodes migrated: {}'.format(total))
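One small caveat in the page math: (total // increment) + 1 overcounts by one page when total is an exact multiple of increment; math.ceil would be exact. Since total_pages only feeds the log message, the overcount is cosmetic.

import math
assert math.ceil(250 / 100) == 3  # 250 items at 100 per page -> 3 pages
assert (300 // 100) + 1 == 4      # exact multiple: reports 4 pages for 3 pages of data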
Example #18
    def get_events(self, date):
        super(UserSummary, self).get_events(date)

        # Convert to a datetime at midnight for queries and the timestamp
        timestamp_datetime = datetime(date.year, date.month,
                                      date.day).replace(tzinfo=pytz.UTC)
        query_datetime = timestamp_datetime + timedelta(days=1)

        active_user_query = (Q(is_registered=True) & Q(password__isnull=False)
                             & Q(merged_by__isnull=True)
                             & Q(date_disabled__isnull=True)
                             & Q(date_confirmed__isnull=False)
                             & Q(date_confirmed__lt=query_datetime))

        active_users = 0
        depth_users = 0
        profile_edited = 0
        user_pages = paginated(OSFUser, query=active_user_query)
        for user in user_pages:
            active_users += 1
            log_count = count_user_logs(user)
            if log_count >= LOG_THRESHOLD:
                depth_users += 1
            if user.social or user.schools or user.jobs:
                profile_edited += 1
        new_users = OSFUser.objects.filter(
            is_active=True,
            date_confirmed__gte=timestamp_datetime,
            date_confirmed__lt=query_datetime)
        counts = {
            'keen': {
                'timestamp': timestamp_datetime.isoformat()
            },
            'status': {
                'active': active_users,
                'depth': depth_users,
                'new_users_daily': new_users.count(),
                'new_users_with_institution_daily': new_users.filter(
                    affiliated_institutions__isnull=False).count(),
                'unconfirmed': OSFUser.objects.filter(
                    date_registered__lt=query_datetime,
                    date_confirmed__isnull=True).count(),
                'deactivated': OSFUser.objects.filter(
                    date_disabled__isnull=False,
                    date_disabled__lt=query_datetime).count(),
                'merged': OSFUser.objects.filter(
                    date_registered__lt=query_datetime,
                    merged_by__isnull=False).count(),
                'profile_edited': profile_edited,
            }
        }

        try:
            # This reads from Keen, so it can fail if the Keen read API is down while writes still succeed
            counts['status']['stickiness'] = self.calculate_stickiness(
                timestamp_datetime, query_datetime)
        except requests.exceptions.ConnectionError:
            sentry.log_message(
                'Unable to read from Keen. stickiness metric not collected for date {}'
                .format(timestamp_datetime.isoformat()))

        logger.info(
            'Users counted. Active: {}, Depth: {}, Unconfirmed: {}, Deactivated: {}, Merged: {}, Profile Edited: {}'
            .format(counts['status']['active'], counts['status']['depth'],
                    counts['status']['unconfirmed'],
                    counts['status']['deactivated'],
                    counts['status']['merged'],
                    counts['status']['profile_edited']))
        return [counts]
Example #19
        'boost': int(not node.is_registration) + 1,  # This is for making registered projects less relevant
        'extra_search_terms': clean_splitters(node.title),
        'preprint_url': node.preprint_url,
    }
    if not node.is_retracted:
        for wiki in node.get_wiki_pages_latest():
            # '.' is not allowed in field names in ES2
            elastic_document['wikis'][wiki.wiki_page.page_name.replace('.', ' ')] = wiki.raw_text(node)

    return elastic_document

@requires_search
def update_node(node, index=None, bulk=False, async_update=False):
    from addons.osfstorage.models import OsfStorageFile
    index = index or INDEX
    for file_ in paginated(OsfStorageFile, Q(node=node)):
        update_file(file_, index=index)

    is_qa_node = bool(set(settings.DO_NOT_INDEX_LIST['tags']).intersection(node.tags.all().values_list('name', flat=True))) or any(substring in node.title for substring in settings.DO_NOT_INDEX_LIST['titles'])
    if node.is_deleted or not node.is_public or node.archiving or (node.is_spammy and settings.SPAM_FLAGGED_REMOVE_FROM_SEARCH) or node.is_quickfiles or is_qa_node:
        delete_doc(node._id, node, index=index)
    else:
        category = get_doctype_from_node(node)
        elastic_document = serialize_node(node, category)
        if bulk:
            return elastic_document
        else:
            client().index(index=index, doc_type=category, id=node._id, body=elastic_document, refresh=True)

def bulk_update_nodes(serialize, nodes, index=None):
    """Updates the list of input projects
Example #20
        'boost': int(not node.is_registration) + 1,  # This is for making registered projects less relevant
        'extra_search_terms': clean_splitters(node.title),
        'preprint_url': node.preprint_url,
    }
    if not node.is_retracted:
        for wiki in node.get_wiki_pages_latest():
            # '.' is not allowed in field names in ES2
            elastic_document['wikis'][wiki.wiki_page.page_name.replace('.', ' ')] = wiki.raw_text(node)

    return elastic_document

@requires_search
def update_node(node, index=None, bulk=False, async_update=False):
    from addons.osfstorage.models import OsfStorageFile
    index = index or INDEX
    for file_ in paginated(OsfStorageFile, Q(target_content_type=ContentType.objects.get_for_model(type(node)), target_object_id=node.id)):
        update_file(file_, index=index)

    is_qa_node = bool(set(settings.DO_NOT_INDEX_LIST['tags']).intersection(node.tags.all().values_list('name', flat=True))) or any(substring in node.title for substring in settings.DO_NOT_INDEX_LIST['titles'])
    if node.is_deleted or not node.is_public or node.archiving or (node.is_spammy and settings.SPAM_FLAGGED_REMOVE_FROM_SEARCH) or node.is_quickfiles or is_qa_node:
        delete_doc(node._id, node, index=index)
    else:
        category = get_doctype_from_node(node)
        elastic_document = serialize_node(node, category)
        if bulk:
            return elastic_document
        else:
            client().index(index=index, doc_type=category, id=node._id, body=elastic_document, refresh=True)

def bulk_update_nodes(serialize, nodes, index=None):
    """Updates the list of input projects
Example #21
    if not node.is_retracted:
        for wiki in WikiPage.objects.get_wiki_pages_latest(node):
            # '.' is not allowed in field names in ES2
            elastic_document['wikis'][wiki.wiki_page.page_name.replace(
                '.', ' ')] = wiki.raw_text(node)

    return elastic_document


@requires_search
def update_node(node, index=None, bulk=False, async_update=False):
    from addons.osfstorage.models import OsfStorageFile
    index = index or INDEX
    for file_ in paginated(
            OsfStorageFile,
            Q(target_content_type=ContentType.objects.get_for_model(
                type(node)),
              target_object_id=node.id)):
        update_file(file_, index=index)

    is_qa_node = bool(
        set(settings.DO_NOT_INDEX_LIST['tags']).intersection(
            node.tags.all().values_list('name', flat=True))) or any(
                substring in node.title
                for substring in settings.DO_NOT_INDEX_LIST['titles'])
    if node.is_deleted or not node.is_public or node.archiving or (
            node.is_spammy and settings.SPAM_FLAGGED_REMOVE_FROM_SEARCH
    ) or node.is_quickfiles or is_qa_node:
        delete_doc(node._id, node, index=index)
    else:
        category = get_doctype_from_node(node)
Example #22
    }
    if not node.is_retracted:
        for wiki in NodeWikiPage.objects.filter(
                guids___id__in=node.wiki_pages_current.values()):
            # '.' is not allowed in field names in ES2
            elastic_document['wikis'][wiki.page_name.replace(
                '.', ' ')] = wiki.raw_text(node)

    return elastic_document


@requires_search
def update_node(node, index=None, bulk=False, async_update=False):
    from addons.osfstorage.models import OsfStorageFile
    index = index or INDEX
    for file_ in paginated(OsfStorageFile, Q(node=node)):
        update_file(file_, index=index)

    if node.is_deleted or not node.is_public or node.archiving or (
            node.is_spammy and
            settings.SPAM_FLAGGED_REMOVE_FROM_SEARCH) or node.is_quickfiles:
        delete_doc(node._id, node, index=index)
    else:
        category = get_doctype_from_node(node)
        elastic_document = serialize_node(node, category)
        if bulk:
            return elastic_document
        else:
            client().index(index=index,
                           doc_type=category,
                           id=node._id,
Example #23
        'boost': int(not node.is_registration) + 1,  # This is for making registered projects less relevant
        'extra_search_terms': clean_splitters(node.title),
        'preprint_url': node.preprint_url,
    }
    if not node.is_retracted:
        for wiki in NodeWikiPage.objects.filter(guids___id__in=node.wiki_pages_current.values()):
            # '.' is not allowed in field names in ES2
            elastic_document['wikis'][wiki.page_name.replace('.', ' ')] = wiki.raw_text(node)

    return elastic_document

@requires_search
def update_node(node, index=None, bulk=False, async_update=False):
    from addons.osfstorage.models import OsfStorageFile
    index = index or INDEX
    for file_ in paginated(OsfStorageFile, Q('node', 'eq', node)):
        update_file(file_, index=index)

    if node.is_deleted or not node.is_public or node.archiving or (node.is_spammy and settings.SPAM_FLAGGED_REMOVE_FROM_SEARCH):
        delete_doc(node._id, node, index=index)
    else:
        category = get_doctype_from_node(node)
        elastic_document = serialize_node(node, category)
        if bulk:
            return elastic_document
        else:
            client().index(index=index, doc_type=category, id=node._id, body=elastic_document, refresh=True)

def bulk_update_nodes(serialize, nodes, index=None):
    """Updates the list of input projects
Example #24
    def get_events(self, date):
        super(UserSummary, self).get_events(date)

        # Convert to a datetime at midnight for queries and the timestamp
        timestamp_datetime = datetime(date.year, date.month, date.day).replace(tzinfo=pytz.UTC)
        query_datetime = timestamp_datetime + timedelta(days=1)

        active_user_query = (
            Q(is_registered=True) &
            Q(password__isnull=False) &
            Q(merged_by__isnull=True) &
            Q(date_disabled__isnull=True) &
            Q(date_confirmed__isnull=False) &
            Q(date_confirmed__lt=query_datetime)
        )

        active_users = 0
        depth_users = 0
        profile_edited = 0
        user_pages = paginated(OSFUser, query=active_user_query)
        for user in user_pages:
            active_users += 1
            log_count = count_user_logs(user)
            if log_count >= LOG_THRESHOLD:
                depth_users += 1
            if user.social or user.schools or user.jobs:
                profile_edited += 1
        new_users = OSFUser.objects.filter(is_active=True, date_confirmed__gte=timestamp_datetime, date_confirmed__lt=query_datetime)
        counts = {
            'keen': {
                'timestamp': timestamp_datetime.isoformat()
            },
            'status': {
                'active': active_users,
                'depth': depth_users,
                'new_users_daily': new_users.count(),
                'new_users_with_institution_daily': new_users.filter(affiliated_institutions__isnull=False).count(),
                'unconfirmed': OSFUser.objects.filter(date_registered__lt=query_datetime, date_confirmed__isnull=True).count(),
                'deactivated': OSFUser.objects.filter(date_disabled__isnull=False, date_disabled__lt=query_datetime).count(),
                'merged': OSFUser.objects.filter(date_registered__lt=query_datetime, merged_by__isnull=False).count(),
                'profile_edited': profile_edited,
            }
        }

        try:
            # This reads from Keen, so it can fail if the Keen read API is down while writes still succeed
            counts['status']['stickiness'] = self.calculate_stickiness(timestamp_datetime, query_datetime)
        except requests.exceptions.ConnectionError:
            sentry.log_message('Unable to read from Keen. stickiness metric not collected for date {}'.format(timestamp_datetime.isoformat()))

        logger.info(
            'Users counted. Active: {}, Depth: {}, Unconfirmed: {}, Deactivated: {}, Merged: {}, Profile Edited: {}'.format(
                counts['status']['active'],
                counts['status']['depth'],
                counts['status']['unconfirmed'],
                counts['status']['deactivated'],
                counts['status']['merged'],
                counts['status']['profile_edited']
            )
        )
        return [counts]