def main(send_email=False):
    """Audit OsfStorage usage for every top-level project and its contributors.

    Logs a report line for each user/project exceeding its storage limit and
    optionally emails the collected report.

    :param send_email: when True, email the report; otherwise only log it.
    """
    logger.info('Starting Project storage audit')
    init_app(set_backends=True, routes=False)

    lines = []
    projects = {}
    users = defaultdict(lambda: (0, 0))  # user id -> (used, deleted) byte totals

    progress_bar = progressbar.ProgressBar(
        maxval=Node.find(Q('parent_node', 'eq', None)).count()).start()

    for i, node in enumerate(Node.find(Q('parent_node', 'eq', None))):
        progress_bar.update(i + 1)
        if node._id in WHITE_LIST:
            continue  # Dont count whitelisted nodes against users
        projects[node._id] = get_usage(node)
        for contrib in node.contributors:
            if node.can_edit(user=contrib):
                # Adds tuples together, map(sum, zip((a, b), (c, d))) -> (a+c, b+d)
                users[contrib._id] = tuple(
                    map(sum, zip(users[contrib._id], projects[node._id]))
                )
        if i % 25 == 0:
            # Clear all caches periodically to keep memory bounded
            for key in ('node', 'user', 'fileversion', 'storedfilenode'):
                Node._cache.data.get(key, {}).clear()
                Node._object_cache.data.get(key, {}).clear()
            # Collect garbage
            gc.collect()
    progress_bar.finish()

    for model, collection, limit in ((User, users, USER_LIMIT), (Node, projects, PROJECT_LIMIT)):
        for item, (used, deleted) in filter(functools.partial(limit_filter, limit), collection.items()):
            line = '{!r} has exceeded the limit {:.2f}GBs ({}b) with {:.2f}GBs ({}b) used and {:.2f}GBs ({}b) deleted.'.format(
                model.load(item), limit / GBs, limit, used / GBs, used, deleted / GBs, deleted)
            logger.info(line)
            lines.append(line)

    if lines:
        if send_email:
            logger.info('Sending email...')
            mails.send_mail('*****@*****.**', mails.EMPTY, body='\n'.join(lines),
                            subject='Script: OsfStorage usage audit')
        else:
            # Fix: the original called .format(len(lines)) on a message with no
            # placeholders; the redundant call is removed (output unchanged).
            logger.info('send_email is False, not sending email')
        logger.info('{} offending project(s) and user(s) found'.format(len(lines)))
    else:
        logger.info('No offending projects or users found')
def get_queryset(self):
    """Return nodes matching the request query.

    If the query filters on a blacklisted field, withdrawn registrations are
    excluded via a second id-based query (withdrawal status is a computed
    property, so it cannot be expressed in the original query).
    """
    query = self.get_query_from_request()
    blacklisted = self.is_blacklisted(query)
    nodes = Node.find(query)
    if not blacklisted:
        return nodes
    # If attempting to filter on a blacklisted field, exclude withdrawals.
    surviving_ids = [node._id for node in nodes if not node.is_retracted]
    return Node.find(Q("_id", "in", surviving_ids))
def get_queryset(self):
    """Return nodes matching the request query, excluding retractions when
    the query touches a blacklisted field.

    Retraction status is a computed property, so it is filtered in Python and
    the result re-queried by id.
    """
    query = self.get_query_from_request()
    blacklisted = self.is_blacklisted(query)
    nodes = Node.find(query)
    if blacklisted:
        # If attempting to filter on a blacklisted field, exclude retractions.
        kept = [node._id for node in nodes if not node.is_retracted]
        return Node.find(Q('_id', 'in', kept))
    return nodes
def get_queryset(self):
    """Resolve the request query to a set of nodes.

    When the query filters on a blacklisted field, retracted nodes are dropped
    and the remainder re-fetched by id so the caller still gets a query result.
    """
    query = self.get_query_from_request()
    is_blacklisted_query = self.is_blacklisted(query)
    nodes = Node.find(query)
    if not is_blacklisted_query:
        return nodes
    # Exclude retractions; is_retracted is computed, hence the second query.
    non_retracted_ids = [node._id for node in nodes if not node.is_retracted]
    return Node.find(Q('_id', 'in', non_retracted_ids))
def main(send_email=False):
    """Audit OsfStorage usage for top-level projects and their contributors.

    Accumulates (used, deleted) byte totals per project and per contributor
    with edit rights, logs each limit violation, and optionally emails the
    collected report.

    :param send_email: when True, email the report; otherwise only log it.
    """
    logger.info('Starting Project storage audit')
    init_app(set_backends=True, routes=False)

    lines = []
    projects = {}
    users = defaultdict(lambda: (0, 0))  # user -> (used, deleted) byte totals

    for node in Node.find(Q('__backrefs.parent.node.nodes', 'eq', None)):  # ODM hack to ignore all nodes with parents
        if node._id in WHITE_LIST:
            continue  # Dont count whitelisted nodes against users
        projects[node] = get_usage(node)
        for contrib in node.contributors:
            if node.can_edit(user=contrib):
                # Adds tuples together, map(sum, zip((a, b), (c, d))) -> (a+c, b+d)
                users[contrib] = tuple(map(sum, zip(users[contrib], projects[node])))

    for collection, limit in ((users, USER_LIMIT), (projects, PROJECT_LIMIT)):
        for item, (used, deleted) in filter(functools.partial(limit_filter, limit), collection.items()):
            line = '{!r} has exceeded the limit {:.2f}GBs ({}b) with {:.2f}GBs ({}b) used and {:.2f}GBs ({}b) deleted.'.format(
                item, limit / GBs, limit, used / GBs, used, deleted / GBs, deleted)
            logger.info(line)
            lines.append(line)

    if lines:
        if send_email:
            logger.info('Sending email...')
            mails.send_mail('*****@*****.**', mails.EMPTY, body='\n'.join(lines),
                            subject='Script: OsfStorage usage audit')
        else:
            # Fix: dropped a no-op .format(len(lines)) call on a message string
            # that contains no placeholders (output unchanged).
            logger.info('send_email is False, not sending email')
        logger.info('{} offending project(s) and user(s) found'.format(len(lines)))
    else:
        logger.info('No offending projects or users found')
def do_migration():
    """Hide duplicate ``preprint_license_updated`` logs.

    For every node with more than one such log, walk the logs newest-first and
    hide each one recorded within 60 seconds of the previous one.
    """
    dupe_nodes = [
        n for n in Node.find(Q('_id', 'in', list(set(
            l.node._id
            for l in NodeLog.find(Q('action', 'eq', 'preprint_license_updated'))
        ))))
        if NodeLog.find(
            Q('action', 'eq', 'preprint_license_updated') & Q('node', 'eq', n._id)
        ).count() > 1
    ]
    logger.info('Found {} nodes with multiple preprint_license_updated logs'.format(len(dupe_nodes)))

    for node in dupe_nodes:
        preprint_license_updated_logs = [log for log in node.logs if log.action == 'preprint_license_updated']
        log = preprint_license_updated_logs.pop()
        while preprint_license_updated_logs:
            next_log = preprint_license_updated_logs.pop()
            timedelta = log.date - next_log.date
            # Fix: the original compared ``timedelta.seconds``, which ignores
            # the days component (logs 1 day + 10s apart looked like dupes);
            # total_seconds() compares the full interval.
            if timedelta.total_seconds() < 60:
                logger.info(
                    'Hiding duplicate preprint_license_updated log with ID {} from node {}, timedelta was {}'.format(
                        log._id, node._id, timedelta
                    )
                )
                log.should_hide = True
                log.save()
            else:
                logger.info(
                    'Skipping preprint_license_updated log with ID {} from node {}, timedelta was {}'.format(
                        log._id, node._id, timedelta
                    )
                )
            log = next_log
def migrate_nodes():
    """Apply migrate_category to every node, saving and counting the changed ones."""
    migrated_count = 0
    for node in Node.find():
        if migrate_category(node):
            node.save()
            logger.info('Migrated {0}'.format(node._id))
            migrated_count += 1
    logger.info('Finished migrating {0} nodes.'.format(migrated_count))
def migrate_nodes():
    """Run the category migration over all nodes; persist and log each change."""
    total_migrated = 0
    for candidate in Node.find():
        changed = migrate_category(candidate)
        if not changed:
            continue
        candidate.save()
        logger.info('Migrated {0}'.format(candidate._id))
        total_migrated += 1
    logger.info('Finished migrating {0} nodes.'.format(total_migrated))
def test_delete_registration_tree(self):
    """delete_registration_tree must mark every node of the registration tree deleted."""
    project = factories.NodeFactory()
    factories.NodeFactory(parent=project)
    component = factories.NodeFactory(parent=project)
    factories.NodeFactory(parent=component)
    registration = factories.RegistrationFactory(project=project)
    tree_ids = [registration._id] + [
        descendant._id for descendant in registration.get_descendants_recursive()
    ]
    archiver_utils.delete_registration_tree(registration)
    # No node in the tree may remain undeleted.
    assert_false(
        Node.find(Q('_id', 'in', tree_ids) & Q('is_deleted', 'eq', False)).count()
    )
def get_projects(time=None, public=False, registered=False):
    """Count top-level content nodes.

    :param time: if given, only count nodes created strictly before it.
    :param public: if True, restrict to public nodes.
    :param registered: if True, restrict to registrations.
    """
    query = Q('parent_node', 'eq', None) & CONTENT_NODE_QUERY
    if time:
        query &= Q('date_created', 'lt', time)
    if public:
        query &= Q('is_public', 'eq', True)
    if registered:
        query &= Q('is_registration', 'eq', True)
    return Node.find(query).count()
def test_delete_registration_tree(self):
    """After delete_registration_tree, no node of the tree is left undeleted."""
    root = factories.NodeFactory()
    factories.NodeFactory(parent=root)
    second_component = factories.NodeFactory(parent=root)
    factories.NodeFactory(parent=second_component)
    registration = factories.RegistrationFactory(project=root)
    all_ids = [registration._id]
    all_ids.extend(r._id for r in registration.get_descendants_recursive())
    archiver_utils.delete_registration_tree(registration)
    remaining = Node.find(Q('_id', 'in', all_ids) & Q('is_deleted', 'eq', False))
    assert_false(remaining.count())
def project_tag(tag, auth, **kwargs):
    """Return title/url payloads for nodes carrying *tag* that *auth* may view."""
    tag_obj = Tag.load(tag)
    nodes = Node.find(Q("tags", "eq", tag_obj._id)) if tag_obj else []
    return {
        "nodes": [
            {"title": node.title, "url": node.url}
            for node in nodes
            if node.can_view(auth)
        ],
        "tag": tag,
    }
def get_targets():
    """Return users who own more than one non-deleted bookmark collection."""
    logger.info('Acquiring targets...')

    def _bookmark_collection_count(user):
        # Count live bookmark collections created by this user.
        return Node.find(
            Q('is_bookmark_collection', 'eq', True) &
            Q('is_deleted', 'eq', False) &
            Q('creator', 'eq', user._id)
        ).count()

    targets = [u for u in User.find() if _bookmark_collection_count(u) > 1]
    logger.info('Found {} target users.'.format(len(targets)))
    return targets
def main(send_email=False):
    """Audit OsfStorage usage of top-level projects and their contributors.

    Shows a progress bar, periodically clears ODM caches to bound memory, logs
    every limit violation, and optionally emails the report.

    :param send_email: when True, email the report; otherwise only log it.
    """
    logger.info('Starting Project storage audit')
    init_app(set_backends=True, routes=False)

    lines = []
    projects = {}
    users = defaultdict(lambda: (0, 0))  # user id -> (used, deleted) byte totals

    progress_bar = progressbar.ProgressBar(
        maxval=Node.find(Q('parent_node', 'eq', None)).count()).start()

    for i, node in enumerate(Node.find(Q('parent_node', 'eq', None))):
        progress_bar.update(i + 1)
        if node._id in WHITE_LIST:
            continue  # Dont count whitelisted nodes against users
        projects[node._id] = get_usage(node)
        for contrib in node.contributors:
            if node.can_edit(user=contrib):
                # Adds tuples together, map(sum, zip((a, b), (c, d))) -> (a+c, b+d)
                users[contrib._id] = tuple(map(sum, zip(users[contrib._id], projects[node._id])))
        if i % 25 == 0:
            # Clear all caches
            for key in ('node', 'user', 'fileversion', 'storedfilenode'):
                Node._cache.data.get(key, {}).clear()
                Node._object_cache.data.get(key, {}).clear()
            # Collect garbage
            gc.collect()
    progress_bar.finish()

    for model, collection, limit in ((User, users, USER_LIMIT), (Node, projects, PROJECT_LIMIT)):
        for item, (used, deleted) in filter(functools.partial(limit_filter, limit), collection.items()):
            line = '{!r} has exceeded the limit {:.2f}GBs ({}b) with {:.2f}GBs ({}b) used and {:.2f}GBs ({}b) deleted.'.format(
                model.load(item), limit / GBs, limit, used / GBs, used, deleted / GBs, deleted)
            logger.info(line)
            lines.append(line)

    if lines:
        if send_email:
            logger.info('Sending email...')
            mails.send_mail('*****@*****.**', mails.EMPTY, body='\n'.join(lines),
                            subject='Script: OsfStorage usage audit')
        else:
            # Fix: removed a no-op .format(len(lines)) on a placeholder-free
            # message string (output unchanged).
            logger.info('send_email is False, not sending email')
        logger.info('{} offending project(s) and user(s) found'.format(len(lines)))
    else:
        logger.info('No offending projects or users found')
def get_projects(time=None, public=False, registered=False):
    """Count live, non-folder project nodes, with optional creation-time,
    visibility, and registration filters."""
    query = (
        Q('category', 'eq', 'project') &
        Q('is_deleted', 'eq', False) &
        Q('is_folder', 'ne', True)
    )
    if time:
        query &= Q('date_created', 'lt', time)
    if public:
        query &= Q('is_public', 'eq', True)
    if registered:
        query &= Q('is_registration', 'eq', True)
    return Node.find(query).count()
def get_queryset(self):
    """Return the node's primary children matching the request query,
    restricted to what the requester can view, newest-modified first."""
    node = self.get_node()
    req_query = self.get_query_from_request()
    primary_ids = [child._id for child in node.nodes if child.primary]
    nodes = Node.find(Q('_id', 'in', primary_ids) & req_query)
    auth = get_user_auth(self.request)
    visible = [each for each in nodes if each.can_view(auth)]
    visible.sort(key=lambda n: n.date_modified, reverse=True)
    return visible
def get_queryset(self):
    """List the viewable primary children of the node, most recently modified first."""
    node = self.get_node()
    req_query = self.get_query_from_request()
    children_query = (
        Q('_id', 'in', [e._id for e in node.nodes if e.primary]) &
        req_query
    )
    auth = get_user_auth(self.request)
    viewable = (each for each in Node.find(children_query) if each.can_view(auth))
    return sorted(viewable, key=lambda n: n.date_modified, reverse=True)
def get_projects(time=None, public=False, registered=False):
    """Count top-level content nodes, optionally filtered by creation time,
    public visibility, and registration status."""
    filters = [Q('parent_node', 'eq', None), CONTENT_NODE_QUERY]
    if time:
        filters.append(Q('date_created', 'lt', time))
    if public:
        filters.append(Q('is_public', 'eq', True))
    if registered:
        filters.append(Q('is_registration', 'eq', True))
    query = filters[0]
    for extra in filters[1:]:
        query = query & extra
    return Node.find(query).count()
def recent_public_registrations(n=10):
    """Yield up to *n* recent public top-level registrations, newest first,
    skipping retracted and pending-embargo ones."""
    registrations = Node.find(
        CONTENT_NODE_QUERY &
        Q('parent_node', 'eq', None) &
        Q('is_public', 'eq', True) &
        Q('is_registration', 'eq', True)
    ).sort('-registered_date')
    remaining = n
    for reg in registrations:
        if not remaining:
            break
        # Filter based on calculated properties (not expressible in the query)
        if reg.is_retracted or reg.is_pending_embargo:
            continue
        remaining -= 1
        yield reg
def recent_public_registrations(n=10):
    """Generate at most *n* recent public project registrations, newest first,
    excluding retracted and pending-embargo ones."""
    base_query = (
        Q('category', 'eq', 'project') &
        Q('is_public', 'eq', True) &
        Q('is_deleted', 'eq', False)
    )
    candidates = Node.find(
        base_query & Q('is_registration', 'eq', True)
    ).sort('-registered_date')
    for reg in candidates:
        if not n:
            break
        # Retraction/embargo are computed properties; filter in Python.
        if reg.is_retracted or reg.is_pending_embargo:
            continue
        n -= 1
        yield reg
def get_queryset(self):
    """Return the node's non-link children matching the request query that the
    requester can view, ordered by descending date_modified."""
    node = self.get_node()
    req_query = self.get_query_from_request()
    child_pks = (
        node.node_relations
        .filter(is_node_link=False)
        .select_related('child')
        .values_list('child__pk', flat=True)
    )
    candidates = Node.find(Q('pk', 'in', child_pks) & req_query).order_by('-date_modified')
    auth = get_user_auth(self.request)
    visible_pks = [each.pk for each in candidates if each.can_view(auth)]
    # Re-query so the caller receives a Django queryset, not a plain list.
    return Node.objects.filter(pk__in=visible_pks).order_by('-date_modified')
def get_projects(time=None, public=False, registered=False):
    """Count non-deleted, non-folder project nodes.

    :param time: if given, only nodes created strictly before it.
    :param public: if True, only public nodes.
    :param registered: if True, only registrations.
    """
    query = Q('category', 'eq', 'project')
    query = query & Q('is_deleted', 'eq', False)
    query = query & Q('is_folder', 'ne', True)
    if time:
        query = query & Q('date_created', 'lt', time)
    if public:
        query = query & Q('is_public', 'eq', True)
    if registered:
        query = query & Q('is_registration', 'eq', True)
    return Node.find(query).count()
def project_tag(tag, auth, **kwargs):
    """Serialize the nodes tagged with *tag* that *auth* is allowed to view."""
    tag_obj = Tag.load(tag)
    if tag_obj is None:
        tagged = []
    else:
        tagged = Node.find(Q('tags', 'eq', tag_obj._id))
    visible = [node for node in tagged if node.can_view(auth)]
    return {
        'nodes': [{'title': node.title, 'url': node.url} for node in visible],
        'tag': tag,
    }
def recent_public_registrations(n=10):
    """Yield the *n* most recent public top-level registrations that are
    neither retracted nor pending embargo, newest first."""
    query = (
        CONTENT_NODE_QUERY &
        Q('parent_node', 'eq', None) &
        Q('is_public', 'eq', True) &
        Q('is_registration', 'eq', True)
    )
    for reg in Node.find(query).sort('-registered_date'):
        if not n:
            break
        # Filter based on calculated properties
        if reg.is_retracted or reg.is_pending_embargo:
            continue
        n -= 1
        yield reg
def project_tag(tag, auth, **kwargs):
    """Build the {'nodes': [...], 'tag': tag} payload of viewable tagged nodes."""
    tag_obj = Tag.load(tag)
    nodes = Node.find(Q('tags', 'eq', tag_obj._id)) if tag_obj else []
    payload_nodes = []
    for node in nodes:
        if not node.can_view(auth):
            continue
        payload_nodes.append({
            'title': node.title,
            'url': node.url,
        })
    return {'nodes': payload_nodes, 'tag': tag}
def migrate():
    """Collapse each target user's bookmark collections down to a single one,
    soft-deleting the rest."""
    targets = get_targets()
    total = len(targets)
    for i, user in enumerate(targets):
        logger.info('({}/{}) Preparing to migrate User {}'.format(i + 1, total, user._id))
        bookmarks = Node.find(
            Q('is_bookmark_collection', 'eq', True) &
            Q('creator', 'eq', user._id)
        ).sort('-date_modified')
        # Keep the last collection seen that has nodes; otherwise the first.
        keeper = None
        for candidate in bookmarks:
            if candidate.nodes:
                keeper = candidate
        keeper = keeper or bookmarks[0]
        logger.info('Marking Node {} as primary Bookmark Collection for User {}, preparing to delete others'.format(keeper._id, user._id))
        for candidate in bookmarks:
            if candidate._id != keeper._id:
                candidate.is_deleted = True
                candidate.save()
        logger.info('Successfully migrated User {}'.format(user._id))
    logger.info('Successfully migrated {} users'.format(total))
def recent_public_registrations(n=10):
    """Yield up to *n* recent public project registrations, newest first,
    skipping retracted and pending-embargo ones."""
    recent_query = (
        Q('category', 'eq', 'project') &
        Q('is_public', 'eq', True) &
        Q('is_deleted', 'eq', False)
    )
    registrations = Node.find(
        recent_query & Q('is_registration', 'eq', True)
    ).sort('-registered_date')
    for reg in registrations:
        if not n:
            break
        # Consistency fix: the sibling implementations in this file test
        # ``reg.is_pending_embargo``; this one read ``reg.pending_embargo``.
        # NOTE(review): confirm the Node model in this version exposes
        # ``is_pending_embargo`` before shipping.
        if reg.is_retracted or reg.is_pending_embargo:
            continue
        n = n - 1
        yield reg
def do_migration():
    """Hide duplicate ``preprint_license_updated`` logs recorded within a
    minute of each other on the same node."""
    affected_ids = list(set(
        l.node._id
        for l in NodeLog.find(Q('action', 'eq', 'preprint_license_updated'))
    ))
    dupe_nodes = [
        n for n in Node.find(Q('_id', 'in', affected_ids))
        if NodeLog.find(
            Q('action', 'eq', 'preprint_license_updated') &
            Q('node', 'eq', n._id)
        ).count() > 1
    ]
    logger.info(
        'Found {} nodes with multiple preprint_license_updated logs'.format(
            len(dupe_nodes)))

    for node in dupe_nodes:
        license_logs = [
            log for log in node.logs
            if log.action == 'preprint_license_updated'
        ]
        log = license_logs.pop()
        while license_logs:
            next_log = license_logs.pop()
            timedelta = log.date - next_log.date
            # Fix: ``timedelta.seconds`` ignores the days component, so logs
            # more than a day apart could be misclassified as duplicates;
            # total_seconds() measures the whole interval.
            if timedelta.total_seconds() < 60:
                logger.info(
                    'Hiding duplicate preprint_license_updated log with ID {} from node {}, timedelta was {}'
                    .format(log._id, node._id, timedelta))
                log.should_hide = True
                log.save()
            else:
                logger.info(
                    'Skipping preprint_license_updated log with ID {} from node {}, timedelta was {}'
                    .format(log._id, node._id, timedelta))
            log = next_log
def migrate():
    """Reduce each target user to one bookmark collection, deleting the others."""
    targets = get_targets()
    total = len(targets)
    for index, user in enumerate(targets, start=1):
        logger.info('({}/{}) Preparing to migrate User {}'.format(
            index, total, user._id))
        bookmarks = Node.find(
            Q('is_bookmark_collection', 'eq', True) &
            Q('creator', 'eq', user._id)).sort('-date_modified')
        # The keeper is the final collection in iteration order that has
        # nodes, falling back to the first collection.
        bookmark_to_keep = None
        for bookmark in bookmarks:
            if bookmark.nodes:
                bookmark_to_keep = bookmark
        if bookmark_to_keep is None:
            bookmark_to_keep = bookmarks[0]
        logger.info(
            'Marking Node {} as primary Bookmark Collection for User {}, preparing to delete others'
            .format(bookmark_to_keep._id, user._id))
        for bookmark in bookmarks:
            if bookmark._id != bookmark_to_keep._id:
                bookmark.is_deleted = True
                bookmark.save()
        logger.info('Successfully migrated User {}'.format(user._id))
    logger.info('Successfully migrated {} users'.format(total))
def get_targets():
    """Find users with more than one live bookmark collection."""
    logger.info('Acquiring targets...')
    targets = []
    for user in User.find():
        live_bookmarks = Node.find(
            Q('is_bookmark_collection', 'eq', True) &
            Q('is_deleted', 'eq', False) &
            Q('creator', 'eq', user._id)
        )
        if live_bookmarks.count() > 1:
            targets.append(user)
    logger.info('Found {} target users.'.format(len(targets)))
    return targets
def find_nested_projects():
    """Return live project nodes that appear in some parent's child list.

    The backref probe ('__backrefs.parent.node.nodes.0' exists) matches nodes
    referenced by at least one parent — i.e. nested projects.
    """
    nested_query = (
        Q('__backrefs.parent.node.nodes.0', 'exists', True) &
        Q('category', 'eq', 'project') &
        Q('is_deleted', 'eq', False)
    )
    return Node.find(nested_query)
def contributed(self):
    """Return the nodes that list this user among their contributors."""
    # Deferred import — Node is pulled in at call time, presumably to avoid
    # an import cycle with website.project.model.
    from website.project.model import Node
    contributor_query = Q('contributors', 'eq', self._id)
    return Node.find(contributor_query)
def created(self):
    """Return the nodes created by this user."""
    # Deferred import — presumably avoids an import cycle with the model module.
    from website.project.model import Node
    creator_query = Q('creator', 'eq', self._id)
    return Node.find(creator_query)
def get_queryset(self):
    """Resolve the request into a query and return the matching nodes."""
    return Node.find(self.get_query_from_request())
def get_user_nodes_since_workshop(user, workshop_date):
    """List the nodes *user* created more than one full day after *workshop_date*."""
    cutoff = workshop_date + timedelta(days=1)
    created_after = Q('creator', 'eq', user._id) & Q('date_created', 'gt', cutoff)
    return list(Node.find(query=created_after))
def get_user_nodes_since_workshop(user, workshop_date):
    """Return (as a list) the nodes created by *user* after the day following
    *workshop_date*."""
    # The one-day offset skips nodes created on the workshop day itself.
    query_date = workshop_date + timedelta(days=1)
    query = Q('creator', 'eq', user._id)
    query = query & Q('date_created', 'gt', query_date)
    return list(Node.find(query=query))
def find_nested_projects():
    """Find non-deleted projects that have at least one parent backref
    (i.e. projects nested under another node)."""
    return Node.find(
        Q('__backrefs.parent.node.nodes.0', 'exists', True)
        & Q('category', 'eq', 'project')
        & Q('is_deleted', 'eq', False)
    )