def save_bare_tags(page_size=5000):
    print 'Starting {}...'.format(sys._getframe().f_code.co_name)
    count = 0
    start = datetime.now()
    total = MODMTag.find().count()
    while count < total:
        with transaction.atomic():
            tags = []
            for modm_tag in MODMTag.find().sort('-_id')[count:count + page_size]:
                tags.append(Tag(_id=modm_tag._id, lower=modm_tag.lower, system=False))
                count += 1
                if count % page_size == 0 or count == total:
                    then = datetime.now()
                    print 'Saving tags {} through {}...'.format(count - page_size, count)
                    woot = Tag.objects.bulk_create(tags)
                    now = datetime.now()
                    print 'Done with {} tags in {} seconds...'.format(len(woot), (now - then).total_seconds())
                    tags = None
                    woot = None
                    trash = gc.collect()
                    print 'Took out {} trashes'.format(trash)
    print 'MODM Tags: {}'.format(total)
    print 'django Tags: {}'.format(Tag.objects.all().count())
    print 'Done with {} in {} seconds...'.format(sys._getframe().f_code.co_name, (datetime.now() - start).total_seconds())
def test_add_tag(self):
    file_ = self.root.append_file('That\'s How Strong My Love Is.mp3')
    tag = Tag(_id='Redding')
    tag.save()
    file_.tags.append(tag)
    file_.save()
    find = query_tag_file('Redding')['results']
    assert_equal(len(find), 1)
def test_add_tag(self):
    file_ = self.root.append_file('That\'s How Strong My Love Is.mp3')
    tag = Tag(_id='Redding', name='Redding')
    tag.save()
    file_.tags.add(tag)
    file_.save()
    find = query_tag_file('Redding')['results']
    assert_equal(len(find), 1)
def test_file_remove_tag(self):
    file = self.node_settings.get_root().append_file('Champion.mp3')
    tag = Tag(name='Graduation')
    tag.save()
    file.tags.add(tag)
    file.save()
    assert_in('Graduation', file.tags.values_list('name', flat=True))
    url = self.project.api_url_for('osfstorage_remove_tag', fid=file._id)
    self.app.delete_json(url, {'tag': 'Graduation'}, auth=self.user.auth)
    file.reload()
    assert_not_in('Graduation', file.tags.values_list('name', flat=True))
def test_file_add_tag_fail_doesnt_create_log(self, mock_log):
    file = self.node_settings.get_root().append_file('UltraLightBeam.mp3')
    tag = Tag(_id='The Life of Pablo')
    tag.save()
    file.tags.append(tag)
    file.save()
    url = self.project.api_url_for('osfstorage_add_tag', fid=file._id)
    res = self.app.post_json(url, {'tag': 'The Life of Pablo'}, auth=self.user.auth, expect_errors=True)
    assert_equal(res.status_code, 400)
    mock_log.assert_not_called()
def test_tag_the_same_tag(self):
    file = self.node_settings.get_root().append_file('Lie,Cheat,Steal.mp3')
    tag = Tag(_id='Run_the_Jewels')
    tag.save()
    file.tags.append(tag)
    file.save()
    assert_in('Run_the_Jewels', file.tags)
    url = self.project.api_url_for('osfstorage_add_tag', fid=file._id)
    res = self.app.post_json(url, {'tag': 'Run_the_Jewels'}, auth=self.user.auth, expect_errors=True)
    assert_equal(res.status_code, 400)
    assert_equal(res.json['status'], 'failure')
def test_file_remove_tag(self):
    file = self.node_settings.get_root().append_file('Champion.mp3')
    tag = Tag(_id='Graduation')
    tag.save()
    file.tags.append(tag)
    file.save()
    assert_in('Graduation', file.tags)
    url = self.project.api_url_for('osfstorage_remove_tag', fid=file._id)
    self.app.delete_json(url, {'tag': 'Graduation'}, auth=self.user.auth)
    file.reload()
    assert_not_in('Graduation', file.tags)
def test_file_remove_tag_creates_log(self):
    file = self.node_settings.get_root().append_file('Formation.flac')
    tag = Tag(_id='You that when you cause all this conversation')
    tag.save()
    file.tags.append(tag)
    file.save()
    url = self.project.api_url_for('osfstorage_remove_tag', fid=file._id)
    res = self.app.delete_json(url, {'tag': 'You that when you cause all this conversation'}, auth=self.user.auth)
    assert_equal(res.status_code, 200)
    self.node.reload()
    assert_equal(self.node.logs[-1].action, 'file_tag_removed')
def test_remove_tag(self):
    file_ = self.root.append_file('I\'ve Been Loving You Too Long.mp3')
    tag = Tag(_id='Blue')
    tag.save()
    file_.tags.append(tag)
    file_.save()
    find = query_tag_file('Blue')['results']
    assert_equal(len(find), 1)
    file_.tags.remove('Blue')
    file_.save()
    find = query_tag_file('Blue')['results']
    assert_equal(len(find), 0)
def test_remove_tag(self):
    file_ = self.root.append_file('I\'ve Been Loving You Too Long.mp3')
    tag = Tag(_id='Blue', name='Blue')
    tag.save()
    file_.tags.add(tag)
    file_.save()
    find = query_tag_file('Blue')['results']
    assert_equal(len(find), 1)
    file_.tags.remove(tag)
    file_.save()
    find = query_tag_file('Blue')['results']
    assert_equal(len(find), 0)
def add_tag(self, tag, auth, save=True, log=True):
    from website.models import Tag, NodeLog  # Prevent import error
    if tag not in self.tags and not self.node.is_registration:
        new_tag = Tag.load(tag)
        if not new_tag:
            new_tag = Tag(_id=tag)
            new_tag.save()
        self.tags.append(new_tag)
        if log:
            self.add_tag_log(NodeLog.FILE_TAG_ADDED, tag, auth)
        if save:
            self.save()
        return True
    return False
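# Hedged sketch (assumption, not from the source): with the Django models used in the tests
# above (Tag has `name` and `system` fields and files expose a ManyToMany `tags` manager),
# the "load or create, then attach" step inside add_tag() would usually be written with the
# ORM's get_or_create. `file_node` and `add_tag_django` are illustrative names only.
def add_tag_django(file_node, tag_name):
    # Reuse an existing Tag row or create a non-system one, then attach it to the file
    tag_instance, _created = Tag.objects.get_or_create(name=tag_name, defaults={'system': False})
    file_node.tags.add(tag_instance)
    return tag_instance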
def conference_submissions(**kwargs):
    """Return data for all OSF4M submissions.

    The total number of submissions for each meeting is calculated and cached
    in the Conference.num_submissions field.
    """
    conferences = Conference.find(Q('is_meeting', 'ne', False))
    # TODO: Revisit this loop, there has to be a way to optimize it
    for conf in conferences:
        # For efficiency, we filter by tag first, then node
        # instead of doing a single Node query
        projects = set()
        tags = Tag.find(
            Q('system', 'eq', False) &
            Q('name', 'iexact', conf.endpoint.lower())).values_list('pk', flat=True)
        nodes = Node.find(
            Q('tags', 'in', tags) &
            Q('is_public', 'eq', True) &
            Q('is_deleted', 'ne', True)).include('guids')
        projects.update(list(nodes))
        num_submissions = len(projects)
        # Cache the number of submissions
        conf.num_submissions = num_submissions
    bulk_update(conferences, update_fields=['num_submissions'])
    return {'success': True}
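# Hedged note (assumption, not shown in the snippet above): `bulk_update` is presumably the
# helper from the django-bulk-update package rather than Django's built-in queryset method,
# in which case the import would look roughly like this:
from bulk_update.helper import bulk_update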
def conference_submissions(**kwargs):
    """Return data for all OSF4M submissions.

    The total number of submissions for each meeting is calculated and cached
    in the Conference.num_submissions field.
    """
    submissions = []
    # TODO: Revisit this loop, there has to be a way to optimize it
    for conf in Conference.find():
        # For efficiency, we filter by tag first, then node
        # instead of doing a single Node query
        projects = set()
        tags = Tag.find(Q('lower', 'eq', conf.endpoint.lower())).get_keys()
        nodes = Node.find(
            Q('tags', 'in', tags) &
            Q('is_public', 'eq', True) &
            Q('is_deleted', 'ne', True)
        )
        projects.update(list(nodes))
        for idx, node in enumerate(projects):
            submissions.append(_render_conference_node(node, idx, conf))
        num_submissions = len(projects)
        # Cache the number of submissions
        conf.num_submissions = num_submissions
        conf.save()
        if num_submissions < settings.CONFERENCE_MIN_COUNT:
            continue
    submissions.sort(key=lambda submission: submission['dateCreated'], reverse=True)
    return {'submissions': submissions}
def conference_view(**kwargs):
    meetings = []
    submissions = []
    for conf in Conference.find():
        # For efficiency, we filter by tag first, then node
        # instead of doing a single Node query
        projects = set()
        for tag in Tag.find(Q('_id', 'iexact', conf.endpoint)):
            for node in tag.node__tagged:
                if not node:
                    continue
                if not node.is_public or node.is_deleted:
                    continue
                projects.add(node)
        for idx, node in enumerate(projects):
            submissions.append(_render_conference_node(node, idx, conf))
        num_submissions = len(projects)
        if num_submissions < settings.CONFERENCE_MIN_COUNT:
            continue
        meetings.append({
            'name': conf.name,
            'active': conf.active,
            'url': web_url_for('conference_results', meeting=conf.endpoint),
            'count': num_submissions,
        })
    submissions.sort(key=lambda submission: submission['dateCreated'], reverse=True)
    meetings.sort(key=lambda meeting: meeting['count'], reverse=True)
    return {'meetings': meetings, 'submissions': submissions}
def conference_submissions(**kwargs):
    """Return data for all OSF4M submissions.

    The total number of submissions for each meeting is calculated and cached
    in the Conference.num_submissions field.
    """
    submissions = []
    for conf in Conference.find():
        # For efficiency, we filter by tag first, then node
        # instead of doing a single Node query
        projects = set()
        for tag in Tag.find(Q('lower', 'eq', conf.endpoint.lower())):
            for node in tag.node__tagged.find(Q('is_public', 'eq', True) & Q('is_deleted', 'eq', False)):
                projects.add(node)
        for idx, node in enumerate(projects):
            submissions.append(_render_conference_node(node, idx, conf))
        num_submissions = len(projects)
        # Cache the number of submissions
        conf.num_submissions = num_submissions
        conf.save()
        if num_submissions < settings.CONFERENCE_MIN_COUNT:
            continue
    submissions.sort(key=lambda submission: submission['dateCreated'], reverse=True)
    return {'submissions': submissions}
def conference_submissions(**kwargs):
    """Return data for all OSF4M submissions.

    The total number of submissions for each meeting is calculated and cached
    in the Conference.num_submissions field.
    """
    submissions = []
    # TODO: Revisit this loop, there has to be a way to optimize it
    for conf in Conference.find():
        if (hasattr(conf, 'is_meeting') and (conf.is_meeting is False)):
            break
        # For efficiency, we filter by tag first, then node
        # instead of doing a single Node query
        projects = set()
        tags = Tag.find(Q('lower', 'eq', conf.endpoint.lower())).get_keys()
        nodes = Node.find(
            Q('tags', 'in', tags) &
            Q('is_public', 'eq', True) &
            Q('is_deleted', 'ne', True))
        projects.update(list(nodes))
        for idx, node in enumerate(projects):
            submissions.append(_render_conference_node(node, idx, conf))
        num_submissions = len(projects)
        # Cache the number of submissions
        conf.num_submissions = num_submissions
        conf.save()
        if num_submissions < settings.CONFERENCE_MIN_COUNT:
            continue
    submissions.sort(key=lambda submission: submission['dateCreated'], reverse=True)
    return {'submissions': submissions}
def remove_tag(self, tag, auth, save=True, log=True):
    from website.models import Tag, NodeLog  # Prevent import error
    tag = Tag.load(tag)
    if tag and tag in self.tags and not self.node.is_registration:
        self.tags.remove(tag)
        if log:
            self.add_tag_log(NodeLog.FILE_TAG_REMOVED, tag._id, auth)
        if save:
            self.save()
        return True
    return False
def verify_tags(node, modm_node):
    modm_tag_keys = [x for x in sorted(set(modm_node.tags._to_primary_keys())) if MODMTag.load(x)]
    django_tag_keys = sorted(set(node.tags.filter(system=False).values_list('_id', flat=True)))
    modm_system_tag_keys = sorted(set(modm_node.system_tags))
    django_system_tag_keys = sorted(set(node.system_tags.values_list('_id', flat=True)))
    assert modm_tag_keys == django_tag_keys, 'Modm tags {} don\'t match django tags {} in node {}:{}'.format(
        modm_tag_keys, django_tag_keys, modm_node._id, node._guid.guid)
    assert modm_system_tag_keys == django_system_tag_keys, 'Modm system tag keys {} don\'t match django system tags {}'.format(
        modm_system_tag_keys, django_system_tag_keys)
def remove_tag(self, tag, auth, save=True, log=True):
    from website.models import Tag, NodeLog  # Prevent import error
    if self.node.is_registration:
        # Can't perform edits on a registration
        raise NodeStateError
    tag = Tag.load(tag)
    if not tag:
        raise InvalidTagError
    elif tag not in self.tags:
        raise TagNotFoundError
    else:
        self.tags.remove(tag)
        if log:
            self.add_tag_log(NodeLog.FILE_TAG_REMOVED, tag._id, auth)
        if save:
            self.save()
        return True
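# Hedged sketch (assumption, not the real definitions): minimal stand-ins for the exception
# types raised by remove_tag() above, so the control flow is easier to follow in isolation.
class NodeStateError(Exception):
    """Raised when an edit is attempted on a registration."""

class InvalidTagError(Exception):
    """Raised when the tag id cannot be loaded at all."""

class TagNotFoundError(Exception):
    """Raised when the tag exists but is not attached to this file."""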
def parse(self):
    # Pick a random unprocessed post and fetch its page
    raw_post = random.choice(Post.objects.filter(is_raw=True))
    page = self.get_html(raw_post.original_url)
    soup = BeautifulSoup(page, "html.parser")
    content = str(soup.find("div", class_="entry-content"))
    if content == "None" or not content:
        # No entry content on the page: discard the raw post
        raw_post.delete()
        return
    raw_post.html = content
    # Category tags as (slug, label) pairs taken from the category links
    results = soup.findAll("a", {"rel": "category tag"})
    tags = [(r['href'].split('/')[-2:-1][0], r.contents[0]) for r in results]
    images = soup.findAll("img")
    raw_post.first_image = images[1]['src']  # images[0] is skipped; use the second <img>
    raw_post.first_text = str(soup.findAll("p")[0].contents[0])
    raw_post.title = self.reg_find(r"<h1 class=\"entry-title\">(.+?)</h1>", page)[0]
    # Drop any already-parsed post with the same title
    same_post = Post.objects.filter(title=raw_post.title, is_raw=False).first()
    if same_post:
        same_post.delete()
    post_date = self.reg_find(r">.*(\d\d\.\d\d\.\d\d\d\d)</time>", page)[0]
    post_date = post_date.replace('.', '')
    format_str = '%d%m%Y'  # e.g. '31122015' once the dots are stripped
    raw_post.post_date = datetime.datetime.strptime(post_date, format_str)
    # Reuse an existing Author or create a new one
    author = self.reg_find(r"class=\"url fn n\">(.+?)</a>", page)[0]
    a = Author.objects.filter(name=author)
    if a:
        a = a[0]
    else:
        a = Author(name=author)
        a.save()
    raw_post.author = a
    # Create any tags that don't exist yet, then attach the full set to the post
    existed_tags = Tag.objects.filter(name__in=[t[0] for t in tags])
    for t in tags:
        if t[0] not in [et.name for et in existed_tags]:
            new_tag = Tag(name=t[0], value=t[1])
            new_tag.save()
    post_tags = Tag.objects.filter(name__in=[t[0] for t in tags])
    raw_post.is_raw = False
    raw_post.save()
    raw_post.tags.set(post_tags)
    print("post %s saved" % str(raw_post))
def do_migration():
    for t in Tag.find():
        logger.info('Migrating tag {!r}'.format(t))
        t.lower = t._id.lower()
        t.save(force=True)
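# Hedged follow-up sketch (assumption, not from the source): a quick spot check that
# do_migration() left every tag with a lowercased `lower` value matching its _id.
def verify_lower_backfill():
    for t in Tag.find():
        assert t.lower == t._id.lower(), 'Tag {!r} has lower={!r}'.format(t._id, t.lower)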