def migrate_posts(self, source, fname):
    from biostar.server.models import disconnect_all
    from biostar.apps.posts.models import Post, Subscription
    from biostar.apps.messages.models import Message
    from biostar.apps.util import html

    log = self.stdout.write

    # Disconnect signals; they will generate way too many messages.
    disconnect_all()

    posts = [p[0] for p in Post.objects.all().values_list("id")]
    posts = set(posts)

    users = dict((u.id, u) for u in User.objects.all())

    log("migrating posts from %s" % fname)
    stream = csv.DictReader(file(fname), delimiter=b'\t')

    for i, row in enumerate(stream):
        title = to_unicode(row['title'])
        uid = int(row['id'])
        url = row['url'].strip()

        # Skip existing posts.
        if uid in posts:
            continue

        posts.add(uid)

        log("migrating post %s: %s" % (uid, title))
        post = get_post(row, users, klass=Post)

        if not post:
            log("skipped %s: %s" % (uid, title))
            continue

        # Read and add the post body.
        post_file = path_join(source, 'posts', str(post.id))
        post.content = file(post_file, 'rt').read()

        if url and post.type == Post.BLOG:
            # Will break out and not deal with blogs in Biostar.
            continue
            # Link to external blog posts.
            url_link = '<p><b><i class="fa fa-external-link-square"></i> Read full blogpost at <a href="%s">%s</a></b><p>' % (url, url[:45])
            url_link = to_unicode(url_link)
            content = to_unicode(post.content)
            post.content = url_link + content

        try:
            post.save()
        except Exception, exc:
            log('*** error inserting post %s' % post.id)
            log("*** %s" % exc)
            continue

        # TODO: migrate only tags with high count.
        post.add_tags(post.tag_val)
def migrate_posts(self, source, fname):
    from biostar.server.models import disconnect_all
    from biostar.apps.posts.models import Post, Subscription
    from biostar.apps.messages.models import Message

    log = self.stdout.write

    # Disconnect signals they will generate way too many messages
    disconnect_all()

    Post.objects.all().delete()

    users = dict((u.id, u) for u in User.objects.all())

    log("migrating posts from %s" % fname)
    stream = csv.DictReader(file(fname), delimiter=b'\t')

    for i, row in enumerate(stream):
        title = to_unicode(row['title'])
        uid = row['id']

        log("migrating %s: %s" % (uid, title))
        post = get_post(row, users, klass=Post)

        if not post:
            continue

        # Read and add the post body.
        post_file = path_join(source, 'posts', str(post.id))
        post.content = file(post_file, 'rt').read()

        try:
            post.save()
        except Exception, exc:
            log('*** error inserting post %s' % post.id)
            log("*** %s" % exc)
            continue

        # TODO migrate only tags with high count
        post.add_tags(post.tag_val)
def parse_mboxx(filename, limit=None, tag_val=''):
    from biostar.server.models import disconnect_all
    from biostar.apps.users.models import User
    from biostar.apps.posts.models import Post

    global SKIPPED_REPLY

    #users = User.objects.all().delete()
    users = User.objects.all()
    users = dict([(u.email, u) for u in users])
    #Post.objects.all().delete()

    logger.info("*** found %s users" % len(users))

    if limit is not None:
        limit = int(limit)

    # Disconnect signals
    disconnect_all()

    logger.info("*** parsing mbox %s" % filename)

    new_name = fix_file(filename)

    # Parse the modified mbox.
    mbox = mailbox.mbox(new_name)
    rows = imap(unpack_message, mbox)

    # Remove empty elements
    rows = ifilter(None, rows)

    # Keep only email with sender and subject.
    rows = ifilter(lambda b: b.email, rows)
    rows = ifilter(lambda b: b.subj, rows)

    # Apply limits if necessary.
    rows = islice(rows, limit)

    tree, posts, fallback = {}, {}, {}

    for b in rows:
        datefmt = b.date.strftime('%Y-%m-%d')
        logger.info("*** %s parsing %s " % (datefmt, b.subj))

        if b.email not in users:
            logger.info("--- creating user name:%s, email:%s" % (b.name, b.email))
            u = User(email=b.email, name=b.name)
            if not DRY_RUN:
                u.save()
                u.profile.date_joined = b.date
                u.profile.last_login = b.date
                u.profile.save()
            users[u.email] = u

        author = users[b.email]

        parent = posts.get(b.reply_to) or fallback.get(b.subj)

        # Looks like a reply but still no parent
        # Fuzzy matching to commence
        if not parent and b.subj.startswith("Re:"):
            curr_key = b.subj
            logger.info("searching for best match %s" % curr_key)
            cands = difflib.get_close_matches(curr_key, fallback.keys())
            if cands:
                logger.info("found %s" % cands)
                parent = fallback[cands[0]]

        if parent:
            root = parent.root
            post = create_post(b=b, author=author, parent=parent)
        else:
            post = create_post(b=b, author=author, tag_val=tag_val)

        posts[b.id] = post

        # Fall back to guessing post inheritance from the title
        fall_key = "Re: %s" % post.title
        fallback[fall_key] = post

    logger.info("*** users %s" % len(users))
    logger.info("*** posts %s" % len(posts))
    logger.info("*** post limit: %s" % limit)
    logger.info("*** skipped posts due to size: %s" % SKIPPED_SIZE)
    logger.info("*** skipped posts due to missing parent: %s" % SKIPPED_REPLY)

    if DRY_RUN:
        logger.info("*** dry run, no data saved")
        sys.exit()

    logger.info("*** updating user scores")
    for user in User.objects.all():
        score = Post.objects.filter(author=user).count()
        user.score = user.full_score = score
        user.save()

        latest = Post.objects.filter(author=user).order_by("-creation_date")[:1]
        if latest:
            user.profile.last_login = latest[0].creation_date
            user.profile.save()
def parse_mboxx(filename, limit=None, tag_val=''):
    from biostar.server.models import disconnect_all
    from biostar.apps.users.models import User
    from biostar.apps.posts.models import Post

    global SKIPPED_REPLY

    #users = User.objects.all().delete()
    users = User.objects.all()
    users = dict([(u.email, u) for u in users])
    #Post.objects.all().delete()

    logger.info("*** found %s users" % len(users))

    if limit is not None:
        limit = int(limit)

    # Disconnect signals
    disconnect_all()

    logger.info("*** parsing mbox %s" % filename)

    new_name = fix_file(filename)

    # Parse the modified mbox.
    mbox = mailbox.mbox(new_name)
    rows = imap(unpack_message, mbox)

    # Remove empty elements
    rows = ifilter(None, rows)

    # Keep only email with sender and subject.
    rows = ifilter(lambda b: b.email, rows)
    rows = ifilter(lambda b: b.subj, rows)

    # Apply limits if necessary.
    rows = islice(rows, limit)

    tree, posts, fallback = {}, {}, {}

    # Titles that have been seen in the past.
    roots = {}

    for b in rows:
        datefmt = b.date.strftime('%Y-%m-%d')
        logger.info("*** %s parsing %s " % (datefmt, b.subj))

        if b.email not in users:
            logger.info("--- creating user name:%s, email:%s" % (b.name, b.email))
            u = User(email=b.email, name=b.name)
            if not DRY_RUN:
                u.save()
                u.profile.date_joined = b.date
                u.profile.last_login = b.date
                u.profile.save()
            users[u.email] = u

        author = users[b.email]

        parent = posts.get(b.reply_to) or fallback.get(b.subj)

        # Looks like a reply but still no parent.
        # Fuzzy matching to commence (subject is lowercased, so match "re:").
        if not parent and b.subj.lower().startswith("re:"):
            curr_key = b.subj
            logger.info("searching for best match %s" % curr_key)
            cands = difflib.get_close_matches(curr_key, fallback.keys())
            if cands:
                logger.info("found %s" % cands)
                parent = fallback[cands[0]]

        # Some emailers do not append Re: to replies; this is a heuristic.
        if not parent and b.subj in roots:
            # Try a candidate.
            cand = roots[b.subj]
            delta = b.date - cand.creation_date
            if delta < timedelta(weeks=5):
                parent = cand

        if parent:
            root = parent.root
            post = create_post(b=b, author=author, parent=parent)
        else:
            post = create_post(b=b, author=author, tag_val=tag_val)

        posts[b.id] = post

        # Keep track of posts that could be parents.
        if not parent:
            roots[b.subj] = post

        # Fall back to guessing post inheritance from the title.
        fall_key = "Re: %s" % post.title
        fallback[fall_key] = post

    logger.info("*** users %s" % len(users))
    logger.info("*** posts %s" % len(posts))
    logger.info("*** post limit: %s" % limit)
    logger.info("*** skipped posts due to size: %s" % SKIPPED_SIZE)
    logger.info("*** skipped posts due to missing parent: %s" % SKIPPED_REPLY)

    if DRY_RUN:
        logger.info("*** dry run, no data saved")
        sys.exit()

    logger.info("*** updating user scores")
    for user in User.objects.all():
        score = Post.objects.filter(author=user).count()
        user.score = user.full_score = score
        user.save()

        latest = Post.objects.filter(author=user).order_by("-creation_date")[:1]
        if latest:
            user.profile.last_login = latest[0].creation_date
            user.profile.save()
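# Illustrative aside, not part of the importer above: the fuzzy-matching step
# uses difflib.get_close_matches(word, possibilities, n=3, cutoff=0.6) from the
# standard library, which ranks candidates by SequenceMatcher similarity and
# drops anything below the cutoff. A minimal runnable sketch; the subject lines
# and the known_threads name below are invented for the example.
import difflib

known_threads = [
    "Re: bwa mem fails on long reads",
    "Re: converting SAM to BAM",
    "Re: velvet assembly parameters",
]

# A reply whose subject was slightly mangled by the mail client still maps
# back to the right thread key.
print(difflib.get_close_matches("RE: bwa mem fails on long read", known_threads))
# -> ['Re: bwa mem fails on long reads']

# An unrelated subject scores below the default 0.6 cutoff and yields an empty
# list, so such a message would become a new top-level post instead.
print(difflib.get_close_matches("Job opening in our lab", known_threads))
# -> []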
def migrate_posts(self, source, fname):
    from biostar.server.models import disconnect_all
    from biostar.apps.posts.models import Post, Subscription
    from biostar.apps.messages.models import Message
    from biostar.apps.util import html

    log = self.stdout.write

    # Disconnect signals; they will generate way too many messages.
    disconnect_all()

    posts = [p[0] for p in Post.objects.all().values_list("id")]
    posts = set(posts)

    users = dict((u.id, u) for u in User.objects.all())

    log("migrating posts from %s" % fname)
    stream = csv.DictReader(file(fname), delimiter=b'\t')

    for i, row in enumerate(stream):
        title = to_unicode(row['title'])
        uid = int(row['id'])
        url = row['url'].strip()

        # Skip existing posts.
        if uid in posts:
            continue

        posts.add(uid)

        log("migrating post %s: %s" % (uid, title))
        post = get_post(row, users, klass=Post)

        if not post:
            log("skipped %s: %s" % (uid, title))
            continue

        # Read and add the post body.
        post_file = path_join(source, 'posts', str(post.id))
        post.content = file(post_file, 'rt').read()

        if url and post.type == Post.BLOG:
            # Will break out and not deal with blogs in Biostar.
            continue
            # Link to external blog posts.
            url_link = '<p><b><i class="fa fa-external-link-square"></i> Read full blogpost at <a href="%s">%s</a></b><p>' % (url, url[:45])
            url_link = to_unicode(url_link)
            content = to_unicode(post.content)
            post.content = url_link + content

        try:
            post.save()
        except Exception as exc:
            log('*** error inserting post %s' % post.id)
            log("*** %s" % exc)
            continue

        # TODO: migrate only tags with high count.
        post.add_tags(post.tag_val)

    log("migrated %s posts" % Post.objects.all().count())
    log("created %s subscriptions" % Subscription.objects.all().count())
    log("created %s messages" % Message.objects.all().count())