def migrate_posts(self, source, fname):
        from biostar.server.models import disconnect_all
        from biostar.apps.posts.models import Post, Subscription
        from biostar.apps.messages.models import Message
        from biostar.apps.util import html

        log = self.stdout.write

        # Disconnect signals; they would generate far too many messages.
        disconnect_all()

        posts = set(Post.objects.all().values_list("id", flat=True))

        users = dict((u.id, u) for u in User.objects.all())

        log("migrating posts from %s" % fname)
        stream = csv.DictReader(file(fname), delimiter=b'\t')

        for i, row in enumerate(stream):
            title = to_unicode(row['title'])
            uid = int(row['id'])
            url = row['url'].strip()

            # Skip existing posts
            if uid in posts:
                continue

            posts.add(uid)

            log("migrating post %s: %s" % (uid, title))
            post = get_post(row, users, klass=Post)

            if not post:
                log("skipped %s: %s" % (uid, title))
                continue

            # Read and add the post body.
            post_file = path_join(source, 'posts', str(post.id))
            post.content = file(post_file, 'rt').read()

            if url and post.type == Post.BLOG:
                # Break out early; blog posts are not migrated into Biostar.
                continue
                # (unreachable) Link to external blog posts.
                url_link = '<p><b><i class="fa fa-external-link-square"></i> Read full blogpost at <a href="%s">%s</a></b><p>' % (url, url[:45])
                url_link = to_unicode(url_link)
                content = to_unicode(post.content)
                post.content = url_link + content

            try:
                post.save()
            except Exception as exc:
                log('*** error inserting post %s' % post.id)
                log("*** %s" % exc)
                continue

            # TODO migrate only tags with high count
            post.add_tags(post.tag_val)
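
The helpers to_unicode and get_post are imported from elsewhere in these migration scripts and are not shown here. A minimal sketch of to_unicode, assuming Python 2 byte strings encoded as UTF-8:

def to_unicode(obj, encoding='utf-8'):
    # Decode byte strings; pass everything else through unchanged.
    if isinstance(obj, str):
        return obj.decode(encoding, 'replace')
    return obj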
Example #2
    def migrate_posts(self, source, fname):
        from biostar.server.models import disconnect_all
        from biostar.apps.posts.models import Post, Subscription
        from biostar.apps.messages.models import Message

        log = self.stdout.write

        # Disconnect signals; they would generate far too many messages.
        disconnect_all()

        Post.objects.all().delete()

        users = dict((u.id, u) for u in User.objects.all())

        log("migrating posts from %s" % fname)
        stream = csv.DictReader(file(fname), delimiter=b'\t')

        for i, row in enumerate(stream):
            title = to_unicode(row['title'])
            uid = row['id']
            log("migrating %s: %s" % (uid, title))
            post = get_post(row, users, klass=Post)

            if not post:
                continue

            # Read and add the post body.
            post_file = path_join(source, 'posts', str(post.id))
            post.content = file(post_file, 'rt').read()

            try:
                post.save()
            except Exception as exc:
                log('*** error inserting post %s' % post.id)
                log("*** %s" % exc)
                continue

            # TODO migrate only tags with high count
            post.add_tags(post.tag_val)
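
get_post is defined elsewhere in this command; judging from its call sites it maps one TSV row onto an unsaved Post and returns None when the row cannot be migrated. A hypothetical outline (column names other than 'id' and 'title' are guesses, not the real export schema):

def get_post(row, users, klass):
    # Hypothetical: resolve the author from the preloaded user map.
    author = users.get(int(row.get('author_id', 0) or 0))
    if not author:
        # Unknown author: tell the caller to skip this row.
        return None
    return klass(id=int(row['id']), title=to_unicode(row['title']),
                 author=author, tag_val=row.get('tags', ''))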
Example #3
def parse_mboxx(filename, limit=None, tag_val=''):
    from biostar.server.models import disconnect_all
    from biostar.apps.users.models import User
    from biostar.apps.posts.models import Post

    global SKIPPED_REPLY

    #users = User.objects.all().delete()
    users = User.objects.all()
    users = dict([(u.email, u) for u in users])

    #Post.objects.all().delete()

    logger.info("*** found %s users" % len(users))

    if limit is not None:
        limit = int(limit)

    # Disconnect signals
    disconnect_all()

    logger.info("*** parsing mbox %s" % filename)

    new_name = fix_file(filename)

    # Parse the modified mbox.
    mbox = mailbox.mbox(new_name)
    rows = imap(unpack_message, mbox)

    # Remove empty elements
    rows = ifilter(None, rows)
    # Keep only emails that have both a sender and a subject.
    rows = ifilter(lambda b: b.email, rows)
    rows = ifilter(lambda b: b.subj, rows)

    # Apply limits if necessary.
    rows = islice(rows, limit)

    tree, posts, fallback = {}, {}, {}

    for b in rows:
        datefmt = b.date.strftime('%Y-%m-%d')
        logger.info("*** %s parsing %s " % (datefmt, b.subj))

        if b.email not in users:

            logger.info("--- creating user name:%s, email:%s" %
                        (b.name, b.email))
            u = User(email=b.email, name=b.name)
            if not DRY_RUN:
                u.save()
                u.profile.date_joined = b.date
                u.profile.last_login = b.date
                u.profile.save()

            users[u.email] = u

        author = users[b.email]

        parent = posts.get(b.reply_to) or fallback.get(b.subj)

        # Looks like a reply but still has no parent;
        # fall back to fuzzy matching.
        if not parent and b.subj.startswith("Re:"):
            curr_key = b.subj
            logger.info("searching for best match %s" % curr_key)
            cands = difflib.get_close_matches(curr_key, fallback.keys())
            if cands:
                logger.info("found %s" % cands)
                parent = fallback[cands[0]]

        if parent:
            root = parent.root
            post = create_post(b=b, author=author, parent=parent)
        else:
            post = create_post(b=b, author=author, tag_val=tag_val)

        posts[b.id] = post

        # Fall back to guessing post inheritance from the title
        fall_key = "Re: %s" % post.title
        fallback[fall_key] = post

    logger.info("*** users %s" % len(users))
    logger.info("*** posts %s" % len(posts))
    logger.info("*** post limit: %s" % limit)
    logger.info("*** skipped posts due to size: %s" % SKIPPED_SIZE)
    logger.info("*** skipped posts due to missing parent: %s" % SKIPPED_REPLY)

    if DRY_RUN:
        logger.info("*** dry run, no data saved")
        sys.exit()

    logger.info("*** updating user scores")
    for user in User.objects.all():
        score = Post.objects.filter(author=user).count()
        user.score = user.full_score = score
        user.save()
        latest = Post.objects.filter(
            author=user).order_by("-creation_date")[:1]
        if latest:
            user.profile.last_login = latest[0].creation_date
            user.profile.save()
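
unpack_message is not shown in this excerpt. From its use above it must return an object exposing name, email, subj, date, reply_to and id attributes, or a false value for messages that should be dropped. A hypothetical minimal version on top of the standard email API, assuming Python 2; the real helper presumably also extracts the body and enforces the size limit counted by SKIPPED_SIZE:

from datetime import datetime
from email.utils import parseaddr, parsedate

class Bunch(object):
    # Simple attribute container.
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

def unpack_message(msg):
    name, addr = parseaddr(msg.get('From', ''))
    parts = parsedate(msg.get('Date', ''))
    # Drop messages without a usable sender or date.
    if not (addr and parts):
        return None
    return Bunch(name=name or addr,
                 email=addr.lower(),
                 subj=msg.get('Subject', ''),
                 date=datetime(*parts[:6]),
                 id=msg.get('Message-ID', ''),
                 reply_to=msg.get('In-Reply-To', ''))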
Example #4
def parse_mboxx(filename, limit=None, tag_val=''):
    from biostar.server.models import disconnect_all
    from biostar.apps.users.models import User
    from biostar.apps.posts.models import Post

    global SKIPPED_REPLY

    #users = User.objects.all().delete()
    users = User.objects.all()
    users = dict([(u.email, u) for u in users])

    #Post.objects.all().delete()

    logger.info("*** found %s users" % len(users))

    if limit is not None:
        limit = int(limit)

    # Disconnect signals
    disconnect_all()

    logger.info("*** parsing mbox %s" % filename)

    new_name = fix_file(filename)

    # Parse the modified mbox.
    mbox = mailbox.mbox(new_name)
    rows = imap(unpack_message, mbox)

    # Remove empty elements
    rows = ifilter(None, rows)
    # Keep only emails that have both a sender and a subject.
    rows = ifilter(lambda b: b.email, rows)
    rows = ifilter(lambda b: b.subj, rows)

    # Apply limits if necessary.
    rows = islice(rows, limit)

    tree, posts, fallback = {}, {}, {}

    # Root posts seen so far, keyed by subject title.
    roots = {}

    for b in rows:
        datefmt = b.date.strftime('%Y-%m-%d')
        logger.info("*** %s parsing %s " % (datefmt, b.subj))

        if b.email not in users:

            logger.info("--- creating user name:%s, email:%s" % (b.name, b.email))
            u = User(email=b.email, name=b.name)
            if not DRY_RUN:
                u.save()
                u.profile.date_joined = b.date
                u.profile.last_login = b.date
                u.profile.save()

            users[u.email] = u

        author = users[b.email]

        parent = posts.get(b.reply_to) or fallback.get(b.subj)

        # Looks like a reply but still has no parent;
        # fall back to fuzzy matching.
        if not parent and b.subj.lower().startswith("re:"):
            curr_key = b.subj
            logger.info("searching for best match %s" % curr_key)
            cands = difflib.get_close_matches(curr_key, fallback.keys())
            if cands:
                logger.info("found %s" % cands)
                parent = fallback[cands[0]]

        # Some mailers do not prepend "Re:" to replies; this heuristic covers that case.
        if not parent and b.subj in roots:
            # try a candidate
            cand = roots[b.subj]
            delta = b.date - cand.creation_date
            if delta < timedelta(weeks=5):
                parent = cand

        if parent:
            root = parent.root
            post = create_post(b=b, author=author, parent=parent)
        else:
            post = create_post(b=b, author=author, tag_val=tag_val)

        posts[b.id] = post

        # keep track of posts that could be parents
        if not parent:
            roots[b.subj] = post

        # Fall back to guessing post inheritance from the title
        fall_key = "Re: %s" % post.title
        fallback[fall_key] = post

    logger.info("*** users %s" % len(users))
    logger.info("*** posts %s" % len(posts))
    logger.info("*** post limit: %s" % limit)
    logger.info("*** skipped posts due to size: %s" % SKIPPED_SIZE)
    logger.info("*** skipped posts due to missing parent: %s" % SKIPPED_REPLY)

    if DRY_RUN:
        logger.info("*** dry run, no data saved")
        sys.exit()

    logger.info("*** updating user scores")
    for user in User.objects.all():
        score = Post.objects.filter(author=user).count()
        user.score = user.full_score = score
        user.save()
        latest = Post.objects.filter(author=user).order_by("-creation_date")[:1]
        if latest:
            user.profile.last_login = latest[0].creation_date
            user.profile.save()
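
A standalone illustration of the two parent-matching heuristics above, using made-up subjects: difflib.get_close_matches tolerates the small edits mail clients make to reply subjects, and the five-week window bounds how long a bare (un-prefixed) subject may still claim an earlier root as its parent.

import difflib
from datetime import datetime, timedelta

fallback = ["Re: Mapping reads with bwa", "Re: SNP calling question"]
print(difflib.get_close_matches("Re: mapping reads with BWA", fallback))
# -> ['Re: Mapping reads with bwa']

root_date = datetime(2011, 3, 1)
reply_date = datetime(2011, 3, 20)
print(reply_date - root_date < timedelta(weeks=5))
# -> True: recent enough to attach as a reply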
Example #5
    def migrate_posts(self, source, fname):
        from biostar.server.models import disconnect_all
        from biostar.apps.posts.models import Post, Subscription
        from biostar.apps.messages.models import Message
        from biostar.apps.util import html

        log = self.stdout.write

        # Disconnect signals; they would generate far too many messages.
        disconnect_all()

        posts = set(Post.objects.all().values_list("id", flat=True))

        users = dict((u.id, u) for u in User.objects.all())

        log("migrating posts from %s" % fname)
        stream = csv.DictReader(file(fname), delimiter=b'\t')

        for i, row in enumerate(stream):
            title = to_unicode(row['title'])
            uid = int(row['id'])
            url = row['url'].strip()

            # Skip existing posts
            if uid in posts:
                continue

            posts.add(uid)

            log("migrating post %s: %s" % (uid, title))
            post = get_post(row, users, klass=Post)

            if not post:
                log("skipped %s: %s" % (uid, title))
                continue

            # Read and add the post body.
            post_file = path_join(source, 'posts', str(post.id))
            post.content = file(post_file, 'rt').read()

            if url and post.type == Post.BLOG:
                # Break out early; blog posts are not migrated into Biostar.
                continue
                # (unreachable) Link to external blog posts.
                url_link = '<p><b><i class="fa fa-external-link-square"></i> Read full blogpost at <a href="%s">%s</a></b><p>' % (
                    url, url[:45])
                url_link = to_unicode(url_link)
                content = to_unicode(post.content)
                post.content = url_link + content

            try:
                post.save()
            except Exception as exc:
                log('*** error inserting post %s' % post.id)
                log("*** %s" % exc)
                continue

            # TODO migrate only tags with high count
            post.add_tags(post.tag_val)

        log("migrated %s posts" % Post.objects.all().count())
        log("created %s subscriptions" % Subscription.objects.all().count())
        log("created %s messages" % Message.objects.all().count())