Esempio n. 1
0
def update(limit):
    """Import new feed entries as blog posts for every registered blog.

    For each ``models.Blog`` the feed file returned by ``get_datapath(blog)``
    is parsed with ``feedparser``. Entries whose ``title(entry)`` is not yet
    stored for the blog's author are collected, the first ``limit`` of them
    are kept, and each one that has a title is saved as a ``POST_BLOG`` post.

    A ``KeyError`` raised while processing one blog is reported and the loop
    moves on to the next blog.
    """
    for blog in models.Blog.objects.all():
        # titles already stored for this author; entries matching these are
        # skipped
        stored = models.Post.objects.filter(author=blog.author)
        seen = set(p.title for p in stored)
        fname = get_datapath(blog)
        try:
            print('*** reading %s, %s' % (blog.id, blog.url))
            now = datetime.datetime.now()
            models.UserProfile.objects.filter(user=blog.author).update(
                last_visited=now)
            doc = feedparser.parse(fname)

            # Record every accepted title in `seen` so that two entries
            # sharing a title inside the same feed are only inserted once.
            fresh = []
            for entry in doc.entries:
                key = title(entry)
                if key in seen:
                    continue
                seen.add(key)
                fresh.append(entry)

            for r in fresh[:limit]:
                if not r.title:
                    continue
                # only the calendar date (year, month, day) is retained
                parsed = r.date_parsed
                date = datetime.datetime(parsed[0], parsed[1], parsed[2])
                content = html.strip_tags(r.description)
                post = models.Post(title=title(r),
                                   url=r.link,
                                   author=blog.author,
                                   type=POST_BLOG,
                                   content=content,
                                   creation_date=date)
                post.save()
                print('*** added post %s' % post.title.encode(
                    "ascii", errors='replace'))
        except KeyError as exc:
            print('(!) error %s' % exc)
Esempio n. 2
0
def add_files(fnames, uid, ptype):
    """Create one post of type ``ptype`` per file in ``fnames``.

    Every file is run through ``parse`` to obtain its title, tag string and
    body; the resulting post is authored by the user whose id is ``uid``.
    """
    owner = models.User.objects.get(id=uid)

    for path in fnames:
        heading, tags, text = parse(path)
        print('*** adding %s' % heading)
        fields = dict(title=heading,
                      author=owner,
                      type=ptype,
                      tag_val=tags,
                      content=text)
        entry = models.Post(**fields)
        entry.save()
        # tags are attached only after the post has been saved
        entry.set_tags()
Esempio n. 3
0
def create_post(b, author, post_type, root=None, parent=None):
    """Build, save and return a post made from the message object ``b``.

    ``b.subj`` is used as the title (stored on the post only for questions),
    ``b.body`` as the content and ``b.datetime`` as both the creation and the
    last-edit date. Every post is tagged "galaxy".
    """
    title = b.subj
    fields = dict(type=post_type,
                  content=b.body,
                  tag_val="galaxy",
                  author=author,
                  root=root,
                  parent=parent)

    # only questions carry a title
    if post_type == const.POST_QUESTION:
        fields['title'] = title

    post = models.Post(**fields)
    post.save()

    # The dates are assigned after the first save.
    # NOTE(review): presumably set_tags() persists them -- confirm.
    stamp = b.datetime
    post.creation_date = stamp
    post.lastedit_date = stamp
    post.set_tags()

    print("creating %s: %s" % (post.get_type_display(), title))
    return post
Esempio n. 4
0
def add_files(fnames, uid, ptype, delete=False, sticky=False):
    """Create one post of type ``ptype`` per file in ``fnames``.

    Every file is run through ``parse`` to obtain its title, tag string and
    body; the resulting post is authored by the user with primary key ``uid``.

    fnames -- iterable of file names to import
    uid    -- primary key of the authoring user
    ptype  -- post type stored on every created post
    delete -- when true, each file is removed after its post was saved
    sticky -- value stored in the ``sticky`` field of every created post
    """
    user = models.User.objects.get(pk=uid)

    for fname in fnames:
        title, tag_val, body = parse(fname)
        print('*** adding %s' % title)
        post = models.Post(title=title,
                           author=user,
                           type=ptype,
                           tag_val=tag_val,
                           content=body,
                           sticky=sticky)
        post.save()
        post.set_tags()
        if delete:
            # the os module has no delete(); os.remove() is the call that
            # unlinks a file
            os.remove(fname)
Esempio n. 5
0
def insert_comments(fname, posts, users, limit):
    """Create comment posts from the rows read out of ``fname``.

    Only rows whose 'UserId' and 'PostId' pass ``checkfunc`` against ``users``
    and ``posts`` are used. Each comment becomes a ``models.Post`` of type
    ``const.POST_COMMENT`` attached to ``posts[PostId]``; it is saved only
    when ``USE_DB`` is set. Returns a dict mapping the comment id of the dump
    to the created post.

    NOTE: ``limit`` is accepted but not used by this function.
    """
    gc.collect()

    # keep the valid rows only
    records = xml_reader(fname)
    records = filter(checkfunc('UserId', users), records)
    records = filter(checkfunc('PostId', posts), records)

    # first pass: gather the constructor arguments of every comment
    pending = []
    for record in records:
        comment_id = record['Id']
        body = record['Text']
        post_id = record['PostId']
        writer = users[record['UserId']]
        created = parse_time(record['CreationDate'])
        kind = const.POST_COMMENT
        fields = dict(author=writer,
                      creation_date=created,
                      content=body,
                      type=kind)
        pending.append((post_id, comment_id, fields))

    print("*** inserting %s comments" % len(pending))

    # second pass: build the posts and, when enabled, store them
    comms = {}
    with transaction.commit_on_success():
        for index, (post_id, comment_id, fields) in enumerate(pending):
            target = posts[post_id]
            fields['parent'] = target
            # a comment shares the root of the post it is attached to
            fields['root'] = target.root or target
            comment = models.Post(**fields)
            comms[comment_id] = comment
            if not USE_DB:
                continue
            # commit in batches of 1000 rows
            if index % 1000 == 0:
                print("*** commit at %s" % index)
                transaction.commit()
                gc.collect()
            comment.save()

    return comms
Esempio n. 6
0
def insert_posts(fname, limit, users):
    """Insert the posts found in ``fname`` and return them keyed by dump id.

    fname -- file handed to ``xml_reader``
    limit -- forwarded to ``xml_reader`` as its ``limit`` argument
    users -- mapping from the dump's user ids to author objects; rows whose
             'OwnerUserId' fails ``checkfunc`` against it are dropped

    Posts are saved (and tagged) only when ``USE_DB`` is set. In that mode an
    answer whose parent has not been inserted is skipped. The mapping from
    dump id to ``post.id`` is also written to 'post-remap.txt'.
    """
    gc.collect()

    # read all the posts
    rows = xml_reader(fname, limit=limit)

    # keep only posts with a valid user
    rows = filter(checkfunc('OwnerUserId', users), rows)

    plist = []  # collect post attributes
    parents = dict()  # maps answer ids to the id of their question

    # connects the post type in the SE dump to the models in BioStar
    PMAP = {'1': const.POST_QUESTION, '2': const.POST_ANSWER}

    for row in rows:
        postid = row['Id']
        views = row['ViewCount']
        creation_date = parse_time(row['CreationDate'])
        author = users[row['OwnerUserId']]
        parentid = row.get('ParentId')
        title = row.get('Title', '')
        tag_string = row.get('Tags', '')
        if tag_string:
            tag_string = parse_tag_string(tag_string)

        ptypeid = row['PostTypeId']
        post_type = PMAP.get(ptypeid, const.POST_OTHER)

        # remember which question every answer belongs to; the constant is
        # taken from `const`, the same namespace PMAP uses above
        if post_type == const.POST_ANSWER and parentid:
            parents[postid] = parentid

        ppair = (postid,
                 dict(author=author,
                      views=views,
                      creation_date=creation_date,
                      type=post_type,
                      title=title,
                      tag_val=tag_string))
        plist.append(ppair)

    posts = {}
    print("*** inserting %s posts" % len(plist))

    with transaction.commit_on_success():
        for i, (postid, p) in enumerate(plist):
            post = models.Post(**p)
            parentid = parents.get(postid)
            if USE_DB:
                # commit in batches of 1000 rows
                if i % 1000 == 0:
                    print("*** commit at %s" % i)
                    transaction.commit()
                # start from zero; per the original note the answer count
                # gets updated by a signal
                post.answer_count = 0
                if parentid:
                    parent = posts.get(parentid)
                    if not parent:
                        # the question was not inserted, drop the answer
                        continue
                    post.parent = post.root = parent
                post.save()
                post.set_tags()
            posts[postid] = post

    # the context manager closes the file even if a write fails
    with open('post-remap.txt', 'wt') as fp:
        for postid, post in sorted(posts.items()):
            fp.write('%s\t%s\n' % (postid, post.id))

    return posts