def update(limit):
    """Import new feed entries of every blog as blog posts.

    For each ``models.Blog`` the feed file returned by ``get_datapath`` is
    parsed, entries whose title is already used by one of the blog author's
    posts are dropped, and at most ``limit`` of the remaining entries are
    saved as ``POST_BLOG`` posts. A ``KeyError`` raised while handling a
    blog is printed and the loop moves on to the next blog.
    """
    for blog in models.Blog.objects.all():
        # Titles already posted by this author; used to skip known entries.
        author_posts = models.Post.objects.filter(author=blog.author)
        known_titles = set(p.title for p in author_posts)
        path = get_datapath(blog)
        try:
            print('*** reading %s, %s' % (blog.id, blog.url))

            # Record the time of this visit on the author's profile.
            stamp = datetime.datetime.now()
            profiles = models.UserProfile.objects.filter(user=blog.author)
            profiles.update(last_visited=stamp)

            feed = feedparser.parse(path)
            fresh = [e for e in feed.entries if title(e) not in known_titles]
            for entry in fresh[:limit]:
                if not entry.title:
                    continue

                # Only the year, month and day of the parsed date are kept.
                parsed = entry.date_parsed
                created = datetime.datetime(parsed[0], parsed[1], parsed[2])
                text = html.strip_tags(entry.description)

                post = models.Post(
                    title=title(entry),
                    url=entry.link,
                    author=blog.author,
                    type=POST_BLOG,
                    content=text,
                    creation_date=created,
                )
                post.save()
                print('*** added post %s' % post.title.encode("ascii", errors='replace'))
        except KeyError as exc:
            print('(!) error %s' % exc)
def add_files(fnames, uid, ptype):
    """Create one post of type ``ptype`` for each file in ``fnames``.

    Every file is run through ``parse`` to obtain a title, a tag value and
    a body; the resulting post is authored by the user with id ``uid``,
    saved, and then has ``set_tags()`` called on it.
    """
    owner = models.User.objects.get(id=uid)
    for path in fnames:
        heading, tags, text = parse(path)
        print('*** adding %s' % heading)
        entry = models.Post(
            title=heading,
            author=owner,
            type=ptype,
            tag_val=tags,
            content=text,
        )
        entry.save()
        entry.set_tags()
def create_post(b, author, post_type, root=None, parent=None):
    """Build, save and return a post made from the message object ``b``.

    The post body comes from ``b.body`` and the tag value is always
    "galaxy". Only posts of type ``const.POST_QUESTION`` receive a title
    (``b.subj``); every other type is created without one.
    """
    title = b.subj
    fields = dict(
        type=post_type,
        content=b.body,
        tag_val="galaxy",
        author=author,
        root=root,
        parent=parent,
    )
    if post_type == const.POST_QUESTION:
        fields['title'] = title

    post = models.Post(**fields)
    post.save()

    # NOTE(review): the dates are assigned after save(); whether they reach
    # the database depends on set_tags() saving the post - confirm.
    stamp = b.datetime
    post.creation_date = stamp
    post.lastedit_date = stamp
    post.set_tags()

    print("creating %s: %s" % (post.get_type_display(), title))
    return post
def add_files(fnames, uid, ptype, delete=False, sticky=False):
    """Create one post of type ``ptype`` for each file in ``fnames``.

    Each file is run through ``parse`` to obtain a title, a tag value and
    a body. The post is authored by the user whose primary key is ``uid``,
    saved, and then has ``set_tags()`` called on it.

    fnames -- iterable of file names to import
    uid    -- primary key of the authoring user
    ptype  -- post type stored on every created post
    delete -- when true, remove each file after its post has been saved
    sticky -- value stored in the ``sticky`` field of every created post
    """
    user = models.User.objects.get(pk=uid)
    for fname in fnames:
        title, tag_val, body = parse(fname)
        print('*** adding %s' % title)
        post = models.Post(
            title=title,
            author=user,
            type=ptype,
            tag_val=tag_val,
            content=body,
            sticky=sticky,
        )
        post.save()
        post.set_tags()
        if delete:
            # The os module has no ``delete`` function (it raised
            # AttributeError); ``os.remove`` is the call that removes a file.
            os.remove(fname)
def insert_comments(fname, posts, users, limit):
    """Create comment posts from the rows of the XML dump ``fname``.

    A row is kept only if it passes both ``checkfunc('UserId', users)`` and
    ``checkfunc('PostId', posts)``. Each kept row becomes a
    ``const.POST_COMMENT`` post whose parent is ``posts[PostId]`` and whose
    root is that parent's root (or the parent itself when it has none).
    Posts are saved only when ``USE_DB`` is true.

    Returns a dict mapping each row's ``Id`` to the post built for it.
    ``limit`` is accepted but not used.
    """
    gc.collect()

    # keep the valid rows only
    valid = xml_reader(fname)
    for field, lookup in (('UserId', users), ('PostId', posts)):
        valid = filter(checkfunc(field, lookup), valid)

    # First pass: collect the constructor arguments of every comment.
    pending = []
    for row in valid:
        cid = row['Id']
        text = row['Text']
        postid = row['PostId']
        fields = dict(
            author=users[row['UserId']],
            creation_date=parse_time(row['CreationDate']),
            content=text,
            type=const.POST_COMMENT,
        )
        pending.append((postid, cid, fields))

    print("*** inserting %s comments" % len(pending))

    # Second pass: build the posts and store them if the database is in use.
    comms = {}
    with transaction.commit_on_success():
        for index, (postid, cid, fields) in enumerate(pending):
            parent = posts[postid]
            fields['parent'] = parent
            fields['root'] = parent.root or parent
            comment = models.Post(**fields)
            comms[cid] = comment
            if not USE_DB:
                continue
            # Commit and collect garbage once every 1000 comments.
            if index % 1000 == 0:
                print("*** commit at %s" % index)
                transaction.commit()
                gc.collect()
            comment.save()
    return comms
def insert_posts(fname, limit, users):
    """Insert the posts found in the XML dump ``fname``.

    Rows are read with ``xml_reader(fname, limit=limit)`` and kept only if
    they pass ``checkfunc('OwnerUserId', users)``. Dump type '1' maps to
    ``const.POST_QUESTION``, '2' to ``const.POST_ANSWER`` and anything else
    to ``const.POST_OTHER``. When ``USE_DB`` is true each post is saved and
    tagged; an answer whose question is not among the posts created so far
    is skipped entirely.

    After insertion the mapping "dump id <TAB> database id" is written to
    ``post-remap.txt``, one line per post, sorted by dump id.

    Returns a dict mapping dump post ids to the created posts.
    """
    gc.collect()

    # read all the posts
    rows = xml_reader(fname, limit=limit)

    # keep only posts with a valid user
    rows = filter(checkfunc('OwnerUserId', users), rows)

    plist = []  # (dump post id, Post keyword arguments) pairs
    parents = dict()  # maps an answer's dump id to its question's dump id

    # connects the post type in the SE dump to the models in BioStar
    PMAP = {'1': const.POST_QUESTION, '2': const.POST_ANSWER}

    # first collect the attributes of all posts
    for row in rows:
        postid = row['Id']
        views = row['ViewCount']
        creation_date = parse_time(row['CreationDate'])
        author = users[row['OwnerUserId']]
        parentid = row.get('ParentId')
        title = row.get('Title', '')
        tag_string = row.get('Tags', '')
        if tag_string:
            tag_string = parse_tag_string(tag_string)
        ptypeid = row['PostTypeId']
        post_type = PMAP.get(ptypeid, const.POST_OTHER)

        # remember which question every answer belongs to
        if post_type == const.POST_ANSWER and parentid:
            parents[postid] = parentid

        fields = dict(
            author=author,
            views=views,
            creation_date=creation_date,
            type=post_type,
            title=title,
            tag_val=tag_string,
        )
        plist.append((postid, fields))

    posts = {}
    print("*** inserting %s posts" % len(plist))
    with transaction.commit_on_success():
        for i, (postid, p) in enumerate(plist):
            post = models.Post(**p)
            parentid = parents.get(postid)
            if USE_DB:
                # commit once every 1000 posts
                if i % 1000 == 0:
                    print("*** commit at %s" % i)
                    transaction.commit()

                # NOTE(review): the original author's note says the answer
                # count "gets triggered as a signal", so it starts at zero
                # here - confirm against the model's signal handlers.
                post.answer_count = 0

                if parentid:
                    parent = posts.get(parentid)
                    if not parent:
                        # the question was never inserted: drop the answer
                        continue
                    post.parent = post.root = parent

                post.save()
                post.set_tags()

            posts[postid] = post

    # The context manager closes the file even when a write fails.
    with open('post-remap.txt', 'wt') as fp:
        for postid, post in sorted(posts.items()):
            fp.write('%s\t%s\n' % (postid, post.id))

    return posts