def main(config):
    """Load a small sample data set (users, posts, comments, tags) into the database.

    The ``config`` argument is resolved through ``get_config``; the result is
    used both for ``teardown`` and as the path of the JSON document that is
    read here.  The database name is taken from the first entry of that
    document: its ``'database'`` key, falling back to its ``'index'`` key.

    Each group of rows is added inside its own ``subtransactions(session)``
    block, parents first, then the association rows linking them.
    """
    config = get_config(config)
    teardown(drop_db=False, config=config)

    # Read the document inside a context manager so the file handle is
    # closed deterministically instead of being left to the garbage collector.
    with open(config) as config_file:
        document = json.load(config_file)

    engine = pg_engine(
        database=document[0].get('database', document[0]['index']))
    Session = sessionmaker(bind=engine, autoflush=True)
    session = Session()

    # Bootstrap
    users = {
        'Carla Ferreira Cardoso': User(
            name='Carla Ferreira Cardoso', age=19, gender='female'),
        'Uwe Fuerst': User(name='Uwe Fuerst', age=58, gender='male'),
        'Otitodilinna Chigolum': User(
            name='Otitodilinna Chigolum', age=36, gender='male'),
    }
    with subtransactions(session):
        session.add_all(users.values())

    posts = {
        'Post1': Post(slug='post_1', title='This is the first post'),
        'Post2': Post(slug='post_2', title='This is the second post'),
        'Post3': Post(slug='post_3', title='This is the third post'),
    }
    with subtransactions(session):
        session.add_all(posts.values())

    comments = {
        'Comment1': Comment(
            title='Comment 1',
            content='This is a sample comment for comment 1',
        ),
        'Comment2': Comment(
            title='Comment 2',
            content='This is a sample comment for comment 2',
        ),
        'Comment3': Comment(
            title='Comment 3',
            content='This is a sample comment for comment 3',
        ),
        'Comment4': Comment(
            title='Comment 4',
            content='This is a sample comment for comment 4',
        ),
        'Comment5': Comment(
            title='Comment 5',
            content='This is a sample comment for comment 5',
        ),
        'Comment6': Comment(
            title='Comment 6',
            content='This is a sample comment for comment 6',
        ),
    }
    with subtransactions(session):
        session.add_all(comments.values())

    tags = {
        'Economics': Tag(name='Economics'),
        'Career': Tag(name='Career'),
        'Political': Tag(name='Political'),
        'Fitness': Tag(name='Fitness'),
        'Entertainment': Tag(name='Entertainment'),
        'Education': Tag(name='Education'),
        'Technology': Tag(name='Technology'),
        'Health': Tag(name='Health'),
        'Fashion': Tag(name='Fashion'),
        'Design': Tag(name='Design'),
        'Photography': Tag(name='Photography'),
        'Lifestyle': Tag(name='Lifestyle'),
    }
    with subtransactions(session):
        session.add_all(tags.values())

    # One authored post per user.
    user_posts = [
        UserPost(
            user=users['Carla Ferreira Cardoso'],
            post=posts['Post1'],
        ),
        UserPost(
            user=users['Uwe Fuerst'],
            post=posts['Post2'],
        ),
        UserPost(
            user=users['Otitodilinna Chigolum'],
            post=posts['Post3'],
        ),
    ]
    with subtransactions(session):
        session.add_all(user_posts)

    user_tags = [
        UserTag(
            user=users['Carla Ferreira Cardoso'],
            tag=tags['Economics'],
        ),
        UserTag(
            user=users['Carla Ferreira Cardoso'],
            tag=tags['Career'],
        ),
        UserTag(
            user=users['Carla Ferreira Cardoso'],
            tag=tags['Political'],
        ),
        UserTag(
            user=users['Carla Ferreira Cardoso'],
            tag=tags['Lifestyle'],
        ),
        UserTag(
            user=users['Carla Ferreira Cardoso'],
            tag=tags['Health'],
        ),
        UserTag(
            user=users['Uwe Fuerst'],
            tag=tags['Education'],
        ),
        UserTag(
            user=users['Uwe Fuerst'],
            tag=tags['Lifestyle'],
        ),
        UserTag(
            user=users['Otitodilinna Chigolum'],
            tag=tags['Fashion'],
        ),
    ]
    with subtransactions(session):
        session.add_all(user_tags)

    # Two comments attached to each post.
    post_comments = [
        PostComment(
            post=posts['Post1'],
            comment=comments['Comment1'],
        ),
        PostComment(
            post=posts['Post1'],
            comment=comments['Comment2'],
        ),
        PostComment(
            post=posts['Post2'],
            comment=comments['Comment3'],
        ),
        PostComment(
            post=posts['Post2'],
            comment=comments['Comment4'],
        ),
        PostComment(
            post=posts['Post3'],
            comment=comments['Comment5'],
        ),
        PostComment(
            post=posts['Post3'],
            comment=comments['Comment6'],
        ),
    ]
    with subtransactions(session):
        session.add_all(post_comments)
def parse_one(id):
    """Download one feed and synchronise its entries with the database.

    The feed row is looked up by primary key, its URL is fetched (10 second
    timeout) and parsed with ``feedparser``.  Entries carrying an ``id`` are
    matched against stored posts through ``Post.entry_id``: unknown entries
    become new ``Post`` rows, known entries are rewritten only when their
    ``updated_parsed`` timestamp differs from the stored one.  Finally the
    feed-level timestamp is stored on ``feed.updated`` and the session is
    committed.

    Returns ``None`` in every case.  The function returns early, without
    committing, when the feed row does not exist, when the request times
    out, or when the feed-level timestamp equals the stored ``feed.updated``.
    The session is always closed, including on early return or error.

    :param id: primary key of the ``Feed`` row to refresh.
    """
    db = session()
    try:
        logger.debug('Query feed %s', id)
        feed = db.query(Feed).get(id)
        if feed is None:
            # Nothing to refresh; avoids an AttributeError on ``feed.url``.
            logger.warning('Feed %s not found', id)
            return
        logger.debug('Parse feed %s: %s', id, feed.url)

        try:
            # NOTE(review): verify=False disables TLS certificate checks.
            # Kept as-is so feeds with broken certificates still load, but
            # the fetched content is not authenticated - confirm intent.
            resp = requests.get(feed.url, timeout=10, verify=False)
        except requests.exceptions.Timeout:
            return

        parser = feedparser.parse(resp.content)
        logger.debug('Got %s posts', len(parser.entries))

        # Feed-level timestamp.  It has its own name so the per-entry
        # timestamps computed in the loop below cannot overwrite it before
        # it is stored on the feed.
        feed_updated = (parser.feed.get('updated_parsed', None)
                        or parser.feed.get('date_parsed', None))
        if feed_updated:
            feed_updated = dt.fromtimestamp(mktime(feed_updated))
        if feed_updated and feed.updated == feed_updated:
            logger.debug('Feed %s already up-to-date. Exit', feed.title)
            return  # already up to date

        logger.debug('Query already fetched posts')
        fetched_ids = tuple(
            entry.id for entry in parser.entries if hasattr(entry, 'id'))
        query = db.query(Post).filter(Post.feed_id == feed.id)
        known = query.filter(Post.entry_id.in_(fetched_ids)).all()
        logger.debug('%s posts are already in the database', len(known))
        posts = {stored.entry_id: stored for stored in known}

        for entry in parser.entries:
            if not hasattr(entry, 'id'):
                logger.debug('Post has no attribute id %s', entry)
                continue

            published = (entry.get('published_parsed', None)
                         or entry.get('date_parsed', None))
            # Entries without any publication date are stamped with "now".
            pubdate = (dt.fromtimestamp(mktime(published))
                       if published else dt.utcnow())

            entry_updated = entry.get('updated_parsed', None)
            if entry_updated:
                entry_updated = dt.fromtimestamp(mktime(entry_updated))

            p = posts.get(entry.id)
            if p is not None:
                # Known entry: skip unless it carries a new update stamp.
                if not entry_updated or p.updated == entry_updated:
                    continue
                logger.debug('Updating post %s', entry.title)
            else:
                logger.debug('Creating new post %s', entry.title)
                p = Post()
                feed.posts.append(p)

            # New and changed posts alike are (re)marked as unread.
            p.read = False
            p.entry_id = entry.id
            p.title = entry.title[:400]
            p.link = entry.link
            p.author = parser.feed.get('author', 'None')[:80]
            p.published = pubdate
            p.updated = entry_updated
            p.summary = entry.get('summary', '')

        feed.updated = feed_updated
        db.commit()
    finally:
        # Release the session on every exit path, not only after a commit.
        db.close()
def parse_post(content):
    """Extract a post, its comments and all involved users from an HTML page.

    :param content: HTML markup of a single post page.
    :returns: a ``(post, users, comments)`` tuple.  ``users`` starts with the
        post author followed by one ``User`` per comment, in document order;
        duplicates are not removed.  ``comments`` holds one ``Comment`` per
        ``entry-comment-wrapper`` element, with ``parent_id`` set to the
        enclosing comment's id, or ``None`` for top-level comments.

    An ``IndexError`` propagates when an expected element is missing, and an
    ``AttributeError`` when an id or URL does not match its pattern.
    """
    def first(node, path):
        # First XPath match relative to ``node``; IndexError when absent.
        return node.xpath(path)[0]

    def comment_id_of(wrapper):
        # Numeric part of an element id of the form ``comment-<digits>``.
        return int(re.match(r'comment-(\d+)$', wrapper.get('id')).group(1))

    def user_id_of(href):
        # Numeric tail of a profile link ending in ``/user/<digits>``.
        return int(re.search(r'/user/(\d+)$', href).group(1))

    post = Post()
    users = []
    comments = []

    # Note: this code needs patched lxml with support for huge_tree in HTMLParser
    html_parser = lxml.etree.HTMLParser(recover=True, huge_tree=True)
    tree = lxml.etree.HTML(content, parser=html_parser)

    # post
    entry = first(tree, '//li[@class="hentry"]')
    comments_root = first(entry, './/div[@class="entry-comments"]')
    author_block = first(entry, './/p[@class="author"]')

    permalink = first(entry, './/a[@class="entry-title"]').get('href')
    post.post_id = int(re.search(r'/(\d+)$', permalink).group(1))

    list_dom_id = first(comments_root, 'ul').get("id")
    post.comment_list_id = int(
        re.match(r'comments_(\d+)$', list_dom_id).group(1))

    post.language = first(entry, './/a[@rel="chapter"]').text
    post.code = first(entry, 'div[@class="entry-content"]/pre/code').text
    post.text = inner_html(first(entry, 'p[@class="description"]'))
    post.posted = parse_date(first(author_block, 'abbr').get('title'))
    post.vote_plus, post.vote_minus, post.rating = parse_rating(
        first(entry, 'p[@class="vote"]/strong'))

    # author info
    author = User()
    author.user_id = user_id_of(first(author_block, 'a[1]').get('href'))
    author.name = first(author_block, 'a[2]').text
    author.avatar_hash = parse_avatar(
        first(author_block, 'a[1]/img').get('src'))
    post.user_id = author.user_id
    users.append(author)

    # comments
    for wrapper in comments_root.xpath(
            './/div[@class="entry-comment-wrapper"]'):
        comment = Comment()
        comment.comment_id = comment_id_of(wrapper)
        comment.post_id = post.post_id

        # A reply sits three levels below the <li> of the comment it answers.
        ancestor = wrapper.getparent().getparent().getparent()
        if ancestor.tag != 'li':
            comment.parent_id = None
        else:
            parent_wrapper = first(
                ancestor, 'div[@class="entry-comment-wrapper"]')
            comment.parent_id = comment_id_of(parent_wrapper)

        comment.text = inner_html(
            first(wrapper, './/span[@class="comment-text"]'))

        info = first(wrapper, 'p[@class="entry-info"]')
        comment.posted = parse_date(
            first(info, 'abbr[@class="published"]').get('title'))
        comment.vote_plus, comment.vote_minus, comment.rating = parse_rating(
            first(info, 'span[@class="comment-vote"]/strong'))

        author_link = first(info, 'strong[@class="entry-author"]/a')
        commenter = User()
        commenter.user_id = user_id_of(author_link.get('href'))
        commenter.name = author_link.text
        commenter.avatar_hash = parse_avatar(
            first(info, 'img[@class="avatar"]').get('src'))
        comment.user_id = commenter.user_id

        users.append(commenter)
        comments.append(comment)

    return post, users, comments
def main(config):
    """Populate the database with sample users, posts, comments and tags.

    ``config`` is first resolved through ``get_config``; the resolved value
    is handed to ``teardown`` and is also opened as a JSON file.  The first
    entry of that JSON document supplies the database name (``"database"``
    key, falling back to the ``"index"`` key).

    Rows are added in separate ``subtransactions(session)`` blocks: the
    base entities first, followed by the rows that associate users with
    posts, users with tags and posts with comments.
    """
    config = get_config(config)
    teardown(drop_db=False, config=config)

    # Use a context manager so the JSON file is closed as soon as it has
    # been parsed; a bare open() call would leave the handle dangling.
    with open(config) as schema_file:
        documents = json.load(schema_file)

    engine = pg_engine(
        database=documents[0].get("database", documents[0]["index"]))
    Session = sessionmaker(bind=engine, autoflush=True)
    session = Session()

    # Bootstrap
    users = {
        "Carla Ferreira Cardoso": User(
            name="Carla Ferreira Cardoso", age=19, gender="female"),
        "Uwe Fuerst": User(name="Uwe Fuerst", age=58, gender="male"),
        "Otitodilinna Chigolum": User(
            name="Otitodilinna Chigolum", age=36, gender="male"),
    }
    with subtransactions(session):
        session.add_all(users.values())

    posts = {
        "Post 1": Post(slug="post_1", title="This is the first post"),
        "Post 2": Post(slug="post_2", title="This is the second post"),
        "Post 3": Post(slug="post_3", title="This is the third post"),
    }
    with subtransactions(session):
        session.add_all(posts.values())

    comments = {
        "Comment 1": Comment(
            title="Comment 1",
            content="This is a sample comment for comment 1",
        ),
        "Comment 2": Comment(
            title="Comment 2",
            content="This is a sample comment for comment 2",
        ),
        "Comment 3": Comment(
            title="Comment 3",
            content="This is a sample comment for comment 3",
        ),
        "Comment 4": Comment(
            title="Comment 4",
            content="This is a sample comment for comment 4",
        ),
        "Comment 5": Comment(
            title="Comment 5",
            content="This is a sample comment for comment 5",
        ),
        "Comment 6": Comment(
            title="Comment 6",
            content="This is a sample comment for comment 6",
        ),
    }
    with subtransactions(session):
        session.add_all(comments.values())

    tags = {
        "Economics": Tag(name="Economics"),
        "Career": Tag(name="Career"),
        "Political": Tag(name="Political"),
        "Fitness": Tag(name="Fitness"),
        "Entertainment": Tag(name="Entertainment"),
        "Education": Tag(name="Education"),
        "Technology": Tag(name="Technology"),
        "Health": Tag(name="Health"),
        "Fashion": Tag(name="Fashion"),
        "Design": Tag(name="Design"),
        "Photography": Tag(name="Photography"),
        "Lifestyle": Tag(name="Lifestyle"),
    }
    with subtransactions(session):
        session.add_all(tags.values())

    # Each user authors exactly one post.
    user_posts = [
        UserPost(
            user=users["Carla Ferreira Cardoso"],
            post=posts["Post 1"],
        ),
        UserPost(
            user=users["Uwe Fuerst"],
            post=posts["Post 2"],
        ),
        UserPost(
            user=users["Otitodilinna Chigolum"],
            post=posts["Post 3"],
        ),
    ]
    with subtransactions(session):
        session.add_all(user_posts)

    user_tags = [
        UserTag(
            user=users["Carla Ferreira Cardoso"],
            tag=tags["Economics"],
        ),
        UserTag(
            user=users["Carla Ferreira Cardoso"],
            tag=tags["Career"],
        ),
        UserTag(
            user=users["Carla Ferreira Cardoso"],
            tag=tags["Political"],
        ),
        UserTag(
            user=users["Carla Ferreira Cardoso"],
            tag=tags["Lifestyle"],
        ),
        UserTag(
            user=users["Carla Ferreira Cardoso"],
            tag=tags["Health"],
        ),
        UserTag(
            user=users["Uwe Fuerst"],
            tag=tags["Education"],
        ),
        UserTag(
            user=users["Uwe Fuerst"],
            tag=tags["Lifestyle"],
        ),
        UserTag(
            user=users["Otitodilinna Chigolum"],
            tag=tags["Fashion"],
        ),
    ]
    with subtransactions(session):
        session.add_all(user_tags)

    # Every post receives two of the six comments.
    post_comments = [
        PostComment(
            post=posts["Post 1"],
            comment=comments["Comment 1"],
        ),
        PostComment(
            post=posts["Post 1"],
            comment=comments["Comment 2"],
        ),
        PostComment(
            post=posts["Post 2"],
            comment=comments["Comment 3"],
        ),
        PostComment(
            post=posts["Post 2"],
            comment=comments["Comment 4"],
        ),
        PostComment(
            post=posts["Post 3"],
            comment=comments["Comment 5"],
        ),
        PostComment(
            post=posts["Post 3"],
            comment=comments["Comment 6"],
        ),
    ]
    with subtransactions(session):
        session.add_all(post_comments)
def post(post_payload: Post):
    """Store a post in the module-level ``posts`` list and echo it back.

    A payload whose ``id`` is falsy (``None``, ``0``, empty) is assigned
    ``len(posts) + 1`` before it is appended.

    :param post_payload: the post to store; its ``id`` may be mutated.
    :returns: a dict with the stored payload under the ``"data"`` key.
    """
    has_id = bool(post_payload.id)
    if not has_id:
        # Derive the id from the number of posts stored so far.
        post_payload.id = 1 + len(posts)

    posts.append(post_payload)

    response = {"data": post_payload}
    return response