def set_up_database():
    """
    Set up access to the database for a new run.

    Opens a connection with snac.connect_snac_db(), registers a new run by
    calling init_run(conn), and returns a dbinfo dictionary with two keys:

        'main_conn' -- the connection that was opened here
        'run_id'    -- the value returned by init_run()

    On success the connection is returned open; this function does not
    close it. If init_run() raises, the connection is closed before the
    exception propagates so a failed start-up does not leak it.
    """
    # Connect to snac database.
    conn = snac.connect_snac_db()

    # Add new run to the database. Nobody else holds a reference to the
    # connection yet, so release it here if the run cannot be created.
    try:
        run_id = init_run(conn)
    except BaseException:
        conn.close()
        raise

    # Create dbinfo dictionary to hold database access details.
    return {'main_conn': conn, 'run_id': run_id}
def set_up_database():
    """
    Set up access to the database for a new run.

    Opens a connection with snac.connect_snac_db(), registers a new run by
    calling init_run(conn), and returns a dbinfo dictionary with two keys:

        'conn'   -- the connection that was opened here
        'run_id' -- the value returned by init_run()

    On success the connection is returned open; this function does not
    close it. If init_run() raises, the connection is closed before the
    exception propagates so a failed start-up does not leak it.
    """
    # Connect to snac database.
    conn = snac.connect_snac_db()

    # Add new run to the database; close the connection ourselves on
    # failure because it would otherwise never be handed to a caller.
    try:
        run_id = init_run(conn)
    except BaseException:
        conn.close()
        raise

    # Create dbinfo dictionary to hold database access details.
    return {'conn': conn, 'run_id': run_id}
def get_ranked_blogs(dbinfo, limit=0):
    """
    Get list of blogs from database in ranked order, optionally up to a
    specified limit.

    dbinfo -- accepted for interface compatibility; not used by this
              function, which opens its own connection.
    limit  -- maximum number of rows to request. Values <= 0 run the
              query without a LIMIT clause.

    Returns whatever db.execute_fetchone() returns for the query over
    blog_rank_latest (columns blog_id, link, rank, ordered by rank).

    The connection opened here is always closed before returning, also
    when the query raises.
    """
    sql_prefix = ('select blog_id, link, rank from blog_rank_latest '
                  'order by rank')

    conn = snac.connect_snac_db()
    try:
        # NOTE(review): the helper is called execute_fetchone, which
        # suggests a single row, while the docstring promises a list of
        # blogs -- confirm whether a fetch-all variant is intended.
        if limit > 0:
            # The limit is passed as a query parameter, never interpolated.
            return db.execute_fetchone(conn, sql_prefix + ' limit %s;',
                                       (limit, ))
        return db.execute_fetchone(conn, sql_prefix + ';')
    finally:
        conn.close()
def set_up_database():
    """
    Set up access to the database for a new run.

    Opens a connection with snac.connect_snac_db(), registers the run via
    init_run() and returns the run id it produced. The connection is only
    needed for the registration, so it is closed before returning,
    whether or not init_run() succeeded.
    """
    connection = snac.connect_snac_db()
    try:
        # The return value is computed first; the finally clause then
        # closes the connection before control leaves the function.
        return init_run(connection)
    finally:
        connection.close()
def get_ranked_blogs(dbinfo, limit = 0):
    """
    Get list of blogs from database in ranked order, optionally up to a
    specified limit.

    dbinfo -- accepted for interface compatibility; not used by this
              function, which opens its own connection.
    limit  -- maximum number of rows to request. Values <= 0 run the
              query without a LIMIT clause.

    Returns whatever db.execute_fetchone() returns for the query over
    blog_rank_latest (columns blog_id, link, rank, ordered by rank).

    The connection opened here is closed in all cases, so repeated calls
    do not accumulate open connections.
    """
    conn = snac.connect_snac_db()
    try:
        sql_prefix = ('select blog_id, link, rank from blog_rank_latest '
                      'order by rank')
        # NOTE(review): execute_fetchone presumably yields one row only,
        # which would not match "list of blogs" -- verify against the db
        # helper module.
        if limit > 0:
            # Bind the limit as a parameter rather than formatting it in.
            blogs = db.execute_fetchone(
                conn,
                sql_prefix + ' limit %s;',
                (limit, )
            )
        else:
            blogs = db.execute_fetchone(
                conn,
                sql_prefix + ';'
            )
    finally:
        conn.close()
    return blogs
def end_run(run_id):
    """
    Store run's ending timestamp in the database.

    Sets end_time to CURRENT_TIMESTAMP on the blog_post_run row whose
    blog_post_run_id equals run_id, commits, and closes the connection
    that was opened for the update.
    """
    statement = ('update blog_post_run '
                 'set end_time = CURRENT_TIMESTAMP '
                 'where blog_post_run_id = %s;')

    connection = snac.connect_snac_db()
    try:
        cursor = connection.cursor()
        # run_id is bound as a query parameter.
        cursor.execute(statement, (run_id, ))
        cursor.close()
        connection.commit()
    finally:
        # Runs on success and on failure alike.
        connection.close()
def end_run(dbinfo):
    """
    Store run's ending timestamp in the database.

    dbinfo -- mapping that must contain the key 'run_id'.

    Sets end_time to CURRENT_TIMESTAMP on the blog_post_run row whose
    blog_post_run_id equals dbinfo['run_id'], commits, and closes the
    connection that was opened for the update.
    """
    statement = ('update blog_post_run '
                 'set end_time = CURRENT_TIMESTAMP '
                 'where blog_post_run_id = %s;')

    connection = snac.connect_snac_db()
    try:
        cursor = connection.cursor()
        # The run id is looked up here, inside the try, so the connection
        # is still closed if the key is missing.
        parameters = (dbinfo['run_id'], )
        cursor.execute(statement, parameters)
        cursor.close()
        connection.commit()
    finally:
        connection.close()
def run(self):
    """
    Process the blog held in self.blog: fetch its page, store its blog
    roll, then fetch and store the posts of its first RSS feed.

    Reads self.blog (unpacked as blog_id, blog_link, blog_rank) and
    self.dbinfo. A connection is opened for the duration of the call and
    closed in the finally clause. The method returns early, after logging
    at DEBUG level, when the page cannot be retrieved or no RSS feed is
    found. The numbered print() calls are progress markers written to
    standard output between the individual steps.
    """
    db_conn = snac.connect_snac_db()
    print('CONNECTED ' + str(self.blog))
    try:
        print(timestamp() + 'Blog: ' + str(self.blog))
        blog_id, blog_link, _blog_rank = self.blog

        # Fetch the blog's front page.
        print('00')
        page_html = retrieve_page(self.dbinfo, blog_link)
        print('01')
        if page_html is None:
            log(self.dbinfo, 'DEBUG', 'Page not retrieved')
            return
        print('02')
        parsed_page = BeautifulSoup(page_html, "lxml")

        # Blog roll: extract from the parsed page and store.
        print('03')
        roll = retrieve_blog_roll(self.dbinfo, parsed_page)
        print('04')
        store_blog_roll(self.dbinfo, db_conn, blog_id, roll)

        # RSS feeds: only the first feed found is followed.
        print('05')
        feeds = retrieve_rss_feeds(self.dbinfo, parsed_page)
        print('06')
        feed_count = len(feeds)
        if feed_count == 0:
            log(self.dbinfo, 'DEBUG', 'No RSS feeds found')
            return
        print('07')
        first_feed_url = feeds[0][1]
        print('08')
        feed_url = compose_rss_url(blog_link, first_feed_url)
        print('09')
        log(self.dbinfo, 'DEBUG', 'Retrieving posts: ' + feed_url)
        print('10')
        feed = retrieve_posts(feed_url)
        print('11')
        entries = feed['entries']
        print('12')
        store_blog_posts(self.dbinfo, db_conn, blog_id, entries)
        print('13')
    finally:
        print('14')
        db_conn.close()
def get_blog_post_run():
    """
    Return the blog_post_run row with the highest blog_post_run_id.

    The row holds blog_post_run_id, start_time and end_time, in that
    order; the result is whatever cursor.fetchone() yields for the query.
    The connection opened for the query is closed before returning.
    """
    query = (' select blog_post_run_id, start_time, end_time '
             'from blog_post_run '
             'order by blog_post_run_id desc '
             'limit 1; ')

    connection = snac.connect_snac_db()
    try:
        cursor = connection.cursor()
        cursor.execute(query)
        latest_run = cursor.fetchone()
        cursor.close()
        connection.commit()
    finally:
        connection.close()
    return latest_run
def retrieve_site(run_id, blog_id, blog_link, blog_rank):
    """
    Retrieve one blog's page, store its blog roll, then fetch and store
    the posts of its first RSS feed.

    run_id    -- passed through to store_blog_roll / store_blog_posts.
    blog_id   -- passed through to store_blog_roll / store_blog_posts.
    blog_link -- URL handed to retrieve_page() and compose_rss_url().
    blog_rank -- only used in the progress output.

    Returns None. The function returns early, after logging at DEBUG
    level, when the page cannot be retrieved or no RSS feed is found.

    Errors raised while processing the blog are printed as a traceback
    and not re-raised, so a failure on one blog does not propagate to the
    caller. Only Exception subclasses are handled that way:
    KeyboardInterrupt and SystemExit still propagate. A failure to
    connect happens before the try block and propagates as well. The
    connection is closed in every case.

    The numbered print() calls are progress markers on standard output.
    """
    blog = (blog_id, blog_link, blog_rank)
    print('CONNECTING ' + str(blog))
    conn = snac.connect_snac_db()
    print('CONNECTED ' + str(blog))
    try:
        print(timestamp() + 'Blog: ' + str(blog))
        print('00')
        html = retrieve_page(blog_link)
        print('01')
        if html is None:
            log('DEBUG', 'Page not retrieved')
            return
        print('02')
        soup = BeautifulSoup(html, "lxml")

        # Get blog roll
        print('03')
        blog_roll = retrieve_blog_roll(soup)
        print('04')
        store_blog_roll(run_id, conn, blog_id, blog_roll)

        # Get RSS feeds
        print('05')
        rss_feeds = retrieve_rss_feeds(soup)
        print('06')
        if not rss_feeds:
            log('DEBUG', 'No RSS feeds found')
            return
        print('07')
        # Only the first feed is followed.
        main_rss_url = rss_feeds[0][1]
        print('08')
        composed_rss_url = compose_rss_url(blog_link, main_rss_url)
        print('09')
        log('DEBUG', 'Retrieving posts: ' + composed_rss_url)
        print('10')
        rss = retrieve_posts(composed_rss_url)
        print('11')
        rss_entries = rss['entries']
        print(blog_link + ' - ' + str(len(rss_entries)) + ' rss entries')
        print('12')
        store_blog_posts(run_id, conn, blog_id, rss_entries)
    except Exception:
        # Best effort per blog: report the failure and carry on. A bare
        # "except:" would also swallow KeyboardInterrupt / SystemExit.
        print(traceback.format_exc())
    finally:
        conn.close()
def retrieve_site(run_id, blog_id, blog_link, blog_rank):
    """
    Retrieve one blog's page, store its blog roll, then fetch and store
    the posts of its first RSS feed.

    run_id    -- passed through to store_blog_roll / store_blog_posts.
    blog_id   -- passed through to store_blog_roll / store_blog_posts.
    blog_link -- URL handed to retrieve_page() and compose_rss_url().
    blog_rank -- only used in the progress output.

    Returns None. When the page cannot be retrieved, or when no RSS feed
    is found, a DEBUG message is logged and the function returns early.

    Exceptions raised while processing are printed as a traceback and
    suppressed so that one failing blog does not abort the caller.
    KeyboardInterrupt and SystemExit are deliberately not suppressed.
    The connection is opened before the try block (a connect failure
    propagates) and is always closed afterwards.

    The numbered print() calls are progress markers on standard output.
    """
    print('CONNECTING ' + str((blog_id, blog_link, blog_rank)))
    conn = snac.connect_snac_db()
    print('CONNECTED ' + str((blog_id, blog_link, blog_rank)))
    try:
        print(timestamp() + 'Blog: ' + str((blog_id, blog_link, blog_rank)))

        # Fetch and parse the blog's page.
        print('00')
        html = retrieve_page(blog_link)
        print('01')
        if html is None:
            log('DEBUG', 'Page not retrieved')
            return
        print('02')
        soup = BeautifulSoup(html, "lxml")

        # Get blog roll
        print('03')
        blog_roll = retrieve_blog_roll(soup)
        print('04')
        store_blog_roll(run_id, conn, blog_id, blog_roll)

        # Get RSS feeds
        print('05')
        rss_feeds = retrieve_rss_feeds(soup)
        print('06')
        if not rss_feeds:
            log('DEBUG', 'No RSS feeds found')
            return
        print('07')
        # Use the URL (second element) of the first feed found.
        main_rss_url = rss_feeds[0][1]
        print('08')
        composed_rss_url = compose_rss_url(blog_link, main_rss_url)
        print('09')
        log('DEBUG', 'Retrieving posts: ' + composed_rss_url)
        print('10')
        rss = retrieve_posts(composed_rss_url)
        print('11')
        rss_entries = rss['entries']
        print(blog_link + ' - ' + str(len(rss_entries)) + ' rss entries')
        print('12')
        store_blog_posts(run_id, conn, blog_id, rss_entries)
    except Exception:
        # Report and continue; narrowed from a bare "except:" so that
        # interpreter-exit and interrupt signals are not swallowed.
        print(traceback.format_exc())
    finally:
        conn.close()
def get_ranked_blogs(limit=0):
    """
    Get list of blogs from database in ranked order, optionally up to a
    specified limit.

    limit -- maximum number of rows to fetch. Values <= 0 run the query
             without a LIMIT clause.

    Returns the result of cursor.fetchall() for a query selecting
    blog_id, link and rank from blog_rank_latest, ordered by rank. The
    connection opened for the query is closed before returning.
    """
    connection = snac.connect_snac_db()
    try:
        cursor = connection.cursor()
        base_query = ('select blog_id, link, rank from blog_rank_latest '
                      'order by rank')
        # Build the exact argument list for execute(): the limited form
        # carries a parameter tuple, the unlimited form carries none.
        if limit > 0:
            execute_args = (base_query + ' limit %s;', (limit, ))
        else:
            execute_args = (base_query + ';', )
        cursor.execute(*execute_args)
        ranked = cursor.fetchall()
        cursor.close()
    finally:
        connection.close()
    return ranked
def get_ranked_blogs(dbinfo, limit = 0):
    """
    Get list of blogs from database in ranked order, optionally up to a
    specified limit.

    dbinfo -- not used by this function; it opens its own connection.
    limit  -- maximum number of rows to fetch. Values <= 0 run the query
              without a LIMIT clause.

    Returns the result of cursor.fetchall() for a query selecting
    blog_id, link and rank from blog_rank_latest, ordered by rank. The
    connection opened for the query is closed before returning.
    """
    connection = snac.connect_snac_db()
    try:
        cursor = connection.cursor()
        ranking_query = ('select blog_id, link, rank from blog_rank_latest '
                         'order by rank')
        wants_limit = limit > 0
        if not wants_limit:
            cursor.execute(ranking_query + ';')
        else:
            # The limit is bound as a query parameter.
            cursor.execute(ranking_query + ' limit %s;', (limit, ))
        rows = cursor.fetchall()
        cursor.close()
        return rows
    finally:
        connection.close()
def connect():
    """
    Return a new connection obtained from snac.connect_snac_db().

    Thin wrapper: nothing is cached or configured here, and closing the
    returned connection is left to the caller.
    """
    connection = snac.connect_snac_db()
    return connection