def set_up_database():
    """
    Set up access to the database for a new run.

    Opens a connection with snac.connect_snac_db() and registers a new run
    on it via init_run().

    Returns:
        A dbinfo dictionary holding the open connection under 'main_conn'
        and the value returned by init_run() under 'run_id'.  The
        connection is handed to the caller still open.

    Raises:
        Whatever snac.connect_snac_db() or init_run() raise.  If init_run()
        fails, the connection is closed before the exception propagates.
    """
    conn = snac.connect_snac_db()
    try:
        # Add new run to the database.
        run_id = init_run(conn)
    except BaseException:
        # No caller has received the connection yet, so it must be
        # released here or it would leak.
        conn.close()
        raise
    return {'main_conn': conn, 'run_id': run_id}
def set_up_database():
    """
    Set up access to the database for a new run.

    Opens a connection with snac.connect_snac_db() and registers a new run
    on it via init_run().

    Returns:
        A dbinfo dictionary holding the open connection under 'conn' and
        the value returned by init_run() under 'run_id'.  The connection
        is handed to the caller still open.

    Raises:
        Whatever snac.connect_snac_db() or init_run() raise.  If init_run()
        fails, the connection is closed before the exception propagates.
    """
    conn = snac.connect_snac_db()
    try:
        # Add new run to the database.
        run_id = init_run(conn)
    except BaseException:
        # No caller has received the connection yet, so it must be
        # released here or it would leak.
        conn.close()
        raise
    return {'conn': conn, 'run_id': run_id}
def get_ranked_blogs(dbinfo, limit=0):
    """
    Get list of blogs from database in ranked order, optionally up to
    a specified limit.

    Args:
        dbinfo: Not used by this function; kept so existing callers keep
            working.
        limit: When greater than zero, a 'limit' clause with this value is
            added to the query.  Otherwise the query has no limit clause.

    Returns:
        Whatever db.execute_fetchone() returns for the query.

    The connection opened here is always closed before returning.
    """
    sql_prefix = ('select blog_id, link, rank from blog_rank_latest '
                  'order by rank')
    conn = snac.connect_snac_db()
    try:
        # NOTE(review): the helper is named execute_fetchone, which
        # suggests a single row rather than the list the summary promises
        # -- confirm against the db module.
        if limit > 0:
            return db.execute_fetchone(conn, sql_prefix + ' limit %s;',
                                       (limit, ))
        return db.execute_fetchone(conn, sql_prefix + ';')
    finally:
        # The connection is private to this call; release it even when
        # the query fails.
        conn.close()
Beispiel #4
0
def set_up_database():
    """
    Register a new run in the database and return what init_run() gives back.

    A connection is opened only for the duration of the init_run() call and
    is closed again whether or not that call succeeds.
    """
    db_conn = snac.connect_snac_db()
    try:
        return init_run(db_conn)
    finally:
        db_conn.close()
Beispiel #5
0
def set_up_database():
    """
    Register a new run in the database and return what init_run() gives back.

    A connection is opened only for the duration of the init_run() call and
    is closed again whether or not that call succeeds.
    """
    db_conn = snac.connect_snac_db()
    try:
        return init_run(db_conn)
    finally:
        db_conn.close()
def get_ranked_blogs(dbinfo, limit=0):
    """
    Get list of blogs from database in ranked order, optionally up to
    a specified limit.

    Args:
        dbinfo: Not used by this function; kept so existing callers keep
            working.
        limit: When greater than zero, a 'limit' clause with this value is
            added to the query.  Otherwise the query has no limit clause.

    Returns:
        Whatever db.execute_fetchone() returns for the query.

    The connection opened here is always closed before returning.
    """
    sql_prefix = ('select blog_id, link, rank from blog_rank_latest '
                  'order by rank')
    conn = snac.connect_snac_db()
    try:
        # NOTE(review): the helper is named execute_fetchone, which
        # suggests a single row rather than the list the summary promises
        # -- confirm against the db module.
        if limit > 0:
            return db.execute_fetchone(conn, sql_prefix + ' limit %s;',
                                       (limit, ))
        return db.execute_fetchone(conn, sql_prefix + ';')
    finally:
        # The connection is private to this call; release it even when
        # the query fails.
        conn.close()
Beispiel #7
0
def end_run(run_id):
    """
    Record the end time of a run.

    Sets end_time to CURRENT_TIMESTAMP on the blog_post_run row whose
    blog_post_run_id equals run_id, then commits.  The connection opened
    for the update is closed in every case.
    """
    update_stmt = ('update blog_post_run '
                   'set end_time = CURRENT_TIMESTAMP '
                   'where blog_post_run_id = %s;')
    db_conn = snac.connect_snac_db()
    try:
        cursor = db_conn.cursor()
        cursor.execute(update_stmt, (run_id,))
        cursor.close()
        db_conn.commit()
    finally:
        db_conn.close()
def end_run(dbinfo):
    """
    Record the end time of a run.

    Sets end_time to CURRENT_TIMESTAMP on the blog_post_run row whose
    blog_post_run_id equals dbinfo['run_id'], then commits.  The
    connection opened for the update is closed in every case.
    """
    update_stmt = ('update blog_post_run '
                   'set end_time = CURRENT_TIMESTAMP '
                   'where blog_post_run_id = %s;')
    db_conn = snac.connect_snac_db()
    try:
        cursor = db_conn.cursor()
        params = (dbinfo['run_id'],)
        cursor.execute(update_stmt, params)
        cursor.close()
        db_conn.commit()
    finally:
        db_conn.close()
    def run(self):
        """
        Process the blog held in self.blog.

        self.blog is unpacked as (blog_id, blog_link, blog_rank).  The page
        at blog_link is retrieved and parsed; the result of
        retrieve_blog_roll() is passed to store_blog_roll(), and the
        entries of the first RSS feed found are passed to
        store_blog_posts().  Returns early, after a DEBUG log message, when
        the page is not retrieved or no RSS feed is found.

        The numbered print() calls are progress markers.  The connection
        opened here is closed on every exit path.
        """
        db_conn = snac.connect_snac_db()
        print('CONNECTED ' + str(self.blog))
        try:
            print(timestamp() + 'Blog: ' + str(self.blog))
            blog_id, blog_link, _blog_rank = self.blog
            print('00')
            page_html = retrieve_page(self.dbinfo, blog_link)
            print('01')
            if page_html is None:
                log(self.dbinfo, 'DEBUG', 'Page not retrieved')
                return
            print('02')
            parsed_page = BeautifulSoup(page_html, "lxml")

            # Blog roll: extract from the parsed page, then store.
            print('03')
            roll = retrieve_blog_roll(self.dbinfo, parsed_page)
            print('04')
            store_blog_roll(self.dbinfo, db_conn, blog_id, roll)

            # RSS feeds: only the first feed found is followed.
            print('05')
            feeds = retrieve_rss_feeds(self.dbinfo, parsed_page)
            print('06')
            if len(feeds) == 0:
                log(self.dbinfo, 'DEBUG', 'No RSS feeds found')
                return
            print('07')
            first_feed_url = feeds[0][1]
            print('08')
            feed_url = compose_rss_url(blog_link, first_feed_url)
            print('09')
            log(self.dbinfo, 'DEBUG', 'Retrieving posts: ' + feed_url)
            print('10')
            feed = retrieve_posts(feed_url)
            print('11')
            entries = feed['entries']
            print('12')
            store_blog_posts(self.dbinfo, db_conn, blog_id, entries)
            print('13')
        finally:
            print('14')
            db_conn.close()
Beispiel #10
0
 def run(self):
     """
     Process the blog held in self.blog.

     self.blog is unpacked as (blog_id, blog_link, blog_rank).  The page
     at blog_link is retrieved and parsed; the result of
     retrieve_blog_roll() is passed to store_blog_roll(), and the
     entries of the first RSS feed found are passed to
     store_blog_posts().  Returns early, after a DEBUG log message, when
     the page is not retrieved or no RSS feed is found.

     The numbered print() calls are progress markers.  The connection
     opened here is closed on every exit path.
     """
     db_conn = snac.connect_snac_db()
     print('CONNECTED ' + str(self.blog))
     try:
         print(timestamp() + 'Blog: ' + str(self.blog))
         blog_id, blog_link, _blog_rank = self.blog
         print('00')
         page_html = retrieve_page(self.dbinfo, blog_link)
         print('01')
         if page_html is None:
             log(self.dbinfo, 'DEBUG', 'Page not retrieved')
             return
         print('02')
         parsed_page = BeautifulSoup(page_html, "lxml")

         # Blog roll: extract from the parsed page, then store.
         print('03')
         roll = retrieve_blog_roll(self.dbinfo, parsed_page)
         print('04')
         store_blog_roll(self.dbinfo, db_conn, blog_id, roll)

         # RSS feeds: only the first feed found is followed.
         print('05')
         feeds = retrieve_rss_feeds(self.dbinfo, parsed_page)
         print('06')
         if len(feeds) == 0:
             log(self.dbinfo, 'DEBUG', 'No RSS feeds found')
             return
         print('07')
         first_feed_url = feeds[0][1]
         print('08')
         feed_url = compose_rss_url(blog_link, first_feed_url)
         print('09')
         log(self.dbinfo, 'DEBUG', 'Retrieving posts: ' + feed_url)
         print('10')
         feed = retrieve_posts(feed_url)
         print('11')
         entries = feed['entries']
         print('12')
         store_blog_posts(self.dbinfo, db_conn, blog_id, entries)
         print('13')
     finally:
         print('14')
         db_conn.close()
Beispiel #11
0
def get_blog_post_run():
    """
    Fetch the blog_post_run row with the highest blog_post_run_id.

    Returns the result of fetchone() for a query selecting
    (blog_post_run_id, start_time, end_time), ordered by blog_post_run_id
    descending and limited to one row.  The connection opened for the
    query is closed in every case.
    """
    latest_run_query = """
        select blog_post_run_id, start_time, end_time
        from blog_post_run
        order by blog_post_run_id desc
        limit 1;
        """
    db_conn = snac.connect_snac_db()
    try:
        cursor = db_conn.cursor()
        cursor.execute(latest_run_query)
        latest_run = cursor.fetchone()
        cursor.close()
        db_conn.commit()
    finally:
        db_conn.close()
    return latest_run
def retrieve_site(run_id, blog_id, blog_link, blog_rank):
    """
    Process a single blog on its own database connection.

    The page at blog_link is retrieved and parsed; the result of
    retrieve_blog_roll() is passed to store_blog_roll(), and the entries
    of the first RSS feed found are passed to store_blog_posts().  Returns
    early, after a DEBUG log message, when the page is not retrieved or no
    RSS feed is found.

    Args:
        run_id: Passed through to store_blog_roll() and store_blog_posts().
        blog_id: Passed through to the same two store calls.
        blog_link: URL handed to retrieve_page() and compose_rss_url().
        blog_rank: Only used in the progress output.

    Returns:
        None.  Ordinary errors (Exception subclasses) are not propagated:
        their traceback is printed and the function returns normally.
        KeyboardInterrupt and SystemExit are deliberately not swallowed.
        The connection is closed on every exit path.
    """
    blog_desc = str((blog_id, blog_link, blog_rank))
    print('CONNECTING ' + blog_desc)
    conn = snac.connect_snac_db()
    print('CONNECTED ' + blog_desc)
    try:
        print(timestamp() + 'Blog: ' + blog_desc)
        print('00')
        html = retrieve_page(blog_link)
        print('01')
        if html is None:
            log('DEBUG', 'Page not retrieved')
            return
        print('02')
        soup = BeautifulSoup(html, "lxml")
        # Get blog roll
        print('03')
        blog_roll = retrieve_blog_roll(soup)
        print('04')
        store_blog_roll(run_id, conn, blog_id, blog_roll)
        # Get RSS feeds
        print('05')
        rss_feeds = retrieve_rss_feeds(soup)
        print('06')
        if not rss_feeds:
            log('DEBUG', 'No RSS feeds found')
            return
        print('07')
        # Only the first feed found is followed.
        main_rss_url = rss_feeds[0][1]
        print('08')
        composed_rss_url = compose_rss_url(blog_link, main_rss_url)
        print('09')
        log('DEBUG', 'Retrieving posts: ' + composed_rss_url)
        print('10')
        rss = retrieve_posts(composed_rss_url)
        print('11')
        rss_entries = rss['entries']
        print(blog_link + ' - ' + str(len(rss_entries)) + ' rss entries')
        print('12')
        store_blog_posts(run_id, conn, blog_id, rss_entries)
    except Exception:
        # Best-effort per blog: report the failure and carry on.  Catching
        # Exception rather than everything lets Ctrl-C and interpreter
        # shutdown still stop the process.
        print(traceback.format_exc())
    finally:
        conn.close()
def retrieve_site(run_id, blog_id, blog_link, blog_rank):
    """
    Process a single blog on its own database connection.

    The page at blog_link is retrieved and parsed; the result of
    retrieve_blog_roll() is passed to store_blog_roll(), and the entries
    of the first RSS feed found are passed to store_blog_posts().  Returns
    early, after a DEBUG log message, when the page is not retrieved or no
    RSS feed is found.

    Args:
        run_id: Passed through to store_blog_roll() and store_blog_posts().
        blog_id: Passed through to the same two store calls.
        blog_link: URL handed to retrieve_page() and compose_rss_url().
        blog_rank: Only used in the progress output.

    Returns:
        None.  Ordinary errors (Exception subclasses) are not propagated:
        their traceback is printed and the function returns normally.
        KeyboardInterrupt and SystemExit are deliberately not swallowed.
        The connection is closed on every exit path.
    """
    blog_desc = str((blog_id, blog_link, blog_rank))
    print('CONNECTING ' + blog_desc)
    conn = snac.connect_snac_db()
    print('CONNECTED ' + blog_desc)
    try:
        print(timestamp() + 'Blog: ' + blog_desc)
        print('00')
        html = retrieve_page(blog_link)
        print('01')
        if html is None:
            log('DEBUG', 'Page not retrieved')
            return
        print('02')
        soup = BeautifulSoup(html, "lxml")
        # Get blog roll
        print('03')
        blog_roll = retrieve_blog_roll(soup)
        print('04')
        store_blog_roll(run_id, conn, blog_id, blog_roll)
        # Get RSS feeds
        print('05')
        rss_feeds = retrieve_rss_feeds(soup)
        print('06')
        if not rss_feeds:
            log('DEBUG', 'No RSS feeds found')
            return
        print('07')
        # Only the first feed found is followed.
        main_rss_url = rss_feeds[0][1]
        print('08')
        composed_rss_url = compose_rss_url(blog_link, main_rss_url)
        print('09')
        log('DEBUG', 'Retrieving posts: ' + composed_rss_url)
        print('10')
        rss = retrieve_posts(composed_rss_url)
        print('11')
        rss_entries = rss['entries']
        print(blog_link + ' - ' + str(len(rss_entries)) + ' rss entries')
        print('12')
        store_blog_posts(run_id, conn, blog_id, rss_entries)
    except Exception:
        # Best-effort per blog: report the failure and carry on.  Catching
        # Exception rather than everything lets Ctrl-C and interpreter
        # shutdown still stop the process.
        print(traceback.format_exc())
    finally:
        conn.close()
Beispiel #14
0
def get_ranked_blogs(limit=0):
    """
    Fetch (blog_id, link, rank) rows from blog_rank_latest, ordered by rank.

    When limit is greater than zero, a 'limit' clause with that value is
    appended to the query; otherwise the query has no limit clause.
    Returns the result of fetchall().  The connection opened for the query
    is closed in every case.
    """
    db_conn = snac.connect_snac_db()
    try:
        cursor = db_conn.cursor()
        base_query = ('select blog_id, link, rank from blog_rank_latest '
                      'order by rank')
        # Build the exact argument list for execute(): query plus
        # parameters when limited, query alone otherwise.
        execute_args = ((base_query + ' limit %s;', (limit,))
                        if limit > 0
                        else (base_query + ';',))
        cursor.execute(*execute_args)
        ranked = cursor.fetchall()
        cursor.close()
    finally:
        db_conn.close()
    return ranked
def get_ranked_blogs(dbinfo, limit=0):
    """
    Fetch (blog_id, link, rank) rows from blog_rank_latest, ordered by rank.

    dbinfo is accepted but not used.  When limit is greater than zero, a
    'limit' clause with that value is appended to the query; otherwise the
    query has no limit clause.  Returns the result of fetchall().  The
    connection opened for the query is closed in every case.
    """
    db_conn = snac.connect_snac_db()
    try:
        cursor = db_conn.cursor()
        base_query = ('select blog_id, link, rank from blog_rank_latest '
                      'order by rank')
        # Build the exact argument list for execute(): query plus
        # parameters when limited, query alone otherwise.
        execute_args = ((base_query + ' limit %s;', (limit,))
                        if limit > 0
                        else (base_query + ';',))
        cursor.execute(*execute_args)
        ranked = cursor.fetchall()
        cursor.close()
    finally:
        db_conn.close()
    return ranked
Beispiel #16
0
def connect():
    """Return a new connection obtained from snac.connect_snac_db()."""
    new_conn = snac.connect_snac_db()
    return new_conn
Beispiel #17
0
def connect():
    """Return a new connection obtained from snac.connect_snac_db()."""
    new_conn = snac.connect_snac_db()
    return new_conn