Ejemplo n.º 1
0
def get_most_frequent_title_words(feed_ids):
    # Get number of requested words from query parameter, using default if
    # null
    count = int(request.args.get('count', 1))

    # Connect to database
    session = models.Session()

    # Get highest-frequency words used in post titles, excluding stop words,
    # filtering by feed_ids if specified
    if feed_ids:
        query = session.execute(
            """
              SELECT *
                FROM ts_stat(
                     $$SELECT to_tsvector('simple_english', LOWER(title))
                         FROM (
                                 SELECT DISTINCT ON (post.id)
                                        post.id, post.title, feed_post.feed_id
                                   FROM post
                                        JOIN feed_post
                                          ON feed_post.post_id = post.id
                                  WHERE feed_id = ANY(:feed_id)
                               ORDER BY post.id, feed_post.feed_id DESC
                         ) post_table$$
                )
               WHERE word NOT IN ('ask', 'hn', 'show')
                 AND LENGTH (word) > 1
            ORDER BY nentry DESC
               LIMIT :count;
            """, {
                'feed_id': feed_ids,
                'count': count
            }).fetchall()

    else:
        query = session.execute(
            """
              SELECT *
                FROM ts_stat(
                     $$SELECT to_tsvector('simple_english', LOWER(title))
                         FROM post$$
                )
               WHERE word NOT IN ('ask', 'hn', 'show')
                 AND LENGTH (word) > 1
            ORDER BY nentry DESC
               LIMIT :count;
            """, {
                'feed_id': feed_ids,
                'count': count
            }).fetchall()

    session.close()

    words = []

    for row in query:
        words.append(dict(row))

    return jsonify(words)
Ejemplo n.º 2
0
def scrape_loop():
    # Connect to database
    session = models.Session()

    # Add feed to database
    new_feed = models.Feed()

    session.add(new_feed)

    session.commit()

    feed_id = new_feed.id

    # Create asynchronous tasks to scrape first three pages of Hacker News
    loop = asyncio.get_event_loop()

    tasks = [
        loop.create_task(scrape_page(1, feed_id, loop)),
        loop.create_task(scrape_page(2, feed_id, loop)),
        loop.create_task(scrape_page(3, feed_id, loop))
    ]

    wait_tasks = asyncio.wait(tasks)

    loop.run_until_complete(wait_tasks)

    loop.close()

    session.close()

    print('Scrape completed for first three pages of Hacker News.')

    return
Ejemplo n.º 3
0
def get_users_with_most_comments(feed_ids):
    # Return sample data if there is no database connection
    if not os.environ['DB_CONNECTION']:
        with open(
                os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +
                '/sample_data/' + feed_ids + '_users_most_comments.json',
                'r') as sample_data:
            return jsonify(json.load(sample_data))

    # Get number of requested users from query parameter, using default if
    # null
    count = int(request.args.get('count', 1))

    # Connect to database
    session = models.Session()

    # Get users who posted the most comments, filtering by feed_ids if
    # specified
    if feed_ids:
        subquery = session.query(models.Comment).with_entities(
            models.Comment.id, models.Comment.total_word_count,
            models.Comment.username).join(models.FeedComment).filter(
                models.FeedComment.feed_id.in_(feed_ids)).filter(
                    models.Comment.username != '').order_by(
                        models.Comment.id,
                        models.FeedComment.feed_id.desc()).distinct(
                            models.Comment.id).subquery()

        query = session.query(subquery).with_entities(
            subquery.columns.get('username'),
            func.count('*').label("comment_count"),
            func.sum(subquery.columns.get(
                'total_word_count')).label("word_count")).group_by(
                    subquery.columns.get('username')).order_by(
                        desc('comment_count')).limit(count)

    else:
        subquery = session.query(models.Comment).with_entities(
            models.Comment.id, models.Comment.total_word_count,
            models.Comment.username).filter(
                models.Comment.username != '').subquery()

        query = session.query(subquery).with_entities(
            subquery.columns.get('username'),
            func.count('*').label("comment_count"),
            func.sum(subquery.columns.get(
                'total_word_count')).label("word_count")).group_by(
                    subquery.columns.get('username')).order_by(
                        desc('comment_count')).limit(count)

    session.close()

    users = []

    for row in query:
        users.append(row._asdict())

    return jsonify(users)
Ejemplo n.º 4
0
def get_posts_with_highest_comment_counts(feed_ids):
    # Return sample data if there is no database connection
    if not os.environ['DB_CONNECTION']:
        with open(
                os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +
                '/sample_data/' + feed_ids +
                '_posts_highest_comment_count.json', 'r') as sample_data:
            return jsonify(json.load(sample_data))

    # Get number of requested posts from query parameter, using default if
    # null
    count = int(request.args.get('count', 1))

    # Connect to database
    session = models.Session()

    # Get posts with highest comment counts, filtering by feed_ids if specified
    if feed_ids:
        subquery = session.query(models.Post).with_entities(
            models.Post.created, models.Post.id, models.Post.link,
            models.Post.title, models.Post.type, models.Post.username,
            models.Post.website, models.FeedPost.comment_count,
            models.FeedPost.feed_rank,
            models.FeedPost.point_count).join(models.FeedPost).filter(
                models.FeedPost.feed_id.in_(feed_ids)).order_by(
                    models.Post.id,
                    models.FeedPost.comment_count.desc()).distinct(
                        models.Post.id).subquery()

        query = session.query(subquery).order_by(
            subquery.columns.get('comment_count').desc()).limit(count)

    else:
        subquery = session.query(models.Post).with_entities(
            models.Post.created, models.Post.id, models.Post.link,
            models.Post.title, models.Post.type, models.Post.username,
            models.Post.website, models.FeedPost.comment_count,
            models.FeedPost.feed_rank,
            models.FeedPost.point_count).join(models.FeedPost).order_by(
                models.Post.id, models.FeedPost.comment_count.desc()).distinct(
                    models.Post.id).subquery()

        query = session.query(subquery).order_by(
            subquery.columns.get('comment_count').desc()).limit(count)

    session.close()

    posts = []

    for row in query:
        posts.append(row._asdict())

    return jsonify(posts)
Ejemplo n.º 5
0
def get_users_with_most_words_in_comments(feed_ids):
    # Get number of requested users from query parameter, using default if
    # null
    count = int(request.args.get('count', 1))

    # Connect to database
    session = models.Session()

    # Get users who posted the most words in comments, filtering by feed_ids if
    # specified
    if feed_ids:
        subquery = session.query(models.Comment).with_entities(
            models.Comment.id, models.Comment.total_word_count,
            models.Comment.username).join(models.FeedComment).filter(
                models.FeedComment.feed_id.in_(feed_ids)).filter(
                    models.Comment.username != '').order_by(
                        models.Comment.id,
                        models.FeedComment.feed_id.desc()).distinct(
                            models.Comment.id).subquery()

        query = session.query(subquery).with_entities(
            subquery.columns.get('username'),
            func.count('*').label("comment_count"),
            func.sum(subquery.columns.get(
                'total_word_count')).label("word_count")).group_by(
                    subquery.columns.get('username')).order_by(
                        desc('word_count')).limit(count)

    else:
        subquery = session.query(models.Comment).with_entities(
            models.Comment.id, models.Comment.total_word_count,
            models.Comment.username).filter(
                models.Comment.username != '').subquery()

        query = session.query(subquery).with_entities(
            subquery.columns.get('username'),
            func.count('*').label("comment_count"),
            func.sum(subquery.columns.get(
                'total_word_count')).label("word_count")).group_by(
                    subquery.columns.get('username')).order_by(
                        desc('word_count')).limit(count)

    session.close()

    users = []

    for row in query:
        users.append(row._asdict())

    return jsonify(users)
Ejemplo n.º 6
0
def initialize_database():
    # Connect to database
    session = models.Session()

    metadata = MetaData()

    # Create custom text dictionary for detecting stop words without word
    # stemming
    session.execute("""
        DO $$
        BEGIN
            IF NOT EXISTS
                          (SELECT 1
                             FROM pg_ts_dict
                            WHERE dictname = 'simple_english')
                     THEN CREATE TEXT SEARCH DICTIONARY simple_english (
                        TEMPLATE = pg_catalog.simple,
                        STOPWORDS = english
                     );
            END IF;
        END
        $$;

        DO $$
        BEGIN
            IF NOT EXISTS
                          (SELECT 1
                             FROM pg_ts_config
                            WHERE cfgname = 'simple_english')
                     THEN CREATE TEXT SEARCH CONFIGURATION simple_english (
                        COPY = english
                     );
            END IF;
        END
        $$;

        ALTER TEXT SEARCH CONFIGURATION simple_english
            ALTER MAPPING FOR asciihword, asciiword, hword, hword_asciipart,
                              hword_part, word
                         WITH simple_english;
        """)

    session.commit()

    session.close()

    print('Database ' + os.environ['DB_NAME'] + ' initialized successfully.')

    return
Ejemplo n.º 7
0
def get_top_posts(feed_ids):
    # Get number of requested posts from query parameter, using default if
    # null
    count = int(request.args.get('count', 3))

    # Connect to database
    session = models.Session()

    # Get posts in order of rank, filtering by feed_ids if specified
    if feed_ids:
        subquery = session.query(models.Post).with_entities(
            models.Post.created, models.Post.id, models.Post.link,
            models.Post.title, models.Post.type, models.Post.username,
            models.Post.website, models.FeedPost.comment_count,
            models.FeedPost.feed_rank,
            models.FeedPost.point_count).join(models.FeedPost).filter(
                models.FeedPost.feed_id.in_(feed_ids)).order_by(
                    models.Post.id, models.FeedPost.feed_rank,
                    models.FeedPost.point_count.desc()).distinct(
                        models.Post.id).subquery()

        query = session.query(subquery).order_by(
            subquery.columns.get('feed_rank'),
            subquery.columns.get('point_count').desc()).limit(count)

    else:
        subquery = session.query(models.Post).with_entities(
            models.Post.created, models.Post.id, models.Post.link,
            models.Post.title, models.Post.type, models.Post.username,
            models.Post.website, models.FeedPost.comment_count,
            models.FeedPost.feed_rank,
            models.FeedPost.point_count).join(models.FeedPost).order_by(
                models.Post.id, models.FeedPost.feed_rank,
                models.FeedPost.point_count.desc()).distinct(
                    models.Post.id).subquery()

        query = session.query(subquery).order_by(
            subquery.columns.get('feed_rank'),
            subquery.columns.get('point_count').desc()).limit(count)

    session.close()

    posts = []

    for row in query:
        posts.append(row._asdict())

    return jsonify(posts)
Ejemplo n.º 8
0
def get_average_point_count(feed_ids):
    # Connect to database
    session = models.Session()

    # Get average post point count, filtering by feed_ids if specified
    if feed_ids:
        average = round(
            session.query(func.avg(models.FeedPost.point_count)).filter(
                models.FeedPost.feed_id.in_(feed_ids)).one()[0])

    else:
        average = round(
            session.query(func.avg(models.FeedPost.point_count)).one()[0])

    session.close()

    return jsonify(average)
Ejemplo n.º 9
0
def get_average_comment_tree_depth(feed_ids):
    # Connect to database
    session = models.Session()

    # Get average comment level, filtering by feed_ids if specified
    if feed_ids:
        average = round(
            session.query(func.avg(models.Comment.level)).join(
                models.FeedComment).filter(
                    models.FeedComment.feed_id.in_(feed_ids)).one()[0])

    else:
        average = round(session.query(func.avg(models.Comment.level)).one()[0])

    session.close()

    return jsonify(average)
Ejemplo n.º 10
0
def get_top_websites(feed_ids):
    # Get number of requested websites from query parameter, using default if
    # null
    count = int(request.args.get('count', 1))

    # Connect to database
    session = models.Session()

    # Get websites that highest number of posts are from, filtering by feed_ids
    # if specified
    if feed_ids:
        subquery = session.query(models.Post).with_entities(
            models.Post.id, models.Post.website).join(models.FeedPost).filter(
                models.FeedPost.feed_id.in_(feed_ids)).filter(
                    models.Post.website != '').order_by(
                        models.Post.id,
                        models.FeedPost.feed_id.desc()).distinct(
                            models.Post.id).subquery()

        query = session.query(subquery).with_entities(
            subquery.columns.get('website'),
            func.count('*').label("link_count")).group_by(
                subquery.columns.get('website')).order_by(
                    desc('link_count')).limit(count)

    else:
        subquery = session.query(models.Post).with_entities(
            models.Post.id,
            models.Post.website).filter(models.Post.website != '').subquery()

        query = session.query(subquery).with_entities(
            subquery.columns.get('website'),
            func.count('*').label("link_count")).group_by(
                subquery.columns.get('website')).order_by(
                    desc('link_count')).limit(count)

    session.close()

    websites = []

    for row in query:
        websites.append(row._asdict())

    return jsonify(websites)
Ejemplo n.º 11
0
def get_post_types(feed_ids):
    # Return sample data if there is no database connection
    if not os.environ['DB_CONNECTION']:
        with open(
                os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +
                '/sample_data/' + feed_ids + '_post_types.json',
                'r') as sample_data:
            return jsonify(json.load(sample_data))

    # Otherwise, connect to database
    session = models.Session()

    # Get count of types of posts ('article' vs. 'ask' vs. 'job' vs. 'show'),
    # filtering by feed_ids if specified
    if feed_ids:
        subquery = session.query(models.Post).with_entities(
            models.Post.id, models.Post.type).join(models.FeedPost).filter(
                models.FeedPost.feed_id.in_(feed_ids)).order_by(
                    models.Post.id, models.FeedPost.feed_id.desc()).distinct(
                        models.Post.id).subquery()

        query = session.query(subquery).with_entities(
            subquery.columns.get('type'),
            func.count('*').label("type_count")).group_by(
                subquery.columns.get('type')).order_by(desc('type_count'))

    else:
        subquery = session.query(models.Post).with_entities(
            models.Post.id, models.Post.type).subquery()

        query = session.query(subquery).with_entities(
            subquery.columns.get('type'),
            func.count('*').label("type_count")).group_by(
                subquery.columns.get('type')).order_by(desc('type_count'))

    session.close()

    types = []

    for row in query:
        types.append(row._asdict())

    return jsonify(types)
Ejemplo n.º 12
0
def get_comments_with_highest_word_counts(feed_ids):
    # Get number of requested comments from query parameter, using default if
    # null
    count = int(request.args.get('count', 1))

    # Connect to database
    session = models.Session()

    # Get comments with highest word counts, filtering by feed_ids if specified
    if feed_ids:
        subquery = session.query(models.Comment).with_entities(
            models.Comment.content, models.Comment.created, models.Comment.id,
            models.Comment.level, models.Comment.parent_comment,
            models.Comment.post_id, models.Comment.username,
            models.Comment.total_word_count).join(models.FeedComment).filter(
                models.FeedComment.feed_id.in_(feed_ids)).order_by(
                    models.Comment.id,
                    models.Comment.total_word_count.desc()).distinct(
                        models.Comment.id).subquery()

        query = session.query(subquery).order_by(
            subquery.columns.get('total_word_count').desc()).limit(count)

    else:
        query = session.query(models.Comment).with_entities(
            models.Comment.content, models.Comment.created, models.Comment.id,
            models.Comment.level, models.Comment.parent_comment,
            models.Comment.post_id, models.Comment.username,
            models.Comment.total_word_count).order_by(
                models.Comment.total_word_count.desc()).limit(count)

    session.close()

    comments = []

    for row in query:
        comments.append(row._asdict())

    return jsonify(comments)
Ejemplo n.º 13
0
def get_comment(comment_id):
    # Connect to database
    session = models.Session()

    # Get comment from database
    try:
        comment = session.query(models.Comment).with_entities(
            models.Comment.content, models.Comment.created,
            models.Comment.level, models.Comment.parent_comment,
            models.Comment.post_id, models.Comment.username,
            models.FeedComment.feed_rank).join(models.FeedComment).filter(
                models.Comment.id == comment_id).order_by(
                    models.FeedComment.feed_id.desc()).limit(1).one()

        session.close()

        return jsonify(comment._asdict())

    # Return error if comment not returned from query
    except NoResultFound:
        session.close()

        return make_response('Comment not found', 404)
Ejemplo n.º 14
0
def get_post(post_id):
    # Connect to database
    session = models.Session()

    # Get post from database
    try:
        post = session.query(models.Post).with_entities(
            models.Post.created, models.Post.link, models.Post.title,
            models.Post.type, models.Post.username, models.Post.website,
            models.FeedPost.comment_count, models.FeedPost.feed_rank,
            models.FeedPost.point_count).join(
                models.FeedPost).filter(models.Post.id == post_id).order_by(
                    models.FeedPost.feed_id.desc()).limit(1).one()

        session.close()

        return jsonify(post._asdict())

    # Return error if post not returned from query
    except NoResultFound:
        session.close()

        return make_response('Post not found', 404)
Ejemplo n.º 15
0
def get_feeds(time_period):
    # Return time period if there is no database connection
    if not os.environ['DB_CONNECTION']:
        return time_period

    # Connect to database
    session = models.Session()

    # Get requested feed(s) from database based on passed time value
    if time_period == 'day':
        feed_ids = [
            row.id for row in session.query(models.Feed).filter(
                models.Feed.created > date.today()).all()
        ]

    # Get one feed per day in past week if 'week' is specified
    elif time_period == 'week':
        feed_ids = []

        for i in range(7):
            feed_ids.append(
                session.query(models.Feed.id).filter(
                    models.Feed.created > date.today() -
                    timedelta(days=i)).limit(1).one()[0])

    # Return no feed_ids if 'all' is specified so all data can be queried
    elif time_period == 'all':
        feed_ids = None

    # Return most recent feed_id if time_period is 'hour' or unspecified
    else:
        feed_ids = [
            row.id for row in session.query(models.Feed).order_by(
                models.Feed.created.desc()).limit(1)
        ]

    return feed_ids
Ejemplo n.º 16
0
async def scrape_post(post_id, feed_id, loop, page_number):
    # Connect to database
    session = models.Session()

    # Get current UTC time in seconds
    now = int(datetime.utcnow().strftime('%s'))

    # Get HTML tree from post's webpage, specifying page number if given
    if page_number:
        post_html = requests.get('https://news.ycombinator.com/item?id=' +
                                 str(post_id) + '&p=' + str(page_number))

    else:
        post_html = requests.get('https://news.ycombinator.com/item?id=' +
                                 str(post_id))

    post_content = post_html.content

    post_soup = BeautifulSoup(post_content, 'html.parser')

    # If post page contains a "More" link to more comments, create asynchronous
    # task to scrape that page for its comments
    if (post_soup.find('a', 'morelink')):
        page_number = post_soup.find('a',
                                     'morelink').get('href').split('&p=')[1]

        loop.create_task(scrape_post(post_id, feed_id, loop, page_number))

    # Get all comment rows from HTML tree
    comment_rows = post_soup.select('tr.athing.comtr')

    # Set starting comment feed rank to 0
    comment_feed_rank = 0

    for comment_row in comment_rows:
        # Get comment id
        comment_id = comment_row.get('id')

        # Check if comment exists in database
        comment_exists = session.query(
            models.Comment.id).filter_by(id=comment_id).scalar()

        # Get core comment data if it is not in database already
        if not comment_exists:
            # If comment has content span, get text from span
            if comment_row.find('div', 'comment').find_all('span'):
                comment_content = comment_row.find(
                    'div', 'comment').find_all('span')[0].get_text()

                # Remove the last word ('reply') from the comment content
                # and strip trailing whitespace
                comment_content = comment_content.rsplit(' ', 1)[0].strip()

                total_word_count = len(comment_content.split())

            # Otherwise, comment is flagged, so get flagged message as text
            # and strip trailing whitespace
            else:
                comment_content = comment_row.find(
                    'div', 'comment').get_text().strip()

                total_word_count = 0

            # Get UTC timestamp for comment's posting time by subtracting
            # the number of days/hours/minutes ago given on the webpage from
            # the current UTC timestamp
            comment_time_unit = comment_row.find('span',
                                                 'age').a.get_text().split()[1]

            if 'day' in comment_time_unit:
                comment_created = now - 86400 * int(
                    comment_row.find('span', 'age').a.get_text().split()[0])

            elif 'hour' in comment_time_unit:
                comment_created = now - 3600 * int(
                    comment_row.find('span', 'age').a.get_text().split()[0])

            else:
                comment_created = now - 60 * int(
                    comment_row.find('span', 'age').a.get_text().split()[0])

            comment_created = time.strftime('%Y-%m-%d %H:%M',
                                            time.localtime(comment_created))

            # Get comment's level in tree by getting indentation width
            # value divided by value of one indent (40px)
            level = int(
                comment_row.find('td', 'ind').contents[0].get('width')) / 40

            # Set parent comment as blank if comment is the top-level
            # comment
            if level == 0:
                parent_comment = None

            # Otherwise, get preceding comment in comment tree
            else:
                parent_comment = session.query(models.Comment).with_entities(
                    models.Comment.id).join(models.FeedComment).filter(
                        models.Comment.level == (level - 1)
                    ).filter(models.FeedComment.feed_id == feed_id).filter(
                        models.Comment.post_id == post_id).order_by(
                            models.FeedComment.feed_rank).limit(1).one()[0]

            # Get username of user who posted comment
            try:
                comment_username = comment_row.find('a', 'hnuser').get_text()

            except AttributeError:
                comment_username = ''

            # Add scraped comment data to database
            comment = models.Comment(content=comment_content,
                                     created=comment_created,
                                     id=comment_id,
                                     level=level,
                                     parent_comment=parent_comment,
                                     post_id=post_id,
                                     total_word_count=total_word_count,
                                     username=comment_username,
                                     word_counts=func.to_tsvector(
                                         'simple_english',
                                         comment_content.lower()))

            session.add(comment)

        # Increment comment feed rank to get current comment's rank
        comment_feed_rank += 1

        # Add feed-based comment data to database
        feed_comment = models.FeedComment(comment_id=comment_id,
                                          feed_id=feed_id,
                                          feed_rank=comment_feed_rank)

        session.add(feed_comment)

    session.commit()

    # Print message if there are no more pages of comments to scrape
    if not post_soup.find('a', 'morelink'):
        print('Post ' + str(post_id) + ' and its comments scraped')

    return
Ejemplo n.º 17
0
def get_deepest_comment_tree(feed_ids):
    # Connect to database
    session = models.Session()

    # Get highest level comment (deepest in comment tree), filtering by
    # feed_ids if specified
    if feed_ids:
        subquery = session.query(models.Comment).with_entities(
            models.Comment.content, models.Comment.created, models.Comment.id,
            models.Comment.level, models.Comment.parent_comment,
            models.Comment.post_id,
            models.Comment.username).join(models.FeedComment).filter(
                models.FeedComment.feed_id.in_(feed_ids)).order_by(
                    models.Comment.id, models.Comment.level.desc()).distinct(
                        models.Comment.id).subquery()

        comment = session.query(subquery).order_by(
            subquery.columns.get('level').desc()).limit(1).one()._asdict()

        # Get post information
        post = session.query(models.Post).with_entities(
            models.Post.created, models.Post.id, models.Post.link,
            models.Post.title, models.Post.type, models.Post.username,
            models.FeedPost.comment_count, models.FeedPost.feed_rank,
            models.FeedPost.point_count).join(models.FeedPost).filter(
                models.Post.id == comment['post_id']).filter(
                    models.FeedPost.feed_id.in_(feed_ids)).order_by(
                        models.FeedPost.post_id.desc()).limit(
                            1).one()._asdict()

    else:
        comment = session.query(models.Comment).with_entities(
            models.Comment.content, models.Comment.created, models.Comment.id,
            models.Comment.level, models.Comment.parent_comment,
            models.Comment.post_id, models.Comment.username).order_by(
                models.Comment.level.desc()).limit(1).one()._asdict()

        # Get post information
        post = session.query(models.Post).with_entities(
            models.Post.created, models.Post.id, models.Post.link,
            models.Post.title, models.Post.type, models.Post.username,
            models.FeedPost.comment_count, models.FeedPost.feed_rank,
            models.FeedPost.point_count).join(models.FeedPost).filter(
                models.Post.id == comment['post_id']).order_by(
                    models.FeedPost.feed_id.desc()).limit(1).one()._asdict()

    comment.pop('post_id')
    comment.pop('level')

    # Get parent comments of comment to get full comment tree
    while comment['parent_comment']:
        parent_comment = session.query(models.Comment).with_entities(
            models.Comment.content, models.Comment.created, models.Comment.id,
            models.Comment.parent_comment, models.Comment.username).filter(
                models.Comment.id ==
                comment['parent_comment']).one()._asdict()

        comment.pop('parent_comment')

        # Set comment as child of parent
        parent_comment['child_comment'] = comment

        # Set next comment as current parent comment
        comment = parent_comment

    comment.pop('parent_comment')

    post['comment_tree'] = comment

    session.close()

    return jsonify(post)
Ejemplo n.º 18
0
def get_most_frequent_comment_words(feed_ids):
    # Return sample data if there is no database connection
    if not os.environ['DB_CONNECTION']:
        with open(
                os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +
                '/sample_data/' + feed_ids + '_comment_words.json',
                'r') as sample_data:
            return jsonify(json.load(sample_data))

    # Get number of requested words from query parameter, using default if
    # null
    count = int(request.args.get('count', 1))

    # Connect to database
    session = models.Session()

    # Get highest frequency words used in comments, excluding stop words,
    # filtering by feed_ids if specified
    if feed_ids:
        query = session.execute(
            """
              SELECT *
                FROM ts_stat(
                     $$SELECT word_counts
                         FROM (
                                 SELECT DISTINCT ON (id)
                                        id, feed_id, word_counts
                                   FROM comment
                                        JOIN feed_comment
                                          ON feed_comment.comment_id =
                                             comment.id
                                  WHERE feed_id = ANY(:feed_id)
                               ORDER BY id, word_counts DESC
                         ) comment_table$$
                )
               WHERE LENGTH (word) > 1
            ORDER BY nentry DESC
               LIMIT :count;
            """, {
                'feed_id': feed_ids,
                'count': count
            }).fetchall()

    else:
        query = session.execute(
            """
              SELECT *
                FROM ts_stat(
                     $$SELECT word_counts
                         FROM comment$$
                )
               WHERE LENGTH (word) > 1
            ORDER BY nentry DESC
               LIMIT :count;
            """, {
                'feed_id': feed_ids,
                'count': count
            }).fetchall()

    session.close()

    words = []

    for row in query:
        words.append(dict(row))

    return jsonify(words)
Ejemplo n.º 19
0
async def scrape_page(page, feed_id, loop):
    # Connect to database
    session = models.Session()

    print('Scrape initiated for page ' + str(page) + ' of Hacker News.')

    # Get current UTC time in seconds
    now = int(datetime.utcnow().strftime('%s'))

    # Get HTML tree from feed page
    feed_html = requests.get('https://news.ycombinator.com/news?p=' +
                             str(page))

    feed_content = feed_html.content

    feed_soup = BeautifulSoup(feed_content, 'html.parser')

    # Get all post rows from HTML tree
    post_rows = feed_soup.find_all('tr', 'athing')

    for post_row in post_rows:
        # Get subtext row with additional post data
        subtext_row = post_row.next_sibling

        # Get post id
        post_id = post_row.get('id')

        # Check if post exists in database
        post_exists = session.query(
            models.Post.id).filter_by(id=post_id).scalar()

        # Get core post data if it is not in database already
        if not post_exists:
            # Get UTC timestamp for post's posting time by subtracting the
            # number of days/hours/minutes ago given on the webpage from the
            # current UTC timestamp
            time_unit = subtext_row.find('span', 'age').a.get_text().split()[1]

            if 'day' in time_unit:
                created = now - 86400 * int(
                    subtext_row.find('span', 'age').a.get_text().split()[0])

            elif 'hour' in time_unit:
                created = now - 3600 * int(
                    subtext_row.find('span', 'age').a.get_text().split()[0])

            else:
                created = now - 60 * int(
                    subtext_row.find('span', 'age').a.get_text().split()[0])

            created = time.strftime('%Y-%m-%d %H:%M', time.localtime(created))

            # Get post's link
            link = post_row.find('a', 'storylink').get('href')

            # Get post's title
            title = post_row.find('a', 'storylink').get_text()

            # Set post's type based on title
            if 'Show HN:' in title:
                type = 'show'
            elif 'Ask HN:' in title:
                type = 'ask'
            else:
                type = 'article'

            # Get username of user who posted post or set as blank for job
            # posting
            if subtext_row.find('a', 'hnuser'):
                username = subtext_row.find('a', 'hnuser').get_text()
            else:
                username = ''

            # Get website that post is from or set as blank for ask posting
            if post_row.find('span', 'sitestr'):
                website = post_row.find('span', 'sitestr').get_text()
            else:
                website = ''

            # Add post data to database
            post = models.Post(created=created,
                               id=post_id,
                               link=link,
                               title=title,
                               type=type,
                               username=username,
                               website=website)

            session.add(post)

        # Get post's comment count if it is listed (otherwise, set to 0)
        if 'comment' in subtext_row.find_all(href='item?id=' +
                                             post_id)[-1].get_text():
            unicode_count = UnicodeDammit(
                subtext_row.find_all(href='item?id=' + post_id)[-1].get_text())
            comment_count = unicode_count.unicode_markup.split()[0]
        else:
            comment_count = 0

        # Get post's rank on feed page
        feed_rank = post_row.find('span', 'rank').get_text()[:-1]

        # Get post's score if it is listed (otherwise, post is job posting)
        if subtext_row.find('span', 'score'):
            point_count = subtext_row.find('span',
                                           'score').get_text().split()[0]
        else:
            point_count = 0
            type = 'job'

        # Add feed-based post data to database
        feed_post = models.FeedPost(comment_count=comment_count,
                                    feed_id=feed_id,
                                    feed_rank=feed_rank,
                                    point_count=point_count,
                                    post_id=post_id)

        session.add(feed_post)

        session.commit()

        # Create asynchronous task to scrape post page for its comments
        loop.create_task(scrape_post(post_id, feed_id, loop, None))

    return
Ejemplo n.º 20
0
def initialize_test_database(postgresql, mock_get):
    db_port = postgresql.dsn()['port']
    db_host = postgresql.dsn()['host']
    db_user = postgresql.dsn()['user']
    database = postgresql.dsn()['database']
    os.environ['DB_NAME'] = database

    os.environ['DB_CONNECTION'] = ('postgresql://' + db_user + '@' + db_host +
                                   ':' + str(db_port) + '/' + database)

    models.engine = create_engine(os.environ['DB_CONNECTION'])

    models.Session = sessionmaker(bind=models.engine)

    # Create custom text dictionary and database tables
    management.initialize_database()

    alembicArgs = [
        '--raiseerr',
        'upgrade',
        'head',
    ]

    alembic.config.main(argv=alembicArgs)

    # Run fake Hacker News scrape to get sample feed, post, and comment from
    # past hour
    hacker_news.scrape_loop()

    # Connect to database
    session = models.Session()

    # Add sample feed to database for remaining time periods ('day', 'week',
    # 'all')
    past_day_feed = models.Feed(id=2,
                                created=(datetime.combine(
                                    date.today(),
                                    datetime.min.time())).isoformat())
    session.add(past_day_feed)

    past_week_feed = models.Feed(id=3,
                                 created=(datetime.combine(
                                     date.today() - timedelta(days=5),
                                     datetime.min.time())).isoformat())
    session.add(past_week_feed)

    all_feed = models.Feed(id=4,
                           created=(datetime.combine(
                               date.today() - timedelta(weeks=2),
                               datetime.min.time())).isoformat())
    session.add(all_feed)

    # Add sample post to database for remaining time periods ('day', 'week',
    # 'all')
    posts_file = 'fixtures/test-post.json'
    with open(posts_file, 'r') as post_data:
        post = json.load(post_data)

    past_day_post = models.Post(created=(datetime.combine(
        date.today() - timedelta(hours=20), datetime.min.time())).isoformat(),
                                id=4,
                                link=post['link'],
                                title=post['title'],
                                type='job',
                                username=post['username'],
                                website=post['website'])

    session.add(past_day_post)

    past_week_post = models.Post(created=(datetime.combine(
        date.today() - timedelta(days=5), datetime.min.time())).isoformat(),
                                 id=5,
                                 link=post['link'],
                                 title=post['title'],
                                 type='ask',
                                 username=post['username'],
                                 website=post['website'])

    session.add(past_week_post)

    all_post = models.Post(created=(datetime.combine(
        date.today() - timedelta(weeks=2), datetime.min.time())).isoformat(),
                           id=6,
                           link=post['link'],
                           title=post['title'],
                           type='show',
                           username=post['username'],
                           website=post['website'])

    session.add(all_post)

    # Add sample feed_post data to database for each feed and post
    past_day_feed_post = models.FeedPost(feed_id=past_day_feed.id,
                                         feed_rank=3,
                                         point_count=3,
                                         post_id=past_day_post.id,
                                         comment_count=3)

    session.add(past_day_feed_post)

    past_week_feed_post = models.FeedPost(feed_id=past_week_feed.id,
                                          feed_rank=5,
                                          point_count=5,
                                          post_id=past_week_post.id,
                                          comment_count=5)

    session.add(past_week_feed_post)

    all_feed_post = models.FeedPost(feed_id=all_feed.id,
                                    feed_rank=7,
                                    point_count=7,
                                    post_id=all_post.id,
                                    comment_count=7)

    session.add(all_feed_post)

    # Add sample comment to database for remaining time periods ('day', 'week',
    # 'all')
    comment_file = 'fixtures/test-comment.json'
    with open(comment_file, 'r') as comment_data:
        comment = json.load(comment_data)

    past_day_comment = models.Comment(
        content=comment['content'],
        created=(datetime.combine(date.today() - timedelta(hours=20),
                                  datetime.min.time())).isoformat(),
        id=6,
        level=0,
        parent_comment=None,
        post_id=past_day_post.id,
        total_word_count=len(comment['content'].split()),
        username=comment['username'],
        word_counts=func.to_tsvector('simple_english',
                                     comment['content'].lower()))

    session.add(past_day_comment)

    past_week_comment = models.Comment(
        content=comment['content'],
        created=(datetime.combine(date.today() - timedelta(days=5),
                                  datetime.min.time())).isoformat(),
        id=7,
        level=0,
        parent_comment=None,
        post_id=past_week_post.id,
        total_word_count=len(comment['content'].split()),
        username=comment['username'],
        word_counts=func.to_tsvector('simple_english',
                                     comment['content'].lower()))

    session.add(past_week_comment)

    all_comment = models.Comment(
        content=comment['content'],
        created=(datetime.combine(date.today() - timedelta(weeks=2),
                                  datetime.min.time())).isoformat(),
        id=8,
        level=0,
        parent_comment=None,
        post_id=all_post.id,
        total_word_count=len(comment['content'].split()),
        username=comment['username'],
        word_counts=func.to_tsvector('simple_english',
                                     comment['content'].lower()))

    session.add(all_comment)

    # Add sample feed_comment data to database for each feed and comment
    past_day_feed_comment = models.FeedComment(comment_id=past_day_comment.id,
                                               feed_id=past_day_feed.id,
                                               feed_rank=3)

    session.add(past_day_feed_comment)

    past_week_feed_comment = models.FeedComment(
        comment_id=past_week_comment.id,
        feed_id=past_week_feed.id,
        feed_rank=5)

    session.add(past_week_feed_comment)

    all_feed_comment = models.FeedComment(comment_id=all_comment.id,
                                          feed_id=all_feed.id,
                                          feed_rank=7)

    session.add(all_feed_comment)

    session.commit()

    session.close()