Code example #1
File: classify.py  Project: jonbaer/hackernews
def get_unclassified_posts(posts_chunk, unclassified_hn_posts, chunk_number):
    for i, postId in enumerate(posts_chunk):
        ranking = chunk_number * 20 + (i + 1)
        # Check if post was already classified
        old_post = posts.find_one({'id': postId})
        new_post = get_hn_post(postId)
        if not old_post:
            if new_post and 'url' in new_post:
                text = get_link_content(new_post['url'])
                if text:
                    print(new_post['url'])
                    time = datetime.datetime.fromtimestamp(
                        int(new_post['time']))
                    post_data = {
                        'id': postId,
                        'url': new_post['url'],
                        'title': new_post['title'],
                        'text': text,
                        'time': time,
                        'score': new_post['score'],
                        'username': new_post['by'],
                        'ranking': ranking
                    }
                    if 'descendants' in new_post:
                        post_data['comments'] = new_post['descendants']

                    unclassified_hn_posts.append(post_data)
        else:
            update_post(old_post, new_post, ranking)
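
The ranking formula above (chunk_number * 20 + i + 1) implies the caller walks the top-story IDs in chunks of 20. A minimal driver sketch under that assumption; collect_unclassified_posts and its parameters are hypothetical names for illustration, not part of the project:

def collect_unclassified_posts(top_posts_ids, chunk_size=20):
    # Accumulate post data across chunks; get_unclassified_posts appends
    # only posts that are new and have fetchable link content.
    unclassified_hn_posts = []
    for chunk_number, start in enumerate(range(0, len(top_posts_ids), chunk_size)):
        posts_chunk = top_posts_ids[start:start + chunk_size]
        get_unclassified_posts(posts_chunk, unclassified_hn_posts, chunk_number)
    return unclassified_hn_posts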
Code example #2
File: get_samples.py  Project: monkeylearn/hackernews
def get_subreddit_samples(category, subreddit, posts_chunk, samples):
    # Get sample from each post's url
    for post in posts_chunk:
        print(subreddit + ': ' + post.url)
        # If it's a self post, use its selftext as the content
        if post.is_self:
            content = post.selftext
        else:
            content = get_link_content(post.url)
        if content:
            samples.append({'text': content, 'label': category})
Code example #3
File: get_samples.py  Project: leinadlime/hackernews
def get_subreddit_samples(category, subreddit, posts_chunk, samples):
    # Get sample from each post's url
    for post in posts_chunk:
        print(subreddit + ': ' + post.url)
        # If it's a self post, use its selftext as the content
        if post.is_self:
            content = post.selftext
        else:
            content = get_link_content(post.url)
        if content:
            samples.append({
                'text': content,
                'label': category
            })
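
In both variants the post objects expose is_self, selftext and url, which matches PRAW's Submission interface. A hedged usage sketch under that assumption; the client credentials and the category/subreddit pairs are placeholders, not taken from either project:

import praw

reddit = praw.Reddit(client_id='...', client_secret='...',
                     user_agent='hn-sampler')

samples = []
for category, subreddit in [('programming', 'programming'),
                            ('science', 'science')]:
    # Take a chunk of current hot submissions from each subreddit
    posts_chunk = list(reddit.subreddit(subreddit).hot(limit=20))
    get_subreddit_samples(category, subreddit, posts_chunk, samples)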
Code example #4
File: classify.py  Project: monkeylearn/hackernews
def classify_top_posts(max_posts=None, logger=None):
    top_posts_ids = firebase.get('/v0/topstories', None)

    new_posts = []

    for i, post_id in enumerate(top_posts_ids):
        ranking = i + 1
        post = get_hn_post(post_id)
        cached_post = db.posts.find_one({'id': post_id})

        if logger:
            logger.debug(u'#{} Processing {} ("{}")'.format(ranking, post['id'], post['title']))

        if cached_post:
            if logger:
                logger.debug('----> Already classified, updating...')
            update_post(post, cached_post, ranking)
        else:
            if post and 'url' in post:
                text = get_link_content(post['url'])
                post_data = {
                    'id': post_id,
                    'url': post['url'],
                    'title': post['title'],
                    'text': text,
                    'time': datetime.datetime.fromtimestamp(int(post['time'])),
                    'score': post['score'],
                    'username': post['by'],
                    'ranking': ranking
                }
                if 'descendants' in post:
                    post_data['comments'] = post['descendants']

                if text and text.strip():
                    # Has good text, queue for classification...
                    if logger:
                        logger.debug('----> Queuing for classification...')
                    new_posts.append(post_data)
                else:
                    if logger:
                        logger.debug('----> Unclassifiable, inserting as random...')
                    post_data['result'] = {
                        'label': 'random',
                        'probability': '--'
                    }
                    post_data['original_result'] = None
                    db.posts.insert(post_data)
                    if max_posts and ranking >= max_posts:
                        break

    # Classify posts
    if new_posts:
        if logger:
            logger.debug("Classifying {} queued posts with MonkeyLearn".format(len(new_posts)))
        ml = MonkeyLearn(MONKEYLEARN_TOKEN)
        result = ml.classifiers.classify(
            MONKEYLEARN_MODULE_ID,
            (p['text'] for p in new_posts)
        ).result

        # Add classification data to new posts and save to db
        for i, post in enumerate(new_posts):
            if result[i][0]['probability'] > 0.5:
                post['result'] = result[i][0]
            else:
                post['result'] = {
                    'label': 'random',
                    'probability': '--'
                }
            post['original_result'] = result[i][0]

            if logger:
                logger.debug('{} {}'.format(post['ranking'], post['title']))
            db.posts.insert(post)

    # Delete cached posts that are no longer in the top stories list
    db.posts.delete_many({'id': {'$nin': top_posts_ids}})
Code example #5
File: classify.py  Project: monkeylearn/hackernews
def classify_top_posts(max_posts=None, logger=None):
    top_posts_ids = firebase.get('/v0/topstories', None)

    new_posts = []

    for i, post_id in enumerate(top_posts_ids):
        ranking = i + 1
        post = get_hn_post(post_id)
        cached_post = db.posts.find_one({'id': post_id})

        if logger:
            logger.debug(u'#{} Processing {} ("{}")'.format(
                ranking, post['id'], post['title']))

        if cached_post:
            if logger:
                logger.debug('----> Already classified, updating...')
            update_post(post, cached_post, ranking)
        else:
            if post and 'url' in post:
                text = get_link_content(post['url'])
                post_data = {
                    'id': post_id,
                    'url': post['url'],
                    'title': post['title'],
                    'text': text,
                    'time': datetime.datetime.fromtimestamp(int(post['time'])),
                    'score': post['score'],
                    'username': post['by'],
                    'ranking': ranking
                }
                if 'descendants' in post:
                    post_data['comments'] = post['descendants']

                if text and text.strip():
                    # Has good text, queue for classification...
                    if logger:
                        logger.debug('----> Queuing for classification...')
                    new_posts.append(post_data)
                else:
                    if logger:
                        logger.debug(
                            '----> Unclassifiable, inserting as random...')
                    post_data['result'] = {
                        'label': 'random',
                        'probability': '--'
                    }
                    post_data['original_result'] = None
                    db.posts.insert(post_data)
                    if max_posts and ranking >= max_posts:
                        break

    # Classify posts
    if new_posts:
        if logger:
            logger.debug("Classifying {} queued posts with MonkeyLearn".format(
                len(new_posts)))
        ml = MonkeyLearn(MONKEYLEARN_TOKEN)
        result = ml.classifiers.classify(MONKEYLEARN_MODULE_ID,
                                         (p['text'] for p in new_posts)).result

        # Add classification data to new posts and save to db
        for i, post in enumerate(new_posts):
            if result[i][0]['probability'] > 0.5:
                post['result'] = result[i][0]
            else:
                post['result'] = {'label': 'random', 'probability': '--'}
            post['original_result'] = result[i][0]

            if logger:
                logger.debug('{} {}'.format(post['ranking'], post['title']))
            db.posts.insert(post)

    # Delete cached posts that are no longer in the top stories list
    db.posts.delete_many({'id': {'$nin': top_posts_ids}})
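
A minimal invocation sketch, assuming the function is run as a periodic job; the logging setup and the max_posts value are illustrative, not part of the original module:

import logging

if __name__ == '__main__':
    # Emit the per-post debug messages that classify_top_posts logs
    logging.basicConfig(level=logging.DEBUG)
    classify_top_posts(max_posts=100, logger=logging.getLogger('hn-classifier'))

Posts whose top MonkeyLearn label scores 0.5 or below are stored with the fallback 'random' label, so the sketch above only controls how many posts are processed, not how they are classified.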