Ejemplo n.º 1
0
def mean_headline(x):
    """Average the word vectors of an article's headline and lead paragraph.

    Args:
        x: Article record supporting ``in`` and item access. The code reads
            ``x['headline']['main']`` and ``x['lead_paragraph']``.

    Returns:
        ``None`` when the headline text or the lead paragraph is missing or
        falsy, or when the headline yields fewer than five tokens. Otherwise
        the result of ``np.nanmean(..., axis=0)`` over ``text_nltk.vectors``
        of every headline and lead-paragraph token (presumably a single
        vector of the embedding's dimensionality -- confirm against
        ``text_nltk.vectors``).
    """
    # Guard: a usable headline needs a truthy x['headline']['main'].
    headline_ok = ('headline' in x
                   and 'main' in x['headline']
                   and x['headline']['main'])
    if not headline_ok:
        return None

    # Guard: the lead paragraph must be present and truthy as well.
    lead_ok = 'lead_paragraph' in x and x['lead_paragraph']
    if not lead_ok:
        return None

    tokens = text_nltk.lemma_tokenize(x['headline']['main'])
    # The length check applies to the headline tokens only, before the
    # lead-paragraph tokens are appended.
    if len(tokens) < 5:
        return None

    tokens += text_nltk.lemma_tokenize(x['lead_paragraph'])
    token_vectors = [text_nltk.vectors(token) for token in tokens]
    # nanmean skips NaN entries when averaging along axis 0.
    return np.nanmean(token_vectors, axis=0)
Ejemplo n.º 2
0
def top_articles(source, emotion):
    """Return up to 36 formatted articles from ``source`` ranked by ``emotion``.

    Args:
        source: Key into the module-level ``article_vector`` mapping; the
            selected value is presumably a PySpark RDD of keyed article
            vectors -- confirm against where ``article_vector`` is built.
        emotion: Value passed to ``text_nltk.vectors``; each record is scored
            with ``np.dot`` of that vector and the record's own vector.

    Returns:
        The result of ``.take(36)`` after every record has been passed
        through ``result_format``.
    """
    # Score each record against the emotion vector. `_fv` is a helper defined
    # elsewhere (presumably it applies the function to the value of a pair).
    scored = article_vector[source].map(
        _fv(lambda vec: np.dot(text_nltk.vectors(emotion), vec)))

    # Re-key using element [1] of what `fk_` hands to the lambda.
    rekeyed = scored.map(fk_(lambda pair: pair[1]))

    # After `swap`, sort by key with ascending=False (highest key first,
    # assuming the PySpark `sortByKey` signature).
    ranked = rekeyed.map(swap).sortByKey(False)

    return ranked.map(result_format).take(36)
Ejemplo n.º 3
0
    print happiness
    assert happiness[0] >= happiness[
        1], 'pleased < delighted: %f %f' % happiness
    '''
    print test \
        .flatMap(k_skip(lambda x: x['query'] if 'query' in x else None))   \
        .flatMap(_fv_skip(mean_headline))  \
        .map(_fv(empathy))                 \
        .map(_fv(lambda x: x['happy']))    \
        .collect() #map(k_(empathy)).map(k_(max_emotion)).collect()
    '''
    articles = test.flatMap(
        add_skip(lambda x: (x['query'], x['pub_date']) if 'query' in x else None))
    scores   = articles   \
        .flatMap(_fv_skip(mean_headline))         \
        .map(_fv(lambda x: np.dot(x, text_nltk.vectors('happy'))))
    join     = articles   \
        .join(scores)     \
        .map(fv(lambda x: dict_kv(x[0], 'score', x[1]))) \
        .map(add_(lambda x: x['score'])).sortByKey(True).map(v).collect()
    assert join[0][
        'query'] == 'John Biggs', 'John Biggs is not the least happy after join.'

    print >>sys.stderr, 'TEST OK'

'''
    .join(test2)                           \
    .map(v(lambda x: dict_kv(x[1], 'score', x[0])))                 \
    .map(lambda x: dict_kv(x, 'image', '/static/images/%s.png' % x['query'])) \
'''