Example 1
from collections import defaultdict
from datetime import datetime
import json

#testPosts_loc, pickle, logger and lines_to_read are module-level names defined
#elsewhere (testPosts_loc and pickle come from v1.config_and_pickle, as in the
#later examples; lines_to_read is a config value, with -1 meaning "read all lines")
def populate_data_structures():
    logger.info('started populating data structures...')

    #key = blog
    #value = list of (post_id, author, tags, categories, date_struct) tuples
    test_blog_post_tuples_map = defaultdict(list)

    with open(testPosts_loc, 'r') as f:
        for line_number, line_text in enumerate(f):
            if lines_to_read != -1 and line_number >= lines_to_read:
                break

            blog_json = json.loads(line_text)

            blog        = blog_json['blog']
            post_id     = blog_json['post_id']
            author      = blog_json['author']
            tags        = blog_json['tags']
            categories  = blog_json['categories']
            date_struct = datetime.strptime(blog_json['date_gmt'], '%Y-%m-%d %H:%M:%S')

            test_blog_post_tuples_map[blog].append((post_id, author, tags, categories, date_struct))

    logger.info('finished populating data structures')

    pickle(test_blog_post_tuples_map, 'test_blog_post_tuples_map')
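
#The pickle(...) helper used above lives in v1.config_and_pickle and is not
#shown in these examples; a minimal sketch of what it might look like, assuming
#a pickles_loc output directory (a hypothetical name, not confirmed by the source):
import os
import pickle as _pickle

pickles_loc = 'pickles'  #assumed output directory (hypothetical)

def pickle(obj, name):
    #serialize obj to <pickles_loc>/<name>.pkl with the highest protocol
    with open(os.path.join(pickles_loc, name + '.pkl'), 'wb') as f:
        _pickle.dump(obj, f, protocol=_pickle.HIGHEST_PROTOCOL)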
Example 2
from collections import defaultdict
import json
import logging
from gensim.models.ldamodel import LdaModel
from v1.config_and_pickle import testPosts_loc, MyFilesIterator, trainPosts_loc, MyCorpus, build_word_id_map, normalize_content_stats, topic_count, pickle, trainPostsThin_loc

################################################################################################################################################
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logger = logging.getLogger('LDA_model_builder')
################################################################################################################################################
logger.info('building word_id_map...')
word_id_map = build_word_id_map([trainPosts_loc, testPosts_loc])
pickle(word_id_map, 'word_id_map')
normalize_content_stats()
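
#build_word_id_map and normalize_content_stats also come from
#v1.config_and_pickle and are not shown; a plausible sketch of build_word_id_map
#using gensim's Dictionary, assuming MyFilesIterator yields one token list per
#post (an assumption, not the confirmed implementation):
from gensim.corpora import Dictionary

def build_word_id_map_sketch(file_locs):
    #one pass over every post in the given JSON files builds the id <-> word
    #mapping that LdaModel needs
    word_id_map = Dictionary(MyFilesIterator(file_locs))
    #drop very rare and very common tokens to keep the vocabulary manageable
    word_id_map.filter_extremes(no_below=5, no_above=0.5)
    return word_id_map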

train_and_test_corpus = MyCorpus([trainPosts_loc, testPosts_loc], word_id_map)
logger.info('training LDA model...')
#id2word is a mapping from word ids (integers) to words (strings). It is used to determine the vocabulary size, as well as for debugging and topic printing.
lda = LdaModel(train_and_test_corpus,
               id2word=word_id_map,
               num_topics=topic_count,
               update_every=1,
               chunksize=10000,
               passes=1)
pickle(lda, 'lda')

#Print the 'topn' most probable words for (randomly selected) 'topics' number of topics. Set topics=-1 to print all topics.
lda.show_topics(topics=topic_count, topn=10)
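
#MyCorpus (imported above) is not shown either; a hypothetical sketch of a
#streaming gensim corpus like it, assuming it wraps MyFilesIterator: the corpus
#re-reads the files on every pass and yields one bag-of-words vector per post,
#so the full corpus never has to fit in memory at once.
class MyCorpusSketch(object):
    def __init__(self, file_locs, word_id_map):
        self.file_locs = file_locs
        self.word_id_map = word_id_map

    def __iter__(self):
        for tokens in MyFilesIterator(self.file_locs):
            yield self.word_id_map.doc2bow(tokens)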
################################################################################################################################################
#key = blog + '_' + post_id
#value = a list of (topic_id, topic_probability) 2-tuples
blog_topic_distribution_map = {}

#key = uid (user id)
#value = list of (blog, post_id) tuples
train_user_likes_map = defaultdict(list)
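
#Sketch only: how blog_topic_distribution_map could be filled from the trained
#model. post_keys is a hypothetical list of (blog, post_id) pairs aligned with
#the corpus order; the real population code is not part of this excerpt.
for (blog, post_id), bow in zip(post_keys, train_and_test_corpus):
    #lda[bow] returns the post's (topic_id, topic_probability) 2-tuples
    blog_topic_distribution_map[blog + '_' + post_id] = lda[bow]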
Example 3
from collections import defaultdict
from datetime import datetime
import json

#trainPosts_loc, wk_5_start_date, lines_to_read, logger, pickle,
#increment_by_one and get_pickle_file_suffix are module-level names defined
#elsewhere (cf. the imports in Example 2)
def populate_data_structures(populate_for_first_four_weeks):
    #populate data structures required for building training CSV
    logger.info('started populating data structures...')

    #key = blog
    #value = number of posts
    blog_post_count_map = defaultdict(int)

    #key = blog_author
    #value = number of posts
    blog_author__post_count_map = defaultdict(int)

    #key   = uid (user id)
    #value = list of (blog, post_id, author) tuples
    user_likes_map = defaultdict(list)

    #key   = uid (user id)
    #value = set of blogs liked by this user
    user_liked_blogs_map = defaultdict(set)

    #key   = blog
    #value = set of users (uid) who have liked at least one post from the blog 
    blog_liked_users_map = defaultdict(set)
    
    #key = tag
    #value = dict:
    #             key = blog
    #             value = count [# of posts in this blog for this tag]
    #a defaultdict(lambda : defaultdict(int)) would be more natural here (and
    #below), but lambda factories cannot be pickled, so plain dicts are used
    #together with the increment_by_one helper instead
    tag_blog_count_map = {}

    #key = category
    #value = dict:
    #             key = blog
    #             value = count [# of posts in this blog for this category]
    #category_blog_count_map = defaultdict(lambda : defaultdict(int))
    category_blog_count_map = {}

    #key = tag
    #value = dict:
    #             key = user [uid]
    #             value = count [# of posts for this tag that this user has liked]
    #tag_user_count_map = defaultdict(lambda : defaultdict(int))
    tag_user_count_map = {}

    #key = category
    #value = dict:
    #             key = user [uid]
    #             value = count [# of posts for this category that this user has liked]
    #category_user_count_map = defaultdict(lambda : defaultdict(int))
    category_user_count_map = {}

    with open(trainPosts_loc, 'r') as f:
        for line_number, line_text in enumerate(f):
            if lines_to_read != -1 and line_number >= lines_to_read:
                break

            blog_json = json.loads(line_text)

            blog        = blog_json['blog']
            post_id     = blog_json['post_id']
            author      = blog_json['author']
            tags        = blog_json['tags']
            categories  = blog_json['categories']
            date_struct = datetime.strptime(blog_json['date_gmt'], '%Y-%m-%d %H:%M:%S')
            date_string = date_struct.strftime('%Y-%m-%d')

            #ISO-formatted date strings compare chronologically, so a plain
            #string comparison against wk_5_start_date is safe here
            if populate_for_first_four_weeks and date_string >= wk_5_start_date:
                continue

            blog_post_count_map[blog] += 1
            blog_author__post_count_map[blog + '_' + author] += 1

            for tag in tags:
                #tag_blog_count_map[tag][blog] += 1
                increment_by_one(tag_blog_count_map, tag, blog)

            for category in categories:
                #category_blog_count_map[category][blog] += 1
                increment_by_one(category_blog_count_map, category, blog)

            for like in blog_json['likes']:
                uid = like['uid']
                user_likes_map[uid].append((blog, post_id, author))
                user_liked_blogs_map[uid].add(blog)
                blog_liked_users_map[blog].add(uid)
                for tag in tags:
                    #tag_user_count_map[tag][uid] += 1
                    increment_by_one(tag_user_count_map, tag, uid)
                for category in categories:
                    #category_user_count_map[category][uid] += 1
                    increment_by_one(category_user_count_map, category, uid)

    logger.info('finished populating data structures')

    suffix = get_pickle_file_suffix(populate_for_first_four_weeks)
    pickle(blog_post_count_map, 'blog_post_count_map' + suffix)
    pickle(blog_author__post_count_map, 'blog_author__post_count_map' + suffix)
    pickle(user_likes_map, 'user_likes_map' + suffix)
    pickle(user_liked_blogs_map, 'user_liked_blogs_map' + suffix)
    pickle(blog_liked_users_map, 'blog_liked_users_map' + suffix)
    pickle(tag_blog_count_map, 'tag_blog_count_map' + suffix)
    pickle(category_blog_count_map, 'category_blog_count_map' + suffix)
    pickle(tag_user_count_map, 'tag_user_count_map' + suffix)
    pickle(category_user_count_map, 'category_user_count_map' + suffix)
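
#increment_by_one and get_pickle_file_suffix are helpers defined elsewhere in
#this module; plausible sketches (assumptions, not the confirmed code):

def increment_by_one(outer_map, outer_key, inner_key):
    #emulate defaultdict(lambda : defaultdict(int)) with picklable plain dicts
    inner_map = outer_map.setdefault(outer_key, {})
    inner_map[inner_key] = inner_map.get(inner_key, 0) + 1

def get_pickle_file_suffix(populate_for_first_four_weeks):
    #tell pickles built from only the first four weeks of data apart from
    #those built from the full window; the exact suffix string is a guess
    return '_first_four_weeks' if populate_for_first_four_weeks else ''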