Example #1
def main(options):
    # Initialize database
    session = init_DB(options.dbfile)

    # Query submissions
    sub_query = session.query(commentDB.Submission)

    # Limit submissions
    if options.N_subs > 0:
        sub_query = sub_query.limit(options.N_subs)

    # Retrieve comments, lazily
    comment_gen = lazy_chain( (s.comments for s in sub_query) )
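    # lazy_chain is defined elsewhere in this module; as an assumption, it
    # behaves like itertools.chain.from_iterable, i.e. roughly:
    #   def lazy_chain(iterables):
    #       return itertools.chain.from_iterable(iterables)
    # so comments are pulled from the database one submission at a time.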


    ########################
    # Generate Featuresets #
    # (load from database) #
    ########################
    t0 = time.time()
    n_subs = sub_query.count()  # hoisted so the COUNT query runs only once
    print "== Loading comments for %d submissions ==" % n_subs
    featureSets = []

    t1 = time.time()
    counter = 0
    printevery = max(500, min(4000, int(0.8 * n_subs)))  # progress-print interval
    for c in comment_gen:
        fs = features.FeatureSet(c, user=c.user, parent=c.submission)
        featureSets.append(fs)

        # Progress indicator
        counter += 1
        if counter % printevery == 0:
            temp = t1
            t1 = time.time()
            print "  last %d: %.02f s (%d loaded)" % (printevery, (t1 - temp), counter)

    # featureSets = [features.FeatureSet(c, user=c.user, parent=c.submission) for c in comment_gen]
    print "  [loaded %d comments in %.02g seconds]" % (len(featureSets), time.time() - t0)

    ##############
    # Build VSMs #
    ##############
    vsmTag_global = "_global"
    t0 = time.time()
    print "== Generating global VSM =="
    vsm_global = construct_VSM(featureSets, vsmTag=vsmTag_global)
    print "  [completed in %.02g seconds]" % (time.time() - t0)

    # Build per-thread VSMs
    t0 = time.time()
    print "== Generating per-thread VSMs =="
    build_thread_VSMs(vsm_global)
    print "  [completed in %.02g seconds]" % (time.time() - t0)


    ############################
    # Process General Features #
    ############################
    t0 = time.time()
    t1 = time.time()
    counter = 0
    printevery = max(1, len(featureSets) / 10)  # guard against modulo-by-zero on small datasets
    print "== Extracting general features: %d total comments ==" % len(featureSets)
    for f in featureSets:
        calcGeneralFeatures(f, options, vsmTag_global=vsmTag_global)
        # calcLocalFeatures(f, options)

        # Progress indicator
        counter += 1
        if counter % printevery == 0:
            temp = t1
            t1 = time.time()
            print "  -> last %d: %.02f s (%.01f%% done)" % (printevery, (t1 - temp), counter*100.0/len(featureSets))

    dt = time.time() - t0
    print "  [completed %d in %.02f s]" % (counter, dt)
    print "  (%d ms per comment)" % ((dt * 1000)/counter)

    ##########################
    # Process Local Features #
    ##########################
    t0 = time.time()
    t1 = time.time()
    counter = 0
    printevery = len(featureSets) / 10
    if options.f_pos: printevery /= 10 # much slower!
    printevery = max(100, printevery) # avoid dumping too much text :)
    print "== Extracting local features: %d total comments ==" % len(featureSets)
    for f in featureSets:
        # calcGeneralFeatures(f, options, vsmTag_global=vsmTag_global)
        calcLocalFeatures(f, options)

        # Progress indicator
        counter += 1
        if counter % printevery == 0:
            temp = t1
            t1 = time.time()
            print "  -> last %d: %.02f s (%.01f%% done)" % (printevery, (t1 - temp), counter*100.0/len(featureSets))

    dt = time.time() - t0
    print "  [completed %d in %.02f s]" % (counter, dt)
    print "  (%d ms per comment)" % ((dt * 1000)/counter)


    #########################
    # Convert and Save Data #
    #########################

    # Convert to DataFrame
    print "== Converting to DataFrame =="
    df = features.fs_to_DataFrame(featureSets)
    df = features.derive_features(df)

    # Convert all unicode to ASCII strings before saving to HDF5
    cols_unicode = ['self_id', 'parent_id', 'distinguished']
    for name in cols_unicode:
        df[name] = map(str, df[name])
    # df['self_id'] = map(str, df['self_id'])
    # df['parent_id'] = map(str, df['parent_id'])

    # Convert all boolean to float (0.0, 1.0), NaN if missing
    cols_boolean = ['is_mod', 'is_gold', 'has_verified_email']
    for name in cols_boolean:
        df[name] = map(float, df[name])
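    # Note: these map() calls rely on Python 2, where map returns a list;
    # under Python 3 this would need list(map(...)) or df[name].astype(...).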

    # For now, everything in featureSets is a comment,
    # and all parents are submissions
    # rename columns to keep Sammy happy
    df['cid'] = df['self_id']
    df['sid'] = df['parent_id']

    # Save data to HDF5
    print "== Exporting to HDF5 =="
    df.to_hdf(options.savename, "data")
    print "  [saved as %s]" % options.savename
Example #2
        return pandas.DataFrame({
            'user_id': groupdf.user_id,
            'post_id': groupdf.post_id,
            'distance': distance
        })

    distance = cand_groups.apply(process_group)

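    # Re-key the flat result as a Series indexed by (user_id, post_id) pairs
    # so it aligns with the other per-user/per-post features.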
    distance = pandas.Series(distance.distance,
                             index=pandas.MultiIndex.from_arrays(
                                 [distance.user_id, distance.post_id]))

    return Struct(post_tz=post_tz.apply_name("time_zone"),
                  user_tz=user_tz.apply_name("mean_stim_like_time_zone"),
                  distance_tz=distance)


def build_func(feature, data):

    if feature.key == "post.time_zone":
        return data.post_tz
    elif feature.key == "user.mean_stim_like_time_zone":
        return data.user_tz
    elif feature.key == "user_post.tz_proximity_to_stim_likes":
        return data.distance_tz


fset = ft.FeatureSet("time_zone", feature_list, setup_func, build_func)
fset.save(ft.dev_store, overwrite=True)
fset.save(ft.prod_store, overwrite=True)
Example #3
            return group_df

        r = groups.apply(f)
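        # groupby.apply prepends the group key as an outer index level;
        # drop it to restore the original index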
        r.index = r.index.droplevel(0)
        return r

    # function to calculate blog rank from group of users
    def process_user_chunk(user_chunk):
        prob_df = get_user_blog_prob(user_chunk)
        final_data = filter_and_add_data(prob_df, 1000)
        return final_data

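    # chunk_apply is a custom helper (not shown here); presumably it splits
    # stim_users into chunks of 50 users, applies process_user_chunk to each,
    # and concatenates the results (the True flag's meaning is not shown).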
    result = stim_users.chunk_apply(process_user_chunk, 50, True)
    return result


# build function to calculate/retrieve feature from setup data
def build_func(feature, data):

    if feature.name == "pagerank_by_like_share_postrank":
        return data['post_rank']
    elif feature.name == "pagerank_by_like_share_blogprob":
        return data['prob']


# define fset: feature set template
fset = ft.FeatureSet("period_data", feature_list, setup_func, build_func)
fset.save(ft.prod_store, overwrite=True)
fset.save(ft.dev_store, overwrite=True)
Example #4
        postdf['week'] = postdf.date.apply(to_week)
        retval["post.week"] = postdf

        postdf['weekday'] = postdf.date.apply(lambda x: x.weekday())
        retval["post.weekday"] = postdf

    if "user_post.like_week" in features_left:

        try:
            likedf = retval["user_post.like_date"]
        except KeyError:
            likedf = store.user_post.load_df(['like_date']).dropna()

        likedf['like_week'] = likedf.like_date.apply(to_week)
        retval["user_post.like_week"] = likedf

    return retval


# build function to calculate/retrieve feature from setup data
def build_func(feature, data):

    source_df = data[feature.key]
    return source_df[feature.name]


# define fset: feature set template
fset = ft.FeatureSet("source_data", feature_list, setup_func, build_func)
fset.save(ft.prod_store, overwrite=True)
fset.save(ft.dev_store, overwrite=True)
Example #5
				return_df = pandas.DataFrame(map(f, cand_posts.iteritems()),
											columns = ["mean_rho", "max_rho", "avgvec_rho"], index = cand_posts)
				
				return_df = return_df.sort("mean_rho", ascending = False)
				return_df['mean_rho_rank'] = range(1, len(return_df) + 1)
			
			prog.n = prog.n + 1
			# progress line: timestamp, done/total, user, rows, projected finish time
			eta = prog.start + prog.nmax * (datetime.now() - prog.start) / prog.n
			print nowstring(), str(prog.n) + "/" + str(prog.nmax), user_id, len(return_df), eta.strftime("%Y-%m-%d %H:%M:%S")
			return return_df
		return_value = resp_users.apply(evaluate_user_group)
		
		return_value.index.names = ['user_id', 'post_id']					
		pandas.HDFStore(store.path + "tmptopicdata.h5")["result"] = return_value  # cache the intermediate result
	return return_value
	
# build function to calculate/retrieve feature from setup data
def build_func(feature, data):

	if feature.name == "topic_proximity_rank":
		return data['mean_rho_rank']
	elif feature.name == "topic_proximity_mean":
		return data['mean_rho']
	elif feature.name == "topic_proximity_max":
		return data['max_rho']
	
fset = ft.FeatureSet("all_data", feature_list, setup_func, build_func)
fset.save(ft.dev_store, overwrite = True)
fset.save(ft.prod_store, overwrite = True)
Example #6
def __init__(self, reactor):
    super(ServerConnection, self).__init__(reactor)
    # each connection tracks its own set of server-advertised features
    self.features = features.FeatureSet()
Example #7
    # Save blog data
    blog_df = store.blog.load_df(store.blog.keys())

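    # A blog counts as stim/resp if any of its posts is; the boolean Series
    # (indexed by blog_id) aligns on blog_df's index, leaving NaN for blogs
    # with no matching posts, hence the fillna(False).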
    stim_posts = post_df.groupby('blog_id')['is_stim'].agg(sum)
    blog_df['is_stim'] = stim_posts > 0
    blog_df.is_stim = blog_df.is_stim.fillna(False)

    resp_posts = post_df.groupby('blog_id')['is_resp'].agg(sum)
    blog_df['is_resp'] = resp_posts > 0
    blog_df.is_resp = blog_df.is_resp.fillna(False)

    return {
        'user': user_df,
        'blog': blog_df,
        'post': post_df,
        'user_post': like_df
    }


# build function to calculate/retrieve feature from setup data
def build_func(feature, data):

    source_df = data[feature.scope]
    return source_df[feature.name]


# define fset: feature set template
fset = ft.FeatureSet("dev_stim_resp", feature_list, setup_func, build_func)
fset.save(ft.dev_store, overwrite=True)
Example #8
	
	if feature.key == "user_blog.lang_proximity":
		s = pandas.Series(data.scores)
		s.index = pandas.MultiIndex.from_tuples(s.index, names = ["user_id", "blog_id"])
		return s
	elif feature.key == "user.is_english":
		s = pandas.Series(data.isusereng)
		s.index.names = ["user_id"]
		return s
	elif feature.key == "blog.is_english":
		print feature.key
		s = pandas.Series(data.isblogeng)
		s.index.names = ["blog_id"]
		return s
	
fset = ft.FeatureSet("language", feature_list, setup_func, build_func)
fset.save(ft.dev_store, overwrite = True)
fset.save(ft.prod_store, overwrite = True)



# code used to get the top languages used (some judgement applied)
# langdict = LangIDDict()
# df = pandas.DataFrame([(key, score, score if n == 0 else 0)
#                        for vec in user_vecs.itervalues() if len(vec) > 0
#                        for n, (key, score) in enumerate(vec[:5])],
#                       columns = ["langid", "score", "top_score"])
# df = df.groupby(["langid"]).aggregate(sum)
# df = df.sort("top_score", ascending = False)
# df['lang_name'] = map(lambda k: langdict[k], df.index)
# df