def main(options):
    # Initialize database
    session = init_DB(options.dbfile)

    # Query submissions
    sub_query = session.query(commentDB.Submission)

    # Limit submissions
    if options.N_subs > 0:
        sub_query = sub_query.limit(options.N_subs)

    # Retrieve comments, lazily
    comment_gen = lazy_chain((s.comments for s in sub_query))

    ########################
    # Generate Featuresets #
    # (load from database) #
    ########################
    t0 = time.time()
    print "== Loading comments for %d submissions ==" % sub_query.count()
    featureSets = []
    t1 = time.time()
    counter = 0
    printevery = max(500, min(4000, int(0.8 * sub_query.count())))
    for c in comment_gen:
        fs = features.FeatureSet(c, user=c.user, parent=c.submission)
        featureSets.append(fs)

        # Progress indicator
        counter += 1
        if counter % printevery == 0:
            temp = t1
            t1 = time.time()
            print "  last %d: %.02f s (%d loaded)" % (printevery, (t1 - temp), counter)
    print "  [loaded %d comments in %.02g seconds]" % (len(featureSets), time.time() - t0)

    ##############
    # Build VSMs #
    ##############
    vsmTag_global = "_global"
    t0 = time.time()
    print "== Generating global VSM =="
    vsm_global = construct_VSM(featureSets, vsmTag=vsmTag_global)
    print "  [completed in %.02g seconds]" % (time.time() - t0)

    # Build per-thread VSMs
    t0 = time.time()
    print "== Generating per-thread VSMs =="
    build_thread_VSMs(vsm_global)
    print "  [completed in %.02g seconds]" % (time.time() - t0)

    ############################
    # Process General Features #
    ############################
    t0 = time.time()
    t1 = time.time()
    counter = 0
    printevery = max(1, len(featureSets) / 10)  # guard against modulo-by-zero on tiny datasets
    print "== Extracting general features: %d total comments ==" % len(featureSets)
    for f in featureSets:
        calcGeneralFeatures(f, options, vsmTag_global=vsmTag_global)

        # Progress indicator
        counter += 1
        if counter % printevery == 0:
            temp = t1
            t1 = time.time()
            print "  -> last %d: %.02f s (%.01f%% done)" % (printevery, (t1 - temp),
                                                            counter * 100.0 / len(featureSets))
    dt = time.time() - t0
    print "  [completed %d in %.02f s]" % (counter, dt)
    print "  (%d ms per comment)" % ((dt * 1000) / counter)

    ##########################
    # Process Local Features #
    ##########################
    t0 = time.time()
    t1 = time.time()
    counter = 0
    printevery = len(featureSets) / 10
    if options.f_pos:
        printevery /= 10  # much slower!
    printevery = max(100, printevery)  # avoid dumping too much text :)
    print "== Extracting local features: %d total comments ==" % len(featureSets)
    for f in featureSets:
        calcLocalFeatures(f, options)

        # Progress indicator
        counter += 1
        if counter % printevery == 0:
            temp = t1
            t1 = time.time()
            print "  -> last %d: %.02f s (%.01f%% done)" % (printevery, (t1 - temp),
                                                            counter * 100.0 / len(featureSets))
    dt = time.time() - t0
    print "  [completed %d in %.02f s]" % (counter, dt)
    print "  (%d ms per comment)" % ((dt * 1000) / counter)

    #########################
    # Convert and Save Data #
    #########################
    # Convert to DataFrame
    print "== Converting to DataFrame =="
    df = features.fs_to_DataFrame(featureSets)
    df = features.derive_features(df)

    # Convert all unicode to ASCII strings before saving to HDF5
    cols_unicode = ['self_id', 'parent_id', 'distinguished']
    for name in cols_unicode:
        df[name] = map(str, df[name])

    # Convert all boolean to float (0.0, 1.0), NaN if missing
    cols_boolean = ['is_mod', 'is_gold', 'has_verified_email']
    for name in cols_boolean:
        df[name] = map(float, df[name])

    # For now, everything in featureSets is a comment and all parents are
    # submissions; rename columns to keep Sammy happy
    df['cid'] = df['self_id']
    df['sid'] = df['parent_id']

    # Save data to HDF5
    print "== Exporting to HDF5 =="
    df.to_hdf(options.savename, "data")
    print "  [saved as %s]" % options.savename
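
# --- Hypothetical driver (not part of the original source): a minimal sketch
# --- of how main() might be invoked from the command line. The option names
# --- (dbfile, N_subs, savename, f_pos) are exactly the attributes main()
# --- reads above; the optparse wiring and defaults are assumptions.
if __name__ == '__main__':
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("--dbfile", dest="dbfile", default="comments.db",
                      help="SQLite database of submissions and comments (assumed default)")
    parser.add_option("-N", "--nsubs", dest="N_subs", type="int", default=-1,
                      help="limit to this many submissions; <= 0 loads all")
    parser.add_option("--savename", dest="savename", default="features.h5",
                      help="output HDF5 file (assumed default)")
    parser.add_option("--pos", dest="f_pos", action="store_true", default=False,
                      help="enable the (much slower) part-of-speech local features")
    opts, args = parser.parse_args()
    main(opts)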
        return pandas.DataFrame({'user_id': groupdf.user_id,
                                 'post_id': groupdf.post_id,
                                 'distance': distance})

    distance = cand_groups.apply(process_group)
    distance = pandas.Series(distance.distance,
                             index=pandas.MultiIndex.from_arrays(
                                 [distance.user_id, distance.post_id]))
    return Struct(post_tz=post_tz.apply_name("time_zone"),
                  user_tz=user_tz.apply_name("mean_stim_like_time_zone"),
                  distance_tz=distance)


def build_func(feature, data):
    if feature.key == "post.time_zone":
        return data.post_tz
    elif feature.key == "user.mean_stim_like_time_zone":
        return data.user_tz
    elif feature.key == "user_post.tz_proximity_to_stim_likes":
        return data.distance_tz


fset = ft.FeatureSet("time_zone", feature_list, setup_func, build_func)
fset.save(ft.dev_store, overwrite=True)
fset.save(ft.prod_store, overwrite=True)
            return group_df

        r = groups.apply(f)
        r.index = r.index.droplevel(0)
        return r

    # function to calculate blog rank from group of users
    def process_user_chunk(user_chunk):
        prob_df = get_user_blog_prob(user_chunk)
        final_data = filter_and_add_data(prob_df, 1000)
        return final_data

    result = stim_users.chunk_apply(process_user_chunk, 50, True)
    return result


# build function to calculate/retrieve feature from setup data
def build_func(feature, data):
    if feature.name == "pagerank_by_like_share_postrank":
        return data['post_rank']
    elif feature.name == "pagerank_by_like_share_blogprob":
        return data['prob']


# define fset: feature set template
fset = ft.FeatureSet("period_data", feature_list, setup_func, build_func)
fset.save(ft.prod_store, overwrite=True)
fset.save(ft.dev_store, overwrite=True)
    postdf['week'] = postdf.date.apply(to_week)
    retval["post.week"] = postdf
    postdf['weekday'] = postdf.date.apply(lambda x: x.weekday())
    retval["post.weekday"] = postdf

    if "user_post.like_week" in features_left:
        try:
            likedf = retval["user_post.like_date"]
        except KeyError:
            likedf = store.user_post.load_df(['like_date']).dropna()
        likedf['like_week'] = likedf.like_date.apply(to_week)
        retval["user_post.like_week"] = likedf

    return retval


# build function to calculate/retrieve feature from setup data
def build_func(feature, data):
    source_df = data[feature.key]
    return source_df[feature.name]


# define fset: feature set template
fset = ft.FeatureSet("source_data", feature_list, setup_func, build_func)
fset.save(ft.prod_store, overwrite=True)
fset.save(ft.dev_store, overwrite=True)
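
# Note (added commentary, inferred from the code above): this generic
# build_func treats feature.key as the name of a DataFrame stored by
# setup_func and feature.name as a column within it, so a feature with
# key "post.week" and name "week" resolves to data["post.week"]["week"].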
        return_df = pandas.DataFrame(map(f, cand_posts.iteritems()),
                                     columns=["mean_rho", "max_rho", "avgvec_rho"],
                                     index=cand_posts)
        return_df = return_df.sort("mean_rho", ascending=False)
        return_df['mean_rho_rank'] = range(1, len(return_df) + 1)

        # Progress indicator with a running ETA
        prog.n = prog.n + 1
        print nowstring(), str(prog.n) + "/" + str(prog.nmax), user_id, len(return_df), \
            (prog.start + prog.nmax * (datetime.now() - prog.start) / prog.n).strftime("%Y-%m-%d %H:%M:%S")
        return return_df

    return_value = resp_users.apply(evaluate_user_group)
    return_value.index.names = ['user_id', 'post_id']
    pandas.HDFStore(store.path + "tmptopicdata.h5")["result"] = return_value
    return return_value


# build function to calculate/retrieve feature from setup data
def build_func(feature, data):
    if feature.name == "topic_proximity_rank":
        return data['mean_rho_rank']
    elif feature.name == "topic_proximity_mean":
        return data['mean_rho']
    elif feature.name == "topic_proximity_max":
        return data['max_rho']


fset = ft.FeatureSet("all_data", feature_list, setup_func, build_func)
fset.save(ft.dev_store, overwrite=True)
fset.save(ft.prod_store, overwrite=True)
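
# --- Hypothetical read-back (not part of the original): the intermediate
# --- result saved to tmptopicdata.h5 above is plain pandas, e.g.:
#
#     tmp = pandas.HDFStore(store.path + "tmptopicdata.h5")
#     result = tmp["result"]   # indexed by (user_id, post_id), with columns
#                              # mean_rho, max_rho, avgvec_rho, mean_rho_rank
#     tmp.close()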
def __init__(self, reactor):
    super(ServerConnection, self).__init__(reactor)
    self.features = features.FeatureSet()
    # Save blog data
    blog_df = store.blog.load_df(store.blog.keys())
    stim_posts = post_df.groupby('blog_id')['is_stim'].agg(sum)
    blog_df['is_stim'] = stim_posts > 0
    blog_df.is_stim = blog_df.is_stim.fillna(False)
    resp_posts = post_df.groupby('blog_id')['is_resp'].agg(sum)
    blog_df['is_resp'] = resp_posts > 0
    blog_df.is_resp = blog_df.is_resp.fillna(False)

    return {'user': user_df,
            'blog': blog_df,
            'post': post_df,
            'user_post': like_df}


# build function to calculate/retrieve feature from setup data
def build_func(feature, data):
    source_df = data[feature.scope]
    return source_df[feature.name]


# define fset: feature set template
fset = ft.FeatureSet("dev_stim_resp", feature_list, setup_func, build_func)
fset.save(ft.dev_store, overwrite=True)
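
# Note (added commentary, inferred from the code above): build_func indexes
# the setup dict by feature.scope, so every feature in feature_list must use
# one of the scopes 'user', 'blog', 'post', or 'user_post'; any other scope
# would raise a KeyError here.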
    if feature.key == "user_blog.lang_proximity":
        s = pandas.Series(data.scores)
        s.index = pandas.MultiIndex.from_tuples(s.index, names=["user_id", "blog_id"])
        return s
    elif feature.key == "user.is_english":
        s = pandas.Series(data.isusereng)
        s.index.names = ["user_id"]
        return s
    elif feature.key == "blog.is_english":
        s = pandas.Series(data.isblogeng)
        s.index.names = ["blog_id"]
        return s


fset = ft.FeatureSet("language", feature_list, setup_func, build_func)
fset.save(ft.dev_store, overwrite=True)
fset.save(ft.prod_store, overwrite=True)

# # code used to get top languages used - some judgement applied
# langdict = LangIDDict()
# df = pandas.DataFrame([(key, score, score if n == 0 else 0)
#                        for vec in user_vecs.itervalues() if len(vec) > 0
#                        for n, (key, score) in enumerate(vec[:5])],
#                       columns=["langid", "score", "top_score"])
# df = df.groupby(["langid"]).aggregate(sum)
# df = df.sort("top_score", ascending=False)
# df['lang_name'] = map(lambda k: langdict[k], df.index)
# df