Beispiel #1
0
import pandas, pandas_ext, param, features as ft
import useful_stuff
from useful_stuff import *

# list of features to be defined by this module
# Every entry is built as ft.Feature("user_blog", <name>), in the order listed.
feature_list = [
    ft.Feature("user_blog", _name)
    for _name in (
        "pagerank_by_like_share_postrank",
        "pagerank_by_like_share_blogprob",
    )
]


# setup function to load data (from store) needed to calculate each Feature
def setup_func(feature_list, store, recreate_setup):

    # Get blog share of user likes and user share of blog likes over stim period "all"
    # Use these as transition probabilities for user moving to other blogs, users
    user_blog_df = store.user_blog.load_df(
        ["all_blog_like_user_like_share", "all_user_like_blog_post_share"],
        reset_index=True)
    user_blog_df = user_blog_df.rename(
        columns={
            "all_blog_like_user_like_share": "bu_prob",
            "all_user_like_blog_post_share": "ub_prob"
        })

    # get list of stim users - they're only ones we ultimately care about
    tmp = store.user.load_df(['is_stim'], reset_index=True)
    stim_users = tmp[tmp.is_stim.fillna(False)].user_id

    # function to get all current user blog indexes for which data is needed
    def get_user_blog_index(store):
Beispiel #2
0
if __name__ == "__main__":
    # Seed np.random and random with the same SEED value
    for seed_rng in (np.random.seed, random.seed):
        seed_rng(SEED)

    # Load dataset and print the shapes of x_train and y_test
    x_train, y_train, x_test, y_test = dataset.load_hasc()
    for array in (x_train, y_test):
        print(array.shape)

    # Feature extractor
    # Number of features: 75 (time domain: 16*3 = 48, frequency domain: 8*3 = 24)
    # NOTE(review): the breakdown above sums to 72, not 75 - confirm the count.

    # (label, function) pairs; each function is wrapped in features.Feature below
    extractor_functions = [
        ('min', np.amin),
        ('max', np.amax),
        ('mean', np.mean),
        ('std', np.std),
        ('first_quartiles', features.first_quartiles),
        ('median', np.median),
        ('third_quartiles', features.third_quartiles),
        ('iqr', sp.iqr),
        ('corrcoef', features.corrcoef),
        ('abs_corrcoef', features.abs_corrcoef),
        ('frame_init', features.frame_init),
        ('frame_end', features.frame_end),
        ('intensity', features.intensity),
        ('skewness', features.skewness),
        ('kurtosis', features.kurtosis),
        ('zcr', features.zcr),
        ('power_spectrum_features_8', features.fft_features),
    ]
    extractors = [
        (label, features.Feature(func)) for label, func in extractor_functions
    ]
import pandas, pandas_ext, param, features as ft
import useful_stuff
from useful_stuff import *


# list of features to be defined by this module
# Each spec is (first ft.Feature argument, suffix appended to the period name, calc_type).
# Count features are created for every period except the one whose index is -1.
_COUNT_SPECS = [
	('blog', '_post_ct', 'blog_post_ct'),
	('blog', '_like_ct', 'blog_like_ct'),
	('user', '_like_ct', 'user_like_ct'),
]
# Share features are created for every period.
_SHARE_SPECS = [
	('user_blog', '_blog_post_user_like_share', 'blog_post_user_like_share'),
	('user_blog', '_blog_like_user_like_share', 'blog_like_user_like_share'),
	('user_blog', '_user_like_blog_post_share', 'user_like_blog_post_share'),
]

feature_list = []
for (name, period) in [('hist', -1), ('all', 0), ('weekM1', 1), ('weekM2', 2), ('weekM3', 3)]:
	# count features (when applicable) come first, then the share features
	_specs = _SHARE_SPECS if period == -1 else _COUNT_SPECS + _SHARE_SPECS
	feature_list.extend(
		ft.Feature(_level, name + _suffix,
			period_name=name, period_index=period, calc_type=_calc)
		for (_level, _suffix, _calc) in _specs
	)


# setup function to load data (from store) needed to calculate each Feature 
def setup_func(feature_list, store, recreate_setup):

	retval = dict()
	needed_periods = list(set(f.period_index for f in feature_list))
	
Beispiel #4
0
        df = df[(df.user_blog_hist_like_ct > 0)
                | (df.user_blog_all_user_like_blog_post_share > 0)
                | ((df.user_blog_pagerank_by_like_share_postrank > 0) &
                   (df.user_blog_pagerank_by_like_share_postrank < 1000))
                | ((df.user_post_topic_proximity_rank > 0) &
                   (df.user_post_topic_proximity_rank < 1000))]

        # return data
        print "End candidates", nowstring()
        return df[["user_id", "post_id"]]


# feature list
# Each pair below is passed positionally to ft.Feature, in the order listed.
feature_list = [
    ft.Feature(_level, _name)
    for (_level, _name) in (
        ("post", "time_zone"),
        ("user", "mean_stim_like_time_zone"),
        ("user_post", "tz_proximity_to_stim_likes"),
    )
]


def setup_func(feature_list, store, recreate_setup):
    # store, recreate_setup = ft.dev_store, True

    # get all tz; convert to vectors (location of tz looking at world from N pole, with GMT at left)
    post_tz = jf.PostFile.get_df(["time_zone"]).time_zone
    post_vec = post_tz.apply(lambda x: (math.sin(x * (math.pi / 12.0)),
                                        math.cos(x * (math.pi / 12.0))))

    # calculate average time zone of user likes using TZ vector
    df = store.user_post.load_df(["like_is_stim"], reset_index=True)
Beispiel #5
0
import pandas, pandas_ext, param, features as ft
import jsonfiles as jf
import useful_stuff
from useful_stuff import *

# list of features to be defined by this module
# Each row is (first argument, second argument, fileclass) for one ft.Feature;
# rows whose fileclass is None pass fileclass=None explicitly, as before.
feature_list = [
    ft.Feature(_level, _name, fileclass=_fileclass)
    for (_level, _name, _fileclass) in (
        ('post', 'date', jf.PostFile),
        ('post', 'blog_id', jf.PostFile),
        ('post', 'is_test', jf.PostFile),
        ('post', 'week', None),
        ('post', 'weekday', None),
        ('user_post', 'is_like', jf.LikeFile),
        ('user_post', 'like_date', jf.LikeFile),
        ('user_post', 'like_week', None),
        ('user', 'is_test', jf.UserFile),
        ('user', 'hist_like_ct', jf.UserHistFile),
        ('user_blog', 'hist_like_ct', jf.UserBlogHistFile),
        ('blog', 'hist_like_ct', jf.BlogHistFile),
        ('blog', 'hist_post_ct', jf.BlogHistFile),
    )
]


# setup function to load data (from store) needed to calculate each Feature
def setup_func(feature_list, store, recreate_setup):

    # will store feature key and data frame with data
    retval = dict()
    features_left = dict((f.key, f) for f in feature_list)

    # if reloading is allowed, it is ok to get series from prod store if they exist
Beispiel #6
0
				return len(self.files)

	def veccos(vec1, vec2):
		"""Return the cosine similarity of two sparse vectors.

		Each vector is a re-iterable collection of (index, value) pairs
		(it is traversed more than once), or None.

		Returns 0 when either vector is None or when the product of the
		two Euclidean norms is 0 (for example, an empty vector); otherwise
		returns the dot product over matching indexes divided by that
		product of norms.

		Indexes must be hashable, because vec2 is indexed in a dict.
		"""
		if vec1 is None or vec2 is None:
			return 0

		denom = sqrt(sum(v * v for (_, v) in vec1)) * sqrt(sum(v * v for (_, v) in vec2))
		if denom == 0:
			return 0

		# Index vec2 once so the dot product is a single pass over vec1 instead
		# of comparing every pair of entries. Values of a repeated index are
		# summed, which matches what the pairwise comparison produced.
		by_index = {}
		for (i2, v2) in vec2:
			by_index[i2] = by_index.get(i2, 0) + v2

		dot = sum(v1 * by_index[i1] for (i1, v1) in vec1 if i1 in by_index)
		return dot / denom
		
		
			
# Every entry is built as ft.Feature("user_post", <name>), in the order listed.
feature_list = [
	ft.Feature("user_post", _name)
	for _name in (
		"topic_proximity_rank",
		"topic_proximity_mean",
		"topic_proximity_max",
	)
]

def setup_func(feature_list, store, recreate_setup):
# store, recreate_setup = ft.prod_store, True

	if not recreate_setup:
		try:
			retval = pandas.HDFStore(store.path + "tmptopicdata.h5")["result"]
			print "loaded stored result"
			return retval
		except:
			print "didn't load stored result"
	
Beispiel #7
0
import pandas, pandas_ext, param, features as ft
import useful_stuff
from useful_stuff import *

# list of features to be defined by this module
# Each pair below is passed positionally to ft.Feature, in the order listed.
feature_list = [
    ft.Feature(_level, _name)
    for (_level, _name) in (
        ('post', 'is_stim'),
        ('post', 'is_resp'),
        ('user_post', 'like_is_stim'),
        ('user_post', 'like_is_resp'),
        ('user', 'is_stim'),
        ('user', 'is_resp'),
        ('blog', 'is_stim'),
        ('blog', 'is_resp'),
    )
]


# setup function to load data (from store) needed to calculate each Feature
def setup_func(feature_list, store, recreate_setup):

    # Load universe of posts, set stim and resp
    post_df = store.post.load_df(store.post.keys())
    post_df['is_stim'] = (post_df.is_test
                          == False) & (post_df.week < datetime(2012, 8, 6))
    post_df['is_resp'] = (post_df.is_test == False) & (post_df.week
                                                       == datetime(2012, 8, 6))

    # Load universe of likes, set stim and resp
    like_df = store.user_post.load_df(store.user_post.keys())
    like_df = like_df[like_df.is_like]
Beispiel #8
0
		user_post = store.user_post.load_df(["topic_proximity_rank"], reset_index = True)
		
		post = store.post.load_df(["is_resp", "blog_id"], reset_index = True)
		post = post[post.pop("is_resp").fillna(False)]
		user_post = user_post.merge(post, on = "post_id", how = "inner")
		
		user_post = user_post[
			(user_post.user_id.isin(store.user['is_resp'].get_matching_indexes(True)))
			& (user_post.blog_id.isin(store.blog['is_resp'].get_matching_indexes(True)))
		]
			
		return set(zip(user_post.user_id, user_post.blog_id)) | set(zip(user_blog.user_id, user_blog.blog_id))
		
# feature info
# Each pair below is passed positionally to ft.Feature, in the order listed.
feature_list = [
	ft.Feature(_level, _name)
	for (_level, _name) in (
		("user_blog", "lang_proximity"),
		("user", "is_english"),
		("blog", "is_english"),
	)
]

def setup_func(feature_list, store, recreate_setup):
	# store, recreate_setup = ft.dev_store, False
		
	# if not (recreate_setup):
	#	try:
	#		retval = cPickle.load(open(
	#			param.folders.root + "languagedata/" + store.name + "/all_setup_data.pickle", "r"))
	#		return retval
	#	except:
	#		pass
			
Beispiel #9
0
        df = df[(df.user_blog_hist_like_ct > 0)
                | (df.user_blog_all_user_like_blog_post_share > 0)
                | ((df.user_blog_pagerank_by_like_share_postrank > 0) &
                   (df.user_blog_pagerank_by_like_share_postrank < 1000))
                | ((df.user_post_topic_proximity_rank > 0) &
                   (df.user_post_topic_proximity_rank < 1000))]

        # return data
        print "End candidates", nowstring()
        return df[["user_id", "post_id"]]


# list of features to be defined by this module
# Names are grouped by the first ft.Feature argument they share; the flattened
# order (group by group, name by name) is the order of the resulting list.
feature_list = [
    ft.Feature(_level, _name)
    for (_level, _names) in (
        ('post', ('author',)),
        ('blog', ('author_ct',)),
        ('user', (
            'as_author_post_ct',
            'as_author_post_user_like_share',
        )),
        ('user_post', (
            'author_post_ct',
            'author_like_ct',
            'blog_post_author_post_share',
            'blog_like_author_like_share',
            'author_post_user_like_share',
            'author_like_user_like_share',
            'user_like_author_post_share',
            'user_is_blog_author',
            'user_is_post_author',
        )),
    )
    for _name in _names
]