def cal_x_y(col_lst):
    col1 = col_lst[0]
    col2 = col_lst[1]

    name1 = '_'.join([''.join(x.split('_')) for x in col1])
    print('name1:')
    print(name1)
    mean_train = train.groupby(col1)['log_demand'].mean().reset_index(name=name1)
    print('mean_train_1:')
    print(mean_train.head())
    merge = pd.merge(test, mean_train, how='inner', on=col1)
    print('merge1:')
    print(merge.head())
    name2 = '_'.join([''.join(x.split('_')) for x in col2])
    print('name2:')
    print(name2)
    mean_train = train.groupby(col2)['log_demand'].mean().reset_index(name=name2)
    print('mean_train_2:')
    print(mean_train.head())
    merge = pd.merge(merge, mean_train, how='inner', on=col2)
    print('merge2:')
    print(merge.head())

    x1 = merge[name1].apply(np.expm1)
    print('x1:')
    print(x1)
    x2 = merge[name2].apply(np.expm1)
    print('x2:')
    print(x2)
    y = merge['log_demand']
    print('y:')
    print(y)
    return x1, x2, y
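The function above reads module-level `train` and `test` DataFrames with a 'log_demand' column (log1p-transformed demand), which are not shown here. A minimal, hedged sketch of how it might be called, using hypothetical column names:

import numpy as np
import pandas as pd

# Hypothetical data; 'product_id' and 'client_id' are made-up grouping columns.
train = pd.DataFrame({
    'product_id': [1, 1, 2, 2],
    'client_id':  [10, 11, 10, 11],
    'log_demand': np.log1p([3.0, 5.0, 2.0, 8.0]),
})
test = train.copy()

# col_lst is a pair of column lists; each produces a group-mean feature.
x1, x2, y = cal_x_y([['product_id'], ['product_id', 'client_id']])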
Example #2
def get_data(criteria_info):
    '''
    Gets the city name and relevant column(s) for the criteria passed.

    Input: criteria name
    Output: pandas dataframe
    '''
    # Specific query information for cities because of db inconsistencies
    if criteria_info == RELATION_DICT['cities']:
        data = []
        for col in criteria_info[1:]:
            pull = criteria_info[0].objects.values('id', col)
            data.append(pull)
        rv = pd.DataFrame.from_records(data[0])
        for df in data[1:]:
            df = pd.DataFrame.from_records(df)
            rv = pd.merge(rv, df, on='id')
        return rv

    # All other queries
    else:
        data = []
        for col in criteria_info[1:]:
            data.append(criteria_info[0].objects.values('city_id', col))
        rv = pd.DataFrame.from_records(data[0])
        for df in data[1:]:
            df = pd.DataFrame.from_records(df)
            rv = pd.merge(rv, df, on='city_id')
        return rv
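get_data assumes RELATION_DICT maps each criteria name to a sequence whose first element is a Django model class (anything exposing .objects.values(...)) followed by the relevant field names. A hedged sketch of that assumed layout, with a stand-in object instead of a real Django model:

import pandas as pd
from types import SimpleNamespace

# Stand-in for a Django model: .objects.values(...) yields dict-like rows with dummy values.
FakeCity = SimpleNamespace(objects=SimpleNamespace(
    values=lambda *fields: [{f: i for f in fields} for i in range(3)]))

RELATION_DICT = {'cities': [FakeCity, 'city_name', 'population']}

cities_df = get_data(RELATION_DICT['cities'])  # columns: id, city_name, population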
def main():

    df = pd.read_csv("../OUTPUT/segmentation_results_k-means.csv", delimiter=",", skipinitialspace=True)

    df_api = pd.read_csv("../OUTPUT/usersInfoAPI.csv", delimiter=",", skipinitialspace=True)

    # aggregate male, female and null values
    df_api["sesso"] = df_api["sesso"].replace("F", "f")
    df_api["sesso"] = df_api["sesso"].replace("M", "m")
    df_api["sesso"] = df_api["sesso"].replace("N", "n")
    df_api["sesso"] = df_api["sesso"].fillna('n')

    df_friends = pd.read_csv("../OUTPUT/network_degree_node.csv", delimiter=",", skipinitialspace=True)

    df_merged = pd.merge(df_api, df, left_on="user_id", right_on="user_id", how='right')

    df_merged = pd.merge(df_friends, df_merged, left_on="user_id", right_on="user_id", how='right')
    df_merged["sesso"] = df_merged["sesso"].fillna('n')
    # df_merged["data_reg"] = pd.to_datetime(df_merged['data_reg'])

    # print df_merged["degree_initial_network"].mean()
    # generi = df_merged["sesso"].values.tolist()
    # counter_sex = Counter(generi)
    # sex_dict = dict(counter_sex)
    # print sex_dict
    # # date_time = datetime.datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S")
    #
    # # print datetime.datetime.fromtimestamp(int(df_merged["data_reg"].mean()))
    # sys.exit()

    # plt.style.use("dark_background")

    k_means_analysis(df_merged)
Example #4
def determine_from_listed_position():
    general_stats_ep = GeneralPlayerStats()

    guards = general_stats_ep.get_data({'Season': '2017-18', 'PlayerPosition': 'G'})
    forwards = general_stats_ep.get_data({'Season': '2017-18', 'PlayerPosition': 'F'})
    centers = general_stats_ep.get_data({'Season': '2017-18', 'PlayerPosition': 'C'})

    guards['G'] = 1
    forwards['F'] = 1
    centers['C'] = 1

    merge_df = pd.merge(guards, forwards, on=['PLAYER_NAME', 'PLAYER_ID', 'TEAM_ABBREVIATION', 'TEAM_ID'], how='outer')
    merge_df = pd.merge(merge_df, centers, on=['PLAYER_NAME', 'PLAYER_ID', 'TEAM_ABBREVIATION', 'TEAM_ID'], how='outer')

    merge_df = merge_df[['PLAYER_NAME', 'PLAYER_ID', 'TEAM_ABBREVIATION', 'TEAM_ID', 'G', 'F', 'C']]
    merge_df = merge_df.fillna(0)

    conditions = [
        ((merge_df['G'] == 1) & (merge_df['F'] == 0) & (merge_df['C'] == 0)),
        ((merge_df['F'] == 1) & (merge_df['C'] == 0)),
        (merge_df['C'] == 1)
    ]
    choices = ['Guard', 'Wing', 'Big']

    merge_df['POSITION'] = np.select(conditions, choices, default='None')
    return merge_df
Example #5
def test_merge_tables3():
    df_a = pd.DataFrame(
        {'a': [0, 1]},
        index=['a0', 'a1'])
    df_b = pd.DataFrame(
        {'b': [2, 3, 4, 5, 6],
         'a_id': ['a0', 'a1', 'a1', 'a0', 'a1']},
        index=['b0', 'b1', 'b2', 'b3', 'b4'])
    df_c = pd.DataFrame(
        {'c': [7, 8, 9]},
        index=['c0', 'c1', 'c2'])
    df_d = pd.DataFrame(
        {'d': [10, 11, 12, 13, 15, 16, 16, 17, 18, 19],
         'b_id': ['b2', 'b0', 'b3', 'b3', 'b1', 'b4', 'b1', 'b4', 'b3', 'b3'],
         'c_id': ['c0', 'c1', 'c1', 'c0', 'c0', 'c2', 'c1', 'c2', 'c1', 'c2']},
        index=['d0', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8', 'd9'])

    orca.add_table('a', df_a)
    orca.add_table('b', df_b)
    orca.add_table('c', df_c)
    orca.add_table('d', df_d)

    orca.broadcast(cast='a', onto='b', cast_index=True, onto_on='a_id')
    orca.broadcast(cast='b', onto='d', cast_index=True, onto_on='b_id')
    orca.broadcast(cast='c', onto='d', cast_index=True, onto_on='c_id')

    df = orca.merge_tables(target='d', tables=['a', 'b', 'c', 'd'])

    expected = pd.merge(df_a, df_b, left_index=True, right_on='a_id')
    expected = pd.merge(expected, df_d, left_index=True, right_on='b_id')
    expected = pd.merge(df_c, expected, left_index=True, right_on='c_id')

    assert_frames_equal(df, expected)
Example #6
def pivot_work():
	coster = CcalcCostTime()
	
	# Build a pivot table and extract the female users
	g_DF_USER_all = pd.pivot_table(g_DF_USER, index=g_DF_USER.index, values='user_id', columns='gender',fill_value=0)
	g_DF_USER_wm = getUserDataFramebyGender(g_DF_USER_all , 'F')	
	merger = pd.merge(g_DF_UDATA, g_DF_USER_wm, left_on='user_id', right_on='F', how='left')	
	va = np.vstack(merger[ merger['F'] > 0 ] ['rating'])
	#print '\n, merge len', len(va), '\navg',va.mean(), '\nvar', va.var(), '\nstd', va.std()

		
	# Extract the male users
	g_DF_USER_m = getUserDataFramebyGender(g_DF_USER_all , 'M')
	merger = pd.merge(g_DF_UDATA, g_DF_USER_m, left_on='user_id', right_on='M', how='left')	
	va2 = np.vstack(merger[ merger['M'] > 0 ] ['rating'])
	#print '\n, merge len', len(va2), '\navg',va2.mean(), '\nvar', va2.var(), '\nstd', va2.std()

	
	# Read the u.item file
	#df_item = readSrcData('u.item','|')
	#print 'item\n', df_item.head(5)
	
	# Compute the standard deviation of movie ratings for female and male users
	F_StandardDiff =  va.std() #1.0
	M_StandardDiff =  va2.std() #2.0
	ser_result = pd.Series({'F':F_StandardDiff, 'M':M_StandardDiff})
	ser_result.name='rating'
	print '\n\ngender\n',ser_result
def chuli_consumer(train_ccx_A=train_ccx_A,train_target_A=train_target_A,over=datetime(2017, 6, 1)):
    # Only months 2017.1-2017.5 are present; keep just the month as a categorical variable
    
    train_ccx_A[['var_06']]=train_ccx_A[['var_06']].apply(pd.to_datetime)
    train_ccx_A['datediff']=(over-train_ccx_A['var_06']).apply(lambda x: x.days)
    query = train_ccx_A.groupby(train_ccx_A['ccx_id']).size()  # number of queries
    query = query.reset_index()  # turn the index into a column
    # days between the last purchase and the analysis date
    datediff=train_ccx_A['datediff'].groupby(train_ccx_A['ccx_id']).min()
    datediff=datediff.reset_index()
    query=pd.merge(query,datediff,on='ccx_id',how='left')
    query.columns = ['ccx_id','query','datediff']
    # derived variable: purchase frequency
    tmp1= train_ccx_A['datediff'].groupby( train_ccx_A['ccx_id']).min()
    tmp2= train_ccx_A['datediff'].groupby( train_ccx_A['ccx_id']).max()
    query['query']= list( query['query'].tolist()/(tmp2-tmp1))
    query['query'][query['query']==float('inf')]=0     
    df = pd.get_dummies(train_ccx_A)  # one-hot encode into dummy variables; the feature count grows
    df2 = df.groupby(['ccx_id'],as_index=False).sum()  # aggregate (sum) by id
    

    df3 = pd.merge(df2,query,on='ccx_id',how='left')  # merge query with df2
    df3 = pd.merge(train_target_A,df3,on='ccx_id',how='left')  # merge the target with the ccx data
    df4 = df3.drop(['target'], axis = 1)  # data only, without the target


    df4=df4.fillna(0)

    df4 = df4.set_index("ccx_id")
    
    return df4
Example #8
def course_feature():
    user = read_user('data\\object\\object_num2.csv')[['course_id_num','module_id','category_num','start','children']]
    #total1 = user[['course_id_num','module_id']].groupby(['course_id_num']).count()
    total2 = user[['course_id_num','module_id']].drop_duplicates().groupby(['course_id_num']).count()
    total2.columns = ['category_all']
    part = user[['course_id_num','module_id','category_num']].drop_duplicates()
    part1 = part[['course_id_num','category_num']]
    part_en = user[['course_id_num']].drop_duplicates().set_index('course_id_num')    
    
    for i in range(15):
        category_cnt = part1[part1['category_num']==i].groupby(['course_id_num']).count()
        
        category_cnt.columns = ['category'+str(i)]
        part_en = pd.merge(part_en,category_cnt,how='outer',left_index=True,right_index=True)
    part_all = pd.merge(part_en,total2,how='outer',left_index=True,right_index=True)
    part_all.fillna(0,inplace=True)
    part_all = part_all.astype('int')
    
    
    enroll = read_user('data\\train\\enrollment_train_num.csv')[['enrollment_id','course_id_num']].set_index('course_id_num')
    part_enroll = pd.merge(enroll,part_all,how='outer',left_index=True,right_index=True)
    part_enroll_1 = part_enroll.set_index('enrollment_id')#ort_index(inplace=True)
   
    
    feature_all = read_user('data\\train\\feature_all_11.csv').set_index('enrollment_id')
    feature_all2 = pd.merge(feature_all,part_enroll_1,how='outer',left_index=True,right_index=True)
    feature_all2.to_csv('data\\train\\feature_all_12.csv')   
Example #9
def recommendByUserFC(userid, k=3, wantedNum=5):
	coster = CcalcCostTime()

	#1. Build a mapping from user id to watched movies and from movie id to users; a DataFrame is used here
	merger = pd.merge(g_DF_UDATA, g_DF_USER, on='user_id')
				
	#2. Find the K nearest neighbours
	nears = calcNears(merger, userid, k)
	
	#testPrint(nears, merger,userid)
	
	#3. For every movie the neighbours have watched, compute a recommendation score weighted by proximity
	movieitems_dist={}
	for item in nears:
		nearmovies = (merger[merger.user_id == item[1] ]) ['item_id'].values
		for movie in nearmovies:		
			if movieitems_dist.has_key(movie):
				movieitems_dist[movie] += item[0]
			else:
				movieitems_dist[movie] = item[0]

	#4. Sort by recommendation score
	SeriesMovies = pd.Series(movieitems_dist).sort_values()
	print '\n', SeriesMovies.tail(wantedNum)
	
	#5. Output the recommendations
	recommned_moiveID_df = pd.DataFrame(SeriesMovies.tail(wantedNum).keys(), columns=['item_id'] )
	recomm_merger = pd.merge(g_DF_MOVIE_ITEM, recommned_moiveID_df, on='item_id')
	print '\n\n', userid,'\'s recom list:\n', recomm_merger.loc[:, ['item_id','title','release'] ]
Example #10
def add_judgments_and_frequencies_to_qa_pairs(qa_pairs, judgments, question_frequencies, remove_newlines):
    """
    Collate system answer confidences and annotator judgments by question/answer pair.
    Add to each pair the question frequency. Collated system files are used as input to subsequent cross-system
    analyses.

    Though you expect the set of question/answer pairs in the system answers and judgments to not be disjoint, it may
    be the case that neither is a subset of the other. If annotation is incomplete, there may be Q/A pairs in the
    system answers that haven't been annotated yet. If multiple systems are being judged, there may be Q/A pairs in the
    judgements that don't appear in the system answers.

    Some versions of Annotation Assist strip newlines from the answers they return in the judgement files, so
    optionally take this into account when joining on question/answer pairs.

    :param qa_pairs: question, answer, and confidence provided by a Q&A system
    :type qa_pairs: pandas.DataFrame
    :param judgments: question, answer, in purview, and judgement provided by annotators
    :type judgments: pandas.DataFrame
    :param question_frequencies: question and question frequency in the test set
    :type question_frequencies: pandas.DataFrame
    :param remove_newlines: join judgments on answers with newlines removed
    :type remove_newlines: bool
    :return: question and answer pairs with confidence, in purview, judgement and question frequency
    :rtype: pandas.DataFrame
    """
    qa_pairs = pandas.merge(qa_pairs, question_frequencies, on=QUESTION, how="left")
    if remove_newlines:
        qa_pairs["Temp"] = qa_pairs[ANSWER].str.replace("\n", "")
        qa_pairs = qa_pairs.rename(columns={"Temp": ANSWER, ANSWER: "Temp"})
    qa_pairs = pandas.merge(qa_pairs, judgments, on=(QUESTION, ANSWER), how="left")
    if remove_newlines:
        del qa_pairs[ANSWER]
        qa_pairs = qa_pairs.rename(columns={"Temp": ANSWER})
    return qa_pairs
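A minimal usage sketch, assuming module-level QUESTION and ANSWER column-name constants; the tiny frames and the 'Confidence'/'Judgment'/'Frequency' columns below are hypothetical:

import pandas

QUESTION, ANSWER = "Question", "Answer"  # assumed column-name constants

qa_pairs = pandas.DataFrame({QUESTION: ["q1"], ANSWER: ["a1\n"], "Confidence": [0.9]})
judgments = pandas.DataFrame({QUESTION: ["q1"], ANSWER: ["a1"], "Judgment": [1]})
frequencies = pandas.DataFrame({QUESTION: ["q1"], "Frequency": [3]})

collated = add_judgments_and_frequencies_to_qa_pairs(
    qa_pairs, judgments, frequencies, remove_newlines=True)
print(collated)  # one row, with Confidence, Judgment and Frequency joined on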
Example #11
def users_per_course_all():
     # Number of users per course across train + test; writes feature_all_11.csv
    u1 = read_user('data\\enrollment\\enrollment_all_num.csv')
    u2 = u1[['username','course_id_num']].groupby(['course_id_num']).count()
    u2.columns = ['users_per_course']
    u3 = u1.copy().set_index('course_id_num')[['enrollment_id']]
    e = pd.merge(u3,u2,how="outer",left_index=True,right_index=True)
    f = e.set_index('enrollment_id')
    f.sort_index(inplace=True)
    
    train = read_user('data\\train\\enrollment_train_num.csv')
    test = read_user('data\\test\\enrollment_test_num.csv')
    tr1 = train[['enrollment_id']].set_index('enrollment_id')
    tr1m = pd.merge(tr1,f,how="inner",left_index=True,right_index=True)
    tr1m.columns = ['users_per_course_all']
    te1 = test[['enrollment_id']].set_index('enrollment_id')
    te1m = pd.merge(te1,f,how="inner",left_index=True,right_index=True)
    te1m.columns = ['users_per_course_all']
    
    feature_all = read_user('data\\test\\feature_all_10.csv').set_index('enrollment_id')
    feature_all1 = pd.merge(feature_all,te1m,how='outer',left_index=True,right_index=True)
    feature_all1.to_csv('data\\test\\feature_all_11.csv')  
    
    feature_all2 = read_user('data\\train\\feature_all_10.csv').set_index('enrollment_id')
    feature_all3 = pd.merge(feature_all2,tr1m,how='outer',left_index=True,right_index=True)
    feature_all3.to_csv('data\\train\\feature_all_11.csv')  
Example #12
def wishlist_scores(data_path, scoring_function, preserve_zeros=True):
    '''Returns the iso/ft ratio score and the iso count score in a single dataframe.
    See weighted_percentile for the scoring methodology;
    a higher ratio or a higher iso count means a higher score.
    '''
    ftiso = pd.read_csv(data_path)
    # calculate iso counts
    iso = ftiso[ftiso['type'] == 'iso']
    iso = iso.groupby('beer_id', as_index=False).count()[['id', 'beer_id']]
    iso.columns = ['iso_count', 'beer_id']
    # calculate ft counts
    ft = ftiso[ftiso['type'] == 'ft']
    ft = ft.groupby('beer_id', as_index=False).count()[['id', 'beer_id']]
    ft.columns = ['ft_count', 'beer_id']
    if preserve_zeros:
        # outer merge to get combined iso & ft counts for each beer
        iso_ft = pd.merge(iso, ft, on='beer_id', how='outer')
        iso_ft = iso_ft.fillna(0)
        # laplace smoothing for iso & ft counts (avoids dividing by zero)
        iso_ft['ft_count'] = iso_ft['ft_count'] + 1
        iso_ft['iso_count'] = iso_ft['iso_count'] + 1
    else:
        # inner merge to eliminate zeros in iso & ft counts for each beer
        iso_ft = pd.merge(iso, ft, on='beer_id', how='inner')
    # finally get demand to supply ratio score
    iso_ft['ratio'] = iso_ft['iso_count'] / iso_ft['ft_count']
    iso_ft['ratio_score'] = scoring_function(iso_ft['ratio'])
    # get iso score
    iso_ft['iso_score'] = scoring_function(iso_ft['iso_count'])
    return iso_ft
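A minimal usage sketch of wishlist_scores, assuming the CSV has 'id', 'beer_id' and 'type' columns ('iso'/'ft'); the data and the percentile-rank scoring function below are hypothetical stand-ins for weighted_percentile:

import pandas as pd
from scipy import stats

def percentile_score(series):
    # Percentile rank of each value within the column (stand-in scoring function).
    return pd.Series(stats.rankdata(series, method='average') / len(series),
                     index=series.index)

demo = pd.DataFrame({
    'id':      [1, 2, 3, 4, 5],
    'beer_id': [10, 10, 10, 20, 20],
    'type':    ['iso', 'iso', 'ft', 'iso', 'ft'],
})
demo.to_csv('ftiso_demo.csv', index=False)

scores = wishlist_scores('ftiso_demo.csv', percentile_score)
print(scores[['beer_id', 'ratio_score', 'iso_score']])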
Example #13
def execute(bins=10, ylim=False):
    data = pandas.merge(load_terms(), load_search_results().rename(columns={'identifier': 'term_id'}), on=['term_id'], how='inner')
    data = data[data['term_name'].apply(lambda x: len(x.split(';')[0]) > 5)]
    data = data[data['term_id'].apply(lambda x: x.startswith('A'))]
    data = pandas.merge(data, load_radiopaedia_terms(), on=['term_id', 'term_name'], how='inner')
    # load_radiopaedia_terms()
    # g = sns.pairplot(data, vars=['search_results_log', 'pagerank', 'difficulty_prob'])
    # for ax in g.axes.flat:
        # if ax.get_xlabel() in ['difficulty_prob', 'pagerank']:
            # ax.set_xlim(0, 1)
        # if ax.get_ylabel() in ['difficulty_prob', 'pagerank']:
            # ax.set_ylim(0, 1)
        # if min(ax.get_xticks()) < 0:
            # ax.set_xlim(0, max(ax.get_xticks()))
        # if min(ax.get_yticks()) < 0:
            # ax.set_ylim(0, max(ax.get_yticks()))
    # output.savefig('importance_pair', tight_layout=False)
    rcParams['figure.figsize'] = 30, 20
    for term_name, difficulty_prob, pagerank in data[['term_name', 'difficulty_prob', 'pagerank']].values:
        plt.plot(1 - difficulty_prob, pagerank, color='red', marker='s', markersize=10)
        plt.text(1 - difficulty_prob, pagerank, term_name)
        if ylim:
            plt.ylim(0, 0.5)
        plt.xlabel('Predicted error rate')
        plt.ylabel('Pagerank')
    output.savefig('importance_pagerank')
Example #14
def createMaster(raw_data_path,clean_data_path):
    
    #Get the cleaned data
    projects = cleanProjects(raw_data_path,clean_data_path)
    outcomes = cleanOutcomes(raw_data_path,clean_data_path)
    
    
    #try the merge. Nothing wrong with the data, have 664098 unique projects
    print "Merging outcomes and project data set..."
    project_with_outcome = pd.merge(projects, outcomes, how='outer', on='projectid')
    project_with_outcome['projectid'].nunique()
    
    del projects
    del outcomes
    
    #Merge in essay data, have 664098 unique projects
    essays = cleanEssays(raw_data_path,clean_data_path)
    print "Merging in essay data..."
    master = pd.merge(project_with_outcome,essays, how='outer', on='projectid')
    del essays
    master['projectid'].nunique()
    
    print "Saving master data..."
    master.to_csv(clean_data_path+'master.csv')
    
    print "Saving subsample of master"
    firstThousandMaster = master[0:999]
    firstThousandMaster.to_csv(clean_data_path+'firstThousandMaster.csv')
    
    return master
Example #15
def _calculate_normalised_dispersion(model, input_files, beta, header, unit, cut, output, accelerator):
    #TODO there are no errors from orbit
    df_orbit = pd.DataFrame(model).loc[:, ['S', 'MUX', 'DPX', 'DX', 'X', 'BETX']]
    df_orbit['NDXMDL'] = df_orbit.loc[:, 'DX'] / np.sqrt(df_orbit.loc[:, 'BETX'])
    df_orbit.rename(columns={'MUX': 'MUXMDL', 'DPX': 'DPXMDL', 'DX': 'DXMDL', 'X': 'XMDL'}, inplace=True)
    df_orbit['COUNT'] = len(input_files.get_columns(df_orbit, 'CO'))
    dpps = input_files.dpps("X")
    df_orbit = pd.merge(df_orbit, input_files.joined_frame("X", ['CO', 'CORMS', 'AMPX']),
                        how='inner', left_index=True, right_index=True)
    df_orbit = pd.merge(df_orbit, beta.loc[:, ['BETX', 'ERRBETX']], how='inner', left_index=True,
                        right_index=True, suffixes=('', '_phase'))
    if np.max(dpps) - np.min(dpps) == 0.0:
        return  # temporary solution
        # raise ValueError('Cannot calculate dispersion, only a single dpoverp')
    fit = np.polyfit(dpps, SCALES[unit] * input_files.get_data(df_orbit, 'CO').T, 1, cov=True)
    df_orbit['NDX_unscaled'] = fit[0][-2, :].T / stats.weighted_mean(input_files.get_data(df_orbit, 'AMPX'), axis=1) # TODO there is no error from AMPX
    df_orbit['STDNDX_unscaled'] = np.sqrt(fit[1][-2, -2, :].T) / stats.weighted_mean(input_files.get_data(df_orbit, 'AMPX'), axis=1)
    df_orbit = df_orbit.loc[np.abs(fit[0][-1, :].T) < cut * SCALES[unit], :]
    mask = accelerator.get_element_types_mask(df_orbit.index, ["arc_bpm"])
    global_factor = np.sum(df_orbit.loc[mask, 'NDXMDL'].values) / np.sum(df_orbit.loc[mask, 'NDX_unscaled'].values)
    df_orbit['NDX'] = global_factor * df_orbit.loc[:, 'NDX_unscaled']
    df_orbit['STDNDX'] = global_factor * df_orbit.loc[:, 'STDNDX_unscaled']
    df_orbit['DX'] = df_orbit.loc[:, 'NDX'] * np.sqrt(df_orbit.loc[:, 'BETX_phase'])
    df_orbit['STDDX'] = df_orbit.loc[:, 'STDNDX'] * np.sqrt(df_orbit.loc[:, 'BETX_phase'])
    df_orbit['DPX'] = _calculate_dp(model, df_orbit.loc[:, ['DX', 'STDDX']], "X")
    df_orbit['DELTANDX'] = df_orbit.loc[:, 'NDX'] - df_orbit.loc[:, 'NDXMDL']
    output_df = df_orbit.loc[:, ['S', 'COUNT', 'NDX', 'STDNDX', 'DX', 'DPX',
                                 'NDXMDL', 'DXMDL', 'DPXMDL', 'MUXMDL', 'DELTANDX']]
    tfs_pandas.write_tfs(join(output, header['FILENAME']), output_df, header, save_index='NAME')
    return output_df
Example #16
def corrects_incorrects_counter_win(ds,  window=None):
    ''' Receives the dataset and creates a cumulative windowed sum
    for the columns corrects and incorrects '''

    # If no window is specified, do not use a window
    student_cfa = ds[['student_id', 'step_id', 'corrects', 'incorrects']]
    grouped = student_cfa.groupby(['student_id', 'step_id'])

    new_df = pd.DataFrame(np.zeros((ds.shape[0], 2)), 
                            columns=['cum_corr', 'cum_incorr'])

    if window:        
        cum = grouped.cumsum()

        cum_df = pd.merge(student_cfa[['student_id', 'step_id']], cum, 
                            right_index=True, left_index=True)
        grouped_cum = cum_df.groupby(['student_id', 'step_id'])

        cum_delay = grouped_cum.shift(window).fillna(0)

        diff = cum - cum_delay

        diff_df = pd.merge(student_cfa[['student_id', 'step_id']], diff, 
                            right_index=True, left_index=True)
        diff_df = diff_df.groupby(['student_id', 'step_id'])
        
        previous_columns = diff_df.shift(1)
        previous_columns = previous_columns.fillna(0)
        previous_columns.columns = ['prev_corr',  'prev_incorr']        

        return (previous_columns.prev_corr, previous_columns.prev_incorr)
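The window branch above relies on subtracting a group-wise cumulative sum shifted by `window` from the running cumulative sum. A small self-contained sketch of that trick on made-up data:

import pandas as pd

log = pd.DataFrame({
    'student_id': ['s1', 's1', 's1', 's1', 's1'],
    'step_id':    ['p1', 'p1', 'p1', 'p1', 'p1'],
    'corrects':   [1, 0, 1, 1, 1],
})

window = 2
cum = log.groupby(['student_id', 'step_id'])['corrects'].cumsum()  # running total per group
cum_delay = cum.groupby([log['student_id'], log['step_id']]).shift(window).fillna(0)
log['corrects_last_2'] = cum - cum_delay  # count over the last `window` attempts
print(log)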
def createMatchAndTournamentTables(matchesFilepath, playersDF, conn) :

	# Load the matches into a DataFrame
	matchsDF = pd.read_csv(matchesFilepath)

	# Reformat the date to match the format of the players' dates of birth
	matchsDF.event_time = matchsDF.event_time.apply(lambda x : x.split(" ")[0])
	matchsDF.event_time = pd.to_datetime(matchsDF.event_time).apply(lambda x : x.strftime("%d/%m/%Y"))

	# Extract the tournaments and write them to SQL
	tournamentsDF = matchsDF[["event_time", "event_name", "surface"]].drop_duplicates()
	tournamentsDF.index = range(len(tournamentsDF))
	tournamentsDF.to_sql("tournaments", conn, if_exists="replace")

	# Join on the tournaments (for players, the name serves as the join key)
	tournamentsDF["idTournoi"] = tournamentsDF.index
	matchsDF = pd.merge(matchsDF, tournamentsDF)

	# Join on the players
	playersDF["idPlayerA"] = playersDF.index
	playersDF = playersDF[["idPlayerA", "playername"]].rename(columns={"playername": "playerA"})
	matchsDF = pd.merge(matchsDF, playersDF, how="left")
	playersDF = playersDF.rename(columns={"idPlayerA":"idPlayerB", "playerA":"playerB"})
	matchsDF = pd.merge(matchsDF, playersDF, how="left")

	# Drop the now-redundant tournament and player fields and write the table
	matchsDF.drop(["playerA", "playerB", "Unnamed: 0", "event_time", "event_name", "surface"], axis=1, inplace=True)
	matchsDF.to_sql("matchs", conn, if_exists="replace")

	return matchsDF, tournamentsDF
 def score_item(self, train_file, test_file, score_type):
     train_df = pd.read_csv(train_file, header=False,
                            names=['userId', 'movieId', 'rating', 'timestamp'])
     test_df = pd.read_csv(test_file, header=False,
                           names=['userId', 'movieId', 'rating', 'timestamp'])
     test_users = pd.unique(test_df.userId)
     rating_from_test_users = train_df[train_df.userId.isin(test_users)]
     rating_from_test_users_cluster = pd.merge(self.cluster_df, rating_from_test_users,
                                               on='movieId')
     rating_from_test_users_cluster = rating_from_test_users_cluster.groupby(['userId', 'cluster'])['rating'] \
                                 .agg(np.mean).reset_index()
     if score_type == 'optimal':
         rating_join = pd.merge(test_df, self.model,
                                left_on='movieId', right_on='Item', how='left') \
             .drop(['timestamp', 'Partition', 'Item', 'Rank'], axis=1)
         rating_join['error'] = rating_join['rating'] - rating_join['Score']
         rmse_summary = rating_join.groupby(['userId', 'cluster'])['error'] \
                 .agg(lambda x: np.linalg.norm(x)/np.sqrt(len(x))).reset_index()
         test_user_cluster_map = rmse_summary.groupby('userId') \
             .apply(lambda x: x.loc[x['error'].argmin()]).reset_index(drop=1)
     else:
         test_user_cluster_map = rating_from_test_users_cluster.groupby('userId') \
             .apply(lambda x: x.loc[x['rating'].argmax()]).reset_index(drop=1)
     output = pd.merge(test_user_cluster_map, self.model, on='cluster', how='left')
     output.userId = output.userId.astype(np.int64)
     output.Item = output.Item.astype(np.int64)
     output[['userId', 'Item', 'Score']].to_csv(sys.stdout, header=False, index=False)
Example #19
def genCoauthors(aDF,names):    
    coauthors = pd.merge(aDF.drop(['name','stnname','paperCount'],1),aDF.drop(['name','stnname','paperCount'],1),how='outer',on='paperID',suffixes=['1','2'])
    coauthors = coauthors[coauthors['authorID1']!=coauthors['authorID2']]
    
    coauthors = pd.merge(coauthors,names.rename(columns={'authorID':'authorID1','stnname':'stnname1'}), how='left',on='authorID1')
    coauthors = pd.merge(coauthors,names.rename(columns={'authorID':'authorID2','stnname':'stnname2'}), how='left',on='authorID2')
    return coauthors
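The self-merge on paperID above pairs every author of a paper with every other author of the same paper. A tiny illustration with made-up data:

import pandas as pd

aDF = pd.DataFrame({'paperID': [1, 1, 2], 'authorID': ['a', 'b', 'a']})

pairs = pd.merge(aDF, aDF, on='paperID', suffixes=['1', '2'])
pairs = pairs[pairs['authorID1'] != pairs['authorID2']]
print(pairs)  # paper 1 yields the (a, b) and (b, a) coauthor pairs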
Example #20
    def test_right_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on='key2', how='right')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='right')

        joined_both = merge(self.df, self.df2, how='right')
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='right')
Example #21
    def test_inner_join(self):
        joined_key2 = merge(self.df, self.df2, on='key2', how='inner')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='inner')

        joined_both = merge(self.df, self.df2, how='inner')
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='inner')
Example #22
def get_full_features():
    reviews = get_reviews()
    tips = get_tips()

    reviews_tips = reviews.append(tips)
    reviews_tips.columns = ['restaurant_id', 'review_date', 'review_id', 'review_stars', 'review_text', 'review_type', 'user_id', 'review_votes_cool', 'review_votes_funny', 'review_votes_useful']
    reviews_tips.review_votes_useful.fillna(0, inplace=True)
    reviews_tips.review_votes_cool.fillna(0, inplace=True)
    reviews_tips.review_votes_funny.fillna(0, inplace=True)
    reviews_tips = map_ids(reviews_tips)

    # # saving this for tfidf vectorizer training later
    # with open('pickle_jar/reviews_tips_original_text.pkl', 'w') as f:
    #     pickle.dump(reviews_tips.review_text.tolist(), f)

    users = get_users()
    users_reviews_tips = pd.merge(reviews_tips, users, how='left', on='user_id')

    restaurants = get_restaurants()
    restaurants_users_reviews_tips = pd.merge(users_reviews_tips, restaurants, how='outer', on='restaurant_id')

    # if checkins dont exist for a restaurant dont want to drop the restaurant values
    checkins = get_checkins()
    full_features = pd.merge(restaurants_users_reviews_tips, checkins, how='left', on='restaurant_id')

    # drop restaurants not found in boston data
    full_features = full_features[pd.notnull(full_features.restaurant_id)]

    return full_features
Example #23
def plot_clf_polar(clf, cmap=None, key='nickname', n_topics=60, n_top=3, labels=None, topics = None, mask=None, selection='top', metric='correlation', max_val=None):
    import pandas as pd
    import seaborn as sns

    ## Set up topic nicknames
    word_keys = pd.read_csv("../data/unprocessed/abstract_topics_filtered/topic_sets/topic_keys" + str(n_topics) + "-july_cognitive.csv")
    word_keys['topic_name'] = "topic" + word_keys['topic'].astype('str')

    o_fi = pd.DataFrame(clf.odds_ratio)

    # Melt feature importances, and add top_words for each feature
    o_fi['region'] = range(1, o_fi.shape[0] + 1)
    o_fis_melt = pd.melt(o_fi, var_name='topic_order', value_name='importance', id_vars=['region'])

    word_keys = pd.merge(pd.DataFrame(np.array([range(0, clf.feature_importances.shape[1]), clf.feature_names]).T, columns=['topic_order', 'topic_name']), word_keys)
    word_keys.topic_order = word_keys.topic_order.astype('int')

    o_fis_melt= pd.merge(o_fis_melt, word_keys)
    o_fis_melt['abs_imp'] = np.abs(o_fis_melt['importance'])
    
    if mask is not None:
        o_fis_melt = o_fis_melt[o_fis_melt.region.isin(mask)]
        
    if topics is not None:
        o_fis_melt = o_fis_melt[o_fis_melt[key].isin(topics)]
    
    pplot = pd.pivot_table(o_fis_melt, values='importance', index=[key], columns=['region'])
    
    if cmap is None:
        cmap = sns.color_palette('Set1', clf.feature_importances.shape[0])
    if mask is not None:
        cmap = [n[0] for n in sorted(zip(np.array(cmap)[np.array(mask)-1], mask), key=lambda tup: tup[1])]
    return plot_polar(pplot, overplot=True, palette=cmap, n_top=n_top, metric=metric, selection=selection, 
        label_size=30, labels=labels, max_val=max_val)
Example #24
def runSharesPSRCToBKRZones():
    #list of two lists
    files_shares = [files_manu_shares, file_wtcu_shares]
    header_rows = 3 #number of rows at the beginning of a file with header information

    headers = {} #dictionary to save header information
    for files_group in files_shares:
        for file in files_group:
            print("working on file: " + file)
            file_path = os.path.join(wd, file)

            #read header - use "#" as separator as it is less likely to be present in the file
            headers[file] = pd.read_table(file_path, delimiter = "#", header = None, nrows = header_rows) 
        
            # skip first few rows, as they contain general information - also ignore rows starting with 'c' (comment lines)
            shares_psrc = pd.read_table(file_path, delimiter = " ", names = ["o","d",file], comment = "c", skiprows = header_rows)

            if file == files_group[0]:
                #if first file in the group, set to the file shares
                truck_shares_psrc = shares_psrc
            else:
                #add a new column for a new file
                truck_shares_psrc = pd.merge(truck_shares_psrc, shares_psrc, on = ["o","d"])

        # merge psrc to bkr correspondence with percent
        tazGroups = pd.merge(truck_shares_psrc, tazShares, left_on = "o", right_on = "psrc_zone_id")
        tazGroups[file] = tazGroups[file] * tazGroups["percent"]

        # group by unique pair of bkr zone and group
        tazGroups_grouped = tazGroups.groupby(["bkr_zone_id"])

        # calculate sum of percent by unique pair
        tazGroups_sum = tazGroups_grouped[files_group].sum()
        tazGroups_sum['sum'] = tazGroups_sum[files_group].sum(axis=1)

        for file in files_group:
            tazGroups_sum[file] *= 1/tazGroups_sum['sum'] 

        tazGroups_sum['sum'] = tazGroups_sum[files_group].sum(axis=1)
        tazGroups_sum =  tazGroups_sum.round(4) #round values to 4 decimal

        #temp = tazGroups_sum.ix[tazGroups_sum["sum"]>1.0] #debug: to find out rows that have sum value more than 1

        tazGroups_sum = tazGroups_sum[files_group].reset_index() # makes object a data frame by setting the current index to a column
        tazGroups_sum["c"] = "all:"

        for file in files_group:
            tazGroups_bkr = tazGroups_sum[["bkr_zone_id", "c", file]]
            tazGroups_bkr = tazGroups_bkr.sort_values(by = ['bkr_zone_id'], ascending=[True])

            # write - first header and then append the updated data
            outfile = file.split(".")[0]
            outfile = os.path.join(wd, outfile + "_bkr.in")

            #first write header
            headers[file].to_csv(outfile, sep = " ", header = False, index = False, quoting=csv.QUOTE_NONE, escapechar = " ") #had to add a space as escapechar, otherwise it throws an error - not sure whether that causes any issue in the model

            #write data
            with open(outfile, 'a') as wfile:
                tazGroups_bkr.to_csv(wfile, sep = " " , header = False, index = False)
Example #25
def test_hash_join(how):
    A = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': [1, 1, 2, 2, 3, 4]})
    a = dd.repartition(A, [0, 4, 5])

    B = pd.DataFrame({'y': [1, 3, 4, 4, 5, 6], 'z': [6, 5, 4, 3, 2, 1]})
    b = dd.repartition(B, [0, 2, 5])

    c = hash_join(a, 'y', b, 'y', how)

    result = c.compute()
    expected = pd.merge(A, B, how, 'y')
    list_eq(result, expected)

    # Different columns and npartitions
    c = hash_join(a, 'x', b, 'z', 'outer', npartitions=3)
    assert c.npartitions == 3

    result = c.compute()
    expected = pd.merge(A, B, 'outer', None, 'x', 'z')

    list_eq(result, expected)

    assert hash_join(a, 'y', b, 'y', 'inner')._name == \
           hash_join(a, 'y', b, 'y', 'inner')._name
    assert hash_join(a, 'y', b, 'y', 'inner')._name != \
           hash_join(a, 'y', b, 'y', 'outer')._name
Example #26
 def conjoint(self):
     '''
     Compute each partner's (conjoint) identifier and check that partners are reciprocal
     '''     
     print ("travail sur les conjoints")
     ind = self.ind
     conj = ind.ix[ind['couple']==1,['men','lienpref','id']]
     conj['lienpref'].value_counts()
     conj.ix[conj['lienpref']==1,'lienpref'] = 0
     conj.ix[conj['lienpref']==31,'lienpref'] = 2
     conj.ix[conj['lienpref']==32,'lienpref'] = 3
     conj.ix[conj['lienpref']==50,'lienpref'] = 10
     conj2 = merge(conj, conj, on=['men','lienpref'])
     conj2 = conj2[conj2['id_x'] != conj2['id_y']]
     assert len(conj2) == len(conj)
     conj = conj2
     test = pd.groupby(conj, ['men','lienpref']).size()
     assert max(test)==2 and min(test)==2
     couple = pd.groupby(conj, 'id_x')
     for id, potential in couple:
         if len(potential) == 1:
             conj.loc[ conj['id_x']==id, 'id_y'] = potential['id_y']
         else:
             pdb.set_trace()
     # TODO: no problem here, strange
     conj = conj.rename(columns={'id_x': 'id', 'id_y':'conj'})
     ind = merge(ind,conj[['id','conj']], on='id', how='left')
     
     self.ind = ind
     ## check that partners are reciprocal
     test_conj = merge(ind[['conj','id']],ind[['conj','id']],
                          left_on='id',right_on='conj')
     print "le nombre de couple non réciproque est:", sum(test_conj['id_x'] != test_conj['conj_y'])
     print ("fin du travail sur les conjoints")
        def flattenTable(fulltable,levelcol,idcol,parentidcol,countchildren,removeempty):
            fulltable[[levelcol]] = fulltable[[levelcol]].astype(int)

            levels = dict(list(fulltable.groupby(levelcol)))
            minlevel = fulltable.level.min()
            for level, data in sorted(levels.iteritems()):
                #First level is the starting point for the following merges
                if level == minlevel:
                    #data = data[[idcol,'object_id','object_type']]
                    data = data.add_prefix('level_{}-'.format(level))
                    flattable = data
                else:
                    #Aggregate object types and join them
                    for col_countchildren in countchildren:
                        children = data[parentidcol].groupby([data[parentidcol],data[col_countchildren]]).count()
                        children = children.unstack(col_countchildren)
                        children['total'] = children.sum(axis=1)
                        children = children.add_prefix('level_{}-children-{}-'.format(level-1,col_countchildren))

                        leftkey = 'level_{}-id'.format(level-1)
                        flattable = merge(flattable,children,how='left',left_on=leftkey,right_index=True)
                        flattable[children.columns.values.tolist()] = flattable[children.columns.values.tolist()].fillna(0).astype(int)

                    #Join data
                    data['childnumber'] = data.groupby(parentidcol).cumcount()
                    leftkey = 'level_{}-{}'.format(level-1,idcol)
                    rightkey = 'level_{}-{}'.format(level,parentidcol)
                    data = data.drop([levelcol],axis=1)
                    data = data.add_prefix('level_{}-'.format(level))
                    flattable = merge(flattable,data,how="outer",left_on=leftkey,right_on=rightkey)

            if removeempty:
                flattable = flattable.dropna(axis=1,how='all')
            return flattable
def hierarchicallyIndexMerge():
    lefth = DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
                       'key2': [2000, 2001, 2002, 2001, 2002],
                       'data': np.arange(5.)})
    righth = DataFrame(np.arange(12).reshape((6, 2)),
                       index=[['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'],
                              [2001, 2000, 2000, 2000, 2001, 2002]],
                       columns=['event1', 'event2'])
    print ('DataFrame 1: \n{}'.format(lefth))
    print ('DataFrame 2: \n{}'.format(righth))
    merge_inner = pd.merge(lefth,righth,left_on=['key1','key2'], right_index=True)
    print ('Inner Merged DataFrame: \n{}'.format(merge_inner))
    merge_outer = pd.merge(lefth,righth,left_on=['key1','key2'], right_index=True, how='outer')
    print ('Outer Merged DataFrame: \n{}'.format(merge_outer))

    left2 = DataFrame([[1., 2.], [3., 4.], [5., 6.]],
                      index=['a', 'c', 'e'],
                      columns=['Ohio', 'Nevada'])
    right2 = DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
                       index=['b', 'c', 'd', 'e'],
                       columns=['Missouri','Alabama'])
    print ('Data Frame 1: ', left2)
    print ('Data Frame 2: ', right2)
    merge = pd.merge(left2, right2, how='outer', left_index=True, right_index=True)
    print ('Merge: \n{}'.format(merge))
Example #29
def station_summary(pair_counts: DataFrame or None=None) -> DataFrame:
    if pair_counts is None:
        pair_counts = station_pair_counts()
    info = station_info()
    lat_lng = info[['lat', 'lng']]

    def rename_lat_lng_columns(prefix):
        def do_rename(column):
            if column == 'lat' or column == 'lng':
                return "{}_{}".format(prefix, column)
            else:
                return column

        return do_rename

    with_min_st_coords = merge(pair_counts, lat_lng, how='left', left_on='min_st', right_index=True)
    with_min_st_coords.dropna(inplace=True)
    # http://stackoverflow.com/questions/11346283/renaming-columns-in-pandas
    new_column_names = map(rename_lat_lng_columns("min"), with_min_st_coords.columns)
    with_min_st_coords.columns = new_column_names

    with_both_coords = merge(with_min_st_coords, lat_lng, how='left', left_on='max_st', right_index=True)
    with_both_coords.dropna(inplace=True)
    new_column_names = map(rename_lat_lng_columns("max"), with_both_coords.columns)
    with_both_coords.columns = new_column_names

    return with_both_coords
Example #30
def load_distractors_usage(length=None, by_attempt=True):
    cf = load_confusing_factor()

    def _apply(g):
        g['ratio'] = g['value'] / g['value'].sum()
        return g
    cf = cf.groupby(['experiment_setup_name', 'item']).apply(_apply).reset_index().sort_values(by=['experiment_setup_name', 'item', 'ratio'], ascending=False)
    cf['ratio_rank'] = cf.groupby([
        'experiment_setup_name',
        'item'
    ]).cumcount()
    answers = load_non_reference_answers()
    answers['attempt'] = answers.groupby([
        'experiment_setup_name',
        'user_id',
        'context_name',
        'term_type',
    ]).cumcount()
    if length is not None:
        answers = answers[answers['attempt'] < length]
    answers = pandas.merge(answers, load_options().rename(columns={'answer_id': 'id'}), on=['id', 'item_asked_id', 'experiment_setup_name', 'experiment_setup_id'], how='inner')[['item_asked_id', 'experiment_setup_name', 'attempt', 'item_option_id']]
    answers = pandas.merge(
        answers,
        cf[['experiment_setup_name', 'item', 'other', 'ratio_rank']].rename(columns={'item': 'item_asked_id', 'other': 'item_option_id', 'ratio_rank': 'confusing_rank'}),
        on=['experiment_setup_name', 'item_asked_id', 'item_option_id'], how='inner')

    def _apply(group):
        total = len(group)
        return group.groupby('confusing_rank').apply(lambda g: len(g) / total).reset_index().rename(columns={0: 'value'})
    groupby_add = ['attempt'] if by_attempt else []
    return answers.groupby(['experiment_setup_name'] + groupby_add).apply(_apply).reset_index()
Example #31
# Ignore the totals row and keep only the state rows
pad = pad[1:36]
# Keep only the columns of interest
pad = pad[[pad.columns[0], '2015', '2018']]
# Rename the columns to their keys as defined in the dataset dictionary
pad = pad.rename(columns={
    pad.columns[0]: 'EDO',
    '2015': 'DET.TOT.15',
    '2018': 'DET.TOT.18'
})

# Partial result of DS_HIPERTENSION
print('DS_HIPERTENSION', pad)

# ## Joining the previous datasets
todos = pd.merge(db, hp, on='EDO')
todos = pd.merge(todos, pad, on='EDO')

# Final result of this part
print('resultado de merges', todos)

# Strip whitespace from the EDO column
todos['EDO'] = todos['EDO'].str.strip()

# Make EDO the index
todos = todos.set_index('EDO')

# Add up the rows for the State of Mexico
todos.loc['México Oriente'] += todos.loc['México Poniente']

# Drop the already-summed row
from collections import Counter
from scipy.sparse.construct import hstack
from sklearn.model_selection import train_test_split
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.metrics.ranking import roc_auc_score
from sklearn.preprocessing.data import OneHotEncoder
import numpy as np
import pandas as pd
from sklearn import metrics
data=pd.read_csv('train_agg.csv')
data.apply(lambda x:x.replace('"',''))
data.to_csv('data.csv',index=False)
del data
train_agg=pd.read_csv('data.csv',delimiter='\t')
train_flg=pd.read_csv('train_flg.csv',delimiter='\t')
train_agg=pd.merge(train_agg,train_flg,on='USRID')
train_log=pd.read_csv('train_log.csv',delimiter='\t')
test_agg=pd.read_csv('test_agg.csv',delimiter='\t')
test_log=pd.read_csv('test_log.csv',delimiter='\t')


len_train=len(train_agg)
merge_log=train_log.append(test_log)

def data_process(data):
    data.TCH_TYP.replace(2,1,inplace=True)
data_process(merge_log)   
def cut1(group):
    return group.split('-')[0]
def cut2(group):
    return group.split('-')[1]
WAVE_TOP = 8410  # Angstrom
WAVE_BOTTOM = 8790
WAVERANGE = np.linspace(WAVE_TOP, WAVE_BOTTOM, 1000)
star_data = pd.DataFrame(fits.getdata(DATA_DIR + 'data_stars.fits', 1))
# spectra = pd.read_csv(DATA_DIR + 'spectra_fits_raw.csv').iloc[:, :-1]
spectral_data_raw = pd.DataFrame(np.load(
    'spectra_fits_raw.npy', allow_pickle=True),
    columns=['flux', 'flux_error', 'rave_obs_id'])

# Filter the rows that have more flux values than 1000 and separate them
# for easier handling
spectra = pd.concat([pd.DataFrame(spectral_data_raw['flux'].to_list()).iloc[:, 0:950],
                     spectral_data_raw['rave_obs_id']], axis=1).dropna()
spectra_error = pd.concat([pd.DataFrame(spectral_data_raw['flux_error'].to_list()).iloc[0:950],
                           spectral_data_raw['rave_obs_id']], axis=1).dropna()
spectra_all_data = pd.merge(spectra, star_data, on='rave_obs_id', how='inner')
analysis_line1 = pd.read_csv(DATA_DIR + 'analysis_results_line_1.csv')
analysis_line2 = pd.read_csv(DATA_DIR + 'analysis_results_line_2.csv')
analysis_line3 = pd.read_csv(DATA_DIR + 'analysis_results_line_3.csv')

# ______________________ Filters for data ______________________________________

common_ids = np.intersect1d(analysis_line3.rave_obs_id, np.intersect1d(
    analysis_line1.rave_obs_id, analysis_line2.rave_obs_id))

# We keep the same stars for a fair comparison of the algorithm's performance
# Drop the first two columns as they are byproducts of the filtering and we don't need them
analysis_line1_cm = analysis_line1[analysis_line1['rave_obs_id'].isin(
    common_ids)].reset_index().drop(['index', 'Unnamed: 0'], axis=1)
analysis_line2_cm = analysis_line2[analysis_line2['rave_obs_id'].isin(
    common_ids)].reset_index().drop(['index', 'Unnamed: 0'], axis=1)
Example #34
def make_figure(df, pa):
    df_ls = df.copy()

    durations = df_ls[pa["xvals"]]
    event_observed = df_ls[pa["yvals"]]

    km = KaplanMeierFitter()  ## instantiate the class to create an object

    pl = None
    fig = plt.figure(frameon=False,
                     figsize=(float(pa["fig_width"]), float(pa["fig_height"])))

    ## Fit the data into the model

    if str(pa["groups_value"]) == "None":
        km.fit(durations, event_observed, label='Kaplan Meier Estimate')

        df_survival = km.survival_function_
        df_conf = km.confidence_interval_
        df_event = km.event_table

        df = pd.merge(df_survival,
                      df_conf,
                      how='left',
                      left_index=True,
                      right_index=True)
        df = pd.merge(df,
                      df_event,
                      how='left',
                      left_index=True,
                      right_index=True)

        df['time'] = df.index.tolist()
        df = df.reset_index(drop=True)
        df = df[[
            "time", "at_risk", "removed", "observed", "censored", "entrance",
            "Kaplan Meier Estimate", "Kaplan Meier Estimate_lower_0.95",
            "Kaplan Meier Estimate_upper_0.95"
        ]]

        pa_ = {}
        for arg in [
                "Conf_Interval", "show_censors", "ci_legend", "ci_force_lines",
                "left_axis", "right_axis", "upper_axis", "lower_axis",
                "tick_left_axis", "tick_right_axis", "tick_upper_axis",
                "tick_lower_axis"
        ]:
            if pa[arg] in ["off", ".off"]:
                pa_[arg] = False
            else:
                pa_[arg] = True

        if str(pa["markerc_write"]) != "":
            pa_["marker_fc"] = pa["markerc_write"]
        else:
            pa_["marker_fc"] = pa["markerc"]

        if str(pa["edgecolor_write"]) != "":
            pa_["marker_ec"] = pa["edgecolor_write"]
        else:
            pa_["marker_ec"] = pa["edgecolor"]

        if str(pa["grid_color_text"]) != "":
            pa_["grid_color_write"] = pa["grid_color_text"]
        else:
            pa_["grid_color_write"] = pa["grid_color_value"]

        pl=km.plot(show_censors=pa_["show_censors"], \
                censor_styles={"marker":marker_dict[pa["censor_marker_value"]], "markersize":float(pa["censor_marker_size_val"]), "markeredgecolor":pa_["marker_ec"], "markerfacecolor":pa_["marker_fc"], "alpha":float(pa["marker_alpha"])}, \
               ci_alpha=float(pa["ci_alpha"]), \
               ci_force_lines=pa_["ci_force_lines"], \
               ci_show=pa_["Conf_Interval"], \
               ci_legend=pa_["ci_legend"], \
               linestyle=pa["linestyle_value"], \
               linewidth=float(pa["linewidth_write"]), \
               color=pa["line_color_value"])

        pl.spines['right'].set_visible(pa_["right_axis"])
        pl.spines['top'].set_visible(pa_["upper_axis"])
        pl.spines['left'].set_visible(pa_["left_axis"])
        pl.spines['bottom'].set_visible(pa_["lower_axis"])

        pl.spines['right'].set_linewidth(pa["axis_line_width"])
        pl.spines['left'].set_linewidth(pa["axis_line_width"])
        pl.spines['top'].set_linewidth(pa["axis_line_width"])
        pl.spines['bottom'].set_linewidth(pa["axis_line_width"])

        pl.tick_params(axis="both",
                       direction=pa["ticks_direction_value"],
                       length=float(pa["ticks_length"]))

        pl.tick_params(axis='x',
                       which='both',
                       bottom=pa_["tick_lower_axis"],
                       top=pa_["tick_upper_axis"],
                       labelbottom=pa_["lower_axis"],
                       labelrotation=float(pa["xticks_rotation"]),
                       labelsize=float(pa["xticks_fontsize"]))

        pl.tick_params(axis='y',
                       which='both',
                       left=pa_["tick_left_axis"],
                       right=pa_["tick_right_axis"],
                       labelleft=pa_["left_axis"],
                       labelrotation=float(pa["yticks_rotation"]),
                       labelsize=float(pa["yticks_fontsize"]))

        if str(pa["grid_value"]) != "None":
            pl.grid(True,
                    which='both',
                    axis=pa["grid_value"],
                    color=pa_["grid_color_write"],
                    linewidth=float(pa["grid_linewidth"]))

        if str(pa["x_lower_limit"]) != "" and str(pa["x_upper_limit"]) != "":
            pl.set_xlim(float(pa["x_lower_limit"]), float(pa["x_upper_limit"]))
        if str(pa["y_lower_limit"]) != "" and str(pa["y_upper_limit"]) != "":
            pl.set_ylim(float(pa["y_lower_limit"]), float(pa["y_upper_limit"]))

        pl.set_title(pa["title"], fontdict={'fontsize': float(pa['titles'])})
        pl.set_xlabel(pa["xlabel"],
                      fontdict={'fontsize': float(pa['xlabels'])})
        pl.set_ylabel(pa["ylabel"],
                      fontdict={'fontsize': float(pa['ylabels'])})

        return df, pl

    elif str(pa["groups_value"]) != "None":

        df_long = pd.DataFrame(
            columns=['day', 'status', str(pa["groups_value"])])

        for row in range(0, len(df_ls)):

            if int(df_ls.loc[row, pa["yvals"]]) >= 1:
                dead = int(df_ls.loc[row, pa["yvals"]])
                #print(dead)
                for i in range(0, dead):
                    #print(i)
                    df_long = df_long.append(
                        {
                            'day':
                            int(df_ls.loc[row, pa["xvals"]]),
                            'status':
                            1,
                            str(pa["groups_value"]):
                            str(df_ls.loc[row, pa["groups_value"]])
                        },
                        ignore_index=True)
                    i = i + 1

            elif int(df_ls.loc[row, pa["censors_val"]]) >= 1:
                censored = int(df_ls.loc[row, pa["censors_val"]])
                #print(censored)
                for c in range(0, censored):
                    #print(c)
                    df_long = df_long.append(
                        {
                            'day':
                            int(df_ls.loc[row, pa["xvals"]]),
                            'status':
                            0,
                            str(pa["groups_value"]):
                            str(df_ls.loc[row, pa["groups_value"]])
                        },
                        ignore_index=True)
                    c = c + 1

        df_dummy = pd.get_dummies(df_long,
                                  drop_first=True,
                                  columns=[pa["groups_value"]])

        results = logrank_test(df_dummy.loc[df_dummy['status'] == 1,
                                            'day'].tolist(),
                               df_dummy.loc[df_dummy['status'] == 0,
                                            'day'].tolist(),
                               df_dummy.loc[df_dummy['status'] == 1,
                                            'status'].tolist(),
                               df_dummy.loc[df_dummy['status'] == 0,
                                            'status'].tolist(),
                               alpha=.99)

        cph = CoxPHFitter()
        cph.fit(df_dummy, duration_col='day', event_col='status')

        cph_coeff = cph.summary
        cph_coeff = cph_coeff.reset_index()

        df_info = {}
        df_info['model'] = 'lifelines.CoxPHFitter'
        df_info['duration col'] = cph.duration_col
        df_info['event col'] = cph.event_col
        df_info['baseline estimation'] = 'breslow'
        df_info['number of observations'] = cph._n_examples
        df_info['number of events observed'] = len(
            df_dummy.loc[df_dummy['status'] == 1, ])
        df_info['partial log-likelihood'] = cph.log_likelihood_
        df_info['Concordance'] = cph.concordance_index_
        df_info['Partial AIC'] = cph.AIC_partial_
        df_info['log-likelihood ratio test'] = cph.log_likelihood_ratio_test(
        ).test_statistic
        df_info[
            'P.value(log-likelihood ratio test)'] = cph.log_likelihood_ratio_test(
            ).p_value
        df_info['log rank test'] = results.test_statistic
        df_info['P.value(log rank test)'] = results.p_value

        cph_stats = pd.DataFrame(df_info.items())
        cph_stats = cph_stats.rename(columns={0: 'Statistic', 1: 'Value'})
        #cph_stats

        tmp = []

        for cond in pa["list_of_groups"]:
            df_tmp = df_ls.loc[df_ls[pa["groups_value"]] == cond]

            km.fit(df_tmp[pa["xvals"]], df_tmp[pa["yvals"]], label=cond)

            df_survival = km.survival_function_
            df_conf = km.confidence_interval_
            df_event = km.event_table

            df = pd.merge(df_survival,
                          df_conf,
                          how='left',
                          left_index=True,
                          right_index=True)
            df = pd.merge(df,
                          df_event,
                          how='left',
                          left_index=True,
                          right_index=True)

            df['time'] = df.index.tolist()
            df = df.reset_index(drop=True)
            df = df.rename(
                columns={
                    "at_risk": cond + "_at_risk",
                    "removed": cond + "_removed",
                    "observed": cond + "_observed",
                    "censored": cond + "_censored",
                    "entrance": cond + "_entrance",
                    cond: cond + "_KMestimate"
                })

            df = df[[
                "time", cond + "_at_risk", cond + "_removed",
                cond + "_observed", cond + "_censored", cond + "_entrance",
                cond + "_KMestimate", cond + "_lower_0.95",
                cond + "_upper_0.95"
            ]]
            tmp.append(df)

            df = reduce(lambda df1, df2: pd.merge(df1, df2, on='time'), tmp)

            PA_ = [g for g in pa["groups_settings"] if g["name"] == cond][0]

            if str(PA_["linecolor_write"]) != "":
                linecolor = PA_["linecolor_write"]
            else:
                linecolor = PA_["line_color_value"]

            if str(PA_["linestyle_write"]) != "":
                linestyle = PA_["linestyle_write"]
            else:
                linestyle = PA_["linestyle_value"]

            if str(PA_["markerc_write"]) != "":
                markerColor = PA_["markerc_write"]
            else:
                markerColor = PA_["markerc"]

            if str(PA_["edgecolor_write"]) != "":
                edgeColor = PA_["edgecolor_write"]
            else:
                edgeColor = PA_["edgecolor"]

            if PA_["show_censors"] in ["off", ".off"]:
                showCensors = False
            else:
                showCensors = True

            if PA_["Conf_Interval"] in ["off", ".off"]:
                ConfidenceInterval = False
            else:
                ConfidenceInterval = True

            if PA_["ci_legend"] in ["off", ".off"]:
                CI_legend = False
            else:
                CI_legend = True

            if PA_["ci_force_lines"] in ["off", ".off"]:
                CI_lines = False
            else:
                CI_lines = True

            linewidth = PA_["linewidth_write"]
            edgeLineWidth = PA_["edge_linewidth"]
            markerSize = PA_["censor_marker_size_val"]

            markerAlpha = PA_["marker_alpha"]
            CI_alpha = PA_["ci_alpha"]
            markerVal = PA_["censor_marker_value"]

            pa_ = {}
            for arg in [
                    "left_axis", "right_axis", "upper_axis", "lower_axis",
                    "tick_left_axis", "tick_right_axis", "tick_upper_axis",
                    "tick_lower_axis"
            ]:
                if pa[arg] in ["off", ".off"]:
                    pa_[arg] = False
                else:
                    pa_[arg] = True

            if str(pa["grid_color_text"]) != "":
                pa_["grid_color_write"] = pa["grid_color_text"]
            else:
                pa_["grid_color_write"] = pa["grid_color_value"]

            pl=km.plot(show_censors=showCensors, \
                censor_styles={"marker":marker_dict[markerVal], "markersize":float(markerSize), "markeredgecolor":edgeColor, "markerfacecolor":markerColor, "alpha":float(markerAlpha), "mew":float(edgeLineWidth)}, \
                ci_alpha=float(CI_alpha), \
                ci_force_lines=CI_lines, \
                ci_show=ConfidenceInterval, \
                ci_legend=CI_legend, \
                linestyle=linestyle, \
                linewidth=float(linewidth), \
                color=linecolor)

            pl.spines['right'].set_visible(pa_["right_axis"])
            pl.spines['top'].set_visible(pa_["upper_axis"])
            pl.spines['left'].set_visible(pa_["left_axis"])
            pl.spines['bottom'].set_visible(pa_["lower_axis"])

            pl.spines['right'].set_linewidth(pa["axis_line_width"])
            pl.spines['left'].set_linewidth(pa["axis_line_width"])
            pl.spines['top'].set_linewidth(pa["axis_line_width"])
            pl.spines['bottom'].set_linewidth(pa["axis_line_width"])

            pl.tick_params(axis="both",
                           direction=pa["ticks_direction_value"],
                           length=float(pa["ticks_length"]))

            pl.tick_params(axis='x',
                           which='both',
                           bottom=pa_["tick_lower_axis"],
                           top=pa_["tick_upper_axis"],
                           labelbottom=pa_["lower_axis"],
                           labelrotation=float(pa["xticks_rotation"]),
                           labelsize=float(pa["xticks_fontsize"]))

            pl.tick_params(axis='y',
                           which='both',
                           left=pa_["tick_left_axis"],
                           right=pa_["tick_right_axis"],
                           labelleft=pa_["left_axis"],
                           labelrotation=float(pa["yticks_rotation"]),
                           labelsize=float(pa["yticks_fontsize"]))

            if str(pa["grid_value"]) != "None":
                pl.grid(True,
                        which='both',
                        axis=pa["grid_value"],
                        color=pa_["grid_color_write"],
                        linewidth=float(pa["grid_linewidth"]))

            if str(pa["x_lower_limit"]) != "" and str(
                    pa["x_upper_limit"]) != "":
                pl.set_xlim(float(pa["x_lower_limit"]),
                            float(pa["x_upper_limit"]))
            if str(pa["y_lower_limit"]) != "" and str(
                    pa["y_upper_limit"]) != "":
                pl.set_ylim(float(pa["y_lower_limit"]),
                            float(pa["y_upper_limit"]))

            pl.set_title(pa["title"],
                         fontdict={'fontsize': float(pa['titles'])})
            pl.set_xlabel(pa["xlabel"],
                          fontdict={'fontsize': float(pa['xlabels'])})
            pl.set_ylabel(pa["ylabel"],
                          fontdict={'fontsize': float(pa['ylabels'])})

        return df, pl, cph_coeff, cph_stats
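# A minimal sketch of the per-group merge above, using synthetic durations and
# events (assumes lifelines and pandas are installed): KaplanMeierFitter exposes
# the survival curve, its confidence band and the event table on one time index.
import pandas as pd
from lifelines import KaplanMeierFitter

km_demo = KaplanMeierFitter()
km_demo.fit([5, 6, 6, 2, 4, 4, 3, 8], [1, 0, 1, 1, 1, 0, 1, 1], label="groupA")
demo = pd.merge(km_demo.survival_function_, km_demo.confidence_interval_,
                how='left', left_index=True, right_index=True)
demo = pd.merge(demo, km_demo.event_table,
                how='left', left_index=True, right_index=True)
print(demo.reset_index().head())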
Exemple #35
0
def raw2meta_extract(fn):
	"""
	Reads raw2 files including GPS and engineering information

	Parameters
	----------
	fn : string
		Path and filename of *.raw2 file

	Returns
	-------
	data : pandas DataFrame
		CTD (Salinity, Temperature, Fluorescence, Pressure), Pitch and Roll, Compass information
	gps : pandas DataFrame
		GPS position information
	zoog : pandas DataFrame
		Zoocam grayscale values

	"""
	pgain = 0.04
	poff = -10
	tgain = 0.001
	toff = -5
	sgain = 0.001
	soff = -1
	delta_t = 8
			
	
	#get file index
	print(time.ctime() + ": Processing "+fn)		
	print(time.ctime() + ": Generating file index...")		
	with open(fn) as f:
		list2 = [row.split()[0] for row in f]
	
	##########################################
	#read files
	##########################################
	
	f = open(fn)
	raw2 = f.readlines()
	f.close()
	
	print(time.ctime() + ": Loading CF_DIVE")		
	
	##########################################
	# CF_DIVE 0F
	##########################################
	
		
	'''
	This packet marks the present:
		Nsurf = Dive-Set Number 
		Ncyc = Cycle Number
		Npro = the profile number
		uxti0 = the UNIX time that the Dive-Set began
		uxti1 = The Unix time this specific cycle began
		
		For the 0901 code, the Dive-Set Number is only incremented after 
		surface communications (GPS and SBD) are attempted (multiple cycles 
		between surface communications will not increment the Dive-Set 
		Number, but will increment the Cycle Number).  
		This packet should be used to set Nsurf, Ncyc, Npro for all 
		proceeding packets, until the next CF_DIVE packet is encountered.  
	'''
	
	cf_dive_idx = [i for i, j in enumerate(list2) if j == '0f']
	cf_dive_raw = [raw2[i].split() for i in cf_dive_idx]
	cf_dive = pd.DataFrame(cf_dive_raw)
	cf_dive = cf_dive.iloc[:,1:]
	
	cf_dive.columns = ['Nsurf','Ncyc','Npro','uxti0','uxti1','Dow','Month',
					   'day','Time','Year']
	cf_dive = cf_dive.astype(dtype = {'Nsurf':'int64','Ncyc':'int64',
									  'Npro':'int64','uxti0':'int64',
									  'uxti1':'int64'})
	
	##########################################			
	# CF_PDAT  11
	##########################################
	
	print(time.ctime() + ": Loading CF_PDAT")
	edat_idx = [i for i, j in enumerate(list2) if j == '11']
	edat_raw = [raw2[i].split() for i in edat_idx]
	edat = pd.DataFrame(edat_raw)
	edat = edat.iloc[:,1:9]
	edat.columns = ['Nsurf','Ncyc','Npro','time','pressure','temperature',
					'salinity','fluorescence']
	edat = edat.astype(dtype = {'Nsurf':'int64','Ncyc': 'int64','Npro': 'int64',
								'time':'float','pressure':'float',
								'temperature':'float','salinity':'float',
								'fluorescence':'float'} )
	edat['pressure']=edat['pressure'] * pgain + poff #pressure as a double; step 1 of conversion
	#still need to find pmin and do p=p-pmin to convert to dBar	
	sal_cond = edat['salinity'] > 0
	edat.loc[sal_cond, 'salinity'] = edat.loc[sal_cond,'salinity']  * sgain + soff
	sal_cond = edat['temperature'] > 0
	edat.loc[sal_cond, 'temperature'] = edat.loc[sal_cond,'temperature']  * tgain + toff
	
	for var in ['salinity','temperature','fluorescence']:
		cond = edat[var] <= 0
		edat.loc[cond, var] = float('nan')
	
	edat = pd.merge(edat,cf_dive)
	edat['Dive_start_time'] = pd.to_datetime(edat.uxti0, unit='s')
	edat['Dive_start_time'] = edat['Dive_start_time'].dt.tz_localize('UTC')
	#add time_of_measure
	edat['time_of_measure'] = edat['Dive_start_time'] + pd.to_timedelta(edat['time'], unit='s')
	#edat.time_of_measure = edat.time_of_measure.dt.tz_localize('UTC')
	edat['time_of_measure_PDT'] = edat.time_of_measure - pd.to_timedelta(delta_t, unit='hours') #transform to local time as defined -8 hours not ST
	#correct pressure
	edat['pressure'] = edat.pressure - edat.pressure.min() #Correct pressure
	
	##########################################				
	#CF_EDAT 21
	##########################################
	
	pr_idx = [i for i, j in enumerate(list2) if j == '21']
	pr_raw = [raw2[i].split() for i in pr_idx]
	pr = pd.DataFrame(pr_raw)
	pr = pr.iloc[:,1:7]
	pr.columns = ['Nsurf','Ncyc','Npro','compass','pitch','roll']
	pr = pr.astype(dtype = {'Nsurf':'int64','Ncyc': 'int64',
							'Npro': 'int64','compass':'float',
							'pitch':'float','roll':'float'})
	pr.loc[:,['compass','pitch', 'roll']] /= 10
	
	print(time.ctime() + "Loading CF_GPS1")
	
	##########################################
	#CF_GPS1--start of dive-set 01
	##########################################
	
	gps1_idx = [i for i, j in enumerate(list2) if j == '01']
	gps1_raw = [raw2[i].split() for i in gps1_idx]
	gps1 = pd.DataFrame(gps1_raw)
	gps1 = gps1.iloc[:,[1,3,4,5,6,13]]
	gps1.columns = ['Nsurf_start','year','yr_day_start','lat_start', 'lon_start',
					'UTC_time_fix_start']
	gps1 = gps1.astype(dtype = {'Nsurf_start':'int64', 'year':'int64',
								'yr_day_start':'float','lat_start': 'float', 
								'lon_start': 'float'})
	
	base_date = pd.to_datetime(gps1['year'].astype('str') + '/01/01 00:00:00')
	
	gps1['UTC_time_fix_start'] = base_date + pd.to_timedelta((gps1['yr_day_start']-1).astype('str') + ' days')
	
	print(time.ctime() + ": Loading CF_GPS2")
	
	##########################################
	#CF_GPS2--end of dive-set 02
	##########################################
	
	gps2_idx = [i for i, j in enumerate(list2) if j == '02']
	gps2_raw = [raw2[i].split() for i in gps2_idx]
	gps2 = pd.DataFrame(gps2_raw)
	gps2 = gps2.iloc[:,[1,3,4,5,6,13]]
	gps2.columns = ['Nsurf_end', 'year','yr_day_end',
					  'lat_end', 'lon_end','UTC_time_fix_end']
	gps2 = gps2.astype(dtype = {'Nsurf_end':'int64', 'year':'int64',
								'yr_day_end':'float','lat_end': 'float', 
								'lon_end': 'float'})
	
	base_date = pd.to_datetime(gps2['year'].astype('str') + '/01/01 00:00:00')
	
	gps2['UTC_time_fix_end'] = base_date + pd.to_timedelta((gps2['yr_day_end']-1).astype('str') + ' days')
	print(time.ctime() + "Loading CF_ZOOG")
	
	##########################################
	#CF_ZOOG this is the zooglider grayscale value
	##########################################
	
	zoog_idx = [i for i, j in enumerate(list2) if j == 'b4']
	zoog_raw = [raw2[i].split() for i in zoog_idx]
	zoog = pd.DataFrame(zoog_raw)
	#dt = pd.to_datetime(zoog.iloc[:,7] +' '+ zoog.iloc[:,8] + ' ' + zoog.iloc[:,9] + 
	#					' ' + zoog.iloc[:,10] + ' ' + zoog.iloc[:,11])
	zoog = zoog.iloc[:,[1,2,3,4,5,6]]
	#zoog['date'] = dt
	zoog.columns = ['zstart', 'zstop','n_img','n_err', 'avg','unix_tstamp']
	zoog = zoog.astype(dtype = {'zstart':'int64', 'zstop':'int64','n_img':'int64','n_err': 'int64', 'avg': 'float', 'unix_tstamp':'float'})
	zoog['UTC_time'] = pd.to_datetime(zoog.unix_tstamp, unit='s')
	zoog.UTC_time = zoog.UTC_time.dt.tz_localize('UTC')
	zoog['PDT_time'] = zoog.UTC_time - pd.to_timedelta(delta_t, unit='hours')

	##########################################
	#Export
	##########################################
	
	print(time.ctime() + ": Preparing data for export")
	
	##GPS
	gps = pd.merge(gps1, gps2, left_on = 'Nsurf_start', right_on = 'Nsurf_end')
	gps = gps[['Nsurf_start','Nsurf_end', 'UTC_time_fix_start', 'UTC_time_fix_end','lon_start', 'lon_end', 'lat_start', 'lat_end']]
	
	## Data
	data = pd.concat([edat, pr.iloc[:,3:]], sort=False,axis=1)
	#only keep important info
	data = data[['Nsurf','Ncyc','Npro', 'pitch', 'roll', 'compass','pressure','temperature','salinity','fluorescence','uxti0','Dive_start_time','time_of_measure','time_of_measure_PDT']]
	print(time.ctime() + ": Completed")
	return data, gps, zoog
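# A minimal sketch of the record-type parsing used in raw2meta_extract, run on an
# in-memory stand-in for a *.raw2 file; the two data columns per record here are
# hypothetical, real files carry many more fields.
import io
import pandas as pd

raw2_demo = io.StringIO(
    "0f 1 2 3 1600000000 1600000100\n"
    "11 1 2 3 12.5 250\n"
    "11 1 2 3 13.0 260\n"
)
lines = raw2_demo.read().splitlines()
codes = [row.split()[0] for row in lines]                      # record-type code per line
pdat_rows = [lines[i].split() for i, c in enumerate(codes) if c == '11']
pdat = pd.DataFrame(pdat_rows).iloc[:, 1:]                     # drop the code column
print(pdat)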
def reconstruct():
    """
    Reconstruct surge with an OLS regression on principal components of the lagged predictors.
    """
    #import packages
    import os
    import pandas as pd
    import statsmodels.api as sm
    from datetime import datetime
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler

    #defining directories
    dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged"
    dir_out = "/lustre/fs0/home/mtadesse/mlrReconstruction"
    surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef"

    #cd to the lagged predictors directory
    os.chdir(dir_in)

    x = 108
    y = 109

    #looping through
    for tg in range(x, y):

        os.chdir(dir_in)

        tg_name = os.listdir()[tg]
        print(tg, tg_name)

        #load predictor
        pred = pd.read_csv(tg_name)
        pred.drop('Unnamed: 0', axis=1, inplace=True)

        #add squared and cubed wind terms (as in WPI model)
        pickTerms = lambda x: x.startswith('wnd')
        wndTerms = pred.columns[list(map(pickTerms, pred.columns))]
        wnd_sqr = pred[wndTerms]**2
        wnd_cbd = pred[wndTerms]**3
        pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis=1)

        #standardize predictor data
        dat = pred.iloc[:, 1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat), \
                                        columns = dat.columns)
        pred_standardized = pd.concat([pred['date'], dat_standardized], axis=1)

        #load surge data
        os.chdir(surge_path)
        surge = pd.read_csv(tg_name)
        surge.drop('Unnamed: 0', axis=1, inplace=True)

        #remove duplicated surge rows
        surge.drop(surge[surge['ymd'].duplicated()].index,
                   axis=0,
                   inplace=True)
        surge.reset_index(inplace=True)
        surge.drop('index', axis=1, inplace=True)

        #adjust surge time format to match that of pred
        time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d'))
        surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])),
                                  columns=['date'])
        time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]],
                              axis=1)

        #merge predictors and surge to find common time frame
        pred_surge = pd.merge(pred_standardized,
                              surge_new.iloc[:, :2],
                              on='date',
                              how='right')
        pred_surge.sort_values(by='date', inplace=True)

        #find rows that have nans and remove them
        row_nan = pred_surge[pred_surge.isna().any(axis=1)]
        pred_surge.drop(row_nan.index, axis=0, inplace=True)
        pred_surge.reset_index(inplace=True)
        pred_surge.drop('index', axis=1, inplace=True)

        #in case pred and surge don't overlap
        if pred_surge.shape[0] == 0:
            print('-' * 80)
            print("Predictors and Surge don't overlap")
            print('-' * 80)
            continue


        pred_surge['date'] = pd.DataFrame(list(map(time_stamp, \
                                                   pred_surge['date'])), \
                                          columns = ['date'])

        #prepare data for training/testing
        X = pred_surge.iloc[:, 1:-1]
        y = pd.DataFrame(pred_surge['surge'])
        y = y.reset_index()
        y.drop(['index'], axis=1, inplace=True)

        #apply PCA
        pca = PCA(.95)
        pca.fit(X)
        X_pca = pca.transform(X)

        {
            # #apply 10 fold cross validation
            # kf = KFold(n_splits=10, random_state=29)

            # metric_corr = []; metric_rmse = []; #combo = pd.DataFrame(columns = ['pred', 'obs'])
            # for train_index, test_index in kf.split(X):
            #     X_train, X_test = X_pca[train_index], X_pca[test_index]
            #     y_train, y_test = y['surge'][train_index], y['surge'][test_index]

            #     #train regression model
            #     lm = LinearRegression()
            #     lm.fit(X_train, y_train)

            #     #predictions
            #     predictions = lm.predict(X_test)
            #     # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)), \
            #     #                       pd.DataFrame(np.array(y_test))], \
            #     #                      axis = 1)
            #     # pred_obs.columns = ['pred', 'obs']
            #     # combo = pd.concat([combo, pred_obs], axis = 0)

            #     #evaluation matrix - check p value
            #     if stats.pearsonr(y_test, predictions)[1] >= 0.05:
            #         print("insignificant correlation!")
            #         continue
            #     else:
            #         #print(stats.pearsonr(y_test, predictions))
            #         metric_corr.append(stats.pearsonr(y_test, predictions)[0])
            #         #print(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
            #         metric_rmse.append(np.sqrt(metrics.mean_squared_error(y_test, predictions)))

            # # #number of years used to train/test model
            # num_years = np.ceil((pred_surge['date'][pred_surge.shape[0]-1] -\
            #                       pred_surge['date'][0]).days/365)
            # longitude = surge['lon'][0]
            # latitude = surge['lat'][0]
            # num_pc = X_pca.shape[1] #number of principal components
            # corr = np.mean(metric_corr)
            # rmse = np.mean(metric_rmse)

            # print('num_year = ', num_years, ' num_pc = ', num_pc ,'avg_corr = ',\
            #       np.mean(metric_corr), ' -  avg_rmse (m) = ', \
            #       np.mean(metric_rmse), '\n')
        }

        num_pc = X_pca.shape[1]  #number of principal components
        longitude = surge['lon'][0]
        latitude = surge['lat'][0]

        #surge reconstruction
        pred_for_recon = pred[~pred.isna().any(axis=1)]
        pred_for_recon = pred_for_recon.reset_index().drop('index', axis=1)

        #standardize predictor data
        dat = pred_for_recon.iloc[:, 1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat), \
                                        columns = dat.columns)
        pred_standardized = pd.concat(
            [pred_for_recon['date'], dat_standardized], axis=1)

        X_recon = pred_standardized.iloc[:, 1:]

        #apply PCA
        pca = PCA(num_pc)  #use the same number of PCs used for training
        pca.fit(X_recon)
        X_pca_recon = pca.transform(X_recon)

        #model preparation
        #first train model using observed surge and corresponding predictors
        X_pca = sm.add_constant(X_pca)
        est = sm.OLS(y['surge'], X_pca).fit()

        #predict with X_recon and get 95% prediction interval
        X_pca_recon = sm.add_constant(X_pca_recon)
        predictions = est.get_prediction(X_pca_recon).summary_frame(alpha=0.05)

        #drop confidence interval and mean_se columns
        predictions.drop(['mean_se', 'mean_ci_lower','mean_ci_upper'], \
                         axis = 1, inplace = True)

        #final dataframe
        final_dat = pd.concat([pred_standardized['date'], predictions], axis=1)
        final_dat['lon'] = longitude
        final_dat['lat'] = latitude
        final_dat.columns = ['date', 'surge_reconstructed', 'pred_int_lower',\
                             'pred_int_upper', 'lon', 'lat']

        {
            # plot - optional
            # time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
            # final_dat['date'] = pd.DataFrame(list(map(time_stamp, final_dat['date'])), columns = ['date'])
            # surge['date'] = pd.DataFrame(list(map(time_stamp, surge['date'])), columns = ['date'])
            # sns.set_context('notebook', font_scale = 2)
            # plt.figure()
            # plt.plot(final_dat['date'], final_dat['mean'], color = 'green')
            # plt.scatter(surge['date'], surge['surge'], color = 'blue')
            # prediction intervals
            # plt.plot(final_dat['date'], final_dat['obs_ci_lower'], color = 'red',  linestyle = "--", lw = 0.8)
            # plt.plot(final_dat['date'], final_dat['obs_ci_upper'], color = 'red',  linestyle = "--", lw = 0.8)
            # confidence intervals
            # plt.plot(final_dat['date'], final_dat['mean_ci_upper'], color = 'black',  linestyle = "--", lw = 0.8)
            # plt.plot(final_dat['date'], final_dat['mean_ci_lower'], color = 'black',  linestyle = "--", lw = 0.8)
        }

        #save df as csv - in case of interruption
        os.chdir(dir_out)
        final_dat.to_csv(tg_name)
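# A minimal sketch of the statsmodels step used above, on synthetic data: fit OLS
# on observed responses, then request 95% prediction intervals for new predictors
# with get_prediction().summary_frame().
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
X_obs = rng.normal(size=(100, 3))
y_obs = X_obs @ np.array([0.5, -0.2, 0.1]) + rng.normal(scale=0.1, size=100)

est_demo = sm.OLS(y_obs, sm.add_constant(X_obs)).fit()
X_new = sm.add_constant(rng.normal(size=(5, 3)))
pred_frame = est_demo.get_prediction(X_new).summary_frame(alpha=0.05)
# columns include mean, mean_se, mean_ci_lower/upper and obs_ci_lower/upper
print(pred_frame[['mean', 'obs_ci_lower', 'obs_ci_upper']])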
import pandas as pd
import os
import numpy as np
# Step 1: Getting held out samples' information.
scale_file = r'D:\WorkStation_2018\WorkStation_CNN_Schizo\Scale\10-24大表.xlsx'
included_subjects = r'D:\WorkStation_2018\WorkStation_dynamicFC_V1\Data\headmotion\included_subjects_from851database_ID.xlsx'
roi_signals_dir = r'D:\WorkStation_2018\WorkStation_dynamicFC_V3\Data\dfc_whole'; 
roi_all_signals_dir = r'D:\WorkStation_2018\WorkStation_dynamicFC_V1\Data\ROISignals_FumImgARWSFC_screened'

scale = pd.read_excel(scale_file)
included_subjects = pd.read_excel(included_subjects, header=None)

subjname = os.listdir(roi_signals_dir)
subjname = pd.Series(subjname)
subjname = subjname.str.findall(r'[1-9]\d*')
subjname = [int(sn[0]) for sn in subjname]
subjname = pd.DataFrame(subjname)

subjname_all = os.listdir(roi_all_signals_dir)
subjname_all = pd.Series(subjname_all)
subjname_all = subjname_all.str.findall(r'[1-9]\d*')
subjname_all = [int(sn[0]) for sn in subjname_all]
subjname_all = pd.DataFrame(subjname_all)

excluded_subj = pd.DataFrame((set(included_subjects[0]) - set(subjname[0])))
excluded_subj = scale[scale['folder'].isin(excluded_subj[0])]['folder']
describe = excluded_subj.value_counts()

excluded_subj_available = pd.merge(subjname_all, excluded_subj, left_on=0, right_on='folder', how='inner')
excluded_subj_available[0].to_csv(r'D:\WorkStation_2018\WorkStation_dynamicFC_V3\Data\ID_Scale_Headmotion\held_out_samples.txt',index=None, header=None)
def validate():
    """
    run KFOLD method for regression 
    """
    #import packages
    import os
    import numpy as np
    import pandas as pd
    from datetime import datetime
    from scipy import stats
    from sklearn import metrics
    from sklearn.decomposition import PCA
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import KFold
    from sklearn.preprocessing import StandardScaler

    #defining directories
    dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged"
    dir_out = "/lustre/fs0/home/mtadesse/merraLRValidation"
    surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef"

    #cd to the lagged predictors directory
    os.chdir(dir_in)

    x = 601
    y = 602

    #empty dataframe for model validation
    df = pd.DataFrame(columns = ['tg', 'lon', 'lat', 'num_year', \
                                 'num_95pcs','corrn', 'rmse'])

    #looping through
    for tg in range(x, y):

        os.chdir(dir_in)

        tg_name = os.listdir()[tg]
        print(tg, tg_name)

        ##########################################
        #check if this tg is already taken care of
        ##########################################
        os.chdir(dir_out)
        if os.path.isfile(tg_name):
            return "file already analyzed!"

        os.chdir(dir_in)

        #load predictor
        pred = pd.read_csv(tg_name)
        pred.drop('Unnamed: 0', axis=1, inplace=True)

        #add squared and cubed wind terms (as in WPI model)
        pickTerms = lambda x: x.startswith('wnd')
        wndTerms = pred.columns[list(map(pickTerms, pred.columns))]
        wnd_sqr = pred[wndTerms]**2
        wnd_cbd = pred[wndTerms]**3
        pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis=1)

        #standardize predictor data
        dat = pred.iloc[:, 1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat), \
                                        columns = dat.columns)
        pred_standardized = pd.concat([pred['date'], dat_standardized], axis=1)

        #load surge data
        os.chdir(surge_path)
        surge = pd.read_csv(tg_name)
        surge.drop('Unnamed: 0', axis=1, inplace=True)

        #remove duplicated surge rows
        surge.drop(surge[surge['ymd'].duplicated()].index,
                   axis=0,
                   inplace=True)
        surge.reset_index(inplace=True)
        surge.drop('index', axis=1, inplace=True)

        #adjust surge time format to match that of pred
        time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d'))
        surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])),
                                  columns=['date'])
        time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]],
                              axis=1)

        #merge predictors and surge to find common time frame
        pred_surge = pd.merge(pred_standardized,
                              surge_new.iloc[:, :2],
                              on='date',
                              how='right')
        pred_surge.sort_values(by='date', inplace=True)

        #find rows that have nans and remove them
        row_nan = pred_surge[pred_surge.isna().any(axis=1)]
        pred_surge.drop(row_nan.index, axis=0, inplace=True)
        pred_surge.reset_index(inplace=True)
        pred_surge.drop('index', axis=1, inplace=True)

        #in case pred and surge don't overlap
        if pred_surge.shape[0] == 0:
            print('-' * 80)
            print("Predictors and Surge don't overlap")
            print('-' * 80)
            continue


        pred_surge['date'] = pd.DataFrame(list(map(time_stamp, \
                                                   pred_surge['date'])), \
                                          columns = ['date'])

        #prepare data for training/testing
        X = pred_surge.iloc[:, 1:-1]
        y = pd.DataFrame(pred_surge['surge'])
        y = y.reset_index()
        y.drop(['index'], axis=1, inplace=True)

        #apply PCA
        pca = PCA(.95)
        pca.fit(X)
        X_pca = pca.transform(X)

        #apply 10 fold cross validation
        kf = KFold(n_splits=10)  #shuffle is off, so folds are contiguous and deterministic

        metric_corr = []
        metric_rmse = []
        #combo = pd.DataFrame(columns = ['pred', 'obs'])
        for train_index, test_index in kf.split(X):
            X_train, X_test = X_pca[train_index], X_pca[test_index]
            y_train, y_test = y['surge'][train_index], y['surge'][test_index]

            #train regression model
            lm = LinearRegression()
            lm.fit(X_train, y_train)

            #predictions
            predictions = lm.predict(X_test)
            # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)), \
            #                       pd.DataFrame(np.array(y_test))], \
            #                      axis = 1)
            # pred_obs.columns = ['pred', 'obs']
            # combo = pd.concat([combo, pred_obs], axis = 0)

            #evaluation matrix - check p value
            if stats.pearsonr(y_test, predictions)[1] >= 0.05:
                print("insignificant correlation!")
                continue
            else:
                print(stats.pearsonr(y_test, predictions))
                metric_corr.append(stats.pearsonr(y_test, predictions)[0])
                print(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
                metric_rmse.append(
                    np.sqrt(metrics.mean_squared_error(y_test, predictions)))

        #number of years used to train/test model
        num_years = (pred_surge['date'][pred_surge.shape[0]-1] -\
                             pred_surge['date'][0]).days/365
        longitude = surge['lon'][0]
        latitude = surge['lat'][0]
        num_pc = X_pca.shape[1]  #number of principal components
        corr = np.mean(metric_corr)
        rmse = np.mean(metric_rmse)

        print('num_year = ', num_years, ' num_pc = ', num_pc ,'avg_corr = ',np.mean(metric_corr), ' -  avg_rmse (m) = ', \
              np.mean(metric_rmse), '\n')

        #original size and pca size of matrix added
        new_df = pd.DataFrame(
            [tg_name, longitude, latitude, num_years, num_pc, corr, rmse]).T
        new_df.columns = ['tg', 'lon', 'lat', 'num_year', \
                                 'num_95pcs','corrn', 'rmse']
        df = pd.concat([df, new_df], axis=0)

        #save df as csv - in case of interruption
        os.chdir(dir_out)
        df.to_csv(tg_name)

        #cd to dir_in
        os.chdir(dir_in)
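# A minimal sketch of the validation loop above on synthetic data: 10-fold
# cross-validation of a linear model, keeping each fold's Pearson correlation and
# RMSE (shuffle=True is used here so that random_state has an effect).
import numpy as np
from scipy import stats
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

rng = np.random.default_rng(1)
X_demo = rng.normal(size=(200, 4))
y_demo = X_demo @ np.array([1.0, 0.5, -0.5, 0.2]) + rng.normal(scale=0.2, size=200)

corrs, rmses = [], []
for train_idx, test_idx in KFold(n_splits=10, shuffle=True, random_state=29).split(X_demo):
    lm_demo = LinearRegression().fit(X_demo[train_idx], y_demo[train_idx])
    preds = lm_demo.predict(X_demo[test_idx])
    corrs.append(stats.pearsonr(y_demo[test_idx], preds)[0])
    rmses.append(np.sqrt(metrics.mean_squared_error(y_demo[test_idx], preds)))
print(np.mean(corrs), np.mean(rmses))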
Exemple #39
0
# Change the release year to numeric.
tmdb_movie_data['title_year'] = pd.to_numeric(tmdb_movie_data['title_year'])

# ---
# Dropping Columns.
# ---
# Drop columns that we don't care about.
imdb_movie_data = imdb_movie_data.drop(columns=IMDB_DROPS)
tmdb_movie_data = tmdb_movie_data.drop(columns=TMDB_DROPS)

# ---
# Merging the Datasets.
# ---
# Inner join.
full_data = pd.merge(imdb_movie_data,
                     tmdb_movie_data,
                     how='inner',
                     on=['movie_title', 'title_year'])
# Drop duplicates.
full_data = full_data.drop_duplicates()
# Remove columns where country is not USA.
full_data = full_data.loc[full_data['country'] == 'USA']
# Release year no longer needed. Was only needed for join.
full_data = full_data.drop(columns=['title_year', 'country'])

# --
# Normalize Names Thus Far.
# --
full_data = full_data.rename(
    columns={
        'director_name': 'Director_Name',
        'duration': 'Runtime',
Exemple #40
0
def createBMIDataset(bmi_buckets=[0, 20, 30, 40, 50, 55, 60, 100]):

    patientList = []
    fullFileList = []

    inputFolder1 = '/home/santhosr/Documents/Birad/ProcessedData/FullRes'
    truthFile1 = '/home/santhosr/Documents/Birad/birad_targetFile.csv'

    inputFolder2 = '/home/santhosr/Documents/Birad/ProcessedData/PennExtra_3500/'
    truthFile2 = '/home/santhosr/Documents/Birad/RaceDL_ExtraCaucasian.csv'

    df1 = pd.read_csv(truthFile1)
    df1.drop(['PresIntentType', 'DBT'], inplace=True, axis=1)

    df2 = pd.read_csv(truthFile2)
    df2.Medview_Race = 'White'

    ## Removing IDs from df2 which are already present in df1
    idList = list(df1.DummyID.values)
    df2 = df2[~df2.DummyID.isin(idList)]

    truth = pd.concat([df1, df2], sort=True)

    ## Reading from set 1
    for i in range(1, 5):

        folder = os.path.join(inputFolder1, str(i))
        fileList = os.listdir(folder)
        fileList = [os.path.join('FullRes', str(i), x) for x in fileList]
        fullFileList = fullFileList + fileList
        #         print(len(fileList))

        patientList = patientList + [
            int(x.split("/")[-1].split("_")[0]) for x in fileList
        ]

    patientList1 = patientList.copy()
    ## Reading from set 2
    print(len(patientList))

    fileList = os.listdir(inputFolder2)
    fileList = [os.path.join('PennExtra_3500', x) for x in fileList]
    d = pd.DataFrame(fileList)
    d[1] = d[0].apply(lambda x: int(x.split("/")[1].split("_")[0]))
    d = d[d[1].isin(df2.DummyID.values)]
    fileList = list(d[0].values)
    fullFileList += list(d[0].values)

    patientList += [int(x.split("/")[-1].split("_")[0]) for x in fileList]
    print(len(patientList))

    patientList2 = patientList.copy()

    #Retaining only the patients with 4 views
    k = pd.Series(patientList).value_counts().reset_index()
    patientList = k[k[0] == 4]['index'].values
    print("total number of patients", len(patientList))

    patientList = np.array(list(set(patientList)))
    df = pd.DataFrame({'DummyID': patientList})
    df = pd.merge(df, truth, how='left')
    df1 = df1.copy()
    df = df.drop_duplicates(subset=['DummyID'])

    #Creates equal number of patients from White and AA groups
    white = df[df.Medview_Race == 'White']
    AA = df[df.Medview_Race == 'African American']

    outputDf = pd.DataFrame()

    for i in range(len(bmi_buckets) - 1):
        out = getBMIData(AA, white, bmi_buckets[i], bmi_buckets[i + 1])

        outputDf = pd.concat([outputDf, out])

    temp = pd.DataFrame(fullFileList)
    temp.columns = ['filename']

    temp['DummyID'] = temp.filename.apply(
        lambda x: int(x.split("/")[-1].split("_")[0]))

    trainTemp = temp[temp.DummyID.isin(
        outputDf[outputDf.train == False].DummyID.values)]
    validTemp = temp[temp.DummyID.isin(
        outputDf[outputDf.train == True].DummyID.values)]

    trainTemp['train'] = False
    validTemp['train'] = True

    df = pd.concat([trainTemp, validTemp], sort=True)

    #Shuffling data
    index = list(range(len(df)))
    np.random.shuffle(index)
    df = df.iloc[index]

    return df
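# A minimal sketch of the "keep only patients with all four views" filter used
# above, with hypothetical patient IDs in place of the image file list.
import pandas as pd

demo_ids = pd.Series([101, 101, 101, 101, 202, 202])
counts = demo_ids.value_counts().reset_index()
counts.columns = ['DummyID', 'n_views']
print(counts.loc[counts['n_views'] == 4, 'DummyID'].values)   # -> [101]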
Exemple #41
0
def build_agg_data(df,
                   x,
                   y,
                   inputs,
                   agg,
                   z=None,
                   group_col=None,
                   animate_by=None):
    """
    Builds aggregated data when an aggregation (sum, mean, max, min...) is selected from the front-end.

    :param df: dataframe that contains data for chart
    :type df: :class:`pandas:pandas.DataFrame`
    :param x: column to use for the X-Axis
    :type x: str
    :param y: columns to use for the Y-Axes
    :type y: list of str
    :param inputs: additional chart configurations (chart_type, group, rolling_win, rolling_comp...)
    :type inputs: dict
    :param agg: points to a specific function that can be applied to
                :func: pandas.core.groupby.DataFrameGroupBy.  Possible values are: count, first, last, mean,
                median, min, max, std, var, mad, prod, sum
    :type agg: str
    :param z: column to use for the Z-Axis
    :type z: str, optional
    :return: dataframe of aggregated data
    :rtype: :class:`pandas:pandas.DataFrame`
    """
    if agg == "raw":
        return df, []
    z_exists = len(make_list(z))
    if agg == "corr":
        if not z_exists:
            raise NotImplementedError(
                "Correlation aggregation is only available for 3-dimensional charts!"
            )
    if agg == "rolling":
        if z_exists:
            raise NotImplementedError(
                "Rolling computations have not been implemented for 3-dimensional charts!"
            )
        window, comp = map(inputs.get, ["rolling_win", "rolling_comp"])
        agg_df = df.set_index(x).rolling(window=window)
        agg_df = pd.DataFrame({c: getattr(agg_df[c], comp)() for c in y})
        agg_df = agg_df.reset_index()
        code = [
            "chart_data = chart_data.set_index('{x}').rolling(window={window})"
            .format(x=x, window=window),
            "chart_data = pd.DataFrame({'" + ", ".join([
                "'{c}': chart_data['{c}'].{comp}()".format(c=c, comp=comp)
                for c in y
            ]) + "})",
            "chart_data = chart_data.reset_index()",
        ]
        return agg_df, code

    idx_cols = make_list(animate_by) + make_list(group_col) + [x]
    agg_cols = make_list(y)
    if z_exists:
        idx_cols += make_list(y)
        agg_cols = make_list(z)

    if agg == "drop_duplicates":
        groups = [df[idx_cols + [col]].drop_duplicates() for col in agg_cols]
        if len(groups) == 1:
            groups = groups[0]
            code = "chart_data = chart_data[['{}']].drop_duplicates()".format(
                "','".join(idx_cols + agg_cols))
        else:
            groups = pd.merge(*groups, on=idx_cols, how="outer")
            code = (
                "idx_cols = ['{}']\n"
                "agg_cols = ['{}']\n"
                "chart_data = pd.merge(\n"
                "\t*[chart_data[idx_cols + [col]].drop_duplicates() for col in agg_cols],\n"
                "\ton=idx_cols,\n"
                "\thow='outer'\n"
                ")").format("','".join(idx_cols), "','".join(agg_cols))
    else:
        groups = df.groupby(idx_cols)
        if agg in ["pctsum", "pctct"]:
            func = "sum" if agg == "pctsum" else "size"
            subidx_cols = [
                c for c in idx_cols if c not in make_list(group_col)
            ]
            groups = getattr(groups[agg_cols], func)()
            groups = groups / getattr(df.groupby(subidx_cols)[agg_cols],
                                      func)() * 100
            if len(agg_cols) > 1:
                groups.columns = agg_cols
            elif len(agg_cols) == 1:
                groups.name = agg_cols[0]
            code = (
                "chart_data = chart_data.groupby(['{cols}'])[['{agg_cols}']].{agg}()\n"
                "chart_data = chart_data / chart_data.groupby(['{subidx_cols}']).{agg}()\n"
                "chart_data = chart_data.reset_index()")
            code = code.format(
                cols="', '".join(idx_cols),
                subidx_cols="', '".join(subidx_cols),
                agg_cols="', '".join(make_list(agg_cols)),
                agg=func,
            )
            code = [code]
        else:
            groups = getattr(groups[agg_cols], agg)()
            code = [
                "chart_data = chart_data.groupby(['{cols}'])[['{agg_cols}']].{agg}().reset_index()"
                .format(cols="', '".join(idx_cols),
                        agg_cols="', '".join(agg_cols),
                        agg=agg)
            ]
    if animate_by is not None:
        full_idx = pd.MultiIndex.from_product(
            [df[c].unique() for c in idx_cols], names=idx_cols)
        groups = groups.reindex(full_idx).fillna(0)
        code += [
            "idx_cols = ['{cols}']".format(cols="', '".join(idx_cols)),
            "full_idx = pd.MultiIndex.from_product([df[c].unique() for c in idx_cols], names=idx_cols)"
            "chart_data = chart_data.reindex(full_idx).fillna(0)",
        ]
    return groups.reset_index(), code
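# A minimal sketch of the "pctsum" branch above on a tiny synthetic frame: each
# (group, x) sum expressed as a percentage of the total for its x value.
import pandas as pd

demo = pd.DataFrame({'group': ['a', 'a', 'b', 'b'],
                     'x': [1, 2, 1, 2],
                     'val': [10.0, 30.0, 30.0, 10.0]})
per_group = demo.groupby(['group', 'x'], as_index=False)['val'].sum()
totals = demo.groupby('x', as_index=False)['val'].sum().rename(columns={'val': 'total'})
pct = pd.merge(per_group, totals, on='x')
pct['pct'] = pct['val'] / pct['total'] * 100
print(pct)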
Exemple #42
0
import pandas as pd
import numpy as np
import random
from copy import deepcopy

ml1m_dir = 'ml-1m/ratings.dat'
ml1m_rating = pd.read_csv(ml1m_dir,
                          sep='::',
                          header=None,
                          names=['uid', 'mid', 'rating', 'timestamp'],
                          engine='python')

user_id = ml1m_rating[['uid']].drop_duplicates().reindex()
user_id['userId'] = np.arange(len(user_id))
item_id = ml1m_rating[['mid']].drop_duplicates()
item_id['itemId'] = np.arange(len(item_id))

ml1m_rating = pd.merge(ml1m_rating, user_id, on=['uid'], how='left')
ml1m_rating = pd.merge(ml1m_rating, item_id, on=['mid'], how='left')

ml1m_rating = ml1m_rating[['userId', 'itemId', 'rating', 'timestamp']]

# convert to binary data
ratings = deepcopy(ml1m_rating)
ratings.loc[ratings['rating'] > 0, 'rating'] = 1.0
Exemple #43
0
# Connect to the Google Maps API

gmaps_key = googlemaps.Client(key='AIzaSyAdZKjevohQs7fHJn3NpZJ70DDtcAsj4rI')
lat = []
lng = []
for i in direcciones:
    try:
        geocode_result = gmaps_key.geocode(i)
        lat_ = geocode_result[0]['geometry']['location']['lat']
        lng_ = geocode_result[0]['geometry']['location']['lng']
        lat.append(lat_)
        lng.append(lng_)

    except IndexError:
        lat.append(np.nan)
        lng.append(np.nan)

df_coordenadas = pd.DataFrame({'latitud': lat, 'longitud': lng})

df['i1'] = df.index
df_coordenadas['i1'] = df_coordenadas.index

join = pd.merge(df, df_coordenadas, how='right', on='i1')

del join['i1']

join.to_csv(
    '/Users/rogeliomj/Documents/python/tesis/plataforma_cdmx/escuelas_danadas_geocoded.csv',
    index=False)
"""
comp_info = pandas.read_excel(compound_info_file, index_col=0)
for row in comp_info.iterrows():
    compound_table[int(row[0])] = conversions.pubchem(str(row[1]['Drug Name']))
    # compound_table[int(row[0])] = pubchem_table[str(row[1]['Drug Name'])]


e = Emiter("gdsc.scan")


cl_info = pandas.read_excel(conv_file, index_col=0)
sample_table = {}
for row in cl_info.iterrows():
    sample_table[row[0]] = (str(row[1]['CCLE name']), 'ccle') # "ccle:%s" % (row[1]['CCLE name'])

cl_info = pandas.read_excel(cell_info_file, index_col=1)
for row in cl_info.iterrows():
    if row[0] not in sample_table:
        sample_table[row[0]] = (str(row[1]['Sample Name']), 'gdsc') # "gdsc:%s" % (row[1]['Sample Name'])
        gdsc_cell_info(row[1], e.emit)

raw = pandas.read_excel(raw_file)
fitted = pandas.read_excel(fitted_file)

merge = pandas.merge(raw, fitted, on=["COSMIC_ID", "DRUG_ID"])

for r in merge.iterrows():
    cosmic_id = int(r[1]["COSMIC_ID"])
    if cosmic_id in cl_info.index:
        gdsc_ic50_row( r[1], compound_table, sample_table, e.emit )
supply = pd.read_excel(r'E:\PD Wk 35 Input.xlsx', 'PD Wk 34 Output')
supply['Supply Id'] = supply.index + 1
supply['Running Supply To'] = supply.sort_values('Date').groupby(
    ['Product', 'Scent'])['Quantity'].cumsum()
supply[
    'Running Supply From'] = supply['Running Supply To'] - supply['Quantity']

demand = pd.read_excel(r'E:\PD Wk 35 Input.xlsx', 'Store Orders')
demand['Demand Id'] = demand.index + 1
demand['Running Demand To'] = demand.sort_values('Date Required').groupby(
    ['Product', 'Scent'])['Quantity Requested'].cumsum()
demand['Running Demand From'] = demand['Running Demand To'] - demand[
    'Quantity Requested']

allocate = pd.merge(supply, demand, on=['Product', 'Scent'], how='inner')
allocate = allocate.query(
    '(`Running Demand To`>=`Running Supply From` and `Running Demand To`<=`Running Supply To`) \
                          or (`Running Supply To`>=`Running Demand From` and `Running Supply To`<=`Running Demand To`)'
)
allocate['Allocated Quantity'] = allocate.apply(
    lambda x: min(x['Running Supply To'], x['Running Demand To']) - max(
        x['Running Supply From'], x['Running Demand From']),
    axis=1)

surplus = supply[~supply['Supply Id'].isin(allocate['Supply Id'])]
surplus = surplus.groupby(['Supplier', 'Product', 'Scent'],
                          as_index=False).agg({'Quantity': 'sum'})

fulfill = allocate.groupby([
    'Store', 'Product', 'Scent', 'Supplier', 'Quantity Requested',
mycol = mydb['EKKO']
mycol1 = mydb['EKET']

cursor = mycol.find()
df = pd.DataFrame(list(cursor))
df.columns = df.columns.str.replace(' ', '')
df = df[['Purch-Doc-', 'Createdon', 'Vendor']]
print(df.head())
print('_______________')
cursor = mycol1.find()
df1 = pd.DataFrame(list(cursor))
df1.columns = df1.columns.str.replace(' ', '')
#d1f = df1[['Purch-Doc-','Del-Date','Sched-Qty']]

df1 = df1[['Purch-Doc-', 'Del-Date', 'Sched-Qty', 'Delivered']]
data = pd.merge(df1, df, on='Purch-Doc-', how='left')
#data.to_csv('EKET-EKKO.csv')
print(data.columns)
data.columns = data.columns.str.replace(' ', '')
data['Sched-Qty'] = data['Sched-Qty'].astype(str)
data['Delivered'] = data['Delivered'].astype(str)
data['Sched-Qty'] = data['Sched-Qty'].str.replace(',', '')
data['Delivered'] = data['Delivered'].str.replace(',', '')
'''
data['Delivered'] = data['Delivered'].str.replace('.00','')
print(data['Delivered'])
data['Sched-Qty'] = data['Sched-Qty'].str.replace('.00','')
'''
data['Delivered'] = np.where(data['Delivered'].str[-3:] == '.00',
                             data['Delivered'].str[:-3], data['Delivered'])
data['Sched-Qty'] = np.where(data['Sched-Qty'].str[-3:] == '.00',
def call_KM(genre1, genre2, genre3):
    movies = pd.read_csv('mysite/movies.csv')
    ratings = pd.read_csv('mysite/ratings.csv')

    # genre1='Adventure'
    # genre2='Sci-Fi'
    # genre3='Action'
    my_clusters = 0
    helper.set_Variables(genre1, genre2, genre3)

    genre_ratings = helper.get_genre_ratings(ratings, movies, [genre1, genre2],
                                             [Dict[genre1], Dict[genre2]])
    biased_dataset = helper.bias_genre_rating_dataset(genre_ratings, 3.2, 2.5)

    print("Number of records: ", len(biased_dataset))
    biased_dataset.head()
    helper.draw_scatterplot(biased_dataset[Dict[genre2]], Dict[genre2],
                            biased_dataset[Dict[genre1]], Dict[genre1],
                            'mysite/static/mysite/Normal.png')
    # plt.savefig('mysite/static/mysite/Normal.png')
    #
    # plt.close('mysite/static/mysite/Normal.png')

    X = biased_dataset[[Dict[genre2], Dict[genre1]]].values

    # TODO: Create an instance of KMeans to find two clusters
    kmeans_1 = KMeans(n_clusters=2, random_state=0)
    predictions = kmeans_1.fit_predict(X)
    helper.draw_clusters(biased_dataset, predictions,
                         'mysite/static/mysite/TwoCluster.png')
    # plt.savefig('mysite/static/mysite/TwoCluster.png')
    # plt.close('TwoCluster.png')

    # TODO: Create an instance of KMeans to find three clusters
    kmeans_2 = KMeans(n_clusters=3, random_state=1)
    predictions_2 = kmeans_2.fit_predict(X)
    helper.draw_clusters(biased_dataset, predictions_2,
                         'mysite/static/mysite/ThreeCluster.png')
    # plt.savefig('mysite/static/mysite/ThreeCluster.png')
    # plt.close('ThreeCluster.png')

    # TODO: Create an instance of KMeans to find four clusters
    kmeans_3 = KMeans(n_clusters=4, random_state=3)
    predictions_3 = kmeans_3.fit_predict(X)
    helper.draw_clusters(biased_dataset, predictions_3,
                         'mysite/static/mysite/FourCluster.png')
    # plt.savefig('mysite/static/mysite/FourCluster.png')
    # plt.close('FourCluster.png')

    possible_k_values = range(2, len(X) + 1, 5)
    errors_per_k = [helper.clustering_errors(k, X) for k in possible_k_values]
    list(zip(possible_k_values, errors_per_k))
    fig, ax = plt.subplots(figsize=(16, 6))
    ax.set_xlabel('K - number of clusters')
    ax.set_ylabel('Silhouette Score (higher is better)')
    ax.plot(possible_k_values, errors_per_k)
    fig.savefig('mysite/static/mysite/score.png')
    plt.close(fig)

    # Ticks and grid
    xticks = np.arange(min(possible_k_values), max(possible_k_values) + 1, 5.0)
    ax.set_xticks(xticks, minor=False)
    ax.set_xticks(xticks, minor=True)
    ax.xaxis.grid(True, which='both')
    yticks = np.arange(round(min(errors_per_k), 2), max(errors_per_k), .05)
    ax.set_yticks(yticks, minor=False)
    ax.set_yticks(yticks, minor=True)
    ax.yaxis.grid(True, which='both')

    # TODO: Create an instance of KMeans to find seven clusters
    kmeans_4 = KMeans(n_clusters=7, random_state=6)
    predictions_4 = kmeans_4.fit_predict(X)
    helper.draw_clusters(biased_dataset,
                         predictions_4,
                         'mysite/static/mysite/BestCluster.png',
                         cmap='Accent')
    # plt.savefig('mysite/static/mysite/BestCluster.png')
    # plt.close('BestCluster.png')

    biased_dataset_3_genres = helper.get_genre_ratings(
        ratings, movies, [genre1, genre2, genre3],
        [Dict[genre1], Dict[genre2], Dict[genre3]])
    biased_dataset_3_genres = helper.bias_genre_rating_dataset(
        biased_dataset_3_genres, 3.2, 2.5).dropna()
    print("Number of records: ", len(biased_dataset_3_genres))

    X_with_action = biased_dataset_3_genres[[
        Dict[genre2], Dict[genre1], Dict[genre3]
    ]].values

    # TODO: Create an instance of KMeans to find seven clusters
    kmeans_5 = KMeans(n_clusters=7)
    predictions_5 = kmeans_5.fit_predict(X_with_action)
    helper.draw_clusters_3d(biased_dataset_3_genres, predictions_5,
                            'mysite/static/mysite/3DCluster.png')
    # plt.savefig('mysite/static/mysite/3DCluster.png')
    # plt.close('3DCluster.png')

    #Merge the two tables then pivot so we have Users X Movies dataframe
    ratings_title = pd.merge(ratings,
                             movies[['movieId', 'title']],
                             on='movieId')
    user_movie_ratings = pd.pivot_table(ratings_title,
                                        index='userId',
                                        columns='title',
                                        values='rating')
    user_movie_ratings.iloc[:6, :10]
    n_movies = 30
    n_users = 18
    most_rated_movies_users_selection = helper.sort_by_rating_density(
        user_movie_ratings, n_movies, n_users)
    most_rated_movies_users_selection.head()

    helper.draw_movies_heatmap(most_rated_movies_users_selection,
                               'mysite/static/mysite/HeatMap.png')
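# A minimal sketch of the clustering step in call_KM, with synthetic scores
# standing in for the two-genre rating matrix.
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(42)
X_demo = np.vstack([rng.normal([1.0, 4.0], 0.3, size=(20, 2)),
                    rng.normal([4.0, 1.0], 0.3, size=(20, 2))])
labels = KMeans(n_clusters=2, random_state=0, n_init=10).fit_predict(X_demo)
print(labels[:10])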
Exemple #48
0
    def calc_stat_info(self):
        '''
        Compute the statistics over the configured look-back window from the UI parameters.
        :return:
        '''
        gold_future = "C:\\quanttime\\data\\gold\\sh_future\\gold.csv"
        silver_future = "C:\\quanttime\\data\\gold\\sh_future\\silver.csv"

        stander_dtype = {
            'open': float,
            "close": float,
            "high": float,
            "low": float,
            "volume": float,
            "money": float
        }
        gold_future_data = pd.read_csv(gold_future,
                                       parse_dates=["date"],
                                       index_col=["date"],
                                       dtype=stander_dtype)
        gold_future_data = gold_future_data[~gold_future_data.reset_index().
                                            duplicated().values]

        silver_future_data = pd.read_csv(silver_future,
                                         parse_dates=["date"],
                                         index_col=["date"],
                                         dtype=stander_dtype)
        silver_future_data = silver_future_data[
            ~silver_future_data.reset_index().duplicated().values]

        future_data = pd.merge(gold_future_data,
                               silver_future_data,
                               left_index=True,
                               right_index=True,
                               suffixes=('_gold', '_silver'))
        future_data["compare"] = future_data["close_gold"] / future_data[
            "close_silver"] * 1000
        # drop rows with missing values
        future_data = future_data.dropna()
        future_data_trade_date = future_data.index

        today = datetime.today().date()
        self.ui.lineEdit_14.setText(today.strftime("%Y-%m-%d"))
        self.ui.lineEdit_13.setText(
            future_data_trade_date[-1].strftime("%Y-%m-%d"))
        columns_name = [
            "count", "mean", "std", "min", "25%", "50%", "75%", "max"
        ]
        # df_empty = pd.DataFrame(columns=columns_name)
        self.back_day_stat = self.ui.lineEdit_12.text()  # how many days back from today the statistics window covers
        self.long_buy_value = self.ui.lineEdit_8.text(
        )  # long gold/silver ratio: buy line quantile, e.g. 0.10 = 10th percentile
        self.long_sell_value = self.ui.lineEdit_9.text(
        )  # long gold/silver ratio: sell line quantile, e.g. 0.15 = 15th percentile
        self.short_buy_value = self.ui.lineEdit_10.text(
        )  # short gold/silver ratio: buy line quantile, e.g. 0.85 = 85th percentile
        self.short_sell_value = self.ui.lineEdit_11.text()  # short gold/silver ratio: sell line quantile
        df_stat_20 = future_data.iloc[
            -int(self.back_day_stat):]  # e.g. future_data.iloc[-20:]
        #print(df_stat_20)
        df_stat = df_stat_20.loc[:, ["compare"]].describe()
        #print(df_stat)

        long_buyValue = round(float(self.long_buy_value), 2)
        long_sellValue = round(float(self.long_sell_value), 2)
        short_sellValue = round(float(self.short_buy_value), 2)
        short_buyValue = round(float(self.short_sell_value), 2)
        # print(self.long_buy_value)
        # long_buyValue = 0.05
        # long_sellValue = 0.10
        # short_sellValue = 0.85
        # short_buyValue = 0.90
        v_5 = df_stat_20.quantile(long_buyValue).compare  # 5th percentile
        v_10 = df_stat_20.quantile(long_sellValue).compare  # 10th percentile
        v_90 = df_stat_20.quantile(short_sellValue).compare  # 90th percentile
        v_95 = df_stat_20.quantile(short_buyValue).compare  # 95th percentile

        value = round(v_5, 2)
        newItem = QtWidgets.QTableWidgetItem(str(value))
        self.ui.tableWidget_2.setItem(0, 0, newItem)
        value = round(v_10, 2)
        newItem = QtWidgets.QTableWidgetItem(str(value))
        self.ui.tableWidget_2.setItem(1, 0, newItem)
        value = round(v_90, 2)
        newItem = QtWidgets.QTableWidgetItem(str(value))
        self.ui.tableWidget_2.setItem(2, 0, newItem)
        value = round(v_95, 2)
        newItem = QtWidgets.QTableWidgetItem(str(value))
        self.ui.tableWidget_2.setItem(3, 0, newItem)

        value = round(df_stat.loc["max", ["compare"]].compare, 2)
        newItem = QtWidgets.QTableWidgetItem(str(value))
        self.ui.tableWidget_2.setItem(4, 0, newItem)

        value = round(df_stat.loc["min", ["compare"]].compare, 2)
        newItem = QtWidgets.QTableWidgetItem(str(value))
        self.ui.tableWidget_2.setItem(5, 0, newItem)

        value = round(df_stat.loc["mean", ["compare"]].compare, 2)
        newItem = QtWidgets.QTableWidgetItem(str(value))
        self.ui.tableWidget_2.setItem(6, 0, newItem)

        value = round(df_stat.loc["25%", ["compare"]].compare, 2)
        newItem = QtWidgets.QTableWidgetItem(str(value))
        self.ui.tableWidget_2.setItem(7, 0, newItem)

        value = round(df_stat.loc["50%", ["compare"]].compare, 2)
        newItem = QtWidgets.QTableWidgetItem(str(value))
        self.ui.tableWidget_2.setItem(8, 0, newItem)

        value = round(df_stat.loc["75%", ["compare"]].compare, 2)
        newItem = QtWidgets.QTableWidgetItem(str(value))
        self.ui.tableWidget_2.setItem(9, 0, newItem)

        value = round(df_stat.loc["std", ["compare"]].compare, 2)
        newItem = QtWidgets.QTableWidgetItem(str(value))
        self.ui.tableWidget_2.setItem(10, 0, newItem)
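# A minimal sketch of the quantile thresholds computed above, using a synthetic
# gold/silver ratio series in place of the exchange data.
import numpy as np
import pandas as pd

rng = np.random.default_rng(3)
compare_demo = pd.Series(rng.normal(80, 5, size=250))    # hypothetical ratio history
window = compare_demo.iloc[-20:]                         # last 20 sessions
print(window.describe())
print(window.quantile([0.05, 0.10, 0.90, 0.95]))         # buy/sell lines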
crsp = crsp.sort_values(by=['permno', 'date'])

# change variable format to int
crsp['permno'] = crsp['permno'].astype(int)

# Line up date to be end of month
crsp['date'] = pd.to_datetime(crsp['date'])

# find the closest trading day to the end of the month
crsp['monthend'] = crsp['date'] + MonthEnd(0)
crsp['date_diff'] = crsp['monthend'] - crsp['date']
date_temp = crsp.groupby(['permno', 'monthend'])['date_diff'].min()
date_temp = pd.DataFrame(date_temp)  # convert Series to DataFrame
date_temp.reset_index(inplace=True)
date_temp.rename(columns={'date_diff': 'min_diff'}, inplace=True)
crsp = pd.merge(crsp, date_temp, how='left', on=['permno', 'monthend'])
crsp['sig'] = np.where(crsp['date_diff'] == crsp['min_diff'], 1, np.nan)

# label every date of month end
crsp['month_count'] = crsp[crsp['sig'] == 1].groupby(['permno']).cumcount()

# label numbers of months for a firm
month_num = crsp[crsp['sig'] == 1].groupby(['permno'])['month_count'].tail(1)
month_num = month_num.astype(int)
month_num = month_num.reset_index(drop=True)

# mark the number of each month to each day of this month
crsp['month_count'] = crsp.groupby(['permno'])['month_count'].fillna(method='bfill')

# create a firm list
df_firm = crsp.drop_duplicates(['permno'])
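# A minimal sketch of the "closest trading day to month end" logic above, using
# a few hypothetical dates for a single permno.
import pandas as pd
from pandas.tseries.offsets import MonthEnd

demo = pd.DataFrame({'permno': [1, 1, 1],
                     'date': pd.to_datetime(['2020-01-29', '2020-01-30', '2020-02-27'])})
demo['monthend'] = demo['date'] + MonthEnd(0)
demo['date_diff'] = demo['monthend'] - demo['date']
closest = demo.loc[demo.groupby(['permno', 'monthend'])['date_diff'].idxmin()]
print(closest[['permno', 'date', 'monthend']])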
color = sns.color_palette("hls", 8)
sns.set(style="darkgrid")
plt.figure(figsize=(15, 5))
sns.countplot(x=matrix['shop_id'], data=matrix, palette=color)

"""Aggregate Sale"""

train_data['revenue'] = train_data['item_price'] * train_data['item_cnt_day']
train_data.head()

group_data = train_data.groupby(by=['date_block_num', 'shop_id', 'item_id']).agg({'item_cnt_day': 'sum'})
group_data.columns = ['item_cnt_month']
group_data.reset_index(inplace = True)
group_data.head()

matrix = pd.merge(matrix, group_data, on=cols, how='left')
matrix.head()

matrix['item_cnt_month'] = (matrix['item_cnt_month'].fillna(0).clip(0, 20).astype(np.float16))
matrix.head()

matrix.shape

test_data.head()

test_data['date_block_num'] = 34
test_data['date_block_num'] = test_data['date_block_num'].astype(np.int8)
test_data['shop_id'] = test_data['shop_id'].astype(np.int8)
test_data['item_id'] = test_data['item_id'].astype(np.int16)

matrix = pd.concat([matrix, test_data], ignore_index=True, sort=False, keys=cols)
Exemple #51
0
#  @Function: pandas merge example
#  @Time:2020/6/2 下午3:55
#  @Author:Flank
import numpy as np
import pandas as pd
# database-style joins: left, right, inner, outer (full) joins
left = pd.DataFrame({
    'key': ['k0', 'k1', 'k2', 'k3'],
    'A': ['a0', 'a1', 'a2', 'a3'],
    'B': ['b0', 'b1', 'b2', 'b3']
})
right = pd.DataFrame({
    'key': ['k0', 'k1', 'k2', 'k3'],
    'C': ['c0', 'c1', 'c2', 'c3'],
    'D': ['d0', 'd1', 'd2', 'd3']
})
print(left)
print(right)
res = pd.merge(left, right, on='key')  # 'on' names the key column to join on
print(res)
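# A short follow-up sketch of the join types mentioned in the comment above,
# using keys that only partially overlap so the four joins give different rows.
import pandas as pd

left2 = pd.DataFrame({'key': ['k0', 'k1'], 'A': ['a0', 'a1']})
right2 = pd.DataFrame({'key': ['k1', 'k2'], 'C': ['c1', 'c2']})
for how in ['inner', 'left', 'right', 'outer']:
    print(how)
    print(pd.merge(left2, right2, on='key', how=how))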
Exemple #52
0
def weather_phenomena_france():
    
    # Read each station file, average by calendar day (month+day over 2010-2018),
    # convert Kelvin to Celsius, and inner-join the six cities on the 'Date' index.
    files = {'Nice': 'weather_nice.csv',
             'Marseille': 'weather_mars.csv',
             'Paris': 'weather_paris.csv',
             'Lille': 'weather_lille.csv',
             'Toulouse': 'weather_toulouse.csv',
             'Lyon': 'weather_lyon.csv'}

    frames = []
    for city, path in files.items():
        city_df = pd.read_csv(path, sep=';', parse_dates=[0], usecols=[1, 7])
        city_df['Date'] = city_df['Date'].dt.strftime('%m%d')
        city_df = city_df.groupby('Date').mean()
        city_df[city] = city_df['Température'] - 273.15
        frames.append(city_df[[city]])

    temp = pd.concat(frames, axis=1, join='inner')
    temp = temp[temp.index != '0229']
    # two-sample t-test between Nice and Marseille
    Nice_Marseille = ttest_ind(temp['Nice'], temp['Marseille'])
    # average temperature for the rest of France and for the south-east
    temp['France'] = (temp['Paris'] + temp['Lille'] + temp['Lyon'] + temp['Toulouse']) / 4
    temp['South East France'] = (temp['Marseille'] + temp['Nice']) / 2
    # two-sample t-test between the south-east of France and the rest of France
    Southfrance_vs_France = ttest_ind(temp['France'], temp['South East France'])
    
    columns_to_keep = ['South East France','France']
    temp = temp[columns_to_keep]
    temp = temp.reset_index()
    temp['Date'] = pd.to_datetime(temp['Date'], format = '%m%d')
    
    fig = plt.figure(figsize = (12,8))
    plt.xlabel('Month')
    plt.ylabel('Temperature (Celsius)')
    plt.title('Comparing Average temperatures for the 2010-2018 period between France and South East of France')
    plt.style.use('seaborn-colorblind')
    plt.plot(temp['Date'], temp['France'], color = 'green', lw = 1, label = 'Paris, Lille, Toulouse, Lyon')
    plt.plot(temp['Date'], temp['South East France'], color = 'blue', lw = 1, label = 'Nice, Marseille')
    plt.tick_params(top=False, bottom=False, left=False, right=False, labelleft=True, labelbottom=True)
    
    ax = plt.gca()
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.xaxis.set_major_formatter(dates.DateFormatter('%b'))
    ax.xaxis.set_major_locator(dates.MonthLocator())
    
    plt.legend(loc = 'best')
    
    plt.show()
    fig.savefig('assignment4.Png', format = 'png')
    
    return Nice_Marseille[1], Southfrance_vs_France[1]
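# Hedged usage sketch: assumes the six weather_*.csv files read above are in the
# working directory; the two returned values are the t-test p-values.
p_nice_marseille, p_southeast_vs_rest = weather_phenomena_france()
print(p_nice_marseille, p_southeast_vs_rest)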
Exemple #53
0
# -*- coding: utf-8 -*-
"""
Created on Fri Mar  3 08:56:27 2017

@author: Daniel
"""
import pandas as pd
import numpy as np

url_users = 'https://raw.githubusercontent.com/wesm/pydata-book/master/ch02/movielens/users.dat'
url_ratings = 'https://raw.githubusercontent.com/wesm/pydata-book/master/ch02/movielens/ratings.dat'
url_movies = 'https://raw.githubusercontent.com/wesm/pydata-book/master/ch02/movielens/movies.dat'

users = pd.read_csv(url_users, header=None, delimiter="::", engine='python', encoding='latin-1').rename(columns={0: 'userid', 1: 'gender', 2: 'age', 3: 'occupation', 4: 'zipcode'})
ratings = pd.read_csv(url_ratings, header=None, delimiter="::", engine='python', encoding='latin-1').rename(columns={0: 'userid', 1: 'movieid', 2: 'rating', 3: 'timestamp'})
movie = pd.read_csv(url_movies, header=None, delimiter="::", engine='python', encoding='latin-1').rename(columns={0: 'movieid', 1: 'title', 2: 'genre'})

merged = pd.merge(ratings,movie, on=['movieid'])
merged = pd.merge(merged,users,on=['userid'])
data = merged.copy()

del users, ratings, movie, url_users, url_ratings, url_movies, merged

print(np.unique(data['genre']))  # genres are pipe-separated strings, so these are genre combinations
                    print(e)
                    break

column_names = "SCode,SName,RDate,LXDM,LX,Count,CGChange,ShareHDNum,VPosition,TabRate,LTZB,ShareHDNumChange,RateChange,fund_type".split(
    ',')
df = pd.DataFrame(li, columns=column_names)
df.to_csv('processed_fin_data/重要基金持股.csv')
'''
# %%
df = pd.read_csv('processed_fin_data/重要基金持股.csv', converters={
                 'SCode': str, 'RDate': pd.to_datetime, 'Count': pd.to_numeric})
# %% Get the (stock, date) combinations with strong holdings for each fund type
df_qs = df[(df.LXDM == '券商') & (df.Count > 15) & (df.CGChange == '增持')]    # brokerages, holdings increased
df_bx = df[(df.LXDM == '保险') & (df.Count > 2) & (df.CGChange == '增持')]     # insurers
df_jj = df[(df.LXDM == '基金') & (df.Count > 200) & (df.CGChange == '增持')]   # mutual funds
df_sb = df[(df.LXDM == '社保') & (df.Count > 2) & (df.CGChange == '增持')]     # social security funds
df_QFII = df[(df.LXDM == 'QFII') & (df.Count > 1) & (df.CGChange == '增持')]   # QFII

# %% Output the overlap of the combinations above, i.e. stocks held by more than one fund type
li_solid = [i for _ in [df_qs, df_bx, df_jj, df_sb, df_QFII]
            for i in _[['SCode', 'RDate']].values.tolist()]
li_solid_copy = copy.deepcopy(li_solid)
li_solid_unique = [list(a) for a in set(tuple(i) for i in li_solid)]
for i in li_solid_unique:
    li_solid_copy.remove(i)
fraud_free_set = set(tuple(i) for i in li_solid_copy)
df_fraud_free = pd.DataFrame(fraud_free_set, columns=['SCode', 'RDate'])
df_fraud_free.to_csv('label_data/df_fraud_free.csv')
pd.merge(df_fraud_free, df,on=['SCode','RDate'])
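# Equivalent sketch of the overlap computation above: count each (SCode, RDate)
# pair across the five selections and keep the pairs that occur more than once
# (this reproduces fraud_free_set without the explicit remove() loop).
pairs = pd.concat([d[['SCode', 'RDate']] for d in [df_qs, df_bx, df_jj, df_sb, df_QFII]])
overlap = pairs[pairs.duplicated(keep='first')].drop_duplicates()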
# %%
resumen['date'] = pd.to_datetime(resumen.date_1.str.slice(4),
       format='%Y-%m-%d_%H:%M:%S', errors='coerce') - np.timedelta64(5, 'h')  # subtract 5 hours to convert to Colombia local time (UTC-5)

resumen['dom_1'] = resumen.date_1.str.slice(0,3)







# station elevation table
balideam = pd.read_csv('/media/edwin/6F71AD994355D30E/Edwin/Maestría Meteorologia/Tesis/estaciones_altura_20180905.csv')
#alturas_1 = pd.read_csv('/media/edwin/6F71AD994355D30E/Edwin/Maestría Meteorologia/Tesis/union_b_20180905.csv')

union_b = pd.merge(resumen, balideam, how='outer', on='cod')


# Temperature correction using the standard lapse rate (0.0065 K per metre of elevation difference)
union_b['temp'] = (((union_b.al_alos - union_b.alt_1) * 0.0065) + union_b.T2)

resumen_back = union_b

resumen_back.T2 = resumen_back.temp


### Build the reception table
recep_t = pd.DataFrame({'tipo_1':np.tile(resumen_back.fecha.unique(), 93),
    'dom_1':np.tile(np.repeat(['d01','d02','d03'], len(resumen_back.fecha.unique())), 31),
    'cod_1':np.repeat((resumen_back.cod.unique()).astype(str), (len(resumen_back.fecha.unique()) * 3))})
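# Note: the tile/repeat factors above assume 31 station codes and 3 model
# domains (d01-d03), so each column has length 93 * number of unique dates.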
'/Preprocessor/data'
'''
Walk data path for text files and add metadata columns 
overwriting original data
'''

count = 0
for root, dirs, files in os.walk(PATH_INPUTS):
    if 'Indicators' not in root.split('/'):
        for file in files:
            if file[-3:] == 'txt' and file[:7] != 'merged_':
                print('Processing file {}, number {}'.format(file, count))
                count = count + 1
                df = pd.read_csv(root + '/' + file)
                for root1, dirs1, files1 in os.walk(PATH_FEATURES):
                    for file1 in files1:
                        if file1[-3:] == 'txt':
                            print('merging file {} with input file {}'.format(
                                file, file1))
                            feat_df = pd.read_csv(root1 + '/' + file1)
                            feat_symbol = feat_df.iloc[0].SYMBOL
                            new_col = feat_symbol + '_CLOSE'
                            feat_df[new_col] = feat_df['Close']
                            if new_col not in df.columns.tolist():
                                df = pd.merge(
                                    df,
                                    feat_df[['Date', 'Time', new_col]],
                                    how='left',
                                    on=['Date', 'Time'])
                df.to_csv(root + '/' + 'merged_' + file, index=False)
import pandas as pd

data_2018 = pd.read_csv('master_2018.csv')
data_2019 = pd.read_csv('master_2019.csv')

data_2019.rename(columns={'zasp_index': 'zasp_i'}, inplace=True)
merged = pd.merge(data_2018, data_2019, on='county_code')

merged['county_code'] = merged['county_code'].transform(
    lambda x: str(x).replace('.0', '').zfill(5))
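# Linear extrapolation to 2020: repeat the 2018->2019 change,
# i.e. zasp_index(2020) = zasp_i(2019) + (zasp_i(2019) - zasp_index(2018)).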
merged['zasp_index'] = merged['zasp_i'] + (merged['zasp_i'] -
                                           merged['zasp_index'])
merged.to_csv('master_linear_pred_2020.csv', index=False)
del clean_sabio['Substrate']
# drop duplicates
clean_sabio = clean_sabio.join(s)
clean_sabio = clean_sabio.drop_duplicates(keep='first')

# Reconstruct SabioRK database so genes will map directly to Km and kcat
# To choose a single value, I chose the maximum value reported
km = clean_sabio[clean_sabio["Type.1"] == "Km"].drop(columns=["Type.1", \
                "Species"]).rename(columns={"Start Value":"Km"})
km = km.groupby(["Gene names", "Substrate"],
                sort=False)["Km"].max().reset_index()
kcat = clean_sabio[clean_sabio["Type.1"] == "kcat"].drop(columns=["Type.1", \
                  "Species"]).rename(columns={"Start Value":"kcat"})
kcat = kcat.groupby(["Gene names", "Substrate"],
                    sort=False)["kcat"].max().reset_index()
comb = pd.merge(kcat, km, how="inner", left_on=["Gene names", "Substrate"],\
                right_on=["Gene names", "Substrate"]).drop_duplicates(keep="first").dropna()

# Get substrates of interest
atp = comb[comb["Substrate"] == "ATP"].sort_values(by="kcat", ascending=False)
adp = comb[comb["Substrate"] == "ADP"].sort_values(by="kcat", ascending=False)
nad = comb[comb["Substrate"] == "NAD+"].sort_values(by="kcat", ascending=False)
nadh = comb[comb["Substrate"] == "NADH"].sort_values(by="kcat",
                                                     ascending=False)
acoa = comb[comb["Substrate"] == "Acetyl-CoA"].sort_values(by="kcat",
                                                           ascending=False)
coa = comb[comb["Substrate"] == "Coenzyme A"].sort_values(by="kcat",
                                                          ascending=False)

###############################################################################
# Scatterplot for global analysis #############################################
def prediction_data(stock, days):
    '''
    stock = 'BTC-USD'
    days = 5 days to predict in future
    start/end = historical dataset
    '''
    start = datetime.datetime(2019, 8, 1)
    end = datetime.datetime(2019, 9, 7)

    # Store in an array the results of the three different classifier
    prediction_values = []

    # try to get the data from internet
    try:
        stock_df = web.DataReader(stock, 'yahoo', start, end)
        # print(stock_df.tail())
        csv_name = ('app/exports/BTC-USD_export.csv')
        stock_df.to_csv(csv_name)

    except Exception:
        print('It was not possible to get the data.')

    print(path.exists('app/exports/BTC-USD_export.csv'))

    stock_df = pd.read_csv('app/exports/BTC-USD_export.csv')

    # print(df.tail())

    # save the data locally into a csv file
    # if os.path.exists('./{}'.format(data_folder)):
    #     pass
    # else:
    #     os.mkdir(data_folder)
    #
    # csv_name = ('{}/{}_export.csv'.format(data_folder, stock))
    # df.to_csv(csv_name)

    # add a column prediction to the dataset
    stock_df['prediction'] = stock_df['Close'].shift(-1)
    stock_df.dropna(inplace=True)

    # print(stock_df.tail())

    forecast_days = int(days)



    #Predicting the stock price in the future
    # Random shuffle the dataset
    # df = df.sample(frac=1)

    # Set the features columns
    X = np.array(stock_df.drop(columns=['prediction', 'Date']))
    # Set the target column
    Y = np.array(stock_df['prediction'])
    # Standardize a dataset along any axis
    X = preprocessing.scale(X)
    # Split the dataset to 45% testing and then 55% training sets
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.45)

    # Perform linear regression on the training data
    linear_regression_classifier = LinearRegression()
    linear_regression_classifier.fit(X_train, Y_train)
    X_prediction = X[-forecast_days:]
    prediction_linear_regression = (linear_regression_classifier.predict(X_prediction))
    confidence_lr = linear_regression_classifier.score(X_train, Y_train)
    plr = round(float(np.float64(prediction_linear_regression[0])), 2)
    clr = round(float(np.float64(confidence_lr*100)), 2)

    linear_regression_prediction = {}
    linear_regression_prediction['prediction'] = plr
    linear_regression_prediction['confidence'] = clr
    # Add to the array the results
    prediction_values.append(linear_regression_prediction)

    # Print out the Linear Regression prediction
    print('Prediction at {} days using linear regression is about {} $'.format(days, str(plr)))
    print('Confidence at {} days using linear regression is about {}% '.format(days, str(clr)))

    # Quadratic regression: degree-2 polynomial features followed by a ridge regressor
    quadratic_regression_classifier = make_pipeline(PolynomialFeatures(2), Ridge())
    quadratic_regression_classifier.fit(X_train, Y_train)
    prediction_quadratic_regression = quadratic_regression_classifier.predict(X_prediction)
    confidence_pq = quadratic_regression_classifier.score(X_train, Y_train)
    pqr = round(float(np.float64(prediction_quadratic_regression[0])), 2)
    cpq = round(float(np.float64(confidence_pq * 100)), 2)

    quadratic_regression_prediction = {}
    quadratic_regression_prediction['prediction'] = pqr
    quadratic_regression_prediction['confidence'] = cpq
    # Add to the array the results
    prediction_values.append(quadratic_regression_prediction)

    # Print out the Quadratic regression prediction
    print('Prediction at {} days using quadratic regression is about {} $'.format(days, str(pqr)))
    print('Confidence at {} days using quadratic regression is about {}%'.format(days, str(cpq)))

    # KNN Regression
    kneighbor_regression_classifier = KNeighborsRegressor(n_neighbors=2)
    kneighbor_regression_classifier.fit(X_train, Y_train)
    prediction_kneighbor_regression = kneighbor_regression_classifier.predict(X_prediction)
    confidence_kr = kneighbor_regression_classifier.score(X_train, Y_train)
    pkr = round(float(np.float64(prediction_kneighbor_regression[0])), 2)
    ckr = round(float(np.float64(confidence_kr * 100)), 2)

    kneighbor_regression_prediction = {}
    kneighbor_regression_prediction['prediction'] = pkr
    kneighbor_regression_prediction['confidence'] = ckr
    # Add to the array the results
    prediction_values.append(kneighbor_regression_prediction)

    # Print out the K Nearest Neighbor (KNN) regression prediction
    print('Prediction at {} days using K Nearest Neighbor (KNN) regression is about {} $'.format(days, str(pkr)))
    print('Confidence at {} days using K Nearest Neighbor (KNN) regression is about {}%'.format(days, str(ckr)))

    ## Work on the tweets Dataset
    print(path.exists('app/exports/analysis_all.csv'))
    tweet_df = pd.read_csv('app/exports/analysis_all.csv')

    tweet_df.drop('remove', axis=1, inplace=True)
    tweet_df['number'] = tweet_df['tweet'].shift()
    tweet_df.dropna(inplace=True)

    # group_by_date_sentiment = tweet_df.groupby(['created_at', 'sentiment'])['number'].agg('sum')
    group_by_date_sentiment = tweet_df.groupby(['created_at', 'sentiment'], as_index=False).count().pivot('created_at', 'sentiment').fillna(0)

    # print(group_by_date_sentiment)

    df_tmp = group_by_date_sentiment['number']
    # print(group_by_date_sentiment['number'].head())
    df_values = stock_df.set_index('Date')

    final_df = pd.merge(df_values, df_tmp, left_index=True, right_index=True)

    # print(final_df)

    # Work with graph
    columns_df = final_df[['Close', 'Neutral', 'Positive']]
    x = columns_df.values
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    plot_df = pd.DataFrame(x_scaled, columns=columns_df.columns, index=columns_df.index)
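    # Min-max scaling puts the closing price and the tweet counts on a common
    # 0-1 range so they can be overlaid in a single plot below.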

    final_close_price = plot_df['Close']
    neutral = plot_df['Neutral']
    positive = plot_df['Positive']

    # Print the graph
    # Adjust the size of mathplotlib
    mpl.rc('figure', figsize=(8, 7))
    mpl.__version__

    plt.suptitle('Bitcoin Sentiment Analysis on Tweets', fontsize=14, fontweight='bold')
    plt.ylabel('Sentiment')
    plt.xlabel('Time')
    # Adjust the style of matplotlib
    style.use(['ggplot'])
    style.context('Solarize_Light2')

    neutral.plot(
        label='Neutral Tweets',
        color='orange',
        linestyle='dashed',
        linewidth=2,
        alpha=0.5,
        marker='s',
        markersize=5,
        markerfacecolor='blue',
        markeredgecolor='blue',
    )
    positive.plot(
        color='green',
        linestyle='dashed',
        linewidth=2,
        alpha=0.5,
        marker='*',
        markersize=5,
        markerfacecolor='blue',
        markeredgecolor='blue',
        label='Positive Tweets'
    )
    final_close_price.plot(
        color='red',
        linestyle='solid',
        linewidth=4,
        alpha=0.5,
        marker='o',
        markersize=5,
        markerfacecolor='blue',
        markeredgecolor='blue',
        label='BTC-USD'
    )
    plt.legend()

    #save to file
    plt.savefig('app/static/img/sentiment.png')
    # plt.show()
    plt.close()

    # return the price with the best confidence
    maxConfidenceItem = max(prediction_values, key=lambda x: x['confidence'])

    print('maxConfidenceItem: {}'.format(str(maxConfidenceItem)))

    return maxConfidenceItem
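# Hedged usage sketch: assumes the Yahoo/CSV data sources and the scikit-learn,
# pandas-datareader and matplotlib imports used above are available. Returns the
# prediction dict with the highest training-set confidence of the three models.
best = prediction_data('BTC-USD', 5)
print(best['prediction'], best['confidence'])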
Exemple #60
0
def main(params: dict, output_dir: str):
    import mlflow
    print("start params={}".format(params))
    model_id = "all"
    logger = get_logger()
    df = pd.read_pickle(
        "../input/riiid-test-answer-prediction/train_merged.pickle")
    # df = pd.read_pickle("../input/riiid-test-answer-prediction/split10/train_0.pickle").sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    if is_debug:
        df = df.head(30000)
    df["prior_question_had_explanation"] = df[
        "prior_question_had_explanation"].fillna(-1)
    df["answered_correctly"] = df["answered_correctly"].replace(-1, np.nan)
    column_config = {
        ("content_id", "content_type_id"): {
            "type": "category"
        },
        "user_answer": {
            "type": "leakage_feature"
        },
        "answered_correctly": {
            "type": "leakage_feature"
        },
        "part": {
            "type": "category"
        },
        "prior_question_elapsed_time_bin300": {
            "type": "category"
        },
        "duration_previous_content_bin300": {
            "type": "category"
        },
        "prior_question_had_explanation": {
            "type": "category"
        },
        "rating_diff_content_user_id": {
            "type": "numeric"
        },
        "task_container_id_bin300": {
            "type": "category"
        },
        "previous_answer_index_question_id": {
            "type": "category"
        },
        "previous_answer_question_id": {
            "type": "category"
        },
        "timediff-elapsedtime_bin500": {
            "type": "category"
        },
        "timedelta_log10": {
            "type": "category"
        }
    }

    if not load_pickle or is_debug:
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"][
            "DurationPreviousContent"] = DurationPreviousContent(
                is_partial_fit=True)
        feature_factory_dict["user_id"][
            "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_dict["user_id"][
            "UserContentRateEncoder"] = UserContentRateEncoder(
                rate_func="elo", column="user_id")
        feature_factory_dict["user_id"]["PreviousAnswer2"] = PreviousAnswer2(
            groupby="user_id",
            column="question_id",
            is_debug=is_debug,
            model_id=model_id,
            n=300)
        feature_factory_dict["user_id"][
            "StudyTermEncoder2"] = StudyTermEncoder2(is_partial_fit=True)
        feature_factory_dict["user_id"][
            f"MeanAggregatorStudyTimebyUserId"] = MeanAggregator(
                column="user_id", agg_column="study_time", remove_now=False)

        feature_factory_dict["user_id"][
            "ElapsedTimeMeanByContentIdEncoder"] = ElapsedTimeMeanByContentIdEncoder(
            )
        feature_factory_dict["post"] = {
            "DurationFeaturePostProcess": DurationFeaturePostProcess()
        }

        feature_factory_manager = FeatureFactoryManager(
            feature_factory_dict=feature_factory_dict,
            logger=logger,
            split_num=1,
            model_id=model_id,
            load_feature=not is_debug,
            save_feature=not is_debug)
        print("all_predict")
        df = feature_factory_manager.all_predict(df)

        def f(x):
            x = x // 1000
            if x < -100:
                return -100
            if x > 400:
                return 400
            return x

        df["task_container_id_bin300"] = [
            x if x < 300 else 300 for x in df["task_container_id"]
        ]
        df["timediff-elapsedtime_bin500"] = [
            f(x) for x in df["timediff-elapsedtime"].values
        ]
        df["timedelta_log10"] = np.log10(
            df["duration_previous_content"].values)
        df["timedelta_log10"] = df["timedelta_log10"].replace(
            -np.inf, -1).replace(np.inf, -1).fillna(-1).astype("int8")
        df = df[[
            "user_id", "content_id", "content_type_id", "part", "user_answer",
            "answered_correctly", "prior_question_elapsed_time_bin300",
            "duration_previous_content_bin300",
            "prior_question_had_explanation", "rating_diff_content_user_id",
            "task_container_id_bin300", "previous_answer_index_question_id",
            "previous_answer_question_id", "row_id",
            "timediff-elapsedtime_bin500", "timedelta_log10"
        ]]
        print(df.head(10))

        print("data preprocess")

    ff_for_transformer = FeatureFactoryForTransformer(
        column_config=column_config,
        dict_path="../feature_engineering/",
        sequence_length=params["max_seq"],
        logger=logger)
    ff_for_transformer.make_dict(df=df)
    n_skill = len(ff_for_transformer.embbed_dict[("content_id",
                                                  "content_type_id")])

    if not load_pickle or is_debug:
        df_val_row = pd.read_feather(
            "../input/riiid-test-answer-prediction/train_transformer_last2500k_only_row_id.feather"
        )
        if is_debug:
            df_val_row = df_val_row.head(3000)
        df_val_row["is_val"] = 1

        df = pd.merge(df, df_val_row, how="left", on="row_id")
        df["is_val"] = df["is_val"].fillna(0)

        print(df["is_val"].value_counts())

        w_df = df[df["is_val"] == 0]
        w_df["group"] = (
            w_df.groupby("user_id")["user_id"].transform("count") -
            w_df.groupby("user_id").cumcount()) // params["max_seq"]
        w_df["user_id"] = w_df["user_id"].astype(
            str) + "_" + w_df["group"].astype(str)

        group = ff_for_transformer.all_predict(w_df)

        dataset_train = SAKTDataset(group,
                                    n_skill=n_skill,
                                    max_seq=params["max_seq"])

        del w_df
        gc.collect()

    ff_for_transformer = FeatureFactoryForTransformer(
        column_config=column_config,
        dict_path="../feature_engineering/",
        sequence_length=params["max_seq"],
        logger=logger)
    if not load_pickle or is_debug:
        group = ff_for_transformer.all_predict(df[df["content_type_id"] == 0])
        dataset_val = SAKTDataset(group,
                                  is_test=True,
                                  n_skill=n_skill,
                                  max_seq=params["max_seq"])

    os.makedirs("../input/feature_engineering/model275_all", exist_ok=True)
    if not is_debug and not load_pickle:
        with open(f"../input/feature_engineering/model275_all/train.pickle",
                  "wb") as f:
            pickle.dump(dataset_train, f)
        with open(f"../input/feature_engineering/model275_all/val.pickle",
                  "wb") as f:
            pickle.dump(dataset_val, f)

    if not is_debug and load_pickle:
        with open(f"../input/feature_engineering/model275_all/train.pickle",
                  "rb") as f:
            dataset_train = pickle.load(f)
        with open(f"../input/feature_engineering/model275_all/val.pickle",
                  "rb") as f:
            dataset_val = pickle.load(f)
        print("loaded!")
    dataloader_train = DataLoader(dataset_train,
                                  batch_size=params["batch_size"],
                                  shuffle=True)
    dataloader_val = DataLoader(dataset_val,
                                batch_size=params["batch_size"],
                                shuffle=False)

    model = SAKTModel(n_skill,
                      embed_dim=params["embed_dim"],
                      max_seq=params["max_seq"],
                      dropout=dropout,
                      cont_emb=params["cont_emb"])

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
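    # Standard transformer practice: biases and LayerNorm parameters are excluded
    # from weight decay; the two parameter groups below implement that split.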
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.2
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=params["lr"],
        weight_decay=0.2,
    )
    num_train_optimization_steps = int(len(dataloader_train) * 25)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=params["num_warmup_steps"],
        num_training_steps=num_train_optimization_steps)
    criterion = nn.BCEWithLogitsLoss()

    model.to(device)
    criterion.to(device)
    auc_val = 0
    for epoch in range(epochs):
        loss, acc, auc, auc_val = train_epoch(model, dataloader_train,
                                              dataloader_val, optimizer,
                                              criterion, scheduler, epoch,
                                              output_dir, device)
        print("epoch - {} train_loss - {:.3f} auc - {:.4f} auc-val: {:.4f}".
              format(epoch, loss, auc, auc_val))
        torch.save(
            model.state_dict(),
            f"{output_dir}/transformers_epoch{epoch}_auc{round(auc_val, 4)}.pth"
        )

    # df_oof.to_csv(f"{output_dir}/transformers1.csv", index=False)
    """
    df_oof2 = pd.read_csv("../output/ex_237/20201213110353/oof_train_0_lgbm.csv")
    df_oof2.columns = ["row_id", "predict_lgbm", "target"]
    df_oof2 = pd.merge(df_oof, df_oof2, how="inner")

    auc_lgbm = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values)
    print("lgbm: {:.4f}".format(auc_lgbm))

    print("ensemble")
    max_auc = 0
    max_nn_ratio = 0
    for r in np.arange(0, 1.05, 0.05):
        auc = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values*(1-r) + df_oof2["predict"].values*r)
        print("[nn_ratio: {:.2f}] AUC: {:.4f}".format(r, auc))

        if max_auc < auc:
            max_auc = auc
            max_nn_ratio = r
    print(len(df_oof2))
    """
    if not is_debug:
        mlflow.start_run(experiment_id=10, run_name=os.path.basename(__file__))

        for key, value in params.items():
            mlflow.log_param(key, value)
        mlflow.log_metric("auc_val", auc_val)
        mlflow.end_run()
    torch.save(model.state_dict(), f"{output_dir}/transformers.pth")
    del model
    torch.cuda.empty_cache()
    with open(f"{output_dir}/transformer_param.json", "w") as f:
        json.dump(params, f)
    if is_make_feature_factory:
        # feature factory
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"][
            "DurationPreviousContent"] = DurationPreviousContent(
                is_partial_fit=True)
        feature_factory_dict["user_id"][
            "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_manager = FeatureFactoryManager(
            feature_factory_dict=feature_factory_dict,
            logger=logger,
            split_num=1,
            model_id="all",
            load_feature=not is_debug,
            save_feature=not is_debug)

        ff_for_transformer = FeatureFactoryForTransformer(
            column_config=column_config,
            dict_path="../feature_engineering/",
            sequence_length=params["max_seq"],
            logger=logger)
        df = pd.read_pickle(
            "../input/riiid-test-answer-prediction/train_merged.pickle")
        if is_debug:
            df = df.head(10000)
        df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
        feature_factory_manager.fit(df)
        df = feature_factory_manager.all_predict(df)
        for dicts in feature_factory_manager.feature_factory_dict.values():
            for factory in dicts.values():
                factory.logger = None
        feature_factory_manager.logger = None
        with open(f"{output_dir}/feature_factory_manager.pickle", "wb") as f:
            pickle.dump(feature_factory_manager, f)

        ff_for_transformer.fit(df)
        ff_for_transformer.logger = None
        with open(
                f"{output_dir}/feature_factory_manager_for_transformer.pickle",
                "wb") as f:
            pickle.dump(ff_for_transformer, f)