def cal_x_y(col_lst):
    col1 = col_lst[0]
    col2 = col_lst[1]
    name1 = '_'.join([''.join(x.split('_')) for x in col1])
    print('name1:')
    print(name1)
    mean_train = train.groupby(col1)['log_demand'].mean().reset_index(name=name1)
    print('mean_train_1:')
    print(mean_train.head())
    merge = pd.merge(test, mean_train, how='inner', on=col1)
    print('merge1:')
    print(merge.head())
    name2 = '_'.join([''.join(x.split('_')) for x in col2])
    print('name2')
    print(name2)
    mean_train = train.groupby(col2)['log_demand'].mean().reset_index(name=name2)
    print('mean_train_2:')
    print(mean_train.head())
    merge = pd.merge(merge, mean_train, how='inner', on=col2)
    print('merge2:')
    print(merge.head())
    x1 = merge[name1].apply(np.expm1)
    print('x1:')
    print(x1)
    x2 = merge[name2].apply(np.expm1)
    print('x2:')
    print(x2)
    y = merge['log_demand']
    print('y:')
    print(y)
    return x1, x2, y
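# A minimal usage sketch for cal_x_y: the function reads module-level `train` and `test` frames
# with a 'log_demand' column. The toy frames and column names below are hypothetical and only
# illustrate the expected shape, assuming numpy/pandas are imported as np/pd in the same module.
import numpy as np
import pandas as pd

train = pd.DataFrame({
    'store_id': [1, 1, 2, 2],
    'item_id': [10, 11, 10, 11],
    'log_demand': np.log1p([3.0, 5.0, 2.0, 7.0]),
})
test = train.copy()

x1, x2, y = cal_x_y([['store_id'], ['item_id']])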
def get_data(criteria_info):
    '''
    Gets the city name and relevant column(s) for the criteria passed.

    Input: criteria name
    Output: pandas dataframe
    '''
    # Specific query information for cities because of db inconsistencies
    if criteria_info == RELATION_DICT['cities']:
        data = []
        for col in criteria_info[1:]:
            pull = criteria_info[0].objects.values('id', col)
            data.append(pull)
        rv = pd.DataFrame.from_records(data[0])
        for df in data[1:]:
            df = pd.DataFrame.from_records(df)
            rv = pd.merge(rv, df, on='id')
        return rv
    # All other queries
    else:
        data = []
        for col in criteria_info[1:]:
            data.append(criteria_info[0].objects.values('city_id', col))
        rv = pd.DataFrame.from_records(data[0])
        for df in data[1:]:
            df = pd.DataFrame.from_records(df)
            rv = pd.merge(rv, df, on='city_id')
        return rv
def main():
    df = pd.read_csv("../OUTPUT/segmentation_results_k-means.csv", delimiter=",", skipinitialspace=True)
    df_api = pd.read_csv("../OUTPUT/usersInfoAPI.csv", delimiter=",", skipinitialspace=True)

    # aggregate male, female and null genders
    df_api["sesso"] = df_api["sesso"].replace("F", "f")
    df_api["sesso"] = df_api["sesso"].replace("M", "m")
    df_api["sesso"] = df_api["sesso"].replace("N", "n")
    df_api["sesso"] = df_api["sesso"].fillna('n')

    df_friends = pd.read_csv("../OUTPUT/network_degree_node.csv", delimiter=",", skipinitialspace=True)

    df_merged = pd.merge(df_api, df, left_on="user_id", right_on="user_id", how='right')
    df_merged = pd.merge(df_friends, df_merged, left_on="user_id", right_on="user_id", how='right')
    df_merged["sesso"] = df_merged["sesso"].fillna('n')

    # df_merged["data_reg"] = pd.to_datetime(df_merged['data_reg'])
    # print df_merged["degree_initial_network"].mean()
    # generi = df_merged["sesso"].values.tolist()
    # counter_sex = Counter(generi)
    # sex_dict = dict(counter_sex)
    # print sex_dict
    # date_time = datetime.datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S")
    # print datetime.datetime.fromtimestamp(int(df_merged["data_reg"].mean()))
    # sys.exit()
    # plt.style.use("dark_background")

    k_means_analysis(df_merged)
def determine_from_listed_position():
    general_stats_ep = GeneralPlayerStats()

    guards = general_stats_ep.get_data({'Season': '2017-18', 'PlayerPosition': 'G'})
    forwards = general_stats_ep.get_data({'Season': '2017-18', 'PlayerPosition': 'F'})
    centers = general_stats_ep.get_data({'Season': '2017-18', 'PlayerPosition': 'C'})

    guards['G'] = 1
    forwards['F'] = 1
    centers['C'] = 1

    merge_df = pd.merge(guards, forwards, on=['PLAYER_NAME', 'PLAYER_ID', 'TEAM_ABBREVIATION', 'TEAM_ID'], how='outer')
    merge_df = pd.merge(merge_df, centers, on=['PLAYER_NAME', 'PLAYER_ID', 'TEAM_ABBREVIATION', 'TEAM_ID'], how='outer')
    merge_df = merge_df[['PLAYER_NAME', 'PLAYER_ID', 'TEAM_ABBREVIATION', 'TEAM_ID', 'G', 'F', 'C']]
    merge_df = merge_df.fillna(0)

    conditions = [
        ((merge_df['G'] == 1) & (merge_df['F'] == 0) & (merge_df['C'] == 0)),
        ((merge_df['F'] == 1) & (merge_df['C'] == 0)),
        (merge_df['C'] == 1)
    ]
    choices = ['Guard', 'Wing', 'Big']
    merge_df['POSITION'] = np.select(conditions, choices, default='None')

    return merge_df
def test_merge_tables3():
    df_a = pd.DataFrame(
        {'a': [0, 1]},
        index=['a0', 'a1'])
    df_b = pd.DataFrame(
        {'b': [2, 3, 4, 5, 6],
         'a_id': ['a0', 'a1', 'a1', 'a0', 'a1']},
        index=['b0', 'b1', 'b2', 'b3', 'b4'])
    df_c = pd.DataFrame(
        {'c': [7, 8, 9]},
        index=['c0', 'c1', 'c2'])
    df_d = pd.DataFrame(
        {'d': [10, 11, 12, 13, 15, 16, 16, 17, 18, 19],
         'b_id': ['b2', 'b0', 'b3', 'b3', 'b1', 'b4', 'b1', 'b4', 'b3', 'b3'],
         'c_id': ['c0', 'c1', 'c1', 'c0', 'c0', 'c2', 'c1', 'c2', 'c1', 'c2']},
        index=['d0', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8', 'd9'])

    orca.add_table('a', df_a)
    orca.add_table('b', df_b)
    orca.add_table('c', df_c)
    orca.add_table('d', df_d)

    orca.broadcast(cast='a', onto='b', cast_index=True, onto_on='a_id')
    orca.broadcast(cast='b', onto='d', cast_index=True, onto_on='b_id')
    orca.broadcast(cast='c', onto='d', cast_index=True, onto_on='c_id')

    df = orca.merge_tables(target='d', tables=['a', 'b', 'c', 'd'])

    expected = pd.merge(df_a, df_b, left_index=True, right_on='a_id')
    expected = pd.merge(expected, df_d, left_index=True, right_on='b_id')
    expected = pd.merge(df_c, expected, left_index=True, right_on='c_id')

    assert_frames_equal(df, expected)
def pivot_work():
    coster = CcalcCostTime()

    # Build a pivot table and extract the female users
    g_DF_USER_all = pd.pivot_table(g_DF_USER, index=g_DF_USER.index, values='user_id',
                                   columns='gender', fill_value=0)
    g_DF_USER_wm = getUserDataFramebyGender(g_DF_USER_all, 'F')
    merger = pd.merge(g_DF_UDATA, g_DF_USER_wm, left_on='user_id', right_on='F', how='left')
    va = np.vstack(merger[merger['F'] > 0]['rating'])
    # print '\n, merge len', len(va), '\navg', va.mean(), '\nvar', va.var(), '\nstd', va.std()

    # Extract the male users
    g_DF_USER_m = getUserDataFramebyGender(g_DF_USER_all, 'M')
    merger = pd.merge(g_DF_UDATA, g_DF_USER_m, left_on='user_id', right_on='M', how='left')
    va2 = np.vstack(merger[merger['M'] > 0]['rating'])
    # print '\n, merge len', len(va2), '\navg', va2.mean(), '\nvar', va2.var(), '\nstd', va2.std()

    # Read the u.item file
    # df_item = readSrcData('u.item', '|')
    # print 'item\n', df_item.head(5)

    ''' Compute the standard deviation of movie ratings for female and male users '''
    F_StandardDiff = va.std()   # 1.0
    M_StandardDiff = va2.std()  # 2.0
    ser_result = pd.Series({'F': F_StandardDiff, 'M': M_StandardDiff})
    ser_result.name = 'rating'
    print('\n\ngender\n', ser_result)
def chuli_consumer(train_ccx_A=train_ccx_A, train_target_A=train_target_A, over=datetime(2017, 6, 1)):
    # Only data from 2017.1-2017.5 is available, so keep just the month as a categorical variable
    train_ccx_A[['var_06']] = train_ccx_A[['var_06']].apply(pd.to_datetime)
    train_ccx_A['datediff'] = (over - train_ccx_A['var_06']).apply(lambda x: x.days)

    query = train_ccx_A.groupby(train_ccx_A['ccx_id']).size()  # number of queries
    query = query.reset_index()  # turn the index into a column

    # time between the last consumption record and the analysis date
    datediff = train_ccx_A['datediff'].groupby(train_ccx_A['ccx_id']).min()
    datediff = datediff.reset_index()
    query = pd.merge(query, datediff, on='ccx_id', how='left')
    query.columns = ['ccx_id', 'query', 'datediff']

    # derived variable: consumption frequency
    tmp1 = train_ccx_A['datediff'].groupby(train_ccx_A['ccx_id']).min()
    tmp2 = train_ccx_A['datediff'].groupby(train_ccx_A['ccx_id']).max()
    query['query'] = list(query['query'].tolist() / (tmp2 - tmp1))
    query['query'][query['query'] == float('inf')] = 0

    df = pd.get_dummies(train_ccx_A)  # one-hot encode into dummy variables, which adds features
    df2 = df.groupby(['ccx_id'], as_index=False).sum()  # aggregate (sum) by id
    df3 = pd.merge(df2, query, on='ccx_id', how='left')  # merge query with df2
    df3 = pd.merge(train_target_A, df3, on='ccx_id', how='left')  # merge the target with the ccx data
    df4 = df3.drop(['target'], axis=1)  # data only, without the target
    df4 = df4.fillna(0)
    df4 = df4.set_index("ccx_id")
    return df4
def course_feature():
    user = read_user('data\\object\\object_num2.csv')[['course_id_num', 'module_id', 'category_num', 'start', 'children']]
    # total1 = user[['course_id_num', 'module_id']].groupby(['course_id_num']).count()
    total2 = user[['course_id_num', 'module_id']].drop_duplicates().groupby(['course_id_num']).count()
    total2.columns = ['category_all']

    part = user[['course_id_num', 'module_id', 'category_num']].drop_duplicates()
    part1 = part[['course_id_num', 'category_num']]
    part_en = user[['course_id_num']].drop_duplicates().set_index('course_id_num')
    for i in range(15):
        category_cnt = part1[part1['category_num'] == i].groupby(['course_id_num']).count()
        category_cnt.columns = ['category' + str(i)]
        part_en = pd.merge(part_en, category_cnt, how='outer', left_index=True, right_index=True)

    part_all = pd.merge(part_en, total2, how='outer', left_index=True, right_index=True)
    part_all.fillna(0, inplace=True)
    part_all = part_all.astype('int')

    enroll = read_user('data\\train\\enrollment_train_num.csv')[['enrollment_id', 'course_id_num']].set_index('course_id_num')
    part_enroll = pd.merge(enroll, part_all, how='outer', left_index=True, right_index=True)
    part_enroll_1 = part_enroll.set_index('enrollment_id')  # sort_index(inplace=True)

    feature_all = read_user('data\\train\\feature_all_11.csv').set_index('enrollment_id')
    feature_all2 = pd.merge(feature_all, part_enroll_1, how='outer', left_index=True, right_index=True)
    feature_all2.to_csv('data\\train\\feature_all_12.csv')
def recommendByUserFC(userid, k=3, wantedNum=5):
    coster = CcalcCostTime()
    # 1. Map each user id to the movies they have seen, and each movie id to its users; done here with a DataFrame
    merger = pd.merge(g_DF_UDATA, g_DF_USER, on='user_id')

    # 2. Compute the K nearest neighbours
    nears = calcNears(merger, userid, k)
    # testPrint(nears, merger, userid)

    # 3. For every movie the neighbours have seen, compute a recommendation score weighted by proximity
    movieitems_dist = {}
    for item in nears:
        nearmovies = (merger[merger.user_id == item[1]])['item_id'].values
        for movie in nearmovies:
            if movie in movieitems_dist:
                movieitems_dist[movie] += item[0]
            else:
                movieitems_dist[movie] = item[0]

    # 4. Sort by recommendation score
    SeriesMovies = pd.Series(movieitems_dist).sort_values()
    print('\n', SeriesMovies.tail(wantedNum))

    # 5. Output
    recommned_moiveID_df = pd.DataFrame(SeriesMovies.tail(wantedNum).keys(), columns=['item_id'])
    recomm_merger = pd.merge(g_DF_MOVIE_ITEM, recommned_moiveID_df, on='item_id')
    print('\n\n', userid, "'s recom list:\n", recomm_merger.loc[:, ['item_id', 'title', 'release']])
def add_judgments_and_frequencies_to_qa_pairs(qa_pairs, judgments, question_frequencies, remove_newlines):
    """
    Collate system answer confidences and annotator judgments by question/answer pair.
    Add to each pair the question frequency. Collated system files are used as input to
    subsequent cross-system analyses.

    Though you expect the set of question/answer pairs in the system answers and judgments
    to not be disjoint, it may be the case that neither is a subset of the other. If
    annotation is incomplete, there may be Q/A pairs in the system answers that haven't
    been annotated yet. If multiple systems are being judged, there may be Q/A pairs in
    the judgements that don't appear in the system answers.

    Some versions of Annotation Assist strip newlines from the answers they return in the
    judgement files, so optionally take this into account when joining on question/answer
    pairs.

    :param qa_pairs: question, answer, and confidence provided by a Q&A system
    :type qa_pairs: pandas.DataFrame
    :param judgments: question, answer, in purview, and judgement provided by annotators
    :type judgments: pandas.DataFrame
    :param question_frequencies: question and question frequency in the test set
    :type question_frequencies: pandas.DataFrame
    :param remove_newlines: join judgments on answers with newlines removed
    :type remove_newlines: bool
    :return: question and answer pairs with confidence, in purview, judgement and question frequency
    :rtype: pandas.DataFrame
    """
    qa_pairs = pandas.merge(qa_pairs, question_frequencies, on=QUESTION, how="left")
    if remove_newlines:
        qa_pairs["Temp"] = qa_pairs[ANSWER].str.replace("\n", "")
        qa_pairs = qa_pairs.rename(columns={"Temp": ANSWER, ANSWER: "Temp"})
    qa_pairs = pandas.merge(qa_pairs, judgments, on=(QUESTION, ANSWER), how="left")
    if remove_newlines:
        del qa_pairs[ANSWER]
        qa_pairs = qa_pairs.rename(columns={"Temp": ANSWER})
    return qa_pairs
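# A minimal usage sketch for add_judgments_and_frequencies_to_qa_pairs, assuming QUESTION and
# ANSWER are the module-level column-name constants (hypothetically "Question" and "Answer" here)
# and that pandas is imported under that name; the toy frames are hypothetical.
import pandas

QUESTION, ANSWER = "Question", "Answer"  # assumed column-name constants

qa_pairs = pandas.DataFrame({
    QUESTION: ["q1", "q1", "q2"],
    ANSWER: ["a1", "a2", "a3"],
    "Confidence": [0.9, 0.4, 0.7],
})
judgments = pandas.DataFrame({
    QUESTION: ["q1", "q2"],
    ANSWER: ["a1", "a3"],
    "Judgment": [1, 0],
})
question_frequencies = pandas.DataFrame({QUESTION: ["q1", "q2"], "Frequency": [5, 2]})

collated = add_judgments_and_frequencies_to_qa_pairs(
    qa_pairs, judgments, question_frequencies, remove_newlines=False)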
def users_per_course_all():
    # Number of users per course across the train + test sets. Produces feature_all_11.csv
    u1 = read_user('data\\enrollment\\enrollment_all_num.csv')
    u2 = u1[['username', 'course_id_num']].groupby(['course_id_num']).count()
    u2.columns = ['users_per_course']
    u3 = u1.copy().set_index('course_id_num')[['enrollment_id']]
    e = pd.merge(u3, u2, how="outer", left_index=True, right_index=True)
    f = e.set_index('enrollment_id')
    f.sort_index(inplace=True)

    train = read_user('data\\train\\enrollment_train_num.csv')
    test = read_user('data\\test\\enrollment_test_num.csv')
    tr1 = train[['enrollment_id']].set_index('enrollment_id')
    tr1m = pd.merge(tr1, f, how="inner", left_index=True, right_index=True)
    tr1m.columns = ['users_per_course_all']
    te1 = test[['enrollment_id']].set_index('enrollment_id')
    te1m = pd.merge(te1, f, how="inner", left_index=True, right_index=True)
    te1m.columns = ['users_per_course_all']

    feature_all = read_user('data\\test\\feature_all_10.csv').set_index('enrollment_id')
    feature_all1 = pd.merge(feature_all, te1m, how='outer', left_index=True, right_index=True)
    feature_all1.to_csv('data\\test\\feature_all_11.csv')

    feature_all2 = read_user('data\\train\\feature_all_10.csv').set_index('enrollment_id')
    feature_all3 = pd.merge(feature_all2, tr1m, how='outer', left_index=True, right_index=True)
    feature_all3.to_csv('data\\train\\feature_all_11.csv')
def wishlist_scores(data_path, scoring_function, preserve_zeros=True):
    '''returns iso/ft ratio score & iso count ratio score as dataframes
    see weighted_percentile for methodology
    higher ratio or iso count means higher score
    '''
    ftiso = pd.read_csv(data_path)

    # calculate iso counts
    iso = ftiso[ftiso['type'] == 'iso']
    iso = iso.groupby('beer_id', as_index=False).count()[['id', 'beer_id']]
    iso.columns = ['iso_count', 'beer_id']

    # calculate ft counts
    ft = ftiso[ftiso['type'] == 'ft']
    ft = ft.groupby('beer_id', as_index=False).count()[['id', 'beer_id']]
    ft.columns = ['ft_count', 'beer_id']

    if preserve_zeros:
        # outer merge to get combined iso & ft counts for each beer
        iso_ft = pd.merge(iso, ft, on='beer_id', how='outer')
        iso_ft = iso_ft.fillna(0)
        # laplace smoothing for iso & ft counts (avoids dividing by zero)
        iso_ft['ft_count'] = iso_ft['ft_count'] + 1
        iso_ft['iso_count'] = iso_ft['iso_count'] + 1
    else:
        # inner merge to eliminate zeros in iso & ft counts for each beer
        iso_ft = pd.merge(iso, ft, on='beer_id', how='inner')

    # finally get demand to supply ratio score
    iso_ft['ratio'] = iso_ft['iso_count'] / iso_ft['ft_count']
    iso_ft['ratio_score'] = scoring_function(iso_ft['ratio'])

    # get iso score
    iso_ft['iso_score'] = scoring_function(iso_ft['iso_count'])
    return iso_ft
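# wishlist_scores expects a scoring_function; the project's weighted_percentile is not shown here,
# so the stand-in below is only an assumption about its shape: it maps each value to its
# percentile rank on a 0-100 scale, so higher ratios or iso counts get higher scores.
import pandas as pd

def percentile_score(values):
    # rank(pct=True) gives the fraction of values at or below each value
    return pd.Series(values).rank(pct=True) * 100

# scores = wishlist_scores('ftiso.csv', percentile_score, preserve_zeros=True)  # hypothetical path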
def execute(bins=10, ylim=False):
    data = pandas.merge(load_terms(),
                        load_search_results().rename(columns={'identifier': 'term_id'}),
                        on=['term_id'], how='inner')
    data = data[data['term_name'].apply(lambda x: len(x.split(';')[0]) > 5)]
    data = data[data['term_id'].apply(lambda x: x.startswith('A'))]
    data = pandas.merge(data, load_radiopaedia_terms(), on=['term_id', 'term_name'], how='inner')
    # load_radiopaedia_terms()
    # g = sns.pairplot(data, vars=['search_results_log', 'pagerank', 'difficulty_prob'])
    # for ax in g.axes.flat:
    #     if ax.get_xlabel() in ['difficulty_prob', 'pagerank']:
    #         ax.set_xlim(0, 1)
    #     if ax.get_ylabel() in ['difficulty_prob', 'pagerank']:
    #         ax.set_ylim(0, 1)
    #     if min(ax.get_xticks()) < 0:
    #         ax.set_xlim(0, max(ax.get_xticks()))
    #     if min(ax.get_yticks()) < 0:
    #         ax.set_ylim(0, max(ax.get_yticks()))
    # output.savefig('importance_pair', tight_layout=False)
    rcParams['figure.figsize'] = 30, 20
    for term_name, difficulty_prob, pagerank in data[['term_name', 'difficulty_prob', 'pagerank']].values:
        plt.plot(1 - difficulty_prob, pagerank, color='red', marker='s', markersize=10)
        plt.text(1 - difficulty_prob, pagerank, term_name)
    if ylim:
        plt.ylim(0, 0.5)
    plt.xlabel('Predicted error rate')
    plt.ylabel('Pagerank')
    output.savefig('importance_pagerank')
def createMaster(raw_data_path, clean_data_path):
    # Get the cleaned data
    projects = cleanProjects(raw_data_path, clean_data_path)
    outcomes = cleanOutcomes(raw_data_path, clean_data_path)

    # Try the merge. Nothing wrong with the data, have 664098 unique projects
    print("Merging outcomes and project data set...")
    project_with_outcome = pd.merge(projects, outcomes, how='outer', on='projectid')
    project_with_outcome['projectid'].nunique()
    del projects
    del outcomes

    # Merge in essay data, have 664098 unique projects
    essays = cleanEssays(raw_data_path, clean_data_path)
    print("Merging in essay data...")
    master = pd.merge(project_with_outcome, essays, how='outer', on='projectid')
    del essays
    master['projectid'].nunique()

    print("Saving master data...")
    master.to_csv(clean_data_path + 'master.csv')

    print("Saving subsample of master")
    firstThousandMaster = master[0:999]
    firstThousandMaster.to_csv(clean_data_path + 'firstThousandMaster.csv')

    return master
def _calculate_normalised_dispersion(model, input_files, beta, header, unit, cut, output, accelerator):
    # TODO there are no errors from orbit
    df_orbit = pd.DataFrame(model).loc[:, ['S', 'MUX', 'DPX', 'DX', 'X', 'BETX']]
    df_orbit['NDXMDL'] = df_orbit.loc[:, 'DX'] / np.sqrt(df_orbit.loc[:, 'BETX'])
    df_orbit.rename(columns={'MUX': 'MUXMDL', 'DPX': 'DPXMDL', 'DX': 'DXMDL', 'X': 'XMDL'}, inplace=True)
    df_orbit['COUNT'] = len(input_files.get_columns(df_orbit, 'CO'))
    dpps = input_files.dpps("X")
    df_orbit = pd.merge(df_orbit, input_files.joined_frame("X", ['CO', 'CORMS', 'AMPX']),
                        how='inner', left_index=True, right_index=True)
    df_orbit = pd.merge(df_orbit, beta.loc[:, ['BETX', 'ERRBETX']], how='inner',
                        left_index=True, right_index=True, suffixes=('', '_phase'))
    if np.max(dpps) - np.min(dpps) == 0.0:
        return  # temporary solution
        # raise ValueError('Cannot calculate dispersion, only a single dpoverp')
    fit = np.polyfit(dpps, SCALES[unit] * input_files.get_data(df_orbit, 'CO').T, 1, cov=True)
    # TODO there is no error from AMPX
    df_orbit['NDX_unscaled'] = fit[0][-2, :].T / stats.weighted_mean(input_files.get_data(df_orbit, 'AMPX'), axis=1)
    df_orbit['STDNDX_unscaled'] = np.sqrt(fit[1][-2, -2, :].T) / stats.weighted_mean(input_files.get_data(df_orbit, 'AMPX'), axis=1)
    df_orbit = df_orbit.loc[np.abs(fit[0][-1, :].T) < cut * SCALES[unit], :]
    mask = accelerator.get_element_types_mask(df_orbit.index, ["arc_bpm"])
    global_factor = np.sum(df_orbit.loc[mask, 'NDXMDL'].values) / np.sum(df_orbit.loc[mask, 'NDX_unscaled'].values)
    df_orbit['NDX'] = global_factor * df_orbit.loc[:, 'NDX_unscaled']
    df_orbit['STDNDX'] = global_factor * df_orbit.loc[:, 'STDNDX_unscaled']
    df_orbit['DX'] = df_orbit.loc[:, 'NDX'] * np.sqrt(df_orbit.loc[:, 'BETX_phase'])
    df_orbit['STDDX'] = df_orbit.loc[:, 'STDNDX'] * np.sqrt(df_orbit.loc[:, 'BETX_phase'])
    df_orbit['DPX'] = _calculate_dp(model, df_orbit.loc[:, ['DX', 'STDDX']], "X")
    df_orbit['DELTANDX'] = df_orbit.loc[:, 'NDX'] - df_orbit.loc[:, 'NDXMDL']
    output_df = df_orbit.loc[:, ['S', 'COUNT', 'NDX', 'STDNDX', 'DX', 'DPX',
                                 'NDXMDL', 'DXMDL', 'DPXMDL', 'MUXMDL', 'DELTANDX']]
    tfs_pandas.write_tfs(join(output, header['FILENAME']), output_df, header, save_index='NAME')
    return output_df
def corrects_incorrects_counter_win(ds, window=None):
    '''
    Receives the dataset and creates a cumulative windowed sum for the
    columns corrects and incorrects
    '''
    # If window not specified not use window
    student_cfa = ds[['student_id', 'step_id', 'corrects', 'incorrects']]
    grouped = student_cfa.groupby(['student_id', 'step_id'])
    new_df = pd.DataFrame(np.zeros((ds.shape[0], 2)), columns=['cum_corr', 'cum_incorr'])
    if window:
        cum = grouped.cumsum()
        cum_df = pd.merge(student_cfa[['student_id', 'step_id']], cum,
                          right_index=True, left_index=True)
        grouped_cum = cum_df.groupby(['student_id', 'step_id'])
        cum_delay = grouped_cum.shift(window).fillna(0)
        diff = cum - cum_delay
    diff_df = pd.merge(student_cfa[['student_id', 'step_id']], diff,
                       right_index=True, left_index=True)
    diff_df = diff_df.groupby(['student_id', 'step_id'])
    previous_columns = diff_df.shift(1)
    previous_columns = previous_columns.fillna(0)
    previous_columns.columns = ['prev_corr', 'prev_incorr']
    return (previous_columns.prev_corr, previous_columns.prev_incorr)
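# The windowed counts above rely on a cumulative-sum-minus-shifted-cumulative-sum trick: within
# each (student_id, step_id) group, cumsum() gives running totals, and subtracting the same
# totals shifted by `window` rows leaves the count over the last `window` attempts only.
# A minimal sketch of that trick on hypothetical data:
import pandas as pd

ds = pd.DataFrame({
    'student_id': [1, 1, 1, 1, 2, 2],
    'step_id':    ['s1', 's1', 's1', 's1', 's1', 's1'],
    'corrects':   [1, 0, 1, 1, 1, 0],
    'incorrects': [0, 1, 0, 0, 0, 1],
})

window = 2
grouped = ds.groupby(['student_id', 'step_id'])[['corrects', 'incorrects']]
cum = grouped.cumsum()  # running totals per (student_id, step_id)
cum_delay = cum.groupby([ds['student_id'], ds['step_id']]).shift(window).fillna(0)
windowed = cum - cum_delay  # totals over the last `window` attempts only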
def createMatchAndTournamentTables(matchesFilepath, playersDF, conn):
    # Load the matches into a DataFrame
    matchsDF = pd.read_csv(matchesFilepath)

    # Reformat the date so it matches the format of the players' dates of birth
    matchsDF.event_time = matchsDF.event_time.apply(lambda x: x.split(" ")[0])
    matchsDF.event_time = pd.to_datetime(matchsDF.event_time).apply(lambda x: x.strftime("%d/%m/%Y"))

    # Extract the tournaments and write them to SQL
    tournamentsDF = matchsDF[["event_time", "event_name", "surface"]].drop_duplicates()
    tournamentsDF.index = range(len(tournamentsDF))
    tournamentsDF.to_sql("tournaments", conn, if_exists="replace")

    # Join on the tournaments (for the players, the name serves as the join key)
    tournamentsDF["idTournoi"] = tournamentsDF.index
    matchsDF = pd.merge(matchsDF, tournamentsDF)

    # Join on the players
    playersDF["idPlayerA"] = playersDF.index
    playersDF = playersDF[["idPlayerA", "playername"]].rename(columns={"playername": "playerA"})
    matchsDF = pd.merge(matchsDF, playersDF, how="left")
    playersDF = playersDF.rename(columns={"idPlayerA": "idPlayerB", "playerA": "playerB"})
    matchsDF = pd.merge(matchsDF, playersDF, how="left")

    # Drop the now-redundant tournament and player columns and write the table
    matchsDF.drop(["playerA", "playerB", "Unnamed: 0", "event_time", "event_name", "surface"],
                  axis=1, inplace=True)
    matchsDF.to_sql("matchs", conn, if_exists="replace")

    return matchsDF, tournamentsDF
def score_item(self, train_file, test_file, score_type):
    train_df = pd.read_csv(train_file, header=None, names=['userId', 'movieId', 'rating', 'timestamp'])
    test_df = pd.read_csv(test_file, header=None, names=['userId', 'movieId', 'rating', 'timestamp'])
    test_users = pd.unique(test_df.userId)

    rating_from_test_users = train_df[train_df.userId.isin(test_users)]
    rating_from_test_users_cluster = pd.merge(self.cluster_df, rating_from_test_users, on='movieId')
    rating_from_test_users_cluster = rating_from_test_users_cluster.groupby(['userId', 'cluster'])['rating'] \
        .agg(np.mean).reset_index()

    if score_type == 'optimal':
        rating_join = pd.merge(test_df, self.model, left_on='movieId', right_on='Item', how='left') \
            .drop(['timestamp', 'Partition', 'Item', 'Rank'], axis=1)
        rating_join['error'] = rating_join['rating'] - rating_join['Score']
        rmse_summary = rating_join.groupby(['userId', 'cluster'])['error'] \
            .agg(lambda x: np.linalg.norm(x) / np.sqrt(len(x))).reset_index()
        test_user_cluster_map = rmse_summary.groupby('userId') \
            .apply(lambda x: x.loc[x['error'].idxmin()]).reset_index(drop=1)
    else:
        test_user_cluster_map = rating_from_test_users_cluster.groupby('userId') \
            .apply(lambda x: x.loc[x['rating'].idxmax()]).reset_index(drop=1)

    output = pd.merge(test_user_cluster_map, self.model, on='cluster', how='left')
    output.userId = output.userId.astype(np.int64)
    output.Item = output.Item.astype(np.int64)
    output[['userId', 'Item', 'Score']].to_csv(sys.stdout, header=False, index=False)
def genCoauthors(aDF, names):
    coauthors = pd.merge(aDF.drop(['name', 'stnname', 'paperCount'], axis=1),
                         aDF.drop(['name', 'stnname', 'paperCount'], axis=1),
                         how='outer', on='paperID', suffixes=['1', '2'])
    coauthors = coauthors[coauthors['authorID1'] != coauthors['authorID2']]
    coauthors = pd.merge(coauthors,
                         names.rename(columns={'authorID': 'authorID1', 'stnname': 'stnname1'}),
                         how='left', on='authorID1')
    coauthors = pd.merge(coauthors,
                         names.rename(columns={'authorID': 'authorID2', 'stnname': 'stnname2'}),
                         how='left', on='authorID2')
    return coauthors
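# A small usage sketch for genCoauthors on hypothetical data: the self-merge on paperID yields
# every ordered pair of distinct authors that share a paper, and the two follow-up merges attach
# each author's standardised name.
import pandas as pd

aDF = pd.DataFrame({
    'paperID': ['p1', 'p1', 'p2'],
    'authorID': ['a1', 'a2', 'a1'],
    'name': ['Ann', 'Bob', 'Ann'],
    'stnname': ['ann', 'bob', 'ann'],
    'paperCount': [2, 1, 2],
})
names = pd.DataFrame({'authorID': ['a1', 'a2'], 'stnname': ['ann', 'bob']})

pairs = genCoauthors(aDF, names)
# p1 yields the ordered pairs (a1, a2) and (a2, a1); p2 has a single author and yields none.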
def test_right_outer_join(self):
    joined_key2 = merge(self.df, self.df2, on='key2', how='right')
    _check_join(self.df, self.df2, joined_key2, ['key2'], how='right')

    joined_both = merge(self.df, self.df2, how='right')
    _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], how='right')
def test_inner_join(self):
    joined_key2 = merge(self.df, self.df2, on='key2', how='inner')
    _check_join(self.df, self.df2, joined_key2, ['key2'], how='inner')

    joined_both = merge(self.df, self.df2, how='inner')
    _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], how='inner')
def get_full_features():
    reviews = get_reviews()
    tips = get_tips()
    reviews_tips = reviews.append(tips)
    reviews_tips.columns = ['restaurant_id', 'review_date', 'review_id', 'review_stars',
                            'review_text', 'review_type', 'user_id', 'review_votes_cool',
                            'review_votes_funny', 'review_votes_useful']
    reviews_tips.review_votes_useful.fillna(0, inplace=True)
    reviews_tips.review_votes_cool.fillna(0, inplace=True)
    reviews_tips.review_votes_funny.fillna(0, inplace=True)
    reviews_tips = map_ids(reviews_tips)

    # # saving this for tfidf vectorizer training later
    # with open('pickle_jar/reviews_tips_original_text.pkl', 'w') as f:
    #     pickle.dump(reviews_tips.review_text.tolist(), f)

    users = get_users()
    users_reviews_tips = pd.merge(reviews_tips, users, how='left', on='user_id')

    restaurants = get_restaurants()
    restaurants_users_reviews_tips = pd.merge(users_reviews_tips, restaurants, how='outer', on='restaurant_id')

    # if checkins dont exist for a restaurant dont want to drop the restaurant values
    checkins = get_checkins()
    full_features = pd.merge(restaurants_users_reviews_tips, checkins, how='left', on='restaurant_id')

    # drop restaurants not found in boston data
    full_features = full_features[pd.notnull(full_features.restaurant_id)]

    return full_features
def plot_clf_polar(clf, cmap=None, key='nickname', n_topics=60, n_top=3, labels=None,
                   topics=None, mask=None, selection='top', metric='correlation', max_val=None):
    import pandas as pd
    import seaborn as sns

    ## Set up topic nicknames
    word_keys = pd.read_csv("../data/unprocessed/abstract_topics_filtered/topic_sets/topic_keys"
                            + str(n_topics) + "-july_cognitive.csv")
    word_keys['topic_name'] = "topic" + word_keys['topic'].astype('str')

    o_fi = pd.DataFrame(clf.odds_ratio)

    # Melt feature importances, and add top_words for each feature
    o_fi['region'] = range(1, o_fi.shape[0] + 1)
    o_fis_melt = pd.melt(o_fi, var_name='topic_order', value_name='importance', id_vars=['region'])

    word_keys = pd.merge(
        pd.DataFrame(np.array([range(0, clf.feature_importances.shape[1]), clf.feature_names]).T,
                     columns=['topic_order', 'topic_name']),
        word_keys)
    word_keys.topic_order = word_keys.topic_order.astype('int')

    o_fis_melt = pd.merge(o_fis_melt, word_keys)
    o_fis_melt['abs_imp'] = np.abs(o_fis_melt['importance'])

    if mask is not None:
        o_fis_melt = o_fis_melt[o_fis_melt.region.isin(mask)]

    if topics is not None:
        o_fis_melt = o_fis_melt[o_fis_melt[key].isin(topics)]

    pplot = pd.pivot_table(o_fis_melt, values='importance', index=[key], columns=['region'])

    if cmap is None:
        cmap = sns.color_palette('Set1', clf.feature_importances.shape[0])

    if mask is not None:
        cmap = [n[0] for n in sorted(zip(np.array(cmap)[np.array(mask) - 1], mask), key=lambda tup: tup[1])]

    return plot_polar(pplot, overplot=True, palette=cmap, n_top=n_top, metric=metric,
                      selection=selection, label_size=30, labels=labels, max_val=max_val)
def runSharesPSRCToBKRZones():
    # list of two lists
    files_shares = [files_manu_shares, file_wtcu_shares]
    header_rows = 3  # number of rows at the beginning of a file with header information
    headers = {}     # dictionary to save header information

    for files_group in files_shares:
        for file in files_group:
            print("working on file: " + file)
            file_path = os.path.join(wd, file)

            # read header - use "#" as separator as it is less likely to be present in the file
            headers[file] = pd.read_table(file_path, delimiter="#", header=None, nrows=header_rows)

            # skip first few rows, as they contain general information - also ignore rows starting with 'c' (comment lines)
            shares_psrc = pd.read_table(file_path, delimiter=" ", names=["o", "d", file],
                                        comment="c", skiprows=header_rows)

            if file == files_group[0]:
                # if first file in the group, set to the file shares
                truck_shares_psrc = shares_psrc
            else:
                # add a new column for a new file
                truck_shares_psrc = pd.merge(truck_shares_psrc, shares_psrc, on=["o", "d"])

        # merge psrc to bkr correspondence with percent
        tazGroups = pd.merge(truck_shares_psrc, tazShares, left_on="o", right_on="psrc_zone_id")
        tazGroups[file] = tazGroups[file] * tazGroups["percent"]

        # group by unique pair of bkr zone and group
        tazGroups_grouped = tazGroups.groupby(["bkr_zone_id"])

        # calculate sum of percent by unique pair
        tazGroups_sum = tazGroups_grouped[files_group].sum()
        tazGroups_sum['sum'] = tazGroups_sum[files_group].sum(axis=1)

        for file in files_group:
            tazGroups_sum[file] *= 1 / tazGroups_sum['sum']

        tazGroups_sum['sum'] = tazGroups_sum[files_group].sum(axis=1)
        tazGroups_sum = tazGroups_sum.round(4)  # round values to 4 decimals
        # temp = tazGroups_sum.ix[tazGroups_sum["sum"]>1.0]  # debug: to find out rows that have sum value more than 1

        tazGroups_sum = tazGroups_sum[files_group].reset_index()  # makes object a data frame by setting the current index to a column
        tazGroups_sum["c"] = "all:"

        for file in files_group:
            tazGroups_bkr = tazGroups_sum[["bkr_zone_id", "c", file]]
            tazGroups_bkr = tazGroups_bkr.sort_values(by=['bkr_zone_id'], ascending=[True])

            # write - first header and then append the updated data
            outfile = file.split(".")[0]
            outfile = os.path.join(wd, outfile + "_bkr.in")

            # first write header
            # had to add space as escapechar, otherwise it throws an error - not sure if that would cause any issue in the model
            headers[file].to_csv(outfile, sep=" ", header=False, index=False,
                                 quoting=csv.QUOTE_NONE, escapechar=" ")

            # write data
            with open(outfile, 'a') as wfile:
                tazGroups_bkr.to_csv(wfile, sep=" ", header=False, index=False)
def test_hash_join(how):
    A = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], 'y': [1, 1, 2, 2, 3, 4]})
    a = dd.repartition(A, [0, 4, 5])

    B = pd.DataFrame({'y': [1, 3, 4, 4, 5, 6], 'z': [6, 5, 4, 3, 2, 1]})
    b = dd.repartition(B, [0, 2, 5])

    c = hash_join(a, 'y', b, 'y', how)

    result = c.compute()
    expected = pd.merge(A, B, how, 'y')
    list_eq(result, expected)

    # Different columns and npartitions
    c = hash_join(a, 'x', b, 'z', 'outer', npartitions=3)
    assert c.npartitions == 3

    result = c.compute()
    expected = pd.merge(A, B, 'outer', None, 'x', 'z')
    list_eq(result, expected)

    assert hash_join(a, 'y', b, 'y', 'inner')._name == \
        hash_join(a, 'y', b, 'y', 'inner')._name
    assert hash_join(a, 'y', b, 'y', 'inner')._name != \
        hash_join(a, 'y', b, 'y', 'outer')._name
def conjoint(self):
    '''
    Computes the partner's identifier and checks that partners are indeed reciprocal
    '''
    print("working on partners")
    ind = self.ind
    conj = ind.loc[ind['couple'] == 1, ['men', 'lienpref', 'id']]
    conj['lienpref'].value_counts()
    conj.loc[conj['lienpref'] == 1, 'lienpref'] = 0
    conj.loc[conj['lienpref'] == 31, 'lienpref'] = 2
    conj.loc[conj['lienpref'] == 32, 'lienpref'] = 3
    conj.loc[conj['lienpref'] == 50, 'lienpref'] = 10
    conj2 = merge(conj, conj, on=['men', 'lienpref'])
    conj2 = conj2[conj2['id_x'] != conj2['id_y']]
    assert len(conj2) == len(conj)
    conj = conj2
    test = conj.groupby(['men', 'lienpref']).size()
    assert max(test) == 2 and min(test) == 2
    couple = conj.groupby('id_x')
    for id, potential in couple:
        if len(potential) == 1:
            conj.loc[conj['id_x'] == id, 'id_y'] = potential['id_y']
        else:
            pdb.set_trace()  # TODO: no problem here, but strange
    conj = conj.rename(columns={'id_x': 'id', 'id_y': 'conj'})
    ind = merge(ind, conj[['id', 'conj']], on='id', how='left')
    self.ind = ind
    ## check that partners are reciprocal
    test_conj = merge(ind[['conj', 'id']], ind[['conj', 'id']], left_on='id', right_on='conj')
    print("the number of non-reciprocal couples is:", sum(test_conj['id_x'] != test_conj['conj_y']))
    print("finished working on partners")
def flattenTable(fulltable, levelcol, idcol, parentidcol, countchildren, removeempty):
    fulltable[[levelcol]] = fulltable[[levelcol]].astype(int)

    levels = dict(list(fulltable.groupby(levelcol)))
    minlevel = fulltable.level.min()
    for level, data in sorted(levels.items()):
        # First level is the starting point for the following merges
        if level == minlevel:
            # data = data[[idcol, 'object_id', 'object_type']]
            data = data.add_prefix('level_{}-'.format(level))
            flattable = data
        else:
            # Aggregate object types and join them
            for col_countchildren in countchildren:
                children = data[parentidcol].groupby([data[parentidcol], data[col_countchildren]]).count()
                children = children.unstack(col_countchildren)
                children['total'] = children.sum(axis=1)
                children = children.add_prefix('level_{}-children-{}-'.format(level - 1, col_countchildren))

                leftkey = 'level_{}-id'.format(level - 1)
                flattable = merge(flattable, children, how='left', left_on=leftkey, right_index=True)
                flattable[children.columns.values.tolist()] = flattable[children.columns.values.tolist()].fillna(0).astype(int)

            # Join data
            data['childnumber'] = data.groupby(parentidcol).cumcount()
            leftkey = 'level_{}-{}'.format(level - 1, idcol)
            rightkey = 'level_{}-{}'.format(level, parentidcol)
            data = data.drop([levelcol], axis=1)
            data = data.add_prefix('level_{}-'.format(level))
            flattable = merge(flattable, data, how="outer", left_on=leftkey, right_on=rightkey)

    if removeempty:
        flattable = flattable.dropna(axis=1, how='all')

    return flattable
def hierarchicallyIndexMerge():
    lefth = DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
                       'key2': [2000, 2001, 2002, 2001, 2002],
                       'data': np.arange(5.)})
    righth = DataFrame(np.arange(12).reshape((6, 2)),
                       index=[['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'],
                              [2001, 2000, 2000, 2000, 2001, 2002]],
                       columns=['event1', 'event2'])
    print('DataFrame 1: \n{}'.format(lefth))
    print('DataFrame 2: \n{}'.format(righth))

    merge_inner = pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)
    print('Inner Merged DataFrame: \n{}'.format(merge_inner))

    merge_outer = pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True, how='outer')
    print('Outer Merged DataFrame: \n{}'.format(merge_outer))

    left2 = DataFrame([[1., 2.], [3., 4.], [5., 6.]],
                      index=['a', 'c', 'e'],
                      columns=['Ohio', 'Nevada'])
    right2 = DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
                       index=['b', 'c', 'd', 'e'],
                       columns=['Missouri', 'Alabama'])
    print('Data Frame 1: ', left2)
    print('Data Frame 2: ', right2)

    merge = pd.merge(left2, right2, how='outer', left_index=True, right_index=True)
    print('Merge: \n{}'.format(merge))
def station_summary(pair_counts: DataFrame or None = None) -> DataFrame:
    if pair_counts is None:
        pair_counts = station_pair_counts()

    info = station_info()
    lat_lng = info[['lat', 'lng']]

    def rename_lat_lng_columns(prefix):
        def do_rename(column):
            if column == 'lat' or column == 'lng':
                return "{}_{}".format(prefix, column)
            else:
                return column
        return do_rename

    with_min_st_coords = merge(pair_counts, lat_lng, how='left',
                               left_on='min_st', right_index=True)
    with_min_st_coords.dropna(inplace=True)

    # http://stackoverflow.com/questions/11346283/renaming-columns-in-pandas
    new_column_names = list(map(rename_lat_lng_columns("min"), with_min_st_coords.columns))
    with_min_st_coords.columns = new_column_names

    with_both_coords = merge(with_min_st_coords, lat_lng, how='left',
                             left_on='max_st', right_index=True)
    with_both_coords.dropna(inplace=True)

    new_column_names = list(map(rename_lat_lng_columns("max"), with_both_coords.columns))
    with_both_coords.columns = new_column_names

    return with_both_coords
def load_distractors_usage(length=None, by_attempt=True):
    cf = load_confusing_factor()

    def _apply(g):
        g['ratio'] = g['value'] / g['value'].sum()
        return g

    cf = cf.groupby(['experiment_setup_name', 'item']).apply(_apply).reset_index().sort_values(
        by=['experiment_setup_name', 'item', 'ratio'], ascending=False)
    cf['ratio_rank'] = cf.groupby([
        'experiment_setup_name', 'item'
    ]).cumcount()

    answers = load_non_reference_answers()
    answers['attempt'] = answers.groupby([
        'experiment_setup_name', 'user_id', 'context_name', 'term_type',
    ]).cumcount()
    if length is not None:
        answers = answers[answers['attempt'] < length]
    answers = pandas.merge(
        answers,
        load_options().rename(columns={'answer_id': 'id'}),
        on=['id', 'item_asked_id', 'experiment_setup_name', 'experiment_setup_id'],
        how='inner')[['item_asked_id', 'experiment_setup_name', 'attempt', 'item_option_id']]
    answers = pandas.merge(
        answers,
        cf[['experiment_setup_name', 'item', 'other', 'ratio_rank']].rename(
            columns={'item': 'item_asked_id', 'other': 'item_option_id', 'ratio_rank': 'confusing_rank'}),
        on=['experiment_setup_name', 'item_asked_id', 'item_option_id'],
        how='inner')

    def _apply(group):
        total = len(group)
        return group.groupby('confusing_rank').apply(lambda g: len(g) / total).reset_index().rename(columns={0: 'value'})

    groupby_add = ['attempt'] if by_attempt else []
    return answers.groupby(['experiment_setup_name'] + groupby_add).apply(_apply).reset_index()
# Ignore the totals row and keep only the state rows
pad = pad[1:36]

# Keep only the columns of interest
pad = pad[[pad.columns[0], '2015', '2018']]

# Rename the columns to the keys defined in the dataset dictionary
pad = pad.rename(columns={
    pad.columns[0]: 'EDO',
    '2015': 'DET.TOT.15',
    '2018': 'DET.TOT.18'
})

# Partial result of DS_HIPERTENSION
print('DS_HIPERTENSION', pad)

# ## Joining the previous datasets
todos = pd.merge(db, hp, on='EDO')
todos = pd.merge(todos, pad, on='EDO')

# Final result of this part
print('result of merges', todos)

# Remove whitespace in the EDO column
todos['EDO'] = todos['EDO'].str.strip()

# Make EDO the index
todos = todos.set_index('EDO')

# Add up the rows for Estado de México
todos.loc['México Oriente'] += todos.loc['México Poniente']

# Dropping the already-summed row
from collections import Counter

import numpy as np
import pandas as pd
from scipy.sparse.construct import hstack
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.metrics.ranking import roc_auc_score
from sklearn.preprocessing.data import OneHotEncoder

data = pd.read_csv('train_agg.csv')
data.apply(lambda x: x.replace('"', ''))
data.to_csv('data.csv', index=False)
del data

train_agg = pd.read_csv('data.csv', delimiter='\t')
train_flg = pd.read_csv('train_flg.csv', delimiter='\t')
train_agg = pd.merge(train_agg, train_flg, on='USRID')
train_log = pd.read_csv('train_log.csv', delimiter='\t')
test_agg = pd.read_csv('test_agg.csv', delimiter='\t')
test_log = pd.read_csv('test_log.csv', delimiter='\t')

len_train = len(train_agg)
merge_log = train_log.append(test_log)


def data_process(data):
    data.TCH_TYP.replace(2, 1, inplace=True)


data_process(merge_log)


def cut1(group):
    return group.split('-')[0]


def cut2(group):
    return group.split('-')[1]
WAVE_TOP = 8410  # Angstrom
WAVE_BOTTOM = 8790
WAVERANGE = np.linspace(WAVE_TOP, WAVE_BOTTOM, 1000)

star_data = pd.DataFrame(fits.getdata(DATA_DIR + 'data_stars.fits', 1))
# spectra = pd.read_csv(DATA_DIR + 'spectra_fits_raw.csv').iloc[:, :-1]
spectral_data_raw = pd.DataFrame(np.load('spectra_fits_raw.npy', allow_pickle=True),
                                 columns=['flux', 'flux_error', 'rave_obs_id'])

# Filter those rows that have more flux values than 1000 and separate
# for an easier handling
spectra = pd.concat([pd.DataFrame(spectral_data_raw['flux'].to_list()).iloc[:, 0:950],
                     spectral_data_raw['rave_obs_id']], axis=1).dropna()
spectra_error = pd.concat([pd.DataFrame(spectral_data_raw['flux_error'].to_list()).iloc[0:950],
                           spectral_data_raw['rave_obs_id']], axis=1).dropna()

spectra_all_data = pd.merge(spectra, star_data, on='rave_obs_id', how='inner')

analysis_line1 = pd.read_csv(DATA_DIR + 'analysis_results_line_1.csv')
analysis_line2 = pd.read_csv(DATA_DIR + 'analysis_results_line_2.csv')
analysis_line3 = pd.read_csv(DATA_DIR + 'analysis_results_line_3.csv')

# ______________________ Filters for data ______________________________________
common_ids = np.intersect1d(analysis_line3.rave_obs_id,
                            np.intersect1d(analysis_line1.rave_obs_id, analysis_line2.rave_obs_id))

# We keep the same stars for a fair comparison of the algorithm's performance
# Drop the first two columns as they are byproducts of the filtering and we don't need them
analysis_line1_cm = analysis_line1[analysis_line1['rave_obs_id'].isin(
    common_ids)].reset_index().drop(['index', 'Unnamed: 0'], axis=1)
analysis_line2_cm = analysis_line2[analysis_line2['rave_obs_id'].isin(
    common_ids)].reset_index().drop(['index', 'Unnamed: 0'], axis=1)
def make_figure(df, pa): df_ls = df.copy() durations = df_ls[pa["xvals"]] event_observed = df_ls[pa["yvals"]] km = KaplanMeierFitter() ## instantiate the class to create an object pl = None fig = plt.figure(frameon=False, figsize=(float(pa["fig_width"]), float(pa["fig_height"]))) ## Fit the data into the model if str(pa["groups_value"]) == "None": km.fit(durations, event_observed, label='Kaplan Meier Estimate') df_survival = km.survival_function_ df_conf = km.confidence_interval_ df_event = km.event_table df = pd.merge(df_survival, df_conf, how='left', left_index=True, right_index=True) df = pd.merge(df, df_event, how='left', left_index=True, right_index=True) df['time'] = df.index.tolist() df = df.reset_index(drop=True) df = df[[ "time", "at_risk", "removed", "observed", "censored", "entrance", "Kaplan Meier Estimate", "Kaplan Meier Estimate_lower_0.95", "Kaplan Meier Estimate_upper_0.95" ]] pa_ = {} for arg in [ "Conf_Interval", "show_censors", "ci_legend", "ci_force_lines", "left_axis", "right_axis", "upper_axis", "lower_axis", "tick_left_axis", "tick_right_axis", "tick_upper_axis", "tick_lower_axis" ]: if pa[arg] in ["off", ".off"]: pa_[arg] = False else: pa_[arg] = True if str(pa["markerc_write"]) != "": pa_["marker_fc"] = pa["markerc_write"] else: pa_["marker_fc"] = pa["markerc"] if str(pa["edgecolor_write"]) != "": pa_["marker_ec"] = pa["edgecolor_write"] else: pa_["marker_ec"] = pa["edgecolor"] if str(pa["grid_color_text"]) != "": pa_["grid_color_write"] = pa["grid_color_text"] else: pa_["grid_color_write"] = pa["grid_color_value"] pl=km.plot(show_censors=pa_["show_censors"], \ censor_styles={"marker":marker_dict[pa["censor_marker_value"]], "markersize":float(pa["censor_marker_size_val"]), "markeredgecolor":pa_["marker_ec"], "markerfacecolor":pa_["marker_fc"], "alpha":float(pa["marker_alpha"])}, \ ci_alpha=float(pa["ci_alpha"]), \ ci_force_lines=pa_["ci_force_lines"], \ ci_show=pa_["Conf_Interval"], \ ci_legend=pa_["ci_legend"], \ linestyle=pa["linestyle_value"], \ linewidth=float(pa["linewidth_write"]), \ color=pa["line_color_value"]) pl.spines['right'].set_visible(pa_["right_axis"]) pl.spines['top'].set_visible(pa_["upper_axis"]) pl.spines['left'].set_visible(pa_["left_axis"]) pl.spines['bottom'].set_visible(pa_["lower_axis"]) pl.spines['right'].set_linewidth(pa["axis_line_width"]) pl.spines['left'].set_linewidth(pa["axis_line_width"]) pl.spines['top'].set_linewidth(pa["axis_line_width"]) pl.spines['bottom'].set_linewidth(pa["axis_line_width"]) pl.tick_params(axis="both", direction=pa["ticks_direction_value"], length=float(pa["ticks_length"])) pl.tick_params(axis='x', which='both', bottom=pa_["tick_lower_axis"], top=pa_["tick_upper_axis"], labelbottom=pa_["lower_axis"], labelrotation=float(pa["xticks_rotation"]), labelsize=float(pa["xticks_fontsize"])) pl.tick_params(axis='y', which='both', left=pa_["tick_left_axis"], right=pa_["tick_right_axis"], labelleft=pa_["left_axis"], labelrotation=float(pa["yticks_rotation"]), labelsize=float(pa["yticks_fontsize"])) if str(pa["grid_value"]) != "None": pl.grid(True, which='both', axis=pa["grid_value"], color=pa_["grid_color_write"], linewidth=float(pa["grid_linewidth"])) if str(pa["x_lower_limit"]) != "" and str(pa["x_upper_limit"]) != "": pl.set_xlim(float(pa["x_lower_limit"]), float(pa["x_upper_limit"])) if str(pa["y_lower_limit"]) != "" and str(pa["y_upper_limit"]) != "": pl.set_ylim(float(pa["y_lower_limit"]), float(pa["y_upper_limit"])) pl.set_title(pa["title"], fontdict={'fontsize': float(pa['titles'])}) pl.set_xlabel(pa["xlabel"], 
fontdict={'fontsize': float(pa['xlabels'])}) pl.set_ylabel(pa["ylabel"], fontdict={'fontsize': float(pa['ylabels'])}) return df, pl elif str(pa["groups_value"]) != "None": df_long = pd.DataFrame( columns=['day', 'status', str(pa["groups_value"])]) for row in range(0, len(df_ls)): if int(df_ls.loc[row, pa["yvals"]]) >= 1: dead = int(df_ls.loc[row, pa["yvals"]]) #print(dead) for i in range(0, dead): #print(i) df_long = df_long.append( { 'day': int(df_ls.loc[row, pa["xvals"]]), 'status': 1, str(pa["groups_value"]): str(df_ls.loc[row, pa["groups_value"]]) }, ignore_index=True) i = i + 1 elif int(df_ls.loc[row, pa["censors_val"]]) >= 1: censored = int(df_ls.loc[row, pa["censors_val"]]) #print(censored) for c in range(0, censored): #print(c) df_long = df_long.append( { 'day': int(df_ls.loc[row, pa["xvals"]]), 'status': 0, str(pa["groups_value"]): str(df_ls.loc[row, pa["groups_value"]]) }, ignore_index=True) c = c + 1 df_dummy = pd.get_dummies(df_long, drop_first=True, columns=[pa["groups_value"]]) results = logrank_test(df_dummy.loc[df_dummy['status'] == 1, 'day'].tolist(), df_dummy.loc[df_dummy['status'] == 0, 'day'].tolist(), df_dummy.loc[df_dummy['status'] == 1, 'status'].tolist(), df_dummy.loc[df_dummy['status'] == 0, 'status'].tolist(), alpha=.99) cph = CoxPHFitter() cph.fit(df_dummy, duration_col='day', event_col='status') cph_coeff = cph.summary cph_coeff = cph_coeff.reset_index() df_info = {} df_info['model'] = 'lifelines.CoxPHFitter' df_info['duration col'] = cph.duration_col df_info['event col'] = cph.event_col df_info['baseline estimation'] = 'breslow' df_info['number of observations'] = cph._n_examples df_info['number of events observed'] = len( df_dummy.loc[df_dummy['status'] == 1, ]) df_info['partial log-likelihood'] = cph.log_likelihood_ df_info['Concordance'] = cph.concordance_index_ df_info['Partial AIC'] = cph.AIC_partial_ df_info['log-likelihood ratio test'] = cph.log_likelihood_ratio_test( ).test_statistic df_info[ 'P.value(log-likelihood ratio test)'] = cph.log_likelihood_ratio_test( ).p_value df_info['log rank test'] = results.test_statistic df_info['P.value(log rank test)'] = results.p_value cph_stats = pd.DataFrame(df_info.items()) cph_stats = cph_stats.rename(columns={0: 'Statistic', 1: 'Value'}) #cph_stats tmp = [] for cond in pa["list_of_groups"]: df_tmp = df_ls.loc[df_ls[pa["groups_value"]] == cond] km.fit(df_tmp[pa["xvals"]], df_tmp[pa["yvals"]], label=cond) df_survival = km.survival_function_ df_conf = km.confidence_interval_ df_event = km.event_table df = pd.merge(df_survival, df_conf, how='left', left_index=True, right_index=True) df = pd.merge(df, df_event, how='left', left_index=True, right_index=True) df['time'] = df.index.tolist() df = df.reset_index(drop=True) df = df.rename( columns={ "at_risk": cond + "_at_risk", "removed": cond + "_removed", "observed": cond + "_observed", "censored": cond + "_censored", "entrance": cond + "_entrance", cond: cond + "_KMestimate" }) df = df[[ "time", cond + "_at_risk", cond + "_removed", cond + "_observed", cond + "_censored", cond + "_entrance", cond + "_KMestimate", cond + "_lower_0.95", cond + "_upper_0.95" ]] tmp.append(df) df = reduce(lambda df1, df2: pd.merge(df1, df2, on='time'), tmp) PA_ = [g for g in pa["groups_settings"] if g["name"] == cond][0] if str(PA_["linecolor_write"]) != "": linecolor = PA_["linecolor_write"] else: linecolor = PA_["line_color_value"] if str(PA_["linestyle_write"]) != "": linestyle = PA_["linestyle_write"] else: linestyle = PA_["linestyle_value"] if str(PA_["markerc_write"]) != "": 
markerColor = PA_["markerc_write"] else: markerColor = PA_["markerc"] if str(PA_["edgecolor_write"]) != "": edgeColor = PA_["edgecolor_write"] else: edgeColor = PA_["edgecolor"] if PA_["show_censors"] in ["off", ".off"]: showCensors = False else: showCensors = True if PA_["Conf_Interval"] in ["off", ".off"]: ConfidenceInterval = False else: ConfidenceInterval = True if PA_["ci_legend"] in ["off", ".off"]: CI_legend = False else: CI_legend = True if PA_["ci_force_lines"] in ["off", ".off"]: CI_lines = False else: CI_lines = True linewidth = PA_["linewidth_write"] edgeLineWidth = PA_["edge_linewidth"] markerSize = PA_["censor_marker_size_val"] markerAlpha = PA_["marker_alpha"] CI_alpha = PA_["ci_alpha"] markerVal = PA_["censor_marker_value"] pa_ = {} for arg in [ "left_axis", "right_axis", "upper_axis", "lower_axis", "tick_left_axis", "tick_right_axis", "tick_upper_axis", "tick_lower_axis" ]: if pa[arg] in ["off", ".off"]: pa_[arg] = False else: pa_[arg] = True if str(pa["grid_color_text"]) != "": pa_["grid_color_write"] = pa["grid_color_text"] else: pa_["grid_color_write"] = pa["grid_color_value"] pl=km.plot(show_censors=showCensors, \ censor_styles={"marker":marker_dict[markerVal], "markersize":float(markerSize), "markeredgecolor":edgeColor, "markerfacecolor":markerColor, "alpha":float(markerAlpha), "mew":float(edgeLineWidth)}, \ ci_alpha=float(CI_alpha), \ ci_force_lines=CI_lines, \ ci_show=ConfidenceInterval, \ ci_legend=CI_legend, \ linestyle=linestyle, \ linewidth=float(linewidth), \ color=linecolor) pl.spines['right'].set_visible(pa_["right_axis"]) pl.spines['top'].set_visible(pa_["upper_axis"]) pl.spines['left'].set_visible(pa_["left_axis"]) pl.spines['bottom'].set_visible(pa_["lower_axis"]) pl.spines['right'].set_linewidth(pa["axis_line_width"]) pl.spines['left'].set_linewidth(pa["axis_line_width"]) pl.spines['top'].set_linewidth(pa["axis_line_width"]) pl.spines['bottom'].set_linewidth(pa["axis_line_width"]) pl.tick_params(axis="both", direction=pa["ticks_direction_value"], length=float(pa["ticks_length"])) pl.tick_params(axis='x', which='both', bottom=pa_["tick_lower_axis"], top=pa_["tick_upper_axis"], labelbottom=pa_["lower_axis"], labelrotation=float(pa["xticks_rotation"]), labelsize=float(pa["xticks_fontsize"])) pl.tick_params(axis='y', which='both', left=pa_["tick_left_axis"], right=pa_["tick_right_axis"], labelleft=pa_["left_axis"], labelrotation=float(pa["yticks_rotation"]), labelsize=float(pa["yticks_fontsize"])) if str(pa["grid_value"]) != "None": pl.grid(True, which='both', axis=pa["grid_value"], color=pa_["grid_color_write"], linewidth=float(pa["grid_linewidth"])) if str(pa["x_lower_limit"]) != "" and str( pa["x_upper_limit"]) != "": pl.set_xlim(float(pa["x_lower_limit"]), float(pa["x_upper_limit"])) if str(pa["y_lower_limit"]) != "" and str( pa["y_upper_limit"]) != "": pl.set_ylim(float(pa["y_lower_limit"]), float(pa["y_upper_limit"])) pl.set_title(pa["title"], fontdict={'fontsize': float(pa['titles'])}) pl.set_xlabel(pa["xlabel"], fontdict={'fontsize': float(pa['xlabels'])}) pl.set_ylabel(pa["ylabel"], fontdict={'fontsize': float(pa['ylabels'])}) return df, pl, cph_coeff, cph_stats
def raw2meta_extract(fn): """ Reasds raw2 files including GPS and enginerring information Parameters ---------- fn : string Path and filenmae of *.raw2 file Returns ------- data : pandas DataFrame CTD (Salinity, Temperature, Fluorescence, Pressure), Pitch and Roll, Compass information gps : pandas DataFrame GPS position information zoog : pandas DataFrame Zoocam grayscale values """ pgain = 0.04 poff = -10 tgain = 0.001 toff = -5 sgain = 0.001 soff = -1 delta_t = 8 #get file index print(time.ctime() + ": Processing "+fn) print(time.ctime() + ": Generating file index...") with open(fn) as f: list2 = [row.split()[0] for row in f] ########################################## #read files ########################################## f = open(fn) raw2 = f.readlines() f.close() print(time.ctime() + ": Loading CF_DIVE") ########################################## # CF_DIVE 0F ########################################## ''' This packet marks the present: Nsurf = Dive-Set Number Ncyc = Cycle Number Npro = the profile number uxti0 = the UNIX time that the Dive-Set uxti1 = The Unix time this specific cycle began For the 0901 code, the Dive-Set Number is only incremented after surface communications (GPS and SBD) are attempted (multiple cycles between surface communications will not increment the Dive-Set Number, but will increment the Cycle Number). This packet should be used to set Nsurf, Ncyc, Npro for all proceeding packets, until the next CF_DIVE packet is encountered. ''' cf_dive_idx = [i for i, j in enumerate(list2) if j == '0f'] cf_dive_raw = [raw2[i].split() for i in cf_dive_idx] cf_dive = pd.DataFrame(cf_dive_raw) cf_dive = cf_dive.iloc[:,1:] cf_dive.columns = ['Nsurf','Ncyc','Npro','uxti0','uxti1','Dow','Month', 'day','Time','Year'] cf_dive = cf_dive.astype(dtype = {'Nsurf':'int64','Ncyc':'int64', 'Npro':'int64','uxti0':'int64', 'uxti1':'int64'}) ########################################## # CF_PDAT 11 ########################################## print(time.ctime() + ": Loading CF_PDAT") edat_idx = [i for i, j in enumerate(list2) if j == '11'] edat_raw = [raw2[i].split() for i in edat_idx] edat = pd.DataFrame(edat_raw) edat = edat.iloc[:,1:9] edat.columns = ['Nsurf','Ncyc','Npro','time','pressure','temperature', 'salinity','fluorescence'] edat = edat.astype(dtype = {'Nsurf':'int64','Ncyc': 'int64','Npro': 'int64', 'time':'float','pressure':'float', 'temperature':'float','salinity':'float', 'fluorescence':'float'} ) edat['pressure']=edat['pressure'] * pgain + poff #pressure as a double; step 1 of conversion #still need to find pmin and do p=p-pmin to convert to dBar sal_cond = edat['salinity'] > 0 edat.loc[sal_cond, 'salinity'] = edat.loc[sal_cond,'salinity'] * sgain + soff sal_cond = edat['temperature'] > 0 edat.loc[sal_cond, 'temperature'] = edat.loc[sal_cond,'temperature'] * tgain + toff for var in ['salinity','temperature','fluorescence']: cond = edat[var] <= 0 edat.loc[cond, var] = float('nan') edat = pd.merge(edat,cf_dive) edat['Dive_start_time'] = pd.to_datetime(edat.uxti0, unit='s') edat['Dive_start_time'] = edat['Dive_start_time'].dt.tz_localize('UTC') #add time_of_measure edat['time_of_measure'] = edat['Dive_start_time'] + pd.to_timedelta(edat['time'].astype('str') + 'seconds') #edat.time_of_measure = edat.time_of_measure.dt.tz_localize('UTC') edat['time_of_measure_PDT'] = edat.time_of_measure - pd.to_timedelta(delta_t, unit='hours') #transform to local time as defined -8 hours not ST #correct pressure edat['pressure'] = edat.pressure - edat.pressure.min() #Correct pressure 
########################################## #CF_EDAT 21 ########################################## pr_idx = [i for i, j in enumerate(list2) if j == '21'] pr_raw = [raw2[i].split() for i in pr_idx] pr = pd.DataFrame(pr_raw) pr = pr.iloc[:,1:7] pr.columns = ['Nsurf','Ncyc','Npro','compass','pitch','roll'] pr = pr.astype(dtype = {'Nsurf':'int64','Ncyc': 'int64', 'Npro': 'int64','compass':'******', 'pitch':'float','roll':'float'}) pr.loc[:,['compass','pitch', 'roll']] /= 10 print(time.ctime() + "Loading CF_GPS1") ########################################## #CF_GPS1--start of dive-set 01 ########################################## gps1_idx = [i for i, j in enumerate(list2) if j == '01'] gps1_raw = [raw2[i].split() for i in gps1_idx] gps1 = pd.DataFrame(gps1_raw) gps1 = gps1.iloc[:,[1,3,4,5,6,13]] gps1.columns = ['Nsurf_start','year','yr_day_start','lat_start', 'lon_start', 'UTC_time_fix_start'] gps1 = gps1.astype(dtype = {'Nsurf_start':'int64', 'year':'int64', 'yr_day_start':'float','lat_start': 'float', 'lon_start': 'float'}) base_date = pd.to_datetime(gps1['year'].astype('str') + '/01/01 00:00:00') gps1['UTC_time_fix_start'] = base_date + pd.to_timedelta((gps1['yr_day_start']-1).astype('str') + ' days') print(time.ctime() + ": Loading CF_GPS2") ########################################## #CF_GPS2--end of dive-set 02 ########################################## gps2_idx = [i for i, j in enumerate(list2) if j == '02'] gps2_raw = [raw2[i].split() for i in gps2_idx] gps2 = pd.DataFrame(gps2_raw) gps2 = gps2.iloc[:,[1,3,4,5,6,13]] gps2.columns = ['Nsurf_end', 'year','yr_day_end', 'lat_end', 'lon_end','UTC_time_fix_end'] gps2 = gps2.astype(dtype = {'Nsurf_end':'int64', 'year':'int64', 'yr_day_end':'float','lat_end': 'float', 'lon_end': 'float'}) base_date = pd.to_datetime(gps2['year'].astype('str') + '/01/01 00:00:00') gps2['UTC_time_fix_end'] = base_date + pd.to_timedelta((gps2['yr_day_end']-1).astype('str') + ' days') print(time.ctime() + "Loading CF_ZOOG") ########################################## #CF_ZOOG this is the zooglider grayscale value ########################################## zoog_idx = [i for i, j in enumerate(list2) if j == 'b4'] zoog_raw = [raw2[i].split() for i in zoog_idx] zoog = pd.DataFrame(zoog_raw) #dt = pd.to_datetime(zoog.iloc[:,7] +' '+ zoog.iloc[:,8] + ' ' + zoog.iloc[:,9] + # ' ' + zoog.iloc[:,10] + ' ' + zoog.iloc[:,11]) zoog = zoog.iloc[:,[1,2,3,4,5,6]] #zoog['date'] = dt zoog.columns = ['zstart', 'zstop','n_img','n_err', 'avg','unix_tstamp'] zoog = zoog.astype(dtype = {'zstart':'int64', 'zstop':'int64','n_img':'int64','n_err': 'int64', 'avg': 'float', 'unix_tstamp':'float'}) zoog['UTC_time'] = pd.to_datetime(zoog.unix_tstamp[0], unit='s') zoog.UTC_time = zoog.UTC_time.dt.tz_localize('UTC') zoog['PDT_time'] = zoog.UTC_time - pd.to_timedelta(delta_t, unit='hours') ########################################## #Export ########################################## print(time.ctime() + ": Preparing data for export") ##GPS gps = pd.merge(gps1, gps2, left_on = 'Nsurf_start', right_on = 'Nsurf_end') gps = gps[['Nsurf_start','Nsurf_end', 'UTC_time_fix_start', 'UTC_time_fix_end','lon_start', 'lon_end', 'lat_start', 'lat_end']] ## Data data = pd.concat([edat, pr.iloc[:,3:]], sort=False,axis=1) #only keep important info data = data[['Nsurf','Ncyc','Npro', 'pitch', 'roll', 'compass','pressure','temperature','salinity','fluorescence','uxti0','Dive_start_time','time_of_measure','time_of_measure_PDT']] print(time.ctime() + ": Completed") return data, gps, zoog
def reconstruct(): """ run KFOLD method for regression """ #import packages import os import pandas as pd import statsmodels.api as sm from datetime import datetime from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler #defining directories dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged" dir_out = "/lustre/fs0/home/mtadesse/mlrReconstruction" surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef" #cd to the lagged predictors directory os.chdir(dir_in) x = 108 y = 109 #looping through for tg in range(x, y): os.chdir(dir_in) tg_name = os.listdir()[tg] print(tg, tg_name) #load predictor pred = pd.read_csv(tg_name) pred.drop('Unnamed: 0', axis=1, inplace=True) #add squared and cubed wind terms (as in WPI model) pickTerms = lambda x: x.startswith('wnd') wndTerms = pred.columns[list(map(pickTerms, pred.columns))] wnd_sqr = pred[wndTerms]**2 wnd_cbd = pred[wndTerms]**3 pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis=1) #standardize predictor data dat = pred.iloc[:, 1:] scaler = StandardScaler() print(scaler.fit(dat)) dat_standardized = pd.DataFrame(scaler.transform(dat), \ columns = dat.columns) pred_standardized = pd.concat([pred['date'], dat_standardized], axis=1) #load surge data os.chdir(surge_path) surge = pd.read_csv(tg_name) surge.drop('Unnamed: 0', axis=1, inplace=True) #remove duplicated surge rows surge.drop(surge[surge['ymd'].duplicated()].index, axis=0, inplace=True) surge.reset_index(inplace=True) surge.drop('index', axis=1, inplace=True) #adjust surge time format to match that of pred time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d')) surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])), columns=['date']) time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S')) surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]], axis=1) #merge predictors and surge to find common time frame pred_surge = pd.merge(pred_standardized, surge_new.iloc[:, :2], on='date', how='right') pred_surge.sort_values(by='date', inplace=True) #find rows that have nans and remove them row_nan = pred_surge[pred_surge.isna().any(axis=1)] pred_surge.drop(row_nan.index, axis=0, inplace=True) pred_surge.reset_index(inplace=True) pred_surge.drop('index', axis=1, inplace=True) #in case pred and surge don't overlap if pred_surge.shape[0] == 0: print('-' * 80) print('Predictors and Surge don' 't overlap') print('-' * 80) continue pred_surge['date'] = pd.DataFrame(list(map(time_stamp, \ pred_surge['date'])), \ columns = ['date']) #prepare data for training/testing X = pred_surge.iloc[:, 1:-1] y = pd.DataFrame(pred_surge['surge']) y = y.reset_index() y.drop(['index'], axis=1, inplace=True) #apply PCA pca = PCA(.95) pca.fit(X) X_pca = pca.transform(X) { # #apply 10 fold cross validation # kf = KFold(n_splits=10, random_state=29) # metric_corr = []; metric_rmse = []; #combo = pd.DataFrame(columns = ['pred', 'obs']) # for train_index, test_index in kf.split(X): # X_train, X_test = X_pca[train_index], X_pca[test_index] # y_train, y_test = y['surge'][train_index], y['surge'][test_index] # #train regression model # lm = LinearRegression() # lm.fit(X_train, y_train) # #predictions # predictions = lm.predict(X_test) # # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)), \ # # pd.DataFrame(np.array(y_test))], \ # # axis = 1) # # pred_obs.columns = ['pred', 'obs'] # # combo = pd.concat([combo, pred_obs], axis = 0) # #evaluation matrix - check p value # if stats.pearsonr(y_test, predictions)[1] >= 0.05: # print("insignificant correlation!") # 
continue # else: # #print(stats.pearsonr(y_test, predictions)) # metric_corr.append(stats.pearsonr(y_test, predictions)[0]) # #print(np.sqrt(metrics.mean_squared_error(y_test, predictions))) # metric_rmse.append(np.sqrt(metrics.mean_squared_error(y_test, predictions))) # # #number of years used to train/test model # num_years = np.ceil((pred_surge['date'][pred_surge.shape[0]-1] -\ # pred_surge['date'][0]).days/365) # longitude = surge['lon'][0] # latitude = surge['lat'][0] # num_pc = X_pca.shape[1] #number of principal components # corr = np.mean(metric_corr) # rmse = np.mean(metric_rmse) # print('num_year = ', num_years, ' num_pc = ', num_pc ,'avg_corr = ',\ # np.mean(metric_corr), ' - avg_rmse (m) = ', \ # np.mean(metric_rmse), '\n') } num_pc = X_pca.shape[1] #number of principal components longitude = surge['lon'][0] latitude = surge['lat'][0] #surge reconstruction pred_for_recon = pred[~pred.isna().any(axis=1)] pred_for_recon = pred_for_recon.reset_index().drop('index', axis=1) #standardize predictor data dat = pred_for_recon.iloc[:, 1:] scaler = StandardScaler() print(scaler.fit(dat)) dat_standardized = pd.DataFrame(scaler.transform(dat), \ columns = dat.columns) pred_standardized = pd.concat( [pred_for_recon['date'], dat_standardized], axis=1) X_recon = pred_standardized.iloc[:, 1:] #apply PCA pca = PCA(num_pc) #use the same number of PCs used for training pca.fit(X_recon) X_pca_recon = pca.transform(X_recon) #model preparation #first train model using observed surge and corresponding predictors X_pca = sm.add_constant(X_pca) est = sm.OLS(y['surge'], X_pca).fit() #predict with X_recon and get 95% prediction interval X_pca_recon = sm.add_constant(X_pca_recon) predictions = est.get_prediction(X_pca_recon).summary_frame(alpha=0.05) #drop confidence interval and mean_se columns predictions.drop(['mean_se', 'mean_ci_lower','mean_ci_upper'], \ axis = 1, inplace = True) #final dataframe final_dat = pd.concat([pred_standardized['date'], predictions], axis=1) final_dat['lon'] = longitude final_dat['lat'] = latitude final_dat.columns = ['date', 'surge_reconsturcted', 'pred_int_lower',\ 'pred_int_upper', 'lon', 'lat'] { # plot - optional # time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S')) # final_dat['date'] = pd.DataFrame(list(map(time_stamp, final_dat['date'])), columns = ['date']) # surge['date'] = pd.DataFrame(list(map(time_stamp, surge['date'])), columns = ['date']) # sns.set_context('notebook', font_scale = 2) # plt.figure() # plt.plot(final_dat['date'], final_dat['mean'], color = 'green') # plt.scatter(surge['date'], surge['surge'], color = 'blue') # prediction intervals # plt.plot(final_dat['date'], final_dat['obs_ci_lower'], color = 'red', linestyle = "--", lw = 0.8) # plt.plot(final_dat['date'], final_dat['obs_ci_upper'], color = 'red', linestyle = "--", lw = 0.8) # confidence intervals # plt.plot(final_dat['date'], final_dat['mean_ci_upper'], color = 'black', linestyle = "--", lw = 0.8) # plt.plot(final_dat['date'], final_dat['mean_ci_lower'], color = 'black', linestyle = "--", lw = 0.8) } #save df as cs - in case of interruption os.chdir(dir_out) final_dat.to_csv(tg_name)
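# A self-contained sketch (synthetic data, illustrative names only) of the
# statsmodels pattern used in reconstruct() above: fit OLS on observed surge,
# then call get_prediction(...).summary_frame() to obtain a 95% prediction interval.
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
X_obs = rng.normal(size=(100, 3))
y_obs = X_obs @ np.array([0.5, -0.2, 0.1]) + rng.normal(scale=0.1, size=100)
X_new = rng.normal(size=(10, 3))

est = sm.OLS(y_obs, sm.add_constant(X_obs)).fit()
pred = est.get_prediction(sm.add_constant(X_new)).summary_frame(alpha=0.05)
# 'mean' is the point prediction; 'obs_ci_lower'/'obs_ci_upper' bound the prediction interval.
print(pred[["mean", "obs_ci_lower", "obs_ci_upper"]].head())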
import pandas as pd
import os
import numpy as np

# Step 1: Getting held out samples' information.
scale_file = r'D:\WorkStation_2018\WorkStation_CNN_Schizo\Scale\10-24大表.xlsx'
included_subjects = r'D:\WorkStation_2018\WorkStation_dynamicFC_V1\Data\headmotion\included_subjects_from851database_ID.xlsx'
roi_signals_dir = r'D:\WorkStation_2018\WorkStation_dynamicFC_V3\Data\dfc_whole'
roi_all_signals_dir = r'D:\WorkStation_2018\WorkStation_dynamicFC_V1\Data\ROISignals_FumImgARWSFC_screened'

scale = pd.read_excel(scale_file)
included_subjects = pd.read_excel(included_subjects, header=None)

subjname = os.listdir(roi_signals_dir)
subjname = pd.Series(subjname)
subjname = subjname.str.findall(r'[1-9]\d*')
subjname = [int(sn[0]) for sn in subjname]  # np.int is deprecated; use the builtin int
subjname = pd.DataFrame(subjname)

subjname_all = os.listdir(roi_all_signals_dir)
subjname_all = pd.Series(subjname_all)
subjname_all = subjname_all.str.findall(r'[1-9]\d*')
subjname_all = [int(sn[0]) for sn in subjname_all]
subjname_all = pd.DataFrame(subjname_all)

exclueded_subj = pd.DataFrame((set(included_subjects[0]) - set(subjname[0])))
exclueded_subj = scale[scale['folder'].isin(exclueded_subj[0])]['folder']
describe = exclueded_subj.value_counts()
exclueded_subj_available = pd.merge(subjname_all, exclueded_subj,
                                    left_on=0, right_on='folder', how='inner')
exclueded_subj_available[0].to_csv(
    r'D:\WorkStation_2018\WorkStation_dynamicFC_V3\Data\ID_Scale_Headmotion\held_out_samples.txt',
    index=None, header=None)
def validate(): """ run KFOLD method for regression """ #defining directories dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged" dir_out = "/lustre/fs0/home/mtadesse/merraLRValidation" surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef" #cd to the lagged predictors directory os.chdir(dir_in) x = 601 y = 602 #empty dataframe for model validation df = pd.DataFrame(columns = ['tg', 'lon', 'lat', 'num_year', \ 'num_95pcs','corrn', 'rmse']) #looping through for tg in range(x, y): os.chdir(dir_in) tg_name = os.listdir()[tg] print(tg, tg_name) ########################################## #check if this tg is already taken care of ########################################## os.chdir(dir_out) if os.path.isfile(tg_name): return "file already analyzed!" os.chdir(dir_in) #load predictor pred = pd.read_csv(tg_name) pred.drop('Unnamed: 0', axis=1, inplace=True) #add squared and cubed wind terms (as in WPI model) pickTerms = lambda x: x.startswith('wnd') wndTerms = pred.columns[list(map(pickTerms, pred.columns))] wnd_sqr = pred[wndTerms]**2 wnd_cbd = pred[wndTerms]**3 pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis=1) #standardize predictor data dat = pred.iloc[:, 1:] scaler = StandardScaler() print(scaler.fit(dat)) dat_standardized = pd.DataFrame(scaler.transform(dat), \ columns = dat.columns) pred_standardized = pd.concat([pred['date'], dat_standardized], axis=1) #load surge data os.chdir(surge_path) surge = pd.read_csv(tg_name) surge.drop('Unnamed: 0', axis=1, inplace=True) #remove duplicated surge rows surge.drop(surge[surge['ymd'].duplicated()].index, axis=0, inplace=True) surge.reset_index(inplace=True) surge.drop('index', axis=1, inplace=True) #adjust surge time format to match that of pred time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d')) surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])), columns=['date']) time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S')) surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]], axis=1) #merge predictors and surge to find common time frame pred_surge = pd.merge(pred_standardized, surge_new.iloc[:, :2], on='date', how='right') pred_surge.sort_values(by='date', inplace=True) #find rows that have nans and remove them row_nan = pred_surge[pred_surge.isna().any(axis=1)] pred_surge.drop(row_nan.index, axis=0, inplace=True) pred_surge.reset_index(inplace=True) pred_surge.drop('index', axis=1, inplace=True) #in case pred and surge don't overlap if pred_surge.shape[0] == 0: print('-' * 80) print('Predictors and Surge don' 't overlap') print('-' * 80) continue pred_surge['date'] = pd.DataFrame(list(map(time_stamp, \ pred_surge['date'])), \ columns = ['date']) #prepare data for training/testing X = pred_surge.iloc[:, 1:-1] y = pd.DataFrame(pred_surge['surge']) y = y.reset_index() y.drop(['index'], axis=1, inplace=True) #apply PCA pca = PCA(.95) pca.fit(X) X_pca = pca.transform(X) #apply 10 fold cross validation kf = KFold(n_splits=10, random_state=29) metric_corr = [] metric_rmse = [] #combo = pd.DataFrame(columns = ['pred', 'obs']) for train_index, test_index in kf.split(X): X_train, X_test = X_pca[train_index], X_pca[test_index] y_train, y_test = y['surge'][train_index], y['surge'][test_index] #train regression model lm = LinearRegression() lm.fit(X_train, y_train) #predictions predictions = lm.predict(X_test) # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)), \ # pd.DataFrame(np.array(y_test))], \ # axis = 1) # pred_obs.columns = ['pred', 'obs'] # combo = pd.concat([combo, pred_obs], axis = 0) #evaluation 
matrix - check p value if stats.pearsonr(y_test, predictions)[1] >= 0.05: print("insignificant correlation!") continue else: print(stats.pearsonr(y_test, predictions)) metric_corr.append(stats.pearsonr(y_test, predictions)[0]) print(np.sqrt(metrics.mean_squared_error(y_test, predictions))) metric_rmse.append( np.sqrt(metrics.mean_squared_error(y_test, predictions))) #number of years used to train/test model num_years = (pred_surge['date'][pred_surge.shape[0]-1] -\ pred_surge['date'][0]).days/365 longitude = surge['lon'][0] latitude = surge['lat'][0] num_pc = X_pca.shape[1] #number of principal components corr = np.mean(metric_corr) rmse = np.mean(metric_rmse) print('num_year = ', num_years, ' num_pc = ', num_pc ,'avg_corr = ',np.mean(metric_corr), ' - avg_rmse (m) = ', \ np.mean(metric_rmse), '\n') #original size and pca size of matrix added new_df = pd.DataFrame( [tg_name, longitude, latitude, num_years, num_pc, corr, rmse]).T new_df.columns = ['tg', 'lon', 'lat', 'num_year', \ 'num_95pcs','corrn', 'rmse'] df = pd.concat([df, new_df], axis=0) #save df as cs - in case of interruption os.chdir(dir_out) df.to_csv(tg_name) #cd to dir_in os.chdir(dir_in)
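# A minimal, self-contained sketch of the validation idea above (10-fold CV,
# keeping a fold's scores only when the Pearson correlation is significant at
# p < 0.05), run on synthetic data rather than the tide-gauge files.
import numpy as np
from scipy import stats
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

rng = np.random.default_rng(1)
X = rng.normal(size=(200, 5))
y = 0.8 * X[:, 0] + rng.normal(scale=0.2, size=200)

metric_corr, metric_rmse = [], []
for train_idx, test_idx in KFold(n_splits=10).split(X):
    lm = LinearRegression().fit(X[train_idx], y[train_idx])
    pred = lm.predict(X[test_idx])
    r, p = stats.pearsonr(y[test_idx], pred)
    if p < 0.05:  # keep only significantly correlated folds, as in the loop above
        metric_corr.append(r)
        metric_rmse.append(np.sqrt(metrics.mean_squared_error(y[test_idx], pred)))
print("avg_corr =", np.mean(metric_corr), "avg_rmse =", np.mean(metric_rmse))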
# Change the release year to numeric. tmdb_movie_data['title_year'] = pd.to_numeric(tmdb_movie_data['title_year']) # --- # Dropping Columns. # --- # Drop columns that we don't care about. imdb_movie_data = imdb_movie_data.drop(columns=IMDB_DROPS) tmdb_movie_data = tmdb_movie_data.drop(columns=TMDB_DROPS) # --- # Merging the Datasets. # --- # Inner join. full_data = pd.merge(imdb_movie_data, tmdb_movie_data, how='inner', on=['movie_title', 'title_year']) # Drop duplicates. full_data = full_data.drop_duplicates() # Remove columns where country is not USA. full_data = full_data.loc[full_data['country'] == 'USA'] # Release year no longer needed. Was only needed for join. full_data = full_data.drop(columns=['title_year', 'country']) # -- # Normalize Names Thus Far. # -- full_data = full_data.rename( columns={ 'director_name': 'Director_Name', 'duration': 'Runtime',
def createBMIDataset(bmi_buckets=[0, 20, 30, 40, 50, 55, 60, 100]): patientList = [] fullFileList = [] inputFolder1 = '/home/santhosr/Documents/Birad/ProcessedData/FullRes' truthFile1 = '/home/santhosr/Documents/Birad/birad_targetFile.csv' inputFolder2 = '/home/santhosr/Documents/Birad/ProcessedData/PennExtra_3500/' truthFile2 = '/home/santhosr/Documents/Birad/RaceDL_ExtraCaucasian.csv' df1 = pd.read_csv('/home/santhosr/Documents/Birad/birad_targetFile.csv') df1.drop(['PresIntentType', 'DBT'], inplace=True, axis=1) df2 = pd.read_csv( '/home/santhosr/Documents/Birad/RaceDL_ExtraCaucasian.csv') df2.Medview_Race = 'White' ## Removing IDs from df2 which are already present in df1 idList = list(df1.DummyID.values) df2 = df2[~df2.DummyID.isin(idList)] truth = pd.concat([df1, df2], sort=True) ## Reading from set 1 for i in range(1, 5): folder = os.path.join(inputFolder1, str(i)) fileList = os.listdir(folder) fileList = [os.path.join('FullRes', str(i), x) for x in fileList] fullFileList = fullFileList + fileList # print(len(fileList)) patientList = patientList + [ int(x.split("/")[-1].split("_")[0]) for x in fileList ] patientList1 = patientList.copy() ## Reading from set 2 print(len(patientList)) fileList = os.listdir(inputFolder2) fileList = [os.path.join('PennExtra_3500', x) for x in fileList] d = pd.DataFrame(fileList) d[1] = d[0].apply(lambda x: int(x.split("/")[1].split("_")[0])) d = d[d[1].isin(df2.DummyID.values)] fileList = list(d[0].values) fullFileList += list(d[0].values) patientList += [int(x.split("/")[-1].split("_")[0]) for x in fileList] print(len(patientList)) patientList2 = patientList.copy() #Retaining only the patients with 4 views k = pd.Series(patientList).value_counts().reset_index() patientList = k[k[0] == 4]['index'].values print("total number of patients", len(patientList)) patientList = np.array(list(set(patientList))) df = pd.DataFrame({'DummyID': patientList}) df = pd.merge(df, truth, how='left') df1 = df1.copy() df = df.drop_duplicates(subset=['DummyID']) #Creates equal number of patients from White and AA groups white = df[df.Medview_Race == 'White'] AA = df[df.Medview_Race == 'African American'] outputDf = pd.DataFrame() for i in range(len(bmi_buckets) - 1): out = getBMIData(AA, white, bmi_buckets[i], bmi_buckets[i + 1]) outputDf = pd.concat([outputDf, out]) temp = pd.DataFrame(fullFileList) temp.columns = ['filename'] temp['DummyID'] = temp.filename.apply( lambda x: int(x.split("/")[-1].split("_")[0])) trainTemp = temp[temp.DummyID.isin( outputDf[outputDf.train == False].DummyID.values)] validTemp = temp[temp.DummyID.isin( outputDf[outputDf.train == True].DummyID.values)] trainTemp['train'] = False validTemp['train'] = True df = pd.concat([trainTemp, validTemp], sort=True) #Shuffling data index = list(range(len(df))) np.random.shuffle(index) df = df.iloc[index] return df
def build_agg_data(df, x, y, inputs, agg, z=None, group_col=None, animate_by=None): """ Builds aggregated data when an aggregation (sum, mean, max, min...) is selected from the front-end. :param df: dataframe that contains data for chart :type df: :class:`pandas:pandas.DataFrame` :param x: column to use for the X-Axis :type x: str :param y: columns to use for the Y-Axes :type y: list of str :param inputs: additional chart configurations (chart_type, group, rolling_win, rolling_comp...) :type inputs: dict :param agg: points to a specific function that can be applied to :func: pandas.core.groupby.DataFrameGroupBy. Possible values are: count, first, last mean, median, min, max, std, var, mad, prod, sum :type agg: str :param z: column to use for the Z-Axis :type z: str, optional :return: dataframe of aggregated data :rtype: :class:`pandas:pandas.DataFrame` """ if agg == "raw": return df, [] z_exists = len(make_list(z)) if agg == "corr": if not z_exists: raise NotImplementedError( "Correlation aggregation is only available for 3-dimensional charts!" ) if agg == "rolling": if z_exists: raise NotImplementedError( "Rolling computations have not been implemented for 3-dimensional charts!" ) window, comp = map(inputs.get, ["rolling_win", "rolling_comp"]) agg_df = df.set_index(x).rolling(window=window) agg_df = pd.DataFrame({c: getattr(agg_df[c], comp)() for c in y}) agg_df = agg_df.reset_index() code = [ "chart_data = chart_data.set_index('{x}').rolling(window={window})" .format(x=x, window=window), "chart_data = pd.DataFrame({'" + ", ".join([ "'{c}': chart_data['{c}'].{comp}()".format(c=c, comp=comp) for c in y ]) + "})", "chart_data = chart_data.reset_index()", ] return agg_df, code idx_cols = make_list(animate_by) + make_list(group_col) + [x] agg_cols = make_list(y) if z_exists: idx_cols += make_list(y) agg_cols = make_list(z) if agg == "drop_duplicates": groups = [df[idx_cols + [col]].drop_duplicates() for col in agg_cols] if len(groups) == 1: groups = groups[0] code = "chart_data = chart_data[['{}']].drop_duplicates()".format( "','".join(idx_cols + agg_cols)) else: groups = pd.merge(*groups, on=idx_cols, how="outer") code = ( "idx_cols = ['{}']\n" "agg_cols = ['{}']\n" "chart_data = pd.merge(\n" "\t*[chart_data[idx_cols + [col]].drop_duplicates() for col in agg_cols],\n" "\ton=idx_cols,\n" "\thow='outer'\n" ")").format("','".join(idx_cols), "','".join(agg_cols)) else: groups = df.groupby(idx_cols) if agg in ["pctsum", "pctct"]: func = "sum" if agg == "pctsum" else "size" subidx_cols = [ c for c in idx_cols if c not in make_list(group_col) ] groups = getattr(groups[agg_cols], func)() groups = groups / getattr(df.groupby(subidx_cols)[agg_cols], func)() * 100 if len(agg_cols) > 1: groups.columns = agg_cols elif len(agg_cols) == 1: groups.name = agg_cols[0] code = ( "chart_data = chart_data.groupby(['{cols}'])[['{agg_cols}']].{agg}()\n" "chart_data = chart_data / chart_data.groupby(['{subidx_cols}']).{agg}()\n" "chart_data = chart_data.reset_index()") code = code.format( cols="', '".join(idx_cols), subidx_cols="', '".join(subidx_cols), agg_cols="', '".join(make_list(agg_cols)), agg=func, ) code = [code] else: groups = getattr(groups[agg_cols], agg)() code = [ "chart_data = chart_data.groupby(['{cols}'])[['{agg_cols}']].{agg}().reset_index()" .format(cols="', '".join(idx_cols), agg_cols="', '".join(agg_cols), agg=agg) ] if animate_by is not None: full_idx = pd.MultiIndex.from_product( [df[c].unique() for c in idx_cols], names=idx_cols) groups = groups.reindex(full_idx).fillna(0) code += [ "idx_cols 
= ['{cols}']".format(cols="', '".join(idx_cols)), "full_idx = pd.MultiIndex.from_product([df[c].unique() for c in idx_cols], names=idx_cols)" "chart_data = chart_data.reindex(full_idx).fillna(0)", ] return groups.reset_index(), code
import pandas as pd import numpy as np import random from copy import deepcopy ml1m_dir = 'ml-1m/ratings.dat' ml1m_rating = pd.read_csv(ml1m_dir, sep='::', header=None, names=['uid', 'mid', 'rating', 'timestamp'], engine='python') user_id = ml1m_rating[['uid']].drop_duplicates().reindex() user_id['userId'] = np.arange(len(user_id)) item_id = ml1m_rating[['mid']].drop_duplicates() item_id['itemId'] = np.arange(len(item_id)) ml1m_rating = pd.merge(ml1m_rating, user_id, on=['uid'], how='left') ml1m_rating = pd.merge(ml1m_rating, item_id, on=['mid'], how='left') ml1m_rating = ml1m_rating[['userId', 'itemId', 'rating', 'timestamp']] # convert to binary data ratings = deepcopy(ml1m_rating) ratings['rating'][ratings['rating'] > 0] = 1.0
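# Note: the chained assignment above (ratings['rating'][...] = 1.0) can trigger
# SettingWithCopyWarning and may silently stop working under copy-on-write pandas.
# An equivalent, safer form of the same binarization uses .loc:
ratings.loc[ratings['rating'] > 0, 'rating'] = 1.0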
# Connect to the Google Maps API
gmaps_key = googlemaps.Client(key='AIzaSyAdZKjevohQs7fHJn3NpZJ70DDtcAsj4rI')

lat = []
lng = []
for i in direcciones:
    try:
        geocode_result = gmaps_key.geocode(i)
        lat_ = geocode_result[0]['geometry']['location']['lat']
        lng_ = geocode_result[0]['geometry']['location']['lng']
        lat.append(lat_)
        lng.append(lng_)
    except IndexError:
        lat.append(np.nan)
        lng.append(np.nan)

df_coordenadas = pd.DataFrame({'latitud': lat, 'longitud': lng})
df['i1'] = df.index
df_coordenadas['i1'] = df_coordenadas.index
join = pd.merge(df, df_coordenadas, how='right', on='i1')
del join['i1']
join.to_csv(
    '/Users/rogeliomj/Documents/python/tesis/plataforma_cdmx/escuelas_danadas_geocoded.csv',
    index=False)
""" comp_info = pandas.read_excel(compound_info_file, index_col=0) for row in comp_info.iterrows(): compound_table[int(row[0])] = conversions.pubchem(str(row[1]['Drug Name'])) # compound_table[int(row[0])] = pubchem_table[str(row[1]['Drug Name'])] e = Emiter("gdsc.scan") cl_info = pandas.read_excel(conv_file, index_col=0) sample_table = {} for row in cl_info.iterrows(): sample_table[row[0]] = (str(row[1]['CCLE name']), 'ccle') # "ccle:%s" % (row[1]['CCLE name']) cl_info = pandas.read_excel(cell_info_file, index_col=1) for row in cl_info.iterrows(): if row[0] not in sample_table: sample_table[row[0]] = (str(row[1]['Sample Name']), 'gdsc') # "gdsc:%s" % (row[1]['Sample Name']) gdsc_cell_info(row[1], e.emit) raw = pandas.read_excel(raw_file) fitted = pandas.read_excel(fitted_file) merge = pandas.merge(raw, fitted, on=["COSMIC_ID", "DRUG_ID"]) for r in merge.iterrows(): cosmic_id = int(r[1]["COSMIC_ID"]) if cosmic_id in cl_info.index: gdsc_ic50_row( r[1], compound_table, sample_table, e.emit )
supply = pd.read_excel(r'E:\PD Wk 35 Input.xlsx', 'PD Wk 34 Output') supply['Supply Id'] = supply.index + 1 supply['Running Supply To'] = supply.sort_values('Date').groupby( ['Product', 'Scent'])['Quantity'].cumsum() supply[ 'Running Supply From'] = supply['Running Supply To'] - supply['Quantity'] demand = pd.read_excel(r'E:\PD Wk 35 Input.xlsx', 'Store Orders') demand['Demand Id'] = demand.index + 1 demand['Running Demand To'] = demand.sort_values('Date Required').groupby( ['Product', 'Scent'])['Quantity Requested'].cumsum() demand['Running Demand From'] = demand['Running Demand To'] - demand[ 'Quantity Requested'] allocate = pd.merge(supply, demand, on=['Product', 'Scent'], how='inner') allocate = allocate.query( '(`Running Demand To`>=`Running Supply From` and `Running Demand To`<=`Running Supply To`) \ or (`Running Supply To`>=`Running Demand From` and `Running Supply To`<=`Running Demand To`)' ) allocate['Allocated Quantity'] = allocate.apply( lambda x: min(x['Running Supply To'], x['Running Demand To']) - max( x['Running Supply From'], x['Running Demand From']), axis=1) surplus = supply[~supply['Supply Id'].isin(allocate['Supply Id'])] surplus = surplus.groupby(['Supplier', 'Product', 'Scent'], as_index=False).agg({'Quantity': 'sum'}) fulfill = allocate.groupby([ 'Store', 'Product', 'Scent', 'Supplier', 'Quantity Requested',
mycol = mydb['EKKO'] mycol1 = mydb['EKET'] cursor = mycol.find() df = pd.DataFrame(list(cursor)) df.columns = df.columns.str.replace(' ', '') df = df[['Purch-Doc-', 'Createdon', 'Vendor']] print(df.head()) print('_______________') cursor = mycol1.find() df1 = pd.DataFrame(list(cursor)) df1.columns = df1.columns.str.replace(' ', '') #d1f = df1[['Purch-Doc-','Del-Date','Sched-Qty']] df1 = df1[['Purch-Doc-', 'Del-Date', 'Sched-Qty', 'Delivered']] data = pd.merge(df1, df, on='Purch-Doc-', how='left') #data.to_csv('EKET-EKKO.csv') print(data.columns) data.columns = data.columns.str.replace(' ', '') data['Sched-Qty'] = data['Sched-Qty'].astype(str) data['Delivered'] = data['Delivered'].astype(str) data['Sched-Qty'] = data['Sched-Qty'].str.replace(',', '') data['Delivered'] = data['Delivered'].str.replace(',', '') ''' data['Delivered'] = data['Delivered'].str.replace('.00','') print(data['Delivered']) data['Sched-Qty'] = data['Sched-Qty'].str.replace('.00','') ''' data['Delivered'] = np.where(data['Delivered'].str[:-3] == '.00', data['Delivered'].str[-3:], data['Delivered']) data['Sched-Qty'] = np.where(data['Sched-Qty'].str[:-3] == '.00',
def call_KM(genre1, genre2, genre3): movies = pd.read_csv('mysite/movies.csv') ratings = pd.read_csv('mysite/ratings.csv') # genre1='Adventure' # genre2='Sci-Fi' # genre3='Action' my_clusters = 0 helper.set_Variables(genre1, genre2, genre3) genre_ratings = helper.get_genre_ratings(ratings, movies, [genre1, genre2], [Dict[genre1], Dict[genre2]]) biased_dataset = helper.bias_genre_rating_dataset(genre_ratings, 3.2, 2.5) print("Number of records: ", len(biased_dataset)) biased_dataset.head() helper.draw_scatterplot(biased_dataset[Dict[genre2]], Dict[genre2], biased_dataset[Dict[genre1]], Dict[genre1], 'mysite/static/mysite/Normal.png') # plt.savefig('mysite/static/mysite/Normal.png') # # plt.close('mysite/static/mysite/Normal.png') X = biased_dataset[[Dict[genre2], Dict[genre1]]].values # TODO: Create an instance of KMeans to find two clusters kmeans_1 = KMeans(n_clusters=2, random_state=0) predictions = kmeans_1.fit_predict(X) helper.draw_clusters(biased_dataset, predictions, 'mysite/static/mysite/TwoCluster.png') # plt.savefig('mysite/static/mysite/TwoCluster.png') # plt.close('TwoCluster.png') # TODO: Create an instance of KMeans to find three clusters kmeans_2 = KMeans(n_clusters=3, random_state=1) predictions_2 = kmeans_2.fit_predict(X) helper.draw_clusters(biased_dataset, predictions_2, 'mysite/static/mysite/ThreeCluster.png') # plt.savefig('mysite/static/mysite/ThreeCluster.png') # plt.close('ThreeCluster.png') # TODO: Create an instance of KMeans to find four clusters kmeans_3 = KMeans(n_clusters=4, random_state=3) predictions_3 = kmeans_3.fit_predict(X) helper.draw_clusters(biased_dataset, predictions_3, 'mysite/static/mysite/FourCluster.png') # plt.savefig('mysite/static/mysite/FourCluster.png') # plt.close('FourCluster.png') possible_k_values = range(2, len(X) + 1, 5) errors_per_k = [helper.clustering_errors(k, X) for k in possible_k_values] list(zip(possible_k_values, errors_per_k)) fig, ax = plt.subplots(figsize=(16, 6)) ax.set_xlabel('K - number of clusters') ax.set_ylabel('Silhouette Score (higher is better)') ax.plot(possible_k_values, errors_per_k) fig.savefig('mysite/static/mysite/score.png') plt.close(fig) # Ticks and grid xticks = np.arange(min(possible_k_values), max(possible_k_values) + 1, 5.0) ax.set_xticks(xticks, minor=False) ax.set_xticks(xticks, minor=True) ax.xaxis.grid(True, which='both') yticks = np.arange(round(min(errors_per_k), 2), max(errors_per_k), .05) ax.set_yticks(yticks, minor=False) ax.set_yticks(yticks, minor=True) ax.yaxis.grid(True, which='both') # TODO: Create an instance of KMeans to find seven clusters kmeans_4 = KMeans(n_clusters=7, random_state=6) predictions_4 = kmeans_4.fit_predict(X) helper.draw_clusters(biased_dataset, predictions_4, 'mysite/static/mysite/BestCluster.png', cmap='Accent') # plt.savefig('mysite/static/mysite/BestCluster.png') # plt.close('BestCluster.png') biased_dataset_3_genres = helper.get_genre_ratings( ratings, movies, [genre1, genre2, genre3], [Dict[genre1], Dict[genre2], Dict[genre3]]) biased_dataset_3_genres = helper.bias_genre_rating_dataset( biased_dataset_3_genres, 3.2, 2.5).dropna() print("Number of records: ", len(biased_dataset_3_genres)) X_with_action = biased_dataset_3_genres[[ Dict[genre2], Dict[genre1], Dict[genre3] ]].values # TODO: Create an instance of KMeans to find seven clusters kmeans_5 = KMeans(n_clusters=7) predictions_5 = kmeans_5.fit_predict(X_with_action) helper.draw_clusters_3d(biased_dataset_3_genres, predictions_5, 'mysite/static/mysite/3DCluster.png') # 
plt.savefig('mysite/static/mysite/3DCluster.png') # plt.close('3DCluster.png') #Merge the two tables then pivot so we have Users X Movies dataframe ratings_title = pd.merge(ratings, movies[['movieId', 'title']], on='movieId') user_movie_ratings = pd.pivot_table(ratings_title, index='userId', columns='title', values='rating') user_movie_ratings.iloc[:6, :10] n_movies = 30 n_users = 18 most_rated_movies_users_selection = helper.sort_by_rating_density( user_movie_ratings, n_movies, n_users) most_rated_movies_users_selection.head() helper.draw_movies_heatmap(most_rated_movies_users_selection, 'mysite/static/mysite/HeatMap.png')
def calc_stat_info(self): ''' 通过设置的参数信息,计算统计周期内的统计信息 :return: ''' gold_future = "C:\\quanttime\\data\\gold\\sh_future\\gold.csv" silver_future = "C:\\quanttime\\data\\gold\\sh_future\\silver.csv" stander_dtype = { 'open': float, "close": float, "high": float, "low": float, "volume": float, "money": float } gold_future_data = pd.read_csv(gold_future, parse_dates=["date"], index_col=["date"], dtype=stander_dtype) gold_future_data = gold_future_data[~gold_future_data.reset_index(). duplicated().values] silver_future_data = pd.read_csv(silver_future, parse_dates=["date"], index_col=["date"], dtype=stander_dtype) silver_future_data = silver_future_data[ ~silver_future_data.reset_index().duplicated().values] future_data = pd.merge(gold_future_data, silver_future_data, left_index=True, right_index=True, suffixes=('_gold', '_silver')) future_data["compare"] = future_data["close_gold"] / future_data[ "close_silver"] * 1000 # 去重 future_data = future_data.dropna() future_data_trade_date = future_data.index today = datetime.today().date() self.ui.lineEdit_14.setText(today.strftime("%Y-%m-%d")) self.ui.lineEdit_13.setText( future_data_trade_date[-1].strftime("%Y-%m-%d")) columns_name = [ "count", "mean", "std", "min", "25%", "50%", "75%", "max" ] # df_empty = pd.DataFrame(columns=columns_name) self.back_day_stat = self.ui.lineEdit_12.text() # 设置当前日期往前推几天的统计信息 self.long_buy_value = self.ui.lineEdit_8.text( ) # 做多金银比,统计买入线,如0.10即10%分位线 self.long_sell_value = self.ui.lineEdit_9.text( ) # 做多金银比,统计卖出线,如0.15即15%分位线 self.short_buy_value = self.ui.lineEdit_10.text( ) # 做空金银比,统计的买入线,如0.85即85%分位线 self.short_sell_value = self.ui.lineEdit_11.text() # 做空金银比,统计的卖出线 df_stat_20 = future_data.iloc[ -int(self.back_day_stat):] #如future_data.iloc[-20:] #print(df_stat_20) df_stat = df_stat_20.loc[:, ["compare"]].describe() #print(df_stat) long_buyValue = round(float(self.long_buy_value), 2) long_sellValue = round(float(self.long_sell_value), 2) short_sellValue = round(float(self.short_buy_value), 2) short_buyValue = round(float(self.short_sell_value), 2) # print(self.long_buy_value) # long_buyValue = 0.05 # long_sellValue = 0.10 # short_sellValue = 0.85 # short_buyValue = 0.90 v_5 = df_stat_20.quantile(long_buyValue).compare # 5%分位 v_10 = df_stat_20.quantile(long_sellValue).compare # 10%分位 v_90 = df_stat_20.quantile(short_sellValue).compare # 90%分位 v_95 = df_stat_20.quantile(short_buyValue).compare # 95%分位 value = round(v_5, 2) newItem = QtWidgets.QTableWidgetItem(str(value)) self.ui.tableWidget_2.setItem(0, 0, newItem) value = round(v_10, 2) newItem = QtWidgets.QTableWidgetItem(str(value)) self.ui.tableWidget_2.setItem(1, 0, newItem) value = round(v_90, 2) newItem = QtWidgets.QTableWidgetItem(str(value)) self.ui.tableWidget_2.setItem(2, 0, newItem) value = round(v_95, 2) newItem = QtWidgets.QTableWidgetItem(str(value)) self.ui.tableWidget_2.setItem(3, 0, newItem) value = round(df_stat.loc["max", ["compare"]].compare, 2) newItem = QtWidgets.QTableWidgetItem(str(value)) self.ui.tableWidget_2.setItem(4, 0, newItem) value = round(df_stat.loc["min", ["compare"]].compare, 2) newItem = QtWidgets.QTableWidgetItem(str(value)) self.ui.tableWidget_2.setItem(5, 0, newItem) value = round(df_stat.loc["mean", ["compare"]].compare, 2) newItem = QtWidgets.QTableWidgetItem(str(value)) self.ui.tableWidget_2.setItem(6, 0, newItem) value = round(df_stat.loc["25%", ["compare"]].compare, 2) newItem = QtWidgets.QTableWidgetItem(str(value)) self.ui.tableWidget_2.setItem(7, 0, newItem) value = round(df_stat.loc["50%", ["compare"]].compare, 2) 
newItem = QtWidgets.QTableWidgetItem(str(value)) self.ui.tableWidget_2.setItem(8, 0, newItem) value = round(df_stat.loc["75%", ["compare"]].compare, 2) newItem = QtWidgets.QTableWidgetItem(str(value)) self.ui.tableWidget_2.setItem(9, 0, newItem) value = round(df_stat.loc["std", ["compare"]].compare, 2) newItem = QtWidgets.QTableWidgetItem(str(value)) self.ui.tableWidget_2.setItem(10, 0, newItem)
crsp = crsp.sort_values(by=['permno', 'date']) # change variable format to int crsp['permno'] = crsp['permno'].astype(int) # Line up date to be end of month crsp['date'] = pd.to_datetime(crsp['date']) # find the closest trading day to the end of the month crsp['monthend'] = crsp['date'] + MonthEnd(0) crsp['date_diff'] = crsp['monthend'] - crsp['date'] date_temp = crsp.groupby(['permno', 'monthend'])['date_diff'].min() date_temp = pd.DataFrame(date_temp) # convert Series to DataFrame date_temp.reset_index(inplace=True) date_temp.rename(columns={'date_diff': 'min_diff'}, inplace=True) crsp = pd.merge(crsp, date_temp, how='left', on=['permno', 'monthend']) crsp['sig'] = np.where(crsp['date_diff'] == crsp['min_diff'], 1, np.nan) # label every date of month end crsp['month_count'] = crsp[crsp['sig'] == 1].groupby(['permno']).cumcount() # label numbers of months for a firm month_num = crsp[crsp['sig'] == 1].groupby(['permno'])['month_count'].tail(1) month_num = month_num.astype(int) month_num = month_num.reset_index(drop=True) # mark the number of each month to each day of this month crsp['month_count'] = crsp.groupby(['permno'])['month_count'].fillna(method='bfill') # crate a firm list df_firm = crsp.drop_duplicates(['permno'])
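# A small, self-contained sketch of the month-end trick used above: adding
# MonthEnd(0) rolls a date forward to the last day of its own month and leaves
# dates already at month end unchanged.
import pandas as pd
from pandas.tseries.offsets import MonthEnd

dates = pd.to_datetime(["2020-01-15", "2020-01-31", "2020-02-28"])
print(dates + MonthEnd(0))  # 2020-01-31, 2020-01-31, 2020-02-29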
color = sns.color_palette("hls", 8) sns.set(style="darkgrid") plt.figure(figsize=(15, 5)) sns.countplot(x=matrix['shop_id'], data=matrix, palette=color) """Aggregate Sale""" train_data['revenue'] = train_data['item_price'] * train_data['item_cnt_day'] train_data.head() group_data = train_data.groupby(by=['date_block_num', 'shop_id', 'item_id']).agg({'item_cnt_day': 'sum'}) group_data.columns = ['item_cnt_month'] group_data.reset_index(inplace = True) group_data.head() matrix = pd.merge(matrix, group_data, on=cols, how='left') matrix.head() matrix['item_cnt_month'] = (matrix['item_cnt_month'].fillna(0).clip(0, 20).astype(np.float16)) matrix.head() matrix.shape test_data.head() test_data['date_block_num'] = 34 test_data['date_block_num'] = test_data['date_block_num'].astype(np.int8) test_data['shop_id'] = test_data['shop_id'].astype(np.int8) test_data['item_id'] = test_data['item_id'].astype(np.int16) matrix = pd.concat([matrix, test_data], ignore_index=True, sort=False, keys=cols)
# @Function: merge joins
# @Time: 2020/6/2 3:55 PM
# @Author: Flank
import numpy as np
import pandas as pd

# Database-style table joins: left, right, inner, outer (full) joins
left = pd.DataFrame({
    'key': ['k0', 'k1', 'k2', 'k3'],
    'A': ['a0', 'a1', 'a2', 'a3'],
    'B': ['b0', 'b1', 'b2', 'b3']
})
right = pd.DataFrame({
    'key': ['k0', 'k1', 'k2', 'k3'],
    'C': ['c0', 'c1', 'c2', 'c3'],
    'D': ['d0', 'd1', 'd2', 'd3']
})
print(left)
print(right)
res = pd.merge(left, right, on='key')  # 'on' specifies which column to join on
print(res)
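# A follow-up sketch on the same frames: with keys that do not fully match,
# the 'how' argument decides which rows survive the join.
right2 = pd.DataFrame({
    'key': ['k0', 'k1', 'k4'],
    'C': ['c0', 'c1', 'c4']
})
print(pd.merge(left, right2, on='key', how='inner'))  # keys present in both: k0, k1
print(pd.merge(left, right2, on='key', how='left'))   # all keys from left; missing C becomes NaN
print(pd.merge(left, right2, on='key', how='outer'))  # union of keys from both frames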
def weather_phenomena_france(): Nice = pd.read_csv('weather_nice.csv', sep =';', parse_dates = [0], usecols = [1,7]) Mars = pd.read_csv('weather_mars.csv', sep = ';', parse_dates = [0], usecols = [1,7]) Paris = pd.read_csv('weather_paris.csv', sep = ';', parse_dates = [0], usecols = [1,7]) Lille = pd.read_csv('weather_lille.csv', sep = ';', parse_dates = [0], usecols = [1,7]) Toulouse = pd.read_csv('weather_toulouse.csv', sep = ';', parse_dates = [0], usecols = [1,7]) Lyon = pd.read_csv('weather_lyon.csv', sep = ';', parse_dates = [0], usecols = [1,7]) Nice['Date'] = Nice['Date'].dt.strftime('%m%d') Nice = Nice.groupby('Date').mean() Nice['Température'] = Nice['Température'] - 273.15 Mars['Date'] = Mars['Date'].dt.strftime('%m%d') Mars = Mars.groupby('Date').mean() Mars['Température'] = Mars['Température'] - 273.15 Paris['Date'] = Paris['Date'].dt.strftime('%m%d') Paris = Paris.groupby('Date').mean() Paris['Température'] = Paris['Température'] - 273.15 Lille['Date'] = Lille['Date'].dt.strftime('%m%d') Lille = Lille.groupby('Date').mean() Lille['Température'] = Lille['Température'] - 273.15 Toulouse['Date'] = Toulouse['Date'].dt.strftime('%m%d') Toulouse = Toulouse.groupby('Date').mean() Toulouse['Température'] = Toulouse['Température'] - 273.15 Lyon['Date'] = Lyon['Date'].dt.strftime('%m%d') Lyon = Lyon.groupby('Date').mean() Lyon['Température'] = Lyon['Température'] - 273.15 temp = pd.merge(Nice, Mars, how='inner', left_index = True, right_index = True) temp = pd.merge(temp, Paris, how = 'inner', left_index = True, right_index = True) temp = temp.rename(columns ={'Température_x':'Nice', 'Température_y':'Marseille', 'Température' : 'Paris'}) temp = pd.merge(temp, Lille, how = 'inner', left_index = True, right_index = True) temp = temp.rename(columns ={'Température' : 'Lille'}) temp = pd.merge(temp, Toulouse, how = 'inner', left_index = True, right_index = True) temp = temp.rename(columns ={'Température' : 'Toulouse'}) temp = pd.merge(temp, Lyon, how = 'inner', left_index = True, right_index = True) temp = temp.rename(columns ={'Température' : 'Lyon'}) temp = temp[temp.index != '0229'] # independency test between nice and marseille Nice_Marseille = ttest_ind(temp['Nice'],temp['Marseille']) # mean for france and south of france temp['France'] = (temp['Paris'] + temp['Lille'] + temp['Lyon'] + temp['Toulouse'])/4 temp['South East France'] = (temp['Marseille']+temp['Nice'])/2 # independency test between South east of france and rest of France Southfrance_vs_France = ttest_ind(temp['France'],temp['South East France']) columns_to_keep = ['South East France','France'] temp = temp[columns_to_keep] temp = temp.reset_index() temp['Date'] = pd.to_datetime(temp['Date'], format = '%m%d') fig = plt.figure(figsize = (12,8)) plt.xlabel('Month') plt.ylabel('Temperature (Celsius)') plt.title('Comparing Average temperatures for the 2010-2018 period between France and South East of France') plt.style.use('seaborn-colorblind') plt.plot(temp['Date'], temp['France'], color = 'green', lw = 1, label = 'Paris, Lille, Toulouse, Lyon') plt.plot(temp['Date'], temp['South East France'], color = 'blue', lw = 1, label = 'Nice, Marseille') plt.tick_params(top = 'off', bottom = 'off', left = 'off', right = 'off', labelleft = 'on', labelbottom = 'on') ax = plt.gca() for spine in ax.spines.values(): spine.set_visible(False) ax.xaxis.set_major_formatter(dates.DateFormatter('%b')) ax.xaxis.set_major_locator(dates.MonthLocator()) plt.legend(loc = 'best') plt.show() fig.savefig('assignment4.Png', format = 'png') return 
Nice_Marseille[1], Southfrance_vs_France[1]
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 3 08:56:27 2017

@author: Daniel
"""
import pandas as pd
import numpy as np

url_users = 'https://raw.githubusercontent.com/wesm/pydata-book/master/ch02/movielens/users.dat'
url_ratings = 'https://raw.githubusercontent.com/wesm/pydata-book/master/ch02/movielens/ratings.dat'
url_movies = 'https://raw.githubusercontent.com/wesm/pydata-book/master/ch02/movielens/movies.dat'

# engine='python' handles the multi-character "::" separator without a parser warning
users = pd.read_csv(url_users, header=None, delimiter="::", engine='python').rename(
    columns={0: 'userid', 1: 'gender', 2: 'age', 3: 'occupation', 4: 'zipcode'})
ratings = pd.read_csv(url_ratings, header=None, delimiter="::", engine='python').rename(
    columns={0: 'userid', 1: 'movieid', 2: 'rating', 3: 'timestamp'})
movie = pd.read_csv(url_movies, header=None, delimiter="::", engine='python').rename(
    columns={0: 'movieid', 1: 'title', 2: 'genre'})

merged = pd.merge(ratings, movie, on=['movieid'])
merged = pd.merge(merged, users, on=['userid'])
data = merged.copy()
del users, ratings, movie, url_users, url_ratings, url_movies, merged

# data.loc['genre'] would look up a row label; select the column instead
print(np.unique(data['genre']))
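# An optional sketch (not in the original): MovieLens genres are pipe-separated,
# so listing distinct genres usually means splitting and exploding the column first.
genres = data['genre'].str.split('|').explode()
print(genres.unique())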
        print(e)
        break

column_names = "SCode,SName,RDate,LXDM,LX,Count,CGChange,ShareHDNum,VPosition,TabRate,LTZB,ShareHDNumChange,RateChange,fund_type".split(',')
df = pd.DataFrame(li, columns=column_names)
df.to_csv('processed_fin_data/重要基金持股.csv')
'''
# %%
df = pd.read_csv('processed_fin_data/重要基金持股.csv',
                 converters={'SCode': str, 'RDate': pd.to_datetime, 'Count': pd.to_numeric})

# %% Select stock/date combinations where each type of fund's holdings performed well
df_qs = df[(df.LXDM == '券商') & (df.Count > 15) & (df.CGChange == '增持')]
df_bx = df[(df.LXDM == '保险') & (df.Count > 2) & (df.CGChange == '增持')]
df_jj = df[(df.LXDM == '基金') & (df.Count > 200) & (df.CGChange == '增持')]
df_sb = df[(df.LXDM == '社保') & (df.Count > 2) & (df.CGChange == '增持')]
df_QFII = df[(df.LXDM == 'QFII') & (df.Count > 1) & (df.CGChange == '增持')]

# %% Output the overlap of the above groups, i.e. stocks held by more than one type of fund
li_solid = [i for _ in [df_qs, df_bx, df_jj, df_sb, df_QFII]
            for i in _[['SCode', 'RDate']].values.tolist()]
li_solid_copy = copy.deepcopy(li_solid)
li_solid_unique = [list(a) for a in set(tuple(i) for i in li_solid)]
for i in li_solid_unique:
    li_solid_copy.remove(i)
fraud_free_set = set(tuple(i) for i in li_solid_copy)
df_fraud_free = pd.DataFrame(fraud_free_set, columns=['SCode', 'RDate'])
df_fraud_free.to_csv('label_data/df_fraud_free.csv')
pd.merge(df_fraud_free, df, on=['SCode', 'RDate'])
# %%
# Subtract 5 hours to adjust to Colombia local time (UTC-5)
resumen['date'] = pd.to_datetime(resumen.date_1.str.slice(4), format='%Y-%m-%d_%H:%M:%S',
                                 errors='coerce') - np.timedelta64(5, 'h')
resumen['dom_1'] = resumen.date_1.str.slice(0, 3)

# Station-elevation table
balideam = pd.read_csv('/media/edwin/6F71AD994355D30E/Edwin/Maestría Meteorologia/Tesis/estaciones_altura_20180905.csv')
#alturas_1 = pd.read_csv('/media/edwin/6F71AD994355D30E/Edwin/Maestría Meteorologia/Tesis/union_b_20180905.csv')
union_b = pd.merge(resumen, balideam, how='outer', on='cod')

# Temperature correction (0.0065 K per metre of elevation difference)
union_b['temp'] = (((union_b.al_alos - union_b.alt_1) * 0.0065) + union_b.T2)
resumen_back = union_b
resumen_back.T2 = resumen_back.temp

### Build the reception table
recep_t = pd.DataFrame({'tipo_1': np.tile(resumen_back.fecha.unique(), 93),
                        'dom_1': np.tile(np.repeat(['d01', 'd02', 'd03'],
                                                   len(resumen_back.fecha.unique())), 31),
                        'cod_1': np.repeat((resumen_back.cod.unique()).astype(str),
                                           (len(resumen_back.fecha.unique()) * 3))})
'/Preprocessor/data'
'''
Walk data path for text files and add metadata columns overwriting original data
'''
count = 0
for root, dirs, files in os.walk(PATH_INPUTS):
    if 'Indicators' not in root.split('/'):
        for file in files:
            if file[-3:] == 'txt' and file[:7] != 'merged_':
                print('Processing file {}, number {}'.format(file, count))
                count = count + 1
                df = pd.read_csv(root + '/' + file)
                for root1, dirs1, files1 in os.walk(PATH_FEATURES):
                    for file1 in files1:
                        if file1[-3:] == 'txt':
                            print('merging file {} with input file {}'.format(file, file1))
                            feat_df = pd.read_csv(root1 + '/' + file1)
                            feat_symbol = feat_df.iloc[0].SYMBOL
                            new_col = feat_symbol + '_CLOSE'
                            feat_df[new_col] = feat_df['Close']
                            if new_col not in df.columns.tolist():
                                df = pd.merge(df, feat_df[['Date', 'Time', new_col]],
                                              how='left', on=['Date', 'Time'])
                df.to_csv(root + '/' + 'merged_' + file, index=False)
import pandas as pd data_2018 = pd.read_csv('master_2018.csv') data_2019 = pd.read_csv('master_2019.csv') data_2019.rename(columns={'zasp_index': 'zasp_i'}, inplace=True) merged = pd.merge(data_2018, data_2019, on='county_code') merged['county_code'] = merged['county_code'].transform( lambda x: str(x).replace('.0', '').zfill(5)) merged['zasp_index'] = merged['zasp_i'] + (merged['zasp_i'] - merged['zasp_index']) merged.to_csv('master_linear_pred_2020.csv', index=False)
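# The 2020 column above is a straight linear extrapolation: 2020 = 2019 + (2019 - 2018),
# i.e. the 2018-to-2019 change is assumed to repeat. A tiny numeric check (made-up values):
z_2018, z_2019 = 40.0, 46.0
z_2020 = z_2019 + (z_2019 - z_2018)
print(z_2020)  # 52.0 under the constant-trend assumption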
del clean_sabio['Substrate'] # drop duplicates clean_sabio = clean_sabio.join(s) clean_sabio = clean_sabio.drop_duplicates(keep='first') # Reconstruct SabioRK database so genes will map directly to Km and kcat # To choose a single value, I chose the maximum value reported km = clean_sabio[clean_sabio["Type.1"] == "Km"].drop(columns=["Type.1", \ "Species"]).rename(columns={"Start Value":"Km"}) km = km.groupby(["Gene names", "Substrate"], sort=False)["Km"].max().reset_index() kcat = clean_sabio[clean_sabio["Type.1"] == "kcat"].drop(columns=["Type.1", \ "Species"]).rename(columns={"Start Value":"kcat"}) kcat = kcat.groupby(["Gene names", "Substrate"], sort=False)["kcat"].max().reset_index() comb = pd.merge(kcat, km, how="inner", left_on=["Gene names", "Substrate"],\ right_on=["Gene names", "Substrate"]).drop_duplicates(keep="first").dropna() # Get substrates of interest atp = comb[comb["Substrate"] == "ATP"].sort_values(by="kcat", ascending=False) adp = comb[comb["Substrate"] == "ADP"].sort_values(by="kcat", ascending=False) nad = comb[comb["Substrate"] == "NAD+"].sort_values(by="kcat", ascending=False) nadh = comb[comb["Substrate"] == "NADH"].sort_values(by="kcat", ascending=False) acoa = comb[comb["Substrate"] == "Acetyl-CoA"].sort_values(by="kcat", ascending=False) coa = comb[comb["Substrate"] == "Coenzyme A"].sort_values(by="kcat", ascending=False) ############################################################################### # Scatterplot for global analysis #############################################
def prediction_data(stock, days): ''' stock = 'BTC-USD' days = 5 days to predict in future start/end = historical dataset ''' start = datetime.datetime(2019, 8, 1) end = datetime.datetime(2019, 9, 7) # Store in an array the results of the three different classifier prediction_values = [] # try to get the data from internet try: stock_df = web.DataReader(stock, 'yahoo', start, end) # print(stock_df.tail()) csv_name = ('app/exports/BTC-USD_export.csv') stock_df.to_csv(csv_name) except (): print('it eas not possible to get the data.') print(path.exists('app/exports/BTC-USD_export.csv')) stock_df = pd.read_csv('app/exports/BTC-USD_export.csv') # print(df.tail()) # save the data locally into a csv file # if os.path.exists('./{}'.format(data_folder)): # pass # else: # os.mkdir(data_folder) # # csv_name = ('{}/{}_export.csv'.format(data_folder, stock)) # df.to_csv(csv_name) # add a column prediction to the dataset stock_df['prediction'] = stock_df['Close'].shift(-1) stock_df.dropna(inplace=True) # print(stock_df.tail()) forecast_days = int(days) #Predicting the stock price in the future # Random shuffle the dataset # df = df.sample(frac=1) # Set the features columns X = np.array(stock_df.drop(['prediction', 'Date'], 1)) # Set the target column Y = np.array(stock_df['prediction']) # Standardize a dataset along any axis X = preprocessing.scale(X) # Split the dataset to 45% testing and then 55% training sets X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.45) # Performing the Regression on the trainig data linear_regression_classifier = LinearRegression() linear_regression_classifier.fit(X_train, Y_train) X_prediction = X[-forecast_days:] prediction_linear_regression = (linear_regression_classifier.predict(X_prediction)) confidence_lr = linear_regression_classifier.score(X_train, Y_train) plr = round(float(np.float64(prediction_linear_regression[0])), 2) clr = round(float(np.float64(confidence_lr*100)), 2) linear_regression_prediction = {} linear_regression_prediction['prediction'] = plr linear_regression_prediction['confidence'] = clr # Add to the array the results prediction_values.append(linear_regression_prediction) # Print out the Linear Regression prediction print('Prediction at {} days using linear regression is about {} $'.format(days, str(plr))) print('Confidence at {} days using linear regression is about {}% '.format(days, str(clr))) # quadratic, linear, lasso, ridge quadratic_regression_classifier = make_pipeline(PolynomialFeatures(2), Ridge()) quadratic_regression_classifier.fit(X_train, Y_train) prediction_quadratic_regression = quadratic_regression_classifier.predict(X_prediction) confidence_pq = quadratic_regression_classifier.score(X_train, Y_train) pqr = round(float(np.float64(prediction_quadratic_regression[0])), 2) cpq = round(float(np.float64(confidence_pq * 100)), 2) quadratic_regression_prediction = {} quadratic_regression_prediction['prediction'] = pqr quadratic_regression_prediction['confidence'] = cpq # Add to the array the results prediction_values.append(quadratic_regression_prediction) # Print out the Quadratic regression prediction print('Prediction at {} days using quadratic regression is about {} $'.format(days, str(pqr))) print('Confidence at {} days using quadratic regression is about {}%'.format(days, str(cpq))) # KNN Regression kneighbor_regression_classifier = KNeighborsRegressor(n_neighbors=2) kneighbor_regression_classifier.fit(X_train, Y_train) prediction_kneighbor_regression = kneighbor_regression_classifier.predict(X_prediction) 
confidence_kr = kneighbor_regression_classifier.score(X_train, Y_train) pkr = round(float(np.float64(prediction_kneighbor_regression[0])), 2) ckr = round(float(np.float64(confidence_kr * 100)), 2) kneighbor_regression_prediction = {} kneighbor_regression_prediction['prediction'] = pkr kneighbor_regression_prediction['confidence'] = ckr # Add to the array the results prediction_values.append(kneighbor_regression_prediction) # Print out the Quadratic regression prediction print('Prediction at {} days using K Nearest Neighbor (KNN) regression is about {} $'.format(days, str(pkr))) print('Confidence at {} days using K Nearest Neighbor (KNN) regression is about {}%'.format(days, str(ckr))) ## Work on the tweets Dataset print(path.exists('app/exports/analysis_all.csv')) tweet_df = pd.read_csv('app/exports/analysis_all.csv') tweet_df.drop('remove', axis=1, inplace=True) tweet_df['number'] = tweet_df['tweet'].shift() tweet_df.dropna(inplace=True) # group_by_date_sentiment = tweet_df.groupby(['created_at', 'sentiment'])['number'].agg('sum') group_by_date_sentiment = tweet_df.groupby(['created_at', 'sentiment'], as_index=False).count().pivot('created_at', 'sentiment').fillna(0) # print(group_by_date_sentiment) df_tmp = group_by_date_sentiment['number'] # print(group_by_date_sentiment['number'].head()) df_values = stock_df.set_index('Date') final_df = pd.merge(df_values, df_tmp, left_index=True, right_index=True) # print(final_df) # Work with graph columns_df = final_df[['Close', 'Neutral', 'Positive']] x = columns_df.values min_max_scaler = preprocessing.MinMaxScaler() x_scaled = min_max_scaler.fit_transform(x) plot_df = pd.DataFrame(x_scaled, columns=columns_df.columns, index=columns_df.index) final_close_price = plot_df['Close'] neutral = plot_df['Neutral'] positive = plot_df['Positive'] # Print the graph # Adjust the size of mathplotlib mpl.rc('figure', figsize=(8, 7)) mpl.__version__ plt.suptitle('Bitcoin Sentiment Analysis on Tweets', fontsize=14, fontweight='bold') plt.ylabel('Sentiment') plt.xlabel('Time') # Adjust the style of matplotlib style.use(['ggplot']) style.context('Solarize_Light2') neutral.plot( label='Neutral Tweets', color='orange', linestyle='dashed', linewidth=2, alpha=0.5, marker='s', markersize=5, markerfacecolor='blue', markeredgecolor='blue', ) positive.plot( color='green', linestyle='dashed', linewidth=2, alpha=0.5, marker='*', markersize=5, markerfacecolor='blue', markeredgecolor='blue', label='Positive Tweets' ) final_close_price.plot( color='red', linestyle='solid', linewidth=4, alpha=0.5, marker='o', markersize=5, markerfacecolor='blue', markeredgecolor='blue', label='BTC-USD' ) plt.legend() #save to file plt.savefig('app/static/img/sentiment.png') # plt.show() plt.close() # return the price with the best confidence maxConfidenceItem = max(prediction_values, key=lambda x: x['confidence']) print('maxConfidenceItem: {}'.format(str(maxConfidenceItem))) return maxConfidenceItem
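# A minimal usage sketch, assuming the module-level imports used above are in place;
# 'BTC-USD' and the 5-day horizon mirror the example values given in the docstring.
if __name__ == '__main__':
    best = prediction_data('BTC-USD', 5)
    print('Best prediction: {} $ (confidence {}%)'.format(best['prediction'], best['confidence']))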
def main(params: dict, output_dir: str): import mlflow print("start params={}".format(params)) model_id = "all" logger = get_logger() df = pd.read_pickle( "../input/riiid-test-answer-prediction/train_merged.pickle") # df = pd.read_pickle("../input/riiid-test-answer-prediction/split10/train_0.pickle").sort_values(["user_id", "timestamp"]).reset_index(drop=True) if is_debug: df = df.head(30000) df["prior_question_had_explanation"] = df[ "prior_question_had_explanation"].fillna(-1) df["answered_correctly"] = df["answered_correctly"].replace(-1, np.nan) column_config = { ("content_id", "content_type_id"): { "type": "category" }, "user_answer": { "type": "leakage_feature" }, "answered_correctly": { "type": "leakage_feature" }, "part": { "type": "category" }, "prior_question_elapsed_time_bin300": { "type": "category" }, "duration_previous_content_bin300": { "type": "category" }, "prior_question_had_explanation": { "type": "category" }, "rating_diff_content_user_id": { "type": "numeric" }, "task_container_id_bin300": { "type": "category" }, "previous_answer_index_question_id": { "type": "category" }, "previous_answer_question_id": { "type": "category" }, "timediff-elapsedtime_bin500": { "type": "category" }, "timedelta_log10": { "type": "category" } } if not load_pickle or is_debug: feature_factory_dict = {"user_id": {}} feature_factory_dict["user_id"][ "DurationPreviousContent"] = DurationPreviousContent( is_partial_fit=True) feature_factory_dict["user_id"][ "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder() feature_factory_dict["user_id"][ "UserContentRateEncoder"] = UserContentRateEncoder( rate_func="elo", column="user_id") feature_factory_dict["user_id"]["PreviousAnswer2"] = PreviousAnswer2( groupby="user_id", column="question_id", is_debug=is_debug, model_id=model_id, n=300) feature_factory_dict["user_id"][ "StudyTermEncoder2"] = StudyTermEncoder2(is_partial_fit=True) feature_factory_dict["user_id"][ f"MeanAggregatorStudyTimebyUserId"] = MeanAggregator( column="user_id", agg_column="study_time", remove_now=False) feature_factory_dict["user_id"][ "ElapsedTimeMeanByContentIdEncoder"] = ElapsedTimeMeanByContentIdEncoder( ) feature_factory_dict["post"] = { "DurationFeaturePostProcess": DurationFeaturePostProcess() } feature_factory_manager = FeatureFactoryManager( feature_factory_dict=feature_factory_dict, logger=logger, split_num=1, model_id=model_id, load_feature=not is_debug, save_feature=not is_debug) print("all_predict") df = feature_factory_manager.all_predict(df) def f(x): x = x // 1000 if x < -100: return -100 if x > 400: return 400 return x df["task_container_id_bin300"] = [ x if x < 300 else 300 for x in df["task_container_id"] ] df["timediff-elapsedtime_bin500"] = [ f(x) for x in df["timediff-elapsedtime"].values ] df["timedelta_log10"] = np.log10( df["duration_previous_content"].values) df["timedelta_log10"] = df["timedelta_log10"].replace( -np.inf, -1).replace(np.inf, -1).fillna(-1).astype("int8") df = df[[ "user_id", "content_id", "content_type_id", "part", "user_answer", "answered_correctly", "prior_question_elapsed_time_bin300", "duration_previous_content_bin300", "prior_question_had_explanation", "rating_diff_content_user_id", "task_container_id_bin300", "previous_answer_index_question_id", "previous_answer_question_id", "row_id", "timediff-elapsedtime_bin500", "timedelta_log10" ]] print(df.head(10)) print("data preprocess") ff_for_transformer = FeatureFactoryForTransformer( column_config=column_config, dict_path="../feature_engineering/", 
sequence_length=params["max_seq"], logger=logger) ff_for_transformer.make_dict(df=df) n_skill = len(ff_for_transformer.embbed_dict[("content_id", "content_type_id")]) if not load_pickle or is_debug: df_val_row = pd.read_feather( "../input/riiid-test-answer-prediction/train_transformer_last2500k_only_row_id.feather" ) if is_debug: df_val_row = df_val_row.head(3000) df_val_row["is_val"] = 1 df = pd.merge(df, df_val_row, how="left", on="row_id") df["is_val"] = df["is_val"].fillna(0) print(df["is_val"].value_counts()) w_df = df[df["is_val"] == 0] w_df["group"] = ( w_df.groupby("user_id")["user_id"].transform("count") - w_df.groupby("user_id").cumcount()) // params["max_seq"] w_df["user_id"] = w_df["user_id"].astype( str) + "_" + w_df["group"].astype(str) group = ff_for_transformer.all_predict(w_df) dataset_train = SAKTDataset(group, n_skill=n_skill, max_seq=params["max_seq"]) del w_df gc.collect() ff_for_transformer = FeatureFactoryForTransformer( column_config=column_config, dict_path="../feature_engineering/", sequence_length=params["max_seq"], logger=logger) if not load_pickle or is_debug: group = ff_for_transformer.all_predict(df[df["content_type_id"] == 0]) dataset_val = SAKTDataset(group, is_test=True, n_skill=n_skill, max_seq=params["max_seq"]) os.makedirs("../input/feature_engineering/model275_all", exist_ok=True) if not is_debug and not load_pickle: with open(f"../input/feature_engineering/model275_all/train.pickle", "wb") as f: pickle.dump(dataset_train, f) with open(f"../input/feature_engineering/model275_all/val.pickle", "wb") as f: pickle.dump(dataset_val, f) if not is_debug and load_pickle: with open(f"../input/feature_engineering/model275_all/train.pickle", "rb") as f: dataset_train = pickle.load(f) with open(f"../input/feature_engineering/model275_all/val.pickle", "rb") as f: dataset_val = pickle.load(f) print("loaded!") dataloader_train = DataLoader(dataset_train, batch_size=params["batch_size"], shuffle=True) dataloader_val = DataLoader(dataset_val, batch_size=params["batch_size"], shuffle=False) model = SAKTModel(n_skill, embed_dim=params["embed_dim"], max_seq=params["max_seq"], dropout=dropout, cont_emb=params["cont_emb"]) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.2 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW( optimizer_grouped_parameters, lr=params["lr"], weight_decay=0.2, ) num_train_optimization_steps = int(len(dataloader_train) * 25) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=params["num_warmup_steps"], num_training_steps=num_train_optimization_steps) criterion = nn.BCEWithLogitsLoss() model.to(device) criterion.to(device) auc_val = 0 for epoch in range(epochs): loss, acc, auc, auc_val = train_epoch(model, dataloader_train, dataloader_val, optimizer, criterion, scheduler, epoch, output_dir, device) print("epoch - {} train_loss - {:.3f} auc - {:.4f} auc-val: {:.4f}". 
format(epoch, loss, auc, auc_val)) torch.save( model.state_dict(), f"{output_dir}/transformers_epoch{epoch}_auc{round(auc_val, 4)}.pth" ) # df_oof.to_csv(f"{output_dir}/transformers1.csv", index=False) """ df_oof2 = pd.read_csv("../output/ex_237/20201213110353/oof_train_0_lgbm.csv") df_oof2.columns = ["row_id", "predict_lgbm", "target"] df_oof2 = pd.merge(df_oof, df_oof2, how="inner") auc_lgbm = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values) print("lgbm: {:.4f}".format(auc_lgbm)) print("ensemble") max_auc = 0 max_nn_ratio = 0 for r in np.arange(0, 1.05, 0.05): auc = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values*(1-r) + df_oof2["predict"].values*r) print("[nn_ratio: {:.2f}] AUC: {:.4f}".format(r, auc)) if max_auc < auc: max_auc = auc max_nn_ratio = r print(len(df_oof2)) """ if not is_debug: mlflow.start_run(experiment_id=10, run_name=os.path.basename(__file__)) for key, value in params.items(): mlflow.log_param(key, value) mlflow.log_metric("auc_val", auc_val) mlflow.end_run() torch.save(model.state_dict(), f"{output_dir}/transformers.pth") del model torch.cuda.empty_cache() with open(f"{output_dir}/transformer_param.json", "w") as f: json.dump(params, f) if is_make_feature_factory: # feature factory feature_factory_dict = {"user_id": {}} feature_factory_dict["user_id"][ "DurationPreviousContent"] = DurationPreviousContent( is_partial_fit=True) feature_factory_dict["user_id"][ "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder() feature_factory_manager = FeatureFactoryManager( feature_factory_dict=feature_factory_dict, logger=logger, split_num=1, model_id="all", load_feature=not is_debug, save_feature=not is_debug) ff_for_transformer = FeatureFactoryForTransformer( column_config=column_config, dict_path="../feature_engineering/", sequence_length=params["max_seq"], logger=logger) df = pd.read_pickle( "../input/riiid-test-answer-prediction/train_merged.pickle") if is_debug: df = df.head(10000) df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True) feature_factory_manager.fit(df) df = feature_factory_manager.all_predict(df) for dicts in feature_factory_manager.feature_factory_dict.values(): for factory in dicts.values(): factory.logger = None feature_factory_manager.logger = None with open(f"{output_dir}/feature_factory_manager.pickle", "wb") as f: pickle.dump(feature_factory_manager, f) ff_for_transformer.fit(df) ff_for_transformer.logger = None with open( f"{output_dir}/feature_factory_manager_for_transformer.pickle", "wb") as f: pickle.dump(ff_for_transformer, f)