def generateGameGenreMatrix(self, appids=None, nGames=10, save=None, file_size=''):
    """Generate game-genre matrix (app * genre).

    Args:
        appids: Iterable of app ids. When None, the ids are read from the
            JSON app list fetched with ``requests``.
        nGames: Not used by this method; kept so existing callers still work.
        save: When not None, the matrix is also written as a gzip CSV.
        file_size: Suffix inserted into the output file name.

    Returns:
        DataFrame indexed by ``appid`` with one column per genre yielded by
        ``self.getApps``; cells are 1 where set and 0 elsewhere.
    """
    if appids is None:
        # NOTE(review): the endpoint URL is empty in the source, so this
        # request cannot succeed until the real URL is restored.
        steamAppList = ''
        dictGames = requests.get(steamAppList)
        jsonGames = dictGames.json()
        gameList = [i['appid'] for i in jsonGames['applist']['apps']['app']]
        appids = pd.DataFrame(gameList, columns=['appid'])
        appids = appids['appid'].unique()

    gm = pd.DataFrame()
    for app_id in tqdm(appids):
        for genre in self.getApps(app_id):
            if genre is not None:
                # .loc enlarges the frame when the row/column does not exist
                # yet; DataFrame.set_value is gone in current pandas.
                gm.loc[app_id, genre] = 1
    # Name the index after filling: enlargement may rebuild the index object.
    gm.index.names = ["appid"]
    gm = gm.fillna(value=0)
    print('\n')
    # NOTE(review): the assignment target was missing in the source
    # (" = gm"); `self.gm` is the presumed original — confirm.
    self.gm = gm
    if save is not None:
        gm.to_csv('Resources/gamematrix{0}.csv.gz'.format(file_size),
                  compression='gzip', mode='w+')
    return gm
def ret_dr(response_dict):
    """Convert a repository search response into a JSON string.

    Args:
        response_dict: Mapping with an ``'items'`` list. Each item must
            provide ``'created_at'``, ``'updated_at'``, ``'name'``,
            ``'forks'``, ``'stargazers_count'`` and ``'size'``.

    Returns:
        The JSON produced by ``DataFrame.to_json()`` for a frame with the
        columns created_at, updated_at, name, forks, stars and size.

    Raises:
        KeyError: If ``'items'`` or one of the per-item keys is missing.
    """
    columns = ['created_at', 'updated_at', 'name', 'forks', 'stars', 'size']
    # Build all rows first: DataFrame.append no longer exists in pandas 2.x
    # and copied the whole frame on every call.
    rows = [
        {
            'created_at': item['created_at'],
            'updated_at': item['updated_at'],
            'name': item['name'],
            'forks': item['forks'],
            # The star count is exposed as 'stargazers_count'.
            'stars': item['stargazers_count'],
            'size': item['size'],
        }
        for item in response_dict['items']
    ]
    df = pd.DataFrame(rows, columns=columns)
    return df.to_json()
def summary_extract(data, n_min=2, n_max=5):
    '''Function that creates a table output of top N-grams found in the
    input text data.

    Inputs:
        data: A list of text documents on which to analyze.
        n_min: Integer representing the minimum N-gram size to count.
        n_max: Integer representing the maximum N-gram size to count.

    Output:
        A pandas DataFrame listing N-gram total counts (Frequency),
        document frequency (No. Cases), document percentage (% Cases),
        as well as the N-gram itself and its length. Sorted by decreasing
        Frequency. An empty frame with the same columns is returned when
        nothing qualifies.
    '''
    columns = ['Phrase', 'Frequency', 'No. Cases', '% of Cases', 'Length']
    tot_grams = len(data)
    sum_list = []
    for length in range(n_min, n_max + 1):
        # Each value is read as (total count, document count).
        n_gram_counts = get_n_grams(length, data)
        for n_gram, counts in n_gram_counts.items():
            frequency, n_cases = counts[0], counts[1]
            # Keep n-grams seen in more than one document, or everything
            # when fewer than 10 documents were supplied.
            if n_cases > 1 or tot_grams < 10:
                sum_list.append([
                    ' '.join(word.upper() for word in n_gram),
                    frequency,
                    n_cases,
                    str(round(100 * n_cases / tot_grams, 2)) + '%',
                    length,
                ])
    # Passing columns to the constructor also works for an empty list;
    # assigning five names to an empty frame afterwards raises.
    res = pd.DataFrame(sum_list, columns=columns)
    return res.sort_values('Frequency', ascending=False)
def infusion_increasing(self, path, bz2filepath):
    """Merge daily ticker data with previous day

    :path: Daily ticker data file path
    :bz2filepath: Previous day data path
    """
    import tarfile

    if not os.path.exists(path):
        print('Data file not exists')
        return

    self.parse_daily_ticker(path)
    # NOTE(review): the opener call was truncated in the source
    # ("tar =, 'r:bz2')"); tarfile.open(bz2filepath, ...) is the presumed
    # original — confirm.
    with tarfile.open(bz2filepath, 'r:bz2') as tar:
        tar.extractall()
    try:
        # NOTE(review): the opener was truncated here as well; bcolz.open is
        # assumed and `bcolz` is expected to be imported by this module —
        # confirm.
        table = bcolz.open('futures.bcolz', 'r')
        index = table.attrs['line_map']
        old_table = collections.defaultdict(pd.DataFrame)
        for elt in index:
            s, e = index[elt]
            old_table[elt] = pd.DataFrame(table[s:e])
        # Merge new dict
        for elt in self.whole_q_df:
            if elt in old_table:
                # pd.concat takes the frames as one list argument.
                old_table[elt] = pd.concat(
                    [old_table[elt], self.whole_q_df[elt]],
                    ignore_index=True)
        outpath = os.path.join(os.path.abspath('.'), 'data')
        # Always start from an empty output directory.
        if os.path.exists(outpath):
            shutil.rmtree(outpath)
        os.makedirs(outpath)
        self._generate_bcolzdata(old_table, outpath)
    except Exception as e:
        # Best-effort: report the failure (with its cause) and carry on.
        print('Can not find futures.bcolz fine', e)
    finally:
        # TODO(review): the source also called os.remove('') here; the file
        # name was lost and an empty path always raises, so that call is
        # omitted until the intended name is known.
        # os.remove fails on a directory, so handle both layouts.
        if os.path.isdir('futures.bcolz'):
            shutil.rmtree('futures.bcolz')
        elif os.path.exists('futures.bcolz'):
            os.remove('futures.bcolz')
    return
def draw_heatmap1(data):
    """Draw *data* as a heatmap.

    Wraps ``data`` in a DataFrame, opens a new figure with ``plt.figure()``
    and passes the frame to ``sns.heatmap``. Returns None.
    """
    df = pd.DataFrame(data)
    plt.figure()
    sns.heatmap(df)
# Python packages used: SQLAlchemy, pandas
# Workflow:
#   import packages -> create database engine -> connect to the engine ->
#   query the database -> save the query result as a DataFrame ->
#   close the connection

# First method
from sqlalchemy import create_engine
import pandas as pd

engine = create_engine(" ---SQlFileName.sql_extension----")
con = engine.connect()
rs = con.execute("---------SQLQuery------")
# fetchmany(size=...) can be used here instead of fetchall()
df = pd.DataFrame(rs.fetchall())
con.close()

# Second method - context manager
from sqlalchemy import create_engine
import pandas as pd

engine = create_engine(" ---SQlFileName.sql_extension----")
# connect must be called; the connection is closed when the block exits.
with engine.connect() as con:
    rs = con.execute("-------SQLQuery----")
    df = pd.DataFrame(rs.fetchall())
    df.columns = rs.keys()

# Third method
from sqlalchemy import create_engine
import pandas as pd

engine = create_engine(" ---SQlFileName.sql_extension----")
df = pd.read_sql_query("------SQLQuery----", engine)
# NOTE(review): the original nesting of the first three statements (they
# look like the tail of a per-day loop) cannot be recovered from this view.
y_df.columns = ['real', 'predicted']
y_df.to_csv(out_folder + output_text + '_day' + date + '.csv',
            header=True, index=False)
print('Day finished!\n')

print('Creating csv...\n')
# One frame per metric list, plus the column-wise mean of each.
sts = pd.DataFrame(skill_test_scores)
sts_m = sts.mean()
dl = pd.DataFrame(date_list)
nrmse = pd.DataFrame(nrmse_list)
nrmse_m = nrmse.mean()
rmse = pd.DataFrame(rmse_list)
rmse_m = rmse.mean()
rmse_pers = pd.DataFrame(rmse_pers_list)
rmse_pers_m = rmse_pers.mean()
rmse_raw = pd.DataFrame(rmse_raw_list)
rmse_raw_m = rmse_raw.mean()

# Side-by-side report: per-day values first, then the means.
skills_report = pd.concat([
    dl, rmse_raw, rmse, rmse_pers, nrmse, sts, rmse_raw_m, rmse_m,
    rmse_pers_m, nrmse_m, sts_m
], axis=1, ignore_index=True)
skills_report.columns = [
    'date', 'rmse_raw', 'rmse', 'rmse_pers', 'nrmse', 'skill', 'rmse_raw_m',
    'rmse_m', 'rmse_pers_m', 'nrmse_m', 'skill_m'
]
skills_report.to_csv(out_folder + '/daily_skills_' + hp + '.csv',
                     header=True, index=False)
# Bare expressions below are kept as-is; they only display a value when run
# interactively.
ratings_mean_count_df.reset_index()
ratings_mean_count_df['mean'].plot(bins=100, kind='hist', color='r')
ratings_mean_count_df['count'].plot(bins=100, kind='hist', color='r')
ratings_mean_count_df[ratings_mean_count_df['mean'] == 5]

# FILTER
# One row per user, one column per title, cells hold the rating.
# NOTE(review): the source assigned and read this matrix under three
# different spellings; `userid_movietitle_matrix` is used throughout here —
# check later code for the other spellings.
userid_movietitle_matrix = movies_rating_df.pivot_table(
    index='user_id', columns='title', values='rating')
userid_movietitle_matrix

titanic = userid_movietitle_matrix['Titanic (1997)']
titanic_correlations = pd.DataFrame(
    userid_movietitle_matrix.corrwith(titanic), columns=['Correlation'])
titanic_correlations = titanic_correlations.join(ratings_mean_count_df['count'])
titanic_correlations
titanic_correlations.dropna(inplace=True)
titanic_correlations.sort_values('Correlation', ascending=False)
titanic_correlations[titanic_correlations['count'] > 80].sort_values(
    'Correlation', ascending=False).head(5)

# All
movie_correlations = userid_movietitle_matrix.corr(method='pearson', min_periods=80)

# Test
myRatings = pd.read_csv('My_Ratings.csv')
import pandas as pd  # data handling
import numpy as np  # data handling
import matplotlib.pyplot as plt  # plotting (and the charts look really nice)

# NOTE(review): these read like independent reference examples rather than
# a script meant to run top to bottom — confirm before executing as a whole.

# pandas commands used so far
data = pd.read_excel('C:\\Users\\M\\Desktop\\python\\计算机学院研究生.xlsx')  # read the raw data
data.to_excel('C:\\Users\\M\\Desktop\\output.xlsx')  # write the data to Excel (DataFrame method, returns None)
data = pd.DataFrame(columns=list('ABCD'), index=list('1234'))  # build a DataFrame with columns A-D and rows 1-4
# NOTE(review): the accessor was missing in the source ("a =[...]");
# `data.at` is assumed from the comments — confirm.
a = data.at['3行', '2列']  # access the value at one position in data
a = data.loc['3行', '2列']  # same effect as above
a = data.loc['缪奇峰']  # all data of the row labelled '缪奇峰'
a = data.iloc[1, 2]  # value at row position 1, column position 2
a = data.sort_values(by='出生日期')  # sort by the values of '出生日期'
a = data[data.出生日期 > 19950101]  # rows whose 出生日期 is greater than 19950101
data[data.出生日期 > 19950101] = '都是我小弟'  # where 出生日期 > 19950101, set the values to '都是我小弟'
a = data.学号  # one column of data, returned as a Series
a = data[10:45]  # rows at positions 10 through 44 of data
a = data.生日.max()  # maximum of the 生日 column; min() gives the minimum
a = data.学号.mean()  # mean of the 学号 column
a = data.shape  # shape of the table: (rows, columns)
a = data.shape[0]  # number of rows; [1] is the number of columns

# numpy commands used so far
a = np.arange(33)  # array of the integers 0 through 32
a = np.nan  # a is the missing value NaN

# matplotlib commands used so far
data = pd.read_excel('C:\\Users\\M\\Desktop\\数学建模\\运动学片段\\3低速\\final.xlsx')
st_code_name ={'600511':'国药股份', '002311':'海大集团', '000591':'太阳能', '600809':'山西汾酒', '603939':'益丰药房', '002299':'圣农发展', '600295':'鄂尔多斯', '000048':'京基智农', '600438':'通威股份'} data = pd.DataFrame({'国药股份':fetch_stock_data('000651', '国药股份', '2015-01-01', '2018-10-23')} for k , v in st_code_name.items(): if k == '国药股份' : continue data = pd.concat([data, pd.DataFram({k:fetch_stock_data(v, k, '2015-01-01', '2018-10-23')})],1) data = data.dropna() data.to_excel('stock_data.xlsx') date = data.pop('date') newdata = (data/data.iloc[0, :])*100 init_notebook_mode() st_name=[] for v in st_code_name.values():