def generateGameGenreMatrix(self, appids=None, nGames=10, save=None, file_size=''):
        """Generate a binary game-genre matrix (rows: appid, columns: genre).

        Parameters
        ----------
        appids : pandas.DataFrame, optional
            Frame with an 'appid' column. When None, the full app list is
            fetched from the Steam Web API.
        nGames : int
            Unused here; kept for interface compatibility with callers.
        save : optional
            When not None, the matrix is written to a gzipped CSV under
            'Resources/'.
        file_size : str
            Suffix appended to the output file name.

        Returns
        -------
        pandas.DataFrame
            One row per app, one column per genre, 1/0 membership values.
        """

        if appids is None:
            steamAppList = 'http://api.steampowered.com/ISteamApps/GetAppList/v2/'
            response = requests.get(steamAppList)
            jsonGames = response.json()
            # GetAppList v2 returns {"applist": {"apps": [{"appid": ..., ...}]}};
            # the extra ['app'] level the original indexed belongs to the v1 API.
            gameList = [entry['appid'] for entry in jsonGames['applist']['apps']]
            appids = pd.DataFrame(gameList, columns=['appid'])  # was pd.DataFram (typo)

        appids = appids['appid'].unique()
        gm = pd.DataFrame()
        gm.index.names = ["appid"]
        for app_id in tqdm(appids):  # renamed from 'id' (shadowed the builtin)
            for genre in self.getApps(app_id):
                if genre is not None:
                    # .loc enlarges the frame as needed; DataFrame.set_value
                    # was removed in pandas 1.0.
                    gm.loc[app_id, genre] = 1
        gm = gm.fillna(value=0)
        print('\n')
        self.gm = gm
        if save is not None:
            gm.to_csv('Resources/gamematrix{0}.csv.gz'.format(file_size), compression='gzip', mode='w+')

        return gm
Beispiel #2
0
def ret_dr(response_dict):
    """Build a JSON summary of repositories from a GitHub search response.

    Parameters
    ----------
    response_dict : dict
        Parsed JSON of a GitHub repository search; repository records are
        expected under the 'items' key.

    Returns
    -------
    str
        JSON serialization (DataFrame.to_json) of a frame with one row per
        repository: timestamps, name, fork count, star count and size.
    """
    # Collect rows first and build the frame once: DataFrame.append is
    # deprecated/removed and quadratic when called in a loop.
    rows = [
        {
            'created_at': item['created_at'],
            'updated_at': item['updated_at'],   # was misspelled 'updaed_at'
            'name': item['name'],
            'forks': item['forks'],
            'stars': item['stargazers_count'],  # was 'starsgazers_count'
            'size': item['size'],
        }
        for item in response_dict['items']
    ]
    df = pd.DataFrame(
        rows,
        columns=['created_at', 'updated_at', 'name', 'forks', 'stars', 'size'])
    # DataFrame has no .json() method (the original raised AttributeError);
    # to_json() is the intended serializer.
    return df.to_json()
Beispiel #3
0
def summary_extract(data, n_min=2, n_max=5):
    '''Function that creates a table output of top N-grams found in the input
       text data.

       Inputs:
              data: A list of text documents on which to analyze.
              n_min: Integer representing the minimum N-gram size to count.
              n_max: Integer representing the maximum N-gram size to count.

       Output: A pandas DataFrame listing N-gram total counts (Frequency),
               document frequency (No. Cases), document percentage (% Cases),
               as well as the N-gram itself and its length. Sorted by
               decreasing Frequency.'''

    # One dict per N-gram length, in order, so the enumerate offset below
    # recovers the length without the original's linear dict_list.index().
    dict_list = [get_n_grams(k, data) for k in range(n_min, n_max + 1)]

    sum_list = []
    tot_grams = len(data)

    for offset, dic in enumerate(dict_list):
        length = offset + n_min
        for n_gram in dic:
            text = ' '.join(word.upper() for word in n_gram)
            # Keep N-grams appearing in more than one document; for very
            # small corpora (< 10 docs) keep everything.
            if (dic[n_gram][1] > 1) or (tot_grams < 10):
                sum_list.append([
                    text, dic[n_gram][0], dic[n_gram][1],
                    str(round(100 * dic[n_gram][1] / tot_grams, 2)) + '%',
                    length
                ])

    # Passing columns at construction also fixes the empty case: the original
    # built an empty frame (via a misspelled pd.DataFram) and then failed
    # when assigning five column names to a zero-column frame.
    res = pd.DataFrame(
        sum_list,
        columns=['Phrase', 'Frequency', 'No. Cases', '% of Cases', 'Length'])
    return res.sort_values('Frequency', ascending=False)
Beispiel #4
0
 def infusion_increasing(self, path, bz2filepath):
     """Merge daily ticker data with the previous day's archived data.

     :param path: Daily ticker data file path.
     :param bz2filepath: Path of the previous day's .tar.bz2 archive,
         expected to contain 'futures.bcolz' and 'instruments.pk'.
     """
     if not os.path.exists(path):
         print('Data file not exists')
         return
     self.parse_daily_ticker(path)
     # Unpack the previous day's archive into the working directory;
     # the context manager guarantees the tar handle is closed.
     with tarfile.open(bz2filepath, 'r:bz2') as tar:
         tar.extractall()
     try:
         table = bcolz.open('futures.bcolz', 'r')
         index = table.attrs['line_map']
         old_table = collections.defaultdict(pd.DataFrame)
         for elt in index:
             s, e = index[elt]
             old_table[elt] = pd.DataFrame(table[s:e])  # was pd.DataFram (typo)
         # Append today's rows to each instrument already present.
         for elt in self.whole_q_df:
             if elt in old_table:
                 # pd.concat takes a *list* of frames; the original passed
                 # two positional frames, which raises TypeError.
                 old_table[elt] = pd.concat([old_table[elt],
                                             self.whole_q_df[elt]],
                                            ignore_index=True)
         # Recreate the output directory from scratch.
         outpath = os.path.join(os.path.abspath('.'), 'data')
         if os.path.exists(outpath):
             shutil.rmtree(outpath)
         os.makedirs(outpath)
         self._generate_bcolzdata(old_table, outpath)
     except Exception:
         # Best-effort: the archive may not contain a futures.bcolz table.
         print('Can not find futures.bcolz file')  # was '...fine' (typo)
     finally:
         # Clean up the files extracted from the archive.
         os.remove('instruments.pk')
         os.remove('futures.bcolz')
def draw_heatmap1(data):
    """Render *data* as a seaborn heatmap and show the figure."""
    df = pd.DataFrame(data)      # was pd.DataFram (typo)
    fig = plt.figure()
    sns_plot = sns.heatmap(df)   # was sns.hearmap (typo)
    plt.show()
Beispiel #6
0
 #Python Packages used : SQLAlchemy , Pandas
 #WorkFlow-
 # Import packages - Create database engine - connect to the engine - query the database - save query as dataframe  - close connection
 
 # First method: explicit connect / close
 from sqlalchemy import create_engine
 import pandas as pd
 engine = create_engine(" ---SQlFileName.sql_extension----")
 con = engine.connect()
 rs = con.execute("---------SQLQuery------")   # was con.exceute (typo)
 #can be (fetchmany(size= )) also
 df = pd.DataFrame(rs.fetchall())              # was pd.DataFram (typo)
 con.close()
 
 
 
 #Second method - Context manager (connection is closed automatically)
 from sqlalchemy import create_engine
 import pandas as pd
 engine = create_engine(" ---SQlFileName.sql_extension----")
 with engine.connect() as con:                 # connect is a method: it must be called
       rs = con.execute("-------SQLQuery----")
       df = pd.DataFrame(rs.fetchall())
       df.columns = rs.keys()
 
 
 #Third method: let pandas manage the connection
 from sqlalchemy import create_engine
 import pandas as pd
 engine = create_engine(" ---SQlFileName.sql_extension----")
 df = pd.read_sql_query("------SQLQuery----", engine)
Beispiel #7
0
                y_df.columns = ['real', 'predicted']
                y_df.to_csv(out_folder + output_text + '_day' + date + '.csv',
                            header=True,
                            index=False)
                print('Day finished!\n')
            print('Creating csv...\n')
            sts = pd.DataFrame(skill_test_scores)
            sts_m = sts.mean()
            dl = pd.DataFrame(date_list)
            nrmse = pd.DataFrame(nrmse_list)
            nrmse_m = nrmse.mean()
            rmse = pd.DataFrame(rmse_list)
            rmse_m = rmse.mean()
            rmse_pers = pd.DataFrame(rmse_pers_list)
            rmse_pers_m = rmse_pers.mean()
            rmse_raw = pd.DataFram(rmse_raw_list)
            rmse_raw_m = rmse_raw.mean()

            skills_report = pd.concat([
                dl, rmse_raw, rmse, rmse_pers, nrmse, sts, rmse_raw_m, rmse_m,
                rmse_pers_m, nrmse_m, sts_m
            ],
                                      axis=1,
                                      ignore_index=True)
            skills_report.columns = [
                'date', 'rmse_raw', 'rmse', 'rmse_pers', 'nrmse', 'skill',
                'rmse_raw_m', 'rmse_m', 'rmse_pers_m', 'nrmse_m', 'skill_m'
            ]
            skills_report.to_csv(out_folder + '/daily_skills_' + hp + '.csv',
                                 header=True,
                                 index=False)
# NOTE(review): reset_index() returns a new frame; the result is discarded
# here — confirm whether the reset was actually intended.
ratings_mean_count_df.reset_index()

ratings_mean_count_df['mean'].plot(bins=100, kind='hist', color='r')

ratings_mean_count_df['count'].plot(bins=100, kind='hist', color='r')

# Movies with a perfect average rating.
ratings_mean_count_df[ratings_mean_count_df['mean'] == 5]


# FILTER
# pivot_table's keyword is 'columns' (the original 'column=' raises TypeError).
user_id_movietitle_matrix = movies_rating_df.pivot_table(index='user_id', columns='title', values='rating')
user_id_movietitle_matrix

# The original referenced the undefined names 'userid_movietitle_matrix' and
# 'user_movietitle_matrix' below; the defined name is used consistently here.
titanic = user_id_movietitle_matrix['Titanic (1997)']

# Correlate every movie's rating column with Titanic's ratings.
titanic_correlations = pd.DataFrame(user_id_movietitle_matrix.corrwith(titanic), columns=['Correlation'])
titanic_correlations = titanic_correlations.join(ratings_mean_count_df['count'])
titanic_correlations

titanic_correlations.dropna(inplace=True)

titanic_correlations.sort_values('Correlation', ascending=False)

# Only keep movies with enough ratings for the correlation to be meaningful.
titanic_correlations[titanic_correlations['count'] > 80].sort_values('Correlation', ascending=False).head(5)

# All movie-vs-movie correlations in one shot.
movie_correlations = user_id_movietitle_matrix.corr(method='pearson', min_periods=80)


# Test
myRatings = pd.read_csv('My_Ratings.csv')
import pandas as pd                 # data handling
import numpy as np                  # data handling
import matplotlib.pyplot as plt     # plotting (and quite pretty, too)

# pandas commands used so far
data = pd.read_excel('C:\\Users\\M\\Desktop\\python\\计算机学院研究生.xlsx')      # read the raw data
data.to_excel('C:\\Users\\M\\Desktop\\output.xlsx')       # write to Excel; to_excel is a DataFrame method, not a pandas function (original `pd.to_excel` raised AttributeError)
data = pd.DataFrame(columns=list('ABCD'), index=list('1234'))    # frame with columns A-D and index 1-4 (original misspelled DataFram/colums)

a = data.at['3行', '2列']      # access a single cell of data
a = data.loc['3行', '2列']     # same effect as above
a = data.loc['缪奇峰']         # access the whole row labelled '缪奇峰'
a = data.iloc[1, 2]            # access the cell at row 1, column 2

a = data.sort_values(by='出生日期')             # sort by the '出生日期' (birth date) column
a = data[data.出生日期 > 19950101]              # rows whose birth date is after 19950101
data[data.出生日期 > 19950101] = '都是我小弟'    # overwrite those rows with the string value

a = data.学号        # access one column; behaves like an array
a = data[10:45]      # rows 10-45 of data
a = data.生日.max()  # maximum of the 生日 column; min() for the minimum
a = data.学号.mean() # mean of the 学号 column
a = data.shape       # shape of the table: (rows, columns)
a = data.shape[0]    # number of rows; [1] gives the number of columns

# numpy commands used so far
a = np.arange(33)   # array of 0..32 (the stop value is exclusive)
a = np.nan          # a is NaN (missing value)

# matplotlib commands used so far
data = pd.read_excel('C:\\Users\\M\\Desktop\\数学建模\\运动学片段\\3低速\\final.xlsx')
# Mapping of stock code -> stock name.
st_code_name = {'600511': '国药股份',
                '002311': '海大集团',
                '000591': '太阳能',
                '600809': '山西汾酒',
                '603939': '益丰药房',
                '002299': '圣农发展',
                '600295': '鄂尔多斯',
                '000048': '京基智农',
                '600438': '通威股份'}

# Seed the frame with the first stock. The original line was missing its
# closing parenthesis (SyntaxError) and used code '000651', which contradicts
# the dict's 600511 -> 国药股份 mapping — TODO confirm the intended code.
data = pd.DataFrame({'国药股份': fetch_stock_data('600511', '国药股份', '2015-01-01', '2018-10-23')})
for code, name in st_code_name.items():
    # Skip the seed stock. The original compared the *code* key against the
    # name ('国药股份'), which never matched, so the seed was fetched twice.
    if name == '国药股份':
        continue
    # The original swapped the (code, name) argument order relative to the
    # seed call, keyed the column by code instead of name, misspelled
    # pd.DataFram, and passed axis positionally.
    data = pd.concat(
        [data, pd.DataFrame({name: fetch_stock_data(code, name, '2015-01-01', '2018-10-23')})],
        axis=1)


data = data.dropna()

data.to_excel('stock_data.xlsx')

date = data.pop('date')

# Normalize every series to 100 at the first date so trends are comparable.
newdata = (data / data.iloc[0, :]) * 100


init_notebook_mode()

st_name=[]
for v in st_code_name.values():