def _load_chat(self):
    """Return df of iMessage chats."""
    # Create sql engine for pandas connection.
    im_engine = create_engine('sqlite:////Users/{u}/Library/Messages/chat.db'.format(u=self.username))
    # Load tables into DataFrames.
    handle_cols = ['ROWID', 'id']
    df_handle = pd.read_sql_table(table_name='handle', con=im_engine, columns=handle_cols)
    df_chat_handle_join = pd.read_sql_table(table_name='chat_handle_join', con=im_engine)
    chat_cols = ['ROWID', 'display_name', 'chat_identifier']
    df_chat = pd.read_sql_table(table_name='chat', con=im_engine, columns=chat_cols)
    df_chat_msg_join = pd.read_sql_table(table_name='chat_message_join', con=im_engine)
    msg_cols = ['ROWID', 'text', 'date', 'is_emote', 'is_from_me', 'handle_id']
    df_msg = pd.read_sql_table(table_name='message', con=im_engine, columns=msg_cols)
    # Join iMessage tables into single DataFrame.
    ## Remove unnecessary fields before each join.
    df = df_handle.merge(df_chat_handle_join, left_on='ROWID', right_on='handle_id', how='left')
    df.drop(labels=['ROWID'], axis=1, inplace=True)
    df = df.merge(df_chat, left_on='chat_id', right_on='ROWID', how='left')
    df.drop(labels=['ROWID'], axis=1, inplace=True)
    df = df.merge(df_chat_msg_join, on='chat_id', how='left')
    df.drop(labels=['chat_id'], axis=1, inplace=True)
    df = df.merge(df_msg, left_on=['message_id', 'handle_id'],
                  right_on=['ROWID', 'handle_id'], how='left')
    df.drop(labels=['ROWID', 'message_id'], axis=1, inplace=True)
    # Parse timestamp.
    ## Add 978307200 (seconds between 1970-01-01 and 2001-01-01) to convert
    ## Apple NSDate to Unix epoch time.
    ns_conversion = 978307200
    _date_func = lambda x: dt.fromtimestamp(x + ns_conversion) if not np.isnan(x) else dt.now()
    df.date = df.date.apply(_date_func)
    return df
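# A minimal host for the method above: the body assumes a class carrying a
# `username` attribute plus these imports. The class name `ChatLoader` is an
# assumption for illustration only.
from datetime import datetime as dt

import numpy as np
import pandas as pd
from sqlalchemy import create_engine


class ChatLoader:
    def __init__(self, username):
        self.username = username

    _load_chat = _load_chat  # bind the function above as a method


if __name__ == '__main__':
    chats = ChatLoader('alice')._load_chat()
    print(chats.head())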
def rank(dataset, force_overwrite=False):
    # name of the result table
    res_tb = dataset + '_result'
    # read even when cached; column names are extracted from X
    tb = pd.read_sql_table(dataset, db, index_col='ID')
    X = tb.iloc[:, :-1]
    y = tb.iloc[:, -1]
    # check if it is cached
    if res_tb in db.table_names() and not force_overwrite:
        # yes, get it
        res = pd.read_sql_table(res_tb, db, index_col='index')
    else:
        # no, compute it
        # remove low-variance columns
        low_var_cols = X.columns[X.var() < 1e-5]
        X.drop(low_var_cols, axis=1, inplace=True)
        # rank
        (rank1, R2) = rfe_with_grid_search(
            X.values, y, RandomForestRegressor(n_jobs=-1),
            [{'n_estimators': [5, 10, 30], 'max_features': [1.0]}])
        (rank2, scores) = lassocv_n_random_lasso(X, y)
        res = pd.DataFrame(
            np.array([X.columns[rank1], R2, X.columns[rank2], scores]).T,
            columns=['rfe_random_forest', 'R2', 'randomized_lasso', 'scores'])
        res.to_sql(res_tb, db, if_exists='replace')
    return (res['rfe_random_forest'], res['R2'],
            res['randomized_lasso'], res['scores'])
def get_balance():
    accounts = pd.read_sql_table('account', db.engine)
    transactions = pd.read_sql_table('transaction', db.engine, columns=['account', 'amount'])
    scheduled_transactions = pd.read_sql_table('scheduled_transaction', db.engine)
    transactions = transactions.rename(columns={'account': 'name'})
    transactions = transactions.groupby('name', as_index=False).sum()
    accounts['amount'] = accounts['reconciled_balance']
    for name in transactions.name:
        # .loc replaces the long-removed .ix indexer
        accounts.loc[accounts['name'] == name, 'amount'] += \
            transactions.loc[transactions.name == name, 'amount'].iloc[-1]
    # taking scheduled transactions into account
    accounts['end_of_month_amount'] = accounts['amount']
    for idx, operation in scheduled_transactions.iterrows():
        i = 0
        today = datetime.datetime.now()
        last_day_of_month = today + relativedelta(day=1, months=+1, days=-1)
        while operation.next_occurence \
                + relativedelta(**{operation.every_type: i * operation.every_nb}) \
                <= last_day_of_month:
            i += 1
        accounts.loc[accounts['name'] == operation.account, 'end_of_month_amount'] += \
            operation.amount * i
    return accounts
def __init__(self, path=None):
    if path is None:
        path = "sqlite:///" + os.path.join(os.environ["HOME"], "tmp", "keras_logs.db")
    db_path = path.replace("sqlite:///", "")
    try:
        self.logs = pd.read_sql_table("log", path)
        self.runs = (pd.read_sql_table("run", path)
                     .rename(columns={"id": "runid"})
                     .sort_values("runid", ascending=False))
        self.df = self.logs.merge(self.runs)
    except ValueError:
        self.runs = pd.DataFrame({"runid": [], "comment": [], "user": []})
def getAllData(poi_pca, con):
    order_info = pd.read_sql_table('compressed_districts', con=con)
    weather_info = pd.read_sql_table('weather_info', con=con)
    cluster_map = pd.read_sql_table('district_info', con=con)
    traffic_info = pd.read_sql_table('traffic_info', con=con)
    poi_info = pd.read_sql_table('poi_info', con=con)
    fixed_weather = dw.fixWeatherData(weather_info)
    fixed_traffic = dw.fixTrafficData(traffic_info, cluster_map)
    fixed_poi, expected_var = cp.compressPoiData(poi_info.fillna(0), poi_pca)
    return order_info, fixed_poi, fixed_weather, fixed_traffic
def get_table_df(self, table, columns=None):
    """
    :param table: string type, db_name.table_name
    :param columns: list type with string values, like: ['acc_name', 'initial']
    :return: DataFrame type
    """
    # The original branches were swapped and passed `columns` positionally,
    # where read_sql_table expects `schema`; pass it by keyword instead.
    if columns:
        return pd.read_sql_table(table, self.engine, columns=columns)
    else:
        return pd.read_sql_table(table, self.engine)
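# A minimal usage sketch for the accessor above; the wrapper class name `Db`
# and the connection string are assumptions for illustration.
from sqlalchemy import create_engine


class Db:
    def __init__(self, url):
        self.engine = create_engine(url)

    get_table_df = get_table_df  # bind the function above as a method


db = Db('sqlite:///accounts.db')
full = db.get_table_df('account')                               # all columns
partial = db.get_table_df('account', ['acc_name', 'initial'])   # selected columns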
def ShowGS(com1, com2, com3):
    df2 = Codes[Codes['code'].isin([com1])]
    print df2
    if com2 == 'syl30':
        tname = 'b' + com1
        if me.IsTableExist(tname, G_DBengine) == False:
            print 'No table ....%s' % tname
            return
        df = pd.read_sql_table(tname, G_DBengine)
        if df.index.size > 250:
            df1 = df.drop(range(df.index.size - 250))
        else:
            df1 = df  # keep df1 defined for tables shorter than 250 rows
        me.PinghuaDF(df1, md.BI_syl30 + 1, 5)
        plt.title(com1 + ' ' + com2 + ' ' + str(max(df1['date'])))
        plt.fill_between(df1.index, df1['syl30'], 0, where=df1['syl30'] > 0, facecolor='red')
        plt.fill_between(df1.index, df1['syl30'], 0, where=df1['syl30'] <= 0, facecolor='green')
    elif com2 == 'syl250':
        tname = 'b' + com1
        if me.IsTableExist(tname, G_DBengine) == False:
            print 'No table ....%s' % tname
            return
        df = pd.read_sql_table(tname, G_DBengine)
        me.PinghuaDF(df, md.BI_syl250 + 1, 30)
        plt.fill_between(df.index, df['syl250'], 0, where=df['syl250'] > 0, facecolor='red')
        plt.fill_between(df.index, df['syl250'], 0, where=df['syl250'] <= 0, facecolor='green')
        plt.title(com1 + ' ' + com2 + ' ' + str(max(df['date'])))
        View_10X(plt, df, '123')
    elif com2 == 'hb':
        tname = 'f' + com1
        if me.IsTableExist(tname, G_DBengine) == False:
            print 'No table ....%s' % tname
            return
        df = pd.read_sql_table('f' + com1, G_DBengine)
        df[1:df.index.size][['sjsrhb', 'sjlrhb']].plot(kind='bar', color={'red', 'green'})
        df[1:df.index.size]['nhgdqyl'].plot(color='blue', secondary_y=True, linewidth=LW)
        plt.title(com1 + ' ' + com2 + ' ' + str(df.loc[df.index.size - 1, 'year']) +
                  ' ' + str(df.loc[df.index.size - 1, 'season']))
    elif com2 == 'sr':
        tname = 'f' + com1
        if me.IsTableExist(tname, G_DBengine) == False:
            print 'No table ....%s' % tname
            return
        df = pd.read_sql_table('f' + com1, G_DBengine)
        df[1:df.index.size]['sjsr'].plot(kind='bar', color='green')
        df[1:df.index.size]['sjlr'].plot(color='red', secondary_y=True, linewidth=LW)
        plt.title(com1 + ' ' + com2 + ' ' + str(df.loc[df.index.size - 1, 'year']) +
                  ' ' + str(df.loc[df.index.size - 1, 'season']))
    else:
        print '[Error] input error ...'
        return
    plt.show()
    plt.close()
def read_data():
    """Read data from MySQL."""
    # open connection and read tables
    my_db = sa.engine.url.URL(drivername='mysql',
                              database='openpharma_db',
                              query={'read_default_file': '~/.my.cnf'})
    engine = sa.create_engine(name_or_url=my_db)
    df_class = pd.read_sql_table(table_name='classifier_tb', con=engine)
    df_trial = pd.read_sql_table(table_name='trials_tb', con=engine)
    return df_class, df_trial
def fromOpSimDB(cls, dbname, subset='combined'):
    """
    Class Method to instantiate this from an OpSim sqlite database output

    Parameters
    ----------
    dbname : path or sqlite connection string to the OpSim database
    subset : string, one of the allowed subsets, defaults to 'combined'
    """
    allowed_subsets = cls.get_allowed_subsets()
    subset = subset.lower()
    if subset not in allowed_subsets:
        raise NotImplementedError('subset {} not implemented'.format(subset))
    if not dbname.startswith('sqlite'):
        dbname = 'sqlite:///' + dbname
    print(' reading from database {}'.format(dbname))
    engine = create_engine(dbname, echo=False)

    # Read the proposal table to find out which propID corresponds to
    # which proposal.
    proposals = pd.read_sql_table('Proposal', con=engine)
    propDict = cls.get_propIDDict(proposals)

    # Do the actual sql queries or table reads
    if subset in ['_all', 'unique_all']:
        # In this case read everything (ie. table read)
        summary = pd.read_sql_table('Summary', con=engine)
        # _all will be used only to write out other serialized versions
        # of OpSim. Do not drop duplicates, so that different subsets can
        # be constructed from the same hdf file
        if subset == 'unique_all':
            summary.drop_duplicates(subset='obsHistID', inplace=True)
        summary.set_index('obsHistID', inplace=True)
        return cls(propIDDict=propDict, summary=summary, proposalTable=proposals)
    else:
        sql_query = 'SELECT * FROM Summary WHERE PROPID'
        if subset == 'ddf':
            sql_query += ' == {0}'.format(propDict['ddf'])
        if subset == 'wfd':
            sql_query += ' == {0}'.format(propDict['wfd'])
        if subset == 'combined':
            # SQL membership tests use IN (..), not the Python-style in [..]
            sql_query += ' IN ({0}, {1})'.format(propDict['wfd'], propDict['ddf'])
        # Read the summary table
        summary = pd.read_sql_query(sql_query, con=engine)
        summary.drop_duplicates(subset='obsHistID', inplace=True)
        summary.set_index('obsHistID', inplace=True)
        return cls(propIDDict=propDict, summary=summary, proposalTable=proposals)
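# Hedged usage sketch for the constructor above, assuming it is bound as a
# classmethod on a class here called OpSimOutput (class and attribute names
# are assumptions).
opsim = OpSimOutput.fromOpSimDB('enigma_1189.db', subset='ddf')
print(opsim.summary.head())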
def read_data():
    """Read data from MySQL."""
    print "Reading data..."
    # open connection and read tables
    my_db = sa.engine.url.URL(drivername='mysql',
                              database='openpharma_db',
                              query={'read_default_file': '~/.my.cnf'})
    engine = sa.create_engine(name_or_url=my_db)
    df_compare = pd.read_sql_table(table_name='compare_tb', con=engine)
    df_packages = pd.read_sql_table(table_name='packages_tb', con=engine)
    return df_compare, df_packages
def unlock_form(project_name, arm_name, event_descrip, form_name, engine, subject_id=None):
    """
    Unlock a given form by removing records from the locking table.

    :param project_name: str
    :param arm_name: str
    :param event_descrip: str
    :param form_name: str
    :param engine: `sqlalchemy.Engine`
    :param subject_id: str
    :return: bool, True if any locked records were removed
    """
    # get ids needed for unlocking
    project_id = get_project_id(project_name, engine)
    arm_id = get_arm_id(arm_name, project_id, engine)
    event_id = get_event_id(event_descrip, arm_id, engine)
    # get a list of all the locked records and filter for records to remove
    locked_records = pd.read_sql_table('redcap_locking_data', engine)
    locked_forms = locked_records[(locked_records.project_id == project_id) &
                                  (locked_records.event_id == event_id) &
                                  (locked_records.form_name == form_name)]
    if subject_id:
        locked_forms = locked_forms[(locked_forms.record == subject_id)]
    # generate the list of ids to drop and remove from db table
    global locked_list
    locked_list = ', '.join([str(i) for i in locked_forms.ld_id.values.tolist()])
    if locked_list:
        sql = 'DELETE FROM redcap_locking_data ' \
              'WHERE redcap_locking_data.ld_id IN ({0});'.format(locked_list)
        execute(sql, engine)
        return True
    else:
        return False
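# Hedged usage sketch for unlock_form, assuming a live connection to the
# REDCap MySQL schema; project/arm/event/form names are illustrative.
from sqlalchemy import create_engine

engine = create_engine('mysql+pymysql://user:pass@localhost/redcap')
removed = unlock_form('my_project', 'Arm 1', 'Baseline', 'demographics',
                      engine, subject_id='SUBJ-001')
print('records unlocked' if removed else 'nothing was locked')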
def plot_time_mem_picard_val(pre_post_fastqc_readcount_match_df, data_engine, logger):
    time_mem_picard_validatesamfile_df = pd.read_sql_table('time_mem_picard_validatesamfile', data_engine)
    df = pd.merge(pre_post_fastqc_readcount_match_df, time_mem_picard_validatesamfile_df,
                  how='inner', on='uuid')
    df_pre = df
    df_pos = df[df['bam_path'].str.contains('realn')]
    ndf = df[['pre_count', 'wall_clock']]
    # convert_objects() was removed from pandas; coerce to numeric explicitly
    ndf = ndf.apply(pd.to_numeric, errors='coerce')
    ndf['wall_clock'] = ndf['wall_clock'] / 3600
    ndf['pre_count'] = ndf['pre_count'] / 1000000
    ax = ndf.plot(x='pre_count', y='wall_clock', kind='scatter')
    yrange_max = math.ceil(max(ndf['wall_clock']))
    xmod = ndf['pre_count'] / 500
    xmod_ceil = math.ceil(max(xmod))
    xrange_max = xmod_ceil * 500
    ax.set_title('picard ValidateSamFile')
    ax.set_xlim([0, xrange_max])
    ax.set_ylim([0, yrange_max])
    ax.set_xlabel('readcount (millions)')
    ax.set_ylabel('run time (hr)')
    fig = ax.get_figure()
    fig.savefig('picard_validate_readcount_preharmonize.png', dpi=600)
    fig.savefig('picard_validate_readcount_postharmonize.png', dpi=600)
    fig.savefig('picard_validate_filesize_preharmonize.png', dpi=600)
    fig.savefig('picard_validate_filesize_postharmonize.png', dpi=600)
def download_all_stock_history_k_line():
    print "download all stock k-line start"
    try:
        if cm.DB_WAY == "csv":
            df = pd.DataFrame.from_csv(cm.DownloadDir + cm.TABLE_STOCKS_BASIC + ".csv")
            # se = df.loc[int(code)]
            # se = df.ix[code]
            pool = ThreadPool(processes=20)
            pool.map(download_stock_kline, df.index)
            pool.close()
            pool.join()
        elif cm.DB_WAY == "redis":
            codes = r.smembers(cm.INDEX_STOCK_BASIC)
            # codes = r.lrange(cm.INDEX_STOCK_BASIC, 0, -1)
            pool = ThreadPool(processes=20)
            pool.map(download_stock_kline_to_redis, codes)
            pool.close()
            pool.join()
        elif cm.DB_WAY == "sqlite":
            df = pd.read_sql_table(cm.INDEX_STOCK_BASIC, engine)
            codes = df[cm.KEY_CODE].get_values()
            # codes = r.lrange(cm.INDEX_STOCK_BASIC, 0, -1)
            pool = ThreadPool(processes=2)
            pool.map(download_stock_kline_to_sqlite, codes)
            pool.close()
            pool.join()
    except Exception as e:
        print str(e)
    print "download all stock k-line finish"
def break_low(self, date):
    '''
    Select stocks that made a new one-year low.
    :param date: date string for a given day, e.g. '2017-11-11'
    :return:
    '''
    # cmd = 'select * from `{}`'.format(date)
    df = pd.read_sql_table(date, daily_engine, index_col='index')  # NOTE: one of these index columns needs to be dropped
    low_db = get_mysql_conn('db_selection')
    low_cursor = low_db.cursor()
    for i in range(len(df)):
        code = df.loc[i]['code']
        cur_low = df.loc[i]['low']
        mins_date, mins = self.get_lowest(code, '2017', date)
        if not mins_date:
            continue
        if mins and float(cur_low) <= float(mins) and float(cur_low) != 0.0:
            print code,
            print df.loc[i]['name']
            print 'year min {} at {}'.format(mins, mins_date)
            print 'current min', cur_low
            create_cmd = 'create table if not exists break_low' \
                         '(`index` int primary key auto_increment,datetime datetime,code text,name text,' \
                         'low_price float,last_price float, last_price_date datetime);'
            low_cursor.execute(create_cmd)
            insert_cmd = 'insert into break_low (datetime,code,name,low_price,last_price,last_price_date) ' \
                         'values (%s,%s,%s,%s,%s,%s);'
            insert_data = (date, code, df.loc[i]['name'], cur_low, mins, mins_date)
            low_cursor.execute(insert_cmd, insert_data)
    low_db.commit()
def InitializeMonthlyTable():
    disk_engine = create_engine('mysql://*****:*****@quantico.chgivxnnhpn3.us-west-2.rds.amazonaws.com/Quantico')
    df_data = pd.read_sql_table('data', disk_engine)
    allclients = sorted(list(pd.unique(df_data.client.ravel())))
    client_job_status_dict = {}
    for client in allclients:
        client_job_status_dict[client] = {'success': 0, 'failure': 0, 'partial': 0}
    start = dt.datetime(2016, 2, 1, 18, 0, 0)
    days = calendar.monthrange(2016, 2)[1]
    # start = dt.datetime(2016,1,18,16,0,0)
    # days = 4
    global table
    # un_sorted_table = {}
    for day in range(1, days + 1):
        table[start] = copy.deepcopy(client_job_status_dict)
        for client in allclients:
            s = time.mktime(start.timetuple())
            t = start + dt.timedelta(days=1)
            e = time.mktime(t.timetuple())
            if len(df_data.query('started > {0} and started < {1} and client == "{2}" and status == {3}'.format(s, e, client, 0))) == 0:
                table[start][client]['success'] = 0
                table[start][client]['failure'] = 0
                table[start][client]['partial'] = 0
            else:
                table[start][client]['success'] = len(df_data[(df_data['started'] > s) & (df_data['started'] < e) &
                                                              (df_data['client'] == client) & (df_data['status'] == 0)].index)
                table[start][client]['failure'] = len(df_data[(df_data['started'] > s) & (df_data['started'] < e) &
                                                              (df_data['client'] == client) & (df_data['status'] > 1)].index)
                table[start][client]['partial'] = len(df_data[(df_data['started'] > s) & (df_data['started'] < e) &
                                                              (df_data['client'] == client) & (df_data['status'] == 1)].index)
        start += dt.timedelta(days=1)
def load_catalog(file_info, load_log=False):
    """
    Load a catalog file. For now this exclusively uses an alchemy connection
    to a DB but in the future this will have methods for various file types,
    or at the very least an export function. The biggest complication is that
    in addition to the table information there is also metadata (which pandas
    currently does not support) and a log of changes to a catalog.
    """
    cid = file_info['file_settings']['table']
    connect_str = file_info['filepath']
    engine = create_engine(connect_str)  # connects to the db
    Base.metadata.bind = engine  # binds the metadata to the engine
    if cid not in Base.metadata.tables.keys():
        raise astrotoyz.core.AstroToyzError("Catalog not found in database")
    dataframe = pandas.read_sql_table(cid, engine)
    DBSession = sessionmaker(bind=engine)
    session = DBSession()
    meta = session.query(CatalogMeta).filter(CatalogMeta.cid == cid).first()
    if meta is None:
        raise astrotoyz.core.AstroToyzError("Could not find catalog meta data")
    settings = json.loads(meta.settings)
    if load_log:
        log = pandas.read_sql_query(
            "SELECT * FROM log WHERE cid='{0}'".format(cid), engine)
    else:
        log = None
    catalog = Catalog(cid, file_info, name=meta.name, log=log, data=dataframe)
    return catalog
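# Hedged usage sketch: load_catalog expects a file_info dict with at least
# the two keys read above (values here are illustrative); it also relies on
# the module's ORM setup (Base, CatalogMeta, Catalog) being in place.
file_info = {
    'filepath': 'sqlite:///catalogs.db',
    'file_settings': {'table': 'my_catalog'},
}
catalog = load_catalog(file_info, load_log=True)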
def fin_read_hy(hy):
    is_first = True
    for i in range(Codes.index.size):
        code = Codes.loc[i, 'code']
        t_name = 'f' + code
        if me.IsTableExist(t_name, G_DBengine) == False:
            continue
        if is_first:
            fin = pd.read_sql_table(t_name, G_DBengine)
            is_first = False
        else:
            df = pd.read_sql_table(t_name, G_DBengine)
            fin = fin.append(df)
        print '...fin_read_hy:' + hy + '.......[%d of %d]' % (i, Codes.index.size)
    return fin
def fin_com_hy(hycode):
    global Fin
    hy_fin = pd.read_sql_table('f600036', G_DBengine)
    del hy_fin['level_0']
    hy_fin['code'] = hycode
    hy_fin['name'] = hycode
    hy_fin['jzc'] = 0.0
    hy_fin['sjsr'] = 0.0
    hy_fin['sjlr'] = 0.0
    hy_fin['sjsrhb'] = 0.0
    hy_fin['sjlrhb'] = 0.0
    i = 0
    for y in range(2006, 2017):
        for s in range(1, 5):
            df1 = Fin[Fin.year == y]
            df2 = df1[df1.season == s]
            if df2.index.size == 0:
                i = i + 1
                continue
            d_sum = df2.sum()
            if i < hy_fin.index.size:
                hy_fin.iat[i, G_jzc] = d_sum.jzc
                hy_fin.iat[i, G_sjsr] = d_sum.sjsr
                hy_fin.iat[i, G_sjlr] = d_sum.sjlr
            i = i + 1
    for i in range(3, hy_fin.index.size):
        hy_fin.iat[i, G_sjsrhb] = hy_fin.iat[i, G_sjsr] / (hy_fin.iat[i, G_jzc] + hy_fin.iat[i - 1, G_jzc] +
                                                           hy_fin.iat[i - 2, G_jzc] + hy_fin.iat[i - 3, G_jzc]) * 400
        hy_fin.iat[i, G_sjlrhb] = hy_fin.iat[i, G_sjlr] / (hy_fin.iat[i, G_jzc] + hy_fin.iat[i - 1, G_jzc] +
                                                           hy_fin.iat[i - 2, G_jzc] + hy_fin.iat[i - 3, G_jzc]) * 400
    return hy_fin
def uploadFX(rates):
    table = rates[0]
    today = rates[1]
    try:
        df = pd.read_sql_table("fxrates", engine, parse_dates="FXDate")
    except Exception:
        df = pd.read_csv("Common/FX rates.csv", parse_dates=["FXDate"])
    max_date = df["FXDate"].max().date()
    if max_date < last_date:
        df_newdates = update_dates(max_date)
        df = df.append(df_newdates)
        # DataFrame.sort() was removed from pandas; use sort_values()
        df = df.sort_values(["FXDate"], ascending=False)
    df.loc[df.loc[:, "FXDate"] == table[0][0], "Rate"] = table[0][1]
    for i in range(len(table)):
        if i == 0 and today == datetime.date.today():
            df.loc[df.loc[:, "FXDate"] >= table[i][0], "Rate"] = table[i][1]
        else:
            df.loc[df.loc[:, "FXDate"] == table[i][0], "Rate"] = table[i][1]
    df = df.iloc[:, 1:]
    df.to_csv("Common/FX rates.csv", index=False)
def get_mysql_table_records(self, table_name, project_name, arm_name, event_descrip,
                            name_of_form=None, subject_id=None):
    """
    Get a dataframe of forms for a specific event

    :param project_name: str
    :param arm_name: str
    :param event_descrip: str
    :return: `pandas.DataFrame`
    """
    project_id = self.get_mysql_project_id(project_name)
    if not project_id:
        return pd.DataFrame()
    arm_id = self.get_mysql_arm_id(arm_name, project_id)
    event_id = self.get_mysql_event_id(event_descrip, arm_id)
    table_records = pd.read_sql_table(table_name, self.api['redcap_mysql_db'])
    table_forms = table_records[(table_records.project_id == project_id) &
                                (table_records.event_id == event_id)]
    if name_of_form:
        table_forms = table_forms[table_forms.form_name == name_of_form]
    if subject_id:
        table_forms = table_forms[table_forms.record == subject_id]
    return table_forms
def download_all_stock_history_k_line():
    print 'download all stock k-line start'
    try:
        if DB_WAY == 'csv':
            df = pd.DataFrame.from_csv(DownloadDir + INDEX_STOCK_BASIC + '.csv')
            # se = df.loc[int(code)]
            # se = df.ix[code]
            pool = ThreadPool(processes=20)
            pool.map(download_stock_kline_csv, df.index)
            pool.close()
            pool.join()
        elif DB_WAY == 'redis':
            codes = r.smembers(INDEX_STOCK_BASIC)
            # codes = r.lrange(INDEX_STOCK_BASIC, 0, -1)
            pool = ThreadPool(processes=20)
            pool.map(download_stock_kline_to_redis, codes)
            pool.close()
            pool.join()
        elif DB_WAY == 'mysql':
            df = pd.read_sql_table(INDEX_STOCK_BASIC, engine)
            codes = df[KEY_CODE].get_values()
            # codes = r.lrange(INDEX_STOCK_BASIC, 0, -1)
            pool = ThreadPool(processes=2)
            pool.map(download_stock_kline_to_sql, codes)
            pool.close()
            pool.join()
    except Exception as e:
        print str(e)
    print 'download all stock k-line finish'
def _xiayingxian(self, row, ratio):
    '''
    Lower-shadow (xiayingxian) candlestick logic.
    ratio: length ratio of the lower shadow; the larger the number, the longer the shadow
    row: a pandas Series
    '''
    open_p = float(row['open'])
    closed = float(row['close'])
    low = float(row['low'])
    high = float(row['high'])
    p = min(closed, open_p)
    try:
        diff = (p - low) * 1.00 / (high - low)
        diff = round(diff, 3)
    except ZeroDivisionError:
        diff = 0
    if diff > ratio:
        xiayinxian_engine = get_engine('db_selection')
        date, code, name, ocupy_ration, standards = row['datetime'], row['code'], row['name'], diff, ratio
        df = pd.DataFrame({'datetime': [date], 'code': [code], 'name': [name],
                           'ocupy_ration': [ocupy_ration], 'standards': [standards]})
        try:
            df1 = pd.read_sql_table('xiayingxian', xiayinxian_engine, index_col='index')
            df = pd.concat([df1, df])
        except Exception as e:
            print(e)
            # return None
        df = df.reset_index(drop=True)
        df.to_sql('xiayingxian', xiayinxian_engine, if_exists='replace')
    return row
def ShowMoney(com1, com2, com3):
    if True:
        tname = 'money'
        if me.IsTableExist(tname, G_DBengine) == False:
            print 'No table ....%s' % tname
            return
        df = pd.read_sql_table(tname, G_DBengine)
        df = df.sort_values('num', ascending=False)
        me.PinghuaDF(df, 18, 5)
        me.PinghuaDF(df, 19, 5)
        me.PinghuaDF(df, 20, 5)
        df[['fm2', 'fm1']].plot(linewidth=LW)
        df['m1dm2'].plot(color='red', secondary_y=True, linewidth=LW)
        plt.title(com1 + ' ' + com2 + ' ' + str(max(df['month'])))
        View_10X(plt, df, '321')
        # plt.xticks(range(0, df.index.size, df.index.size/10))
        # ax = plt.gca()
        # size = df.index.size
        # d_size = size / 9
        # ax.set_xticklabels([df.iat[size-1,1], df.iat[size-1-d_size,1], df.iat[size-1-d_size*2,1],
        #                     df.iat[size-1-d_size*3,1], df.iat[size-1-d_size*4,1], df.iat[size-1-d_size*5,1],
        #                     df.iat[size-1-d_size*6,1], df.iat[size-1-d_size*7,1], df.iat[size-1-d_size*8,1],
        #                     df.iat[0,1]])
    plt.show()
    plt.close()
def _get_report(only_A, table, columns=None, col='截止日期'):
    """
    Fetch financial report data,
    using the announcement dates from the income statement.
    """
    engine = get_engine('dataBrowse')
    df = pd.read_sql_table(table, engine, columns=columns)
    if only_A:
        df = df[~df.证券代码.str.startswith('2')]
        df = df[~df.证券代码.str.startswith('9')]
    # df.drop(to_drop, axis=1, inplace=True, errors='ignore')
    asof_dates = _financial_report_announcement_date()
    keys = ['证券代码', '截止日期']
    if col != '截止日期':
        # handle industry rankings
        df['报告年度'] = df[col]
        # rename the original column to '截止日期'
        df.rename(columns={col: '截止日期'}, inplace=True)
    df = df.join(asof_dates.set_index(keys), on=keys)
    df.rename(columns={"证券代码": "sid",
                       "截止日期": "asof_date",
                       "公告日期": "timestamp"}, inplace=True)
    # fix the as-of dates
    _fill_ad_and_ts(df)
    # normalize column names
    df.columns = df.columns.map(_normalized_col_name)
    df.sort_values(['sid', 'asof_date'], inplace=True)
    return df
def index():
    holdings_list = []
    # try:
    tr_by_date_df = pd.read_sql_table('transaction_' + str(current_user.get_id()),
                                      db.engine, index_col='date')
    symbols = pf.get_symbols(tr_by_date_df)
    holdings_ts_list = pf.get_holdings(tr_by_date_df, symbols)
    holdings_df = pf.get_current_holdings(holdings_ts_list)
    cost_basis = pf.get_costbasis(tr_by_date_df)
    # add cost basis and realized gains
    holdings_df = holdings_df.join(cost_basis['basis'])
    holdings_df = holdings_df.join(cost_basis['realized'])
    # print(holdings_df)
    # turn into a list for datatables
    holdings_list = pf.df_to_obj_list(holdings_df, 'ticker')
    # print(holdings_list)
    # except:
    #     holdings_list = []
    return render_template('portfolio/portfolio.html', holdings=holdings_list)
def correlation_analysis(dataset):
    # read
    tb = pd.read_sql_table(dataset, db, index_col='ID')
    X = tb.iloc[:, :-1]
    y = tb.iloc[:, -1]
    # compute correlation
    X.drop(X.columns[X.var() < 1e-5], axis=1, inplace=True)
    # .iloc replaces the removed .ix indexer
    r = np.array([pearsonr(X.iloc[:, i], y) for i in range(X.shape[1])])
    rank = np.abs(r[:, 0]).argsort()[::-1]
    # plot top ones
    N = 9
    top = rank[:N]
    traces = []
    names = []
    for (i, c) in enumerate(X.columns[top]):
        names.append('{}<br>(r={:0.2g} p={:0.2g})'.format(c, r[top[i], 0], r[top[i], 1]))
        traces.append(go.Scatter(x=X[c].values.tolist(), y=y.values.tolist(),
                                 mode='markers', showlegend=False))
    fig = tools.make_subplots(rows=3, cols=3, subplot_titles=names,
                              vertical_spacing=0.1, horizontal_spacing=0.1)
    for (i, p) in enumerate(traces):
        fig.append_trace(p, i // 3 + 1, i % 3 + 1)
    fig['layout'].update(height=700, width=1100)
    fig['layout'].update(margin=go.Margin(l=50, r=50, b=50, t=50, pad=0))
    for a in fig.layout.annotations:
        a['font'].update(size=14)
    return (X.columns[rank], utils.plot_to_div(fig))
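# The ranking idea above (sort features by |Pearson r| against the target)
# can be sanity-checked on synthetic data; a minimal sketch:
import numpy as np
from scipy.stats import pearsonr

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))
y = 3 * X[:, 2] + rng.normal(scale=0.1, size=200)  # feature 2 drives y

r = np.array([pearsonr(X[:, i], y) for i in range(X.shape[1])])
rank = np.abs(r[:, 0]).argsort()[::-1]
print(rank[0])  # expected: 2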
def get_params(dataset):
    tb = pd.read_sql_table(dataset, db, index_col='ID')
    X = tb.iloc[:, :-1]
    # remove low-variance columns
    low_var_cols = X.columns[X.var() < 1e-5]
    cols = [c for c in X.columns if c not in low_var_cols]
    return (cols, low_var_cols)
def readFromDB(table, dbConnect):
    engine = create_engine('mysql+mysqldb://' + mysql_user + ':' + mysql_pass +
                           '@' + mysql_host + '/' + mysql_db)
    df = pd.read_sql_table(table, con=engine)
    # clean up SUBJ column
    # df.SUBJ = df.SUBJ.str.strip()
    return df
def crunch_data():
    engine = sq.create_engine("sqlite:///snapshots.sqlite")
    df = pd.read_sql_table("snapshots", engine)
    df = df.set_index(['datetime'])
    today = datetime.date.today()
    from_date = today - datetime.timedelta(weeks=1)
    # to_date = today - datetime.timedelta(weeks=1)
    to_date = today
    dframes = []
    for source, df in df.groupby(['source']):
        ts = df.loc[:, 'percent_women']
        ts = ts[ts > 0.0]
        # resample(how=...) was removed from pandas; call .median() directly
        rs = ts.resample("W").median().to_frame('median')
        rs['week'] = rs.index.weekofyear
        rs = rs[from_date:to_date]
        rs.columns = [source, 'week']
        year = rs.index.year[0]
        week = rs.index.weekofyear[0]
        rs = rs.set_index('week')
        dframes.append(rs)
    df = pd.concat(dframes, axis=1, join='inner')
    return df.T, week, year
def test_writeSimlib():
    pkgDir = os.path.split(oss.__file__)[0]
    dbname = os.path.join(pkgDir, 'example_data', 'enigma_1189_micro.db')
    template_simlib = os.path.join(pkgDir, 'example_data',
                                   'Enigma_1189_micro_main.simlib')
    engineFile = 'sqlite:///' + dbname
    engine = create_engine(engineFile)
    # read the database into a `pd.DataFrame`
    Summary = pd.read_sql_table('Summary', engine)
    EnigmaMain = Summary.query('propID == [364]')
    EnigmaMainSummary = so.SummaryOpsim(EnigmaMain, calculateSNANASimlibs=True,
                                        user='******', host='time')
    simlibfilename = './Enigma_1189_micro_main.simlib'
    EnigmaMainSummary.writeSimlib(simlibfilename)
    with open(template_simlib) as f:
        template_data = f.read()
    with open(simlibfilename) as f:
        new_data = f.read()
    assert new_data == template_data
    if new_data == template_data:
        os.remove(simlibfilename)
def data_package(pkg_tables, pkg_skeleton,
                 out_dir=os.path.join(pudl.settings.PUDL_DIR, "results", "data_pkgs"),
                 testing=False):
    """
    Create a data package of requested tables and their dependencies.

    See Frictionless Data for the tabular data package specification:
    http://frictionlessdata.io/specs/tabular-data-package/

    Args:
        pkg_skeleton (dict): A python dictionary containing several top
            level elements of the data package JSON descriptor specific to
            the data package, including:
            * name: pudl-<datasource> e.g. pudl-eia923, pudl-ferc1
            * title: One line human readable description.
            * description: A paragraph long description.
            * keywords: For search purposes.
        pkg_tables (iterable): The names of database tables to include.
            Each one will be converted into a tabular data resource.
            Dependent tables will also be added to the data package.
        out_dir (path-like): The location of the packaging directory. The
            data package will be created in a subdirectory in this
            directory, according to the name of the package.

    Returns:
        data_pkg (Package): an object representing the data package,
            as defined by the datapackage library.
    """
    # A few paths we are going to need repeatedly:
    # out_dir is the packaging directory -- the place where packages end up
    # pkg_dir is the top level directory of this package:
    pkg_dir = os.path.abspath(os.path.join(out_dir, pkg_skeleton["name"]))
    # data_dir is the data directory within the package directory:
    data_dir = os.path.join(pkg_dir, "data")
    # pkg_json is the datapackage.json that we ultimately output:
    pkg_json = os.path.join(pkg_dir, "datapackage.json")

    # Given the list of target tables, find all dependent tables.
    all_tables = pudl.helpers.get_dependent_tables_from_list(
        pkg_tables, testing=testing)

    # Extract the target tables and save them as CSV files.
    # We have to do this before creating the data resources
    # because the files are necessary in order to calculate
    # the file sizes and hashes.
    for t in all_tables:
        csv_out = os.path.join(data_dir, f"{t}.csv")
        os.makedirs(os.path.dirname(csv_out), exist_ok=True)
        df = pd.read_sql_table(t, pudl.init.connect_db(testing=testing))
        if t in pudl.constants.need_fix_inting:
            df = pudl.helpers.fix_int_na(df, pudl.constants.need_fix_inting[t])
        logger.info(f"Exporting {t} to {csv_out}")
        df.to_csv(csv_out, index=False)

    # Create a tabular data resource for each of the tables.
    resources = []
    for t in all_tables:
        resources.append(
            pudl.output.export.get_tabular_data_resource(t, pkg_dir=pkg_dir))

    data_sources = pudl.helpers.data_sources_from_tables(
        all_tables, testing=testing)
    contributors = set()
    for src in data_sources:
        for c in pudl.constants.contributors_by_source[src]:
            contributors.add(c)

    pkg_descriptor = {
        "name": pkg_skeleton["name"],
        "profile": "tabular-data-package",
        "title": pkg_skeleton["title"],
        "description": pkg_skeleton["description"],
        "keywords": pkg_skeleton["keywords"],
        "homepage": "https://catalyst.coop/pudl/",
        "created": (datetime.datetime.utcnow().
                    replace(microsecond=0).isoformat() + 'Z'),
        "contributors": [pudl.constants.contributors[c] for c in contributors],
        "sources": [pudl.constants.data_sources[src] for src in data_sources],
        "licenses": [pudl.constants.licenses["cc-by-4.0"]],
        "resources": resources,
    }

    # Use that descriptor to instantiate a Package object
    data_pkg = datapackage.Package(pkg_descriptor)

    # Validate the data package descriptor before we go on
    if not data_pkg.valid:
        logger.warning(f"""
            Invalid tabular data package: {data_pkg.descriptor["name"]}
            Errors: {data_pkg.errors}""")

    data_pkg.save(pkg_json)

    # Validate the data within the package using goodtables:
    report = goodtables.validate(pkg_json, row_limit=100_000)
    if not report['valid']:
        logger.warning("Data package data validation failed.")

    return data_pkg
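# Hedged usage sketch: a minimal pkg_skeleton matching the keys the docstring
# above lists; the values and the table name are illustrative.
pkg_skeleton = {
    "name": "pudl-eia923",
    "title": "EIA Form 923 data, via PUDL",
    "description": "Generation, fuel consumption, and fuel cost data "
                   "reported on EIA Form 923, cleaned and normalized.",
    "keywords": ["eia923", "electricity", "fuel"],
}
pkg = data_package(["generation_eia923"], pkg_skeleton)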
def load_data(database_filepath):
    engine = create_engine('sqlite:///' + database_filepath)
    df = pd.read_sql_table('EAdescription', engine)
    return df
def read_sql_table(table, uri, index_col, divisions=None, npartitions=None,
                   limits=None, columns=None, bytes_per_chunk="256 MiB",
                   head_rows=5, schema=None, meta=None, engine_kwargs=None,
                   **kwargs):
    """
    Create dataframe from an SQL table.

    If neither divisions or npartitions is given, the memory footprint of the
    first few rows will be determined, and partitions of size ~256MB will
    be used.

    Parameters
    ----------
    table : string or sqlalchemy expression
        Select columns from here.
    uri : string
        Full sqlalchemy URI for the database connection
    index_col : string
        Column which becomes the index, and defines the partitioning. Should
        be an indexed column in the SQL server, and any orderable type. If the
        type is number or time, then partition boundaries can be inferred from
        npartitions or bytes_per_chunk; otherwise must supply explicit
        ``divisions=``.
        ``index_col`` could be a function to return a value, e.g.,
        ``sql.func.abs(sql.column('value')).label('abs(value)')``.
        ``index_col=sql.func.abs(sql.column("value")).label("abs(value)")``,
        or ``index_col=cast(sql.column("id"),types.BigInteger).label("id")``
        to convert the textfield ``id`` to ``BigInteger``.

        Note ``sql``, ``cast``, ``types`` methods come from the ``sqlalchemy``
        module.

        Labeling columns created by functions or arithmetic operations is
        required.
    divisions: sequence
        Values of the index column to split the table by. If given, this will
        override npartitions and bytes_per_chunk. The divisions are the value
        boundaries of the index column used to define the partitions. For
        example, ``divisions=list('acegikmoqsuwz')`` could be used to
        partition a string column lexicographically into 12 partitions, with
        the implicit assumption that each partition contains similar numbers
        of records.
    npartitions : int
        Number of partitions, if divisions is not given. Will split the values
        of the index column linearly between limits, if given, or the column
        max/min. The index column must be numeric or time for this to work
    limits: 2-tuple or None
        Manually give upper and lower range of values for use with
        npartitions; if None, first fetches max/min from the DB. Upper limit,
        if given, is inclusive.
    columns : list of strings or None
        Which columns to select; if None, gets all; can include sqlalchemy
        functions, e.g.,
        ``sql.func.abs(sql.column('value')).label('abs(value)')``.
        Labeling columns created by functions or arithmetic operations is
        recommended.
    bytes_per_chunk : str, int
        If both divisions and npartitions is None, this is the target size of
        each partition, in bytes
    head_rows : int
        How many rows to load for inferring the data-types, unless passing
        meta
    meta : empty DataFrame or None
        If provided, do not attempt to infer dtypes, but use these, coercing
        all chunks on load
    schema : str or None
        If using a table name, pass this to sqlalchemy to select which DB
        schema to use within the URI connection
    engine_kwargs : dict or None
        Specific db engine parameters for sqlalchemy
    kwargs : dict
        Additional parameters to pass to `pd.read_sql()`

    Returns
    -------
    dask.dataframe

    Examples
    --------
    >>> df = dd.read_sql_table('accounts', 'sqlite:///path/to/bank.db',
    ...                        npartitions=10, index_col='id')  # doctest: +SKIP
    """
    import sqlalchemy as sa
    from sqlalchemy import sql
    from sqlalchemy.sql import elements

    if index_col is None:
        raise ValueError("Must specify index column to partition on")

    engine_kwargs = {} if engine_kwargs is None else engine_kwargs
    engine = sa.create_engine(uri, **engine_kwargs)
    m = sa.MetaData()
    if isinstance(table, str):
        table = sa.Table(table, m, autoload=True, autoload_with=engine,
                         schema=schema)

    index = table.columns[index_col] if isinstance(index_col, str) else index_col
    if not isinstance(index_col, (str, elements.Label)):
        raise ValueError(
            "Use label when passing an SQLAlchemy instance as the index (%s)"
            % index)
    if divisions and npartitions:
        raise TypeError("Must supply either divisions or npartitions, not both")

    columns = ([(table.columns[c] if isinstance(c, str) else c) for c in columns]
               if columns else list(table.columns))
    if index_col not in columns:
        columns.append(table.columns[index_col]
                       if isinstance(index_col, str) else index_col)

    if isinstance(index_col, str):
        kwargs["index_col"] = index_col
    else:
        # function names get pandas auto-named
        kwargs["index_col"] = index_col.name

    if meta is None:
        # derive metadata from first few rows
        q = sql.select(columns).limit(head_rows).select_from(table)
        head = pd.read_sql(q, engine, **kwargs)

        if head.empty:
            # no results at all
            name = table.name
            schema = table.schema
            head = pd.read_sql_table(name, uri, schema=schema,
                                     index_col=index_col)
            return from_pandas(head, npartitions=1)

        bytes_per_row = (head.memory_usage(deep=True, index=True)).sum() / head_rows
        meta = head.iloc[:0]
    else:
        if divisions is None and npartitions is None:
            raise ValueError(
                "Must provide divisions or npartitions when using explicit meta.")

    if divisions is None:
        if limits is None:
            # calculate max and min for given index
            q = sql.select([sql.func.max(index), sql.func.min(index)]).select_from(table)
            minmax = pd.read_sql(q, engine)
            maxi, mini = minmax.iloc[0]
            dtype = minmax.dtypes["max_1"]
        else:
            mini, maxi = limits
            dtype = pd.Series(limits).dtype

        if npartitions is None:
            q = sql.select([sql.func.count(index)]).select_from(table)
            count = pd.read_sql(q, engine)["count_1"][0]
            npartitions = (int(round(count * bytes_per_row /
                                     dask.utils.parse_bytes(bytes_per_chunk)))
                           or 1)
        if dtype.kind == "M":
            divisions = pd.date_range(
                start=mini, end=maxi,
                freq="%iS" % ((maxi - mini).total_seconds() / npartitions),
            ).tolist()
            divisions[0] = mini
            divisions[-1] = maxi
        elif dtype.kind in ["i", "u", "f"]:
            divisions = np.linspace(mini, maxi, npartitions + 1).tolist()
        else:
            raise TypeError(
                'Provided index column is of type "{}". If divisions is not '
                "provided the index column type must be numeric or "
                "datetime.".format(dtype))

    parts = []
    lowers, uppers = divisions[:-1], divisions[1:]
    for i, (lower, upper) in enumerate(zip(lowers, uppers)):
        cond = index <= upper if i == len(lowers) - 1 else index < upper
        q = sql.select(columns).where(sql.and_(index >= lower, cond)).select_from(table)
        parts.append(
            delayed(_read_sql_chunk)(q, uri, meta,
                                     engine_kwargs=engine_kwargs, **kwargs))

    engine.dispose()

    return from_delayed(parts, meta, divisions=divisions)
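# Short usage sketch for the function above, mirroring its docstring example;
# the table and column names are illustrative.
import dask.dataframe as dd

# Partition on a numeric primary key, letting dask infer ~256 MiB chunks.
df = dd.read_sql_table('accounts', 'sqlite:///bank.db', index_col='id')

# Or pin the partition boundaries explicitly for a string index column,
# since string types cannot be split automatically.
df = dd.read_sql_table('accounts', 'sqlite:///bank.db', index_col='name',
                       divisions=list('amz'))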
app = Flask(__name__)


def tokenize(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens


# load data
engine = create_engine('sqlite:///../data/DisasterResponse.db')
df = pd.read_sql_table('Cleaned_Messages', engine)

# load model
model = joblib.load("../models/classifier.pkl")


# index webpage displays cool visuals and receives user input text for model
@app.route('/')
@app.route('/index')
def index():
    # extract data needed for visuals
    # TODO: Below is an example - modify to extract data for your own visuals
    genre_counts = df.groupby('genre').count()['message']
    genre_names = list(genre_counts.index)
import datetime
import sqlalchemy
import pandas as pd
import sqlite3
import numpy as np

PATH = 'my_file'

# Reads in databases from tasks 1 and 2
engine = sqlalchemy.create_engine('sqlite:///' + PATH)
all_data = pd.read_sql_table("TempAndCO2Log", engine)
all_data.to_csv("tester.csv")
def tokenize(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens


# load data
engine = create_engine('sqlite:///../data/DisasterResponse.db')
df = pd.read_sql_table('messages_disaster', engine)

# load model
model = joblib.load("../models/classifier.pkl")


# index webpage displays cool visuals and receives user input text for model
@app.route('/')
@app.route('/index')
def index():
    # extract data needed for visuals
    # Using the genres as provided, with a Top 10 added in
    genre_counts = df.groupby('genre').count()['message']
    genre_names = list(genre_counts.index)
    # remove stop words
    stopwords_ = stopwords.words("english")
    words = [word for word in words if word not in stopwords_]

    # extract root form of words
    words = [WordNetLemmatizer().lemmatize(word, pos='v') for word in words]

    return words


# load data
engine = create_engine('sqlite:///../data/DisasterResponse.db')
df = pd.read_sql_table('DisasterMessages', engine)

# load model
model = joblib.load("../models/classifier.pkl")


# index webpage displays cool visuals and receives user input text for model
@app.route('/')
@app.route('/index')
def index():
    # extract data needed for visuals
    # TODO: Below is an example - modify to extract data for your own visuals
    genre_counts = df.groupby('genre').count()['message']
    genre_names = list(genre_counts.index)
def tokenize(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens


# load data from DisasterResponse
engine = create_engine('sqlite:////home/workspace/models/DisasterResponse.db')
# engine = create_engine('sqlite:///.workspace/models/DisasterResponse.db')
df = pd.read_sql_table('DisasterResponse', engine)

# load model
model = joblib.load("/home/workspace/models/classifier.pickle")
# model = joblib.load("./models/classifier.pickle")


# index webpage displays cool visuals and receives user input text for model
@app.route('/')
@app.route('/index')
def index():
    # extract data needed for visuals
    # TODO: Below is an example - modify to extract data for your own visuals
    genre_counts = df.groupby('genre').count()['message']
    genre_names = list(genre_counts.index)
def index(request):
    dests = Destination.objects.all()
    if request.user.is_authenticated:
        global df
        engine = create_engine(
            'postgresql+psycopg2://postgres:postgres@localhost:5432/telusko')
        df_d = pd.read_sql_table(
            "travello_destination", con=engine, schema='public',
            coerce_float=True,
            columns=['name', 'img', 'desc', 'state', 'city', 'typeofplace'])
        df = pd.DataFrame(df_d)
        geocoder = OpenCageGeocode('ea7fd5e689b149c38ef13cbed352bff5')
        list_lat = []
        list_long = []
        for index, row in df.iterrows():
            name = get_name_from_index(index, df)
            state = get_state_from_index(index, df)
            city = get_city_from_index(index, df)
            query = str(name) + ',' + str(city) + ',' + str(state)
            results = geocoder.geocode(query)
            if len(results) != 0:
                lat = results[0]['geometry']['lat']
                longi = results[0]['geometry']['lng']
            else:
                # fall back to NaN so a failed lookup does not reuse the
                # previous row's coordinates (or raise NameError on row 0)
                lat = longi = np.nan
                print("results is empty for", index, name, state)
            list_lat.append(lat)
            list_long.append(longi)
        df['lat'] = list_lat
        df['lon'] = list_long
        features = ['desc', 'state', 'typeofplace']
        for feature in features:
            df[feature] = df[feature].fillna('')
        df['combined_features'] = df.apply(combine_features, axis=1)
        cv = CountVectorizer()
        count_matrix = cv.fit_transform(df['combined_features'])
        cosine_sim = cosine_similarity(count_matrix)
        custom = CustomPreferences.objects.all()
        # initialize so that these are defined even when no preferences match
        user_prefer = []
        rows_data = []
        final_data = []
        for c in custom:
            if str(c.user) == str(request.user):
                user_prefer = c.preferences
                user_prefer = user_prefer.split(",")
                for up in user_prefer:
                    place_index = get_index_from_title(up, df)
                    similar_places = list(enumerate(cosine_sim[place_index]))
                    sorted_similar_places = sorted(similar_places,
                                                   key=lambda x: x[1],
                                                   reverse=True)
                    i = 0
                    for place in sorted_similar_places:
                        row_data = get_title_from_index(place[0], df)
                        rows_data.append(row_data)
                        i = i + 1
                        if i > 3:
                            break
                for dest in dests:
                    for lists in rows_data:
                        if dest.name in lists:
                            result = TextBlob(dest.desc)
                            polar = result.sentiment.polarity
                            if polar > 0.0:
                                final_data.append(dest)
    else:
        user_prefer = []
        final_data = []
    return render(request, "index.html", {
        'dests': dests,
        'recommendations': final_data
    })
    # Initialize lemmatizer
    lemmatizer = WordNetLemmatizer()

    # Lowercase, eliminate blank spaces and find the root form of the words
    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok, pos='v').lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens


# load data
try:
    engine = create_engine('sqlite:///../data/DisasterResponse.db')
    df = pd.read_sql_table('messages_categories', con=engine)
except Exception:
    print('Loading data from the database failed; try running this from the app folder')

# load model
model = joblib.load("../models/classifier_model.pkl")


# Function for first plot
def first_plot(df):
    """Create first plot: TOP 10 categories"""
    # Define counts
    categories = df.drop(['id', 'message', 'original', 'genre'],
                         axis=1).sum().sort_values(ascending=False)
from wtforms import TextField, Form
import pandas as pd
from sqlalchemy import create_engine
from global_parameters import *

# assign values from global_parameters.py to local variables
# number_movies_returned - how many of the most similar movies to output when requested for each movie
number_movies_returned = global_number_movies_returned
# database_filepath - name of the sqlalchemy database file where recommendations are stored
database_filepath = global_database_filepath

app = Flask(__name__)

# extract movies data
engine = create_engine('sqlite:///' + database_filepath)
movies_data = pd.read_sql_table('Closest_movies', engine)
engine.dispose()

# get the movie titles
movie_titles = list(movies_data['movie_title'])


# SearchForm class will allow us to have an autocomplete feature
class SearchForm(Form):
    movie_autocomplete = TextField('Movie name', id='movie_autocomplete')


@app.route('/autocomplete', methods=['GET', 'POST'])
def autocomplete():
    '''
# -*- coding: utf-8 -*-
###############################################################################
#######################           Main code            #######################
###############################################################################

# Code 4-12
from sqlalchemy import create_engine
import pandas as pd

## Create the database connection
engine = create_engine('mysql+pymysql://root:[email protected]:'
                       '3306/testdb?charset=utf8')
detail = pd.read_sql_table('meal_order_detail1', con=engine)
print('Index of the order detail table:', detail.index)
print('All values of the order detail table:', '\n', detail.values)
print('Column names of the order detail table:', '\n', detail.columns)
print('Data types of the order detail table:', '\n', detail.dtypes)

# Code 4-13
## Check the number of elements in the DataFrame
print('Number of elements in the order detail table:', detail.size)
print('Number of dimensions of the order detail table:', detail.ndim)  ## dimensions
print('Shape of the order detail table:', detail.shape)  ## shape

# Code 4-14
print('Shape of the order detail table before transposing:', detail.shape)
print('Shape of the order detail table after transposing:', detail.T.shape)
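## Companion sketch (assumes the same engine as above): read_sql_query can
## load only a subset of rows instead of the whole table.
subset = pd.read_sql_query(
    'SELECT * FROM meal_order_detail1 LIMIT 10', con=engine)
print('Shape of the 10-row subset:', subset.shape)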
                    help='obsHistID to generate InstanceCatalog for')
parser.add_argument('--sne_truth_cat', type=str,
                    help='path to lensed SNe truth catalog')
parser.add_argument('--output_dir', type=str,
                    help='output directory for catalog and sed folder')
parser.add_argument('--cat_file_name', type=str,
                    help='filename of instance catalog written')
parser.add_argument('--sed_folder', type=str,
                    help='directory to put SNe SEDs. Will appear in output_dir.')

args = parser.parse_args()

obs_gen = ObservationMetaDataGenerator(database=args.obs_db, driver='sqlite')
sne_truth_db = create_engine('sqlite:///%s' % args.sne_truth_cat, echo=False)
sne_truth_cat = pd.read_sql_table('lensed_sne', sne_truth_db)
lensed_sne_ic = lensedSneCat(sne_truth_cat, args.output_dir,
                             args.cat_file_name, args.sed_folder)
obs_md = get_obs_md(obs_gen, args.obs_id, 2, dither=True)
print(obs_md.mjd.TAI)
for obs_time in np.arange(obs_md.mjd.TAI, obs_md.mjd.TAI + 35.1, 0.25):
    obs_filter = obs_md.bandpass
    print('Writing Instance Catalog for Visit: %i at MJD: %f in Bandpass: %s'
          % (args.obs_id, obs_time, obs_filter))
    add_to_cat_idx, sne_magnorms, sne_sed_names = lensed_sne_ic.calc_sne_mags(obs_time, obs_filter)
    lensed_sne_ic.output_instance_catalog(add_to_cat_idx, sne_magnorms,
                                          sne_sed_names, obs_md,
                                          str('test_cat_%.4f' % obs_time))
def tokenize(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens


# load data
engine = create_engine('sqlite:///../data/DisasterResponse.db')
df = pd.read_sql_table('df', engine)

# load model
model = joblib.load("../models/classifier.pkl")


# index webpage displays cool visuals and receives user input text for model
@app.route('/')
@app.route('/index')
def index():
    # extract data needed for visuals
    # TODO: Below is an example - modify to extract data for your own visuals
    genre_counts = df.groupby('genre').count()['message']
    genre_names = list(genre_counts.index)
def application(environ, start_response):
    session = get_database_session(engine)
    json_c_type = "application/json"
    paths = [p for p in environ['PATH_INFO'].split('/') if p != '']
    headers = []
    origin = environ.get("HTTP_ORIGIN")
    method = environ['REQUEST_METHOD']
    response_factory = ResponseFactory()

    # /
    if len(paths) == 0:
        payload = {'message': "Server works!"}
        return response_factory \
            .create(200) \
            .get_response(payload, start_response)
    else:
        # /players/
        if paths[0] == 'players':
            # GET: /players/
            if method == 'GET':
                # GET: /players/
                if len(paths) == 1:
                    try:
                        # query extract
                        query = parse_qs(environ['QUERY_STRING'])
                        name = query.get('name', [''])[0]
                        club = query.get('club', [''])[0]
                        nationality = query.get('nationality', [''])[0]
                        limit = int(query.get('limit', [10])[0])
                        skip = int(query.get('skip', [0])[0])

                        rows_all = session.query(Player)
                        if name != '':
                            rows_all = rows_all.filter(Player.name.ilike(f'%{name}%'))
                        if club != '':
                            rows_all = rows_all.filter(Player.club.ilike(f'%{club}%'))
                        if nationality != '':
                            rows_all = rows_all.filter(Player.nationality.ilike(f'%{nationality}%'))
                        rows_all = rows_all.order_by(Player.overall.desc(), Player.value.desc())
                        rows = rows_all \
                            .offset(skip) \
                            .limit(limit)

                        # result set
                        records = [dict(id=r.id, name=r.name, position=r.position,
                                        nationality=r.nationality, flag=r.flag,
                                        club=r.club, age=r.age, photo=r.photo,
                                        value=r.value, overall=r.overall)
                                   for r in rows]
                        payload = {
                            'count': rows_all.count(),
                            'results': records
                        }
                        return response_factory \
                            .create(200, origin, method) \
                            .get_response(payload, start_response)
                    except Exception as e:
                        payload = {'message': f"Oops! Something went wrong! => {e}"}
                        return response_factory \
                            .create(500) \
                            .get_response(payload, start_response)
                # GET: /players/...
                else:
                    payload = {'message': f"You are not allowed to make request to `{environ['PATH_INFO']}`!"}
                    return response_factory \
                        .create(403) \
                        .get_response(payload, start_response)
            # POST: /players/
            elif method == 'POST':
                # POST: /players/team/
                if len(paths) == 2 and paths[1] == 'team':
                    length = int(environ.get('CONTENT_LENGTH', '0'))
                    request_body = environ['wsgi.input'].read(length)
                    body = json.loads(request_body)

                    # verify
                    formation = body.get('formation', None)
                    budget = int(body.get('budget', 0))
                    include_free_agents = body.get('include_free_agents', True)
                    if formation and all([p in POSITIONS for p in formation]) and budget >= 1 * 10**6:
                        try:
                            formation = [f.lower() for f in formation]
                            temp = pd.read_sql_table('players', con=engine)[['id', 'position', 'value', 'overall']] \
                                .dropna(subset=['position']) \
                                .query(f'position in {formation}')
                            temp = temp if include_free_agents else temp.query('value > 0')
                            prob, ids = compute_best_lineup(temp, formation, budget)

                            rows = session.query(Player).filter(Player.id.in_(ids))
                            records = {r.position: dict(id=r.id, name=r.name, position=r.position,
                                                        nationality=r.nationality, flag=r.flag,
                                                        club=r.club, age=r.age, photo=r.photo,
                                                        value=r.value, overall=r.overall)
                                       for r in rows}
                            payload = {
                                'total_overall': sum([r.overall for r in rows]),
                                'total_value': sum([r.value for r in rows]),
                                'formation': formation,
                                'results': records
                            }
                            return response_factory \
                                .create(200, origin, method) \
                                .get_response(payload, start_response)
                        except Exception as e:
                            payload = {'message': f"Oops! Something went wrong! => {e}"}
                            return response_factory \
                                .create(500) \
                                .get_response(payload, start_response)
                    else:
                        payload = {
                            'message': "Please select 11 unique and valid positions and set a budget greater than € 1,000,000!",
                            'formation': formation,
                            'budget': budget
                        }
                        return response_factory \
                            .create(400) \
                            .get_response(payload, start_response)
                # POST: /players/...
                else:
                    payload = {'message': f"You are not allowed to make request to `{environ['PATH_INFO']}`!"}
                    return response_factory \
                        .create(403) \
                        .get_response(payload, start_response)
            # HEAD/OPTIONS: /players/
            elif method == 'HEAD' or method == 'OPTIONS':
                return response_factory \
                    .create(200, origin, method) \
                    .get_response(None, start_response)
            # PUT/DELETE/PATCH: /players/
            else:
                payload = {'message': f"You are not allowed to make request with {environ['REQUEST_METHOD']} to `{environ['PATH_INFO']}`!"}
                return response_factory \
                    .create(405) \
                    .get_response(payload, start_response)
        # /assets/
        elif paths[0] == 'assets':
            if method == 'GET':
                status = '200 OK'
                try:
                    headers.append(('Content-Type', 'image/png'))
                    start_response(status, headers)
                    with open(f"./{environ['PATH_INFO']}", "rb") as f:
                        img = f.read()
                    size = stat(f"./{environ['PATH_INFO']}").st_size
                    if size == 0:
                        with open(f"./assets/players/000000.png", "rb") as f:
                            img = f.read()
                    return [img]
                except Exception as e:
                    payload = {'message': f"Image is not available! => {e}"}
                    return response_factory \
                        .create(404) \
                        .get_response(payload, start_response)
            else:
                payload = {'message': f"You are not allowed to make request with {environ['REQUEST_METHOD']} to `{environ['PATH_INFO']}`!"}
                return response_factory \
                    .create(405) \
                    .get_response(payload, start_response)
        # /*
        else:
            payload = {'message': f"You are not allowed to make request to `{environ['PATH_INFO']}`!"}
            return response_factory \
                .create(403) \
                .get_response(payload, start_response)
    'mssql://LAPTOP-TH3PDN0I/Group_8_DB?driver=ODBC+Driver+17+for+SQL+Server')
print("Connected.")

csvfile = '../SPARC_10k_part-ce.csv'
print("File name to load: " + csvfile)

################################################################################
#
# Job Step 10: Load Dimensions
#
################################################################################
jobutils.printStepStart("10")

print("Loading dimensions to memory...")
dim_date_df = pd.read_sql_table('DimDate', con=engine).fillna('')
dim_location_df = pd.read_sql_table('DimLocation', con=engine).fillna('')
dim_demographics_df = pd.read_sql_table('DimDemographics', con=engine).fillna('')
dim_payment_df = pd.read_sql_table('DimPayment', con=engine).fillna('')
dim_clinic_class_df = pd.read_sql_table('DimClinicClass', con=engine).fillna('')
dim_apr_class_df = pd.read_sql_table('DimAPRClassification', con=engine).fillna('')
dim_admission_df = pd.read_sql_table('DimAdmission', con=engine).fillna('')
dim_provider_df = pd.read_sql_table('DimProvider', con=engine).fillna('')
print("Done. Dimensions loaded.")

################################################################################
#
def time_read_sql_table_column(self, dtype):
    read_sql_table(self.table_name, self.con, columns=[dtype])
def tokenize(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens


# load data
engine = create_engine('sqlite:///../data/DisasterResponse.db')
df = pd.read_sql_table('DisasterRis', engine)

# load model
model = joblib.load("../models/classifier.pkl")


# index webpage displays cool visuals and receives user input text for model
@app.route('/')
@app.route('/index')
def index():
    # extract data needed for visuals
    # TODO: Below is an example - modify to extract data for your own visuals
    genre_counts = df.groupby('genre').count()['message']
    genre_names = list(genre_counts.index)
def tokenize(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for token in tokens:
        clean_token = lemmatizer.lemmatize(token).lower().strip()
        clean_tokens.append(clean_token)

    return clean_tokens


# load data
engine = create_engine('sqlite:///./data/DisasterResponse.db')
df = pd.read_sql_table('Disaster_Response_ETL', engine)

# load model
model = joblib.load("./models/classifier.pkl")


# index webpage displays cool visuals and receives user input text for model
@app.route('/')
@app.route('/index')
def index():
    # extract data needed for visuals
    # TODO: Below is an example - modify to extract data for your own visuals
    genre_counts = df.groupby('genre').count()['message']
    genre_names = list(genre_counts.index)
def tokenize(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens


# load data
engine = create_engine('sqlite:///../data/DisasterResponse.db')
df = pd.read_sql_table('DataFrame', engine)

# load model
model = joblib.load("../models/classifier.pkl")


# index webpage displays cool visuals and receives user input text for model
@app.route('/')
@app.route('/index')
def index():
    # extract data needed for visuals
    # TODO: Below is an example - modify to extract data for your own visuals
    genre_counts = df.groupby('genre').count()['message']
    genre_names = list(genre_counts.index)
    category_counts = df.iloc[:, 4:].sum().sort_values(ascending=False)[1:6]
def start_flow():
    job_id = admin_api.start_job()

    if not job_id:
        current_app.logger.info('Failed to get job_id')
        job_outcome = 'busy'
    else:
        log_db.log_exec_status(job_id, 'start_flow', 'executing', '')

        file_path_list = os.listdir(CURRENT_SOURCE_FILES_PATH)

        if file_path_list:
            with engine.connect() as connection:
                Base.metadata.create_all(connection)

                # Get previous version of pdp_contacts table, which is used later to classify new records
                pdp_contacts_df = pd.read_sql_table('pdp_contacts', connection)
                pdp_contacts_df = pdp_contacts_df[pdp_contacts_df["archived_date"].isnull()]
                pdp_contacts_df = pdp_contacts_df.drop(columns=['archived_date', 'created_date', '_id', 'matching_id'])

                current_app.logger.info('Loaded {} records from pdp_contacts table'.format(pdp_contacts_df.shape[0]))

                # Clean the input data and normalize/rename columns
                # Populate new records in secondary tables (donations, volunteer shifts)
                # input - existing files in path
                # output - normalized object of all entries, as well as the input json rows for primary sources
                log_db.log_exec_status(job_id, 'clean_and_load', 'executing', '')
                normalized_data, source_json, manual_matches_df = clean_and_load_data.start(
                    connection, pdp_contacts_df, file_path_list)

                # Standardize column data types via postgres (e.g. reading a csv column as int vs. str)
                # (If additional inconsistencies are encountered, may need to enforce the schema of
                # the contacts loader by initializing it from pdp_contacts.)
                normalized_data.to_sql('_temp_pdp_contacts_loader', connection,
                                       index=False, if_exists='replace')
                normalized_data = pd.read_sql_table('_temp_pdp_contacts_loader', connection)

                # Classifies rows into old rows that haven't changed, updated rows and new rows,
                # compared to the existing state of the DB
                log_db.log_exec_status(job_id, 'classify', 'executing', '')
                rows_classified = calssify_new_data.start(pdp_contacts_df, normalized_data)

                # Archives rows that were updated in the current state of the DB (changes their archived_date to now)
                archive_rows.archive(connection, rows_classified["updated"])

                # Match new+updated records against previous version of pdp_contacts database, and
                # write these rows to the database.
                match_data.start(connection, rows_classified, manual_matches_df, job_id)

                # Copy raw input rows to json fields in pdp_contacts,
                # using a temporary table to simplify the update code.
                current_app.logger.info('Saving json of original rows to pdp_contacts')
                source_json.to_sql('_temp_pdp_contacts_loader', connection,
                                   index=False, if_exists='replace')
                # https://www.postgresql.org/docs/8.4/sql-update.html
                connection.execute('''
                    UPDATE pdp_contacts pdp
                    SET json = to_json(temp.json)
                    FROM _temp_pdp_contacts_loader temp
                    WHERE
                        pdp.source_type = temp.source_type AND
                        pdp.source_id = temp.source_id AND
                        pdp.archived_date IS NULL
                ''')

            current_app.logger.info('Finished flow script run')
            job_outcome = 'completed'

        else:  # No files in list
            current_app.logger.info('No files to process')
            job_outcome = 'nothing to do'

    log_db.log_exec_status(job_id, 'flow', 'complete', '')
    return job_outcome
def time_read_sql_table_all(self):
    read_sql_table(self.table_name, self.con)
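# `time_read_sql_table_all` reads like an asv (airspeed velocity) benchmark
# method. A minimal sketch of the benchmark class it presumably lives in is
# shown below; the class name, table size, and setup body are assumptions,
# not the original suite.
import pandas as pd
from pandas import read_sql_table
from sqlalchemy import create_engine


class SQLTableSuite:
    def setup(self):
        # An in-memory SQLite database keeps the benchmark self-contained;
        # asv calls setup() before timing each method.
        self.con = create_engine('sqlite:///:memory:')
        self.table_name = 'bench_table'
        frame = pd.DataFrame({'a': range(10000), 'b': range(10000)})
        frame.to_sql(self.table_name, self.con, index=False)

    def time_read_sql_table_all(self):
        read_sql_table(self.table_name, self.con)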
def tokenize(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)
    return clean_tokens


# load data
engine = create_engine('sqlite:///../data/DisasterResponse.db')
df = pd.read_sql_table('messages', engine)

# load model
model = joblib.load("../models/classifier.joblib")


# index webpage displays cool visuals and receives user input text for model
@app.route('/')
@app.route('/index')
def index():
    # extract data needed for visuals
    # TODO: Below is an example - modify to extract data for your own visuals
    genre_counts = df.groupby('genre').count()['message']
    genre_names = list(genre_counts.index)
    num_categories = df[[
def extract_array(fname, param_names, result_name="result",
                  non_existing=np.nan, redux_funs=[np.nanmean],
                  return_param_values=True, conditionals={},
                  db_is_sqlite=False):
    """
    Given a database file (as e.g. produced by FireAndForgetJob), extracts an
    array where each dimension corresponds to a provided parameter, and each
    element is a redux (e.g. mean) of all results (of the given result name)
    for that parameter combination. An optional set of additional conditions
    can be specified. The database file can be csv or sqlite. An empty list of
    parameter names just aggregates the results (sliced by the conditionals).
    A default value for missing parameter combinations can be specified
    (non_existing).
    """
    if db_is_sqlite:
        from sqlalchemy import create_engine
        engine = create_engine('sqlite:///{}'.format(fname))
        df = pd.read_sql_table("FireAndForgetJob", engine)
    else:
        with open(fname) as f:
            df = pd.read_csv(f, error_bad_lines=False, warn_bad_lines=False)

    for k, v in conditionals.items():
        df = df.loc[df[k] == v]
        if k in param_names:
            param_names.remove(k)

    # No parameter names means just return the aggregated values for the
    # (sliced) result.
    if len(param_names) == 0:
        return np.array([redux(df[result_name]) for redux in redux_funs])

    param_values = {
        param_name: np.sort(df[param_name].dropna().unique())
        for param_name in param_names
    }
    sizes = [len(param_values[param_name]) for param_name in param_names]
    results = [np.zeros(tuple(sizes)) + non_existing for _ in redux_funs]

    # Compute the aggregate for each unique combination of all parameters.
    # Note: the group keys must stay in the index (the original passed
    # as_index=False, which breaks the `comb in redux.index` lookup below).
    redux = df.groupby(param_names)[result_name].agg(redux_funs)

    # Since not all parameter combinations might have been computed, iterate
    # over all of them and pull out the computed ones.
    all_combs = itertools.product(
        *[param_values[param_name] for param_name in param_names])
    for index, comb in enumerate(all_combs):
        # One-element tuples should be the value itself.
        if len(comb) == 1:
            comb = comb[0]
        result_ind = np.unravel_index(index, tuple(sizes))
        # Parameter combination was computed.
        if comb in redux.index:
            # Extract the results and put them in the right place.
            for i, redux_fun in enumerate(redux_funs):
                results[i][result_ind] = redux.loc[comb][redux_fun.__name__]

    if not return_param_values:
        return results
    else:
        return results, param_values
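# A hedged usage sketch for extract_array. The csv filename and the column
# names ('lr', 'batch_size', 'loss') are hypothetical placeholders, not taken
# from the original code.
results, param_values = extract_array(
    'runs.csv',
    param_names=['lr', 'batch_size'],
    result_name='loss',
    redux_funs=[np.nanmean, np.nanstd],
)
# results[0] is a (len(lr) x len(batch_size)) array of mean losses;
# results[1] holds the matching standard deviations, with missing parameter
# combinations left at the non_existing default (NaN).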
import joblib
import pandas as pd
from flask import Flask
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sqlalchemy import create_engine

app = Flask(__name__)


def tokenize(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)
    return clean_tokens


# load data
engine = create_engine('sqlite:///../data/DisasterResponse.db')
df = pd.read_sql_table('data/DisasterResponse_table', engine)

# load model
model = joblib.load("../models/classifier.pkl")


# index webpage displays cool visuals and receives user input text for model
@app.route('/')
@app.route('/index')
def index():
    # extract data needed for visuals
    # TODO: Below is an example - modify to extract data for your own visuals
    genre_counts = df.groupby('genre').count()['message']
def load_from_mysql(table_name: str):
    """Read a table from the remote MySQL database."""
    LOG.logger_font.info(msg=f"Reading mysql table {table_name}")
    table = pd.read_sql_table(con=RemoteMySQLConfig.engine, table_name=table_name)
    return table
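# Hedged usage sketch for load_from_mysql. The table name 'orders' is a
# hypothetical placeholder, and RemoteMySQLConfig.engine is assumed to be a
# SQLAlchemy engine already configured elsewhere in the project.
orders = load_from_mysql('orders')  # returns a DataFrame of the whole table
print(orders.shape)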
def tokenize(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)
    return clean_tokens


# load data
engine = create_engine('sqlite:///../data.db')
df = pd.read_sql_table('data', engine)

# load model
model = joblib.load("../classifier.pkl")


# index webpage displays cool visuals and receives user input text for model
@app.route('/')
@app.route('/index')
def index():
    # extract data needed for visuals
    # TODO: Below is an example - modify to extract data for your own visuals
    genre_counts = df.groupby('genre').count()['message']
    percent_request = 100 * df.groupby('genre').sum()['request'] / (
        df.groupby('genre').count()['message'])
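# The index() views above typically go on to build Plotly figures from these
# aggregates and hand them to a template. A minimal sketch under the usual
# disaster-response app layout; the chart choice and the 'master.html'
# template name are assumptions, not the original code.
import json
import plotly
from plotly.graph_objs import Bar
from flask import render_template

graphs = [
    {
        'data': [Bar(x=list(genre_counts.index), y=genre_counts)],
        'layout': {
            'title': 'Distribution of Message Genres',
            'xaxis': {'title': 'Genre'},
            'yaxis': {'title': 'Count'},
        },
    }
]
ids = ['graph-{}'.format(i) for i, _ in enumerate(graphs)]
graph_json = json.dumps(graphs, cls=plotly.utils.PlotlyJSONEncoder)
# return render_template('master.html', ids=ids, graphJSON=graph_json)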
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('postgresql://*****:*****@localhost:5432/xueshandai')
ip = pd.read_sql_table('ip_to_map_province', engine)


def get_ip_value(ip):
    """Convert a dotted-quad IPv4 string to its integer value."""
    ip2 = ip.split('.')
    ipv = (int(ip2[0]) * 256 ** 3 + int(ip2[1]) * 256 ** 2
           + int(ip2[2]) * 256 + int(ip2[3]))
    return ipv


def findcountryprovince_from_db(ip):
    """Look up the province whose ip_store2 range contains the given address."""
    ipv = get_ip_value(ip)
    sql = ('select * from ip_store2 where ipv1<=' + str(ipv)
           + ' and ipv2>=' + str(ipv))
    data = pd.read_sql_query(sql, engine)
    if data.size > 0:
        return data.loc[0, 'province']
    else:
        return 'unknown'


ip['province'] = ''
for k in ip.index:
    # print(k, ip.loc[k, 'ip'])
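# Building SQL by string concatenation invites injection and quoting bugs. A
# hedged alternative sketch using a parameterized query via SQLAlchemy's
# text(); the table and column names ('ip_store2', 'ipv1', 'ipv2') come from
# the snippet above, everything else is unchanged in spirit.
from sqlalchemy import text


def findcountryprovince_from_db_safe(ip_addr):
    ipv = get_ip_value(ip_addr)
    # Named bind parameter instead of string concatenation.
    query = text('select * from ip_store2 where ipv1 <= :ipv and ipv2 >= :ipv')
    data = pd.read_sql_query(query, engine, params={'ipv': ipv})
    if data.size > 0:
        return data.loc[0, 'province']
    return 'unknown'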
POSTGRES_PORT = 5432
POSTGRES_USERNAME = '******'
POSTGRES_PASSWORD = db_password
POSTGRES_DBNAME = 'us_gun_violence'

# create connection string and database engine
# (SQLAlchemy 1.4+ requires the 'postgresql://' scheme; 'postgres://' is no
# longer accepted)
db_string = f'postgresql://{POSTGRES_USERNAME}:{POSTGRES_PASSWORD}@{POSTGRES_ADDRESS}:{POSTGRES_PORT}/{POSTGRES_DBNAME}'
engine = create_engine(db_string)

# import transformed suspects dataframe
suspects_df = pd.read_sql_table('suspects_ml_transformed', engine)
suspects_df.head()

# import incidents dataframe
incidents_df = pd.read_sql_table('incidents', engine)
incidents_df.head()

# ## Preprocess Data

# combine suspect and incident data
suspects_incidents_df = suspects_df.merge(incidents_df, how='left',
# Lists of the original table names, new table names, and primary keys for the
# database migration.
old_table_names = [
    'armory_item', 'armory_weapon', 'charactercreator_character',
    'charactercreator_character_inventory', 'charactercreator_cleric',
    'charactercreator_fighter', 'charactercreator_mage',
    'charactercreator_necromancer', 'charactercreator_thief'
]
new_table_names = [
    'items', 'weapons', 'characters', 'inventories', 'clerics',
    'fighters', 'mages', 'necromancers', 'thieves'
]
keys = [
    'item_id', 'item_ptr_id', 'character_id', 'id', 'character_ptr_id',
    'character_ptr_id', 'character_ptr_id', 'mage_ptr_id', 'character_ptr_id'
]

for i in range(len(old_table_names)):
    # read_sql_table does the actual read; the original also ran a redundant
    # `sq_curs.execute("select * from ...")` whose result was never used, so
    # it has been dropped here.
    table_df = pd.read_sql_table(old_table_names[i], sq_engine)
    table_df.set_index(keys[i], inplace=True)
    if len(table_df) > 0:
        table_df.to_sql(new_table_names[i], engine, if_exists='replace')

pg_conn.close()
sq_con.close()
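# A hedged variant of the migration loop using zip(), which avoids index
# bookkeeping across the three parallel lists; the behavior is otherwise the
# same as the loop above.
for old_name, new_name, key in zip(old_table_names, new_table_names, keys):
    frame = pd.read_sql_table(old_name, sq_engine)
    frame.set_index(key, inplace=True)
    if len(frame) > 0:
        frame.to_sql(new_name, engine, if_exists='replace')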