def import_submissions(course_id="C00198", dbname="test1"):
    """Load the submissions made by a course's students on the course's problems.

    Steps, all against the PostgreSQL database ``dbname``:

    1. users enrolled in ``course_id`` (``coursesusers``);
    2. of those, the ones that are not demo / instructor / administrator;
    3. problem names reachable through the course's problem lists
       (``courseslists`` -> ``listitems``);
    4. every submission of the users from step 2.

    The trailing three characters of ``problem_id`` are dropped before the
    submissions are filtered down to the problems from step 3.

    All values are passed as bound parameters instead of being spliced into
    the SQL text, so an empty id list yields an empty result rather than the
    ``in ()`` syntax error, and quoting is left to the driver.

    :param course_id: pattern matched with ``LIKE`` against ``course_id``.
    :param dbname: database name handed to ``psycopg2.connect``.
    :return: DataFrame of the matching submissions.
    """
    con = psycopg2.connect("dbname=%s" % dbname)
    try:
        enrolled = pd.read_sql_query(
            "select user_id from coursesusers where course_id like %s;",
            con=con, params=(course_id,))
        # .tolist() yields plain Python scalars, which the driver can adapt.
        enrolled_ids = [u for u in enrolled.iloc[:, 0].tolist() if u]

        users = pd.read_sql_query(
            "select user_id, creation_date from users "
            "where demo=0 and instructor=0 and administrator=0 "
            "and user_id = any(%s);",
            con=con, params=(enrolled_ids,))
        student_ids = users.user_id.tolist()

        problem_lists = pd.read_sql_query(
            "select list_id from courseslists where course_id like %s;",
            con=con, params=(course_id,))
        list_ids = [l for l in problem_lists.iloc[:, 0].tolist() if l]

        probs = pd.read_sql_query(
            "select problem_nm from listitems where list_id = any(%s);",
            con=con, params=(list_ids,))
        course_problems = [p for p in probs.iloc[:, 0].tolist() if p]

        submissions = pd.read_sql_query(
            "select submission_uid, user_id, problem_id, submission_id, "
            "state, time_out, time_in, veredict, score "
            "from submissions where user_id = any(%s);",
            con=con, params=(student_ids,))
    finally:
        # The connection was previously left open on every call.
        con.close()

    # Drop the 3-character suffix so ids compare equal to problem_nm values.
    submissions.problem_id = submissions.problem_id.apply(lambda x: x[:-3])
    return submissions[submissions.problem_id.isin(course_problems)]
def fetch2DB():
    """Fetch data for every listed SH/SZ stock and store it in the DB.

    Records the start time on ``fetch2DB.timestamp``, reads the stock ids
    whose code starts with 0, 3 or 6, and calls ``fetch2DB_individual`` once
    per stock. The character configuration is re-read on every iteration so
    that characters added while the loop runs are picked up.
    """
    stock_sql = '''select [Stock_ID] from stock_basic_info where (Market_ID = 'SH' or Market_ID = 'SZ') and (Stock_ID like '0%' or Stock_ID like '3%' or Stock_ID like '6%') '''
    char_sql = '''select * from ZCFG_character where Char_Origin = 'Tquant' and (Char_Usage = 'FIN10' or Char_Usage = 'FIN20' or Char_Usage = 'FIN30' ) '''

    # init step
    fetch2DB.timestamp = datetime.now()

    # step 1: engine (for pandas reads) and raw connection (for the writer)
    sql_helper = dcm(echo=False)
    engine = sql_helper.getengine()
    conn = sql_helper.getconn()

    # step 2: walk the current stock list
    stock_ids = pd.read_sql_query(stock_sql, engine)['Stock_ID']
    for stock_id in stock_ids:  # each id is a str
        # refreshed inside the loop on purpose: include newly added chars
        current_chars = pd.read_sql_query(char_sql, engine)
        fetch2DB_individual(stock_id, current_chars, conn)
def backTest(trainEndDate, code, testDate, predictDate):
    """Fit an AdaBoost regressor on one stock's history up to ``trainEndDate``.

    The target of each row is the ``close`` value two rows later. Rows with
    no target (the last two) are dropped by the ``> -1000`` filter. Nothing
    is fitted when fewer than 500 training rows remain. After fitting, the
    history rows for ``testDate`` are queried; a failure of that query is
    printed, not raised.

    :param trainEndDate: last date (inclusive) of the training window.
    :param code: stock code.
    :param testDate: date passed to ``config.sql_history_data_by_code_date``.
    :param predictDate: not used by the visible code.
    :return: None.
    """
    conn = db.get_history_data_db('D')
    rng = np.random.RandomState(1)
    model = AdaBoostRegressor(
        DecisionTreeRegressor(max_depth=4),
        n_estimators=1000, random_state=rng, loss='square')

    # NOTE(review): values are interpolated into the SQL text; the driver
    # behind `conn` is not visible here, so no placeholder style was assumed.
    df = pd.read_sql_query(
        "select * from history_data where date([date])<='%s' and code='%s' order by code, date([date]) asc" % (
            trainEndDate, code), conn)
    df['target'] = df['close'].shift(-2)
    data = df[df['target'] > -1000]  # NaN targets compare False and drop out

    # .ix was removed from pandas; both selections are label based.
    X_train = data.loc[:, 'code':'turnover']
    y_train = data.loc[:, 'target']
    if len(X_train) < 500:
        return
    print(len(X_train))
    model.fit(X_train, y_train)

    # predict tomorrow
    try:
        df = pd.read_sql_query(
            config.sql_history_data_by_code_date % (code, testDate), conn)
    except Exception as e:
        print(e)
def get_dfs(station = None, path='./data/'):
    """
    Get all available databases of bikes and weather.

    Every ``*.db`` file in ``path`` is opened as a SQLite database holding a
    ``bikes`` and a ``weather`` table. Only databases whose ``bikes`` table
    has more than 700 rows are kept. The date is parsed from the part of the
    file name before the first ``_`` (``YYYY-MM-DD``).

    :param station: optional column of ``bikes`` to load next to ``index``;
        all columns are loaded when falsy.
    :param path: directory to scan (with or without a trailing separator).
    :return: dict of { date string : [bikes_dataframe, weather_dataframe] }
    """
    df_dic = {}
    for db in os.listdir(path):
        if not db.endswith(".db"):
            continue

        con = sqlite3.connect(os.path.join(path, db))
        try:
            if station:
                # Identifiers cannot be bound; escape embedded double quotes.
                quoted = '"' + str(station).replace('"', '""') + '"'
                bikes = pd.read_sql_query(
                    'SELECT "index",' + quoted + ' FROM bikes', con)
            else:
                bikes = pd.read_sql_query("SELECT * FROM bikes", con)
            weather = pd.read_sql_query("SELECT * FROM weather", con)
        finally:
            # `with sqlite3.connect(...)` only commits; it never closes.
            con.close()

        # Only include full-day records, 2-minute intervals
        # means 60*24/2 ~ 700 scrapes
        if len(bikes['index']) <= 700:
            continue

        date = datetime.datetime.strptime(db.split("_")[0], "%Y-%m-%d").date()

        # Fix wind speed values and cast temperatures to integers
        weather['Wind_Speed'] = weather['Wind_Speed'].replace(
            to_replace='calm', value=0)
        int_cols = ['Temperature', 'Feels_Like', 'Wind_Speed']
        weather[int_cols] = weather[int_cols].astype(int)

        # convert the time-of-day strings to full timestamps on `date`
        bikes = bikes.rename(columns={'index': 'Time'})
        day_prefix = str(date) + ' '
        for df in [bikes, weather]:
            df['Time'] = pd.to_datetime(df['Time'].apply(
                lambda x: datetime.datetime.strptime(
                    day_prefix + str(x), "%Y-%m-%d %H:%M:%S")))
        df_dic[str(date)] = [bikes, weather]
    return df_dic
def do_test_winloss(username, dbname):
    """Print a quick sanity report for the ``winloss`` table.

    Reports whether the table can be queried, its row count and its first
    five rows. Query failures are reported as "table does not exist" rather
    than raised; after a failure the transaction is rolled back so the
    connection stays usable.

    :param username: database user.
    :param dbname: database name.
    """
    known_table = 'winloss'
    con = psycopg2.connect(database=dbname, user=username)
    try:
        try:
            count_sql = pd.read_sql_query(
                "SELECT COUNT(*) FROM %s;" % (known_table), con)
            exists = count_sql is not None
        except Exception:
            con.rollback()
            exists = False
        print(' Table, %s, exists: %s' % (known_table, exists))

        if exists:
            # iloc[0, 0] is the scalar count; formatting the whole row
            # Series with %i relied on implicit Series -> int conversion.
            print(' Total number of entries in %s: %i'
                  % (known_table, count_sql.iloc[0, 0]))
            try:
                all_sql = pd.read_sql_query(
                    "SELECT * FROM %s;" % (known_table), con)
            except Exception:
                con.rollback()
                all_sql = None
            # Previously a failed query led to a NameError on all_sql.
            if all_sql is not None:
                print(' First 5 entries of %s: ' % (known_table))
                print(all_sql.head(5))
            print('')
    finally:
        con.close()
def do_test_gamestats(username, dbname, year):
    """Print a quick sanity report for the ``teams<year>`` table.

    Reports whether the table can be queried, how many distinct ``game_id``
    values it holds, and its first five and last ten rows. Each probe is
    independent: a failing query is reported (or skipped) and rolled back so
    the following probes still run on a usable transaction.

    :param username: database user.
    :param dbname: database name.
    :param year: string appended to ``'teams'`` to form the table name.
    """
    known_table = 'teams' + year
    con = psycopg2.connect(database=dbname, user=username)
    try:
        try:
            count_sql = pd.read_sql_query(
                "SELECT COUNT(*) FROM %s;" % (known_table), con)
            exists = count_sql is not None
        except Exception:
            con.rollback()
            exists = False
        print(' Table, %s, exists: %s' % (known_table, exists))

        try:
            count_to_get = pd.read_sql_query(
                "SELECT DISTINCT(game_id) FROM %s;" % (known_table), con)
        except Exception:
            con.rollback()
            print(' games table, %s, does not exist' % known_table)
        else:
            print(' There are %s distinct games in the %s table'
                  % (len(count_to_get), known_table))

        try:
            all_sql = pd.read_sql_query(
                "SELECT * FROM %s;" % (known_table), con)
        except Exception:
            # best effort: the preview is simply skipped
            con.rollback()
        else:
            print(' First 5 entries of %s: ' % (known_table))
            print(all_sql.head(5))
            print(' Last 10 entries of %s: ' % (known_table))
            print(all_sql.tail(10))
        print('')
    finally:
        con.close()
def meet_all(self, day):
    """
    Hold the site meeting: log the latest state of the three Sync_* tables
    for every project whose meeting cycle divides ``day``.

    :param day: the day the meeting is held
    :return: 0 when no project meets on ``day`` or ``day`` is 1, else None
    """
    due_projects = pd.read_sql_query(
        "SELECT ID as ProjectID FROM Fact_Project WHERE MeetingCycle<>0 AND " + str(day) + " % MeetingCycle =0",
        self.engine)
    if len(due_projects.ProjectID) == 0 or day == 1:
        return 0

    # (source table query, name of the logger method), in logging order:
    # task progress, production rate, workspace priority
    sync_steps = (
        ("SELECT * FROM Sync_Task", 'log_wp'),
        ("SELECT * FROM Sync_ProductionRate", 'log_production_rate'),
        ("SELECT * FROM Sync_WorkSpacePriority", 'log_priority_space'),
    )
    for query, logger_name in sync_steps:
        snapshot = pd.read_sql_query(query, self.engine)
        snapshot = snapshot.merge(
            due_projects, how='inner', on=['ProjectID']).reset_index(drop=True)
        # the synchronized state is stamped with the day before the meeting
        snapshot['Day'] = day - 1
        getattr(self, logger_name)(snapshot)
def step2():
    """Export up to 20 rows per ``asin`` of the ``subset`` table to CSV.

    For every distinct ``asin`` in ``subset`` the first 20 rows are loaded;
    groups with more than one row are accumulated into one frame, which is
    written to ``text_destination``.

    Fixes relative to the previous revision:

    * the accumulator starts as an empty ``DataFrame`` instance (it used to
      be the ``pd.DataFrame`` class, which only worked by accident);
    * ``DataFrame.append`` (removed from pandas) is replaced by ``pd.concat``;
    * single quotes in an ``asin`` are doubled before it is placed in the
      SQL literal;
    * ``CREATE TABLE subset`` uses ``IF NOT EXISTS``: it runs after
      ``subset`` has already been read, so the unconditional form could only
      fail and abort before the CSV was written.
    """
    df = pd.DataFrame()

    # make list of unique asin
    sql = "SELECT DISTINCT asin FROM subset"
    asinList = pd.read_sql_query(sql, disk_engine)
    print(asinList.head())
    print(len(asinList.index))

    for row in asinList['asin'].tolist():
        print("loading: ", row)
        sql = ("SELECT * FROM subset WHERE asin = '"
               + row.replace("'", "''") + "' LIMIT 20")
        temp_df = pd.read_sql_query(sql, disk_engine, index_col = 'index')
        print("Temp df:\n", temp_df.head())
        if len(temp_df.index)>1:
            print("appending temp_df to df")
            if df.empty:
                df = temp_df
            else:
                df = pd.concat([df, temp_df], ignore_index=True)
    print(df.head(), df.tail())

    # NOTE(review): this statement probably belongs before the reads above;
    # kept in place, made idempotent so it no longer aborts the export.
    sql = "CREATE TABLE IF NOT EXISTS subset AS SELECT * FROM reviews WHERE asin IN ( SELECT asin FROM reviews GROUP BY asin HAVING COUNT (asin)>999)"
    disk_engine.execute(sql)
    print("new table created")

    # Single-query alternative to the loop above; currently not executed.
    sql = "SELECT * FROM subset a WHERE a.'index' IN ( SELECT b.'index' FROM subset b WHERE b.'index' IS NOT NULL AND a.'asin' = b.'asin' ORDER BY b.'unixReviewTime', b.'index' LIMIT 20) ORDER BY a.'asin', a.'unixReviewTime'"

    df.to_csv(text_destination)
    print("Success! Hooray!")
    return
def getData(teamId):
    """Return per-game offensive and defensive scores for one team.

    Games from season 2014 onwards are read twice from
    ``RegularSeasonDetailedResults``: once where the team won (``W*``
    columns) and once where it lost (``L*`` columns). ``genOffScore`` /
    ``genDefScore`` are applied row-wise to the stacked results.

    ``DataFrame.append`` was removed from pandas; the frames are stacked
    with ``pd.concat``, which keeps the same row labels. The unused mean
    computations were dropped.

    :param teamId: team identifier; converted to ``str`` before binding.
    :return: offensive and defensive scores concatenated column-wise.
    """
    teamId = str(teamId)
    bound = (teamId, )

    owinningScript = "SELECT Wteam AS team, Wscore AS score, (CAST (Wfgm AS FLOAT))/(CAST(Wfga AS FLOAT)) as fgp,(CAST (Wfgm3 AS FLOAT))/(CAST(Wfga3 AS FLOAT)) as tpp, (CAST (Wftm AS FLOAT))/(CAST(Wfta AS FLOAT)) as ftp, Wor as ofr FROM RegularSeasonDetailedResults WHERE Season >= 2014 AND (Wteam = ?)"
    owinningDf = pd.read_sql_query(owinningScript, conn, params=bound)
    olosingScript ="SELECT Lteam AS team, Lscore AS score, (CAST (Lfgm AS FLOAT))/(CAST(Lfga AS FLOAT)) as fgp,(CAST (Lfgm3 AS FLOAT))/(CAST(Lfga3 AS FLOAT)) as tpp, (CAST (Lftm AS FLOAT))/(CAST(Lfta AS FLOAT)) as ftp, Lor as ofr FROM RegularSeasonDetailedResults WHERE Season >= 2014 AND (Lteam =?)"
    olosingDf = pd.read_sql_query(olosingScript, conn, params=bound)
    oteamDf = pd.concat([owinningDf, olosingDf])
    o = oteamDf.apply(genOffScore, axis=1)

    dwinningScript = "SELECT Wteam as team, Lscore as oppscore, Lto as oppto, Wdr as dr, Wstl as stl, Wblk as blk FROM RegularSeasonDetailedResults WHERE Season >= 2014 AND (Wteam = ?)"
    dwinningDf = pd.read_sql_query(dwinningScript, conn, params=bound)
    dlosingScript = "SELECT Lteam as team, Wscore as oppscore, Wto as oppto, Ldr as dr, Lstl as stl, Lblk as blk FROM RegularSeasonDetailedResults WHERE Season >= 2014 AND (Lteam = ?)"
    dlosingDf = pd.read_sql_query(dlosingScript, conn, params=bound)
    dteamDf = pd.concat([dwinningDf, dlosingDf])
    d = dteamDf.apply(genDefScore, axis=1)

    od = pd.concat([o, d], axis=1)
    return(od)
def cesareans_output():
    """Render ``output.html`` from the nutrient tables and the model result.

    Reads ``birth_month``, ``bad_foods`` and ``ucase`` from the request
    query string, loads ``nutrients_table2`` and ``dri_table`` in full, runs
    ``ModelIt`` on them and passes its four results to the template together
    with a pos/energy/price_serv summary of every nutrient row.
    """
    form = request.args
    patient = form.get('birth_month')
    bad_foods = form.get('bad_foods')
    ucase = form.get('ucase')

    query_results = pd.read_sql_query("""SELECT * FROM nutrients_table2;""", con)
    query_results2 = pd.read_sql_query("""SELECT * FROM dri_table;""", con)

    # one summary dict per nutrient row, in table order
    nutrient_rows = (query_results.iloc[i]
                     for i in range(0, query_results.shape[0]))
    births = [dict(pos=row['pos'],
                   energy=row['energy'],
                   price_serv=row['price_serv'])
              for row in nutrient_rows]

    the_result, result_nutrients, nutrients4, total_cost = ModelIt(
        query_results, ucase, query_results2, patient, bad_foods)

    return render_template(
        "output.html",
        births=births,
        the_result=the_result,
        result_nutrients=result_nutrients,
        nutrients4=nutrients4,
        total_cost=total_cost,
        ucase=ucase,
        sites=query_results.to_html())
def nearby_station_features(con, station_id):
    """Return bike counts of the stations closest to ``station_id``.

    Stations are ranked by a squared distance built from the latitude and
    longitude differences (scaled by 111.03 and 85.39). Walking that ranking,
    the first four stations with more than 100 status rows are selected; the
    first of them is skipped and one frame is returned per remaining
    station.

    Changes relative to the previous revision: the loop no longer rebinds
    the ``station_id`` parameter, the row count is obtained with
    ``COUNT(*)`` instead of downloading every status row, and the target
    station is looked up once.

    :param con: DB connection accepted by ``pandas.read_sql_query``.
    :param station_id: integer id present in ``station_info``.
    :return: list of DataFrames with columns ``event_date`` and
        ``num_bikes_st<k>`` (k = 1, 2, ...), nearest first.
    :raises IndexError: when ``station_id`` is not in ``station_info``.
    """
    station_info = pd.read_sql_query("SELECT * FROM station_info;", con)
    target = station_info[station_info['station_id'] == station_id]
    station_lat = target['latitude'].values[0]
    station_lon = target['longitude'].values[0]
    station_info['distance'] = (
        ((station_info['latitude'] - station_lat) * 111.03) ** 2
        + ((station_info['longitude'] - station_lon) * 85.39) ** 2)

    nearest_stations = []
    for candidate_id in station_info.sort_values('distance')['station_id']:
        status_rows = pd.read_sql_query(
            "SELECT COUNT(*) FROM station_statuses WHERE station_id = %d;"
            % candidate_id, con).iloc[0, 0]
        if status_rows > 100:
            nearest_stations.append(candidate_id)
        if len(nearest_stations) == 4:
            break

    nearby_station_data = []
    # nearest_stations[0] is dropped, as before
    for index, nearby_id in enumerate(nearest_stations[1:]):
        sql_query = ("SELECT event_date, num_bikes FROM station_statuses "
                     "WHERE station_id = %d;" % nearby_id)
        tmp_nearby_data = pd.read_sql_query(sql_query, con)
        nearby_station_data.append(pd.DataFrame(data={
            'event_date': tmp_nearby_data['event_date'],
            'num_bikes_st%d' % (index + 1): tmp_nearby_data['num_bikes']}))
    return nearby_station_data
def validate_mutation_1(uniprot_id, mutation):
    """Assert that Provean data exists whenever the protein has a template.

    Two lookups are made for ``uniprot_id`` in the configured schema:
    Provean rows with a supporting-set file, and domains that have a
    template. The check passes when Provean rows exist or no templated
    domain exists. ``mutation`` is not used by the visible code.

    :raises AssertionError: templated domains exist but Provean is missing.
    """
    def _run(template):
        # format -> log -> query -> log a two-row preview
        sql_query = template.format(
            uniprot_id=uniprot_id,
            db_schema=conf.CONFIGS["db_schema"]
        )
        logger.debug(sql_query)
        rows = pd.read_sql_query(sql_query, conf.CONFIGS["engine"])
        logger.debug(rows.head(2))
        return rows

    logger.debug(helper.underline("Validating that we have provean..."))
    provean_rows = _run("""\
select 1
from {db_schema}.provean
where uniprot_id = '{uniprot_id}' and
provean_supset_filename is not null;
""")

    # ... and whether at least one domain has a template
    templated_domains = _run("""\
select 1
from {db_schema}.uniprot_domain
join {db_schema}.uniprot_domain_template using (uniprot_domain_id)
where uniprot_id = '{uniprot_id}';
""")

    assert len(provean_rows) >= 1 or len(templated_domains) == 0
def getMyStocks(uid,flag,isSingle=False):
    """Build the stock item list selected by ``flag`` for user ``uid``.

    * ``flag`` ``'0'`` / ``'1'``: the user's own stocks with that flag
      value, newest ``created_time`` first;
    * ``isSingle``: ``flag`` is a stock code, return that one stock of the
      user;
    * ``flag`` ``'2'``: all stocks (global basic data);
    * otherwise: ``flag`` is a main stock, return its related stocks plus
      the stock itself.

    :return: whatever ``getStockItem`` returns for the selected frame.
    """
    # shared head of the two "my stocks" queries
    own_stocks_sql = (
        "select ms.*,sb.zgb,sb.launch_date,sb.grow_type,sb.industry from my_stocks ms,stock_basic sb "
        "where ms.code=sb.code and sb.flag=0 and ")

    if flag in ('0', '1'):
        owned = pd.read_sql_query(
            own_stocks_sql + "ms.user_id = %(uid)s",
            db.engine,
            params={'uid': uid},
            index_col='code')
        selected = owned[owned['flag'] == int(flag)]
        selected = selected.sort_values(by='created_time', ascending=False)
    elif isSingle:
        # flag is a stock code
        selected = pd.read_sql_query(
            own_stocks_sql + "ms.code = %(code)s and ms.user_id = %(uid)s",
            db.engine,
            params={'code': flag, 'uid': uid},
            index_col='code')
    elif flag == '2':
        # all stocks
        selected = dbs.get_global_basic_data()
    else:
        related = pd.read_sql_query(
            "select sb.* from relation_stocks rs,stock_basic sb "
            "where rs.relation_stock=sb.code and sb.flag=0 and rs.main_stock=%(name)s and rs.user_id=%(uid)s",
            db.engine,
            params={'name': flag, 'uid': uid},
            index_col='code')
        # add the main stock itself
        everything = dbs.get_global_basic_data()
        itself = everything[everything.index == flag]
        selected = pd.concat([related, itself])

    return getStockItem(selected)
def load_football():
    """
    Loads football data

    Dataset of football stats. +25,000 matches, +10,000 players from
    11 European Countries with their lead championship
    Seasons 2008 to 2016. It also contains players attributes
    sourced from EA Sports' FIFA video game series,
    including the weekly updates, team line up with squad formation
    (X, Y coordinates), betting odds from up to 10 providers and
    detailed match events (goal types, possession, corner, cross, fouls,
    cards etc...) for +10,000 matches.

    The meaning of the columns can be found here:
    http://www.football-data.co.uk/notes.txt

    Number of attributes in each table (size of the dataframe):
        countries (11, 2)
        matches (25979, 115)
        leagues (11, 3)
        teams (299, 5)
        players (183978, 42)

    Link to the source:
    https://www.kaggle.com/hugomathien/soccer

    Returns
    -------
    tuple of pandas DataFrame
        (countries, matches, leagues, teams, players)
    """
    database_path = reduce(os.path.join, _FOOTBALL_PATH, _get_datapath())
    # `with sqlite3.connect(...)` only manages the transaction and leaves
    # the connection open, so it is closed explicitly.
    con = sqlite3.connect(database_path)
    try:
        countries = pd.read_sql_query("SELECT * from Country", con)
        matches = pd.read_sql_query("SELECT * from Match", con)
        leagues = pd.read_sql_query("SELECT * from League", con)
        teams = pd.read_sql_query("SELECT * from Team", con)
        players = pd.read_sql_query("SELECT * FROM Player_Attributes;", con)
    finally:
        con.close()
    return countries, matches, leagues, teams, players
def import_all_submissions(dbname="test1"):
    """Load every submission of regular users on problems named ``P...``.

    Regular users are those with ``demo``, ``instructor`` and
    ``administrator`` all 0. The trailing three characters of
    ``problem_id`` are dropped (see the "get rid of languages" step) before
    the submissions are filtered to ``abstractproblems.problem_nm`` values
    that match ``'P%%'``.

    The user ids are passed as one bound array parameter instead of a
    ``str(list)`` spliced into the SQL, so an empty user list returns an
    empty frame instead of an ``in ()`` syntax error, and the connection is
    closed when done.

    :param dbname: database name handed to ``psycopg2.connect``.
    :return: DataFrame of the matching submissions.
    """
    con = psycopg2.connect("dbname=%s" % dbname)
    try:
        users = pd.read_sql_query(
            "select user_id, creation_date from users "
            "where demo=0 and instructor=0 and administrator=0",
            con=con)
        user_ids = users.user_id.tolist()

        # no parameters are bound here, so the text is sent unchanged
        probs = pd.read_sql_query(
            "select problem_nm from abstractproblems where problem_nm like 'P%%'",
            con=con)
        known_problems = [p for p in probs.iloc[:, 0].tolist() if p]

        submissions = pd.read_sql_query(
            "select submission_uid, user_id, problem_id, submission_id, "
            "state, time_out, time_in, veredict, score "
            "from submissions where user_id = any(%s);",
            con=con, params=(user_ids,))
    finally:
        con.close()

    # get rid of languages
    submissions.problem_id = submissions.problem_id.apply(lambda x: x[:-3])
    return submissions[submissions.problem_id.isin(known_problems)]
def get_record_factor(conn_func, parent_kind):
    """Join answer records with a per-question "tag:difficulty" score string.

    For the question type mapped from ``parent_kind`` this loads the
    non-zero-status answer records, the knowledge tags and the question
    difficulties, and builds one ``"tag:difficulty|tag:difficulty|..."``
    string per question.

    The difficulty query now binds ``qtype`` like the two queries before it;
    it used to interpolate the value unquoted into the SQL text. The column
    and table names come from ``CONF_MAP`` and cannot be bound, so they are
    still formatted in.

    :param conn_func: zero-argument callable returning a DB connection.
    :param parent_kind: key into ``QUESTION_SUBTYPE_MAP`` and ``CONF_MAP``.
    :return: DataFrame with columns ``uid``, ``status``, ``score``, or
        ``None`` when no difficulty rows exist.
    """
    conn = conn_func()
    qtype = QUESTION_SUBTYPE_MAP[parent_kind]

    sql = ('select question_type,question_id,user_id,status,date '
           'from question_record_detail where status!=0 and question_type=%s')
    record = pd.read_sql_query(sql, conn, params=(qtype,))
    record.rename(columns = {'question_type':'qtype',
                             'question_id':'qid',
                             'user_id':'uid'}, inplace=True)

    sql = ('select target_kind, target_id, tag_id from knowledge_tag '
           'where target_kind = %s')
    ktags = pd.read_sql_query(sql, conn, params=(qtype,))
    ktags.columns = ['qtype', 'qid', 'tag_id']

    conf = CONF_MAP[parent_kind]['question_conf']
    table_name = conf['table']
    qid_name = conf['qid']
    # %%s survives the identifier formatting as the driver placeholder %s
    sql = ('select question_type,%s,difficulty from %s '
           'where question_type=%%s' % (qid_name, table_name))
    diff = pd.read_sql_query(sql, conn, params=(qtype,))
    if len(diff) == 0:
        return None
    diff.columns = ['qtype', 'qid', 'difficulty']

    def _convert_fac(x):
        # one "tag:difficulty" entry per (tag, difficulty) row of the group
        return "|".join('%s:%s' % (t, d)
                        for t, d in zip(x.tag_id, x.difficulty))

    fac = pd.merge(ktags, diff).groupby(
        ['qtype', 'qid']).apply(_convert_fac).reset_index()
    fac.columns = ['qtype', 'qid', 'score']
    return pd.merge(record, fac)[['uid', 'status', 'score']]
def __getCodeInfo__(self):
    """Return the per-commodity info dict, from cache or from the database.

    When ``self.config['UseCache']`` is truthy the result of
    ``self.__loadCache__('CodeInfo.json')`` is tried first. If that yields
    an empty dict (or caching is off) the info is rebuilt from the
    ``CommodityInfo``, ``TradeDate`` and ``TradeTime`` tables and handed to
    ``self.__saveCache__``.

    The returned dict is keyed by ``AcsyCode`` without its last four
    characters. Each value holds the ``CommodityInfo`` columns plus a
    ``'TradeTime'`` entry: a dict mapping ``(start_date, end_date)`` to a
    list of ``(start_time, end_time)`` tuples, or ``[]`` when no trade time
    is known for that key.
    """
    rtn={}
    if self.config['UseCache']:
        rtn=self.__loadCache__('CodeInfo.json')
    if rtn=={}:
        # only rows whose AcsyCode ends in '0000'
        CommodityInfo=pd.read_sql_query('select Future,Exchange,TradeUnit,Tick,Target,DeliveryMethod,Unit1,Unit2'+\
            ',TablePrefix,DominantContracts,AcsyCode from CommodityInfo where AcsyCode like N\'%0000\'',self.dbconn)
        TradeDate=pd.read_sql_query('select * from TradeDate where StartDate is not null',self.dbconn)
        TradeTime=pd.read_sql_query('select * from TradeTime',self.dbconn)
        # key each commodity by AcsyCode minus its 4-character suffix
        CommodityInfo.index=CommodityInfo['AcsyCode'].apply(lambda x: x[:-4])
        CommodityInfo.drop(labels='AcsyCode',axis=1,inplace=True)
        # {key: {column: value}}
        CommodityInfo=CommodityInfo.T.to_dict()
        TradeDate.index=TradeDate['AcsyCode']
        # {AcsyCode: {column: value}}; a repeated AcsyCode keeps one row only
        TradeDate=TradeDate.T.to_dict()
        # TT: full AcsyCode -> list of (start time, end time) sessions
        TT={}
        for name,group in TradeTime.groupby('AcsyCode'):
            TT[name]=[(datetime.datetime.strptime(v['StartTime'],'%H%M%S%f').time(),datetime.datetime.strptime(v['EndTime'],'%H%M%S%f').time()) for k,v in group.T.to_dict().items()]
        # full AcsyCode -> {(start date, end date): sessions}; a missing
        # EndDate becomes 9999-12-31. Raises KeyError if a TradeDate code
        # has no TradeTime rows.
        TradeTime={k:{(datetime.datetime.strptime(v['StartDate'],'%Y-%m-%d').date(),datetime.date(9999,12,31) if v['EndDate']==None else datetime.datetime.strptime(v['EndDate'],'%Y-%m-%d').date()):\
            TT[k]} for k,v in TradeDate.items()}
        # TT is reused: merge the date ranges of all codes sharing the same
        # shortened key
        TT={}
        for k,v in TradeTime.items():
            key=k[:-4]
            if key in TT.keys():
                TT[key].update(v)
            else:
                TT[key]=v
        rtn={}
        keys=set(TT.keys())
        for k,v in CommodityInfo.items():
            v.update({'TradeTime': TT[k] if k in keys else []})
            rtn[k]=v
        self.__saveCache__('CodeInfo.json',rtn)
    return rtn
def populate_encoders_scale(table,disk_engine,events_tbl=None):
    """Build one encoder per object-typed (string) column of ``table``.

    The column types are taken from a 5-row sample. For every object column
    that is not listed in ``time_cols`` the distinct values are streamed in
    chunks of 100000 rows from ``table`` and, when given, from
    ``events_tbl``, and passed to ``encode_column``.

    :param table: table whose columns are encoded.
    :param disk_engine: connectable accepted by ``pandas.read_sql_query``.
    :param events_tbl: optional second table contributing distinct values.
    :return: dict mapping column name to ``encode_column(...)`` result.
    """
    # NOTE(review): `time_cols` is not defined in this function (its
    # assignment is commented out upstream); it must exist at module level.
    df = pd.read_sql_query('select * from {table} limit 5'.format(table=table),disk_engine)
    encoders = {}
    # pair names with dtypes directly instead of indexing dtypes by position
    for name, tp in zip(df.columns.values, df.dtypes):
        if tp != 'object' or name in time_cols:
            continue
        print(name)

        source_tables = [table]
        if events_tbl is not None:
            source_tables.append(events_tbl)

        arr = []
        progress = progressbar.ProgressBar(widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage(), ' ', progressbar.ETA()]).start()
        for source in source_tables:
            df_cols = pd.read_sql_query('select distinct {col_name} from {table}'.format(col_name=name,table=source),disk_engine,chunksize=100000)
            # the chunk counter restarts per table, as before, so the value
            # given to the progress bar stays within the same range
            for chunk_no, df_col in enumerate(df_cols):
                arr.extend(np.array(df_col))
                progress.update(chunk_no+1)
        progress.finish()
        encoders[name] = encode_column(np.array(arr).ravel())
    return encoders
def load_bars(self, pcontract, dt_start, dt_end, window_size):
    """Load the bars of ``pcontract`` between ``dt_start`` and ``dt_end``.

    The table name is the contract string with ``.`` replaced by ``_``; the
    time bounds are converted to row ids with ``datautil.encode2id``.

    * Non-rolling mode (``series.g_rolling`` falsy): all rows are loaded
      into a DataFrame indexed by ``datetime``.
    * Rolling mode: the query is only executed on a cursor and an empty
      frame is returned together with that cursor and ``window_size``.

    The query used to be run through pandas unconditionally (and a second
    time in non-rolling mode), loading the full range even in rolling mode;
    it now runs exactly once per call. ``string.replace`` (Python 2 only)
    became ``str.replace``.

    :return: ``SqliteSourceWrapper`` around the data (and cursor, if any).
    """
    id_start, _ = datautil.encode2id(pcontract.period, dt_start)
    id_end, _ = datautil.encode2id(pcontract.period, dt_end)
    table = str(pcontract.contract).replace('.', '_')
    sql = ("SELECT datetime, open, close, high, low, volume FROM {tb} "
           "WHERE {start}<=id AND id<={end}").format(
               tb=table, start=id_start, end=id_end)

    if not series.g_rolling:
        data = pd.read_sql_query(sql, self.db, index_col='datetime')
        ## @todo
        return SqliteSourceWrapper(pcontract, data, None, len(data))

    cursor = self.db.cursor()
    cursor.execute(sql)
    data = pd.DataFrame({
        'open': [],
        'close': [],
        'high': [],
        'low': [],
        'volume': []
    })
    data.index = []
    return SqliteSourceWrapper(pcontract, data, cursor, window_size)
def get_breakdown(orig_state, dest_state, info):
    """Return, as JSON records, how transactions between two states break
    down by the ``info`` column.

    The counts per ``info`` value are joined with the auxiliary lookup table
    for that column. Any exception raised while querying or merging is
    answered with a plain-text response built from the exception message.

    :param orig_state: origin state code bound as ``orig_code``.
    :param dest_state: destination state code bound as ``dest_code``.
    :param info: column to group by; key into ``queries.code_translate``.
    """
    column_code = queries.code_translate[info]
    q = queries.transactionsBetweenStates.params(orig_code=orig_state,
                                                 dest_code=dest_state)
    a = queries.getAux(info)

    try:
        transactions = pd.read_sql_query(q.limit(QL), ENGINE)
        counts = transactions.groupby(info)[info].count()
        data = pd.DataFrame({'counts': counts})

        appendix = pd.read_sql_query(a, ENGINE)
        # align key dtypes so the merge below can match them
        if data.index.dtype != appendix[column_code].dtype:
            appendix[column_code] = appendix[column_code].astype(str)

        m = pd.merge(appendix, data, left_on=column_code, right_index=True)
        # every numeric-code column is exposed as 'code' for easier handling
        m = m.rename(columns={column_code:'code'})
        # the mode table names its description column differently
        if info == 'MODE':
            m = m.rename(columns={'Mode Description':'Description'})
    except Exception as e:
        return Response(str(e) + " is not a correct column to groupby")
    else:
        return Response(m.to_json(orient="records"),
                        mimetype='application/json',
                        headers={'Cache-Control': 'no-cache'})
def get_data(comparison,candidate,labels,top_features,feature_tooltips,party):
    """Compare a candidate's supporters with the rest of the party per topic.

    For each topic number in ``top_features`` the average topic weight is
    computed for supporters of ``candidate`` and for supporters of the other
    candidates of ``party``. The result is transposed so that each row is a
    topic (``topic0``, ``topic1``, ... in ``top_features`` order) and the
    columns are ``ChosenCandidate`` and ``comparison``, plus the given
    ``labels`` and ``feature_tooltips``.

    ``DataFrame.append`` (removed from pandas) was replaced by ``pd.concat``
    and the chained ``frame[col][mask] = ...`` write by a single ``.loc``
    assignment. ``comparison`` is not used by the visible code.

    :return: transposed DataFrame described above.
    """
    # pull mean info for the right candidate
    topic_string = "".join(
        ", AVG(topic%s) as topic%s" % (topic_num, position)
        for position, topic_num in enumerate(top_features))

    # NOTE(review): candidate/party are interpolated into the SQL text. The
    # driver behind the module-level `con` is not visible here, so no
    # placeholder style was assumed; bind them if they can be user supplied.
    cand_supp_query="SELECT user_candidate.candidate%s FROM user_topics INNER JOIN user_candidate ON (user_topics.user_id = user_candidate.user_id) WHERE user_candidate.candidate='%s' GROUP BY user_candidate.candidate;" %(topic_string, candidate)
    candidate_supp_data=pd.read_sql_query(cand_supp_query,con)
    party_query="SELECT user_candidate.party%s FROM user_topics INNER JOIN user_candidate ON (user_topics.user_id = user_candidate.user_id) WHERE user_candidate.candidate !='%s' AND party = '%s' GROUP BY user_candidate.party;" %(topic_string, candidate, party)
    party_data=pd.read_sql_query(party_query,con)

    all_data = pd.concat([candidate_supp_data, party_data])
    all_data['label'] = 'comparison'
    # positional boolean mask: the stacked frame has repeated row labels
    chosen = (all_data['candidate'] == candidate).values
    all_data.loc[chosen, 'label'] = "ChosenCandidate"
    all_data = all_data.drop(['candidate', 'party'], axis=1)
    all_data.set_index(['label'],inplace=True)

    flipped=all_data.T
    flipped['index_word']=labels
    flipped['topic_words']=feature_tooltips
    return flipped
def get_stock_k_line_if_ma_is_null(code):
    """Return the k-line rows needed to fill in missing ``ma_12`` values.

    Finds the earliest date for ``code`` whose ``ma_12`` is NULL and loads
    the most recent ``days since that date + AVR_LONG + 1`` rows, sorted by
    date ascending.

    ``.ix`` and ``sort_index(by=...)`` no longer exist in pandas and were
    replaced by ``.loc`` and ``sort_values``. The "nothing missing" check
    now also recognises NaT/NaN, which pandas may return for a NULL date.

    :param code: stock code.
    :return: DataFrame, or ``None`` when nothing is missing or the second
        query fails (the error is printed).
    """
    sql = 'SELECT min(date) as date FROM {table} where code={code} and ma_12 is NULL'.format(table=STOCK_KLINE_TABLE, code=code)
    df = pd.read_sql_query(sql, engine)
    d_end=datetime.datetime.today()
    if len(df) > 0:
        date_start = df.loc[0, 'date']
        if pd.isnull(date_start):
            return None
        date_start = str(date_start)[:10]
        d_start = str_to_datatime(date_start, '%Y-%m-%d')
        # enough history to compute the longest moving average
        days = (d_end - d_start).days + AVR_LONG + 1
        try:
            sql = "select * from {table} where code='{code}' order by date desc limit {count}".format(
                table=STOCK_KLINE_TABLE, code=code, count=days)
            df = pd.read_sql_query(sql, engine)
            df = df.sort_values(by='date', ascending=True)
            return df
        except Exception as e:
            print(str(e))
    return None
def read_db():
    """Load weather and running data and store their join.

    Reads ``weather_data_table`` and ``running_data_table`` in full plus a
    per (date, city) row count of the running data. Weather is joined with
    each of the other two on ``date`` and ``city``; the weather/running join
    replaces ``full_data_table`` through the module-level ``engine``.

    The psycopg2 connection is now closed once the three reads are done.

    :return: ``(weather_data, running_data, full, stat)``.
    """
    weather_query = """ SELECT * FROM weather_data_table; """
    record_query = """ SELECT * FROM running_data_table; """
    stat_query = """ SELECT date, city, COUNT(city) FROM running_data_table GROUP BY date,city; """

    con = psycopg2.connect(database=db_name, user=db_user,
                           host='localhost', password=db_pswd)
    try:
        # read database
        weather_data = pd.read_sql_query(weather_query, con)
        running_data = pd.read_sql_query(record_query, con)
        stat_data = pd.read_sql_query(stat_query, con)
    finally:
        con.close()

    stat = pd.merge(weather_data, stat_data, on=['date', 'city'])
    full = pd.merge(weather_data, running_data, on=['date', 'city'])
    full.to_sql('full_data_table', engine, if_exists='replace')
    return weather_data, running_data, full, stat
def __load_tables(db_path, pair_name, session):
    """
    Load one currency pair from every required table of the database.

    The TIME column is read from the first table listed in
    ``sqlite_master``; column ``pair_name`` is read from each table named in
    ``FxSingleCurrencyBroker.DB_TABLES`` (ordered by the dict values).

    Fixes: ``'TIME'`` is compared with ``!=`` instead of ``is not`` (identity
    of string literals is an implementation detail), the connection is
    closed even when the table check fails, and the column labels are built
    as ``['TIME'] + table keys`` so they always line up with the frames that
    were read (the previous labelling only lined up when ``'TIME'`` sorted
    first in ``DB_TABLES``).

    :param db_path: path to historical database
    :param pair_name: name of currency exchange pair
    :param session: passed through to ``__split_sessions``
    :return: data frame of time, prices and volume, sessions list
    :raises LookupError: a required table is missing from the database
    """
    table_names_pairs = sorted(FxSingleCurrencyBroker.DB_TABLES.items(),
                               key=operator.itemgetter(1))
    logging.info("Loading " + db_path)

    # connect to sqlite database
    con = sqlite3.connect(db_path)
    try:
        # fetch table names
        cursor = con.cursor()
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
        table_names = [c[0] for c in cursor.fetchall()]

        # check database
        data_keys = [key for key, _ in table_names_pairs if key != 'TIME']
        for key in data_keys:
            if key not in table_names:
                raise LookupError(
                    "Loaded database doesn't have required table: " + key)

        # read tables data to data frames (pandas)
        df_list = [pd.read_sql_query("SELECT TIME from " + table_names[0], con)]
        for key in data_keys:
            df_list.append(
                pd.read_sql_query("SELECT " + pair_name + " from " + key, con))
            logging.info(key + " has been read")
    finally:
        con.close()

    df = pd.concat(df_list, axis=1)
    df.columns = ['TIME'] + data_keys
    return df, FxSingleCurrencyBroker.__split_sessions(df, session)
def test_data():
    """Issue some simple queries to test whether tables exist.

    Counts the long trips (``trip_distance > 10``) and the expensive fares
    (``fare_amount > 30``) in the ``taxi`` database and prints each query
    followed by its result. The connection is closed even when a query
    fails.
    """
    print("Testing 2 queries!")
    dbname = 'taxi'
    username = '******'

    probes = (
        "SELECT COUNT(trip_distance) "+\
        " FROM taxi_trips WHERE trip_distance > 10;",
        "SELECT COUNT(fare_amount) "+\
        " FROM taxi_fares WHERE fare_amount > 30;",
    )

    # connect to the server to run the test SQL commands
    con = psycopg2.connect(database = dbname, user = username)
    try:
        for sql_query in probes:
            subdat = pd.read_sql_query(sql_query,con)
            print(sql_query)
            print(subdat.head())
    finally:
        con.close()
def get_lesson_info(self):
    '''
    Return one row per approved lesson plan of this subject.

    lesson_info: lesson_id, week, root_id, lesson_plan_id

    ``root_id`` is the level-1 ancestor of the lesson in ``book_hierarchy``.
    It is resolved by walking the hierarchy in ascending ``level`` order so
    that every parent is mapped before its children.

    Fixes: ``DataFrame.sort`` (removed from pandas) became ``sort_values``,
    the lesson-plan query binds its two values instead of formatting them
    into the SQL text, and ``root_id`` is looked up per id rather than
    through a row-wise ``apply``.

    :raises KeyError: a node references a parent that is not mapped.
    '''
    conn = self.conn_func()

    sql = ('select id s_id, parent_id, level '
           'from book_hierarchy where level > 0')
    A = pd.read_sql_query(sql, conn)
    A.sort_values(by='level', inplace=True)

    root_id_map = {}
    for _, row in A.iterrows():
        s_id, parent_id, level = row
        # iterrows may upcast the ids to float; normalise them
        s_id = int(s_id)
        level = int(level)
        if not pd.isnull(parent_id):
            parent_id = int(parent_id)
        if level == 1:
            root_id_map[s_id] = s_id
        else:
            root_id_map[s_id] = root_id_map[parent_id]

    sql = 'select id s_id, week from book_hierarchy where level = 4'
    lesson_info = pd.read_sql_query(sql, conn)
    lesson_info['root_id'] = lesson_info['s_id'].map(
        lambda s_id: root_id_map[s_id])
    lesson_info.rename(columns={'s_id':'lesson_id'}, inplace=True)

    sql = ('select id lesson_plan_id, lesson_id '
           'from lesson_plan where subject_id = %s '
           'and status = %s')
    lesson_plan = pd.read_sql_query(
        sql, conn, params=(self.subject_id, K_APPROVED))
    lesson_info = pd.merge(lesson_info, lesson_plan)
    return lesson_info
def get_data(limit=None, target='severity_final'):
    """Returns train test split of relevant data from database.

    Loads the feature columns from table ``final``, runs
    ``create_features`` on a copy and splits 75/25 with a fixed random
    state.

    ``limit`` is converted with ``int()`` before it is placed in the SQL
    text, so only a number can reach the query.

    :param limit: optional maximum number of rows to load; falsy = all.
    :param target: name of the label column popped from the features.
    :return: ``X_train, X_test, y_train, y_test`` from ``train_test_split``.
    :raises ValueError: ``limit`` is truthy but not an integer value.
    """
    print('{}: connecting to database'.format(datetime.datetime.now()))
    conn = connect_db()

    print('{}: loading data from database'.format(datetime.datetime.now()))
    col_list = """
        assigned_to_init, cc_init,
        product_init, version_init,
        component_init, op_sys_init, reporter_bug_cnt,
        desc_init, short_desc_init,
        priority_final, severity_final
        """
    if limit:
        query = 'select {} from final limit {}'.format(col_list, int(limit))
    else:
        query = 'select {} from final'.format(col_list)
    df_original = pd.read_sql_query(query, con=conn)

    df = df_original.copy(deep=True)

    # Feature engineering
    print('{}: feature engineering {}'.format(datetime.datetime.now(), target))
    df = create_features(df, target=target)

    y_all = df.pop(target)
    X_all = df

    return train_test_split(X_all, y_all, test_size=0.25, random_state=42)
def get_courses(con, filtered=True):
    """Return the courses with a keyword, problem count and user count.

    Columns added to ``course_id`` / ``title``:

    * ``kw``: first whitespace-separated word of the title;
    * ``pblcnt``: ``len(select_pbls(con, course_id=...))``;
    * ``usrcnt``: number of distinct "good" users (``get_good_users``)
      enrolled in the course; 0 when none.

    The result is indexed by ``course_id``. ``filtered`` is not used by the
    visible code.

    The enrolment query binds the course ids as one array parameter instead
    of splicing ``str(list)`` into the SQL, and the problem counts are built
    in one step instead of growing a Series element by element.

    NOTE(review): the ``%s`` placeholder assumes ``con`` is a psycopg2
    connection, as used by the other loaders of this schema - confirm.

    :param con: open DB connection.
    :return: DataFrame described above.
    """
    df = pd.read_sql_query("select course_id, title from courses;", con=con)
    kw = df.title.apply(lambda x: x.split()[0])
    kw.name = "kw"
    df = df.join(kw)

    cusers = pd.read_sql_query(
        "select user_id, course_id from coursesusers "
        "where course_id = any(%s);",
        con=con, params=(df.course_id.tolist(),))
    cusers = cusers.drop_duplicates().dropna()
    cusers = cusers[cusers.user_id.isin(get_good_users(con))]
    usrcnt = cusers.course_id.value_counts()
    usrcnt.name = 'usrcnt'
    print(usrcnt.describe())

    df.set_index(df.course_id, inplace=True)
    pblcnt = pd.Series(
        {c: len(select_pbls(con, course_id=c)) for c in df.course_id},
        name='pblcnt', dtype=float)
    df = df.join(pblcnt)
    df.pblcnt = df.pblcnt.fillna(0)
    df = df.join(usrcnt)
    df.usrcnt = df.usrcnt.fillna(0)
    return df
def to_dataframe(self, timerange=None):
    """Read this object's table (``self.name``) into a DataFrame.

    * ``timerange=None``: every row, ordered by and indexed on
      ``Timestamp``.
    * one element ``[t]``: rows with ``int(t) <= Timestamp < int(t) + 1``.
    * two elements ``[t1, t2]``: rows with ``t1 <= Timestamp < t2``.

    For the ranged forms ``Timestamp`` stays a regular column, as before.
    A ``timerange`` of any other length used to fail with a ``NameError``
    and is now rejected explicitly.

    :param timerange: ``None`` or a sequence of one or two timestamps.
    :return: the selected rows.
    :raises ValueError: ``timerange`` has neither one nor two elements.
    """
    if timerange is None:
        print("select all")
        select_syntax = "SELECT * from %s ORDER BY Timestamp" % self.name
        df = pd.read_sql_query(select_syntax, self.conn, index_col=["Timestamp"])
    else:
        if len(timerange) == 1:
            timestamp1 = int(timerange[0])
            timestamp2 = timestamp1 + 1
        elif len(timerange) == 2:
            timestamp1 = timerange[0]
            timestamp2 = timerange[1]
        else:
            raise ValueError(
                "timerange must hold one or two timestamps, got %d"
                % len(timerange))
        select_syntax = "SELECT * FROM %s WHERE Timestamp>=? AND Timestamp<? ORDER BY Timestamp" % self.name
        print("params= %s %s" % (timestamp1, timestamp2))
        df = pd.read_sql_query(select_syntax, self.conn, params=[timestamp1, timestamp2])
    print(select_syntax)
    return df
def get_publish_articles(self):
    """Fetch the published articles of every not-yet-archived big-V user.

    Candidates are the users with more than 1000 and fewer than 10001 fans
    that do not appear in the archive table yet. For each of them
    ``self.get_publish_articles_by_id`` is called; a failure is recorded in
    the unfinished-articles table (user id, time, reason), printed, and the
    loop continues with the next user.

    Ported to syntax that runs on Python 2 and 3; ``Series.get_values``
    (removed from pandas) became ``.values``.
    """
    t1 = time.time()
    print('begin query...')
    sql1 = 'select distinct user_id from %s where fans_count > 1000 and fans_count < 10001 ' % (big_v_table_mysql)
    sql2 = 'select distinct user_id from %s' % archive_table_mysql
    df1 = pd.read_sql_query(sql1, engine)
    df2 = pd.read_sql_query(sql2, engine)
    candidates = df1['user_id'].values
    archived = df2['user_id'].values
    # the set difference is done client side instead of with a NOT IN query
    user_ids = list(set(candidates).difference(archived))
    t2 = time.time()
    print('query mysql by join cose: %s s' % (t2 - t1))

    for user_id in user_ids:
        try:
            self.get_publish_articles_by_id(user_id)
        except Exception as e:
            se = Series([user_id, GetNowTime(), str(e)], index=['user_id', 'fail_time', 'fail_reason'])
            df = DataFrame(se).T
            df.to_sql(unfinish_arcticle_table_mysql, engine, if_exists='append', index=False)
            print(e)
def find_companies_by_name(self, name):
    """Return up to 20 ``Minjust2018`` rows whose ``FullNameRu`` contains ``name``.

    The result is a DataFrame with the columns ``FullNameRu`` and ``Founders``.
    """
    # NOTE(review): ``name`` is spliced into the SQL text unescaped, so a
    # quote in it changes the statement. Switching to bound parameters needs
    # the placeholder style of ``self.db_connection``, which is not visible
    # here -- confirm the driver and parameterize.
    like_clause = "LIKE '%" + name + "%'"
    query = ("SELECT FullNameRu, Founders FROM Minjust2018 WHERE FullNameRu "
             + like_clause + " LIMIT 0, 20")
    return pd.read_sql_query(query, self.db_connection)
def find_companies_by_founder(self, founder, nosearch):
    """List up to 10 company names whose ``Founders`` field contains ``founder``.

    The company named ``nosearch`` is excluded.  The result is a list of
    one-element lists, one per matching ``FullNameRu``.
    """
    # NOTE(review): both arguments are concatenated into the SQL text
    # unescaped; parameterize once the driver's placeholder style is confirmed.
    query = ("SELECT FullNameRu FROM Minjust2018 WHERE Founders LIKE '%"
             + founder
             + "%' AND FullNameRu <> '"
             + nosearch
             + "' LIMIT 0, 10")
    matches = pd.read_sql_query(query, self.db_connection)
    return matches.values.tolist()
from scipy.stats import skew import pgeocode # In[102]: from sklearn.model_selection import cross_val_score, train_test_split from sklearn.preprocessing import StandardScaler # In[103]: db = sqlite3.connect('home_sales.db') df = pd.read_sql_query('SELECT * FROM sales;',db) # In[104]: # dropping null values df = df.dropna() # In[105]: # feature engineering place_name from zipcode nomi = pgeocode.Nominatim('us') for index, row in df.iterrows():
def build_ensembl_genes(cursor, conn):
    """Build the gene lookup from the public Ensembl MySQL server.

    Queries the exons of each gene's canonical transcript, collapses them to
    one record per gene, writes that lookup as JSON to ``OUTGENENAME`` and
    also stores the ``chr``/``start``/``end`` columns in the ``gene`` table of
    our sqlite database (``conn``) so later processing can happen there.
    ``cursor`` is used to create an index on ``gene.gene_id``.
    """
    # connect to Ensembl MySQL public server
    core = create_engine(
        'mysql+mysqldb://[email protected]/homo_sapiens_core_92_38'
    )
    q = (
        " select et.exon_id, et.transcript_id, g.stable_id as gene_id,"
        " g.description, r.name as chr, g.seq_region_start as start,"
        " g.seq_region_end as end, e.seq_region_start as exon_start,"
        " e.seq_region_end as exon_end, t.seq_region_strand as fwdstrand"
        " from exon_transcript et, exon e, gene g, transcript t, seq_region r"
        " where g.canonical_transcript_id = et.transcript_id"
        " and g.seq_region_id = r.seq_region_id"
        " and r.coord_system_id = 4"
        " and r.name NOT RLIKE 'CHR'"
        " and et.transcript_id = t.transcript_id"
        " and e.exon_id =et.exon_id "
    )
    start_time = time.time()
    exon_df = pd.read_sql_query(q, core, index_col='exon_id')
    exon_df['exons'] = list(zip(exon_df.exon_start, exon_df.exon_end))
    exon_df['fwdstrand'] = exon_df['fwdstrand'].map({1: True, -1: False})

    def _tss(row):
        # transcription start site: gene start on the forward strand,
        # gene end otherwise
        if row['fwdstrand']:
            return row['start']
        return row['end']

    exon_df['tss'] = exon_df.apply(_tss, axis=1)
    keepcols = [
        'gene_id', 'description', 'tss', 'chr', 'start', 'end', 'fwdstrand'
    ]
    exons_per_gene = exon_df.groupby(keepcols)['exons'].apply(list)
    genes = pd.DataFrame(exons_per_gene).reset_index()
    genes.set_index('gene_id', inplace=True)
    print(genes['chr'].value_counts())
    genes.to_json(OUTGENENAME, orient='index')
    elapsed = time.time() - start_time
    print("--- Genes table completed in %s seconds ---" % (elapsed))
    genes.loc[:, ('chr', 'start', 'end')].to_sql('gene', conn, if_exists='replace')
    # add indices
    try:
        cursor.execute(''' CREATE INDEX ix_gene_gene_id ON gene (gene_id); ''')
    except sqlite3.OperationalError as operror:
        # creating the index fails when it already exists; report and go on
        print(operror)
def get_user_buglist(self):
    """Return the ``id`` and ``title`` of every active bug of type ``user``."""
    # Defect type "user": defects submitted from the operations side.
    active_user_bugs = "select id,title from zt_bug where status='active' and type='user'"
    return pd.read_sql_query(active_user_bugs, self.zentaodb)
def get_data2(self, query):
    """Run ``query`` against the sqlite database at ``self.db``.

    The ``fecha`` column of the result is converted from ``dd/mm/YYYY`` text
    to pandas datetimes.  The connection opened for the query is always
    closed, including when the query fails.

    Returns
    -------
    pandas.DataFrame
        The query result with ``fecha`` parsed.
    """
    cnx = sqlite3.connect(self.db)
    try:
        data = pd.read_sql_query(query, cnx)
    finally:
        cnx.close()
    data['fecha'] = pd.to_datetime(data['fecha'], format="%d/%m/%Y")
    return data
from sklearn.ensemble import GradientBoostingRegressor import skopt import pickle from opioid_functions import * import os os.chdir('/Users/zach.olivier/Desktop/GTX/CSE_6242/course_project') # define the file sqlite_file = 'DVADB/DVADB.db' # open a connection conn = sqlite3.connect(sqlite_file) # read from the main table df = pd.read_sql_query("SELECT * FROM npi_summary", conn) # take a sample for analysis / modeling df_model = df.sample(frac=.3) # quick summary print(f'dataframe dimensions: {df_model.shape}') print(f'column names: {df_model.columns}') print(f' column types: {df_model.dtypes}') # set index to npi df_model = df_model.set_index('npi') # columns that we cannot use for modeling drop_cols = [ 'nppes_provider_last_org_name',
# Visual separator printed between the sections of the report.
_section_rule = "\n" + "=" * 50 + "\n"
print(_section_rule)

# Create a sql db from adult dataset and name it sqladb
sqladb = db.connect("./adult_data.db")
cursor = sqladb.cursor()
df.to_sql("adult_data", sqladb, if_exists="replace", index=False)

# 1. Select 10 records from the adult sqladb
print("Select 10 records\n")
query = "SELECT * FROM adult_data LIMIT 10"
first_ten = pd.read_sql_query(query, sqladb)
print(first_ten)
print(_section_rule)

# 2. Show me the average hours per week of all men who are working in
#    private sector
print("Show me the average hours per week of all men who are working in private sector\n")
query = "SELECT AVG(hours_per_week) AS average_hours_per_week FROM adult_data WHERE sex = 'Male' AND workclass = 'Private'"
average_hours = pd.read_sql_query(query, sqladb)
print(average_hours)
print(_section_rule)

# 3. Show me the frequency table for education, occupation and relationship,
#    separately
print("Show me the frequency table for education\n")
query = "SELECT education, COUNT(education) AS frequency FROM adult_data GROUP BY education ORDER BY frequency DESC"
import pandas as pd
import sqlite3 as sql

# Directory in which the sqlite3 database is installed.
SQLiteDir = '/Users/griceldacalzada/Documents/Python/TestCarrefoursFeux/SQlite'

# Connect to the Hermes sqlite3 database.
conn = sql.connect(SQLiteDir + '/Hermes2018.db')
try:
    cur = conn.cursor()

    # List of the b3s tables in the database.
    B3S_DF = pd.read_sql_query(
        "SELECT name FROM sqlite_master WHERE type='table';", conn)
    B3S_list = list(B3S_DF['name'])

    for b3s in B3S_list:
        # Named ``delete_sql`` so the ``sql`` module alias is not overwritten.
        delete_sql = "DELETE FROM {b3s} WHERE Jour NOT BETWEEN '2018-01-01' AND '2019-01-01'; ".format(
            b3s=b3s)
        cur.execute(delete_sql)
        print(delete_sql)

    conn.commit()
    # Reclaim the space freed by the deletes.
    cur.execute("VACUUM;")
finally:
    # Close the connection even when one of the statements fails.
    conn.close()
def to_sql(
        df: pd.DataFrame,
        table_name: str,
        creds: SqlCreds,
        sql_type: str = "table",
        schema: str = "dbo",
        index: bool = True,
        if_exists: str = "fail",
        batch_size: int = None,
        debug: bool = False,
        bcp_path: str = None,
):
    """
    Writes the pandas DataFrame to a SQL table or view.

    Will write all columns to the table or view. If the destination table/view
    doesn't exist, will create it. Assumes the SQL table/view has the same
    number, name, and type of columns. To only write parts of the DataFrame,
    filter it beforehand and pass that to this function. Unlike the pandas
    counterpart, if the DataFrame has no rows, nothing will happen.

    The DataFrame is first written to a temporary CSV file, a BCP format file
    is generated next to it, and the ``bcp`` helper then loads the CSV.

    Parameters
    ----------
    df : pandas.DataFrame
    table_name : str
        Name of SQL table or view, without the schema
    creds : bcpandas.SqlCreds
        The credentials used in the SQL database.
    sql_type : {'table'}, can only be 'table'
        The type of SQL object of the destination.
    schema : str, default 'dbo'
        The SQL schema.
    index : bool, default True
        Write DataFrame index as a column. Uses the index name as the column
        name in the table.
    if_exists : {'fail', 'replace', 'append'}, default 'fail'
        How to behave if the table already exists.

        * fail: Raise a BCPandasValueError.
        * replace: Drop the table before inserting new values.
        * append: Insert new values to the existing table. Matches the
          dataframe columns to the database columns by name. If the database
          table exists then the dataframe cannot have new columns that aren't
          in the table, but conversely table columns can be missing from the
          dataframe.
    batch_size : int, optional
        Rows will be written in batches of this size at a time. By default,
        BCP sets this to 1000.
    debug : bool, default False
        If True, will not delete the temporary CSV and format files, and will
        output their location.
    bcp_path : str, default None
        The full path to the BCP utility, useful if it is not in the PATH
        environment variable

    Raises
    ------
    BCPandasValueError
        On duplicate column names, an invalid ``batch_size``, extra dataframe
        columns when appending to an existing table, or an existing table
        with ``if_exists='fail'``.
    AssertionError
        If ``sql_type`` is not the table type or ``if_exists`` is not one of
        the accepted options.
    """
    # validation
    if df.shape[0] == 0 or df.shape[1] == 0:
        return
    assert sql_type == TABLE, "only supporting table, not view, for now"
    assert if_exists in IF_EXISTS_OPTIONS
    if df.columns.has_duplicates:
        raise BCPandasValueError(
            "Columns with duplicate names detected, SQL requires that column names be unique. "
            f"Duplicates: {df.columns[df.columns.duplicated(keep=False)]}")
    # TODO diff way to implement? could be big performance hit with big dataframe
    if index:
        df = df.copy(deep=True).reset_index()
    delim = get_delimiter(df)
    quotechar = get_quotechar(df)
    if batch_size is not None:
        if batch_size == 0:
            raise BCPandasValueError("Param batch_size can't be 0")
        if batch_size > df.shape[0]:
            raise BCPandasValueError(
                "Param batch_size can't be larger than the number of rows in the DataFrame"
            )
    # save to temp path
    # NOTE(review): the temp paths are obtained before the try/finally below,
    # so an exception raised before that point (e.g. the extra-columns check)
    # skips the cleanup -- confirm whether get_temp_file creates the file.
    csv_file_path = get_temp_file()
    # replace bools with 1 or 0, this is what pandas native does when writing to SQL Server
    df.replace({
        True: 1,
        False: 0
    }).to_csv(
        path_or_buf=csv_file_path,
        sep=delim,
        header=False,
        index=False,  # already set as new col earlier if index=True
        quoting=csv.QUOTE_MINIMAL,  # pandas default
        quotechar=quotechar,
        line_terminator=NEWLINE,
        doublequote=True,
        escapechar=None,  # not needed, as using doublequote
    )
    logger.debug(f"Saved dataframe to temp CSV file at {csv_file_path}")
    # build format file
    fmt_file_path = get_temp_file()
    sql_item_exists = _sql_item_exists(sql_type=sql_type, schema=schema, table_name=table_name, creds=creds)
    cols_dict = None  # for mypy
    if if_exists == "append":
        # get dict of column names -> order of column
        # NOTE(review): schema and table_name are formatted straight into the
        # SQL text; they are assumed to be trusted identifiers.
        cols_dict = dict(
            pd.read_sql_query(
                """ SELECT COLUMN_NAME, ORDINAL_POSITION FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '{_schema}' AND TABLE_NAME = '{_tbl}' """.format(_schema=schema, _tbl=table_name),
                creds.engine,
            ).values)
        # check that column names match in db and dataframe exactly
        if sql_item_exists:
            # the db cols are always strings, unlike df cols
            extra_cols = [
                str(x) for x in df.columns if str(x) not in cols_dict.keys()
            ]
            if extra_cols:
                raise BCPandasValueError(
                    f"Column(s) detected in the dataframe that are not in the database, "
                    f"cannot have new columns if `if_exists=='append'`, "
                    f"the extra column(s): {extra_cols}")
    # cols_dict stays None unless appending, in which case it carries the
    # database column order into the format file
    fmt_file_txt = build_format_file(df=df, delimiter=delim, db_cols_order=cols_dict)
    with open(fmt_file_path, "w") as ff:
        ff.write(fmt_file_txt)
    logger.debug(f"Created BCP format file at {fmt_file_path}")
    try:
        # create (or refuse to overwrite) the destination before loading
        if if_exists == "fail":
            if sql_item_exists:
                raise BCPandasValueError(
                    f"The {sql_type} called {schema}.{table_name} already exists, "
                    f"`if_exists` param was set to `fail`.")
            else:
                _create_table(schema=schema, table_name=table_name, creds=creds, df=df, if_exists=if_exists)
        elif if_exists == "replace":
            _create_table(schema=schema, table_name=table_name, creds=creds, df=df, if_exists=if_exists)
        elif if_exists == "append":
            if not sql_item_exists:
                _create_table(schema=schema, table_name=table_name, creds=creds, df=df, if_exists=if_exists)
        # BCP the data in
        bcp(
            sql_item=table_name,
            direction=IN,
            flat_file=csv_file_path,
            format_file_path=fmt_file_path,
            creds=creds,
            sql_type=sql_type,
            schema=schema,
            batch_size=batch_size,
            bcp_path=bcp_path,
        )
    finally:
        # temp files are removed whether or not the load succeeded, unless
        # the caller asked to keep them for debugging
        if not debug:
            logger.debug(f"Deleting temp CSV and format files")
            os.remove(csv_file_path)
            os.remove(fmt_file_path)
        else:
            logger.debug(
                f"`to_sql` DEBUG mode, not deleting the files. CSV file is at "
                f"{csv_file_path}, format file is at {fmt_file_path}")
import os

from nltk.corpus import stopwords

stop_words = stopwords.words('english')

# ### Import Dataset

# In[ ]:

# Credentials are read from the environment instead of being committed to the
# source: set IMAP_DB_USER and IMAP_DB_PASSWORD before running this cell.
conn = pymysql.connect(host='kpmg-server.mysql.database.azure.com',
                       port=3306,
                       user=os.environ['IMAP_DB_USER'],
                       passwd=os.environ['IMAP_DB_PASSWORD'],
                       db='imap',
                       charset='utf8mb4')
df = pd.read_sql_query("SELECT * FROM imap.data", conn)

# In[4]:

# Convert body to list
data = df.content.values.tolist()

# ### Tokenize words and Clean-up text

# In[5]:


def sent_to_words(sentences):
    """Yield the token list of each sentence in ``sentences``.

    Every sentence is converted with ``str`` and tokenized by
    ``gensim.utils.simple_preprocess``.
    """
    # tokenize the sentences
    for sentence in sentences:
        # deacc=True removes punctuations
        yield (gensim.utils.simple_preprocess(str(sentence), deacc=True))
'POSTGRES_PORT': 5432, # change to your port 'POSTGRES_USERNAME': '******', # change to your username 'POSTGRES_PASSWORD': '******', # change to your password 'POSTGRES_DBNAME': 'test_surfers_bible_db' } # change to your db name # create connection and cursor conn = ps.connect(host=credentials['POSTGRES_ADDRESS'], database=credentials['POSTGRES_DBNAME'], user=credentials['POSTGRES_USERNAME'], password=credentials['POSTGRES_PASSWORD'], port=credentials['POSTGRES_PORT']) cur = conn.cursor() beach_df = pd.read_sql_query("SELECT * FROM BEACH_TABLE;", conn) conn.close() cur.close() # beach_df.head() # # API KEYS: # In[27]: SG_API_KEY_DICT = {} SG_API_KEY_DICT[ '1'] = "8aab844c-8cfd-11ea-9f57-0242ac130002-8aab8500-8cfd-11ea-9f57-0242ac130002" SG_API_KEY_DICT[ '2'] = "ca3fa016-8cfd-11ea-ad84-0242ac130002-ca3fa0c0-8cfd-11ea-ad84-0242ac130002"
#     password="******",
#     host="ec2-54-227-241-179.compute-1.amazonaws.com",
#     port="5432",
#     database="d46q2igt2d4vbg",
#     sslmode="require")

if (conn):
    logger.info("Connection Successful!")
else:
    logger.info("Connection Error!")

logger.info("Get all the Community Partners from the Database")

# Get all the Community Partners from the database
dfCommunity = pd.read_sql_query(
    "SELECT pc.name as Community_Partner,pc.address_line1, pc.address_line2, pc.city, pc.state,pc.zip, "
    "hm.mission_name ,p.mission_type, pc.legislative_district,pc.median_household_income, "
    "pc2.community_type,pc.website_url "
    "FROM partners_communitypartner PC "
    "join partners_communitypartnermission p on PC.id = p.community_partner_id "
    "join home_missionarea hm on p.mission_area_id = hm.id "
    "join partners_communitytype pc2 on PC.community_type_id = pc2.id",
    con=conn)

if len(dfCommunity) == 0:
    logger.critical("No Community Partners fetched from the Database on " + str(currentDT))
else:
    # The separating space was missing, which logged e.g. "12Community Partners".
    logger.info(repr(len(dfCommunity)) + " Community Partners are in the Database on " + str(currentDT))

# Get all the Projects from the database and get their Campus Partners,
# Community Partners associated
dfProjects = pd.read_sql_query(
    "SELECT project_name,academic_year , pc2.name as campus_partner ,um.college_name,"
    "ppcp.name as community_partner "
    "FROM projects_project P "
    "join projects_academicyear pa on P.academic_year_id = pa.id "
    "join projects_projectcampuspartner pc on P.id = pc.project_name_id "
    "join projects_projectcommunitypartner ppc on P.id = ppc.project_name_id "
    "join partners_communitypartner ppcp on ppc.community_partner_id = ppcp.id "
    "join partners_campuspartner pc2 on pc.campus_partner_id= pc2.id "
    "join university_college um on um.id = pc2.college_name_id "
    "WHERE p.id IN (SELECT project_name_id FROM projects_projectcommunitypartner)",
    con=conn)

if len(dfProjects) == 0:
    logger.critical("No Projects are fetched from the Database as of " + str(currentDT))
else:
    # Same missing-space fix as above.
    logger.info(repr(len(dfProjects)) + " Projects are in the Database as of " + str(currentDT))

conn.close()
def read_data():
    """Return the whole ``reports`` table as a DataFrame.

    If the query raises, an error message string is returned instead of a
    DataFrame; callers have to check the type of the result.
    """
    conn = db_connection()
    try:
        reports = pd.read_sql_query("SELECT * FROM reports", conn)
    except Exception as e:
        return "No table with that name found, Error: {}".format(e)
    return reports
def get_data():
    """Return the ``hawaii`` table of the ``Project2`` sqlite database as JSON.

    The table is serialized with ``orient='table'``.  The connection is
    closed once the table has been read, even if the query fails.

    Returns
    -------
    str
        The JSON document.
    """
    cnx = sqlite3.connect('Project2')
    try:
        w2 = pd.read_sql_query("SELECT * FROM hawaii", cnx)
    finally:
        cnx.close()
    return w2.to_json(orient='table')
def sample_range_date_time_table(self, club, start, end, step=0):
    """Return the ``club_tab`` rows of ``club`` with ``data_time`` in [start, end].

    ``self.open_connect()`` is called first and the query runs on
    ``self.connect``.  ``step`` is accepted but not used by this method.
    """
    self.open_connect()
    bound_values = (club, start, end)
    return pd.read_sql_query(
        "SELECT * FROM club_tab WHERE (club = ?) AND (data_time BETWEEN ? AND ?)",
        self.connect,
        params=bound_values,
    )
else: print('Not CAT4 or DRT') return result #load data from postgres #postgres_str='postgres://*****:*****@46.101.58.30:5432/academic_tracker_production' #product postgres_str = 'postgres://*****:*****@ec2-18-203-229-185.eu-west-1.compute.amazonaws.com:5432/d9k69uia8l4iu' cnx = create_engine(postgres_str) df_schools = pd.read_sql_query('''SELECT id, name FROM schools''', cnx) df_schools_ = df_schools[(df_schools['name'].str.contains("Test*") == False) & (df_schools['name'].str.contains("TEST*") == False)] #df_schools_=df_schools_[df_schools_['name'].str.contains("TEST*")==False] #df2=pd.read_sql_query('''SELECT id, student_id, academical_year, calendar_year FROM student_years''', cnx) #df1=pd.read_sql_query('''SELECT id, subject, student_year_id, score, title, examined_at, # expectation FROM results''', cnx) ### ###df1=part('0', '50000') ###df1=df1.rename(columns = {'id': 'idu'}) #df=pd.read_sql_query('''SELECT results.id, results.subject, results.score, results.expectation, student_years.student_id, student_years.academical_year, student_years.calendar_year FROM results INNER JOIN student_years ON results.student_year_id = student_years.id ORDER BY results.id''', cnx) #df3=pd.read_sql_query('''SELECT id, `, gender, year_of_entry, name FROM students''', cnx)
#displaying map map # # Fetching Data for Count of Applicants basis City # In[3]: import connectors import pandas as pd conn = connectors.db_conn() cur = conn.cursor() sql_query = "select count(apid) as applicants , case when current_state = 'DELHI' then 'Delhi' else current_city end as region from applicants group by 2 order by 1 desc" db_data = pd.read_sql_query(sql_query, conn) print(db_data) ## Always close the connection conn = None # # Plotting fetched data on India Map and applying visualisation effects # In[4]: folium.Choropleth(geo_data= 'India.geojson', #loading geojson file uploaded data=db_data, # my dataset columns=['region', 'applicants'], # region is here for matching the geojson regions, applicants is the column that changes the color of regions key_on= 'feature.properties.NAME_2', # this path contains region in str type, this region should match with our region column fill_color='BuPu',
# Bounding box used to filter the posts:
# (min latitude, max latitude, min longitude, max longitude).
geo_box = (18.005611, 48.987386, -124.626080, -62.361014)

# connect to server
engine = sqlalchemy.create_engine('mysql://%(user)s:%(pass)s@%(host)s' % config.database)
engine.execute('use %s' % config.database['name'])  # select db

# Only posts from the last 12 weeks are loaded.
recent_data = (datetime.now() - timedelta(weeks=12)).strftime("%Y-%m-%d")
sql_query = (
    "SELECT post_date, latitude, longitude, image_url, likes, caption, post_url"
    " FROM instagram"
    " WHERE post_date > '%s'"
    " AND latitude between %s AND %s"
    " AND longitude between %s AND %s"
    " ORDER BY post_date DESC, likes DESC "
) % (recent_data, geo_box[0], geo_box[1], geo_box[2], geo_box[3])
# The query selects ``post_date``; the previous ``parse_dates=['date']`` named
# a column that is not in the result, so no date column was ever parsed.
posts = pd.read_sql_query(sql_query, engine, parse_dates=['post_date'])
n_points = posts.shape[0]

# Keep only posts that have a caption and renumber the rows.  The result of
# ``reset_index`` has to be assigned; it does not modify ``posts`` in place.
posts = posts[posts['caption'].notnull()]
posts = posts.reset_index(drop=True)

sentences = []  # Initialize an empty list of sentences
print("Parsing sentences from training set")
for caption in posts['caption']:
    sentences += caption_to_sentences(caption, tokenizer)

# Set values for various parameters
num_features = 400  # Word vector dimensionality
min_word_count = 30  # Minimum word count
num_workers = 4  # Number of threads to run in parallel
print statement indicating successful write to db ''' engine = create_engine( 'postgres://*****:*****@localhost:5432/bloodmoneydb') df.to_sql('model_input_tbl_raw', engine, if_exists='fail') # if need to recreate change this to 'replace' return 'Data successfully written to database' if __name__ == "__main__": conn = psycopg2.connect(host="localhost", database="bloodmoneydb", user="******", password="******") sql_query = """ SELECT * FROM joined_fight_event_fighters_data """ df = pd.read_sql_query(sql_query, con=conn) df = calculate_age_of_fighter(df=df) df = fighter_home_court(df) df = calculate_pct_of_possible_rounds_fought(df=df) df = calculate_win_streak(df=df) df = transform_to_wide_by_fight(df) write_data_to_tbl(df) print('Success')
def power_ice(conn, start, end):
    """Build the "POWER ICE" bokeh figure for the window ``start``..``end``.

    Reads ``SE_ZIMIRICEA_IDLE`` and ``SE_ZIMIRICEA_HV_ON`` from ``conn`` for
    the window (converted to MJD), scales the ``average`` column by 30, and
    plots both series as scatter + line with ``deviation`` error bars, a limit
    box and a hover tool.

    Returns the configured figure.
    """
    # query data from database
    start_str = str(Time(start).mjd)
    end_str = str(Time(end).mjd)

    def _load(table):
        # one telemetry table, restricted to the requested MJD window
        statement = ("SELECT * FROM " + table + " WHERE start_time BETWEEN "
                     + start_str + " AND " + end_str + " ORDER BY start_time")
        return pd.read_sql_query(statement, conn)

    _idle = _load("SE_ZIMIRICEA_IDLE")
    _hv = _load("SE_ZIMIRICEA_HV_ON")

    # presumably ``average`` is a current, so current * voltage gives the
    # power shown on the y axis -- TODO confirm
    voltage = 30
    for frame in (_idle, _hv):
        frame['average'] *= voltage
    for frame in (_idle, _hv):
        mjd_times = Time(frame['start_time'], format="mjd")
        frame['start_time'] = pd.to_datetime(mjd_times.datetime)

    # set column data source
    idle = ColumnDataSource(_idle)
    hv = ColumnDataSource(_hv)

    # create a new plot with a title and axis labels
    p = figure(tools="pan,wheel_zoom,box_zoom,reset,save",
               toolbar_location="above",
               plot_width=1120,
               plot_height=500,
               y_range=[5, 14],
               x_axis_type='datetime',
               output_backend="webgl",
               x_axis_label='Date',
               y_axis_label='Power (W)')
    p.grid.visible = True
    p.title.text = "POWER ICE"
    pf.add_basic_layout(p)
    pf.add_limit_box(p, 6, 8, alpha=0.1, color="green")

    # add a line renderer with legend and line thickness
    scat1 = p.scatter(x="start_time", y="average", color='orange',
                      legend="Power idle", source=idle)
    scat2 = p.scatter(x="start_time", y="average", color='red',
                      legend="Power hv on", source=hv)
    p.line(x="start_time", y="average", color='orange',
           legend="Power idle", source=idle)
    p.line(x="start_time", y="average", color='red',
           legend="Power hv on", source=hv)

    def _error_bars(frame):
        # one vertical segment per row: average +/- deviation at start_time
        xs = [(item['start_time'], item['start_time'])
              for _, item in frame.iterrows()]
        ys = [(item['average'] - item['deviation'],
               item['average'] + item['deviation'])
              for _, item in frame.iterrows()]
        return xs, ys

    # generate error bars
    err_xs_hv, err_ys_hv = _error_bars(_hv)
    err_xs_idle, err_ys_idle = _error_bars(_idle)

    # plot them
    p.multi_line(err_xs_hv, err_ys_hv, color='red', legend='Power hv on')
    p.multi_line(err_xs_idle, err_ys_idle, color='orange', legend='Power idle')

    # activate HoverTool for scatter plot
    hover_tool = HoverTool(
        tooltips=[
            ('count', '@data_points'),
            ('mean', '@average'),
            ('deviation', '@deviation'),
        ],
        mode='mouse',
        renderers=[scat1, scat2])
    p.tools.append(hover_tool)

    p.legend.location = "bottom_right"
    p.legend.click_policy = "hide"
    return p
def exec_query_file(path: str):
    """Run the query loaded from ``path`` and return the result as a DataFrame.

    The connection comes from ``open_db()`` and the query text from
    ``import_query(path)``.
    """
    conn = open_db()
    return pd.read_sql_query(import_query(path), conn)
#!/usr/bin/env python
import sqlite3
import pandas as pd

# Columns copied verbatim from the source SPECIESDB table, in insert order.
COPIED_COLUMNS = (
    "FILENAME",
    "FILEPATH",
    "chromosome_count",
    "avg_length_chromosomes",
    "max_length_chromosomes",
    "min_length_chromosomes",
    "contig_count",
    "avg_length_contig",
    "max_length_contig",
    "min_length_contig",
    "plasmid_count",
    "avg_length_plasmids",
    "max_length_plasmids",
    "min_length_plasmids",
)

move_from = sqlite3.connect("/u/home/c/cloeffle/scratch/sql/bacteria_data.db")
move_to = sqlite3.connect("/u/home/c/cloeffle/scratch/sql/new_bacteria_stats.db")
try:
    cur = move_to.cursor()

    select_sql = ("SELECT " + ", ".join(COPIED_COLUMNS)
                  + " FROM SPECIESDB WHERE DBNAME='ENSEMBL'")
    frame = pd.read_sql_query(select_sql, move_from)

    # One placeholder per copied column plus one for the DBNAME tag.
    insert_sql = ("INSERT INTO SPECIESDB (" + ", ".join(COPIED_COLUMNS)
                  + ", DBNAME) VALUES ("
                  + ", ".join("?" * (len(COPIED_COLUMNS) + 1)) + ")")
    # The SELECT fixes the column order, so plain tuples line up with the
    # INSERT; every copied row is tagged as coming from ENSEMBL.
    rows = [values + ("ENSEMBL",)
            for values in frame.itertuples(index=False, name=None)]
    cur.executemany(insert_sql, rows)

    move_to.commit()
    cur.close()
finally:
    # Release both database files even when the copy fails part-way.
    move_from.close()
    move_to.close()
def query_events(self, task, sql):
    """Run ``sql`` against the sqlite file produced for ``task``.

    ``self.convert_lhe(task)`` supplies the file name; it is turned into a
    ``sqlite:///`` URL and handed to pandas together with the query.
    """
    database_url = 'sqlite:///' + self.convert_lhe(task)
    return pd.read_sql_query(sql, database_url)
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 3 10:32:44 2016

@author: wayne
"""
import pandas as pd
import sqlite3

# Read sqlite query results into a pandas DataFrame
conn = sqlite3.connect("babysleep_test.db")
try:
    data = pd.read_sql_query("SELECT * from sleep_data", conn)
finally:
    # close the connection even when the query fails
    conn.close()

# verify that result of SQL query is stored in the dataframe
# (function-call form of print runs on both Python 2 and Python 3)
print(data.head())
import pandas as pd
import sqlite3
import json
from collections import defaultdict  # was used below without being imported

RASA_DB_FILE = "rasa.db"
RASA_ANA_FILE = "cleanRasa.db"

# Read sqlite query results into a pandas DataFrame
con = sqlite3.connect(RASA_DB_FILE)
conversation_data = pd.read_sql_query("SELECT data FROM conversation_event", con)

# Divide the conversation into sessions
# inactivity period in the units of seconds
inactivity_period_allowed = 1 * 60
old_sender_id = None
old_timestamp = 0

# extract conversation text and entities information from the database
conversation_text = pd.DataFrame(columns=('sender_id', 'session_id', 'event',
                                          'timestamp', 'text', 'entities'))
for conversation in conversation_data['data']:
    # cheap substring check on the raw JSON before paying for the parse
    if 'text' in conversation:
        res = json.loads(conversation)
        sender_id = res['sender_id']
        event = res['event']
        timestamp = res['timestamp']
        text = res['text']
        # entity name -> list of the values recognised for it
        entities = defaultdict(list)
        if 'parse_data' in res:
            for i in res['parse_data']['entities']:
                entities[i['entity']].append(i['value'])
# Import packages
from sqlalchemy import create_engine
import pandas as pd

# Employees with an id of 6 or above, oldest first.
_EMPLOYEE_QUERY = 'SELECT * FROM Employee WHERE EmployeeId >= 6 ORDER BY BirthDate;'

# Create engine: engine
engine = create_engine('sqlite:///Chinook.sqlite')

# Execute query and store records in DataFrame: df
df = pd.read_sql_query(_EMPLOYEE_QUERY, engine)

# Print head of DataFrame
preview = df.head()
print(preview)
def names():
    """Return a list of sample names."""
    session = belly_db.session
    samples = pd.read_sql_query(session.query(Samples).statement, session.bind)
    # the first two columns are skipped; the remaining column labels are the
    # values returned
    sample_columns = list(samples.columns)[2:]
    return jsonify(sample_columns)
def export_case(self, save_path, case_name, max_tiles=None):
    """Export the case as a set of CSV files under ``save_path``.

    Files written (all prefixed with ``case_name``):

    * ``-info.csv`` -- the frame returned by ``self.format_df()``
    * ``-counts.csv`` -- per-tile counts of non-zero areas plus impacted count
    * ``-impacted-tags.csv`` -- every tag in the ``impacted`` table
    * ``-count-percentages.csv`` -- overall counts and percentages
    * ``-<fib_type>-pts.csv`` -- one file per entry of ``constants.fib_types``
      with relative and overall point coordinates

    Parameters
    ----------
    save_path : str
        Directory the files are written to.
    case_name : str
        Prefix for every file name.
    max_tiles : sequence, optional
        Tile grid dimensions; only ``max_tiles[0]`` is read, to place each
        tile on the grid.  Defaults to ``self.get_dimensions()``.
    """
    main_df = self.format_df()
    main_name = os.path.join(save_path, case_name + "-info.csv")
    main_df.to_csv(main_name)

    # identity test: ``== None`` would compare element-wise on array-likes
    if max_tiles is None:
        max_tiles = self.get_dimensions()

    tile_count_query = (
        "SELECT COUNT(NULLIF(pap_area,0)) as pap_count,"
        " COUNT(NULLIF(den_area,0)) as den_count,"
        " COUNT(NULLIF(hy_area,0)) as hy_count,"
        " COUNT(NULLIF(min_area,0)) as min_count"
        " FROM master GROUP BY tile_id "
    )
    impacted_count_query = (
        "SELECT COUNT(a.tag) as imp_count"
        " FROM master AS a, impacted AS b"
        " WHERE a.tag = b.tag GROUP BY a.tile_id "
    )
    count_df = pd.read_sql_query(tile_count_query, self.conn)
    imp_df = pd.read_sql_query(impacted_count_query, self.conn)
    imp_count = imp_df["imp_count"]
    # NOTE(review): the two result sets are joined by row position only; a
    # tile without impacted rows is absent from ``imp_df`` and would shift
    # the counts of later tiles -- confirm whether that can happen.
    count_df.insert(4, "imp_count", imp_count)
    count_name = os.path.join(save_path, case_name + "-counts.csv")
    count_df.to_csv(count_name)

    impacted_tag_query = """SELECT tag FROM impacted"""
    impacted_df = pd.read_sql_query(impacted_tag_query, self.conn)
    impacted_name = os.path.join(save_path, case_name + "-impacted-tags.csv")
    impacted_df.to_csv(impacted_name)

    total_count_query = (
        "SELECT COUNT(NULLIF(pap_area,0)) as total_pap,"
        " COUNT(NULLIF(den_area,0)) as total_den,"
        " COUNT(NULLIF(hy_area,0)) as total_hy,"
        " COUNT(NULLIF(min_area,0)) as total_min,"
        " COUNT(NULLIF(den_area,0)) * 1.0 / COUNT(NULLIF(pap_area,0)) * 100 AS den_count_perc,"
        " COUNT(NULLIF(hy_area,0)) * 1.0 / COUNT(NULLIF(pap_area,0)) * 100 AS hy_count_perc,"
        " COUNT(NULLIF(min_area,0)) * 1.0/ COUNT(NULLIF(pap_area,0)) * 100 AS min_count_perc"
        " FROM master "
    )
    total_impacted = len(self.pull_impacted_tags())
    total_pap = self.get_counts()[0]
    # NOTE(review): raises ZeroDivisionError when ``total_pap`` is 0.
    impacted_perc = round((total_impacted / total_pap) * 100,
                          constants.perc_digits)
    count_df = pd.read_sql_query(total_count_query, self.conn)
    count_df.insert(4, "total_imp", total_impacted)
    count_df.insert(8, "imp_count_perc", impacted_perc)
    count_name = os.path.join(save_path, case_name + "-count-percentages.csv")
    count_df.to_csv(count_name)

    for fib_type in constants.fib_types:
        points_query = (
            f"SELECT {fib_type}.*, master.tile_id, tiles.real_tile"
            f" FROM {fib_type}"
            f" INNER JOIN master on master.tag = {fib_type}.tag"
            f" INNER JOIN tiles on tiles.rel_tile = master.tile_id "
        )
        pt_df = pd.read_sql_query(points_query, self.conn)

        # account of padding of the image in tkinter
        pt_df["real_x"] = (pt_df["rel_x"] * constants.zoom_multiplier
                           ) - constants.padding_size
        pt_df["real_y"] = (pt_df["rel_y"] * constants.zoom_multiplier
                           ) - constants.padding_size

        # convert to overall coordinates: column = tile % width,
        # row = tile // width
        pt_df["real_x"] = pt_df["real_x"] + (
            (pt_df["real_tile"] % max_tiles[0]) * constants.tile_size)
        pt_df["real_y"] = pt_df["real_y"] + (
            (pt_df["real_tile"] // max_tiles[0]) * constants.tile_size)

        # set rel coords 0,0 to top left of tile
        pt_df["rel_x"] = pt_df["rel_x"] - (
            constants.padding_size // constants.zoom_multiplier)
        pt_df["rel_y"] = pt_df["rel_y"] - (
            constants.padding_size // constants.zoom_multiplier)

        points_name = os.path.join(save_path,
                                   case_name + f"-{fib_type}-pts.csv")
        pt_df.to_csv(points_name)
con = psycopg2.connect(dbname='mimic')

# initialize context dictionary
context_dic = {}

# Query mimic for notes.  The line breaks inside the query matter: the "--"
# comment runs to the end of its line, so the subject_id bounds must start on
# their own lines to stay part of the statement.
notes_query = \
"""
select n.subject_id,n.text
from mimiciii.noteevents n
where iserror IS NULL --this is null in mimic 1.4, rather than empty space
and subject_id > %d
and subject_id < %d
;
""" % (min_id, max_id)
notes = pd.read_sql_query(notes_query, con)

# Collect one line per note and join once at the end instead of growing a
# string with += on every row.
note_lines = []
for i, row in notes.iterrows():
    toks = tokenize(row.text)
    note_lines.append(' '.join(toks) + '\n')
    extract_context(toks, window_size, context_dic)
text = ''.join(note_lines)

# the with-block closes the file; no explicit close() is needed
with open('context_small.txt', 'w') as f:
    f.write(text)

context_dictionary_filename = str(context_dictionary_name) + '.npy'
# Save context dictionary after having read all the notes
np.save(context_dictionary_filename, context_dic)

# Raw strings keep the regex escapes intact without invalid-escape warnings;
# the patterns match exactly what the previous literals matched.
regex_punctuation = re.compile(r"[',\.\-/\n]")
regex_alphanum = re.compile(r'[^a-zA-Z0-9_ ]')
regex_num = re.compile(r'\d[\d ]+')