def process_water_meter(data):
    series = []
    for param in data['parameters']:
        query = 'select timestamp,state from water_data where timestamp between %d and %d and meter_id=%d;' % (data['start'], data['end'], int(param))
        try:
            result = psql.frame_query(query, mysql_conn['water_meter'])
        except Exception, e:
            # Reconnect all cached connections and retry once if the connection has gone stale
            mysql_conn['smart_meter'] = MySQLdb.connect(user='******', passwd='password', db='smart_meter')
            mysql_conn['jplug'] = MySQLdb.connect(user='******', passwd='password', db='jplug')
            mysql_conn['water_meter'] = MySQLdb.connect(user='******', passwd='password', db='water_meter')
            mysql_conn['multisensor'] = MySQLdb.connect(user='******', passwd='password', db='multisensor')
            result = psql.frame_query(query, mysql_conn['water_meter'])
        result.index = pd.to_datetime(result.timestamp * 1e9)
        result = result.drop('timestamp', 1)
        freq_downsampled = calculate_downsampling_frequency(result)
        if freq_downsampled is not None:
            result = result.resample(freq_downsampled, how='max')
            result = result.dropna()
        num_rows = len(result[result.columns[0]].values)
        temp = np.empty((num_rows, 2))
        # Shift the epoch-ms values by +5.5 h (IST offset) so they render as local time
        # (the original comment said "subtracting ... to get UTC", but the code adds the offset)
        x = result.index.astype(int) / 1e6 + 5.5 * 60 * 60 * 1000
        temp[:, 0] = x
        for key in result:
            temp[:, 1] = result[key].values
            series.append({'name': key + " " + water_meter_mapping[param], 'data': temp.tolist()})
    # the flattened original ended without a return; mirror process_smart_meter below
    return json.dumps(series)
def remeasureFeatFlagDB(self):
    conn = conDB(self.host, self.dbname, passwd=self.passwd, port=self.port, user=self.user)
    import pandas.io.sql as psql
    pageIds = psql.frame_query("SELECT pageid FROM testing2 WHERE featured = 1 OR flagged = 1", conn)
    # go through each page, rescrape features and write readability back to the database
    print("Calculating readability & scores & writing to database...")
    for i in range(len(pageIds.values)):
        p = pageIds.values[i]
        sql = "SELECT * FROM testing2 WHERE pageid = %s" % p[0]
        featuresDF = psql.frame_query(sql, conn)
        print i, p[0], featuresDF['reading_ease'][0]
        new_row = self.getWikiPageMeta(title=featuresDF['title'][0], pageid=p[0])
        print new_row
        try:
            sql = "UPDATE testing2 SET grade_level = %s WHERE pageId = %s" % (str(new_row['grade_level']), str(p[0]))
            curDB(conn).execute(sql)
            sql = "UPDATE testing2 SET reading_ease = %s WHERE pageId = %s" % (str(new_row['reading_ease']), str(p[0]))
            curDB(conn).execute(sql)
            conn.commit()
        except:
            print "Skipping for testing2:", p[0]
    closeDB(conn)
def process_smart_meter(data):
    param_string = ""
    for param in data['parameters']:
        param_string = param_string + param + ","
    query = 'select timestamp,' + param_string
    query = query[:-1] + ' from %s_data where timestamp between %d and %d;' % (data['sensor'], data['start'], data['end'])
    print query
    try:
        result = psql.frame_query(query, mysql_conn[data['sensor']])
    except:
        # Reconnect all cached connections and retry once if the connection has gone stale
        mysql_conn['smart_meter'] = MySQLdb.connect(user='******', passwd='password', db='smart_meter')
        mysql_conn['jplug'] = MySQLdb.connect(user='******', passwd='password', db='jplug')
        mysql_conn['water_meter'] = MySQLdb.connect(user='******', passwd='password', db='water_meter')
        mysql_conn['multisensor'] = MySQLdb.connect(user='******', passwd='password', db='multisensor')
        result = psql.frame_query(query, mysql_conn[data['sensor']])
    result.index = pd.to_datetime(result.timestamp * 1e9)
    result = result.drop('timestamp', 1)
    freq_downsampled = calculate_downsampling_frequency(result)
    if freq_downsampled is not None:
        result = result.resample(freq_downsampled, how='mean')
        result = result.dropna()
    series = []
    num_rows = len(result[result.columns[0]].values)
    temp = np.empty((num_rows, 2))
    # Shift the epoch-ms values by +5.5 h (IST offset) so they render as local time
    x = result.index.astype(int) / 1e6 + 5.5 * 60 * 60 * 1000
    temp[:, 0] = x
    for key in result:
        temp[:, 1] = result[key].values
        series.append({'name': key, 'data': temp.tolist()})
    return json.dumps(series)
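# A minimal, self-contained sketch (nothing assumed beyond numpy/pandas) of the
# timestamp conversion used by process_water_meter/process_smart_meter above: a
# DatetimeIndex stores nanoseconds since the epoch, so dividing by 1e6 yields the
# milliseconds-since-epoch values JavaScript charting libraries expect, and the
# +5.5 h offset shifts the values to IST for display.
import numpy as np
import pandas as pd

idx = pd.to_datetime(np.array([1388534400, 1388534460]) * 1e9)  # epoch seconds -> ns
ms_utc = idx.astype(np.int64) / 1e6                             # ns -> ms since epoch
ms_ist = ms_utc + 5.5 * 60 * 60 * 1000                          # shift for IST display
print ms_ist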
def get_geonet_ftec_data_sv(sv, tstart, tstop):
    '''
    Query the GEONET database for filtered TEC data from all receivers
    in a certain time span for a given satellite.
    INPUTS:
        sv - integer, satellite PRN id
        tstart - datetime, start of time window
        tstop - datetime, end of time window
    OUTPUTS:
        frame - an object containing the desired GPS data: rxID, time [POSIX],
                svPRN, slant TEC [tecu], filtered TEC [tecu], el [deg], and az [deg]
    '''
    # Open a connection to the database
    con = mdb.connect(host=hostname, user='******', passwd='bdpass', db='gpsdatabase')
    # UT POSIX times for the start/stop times of interest
    startUT = calendar.timegm(tstart.timetuple())
    stopUT = calendar.timegm(tstop.timetuple())
    # Grab the slant TEC data (the el/az come from this table)
    sql_cmd = 'SELECT rxID, UT, svPRN, sTEC, el, az ' + \
              'FROM tecdata WHERE svPRN = %d AND UT >= %d AND UT <= %d' % (sv, startUT, stopUT)
    frame1 = psql.frame_query(sql_cmd, con=con)
    # Get the filtered TEC data
    sql_cmd = 'SELECT rxID, UT, svPRN, fTEC FROM ftecdata WHERE svPRN = %d AND UT >= %d AND UT <= %d' % (sv, startUT, stopUT)
    frame2 = psql.frame_query(sql_cmd, con=con)
    # Join the two frames
    frame = pd.merge(frame1, frame2, how='inner', on=['UT', 'rxID', 'svPRN'])
    con.close()
    return frame
def get_rx_ftec_data(rxID, tstart, tstop):
    '''
    Query the GEONET database for filtered TEC data from a receiver in a certain time span.
    INPUTS:
        rxID - integer, receiver ID
        tstart - datetime, start of time window
        tstop - datetime, end of time window
    OUTPUTS:
        frame - an object containing the desired GPS data: rxID, time [POSIX],
                svPRN, filtered TEC [tecu], el [deg], and az [deg]
    '''
    # Open a connection to the database
    con = mdb.connect(host=hostname, user='******', passwd='bdpass', db='gpsdatabase')
    # UT POSIX times for the start/stop times of interest
    startUT = calendar.timegm(tstart.timetuple())
    stopUT = calendar.timegm(tstop.timetuple())
    # Grab the slant TEC data (need the el/az from this)
    sql_cmd = 'SELECT rxID, UT, svPRN, sTEC, el, az ' + \
              'FROM tecdata WHERE rxID = %s AND UT >= %d AND UT <= %d' % (rxID, startUT, stopUT)
    frame1 = psql.frame_query(sql_cmd, con=con)
    # Grab the filtered TEC
    sql_cmd = 'SELECT rxID, UT, svPRN, fTEC ' + \
              'FROM ftecdata WHERE rxID = %s AND UT >= %d AND UT <= %d' % (rxID, startUT, stopUT)
    frame2 = psql.frame_query(sql_cmd, con=con)
    con.close()
    # Join the two frames
    frame = pd.merge(frame1, frame2, how='inner', on=['UT', 'rxID', 'svPRN'])
    return frame
def get_geonet_ftec_data_at_time(t):
    '''
    Query the GEONET filtered TEC database for data for all receivers at a certain time.
    INPUTS:
        t - datetime, desired sample time
    OUTPUTS:
        frame - an object containing the desired GPS data: rxID, time [POSIX],
                svPRN, slant TEC [tecu], filtered TEC [tecu], el [deg], and az [deg]
    '''
    # Open a connection to the database
    con = mdb.connect(host=hostname, user='******', passwd='bdpass', db='gpsdatabase')
    # UT POSIX time for the time of interest
    UT = calendar.timegm(t.timetuple())
    # Grab the slant TEC data (the el/az come from this table)
    sql_cmd = 'SELECT rxID, UT, svPRN, sTEC, el, az FROM tecdata WHERE UT = %d' % (UT)
    frame1 = psql.frame_query(sql_cmd, con=con)
    # Grab the filtered TEC
    sql_cmd = 'SELECT rxID, UT, svPRN, fTEC FROM ftecdata WHERE UT = %d' % (UT)
    frame2 = psql.frame_query(sql_cmd, con=con)
    # Join the two frames
    frame = pd.merge(frame1, frame2, how='inner', on=['UT', 'rxID', 'svPRN'])
    con.close()
    return frame
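# A hedged usage sketch for the three GEONET helpers above. It assumes the
# module-level `hostname` and database credentials are configured, and that mdb
# (MySQLdb), calendar, pd, and psql are imported as in the functions themselves.
# The PRN and receiver ID below are hypothetical placeholders.
from datetime import datetime

tstart = datetime(2011, 3, 11, 5, 0)
tstop = datetime(2011, 3, 11, 7, 0)
sv_frame = get_geonet_ftec_data_sv(15, tstart, tstop)   # all receivers, PRN 15
rx_frame = get_rx_ftec_data(950194, tstart, tstop)      # one receiver, all PRNs
snapshot = get_geonet_ftec_data_at_time(tstart)         # all receivers at one epoch
print sv_frame.head()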
def setUpData(self):
    debug, host, port, user, passwd, dbname, localpath = grabDatabaseSettingsFromCfgFile(configFileName)
    self.qualityPredictorFile = localpath + 'app/qualityPredictorFile.p'
    conn = conDB(host, dbname, passwd, port, user)
    # load data from database
    import pandas.io.sql as psql
    DF = psql.frame_query("SELECT * FROM testing2 WHERE flagged = 1 OR featured = 1", conn)
    DF = DF[~np.isnan(DF.meanWordLength)]  # remove rows with NaN in meanWordLength
    DF = DF[~np.isnan(DF.reading_ease)]    # remove rows with NaN in reading_ease
    X = DF[self.iUseFeatures].values
    if self.bUsingBuckets:
        maxes = list()
        for i in range(len(self.iUseFeatures)):
            sql = "SELECT AVG({}), STD({}) FROM testing2 WHERE featured = 1".format(self.iUseFeatures[i], self.iUseFeatures[i])
            temp = psql.frame_query(sql, conn)
            # average + standard deviation of featured pages sets the maximum bucket value
            maxes.append(sum(temp.ix[0]))
        # for some of the features, compute buckets so as to use "low"/"med"/"high" rather than raw values
        self.computeBuckets(X, maxes)
        X = self.getAllBuckets(X)  # put some of the features into buckets
    y = DF['score'].values  # labels are 1 if featured is True, 0 if flagged is True
    np.random.seed()
    # divide into training set and test set
    self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
        X, y, test_size=self.testPropor, train_size=1 - self.testPropor,
        random_state=np.random.random_integers(100))
    closeDB(conn)
def executa_query(self, query, get_data_frame=False):
    """
    Executes a SQL query.
    @brief Local variable: dados - receives the query result
    @param query A SQL instruction
    @param get_data_frame Determines the format of the query result; defaults to False
    @return The query result (a dict by default, or a DataFrame if get_data_frame is True)
    """
    dados = None
    if get_data_frame is False:
        dados = psql.frame_query(query, con=self.conexao).to_dict()
    else:
        dados = psql.frame_query(query, con=self.conexao)
    return dados
def read_GrpAttrs(sFileDB, nStartFrm, nEndFrm):
    conn = lite.connect(sFileDB)
    sRange = "Frame>=" + str(nStartFrm) + " and Frame<=" + str(nEndFrm)
    dfIndAttrs = psql.frame_query("SELECT * FROM IndAttrs WHERE " + sRange, conn)
    dfGrp_Attrs = psql.frame_query("SELECT * FROM GrpAttrs WHERE " + sRange, conn)
    return dfIndAttrs, dfGrp_Attrs
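# A hedged usage sketch for read_GrpAttrs, assuming `lite` is sqlite3 and that a
# tracking database (the file name below is hypothetical) contains IndAttrs and
# GrpAttrs tables keyed by a Frame column.
import sqlite3 as lite
import pandas.io.sql as psql

dfInd, dfGrp = read_GrpAttrs("tracks.db", 0, 100)
print len(dfInd), "individual rows;", len(dfGrp), "group rows"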
def event_aggregation(users):
    # List of events
    json_node_events = []
    # Connection to the local database
    conexion = db.mySQLConect("localhost", "root", "lego", "dashboard")
    # events + reports
    query = "SELECT dashboard.reports.id as report_id, dashboard.timeline_events.id as event_id, dashboard.reports.node_id, dashboard.reports.status, dashboard.reports.time FROM dashboard.reports, dashboard.timeline_events WHERE timestampdiff(MINUTE, dashboard.timeline_events.updated_at, dashboard.reports.time) <= 3 AND dashboard.reports.time >= dashboard.timeline_events.updated_at AND node_id = subject_id AND event_type = 'updated';"
    er = psql.frame_query(query, conexion.db)
    # For every node that has an associated hostname
    for idx, u in users[users.host.notnull()].iterrows():
        # fields used to fill in each record
        nid = u['code']
        user = u['user']
        host = u['host']
        course = u['course']
        # look up the events related to this autoes code in the database
        query = "SELECT * FROM timeline_events WHERE (subject_id =" + str(nid) + " AND subject_type='Node') OR (secondary_subject_id=" + str(nid) + ");"
        node_events = psql.frame_query(query, conexion.db)
        # a student always starts on practice 1
        current_practice = 1
        # for each event found
        for idx, event in node_events.iterrows():
            # creation of the autoes user
            if event['event_type'] == "created":
                register_event = {"user": user, "course": course, "host": host, "type": "created",
                                  "report": None, "practice": current_practice, "date": event['created_at']}
                json_node_events.append(register_event)
            # moving to another practice
            elif event['event_type'] == "added_to":
                # advancing exactly one practice
                if current_practice + 1 == event['subject_id']:
                    register_event = {"user": user, "course": course, "host": host, "type": "added_to",
                                      "report": None, "practice": event['subject_id'], "date": event['created_at']}
                    current_practice = event['subject_id']
                    json_node_events.append(register_event)
                # advancing several practices at once
                else:
                    for i in range(current_practice, event['subject_id'] + 1):
                        register_event = {"user": user, "course": course, "host": host, "type": "added_to",
                                          "report": None, "practice": i, "date": event['created_at']}
                        json_node_events.append(register_event)
                    current_practice = event['subject_id']
            # "updated" event: either a login, a successful evaluation, or a failed evaluation
            elif event['event_type'] == "updated":
                # look up the report
                report_event = er[er.event_id == event['id']]
                # for each matching report
                for idx2, report in report_event.iterrows():
                    register_event = {"user": user, "course": course, "host": host, "type": report['status'],
                                      "report": report['report_id'], "practice": current_practice, "date": event['created_at']}
                    json_node_events.append(register_event)
            # leaving a practice
            elif event['event_type'] == "removed_from":
                register_event = {"user": user, "course": course, "host": host, "type": "removed_from",
                                  "report": None, "practice": event['subject_id'], "date": event['created_at']}
                json_node_events.append(register_event)
            # removal from the platform: very rare
            elif event['event_type'] == "removed":
                register_event = {"user": user, "course": course, "host": host, "type": "removed",
                                  "report": None, "practice": current_practice, "date": event['created_at']}
                json_node_events.append(register_event)
    events_frame = DataFrame(json_node_events)
    return events_frame
def executa_query(self, query, get_data_frame=False):
    dados = None
    if get_data_frame is False:
        dados = psql.frame_query(query, con=self.conexao).to_dict()
    else:
        dados = psql.frame_query(query, con=self.conexao)
    if dados is None:
        raise ResultadoConsultaNuloError("The pandas library is not installed, or no data was passed to this method")
    else:
        return dados
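# A minimal, runnable sketch of the pattern executa_query wraps: frame_query
# returns a DataFrame, and .to_dict() converts it when a plain dict is wanted.
# An in-memory SQLite database stands in for self.conexao, purely for illustration.
import sqlite3
import pandas.io.sql as psql

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE t (id INTEGER, nome TEXT)")
con.execute("INSERT INTO t VALUES (1, 'a'), (2, 'b')")
df = psql.frame_query("SELECT * FROM t", con=con)  # DataFrame form
d = df.to_dict()                                   # dict form, the default return
print d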
def MySQL_getdata(sql_table, train_per, test_per, features):
    # Getting data from MySQL
    con = MySQLdb.connect(host="opt0.madvertise.net", user="******", passwd="27f7b0bc", db="dse")
    sql = "SELECT " + ', '.join(features) + ", SUM(`clicks`) as clicks, SUM(`instances`) as instances FROM `" + sql_table + "` WHERE date in ('" + '\',\''.join(test_per) + "') GROUP BY " + ', '.join(features) + ";"
    test_df = psql.frame_query(sql, con)
    sql = "SELECT " + ', '.join(features) + ", SUM(`clicks`) as clicks, SUM(`instances`) as instances FROM `" + sql_table + "` WHERE date >= '" + train_per[0] + "' AND date <= '" + train_per[1] + "' GROUP BY " + ', '.join(features) + ";"
    train_df = psql.frame_query(sql, con)
    con.close()
    return train_df, test_df
def main():
    global snp
    global nasdaq
    global usa_events
    global usa_processed
    with mysql_cn:
        snp = psql.frame_query(config.SQL.get('SNP'), con=mysql_cn)
        nasdaq = psql.frame_query(config.SQL.get('NASDAQ'), con=mysql_cn)
        usa_events = psql.frame_query(config.SQL.get('USA_EVT'), con=mysql_cn)
        usa_processed = psql.frame_query(config.SQL.get('USA_PROCESSED'), con=mysql_cn)
    #fill_trading_stats_in_db()
    #fill_event_severity()
    show_stocks_correlation()
    calc_correlation()
def user_aggregation():
    # Connections to the MySQL databases
    dashboard = db.mySQLConect("localhost", "root", "lego", "dashboard")
    foros = db.mySQLConect("localhost", "root", "lego", "rita")
    # Read the autoes nodes
    nodos = psql.frame_query('select * from nodes;', dashboard.db)
    # Read the forum users
    usuarios = psql.frame_query('select * from user;', foros.db)
    # List of users; they will later be inserted into Mongo, but first they must be computed correctly
    final_users = []
    # Special users with a hand-made association
    personal_rel = [(5, 8), (1, 25), (1, 42), (9, 15), (11, 31), (6, 47), (7, 10), (7, 41), (13, 22), (14, 77), (12, 9), (15, 11), (20, 34), (23, 45), (23, 35), (24, 36), (38, 38)]
    # For each forum user
    for idx, user in usuarios.iterrows():
        # Build a candidate hostname and look it up in the nodes table
        hostname = user['name'].lower().split()[0][0] + user['name'].lower().split()[1].encode()
        autoes_user = nodos[nodos['name'].str.contains(hostname)]
        # If we found it
        if len(autoes_user) > 0:
            for idx, au in autoes_user.iterrows():
                if au['created_at'].date() < datetime.strptime('08/01/11', "%m/%d/%y").date():
                    course = '2010-2011'
                elif au['created_at'].date() < datetime.strptime('08/01/12', "%m/%d/%y").date():
                    course = '2011-2012'
                else:
                    course = '2012-2013'
                new_user = {"code": au['id'], "user": user['iduser'], "name": user['name'], "host": au['name'], "course": course}
                final_users.append(new_user)
        # Users whose name is irregular: these were matched by hand :(
        elif user['iduser'] in (x[0] for x in personal_rel):
            hosts = [(x, y) for (x, y) in personal_rel if x == user['iduser']]
            for host in hosts:
                au = nodos[nodos.id == host[1]].iloc[0]
                if au['created_at'].date() < datetime.strptime('08/01/11', "%m/%d/%y").date():
                    course = '2010-2011'
                elif au['created_at'].date() < datetime.strptime('08/01/12', "%m/%d/%y").date():
                    course = '2011-2012'
                else:
                    course = '2012-2013'
                code = long(au['id'])
                host = au['name']
                new_user = {"code": code, "user": user['iduser'], "name": user['name'], "host": host, "course": course}
                final_users.append(new_user)
        # Otherwise, no match at all
        else:
            new_user = {"code": None, "user": user['iduser'], "name": user['name'], "host": None, "course": None}
            final_users.append(new_user)
    users_frame = DataFrame(final_users)
    return users_frame
def clidesc_getStationsByCountry(conn, country):
    """
    Gets all the station details for all stations in a given country code
    (single or vector). It opens the connection and reads the stations table.
    country must be a string, with country codes (if more than one) separated by ','

    Parameters
    ----------
    conn : db connection
        A database connection object as returned by clidesc_open()
    country : string
        The two-letter country identifier, e.g. 'WS' for Samoa, 'FJ' for Fiji, ...

    Returns
    -------
    table : Pandas.DataFrame
        A Pandas.DataFrame object containing the list of stations available for the country
    """
    if ',' in country:
        country = country.replace(',', '\',\'')
    # builds the query string
    query = """SELECT * FROM stations WHERE country_code IN ('%s')
    ORDER BY country_code, station_no""" % (country)
    # get the table returned by the query as a pandas dataframe
    table = psql.frame_query(query, conn)
    return table
def read_cgc():
    """Gets the genes from the Cancer Gene Census.

    Data from CGC is available from here:
    http://cancer.sanger.ac.uk/cancergenome/projects/census/

    Returns
    -------
    cgc_in_database : tuple
        tuple of gene names in the cancer gene census that are also in COSMIC
    """
    cfg_opts = get_input_config('input')
    cgc_path = os.path.join(proj_dir, cfg_opts['cgc'])
    with open(cgc_path, 'r') as handle:
        cgc = tuple(gene.strip() for gene in handle.readlines())
    # open connection
    try:
        # if the DB is not created this will throw an error
        gene_db_path = os.path.join(proj_dir, get_db_config('2020plus')['db'])
        conn = sqlite3.connect(gene_db_path)
        sql = ("SELECT DISTINCT Gene"
               " FROM mutation"
               " WHERE Gene in " + str(cgc))
        df = psql.frame_query(sql, con=conn)
        conn.close()  # close connection
        # get significantly mutated genes found in database
        cgc_in_database = tuple(df['Gene'].astype(str))
        logger.debug('There are only %d/%d CGC genes found in the database.'
                     % (len(cgc_in_database), len(cgc)))
    except:
        cgc_in_database = cgc
    return cgc_in_database
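# The query above leans on the fact that repr() of a Python tuple of strings,
# e.g. ('TP53', 'KRAS'), happens to also be valid SQL for an IN (...) list.
# A minimal illustration of the string that gets built (gene names hypothetical):
genes = ('TP53', 'KRAS', 'EGFR')
sql = "SELECT DISTINCT Gene FROM mutation WHERE Gene in " + str(genes)
print sql  # SELECT DISTINCT Gene FROM mutation WHERE Gene in ('TP53', 'KRAS', 'EGFR')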
def clidesc_getColumns(conn, table):
    """
    Return the column names of a table

    Parameters
    ----------
    conn : the postgresql connection
    table : string
        the table (e.g. 'obs_daily', 'obs_subdaily')

    Returns
    -------
    columns : Pandas.DataFrame
        A Pandas.DataFrame with a single 'column_name' column holding the column names

    Usage
    -----
    columns = clidesc_getColumns(conn, 'obs_subdaily')
    columns = clidesc_getColumns(conn, 'obs_daily')
    """
    query = "SELECT column_name FROM information_schema.columns WHERE table_name = '{}'".format(table)
    columns = psql.frame_query(query, conn)
    return columns
def data_frame(self, csv_file, dbhost, dbport, dbname, dbuser, dbpassword, sql_file, sqlparams):
    if csv_file:
        df = read_csv(csv_file)
        if sqlparams:
            params = dict([params.split('=') for params in sqlparams])
            for column in params:
                filters = params[column].replace("'", '').split(',')
                df = df[df[column].isin(filters)]
        self.connection = None
    else:
        query = open(sql_file).read()
        params = dict([params.split('=') for params in sqlparams])
        # NOTE: the user/password placeholders were masked in the original source;
        # %(user)s / %(password)s restore the format keys the dict below expects
        connstring = "host='%(host)s' port='%(port)s' " \
                     "dbname='%(dbname)s' user='%(user)s' " \
                     "password='%(password)s'"
        connstring = connstring % {'host': dbhost, 'port': dbport,
                                   'dbname': dbname, 'user': dbuser,
                                   'password': dbpassword}
        conn = psycopg2.connect(connstring or self.report['connstring'])
        self.connection = conn
        df = psql.frame_query(query % params, con=conn)
    return df
def main_cosmic(options):
    """Main function used to process COSMIC data."""
    # get configs
    in_opts = _utils.get_input_config('classifier')
    out_opts = _utils.get_output_config('features')
    count_opts = _utils.get_output_config('feature_matrix')
    # result_opts = _utils.get_input_config('result')
    db_cfg = _utils.get_db_config('2020plus')
    # get mutations
    conn = sqlite3.connect(db_cfg['db'])
    sql = ("SELECT Gene, Protein_Change as AminoAcid, "
           "       DNA_Change as Nucleotide, "
           "       Variant_Classification, "
           "       Tumor_Sample, Tumor_Type "
           "FROM mutations")
    mut_df = psql.frame_query(sql, con=conn)
    conn.close()
    # get features for classification
    all_features = futils.generate_features(mut_df, options)
    # save features to text file
    cols = all_features.columns.tolist()
    new_order = ['gene'] + cols[:cols.index('gene')] + cols[cols.index('gene') + 1:]
    all_features = all_features[new_order]  # make the gene name the first column
    out_path = _utils.save_dir + in_opts['gene_features'] if not options['output'] else options['output']
    all_features.to_csv(out_path, sep='\t', index=False)
def clidesc_stations(conn, stations):
    """
    Gets all the station details for a station number.
    It opens the connection and reads the stations table.
    station_no is a string, with stations separated by ','

    Parameters
    ----------
    conn : db connection
        Database connection object as returned by clidesc_open()
    stations : string
        The station or station list; several stations should be passed
        as a string, not a list, e.g. 'XXXXX, YYYYY, ZZZZZ'

    Returns
    -------
    table : Pandas.DataFrame
        A Pandas.DataFrame object containing the station information
    """
    if isinstance(stations, str) and ',' in stations:
        stations = stations.replace(',', '\',\'')
    # builds the query string
    query = """SELECT * FROM stations WHERE station_no IN ('%s')
    ORDER BY station_no""" % (stations)
    # get the table returned by the query as a pandas dataframe
    table = psql.frame_query(query, conn)
    return table
def L32pred():
    db = MySQLdb.connect("localhost", "root", "", "ge_hackathon")
    cursor = db.cursor()
    sql = "SELECT * FROM answers WHERE finished='0'"
    data = frame_query(sql, db)
    URL = 'DUMB.csv'
    DUMB = pandas.read_csv(URL)
    N = '22'
    if N == '21':
        Questions = list(data.columns[2:8]) + list(data.columns[10:15])
        Dumbster = DUMB[DUMB.columns[1:7] + DUMB.columns[9:14]]
    elif N == '22':
        Questions = list(data.columns[2:8]) + list(data.columns[17:21])
        Dumbster = DUMB.columns[1:7] + DUMB.columns[16:20]
    elif N == '23':
        Questions = list(data.columns[2:8]) + list(data.columns[23:27])
        Dumbster = DUMB.columns[1:7] + DUMB.columns[22:26]
    else:
        Questions = list(data.columns[2:8]) + list(data.columns[29:33])
        Dumbster = DUMB.columns[1:7] + DUMB.columns[28:32]
    df_DUMB = DUMB[Dumbster + DUMB.columns[39:42]]
    df_test = data[Questions + list(data.columns[40:43])]  # fixed: the original referenced an undefined `data2` here
    frames = [df_DUMB, df_test]
    df_WH = pandas.concat(frames)
    # ENCODING
    X_test = df_WH.to_dict('records')
    X_te = []
    X_te.extend(X_test)
    encoder = DictVectorizer(sparse=True)
    X_encoded_test = encoder.fit_transform(X_te)
    clf2 = joblib.load('EstL32.pkl')
    Predictions = clf2.predict(X_encoded_test.toarray()[-1])
    return str(int(Predictions))
def L33Night():
    # L21 Night
    db = MySQLdb.connect("localhost", "root", "", "ge_hackathon")
    cursor = db.cursor()
    sql = "SELECT * FROM answers WHERE finished='1'"
    data = frame_query(sql, db)
    # DUMB was referenced below but never loaded here; load it as the sibling
    # functions (e.g. L33pred) do
    DUMB = pandas.read_csv('DUMB.csv')
    N = '22'
    if N == '21':
        Questions = list(data.columns[2:8]) + list(data.columns[10:15])
        Dumbster = DUMB[DUMB.columns[1:7] + DUMB.columns[9:14]]
    elif N == '22':
        Questions = list(data.columns[2:8]) + list(data.columns[17:21])
        Dumbster = DUMB.columns[1:7] + DUMB.columns[16:20]
    elif N == '23':
        Questions = list(data.columns[2:8]) + list(data.columns[23:27])
        Dumbster = DUMB.columns[1:7] + DUMB.columns[22:26]
    else:
        Questions = list(data.columns[2:8]) + list(data.columns[29:33])
        Dumbster = DUMB.columns[1:7] + DUMB.columns[28:32]
    df_training = data
    df_train = df_training[Questions + list(data.columns[45:49])]
    Y_train = df_train['L33P'].values
    del df_train['L33P']
    # ENCODING
    X_train = df_train.to_dict('records')
    X_tr = []
    X_tr.extend(X_train)
    # One Hot Encoding
    enc = DictVectorizer(sparse=True)
    X_encoded_train = enc.fit_transform(X_tr)
    estimator = GradientBoostingClassifier()
    estimator.fit(X_encoded_train.toarray(), Y_train)
    joblib.dump(estimator, 'EstL33.pkl')
    return 'success'
def select(self, sql, commit=True, df=True):
    """
    Executes a pgsql SELECT query and returns the result, if any,
    together with the console output.
    """
    try:
        if self.debug == True:
            self.info("Select - %s" % sql)
        if df == True:
            df = psql.frame_query(sql, con=self.con)
            if len(df) == 0 and self.debug is True:
                self.warning("... empty table")
            if self.debug == True:
                self.info("... query executed")
            return df
        else:
            self.cur.execute(sql)
            if self.debug == True:
                self.info("... query executed: %s" % self.cur.statusmessage)
            return self.cur.fetchall()
    except psycopg2.DatabaseError, e:
        self.error("%s" % e)
def integrity_check():
    date1 = "1986-01-01"
    date2 = "2013-01-01"
    db = MySQLdb.connect(user='******', passwd='xxxxxxxx', db='db')
    query = ("""SELECT DISTINCT artist, title FROM album
    WHERE chart_date BETWEEN '{0}' AND '{1}';""".format(date1, date2))
    df = psql.frame_query(query, con=db)
    db.close()
    for tup in df.itertuples():
        if isinstance(tup[1], basestring):
            pass
        else:
            print "Entry {0} -- Artist -- {1} -- Not a string".format(tup[0], tup[1])
        if isinstance(tup[2], basestring):
            pass
        else:
            # fixed: the original printed tup[1] (the artist) here instead of the title
            print "Entry {0} -- Title -- {1} -- Not a string".format(tup[0], tup[2])
    return True
def get_sub_recommendations(self, redditor, subs=None):
    """Given a redditor, return recommended subs with scores"""
    redditor_subs = self.load_redditor_subs(redditor, subs=subs)
    if not isinstance(redditor_subs, list) or not len(redditor_subs):
        redditor_subs = DEFAULT_SUBS
    # Filter for similarity
    redditor_subs = [sub for sub in redditor_subs if sub in SUBS]
    base_query = "select sub2, sum(similarity) as similarity from similarity where sub1 in ({}) group by sub2 ORDER BY similarity DESC;"
    query = base_query.format(', '.join(["'" + i + "'" for i in redditor_subs]))
    all_sub_scores = pd_psql.frame_query(query, self.db_session.connection)
    all_sub_scores = all_sub_scores[-all_sub_scores.sub2.isin(redditor_subs)]
    all_sub_scores.fillna(0, inplace=True)
    all_sub_scores.sort(columns=['similarity'], ascending=False, inplace=True)
    recommendations = list(all_sub_scores.sub2[(-all_sub_scores.sub2.isin(DEFAULT_SUBS))][:10])
    logging.info('\n\nRECOMMENDATIONS FOR REDDITOR {}:\n'.format(redditor))
    logging.info(recommendations)
    if not len(recommendations):
        recommendations = DEFAULT_SUBS
    return recommendations
def main():
    db = mdb.connect(user="******", host="localhost", passwd="konnichiwa1", db="foodgrouper")
    query = 'Select * from foodgrouper.CityPopulation;'
    df = psql.frame_query(query, db)
    cur = db.cursor()
    for i in range(157, len(df[['latitude', 'longitude']])):
        latlong = df[['latitude', 'longitude']][i - 1:i]
        clusters, data, c_info = foodgroups.foodGroups(latlong['latitude'][i - 1], latlong['longitude'][i - 1])
        cluster_results = pd.DataFrame()
        cluster_results['labels'] = clusters['labels']
        cluster_results['core_samples_mask'] = clusters['core_samples_mask']
        cluster_results['eps'] = clusters['eps']
        insert_request = "INSERT INTO foodgrouper.requests(latitude,longitude,City,State) VALUES (%f, %f, '%s', '%s');" % (latlong['latitude'][i - 1], latlong['longitude'][i - 1], df['City'][i - 1:i][i - 1], df['State'][i - 1:i][i - 1])
        print "Writing city %i: %s, %s data to DB." % (i, df['City'][i - 1:i][i - 1], df['State'][i - 1:i][i - 1])
        a = Thread(target=noInterrupt, args=(db, cur, insert_request, data, c_info, cluster_results))
        a.start()
        a.join()
        time.sleep(randint(1, 3))
    cur.close()
    db.close()  # fixed: the original closed an undefined `con`
def getActivityEntries(self):
    print(len(self.TablesList))
    for index in range(len(self.TablesList)):
        self.cursor.execute("select Activity_Type from %s group by Activity_Type" % self.TablesList[index])
        result_activity = self.cursor.fetchall()
        k = 0
        del self.ActivityList[0:len(self.ActivityList)]
        for t in result_activity:
            self.ActivityList.insert(k, t)
            k = k + 1
        for a in self.ActivityList:
            sql = ("select * from " + self.TablesList[index] + " where Activity_Type = '" + str(a[0]) + "'")
            df = pd_sql.frame_query(sql, self.db, index_col="Date")
            orderedList = {'region': None, 'activity': None, 'Data': None}
            orderedList['region'] = self.TablesList[index]
            orderedList['activity'] = a[0]
            orderedList['Data'] = df
            self.FinalList.append(orderedList)
    return self.FinalList
def L33pred():
    db = MySQLdb.connect("localhost", "root", "", "ge_hackathon")
    cursor = db.cursor()
    sql = "SELECT * FROM answers WHERE finished='0'"
    data = frame_query(sql, db)
    # the original loaded DUMB.csv twice and assigned Dumbster twice; once is enough
    DUMB = pandas.read_csv('DUMB.csv')
    Dumbster = DUMB.columns[1:7]
    df_DUMB = DUMB[Dumbster + DUMB.columns[44:47]]
    Questions = list(data.columns[2:8])
    df_test = data[Questions + list(data.columns[45:48])]
    frames = [df_DUMB, df_test]
    df_WH = pandas.concat(frames)
    # ENCODING
    X_test = df_WH.to_dict('records')
    X_te = []
    X_te.extend(X_test)
    encoder = DictVectorizer(sparse=True)
    X_encoded_test = encoder.fit_transform(X_te)
    clf2 = joblib.load('EstL33.pkl')
    Predictions = clf2.predict(X_encoded_test.toarray()[-1])
    Predictions_prob = clf2.predict_proba(X_encoded_test.toarray()[-1])
    return str(int(Predictions))
def get_rx_data(rxID, tstart, tstop):
    '''
    Query the GEONET database for data from a receiver in a certain time span.
    INPUTS:
        rxID - integer, receiver ID
        tstart - datetime, start of time window
        tstop - datetime, end of time window
    OUTPUTS:
        frame - an object containing the desired GPS data: rxID, time [POSIX],
                latitude [deg], longitude [deg], and altitude [m]
    '''
    # Open a connection to the database
    con = mdb.connect(host=hostname, user='******', passwd='bdpass', db='gpsdatabase')
    # UT POSIX times for the start/stop times of interest
    startUT = calendar.timegm(tstart.timetuple())
    stopUT = calendar.timegm(tstop.timetuple())
    # Grab the receiver positions over the window
    sql_cmd = 'SELECT rxID, UT, lat, lon, alt ' + \
              'FROM rxposition WHERE rxID = %s AND UT >= %d AND UT <= %d' % (rxID, startUT, stopUT)
    frame = psql.frame_query(sql_cmd, con=con)
    con.close()
    return frame
def getRawData(self, sFileDB, nStartFrm, nEndFrm):
    sTable = "Tracks"
    self.conn = lite.connect(sFileDB)
    sRange = "Frame>=" + str(nStartFrm) + " and Frame<=" + str(nEndFrm)
    dfRawData = psql.frame_query("SELECT * FROM " + sTable + " WHERE " + sRange, self.conn)
    return dfRawData
def run(proj):
    symbolsin = proj.symbol
    causesin = proj.cause
    projid = proj.projid
    ma = multiprocessing.Manager()
    queue = ma.Queue()
    pool_size = multiprocessing.cpu_count() * 2
    pool = multiprocessing.Pool(processes=pool_size)
    result = pool.map_async(calcsymbol, ((proj, symbol, queue) for symbol in symbolsin))
    time.sleep(1)
    causedf = None
    while not queue.empty():
        print "gone in to loop now"
        item = queue.get()
        # assure item[1] type
        symbolid = item[0]
        if isinstance(item[1], int):
            if item[1] == 0:
                # update the symbol status
                try:
                    conn = MySQLdb.connect(host='localhost', user='******', passwd='david', db='david_trade')
                    cur = conn.cursor()
                    expression = "update symbol set status=0 where symbolid=" + str(symbolid) + ";"
                    #tradedb.update('symbol', where="symbolid =$symbolid", vars={'symbolid': symbolid}, btstatus=0)
                    print "update 0"
                    cur.execute(expression)
                    conn.commit()
                    cur.close()
                    conn.close()
                except MySQLdb.Error, e:
                    print "error in update 0"
                    print "Mysql Error %d:%s" % (e.args[0], e.args[1])
            elif item[1] == 1:
                print "update select cause"
                resultid = str(projid) + '_' + str(symbolid)
                try:
                    conn = MySQLdb.connect(host='localhost', user='******', passwd='david', db='david_trade')
                    cur = conn.cursor()
                    expression = "select btstatus from symbol where symbolid=" + str(symbolid) + ";"
                    cur.execute(expression)
                    # check btstatus to decide whether to continue with the backtester calc
                    btstatus = cur.fetchone()
                    # fixed: fetchone() returns a row tuple, so compare its first field
                    if btstatus is not None and btstatus[0] == 1:
                        return
                    else:
                        # select the cause table from mysql to calc the backtester
                        resultid = str(projid) + '_' + str(symbolid)
                        tablename = resultid + '_result'
                        expression = "select * from " + tablename + ";"
                        causedf = psql.frame_query(expression, con=conn)
                        # generate resultdf with cp and ncp for cause
                        causedf.set_index('Date', drop=True, inplace=True)
                    conn.commit()
                    cur.close()
                    conn.close()
                    calcbtcor(proj, causedf, resultid)
                except MySQLdb.Error, e:
                    print "error in select cause"
                    print "Mysql Error %d:%s" % (e.args[0], e.args[1])
def read_olfactory_receptors():
    """Reads in the significant olfactory receptors from MutsigCV.

    Returns
    -------
    olfactory : tuple
        tuple of gene names considered as olfactory receptors
    """
    cfg_opts = get_input_config('input')
    or_path = os.path.join(proj_dir, cfg_opts['olfactory_receptors'])
    with open(or_path, 'r') as handle:
        olfactory = tuple(gene.strip() for gene in handle.readlines())
    # open connection
    try:
        # if the table is not found just catch the exception
        gene_db_path = os.path.join(proj_dir, get_db_config('2020plus')['db'])
        conn = sqlite3.connect(gene_db_path)
        sql = ("SELECT DISTINCT Gene"
               " FROM mutation"
               " WHERE Gene in " + str(olfactory))
        df = psql.frame_query(sql, con=conn)
        conn.close()  # close connection
        # get significantly mutated genes found in database
        olfactory_in_database = tuple(df['Gene'])
        logger.debug('There are only %d/%d olfactory receptors found in the database.'
                     % (len(olfactory_in_database), len(olfactory)))
    except:
        olfactory_in_database = olfactory
    return olfactory_in_database
def find_restaurant_for_food(food, city, db):
    sql = "select bid from Acad_reviews where city = \'" + city + "\' AND text like \'%" + food + "%\' AND stars > 4 order by bid"
    df_bid = psql.frame_query(sql, con=db)
    bids = df_bid['bid']
    bid_counter = Counter(bids)
    top3 = bid_counter.most_common(3)
    topRest = []
    for item in top3:
        bid = item[0]
        sql = "select name,url from acad_restaurants where bid =\'" + bid + "\'"
        df_rest = psql.frame_query(sql, con=db)
        rname = df_rest['name'][0]
        url = df_rest['url'][0]
        rec = [rname, url]
        topRest.append(rec)
    return topRest
def find():
    rec = []
    homecity = request.form['homecity']
    visitcity = request.form['visitcity']
    if not homecity:
        homecity = 'atlanta'
    if not visitcity:
        visitcity = 'new york'
    db = MySQLdb.connect(host='localhost', user='******', passwd='insight', db='bestfood_AWS')
    cursor = db.cursor()
    cursor.execute("USE bestfood_AWS")
    sql = "select * from acad_topitems where homecity=\'" + visitcity + "\' AND visitcity= \'" + homecity + "\' order by rank asc limit 3"
    df_mysql = psql.frame_query(sql, con=db)
    foodlist = []
    # the original used range(0, 2), which picked up only two of the three rows
    # fetched by LIMIT 3
    for i in range(0, 3):
        food = df_mysql['term'][i]
        rname = df_mysql['rest'][i]
        url = df_mysql['url'][i]
        ratio = df_mysql['popularity_ratio'][i]
        rec = [food, rname, url, ratio]
        foodlist.append(rec)
    wordlist = []
    wordlist.append(rec)
    wordlist.append(visitcity)
    #random.shuffle(foodlist)
    return render_template('results.html', homecity=homecity, visitcity=visitcity, wordlist=foodlist)
def sqlite_table_to_hdf5(db_name, table):
    h5_name = os.path.splitext(db_name)[0]
    h5_group = '%s_%s' % (h5_name, table)
    h5_out = '%s.h5' % h5_name
    print "Storage: %s Group: %s" % (h5_out, h5_group)
    store = pandas.HDFStore(h5_out)
    storer = store.get_storer('/%s' % h5_group)
    if storer is not None:
        # Already stored something, grab last transaction tid.
        last_item = store.select(h5_group, start=storer.nrows - 1)
        last_tid = int(last_item.index[0][1])
    else:
        last_tid = -1  # all-time data. This might take some time.
    print "Last transaction ID: %d" % last_tid
    db = sqlite3.connect(db_name, detect_types=sqlite3.PARSE_COLNAMES)
    query = """SELECT datetime(timestamp, 'unixepoch') as "ts [timestamp]",
    CAST(price as FLOAT) as price, CAST(amount as FLOAT) as volume, tid
    FROM [%s] WHERE tid > ? ORDER BY tid ASC LIMIT 5000""" % table
    while True:
        dataframe = psql.frame_query(query, con=db, params=(last_tid,), index_col=['ts', 'tid'])
        if not len(dataframe) or last_tid == dataframe.index[-1][1]:
            print "Done."
            break
        print last_tid, dataframe.index[-1][1], len(dataframe)
        store.append(h5_group, dataframe)
        last_tid = int(dataframe.index[-1][1])
    store.close()
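# Usage sketch for sqlite_table_to_hdf5: because the function resumes from the
# last tid already present in the HDF5 store, it can be re-run periodically to
# pull only new rows, in 5000-row chunks. File and table names below are
# hypothetical.
sqlite_table_to_hdf5('mtgox.db', 'trades')  # creates/extends mtgox.h5, group 'mtgox_trades'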
def get_data(Query):
    import pandas.io.sql as psql
    from pyodbc import connect
    AccessDBFileName = r".\SIF_Testing.accdb"
    conn = connect("Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ=" + AccessDBFileName + ";")
    DataBase = psql.frame_query(Query, conn)
    return DataBase
def __Get_snapshot_of_data(self):
    import pandas.io.sql as psql
    from pyodbc import connect
    AccessDBFileName = r"\\ANUBIS\Dad_Docs\8510\8510 - Process Control Obsolescence\ENGINEERING\Databases\8510-0069 PLC Design Database.accdb"
    conn = connect("Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ=" + AccessDBFileName + ";")
    AllNamesQuery = "SELECT [~qry APM2 Classification Information].* FROM [~qry APM2 Classification Information];"
    return psql.frame_query(AllNamesQuery, conn)
def get_similarity_from_sub(self, sub):
    """Given a sub, return all other subs with similarity"""
    # fixed: the original query closed the quote after "ORDER BY sum DESC" and
    # ordered by a nonexistent `sum` column
    sub_scores = pd_psql.frame_query(
        "SELECT sub2, similarity FROM similarity WHERE sub1 = '{0}' ORDER BY similarity DESC;".format(sub),
        self.db_session.connection)
    sub_scores.similarity = sub_scores.similarity.astype(float)
    sub_scores = sub_scores[sub_scores['similarity'] > 0]
    sub_scores.set_index('sub2', inplace=True)
    return sub_scores
def getDataframeFromDatabase(host, db, un, pw):
    #query = "SELECT package from potential_unfair_apps LIMIT 1000;"
    query = "SELECT package from potential_unfair_apps;"
    print query
    conn = MySQLdb.connect(host=host, user=un, passwd=pw, db=db)
    unfair_apps_df = psql.frame_query(query, conn)
    return unfair_apps_df
def load_rows(sql):
    conn = psycopg2.connect(dbname=db, user=username, host=hostname)
    try:
        df = psql.frame_query(sql, conn)
    finally:
        conn.close()
    return df
def get_all_series_json(config):
    df = ""
    with statmart_utils.get_db_connection() as cursor:
        query = """SELECT identifier, description FROM series as S
        WHERE S.identifier like '%s%s%s' """ % (config("prefix"), "\_%\_", config("suffix"))
        df = psql.frame_query(query, con=cursor.connection)
    return json.dumps(df.as_matrix().tolist(), indent=4)
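# Note on the pattern above: in SQL LIKE, `_` matches any single character, so
# the backslash-escaped `\_%\_` matches a literal underscore, then anything,
# then another literal underscore, i.e. identifiers shaped like
# '<prefix>_<anything>_<suffix>'. A hedged sketch of the string that gets built
# (the prefix/suffix values are hypothetical config entries):
prefix, suffix = "gdp", "usa"
pattern = '%s%s%s' % (prefix, "\_%\_", suffix)  # -> gdp\_%\_usa
print "WHERE S.identifier like '%s'" % pattern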
def get_reviews(self, imdb_movie_id):
    if not self.in_review_database(imdb_movie_id):
        raise Exception("Movie %d is not in the database" % imdb_movie_id)
    query = """SELECT * FROM rs_reviews WHERE rs_imdb_movie_id=%s""" % imdb_movie_id
    df_mysql = psql.frame_query(query, con=self.db)
    return df_mysql
def load_redditor_subs(self, redditor, force_update=False):
    redditor_subs = pd_psql.frame_query("SELECT * FROM redditors WHERE redditor = '{0}'".format(redditor),
                                        self.db_session.connection)
    redditor_subs = redditor_subs.dropna(axis=1)
    del redditor_subs['redditor']
    if redditor_subs.empty or force_update:
        print '\nRedditor not in the database. Acquiring.\n'
        if check_redditor_exists(redditor):
            return get_redditor_subs(redditor)
        return []
    return list(redditor_subs.columns)
def extract_data_from_DB(datetime_format, db_local_path, db_utils_dict):
    with sqlite3.connect(db_local_path, detect_types=sqlite3.PARSE_DECLTYPES) as conn:
        dataframe_sqlite = psql.frame_query('select ' + db_utils_dict['table_columns'][0] + ', '
                                            + db_utils_dict['table_columns'][1] + ', '
                                            + db_utils_dict['table_columns'][2]
                                            + ' from ' + db_utils_dict['table_name'], con=conn)
    # TODO: change the hour to 2 hours more! -> http://pandas.pydata.org/pandas-docs/dev/generated/pandas.tseries.tools.to_datetime.html
    # This solution is a bit hacky ... and slow!!!
    dataframe_sqlite.index = pd.to_datetime(dataframe_sqlite.pop(db_utils_dict['table_columns'][0]))
    dataframe_sqlite_UTC = dataframe_sqlite.tz_localize('UTC')
    dataframe_sqlite_MAD = dataframe_sqlite_UTC.tz_convert(MAD)
    return dataframe_sqlite_MAD
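# extract_data_from_DB localizes the naive index to UTC and then converts it to
# the Madrid zone (the MAD constant referenced in the function). A hedged,
# self-contained sketch of that tz_localize/tz_convert step using pytz:
import pandas as pd
import pytz

MAD = pytz.timezone('Europe/Madrid')
idx = pd.to_datetime(['2014-07-01 10:00', '2014-07-01 11:00'])
df = pd.DataFrame({'v': [1, 2]}, index=idx)
print df.tz_localize('UTC').tz_convert(MAD)  # 12:00/13:00 CEST in summer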
def get_pandas(self, sql):
    try:
        self._con_cur()
        df = psql.frame_query(sql, con=self._con)
        return self._sanitize_df(df)
    except Exception, e:
        traceback.print_exc()
        print "Error: %s" % str(e)
        self._close()
        raise  # re-raise with the original traceback (the original used `raise (e)`)
def clidesc_ObsDaily(conn, channels, stations, from_date, to_date):
    """
    Returns the daily observations table for the requested stations,
    channels and date range.

    Parameters
    ----------
    conn : db connection
        A database connection object as returned by clidesc_open()
    channels : string
        The variable channels in the table
    stations : string
        The station or stations list
    from_date : string
        The starting date in 'YYYY-MM-DD' format, e.g. '2014-01-01'
    to_date : string
        The end date in 'YYYY-MM-DD' format, e.g. '2014-01-01'

    Returns
    -------
    table : Pandas.DataFrame
        A Pandas.DataFrame containing the data
    """
    # some string formatting on the stations if more than one
    if isinstance(stations, str) and ',' in stations:
        stations = stations.replace(',', '\',\'')
    inputs = '%s::%s::%s::%s' % (channels, stations, from_date, to_date)
    query = """SELECT station_no, TO_CHAR(lsd, 'yyyy-mm-dd') as LSD, %s FROM obs_daily
    WHERE station_no IN ('%s')
    AND lsd >= TO_TIMESTAMP('%s', 'yyyy-mm-dd')
    AND lsd <= TO_TIMESTAMP('%s', 'yyyy-mm-dd') ORDER BY lsd""" % (channels, stations, from_date, to_date)
    # get the table returned by the query as a pandas dataframe
    try:
        table = psql.frame_query(query, conn)
        # the index of the pandas DataFrame is the 'lsd' field, correctly
        # cast to a pandas datetime index and named 'timestamp'
        table.index = pd.to_datetime(table.lsd)
        table.index.name = 'timestamp'
        return table
    except:
        print('query failed for %s, was probably malformed' % (inputs))
        return None
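# A hedged end-to-end sketch of the clidesc helpers above, assuming a psycopg2
# connection to a CliDE database (the host/credentials below are placeholders)
# and that 'rain_24h' is a channel present in obs_daily.
import psycopg2
import pandas.io.sql as psql
import pandas as pd

conn = psycopg2.connect(host='clide-host', dbname='clideDB', user='clide', password='secret')
stations = clidesc_getStationsByCountry(conn, 'FJ')  # all Fiji stations
cols = clidesc_getColumns(conn, 'obs_daily')         # available channels
first = str(stations.station_no.iloc[0])
daily = clidesc_ObsDaily(conn, 'rain_24h', first, '2014-01-01', '2014-12-31')
print daily.head()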
def load_estimate_hist(uni_df, start, end, estimate):
    window = timedelta(days=252)
    con = lite.connect(ESTIMATES_BASE_DIR + "ibes.db")
    date = start
    df_list = list()
    uni_df = uni_df.reset_index()
    while (date < end):
        if date.day == end.day:
            times = [str(end.hour - 3) + ":" + str(end.minute)]
        else:
            times = ['16:00']
        endDateStr = date.strftime('%Y%m%d')
        startDateStr = (date - window).strftime('%Y%m%d')
        for time in times:
            minPeriod = str(int(endDateStr[2:4])) + endDateStr[4:6]
            maxPeriod = str(int(endDateStr[2:4]) + 2) + "00"
            sql = "select * from t_ibes_det_snapshot where timestamp between '{} {}' and '{} {}' and measure = '{}' and forecast_period_ind = 1 and forecast_period_end_date > {} and forecast_period_end_date < {} group by sid, ibes_ticker, estimator, forecast_period_ind, forecast_period_end_date having timestamp = max(timestamp) order by sid, forecast_period_end_date;".format(startDateStr, time, endDateStr, time, estimate, minPeriod, maxPeriod)
            print(sql)
            df = psql.frame_query(sql, con)
            df['value'] = df['value'].astype(str)
            df = df[df['value'] != '']
            # df['ts'] = pd.to_datetime(date.strftime("%Y%m%d") + " " + time)
            df['date'] = pd.to_datetime(date.strftime("%Y%m%d"))
            df['value'] = df['value'].astype(float)
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            print(df.columns)
            df = pd.merge(uni_df[uni_df['date'] == date], df, how='inner',
                          left_on=['sid'], right_on=['sid'], sort=True, suffixes=['', '_dead'])
            df = df[~df.duplicated(cols=['date', 'sid', 'estimator'])]
            df = df.set_index(['date', 'sid'])
            df_list.append(df)
        date += timedelta(days=1)
    df = pd.concat(df_list)
    print("DFEPS")
    print(df)
    # consensus
    result_df = df.groupby(level=['date', 'sid']).agg({'value': [np.mean, np.median, np.std, 'count', np.max, np.min],
                                                       'timestamp': 'last'})
    result_df.columns = result_df.columns.droplevel(0)
    for column in result_df.columns:
        result_df.rename(columns={column: estimate + '_' + column}, inplace=True)
    # detailed
    df = df.set_index('estimator', append=True)
    print("SEAN2")
    print(df.head())
    df2 = df['value'].unstack(['estimator', 'sid']).fillna(0).diff().iloc[1:].stack(['sid', 'estimator'])
    df2 = df2[df2 != 0]
    df2 = df2.reset_index('estimator').groupby(level=['date', 'sid']).agg(np.mean)
    del df2['estimator']
    df2.columns = [estimate + '_diff_mean']
    result_df = pd.merge(result_df, df2, left_index=True, right_index=True, how='left')
    return result_df
def get_series_info_map_json(config):
    df = ""
    with us.get_db_connection() as cursor:
        query = """SELECT identifier, originalsource, proximatesource FROM series
        WHERE series.identifier like '%s\_%s\_%s' """ % (config["prefix"], "%", config["suffix"])
        df = psql.frame_query(query, con=cursor.connection)
    df = df.set_index("identifier")
    return df.to_json(orient="index")
def psql2df(conn, columns, table):
    try:
        # the original hard-coded "SELECT id, price FROM stock_price" despite
        # taking columns/table parameters; build the query from them instead
        df = psql.frame_query("SELECT %s FROM %s" % (', '.join(columns), table), conn)
    except Exception as error:
        err = {'error': 'Error: %s' % error}
        log.error(err)
        return err
    finally:
        if conn is not None:
            conn.close()
    return df
def getScrapedContent():
    DBNAME = 'newscontent'
    DBUSER = '******'
    PASSWRD = open('password.txt').readline()
    conn = psycopg2.connect(database=DBNAME, user=DBUSER, password=PASSWRD)
    sql = 'SELECT * from stocknews_newscontent'
    df = psql.frame_query(sql, conn)
    return df
def get_movie(self, imdb_movie_id):
    query = """SELECT * FROM rs_movies WHERE rs_imdb_movie_id=%s""" % imdb_movie_id
    df_mysql = psql.frame_query(query, con=self.db)
    if len(df_mysql) == 0:
        raise Exception("Movie %d is not in the database" % imdb_movie_id)
    assert len(df_mysql) <= 1
    return df_mysql.ix[0]
def get_gene_length():
    # get db
    db_path = get_db_config('2020plus')['db']
    # query for gene length
    conn = sqlite3.connect(db_path)
    sql = "SELECT gene, gene_length FROM gene_features"
    df = psql.frame_query(sql, con=conn)
    df = df.set_index('gene')
    conn.close()
    return df
def sqldf(q, env, inmemory=True):
    """
    Query pandas data frames using SQL syntax.

    q: a SQL query using DataFrames as tables
    env: variable environment; passing locals() or globals() from your function
        allows sqldf to access the variables in your python environment
    inmemory: default is in memory; if False the database is temporarily
        persisted to disk

    Example
    -----------------------------------------
    # example with a data frame
    df = pd.DataFrame({"x": range(100), "y": range(100)})
    from pandasql import sqldf
    sqldf("select * from df;", locals())
    sqldf("select avg(x) from df;", locals())
    """
    if inmemory:
        dbname = ":memory:"
    else:
        dbname = ".pandasql.db"
    conn = sqlite.connect(dbname, detect_types=sqlite.PARSE_DECLTYPES)
    tables = _extract_table_names(q)
    for table in tables:
        if table not in env:
            conn.close()
            if not inmemory:
                os.remove(dbname)
            raise Exception("%s not found" % table)
        df = env[table]
        df = _ensure_data_frame(df, table)
        _write_table(table, df, conn)
    try:
        result = frame_query(q, conn)
    except:
        result = None
    finally:
        conn.close()
        if not inmemory:
            os.remove(dbname)
    return result
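# A quick, hedged demo of sqldf, assuming the pandasql package that provides it
# is installed alongside pandas.
import pandas as pd
from pandasql import sqldf

df = pd.DataFrame({"x": range(100), "y": range(100)})
print sqldf("select avg(x), max(y) from df;", locals())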