def write_table(df, tablename, conn):
    """
    Write a dataframe to the database.
    """
    with catch_warnings():
        filterwarnings(
            'ignore',
            message='The provided table name \'%s\' is not found exactly as such in the database' % tablename)
        to_sql(df, name=tablename, con=conn,
               index=not any(name is None for name in df.index.names))  # load index into db if all levels are named
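# A minimal, self-contained sketch of the index rule used by write_table above
# (write the index only when every level is named). The in-memory sqlite3
# connection and table names here are illustrative assumptions, not part of the
# original snippet.
import sqlite3
import pandas as pd

conn = sqlite3.connect(':memory:')
named = pd.DataFrame({'x': [1, 2]}, index=pd.Index(['a', 'b'], name='key'))
unnamed = pd.DataFrame({'x': [1, 2]})  # default RangeIndex has no name

# Same predicate as write_table: keep the index only if all levels are named.
named.to_sql('t_named', conn, index=not any(n is None for n in named.index.names))
unnamed.to_sql('t_unnamed', conn, index=not any(n is None for n in unnamed.index.names))

print(pd.read_sql('SELECT * FROM t_named', conn).columns.tolist())    # ['key', 'x']
print(pd.read_sql('SELECT * FROM t_unnamed', conn).columns.tolist())  # ['x']
conn.close()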
def write_to_db(df, name):
    # TODO: float with nan not being handled
    # HACK: sql._SQL_TYPES['text']['mysql'] = 'VARCHAR (255)'
    sql.to_sql(df.where(pd.notnull(df), None), con=con, name=name,
               flavor='mysql', if_exists='replace', index="id")
def download_dd_data(start=None):
    '''Fetch big-deal (large-order) data.'''
    conn = db.get_dd_data_db()
    if start is None:
        start = utils.today_last_year(1)
    for code in get_all_stock_code():
        end = datetime.today().date()
        while start < end:
            # iterate one day at a time from start up to today
            date = start.strftime('%Y-%m-%d')
            df = ts.get_sina_dd(code=code, date=date, vol=500)
            if df is not None:
                df.insert(0, 'code', code)
                try:
                    sql_df = df.loc[:, :]
                    sql.to_sql(sql_df, name='dd_data', con=conn, index=True, if_exists='append')
                    log.info('%s,%s dd data download ok.' % (code, start))
                except Exception as e:
                    log.error('download error:%s,%s' % (code, date))
            start = start + timedelta(days=1)
def test_safe_names_warning(self):
    # GH 6798
    df = DataFrame([[1, 2], [3, 4]], columns=['a', 'b '])  # has a space
    # warns on create table with spaces in names
    with tm.assert_produces_warning():
        sql.to_sql(df, "test_frame3_legacy", self.conn,
                   flavor="sqlite", index=False)
def test_sql_open_close(self):
    """
    Test if the IO in the database still works if the connection is closed
    between the writing and reading (as in many real situations).
    """
    self._load_test2_data()

    with tm.ensure_clean() as name:
        conn = self.connect(name)
        sql.to_sql(self.test_frame2, "test_frame2_legacy", conn,
                   flavor="sqlite", index=False)
        conn.close()

        conn = self.connect(name)
        result = sql.read_sql("SELECT * FROM test_frame2_legacy;", conn,
                              flavor="sqlite")
        conn.close()

    tm.assert_frame_equal(self.test_frame2, result)
def download_money_flow_data(num=1000):
    '''
    get money flow from sina finance
    :param num:
    :return:
    '''
    conn = db.get_money_flow_db()
    cur_time = datetime.now()
    for code in get_all_stock_code():
        code = append_stock_perfix(code)
        cost = datetime.now()
        for i in range(retry):
            try:
                url = sina_money_flow_api % (num, code)
                text = request.urlopen(url, timeout=10).read()
                text = text.decode('GBK')
                log.info('url=%s,size=%d, try=%d' % (url, len(text), i))
                if len(text) < 10:
                    continue
                # j = demjson.decode(text, 'utf-8')  # demjson is very slow when the JSON payload is large
                text = text[2:-2]
                j = text.replace('"', '').split('},{')
                # date, close, change ratio, turnover, net inflow, net inflow ratio,
                # main-force net inflow, main-force net inflow ratio, sector net inflow ratio
                head = ['date', 'close', 'chg_r', 'turnover', 'netamount', 'ratio',
                        'zl_netamount', 'zl_ratio', 'cat_ratio']
                data = []
                for x in j:
                    m = {}
                    for s in x.split(','):
                        k, v = s.split(':')
                        if '-' == v or 'null' == v:
                            v = '0.0'
                        m[k] = v
                    date = m['opendate']
                    close = float(m['trade'])
                    chg_r = float(m['changeratio'])
                    turnover = float(m['turnover']) / 10000
                    netamount = float(m['netamount']) / 10000
                    ratio = float(m['ratioamount'])
                    zl_netamount = float(m['r0_net']) / 10000
                    zl_ratio = float(m['r0_ratio'])
                    cat_ratio = float(m['cate_ra'])
                    data.append([date, close, chg_r, turnover, netamount, ratio,
                                 zl_netamount, zl_ratio, cat_ratio])
                df = pd.DataFrame(data, columns=head)
                log.info('data ok')
                if not df.empty:
                    df.insert(1, 'code', code)
                    sql_df = df.loc[:, :]
                    sql.to_sql(sql_df, name='money_flow', con=conn, index=False, if_exists='append')
                    log.info('%s,%d money flow data download ok.' % (code, len(sql_df)))
                break
            except Exception as e:
                log.error('error:code=%s,msg=%s' % (code, e))
        log.debug('%s,costs:%d s' % (code, (datetime.now() - cost).seconds))
    conn.close()
    log.info('money flow data download complete. cost %d s' % (datetime.now() - cur_time).seconds)
def CleanReferral():
    print("Joining referral data to LA City zip codes to filter referrals within LA City.")
    # Join DCFS and LA City zip code data by zip code
    # to obtain referrals made within LA City (DCFS referral data is originally county-level)
    conn = sq.connect('ReferralCrimeMap.db')
    cur = conn.cursor()
    query = '''
    SELECT *
    FROM dcfs
    JOIN LAzip
    ON dcfs.ref_zip = LAzip.zipcode
    '''
    dcfs_all = pd.read_sql(query, conn)
    conn.close()

    # Select the referrals received in the focal year 2016
    print("Select the referrals received in focal year 2016.")
    dcfs_all['ref_year'] = dcfs_all['ref_date'].map(lambda x: x.split('-')[0])
    dcfs_all['ref_month'] = dcfs_all['ref_date'].map(lambda x: x.split('-')[1])
    dcfs_all.rename(columns={'prime_key': 'zip_ID'}, inplace=True)
    dcfs16 = dcfs_all[dcfs_all['ref_year'] == '2016']

    # Save the data into ReferralCrimeMap.db
    print("Referral_LAcity2016 is saved to ReferralCrimeMap.db")
    conn = sq.connect('ReferralCrimeMap.db')
    pd_sql.to_sql(dcfs16, 'Referral_LAcity2016', conn, if_exists='replace', index=False)
    conn.close()
def daily(now):
    import pandas
    import datetime
    from sql import getCursor, insertStock
    from stock import oneStock, getStockList
    import pandas.io.sql as pdsql

    before = now - datetime.timedelta(700)
    list = getStockList()
    total = len(list)
    today = pandas.DataFrame()
    count = 1
    for v in list:
        print str(count) + '/' + str(total), 'start', v
        try:
            df = oneStock(v, starttime=before, endtime=now)
            df = df.iloc[[-1]]
            eng = getE()
            df = df.replace({float('inf'): 0, float('NaN'): 0})
            pdsql.to_sql(df, 'today', eng, if_exists='append', index=False, index_label='stock_id')
        except Exception as e:
            count += 1
            print e
            print v, 'error'
            continue
        print str(count) + '/' + str(total), 'end', v
        count += 1
    return
def generate_sqlite(self):
    print "Converting csv file to sqlite for train set:"
    num_lines = sum(1 for line in open(self.folder + self.file_in))
    columns = ['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
               'TIMESTAMP', 'DAYTYPE', 'MISSING_DATA', 'POLYLINE']
    con = sqlite3.connect(self.folder + self.file_out)
    chunk_size = 5000
    count = 1
    for i in range(0, num_lines, chunk_size):
        df = pd.read_csv(self.folder + self.file_in, header=None, nrows=chunk_size,
                         skiprows=i, low_memory=False)
        df.columns = columns
        sql.to_sql(df, name='train_data', con=con, index=False,
                   index_label='molecule_id', if_exists='append')
        print "Batch No. {} completed".format(count)
        count += 1
    con.close()

    # Delete the first row with duplicate column names
    con = sqlite3.connect(self.folder + self.file_out)
    c = con.cursor()
    c.execute("DELETE FROM train_data WHERE TRIP_ID='TRIP_ID'")
    con.commit()
    con.close()
    print "All completed!\n"
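# An alternative sketch of the CSV-to-sqlite conversion above using read_csv's
# chunksize iterator, which parses the header once and so never writes the
# duplicated 'TRIP_ID' header row that has to be deleted afterwards. This is a
# different approach under assumed file and table names, not the snippet's own
# method.
import contextlib
import sqlite3
import pandas as pd

def csv_to_sqlite(csv_path, db_path, table='train_data', chunk_size=5000):
    # Stream the CSV in chunks and append each parsed chunk to the sqlite table.
    with contextlib.closing(sqlite3.connect(db_path)) as con:
        for chunk in pd.read_csv(csv_path, chunksize=chunk_size):
            chunk.to_sql(table, con, index=False, if_exists='append')
        con.commit()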
def test_read_table_columns(self):
    # test columns argument in read_table
    sql.to_sql(self.test_frame1, 'test_frame', self.conn)

    cols = ['A', 'B']
    result = sql.read_table('test_frame', self.conn, columns=cols)
    self.assertEqual(result.columns.tolist(), cols,
                     "Columns not correctly selected")
def test_to_sql_fail(self):
    sql.to_sql(self.test_frame1, 'test_frame2', self.conn,
               flavor='sqlite', if_exists='fail')
    self.assertTrue(
        sql.has_table('test_frame2', self.conn, flavor='sqlite'),
        'Table not written to DB')

    self.assertRaises(ValueError, sql.to_sql, self.test_frame1,
                      'test_frame2', self.conn, flavor='sqlite',
                      if_exists='fail')
def getData(self):
    # Get crime records via API
    if self.storage == "remote":
        print("Getting crime data from remote source.\nThis takes a while (approx. 5 mins)! Please be patient.")
        # API request information
        client_crime = Socrata('data.lacity.org', '7pTgt6f2oTY53aDI1jXNJoNZD')
        offset_temp = 0
        conn = sq.connect("ReferralCrimeMap.db")
        cur = conn.cursor()
        cur.execute("DROP TABLE IF EXISTS 'Crime2016'")
        # Get the data as a dataframe, then manipulate it before storing it in ReferralCrimeMap.db
        while True:
            results = client_crime.get("7fvc-faax", limit=10000, offset=offset_temp)
            crime_df_temp = pd.DataFrame.from_records(results)
            # This loop stops when the next block of the dataframe is empty
            if crime_df_temp.empty:
                break
            # Split location_1 into lat and long
            # Create 'year_rptd' to filter cases reported in 2016
            # Create 'count' for later data analysis
            crime_df_temp['location_1'] = crime_df_temp['location_1'].astype('str')
            crime_df_temp['long'] = crime_df_temp['location_1'].map(
                lambda x: x.split(']')[0].split('[')[-1].split(',')[0])
            crime_df_temp['lat'] = crime_df_temp['location_1'].map(
                lambda x: x.split(']')[0].split('[')[-1].split(',')[-1])
            crime_df_temp['year_rptd'] = crime_df_temp['date_rptd'].map(lambda x: x.split('-')[0])
            crime_df_temp['month_rptd'] = crime_df_temp['date_rptd'].map(lambda x: x.split('-')[1])
            crime_df_temp['count'] = 1
            crime_df_temp = crime_df_temp[crime_df_temp['year_rptd'] == '2016']
            # Insert dataframe into ReferralCrimeMap.db
            pd_sql.to_sql(crime_df_temp, 'Crime2016', conn, if_exists='append', index=False)
            offset_temp += 10000
            # Show the percentage of data processed
            if offset_temp % 100000 == 0:
                print(offset_temp / 2000000 * 100, "%")
            else:
                continue
        cur.execute("SELECT * FROM Crime2016")
        print(cur.fetchone())
        conn.close()
    # Load local data if -source is set to local
    else:
        print("Getting crime data from local source.")
        conn = sq.connect("ReferralCrimeMap.db")
        cur = conn.cursor()
        query = "SELECT * FROM Crime"
        try:
            crime = pd.read_sql(query, conn)
            conn.close()
            print(crime.head())
        except Exception as e:
            print("There is an error:", e)
            print("Please set data source as remote.")
            exit()
def test():
    v = 'sh600216'
    # print str(count)+'/'+str(total),'start',v
    # try:
    df = oneStock(v, starttime=datetime(2016, 1, 1), endtime=datetime(2016, 8, 30))
    [conn, cur] = getCursor()
    pdsql.to_sql(df, v, conn, flavor='mysql')
    setPrimaryKey(v)
    return
def transform(self):
    # this function does the following:
    # 1) convert timestamp to time of the day, day of week, and month of year
    # 2) convert polyline to snapshots
    # 3) calculate the trip length
    # 4) calculate the average speed
    print "Generating training file with descriptive stats:"

    # initialize the connection with the input and output sqlite file
    con_in = sqlite3.connect(self.folder + self.file_in)
    con_out = sqlite3.connect(self.folder + self.file_out)
    con_in.text_factory = str
    chunk_reader = pd.read_sql("SELECT * FROM train_data WHERE MISSING_DATA!='1'",
                               con_in, chunksize=5000)
    count = 1
    for chunk in chunk_reader:
        print 'Chunk {} started:'.format(count)
        chunk['Time_of_Day'] = chunk.TIMESTAMP.map(
            lambda x: datetime.utcfromtimestamp(float(x)).hour
            + datetime.utcfromtimestamp(float(x)).minute / 60.0
            + datetime.utcfromtimestamp(float(x)).second / 3600.0)
        chunk['Hour_of_Day'] = chunk.Time_of_Day.map(lambda x: np.round(x))
        chunk['Day_of_Week'] = chunk.TIMESTAMP.map(lambda x: datetime.utcfromtimestamp(float(x)).weekday())
        chunk['Month_of_Year'] = chunk.TIMESTAMP.map(lambda x: datetime.utcfromtimestamp(float(x)).month)
        chunk['POLYLINE_Split'] = chunk.POLYLINE.map(
            lambda x: re.compile("\[[-+]?\d+.\d+,[-+]?\d+.\d+\]").findall(x))
        chunk['Snapshots'] = chunk.POLYLINE_Split.map(lambda x: len(x))
        chunk = chunk[chunk.Snapshots > 10]
        chunk['Start_Point'] = chunk.POLYLINE_Split.map(lambda x: lonlat_convert(x[0]))
        chunk['End_Point'] = chunk.POLYLINE_Split.map(lambda x: lonlat_convert(x[-1]))
        chunk['Trip_Length'] = pd.DataFrame(
            [get_dist(chunk.iloc[i].Start_Point, chunk.iloc[i].End_Point) for i in range(len(chunk))])
        chunk['Avg_Speed'] = chunk['Trip_Length'] * 1000.0 / ((chunk['Snapshots'] - 1) * 15)
        chunk.drop(['POLYLINE', 'POLYLINE_Split', 'Start_Point', 'End_Point'], axis=1, inplace=True)
        sql.to_sql(chunk, name='train_data', con=con_out, index=False,
                   index_label='molecule_id', if_exists='append')
        print 'Chunk {} completed!'.format(count)
        count += 1
    con_in.close()
    con_out.close()
    print "All completed!\n"
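# lonlat_convert and get_dist are not defined in the snippet above; the
# reconstructions below are hypothetical guesses at their intent (parse one
# "[lon,lat]" token, return a great-circle distance in kilometres).
import math

def lonlat_convert(token):
    # Hypothetical helper: parse a "[lon,lat]" token from POLYLINE into floats.
    lon, lat = token.strip('[]').split(',')
    return float(lon), float(lat)

def get_dist(p1, p2):
    # Hypothetical helper: haversine distance in km between two (lon, lat) points.
    lon1, lat1 = map(math.radians, p1)
    lon2, lat2 = map(math.radians, p2)
    a = (math.sin((lat2 - lat1) / 2) ** 2
         + math.cos(lat1) * math.cos(lat2) * math.sin((lon2 - lon1) / 2) ** 2)
    return 2 * 6371.0 * math.asin(math.sqrt(a))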
def load_rates():
    '''Puts all the DAWF rates into a sql table'''
    try:
        sql.has_table('DAWF', con=cnx)
        print "Injury rate data already loaded."
    except ValueError:
        sql.to_sql(DAWF_rate, name='DAWF', con=cnx)
        print "Injury rate data successfully loaded."
def load_aos():
    '''Puts all the Area Offices into a sql table'''
    try:
        sql.has_table('AOS', con=cnx)
        print "Area office data already loaded."
    except ValueError:
        sql.to_sql(aos, name='AOS', con=cnx)
        print "Area office data successfully loaded."
def load_emp():
    '''Puts all of the employment data into a sql table'''
    try:
        sql.has_table('allemp', con=cnx)
        print "Employment data already loaded."
    except ValueError:
        sql.to_sql(view_emp, name='allemp', con=cnx)
        print "Employment data successfully loaded."
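# The three loaders above rely on sql.has_table raising ValueError, but in
# current pandas has_table returns a boolean, so the intended "load once"
# behaviour can be sketched as below. The in-memory sqlite3 connection stands
# in for cnx and the frame is purely illustrative.
import sqlite3
import pandas as pd
import pandas.io.sql as sql

def load_once(df, table_name, con):
    # Write df only on the first call; has_table returns True once the table exists.
    if sql.has_table(table_name, con):
        print("{} data already loaded.".format(table_name))
    else:
        sql.to_sql(df, name=table_name, con=con, index=False)
        print("{} data successfully loaded.".format(table_name))

con = sqlite3.connect(':memory:')
load_once(pd.DataFrame({'rate': [1.2, 3.4]}), 'DAWF', con)
load_once(pd.DataFrame({'rate': [1.2, 3.4]}), 'DAWF', con)  # second call is a no-op
con.close()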
def _write_table(tablename, df, conn):
    "writes a dataframe to the sqlite database"
    for col in df.columns:
        if re.search("[()]", col):
            msg = "please follow SQLite column naming conventions: "
            msg += "http://www.sqlite.org/lang_keywords.html"
            raise Exception(msg)

    to_sql(df, name=tablename, con=conn, flavor='sqlite')
def _write_table(self, tablename, df):
    "writes a dataframe to the sqlite database"
    for col in df.columns:
        if re.search("[()]", col):
            msg = "please follow SQLite column naming conventions: "
            msg += "http://www.sqlite.org/lang_keywords.html"
            raise Exception(msg)

    dtype = dict((k, str(v)) for k, v in dict(df.dtypes).items())
    to_sql(df, name=tablename, con=self.conn, flavor="sqlite", dtype=dtype)
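# A hedged sketch of the same dtype idea against current pandas (no flavor
# argument): map pandas dtypes to explicit SQLite column types before writing.
# The mapping and table name below are illustrative assumptions, not taken from
# the snippet above.
import sqlite3
import pandas as pd

conn = sqlite3.connect(':memory:')
df = pd.DataFrame({'a': [1, 2], 'b': [0.5, 1.5], 'c': ['x', 'y']})

type_map = {'int64': 'INTEGER', 'float64': 'REAL', 'object': 'TEXT'}
dtype = {col: type_map.get(str(t), 'TEXT') for col, t in df.dtypes.items()}

df.to_sql('typed_table', conn, index=False, dtype=dtype)
print(pd.read_sql('SELECT * FROM typed_table', conn))
conn.close()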
def _write_table(self, tablename, df, conn):
    "writes a dataframe to the sqlite database"
    for col in df.columns:
        if re.search("[()]", col):
            msg = "please follow SQLite column naming conventions: "
            msg += "http://www.sqlite.org/lang_keywords.html"
            raise Exception(msg)

    if self.do_log:
        self.log.append("(pandas) create table %s ..." % tablename)
        cards = ','.join(['?'] * len(df.columns))
        self.log.append("(pandas) INSERT INTO %s VALUES (%s)" % (tablename, cards))

    to_sql(df, name=tablename, con=self.conn, flavor=self.engine)
def test_to_sql_replace(self):
    sql.to_sql(self.test_frame1, 'test_frame3', self.conn,
               flavor='sqlite', if_exists='fail')
    # Add to table again
    sql.to_sql(self.test_frame1, 'test_frame3', self.conn,
               flavor='sqlite', if_exists='replace')
    self.assertTrue(
        sql.has_table('test_frame3', self.conn, flavor='sqlite'),
        'Table not written to DB')

    num_entries = len(self.test_frame1)
    num_rows = self._count_rows('test_frame3')

    self.assertEqual(num_rows, num_entries,
                     "not the same number of rows as entries")
def test(now):
    import datetime
    from sql import getCursor, insertStock
    from stock import oneStock, getStockList
    import pandas.io.sql as pdsql

    before = now - datetime.timedelta(700)
    v = 'sh600216'
    df = oneStock(v, starttime=before, endtime=now)
    df = df.iloc[[-1]]
    [conn, cur] = getCursor()
    pdsql.to_sql(df, v, conn, if_exists='append', index_label='date')
    return
def test_to_sql_index_label(self):
    temp_frame = DataFrame({'col1': range(4)})

    # no index name, defaults to 'pandas_index'
    sql.to_sql(temp_frame, 'test_index_label', self.conn)
    frame = sql.read_table('test_index_label', self.conn)
    self.assertEqual(frame.columns[0], 'pandas_index')

    # specifying index_label
    sql.to_sql(temp_frame, 'test_index_label', self.conn,
               if_exists='replace', index_label='other_label')
    frame = sql.read_table('test_index_label', self.conn)
    self.assertEqual(frame.columns[0], 'other_label',
                     "Specified index_label not written to database")

    # using the index name
    temp_frame.index.name = 'index'
    sql.to_sql(temp_frame, 'test_index_label', self.conn, if_exists='replace')
    frame = sql.read_table('test_index_label', self.conn)
    self.assertEqual(frame.columns[0], 'index',
                     "Index name not written to database")

    # has index name, but specifying index_label
    sql.to_sql(temp_frame, 'test_index_label', self.conn,
               if_exists='replace', index_label='other_label')
    frame = sql.read_table('test_index_label', self.conn)
    self.assertEqual(frame.columns[0], 'other_label',
                     "Specified index_label not written to database")
def test_roundtrip(self):
    sql.to_sql(self.test_frame1, 'test_frame_roundtrip',
               con=self.conn, flavor='sqlite')
    result = sql.read_sql('SELECT * FROM test_frame_roundtrip',
                          con=self.conn, flavor='sqlite')

    # HACK!
    result.index = self.test_frame1.index
    result.set_index('pandas_index', inplace=True)
    result.index.astype(int)
    result.index.name = None
    tm.assert_frame_equal(result, self.test_frame1)
def getData(self):
    if self.storage == "remote":
        print("Getting walk score data from remote source.\nThis takes a while! Please be patient.")
        # Get walk score
        walk_score = list()
        for i in self.lacity_zip:
            zipcode = i[1]
            zip_ID = i[0]
            url = "https://www.walkscore.com/CA/Los_Angeles/" + str(zipcode)
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'lxml')
            score = soup.find_all(
                'div',
                {"style": "padding: 0; margin: 0; border: 0; outline: 0; position: absolute; top: 0; bottom: 0; left: 0; right: 0;"})
            walkScoreTag = score[0].find('img')
            walkScore = walkScoreTag.attrs['src'].split('/')[-1].split('.')[0]
            walk_score.append((zipcode, walkScore, zip_ID))

        # Convert walk score list into dataframe
        print("Convert walk score into dataframe")
        walk_score_df = pd.DataFrame(walk_score)
        walk_score_df.columns = ['zip_code', 'walk_score', 'zip_ID']
        print("walk score dataframe")
        print(walk_score_df.head())

        # Insert walk score dataframe into ReferralCrimeMap.db
        print("Inserting walk score dataframe into ReferralCrimeMap.db")
        conn = sq.connect("ReferralCrimeMap.db")
        cur = conn.cursor()
        pd_sql.to_sql(walk_score_df, 'WalkScore', conn, if_exists='replace', index=False)
        cur.fetchmany(5)
        conn.close()
    else:
        print("Getting walk score from local source.")
        conn = sq.connect("ReferralCrimeMap.db")
        cur = conn.cursor()
        query = '''
        SELECT *
        FROM WalkScore
        '''
        try:
            walkScore = pd.read_sql(query, conn)
            conn.close()
            print(walkScore.head())
        except Exception as e:
            print("There is an error:", e)
            print("Please enter correct data source.")
            exit()
def save(self, filename=None):
    """
    Save the point histories to sqlite3 database.
    Save the device object properties to a pickle file so the device can be reloaded.
    """
    if filename:
        if ".db" in filename:
            filename = filename.split(".")[0]
        self.properties.db_name = filename
    else:
        self.properties.db_name = "{}".format(self.properties.name)

    # Does file exist? If so, append data
    if os.path.isfile("{}.db".format(self.properties.db_name)):
        his = self._read_from_sql(
            'select * from "{}"'.format("history"), self.properties.db_name
        )
        his.index = his["index"].apply(Timestamp)
        try:
            last = his.index[-1]
            df_to_backup = self.backup_histories_df()[last:]
        except IndexError:
            df_to_backup = self.backup_histories_df()
    else:
        self._log.debug("Creating a new backup database")
        df_to_backup = self.backup_histories_df()

    # DataFrames that will be saved to SQL
    with contextlib.closing(
        sqlite3.connect("{}.db".format(self.properties.db_name))
    ) as con:
        sql.to_sql(
            df_to_backup,
            name="history",
            con=con,
            index_label="index",
            index=True,
            if_exists="append",
        )

    # Saving other properties to a pickle file...
    prop_backup = {}
    prop_backup["device"] = self.dev_properties_df()
    prop_backup["points"] = self.points_properties_df()
    with open("{}.bin".format(self.properties.db_name), "wb") as file:
        pickle.dump(prop_backup, file)

    self._log.info("Device saved to {}.db".format(self.properties.db_name))
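# A minimal sketch of reading such a backup back, assuming only the naming
# convention used by save() above ("<name>.db" holding a "history" table plus
# "<name>.bin" holding the pickled properties); the function name is illustrative.
import contextlib
import pickle
import sqlite3
import pandas as pd

def load_backup(db_name):
    # Re-read the point histories and the pickled device/point properties
    # written by save(); db_name is the base name without an extension.
    with contextlib.closing(sqlite3.connect("{}.db".format(db_name))) as con:
        his = pd.read_sql('select * from "history"', con)
    his = his.set_index("index")
    his.index = pd.to_datetime(his.index)
    with open("{}.bin".format(db_name), "rb") as f:
        prop_backup = pickle.load(f)
    return his, prop_backup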
def oneIndustry(industry, i_c):
    set = pd.DataFrame()
    for id in i_c.index:
        te = __company_info__.query('stock_id==' + str(id))
        set = set.append(te)
    pure = set['outstanding'].sum() * 10000
    pure = int(pure)
    total = int((set['outstanding'] * set['bvps']).sum() * 10000)
    set['percent'] = (set['outstanding'] * set['bvps'] * 10000) / total
    count = len(set.index)

    # get src
    starttime = datetime.datetime(2006, 1, 1)
    endtime = datetime.datetime.now()
    use = []
    for id in i_c.index:
        te = getSingleStock(transTable(id), starttime, endtime)
        updown(te)
        te.reset_index(inplace=True)
        te['stock_id'] = id
        resu = pd.merge(left=te[['stock_id', 'open', 'high', 'close', 'low', 'volume', 'updown', 'date']],
                        right=set[['stock_id', 'stock_name', 'percent']], on='stock_id')
        resu['open'] = resu['open'] * resu['percent']
        resu['realclose'] = resu['close']
        resu['close'] = resu['close'] * resu['percent']
        resu['high'] = resu['high'] * resu['percent']
        resu['low'] = resu['low'] * resu['percent']
        resu['volume'] = resu['volume'] * resu['percent']
        use.append(resu)

    data = use[0]
    for i in range(1, len(use)):
        data = data.append(use[i])
    data.fillna(0, inplace=True)

    df1 = data[['open', 'high', 'low', 'close', 'volume']].groupby(data['date']).sum()
    df1.reset_index(level=0, inplace=True)
    df2 = data[['stock_name', 'updown', 'stock_id', 'date', 'realclose']].groupby(
        data['date'], as_index=False).apply(apply_lambda)
    df = pd.merge(left=df1, right=df2, on='date')
    df['industry_name'] = industry
    df['pure'] = pure
    df['total'] = total
    df['companySum'] = count
    df = df.rename(columns={'volume': 'volumn', 'stock_name': 'leaderName', 'stock_id': 'leader',
                            'realclose': 'leaderPrice', 'updown': 'leaderUpdown'})
    df.set_index('date', drop=True, inplace=True)

    eng = getE()
    df = df.replace({float('inf'): 0})
    pdsql.to_sql(df, unicode(industry), eng, if_exists='replace', chunksize=200)
    setPrimaryKey(industry)
    return
def test_roundtrip(self):
    # this test otherwise fails, Legacy mode still uses 'pandas_index'
    # as default index column label
    sql.to_sql(self.test_frame1, 'test_frame_roundtrip',
               con=self.conn, flavor='sqlite')
    result = sql.read_sql('SELECT * FROM test_frame_roundtrip',
                          con=self.conn, flavor='sqlite')

    # HACK!
    result.index = self.test_frame1.index
    result.set_index('pandas_index', inplace=True)
    result.index.astype(int)
    result.index.name = None
    tm.assert_frame_equal(result, self.test_frame1)
def getLAcityZip(self):
    if self.storage == "remote":
        print("Getting LA City zip code from remote source.")
        # Get all zip codes in Los Angeles City (http://geohub.lacity.org/datasets/875d540d71e64b8696cc368865c2b640_0)
        url = "https://opendata.arcgis.com/datasets/875d540d71e64b8696cc368865c2b640_0.geojson"
        r = requests.get(url)
        data = json.loads(r.text)
        lacity_zip = list()
        for i in range(len(data['features'])):
            try:
                zipcode = data['features'][i]['properties']['ZIPCODE']
                lacity_zip.append((i + 1, zipcode))
            except Exception as e:
                print(e)
                continue
        self.lacity_zip = lacity_zip
        print("Total number of LA City zip codes:", len(lacity_zip))

        # Convert the list into a dataframe
        lacity_zip_df = pd.DataFrame(lacity_zip, columns=['prime_key', 'zipcode'])

        # Insert the dataframe into ReferralCrimeMap.db
        conn = sq.connect("ReferralCrimeMap.db")
        cur = conn.cursor()
        pd_sql.to_sql(lacity_zip_df, 'LAzip', conn, if_exists='replace', index=False)
        cur.execute('SELECT * FROM LAzip')
        print("Inserting LA zip codes into ReferralCrimeMap.db.")
        print(cur.fetchmany(5))
        conn.close()
    else:
        print("Getting LA City zip code from local source.")
        conn = sq.connect("ReferralCrimeMap.db")
        cur = conn.cursor()
        query = '''
        SELECT *
        FROM LAzip
        '''
        try:
            lacity_zip = pd.read_sql(query, conn)
            conn.close()
            print(lacity_zip.head())
        except Exception as e:
            print("There is an error:", e)
            print("Please enter correct data source.")
            exit()
def to_sql(df, table_name, if_exists='append', null=mdb.NULL, db=DB):
    # replace NaNs
    if 'timestamp' not in df.columns:
        df.fillna(null, inplace=True)

    # open connection to db
    db = mdb.connect(user="******", host="localhost", port=3306, db=db, charset='utf8')

    # write to db
    try:
        sql.to_sql(df, table_name, db, 'mysql', if_exists)
    except AttributeError:
        sql.write_frame(df, table_name, db, 'mysql', if_exists)

    # close connection to db
    db.close()
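# The try/except above guards against pandas releases old enough to expose only
# write_frame. The same fallback can be exercised end-to-end against an
# in-memory sqlite3 database; the connection and names below are illustrative
# assumptions with no MySQL dependency.
import sqlite3
import pandas as pd
import pandas.io.sql as sql

def write_compat(df, table_name, con, if_exists='append'):
    # Newer pandas exposes pandas.io.sql.to_sql; very old releases only had write_frame.
    try:
        sql.to_sql(df, table_name, con, if_exists=if_exists, index=False)
    except AttributeError:
        sql.write_frame(df, table_name, con, if_exists=if_exists)

con = sqlite3.connect(':memory:')
write_compat(pd.DataFrame({'a': [1.0, None]}).fillna(0), 'demo', con)
print(pd.read_sql('SELECT * FROM demo', con))
con.close()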
def getData(self):
    if self.storage == "remote":
        print(" * Getting walk score data from remote source.\nThis takes a few minutes! Thanks for being patient.")
        # Get walk score
        walk_score = list()
        for i in self.lacity_zip:
            zipcode = i[1]
            zip_ID = i[0]
            url = "https://www.walkscore.com/CA/Los_Angeles/" + str(zipcode)
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'lxml')
            score = soup.find_all(
                'div',
                {"style": "padding: 0; margin: 0; border: 0; outline: 0; position: absolute; top: 0; bottom: 0; left: 0; right: 0;"})
            walkScoreTag = score[0].find('img')
            walkScore = walkScoreTag.attrs['src'].split('/')[-1].split('.')[0]
            walk_score.append((zipcode, walkScore, zip_ID))

        # Convert walk score list into dataframe
        print("Convert walk score into dataframe")
        walk_score_df = pd.DataFrame(walk_score)
        walk_score_df.columns = ['zip_code', 'walk_score', 'zip_ID']
        print("walk score dataframe")
        print(walk_score_df.head())

        # Insert walk score dataframe into ReferralCrimeMap.db
        print("Inserting walk score dataframe into ReferralCrimeMap.db")
        conn = sq.connect("ReferralCrimeMap.db")
        cur = conn.cursor()
        pd_sql.to_sql(walk_score_df, 'WalkScore', conn, if_exists='replace', index=False)
        cur.fetchmany(5)
        conn.close()
    else:
        print("Getting walk score from local source.")
        conn = sq.connect("ReferralCrimeMap.db")
        cur = conn.cursor()
        query = '''
        SELECT *
        FROM WalkScore
        '''
        try:
            walkScore = pd.read_sql(query, conn)
            conn.close()
            print(walkScore.head())
        except Exception as e:
            print("There is an error:", e)
            print("Please enter correct data source.")
            exit()
def write_mun_db(conn, CSV_MUNICIPIOS):
    print("Cleaning municipio data")
    crime_municipios = CrimeMunicipios(os.path.join('snsp-data', CSV_MUNICIPIOS))

    print("Writing metadata to db")
    # has to be append to preserve the foreign keys
    pd_sql.to_sql(crime_municipios.tipo, 'tipo_municipios', conn,
                  if_exists='append', index=False)
    pd_sql.to_sql(crime_municipios.subtipo, 'subtipo_municipios', conn,
                  if_exists='append', index=False)
    pd_sql.to_sql(crime_municipios.modalidad, 'modalidad_municipios', conn,
                  if_exists='append', index=False)
    pd_sql.to_sql(crime_municipios.bien_juridico, 'bien_juridico_municipios', conn,
                  if_exists='append', index=False)
    pd_sql.to_sql(crime_municipios.municipios, 'municipio_names', conn,
                  if_exists='append', index=False)
    pd_sql.to_sql(crime_municipios.population, 'population_municipios', conn,
                  if_exists='append', index=False, chunksize=20000)

    print("Writing municipio data to db")
    # Temporary fix because the SNSP duplicated (with different values) the data for 2055 TRINIDAD ZAACHILA
    # crime_municipios.data = crime_municipios.data[(crime_municipios.data['state_code'] != 22) & (crime_municipios.data['mun_code'] != 555)]
    pd_sql.to_sql(crime_municipios.data, 'municipios_fuero_comun', conn,
                  if_exists='append', index=False, chunksize=1000000)
    print("End writing municipio data to db")
engine = create_engine(r'mysql+mysqldb://root:123@localhost/employees')
sqldf = psql.read_sql(query1, engine)
print sqldf

engine = create_engine(r'mysql+mysqldb://root:123@localhost/employees')
sqldf = psql.read_sql(query3, engine, chunksize=10)
for i, idf in enumerate(sqldf):
    print i, idf
    if i == 3:
        break

engine = create_engine(r'mysql+mysqldb://root:123@localhost/employees')
df = pd.DataFrame(np.random.random(100).reshape(10, 10), index=range(10), columns=range(10))
print df
psql.to_sql(df, 'test_table', engine)

engine = create_engine(r'mysql+mysqldb://root:123@localhost/employees')
df = pd.DataFrame({'dept_no': 'd013', 'dept_name': 'Cat'}, index=range(1))
print df
df.to_sql('departments', engine, if_exists='append', index=False)

'''META DATA'''
engine = create_engine(r'mysql+mysqldb://root:123@localhost/employees')
metadata = MetaData(engine)
my_table = Table('cat', metadata, autoload=True)
my_table = Table('cat', metadata,
                 Column('age', Integer),
                 Column('name', String(40)),
                 Column('color', String(40)))
my_table.create()

insert = my_table.insert()
insert.execute(age=1, name='mimi', color='yellow')

'''ORM'''
Base = declarative_base()
def write_state_db(conn, CSV_ESTADOS):
    crime_states = CrimeStates(os.path.join('snsp-data', CSV_ESTADOS))

    # has to be append to preserve the foreign keys
    pd_sql.to_sql(crime_states.tipo, 'tipo_states', conn, if_exists='append', index=False)
    pd_sql.to_sql(crime_states.subtipo, 'subtipo_states', conn, if_exists='append', index=False)
    pd_sql.to_sql(crime_states.modalidad, 'modalidad_states', conn, if_exists='append', index=False)
    # pd_sql.to_sql(crime_states.state_codes, 'state_names', conn, if_exists='append', index=False)
    pd_sql.to_sql(crime_states.bien_juridico, 'bien_juridico_states', conn, if_exists='append', index=False)
    pd_sql.to_sql(crime_states.population, 'population_states', conn, if_exists='append',
                  index=False, chunksize=20000)
    pd_sql.to_sql(crime_states.data, 'estados_fuero_comun', conn, if_exists='append',
                  index=False, chunksize=20000)