def get_brand_sku(category_level3_id, crawl_id):
    conn = pymysql.connect(host='127.0.0.1', user='******', password='******',
                           db='customer', charset='utf8')
    # All brand-sku pairs in sku_jd
    sku_all = []
    sql_all = 'select concat(brand_id, "-", sku_group) as sku'\
              ' from sku_jd where crawl_id=(select crawl_id from sku_jd'\
              ' order by id desc limit 1) and category_level3_id="%s"'\
              % category_level3_id  # based on the latest crawl_id
    records_all = pd.read_sql(sql_all, conn)
    records_all = records_all.drop_duplicates('sku')
    for i in records_all['sku']:
        sku_all.append(i)
    # Brand-sku pairs already crawled into comment_count_jd
    sku_crawled = []
    sql_crawled = 'select concat(brand_id, "-", sku_group) as sku'\
                  ' from comment_count_jd where crawl_id=%s'\
                  ' and category_level3_id="%s"' % (crawl_id, category_level3_id)
    records_crawled = pd.read_sql(sql_crawled, conn)
    records_crawled = records_crawled.drop_duplicates('sku')
    for i in records_crawled['sku']:
        sku_crawled.append(i)
    # SKUs not yet crawled
    sku_task = [i for i in sku_all if i not in sku_crawled]
    sku_task.sort()
    n = len(sku_task)
    conn.close()
    return sku_task, n
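# A minimal usage sketch for get_brand_sku, assuming hypothetical id values
# (the '670' and 42 below are made up, not from the original code); the
# returned list would drive a crawler loop.
sku_task, n = get_brand_sku(category_level3_id='670', crawl_id=42)
print('remaining SKUs to crawl:', n)
for sku in sku_task:
    brand_id, sku_group = sku.split('-', 1)
    # crawl_comment_count(brand_id, sku_group)  # hypothetical downstream step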
def get_labels_for_ids(ids, start_date, end_date):
    qinvest = ("SELECT newid, count(adverse_by_ourdef) from {} "
               "WHERE dateoccured >= '{}'::date "
               "AND dateoccured <= '{}'::date "
               "AND newid in ({}) "
               "group by newid "
               ).format(config["si_table"], start_date, end_date,
                        format_officer_ids(ids))
    qadverse = ("SELECT newid, count(adverse_by_ourdef) from {} "
                "WHERE adverse_by_ourdef = 1 "
                "AND dateoccured >= '{}'::date "
                "AND dateoccured <= '{}'::date "
                "AND newid in ({}) "
                "group by newid "
                ).format(config["si_table"], start_date, end_date,
                         format_officer_ids(ids))
    invest = pd.read_sql(qinvest, con=con)
    adverse = pd.read_sql(qadverse, con=con)
    adverse["adverse_by_ourdef"] = 1
    adverse = adverse.drop(["count"], axis=1)
    invest = invest.drop(["count"], axis=1)
    outcomes = adverse.merge(invest, how='outer', on='newid')
    outcomes = outcomes.fillna(0)
    return outcomes
def mf_lookup(find_str, item):
    """Given a name or security code, look up the ManagerID."""
    cnxn_jrgcb = pyodbc.connect("""
        DRIVER={SQL Server};
        SERVER=172.16.7.166;
        DATABASE=jrgcb;
        UID=sa;
        PWD=sa123456""")
    if item == 'Name':
        sql_mf = """
            SELECT DISTINCT ManagerID
            FROM [jrgcb].[dbo].[FundAndManagerData_v2]
            WHERE [Name] = '""" + find_str + """'
            """
        return pd.read_sql(sql_mf, cnxn_jrgcb)
    if item == 'SecuCode':
        sql_mf = """
            SELECT DISTINCT ManagerID
            FROM [jrgcb].[dbo].[FundAndManagerData_v2]
            WHERE [SecuCode] = '""" + find_str + """'
            """
        return pd.read_sql(sql_mf, cnxn_jrgcb)
def search_posts(phrase, engine):
    lemmatizer = WordNetLemmatizer()
    words = ["(^|[^a-z])" + lemmatizer.lemmatize(word)
             for word in word_tokenize(phrase)
             if word not in stopwords.words('english') and len(word) >= 3]
    if len(words) == 0:
        return None
    params = {'phrase': "|".join(words)}
    query = ["SELECT link_id, url, title FROM threads",
             "WHERE title_lower ~ %(phrase)s"]
    found = pd.read_sql(" ".join(query), engine, params=params)
    if len(found['link_id']) == 0:
        return None
    link_ids = ', '.join(found['link_id'].apply(lambda lid: "'" + lid + "'"))
    query = ["SELECT clean_body as body, affil, link_id FROM cleaned",
             "WHERE link_id IN (" + link_ids + ")"]
    data = pd.read_sql(" ".join(query), engine)
    valid = data[data['body'].apply(
        lambda text: len(text.split()) >= 10
        and not bool(re.search("[^a-z]bot[^a-z]", text)))]
    if valid.shape[0] < 60:
        return None
    return valid, found.set_index('link_id')
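# A hedged alternative for the second query above: with a psycopg2-backed
# engine (as the %(phrase)s style already suggests), a Python tuple bound to
# %(ids)s expands into an IN list, avoiding hand-quoting of link_ids. This is
# a sketch assuming the psycopg2 driver and the same pandas behavior as the
# surrounding code.
data = pd.read_sql(
    "SELECT clean_body as body, affil, link_id FROM cleaned "
    "WHERE link_id IN %(ids)s",
    engine,
    params={'ids': tuple(found['link_id'])})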
def show_products():
    cur, conn = mysql_connect("leo_markt")
    selected_category = request.form.get("category_to_show")
    if selected_category == "":
        flash("Please select a category", "msg")
    if (request.method == "POST") and (request.form['add'] == "show_products"):
        df = pd.read_sql("""SELECT * FROM products WHERE category = '%s'"""
                         % selected_category, con=conn)
    elif (request.method == "POST") and (request.form['add'] == "refresh"):
        print("refresh pushed")
        df = pd.read_sql("""SELECT * FROM products""", con=conn)
    else:
        df = pd.read_sql("""SELECT * FROM products""", con=conn)
    df = df.set_index('id')
    pr_id = df.index.tolist()
    pr_names = df.name.tolist()
    # pr_descriptions = df.description.tolist()
    pr_prices = df.price.tolist()
    # change format from 1,000.00 to 1.000,00
    pr_prices = ['{:,.2f} €'.format(i).replace(",", "X").replace(".", ",").replace("X", ".")
                 for i in pr_prices]
    pr_prices = [i.decode('utf-8') for i in pr_prices]  # euro symbol not in ASCII
    pr_availability = df.availability.tolist()
    pr_categories = df.category.tolist()
    products_zip = zip(pr_id, pr_names, pr_prices, pr_availability, pr_categories)
    cur.close()
    conn.close()
    return products_zip
def __get_neighbor_movie__(self, candidate_id, viewed):
    """
    Get the most similar movies with respect to candidate_id
    :return: array of movie id(s)
    """
    current_movie = pd.read_sql("movies", self.engine).query(
        "id_movie == " + str(candidate_id)).as_matrix()
    movies = pd.read_sql("movies", self.engine).as_matrix()
    candidate = []
    # max = 0
    for i in range(len(movies)):
        simRate = VideoBasedRecommendation.__check_similarity__(
            self, current_movie[0], movies[i])
        candidate.append([movies[i][0], simRate])  # movie id and similarity
        # if (simRate > max): max = simRate
    # sort candidates by similarity
    VideoBasedRecommendation.quick_sort(self, 0, len(candidate) - 1, candidate)
    # print(candidate)
    # select the best score
    out = []
    i = 0
    prev = candidate[0][1]
    while i < len(candidate):
        if prev != candidate[i][1]:
            break
        if candidate[i][0] not in viewed:
            out.append(candidate[i][0])
        prev = candidate[i][1]
        i += 1
    random.shuffle(out)
    return out[0:5]
def get_pitchab_for_pitcher(pitcher_name, con, reg=True):
    """
    Get everything from pitch and atbat for a specific pitcher,
    merge on gameday_link + num

    usage: get_pitchab_for_pitcher(pitcher_name, con, reg=True)

    set "reg=False" to get spring training, all-star, post-season games
    """
    atbat_sql = """select * from atbat where pitcher_name = "%s" """ % pitcher_name
    pitch_sql = (
        """select * from pitch where gameday_link in
        (select gameday_link from atbat where pitcher_name = "%s") """
        % pitcher_name
    )
    atbat = pd.read_sql(atbat_sql, con)
    pitch = pd.read_sql(pitch_sql, con)
    pitchab = pitch.merge(atbat, on=["gameday_link", "num"])
    pitchab.dropna(subset=["px"], inplace=True)
    if reg:
        game_sql = """select gameday_link from game where game_type="R" """
        reg_gdls_df = pd.read_sql(game_sql, con)
        reg_gdls = ["gid_%s" % x for x in reg_gdls_df["gameday_link"].values]
        pitchab = pitchab[pitchab["gameday_link"].isin(reg_gdls)]
    for param in ("break_angle", "break_length", "break_y"):
        pitchab[param] = pd.to_numeric(pitchab[param])
    return pitchab
def comp(database, metadata, site_info, hole_info):
    engine = create_engine(database)
    # Load metadata
    sql = "SELECT * FROM {};".format(metadata)
    metadata = pd.read_sql(sql, engine)
    # Load site data
    sql = "SELECT * FROM {};".format(site_info)
    sitedata = pd.read_sql(sql, engine)
    # Load hole data
    sql = "SELECT * FROM {};".format(hole_info)
    holedata = pd.read_sql(sql, engine)
    # Group and average hole data for sites
    hole_grouped = holedata.loc[:, ('site_key', 'lat', 'lon', 'water_depth',
                                    'total_penetration', 'etopo1_depth',
                                    'surface_porosity', 'sed_thickness',
                                    'crustal_age', 'coast_distance',
                                    'ridge_distance', 'seamount',
                                    'surface_productivity', 'toc', 'opal',
                                    'caco3', 'woa_temp', 'woa_salinity',
                                    'woa_o2', 'lith1', 'lith2', 'lith3',
                                    'lith4', 'lith5', 'lith6', 'lith7',
                                    'lith8', 'lith9', 'lith10', 'lith11',
                                    'lith12', 'lith13'
                                    )].groupby("site_key").mean().reset_index()
    # Combine all tables
    site_meta_data = pd.merge(metadata, sitedata, how='outer',
                              on=('site_key', 'leg', 'site'))
    data = pd.merge(site_meta_data, hole_grouped, how='outer',
                    on=('site_key')).fillna(np.nan)
    site_metadata = data.dropna(subset=['interface_flux']).reset_index(drop=True)
    return site_metadata
def dedup_table():
    # http://stackoverflow.com/a/7745635/424631
    dbname = 'sqlite+pysqlite:////home/aahu/Dropbox/black-market-recommender-systems/data/bmrs.db'
    conn = sqlalchemy.create_engine(dbname, module=sqlite3.dbapi2)
    init_size = pd.read_sql('SELECT COUNT(*) FROM bmrs;', conn)
    logger.info('initial size: {}'.format(init_size))
    logger.info('batch scrapes together...')
    q = """
        SELECT d1.*
        FROM bmrs d1
        LEFT OUTER JOIN bmrs d2 ON (d1.listing = d2.listing
            AND d1.vendor = d2.vendor
            AND d1.marketplace = d2.marketplace
            AND d1.category = d2.category
            AND d1.cat_tuple = d2.cat_tuple
            AND d1.ships_from = d2.ships_from
            AND d1.ships_to = d2.ships_to
            AND d1.scrape_date < d2.scrape_date)
        WHERE d2.listing IS NULL
            AND d2.vendor IS NULL
            AND d2.marketplace IS NULL
            AND d2.category IS NULL
            AND d2.cat_tuple IS NULL
            AND d2.ships_from IS NULL
            AND d2.ships_to IS NULL;
        """
    df = pd.read_sql(q, conn)
    df = df.drop_duplicates()
    print(df)
    logger.info('shape now: {}'.format(df.shape))
    logger.info('overwriting old table...')
    df.to_sql('bmrs', conn, index=False, if_exists='replace')
    return
def positions(self, algo_id=None, json=True):
    if algo_id is not None:
        algo_id = algo_id.replace('/', '')
    trades_query = "SELECT * FROM trades WHERE exit_time IS NULL"
    if algo_id is not None:
        trades_query += " AND algo='" + algo_id + "'"
    trades = pd.read_sql(trades_query, self.dbconn)
    last_query = ("SELECT s.id, s.symbol, max(t.last) as last_price "
                  "FROM ticks t LEFT JOIN symbols s ON (s.id=t.symbol_id) "
                  "GROUP BY s.id")
    last_prices = pd.read_sql(last_query, self.dbconn)
    trades = trades.merge(last_prices, on=['symbol'])
    trades['unrealized_pnl'] = np.where(
        trades['direction'] == "SHORT",
        trades['entry_price'] - trades['last_price'],
        trades['last_price'] - trades['entry_price'])
    trades['slippage'] = abs(trades['entry_price'] - trades['market_price'])
    trades['slippage'] = np.where(
        ((trades['direction'] == "LONG") &
         (trades['entry_price'] > trades['market_price'])) |
        ((trades['direction'] == "SHORT") &
         (trades['entry_price'] < trades['market_price'])),
        -trades['slippage'], trades['slippage'])
    trades = trades.sort_values(['entry_time'], ascending=[False])
    trades = trades.to_dict(orient="records")
    if json:
        return jsonify(trades)
    else:
        return trades
def dominant_set_topic_rank():
    # dominant_set
    conn = sqlite3.connect("zhihu.db")
    following_data = pd.read_sql(
        'select user_url, followee_url from Following '
        'where followee_url in (select user_url from User where agree_num > 50000) '
        'and user_url in (select user_url from User where agree_num > 50000)', conn)
    # following_data = pd.read_sql('select user_url, followee_url from Following where followee_url in (select user_url from User where agree_num > 10000) and user_url in (select user_url from User where agree_num > 10000)', conn)
    G = nx.DiGraph()
    for d in following_data.iterrows():
        G.add_edge(d[1][0], d[1][1])
    dominant_set = nx.dominating_set(G)
    print 'user number in dominant set:', len(dominant_set)
    # topics answered by users in dominant_set
    user_topic_data = pd.read_sql('select user_url, topic from UserTopic', conn)
    topicdict = defaultdict(int)
    i = 0  # counter
    for row in user_topic_data.iterrows():
        user_url = row[1][0]
        topic = row[1][1]
        if user_url in dominant_set:
            topicdict[topic] += 1
        i += 1
        # if i % 100 == 0:
        #     print i
    conn.close()
    topicsorted = sorted(topicdict.items(), key=lambda x: x[1], reverse=True)
    # topic top 100
    for t in topicsorted[:100]:
        print t[0], t[1]
def _calculate_factor_value_by_pd(self, stkcode, store):
    """"""
    msg = "Processing stock {}".format(stkcode)
    self.log.info(msg)
    raw_cur = self.raw_conn.cursor()
    sql = """
        select Date,ClosePrice,AFloatShare,TotalShare
        from market_data_a_share2
        where StkCode='{}' and Date>='{}'
        """.format(stkcode, self.start_date)
    df_mkt = pd.read_sql(sql, self.raw_conn, index_col='Date')
    prc_cur = self.prc_conn.cursor()
    sql = """
        select * from financial_data
        where StkCode='{}' and Date>='{}'
        """.format(stkcode, self.start_date)
    df_fin = pd.read_sql(sql, self.prc_conn, index_col='Date')
    data = pd.concat([df_mkt, df_fin], axis=1)
    data = data.fillna(method='ffill')
    vals = []
    for name in self.fct_name:
        _val = eval(self.fct_algos[name])
        vals.append(_val.to_frame(name))
    result = pd.concat(vals, axis=1)
    # store['val_'+stkcode] = result
    return result
def retrieve_best_hotels(city, state=''):
    """PURPOSE: To retrieve the top dog-friendly hotels for a given city."""
    engine = cadb.connect_aws_db(write_unicode=True)
    conn = engine.connect()
    cmd = 'SELECT * FROM yelp_reviews'
    yelp_reviews = pd.read_sql(cmd, engine)
    cmd = 'SELECT * FROM yelp_hotels'
    yelp_hotels = pd.read_sql(cmd, engine)
    yelp = pd.merge(yelp_hotels, yelp_reviews, on='business_id', how='inner')
    yelp_city = yelp[yelp['hotel_city'] == city.strip()]
    yelp_dog_review = yelp_city[yelp_city['review_text'].str.contains('dog')].copy().reset_index()
    average_dog_ratings = [
        np.mean(yelp_dog_review[yelp_dog_review['hotel_id'] == hotel_id]['review_rating'].values)
        for hotel_id in np.unique(yelp_dog_review['hotel_id'])]
    unique_hotels = yelp_dog_review[yelp_dog_review['hotel_id'].isin(
        np.unique(yelp_dog_review['hotel_id']))].copy()
    unique_hotels.drop_duplicates(cols='hotel_id', inplace=True)
    unique_hotels['average_rating'] = average_dog_ratings
    best_dog_hotel_names = unique_hotels.sort(
        columns='average_rating', ascending=False)['hotel_name'].head(10).values
    best_dog_hotel_ratings = np.round(unique_hotels.sort(
        columns='average_rating', ascending=False)['average_rating'].head(10).values, 1)
    string_ratings = [str(rat) for rat in best_dog_hotel_ratings]
    # print('best dog hotels:')
    # print(best_dog_hotel_names)
    return best_dog_hotel_names, string_ratings
def get_data(self):
    DB = sqlite3.connect(self.DBname)
    if self.permissionLevel == 'admin':
        results = pd.read_sql("SELECT * from results", DB)
    else:
        ID_query = "SELECT staff_code from staff where username = ?"
        staffID = pd.read_sql(ID_query, DB, params=[self.username])
        set_query = "SELECT teaching_set from staffing where staff_code = ?"
        setlist = pd.read_sql(set_query, DB, params=[staffID['staff_code'][0]])
        sets = setlist['teaching_set'].tolist()
        results_query = (
            "SELECT * from results where "
            + " or ".join(("teaching_set = " + "'" + str(n) + "' " for n in sets)))
        results = pd.read_sql(results_query, DB)
    assessments = pd.read_sql("SELECT * from assessments", DB)
    merged = pd.merge(results, assessments, how='left',
                      left_on=['aID', 'qNum'], right_on=['aID', 'qNum'])
    merged = merged.drop(['aName_y', 'qTitle', 'aName_x', 'course',
                          'course_ID', 'module_ID'], axis=1)
    cols = ['aID', 'qNum', 'UPN', 'qModule', 'qTopic', 'pMark', 'qMark', 'teaching_set']
    df = merged[cols]
    return df
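# A sketch of the same teaching-set filter using sqlite placeholders instead
# of string concatenation: one "?" is generated per set (assumes sets is
# non-empty, as in the non-admin branch above).
placeholders = ",".join("?" * len(sets))
results = pd.read_sql(
    "SELECT * from results where teaching_set in ({})".format(placeholders),
    DB, params=sets)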
def __init__(self, name, tag, thresh, category, db_path, db_lock):
    Filter.__init__(self, name, db_path, db_lock)
    self.tag = tag
    self.thresh = thresh
    self.category = category
    assert self.category == "track" or self.category == "album"
    self.lock.acquire()
    if self.category == "track":
        self.IDs = pd.read_sql(
            "SELECT ID_track.trackID "
            "FROM tag_artist LEFT JOIN ID_tag ON ID_tag.tagID=tag_artist.tagID "
            "LEFT JOIN ID_track ON ID_track.artistID=tag_artist.artistID "
            "WHERE ID_tag.tagName=? AND tag_artist.count>=?",
            self.con,
            params=(self.tag, self.thresh),
        )
        self.IDs.columns = ["trackID"]
    if self.category == "album":
        self.IDs = pd.read_sql(
            "SELECT ID_album.albumID "
            "FROM tag_artist LEFT JOIN ID_tag ON ID_tag.tagID=tag_artist.tagID "
            "LEFT JOIN ID_album ON ID_album.artistID=tag_artist.artistID "
            "WHERE ID_tag.tagName=? AND tag_artist.count>=?",
            self.con,
            params=(self.tag, self.thresh),
        )
        self.IDs.columns = ["albumID"]
    self.lock.release()
def __init__(self, name, db_path, db_lock, minim=0,
             maxim=time.strftime("%Y", time.localtime()), include_undated=False):
    Filter.__init__(self, name, db_path, db_lock)
    self.minim = minim
    self.maxim = maxim
    if maxim and minim:  # bug fix: was "maxim and min", which tested the builtin
        assert self.minim < self.maxim
    self.lock.acquire()
    if not include_undated:
        self.IDs = pd.read_sql(
            "SELECT albumID "
            "FROM date_album "
            "WHERE date_album.date_int>=? AND date_album.date_int<=?",
            self.con,
            params=(self.minim, self.maxim),
        )
    if include_undated:
        self.IDs = pd.read_sql(
            "SELECT ID_album.albumID "
            "FROM ID_album "
            "LEFT JOIN date_album ON date_album.albumID=ID_album.albumID "
            "WHERE date_album.date_int>=? AND date_album.date_int<=? "
            "OR date_album.date_int ISNULL",
            self.con,
            params=(self.minim, self.maxim),
        )
    self.lock.release()
    self.IDs.columns = ["albumID"]
def load_market_cap(symbol_list, query_date='2015-01-04',
                    source="PASS / MARKET DATA",
                    transform_to_weights=False, fix_symbols=False):
    """
    This function modifies symbol_list if this list is not sorted.
    """
    if not isinstance(query_date, str):
        if isinstance(query_date, datetime.date):
            query_date = str(query_date)
        else:
            query_date = str(query_date.date())
    if fix_symbols:
        symbol_list.sort()
        symbol_list = [symbol.replace('=', '/') for symbol in symbol_list]
    db = make_db_connection()
    query_symbols = ' OR '.join(["ST_SECURITY_CODE='%s'" % (symbol)
                                 for symbol in symbol_list])
    query_hdpks = ("select ST_SECURITY_CODE, HD_PK from PASS_SYS.V_SERIE "
                   "where (%s) and ST_NAME='%s'" % (query_symbols, source))
    df_hdpks = pd.read_sql(query_hdpks, db, index_col='ST_SECURITY_CODE')
    df_hdpks['HD_PK'] = df_hdpks['HD_PK'].apply(lambda key: key.encode('hex'))
    query_lkseries = ' OR '.join(["LK_SERIE=unhex('%s')" % (hdpk)
                                  for hdpk in df_hdpks.values.ravel().tolist()])
    query_mkt_cap = ("select A.DT_DATE, B.ST_SECURITY_CODE, A.NU_CUR_MKT_CAP "
                     "from PASS_SYS.V_MKTDATA as A "
                     "LEFT JOIN PASS_SYS.V_SERIE as B on A.LK_SERIE=B.HD_PK "
                     "where (%s) and A.DT_DATE<='%s' "
                     "ORDER BY A.DT_DATE DESC LIMIT %d"
                     % (query_lkseries, query_date, len(symbol_list) * 10))
    mkt_caps = pd.read_sql(query_mkt_cap, db, index_col='ST_SECURITY_CODE')
    mkt_caps = mkt_caps.groupby(axis=0, level=0).apply(
        lambda df: df.bfill()['NU_CUR_MKT_CAP'].values[0])
    db.close()
    mkt_caps.sort_index(inplace=True)
    if fix_symbols:  # unfix them
        mkt_caps.index = symbol_list
    if transform_to_weights:
        mkt_caps = mkt_caps / mkt_caps.sum(skipna=True)
    return mkt_caps.to_frame('NU_CUR_MKT_CAP')
def test_against_popcycle(tmpdir):
    # Generate popcycle results
    popcycledir = tmpdir.join("popcycle")
    popcycle_cmd = "Rscript tests/generate_popcycle_results.R tests {}".format(str(popcycledir))
    subprocess.check_call(popcycle_cmd.split())

    # Generate seaflowpy results
    dbfile = str(tmpdir.join("testcruise.db"))
    shutil.copyfile("tests/testcruise_paramsonly.db", dbfile)
    os.chmod(dbfile, 0664)  # make the db writeable
    evt_files = sfp.evt.find_evt_files("tests/testcruise_evt")
    filt_opts = {
        "notch1": None, "notch2": None, "offset": 0.0,
        "origin": None, "width": 0.5
    }
    sfp.filterevt.filter_evt_files(
        evt_files, "testcruise", filt_opts, dbfile, str(tmpdir.join("opp")))
    opp_files = sfp.evt.find_evt_files(str(tmpdir.join("opp")))

    # Compare opp/vct table output
    with sqlite3.connect(dbfile) as con_py:
        opp_py = pd.read_sql("SELECT * FROM opp ORDER BY file", con_py)
    with sqlite3.connect(str(popcycledir.join("testcruise.db"))) as con_R:
        opp_R = pd.read_sql("SELECT * FROM opp ORDER BY file", con_R)
    columns = ["opp_count", "evt_count", "opp_evt_ratio", "notch1", "notch2",
               "offset", "origin", "width"]
    npt.assert_allclose(opp_py[columns], opp_R[columns])
    assert "\n".join(opp_py["file"].values) == "\n".join(opp_R["file"].values)

    # Compare OPP file output
    opps_py = [sfp.evt.EVT(o) for o in sfp.evt.find_evt_files(str(tmpdir.join("opp")))]
    opps_R = [sfp.evt.EVT(o) for o in sfp.evt.find_evt_files(str(popcycledir.join("opp")))]
    assert len(opps_py) == len(opps_R)
    for i in range(len(opps_py)):
        npt.assert_array_equal(opps_py[i].df, opps_R[i].df)
def get_fix(self):
    """
    Get the fix DataFrame.
    :return:
    :rtype: pd.DataFrame
    """
    self.open()
    part1 = pd.read_sql('select * from %s' % self.table_name_fix_part1, self.connection)
    part2 = pd.read_sql('select * from %s' % self.table_name_fix_part2, self.connection)
    part3 = pd.read_sql('select * from %s' % self.table_name_fix_part3, self.connection)
    res_data = part1.merge(part2, how='left', left_on='index', right_on='index')
    res_data = res_data.merge(part3, how='left', left_on='index', right_on='index')
    self.close()
    # The data format still has a small issue and needs a fix:
    # move the date column into the index
    date_list = res_data['date']
    del res_data['date']
    del res_data['index']
    res_data.index = date_list
    return res_data
def main_contents():
    print u'This is a truly wonderful system'
    print u'It releases you from your dull daily routine and leads you to new frontiers'
    yomiuri_data = pd.read_sql("SELECT * FROM item ORDER BY DateLine DESC LIMIT 10;", conn)
    print u'----------------------------------------------------------'
    print u'Go ahead and pick a news item that interests you\n'
    data = yomiuri_data
    for i in range(5):
        print data['HeadLine']
    print '> ',
    selected = int(sys.stdin.readline())
    print u'----------------------------------------------------------'
    print data.ix[selected, 'HeadLine'], '\n\n'
    print data.ix[selected, 'article'], '\n\n'
    print u'----------------------------------------------------------'
    print u'Items that look unrelated to this article', '\n'
    no_related_genre = predict_min(yomiuri_data.ix[selected, :])[0]
    print no_related_genre
    data = pd.read_sql(u"SELECT * FROM item WHERE Genre1 = '{}' ORDER BY DateLine DESC LIMIT 10;".format(no_related_genre), conn)
    print u'\n\n_人人人人人人_'
    print u'> Get back to work <'
    print u' ̄Y^Y^Y^Y^Y^Y^ ̄\n\n'
def output():
    """
    function: calculates answer and renders results
    """
    # reads in pace and activities from options
    mpace_hr = float(request.args.get('mpace'))
    day1 = request.args.get('day1')
    day2 = request.args.get('day2')
    day3 = request.args.get('day3')
    user_pattern = get_pattern([day3, day2, day1])

    # loads data from database
    with db:
        cur = db.cursor()
        # list of runners
        good_runners = pd.read_sql("SELECT * FROM good_runners;", db)
        # markov transition probabilities
        prob_table = pd.read_sql("SELECT * FROM act_prob", db)

    # daily metrics for runners
    days = good_runners['days_to_marathon']
    diffs = good_runners['run_difficulty'].astype(int)
    stresses = good_runners['run_stress']

    difficulty = get_difficulty(prob_table, user_pattern)
    day_class = get_class(difficulty)

    with db:
        # get pdf and ints tables for stress and intensity based on difficulty
        if difficulty == 0:
            pdf = [0]
            ints = [0]
        elif difficulty == 1:
            pdf = pd.read_sql("SELECT * FROM easy_pdf", db)['0'].tolist()
            ints = pd.read_sql("SELECT * FROM easy_ints", db)['0'].tolist()
        elif difficulty == 2:
            pdf = pd.read_sql("SELECT * FROM mod_pdf", db)['0'].tolist()
            ints = pd.read_sql("SELECT * FROM mod_ints", db)['0'].tolist()
        elif difficulty == 3:
            pdf = pd.read_sql("SELECT * FROM hard_pdf", db)['0'].tolist()
            ints = pd.read_sql("SELECT * FROM hard_ints", db)['0'].tolist()
        else:
            pdf = pd.read_sql("SELECT * FROM epic_pdf", db)['0'].tolist()
            ints = pd.read_sql("SELECT * FROM epic_ints", db)['0'].tolist()

    # calculate stress and intensity
    stress = get_stress(pdf)
    intensity = get_intensity(ints)

    # information to be displayed for the athlete
    todays_run = get_today(mpace_hr, intensity, stress)
    display = get_display(difficulty, todays_run)
    dist = display[0]
    pace = display[1]
    return render_template("output.html", diff=difficulty, stress=stress,
                           dist=dist, pace=pace, day=day_class)
def __init__(self):
    engine_statement = ("mysql+pymysql://" + self.usr + ":" + self.password
                        + "@" + self.hostname + "/" + self.dbName)
    self.engine = sa.create_engine(engine_statement)
    self.user_behaviours = pd.read_sql("user_behaviours", self.engine).as_matrix()
    self.movies = pd.read_sql("movies", self.engine)
    print("Genre Ctor")
    return
def total_added():
    # connect and retrieve information from databases
    PROJECT_ROOT = os.path.dirname(os.path.realpath(__file__))
    DATABASE = os.path.join(PROJECT_ROOT, 'data', 'HG.db')
    conn = sqlite3.connect(DATABASE)
    c = conn.cursor()
    data = pd.read_sql('SELECT * FROM stats', conn)
    data2 = pd.read_sql('SELECT * FROM allmedia', conn)

    followers_added_per_day = []
    day = []
    length = len(data['Followers']) - 1
    i = 0
    while i < 1000:
        x = data['Followers'][length - i] - data['Followers'][length - i - 48]
        followers_added_per_day.append(x)
        y = "{0}-{1}".format(data['Day'][length - i - 48], data['Mon'][length - i - 48])
        day.append(y)
        i = i + 48
    length = len(followers_added_per_day)
    return followers_added_per_day, day, length

    # NOTE: the code below is unreachable because of the return above
    followers_past_day = data['Followers'][length] - data['Followers'][length - 48]
    followers_past_week = data['Followers'][length] - data['Followers'][length - 48 * 7]
    followers_past_month = data['Followers'][length] - data['Followers'][length - 48 * 30]
    return followers_past_day, followers_past_week, followers_past_month
def Followers_per_hour():
    # connect and retrieve information from databases
    PROJECT_ROOT = os.path.dirname(os.path.realpath(__file__))
    DATABASE = os.path.join(PROJECT_ROOT, 'data', 'HG.db')
    conn = sqlite3.connect(DATABASE)
    c = conn.cursor()
    data = pd.read_sql('SELECT * FROM stats ORDER BY id ASC', conn)
    data2 = pd.read_sql('SELECT * FROM allmedia', conn)

    # create the column of difference in followers per row
    Followers_per_hour = (data['Followers'] - data['Followers'].shift(2)).tolist()
    length = len(Followers_per_hour)
    Followers = data['Followers'].tolist()
    Positive_Followers = len([i for i in Followers_per_hour if i > 0])
    Neutral_Followers = len([i for i in Followers_per_hour if i == 0])
    Negative_Followers = len([i for i in Followers_per_hour if i < 0])
    Positive_vs_Negative = (Positive_Followers, Neutral_Followers, Negative_Followers)
    post_occurs = (data['Posts'] - data['Posts'].shift(1)).tolist()
    Time_of_Post = (data2['Time_of_Post']).tolist()
    Likes = (data2['Likes']).tolist()
    yaxis = max(Likes)
    length_likes = len(Likes)
    conn.close()
    return (Followers_per_hour, length, Followers, Positive_vs_Negative,
            Time_of_Post, Likes, length_likes, yaxis)
def sort(self, sort_by, con, asc=True):
    # sort_by == list of strings with column names
    # asc == corresponding list of booleans, True for ascending sorting (default)
    self.lock.acquire()
    if self.category == 'track':
        df = pd.read_sql('SELECT * FROM view_temp_track', con)
    if self.category == 'album':
        df = pd.read_sql('SELECT * FROM view_temp_album', con)
    self.lock.release()
    df_sorted = df.sort_values(by=sort_by, ascending=asc)
    self.lock.acquire()
    # Writing tables with pandas leaves a stray level_0 column in the table
    # when if_exists='replace' (this stems from trying to write the index;
    # it works if index is set to False).
    if self.category == 'track':
        df_sorted.to_sql('view_temp_track', con, if_exists='replace', index=False)
    if self.category == 'album':
        df_sorted.to_sql('view_temp_album', con, if_exists='replace', index=False)
    con.commit()
    self.lock.release()
def getGroupNormsWithZerosAsDF(self, groups=[], where='', pivot=False, sparse=False):
    """returns a dict of (group_id => feature => group_norm)"""
    # default index is on group_id and feat
    index = ['group_id', 'feat']
    db_eng = mif.get_db_engine(self.corpdb)
    sql = """SELECT group_id, feat, group_norm from %s""" % (self.featureTable)
    if groups:
        gCond = " group_id in ('%s')" % "','".join(str(g) for g in groups)
        if where:
            sql += ' WHERE ' + where + " AND " + gCond
        else:
            sql += ' WHERE ' + gCond
    elif where:
        sql += ' WHERE ' + where
    if pivot:
        if sparse:
            return pd.read_sql(sql=sql, con=db_eng, index_col=index).unstack().to_sparse().fillna(value=0)
        else:
            return pd.read_sql(sql=sql, con=db_eng, index_col=index).unstack().fillna(value=0)
    else:
        # this method won't work if the default index is changed
        df = pd.read_sql(sql=sql, con=db_eng, index_col=index)
        idx = pd.MultiIndex.from_product(
            [df.index.levels[0], df.index.levels[1]], names=df.index.names)
        if sparse:
            return df.reindex(idx).to_sparse().fillna(value=0)
        else:
            return df.reindex(idx).fillna(value=0)
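# A small self-contained illustration of the reindex trick used above (toy
# data, not from the original corpus): a MultiIndex built with from_product
# enumerates every (group_id, feat) pair, so reindex + fillna(0) inserts
# explicit zeros for combinations missing from the query result.
import pandas as pd

df = pd.DataFrame(
    {'group_norm': [0.5, 0.25, 0.75]},
    index=pd.MultiIndex.from_tuples(
        [(1, 'a'), (1, 'b'), (2, 'a')], names=['group_id', 'feat']))
idx = pd.MultiIndex.from_product(
    [df.index.levels[0], df.index.levels[1]], names=df.index.names)
print(df.reindex(idx).fillna(value=0))  # (2, 'b') appears with group_norm 0.0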
def add_class_tag(df, df_neutral):
    troll_tags = []
    for name in df['name']:
        # note: the query result is not used; the loop only tags each row
        df_count = pd.read_sql("""
            SELECT body, name, author, score
            FROM May2015
            WHERE name == '{}'
            """.format(name), sql_conn)
        troll_tags.append(1)
    for name in df_neutral['name']:
        df_count = pd.read_sql("""
            SELECT body, name, author, score
            FROM May2015
            WHERE name == '{}'
            """.format(name), sql_conn)
        troll_tags.append(0)
    df_all = df.append(df_neutral, ignore_index=True)
    df_all['Class'] = troll_tags
    df_all.to_csv('test.csv', encoding='utf-8')
    return df_all
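# A hedged parameterized variant of the per-name query above: sqlite3 "?"
# placeholders bind the value instead of formatting it into the SQL string,
# which also handles names containing quotes. Usable inside either loop.
df_count = pd.read_sql(
    "SELECT body, name, author, score FROM May2015 WHERE name == ?",
    sql_conn, params=(name,))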
def Followers_per_hour():
    db = DatabaseCreate(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data'), 'HG.db')
    data = pd.read_sql('SELECT * FROM stats ORDER BY id ASC', db.conn)
    data2 = pd.read_sql('SELECT * FROM allmedia', db.conn)

    # create the column of difference in followers per row
    Followers_per_hour = (data['Followers'] - data['Followers'].shift(2)).tolist()
    length = len(Followers_per_hour)
    Followers = data['Followers'].tolist()
    Positive_Followers = len([i for i in Followers_per_hour if i > 0])
    Neutral_Followers = len([i for i in Followers_per_hour if i == 0])
    Negative_Followers = len([i for i in Followers_per_hour if i < 0])
    Positive_vs_Negative = (Positive_Followers, Neutral_Followers, Negative_Followers)
    post_occurs = (data['Posts'] - data['Posts'].shift(1)).tolist()
    Time_of_Post = (data2['Time_of_Post']).tolist()
    Likes = (data2['Likes']).tolist()
    yaxis = max(Likes)
    length_likes = len(Likes)
    db.conn.close()
    return (Followers_per_hour, length, Followers, Positive_vs_Negative,
            Time_of_Post, Likes, length_likes, yaxis)
def remove_outliers(measure_type):
    value_lb_dict = {'electric': 'Electric_(KWH)', 'gas': 'Gas_(CubicFeet)'}
    col = value_lb_dict[measure_type]
    conn = uo.connect('interval_ion')
    dfs = []
    with conn:
        df_bs = pd.read_sql('SELECT * FROM {0}_id_station'.format(measure_type), conn)
    bs_pair = zip(df_bs['Building_Number'], df_bs['ICAO'])
    # bs_pair = [x for x in bs_pair if x[0] == 'AL0039AB']
    for i, (b, s) in enumerate(bs_pair):
        print i, b
        with conn:
            df = pd.read_sql('SELECT * FROM {0} WHERE Building_Number = \'{1}\''.format(measure_type, b), conn)
        # df = df.head(n = 5000)
        # df.info()
        points = df[col]
        outliers = show_outlier(points, b, 'upper', measure_type, 1.5)
        # outliers = show_outlier(points, b, 'pos_roll', measure_type, 1.5)
        # mild, outliers = show_outlier(points, b, 'box', measure_type, 1.5)
        df['outlier'] = outliers
        print len([x for x in outliers if x])
        dfs.append(df)
    df_all = pd.concat(dfs, ignore_index=True)
    print df_all.head()
    with conn:
        df_all.to_sql('{0}_outlier_tag'.format(measure_type), conn, if_exists='replace')
    return
def get_venues(self, top_venues, num=20, exclude_recent=False, sk_artist_id=None):
    """top_venues is a dict where the key is venue id and the value is a score.
    num is the number of venues to return.
    Returns venue metadata and stores results in the venues table
    if they don't already exist."""
    top_venues_df = pd.DataFrame.from_dict(top_venues).reset_index()
    top_venues_df.columns = ["id", "score"]
    if exclude_recent and sk_artist_id is not None:
        recent_venues = self.get_recent_venues(sk_artist_id)
        top_venues_df = top_venues_df.loc[
            ~top_venues_df['id'].isin(recent_venues['id']), :].reset_index(drop=True)
    con = sql3.connect(db_path)
    venue_ids = []
    count = 1
    venues = pd.read_sql("select * from venues", con)
    for venue_id in top_venues_df['id']:
        if venue_id not in venues['id'].tolist() and not offline_mode:
            self.store_venue(venue_id)  # store the venue data
            venues = pd.read_sql("select * from venues", con)  # reload the venues dataframe
        if venues.loc[venues['id'] == venue_id, 'country'].tolist()[0] == "US":
            venue_ids.append(venue_id)
            count += 1
        if count > num:
            break
    query = ("select id, name, city, state, lat, lng, capacity from venues "
             "where country='US' and id in ({})".format(",".join(map(str, venue_ids))))
    venues = pd.read_sql(query, con)
    return venues.merge(top_venues_df).sort("score", ascending=False)
def stat_index_all_no_use(tmp_datetime):
    datetime_str = tmp_datetime.strftime("%Y-%m-%d")
    datetime_int = tmp_datetime.strftime("%Y%m%d")
    print("datetime_str:", datetime_str)
    print("datetime_int:", datetime_int)
    # Query today's qualifying stock data. Excluded: ChiNext stocks (300*),
    # SME board stocks (002*), and all ST stocks.
    # `code` not like '002%' and `code` not like '300%' and `name` not like '%st%'
    sql_1 = """
        SELECT `date`, `code`, `name`, `changepercent`, `trade`, `open`, `high`, `low`,
            `settlement`, `volume`, `turnoverratio`, `amount`, `per`, `pb`, `mktcap`, `nmc`
        FROM stock_data.ts_today_all
        WHERE `date` = %s and `trade` > 0 and `open` > 0 and trade <= 20
            and `code` not like %s and `code` not like %s and `name` not like %s
        """
    print(sql_1)
    global db
    data = pd.read_sql(sql=sql_1, con=db.engine,
                       params=[datetime_int, '002%', '300%', '%st%'])
    data = data.drop_duplicates(subset="code", keep="last")
    print("########data[trade]########:", len(data))
    # print(data["trade"])

    # 1) N-day percent change: open price change (in percent) between today and
    # the day before yesterday, 'r' stands for rate. stock['close_-2_r']
    # compares the value n days back with today's, as a percentage.
    stock_column = ['close_-1_r', 'close_-2_r', 'code', 'date']
    data_new = concat_guess_data(stock_column, data)

    # 2) CR indicator (price momentum), http://wiki.mbalib.com/wiki/CR%E6%8C%87%E6%A0%87
    # When CR falls through lines a, b, c, d and then climbs back up to 160 from
    # the low, it is a good short-term profit-taking point and some stock should
    # be sold. CR below 40 is a good entry point; CR above 300-400 calls for
    # trimming positions.
    stock_column = ['code', 'cr', 'cr-ma1', 'cr-ma2', 'cr-ma3', 'date']
    data_new = concat_guess_data(stock_column, data_new)

    # 3) KDJ indicator, http://wiki.mbalib.com/wiki/%E9%9A%8F%E6%9C%BA%E6%8C%87%E6%A0%87
    # The stochastic (KDJ) uses the highest and lowest prices over a fixed
    # period (commonly 9 days or 9 weeks) plus the last close to compute the raw
    # stochastic value RSV, then smooths it into the K, D and J lines.
    # J = 3K - 2D measures the maximum divergence of K from D, leading the KD
    # lines at tops and bottoms. J above 100 is overbought, below 10 oversold.
    stock_column = ['code', 'date', 'kdjd', 'kdjj', 'kdjk']
    data_new = concat_guess_data(stock_column, data_new)

    # 4) MACD indicator, http://wiki.mbalib.com/wiki/MACD
    # Moving Average Convergence Divergence. MACD works in trending markets but
    # gives weak buy/sell signals when the market moves sideways; short-term
    # K,D charts are often used to double-confirm its signals.
    stock_column = ['code', 'date', 'macd', 'macdh', 'macds']
    data_new = concat_guess_data(stock_column, data_new)

    # 5) BOLL indicator (Bollinger Bands), http://wiki.mbalib.com/wiki/BOLL
    stock_column = ['boll', 'boll_lb', 'boll_ub', 'code', 'date']
    data_new = concat_guess_data(stock_column, data_new)

    # 6) RSI indicator (Relative Strength Index), http://wiki.mbalib.com/wiki/RSI
    # RSI above 50 indicates a strong market, below 50 a weak one; it mostly
    # swings between 30 and 70. A 6-day RSI reaching 80 signals overbought;
    # above 90 is a severe overbought warning and a likely short-term top.
    stock_column = ['code', 'date', 'rsi_12', 'rsi_6']
    data_new = concat_guess_data(stock_column, data_new)

    # 7) W%R indicator (Williams %R), an oscillator that gauges overbought and
    # oversold conditions; http://wiki.mbalib.com/wiki/%E5%A8%81%E5%BB%89%E6%8C%87%E6%A0%87
    stock_column = ['code', 'date', 'wr_10', 'wr_6']
    data_new = concat_guess_data(stock_column, data_new)

    # 8) CCI indicator (Commodity Channel Index), created by Donald Lambert,
    # measures how far price deviates from its typical level;
    # http://wiki.mbalib.com/wiki/%E9%A1%BA%E5%8A%BF%E6%8C%87%E6%A0%87
    # 1. Crossing above +100 means price has left its normal range: buy
    #    short/mid-term, more reliable with heavy volume.
    # 2. Crossing below -100 means consolidation has ended and a longer
    #    bottom-searching phase begins: stay in cash and wait.
    # CCI defaults to 14 days.
    stock_column = ['cci', 'cci_20', 'code', 'date']
    data_new = concat_guess_data(stock_column, data_new)

    # 9) TR / ATR indicator (Average True Range): a moving average of the price
    # range over a period, used for timing entries and exits;
    # http://wiki.mbalib.com/wiki/%E5%9D%87%E5%B9%85%E6%8C%87%E6%A0%87
    # (note: the column list below repeats the CCI columns, as in the original)
    stock_column = ['cci', 'cci_20', 'code', 'date']
    data_new = concat_guess_data(stock_column, data_new)

    # 10) DMA indicator (Different of Moving Average), a mid/short-term
    # indicator for indices and individual stocks: the difference of the 10-
    # and 50-day moving averages, stock['dma']; http://wiki.mbalib.com/wiki/DMA
    stock_column = ['code', 'date', 'dma']
    data_new = concat_guess_data(stock_column, data_new)

    # 11) DMI, +DI, -DI, DX, ADX, ADXR indicators.
    # DMI: Directional Movement Index, http://wiki.mbalib.com/wiki/DMI
    # ADX: Average Directional Indicator, http://wiki.mbalib.com/wiki/ADX
    # ADXR is the average of today's ADX and the ADX of some earlier day; when
    # ADXR falls from a high together with ADX, it confirms the ADX turn early.
    # ADXR is a by-product of ADX and only gives supporting, confirming
    # signals; use ADX as the primary signal with ADXR as auxiliary, together
    # with the DMI trend before deciding to trade.
    stock_column = ['adx', 'adxr', 'code', 'date', 'dx', 'mdi', 'pdi']
    data_new = concat_guess_data(stock_column, data_new)

    # 12) TRIX / MATRIX indicator (Triple Exponentially Smoothed Average),
    # http://wiki.mbalib.com/wiki/TRIX
    stock_column = ['code', 'date', 'trix', 'trix_9_sma']
    data_new = concat_guess_data(stock_column, data_new)

    # 13) VR / MAVR indicator (Volume Ratio): the ratio of turnover on up days
    # to turnover on down days, a mid-term gauge of buying vs selling pressure;
    # http://wiki.mbalib.com/wiki/%E6%88%90%E4%BA%A4%E9%87%8F%E6%AF%94%E7%8E%87
    stock_column = ['code', 'date', 'vr', 'vr_6_sma']
    data_new = concat_guess_data(stock_column, data_new)

    data_new = data_new.round(2)  # keep 2 decimal places

    # Delete old data.
    del_sql = "DELETE FROM guess_indicators_daily WHERE trade_date= %(trade_date)s "
    db.execute(del_sql, params={'trade_date': datetime_int})
    # print(data_new.head())
    # data_new["down_rate"] = (data_new["trade"] - data_new["wave_mean"]) / data_new["wave_base"]
    db.insert_db(data_new, "guess_indicators_daily", True, "`date`,`code`")
def user_elite_cleaned_csv():
    """Prepare data from database and save the queried table as a csv file.

    This version only includes reviews / tips from the years 2010-2016.
    Elite users before 2010 or after 2016 are excluded.
    """
    with sqlite3.connect(DB_PATH) as conn:
        c = conn.cursor()
        # pre-condition
        c.executescript("""
            -- create user friend summary table
            CREATE TEMP TABLE _uf AS
            SELECT user_id, COUNT(*) as friends
            FROM user_friends
            GROUP BY user_id;

            -- create user tip summary table
            CREATE TEMP TABLE _t AS
            SELECT user_id, SUM(compliment_count) AS tip_compliment,
                   COUNT(*) AS tips, AVG(length(text)) AS tip_len
            FROM tip
            WHERE STRFTIME('%Y', date) BETWEEN '2010' AND '2016'
            GROUP BY user_id;

            -- create user review summary table
            CREATE TEMP TABLE _r AS
            SELECT user_id, COUNT(*) as review_num, AVG(length(text)) as review_len
            FROM review
            WHERE STRFTIME('%Y', date) BETWEEN '2010' AND '2016'
            GROUP BY user_id;

            -- add users having 0 friends
            INSERT INTO _uf
            SELECT user_id, 0
            FROM users
            WHERE user_id NOT IN (SELECT user_id FROM _uf);

            -- add users having 0 reviews
            INSERT INTO _r
            SELECT user_id, 0, 0.
            FROM users
            WHERE user_id NOT IN (SELECT user_id FROM _r);

            -- add users having 0 tips
            INSERT INTO _t
            SELECT user_id, 0, 0, 0.
            FROM users
            WHERE user_id NOT IN (SELECT user_id FROM _t);

            -- create exclusive user table
            CREATE TEMP TABLE _eu AS
            SELECT user_id
            FROM user_elite
            WHERE elite BETWEEN '2010' AND '2016'
            GROUP BY user_id;
        """)
        # noinspection SqlResolve
        df = pd.read_sql(
            """
            SELECT u.user_id, review_count, useful, cool, funny, fans,
                   (compliment_hot + compliment_more + compliment_profile +
                    compliment_cute + compliment_list + compliment_note +
                    compliment_plain + compliment_cool + compliment_funny +
                    compliment_writer + compliment_photos) AS compliment,
                   friends, tip_compliment, tips, tip_len, review_num, review_len,
                   u.user_id IN (SELECT user_id FROM user_elite) AS elite
            FROM users u, _uf, _t, _r
            WHERE u.user_id = _uf.user_id
              AND u.user_id = _t.user_id
              AND u.user_id = _r.user_id
              AND u.user_id NOT IN _eu
            """, conn)
        df.to_csv(DATA_DIR / 'user-profiling.csv', index=False)
        # post-condition
        # noinspection SqlResolve
        c.executescript("""
            DROP TABLE _t;
            DROP TABLE _uf;
            DROP TABLE _r;
            DROP TABLE _eu;
        """)
# encoding=utf-8
import pandas as pd
import MySQLdb
from sqlalchemy import create_engine
from sqlalchemy.types import String, BLOB

mysql_cn = create_engine('mysql+mysqldb://root:bmg123@localhost:3306/bbs_pro?charset=utf8')
# mysql_cn = MySQLdb.connect(host="localhost", port=3306, user='******',
#                            passwd='bmg123', db='bbs_pro', charset='utf8')
df = pd.read_sql("select * from app01_bbs", con=mysql_cn)
# mysql_cn.close()
pd.io.sql.to_sql(df, 'app01_bbs_1', con=mysql_cn, if_exists='append', index=False)
import pandas as pd
import matplotlib.pyplot as plt
import parameter as pa
from sklearn import cluster
from sklearn.metrics import adjusted_rand_score
from sklearn.mixture import GaussianMixture
from sqlalchemy import create_engine
import pymysql
import sys

reload(sys)
sys.setdefaultencoding('utf8')

engine = create_engine(
    'mysql+pymysql://root:[email protected]/demo?charset=utf8')
df = pd.read_sql('select * from scope_jianmo', engine)

l2 = [
    '批发业', '纺织业', '橡胶和塑料制品业', '商务服务业', '纺织服装、服饰业',
    '印刷和记录媒介复制业', '科技推广和应用服务业', '零售业', '专业技术服务业',
    '通用设备制造业', '其他制造业', '其他金融业', '非金属矿物制品业', '金属制品业',
    '皮革、毛皮、羽毛及其制品和制鞋业', '研究和试验发展', '农、林、牧、渔服务业',
    '居民服务业', '造纸和纸制品业', '新闻和出版业', '化学原料和化学制品制造业',
    '仪器仪表制造业', '软件和信息技术服务业', '酒、饮料和精制茶制造业',
    '货币金融服务', '仓储业', '建筑装饰和其他建筑业',
    '机动车、电子产品和日用产品修理业', '化学纤维制造业',
    '文教、工美、体育和娱乐用品制造业', '装卸搬运和运输代理业', '土木工程建筑业',
    '道路运输业', '房地产业', '食品制造业', '专用设备制造业',
    '电气机械和器材制造业', '其他服务业', '废弃资源综合利用业',
    '互联网和相关服务', '金属制品、机械和设备修理业', '有色金属冶炼和压延加工业',
    '农业', '住宿业', '资本市场服务', '汽车制造业', '文化艺术业',
    '电信、广播电视和卫星传输服务', '医药制造业', '家具制造业',
    '铁路、船舶、航空航天和其他运输设备制造业', '娱乐业', '租赁业', '体育',
    '木材加工和木、竹、藤、棕、草制品业', '保险业', '教育', '煤炭开采和洗选业',
    '烟草制品业', '计算机、通信和其他电子设备制造业', '非金属矿采选业',
    '广播、电视、电影和影视录音制作业', '房屋建筑业', '黑色金属冶炼和压延加工业',
    '水上运输业', '邮政业', '农副食品加工业', '建筑安装业',
    '生态保护和环境治理业', '餐饮业', '卫生', '黑色金属矿采选业', '铁路运输业',
    '电力、热力生产和供应业', '畜牧业', '林业', '水的生产和供应业',
    '公共设施管理业', '航空运输业', '渔业', '石油加工、炼焦和核燃料加工业',
def load_data():
    conn = sqlite3.connect('../data/db.sqlite')
    df = pd.read_sql('select * from disaster', conn)
    return df
def load_data_from_source():
    global tripWindow_start_time, tripWindow_end_time
    global fromLaguardiaPoolsCreatedCount
    global toLaguardiaPoolsCreatedCount
    global fromLaguardiaPoolsProcessedCount
    global toLaguardiaPoolsProcesedCount

    # print("TRIPS WINDOW " + tripWindow_start_time + " " + tripWindow_end_time)
    # Get the starting trip records whose pickup time is between the trip
    # window start time and the trip window end time
    trip_records_query = ("select RideID, tpep_pickup_datetime, pickup_latitude, "
                          "pickup_longitude, dropoff_latitude, dropoff_longitude, "
                          "dist_airport from taxitrips_v2 "
                          "where tpep_pickup_datetime between \""
                          + tripWindow_start_time + "\" and \""
                          + tripWindow_end_time
                          + "\" ORDER BY tpep_pickup_datetime ASC ")
    # print(trip_records_query)
    df_mysql = read_sql(trip_records_query, con=connection)
    if len(df_mysql) == 0:
        logging.info("No records exist between the given dates "
                     + tripWindow_start_time + " " + tripWindow_end_time)
        # print("No records exist between the given dates " + tripWindow_start_time + " " + tripWindow_end_time)
    else:
        # Put it all into a data frame
        tripData = df_mysql
        # Get the first record's start date
        pool_start_date = tripData.iloc[0]['tpep_pickup_datetime']
        # Set pool end date based on pool start date and pool window
        pool_end_date = pool_start_date + timedelta(minutes=pool_window_time1)
        # print("Started Analyzing trip requests for pool windows of " + str(pool_window_time1) + " minutes")
        tripWindow_end_time = datetime.strptime(tripWindow_end_time, "%Y-%m-%d %H:%M:%S")
        while pool_end_date <= tripWindow_end_time:
            FromLaguardiaRecords = tripData.loc[
                (tripData['pickup_longitude'].between(source_longitude_min, source_longitude_max)) &
                (tripData['pickup_latitude'].between(source_latitude_min, source_latitude_max)) &
                (tripData['tpep_pickup_datetime']).between(pool_start_date, pool_end_date)]
            ToLaguardiaRecords = tripData.loc[
                (tripData['dropoff_longitude'].between(source_longitude_min, source_longitude_max)) &
                (tripData['dropoff_latitude'].between(source_latitude_min, source_latitude_max)) &
                (tripData['tpep_pickup_datetime']).between(pool_start_date, pool_end_date)]
            # print("len is " + str(len(FromLaguardiaRecords)) + " " + str(len(ToLaguardiaRecords)))
            pick_a_ride(FromLaguardiaRecords, "From Laguardia", pool_window_time1)
            pick_a_ride(ToLaguardiaRecords, "To Laguardia", pool_window_time1)
            pool_start_date = pool_end_date + timedelta(seconds=1)
            pool_end_date = pool_end_date + timedelta(minutes=pool_window_time1)
        if pool_start_date < tripWindow_end_time and pool_end_date > tripWindow_end_time:
            FromLaguardiaRecords = tripData.loc[
                (tripData['pickup_longitude'].between(source_longitude_min, source_longitude_max)) &
                (tripData['pickup_latitude'].between(source_latitude_min, source_latitude_max)) &
                (tripData['tpep_pickup_datetime']).between(pool_start_date, tripWindow_end_time)]
            ToLaguardiaRecords = tripData.loc[
                (tripData['dropoff_longitude'].between(source_longitude_min, source_longitude_max)) &
                (tripData['dropoff_latitude'].between(source_latitude_min, source_latitude_max)) &
                (tripData['tpep_pickup_datetime']).between(pool_start_date, tripWindow_end_time)]
            print("len is " + str(len(FromLaguardiaRecords)) + " " + str(len(ToLaguardiaRecords)))
            pick_a_ride(FromLaguardiaRecords, "From Laguardia", pool_window_time1)
            pick_a_ride(ToLaguardiaRecords, "To Laguardia", pool_window_time1)
        pool_start_date = tripData.iloc[0]['tpep_pickup_datetime']
        logging.info("Starting processing for 10 minutes window")
        # Set pool end date based on pool start date and pool window
        pool_end_date = pool_start_date + timedelta(minutes=pool_window_time2)
def get(self, request: HttpRequest, val_pks: List[int],
        fmt_pks: Optional[List[int]] = None, *args, **kwargs) -> Response:
    if fmt_pks is None:
        fmt_pks = []
    do_excel = False
    if 'report' in request.GET and request.GET['report'] == 'excel':
        do_excel = True
    grouping = request.GET.get('group-by', 'feature')
    validations = Validation.objects.filter(pk__in=val_pks)

    # Looking for best items in target validations
    ibest = Result.sa \
        .query(Result.sa.item_id,
               func.max(Status.sa.priority).label('best_status_priority')) \
        .filter(Result.sa.validation_id.in_(val_pks),) \
        .join(Status.sa) \
        .group_by(Result.sa.item_id).subquery('ibest')

    # looking for date of best validation
    best = Result.sa.query(ibest.c.item_id,
                           func.max(Status.sa.id).label('best_status'),
                           func.max(Validation.sa.date).label('best_validation_date')) \
        .select_from(Result.sa) \
        .join(Status.sa, Status.sa.id == Result.sa.status_id) \
        .join(Validation.sa, Validation.sa.id == Result.sa.validation_id) \
        .join(ibest, Result.sa.item_id == ibest.c.item_id) \
        .filter(Result.sa.validation_id.in_(val_pks),
                Status.sa.priority == ibest.c.best_status_priority) \
        .group_by(ibest.c.item_id).subquery('best')

    v2 = Result.sa.query(Result.sa.item_id, Validation.sa.id,
                         Result.sa.status_id, Validation.sa.date) \
        .filter(Result.sa.validation_id.in_(val_pks),) \
        .join(Validation.sa) \
        .subquery('v2')

    # Looking for best validation in found date
    vbest = Result.sa.query(best.c.item_id,
                            func.max(v2.c.id).label('best_validation'),
                            func.max(best.c.best_status).label('best_status_id')) \
        .select_from(best) \
        .join(v2, and_(v2.c.item_id == best.c.item_id,
                       v2.c.status_id == best.c.best_status,
                       v2.c.date == best.c.best_validation_date)) \
        .group_by(best.c.item_id) \
        .subquery('vbest')

    # Looking for best results in found validations
    res = Result.sa.query(vbest.c.item_id, vbest.c.best_validation,
                          vbest.c.best_status_id,
                          func.max(Result.sa.id).label('result')) \
        .select_from(vbest) \
        .join(Result.sa, and_(
            Result.sa.item_id == vbest.c.item_id,
            Result.sa.validation_id == vbest.c.best_validation,
            Result.sa.status_id == vbest.c.best_status_id)) \
        .group_by(vbest.c.item_id, vbest.c.best_validation, vbest.c.best_status_id)

    # Select scenario ids, feature names, codec names from
    # feature mapping rules which belong to selected FMTs
    fm_rules = fmt_rules(fmt_pks).subquery('fm_rules')

    # joining referenced tables to get names and so on
    res = res.subquery('res')
    q = Result.sa.query(
        (fm_rules.c.Feature if grouping == 'feature' else fm_rules.c.Codec).label('group'),
        Item.sa.name.label('item_name'),
        fm_rules.c.Codec,
        res.c.best_status_id,
        Validation.sa.name.label('val_name'),
        Driver.sa.name.label('driver_name'),
        Result.sa.item_id,
        Result.sa.validation_id,
        Validation.sa.source_file,
        Result.sa.result_url) \
        .select_from(res) \
        .join(Item.sa, Item.sa.id == res.c.item_id) \
        .join(Result.sa, Result.sa.id == res.c.result) \
        .join(fm_rules, Item.sa.scenario_id == fm_rules.c.scenario_id, full=True) \
        .join(Validation.sa) \
        .join(Driver.sa) \
        .order_by(fm_rules.c.Feature if grouping == 'feature' else fm_rules.c.Codec,
                  Item.sa.name)

    # Create DataFrame crosstab from SQL request
    df = pd.read_sql(q.statement, q.session.bind)
    df['group'] = df['group'].fillna('Unknown')
    if grouping == 'feature' and fmt_pks:
        # extend feature name with codec
        df = df.apply(lambda row: feature_codec_concat(row), axis=1)
        df = df.drop('Codec', axis=1)  # bug fix: the drop result was discarded
    if df.empty:
        if do_excel:
            return Response()
        return Response({'headers': [], 'items': []})
    ct = pd.crosstab(index=df.group, values=df.item_name,
                     columns=df.best_status_id, aggfunc='count',
                     colnames=[''], margins=True, margins_name='Total',
                     dropna=False)

    # prepare DataFrame crosstab for response
    ct = prepare_crosstab(ct, grouping)

    # If no excel report needed just finish here with json return
    if not do_excel:
        return Response(convert_to_datatable_json(ct))

    # Excel part
    workbook = excel.do_report(data=ct, extra=validations,
                               report_name='Best status report')
    filename = f'best_report_{datetime.now():%Y-%m-%d_%H:%M:%S}.xlsx'
    response = HttpResponse(save_virtual_workbook(workbook),
                            content_type='application/ms-excel')
    response['Content-Disposition'] = f'attachment; filename="{filename}"'
    return response
def main(args=None):
    args = parser.parse_args(args)
    config.read(args.config)
    try:
        logging.basicConfig(format="%(levelname)s:%(message)s",
                            level=config["logging"]["log-level"])
    except ValueError:
        logging.warning(
            f"Incorrect log-level specified: {config['logging']['log-level']}. "
            "Falling back to INFO.")
        logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO)
    try:
        realm = Realm[config["api"]["realm"]]
    except KeyError:
        logging.critical(
            f"Configured realm \"{config['api']['realm']}\" is unknown. "
            f"Choose one of: {', '.join(Realm.__members__.keys())}")
        sys.exit(1)
    if not len(config["accounts"]):
        logging.warning(
            "There are no configured accounts, nothing to do. "
            'Check the "accounts" section in the config file.')
    try:
        account_data = account_info(
            realm,
            config["api"]["application-id"],
            list(config["accounts"].keys()),
        ).values()
    except ValueError as e:
        logging.critical(e)
        sys.exit(1)
    flat_account_data = [
        timestamps_to_datetime(flatten(data, strip=True), keys=TIME_FIELDS)
        for data in account_data
    ]
    rows = [
        {column.name: data[column.name] for column in statistics.columns}
        for data in flat_account_data
    ]
    try:
        with sa.create_engine(config["db"]["url"]).connect() as conn:
            changed = False
            for row in rows:
                logging.info(
                    f"Attempting insert {row['nickname']} @ {row['updated_at']}")
                try:
                    conn.execute(
                        statistics.insert()
                        .values(row)
                        .compile(dialect=postgresql.dialect()))
                    changed = True
                    logging.info("Insert successful")
                except IntegrityError as e:
                    if not isinstance(e.orig, psycopg2.errors.UniqueViolation):
                        raise e from e
                    logging.info("Skipping, record exists")
            if config["plots"] and changed:
                logging.info("Change detected, updating plots")
                df = pd.read_sql(
                    "SELECT * from statistics ORDER BY updated_at",
                    conn,
                    index_col=["account_id", "updated_at"],
                )
                plt.style.use("Solarize_Light2")
                for path, interval_str in config["plots"].items():
                    if interval_str:
                        figure = create_plot(
                            df[df.index.get_level_values(1) >
                               (pd.Timestamp.now(tz="UTC") - pd.Timedelta(interval_str))])
                    else:
                        figure = create_plot(df)
                    logging.info(f"Saving {path}")
                    figure.savefig(path)
    except sa.exc.OperationalError as e:
        logging.critical(f"Invalid database URL: {config['db']['url']} ({e})")
        sys.exit(1)
    except sa.exc.NoSuchModuleError:
        logging.critical(f"Invalid protocol in database URL: {config['db']['url']}")
        sys.exit(1)
def get_columns(table, db):
    return pd.read_sql(
        f"SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS "
        f"WHERE TABLE_SCHEMA = '{db[0]}' AND TABLE_NAME = '{table[0]}';",
        conn).values
tilt_1 = check(start, '傾角1管理值判定')
tilt_2 = check(start, '傾角2管理值判定')
send(mailgo, water)
send(mailgo, tilt_1)
send(mailgo, tilt_2)
time.sleep(n)

SYS_path = os.path.dirname(os.path.abspath(__file__))
STATUS_TABLE_Path = SYS_path + '/STATUS_TABLE_OUTPUT/' + 'Monitoring_Status.xlsx'
USERLIST_DB = Path(SYS_path + "/USERLIST.mdb")
conn_str = (r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};'
            r'DBQ=%s;' % (USERLIST_DB))
cnxn = pyodbc.connect(conn_str)
df = pd.read_sql("SELECT * FROM USERLIST", cnxn)
# keep only users whose notification flag ('資料通知') is enabled ('開啟')
df_NOTIFICATION = df[df['資料通知'] == '開啟']
toaddr = list(df_NOTIFICATION['帳號'])  # '帳號' = account (e-mail address)
mailgo = ",".join(toaddr)

timer_reload(259200)
def get_databases():
    databases = pd.read_sql(
        "SELECT schema_name FROM information_schema.schemata "
        "WHERE schema_name not in ('information_schema','mysql',"
        "'performance_schema','sys');", conn).values
    return databases
def get_tables(db):
    return pd.read_sql(f"SHOW TABLES FROM {db[0]};", conn).values
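# A sketch of how the three helpers above can compose into a schema walk,
# assuming the same module-level conn; each helper returns a numpy array of
# single-element rows, hence the [0]-style indexing.
for db in get_databases():
    for table in get_tables(db):
        cols = get_columns(table, db)
        print(db[0], table[0], [c[0] for c in cols])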
#####################
# Import libraries
import pandas as pd
import numpy as np
# import plotly.graph_objects as go
from datetime import datetime, timedelta
import time
from sqlalchemy import create_engine

#####################
### Fetch raw data from AWS DB
engine = create_engine('postgresql://*****:*****@ds4a18.cmlpaj0d1yqv.us-east-2.rds.amazonaws.com:5432/Airports_ds4a')
var = pd.read_sql("SELECT count(1) from dataraw", engine.connect(), parse_dates=('valid',))
df = pd.read_sql("SELECT * from dataraw", engine.connect(), parse_dates=('valid',))
df = df.dropna(subset=['valid'], axis=0)
df['DateTime'] = df['valid'].apply(
    lambda x: datetime.strptime(str(x), "%Y-%m-%d %H:%M:%S") - timedelta(hours=3))
df.replace(['M', None], np.nan, inplace=True)
print('Data Successfully Fetched From AWS RDS', end='\n\n')

#####################
#### Select columns with an NA % lower than 22% (previously selected)
print('Deleting Variables With More Than 22% of Missing Values', end='\n\n')
cols = ['id', 'station', 'DateTime', 'valid', 'tmpf', 'dwpf', 'relh', 'drct',
        'sknt', 'p01i', 'alti', 'vsby', 'skyc1', 'skyl1', 'feel']
df = df[cols]
# Connect to database:
con = sqlite3.connect("NYC-Taxi.db")

# ## Extract observations and clean data
#
# The analysis will focus on a sample of a specified number of observations of
# green and yellow cab data. Data from each of these sources will be stacked
# together, features created, and then finally a train and test set created.

# need to set seed by randomly sampling indices in python then pass to sql
# specify number of rows with the pull variable
pull = "5000000"

# Store sample of green and yellow cab data into dataframes:
df1 = pd.read_sql(f"SELECT * FROM green_cabs ORDER BY random() LIMIT {pull}", con=con)
df2 = pd.read_sql(f"SELECT * FROM yellow_cabs ORDER BY random() LIMIT {pull}", con=con)

# Add labels for green and yellow cabs and rename pickup/dropoff datetime columns:
df1 = df1.rename(
    columns={
        "lpep_pickup_datetime": "pickup_datetime",
        "lpep_dropoff_datetime": "dropoff_datetime"
    })
df2 = df2.rename(
    columns={
        "tpep_pickup_datetime": "pickup_datetime",
        "tpep_dropoff_datetime": "dropoff_datetime"
    })
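# A sketch of the seeding idea from the TODO comment above: sample rowids in
# Python with a seeded RNG and pass them to SQL, so the draw is reproducible
# (assumes a contiguous ROWID; illustrative only, since for a 5M-row pull a
# temporary table of sampled ids would be more practical than a string IN list).
import random
random.seed(42)
n_rows = pd.read_sql("SELECT COUNT(*) AS n FROM green_cabs", con)["n"][0]
sample_ids = random.sample(range(1, n_rows + 1), int(pull))
id_list = ",".join(map(str, sample_ids))
df1 = pd.read_sql(f"SELECT * FROM green_cabs WHERE ROWID IN ({id_list})", con=con)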
sDate = str(j.strftime("%Y-%m-%d"))
eDate = str((j + dt.timedelta(days=6)).strftime("%Y-%m-%d"))
counter += 1
print(f"{client_name} - {counter} of {num_weeks}")
print(f'Aggregating {sDate} - {eDate}')
sql = f"""
    SELECT RanksDaily_{save_name}.Date, KeywordsTable.StatId,
        RanksDaily_{save_name}.Rank as Rank,
        RanksDaily_{save_name}.BaseRank as BaseRank,
        RankingUrl.Url as RankingUrl
    FROM RanksDaily_{save_name}
    JOIN KeywordsTable ON RanksDaily_{save_name}.KeywordId = KeywordsTable.Id
    JOIN RankingUrl ON RanksDaily_{save_name}.RankingUrlId = RankingUrl.Id
    WHERE date BETWEEN :sDate AND :eDate;
    """
params = {"sDate": sDate, "eDate": eDate}
df = pd.read_sql(sql, params=params, con=con)
week_df = df.drop_duplicates(subset=["StatId"], keep="first")
week_df = week_df.drop(["Rank", "BaseRank", "RankingUrl"], axis=1)
# get average Rank
temp = df.groupby("StatId", as_index=False)["Rank"].mean().round().astype(int)
week_df = week_df.merge(temp, on="StatId", how="left")
# get average BaseRank
temp = df.groupby("StatId", as_index=False)["BaseRank"].mean().round().astype(int)
week_df = week_df.merge(temp, on="StatId", how="left")
# get top RankingUrl
temp = df.sort_values("BaseRank", ascending=True).drop_duplicates("StatId").sort_index().reset_index(drop=True)
week_df = pd.merge(week_df, temp[["StatId", "RankingUrl"]], on="StatId", how="left")
# insert ranksweekly_df into RanksDaily_{save_name}
print(f"Adding keywords...")
def update_db(shadow_dir, station_list, dfile='shadows_data.db'):
    '''
    Update both tables: SHADOWS and STATIONS
    '''
    conn = sqlite3.connect(dfile)
    c = conn.cursor()
    today = datetime.strftime(datetime.now(), "%Y/%m/%d")
    # header: station_id,station_name,lon,lat
    new_stations = pd.read_csv(station_list, header=None)
    new_stations.columns = ['station_id', 'station_name', 'lon', 'lat']
    new_stations["Date"] = [today] * new_stations.shape[0]
    sql_com = "SELECT * FROM STATIONS"
    current_stations = pd.read_sql(sql_com, conn)
    for station in current_stations.station_id.values:
        new_stations.drop(
            new_stations.index[new_stations['station_id'] == station],
            inplace=True)
    if not new_stations.empty:
        # if it found something new, update the sql database
        print("Updating STATIONS table")
        new_stations.to_sql('STATIONS', conn, if_exists='append', index=False)
    else:
        print("No new stations added to STATIONS table")
        return
    # read the database again if it was updated
    current_stations = pd.read_sql(sql_com, conn)
    sql_com = "SELECT * FROM SHADOWS"
    shadow_old = pd.read_sql(sql_com, conn)
    # extract the info from the ifile
    from os.path import normpath, basename
    dir_info = basename(normpath(shadow_dir))
    # extract data from the name of the directory
    maxdist, res, horstep, dummy = dir_info.replace("lh_", "").split("_")
    stations = []
    print("Checking for new data for SHADOWS table")
    for ifile in sorted(os.listdir(shadow_dir)):
        if ifile.startswith("lh_"):  # will probably find shadows.log here
            station = int(ifile.replace("lh_", "").split("_")[1])
            get_station = current_stations[current_stations['station_id'] == station]
            if get_station.empty:
                print("Station %d not yet in STATIONS table" % station)
            else:
                print("Getting SHADOWS for station %d" % station)
                print("Reading shadows from %s" % os.path.join(shadow_dir, ifile))
                read_shadows = pd.read_csv(os.path.join(shadow_dir, ifile),
                                           index_col=False)
                size = read_shadows.shape[0]
                az = read_shadows.azimuth.to_list()
                hor = read_shadows.horizon_height.to_list()
                lon = get_station.lon.values[0]
                lat = get_station.lat.values[0]
                station_name = get_station.station_name.values[0]
                shadow_new = pd.DataFrame({
                    "station_id": [station] * size,
                    "station_name": [station_name] * size,
                    "resolution": [res] * size,
                    "maxdistance": [maxdist] * size,
                    "horizonstep": [horstep] * size,
                    "azimuth": az,
                    "horizon_height": hor,
                    "Date": [today] * size
                })
                if shadow_old.empty:
                    shadow_new.to_sql('SHADOWS', conn, if_exists='append', index=False)
                else:
                    # drop from the new data any stations already in old data
                    for station in shadow_old.station_id.values:
                        shadow_new.drop(
                            shadow_new.index[shadow_new['station_id'] == station],
                            inplace=True)
                    if not shadow_new.empty:
                        shadow_new.to_sql('SHADOWS', conn, if_exists='append',
                                          index=False)
                    else:
                        print("No new data added to the SHADOWS table")
    print("database updated")
    c.execute('''
        INSERT INTO DAILY_STATUS (station_id,station_name,Date)
        SELECT DISTINCT clt.station_id, ctr.station_name, clt.Date
        FROM STATIONS clt
        LEFT JOIN SHADOWS ctr ON clt.station_id = ctr.station_id
        ''')
    c.execute('''
        SELECT DISTINCT * FROM DAILY_STATUS
        WHERE Date = (SELECT max(Date) FROM DAILY_STATUS)
        ''')
    df = DataFrame(c.fetchall(), columns=['station_id', 'station_name', 'Date'])
    print("New data")
    print(df)
def generate_plot_data(
    self,
    metric,
    parameter,
    model_group_ids,
    train_end_times,
):
    """Fetch data necessary for producing the plot from the distance table

    Arguments:
        metric (string) -- model evaluation metric, such as 'precision@'
        parameter (string) -- model evaluation metric parameter, such as '300_abs'
        model_group_ids (list) -- model group ids to include in the dataset
        train_end_times (list) -- train end times to include in the dataset

    Returns: (pandas.DataFrame) The relevant models and the percentage of
        time each was within various thresholds of the best model at that time
    """
    model_group_union_sql = ' union all '.join([
        '(select {} as model_group_id)'.format(model_group_id)
        for model_group_id in model_group_ids
    ])
    plot_min, plot_max = self.plot_bounds(metric, parameter)
    plot_tick_dist = self.plot_tick_dist(plot_min, plot_max)
    sel_params = {
        'metric': metric,
        'parameter': parameter,
        'model_group_union_sql': model_group_union_sql,
        'distance_table': self.distance_from_best_table.distance_table,
        'model_group_str': str_in_sql(model_group_ids),
        'train_end_str': str_in_sql(train_end_times),
        'series_start': plot_min,
        'series_end': plot_max,
        'series_tick': plot_tick_dist,
    }
    sel = """\
        with model_group_ids as ({model_group_union_sql}),
        x_vals AS (
            SELECT m.model_group_id, s.distance
            FROM (SELECT GENERATE_SERIES(
                {series_start}, {series_end}, {series_tick}
            ) AS distance) s
            CROSS JOIN
            (SELECT DISTINCT model_group_id FROM model_group_ids) m
        )
        SELECT
            dist.model_group_id,
            distance,
            mg.model_type,
            COUNT(*) AS num_models,
            AVG(CASE WHEN dist_from_best_case <= distance THEN 1 ELSE 0 END)
                AS pct_of_time
        FROM {distance_table} dist
        JOIN x_vals USING (model_group_id)
        JOIN model_metadata.model_groups mg USING (model_group_id)
        WHERE
            dist.metric = '{metric}'
            AND dist.parameter = '{parameter}'
            AND model_group_id IN ({model_group_str})
            AND train_end_time IN ({train_end_str})
        GROUP BY 1, 2, 3
    """.format(**sel_params)
    return (pd.read_sql(sel, self.distance_from_best_table.db_engine)
            .sort_values(['model_group_id', 'distance']))
def test_data(cnxn):
    # 作物名稱 = crop name, 平均價 = average price; the N'' literal keeps the
    # Unicode LIKE pattern intact on SQL Server
    query = ("SELECT 作物名稱, 平均價 FROM dbo.Veg "
             "WHERE 作物名稱 LIKE (N'%花椰%')")
    df = pd.read_sql(query, cnxn)
    return df
def getTable(self, tablename='agmednet_01'):
    # tablename is concatenated into the query, so it must come from
    # trusted code, never from user input
    self.connectSQL()
    df = pd.read_sql("SELECT * FROM " + tablename, self.engine)
    return df
def main():
    # Load settings
    filepath_settings = 'C:/DISCHARGEDB/code/data/settings.json'
    settings = initSettings()
    saveSettings(settings, filepath_settings)
    settings = fillSettingsTags(loadSettings(filepath_settings))

    # Download new images from AG Mednet
    discharge = DISCHARGEDB(database=settings['database'])
    #discharge.download_images(settings)
    #discharge.update_images(settings)
    discharge.truncateTable(tablename='dicom')
    discharge.update_dicom(settings)

    ### Update agmednet reports ###
    discharge = DISCHARGEDB(host="127.0.0.1", port='3306', user="******",
                            password="******", database=settings['database'])
    discharge.update_agmednet_01(settings)
    discharge.update_agmednet_02(settings)

    discharge = DISCHARGEDB(host="127.0.0.1", port='3306', user="******",
                            password="******", database=settings['database'])
    rs = discharge.truncateTable('agmednet_02')
    discharge.update_agmednet_02(settings)
    df = discharge.getTable('agmednet_02')
    df = discharge.getTable('agmednet_01')

    #### Execute script #####
    discharge = DISCHARGEDB(host="127.0.0.1", port='3306', user="******",
                            password="******", database=settings['database'])
    table = discharge.getTable('agmednet_01')
    discharge.truncateTable('agmednet_01')
    discharge.truncateTable('agmednet_02')
    discharge.connectSQL()
    table = discharge.getTable('agmednet_01')

    # Read a table directly through a SQLAlchemy engine
    self = db
    mysql_path = ('mysql://' + self.user + ':' + self.password + '@'
                  + self.host + '/' + self.database + '?charset=utf8')
    sqlEngine = create_engine(mysql_path)
    df = pd.read_sql("SELECT * FROM agmednet_01", sqlEngine)

    ### Reset autoincrement
    db = DISCHARGEDB(host="127.0.0.1", port='3306', user="******",
                     password="******", database=settings['database'])
    db.connectSQL()
    db.resetAutoIncrement()
    #db.createDB()
    db.initDB(settings)
    db.executeScript(
        fip_script='H:/cloud/cloud_data/Projects/DISCHARGEDB/src/scripts/set_primary_key.sql',
        replace=('TABLE_VAR', 'v_a06_docu_hosp'))
    result = db.executeSQL('SELECT * FROM dischargedb3.site;')
    db.sas7bdatTosql()
    db.closeSQL()

    filename = 'v_a01_fu_staff'
    db = DISCHARGEDB(database=settings['database'])
    db.connectSQL()
    db.executeSQL('ALTER TABLE ' + filename + ' ADD PRIMARY KEY (`index`)')
    command = ("ALTER TABLE `dischargedb`.`v_a03_ses_staff` "
               "CHANGE COLUMN `index` `index` BIGINT NOT NULL")
    cursor = db.db.cursor()
    cursor.execute(command)
    result = cursor.fetchall()

    db = DISCHARGEDB(database=settings['database'])
    db.connectSQL()
    #command = "ALTER TABLE `dischargedb`.`v_a02_fu_questf_sub01` CHANGE COLUMN `index` `index` BIGINT NULL, ADD PRIMARY KEY (`index`);"
    # `index` is a reserved word in MySQL, so it must stay backtick-quoted
    command = ("ALTER TABLE `dischargedb`.`v_a03_ses_staff` "
               "CHANGE COLUMN `index` `index` BIGINT NOT NULL")
    db.executeSQL(command)

    ##############################
    reader = SAS7BDAT(
        'H:/cloud/cloud_data/Projects/DISCHARGEDB/data/tmp/ecrf/v_g02_ct_reading_a.sas7bdat',
        skip_header=False)
    df1 = reader.to_data_frame()
    for i in range(len(reader.columns)):
        f = reader.columns[i].format
        print('format:', f)
    c = reader.columns[10]

    fip = 'H:/cloud/cloud_data/Projects/DISCHARGEDB/data/tmp/ecrf/v_a01_fu_staff.sas7bdat'
    df = pd.read_sas(fip, format='sas7bdat', encoding='iso-8859-1')
    # the 'flavor' keyword was removed from to_sql in pandas 0.23
    df.to_sql(con=con, name='table_name_for_df', if_exists='replace')

    mysql_path = 'mysql://*****:*****@localhost/?charset=utf8'
    engine = create_engine(mysql_path, encoding="utf-8", echo=False)
    # with engine.connect() as con:
    #     con.execute("use dischargedb3; drop table if exists " + name + ";")
    # df = pd.read_excel(path)
    # df.to_sql(name, engine, index=False)

    fip = 'H:/cloud/cloud_data/Projects/DISCHARGEDB/data/tables/sas/v_a02_fu_questf_sub01.sas7bdat'
    df = pd.read_sas(fip, format='sas7bdat', encoding='iso-8859-1')
    with engine.connect() as con:
        con.execute("use dischargedb3;")
        df.to_sql('table6', engine, index=False)

    df = pd.read_excel(
        'H:/cloud/cloud_data/Projects/DISCHARGEDB/data/tables/xlsx/discharge_ecrf_01092020.xlsx',
        sheet_name='Sheet1', index_col=0)
# In[2]:

import MySQLdb
import pandas as pd

my_db = MySQLdb.connect(host='localhost', user='******', passwd='yesican',
                        db='pythonBuild')
cursor = my_db.cursor()

# In[3]:

read_query = 'SELECT * FROM nifty_it_index;'

# In[4]:

nifty_it_index = pd.read_sql(read_query, my_db, index_col=['Date'],
                             parse_dates=True)

# In[5]:

nifty_it_index.info()

# In[6]:

nifty_it_index.head()

# In[7]:

read_query = 'SELECT * FROM infy_stock;'

# In[8]:

infy_stock = pd.read_sql(read_query, my_db, index_col=['Date'],
                         parse_dates=True)
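# A possible follow-up cell (not in the source): summarise daily returns of
# the index, assuming the table has a 'Close' column -- that column name is
# a guess, adjust to the actual schema.
nifty_it_index['Close'].pct_change().describe()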
def info_region_match():
    region_info = pd.read_sql(
        "select fund_id, region from fund_info_aggregation "
        "WHERE region IS NOT NULL AND region <> ''", engine_rd)
    with codecs.open("./Scripts/DataQuality/fund_info/TestChooseAddress.js",
                     'r', "utf-8") as file:
        region_file = file.read()
    region_info_copy = region_info.copy()  # work on a real copy, not a view
    # first two digits of the region code -> province name
    prov = {
        '11': '北京', '12': '天津', '13': '河北', '14': '山西', '15': '内蒙古',
        '21': '辽宁', '22': '吉林', '23': '黑龙江',
        '31': '上海', '32': '江苏', '33': '浙江', '34': '安徽', '35': '福建',
        '36': '江西', '37': '山东',
        '41': '河南', '42': '湖北', '43': '湖南', '44': '广东', '45': '广西',
        '46': '海南',
        '50': '重庆', '51': '四川', '52': '贵州', '53': '云南', '54': '西藏',
        '61': '陕西', '62': '甘肃', '63': '青海', '64': '宁夏', '65': '新疆',
        '71': '台湾', '81': '香港', '82': '澳门', '90': '外国'
    }
    for i in range(len(region_info)):
        region = region_info['region'][i]
        pattern = r"'(\d+)','({}.*?)'".format(region)
        try:
            match_result = re.search(pattern, region_file).groups()
            if match_result[0] in ('440300', '440303', '440304', '440305',
                                   '440306', '440307', '440308', '440391',
                                   '440392'):
                # Shenzhen district codes map to the city, not the province
                region_info_copy.loc[i, 'region'] = '深圳'
                # print(region_info['fund_id'][i])
            else:
                prov_code = match_result[0][0:2]
                region_info_copy.loc[i, 'region'] = prov[prov_code]
                # sql = "update easy.fund_info_aggregation set region = '{}' where fund_id = '{}'".format(
                #     region_info_copy['region'][i], region_info_copy['fund_id'][i])
                # engine_rd.execute(sql)
        except Exception as e:
            print(region_info['fund_id'][i], region)
    return region_info_copy
           MIN(s.CourseID) as MathStarting
           FROM VsaDev.dbo.StudyPlan as s
           where s.CourseID in (select CourseID from Course as c
                                where c.CourseNumber like 'MATH%')
           GROUP by s.GeneratedPlanID) as m1
    on m1.GeneratedPlanID = g.GeneratedPlanID
    -- find the english starting point
    full JOIN (select s.GeneratedPlanID,
                      MIN(s.CourseID) as EnglishStarting
               FROM VsaDev.dbo.StudyPlan as s
               where s.CourseID in (select CourseID from Course as c
                                    where c.CourseNumber like 'ENGL%')
               GROUP by s.GeneratedPlanID) as eng
    on eng.GeneratedPlanID = g.GeneratedPlanID;""")

vaaData = pd.read_sql(data, conn)
vaaData.shape

# dropped the ids as they don't serve much purpose to my work
vaaData = vaaData.drop(columns=['ParameterSetID', 'MajorID', 'SchoolID',
                                'JobTypeID', 'QuarterPreferenceID',
                                'StartingQuarter'])
def data():
    return pd.read_sql('passengers', DATABASE_URL).to_dict(orient="index")
def get_from_db(db, table_name):
    # table_name is interpolated into the query text, so it must be trusted
    df = pd.read_sql("SELECT * FROM {}".format(table_name), db)
    return df
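# A defensive variant, as a sketch only: whitelist the table name against
# the tables that actually exist before interpolating it. Assumes `db` is a
# SQLAlchemy engine; `inspect` comes from sqlalchemy.
from sqlalchemy import inspect

def get_from_db_checked(db, table_name):
    if table_name not in inspect(db).get_table_names():
        raise ValueError("unknown table: %s" % table_name)
    return pd.read_sql("SELECT * FROM {}".format(table_name), db)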
def read_data(self):
    engine = create_engine(self.db_url, echo=False)
    return pd.read_sql(self.sql_query, engine, **self.pandas_kwargs_read)
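# Usage sketch: a minimal host class, invented here for illustration, showing
# how pandas_kwargs_read is forwarded to pd.read_sql (e.g. index_col,
# parse_dates, chunksize). The URL, table, and column names are hypothetical.
class SqlReader:
    def __init__(self, db_url, sql_query, pandas_kwargs_read=None):
        self.db_url = db_url
        self.sql_query = sql_query
        self.pandas_kwargs_read = pandas_kwargs_read or {}

    read_data = read_data  # reuse the method defined above

reader = SqlReader("sqlite:///example.db",
                   "SELECT * FROM events",
                   pandas_kwargs_read={"parse_dates": ["created_at"]})
df = reader.read_data()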
def getAvgSurveyResults():
    avgResults = pd.read_sql(
        "select value as Question_Num, Data_Type, Chart_Type, "
        "sum(Correct) AS numCorrect, "
        "(sum(Correct) / (COUNT(Distinct Survey_ID))) * 100 As percent_correct "
        "from survey_results.survey_results "
        "where value != 'feedbk' "
        "group by value", conn)
    return avgResults.to_json(orient='records')
""" import sqlite3 conn = sqlite3.connect( r'D:\Work\DataScience\DS ppt\datascience notes\SQLDB\employee.db') cur = conn.cursor() cur.execute('select * from EMPLOYEE') for rows in cur: print(list(rows)) ############################################################################# import sqlite3 import pandas as pd conn = sqlite3.connect( r'D:\Work\DataScience\DS ppt\datascience notes\SQLDB\employee.db') emp_set = pd.read_sql('select * from EMPLOYEE', conn) ############################################################################# import sqlite3 conn = sqlite3.connect( r'D:\Work\DataScience\DS ppt\datascience notes\SQLDB\employee.db') cur = conn.cursor() cur.execute( "INSERT INTO EMPLOYEE(EMP_ID,NAME,LOCATION,SALARY) VALUES(106,'KIRAN','CHENNAI',70000)" ) conn.commit() ############################################################################# a = [[1, "Ashwin", "Chennai"], [2, "Raina", "Chennai"], [3, "Steyn", "Hydrabad"]] b = [[2, "Raina", "Chennai"], [4, "Kohli", "Hydrabad"], [5, "Dhoni", "Pune"]]
def getNewSurveyResults():
    newResults = pd.read_sql(
        "SELECT COUNT(Distinct Survey_ID) AS numberOFattempts, "
        "COUNT(Value) AS questionsAnswered, "
        "(SUM(correct) / COUNT(*)) * 100 AS pctCorrect, "
        "SUM(correct) AS numCorrect, "
        "SUM(correct != 1) as numIncorrect, "
        "SUM(correct) / COUNT(Distinct Survey_ID) AS avgScore "
        "FROM survey_results.survey_results "
        "where value != 'feedbk'", conn)
    return newResults.to_json(orient='records')
def get_sql(sql, db_file, params=None):
    with sqlite3.connect(db_file) as db:
        # params must be passed by keyword; positionally it would land in
        # read_sql's index_col argument
        return pd.read_sql(sql, db, params=params)
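# Usage sketch (file, table, and column names are hypothetical): qmark-style
# placeholders keep the value out of the SQL text.
df = get_sql("SELECT * FROM trips WHERE fare > ?", "trips.db",
             params=(20.0,))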