class TestParseFileTime(unittest.TestCase):
    '''
    Test the parse_filename_time function, which attempts to extract
    the datetime/runtime/run iteration from a standard SWMF file name.
    '''

    from datetime import datetime as dt

    files = ['mag_grid_e20130924-232600.out',
             'y=0_mhd_1_e20130924-220500-054.out',
             'y=0_mhd_2_t00001430_n00031073.out',
             'z=0_mhd_2_t00050000_n00249620.out',
             os.path.join(spacepy_testing.datadir, 'pybats_test',
                          'mag_grid_ascii.out'),
             'y=0_mhd_1_t20140410000000_n00001500.out',
             'z=0_mhd_2_e20140410-000000-000_20140410-000300-000.outs',
             'z=0_mhd_2_n00001500_00001889.outs']

    iters = [None, None, 31073, 249620, None, 1500, None, [1500, 1889]]
    times = [None, None, 870, 18000, None, None, None, None]
    dates = [dt(2013, 9, 24, 23, 26, 0),
             dt(2013, 9, 24, 22, 5, 0),
             None, None, None,
             dt(2014, 4, 10, 0, 0),
             [dt(2014, 4, 10, 0, 0, 0), dt(2014, 4, 10, 0, 3, 0)],
             None]

    def testParse(self):
        from spacepy.pybats import parse_filename_time
        for f, d, t, i in zip(self.files, self.dates, self.times, self.iters):
            self.assertEqual(parse_filename_time(f), (i, t, d))
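# A minimal usage sketch for parse_filename_time (assumes spacepy is
# installed); the expected values come straight from the test data above:
from spacepy.pybats import parse_filename_time

i_iter, runtime, date = parse_filename_time('y=0_mhd_2_t00001430_n00031073.out')
print(i_iter, runtime, date)  # 31073 870 None -> (iteration, runtime [s], datetime)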
def toYearFraction(date):
    """
    DISCONTINUED use dt2year_decimal instead (which is the same)
    """
    # Give the decimal year
    # source:
    # http://stackoverflow.com/questions/6451655/python-how-to-convert-datetime-dates-to-decimal-years
    from datetime import datetime as dt
    import time

    def sinceEpoch(date):  # returns seconds since epoch
        return time.mktime(date.timetuple())
    s = sinceEpoch

    year = date.year
    startOfThisYear = dt(year=year, month=1, day=1)
    startOfNextYear = dt(year=year + 1, month=1, day=1)

    yearElapsed = s(date) - s(startOfThisYear)
    yearDuration = s(startOfNextYear) - s(startOfThisYear)
    fraction = yearElapsed / yearDuration

    return date.year + fraction
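# Quick sanity check for toYearFraction; the exact fraction depends on
# the local timezone, since time.mktime works in local time:
from datetime import datetime as dt

print(toYearFraction(dt(2013, 7, 2)))  # roughly 2013.5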
def test_pruning(self):
    raw_data = pd.read_csv("./Input/mdata_5-1.txt")
    cols = raw_data.columns
    X = raw_data[cols[1:-1]]
    y = raw_data[cols[-1]]

    # pre-pruning: min_samples_leaf blocks splits that would create
    # leaves smaller than the threshold
    clf = dt(criterion="gain_ratio", min_samples_leaf=4)
    clf.fit(X, y)
    print(clf.tree)
    print(clf.num_leaf)

    clf = dt(criterion="gain_ratio", min_samples_leaf=3)
    clf.fit(X, y)
    print(clf.tree)
    print(clf.num_leaf)
def toYearFraction(date):
    import time

    def sinceEpoch(date):  # returns seconds since epoch
        return time.mktime(date.timetuple())
    s = sinceEpoch

    year = date.year
    startOfThisYear = dt(year=year, month=1, day=1)
    startOfNextYear = dt(year=year + 1, month=1, day=1)

    yearElapsed = s(date) - s(startOfThisYear)
    yearDuration = s(startOfNextYear) - s(startOfThisYear)
    fraction = yearElapsed / yearDuration

    return date.year + fraction
def test_q51(self):
    raw_data = pd.read_csv("./Input/data_5-1.txt")
    cols = raw_data.columns
    X = raw_data[cols[1:-1]]
    y = raw_data[cols[-1]]

    # criterion: gain_ratio
    clf = dt(criterion="gain_ratio")
    clf.fit(X, y)
    logger.info("gain_ratio")

    # keys are feature values from the Chinese dataset:
    # 有自己的房子 = "has own house", 有工作 = "has a job",
    # 是 = "yes", 否 = "no"
    rst = {
        '有自己的房子': {
            '否': {
                '有工作': {
                    '否': {'否': None},
                    '是': {'是': None}
                }
            },
            '是': {'是': None}
        }
    }
    self.assertEqual(rst, clf.tree)
    logger.info(clf.tree)
def test_q51():
    raw_data = pd.read_csv("./Input/data_5-1.txt")
    cols = raw_data.columns
    X = raw_data[cols[1:-1]]
    y = raw_data[cols[-1]]

    # criterion: gain_ratio
    clf = dt(criterion="gain_ratio")
    clf.fit(X, y)
    print("gain_ratio")

    # expected tree, kept for manual comparison against the printed output
    rst = {
        '有自己的房子': {
            '否': {
                '有工作': {
                    '否': {'否': None},
                    '是': {'是': None}
                }
            },
            '是': {'是': None}
        }
    }
    print(clf.tree)
def test_e54():
    raw_data = pd.read_csv("./Input/mdata_5-1.txt")
    cols = raw_data.columns
    X = raw_data[cols[1:-1]]
    y = raw_data[cols[-1]]

    clf = dt()
    clf.fit(X, y)
    print(clf.tree)
def test_e54(self):
    raw_data = pd.read_csv("./Input/mdata_5-1.txt")
    cols = raw_data.columns
    X = raw_data[cols[1:-1]]
    y = raw_data[cols[-1]]

    clf = dt()
    clf.fit(X, y)
    logger.info(clf.tree)
def temp_monthly():
    """Temperature observations for the top station for the last year"""
    session = Session(engine)

    # find last date in database from Measurements
    last_date = session.query(Measurement.date).order_by(
        Measurement.date.desc()).first().date

    # convert last date string to date
    last_date = dt.datetime.strptime(last_date, "%Y-%m-%d")

    # calculate date one year before last date using timedelta
    first_date = last_date - dt.timedelta(days=365)

    # list the stations and the counts in descending order
    station_counts = session.query(
        Measurement.station, func.count(Measurement.station)).\
        group_by(Measurement.station).\
        order_by(func.count(Measurement.station).desc()).all()

    # create top station variable from tuple
    top_station = station_counts[0]
    top_station = top_station[0]

    # calculate the lowest temperature, the highest temperature recorded,
    # and average temperature of the most active station
    session.query(func.min(Measurement.tobs), func.max(Measurement.tobs),
                  func.avg(Measurement.tobs)).\
        filter(Measurement.station == top_station).all()

    # query the last 12 months of temperature observation data for this
    # station
    top_station_year_obs = session.query(Measurement.tobs).\
        filter(Measurement.station == top_station).\
        filter(Measurement.date >= first_date).all()

    return jsonify(top_station_year_obs)


@app.route("/api/v1.0/temp/<start>")
@app.route("/api/v1.0/temp/<start>/<end>")
def stats(start=None, end=None):
    """Return Temperature Min, Temperature Avg, Temperature Max"""
    session = Session(engine)

    # select statement
    sel = [func.min(Measurement.tobs), func.avg(Measurement.tobs),
           func.max(Measurement.tobs)]

    if not end:
        # Temperature Min, Avg, Max for dates greater than start
        results = session.query(*sel).\
            filter(Measurement.date >= start).all()
        # ravel results into a 1D array and convert to a list
        temps = list(np.ravel(results))
        return jsonify(temps)

    # Temperature Min, Avg, Max between start and end
    results = session.query(*sel).\
        filter(Measurement.date >= start).\
        filter(Measurement.date <= end).all()
    return jsonify(results)


if __name__ == '__main__':
    app.run()
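# A hypothetical client-side check for the /api/v1.0/temp routes above,
# assuming the Flask app is running locally on the default port; the
# dates here are placeholders, not values from the dataset:
import requests

resp = requests.get('http://127.0.0.1:5000/api/v1.0/temp/2016-08-23/2017-08-23')
print(resp.json())  # e.g. [[tmin, tavg, tmax]]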
def test_predict(self):
    raw_data = pd.read_csv("./Input/mdata_5-1.txt")
    cols = raw_data.columns
    X = raw_data[cols[1:-1]]
    y = raw_data[cols[-1]]

    clf = dt(criterion="gain_ratio")
    clf.fit(X, y)

    rst = clf.predict(X[:1])
    self.assertEqual(rst, y[:1].values)
    print("predict: ", rst, "label: ", y[:1])
class TestParseFileTime(unittest.TestCase):
    '''
    Test the parse_filename_time function, which attempts to extract
    the datetime/runtime/run iteration from a standard SWMF file name.
    '''

    from datetime import datetime as dt

    files = ['mag_grid_e20130924-232600.out',
             'y=0_mhd_1_e20130924-220500-054.out',
             'y=0_mhd_2_t00001430_n00031073.out',
             'z=0_mhd_2_t00050000_n00249620.out']
    dates = [dt(2013, 9, 24, 23, 26, 0),
             dt(2013, 9, 24, 22, 5, 0),
             None, None]
    times = [None, None, 870, 18000]
    iters = [None, None, 31073, 249620]

    def testParse(self):
        from spacepy.pybats import parse_filename_time
        for f, d, t, i in zip(self.files, self.dates, self.times, self.iters):
            self.assertEqual(parse_filename_time(f), (i, t, d))
def parser(self, f, output=None):
    try:
        book = xlrd.open_workbook(f)
    except:
        logger.error('%s error' % f)
        return

    sh = book.sheet_by_index(0)
    if sh.name != u'账户对账单':  # "account statement"
        logger.error(f)

    # locate the statement section: rows between u'对帐单' ("statement")
    # and u'当日持仓清单' ("current-day position list")
    first_col_vals = [sh.cell_value(rowx=i, colx=0) for i in range(sh.nrows)]
    start = end = None
    for idx, val in enumerate(first_col_vals):
        if val == u'对帐单':
            start = idx
        if val == u'当日持仓清单':
            end = idx
        if start and end:
            break

    vals = []
    for row in xrange(start, end):
        vals.append([sh.cell_value(rowx=row, colx=c) for c in range(sh.ncols)])

    for row in filter(lambda x: u'20' in x[0], vals):
        symbol = row[3]
        transaction_type = row[1]
        price = float(row[7])
        vol = int(row[5])
        total = float(row[8])
        name = row[4]

        # timestamp, assuming a 'YYYY-MM-DD' date string; the month
        # occupies two characters
        yr = int(row[0][:4])
        m = int(row[0][5:7])
        d = int(row[0][-2:])
        executed_on = dt(year=yr, month=m, day=d)

        c = MyChenmin(executed_on=executed_on,
                      transaction_type=transaction_type,
                      symbol=symbol,
                      name=name,
                      price=price,
                      vol=vol,
                      total=total)
        c.save()

    logger.debug('%s done' % f)
def parser(self, symbol, val_list):
    logger.info('processing %s' % symbol)
    stock = MyStock.objects.get(symbol=symbol)

    records = []
    cnt = 0
    total = len(val_list)
    for vals in val_list:
        if len(vals) < 10:
            logger.error('wrong length %s' % ','.join(vals))

        exec_start = time.time()
        date_stamp = dt(year=int(vals[1][:4]),
                        month=int(vals[1][4:6]),
                        day=int(vals[1][-2:]))
        open_p = Decimal(vals[2])
        high_p = Decimal(vals[3])
        low_p = Decimal(vals[4])
        close_p = Decimal(vals[5])
        vol = Decimal(vals[6])
        amount = Decimal(vals[7]) * Decimal(10.0)
        adj = Decimal(vals[8])
        status = int(vals[9])

        h = MyStockHistorical(stock=stock,
                              date_stamp=date_stamp,
                              open_price=open_p,
                              high_price=high_p,
                              low_price=low_p,
                              close_price=close_p,
                              vol=vol,
                              amount=amount,
                              status=status,
                              # adjusted values
                              adj_open=open_p * adj,
                              adj_high=high_p * adj,
                              adj_low=low_p * adj,
                              adj_close=close_p * adj)
        records.append(h)

        if len(records) >= 1000:
            MyStockHistorical.objects.bulk_create(records)
            cnt += len(records)
            records = []
            logger.info('%s inserted %d/%d' % (symbol, cnt, total))

    if len(records):
        MyStockHistorical.objects.bulk_create(records)

    logger.info('%s elapse %f' % (symbol, time.time() - exec_start))
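# Note: Django's bulk_create can batch internally via batch_size, which
# would replace the manual 1000-record flushing above (a sketch, assuming
# a Django version that supports the parameter):
#
#     MyStockHistorical.objects.bulk_create(records, batch_size=1000)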
def precipitation():
    """Return a list of the dates and precipitation from the last year"""
    session = Session(engine)

    # query for the last date in the dataset
    last_date = session.query(Measurement.date).order_by(
        Measurement.date.desc()).first().date

    # convert last date string to date
    last_date = dt.datetime.strptime(last_date, "%Y-%m-%d")

    # calculate the date one year before the last date using timedelta
    first_date = last_date - dt.timedelta(days=365)

    # perform a query to retrieve the dates and precipitation scores
    last_year_data = session.query(Measurement.date, Measurement.prcp).\
        filter(Measurement.date >= first_date).all()

    return jsonify(last_year_data)
import dash_table  # not aliased: `dt` is reserved for datetime below
from datetime import datetime as dt


def resolve_entry_form(selections):
    for selection in selections:
        if selection is None:
            return
    return entry_layout


entry_layout = html.Div(children=[
    html.Div(html.Div(children=[
        html.P('Entry Date'),
        dcc.DatePickerSingle(id='entry-date-picker',
                             min_date_allowed=dt(2015, 1, 1),
                             max_date_allowed=dt.today(),
                             initial_visible_month=dt(dt.today().year,
                                                      dt.today().month, 1),
                             display_format='M/D/YYYY')
    ]), className='two columns offset-by-one column'),
    html.Div(html.Div(children=[
        html.P('Strike Price'),
        dcc.Input(id='entry-strike-price', type='number',
                  placeholder='Enter strike price...')
    ]), className='two columns'),
    html.Div(html.Div(children=[
        html.P('Ticker'),
def import_chenmin_csv():
    root = '/home/fengxia/Desktop/chenmin/alpha'
    for f in os.listdir(root):
        symbol, ext = os.path.splitext(os.path.basename(f))
        stock, created = MyStock.objects.get_or_create(symbol=symbol)
        his = [x.isoformat() for x in MyStockHistorical.objects.filter(
            stock=stock).values_list('date_stamp', flat=True)]

        records = []
        with open(os.path.join(root, f), 'rb') as csvfile:
            for cnt, vals in enumerate(csv.reader(csvfile)):
                if not vals:
                    continue  # handle blank lines

                # some time stamps are in the form "x/x/x"; normalize
                # to "x-x-x" format
                vals[0] = vals[0].replace('/', '-')

                if len(vals) != 6:
                    print 'error in %s' % symbol
                    print cnt, vals
                    raw_input()
                elif '-' not in vals[0]:
                    continue  # skip these title lines

                stamp = [int(v) for v in vals[0].split('-')]
                date_stamp = dt(year=stamp[0], month=stamp[1], day=stamp[2])
                if date_stamp.date().isoformat() in his:
                    continue  # we already have these
                else:
                    try:
                        open_p = Decimal(vals[1])
                    except:
                        open_p = Decimal(-1)
                    try:
                        high_p = Decimal(vals[2])
                    except:
                        high_p = Decimal(-1)
                    try:
                        low_p = Decimal(vals[3])
                    except:
                        low_p = Decimal(-1)
                    try:
                        close_p = Decimal(vals[4])
                    except:
                        close_p = Decimal(-1)
                    try:
                        vol = int(vals[5]) / 1000.0
                    except:
                        vol = -1
                    try:
                        # NOTE: rows validated above have exactly six
                        # fields (indices 0-5), so vals[6] always raises
                        # IndexError and adj_p falls back to -1
                        adj_p = Decimal(vals[6])
                    except:
                        adj_p = Decimal(-1)

                    h = MyStockHistorical(stock=stock,
                                          date_stamp=date_stamp,
                                          open_price=open_p,
                                          high_price=high_p,
                                          low_price=low_p,
                                          close_price=close_p,
                                          vol=vol,
                                          adj_close=adj_p)
                    records.append(h)

                if len(records) >= 1000:
                    MyStockHistorical.objects.bulk_create(records)
                    records = []

        if len(records):
            MyStockHistorical.objects.bulk_create(records)  # persist

        print '[%s] complete' % symbol
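# The repeated try/except fallbacks above could be collapsed into one
# helper; to_decimal is a name introduced here for illustration, not part
# of the original module:
from decimal import Decimal, InvalidOperation


def to_decimal(value, default=Decimal(-1)):
    """Parse a CSV field as a Decimal, falling back to a sentinel."""
    try:
        return Decimal(value)
    except (InvalidOperation, TypeError, ValueError):
        return default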
def get_business_above_series_c_or_ipo_merger_date(self, permalink,
                                                   company_js=None):
    label_date = None
    if not company_js:
        company_js = EntityParser.LoadJsonEntity('data/company/' + permalink)
    if company_js:
        deadpooled_year = company_js['deadpooled_year']
        deadpooled_month = company_js['deadpooled_month']
        if deadpooled_year and deadpooled_month:
            return dt(deadpooled_year, deadpooled_month, 1), 'Dead'

        funding_rounds = company_js.get('funding_rounds')
        last_funding_round = None
        if funding_rounds:
            for round in funding_rounds:
                if round['round_code'] not in ['angel', 'seed', 'a', 'b',
                                               'unattributed']:
                    round_year = round['funded_year']
                    round_month = round['funded_month']
                    round_day = round['funded_day']
                    try:
                        round_date = dt(round_year, round_month, round_day)
                    except:
                        continue
                    if label_date is not None:
                        if label_date > round_date:
                            label_date = round_date
                            last_funding_round = round['round_code']
                    else:
                        last_funding_round = round['round_code']
                        label_date = round_date
            if label_date is not None:
                return label_date, last_funding_round

        acquisition = company_js.get('acquisition', [])
        ipos = company_js.get('ipo', [])
        if acquisition and len(acquisition) > 1:
            acq_year = acquisition['acquired_year']
            acq_month = acquisition['acquired_month']
            acq_day = acquisition['acquired_day']
            try:
                acq_date = dt(acq_year, acq_month, acq_day)
                if label_date is not None:
                    if label_date > acq_date:
                        label_date = acq_date
                else:
                    label_date = acq_date
                if label_date is not None:
                    return label_date, 'acquired'
            except:
                pass

        if ipos:
            ipo_year = ipos['pub_year']
            ipo_month = ipos['pub_month']
            ipo_day = ipos['pub_day']
            try:
                ipo_date = dt(ipo_year, ipo_month, ipo_day)
                if label_date is not None:
                    if label_date > ipo_date:
                        label_date = ipo_date
                else:
                    label_date = ipo_date
                if label_date is not None:
                    return label_date, 'ipo'
            except:
                pass

    return label_date, None
def extract_company_feature(self, company_file):
    company_js = EntityParser.LoadJsonEntity(company_file)
    if company_js is None:
        return None

    features = self.get_feature_template()
    founded_year = company_js['founded_year']
    founded_month = company_js['founded_month'] if company_js[
        'founded_month'] else 1
    founded_day = company_js['founded_day'] if company_js[
        'founded_day'] else 1
    try:
        founded_date = dt(founded_year, founded_month, 1).date()
    except TypeError:
        return None

    funding_rounds = company_js['funding_rounds']
    total_funding = 0
    if funding_rounds:
        max_fund_code = 0
        max_funding_round = None
        for round in funding_rounds:
            code = round['round_code']
            raised_amt = round['raised_amount']
            if raised_amt:
                total_funding += raised_amt
            if round['round_code'] in funding_stages.keys():
                year = round['funded_year']
                month = round['funded_month']
                day = round['funded_day'] if round['funded_day'] else 1
                try:
                    fund_date = dt(year, month, day)
                except:
                    continue
                date_diff = relativedelta.relativedelta(fund_date,
                                                        founded_date)
                months_diff = (date_diff.years * 12) + date_diff.months
                if features[code + '_months']:
                    if features[code + '_months'] < months_diff:
                        features[code + '_months'] = months_diff
                else:
                    features[code + '_months'] = months_diff
                if features[code + '_raised_amount']:
                    if raised_amt:
                        features[code + '_raised_amount'] += raised_amt
                else:
                    features[code + '_raised_amount'] = raised_amt

    try:
        features['investment_per_funding_round'] = total_funding / len(
            funding_rounds)
    except ZeroDivisionError:
        features['investment_per_funding_round'] = 0

    company_website = company_js['homepage_url']
    features['no_of_employees'] = company_js['number_of_employees']
    features['com_domain'] = True if '.com' in company_website else False
    features['number_of_milestone'] = len(company_js.get('milestones', []))
    features['number_of_offices'] = len(company_js['offices'])
    features['category'] = company_js['category_code']
    features['number_of_providers'] = len(company_js['providerships'])
    features['number_of_competitors'] = len(company_js['competitions'])
    features['number_of_products'] = len(company_js['products'])
    features['number_of_funding_rounds'] = len(funding_rounds)
    features['number_of_investments'] = len(company_js['investments'])
    features['headquarter_location'] = self.find_company_headquarter(
        company_js)
    features['number_of_co_founder'] = self.get_num_of_co_founders(
        company_js)

    company_permalink = company_js['permalink']
    for competitor in company_js['competitions']:
        comp = competitor['competitor']
        permalink = comp['permalink']
        got_series_c = self.find_business_above_series_c_or_ipo_merger(
            permalink)
        if got_series_c:
            if features['number_of_competitors_got_series_c'] is None:
                features['number_of_competitors_got_series_c'] = 1
            else:
                features['number_of_competitors_got_series_c'] += 1

    features['label'] = self.find_business_above_series_c_or_ipo_merger(
        '', company_js)
    features['label_date'], features[
        'label_stage'] = self.get_business_above_series_c_or_ipo_merger_date(
        '', company_js)

    company_age = relativedelta.relativedelta(features['label_date'],
                                              founded_date)
    company_age_months = (company_age.years * 12) + company_age.months
    features['company_age_months'] = company_age_months
    features['name'] = company_js['name']
    features['permalink'] = company_js['permalink']
    features[
        'number_of_tech_crunch_article'] = self.get_tech_crunch_articles_count(
        company_permalink, features['label_date'])

    no_of_phd, no_of_financial, no_of_engineer, \
        no_of_companies_by_founder, \
        no_of_successful_company_by_founder = \
        self.get_team_background(company_js)

    features['number_of_financial_background'] = no_of_financial
    features['number_of_engineering_background'] = no_of_engineer
    features['number_of_phd'] = no_of_phd
    features['number_of_companies_by_founder'] = no_of_companies_by_founder
    features[
        'successful_companies_by_founder'] = no_of_successful_company_by_founder
    return features
def main():
    """
    *********************************************
    Extract and clean data from nyc open data
    *********************************************
    """
    APP_TOKEN = app_token()
    base_url = "https://data.cityofnewyork.us/resource/h9gi-nx95.json?$$app_token={}".format(
        APP_TOKEN)
    url = base_url + "{}"
    cnt_url = base_url + "{}{}"

    # select, where
    where_inj = "&$where=number_of_cyclist_injured>0.0&$limit=50000"
    where_kill = "&$where=number_of_cyclist_killed>0.0"
    inj_df = pd.read_json(url.format(where_inj))
    killed_df = pd.read_json(url.format(where_kill))

    def dt(date, time):
        date = pd.to_datetime(date).dt.date
        time = pd.to_datetime(time).dt.time
        return date, time

    # so frustrating. NYC open data changed columns from "accident" to "crash"
    killed_df.crash_date, killed_df.crash_time = dt(killed_df.crash_date,
                                                    killed_df.crash_time)
    inj_df.crash_date, inj_df.crash_time = dt(inj_df.crash_date,
                                              inj_df.crash_time)
    killed_df = killed_df.rename(columns={'crash_date': 'accident_date',
                                          'crash_time': 'accident_time'})
    inj_df = inj_df.rename(columns={'crash_date': 'accident_date',
                                    'crash_time': 'accident_time'})

    df = (pd.concat([inj_df, killed_df])
          .drop(columns='location')
          .drop_duplicates()
          .reset_index(drop=True))
    df.vehicle_type_code1 = df.vehicle_type_code1.apply(
        lambda x: str(x).upper())
    df.vehicle_type_code2 = df.vehicle_type_code2.apply(
        lambda x: str(x).upper())
    df['Accident Year'] = df.accident_date.apply(lambda x: x.year)
    df['Accident Month'] = df.accident_date.apply(lambda x: x.month)
    df['Accident Hour'] = df.accident_time.apply(lambda x: x.hour)

    def create_df(group):
        return (df.groupby(group).collision_id.count().reset_index().rename(
            columns={'collision_id': 'Number of Accidents'}))

    """
    *********************************************
    Create figures for month and hour data
    *********************************************
    """
    crash_mo_yr = create_df(['Accident Year', 'Accident Month'])
    crash_hr = create_df('Accident Hour')
    crash_mo_hr = create_df(['Accident Month', 'Accident Hour'])

    killed_df['accident_year'] = killed_df.accident_date.apply(
        lambda x: x.year)
    killed_df['accident_month'] = killed_df.accident_date.apply(
        lambda x: x.month)
    killed_df['accident_hr'] = killed_df.accident_time.apply(lambda x: x.hour)

    mo_fig = px.area(crash_mo_yr,
                     x="Accident Month",
                     y="Number of Accidents",
                     animation_frame="Accident Year",
                     range_y=[0, 800],
                     range_x=[1, 12])
    mo_fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 1000
    mo_fig.layout.title = "Bicycle Accidents by Month for Each Year"
    pio.write_html(mo_fig, file="app/static/mo_fig.html", auto_play=False)

    hr_fig = px.area(crash_mo_hr,
                     x="Accident Hour",
                     y="Number of Accidents",
                     animation_frame="Accident Month",
                     range_y=[0, 400],
                     range_x=[0, 23])
    hr_fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 1000
    hr_fig.layout.title = "Bicycle Accidents by Hour For Each Month"
    pio.write_html(hr_fig, file="app/static/hr_fig.html", auto_play=False)

    """
    *********************************************
    Extract data from citibike files - all trips
    *********************************************
    """
    fdir = './agg_trip'
    agg_files = os.listdir(fdir)
    agg_df = pd.read_csv(fdir + '/' + agg_files[0]).iloc[:, [0, 1]]
    for i in range(1, len(agg_files)):
        agg_df = agg_df.append(
            pd.read_csv(fdir + '/' + agg_files[i]).iloc[:, [0, 1]])
    agg_df.Date = pd.to_datetime(agg_df.Date).dt.date
    agg_df = agg_df.rename(columns={
        'Trips over the past 24-hours (midnight to 11:59pm)':
        'Number of Trips'
    })
    agg_df = agg_df.sort_values('Date')

    fig = px.line(agg_df,
                  x='Date',
                  y='Number of Trips',
                  title="Number of CitiBike Trips by Day",
                  hover_name='Date')
    pio.write_html(fig, file="app/static/fig.html", auto_play=False)

    """
    *********************************************
    Using 9/25/2019 to map common citibike routes
    *********************************************
    """
    high_day = pd.read_csv('./app/static/high_day.csv')
    coord092519 = high_day[[
        'start station name', 'start station id', 'start station latitude',
        'start station longitude', 'end station name', 'end station id',
        'end station latitude', 'end station longitude'
    ]].copy()
    coord092519['id'] = (coord092519['start station name'] +
                         coord092519['end station name'])
    coord092519 = coord092519.groupby([
        'start station name', 'start station id', 'start station latitude',
        'start station longitude', 'end station name', 'end station id',
        'end station latitude', 'end station longitude'
    ]).id.count().reset_index()
    coord092519['filt'] = coord092519.apply(
        lambda x: 'y'
        if x['start station name'] == x['end station name'] else '',
        axis=1)
    coord092519 = coord092519[coord092519.filt != 'y'].reset_index(drop=True)

    cohort = coord092519[coord092519.id >= 4]
    cohort = cohort.rename(columns={'id': 'count'})
    cohort['id'] = (cohort['start station id'].apply(str) + '-' +
                    cohort['end station id'].apply(str))

    routes = pd.read_csv('./app/static/backup_route_file.csv')
    routes = routes[
        routes.geojson != '{"message":"Too Many Requests"}'].reset_index(
        drop=True)
    cohort_df = pd.merge(cohort, routes[['id', 'geojson']],
                         on='id', how='inner')
    cohort_df = cohort_df[['geojson']].drop_duplicates()
    geojson = list(cohort_df.geojson)
    gjson = []
    for i in range(len(geojson)):
        gjson.append(
            json.loads(geojson[i])['routes'][0]['geometry']['coordinates'])
    for i in gjson:
        for j in i:
            j.reverse()

    """
    *********************************************
    mapping the accidents
    *********************************************
    """
    loc_df = df[[
        'borough', 'latitude', 'longitude', 'on_street_name',
        'off_street_name', 'accident_date'
    ]].copy()
    loc_df = loc_df[(pd.isna(loc_df.latitude) == False)
                    & (loc_df.latitude != 0) & (loc_df.longitude != 0)]
    loc_df.on_street_name = loc_df.on_street_name.str.strip()
    loc_df.off_street_name = loc_df.off_street_name.str.strip()
    loc_df.accident_date = loc_df.accident_date.apply(str)
    loc_df['lat_lon_list'] = loc_df.apply(lambda x: [x.longitude, x.latitude],
                                          axis=1)
    loc_df = loc_df.sort_values('accident_date').reset_index(drop=True)

    intersect_df = loc_df.copy()
    intersect_df['intersection'] = (intersect_df.on_street_name + ';' +
                                    intersect_df.off_street_name)
    intersect_df.intersection = intersect_df.intersection.apply(
        lambda x: ' & '.join(sorted(x.split(';')))
        if pd.isna(x) == False else x)
    dang_int = (intersect_df.groupby(
        ['borough', 'intersection'])['accident_date'].count().reset_index()
        .sort_values('accident_date', ascending=False)
        .rename(columns={'accident_date': 'Number of Bike Accidents'}))

    # For the table
    dang_int_viz = (dang_int[dang_int['Number of Bike Accidents'] >= 10]
                    .copy().reset_index(drop=True)
                    .rename(columns={'borough': 'Borough',
                                     'intersection': 'Intersection'}))
    for i in dang_int_viz.index:
        Crash(dang_int_viz.iloc[i].Borough,
              dang_int_viz.iloc[i].Intersection).create_map().save(
            'app/static/crash_maps/' + dang_int_viz.iloc[i].Borough +
            dang_int_viz.iloc[i].Intersection.replace(' ', '_') + '.html')
    dang_int_viz.Intersection = dang_int_viz.apply(
        lambda x: '<a href={} target="iframe_map">{}</a>'.format(
            '../static/crash_maps/' + x.Borough +
            x.Intersection.replace(' ', '_') + '.html', x.Intersection),
        axis=1)

    html = """<table border="1" class="dataframe">
    <thead>
        <tr style="text-align: right;">
            <th>Borough</th>
            <th>Intersection</th>
            <th>Number of Bike Accidents</th>
        </tr>
    </thead>
    <tbody>
    """
    for i in dang_int_viz.index:
        html = (html + '<tr><td>' + dang_int_viz.iloc[i].Borough +
                '</td><td>' + dang_int_viz.iloc[i].Intersection +
                '</td><td>' +
                str(dang_int_viz.iloc[i]['Number of Bike Accidents']) +
                '</td></tr>')
    html = html + "</tbody></table>"
    html = BeautifulSoup(html, "lxml")
    html.body.insert(
        0,
        BeautifulSoup('<link rel="stylesheet" href="/static/style.css">',
                      "lxml"))
    with open('app/static/crash_table.html', 'w') as f:
        f.write(str(html))

    lat_lon = intersect_df[['intersection', 'lat_lon_list']].copy()
    lat_lon.lat_lon_list = lat_lon.lat_lon_list.apply(
        lambda x: str(round(x[0], 5)) + ';' + str(round(x[1], 5)))
    lat_lon = lat_lon.drop_duplicates().reset_index(drop=True)
    lat_lon.lat_lon_list = lat_lon.lat_lon_list.apply(
        lambda x: [float(i) for i in x.split(';')])
    for i in lat_lon.index:
        lat_lon.lat_lon_list[i].reverse()

    dang_int = pd.merge(dang_int, lat_lon, on='intersection', how='left')
    dang_int.to_csv('app/static/dang_int.csv', index=False)
    dang_int_10 = (dang_int[(dang_int['Number of Bike Accidents'] >= 10)
                            & (dang_int['Number of Bike Accidents'] < 15)]
                   .reset_index(drop=True))
    dang_int_15 = (dang_int[(dang_int['Number of Bike Accidents'] >= 15)
                            & (dang_int['Number of Bike Accidents'] < 20)]
                   .reset_index(drop=True))
    dang_int_20 = (dang_int[dang_int['Number of Bike Accidents'] >= 20]
                   .reset_index(drop=True))

    features = [{
        'type': 'Feature',
        'geometry': {
            'type': 'MultiPoint',
            'coordinates': list(loc_df.lat_lon_list),
        },
        'properties': {
            'times': list(loc_df.accident_date),
            'icon': 'circle',
            'iconstyle': {
                'fillColor': 'red',
                'fillOpacity': 0.5,
                'stroke': 'false',
                'radius': 5
            },
            'style': {
                'weight': 0.5
            }
        }
    }]

    """
    *********************************************
    Getting the bike lanes and formatting the data
    *********************************************
    """
    bike_lanes = pd.read_json('./app/static/Bicycle Routes.geojson')
    bl_prot_json = []
    bl_stand_json = []
    for i in bike_lanes.index:
        if bike_lanes.iloc[i].features['properties']['facilitycl'] == 'I':
            for j in range(
                    len(bike_lanes.iloc[i].features['geometry']
                        ['coordinates'])):
                bl_prot_json.append(
                    bike_lanes.iloc[i].features['geometry']['coordinates'][j])
        else:
            for j in range(
                    len(bike_lanes.iloc[i].features['geometry']
                        ['coordinates'])):
                bl_stand_json.append(
                    bike_lanes.iloc[i].features['geometry']['coordinates'][j])
    for i in bl_prot_json:
        for j in i:
            j.reverse()
    for i in bl_stand_json:
        for j in i:
            j.reverse()

    """
    *********************************************
    Creating the map and interactive features
    *********************************************
    """
    nyc_map = folium.Map(location=[40.735, -73.95],
                         zoom_start=11.5,
                         tiles=None)
    folium.TileLayer('cartodbdark_matter', control=False).add_to(nyc_map)

    # Add bike lanes
    folium.PolyLine(bl_prot_json, weight=1, opacity=0.9,
                    color='lime').add_to(
        folium.FeatureGroup(name='Protected Bike Lanes').add_to(nyc_map))
    folium.PolyLine(bl_stand_json, weight=1, opacity=0.9,
                    color='yellow').add_to(
        folium.FeatureGroup(name='Non-Protected Bike Lanes').add_to(nyc_map))

    # Add citibike routes
    folium.PolyLine(gjson, weight=1, opacity=0.2).add_to(
        folium.FeatureGroup(name='Commonly Used Citibike Routes',
                            overlay=False).add_to(nyc_map))

    # Add dangerous intersections data
    over10 = folium.FeatureGroup(name='Intersections w/10-14 Accidents',
                                 overlay=False)
    for i in dang_int_10.index:
        over10.add_child(
            folium.Marker(
                dang_int_10.lat_lon_list[i],
                tooltip=(dang_int_10.intersection[i] + ':\t' +
                         str(dang_int_10['Number of Bike Accidents'][i]) +
                         ' Accidents'),
                icon=folium.Icon(color='red', prefix='fa',
                                 icon='fas fa-bicycle')))

    over15 = folium.FeatureGroup(name='Intersections w/15-19 Accidents',
                                 overlay=False)
    for i in dang_int_15.index:
        over15.add_child(
            folium.Marker(
                dang_int_15.lat_lon_list[i],
                tooltip=(dang_int_15.intersection[i] + ':\t' +
                         str(dang_int_15['Number of Bike Accidents'][i]) +
                         ' Accidents'),
                icon=folium.Icon(color='red', prefix='fa',
                                 icon='fas fa-bicycle')))

    over20 = folium.FeatureGroup(name='Intersections w/20 or More Accidents',
                                 overlay=False)
    for i in dang_int_20.index:
        over20.add_child(
            folium.Marker(
                dang_int_20.lat_lon_list[i],
                tooltip=(dang_int_20.intersection[i] + ':\t' +
                         str(dang_int_20['Number of Bike Accidents'][i]) +
                         ' Accidents'),
                icon=folium.Icon(color='red', prefix='fa',
                                 icon='fas fa-bicycle')))

    nyc_map.add_child(over10)
    nyc_map.add_child(over15)
    nyc_map.add_child(over20)

    plugins.TimestampedGeoJson(
        {
            'type': 'FeatureCollection',
            'features': features
        },
        period='P1M',
        add_last_point=True,
        auto_play=True,
        loop=False,
        max_speed=2,
        loop_button=True,
        date_options='YYYY-MM-DD',
        time_slider_drag_update=True,
        duration='P1M').add_to(nyc_map)
    folium.LayerControl().add_to(nyc_map)
    nyc_map.save('app/static/map_nyc.html')

    """
    *********************************************
    Bike crash causes
    *********************************************
    """
    # Decided not to use the below for now. Could use it in the future...
    bike_list = ['BIKE', 'BICYCLE', 'E-BIK', 'BICYCLE', 'BYCIC']
    cause_df = df[((pd.isna(df.contributing_factor_vehicle_3) == True)
                   & ((df.vehicle_type_code1.isin(bike_list) == True)
                      | (df.vehicle_type_code2.isin(bike_list) == True)))]
    cause_df = cause_df[(cause_df.vehicle_type_code1.isin(bike_list) == False)
                        | (cause_df.vehicle_type_code2.isin(bike_list) ==
                           False)]

    def bike_cause(x):
        if x.vehicle_type_code1 in bike_list:
            return x.contributing_factor_vehicle_1
        else:
            return x.contributing_factor_vehicle_2

    def veh_cause(x):
        if x.vehicle_type_code1 not in bike_list:
            return x.contributing_factor_vehicle_1
        else:
            return x.contributing_factor_vehicle_2

    cause_df['bike_cause'] = cause_df.apply(bike_cause, axis=1)
    cause_df['veh_cause'] = cause_df.apply(veh_cause, axis=1)

    # remove Unspecified from dataset. Not useful
    bike_cause_df = (cause_df.groupby('bike_cause').collision_id.count()
                     .reset_index()
                     .sort_values('collision_id', ascending=False)
                     .head(15).reset_index(drop=True))
    bike_cause_df = bike_cause_df[bike_cause_df.bike_cause != 'Unspecified']
    veh_cause_df = (cause_df.groupby('veh_cause').collision_id.count()
                    .reset_index()
                    .sort_values('collision_id', ascending=False)
                    .head(15).reset_index(drop=True))
    veh_cause_df = veh_cause_df[veh_cause_df.veh_cause != 'Unspecified']
        # filname_csv = (os.getcwd() + "\Accuracy_error_metrics.csv")
        # print(filename)
        print("S3 bucket successfully created")
        print("Model successfully uploaded to S3")
    except Exception as e:
        print(e)


from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
from datetime import timedelta
from datetime import datetime as dt  # needed for start_date below

default_args = {
    'owner': 'milony',
    'depends_on_past': False,
    'start_date': dt(2018, 4, 12),
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

dag = DAG('Assignment_3',
          default_args=default_args,
          schedule_interval='@once')

t0 = PythonOperator(task_id='performance_metrics',
                    python_callable=performance_metrics,
                    provide_context=True,
                    op_kwargs={'dataset': dataset},
                    dag=dag)
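# Hedged sanity check: simply importing the DAG file catches definition
# errors before deploying it (the filename here is hypothetical):
#
#     python assignment_3_dag.py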
def add_years(original_time, add_years):
    try:
        return original_time.replace(year=original_time.year + add_years)
    except ValueError:
        # Feb 29 has no counterpart in a non-leap target year, so fall
        # back to shifting by the full length of the intervening year(s)
        return original_time + (dt(original_time.year + add_years, 1, 1) -
                                dt(original_time.year, 1, 1))
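# Behavior sketch for add_years, including the Feb 29 fallback path:
from datetime import datetime as dt

print(add_years(dt(2015, 6, 1), 3))   # 2018-06-01 00:00:00
print(add_years(dt(2020, 2, 29), 1))  # 2021-03-01 00:00:00 (no Feb 29 in 2021)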
    className='element-table'),
    dcc.Interval(id='table-update', interval=2000, n_intervals=0),
]),

# History graph
html.Div(id='history-graph-container', children=[
    html.Br(),
    html.Label('Select date to display'),
    dcc.DatePickerSingle(id="date-input",
                         display_format="YYYY-M-D",
                         month_format='MMMM Y',
                         placeholder='MMMM Y',
                         date=dt(2018, 5, 24)),
    html.Label('Select hour to display'),
    dcc.Dropdown(id='hour-dropdown',
                 options=[{'label': '00 h.', 'value': 0},
                          {'label': '01 h.', 'value': 1},
                          {'label': '02 h.', 'value': 2},
                          {'label': '03 h.', 'value': 3},
                          {