def get_month_data_chunk(dtfrom, csv_file, mmsis='all', dts_high=3000000.0,
                         dts_low=0.0, ourlabel='all'):
    """Load one month of fishing points from a CSV export.

    Parameters
    ----------
    dtfrom : str
        Datetime string 'YYYY-MM-DDTHH:MM:SS...'; validated by
        solrEscapes.solr_escape_date.
    csv_file : str
        Path to the month's CSV file.
    mmsis : str
        Comma-separated MMSI list, or 'all' for no MMSI filter.
    dts_high, dts_low : float
        Inclusive bounds on the 'distshore_f' column.
    ourlabel : str
        Comma-separated integer labels, or 'all' for no label filter.

    Returns
    -------
    (dict, str)
        A dict with one key, 'total_month', mapping to an ndarray of
        [mmsi, longitude, latitude, distshore_f, ourlabel] rows (empty
        dict if the date fails validation), and a comma-separated string
        of the unique MMSIs that survived the filters.
    """
    chunksize = 2000000
    weekly_data = {}
    mmsi = ''
    # Escape and validate the date; bail out early on an invalid date.
    dtfrom = solrEscapes.solr_escape_date(dtfrom)
    if dtfrom == '':
        return weekly_data, mmsi
    # Escape the mmsis string before it is used in filters.
    mmsis = solrEscapes.solr_escape(mmsis)
    # Read the whole file (the chunked reader is concatenated back into one
    # frame) and keep only rows flagged as fishing.
    reader = pd.read_csv(csv_file, chunksize=chunksize)
    results = pd.concat(reader, ignore_index=True)
    results = results[results['fishing_f'] > 0]
    results.mmsi = results.mmsi.astype(str)
    # Filter by the distance-to-shore window, then optionally by MMSI.
    results = results[results['distshore_f'] >= dts_low]
    results = results[results['distshore_f'] <= dts_high]
    if mmsis != 'all':
        # str.split replaces the Python-2-only string.split helper.
        results = results[results['mmsi'].isin(mmsis.split(','))]
    # Comma-separated list of every MMSI that survived the filters.
    mmsi = ', '.join(str(x) for x in results.mmsi.unique())
    if ourlabel != 'all':
        # list(...) so the isin filter also works where map() returns an
        # iterator (Python 3).
        labels = list(map(int, ourlabel.split(',')))
        results = results[results['ourlabel'].isin(labels)]
    filtered = results[['mmsi', 'longitude', 'latitude', 'distshore_f',
                        'ourlabel']]
    # weekly_data is a fresh local dict, so 'total_month' can never already
    # be present; assign directly.  .values replaces the removed
    # DataFrame.as_matrix().
    weekly_data['total_month'] = filtered.values
    return weekly_data, mmsi
def _week_date_prefixes(dtfrom):
    # Build the 'YYYY-MM-DD' prefixes for the 7 days beginning at dtfrom's
    # day.  %02d zero-pads correctly (the original 'd > 10' test rendered
    # day 10 as '010').
    year, month, day_str = dtfrom.split('T')[0].split('-')
    start = int(day_str)
    return tuple('%s-%s-%02d' % (year, month, d)
                 for d in range(start, start + 7))


def _filter_latlon(results, lat1, lon1, lat2, lon2):
    # Keep points inside the box with SW corner (lat1, lon1) and NE corner
    # (lat2, lon2); bounds are inclusive (pandas' between() default).
    if lon1 > lon2:
        # Box crosses the antimeridian: split into an eastern box up to 180
        # and a western box from -180.  (The original filtered the western
        # half with between(180, lon2), lower > upper, which is always
        # empty.)
        east = results[results['longitude'].between(lon1, 180.0)]
        east = east[east['latitude'].between(lat1, lat2)]
        west = results[results['longitude'].between(-180.0, lon2)]
        west = west[west['latitude'].between(lat1, lat2)]
        return pd.concat([east, west])
    results = results[results['longitude'].between(lon1, lon2)]
    return results[results['latitude'].between(lat1, lat2)]


def ship_points_csv(dtfrom, csv_file, pos1, pos2, span='month', mmsis='all',
                    dts_high=3000000.0, dts_low=0.0):
    """Collect ship points from a monthly CSV, filtered by time span,
    lat/lon box, distance to shore and (optionally) MMSI.

    Parameters
    ----------
    dtfrom : str
        Datetime string 'YYYY-MM-DDTHH:MM:SS...'; validated by
        solrEscapes.solr_escape_date.
    csv_file : str
        Path to the month's CSV file.
    pos1, pos2 : str
        'lat,lon' strings for the SW and NE corners of the search box.
    span : str
        'month' keeps the whole file; 'week' keeps the 7 days starting at
        dtfrom's day.
    mmsis : str
        Comma-separated MMSI list, or 'all'.
    dts_high, dts_low : float
        Inclusive bounds on the 'distshore_f' column.

    Returns
    -------
    pandas.DataFrame
        Columns [mmsi, longitude, latitude, datetime, fishing_f]; an empty
        frame if the date fails validation or nothing matches.
    """
    chunksize = 2000000
    data = pd.DataFrame()
    # Escape and validate the date; bail out early on an invalid date.
    dtfrom = solrEscapes.solr_escape_date(dtfrom)
    if dtfrom == '':
        return data
    mmsis = solrEscapes.solr_escape(mmsis)
    # SW / NE corners as floats (str.split replaces the Python-2-only
    # string.split helper).
    lat1, lon1 = [float(v) for v in pos1.split(',')]
    lat2, lon2 = [float(v) for v in pos2.split(',')]
    if span == 'week':
        # NOTE(review): the original week loop never advanced its counter
        # or day window (infinite loop); it now selects the 7-day window
        # starting at dtfrom's day, matching get_week_data_chunks.
        dayt = _week_date_prefixes(dtfrom)
    reader = pd.read_csv(csv_file, chunksize=chunksize)
    for results in reader:
        if span == 'week':
            results = results[results['datetime'].apply(
                lambda s: s.startswith(dayt))]
        # Filter for the lat/lon area, handling dateline overlaps.
        results = _filter_latlon(results, lat1, lon1, lat2, lon2)
        results.mmsi = results.mmsi.astype(str)
        # Distance-to-shore window, then the optional MMSI filter.
        results = results[results['distshore_f'] >= dts_low]
        results = results[results['distshore_f'] <= dts_high]
        if mmsis != 'all':
            results = results[results['mmsi'].isin(mmsis.split(','))]
        data = pd.concat([data, results])
    if data.empty:
        # Column selection on an all-empty concat would raise KeyError.
        return data
    return data[['mmsi', 'longitude', 'latitude', 'datetime', 'fishing_f']]
def get_week_data_chunks(dtfrom, csv_file, mmsis='all', dts_high=3000000.0,
                         dts_low=0.0, ourlabel='all'):
    """Load fishing points for the 7 days beginning at dtfrom, reading the
    CSV in chunks.

    Parameters
    ----------
    dtfrom : str
        Datetime string 'YYYY-MM-DDTHH:MM:SS...'; validated by
        solrEscapes.solr_escape_date.  The week starts at this date's day.
    csv_file : str
        Path to the month's CSV file.
    mmsis : str
        Comma-separated MMSI list, or 'all' for no MMSI filter.
    dts_high, dts_low : float
        Inclusive bounds on the 'distshore_f' column.
    ourlabel : str
        Comma-separated integer labels, or 'all' for no label filter.

    Returns
    -------
    (dict, str)
        weekly_data maps the week's first 'YYYY-MM-DD' date to an ndarray
        of [mmsi, longitude, latitude, distshore_f, ourlabel] rows (empty
        dict if the date fails validation); mmsi is a comma-separated
        string of unique MMSIs.
    """
    chunksize = 2000000
    weekly_data = {}
    mmsi = ''
    # Escape and validate the date; bail out early on an invalid date.
    dtfrom = solrEscapes.solr_escape_date(dtfrom)
    if dtfrom == '':
        return weekly_data, mmsi
    mmsis = solrEscapes.solr_escape(mmsis)
    # Precompute the 7 'YYYY-MM-DD' prefixes for this week; loop-invariant,
    # so hoisted out of the chunk loop.  %02d zero-pads correctly (the
    # original 'd > 10' test rendered day 10 as '010').
    year, month, day_str = dtfrom.split('T')[0].split('-')
    start_day = int(day_str)
    dayt = tuple('%s-%s-%02d' % (year, month, d)
                 for d in range(start_day, start_day + 7))
    reader = pd.read_csv(csv_file, chunksize=chunksize)
    for results in reader:
        # Keep only rows flagged as fishing.
        results = results[results['fishing_f'] > 0]
        results.mmsi = results.mmsi.astype(str)
        # Distance-to-shore window, then the optional MMSI filter.
        results = results[results['distshore_f'] >= dts_low]
        results = results[results['distshore_f'] <= dts_high]
        if mmsis != 'all':
            # str.split replaces the Python-2-only string.split helper.
            results = results[results['mmsi'].isin(mmsis.split(','))]
        # NOTE(review): overwritten on every chunk, so only the last
        # chunk's MMSIs are reported (matches the original behaviour) —
        # confirm whether accumulation across chunks was intended.
        mmsi = ', '.join(str(x) for x in results.mmsi.unique())
        if ourlabel != 'all':
            # list(...) so the isin filter also works where map() returns
            # an iterator (Python 3).
            labels = list(map(int, ourlabel.split(',')))
            results = results[results['ourlabel'].isin(labels)]
        # Keep rows whose datetime falls on one of the week's days.
        filtered = results[results['datetime'].apply(
            lambda s: s.startswith(dayt))]
        filtered = filtered[['mmsi', 'longitude', 'latitude', 'distshore_f',
                             'ourlabel']]
        # Accumulate rows under the week's first date across chunks.
        # .values replaces the removed DataFrame.as_matrix().
        if dayt[0] in weekly_data:
            weekly_data[dayt[0]] = np.append(weekly_data[dayt[0]],
                                             filtered.values, axis=0)
        else:
            weekly_data[dayt[0]] = filtered.values
    return weekly_data, mmsi
    # Tail of a route-building function whose definition starts above this
    # view: drop placeholder ('-') and missing ('nan') labels, then emit
    # one route per remaining vessel.
    mmsi_data = mmsi_data[mmsi_data['ourlabel'] != '-']
    mmsi_data = mmsi_data[mmsi_data['ourlabel'] != 'nan']
    # print mmsi_data
    mmsi_list = mmsi_data.mmsi.unique()
    for m in mmsi_list:
        # All points for this vessel; m is a string, alldata stores ints.
        df = alldata[alldata['mmsi'] == int(m)]
        df = df[['latitude', 'longitude', 'datetime', 'fishing_f']]
        # NOTE(review): as_matrix() was removed in pandas 0.25 — this runs
        # only on the old pandas this Python 2 script was written against.
        routes[m] = df.as_matrix().tolist()
        # print routes[m]
    # Python 2 print statement: emit all routes as JSON on stdout.
    print json.dumps(routes)


if __name__ == "__main__":
    # CLI: dtfrom path mmsis dts_high dts_low span pos1 pos2
    dtfrom = solrEscapes.solr_escape_date(sys.argv[1])  # the start date
    path = sys.argv[2]  # the path to csv's
    mmsis = solrEscapes.solr_escape(
        sys.argv[3])  # mmsi's to search, 'all' if all
    dts_high = float(sys.argv[4])  # distance to shore high
    dts_low = float(sys.argv[5])  # distance to shore low
    span = sys.argv[6]  # 'month' or 'week' = one at a time
    pos1 = solrEscapes.solr_escape_latlon(sys.argv[7])
    pos2 = solrEscapes.solr_escape_latlon(sys.argv[8])
    # Monthly CSVs are named 'YYYY-MM.csv' under the given path.
    date = dtfrom.split('T')[0]
    csv_file = path + date.split('-')[0] + '-' + date.split('-')[1] + '.csv'
    # alldata = ship_points_csv(dtfrom, csv_file, pos1, pos2, span, mmsis, dts_high, dts_low)
    # ship_routes(alldata)