import sys
import json

import numpy as np
import pandas as pd

import solrEscapes


def get_month_data_chunk(dtfrom,
                         csv_file,
                         mmsis='all',
                         dts_high=3000000.0,
                         dts_low=0.0,
                         ourlabel='all'):
    chunksize = 2000000

    points = 0
    weekly_data = {}
    mmsi = ''

    # escape and validate the date, return nothing if invalid
    dtfrom = solrEscapes.solr_escape_date(dtfrom)
    if dtfrom == '':
        return weekly_data, mmsi

    # escape the mmsis string
    mmsis = solrEscapes.solr_escape(mmsis)

    # read the csv in chunks, then concatenate into one dataframe
    reader = pd.read_csv(csv_file, chunksize=chunksize)
    results = pd.concat(reader, ignore_index=True)

    # keep only rows flagged as fishing
    results = results[results['fishing_f'] > 0]
    results['mmsi'] = results['mmsi'].astype(str)

    # filter the dataframe by distance to shore and mmsis
    results = results[results['distshore_f'] >= dts_low]
    results = results[results['distshore_f'] <= dts_high]
    if mmsis != 'all':
        results = results[results['mmsi'].isin(mmsis.split(','))]

    points += len(results)

    # split up the date and get a month length; month_len is one past the
    # last day of the month (February treated as 28 days) and weeks is the
    # number of week buckets in the month
    year = dtfrom.split('T')[0].split('-')[0]
    month = dtfrom.split('T')[0].split('-')[1]
    month_len = 31
    weeks = 5
    if month in ['01', '03', '05', '07', '08', '10', '12']:
        month_len = 32
    elif month == '02':
        weeks = 4
        month_len = 29

    daycount = month_len - 1

    # results = pd.read_csv('m_20130701_0_3000_1504285051.4036.csv')
    # results = results[results['fishing_f'] > 0]

    mmsi_list = results.mmsi.unique()
    mmsi = ', '.join(str(x) for x in mmsi_list)
    # print len(mmsi)
    # print datetime.datetime.now()

    if ourlabel != 'all':
        ol = [int(x) for x in ourlabel.split(',')]
        results = results[results['ourlabel'].isin(ol)]

    filtered = results[[
        'mmsi', 'longitude', 'latitude', 'distshore_f', 'ourlabel'
    ]]
    if 'total_month' in weekly_data:
        weekly_data['total_month'] = np.append(weekly_data['total_month'],
                                               filtered.values,
                                               axis=0)
    else:
        weekly_data['total_month'] = filtered.values

    return weekly_data, mmsi
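
# Example usage sketch (file name and date below are illustrative, not taken
# from the original script). get_month_data_chunk returns a dict whose single
# 'total_month' entry is an array of
# [mmsi, longitude, latitude, distshore_f, ourlabel] rows, plus a
# comma-separated string of the matching MMSIs:
#
#     weekly_data, mmsi = get_month_data_chunk('2013-07-01T00:00:00Z',
#                                              '2013-07.csv',
#                                              dts_high=3000000.0,
#                                              dts_low=0.0)
#     month_points = weekly_data['total_month']
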
def ship_points_csv(dtfrom,
                    csv_file,
                    pos1,
                    pos2,
                    span='month',
                    mmsis='all',
                    dts_high=3000000.0,
                    dts_low=0.0):
    chunksize = 2000000

    points = 0
    data = pd.DataFrame()
    mmsi = ''

    position1 = pos1.split(',')
    position2 = pos2.split(',')

    # escape and validate the date, return nothing if invalid
    dtfrom = solrEscapes.solr_escape_date(dtfrom)
    if dtfrom == '':
        return data

    # escape the mmsis string
    mmsis = solrEscapes.solr_escape(mmsis)

    # read in the chunks and start filtering
    reader = pd.read_csv(csv_file, chunksize=chunksize)

    for results in reader:

        # results = results[results['fishing_f'] > 0]

        # datespan: for a weekly span, keep only the seven days starting at
        # the day of month given in dtfrom
        if span == 'week':

            year = dtfrom.split('T')[0].split('-')[0]
            month = dtfrom.split('T')[0].split('-')[1]
            start_day = int(dtfrom.split('T')[0].split('-')[2])
            end_day = start_day + 7

            day = ['%s-%s-%02d' % (year, month, d)
                   for d in range(start_day, end_day)]

            dayt = tuple(day)
            results = results[results['datetime'].apply(
                lambda s: s.startswith(dayt))]

        # filter for latlon area

        # account for dateline overlaps
        p1 = [float(i) for i in pos1.split(',')]
        p2 = [float(i) for i in pos2.split(',')]
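        # illustrative example (hypothetical values): pos1='10,170' and
        # pos2='20,-170' describe a box crossing the antimeridian, so it is
        # split below into the [170, 180] and [-180, -170] longitude ranges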
        if p1[1] > p2[1]:  # if the SW corner is further east than the NE
            box1_p1 = p1
            box1_p2 = [p2[0], 180.0]
            box2_p1 = [p1[0], -180.0]
            box2_p2 = p2

            r1 = results[results['longitude'].between(float(box1_p1[1]),
                                                      float(box1_p2[1]),
                                                      inclusive=True)]
            r1 = r1[r1['latitude'].between(float(box1_p1[0]),
                                           float(box1_p2[0]),
                                           inclusive=True)]

            r2 = results[results['longitude'].between(float(box2_p1[1]),
                                                      float(box2_p2[1]),
                                                      inclusive=True)]
            r2 = r2[r2['latitude'].between(float(box2_p1[0]),
                                           float(box2_p2[0]),
                                           inclusive=True)]

            results = pd.concat([r1, r2])

        else:
            results = results[results['longitude'].between(float(position1[1]),
                                                           float(position2[1]),
                                                           inclusive=True)]
            results = results[results['latitude'].between(float(position1[0]),
                                                          float(position2[0]),
                                                          inclusive=True)]

        results['mmsi'] = results['mmsi'].astype(str)

        # for some reason, getting these bad mmsi's, just remove them
        # results = results[results['mmsi'].str.len() > 8]

        # filter the dataframe by distance to shore and mmsis
        results = results[results['distshore_f'] >= dts_low]
        results = results[results['distshore_f'] <= dts_high]
        if mmsis != 'all':
            results = results[results['mmsi'].isin(mmsis.split(','))]

        points += len(results)

        data = pd.concat([data, results])

        # filtered = results[['mmsi', 'longitude', 'latitude', 'datetime', 'ourlabel', 'fishing_f']]
        # if len(data) <= 0:
        #     data = filtered.as_matrix()
        # else:
        #     data = np.append(data, filtered.as_matrix(), axis=0)

    data = data[['mmsi', 'longitude', 'latitude', 'datetime', 'fishing_f']]
    return data
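
# Example usage sketch (path, date, and bounding box are illustrative).
# ship_points_csv returns a dataframe of
# [mmsi, longitude, latitude, datetime, fishing_f] rows that fall inside the
# box whose south-west corner is pos1 and north-east corner is pos2, both
# given as 'lat,lon' strings:
#
#     points = ship_points_csv('2013-07-01T00:00:00Z', '2013-07.csv',
#                              '-10,160', '10,-170', span='month')
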
def get_week_data_chunks(dtfrom,
                         csv_file,
                         mmsis='all',
                         dts_high=3000000.0,
                         dts_low=0.0,
                         ourlabel='all'):
    chunksize = 2000000

    points = 0
    weekly_data = {}
    mmsi = ''

    # escape and validate the date, return nothing if invalid
    dtfrom = solrEscapes.solr_escape_date(dtfrom)
    if dtfrom == '':
        return weekly_data, mmsi

    # escape the mmsis string
    mmsis = solrEscapes.solr_escape(mmsis)

    # read in the chunks and start filtering
    reader = pd.read_csv(csv_file, chunksize=chunksize)

    for results in reader:

        # keep only rows flagged as fishing
        results = results[results['fishing_f'] > 0]

        results['mmsi'] = results['mmsi'].astype(str)

        # filter the dataframe by distance to shore and mmsis
        results = results[results['distshore_f'] >= dts_low]
        results = results[results['distshore_f'] <= dts_high]
        if mmsis != 'all':
            results = results[results['mmsi'].isin(mmsis.split(','))]

        points += len(results)

        # split up the date and get a month length
        year = dtfrom.split('T')[0].split('-')[0]
        month = dtfrom.split('T')[0].split('-')[1]
        month_len = 31
        weeks = 5
        if month in ['01', '03', '05', '07', '08', '10', '12']:
            month_len = 32
        elif month == '02':
            weeks = 4
            month_len = 29

        daycount = month_len - 1

        # results = pd.read_csv('m_20130701_0_3000_1504285051.4036.csv')
        # results = results[results['fishing_f'] > 0]

        mmsi_list = results.mmsi.unique()
        mmsi = ', '.join(str(x) for x in mmsi_list)
        # print len(mmsi)
        # print datetime.datetime.now()

        if ourlabel != 'all':
            ol = [int(x) for x in ourlabel.split(',')]
            results = results[results['ourlabel'].isin(ol)]

        # split up by week: keep the seven days starting at dtfrom's day
        start_day = int(dtfrom.split('T')[0].split('-')[2])
        end_day = start_day + 7

        day = ['%s-%s-%02d' % (year, month, d)
               for d in range(start_day, end_day)]

        dayt = tuple(day)
        filtered = results[results['datetime'].apply(
            lambda s: s.startswith(dayt))]
        filtered = filtered[[
            'mmsi', 'longitude', 'latitude', 'distshore_f', 'ourlabel'
        ]]

        if day[0] in weekly_data:
            weekly_data[day[0]] = np.append(weekly_data[day[0]],
                                            filtered.values,
                                            axis=0)
        else:
            weekly_data[day[0]] = filtered.values

        # filtered = results[['mmsi', 'longitude', 'latitude', 'distshore_f', 'ourlabel']]
        # if 'total_month' in weekly_data:
        #     weekly_data['total_month'] = np.append(weekly_data['total_month'], filtered.as_matrix(), axis=0)
        # else:
        #     weekly_data['total_month'] = filtered.as_matrix()

    return weekly_data, mmsi
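
# Example usage sketch (file name and date are illustrative). The returned
# dict is keyed by the first day of the requested week and holds an array of
# [mmsi, longitude, latitude, distshore_f, ourlabel] rows for that week:
#
#     weekly_data, mmsi = get_week_data_chunks('2013-07-08T00:00:00Z',
#                                              '2013-07.csv')
#     week_points = weekly_data['2013-07-08']
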
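# Fragment of ship_routes (see the commented call in __main__ below): for
# each MMSI it collects a list of [latitude, longitude, datetime, fishing_f]
# points and prints the whole mapping as JSON.
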
    mmsi_data = mmsi_data[mmsi_data['ourlabel'] != '-']
    mmsi_data = mmsi_data[mmsi_data['ourlabel'] != 'nan']
    # print mmsi_data
    mmsi_list = mmsi_data.mmsi.unique()

    for m in mmsi_list:
        df = alldata[alldata['mmsi'] == int(m)]
        df = df[['latitude', 'longitude', 'datetime', 'fishing_f']]
        routes[m] = df.values.tolist()
        # print routes[m]

    print(json.dumps(routes))


if __name__ == "__main__":
    dtfrom = solrEscapes.solr_escape_date(sys.argv[1])  # the start date
    path = sys.argv[2]  # the path to csv's
    mmsis = solrEscapes.solr_escape(
        sys.argv[3])  # mmsi's to search, 'all' if all
    dts_high = float(sys.argv[4])  # distance to shore high
    dts_low = float(sys.argv[5])  # distance to shore low
    span = sys.argv[6]  # time span to process: 'month' or 'week'
    pos1 = solrEscapes.solr_escape_latlon(sys.argv[7])
    pos2 = solrEscapes.solr_escape_latlon(sys.argv[8])
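
    # Illustrative invocation (the script name and argument values below are
    # hypothetical):
    #   python ship_points.py 2013-07-01T00:00:00Z ./csv/ all 3000000 0 \
    #       month -10,160 10,-170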

    date = dtfrom.split('T')[0]

    csv_file = path + date.split('-')[0] + '-' + date.split('-')[1] + '.csv'

    # alldata = ship_points_csv(dtfrom, csv_file, pos1, pos2, span, mmsis, dts_high, dts_low)
    # ship_routes(alldata)