Example #1
import os
import re
import time

import pandas as pd

import read_fwf  # assumed project-local fixed-width-file parser (not pandas.read_fwf)


def s3_data_acquire(AWS_ACCESS_KEY,
                    AWS_SECRET_KEY,
                    path_to_data,
                    path_to_fwf_file,  # NOTE: unused in this version
                    qc_file_name='qc_streaming.csv'):

    os.chdir(path_to_data)

    #For establishing connection, use access and secret keys sent by Valentina.

    #STEP 1: Access S3 and download relevant streaming data file
    from boto.s3.connection import S3Connection
    conn = S3Connection(AWS_ACCESS_KEY, AWS_SECRET_KEY)

    bucket = conn.get_bucket('paratransitdata')

    #list bucket contents:
    rs = bucket.list()

    #get list of streaming_data files for today's date
    file_ls = []
    for key in rs:
        if re.search("streaming_data/Schedules_" + time.strftime('%Y%m%d'),
                     key.name.encode('ascii')):
            file_ls.append(key.name.encode('ascii'))

    #select the most recent streaming_data file (guard against an empty listing):
    if not file_ls:
        raise IOError('no streaming_data files found for ' +
                      time.strftime('%Y/%m/%d'))
    data_key = bucket.get_key(file_ls[-1])
    move_to_me = 'real_time_data.csv'
    data_key.get_contents_to_filename(move_to_me)

    #STEP 2: convert the file from fixed-width format to tab-delimited
    # NOTE: column dtypes are unreliable here, hence the string comparisons below.
    data = read_fwf.read('real_time_data.csv')

    #STEP 3: QC this file a la the R QCing script
    data56 = data.loc[(data.ProviderId == '5') | (data.ProviderId == '6')].copy()
    rides = data56.Run.unique()
    data56.loc[:, 'Activity'] = data56.loc[:, 'Activity'].astype('int')

    #lat/lon constraints:
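    # (bounding box roughly covering Washington State)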
    upper_right = [49.020430, -116.998768]
    lower_left = [45.606961, -124.974842]
    minlat = lower_left[0]
    maxlat = upper_right[0]
    minlon = lower_left[1]
    maxlon = upper_right[1]

    #Write the cleaned-up data (remove any stale QC file first, since we append):
    if os.path.isfile(qc_file_name):
        os.remove(qc_file_name)
    ctr = 0
    for ride in rides:
        temp_ride = data56.loc[data56.Run == ride]

        flag = 1  #1 == good, 0 == eliminate.
        lats = temp_ride.LAT
        lons = temp_ride.LON
        #eliminate runs from roster that have bad lat/lon data:
        if (any(lats < minlat) | any(lats > maxlat) | any(lons < minlon)
                | any(lons > maxlon)):
            flag = 0
        #eliminate runs that don't move
        if (all(lats == lats.iloc[0]) | all(lons == lons.iloc[0])):
            flag = 0
        #eliminate runs that don't leave a garage and return to a garage
        if (temp_ride.Activity.iloc[0] != 4.) | (temp_ride.Activity.iloc[-1] !=
                                                 3.):
            flag = 0

        #drop the first column (ServiceDate in this data)
        temp_ride = temp_ride.drop(temp_ride.columns[0], axis=1)

        #write the header only for the first ride that passes QC
        if (ctr == 0) & (flag == 1):
            temp_ride.to_csv(qc_file_name, mode='a', header=True, index=False)
            ctr += 1
        elif flag == 1:
            temp_ride.to_csv(qc_file_name, mode='a', header=False, index=False)

    read_me = os.path.join(path_to_data, qc_file_name)

    return pd.read_csv(read_me)
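
A minimal calling sketch for the example above. The credential values and paths here are placeholders (none of them come from the original source), and the returned object is the QC'd DataFrame:

if __name__ == '__main__':
    #placeholder credentials and paths -- substitute real values
    df = s3_data_acquire(AWS_ACCESS_KEY='<access-key>',
                         AWS_SECRET_KEY='<secret-key>',
                         path_to_data='/tmp/paratransit/',
                         path_to_fwf_file=None,  #unused in this version
                         qc_file_name='qc_streaming.csv')
    print(df.shape)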
Example #2
# imports as in Example #1
def s3_data_acquire(AWS_ACCESS_KEY,
                    AWS_SECRET_KEY,
                    path_to_data,
                    qc_file_name='qc_streaming.csv'):
    '''
    Download today's streaming schedule file from S3, QC it, and return it as
    a DataFrame. For establishing the connection, use the access and secret
    keys sent by Valentina.
    '''
    if os.path.isfile(os.path.join(path_to_data, qc_file_name)):
        os.remove(os.path.join(path_to_data, qc_file_name))

    #STEP 1: Access S3 and download relevant streaming data file
    from boto.s3.connection import S3Connection
    conn = S3Connection(AWS_ACCESS_KEY, AWS_SECRET_KEY)

    bucket = conn.get_bucket('paratransitdata')

    #list bucket contents:
    rs = bucket.list()

    #get list of streaming_data files for today's date
    file_ls = []
    for key in rs:
        if re.search('streaming_data/Schedules_' + time.strftime('%Y%m%d'),
                     key.name):
            file_ls.append(key.name)

    if not file_ls:
        print('There are no files from ' + time.strftime('%Y/%m/%d') + '!')
        return -1

    #select the most recent streaming_data file (beware: the [-1] file could be zero bytes!)
    data_key = bucket.get_key(file_ls[-1])
    move_to_me = os.path.join(path_to_data, 'real_time_data.tsv')
    data_key.get_contents_to_filename(move_to_me)
    print('Saving {0} from S3 bucket.'.format(file_ls[-1]))

    #STEP 2: convert the file from fixed-width format to tab-delimited
    data = read_fwf.read(move_to_me)
    print('Successfully converted fwf file.')

    #STEP 3: QC this file a la the R QCing script

    data56 = data.loc[(data.ProviderId == 5.) | (data.ProviderId == 6.)].copy()
    rides = data56.Run.unique()
    data56.loc[:, 'Activity'] = data56.loc[:, 'Activity'].astype('int')

    #lat/lon constraints:
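    # (bounding box roughly covering Washington State)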
    upper_right = [49.020430, -116.998768]
    lower_left = [45.606961, -124.974842]
    minlat = lower_left[0]
    maxlat = upper_right[0]
    minlon = lower_left[1]
    maxlon = upper_right[1]

    #Write the cleaned up data:
    ctr = 0
    qc_file_name = os.path.join(path_to_data, qc_file_name)
    for ride in rides:
        temp_ride = data56.loc[data56.Run == ride]
        if 'ServiceDate' in temp_ride.columns:
            temp_ride = temp_ride.drop('ServiceDate', axis=1)

        flag = 1  #1 == good, 0 == eliminate.
        lats = temp_ride.LAT
        lons = temp_ride.LON
        #eliminate runs from roster that have bad lat/lon data:
        if (any(lats < minlat) | any(lats > maxlat) | any(lons < minlon)
                | any(lons > maxlon)):
            flag = 0
        #eliminate runs with just 2 rows of data:
        if temp_ride.shape[0] == 2:
            flag = 0
        #eliminate runs that don't move
        if (all(lats == lats.iloc[0]) | all(lons == lons.iloc[0])):
            flag = 0
        #eliminate runs that don't leave a garage and return to a garage
        if (temp_ride.Activity.iloc[0] != 4) | (temp_ride.Activity.iloc[-1] !=
                                                3):
            flag = 0

        if (ctr != 0) & (flag == 1):
            temp_ride.to_csv(qc_file_name, mode='a', header=False, index=False)
        if (ctr == 0) & (flag == 1):
            temp_ride.to_csv(qc_file_name, mode='a', header=True, index=False)
            ctr = 1

    #qc_file_name already includes path_to_data here; joining again would double the prefix
    ret = pd.read_csv(qc_file_name)

    #ensure a clean 0..n-1 integer index
    ret = ret.reset_index(drop=True)

    return ret
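
Both examples use the legacy boto package. Below is a minimal sketch of STEP 1 rewritten against boto3; the 'paratransitdata' bucket and the 'streaming_data/Schedules_YYYYMMDD' key prefix come from the examples above, while the function name and everything else are placeholders, not the author's code:

import time

import boto3


def download_latest_schedule(aws_access_key, aws_secret_key, dest_path):
    #hypothetical boto3 version of STEP 1 above
    s3 = boto3.client('s3',
                      aws_access_key_id=aws_access_key,
                      aws_secret_access_key=aws_secret_key)
    prefix = 'streaming_data/Schedules_' + time.strftime('%Y%m%d')
    resp = s3.list_objects_v2(Bucket='paratransitdata', Prefix=prefix)
    keys = [obj['Key'] for obj in resp.get('Contents', [])]
    if not keys:
        return None
    #keys are returned in ascending order, so [-1] is the most recent file
    s3.download_file('paratransitdata', keys[-1], dest_path)
    return keys[-1]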