def convert_file(fname, out_arr):
    """Parse a trip CSV and append one BikeRide per usable row to out_arr.

    Args:
        fname: Path of the CSV file. The columns read are 'Duration',
            'Start date', 'End date', 'Start Station', 'End Station' and
            'Subscription Type'.
        out_arr: Container with an ``append`` method; each converted ride
            is appended to it in file order.

    Rows for which ``find_station`` fails on either station are reported
    on stdout and skipped. The running count of converted rows is printed
    every 10000 rows.

    Raises:
        ValueError: If a date string matches none of the known formats, or
            a duration does not reduce to exactly three integer fields.
    """
    # Tried in this order; the first format that parses wins.
    date_formats = (
        "%m/%d/%y %H:%M",
        "%y-%m-%d %H:%M",
        "%Y-%m-%d %H:%M",
        "%m/%d/%Y %H:%M",
    )

    def get_date(item):
        """Return item parsed with the first matching entry of date_formats."""
        for fmt in date_formats[:-1]:
            try:
                return datetime.strptime(item, fmt)
            except ValueError:
                # Format mismatch: fall through to the next candidate.
                pass
        # Last candidate is not guarded so its error reaches the caller.
        return datetime.strptime(item, date_formats[-1])

    converted = 0
    # 'rU' is the Python 2 universal-newlines mode. The context manager
    # guarantees the handle is closed even when a row fails to parse.
    with open(fname, 'rU') as f:
        reader = read_csv(f, engine='c', header=0)
        for _, row in reader.iterrows():
            r = BikeRide()

            # Rewrite e.g. "0h 12m 34sec." / "0h 12m 34s" as "0:12:34".
            tm = row['Duration']
            tm = tm.replace('h ', ':').replace('m ', ':')
            tm = tm.replace('sec.', '').replace('s', '')
            h, m, s = map(int, tm.split(':'))
            r.duration = h * 60 * 60 + m * 60 + s

            r.start_date = get_date(row['Start date'])
            r.end_date = get_date(row['End date'])
            try:
                r.start_station = find_station(row['Start Station'])
                r.end_station = find_station(row['End Station'])
            except Exception:
                # Best effort: report the unresolved pair and drop the row.
                # KeyboardInterrupt/SystemExit are deliberately not caught.
                print('couldnt find %s or %s' % (row['Start Station'],
                                                 row['End Station']))
                continue
            r.subscribed = row['Subscription Type'] != 'Casual'
            out_arr.append(r)

            converted += 1
            if converted % 10000 == 0:
                print(converted)
# Example #2
0
# ijson lets us stream the file rather than open it all at once --- too big
dct = ijson.items(f, 'item')
i = 0        # records consumed from the stream so far (skipped + processed)
failed = 0   # records dropped because a station lookup failed

# Number of leading records to skip before processing starts.
# NOTE(review): presumably the resume point of an earlier run -- confirm.
starting = 2453922

# Discard the first `starting` records one at a time; a plain counter avoids
# building a multi-million element list with range() on Python 2.
while i < starting:
    next(dct)
    i += 1

for r in dct:
    i += 1
    if i % 10000 == 0:
        print(i)
    ride = BikeRide()
    ride.start_date = datetime.strptime(r['start date'], "%Y-%m-%dT%H:%M:%S")
    ride.end_date = datetime.strptime(r['end date'], "%Y-%m-%dT%H:%M:%S")
    ride.duration = r['duration']
    ride.subscribed = r['user type'] != 'casual'
    try:
        ride.start_station = find_station(r['start station'])
        ride.end_station = find_station(r['end station'])
    except Exception:
        # Keep the raw record for later inspection and move on.
        # KeyboardInterrupt/SystemExit are not caught, so Ctrl-C still
        # stops the run instead of being counted as a failed record.
        failed += 1
        print('fail number %s' % failed)
        failed_ls.append(r)
        continue
    ride_ls.append(ride)