def convert_file(fname, out_arr):
    """Read a CSV of bike rides and append one BikeRide per row to out_arr.

    Each row supplies the columns 'Duration', 'Start date', 'End date',
    'Start Station', 'End Station' and 'Subscription Type'.  Rows whose
    station lookup fails are reported on stdout and skipped.

    Args:
        fname: path of the CSV file to read.
        out_arr: list-like object; converted rides are appended in file order.

    Raises:
        ValueError: if a date matches none of the known formats, or the
            duration text does not reduce to three integer fields.
    """
    # Timestamp layouts seen in the input, tried in this order.
    date_formats = (
        "%m/%d/%y %H:%M",
        "%y-%m-%d %H:%M",
        "%Y-%m-%d %H:%M",
        "%m/%d/%Y %H:%M",
    )

    def get_date(item):
        """Parse item with the first matching layout from date_formats."""
        for fmt in date_formats[:-1]:
            try:
                return datetime.strptime(item, fmt)
            except ValueError:
                continue
        # Last candidate: let its error propagate so bad input is not hidden.
        return datetime.strptime(item, date_formats[-1])

    # read_csv loads the whole table eagerly, so the handle can be released
    # as soon as it returns; `with` also closes it if parsing raises.
    # NOTE: 'rU' is the Python 2 universal-newline mode (removed in 3.11).
    with open(fname, 'rU') as f:
        reader = read_csv(f, engine='c', header=0)

    converted = 0
    for _, row in reader.iterrows():
        r = BikeRide()

        # Strip the unit markers so the text becomes "H:M:S".
        tm = row['Duration']
        tm = (tm.replace('h ', ':')
                .replace('m ', ':')
                .replace('sec.', '')
                .replace('s', ''))
        h, m, s = map(int, tm.split(':'))
        r.duration = h * 60 * 60 + m * 60 + s

        r.start_date = get_date(row['Start date'])
        r.end_date = get_date(row['End date'])
        try:
            r.start_station = find_station(row['Start Station'])
            r.end_station = find_station(row['End Station'])
        except Exception:
            # Best effort: report the unresolved stations and drop the row.
            # Single-argument print keeps the output identical on Python 2
            # while remaining valid syntax on Python 3.
            print('couldnt find %s or %s' % (row['Start Station'],
                                             row['End Station']))
            continue
        r.subscribed = row['Subscription Type'] != 'Casual'
        out_arr.append(r)

        # Progress indicator: counts only rides that were actually appended.
        converted += 1
        if converted % 10000 == 0:
            print(converted)
def convert_file(fname, out_arr):
	"""Read a CSV of bike rides and append one BikeRide per row to out_arr.

	Each row supplies the columns 'Duration', 'Start date', 'End date',
	'Start Station', 'End Station' and 'Subscription Type'.  Rows whose
	station lookup fails are reported on stdout and skipped.

	Args:
		fname: path of the CSV file to read.
		out_arr: list-like object; converted rides are appended in file order.

	Raises:
		ValueError: if a date matches none of the known formats, or the
			duration text does not reduce to three integer fields.
	"""
	# Timestamp layouts seen in the input, tried in this order.
	date_formats = (
		"%m/%d/%y %H:%M",
		"%y-%m-%d %H:%M",
		"%Y-%m-%d %H:%M",
		"%m/%d/%Y %H:%M",
	)

	def get_date(item):
		"""Parse item with the first matching layout from date_formats."""
		for fmt in date_formats[:-1]:
			try:
				return datetime.strptime(item, fmt)
			except ValueError:
				continue
		# Last candidate: let its error propagate so bad input is not hidden.
		return datetime.strptime(item, date_formats[-1])

	# read_csv loads the whole table eagerly, so the handle can be released
	# as soon as it returns; `with` also closes it if parsing raises.
	# NOTE: 'rU' is the Python 2 universal-newline mode (removed in 3.11).
	with open(fname, 'rU') as f:
		reader = read_csv(f, engine='c', header=0)

	converted = 0
	for _, row in reader.iterrows():
		r = BikeRide()

		# Strip the unit markers so the text becomes "H:M:S".
		tm = row['Duration']
		tm = tm.replace('h ', ':').replace('m ', ':').replace('sec.', '').replace('s', '')
		h, m, s = map(int, tm.split(':'))
		r.duration = h * 60 * 60 + m * 60 + s

		r.start_date = get_date(row['Start date'])
		r.end_date = get_date(row['End date'])
		try:
			r.start_station = find_station(row['Start Station'])
			r.end_station = find_station(row['End Station'])
		except Exception:
			# Best effort: report the unresolved stations and drop the row.
			# Single-argument print keeps the output identical on Python 2
			# while remaining valid syntax on Python 3.
			print('couldnt find %s or %s' % (row['Start Station'], row['End Station']))
			continue
		r.subscribed = row['Subscription Type'] != 'Casual'
		out_arr.append(r)

		# Progress indicator: counts only rides that were actually appended.
		converted += 1
		if converted % 10000 == 0:
			print(converted)
Example #3
0
# Stream ride records out of the already-open file `f` and convert each one
# into a BikeRide, collecting successes in ride_ls and failures in failed_ls.
dct = ijson.items(f, 'item')
# ijson lets us stream the file rather than open it all at once --- too big
i = 0
failed = 0

# Number of leading records to discard before converting anything.
# NOTE(review): presumably the count already handled by an earlier run --
# confirm before reusing this value.
starting = 2453922

for j in range(starting):
    i += 1
    # Builtin next() works on both Python 2 and 3 iterators.
    next(dct)

for r in dct:
    i += 1
    if i % 10000 == 0:
        print(i)
    ride = BikeRide()
    st_dt = datetime.strptime(r['start date'], "%Y-%m-%dT%H:%M:%S")
    ed_dt = datetime.strptime(r['end date'], "%Y-%m-%dT%H:%M:%S")
    ride.start_date = st_dt
    ride.end_date = ed_dt
    ride.duration = r['duration']
    ride.subscribed = r['user type'] != 'casual'
    try:
        ride.start_station = find_station(r['start station'])
        ride.end_station = find_station(r['end station'])
    except Exception:
        # Keep the raw record for later inspection and move on; catching
        # Exception (not a bare except) leaves Ctrl-C able to stop the run.
        failed += 1
        print('fail number %s' % failed)
        failed_ls.append(r)
        continue
    ride_ls.append(ride)
	else:
		print 'couldn\'t find "%s"' % stat_n
		return min(stations_hs.values(), key=lambda x: leven_dist(x, stat_n))

# Stream the records of ride_data.json and build a BikeRide for each one.
# NOTE(review): no f.close() is visible in this part of the file -- confirm
# the handle is released somewhere.
f = open('ride_data.json', 'r')
dct = ijson.items(f, 'item')
# ijson lets us stream the file rather than open it all at once --- too big
i = 0  # records seen so far
failed = 0  # records whose date/duration/user-type handling raised

for r in dct:
	i += 1
	# Progress indicator every 10000 records.
	if i % 10000 == 0:
		print i

	ride = BikeRide()
	try:
		ride.start_station = find_station(r['start station'])
		ride.end_station = find_station(r['end station'])
	except:
		# A failed lookup is only reported; there is no `continue`, so the
		# record still goes through the date handling below.
		print 'failed on station', r['start station'], 'or', r['end station']
	try:
		st_dt = datetime.strptime(r['start date'], "%Y-%m-%dT%H:%M:%S")
		ed_dt = datetime.strptime(r['end date'], "%Y-%m-%dT%H:%M:%S")
		ride.start_date = st_dt
		ride.end_date = ed_dt
		ride.duration = r['duration']
		# Every user type other than 'casual' counts as subscribed.
		ride.subscribed = False if r['user type'] == 'casual' else True
		# Collection is currently disabled, so `ride` is not stored here.
		# ride_ls.append(ride)
	except :
		# Any error in the block above is swallowed and only counted.
		failed += 1