def dataToFiles(fileData, row=None):
    """
    Store a row of data in temporary files.  This allows easier shuffling.

    :param fileData: a dictionary of information to track the file usage.
        The 'numFiles' key should be set to the number of partition files
        that will be used.
    :param row: if not None, the string to store.
    """
    if 'files' not in fileData:
        # lazily create the partition files on first use
        numFiles = fileData.get('numFiles', 10)
        files = []
        for f in xrange(numFiles):
            (fd, filename) = tempfile.mkstemp('.tmp')
            os.close(fd)
            fptr = fileutil.OpenWithoutCaching(filename, 'wb')
            files.append({'name': filename, 'fptr': fptr})
        fileData['numFiles'] = numFiles
        fileData['files'] = files
        fileData['numRows'] = 0
    files = fileData['files']
    if row is not None:
        # send each row to a uniformly random partition file
        if fileData['numFiles'] > 1:
            fnum = random.randint(0, fileData['numFiles'] - 1)
        else:
            fnum = 0
        files[fnum]['fptr'].write(row.encode('utf8') + '\n')
        fileData['numRows'] += 1

def outputFiles(fileData, dptr, format='instagram'):
    """
    Based on a set of input files, shuffle each in turn and write it out.

    :param fileData: a dictionary of information that tracked file usage.
    :param dptr: the output file-like pointer.
    :param format: one of 'message', 'instagram', or 'json'.  'instagram'
        prefixes each row with a tab-separated numeric _id column,
        'message' writes the rows verbatim, and 'json' wraps the rows in a
        JSON array.
    """
    files = fileData['files']
    numRows = fileData['numRows']
    for f in files:
        f['fptr'].close()
    starttime = time.time()
    id = 0
    for f in files:
        # load one partition at a time so only a fraction of the data is
        # ever held and shuffled in memory
        data = fileutil.OpenWithoutCaching(
            f['name'], 'rb').read().strip().split('\n')
        os.unlink(f['name'])
        random.shuffle(data)
        for line in data:
            if format == 'instagram':
                dptr.write('%d\t%s\n' % (id, line))
            elif format == 'message':
                dptr.write('%s\n' % line)
            else:
                dptr.write(',' if id else '[\n')
                dptr.write('%s\n' % line)
            id += 1
            if not id % 1000:
                # periodic progress report with a rough time-left estimate
                elapsed = time.time() - starttime
                if elapsed:
                    left = (numRows - id) * (elapsed / id)
                else:
                    left = 0
                sys.stderr.write('%d/%d %3.1fs %3.1fs left \r' % (
                    id, numRows, elapsed, left))
                sys.stderr.flush()
        data = None
    if format == 'json':
        dptr.write(']\n')

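# The two helpers above implement an external shuffle: rows are scattered
# uniformly across several temporary partition files, and each partition is
# then shuffled independently in memory.  Below is a minimal, self-contained
# sketch of that pattern, using plain open() in place of the repo's
# fileutil.OpenWithoutCaching; shuffle_rows and its arguments are
# illustrative, not part of this module.
import os
import random
import tempfile


def shuffle_rows(rows, numFiles=4):
    # scatter phase: write each row to a random partition file
    files = []
    for _ in range(numFiles):
        fd, name = tempfile.mkstemp('.tmp')
        os.close(fd)
        files.append({'name': name, 'fptr': open(name, 'wb')})
    for row in rows:
        files[random.randint(0, numFiles - 1)]['fptr'].write(
            row.encode('utf8') + b'\n')
    # gather phase: shuffle one partition at a time, so peak memory use is
    # roughly the size of one partition rather than the whole data set
    shuffled = []
    for f in files:
        f['fptr'].close()
        data = open(f['name'], 'rb').read().split(b'\n')
        os.unlink(f['name'])
        data = [line for line in data if line]
        random.shuffle(data)
        shuffled.extend(data)
    return shuffled
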
def dataToFiles(fileData, opts={}, row=None):
    """
    Store a row of data in temporary files.  This allows importing the data
    using mongoimport when we are done, and also will distribute the data
    across multiple small files for quicker random shuffling if desired.

    :param fileData: a dictionary of information to track the file usage.
        The 'count' key should be set to the approximate expected number of
        rows to facilitate shuffling.
    :param opts: general command-line options.
    :param row: if not None, the python document to store.
    """
    if 'numFiles' not in fileData:
        if 'count' not in fileData or not opts.get('shuffle', False):
            numFiles = 1
        else:
            # assume the first row is representative of the data size
            if row:
                rowLen = len(json.dumps(row)) + 1
            else:
                rowLen = 256
            # aim for partition files of roughly 256 MB each
            roughFileSize = 256 * 1024 * 1024
            numFiles = int(fileData['count'] * rowLen / roughFileSize) + 1
        files = []
        for f in xrange(numFiles):
            (fd, filename) = tempfile.mkstemp('.json')
            os.close(fd)
            fptr = fileutil.OpenWithoutCaching(filename, 'wb')
            files.append({'name': filename, 'fptr': fptr})
        fileData['numFiles'] = numFiles
        fileData['files'] = files
        fileData['numRows'] = 0
    files = fileData['files']
    if row is not None:
        if fileData['numFiles'] > 1:
            fnum = random.randint(0, fileData['numFiles'] - 1)
        else:
            fnum = 0
        files[fnum]['fptr'].write(json.dumps(row) + '\n')
        fileData['numRows'] += 1

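# Worked example of the sizing heuristic above (illustrative numbers): with
# fileData['count'] = 170 million rows and a 256-byte sample row, the data
# is roughly 170e6 * 256 = ~43.5 GB, so at ~256 MB per partition file
#     numFiles = int(170e6 * 256 / (256 * 1024 * 1024)) + 1 = 163
# and each partition stays small enough to shuffle in memory.
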
def importFiles(fileData, opts={}, destDB=None):
    """
    Given data stored in temporary files, import the data into mongo.  If
    the data should be shuffled, shuffle it first.

    :param fileData: a dictionary of information that tracked file usage.
    :param opts: command line options.
    :param destDB: the name of the destination database.
    """
    global DBName
    if destDB:
        DBName = destDB
    else:
        destDB = DBName
    files = fileData['files']
    numRows = fileData['numRows']
    for f in files:
        f['fptr'].close()
    starttime = time.time()
    if opts.get('shuffle', False):
        # merge the partition files into one combined file, shuffling each
        # partition in memory as it is read
        (fd, filename) = tempfile.mkstemp('combined.json')
        print filename
        os.close(fd)
        fptr = fileutil.OpenWithoutCaching(filename, 'wb')
        processed = 0
        for f in files:
            data = fileutil.OpenWithoutCaching(
                f['name']).read().strip().split('\n')
            os.unlink(f['name'])
            random.shuffle(data)
            for line in data:
                fptr.write(line + '\n')
                processed += 1
                if not processed % 1000:
                    elapsed = time.time() - starttime
                    if elapsed:
                        left = (numRows - processed) * (elapsed / processed)
                    else:
                        left = 0
                    sys.stdout.write('%d/%d %3.1fs %3.1fs left \r' % (
                        processed, numRows, elapsed, left))
                    sys.stdout.flush()
            data = None
        fptr.close()
    else:
        filename = files[0]['name']
    if not destDB:
        return
    client = getDbConnection(destDB)
    database = client.get_default_database()
    destColl = database['trips']
    # drop the existing collection and its indexes before importing
    destColl.drop()
    destColl.drop_indexes()
    if not opts.get('dropIndex', False):
        indexTrips(opts)
    destColl = None
    cmd = [
        'mongoimport', '--db=' + destDB, '--collection=trips',
        '--file=' + filename, '--drop'
    ]
    subprocess.call(cmd)
    if not opts.get('keepFiles', False):
        os.unlink(filename)
    sys.stdout.write('Imported %3.1fs\n' % (time.time() - starttime, ))
    if opts.get('endIndex', False):
        starttime = time.time()
        sys.stdout.write('Indexing\n')
        indexTrips(opts)
        sys.stdout.write('Indexed %3.1fs\n' % (time.time() - starttime, ))

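# A hypothetical driver loop showing how dataToFiles and importFiles fit
# together; loadRows, rowSource, and the 'taxi' database name are
# illustrative assumptions, not part of this module.
def loadRows(rowSource, opts):
    # 'count' lets dataToFiles pick a sensible number of partition files
    fileData = {'count': opts.get('count', 1000000)}
    for row in rowSource:
        dataToFiles(fileData, opts, row)
    # close the partitions, optionally shuffle, and hand off to mongoimport
    importFiles(fileData, opts, destDB='taxi')
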
    'dropoff_latitude': 'dy',
    'payment_type': 'ty',
    'fare_amount': 'f',
    'surcharge': 'sr',
    'mta_tax': 'tx',
    'tip_amount': 'tp',
    'tolls_amount': 'tl',
    'total_amount': 't',
}

if len(sys.argv) != 3 or '--help' in sys.argv:
    print """Convert JSON load file for Mongo to a load file for Postgres.

Syntax:  load_pg.py (existing json file) (target pg file)"""
    sys.exit()

dptr = fileutil.OpenWithoutCaching(sys.argv[2], 'wb')
dptr.write("""DROP TABLE trips;
CREATE TABLE trips (
    medallion text,
    hack_license text,
    vendor_id text,
    rate_code int,
    store_and_fwd_flag text,
    pickup_datetime int,
    dropoff_datetime int,
    passenger_count int,
    trip_time_in_secs int,
    trip_distance real,
    pickup_longitude double precision,
    pickup_latitude double precision,