Example 1
import os
import random
import tempfile

import fileutil  # project-local helper; provides OpenWithoutCaching


def dataToFiles(fileData, row=None):
    """
    Store a row of data in temporary files.  This allows easier shuffling.

    :param fileData: a dictionary of information to track the file usage.  The
                     'numFiles' key may be set to the number of partition
                     files to use (defaults to 10).
    :param row: if not None, the string to store.
    """
    if 'files' not in fileData:
        numFiles = fileData.get('numFiles', 10)
        files = []
        for f in xrange(numFiles):
            (fd, filename) = tempfile.mkstemp('.tmp')
            os.close(fd)
            fptr = fileutil.OpenWithoutCaching(filename, 'wb')
            files.append({'name': filename, 'fptr': fptr})
        fileData['numFiles'] = numFiles
        fileData['files'] = files
        fileData['numRows'] = 0
    files = fileData['files']
    if row is not None:
        if fileData['numFiles'] > 1:
            fnum = random.randint(0, fileData['numFiles'] - 1)
        else:
            fnum = 0
        files[fnum]['fptr'].write(row.encode('utf8') + '\n')
        fileData['numRows'] += 1
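A minimal driver sketch for the function above (the file count and row source are illustrative, not from the original module): the tracking dictionary is created once and reused, so the partition files are only opened on the first call.

fileData = {'numFiles': 4}
for n in xrange(1000):
    # stand-in rows; any unicode string works
    dataToFiles(fileData, row=u'row %d' % n)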
Example 2
import os
import random
import sys
import time

import fileutil  # project-local helper; provides OpenWithoutCaching


def outputFiles(fileData, dptr, format='instagram'):
    """
    Based on a set of input files, shuffle each in turn and write it out.

    :param fileData: a dictionary of information that tracked file usage.
    :param dptr: the output file-like pointer.
    :param format: one of 'instagram', 'message', or 'json'.  This affects
                   whether an _id column is added to the data and whether the
                   output is wrapped as a JSON array.
    """
    files = fileData['files']
    numRows = fileData['numRows']
    for f in files:
        f['fptr'].close()
    starttime = time.time()
    id = 0  # running row index across all partition files
    for f in files:
        data = fileutil.OpenWithoutCaching(
            f['name'], 'rb').read().strip().split('\n')
        os.unlink(f['name'])
        random.shuffle(data)
        for line in data:
            if format == 'instagram':
                dptr.write('%d\t%s\n' % (id, line))
            elif format == 'message':
                dptr.write('%s\n' % (line))
            else:
                # 'json' format: open the array before the first row; later
                # rows are preceded by commas.
                dptr.write(',' if id else '[\n')
                dptr.write('%s\n' % (line))
            id += 1
            if not id % 1000:
                elapsed = time.time() - starttime
                if elapsed:
                    left = (numRows - id) * (elapsed / id)
                else:
                    left = 0
                sys.stderr.write('%d/%d %3.1fs  %3.1fs left  \r' % (
                    id, numRows, elapsed, left))
                sys.stderr.flush()
        data = None  # release this partition's rows before the next file
    if format == 'json':
        dptr.write(']\n')
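A sketch of how this pairs with dataToFiles from the previous example (the output path and row source are assumptions): each partition is shuffled independently, approximating a full shuffle without ever holding all rows in memory at once.

fileData = {'numFiles': 4}
for n in xrange(1000):
    dataToFiles(fileData, row=u'row %d' % n)
with open('shuffled.txt', 'wb') as dptr:
    outputFiles(fileData, dptr, format='message')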
Example 3
import json
import os
import random
import tempfile

import fileutil  # project-local helper; provides OpenWithoutCaching


def dataToFiles(fileData, opts={}, row=None):
    """
    Store a row of data in temporary files.  This allows importing the data
    using mongoimport when we are done, and also will distribute the data
    across multiple small files for quicker random shuffling if desired.

    :param fileData: a dictionary of information to track the file usage.  The
                     'count' key should be set to the approximate expected
                     number of rows to facilitate shuffling.
    :param opts: general command-line options.
    :param row: if not None, the python document to store.
    """
    if 'numFiles' not in fileData:
        if 'count' not in fileData or not opts.get('shuffle', False):
            numFiles = 1
        else:
            # assume the first row is representative for data size
            if row:
                rowLen = len(json.dumps(row)) + 1
            else:
                rowLen = 256
            roughFileSize = 256 * 1024 * 1024
            numFiles = int(fileData['count'] * rowLen / roughFileSize) + 1
        files = []
        for f in xrange(numFiles):
            (fd, filename) = tempfile.mkstemp('.json')
            os.close(fd)
            fptr = fileutil.OpenWithoutCaching(filename, 'wb')
            files.append({'name': filename, 'fptr': fptr})
        fileData['numFiles'] = numFiles
        fileData['files'] = files
        fileData['numRows'] = 0
    files = fileData['files']
    if row is not None:
        if fileData['numFiles'] > 1:
            fnum = random.randint(0, fileData['numFiles'] - 1)
        else:
            fnum = 0
        files[fnum]['fptr'].write(json.dumps(row) + '\n')
        fileData['numRows'] += 1
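A quick check of the sizing heuristic above (the row count is illustrative): ten million rows at the 256-byte fallback size is roughly 2.4 GB, which the formula splits into ten partitions of about 256 MB each, small enough to shuffle individually in memory.

count = 10 * 1000 * 1000           # illustrative expected row count
rowLen = 256                       # fallback estimate used by the code
roughFileSize = 256 * 1024 * 1024
print(int(count * rowLen / roughFileSize) + 1)  # -> 10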
Example 4
import os
import random
import subprocess
import sys
import tempfile
import time

import fileutil  # project-local helper; provides OpenWithoutCaching

# DBName, getDbConnection, and indexTrips are defined elsewhere in the
# source module.


def importFiles(fileData, opts={}, destDB=None):
    """
    Given data stored in temporary files, import the data into mongo.  If the
    data should be shuffled, shuffle it first.

    :param fileData: a dictionary of information that tracked file usage.
    :param opts: command line options.
    :param destDB: the name of the destination database.
    """
    global DBName
    if destDB:
        DBName = destDB
    else:
        destDB = DBName

    files = fileData['files']
    numRows = fileData['numRows']
    for f in files:
        f['fptr'].close()
    starttime = time.time()
    if opts.get('shuffle', False):
        (fd, filename) = tempfile.mkstemp('combined.json')
        print filename
        os.close(fd)
        fptr = fileutil.OpenWithoutCaching(filename, 'wb')
        processed = 0
        for f in files:
            data = fileutil.OpenWithoutCaching(
                f['name']).read().strip().split('\n')
            os.unlink(f['name'])
            random.shuffle(data)
            for line in data:
                fptr.write(line + '\n')
                processed += 1
                if not processed % 1000:
                    elapsed = time.time() - starttime
                    if elapsed:
                        left = (numRows - processed) * (elapsed / processed)
                    else:
                        left = 0
                    sys.stdout.write('%d/%d %3.1fs  %3.1fs left  \r' %
                                     (processed, numRows, elapsed, left))
                    sys.stdout.flush()
            data = None  # release this partition's rows before the next file
        fptr.close()
    else:
        filename = files[0]['name']

    if not destDB:
        return
    client = getDbConnection(destDB)
    database = client.get_default_database()
    destColl = database['trips']
    destColl.drop()
    destColl.drop_indexes()
    if not opts.get('dropIndex', False):
        indexTrips(opts)
    destColl = None
    cmd = [
        'mongoimport', '--db=' + destDB, '--collection=trips',
        '--file=' + filename, '--drop'
    ]
    subprocess.call(cmd)
    if not opts.get('keepFiles', False):
        os.unlink(filename)
    sys.stdout.write('Imported %3.1fs\n' % (time.time() - starttime, ))

    if opts.get('endIndex', False):
        starttime = time.time()
        sys.stdout.write('Indexing\n')
        indexTrips(opts)
        sys.stdout.write('Indexed %3.1fs\n' % (time.time() - starttime, ))
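A sketch of the full pipeline for this variant (the documents, option values, and database name are stand-ins; the option keys match the opts.get lookups in the code). It assumes a reachable MongoDB instance and the module's own getDbConnection and indexTrips helpers.

fileData = {'count': 3}
opts = {'shuffle': True, 'keepFiles': False, 'endIndex': False}
for doc in [{'a': 1}, {'a': 2}, {'a': 3}]:  # stand-in documents
    dataToFiles(fileData, opts, row=doc)
importFiles(fileData, opts, destDB='taxi')  # 'taxi' is an illustrative name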
Example 5
    'dropoff_latitude': 'dy',
    'payment_type': 'ty',
    'fare_amount': 'f',
    'surcharge': 'sr',
    'mta_tax': 'tx',
    'tip_amount': 'tp',
    'tolls_amount': 'tl',
    'total_amount': 't',
}

if len(sys.argv) != 3 or '--help' in sys.argv:
    print """Convert JSON load file for Mongo to a load file for Postgres.

Syntax: load_pg.py (existing json file) (target pg file)"""
    sys.exit()
dptr = fileutil.OpenWithoutCaching(sys.argv[2], 'wb')
dptr.write("""DROP TABLE trips;

CREATE TABLE trips (
    medallion text,
    hack_license text,
    vendor_id text,
    rate_code int,
    store_and_fwd_flag text,
    pickup_datetime int,
    dropoff_datetime int,
    passenger_count int,
    trip_time_in_secs int,
    trip_distance real,
    pickup_longitude double precision,
    pickup_latitude double precision,