Ejemplo n.º 1
0
def deleteGCSobj ( gcsPath ):
    """Delete the object named by ``gcsPath`` from Google Cloud Storage.

    ``gcsPath`` is split by ``splitGCSpath`` into a bucket name and an
    object name, and the object is deleted through a freshly opened boto
    ``GSConnection``.

    Raises:
        ValueError: if the object name is not found in the bucket.

    The connection is closed on every exit path, including a failed bucket
    lookup and the "object not found" error.
    """
    ( bucketName, oName ) = splitGCSpath ( gcsPath )
    connection = boto.gs.connection.GSConnection()
    try:
        bucket = connection.get_bucket(bucketName)
        if oName not in bucket:
            raise ValueError('did not find %s in %s' % (oName, bucketName))
        key = boto.gs.key.Key(bucket, oName)
        try:
            key.delete()
        finally:
            # Nested so that a failure while closing the key cannot prevent
            # the connection from being closed.
            key.close()
    finally:
        connection.close()
Ejemplo n.º 2
0
def deleteGCSobj ( gcsPath ):
    """Delete the object named by ``gcsPath`` from Google Cloud Storage.

    ``gcsPath`` is split by ``splitGCSpath`` into a bucket name and an
    object name, and the object is deleted through a freshly opened boto
    ``GSConnection``.

    Raises:
        ValueError: if the object name is not found in the bucket.

    The connection is closed on every exit path, including a failed bucket
    lookup and the "object not found" error.
    """
    ( bucketName, oName ) = splitGCSpath ( gcsPath )
    connection = boto.gs.connection.GSConnection()
    try:
        bucket = connection.get_bucket(bucketName)
        if oName not in bucket:
            raise ValueError('did not find %s in %s' % (oName, bucketName))
        key = boto.gs.key.Key(bucket, oName)
        try:
            key.delete()
        finally:
            # Nested so that a failure while closing the key cannot prevent
            # the connection from being closed.
            key.close()
    finally:
        connection.close()
def close_connection():
    """Close the module-level ``connection`` if it is set, then reset it to None.

    Does nothing when ``connection`` is falsy (for example, already None).
    """
    global connection
    if not connection:
        return
    connection.close()
    connection = None
Ejemplo n.º 4
0
def close_connection():
    """Close the module-level ``connection`` if it is set, then reset it to None.

    Does nothing when ``connection`` is falsy (for example, already None).
    """
    global connection
    if not connection:
        return
    connection.close()
    connection = None
Ejemplo n.º 5
0
def makeDFfromShards ( gcsPath, localDir, verboseFlag ):
    """Build one dataframe from every GCS shard that matches ``gcsPath``.

    ``gcsPath`` is split by ``splitGCSpath`` into a bucket name and a name
    pattern.  Each blob in the bucket whose name satisfies
    ``matchesWildCardPath`` is handed to ``processShard`` on a 40-thread
    pool, and every shard dataframe is folded into the running result with
    ``addToDataframe`` as soon as its worker completes.

    Args:
        gcsPath: GCS path (bucket plus name pattern) identifying the shards.
        localDir: passed through to ``makeLocalFilename`` and ``processShard``.
        verboseFlag: when truthy, print the parsed path components; also
            passed through to ``processShard``.

    Returns:
        Whatever ``addToDataframe`` has accumulated, or None if no shard
        completed successfully.

    Raises:
        ValueError: if any of the first three entries of a shard's
            ``axes[1]`` (reported as rowName, colName, datName) differs from
            the corresponding entry of the accumulated dataframe.

    A shard whose worker raised is reported on stdout and skipped.  The
    thread pool is shut down and the connection closed on every exit path.
    """
    # Every print uses a single parenthesized argument, so the output is the
    # same whether print is a Python 2 statement or a Python 3 function.
    t0 = time.time()

    if verboseFlag:
        print(" input gcsPath : <%s> " % gcsPath)
        print(" calling storage.Client ... ")

    ( bucketName, bNamePattern ) = splitGCSpath ( gcsPath )
    if verboseFlag:
        print(" bucketName : <%s> " % bucketName)
        print(" bNamePattern : <%s> " % bNamePattern)

    connection = boto.gs.connection.GSConnection()
    executor = None
    try:
        bucket = connection.get_bucket(bucketName)
        executor = futures.ThreadPoolExecutor(40)

        # Maps each submitted future to the local filename of its shard.
        submitted_futures = {}
        for b in bucket.list():
            if not matchesWildCardPath ( b.name, bNamePattern ):
                continue
            # The local name is computed before the shard is submitted,
            # matching the original evaluation order.
            shard_name = makeLocalFilename(localDir, b.name)
            future = executor.submit(processShard, b, gcsPath, localDir, verboseFlag)
            submitted_futures[future] = shard_name

        cum_df = None
        init = True
        pending = set(submitted_futures)
        while pending:
            done, pending = futures.wait(pending, return_when=futures.FIRST_COMPLETED)
            for future in done:
                local_name = submitted_futures.pop(future)
                error = future.exception()
                if error is not None:
                    # Best effort: report the failed shard and keep going.
                    print('\t%s generated an exception--%s: %s' % (gcsPath, type(error).__name__, error))
                    continue

                shard_df = future.result()
                print('\tfinished file %s' % (local_name))
                if cum_df is not None:
                    cumfieldNames = cum_df.axes[1]
                    fieldNames = shard_df.axes[1]
                    for idx, label in enumerate(('rowName', 'colName', 'datName')):
                        if cumfieldNames[idx] == fieldNames[idx]:
                            continue
                        print(" ERROR: inconsistent %s ... <%s> <%s> " % (label, cumfieldNames[idx], fieldNames[idx]))
                        print("        error from file <%s> " % local_name)
                        raise ValueError(
                            ' ERROR: inconsistent %s ... <%s> <%s> \n        error from file <%s> '
                            % (label, cumfieldNames[idx], fieldNames[idx], local_name))
                cum_df = addToDataframe(cum_df, shard_df, init)
                init = False
    finally:
        # Wait for the workers before closing the connection, since shards
        # still in flight may be using it.
        if executor is not None:
            executor.shutdown()
        connection.close()

    t1 = time.time()

    print(" =============================================================== ")
    print(" finished creating dataframe.  time taken in seconds : %s" % (t1-t0))
    print(" =============================================================== ")

    return cum_df
Ejemplo n.º 6
0
def makeDFfromShards ( gcsPath, localDir, verboseFlag ):
    """Build one dataframe from every GCS shard that matches ``gcsPath``.

    ``gcsPath`` is split by ``splitGCSpath`` into a bucket name and a name
    pattern.  Each blob in the bucket whose name satisfies
    ``matchesWildCardPath`` is handed to ``processShard`` on a 40-thread
    pool, and every shard dataframe is folded into the running result with
    ``addToDataframe`` as soon as its worker completes.

    Args:
        gcsPath: GCS path (bucket plus name pattern) identifying the shards.
        localDir: passed through to ``makeLocalFilename`` and ``processShard``.
        verboseFlag: when truthy, print the parsed path components; also
            passed through to ``processShard``.

    Returns:
        Whatever ``addToDataframe`` has accumulated, or None if no shard
        completed successfully.

    Raises:
        ValueError: if any of the first three entries of a shard's
            ``axes[1]`` (reported as rowName, colName, datName) differs from
            the corresponding entry of the accumulated dataframe.

    A shard whose worker raised is reported on stdout and skipped.  The
    thread pool is shut down and the connection closed on every exit path.
    """
    # Every print uses a single parenthesized argument, so the output is the
    # same whether print is a Python 2 statement or a Python 3 function.
    t0 = time.time()

    if verboseFlag:
        print(" input gcsPath : <%s> " % gcsPath)
        print(" calling storage.Client ... ")

    ( bucketName, bNamePattern ) = splitGCSpath ( gcsPath )
    if verboseFlag:
        print(" bucketName : <%s> " % bucketName)
        print(" bNamePattern : <%s> " % bNamePattern)

    connection = boto.gs.connection.GSConnection()
    executor = None
    try:
        bucket = connection.get_bucket(bucketName)
        executor = futures.ThreadPoolExecutor(40)

        # Maps each submitted future to the local filename of its shard.
        submitted_futures = {}
        for b in bucket.list():
            if not matchesWildCardPath ( b.name, bNamePattern ):
                continue
            # The local name is computed before the shard is submitted,
            # matching the original evaluation order.
            shard_name = makeLocalFilename(localDir, b.name)
            future = executor.submit(processShard, b, gcsPath, localDir, verboseFlag)
            submitted_futures[future] = shard_name

        cum_df = None
        init = True
        pending = set(submitted_futures)
        while pending:
            done, pending = futures.wait(pending, return_when=futures.FIRST_COMPLETED)
            for future in done:
                local_name = submitted_futures.pop(future)
                error = future.exception()
                if error is not None:
                    # Best effort: report the failed shard and keep going.
                    print('\t%s generated an exception--%s: %s' % (gcsPath, type(error).__name__, error))
                    continue

                shard_df = future.result()
                print('\tfinished file %s' % (local_name))
                if cum_df is not None:
                    cumfieldNames = cum_df.axes[1]
                    fieldNames = shard_df.axes[1]
                    for idx, label in enumerate(('rowName', 'colName', 'datName')):
                        if cumfieldNames[idx] == fieldNames[idx]:
                            continue
                        print(" ERROR: inconsistent %s ... <%s> <%s> " % (label, cumfieldNames[idx], fieldNames[idx]))
                        print("        error from file <%s> " % local_name)
                        raise ValueError(
                            ' ERROR: inconsistent %s ... <%s> <%s> \n        error from file <%s> '
                            % (label, cumfieldNames[idx], fieldNames[idx], local_name))
                cum_df = addToDataframe(cum_df, shard_df, init)
                init = False
    finally:
        # Wait for the workers before closing the connection, since shards
        # still in flight may be using it.
        if executor is not None:
            executor.shutdown()
        connection.close()

    t1 = time.time()

    print(" =============================================================== ")
    print(" finished creating dataframe.  time taken in seconds : %s" % (t1-t0))
    print(" =============================================================== ")

    return cum_df