def deleteGCSobj(gcsPath):
    """Delete one object from Google Cloud Storage.

    gcsPath is handed to splitGCSpath, which yields the bucket name and the
    object name inside that bucket.

    Raises:
        ValueError: if the object name is not contained in the bucket.

    The storage connection is closed on every exit path, including when the
    bucket lookup fails or the object is missing.
    """
    (bucketName, oName) = splitGCSpath(gcsPath)

    connection = boto.gs.connection.GSConnection()
    try:
        bucket = connection.get_bucket(bucketName)
        if oName not in bucket:
            raise ValueError('did not find %s in %s' % (oName, bucketName))

        key = boto.gs.key.Key(bucket, oName)
        try:
            key.delete()
        finally:
            # Release the key even when the delete itself fails.
            key.close()
    finally:
        # Outer finally so the connection is not leaked by an early raise.
        connection.close()
def close_connection():
    """Close the module-level ``connection`` and reset it to None.

    Does nothing when ``connection`` is falsy (for example already None).
    """
    global connection

    # Guard clause: nothing to close.
    if not connection:
        return

    connection.close()
    connection = None
def _checkShardFieldNames(cum_df, shard_df, local_name, gcsPath):
    """Raise ValueError unless the first three axes[1] labels of both frames agree.

    The three positions are reported as rowName, colName and datName, in that
    order; the first mismatch found is printed and raised.
    """
    cumfieldNames = cum_df.axes[1]
    fieldNames = shard_df.axes[1]
    for idx, label in enumerate(('rowName', 'colName', 'datName')):
        if cumfieldNames[idx] != fieldNames[idx]:
            print(" ERROR: inconsistent %s ... <%s> <%s> "
                  % (label, cumfieldNames[idx], fieldNames[idx]))
            print(" error from file <%s> " % local_name)
            raise ValueError(
                ' ERROR: inconsistent %s ... <%s> <%s> \n error from file <%s> '
                % (label, cumfieldNames[idx], fieldNames[idx], gcsPath))


def makeDFfromShards(gcsPath, localDir, verboseFlag):
    """Build one cumulative dataframe from every shard matching gcsPath.

    gcsPath is split by splitGCSpath into a bucket name and an object-name
    pattern.  Every object in the bucket whose name satisfies
    matchesWildCardPath is handed to processShard on a 40-worker thread pool,
    and each result is merged into the running dataframe with addToDataframe
    as soon as it completes.

    Args:
        gcsPath: storage path (with wildcard pattern) naming the shards.
        localDir: passed through to processShard and makeLocalFilename.
        verboseFlag: when true, print the parsed path components; also passed
            through to processShard.

    Returns:
        The cumulative dataframe, or None when no shard was merged (nothing
        matched, or every shard failed).

    Raises:
        ValueError: if a shard's first three field names differ from those of
            the dataframe accumulated so far.

    A shard whose worker raised is reported on stdout and skipped; it does not
    abort the run.
    """
    t0 = time.time()

    if verboseFlag:
        print(" input gcsPath : <%s> " % gcsPath)
        print(" calling storage.Client ... ")

    (bucketName, bNamePattern) = splitGCSpath(gcsPath)

    if verboseFlag:
        print(" bucketName : <%s> " % bucketName)
        print(" bNamePattern : <%s> " % bNamePattern)

    connection = boto.gs.connection.GSConnection()
    executor = None
    cum_df = None
    try:
        bucket = connection.get_bucket(bucketName)
        executor = futures.ThreadPoolExecutor(40)

        # Maps each in-flight future to the local filename of its shard.
        submitted_futures = {}
        for b in bucket.list():
            if not matchesWildCardPath(b.name, bNamePattern):
                continue
            local_name = makeLocalFilename(localDir, b.name)
            future = executor.submit(processShard, b, gcsPath, localDir, verboseFlag)
            submitted_futures[future] = local_name

        init = True
        pending = list(submitted_futures)
        while pending:
            # Merge shards in completion order rather than submission order.
            done, pending = futures.wait(pending, return_when=futures.FIRST_COMPLETED)
            for future in done:
                local_name = submitted_futures.pop(future)

                error = future.exception()
                if error is not None:
                    print('\t%s generated an exception--%s: %s'
                          % (gcsPath, type(error).__name__, error))
                    continue

                shard_df = future.result()
                print('\tfinished file %s' % (local_name))

                if cum_df is not None:
                    _checkShardFieldNames(cum_df, shard_df, local_name, gcsPath)

                cum_df = addToDataframe(cum_df, shard_df, init)
                init = False
    finally:
        # Let workers finish before the connection they were started under
        # is closed; then close it on every exit path.
        try:
            if executor is not None:
                executor.shutdown()
        finally:
            connection.close()

    t1 = time.time()

    print(" =============================================================== ")
    print(" finished creating dataframe. time taken in seconds : %s" % (t1-t0))
    print(" =============================================================== ")

    return cum_df