def validate_summaries(key_prefix, bucket_name, storage):
    storage_client = Storage()
    key_list = storage_client.list_keys(bucket_name, key_prefix + '/')
    sorted_key_list = sorted(key_list, key=lambda x: int(x.split('/')[-1]))

    # Get all summaries into one buffer
    summaries_buf = io.BytesIO()
    for key_name in sorted_key_list:
        with open(f's3://{storage.bucket}/{key_name}', 'rb',
                  transport_params=dict(client=storage.get_client())) as source_file:
            copyfileobj(source_file, summaries_buf)

    cmd = ['./valsort', '-s', '/dev/stdin']
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                          stderr=subprocess.PIPE) as p:
        with p.stdout as valoutput, p.stderr as valerr:
            with p.stdin as valinput:
                # Need to close input for valsort to finish
                valinput.write(summaries_buf.getbuffer())
            returncode = p.wait()
            if returncode != 0:
                raise Exception(
                    f'Non-zero return code for valsort: {returncode}\n' +
                    valerr.read().decode('utf-8'))
            valoutput_str = valoutput.read().decode('utf-8')
    return valoutput_str
def delete_object(bucket, key, backend, debug):
    log_level = logging.INFO if not debug else logging.DEBUG
    setup_lithops_logger(log_level)
    storage = Storage(backend=backend)
    logger.info('Deleting object "{}" from bucket "{}"'.format(key, bucket))
    storage.delete_object(bucket, key)
    logger.info('Object deleted successfully')
def list_bucket(prefix, bucket, backend, debug, config):
    if config:
        config = load_yaml_config(config)
    log_level = logging.INFO if not debug else logging.DEBUG
    setup_lithops_logger(log_level)
    storage = Storage(config=config, backend=backend)
    logger.info('Listing objects in bucket {}'.format(bucket))
    objects = storage.list_objects(bucket, prefix=prefix)

    if objects:
        width = max([len(obj['Key']) for obj in objects])

        print('\n{:{width}} \t {} \t\t {:>9}'.format('Key', 'Last modified', 'Size', width=width))
        print('-' * width, '\t', '-' * 20, '\t', '-' * 9)
        for obj in objects:
            key = obj['Key']
            date = obj['LastModified'].strftime("%b %d %Y %H:%M:%S")
            size = sizeof_fmt(obj['Size'])
            print('{:{width}} \t {} \t {:>9}'.format(key, date, size, width=width))
        print()
        print('Total objects: {}'.format(len(objects)))
    else:
        width = 10
        print('\n{:{width}} \t {} \t\t {:>9}'.format('Key', 'Last modified', 'Size', width=width))
        print('-' * width, '\t', '-' * 20, '\t', '-' * 9)
        print('\nThe bucket is empty')
def validate_command(prefix, image):
    storage_client = Storage()
    with FunctionExecutor(runtime=image) as fexec:
        bucket = fexec.config['lithops']['storage_bucket']
        key_list = storage_client.list_keys(bucket, prefix + '/')

        validate_records_futures = fexec.map(validate_records, key_list,
                                             extra_args=[bucket, prefix],
                                             include_modules=['util'])
        results = fexec.get_result(fs=validate_records_futures)
        for index, r in enumerate(results):
            if not r['success']:
                print(f'Failed to validate partition: {key_list[index]}')
                print(r['stderr'])
                return

        validate_summaries_futures = fexec.map(validate_summaries,
                                               [prefix + summary_postfix],
                                               extra_args=[bucket],
                                               include_modules=['util'])
        results = fexec.get_result(fs=validate_summaries_futures)
        if results[0] == '':
            print('Success!')
        else:
            print(results)
def parse_input_path_for_lithops(sm_config, input_path):
    if input_path.startswith('s3://') or input_path.startswith('s3a://'):
        backend = 'aws_s3'
        bucket, prefix = split_s3_path(input_path)
    else:
        backend = 'ibm_cos'
        bucket, prefix = split_cos_path(input_path)

    storage = Storage(sm_config['lithops'], backend)
    if backend == 'aws_s3' and sm_config['lithops']['aws_s3']['endpoint'].startswith('http://'):
        # WORKAROUND for local Minio access
        # Lithops forces the url to HTTPS, so overwrite the S3 client with a fixed client
        # https://github.com/lithops-cloud/lithops/issues/708
        storage.storage_handler.s3_client = get_s3_client()

    keys_in_path = storage.list_keys(bucket, prefix)
    imzml_keys = [key for key in keys_in_path if key.lower().endswith('.imzml')]
    ibd_keys = [key for key in keys_in_path if key.lower().endswith('.ibd')]

    debug_info = f'Path {input_path} had keys: {keys_in_path}'
    assert len(imzml_keys) == 1, f'Couldn\'t determine imzML file. {debug_info}'
    assert len(ibd_keys) == 1, f'Couldn\'t determine ibd file. {debug_info}'

    imzml_cobject = CloudObject(storage.backend, bucket, imzml_keys[0])
    ibd_cobject = CloudObject(storage.backend, bucket, ibd_keys[0])
    return storage, imzml_cobject, ibd_cobject
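# The split_s3_path and split_cos_path helpers used above are defined elsewhere in the
# codebase; a minimal sketch of what they presumably do (split an object-store URI into
# its bucket and key-prefix parts) is shown below. The exact implementations may differ.
def split_s3_path(path):
    # 's3://my-bucket/some/prefix' -> ('my-bucket', 'some/prefix')
    bucket, _, prefix = path.split('://', 1)[1].partition('/')
    return bucket, prefix


def split_cos_path(path):
    # Same idea for 'cos://bucket/prefix' style IBM COS paths
    bucket, _, prefix = path.split('://', 1)[1].partition('/')
    return bucket, prefix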
def put_object(filename, bucket, backend, debug):
    log_level = logging.INFO if not debug else logging.DEBUG
    setup_lithops_logger(log_level)
    storage = Storage(backend=backend)
    logger.info('Uploading file {} to bucket {}'.format(filename, bucket))
    with open(filename, 'rb') as in_file:
        storage.put_object(bucket, filename, in_file)
    logger.info('File uploaded successfully')
def get_object(bucket, key, backend, debug):
    log_level = logging.INFO if not debug else logging.DEBUG
    setup_lithops_logger(log_level)
    storage = Storage(backend=backend)
    logger.info('Downloading object {} from bucket {}'.format(key, bucket))
    data_stream = storage.get_object(bucket, key, stream=True)
    with open(key, 'wb') as out:
        shutil.copyfileobj(data_stream, out)
    logger.info('Object downloaded successfully')
def empty_bucket(bucket, backend, debug):
    log_level = logging.INFO if not debug else logging.DEBUG
    setup_lithops_logger(log_level)
    storage = Storage(backend=backend)
    logger.info('Deleting all objects in bucket "{}"'.format(bucket))
    keys = storage.list_keys(bucket)
    logger.info('Total objects found: {}'.format(len(keys)))
    storage.delete_objects(bucket, keys)
    logger.info('All objects deleted successfully')
def upload_test_imzml(storage: Storage, sm_config, ds_config):
    """Create a test ImzML file, upload it into storage, and return its storage path"""
    with make_test_imzml(ds_config) as (imzml_path, ibd_path):
        imzml_content = open(imzml_path, 'rb').read()
        ibd_content = open(ibd_path, 'rb').read()

    bucket, prefix = sm_config['lithops']['sm_storage']['imzml']
    storage.put_cloudobject(imzml_content, bucket, f'{prefix}/test_ds/test.imzML')
    storage.put_cloudobject(ibd_content, bucket, f'{prefix}/test_ds/test.ibd')
    return f'cos://{bucket}/{prefix}/test_ds'
def sort_category(keys_list, prefix, category_stack, consider_last_byte_sorted, storage, id):
    storage_client = Storage()
    category_id = id

    num_bytes_already_sorted = len(category_stack)
    if not consider_last_byte_sorted:
        num_bytes_already_sorted = num_bytes_already_sorted - 1
    num_bytes_to_sort = 10 - num_bytes_already_sorted

    # Stream every key of this category into one in-memory buffer
    category_sink = io.BytesIO()
    for key_name in keys_list:
        with open(f's3://{storage.bucket}/{key_name}', 'rb',
                  transport_params=dict(client=storage.get_client())) as source_file:
            copyfileobj(source_file, category_sink)

    category_buffer = category_sink.getbuffer()
    record_arr = np.frombuffer(category_buffer, dtype=np.dtype([
        ('sorted', f'V{num_bytes_already_sorted}'),
        ('key', f'V{num_bytes_to_sort}'),
        ('value', 'V90')
    ]))
    sorted_category = np.sort(record_arr, order='key')

    with open(f's3://{storage.bucket}/{prefix}/{category_id}', 'wb',
              transport_params=dict(client=storage.get_client())) as sorted_file:
        sorted_file.write(memoryview(sorted_category))
    return True
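# For context: the records handled above appear to follow the Sort Benchmark (gensort)
# layout of 100 bytes per record, a 10-byte key followed by a 90-byte value, which is
# why the structured dtype splits each record into already-sorted key bytes, remaining
# key bytes, and a fixed 90-byte value. A minimal, self-contained illustration of that
# dtype with synthetic data (not part of the actual pipeline):
import numpy as np

# Two fake 100-byte records: key bytes first, then the 90-byte payload.
raw = bytes([2] * 10 + [0] * 90 + [1] * 10 + [9] * 90)
rec_dtype = np.dtype([('key', 'V10'), ('value', 'V90')])
records = np.frombuffer(raw, dtype=rec_dtype)
assert records.itemsize == 100
ordered = np.sort(records, order='key')  # byte-wise sort on the raw 10-byte keys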
def download_file(bucket, key, out, backend, debug, config):
    if config:
        config = load_yaml_config(config)

    log_level = logging.INFO if not debug else logging.DEBUG
    setup_lithops_logger(log_level)
    storage = Storage(config=config, backend=backend)

    def download_file():
        logger.info(f'Downloading file {storage.backend}://{bucket}/{key} to {out or key}')
        if storage.download_file(bucket, key, out):
            file_size = os.path.getsize(out or key)
            logger.info(f'Download File {key} - Size: {sizeof_fmt(file_size)} - Ok')
        else:
            logger.error(f'Download File {key} - Error')

    with ThreadPoolExecutor() as ex:
        future = ex.submit(download_file)
        cy = cycle(r"-\|/")
        while not future.done():
            print("Downloading file " + next(cy), end="\r")
            time.sleep(0.1)
        future.result()
def list_bucket(bucket, backend, debug):
    log_level = logging.INFO if not debug else logging.DEBUG
    setup_lithops_logger(log_level)
    storage = Storage(backend=backend)
    logger.info('Listing objects in bucket {}'.format(bucket))
    objects = storage.list_objects(bucket)

    width = max([len(obj['Key']) for obj in objects])

    print('\n{:{width}} \t {} \t\t {:>9}'.format('Key', 'Last modified', 'Size', width=width))
    print('-' * width, '\t', '-' * 20, '\t', '-' * 9)
    for obj in objects:
        key = obj['Key']
        date = obj['LastModified'].strftime("%b %d %Y %H:%M:%S")
        size = sizeof_fmt(obj['Size'])
        print('{:{width}} \t {} \t {:>9}'.format(key, date, size, width=width))
    print()
def make_lithops_imzml_reader(
    storage: Storage,
    mz_precision='f',
    polarity='positive',
    ds_config=TEST_DS_CONFIG,
):
    """Create an ImzML file, upload it into storage, and return an imzml_reader for it"""
    mz_dtype = {'f': np.float32, 'd': np.float64}[mz_precision]
    with TemporaryDirectory() as tmpdir:
        with ImzMLWriter(f'{tmpdir}/test.imzML', polarity=polarity, mz_dtype=mz_dtype) as writer:
            for coords, (mzs, ints) in zip(MOCK_COORDINATES, MOCK_SPECTRA):
                writer.addSpectrum(mzs, ints, coords)

        imzml_content = open(f'{tmpdir}/test.imzML', 'rb').read()
        ibd_content = open(f'{tmpdir}/test.ibd', 'rb').read()

    imzml_cobj = storage.put_cloudobject(imzml_content)
    ibd_cobj = storage.put_cloudobject(ibd_content)

    return LithopsImzMLReader(storage, imzml_cobj, ibd_cobj)
def __init__(self, storage: Storage, imzml_cobject: CloudObject, ibd_cobject: CloudObject):
    imzml_parser = ImzMLParser(
        storage.get_cloudobject(imzml_cobject, stream=True),
        ibd_file=None,
        parse_lib='ElementTree',
        include_spectra_metadata=METADATA_FIELDS,
    )

    self._ibd_cobject = ibd_cobject
    self.imzml_reader = imzml_parser.portable_spectrum_reader()

    super().__init__(imzml_parser)
def get_tweets(keyword, location):
    auth = tweepy.OAuthHandler(CONSUMERKEY, SECRETKEY)
    auth.set_access_token(TWITTERKEY, TWITTERSECRET)
    twitterAPI = tweepy.API(auth, wait_on_rate_limit=True)

    # Only look for tweets in Catalan or Spanish and exclude retweets
    searchstr = '"' + keyword + '"' + " " + '"' + location + '"' + " lang:ca OR lang:es -filter:retweets"

    list_tweets = []  # In this list we will store the structured tweets

    # Start to iterate over the Twitter API to download tweets
    for tweet in tweepy.Cursor(twitterAPI.search, q=searchstr,
                               tweet_mode="extended").items(500):  # number of tweets to download
        # Start saving tweets, separating all the relevant data
        tweetstr = tweet.full_text
        url = "https://twitter.com/twitter/statuses/" + str(tweet.id)
        fecha = tweet.created_at.strftime("%m/%d/%Y %H:%M:%S")
        localizacion = str(tweet.user.location)
        packed_tweet = {
            "Texto tweet": tweetstr,
            "URL": url,
            "Fecha": fecha,
            "Ubicacion": localizacion  # Location of the tweet's author, not of the topic (Madrid, Catalonia, etc.)
        }
        list_tweets.append(packed_tweet)

    # Add all the tweets from the list to another dictionary
    packed_tweets = {"tweets": list_tweets}

    # Upload them to the cloud object storage
    storage = Storage()
    storage.put_object(bucket=STORAGEBUCKET, key=keyword + location + ".json",
                       body=json.dumps(packed_tweets))
def analyze_tweets(keyword, location):
    # Get the data from cloud object storage
    storage = Storage()
    json_tweets = storage.get_object(bucket=STORAGEBUCKET, key=keyword + location + ".json")
    packed_tweets = json.loads(json_tweets)

    analisador = SentimentIntensityAnalyzer()

    # CSV columns: URL, Fecha, Sentiment
    with open(keyword + location + ".csv", 'w') as file:
        writer = csv.writer(file)
        writer.writerow(["URL", "Fecha", "Sentiment"])
        # Iterate over the tweets downloaded from the cloud, run sentiment analysis
        # and write the result to the CSV file
        for tweet in packed_tweets["tweets"]:
            tweetstr = mtranslate.translate(str(tweet["Texto tweet"]), "en", "auto")
            writer.writerow([
                str(tweet["URL"]),
                str(tweet["Fecha"]),
                str(analisador.polarity_scores(tweetstr)['compound'])
            ])
def generate_command(number, prefix, partitions, image):
    bucket = None
    with FunctionExecutor(runtime=image) as fexec:
        bucket = fexec.config['lithops']['storage_bucket']
        futures = fexec.map(generate_records, range(partitions),
                            extra_args=[number, prefix],
                            include_modules=['util'])
        results = fexec.get_result(fs=futures)
        # print(results)

    partition_size = record_size * number

    # Check if all files have been uploaded
    storage_client = Storage()
    partition_list = storage_client.list_objects(bucket, prefix + '/')
    assert len(partition_list) == partitions, \
        f'partition_list: {len(partition_list)}; partitions: {partitions}'
    for info in partition_list:
        assert info['Size'] == partition_size, \
            f'partition size: {partition_size} \ninfo: {info}'
    print('Done!')
def delete_object(bucket, key, prefix, backend, debug):
    log_level = logging.INFO if not debug else logging.DEBUG
    setup_lithops_logger(log_level)
    storage = Storage(backend=backend)

    if key:
        logger.info('Deleting object "{}" from bucket "{}"'.format(key, bucket))
        storage.delete_object(bucket, key)
        logger.info('Object deleted successfully')
    elif prefix:
        objs = storage.list_keys(bucket, prefix)
        logger.info('Deleting {} objects with prefix "{}" from bucket "{}"'.format(len(objs), prefix, bucket))
        storage.delete_objects(bucket, objs)
        logger.info('Objects deleted successfully')
BUCKET_NAME = 'lithops-sample-data'  # change-me


def my_function(obj_id, storage):
    print(obj_id)
    data = storage.get_cloudobject(obj_id)
    return data.decode()


if __name__ == '__main__':
    obj_key = 'cloudobject1.txt'
    storage = Storage()
    obj_id = storage.put_cloudobject('Hello World!', BUCKET_NAME, obj_key)
    print(obj_id)

    fexec = FunctionExecutor()
    fexec.call_async(my_function, obj_id)
    print(fexec.get_result())

    obj_key = 'cloudobject2.txt'
    storage = fexec.storage
    obj_id = storage.put_cloudobject('Hello World!', BUCKET_NAME, obj_key)
    print(obj_id)

    fexec.call_async(my_function, obj_id)
    print(fexec.get_result())
def sort_command(input_prefix, output_prefix, max_parallelism, image):
    storage_client = Storage()
    bucket = None
    input_info_list = None
    with FunctionExecutor(runtime=image, workers=max_parallelism) as fexec:
        bucket = fexec.config['lithops']['storage_bucket']
        input_info_list = storage_client.list_objects(bucket, input_prefix + '/')
        input_size = sum(info['Size'] for info in input_info_list)
        (num_shuffles, last_values_per_category) = make_plan(input_size)

        current_values_per_category = 1
        current_prefix = input_prefix
        current_keys_list = [{
            'keys_list': [key_name],
            'prefix': input_prefix + '-intermediate0',
            'category_stack': []
        } for key_name in storage_client.list_keys(bucket, input_prefix + '/')]

        for current_shuffle in range(num_shuffles):
            # Change values per category for the last shuffle
            if current_shuffle == num_shuffles - 1:
                current_values_per_category = last_values_per_category

            radix_sort_futures = fexec.map(radix_sort_by_byte, current_keys_list,
                                           extra_args={
                                               'values_per_category': current_values_per_category
                                           },
                                           include_modules=['util'])
            radix_sort_results = fexec.get_result(fs=radix_sort_futures)

            # Group the intermediate keys by their accumulated category stack
            categories_keys_lists = {}
            for res in radix_sort_results:
                intermediate_keys_list = res['keys_list']
                input_category_stack = res['category_stack']
                for key_name in intermediate_keys_list:
                    category_id = int(key_name.rsplit(sep='/', maxsplit=3)[-3])
                    new_category_stack = input_category_stack + [category_id]
                    new_category_stack_str = '/'.join([str(x) for x in new_category_stack])
                    if new_category_stack_str in categories_keys_lists:
                        categories_keys_lists[new_category_stack_str].append(key_name)
                    else:
                        categories_keys_lists[new_category_stack_str] = [key_name]

            # Partition category lists
            # Attach prefix metadata so that the sorter knows what to name files
            each_category_size = input_size / ((256 / current_values_per_category) * (current_shuffle + 1))
            num_partitions_per_category = math.ceil(each_category_size / buffer_size_to_categorize)
            current_keys_list = []
            for category_stack_str, cat_keys_list in categories_keys_lists.items():
                for sub_list in np.array_split(cat_keys_list, num_partitions_per_category):
                    partition_entry = {
                        'keys_list': sub_list,
                        'prefix': f'{input_prefix}-intermediate{str(current_shuffle + 1)}',
                        'category_stack': [int(x) for x in category_stack_str.split('/')]
                    }
                    current_keys_list.append(partition_entry)

        consider_last_byte_sorted = False
        if last_values_per_category == 1:
            consider_last_byte_sorted = True

        for entry in current_keys_list:
            entry['prefix'] = output_prefix

        sorted_keys_list = sorted(current_keys_list, key=lambda x: x['category_stack'])
        sort_category_futures = fexec.map(sort_category, sorted_keys_list,
                                          extra_args={
                                              'consider_last_byte_sorted': consider_last_byte_sorted
                                          },
                                          include_modules=['util'])
        results = fexec.get_result(fs=sort_category_futures)
        # print(results)

    # Check if size of output matches size of input
    output_info_list = storage_client.list_objects(bucket, output_prefix)
    output_size = sum(info['Size'] for info in output_info_list)
    assert input_size == output_size, f'input size: {input_size}, output_size: {output_size}'
    print('Done!')
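# A toy illustration (hypothetical key name, not taken from a real run) of how the
# grouping step above derives each category id: it is parsed from the third-from-last
# path component of an intermediate key and appended to the category stack accumulated
# over previous shuffles.
key_name = 'myinput-intermediate1/42/0/part-003'             # hypothetical intermediate key
category_id = int(key_name.rsplit(sep='/', maxsplit=3)[-3])  # -> 42
category_stack = [7] + [category_id]                         # e.g. stack '7/42' after two shuffles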