def validate_summaries(key_prefix, bucket_name, storage):
    storage_client = Storage()
    key_list = storage_client.list_keys(bucket_name, key_prefix + '/')
    sorted_key_list = sorted(key_list, key=lambda x: int(x.split('/')[-1]))

    summaries_buf = io.BytesIO()

    # Concatenate all summaries into one in-memory buffer. open() here is
    # smart_open's open(), which streams the s3:// URL through the supplied client.
    for key_name in sorted_key_list:
        with open(f's3://{storage.bucket}/{key_name}',
                  'rb',
                  transport_params=dict(
                      client=storage.get_client())) as source_file:

            copyfileobj(source_file, summaries_buf)

    cmd = ['./valsort', '-s', '/dev/stdin']
    with subprocess.Popen(cmd,
                          stdout=subprocess.PIPE,
                          stdin=subprocess.PIPE,
                          stderr=subprocess.PIPE) as p:
        # Write the summaries to valsort's stdin, then close it so valsort can finish
        with p.stdin as valinput:
            valinput.write(summaries_buf.getbuffer())
        returncode = p.wait()
        if returncode != 0:
            raise Exception(
                f'Non-zero return code for valsort: {returncode}\n' +
                p.stderr.read().decode('utf-8'))

        return p.stdout.read().decode('utf-8')
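# Note: when this function is mapped through FunctionExecutor (see
# validate_command further below), Lithops injects the 'storage' argument at
# runtime; the key prefix and bucket come from the map iterdata and extra_args.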
def delete_object(bucket, key, backend, debug):
    log_level = logging.INFO if not debug else logging.DEBUG
    setup_lithops_logger(log_level)
    storage = Storage(backend=backend)
    logger.info('Deleting object "{}" from bucket "{}"'.format(key, bucket))
    storage.delete_object(bucket, key)
    logger.info('Object deleted successfully')
def list_bucket(prefix, bucket, backend, debug, config):
    if config:
        config = load_yaml_config(config)
    log_level = logging.INFO if not debug else logging.DEBUG
    setup_lithops_logger(log_level)
    storage = Storage(config=config, backend=backend)
    logger.info('Listing objects in bucket {}'.format(bucket))
    objects = storage.list_objects(bucket, prefix=prefix)

    if objects:
        width = max([len(obj['Key']) for obj in objects])

        print('\n{:{width}} \t {} \t\t {:>9}'.format('Key', 'Last modified', 'Size', width=width))
        print('-' * width, '\t', '-' * 20, '\t', '-' * 9)
        for obj in objects:
            key = obj['Key']
            date = obj['LastModified'].strftime("%b %d %Y %H:%M:%S")
            size = sizeof_fmt(obj['Size'])
            print('{:{width}} \t {} \t {:>9}'.format(key, date, size, width=width))
        print()
        print('Total objects: {}'.format(len(objects)))
    else:
        width = 10
        print('\n{:{width}} \t {} \t\t {:>9}'.format('Key', 'Last modified', 'Size', width=width))
        print('-' * width, '\t', '-' * 20, '\t', '-' * 9)
        print('\nThe bucket is empty')
def validate_command(prefix, image):
    storage_client = Storage()

    with FunctionExecutor(runtime=image) as fexec:
        bucket = fexec.config['lithops']['storage_bucket']
        key_list = storage_client.list_keys(bucket, prefix + '/')

        validate_records_futures = fexec.map(validate_records,
                                             key_list,
                                             extra_args=[bucket, prefix],
                                             include_modules=['util'])
        results = fexec.get_result(fs=validate_records_futures)
        for index, r in enumerate(results):
            if not r['success']:
                print(f'Failed to validate partition: {key_list[index]}')
                print(r['stderr'])
                return

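        # summary_postfix is assumed to be a module-level constant naming the
        # suffix under which the valsort summaries were written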
        validate_summaries_futures = fexec.map(validate_summaries,
                                               [prefix + summary_postfix],
                                               extra_args=[bucket],
                                               include_modules=['util'])
        results = fexec.get_result(fs=validate_summaries_futures)
        if results[0] == '':
            print('Success!')
        else:
            print(results)
def parse_input_path_for_lithops(sm_config, input_path):
    if input_path.startswith('s3://') or input_path.startswith('s3a://'):
        backend = 'aws_s3'
        bucket, prefix = split_s3_path(input_path)
    else:
        backend = 'ibm_cos'
        bucket, prefix = split_cos_path(input_path)

    storage = Storage(sm_config['lithops'], backend)
    if backend == 'aws_s3' and sm_config['lithops']['aws_s3']['endpoint'].startswith('http://'):
        # WORKAROUND for local Minio access
        # Lithops forces the url to HTTPS, so overwrite the S3 client with a fixed client
        # https://github.com/lithops-cloud/lithops/issues/708
        storage.storage_handler.s3_client = get_s3_client()

    keys_in_path = storage.list_keys(bucket, prefix)
    imzml_keys = [
        key for key in keys_in_path if key.lower().endswith('.imzml')
    ]
    ibd_keys = [key for key in keys_in_path if key.lower().endswith('.ibd')]

    debug_info = f'Path {input_path} had keys: {keys_in_path}'
    assert len(imzml_keys) == 1, f"Couldn't determine imzML file. {debug_info}"
    assert len(ibd_keys) == 1, f"Couldn't determine ibd file. {debug_info}"

    imzml_cobject = CloudObject(storage.backend, bucket, imzml_keys[0])
    ibd_cobject = CloudObject(storage.backend, bucket, ibd_keys[0])
    return storage, imzml_cobject, ibd_cobject
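# Hypothetical usage sketch (placeholder path and config), tying this helper to
# the LithopsImzMLReader excerpted further below:
#   storage, imzml_cobj, ibd_cobj = parse_input_path_for_lithops(sm_config, 's3://bucket/dataset/')
#   reader = LithopsImzMLReader(storage, imzml_cobj, ibd_cobj)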
def put_object(filename, bucket, backend, debug):
    log_level = logging.INFO if not debug else logging.DEBUG
    setup_lithops_logger(log_level)
    storage = Storage(backend=backend)
    logger.info('Uploading file {} to bucket {}'.format(filename, bucket))
    with open(filename, 'rb') as in_file:
        storage.put_object(bucket, filename, in_file)
    logger.info('File uploaded successfully')
def get_object(bucket, key, backend, debug):
    log_level = logging.INFO if not debug else logging.DEBUG
    setup_lithops_logger(log_level)
    storage = Storage(backend=backend)
    logger.info('Downloading object {} from bucket {}'.format(key, bucket))
    data_stream = storage.get_object(bucket, key, stream=True)
    with open(key, 'wb') as out:
        shutil.copyfileobj(data_stream, out)
    logger.info('Object downloaded successfully')
def empty_bucket(bucket, backend, debug):
    log_level = logging.INFO if not debug else logging.DEBUG
    setup_lithops_logger(log_level)
    storage = Storage(backend=backend)
    logger.info('Deleting all objects in bucket "{}"'.format(bucket))
    keys = storage.list_keys(bucket)
    logger.info('Total objects found: {}'.format(len(keys)))
    storage.delete_objects(bucket, keys)
    logger.info('All objects deleted successfully')
def upload_test_imzml(storage: Storage, sm_config, ds_config):
    """Create an ImzML file, upload it into storage, and return an imzml_reader for it"""
    with make_test_imzml(ds_config) as (imzml_path, ibd_path):
        imzml_content = open(imzml_path, 'rb').read()
        ibd_content = open(ibd_path, 'rb').read()

    bucket, prefix = sm_config['lithops']['sm_storage']['imzml']
    storage.put_cloudobject(imzml_content, bucket,
                            f'{prefix}/test_ds/test.imzML')
    storage.put_cloudobject(ibd_content, bucket, f'{prefix}/test_ds/test.ibd')
    return f'cos://{bucket}/{prefix}/test_ds'
def sort_category(keys_list, prefix, category_stack, consider_last_byte_sorted,
                  storage, id):
    storage_client = Storage()
    category_id = id
    num_bytes_already_sorted = len(category_stack)
    if not consider_last_byte_sorted:
        num_bytes_already_sorted = num_bytes_already_sorted - 1
    num_bytes_to_sort = 10 - num_bytes_already_sorted

    category_sink = io.BytesIO()
    for key_name in keys_list:
        with open(f's3://{storage.bucket}/{key_name}',
                  'rb',
                  transport_params=dict(
                      client=storage.get_client())) as source_file:

            copyfileobj(source_file, category_sink)

    category_buffer = category_sink.getbuffer()
    # 100-byte records: a 10-byte key followed by a 90-byte value (the
    # gensort/valsort format); the leading key bytes are already sorted
    record_arr = np.frombuffer(category_buffer,
                               dtype=np.dtype([
                                   ('sorted', f'V{num_bytes_already_sorted}'),
                                   ('key', f'V{num_bytes_to_sort}'),
                                   ('value', 'V90')
                               ]))
    sorted_category = np.sort(record_arr, order='key')

    with open(
            f's3://{storage.bucket}/{prefix}/{category_id}',
            'wb',
            transport_params=dict(client=storage.get_client())) as sorted_file:

        sorted_file.write(memoryview(sorted_category))

    return True
def download_file(bucket, key, out, backend, debug, config):
    if config:
        config = load_yaml_config(config)

    log_level = logging.INFO if not debug else logging.DEBUG
    setup_lithops_logger(log_level)
    storage = Storage(config=config, backend=backend)

    def download_file():
        logger.info(
            f'Downloading file {storage.backend}://{bucket}/{key} to {out or key}'
        )
        if storage.download_file(bucket, key, out):
            file_size = os.path.getsize(out or key)
            logger.info(
                f'Download File {key} - Size: {sizeof_fmt(file_size)} - Ok')
        else:
            logger.error(f'Download File {key} - Error')

    with ThreadPoolExecutor() as ex:
        future = ex.submit(download_file)
        cy = cycle(r"-\|/")
        while not future.done():
            print("Downloading file " + next(cy), end="\r")
            time.sleep(0.1)
        future.result()
def list_bucket(bucket, backend, debug):
    log_level = logging.INFO if not debug else logging.DEBUG
    setup_lithops_logger(log_level)
    storage = Storage(backend=backend)
    logger.info('Listing objects in bucket {}'.format(bucket))
    objects = storage.list_objects(bucket)

    width = max((len(obj['Key']) for obj in objects), default=10)  # default avoids max() on an empty bucket

    print('\n{:{width}} \t {} \t\t {:>9}'.format('Key', 'Last modified', 'Size', width=width))
    print('-' * width, '\t', '-' * 20, '\t', '-' * 9)
    for obj in objects:
        key = obj['Key']
        date = obj['LastModified'].strftime("%b %d %Y %H:%M:%S")
        size = sizeof_fmt(obj['Size'])
        print('{:{width}} \t {} \t {:>9}'.format(key, date, size, width=width))
    print()
def make_lithops_imzml_reader(
    storage: Storage,
    mz_precision='f',
    polarity='positive',
    ds_config=TEST_DS_CONFIG,
):
    """Create an ImzML file, upload it into storage, and return an imzml_reader for it"""
    mz_dtype = {'f': np.float32, 'd': np.float64}[mz_precision]
    with TemporaryDirectory() as tmpdir:
        with ImzMLWriter(f'{tmpdir}/test.imzML', polarity=polarity, mz_dtype=mz_dtype) as writer:
            for coords, (mzs, ints) in zip(MOCK_COORDINATES, MOCK_SPECTRA):
                writer.addSpectrum(mzs, ints, coords)

        with open(f'{tmpdir}/test.imzML', 'rb') as f:
            imzml_content = f.read()
        with open(f'{tmpdir}/test.ibd', 'rb') as f:
            ibd_content = f.read()

    imzml_cobj = storage.put_cloudobject(imzml_content)
    ibd_cobj = storage.put_cloudobject(ibd_content)
    return LithopsImzMLReader(storage, imzml_cobj, ibd_cobj)
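# The __init__ below is an excerpt from the LithopsImzMLReader class
# constructed above; its class statement and base class are omitted here.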
    def __init__(self, storage: Storage, imzml_cobject: CloudObject,
                 ibd_cobject: CloudObject):
        imzml_parser = ImzMLParser(
            storage.get_cloudobject(imzml_cobject, stream=True),
            ibd_file=None,
            parse_lib='ElementTree',
            include_spectra_metadata=METADATA_FIELDS,
        )

        self._ibd_cobject = ibd_cobject
        self.imzml_reader = imzml_parser.portable_spectrum_reader()

        super().__init__(imzml_parser)
def get_tweets(keyword, location):

    auth = tweepy.OAuthHandler(CONSUMERKEY, SECRETKEY)
    auth.set_access_token(TWITTERKEY, TWITTERSECRET)

    twitterAPI = tweepy.API(auth, wait_on_rate_limit=True)
    searchstr = '"' + keyword + '"' + " " + '"' + location + '"' + "lang:ca OR lang:es -filter:retweets"  # Only look for tweets in catalan or spanish and exclude retweets

    list_tweets = [
    ]  # In this dictionary array we will store the structured tweets

    # Start to iterate over the twitter API to download tweets
    for tweet in tweepy.Cursor(
            twitterAPI.search, q=searchstr,
            tweet_mode="extended").items(500):  # numberOftwets
        # Start saving tweets, separating all the relevant data
        tweetstr = tweet.full_text
        url = "https://twitter.com/twitter/statuses/" + str(tweet.id)
        fecha = tweet.created_at.strftime("%m/%d/%Y %H:%M:%S")
        localizacion = str(tweet.user.location)
        packed_tweet = {
            "Texto tweet": tweetstr,
            "URL": url,
            "Fecha": fecha,
            # Location of the tweet's author, not of the topic (Madrid, Catalonia, etc.)
            "Ubicacion": localizacion
        }

        list_tweets.append(packed_tweet)

    # Add all the tweets from the list to another dictionary
    packed_tweets = {"tweets": list_tweets}

    # Upload them to the cloud object storage
    storage = Storage()
    storage.put_object(bucket=STORAGEBUCKET,
                       key=keyword + location + ".json",
                       body=json.dumps(packed_tweets))
def analyze_tweets(keyword, location):
    # Get the data from cloud
    storage = Storage()
    json_tweets = storage.get_object(bucket=STORAGEBUCKET,
                                     key=keyword + location + ".json")
    packed_tweets = json.loads(json_tweets)

    analisador = SentimentIntensityAnalyzer()

    # CSV columns: URL, Fecha, Sentiment
    with open(keyword + location + ".csv", 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["URL", "Fecha", "Sentiment"])
        # Translate each downloaded tweet to English, run sentiment analysis,
        # and write a row to the CSV
        for tweet in packed_tweets["tweets"]:
            tweetstr = mtranslate.translate(str(tweet["Texto tweet"]), "en",
                                            "auto")
            writer.writerow([
                str(tweet["URL"]),
                str(tweet["Fecha"]),
                str(analisador.polarity_scores(tweetstr)['compound'])
            ])
def generate_command(number, prefix, partitions, image):
    bucket = None
    with FunctionExecutor(runtime=image) as fexec:
        bucket = fexec.config['lithops']['storage_bucket']
        futures = fexec.map(generate_records,
                            range(partitions),
                            extra_args=[number, prefix],
                            include_modules=['util'])
        results = fexec.get_result(fs=futures)
        # print(results)

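    # record_size is assumed to be a module-level constant (bytes per generated record)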
    partition_size = record_size * number

    # Check if all files have been uploaded
    storage_client = Storage()
    partition_list = storage_client.list_objects(bucket, prefix + '/')
    assert len(partition_list) == partitions, \
        f'partition_list: {len(partition_list)}; partitions: {partitions}'
    for info in partition_list:
        assert info['Size'] == partition_size, \
            f'partition size: {partition_size}\ninfo: {info}'

    print('Done!')
def delete_object(bucket, key, prefix, backend, debug):
    log_level = logging.INFO if not debug else logging.DEBUG
    setup_lithops_logger(log_level)
    storage = Storage(backend=backend)

    if key:
        logger.info('Deleting object "{}" from bucket "{}"'.format(key, bucket))
        storage.delete_object(bucket, key)
        logger.info('Object deleted successfully')
    elif prefix:
        objs = storage.list_keys(bucket, prefix)
        logger.info('Deleting {} objects with prefix "{}" from bucket "{}"'.format(len(objs), prefix, bucket))
        storage.delete_objects(bucket, objs)
        logger.info('Object deleted successfully')
BUCKET_NAME = 'lithops-sample-data'  # change-me


def my_function(obj_id, storage):
    print(obj_id)

    data = storage.get_cloudobject(obj_id)

    return data.decode()


if __name__ == '__main__':

    obj_key = 'cloudobject1.txt'
    storage = Storage()
    obj_id = storage.put_cloudobject('Hello World!', BUCKET_NAME, obj_key)
    print(obj_id)

    fexec = FunctionExecutor()
    fexec.call_async(my_function, obj_id)
    print(fexec.get_result())

    obj_key = 'cloudobject2.txt'
    storage = fexec.storage
    obj_id = storage.put_cloudobject('Hello World!', BUCKET_NAME, obj_key)
    print(obj_id)

    fexec.call_async(my_function, obj_id)
    print(fexec.get_result())
def sort_command(input_prefix, output_prefix, max_parallelism, image):
    storage_client = Storage()
    bucket = None
    input_info_list = None

    with FunctionExecutor(runtime=image, workers=max_parallelism) as fexec:
        bucket = fexec.config['lithops']['storage_bucket']
        input_info_list = storage_client.list_objects(bucket,
                                                      input_prefix + '/')
        input_size = sum(info['Size'] for info in input_info_list)
        (num_shuffles, last_values_per_category) = make_plan(input_size)

        current_values_per_category = 1
        current_prefix = input_prefix
        current_keys_list = [{
            'keys_list': [key_name],
            'prefix': input_prefix + '-intermediate0',
            'category_stack': []
        } for key_name in storage_client.list_keys(bucket, input_prefix + '/')]
        for current_shuffle in range(num_shuffles):
            # Change values per category of last shuffle
            if current_shuffle == num_shuffles - 1:
                current_values_per_category = last_values_per_category

            radix_sort_futures = fexec.map(radix_sort_by_byte,
                                           current_keys_list,
                                           extra_args={
                                               'values_per_category':
                                               current_values_per_category
                                           },
                                           include_modules=['util'])
            radix_sort_results = fexec.get_result(fs=radix_sort_futures)

            categories_keys_lists = {}
            for res in radix_sort_results:
                intermediate_keys_list = res['keys_list']
                input_category_stack = res['category_stack']
                for key_name in intermediate_keys_list:
                    category_id = int(key_name.rsplit(sep='/', maxsplit=3)[-3])
                    new_category_stack = input_category_stack + [category_id]
                    new_category_stack_str = '/'.join(
                        [str(x) for x in new_category_stack])
                    if new_category_stack_str in categories_keys_lists:
                        categories_keys_lists[new_category_stack_str].append(
                            key_name)
                    else:
                        categories_keys_lists[new_category_stack_str] = [
                            key_name
                        ]

            # Partition category lists
            # Attach prefix metadata so that sorter knows what to name files
            each_category_size = input_size / (
                (256 / current_values_per_category) * (current_shuffle + 1))
            num_partitions_per_category = math.ceil(each_category_size /
                                                    buffer_size_to_categorize)

            current_keys_list = []
            for category_stack_str, cat_keys_list in categories_keys_lists.items():
                for sub_list in np.array_split(cat_keys_list,
                                               num_partitions_per_category):
                    partition_entry = {
                        'keys_list': sub_list,
                        'prefix': f'{input_prefix}-intermediate{current_shuffle + 1}',
                        'category_stack': [int(x) for x in category_stack_str.split('/')]
                    }
                    current_keys_list.append(partition_entry)

        # The last byte is already in order when the final pass used one value per category
        consider_last_byte_sorted = (last_values_per_category == 1)
        for entry in current_keys_list:
            entry['prefix'] = output_prefix
        sorted_keys_list = sorted(current_keys_list,
                                  key=lambda x: x['category_stack'])
        sort_category_futures = fexec.map(sort_category,
                                          sorted_keys_list,
                                          extra_args={
                                              'consider_last_byte_sorted':
                                              consider_last_byte_sorted
                                          },
                                          include_modules=['util'])
        results = fexec.get_result(fs=sort_category_futures)
        # print(results)

    # Check if size of output matches size of input

    output_info_list = storage_client.list_objects(bucket, output_prefix)
    output_size = sum(info['Size'] for info in output_info_list)
    assert input_size == output_size, f'input size: {input_size}, output_size: {output_size}'

    print('Done!')