Ejemplo n.º 1
0
def _replace_prefix(text, prefix, replacement=''):
    """Swap a leading *prefix* of *text* for *replacement*.

    Only a match at the very start of *text* is rewritten; *text* is returned
    unchanged when it does not start with *prefix*.
    """
    if text.startswith(prefix):
        return replacement + text[len(prefix):]
    return text


def _bin_to_json(path):
    """Return *path* with a trailing ``.bin`` replaced by ``.json``."""
    if path.endswith('.bin'):
        return path[:-len('.bin')] + '.json'
    return path


def worker_task(instance_no, total_instances, bin_data_source_blob):
    """
    Download, process and upload the blobs assigned to this worker.

    The blobs returned by ``assign_files`` are downloaded below ``~/raw_data``,
    each downloaded file is passed to ``log_parser.main`` together with a
    target path below ``~/processed/<bin_data_source_blob>``, and that target
    file is uploaded to ``BUCKET_NAME`` under its path relative to the home
    directory.

    instance_no belongs to [0, total_instances - 1]

    :param instance_no: the instance_no, this process is running on
    :param total_instances: total no. of instances
    :param bin_data_source_blob: blob name (prefix) of the binary data
    """
    log_info = log.info if log else print_alias

    # Expanded once; used to turn absolute local paths back into blob names.
    home_prefix = os.path.expanduser('~/')

    BIN_DATA_STORAGE = os.path.expanduser(
        '~/raw_data')  # binary will be stored in ~/raw_data
    PROCESSED_DATA_BLOB_NAME = "processed/" + bin_data_source_blob  # blob name for processed data
    PROCESSED_DATA_STORAGE = os.path.expanduser(
        '~/' + PROCESSED_DATA_BLOB_NAME)  # processed data storage loc

    assigned_blobs = assign_files(instance_no=instance_no,
                                  total_instances=total_instances,
                                  bin_data_source_blob=bin_data_source_blob)
    log_info("Instance_no: {}".format(instance_no))
    log_info('Blobs assigned: ' + str(assigned_blobs))

    # downloading the files
    file_names = []
    for blob in assigned_blobs:  # downloading bin files
        # Strip only the leading source prefix: a nested directory that
        # happens to repeat the prefix must survive in the relative name.
        rel_file_name = _replace_prefix(blob.name, bin_data_source_blob)
        joinable_rel_file_name = get_joinable_rear_path(rel_file_name)
        filename = os.path.join(
            BIN_DATA_STORAGE,
            joinable_rel_file_name)  # absolute path for raw_data
        make_dirs(os.path.dirname(filename))
        blob.download_to_filename(filename)
        log_info('File {} downloaded to {}'.format(str(blob.name), filename))
        file_names.append(filename)

    # NOTE(review): save_names / upload_names are filled but not read in the
    # visible part of this function; kept for any later code that uses them.
    save_names = []
    upload_names = []
    for filename in file_names:
        # processing the file
        # Move the path from the raw root to the processed root and swap only
        # the trailing extension, so a '.bin' elsewhere in the path (home
        # directory, blob directory, 'a.binary', ...) is left alone.
        save_filename = _bin_to_json(
            _replace_prefix(filename, BIN_DATA_STORAGE,
                            PROCESSED_DATA_STORAGE))
        make_dirs(os.path.dirname(save_filename))
        log_parser.main(log, filename=filename, save_filename=save_filename)
        save_names.append(save_filename)

        # uploading the file
        # Drop only the leading home prefix. An unanchored replace would strip
        # every '/' from the name when HOME is '/', because expanduser('~/')
        # is then '/'.
        upload_name = _replace_prefix(save_filename, home_prefix)
        upload_blob(source_file_path=save_filename,
                    destination_blob_name=upload_name,
                    bucket_name=BUCKET_NAME)
        upload_names.append(upload_name)