Example #1
0
def run_h5_convert(results_type=None):
    """Scan results datasets and report which ones lack converted csv outputs.

    Args:
        results_type (str, optional): restrict the scan to results datasets of
            this type; when None, every results dataset is scanned.

    Datasets that already contain a ``.csv.gz.yaml`` file are logged and
    skipped; for the rest the dataset id is printed as needing conversion.
    """
    tantalus_api = TantalusApi()

    remote_storage_client = tantalus_api.get_storage_client(
        remote_storage_name)

    if results_type is None:
        results_list = tantalus_api.list("resultsdataset")
        logging.info('converting all results')
    else:
        results_list = tantalus_api.list("resultsdataset",
                                         results_type=results_type)
        logging.info(
            'converting results with results type {}'.format(results_type))

    for result in results_list:
        logging.info('processing results dataset {}'.format(result['id']))

        try:
            file_instances = tantalus_api.get_dataset_file_instances(
                result["id"],
                "resultsdataset",
                remote_storage_name,
            )

            filenames = {
                instance['file_resource']['filename']
                for instance in file_instances
            }

            # A csv.gz.yaml file marks a dataset as already converted.
            csv_yaml_filename = next(
                (name for name in filenames if name.endswith('.csv.gz.yaml')),
                None)

            if csv_yaml_filename is not None:
                logging.info('found filename {}, skipping conversion'.format(
                    csv_yaml_filename))
            else:
                print(result["id"])
                logging.info('no yaml found')

        except NotFoundError:
            logging.exception('no files found for conversion')

        except KeyboardInterrupt:
            raise

        except Exception:
            logging.exception('conversion failed')
Example #2
0
def fix_bams(jira_ticket=None, dry_run=False):
    """Move misplaced single cell WGS bams to the blob storage account.

    Bams produced by align analyses of specific versions ended up on the
    results storage account.  For each affected dataset this copies every
    bam blob to the blob storage under the canonical path template, then
    updates the tantalus file resource filename and file instance storage.

    Args:
        jira_ticket (str, optional): fix only the completed align analysis
            for this ticket; when None, all completed align analyses with
            versions v0.5.2-v0.5.4 are processed.
        dry_run (bool): when True, only log the copies and updates that
            would be performed.
    """

    logging.info(f'dry run: {dry_run}')

    tantalus_api = TantalusApi()

    # Canonical destination path template for single cell WGS bams.
    SC_WGS_BAM_DIR_TEMPLATE = os.path.join(
        'single_cell_indexing',
        'bam',
        '{library_id}',
        '{ref_genome}',
        '{aligner_name}',
        'numlanes_{number_lanes}',
        '{jira_ticket}',
    )

    # Map tantalus reference genome labels to the names used in blob paths.
    reference_genome_map = {
        'HG19': 'grch37',
        'MM10': 'mm10',
    }

    analyses_list = []
    from_storage_name = "singlecellresults"
    to_storage_name = "singlecellblob"
    from_storage_client = tantalus_api.get_storage_client(from_storage_name)
    to_storage_client = tantalus_api.get_storage_client(to_storage_name)
    to_storage_id = tantalus_api.get('storage', name=to_storage_name)['id']

    if jira_ticket is not None:
        analyses_list.append(
            tantalus_api.get('analysis',
                             jira_ticket=jira_ticket,
                             analysis_type__name="align",
                             status="complete"))

    else:
        # Get all completed align analyses ran with specific version
        # the bams associated to these analyses are in the wrong storage account
        for version in ('v0.5.2', 'v0.5.3', 'v0.5.4'):
            analyses = tantalus_api.list('analysis',
                                         analysis_type__name="align",
                                         status="complete",
                                         version=version)
            analyses_list += [a for a in analyses]

    for analysis in analyses_list:
        jira_ticket = analysis["jira_ticket"]
        print(f"moving bams for {jira_ticket}")

        # get all bam datasets associated with the jira ticket
        bam_datasets = tantalus_api.list(
            "sequencedataset",
            dataset_type="BAM",
            analysis__jira_ticket=jira_ticket,
        )

        for dataset in bam_datasets:
            # Get number of lanes from dataset for use with filepath
            lanes = set()
            for sequence_lane in dataset['sequence_lanes']:
                lane = "{}_{}".format(sequence_lane['flowcell_id'],
                                      sequence_lane['lane_number'])
                lanes.add(lane)
            number_lanes = len(lanes)

            try:
                file_instances = tantalus_api.get_dataset_file_instances(
                    dataset["id"],
                    "sequencedataset",
                    from_storage_name,
                )
            except dbclients.tantalus.DataNotOnStorageError:
                # Dataset has no instances on the source storage; nothing to move.
                logging.info(
                    f'dataset {dataset["id"]} not on {from_storage_name}, skipping'
                )
                continue

            for file_instance in file_instances:
                blobname = file_instance["file_resource"]["filename"]

                # get url of source blob
                blob_url = from_storage_client.get_url(blobname)

                # Keep everything after the bams/ directory as the relative name.
                bam_filename = blobname.split("/bams/")[1]
                new_blobname = os.path.join(
                    SC_WGS_BAM_DIR_TEMPLATE.format(
                        library_id=dataset["library"]["library_id"],
                        ref_genome=reference_genome_map[
                            dataset["reference_genome"]],
                        aligner_name=dataset["aligner"],
                        number_lanes=number_lanes,
                        jira_ticket=jira_ticket,
                    ),
                    bam_filename,
                )

                # copy blob to desired storage account with new blobname
                blob_filepath = f"{to_storage_client.prefix}/{new_blobname}"
                logging.info(
                    f'copying {new_blobname} to storage {to_storage_name} from {blob_url} to {blob_filepath}'
                )
                if not dry_run:
                    # NOTE(review): container name is hard-coded to "data" —
                    # confirm it matches the to_storage_client container.
                    to_storage_client.blob_service.copy_blob(
                        container_name="data",
                        blob_name=new_blobname,
                        copy_source=blob_url,
                    )

                file_resource_id = file_instance['file_resource']['id']
                file_instance_id = file_instance['id']

                # Point the file resource at the new blob name...
                logging.info(
                    f'updating file resource {file_resource_id} to have filename {new_blobname}'
                )
                if not dry_run:
                    tantalus_api.update('file_resource',
                                        id=file_resource_id,
                                        filename=new_blobname)

                # ...and move the file instance to the destination storage.
                logging.info(
                    f'updating file instance {file_instance_id} to have storage with id {to_storage_id}'
                )
                if not dry_run:
                    tantalus_api.update('file_instance',
                                        id=file_instance_id,
                                        storage=to_storage_id)
Example #3
0
def rename_fastqs(dataset_id, storage_name, dry_run=False, check_only=False):
    """Rename a dataset's fastq files to the canonical template path.

    Each non-conforming fastq of the single-lane sequence dataset is copied
    to its SC_WGS_FQ_TEMPLATE path, swapped in tantalus, and the old blob
    deleted.

    Args:
        dataset_id (int): id of the sequencedataset to rename.
        storage_name (str): storage holding the dataset's file instances.
        dry_run (bool): when True, only log the renames that would happen.
        check_only (bool): when True, raise on the first non-conforming
            filename instead of renaming anything.

    Raises:
        Exception: in check_only mode, if a filename does not conform.
        AssertionError: if the dataset has more than one sequence lane.
    """
    logging.info(f'dataset: {dataset_id}')
    logging.info(f'dry run: {dry_run}')

    tantalus_api = TantalusApi()
    storage_client = tantalus_api.get_storage_client(storage_name)

    dataset = tantalus_api.get('sequencedataset', id=dataset_id)

    file_instances = tantalus_api.get_dataset_file_instances(
        dataset['id'],
        'sequencedataset',
        storage_name,
    )

    for file_instance in file_instances:
        filename = file_instance['file_resource']['filename']

        # Metadata files are not fastqs; leave them untouched.
        if os.path.basename(filename) == 'metadata.yaml':
            continue

        # The template below assumes exactly one sequence lane.
        assert len(dataset['sequence_lanes']) == 1

        parts = filename.split('/')
        basename = os.path.basename(filename)

        # A conforming path looks like:
        # single_cell_indexing/fastq/{library}/{flowcell}_{lane}/{sample}/...
        non_conforming = False
        try:
            assert parts[0] == 'single_cell_indexing'
            assert parts[1] == 'fastq'
            assert parts[2] == dataset['library']['library_id']
            assert parts[3].split(
                '_')[0] == dataset['sequence_lanes'][0]['flowcell_id']
            assert parts[3].split(
                '_')[1] == dataset['sequence_lanes'][0]['lane_number']
            assert parts[4] == dataset['sample']['sample_id']
        except AssertionError:
            non_conforming = True

        if check_only:
            if non_conforming:
                # BUGFIX: report which filename failed the check; the message
                # previously contained no interpolated filename.
                raise Exception(f'filename {filename} does not conform')
            continue

        new_filename = SC_WGS_FQ_TEMPLATE.format(
            dlp_library_id=dataset['library']['library_id'],
            flowcell_id=dataset['sequence_lanes'][0]['flowcell_id'],
            lane_number=dataset['sequence_lanes'][0]['lane_number'],
            cell_sample_id=dataset['sample']['sample_id'],
            cell_filename=basename,
        )

        if new_filename == filename:
            # BUGFIX: include the filename being skipped in the log message.
            logging.info(f'skipping conforming {filename} on {storage_name}')
            continue

        # BUGFIX: include the source filename in the rename log message.
        logging.info(
            f'renaming {filename} to {new_filename} on {storage_name}')

        if not dry_run:
            # Copy first (idempotent if destination exists), swap the
            # tantalus record, then delete the old blob.
            if not storage_client.exists(new_filename):
                storage_client.copy(filename, new_filename, wait=True)
            tantalus_api.swap_file(file_instance, new_filename)
            storage_client.delete(filename)
def run_h5_convert(cache_dir,
                   dataset_id=None,
                   results_type=None,
                   redo=False,
                   dry_run=False,
                   check_done=False):
    """Convert h5 result files to csv.gz and register the outputs in tantalus.

    For each selected results dataset, every ``.h5`` file is cached locally,
    converted table-by-table to csv.gz (+ companion .yaml), uploaded to the
    remote storage, added as file resources to the dataset, and the local
    cache files are cleaned up.

    Args:
        cache_dir (str): local directory used to cache h5 files.
        dataset_id (int, optional): convert only this results dataset.
        results_type (str, optional): convert all results of this type;
            ignored when dataset_id is given.
        redo (bool): overwrite csv outputs that already exist.
        dry_run (bool): log planned conversions without converting.
        check_done (bool): skip datasets that already contain a
            ``.csv.gz.yaml`` file (excluding destruct outputs).
    """
    tantalus_api = TantalusApi()

    local_cache_client = tantalus_api.get_cache_client(cache_dir)
    # NOTE(review): remote_storage_name is a module-level global — confirm
    # it is defined where this function is used.
    remote_storage_client = tantalus_api.get_storage_client(
        remote_storage_name)

    # Select the datasets to process: one by id, all of a type, or all.
    if dataset_id is not None:
        results_list = [tantalus_api.get("resultsdataset", id=dataset_id)]
        logging.info('converting results with id {}'.format(dataset_id))

    elif results_type is not None:
        results_list = tantalus_api.list("resultsdataset",
                                         results_type=results_type)
        logging.info(
            'converting results with results type {}'.format(results_type))

    else:
        results_list = tantalus_api.list("resultsdataset")
        logging.info('converting all results')

    for result in results_list:
        logging.info('processing results dataset {}'.format(result['id']))

        try:
            file_instances = tantalus_api.get_dataset_file_instances(
                result["id"],
                "resultsdataset",
                remote_storage_name,
            )

            existing_filenames = set(
                [i['file_resource']['filename'] for i in file_instances])

            # A csv.gz.yaml marks the dataset as already converted.
            found_csv_yaml = False
            for existing_filename in existing_filenames:
                # Destruct outputs csv.yaml directly, check non destruct files
                if 'destruct' in existing_filename:
                    continue
                if existing_filename.endswith('.csv.gz.yaml'):
                    found_csv_yaml = True
                    break

            if found_csv_yaml and check_done:
                logging.info('found filename {}, skipping conversion'.format(
                    existing_filename))
                continue

            # Ids of newly added file resources, to append to the dataset.
            file_resource_ids = []

            # Local filepaths (cached h5s and converted csvs) removed at the end.
            filepaths_to_clean = []

            for file_instance in file_instances:
                if not file_instance['file_resource']['filename'].endswith(
                        '.h5'):
                    continue

                # Download the h5 into the local cache before converting.
                datamanagement.transfer_files.cache_file(
                    tantalus_api, file_instance, cache_dir)

                h5_filepath = local_cache_client.get_url(
                    file_instance['file_resource']['filename'])

                filepaths_to_clean.append(h5_filepath)

                logging.info('converting {}'.format(h5_filepath))

                # Convert each table (key) of the h5 to its own csv file.
                for key, csv_filepath in get_h5_csv_info(h5_filepath):
                    if not csv_filepath.startswith(cache_dir):
                        raise Exception(
                            'unexpected csv path {}'.format(csv_filepath))

                    # Storage-relative filename: strip the cache dir prefix.
                    csv_filename = csv_filepath[len(cache_dir):]
                    csv_filename = csv_filename.lstrip('/')

                    if csv_filename in existing_filenames and not redo:
                        logging.info(
                            'file {} already exists, not converting'.format(
                                csv_filename))
                        continue

                    if dry_run:
                        logging.info('would convert {}, key {} to {}'.format(
                            h5_filepath, key, csv_filepath))
                        continue

                    logging.info('converting {}, key {} to {}'.format(
                        h5_filepath, key, csv_filepath))
                    convert_h5(h5_filepath, key, csv_filepath)

                    yaml_filename = csv_filename + '.yaml'
                    yaml_filepath = csv_filepath + '.yaml'

                    fileinfo_to_add = [
                        (csv_filename, csv_filepath),
                        (yaml_filename, yaml_filepath),
                    ]

                    # Upload both files and register them in tantalus.
                    for filename, filepath in fileinfo_to_add:
                        logging.info('creating file {} from path {}'.format(
                            filename, filepath))

                        remote_storage_client.create(filename,
                                                     filepath,
                                                     update=redo)
                        remote_filepath = os.path.join(
                            remote_storage_client.prefix, filename)

                        logging.info('adding file {} from path {}'.format(
                            filename, remote_filepath))

                        # NOTE(review): always passes update=True; the trailing
                        # comment suggests update=redo was intended — confirm.
                        (file_resource, file_instance) = tantalus_api.add_file(
                            remote_storage_name, remote_filepath,
                            update=True)  #redo)

                        file_resource_ids.append(file_resource["id"])
                        filepaths_to_clean.append(filepath)

            if len(file_resource_ids) == 0:
                logging.warning('no files added')
                continue

            logging.info('adding file resources {} to dataset {}'.format(
                file_resource_ids, result["id"]))

            # Append the new file resources to the dataset's existing ones.
            tantalus_api.update(
                "resultsdataset",
                result["id"],
                file_resources=result["file_resources"] + file_resource_ids,
            )

            # Remove cached and converted local files.
            for filepath in filepaths_to_clean:
                logging.info('removing file {}'.format(filepath))
                os.remove(filepath)

        except NotFoundError:
            logging.exception('no files found for conversion')

        except KeyboardInterrupt:
            raise

        except Exception:
            logging.exception('conversion failed')
def main(
    storage_name,
    dataset_type,
    dataset_id=None,
    tag_name=None,
    check_remote=None,
    dry_run=False,
):
    """Delete a dataset's file instances from a storage.

    Datasets are selected either by id or by tag (exactly one must be
    given).  Optionally, each dataset is first verified to be present and
    intact on a remote storage; datasets failing any file check on either
    storage are skipped.

    Args:
        storage_name (str): storage from which file instances are deleted.
        dataset_type (str): dataset table, e.g. 'sequencedataset' or
            'resultsdataset'.
        dataset_id (int, optional): select a single dataset by id.
        tag_name (str, optional): select all datasets carrying this tag.
        check_remote (str, optional): name of a storage that must hold an
            intact copy of each dataset before deletion proceeds.
        dry_run (bool): log deletions without marking anything deleted.

    Raises:
        ValueError: if neither or both of dataset_id and tag_name are given.
    """
    logging.info('cleanup up storage {}'.format(storage_name))

    if check_remote:
        logging.info('checking remote {}'.format(check_remote))
    else:
        logging.warning('not checking remote')

    tantalus_api = TantalusApi()

    storage_client = tantalus_api.get_storage_client(storage_name)

    # Fetching the remote client up front validates the storage exists.
    remote_client = None
    if check_remote is not None:
        remote_client = tantalus_api.get_storage_client(check_remote)

    # Exactly one of dataset_id / tag_name must be provided.
    if dataset_id is None and tag_name is None:
        raise ValueError('require either dataset id or tag name')

    if dataset_id is not None and tag_name is not None:
        raise ValueError('require exactly one of dataset id or tag name')

    if dataset_id is not None:
        logging.info('cleanup up dataset {}, {}'.format(
            dataset_id, dataset_type))
        datasets = tantalus_api.list(dataset_type, id=dataset_id)

    if tag_name is not None:
        logging.info('cleanup up tag {}'.format(tag_name))
        datasets = tantalus_api.list(dataset_type, tags__name=tag_name)

    total_data_size = 0
    file_num_count = 0

    for dataset in datasets:
        logging.info('checking dataset with id {}, name {}'.format(
            dataset['id'], dataset['name']))

        # Optionally skip datasets not present and intact on the remote storage
        if check_remote is not None:
            # BUGFIX: use the dataset_type argument instead of the hard-coded
            # 'sequencedataset', which broke cleanup of results datasets.
            if not tantalus_api.is_dataset_on_storage(
                    dataset['id'], dataset_type, check_remote):
                # BUGFIX: the format string had one placeholder for two
                # arguments, silently dropping the remote storage name.
                logging.warning(
                    'not deleting dataset with id {}, not on remote storage {}'.
                    format(dataset['id'], check_remote))
                continue

            # For each file instance on the remote, check if it exists and has the correct size in tantalus
            remote_file_size_check = True
            for file_instance in tantalus_api.get_dataset_file_instances(
                    dataset['id'], dataset_type, check_remote):
                try:
                    tantalus_api.check_file(file_instance)
                except DataError:
                    logging.exception('check file failed')
                    remote_file_size_check = False

            # Skip this dataset if any files failed
            if not remote_file_size_check:
                logging.warning(
                    "skipping dataset {} that failed check on {}".format(
                        dataset['id'], check_remote))
                continue

        # Check consistency with the removal storage
        file_size_check = True
        for file_instance in tantalus_api.get_dataset_file_instances(
                dataset['id'], dataset_type, storage_name):
            try:
                tantalus_api.check_file(file_instance)
            except DataError:
                logging.exception('check file failed')
                file_size_check = False

        # Skip this dataset if any files failed
        if not file_size_check:
            logging.warning(
                "skipping dataset {} that failed check on {}".format(
                    dataset['id'], storage_name))
            continue

        # Delete all files for this dataset
        for file_instance in tantalus_api.get_dataset_file_instances(
                dataset['id'], dataset_type, storage_name):
            if dry_run:
                logging.info(
                    "would delete file instance with id {}, filepath {}".
                    format(file_instance['id'], file_instance['filepath']))
            else:
                logging.info(
                    "deleting file instance with id {}, filepath {}".format(
                        file_instance['id'], file_instance['filepath']))
                # Soft delete: instances are flagged rather than removed.
                tantalus_api.update("file_instance",
                                    id=file_instance['id'],
                                    is_deleted=True)
            total_data_size += file_instance['file_resource']['size']
            file_num_count += 1

    logging.info("deleted a total of {} files with size {} bytes".format(
        file_num_count, total_data_size))