Example #1
def run_h5_convert(results_type=None):
    tantalus_api = TantalusApi()

    remote_storage_client = tantalus_api.get_storage_client(
        remote_storage_name)

    if results_type is not None:
        results_list = tantalus_api.list("resultsdataset",
                                         results_type=results_type)
        logging.info(
            'converting results with results type {}'.format(results_type))

    else:
        results_list = tantalus_api.list("resultsdataset")
        logging.info('converting all results')

    for result in results_list:
        logging.info('processing results dataset {}'.format(result['id']))

        try:
            file_instances = tantalus_api.get_dataset_file_instances(
                result["id"],
                "resultsdataset",
                remote_storage_name,
            )

            existing_filenames = set(
                [i['file_resource']['filename'] for i in file_instances])

            found_csv_yaml = False
            for existing_filename in existing_filenames:
                if existing_filename.endswith('.csv.gz.yaml'):
                    found_csv_yaml = True
                    break

            if found_csv_yaml:
                logging.info('found filename {}, skipping conversion'.format(
                    existing_filename))

            else:
                print(result["id"])
                logging.info('no yaml found')

        except NotFoundError:
            logging.exception('no files found for conversion')

        except KeyboardInterrupt:
            raise

        except Exception:
            logging.exception('conversion failed')
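Note that run_h5_convert above reads remote_storage_name from module scope rather than taking it as an argument. A minimal sketch of how the function might be exposed as a command-line entry point with click, assuming a module-level constant; the option name and the default storage value below are assumptions, not taken from the original module:

import click

remote_storage_name = "singlecellresults"  # assumed module-level constant

@click.command()
@click.option('--results_type', default=None, help='only convert results of this type')
def run_h5_convert_cmd(results_type):
    # delegate to the run_h5_convert function defined above
    run_h5_convert(results_type=results_type)

if __name__ == '__main__':
    run_h5_convert_cmd()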
Example #2
def create_fastq_metadata_yaml(library_id, storage_name, dry_run=False):
    """
    Create a metadata.yaml file for all FASTQ datasets for a library id.
    """
    tantalus_api = TantalusApi()

    storage = tantalus_api.get_storage(storage_name)
    client = tantalus_api.get_storage_client(storage_name)

    for dataset_info, metadata in create_lane_fastq_metadata(
            tantalus_api, library_id):
        metadata_filename = os.path.join(dataset_info['base_dir'],
                                         'metadata.yaml')
        metadata_filepath = tantalus_api.get_filepath(storage_name,
                                                      metadata_filename)

        metadata_io = io.BytesIO()
        metadata_io.write(
            yaml.dump(metadata, default_flow_style=False).encode())

        logging.info(f'writing metadata to file {metadata_filepath}')
        client.write_data(metadata_filename, metadata_io)

        logging.info(f'adding {metadata_filepath} to tantalus')

        if not dry_run:
            file_resource, file_instance = tantalus_api.add_file(
                storage_name, metadata_filepath, update=True)

            for dataset_id in dataset_info['dataset_ids']:
                dataset = tantalus_api.get('sequencedataset', id=dataset_id)

                new_file_resources = set(dataset['file_resources'])
                new_file_resources.add(file_resource['id'])

                tantalus_api.update('sequencedataset',
                                    id=dataset_id,
                                    file_resources=list(new_file_resources))
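The YAML here is serialized into an in-memory buffer before being handed to the storage client. A standalone sketch of that pattern, using an illustrative metadata dict rather than the real output of create_lane_fastq_metadata:

import io
import yaml

# illustrative stand-in for the per-dataset metadata
metadata = {
    'meta': {'type': 'dlpfastqs'},
    'filenames': ['SAMPLE_1.fastq.gz', 'SAMPLE_2.fastq.gz'],
}

metadata_io = io.BytesIO()
metadata_io.write(yaml.dump(metadata, default_flow_style=False).encode())
metadata_io.seek(0)  # rewind so a reader starts at the beginning of the buffer

print(metadata_io.read().decode())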
Example #3
def add_fastq_metadata_yaml(dataset_id, storage_name, dry_run=False):
    """
    Create a metadata.yaml file for a dataset and add to tantalus.
    """
    tantalus_api = TantalusApi()

    storage = tantalus_api.get_storage(storage_name)
    client = tantalus_api.get_storage_client(storage_name)

    metadata, base_dir = create_lane_fastq_metadata(tantalus_api, dataset_id)

    metadata_filename = os.path.join(base_dir, 'metadata.yaml')
    metadata_filepath = tantalus_api.get_filepath(storage_name,
                                                  metadata_filename)

    metadata_io = io.BytesIO()
    metadata_io.write(yaml.dump(metadata, default_flow_style=False).encode())

    print(f'writing metadata to file {metadata_filepath}')
    client.write_data(metadata_filename, metadata_io)

    print(f'adding {metadata_filepath} to tantalus')

    if not dry_run:
        file_resource, file_instance = tantalus_api.add_file(storage_name,
                                                             metadata_filepath,
                                                             update=True)

        dataset = tantalus_api.get('sequencedataset', id=dataset_id)

        new_file_resources = set(dataset['file_resources'])
        new_file_resources.add(file_resource['id'])

        tantalus_api.update('sequencedataset',
                            id=dataset_id,
                            file_resources=list(new_file_resources))
Ejemplo n.º 4
0
def main(
    storage_name,
    dry_run=False,
    check_remote=None,
):
    tantalus_api = TantalusApi()

    storage_client = tantalus_api.get_storage_client(storage_name)

    remote_storage_client = None
    if check_remote is not None:
        remote_storage_client = tantalus_api.get_storage_client(check_remote)

    file_instances = tantalus_api.list('file_instance',
                                       storage__name=storage_name,
                                       is_deleted=True)

    # DEBUG: check whether we are getting back
    # consistent ordered results from tantalus
    file_instances = list(file_instances)
    file_instance_ids = set([f['id'] for f in file_instances])
    if len(file_instances) != len(file_instance_ids):
        raise Exception('received duplicate results from tantalus')

    logging.info('processing {} file instances'.format(len(file_instance_ids)))
    logging.info('processing the following file instances: {}'.format(
        str(file_instance_ids)))

    for file_instance in file_instances:
        file_resource = tantalus_api.get(
            'file_resource', id=file_instance['file_resource']['id'])
        all_file_instances = list(
            tantalus_api.list('file_instance',
                              file_resource=file_resource['id']))

        logging.info(
            'checking file instance {}, file resource {}, filepath {}'.format(
                file_instance['id'], file_resource['id'],
                file_instance['filepath']))

        sequencedatasets = tantalus_api.list(
            'sequencedataset', file_resources__id=file_resource['id'])
        resultsdatasets = tantalus_api.list(
            'resultsdataset', file_resources__id=file_resource['id'])

        sequencedataset_ids = list(set([a['id'] for a in sequencedatasets]))
        resultsdataset_ids = list(set([a['id'] for a in resultsdatasets]))

        logging.info(
            'file resource {} belongs to sequencedataset {} and resultsdatasets {}'
            .format(file_resource['id'], sequencedataset_ids,
                    resultsdataset_ids))

        # Optionally check for a remote version
        if remote_storage_client:
            remote_instance = None
            for other_instance in file_resource['file_instances']:
                if other_instance['storage']['name'] == check_remote:
                    remote_instance = other_instance
                    break

            if not remote_instance:
                logging.info(
                    'not deleting file instance {}, no other instance'.format(
                        file_instance['id']))
                continue

            if remote_instance['is_deleted']:
                logging.info(
                    'not deleting file instance {}, other instance {} deleted'.
                    format(file_instance['id'], remote_instance['id']))
                continue

            if not remote_storage_client.exists(file_resource['filename']):
                logging.info(
                    'not deleting file instance {}, other instance {} does not exist'
                    .format(file_instance['id'], remote_instance['id']))
                continue

            logging.info(
                'deletion ok for file instance {}, found other instance {}'.
                format(file_instance['id'], remote_instance['id']))

        # Delete the file from the filesystem
        logging.info('deleting file {}'.format(file_instance['filepath']))
        if not dry_run:
            try:
                storage_client.delete(file_resource['filename'])
            except FileNotFoundError:
                logging.exception('file already deleted')

        # Delete the instance model from tantalus
        logging.info('deleting file instance {}'.format(file_instance['id']))
        if not dry_run:
            tantalus_api.delete('file_instance', id=file_instance['id'])

        # If this is the only file instance for this file resource, delete the file resource
        if len(all_file_instances) == 1:
            assert all_file_instances[0]['id'] == file_instance['id']
            logging.info('deleting file resource {}'.format(
                file_resource['id']))
            if not dry_run:
                tantalus_api.delete('file_resource', id=file_resource['id'])
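The remote-copy checks in the loop above could be factored into a small helper; a sketch of such a refactoring, using only the fields already present in the snippet (the helper name is ours, not part of the original module):

def is_remote_copy_intact(remote_storage_client, file_resource, check_remote):
    # find the file instance, if any, that lives on the remote storage
    remote_instance = None
    for other_instance in file_resource['file_instances']:
        if other_instance['storage']['name'] == check_remote:
            remote_instance = other_instance
            break

    # require a tracked, non-deleted instance whose data actually exists remotely
    if remote_instance is None or remote_instance['is_deleted']:
        return False
    return remote_storage_client.exists(file_resource['filename'])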
Example #5
jira_api = JIRA('https://www.bcgsc.ca/jira/',
                basic_auth=(JIRA_USER, JIRA_PASSWORD))

filename_pattern_map = {
    "*_1_*.raw.fastq.gz": (1, True),
    "*_2_*.raw.fastq.gz": (2, True),
    "*_1_*.fastq.gz": (1, True),
    "*_2_*.fastq.gz": (2, True),
}

sequencing_instrument_map = {
    'HiSeqX': 'HX',
    'HiSeq2500': 'H2500',
    'NextSeq550': 'NextSeq550',
}
storage_client = tantalus_api.get_storage_client("scrna_fastq")


def get_existing_fastq_data(tantalus_api, library):
    ''' Get the current set of fastq data in tantalus.

    Args:
        library (str): tenx library name

    Returns:
        existing_data: set of lanes (flowcell_id, lane_number)
    '''

    existing_flowcell_ids = []

    lanes = tantalus_api.list('sequencing_lane',
Example #6
def fix_bams(jira_ticket=None, dry_run=False):

    logging.info(f'dry run: {dry_run}')

    tantalus_api = TantalusApi()

    SC_WGS_BAM_DIR_TEMPLATE = os.path.join(
        'single_cell_indexing',
        'bam',
        '{library_id}',
        '{ref_genome}',
        '{aligner_name}',
        'numlanes_{number_lanes}',
        '{jira_ticket}',
    )

    reference_genome_map = {
        'HG19': 'grch37',
        'MM10': 'mm10',
    }

    analyses_list = []
    from_storage_name = "singlecellresults"
    to_storage_name = "singlecellblob"
    from_storage_client = tantalus_api.get_storage_client(from_storage_name)
    to_storage_client = tantalus_api.get_storage_client(to_storage_name)
    to_storage_id = tantalus_api.get('storage', name=to_storage_name)['id']

    if jira_ticket is not None:
        analyses_list.append(
            tantalus_api.get('analysis',
                             jira_ticket=jira_ticket,
                             analysis_type__name="align",
                             status="complete"))

    else:
        # Get all completed align analyses run with specific versions;
        # the bams associated with these analyses are in the wrong storage account
        for version in ('v0.5.2', 'v0.5.3', 'v0.5.4'):
            analyses = tantalus_api.list('analysis',
                                         analysis_type__name="align",
                                         status="complete",
                                         version=version)
            analyses_list += [a for a in analyses]

    for analysis in analyses_list:
        jira_ticket = analysis["jira_ticket"]
        print(f"moving bams for {jira_ticket}")

        # get all bam datasets associated with the jira ticket
        bam_datasets = tantalus_api.list(
            "sequencedataset",
            dataset_type="BAM",
            analysis__jira_ticket=jira_ticket,
        )

        for dataset in bam_datasets:
            # Get number of lanes from dataset for use with filepath
            lanes = set()
            for sequence_lane in dataset['sequence_lanes']:
                lane = "{}_{}".format(sequence_lane['flowcell_id'],
                                      sequence_lane['lane_number'])
                lanes.add(lane)
            number_lanes = len(lanes)

            try:
                file_instances = tantalus_api.get_dataset_file_instances(
                    dataset["id"],
                    "sequencedataset",
                    from_storage_name,
                )
            except dbclients.tantalus.DataNotOnStorageError:
                logging.info(
                    f'dataset {dataset["id"]} not on {from_storage_name}, skipping'
                )
                continue

            for file_instance in file_instances:
                blobname = file_instance["file_resource"]["filename"]

                # get url of source blob
                blob_url = from_storage_client.get_url(blobname)

                bam_filename = blobname.split("/bams/")[1]
                new_blobname = os.path.join(
                    SC_WGS_BAM_DIR_TEMPLATE.format(
                        library_id=dataset["library"]["library_id"],
                        ref_genome=reference_genome_map[
                            dataset["reference_genome"]],
                        aligner_name=dataset["aligner"],
                        number_lanes=number_lanes,
                        jira_ticket=jira_ticket,
                    ),
                    bam_filename,
                )

                # copy blob to desired storage account with new blobname
                blob_filepath = f"{to_storage_client.prefix}/{new_blobname}"
                logging.info(
                    f'copying {new_blobname} to storage {to_storage_name} from {blob_url} to {blob_filepath}'
                )
                if not dry_run:
                    to_storage_client.blob_service.copy_blob(
                        container_name="data",
                        blob_name=new_blobname,
                        copy_source=blob_url,
                    )

                file_resource_id = file_instance['file_resource']['id']
                file_instance_id = file_instance['id']

                logging.info(
                    f'updating file resource {file_resource_id} to have filename {new_blobname}'
                )
                if not dry_run:
                    tantalus_api.update('file_resource',
                                        id=file_resource_id,
                                        filename=new_blobname)

                logging.info(
                    f'updating file instance {file_instance_id} to have storage with id {to_storage_id}'
                )
                if not dry_run:
                    tantalus_api.update('file_instance',
                                        id=file_instance_id,
                                        storage=to_storage_id)
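A worked illustration of how the SC_WGS_BAM_DIR_TEMPLATE above expands into a destination blob name; the library id, aligner, ticket and filename values below are made up:

import os

# same layout as SC_WGS_BAM_DIR_TEMPLATE above
template = os.path.join(
    'single_cell_indexing', 'bam', '{library_id}', '{ref_genome}',
    '{aligner_name}', 'numlanes_{number_lanes}', '{jira_ticket}',
)

example_blobname = os.path.join(
    template.format(
        library_id='A96213A',        # made up
        ref_genome='grch37',
        aligner_name='BWA_MEM',      # made up
        number_lanes=4,
        jira_ticket='SC-1234',       # made up
    ),
    'A96213A-R03-C12.bam',           # made-up cell bam filename
)
print(example_blobname)
# single_cell_indexing/bam/A96213A/grch37/BWA_MEM/numlanes_4/SC-1234/A96213A-R03-C12.bam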
Example #7
def rename_fastqs(dataset_id, storage_name, dry_run=False, check_only=False):
    logging.info(f'dataset: {dataset_id}')
    logging.info(f'dry run: {dry_run}')

    tantalus_api = TantalusApi()
    storage_client = tantalus_api.get_storage_client(storage_name)

    dataset = tantalus_api.get('sequencedataset', id=dataset_id)

    file_instances = tantalus_api.get_dataset_file_instances(
        dataset['id'],
        'sequencedataset',
        storage_name,
    )

    for file_instance in file_instances:
        filename = file_instance['file_resource']['filename']

        if os.path.basename(filename) == 'metadata.yaml':
            continue

        assert len(dataset['sequence_lanes']) == 1

        parts = filename.split('/')
        basename = os.path.basename(filename)

        non_conforming = False
        try:
            assert parts[0] == 'single_cell_indexing'
            assert parts[1] == 'fastq'
            assert parts[2] == dataset['library']['library_id']
            assert parts[3].split(
                '_')[0] == dataset['sequence_lanes'][0]['flowcell_id']
            assert parts[3].split(
                '_')[1] == dataset['sequence_lanes'][0]['lane_number']
            assert parts[4] == dataset['sample']['sample_id']
        except AssertionError:
            non_conforming = True

        if check_only:
            if non_conforming:
                raise Exception(f'filename {filename} does not conform')
            continue

        new_filename = SC_WGS_FQ_TEMPLATE.format(
            dlp_library_id=dataset['library']['library_id'],
            flowcell_id=dataset['sequence_lanes'][0]['flowcell_id'],
            lane_number=dataset['sequence_lanes'][0]['lane_number'],
            cell_sample_id=dataset['sample']['sample_id'],
            cell_filename=basename,
        )

        if new_filename == filename:
            logging.info(f'skipping conforming {filename} on {storage_name}')
            continue

        logging.info(
            f'renaming {filename} to {new_filename} on {storage_name}')

        if not dry_run:
            if not storage_client.exists(new_filename):
                storage_client.copy(filename, new_filename, wait=True)
            tantalus_api.swap_file(file_instance, new_filename)
            storage_client.delete(filename)
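SC_WGS_FQ_TEMPLATE is a module-level constant not shown in this snippet. Judging from the assertions on the path components above, it presumably looks something like the following sketch; the exact layout is an assumption, not the actual constant:

import os

SC_WGS_FQ_TEMPLATE = os.path.join(
    'single_cell_indexing',
    'fastq',
    '{dlp_library_id}',
    '{flowcell_id}_{lane_number}',
    '{cell_sample_id}',
    '{cell_filename}',
)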
Example #8
def run_h5_convert(cache_dir,
                   dataset_id=None,
                   results_type=None,
                   redo=False,
                   dry_run=False,
                   check_done=False):
    tantalus_api = TantalusApi()

    local_cache_client = tantalus_api.get_cache_client(cache_dir)
    remote_storage_client = tantalus_api.get_storage_client(
        remote_storage_name)

    if dataset_id is not None:
        results_list = [tantalus_api.get("resultsdataset", id=dataset_id)]
        logging.info('converting results with id {}'.format(dataset_id))

    elif results_type is not None:
        results_list = tantalus_api.list("resultsdataset",
                                         results_type=results_type)
        logging.info(
            'converting results with results type {}'.format(results_type))

    else:
        results_list = tantalus_api.list("resultsdataset")
        logging.info('converting all results')

    for result in results_list:
        logging.info('processing results dataset {}'.format(result['id']))

        try:
            file_instances = tantalus_api.get_dataset_file_instances(
                result["id"],
                "resultsdataset",
                remote_storage_name,
            )

            existing_filenames = set(
                [i['file_resource']['filename'] for i in file_instances])

            found_csv_yaml = False
            for existing_filename in existing_filenames:
                # Destruct outputs csv.yaml directly; only check non-destruct files
                if 'destruct' in existing_filename:
                    continue
                if existing_filename.endswith('.csv.gz.yaml'):
                    found_csv_yaml = True
                    break

            if found_csv_yaml and check_done:
                logging.info('found filename {}, skipping conversion'.format(
                    existing_filename))
                continue

            file_resource_ids = []

            filepaths_to_clean = []

            for file_instance in file_instances:
                if not file_instance['file_resource']['filename'].endswith(
                        '.h5'):
                    continue

                datamanagement.transfer_files.cache_file(
                    tantalus_api, file_instance, cache_dir)

                h5_filepath = local_cache_client.get_url(
                    file_instance['file_resource']['filename'])

                filepaths_to_clean.append(h5_filepath)

                logging.info('converting {}'.format(h5_filepath))

                for key, csv_filepath in get_h5_csv_info(h5_filepath):
                    if not csv_filepath.startswith(cache_dir):
                        raise Exception(
                            'unexpected csv path {}'.format(csv_filepath))

                    csv_filename = csv_filepath[len(cache_dir):]
                    csv_filename = csv_filename.lstrip('/')

                    if csv_filename in existing_filenames and not redo:
                        logging.info(
                            'file {} already exists, not converting'.format(
                                csv_filename))
                        continue

                    if dry_run:
                        logging.info('would convert {}, key {} to {}'.format(
                            h5_filepath, key, csv_filepath))
                        continue

                    logging.info('converting {}, key {} to {}'.format(
                        h5_filepath, key, csv_filepath))
                    convert_h5(h5_filepath, key, csv_filepath)

                    yaml_filename = csv_filename + '.yaml'
                    yaml_filepath = csv_filepath + '.yaml'

                    fileinfo_to_add = [
                        (csv_filename, csv_filepath),
                        (yaml_filename, yaml_filepath),
                    ]

                    for filename, filepath in fileinfo_to_add:
                        logging.info('creating file {} from path {}'.format(
                            filename, filepath))

                        remote_storage_client.create(filename,
                                                     filepath,
                                                     update=redo)
                        remote_filepath = os.path.join(
                            remote_storage_client.prefix, filename)

                        logging.info('adding file {} from path {}'.format(
                            filename, remote_filepath))

                        (file_resource, file_instance) = tantalus_api.add_file(
                            remote_storage_name, remote_filepath,
                            update=True)  #redo)

                        file_resource_ids.append(file_resource["id"])
                        filepaths_to_clean.append(filepath)

            if len(file_resource_ids) == 0:
                logging.warning('no files added')
                continue

            logging.info('adding file resources {} to dataset {}'.format(
                file_resource_ids, result["id"]))

            tantalus_api.update(
                "resultsdataset",
                result["id"],
                file_resources=result["file_resources"] + file_resource_ids,
            )

            for filepath in filepaths_to_clean:
                logging.info('removing file {}'.format(filepath))
                os.remove(filepath)

        except NotFoundError:
            logging.exception('no files found for conversion')

        except KeyboardInterrupt:
            raise

        except Exception:
            logging.exception('conversion failed')
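convert_h5 and get_h5_csv_info are defined elsewhere in the module. As a rough idea of the kind of conversion involved, a hedged pandas-based sketch, not the project's actual implementation:

import pandas as pd

def convert_h5_sketch(h5_filepath, key, csv_filepath):
    # read one table out of the HDF5 store and rewrite it as gzipped CSV
    df = pd.read_hdf(h5_filepath, key=key)
    df.to_csv(csv_filepath, index=False, compression='gzip')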
Example #9
def import_bam(storage_name,
               bam_file_path,
               sample=None,
               library=None,
               lane_infos=None,
               read_type=None,
               ref_genome=None,
               tag_name=None,
               update=False):
    """
    Imports bam into tantalus

    Args:
        storage_name:   (string) name of destination storage
        bam_file_path:  (string) filepath to bam on destination storage
        sample:         (dict) contains sample_id
        library:        (dict) contains library_id, library_type, index_format
        lane_infos:     (dict) contains flowcell_id, lane_number, 
                        adapter_index_sequence, sequencing_cenre, read_type, 
                        reference_genome, aligner
        read_type:      (string) read type for the run
        tag_name:       (string)
        update:         (boolean)
    Returns:
        sequence_dataset:   (dict) sequence dataset created on tantalus
    """
    tantalus_api = TantalusApi()

    # Get a url allowing access regardless of whether the file
    # is in cloud or local storage
    storage_client = tantalus_api.get_storage_client(storage_name)
    bam_filename = tantalus_api.get_file_resource_filename(
        storage_name, bam_file_path)
    bam_url = storage_client.get_url(bam_filename)

    bam_header = pysam.AlignmentFile(bam_url).header
    bam_header_info = get_bam_header_info(bam_header)

    if ref_genome is None:
        ref_genome = get_bam_ref_genome(bam_header)

    aligner_name = get_bam_aligner_name(bam_header)

    logging.info(
        f"bam header shows reference genome {ref_genome} and aligner {aligner_name}"
    )

    bai_file_path = None
    if storage_client.exists(bam_filename + ".bai"):
        bai_file_path = bam_file_path + ".bai"
    else:
        logging.info(f"no bam index found at {bam_filename + '.bai'}")

    # If no sample was specified assume it exists in tantalus and
    # search for it based on header info
    if sample is None:
        if len(bam_header_info["sample_ids"]) != 1:
            raise ValueError(
                f"found sample_ids={bam_header_info['sample_ids']}, please specify override sample id"
            )
        sample_id = list(bam_header_info["sample_ids"])[0]
        sample = tantalus_api.get('sample', sample_id=sample_id)

    # If no library was specified assume it exists in tantalus and
    # search for it based on header info
    if library is None:
        if len(bam_header_info["library_ids"]) != 1:
            raise ValueError(
                f"found library_ids={bam_header_info['library_ids']}, please specify override library id"
            )
        library_id = list(bam_header_info["library_ids"])[0]
        library = tantalus_api.get('dna_library', library_id=library_id)

    # Default paired end reads
    if read_type is None:
        read_type = 'P'

    # If no lane infos were specified create them from header info
    if lane_infos is None:
        lane_infos = []
        for lane in bam_header_info["sequence_lanes"]:
            lane_info = {
                "flowcell_id": lane["flowcell_id"],
                "lane_number": lane["lane_number"],
                "library_id": lane["library_id"],
                "sequencing_centre": lane["sequencing_centre"],
                "read_type": read_type,
            }
            lane_infos.append(lane_info)

    # Add the sequence dataset to Tantalus
    sequence_dataset = add_sequence_dataset(
        tantalus_api,
        storage_name=storage_name,
        sample=sample,
        library=library,
        dataset_type="BAM",
        sequence_lanes=lane_infos,
        bam_file_path=bam_file_path,
        reference_genome=ref_genome,
        aligner=aligner_name,
        bai_file_path=bai_file_path,
        tag_name=tag_name,
        update=update,
    )

    return sequence_dataset
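A hedged example of calling the importer above; the storage name is taken from other snippets on this page, while the bam path is purely illustrative:

if __name__ == '__main__':
    sequence_dataset = import_bam(
        storage_name='singlecellblob',
        bam_file_path='/path/to/SAMPLE123_library456.bam',  # hypothetical path
        update=False,
    )
    print(sequence_dataset['id'])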
Example #10
def add_generic_results(filepaths,
                        storage_name,
                        results_name,
                        results_type,
                        results_version,
                        sample_ids=(),
                        library_ids=(),
                        analysis_pk=None,
                        recursive=False,
                        tag_name=None,
                        update=False,
                        remote_storage_name=None):

    tantalus_api = TantalusApi()
    storage_client = tantalus_api.get_storage_client(storage_name)

    sample_pks = []
    for sample_id in sample_ids:
        sample = tantalus_api.get(
            "sample",
            sample_id=sample_id,
        )
        sample_pks.append(sample['id'])

    library_pks = []
    for library_id in library_ids:
        library = tantalus_api.get(
            "dna_library",
            library_id=library_id,
        )
        library_pks.append(library['id'])

    # Add the file resources to tantalus
    file_resource_pks = []
    for filepath in filepaths:
        if recursive:
            logging.info("Recursing directory {}".format(filepath))
            filename_prefix = tantalus_api.get_file_resource_filename(
                storage_name, filepath)
            add_filepaths = []
            for filename in storage_client.list(filename_prefix):
                add_filepaths.append(
                    tantalus_api.get_filepath(storage_name, filename))

        else:
            add_filepaths = [filepath]

        for add_filepath in add_filepaths:
            logging.info(
                "Adding file resource for {} to Tantalus".format(add_filepath))
            resource, instance = tantalus_api.add_file(
                storage_name=storage_name,
                filepath=add_filepath,
                update=update,
            )
            file_resource_pks.append(resource["id"])

    results_dataset_fields = dict(
        name=results_name,
        results_type=results_type,
        results_version=results_version,
        analysis=analysis_pk,
        samples=sample_pks,
        libraries=library_pks,
        file_resources=file_resource_pks,
    )

    # Add the dataset to tantalus
    try:
        results_id = tantalus_api.get(
            "results", name=results_dataset_fields["name"])["id"]
    except NotFoundError:
        results_id = None

    if update and results_id is not None:
        logging.warning("results dataset {} exists, updating".format(
            results_dataset_fields["name"]))
        results_dataset = tantalus_api.update("results",
                                              id=results_id,
                                              **results_dataset_fields)

    else:
        logging.info("creating results dataset {}".format(
            results_dataset_fields["name"]))
        results_dataset = tantalus_api.get_or_create("results",
                                                     **results_dataset_fields)

    if tag_name is not None:
        tantalus_api.tag(tag_name, resultsdataset_set=[results_dataset['id']])

    logging.info("Succesfully created sequence dataset with ID {}".format(
        results_dataset["id"]))

    if remote_storage_name is not None:
        transfer_files.transfer_dataset(tantalus_api, results_dataset['id'],
                                        "resultsdataset", storage_name,
                                        remote_storage_name)

    return results_dataset
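A hedged example invocation of add_generic_results; every argument value here is illustrative rather than taken from the original project:

results_dataset = add_generic_results(
    filepaths=['/data/results/qc_report.html'],  # hypothetical filepath
    storage_name='singlecellblob',
    results_name='qc_report_SAMPLE123_v1',
    results_type='qc_report',
    results_version='v1.0.0',
    sample_ids=['SAMPLE123'],
    library_ids=['A96213A'],
    update=False,
)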
Example #11
import os
import click
import logging

from dbclients.tantalus import TantalusApi
from dbclients.basicclient import NotFoundError
from datamanagement.utils.constants import LOGGING_FORMAT

logging.basicConfig(format=LOGGING_FORMAT, level=logging.INFO)
logging.getLogger("azure.storage.common.storageclient").setLevel(
    logging.WARNING)

tantalus_api = TantalusApi()

storage_client = tantalus_api.get_storage_client("singlecellresults")


@click.command()
@click.option('--jira_ticket')
def add_missing_annotations_files(jira_ticket=None):
    """
    There exists QC runs that have align and hmmcopy but no annotations results on Tantalus.
    These analyses were ran with version v0.2.25 of the single cell pipeline. Thus, filter for all align objects
    ran with v0.2.25 and see if the ticket has annotation blobs on Azure. If so, check if there exists an
    annotations result dataset for that ticket and create it and an annotations analysis if not.

    If result dataset already exist, iterate through annotation blobs and add to Tantalus if it has not been tracked.
    """

    if jira_ticket is None:
        analyses = list(
Example #12
def main(
    storage_name,
    dataset_type,
    dataset_id=None,
    tag_name=None,
    check_remote=None,
    dry_run=False,
):
    logging.info('cleaning up storage {}'.format(storage_name))

    if check_remote:
        logging.info('checking remote {}'.format(check_remote))
    else:
        logging.warning('not checking remote')

    tantalus_api = TantalusApi()

    storage_client = tantalus_api.get_storage_client(storage_name)

    remote_client = None
    if check_remote is not None:
        remote_client = tantalus_api.get_storage_client(check_remote)

    if dataset_id is None and tag_name is None:
        raise ValueError('require either dataset id or tag name')

    if dataset_id is not None and tag_name is not None:
        raise ValueError('require exactly one of dataset id or tag name')

    if dataset_id is not None:
        logging.info('cleaning up dataset {}, {}'.format(
            dataset_id, dataset_type))
        datasets = tantalus_api.list(dataset_type, id=dataset_id)

    if tag_name is not None:
        logging.info('cleaning up tag {}'.format(tag_name))
        datasets = tantalus_api.list(dataset_type, tags__name=tag_name)

    total_data_size = 0
    file_num_count = 0

    for dataset in datasets:
        logging.info('checking dataset with id {}, name {}'.format(
            dataset['id'], dataset['name']))

        # Optionally skip datasets not present and intact on the remote storage
        if check_remote is not None:
            if not tantalus_api.is_dataset_on_storage(
                    dataset['id'], dataset_type, check_remote):
                logging.warning(
                    'not deleting dataset with id {}, not on remote storage {}'.
                    format(dataset['id'], check_remote))
                continue

            # For each file instance on the remote, check if it exists and has the correct size in tantalus
            remote_file_size_check = True
            for file_instance in tantalus_api.get_dataset_file_instances(
                    dataset['id'], dataset_type, check_remote):
                try:
                    tantalus_api.check_file(file_instance)
                except DataError:
                    logging.exception('check file failed')
                    remote_file_size_check = False

            # Skip this dataset if any files failed
            if not remote_file_size_check:
                logging.warning(
                    "skipping dataset {} that failed check on {}".format(
                        dataset['id'], check_remote))
                continue

        # Check consistency on the storage being cleaned up
        file_size_check = True
        for file_instance in tantalus_api.get_dataset_file_instances(
                dataset['id'], dataset_type, storage_name):
            try:
                tantalus_api.check_file(file_instance)
            except DataError:
                logging.exception('check file failed')
                file_size_check = False

        # Skip this dataset if any files failed
        if not file_size_check:
            logging.warning(
                "skipping dataset {} that failed check on {}".format(
                    dataset['id'], storage_name))
            continue

        # Delete all files for this dataset
        for file_instance in tantalus_api.get_dataset_file_instances(
                dataset['id'], dataset_type, storage_name):
            if dry_run:
                logging.info(
                    "would delete file instance with id {}, filepath {}".
                    format(file_instance['id'], file_instance['filepath']))
            else:
                logging.info(
                    "deleting file instance with id {}, filepath {}".format(
                        file_instance['id'], file_instance['filepath']))
                tantalus_api.update("file_instance",
                                    id=file_instance['id'],
                                    is_deleted=True)
            total_data_size += file_instance['file_resource']['size']
            file_num_count += 1

    logging.info("deleted a total of {} files with size {} bytes".format(
        file_num_count, total_data_size))