Example #1
def run_h5_convert(results_type=None):
    tantalus_api = TantalusApi()

    remote_storage_client = tantalus_api.get_storage_client(
        remote_storage_name)

    if results_type is not None:
        results_list = tantalus_api.list("resultsdataset",
                                         results_type=results_type)
        logging.info(
            'converting results with results type {}'.format(results_type))

    else:
        results_list = tantalus_api.list("resultsdataset")
        logging.info('converting all results')

    for result in results_list:
        logging.info('processing results dataset {}'.format(result['id']))

        try:
            file_instances = tantalus_api.get_dataset_file_instances(
                result["id"],
                "resultsdataset",
                remote_storage_name,
            )

            existing_filenames = set(
                [i['file_resource']['filename'] for i in file_instances])

            found_csv_yaml = False
            for existing_filename in existing_filenames:
                if existing_filename.endswith('.csv.gz.yaml'):
                    found_csv_yaml = True
                    break

            if found_csv_yaml:
                logging.info('found filename {}, skipping conversion'.format(
                    existing_filename))

            else:
                print(result["id"])
                logging.info('no yaml found')

        except NotFoundError:
            logging.exception('no files found for conversion')

        except KeyboardInterrupt:
            raise

        except Exception:
            logging.exception('conversion failed')
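
A minimal invocation sketch for run_h5_convert above; it assumes the module-level remote_storage_name global referenced inside the function is already defined, and the results type value is a placeholder.

import logging

# Illustrative driver only: convert results of one type, assuming the
# module-level remote_storage_name used by run_h5_convert is set elsewhere.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    run_h5_convert(results_type='align')
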
Example #2
def main(
        storage_name,
        dataset_type=None,
        dataset_id=None,
        tag_name=None,
        all_file_instances=False,
        dry_run=False,
        fix_corrupt=False,
        remove_missing=False,
    ):
    logging.info('checking integrity of storage {}'.format(storage_name))

    tantalus_api = TantalusApi()

    if all_file_instances:
        file_instances = tantalus_api.list('file_instance', storage__name=storage_name)

    else:
        file_instances = get_dataset_file_instances(
            tantalus_api, storage_name, dataset_type, dataset_id=dataset_id, tag_name=tag_name)

    for file_instance in file_instances:
        logging.info('checking file instance {} with path {}'.format(
            file_instance['id'], file_instance['filepath']))

        if file_instance['is_deleted']:
            logging.info('file instance {} marked as deleted'.format(
                file_instance['id']))
            continue

        file_corrupt = False
        file_missing = False
        try:
            tantalus_api.check_file(file_instance)
        except DataCorruptionError:
            file_corrupt = True
            logging.exception('check file failed')
        except DataMissingError:
            file_missing = True
            logging.exception('missing file')

        if file_corrupt and fix_corrupt:
            logging.info('updating file instance {} with path {}'.format(
                file_instance['id'], file_instance['filepath']))

            if not dry_run:
                tantalus_api.update_file(file_instance)

        if file_missing and remove_missing:
            logging.info('deleting file instance {} with path {}'.format(
                file_instance['id'], file_instance['filepath']))

            if not dry_run:
                file_instance = tantalus_api.update(
                    'file_instance',
                    id=file_instance['id'],
                    is_deleted=True,
                )
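
A hedged sketch of how the integrity-check main above might be exposed on the command line; the argparse wrapper and flag names are assumptions, not part of the original script.

import argparse
import logging

# Hypothetical CLI wrapper around main(); flag names are illustrative.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument('storage_name')
    parser.add_argument('--dataset-type')
    parser.add_argument('--dataset-id', type=int)
    parser.add_argument('--tag-name')
    parser.add_argument('--all-file-instances', action='store_true')
    parser.add_argument('--dry-run', action='store_true')
    parser.add_argument('--fix-corrupt', action='store_true')
    parser.add_argument('--remove-missing', action='store_true')
    args = parser.parse_args()
    main(
        args.storage_name,
        dataset_type=args.dataset_type,
        dataset_id=args.dataset_id,
        tag_name=args.tag_name,
        all_file_instances=args.all_file_instances,
        dry_run=args.dry_run,
        fix_corrupt=args.fix_corrupt,
        remove_missing=args.remove_missing,
    )
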
Example #3
def fix_bams(jira_ticket=None, dry_run=False):

    tantalus_api = TantalusApi()

    analyses_list = []
    storage_name = "singlecellresults"

    if jira_ticket is not None:
        analyses_list.append(tantalus_api.get('analysis', jira_ticket=jira_ticket, analysis_type__name="align", status="complete"))
    
    else:
        # Get all completed align analyses ran with specific version
        # the bams associated to these analyses are in the wrong storage account
        for version in ('v0.5.2', 'v0.5.3'):
            analyses = tantalus_api.list('analysis', analysis_type__name="align", status="complete", version=version)
            analyses_list += [a for a in analyses]

    for analysis in analyses_list:
        jira_ticket = analysis["jira_ticket"]

        filename = f'{jira_ticket}/results/bams/metadata.yaml'

        logging.info(f'adding file {filename}')
        if not dry_run:
            file_instance, file_resource = tantalus_api.add_file(storage_name, filename)

        # get all bam datasets associated with the jira ticket
        bam_datasets = tantalus_api.list(
            "sequencedataset",
            dataset_type="BAM",
            analysis__jira_ticket=jira_ticket,
        )

        for dataset in bam_datasets:
            dataset_id = dataset['id']

            logging.info(f'adding file to dataset {dataset_id}')
            if not dry_run:
                file_resource_ids = dataset['file_resources']
                file_resource_ids.append(file_resource['id'])  # append modifies the list in place
                tantalus_api.update('sequencedataset', id=dataset['id'], file_resources=file_resource_ids)
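
A minimal dry-run sketch for fix_bams above; the ticket id is a placeholder and the call relies only on the signature shown.

import logging

# Illustrative only: preview the metadata.yaml additions for one ticket.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    fix_bams(jira_ticket='SC-1234', dry_run=True)
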
Example #4
def download_datasets(results_type,
                      from_storage_name,
                      to_storage_name,
                      dataset_id=None,
                      jira_ticket=None):
    ''' Download a set of datasets by type.
    '''

    tantalus_api = TantalusApi()

    if dataset_id is not None:
        datasets = tantalus_api.list('results', id=dataset_id)
    elif jira_ticket is not None:
        datasets = tantalus_api.list('results',
                                     results_type=results_type,
                                     analysis__jira_ticket=jira_ticket)
    else:
        datasets = tantalus_api.list('results', results_type=results_type)

    dataset_ids = list()
    for dataset in datasets:
        dataset_ids.append(dataset['id'])

    # Download most recent first
    dataset_ids = reversed(sorted(dataset_ids))

    failed = False
    for dataset_id in dataset_ids:
        try:
            transfer_dataset(tantalus_api, dataset_id, 'resultsdataset',
                             from_storage_name, to_storage_name)
        except:
            logging.exception(f'failed to download {dataset_id}')
            failed = True

    if failed:
        raise Exception('one or more downloads failed')
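
A hedged usage sketch for download_datasets; the storage names and ticket below are placeholders, not values from the original code.

# Illustrative call: fetch align results for one ticket from a remote
# storage into a local one. All names here are placeholders.
download_datasets(
    'align',
    'remote_storage',
    'local_storage',
    jira_ticket='SC-1234',
)
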
Example #5
def glob_microscope_data(filepaths,
                         storage_name,
                         tag_name=None,
                         update=False,
                         remote_storage_name=None):

    tantalus_api = TantalusApi()

    chip_paths = collections.defaultdict(set)
    chip_libraries = collections.defaultdict(set)

    for filepath in filepaths:
        match = re.match(
            r".*/single_cell_indexing/Microscope/(\d+)_(A\d+[A-Z]*)", filepath)
        if match is None:
            logging.warning('skipping malformed {}'.format(filepath))
            continue

        fields = match.groups()
        date = fields[0]
        chip_id = fields[1]

        libraries = list(
            tantalus_api.list('dna_library', library_id__startswith=chip_id))

        if len(libraries) == 0:
            logging.error(
                'skipping file with unknown library {}'.format(filepath))
            continue

        library_ids = set([library['library_id'] for library in libraries])

        chip_paths[chip_id].add(filepath)
        chip_libraries[chip_id].update(library_ids)

    for chip_id in chip_paths:
        add_microscope_results(
            chip_paths[chip_id],
            chip_id,
            chip_libraries[chip_id],
            storage_name,
            tag_name=tag_name,
            update=update,
            remote_storage_name=remote_storage_name,
        )
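
A standalone sketch of the path pattern used in glob_microscope_data, showing which capture groups become the date and chip id; the example path is invented.

import re

# The example path is made up purely to show the two capture groups.
example_path = '/archive/single_cell_indexing/Microscope/20190301_A96123A/image_001.tif'
match = re.match(r".*/single_cell_indexing/Microscope/(\d+)_(A\d+[A-Z]*)", example_path)
if match is not None:
    date, chip_id = match.groups()  # ('20190301', 'A96123A')
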
Example #6
def catalog_cellenone_datasets(storage_name,
                               tag_name=None,
                               update=False,
                               remote_storage_name=None):

    tantalus_api = TantalusApi()

    for dataset in tantalus_api.list('resultsdataset',
                                     results_type='CELLENONE'):
        # HACK: Check for metadata yaml file in dataset
        found_metadata = False
        try:
            file_resource = tantalus_api.get(
                'file_resource',
                resultsdataset__id=dataset['id'],
                filename__endswith='metadata.yaml')
            found_metadata = True
        except NotFoundError:
            logging.info(f"no metadata for dataset {dataset['id']}")

        if found_metadata:
            logging.info(
                f"found metadata for dataset {dataset['id']}, skipping")
            continue

        try:
            process_cellenone_dataset(dataset,
                                      storage_name,
                                      tag_name=tag_name,
                                      update=update,
                                      remote_storage_name=remote_storage_name)

        except KeyboardInterrupt:
            raise

        except:
            logging.exception(f"catalog failed for dataset {dataset['id']}")
Example #7
from dbclients.tantalus import TantalusApi
from dbclients.colossus import ColossusApi
import logging


tantalus_api = TantalusApi()
colossus_api = ColossusApi()


if __name__ == '__main__':
    print "STARTING"
    colossus_analyses = colossus_api.list('analysis_information')
    tantalus_analyses = tantalus_api.list('analysis', analysis_type__name="align")

    analysis_lane_dict = {}

    for analysis in tantalus_analyses:
        lane_set = set()
        for input_dataset in analysis['input_datasets']:
            dataset = tantalus_api.get('sequencedataset',id=input_dataset)
            for lane in dataset['sequence_lanes']:
                lane_set.add(str(lane['flowcell_id'] + "_" + str(lane['lane_number'])))

        analysis_lane_dict[analysis['name']] = lane_set

    print(analysis_lane_dict)

    for analysis in colossus_analyses:
        key = analysis['analysis_jira_ticket'] + '_align'
        if key in analysis_lane_dict.keys():
            lanes = []
Example #8
def main(
    storage_name,
    dry_run=False,
    check_remote=None,
):
    tantalus_api = TantalusApi()

    storage_client = tantalus_api.get_storage_client(storage_name)

    remote_storage_client = None
    if check_remote is not None:
        remote_storage_client = tantalus_api.get_storage_client(check_remote)

    file_instances = tantalus_api.list('file_instance',
                                       storage__name=storage_name,
                                       is_deleted=True)

    # DEBUG: check whether we are getting back
    # consistent ordered results from tantalus
    file_instances = list(file_instances)
    file_instance_ids = set([f['id'] for f in file_instances])
    if len(file_instances) != len(file_instance_ids):
        raise Exception('received duplicate results from tantalus')

    logging.info('processing {} file instances'.format(len(file_instance_ids)))
    logging.info('processing the following file instances: {}'.format(
        str(file_instance_ids)))

    for file_instance in file_instances:
        file_resource = tantalus_api.get(
            'file_resource', id=file_instance['file_resource']['id'])
        all_file_instances = list(
            tantalus_api.list('file_instance',
                              file_resource=file_resource['id']))

        logging.info(
            'checking file instance {}, file resource {}, filepath {}'.format(
                file_instance['id'], file_resource['id'],
                file_instance['filepath']))

        sequencedatasets = tantalus_api.list(
            'sequencedataset', file_resources__id=file_resource['id'])
        resultsdatasets = tantalus_api.list(
            'resultsdataset', file_resources__id=file_resource['id'])

        sequencedataset_ids = list(set([a['id'] for a in sequencedatasets]))
        resultsdataset_ids = list(set([a['id'] for a in resultsdatasets]))

        logging.info(
            'file resource {} belongs to sequencedataset {} and resultsdatasets {}'
            .format(file_resource['id'], sequencedataset_ids,
                    resultsdataset_ids))

        # Optionally check for a remote version
        if remote_storage_client:
            remote_instance = None
            for other_instance in file_resource['file_instances']:
                if other_instance['storage']['name'] == check_remote:
                    # stop at the first matching remote instance so later
                    # references to other_instance point at the same record
                    remote_instance = other_instance
                    break

            if not remote_instance:
                logging.info(
                    'not deleting file instance {}, no other instance'.format(
                        file_instance['id']))
                continue

            if remote_instance['is_deleted']:
                logging.info(
                    'not deleting file instance {}, other instance {} deleted'.
                    format(file_instance['id'], other_instance['id']))
                continue

            if not remote_storage_client.exists(file_resource['filename']):
                logging.info(
                    'not deleting file instance {}, other instance {} does not exist'
                    .format(file_instance['id'], other_instance['id']))
                continue

            logging.info(
                'deletion ok for file instance {}, found other instance {}'.
                format(file_instance['id'], other_instance['id']))

        # Delete the file from the filesystem
        logging.info('deleting file {}'.format(file_instance['filepath']))
        if not dry_run:
            try:
                storage_client.delete(file_resource['filename'])
            except FileNotFoundError:
                logging.exception('file already deleted')

        # Delete the instance model from tantalus
        logging.info('deleting file instance {}'.format(file_instance['id']))
        if not dry_run:
            tantalus_api.delete('file_instance', id=file_instance['id'])

        # If this is the only file instance for this file resource, delete the file resource
        if len(all_file_instances) == 1:
            assert all_file_instances[0]['id'] == file_instance['id']
            logging.info('deleting file resource {}'.format(
                file_resource['id']))
            if not dry_run:
                tantalus_api.delete('file_resource', id=file_resource['id'])
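
A hedged invocation sketch for the deletion pass above; the storage names are placeholders.

import logging

# Illustrative only: remove instances already flagged is_deleted on the local
# storage, but only when an intact copy exists on the named remote storage.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    main('local_storage', dry_run=True, check_remote='remote_storage')
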
Example #9
def check_indices(library_id=None):
    tantalus_api = TantalusApi()
    colossus_api = ColossusApi()

    if library_id is None:
        library_ids = set([a['pool_id'] for a in colossus_api.list('library')])

    else:
        library_ids = [library_id]

    for library_id in library_ids:

        # Get colossus sublibrary indices
        sublibraries = colossus_api.list('sublibraries',
                                         library__pool_id=library_id)
        colossus_indices = set(
            [a['primer_i7'] + '-' + a['primer_i5'] for a in sublibraries])

        datasets = tantalus_api.list(
            'sequence_dataset',
            library__library_id=library_id,
            library__library_type__name='SC_WGS',
            dataset_type='FQ',
        )

        lane_datasets = collections.defaultdict(list)

        for dataset in datasets:

            assert len(dataset['sequence_lanes']) == 1

            flowcell_lane = '_'.join([
                dataset['sequence_lanes'][0]['flowcell_id'],
                dataset['sequence_lanes'][0]['lane_number'],
            ])

            lane_datasets[flowcell_lane].append(dataset)

        for flowcell_lane in lane_datasets:

            # Get tantalus sublibraries and indices
            tantalus_indices = set()
            tantalus_dataset_ids = []
            tantalus_sequencing_centre = set()
            for dataset in lane_datasets[flowcell_lane]:
                file_resources = list(
                    tantalus_api.list('file_resource',
                                      sequencedataset__id=dataset['id']))
                tantalus_indices.update(
                    set([
                        a['sequencefileinfo']['index_sequence']
                        for a in file_resources
                    ]))
                tantalus_dataset_ids.append(dataset['id'])
                tantalus_sequencing_centre.update([
                    a['sequencing_centre'] for a in dataset['sequence_lanes']
                ])

            assert len(tantalus_sequencing_centre) == 1
            tantalus_sequencing_centre = list(tantalus_sequencing_centre)[0]

            if len(colossus_indices - tantalus_indices) > 0:
                print(
                    'library {}, datasets {}, lane {}, sequencing_centre {}: {} in colossus but not tantalus'
                    .format(library_id, tantalus_dataset_ids, flowcell_lane,
                            tantalus_sequencing_centre,
                            len(colossus_indices - tantalus_indices)))

            if len(tantalus_indices - colossus_indices) > 0:
                print(
                    'library {}, datasets {}, lane {}, sequencing_centre {}: {} in tantalus but not colossus'
                    .format(library_id, tantalus_dataset_ids, flowcell_lane,
                            tantalus_sequencing_centre,
                            len(tantalus_indices - colossus_indices)))

            if tantalus_indices == colossus_indices:
                print(
                    'library {}, datasets {}, lane {}, sequencing_centre {}: OK'
                    .format(library_id, tantalus_dataset_ids, flowcell_lane,
                            tantalus_sequencing_centre))
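
A toy illustration of the index comparison performed in check_indices, using invented i7-i5 pairs.

# Toy data only: two invented index sets to show the set arithmetic.
colossus_indices = {'AAGGTT-CCTTAA', 'GGCCAA-TTAAGG'}
tantalus_indices = {'AAGGTT-CCTTAA'}

in_colossus_only = colossus_indices - tantalus_indices  # {'GGCCAA-TTAAGG'}
in_tantalus_only = tantalus_indices - colossus_indices  # set()
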
Example #10
def fix_bams(jira_ticket=None, dry_run=False):

    logging.info(f'dry run: {dry_run}')

    tantalus_api = TantalusApi()

    SC_WGS_BAM_DIR_TEMPLATE = os.path.join(
        'single_cell_indexing',
        'bam',
        '{library_id}',
        '{ref_genome}',
        '{aligner_name}',
        'numlanes_{number_lanes}',
        '{jira_ticket}',
    )

    reference_genome_map = {
        'HG19': 'grch37',
        'MM10': 'mm10',
    }

    analyses_list = []
    from_storage_name = "singlecellresults"
    to_storage_name = "singlecellblob"
    from_storage_client = tantalus_api.get_storage_client(from_storage_name)
    to_storage_client = tantalus_api.get_storage_client(to_storage_name)
    to_storage_id = tantalus_api.get('storage', name=to_storage_name)['id']

    if jira_ticket is not None:
        analyses_list.append(
            tantalus_api.get('analysis',
                             jira_ticket=jira_ticket,
                             analysis_type__name="align",
                             status="complete"))

    else:
        # Get all completed align analyses ran with specific version
        # the bams associated to these analyses are in the wrong storage account
        for version in ('v0.5.2', 'v0.5.3', 'v0.5.4'):
            analyses = tantalus_api.list('analysis',
                                         analysis_type__name="align",
                                         status="complete",
                                         version=version)
            analyses_list += [a for a in analyses]

    for analysis in analyses_list:
        jira_ticket = analysis["jira_ticket"]
        print(f"moving bams for {jira_ticket}")

        # get all bam datasets associated with the jira ticket
        bam_datasets = tantalus_api.list(
            "sequencedataset",
            dataset_type="BAM",
            analysis__jira_ticket=jira_ticket,
        )

        for dataset in bam_datasets:
            # Get number of lanes from dataset for use with filepath
            lanes = set()
            for sequence_lane in dataset['sequence_lanes']:
                lane = "{}_{}".format(sequence_lane['flowcell_id'],
                                      sequence_lane['lane_number'])
                lanes.add(lane)
            number_lanes = len(lanes)

            try:
                file_instances = tantalus_api.get_dataset_file_instances(
                    dataset["id"],
                    "sequencedataset",
                    from_storage_name,
                )
            except dbclients.tantalus.DataNotOnStorageError:
                logging.info(
                    f'dataset {dataset["id"]} not on {from_storage_name}, skipping'
                )
                continue

            for file_instance in file_instances:
                blobname = file_instance["file_resource"]["filename"]

                # get url of source blob
                blob_url = from_storage_client.get_url(blobname)

                bam_filename = blobname.split("/bams/")[1]
                new_blobname = os.path.join(
                    SC_WGS_BAM_DIR_TEMPLATE.format(
                        library_id=dataset["library"]["library_id"],
                        ref_genome=reference_genome_map[
                            dataset["reference_genome"]],
                        aligner_name=dataset["aligner"],
                        number_lanes=number_lanes,
                        jira_ticket=jira_ticket,
                    ),
                    bam_filename,
                )

                # copy blob to desired storage account with new blobname
                blob_filepath = f"{to_storage_client.prefix}/{new_blobname}"
                logging.info(
                    f'copying {new_blobname} to storage {to_storage_name} from {blob_url} to {blob_filepath}'
                )
                if not dry_run:
                    to_storage_client.blob_service.copy_blob(
                        container_name="data",
                        blob_name=new_blobname,
                        copy_source=blob_url,
                    )

                file_resource_id = file_instance['file_resource']['id']
                file_instance_id = file_instance['id']

                logging.info(
                    f'updating file resource {file_resource_id} to have filename {new_blobname}'
                )
                if not dry_run:
                    tantalus_api.update('file_resource',
                                        id=file_resource_id,
                                        filename=new_blobname)

                logging.info(
                    f'updating file instance {file_instance_id} to have storage with id {to_storage_id}'
                )
                if not dry_run:
                    tantalus_api.update('file_instance',
                                        id=file_instance_id,
                                        storage=to_storage_id)
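
A standalone sketch of how SC_WGS_BAM_DIR_TEMPLATE resolves into a destination blob prefix; every value below is a placeholder.

import os

# Placeholder values only, to show the shape of the destination path.
SC_WGS_BAM_DIR_TEMPLATE = os.path.join(
    'single_cell_indexing', 'bam', '{library_id}', '{ref_genome}',
    '{aligner_name}', 'numlanes_{number_lanes}', '{jira_ticket}')

prefix = SC_WGS_BAM_DIR_TEMPLATE.format(
    library_id='A96123A',
    ref_genome='grch37',
    aligner_name='BWA_ALN',
    number_lanes=2,
    jira_ticket='SC-1234',
)
# single_cell_indexing/bam/A96123A/grch37/BWA_ALN/numlanes_2/SC-1234
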
Example #11
def run_h5_convert(cache_dir,
                   dataset_id=None,
                   results_type=None,
                   redo=False,
                   dry_run=False,
                   check_done=False):
    tantalus_api = TantalusApi()

    local_cache_client = tantalus_api.get_cache_client(cache_dir)
    remote_storage_client = tantalus_api.get_storage_client(
        remote_storage_name)

    if dataset_id is not None:
        results_list = [tantalus_api.get("resultsdataset", id=dataset_id)]
        logging.info('converting results with id {}'.format(dataset_id))

    elif results_type is not None:
        results_list = tantalus_api.list("resultsdataset",
                                         results_type=results_type)
        logging.info(
            'converting results with results type {}'.format(results_type))

    else:
        results_list = tantalus_api.list("resultsdataset")
        logging.info('converting all results')

    for result in results_list:
        logging.info('processing results dataset {}'.format(result['id']))

        try:
            file_instances = tantalus_api.get_dataset_file_instances(
                result["id"],
                "resultsdataset",
                remote_storage_name,
            )

            existing_filenames = set(
                [i['file_resource']['filename'] for i in file_instances])

            found_csv_yaml = False
            for existing_filename in existing_filenames:
                # Destruct outputs csv.yaml directly, check non destruct files
                if 'destruct' in existing_filename:
                    continue
                if existing_filename.endswith('.csv.gz.yaml'):
                    found_csv_yaml = True
                    break

            if found_csv_yaml and check_done:
                logging.info('found filename {}, skipping conversion'.format(
                    existing_filename))
                continue

            file_resource_ids = []

            filepaths_to_clean = []

            for file_instance in file_instances:
                if not file_instance['file_resource']['filename'].endswith(
                        '.h5'):
                    continue

                datamanagement.transfer_files.cache_file(
                    tantalus_api, file_instance, cache_dir)

                h5_filepath = local_cache_client.get_url(
                    file_instance['file_resource']['filename'])

                filepaths_to_clean.append(h5_filepath)

                logging.info('converting {}'.format(h5_filepath))

                for key, csv_filepath in get_h5_csv_info(h5_filepath):
                    if not csv_filepath.startswith(cache_dir):
                        raise Exception(
                            'unexpected csv path {}'.format(csv_filepath))

                    csv_filename = csv_filepath[len(cache_dir):]
                    csv_filename = csv_filename.lstrip('/')

                    if csv_filename in existing_filenames and not redo:
                        logging.info(
                            'file {} already exists, not converting'.format(
                                csv_filename))
                        continue

                    if dry_run:
                        logging.info('would convert {}, key {} to {}'.format(
                            h5_filepath, key, csv_filepath))
                        continue

                    logging.info('converting {}, key {} to {}'.format(
                        h5_filepath, key, csv_filepath))
                    convert_h5(h5_filepath, key, csv_filepath)

                    yaml_filename = csv_filename + '.yaml'
                    yaml_filepath = csv_filepath + '.yaml'

                    fileinfo_to_add = [
                        (csv_filename, csv_filepath),
                        (yaml_filename, yaml_filepath),
                    ]

                    for filename, filepath in fileinfo_to_add:
                        logging.info('creating file {} from path {}'.format(
                            filename, filepath))

                        remote_storage_client.create(filename,
                                                     filepath,
                                                     update=redo)
                        remote_filepath = os.path.join(
                            remote_storage_client.prefix, filename)

                        logging.info('adding file {} from path {}'.format(
                            filename, remote_filepath))

                        (file_resource, file_instance) = tantalus_api.add_file(
                            remote_storage_name, remote_filepath,
                            update=True)  #redo)

                        file_resource_ids.append(file_resource["id"])
                        filepaths_to_clean.append(filepath)

            if len(file_resource_ids) == 0:
                logging.warning('no files added')
                continue

            logging.info('adding file resources {} to dataset {}'.format(
                file_resource_ids, result["id"]))

            tantalus_api.update(
                "resultsdataset",
                result["id"],
                file_resources=result["file_resources"] + file_resource_ids,
            )

            for filepath in filepaths_to_clean:
                logging.info('removing file {}'.format(filepath))
                os.remove(filepath)

        except NotFoundError:
            logging.exception('no files found for conversion')

        except KeyboardInterrupt:
            raise

        except Exception:
            logging.exception('conversion failed')
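
convert_h5 and get_h5_csv_info are not shown in this example; the snippet below is only a guess at what a conversion helper like convert_h5 could look like using pandas, not the original implementation.

import pandas as pd

def convert_h5(h5_filepath, key, csv_filepath):
    # Assumed behaviour: read one table from the HDF5 store and rewrite it
    # as a gzip-compressed CSV at the target path.
    data = pd.read_hdf(h5_filepath, key)
    data.to_csv(csv_filepath, index=False, compression='gzip')
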
Example #12
from dbclients.tantalus import TantalusApi
from dbclients.colossus import ColossusApi

import time

if __name__ == '__main__':
    print "TANTALUS CREATING..."
    tantalus_api = TantalusApi()
    print "COLOSSUS CREATING..."
    colossus_api = ColossusApi()

    instances = tantalus_api.list("file_instance")
    for instance in instances:
        print instance["filepath"]



Example #13
import logging
import sys

from datamanagement.utils.gsc import GSCAPI
from datamanagement.utils.constants import LOGGING_FORMAT
from dbclients.tantalus import TantalusApi
from dbclients.basicclient import NotFoundError

if __name__ == '__main__':
    logging.basicConfig(format=LOGGING_FORMAT,
                        stream=sys.stderr,
                        level=logging.INFO)

    gsc_api = GSCAPI()

    tantalus_api = TantalusApi()

    # List of relevant libraries from GSC lanes
    lanes = list(tantalus_api.list('sequencing_lane', sequencing_centre='GSC'))

    libraries = set()
    for lane in lanes:
        library = tantalus_api.get('dna_library', id=lane['dna_library'])
        if library['library_type'] == 'WGS':
            libraries.add(library['library_id'])

    lane_fixes = []

    for library_id in libraries:
        infos = gsc_api.query("library?name={}".format(library_id))

        if len(infos) == 0:
            logging.warning('unable to find {}'.format(library_id))
Example #14
def fix():
    tantalus_api = TantalusApi()

    datasets = list(
        tantalus_api.list(
            'sequence_dataset',
            dataset_type='BAM',
            library__library_type__name='WGS',
        ))

    for dataset in datasets:
        bams = {}
        bais = {}
        specs = {}
        for file_resource_id in dataset['file_resources']:
            file_resource = tantalus_api.get('file_resource',
                                             id=file_resource_id)
            if file_resource['filename'].endswith('.bam'):
                bams[file_resource_id] = file_resource['filename']
            elif file_resource['filename'].endswith('.spec'):
                specs[file_resource_id] = file_resource['filename']
            elif file_resource['filename'].endswith('.bam.bai'):
                bais[file_resource_id] = file_resource['filename']

        if len(bams) == 0 and len(specs) == 0:
            print(dataset['id'])

        elif len(bams) > 1:
            logging.info(f"fixing {dataset['name']}, {bams}")

            to_remove_bam_id = max(bams.keys())
            to_remove_bai_id = None
            for id_, bai in bais.items():
                if bai.startswith(bams[to_remove_bam_id]):
                    assert to_remove_bai_id is None
                    to_remove_bai_id = id_
                    break
            assert to_remove_bai_id is not None

            logging.info((to_remove_bam_id, bams[to_remove_bam_id],
                          to_remove_bai_id, bais[to_remove_bai_id]))

            new_file_resources = dataset['file_resources']
            new_file_resources.remove(to_remove_bam_id)
            new_file_resources.remove(to_remove_bai_id)

            logging.info(
                f"updating {dataset['id']} to have files {new_file_resources}")

            tantalus_api.update('sequencedataset',
                                id=dataset['id'],
                                file_resources=new_file_resources)

            assert dataset["name"].endswith(str(dataset["version_number"]))

            similar_datasets = list(
                tantalus_api.list(
                    "sequence_dataset",
                    name=dataset["name"],
                ))
            new_version_number = max(d['version_number']
                                     for d in similar_datasets) + 1

            new_dataset_params = dict(
                sample=dataset['sample']['id'],
                library=dataset['library']['id'],
                sequence_lanes=[l['id'] for l in dataset['sequence_lanes']],
                aligner=dataset['aligner'],
                reference_genome=dataset['reference_genome'],
                name=dataset['name'][:-1] + str(new_version_number),
                dataset_type=dataset['dataset_type'],
                version_number=new_version_number,
                file_resources=[to_remove_bam_id, to_remove_bai_id],
            )

            logging.info(new_dataset_params)

            new_dataset = tantalus_api.create('sequencedataset',
                                              **new_dataset_params)

            logging.info(new_dataset)
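
A toy walk-through of the version bump performed in fix() above: the dataset name is assumed to end in its (single-digit) version number, so the new name simply swaps the final character; the name below is invented.

# Toy values only: how the new dataset name is derived in fix() above.
name = 'BAM-SA123-A96123A-lanes_abcdef-bwa-grch37-1'
version_number = 1
new_version_number = 2

assert name.endswith(str(version_number))
new_name = name[:-1] + str(new_version_number)
# 'BAM-SA123-A96123A-lanes_abcdef-bwa-grch37-2'
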
Example #15
def main(
    storage_name,
    dataset_type,
    dataset_id=None,
    tag_name=None,
    check_remote=None,
    dry_run=False,
):
    logging.info('cleaning up storage {}'.format(storage_name))

    if check_remote:
        logging.info('checking remote {}'.format(check_remote))
    else:
        logging.warning('not checking remote')

    tantalus_api = TantalusApi()

    storage_client = tantalus_api.get_storage_client(storage_name)

    remote_client = None
    if check_remote is not None:
        remote_client = tantalus_api.get_storage_client(check_remote)

    if dataset_id is None and tag_name is None:
        raise ValueError('require either dataset id or tag name')

    if dataset_id is not None and tag_name is not None:
        raise ValueError('require exactly one of dataset id or tag name')

    if dataset_id is not None:
        logging.info('cleaning up dataset {}, {}'.format(
            dataset_id, dataset_type))
        datasets = tantalus_api.list(dataset_type, id=dataset_id)

    if tag_name is not None:
        logging.info('cleaning up tag {}'.format(tag_name))
        datasets = tantalus_api.list(dataset_type, tags__name=tag_name)

    total_data_size = 0
    file_num_count = 0

    for dataset in datasets:
        logging.info('checking dataset with id {}, name {}'.format(
            dataset['id'], dataset['name']))

        # Optionally skip datasets not present and intact on the remote storage
        if check_remote is not None:
            if not tantalus_api.is_dataset_on_storage(
                    dataset['id'], 'sequencedataset', check_remote):
                logging.warning(
                    'not deleting dataset with id {}, not on remote storage {}'.
                    format(dataset['id'], check_remote))
                continue

            # For each file instance on the remote, check if it exists and has the correct size in tantalus
            remote_file_size_check = True
            for file_instance in tantalus_api.get_dataset_file_instances(
                    dataset['id'], dataset_type, check_remote):
                try:
                    tantalus_api.check_file(file_instance)
                except DataError:
                    logging.exception('check file failed')
                    remote_file_size_check = False

            # Skip this dataset if any files failed
            if not remote_file_size_check:
                logging.warning(
                    "skipping dataset {} that failed check on {}".format(
                        dataset['id'], check_remote))
                continue

        # Check consistency of files on the storage being cleaned up
        file_size_check = True
        for file_instance in tantalus_api.get_dataset_file_instances(
                dataset['id'], dataset_type, storage_name):
            try:
                tantalus_api.check_file(file_instance)
            except DataError:
                logging.exception('check file failed')
                file_size_check = False

        # Skip this dataset if any files failed
        if not file_size_check:
            logging.warning(
                "skipping dataset {} that failed check on {}".format(
                    dataset['id'], storage_name))
            continue

        # Delete all files for this dataset
        for file_instance in tantalus_api.get_dataset_file_instances(
                dataset['id'], dataset_type, storage_name):
            if dry_run:
                logging.info(
                    "would delete file instance with id {}, filepath {}".
                    format(file_instance['id'], file_instance['filepath']))
            else:
                logging.info(
                    "deleting file instance with id {}, filepath {}".format(
                        file_instance['id'], file_instance['filepath']))
                tantalus_api.update("file_instance",
                                    id=file_instance['id'],
                                    is_deleted=True)
            total_data_size += file_instance['file_resource']['size']
            file_num_count += 1

    logging.info("deleted a total of {} files with size {} bytes".format(
        file_num_count, total_data_size))
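
A hedged invocation sketch for the cleanup routine above; the storage, dataset type, and tag values are placeholders.

import logging

# Illustrative only: dry-run cleanup of one tagged set of datasets on a local
# storage, verifying intact copies on a remote storage first.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    main(
        'local_storage',
        'sequencedataset',
        tag_name='cleanup_batch_1',
        check_remote='remote_storage',
        dry_run=True,
    )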