Example #1
def main(
        storage_name,
        dataset_type=None,
        dataset_id=None,
        tag_name=None,
        all_file_instances=False,
        dry_run=False,
        fix_corrupt=False,
        remove_missing=False,
    ):
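    """
    Check the integrity of file instances on a storage, either all instances
    or those belonging to a dataset or tag, optionally updating corrupt files
    in tantalus and marking missing files as deleted.
    """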
    logging.info('checking integrity of storage {}'.format(storage_name))

    tantalus_api = TantalusApi()

    if all_file_instances:
        file_instances = tantalus_api.list('file_instance', storage__name=storage_name)

    else:
        file_instances = get_dataset_file_instances(
            tantalus_api, storage_name, dataset_type, dataset_id=dataset_id, tag_name=tag_name)

    for file_instance in file_instances:
        logging.info('checking file instance {} with path {}'.format(
            file_instance['id'], file_instance['filepath']))

        if file_instance['is_deleted']:
            logging.info('file instance {} marked as deleted'.format(
                file_instance['id']))
            continue

        file_corrupt = False
        file_missing = False
        try:
            tantalus_api.check_file(file_instance)
        except DataCorruptionError:
            file_corrupt = True
            logging.exception('check file failed')
        except DataMissingError:
            file_missing = True
            logging.exception('missing file')

        if file_corrupt and fix_corrupt:
            logging.info('updating file instance {} with path {}'.format(
                file_instance['id'], file_instance['filepath']))

            if not dry_run:
                tantalus_api.update_file(file_instance)

        if file_missing and remove_missing:
            logging.info('deleting file instance {} with path {}'.format(
                file_instance['id'], file_instance['filepath']))

            if not dry_run:
                file_instance = tantalus_api.update(
                    'file_instance',
                    id=file_instance['id'],
                    is_deleted=True,
                )
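A usage sketch (not part of the original script): the entry point above could be driven from another module roughly as follows; the storage name is borrowed from the later examples and the flag values are illustrative only.

# Hypothetical invocation: dry-run integrity check of every file instance on
# the 'singlecellblob' storage, logging corrupt and missing files without
# changing anything in tantalus.
main(
    'singlecellblob',
    all_file_instances=True,
    dry_run=True,
    fix_corrupt=True,
    remove_missing=True,
)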
Example #2
def fix_bams(jira_ticket=None, dry_run=False):
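    """
    Add the bams metadata.yaml file of completed align analyses to the BAM
    datasets associated with their jira ticket.
    """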

    tantalus_api = TantalusApi()

    analyses_list = []
    storage_name = "singlecellresults"

    if jira_ticket is not None:
        analyses_list.append(tantalus_api.get('analysis', jira_ticket=jira_ticket, analysis_type__name="align", status="complete"))
    
    else:
        # Get all completed align analyses run with a specific version;
        # the bams associated with these analyses are in the wrong storage account
        for version in ('v0.5.2', 'v0.5.3'):
            analyses = tantalus_api.list('analysis', analysis_type__name="align", status="complete", version=version)
            analyses_list += [a for a in analyses]

    for analysis in analyses_list:
        jira_ticket = analysis["jira_ticket"]

        filename = f'{jira_ticket}/results/bams/metadata.yaml'

        logging.info(f'adding file {filename}')
        if not dry_run:
            file_resource, file_instance = tantalus_api.add_file(storage_name, filename)

        # get all bam datasets associated with the jira ticket
        bam_datasets = tantalus_api.list(
            "sequencedataset",
            dataset_type="BAM",
            analysis__jira_ticket=jira_ticket,
        )

        for dataset in bam_datasets:
            dataset_id = dataset['id']

            logging.info(f'adding file to dataset {dataset_id}')
            if not dry_run:
                file_resource_ids = dataset['file_resources']
                file_resource_ids.append(file_resource['id'])
                tantalus_api.update('sequencedataset', id=dataset['id'], file_resources=file_resource_ids)
Example #3
def create_fastq_metadata_yaml(library_id, storage_name, dry_run=False):
    """
    Create a metadata.yaml file for all FQ datasets for a library id.
    """
    tantalus_api = TantalusApi()

    storage = tantalus_api.get_storage(storage_name)
    client = tantalus_api.get_storage_client(storage_name)

    for dataset_info, metadata in create_lane_fastq_metadata(
            tantalus_api, library_id):
        metadata_filename = os.path.join(dataset_info['base_dir'],
                                         'metadata.yaml')
        metadata_filepath = tantalus_api.get_filepath(storage_name,
                                                      metadata_filename)

        metadata_io = io.BytesIO()
        metadata_io.write(
            yaml.dump(metadata, default_flow_style=False).encode())

        logging.info(f'writing metadata to file {metadata_filepath}')
        client.write_data(metadata_filename, metadata_io)

        logging.info(f'adding {metadata_filepath} to tantalus')

        if not dry_run:
            file_resource, file_instance = tantalus_api.add_file(
                storage_name, metadata_filepath, update=True)

            for dataset_id in dataset_info['dataset_ids']:
                dataset = tantalus_api.get('sequencedataset', id=dataset_id)

                new_file_resources = set(dataset['file_resources'])
                new_file_resources.add(file_resource['id'])

                tantalus_api.update('sequencedataset',
                                    id=dataset_id,
                                    file_resources=list(new_file_resources))
Example #4
def add_fastq_metadata_yaml(dataset_id, storage_name, dry_run=False):
    """
    Create a metadata.yaml file for a dataset and add to tantalus.
    """
    tantalus_api = TantalusApi()

    storage = tantalus_api.get_storage(storage_name)
    client = tantalus_api.get_storage_client(storage_name)

    metadata, base_dir = create_lane_fastq_metadata(tantalus_api, dataset_id)

    metadata_filename = os.path.join(base_dir, 'metadata.yaml')
    metadata_filepath = tantalus_api.get_filepath(storage_name,
                                                  metadata_filename)

    metadata_io = io.BytesIO()
    metadata_io.write(yaml.dump(metadata, default_flow_style=False).encode())

    print(f'writing metadata to file {metadata_filepath}')
    client.write_data(metadata_filename, metadata_io)

    print(f'adding {metadata_filepath} to tantalus')

    if not dry_run:
        file_resource, file_instance = tantalus_api.add_file(storage_name,
                                                             metadata_filepath,
                                                             update=True)

        dataset = tantalus_api.get('sequencedataset', id=dataset_id)

        new_file_resources = set(dataset['file_resources'])
        new_file_resources.add(file_resource['id'])

        tantalus_api.update('sequencedataset',
                            id=dataset_id,
                            file_resources=list(new_file_resources))
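A usage sketch, assuming the function above is importable; the dataset id and storage name below are hypothetical placeholders.

# Hypothetical call: write metadata.yaml for fastq dataset 1234 on the
# 'scrna_fastq' storage and register the file against that dataset in tantalus.
add_fastq_metadata_yaml(1234, 'scrna_fastq', dry_run=False)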
Example #5
def fix_bams(jira_ticket=None, dry_run=False):
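    """
    Copy the bams of completed align analyses from the results storage account
    to the blob storage account, updating the corresponding file resources and
    file instances in tantalus.
    """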

    logging.info(f'dry run: {dry_run}')

    tantalus_api = TantalusApi()

    SC_WGS_BAM_DIR_TEMPLATE = os.path.join(
        'single_cell_indexing',
        'bam',
        '{library_id}',
        '{ref_genome}',
        '{aligner_name}',
        'numlanes_{number_lanes}',
        '{jira_ticket}',
    )

    reference_genome_map = {
        'HG19': 'grch37',
        'MM10': 'mm10',
    }

    analyses_list = []
    from_storage_name = "singlecellresults"
    to_storage_name = "singlecellblob"
    from_storage_client = tantalus_api.get_storage_client(from_storage_name)
    to_storage_client = tantalus_api.get_storage_client(to_storage_name)
    to_storage_id = tantalus_api.get('storage', name=to_storage_name)['id']

    if jira_ticket is not None:
        analyses_list.append(
            tantalus_api.get('analysis',
                             jira_ticket=jira_ticket,
                             analysis_type__name="align",
                             status="complete"))

    else:
        # Get all completed align analyses run with a specific version;
        # the bams associated with these analyses are in the wrong storage account
        for version in ('v0.5.2', 'v0.5.3', 'v0.5.4'):
            analyses = tantalus_api.list('analysis',
                                         analysis_type__name="align",
                                         status="complete",
                                         version=version)
            analyses_list += [a for a in analyses]

    for analysis in analyses_list:
        jira_ticket = analysis["jira_ticket"]
        print(f"moving bams for {jira_ticket}")

        # get all bam datasets associated with the jira ticket
        bam_datasets = tantalus_api.list(
            "sequencedataset",
            dataset_type="BAM",
            analysis__jira_ticket=jira_ticket,
        )

        for dataset in bam_datasets:
            # Get number of lanes from dataset for use with filepath
            lanes = set()
            for sequence_lane in dataset['sequence_lanes']:
                lane = "{}_{}".format(sequence_lane['flowcell_id'],
                                      sequence_lane['lane_number'])
                lanes.add(lane)
            number_lanes = len(lanes)

            try:
                file_instances = tantalus_api.get_dataset_file_instances(
                    dataset["id"],
                    "sequencedataset",
                    from_storage_name,
                )
            except dbclients.tantalus.DataNotOnStorageError:
                logging.info(
                    f'dataset {dataset["id"]} not on {from_storage_name}, skipping'
                )
                continue

            for file_instance in file_instances:
                blobname = file_instance["file_resource"]["filename"]

                # get url of source blob
                blob_url = from_storage_client.get_url(blobname)

                bam_filename = blobname.split("/bams/")[1]
                new_blobname = os.path.join(
                    SC_WGS_BAM_DIR_TEMPLATE.format(
                        library_id=dataset["library"]["library_id"],
                        ref_genome=reference_genome_map[
                            dataset["reference_genome"]],
                        aligner_name=dataset["aligner"],
                        number_lanes=number_lanes,
                        jira_ticket=jira_ticket,
                    ),
                    bam_filename,
                )

                # copy blob to desired storage account with new blobname
                blob_filepath = f"{to_storage_client.prefix}/{new_blobname}"
                logging.info(
                    f'copying {new_blobname} to storage {to_storage_name} from {blob_url} to {blob_filepath}'
                )
                if not dry_run:
                    to_storage_client.blob_service.copy_blob(
                        container_name="data",
                        blob_name=new_blobname,
                        copy_source=blob_url,
                    )

                file_resource_id = file_instance['file_resource']['id']
                file_instance_id = file_instance['id']

                logging.info(
                    f'updating file resource {file_resource_id} to have filename {new_blobname}'
                )
                if not dry_run:
                    tantalus_api.update('file_resource',
                                        id=file_resource_id,
                                        filename=new_blobname)

                logging.info(
                    f'updating file instance {file_instance_id} to have storage with id {to_storage_id}'
                )
                if not dry_run:
                    tantalus_api.update('file_instance',
                                        id=file_instance_id,
                                        storage=to_storage_id)
Example #6
def run_h5_convert(cache_dir,
                   dataset_id=None,
                   results_type=None,
                   redo=False,
                   dry_run=False,
                   check_done=False):
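    """
    Convert the h5 files of results datasets to csv, upload the csv and yaml
    files to the remote storage, and add them to the results dataset in
    tantalus.
    """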
    tantalus_api = TantalusApi()

    local_cache_client = tantalus_api.get_cache_client(cache_dir)
    # note: remote_storage_name is not defined in this snippet; it is
    # presumably a module-level constant in the source script
    remote_storage_client = tantalus_api.get_storage_client(
        remote_storage_name)

    if dataset_id is not None:
        results_list = [tantalus_api.get("resultsdataset", id=dataset_id)]
        logging.info('converting results with id {}'.format(dataset_id))

    elif results_type is not None:
        results_list = tantalus_api.list("resultsdataset",
                                         results_type=results_type)
        logging.info(
            'converting results with results type {}'.format(results_type))

    else:
        results_list = tantalus_api.list("resultsdataset")
        logging.info('converting all results')

    for result in results_list:
        logging.info('processing results dataset {}'.format(result['id']))

        try:
            file_instances = tantalus_api.get_dataset_file_instances(
                result["id"],
                "resultsdataset",
                remote_storage_name,
            )

            existing_filenames = set(
                [i['file_resource']['filename'] for i in file_instances])

            found_csv_yaml = False
            for existing_filename in existing_filenames:
                # destruct outputs csv.yaml directly, so only check non-destruct files
                if 'destruct' in existing_filename:
                    continue
                if existing_filename.endswith('.csv.gz.yaml'):
                    found_csv_yaml = True
                    break

            if found_csv_yaml and check_done:
                logging.info('found filename {}, skipping conversion'.format(
                    existing_filename))
                continue

            file_resource_ids = []

            filepaths_to_clean = []

            for file_instance in file_instances:
                if not file_instance['file_resource']['filename'].endswith(
                        '.h5'):
                    continue

                datamanagement.transfer_files.cache_file(
                    tantalus_api, file_instance, cache_dir)

                h5_filepath = local_cache_client.get_url(
                    file_instance['file_resource']['filename'])

                filepaths_to_clean.append(h5_filepath)

                logging.info('converting {}'.format(h5_filepath))

                for key, csv_filepath in get_h5_csv_info(h5_filepath):
                    if not csv_filepath.startswith(cache_dir):
                        raise Exception(
                            'unexpected csv path {}'.format(csv_filepath))

                    csv_filename = csv_filepath[len(cache_dir):]
                    csv_filename = csv_filename.lstrip('/')

                    if csv_filename in existing_filenames and not redo:
                        logging.info(
                            'file {} already exists, not converting'.format(
                                csv_filename))
                        continue

                    if dry_run:
                        logging.info('would convert {}, key {} to {}'.format(
                            h5_filepath, key, csv_filepath))
                        continue

                    logging.info('converting {}, key {} to {}'.format(
                        h5_filepath, key, csv_filepath))
                    convert_h5(h5_filepath, key, csv_filepath)

                    yaml_filename = csv_filename + '.yaml'
                    yaml_filepath = csv_filepath + '.yaml'

                    fileinfo_to_add = [
                        (csv_filename, csv_filepath),
                        (yaml_filename, yaml_filepath),
                    ]

                    for filename, filepath in fileinfo_to_add:
                        logging.info('creating file {} from path {}'.format(
                            filename, filepath))

                        remote_storage_client.create(filename,
                                                     filepath,
                                                     update=redo)
                        remote_filepath = os.path.join(
                            remote_storage_client.prefix, filename)

                        logging.info('adding file {} from path {}'.format(
                            filename, remote_filepath))

                        (file_resource, file_instance) = tantalus_api.add_file(
                            remote_storage_name, remote_filepath,
                            update=True)  #redo)

                        file_resource_ids.append(file_resource["id"])
                        filepaths_to_clean.append(filepath)

            if len(file_resource_ids) == 0:
                logging.warning('no files added')
                continue

            logging.info('adding file resources {} to dataset {}'.format(
                file_resource_ids, result["id"]))

            tantalus_api.update(
                "resultsdataset",
                result["id"],
                file_resources=result["file_resources"] + file_resource_ids,
            )

            for filepath in filepaths_to_clean:
                logging.info('removing file {}'.format(filepath))
                os.remove(filepath)

        except NotFoundError:
            logging.exception('no files found for conversion')

        except KeyboardInterrupt:
            raise

        except Exception:
            logging.exception('conversion failed')
Example #7
def add_generic_results(filepaths,
                        storage_name,
                        results_name,
                        results_type,
                        results_version,
                        sample_ids=(),
                        library_ids=(),
                        analysis_pk=None,
                        recursive=False,
                        tag_name=None,
                        update=False,
                        remote_storage_name=None):
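    """
    Add a set of files to tantalus as a results dataset, optionally tagging the
    dataset and transferring it to a remote storage.
    """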

    tantalus_api = TantalusApi()

    sample_pks = []
    for sample_id in sample_ids:
        sample = tantalus_api.get(
            "sample",
            sample_id=sample_id,
        )
        sample_pks.append(sample['id'])

    library_pks = []
    for library_id in library_ids:
        library = tantalus_api.get(
            "dna_library",
            library_id=library_id,
        )
        library_pks.append(library['id'])

    # Add the file resources to tantalus
    file_resource_pks = []
    for filepath in filepaths:
        if recursive:
            logging.info("Recursing directory {}".format(filepath))
            add_filepaths = []
            for (dirpath, dirnames, filenames) in os.walk(filepath):
                for filename in filenames:
                    add_filepaths.append(os.path.join(dirpath, filename))

        else:
            add_filepaths = [filepath]

        for add_filepath in add_filepaths:
            logging.info(
                "Adding file resource for {} to Tantalus".format(add_filepath))
            resource, instance = tantalus_api.add_file(
                storage_name=storage_name,
                filepath=add_filepath,
                update=update,
            )
            file_resource_pks.append(resource["id"])

    results_dataset_fields = dict(
        name=results_name,
        results_type=results_type,
        results_version=results_version,
        analysis=analysis_pk,
        samples=sample_pks,
        libraries=library_pks,
        file_resources=file_resource_pks,
    )

    # Add the dataset to tantalus
    try:
        results_id = tantalus_api.get(
            "results", name=results_dataset_fields["name"])["id"]
    except NotFoundError:
        results_id = None

    if update and results_id is not None:
        logging.warning("results dataset {} exists, updating".format(
            results_dataset_fields["name"]))
        results_dataset = tantalus_api.update("results",
                                              id=results_id,
                                              **results_dataset_fields)

    else:
        logging.info("creating results dataset {}".format(
            results_dataset_fields["name"]))
        results_dataset = tantalus_api.get_or_create("results",
                                                     **results_dataset_fields)

    if tag_name is not None:
        tantalus_api.tag(tag_name, resultsdataset_set=[results_dataset['id']])

    logging.info("Successfully created results dataset with ID {}".format(
        results_dataset["id"]))

    if remote_storage_name is not None:
        transfer_files.transfer_dataset(tantalus_api, results_dataset['id'],
                                        "resultsdataset", storage_name,
                                        remote_storage_name)

    return results_dataset
Example #8
                dna_library=incorrect_lane['dna_library'],
                sequencing_centre="GSC",
                sequencing_instrument=incorrect_lane['sequencing_instrument'],
                sequencing_library_id=incorrect_lane['sequencing_library_id'],
                read_type=incorrect_lane['read_type'],
            )
            pass

        else:
            for field in ('sequencing_centre', 'sequencing_instrument',
                          'sequencing_library_id'):
                if correct_lane[field] != incorrect_lane[field]:
                    logging.warning('updating {} from {} to {}'.format(
                        field, correct_lane[field], incorrect_lane[field]))
                    correct_lane = tantalus_api.update(
                        'sequencing_lane',
                        id=correct_lane['id'],
                        **{field: incorrect_lane[field]})

        datasets = list(
            tantalus_api.list(
                'sequence_dataset',
                library__library_id=row['library_id'],
            ))

        for dataset in datasets:
            lane_pks = [l['id'] for l in dataset['sequence_lanes']]
            if incorrect_lane['id'] not in lane_pks:
                continue
            num_lanes = len(lane_pks)
            lane_pks.remove(incorrect_lane['id'])
            lane_pks.append(correct_lane['id'])
Example #9
def fix():
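    """
    Fix WGS bam sequence datasets that contain more than one bam file by
    splitting the extra bam and its index into a new dataset version.
    """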
    tantalus_api = TantalusApi()

    datasets = list(
        tantalus_api.list(
            'sequence_dataset',
            dataset_type='BAM',
            library__library_type__name='WGS',
        ))

    for dataset in datasets:
        bams = {}
        bais = {}
        specs = {}
        for file_resource_id in dataset['file_resources']:
            file_resource = tantalus_api.get('file_resource',
                                             id=file_resource_id)
            if file_resource['filename'].endswith('.bam'):
                bams[file_resource_id] = file_resource['filename']
            elif file_resource['filename'].endswith('.spec'):
                specs[file_resource_id] = file_resource['filename']
            elif file_resource['filename'].endswith('.bam.bai'):
                bais[file_resource_id] = file_resource['filename']

        if len(bams) == 0 and len(specs) == 0:
            print(dataset['id'])

        elif len(bams) > 1:
            logging.info(f"fixing {dataset['name']}, {bams}")

            to_remove_bam_id = max(bams.keys())
            to_remove_bai_id = None
            for id_, bai in bais.items():
                if bai.startswith(bams[to_remove_bam_id]):
                    assert to_remove_bai_id is None
                    to_remove_bai_id = id_
                    break
            assert to_remove_bai_id is not None

            logging.info((to_remove_bam_id, bams[to_remove_bam_id],
                          to_remove_bai_id, bais[to_remove_bai_id]))

            new_file_resources = dataset['file_resources']
            new_file_resources.remove(to_remove_bam_id)
            new_file_resources.remove(to_remove_bai_id)

            logging.info(
                f"updating {dataset['id']} to have files {new_file_resources}")

            tantalus_api.update('sequencedataset',
                                id=dataset['id'],
                                file_resources=new_file_resources)

            assert dataset["name"].endswith(str(dataset["version_number"]))

            similar_datasets = list(
                tantalus_api.list(
                    "sequence_dataset",
                    name=dataset["name"],
                ))
            new_version_number = max(d['version_number']
                                     for d in similar_datasets) + 1

            new_dataset_params = dict(
                sample=dataset['sample']['id'],
                library=dataset['library']['id'],
                sequence_lanes=[l['id'] for l in dataset['sequence_lanes']],
                aligner=dataset['aligner'],
                reference_genome=dataset['reference_genome'],
                name=dataset['name'][:-1] + str(new_version_number),
                dataset_type=dataset['dataset_type'],
                version_number=new_version_number,
                file_resources=[to_remove_bam_id, to_remove_bai_id],
            )

            logging.info(new_dataset_params)

            new_dataset = tantalus_api.create('sequencedataset',
                                              **new_dataset_params)

            logging.info(new_dataset)
Example #10
def main(
    storage_name,
    dataset_type,
    dataset_id=None,
    tag_name=None,
    check_remote=None,
    dry_run=False,
):
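    """
    Delete the file instances of a dataset or tag from a storage, optionally
    checking first that the files are present and intact on a remote storage.
    """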
    logging.info('cleaning up storage {}'.format(storage_name))

    if check_remote:
        logging.info('checking remote {}'.format(check_remote))
    else:
        logging.warning('not checking remote')

    tantalus_api = TantalusApi()

    storage_client = tantalus_api.get_storage_client(storage_name)

    remote_client = None
    if check_remote is not None:
        remote_client = tantalus_api.get_storage_client(check_remote)

    if dataset_id is None and tag_name is None:
        raise ValueError('require either dataset id or tag name')

    if dataset_id is not None and tag_name is not None:
        raise ValueError('require exactly one of dataset id or tag name')

    if dataset_id is not None:
        logging.info('cleaning up dataset {}, {}'.format(
            dataset_id, dataset_type))
        datasets = tantalus_api.list(dataset_type, id=dataset_id)

    if tag_name is not None:
        logging.info('cleaning up tag {}'.format(tag_name))
        datasets = tantalus_api.list(dataset_type, tags__name=tag_name)

    total_data_size = 0
    file_num_count = 0

    for dataset in datasets:
        logging.info('checking dataset with id {}, name {}'.format(
            dataset['id'], dataset['name']))

        # Optionally skip datasets not present and intact on the remote storage
        if check_remote is not None:
            if not tantalus_api.is_dataset_on_storage(
                    dataset['id'], 'sequencedataset', check_remote):
                logging.warning(
                    'not deleting dataset with id {}, not on remote storage {}'.format(
                        dataset['id'], check_remote))
                continue

            # For each file instance on the remote, check if it exists and has the correct size in tantalus
            remote_file_size_check = True
            for file_instance in tantalus_api.get_dataset_file_instances(
                    dataset['id'], dataset_type, check_remote):
                try:
                    tantalus_api.check_file(file_instance)
                except DataError:
                    logging.exception('check file failed')
                    remote_file_size_check = False

            # Skip this dataset if any files failed
            if not remote_file_size_check:
                logging.warning(
                    "skipping dataset {} that failed check on {}".format(
                        dataset['id'], check_remote))
                continue

        # Check consistency of files on the storage being cleaned up
        file_size_check = True
        for file_instance in tantalus_api.get_dataset_file_instances(
                dataset['id'], dataset_type, storage_name):
            try:
                tantalus_api.check_file(file_instance)
            except DataError:
                logging.exception('check file failed')
                file_size_check = False

        # Skip this dataset if any files failed
        if not file_size_check:
            logging.warning(
                "skipping dataset {} that failed check on {}".format(
                    dataset['id'], storage_name))
            continue

        # Delete all files for this dataset
        for file_instance in tantalus_api.get_dataset_file_instances(
                dataset['id'], dataset_type, storage_name):
            if dry_run:
                logging.info(
                    "would delete file instance with id {}, filepath {}".
                    format(file_instance['id'], file_instance['filepath']))
            else:
                logging.info(
                    "deleting file instance with id {}, filepath {}".format(
                        file_instance['id'], file_instance['filepath']))
                tantalus_api.update("file_instance",
                                    id=file_instance['id'],
                                    is_deleted=True)
            total_data_size += file_instance['file_resource']['size']
            file_num_count += 1

    logging.info("deleted a total of {} files with size {} bytes".format(
        file_num_count, total_data_size))