Example #1
def glob_cellenone_data(filepaths, storage_name, tag_name=None, update=False, remote_storage_name=None):

    tantalus_api = TantalusApi()

    for filepath in filepaths:
        match = re.match(r".*/single_cell_indexing/Cellenone/Cellenone_images/(\d+)_(A\d+[A-Z]*)", filepath)
        if match is None:
            logging.warning('skipping malformed {}'.format(filepath))
            continue

        fields = match.groups()
        date = fields[0]
        library_id = fields[1]

        try:
            tantalus_api.get('dna_library', library_id=library_id)
        except NotFoundError:
            logging.warning('skipping file with unknown library {}'.format(filepath))
            continue

        try:
            process_cellenone_images(
                library_id,
                filepath,
                storage_name,
                tag_name=tag_name,
                update=update,
                remote_storage_name=remote_storage_name,
            )
        except ValueError:
            logging.exception(f'unable to process {library_id}, {filepath}')
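
A minimal driver sketch for the function above; the glob pattern and the 'singlecellblob' storage name are placeholders, not taken from the original code.

import glob

# Hypothetical invocation: collect candidate Cellenone image directories and
# register each one; paths that do not match the expected layout are skipped
# by glob_cellenone_data itself.
cellenone_dirs = glob.glob('/data/single_cell_indexing/Cellenone/Cellenone_images/*_A*')
glob_cellenone_data(cellenone_dirs, 'singlecellblob')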
Example #2
def add_microscope_results(filepaths,
                           chip_id,
                           library_ids,
                           storage_name,
                           tag_name=None,
                           update=False,
                           remote_storage_name=None):
    colossus_api = ColossusApi()
    tantalus_api = TantalusApi()

    results_name = 'MICROSCOPE_{}'.format(chip_id)
    results_type = 'MICROSCOPE'
    results_version = None

    try:
        existing_results = tantalus_api.get('results', name=results_name)
    except NotFoundError:
        existing_results = None

    if existing_results is not None and not update:
        logging.info(f'results for {chip_id} exist, not processing')
        return

    results_dataset = add_generic_results(
        filepaths=filepaths,
        storage_name=storage_name,
        results_name=results_name,
        results_type=results_type,
        results_version=results_version,
        library_ids=library_ids,
        recursive=True,
        tag_name=tag_name,
        update=update,
        remote_storage_name=remote_storage_name,
    )
Example #3
def cache_tagged_datasets(tag_name,
                          from_storage_name,
                          cache_directory,
                          suffix_filter=None):
    """ Cache a set of tagged datasets
    """

    tantalus_api = TantalusApi()

    tag = tantalus_api.get("tag", name=tag_name)

    for dataset_id in tag['sequencedataset_set']:
        cache_dataset(tantalus_api,
                      dataset_id,
                      "sequencedataset",
                      from_storage_name,
                      cache_directory,
                      suffix_filter=suffix_filter)

    for dataset_id in tag['resultsdataset_set']:
        cache_dataset(tantalus_api,
                      dataset_id,
                      "resultsdataset",
                      from_storage_name,
                      cache_directory,
                      suffix_filter=suffix_filter)
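
A usage sketch, assuming a tag named 'my_analysis_inputs', a storage named 'singlecellblob', and a local cache directory; all three values are placeholders.

# Hypothetical call: cache every dataset carrying the tag into a local directory,
# restricted to BAM files via suffix_filter.
cache_tagged_datasets(
    'my_analysis_inputs',
    'singlecellblob',
    '/scratch/cache',
    suffix_filter='.bam',
)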
Example #4
def process_cellenone_images(
    library_id,
    source_dir,
    storage_name,
    tag_name=None,
    update=False,
    remote_storage_name=None,
):

    tantalus_api = TantalusApi()

    results_name = 'CELLENONE_IMAGES_{}'.format(library_id)
    results_type = 'CELLENONE_IMAGES'
    results_version = 'v1'

    try:
        existing_results = tantalus_api.get('results', name=results_name)
    except NotFoundError:
        existing_results = None

    if existing_results is not None and not update:
        logging.info(f'results for {library_id} exist, not processing')
        return

    storage = tantalus_api.get('storage', name=storage_name)
    storage_directory = storage['storage_directory']

    destination_dir = os.path.join(
        storage_directory,
        'single_cell_indexing',
        'Cellenone',
        'Cellenone_processed',
        library_id,
        results_version,
    )

    # Create the destination directory if it does not already exist
    os.makedirs(destination_dir, exist_ok=True)

    # catalog_images is assumed to write its output under destination_dir;
    # temp_dir only provides scratch space and is removed when the block exits
    with tempfile.TemporaryDirectory() as temp_dir:
        filepaths = catalog_images(library_id, source_dir, destination_dir,
                                   temp_dir)

    results_dataset = add_generic_results(
        filepaths=filepaths,
        storage_name=storage_name,
        results_name=results_name,
        results_type=results_type,
        results_version=results_version,
        library_ids=[library_id],
        recursive=False,
        tag_name=tag_name,
        update=update,
        remote_storage_name=remote_storage_name,
    )
Example #5
def process_cellenone_dataset(dataset,
                              storage_name,
                              tag_name=None,
                              update=False,
                              remote_storage_name=None):

    assert len(dataset['libraries']) == 1
    library_id = dataset['libraries'][0]['library_id']

    tantalus_api = TantalusApi()

    if not tantalus_api.is_dataset_on_storage(dataset['id'], 'resultsdataset',
                                              storage_name):
        raise ValueError(
            f"dataset {dataset['id']} not on storage {storage_name}")

    # Assume all files in the raw dataset are under the directory:
    #  single_cell_indexing/Cellenone/Cellenone_images/{date}_{library_id}

    filename_prefix = 'single_cell_indexing/Cellenone/Cellenone_images/'

    source_dir = None
    for file_resource in tantalus_api.get_dataset_file_resources(
            dataset['id'], 'resultsdataset'):
        if source_dir is None:
            if not file_resource['filename'].startswith(filename_prefix):
                raise ValueError(
                    f"file {file_resource['filename']} is not in directory {filename_prefix}"
                )

            library_subdir = file_resource['filename'].split('/')[3]

            if not library_subdir.endswith(library_id):
                raise ValueError(
                    f"file {file_resource['filename']} is not in a directory ending with {library_id}"
                )

            source_dir = '/'.join(file_resource['filename'].split('/')[:4])

        elif not file_resource['filename'].startswith(source_dir):
            raise ValueError(
                f"file {file_resource['filename']} is not in directory {source_dir}"
            )

    assert source_dir is not None

    source_dir = tantalus_api.get_filepath(storage_name, source_dir)

    process_cellenone_images(
        library_id,
        source_dir,
        storage_name,
        tag_name=tag_name,
        update=update,
        remote_storage_name=remote_storage_name,
    )
Example #6
def add_analysis(**kwargs):
    tantalus_api = TantalusApi()

    #Create new analysis object
    analysis = tantalus_api.get_or_create("analysis",
                                          name=kwargs['name'],
                                          jira_ticket=kwargs['jira_id'],
                                          analysis_type=kwargs['type'],
                                          version=kwargs['version'])

    logging.info("Successfully created analysis with ID {}".format(
        analysis["id"]))
Example #7
def transfer_tagged_datasets(tag_name, from_storage_name, to_storage_name):
    """ Transfer a set of tagged datasets
    """

    tantalus_api = TantalusApi()

    tag = tantalus_api.get("tag", name=tag_name)

    for dataset_id in tag['sequencedataset_set']:
        transfer_dataset(tantalus_api, dataset_id, "sequencedataset",
                         from_storage_name, to_storage_name)

    for dataset_id in tag['resultsdataset_set']:
        transfer_dataset(tantalus_api, dataset_id, "resultsdataset",
                         from_storage_name, to_storage_name)
Example #8
def main(
        storage_name,
        dataset_type=None,
        dataset_id=None,
        tag_name=None,
        all_file_instances=False,
        dry_run=False,
        fix_corrupt=False,
        remove_missing=False,
    ):
    logging.info('checking integrity of storage {}'.format(storage_name))

    tantalus_api = TantalusApi()

    if all_file_instances:
        file_instances = tantalus_api.list('file_instance', storage__name=storage_name)

    else:
        file_instances = get_dataset_file_instances(
            tantalus_api, storage_name, dataset_type, dataset_id=dataset_id, tag_name=tag_name)

    for file_instance in file_instances:
        logging.info('checking file instance {} with path {}'.format(
            file_instance['id'], file_instance['filepath']))

        if file_instance['is_deleted']:
            logging.info('file instance {} marked as deleted'.format(
                file_instance['id']))
            continue

        file_corrupt = False
        file_missing = False
        try:
            tantalus_api.check_file(file_instance)
        except DataCorruptionError:
            file_corrupt = True
            logging.exception('check file failed')
        except DataMissingError:
            file_missing = True
            logging.exception('missing file')

        if file_corrupt and fix_corrupt:
            logging.info('updating file instance {} with path {}'.format(
                file_instance['id'], file_instance['filepath']))

            if not dry_run:
                tantalus_api.update_file(file_instance)

        if file_missing and remove_missing:
            logging.info('deleting file instance {} with path {}'.format(
                file_instance['id'], file_instance['filepath']))

            if not dry_run:
                file_instance = tantalus_api.update(
                    'file_instance',
                    id=file_instance['id'],
                    is_deleted=True,
                )
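
A usage sketch for the integrity check above; the storage name, tag name, and dataset_type value are placeholders. Running once with dry_run=True only reports problems, a second run applies the fixes.

# Hypothetical invocation: report corrupt or missing files first, then repair.
main('singlecellblob', dataset_type='sequencedataset', tag_name='my_tag', dry_run=True)
main('singlecellblob', dataset_type='sequencedataset', tag_name='my_tag',
     fix_corrupt=True, remove_missing=True)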
Example #9
def add_cellenone_results(filepaths,
                          library_id,
                          storage_name,
                          tag_name=None,
                          update=False,
                          remote_storage_name=None):
    colossus_api = ColossusApi()
    tantalus_api = TantalusApi()

    results_name = 'CELLENONE_{}'.format(library_id)
    results_type = 'CELLENONE'
    results_version = None

    results_dataset = add_generic_results(
        filepaths=filepaths,
        storage_name=storage_name,
        results_name=results_name,
        results_type=results_type,
        results_version=results_version,
        library_ids=[library_id],
        recursive=True,
        tag_name=tag_name,
        update=update,
        remote_storage_name=remote_storage_name,
    )
Example #10
def main(storage_name, bam_file_path, **kwargs):
    """
    Imports the bam into tantalus by creating a sequence dataset and 
    file resources 
    """
    logging.basicConfig(format=LOGGING_FORMAT,
                        stream=sys.stderr,
                        level=logging.INFO)

    tantalus_api = TantalusApi()

    sample = None
    if kwargs.get('sample_id') is not None:
        sample = tantalus_api.get_or_create(
            'sample',
            sample_id=kwargs['sample_id'],
        )

    library = None
    if kwargs.get('library_id') is not None:
        if kwargs.get('library_type') is not None and kwargs.get(
                'index_format') is not None:
            library = tantalus_api.get_or_create(
                'dna_library',
                library_id=kwargs['library_id'],
                library_type=kwargs['library_type'],
                index_format=kwargs['index_format'],
            )
        else:
            library = tantalus_api.get(
                'dna_library',
                library_id=kwargs['library_id'],
            )

    dataset = import_bam(
        storage_name,
        bam_file_path,
        sample=sample,
        library=library,
        read_type=kwargs.get('read_type'),
        ref_genome=kwargs.get('ref_genome'),
        update=kwargs.get('update'),
        tag_name=kwargs.get('tag_name'),
    )

    print("dataset {}".format(dataset["id"]))
Example #11
def transfer_inputs(dataset_ids, results_ids, from_storage, to_storage):
    tantalus_api = TantalusApi()

    for dataset_id in dataset_ids:
        transfer_dataset(tantalus_api, dataset_id, 'sequencedataset', from_storage, to_storage)

    for results_id in results_ids:
        transfer_dataset(tantalus_api, results_id, 'resultsdataset', from_storage, to_storage)
Example #12
def catalog_cellenone_dataset(library_id,
                              storage_name,
                              tag_name=None,
                              update=False,
                              remote_storage_name=None):

    tantalus_api = TantalusApi()

    dataset = tantalus_api.get('resultsdataset',
                               results_type='CELLENONE',
                               libraries__library_id=library_id)

    process_cellenone_dataset(dataset,
                              storage_name,
                              tag_name=tag_name,
                              update=update,
                              remote_storage_name=remote_storage_name)
Example #13
def glob_microscope_data(filepaths,
                         storage_name,
                         tag_name=None,
                         update=False,
                         remote_storage_name=None):

    tantalus_api = TantalusApi()

    chip_paths = collections.defaultdict(set)
    chip_libraries = collections.defaultdict(set)

    for filepath in filepaths:
        match = re.match(
            r".*/single_cell_indexing/Microscope/(\d+)_(A\d+[A-Z]*)", filepath)
        if match is None:
            logging.warning('skipping malformed {}'.format(filepath))
            continue

        fields = match.groups()
        date = fields[0]
        chip_id = fields[1]

        libraries = list(
            tantalus_api.list('dna_library', library_id__startswith=chip_id))

        if len(libraries) == 0:
            logging.error(
                'skipping file with unknown library {}'.format(filepath))
            continue

        library_ids = set([library['library_id'] for library in libraries])

        chip_paths[chip_id].add(filepath)
        chip_libraries[chip_id].update(library_ids)

    for chip_id in chip_paths:
        add_microscope_results(
            chip_paths[chip_id],
            chip_id,
            chip_libraries[chip_id],
            storage_name,
            tag_name=tag_name,
            update=update,
            remote_storage_name=remote_storage_name,
        )
Example #14
def glob_cellenone_data(filepaths,
                        storage_name,
                        tag_name=None,
                        update=False,
                        skip_existing=False,
                        remote_storage_name=None):

    tantalus_api = TantalusApi()

    library_paths = collections.defaultdict(set)

    for filepath in filepaths:
        match = re.match(
            r".*/single_cell_indexing/Cellenone/Cellenone_images/(\d+)_(A\d+[A-Z]*)/?$",
            filepath)
        if match is None:
            logging.warning('skipping malformed {}'.format(filepath))
            continue

        fields = match.groups()
        date = fields[0]
        library_id = fields[1]

        try:
            tantalus_api.get('dna_library', library_id=library_id)
        except NotFoundError:
            logging.warning(
                'skipping file with unknown library {}'.format(filepath))
            continue

        logging.info(f'queueing library {library_id} data from {filepath}')
        library_paths[library_id].add(filepath)

    for library_id in library_paths:
        add_cellenone_results(
            library_paths[library_id],
            library_id,
            storage_name,
            tag_name=tag_name,
            update=update,
            skip_existing=skip_existing,
            remote_storage_name=remote_storage_name,
        )
Example #15
def transfer_dataset_cmd(dataset_id,
                         dataset_model,
                         from_storage_name,
                         to_storage_name,
                         suffix_filter=None):
    tantalus_api = TantalusApi()
    transfer_dataset(tantalus_api,
                     dataset_id,
                     dataset_model,
                     from_storage_name,
                     to_storage_name,
                     suffix_filter=suffix_filter)
Example #16
def cache_dataset_cmd(dataset_id,
                      dataset_model,
                      from_storage_name,
                      cache_directory,
                      suffix_filter=None):
    tantalus_api = TantalusApi()
    cache_dataset(tantalus_api,
                  dataset_id,
                  dataset_model,
                  from_storage_name,
                  cache_directory,
                  suffix_filter=suffix_filter)
Example #17
def add_cellenone_results(filepaths,
                          library_id,
                          storage_name,
                          tag_name=None,
                          update=False,
                          skip_existing=False,
                          remote_storage_name=None):
    colossus_api = ColossusApi()
    tantalus_api = TantalusApi()

    results_name = 'CELLENONE_{}'.format(library_id)
    results_type = 'CELLENONE'
    results_version = None

    try:
        existing_results = tantalus_api.get('resultsdataset',
                                            name=results_name,
                                            results_type=results_type)
    except NotFoundError:
        existing_results = None

    if skip_existing and existing_results is not None:
        return existing_results

    results_dataset = add_generic_results(
        filepaths=filepaths,
        storage_name=storage_name,
        results_name=results_name,
        results_type=results_type,
        results_version=results_version,
        library_ids=[library_id],
        recursive=True,
        tag_name=tag_name,
        update=update,
        remote_storage_name=remote_storage_name,
    )

    return results_dataset
Example #18
def download_datasets(results_type,
                      from_storage_name,
                      to_storage_name,
                      dataset_id=None,
                      jira_ticket=None):
    ''' Download a set of datasets by type.
    '''

    tantalus_api = TantalusApi()

    if dataset_id is not None:
        datasets = tantalus_api.list('results', id=dataset_id)
    elif jira_ticket is not None:
        datasets = tantalus_api.list('results',
                                     results_type=results_type,
                                     analysis__jira_ticket=jira_ticket)
    else:
        datasets = tantalus_api.list('results', results_type=results_type)

    dataset_ids = list()
    for dataset in datasets:
        dataset_ids.append(dataset['id'])

    # Download most recent first
    dataset_ids = reversed(sorted(dataset_ids))

    failed = False
    for dataset_id in dataset_ids:
        try:
            transfer_dataset(tantalus_api, dataset_id, 'resultsdataset',
                             from_storage_name, to_storage_name)
        except Exception:
            logging.exception(f'failed to download {dataset_id}')
            failed = True

    if failed:
        raise Exception('one or more downloads failed')
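
A usage sketch, with placeholder results type, storage names, and jira ticket; either a dataset id, a jira ticket, or all results of the given type can be requested.

# Hypothetical call: pull all results of one type from the source storage,
# or restrict the download to a single analysis via its jira ticket.
download_datasets('annotation', 'singlecellresults', 'local_cache')
download_datasets('annotation', 'singlecellresults', 'local_cache', jira_ticket='SC-1234')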
Example #19
def catalog_cellenone_datasets(storage_name,
                               tag_name=None,
                               update=False,
                               remote_storage_name=None):

    tantalus_api = TantalusApi()

    for dataset in tantalus_api.list('resultsdataset',
                                     results_type='CELLENONE'):
        # HACK: Check for metadata yaml file in dataset
        found_metadata = False
        try:
            file_resource = tantalus_api.get(
                'file_resource',
                resultsdataset__id=dataset['id'],
                filename__endswith='metadata.yaml')
            found_metadata = True
        except NotFoundError:
            logging.info(f"no metadata for dataset {dataset['id']}")

        if found_metadata:
            logging.info(
                f"found metadata for dataset {dataset['id']}, skipping")
            continue

        try:
            process_cellenone_dataset(dataset,
                                      storage_name,
                                      tag_name=tag_name,
                                      update=update,
                                      remote_storage_name=remote_storage_name)

        except KeyboardInterrupt:
            raise

        except:
            logging.exception(f"catalog failed for dataset {dataset['id']}")
Example #20
def add_cellenone_data(library_id,
                       cellenone_filepath,
                       storage_name,
                       tag_name=None,
                       update=False,
                       remote_storage_name=None):

    tantalus_api = TantalusApi()

    process_cellenone_images(
        library_id,
        cellenone_filepath,
        storage_name,
        tag_name=tag_name,
        update=update,
        remote_storage_name=remote_storage_name,
    )
Example #21
def add_cellenone_data(filepaths,
                       library_id,
                       storage_name,
                       tag_name=None,
                       update=False,
                       remote_storage_name=None):

    tantalus_api = TantalusApi()

    add_cellenone_results(
        filepaths,
        library_id,
        storage_name,
        tag_name=tag_name,
        update=update,
        remote_storage_name=remote_storage_name,
    )
Example #22
def main(**kwargs):
    try:
        df = pd.read_csv(kwargs["ids"])
    except IOError:
        raise Exception("The file {} could not be opened for reading".format(
            kwargs["ids"]))

    tantalus_api = TantalusApi()

    col_name = kwargs["id_type"] + "_id"
    df = df.apply(get_filepath, args=(
        tantalus_api,
        col_name,
    ), axis=1)

    df[[col_name, "shahlab_path", "blob_path",
        "rocks_path"]].to_csv(kwargs["output_file"], index=False)
Example #23
def create_fastq_metadata_yaml(library_id, storage_name, dry_run=False):
    """
    Create a metadata.yaml file for all fastq datasets for a library id.
    """
    tantalus_api = TantalusApi()

    storage = tantalus_api.get_storage(storage_name)
    client = tantalus_api.get_storage_client(storage_name)

    for dataset_info, metadata in create_lane_fastq_metadata(
            tantalus_api, library_id):
        metadata_filename = os.path.join(dataset_info['base_dir'],
                                         'metadata.yaml')
        metadata_filepath = tantalus_api.get_filepath(storage_name,
                                                      metadata_filename)

        metadata_io = io.BytesIO()
        metadata_io.write(
            yaml.dump(metadata, default_flow_style=False).encode())

        logging.info(f'writing metadata to file {metadata_filepath}')
        client.write_data(metadata_filename, metadata_io)

        logging.info(f'adding {metadata_filepath} to tantalus')

        if not dry_run:
            file_resource, file_instance = tantalus_api.add_file(
                storage_name, metadata_filepath, update=True)

            for dataset_id in dataset_info['dataset_ids']:
                dataset = tantalus_api.get('sequencedataset', id=dataset_id)

                new_file_resources = set(dataset['file_resources'])
                new_file_resources.add(file_resource['id'])

                tantalus_api.update('sequencedataset',
                                    id=dataset_id,
                                    file_resources=list(new_file_resources))
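
A usage sketch with a placeholder library id and storage name; with dry_run=True the metadata files are still written to storage but are not registered in tantalus.

# Hypothetical call: generate per-lane fastq metadata.yaml files for one library.
create_fastq_metadata_yaml('A96213A', 'singlecellblob', dry_run=True)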
Example #24
def add_generic_dataset(**kwargs):
    tantalus_api = TantalusApi()

    file_resource_pks = []

    sample = tantalus_api.get("sample", sample_id=kwargs['sample_id'])

    library = tantalus_api.get("dna_library", library_id=kwargs['library_id'])

    #Add the file resource to tantalus
    for filepath in kwargs['filepaths']:
        logging.info(
            "Adding file resource for {} to Tantalus".format(filepath))
        resource, instance = tantalus_api.add_file(
            storage_name=kwargs['storage_name'],
            filepath=filepath,
            update=kwargs['update'])
        file_resource_pks.append(resource["id"])

    if "tag_name" in kwargs:
        tag = tantalus_api.get("tag", name=kwargs["tag_name"])
        tags = [tag["id"]]
    else:
        tags = []

    ref_genome = kwargs.get("reference_genome")
    aligner = kwargs.get("aligner")

    if "sequence_lane_pks" in kwargs:
        sequence_pks = map(str, kwargs["sequence_lane_pks"])

    #Add the dataset to tantalus
    sequence_dataset = tantalus_api.get_or_create(
        "sequence_dataset",
        name=kwargs['dataset_name'],
        dataset_type=kwargs['dataset_type'],
        sample=sample["id"],
        library=library["id"],
        sequence_lanes=sequence_pks,
        file_resources=file_resource_pks,
        reference_genome=ref_genome,
        aligner=aligner,
        tags=tags,
    )

    logging.info("Succesfully created sequence dataset with ID {}".format(
        sequence_dataset["id"]))
Example #25
def add_fastq_metadata_yaml(dataset_id, storage_name, dry_run=False):
    """
    Create a metadata.yaml file for a dataset and add to tantalus.
    """
    tantalus_api = TantalusApi()

    storage = tantalus_api.get_storage(storage_name)
    client = tantalus_api.get_storage_client(storage_name)

    metadata, base_dir = create_lane_fastq_metadata(tantalus_api, dataset_id)

    metadata_filename = os.path.join(base_dir, 'metadata.yaml')
    metadata_filepath = tantalus_api.get_filepath(storage_name,
                                                  metadata_filename)

    metadata_io = io.BytesIO()
    metadata_io.write(yaml.dump(metadata, default_flow_style=False).encode())

    print(f'writing metadata to file {metadata_filepath}')
    client.write_data(metadata_filename, metadata_io)

    print(f'adding {metadata_filepath} to tantalus')

    if not dry_run:
        file_resource, file_instance = tantalus_api.add_file(storage_name,
                                                             metadata_filepath,
                                                             update=True)

        dataset = tantalus_api.get('sequencedataset', id=dataset_id)

        new_file_resources = set(dataset['file_resources'])
        new_file_resources.add(file_resource['id'])

        tantalus_api.update('sequencedataset',
                            id=dataset_id,
                            file_resources=list(new_file_resources))
Example #26
def run_h5_convert(results_type=None):
    tantalus_api = TantalusApi()

    # remote_storage_name is assumed to be defined at module level in the original script
    remote_storage_client = tantalus_api.get_storage_client(
        remote_storage_name)

    if results_type is not None:
        results_list = tantalus_api.list("resultsdataset",
                                         results_type=results_type)
        logging.info(
            'converting results with results type {}'.format(results_type))

    else:
        results_list = tantalus_api.list("resultsdataset")
        logging.info('converting all results')

    for result in results_list:
        logging.info('processing results dataset {}'.format(result['id']))

        try:
            file_instances = tantalus_api.get_dataset_file_instances(
                result["id"],
                "resultsdataset",
                remote_storage_name,
            )

            existing_filenames = set(
                [i['file_resource']['filename'] for i in file_instances])

            found_csv_yaml = False
            for existing_filename in existing_filenames:
                if existing_filename.endswith('.csv.gz.yaml'):
                    found_csv_yaml = True
                    break

            if found_csv_yaml:
                logging.info('found filename {}, skipping conversion'.format(
                    existing_filename))

            else:
                print(result["id"])
                logging.info('no yaml found')

        except NotFoundError:
            logging.exception('no files found for conversion')

        except KeyboardInterrupt:
            raise

        except Exception:
            logging.exception('conversion failed')
Example #27
def fix_bams(jira_ticket=None, dry_run=False):

    tantalus_api = TantalusApi()

    analyses_list = []
    storage_name = "singlecellresults"

    if jira_ticket is not None:
        analyses_list.append(tantalus_api.get('analysis', jira_ticket=jira_ticket, analysis_type__name="align", status="complete"))
    
    else:
        # Get all completed align analyses ran with specific version
        # the bams associated to these analyses are in the wrong storage account
        for version in ('v0.5.2', 'v0.5.3'):
            analyses = tantalus_api.list('analysis', analysis_type__name="align", status="complete", version=version)
            analyses_list += [a for a in analyses]

    for analysis in analyses_list:
        jira_ticket = analysis["jira_ticket"]

        filename = f'{jira_ticket}/results/bams/metadata.yaml'

        logging.info(f'adding file {filename}')
        if not dry_run:
            file_resource, file_instance = tantalus_api.add_file(storage_name, filename)

        # get all bam datasets associated with the jira ticket
        bam_datasets = tantalus_api.list(
            "sequencedataset",
            dataset_type="BAM",
            analysis__jira_ticket=jira_ticket,
        )

        for dataset in bam_datasets:
            dataset_id = dataset['id']

            logging.info(f'adding file to dataset {dataset_id}')
            if not dry_run:
                file_resource_ids = dataset['file_resources']
                file_resource_ids.append(file_resource['id'])
                tantalus_api.update('sequencedataset', id=dataset['id'], file_resources=file_resource_ids)
Example #28
import logging

from datetime import datetime
from collections import defaultdict

from workflows.unanalyzed_data import *

import datamanagement.templates as templates

from dbclients.tantalus import TantalusApi
from dbclients.colossus import ColossusApi
from dbclients.basicclient import NotFoundError

from workflows.utils import file_utils
from workflows.utils import saltant_utils
from workflows.utils.colossus_utils import get_ref_genome

tantalus_api = TantalusApi()
colossus_api = ColossusApi()

log = logging.getLogger('sisyphus')
log.setLevel(logging.DEBUG)
stream_handler = logging.StreamHandler()
formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
stream_handler.setFormatter(formatter)
log.addHandler(stream_handler)
log.propagate = False


def get_sequencings(library_info):
    '''
    Given library id (str), return list of sequencings
Example #29
from dbclients.tantalus import TantalusApi
from dbclients.colossus import ColossusApi
import logging


tantalus_api = TantalusApi()
colossus_api = ColossusApi()


if __name__ == '__main__':
    print "STARTING"
    colossus_analyses = colossus_api.list('analysis_information')
    tantalus_analyses = tantalus_api.list('analysis', analysis_type__name="align")

    analysis_lane_dict = {}

    for analysis in tantalus_analyses:
        lane_set = set()
        for input_dataset in analysis['input_datasets']:
            dataset = tantalus_api.get('sequencedataset', id=input_dataset)
            for lane in dataset['sequence_lanes']:
                lane_set.add(str(lane['flowcell_id'] + "_" + str(lane['lane_number'])))

        analysis_lane_dict[analysis['name']] = lane_set

    print(analysis_lane_dict)

    for analysis in colossus_analyses:
        key = analysis['analysis_jira_ticket'] + '_align'
        if key in analysis_lane_dict:
            lanes = []
Example #30
def main(
    storage_name,
    dry_run=False,
    check_remote=None,
):
    tantalus_api = TantalusApi()

    storage_client = tantalus_api.get_storage_client(storage_name)

    remote_storage_client = None
    if check_remote is not None:
        remote_storage_client = tantalus_api.get_storage_client(check_remote)

    file_instances = tantalus_api.list('file_instance',
                                       storage__name=storage_name,
                                       is_deleted=True)

    # DEBUG: check whether we are getting back
    # consistent ordered results from tantalus
    file_instances = list(file_instances)
    file_instance_ids = set([f['id'] for f in file_instances])
    if len(file_instances) != len(file_instance_ids):
        raise Exception('received duplicate results from tantalus')

    logging.info('processing {} file instances'.format(len(file_instance_ids)))
    logging.info('processing the following file instances: {}'.format(
        str(file_instance_ids)))

    for file_instance in file_instances:
        file_resource = tantalus_api.get(
            'file_resource', id=file_instance['file_resource']['id'])
        all_file_instances = list(
            tantalus_api.list('file_instance',
                              file_resource=file_resource['id']))

        logging.info(
            'checking file instance {}, file resource {}, filepath {}'.format(
                file_instance['id'], file_resource['id'],
                file_instance['filepath']))

        sequencedatasets = tantalus_api.list(
            'sequencedataset', file_resources__id=file_resource['id'])
        resultsdatasets = tantalus_api.list(
            'resultsdataset', file_resources__id=file_resource['id'])

        sequencedataset_ids = list(set([a['id'] for a in sequencedatasets]))
        resultsdataset_ids = list(set([a['id'] for a in resultsdatasets]))

        logging.info(
            'file resource {} belongs to sequencedataset {} and resultsdatasets {}'
            .format(file_resource['id'], sequencedataset_ids,
                    resultsdataset_ids))

        # Optionally check for a remote version
        if remote_storage_client:
            remote_instance = None
            for other_instance in file_resource['file_instances']:
                if other_instance['storage']['name'] == check_remote:
                    remote_instance = other_instance

            if not remote_instance:
                logging.info(
                    'not deleting file instance {}, no other instance'.format(
                        file_instance['id']))
                continue

            if remote_instance['is_deleted']:
                logging.info(
                    'not deleting file instance {}, other instance {} deleted'.
                    format(file_instance['id'], remote_instance['id']))
                continue

            if not remote_storage_client.exists(file_resource['filename']):
                logging.info(
                    'not deleting file instance {}, other instance {} does not exist'
                    .format(file_instance['id'], remote_instance['id']))
                continue

            logging.info(
                'deletion ok for file instance {}, found other instance {}'.
                format(file_instance['id'], remote_instance['id']))

        # Delete the file from the filesystem
        logging.info('deleting file {}'.format(file_instance['filepath']))
        if not dry_run:
            try:
                storage_client.delete(file_resource['filename'])
            except FileNotFoundError:
                logging.exception('file already deleted')

        # Delete the instance model from tantalus
        logging.info('deleting file instance {}'.format(file_instance['id']))
        if not dry_run:
            tantalus_api.delete('file_instance', id=file_instance['id'])

        # If this is the only file instance for this file resource, delete the file resource
        if len(all_file_instances) == 1:
            assert all_file_instances[0]['id'] == file_instance['id']
            logging.info('deleting file resource {}'.format(
                file_resource['id']))
            if not dry_run:
                tantalus_api.delete('file_resource', id=file_resource['id'])
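
A usage sketch for the cleanup routine above; the storage names are placeholders. Passing check_remote only deletes local instances that still have an intact copy on the named remote storage.

# Hypothetical invocation: preview the deletions first, then run them for real.
main('shahlab', dry_run=True, check_remote='singlecellblob')
main('shahlab', dry_run=False, check_remote='singlecellblob')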