Example #1
    def create_output_datasets(self, storages, update=False):
        """
        Create the set of output sequence datasets produced by this analysis.
        """
        assert len(self.analysis['input_datasets']) == 1
        input_dataset = self.tantalus_api.get('sequence_dataset', id=self.analysis['input_datasets'][0])
        storage_client = self.tantalus_api.get_storage_client(storages["working_inputs"])
        metadata_yaml_path = os.path.join(self.bams_dir, "metadata.yaml")
        metadata_yaml = yaml.safe_load(storage_client.open_file(metadata_yaml_path))

        name = templates.WGS_SPLIT_BAM_NAME_TEMPLATE.format(
            dataset_type="BAM",
            sample_id=input_dataset["sample"]["sample_id"],
            library_type=input_dataset["library"]["library_type"],
            library_id=input_dataset["library"]["library_id"],
            lanes_hash=get_lanes_hash(input_dataset["sequence_lanes"]),
            aligner=input_dataset['aligner'],
            reference_genome=input_dataset['reference_genome'],
            split_length=self.split_size,
        )

        file_resources = []
        for filename in metadata_yaml["filenames"] + ['metadata.yaml']:
            filepath = os.path.join(
                storage_client.prefix, self.bams_dir, filename)
            file_resource, file_instance = self.tantalus_api.add_file(
                storages["working_inputs"], filepath, update=update)
            file_resources.append(file_resource["id"])

        data = {
            'name': name,
            'version_number': 1,
            'dataset_type': "BAM",
            'sample': input_dataset["sample"]["id"],
            'library': input_dataset["library"]["id"],
            'sequence_lanes': [a["id"] for a in input_dataset["sequence_lanes"]],
            'file_resources': file_resources,
            'aligner': input_dataset["aligner"],
            'reference_genome': input_dataset["reference_genome"],
            'region_split_length': self.split_size,
            'analysis': self.analysis['id'],
        }

        keys = [
            'name',
            'version_number',
        ]

        output_dataset, _ = self.tantalus_api.create(
            'sequencedataset', data, keys, get_existing=True, do_update=update)

        logging.info("Created sequence dataset {}".format(name))

        return [output_dataset]
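The method reads a metadata.yaml next to the split BAMs and only uses its filenames key; a minimal hypothetical file might look like the following sketch (filenames are illustrative, not taken from a real run).

import yaml

# Hypothetical metadata.yaml content; create_output_datasets above only
# reads the 'filenames' key and registers each listed file plus the
# metadata.yaml itself.
example_metadata = """
filenames:
  - SA123_A12345_1_0_10000000.bam
  - SA123_A12345_1_0_10000000.bam.bai
"""
print(yaml.safe_load(example_metadata)["filenames"])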
Example #2
    def generate_unique_name(cls, tantalus_api, jira, version, args,
                             input_datasets, input_results):
        assert len(input_datasets) == 1
        dataset = tantalus_api.get('sequence_dataset', id=input_datasets[0])

        name = templates.SC_PSEUDOBULK_ANALYSIS_NAME_TEMPLATE.format(
            analysis_type=cls.analysis_type_,
            aligner=dataset['aligner'],
            ref_genome=dataset['reference_genome'],
            library_id=dataset['library']['library_id'],
            sample_id=dataset['sample']['sample_id'],
            lanes_hashed=get_lanes_hash(dataset["sequence_lanes"]),
        )

        return name
Example #3
def get_tantalus_bam_filename(sample, library, lane_infos):
    """
    Creates filename for bam that matches current naming conventions
    in Tantalus

    Args:
        sample:     (dict) the sample associated with the bam
        library:    (dict) the library associated with the bam
        lane_infos: (list of dictionaries) contains lane info 
                    associated with the bam
    Returns:
        bam_path:   (string) the filename for the bam following 
                    naming conventions
    """
    lanes_str = get_lanes_hash(lane_infos)
    bam_path = WGS_BAM_NAME_TEMPLATE.format(
        sample_id=sample["sample_id"],
        library_type=library["library_type"],
        library_id=library["library_id"],
        lanes_str=lanes_str,
    )

    return bam_path
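A self-contained sketch of how that filename could be assembled; the template string and the get_lanes_hash stand-in below are assumptions for illustration, not the definitions actually used by this codebase.

import hashlib

# Assumed stand-ins: the real WGS_BAM_NAME_TEMPLATE and get_lanes_hash live
# elsewhere in this codebase and may differ in format and hashing details.
WGS_BAM_NAME_TEMPLATE = "{sample_id}-{library_type}-{library_id}-lanes_{lanes_str}"

def get_lanes_hash(lane_infos):
    # assumed behaviour: short md5 digest of the sorted flowcell/lane strings
    lane_strs = sorted(
        "{}_{}".format(l["flowcell_id"], l["lane_number"]) for l in lane_infos)
    return hashlib.md5(", ".join(lane_strs).encode("utf-8")).hexdigest()[:8]

sample = {"sample_id": "SA123"}                              # illustrative
library = {"library_id": "A12345", "library_type": "WGS"}    # illustrative
lane_infos = [{"flowcell_id": "HABCDEXX", "lane_number": "3"}]

print(WGS_BAM_NAME_TEMPLATE.format(
    sample_id=sample["sample_id"],
    library_type=library["library_type"],
    library_id=library["library_id"],
    lanes_str=get_lanes_hash(lane_infos),
))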
Example #4
    def generate_unique_name(cls, tantalus_api, jira, version, args,
                             input_datasets, input_results):
        assert len(input_datasets) == 2
        for dataset_id in input_datasets:
            dataset = tantalus_api.get('sequencedataset', id=dataset_id)
            if dataset['sample']['sample_id'] == args['sample_id']:
                tumour_dataset = dataset

        assert tumour_dataset['aligner'].startswith(args['aligner'])
        assert tumour_dataset['reference_genome'] == args['ref_genome']
        assert tumour_dataset['library']['library_id'] == args['library_id']
        assert tumour_dataset['sample']['sample_id'] == args['sample_id']

        name = templates.SC_PSEUDOBULK_ANALYSIS_NAME_TEMPLATE.format(
            analysis_type=cls.analysis_type_,
            aligner=args['aligner'],
            ref_genome=args['ref_genome'],
            library_id=args['library_id'],
            sample_id=args['sample_id'],
            lanes_hashed=get_lanes_hash(tumour_dataset["sequence_lanes"]),
        )

        return name
Example #5
def import_tenx_fastqs(storage_name,
                       sequencing,
                       no_comments=False,
                       update=False):
    storage_client = tantalus_api.get_storage_client(storage_name)

    # get colossus sequencing id
    sequencing_id = sequencing["id"]
    # get pool id from sequencing
    pool_id = sequencing["tenx_pool"]
    # get colossus tenx pool object
    pool = colossus_api.get("tenxpool", id=pool_id)
    # get pool name
    pool_name = pool['pool_name']

    # get gsc id (this may not have been filled out)
    gsc_pool_id = sequencing["gsc_library_id"]
    # query gsc by gsc pool id
    gsc_pool_infos = gsc_api.query(f"library?name={gsc_pool_id}")

    # if no gsc id was recorded, fall back to the colossus pool name
    if not gsc_pool_id:
        # query gsc by our identifier instead i.e. colossus pool name
        gsc_pool_infos = gsc_api.query(
            f"library?external_identifier={pool_name}")
        if gsc_pool_infos:
            # get name used internally at gsc
            gsc_pool_id = gsc_pool_infos[0]["name"]

    # try to fetch the gsc pool info again
    gsc_pool = gsc_api.query(f"library?name={gsc_pool_id}")

    # if no results found for a second time, exit
    if not gsc_pool:
        logging.info(f"cannot find data for {pool_name}, {gsc_pool_id}")
        return None

    # get id of gsc pool
    pool_id = gsc_pool[0]["id"]

    # get information about the sequencing run
    run_info = gsc_api.query(f"run?library_id={pool_id}")

    logging.info(f"Importing {pool_name} ")

    # init dictionary to be used for collecting library index pairs
    index_lib = dict()

    pool_libraries = []

    # for each library in the pool, collect the sample and index of the library
    for library in pool["libraries"]:
        # get colossus tenx library
        tenxlib = colossus_api.get("tenxlibrary", id=library)
        library = tenxlib['name']

        pool_libraries.append(library)
        # get sample name
        sample = tenxlib["sample"]["sample_id"]
        # get index name
        index_used = tenxlib["tenxlibraryconstructioninformation"][
            "index_used"]
        # the index name has a trailing comma, so keep only the part before it
        index = index_used.split(",")[0]
        print(f"{tenxlib['name']} {tenxlib['sample']['sample_id']} {index}")
        # add info keyed by index
        index_lib[index] = dict(tenxlib=tenxlib,
                                library=tenxlib['name'],
                                sample=tenxlib['sample']['sample_id'])

    # iterate through all sequencing runs of this pool
    for run in run_info:
        run_id = run["id"]
        # get all libcores of run
        # in the case of tenx, a libcore represents a colossus tenxlibrary
        libcore = gsc_api.query(
            f"libcore?run_id={run_id}&relations=primer%2Crun%2Clibrary&primer_columns=name"
        )

        gsc_sublibraries = []
        dataset_ids = []

        # skip run if no libcores found
        if not libcore:
            logging.info(f"no libcore")
            continue

        for lib in libcore:
            lanes = []
            lane_pks = []

            filenames = []

            index = lib["primer"]["name"]
            flowcell_id = lib["run"]["flowcell_id"]
            flowcell = gsc_api.query(f"flowcell?id={flowcell_id}")

            # check if the libcore is associated with a library in the pool
            try:
                tenxlib = index_lib[index]["tenxlib"]
                library = index_lib[index]["library"]
                sample = index_lib[index]["sample"]
            except Exception as e:
                logging.error(f"Index not found: {e}")
                raise Exception(f"Index not found: {e}")

            # collect sequencing info
            flowcell_id = str(flowcell[0]['lims_flowcell_code'])
            lane_number = str(lib['run']['lane_number'])
            sequencing_date = str(lib["run"]["run_datetime"])
            sequencing_instrument = get_sequencing_instrument(
                lib["run"]["machine"])
            sequencing_instrument = sequencing_instrument_map[
                sequencing_instrument]

            flowcell_lane = f"{flowcell_id}_{lane_number}"
            # get existing data
            existing_data = get_existing_fastq_data(tantalus_api, library)
            if flowcell_lane in existing_data:
                logging.info(
                    f"skipping {flowcell_lane} since already imported")
                continue

            # get internal gsc library name
            gsc_library_id = lib["library"]["name"]
            # update library's gsc name
            colossus_api.update("tenxlibrary",
                                id=tenxlib["id"],
                                gsc_library_id=gsc_library_id)

            gsc_sublibraries.append(gsc_library_id)

            # query for fastqs of the library
            fastqs = gsc_api.query(f"concat_fastq?libcore_id={lib['id']}")
            print(fastqs)

            for fastq in fastqs:
                filename_pattern = fastq["file_type"]["filename_pattern"]

                read_end, passed = filename_pattern_map.get(
                    filename_pattern, (None, None))

                if read_end is None:
                    logging.info(
                        "Unrecognized file type: {}".format(filename_pattern))
                    continue

                # construct fastq name
                new_filename = "_".join([
                    library, sample, "S1", f"L00{lane_number}", f"R{read_end}",
                    "001.fastq.gz"
                ])
                fullpath = os.path.join(storage_client.prefix, library,
                                        flowcell_lane, new_filename)
                filenames.append(fullpath)

                # add fastq to cloud storage
                storage_client.create(
                    os.path.join(library, flowcell_lane, new_filename),
                    fastq["data_path"],
                    update=True,
                )

            # if no files were found, move on to the next library
            if not filenames:
                print(
                    f"no data for run_id: {run_id}; lane {flowcell_id}_{lane_number}"
                )
                continue

            # collect and add lane info
            lane = dict(flowcell_id=flowcell_id, lane_number=str(lane_number))
            lanes.append(lane)

            # create tantalus library
            dna_library = tantalus_api.get_or_create(
                "dna_library",
                library_id=library,
                library_type="SC_RNASEQ",
                index_format="TENX",
            )

            try:
                lane_object = tantalus_api.get(
                    "sequencing_lane",
                    flowcell_id=flowcell_id,
                    lane_number=str(lane_number),
                    dna_library=dna_library["id"],
                )

                tantalus_api.update(
                    "sequencing_lane",
                    id=lane_object["id"],
                    sequencing_centre="GSC",
                    sequencing_instrument=sequencing_instrument,
                    read_type="TENX",
                )

            except:
                lane_object, _ = tantalus_api.create(
                    "sequencing_lane",
                    fields=dict(
                        flowcell_id=flowcell_id,
                        lane_number=str(lane_number),
                        sequencing_centre="GSC",
                        sequencing_instrument=sequencing_instrument,
                        read_type="TENX",
                        dna_library=dna_library["id"],
                    ),
                    keys=[
                        "flowcell_id",
                        "lane_number",
                        "sequencing_centre",
                        "dna_library",
                    ],
                    get_existing=True,
                )

            lane_pks.append(lane_object["id"])

            dataset_name = TENX_SCRNA_DATASET_TEMPLATE.format(
                dataset_type="FQ",
                sample_id=sample,
                library_type="SC_RNASEQ",
                library_id=library,
                lanes_hash=get_lanes_hash(lanes),
            )
            sequence_dataset = add_generic_dataset(
                filepaths=filenames,
                sample_id=sample,
                library_id=library,
                storage_name="scrna_fastq",
                dataset_name=dataset_name,
                dataset_type="FQ",
                sequence_lane_pks=lane_pks,
                reference_genome="HG38",
                update=True,
            )

            dataset_ids.append(sequence_dataset)

            url = f"https://colossus.canadacentral.cloudapp.azure.com/tenx/sequencing/{sequencing_id}"
            comment = f"Import successful:\n\nLane: {flowcell_lane}\nGSC Library ID: {gsc_library_id}\n{url}"

            comments = jira_api.comments(tenxlib["jira_ticket"])
            commented = False
            for c in comments:
                if c.body == comment:
                    commented = True
                    break

            if not commented:
                comment_jira(tenxlib["jira_ticket"], comment)

            # create jira ticket
            jira_ticket = create_analysis_jira_ticket(library, sample,
                                                      tenxlib['jira_ticket'])
            # create colossus analysis
            analysis, _ = colossus_api.create(
                "tenxanalysis",
                fields={
                    "version": "vm",
                    "jira_ticket": jira_ticket,
                    "run_status": "idle",
                    "tenx_library": tenxlib["id"],
                    "submission_date": str(datetime.date.today()),
                    "tenxsequencing_set": [],
                },
                keys=["jira_ticket"],
            )
            # create tantalus analysis
            create_tenx_analysis_from_library(jira_ticket, library)

        # check if data has been imported
        if filenames:
            # add lanes to colossus
            colossus_lane = colossus_api.get_or_create(
                "tenxlane",
                flow_cell_id=flowcell_lane,
                sequencing=sequencing_id,
            )
            # update lane with datasets, gsc sublibrary names, and sequencing date
            colossus_api.update(
                "tenxlane",
                id=colossus_lane["id"],
                tantalus_datasets=list(set(dataset_ids)),
                gsc_sublibrary_names=gsc_sublibraries,
                sequencing_date=sequencing_date,
            )

    # update the sequencing's gsc library id if it doesn't match what we found
    if sequencing["gsc_library_id"] != gsc_pool_id:
        logging.info(
            "Updating gsc library id of sequencing {} from {} to {}".format(
                sequencing["id"], sequencing["gsc_library_id"], gsc_pool_id))
        colossus_api.update("tenxsequencing",
                            sequencing["id"],
                            gsc_library_id=gsc_pool_id)

    logging.info("Succesfully imported {} {}".format(pool_name, gsc_pool_id))

    import_info = dict(
        pool_name=pool_name,
        libraries=pool_libraries,
        gsc_library_id=gsc_pool_id,
    )

    return import_info
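Two details of the loop above are easy to miss: filename_pattern_map maps GSC filename patterns to (read_end, passed) tuples, and each fastq is renamed to a Cell Ranger style name before upload. A hedged sketch with illustrative values; the actual patterns in this codebase may differ.

# Assumed shape of filename_pattern_map; patterns and flags are illustrative.
filename_pattern_map = {
    "_1.fastq.gz": (1, True),
    "_2.fastq.gz": (2, True),
}

# The renamed fastq follows the Cell Ranger convention
# <library>_<sample>_S1_L00<lane>_R<read_end>_001.fastq.gz.
library, sample = "TENX001", "SA123"       # illustrative names
lane_number, read_end = 3, 1               # illustrative lane and read end
new_filename = "_".join([
    library, sample, "S1", f"L00{lane_number}", f"R{read_end}", "001.fastq.gz"
])
print(new_filename)  # TENX001_SA123_S1_L003_R1_001.fastq.gz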
Example #6
def create_sequence_dataset_models(file_info,
                                   storage_name,
                                   tag_name,
                                   tantalus_api,
                                   analysis_id=None,
                                   update=False):
    """Create tantalus sequence models for a list of files."""

    analysis = None
    if analysis_id is not None:
        analysis = tantalus_api.get('analysis', id=analysis_id)

    # Get storage and tag PKs
    storage = tantalus_api.get("storage", name=storage_name)
    storage_pk = storage["id"]

    # Sort files by dataset
    dataset_info = collections.defaultdict(list)
    for info in file_info:
        if info["dataset_type"] == 'BAM':
            dataset_name = templates.SC_WGS_BAM_NAME_TEMPLATE.format(
                dataset_type=info["dataset_type"],
                sample_id=info["sample_id"],
                library_type=info["library_type"],
                library_id=info["library_id"],
                lanes_hash=get_lanes_hash(info["sequence_lanes"]),
                aligner=info["aligner_name"],
                reference_genome=info["ref_genome"],
                jira_ticket=analysis["jira_ticket"],
            )
        elif info["dataset_type"] == 'FQ':
            dataset_name = templates.SC_WGS_FQ_NAME_TEMPLATE.format(
                dataset_type=info["dataset_type"],
                sample_id=info["sample_id"],
                library_type=info["library_type"],
                library_id=info["library_id"],
                lane=get_lane_str(info["sequence_lanes"][0]),
            )
        dataset_info[dataset_name].append(info)

    # Create datasets
    dataset_ids = set()
    for dataset_name, infos in dataset_info.items():
        # Get library PK
        library = tantalus_api.get_or_create(
            "dna_library",
            library_id=infos[0]["library_id"],
            library_type=infos[0]["library_type"],
            index_format=infos[0]["index_format"],
        )
        library_pk = library["id"]

        # Get sample PK
        sample = tantalus_api.get_or_create(
            "sample",
            sample_id=infos[0]["sample_id"],
        )
        sample_pk = sample["id"]

        # Build up sequence dataset attrs; we'll add to this as we
        # proceed throughout the function
        sequence_dataset = dict(
            name=dataset_name,
            dataset_type=infos[0]["dataset_type"],
            sample=sample_pk,
            library=library_pk,
            sequence_lanes=[],
            file_resources=[],
        )

        # Add in the analysis id if it's provided
        if analysis_id is not None:
            sequence_dataset["analysis"] = analysis_id

        # Add in BAM specific items
        if infos[0]["dataset_type"] == "BAM":
            sequence_dataset["aligner"] = infos[0]["aligner_name"]
            sequence_dataset["reference_genome"] = infos[0]["ref_genome"]

        for info in infos:
            # Check consistency for fields used for dataset
            check_fields = (
                "dataset_type",
                "sample_id",
                "library_id",
                "library_type",
                "index_format",
            )
            for field_name in check_fields:
                if info[field_name] != infos[0][field_name]:
                    raise Exception("error with field {}".format(field_name))

            for sequence_lane in info["sequence_lanes"]:
                sequence_lane = dict(sequence_lane)
                sequence_lane["dna_library"] = library_pk
                sequence_lane["lane_number"] = str(
                    sequence_lane["lane_number"])

                sequence_lane = tantalus_api.get_or_create(
                    "sequencing_lane", **sequence_lane)

                sequence_dataset["sequence_lanes"].append(sequence_lane["id"])

            sequence_file_info = dict(index_sequence=info["index_sequence"])

            if "read_end" in info:
                sequence_file_info["read_end"] = info["read_end"]

            file_resource, file_instance = tantalus_api.add_file(
                storage_name,
                info["filepath"],
                update=update,
            )

            sequence_file_info = tantalus_api.get_or_create(
                "sequence_file_info",
                file_resource=file_resource["id"],
                **sequence_file_info)

            sequence_dataset["file_resources"].append(file_resource["id"])

        try:
            dataset_id = tantalus_api.get("sequence_dataset",
                                          name=sequence_dataset["name"])["id"]
        except NotFoundError:
            dataset_id = None

        if update and dataset_id is not None:
            log.warning("sequence dataset {} has changed, updating".format(
                sequence_dataset["name"]))
            dataset = tantalus_api.update("sequence_dataset",
                                          id=dataset_id,
                                          **sequence_dataset)

        else:
            log.info("creating sequence dataset {}".format(
                sequence_dataset["name"]))
            dataset = tantalus_api.get_or_create("sequence_dataset",
                                                 **sequence_dataset)

        if tag_name is not None:
            tantalus_api.tag(tag_name, sequencedataset_set=[dataset['id']])

        dataset_ids.add(dataset['id'])

    return dataset_ids
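The file_info entries consumed above are plain dicts; the keys below are the ones the function actually reads, with illustrative values for a hypothetical BAM entry.

# Hypothetical file_info entry; every value is illustrative.
file_info_entry = {
    "dataset_type": "BAM",             # or "FQ"
    "sample_id": "SA123",
    "library_id": "A12345",
    "library_type": "SC_WGS",
    "index_format": "D",
    "aligner_name": "BWA_MEM_0_7_6A",  # read for BAM datasets only
    "ref_genome": "HG19",              # read for BAM datasets only
    "index_sequence": "ACGT-TGCA",
    "filepath": "/path/to/example.bam",
    "sequence_lanes": [{
        "flowcell_id": "HABCDEXX",
        "lane_number": "3",
        "sequencing_centre": "GSC",
        "read_type": "P",
    }],
    # "read_end": 1,                   # only set for fastq entries
}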
Example #7
def add_sequence_dataset(tantalus_api,
                         storage_name,
                         sample,
                         library,
                         dataset_type,
                         sequence_lanes,
                         bam_file_path,
                         reference_genome,
                         aligner,
                         bai_file_path=None,
                         tag_name=None,
                         update=False):
    """
        Add a sequence dataset, gets or creates the required sample, library, 
        and sequence lanes for the dataset

        Args:
            storage_name (str)
            dataset_type (str)
            sample_id (dict):       contains: sample_id
            library (dict):         contains: library_id, library_type, index_format
            sequence_lanes (list):  contains: flowcell_id, read_type, lane_number, 
                                    sequencing_centre, sequencing_instrument, library_id
            bam_file_path (str):    bam file path to data included in dataset
            reference_genome (str)
            aligner (str)
            bai_file_path (str):    bam index file path to data included in dataset (optional)
            tags (list)
        Returns:
            sequence_dataset (dict)
        """
    # Create the sample
    sample = tantalus_api.get_or_create(
        "sample",
        sample_id=sample['sample_id'],
    )

    # Create the library
    library = tantalus_api.get_or_create("dna_library",
                                         library_id=library["library_id"],
                                         library_type=library["library_type"],
                                         index_format=library["index_format"])

    # Create the sequence lanes
    sequence_lane_pks = []
    for lane in sequence_lanes:
        # Get library ID associated with each lane
        lane_library_pk = tantalus_api.get_or_create(
            "dna_library",
            library_id=lane["library_id"],
            library_type=library["library_type"],
            index_format=library["index_format"])["id"]

        lane_fields = dict(
            dna_library=lane_library_pk,
            flowcell_id=lane["flowcell_id"],
            lane_number=str(lane["lane_number"]),
        )

        # Optional fields for create
        for field_name in ("read_type", "sequencing_centre",
                           "sequencing_instrument"):
            if field_name in lane:
                lane_fields[field_name] = lane[field_name]
            else:
                logging.warning(
                    f"field {field_name} missing for lane {lane['flowcell_id']}_{lane['lane_number']}"
                )

        lane_pk = tantalus_api.get_or_create("sequencing_lane",
                                             **lane_fields)["id"]

        sequence_lane_pks.append(lane_pk)

    # Create the tag
    if tag_name is not None:
        tag_pk = tantalus_api.get_or_create("tag", name=tag_name)["id"]
        tags = [tag_pk]
    else:
        tags = []

    # Create the file resources
    file_resource_pks = []
    file_resource, file_instance = tantalus_api.add_file(storage_name,
                                                         bam_file_path,
                                                         update=update)
    file_resource_pks.append(file_resource["id"])

    if bai_file_path is not None:
        file_resource, file_instance = tantalus_api.add_file(storage_name,
                                                             bai_file_path,
                                                             update=update)
        file_resource_pks.append(file_resource["id"])

    dataset_name = templates.WGS_BAM_NAME_TEMPLATE.format(
        dataset_type="BAM",
        sample_id=sample["sample_id"],
        library_type=library["library_type"],
        library_id=library["library_id"],
        lanes_hash=get_lanes_hash(sequence_lanes),
        aligner=aligner,
        reference_genome=reference_genome,
    )

    # Find all similarly named datasets
    similar_datasets = list(
        tantalus_api.list(
            "sequence_dataset",
            name=dataset_name,
        ))

    # Filter for a similarly named dataset with the same files
    existing_dataset = None
    for dataset in similar_datasets:
        if set(dataset['file_resources']) == set(file_resource_pks):
            existing_dataset = dataset
            logging.info(
                f"found existing dataset {dataset['id']} with identical file list"
            )
            break
        elif set(dataset['file_resources']).intersection(
                set(file_resource_pks)):
            raise ValueError(
                f"dataset {dataset['id']} has files {dataset['file_resources']} partially intersecting with {list(file_resource_pks)}"
            )

    if existing_dataset is not None:
        # Get or create to check field consistency
        sequence_dataset = tantalus_api.get_or_create(
            "sequence_dataset",
            name=dataset_name,
            version_number=existing_dataset['version_number'],
            dataset_type=dataset_type,
            sample=sample["id"],
            library=library["id"],
            sequence_lanes=sequence_lane_pks,
            file_resources=file_resource_pks,
            reference_genome=reference_genome,
            aligner=aligner,
        )

        # Update the existing dataset tags
        tag_ids = tags + existing_dataset["tags"]
        sequence_dataset = tantalus_api.update(
            "sequence_dataset",
            id=existing_dataset["id"],
            tags=tag_ids,
        )

    else:
        # Find a new version number if necessary
        version_number = 1
        if len(similar_datasets) > 0:
            version_number = max(d['version_number']
                                 for d in similar_datasets) + 1
            logging.info(
                f"creating new version of dataset {dataset_name} with version number {version_number}"
            )

        fields = {
            'name': dataset_name,
            'version_number': version_number,
            'dataset_type': dataset_type,
            'sample': sample["id"],
            'library': library["id"],
            'sequence_lanes': sequence_lane_pks,
            'file_resources': file_resource_pks,
            'reference_genome': reference_genome,
            'aligner': aligner,
            'tags': tags,
        }

        sequence_dataset, is_updated = tantalus_api.create(
            "sequence_dataset", fields, keys=["name", "version_number"])

    return sequence_dataset
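A hypothetical call sketch, assuming the dbclients.tantalus client used elsewhere in this codebase; every identifier, path, and storage name below is illustrative.

from dbclients.tantalus import TantalusApi  # assumed client for this codebase

tantalus_api = TantalusApi()

dataset = add_sequence_dataset(
    tantalus_api,
    storage_name="singlecellblob",      # hypothetical storage name
    sample={"sample_id": "SA123"},
    library={"library_id": "A12345", "library_type": "WGS", "index_format": "N"},
    dataset_type="BAM",
    sequence_lanes=[{
        "library_id": "A12345",
        "flowcell_id": "HABCDEXX",
        "lane_number": "3",
        "read_type": "P",
        "sequencing_centre": "GSC",
        "sequencing_instrument": "HiSeqX",
    }],
    bam_file_path="/path/to/example.bam",
    reference_genome="HG19",
    aligner="BWA_MEM_0_7_6A",
    bai_file_path="/path/to/example.bam.bai",
    tag_name=None,
    update=False,
)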
Example #8
def tantalus_import(library_id,
                    sample_id,
                    lane_infos,
                    blob_paths,
                    sequencing_centre,
                    dataset_type,
                    storage_name,
                    tag_name=None,
                    update=False):
    """
    Imports tenx sequence dataset and file resources into tantalus

    Args:
        library_id:         (str) internal name for the library
        sample_id:          (str) internal name for the sample
        lane_infos:         (list) a list of dictionaries containing 
                            flowcell ID, lane number, and sequencing
                            instrument
        blob_paths:         (list) a list of filepaths to the FASTQs 
                            on azure storage
        sequencing_centre:  (str) GSC or BRC according to where the 
                            library was sequenced 
        dataset_type:       (str) FQ, BAM, or BCL
        storage_name:       (str) name of the azure storage in tantalus
        tag_name:           (str) name of the tag associated with the 
                            dataset
        update:             (bool) a boolean indicating whether to update
                            any information already in tantalus
    Returns:
        sequence_dataset["id"]: ID of the newly created sequence dataset
    """
    file_resource_ids, file_instance_ids, sequence_lanes, sequence_lanes_pks = [], [], [], []

    sample_pk = tantalus_api.get_or_create(
        "sample",
        sample_id=sample_id,
    )["id"]
    library_pk = tantalus_api.get_or_create(
        "dna_library",
        library_id=library_id,
        library_type="SC_RNASEQ",
        index_format="TENX",
    )["id"]

    # Add the file resources to tantalus
    for blob_path in blob_paths:
        file_resource, file_instance = tantalus_api.add_file(storage_name,
                                                             blob_path,
                                                             update=update)
        file_resource_ids.append(file_resource["id"])
        file_instance_ids.append(file_instance["id"])

    logging.info("Adding lanes to Tantalus")
    for lane_info in lane_infos:
        # Try to find a match for the sequencing instrument
        try:
            sequencing_instrument = TANTALUS_SEQUENCING_MAP[
                lane_info["sequencing_instrument"]]
        except KeyError:
            sequencing_instrument = lane_info["sequencing_instrument"]
        lane = tantalus_api.get_or_create(
            "sequencing_lane",
            flowcell_id=lane_info["flowcell_id"],
            dna_library=library_pk,
            read_type="TENX",
            lane_number=str(lane_info["lane_number"]),
            sequencing_centre=sequencing_centre,
            sequencing_instrument=sequencing_instrument)
        sequence_lanes.append(lane)
        sequence_lanes_pks.append(lane["id"])

    dataset_name = TENX_SCRNA_DATASET_TEMPLATE.format(
        dataset_type=dataset_type,
        sample_id=sample_id,
        library_type="SC_RNASEQ",
        library_id=library_id,
        lanes_hash=get_lanes_hash(sequence_lanes),
    )

    # Create tags
    if tag_name is not None:
        tag_pk = tantalus_api.get_or_create("tag", name=tag_name)["id"]
        tags = [tag_pk]
    else:
        tags = []

    # Add the sequence dataset to tantalus
    sequence_dataset = tantalus_api.get_or_create(
        "sequence_dataset",
        name=dataset_name,
        dataset_type=dataset_type,
        sample=sample_pk,
        library=library_pk,
        sequence_lanes=sequence_lanes_pks,
        file_resources=file_resource_ids,
        tags=tags,
    )
    logging.info("Sequence dataset has ID {}".format(sequence_dataset["id"]))
    return sequence_dataset["id"]
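A hypothetical invocation sketch; the function relies on a module-level tantalus_api client being configured, and all identifiers and blob paths below are illustrative.

# Hypothetical usage; assumes the module-level tantalus_api client is set up.
dataset_id = tantalus_import(
    library_id="TENX001",
    sample_id="SA123",
    lane_infos=[{
        "flowcell_id": "HABCDEXX",
        "lane_number": "3",
        "sequencing_instrument": "HiSeqX",
    }],
    blob_paths=["TENX001/HABCDEXX_3/TENX001_SA123_S1_L003_R1_001.fastq.gz"],
    sequencing_centre="GSC",
    dataset_type="FQ",
    storage_name="scrna_fastq",
    tag_name=None,
    update=False,
)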