Example #1
def main(storage_name, bam_file_path, **kwargs):
    """
    Imports the bam into tantalus by creating a sequence dataset and 
    file resources 
    """
    logging.basicConfig(format=LOGGING_FORMAT,
                        stream=sys.stderr,
                        level=logging.INFO)

    tantalus_api = TantalusApi()

    sample = None
    if kwargs.get('sample_id') is not None:
        sample = tantalus_api.get_or_create(
            'sample',
            sample_id=kwargs['sample_id'],
        )

    library = None
    if kwargs.get('library_id') is not None:
        if kwargs.get('library_type') is not None and kwargs.get(
                'index_format') is not None:
            library = tantalus_api.get_or_create(
                'dna_library',
                library_id=kwargs['library_id'],
                library_type=kwargs['library_type'],
                index_format=kwargs['index_format'],
            )
        else:
            library = tantalus_api.get(
                'dna_library',
                library_id=kwargs['library_id'],
            )

    dataset = import_bam(
        storage_name,
        bam_file_path,
        sample=sample,
        library=library,
        read_type=kwargs.get('read_type'),
        ref_genome=kwargs.get('ref_genome'),
        update=kwargs.get('update'),
        tag_name=kwargs.get('tag_name'),
    )

    print("dataset {}".format(dataset["id"]))
Example #2
def add_generic_dataset(**kwargs):
    tantalus_api = TantalusApi()

    file_resource_pks = []

    sample = tantalus_api.get_or_create("sample", sample_id=kwargs['sample_id'])

    library = tantalus_api.get_or_create("dna_library", library_id=kwargs['library_id'])

    # Add the file resources to Tantalus
    for filepath in kwargs['filepaths']:
        logging.info("Adding file resource for {} to Tantalus".format(filepath))
        resource, instance = tantalus_api.add_file(
            storage_name=kwargs['storage_name'],
            filepath=filepath,
            update=kwargs['update'],
        )
        file_resource_pks.append(resource["id"])

    if "tag_name" in kwargs:
        tag = tantalus_api.get("tag", name=kwargs["tag_name"])
        tags = [tag["id"]]
    else:
        tags = []

    ref_genome = kwargs.get("reference_genome")
    aligner = kwargs.get("aligner")

    # Add the dataset to tantalus
    sequence_dataset = tantalus_api.get_or_create(
        "sequence_dataset",
        name=kwargs['dataset_name'],
        dataset_type=kwargs['dataset_type'],
        sample=sample["id"],
        library=library["id"],
        sequence_lanes=kwargs["sequence_lane_pks"],
        file_resources=file_resource_pks,
        reference_genome=ref_genome,
        aligner=aligner,
        tags=tags,
    )

    logging.info("Succesfully created sequence dataset with ID {}".format(sequence_dataset["id"]))
Example #3
def add_analysis(**kwargs):
    tantalus_api = TantalusApi()

    # Create a new analysis object
    analysis = tantalus_api.get_or_create("analysis",
                                          name=kwargs['name'],
                                          jira_ticket=kwargs['jira_id'],
                                          analysis_type=kwargs['type'],
                                          version=kwargs['version'])

    logging.info("Successfully created analysis with ID {}".format(
        analysis["id"]))
Example #4
def main(**kwargs):
    """
    Queries the GSC for WGS bams. Transfers bams to specified storage if 
    necessary and uploads metadata to tantalus

    Args:
        ids:                (string) a list of internal IDs to query the GSC for 
        storage:            (string) destination storage to transfer bams to
        id_type:            (string) type of ID specified (either sample or library) 
        skip_older_than:    (string) skip bams older than this date
        tag_name:           (string) tag name to associate the resulting sequence datasets
                            with when importing into tantalus
        update:             (flag) specifies whether metadata in tantalus is
                            to be updated or not
        skip_file_import:   (flag) import only new lanes into tantalus
        query_only:         (flag) only query for the bam paths on the GSC 
    """
    # Check if this script is being run on thost
    # If not, connect to an ssh client to access /projects/files
    if socket.gethostname() != "txshah":
        ssh_client = connect_to_client("10.9.208.161")
        sftp = ssh_client.open_sftp()
    else:
        sftp = None

    # Connect to the Tantalus API
    tantalus_api = TantalusApi()
    storage = tantalus_api.get_storage(kwargs["storage"])

    # Convert the date to the format we want, guarding against a missing value
    skip_older_than = None
    if kwargs["skip_older_than"]:
        skip_older_than = valid_date(kwargs["skip_older_than"])

    # Check that an ID type was specified
    if not kwargs["id_type"]:
        raise Exception("Please specify an ID type (sample or library)")

    details = []
    for identifier in kwargs["ids"]:
        # Query the GSC to see if the ID exists
        infos = query_gsc(identifier, kwargs["id_type"])

        if not infos:
            logging.info("No results for {} {}. Skipping import".format(
                kwargs["id_type"], identifier))
            continue

        logging.info("{} {} exists on the GSC".format(
            kwargs["id_type"], identifier))

        # Get the data from GSC
        details = get_gsc_details(
            infos,
            skip_older_than=skip_older_than,
        )

        # Import and transfer each file
        for detail in details:
            # Rename the bams according to internal templates
            bam_paths = rename_bam_paths(detail, storage, sftp)

            # If the bam path does not exist at the source, skip
            # the transfer and import of this bam
            if not bam_paths["source_bam_path"]:
                continue

            # Skip import if we only wanted to query for paths
            if kwargs["query_only"]:
                continue

            if not kwargs["skip_file_import"]:
                # Transfer the bam to the specified storage
                transfer_gsc_bams(detail, bam_paths, storage, sftp)

                # Add the files to Tantalus
                logging.info("Importing {} to Tantalus".format(
                    bam_paths["tantalus_bam_path"]))

                dataset = import_bam(
                    storage_name=storage["name"],
                    bam_file_path=bam_paths["tantalus_bam_path"],
                    sample=detail["sample"],
                    library=detail["library"],
                    lane_infos=detail["lane_info"],
                    read_type=detail["read_type"],
                    tag_name=kwargs["tag_name"],
                    update=kwargs["update"])

                logging.info(
                    "Successfully added sequence dataset with ID {}".format(
                        dataset["id"]))
            else:
                logging.info("Importing library {} to tantalus".format(
                    detail["library"]["library_id"]))
                library_pk = tantalus_api.get_or_create(
                    "dna_library",
                    library_id=detail["library"]["library_id"],
                    library_type=detail["library"]["library_type"],
                    index_format=detail["library"]["index_format"])["id"]

                # Only add lanes, libraries, and samples to Tantalus
                logging.info(
                    "Importing lanes for library {} to Tantalus".format(
                        detail["library"]["library_id"]))
                for lane in detail["lane_info"]:
                    lane = tantalus_api.get_or_create(
                        "sequencing_lane",
                        flowcell_id=lane["flowcell_id"],
                        dna_library=library_pk,
                        read_type=lane["read_type"],
                        lane_number=str(lane["lane_number"]),
                        sequencing_centre="GSC",
                        sequencing_instrument=lane["sequencing_instrument"])
                    logging.info(
                        "Successfully created lane {} in Tantalus".format(
                            lane["id"]))
Example #5
def add_generic_results(filepaths,
                        storage_name,
                        results_name,
                        results_type,
                        results_version,
                        sample_ids=(),
                        library_ids=(),
                        analysis_pk=None,
                        recursive=False,
                        tag_name=None,
                        update=False,
                        remote_storage_name=None):

    tantalus_api = TantalusApi()

    sample_pks = []
    for sample_id in sample_ids:
        sample = tantalus_api.get(
            "sample",
            sample_id=sample_id,
        )
        sample_pks.append(sample['id'])

    library_pks = []
    for library_id in library_ids:
        library = tantalus_api.get(
            "dna_library",
            library_id=library_id,
        )
        library_pks.append(library['id'])

    # Add the file resources to Tantalus
    file_resource_pks = []
    for filepath in filepaths:
        if recursive:
            logging.info("Recursing directory {}".format(filepath))
            add_filepaths = []
            for (dirpath, dirnames, filenames) in os.walk(filepath):
                for filename in filenames:
                    add_filepaths.append(os.path.join(dirpath, filename))

        else:
            add_filepaths = [filepath]

        for add_filepath in add_filepaths:
            logging.info(
                "Adding file resource for {} to Tantalus".format(add_filepath))
            resource, instance = tantalus_api.add_file(
                storage_name=storage_name,
                filepath=add_filepath,
                update=update,
            )
            file_resource_pks.append(resource["id"])

    results_dataset_fields = dict(
        name=results_name,
        results_type=results_type,
        results_version=results_version,
        analysis=analysis_pk,
        samples=sample_pks,
        libraries=library_pks,
        file_resources=file_resource_pks,
    )

    # Add the results dataset to Tantalus
    try:
        results_id = tantalus_api.get(
            "results", name=results_dataset_fields["name"])["id"]
    except NotFoundError:
        results_id = None

    if update and results_id is not None:
        logging.warning("results dataset {} exists, updating".format(
            results_dataset_fields["name"]))
        results_dataset = tantalus_api.update("results",
                                              id=results_id,
                                              **results_dataset_fields)

    else:
        logging.info("creating results dataset {}".format(
            results_dataset_fields["name"]))
        results_dataset = tantalus_api.get_or_create("results",
                                                     **results_dataset_fields)

    if tag_name is not None:
        tantalus_api.tag(tag_name, resultsdataset_set=[results_dataset["id"]])

    logging.info("Succesfully created sequence dataset with ID {}".format(
        results_dataset["id"]))

    if remote_storage_name is not None:
        transfer_files.transfer_dataset(tantalus_api, results_dataset['id'],
                                        "resultsdataset", storage_name,
                                        remote_storage_name)

    return results_dataset
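
A hedged usage sketch for this results registration helper; the paths, names, and identifiers are assumptions for illustration.

# Hypothetical example call (all arguments are placeholder assumptions)
add_generic_results(
    filepaths=["/results/SA123/annotation"],    # assumed results directory
    storage_name="shahlab",                     # assumed local storage name
    results_name="SA123_annotation_v0.0.1",     # assumed unique dataset name
    results_type="annotation",
    results_version="v0.0.1",
    sample_ids=["SA123"],
    library_ids=["A90652"],
    recursive=True,
    update=False,
)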
import json
import os
import datetime

from dbclients.tantalus import TantalusApi

if __name__ == "__main__":

    tantalusApi = TantalusApi()
    print(os.getcwd())
    with open('docker/dummy.json', "r") as json_file:
        data = json.load(json_file)

    storage = tantalusApi.get_or_create("storage_azure_blob",
                                        name="singlecellblob",
                                        storage_account="singlecelldata",
                                        storage_container="data")

    ids = []
    for i in range(1, 81):
        ids.append(i)
        resource = tantalusApi.get_or_create(
            'file_resource',
            file_type=data[i]['file_type'],
            last_updated=data[i]['last_updated'],
            size=data[i]['size'],
            created=data[i]['created'],
            compression=data[i]['compression'],
            filename=data[i]['filename'],
            is_folder=data[i]['is_folder'],
            owner=data[i]['owner'])