Code example #1
import io
import logging
import os

import yaml

# TantalusApi and create_lane_fastq_metadata are provided by the surrounding
# project (the Tantalus API client and its fastq metadata helpers).


def create_fastq_metadata_yaml(library_id, storage_name, dry_run=False):
    """
    Create a metadata.yaml file for all fastq datasets for a library id.
    """
    tantalus_api = TantalusApi()

    storage = tantalus_api.get_storage(storage_name)
    client = tantalus_api.get_storage_client(storage_name)

    for dataset_info, metadata in create_lane_fastq_metadata(
            tantalus_api, library_id):
        metadata_filename = os.path.join(dataset_info['base_dir'],
                                         'metadata.yaml')
        metadata_filepath = tantalus_api.get_filepath(storage_name,
                                                      metadata_filename)

        metadata_io = io.BytesIO()
        metadata_io.write(
            yaml.dump(metadata, default_flow_style=False).encode())

        logging.info(f'writing metadata to file {metadata_filepath}')
        client.write_data(metadata_filename, metadata_io)

        logging.info(f'adding {metadata_filepath} to tantalus')

        if not dry_run:
            file_resource, file_instance = tantalus_api.add_file(
                storage_name, metadata_filepath, update=True)

            for dataset_id in dataset_info['dataset_ids']:
                dataset = tantalus_api.get('sequencedataset', id=dataset_id)

                new_file_resources = set(dataset['file_resources'])
                new_file_resources.add(file_resource['id'])

                tantalus_api.update('sequencedataset',
                                    id=dataset_id,
                                    file_resources=list(new_file_resources))
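A minimal usage sketch for the function above; the library id and storage name are hypothetical and would need to match records that already exist in Tantalus:

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    # Hypothetical library id and Tantalus storage name; with dry_run=True the
    # metadata.yaml files are written but not registered in Tantalus.
    create_fastq_metadata_yaml('A96213A', 'scrna_fastq', dry_run=True)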
Code example #2
import io
import os

import yaml

# TantalusApi and create_lane_fastq_metadata are provided by the surrounding
# project (the Tantalus API client and its fastq metadata helpers).


def add_fastq_metadata_yaml(dataset_id, storage_name, dry_run=False):
    """
    Create a metadata.yaml file for a dataset and add to tantalus.
    """
    tantalus_api = TantalusApi()

    storage = tantalus_api.get_storage(storage_name)
    client = tantalus_api.get_storage_client(storage_name)

    metadata, base_dir = create_lane_fastq_metadata(tantalus_api, dataset_id)

    metadata_filename = os.path.join(base_dir, 'metadata.yaml')
    metadata_filepath = tantalus_api.get_filepath(storage_name,
                                                  metadata_filename)

    metadata_io = io.BytesIO()
    metadata_io.write(yaml.dump(metadata, default_flow_style=False).encode())

    print(f'writing metadata to file {metadata_filepath}')
    client.write_data(metadata_filename, metadata_io)

    print(f'adding {metadata_filepath} to tantalus')

    if not dry_run:
        file_resource, file_instance = tantalus_api.add_file(storage_name,
                                                             metadata_filepath,
                                                             update=True)

        dataset = tantalus_api.get('sequencedataset', id=dataset_id)

        new_file_resources = set(dataset['file_resources'])
        new_file_resources.add(file_resource['id'])

        tantalus_api.update('sequencedataset',
                            id=dataset_id,
                            file_resources=list(new_file_resources))
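A similar sketch for the per-dataset variant above; the dataset id and storage name are hypothetical:

if __name__ == '__main__':
    # Hypothetical sequence dataset id and storage name; drop dry_run=True to
    # register the metadata.yaml file and attach it to the dataset.
    add_fastq_metadata_yaml(1234, 'scrna_fastq', dry_run=True)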
Code example #3
import logging
import socket

# TantalusApi and the GSC helper functions used below (connect_to_client,
# query_gsc, get_gsc_details, rename_bam_paths, transfer_gsc_bams, import_bam,
# valid_date) are provided by the surrounding project.


def main(**kwargs):
    """
    Queries the GSC for WGS bams. Transfers bams to the specified storage if
    necessary and uploads metadata to tantalus.

    Args:
        ids:                (list) internal IDs to query the GSC for
        storage:            (string) destination storage to transfer bams to
        id_type:            (string) type of ID specified (either sample or library) 
        skip_older_than:    (string) skip bams older than this date
        tag_name:           (string) tag name to associate the resulting sequence datasets
                            with when importing into tantalus
        update:             (flag) specifies whether metadata in tantalus is
                            to be updated or not
        skip_file_import:   (flag) import only lanes, libraries, and samples
                            into tantalus, skipping the bam files themselves
        query_only:         (flag) only query for the bam paths on the GSC 
    """
    # Check if this script is being run on the txshah host
    # If not, connect to an ssh client to access /projects/files
    if socket.gethostname() != "txshah":
        ssh_client = connect_to_client("10.9.208.161")
        sftp = ssh_client.open_sftp()
    else:
        sftp = None

    # Connect to the Tantalus API
    tantalus_api = TantalusApi()
    storage = tantalus_api.get_storage(kwargs["storage"])

    # Convert the date to the format we want
    skip_older_than = None
    if kwargs["skip_older_than"]:
        skip_older_than = valid_date(kwargs["skip_older_than"])

    # Check that an ID type was specified
    if not kwargs["id_type"]:
        raise Exception("Please specify an ID type (sample or library)")

    details = []
    for identifier in kwargs["ids"]:
        # Query the GSC to see if the ID exists
        infos = query_gsc(identifier, kwargs["id_type"])

        if not infos:
            logging.info("No results for {} {}. Skipping import".format(
                kwargs["id_type"], identifier))
            continue

        logging.info("{} {} exists on the GSC".format(
            kwargs["id_type"], identifier))

        # Get the data from GSC
        details = get_gsc_details(
            infos,
            skip_older_than=skip_older_than,
        )

        # Import and transfer each file
        for detail in details:
            # Rename the bams according to internal templates
            bam_paths = rename_bam_paths(detail, storage, sftp)

            # If the bam path does not exist at the source, skip
            # the transfer and import for this bam
            if not bam_paths["source_bam_path"]:
                continue

            # Skip import if we only wanted to query for paths
            if kwargs["query_only"]:
                continue

            if not kwargs["skip_file_import"]:
                # Transfer the bam to the specified storage
                transfer_gsc_bams(detail, bam_paths, storage, sftp)

                # Add the files to Tantalus
                logging.info("Importing {} to Tantalus".format(
                    bam_paths["tantalus_bam_path"]))

                dataset = import_bam(
                    storage_name=storage["name"],
                    bam_file_path=bam_paths["tantalus_bam_path"],
                    sample=detail["sample"],
                    library=detail["library"],
                    lane_infos=detail["lane_info"],
                    read_type=detail["read_type"],
                    tag_name=kwargs["tag_name"],
                    update=kwargs["update"])

                logging.info(
                    "Successfully added sequence dataset with ID {}".format(
                        dataset["id"]))
            else:
                logging.info("Importing library {} to tantalus".format(
                    detail["library"]["library_id"]))
                library_pk = tantalus_api.get_or_create(
                    "dna_library",
                    library_id=detail["library"]["library_id"],
                    library_type=detail["library"]["library_type"],
                    index_format=detail["library"]["index_format"])["id"]

                # Only add lanes, libraries, and samples to tantalus
                logging.info(
                    "Importing lanes for library {} to tantalus".format(
                        detail["library"]["library_id"]))
                for lane in detail["lane_info"]:
                    lane_object = tantalus_api.get_or_create(
                        "sequencing_lane",
                        flowcell_id=lane["flowcell_id"],
                        dna_library=library_pk,
                        read_type=lane["read_type"],
                        lane_number=str(lane["lane_number"]),
                        sequencing_centre="GSC",
                        sequencing_instrument=lane["sequencing_instrument"])
                    logging.info(
                        "Successfully created lane {} in tantalus".format(
                            lane_object["id"]))