Example 1

import os

import yaml
def _get_lanes(self):
    # Excerpted from an analysis class: builds a mapping from lane
    # string identifiers to lane records across all input datasets.
    lanes = dict()
    for dataset_id in self.analysis['input_datasets']:
        dataset = self.tantalus_api.get('sequence_dataset', id=dataset_id)
        for lane in dataset['sequence_lanes']:
            lanes[get_lane_str(lane)] = lane
    return lanes
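
The get_lane_str helper is used throughout these examples but never defined in them; here is a minimal sketch, assuming each lane record carries flowcell_id and lane_number fields (the field names are inferred from how sequence_lanes entries are used below, not taken from the original):

def get_lane_str(lane):
    # Hypothetical helper: join flowcell id and lane number into a
    # unique lane identifier, e.g. 'HGW35ADXX_1'.
    return '{}_{}'.format(lane['flowcell_id'], lane['lane_number'])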
def create_normal_bam(bam_dir):
    """
    Create a normal bam dataset.
    """
    os.makedirs(bam_dir, exist_ok=True)

    normal_bam_info = get_normal_bam()
    normal_filepath = normal_bam_info['bam']

    normal_dataset = normal_bam_info['dataset']
    lane_ids = [get_lane_str(l) for l in normal_dataset['sequence_lanes']]
    sample_id = normal_dataset['sample']['sample_id']
    library_id = normal_dataset['library']['library_id']

    normal_bam_metadata = {
        'filenames': [],
        'meta': {
            'type': 'wgsbam',
            'version': 'v0.0.1',
            'lane_ids': lane_ids,
            'sample_id': sample_id,
            'library_id': library_id,
        },
    }

    normal_filtered_bam_filename = f'{sample_id}_{library_id}.bam'
    normal_filtered_bam_filepath = os.path.join(bam_dir,
                                                normal_filtered_bam_filename)
    run_filter_cmd(normal_filtered_bam_filepath, normal_filepath)

    normal_bam_metadata['filenames'].append(normal_filtered_bam_filename)
    normal_bam_metadata['filenames'].append(normal_filtered_bam_filename +
                                            '.bai')

    metadata_yaml_filename = os.path.join(bam_dir, 'metadata.yaml')

    with open(metadata_yaml_filename, 'w') as meta_yaml:
        yaml.safe_dump(normal_bam_metadata,
                       meta_yaml,
                       default_flow_style=False)
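
run_filter_cmd is referenced but not defined in this excerpt; a minimal sketch, assuming it filters a BAM with samtools and indexes the output so the '.bai' file recorded in the metadata exists (the exact filter flags are an assumption):

import subprocess

def run_filter_cmd(filtered_bam_path, source_bam_path):
    # Hypothetical implementation: drop unmapped, secondary and
    # supplementary reads (-F 0x904), write the filtered BAM, then
    # index it.
    subprocess.check_call([
        'samtools', 'view', '-b', '-F', '0x904',
        '-o', filtered_bam_path, source_bam_path,
    ])
    subprocess.check_call(['samtools', 'index', filtered_bam_path])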
Example 3
import collections
import logging
import os

import yaml

log = logging.getLogger(__name__)


def create_sequence_dataset_models(file_info,
                                   storage_name,
                                   tag_name,
                                   tantalus_api,
                                   analysis_id=None,
                                   update=False):
    """Create Tantalus sequence dataset models for a list of files."""

    analysis = None
    if analysis_id is not None:
        analysis = tantalus_api.get('analysis', id=analysis_id)

    # Get storage PK
    storage = tantalus_api.get("storage", name=storage_name)
    storage_pk = storage["id"]

    # Sort files by dataset
    dataset_info = collections.defaultdict(list)
    for info in file_info:
        if info["dataset_type"] == 'BAM':
            dataset_name = templates.SC_WGS_BAM_NAME_TEMPLATE.format(
                dataset_type=info["dataset_type"],
                sample_id=info["sample_id"],
                library_type=info["library_type"],
                library_id=info["library_id"],
                lanes_hash=get_lanes_hash(info["sequence_lanes"]),
                aligner=info["aligner_name"],
                reference_genome=info["ref_genome"],
                jira_ticket=analysis["jira_ticket"],
            )
        elif info["dataset_type"] == 'FQ':
            dataset_name = templates.SC_WGS_FQ_NAME_TEMPLATE.format(
                dataset_type=info["dataset_type"],
                sample_id=info["sample_id"],
                library_type=info["library_type"],
                library_id=info["library_id"],
                lane=get_lane_str(info["sequence_lanes"][0]),
            )
        dataset_info[dataset_name].append(info)

    # Create datasets
    dataset_ids = set()
    for dataset_name, infos in dataset_info.items():
        # Get library PK
        library = tantalus_api.get_or_create(
            "dna_library",
            library_id=infos[0]["library_id"],
            library_type=infos[0]["library_type"],
            index_format=infos[0]["index_format"],
        )
        library_pk = library["id"]

        # Get sample PK
        sample = tantalus_api.get_or_create(
            "sample",
            sample_id=infos[0]["sample_id"],
        )
        sample_pk = sample["id"]

        # Build up sequence dataset attrs; we'll add to this as we
        # proceed throughout the function
        sequence_dataset = dict(
            name=dataset_name,
            dataset_type=infos[0]["dataset_type"],
            sample=sample_pk,
            library=library_pk,
            sequence_lanes=[],
            file_resources=[],
        )

        # Add in the analysis id if it's provided
        if analysis_id is not None:
            sequence_dataset["analysis"] = analysis_id

        # Add in BAM specific items
        if infos[0]["dataset_type"] == "BAM":
            sequence_dataset["aligner"] = infos[0]["aligner_name"]
            sequence_dataset["reference_genome"] = infos[0]["ref_genome"]

        for info in infos:
            # Check consistency for fields used for dataset
            check_fields = (
                "dataset_type",
                "sample_id",
                "library_id",
                "library_type",
                "index_format",
            )
            for field_name in check_fields:
                if info[field_name] != infos[0][field_name]:
                    raise ValueError(
                        "inconsistent value for field {} in dataset {}".format(
                            field_name, dataset_name))

            for sequence_lane in info["sequence_lanes"]:
                sequence_lane = dict(sequence_lane)
                sequence_lane["dna_library"] = library_pk
                sequence_lane["lane_number"] = str(
                    sequence_lane["lane_number"])

                sequence_lane = tantalus_api.get_or_create(
                    "sequencing_lane", **sequence_lane)

                sequence_dataset["sequence_lanes"].append(sequence_lane["id"])

            sequence_file_info = dict(index_sequence=info["index_sequence"])

            if "read_end" in info:
                sequence_file_info["read_end"] = info["read_end"]

            file_resource, file_instance = tantalus_api.add_file(
                storage_name,
                info["filepath"],
                update=update,
            )

            sequence_file_info = tantalus_api.get_or_create(
                "sequence_file_info",
                file_resource=file_resource["id"],
                **sequence_file_info)

            sequence_dataset["file_resources"].append(file_resource["id"])

        try:
            dataset_id = tantalus_api.get("sequence_dataset",
                                          name=sequence_dataset["name"])["id"]
        except NotFoundError:
            dataset_id = None

        if update and dataset_id is not None:
            log.warning("updating existing sequence dataset {}".format(
                sequence_dataset["name"]))
            dataset = tantalus_api.update("sequence_dataset",
                                          id=dataset_id,
                                          **sequence_dataset)
        else:
            log.info("creating sequence dataset {}".format(
                sequence_dataset["name"]))
            dataset = tantalus_api.get_or_create("sequence_dataset",
                                                 **sequence_dataset)

        if tag_name is not None:
            tantalus_api.tag(tag_name, sequencedataset_set=[dataset['id']])

        dataset_ids.add(dataset['id'])

    return dataset_ids
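
A hedged usage sketch for create_sequence_dataset_models follows; the file_info record and the storage name here are illustrative assumptions based on the fields the function reads, not values from the original code:

# Hypothetical invocation; every literal below is made up for illustration.
file_info = [{
    'dataset_type': 'FQ',
    'sample_id': 'SA1090',
    'library_id': 'A96213A',
    'library_type': 'SC_WGS',
    'index_format': 'D',
    'index_sequence': 'AGT-CCT',
    'read_end': 1,
    'sequence_lanes': [{'flowcell_id': 'HGW35ADXX', 'lane_number': 1}],
    'filepath': '/data/fastqs/SA1090_AGT-CCT_1.fastq.gz',
}]
dataset_ids = create_sequence_dataset_models(
    file_info,
    storage_name='singlecellresults',
    tag_name=None,
    tantalus_api=tantalus_api,
)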
def create_tumour_fastqs(fastq_dir, temp_dir):
    """
    Create a filtered fastq dataset.
    """
    tumour_bam_info = get_tumour_bams()

    dataset = tumour_bam_info['dataset']
    lane_ids = [get_lane_str(l) for l in dataset['sequence_lanes']]
    sample_id = dataset['sample']['sample_id']
    library_id = dataset['library']['library_id']

    FASTQ_TEMPLATE = '{cell_id}_{read_end}.fastq.gz'

    tumour_fastq_metadata = {
        'filenames': [],
        'meta': {
            'type': 'cellfastqs',
            'version': 'v0.0.1',
            'cell_ids': [],
            'lane_ids': lane_ids,
            'sample_id': sample_id,
            'library_id': library_id,
            'fastqs': {
                'template': FASTQ_TEMPLATE,
                'instances': []
            },
        }
    }

    for cell_id, cell_info in tumour_bam_info['cells'].items():
        bam_path = cell_info['bam']
        sublib = cell_info['sublib']

        logging.info('creating paired end fastqs for bam {}'.format(bam_path))

        # Filter bams
        cell_filtered_bam = os.path.join(fastq_dir, f'{cell_id}.bam')
        run_filter_cmd(cell_filtered_bam, bam_path)

        # Convert bams to fastq, uncompressed
        cell_end_1_fastq = os.path.join(fastq_dir, f'{cell_id}_1.fastq')
        cell_end_2_fastq = os.path.join(fastq_dir, f'{cell_id}_2.fastq')
        run_bam_fastq(cell_end_1_fastq, cell_end_2_fastq, cell_filtered_bam)

        # Gzip final fastq
        cell_end_1_fastq_filename = FASTQ_TEMPLATE.format(cell_id=cell_id,
                                                          read_end='1')
        cell_end_2_fastq_filename = FASTQ_TEMPLATE.format(cell_id=cell_id,
                                                          read_end='2')
        cell_end_1_fastq_gz = os.path.join(fastq_dir,
                                           cell_end_1_fastq_filename)
        cell_end_2_fastq_gz = os.path.join(fastq_dir,
                                           cell_end_2_fastq_filename)
        gzip_file(cell_end_1_fastq, cell_end_1_fastq_gz)
        gzip_file(cell_end_2_fastq, cell_end_2_fastq_gz)

        tumour_fastq_metadata['filenames'].append(cell_end_1_fastq_filename)
        tumour_fastq_metadata['filenames'].append(cell_end_2_fastq_filename)

        tumour_fastq_metadata['meta']['cell_ids'].append(cell_id)

        for read_end in ('1', '2'):
            tumour_fastq_metadata['meta']['fastqs']['instances'].append({
                'cell_id': cell_id,
                'read_end': read_end,
                'condition': sublib['condition'],
                'img_col': sublib['img_col'],
                'index_i5': sublib['index_i5'],
                'index_i7': sublib['index_i7'],
                'pick_met': sublib['pick_met'],
                'primer_i5': sublib['primer_i5'],
                'primer_i7': sublib['primer_i7'],
                'row': sublib['row'],
                'column': sublib['column'],
            })

    metadata_yaml_filename = os.path.join(fastq_dir, 'metadata.yaml')

    with open(metadata_yaml_filename, 'w') as meta_yaml:
        yaml.safe_dump(tumour_fastq_metadata,
                       meta_yaml,
                       default_flow_style=False)
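
run_bam_fastq and gzip_file are likewise undefined in this excerpt; a minimal sketch, assuming samtools fastq for the BAM-to-fastq step and Python's gzip module for compression (both tool choices are assumptions):

import gzip
import shutil
import subprocess

def run_bam_fastq(end_1_fastq, end_2_fastq, bam_path):
    # Hypothetical helper: name-sort so mates are adjacent, then split
    # paired reads into two fastq files; singletons and unpaired reads
    # are discarded.
    sorted_bam = bam_path + '.namesorted.bam'
    subprocess.check_call(
        ['samtools', 'sort', '-n', '-o', sorted_bam, bam_path])
    subprocess.check_call([
        'samtools', 'fastq', '-1', end_1_fastq, '-2', end_2_fastq,
        '-0', '/dev/null', '-s', '/dev/null', sorted_bam,
    ])

def gzip_file(uncompressed_path, gzipped_path):
    # Hypothetical helper: gzip-compress a file, leaving the original
    # in place.
    with open(uncompressed_path, 'rb') as f_in, \
            gzip.open(gzipped_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)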