def add_vcf_dataset(alignment_group, vcf_dataset_type, vcf_output_filename):
    """Sort the vcf file, create a vcf Dataset, and add it to the
    alignment group.
    """
    sort_vcf(vcf_output_filename)

    # If a Dataset already exists, delete it, might have been a bad run.
    existing_set = Dataset.objects.filter(
            type=vcf_dataset_type,
            label=vcf_dataset_type,
            filesystem_location=clean_filesystem_location(
                    vcf_output_filename))
    if len(existing_set) > 0:
        existing_set[0].delete()

    vcf_dataset = Dataset.objects.create(
            type=vcf_dataset_type,
            label=vcf_dataset_type,
            filesystem_location=clean_filesystem_location(
                    vcf_output_filename))

    alignment_group.dataset_set.add(vcf_dataset)

    return vcf_dataset
def add_vcf_dataset(alignment_group, vcf_dataset_type, vcf_output_filename):
    """Sorts the vcf file, creates a vcf Dataset, and adds it to the
    alignment group.
    """
    if not os.path.exists(vcf_output_filename):
        return None

    sort_vcf(vcf_output_filename)

    # If a Dataset already exists, delete it, might have been a bad run.
    existing_set = Dataset.objects.filter(
            type=vcf_dataset_type,
            label=vcf_dataset_type,
            filesystem_location=clean_filesystem_location(vcf_output_filename))
    if len(existing_set) > 0:
        existing_set[0].delete()

    vcf_dataset = Dataset.objects.create(
            type=vcf_dataset_type,
            label=vcf_dataset_type,
            filesystem_location=clean_filesystem_location(vcf_output_filename))

    alignment_group.dataset_set.add(vcf_dataset)

    return vcf_dataset
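A hedged usage sketch for the helper above. The output path and the chosen Dataset type are illustrative only; the callers elsewhere in this section construct their own vcf paths (for example via get_snpeff_vcf_output_path or a Lumpy run).

import os

# Hypothetical call site; vcf_path and the Dataset type are examples only.
vcf_path = os.path.join(alignment_group.get_model_data_dir(), 'lumpy.vcf')
vcf_dataset = add_vcf_dataset(
        alignment_group, Dataset.TYPE.VCF_LUMPY, vcf_path)
if vcf_dataset is None:
    # No VCF was written, e.g. the variant caller produced no output.
    pass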
def test_dataset_compression_piping(self):
    """Make sure data set compression behaves correctly.
    """
    dataset = Dataset.objects.create(
            label='test_dataset',
            type=Dataset.TYPE.FASTQ1)

    GZIPPED_FASTQ_FILEPATH = os.path.join(settings.PWD, 'test_data',
            'compressed_fastq', 'sample0.simLibrary.1.fq.gz')

    dataset.filesystem_location = clean_filesystem_location(
            GZIPPED_FASTQ_FILEPATH)

    assert dataset.is_compressed()

    process = subprocess.Popen(
            ('head ' + dataset.wrap_if_compressed() + ' | wc -l'),
            shell=True, executable=settings.BASH_PATH,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    wc_output, errmsg = process.communicate()
    rc = process.returncode

    assert rc == 0, (
            "Compression process returned non-zero exit status: %s" % (
                    errmsg))

    assert int(wc_output) == 10, ("Compression failed: %s" % (errmsg))
def test_clean_filesystem_location(self):
    FAKE_ABS_ROOT = '/root/of/all/evil'
    EXPECTED_CLEAN_URL = 'projects/blah'
    dirty_full_url = os.path.join(FAKE_ABS_ROOT, settings.MEDIA_ROOT,
            EXPECTED_CLEAN_URL)
    clean_location = clean_filesystem_location(dirty_full_url)
    self.assertEqual(EXPECTED_CLEAN_URL, clean_location)
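The test above implies the behavior of clean_filesystem_location without showing it. A minimal sketch follows, assuming the helper simply strips everything up to and including settings.MEDIA_ROOT and returns a path relative to it; the real implementation may differ, and the function name here is suffixed to mark it as illustrative.

import os

from django.conf import settings


def clean_filesystem_location_sketch(filesystem_location):
    """Illustrative only: return the path relative to settings.MEDIA_ROOT."""
    media_root_idx = filesystem_location.find(settings.MEDIA_ROOT)
    if media_root_idx == -1:
        return filesystem_location
    clean = filesystem_location[media_root_idx + len(settings.MEDIA_ROOT):]
    return clean.lstrip(os.sep)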
def test_multiple_chromosome_dataset_import(self):
    user = User.objects.create_user(
            TEST_USERNAME, password=TEST_PASSWORD, email=TEST_EMAIL)
    project = Project.objects.create(
            title=TEST_PROJECT_NAME, owner=user.get_profile())
    test_yeast_genome = ReferenceGenome.objects.create(
            project=project,
            label='superbrewer2000')

    test_dataset_path = os.path.join(settings.PWD,
            'test_data/yeast_chrom_jkl.fasta')
    dataset_path = copy_dataset_to_entity_data_dir(test_yeast_genome,
            test_dataset_path)
    test_chroms_dataset = Dataset.objects.create(
            label='jkl_chroms',
            type=Dataset.TYPE.REFERENCE_GENOME_FASTA,
            filesystem_location=clean_filesystem_location(dataset_path))
    test_yeast_genome.dataset_set.add(test_chroms_dataset)

    # Assert correct number of chromosomes
    assert(test_yeast_genome.num_chromosomes == 3)

    # Assert correct number of bases
    assert(test_yeast_genome.num_bases == sum(
            [chrom.num_bases for chrom in Chromosome.objects.filter(
                    reference_genome=test_yeast_genome)]))

    # Assert correct chromosome labels
    expected_chrom_names = [
            'gi|448092123|ref|NC_020215.1|',
            'gi|448096713|ref|NC_020216.1|',
            'gi|448100869|ref|NC_020217.1|']
    assert([chrom.label for chrom in Chromosome.objects.filter(
            reference_genome=test_yeast_genome)] == expected_chrom_names)
def compute_callable_loci(reference_genome, sample_alignment,
        bam_file_location, stderr=None):
    # Set output fn to None in case try fails.
    callable_loci_bed_fn = None

    try:
        ref_genome_fasta_location = get_dataset_with_type(
                reference_genome,
                Dataset.TYPE.REFERENCE_GENOME_FASTA).get_absolute_location()

        output = _get_callable_loci_output_filename(bam_file_location)
        get_callable_loci(bam_file_location, output)

        # Add callable loci bed as dataset.
        callable_loci_bed = Dataset.objects.create(
                label=Dataset.TYPE.BED_CALLABLE_LOCI,
                type=Dataset.TYPE.BED_CALLABLE_LOCI,
                filesystem_location=clean_filesystem_location(output))
        sample_alignment.dataset_set.add(callable_loci_bed)
        sample_alignment.save()

        callable_loci_bed_fn = callable_loci_bed.get_absolute_location()

        output = subprocess.check_output(
                ['cat', callable_loci_bed_fn])

        with open(callable_loci_bed_fn, 'w') as callable_loci_bed_fh:
            for i, line in enumerate(output.split('\n')):
                try:
                    fields = line.split()
                    if len(fields) == 0:
                        continue
                    chrom, start, end, feature = fields
                    feature = titlecase_spaces(feature)
                    # Bed feature can't have spaces =(
                    feature = feature.replace(' ', '_')
                    print >> callable_loci_bed_fh, '\t'.join(
                            [chrom, start, end, feature])
                except Exception as e:
                    print >> stderr, (
                            'WARNING: Callable Loci line '
                            '%d: (%s) couldn\'t be parsed: %s') % (
                                    i, line, str(e))

        # Add it as a jbrowse track.
        add_bed_file_track(reference_genome, sample_alignment,
                callable_loci_bed)

    except Exception as e:
        print >> stderr, 'WARNING: Callable Loci failed.'
        print >> stderr, str(e)

    return callable_loci_bed_fn
def get_split_reads(sample_alignment):
    """Isolate split reads from a sample alignment.

    This uses a python script supplied with Lumpy that is run as a
    separate process.

    NOTE THAT THIS SCRIPT ONLY WORKS WITH BWA MEM.
    """
    bwa_split_dataset = get_dataset_with_type(sample_alignment,
            Dataset.TYPE.BWA_SPLIT)
    if bwa_split_dataset is not None:
        if (bwa_split_dataset.status == Dataset.STATUS.READY and
                os.path.exists(bwa_split_dataset.get_absolute_location())):
            return bwa_split_dataset
    else:
        bwa_split_dataset = Dataset.objects.create(
                label=Dataset.TYPE.BWA_SPLIT,
                type=Dataset.TYPE.BWA_SPLIT,
                status=Dataset.STATUS.NOT_STARTED)
        sample_alignment.dataset_set.add(bwa_split_dataset)

    # If here, we are going to run or re-run the Dataset.
    bwa_split_dataset.status = Dataset.STATUS.NOT_STARTED
    bwa_split_dataset.save(update_fields=['status'])

    bam_dataset = get_dataset_with_type(sample_alignment,
            Dataset.TYPE.BWA_ALIGN)
    bam_filename = bam_dataset.get_absolute_location()

    assert os.path.exists(bam_filename), (
            "BAM file '%s' is missing." % bam_filename)

    bam_split_filename = os.path.join(sample_alignment.get_model_data_dir(),
            'bwa_split_reads.bam')

    try:
        bwa_split_dataset.status = Dataset.STATUS.COMPUTING
        bwa_split_dataset.save(update_fields=['status'])
        extract_split_reads(bam_filename, bam_split_filename)
        bwa_split_dataset.status = Dataset.STATUS.READY
        bwa_split_dataset.filesystem_location = clean_filesystem_location(
                bam_split_filename)
    except subprocess.CalledProcessError:
        # If there are no split reads, then fail.
        bwa_split_dataset.filesystem_location = ''
        bwa_split_dataset.status = Dataset.STATUS.FAILED
    finally:
        bwa_split_dataset.save()

    return bwa_split_dataset
def get_discordant_read_pairs(sample_alignment):
    """Isolate discordant pairs of reads from a sample alignment.
    """
    # First, check if completed dataset already exists.
    bwa_disc_dataset = get_dataset_with_type(sample_alignment,
            Dataset.TYPE.BWA_DISCORDANT)
    if bwa_disc_dataset is not None:
        if (bwa_disc_dataset.status == Dataset.STATUS.READY and
                os.path.exists(bwa_disc_dataset.get_absolute_location())):
            return bwa_disc_dataset
    else:
        bwa_disc_dataset = Dataset.objects.create(
                label=Dataset.TYPE.BWA_DISCORDANT,
                type=Dataset.TYPE.BWA_DISCORDANT)
        sample_alignment.dataset_set.add(bwa_disc_dataset)

    # If here, we are going to run or re-run the Dataset.
    bwa_disc_dataset.status = Dataset.STATUS.NOT_STARTED
    bwa_disc_dataset.save(update_fields=['status'])

    bam_dataset = get_dataset_with_type(sample_alignment,
            Dataset.TYPE.BWA_ALIGN)
    bam_filename = bam_dataset.get_absolute_location()

    assert os.path.exists(bam_filename), (
            "BAM file '%s' is missing." % bam_filename)

    # NOTE: This assumes the index just adds .bai, w/ same path otherwise.
    # Will this always be true?
    if not os.path.exists(bam_filename + '.bai'):
        index_bam_file(bam_filename)

    bam_discordant_filename = os.path.join(
            sample_alignment.get_model_data_dir(), 'bwa_discordant_pairs.bam')

    try:
        bwa_disc_dataset.status = Dataset.STATUS.COMPUTING
        bwa_disc_dataset.save(update_fields=['status'])
        extract_discordant_read_pairs(bam_filename, bam_discordant_filename)
        bwa_disc_dataset.status = Dataset.STATUS.READY
        bwa_disc_dataset.filesystem_location = clean_filesystem_location(
                bam_discordant_filename)
    except subprocess.CalledProcessError:
        bwa_disc_dataset.filesystem_location = ''
        bwa_disc_dataset.status = Dataset.STATUS.FAILED
    finally:
        bwa_disc_dataset.save()

    return bwa_disc_dataset
def add_dataset_to_entity(entity, dataset_label, dataset_type,
        filesystem_location=None):
    """Helper function for adding a Dataset to a model.
    """
    dataset = Dataset.objects.create(
            label=dataset_label, type=dataset_type)

    if filesystem_location is not None:
        dataset.filesystem_location = clean_filesystem_location(
                filesystem_location)

    dataset.save()
    entity.dataset_set.add(dataset)
    entity.save()

    return dataset
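A hedged usage sketch for the helper above; the entity, label choice, and file path are illustrative only and not taken from the original callers.

# Hypothetical usage; the BED path is made up for the example.
callable_loci_dataset = add_dataset_to_entity(
        sample_alignment,
        Dataset.TYPE.BED_CALLABLE_LOCI,
        Dataset.TYPE.BED_CALLABLE_LOCI,
        filesystem_location='/path/to/projects/abc/callable_loci.bed')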
def derivation_fn(sample_alignment, unmapped_reads_dataset):
    # Get the original bam file.
    bam_dataset = get_dataset_with_type(sample_alignment,
            Dataset.TYPE.BWA_ALIGN)
    bam_filename = bam_dataset.get_absolute_location()

    # Allocate a filename for the unmapped reads.
    unmapped_reads_bam_file = (
            os.path.splitext(bam_filename)[0] + '.unmapped.bam')
    unmapped_reads_dataset.filesystem_location = clean_filesystem_location(
            unmapped_reads_bam_file)
    unmapped_reads_dataset.save(update_fields=['filesystem_location'])

    # Flag 0x4 is the SAM "segment unmapped" bit, so -f 0x4 keeps only
    # unmapped reads; -b writes BAM and -h preserves the header.
    cmd = '{samtools} view -h -b -f 0x4 {bam_filename}'.format(
            samtools=settings.SAMTOOLS_BINARY,
            bam_filename=bam_filename)

    with open(unmapped_reads_bam_file, 'w') as output_fh:
        subprocess.check_call(cmd, stdout=output_fh, shell=True)
def main():
    # Create a User and Project.
    user = get_or_create_user()
    test_project = Project.objects.create(
            title=EXAMPLE_PROJECT_NAME, owner=user.get_profile())
    ref_genome = import_reference_genome_from_local_file(
            test_project, 'mg1655', MG1655_REF_GENOME, 'genbank', move=False)

    # Create the alignment group and relate the vcf Dataset to it.
    alignment_group = AlignmentGroup.objects.create(
            label='Fix Recoli Alignment',
            reference_genome=ref_genome,
            aligner=AlignmentGroup.ALIGNER.BWA)

    vcf_output_path = get_snpeff_vcf_output_path(alignment_group,
            Dataset.TYPE.BWA_ALIGN)
    shutil.copy(LARGE_VCF, vcf_output_path)

    dataset = Dataset.objects.create(
            type=Dataset.TYPE.VCF_FREEBAYES_SNPEFF,
            label=Dataset.TYPE.VCF_FREEBAYES_SNPEFF,
            filesystem_location=clean_filesystem_location(vcf_output_path))
    alignment_group.dataset_set.add(dataset)

    # Import ExperimentSample objects, setting a specific uid to match
    # the vcf file.
    with open(EXPERIMENT_SAMPLE_MODEL_DATA_PICKLE) as sample_data_fh:
        es_data = pickle.load(sample_data_fh)
        for es in es_data:
            es_obj = ExperimentSample.objects.create(
                    uid=es.uid,
                    project=test_project,
                    label=es.label)
            es_obj.data.update({
                'group': es.group,
                'well': es.well,
                'num_reads': es.num_reads
            })
            es_obj.save()

    parse_alignment_group_vcf(alignment_group,
            Dataset.TYPE.VCF_FREEBAYES_SNPEFF)
def compute_callable_loci(reference_genome, sample_alignment,
        bam_file_location, stderr=None):
    # Set output fn to None in case try fails.
    callable_loci_bed_fn = None

    try:
        ref_genome_fasta_location = get_dataset_with_type(
                reference_genome,
                Dataset.TYPE.REFERENCE_GENOME_FASTA).get_absolute_location()

        callable_loci_bed_fn = (
                _get_callable_loci_output_filename(bam_file_location))
        get_callable_loci(bam_file_location, callable_loci_bed_fn)

        # Add callable loci bed as dataset.
        callable_loci_bed_dataset = Dataset.objects.create(
                label=Dataset.TYPE.BED_CALLABLE_LOCI,
                type=Dataset.TYPE.BED_CALLABLE_LOCI,
                filesystem_location=clean_filesystem_location(
                        callable_loci_bed_fn))
        sample_alignment.dataset_set.add(callable_loci_bed_dataset)
        sample_alignment.save()

        clean_bed_fn = clean_bed_features(callable_loci_bed_dataset,
                stderr=stderr)

        # Add it as a jbrowse track.
        add_bed_file_track(reference_genome, sample_alignment,
                callable_loci_bed_dataset)

    except Exception as e:
        print >> stderr, 'WARNING: Callable Loci failed.'
        print >> stderr, str(e)
        clean_bed_fn = ''

    finally:
        return clean_bed_fn
def test_multiple_chromosome_dataset_import(self):
    user = User.objects.create_user(
            TEST_USERNAME, password=TEST_PASSWORD, email=TEST_EMAIL)
    project = Project.objects.create(
            title=TEST_PROJECT_NAME, owner=user.get_profile())
    test_yeast_genome = ReferenceGenome.objects.create(
            project=project,
            label='superbrewer2000')

    test_dataset_path = os.path.join(settings.PWD,
            'test_data/yeast_chrom_jkl.fasta')
    dataset_path = copy_dataset_to_entity_data_dir(test_yeast_genome,
            test_dataset_path)
    test_chroms_dataset = Dataset.objects.create(
            label='jkl_chroms',
            type=Dataset.TYPE.REFERENCE_GENOME_FASTA,
            filesystem_location=clean_filesystem_location(dataset_path))
    test_yeast_genome.dataset_set.add(test_chroms_dataset)

    # Assert correct number of chromosomes
    assert (test_yeast_genome.num_chromosomes == 3)

    # Assert correct number of bases
    assert (test_yeast_genome.num_bases == sum([
            chrom.num_bases for chrom in Chromosome.objects.filter(
                    reference_genome=test_yeast_genome)]))

    # Assert correct chromosome labels
    expected_chrom_names = [
            'gi|448092123|ref',
            'gi|448096713|ref',
            'gi|448100869|ref']
    assert (set([
            chrom.seqrecord_id for chrom in Chromosome.objects.filter(
                    reference_genome=test_yeast_genome)
    ]) == set(expected_chrom_names))
def sanitize_sequence_dataset(dataset):
    dataset_type_to_parse_format = {
        Dataset.TYPE.REFERENCE_GENOME_FASTA: 'fasta',
        Dataset.TYPE.REFERENCE_GENOME_GENBANK: 'genbank'
    }

    if dataset.type not in dataset_type_to_parse_format:
        return

    dirty_file_path = dataset.get_absolute_location()
    parse_format = dataset_type_to_parse_format[dataset.type]

    needs_sanitizing = False
    with open(dirty_file_path, 'r') as dirty_fh:
        for seq_record in SeqIO.parse(dirty_fh, parse_format):
            if len(seq_record.id) > 16:
                needs_sanitizing = True
                break

    if not needs_sanitizing:
        return

    prefix, ext = os.path.splitext(dirty_file_path)
    clean_file_path = prefix + '.clean' + ext

    seq_record_list = []
    with open(dirty_file_path, 'r') as dirty_fh:
        for seq_record in SeqIO.parse(dirty_fh, parse_format):
            seq_record.id = seq_record.id[:16]
            seq_record.name = seq_record.id
            seq_record_list.append(seq_record)

    with open(clean_file_path, 'w') as clean_fh:
        SeqIO.write(seq_record_list, clean_fh, parse_format)

    dataset.filesystem_location = clean_filesystem_location(clean_file_path)
    dataset.save()
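For illustration, a minimal sketch of the 16-character id truncation that sanitize_sequence_dataset performs, using an in-memory Biopython record instead of a file; the record id is taken from the yeast test above, and nothing here is part of the original module.

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

record = SeqRecord(Seq('ATGC'), id='gi|448092123|ref|NC_020215.1|')
record.id = record.id[:16]   # 'gi|448092123|ref', as in the test expectations
record.name = record.id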
def test_run_lumpy(self):
    TEST_SAMPLE_UID = '8c57e7b9'

    # Create a ref genome.
    self.reference_genome = import_reference_genome_from_local_file(
            self.project, 'ref_genome', TEST_FASTA, 'fasta')

    # Create a sample.
    self.experiment_sample = ExperimentSample.objects.create(
            uid=TEST_SAMPLE_UID, project=self.project, label='sample1')

    # Create a new alignment group.
    alignment_group = AlignmentGroup.objects.create(
            label='test alignment', reference_genome=self.reference_genome)
    self.alignment_group = alignment_group

    # Create the expected models.
    sample_alignment = ExperimentSampleToAlignment.objects.create(
            alignment_group=alignment_group,
            experiment_sample=self.experiment_sample)
    bwa_dataset = Dataset.objects.create(
            label=Dataset.TYPE.BWA_ALIGN,
            type=Dataset.TYPE.BWA_ALIGN,
            status=Dataset.STATUS.READY)
    bwa_dataset.filesystem_location = clean_filesystem_location(
            TEST_DISC_SPLIT_BAM)
    bwa_dataset.save()

    sample_alignment.dataset_set.add(bwa_dataset)
    sample_alignment.save()

    self.bwa_dataset = bwa_dataset
    self.sample_alignment = sample_alignment

    fasta_ref = get_dataset_with_type(
            self.reference_genome,
            Dataset.TYPE.REFERENCE_GENOME_FASTA).get_absolute_location()

    sample_alignments = [self.sample_alignment]

    vcf_output_dir = self.alignment_group.get_model_data_dir()
    vcf_output_filename = os.path.join(vcf_output_dir, 'lumpy.vcf')
    alignment_type = 'BWA_ALIGN'

    # NOTE: Running these functions but not checking results.
    get_discordant_read_pairs(self.sample_alignment)
    get_split_reads(self.sample_alignment)

    run_lumpy(fasta_ref, sample_alignments, vcf_output_dir,
            vcf_output_filename, alignment_type)

    dataset = Dataset.objects.create(
            type=Dataset.TYPE.VCF_LUMPY,
            label=Dataset.TYPE.VCF_LUMPY,
            filesystem_location=vcf_output_filename)
    self.alignment_group.dataset_set.add(dataset)

    # Parse the resulting vcf, grab variant objects.
    parse_alignment_group_vcf(self.alignment_group, Dataset.TYPE.VCF_LUMPY)

    # Grab the resulting variants.
    variants = Variant.objects.filter(
            reference_genome=self.reference_genome)

    # There should be a Variant object for each sv event.
    self.assertEqual(2, len(variants))

    # One event should be located very close to 25k.
    va_positions = [v.position for v in variants]
    va_offset = [25000 - va_pos for va_pos in va_positions]
    self.assertTrue(any([v < 50 for v in va_offset]))
def derivation_fn(sample_alignment, new_dataset):
    """Creates a bam file with reads relevant for de novo assembly.
    """
    # Get the original bam file.
    bam_dataset = get_dataset_with_type(sample_alignment,
            Dataset.TYPE.BWA_ALIGN)
    orig_bam_filename = bam_dataset.get_absolute_location()

    # Allocate a filename for the new dataset.
    de_novo_bam_filelocation = (
            os.path.splitext(orig_bam_filename)[0] + '.de_novo.bam')
    new_dataset.filesystem_location = clean_filesystem_location(
            de_novo_bam_filelocation)
    new_dataset.save(update_fields=['filesystem_location'])

    ### Strategy
    # 0. Create intermediate sam.
    # 1. Grab unmapped reads.
    #     a. If the corresponding pair for any read was mapped (and thus not
    #        in the unmapped dataset), grab that from the original bam file.
    # 2. Grab split reads.
    # 2b. Add reads that are in intervals of interest.
    # 3. Sort by name so that paired reads are next to each other.
    # 4. Filter out duplicate reads. Requires sort in step 3.
    # 5. Convert to bam.
    # 6. Delete intermediate files.

    # 0. Create intermediate files.
    intermediate_sam = (
            os.path.splitext(de_novo_bam_filelocation)[0] + '.int.sam')
    paired_intermediate_sam = (
            os.path.splitext(intermediate_sam)[0] + '.paired.sam')
    sorted_intermediate_sam = (
            os.path.splitext(paired_intermediate_sam)[0] + '.sorted.sam')
    deduped_sorted_intermediate_sam = (
            os.path.splitext(sorted_intermediate_sam)[0] + '.deduped.sam')
    intermediate_files = [
            intermediate_sam,
            paired_intermediate_sam,
            sorted_intermediate_sam,
            deduped_sorted_intermediate_sam
    ]

    # 1. Get unmapped reads.
    unmapped_reads_dataset = get_unmapped_reads(sample_alignment)
    unmapped_reads_bam_file = unmapped_reads_dataset.get_absolute_location()
    cmd = '{samtools} view -h {bam_filename}'.format(
            samtools=settings.SAMTOOLS_BINARY,
            bam_filename=unmapped_reads_bam_file)
    with open(intermediate_sam, 'w') as output_fh:
        subprocess.check_call(cmd, stdout=output_fh, shell=True)

    # 2. Grab split reads.
    split_reads_dataset = get_split_reads(sample_alignment)
    split_reads_bam_file = split_reads_dataset.get_absolute_location()
    cmd = '{samtools} view {bam_filename}'.format(
            samtools=settings.SAMTOOLS_BINARY,
            bam_filename=split_reads_bam_file)
    with open(intermediate_sam, 'a') as output_fh:
        subprocess.check_call(cmd, stdout=output_fh, shell=True)

    # 2b. Add reads that are in any of the intervals that we want to include.
    if force_include_reads_in_intervals:
        with open(intermediate_sam, 'a') as output_fh:
            get_reads_in_interval_list(orig_bam_filename,
                    force_include_reads_in_intervals, output_fh)

    # 2c. For each of the reads that we've included, grab their
    # corresponding pairs.
    add_paired_mates(intermediate_sam, orig_bam_filename,
            paired_intermediate_sam)

    # 3. Sort by name so that paired reads are next to each other.
    cmd = ('(grep ^"@" {sam_file}; grep -v ^"@" {sam_file} | '
           'sort -k1,1 -k2,2n) > {sorted_sam_file}').format(
                   sam_file=paired_intermediate_sam,
                   sorted_sam_file=sorted_intermediate_sam)
    subprocess.call(cmd, shell=True)

    # 4. Filter out duplicate reads. Requires sort in step 3.
    cmd = 'uniq {sorted_sam_file} > {deduped_sam_file}'.format(
            sorted_sam_file=sorted_intermediate_sam,
            deduped_sam_file=deduped_sorted_intermediate_sam)
    subprocess.call(cmd, shell=True)

    # 5. Convert the sorted, deduped sam to bam.
    cmd = '{samtools} view -Sb {sam_file} > {final_bam_file}'.format(
            samtools=settings.SAMTOOLS_BINARY,
            sam_file=deduped_sorted_intermediate_sam,
            final_bam_file=de_novo_bam_filelocation)
    subprocess.call(cmd, shell=True)

    # 6. Delete intermediate files.
    for f in intermediate_files:
        os.remove(f)
def get_split_reads(sample_alignment):
    """Isolate split reads from a sample alignment.

    This uses a python script supplied with Lumpy that is run as a
    separate process.

    NOTE THAT THIS SCRIPT ONLY WORKS WITH BWA MEM.
    """
    bwa_split_dataset = get_dataset_with_type(
            sample_alignment, Dataset.TYPE.BWA_SPLIT)
    if bwa_split_dataset is not None:
        if (bwa_split_dataset.status == Dataset.STATUS.READY and
                os.path.exists(bwa_split_dataset.get_absolute_location())):
            return bwa_split_dataset
    else:
        bwa_split_dataset = Dataset.objects.create(
                label=Dataset.TYPE.BWA_SPLIT,
                type=Dataset.TYPE.BWA_SPLIT,
                status=Dataset.STATUS.NOT_STARTED)
        sample_alignment.dataset_set.add(bwa_split_dataset)

    # If here, we are going to run or re-run the Dataset.
    bwa_split_dataset.status = Dataset.STATUS.NOT_STARTED
    bwa_split_dataset.save(update_fields=['status'])

    bam_dataset = get_dataset_with_type(sample_alignment,
            Dataset.TYPE.BWA_ALIGN)
    bam_filename = bam_dataset.get_absolute_location()

    assert os.path.exists(bam_filename), (
            "BAM file '%s' is missing." % bam_filename)

    # NOTE: This assumes the index just adds .bai, w/ same path otherwise.
    # Will this always be true?
    if not os.path.exists(bam_filename + '.bai'):
        index_bam_file(bam_filename)

    bam_split_filename = os.path.join(sample_alignment.get_model_data_dir(),
            'bwa_split_reads.bam')

    # Use the lumpy bwa-mem split read script to pull out split reads.
    filter_split_reads = ' | '.join([
            '{samtools} view -h {bam_filename}',
            'python {lumpy_bwa_mem_sr_script} -i stdin',
            '{samtools} view -Sb -'
    ]).format(
            samtools=settings.SAMTOOLS_BINARY,
            bam_filename=bam_filename,
            lumpy_bwa_mem_sr_script=settings.LUMPY_EXTRACT_SPLIT_READS_BWA_MEM)

    try:
        bwa_split_dataset.status = Dataset.STATUS.COMPUTING
        bwa_split_dataset.save(update_fields=['status'])

        with open(bam_split_filename, 'w') as fh:
            subprocess.check_call(filter_split_reads,
                    stdout=fh, shell=True, executable=settings.BASH_PATH)

        # Sort the split reads, overwriting the old file.
        subprocess.check_call([settings.SAMTOOLS_BINARY, 'sort',
                bam_split_filename,
                os.path.splitext(bam_split_filename)[0]])

        _filter_out_interchromosome_reads(bam_split_filename)

        bwa_split_dataset.status = Dataset.STATUS.READY
        bwa_split_dataset.filesystem_location = clean_filesystem_location(
                bam_split_filename)

    except subprocess.CalledProcessError:
        # If there are no split reads, then fail.
        bwa_split_dataset.filesystem_location = ''
        bwa_split_dataset.status = Dataset.STATUS.FAILED

    bwa_split_dataset.save()

    return bwa_split_dataset
def get_discordant_read_pairs(sample_alignment):
    """Isolate discordant pairs of reads from a sample alignment.
    """
    # First, check if completed dataset already exists.
    bwa_disc_dataset = get_dataset_with_type(
            sample_alignment, Dataset.TYPE.BWA_DISCORDANT)
    if bwa_disc_dataset is not None:
        if (bwa_disc_dataset.status == Dataset.STATUS.READY and
                os.path.exists(bwa_disc_dataset.get_absolute_location())):
            return bwa_disc_dataset
    else:
        bwa_disc_dataset = Dataset.objects.create(
                label=Dataset.TYPE.BWA_DISCORDANT,
                type=Dataset.TYPE.BWA_DISCORDANT)
        sample_alignment.dataset_set.add(bwa_disc_dataset)

    # If here, we are going to run or re-run the Dataset.
    bwa_disc_dataset.status = Dataset.STATUS.NOT_STARTED
    bwa_disc_dataset.save(update_fields=['status'])

    bam_dataset = get_dataset_with_type(sample_alignment,
            Dataset.TYPE.BWA_ALIGN)
    bam_filename = bam_dataset.get_absolute_location()

    assert os.path.exists(bam_filename), (
            "BAM file '%s' is missing." % bam_filename)

    # NOTE: This assumes the index just adds .bai, w/ same path otherwise.
    # Will this always be true?
    if not os.path.exists(bam_filename + '.bai'):
        index_bam_file(bam_filename)

    bam_discordant_filename = os.path.join(
            sample_alignment.get_model_data_dir(), 'bwa_discordant_pairs.bam')

    try:
        bwa_disc_dataset.status = Dataset.STATUS.COMPUTING
        bwa_disc_dataset.save(update_fields=['status'])

        # Use bam read alignment flags to pull out discordant pairs only.
        # -F excludes reads carrying the given flag, so this chain drops
        # properly paired reads (0x0002), secondary alignments (0x0100),
        # unmapped reads (0x0004), reads with an unmapped mate (0x0008),
        # and duplicates (0x0400).
        filter_discordant = ' | '.join([
                '{samtools} view -u -F 0x0002 {bam_filename} ',
                '{samtools} view -u -F 0x0100 - ',
                '{samtools} view -u -F 0x0004 - ',
                '{samtools} view -u -F 0x0008 - ',
                '{samtools} view -b -F 0x0400 - '
        ]).format(
                samtools=settings.SAMTOOLS_BINARY,
                bam_filename=bam_filename)

        with open(bam_discordant_filename, 'w') as fh:
            subprocess.check_call(filter_discordant,
                    stdout=fh, shell=True, executable=settings.BASH_PATH)

        # Sort the discordant reads, overwriting the old file.
        subprocess.check_call([settings.SAMTOOLS_BINARY, 'sort',
                bam_discordant_filename,
                os.path.splitext(bam_discordant_filename)[0]])

        _filter_out_interchromosome_reads(bam_discordant_filename)

        bwa_disc_dataset.filesystem_location = clean_filesystem_location(
                bam_discordant_filename)
        bwa_disc_dataset.status = Dataset.STATUS.READY

    except subprocess.CalledProcessError:
        bwa_disc_dataset.filesystem_location = ''
        bwa_disc_dataset.status = Dataset.STATUS.FAILED

    bwa_disc_dataset.save()

    return bwa_disc_dataset
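For reference, a hedged sketch of the same per-read filter expressed with pysam instead of a shelled-out samtools pipeline. pysam is not used anywhere in this section and the filenames are made up, so this is purely illustrative of what the -F flag chain above selects.

import pysam


def is_discordant(read):
    """Illustrative predicate mirroring the -F flag chain above."""
    return not (read.is_proper_pair or read.is_secondary or
                read.is_unmapped or read.mate_is_unmapped or
                read.is_duplicate)


# Hypothetical input and output filenames for the example.
with pysam.AlignmentFile('bwa_align.bam', 'rb') as in_bam, \
        pysam.AlignmentFile('bwa_discordant_pairs.bam', 'wb',
                            template=in_bam) as out_bam:
    for read in in_bam:
        if is_discordant(read):
            out_bam.write(read)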
def align_with_bwa_mem(alignment_group, sample_alignment):
    """REPLACES OLD BWA PIPELINE USING ALN AND SAMPE/SAMSE

    Aligns a sample to a reference genome using the bwa tool.

    Args:
        alignment_group: AlignmentGroup that this alignment is part of.
        sample_alignment: ExperimentSampleToAlignment. The respective
            dataset is assumed to have been created as well.
    """
    # Start by getting fresh objects from database.
    sample_alignment = ExperimentSampleToAlignment.objects.get(
            id=sample_alignment.id)
    experiment_sample = sample_alignment.experiment_sample
    alignment_group = AlignmentGroup.objects.get(id=alignment_group.id)

    # Grab the reference genome fasta for the alignment.
    ref_genome_fasta = get_dataset_with_type(
            alignment_group.reference_genome,
            Dataset.TYPE.REFERENCE_GENOME_FASTA).get_absolute_location()

    # Get the BWA Dataset and set it to computing.
    bwa_dataset = sample_alignment.dataset_set.get(
            type=Dataset.TYPE.BWA_ALIGN)
    bwa_dataset.status = Dataset.STATUS.COMPUTING
    bwa_dataset.save(update_fields=['status'])

    # Create a file that we'll write stderr to.
    error_path = os.path.join(sample_alignment.get_model_data_dir(),
            'bwa_align.error')
    error_output = open(error_path, 'w')

    # The alignment group is now officially ALIGNING.
    if alignment_group.status != AlignmentGroup.STATUS.ALIGNING:
        alignment_group.status = AlignmentGroup.STATUS.ALIGNING
        alignment_group.start_time = datetime.now()
        alignment_group.end_time = None
        alignment_group.save(
                update_fields=['status', 'start_time', 'end_time'])

    error_output.write(
            "==START OF ALIGNMENT PIPELINE FOR %s, (%s) ==\n" % (
                    sample_alignment.experiment_sample.label,
                    sample_alignment.uid))

    # We wrap the alignment logic in a try-except so that if an error occurs,
    # we record it and update the status of the Dataset to FAILED if anything
    # should fail.
    try:
        # Build index if the index doesn't exist.
        # NOTE: When aligning multiple samples to the same reference genome
        # concurrently, the build index method should be called once to
        # completion before starting the concurrent alignment jobs.
        ensure_bwa_index(ref_genome_fasta)

        # Grab the fastq sources, and determine whether we are doing
        # paired-end reads.

        # First, grab fastq1, which must exist.
        fq1_queryset = experiment_sample.dataset_set.filter(
                type=Dataset.TYPE.FASTQ1)
        assert fq1_queryset, "Must have at least one .fastq file"
        fq1_dataset = fq1_queryset[0]
        input_reads_1_fq = fq1_dataset.wrap_if_compressed()
        input_reads_1_fq_path = fq1_dataset.get_absolute_location()

        # Second, check if fastq2 exists and set is_paired_end.
        fq2_queryset = experiment_sample.dataset_set.filter(
                type=Dataset.TYPE.FASTQ2)
        if fq2_queryset:
            is_paired_end = True
            fq2_dataset = fq2_queryset[0]
            input_reads_2_fq = fq2_dataset.wrap_if_compressed()
            input_reads_2_fq_path = fq2_dataset.get_absolute_location()
        else:
            is_paired_end = False

        # 1. Generate SA coordinates.
        read_fq_1_path, read_fq_1_fn = os.path.split(input_reads_1_fq_path)

        align_input_args = ' '.join([
                '%s/bwa/bwa' % settings.TOOLS_DIR,
                'mem',
                '-t', '1',  # threads
                '-R', '"' + read_group_string(experiment_sample) + '"',
                # uncomment this to keep secondary alignments (for finding
                # and marking paralogy regions)
                # But before we can uncomment we need to fix de novo
                # assembly code
                '-a',
                ref_genome_fasta,
                input_reads_1_fq,
        ])

        if is_paired_end:
            read_fq_2_path, read_fq_2_fn = os.path.split(
                    input_reads_2_fq_path)
            align_input_args += ' ' + input_reads_2_fq

        # To skip saving the SAM file to disk directly, pipe output directly
        # to make a BAM file.
        align_input_args += ' | ' + settings.SAMTOOLS_BINARY + ' view -bS -'

        ### 2. Generate SAM output.
        output_bam = os.path.join(sample_alignment.get_model_data_dir(),
                'bwa_align.bam')

        error_output.write(align_input_args)

        # Flush the output here so it gets written before the alignments.
        error_output.flush()

        with open(output_bam, 'w') as fh:
            subprocess.check_call(align_input_args,
                    stdout=fh, stderr=error_output,
                    shell=True, executable=settings.BASH_PATH)

        # Set processing mask to not compute insert metrics if reads are
        # not paired end, as the lumpy script only works on paired-end reads.
        opt_processing_mask = {}
        if not is_paired_end:
            opt_processing_mask['compute_insert_metrics'] = False

        # Do several layers of processing on top of the initial alignment.
        result_bam_file = process_sam_bam_file(sample_alignment,
                alignment_group.reference_genome, output_bam, error_output,
                opt_processing_mask=opt_processing_mask)

        # Add the resulting file to the dataset.
        bwa_dataset.filesystem_location = clean_filesystem_location(
                result_bam_file)
        bwa_dataset.save()

        # Isolate split and discordant reads for SV calling.
        get_discordant_read_pairs(sample_alignment)
        get_split_reads(sample_alignment)

        # Add track to JBrowse.
        add_bam_file_track(alignment_group.reference_genome,
                sample_alignment, Dataset.TYPE.BWA_ALIGN)

        bwa_dataset.status = Dataset.STATUS.READY
        bwa_dataset.save()

        delete_redundant_files(sample_alignment.get_model_data_dir())

    except:
        import traceback
        error_output.write(traceback.format_exc())
        bwa_dataset.status = Dataset.STATUS.FAILED
        bwa_dataset.save()
        return

    finally:
        print error_path
        error_output.write('==END OF ALIGNMENT PIPELINE==\n')
        error_output.close()

        # Add the error Dataset to the object.
        error_dataset = Dataset.objects.create(
                label=Dataset.TYPE.BWA_ALIGN_ERROR,
                type=Dataset.TYPE.BWA_ALIGN_ERROR,
                filesystem_location=clean_filesystem_location(error_path))
        sample_alignment.dataset_set.add(error_dataset)
        sample_alignment.save()

    return sample_alignment
def generate_new_reference_genome(variant_set, new_ref_genome_params):
    """Uses reference_genome_maker code to create a new ReferenceGenome
    from the given VariantSet (applies Variants to an existing
    ReferenceGenome).

    Args:
        variant_set: The VariantSet from which we'll generate the new
            ReferenceGenome.
        new_ref_genome_params: Dictionary of params, including:
            * label (required): Label for the new ReferenceGenome.

    Returns:
        The new ReferenceGenome.

    Raises:
        ValidationException if we don't support this use case.
    """
    try:
        # Validate / parse params.
        assert 'label' in new_ref_genome_params
        new_ref_genome_label = new_ref_genome_params['label']
        original_ref_genome = variant_set.reference_genome

        # Create a ReferenceGenome to track the position.
        reference_genome = ReferenceGenome.objects.create(
                project=original_ref_genome.project,
                label=new_ref_genome_label)

        # Location for the generated Genbank.
        filename_prefix = generate_safe_filename_prefix_from_label(
                new_ref_genome_label)
        output_root = os.path.join(reference_genome.get_model_data_dir(),
                filename_prefix)
        full_output_path = clean_filesystem_location(output_root + '.genbank')

        # Dataset to track the location.
        dataset = Dataset.objects.create(
                label=Dataset.TYPE.REFERENCE_GENOME_GENBANK,
                type=Dataset.TYPE.REFERENCE_GENOME_GENBANK,
                filesystem_location=full_output_path,
                status=Dataset.STATUS.NOT_STARTED)
        reference_genome.dataset_set.add(dataset)

        # Prepare params for calling reference_genome_maker.
        # If the old genome is annotated, use the Genbank; otherwise use the
        # FASTA. The BioPython SeqRecord should be the same either way.
        if original_ref_genome.is_annotated():
            original_genome_path = original_ref_genome.dataset_set.get(
                    type=Dataset.TYPE.REFERENCE_GENOME_GENBANK).\
                            get_absolute_location()
            sequence_record = SeqIO.read(original_genome_path, 'genbank')
        else:
            original_genome_path = original_ref_genome.dataset_set.get(
                    type=Dataset.TYPE.REFERENCE_GENOME_FASTA).\
                            get_absolute_location()
            sequence_record = SeqIO.read(original_genome_path, 'fasta')

        filename_prefix = generate_safe_filename_prefix_from_label(
                new_ref_genome_label)
        output_root = os.path.join(reference_genome.get_model_data_dir(),
                filename_prefix)

        # Create a fake, empty vcf path for now, as we're just getting
        # end-to-end to work.
        if not os.path.exists(settings.TEMP_FILE_ROOT):
            os.mkdir(settings.TEMP_FILE_ROOT)
        _, vcf_path = tempfile.mkstemp(
                suffix='_' + filename_prefix + '.vcf',
                dir=settings.TEMP_FILE_ROOT)
        with open(vcf_path, 'w') as vcf_fh:
            export_variant_set_as_vcf(variant_set, vcf_fh)

        dataset.status = Dataset.STATUS.COMPUTING
        dataset.save(update_fields=['status'])
        try:
            new_ref_genome_seq_record = reference_genome_maker.run(
                    sequence_record, output_root, vcf_path)
        except Exception as e:
            dataset.status = Dataset.STATUS.FAILED
            dataset.save(update_fields=['status'])
            raise e

        reference_genome.save()
        dataset.status = Dataset.STATUS.READY
        dataset.save(update_fields=['status'])

        # Since the post_add_seq_to_ref_genome() signal couldn't run before,
        # we need to make sure to run it now.
        prepare_ref_genome_related_datasets(reference_genome, dataset)

        return reference_genome

    except Exception as e:
        raise ValidationException(str(e))
def generate_new_reference_genome(variant_set, new_ref_genome_params):
    """Uses reference_genome_maker code to create a new ReferenceGenome
    from the given VariantSet (applies Variants to an existing
    ReferenceGenome).

    Args:
        variant_set: The VariantSet from which we'll generate the new
            ReferenceGenome.
        new_ref_genome_params: Dictionary of params, including:
            * label (required): Label for the new ReferenceGenome.

    Returns:
        The new ReferenceGenome.

    Raises:
        ValidationException if we don't support this use case.
    """
    try:
        # Validate / parse params.
        assert 'label' in new_ref_genome_params
        new_ref_genome_label = new_ref_genome_params['label']
        original_ref_genome = variant_set.reference_genome

        # Create a ReferenceGenome to track the position.
        reference_genome = ReferenceGenome.objects.create(
                project=original_ref_genome.project,
                label=new_ref_genome_label)

        # Location for the generated Genbank.
        filename_prefix = generate_safe_filename_prefix_from_label(
                new_ref_genome_label)
        output_root = os.path.join(reference_genome.get_model_data_dir(),
                filename_prefix)
        full_output_path = clean_filesystem_location(output_root + '.genbank')

        # Dataset to track the location.
        dataset = Dataset.objects.create(
                label=Dataset.TYPE.REFERENCE_GENOME_GENBANK,
                type=Dataset.TYPE.REFERENCE_GENOME_GENBANK,
                filesystem_location=full_output_path,
                status=Dataset.STATUS.NOT_STARTED)
        reference_genome.dataset_set.add(dataset)

        # Prepare params for calling reference_genome_maker.
        original_fasta_path = original_ref_genome.dataset_set.get(
                type=Dataset.TYPE.REFERENCE_GENOME_FASTA).\
                        get_absolute_location()
        sequence_record = SeqIO.read(original_fasta_path, 'fasta')

        filename_prefix = generate_safe_filename_prefix_from_label(
                new_ref_genome_label)
        output_root = os.path.join(reference_genome.get_model_data_dir(),
                filename_prefix)

        # Create a fake, empty vcf path for now, as we're just getting
        # end-to-end to work.
        if not os.path.exists(settings.TEMP_FILE_ROOT):
            os.mkdir(settings.TEMP_FILE_ROOT)
        _, vcf_path = tempfile.mkstemp(
                suffix='_' + filename_prefix + '.vcf',
                dir=settings.TEMP_FILE_ROOT)
        with open(vcf_path, 'w') as vcf_fh:
            export_variant_set_as_vcf(variant_set, vcf_fh)

        dataset.status = Dataset.STATUS.COMPUTING
        dataset.save(update_fields=['status'])
        try:
            new_ref_genome_seq_record = reference_genome_maker.run(
                    sequence_record, output_root, vcf_path)
        except Exception as e:
            dataset.status = Dataset.STATUS.FAILED
            dataset.save(update_fields=['status'])
            raise e

        reference_genome.save()
        dataset.status = Dataset.STATUS.READY
        dataset.save(update_fields=['status'])

        # Since the post_add_seq_to_ref_genome() signal couldn't run before,
        # we need to make sure to run it now.
        prepare_ref_genome_related_datasets(reference_genome, dataset)

        return reference_genome

    except Exception as e:
        raise ValidationException(str(e))