def test_post_add_seq_to_ref_genome(self):
    """Check the derived files created when a reference genome is added.

    Verifies that the snpEff Genbank file, the snpEff predictor database,
    the FASTA dataset and the GFF dataset all exist on the filesystem.
    """
    ref_genome = self.test_ext_ref_genome

    # snpEff: the Genbank file must exist at the path the model reports.
    genbank_location = ref_genome.get_snpeff_genbank_file_path()
    self.assertTrue(
        os.path.exists(genbank_location),
        'snpeff gbk conversion failed: %s' % genbank_location)

    # snpEff: the predictor database is expected inside the directory
    # returned by get_snpeff_genbank_parent_dir().
    predictor_db_location = os.path.join(
        ref_genome.get_snpeff_genbank_parent_dir(),
        'snpEffectPredictor.bin')
    self.assertTrue(
        os.path.exists(predictor_db_location), 'snpeff db was not made')

    # FASTA and GFF datasets, checked in that order.
    converted_datasets = (
        (Dataset.TYPE.REFERENCE_GENOME_FASTA, 'fasta conversion failed'),
        (Dataset.TYPE.REFERENCE_GENOME_GFF, 'gff conversion failed'),
    )
    for dataset_type, failure_message in converted_datasets:
        dataset = get_dataset_with_type(ref_genome, type=dataset_type)
        assert os.path.exists(dataset.get_absolute_location()), (
            failure_message)
def test_post_add_seq_to_ref_genome(self):
    """Check the derived files created when a reference genome is added.

    Asserts that genes.gb and snpEffectPredictor.bin exist in the genome's
    snpEff directory, and that the FASTA and GFF datasets exist on disk.
    """
    ref_genome = self.test_ext_ref_genome

    def _snpeff_file(filename):
        """Return the path of filename inside the snpEff directory."""
        return os.path.join(ref_genome.get_snpeff_directory_path(), filename)

    # snpEff: the Genbank file is expected under the name genes.gb.
    genbank_location = _snpeff_file('genes.gb')
    assert os.path.exists(genbank_location), (
        'snpeff gbk conversion failed: %s' % genbank_location)

    # snpEff: the predictor database must have been built.
    assert os.path.exists(_snpeff_file('snpEffectPredictor.bin')), (
        'snpeff db was not made')

    # FASTA and GFF datasets, checked in that order.
    converted_datasets = (
        (Dataset.TYPE.REFERENCE_GENOME_FASTA, 'fasta conversion failed'),
        (Dataset.TYPE.REFERENCE_GENOME_GFF, 'gff conversion failed'),
    )
    for dataset_type, failure_message in converted_datasets:
        dataset = get_dataset_with_type(ref_genome, type=dataset_type)
        assert os.path.exists(dataset.get_absolute_location()), (
            failure_message)
def test_compress_dataset(self):
    """Compress a Genbank dataset and check the new Dataset is registered.

    Creates a user, project and reference genome, compresses the genome's
    Genbank dataset, then looks the compressed dataset up again through the
    reference genome and asserts both are equal.
    """
    owner = User.objects.create_user(
        TEST_USERNAME, password=TEST_PASSWORD, email=TEST_EMAIL)
    self.test_project = Project.objects.create(
        title=TEST_PROJECT_NAME, owner=owner.get_profile())
    self.test_ref_genome = import_reference_genome_from_local_file(
        self.test_project, TEST_REF_GENOME_NAME, TEST_REF_GENOME_PATH,
        'genbank')

    genbank_type = Dataset.TYPE.REFERENCE_GENOME_GENBANK
    uncompressed = get_dataset_with_type(
        self.test_ref_genome, type=genbank_type)

    # make_compressed() is the call under test.
    created = uncompressed.make_compressed('.gz')

    # Fetch the compressed dataset via the reference genome to confirm it
    # was attached to it.
    looked_up = get_dataset_with_type(
        entity=self.test_ref_genome, type=genbank_type, compressed=True)
    assert created == looked_up
def test_compress_dataset(self):
    """Compress a Genbank dataset and check the new Dataset is registered.

    Creates a user, project and reference genome, compresses the genome's
    Genbank dataset, then looks the compressed dataset up again through the
    reference genome and asserts both are equal.
    """
    owner = User.objects.create_user(
        TEST_USERNAME, password=TEST_PASSWORD, email=TEST_EMAIL)
    self.test_project = Project.objects.create(
        title=TEST_PROJECT_NAME, owner=owner.get_profile())
    self.test_ref_genome = import_reference_genome_from_local_file(
        self.test_project, TEST_REF_GENOME_NAME, TEST_REF_GENOME_PATH,
        'genbank')

    genbank_type = Dataset.TYPE.REFERENCE_GENOME_GENBANK
    uncompressed = get_dataset_with_type(
        self.test_ref_genome, type=genbank_type)

    # make_compressed() is the call under test.
    created = uncompressed.make_compressed('.gz')

    # Fetch the compressed dataset via the reference genome to confirm it
    # was attached to it.
    looked_up = get_dataset_with_type(
        entity=self.test_ref_genome, type=genbank_type, compressed=True)
    assert created == looked_up
def add_genbank_file_track(reference_genome, **kwargs):
    """Create the JBrowse 'gbk' feature track for a reference genome.

    Runs JBrowse's flatfile-to-json.pl on the genome's GFF dataset, writing
    into <jbrowse dir>/indiv_tracks/gbk, then merges style settings into
    every 'Genome Features' entry of the 'gbk' tracklist json.

    Args:
        reference_genome: Genome whose Genbank and GFF Datasets are read.
        **kwargs: Accepted but not used.

    Raises:
        subprocess.CalledProcessError: If flatfile-to-json.pl exits non-zero.
    """
    track_key = 'Genome Features'
    flatfile_to_json = os.path.join(JBROWSE_BIN_PATH, 'flatfile-to-json.pl')

    # NOTE(review): the Genbank path is looked up but never used below; the
    # lookup is kept because it fails when no Genbank Dataset is attached.
    genbank_dataset = get_dataset_with_type(
        reference_genome, type=Dataset.TYPE.REFERENCE_GENOME_GENBANK)
    genbank_dataset.get_absolute_location()

    jbrowse_dir = reference_genome.get_jbrowse_directory_path()

    gff_dataset = get_dataset_with_type(
        reference_genome, type=Dataset.TYPE.REFERENCE_GENOME_GFF)
    gff_location = gff_dataset.get_absolute_location()

    style_overrides = {
        'style': {
            'label': 'name,CDS,gene',
            'description': 'note,function,gene_synonym',
            'color': '#5fbcdd'
        }
    }

    # Options left disabled by the original author: --getSubfeatures,
    # --className transcript, --subfeatureClasses {"CDS":"transcript-CDS"}.
    command = [flatfile_to_json]
    command += ['--gff', gff_location]
    command += ['--out', os.path.join(jbrowse_dir, 'indiv_tracks', 'gbk')]
    command += ['--type', JBROWSE_GBK_TYPES_TO_DISPLAY]
    command += ['--autocomplete', 'all']
    command += ['--trackLabel', 'gbk']
    command += ['--key', track_key]
    command += ['--trackType', "CanvasFeatures"]
    subprocess.check_call(command)

    # Finally, patch the style info into the tracklist json by hand.
    tracklist_json = get_tracklist_json(reference_genome, 'gbk')
    tracks = tracklist_json['tracks']
    tracks[:] = [
        merge_nested_dictionaries(track, style_overrides)
        if track['key'] == track_key else track
        for track in tracks]
    write_tracklist_json(reference_genome, tracklist_json, 'gbk')
def test_basic(self):
    """Apply SNPs to a Genbank genome and check the generated genome.

    Creates a Variant with alternate 'G' at positions 10, 20, ..., 110,
    generates a new reference genome from the variant set, and checks that
    the length is unchanged, the mutations are present, and the new genome
    is annotated.
    """
    genbank_type = Dataset.TYPE.REFERENCE_GENOME_GENBANK
    mutated_positions = range(10, 111, 10)

    self.reference_genome = import_reference_genome_from_local_file(
        self.project, 'ref_genome', TEST_GENBANK, 'genbank')

    variant_set = VariantSet.objects.create(
        reference_genome=self.reference_genome, label='vs1')

    original_path = get_dataset_with_type(
        self.reference_genome, genbank_type).get_absolute_location()
    with open(original_path) as original_fh:
        original_record = SeqIO.read(original_fh, 'genbank')

    for position in mutated_positions:
        # Positions are 1-based; the sequence record is 0-indexed.
        variant = Variant.objects.create(
            type=Variant.TYPE.TRANSITION,
            reference_genome=self.reference_genome,
            chromosome=Chromosome.objects.get(
                reference_genome=self.reference_genome),
            position=position,
            ref_value=original_record[position - 1])
        VariantAlternate.objects.create(variant=variant, alt_value='G')
        VariantToVariantSet.objects.create(
            variant=variant, variant_set=variant_set)

    new_ref_genome = generate_new_reference_genome(
        variant_set, {'label': 'new'})

    generated_path = get_dataset_with_type(
        new_ref_genome, genbank_type).get_absolute_location()
    with open(generated_path) as generated_fh:
        generated_record = SeqIO.read(generated_fh, 'genbank')

    # Assert size unchanged.
    self.assertEqual(len(generated_record), len(original_record))

    # Assert mutations are there.
    for position in mutated_positions:
        self.assertEqual('G', str(generated_record[position - 1]))

    # Assert new genome is annotated.
    self.assertTrue(new_ref_genome.is_annotated())
def get_split_reads(sample_alignment):
    """Isolate split reads from a sample alignment.

    This uses a python script supplied with Lumpy that is run as a
    separate process.

    NOTE THAT THIS SCRIPT ONLY WORKS WITH BWA MEM.

    An existing BWA_SPLIT Dataset is returned untouched when it is READY and
    its file exists; otherwise the Dataset is created if needed and
    (re)computed from the BWA_ALIGN bam.

    Args:
        sample_alignment: Object the BWA_ALIGN and BWA_SPLIT Datasets are
            attached to.

    Returns:
        The BWA_SPLIT Dataset. Its status is READY when extraction
        succeeded, and FAILED with an empty filesystem_location when
        extraction raised subprocess.CalledProcessError.

    Raises:
        AssertionError: If the BWA_ALIGN bam file does not exist.
    """
    bwa_split_dataset = get_dataset_with_type(
        sample_alignment, Dataset.TYPE.BWA_SPLIT)
    if bwa_split_dataset is not None:
        if (bwa_split_dataset.status == Dataset.STATUS.READY and
                os.path.exists(bwa_split_dataset.get_absolute_location())):
            return bwa_split_dataset
    else:
        bwa_split_dataset = Dataset.objects.create(
            label=Dataset.TYPE.BWA_SPLIT,
            type=Dataset.TYPE.BWA_SPLIT,
            status=Dataset.STATUS.NOT_STARTED)
        sample_alignment.dataset_set.add(bwa_split_dataset)

    # If here, we are going to run or re-run the Dataset.
    bwa_split_dataset.status = Dataset.STATUS.NOT_STARTED
    bwa_split_dataset.save(update_fields=['status'])

    bam_dataset = get_dataset_with_type(
        sample_alignment, Dataset.TYPE.BWA_ALIGN)
    bam_filename = bam_dataset.get_absolute_location()

    assert os.path.exists(bam_filename), "BAM file '%s' is missing." % (
        bam_filename)

    bam_split_filename = os.path.join(
        sample_alignment.get_model_data_dir(), 'bwa_split_reads.bam')

    bwa_split_dataset.status = Dataset.STATUS.COMPUTING
    bwa_split_dataset.save(update_fields=['status'])

    # The outcome is recorded in `finally` so that it is saved on every
    # path, but READY is only written when extraction really succeeded.
    # Previously the `finally` clause overwrote FAILED with READY.
    succeeded = False
    try:
        extract_split_reads(bam_filename, bam_split_filename)
        succeeded = True
    except subprocess.CalledProcessError:
        # If there are no split reads, then fail (recorded below) without
        # propagating the error to the caller.
        pass
    finally:
        if succeeded:
            bwa_split_dataset.status = Dataset.STATUS.READY
            bwa_split_dataset.filesystem_location = (
                clean_filesystem_location(bam_split_filename))
        else:
            bwa_split_dataset.status = Dataset.STATUS.FAILED
            bwa_split_dataset.filesystem_location = ''
        bwa_split_dataset.save()

    return bwa_split_dataset
def test_basic(self):
    """Apply SNPs to a Genbank genome and check the generated genome.

    Creates a Variant with alternate 'G' at positions 10, 20, ..., 110,
    generates a new reference genome from the variant set, and checks that
    the length is unchanged, the mutations are present, and the new genome
    is annotated.
    """
    genbank_type = Dataset.TYPE.REFERENCE_GENOME_GENBANK
    mutated_positions = range(10, 111, 10)

    self.reference_genome = import_reference_genome_from_local_file(
        self.project, 'ref_genome', TEST_GENBANK, 'genbank')

    variant_set = VariantSet.objects.create(
        reference_genome=self.reference_genome, label='vs1')

    original_path = get_dataset_with_type(
        self.reference_genome, genbank_type).get_absolute_location()
    with open(original_path) as original_fh:
        original_record = SeqIO.read(original_fh, 'genbank')

    for position in mutated_positions:
        # Positions are 1-based; the sequence record is 0-indexed.
        variant = Variant.objects.create(
            type=Variant.TYPE.TRANSITION,
            reference_genome=self.reference_genome,
            chromosome=Chromosome.objects.get(
                reference_genome=self.reference_genome),
            position=position,
            ref_value=original_record[position - 1])
        VariantAlternate.objects.create(variant=variant, alt_value='G')
        VariantToVariantSet.objects.create(
            variant=variant, variant_set=variant_set)

    new_ref_genome = generate_new_reference_genome(
        variant_set, {'label': 'new'})

    generated_path = get_dataset_with_type(
        new_ref_genome, genbank_type).get_absolute_location()
    with open(generated_path) as generated_fh:
        generated_record = SeqIO.read(generated_fh, 'genbank')

    # Assert size unchanged.
    self.assertEqual(len(generated_record), len(original_record))

    # Assert mutations are there.
    for position in mutated_positions:
        self.assertEqual('G', str(generated_record[position - 1]))

    # Assert new genome is annotated.
    self.assertTrue(new_ref_genome.is_annotated())
def get_split_reads(sample_alignment):
    """Isolate split reads from a sample alignment.

    This uses a python script supplied with Lumpy that is run as a
    separate process.

    NOTE THAT THIS SCRIPT ONLY WORKS WITH BWA MEM.

    An existing BWA_SPLIT Dataset is returned untouched when it is READY and
    its file exists; otherwise the Dataset is created if needed and
    (re)computed from the BWA_ALIGN bam.

    Args:
        sample_alignment: Object the BWA_ALIGN and BWA_SPLIT Datasets are
            attached to.

    Returns:
        The BWA_SPLIT Dataset. Its status is READY when extraction
        succeeded, and FAILED with an empty filesystem_location when
        extraction raised subprocess.CalledProcessError.

    Raises:
        AssertionError: If the BWA_ALIGN bam file does not exist.
    """
    bwa_split_dataset = get_dataset_with_type(
        sample_alignment, Dataset.TYPE.BWA_SPLIT)
    if bwa_split_dataset is not None:
        if (bwa_split_dataset.status == Dataset.STATUS.READY and
                os.path.exists(bwa_split_dataset.get_absolute_location())):
            return bwa_split_dataset
    else:
        bwa_split_dataset = Dataset.objects.create(
            label=Dataset.TYPE.BWA_SPLIT,
            type=Dataset.TYPE.BWA_SPLIT,
            status=Dataset.STATUS.NOT_STARTED)
        sample_alignment.dataset_set.add(bwa_split_dataset)

    # If here, we are going to run or re-run the Dataset.
    bwa_split_dataset.status = Dataset.STATUS.NOT_STARTED
    bwa_split_dataset.save(update_fields=['status'])

    bam_dataset = get_dataset_with_type(
        sample_alignment, Dataset.TYPE.BWA_ALIGN)
    bam_filename = bam_dataset.get_absolute_location()

    assert os.path.exists(bam_filename), "BAM file '%s' is missing." % (
        bam_filename)

    bam_split_filename = os.path.join(
        sample_alignment.get_model_data_dir(), 'bwa_split_reads.bam')

    bwa_split_dataset.status = Dataset.STATUS.COMPUTING
    bwa_split_dataset.save(update_fields=['status'])

    # The outcome is recorded in `finally` so that it is saved on every
    # path, but READY is only written when extraction really succeeded.
    # Previously the `finally` clause overwrote FAILED with READY.
    succeeded = False
    try:
        extract_split_reads(bam_filename, bam_split_filename)
        succeeded = True
    except subprocess.CalledProcessError:
        # If there are no split reads, then fail (recorded below) without
        # propagating the error to the caller.
        pass
    finally:
        if succeeded:
            bwa_split_dataset.status = Dataset.STATUS.READY
            bwa_split_dataset.filesystem_location = (
                clean_filesystem_location(bam_split_filename))
        else:
            bwa_split_dataset.status = Dataset.STATUS.FAILED
            bwa_split_dataset.filesystem_location = ''
        bwa_split_dataset.save()

    return bwa_split_dataset
def get_discordant_read_pairs(sample_alignment):
    """Isolate discordant pairs of reads from a sample alignment.

    An existing BWA_DISCORDANT Dataset is returned untouched when it is
    READY and its file exists; otherwise the Dataset is created if needed
    and (re)computed from the BWA_ALIGN bam, which is indexed first when no
    '.bai' file sits next to it.

    Args:
        sample_alignment: Object the BWA_ALIGN and BWA_DISCORDANT Datasets
            are attached to.

    Returns:
        The BWA_DISCORDANT Dataset. Its status is READY when extraction
        succeeded, and FAILED with an empty filesystem_location when
        extraction raised subprocess.CalledProcessError.

    Raises:
        AssertionError: If the BWA_ALIGN bam file does not exist.
    """
    # First, check if completed dataset already exists.
    bwa_disc_dataset = get_dataset_with_type(
        sample_alignment, Dataset.TYPE.BWA_DISCORDANT)
    if bwa_disc_dataset is not None:
        if (bwa_disc_dataset.status == Dataset.STATUS.READY and
                os.path.exists(bwa_disc_dataset.get_absolute_location())):
            return bwa_disc_dataset
    else:
        bwa_disc_dataset = Dataset.objects.create(
            label=Dataset.TYPE.BWA_DISCORDANT,
            type=Dataset.TYPE.BWA_DISCORDANT)
        sample_alignment.dataset_set.add(bwa_disc_dataset)

    # If here, we are going to run or re-run the Dataset.
    bwa_disc_dataset.status = Dataset.STATUS.NOT_STARTED
    bwa_disc_dataset.save(update_fields=['status'])

    bam_dataset = get_dataset_with_type(
        sample_alignment, Dataset.TYPE.BWA_ALIGN)
    bam_filename = bam_dataset.get_absolute_location()

    assert os.path.exists(bam_filename), "BAM file '%s' is missing." % (
        bam_filename)

    # NOTE: This assumes the index just adds at .bai, w/ same path otherwise
    # - will this always be true?
    if not os.path.exists(bam_filename + '.bai'):
        index_bam_file(bam_filename)

    bam_discordant_filename = os.path.join(
        sample_alignment.get_model_data_dir(), 'bwa_discordant_pairs.bam')

    bwa_disc_dataset.status = Dataset.STATUS.COMPUTING
    bwa_disc_dataset.save(update_fields=['status'])

    # The outcome is recorded in `finally` so that it is saved on every
    # path, but READY is only written when extraction really succeeded.
    # Previously the `finally` clause overwrote FAILED with READY.
    succeeded = False
    try:
        extract_discordant_read_pairs(bam_filename, bam_discordant_filename)
        succeeded = True
    except subprocess.CalledProcessError:
        # Recorded as FAILED below; not propagated to the caller.
        pass
    finally:
        if succeeded:
            bwa_disc_dataset.status = Dataset.STATUS.READY
            bwa_disc_dataset.filesystem_location = (
                clean_filesystem_location(bam_discordant_filename))
        else:
            bwa_disc_dataset.status = Dataset.STATUS.FAILED
            bwa_disc_dataset.filesystem_location = ''
        bwa_disc_dataset.save()

    return bwa_disc_dataset
def add_genbank_file_track(reference_genome, **kwargs):
    """Create the JBrowse 'gbk' feature track for a reference genome.

    Runs JBrowse's flatfile-to-json.pl on the genome's GFF dataset, writing
    into <jbrowse dir>/indiv_tracks/gbk, then merges style settings into
    every 'Genome Features' entry of the 'gbk' tracklist json.

    Args:
        reference_genome: Genome whose Genbank and GFF Datasets are read.
        **kwargs: Accepted but not used.

    Raises:
        subprocess.CalledProcessError: If flatfile-to-json.pl exits non-zero.
    """
    track_key = 'Genome Features'
    flatfile_to_json = os.path.join(JBROWSE_BIN_PATH, 'flatfile-to-json.pl')

    # NOTE(review): the Genbank path is looked up but never used below; the
    # lookup is kept because it fails when no Genbank Dataset is attached.
    genbank_dataset = get_dataset_with_type(
        reference_genome, type=Dataset.TYPE.REFERENCE_GENOME_GENBANK)
    genbank_dataset.get_absolute_location()

    jbrowse_dir = reference_genome.get_jbrowse_directory_path()

    gff_dataset = get_dataset_with_type(
        reference_genome, type=Dataset.TYPE.REFERENCE_GENOME_GFF)
    gff_location = gff_dataset.get_absolute_location()

    style_overrides = {
        'style': {
            'label': 'name,CDS,gene',
            'description': 'note,function,gene_synonym',
            'color': '#5fbcdd'
        }
    }

    # Options left disabled by the original author: --getSubfeatures,
    # --className transcript, --subfeatureClasses {"CDS":"transcript-CDS"}.
    command = [flatfile_to_json]
    command += ['--gff', gff_location]
    command += ['--out', os.path.join(jbrowse_dir, 'indiv_tracks', 'gbk')]
    command += ['--type', JBROWSE_GBK_TYPES_TO_DISPLAY]
    command += ['--autocomplete', 'all']
    command += ['--trackLabel', 'gbk']
    command += ['--key', track_key]
    command += ['--trackType', "CanvasFeatures"]
    subprocess.check_call(command)

    # Finally, patch the style info into the tracklist json by hand.
    tracklist_json = get_tracklist_json(reference_genome, 'gbk')
    tracks = tracklist_json['tracks']
    tracks[:] = [
        merge_nested_dictionaries(track, style_overrides)
        if track['key'] == track_key else track
        for track in tracks]
    write_tracklist_json(reference_genome, tracklist_json, 'gbk')
def get_discordant_read_pairs(sample_alignment):
    """Isolate discordant pairs of reads from a sample alignment.

    An existing BWA_DISCORDANT Dataset is returned untouched when it is
    READY and its file exists; otherwise the Dataset is created if needed
    and (re)computed from the BWA_ALIGN bam, which is indexed first when no
    '.bai' file sits next to it.

    Args:
        sample_alignment: Object the BWA_ALIGN and BWA_DISCORDANT Datasets
            are attached to.

    Returns:
        The BWA_DISCORDANT Dataset. Its status is READY when extraction
        succeeded, and FAILED with an empty filesystem_location when
        extraction raised subprocess.CalledProcessError.

    Raises:
        AssertionError: If the BWA_ALIGN bam file does not exist.
    """
    # First, check if completed dataset already exists.
    bwa_disc_dataset = get_dataset_with_type(
        sample_alignment, Dataset.TYPE.BWA_DISCORDANT)
    if bwa_disc_dataset is not None:
        if (bwa_disc_dataset.status == Dataset.STATUS.READY and
                os.path.exists(bwa_disc_dataset.get_absolute_location())):
            return bwa_disc_dataset
    else:
        bwa_disc_dataset = Dataset.objects.create(
            label=Dataset.TYPE.BWA_DISCORDANT,
            type=Dataset.TYPE.BWA_DISCORDANT)
        sample_alignment.dataset_set.add(bwa_disc_dataset)

    # If here, we are going to run or re-run the Dataset.
    bwa_disc_dataset.status = Dataset.STATUS.NOT_STARTED
    bwa_disc_dataset.save(update_fields=['status'])

    bam_dataset = get_dataset_with_type(
        sample_alignment, Dataset.TYPE.BWA_ALIGN)
    bam_filename = bam_dataset.get_absolute_location()

    assert os.path.exists(bam_filename), "BAM file '%s' is missing." % (
        bam_filename)

    # NOTE: This assumes the index just adds at .bai, w/ same path otherwise
    # - will this always be true?
    if not os.path.exists(bam_filename + '.bai'):
        index_bam_file(bam_filename)

    bam_discordant_filename = os.path.join(
        sample_alignment.get_model_data_dir(), 'bwa_discordant_pairs.bam')

    bwa_disc_dataset.status = Dataset.STATUS.COMPUTING
    bwa_disc_dataset.save(update_fields=['status'])

    # The outcome is recorded in `finally` so that it is saved on every
    # path, but READY is only written when extraction really succeeded.
    # Previously the `finally` clause overwrote FAILED with READY.
    succeeded = False
    try:
        extract_discordant_read_pairs(bam_filename, bam_discordant_filename)
        succeeded = True
    except subprocess.CalledProcessError:
        # Recorded as FAILED below; not propagated to the caller.
        pass
    finally:
        if succeeded:
            bwa_disc_dataset.status = Dataset.STATUS.READY
            bwa_disc_dataset.filesystem_location = (
                clean_filesystem_location(bam_discordant_filename))
        else:
            bwa_disc_dataset.status = Dataset.STATUS.FAILED
            bwa_disc_dataset.filesystem_location = ''
        bwa_disc_dataset.save()

    return bwa_disc_dataset
def _fastqc_test_runner(self, fastq1_location, fastq2_location):
    """Helper that takes different fastqs as source.

    This function is a test itself: it attaches both fastq files to
    'sample_1', runs FastQC on each, and checks the resulting Datasets.

    Args:
        fastq1_location: Source path of the first fastq.
        fastq2_location: Source path of the second fastq.
    """
    sample = self.common_entities['sample_1']

    fastq1_dataset = copy_and_add_dataset_source(
        sample, Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ1, fastq1_location)
    # NOTE(review): the second argument is FASTQ1 here as well while the
    # third is FASTQ2 - possibly a copy/paste slip; confirm against
    # copy_and_add_dataset_source().
    fastq2_dataset = copy_and_add_dataset_source(
        sample, Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ2, fastq2_location)

    # Run FastQC on both reads.
    run_fastqc_on_sample_fastq(sample, fastq1_dataset)
    run_fastqc_on_sample_fastq(sample, fastq2_dataset, rev=True)

    # We expect 2 Dataset per Fastq so 4 total.
    self.assertEqual(4, Dataset.objects.count())

    # Each FastQC html Dataset must point at an existing file.
    for html_type in (Dataset.TYPE.FASTQC1_HTML, Dataset.TYPE.FASTQC2_HTML):
        html_dataset = get_dataset_with_type(sample, html_type)
        assert os.path.exists(html_dataset.get_absolute_location())
def get_features_at_locations(ref_genome, intervals, chromosome=None):
    """Look up indexed Genbank features that overlap the given intervals.

    Loads the pickled feature list from the genome's FEATURE_INDEX Dataset
    and, for every query interval, collects the indexed entries that
    intersect it (the query is treated as closed-open).

    Args:
        ref_genome: Genome that owns the FEATURE_INDEX Dataset.
        intervals: Iterable of intervals; each is unpacked into
            pyinter.closedopen() and used as a key of the result.
        chromosome: Not used by this function.

    Returns:
        Dict mapping each input interval to the list of overlapping
        entries from the feature index.
    """
    index_dataset = get_dataset_with_type(
        ref_genome, Dataset.TYPE.FEATURE_INDEX)
    with open(index_dataset.get_absolute_location(), 'r') as index_fh:
        indexed_features = pickle.load(index_fh)

    def _overlapping(interval):
        """Return the indexed features intersecting one query interval."""
        query = pyinter.closedopen(*interval)
        return [
            feature for feature in indexed_features
            if query.intersect(feature)]

    return {interval: _overlapping(interval) for interval in intervals}
def _run_genome_finish_test(self, variant_set, target_fasta,
        mismatch_tolerance=0):
    """Generate a genome from variant_set and compare it to target_fasta.

    Args:
        variant_set: Variant set that must contain at least one variant.
        target_fasta: Path of the fasta the new genome should match.
        mismatch_tolerance: Maximum number of differing indexes allowed.
    """
    self.assertTrue(
        variant_set.variants.exists(), 'No placeable contigs found.')

    # Make new reference genome.
    new_ref_genome = generate_new_reference_genome(
        variant_set, {'label': 'new_ref'})

    # Verify insertion was placed correctly.
    generated_fasta = get_dataset_with_type(
        new_ref_genome,
        Dataset.TYPE.REFERENCE_GENOME_FASTA).get_absolute_location()
    _same, mismatch_indexes = are_fastas_same(target_fasta, generated_fasta)

    # Only the first 50 mismatches are shown in the failure message.
    if len(mismatch_indexes) < 50:
        shown_indexes = str(mismatch_indexes)
    else:
        shown_indexes = str(mismatch_indexes[:50]) + '...'

    self.assertTrue(
        len(mismatch_indexes) <= mismatch_tolerance,
        'Fastas dissimilar at indexes:' + shown_indexes)
def get_features_at_locations(ref_genome, intervals, chromosome=None):
    """Look up indexed Genbank features that overlap the given intervals.

    Loads the pickled feature list from the genome's FEATURE_INDEX Dataset
    and, for every query interval, collects the indexed entries that
    intersect it (the query is treated as closed-open).

    Args:
        ref_genome: Genome that owns the FEATURE_INDEX Dataset.
        intervals: Iterable of intervals; each is unpacked into
            pyinter.closedopen() and used as a key of the result.
        chromosome: Not used by this function.

    Returns:
        Dict mapping each input interval to the list of overlapping
        entries from the feature index.
    """
    index_dataset = get_dataset_with_type(
        ref_genome, Dataset.TYPE.FEATURE_INDEX)
    with open(index_dataset.get_absolute_location(), 'r') as index_fh:
        indexed_features = pickle.load(index_fh)

    def _overlapping(interval):
        """Return the indexed features intersecting one query interval."""
        query = pyinter.closedopen(*interval)
        return [
            feature for feature in indexed_features
            if query.intersect(feature)]

    return {interval: _overlapping(interval) for interval in intervals}
def _find_valid_sample_alignments(alignment_group, alignment_type):
    """Returns a list of sample alignment objects for an alignment group,
    skipping those that failed.

    A sample alignment is kept when it has a Dataset of alignment_type whose
    status is READY. Every kept alignment must also have a non-empty bam
    file.

    Args:
        alignment_group: Group whose experimentsampletoalignment_set is
            inspected.
        alignment_type: Dataset type of the bam alignment to look up.

    Returns:
        List of the successful sample alignments.

    Raises:
        Exception: If no sample alignment is successful.
        AssertionError: If any kept alignment has an empty bam file.
    """
    # Filter out mis-aligned files.
    # TODO: Should we show in the UI that some alignments failed and are
    # being skipped?
    def _is_successful_alignment(sample_alignment):
        bam_dataset = get_dataset_with_type(sample_alignment, alignment_type)
        # A sample alignment without any Dataset of this type counts as
        # failed instead of raising AttributeError on None.
        return (bam_dataset is not None and
                bam_dataset.status == Dataset.STATUS.READY)

    sample_alignment_list = [
        sample_alignment for sample_alignment in
        alignment_group.experimentsampletoalignment_set.all()
        if _is_successful_alignment(sample_alignment)]

    if not sample_alignment_list:
        raise Exception('No successful alignments, Freebayes cannot proceed.')

    bam_files = [
        get_dataset_with_type(sa, alignment_type).get_absolute_location()
        for sa in sample_alignment_list]

    # Keep only valid (present and non-empty) bam files.
    valid_bam_files = [
        bam_file for bam_file in bam_files
        if bam_file is not None and os.stat(bam_file).st_size > 0]

    # Report the number of *valid* files; the previous message printed
    # len(bam_files), which always equals the expected count.
    assert len(valid_bam_files) == len(sample_alignment_list), (
        "Expected %d bam files, but found %d" % (
            len(sample_alignment_list), len(valid_bam_files)))

    return sample_alignment_list
def _vcf_to_vcftabix(vcf_dataset):
    """Compresses and indexes a vcf using samtools tabix.

    Creates a new Dataset model instance for the compressed version when
    none exists yet, then makes sure a tabix index is recorded for it.

    Args:
        vcf_dataset: Dataset pointing to a vcf, or its compressed version.
            Index may or may not exist.

    Returns:
        Dataset that points to the compressed version of vcf_dataset (or
        vcf_dataset itself if it is already compressed). The index file is
        asserted to exist for this Dataset.

    Raises:
        subprocess.CalledProcessError: If the tabix binary exits non-zero.
        AssertionError: If the index location is unexpected or the index
            file is missing.
    """
    # Step 1: get or create the compressed Dataset.
    if not vcf_dataset.is_compressed():
        # Look for an existing compressed version through a related model;
        # assume that the first related model will do.
        first_related = vcf_dataset.get_related_model_set().all()[0]
        tabix_dataset = get_dataset_with_type(
            entity=first_related,
            type=vcf_dataset.type,
            compressed=True)
        # Nothing compressed yet, so make it.
        if tabix_dataset is None:
            tabix_dataset = vcf_dataset.make_compressed('.bgz')
    else:
        tabix_dataset = vcf_dataset

    # Step 2: create the index if none is recorded.
    if tabix_dataset.filesystem_idx_location == '':
        # Record the index location before building the index.
        tabix_dataset.filesystem_idx_location = (
            tabix_dataset.filesystem_location + '.tbi')
        tabix_dataset.save()

        tabix_command = [
            TABIX_BINARY, '-f', '-p', 'vcf',
            tabix_dataset.get_absolute_location()]
        subprocess.check_call(tabix_command)

    # Make sure the index exists, whether created now or previously.
    expected_idx_location = tabix_dataset.filesystem_location + '.tbi'
    assert tabix_dataset.filesystem_idx_location == expected_idx_location, (
        'Tabix index file location is not correct.')
    assert os.path.exists(tabix_dataset.get_absolute_idx_location()), (
        'Tabix index file does not exist on filesystem.')

    return tabix_dataset
def add_vcf_track(reference_genome, alignment_group, vcf_dataset_type):
    """DEPRECATED. Use add_vcf_track_given_dataset().

    Looks up the alignment group's Dataset of type vcf_dataset_type and
    forwards it to add_vcf_track_given_dataset(), returning its result.
    """
    return add_vcf_track_given_dataset(
        reference_genome,
        alignment_group,
        get_dataset_with_type(alignment_group, vcf_dataset_type))
def prepare_ref_genome_related_datasets(ref_genome, dataset):
    """Prepares data related to a ReferenceGenome.

    For example, if only Genbank exists, creates a Fasta Dataset. Finishes
    by making sure a bwa index exists for the genome's fasta.

    Args:
        ref_genome: ReferenceGenome.
        dataset: A dataset pointing to a genome.

    Raises:
        AssertionError if dataset status is NOT_STARTED.
    """
    assert dataset.status != Dataset.STATUS.NOT_STARTED

    def _add_feature_index():
        """Pickle an interval index of the Genbank features and attach it.

        The index lets contigs and snps be located within genes without
        using snpEFF.
        """
        index_path = os.path.join(
            ref_genome.get_snpeff_genbank_parent_dir(),
            'gbk_feature_idx.pickle')
        generate_gbk_feature_index(
            ref_genome.get_snpeff_genbank_file_path(), index_path)

        index_dataset = Dataset.objects.create(
            label=Dataset.TYPE.FEATURE_INDEX,
            type=Dataset.TYPE.FEATURE_INDEX)
        index_dataset.filesystem_location = index_path
        index_dataset.save()
        ref_genome.dataset_set.add(index_dataset)

    source_type = dataset.type
    if source_type == Dataset.TYPE.REFERENCE_GENOME_FASTA:
        # Run jbrowse ref genome processing.
        prepare_jbrowse_ref_sequence(ref_genome)
    elif source_type == Dataset.TYPE.REFERENCE_GENOME_GENBANK:
        # Run snpeff build after creating ReferenceGenome obj.
        build_snpeff(ref_genome)

        # Per the original author, these are NO-OPS if the respective
        # Datasets exist.
        generate_fasta_from_genbank(ref_genome)
        generate_gff_from_genbank(ref_genome)

        # Run jbrowse genbank genome processing for genes.
        add_genbank_file_track(ref_genome)

        _add_feature_index()

    # We create the bwa index once here, so that alignments running in
    # parallel don't step on each others' toes.
    fasta_dataset = get_dataset_with_type(
        ref_genome, Dataset.TYPE.REFERENCE_GENOME_FASTA)
    ensure_bwa_index(fasta_dataset.get_absolute_location())
def add_vcf_track(reference_genome, alignment_group, vcf_dataset_type):
    """DEPRECATED. Use add_vcf_track_given_dataset().

    Looks up the alignment group's Dataset of type vcf_dataset_type and
    forwards it to add_vcf_track_given_dataset(), returning its result.
    """
    return add_vcf_track_given_dataset(
        reference_genome,
        alignment_group,
        get_dataset_with_type(alignment_group, vcf_dataset_type))
def compute_callable_loci(reference_genome, sample_alignment,
        bam_file_location, stderr=None):
    """Compute the callable-loci BED for a bam and register it.

    Runs get_callable_loci() on the bam, stores the output as a
    BED_CALLABLE_LOCI Dataset on sample_alignment, rewrites the BED so that
    feature names contain no spaces, and adds it as a jbrowse track.
    Failures are reported as warnings on stderr rather than raised.

    Args:
        reference_genome: Genome the jbrowse track is added to.
        sample_alignment: Object that receives the new Dataset.
        bam_file_location: Path of the bam file to analyse.
        stderr: File-like object warnings are printed to.

    Returns:
        Absolute path of the BED file, or None if the computation failed
        before the Dataset was created.
    """
    # Set output fn to None in case try fails.
    callable_loci_bed_fn = None

    try:
        # NOTE(review): the fasta location is not used below. The lookup is
        # kept because it fails (and so takes the warning path) when the
        # genome has no fasta Dataset.
        get_dataset_with_type(
            reference_genome,
            Dataset.TYPE.REFERENCE_GENOME_FASTA).get_absolute_location()

        output = _get_callable_loci_output_filename(bam_file_location)

        get_callable_loci(bam_file_location, output)

        # Add callable loci bed as dataset
        callable_loci_bed = Dataset.objects.create(
            label=Dataset.TYPE.BED_CALLABLE_LOCI,
            type=Dataset.TYPE.BED_CALLABLE_LOCI,
            filesystem_location=clean_filesystem_location(output))

        sample_alignment.dataset_set.add(callable_loci_bed)
        sample_alignment.save()

        callable_loci_bed_fn = callable_loci_bed.get_absolute_location()

        # Read the whole file up front (previously done by spawning `cat`),
        # because it is rewritten in place right below.
        with open(callable_loci_bed_fn) as raw_bed_fh:
            raw_bed_lines = raw_bed_fh.read().split('\n')

        with open(callable_loci_bed_fn, 'w') as callable_loci_bed_fh:
            for i, line in enumerate(raw_bed_lines):
                try:
                    fields = line.split()
                    if not fields:
                        continue
                    chrom, start, end, feature = fields

                    feature = titlecase_spaces(feature)
                    # Bed feature can't have spaces =(
                    feature = feature.replace(' ', '_')

                    print >> callable_loci_bed_fh, '\t'.join(
                        [chrom, start, end, feature])
                except Exception as e:
                    # A malformed line is skipped with a warning.
                    print >> stderr, (
                        'WARNING: Callable Loci line' +
                        '%d: (%s) couldn\'t be parsed: %s') % (
                            i, line, str(e))

        # add it as a jbrowse track
        add_bed_file_track(reference_genome, sample_alignment,
                callable_loci_bed)

    except Exception as e:
        # Deliberately best-effort: report and carry on.
        print >> stderr, 'WARNING: Callable Loci failed.'
        print >> stderr, str(e)

    return callable_loci_bed_fn
def _compute():
    """Compute the insert metrics, then retry the lookup one level deeper.

    Reads sample_alignment, stderr and _iteration from the enclosing scope
    and returns whatever the recursive call to
    get_insert_size_mean_and_stdev() returns.
    """
    bam_dataset = get_dataset_with_type(
        sample_alignment, Dataset.TYPE.BWA_ALIGN)
    compute_insert_metrics(
        bam_dataset.get_absolute_location(), sample_alignment, stderr=stderr)
    next_iteration = _iteration + 1
    return get_insert_size_mean_and_stdev(
        sample_alignment, stderr, _iteration=next_iteration)
def _compute():
    """Compute the insert metrics, then retry the lookup one level deeper.

    Reads sample_alignment, stderr and _iteration from the enclosing scope
    and returns whatever the recursive call to
    get_insert_size_mean_and_stdev() returns.
    """
    bam_dataset = get_dataset_with_type(
        sample_alignment, Dataset.TYPE.BWA_ALIGN)
    compute_insert_metrics(
        bam_dataset.get_absolute_location(), sample_alignment, stderr=stderr)
    next_iteration = _iteration + 1
    return get_insert_size_mean_and_stdev(
        sample_alignment, stderr, _iteration=next_iteration)
def run_pindel(fasta_ref, sample_alignments, vcf_output_dir,
        vcf_output_filename, alignment_type, **kwargs):
    """Run pindel to find SVs.

    Writes a pindel config file listing each sample's bam, mean insert size
    and uid, runs pindel, converts its output with pindel2vcf, and
    postprocesses the vcf.

    Args:
        fasta_ref: Path of the reference fasta.
        sample_alignments: Sample alignments to call variants on.
        vcf_output_dir: Directory the pindel config file is written to.
        vcf_output_filename: Output vcf path; its last four characters
            (the '.vcf' extension) are stripped to form pindel's output
            prefix.
        alignment_type: Dataset type of the bam alignment to use.
        **kwargs: Accepted but not used.

    Returns:
        True on success.

    Raises:
        Exception: If pindel is not installed, or if every sample alignment
            was skipped for lacking insert size metrics.
        subprocess.CalledProcessError: If pindel or pindel2vcf fails.
    """
    pindel_dir = '%s/pindel' % settings.TOOLS_DIR
    if not os.path.isdir(pindel_dir):
        raise Exception('Pindel is not installed. Aborting.')

    bam_files = [
        get_dataset_with_type(sa, alignment_type).get_absolute_location()
        for sa in sample_alignments]
    samples = [sa.experiment_sample for sa in sample_alignments]
    insert_sizes = [
        get_insert_size_mean_and_stdev(sa) for sa in sample_alignments]

    assert len(bam_files) == len(insert_sizes)

    # Create pindel config file
    pindel_config = os.path.join(vcf_output_dir, 'pindel_config.txt')
    at_least_one_config_line_written = False
    with open(pindel_config, 'w') as fh:
        for bam_file, sample, insert_size in zip(
                bam_files, samples, insert_sizes):
            # Skip bad alignments: a mean of -1 marks metrics that could
            # not be computed.
            mean, _stdev = insert_size
            if mean == -1:
                continue
            fh.write('%s %s %s\n' % (bam_file, mean, sample.uid))
            at_least_one_config_line_written = True

    if not at_least_one_config_line_written:
        # Previously a bare `raise Exception` followed by an unreachable
        # `return False`; the exception type is unchanged.
        raise Exception(
            'No sample alignment has valid insert size metrics; '
            'pindel cannot proceed.')

    # Build the full pindel command.
    pindel_root = vcf_output_filename[:-4]  # get rid of .vcf extension
    subprocess.check_call([
        '%s/pindel' % pindel_dir,
        '-f', fasta_ref,
        '-i', pindel_config,
        '-c', 'ALL',
        '-o', pindel_root
    ])

    # convert all different structural variant types to vcf
    subprocess.check_call([
        '%s/pindel2vcf' % pindel_dir,
        '-P', pindel_root,
        '-r', fasta_ref,
        '-R', 'name',
        '-d', 'date',
        '-mc', '1',  # just need one read to show 1/1 in vcf
    ])

    postprocess_pindel_vcf(vcf_output_filename)

    return True  # success
def run_freebayes(fasta_ref, sample_alignments, vcf_output_dir,
        vcf_output_filename, alignment_type, region=None, **kwargs):
    """Run freebayes on the bam files of all given sample alignments.

    The bam of each sample alignment is looked up by alignment_type and all
    bams are passed to a single freebayes call whose stdout is written to
    vcf_output_filename.

    Args:
        fasta_ref: Path of the reference fasta.
        sample_alignments: Sample alignments to call variants on; the
            ploidy is taken from the first one's alignment group.
        vcf_output_dir: Not used by this function.
        vcf_output_filename: Path the vcf output is written to.
        alignment_type: Dataset type of the bam alignment to use.
        region: Optional value for freebayes' --region argument.
        **kwargs: Accepted but not used.

    Returns:
        True once freebayes has finished successfully.

    Raises:
        subprocess.CalledProcessError: If freebayes exits non-zero.
    """
    bam_files = [
        get_dataset_with_type(sa, alignment_type).get_absolute_location()
        for sa in sample_alignments]

    # One '--bam <file>' pair per alignment.
    bam_part = [
        token for bam_file in bam_files for token in ('--bam', bam_file)]

    # Determine alignment ploidy (haploid or diploid).
    alignment_group = sample_alignments[0].alignment_group
    call_as_haploid = alignment_group.alignment_options['call_as_haploid']
    alignment_ploidy = 1 if call_as_haploid else 2

    # '--binomial-obs-priors-off' was left disabled by the original author.
    other_args_part = [
        '--fasta-reference', fasta_ref,
        '--pvar', '0.001',
        '--ploidy', str(alignment_ploidy),
        '--min-alternate-fraction', '.3',
        '--hwe-priors-off',
        '--use-mapping-quality',
        '--min-base-quality', '25',
        '--min-mapping-quality', '30'
    ]
    if region:
        other_args_part += ['--region', region]

    # Build the full command and execute it for all bam files at once.
    freebayes_binary = '%s/freebayes/freebayes' % settings.TOOLS_DIR
    full_command = [freebayes_binary] + bam_part + other_args_part

    with open(vcf_output_filename, 'w') as vcf_fh:
        subprocess.check_call(full_command, stdout=vcf_fh)

    return True  # success
def test_generate_genbank_mobile_element_multifasta(self):
    """Test generation of the mobile element fasta.

    Imports a Genbank reference genome, asks it to ensure the mobile
    element multifasta, and asserts the resulting Dataset's file exists.
    """
    self.reference_genome = import_reference_genome_from_local_file(
        self.project, 'ref_genome', TEST_GENBANK, 'genbank')
    self.reference_genome.ensure_mobile_element_multifasta()

    mobile_element_dataset = get_dataset_with_type(
        self.reference_genome, Dataset.TYPE.MOBILE_ELEMENT_FASTA)
    mobile_element_fasta = mobile_element_dataset.get_absolute_location()
    assert os.path.exists(mobile_element_fasta)
def run_lumpy(fasta_ref, sample_alignments, vcf_output_dir, vcf_output_filename, alignment_type, **kwargs): """Runs lumpy. """ print 'RUNNING LUMPY...' # NOTE: Only supporting single sample alignment for now. Previously we # tried to use lumpy for multiple sample alignments but the machine would # run out of memory so we are going to limit functionality to single # alignment only for now. assert len(sample_alignments) == 1 # Get relevant files. Note this is written to handle more than 1 sample # although right now we are not running lumpy on more than one sample at # a time as enforced by the assert above. bam_file_list = [] bam_disc_file_list = [] bam_sr_file_list = [] for sa in sample_alignments: bam_dataset = get_dataset_with_type(sa, Dataset.TYPE.BWA_ALIGN) bam_file_list.append(bam_dataset.get_absolute_location()) # Get or create discordant reads. bam_disc_dataset = get_discordant_read_pairs(sa) bam_disc_file_list.append(bam_disc_dataset.get_absolute_location()) # Get or create split reads. bam_sr_dataset = get_split_reads(sa) bam_sr_file_list.append(bam_sr_dataset.get_absolute_location()) lumpy_cmd = [ settings.LUMPY_EXPRESS_BINARY, '-B', ','.join(bam_file_list), '-S', ','.join(bam_sr_file_list), '-D', ','.join(bam_disc_file_list), '-o', vcf_output_filename, '-P' # get probability distributions, required for merge ] print ' '.join(lumpy_cmd) # Run Lumpy Express. lumpy_error_output = vcf_output_filename + '.error' with open(lumpy_error_output, 'w') as error_output_fh: subprocess.check_call(lumpy_cmd, stderr=error_output_fh) return True # success
def get_insert_size_mean_and_stdev(sample_alignment, stderr=None,
                                   _iteration=0):
    """Returns a tuple (mean, stdev) for insert sizes from the alignment.

    Calls the compute function if the metrics don't exist or the stored
    metrics file is malformed.

    If the insert size can't be calculated, perhaps because of a bad
    alignment, returns (-1, -1).

    Args:
        sample_alignment: ExperimentSampleToAlignment we want metrics for.
        stderr: Passed through to compute_insert_metrics().
        _iteration: Used internally to avoid getting stuck in case where
            computation repeatedly fails.

    Returns:
        Tuple of ints (mean, stdev).
    """
    # Prevent getting stuck in case computation keeps failing.
    if _iteration >= 3:
        return (-1, -1)

    def _compute():
        """Calls compute function then recursively calls
        get_insert_size_mean_and_stdev().
        """
        bam_file = get_dataset_with_type(
            sample_alignment,
            Dataset.TYPE.BWA_ALIGN).get_absolute_location()
        compute_insert_metrics(bam_file, sample_alignment, stderr=stderr)
        return get_insert_size_mean_and_stdev(
            sample_alignment, stderr, _iteration=_iteration + 1)

    mean_stdev_dataset = get_dataset_with_type(
        sample_alignment, Dataset.TYPE.LUMPY_INSERT_METRICS_MEAN_STDEV)
    if not mean_stdev_dataset:
        return _compute()

    file_location = mean_stdev_dataset.get_absolute_location()
    if not os.path.exists(file_location):
        return _compute()

    with open(file_location) as fh:
        parts = fh.read().strip().split(',')

    if len(parts) != 2:
        return _compute()

    try:
        return tuple([int(p) for p in parts])
    except ValueError:
        # Two fields that are not integers (e.g. a truncated write) are
        # treated like any other malformed file: recompute, bounded by
        # the retry cap above.
        return _compute()
def flag_variants_from_bed(alignment_group, bed_dataset_type):
    """Adds variants to sets using each sample alignment's callable loci BED.

    Args:
        alignment_group: AlignmentGroup whose sample alignments are visited.
        bed_dataset_type: Currently unused; the BED_CALLABLE_LOCI Dataset is
            always the one looked up.
    """
    # TODO: Make this extensible to other BED files we might have
    for sample_alignment in (
            alignment_group.experimentsampletoalignment_set.all()):
        bed_dataset = get_dataset_with_type(
            entity=sample_alignment,
            type=Dataset.TYPE.BED_CALLABLE_LOCI)

        # Only sample alignments that have a callable_loci bed are handled.
        if bed_dataset:
            add_variants_to_set_from_bed(
                sample_alignment=sample_alignment,
                bed_dataset=bed_dataset)
def run_lumpy(
        fasta_ref, sample_alignments, vcf_output_dir,
        vcf_output_filename, alignment_type, **kwargs):
    """Runs lumpy express for exactly one sample alignment.

    Only sample_alignments and vcf_output_filename are read; the remaining
    arguments are accepted but not used by this function.

    Args:
        sample_alignments: Sequence holding exactly one sample alignment.
        vcf_output_filename: Output path given to lumpy (-o). Lumpy's stderr
            goes to the same path with '.error' appended.

    Returns:
        True on success.

    Raises:
        AssertionError: If the number of sample alignments is not 1.
        subprocess.CalledProcessError: If lumpy exits with non-zero status.
    """
    print('RUNNING LUMPY...')

    # NOTE: Multi-sample runs used to run the machine out of memory, so
    # functionality is limited to a single alignment for now.
    assert len(sample_alignments) == 1

    # Gather the three bam inputs per sample. Written for >1 sample even
    # though the assert above enforces a single one today.
    bam_paths = []
    disc_paths = []
    sr_paths = []
    for alignment in sample_alignments:
        bam_paths.append(
            get_dataset_with_type(
                alignment, Dataset.TYPE.BWA_ALIGN).get_absolute_location())

        # Get or create discordant reads.
        disc_paths.append(
            get_discordant_read_pairs(alignment).get_absolute_location())

        # Get or create split reads.
        sr_paths.append(
            get_split_reads(alignment).get_absolute_location())

    bam_arg = ','.join(bam_paths)
    sr_arg = ','.join(sr_paths)
    disc_arg = ','.join(disc_paths)

    lumpy_cmd = [
        settings.LUMPY_EXPRESS_BINARY,
        '-B', bam_arg,
        '-S', sr_arg,
        '-D', disc_arg,
        '-o', vcf_output_filename,
        '-P',  # get probability distributions, required for merge
    ]

    print(' '.join(lumpy_cmd))

    # Run Lumpy Express.
    with open(vcf_output_filename + '.error', 'w') as stderr_fh:
        subprocess.check_call(lumpy_cmd, stderr=stderr_fh)

    return True  # success
def main():
    """Copies every sample alignment's insert-size histogram to OUTPUT_DIR.

    Each copy is named after the directory that contains the original
    histogram file, with '.txt' appended. Sample alignments that have no
    LUMPY_INSERT_METRICS_HISTOGRAM Dataset are skipped instead of aborting
    the whole export.
    """
    if not os.path.exists(OUTPUT_DIR):
        os.mkdir(OUTPUT_DIR)

    for sa in ExperimentSampleToAlignment.objects.all():
        histo_dataset = get_dataset_with_type(
            sa, Dataset.TYPE.LUMPY_INSERT_METRICS_HISTOGRAM)
        if not histo_dataset:
            # No histogram was recorded for this alignment; nothing to copy.
            continue
        histo_dataset_full_path = histo_dataset.get_absolute_location()

        # Name the copy after the histogram's parent directory.
        parent_dir = os.path.split(histo_dataset_full_path)[0]
        histo_dataset_name = os.path.split(parent_dir)[1] + '.txt'

        # Copy.
        new_full_path = os.path.join(OUTPUT_DIR, histo_dataset_name)
        shutil.copyfile(histo_dataset_full_path, new_full_path)
def prepare_jbrowse_ref_sequence(reference_genome, **kwargs):
    """Prepare the reference sequence and place it in the ref_genome dir.

    This implicitly creates the config directory structure for this
    reference genome. Tracks added in the future are added relative to this
    reference genome.

    The implementation is a light wrapper around
    jbrowse/bin/prepare-refseqs.pl, followed by an adjustment of the
    resulting DNA track's display options.

    Args:
        reference_genome: ReferenceGenome with a FASTA Dataset.
        **kwargs: Ignored.
    """
    prepare_refseqs_script = os.path.join(
        JBROWSE_BIN_PATH, 'prepare-refseqs.pl')

    # Locate the reference FASTA. No fallback is attempted here: a missing
    # FASTA Dataset makes this lookup fail.
    fasta_dataset = get_dataset_with_type(
        reference_genome, type=Dataset.TYPE.REFERENCE_GENOME_FASTA)
    fasta_path = fasta_dataset.get_absolute_location()

    # Next, ensure that the jbrowse directory exists.
    reference_genome.ensure_jbrowse_dir()
    dna_track_dir = os.path.join(
        reference_genome.get_jbrowse_directory_path(),
        'indiv_tracks',
        'DNA')

    # Now run prepare-refseqs.pl to get the ReferenceGenome in.
    prepare_cmd = [
        prepare_refseqs_script,
        '--fasta', fasta_path,
        '--out', dna_track_dir,
    ]
    subprocess.check_call(prepare_cmd)

    tracklist = get_tracklist_json(reference_genome, 'DNA')

    # DNA track should be the first track.
    dna_track = tracklist['tracks'][0]
    assert dna_track['type'] == 'SequenceTrack'

    # Get rid of translation and reverse strand.
    display_options = {
        "showForwardStrand": True,
        "showReverseStrand": False,
        "showTranslation": False
    }
    dna_track.update(display_options)

    write_tracklist_json(reference_genome, tracklist, 'DNA')
def get_insert_size_mean_and_stdev(
        sample_alignment, stderr=None, _iteration=0):
    """Returns a tuple (mean, stdev) for insert sizes from the alignment.

    Calls the compute function if the metrics don't exist or if the stored
    metrics file cannot be parsed.

    If the insert size can't be calculated, perhaps because of a bad
    alignment, returns (-1, -1).

    Args:
        sample_alignment: ExperimentSampleToAlignment we want metrics for.
        stderr: Passed through to compute_insert_metrics().
        _iteration: Used internally to avoid getting stuck in case where
            computation repeatedly fails.

    Returns:
        Tuple of ints (mean, stdev).
    """
    # Prevent getting stuck in case computation keeps failing.
    if _iteration >= 3:
        return (-1, -1)

    def _compute():
        """Calls compute function then recursively calls
        get_insert_size_mean_and_stdev().
        """
        bam_file = get_dataset_with_type(
            sample_alignment,
            Dataset.TYPE.BWA_ALIGN).get_absolute_location()
        compute_insert_metrics(bam_file, sample_alignment, stderr=stderr)
        return get_insert_size_mean_and_stdev(
            sample_alignment, stderr, _iteration=_iteration + 1)

    mean_stdev_dataset = get_dataset_with_type(
        sample_alignment, Dataset.TYPE.LUMPY_INSERT_METRICS_MEAN_STDEV)
    if not mean_stdev_dataset:
        return _compute()

    file_location = mean_stdev_dataset.get_absolute_location()
    if not os.path.exists(file_location):
        return _compute()

    with open(file_location) as fh:
        combined_str = fh.read().strip()

    parts = combined_str.split(',')
    if len(parts) == 2:
        try:
            return tuple([int(p) for p in parts])
        except ValueError:
            # Non-integer fields: fall through and recompute, just like a
            # file with the wrong number of fields.
            pass
    return _compute()
def flag_variants_from_bed(alignment_group, bed_dataset_type):
    """Flags variants for every sample alignment with a callable loci BED.

    Args:
        alignment_group: AlignmentGroup whose sample alignments are visited.
        bed_dataset_type: Currently unused; BED_CALLABLE_LOCI is always the
            Dataset type that is looked up.
    """
    all_sample_alignments = (
        alignment_group.experimentsampletoalignment_set.all())

    for current_alignment in all_sample_alignments:
        # TODO: Make this extensible to other BED files we might have
        loci_bed_dataset = get_dataset_with_type(
            entity=current_alignment,
            type=Dataset.TYPE.BED_CALLABLE_LOCI)

        # If there is no callable_loci bed, skip the sample alignment.
        if not loci_bed_dataset:
            continue

        add_variants_to_set_from_bed(
            sample_alignment=current_alignment,
            bed_dataset=loci_bed_dataset)
def prepare_jbrowse_ref_sequence(reference_genome, **kwargs):
    """Prepare the reference sequence and place it in the ref_genome dir.

    This implicitly creates the config directory structure for this
    reference genome. Tracks added in the future are added relative to this
    reference genome.

    The implementation of this method is a light wrapper around
    jbrowse/bin/prepare-refseqs.pl. Afterwards the DNA track is configured
    to show only the forward strand, without translation.

    Args:
        reference_genome: ReferenceGenome with a FASTA Dataset.
        **kwargs: Ignored.
    """
    script_path = os.path.join(JBROWSE_BIN_PATH, 'prepare-refseqs.pl')

    # Resolve the reference FASTA path. This fails if the FASTA Dataset
    # is missing; no conversion from genbank is attempted here.
    reference_fasta_path = get_dataset_with_type(
        reference_genome,
        type=Dataset.TYPE.REFERENCE_GENOME_FASTA).get_absolute_location()

    # Next, ensure that the jbrowse directory exists.
    reference_genome.ensure_jbrowse_dir()

    jbrowse_root = reference_genome.get_jbrowse_directory_path()
    output_dir = os.path.join(jbrowse_root, 'indiv_tracks', 'DNA')

    # Now run prepare-refseqs.pl to get the ReferenceGenome in.
    subprocess.check_call(
        [script_path, '--fasta', reference_fasta_path, '--out', output_dir])

    tracks_json = get_tracklist_json(reference_genome, 'DNA')

    # DNA track should be the first track.
    sequence_track = tracks_json['tracks'][0]
    assert sequence_track['type'] == 'SequenceTrack'

    # Get rid of translation and reverse strand.
    sequence_track.update({
        "showForwardStrand": True,
        "showReverseStrand": False,
        "showTranslation": False
    })

    write_tracklist_json(reference_genome, tracks_json, 'DNA')
def derivation_fn(sample_alignment, unmapped_reads_dataset):
    """Extracts the unmapped reads of a sample alignment into their own bam.

    The output is written next to the original bam, with the extension
    replaced by '.unmapped.bam', and unmapped_reads_dataset is updated to
    point at that file.

    Args:
        sample_alignment: Entity whose BWA_ALIGN Dataset is the input bam.
        unmapped_reads_dataset: Dataset whose filesystem_location is set
            (and saved) to the new bam's location.

    Raises:
        subprocess.CalledProcessError: If samtools exits with non-zero
            status.
    """
    # Get the original bam file.
    bam_dataset = get_dataset_with_type(
        sample_alignment, Dataset.TYPE.BWA_ALIGN)
    bam_filename = bam_dataset.get_absolute_location()

    # Allocate a filename for the unmapped reads.
    unmapped_reads_bam_file = (
        os.path.splitext(bam_filename)[0] + '.unmapped.bam')
    unmapped_reads_dataset.filesystem_location = clean_filesystem_location(
        unmapped_reads_bam_file)
    unmapped_reads_dataset.save(update_fields=['filesystem_location'])

    # Pass an argument list instead of a shell string so that paths with
    # spaces or shell metacharacters are handed to samtools verbatim.
    cmd = [
        settings.SAMTOOLS_BINARY,
        'view', '-h', '-b', '-f', '0x4',
        bam_filename,
    ]
    # bam output is binary data.
    with open(unmapped_reads_bam_file, 'wb') as output_fh:
        subprocess.check_call(cmd, stdout=output_fh)
def get_vcf_files(alignment_group):
    """Gets vcf files related to the AlignmentGroup.

    Only the dataset types of the variant callers enabled in settings are
    considered; types for which the AlignmentGroup has no Dataset are left
    out of the result.

    Returns:
        Dict mapping from vcf type to file location.

    Raises:
        AssertionError: If a vcf Dataset exists but its file does not.
    """
    enabled_vcf_types = [
        VARIANT_TOOL_PARAMS_MAP[caller]['dataset_type']
        for caller in settings.ENABLED_VARIANT_CALLERS
    ]

    type_to_location = {}
    for dataset_type in enabled_vcf_types:
        dataset = get_dataset_with_type(alignment_group, dataset_type)
        if dataset is not None:
            location = dataset.get_absolute_location()
            assert os.path.exists(location)
            type_to_location[dataset_type] = location
    return type_to_location
def main():
    """Exports the insert-size histogram of every sample alignment.

    Each histogram file is copied into OUTPUT_DIR under the name of the
    directory that contains the original file, plus '.txt'. Sample
    alignments without a LUMPY_INSERT_METRICS_HISTOGRAM Dataset are skipped
    rather than aborting the export.
    """
    if not os.path.exists(OUTPUT_DIR):
        os.mkdir(OUTPUT_DIR)

    for sa in ExperimentSampleToAlignment.objects.all():
        histo_dataset = get_dataset_with_type(
            sa, Dataset.TYPE.LUMPY_INSERT_METRICS_HISTOGRAM)
        if not histo_dataset:
            # This alignment has no histogram Dataset; nothing to copy.
            continue
        histo_dataset_full_path = histo_dataset.get_absolute_location()

        # Update Dataset name: use the name of the containing directory.
        containing_dir = os.path.dirname(histo_dataset_full_path)
        histo_dataset_name = os.path.basename(containing_dir) + '.txt'

        # Copy.
        new_full_path = os.path.join(OUTPUT_DIR, histo_dataset_name)
        shutil.copyfile(histo_dataset_full_path, new_full_path)
def get_vcf_files(alignment_group):
    """Gets vcf files related to the AlignmentGroup.

    Looks up one Dataset per enabled variant caller and keeps those that
    exist.

    Returns:
        Dict mapping from vcf type to file location.

    Raises:
        AssertionError: If a vcf Dataset exists but its file does not.
    """
    vcf_types = []
    for tool in settings.ENABLED_VARIANT_CALLERS:
        vcf_types.append(VARIANT_TOOL_PARAMS_MAP[tool]['dataset_type'])

    result = {}
    for vcf_type in vcf_types:
        vcf_dataset = get_dataset_with_type(alignment_group, vcf_type)
        if vcf_dataset is None:
            # This caller produced no Dataset for the group.
            continue
        vcf_path = vcf_dataset.get_absolute_location()
        assert os.path.exists(vcf_path)
        result[vcf_type] = vcf_path
    return result
def compute_callable_loci(reference_genome, sample_alignment,
                          bam_file_location, stderr=None):
    """Computes callable loci for a bam and registers the result.

    Runs the callable loci computation, stores the resulting BED as a
    BED_CALLABLE_LOCI Dataset on sample_alignment, cleans its features and
    adds it as a JBrowse track. This step is best-effort: any Exception is
    reported to stderr and results in an empty return value.

    Args:
        reference_genome: ReferenceGenome the alignment was made against.
        sample_alignment: Entity that receives the new BED Dataset.
        bam_file_location: Path of the bam to analyze.
        stderr: File-like object for warnings and for clean_bed_features().

    Returns:
        The value returned by clean_bed_features(), or '' on failure.
    """
    # Default result, used when the computation fails.
    clean_bed_fn = ''

    try:
        # The path is not needed below, but resolving it makes a missing
        # FASTA Dataset fail here, before any computation is started.
        get_dataset_with_type(
            reference_genome,
            Dataset.TYPE.REFERENCE_GENOME_FASTA).get_absolute_location()

        callable_loci_bed_fn = (
            _get_callable_loci_output_filename(bam_file_location))

        get_callable_loci(bam_file_location, callable_loci_bed_fn)

        # Add callable loci bed as dataset
        callable_loci_bed_dataset = Dataset.objects.create(
            label=Dataset.TYPE.BED_CALLABLE_LOCI,
            type=Dataset.TYPE.BED_CALLABLE_LOCI,
            filesystem_location=clean_filesystem_location(
                callable_loci_bed_fn))

        sample_alignment.dataset_set.add(callable_loci_bed_dataset)
        sample_alignment.save()

        clean_bed_fn = clean_bed_features(
            callable_loci_bed_dataset, stderr=stderr)

        # add it as a jbrowse track
        add_bed_file_track(
            reference_genome, sample_alignment, callable_loci_bed_dataset)

    except Exception as e:
        print >> stderr, 'WARNING: Callable Loci failed.'
        print >> stderr, str(e)
        clean_bed_fn = ''

    # Returning after the try block (rather than from a `finally`) lets
    # KeyboardInterrupt/SystemExit propagate instead of being swallowed.
    return clean_bed_fn
def test_dataset_strings(self):
    """Check the internal string generated for a reference genome Dataset.

    The expected value is the reference genome uid, an underscore, and the
    uppercase-underscore form of the Genbank dataset type.
    """
    user = User.objects.create_user(
        TEST_USERNAME, password=TEST_PASSWORD, email=TEST_EMAIL)

    self.test_project = Project.objects.create(
        title=TEST_PROJECT_NAME, owner=user.get_profile())

    self.test_ref_genome = import_reference_genome_from_local_file(
        self.test_project, TEST_REF_GENOME_NAME, TEST_REF_GENOME_PATH,
        'genbank')

    dataset = get_dataset_with_type(
        self.test_ref_genome, type=Dataset.TYPE.REFERENCE_GENOME_GENBANK)

    expected_internal_string = (
        str(self.test_ref_genome.uid) + '_' +
        uppercase_underscore(Dataset.TYPE.REFERENCE_GENOME_GENBANK))

    # assertEqual is the supported spelling; assertEquals is a deprecated
    # alias.
    self.assertEqual(
        dataset.internal_string(self.test_ref_genome),
        expected_internal_string)
def compute_callable_loci(
        reference_genome, sample_alignment, bam_file_location, stderr=None):
    """Computes callable loci for a bam file and records the resulting BED.

    The BED is attached to sample_alignment as a BED_CALLABLE_LOCI Dataset,
    cleaned, and added as a JBrowse track. Failures are not fatal: any
    Exception is written to stderr and '' is returned.

    Args:
        reference_genome: ReferenceGenome the alignment was made against.
        sample_alignment: Entity that receives the new BED Dataset.
        bam_file_location: Path of the bam to analyze.
        stderr: File-like object for warnings and for clean_bed_features().

    Returns:
        The value returned by clean_bed_features(), or '' on failure.
    """
    # Start from the failure value so the name is always bound.
    clean_bed_fn = ''

    try:
        # Resolved only as a sanity check; a missing FASTA Dataset raises
        # here and is reported like any other failure.
        get_dataset_with_type(
            reference_genome,
            Dataset.TYPE.REFERENCE_GENOME_FASTA).get_absolute_location()

        callable_loci_bed_fn = (
            _get_callable_loci_output_filename(bam_file_location))

        get_callable_loci(bam_file_location, callable_loci_bed_fn)

        # Add callable loci bed as dataset
        callable_loci_bed_dataset = Dataset.objects.create(
            label=Dataset.TYPE.BED_CALLABLE_LOCI,
            type=Dataset.TYPE.BED_CALLABLE_LOCI,
            filesystem_location=clean_filesystem_location(
                callable_loci_bed_fn))

        sample_alignment.dataset_set.add(callable_loci_bed_dataset)
        sample_alignment.save()

        clean_bed_fn = clean_bed_features(
            callable_loci_bed_dataset, stderr=stderr)

        # add it as a jbrowse track
        add_bed_file_track(
            reference_genome, sample_alignment, callable_loci_bed_dataset)

    except Exception as e:
        print >> stderr, 'WARNING: Callable Loci failed.'
        print >> stderr, str(e)
        clean_bed_fn = ''

    # A `return` inside `finally` would discard in-flight exceptions such
    # as KeyboardInterrupt, so the return lives outside the try statement.
    return clean_bed_fn
def _run_genome_finish_test(self, data_dict, mismatch_tolerance=0):
    """Assemble contigs, place them, and compare against a target fasta.

    Args:
        data_dict: Test data description; passed to _perform_assembly()
            and read for its 'target_fasta' entry.
        mismatch_tolerance: Maximum number of differing indexes allowed
            between the target fasta and the generated reference genome.
    """
    contigs = self._perform_assembly(data_dict)

    # Assert contigs were generated
    self.assertTrue(contigs.count() > 0)
    self.assertTrue(contigs[0].num_bases > 0)

    alignment_group = (
        contigs[0].experiment_sample_to_alignment.alignment_group)

    # Get set of de novo variants
    de_novo_set = create_de_novo_variants_set(
        alignment_group, 'de_novo_variants')

    contig_lengths = ', '.join([str(c.num_bases) for c in contigs])
    contigs_summary = (
        str(len(contigs)) + ' found with lengths:' + contig_lengths)

    self.assertTrue(
        de_novo_set.variants.exists(),
        'No placeable contigs found. ' + contigs_summary)

    # Make new reference genome
    new_ref_genome = generate_new_reference_genome(
        de_novo_set, {'label': 'new_ref'})

    # Verify insertion was placed correctly
    expected_fasta = data_dict['target_fasta']
    generated_fasta_dataset = get_dataset_with_type(
        new_ref_genome, Dataset.TYPE.REFERENCE_GENOME_FASTA)
    generated_fasta = generated_fasta_dataset.get_absolute_location()

    # Only the list of mismatching indexes is used below.
    _fastas_same, indexes = are_fastas_same(
        expected_fasta, generated_fasta)

    # Cap the number of indexes shown in the failure message.
    if len(indexes) < 50:
        indexes_str = str(indexes)
    else:
        indexes_str = str(indexes[:50]) + '...'

    self.assertTrue(
        len(indexes) <= mismatch_tolerance,
        'Fastas dissimilar at indexes:' + indexes_str + '\n' +
        contigs_summary)
def add_vcf_track(reference_genome, alignment_group, vcf_dataset_type):
    """Adds a vcf track to JBrowse for this vcf.

    See JBrowse Docs:
        http://gmod.org/wiki/JBrowse_Configuration_Guide#Example_VCF-based_Variant_Track_Configuration

    Args:
        reference_genome: ReferenceGenome whose track list is updated.
        alignment_group: AlignmentGroup that owns the vcf Dataset.
        vcf_dataset_type: Dataset type of the vcf to add.
    """
    def _s3_url(location):
        """Returns the S3 url for a Dataset location.

        Removes a leading '/jbrowse' prefix. The previous
        `location.strip("/jbrowse")` treated its argument as a set of
        characters and removed any of them from BOTH ends, which could eat
        leading or trailing letters of the real path.
        """
        prefix = '/jbrowse'
        if location.startswith(prefix):
            location = location[len(prefix):]
        # A leading '/' would make os.path.join discard the bucket url.
        return os.path.join(
            'http://%s.s3.amazonaws.com/' % S3_BUCKET,
            location.lstrip('/'))

    # Get the vcf file location from the Dataset of the alignment group
    # keyed by the dataset type, then convert it via _vcf_to_vcftabix().
    vcf_dataset = get_dataset_with_type(alignment_group, vcf_dataset_type)
    vcf_dataset = _vcf_to_vcftabix(vcf_dataset)

    if reference_genome.project.is_s3_backed():
        urlTemplate = _s3_url(vcf_dataset.filesystem_location)
        urlTemplate_idx = _s3_url(vcf_dataset.filesystem_idx_location)
    else:
        urlTemplate = os.path.join(
            JBROWSE_DATA_URL_ROOT, vcf_dataset.filesystem_location)
        urlTemplate_idx = os.path.join(
            JBROWSE_DATA_URL_ROOT, vcf_dataset.filesystem_idx_location)

    label = vcf_dataset.internal_string(alignment_group)
    key = "{:s} {:s} SNVs".format(vcf_dataset.type, alignment_group.label)

    # Build the JSON object.
    raw_dict_obj = {
        'tracks': [{
            "label": label,
            "key": key,
            "storeClass": "JBrowse/Store/SeqFeature/VCFTabix",
            "urlTemplate": urlTemplate,
            "tbiUrlTemplate": urlTemplate_idx,
            'category': 'VCF Tracks',
            "type": "JBrowse/View/Track/HTMLVariants"
        }]
    }

    write_tracklist_json(reference_genome, raw_dict_obj, label)
def fastqc_view(request, project_uid, sample_uid, read_num):
    """Serves the FastQC HTML report for one read file of a sample.

    Args:
        request: Request whose user must own the project.
        project_uid: uid of the Project.
        sample_uid: uid of the ExperimentSample within the project.
        read_num: 1 or 2 (anything accepted by int()), selecting the
            FASTQC1_HTML or FASTQC2_HTML Dataset.

    Returns:
        HttpResponse containing the report's HTML.

    Raises:
        Exception: If read_num is neither 1 nor 2.
    """
    project = get_object_or_404(
        Project, owner=request.user.get_profile(), uid=project_uid)

    sample = get_object_or_404(
        ExperimentSample, project=project, uid=sample_uid)

    read_number = int(read_num)
    if read_number == 1:
        dataset_type = Dataset.TYPE.FASTQC1_HTML
    elif read_number == 2:
        dataset_type = Dataset.TYPE.FASTQC2_HTML
    else:
        raise Exception('Read number must be 1 or 2')

    fastqc_dataset = get_dataset_with_type(sample, dataset_type)

    response = HttpResponse(mimetype="text/html")
    # Use a context manager so the report file is closed once it has been
    # copied into the response.
    with open(fastqc_dataset.get_absolute_location()) as fastqc_fh:
        for line in fastqc_fh:
            response.write(line)

    return response
def prepare_ref_genome_related_datasets(ref_genome, dataset):
    """Prepares data related to a ReferenceGenome.

    For example, if only Genbank exists, creates a Fasta Dataset.

    If related Datasets exists, this function is a no-op.

    Args:
        ref_genome: ReferenceGenome.
        dataset: A dataset pointing to a genome.

    Raises:
        AssertionError if dataset status is NOT_STARTED.
    """
    assert dataset.status != Dataset.STATUS.NOT_STARTED

    source_type = dataset.type

    if source_type == Dataset.TYPE.REFERENCE_GENOME_FASTA:
        # Run jbrowse ref genome processing
        prepare_jbrowse_ref_sequence(ref_genome)

    elif source_type == Dataset.TYPE.REFERENCE_GENOME_GENBANK:
        genbank_steps = (
            # Run snpeff build after creating ReferenceGenome obj.
            build_snpeff,
            # These functions are NO-OPS if the respective Datasets exist.
            generate_fasta_from_genbank,
            generate_gff_from_genbank,
            # Run jbrowse genbank genome processing for genes
            add_genbank_file_track,
        )
        for genbank_step in genbank_steps:
            genbank_step(ref_genome)

    # We create the bwa index once here, so that alignments running in
    # parallel don't step on each others' toes.
    fasta_dataset = get_dataset_with_type(
        ref_genome, Dataset.TYPE.REFERENCE_GENOME_FASTA)
    ensure_bwa_index(fasta_dataset.get_absolute_location())
def generate_gff_from_genbank(ref_genome):
    """If this reference genome has a genbank but not a GFF, generate
    a GFF from the genbank.

    The GFF is written next to the genbank file (same name, '.gff'
    extension) and then registered through copy_and_add_dataset_source().

    Args:
        ref_genome: ReferenceGenome with a Genbank Dataset.

    Raises:
        AssertionError: If the reference genome has no Genbank Dataset.
    """
    # If a GFF already exists, then just return.
    if ref_genome.dataset_set.filter(
            type=Dataset.TYPE.REFERENCE_GENOME_GFF).exists():
        return

    # Check that a genbank exists.
    assert ref_genome.dataset_set.filter(
        type=Dataset.TYPE.REFERENCE_GENOME_GENBANK).exists()

    # Get genbank path and filename components (for creating GFF file name).
    genbank_path = get_dataset_with_type(
        ref_genome,
        type=Dataset.TYPE.REFERENCE_GENOME_GENBANK).get_absolute_location()

    genbank_dir, genbank_filename = os.path.split(genbank_path)
    genbank_noext = os.path.splitext(genbank_filename)[0]

    # Put the GFF file in the same dir, just change the extension to .gff.
    gff_filename = os.path.join(genbank_dir, (genbank_noext + '.gff'))

    # Get the individual records, each corresponding to a chromosome.
    genome_records = list(SeqIO.parse(genbank_path, 'genbank'))

    # Make each record's name match its id before converting to GFF, so
    # that both attributes agree in the output.
    for genome_record in genome_records:
        genome_record.name = genome_record.id

    # Close the handle explicitly so the GFF is fully flushed to disk
    # before it is copied below.
    with open(gff_filename, 'w') as gff_fh:
        GFF.write(genome_records, gff_fh)

    dataset_type = IMPORT_FORMAT_TO_DATASET_TYPE['gff']
    copy_and_add_dataset_source(
        ref_genome, dataset_type, dataset_type, gff_filename)
def _find_valid_sample_alignments(alignment_group, alignment_type):
    """Returns a list of sample alignment objects for an alignment group,
    skipping those that failed.

    A sample alignment counts as successful when it has a Dataset of
    alignment_type whose status is READY.

    Args:
        alignment_group: AlignmentGroup whose sample alignments are checked.
        alignment_type: Dataset type of the bam to look up.

    Returns:
        List of successful sample alignments.

    Raises:
        Exception: If no sample alignment succeeded.
        AssertionError: If a successful alignment's bam has no location or
            is empty.
    """
    sample_alignment_list = (
        alignment_group.experimentsampletoalignment_set.all())

    # Filter out mis-aligned files.
    # TODO: Should we show in the UI that some alignments failed and are
    # being skipped?
    def _is_successful_alignment(sample_alignment):
        bam_dataset = get_dataset_with_type(sample_alignment, alignment_type)
        # A missing Dataset is treated like a failed alignment.
        return (bam_dataset is not None and
                bam_dataset.status == Dataset.STATUS.READY)

    sample_alignment_list = [
        sample_alignment for sample_alignment in sample_alignment_list
        if _is_successful_alignment(sample_alignment)
    ]

    if not sample_alignment_list:
        raise Exception('No successful alignments, Freebayes cannot proceed.')

    bam_files = [
        get_dataset_with_type(sa, alignment_type).get_absolute_location()
        for sa in sample_alignment_list
    ]

    # Keep only valid bam_files: those with a location and non-zero size.
    valid_bam_files = [
        bam_file for bam_file in bam_files
        if bam_file is not None and os.stat(bam_file).st_size > 0
    ]

    # Report the number of VALID files; bam_files always has the same
    # length as sample_alignment_list, so using it hid the real count.
    assert len(valid_bam_files) == len(sample_alignment_list), (
        "Expected %d bam files, but found %d" % (
            len(sample_alignment_list), len(valid_bam_files)))

    return sample_alignment_list
def add_contig_reads_bam_track(contig, alignment_type):
    """Update the JBrowse track config file, trackList.json, for the
    contig's parent ReferenceGenome with a bam track and a SNP coverage
    track for the given contig and alignment_type.

    Args:
        contig: Entity that owns the bam Dataset; its
            parent_reference_genome receives the tracks.
        alignment_type: Dataset type of the bam to add.
    """
    # Get the bam file location from the Dataset of the contig
    # keyed by the alignment_type.
    bam_dataset = get_dataset_with_type(contig, alignment_type)

    # Figure out the url that JBrowse would use to show the data, e.g.:
    #     /jbrowse/gd_data/projects/58a62c7d/genomes/8dc829ec/align.bam

    # NOTE: We should construct bam file urls using project.get_client
    # jbrowse_link() rather than checking S3 flag here.
    reference_genome = contig.parent_reference_genome
    if reference_genome.project.is_s3_backed():
        # Remove a leading '/jbrowse' prefix. str.strip("/jbrowse") would
        # instead remove any of those CHARACTERS from both ends of the
        # path, which can eat letters of the real location.
        s3_key = bam_dataset.filesystem_location
        if s3_key.startswith('/jbrowse'):
            s3_key = s3_key[len('/jbrowse'):]
        # A leading '/' would make os.path.join discard the bucket url.
        urlTemplate = os.path.join(
            'http://%s.s3.amazonaws.com/' % S3_BUCKET,
            s3_key.lstrip('/'))
    else:
        urlTemplate = os.path.join(
            JBROWSE_DATA_URL_ROOT, bam_dataset.filesystem_location)

    label = bam_dataset.internal_string(contig)
    key = bam_dataset.external_string(contig)

    # Build the JSON object.
    raw_dict_obj = {
        'tracks': [{
            'storeClass': 'JBrowse/Store/SeqFeature/BAM',
            'urlTemplate': urlTemplate,
            'label': label,
            'type': 'JBrowse/View/Track/Alignments2',
            'chunkSizeLimit': 10000000,  # double the default chunk size
            'key': key,
            'category': 'Contig BAM Tracks',
            'style': {
                'className': 'alignment',
                'arrowheadClass': 'arrowhead',
                'labelScale': 100
            }
        }]
    }
    write_tracklist_json(reference_genome, raw_dict_obj, label)

    # Also add a snp coverage track.
    snp_coverage_label = bam_dataset.internal_string(contig) + '_COVERAGE'
    snp_coverage_key = key + ' Coverage'
    coverage_raw_dict_obj = {
        'tracks': [{
            'storeClass': 'JBrowse/Store/SeqFeature/BAM',
            'urlTemplate': urlTemplate,
            'label': snp_coverage_label,
            'type': 'JBrowse/View/Track/SNPCoverage',
            'category': 'Contig Coverage Tracks',
            'key': snp_coverage_key
        }]
    }
    write_tracklist_json(
        reference_genome, coverage_raw_dict_obj, snp_coverage_label)
def build_snpeff(ref_genome):
    """Setup the SnpEff database for ref_genome.

    This function does the following:
        * Sets up the directory structure for SnpEff-related files.
        * Writes a possibly modified Genbank to the location that SnpEff
              expects to find it. A few cleanups are necessary to avoid
              SnpEff quirks.
        * Creates the SnpEff config file for building the database/index.
        * Builds the SnpEff database/index.

    SnpEFF needs a config file for every reference genome, which lists a
    single reference genome, its chromosomes, and the codon table that
    each uses. For now we can assume that all our genomes will use bacterial
    codons. Every reference genome in the config file should look similar to:

    # Ecoli K12 MG1655
    NC_000913.genome : Escherichia_coli
        NC_000913.chromosomes : NC_000913
        NC_000913.NC_000913.codonTable: Bacterial_and_Plant_Plastid

    We have made a template that can do this with yaml rendering, in the
    snpEFF tools directory. Given a ref_genome object, it generates a
    snpEFF config file and builds and snpEFF database file for the genome,
    and places it in the ref genome's data dir under ./snpeff.

    Args:
        ref_genome: ReferenceGenome to index. Nothing is done if it is not
            annotated.
    """
    # if no genbank file for this ref genome, then do nothing
    if not ref_genome.is_annotated():
        print("Skipping SnpEff indexing: No genbank for reference genome %s" % (
            ref_genome.uid))
        return

    # Get the path to the reference genbank, making sure it exists.
    ref_genome_path = get_dataset_with_type(
        ref_genome,
        type=Dataset.TYPE.REFERENCE_GENOME_GENBANK).get_absolute_location()
    assert ref_genome_path is not None, "Reference Genbank missing."

    # Create the snpeff directory structure.
    ref_genome.ensure_snpeff_dir()

    # Build a template data dictionary which will be passed to the django
    # template renderer in order to generate the config file.
    templ_data = {}
    templ_data['snpeff_dir'] = ref_genome.get_snpeff_dir()
    templ_data['uid'] = ref_genome.uid
    templ_data['label'] = ref_genome.label

    # The following block does 2 things:
    #    1. Identifies all chromosomes in the Genbank.
    #    2. Ensures that the contained SeqRecord name and ids match, which is
    #       required by SnpEff.
    templ_data['chromosomes'] = []
    new_genbank_seq_records = []
    with open(ref_genome_path) as genbank_fh:
        for seq_record in SeqIO.parse(genbank_fh, 'genbank'):
            # Set the ACCESSION/LOCUS/VERSION to all be the same for this
            # new modified genbank
            seq_record.name = seq_record.id
            new_genbank_seq_records.append(seq_record)

            # Add this record as a chromosome to this ref genome, exactly
            # once, so no chromosome is listed twice in the config.
            # TODO: Do we want to check seqrecords for sane/sanitized names?
            templ_data['chromosomes'].append(seq_record.name)

    templ_data['chrs_string'] = ','.join(templ_data['chromosomes'])

    # Write the updated Genbank.
    snpeff_genbank_path = ref_genome.get_snpeff_genbank_file_path()
    SeqIO.write(new_genbank_seq_records, snpeff_genbank_path, 'genbank')

    # Stop-gap fix to ensure line lengths in Genbank to appease SnpEff.
    ensure_line_lengths(ref_genome.get_snpeff_genbank_file_path())

    # Render SnpEff config template.
    render_snpeff_config(templ_data, ref_genome.get_snpeff_config_path())

    # Build snpEff database
    build_snpeff_db(ref_genome.get_snpeff_config_path(), ref_genome.uid)
def run_snpeff(alignment_group, vcf_source_tool):
    """Run snpeff on an alignment group after creating a vcf with a snpcaller.

    We only use the alignment type to store the snpeff file.

    Args:
        alignment_group: AlignmentGroup that owns the source vcf Dataset.
        vcf_source_tool: Key into
            MAP_VCF_SOURCE_TOOL_TO_ORIGINAL_VCF_DATASET_TYPE.

    Returns:
        The snpeff vcf output filename, or None if the source vcf contains
        no records.

    Raises:
        AssertionError: If vcf_source_tool is unknown, or the source vcf
            Dataset or its file is missing.
    """
    assert vcf_source_tool in MAP_VCF_SOURCE_TOOL_TO_ORIGINAL_VCF_DATASET_TYPE

    # Get the reference genome uid to get the config path and snpeff genome
    # name.
    ref_genome = alignment_group.reference_genome
    ref_genome_uid = ref_genome.uid

    source_vcf_dataset_type = (
        MAP_VCF_SOURCE_TOOL_TO_ORIGINAL_VCF_DATASET_TYPE[vcf_source_tool])
    source_vcf_dataset = get_dataset_with_type(
        alignment_group, type=source_vcf_dataset_type)
    assert source_vcf_dataset is not None
    vcf_input_filename = source_vcf_dataset.get_absolute_location()
    assert os.path.exists(vcf_input_filename)

    # Make sure vcf has at least one record. If not, return.
    with open(vcf_input_filename) as unannotated_fh:
        vcf_reader = vcf.Reader(unannotated_fh)
        try:
            next(vcf_reader)
        except StopIteration:
            # No variants called. No need to do SnpEff.
            return

    # Prepare a directory to put the output file.
    vcf_output_filename = get_snpeff_vcf_output_path(
        alignment_group, vcf_source_tool)

    snpeff_args = [
        'java',
        '-jar', settings.SNPEFF_JAR_PATH,
        'eff',
        '-v',
        '-i',
        'vcf',
        '-o',
        'vcf',
        '-c', ref_genome.get_snpeff_config_path(),
        '-ud', str(settings.SNPEFF_UD_INTERVAL_LENGTH),
        '-formatEff',
        '-q',
        '-noLog',
        # '-t', str(settings.SNPEFF_THREADS),
        ref_genome_uid,
        vcf_input_filename
    ]

    print(' '.join(snpeff_args))

    with open(vcf_output_filename, 'w') as fh_out:
        snpeff_proc = subprocess.Popen(snpeff_args, stdout=subprocess.PIPE)
        try:
            convert_snpeff_info_fields(snpeff_proc.stdout, fh_out)
        finally:
            # Close our end of the pipe and reap the child so that no
            # zombie process or open descriptor is left behind.
            # NOTE(review): the exit status is still not checked; confirm
            # whether a non-zero status should raise.
            snpeff_proc.stdout.close()
            snpeff_proc.wait()

    return vcf_output_filename
def parse_alignment_group_vcf(alignment_group, vcf_dataset_type):
    """Parses the VCF associated with the AlignmentGroup and saves data there.

    Args:
        alignment_group: AlignmentGroup that owns the vcf Dataset.
        vcf_dataset_type: Dataset type of the vcf to parse.
    """
    parse_vcf(
        get_dataset_with_type(alignment_group, vcf_dataset_type),
        alignment_group)
def run_delly(fasta_ref, sample_alignments, vcf_output_dir,
              vcf_output_filename, alignment_type, **kwargs):
    """Run delly to find SVs.

    Delly is invoked once per transformation type (DEL, DUP, INV); the
    per-type vcfs that were produced are concatenated and sorted into
    vcf_output_filename, which is then post-processed.

    Args:
        fasta_ref: Path of the reference fasta given to delly (-g).
        sample_alignments: Sample alignments whose bams are analyzed.
        vcf_output_dir: Directory used for the temporary combined vcf.
        vcf_output_filename: Final vcf path; expected to end in '.vcf'.
        alignment_type: Dataset type of the bam to read per sample.
        **kwargs: Ignored.

    Returns:
        True on success.
    """
    assert os.path.exists(settings.DELLY_BIN), (
        'Delly is not installed. Aborting.')

    # Drop the last 4 characters, i.e. the '.vcf' extension.
    delly_root = vcf_output_filename[:-4]

    transformations = ['DEL', 'DUP', 'INV']
    vcf_outputs = [
        '%s_%s.vcf' % (delly_root, sv_type) for sv_type in transformations
    ]

    # Create symlinks to bam files which use uid because Delly uses the name
    # of the file as sample uid in the output report.
    bam_files = [
        get_dataset_with_type(sa, alignment_type).get_absolute_location()
        for sa in sample_alignments
    ]
    samples = [sa.experiment_sample for sa in sample_alignments]

    new_bam_files = []
    for original_bam, sample in zip(bam_files, samples):
        bam_dir = os.path.dirname(original_bam)
        uid_named_bam = os.path.join(bam_dir, sample.uid + '.bam')
        _clean_symlink(original_bam, uid_named_bam)
        _clean_symlink(original_bam + '.bai', uid_named_bam + '.bai')
        new_bam_files.append(uid_named_bam)

    # Run delly for each type of transformation.
    for sv_type, sv_vcf in zip(transformations, vcf_outputs):
        delly_cmd = [
            settings.DELLY_BIN,
            '-t', sv_type,
            '-o', sv_vcf,
            '-g', fasta_ref,
        ] + new_bam_files
        # Not check_call, because delly errors if it doesn't find any SVs.
        subprocess.call(delly_cmd)

    # Combine the separate vcfs for each transformation.
    produced_vcfs = [path for path in vcf_outputs if os.path.exists(path)]
    if not produced_vcfs:
        # hack: create empty vcf
        subprocess.check_call(['touch', delly_root])
        subprocess.check_call([
            '%s/pindel/pindel2vcf' % settings.TOOLS_DIR,
            '-p', delly_root,  # TODO does this work?
            '-r', fasta_ref,
            '-R', 'name',
            '-d', 'date'
        ])
    else:
        temp_vcf = os.path.join(vcf_output_dir, 'temp_vcf')
        os.putenv('PERL5LIB', os.path.join(settings.VCFTOOLS_DIR, 'perl'))
        with open(temp_vcf, 'w') as concat_fh:
            subprocess.check_call(
                [settings.VCF_CONCAT_BINARY] + produced_vcfs,
                stdout=concat_fh)
        with open(vcf_output_filename, 'w') as sorted_fh:
            subprocess.check_call(
                [settings.VCF_SORT_BINARY, temp_vcf], stdout=sorted_fh)
        os.remove(temp_vcf)

    # Delete temporary bam file symlinks.
    for symlink_path in new_bam_files:
        os.remove(symlink_path)
        os.remove(symlink_path + '.bai')

    postprocess_delly_vcf(vcf_output_filename)

    return True  # success