def test_run_pipeline__samples_not_ready__fastq2(self):
    """Tests that the pipeline raises an AssertionError if samples
    aren't ready, fastq2.
    """
    # Flip the fastq2 Dataset into a not-yet-ready state so the
    # pipeline's readiness assertion trips.
    not_ready_dataset = self.experiment_sample.dataset_set.filter(
            type=Dataset.TYPE.FASTQ2)[0]
    not_ready_dataset.status = Dataset.STATUS.QUEUED_TO_COPY
    not_ready_dataset.save()

    with self.assertRaises(AssertionError):
        run_pipeline(
                'name_placeholder', self.reference_genome,
                [self.experiment_sample])
def test_run_pipeline__bad_alignment(self):
    """Alignment of bad reads. Might happen if user tries to align wrong
    reads to wrong reference genome.
    """
    mismatched_ref = import_reference_genome_from_local_file(
            self.project, 'concat_mg1655_partials',
            FullVCFTestSet.TEST_CONCAT_GENBANK, 'genbank')

    # NOTE: Ideally there would be a better way to test this.
    # In general, we need to figure out how to better communicate the
    # reason for a failed alignment to the user.
    with self.assertRaises(Exception):
        run_pipeline(
                'name_placeholder', mismatched_ref,
                [self.experiment_sample])
def test_run_pipeline__genbank_from_ncbi_with_spaces_in_label(self):
    """Tests the pipeline where the genome is imported from NCBI with
    spaces in the name.
    """
    if not internet_on():
        # Previously this silently returned, which reported the test as
        # passing even though nothing ran. Mark it as skipped instead so
        # missing coverage is visible in the test output.
        self.skipTest('No internet connection available.')

    MG1655_ACCESSION = 'NC_000913.3'
    MG1655_LABEL = 'mg1655 look a space'
    ref_genome = import_reference_genome_from_ncbi(
            self.project, MG1655_LABEL, MG1655_ACCESSION, 'genbank')

    sample_list = [self.experiment_sample]
    alignment_group_obj, async_result = run_pipeline(
            'name_placeholder', ref_genome, sample_list)

    # Block until pipeline finishes.
    while not async_result.ready():
        time.sleep(1)
    if async_result.status == 'FAILURE':
        self.fail('Async task failed.')

    # Refresh the object; it was updated in another thread.
    alignment_group_obj = AlignmentGroup.objects.get(
            id=alignment_group_obj.id)

    self.assertEqual(
            1,
            len(alignment_group_obj.experimentsampletoalignment_set.all()))
    self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
            alignment_group_obj.status)
def test_run_pipeline(self):
    """Tests running the full pipeline.
    """
    alignment_group_obj, async_result = run_pipeline(
            'name_placeholder', self.reference_genome,
            [self.experiment_sample])

    # Poll until the async pipeline task completes.
    while not async_result.ready():
        time.sleep(1)
    if async_result.status == 'FAILURE':
        self.fail('Async task failed.')

    # Re-fetch the object; it was processed in another thread.
    alignment_group_obj = AlignmentGroup.objects.get(
            id=alignment_group_obj.id)

    # Verify the AlignmentGroup object is created with one
    # sample-alignment and a COMPLETED status.
    self.assertEqual(
            1,
            len(alignment_group_obj.experimentsampletoalignment_set.all()))
    self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
            alignment_group_obj.status)

    # Make sure the initial JBrowse config has been created.
    jbrowse_dir = self.reference_genome.get_jbrowse_directory_path()
    self.assertTrue(os.path.exists(jbrowse_dir))
    self.assertTrue(
            os.path.exists(os.path.join(jbrowse_dir, 'indiv_tracks')))
def test_run_pipeline(self):
    """End-to-end test of pipeline. Fails if any errors.
    """
    # Create an extra sample that will not be aligned but has a
    # parent-child relationship with the sample that is aligned. This
    # would catch the bug reported in
    # https://github.com/churchlab/millstone/issues/561.
    unused_es = ExperimentSample.objects.create(project=self.project,
            label='unused sample')
    self.experiment_sample.add_child(unused_es)

    (alignment_group, alignment_async_result,
            variant_calling_async_result) = run_pipeline(
                    'name_placeholder', self.reference_genome,
                    [self.experiment_sample])

    # Block on both pipeline stages; .get() re-raises task errors.
    alignment_async_result.get()
    variant_calling_async_result.get()

    alignment_group = AlignmentGroup.objects.get(uid=alignment_group.uid)
    self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
            alignment_group.status)

    # Make sure some expected variants are found.
    variants = Variant.objects.filter(
            reference_genome=self.reference_genome)
    self.assertTrue(len(variants))

    v_1834 = Variant.objects.get(position=1834)
    v_1834_vccd = v_1834.variantcallercommondata_set.all()[0]
    v_1834_ve = v_1834_vccd.variantevidence_set.all()[0]
    self.assertFalse(v_1834_ve.data.get('IS_SV', False))
def test_run_pipeline__snps_with_effect__no_svs(self):
    """Tests pipeline with SNPs with effect, but no SVs called.
    """
    ref_genome = import_reference_genome_from_local_file(
            self.project, 'mg1655_tolC_through_zupT',
            FullVCFTestSet.TEST_GENBANK, 'genbank')

    sample_obj = ExperimentSample.objects.create(
            project=self.project, label='Sample %d' % 0)

    # Attach forward and reverse raw reads to the sample.
    copy_and_add_dataset_source(sample_obj, Dataset.TYPE.FASTQ1,
            Dataset.TYPE.FASTQ1, FullVCFTestSet.FASTQ1[0])
    copy_and_add_dataset_source(sample_obj, Dataset.TYPE.FASTQ2,
            Dataset.TYPE.FASTQ2, FullVCFTestSet.FASTQ2[0])

    (alignment_group, alignment_async_result,
            variant_calling_async_result) = run_pipeline(
                    'test_align', ref_genome, [sample_obj])

    # Wait on both pipeline stages.
    alignment_async_result.get()
    variant_calling_async_result.get()

    alignment_group = AlignmentGroup.objects.get(uid=alignment_group.uid)
    self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
            alignment_group.status)

    # Check that SnpEff worked.
    v_205 = Variant.objects.get(
            reference_genome=alignment_group.reference_genome,
            position=205)
    v_205_va = v_205.variantalternate_set.all()[0]
    self.assertEqual('tolC', v_205_va.data['INFO_EFF_GENE'])
def test_run_pipeline(self):
    """End-to-end test of pipeline. Fails if any errors.
    """
    # Create an extra sample that will not be aligned but has
    # parent-child relationship with the sample that is aligned. This
    # would catch the bug reported in
    # https://github.com/churchlab/millstone/issues/561.
    unused_es = ExperimentSample.objects.create(
            project=self.project,
            label='unused sample')
    self.experiment_sample.add_child(unused_es)

    result = run_pipeline(
            'name_placeholder', self.reference_genome,
            [self.experiment_sample])

    # run_pipeline returns (AlignmentGroup, alignment async result,
    # variant-calling async result). Wait on both stages; .get()
    # re-raises any task-level error.
    alignment_group = result[0]
    result[1].get()
    result[2].get()

    alignment_group = AlignmentGroup.objects.get(uid=alignment_group.uid)
    self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
            alignment_group.status)

    # Make sure some expected variants are found.
    variants = Variant.objects.filter(
            reference_genome=self.reference_genome)
    self.assertTrue(len(variants))

    # Position 1834 should be a plain (non-SV) variant.
    v_1834 = Variant.objects.get(position=1834)
    v_1834_vccd = v_1834.variantcallercommondata_set.all()[0]
    v_1834_ve = v_1834_vccd.variantevidence_set.all()[0]
    self.assertFalse(v_1834_ve.data.get('IS_SV', False))
def test_run_pipeline(self):
    """End-to-end test of pipeline. Fails if any errors.
    """
    alignment_group, async_result = run_pipeline(
            'name_placeholder', self.reference_genome,
            [self.experiment_sample])

    # Block until the pipeline task completes.
    async_result.wait()

    # Re-fetch since status was updated in another thread.
    alignment_group = AlignmentGroup.objects.get(uid=alignment_group.uid)
    self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
            alignment_group.status)
def test_run_alignment_with_spaces_in_genbank_filename(self):
    """Uploads a genbank whose filename contains spaces, then runs the
    full alignment pipeline against it.
    """
    project = self.common_entities['project']
    ref_genome_label = 'dirty_upload'

    # Build a fake browser-upload POST request.
    request = HttpRequest()
    request.POST = {
        'projectUid': project.uid,
        'refGenomeLabel': ref_genome_label,
        'importFileFormat': 'genbank'
    }
    request.method = 'POST'
    request.user = self.common_entities['user']
    authenticate(username=TEST_USERNAME, password=TEST_PASSWORD)
    self.assertTrue(request.user.is_authenticated())
    request.FILES['refGenomeFile'] = UploadedFile(
            file=open(TEST_GENBANK),
            name='dirty_genbank (spaces).gb')

    response = create_ref_genome_from_browser_upload(request)
    self.assertEqual(STATUS_CODE__SUCCESS, response.status_code)
    self.assertFalse(json.loads(response.content).get('error', False))

    # Look up the reference genome the upload created.
    ref_genome = ReferenceGenome.objects.get(
            project=project, label=ref_genome_label)

    # Create sample model.
    sample = ExperimentSample.objects.create(
            project=project, label='test_sample')

    # Attach forward and reverse fastq datasets to the sample.
    add_dataset_to_entity(sample, Dataset.TYPE.FASTQ1,
            Dataset.TYPE.FASTQ1, filesystem_location=TEST_DIRTY_FQ_1)
    add_dataset_to_entity(sample, Dataset.TYPE.FASTQ2,
            Dataset.TYPE.FASTQ2, filesystem_location=TEST_DIRTY_FQ_2)

    # Run alignment of sample to reference and wait on both stages.
    (alignment_group, alignment_async_result,
            variant_calling_async_result) = run_pipeline(
                    'test_alignment', ref_genome, [sample])
    alignment_async_result.get()
    variant_calling_async_result.get()

    alignment_group = AlignmentGroup.objects.get(uid=alignment_group.uid)
    self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
            alignment_group.status)
def sv_testing_bootstrap(project):
    """Populates project with the SV test reference and sample reads.

    If '--sv' was passed on the command line, also runs the alignment
    pipeline for the SV project.
    """
    # Test data lives under test_data/sv_testing/all_svs.
    sv_testing_dir = os.path.join(
            GD_ROOT, 'test_data', 'sv_testing', 'all_svs')
    fasta = os.path.join(sv_testing_dir, 'ref.fa')
    fq1 = os.path.join(sv_testing_dir, 'simLibrary.1.fq')
    fq2 = os.path.join(sv_testing_dir, 'simLibrary.2.fq')

    ref_genome = import_reference_genome_from_local_file(
            project, 'ref', fasta, 'fasta')

    sample = ExperimentSample.objects.create(
            project=project, label='simLibrary')
    copy_and_add_dataset_source(
            sample, Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ1, fq1)
    copy_and_add_dataset_source(
            sample, Dataset.TYPE.FASTQ2, Dataset.TYPE.FASTQ2, fq2)

    # Using the --sv argument runs the pipeline for the SV project.
    if '--sv' in sys.argv:
        run_pipeline('sample_alignment_ref', ref_genome, [sample])
def _perform_assembly(self, data_dict):
    """Imports the reference and sample described by data_dict, aligns,
    then runs de novo assembly.

    Args:
        data_dict: dict with keys 'ref_fasta', 'fq_1', 'fq_2'.

    Returns:
        Queryset of Contig objects produced by the assembly.
    """
    ref_fasta = data_dict['ref_fasta']
    fq_1 = data_dict['fq_1']
    fq_2 = data_dict['fq_2']

    # Import reference genome.
    ref_genome = import_reference_genome_from_local_file(
            self.project, 'test_ref', ref_fasta, 'fasta', move=False)

    # Create sample model.
    sample = ExperimentSample.objects.create(
            project=self.project, label='test_sample')

    # Attach forward and reverse fastq datasets to the sample.
    add_dataset_to_entity(
            sample, Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ1,
            filesystem_location=fq_1)
    add_dataset_to_entity(
            sample, Dataset.TYPE.FASTQ2, Dataset.TYPE.FASTQ2,
            filesystem_location=fq_2)

    # Align the sample to the reference; variant calling is not needed
    # for assembly.
    alignment_group, _, _ = run_pipeline(
            'test_alignment', ref_genome, [sample],
            perform_variant_calling=False, alignment_options={})

    # Get resulting ExperimentSampleToAlignment.
    sample_align = ExperimentSampleToAlignment.objects.get(
            alignment_group=alignment_group, experiment_sample=sample)

    # Run assembly pipeline and block on the result.
    async_result = run_de_novo_assembly_pipeline([sample_align])
    async_result.get()

    # Retrieve contigs.
    return Contig.objects.filter(
            parent_reference_genome=ref_genome,
            experiment_sample_to_alignment=sample_align)
def test_run_pipeline__multiple_samples(self):
    """End-to-end test of pipeline. Fails if any errors.
    """
    (alignment_group, alignment_async_result,
            variant_calling_async_result) = run_pipeline(
                    'name_placeholder', self.reference_genome,
                    [self.experiment_sample, self.experiment_sample_2])

    # Block on both pipeline stages; .get() re-raises task errors.
    alignment_async_result.get()
    variant_calling_async_result.get()

    alignment_group = AlignmentGroup.objects.get(uid=alignment_group.uid)
    self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
            alignment_group.status)
def test_run_pipeline__multiple_samples(self):
    """End-to-end test of pipeline. Fails if any errors.
    """
    sample_list = [self.experiment_sample, self.experiment_sample_2]

    result = run_pipeline('name_placeholder', self.reference_genome,
            sample_list)

    # run_pipeline returns (AlignmentGroup, alignment async result,
    # variant-calling async result).
    alignment_group = result[0]

    # Wait for both async stages to finish.
    result[1].get()
    result[2].get()

    alignment_group = AlignmentGroup.objects.get(uid=alignment_group.uid)
    self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
            alignment_group.status)
def test_run_pipeline__multiple_chromosomes(self):
    """Makes sure variant calling works when there are multiple
    chromosomes on a single reference genome.
    """
    ref_genome = import_reference_genome_from_local_file(
            self.project, 'concat_mg1655_partials',
            FullVCFTestSet.TEST_CONCAT_GENBANK, 'genbank')

    sample_obj = ExperimentSample.objects.create(
            project=self.project, label='Sample 0')

    # Attach forward and reverse raw reads to the sample.
    copy_and_add_dataset_source(sample_obj, Dataset.TYPE.FASTQ1,
            Dataset.TYPE.FASTQ1, FullVCFTestSet.FASTQ1[0])
    copy_and_add_dataset_source(sample_obj, Dataset.TYPE.FASTQ2,
            Dataset.TYPE.FASTQ2, FullVCFTestSet.FASTQ2[0])

    (alignment_group, alignment_async_result,
            variant_calling_async_result) = run_pipeline(
                    'name_placeholder', ref_genome, [sample_obj])

    # Wait on both pipeline stages.
    alignment_async_result.get()
    variant_calling_async_result.get()

    alignment_group = AlignmentGroup.objects.get(uid=alignment_group.uid)
    self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
            alignment_group.status)

    # Validate that all variants were called.
    # TODO: Add Chromosome checking.
    v_515 = Variant.objects.get(
            reference_genome=alignment_group.reference_genome,
            position=515)
    v_515_va = v_515.variantalternate_set.all()[0]
    self.assertEqual('ygiB', v_515_va.data['INFO_EFF_GENE'])

    v_205 = Variant.objects.get(
            reference_genome=alignment_group.reference_genome,
            position=205)
    v_205_va = v_205.variantalternate_set.all()[0]
    self.assertEqual('tolC', v_205_va.data['INFO_EFF_GENE'])
def _align_and_assemble(self, ref_genome, sample_list):
    """Aligns sample_list to ref_genome, then runs de novo assembly.

    Blocks until the assembly pipeline completes.

    Returns:
        The AlignmentGroup created by the alignment pipeline.
    """
    # Run alignment of samples to reference; variant calling is not
    # needed for assembly.
    alignment_group_label = 'test_alignment'
    alignment_group, _, _ = run_pipeline(
            alignment_group_label, ref_genome, sample_list,
            perform_variant_calling=False, alignment_options={})

    # Get resulting ExperimentSampleToAlignments.
    sample_align_list = ExperimentSampleToAlignment.objects.filter(
            alignment_group=alignment_group,
            experiment_sample__in=sample_list)

    # Run assembly pipeline and wait on result. BUGFIX: the async result
    # was previously dropped without .get(), so this helper could return
    # before assembly finished (the sibling implementation waits).
    async_result = run_de_novo_assembly_pipeline(sample_align_list)
    async_result.get()

    return alignment_group
def _align_and_assemble(self, ref_genome, sample_list):
    """Aligns sample_list to ref_genome, then runs de novo assembly.

    Blocks until assembly completes and returns the AlignmentGroup.
    """
    # Align without variant calling; assembly only needs the alignment.
    alignment_group, _, _ = run_pipeline(
            'test_alignment', ref_genome, sample_list,
            perform_variant_calling=False, alignment_options={})

    # Collect the per-sample alignments produced above.
    sample_align_list = ExperimentSampleToAlignment.objects.filter(
            alignment_group=alignment_group,
            experiment_sample__in=sample_list)

    # Kick off assembly and block until it finishes.
    run_de_novo_assembly_pipeline(sample_align_list).get()

    return alignment_group
def test_run_pipeline__snps_with_effect__no_svs(self):
    """Tests pipeline with SNPs with effect, but no SVs called.
    """
    ref_genome = import_reference_genome_from_local_file(
            self.project, 'mg1655_tolC_through_zupT',
            FullVCFTestSet.TEST_GENBANK, 'genbank')

    sample_obj = ExperimentSample.objects.create(
            project=self.project, label='Sample %d' % 0)

    # Add raw reads to the sample.
    copy_and_add_dataset_source(sample_obj, Dataset.TYPE.FASTQ1,
            Dataset.TYPE.FASTQ1, FullVCFTestSet.FASTQ1[0])
    copy_and_add_dataset_source(sample_obj, Dataset.TYPE.FASTQ2,
            Dataset.TYPE.FASTQ2, FullVCFTestSet.FASTQ2[0])

    result = run_pipeline('test_align', ref_genome, [sample_obj])
    alignment_group = result[0]

    # Wait on alignment, then variant calling; .get() re-raises errors.
    result[1].get()
    result[2].get()

    alignment_group = AlignmentGroup.objects.get(uid=alignment_group.uid)
    self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
            alignment_group.status)

    # Check that SnpEff worked.
    v_205 = Variant.objects.get(
            reference_genome=alignment_group.reference_genome,
            position=205)
    v_205_va = v_205.variantalternate_set.all()[0]
    self.assertEqual('tolC', v_205_va.data['INFO_EFF_GENE'])
def bootstrap_fake_data():
    """Fill the database with fake data.
    """
    user = get_or_create_user()

    ### Create some projects
    (test_project, project_created) = Project.objects.get_or_create(
            title=TEST_PROJECT_NAME, owner=user.get_profile())
    (test_project_2, project_created) = Project.objects.get_or_create(
            title=SV_PROJECT_NAME, owner=user.get_profile())

    ### Create some reference genomes
    ref_genome_1 = import_reference_genome_from_local_file(
            test_project, REF_GENOME_1_LABEL, TEST_FASTA, 'fasta')
    ref_genome_2 = import_reference_genome_from_local_file(
            test_project, REF_GENOME_2_LABEL, TEST_FASTA, 'fasta')
    ref_genome_3 = import_reference_genome_from_local_file(
            test_project, 'test_genome', TEST_FASTA, 'fasta')

    ### Create some saved queries.
    for saved_query_text in CUSTOM_SAVED_QUERY_LIST:
        SavedVariantFilterQuery.objects.get_or_create(
                owner=user.get_profile(), text=saved_query_text)

    ### Create some ExperimentSamples.

    # Create some samples without backing data just to explore the UI.
    ExperimentSample.objects.create(project=test_project,
            label='C321D_MiSeq', data={'SAMPLE_WELL': 'A01'})
    ExperimentSample.objects.create(project=test_project,
            label='C321D Fixed 01', data={'SAMPLE_WELL': 'A02'})
    ExperimentSample.objects.create(project=test_project,
            label='C321D Fixed 02', data={'SAMPLE_WELL': 'A03'})

    # Create some samples with backing data.
    (sample_1, created) = ExperimentSample.objects.get_or_create(
            project=test_project, label=SAMPLE_1_LABEL)

    # Add datasets to the samples, skipping any that already exist.
    if not sample_1.dataset_set.filter(type=Dataset.TYPE.FASTQ1):
        copy_and_add_dataset_source(sample_1, Dataset.TYPE.FASTQ1,
                Dataset.TYPE.FASTQ1, TEST_FASTQ1)
    if not sample_1.dataset_set.filter(type=Dataset.TYPE.FASTQ2):
        copy_and_add_dataset_source(sample_1, Dataset.TYPE.FASTQ2,
                Dataset.TYPE.FASTQ2, TEST_FASTQ2)

    # Create sample backed by g-zipped data.
    gz_backed_sample = ExperimentSample.objects.create(
            project=test_project, label='sample backed by gz data')
    gz_fastq1_dataset = copy_and_add_dataset_source(gz_backed_sample,
            Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ1, TEST_FASTQ_GZ_1)
    # BUGFIX: the reverse-read dataset was previously labeled
    # Dataset.TYPE.FASTQ1 despite having type FASTQ2; label it FASTQ2 to
    # match its type, consistent with sample_1's datasets above.
    gz_fastq2_dataset = copy_and_add_dataset_source(gz_backed_sample,
            Dataset.TYPE.FASTQ2, Dataset.TYPE.FASTQ2, TEST_FASTQ_GZ_2)
    run_fastqc_on_sample_fastq(gz_backed_sample, gz_fastq1_dataset)
    run_fastqc_on_sample_fastq(gz_backed_sample, gz_fastq2_dataset,
            rev=True)

    ### Create an alignment.
    alignment_group_1 = AlignmentGroup.objects.create(
            label='Alignment 1', reference_genome=ref_genome_3,
            aligner=AlignmentGroup.ALIGNER.BWA)

    # Link it to a sample.
    sample_alignment = ExperimentSampleToAlignment.objects.create(
            alignment_group=alignment_group_1,
            experiment_sample=sample_1)

    ### Add alignment data. NOTE: Stored in sample model dir.
    # NOTE: This is a bit convoluted. Perhaps it would be better to store
    # alignments in the ExperimentSampleToAlignment directory.
    copy_dest = copy_dataset_to_entity_data_dir(sample_1, TEST_BAM)
    copy_dataset_to_entity_data_dir(sample_1, TEST_BAM_INDEX)
    add_dataset_to_entity(sample_alignment, Dataset.TYPE.BWA_ALIGN,
            Dataset.TYPE.BWA_ALIGN, copy_dest)

    # Create fake variants.
    create_fake_variants_and_variant_sets(ref_genome_1)

    #############################
    # Full VCF Testing (annotated for snpeff, variant filtering, etc)
    #############################

    # Create a new reference genome and samples using full_vcf_test_set.
    full_vcf_reference_genome = import_reference_genome_from_local_file(
            test_project, 'mg1655_tolC_through_zupT',
            FullVCFTestSet.TEST_GENBANK, 'genbank')

    # Create all samples.
    parent_obj = None
    full_vcf_samples = []
    for i in range(FullVCFTestSet.NUM_SAMPLES):
        sample_obj = ExperimentSample.objects.create(
                project=test_project, label='Sample %d' % i)
        sample_obj.data['SAMPLE_WELL'] = 'A0%d' % (i + 1)

        # Sample 0 is the parent of all subsequent samples.
        if i == 0:
            parent_obj = sample_obj
        if i > 0:
            sample_obj.data['SAMPLE_PARENTS'] = parent_obj.label
            parent_obj.add_child(sample_obj)
            parent_obj.save()
        sample_obj.save()

        # Add raw reads to each sample.
        fastq1_dataset = copy_and_add_dataset_source(sample_obj,
                Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ1,
                FullVCFTestSet.FASTQ1[i])
        fastq2_dataset = copy_and_add_dataset_source(sample_obj,
                Dataset.TYPE.FASTQ2, Dataset.TYPE.FASTQ2,
                FullVCFTestSet.FASTQ2[i])

        # Run FASTQC on sample reads.
        run_fastqc_on_sample_fastq(sample_obj, fastq1_dataset)
        run_fastqc_on_sample_fastq(sample_obj, fastq2_dataset, rev=True)

        full_vcf_samples.append(sample_obj)

    # Run the alignment. Return the alignment group created, indexed by
    # the reference genome's uid.
    run_pipeline('test_align', full_vcf_reference_genome,
            full_vcf_samples)

    import_variant_set_from_vcf(full_vcf_reference_genome, 'Designed',
            FullVCFTestSet.TEST_DESIGNED_SNPS)

    def _create_region_intervals(region, interval_tuple_list):
        """Helper method to create RegionIntervals for a Region.

        Args:
            region: Region Model object.
            interval_tuple_list: List of tuples of intervals to create.
        """
        for interval in interval_tuple_list:
            RegionInterval.objects.create(region=region,
                    start=interval[0], end=interval[1])

    # Create some fake regions.
    # TODO: Should not be much harder to replace this with real regions.
    region_1 = Region.objects.create(
            reference_genome=full_vcf_reference_genome, label='region_1',
            type=Region.TYPE.POOR_MAPPING_QUALITY)
    _create_region_intervals(region_1, [(1, 150), (300, 400), (500, 900)])

    region_2 = Region.objects.create(
            reference_genome=full_vcf_reference_genome, label='region_2',
            type=Region.TYPE.POOR_MAPPING_QUALITY)
    _create_region_intervals(region_2, [(1000, 1500)])

    region_3 = Region.objects.create(
            reference_genome=full_vcf_reference_genome, label='region_3',
            type=Region.TYPE.POOR_MAPPING_QUALITY)
    _create_region_intervals(region_3, [(1800, 1900), (2150, 2300)])

    # And some GENE regions.
    gene_A = Region.objects.create(
            reference_genome=full_vcf_reference_genome, label='geneA',
            type=Region.TYPE.GENE)
    _create_region_intervals(gene_A, [(2000, 2400)])

    gene_B = Region.objects.create(
            reference_genome=full_vcf_reference_genome, label='geneB',
            type=Region.TYPE.GENE)
    _create_region_intervals(gene_B, [(4800, 5200)])

    gene_C = Region.objects.create(
            reference_genome=full_vcf_reference_genome, label='geneC',
            type=Region.TYPE.GENE)
    _create_region_intervals(gene_C, [(1, 500)])

    # Bootstrap test_project_2 with SV stuff.
    sv_testing_bootstrap(test_project_2)
def test_pipeline_and_svs(self):
    """Runs the full pipeline and checks SV calls in the output vcfs.
    """
    alignment_group_obj, async_result = run_pipeline(
            'name', self.reference_genome, [self.experiment_sample])

    # Block until pipeline finishes.
    while not async_result.ready():
        time.sleep(1)
    if async_result.status == 'FAILURE':
        self.fail('Async task failed.')

    # Get fresh copy of AlignmentGroup object since it was processed in a
    # different thread.
    alignment_group_obj = AlignmentGroup.objects.get(
            id=alignment_group_obj.id)
    self.assertEqual(1,
            len(alignment_group_obj.experimentsampletoalignment_set.all()))
    self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
            alignment_group_obj.status)

    # Make sure the initial JBrowse config has been created.
    jbrowse_dir = self.reference_genome.get_jbrowse_directory_path()
    self.assertTrue(os.path.exists(jbrowse_dir))
    self.assertTrue(os.path.exists(
            os.path.join(jbrowse_dir, 'indiv_tracks')))

    # Collect the output vcf of every enabled variant caller.
    vcf_files = {}
    vcf_types = [VARIANT_TOOL_PARAMS_MAP[tool]['dataset_type']
            for tool in settings.ENABLED_VARIANT_CALLERS]
    for vcf_type in vcf_types:
        vcf_dataset = get_dataset_with_type(alignment_group_obj, vcf_type)
        self.assertIsNotNone(vcf_dataset,
                msg='Missing vcf_dataset for {vcf_type}.'.format(
                        vcf_type=vcf_type))
        vcf_location = vcf_dataset.get_absolute_location()
        self.assertTrue(os.path.exists(vcf_location))
        vcf_files[vcf_type] = vcf_location

    # Check actual variants, with this helper vcf-parser function
    def get_variants(vcf_type):
        variants = []
        with open(vcf_files[vcf_type]) as fh:
            vcf_reader = vcf.Reader(fh)
            for record_idx, record in enumerate(vcf_reader):
                raw_data_dict = extract_raw_data_dict(record)
                # we should expect exactly 1 alternate
                assert len(raw_data_dict['INFO_SVLEN']) == 1, (
                        'length of INFO_SVLEN > 1: {svlen}'.format(
                                svlen=raw_data_dict['INFO_SVLEN']))
                # BUGFIX: this message previously said INFO_SVLEN even
                # though the check is on INFO_SVTYPE.
                assert len(raw_data_dict['INFO_SVTYPE']) == 1, (
                        'length of INFO_SVTYPE > 1: {svtype}'.format(
                                svtype=raw_data_dict['INFO_SVTYPE']))
                variant_type = str(raw_data_dict.get('INFO_SVTYPE',
                        raw_data_dict.get('TYPE'))[0])
                pos = int(raw_data_dict.get('POS'))
                length = int(raw_data_dict.get('INFO_SVLEN')[0])
                variants.append({
                    'type': variant_type,
                    'pos': pos,
                    'length': length
                })
        return variants

    lumpy_variants = get_variants(Dataset.TYPE.VCF_LUMPY)

    # Helper function for checking a specific variant type
    def verify_variant_type(variants, variant_type, pos, length):
        for variant in variants:
            # Check variant against following gauntlet.
            if variant['type'] != variant_type:
                continue  # Fail, incorrect type.
            if abs(abs(variant['pos']) - pos) >= 100:
                continue  # Fail, incorrect position.
            if (length != -1 and
                    abs(abs(variant['length']) - length) >= 100):
                continue  # Fail, incorrect length.
            # Success, variant made it through gauntlet.
            return
        # If we got here, no matches were found, fail.
        self.fail('No %s position %s found' % (variant_type, pos))

    verify_variant_type(lumpy_variants, 'DEL', 10000, 1000)
def test_run_alignment_with_spaces_in_genbank_filename(self):
    """Uploads a genbank file whose name contains spaces, then aligns a
    sample against the imported reference.
    """
    project = self.common_entities['project']
    ref_genome_label = 'dirty_upload'

    # Simulate a browser upload via a hand-built request object.
    request = HttpRequest()
    request.POST = {
        'projectUid': project.uid,
        'refGenomeLabel': ref_genome_label,
        'importFileFormat': 'genbank'
    }
    request.method = 'POST'
    request.user = self.common_entities['user']
    authenticate(username=TEST_USERNAME, password=TEST_PASSWORD)
    self.assertTrue(request.user.is_authenticated())
    request.FILES['refGenomeFile'] = UploadedFile(
            file=open(TEST_GENBANK),
            name='dirty_genbank (spaces).gb')

    response = create_ref_genome_from_browser_upload(request)
    self.assertEqual(STATUS_CODE__SUCCESS, response.status_code)
    self.assertFalse(json.loads(response.content).get('error', False))

    # Get reference genome created by the upload.
    ref_genome = ReferenceGenome.objects.get(
            project=project, label=ref_genome_label)

    # Create sample model.
    sample = ExperimentSample.objects.create(
            project=project, label='test_sample')

    # Add fastq datasets to sample.
    add_dataset_to_entity(
            sample, Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ1,
            filesystem_location=TEST_DIRTY_FQ_1)
    add_dataset_to_entity(
            sample, Dataset.TYPE.FASTQ2, Dataset.TYPE.FASTQ2,
            filesystem_location=TEST_DIRTY_FQ_2)

    # Run alignment of sample to reference.
    alignment_group_label = 'test_alignment'
    result = run_pipeline(alignment_group_label, ref_genome, [sample])
    alignment_group = result[0]

    # Wait on both pipeline stages; .get() re-raises task errors.
    result[1].get()
    result[2].get()

    alignment_group = AlignmentGroup.objects.get(uid=alignment_group.uid)
    self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
            alignment_group.status)
def _start_new_alignment(request, project):
    """Delegate function that handles logic of kicking off alignment.

    Validates the JSON request body, looks up the reference genome and
    samples, and starts the pipeline. Validation failures are returned
    to the client as an 'error' payload.
    """
    # Parse the data from the request body.
    request_data = json.loads(request.body)

    # Make sure the required keys are present.
    REQUIRED_KEYS = [
            'name',
            'refGenomeUidList',
            'sampleUidList',
            'skipHetOnly',
            'callAsHaploid']
    if not all(key in request_data for key in REQUIRED_KEYS):
        return HttpResponseBadRequest("Invalid request. Missing keys.")

    try:
        # Parse the data and look up the relevant model instances.
        alignment_group_name = request_data['name']
        assert len(alignment_group_name), "Name required."

        ref_genome_list = ReferenceGenome.objects.filter(
                project=project,
                uid__in=request_data['refGenomeUidList'])
        assert (len(ref_genome_list) ==
                len(request_data['refGenomeUidList'])), (
                        "Invalid reference genome uid(s).")
        assert len(ref_genome_list) == 1, (
                "Exactly one reference genome must be provided.")
        ref_genome = ref_genome_list[0]

        # Make sure AlignmentGroup has a unique name, because run_pipeline
        # will re-use an alignment based on label, reference genome,
        # aligner. We are currently hard-coding the aligner to BWA.
        assert AlignmentGroup.objects.filter(
                label=alignment_group_name,
                reference_genome=ref_genome).count() == 0, (
                        "Please pick unique alignment name.")

        sample_list = ExperimentSample.objects.filter(
                project=project,
                uid__in=request_data['sampleUidList'])
        # BUGFIX: corrected the "expeirment" typo in this user-facing
        # error message.
        assert len(sample_list) == len(request_data['sampleUidList']), (
                "Invalid experiment sample uid(s).")
        assert len(sample_list) > 0, "At least one sample required."

        # Populate alignment options.
        alignment_options = dict()
        if request_data['skipHetOnly']:
            alignment_options['skip_het_only'] = True
        if request_data['callAsHaploid']:
            alignment_options['call_as_haploid'] = True

        # Kick off alignments.
        run_pipeline(
                alignment_group_name, ref_genome, sample_list,
                alignment_options=alignment_options)

        # Success. Return a redirect response.
        response_data = {
            'redirect': reverse(
                    'main.views.alignment_list_view',
                    args=(project.uid,)),
        }
    except Exception as e:
        response_data = {'error': str(e)}

    return HttpResponse(
            json.dumps(response_data), content_type='application/json')
def bootstrap_fake_data():
    """Fill the database with fake data.
    """
    user = get_or_create_user()

    ### Create some projects
    (test_project, project_created) = Project.objects.get_or_create(
            title=TEST_PROJECT_NAME, owner=user.get_profile())
    (test_project_2, project_created) = Project.objects.get_or_create(
            title=SV_PROJECT_NAME, owner=user.get_profile())

    ### Create some reference genomes
    ref_genome_1 = import_reference_genome_from_local_file(
            test_project, REF_GENOME_1_LABEL, TEST_FASTA, 'fasta')
    ref_genome_2 = import_reference_genome_from_local_file(
            test_project, REF_GENOME_2_LABEL, TEST_FASTA, 'fasta')
    ref_genome_3 = import_reference_genome_from_local_file(
            test_project, 'test_genome', TEST_FASTA, 'fasta')

    ### Create some saved queries.
    for saved_query_text in CUSTOM_SAVED_QUERY_LIST:
        SavedVariantFilterQuery.objects.get_or_create(
                owner=user.get_profile(), text=saved_query_text)

    ### Create some ExperimentSamples.

    # Create some samples without backing data just to explore the UI.
    ExperimentSample.objects.create(
            project=test_project,
            label='C321D_MiSeq',
            data={'SAMPLE_WELL': 'A01'}
    )
    ExperimentSample.objects.create(
            project=test_project,
            label='C321D Fixed 01',
            data={'SAMPLE_WELL': 'A02'}
    )
    ExperimentSample.objects.create(
            project=test_project,
            label='C321D Fixed 02',
            data={'SAMPLE_WELL': 'A03'}
    )

    # Create some samples with backing data.
    (sample_1, created) = ExperimentSample.objects.get_or_create(
            project=test_project, label=SAMPLE_1_LABEL)

    # Add datasets to the samples, skipping any that already exist.
    if not sample_1.dataset_set.filter(type=Dataset.TYPE.FASTQ1):
        copy_and_add_dataset_source(sample_1, Dataset.TYPE.FASTQ1,
                Dataset.TYPE.FASTQ1, TEST_FASTQ1)
    if not sample_1.dataset_set.filter(type=Dataset.TYPE.FASTQ2):
        copy_and_add_dataset_source(sample_1, Dataset.TYPE.FASTQ2,
                Dataset.TYPE.FASTQ2, TEST_FASTQ2)

    # Create sample backed by g-zipped data.
    gz_backed_sample = ExperimentSample.objects.create(
            project=test_project, label='sample backed by gz data')
    gz_fastq1_dataset = copy_and_add_dataset_source(
            gz_backed_sample, Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ1,
            TEST_FASTQ_GZ_1)
    # BUGFIX: the reverse-read dataset was previously labeled
    # Dataset.TYPE.FASTQ1 despite having type FASTQ2; label it FASTQ2 to
    # match its type, consistent with sample_1's datasets above.
    gz_fastq2_dataset = copy_and_add_dataset_source(
            gz_backed_sample, Dataset.TYPE.FASTQ2, Dataset.TYPE.FASTQ2,
            TEST_FASTQ_GZ_2)
    run_fastqc_on_sample_fastq(gz_backed_sample, gz_fastq1_dataset)
    run_fastqc_on_sample_fastq(gz_backed_sample, gz_fastq2_dataset,
            rev=True)

    ### Create an alignment.
    alignment_group_1 = AlignmentGroup.objects.create(
            label='Alignment 1', reference_genome=ref_genome_3,
            aligner=AlignmentGroup.ALIGNER.BWA)

    # Link it to a sample.
    sample_alignment = ExperimentSampleToAlignment.objects.create(
            alignment_group=alignment_group_1,
            experiment_sample=sample_1)

    ### Add alignment data. NOTE: Stored in sample model dir.
    # NOTE: This is a bit convoluted. Perhaps it would be better to store
    # alignments in the ExperimentSampleToAlignment directory.
    copy_dest = copy_dataset_to_entity_data_dir(sample_1, TEST_BAM)
    copy_dataset_to_entity_data_dir(sample_1, TEST_BAM_INDEX)
    add_dataset_to_entity(sample_alignment, Dataset.TYPE.BWA_ALIGN,
            Dataset.TYPE.BWA_ALIGN, copy_dest)

    # Create fake variants.
    create_fake_variants_and_variant_sets(ref_genome_1)

    #############################
    # Full VCF Testing (annotated for snpeff, variant filtering, etc)
    #############################

    # Create a new reference genome and samples using full_vcf_test_set.
    full_vcf_reference_genome = import_reference_genome_from_local_file(
            test_project, 'mg1655_tolC_through_zupT',
            FullVCFTestSet.TEST_GENBANK, 'genbank')

    # Create all samples.
    parent_obj = None
    full_vcf_samples = []
    for i in range(FullVCFTestSet.NUM_SAMPLES):
        sample_obj = ExperimentSample.objects.create(
                project=test_project, label='Sample %d' % i)
        sample_obj.data['SAMPLE_WELL'] = 'A0%d' % (i+1)

        # Sample 0 is the parent of all subsequent samples.
        if i == 0:
            parent_obj = sample_obj
        if i > 0:
            sample_obj.data['SAMPLE_PARENTS'] = parent_obj.label
            parent_obj.add_child(sample_obj)
            parent_obj.save()
        sample_obj.save()

        # Add raw reads to each sample.
        fastq1_dataset = copy_and_add_dataset_source(sample_obj,
                Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ1,
                FullVCFTestSet.FASTQ1[i])
        fastq2_dataset = copy_and_add_dataset_source(sample_obj,
                Dataset.TYPE.FASTQ2, Dataset.TYPE.FASTQ2,
                FullVCFTestSet.FASTQ2[i])

        # Run FASTQC on sample reads.
        run_fastqc_on_sample_fastq(sample_obj, fastq1_dataset)
        run_fastqc_on_sample_fastq(sample_obj, fastq2_dataset, rev=True)

        full_vcf_samples.append(sample_obj)

    # Run the alignment. Return the alignment group created, indexed by
    # the reference genome's uid.
    (full_vcf_alignment_group, pipeline_async_result) = run_pipeline(
            'test_align', full_vcf_reference_genome, full_vcf_samples)

    import_variant_set_from_vcf(full_vcf_reference_genome, 'Designed',
            FullVCFTestSet.TEST_DESIGNED_SNPS)

    def _create_region_intervals(region, interval_tuple_list):
        """Helper method to create RegionIntervals for a Region.

        Args:
            region: Region Model object.
            interval_tuple_list: List of tuples of intervals to create.
        """
        for interval in interval_tuple_list:
            RegionInterval.objects.create(
                    region=region, start=interval[0], end=interval[1])

    # Create some fake regions.
    # TODO: Should not be much harder to replace this with real regions.
    region_1 = Region.objects.create(
            reference_genome=full_vcf_reference_genome, label='region_1',
            type=Region.TYPE.POOR_MAPPING_QUALITY)
    _create_region_intervals(region_1, [(1,150), (300, 400), (500, 900)])

    region_2 = Region.objects.create(
            reference_genome=full_vcf_reference_genome, label='region_2',
            type=Region.TYPE.POOR_MAPPING_QUALITY)
    _create_region_intervals(region_2, [(1000, 1500)])

    region_3 = Region.objects.create(
            reference_genome=full_vcf_reference_genome, label='region_3',
            type=Region.TYPE.POOR_MAPPING_QUALITY)
    _create_region_intervals(region_3, [(1800, 1900), (2150, 2300)])

    # And some GENE regions.
    gene_A = Region.objects.create(
            reference_genome=full_vcf_reference_genome, label='geneA',
            type=Region.TYPE.GENE)
    _create_region_intervals(gene_A, [(2000, 2400)])

    gene_B = Region.objects.create(
            reference_genome=full_vcf_reference_genome, label='geneB',
            type=Region.TYPE.GENE)
    _create_region_intervals(gene_B, [(4800, 5200)])

    gene_C = Region.objects.create(
            reference_genome=full_vcf_reference_genome, label='geneC',
            type=Region.TYPE.GENE)
    _create_region_intervals(gene_C, [(1, 500)])

    # Bootstrap test_project_2 with SV stuff.
    sv_testing_bootstrap(test_project_2)
def _rerun_alignment(alignment_group):
    """Re-runs existing alignment.

    Returns an empty JSON response; the pipeline runs asynchronously.
    """
    run_pipeline(
            alignment_group.label,
            alignment_group.reference_genome,
            alignment_group.get_samples())
    return HttpResponse(json.dumps({}), content_type='application/json')