def _create_alignment(self, haploid=False):

        # Create a new alignment group.
        self.alignment_group = AlignmentGroup.objects.create(
            label='test alignment', reference_genome=self.REFERENCE_GENOME)

        if haploid:
            self.alignment_group.alignment_options['call_as_haploid'] = True

        # Create a sample.
        self.sample_1 = ExperimentSample.objects.create(
            uid=self.FAKE_READS_SAMPLE_UID,
            project=self.project,
            label='sample1')
        ### Add the raw reads
        copy_and_add_dataset_source(self.sample_1, Dataset.TYPE.FASTQ1,
                                    Dataset.TYPE.FASTQ1,
                                    self.FAKE_READS_FASTQ1)
        copy_and_add_dataset_source(self.sample_1, Dataset.TYPE.FASTQ2,
                                    Dataset.TYPE.FASTQ2,
                                    self.FAKE_READS_FASTQ2)

        # Create alignment to the sample.
        sample_alignment = ExperimentSampleToAlignment.objects.create(
            alignment_group=self.alignment_group,
            experiment_sample=self.sample_1)
        ### Add alignment data. NOTE: Stored in sample model dir.
        copy_dest = copy_dataset_to_entity_data_dir(self.sample_1,
                                                    self.FAKE_READS_BAM)
        copy_dataset_to_entity_data_dir(self.sample_1,
                                        self.FAKE_READS_BAM_INDEX)
        add_dataset_to_entity(sample_alignment, Dataset.TYPE.BWA_ALIGN,
                              Dataset.TYPE.BWA_ALIGN, copy_dest)
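# The examples in this listing share a pattern: create a model object, copy a
# backing file into an entity's data directory, then register that file as a
# Dataset via add_dataset_to_entity(). A minimal sketch of that flow, assuming
# the same helpers used above; the bam_path argument is a placeholder:
def _example_register_bam(sample, sample_alignment, bam_path):
    # Copy the BAM into the sample's data dir; the Dataset itself is attached
    # to the sample alignment (see the NOTE in the example above).
    copy_dest = copy_dataset_to_entity_data_dir(sample, bam_path)
    return add_dataset_to_entity(sample_alignment, Dataset.TYPE.BWA_ALIGN,
                                 Dataset.TYPE.BWA_ALIGN,
                                 filesystem_location=copy_dest)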
Example #3
def compute_insert_metrics(bam_file, sample_alignment, stderr=None):
    """Computes read fragment insert size distribution.

    Creates a Dataset for each of:
        * histogram file
        * file with mean and stdev comma-separated
    """
    histo_file = os.path.splitext(bam_file)[0] + '.insert_size_histogram.txt'
    mean_stdev_file = (os.path.splitext(bam_file)[0] +
            '.insert_size_mean_stdev.txt')

    # First, we analyze the bam distribution.
    read_bam_cmd = [
            settings.SAMTOOLS_BINARY,
            'view',
            bam_file
    ]
    p1 = Popen(read_bam_cmd, stdout=PIPE, stderr=stderr)

    read_length = get_read_length(bam_file)

    pairend_distro_cmd = [
        settings.LUMPY_PAIREND_DISTRO_BIN,
        '-r', str(read_length),
        '-X', '4', # num stdevs from end to extend
        '-N', '10000', # number to sample
        '-o', histo_file
    ]
    p2 = Popen(pairend_distro_cmd, stdin=p1.stdout, stdout=PIPE, stderr=stderr)

    # Allow p1 to receive a SIGPIPE if p2 exits.
    p1.stdout.close()

    # Run the command and get mean, stdev
    mean_and_stdev_str = p2.communicate()[0]
    raw_mean, raw_stdev = mean_and_stdev_str.split('\t')
    mean = int(float(raw_mean.split(':')[1].strip()))
    stdev = int(float(raw_stdev.split(':')[1].strip()))

    # Lumpy doesn't like stdev of 0.
    if stdev < 1:
        stdev = 1

    # Save the histogram file as a Dataset.
    add_dataset_to_entity(sample_alignment,
            Dataset.TYPE.LUMPY_INSERT_METRICS_HISTOGRAM,
            Dataset.TYPE.LUMPY_INSERT_METRICS_HISTOGRAM,
            filesystem_location=histo_file)

    # Write mean, stdev to another file and create another Dataset.
    with open(mean_stdev_file, 'w') as fh:
        fh.write("%d,%d" % (mean, stdev))
    add_dataset_to_entity(sample_alignment,
            Dataset.TYPE.LUMPY_INSERT_METRICS_MEAN_STDEV,
            Dataset.TYPE.LUMPY_INSERT_METRICS_MEAN_STDEV,
            filesystem_location=mean_stdev_file)
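# A hypothetical reader for the mean/stdev Dataset written above; the file
# contains "mean,stdev" as two comma-separated integers, so parsing it back
# is straightforward (sketch only, not part of the original module):
def _read_insert_metrics(mean_stdev_file):
    with open(mean_stdev_file) as fh:
        mean, stdev = [int(x) for x in fh.read().strip().split(',')]
    return mean, stdev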
Example #4
    def test_run_alignment_with_spaces_in_genbank_filename(self):
        project = self.common_entities['project']
        ref_genome_label = 'dirty_upload'
        request = HttpRequest()
        request.POST = {
            'projectUid': project.uid,
            'refGenomeLabel': ref_genome_label,
            'importFileFormat': 'genbank'
        }
        request.method = 'POST'
        request.user = self.common_entities['user']
        authenticate(username=TEST_USERNAME, password=TEST_PASSWORD)
        self.assertTrue(request.user.is_authenticated())

        request.FILES['refGenomeFile'] = UploadedFile(
            file=open(TEST_GENBANK), name='dirty_genbank (spaces).gb')

        response = create_ref_genome_from_browser_upload(request)
        self.assertEqual(STATUS_CODE__SUCCESS, response.status_code)
        self.assertFalse(json.loads(response.content).get('error', False))

        # Get reference genome
        ref_genome = ReferenceGenome.objects.get(project=project,
                                                 label=ref_genome_label)

        # Create sample model
        sample = ExperimentSample.objects.create(project=project,
                                                 label='test_sample')

        # Add fastq datasets to sample
        add_dataset_to_entity(sample,
                              Dataset.TYPE.FASTQ1,
                              Dataset.TYPE.FASTQ1,
                              filesystem_location=TEST_DIRTY_FQ_1)

        # Add fastq datasets to sample
        add_dataset_to_entity(sample,
                              Dataset.TYPE.FASTQ2,
                              Dataset.TYPE.FASTQ2,
                              filesystem_location=TEST_DIRTY_FQ_2)

        # Run alignment of sample to reference
        alignment_group_label = 'test_alignment'
        sample_list = [sample]

        result = run_pipeline(alignment_group_label, ref_genome, sample_list)

        alignment_group = result[0]
        alignment_async_result = result[1]
        variant_calling_async_result = result[2]
        alignment_async_result.get()
        variant_calling_async_result.get()
        alignment_group = AlignmentGroup.objects.get(uid=alignment_group.uid)
        self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
                         alignment_group.status)
Example #5
    def _perform_assembly(self, data_dict):

        ref_fasta = data_dict['ref_fasta']
        fq_1 = data_dict['fq_1']
        fq_2 = data_dict['fq_2']

        # Import reference genome
        ref_genome = import_reference_genome_from_local_file(
                self.project, 'test_ref',
                ref_fasta, 'fasta', move=False)

        # Create sample model
        sample = ExperimentSample.objects.create(
                project=self.project,
                label='test_sample')

        # Add fastq datasets to sample
        add_dataset_to_entity(
                sample,
                Dataset.TYPE.FASTQ1,
                Dataset.TYPE.FASTQ1,
                filesystem_location=fq_1)
        add_dataset_to_entity(
                sample,
                Dataset.TYPE.FASTQ2,
                Dataset.TYPE.FASTQ2,
                filesystem_location=fq_2)

        # Run alignment of sample to reference
        alignment_group_label = 'test_alignment'
        sample_list = [sample]
        alignment_group, _, _ = run_pipeline(
                alignment_group_label, ref_genome, sample_list,
                perform_variant_calling=False, alignment_options={})

        # Get resulting ExperimentSampleToAlignment
        sample_align = ExperimentSampleToAlignment.objects.get(
                alignment_group=alignment_group,
                experiment_sample=sample)

        # Run pipeline and wait on result
        async_result = run_de_novo_assembly_pipeline([sample_align])
        async_result.get()

        # Retrieve contigs
        contigs = Contig.objects.filter(
                parent_reference_genome=ref_genome,
                experiment_sample_to_alignment=sample_align)

        return contigs
Example #6
    def _get_or_create_sv_dataset(key):
        dataset_query = sample_alignment.dataset_set.filter(type=key)

        if dataset_query.exists() and not overwrite or (
                dataset_query.exists() and
                key not in sv_indicant_class_to_generator):
            assert len(dataset_query) == 1
            return dataset_query[0]
        elif dataset_query.exists() and overwrite and (
                key in sv_indicant_class_to_generator):
            assert len(dataset_query) == 1
            dataset_query[0].delete()

        if (overwrite and key in sv_indicant_class_to_generator) or (
                not dataset_query.exists()):
            dataset_path = '.'.join([
                    alignment_file_prefix,
                    sv_indicant_class_to_filename_suffix[key],
                    'bam'
                    ])
            generator = sv_indicant_class_to_generator[key]
            generator(alignment_bam, dataset_path)

            return add_dataset_to_entity(
                    sample_alignment,
                    key,
                    key,
                    filesystem_location=dataset_path)
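# Usage sketch (hypothetical): _get_or_create_sv_dataset is a closure over
# names defined in its enclosing function (sample_alignment, overwrite,
# alignment_bam, alignment_file_prefix, and the sv_indicant_class_to_* maps),
# so a driver inside that enclosing scope could be as simple as:
#
#     sv_datasets = [_get_or_create_sv_dataset(key)
#                    for key in sv_indicant_class_to_generator]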
Example #8
def cov_detect_deletion_make_vcf(sample_alignment):
    """Uses coverage data to call large deletions and
    creates a VCF_COV_DETECT_DELETIONS dataset for the sample alignment

    Args:
        sample_alignment: ExperimentSampleToAlignment instance
    """
    # Don't proceed if processing this sample alignment previously failed or
    # in another async process.
    sample_alignment = ExperimentSampleToAlignment.objects.get(
            uid=sample_alignment.uid)
    if (sample_alignment.data.get('assembly_status') ==
            ExperimentSampleToAlignment.ASSEMBLY_STATUS.FAILED):
        return

    # Set assembly status for UI
    # NOTE: Setting this status is playing whack-a-mole against other async sv
    # detection functions, e.g. assembly.generate_contigs().
    set_assembly_status(
                sample_alignment,
                ExperimentSampleToAlignment.ASSEMBLY_STATUS.ANALYZING_COVERAGE)

    print "Generating coverage data\n"
    chrom_regions = get_deleted_regions(sample_alignment)
    var_dict_list = make_var_dict_list(
            chrom_regions,
            get_fasta(sample_alignment.alignment_group.reference_genome))

    if var_dict_list:

        vcf_path = os.path.join(
            sample_alignment.get_model_data_dir(),
            'cov_detect_deletion.vcf')

        # Write variant dicts to vcf
        export_var_dict_list_as_vcf(var_dict_list, vcf_path,
                sample_alignment, CUSTOM_SV_METHOD__COVERAGE)

        # Make dataset for contigs vcf
        new_dataset = add_dataset_to_entity(
                sample_alignment,
                Dataset.TYPE.VCF_COV_DETECT_DELETIONS,
                Dataset.TYPE.VCF_COV_DETECT_DELETIONS,
                vcf_path)

        new_dataset.save()

    # Update status again if not FAILED.
    sample_alignment = ExperimentSampleToAlignment.objects.get(
            uid=sample_alignment.uid)
    if (sample_alignment.data.get('assembly_status') !=
            ExperimentSampleToAlignment.ASSEMBLY_STATUS.FAILED):
        set_assembly_status(
                sample_alignment,
                ExperimentSampleToAlignment.ASSEMBLY_STATUS.WAITING_TO_PARSE)
def make_contig_reads_dataset(contig, sv_indicant_reads_in_contig):
    '''
    Using the contig reads generated by extract_contig_reads(),
    generate a BAM file, then coordinate-sort and index it.
    '''
    # Get bam filename
    extracted_reads_bam_file = os.path.join(
            contig.get_model_data_dir(),
            'sv_indicants.bam')

    bwa_align_bam = contig.experiment_sample_to_alignment.dataset_set.get(
            type=Dataset.TYPE.BWA_ALIGN).get_absolute_location()
    sam_file = pysam.AlignmentFile(bwa_align_bam)

    # Write extracted reads into bam file
    extracted_reads_alignment_file = pysam.AlignmentFile(
            extracted_reads_bam_file, "wb", template=sam_file)
    sam_file.close()

    for read in sv_indicant_reads_in_contig:
        extracted_reads_alignment_file.write(read)

    extracted_reads_alignment_file.close()

    coordinate_sorted_bam = (os.path.splitext(extracted_reads_bam_file)[0] +
            '.coordinate_sorted.bam')
    sort_bam_by_coordinate(extracted_reads_bam_file, coordinate_sorted_bam)
    index_bam(coordinate_sorted_bam)

    # Add the bam file to contig as BWA_SV_INDICANTS dataset, overwriting it
    # if it already exists
    dataset_query = contig.dataset_set.filter(
            type=Dataset.TYPE.BWA_SV_INDICANTS)
    if dataset_query.count():
        dataset_query[0].delete()

    add_dataset_to_entity(contig,
            Dataset.TYPE.BWA_SV_INDICANTS,
            Dataset.TYPE.BWA_SV_INDICANTS,
            filesystem_location=coordinate_sorted_bam)
def _make_fake_contig(label, esta):
    c = Contig.objects.create(label=label,
                              parent_reference_genome=ag.reference_genome,
                              experiment_sample_to_alignment=esta)
    c.metadata['coverage'] = random.random() * 100

    # Add fasta.
    c.ensure_model_data_dir_exists()

    # Random sequence.
    num_bases = random.randint(0, 100)
    seq = Seq(''.join([random.choice('ATCG') for i in range(num_bases)]))
    seq_record = SeqRecord(seq, id=c.uid)
    dataset_path = os.path.join(c.get_model_data_dir(), 'fasta.fa')
    with open(dataset_path, 'w') as fh:
        SeqIO.write(seq_record, fh, 'fasta')
    add_dataset_to_entity(c,
                          'contig_fasta',
                          Dataset.TYPE.REFERENCE_GENOME_FASTA,
                          filesystem_location=dataset_path)

    c.save()
def make_altalign_dataset(sample_alignment):

    sample_alignment_bam = sample_alignment.dataset_set.get(
        type=Dataset.TYPE.BWA_ALIGN).get_absolute_location()
    alignment_file_prefix = os.path.join(sample_alignment.get_model_data_dir(),
                                         'bwa_align')
    altalign_bam = '.'.join([alignment_file_prefix, 'altalign', 'bam'])
    get_altalign_reads(sample_alignment_bam, altalign_bam)

    return add_dataset_to_entity(sample_alignment,
                                 Dataset.TYPE.BWA_ALTALIGN,
                                 Dataset.TYPE.BWA_ALTALIGN,
                                 filesystem_location=altalign_bam)
    def _create_samples(self, fq_1, fq_2, num=1):

        sample_list = []
        for sample_num in range(num):
            sample = ExperimentSample.objects.create(
                    project=self.project,
                    label='test_sample_' + str(sample_num))

            # Add fastq datasets to sample
            add_dataset_to_entity(
                    sample,
                    Dataset.TYPE.FASTQ1,
                    Dataset.TYPE.FASTQ1,
                    filesystem_location=fq_1)
            add_dataset_to_entity(
                    sample,
                    Dataset.TYPE.FASTQ2,
                    Dataset.TYPE.FASTQ2,
                    filesystem_location=fq_2)

            sample_list.append(sample)

        return sample_list
    def test_end_to_end(self):
        """Test running full pipline on small-ish data.

        The data file consists of 20,000 bases. At 5,000 bases there is
        a 400 base deletion. At 10,000 bases there is a 400 base inversion.
        At 15,000 bases there is a 400 base tandem duplication.

        It seems that Pindel cannot find the inversion. Fortunately,
        delly can usually find inversions. Unfortunately, delly only
        works well on large data, so we will not test it here.
        """
        # Create a new alignment group.
        alignment_group = AlignmentGroup.objects.create(
            label='test alignment', reference_genome=self.reference_genome)

        # Create a sample.
        sample_1 = ExperimentSample.objects.create(uid=TEST_SAMPLE_UID,
                                                   project=self.project,
                                                   label='sample1')
        ### Add the raw reads
        copy_and_add_dataset_source(sample_1, Dataset.TYPE.FASTQ1,
                                    Dataset.TYPE.FASTQ1, TEST_FASTQ1)
        copy_and_add_dataset_source(sample_1, Dataset.TYPE.FASTQ2,
                                    Dataset.TYPE.FASTQ2, TEST_FASTQ2)

        # Create relationship between alignment and sample.
        sample_alignment = ExperimentSampleToAlignment.objects.create(
            alignment_group=alignment_group, experiment_sample=sample_1)
        ### Add alignment data. NOTE: Stored in sample model dir.

        # index (no dataset)
        copy_dataset_to_entity_data_dir(sample_1, TEST_BAM_INDEX)

        # bam file (with dataset)
        copy_dest = copy_dataset_to_entity_data_dir(sample_1, TEST_BAM)
        add_dataset_to_entity(sample_alignment, Dataset.TYPE.BWA_ALIGN,
                              Dataset.TYPE.BWA_ALIGN, copy_dest)

        # Make sure there are no variants before.
        self.assertEqual(
            0,
            len(Variant.objects.filter(
                reference_genome=self.reference_genome)))

        # Test with Pindel only for now.
        for tool in ['pindel']:
            find_variants_with_tool(alignment_group,
                                    VARIANT_TOOL_PARAMS_MAP[tool],
                                    project=self.project)

        # Check that the alignment group has a Pindel VCF dataset associated
        # with it.
        vcf_dataset = get_dataset_with_type(alignment_group,
                                            Dataset.TYPE.VCF_PINDEL)
        self.assertIsNotNone(vcf_dataset)

        # Make sure the .vcf file actually exists.
        self.assertTrue(os.path.exists(vcf_dataset.get_absolute_location()))

        # Make sure the vcf is valid by reading it using pyvcf.
        with open(vcf_dataset.get_absolute_location()) as vcf_fh:
            try:
                reader = vcf.Reader(vcf_fh)
                reader.next()
            except:
                self.fail("Not valid vcf")

        # Grab the resulting variants.
        variants = Variant.objects.filter(
            reference_genome=self.reference_genome)

        # Confirm that 2 variants found.
        self.assertEqual(2, len(variants))

        variant_map = {}
        for variant in variants:
            variant_alternates = VariantAlternate.objects.filter(
                variant=variant)

            # There should be only one variant alternate per SV.
            self.assertEqual(len(variant_alternates), 1)

            pos = variant.position
            svtype = variant_alternates[0].data['INFO_SVTYPE']
            svlen = variant_alternates[0].data['INFO_SVLEN']
            variant_map[svtype] = (pos, svlen)

        # Check that there is a deletion around base 5000.
        self.assertTrue('DEL' in variant_map)
        self.assertTrue(abs(variant_map['DEL'][0] - 5000) <= 3)
        self.assertTrue(abs(variant_map['DEL'][1] - 400) <= 3)

        # Check that there is a tandem duplication around base 15000.
        self.assertTrue('DUP:TANDEM' in variant_map)
        self.assertTrue(abs(variant_map['DUP:TANDEM'][0] - 15000) <= 3)
        self.assertTrue(abs(variant_map['DUP:TANDEM'][1] - 400) <= 3)
def align_contig_reads_to_contig(contig):

    # Get fasta of reads used to make contig
    contig_reads_fasta = os.path.join(
            contig.get_model_data_dir(),
            'extracted_reads.fa')

    # Pull out contig read qnames and put in dictionary contig_reads
    p1 = re.compile('>(\S+)/(\d)')
    contig_reads = defaultdict(list)
    with open(contig_reads_fasta) as fh:
        for line in fh:
            m1 = p1.match(line)
            if m1:
                read_id = m1.group(1)
                read_number = int(m1.group(2))
                contig_reads[read_id].append(read_number)

    # Get source reads fastqs
    sample = contig.experiment_sample_to_alignment.experiment_sample
    source_fq1 = sample.dataset_set.get(
            type=Dataset.TYPE.FASTQ1).get_absolute_location()
    source_fq2_query = sample.dataset_set.filter(
            type=Dataset.TYPE.FASTQ2)
    is_paired_end = source_fq2_query.exists()
    if is_paired_end:
        source_fq2 = source_fq2_query[0].get_absolute_location()

    # Make filenames for contig read fastqs
    output_fq1 = os.path.join(
            contig.get_model_data_dir(),
            'reads.1.fq')
    if is_paired_end:
        output_fq2 = os.path.join(
                contig.get_model_data_dir(),
                'reads.2.fq')

    # Go through source fastqs and write reads in contig_reads to file
    source_fq_list = [source_fq1]
    output_fq_list = [output_fq1]
    if is_paired_end:
        source_fq_list.append(source_fq2)
        output_fq_list.append(output_fq2)

    p1 = re.compile('@(\S+)')
    for input_fq_path, output_fq_path in zip(source_fq_list, output_fq_list):
        if input_fq_path.endswith('.fq'):
            file_like = open(input_fq_path)
        elif input_fq_path.endswith('.gz'):
            file_like = gzip.open(input_fq_path)
        else:
            raise Exception('Compression type not supported')

        with file_like as in_fh, \
             open(output_fq_path, 'w') as out_fh:
            for line in in_fh:
                m1 = p1.match(line)
                if m1:
                    qname = m1.group(1)
                    if qname in contig_reads:
                        out_fh.write(line)
                        out_fh.write(in_fh.next())
                        out_fh.write(in_fh.next())
                        out_fh.write(in_fh.next())

    # Align fastqs to contig fasta
    contig_fasta = contig.dataset_set.get(
            type=Dataset.TYPE.REFERENCE_GENOME_FASTA).get_absolute_location()
    contig_reads_to_contig_bam = os.path.join(
            contig.get_model_data_dir(),
            'reads_to_contig.bam')
    simple_align_paired_with_bwa_mem(
            output_fq_list,
            contig_fasta,
            contig_reads_to_contig_bam)

    # Coordinate sort and index bam for jbrowse
    coordinate_sorted_bam = (os.path.splitext(contig_reads_to_contig_bam)[0] +
            '.coordinate_sorted.bam')
    sort_bam_by_coordinate(contig_reads_to_contig_bam, coordinate_sorted_bam)
    index_bam(coordinate_sorted_bam)

    # Add the bam file to contig as BWA_ALIGN dataset, overwriting it
    # if it already exists
    dataset_query = contig.dataset_set.filter(
            type=Dataset.TYPE.BWA_ALIGN)
    if dataset_query.count():
        dataset_query[0].delete()

    add_dataset_to_entity(
            contig,
            Dataset.TYPE.BWA_ALIGN,
            Dataset.TYPE.BWA_ALIGN,
            filesystem_location=coordinate_sorted_bam)
def bootstrap_fake_data():
    """Fill the database with fake data.
    """
    user = get_or_create_user()

    ### Create some projects
    (test_project, project_created) = Project.objects.get_or_create(
            title=TEST_PROJECT_NAME, owner=user.get_profile())
    (test_project_2, project_created) = Project.objects.get_or_create(
            title=SV_PROJECT_NAME, owner=user.get_profile())

    ### Create some reference genomes
    ref_genome_1 = import_reference_genome_from_local_file(
            test_project, REF_GENOME_1_LABEL, TEST_FASTA, 'fasta')

    ref_genome_2 = import_reference_genome_from_local_file(
            test_project, REF_GENOME_2_LABEL, TEST_FASTA, 'fasta')

    ref_genome_3 = import_reference_genome_from_local_file(
            test_project, 'test_genome', TEST_FASTA, 'fasta')

    ### Create some saved queries.
    for saved_query_text in CUSTOM_SAVED_QUERY_LIST:
        SavedVariantFilterQuery.objects.get_or_create(
                owner=user.get_profile(),
                text=saved_query_text)

    ### Create some ExperimentSamples.

    # Create some samples without backing data just to explore the UI.
    ExperimentSample.objects.create(
            project=test_project,
            label='C321D_MiSeq',
            data={'SAMPLE_WELL': 'A01'}
    )

    ExperimentSample.objects.create(
            project=test_project,
            label='C321D Fixed 01',
            data={'SAMPLE_WELL': 'A02'}
    )

    ExperimentSample.objects.create(
            project=test_project,
            label='C321D Fixed 02',
            data={'SAMPLE_WELL': 'A03'}
    )

    # Create some samples with backing data.
    (sample_1, created) = ExperimentSample.objects.get_or_create(
            project=test_project,
            label=SAMPLE_1_LABEL)
    # Add datasets to the samples.
    if not sample_1.dataset_set.filter(type=Dataset.TYPE.FASTQ1):
        copy_and_add_dataset_source(sample_1, Dataset.TYPE.FASTQ1,
                Dataset.TYPE.FASTQ1, TEST_FASTQ1)
    if not sample_1.dataset_set.filter(type=Dataset.TYPE.FASTQ2):
        copy_and_add_dataset_source(sample_1, Dataset.TYPE.FASTQ2,
                Dataset.TYPE.FASTQ2, TEST_FASTQ2)

    # Create sample backed by g-zipped data.
    gz_backed_sample = ExperimentSample.objects.create(
            project=test_project,
            label='sample backed by gz data')
    gz_fastq1_dataset = copy_and_add_dataset_source(
            gz_backed_sample, Dataset.TYPE.FASTQ1, Dataset.TYPE.FASTQ1,
            TEST_FASTQ_GZ_1)
    gz_fastq2_dataset = copy_and_add_dataset_source(
            gz_backed_sample, Dataset.TYPE.FASTQ2, Dataset.TYPE.FASTQ2,
            TEST_FASTQ_GZ_2)
    run_fastqc_on_sample_fastq(gz_backed_sample, gz_fastq1_dataset)
    run_fastqc_on_sample_fastq(gz_backed_sample, gz_fastq2_dataset, rev=True)

    ### Create an alignment.
    alignment_group_1 = AlignmentGroup.objects.create(
            label='Alignment 1',
            reference_genome=ref_genome_3,
            aligner=AlignmentGroup.ALIGNER.BWA)
    # Link it to a sample.
    sample_alignment = ExperimentSampleToAlignment.objects.create(
            alignment_group=alignment_group_1,
            experiment_sample=sample_1)
    ### Add alignment data. NOTE: Stored in sample model dir.
    # NOTE: This is a bit convoluted. Perhaps it would be better to store alignments
    # in the ExperimentSampleToAlignment directory.
    copy_dest = copy_dataset_to_entity_data_dir(sample_1, TEST_BAM)
    copy_dataset_to_entity_data_dir(sample_1, TEST_BAM_INDEX)
    add_dataset_to_entity(sample_alignment, Dataset.TYPE.BWA_ALIGN,
            Dataset.TYPE.BWA_ALIGN, copy_dest)

    # Create fake variants.
    create_fake_variants_and_variant_sets(ref_genome_1)

    #############################
    # Full VCF Testing (annotated for snpeff, variant filtering, etc)
    #############################

    # Create a new reference genome and samples using full_vcf_test_set
    full_vcf_reference_genome = import_reference_genome_from_local_file(
                test_project, 'mg1655_tolC_through_zupT',
                FullVCFTestSet.TEST_GENBANK, 'genbank')

    # Create all samples.
    parent_obj = None
    full_vcf_samples = []
    for i in range(FullVCFTestSet.NUM_SAMPLES):
        sample_obj = ExperimentSample.objects.create(
                project=test_project,
                label='Sample %d' % i)

        sample_obj.data['SAMPLE_WELL'] = 'A0%d' % (i+1)

        if i == 0:
            parent_obj = sample_obj
        if i > 0:
            sample_obj.data['SAMPLE_PARENTS'] = parent_obj.label
            parent_obj.add_child(sample_obj)
            parent_obj.save()

        sample_obj.save()

        # Add raw reads to each sample.
        fastq1_dataset = copy_and_add_dataset_source(sample_obj,
                Dataset.TYPE.FASTQ1,
                Dataset.TYPE.FASTQ1,
                FullVCFTestSet.FASTQ1[i])
        fastq2_dataset = copy_and_add_dataset_source(sample_obj,
                Dataset.TYPE.FASTQ2,
                Dataset.TYPE.FASTQ2,
                FullVCFTestSet.FASTQ2[i])

        # Run FASTQC on sample reads.
        run_fastqc_on_sample_fastq(sample_obj, fastq1_dataset)
        run_fastqc_on_sample_fastq(sample_obj, fastq2_dataset, rev=True)

        full_vcf_samples.append(sample_obj)

    # Run the alignment. Return the alignment group created, indexed by the
    # reference genome's uid.
    (full_vcf_alignment_group, pipeline_async_result) = run_pipeline(
            'test_align', full_vcf_reference_genome, full_vcf_samples)

    import_variant_set_from_vcf(full_vcf_reference_genome, 'Designed',
            FullVCFTestSet.TEST_DESIGNED_SNPS)

    def _create_region_intervals(region, interval_tuple_list):
        """Helper method to create RegionIntervals for a Region.

        Args:
            region: Region Model object.
            interval_tuple_list: List of tuples of intervals to create.
        """
        for interval in interval_tuple_list:
            RegionInterval.objects.create(
                    region=region,
                    start=interval[0],
                    end=interval[1])

    # Create some fake regions.
    # TODO: Should not be much harder to replace this with real regions.
    region_1 = Region.objects.create(
        reference_genome=full_vcf_reference_genome,
        label='region_1',
        type=Region.TYPE.POOR_MAPPING_QUALITY)
    _create_region_intervals(region_1, [(1,150), (300, 400), (500, 900)])

    region_2 = Region.objects.create(
        reference_genome=full_vcf_reference_genome,
        label='region_2',
        type=Region.TYPE.POOR_MAPPING_QUALITY)
    _create_region_intervals(region_2, [(1000, 1500)])

    region_3 = Region.objects.create(
        reference_genome=full_vcf_reference_genome,
        label='region_3',
        type=Region.TYPE.POOR_MAPPING_QUALITY)
    _create_region_intervals(region_3, [(1800, 1900), (2150, 2300)])

    # And some GENE regions.

    gene_A = Region.objects.create(
        reference_genome=full_vcf_reference_genome,
        label='geneA',
        type=Region.TYPE.GENE)
    _create_region_intervals(gene_A, [(2000, 2400)])

    gene_B = Region.objects.create(
        reference_genome=full_vcf_reference_genome,
        label='geneB',
        type=Region.TYPE.GENE)
    _create_region_intervals(gene_B, [(4800, 5200)])

    gene_C = Region.objects.create(
        reference_genome=full_vcf_reference_genome,
        label='geneC',
        type=Region.TYPE.GENE)
    _create_region_intervals(gene_C, [(1, 500)])

    # Bootstrap test_project_2 with SV stuff
    sv_testing_bootstrap(test_project_2)
Example #21
def evaluate_contigs(contig_uid_list, skip_extracted_read_alignment=False,
        use_read_alignment=True):

    if not contig_uid_list:
        return

    def _length_weighted_coverage(contig):
        return contig.num_bases * contig.coverage

    # Request contig_list from db and order by highest length weighted coverage
    contig_list = list(Contig.objects.filter(uid__in=contig_uid_list))
    contig_list.sort(key=_length_weighted_coverage, reverse=True)

    # All contigs have the same sample_alignment, so grab the sample alignment
    # from the first one.
    contig = contig_list[0]
    sample_alignment = contig.experiment_sample_to_alignment
    ref_genome = sample_alignment.alignment_group.reference_genome

    # Attempt placing contigs. Get back placeable contigs,
    # translocation variants (dict obj), and mobile elements translocation
    # variants (dict obj).
    placeable_contig_uid_list, var_dict_list, me_var_dict_list = graph_contig_placement(
            contig_uid_list, skip_extracted_read_alignment, use_read_alignment)

    # update contig list with new features from graph_contig_placement
    contig_list = list(Contig.objects.filter(uid__in=contig_uid_list))
    contig_list.sort(key=_length_weighted_coverage, reverse=True)

    # Annotate contigs with the names of any genes they fall within 50 bp of.
    annotate_contig_junctions(contig_uid_list, ref_genome, dist=50)

    # Handle placeable contigs, if any.
    if len(placeable_contig_uid_list):
        placeable_contigs = Contig.objects.filter(
                uid__in=placeable_contig_uid_list)

        for contig in placeable_contigs:
            contig.metadata['is_placeable'] = True
            contig.save()

        placeable_contig_vcf_path = os.path.join(
                sample_alignment.get_model_data_dir(),
                'de_novo_assembled_contigs.vcf')

        # Write contigs to vcf
        export_contig_list_as_vcf(placeable_contigs, placeable_contig_vcf_path)

        # Make dataset for contigs vcf
        add_dataset_to_entity(
                sample_alignment,
                Dataset.TYPE.VCF_DE_NOVO_ASSEMBLED_CONTIGS,
                Dataset.TYPE.VCF_DE_NOVO_ASSEMBLED_CONTIGS,
                placeable_contig_vcf_path)

    # Handle other types of contig objects, if any.
    var_dict_vcf_path = os.path.join(
            sample_alignment.get_model_data_dir(),
            'de_novo_assembly_translocations.vcf')

    me_var_dict_vcf_path = os.path.join(
            sample_alignment.get_model_data_dir(),
            'de_novo_assembly_me_translocations.vcf')

    for var_dl, method, path, dataset_type in [
            (var_dict_list, 'GRAPH_WALK', var_dict_vcf_path,
                    Dataset.TYPE.VCF_DE_NOVO_ASSEMBLY_GRAPH_WALK),
            (me_var_dict_list, 'ME_GRAPH_WALK', me_var_dict_vcf_path,
                    Dataset.TYPE.VCF_DE_NOVO_ASSEMBLY_ME_GRAPH_WALK)]:

        if not var_dl:
            continue

        # Write variant dicts to vcf
        export_var_dict_list_as_vcf(
                var_dl, path,
                sample_alignment,
                method)

        # Make dataset for contigs vcf
        add_dataset_to_entity(
                sample_alignment,
                dataset_type,
                dataset_type,
                path)
Example #22
def assemble_with_velvet(assembly_dir, velvet_opts, sv_indicants_bam,
        sample_alignment, overwrite=True, reassemble_contig_from_reads=False):
    # NOTE: Unused. If enabled, this will call
    # make_contig_reads_to_ref_alignments(), which is not used anywhere
    # currently due to performance issues that are particularly bad when
    # there are many unused reads.
    assert not reassemble_contig_from_reads

    timestamp = str(datetime.datetime.now())
    contig_number_pattern = re.compile('^NODE_(\d+)_')

    reference_genome = sample_alignment.alignment_group.reference_genome

    contig_files = []
    contig_uid_list = []

    _run_velvet(assembly_dir, velvet_opts, sv_indicants_bam)

    # Collect resulting contigs fasta
    contigs_fasta = os.path.join(assembly_dir, 'contigs.fa')
    contig_files.append(contigs_fasta)

    records = list(SeqIO.parse(contigs_fasta, 'fasta'))
    digits = len(str(len(records))) + 1

    for (i, seq_record) in enumerate(records, 1):

        # Extract contig sequence from the contigs.fa file, number, and
        # name it.

        contig_node_number = int(
                    contig_number_pattern.findall(
                            seq_record.description)[0])
        coverage = float(seq_record.description.rsplit('_', 1)[1])
        seq_record.seq = reduce(
                lambda x, y: x + y,
                [seq for seq in seq_record.seq.split('N')])
        seq_record.id = seq_record.name = seq_record.description = (
                'NODE_' + str(i))
        leading_zeros = digits - len(str(i))
        contig_label = '%s_%s' % (
                sample_alignment.experiment_sample.label,
                leading_zeros * '0' + str(i))

        # Create model and metadata.

        contig = Contig.objects.create(
                label=contig_label,
                parent_reference_genome=reference_genome,
                experiment_sample_to_alignment=(
                        sample_alignment))
        contig.metadata['coverage'] = coverage
        contig.metadata['timestamp'] = timestamp
        contig.metadata['node_number'] = contig_node_number
        contig.metadata['assembly_dir'] = assembly_dir

        contig.ensure_model_data_dir_exists()

        # NOTE: Unused code.
        # Reassemble the contig from its constituent reads separately,
        # using a second velvet call.
        # if reassemble_contig_from_reads:
        #     # 1. Grab reads from velvet to reassemble the contig
        #     make_contig_reads_to_ref_alignments(contig,
        #             add_jbrowse_track=False, overwrite=overwrite)
        #     contig_reads_bam = os.path.join(
        #             contig.get_model_data_dir(),
        #             'sv_indicants.bam')

        #     # 2. Reassemble the contig from its whole reads using velvet -
        #     # this generates longer contigs because the graph will trim the
        #     # edges if there is a branchpoint. With only one node it should
        #     # be very fast.
        #     _run_velvet(contig.get_model_data_dir(), velvet_opts,
        #             contig_reads_bam)
        #     reassembled_seqrecord = _extract_node_from_contig_reassembly(
        #             contig)
        #     if reassembled_seqrecord:
        #         seq_record.seq = reassembled_seqrecord.seq

        # Write the contig fasta and add it as a dataset to the contig object.

        dataset_path = os.path.join(contig.get_model_data_dir(),
                'fasta.fa')

        with open(dataset_path, 'w') as fh:
            SeqIO.write([seq_record], fh, 'fasta')

        add_dataset_to_entity(
                contig,
                'contig_fasta',
                Dataset.TYPE.REFERENCE_GENOME_FASTA,
                filesystem_location=dataset_path)

        contig.save()

        # NOTE: Disabled for now. Severe performance issues.
        # Make a bam track on the reference for each contig that shows only the
        # reads that assembled the contig and their mates
        # make_contig_reads_to_ref_alignments(contig.uid)

        # append the uid to the contig_uid_list
        contig_uid_list.append(contig.uid)

    # once contigs are extracted, remove velvet data
    _cleanup_velvet_dir(assembly_dir)

    return contig_uid_list
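# The contig UIDs returned above are what the placement step consumes; a
# hedged sketch of chaining assembly into placement, using evaluate_contigs()
# as defined earlier in this listing (arguments here are placeholders):
#
#     contig_uids = assemble_with_velvet(assembly_dir, velvet_opts,
#                                        sv_indicants_bam, sample_alignment)
#     evaluate_contigs(contig_uids)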
Example #23
    def _run_contig_walk_test(self, test_dir):

        ref_fasta = os.path.join(test_dir, 'ref.fa')
        target_fasta = os.path.join(test_dir, 'target.fa')
        contig_fasta_list = []
        i = 0
        contig_fasta_path = os.path.join(test_dir, 'contig_' + str(i) + '.fa')
        while os.path.exists(contig_fasta_path):
            contig_fasta_list.append(contig_fasta_path)
            i += 1
            contig_fasta_path = os.path.join(test_dir,
                    'contig_' + str(i) + '.fa')

        dummy_models = self._make_dummy_models()
        reference_genome = dummy_models['reference_genome']
        sample_alignment = dummy_models['sample_alignment']
        alignment_group = dummy_models['alignment_group']

        add_dataset_to_entity(
                reference_genome,
                Dataset.TYPE.REFERENCE_GENOME_FASTA,
                Dataset.TYPE.REFERENCE_GENOME_FASTA,
                filesystem_location=ref_fasta)

        # Make data_dir directory to house genome_finishing files
        assembly_dir = os.path.join(
                sample_alignment.get_model_data_dir(),
                'assembly')

        # Make assembly directory
        os.mkdir(assembly_dir)

        data_dir = os.path.join(assembly_dir, '0')
        os.mkdir(data_dir)

        # Create contigs
        contig_list = []
        for i, contig_fasta in enumerate(contig_fasta_list):
            contig = Contig.objects.create(
                parent_reference_genome=reference_genome,
                experiment_sample_to_alignment=sample_alignment,
                label='test_contig_' + str(i))
            add_dataset_to_entity(
                    contig,
                    Dataset.TYPE.REFERENCE_GENOME_FASTA,
                    Dataset.TYPE.REFERENCE_GENOME_FASTA,
                    filesystem_location=contig_fasta)
            contig.metadata['assembly_dir'] = data_dir
            contig.metadata['node_number'] = i
            contig_list.append(contig)

        # Place contigs and create variants
        evaluate_contigs(contig_list,
                skip_extracted_read_alignment=True,
                use_read_alignment=False)
        parse_variants_from_vcf(sample_alignment)

        # Get set of de novo variants
        variant_set = create_de_novo_variants_set(
                alignment_group, 'de_novo_variants')

        self.assertTrue(variant_set.variants.exists())
        self.assertEqual(len(variant_set.variants.all()), 1)

        # Make new reference genome
        new_ref_genome_params = {'label': 'new_ref'}
        new_ref_genome = generate_new_reference_genome(
                variant_set, new_ref_genome_params)

        # Verify insertion was placed correctly
        new_ref_genome_fasta = get_dataset_with_type(
                new_ref_genome, Dataset.TYPE.REFERENCE_GENOME_FASTA
                ).get_absolute_location()

        fastas_same, indexes = are_fastas_same(
                target_fasta, new_ref_genome_fasta)

        self.assertTrue(fastas_same)
Example #25
    def _run_contig_walk_test(self, test_dir):

        ref_fasta = os.path.join(test_dir, 'ref.fa')
        self.target_fasta = os.path.join(test_dir, 'target.fa')

        contig_fasta_list = filter(
                lambda x: re.match(r'contig_\d+\.fa', x),
                os.listdir(test_dir))
        contig_fasta_list = [os.path.join(test_dir, filename) for
                filename in contig_fasta_list]

        dummy_models = self._make_dummy_models()
        reference_genome = dummy_models['reference_genome']
        sample_alignment = dummy_models['sample_alignment']
        alignment_group = dummy_models['alignment_group']

        add_dataset_to_entity(
                    reference_genome,
                    Dataset.TYPE.REFERENCE_GENOME_FASTA,
                    Dataset.TYPE.REFERENCE_GENOME_FASTA,
                    filesystem_location=ref_fasta)

        ref_genbank = os.path.join(test_dir, 'ref.gb')
        if os.path.exists(ref_genbank):
            add_dataset_to_entity(
                reference_genome,
                Dataset.TYPE.REFERENCE_GENOME_GENBANK,
                Dataset.TYPE.REFERENCE_GENOME_GENBANK,
                ref_genbank)
            reference_genome.ensure_mobile_element_multifasta()

        # Make data_dir directory to house genome_finishing files
        assembly_dir = os.path.join(
                sample_alignment.get_model_data_dir(),
                'assembly')

        # Make assembly directory
        os.mkdir(assembly_dir)

        data_dir = os.path.join(assembly_dir, '0')
        os.mkdir(data_dir)

        # Create contigs
        contig_uid_list = []
        for i, contig_fasta in enumerate(contig_fasta_list):
            contig = Contig.objects.create(
                parent_reference_genome=reference_genome,
                experiment_sample_to_alignment=sample_alignment,
                label='test_contig_' + str(i))
            add_dataset_to_entity(
                    contig,
                    Dataset.TYPE.REFERENCE_GENOME_FASTA,
                    Dataset.TYPE.REFERENCE_GENOME_FASTA,
                    filesystem_location=contig_fasta)
            contig.metadata['assembly_dir'] = data_dir
            contig.metadata['node_number'] = i
            contig_uid_list.append(contig.uid)
            contig.save()

        # Place contigs and create variants
        evaluate_contigs(contig_uid_list,
                skip_extracted_read_alignment=True,
                use_read_alignment=False)

        parse_variants_from_vcf(sample_alignment)

        self.contig_uid_list = contig_uid_list

        # Get set of de novo variants
        variant_set = create_de_novo_variants_set(
                alignment_group, 'de_novo_variants')

        for v in variant_set.variants.all():
            alts = v.get_alternates()
            assert len(alts) == 1
            alt = alts[0]
            print '\npos:%s\nref: %dbp :%s\nalt: %dbp :%s\n' % (
                    v.position,
                    len(v.ref_value), v.ref_value,
                    len(alt), alt)

        return variant_set
def align_contig_reads_to_contig(contig):

    # Get fasta of reads used to make contig
    contig_reads_fasta = os.path.join(contig.get_model_data_dir(),
                                      'extracted_reads.fa')

    # Pull out contig read qnames and put in dictionary contig_reads
    p1 = re.compile('>(\S+)/(\d)')
    contig_reads = defaultdict(list)
    with open(contig_reads_fasta) as fh:
        for line in fh:
            m1 = p1.match(line)
            if m1:
                read_id = m1.group(1)
                read_number = int(m1.group(2))
                contig_reads[read_id].append(read_number)

    # Get source reads fastqs
    sample = contig.experiment_sample_to_alignment.experiment_sample
    source_fq1 = sample.dataset_set.get(
        type=Dataset.TYPE.FASTQ1).get_absolute_location()
    source_fq2_query = sample.dataset_set.filter(type=Dataset.TYPE.FASTQ2)
    is_paired_end = source_fq2_query.exists()
    if is_paired_end:
        source_fq2 = source_fq2_query[0].get_absolute_location()

    # Make filenames for contig read fastqs
    output_fq1 = os.path.join(contig.get_model_data_dir(), 'reads.1.fq')
    if is_paired_end:
        output_fq2 = os.path.join(contig.get_model_data_dir(), 'reads.2.fq')

    # Go through source fastqs and write reads in contig_reads to file
    source_fq_list = [source_fq1]
    output_fq_list = [output_fq1]
    if is_paired_end:
        source_fq_list.append(source_fq2)
        output_fq_list.append(output_fq2)

    p1 = re.compile(r'@(\S+)')
    for input_fq_path, output_fq_path in zip(source_fq_list, output_fq_list):
        if input_fq_path.endswith('.fq'):
            file_like = open(input_fq_path)
        elif input_fq_path.endswith('.gz'):
            file_like = gzip.open(input_fq_path)
        else:
            raise Exception('Unsupported fastq file extension: %s'
                            % input_fq_path)

        with file_like as in_fh, \
             open(output_fq_path, 'w') as out_fh:
            for line in in_fh:
                m1 = p1.match(line)
                if m1:
                    qname = m1.group(1)
                    if qname in contig_reads:
                        out_fh.write(line)
                        out_fh.write(in_fh.next())
                        out_fh.write(in_fh.next())
                        out_fh.write(in_fh.next())

    # Align fastqs to contig fasta
    contig_fasta = contig.dataset_set.get(
        type=Dataset.TYPE.REFERENCE_GENOME_FASTA).get_absolute_location()
    contig_reads_to_contig_bam = os.path.join(contig.get_model_data_dir(),
                                              'reads_to_contig.bam')
    simple_align_paired_with_bwa_mem(output_fq_list, contig_fasta,
                                     contig_reads_to_contig_bam)

    # Coordinate sort and index bam for jbrowse
    coordinate_sorted_bam = (os.path.splitext(contig_reads_to_contig_bam)[0] +
                             '.coordinate_sorted.bam')
    sort_bam_by_coordinate(contig_reads_to_contig_bam, coordinate_sorted_bam)
    index_bam(coordinate_sorted_bam)

    # Add the bam file to contig as BWA_ALIGN dataset, overwriting it
    # if it already exists
    dataset_query = contig.dataset_set.filter(type=Dataset.TYPE.BWA_ALIGN)
    if dataset_query.count():
        dataset_query[0].delete()

    add_dataset_to_entity(contig,
                          Dataset.TYPE.BWA_ALIGN,
                          Dataset.TYPE.BWA_ALIGN,
                          filesystem_location=coordinate_sorted_bam)
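The read-extraction loop above relies only on the FASTQ convention that each record is exactly four lines and that the qname follows the '@' on the header line. Below is a minimal, self-contained sketch of that filtering idea; the function name and the in-memory sample data are hypothetical and are not part of the example code above.

import re
from io import StringIO

def filter_fastq_by_qname(in_fh, out_fh, wanted_qnames):
    """Copy only the 4-line FASTQ records whose qname is in wanted_qnames."""
    header_re = re.compile(r'@(\S+)')
    for line in in_fh:
        m = header_re.match(line)
        if m and m.group(1) in wanted_qnames:
            out_fh.write(line)
            # A FASTQ record is header, sequence, '+', qualities.
            for _ in range(3):
                out_fh.write(next(in_fh))

fake_fastq = StringIO(u'@read1\nACGT\n+\nIIII\n@read2\nTTTT\n+\nIIII\n')
filtered = StringIO()
filter_fastq_by_qname(fake_fastq, filtered, set(['read1']))
assert filtered.getvalue() == u'@read1\nACGT\n+\nIIII\n'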
Example #27
def graph_contig_placement(contig_uid_list,
                           skip_extracted_read_alignment,
                           use_alignment_reads=True):
    """Align contigs passed in contig_list to the reference and to any
    annotated mobile elements in the reference genbank and use the alignment
    to build a sequence graph.  The sequence graph is then used by graph
    walking algorithms that call structural variants from paths in the graph.

    Args:
        contig_uid_list: list of Contig objects
        skip_extracted_read_alignment: if False, extract the reads that
                assembled each contig and make them a bam track on the
                reference
        use_alignment_reads: if True, filter contig placements that would
                delete regions of moderate coverage

    Returns:
         placeable_contig_uid_list: Contig objects with metadata fields holding
                their reference placement parameters
         var_dict_list: list of dictionary representations of translocation
                variants with keys: chromosome, pos, ref_seq, alt_seq
         me_var_dict_list: list of dictionary representations of mobile
                element translocation variants with keys: chromosome, pos,
                ref_seq, alt_seq, MEINFO
    """
    def _length_weighted_coverage(contig):
        return contig.num_bases * contig.coverage

    # Request contig_list from db and order by highest length weighted coverage
    contig_list = list(Contig.objects.filter(uid__in=contig_uid_list))
    contig_list.sort(key=_length_weighted_coverage, reverse=True)

    sample_alignment = contig_list[0].experiment_sample_to_alignment
    sample_alignment.data['assembly_status'] = (
        ExperimentSampleToAlignment.ASSEMBLY_STATUS.BUILDING_SEQUENCE_GRAPH)
    sample_alignment.save()

    ref_genome = sample_alignment.alignment_group.reference_genome

    # Make Assembly dir
    assembly_dir = os.path.join(sample_alignment.get_model_data_dir(),
                                'assembly')

    contig_alignment_dir = os.path.join(assembly_dir, 'contig_alignment')

    if os.path.exists(contig_alignment_dir):
        shutil.rmtree(contig_alignment_dir)
    os.mkdir(contig_alignment_dir)

    # NOTE(gleb): Not sure whether these have to be ordered, but keeping
    # them ordered while refactoring.
    contigs_as_ordered_dict = OrderedDict([(c.uid, c) for c in contig_list])

    # Concatenate contig fastas for alignment
    contig_concat = os.path.join(contig_alignment_dir, 'contig_concat.fa')
    with open(contig_concat, 'w') as output_fh:
        for contig_uid, c in contigs_as_ordered_dict.iteritems():
            contig_fasta_file = get_fasta(c)
            with open(contig_fasta_file) as read_fh:
                output_fh.write(read_fh.read())

    # Create dictionaries to translate contig uid to its fasta descriptor line
    contig_qname_to_uid = {}
    for contig_uid, c in contigs_as_ordered_dict.items():
        contig_fasta_file = get_fasta(c)
        with open(contig_fasta_file, 'r') as fh:
            descriptor = fh.next()
            contig_qname_to_uid[descriptor.strip('>\n')] = contig_uid

    # Get extracted mobile elements in addition to contigs
    if ref_genome.is_annotated():
        me_fa_dataset = get_dataset_with_type(
            ref_genome, Dataset.TYPE.MOBILE_ELEMENT_FASTA)
        me_concat_fasta = me_fa_dataset.get_absolute_location()

        contig_alignment_to_me_bam = os.path.join(
            contig_alignment_dir, 'contig_alignment_to_me.bam')

        if not os.path.exists(contig_alignment_to_me_bam):
            ensure_bwa_index(me_concat_fasta)
            simple_align_with_bwa_mem(contig_concat, me_concat_fasta,
                                      contig_alignment_to_me_bam, ['-T', '15'])

    # Align concatenated contig fastas to reference
    contig_alignment_bam = os.path.join(contig_alignment_dir,
                                        'contig_alignment.bam')
    print 'Aligning contigs to reference'
    simple_align_with_bwa_mem(contig_concat, get_fasta(ref_genome),
                              contig_alignment_bam, ['-T', '15'])

    # Create graph
    G = nx.DiGraph()

    # Create sequence interval instances for reference and each contig
    ref_intervals = SequenceIntervals(ref_genome.uid,
                                      ref_genome.num_bases,
                                      tag='ref')

    G.ref_intervals = ref_intervals

    add_alignment_to_graph(G, contig_alignment_bam)

    if ref_genome.is_annotated():
        add_me_alignment_to_graph(G, contig_alignment_to_me_bam)

    # Add SEQUENCE_GRAPH_PICKLE dataset to sample alignment
    graph_pickle_path = os.path.join(contig_alignment_dir,
                                     'sequence_graph.pickle')
    nx.write_gpickle(G, graph_pickle_path)
    add_dataset_to_entity(sample_alignment, Dataset.TYPE.SEQUENCE_GRAPH_PICKLE,
                          Dataset.TYPE.SEQUENCE_GRAPH_PICKLE,
                          graph_pickle_path)

    detect_strand_chromosome_junctions(contig_qname_to_uid,
                                       contig_alignment_bam)

    placeable_contig_uid_list = []
    iv_list = novel_seq_ins_walk(G)
    if use_alignment_reads:
        coverage_stats = get_coverage_stats(sample_alignment)
        sample_alignment_bam = sample_alignment.dataset_set.get(
            type=Dataset.TYPE.BWA_ALIGN).get_absolute_location()

    for insertion_vertices in iv_list:
        contig_qname = insertion_vertices.enter_contig.seq_uid
        contig_uid = contig_qname_to_uid[contig_qname]
        contig = Contig.objects.get(uid=contig_uid)
        set_contig_placement_params(contig, insertion_vertices)

        if use_alignment_reads:
            # Filter out deletions of good coverage regions
            deletion_length = (insertion_vertices.enter_ref.pos -
                               insertion_vertices.exit_ref.pos)

            if deletion_length > 0:
                deletion_cov = avg_coverage(sample_alignment_bam,
                                            contig.metadata['chromosome'],
                                            insertion_vertices.exit_ref.pos,
                                            insertion_vertices.enter_ref.pos)

            chrom_cov_stats = coverage_stats[contig.metadata['chromosome']]
            chrom_cov_mean = chrom_cov_stats['mean']
            chrom_cov_std = chrom_cov_stats['std']
            if deletion_length <= 0 or (deletion_cov <
                                        chrom_cov_mean - chrom_cov_std):
                placeable_contig_uid_list.append(contig.uid)
        else:
            placeable_contig_uid_list.append(contig.uid)

    # Perform translocation walk
    if ref_genome.num_chromosomes == 1:

        trans_iv_pairs = translocation_walk(G)
        var_dict_list = [
            parse_path_into_ref_alt(iv_pair, contig_qname_to_uid,
                                    sample_alignment)
            for iv_pair in trans_iv_pairs
        ]

        var_dict_list = [
            var_d for var_d in var_dict_list
            if any([var_d['ref_seq'], var_d['alt_seq']])
        ]

        if ref_genome.is_annotated():
            me_trans_iv_pairs = me_translocation_walk(G)

            me_var_dict_list = [
                parse_path_into_ref_alt(iv_pair, contig_qname_to_uid,
                                        sample_alignment)
                for iv_pair in me_trans_iv_pairs
            ]
        else:
            me_var_dict_list = []

    else:
        print 'Translocation walk not implemented for multi-chromosomal refs'
        var_dict_list = []
        me_var_dict_list = []

    return placeable_contig_uid_list, var_dict_list, me_var_dict_list
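A hedged usage sketch of the interface above: the returned variant dictionaries carry the keys listed in the docstring (chromosome, pos, ref_seq, alt_seq), so a caller might report them as shown. Here contig_uid_list is assumed to come from an earlier assembly step; nothing else is taken from the project code.

placeable_uids, var_dicts, me_var_dicts = graph_contig_placement(
        contig_uid_list, skip_extracted_read_alignment=True)

print '%d of %d contigs were placeable' % (
        len(placeable_uids), len(contig_uid_list))

for var_d in var_dicts + me_var_dicts:
    print '%s:%s ref=%dbp alt=%dbp' % (
            var_d['chromosome'], var_d['pos'],
            len(var_d['ref_seq']), len(var_d['alt_seq']))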
Example #28
    def test_add_variants_to_set_from_bed(self):

        common_entities = create_common_entities()
        project = common_entities['project']
        self.ref_genome_1 = common_entities['reference_genome']

        alignment_group = AlignmentGroup.objects.create(
            label='Alignment 1',
            reference_genome=self.ref_genome_1,
            aligner=AlignmentGroup.ALIGNER.BWA)

        (self.sample_1, created) = ExperimentSample.objects.get_or_create(
                project=project,
                label=SAMPLE_1_LABEL)

        sample_alignment = ExperimentSampleToAlignment.objects.create(
                alignment_group=alignment_group,
                experiment_sample=self.sample_1)

        # Create variants in the bed regions from best_test.bed
        for var_poor_map in range(20):
            variant = Variant.objects.create(
                    type=Variant.TYPE.TRANSITION,
                    reference_genome=self.ref_genome_1,
                    chromosome=Chromosome.objects.get(
                            reference_genome=self.ref_genome_1),
                    position=random.randint(101, 200),
                    ref_value='A')

            vccd = VariantCallerCommonData.objects.create(
                variant=variant,
                source_dataset_id=1,
                alignment_group=alignment_group,
                data={}
            )

        for var_no_cov in range(20):
            variant = Variant.objects.create(
                    type=Variant.TYPE.TRANSITION,
                    reference_genome=self.ref_genome_1,
                    chromosome=Chromosome.objects.get(
                            reference_genome=self.ref_genome_1),
                    position=random.randint(301, 400),
                    ref_value='A')

            vccd = VariantCallerCommonData.objects.create(
                variant=variant,
                source_dataset_id=1,
                alignment_group=alignment_group,
                data={}
            )

            variant = Variant.objects.create(
                    type=Variant.TYPE.TRANSITION,
                    reference_genome=self.ref_genome_1,
                    chromosome=Chromosome.objects.get(
                            reference_genome=self.ref_genome_1),
                    position=random.randint(501, 600),
                    ref_value='A')

            vccd = VariantCallerCommonData.objects.create(
                variant=variant,
                source_dataset_id=1,
                alignment_group=alignment_group,
                data={}
            )

        new_bed_path = copy_dataset_to_entity_data_dir(
                entity=sample_alignment,
                original_source_location=TEST_BED)

        bed_dataset = add_dataset_to_entity(sample_alignment,
                dataset_label=Dataset.TYPE.BED_CALLABLE_LOCI,
                dataset_type=Dataset.TYPE.BED_CALLABLE_LOCI,
                filesystem_location=new_bed_path)

        vs_to_v_map = add_variants_to_set_from_bed(
                sample_alignment, bed_dataset)

        variant_set_labels = set([vs.label for vs in vs_to_v_map.keys()])
        self.assertEqual(set(['POOR_MAPPING_QUALITY', 'NO_COVERAGE']),
                variant_set_labels)

        for variant_set, variants in vs_to_v_map.items():
            for v in variants:
                # POOR MAPPING QUAL should be from 101 to 200
                if variant_set.label == 'POOR_MAPPING_QUALITY':
                    self.assertTrue(v.position in pyinter.closedopen(
                            101, 200))
                # NO COVERAGE should be from 301 to 400, 501 to 600
                elif variant_set.label == 'NO_COVERAGE':
                    self.assertTrue(v.position in pyinter.IntervalSet([
                                    pyinter.closedopen(301,400),
                                    pyinter.closedopen(501,600)]))
                else:
                    raise AssertionError(
                            'bad variant set %s made.' % variant_set.label)
Example #29
    def test_add_variants_to_set_from_bed(self):

        common_entities = create_common_entities()
        project = common_entities['project']
        self.ref_genome_1 = common_entities['reference_genome']

        alignment_group = AlignmentGroup.objects.create(
            label='Alignment 1',
            reference_genome=self.ref_genome_1,
            aligner=AlignmentGroup.ALIGNER.BWA)

        (self.sample_1, created) = ExperimentSample.objects.get_or_create(
            project=project, label=SAMPLE_1_LABEL)

        sample_alignment = ExperimentSampleToAlignment.objects.create(
            alignment_group=alignment_group, experiment_sample=self.sample_1)

        # Create variants in the bed regions from best_test.bed
        for var_poor_map in range(20):
            variant = Variant.objects.create(
                type=Variant.TYPE.TRANSITION,
                reference_genome=self.ref_genome_1,
                chromosome=Chromosome.objects.get(
                    reference_genome=self.ref_genome_1),
                position=random.randint(101, 200),
                ref_value='A')

            vccd = VariantCallerCommonData.objects.create(
                variant=variant,
                source_dataset_id=1,
                alignment_group=alignment_group,
                data={})

        for var_no_cov in range(20):
            variant = Variant.objects.create(
                type=Variant.TYPE.TRANSITION,
                reference_genome=self.ref_genome_1,
                chromosome=Chromosome.objects.get(
                    reference_genome=self.ref_genome_1),
                position=random.randint(301, 400),
                ref_value='A')

            vccd = VariantCallerCommonData.objects.create(
                variant=variant,
                source_dataset_id=1,
                alignment_group=alignment_group,
                data={})

            variant = Variant.objects.create(
                type=Variant.TYPE.TRANSITION,
                reference_genome=self.ref_genome_1,
                chromosome=Chromosome.objects.get(
                    reference_genome=self.ref_genome_1),
                position=random.randint(501, 600),
                ref_value='A')

            vccd = VariantCallerCommonData.objects.create(
                variant=variant,
                source_dataset_id=1,
                alignment_group=alignment_group,
                data={})

        new_bed_path = copy_dataset_to_entity_data_dir(
            entity=sample_alignment, original_source_location=TEST_BED)

        bed_dataset = add_dataset_to_entity(
            sample_alignment,
            dataset_label=Dataset.TYPE.BED_CALLABLE_LOCI,
            dataset_type=Dataset.TYPE.BED_CALLABLE_LOCI,
            filesystem_location=new_bed_path)

        vs_to_v_map = add_variants_to_set_from_bed(sample_alignment,
                                                   bed_dataset)

        variant_set_labels = set([vs.label for vs in vs_to_v_map.keys()])
        self.assertEqual(set(['POOR_MAPPING_QUALITY', 'NO_COVERAGE']),
                         variant_set_labels)

        for variant_set, variants in vs_to_v_map.items():
            for v in variants:
                # POOR MAPPING QUAL should be from 101 to 200
                if variant_set.label == 'POOR_MAPPING_QUALITY':
                    self.assertTrue(v.position in pyinter.closedopen(101, 200))
                # NO COVERAGE should be from 301 to 400, 501 to 600
                elif variant_set.label == 'NO_COVERAGE':
                    self.assertTrue(v.position in pyinter.IntervalSet([
                        pyinter.closedopen(301, 400),
                        pyinter.closedopen(501, 600)
                    ]))
                else:
                    raise AssertionError('bad variant set %s made.' %
                                         variant_set.label)

def combine_list_allformats(reference_genome_list,
            new_ref_genome_label, project):
    """Combine ReferenceGenomes into a new single ReferenceGenome
    composed of the component parts.

    Args:
        reference_genome_list: List of ReferenceGenome objects.
        new_ref_genome_label: Label for the new ReferenceGenome.
        project: Project to which the new ReferenceGenome will be added.

    Returns:
        Object with keys:
            * is_success
            * new_reference_genome (when is_success = True)
            * error_msg (when is_success = False)
    """
    rg_dataset_list = []
    for ref_genome in reference_genome_list:
        rg_dataset_tup = None
        for dataset_type in [Dataset.TYPE.REFERENCE_GENOME_GENBANK,
                Dataset.TYPE.REFERENCE_GENOME_FASTA]:
            filter_result = ref_genome.dataset_set.filter(type=dataset_type)
            if len(filter_result):
                rg_dataset_tup = (ref_genome, filter_result[0])
                break
        if (not rg_dataset_tup or
                not os.path.exists(rg_dataset_tup[1].get_absolute_location())):
            return {
                'is_success': False,
                'error_msg': ('All reference genomes must have an associated '
                              'FASTA or Genbank dataset')
            }
        else:
            rg_dataset_list.append(rg_dataset_tup)
    assert len(rg_dataset_list) == len(reference_genome_list)

    # Read the datasets into Biopython SeqRecord objects.
    rg_seqrecord_list = []
    seqrecord_ids = []
    for rg, dataset in rg_dataset_list:
        with open(dataset.get_absolute_location()) as input_fh:
            for record in SeqIO.parse(input_fh,
                        DATASET_TO_SEQIO_FORMAT[dataset.type]):
                rg_seqrecord_list.append((rg, record))
                seqrecord_ids.append('_'.join([rg.label[:7], record.id[:8]]))

    # If ReferenceGenome label and Chromosome id are the same, there will be
    # duplicate seqrecord_ids: resolve by including numeric prefix in id
    seq_record_list = []
    MAX_LOCUS_NAME_LEN = 16
    unique_id_len = len(str(len(seqrecord_ids)))
    label_len = (MAX_LOCUS_NAME_LEN - 2 - unique_id_len) / 2
    for i, seqrecord_id in enumerate(seqrecord_ids):
        rg, seqrecord = rg_seqrecord_list[i]

        if seqrecord_ids.count(seqrecord_id) == 1:
            unique_seqrecord_id = seqrecord_id
        else:
            unique_seqrecord_id = '_'.join(
                [str(i), rg.label[:label_len], seqrecord.id[:label_len]])

        seqrecord.name = seqrecord.id = unique_seqrecord_id
        seqrecord.seq.alphabet = ambiguous_dna
        seq_record_list.append(seqrecord)

    # Create a new ReferenceGenome.
    new_ref_genome = ReferenceGenome.objects.create(
            project=project,
            label=new_ref_genome_label,
            num_chromosomes=len(seq_record_list),
            num_bases=sum([len(seq) for seq in seq_record_list]))

    # Generate a filename from the label with non-alphanumeric characters
    # replaced by underscores.
    filename_prefix = generate_safe_filename_prefix_from_label(
            new_ref_genome_label)
    does_list_include_genbank = Dataset.TYPE.REFERENCE_GENOME_GENBANK in \
            [rg_dataset_tup[1].type for rg_dataset_tup in rg_dataset_list]
    if does_list_include_genbank:
        filename = filename_prefix + '.gb'
    else:
        filename = filename_prefix + '.fa'
    new_file_dest = os.path.join(new_ref_genome.get_model_data_dir(), filename)

    # Write the result.
    ref_genome_dataset_type = Dataset.TYPE.REFERENCE_GENOME_GENBANK if \
            does_list_include_genbank else Dataset.TYPE.REFERENCE_GENOME_FASTA
    output_file_format = DATASET_TO_SEQIO_FORMAT[ref_genome_dataset_type]
    with open(new_file_dest, 'w') as output_fh:
        SeqIO.write(seq_record_list, output_fh, output_file_format)

    # Create a dataset which will point to the file. This step must happen after
    # writing the file because a signal will be triggered which requires the
    # Genbank to exist already.
    add_dataset_to_entity(new_ref_genome, ref_genome_dataset_type,
            ref_genome_dataset_type, new_file_dest)

    return {
        'is_success': True,
        'new_reference_genome': new_ref_genome
    }
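A hedged usage sketch for the function above, based only on its documented return keys; ref_genome_a, ref_genome_b and project are assumed to be existing model instances and are not defined in the example itself.

result = combine_list_allformats(
        [ref_genome_a, ref_genome_b], 'combined_genome', project)
if result['is_success']:
    combined = result['new_reference_genome']
    print 'Created %s (%d chromosomes, %d bases)' % (
            combined.label, combined.num_chromosomes, combined.num_bases)
else:
    print 'Combine failed: %s' % result['error_msg']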
Example #31
    def test_end_to_end(self):
        """Test running full pipline on small-ish data.

        The data file consists of 20,000 bases. At 5,000 bases there is
        a 400 base deletion. At 10,000 bases there is a 400 base inversion.
        At 15,000 bases there is a 400 base tandem duplication.

        It seems that Pindel cannot find the inversion. Fortunately,
        delly can usually find inversions. Unfortunately, delly only
        works well on large data, so we will not test it here.
        """
        # Create a new alignment group.
        alignment_group = AlignmentGroup.objects.create(
                label='test alignment', reference_genome=self.reference_genome)

        # Create a sample.
        sample_1 = ExperimentSample.objects.create(
                uid=TEST_SAMPLE_UID,
                project=self.project,
                label='sample1')
        ### Add the raw reads
        copy_and_add_dataset_source(sample_1, Dataset.TYPE.FASTQ1,
                Dataset.TYPE.FASTQ1, TEST_FASTQ1)
        copy_and_add_dataset_source(sample_1, Dataset.TYPE.FASTQ2,
                Dataset.TYPE.FASTQ2, TEST_FASTQ2)

        # Create relationship between alignment and sample.
        sample_alignment = ExperimentSampleToAlignment.objects.create(
                alignment_group=alignment_group,
                experiment_sample=sample_1)
        ### Add alignment data. NOTE: Stored in sample model dir.

        # index (no dataset)
        copy_dataset_to_entity_data_dir(sample_1, TEST_BAM_INDEX)

        # bam file (with dataset)
        copy_dest = copy_dataset_to_entity_data_dir(sample_1, TEST_BAM)
        add_dataset_to_entity(sample_alignment, Dataset.TYPE.BWA_ALIGN,
                Dataset.TYPE.BWA_ALIGN, copy_dest)

        # Make sure there are no variants before.
        self.assertEqual(0, len(Variant.objects.filter(
                reference_genome=self.reference_genome)))

        # Test with Pindel only for now.
        for tool in ['pindel']:
            find_variants_with_tool(alignment_group,
                    VARIANT_TOOL_PARAMS_MAP[tool], project=self.project)

        # Check that the alignment group has a pindel vcf dataset associated
        # with it.
        vcf_dataset = get_dataset_with_type(alignment_group,
                Dataset.TYPE.VCF_PINDEL)
        self.assertIsNotNone(vcf_dataset)

        # Make sure the .vcf file actually exists.
        self.assertTrue(os.path.exists(vcf_dataset.get_absolute_location()))

        # Make sure the vcf is valid by reading it using pyvcf.
        with open(vcf_dataset.get_absolute_location()) as vcf_fh:
            try:
                reader = vcf.Reader(vcf_fh)
                reader.next()
            except:
                self.fail("Not valid vcf")

        # Grab the resulting variants.
        variants = Variant.objects.filter(reference_genome=self.reference_genome)

        # Confirm that 2 variants found.
        self.assertEqual(2, len(variants))

        variant_map = {}
        for variant in variants:
            variant_alternates = VariantAlternate.objects.filter(variant=variant)

            # There should be only one variant alternate per SV.
            self.assertEqual(len(variant_alternates), 1)

            pos = variant.position
            svtype = variant_alternates[0].data['INFO_SVTYPE']
            svlen = variant_alternates[0].data['INFO_SVLEN']
            variant_map[svtype] = (pos, svlen)

        # Check that there is a deletion around base 5000.
        self.assertTrue('DEL' in variant_map)
        self.assertTrue(abs(variant_map['DEL'][0] - 5000) <= 3)
        self.assertTrue(abs(variant_map['DEL'][1] - 400) <= 3)

        # Check that there is a tandem duplication around base 15000.
        self.assertTrue('DUP:TANDEM' in variant_map)
        self.assertTrue(abs(variant_map['DUP:TANDEM'][0] - 15000) <= 3)
        self.assertTrue(abs(variant_map['DUP:TANDEM'][1] - 400) <= 3)

def graph_contig_placement(contig_uid_list, skip_extracted_read_alignment,
        use_alignment_reads=True):
    """Align contigs passed in contig_list to the reference and to any
    annotated mobile elements in the reference genbank and use the alignment
    to build a sequence graph.  The sequence graph is then used by graph
    walking algorithms that call structural variants from paths in the graph.

    Args:
        contig_uid_list: list of Contig objects
        skip_extracted_read_alignment: if False, extract the reads that
                assembled each contig and make them a bam track on the
                reference
        use_alignment_reads: if True, filter contig placements that would
                delete regions of moderate coverage

    Returns:
         placeable_contig_uid_list: Contig objects with metadata fields holding
                their reference placement parameters
         var_dict_list: list of dictionary representations of translocation
                variants with keys: chromosome, pos, ref_seq, alt_seq
         me_var_dict_list: list of dictionary representations of mobile
                element translocation variants with keys: chromosome, pos,
                ref_seq, alt_seq, MEINFO
    """

    def _length_weighted_coverage(contig):
        return contig.num_bases * contig.coverage

    # Request contig_list from db and order by highest length weighted coverage
    contig_list = list(Contig.objects.filter(uid__in=contig_uid_list))
    contig_list.sort(key=_length_weighted_coverage, reverse=True)

    sample_alignment = contig_list[0].experiment_sample_to_alignment
    sample_alignment.data['assembly_status'] = (
            ExperimentSampleToAlignment.ASSEMBLY_STATUS.BUILDING_SEQUENCE_GRAPH
    )
    sample_alignment.save()

    ref_genome = sample_alignment.alignment_group.reference_genome

    # Make Assembly dir
    assembly_dir = os.path.join(sample_alignment.get_model_data_dir(),
            'assembly')

    contig_alignment_dir = os.path.join(
            assembly_dir, 'contig_alignment')

    if os.path.exists(contig_alignment_dir):
        shutil.rmtree(contig_alignment_dir)
    os.mkdir(contig_alignment_dir)

    # NOTE(gleb): Not sure whether these have to be ordered, but keeping
    # them ordered while refactoring.
    contigs_as_ordered_dict = OrderedDict(
            [(c.uid, c) for c in contig_list])

    # Concatenate contig fastas for alignment
    contig_concat = os.path.join(contig_alignment_dir, 'contig_concat.fa')
    with open(contig_concat, 'w') as output_fh:
        for contig_uid, c in contigs_as_ordered_dict.iteritems():
            contig_fasta_file = get_fasta(c)
            with open(contig_fasta_file) as read_fh:
                output_fh.write(read_fh.read())

    # Create dictionaries to translate contig uid to its fasta descriptor line
    contig_qname_to_uid = {}
    for contig_uid, c in contigs_as_ordered_dict.items():
        contig_fasta_file = get_fasta(c)
        with open(contig_fasta_file, 'r') as fh:
            descriptor = fh.next()
            contig_qname_to_uid[descriptor.strip('>\n')] = contig_uid

    # Get extracted mobile elements in addition to contigs
    if ref_genome.is_annotated():
        me_fa_dataset = get_dataset_with_type(
                ref_genome,
                Dataset.TYPE.MOBILE_ELEMENT_FASTA)
        me_concat_fasta = me_fa_dataset.get_absolute_location()

        contig_alignment_to_me_bam = os.path.join(
                contig_alignment_dir, 'contig_alignment_to_me.bam')

        if not os.path.exists(contig_alignment_to_me_bam):
            ensure_bwa_index(me_concat_fasta)
            simple_align_with_bwa_mem(
                    contig_concat,
                    me_concat_fasta,
                    contig_alignment_to_me_bam,
                    ['-T', '15'])

    # Align concatenated contig fastas to reference
    contig_alignment_bam = os.path.join(
            contig_alignment_dir, 'contig_alignment.bam')
    print 'Aligning contigs to reference'
    simple_align_with_bwa_mem(
            contig_concat,
            get_fasta(ref_genome),
            contig_alignment_bam,
            ['-T', '15'])

    # Create graph
    G = nx.DiGraph()

    # Create sequence interval instances for reference and each contig
    ref_intervals = SequenceIntervals(
            ref_genome.uid, ref_genome.num_bases, tag='ref')

    G.ref_intervals = ref_intervals

    add_alignment_to_graph(G, contig_alignment_bam)

    if ref_genome.is_annotated():
        add_me_alignment_to_graph(G, contig_alignment_to_me_bam)

    # Add SEQUENCE_GRAPH_PICKLE dataset to sample alignment
    graph_pickle_path = os.path.join(
            contig_alignment_dir,
            'sequence_graph.pickle')
    nx.write_gpickle(G, graph_pickle_path)
    add_dataset_to_entity(
            sample_alignment,
            Dataset.TYPE.SEQUENCE_GRAPH_PICKLE,
            Dataset.TYPE.SEQUENCE_GRAPH_PICKLE,
            graph_pickle_path)

    detect_strand_chromosome_junctions(contig_qname_to_uid, contig_alignment_bam)

    placeable_contig_uid_list = []
    iv_list = novel_seq_ins_walk(G)
    if use_alignment_reads:
        coverage_stats = get_coverage_stats(sample_alignment)
        sample_alignment_bam = sample_alignment.dataset_set.get(
            type=Dataset.TYPE.BWA_ALIGN).get_absolute_location()

    for insertion_vertices in iv_list:
        contig_qname = insertion_vertices.enter_contig.seq_uid
        contig_uid = contig_qname_to_uid[contig_qname]
        contig = Contig.objects.get(uid=contig_uid)
        set_contig_placement_params(contig, insertion_vertices)

        if use_alignment_reads:
            # Filter out deletions of good coverage regions
            deletion_length = (insertion_vertices.enter_ref.pos -
                    insertion_vertices.exit_ref.pos)

            if deletion_length > 0:
                deletion_cov = avg_coverage(
                        sample_alignment_bam,
                        contig.metadata['chromosome'],
                        insertion_vertices.exit_ref.pos,
                        insertion_vertices.enter_ref.pos)

            chrom_cov_stats = coverage_stats[contig.metadata['chromosome']]
            chrom_cov_mean = chrom_cov_stats['mean']
            chrom_cov_std = chrom_cov_stats['std']
            if deletion_length <= 0 or (
                    deletion_cov < chrom_cov_mean - chrom_cov_std):
                placeable_contig_uid_list.append(contig.uid)
        else:
            placeable_contig_uid_list.append(contig.uid)

    # Perform translocation walk
    if ref_genome.num_chromosomes == 1:

        trans_iv_pairs = translocation_walk(G)
        var_dict_list = [parse_path_into_ref_alt(iv_pair, contig_qname_to_uid,
                sample_alignment)
                for iv_pair in trans_iv_pairs]

        var_dict_list = [var_d for var_d in var_dict_list
                if any([var_d['ref_seq'], var_d['alt_seq']])]

        if ref_genome.is_annotated():
            me_trans_iv_pairs = me_translocation_walk(G)

            me_var_dict_list = [parse_path_into_ref_alt(
                    iv_pair, contig_qname_to_uid,
                    sample_alignment)
                for iv_pair in me_trans_iv_pairs]
        else:
            me_var_dict_list = []

    else:
        print 'Translocation walk not implemented for multi-chromosomal refs'
        var_dict_list = []
        me_var_dict_list = []

    return placeable_contig_uid_list, var_dict_list, me_var_dict_list
Example #33
def assemble_with_velvet(assembly_dir,
                         velvet_opts,
                         sv_indicants_bam,
                         sample_alignment,
                         overwrite=True,
                         reassemble_contig_from_reads=False):
    # NOTE: Unused. If enabled, will call make_contig_reads_to_ref_alignments(),
    # which is not used anywhere currently due to performance issues that are
    # particularly bad when there are many unused reads.
    assert not reassemble_contig_from_reads

    timestamp = str(datetime.datetime.now())
    contig_number_pattern = re.compile(r'^NODE_(\d+)_')

    reference_genome = sample_alignment.alignment_group.reference_genome

    contig_files = []
    contig_uid_list = []

    _run_velvet(assembly_dir, velvet_opts, sv_indicants_bam)

    # Collect resulting contigs fasta
    contigs_fasta = os.path.join(assembly_dir, 'contigs.fa')
    contig_files.append(contigs_fasta)

    records = list(SeqIO.parse(contigs_fasta, 'fasta'))
    digits = len(str(len(records))) + 1

    for (i, seq_record) in enumerate(records, 1):

        # Extract contig sequence from the contigs.fa file, number, and
        # name it.

        contig_node_number = int(
            contig_number_pattern.findall(seq_record.description)[0])
        coverage = float(seq_record.description.rsplit('_', 1)[1])
        seq_record.seq = reduce(lambda x, y: x + y,
                                [seq for seq in seq_record.seq.split('N')])
        seq_record.id = seq_record.name = seq_record.description = ('NODE_' +
                                                                    str(i))
        leading_zeros = digits - len(str(i))
        contig_label = '%s_%s' % (sample_alignment.experiment_sample.label,
                                  leading_zeros * '0' + str(i))

        # Create model and metadata.

        contig = Contig.objects.create(
            label=contig_label,
            parent_reference_genome=reference_genome,
            experiment_sample_to_alignment=(sample_alignment))
        contig.metadata['coverage'] = coverage
        contig.metadata['timestamp'] = timestamp
        contig.metadata['node_number'] = contig_node_number
        contig.metadata['assembly_dir'] = assembly_dir

        contig.ensure_model_data_dir_exists()

        # NOTE: Unused code.
        # Reassemble the contig from its constituent reads separately,
        # using a second velvet call.
        # if reassemble_contig_from_reads:
        #     # 1. Grab reads from velvet to reassemble the contig
        #     make_contig_reads_to_ref_alignments(contig,
        #             add_jbrowse_track=False, overwrite=overwrite)
        #     contig_reads_bam = os.path.join(
        #             contig.get_model_data_dir(),
        #             'sv_indicants.bam')

        #     # 2. Reassemble the contig from its whole reads using velvet -
        #     # this generates longer contigs because the graph will trim the
        #     # edges if there is a branchpoint. With only one node it should
        #     # be very fast.
        #     _run_velvet(contig.get_model_data_dir(), velvet_opts,
        #             contig_reads_bam)
        #     reassembled_seqrecord = _extract_node_from_contig_reassembly(
        #             contig)
        #     if reassembled_seqrecord:
        #         seq_record.seq = reassembled_seqrecord.seq

        # Write the contig fasta and add it as a dataset to the contig object.

        dataset_path = os.path.join(contig.get_model_data_dir(), 'fasta.fa')

        with open(dataset_path, 'w') as fh:
            SeqIO.write([seq_record], fh, 'fasta')

        add_dataset_to_entity(contig,
                              'contig_fasta',
                              Dataset.TYPE.REFERENCE_GENOME_FASTA,
                              filesystem_location=dataset_path)

        contig.save()

        # NOTE: Disabled for now. Severe performance issues.
        # Make a bam track on the reference for each contig that shows only the
        # reads that assembled the contig and their mates
        # make_contig_reads_to_ref_alignments(contig.uid)

        # append the uid to the contig_uid_list
        contig_uid_list.append(contig.uid)

    # once contigs are extracted, remove velvet data
    _cleanup_velvet_dir(assembly_dir)

    return contig_uid_list
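The contig naming logic above assumes Velvet contig descriptors of the form NODE_<n>_length_<len>_cov_<coverage>. The self-contained sketch below, using a made-up descriptor string, mirrors the regex and rsplit calls used in assemble_with_velvet to recover the node number and coverage.

import re

descriptor = 'NODE_7_length_1523_cov_21.394737'  # hypothetical Velvet header
node_number = int(re.compile(r'^NODE_(\d+)_').findall(descriptor)[0])
coverage = float(descriptor.rsplit('_', 1)[1])
assert node_number == 7
assert abs(coverage - 21.394737) < 1e-9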
Example #34
def generate_contigs(sample_alignment,
        sv_read_classes={}, input_velvet_opts={},
        overwrite=True):
    """Generates contigs.
    """
    # Don't proceed if processing this sample alignment previously failed or
    # in another async process.
    sample_alignment = ExperimentSampleToAlignment.objects.get(
            uid=sample_alignment.uid)
    if (sample_alignment.data.get('assembly_status') ==
            ExperimentSampleToAlignment.ASSEMBLY_STATUS.FAILED):
        return

    # Set assembly status for UI
    # NOTE: Setting this status is playing whack-a-mole against other async sv
    # detection functions, e.g. detect_deletion.cov_detect_deletion_make_vcf().
    set_assembly_status(
            sample_alignment,
            ExperimentSampleToAlignment.ASSEMBLY_STATUS.ASSEMBLING)

    print 'Generating contigs\n'

    # Grab reference genome fasta path and ensure exists.
    reference_genome = sample_alignment.alignment_group.reference_genome
    reference_genome.dataset_set.get_or_create(
            type=Dataset.TYPE.REFERENCE_GENOME_FASTA)[0]

    # Make assembly_dir directory to house genome_finishing files
    assembly_dir = os.path.join(
            sample_alignment.get_model_data_dir(),
            'assembly')

    # Remove any existing assembly directory and create a fresh one.
    if os.path.exists(assembly_dir):
        shutil.rmtree(assembly_dir)
    os.mkdir(assembly_dir)

    # Get a bam of sorted SV indicants with pairs
    sv_indicants_bam = get_sv_indicating_reads(sample_alignment,
            sv_read_classes, overwrite=overwrite)

    prev_dataset = get_dataset_with_type(
            sample_alignment,
            Dataset.TYPE.BWA_FOR_DE_NOVO_ASSEMBLY)

    if overwrite and prev_dataset:
        prev_dataset.delete()

    if overwrite or prev_dataset is None:

        sv_indicants_sorted_bam = (os.path.splitext(sv_indicants_bam)[0] +
                '.coordinate_sorted.bam')

        # Bam needs to be coordinated sorted to index
        sort_bam_by_coordinate(sv_indicants_bam, sv_indicants_sorted_bam)

        # Bam needs to be indexed for jbrowse
        index_bam(sv_indicants_sorted_bam)

        for_assembly_dataset = add_dataset_to_entity(
                sample_alignment,
                Dataset.TYPE.BWA_FOR_DE_NOVO_ASSEMBLY,
                Dataset.TYPE.BWA_FOR_DE_NOVO_ASSEMBLY,
                filesystem_location=sv_indicants_sorted_bam)

        for_assembly_dataset.save()

        # TODO(dbgoodman): Look into re-enabling this. Right now, this creates
        # thousands of tracks and appears to significantly slow down JBrowse.
        # add_bam_file_track(reference_genome,
        #         sample_alignment, Dataset.TYPE.BWA_FOR_DE_NOVO_ASSEMBLY)

    velvet_opts = dict(DEFAULT_VELVET_OPTS)

    # Find insertion metrics
    ins_length, ins_length_sd = get_insert_size_mean_and_stdev(
            sample_alignment)
    velvet_opts['velvetg']['ins_length'] = ins_length
    velvet_opts['velvetg']['ins_length_sd'] = ins_length_sd

    # Find expected coverage
    avg_read_coverage = get_avg_genome_coverage(
            sample_alignment)

    # Calculate expected coverage in kmers
    genome_kmer_coverage = kmer_coverage(avg_read_coverage, ins_length,
            velvet_opts['velveth']['hash_length'])
    exp_cov = genome_kmer_coverage * VELVET_CONTIG_COVERAGE_EXPECTED
    velvet_opts['velvetg']['exp_cov'] = exp_cov

    # Set cov cutoff
    cov_cutoff = genome_kmer_coverage * VELVET_CONTIG_COVERAGE_CUTOFF
    velvet_opts['velvetg']['cov_cutoff'] = cov_cutoff

    # Update velvet_opts with input_velvet_opts
    for shallow_key in ['velveth', 'velvetg']:
        if shallow_key in input_velvet_opts:
            for deep_key in input_velvet_opts[shallow_key]:
                velvet_opts[shallow_key][deep_key] = (
                        input_velvet_opts[shallow_key][deep_key])

    # Perform velvet assembly and generate contig objects.
    contig_uid_list = assemble_with_velvet(
            assembly_dir, velvet_opts, sv_indicants_bam,
            sample_alignment, overwrite=overwrite)

    # Evaluate contigs for mapping.
    evaluate_contigs(contig_uid_list)

    # Update status again if not FAILED.
    sample_alignment = ExperimentSampleToAlignment.objects.get(
            uid=sample_alignment.uid)
    if (sample_alignment.data.get('assembly_status') !=
            ExperimentSampleToAlignment.ASSEMBLY_STATUS.FAILED):
        set_assembly_status(
                sample_alignment,
                ExperimentSampleToAlignment.ASSEMBLY_STATUS.WAITING_TO_PARSE)
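The exp_cov and cov_cutoff values above are scaled from an expected k-mer coverage. For reference, Velvet's usual conversion from read coverage C to k-mer coverage is C_k = C * (L - k + 1) / L for read length L and hash length k; whether the project's kmer_coverage helper uses read length or insert length for L is not shown here, so the sketch below, with made-up numbers and made-up scaling constants, is only illustrative.

def velvet_kmer_coverage(read_coverage, read_length, hash_length):
    # Standard Velvet conversion: C_k = C * (L - k + 1) / L
    return read_coverage * (read_length - hash_length + 1) / float(read_length)

c_k = velvet_kmer_coverage(50.0, 100, 21)   # 40.0x k-mer coverage
exp_cov_example = c_k * 0.8                 # hypothetical expected-coverage factor
cov_cutoff_example = c_k * 0.2              # hypothetical cutoff factor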
Example #35
    def test_run_alignment_with_spaces_in_genbank_filename(self):
        project = self.common_entities['project']
        ref_genome_label = 'dirty_upload'
        request = HttpRequest()
        request.POST = {
            'projectUid': project.uid,
            'refGenomeLabel': ref_genome_label,
            'importFileFormat': 'genbank'
        }
        request.method = 'POST'
        request.user = self.common_entities['user']
        authenticate(username=TEST_USERNAME, password=TEST_PASSWORD)
        self.assertTrue(request.user.is_authenticated())

        request.FILES['refGenomeFile'] = UploadedFile(
                file=open(TEST_GENBANK),
                name='dirty_genbank (spaces).gb')

        response = create_ref_genome_from_browser_upload(request)
        self.assertEqual(STATUS_CODE__SUCCESS, response.status_code)
        self.assertFalse(json.loads(response.content).get('error', False))

        # Get reference genome
        ref_genome = ReferenceGenome.objects.get(
                project=project,
                label=ref_genome_label)

        # Create sample model
        sample = ExperimentSample.objects.create(
                project=project,
                label='test_sample')

        # Add fastq datasets to sample
        add_dataset_to_entity(
                sample,
                Dataset.TYPE.FASTQ1,
                Dataset.TYPE.FASTQ1,
                filesystem_location=TEST_DIRTY_FQ_1)

        # Add fastq datasets to sample
        add_dataset_to_entity(
                sample,
                Dataset.TYPE.FASTQ2,
                Dataset.TYPE.FASTQ2,
                filesystem_location=TEST_DIRTY_FQ_2)

        # Run alignment of sample to reference
        alignment_group_label = 'test_alignment'
        sample_list = [sample]

        result = run_pipeline(
                alignment_group_label, ref_genome, sample_list)

        alignment_group = result[0]
        alignment_async_result = result[1]
        variant_calling_async_result = result[2]
        alignment_async_result.get()
        variant_calling_async_result.get()
        alignment_group = AlignmentGroup.objects.get(uid=alignment_group.uid)
        self.assertEqual(AlignmentGroup.STATUS.COMPLETED,
                alignment_group.status)

def compute_insert_metrics(bam_file, sample_alignment, stderr=None):
    """Computes read fragment insert size distribution.

    Creates a Dataset for each of:
        * histogram file
        * file with mean and stdev comma-separated

    Raises:
        ValueError if calculating paired-end distribution failed.
    """
    histo_file = os.path.splitext(bam_file)[0] + '.insert_size_histogram.txt'
    mean_stdev_file = (os.path.splitext(bam_file)[0] +
                       '.insert_size_mean_stdev.txt')

    # First, we analyze the bam distribution.
    read_bam_cmd = [settings.SAMTOOLS_BINARY, 'view', bam_file]
    p1 = Popen(read_bam_cmd, stdout=PIPE, stderr=stderr)

    read_length = get_read_length(bam_file)

    pairend_distro_cmd = [
        settings.LUMPY_PAIREND_DISTRO_BIN,
        '-r',
        str(read_length),
        '-X',
        '4',  # num stdevs from end to extend
        '-N',
        '10000',  # number to sample
        '-o',
        histo_file
    ]
    p2 = Popen(pairend_distro_cmd, stdin=p1.stdout, stdout=PIPE, stderr=stderr)

    # Allow p1 to receive a SIGPIPE if p2 exits.
    p1.stdout.close()

    # Run the command and get mean, stdev
    mean_and_stdev_str = p2.communicate()[0]
    mean_and_stdev_parts = mean_and_stdev_str.split('\t')
    if len(mean_and_stdev_parts) != 2:
        raise ValueError(
            "Poor alignment. Perhaps you tried aligning to the wrong reference "
            "genome?")
    raw_mean, raw_stdev = mean_and_stdev_parts
    mean = int(float(raw_mean.split(':')[1].strip()))
    stdev = int(float(raw_stdev.split(':')[1].strip()))

    # Lumpy doesn't like stdev of 0.
    if stdev < 1:
        stdev = 1

    # Save the histogram file as a Dataset.
    add_dataset_to_entity(sample_alignment,
                          Dataset.TYPE.LUMPY_INSERT_METRICS_HISTOGRAM,
                          Dataset.TYPE.LUMPY_INSERT_METRICS_HISTOGRAM,
                          filesystem_location=histo_file)

    # Write mean, stdev to another file and create another Dataset.
    with open(mean_stdev_file, 'w') as fh:
        fh.write("%d,%d" % (mean, stdev))
    add_dataset_to_entity(sample_alignment,
                          Dataset.TYPE.LUMPY_INSERT_METRICS_MEAN_STDEV,
                          Dataset.TYPE.LUMPY_INSERT_METRICS_MEAN_STDEV,
                          filesystem_location=mean_stdev_file)
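The mean/stdev Dataset written above is a single line of the form '<mean>,<stdev>'. A downstream consumer (for example, something like the get_insert_size_mean_and_stdev call used elsewhere in these examples) could read it back as in this minimal sketch; the file path and values are hypothetical.

mean_stdev_path = '/tmp/example.insert_size_mean_stdev.txt'  # hypothetical path
with open(mean_stdev_path, 'w') as fh:
    fh.write('%d,%d' % (310, 42))  # same format written by compute_insert_metrics

with open(mean_stdev_path) as fh:
    mean, stdev = [int(tok) for tok in fh.read().strip().split(',')]
assert (mean, stdev) == (310, 42)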
Example #37
def combine_list_allformats(reference_genome_list, new_ref_genome_label,
                            project):
    """Combine ReferenceGenomes into a new single ReferenceGenome
    composed of the component parts.

    Args:
        reference_genome_list: List of ReferenceGenome objects.
        new_ref_genome_label: Label for the new ReferenceGenome.
        project: Project to which the new ReferenceGenome will be added.

    Returns:
        Object with keys:
            * is_success
            * new_reference_genome (when is_success = True)
            * error_msg (when is_success = False)
    """
    rg_dataset_list = []
    for ref_genome in reference_genome_list:
        rg_dataset_tup = None
        for dataset_type in [
                Dataset.TYPE.REFERENCE_GENOME_GENBANK,
                Dataset.TYPE.REFERENCE_GENOME_FASTA
        ]:
            filter_result = ref_genome.dataset_set.filter(type=dataset_type)
            if len(filter_result):
                rg_dataset_tup = (ref_genome, filter_result[0])
                break
        if (not rg_dataset_tup or
                not os.path.exists(rg_dataset_tup[1].get_absolute_location())):
            return {
                'is_success': False,
                'error_msg': ('All reference genomes must have an associated '
                              'FASTA or Genbank dataset')
            }
        else:
            rg_dataset_list.append(rg_dataset_tup)
    assert len(rg_dataset_list) == len(reference_genome_list)

    # Read the datasets into Biopython SeqRecord objects.
    rg_seqrecord_list = []
    seqrecord_ids = []
    seqrecord_descriptions = []
    for rg, dataset in rg_dataset_list:
        with open(dataset.get_absolute_location()) as input_fh:
            for record in SeqIO.parse(input_fh,
                                      DATASET_TO_SEQIO_FORMAT[dataset.type]):
                rg_seqrecord_list.append((rg, record))
                seqrecord_ids.append('_'.join([
                    remove_whitespace(rg.label)[:7],
                    remove_whitespace(record.id)[:8]
                ]))
                seqrecord_descriptions.append(record.description)

    # Create a new ReferenceGenome.
    new_ref_genome = ReferenceGenome.objects.create(project=project,
                                                    label=new_ref_genome_label)

    # If ReferenceGenome label and Chromosome id are the same, there will be
    # duplicate seqrecord_ids: resolve by including numeric prefix in id
    seq_record_list = []
    MAX_LOCUS_NAME_LEN = 16
    unique_id_len = len(str(len(seqrecord_ids)))
    label_len = (MAX_LOCUS_NAME_LEN - 2 - unique_id_len) / 2
    for i, seqrecord_id in enumerate(seqrecord_ids):
        rg, seqrecord = rg_seqrecord_list[i]

        if seqrecord_ids.count(seqrecord_id) == 1:
            unique_seqrecord_id = seqrecord_id
        else:
            unique_seqrecord_id = '_'.join([
                str(i),
                remove_whitespace(rg.label)[:label_len],
                remove_whitespace(seqrecord.id)[:label_len]
            ])

        seqrecord.seq.alphabet = ambiguous_dna
        seqrecord.name = unique_seqrecord_id
        seqrecord.id = unique_seqrecord_id

        if seqrecord_descriptions.count(seqrecord.description) > 1:
            seqrecord.description = ' '.join(
                [seqrecord.description, 'from Reference Genome:', rg.label])

        seq_record_list.append(seqrecord)
        Chromosome.objects.create(reference_genome=new_ref_genome,
                                  label=seqrecord.id,
                                  seqrecord_id=seqrecord.id,
                                  num_bases=len(seqrecord))

    # Generate a filename from the label with non-alphanumeric characters
    # replaced by underscores.
    filename_prefix = generate_safe_filename_prefix_from_label(
        new_ref_genome_label)
    does_list_include_genbank = (Dataset.TYPE.REFERENCE_GENOME_GENBANK in [
        rg_dataset_tup[1].type for rg_dataset_tup in rg_dataset_list
    ])

    if does_list_include_genbank:
        filename = filename_prefix + '.gb'
    else:
        filename = filename_prefix + '.fa'
    new_file_dest = os.path.join(new_ref_genome.get_model_data_dir(), filename)

    # Write the result.
    if does_list_include_genbank:
        ref_genome_dataset_type = Dataset.TYPE.REFERENCE_GENOME_GENBANK
    else:
        ref_genome_dataset_type = Dataset.TYPE.REFERENCE_GENOME_FASTA
    output_file_format = DATASET_TO_SEQIO_FORMAT[ref_genome_dataset_type]

    with open(new_file_dest, 'w') as output_fh:
        SeqIO.write(seq_record_list, output_fh, output_file_format)

    # Create a dataset which will point to the file. This step must happen
    # after writing the file because a signal will be triggered which requires
    # the Genbank to exist already.
    add_dataset_to_entity(new_ref_genome, ref_genome_dataset_type,
                          ref_genome_dataset_type, new_file_dest)

    return {'is_success': True, 'new_reference_genome': new_ref_genome}
Example #38
def bootstrap_fake_data():
    """Fill the database with fake data.
    """
    user = get_or_create_user()

    ### Create some projects
    (test_project,
     project_created) = Project.objects.get_or_create(title=TEST_PROJECT_NAME,
                                                      owner=user.get_profile())
    (test_project_2,
     project_created) = Project.objects.get_or_create(title=SV_PROJECT_NAME,
                                                      owner=user.get_profile())

    ### Create some reference genomes
    ref_genome_1 = import_reference_genome_from_local_file(
        test_project, REF_GENOME_1_LABEL, TEST_FASTA, 'fasta')

    ref_genome_2 = import_reference_genome_from_local_file(
        test_project, REF_GENOME_2_LABEL, TEST_FASTA, 'fasta')

    ref_genome_3 = import_reference_genome_from_local_file(
        test_project, 'test_genome', TEST_FASTA, 'fasta')

    ### Create some saved queries.
    for saved_query_text in CUSTOM_SAVED_QUERY_LIST:
        SavedVariantFilterQuery.objects.get_or_create(owner=user.get_profile(),
                                                      text=saved_query_text)

    ### Create some ExperimentSamples.

    # Create some samples without backing data just to explore the UI.
    ExperimentSample.objects.create(project=test_project,
                                    label='C321D_MiSeq',
                                    data={'SAMPLE_WELL': 'A01'})

    ExperimentSample.objects.create(project=test_project,
                                    label='C321D Fixed 01',
                                    data={'SAMPLE_WELL': 'A02'})

    ExperimentSample.objects.create(project=test_project,
                                    label='C321D Fixed 02',
                                    data={'SAMPLE_WELL': 'A03'})

    # Create some samples with backing data.
    (sample_1,
     created) = ExperimentSample.objects.get_or_create(project=test_project,
                                                       label=SAMPLE_1_LABEL)
    # Add datasets to the samples.
    if not sample_1.dataset_set.filter(type=Dataset.TYPE.FASTQ1):
        copy_and_add_dataset_source(sample_1, Dataset.TYPE.FASTQ1,
                                    Dataset.TYPE.FASTQ1, TEST_FASTQ1)
    if not sample_1.dataset_set.filter(type=Dataset.TYPE.FASTQ2):
        copy_and_add_dataset_source(sample_1, Dataset.TYPE.FASTQ2,
                                    Dataset.TYPE.FASTQ2, TEST_FASTQ2)

    # Create sample backed by g-zipped data.
    gz_backed_sample = ExperimentSample.objects.create(
        project=test_project, label='sample backed by gz data')
    gz_fastq1_dataset = copy_and_add_dataset_source(gz_backed_sample,
                                                    Dataset.TYPE.FASTQ1,
                                                    Dataset.TYPE.FASTQ1,
                                                    TEST_FASTQ_GZ_1)
    gz_fastq2_dataset = copy_and_add_dataset_source(gz_backed_sample,
                                                    Dataset.TYPE.FASTQ2,
                                                    Dataset.TYPE.FASTQ2,
                                                    TEST_FASTQ_GZ_2)
    run_fastqc_on_sample_fastq(gz_backed_sample, gz_fastq1_dataset)
    run_fastqc_on_sample_fastq(gz_backed_sample, gz_fastq2_dataset, rev=True)

    ### Create an alignment.
    alignment_group_1 = AlignmentGroup.objects.create(
        label='Alignment 1',
        reference_genome=ref_genome_3,
        aligner=AlignmentGroup.ALIGNER.BWA)
    # Link it to a sample.
    sample_alignment = ExperimentSampleToAlignment.objects.create(
        alignment_group=alignment_group_1, experiment_sample=sample_1)
    ### Add alignment data. NOTE: Stored in sample model dir.
    # NOTE: This is a bit convoluted. Perhaps it would be better to store alignments
    # in the ExperimentSampleToAlignment directory.
    copy_dest = copy_dataset_to_entity_data_dir(sample_1, TEST_BAM)
    copy_dataset_to_entity_data_dir(sample_1, TEST_BAM_INDEX)
    add_dataset_to_entity(sample_alignment, Dataset.TYPE.BWA_ALIGN,
                          Dataset.TYPE.BWA_ALIGN, copy_dest)

    # Create fake variants.
    create_fake_variants_and_variant_sets(ref_genome_1)

    #############################
    # Full VCF Testing (annotated for snpeff, variant filtering, etc)
    #############################

    # Create a new reference genome and samples using full_vcf_test_set
    full_vcf_reference_genome = import_reference_genome_from_local_file(
        test_project, 'mg1655_tolC_through_zupT', FullVCFTestSet.TEST_GENBANK,
        'genbank')

    # Create all samples.
    parent_obj = None
    full_vcf_samples = []
    for i in range(FullVCFTestSet.NUM_SAMPLES):
        sample_obj = ExperimentSample.objects.create(project=test_project,
                                                     label='Sample %d' % i)

        sample_obj.data['SAMPLE_WELL'] = 'A0%d' % (i + 1)

        # Sample 0 acts as the parent of every later sample; the lineage is
        # recorded both in the SAMPLE_PARENTS data field and via add_child().
        if i == 0:
            parent_obj = sample_obj
        if i > 0:
            sample_obj.data['SAMPLE_PARENTS'] = parent_obj.label
            parent_obj.add_child(sample_obj)
            parent_obj.save()

        sample_obj.save()

        # Add raw reads to each sample.
        fastq1_dataset = copy_and_add_dataset_source(sample_obj,
                                                     Dataset.TYPE.FASTQ1,
                                                     Dataset.TYPE.FASTQ1,
                                                     FullVCFTestSet.FASTQ1[i])
        fastq2_dataset = copy_and_add_dataset_source(sample_obj,
                                                     Dataset.TYPE.FASTQ2,
                                                     Dataset.TYPE.FASTQ2,
                                                     FullVCFTestSet.FASTQ2[i])

        # Run FASTQC on sample reads.
        run_fastqc_on_sample_fastq(sample_obj, fastq1_dataset)
        run_fastqc_on_sample_fastq(sample_obj, fastq2_dataset, rev=True)

        full_vcf_samples.append(sample_obj)

    # Run the alignment pipeline. run_pipeline() returns the alignment group
    # it creates, indexed by the reference genome's uid, but the result is not
    # needed here.
    run_pipeline('test_align', full_vcf_reference_genome, full_vcf_samples)

    import_variant_set_from_vcf(full_vcf_reference_genome, 'Designed',
                                FullVCFTestSet.TEST_DESIGNED_SNPS)

    def _create_region_intervals(region, interval_tuple_list):
        """Helper method to create RegionIntervals for a Region.

        Args:
            region: Region Model object.
            interval_tuple_list: List of tuples of intervals to create.
        """
        for interval in interval_tuple_list:
            RegionInterval.objects.create(region=region,
                                          start=interval[0],
                                          end=interval[1])

    # Create some fake regions.
    # TODO: Should not be much harder to replace this with real regions.
    region_1 = Region.objects.create(
        reference_genome=full_vcf_reference_genome,
        label='region_1',
        type=Region.TYPE.POOR_MAPPING_QUALITY)
    _create_region_intervals(region_1, [(1, 150), (300, 400), (500, 900)])

    region_2 = Region.objects.create(
        reference_genome=full_vcf_reference_genome,
        label='region_2',
        type=Region.TYPE.POOR_MAPPING_QUALITY)
    _create_region_intervals(region_2, [(1000, 1500)])

    region_3 = Region.objects.create(
        reference_genome=full_vcf_reference_genome,
        label='region_3',
        type=Region.TYPE.POOR_MAPPING_QUALITY)
    _create_region_intervals(region_3, [(1800, 1900), (2150, 2300)])

    # And some GENE regions.

    gene_A = Region.objects.create(reference_genome=full_vcf_reference_genome,
                                   label='geneA',
                                   type=Region.TYPE.GENE)
    _create_region_intervals(gene_A, [(2000, 2400)])

    gene_B = Region.objects.create(reference_genome=full_vcf_reference_genome,
                                   label='geneB',
                                   type=Region.TYPE.GENE)
    _create_region_intervals(gene_B, [(4800, 5200)])

    gene_C = Region.objects.create(reference_genome=full_vcf_reference_genome,
                                   label='geneC',
                                   type=Region.TYPE.GENE)
    _create_region_intervals(gene_C, [(1, 500)])
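
    # A minimal sketch (helper name is hypothetical) of how the poor-mapping
    # regions created above could be queried for a genome position, using only
    # the Region / RegionInterval fields exercised in this bootstrap:
    def _position_in_poor_mapping_region(reference_genome, position):
        return RegionInterval.objects.filter(
                region__reference_genome=reference_genome,
                region__type=Region.TYPE.POOR_MAPPING_QUALITY,
                start__lte=position,
                end__gte=position).exists()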

    # Bootstrap test_project_2 with SV stuff
    sv_testing_bootstrap(test_project_2)
Example #39
def generate_contigs(sample_alignment,
                     sv_read_classes={},
                     input_velvet_opts={},
                     overwrite=True):
    """Generates contigs.
    """
    # Don't proceed if processing this sample alignment already failed,
    # whether earlier in this run or in another async process.
    sample_alignment = ExperimentSampleToAlignment.objects.get(
        uid=sample_alignment.uid)
    if (sample_alignment.data.get('assembly_status') ==
            ExperimentSampleToAlignment.ASSEMBLY_STATUS.FAILED):
        return

    # Set assembly status for UI
    # NOTE: Setting this status is playing whack-a-mole against other async sv
    # detection functions, e.g. detect_deletion.cov_detect_deletion_make_vcf().
    set_assembly_status(sample_alignment,
                        ExperimentSampleToAlignment.ASSEMBLY_STATUS.ASSEMBLING)

    print 'Generating contigs\n'

    # Ensure the reference genome has a FASTA dataset before assembly.
    reference_genome = sample_alignment.alignment_group.reference_genome
    reference_genome.dataset_set.get_or_create(
        type=Dataset.TYPE.REFERENCE_GENOME_FASTA)

    # Path of the assembly directory that houses the genome_finishing files.
    assembly_dir = os.path.join(sample_alignment.get_model_data_dir(),
                                'assembly')

    # Start from a clean assembly directory: remove any existing one, then
    # recreate it.
    if os.path.exists(assembly_dir):
        shutil.rmtree(assembly_dir)
    os.mkdir(assembly_dir)

    # Get a bam of sorted SV indicants with pairs
    sv_indicants_bam = get_sv_indicating_reads(sample_alignment,
                                               sv_read_classes,
                                               overwrite=overwrite)

    prev_dataset = get_dataset_with_type(sample_alignment,
                                         Dataset.TYPE.BWA_FOR_DE_NOVO_ASSEMBLY)

    if overwrite and prev_dataset:
        prev_dataset.delete()

    if overwrite or prev_dataset is None:

        sv_indicants_sorted_bam = (os.path.splitext(sv_indicants_bam)[0] +
                                   '.coordinate_sorted.bam')

        # Bam needs to be coordinate-sorted before it can be indexed
        sort_bam_by_coordinate(sv_indicants_bam, sv_indicants_sorted_bam)

        # Bam needs to be indexed for jbrowse
        index_bam(sv_indicants_sorted_bam)

        for_assembly_dataset = add_dataset_to_entity(
            sample_alignment,
            Dataset.TYPE.BWA_FOR_DE_NOVO_ASSEMBLY,
            Dataset.TYPE.BWA_FOR_DE_NOVO_ASSEMBLY,
            filesystem_location=sv_indicants_sorted_bam)

        for_assembly_dataset.save()

        # TODO(dbgoodman): Look into re-enabling this. Right now, this creates
        # thousands of tracks and appears to significantly slow down JBrowse.
        # add_bam_file_track(reference_genome,
        #         sample_alignment, Dataset.TYPE.BWA_FOR_DE_NOVO_ASSEMBLY)

    velvet_opts = dict(DEFAULT_VELVET_OPTS)
    # Copy the nested dicts too; otherwise the per-sample settings below would
    # mutate the shared DEFAULT_VELVET_OPTS in place.
    velvet_opts['velveth'] = dict(velvet_opts['velveth'])
    velvet_opts['velvetg'] = dict(velvet_opts['velvetg'])

    # Find insert size metrics; velvetg's ins_length / ins_length_sd are the
    # paired-end insert length mean and standard deviation.
    ins_length, ins_length_sd = get_insert_size_mean_and_stdev(
        sample_alignment)
    velvet_opts['velvetg']['ins_length'] = ins_length
    velvet_opts['velvetg']['ins_length_sd'] = ins_length_sd

    # Find expected coverage
    avg_read_coverage = get_avg_genome_coverage(sample_alignment)

    # Calculate expected coverage in kmers
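    # Velvet expresses coverage in k-mers rather than bases; the usual
    # conversion is Ck = C * (L - k + 1) / L for base coverage C, length L and
    # hash length k (assumption: kmer_coverage() implements this conversion
    # using the values passed below). E.g. C=50, L=100, k=21 gives Ck = 40.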
    genome_kmer_coverage = kmer_coverage(avg_read_coverage, ins_length,
                                         velvet_opts['velveth']['hash_length'])
    exp_cov = genome_kmer_coverage * VELVET_CONTIG_COVERAGE_EXPECTED
    velvet_opts['velvetg']['exp_cov'] = exp_cov

    # Set cov_cutoff: velvetg discards graph nodes whose k-mer coverage falls
    # below this value.
    cov_cutoff = genome_kmer_coverage * VELVET_CONTIG_COVERAGE_CUTOFF
    velvet_opts['velvetg']['cov_cutoff'] = cov_cutoff

    # Update velvet_opts with input_velvet_opts
    for shallow_key in ['velveth', 'velvetg']:
        if shallow_key in input_velvet_opts:
            for deep_key in input_velvet_opts[shallow_key]:
                velvet_opts[shallow_key][deep_key] = (
                    input_velvet_opts[shallow_key][deep_key])
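    # For example (hypothetical input), input_velvet_opts={'velvetg':
    # {'exp_cov': 30}} overrides only velvet_opts['velvetg']['exp_cov'] and
    # leaves every other default and computed value in place.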

    # Perform velvet assembly and generate contig objects.
    contig_uid_list = assemble_with_velvet(assembly_dir,
                                           velvet_opts,
                                           sv_indicants_bam,
                                           sample_alignment,
                                           overwrite=overwrite)

    # Evaluate contigs for mapping.
    evaluate_contigs(contig_uid_list)

    # Update status again if not FAILED.
    sample_alignment = ExperimentSampleToAlignment.objects.get(
        uid=sample_alignment.uid)
    if (sample_alignment.data.get('assembly_status') !=
            ExperimentSampleToAlignment.ASSEMBLY_STATUS.FAILED):
        set_assembly_status(
            sample_alignment,
            ExperimentSampleToAlignment.ASSEMBLY_STATUS.WAITING_TO_PARSE)
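
A minimal usage sketch (the uid value and the velvetg override are
hypothetical) of how generate_contigs might be invoked for a single sample
alignment:

sample_alignment = ExperimentSampleToAlignment.objects.get(
        uid='<sample_alignment_uid>')
generate_contigs(sample_alignment,
                 input_velvet_opts={'velvetg': {'exp_cov': 30}},
                 overwrite=True)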