def test_get_allele_frequencies(self):
        # The first variant of the VEP-annotated fixture is expected to
        # produce this exact list of 17 frequency values.
        vcf_path = 'VariantDatabase/tests/test_files/vcfs/vep_annotated_test_vcf.vcf'
        variants = vcf_parser.create_master_list(vcf_path, 'WS61594_14000835')

        transcript_data = variants[0]['transcript_data']

        expected = [
            0.3391, 0.2859, 0.4539, 0.4274, 0.253, 0.3272, 0.392, 0.4176,
            0.3196, 0.43, 0.2793, 0.5536, 0.4469, 0.4538, 0.3747, 0.2942,
            0.4278
        ]

        observed = vcf_parser.get_allele_frequencies(transcript_data)
        self.assertEqual(observed, expected)
    def test_get_rs_number(self):
        # Each case pairs a transcript_data mapping with the string that
        # get_rs_number is expected to return for it.
        vcf_path = 'VariantDatabase/tests/test_files/vcfs/vep_annotated_test_vcf.vcf'
        variants = vcf_parser.create_master_list(vcf_path, 'WS61594_14000835')

        cases = [
            # Real annotation taken from the fixture vcf.
            (variants[0]['transcript_data'], 'rs3795269'),
            # Two transcripts, both carrying a value.
            ({
                'NM_002617.3': {'Existing_variation': 'gene1'},
                'NM_002614.3': {'Existing_variation': 'gene2'},
            }, 'gene1|gene2'),
            # One transcript has an empty value.
            ({
                'NM_002617.3': {'Existing_variation': ''},
                'NM_002614.3': {'Existing_variation': 'gene2'},
            }, 'gene2'),
            # No transcript has the 'Existing_variation' key at all; the
            # expected result is the string "None", not the None object.
            ({
                'NM_002617.3': {'gene1': ''},
                'NM_002614.3': {'test': 'gene2'},
            }, "None"),
        ]

        for transcript_data, expected in cases:
            self.assertEqual(vcf_parser.get_rs_number(transcript_data),
                             expected)
    def test_get_max_af(self):
        # get_max_af is expected to return MAX_AF as a float, 0.0 for an
        # empty value, and to raise KeyError when the key is absent.
        vcf_path = 'VariantDatabase/tests/test_files/vcfs/vep_annotated_test_vcf.vcf'
        variants = vcf_parser.create_master_list(vcf_path, 'WS61594_14000835')

        cases = [
            (variants[0]['transcript_data'], 0.5536),
            ({'NM_002617.3': {'MAX_AF': '0.01'}}, 0.01),
            ({'NM_002617.3': {'MAX_AF': ''}}, 0.0),
        ]
        missing_key = {'NM_002617.3': {'NO_MAX_AF': ''}}

        for transcript_data, expected in cases:
            self.assertEqual(vcf_parser.get_max_af(transcript_data), expected)

        with self.assertRaises(KeyError):
            vcf_parser.get_max_af(missing_key)
    def test_get_variant_genes_list(self):
        # Results are (SYMBOL, STRAND) tuples. The fixture case is compared
        # as an ordered list; the hand-built cases are compared as sets.
        vcf_path = 'VariantDatabase/tests/test_files/vcfs/vep_annotated_test_vcf.vcf'
        variants = vcf_parser.create_master_list(vcf_path, 'WS61594_14000835')

        from_fixture = variants[0]['transcript_data']
        expected_from_fixture = [('PEX10', '-1'), ('RER1', '1')]

        both_named = {
            'NM_002617.3': {'SYMBOL': 'gene1', 'STRAND': '1'},
            'NM_002614.3': {'SYMBOL': 'gene2', 'STRAND': '1'},
        }
        expected_both_named = [('gene1', '1'), ('gene2', '1')]

        # A transcript with an empty SYMBOL is expected to be left out.
        one_unnamed = {
            'NM_002617.3': {'SYMBOL': '', 'STRAND': '1'},
            'NM_002614.3': {'SYMBOL': 'gene2', 'STRAND': '1'},
        }
        expected_one_unnamed = [('gene2', '1')]

        self.assertEqual(vcf_parser.get_variant_genes_list(from_fixture),
                         expected_from_fixture)

        unordered_cases = (
            (both_named, expected_both_named),
            (one_unnamed, expected_one_unnamed),
        )
        for transcript_data, expected in unordered_cases:
            found = vcf_parser.get_variant_genes_list(transcript_data)
            self.assertEqual(set(found), set(expected))
    def test_get_clin_sig(self):
        # get_clin_sig is expected to return the CLIN_SIG value, the string
        # 'None' when the fixture has no value, and to raise KeyError when
        # the key is absent.
        vcf_path = 'VariantDatabase/tests/test_files/vcfs/vep_annotated_test_vcf.vcf'
        variants = vcf_parser.create_master_list(vcf_path, 'WS61594_14000835')

        leading_cases = [
            (variants[0]['transcript_data'], 'None'),
            ({'NM_002617.3': {'CLIN_SIG': 'test2'}}, 'test2'),
            ({'NM_002617.3': {'CLIN_SIG': 'test3'}}, 'test3'),
        ]
        missing_key = {'NM_002617.3': {'NO_CLIN_SIG': 'test4'}}
        sixth_variant = variants[5]['transcript_data']

        for transcript_data, expected in leading_cases:
            self.assertEqual(vcf_parser.get_clin_sig(transcript_data),
                             expected)

        with self.assertRaises(KeyError):
            vcf_parser.get_clin_sig(missing_key)

        self.assertEqual(vcf_parser.get_clin_sig(sixth_variant), 'benign')
Esempio n. 6
0
# Keyword names on Variant for the values returned by
# vcf_parser.get_allele_frequencies(), listed in the positional order in
# which upload_sample_vcf reads that list.
_ALLELE_FREQUENCY_FIELDS = (
    'af',
    'afr_af',
    'amr_af',
    'eur_af',
    'eas_af',
    'sas_af',
    'exac_af',
    'exac_adj_af',
    'exac_afr_af',
    'exac_amr_af',
    'exac_eas_af',
    'exac_fin_af',
    'exac_nfe_af',
    'exac_oth_af',
    'exac_sas_af',
    'esp_aa_af',
    'esp_ea_af',
)


def _find_sample_vcf(output_dir, sample_name):
    """Return the path of the single gzipped vcf belonging to sample_name.

    Searches <output_dir>/vcfs*/<sample_name>*.vcf.gz and raises
    CommandError when nothing matches or when the match is ambiguous.
    """
    # Local import so this block does not depend on the file's import list.
    import os

    # os.path.join gives the same pattern as plain concatenation when
    # output_dir ends with a separator, and a correct one when it does not.
    query = os.path.join(output_dir, "vcfs*", sample_name + "*.vcf.gz")

    matches = glob.glob(query)

    if not matches:

        raise CommandError("Could not find a vcf file for sample: " +
                           sample_name)

    if len(matches) > 1:

        raise CommandError("Found more than one file for sample: " +
                           sample_name)

    return matches[0]


def _get_or_create_transcript(transcript_key, annotation):
    """Return the Transcript for transcript_key, creating it if missing.

    An empty key maps to the shared 'no_transcript' placeholder. A newly
    created transcript takes its canonical flag and (optional) gene from
    the 'CANONICAL' and 'SYMBOL' entries of annotation.
    """
    if transcript_key == "":

        transcript_model, created = Transcript.objects.get_or_create(
            name='no_transcript', defaults={'canonical': False})

        return transcript_model

    try:

        return Transcript.objects.get(name=transcript_key)

    except Transcript.DoesNotExist:

        pass

    fields = {
        'name': transcript_key,
        'canonical': annotation['CANONICAL'] == 'YES',
    }

    symbol = annotation['SYMBOL']

    if symbol != "":

        # The caller saves every gene of the variant before transcripts
        # are created, so a plain get() is used here.
        fields['gene'] = Gene.objects.get(name=symbol)

    transcript_model = Transcript(**fields)
    transcript_model.save()

    return transcript_model


def _save_variant_transcript(new_variant, transcript_model, annotation):
    """Create and save the VariantTranscript linking variant and transcript."""
    # NOTE(review): 'CDS_position' used to be read here but was never passed
    # to VariantTranscript; confirm whether the model should store it.
    variant_transcript = VariantTranscript(
        variant=new_variant,
        transcript=transcript_model,
        consequence=annotation['Consequence'],
        exon=annotation['EXON'],
        intron=annotation['INTRON'],
        hgvsc=annotation['HGVSc'],
        hgvsp=annotation['HGVSp'],
        codons=annotation['Codons'],
        cdna_position=annotation['cDNA_position'],
        protein_position=annotation['Protein_position'],
        amino_acids=annotation['Amino_acids'],
        picked=annotation['PICK'] == '1')

    variant_transcript.save()


def upload_sample_vcf(output_dir, sample_name):
    """
    Take a VEP annotated vcf and upload the variants contained within it.

    Input:

    output_dir = The directory containing the pipeline output.
    sample_name = A string representing the unique sample name.

    Output:

    None - uploads and returns None.

    Raises:

    CommandError - if the sample is not in the database, if variants have
    already been uploaded against it, if zero or several vcf files match
    the sample, if the vcf fails validation or has no VEP annotations, if
    no user with pk=1 exists, or if the vcf cannot be parsed.
    """

    #Get the sample - if we cannot find it then raise an error.
    try:

        sample = Sample.objects.get(name=sample_name)

    except Sample.DoesNotExist:

        raise CommandError("Cannot find a sample in the DB with name " +
                           sample_name + ".")

    if VariantSample.objects.filter(sample=sample).exists():

        raise CommandError("Stuff already uploaded against this sample.")

    #Find VCF file

    vcf_file_path = _find_sample_vcf(output_dir, sample_name)

    #validate vcf

    validation_report = vcf_parser.validate_input(vcf_file_path, sample_name)

    # The explicit comparisons are kept on purpose: the helpers' return
    # types are not visible here, so truthiness might behave differently.
    if validation_report[0] == False:

        raise CommandError('Error opening vcf file: ' + validation_report[1])

    if vcf_parser.vep_annotated(vcf_file_path) == False:

        raise CommandError("No VEP annotations detected in vcf")

    #Check we have an admin user in the database for the next stage

    try:

        User.objects.get(pk=1)  # a superuser has to have been created

    except User.DoesNotExist:

        raise CommandError(
            'Could not find an appropiate user to use for downstream data entry - please create an admin with pk=1'
        )

    #Try and parse the vcf using the vcf_parser
    try:

        vcf_data = vcf_parser.create_master_list(vcf_file_path, sample_name)

    except Exception as error:

        # Surface the underlying problem instead of hiding it.
        raise CommandError(
            'Could not process data using vcf_parser function: ' +
            str(error))

    #update sample information e.g. vcf location.

    sample.vcf_file = vcf_file_path

    sample.save()

    for variant in vcf_data:

        transcript_data = variant['transcript_data']

        chromosome = variant['chrom']
        pos = str(variant['pos'])
        ref = variant['reference']
        alt = variant['alt_alleles'][0]  # only the first alt allele is used

        hash_id = variant_utilities.get_variant_hash(chromosome, pos, ref, alt)

        gene_list = vcf_parser.get_variant_genes_list(transcript_data)

        rs_number = vcf_parser.get_rs_number(transcript_data)

        worst_consequence = Consequence.objects.get(
            name=vcf_parser.worst_consequence(transcript_data))

        max_af = vcf_parser.get_max_af(transcript_data)

        allele_frequencies = vcf_parser.get_allele_frequencies(
            transcript_data)

        clin_sig = vcf_parser.get_clin_sig(transcript_data)

        # Indexing (rather than zip) keeps the IndexError on a short list.
        frequency_fields = {
            name: allele_frequencies[index]
            for index, name in enumerate(_ALLELE_FREQUENCY_FIELDS)
        }

        #Look for a variant in the database if we have not seen it before create a new one

        try:

            new_variant = Variant.objects.get(variant_hash=hash_id)

        except Variant.DoesNotExist:

            new_variant = Variant(chromosome=chromosome,
                                  position=pos,
                                  ref=ref,
                                  alt=alt,
                                  variant_hash=hash_id,
                                  rs_number=rs_number,
                                  last_updated=timezone.now(),
                                  worst_consequence=worst_consequence,
                                  max_af=max_af,
                                  clinical_sig=clin_sig,
                                  **frequency_fields)

            new_variant.save()

            #Make sure every gene exists before transcripts refer to it

            for gene in gene_list:

                Gene.objects.get_or_create(name=gene[0])

            #Now create transcripts and their transcriptvariant models

            for transcript_key in transcript_data:

                annotation = transcript_data[transcript_key]

                transcript_model = _get_or_create_transcript(
                    transcript_key, annotation)

                _save_variant_transcript(new_variant, transcript_model,
                                         annotation)

        #Link the (new or existing) variant to this sample

        new_variant_sample = VariantSample(
            variant=new_variant,
            sample=sample,
            genotype=variant['genotype'],
            caller=variant['Caller'],
            allele_depth=variant['allele_depth'],
            filter_status=variant['filter_status'],
            total_count_forward=variant['TCF'],
            total_count_reverse=variant['TCR'],
            vafs=":".join(str(x) for x in variant['VAFS']))

        new_variant_sample.save()

    return None
    def test_worst_consequence(self):
        # Each case pairs a transcript_data mapping with the consequence
        # that worst_consequence is expected to pick for it.
        vcf_path = 'VariantDatabase/tests/test_files/vcfs/vep_annotated_test_vcf.vcf'
        variants = vcf_parser.create_master_list(vcf_path, 'WS61594_14000835')

        def two_transcripts(first, second):
            # Same two transcript ids, in the same insertion order, for
            # every hand-built case.
            return {
                'NM_002617.3': {'Consequence': first},
                'NM_002614.3': {'Consequence': second},
            }

        three_transcripts = two_transcripts(
            'transcript_ablation',
            'splice_acceptor_variant&splice_donor_variant')
        three_transcripts['NM_002684.3'] = {'Consequence': 'inframe_deletion'}

        cases = [
            (variants[0]['transcript_data'], 'intron_variant'),
            (two_transcripts('transcript_ablation',
                             'splice_acceptor_variant'),
             'transcript_ablation'),
            (three_transcripts, 'transcript_ablation'),
            (two_transcripts('stop_gained', 'splice_acceptor_variant'),
             'splice_acceptor_variant'),
            (two_transcripts('frameshift_variant',
                             'splice_acceptor_variant'),
             'splice_acceptor_variant'),
            (two_transcripts(
                'intron_variant&missense_variant&non_coding_transcript_variant',
                'frameshift_variant'),
             'frameshift_variant'),
            (two_transcripts('transcript_amplification',
                             'inframe_insertion'),
             'transcript_amplification'),
            (two_transcripts(
                'inframe_insertion',
                'incomplete_terminal_codon_variant&incomplete_terminal_codon_variant'),
             'inframe_insertion'),
            (two_transcripts('feature_elongation', 'intergenic_variant'),
             'feature_elongation'),
            (two_transcripts('feature_truncation', 'intergenic_variant'),
             'feature_truncation'),
            (two_transcripts('stop_lost', 'stop_lost'), 'stop_lost'),
        ]

        for transcript_data, expected in cases:
            self.assertEqual(vcf_parser.worst_consequence(transcript_data),
                             expected)
    def test_create_master_list(self):
        # Checks the scalar fields and one full transcript annotation of the
        # first parsed variant, then the total number of variants.
        vcf_path = 'VariantDatabase/tests/test_files/vcfs/vep_annotated_test_vcf.vcf'
        variants = vcf_parser.create_master_list(vcf_path, 'WS61594_14000835')

        first = variants[0]

        expected_fields = [
            ('reference', 'C'),
            ('genotype', '0/1'),
            ('alt_alleles', ('A', )),
            ('filter_status', '.'),
            ('hash_id',
             'd360384c2a1df84a02bc9b2f19ee584ed837d600081450beab17762532ce18ba'),
            ('allele_depth', '124:124'),
        ]

        expected_transcript = {
            'MAX_AF_POPS': 'ExAC_FIN',
            'TSL': '',
            'APPRIS': '',
            'ExAC_AF': '0.392',
            'ExAC_NFE_AF': '0.4469',
            'AMR_AF': '0.4539',
            'SYMBOL': 'PEX10',
            'AFR_AF': '0.2859',
            'ExAC_EAS_AF': '0.2793',
            'Feature': 'NM_002617.3',
            'Codons': '',
            'MOTIF_NAME': '',
            'DOMAINS': '',
            'SIFT': '',
            'VARIANT_CLASS': 'SNV',
            'EA_AF': '0.4278',
            'CDS_position': '',
            'CCDS': '',
            'Allele': 'A',
            'PolyPhen': '',
            'AA_AF': '0.2942',
            'MOTIF_SCORE_CHANGE': '',
            'IMPACT': 'MODIFIER',
            'HGVSp': '',
            'ENSP': 'NP_002608.1',
            'MAX_AF': '0.5536',
            'INTRON': '4/5',
            'ExAC_AFR_AF': '0.3196',
            'Existing_variation': 'rs3795269',
            'HGVSc': 'NM_002617.3:c.776+33G>T',
            'MOTIF_POS': '',
            'HIGH_INF_POS': '',
            'ExAC_FIN_AF': '0.5536',
            'PICK': '',
            'GENE_PHENO': '',
            'ExAC_SAS_AF': '0.3747',
            'UNIPARC': '',
            'cDNA_position': '',
            'PUBMED': '',
            'EAS_AF': '0.253',
            'Feature_type': 'Transcript',
            'AF': '0.3391',
            'ExAC_Adj_AF': '0.4176',
            'ExAC_OTH_AF': '0.4538',
            'HGNC_ID': '',
            'SAS_AF': '0.3272',
            'SWISSPROT': '',
            'FLAGS': '',
            'Consequence': 'intron_variant',
            'Protein_position': '',
            'Gene': '5192',
            'STRAND': '-1',
            'EUR_AF': '0.4274',
            'DISTANCE': '',
            'PHENO': '',
            'SYMBOL_SOURCE': '',
            'Amino_acids': '',
            'ExAC_AMR_AF': '0.43',
            'TREMBL': '',
            'CLIN_SIG': '',
            'REFSEQ_MATCH': '',
            'HGVS_OFFSET': '',
            'BIOTYPE': 'protein_coding',
            'EXON': '',
            'SOMATIC': '',
            'CANONICAL': ''
        }

        for field_name, expected in expected_fields:
            self.assertEqual(first[field_name], expected)

        self.assertEqual(first['transcript_data']['NM_002617.3'],
                         expected_transcript)

        self.assertEqual(len(variants), 281)