Example #1
0
 def b37_hgvs_id(self):
     """Return the HGVS identifier string for this variant.

     The identifier is produced by ``myvariant.format_hgvs`` from the
     chromosome display value, position, reference allele and variant
     allele stored on the instance.
     """
     hgvs_args = (
         self.get_chromosome_display(),
         self.pos,
         self.ref_allele,
         self.var_allele,
     )
     return myvariant.format_hgvs(*hgvs_args)
Example #2
0
 def b37_hgvs_id(self):
     """Build this variant's HGVS id via ``myvariant.format_hgvs``.

     Uses ``get_chromosome_display()`` for the chromosome and the
     instance's ``pos``, ``ref_allele`` and ``var_allele`` attributes for
     the remaining arguments.
     """
     chromosome = self.get_chromosome_display()
     position = self.pos
     reference = self.ref_allele
     variant = self.var_allele
     return myvariant.format_hgvs(chromosome, position, reference, variant)
Example #3
0
def getvariant(chromosome, start, ref, var):
    """Query MyVariant.info for a single variant.

    An HGVS identifier is built with ``myvariant.format_hgvs`` from
    ``chromosome``, ``int(start)``, ``ref`` and ``var``; the identifier is
    then passed to ``MyVariantInfo.getvariant`` and that call's result is
    returned unchanged.
    """
    client = myvariant.MyVariantInfo()
    hgvs_id = myvariant.format_hgvs(chromosome, int(start), ref, var)
    return client.getvariant(hgvs_id)
Example #4
0
    def _parse_single_variant_record(cls, normed_headers_list,
                                     curr_line_fields_list, sample_names_list):
        """Turn one annotated line into ``(hgvs_id, annotations_dict)``.

        Layout assumption: every header except the last names exactly one
        column, while the last header ("other info") covers everything that
        follows. The format string and the per-sample genotype strings are
        expected to be the final ``len(sample_names_list) + 1`` fields of the
        line; anything between the last named column and the format string
        is ignored.

        :param normed_headers_list: normalized header names for the file
        :param curr_line_fields_list: the fields of the line being parsed
        :param sample_names_list: sample names, in genotype-column order
        :return: tuple of the HGVS id and the annotation dict built by
            ``AnnovarAnnotatedVariant.make_per_variant_annotation_dict``
        """
        # Pair each named header (all but the trailing catch-all one) with
        # the field found at the same position on this line.
        named_column_count = len(normed_headers_list) - 1
        fields_by_header = {
            header: field
            for header, field in zip(
                normed_headers_list[:named_column_count],
                curr_line_fields_list[:named_column_count])
        }

        # TODO: someday: perhaps stop limiting this to only the fields in
        # ANNOVAR_OUTPUT_COLS instead of all.
        # Keep only the supported columns that hold real content ("." marks
        # an empty value), cleaning each kept value on the way in.
        cleaned = {}
        for header in cls._ANNOVAR_OUTPUT_COLS:
            raw_value = fields_by_header[header]
            if raw_value == ".":
                continue
            cleaned[header] = cls._rewrite_value_if_special_header(
                header, raw_value)

        # The HGVS id is derived from chromosome, start, ref and alt.
        hgvs_id = myvariant.format_hgvs(
            cleaned[cls.CHR_HEADER],
            cleaned[cls.START_HEADER],
            cleaned[cls.REF_HEADER],
            cleaned[cls.ALT_HEADER])

        # Counting from the END of the line: the last `sample_count` fields
        # are the genotype strings, and the field just before them is the
        # format string.
        sample_count = len(sample_names_list)
        format_string = curr_line_fields_list[-(sample_count + 1)]
        genotype_strings = curr_line_fields_list[-sample_count:]
        genotypes_by_sample = dict(zip(sample_names_list, genotype_strings))

        # Fold the cleaned columns and the per-sample genotype info into one
        # annotation dictionary for this variant.
        annotations = AnnovarAnnotatedVariant.make_per_variant_annotation_dict(
            cleaned, hgvs_id, format_string, genotypes_by_sample)

        return hgvs_id, annotations
Example #5
0
def get_mv_data(chrom, pos, ref_allele, var_allele):
    """Fetch myvariant.info data and allele frequency for one variant.

    The HGVS id is built from ``get_chrom_display(chrom)``, ``pos``,
    ``ref_allele`` and ``var_allele``; the 'clinvar', 'dbsnp' and 'exac'
    fields are then requested for it.

    Returns a 4-tuple ``(hgvs_format, mv_data, allele_freq, freq_url)``.
    When the lookup returns nothing truthy, ``allele_freq`` and ``freq_url``
    are both ``None``; otherwise they come from ``get_allele_freq``.
    """
    hgvs_format = myvariant.format_hgvs(get_chrom_display(chrom), pos,
                                        ref_allele, var_allele)
    mv = myvariant.MyVariantInfo()
    mv_data = mv.getvariant(hgvs_format, fields=['clinvar', 'dbsnp', 'exac'])

    # Nothing found: there is no data to normalize or derive frequency from.
    if not mv_data:
        return hgvs_format, mv_data, None, None

    # Normalize clinvar.rcv so it is always a list, even for a single record.
    if 'clinvar' in mv_data and 'rcv' in mv_data['clinvar']:
        rcv = mv_data['clinvar']['rcv']
        if not isinstance(rcv, list):
            mv_data['clinvar']['rcv'] = [rcv]

    allele_freq, freq_url = get_allele_freq(mv_data, var_allele)
    return hgvs_format, mv_data, allele_freq, freq_url
Example #6
0
 def test_format_hgvs(self):
     """Check ``myvariant.format_hgvs`` output for a set of known variants.

     Covers a substitution, insertions, deletions and a delins.
     """
     # ((chromosome, position, ref, alt), expected HGVS id)
     cases = [
         (("1", 35366, "C", "T"), 'chr1:g.35366C>T'),
         (("chr2", 17142, "G", "GA"), 'chr2:g.17142_17143insA'),
         (("1", 10019, "TA", "T"), 'chr1:g.10020del'),
         (("MT", 8270, "CACCCCCTCT", "C"), 'chrMT:g.8271_8279del'),
         (("7", 15903, "G", "GC"), 'chr7:g.15903_15904insC'),
         (("X", 107930849, "GGA", "C"),
          'chrX:g.107930849_107930851delinsC'),
         (("20", 1234567, "GTC", "GTCT"), 'chr20:g.1234569_1234570insT'),
     ]
     for hgvs_args, expected in cases:
         self.assertEqual(myvariant.format_hgvs(*hgvs_args), expected)
Example #7
0
def _get_hgvs_ids_from_vcf(vcf_file_obj, chunk_index, chunk_size):
    """Return the HGVS ids for one chunk of records in a VCF file.

    Records ``chunk_index * chunk_size`` up to (but excluding)
    ``(chunk_index + 1) * chunk_size`` are read from ``vcf_file_obj``. Only
    the first ALT allele of each record is used to build its id.
    """
    first_record = chunk_index * chunk_size
    stop_record = (chunk_index + 1) * chunk_size
    chunk = itertools.islice(
        vcf.Reader(vcf_file_obj), first_record, stop_record)

    collected = []
    for rec in chunk:
        identifier = myvariant.format_hgvs(
            rec.CHROM, rec.POS, rec.REF, str(rec.ALT[0]))

        # Keep chromosome M ids in one consistent spelling: when the raw
        # suffix is present but the chromosome part lacks the standardized
        # suffix, swap the chromosome part for the standardized value.
        if AnnovarTxtParser.RAW_CHR_MT_SUFFIX_VAL in identifier:
            pieces = identifier.split(':')
            chrom_part = pieces[0]
            change_part = pieces[1]
            if AnnovarTxtParser.STANDARDIZED_CHR_MT_SUFFIX_VAL not in chrom_part:
                chrom_part = AnnovarTxtParser.STANDARDIZED_CHR_MT_VAL
                identifier = "".join([chrom_part, ':', change_part])

        collected.append(identifier)

    return collected
def match_genome(inputfile, outputfile, inputfilename):
    """
    Produce a CSV genome report at outputfile for a given VCF inputfile.

    The genome is matched against the latest 'b37' ClinVar VCF found under
    FILESDIR. Matches that fail the VCF filters, or whose ClinVar records
    carry neither significance code '4' nor '5', are discarded. The
    remaining variants are keyed by HGVS id, enriched with the 'clinvar'
    and 'exac' fields from myvariant.info, and written one row per variant.

    Each row holds: inputfilename, HGVS id, preferred name, disease name,
    ClinVar URL, ExAC URL, allele frequency, frequency source and
    GET-Evidence URL.

    Raises IOError if the ClinVar file name does not end with '.vcf',
    '.vcf.gz' or '.vcf.bz2'.
    """
    data = dict()

    # Set up ClinVar data.
    clinvar_filepath = clinvar_update.get_latest_vcf_file(FILESDIR, 'b37')
    if clinvar_filepath.endswith('.vcf'):
        input_clinvar_file = open(clinvar_filepath)
    elif clinvar_filepath.endswith('.vcf.gz'):
        input_clinvar_file = gzip.open(clinvar_filepath)
    elif clinvar_filepath.endswith('.vcf.bz2'):
        input_clinvar_file = bz2.BZ2File(clinvar_filepath)
    else:
        raise IOError("ClinVar filename expected to end with '.vcf'," +
                      " '.vcf.gz', or '.vcf.bz2'.")

    # The ClinVar handle must stay open while the matches are consumed (the
    # matcher may read lazily), and is always closed afterwards.
    try:
        # Run vcf2clinvar on genome data.
        clinvar_matches = vcf2clinvar.match_to_clinvar(
            inputfile, input_clinvar_file)

        # Iterate through all ClinVar matches.
        for genome_vcf_line, allele, zygosity in clinvar_matches:
            # Discard low quality data.
            if (genome_vcf_line.filters and
                    'PASS' not in genome_vcf_line.filters):
                continue
            # Check significance. Only keep this as a notable variant if one
            # of the submissions has reported a "pathogenic" or "likely
            # pathogenic" effect.
            sigs = [rec.sig for rec in allele.records]
            if not ('4' in sigs or '5' in sigs):
                continue
            # Store data in a dict according to HGVS position.
            poskey = myvariant.format_hgvs(
                genome_vcf_line.chrom,
                genome_vcf_line.start,
                genome_vcf_line.ref_allele,
                allele.sequence)
            data[poskey] = {'genome_vcf_line': genome_vcf_line,
                            'clinvar_allele': allele,
                            'zygosity': zygosity}
    finally:
        input_clinvar_file.close()

    # Set up to get myvariant.info data (mainly for ExAC data.)
    mv = myvariant.MyVariantInfo()

    # Add data from myvariant.info using the HGVS positions. A real list is
    # needed here: dict views cannot be indexed on Python 3.
    variants = list(data)
    mv_output = mv.getvariants(variants, fields=['clinvar', 'exac'])
    # Results are paired with the queried ids by position.
    for variant, mv_hit in zip(variants, mv_output):
        if 'clinvar' in mv_hit:
            data[variant]['mv_clinvar'] = mv_hit['clinvar']
        if 'exac' in mv_hit:
            data[variant]['mv_exac'] = mv_hit['exac']

    # Write report as CSV.
    with open(outputfile, 'w') as f:
        csv_out = csv.writer(f)
        for var in variants:
            entry = data[var]
            # Clinvar URL for variant (accession-based fallback).
            cv_url = 'http://www.ncbi.nlm.nih.gov/clinvar/{}/'.format(
                entry['clinvar_allele'].records[0].acc)
            disease_name = ''
            preferred_name = ''
            getev_url = ''
            # Disease name, preferred name, and GET-Evidence URL if we have
            # myvariant.info information with ClinVar data.
            if 'mv_clinvar' in entry:
                mv_clinvar = entry['mv_clinvar']
                cv_url = 'http://www.ncbi.nlm.nih.gov/clinvar/variation/{}/'.format(
                    mv_clinvar['variant_id'])
                try:
                    # Single-record case: 'rcv' is one mapping.
                    disease_name = mv_clinvar['rcv']['conditions']['name']
                    preferred_name = mv_clinvar['rcv']['preferred_name']
                except TypeError:
                    # Multi-record case: 'rcv' is a list of mappings.
                    disease_name = ', '.join(
                        set([rcv['conditions']['name']
                             for rcv in mv_clinvar['rcv']]))
                    preferred_name = mv_clinvar['rcv'][0]['preferred_name']
                getev_url = guess_getevidence_url(preferred_name)
            # chrom[3:] drops the first three characters
            # (presumably a 'chr' prefix -- confirm against vcf2clinvar).
            exac_url = 'http://exac.broadinstitute.org/variant/{}-{}-{}-{}'.format(
                entry['genome_vcf_line'].chrom[3:],
                entry['genome_vcf_line'].start,
                entry['genome_vcf_line'].ref_allele,
                entry['clinvar_allele'].sequence)
            # Allele frequency using ExAC data, if myvariant.info had that.
            if 'mv_exac' in entry:
                total_freq = (entry['mv_exac']['ac']['ac'] * 1.0 /
                              entry['mv_exac']['an']['an'])
                total_freq = str(total_freq)
                freq_source = 'ExAC'
            else:
                # If not, try to get it from our ClinVar data.
                # NOTE(review): only KeyError is handled; confirm that is
                # what `frequency` raises when the value is unavailable.
                try:
                    total_freq = str(entry['clinvar_allele'].frequency)
                    freq_source = 'ClinVar'
                except KeyError:
                    # If that fails, give up on frequency.
                    total_freq = ''
                    freq_source = 'Unknown'
            data_row = [
                inputfilename, var, preferred_name, disease_name, cv_url,
                exac_url, total_freq, freq_source, getev_url]
            csv_out.writerow(data_row)
    return
Example #9
0
def _hgvs_id(chrom, pos, ref, var):
    """Return the ``myvariant.format_hgvs`` id passed through ``parse.unquote``."""
    formatted = myvariant.format_hgvs(chrom, pos, ref, var)
    return parse.unquote(formatted)
Example #10
0
# Fragment of a larger report script: `p`, `currentDT`, `mv` and
# `somatic_variants` are defined outside the lines shown here.

# Append a bold "Time Processed" label followed by the time formatted as
# 12-hour HH:MM:SS AM/PM.
p.add_run('Time Processed ' + '\t' + '\t').bold = True
p.add_run(str(currentDT.strftime("%I:%M:%S %p")))

# Running counters: rows processed and rows that carry CIViC data
processed = 0
clinical_count = 0

# Iterate through variant list and pull information
for i, row in somatic_variants.iterrows():
    processed += 1

    # Build the HGVS id for this row and look it up
    chrom = row['Chromosome']
    start = int(row['Start'])
    ref = row['Ref']
    var = row['Var']
    variant = myvariant.format_hgvs(chrom, start, ref, var)
    directory = mv.getvariant(variant)

    # Pull CIViC Data (only when the lookup returned something with a
    # 'civic' entry)
    if directory:
        if 'civic' in directory:
            variant_descriptions = []
            assertions = []
            # Add count to clinical
            clinical_count += 1

            # Pull general information for variant
            gene = directory['civic']['entrez_name']
            ENST = directory['civic']['coordinates'][
                'representative_transcript']
            protein_change = directory['civic']['name']
Example #11
0
def match_genome(inputfile, outputfile, inputfilename):
    """
    Produce a CSV genome report at outputfile for a given VCF inputfile.

    Steps:
      1. Open the latest 'b37' ClinVar VCF found under FILESDIR (plain,
         gzip or bz2) and match the genome against it with vcf2clinvar.
      2. Drop matches that fail the VCF filters or whose ClinVar records
         have neither significance code '4' nor '5'.
      3. Key the remaining variants by HGVS id and attach the 'clinvar' and
         'exac' fields returned by myvariant.info.
      4. Write one CSV row per variant: inputfilename, HGVS id, preferred
         name, disease name, ClinVar URL, ExAC URL, allele frequency,
         frequency source, GET-Evidence URL.

    Raises IOError if the ClinVar file name does not end with '.vcf',
    '.vcf.gz' or '.vcf.bz2'.
    """
    data = dict()

    # Set up ClinVar data.
    clinvar_filepath = clinvar_update.get_latest_vcf_file(FILESDIR, 'b37')
    if clinvar_filepath.endswith('.vcf'):
        input_clinvar_file = open(clinvar_filepath)
    elif clinvar_filepath.endswith('.vcf.gz'):
        input_clinvar_file = gzip.open(clinvar_filepath)
    elif clinvar_filepath.endswith('.vcf.bz2'):
        input_clinvar_file = bz2.BZ2File(clinvar_filepath)
    else:
        raise IOError("ClinVar filename expected to end with '.vcf'," +
                      " '.vcf.gz', or '.vcf.bz2'.")

    # Keep the ClinVar handle open until every match has been consumed (the
    # matcher may read it lazily), then close it even if matching fails.
    try:
        # Run vcf2clinvar on genome data.
        clinvar_matches = vcf2clinvar.match_to_clinvar(inputfile,
                                                       input_clinvar_file)

        # Iterate through all ClinVar matches.
        for genome_vcf_line, allele, zygosity in clinvar_matches:
            # Discard low quality data.
            if (genome_vcf_line.filters and
                    'PASS' not in genome_vcf_line.filters):
                continue
            # Check significance. Only keep this as a notable variant if one
            # of the submissions has reported a "pathogenic" or "likely
            # pathogenic" effect.
            sigs = [rec.sig for rec in allele.records]
            if not ('4' in sigs or '5' in sigs):
                continue
            # Store data in a dict according to HGVS position.
            poskey = myvariant.format_hgvs(genome_vcf_line.chrom,
                                           genome_vcf_line.start,
                                           genome_vcf_line.ref_allele,
                                           allele.sequence)
            data[poskey] = {
                'genome_vcf_line': genome_vcf_line,
                'clinvar_allele': allele,
                'zygosity': zygosity
            }
    finally:
        input_clinvar_file.close()

    # Set up to get myvariant.info data (mainly for ExAC data.)
    mv = myvariant.MyVariantInfo()

    # Add data from myvariant.info using the HGVS positions. The keys are
    # copied into a list because dict views are not indexable on Python 3.
    variants = list(data)
    mv_output = mv.getvariants(variants, fields=['clinvar', 'exac'])
    # Results are paired with the queried ids by position.
    for hgvs_key, hit in zip(variants, mv_output):
        if 'clinvar' in hit:
            data[hgvs_key]['mv_clinvar'] = hit['clinvar']
        if 'exac' in hit:
            data[hgvs_key]['mv_exac'] = hit['exac']

    # Write report as CSV.
    with open(outputfile, 'w') as f:
        csv_out = csv.writer(f)
        for var in variants:
            record = data[var]
            vcf_line = record['genome_vcf_line']
            clinvar_allele = record['clinvar_allele']

            # Clinvar URL for variant (accession-based fallback).
            cv_url = 'http://www.ncbi.nlm.nih.gov/clinvar/{}/'.format(
                clinvar_allele.records[0].acc)
            disease_name = ''
            preferred_name = ''
            getev_url = ''
            # Disease name, preferred name, and GET-Evidence URL if we have
            # myvariant.info information with ClinVar data.
            if 'mv_clinvar' in record:
                mv_clinvar = record['mv_clinvar']
                cv_url = 'http://www.ncbi.nlm.nih.gov/clinvar/variation/{}/'.format(
                    mv_clinvar['variant_id'])
                try:
                    # Single-record case: 'rcv' is one mapping.
                    disease_name = mv_clinvar['rcv']['conditions']['name']
                    preferred_name = mv_clinvar['rcv']['preferred_name']
                except TypeError:
                    # Multi-record case: 'rcv' is a list of mappings.
                    disease_name = ', '.join(
                        set([
                            rcv['conditions']['name']
                            for rcv in mv_clinvar['rcv']
                        ]))
                    preferred_name = mv_clinvar['rcv'][0]['preferred_name']
                getev_url = guess_getevidence_url(preferred_name)
            # chrom[3:] drops the first three characters
            # (presumably a 'chr' prefix -- confirm against vcf2clinvar).
            exac_url = 'http://exac.broadinstitute.org/variant/{}-{}-{}-{}'.format(
                vcf_line.chrom[3:],
                vcf_line.start,
                vcf_line.ref_allele,
                clinvar_allele.sequence)
            # Allele frequency using ExAC data, if myvariant.info had that.
            if 'mv_exac' in record:
                total_freq = (record['mv_exac']['ac']['ac'] * 1.0 /
                              record['mv_exac']['an']['an'])
                total_freq = str(total_freq)
                freq_source = 'ExAC'
            else:
                # If not, try to get it from our ClinVar data.
                # NOTE(review): only KeyError is handled; confirm that is
                # what `frequency` raises when the value is unavailable.
                try:
                    total_freq = str(clinvar_allele.frequency)
                    freq_source = 'ClinVar'
                except KeyError:
                    # If that fails, give up on frequency.
                    total_freq = ''
                    freq_source = 'Unknown'
            data_row = [
                inputfilename, var, preferred_name, disease_name, cv_url,
                exac_url, total_freq, freq_source, getev_url
            ]
            csv_out.writerow(data_row)
    return
Example #12
0
def annotate_mutations(file, assembly='hg19'):
    """Build a PDF annotation report for the variants listed in ``file``.

    ``file`` is read as a tab-separated table with 'Chromosome', 'Start',
    'Ref' and 'Var' columns. Each row is turned into an HGVS id and looked
    up on MyVariant.info for the given ``assembly``; rows with a truthy
    lookup result are added to the report. The report is written next to
    the input as ``<input path without extension>_AIV_Report.pdf``.

    :param file: path of the tab-separated variant file
    :param assembly: genome assembly passed to ``MyVariantInfo.getvariant``
    """
    # Local import so this block does not rely on a module-level `import os`.
    import os

    # Open variant file with pandas
    identified_variants = pd.read_csv(file, sep='\t')

    # Name the output '<input without extension>_AIV_Report.pdf'. Only the
    # final extension is stripped, so dots elsewhere in the path (for
    # example a leading './') no longer truncate the name.
    doc_name = os.path.splitext(file)[0] + '_AIV_Report.pdf'

    # Create a sample document and sample style sheet
    report = SimpleDocTemplate(doc_name)
    style_ = getSampleStyleSheet()

    # Add a paragraph style to justify text
    style_.add(ParagraphStyle('Justified', alignment=TA_JUSTIFY))

    # Everything appended here is written into the report, in order
    content = []

    # Put main title of the annotation report
    content.append(
        Paragraph("Annotation of Identified Variants", style_['Heading1']))

    # Add given input file name
    content.append(
        Paragraph('File Name: ' + '\t' + '\t' + '\t' + str(file) + '\n',
                  style_['BodyText']))

    # Get a myvariant info instance
    mv = myvariant.MyVariantInfo()

    # Initialize counters for variants
    total_variants = 0
    annotated_variants = 0

    # Loop through identified variants and get annotations
    for _, row in identified_variants.iterrows():
        # Count every variant given in the input file
        total_variants += 1

        # Get chromosome, start, reference and variant columns
        chrom_ = row['Chromosome']
        start_ = row['Start']
        ref_ = row['Ref']
        var_ = row['Var']

        # Get variant information
        v = myvariant.format_hgvs(chrom_, int(start_), ref_, var_)
        dir_ = mv.getvariant(v, assembly=assembly)

        # Nothing found for this variant: move on
        if not dir_:
            continue

        # Get data from 'civic'
        # NOTE(review): the variant is counted as annotated whenever the
        # lookup returned data, without checking for a 'civic' entry here --
        # confirm that _pull_data copes with results that have none.
        (variant_annotations, gene_, protein_change_, info,
         evidence_items) = _pull_data(dir_, 'civic')

        # Increase the number of clinically annotated variants by 1 (one).
        annotated_variants += 1

        # Add content to the report: general info, annotations & evidence
        # statements
        _add_variant_info(variant_annotations, annotated_variants, gene_,
                          protein_change_, info, v, content, style_,
                          evidence_items, assembly)

    # Add processing information: total processed variants and number of
    # annotated variants
    _add_additional_info(total_variants, annotated_variants, content, style_)

    # Save the report
    report.build(content)
Example #13
0
File: AIV.py Project: nesegunes/aiv
def annotate_mutations(file):
    """Write 'aim_report.pdf' describing the CIViC-annotated variants.

    ``file`` is read as a tab-separated table with 'Chromosome', 'Start',
    'Ref' and 'Var' columns. Each row is converted to an HGVS id and looked
    up on MyVariant.info; rows whose result has a 'civic' entry get a
    section with gene name, protein change, coordinates and description.
    A closing section reports how many rows were processed and annotated.
    """
    # Load the variant table
    identified_variants = pd.read_csv(file, sep='\t')

    report = SimpleDocTemplate('aim_report.pdf')
    style_ = getSampleStyleSheet()
    content = []

    def add(text, style_name):
        # Append one paragraph in the named style to the report content.
        content.append(Paragraph(text, style_[style_name]))

    add("Annotation of Identified Variants", 'Heading1')
    add('Variant Call File Name: ' + '\t' + '\t' + '\t' + str(file) + '\n',
        'BodyText')

    # Client used for every lookup below
    mv = myvariant.MyVariantInfo()

    # Counters: rows seen, and rows that carried CIViC data
    total_variants = 0
    annotated_variants = 0

    for _, row in identified_variants.iterrows():
        total_variants += 1

        # Build the HGVS id from the row's coordinates and look it up
        v = myvariant.format_hgvs(row['Chromosome'], int(row['Start']),
                                  row['Ref'], row['Var'])
        dir_ = mv.getvariant(v)

        # Only results that carry a 'civic' entry are reported
        if not (dir_ and 'civic' in dir_):
            continue

        annotated_variants += 1
        civic = dir_['civic']

        # General information about the variant
        gene_ = civic['entrez_name']
        protein_change_ = civic['name']

        # The description, when present, is the only annotation collected
        variant_annotations = []
        if 'description' in civic:
            variant_annotations.append(civic['description'])

        # Section header and general info for this variant
        add('Clinical Variant: ' + str(annotated_variants), 'Heading2')
        add('Gene Name: ' + '\t' + '\t' + '\t' + str(gene_) + '\n',
            'BodyText')
        add('Protein Change: ' + '\t' + '\t' + str(protein_change_) + '\n',
            'BodyText')
        add('Coordinates: ' + '\t' + '\t' + '\t' + str(v) + '\n', 'BodyText')
        add('Variant Annotation: ', 'Heading3')

        # Annotation text, or a placeholder when there is none
        if variant_annotations:
            for annot in variant_annotations:
                add(str(annot), 'BodyText')
        else:
            add('Not found...' + '\n', 'BodyText')

    # Closing summary section
    add('Additional information', 'Heading3')
    add('Total Number of Variants Processed: ' + str(total_variants) + '\n',
        'BodyText')
    add('The Number of Clinical Annotations: ' + str(annotated_variants) +
        '\n', 'BodyText')

    # Save report
    report.build(content)