Beispiel #1
0
def test_simple_info_string():
    """
    A single INFO key holding one value must be rendered as ``KEY=value``.
    """
    info = OrderedDict([('MQ', ['1'])])
    expected = "MQ=1"

    assert build_info_string(info) == expected
Beispiel #2
0
def test_simple_info_string():
    """
    build_info_string should join one key and its only value with ``=``.
    """
    single_entry = OrderedDict((('MQ', ['1']),))
    rendered = build_info_string(single_entry)

    assert rendered == "MQ=1"
Beispiel #3
0
def test_info_string_with_no_value():
    """
    A key whose value list is empty must be rendered as a bare flag,
    separated from the preceding ``KEY=value`` entry by ``;``.
    """
    info = OrderedDict([('MQ', ['1']), ('BOOL', [])])
    expected = "MQ=1;BOOL"

    assert build_info_string(info) == expected
Beispiel #4
0
def test_info_string_with_no_value():
    """
    build_info_string should emit only the key name for an entry that
    carries no values, after the ``;`` that ends the previous entry.
    """
    entries = OrderedDict((('MQ', ['1']), ('BOOL', [])))
    rendered = build_info_string(entries)

    assert rendered == "MQ=1;BOOL"
def main():
    """
    Convert a flat, tab-separated ClinVar file into a VCF file.

    The input path is taken from ``opts.clinvar`` and the output path from
    ``opts.V``, both as returned by ``parse_cli()``. The VCF header
    (including the INFO definitions registered below) is written first,
    followed by one record per input line. Lines that are blank or start
    with ``chrom`` (the column header) are skipped.

    Both files are handled by a ``with`` block so that they are flushed and
    closed even when a malformed line raises part-way through the run.

    Raises:
        IndexError: if a data line has fewer than 16 tab-separated columns.
    """
    opts = parse_cli()
    setup_logging(opts.loglevel)

    logging.debug("Using clinvar {}".format(opts.clinvar))
    logging.debug("Writing to vcf {}".format(opts.V))

    logging.debug("Opening files")
    my_vcf = VCFParser(fileformat='VCFv4.2')
    number_of_variants_written = 0

    header_elements = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]

    # (INFO id, column index) for every column that holds a ';'-separated
    # list. Column 4 of the input is not used for the VCF record.
    list_columns = (
        ('MEASURESET', 5),
        ('HGNC', 6),
        ('CLNSIGSTR', 7),
        ('REVSTAT', 8),
        ('HGVSC', 9),
        ('HGVSP', 10),
        ('ALLSUBM', 11),
        ('ALLTRAITS', 12),
        ('ALLPMID', 13),
    )

    with open(opts.V, 'w') as output, open(opts.clinvar, 'r') as clinvar_flat:
        logging.debug("Setting up INFO in header")
        my_vcf.metadata.add_info(info_id='MEASURESET', number='1', entry_type='String', description="Measure set")
        my_vcf.metadata.add_info(info_id='HGNC', number='1', entry_type='String', description="HGNC Symbol")
        my_vcf.metadata.add_info(info_id='CLNSIGSTR', number='.', entry_type='String', description="Clinical Significance")
        my_vcf.metadata.add_info(info_id='REVSTAT', number='1', entry_type='String', description="Review Status")
        my_vcf.metadata.add_info(info_id='HGVSC', number='.', entry_type='String', description="HGVS-c")
        my_vcf.metadata.add_info(info_id='HGVSP', number='.', entry_type='String', description="HGVS-p")
        my_vcf.metadata.add_info(info_id='ALLSUBM', number='.', entry_type='String', description="All submitters")
        my_vcf.metadata.add_info(info_id='ALLTRAITS', number='.', entry_type='String', description="All traits associated with this variant")
        my_vcf.metadata.add_info(info_id='ALLPMID', number='.', entry_type='String', description="All pubmed IDs")
        my_vcf.metadata.add_info(info_id='PATHOGENIC', number='0', entry_type='Flag', description="Set if the variant has ever been asserted Pathogenic or Likely pathogenic by any submitter for any phenotype, and unset otherwise")
        my_vcf.metadata.add_info(info_id='CONFLICTED', number='0', entry_type='Flag', description="Set if the variant has ever been asserted Pathogenic or Likely pathogenic by any submitter for any phenotype, and has also been asserted Benign or Likely benign by any submitter for any phenotype, and unser otherwise. Note that having one assertion of pathogenic and one of uncertain significance does not count as conflicted for this column.")

        for header_line in my_vcf.metadata.print_header():
            logging.debug("Writing header line %s", header_line)
            output.write(header_line + "\n")

        logging.debug("Parsing clinvar tsv")
        for line in clinvar_flat:
            stripped = line.strip()
            if line.startswith("chrom") or stripped == "":
                continue

            elements = stripped.split("\t")
            chrom = elements[0]
            pos = elements[1]
            ref = elements[2]
            alt = elements[3]
            pathogenic = elements[14]
            conflicted = elements[15]

            # Spaces are replaced by underscores before each field is
            # split on ';' into a list of values.
            info_dict = dict(
                (info_id, elements[column].replace(" ", "_").split(";"))
                for info_id, column in list_columns
            )

            # special treatment for flags
            # NOTE(review): the value 0 is presumably rendered by
            # build_info_string as a bare flag (key without '=value')
            # because it is falsy -- confirm against that helper.
            if pathogenic == "1":
                info_dict['PATHOGENIC'] = 0

            if conflicted == "1":
                info_dict['CONFLICTED'] = 0

            variant = dict(CHROM=chrom,
                           POS=pos,
                           ID='.',
                           REF=ref,
                           ALT=alt,
                           QUAL='.',
                           FILTER='.',
                           INFO=build_info_string(info_dict))

            variant_string = "\t".join([variant[key] for key in header_elements])
            logging.debug("Writing variant %s:%s %s/%s", chrom, pos, ref, alt)
            output.write(variant_string + "\n")
            number_of_variants_written += 1

    logging.info("Written {} variants to {}.".format(number_of_variants_written, opts.V))
def split_variants(variant_dict, header_parser, allele_symbol='0'):
    """
    Split a variant into one variant per alternative allele.

    The ``ALT`` field is split on ``,`` and a new variant dictionary is
    produced for every alternative. INFO entries, VEP annotations and
    genotype calls are reduced to the part that belongs to that alternative.

    Args:
        variant_dict: a dictionary with the variant information. The keys
            read here are ``CHROM``, ``POS``, ``ID``, ``REF``, ``ALT``,
            ``QUAL``, ``FILTER``, optionally ``FORMAT``, ``info_dict``,
            ``vep_info`` (only when a ``CSQ`` INFO entry exists),
            ``genotypes`` and one entry per individual in ``genotypes``.
        header_parser: object providing ``extra_info[<INFO id>]['Number']``
            and ``vep_columns``.
        allele_symbol: forwarded unchanged to ``split_genotype``.

    Yields:
        variant: a variant dictionary holding the split information for one
            alternative allele.
    """
    logger = getLogger(__name__)
    logger.info("Allele symbol {0}".format(allele_symbol))
    alt_alleles = variant_dict['ALT'].split(',')
    ref_allele = variant_dict['REF']

    # Produce one output variant for each alternative allele.
    for alt_index, alt_allele in enumerate(alt_alleles):
        variant = {}
        split_info = OrderedDict()
        # Dict of the form {ALT: [{vep_info_dict}]}
        vep_by_allele = {}
        genotypes = {}

        variant['CHROM'] = variant_dict['CHROM']
        variant['POS'] = variant_dict['POS']
        try:
            # There is not always one rsID per alternative ...
            variant['ID'] = variant_dict['ID'].split(';')[alt_index]
        except IndexError:
            # ... in which case every allele gets the complete ID field.
            variant['ID'] = variant_dict['ID']
        variant['REF'] = ref_allele
        variant['ALT'] = alt_allele
        variant['QUAL'] = variant_dict['QUAL']
        variant['FILTER'] = variant_dict['FILTER']

        if 'FORMAT' in variant_dict:
            gt_format = variant_dict['FORMAT']
            variant['FORMAT'] = gt_format

        source_info = variant_dict['info_dict']
        for key in source_info:
            # Empty or placeholder keys are carried over without values.
            if not key or key == '.':
                split_info[key] = []
                continue

            # The header's Number tells whether there is one entry per allele.
            number = header_parser.extra_info[key]['Number']

            if key == 'CSQ':
                # Pick the VEP annotation of this allele from the old variant.
                vep_by_allele[alt_allele] = variant_dict['vep_info'][alt_allele]
                if vep_by_allele[alt_allele]:
                    split_info['CSQ'] = [
                        build_vep_string(
                            vep_by_allele[alt_allele],
                            header_parser.vep_columns
                        )
                    ]
            elif number == 'A':
                # One value per alternative: keep only this allele's value.
                values = source_info[key]
                try:
                    split_info[key] = [values[alt_index]]
                except IndexError:
                    # Too few annotations: fall back to the first one.
                    split_info[key] = [values[0]]
            elif number == 'R':
                # Reference value first, then one value per alternative.
                values = source_info[key]
                ref_value = values[0]
                try:
                    allele_value = values[alt_index + 1]
                except IndexError:
                    # Annotation missing for this allele: keep the original.
                    split_info[key] = values
                else:
                    split_info[key] = [ref_value, allele_value]
            else:
                split_info[key] = source_info[key]

        variant['INFO'] = build_info_string(split_info)

        for individual in variant_dict['genotypes']:
            call = split_genotype(
                variant_dict[individual],
                variant['FORMAT'],
                alt_index,
                allele_symbol
            )
            variant[individual] = call
            # Pair each FORMAT key with the matching field of the new call.
            call_fields = dict(zip(gt_format.split(':'), call.split(':')))
            genotypes[individual] = Genotype(**call_fields)

        variant['info_dict'] = split_info
        variant['vep_info'] = vep_by_allele
        variant['genotypes'] = genotypes
        variant['variant_id'] = '_'.join(
            [variant['CHROM'], variant['POS'], variant['REF'], alt_allele]
        )
        yield variant
            variant["info_dict"].pop(key, None)

    info = variant['info_dict']
    if 'HOMLEN' in info and 'SVLEN' in info:
        if len(info['HOMLEN']) > 1 or len(info['SVLEN']) > 1:
            raise ImportError("Won't parse multiallelic variant. Split them to biallelic using vcf_parser first.")

    HOMLEN = int(info['HOMLEN'][0])
    absSVLEN = abs(int(info["SVLEN"][0]))

    #  if allelic fraction > 20% in any sample, keep the variant
    #  (independent of microhomology)
    for sample in variant['genotypes']:
        gt = variant['genotypes'][sample]
        allelic_fraction = gt.alt_depth/(float(gt.alt_depth+gt.ref_depth))
        if allelic_fraction >= 0.20:
            print_variant = True

    #  if the local microhomology is short, keep the variant
    #  (independent of alleleic fractions)
    if HOMLEN < absSVLEN + 2:
        print_variant = True

    if print_variant:
        logging.debug("KEEPING VARIANT: {0}".format(variant))
        variant["INFO"] = build_info_string(variant["info_dict"])
        vcf_writer.write("\t".join([variant[head] for head in vcf_reader.header]))
        vcf_writer.write("\n")
    else:
        logging.debug("DISCARDING VARIANT: {0}".format(variant))