Esempio n. 1
0
def get_header(vcf_file_path):
    """Parse the header of a vcf file and return a header object

        Only the leading lines that start with '#' are parsed; reading stops
        at the first line that does not.

        Args:
            vcf_file_path(str): Path to vcf

        Returns:
            head: A HeaderParser object
    """
    logger.info("Parsing header of file {0}".format(vcf_file_path))
    head = HeaderParser()
    handle = get_vcf_handle(infile=vcf_file_path)
    try:
        # Parse the header
        for line in handle:
            line = line.rstrip()
            if not line.startswith('#'):
                # First non header line, the header is complete
                break
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
    finally:
        # Close the handle even if one of the header lines fails to parse
        handle.close()

    return head
Esempio n. 2
0
def cli(ctx, variant_file, family_file, family_type, gq_treshold, to_json,
    outfile, verbose):
    """Check for pedigree inconsistensies."""
    # The docstring above is left untouched on purpose.
    # NOTE(review): this looks like a command line entry point, in which case
    # the docstring doubles as the help text -- confirm before rewording it.
    #
    # What the function does:
    #   1. set up logging and the family parser
    #   2. read the vcf header from variant_file
    #   3. check the ped individuals against the vcf individuals
    #   4. store the settings, families and remaining variant lines on ctx

    # configure root logger to print to STDERR
    # NOTE(review): LEVELS.get() has no default here, so loglevel is None
    # when min(verbose, 3) is not a key of LEVELS.
    loglevel = LEVELS.get(min(verbose, 3))
    configure_stream(level=loglevel)

    if not family_file:
        logger.error("Please provide a family file with -f/--family_file")
        logger.info("Exiting")
        sys.exit(1)

    logger.info("Setting up a family parser")
    family_parser = FamilyParser(family_file, family_type)
    logger.debug("Family parser done")
    # The individuals in the ped file must be present in the variant file:
    families = family_parser.families
    logger.info("Families used in analysis: {0}".format(
                    ','.join(list(families.keys()))))

    ctx.gq_treshold = gq_treshold
    ctx.to_json = to_json
    ctx.outfile = outfile

    ctx.families = families
    ctx.individuals = family_parser.individuals

    head = HeaderParser()

    # Consume the header. The first non header line has already been pulled
    # out of the iterator when we notice it, so remember it explicitly.
    # Relying on the loop variable afterwards fails for an empty input
    # (unbound name) and would put the '#CHROM' line back for a file that
    # holds a header only.
    first_variant = None
    for line in variant_file:
        line = line.rstrip()
        if not line.startswith('#'):
            first_variant = line
            break
        if line.startswith('##'):
            head.parse_meta_data(line)
        else:
            head.parse_header_line(line)

    # Add the first variant to the iterator
    if first_variant:
        variant_file = itertools.chain([first_variant], variant_file)

    try:
        check_individuals(family_parser.individuals, head.individuals)
    except IOError as e:
        logger.error(e)
        logger.info("Individuals in PED file: {0}".format(
                        ', '.join(family_parser.individuals)))
        # The vcf individuals are the ones found by the header parser
        logger.info("Individuals in VCF file: {0}".format(
                        ', '.join(head.individuals)))
        logger.info("Exiting...")
        ctx.abort()

    ctx.variant_file = variant_file
    ctx.header_line = head.header
def get_header(vcf_lines):
    """Build a header object from an iterable of vcf lines

        Every line is inspected; lines that do not start with '#' are
        ignored.

        Args:
            vcf_lines(Iterable): Lines of a vcf

        Returns:
            head: A HeaderParser object
    """
    head = HeaderParser()

    for vcf_line in vcf_lines:
        if not vcf_line.startswith('#'):
            # Not part of the header
            continue
        # '##' marks meta data, a single '#' marks the column header line
        if vcf_line.startswith('##'):
            head.parse_meta_data(vcf_line)
        else:
            head.parse_header_line(vcf_line)
    return head
def test_vep_columns():
    """
    Check that the vep column names are picked up from the CSQ info line
    """
    # The expected columns are the '|' separated names after 'Format: '
    expected_columns = ['Allele', 'Gene', 'Feature', 'Feature_type', 'Consequence']
    csq_line = (
        '##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence'
        ' type as predicted by VEP. Format: Allele|Gene|Feature|Feature_type|Consequence">'
    )

    parser = HeaderParser()
    parser.parse_meta_data(csq_line)

    assert parser.vep_columns == expected_columns
Esempio n. 5
0
def test_vep_columns():
    """
    Check that the vep column names are picked up from the CSQ info line
    """
    csq_line = ''.join([
        '##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence',
        ' type as predicted by VEP. Format: Allele|Gene|Feature|Feature_type|Consequence">',
    ])

    parser = HeaderParser()
    parser.parse_meta_data(csq_line)

    # The columns are the '|' separated names after 'Format: '
    found_columns = parser.vep_columns
    assert found_columns == [
        'Allele',
        'Gene',
        'Feature',
        'Feature_type',
        'Consequence',
    ]
def test_parse_vcf_lines():
    """
    Test how the header parser behaves with simple vcf lines

    Meta data lines ('##') and the column header line ('#') are fed to the
    parser one by one, then the parsed attributes are checked.
    """

    header_parser = HeaderParser()

    # Adjacent string literals are joined by python, so every header line
    # must end with a comma. The comma after the GQ line was missing, which
    # silently glued the '##reference' line onto the GQ FORMAT line.
    header_lines = [
        '##fileformat=VCFv4.2',
        '##FILTER=<ID=LowQual,Description="Low quality">',
        '##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">',
        '##INFO=<ID=CNT,Number=A,Type=Integer,Description="Number of times '
        'this allele was found in external db">',
        '##contig=<ID=1,length=249250621,assembly=b37>',
        '##INFO=<ID=DP_HIST,Number=R,Type=String,Description="Histogram for '
        'DP; Mids: 2.5|7.5|12.5|17.5|22.5|27.5|32.5|37.5|42.5|47.5|52.5|57.5|'
        '62.5|67.5|72.5|77.5|82.5|87.5|92.5|97.5">',
        '##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for'
        ' the ref and alt alleles in the order listed">',
        '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        '##FORMAT=<ID=GQ,Number=1,Type=String,Description="GenotypeQuality">',
        '##reference=file:///human_g1k_v37.fasta',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tfather\tmother\tproband',
    ]
    for line in header_lines:
        if line.startswith('##'):
            header_parser.parse_meta_data(line)
        elif line.startswith('#'):
            header_parser.parse_header_line(line)

    assert header_parser.fileformat == "VCFv4.2"
    assert header_parser.individuals == ['father','mother','proband']

    # No CSQ info line was given
    assert header_parser.vep_columns == []

    assert "MQ" in header_parser.extra_info
    assert header_parser.extra_info["MQ"]['Description'] == "RMS Mapping Quality"
    assert header_parser.extra_info["CNT"]['Number'] == "A"
    assert header_parser.extra_info["CNT"]['Type'] == "Integer"
    assert "CNT" in header_parser.extra_info
    assert "DP_HIST" in header_parser.extra_info

    assert "LowQual" in header_parser.filter_dict
    assert "1" in header_parser.contig_dict

    assert header_parser.header == [
        'CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT',
        'father','mother','proband'
    ]
Esempio n. 7
0
def get_header(vcf_file_path):
    """Parse the header of a vcf file and return a header object

        Only the leading lines that start with '#' are parsed; reading stops
        at the first line that does not.

        Args:
            vcf_file_path(str): Path to vcf

        Returns:
            head: A HeaderParser object
    """
    logger.info("Parsing header of file {0}".format(vcf_file_path))
    head = HeaderParser()
    handle = get_vcf_handle(infile=vcf_file_path)
    try:
        # Parse the header
        for line in handle:
            line = line.rstrip()
            if not line.startswith('#'):
                # First non header line, the header is complete
                break
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
    finally:
        # Close the handle even if one of the header lines fails to parse
        handle.close()

    return head
Esempio n. 8
0
def get_header(vcf_lines):
    """Build a header object from an iterable of vcf lines

        Every line is inspected; lines that do not start with '#' are
        ignored.

        Args:
            vcf_lines(Iterable): Lines of a vcf

        Returns:
            head: A HeaderParser object
    """
    head = HeaderParser()

    for vcf_line in vcf_lines:
        if not vcf_line.startswith('#'):
            # Not part of the header
            continue
        # '##' marks meta data, a single '#' marks the column header line
        if vcf_line.startswith('##'):
            head.parse_meta_data(vcf_line)
        else:
            head.parse_header_line(vcf_line)
    return head
Esempio n. 9
0
def test_malformed_lines():
    """
    Check that malformed meta data lines make the header parser raise
    SyntaxError
    """

    header_parser = HeaderParser()

    malformed_lines = (
        # fileformat key without '=value'
        '##fileformat',
        # INFO line without a Type entry
        '##INFO=<ID=MQ,Number=1,Description="RMS Mapping Quality">',
        # contig line without an ID entry
        '##contig=<assembly=b37>',
    )

    # The same parser instance is reused, in the order listed above
    for malformed_line in malformed_lines:
        with pytest.raises(SyntaxError):
            header_parser.parse_meta_data(malformed_line)
Esempio n. 10
0
def test_parse_vcf_lines():
    """
    Test how the header parser behaves with simple vcf lines

    Meta data lines ('##') and the column header line ('#') are fed to the
    parser one by one, then the parsed attributes are checked.
    """

    header_parser = HeaderParser()

    # Adjacent string literals are joined by python, so every header line
    # must end with a comma. The comma after the GQ line was missing, which
    # silently glued the '##reference' line onto the GQ FORMAT line.
    header_lines = [
        '##fileformat=VCFv4.2',
        '##FILTER=<ID=LowQual,Description="Low quality">',
        '##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">',
        '##INFO=<ID=CNT,Number=A,Type=Integer,Description="Number of times '
        'this allele was found in external db">',
        '##contig=<ID=1,length=249250621,assembly=b37>',
        '##INFO=<ID=DP_HIST,Number=R,Type=String,Description="Histogram for '
        'DP; Mids: 2.5|7.5|12.5|17.5|22.5|27.5|32.5|37.5|42.5|47.5|52.5|57.5|'
        '62.5|67.5|72.5|77.5|82.5|87.5|92.5|97.5">',
        '##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for'
        ' the ref and alt alleles in the order listed">',
        '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        '##FORMAT=<ID=GQ,Number=1,Type=String,Description="GenotypeQuality">',
        '##reference=file:///human_g1k_v37.fasta',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tfather\tmother\tproband',
    ]
    for line in header_lines:
        if line.startswith('##'):
            header_parser.parse_meta_data(line)
        elif line.startswith('#'):
            header_parser.parse_header_line(line)

    assert header_parser.fileformat == "VCFv4.2"
    assert header_parser.individuals == ['father', 'mother', 'proband']

    # No CSQ info line was given
    assert header_parser.vep_columns == []

    assert "MQ" in header_parser.extra_info
    assert header_parser.extra_info["MQ"][
        'Description'] == "RMS Mapping Quality"
    assert header_parser.extra_info["CNT"]['Number'] == "A"
    assert header_parser.extra_info["CNT"]['Type'] == "Integer"
    assert "CNT" in header_parser.extra_info
    assert "DP_HIST" in header_parser.extra_info

    assert "LowQual" in header_parser.filter_dict
    assert "1" in header_parser.contig_dict

    assert header_parser.header == [
        'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT',
        'father', 'mother', 'proband'
    ]
def test_malformed_lines():
    """
    Check that malformed meta data lines make the header parser raise
    SyntaxError
    """

    header_parser = HeaderParser()

    malformed_lines = (
        # fileformat key without '=value'
        '##fileformat',
        # INFO line without a Type entry
        '##INFO=<ID=MQ,Number=1,Description="RMS Mapping Quality">',
        # contig line without an ID entry
        '##contig=<assembly=b37>',
    )

    # The same parser instance is reused, in the order listed above
    for malformed_line in malformed_lines:
        with pytest.raises(SyntaxError):
            header_parser.parse_meta_data(malformed_line)
Esempio n. 12
0
    def _formated_variants(self, raw_variants, case_obj):
        """Yield variant objects built from raw variant lines

            The vcf header is read from the file at case_obj.variant_source.
            Lines in raw_variants that start with '#' are skipped, every
            other line results in one variant.

            Args:
                raw_variants (Iterable): An iterable with variant lines
                case_obj (Case): A case object, its variant_source and
                                 individuals attributes are used

            Yields:
                variant (Variant): A variant object
        """
        vcf_file_path = case_obj.variant_source

        logger.info("Parsing file {0}".format(vcf_file_path))
        head = HeaderParser()
        handle = get_vcf_handle(infile=vcf_file_path)
        try:
            # Parse the header
            for line in handle:
                line = line.rstrip()
                if not line.startswith("#"):
                    break
                if line.startswith("##"):
                    head.parse_meta_data(line)
                else:
                    head.parse_header_line(line)
        finally:
            # Release the file even if a header line can not be parsed
            handle.close()

        header_line = head.header

        # Get the individual ids for individuals in vcf file
        vcf_individuals = set(head.individuals)

        variant_columns = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER"]

        vep_header = head.vep_columns
        snpeff_header = head.snpeff_columns

        index = 0
        for variant_line in raw_variants:
            if variant_line.startswith("#"):
                continue

            index += 1
            # Create a variant dict:
            variant_dict = get_variant_dict(variant_line=variant_line, header_line=header_line)
            # NOTE(review): lstrip() removes any leading run of the
            # characters c/h/r/C/H/R, not only a 'chr' prefix, so a name
            # like 'hs37d5' loses its first letter -- confirm this is wanted.
            variant_dict["CHROM"] = variant_dict["CHROM"].lstrip("chrCHR")
            # Create a info dict:
            info_dict = get_info_dict(info_line=variant_dict["INFO"])
            # Check if vep annotation:
            vep_string = info_dict.get("CSQ")

            # Check if snpeff annotation:
            snpeff_string = info_dict.get("ANN")

            # vep takes precedence, snpeff is only parsed when there is no
            # vep annotation
            if vep_string:
                # Get the vep annotations
                vep_info = get_vep_info(vep_string=vep_string, vep_header=vep_header)

            elif snpeff_string:
                # Get the snpeff annotations
                snpeff_info = get_snpeff_info(snpeff_string=snpeff_string, snpeff_header=snpeff_header)

            variant = Variant(**{column: variant_dict.get(column, ".") for column in variant_columns})

            logger.debug("Creating a variant object of variant {0}".format(variant.get("variant_id")))

            variant["index"] = index
            logger.debug("Updating index to: {0}".format(index))

            variant["start"] = int(variant_dict["POS"])

            if self.variant_type == "sv":
                other_chrom = variant["CHROM"]
                # If we have a translocation:
                if ":" in variant_dict["ALT"] and "<" not in variant_dict["ALT"]:
                    other_coordinates = variant_dict["ALT"].strip("ACGTN[]").split(":")
                    other_chrom = other_coordinates[0].lstrip("chrCHR")
                    other_position = other_coordinates[1]
                    # NOTE(review): stop is stored as a string here but as an
                    # int in the other branches -- confirm with the consumers.
                    variant["stop"] = other_position

                    # Set 'infinity' to length if translocation
                    variant["sv_len"] = float("inf")
                else:
                    variant["stop"] = int(info_dict.get("END", variant_dict["POS"]))
                    variant["sv_len"] = variant["stop"] - variant["start"]

                variant["stop_chrom"] = other_chrom

            else:
                variant["stop"] = int(variant_dict["POS"]) + (len(variant_dict["REF"]) - len(variant_dict["ALT"]))

            variant["sv_type"] = info_dict.get("SVTYPE")
            variant["cytoband_start"] = get_cytoband_coord(chrom=variant["CHROM"], pos=variant["start"])
            if variant.get("stop_chrom"):
                variant["cytoband_stop"] = get_cytoband_coord(chrom=variant["stop_chrom"], pos=variant["stop"])

            # It would be easy to update these keys...
            thousand_g = info_dict.get("1000GAF")
            if thousand_g:
                logger.debug("Updating thousand_g to: {0}".format(thousand_g))
                variant["thousand_g"] = float(thousand_g)
                variant.add_frequency("1000GAF", variant.get("thousand_g"))

            # SV specific tag for number of occurances
            occurances = info_dict.get("OCC")
            if occurances:
                logger.debug("Updating occurances to: {0}".format(occurances))
                variant["occurances"] = float(occurances)
                variant.add_frequency("OCC", occurances)

            cadd_score = info_dict.get("CADD")
            if cadd_score:
                logger.debug("Updating cadd_score to: {0}".format(cadd_score))
                variant["cadd_score"] = float(cadd_score)

            rank_score_entry = info_dict.get("RankScore")
            if rank_score_entry:
                # Entries are ',' separated and the score is the part after
                # the last ':'. The score of the last entry is the one used.
                rank_score = rank_score_entry.split(",")[-1].split(":")[-1]
                logger.debug("Updating rank_score to: {0}".format(rank_score))
                variant["rank_score"] = float(rank_score)

            genetic_models_entry = info_dict.get("GeneticModels")
            if genetic_models_entry:
                genetic_models = []
                for family_annotation in genetic_models_entry.split(","):
                    genetic_models.extend(family_annotation.split(":")[-1].split("|"))
                # This used to log rank_score, which is not bound when the
                # variant has no RankScore entry
                logger.debug("Updating genetic_models to: {0}".format(genetic_models))
                variant["genetic_models"] = genetic_models

            # Add genotype calls:
            for individual in case_obj.individuals:
                sample_id = individual.ind_id

                if sample_id not in vcf_individuals:
                    continue

                raw_call = dict(zip(variant_dict["FORMAT"].split(":"), variant_dict[sample_id].split(":")))
                # AD holds ',' separated depths, reference first. A single
                # value (like '.') has no alternative depth; use the empty
                # string, the same value a missing AD results in.
                allele_depths = raw_call.get("AD", ",").split(",")
                ref_depth = allele_depths[0]
                alt_depth = allele_depths[1] if len(allele_depths) > 1 else ""
                variant.add_individual(
                    Genotype(
                        sample_id=sample_id,
                        genotype=raw_call.get("GT", "./."),
                        case_id=individual.case_name,
                        phenotype=individual.phenotype,
                        ref_depth=ref_depth,
                        alt_depth=alt_depth,
                        genotype_quality=raw_call.get("GQ", "."),
                        depth=raw_call.get("DP", "."),
                        supporting_evidence=raw_call.get("SU", "0"),
                        pe_support=raw_call.get("PE", "0"),
                        sr_support=raw_call.get("SR", "0"),
                    )
                )

            # Add transcript information:
            # The snpeff branch belongs to the vep check (it used to hang on
            # the gmaf check), so snpeff_info is only read when it was
            # created for this very variant.
            if vep_string:
                gmaf = None
                for transcript_info in vep_info:
                    transcript = self._get_vep_transcripts(transcript_info)
                    gmaf_raw = transcript_info.get("GMAF")
                    if gmaf_raw:
                        gmaf = float(gmaf_raw.split(":")[-1])
                    variant.add_transcript(transcript)

                if gmaf:
                    variant.add_frequency("GMAF", gmaf)
                    # NOTE(review): attribute access, while thousand_g is set
                    # with item access above -- confirm Variant supports both.
                    if not variant.thousand_g:
                        variant.thousand_g = gmaf

            elif snpeff_string:
                for transcript_info in snpeff_info:
                    transcript = self._get_snpeff_transcripts(transcript_info)
                    variant.add_transcript(transcript)

            variant["most_severe_consequence"] = get_most_severe_consequence(variant["transcripts"])

            for gene in self._get_genes(variant):
                variant.add_gene(gene)

            self._add_compounds(variant=variant, info_dict=info_dict)

            yield variant
Esempio n. 13
0
def cli(variant_file, thousand_g, exac, treshold, outfile, annotate, keyword,
        verbose, logfile):
    """
    Filter vcf variants based on their frequency.
    
    One can use different sources by addind --keyword multiple times.
    Variants and frequency sources should be splitted and normalized(with vt).
    """
    # The docstring above is left untouched on purpose.
    # NOTE(review): this looks like a command line entry point, in which case
    # the docstring doubles as the help text -- confirm before rewording it.
    #
    # A variant is printed when the highest frequency found for it, over the
    # INFO keys in `keyword` and the optional 1000G/ExAC tabix files, is
    # below `treshold`.
    loglevel = LEVELS.get(min(verbose,2), "WARNING")
    init_log(root_logger, logfile, loglevel)

    logger = logging.getLogger(__name__)

    #For testing
    logger = logging.getLogger("filter_variants.cli.root")
    logger.info("Running filter_variants version {0}".format(__version__))

    logger.info("Initializing a Header Parser")
    head = HeaderParser()

    # Consume the header. The first non header line has already been pulled
    # out of the iterator when we notice it, so remember it explicitly.
    # Relying on the loop variable afterwards fails for an empty input
    # (unbound name) and would hand the '#CHROM' line to the variant loop
    # for a file that holds a header only.
    first_variant = None
    for line in variant_file:
        line = line.rstrip()
        if not line.startswith('#'):
            first_variant = line
            break
        if line.startswith('##'):
            head.parse_meta_data(line)
        else:
            head.parse_header_line(line)

    if first_variant:
        variant_file = itertools.chain([first_variant], variant_file)

    # One (info keyword, tabix handle, label for log messages) entry per
    # external frequency file that was given
    frequency_sources = []

    if thousand_g:
        logger.info("Opening 1000G frequency file with tabix open")
        try:
            thousand_g_handle = get_tabix_handle(thousand_g)
        except OSError as e:
            # Exceptions have no 'message' attribute in python 3
            logger.critical(e)
            logger.info("Exiting")
            sys.exit(1)
        logger.debug("1000G frequency file opened")
        frequency_sources.append(("1000GAF", thousand_g_handle, "thousand g"))
        if annotate:
            head.add_info(
                "1000GAF",
                "1",
                'Float',
                "Frequency in the 1000G database."
            )

    if exac:
        logger.info("Opening ExAC frequency file with tabix open")
        try:
            exac_handle = get_tabix_handle(exac)
        except OSError as e:
            logger.critical(e)
            logger.info("Exiting")
            sys.exit(1)

        logger.debug("ExAC frequency file opened")
        frequency_sources.append(("ExACAF", exac_handle, "ExAC"))
        if annotate:
            head.add_info(
                "ExACAF",
                "1",
                'Float',
                "Frequency in the ExAC database."
            )

    # Every keyword must be declared as INFO in the vcf header
    plugins = []
    for key in keyword:
        if key not in head.info_dict:
            logger.error("{0} is not defined in vcf header.".format(key))
            logger.info("Exiting")
            sys.exit(1)
        plugins.append(Plugin(
            name=key,
            field='INFO',
            data_type='float',
            separators=[','],
            info_key=key,
            record_rule='max',
        ))

    print_headers(head, outfile)

    for line in variant_file:
        max_freq = 0
        line = line.rstrip()
        variant_line = line.split('\t')
        chrom = variant_line[0]
        # Remove a leading 'chr' only. str.strip('chr') removes any of the
        # characters c/h/r from both ends and mangles names like 'hs37d5'.
        if chrom.startswith('chr'):
            chrom = chrom[3:]
        position = int(variant_line[1])
        ref = variant_line[3]
        alternative = variant_line[4]
        logger.debug("Checking variant {0}".format(
            '_'.join([chrom, str(position), ref, alternative])
        ))

        # Frequencies that are annotated in the variant itself
        for plugin in plugins:
            logger.debug("Getting frequency for {0}".format(
                plugin.name))
            frequency = plugin.get_value(variant_line=line)
            logger.debug("Found frequency {0}".format(
                frequency))
            if frequency:
                if float(frequency) > max_freq:
                    logger.debug("Updating max freq")
                    max_freq = float(frequency)

        # Frequencies from the external files, 1000G before ExAC
        for info_key, tabix_handle, label in frequency_sources:
            logger.debug("Getting {0} frequency".format(label))
            frequency = get_frequency(
                chrom = chrom,
                pos = position,
                alt = alternative,
                tabix_reader = tabix_handle
                )
            logger.debug("Found frequency {0}".format(
                frequency))

            if frequency:
                if annotate:
                    line = add_vcf_info(
                        keyword=info_key,
                        variant_line=line,
                        annotation=frequency
                    )
                if float(frequency) > max_freq:
                    logger.debug("Updating max freq")
                    max_freq = float(frequency)

        if max_freq < treshold:
            print_variant(line, outfile)
        else:
            logger.debug("Frequency {0} is higher than treshold"\
            " {1}. Skip printing variant".format(max_freq, treshold))
Esempio n. 14
0
def export(ctx, outfile):
    """Export the variants of a loqus db
        
        The variants are exported to a vcf file
    """
    # The adapter gives access to the database, it is found on the context
    adapter = ctx.obj['adapter']

    logger.info("Export the variants from {0}".format(adapter))

    # Chromosomes listed in CHROMOSOME_ORDER come first, in that order.
    # Whatever else the database holds is appended after them.
    remaining = set(adapter.get_chromosomes())
    ordered_chromosomes = []
    for known in CHROMOSOME_ORDER:
        if known not in remaining:
            continue
        ordered_chromosomes.append(known)
        remaining.remove(known)
    ordered_chromosomes.extend(remaining)

    nr_cases = adapter.cases().count()
    logger.info("Found {0} cases in database".format(nr_cases))

    # Build the vcf header
    head = HeaderParser()
    head.add_fileformat("VCFv4.3")
    head.add_meta_line("NrCases", nr_cases)
    info_fields = (
        ("Obs", "The number of observations for the variant"),
        ("Hom", "The number of observed homozygotes"),
        ("Hem", "The number of observed hemizygotes"),
    )
    for info_id, description in info_fields:
        head.add_info(info_id, '1', 'Integer', description)
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M")
    head.add_version_tracking("loqusdb", __version__, timestamp)
    # One contig line per chromosome, the highest position is used as length
    for contig in ordered_chromosomes:
        max_position = adapter.get_max_position(contig)
        head.add_contig(contig_id=contig, length=str(max_position))

    print_headers(head, outfile=outfile)

    line_template = "{0}\t{1}\t.\t{2}\t{3}\t.\t.\t{4}\n"
    for contig in ordered_chromosomes:
        for variant in adapter.get_variants(chromosome=contig):
            chrom = variant['chrom']
            pos = variant['start']
            ref = variant['ref']
            alt = variant['alt']
            # Obs is always printed, Hom and Hem only when they are non zero
            info_entries = ["Obs={0}".format(variant['observations'])]
            homozygotes = variant['homozygote']
            hemizygotes = variant['hemizygote']
            if homozygotes:
                info_entries.append("Hom={0}".format(homozygotes))
            if hemizygotes:
                info_entries.append("Hem={0}".format(hemizygotes))
            variant_line = line_template.format(
                chrom, pos, ref, alt, ";".join(info_entries))
            print_variant(variant_line=variant_line, outfile=outfile)
Esempio n. 15
0
def export(ctx, outfile, variant_type):
    """Export the variants of a loqus db
        
        The variants are exported to a vcf file
    """
    # The adapter gives access to the database. Both the adapter and the
    # version that is written to the header are found on the context.
    adapter = ctx.obj['adapter']
    version = ctx.obj['version']

    LOG.info("Export the variants from {0}".format(adapter))

    # Decide once if structural variants are exported. Chromosomes, header
    # and variant query all follow this flag, so they can not disagree when
    # variant_type is neither 'snv' nor 'sv'.
    is_sv = variant_type == 'sv'
    existing_chromosomes = set(adapter.get_chromosomes(sv=is_sv))

    # Chromosomes listed in CHROMOSOME_ORDER come first, in that order.
    # Whatever else the database holds is appended after them.
    ordered_chromosomes = []
    for chrom in CHROMOSOME_ORDER:
        if chrom in existing_chromosomes:
            ordered_chromosomes.append(chrom)
            existing_chromosomes.remove(chrom)
    ordered_chromosomes.extend(existing_chromosomes)

    nr_cases = adapter.cases().count()
    LOG.info("Found {0} cases in database".format(nr_cases))

    # Build the vcf header
    head = HeaderParser()
    head.add_fileformat("VCFv4.3")
    head.add_meta_line("NrCases", nr_cases)
    head.add_info("Obs", '1', 'Integer', "The number of observations for the variant")
    head.add_info("Hom", '1', 'Integer', "The number of observed homozygotes")
    head.add_info("Hem", '1', 'Integer', "The number of observed hemizygotes")
    head.add_version_tracking("loqusdb", version, datetime.now().strftime("%Y-%m-%d %H:%M"))

    if is_sv:
        head.add_info("END", '1', 'Integer', "End position of the variant")
        head.add_info("SVTYPE", '1', 'String', "Type of structural variant")
        head.add_info("SVLEN", '1', 'Integer', "Length of structural variant")

    # One contig line per chromosome, the highest position is used as length
    for chrom in ordered_chromosomes:
        length = adapter.get_max_position(chrom)
        head.add_contig(contig_id=chrom, length=str(length))

    print_headers(head, outfile=outfile)

    for chrom in ordered_chromosomes:
        if is_sv:
            LOG.info("Collecting all SV variants")
            variants = adapter.get_sv_variants(chromosome=chrom)
        else:
            LOG.info("Collecting all SNV variants")
            variants = adapter.get_variants(chromosome=chrom)
        LOG.info("{} variants found".format(variants.count()))
        for variant in variants:
            variant_line = format_variant(variant, variant_type=variant_type)
            print_variant(variant_line=variant_line, outfile=outfile)
Esempio n. 16
0
def export(ctx, outfile):
    """Export the variants of a loqus db
        
        The variants are exported to a vcf file
    """
    # The adapter gives access to the database, it is found on the context.
    # The variant lines are first written to a temporary file, so that they
    # can be sorted before they are printed.
    adapter = ctx.obj['adapter']

    logger.info("Export the variants from {0}".format(adapter))

    # Count the cases by walking through them
    nr_cases = sum(1 for _ in adapter.cases())
    logger.info("Found {0} cases in database".format(nr_cases))

    head = HeaderParser()
    # Only the version is given here, like the parsed fileformat value
    # ('VCFv4.2' for the line '##fileformat=VCFv4.2').
    # NOTE(review): assumes the '##fileformat=' prefix is added when the
    # header is printed -- confirm against HeaderParser.
    head.add_fileformat("VCFv4.1")
    head.add_meta_line("NrCases", nr_cases)
    head.add_info("Obs", '1', 'Integer',
                  "The number of observations for the variant")
    head.add_info("Hom", '1', 'Integer', "The number of observed homozygotes")
    head.add_version_tracking("loqusdb", __version__,
                              datetime.now().strftime("%Y-%m-%d %H:%M"))

    logger.debug("Create tempfile to print variants from database")
    # Text mode is needed since str lines are written; the default mode is
    # binary. The with statement closes (and thereby removes) the file also
    # when printing the headers or a variant fails.
    with tempfile.TemporaryFile(mode='w+') as variants:
        logger.debug("Printing headers")
        print_headers(head, outfile=outfile)

        for variant in adapter.get_variants():
            # The id holds chrom, pos, ref and alt separated by '_'
            variant_id = variant['_id'].split('_')
            chrom = variant_id[0]
            pos = variant_id[1]
            ref = variant_id[2]
            alt = variant_id[3]

            observations = variant['observations']
            homozygotes = variant['homozygote']

            info = "Obs={0};Hom={1}".format(observations, homozygotes)

            variant_line = "{0}\t{1}\t.\t{2}\t{3}\t.\t.\t{4}\n".format(
                chrom, pos, ref, alt, info)

            variants.write(variant_line)

        # Rewind so that the lines can be read back for sorting
        variants.seek(0)
        for line in sort_variants(variants):
            print_variant(variant_line=line, outfile=outfile)
Esempio n. 17
0
def get_individuals(vcf=None, case_lines=None, case_type='ped'):
    """Get the individuals from a vcf file, and/or a ped file.

    If ``case_lines`` is given the individuals are read from those lines
    and ``vcf`` is only used as their variant source. Otherwise, if ``vcf``
    is given, the individuals found in the vcf header are used.

    Args:
        vcf (str): Path to a vcf
        case_lines(Iterable): Ped like lines
        case_type(str): Format of ped lines

    Returns:
        individuals (list): List with Individuals, empty if neither
            ``case_lines`` nor ``vcf`` is given

    Raises:
        IOError: If the ped lines do not describe exactly one family
    """
    individuals = []

    if case_lines:
        # read individuals from ped file
        family_parser = FamilyParser(case_lines, family_type=case_type)
        families = family_parser.families
        logger.info("Found families {0}".format(
            ','.join(list(families.keys()))))
        if len(families) != 1:
            message = "Only one family can be used with vcf adapter"
            logger.error(message)
            raise IOError(message)

        case_id = list(families.keys())[0]
        logger.info("Family used in analysis: {0}".format(case_id))

        for ind_id in family_parser.individuals:
            ind = family_parser.individuals[ind_id]
            logger.info("Found individual {0}".format(ind.individual_id))

            individual = Individual(
                ind_id=ind.individual_id,
                case_id=case_id,
                mother=ind.mother,
                father=ind.father,
                sex=str(ind.sex),
                phenotype=str(ind.phenotype),
                variant_source=vcf,
            )
            individuals.append(individual)

    elif vcf:
        # read individuals from vcf file, the file name is used as case id
        case_id = os.path.basename(vcf)
        head = HeaderParser()
        handle = get_vcf_handle(infile=vcf)
        try:
            # Only the header is needed, stop at the first variant line
            for line in handle:
                line = line.rstrip()
                if line.startswith('#'):
                    if line.startswith('##'):
                        head.parse_meta_data(line)
                    else:
                        head.parse_header_line(line)
                else:
                    break
        finally:
            # Always release the file handle, also if parsing fails
            handle.close()

        for ind in head.individuals:
            # If we only have a vcf file we can not get metadata about the
            # individuals
            individual = Individual(
                ind_id=ind,
                case_id=case_id,
                variant_source=vcf,
            )
            individuals.append(individual)

            logger.debug("Found individual {0} in {1}".format(
                ind, vcf))

    return individuals
Esempio n. 18
0
def export(ctx, outfile):
    """Export the variants of a loqus db

        The variants are exported to a vcf file. The header is printed,
        after that all variants from the adapter are collected in a
        temporary file, sorted and printed one by one.

        Args:
            ctx: Context object, the adapter is read from ``ctx.obj['adapter']``
            outfile: Forwarded to ``print_headers`` and ``print_variant``
    """
    adapter = ctx.obj['adapter']

    logger.info("Export the variants from {0}".format(adapter))
    # Number of cases in the database, 0 when the iterable is empty
    nr_cases = sum(1 for _ in adapter.cases())
    logger.info("Found {0} cases in database".format(nr_cases))

    head = HeaderParser()
    head.add_fileformat("##fileformat=VCFv4.1")
    head.add_meta_line("NrCases", nr_cases)
    head.add_info("Obs", '1', 'Integer',
                  "The number of observations for the variant")
    head.add_info("Hom", '1', 'Integer', "The number of observed homozygotes")
    head.add_version_tracking("loqusdb", __version__,
                              datetime.now().strftime("%Y-%m-%d %H:%M"))

    logger.debug("Create tempfile to print variants from database")
    # TemporaryFile defaults to binary mode ('w+b'); the lines written
    # below are str, so the file has to be opened in text mode to work on
    # Python 3 as well.
    variants = tempfile.TemporaryFile(mode='w+')

    try:
        # Inside the try block so the temporary file is closed even when
        # printing the headers fails
        logger.debug("Printing headers")
        print_headers(head, outfile=outfile)

        for variant in adapter.get_variants():
            # The id holds chrom, pos, ref and alt separated by '_'
            variant_id = variant['_id'].split('_')
            chrom = variant_id[0]
            pos = variant_id[1]
            ref = variant_id[2]
            alt = variant_id[3]

            info = "Obs={0};Hom={1}".format(
                variant['observations'], variant['homozygote'])

            variant_line = "{0}\t{1}\t.\t{2}\t{3}\t.\t.\t{4}\n".format(
                chrom, pos, ref, alt, info)

            variants.write(variant_line)

        # Start reading from the top of the file when sorting
        variants.seek(0)
        for line in sort_variants(variants):
            print_variant(variant_line=line, outfile=outfile)
    finally:
        variants.close()
Esempio n. 19
0
    def _formated_variants(self, raw_variants, case_obj):
        """Yield variant objects built from raw variant lines

            The header of ``case_obj.variant_source`` is parsed first. After
            that one variant object is yielded for every line in
            ``raw_variants`` that does not start with '#'.

            Args:
                raw_variants (Iterable): An iterable with variant lines
                case_obj (puzzle.models.Case): A case object

            Yields:
                variant (Variant): A variant object

        """
        vcf_file_path = case_obj.variant_source

        logger.info("Parsing file {0}".format(vcf_file_path))
        head = HeaderParser()
        handle = get_vcf_handle(infile=vcf_file_path)
        try:
            # Parse the header, stop at the first variant line
            for line in handle:
                line = line.rstrip()
                if line.startswith('#'):
                    if line.startswith('##'):
                        head.parse_meta_data(line)
                    else:
                        head.parse_header_line(line)
                else:
                    break
        finally:
            # Release the handle even if the header could not be parsed
            handle.close()

        header_line = head.header

        # Get the individual ids for individuals in vcf file
        vcf_individuals = set(head.individuals)

        variant_columns = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER']

        vep_header = head.vep_columns
        snpeff_header = head.snpeff_columns

        # 1-based counter over the variant lines (header lines are skipped)
        index = 0
        for variant_line in raw_variants:
            if not variant_line.startswith('#'):
                index += 1
                #Create a variant dict:
                variant_dict = get_variant_dict(
                    variant_line = variant_line,
                    header_line = header_line
                )
                # NOTE(review): lstrip() removes any leading run of the
                # characters c/h/r/C/H/R, not only a 'chr' prefix, so a
                # contig such as 'hs37d5' would lose its leading 'h' -
                # confirm that this is acceptable for the expected contigs
                variant_dict['CHROM'] = variant_dict['CHROM'].lstrip('chrCHR')
                #Create a info dict:
                info_dict = get_info_dict(
                    info_line = variant_dict['INFO']
                )
                #Check if vep annotation:
                vep_string = info_dict.get('CSQ')

                #Check if snpeff annotation:
                snpeff_string = info_dict.get('ANN')

                if vep_string:
                    #Get the vep annotations
                    vep_info = get_vep_info(
                        vep_string = vep_string,
                        vep_header = vep_header
                    )

                elif snpeff_string:
                    #Get the snpeff annotations
                    snpeff_info = get_snpeff_info(
                        snpeff_string = snpeff_string,
                        snpeff_header = snpeff_header
                    )

                variant = Variant(
                    **{column: variant_dict.get(column, '.')
                        for column in variant_columns}
                    )

                logger.debug("Creating a variant object of variant {0}".format(
                    variant.get('variant_id')))

                variant['index'] = index
                logger.debug("Updating index to: {0}".format(
                    index))

                variant['start'] = int(variant_dict['POS'])

                if self.variant_type == 'sv':
                    other_chrom = variant['CHROM']
                    # If we have a translocation:
                    if ':' in variant_dict['ALT']:
                        # Remove the bases and brackets surrounding the
                        # 'chrom:pos' part of the ALT field
                        other_coordinates = variant_dict['ALT'].strip('ACGTN[]').split(':')
                        other_chrom = other_coordinates[0].lstrip('chrCHR')
                        other_position = other_coordinates[1]
                        # The position is stored as given in ALT (a string)
                        variant['stop'] = other_position

                        #Set 'infinity' to length if translocation
                        variant['sv_len'] = float('inf')
                    else:
                        # Without an END entry the variant stops where it starts
                        variant['stop'] = int(info_dict.get('END', variant_dict['POS']))
                        variant['sv_len'] = variant['stop'] - variant['start']

                    variant['stop_chrom'] = other_chrom

                else:
                    variant['stop'] = int(variant_dict['POS']) + \
                        (len(variant_dict['REF']) - len(variant_dict['ALT']))

                variant['sv_type'] = info_dict.get('SVTYPE')
                variant['cytoband_start'] = get_cytoband_coord(
                                                chrom=variant['CHROM'],
                                                pos=variant['start'])
                if variant.get('stop_chrom'):
                    variant['cytoband_stop'] = get_cytoband_coord(
                                                chrom=variant['stop_chrom'],
                                                pos=variant['stop'])

                # It would be easy to update these keys...
                thousand_g = info_dict.get('1000GAF')
                if thousand_g:
                    logger.debug("Updating thousand_g to: {0}".format(
                        thousand_g))
                    variant['thousand_g'] = float(thousand_g)
                    variant.add_frequency('1000GAF', variant.get('thousand_g'))

                #SV specific tag for number of occurances
                occurances = info_dict.get('OCC')
                if occurances:
                    logger.debug("Updating occurances to: {0}".format(
                        occurances))
                    variant['occurances'] = float(occurances)
                    variant.add_frequency('OCC', occurances)

                cadd_score = info_dict.get('CADD')
                if cadd_score:
                    logger.debug("Updating cadd_score to: {0}".format(
                        cadd_score))
                    variant['cadd_score'] = float(cadd_score)

                rank_score_entry = info_dict.get('RankScore')
                if rank_score_entry:
                    # Entries are comma separated and the score is the last
                    # ':' separated field; the last entry is the one used
                    for family_annotation in rank_score_entry.split(','):
                        rank_score = family_annotation.split(':')[-1]
                    logger.debug("Updating rank_score to: {0}".format(
                        rank_score))
                    variant['rank_score'] = float(rank_score)

                genetic_models_entry = info_dict.get('GeneticModels')
                if genetic_models_entry:
                    # Collect the '|' separated models of every entry
                    genetic_models = []
                    for family_annotation in genetic_models_entry.split(','):
                        for genetic_model in family_annotation.split(':')[-1].split('|'):
                            genetic_models.append(genetic_model)
                    # Log the models themselves; rank_score is not defined
                    # here when the variant has no RankScore entry
                    logger.debug("Updating genetic_models to: {0}".format(
                        genetic_models))
                    variant['genetic_models'] = genetic_models

                #Add genotype calls:
                for individual in case_obj.individuals:
                    sample_id = individual.ind_id

                    if sample_id in vcf_individuals:

                        # Pair every FORMAT key with the value of the sample
                        raw_call = dict(zip(
                            variant_dict['FORMAT'].split(':'),
                            variant_dict[sample_id].split(':'))
                        )
                        # AD is read as 'ref,alt'; when it holds a single
                        # value (e.g. '.') there is no alt depth to pick, so
                        # fall back to '.' instead of raising an IndexError
                        allele_depths = raw_call.get('AD', ',').split(',')
                        ref_depth = allele_depths[0]
                        if len(allele_depths) > 1:
                            alt_depth = allele_depths[1]
                        else:
                            alt_depth = '.'

                        variant.add_individual(Genotype(
                            sample_id = sample_id,
                            genotype = raw_call.get('GT', './.'),
                            case_id = individual.case_name,
                            phenotype = individual.phenotype,
                            ref_depth = ref_depth,
                            alt_depth = alt_depth,
                            genotype_quality = raw_call.get('GQ', '.'),
                            depth = raw_call.get('DP', '.'),
                            supporting_evidence = raw_call.get('SU', '0'),
                            pe_support = raw_call.get('PE', '0'),
                            sr_support = raw_call.get('SR', '0'),
                        ))

                # Add transcript information:
                if vep_string:
                    for transcript in self._get_vep_transcripts(variant, vep_info):
                        variant.add_transcript(transcript)

                elif snpeff_string:
                    for transcript in self._get_snpeff_transcripts(variant, snpeff_info):
                        variant.add_transcript(transcript)

                variant['most_severe_consequence'] = get_most_severe_consequence(
                    variant['transcripts']
                )

                for gene in self._get_genes(variant):
                    variant.add_gene(gene)

                self._add_compounds(variant=variant, info_dict=info_dict)

                yield variant