Esempio n. 1
0
def export(ctx, outfile):
    """Write every variant stored in a loqus db to a vcf file.

    The header (file format, number of cases, INFO definitions, version
    tracking and one contig line per chromosome) is printed first, then
    the variants chromosome by chromosome.

    Args:
        ctx: Context object; ``ctx.obj['adapter']`` is the database adapter.
        outfile: Output target, handed to ``print_headers`` and
            ``print_variant``.
    """
    adapter = ctx.obj['adapter']
    logger.info("Export the variants from {0}".format(adapter))

    # Chromosomes listed in CHROMOSOME_ORDER come first, in that order;
    # whatever else the database holds is appended afterwards.
    remaining = set(adapter.get_chromosomes())
    ordered_chromosomes = []
    for known in CHROMOSOME_ORDER:
        if known not in remaining:
            continue
        ordered_chromosomes.append(known)
        remaining.remove(known)
    ordered_chromosomes.extend(remaining)

    nr_cases = adapter.cases().count()
    logger.info("Found {0} cases in database".format(nr_cases))

    head = HeaderParser()
    head.add_fileformat("VCFv4.3")
    head.add_meta_line("NrCases", nr_cases)
    head.add_info("Obs", '1', 'Integer',
                  "The number of observations for the variant")
    head.add_info("Hom", '1', 'Integer', "The number of observed homozygotes")
    head.add_info("Hem", '1', 'Integer', "The number of observed hemizygotes")
    head.add_version_tracking("loqusdb", __version__,
                              datetime.now().strftime("%Y-%m-%d %H:%M"))
    for contig in ordered_chromosomes:
        max_position = adapter.get_max_position(contig)
        head.add_contig(contig_id=contig, length=str(max_position))

    print_headers(head, outfile=outfile)

    line_template = "{0}\t{1}\t.\t{2}\t{3}\t.\t.\t{4}\n"
    for contig in ordered_chromosomes:
        for variant in adapter.get_variants(chromosome=contig):
            variant_chrom = variant['chrom']
            start = variant['start']
            ref_allele = variant['ref']
            alt_allele = variant['alt']

            info_fields = ["Obs={0}".format(variant['observations'])]
            hom_count = variant['homozygote']
            hem_count = variant['hemizygote']
            # Hom/Hem are only written when the count is non-zero.
            if hom_count:
                info_fields.append("Hom={0}".format(hom_count))
            if hem_count:
                info_fields.append("Hem={0}".format(hem_count))

            variant_line = line_template.format(
                variant_chrom, start, ref_allele, alt_allele,
                ';'.join(info_fields))
            print_variant(variant_line=variant_line, outfile=outfile)
Esempio n. 2
0
def export(ctx, outfile):
    """Export the variants of a loqus db.

    The variants are exported to a vcf file. All variant lines are first
    written to a temporary file, which is then passed through
    ``sort_variants`` before the lines are printed.

    Args:
        ctx: Context object; ``ctx.obj['adapter']`` is the database adapter.
        outfile: Output target, handed to ``print_headers`` and
            ``print_variant``.
    """
    adapter = ctx.obj['adapter']

    logger.info("Export the variants from {0}".format(adapter))

    # Count the cases by exhausting the iterable returned by the adapter.
    nr_cases = sum(1 for _ in adapter.cases())
    logger.info("Found {0} cases in database".format(nr_cases))

    head = HeaderParser()
    # NOTE(review): the argument already carries the "##fileformat=" prefix;
    # confirm HeaderParser does not prepend it a second time when printing.
    head.add_fileformat("##fileformat=VCFv4.1")
    head.add_meta_line("NrCases", nr_cases)
    head.add_info("Obs", '1', 'Integer',
                  "The number of observations for the variant")
    head.add_info("Hom", '1', 'Integer', "The number of observed homozygotes")
    head.add_version_tracking("loqusdb", __version__,
                              datetime.now().strftime("%Y-%m-%d %H:%M"))

    logger.debug("Create tempfile to print variants from database")
    # Text mode: the default ('w+b') rejects str writes on Python 3. The
    # context manager closes the file even if printing the headers fails.
    with tempfile.TemporaryFile(mode='w+') as variants:
        logger.debug("Printing headers")
        print_headers(head, outfile=outfile)

        for variant in adapter.get_variants():
            # The variant id is split on '_' into chrom, pos, ref and alt.
            variant_id = variant['_id'].split('_')
            chrom = variant_id[0]
            pos = variant_id[1]
            ref = variant_id[2]
            alt = variant_id[3]

            observations = variant['observations']
            homozygotes = variant['homozygote']

            info = "Obs={0};Hom={1}".format(observations, homozygotes)

            variant_line = "{0}\t{1}\t.\t{2}\t{3}\t.\t.\t{4}\n".format(
                chrom, pos, ref, alt, info)

            variants.write(variant_line)

        # Rewind so that the sorter reads the file from the beginning.
        variants.seek(0)
        for line in sort_variants(variants):
            print_variant(variant_line=line, outfile=outfile)
Esempio n. 3
0
def export(ctx, outfile):
    """Export the variants of a loqus db.

    The variants are exported to a vcf file. Variant lines are buffered in
    a temporary file and run through ``sort_variants`` before being
    printed after the header.

    Args:
        ctx: Context object; ``ctx.obj['adapter']`` is the database adapter.
        outfile: Output target, handed to ``print_headers`` and
            ``print_variant``.
    """
    adapter = ctx.obj['adapter']

    logger.info("Export the variants from {0}".format(adapter))

    # Count the cases by exhausting the iterable returned by the adapter.
    nr_cases = sum(1 for _ in adapter.cases())
    logger.info("Found {0} cases in database".format(nr_cases))

    head = HeaderParser()
    # NOTE(review): the argument already carries the "##fileformat=" prefix;
    # confirm HeaderParser does not prepend it a second time when printing.
    head.add_fileformat("##fileformat=VCFv4.1")
    head.add_meta_line("NrCases", nr_cases)
    head.add_info("Obs", '1', 'Integer', "The number of observations for the variant")
    head.add_info("Hom", '1', 'Integer', "The number of observed homozygotes")
    head.add_version_tracking("loqusdb", __version__, datetime.now().strftime("%Y-%m-%d %H:%M"))

    logger.debug("Create tempfile to print variants from database")
    # Open in text mode ('w+'): the binary default fails on str writes under
    # Python 3. Using ``with`` guarantees the file is closed on any error,
    # including one raised while the headers are printed.
    with tempfile.TemporaryFile(mode='w+') as variants:
        logger.debug("Printing headers")
        print_headers(head, outfile=outfile)

        for variant in adapter.get_variants():
            # The variant id is split on '_' into chrom, pos, ref and alt.
            variant_id = variant['_id'].split('_')
            chrom = variant_id[0]
            pos = variant_id[1]
            ref = variant_id[2]
            alt = variant_id[3]

            observations = variant['observations']
            homozygotes = variant['homozygote']

            info = "Obs={0};Hom={1}".format(observations, homozygotes)

            variant_line = "{0}\t{1}\t.\t{2}\t{3}\t.\t.\t{4}\n".format(
                chrom, pos, ref, alt, info)

            variants.write(variant_line)

        # Rewind so that the sorter reads the file from the beginning.
        variants.seek(0)
        for line in sort_variants(variants):
            print_variant(variant_line=line, outfile=outfile)
Esempio n. 4
0
def export(ctx, outfile, variant_type):
    """Export the variants of a loqus db.

    The variants are exported to a vcf file: the header is printed first,
    then the variants chromosome by chromosome.

    Args:
        ctx: Context object; ``ctx.obj`` holds the database ``'adapter'``
            and the ``'version'`` string written to the header.
        outfile: Output target, handed to ``print_headers`` and
            ``print_variant``.
        variant_type: ``'snv'`` or ``'sv'``; selects which variants are
            fetched and whether the SV INFO fields are declared.
    """
    adapter = ctx.obj['adapter']
    version = ctx.obj['version']

    LOG.info("Export the variants from {0}".format(adapter))

    is_sv = variant_type == 'sv'
    existing_chromosomes = set(adapter.get_chromosomes(sv=is_sv))

    # Chromosomes listed in CHROMOSOME_ORDER come first, in that order;
    # any other chromosome found in the database is appended afterwards.
    ordered_chromosomes = []
    for chrom in CHROMOSOME_ORDER:
        if chrom in existing_chromosomes:
            ordered_chromosomes.append(chrom)
            existing_chromosomes.remove(chrom)
    ordered_chromosomes.extend(existing_chromosomes)

    nr_cases = adapter.cases().count()
    LOG.info("Found {0} cases in database".format(nr_cases))

    head = HeaderParser()
    head.add_fileformat("VCFv4.3")
    head.add_meta_line("NrCases", nr_cases)
    head.add_info("Obs", '1', 'Integer', "The number of observations for the variant")
    head.add_info("Hom", '1', 'Integer', "The number of observed homozygotes")
    head.add_info("Hem", '1', 'Integer', "The number of observed hemizygotes")
    head.add_version_tracking("loqusdb", version, datetime.now().strftime("%Y-%m-%d %H:%M"))

    if is_sv:
        head.add_info("END", '1', 'Integer', "End position of the variant")
        head.add_info("SVTYPE", '1', 'String', "Type of structural variant")
        head.add_info("SVLEN", '1', 'Integer', "Length of structural variant")

    for chrom in ordered_chromosomes:
        length = adapter.get_max_position(chrom)
        head.add_contig(contig_id=chrom, length=str(length))

    print_headers(head, outfile=outfile)

    for chrom in ordered_chromosomes:
        # Any variant_type other than 'snv' is handled as SV here.
        if variant_type == 'snv':
            LOG.info("Collecting all SNV variants")
            variants = adapter.get_variants(chromosome=chrom)
        else:
            LOG.info("Collecting all SV variants")
            variants = adapter.get_sv_variants(chromosome=chrom)
        LOG.info("{} variants found".format(variants.count()))
        for variant in variants:
            variant_line = format_variant(variant, variant_type=variant_type)
            print_variant(variant_line=variant_line, outfile=outfile)
Esempio n. 5
0
def cli(variant_file, thousand_g, exac, treshold, outfile, annotate, keyword,
        verbose, logfile):
    """
    Filter vcf variants based on their frequency.

    One can use different sources by adding --keyword multiple times.
    Variants and frequency sources should be split and normalized (with vt).

    Args:
        variant_file: Iterable of vcf lines, header lines first.
        thousand_g: 1000G frequency file given to ``get_tabix_handle``;
            falsy to skip this source.
        exac: ExAC frequency file given to ``get_tabix_handle``; falsy to
            skip this source.
        treshold: A variant is printed only when its highest frequency is
            below this value.
        outfile: Output target, handed to ``print_headers`` and
            ``print_variant``.
        annotate: If true, frequencies found in the 1000G/ExAC files are
            added to the variant line as 1000GAF/ExACAF.
        keyword: Iterable of INFO keys to read frequencies from; each must
            be defined in the vcf header.
        verbose: Verbosity count, capped at 2 and looked up in ``LEVELS``.
        logfile: Log destination, handed to ``init_log``.
    """
    loglevel = LEVELS.get(min(verbose, 2), "WARNING")
    init_log(root_logger, logfile, loglevel)

    # For testing
    logger = logging.getLogger("filter_variants.cli.root")
    logger.info("Running filter_variants version {0}".format(__version__))

    logger.info("Initializing a Header Parser")
    head = HeaderParser()

    # Consume the header. The first non-header line has already been pulled
    # off the iterator when the loop stops, so remember it and re-queue it.
    # Tracking it separately keeps a header-only (or empty) input from
    # feeding a header line (or an undefined name) into the variant loop.
    first_variant = None
    for line in variant_file:
        line = line.rstrip()
        if not line.startswith('#'):
            first_variant = line
            break
        if line.startswith('##'):
            head.parse_meta_data(line)
        else:
            head.parse_header_line(line)

    if first_variant:
        variant_file = itertools.chain([first_variant], variant_file)

    # (file, name used in messages, INFO keyword, label used in debug log)
    source_specs = (
        (thousand_g, "1000G", "1000GAF", "thousand g"),
        (exac, "ExAC", "ExACAF", "ExAC"),
    )
    frequency_sources = []
    for source_file, source_name, info_key, label in source_specs:
        if not source_file:
            continue
        logger.info("Opening {0} frequency file with tabix open".format(
            source_name))
        handle = _open_tabix_or_exit(source_file, logger)
        logger.debug("{0} frequency file opened".format(source_name))
        if annotate:
            head.add_info(
                info_key,
                "1",
                'Float',
                "Frequency in the {0} database.".format(source_name)
            )
        frequency_sources.append((info_key, label, handle))

    plugins = []
    for key in keyword:
        if key not in head.info_dict:
            logger.error("{0} is not defined in vcf header.".format(key))
            logger.info("Exiting")
            sys.exit(1)
        plugins.append(Plugin(
            name=key,
            field='INFO',
            data_type='float',
            separators=[','],
            info_key=key,
            record_rule='max',
        ))

    print_headers(head, outfile)

    for line in variant_file:
        max_freq = 0
        line = line.rstrip()
        if not line:
            # Blank lines carry no variant; skip instead of failing on them.
            continue
        variant_line = line.split('\t')
        chrom = variant_line[0]
        # Remove a leading "chr" only. str.strip('chr') would strip any of
        # the characters c/h/r from both ends (e.g. "hs37d5" -> "s37d5").
        if chrom.startswith('chr'):
            chrom = chrom[3:]
        position = int(variant_line[1])
        ref = variant_line[3]
        alternative = variant_line[4]
        logger.debug("Checking variant {0}".format(
            '_'.join([chrom, str(position), ref, alternative])
        ))

        # Frequencies that are already annotated in the INFO field.
        for plugin in plugins:
            logger.debug("Getting frequency for {0}".format(
                plugin.name))
            frequency = plugin.get_value(variant_line=line)
            logger.debug("Found frequency {0}".format(
                frequency))
            if frequency and float(frequency) > max_freq:
                logger.debug("Updating max freq")
                max_freq = float(frequency)

        # Frequencies looked up in the external frequency files.
        for info_key, label, handle in frequency_sources:
            logger.debug("Getting {0} frequency".format(label))
            frequency = get_frequency(
                chrom=chrom,
                pos=position,
                alt=alternative,
                tabix_reader=handle
            )
            logger.debug("Found frequency {0}".format(
                frequency))
            if not frequency:
                continue
            if annotate:
                line = add_vcf_info(
                    keyword=info_key,
                    variant_line=line,
                    annotation=frequency
                )
            if float(frequency) > max_freq:
                logger.debug("Updating max freq")
                max_freq = float(frequency)

        if max_freq < treshold:
            print_variant(line, outfile)
        else:
            logger.debug("Frequency {0} is higher than treshold"
                         " {1}. Skip printing variant".format(
                             max_freq, treshold))


def _open_tabix_or_exit(source_file, logger):
    """Return a tabix handle for source_file; log and exit(1) on OSError."""
    try:
        return get_tabix_handle(source_file)
    except OSError as e:
        # str(e) rather than e.message: the attribute is gone in Python 3.
        logger.critical(str(e))
        logger.info("Exiting")
        sys.exit(1)