def export(ctx, outfile):
    """Export the variants of a loqus db

        The variants are exported to a vcf file
    """
    # Parameters (kept out of the docstring, which may be shown as CLI help
    # text -- TODO confirm):
    #   ctx:     context object; ctx.obj['adapter'] is the database adapter.
    #   outfile: output destination handed on to print_headers() and
    #            print_variant().
    adapter = ctx.obj['adapter']

    logger.info("Export the variants from {0}".format(adapter))

    # Chromosomes named in CHROMOSOME_ORDER come first, in that order.
    remaining = set(adapter.get_chromosomes())
    ordered_chromosomes = []
    for chrom in CHROMOSOME_ORDER:
        if chrom in remaining:
            ordered_chromosomes.append(chrom)
            remaining.remove(chrom)
    # Whatever is left follows in sorted order: iterating the set directly
    # gives an arbitrary order, which made the exported file differ from
    # run to run.
    ordered_chromosomes.extend(sorted(remaining))

    nr_cases = adapter.cases().count()
    logger.info("Found {0} cases in database".format(nr_cases))

    head = HeaderParser()
    head.add_fileformat("VCFv4.3")
    head.add_meta_line("NrCases", nr_cases)
    head.add_info("Obs", '1', 'Integer',
                  "The number of observations for the variant")
    head.add_info("Hom", '1', 'Integer',
                  "The number of observed homozygotes")
    head.add_info("Hem", '1', 'Integer',
                  "The number of observed hemizygotes")
    head.add_version_tracking("loqusdb", __version__,
                              datetime.now().strftime("%Y-%m-%d %H:%M"))
    # One contig line per chromosome; the length is the adapter's maximum
    # position for that chromosome.
    for chrom in ordered_chromosomes:
        length = adapter.get_max_position(chrom)
        head.add_contig(contig_id=chrom, length=str(length))

    print_headers(head, outfile=outfile)

    for chrom in ordered_chromosomes:
        for variant in adapter.get_variants(chromosome=chrom):
            # Use a separate name for the record's chromosome so the outer
            # loop variable is not rebound.
            variant_chrom = variant['chrom']
            pos = variant['start']
            ref = variant['ref']
            alt = variant['alt']
            observations = variant['observations']
            homozygotes = variant['homozygote']
            hemizygotes = variant['hemizygote']

            # Hom/Hem are only written when they are non-zero.
            info_fields = ["Obs={0}".format(observations)]
            if homozygotes:
                info_fields.append("Hom={0}".format(homozygotes))
            if hemizygotes:
                info_fields.append("Hem={0}".format(hemizygotes))

            variant_line = "{0}\t{1}\t.\t{2}\t{3}\t.\t.\t{4}\n".format(
                variant_chrom, pos, ref, alt, ';'.join(info_fields))
            print_variant(variant_line=variant_line, outfile=outfile)
def export(ctx, outfile):
    """Export the variants of a loqus db

        The variants are exported to a vcf file
    """
    # Parameters (kept out of the docstring, which may be shown as CLI help
    # text -- TODO confirm):
    #   ctx:     context object; ctx.obj['adapter'] is the database adapter.
    #   outfile: output destination handed on to print_headers() and
    #            print_variant().
    adapter = ctx.obj['adapter']

    logger.info("Export the variants from {0}".format(adapter))

    # Count the cases by walking the iterable returned by the adapter.
    nr_cases = sum(1 for _ in adapter.cases())
    logger.info("Found {0} cases in database".format(nr_cases))

    head = HeaderParser()
    # NOTE(review): this passes the full "##fileformat=..." line while other
    # callers pass only the version string (e.g. "VCFv4.3"); confirm which
    # form HeaderParser.add_fileformat expects.
    head.add_fileformat("##fileformat=VCFv4.1")
    head.add_meta_line("NrCases", nr_cases)
    head.add_info("Obs", '1', 'Integer',
                  "The number of observations for the variant")
    head.add_info("Hom", '1', 'Integer',
                  "The number of observed homozygotes")
    head.add_version_tracking("loqusdb", __version__,
                              datetime.now().strftime("%Y-%m-%d %H:%M"))

    logger.debug("Create tempfile to print variants from database")
    # Text mode ('w+') so the str lines below can be written on Python 3 as
    # well; the context manager closes the file even if printing the headers
    # or sorting fails.
    with tempfile.TemporaryFile(mode='w+') as variants:
        logger.debug("Printing headers")
        print_headers(head, outfile=outfile)

        for variant in adapter.get_variants():
            # The first four '_'-separated parts of the id are used as
            # chrom, pos, ref and alt.
            variant_id = variant['_id'].split('_')
            chrom = variant_id[0]
            pos = variant_id[1]
            ref = variant_id[2]
            alt = variant_id[3]

            info = "Obs={0};Hom={1}".format(
                variant['observations'], variant['homozygote'])

            variant_line = "{0}\t{1}\t.\t{2}\t{3}\t.\t.\t{4}\n".format(
                chrom, pos, ref, alt, info)
            variants.write(variant_line)

        # Rewind so sort_variants reads the lines from the beginning.
        variants.seek(0)

        for line in sort_variants(variants):
            print_variant(variant_line=line, outfile=outfile)
def export(ctx, outfile, variant_type):
    """Export the variants of a loqus db

        The variants are exported to a vcf file
    """
    # Parameters (kept out of the docstring, which may be shown as CLI help
    # text -- TODO confirm):
    #   ctx:          context object; ctx.obj['adapter'] is the database
    #                 adapter and ctx.obj['version'] the version written to
    #                 the header.
    #   outfile:      output destination handed on to print_headers() and
    #                 print_variant().
    #   variant_type: 'snv' exports via adapter.get_variants(); any other
    #                 value exports via adapter.get_sv_variants(). The SV
    #                 header fields are only added for 'sv'.
    adapter = ctx.obj['adapter']
    version = ctx.obj['version']

    LOG.info("Export the variants from {0}".format(adapter))

    is_sv = variant_type == 'sv'

    # Chromosomes named in CHROMOSOME_ORDER come first, in that order.
    remaining = set(adapter.get_chromosomes(sv=is_sv))
    ordered_chromosomes = []
    for chrom in CHROMOSOME_ORDER:
        if chrom in remaining:
            ordered_chromosomes.append(chrom)
            remaining.remove(chrom)
    # Whatever is left follows in sorted order: iterating the set directly
    # gives an arbitrary order, which made the exported file differ from
    # run to run.
    ordered_chromosomes.extend(sorted(remaining))

    nr_cases = adapter.cases().count()
    LOG.info("Found {0} cases in database".format(nr_cases))

    head = HeaderParser()
    head.add_fileformat("VCFv4.3")
    head.add_meta_line("NrCases", nr_cases)
    head.add_info("Obs", '1', 'Integer',
                  "The number of observations for the variant")
    head.add_info("Hom", '1', 'Integer',
                  "The number of observed homozygotes")
    head.add_info("Hem", '1', 'Integer',
                  "The number of observed hemizygotes")
    head.add_version_tracking("loqusdb", version,
                              datetime.now().strftime("%Y-%m-%d %H:%M"))

    if is_sv:
        head.add_info("END", '1', 'Integer', "End position of the variant")
        head.add_info("SVTYPE", '1', 'String', "Type of structural variant")
        head.add_info("SVLEN", '1', 'Integer', "Length of structural variant")

    # One contig line per chromosome; the length is the adapter's maximum
    # position for that chromosome.
    for chrom in ordered_chromosomes:
        length = adapter.get_max_position(chrom)
        head.add_contig(contig_id=chrom, length=str(length))

    print_headers(head, outfile=outfile)

    for chrom in ordered_chromosomes:
        if variant_type == 'snv':
            LOG.info("Collecting all SNV variants")
            variants = adapter.get_variants(chromosome=chrom)
        else:
            LOG.info("Collecting all SV variants")
            variants = adapter.get_sv_variants(chromosome=chrom)
        LOG.info("{} variants found".format(variants.count()))

        for variant in variants:
            variant_line = format_variant(variant, variant_type=variant_type)
            print_variant(variant_line=variant_line, outfile=outfile)
def cli(variant_file, thousand_g, exac, treshold, outfile, annotate,
        keyword, verbose, logfile):
    """
    Filter vcf variants based on their frequency.

    One can use different sources by adding --keyword multiple times.
    Variants and frequency sources should be split and normalized(with vt).
    """
    # Parameters (kept out of the docstring, which may be shown as CLI help
    # text -- TODO confirm):
    #   variant_file: iterator over the lines of the VCF to filter.
    #   thousand_g:   path handed to get_tabix_handle(); falsy disables the
    #                 1000G lookup.
    #   exac:         path handed to get_tabix_handle(); falsy disables the
    #                 ExAC lookup.
    #   treshold:     a variant is printed only when its highest frequency
    #                 over all sources is strictly below this value.
    #   outfile:      destination handed to print_headers()/print_variant().
    #   annotate:     when truthy, 1000GAF/ExACAF are declared in the header
    #                 and added to each variant that has a frequency.
    #   keyword:      iterable of INFO keys to read frequencies from; every
    #                 key must be present in the header's info_dict.
    #   verbose:      verbosity count, capped at 2 and looked up in LEVELS.
    #   logfile:      passed to init_log().
    loglevel = LEVELS.get(min(verbose, 2), "WARNING")
    init_log(root_logger, logfile, loglevel)

    # For testing
    logger = logging.getLogger("filter_variants.cli.root")
    logger.info("Running filter_variants version {0}".format(__version__))

    logger.info("Initializing a Header Parser")
    head = HeaderParser()

    # Consume the header. The first non-header line has already been pulled
    # off the iterator when we notice it, so remember it and chain it back
    # in front of the rest. Tracking it separately (rather than reusing the
    # loop variable) keeps a header-only or empty input from feeding a
    # header line, or an undefined name, into the variant loop.
    first_variant = None
    for line in variant_file:
        line = line.rstrip()
        if not line.startswith('#'):
            first_variant = line
            break
        if line.startswith('##'):
            head.parse_meta_data(line)
        else:
            head.parse_header_line(line)

    if first_variant:
        variant_file = itertools.chain([first_variant], variant_file)

    # (path, name in the "open" logs, name in the lookup log, INFO key,
    #  INFO description) for each tabix-indexed frequency source.
    source_specs = (
        (thousand_g, "1000G", "thousand g", "1000GAF",
         "Frequency in the 1000G database."),
        (exac, "ExAC", "ExAC", "ExACAF",
         "Frequency in the ExAC database."),
    )
    frequency_sources = []
    for path, name, lookup_name, info_key, description in source_specs:
        if not path:
            continue
        logger.info(
            "Opening {0} frequency file with tabix open".format(name))
        try:
            handle = get_tabix_handle(path)
        except OSError as e:
            # str(e) instead of e.message: the attribute does not exist on
            # Python 3 and is empty for two-argument OSErrors on Python 2.
            logger.critical(str(e))
            logger.info("Exiting")
            sys.exit(1)
        logger.debug("{0} frequency file opened".format(name))
        if annotate:
            head.add_info(info_key, "1", 'Float', description)
        frequency_sources.append((lookup_name, info_key, handle))

    plugins = []
    for key in keyword:
        if key not in head.info_dict:
            logger.error("{0} is not defined in vcf header.".format(key))
            logger.info("Exiting")
            sys.exit(1)
        plugins.append(Plugin(
            name=key,
            field='INFO',
            data_type='float',
            separators=[','],
            info_key=key,
            record_rule='max',
        ))

    print_headers(head, outfile)

    for line in variant_file:
        line = line.rstrip()
        if not line:
            # Blank lines carry no variant; skip instead of failing on the
            # column lookups below.
            continue

        max_freq = 0
        variant_line = line.split('\t')

        # Drop only a leading "chr". str.strip('chr') would remove any of
        # the characters c/h/r from both ends ("hs37d5" -> "s37d5").
        chrom = variant_line[0]
        if chrom.startswith('chr'):
            chrom = chrom[3:]
        position = int(variant_line[1])
        ref = variant_line[3]
        alternative = variant_line[4]

        logger.debug("Checking variant {0}".format(
            '_'.join([chrom, str(position), ref, alternative])
        ))

        # Frequencies already present in the INFO column of the line.
        for plugin in plugins:
            logger.debug("Getting frequency for {0}".format(plugin.name))
            frequency = plugin.get_value(variant_line=line)
            logger.debug("Found frequency {0}".format(frequency))
            if frequency:
                if float(frequency) > max_freq:
                    logger.debug("Updating max freq")
                    max_freq = float(frequency)

        # Frequencies looked up in the external tabix sources.
        for lookup_name, info_key, handle in frequency_sources:
            logger.debug("Getting {0} frequency".format(lookup_name))
            frequency = get_frequency(
                chrom=chrom,
                pos=position,
                alt=alternative,
                tabix_reader=handle
            )
            logger.debug("Found frequency {0}".format(frequency))
            if frequency:
                if annotate:
                    line = add_vcf_info(
                        keyword=info_key,
                        variant_line=line,
                        annotation=frequency
                    )
                if float(frequency) > max_freq:
                    logger.debug("Updating max freq")
                    max_freq = float(frequency)

        if max_freq < treshold:
            print_variant(line, outfile)
        else:
            logger.debug("Frequency {0} is higher than treshold"
                         " {1}. Skip printing variant".format(
                             max_freq, treshold))