def filter_vcf_in_memory(variants, filters, short_circuit=False, drop_filtered=False, invert=False, **kwargs):
    """Apply a chain of filters to an in-memory collection of VCF records.

    Parameters
    ----------
    variants : iterable
        Records to filter.  Each record must provide ``add_filter(name)``.
        If the collection exposes a ``filters`` mapping, a header entry is
        registered there for every filter (best effort).
    filters : iterable
        Callables taking a record and returning ``None`` when the record
        passes; each must also provide ``filter_name()``.
    short_circuit : bool
        Stop evaluating further filters once one has flagged the record.
    drop_filtered : bool
        Leave flagged records out of the result instead of tagging them.
    invert : bool
        When true, every filter result is ignored, so all records are kept
        untagged.  NOTE(review): this mirrors the existing behaviour; confirm
        whether a real inversion (flag records the filter lets through) was
        intended.
    **kwargs
        Accepted for call compatibility and ignored.

    Returns
    -------
    list
        The records that were kept, in input order.
    """
    import logging

    chain = list(filters)
    for filter_obj in chain:
        short_doc = (filter_obj.__doc__ or '').split('\n')[0].lstrip()
        # Add a filter record to the output.  This stays best effort because
        # ``variants`` may be a plain sequence without a header mapping, but
        # only ordinary exceptions are swallowed (not KeyboardInterrupt etc.).
        try:
            name = filter_obj.filter_name()
            variants.filters[name] = _Filter(name, short_doc)
        except Exception as err:
            logging.getLogger(__name__).debug(
                "Could not register header entry for filter %r: %s",
                filter_obj, err)

    filtered_records = []
    for record in variants:
        output_record = True
        for filt in chain:
            # The filter is always evaluated, even when ``invert`` is set.
            if filt(record) is None or invert:
                continue

            # save some work by skipping the rest of the code
            if drop_filtered:
                output_record = False
                break

            record.add_filter(filt.filter_name())
            if short_circuit:
                break

        # Keep the record unless it was flagged while dropping filtered ones.
        if output_record:
            filtered_records.append(record)

    return filtered_records
def filter_vcf(file_path, filters, short_circuit=False, drop_filtered=False, invert=False, output_file=None):
    """Filter a VCF file on disk and write the result to a new VCF file.

    Parameters
    ----------
    file_path : str
        Path of the input VCF.
    filters : iterable
        Callables taking a record and returning ``None`` when the record
        passes; each must also provide ``filter_name()``.
    short_circuit : bool
        Stop evaluating further filters once one has flagged the record.
    drop_filtered : bool
        Do not write flagged records instead of tagging them.
    invert : bool
        When true, every filter result is ignored, so all records are
        written untagged.  NOTE(review): mirrors the existing behaviour;
        confirm whether a real inversion was intended.
    output_file : str, optional
        Destination path; defaults to ``file_path + '.filt.vcf'``.

    Returns
    -------
    str
        The path of the file that was written.
    """
    if output_file is None:
        output_file = file_path + '.filt.vcf'

    # Both handles are closed (and the output flushed) on exit, including
    # when a filter raises part-way through the file.
    with open(file_path, 'r') as in_handle:
        inp = vcf.Reader(in_handle)

        # build filter chain
        chain = []
        for filter_obj in filters:
            chain.append(filter_obj)
            short_doc = (filter_obj.__doc__ or '').split('\n')[0].lstrip()
            # add a filter record to the output
            name = filter_obj.filter_name()
            inp.filters[name] = _Filter(name, short_doc)

        with open(output_file, 'w') as out_handle:
            # output must be created after all the filter records have been added
            output = vcf.Writer(out_handle, inp)

            # apply filters
            for record in inp:
                output_record = True
                for filt in chain:
                    if filt(record) is None or invert:
                        continue

                    # save some work by skipping the rest of the code
                    if drop_filtered:
                        output_record = False
                        break

                    record.add_filter(filt.filter_name())
                    if short_circuit:
                        break

                # Keep the record unless it was flagged while dropping
                # filtered ones.
                if output_record:
                    output.write_record(record)

    return output_file
def main(args):
    """Filter a tandem-repeat VCF at call and locus level and write logs.

    Reads ``args.vcf``, applies call-level and locus-level filters, recomputes
    the AC / REFAC / HET / HWEP / HRUN INFO fields for every record that is
    written, and produces ``<args.out>.vcf``, ``<args.out>.samplog.tab`` and
    ``<args.out>.loclog.tab``.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed options.  Reads ``vcf``, ``out``, ``verbose``, ``num_records``,
        ``drop_filtered``, ``die_on_warning`` and ``use_length`` here; the
        filter builders read further options.

    Returns
    -------
    int
        0 on success, 1 on any set-up failure (or on an unparsable record
        when ``args.die_on_warning`` is set).
    """
    # Load VCF file
    if not os.path.exists(args.vcf):
        common.WARNING("%s does not exist" % args.vcf)
        return 1
    invcf = vcf.Reader(filename=args.vcf)

    # Set up record harmonizer and infer VCF type
    vcftype = trh.InferVCFType(invcf)

    # Check filters all make sense
    if not CheckFilters(invcf, args, vcftype):
        return 1

    # Set up locus-level filter list
    try:
        filter_list = BuildLocusFilters(args, vcftype)
    except ValueError:
        return 1
    invcf.filters = {}
    for f in filter_list:
        short_doc = (f.__doc__ or '').split('\n')[0].lstrip()
        invcf.filters[f.filter_name()] = _Filter(f.filter_name(), short_doc)

    # Set up call-level filters
    call_filters = BuildCallFilters(args)

    # Add new FORMAT fields
    if "FILTER" not in invcf.formats:
        invcf.formats["FILTER"] = _Format("FILTER", 1, "String", "Call-level filter")

    # Add new INFO fields
    invcf.infos["AC"] = _Info("AC", -1, "Integer", "Alternate allele counts", source=None, version=None)
    invcf.infos["REFAC"] = _Info("REFAC", 1, "Integer", "Reference allele count", source=None, version=None)
    invcf.infos["HET"] = _Info("HET", 1, "Float", "Heterozygosity", source=None, version=None)
    invcf.infos["HWEP"] = _Info("HWEP", 1, "Float", "HWE p-value for obs. vs. exp het rate", source=None, version=None)
    invcf.infos["HRUN"] = _Info("HRUN", 1, "Integer", "Length of longest homopolymer run", source=None, version=None)

    # Set up output files
    if not os.path.exists(os.path.dirname(os.path.abspath(args.out))):
        common.WARNING("Output directory does not exist")
        return 1
    outvcf = MakeWriter(args.out + ".vcf", invcf, " ".join(sys.argv))
    if outvcf is None:
        return 1

    # Set up sample info
    all_reasons = GetAllCallFilters(call_filters)
    sample_info = {}
    for s in invcf.samples:
        sample_info[s] = {"numcalls": 0, "totaldp": 0}
        for r in all_reasons:
            sample_info[s][r] = 0

    # Set up locus info
    loc_info = {"totalcalls": 0, "PASS": 0}
    for filt in filter_list:
        loc_info[filt.filter_name()] = 0

    # Go through each record
    record_counter = 0
    while True:
        try:
            record = next(invcf)
        except IndexError:
            common.WARNING(
                "Skipping TR that couldn't be parsed by PyVCF. Check VCF format"
            )
            if args.die_on_warning:
                return 1
            # Really skip it: without this the previous record (or an
            # undefined name on the first iteration) would be processed.
            continue
        except StopIteration:
            break
        if args.verbose:
            common.MSG("Processing %s:%s" % (record.CHROM, record.POS))
        record_counter += 1
        if args.num_records is not None and record_counter > args.num_records:
            break

        # Call-level filters
        record = ApplyCallFilters(record, invcf, call_filters, sample_info)

        # Locus-level filters
        record.FILTER = None
        output_record = True
        for filt in filter_list:
            if filt(record) is None:
                continue
            if args.drop_filtered:
                output_record = False
                break
            record.add_filter(filt.filter_name())
            loc_info[filt.filter_name()] += 1

        # When dropping, also drop loci left with no calls at all.
        if args.drop_filtered and record.call_rate == 0:
            output_record = False

        if output_record:
            trrecord = trh.HarmonizeRecord(vcftype, record)
            # Recalculate locus-level INFO fields
            record.INFO["HRUN"] = utils.GetHomopolymerRun(record.REF)
            if record.num_called > 0:
                allele_freqs = trrecord.GetAlleleFreqs(
                    uselength=args.use_length)
                genotype_counts = trrecord.GetGenotypeCounts(
                    uselength=args.use_length)
                record.INFO["HET"] = utils.GetHeterozygosity(allele_freqs)
                record.INFO["HWEP"] = utils.GetHardyWeinbergBinomialTest(
                    allele_freqs, genotype_counts)
                # Same allele total as REFAC below (2 per called sample), so
                # that AC and REFAC are counted on the same scale.
                record.INFO["AC"] = [
                    int(item * (2 * record.num_called))
                    for item in record.aaf
                ]
                record.INFO["REFAC"] = int(
                    (1 - sum(record.aaf)) * (2 * record.num_called))
            else:
                record.INFO["HET"] = -1
                record.INFO["HWEP"] = -1
                record.INFO["AC"] = [0] * len(record.ALT)
                record.INFO["REFAC"] = 0
            # Recalc filter
            if record.FILTER is None and not args.drop_filtered:
                record.FILTER = "PASS"
                loc_info["PASS"] += 1
                loc_info["totalcalls"] += record.num_called
            # Output the record
            outvcf.write_record(record)

    # Output log info
    WriteSampLog(sample_info, all_reasons, args.out + ".samplog.tab")
    WriteLocLog(loc_info, args.out + ".loclog.tab")
    return 0
def main():
    """Command-line entry point: run a chain of filters over a VCF stream.

    Filters are discovered from the ``vcf.filters`` entry-point group and,
    optionally, from every class in a local script.  The remaining command
    line is then consumed filter by filter, each filter parsing only its own
    arguments.  Records are read from ``args.input`` and written to
    ``args.output``.
    """
    # dynamically build the list of available filters
    filters = {}

    # parse command line args
    # (mainly because of local_script)
    parser = create_core_parser()
    (args, unknown_args) = parser.parse_known_args()

    # add filter to dictionary, extend help message
    # with help/arguments of each filter
    def addfilt(filt):
        filters[filt.name] = filt
        arg_group = parser.add_argument_group(filt.name, filt.__doc__)
        filt.customize_parser(arg_group)

    # look for global extensions
    for p in pkg_resources.iter_entry_points('vcf.filters'):
        addfilt(p.load())

    # add all classes from local script, if present
    if args.local_script is not None:
        import inspect
        import os
        sys.path.insert(0, os.getcwd())
        # Strip only a trailing ".py"; a blanket replace() would also mangle
        # names that merely contain ".py" somewhere else.
        module_name = args.local_script
        if module_name.endswith('.py'):
            module_name = module_name[:-len('.py')]
        mod = __import__(module_name)
        for name, cls in inspect.getmembers(mod, inspect.isclass):
            addfilt(cls)

    # go through the filters on the command line
    # one by one, trying to consume only the declared arguments
    used_filters = []
    while args.rest:
        filter_name = args.rest.pop(0)
        if filter_name not in filters:
            sys.exit("%s is not a known filter (%s)" % (filter_name, str(filters.keys())))

        # create a parser only for arguments of current filter
        filt_parser = create_filt_parser(filter_name)
        filters[filter_name].customize_parser(filt_parser)
        (known_filt_args, unknown_filt_args) = filt_parser.parse_known_args(args.rest)
        if unknown_filt_args:
            sys.exit("%s has no arguments like %s" % (filter_name, unknown_filt_args))

        used_filters.append((filter_name, known_filt_args))
        # whatever this filter did not consume belongs to the next one
        args.rest = known_filt_args.rest

    # print help using the 'help' parser, so it includes
    # all possible filters and arguments
    if args.help or not used_filters or args.input is None:
        parser.print_help()
        parser.exit()

    inp = vcf.Reader(args.input)

    # build filter chain
    chain = []
    for (name, filter_args) in used_filters:
        f = filters[name](filter_args)
        chain.append(f)
        # add a filter record to the output
        short_doc = (f.__doc__ or '').split('\n')[0].lstrip()
        inp.filters[f.filter_name()] = _Filter(f.filter_name(), short_doc)

    # output must be created after all the filter records have been added
    output = vcf.Writer(args.output, inp)

    # apply filters
    short_circuit = not args.no_short_circuit
    drop_filtered = args.no_filtered

    for record in inp:
        output_record = True
        for filt in chain:
            if filt(record) is None:
                continue

            # save some work by skipping the rest of the code
            if drop_filtered:
                output_record = False
                break

            record.add_filter(filt.filter_name())
            if short_circuit:
                break

        if output_record:
            # use PASS only if other filter names appear in the FILTER column
            # FIXME: is this good idea?
            if record.FILTER is None and not drop_filtered:
                record.FILTER = 'PASS'
            output.write_record(record)
def main():
    """Command-line entry point: run a chain of filters over a VCF stream.

    Same flow as a plain filter runner, plus an optional remote-debugging
    hook: with ``args.debug`` set, a pydevd trace session is opened to the
    host named by the ``USERIP`` environment variable before any work starts.
    Filters come from the ``vcf.filters`` entry-point group and, optionally,
    from every class in a local script.
    """
    # dynamically build the list of available filters
    filters = {}

    # parse command line args
    # (mainly because of local_script)
    parser = create_core_parser()
    (args, unknown_args) = parser.parse_known_args()

    # Enable remote debugging
    if args.debug:
        user_ip = environ['USERIP']
        pydevd.settrace(user_ip, port=58484, stdoutToServer=True, stderrToServer=True)

    # add filter to dictionary, extend help message
    # with help/arguments of each filter
    def addfilt(filt):
        filters[filt.name] = filt
        arg_group = parser.add_argument_group(filt.name, filt.__doc__)
        filt.customize_parser(arg_group)

    # look for global extensions
    for p in pkg_resources.iter_entry_points('vcf.filters'):
        addfilt(p.load())

    # add all classes from local script, if present
    if args.local_script is not None:
        import inspect
        import os
        sys.path.insert(0, os.getcwd())
        # Strip only a trailing ".py"; a blanket replace() would also mangle
        # names that merely contain ".py" somewhere else.
        module_name = args.local_script
        if module_name.endswith('.py'):
            module_name = module_name[:-len('.py')]
        mod = __import__(module_name)
        for name, cls in inspect.getmembers(mod, inspect.isclass):
            addfilt(cls)

    # go through the filters on the command line
    # one by one, trying to consume only the declared arguments
    used_filters = []
    while args.rest:
        filter_name = args.rest.pop(0)
        if filter_name not in filters:
            sys.exit("%s is not a known filter (%s)" % (filter_name, str(filters.keys())))

        # create a parser only for arguments of current filter
        filt_parser = create_filt_parser(filter_name)
        filters[filter_name].customize_parser(filt_parser)
        (known_filt_args, unknown_filt_args) = filt_parser.parse_known_args(args.rest)
        if unknown_filt_args:
            sys.exit("%s has no arguments like %s" % (filter_name, unknown_filt_args))

        used_filters.append((filter_name, known_filt_args))
        # whatever this filter did not consume belongs to the next one
        args.rest = known_filt_args.rest

    # print help using the 'help' parser, so it includes
    # all possible filters and arguments
    if args.help or not used_filters or args.input is None:
        parser.print_help()
        parser.exit()

    inp = vcf.Reader(args.input)

    # build filter chain
    chain = []
    for (name, filter_args) in used_filters:
        f = filters[name](filter_args)
        chain.append(f)
        # add a filter record to the output
        short_doc = (f.__doc__ or '').split('\n')[0].lstrip()
        inp.filters[f.filter_name()] = _Filter(f.filter_name(), short_doc)

    # output must be created after all the filter records have been added
    output = vcf.Writer(args.output, inp)

    # apply filters
    short_circuit = not args.no_short_circuit
    drop_filtered = args.no_filtered

    for record in inp:
        output_record = True
        for filt in chain:
            if filt(record) is None:
                continue

            # save some work by skipping the rest of the code
            if drop_filtered:
                output_record = False
                break

            record.add_filter(filt.filter_name())
            if short_circuit:
                break

        if output_record:
            # use PASS only if other filter names appear in the FILTER column
            # FIXME: is this good idea?
            if record.FILTER is None and not drop_filtered:
                record.FILTER = 'PASS'
            output.write_record(record)
def filter_variants(self, keep_only_snps=False, only_good=False):
    """Filter the VCF records, padding gaps between positions.

    Every position from 1 up to each record's POS is visited per chromosome;
    positions missing from the input get a synthetic ``./.`` record.  Each
    record is passed through ``self._filter_record`` and may be appended to
    ``self._variants``.

    Parameters
    ----------
    keep_only_snps: bool, optional
        Retain only SNP variants (default: False).
    only_good: bool, optional
        When True, records that carry a filter are not appended
        (default: False).

    Returns
    -------
    list
        The entries of ``self._variants`` whose FILTER is empty or None.
    """

    # Lazily open the input VCF the first time it is needed.
    if self._reader is None:
        # Create a reader class from input VCF.
        self._reader = vcf.Reader(filename=self.vcf_in)

    # get list of existing filters.
    # Maps the tuple of decoded filter keys -> filter ID found in the header.
    existing_filters = {}
    removed_filters = []

    for filter_id in self._reader.filters:
        conf = PHEFilterBase.decode(filter_id)
        # NOTE(review): the next statement discards its result and looks
        # like a leftover.
        tuple(conf.keys())
        existing_filters.update({tuple(conf.keys()):filter_id})

    # Add each filter we are going to use to the record.
    # This is needed for writing out proper #FILTER header in VCF.
    for record_filter in self.filters:
        # We know that each filter has short description method.
        short_doc = record_filter.short_desc()
        short_doc = short_doc.split('\n')[0].lstrip()

        filter_name = PHEFilterBase.decode(record_filter.filter_name())

        # Check if the sample has been filtered for this type of filter
        # in the past. If so remove it, because it is going to be refiltered.
        if tuple(filter_name) in existing_filters:
            logging.info("Removing existing filter: %s", existing_filters[tuple(filter_name)])
            removed_filters.append(existing_filters[tuple(filter_name)])
            del self._reader.filters[existing_filters[tuple(filter_name)]]

        self._reader.filters[record_filter.filter_name()] = _Filter(record_filter.filter_name(), short_doc)

    # Update the filters for output.
    self._update_filters(self._reader.filters)

    # Next position expected on the current chromosome, and that chromosome.
    _pos = 1
    _chrom = None

    # For each record (POSITION) apply set of filters.
    for record in self._reader:

        # A new chromosome restarts the position counter.
        if _chrom != record.CHROM:
            _pos, _chrom = 1, record.CHROM

        # Fill in any missing consecutive data with GT=./. records.
        while _pos <= record.POS:
            if _pos == record.POS:
                _record = record
            else:
                # This is a padding "N" record when records do not follow each other,
                # and there is a gap. e.g. 1,2,3,5,6 -> in 4 "N" will be inserted.
                _ref = self._get_reference_base(record.CHROM, _pos)
                _record = vcf.model._Record(record.CHROM, _pos, ".", _ref, [None], 0, [], {}, 'GT', None)

                # Give the padding record one ./. call per sample, in the
                # same sample order as the real record.
                _calls = []
                sorted_samples = sorted(record._sample_indexes.items(), key=operator.itemgetter(1))
                for sample, i in sorted_samples:
                    _data = make_calldata_tuple(["GT"])
                    _data._types = ["String"]
                    _data._nums = [1]
                    d = ["./."]
                    _calls.append(vcf.model._Call(_record, sample=sample, data=_data(*d)))

                _record.samples = _calls
                _record._sample_indexes = dict(sorted_samples)

            self._filter_record(_record, removed_filters)

            # After applying all filters, check if FILTER is None.
            # If it is, then record PASSED all filters.
            if _record.FILTER is None or _record.FILTER == []:
                # NOTE(review): this tests ``record`` (the real record that
                # ends the current gap), not ``_record`` -- confirm that is
                # intended for padding records.
                if not record.is_monomorphic:
                    _record.FILTER = []
                    if not keep_only_snps or (_record.is_snp and keep_only_snps):
                        self._variants.append(_record)
            elif not only_good:
                # Record carries a filter; keep it unless only passing
                # records were requested.
                self._variants.append(_record)

            _pos += 1

        # NOTE(review): _chrom is always assigned at the top of the loop
        # body, so this branch appears unreachable.
        if _chrom is None:
            _chrom = record.CHROM

    return [ variant for variant in self._variants if not variant.FILTER]
def main():
    """Compare proband genotypes with a pooled sample from one multi-sample VCF.

    For every record the alleles seen in the probands are compared with the
    alleles seen in the pool (optionally requiring a minimum read support in
    the pool).  Outputs:

    * a CSV (``args.out_csv``) with one row per variant and proband,
    * a copy of the input VCF (``args.out_vcf``) in which records whose
      proband alleles are all recovered in the pool get FILTER ``InPool``,
    * one VCF per proband holding the variants NOT recovered in the pool.

    All output files are closed when the function returns or raises.
    """
    from contextlib import ExitStack

    # Parse command line arguments
    args = parse_args()
    vcf_file = args.in_vcf
    outfile = args.out_csv
    out_vcf = args.out_vcf
    report_FPs = args.falsepos
    tech_variation = args.tech_variation
    use_read_filter = args.filter_reads or args.ploidy_filter

    with ExitStack() as stack:
        outstream = stack.enter_context(open(outfile, 'w'))

        def write_row(fields):
            # One CSV line; values are stringified as-is (no quoting).
            outstream.write(','.join([str(x) for x in fields]) + '\n')

        # Write header
        outstream.write(('variant,nonref_alleles_pool,total_alleles_pool,'
                         'nonref_alleles_probands,total_alleles_probands,'
                         'nonref_reads_pool,total_reads_pool,nonref_reads_probands,'
                         'recovered_all,falsepos,QD,AF_EXOMESgnomad,AF_GENOMESgnomad,'
                         'proband,recovered_in_proband,GT_pool\n'))

        this_vcf = stack.enter_context(open(vcf_file, 'r'))
        vcf_reader = vcf.Reader(this_vcf)
        # Add an additional filter that will be inherited by the vcf writer
        vcf_reader.filters['InPool'] = _Filter('InPool',
            'All alleles found in the probands are also found in the pool.')
        # Create vcf writer based on the header from the input vcf
        vcf_writer = vcf.Writer(stack.enter_context(open(out_vcf, 'w')), vcf_reader)

        # Fetch sample names
        all_vcf_samples = vcf_reader.samples
        pool_name = check_pool_name(args.pool, all_vcf_samples)
        proband_names = check_proband_names(args.probands, all_vcf_samples)
        sys.stderr.write('Pool name: {0}\nProband names: {1}\n'.format(pool_name,
            ', '.join(proband_names)))
        pool_pos = all_vcf_samples.index(pool_name)
        probands_pos = [all_vcf_samples.index(proband) for proband in proband_names]

        # Create a vcf writer for each proband
        probandVCF_dict = {}
        for proband in proband_names:
            # Can't deepcopy vcf reader object, so editing it and returning it
            # to previous state
            vcf_reader.samples = [proband]
            proband_out_vcf = proband + args.out_proband_vcf
            probandVCF_dict[proband] = vcf.Writer(
                stack.enter_context(open(proband_out_vcf, 'w')), vcf_reader)
        vcf_reader.samples = all_vcf_samples

        for record in vcf_reader:
            pool_call = record.samples[pool_pos]

            # Extract gnomad allele frequency data if available
            AF_EXOMESgnomad = extract_record_info(record, 'AF_EXOMESgnomad')
            AF_GENOMESgnomad = extract_record_info(record, 'AF_GENOMESgnomad')

            var_id = variant_id(record)
            nonref_alleles_pool, total_alleles_pool = count_nonref_alleles(pool_call['GT'])
            qual = record.QUAL
            # Records without an INFO/DP entry get 'NA' instead of aborting
            # the whole run.
            try:
                QD = qual / record.INFO['DP']
            except KeyError:
                QD = 'NA'
            nonref_reads_pool = count_nonref_reads(pool_call)
            total_reads_pool = pool_call['DP']
            nonref_reads_probands = 0
            for proband_pos in probands_pos:
                nonref_reads_probands += count_nonref_reads(record.samples[proband_pos])
            GT_pool = pool_call['GT']

            alleles_in_pool = get_nonref_alleles(pool_call['GT'])
            alleles_in_probands = set.union(
                *[get_nonref_alleles(record.samples[pos]['GT']) for pos in probands_pos])

            # Calculate a minimum read filter based on the filter_reads or
            # ploidy_filter arguments, if given; otherwise compare against
            # the pool genotype directly.
            if use_read_filter:
                min_read_filter = set_read_filter(total_reads_pool, args.filter_reads,
                    args.ploidy_filter, tech_variation)
                pool_alleles_used = set(alleles_supported(record, pool_pos,
                    min_read_filter, include_ref = False))
            else:
                pool_alleles_used = alleles_in_pool

            # Filter if all the variants found in the probands are also found
            # in the pool
            filtered = 'TRUE' if is_recovered(alleles_in_probands, pool_alleles_used) else 'FALSE'
            # likely false positive if found in the pool but not in any of the
            # probands
            falsepos = 'TRUE' if len(pool_alleles_used - alleles_in_probands) > 0 else 'FALSE'

            if filtered == 'TRUE':
                record.FILTER = 'InPool'

            # Count nonref alleles and total alleles in probands
            nonref_alleles_probands = 0
            total_alleles_probands = 0
            for proband_pos in probands_pos:
                nonref, total = count_nonref_alleles(record.samples[proband_pos]['GT'])
                nonref_alleles_probands += nonref
                total_alleles_probands += total

            # Write a filtered vcf for each proband
            for proband_pos in probands_pos:
                proband = all_vcf_samples[proband_pos]
                nonref, total = count_nonref_alleles(record.samples[proband_pos]['GT'])
                # Skip variant if this individual has no non-ref alleles
                # e.g. GT is ./. or 0/0
                if nonref == 0:
                    continue
                # Check if variant is recovered for this proband specifically
                alleles_this_proband = get_nonref_alleles(record.samples[proband_pos]['GT'])
                if is_recovered(alleles_this_proband, pool_alleles_used):
                    recovered_proband = 'TRUE'
                else:
                    recovered_proband = 'FALSE'
                    # Write out the variant (GT for this sample only) to the
                    # vcf file for that proband only if the variant is not
                    # found in the parent pool
                    tmp_record = copy.deepcopy(record)
                    tmp_record.samples = [record.samples[proband_pos]]
                    probandVCF_dict[proband].write_record(tmp_record)

                write_row([var_id, nonref_alleles_pool,
                    total_alleles_pool, nonref_alleles_probands, total_alleles_probands,
                    nonref_reads_pool, total_reads_pool, nonref_reads_probands, filtered,
                    falsepos, QD, AF_EXOMESgnomad, AF_GENOMESgnomad, proband,
                    recovered_proband, GT_pool])

            # If none of the probands have any non-ref alleles at this locus
            # Still report it in the csv for false positives counts
            if report_FPs and nonref_alleles_probands == 0:
                write_row([var_id, nonref_alleles_pool,
                    total_alleles_pool, nonref_alleles_probands, total_alleles_probands,
                    nonref_reads_pool, total_reads_pool, nonref_reads_probands, filtered,
                    falsepos, QD, AF_EXOMESgnomad, AF_GENOMESgnomad, 'NA', 'NA', 'NA'])

            # Write all samples from all variants to VCF
            # includes FILTER InPool for variants where all alleles in
            # probands are recovered in pool
            vcf_writer.write_record(record)
filt.customize_parser(parser) filter_help += '\n %s:\t%s' % (filt.name, filt.description) parser.description += filter_help # parse command line args args = parser.parse_args() inp = vcf.Reader(file(args.input[0])) # build filter chain chain = [] for name in args.filters: f = filters[name](args) chain.append(f) inp.filters[f.filter_name()] = _Filter(f.filter_name(), f.description) oup = vcf.Writer(args.output, inp) # apply filters short_circuit = not args.no_short_circuit for record in inp: for filt in chain: result = filt(record) if result: record.add_filter(filt.filter_name()) if short_circuit: break if (not args.no_filtered) or (record.FILTER == '.'):
def main():
    """Compare per-proband VCFs against a pooled VCF.

    For each individual VCF whose sample ID appears in the pool
    specification, every variant is looked up in the set of pool variants.
    Outputs:

    * a CSV (``args.out_csv``) with one row per proband variant, plus --
      when ``args.falsepos`` is set -- one row per pool variant seen in no
      proband,
    * one VCF per proband (``<proband><args.suffix>``) in which records
      whose variants are all in the pool are either tagged ``InPool`` or,
      with ``args.exclude_filtered``, left out.

    Raises
    ------
    ValueError
        If the probands found do not match the pool specification.
    """
    # Parse command line arguments
    args = parse_args()
    individual_vcf_files = args.individual_vcfs
    pool_vcf_file = args.pool_vcf
    pool_spec_file = args.pool_specs
    outfile = args.out_csv
    out_vcf_suffix = args.suffix
    report_falsepos = args.falsepos
    output_filtered = not args.exclude_filtered
    split_vars = args.split

    probands_in_pool = parse_pool_specs(pool_spec_file)

    # The CSV is closed on every exit path, including the ValueError below.
    with open(outfile, 'w') as outstream:

        def write_row(fields):
            # One CSV line; values are stringified as-is (no quoting).
            outstream.write(','.join([str(x) for x in fields]) + '\n')

        # Write header
        outstream.write(('proband,variant,recovered_proband,falsepos,'
                         'QD,AF_EXOMESgnomad,nonref_alleles_proband,'
                         'total_alleles_proband,nonref_reads_proband,'
                         'position'
                         '\n'))

        # Parse vcfs for pools
        # Simply record which variants were found in the pool
        pool_vars = parse_pool_vcf(pool_vcf_file, split_vars)

        # Parse vcfs of individuals
        individual_vars = set()
        probands_found = []
        for vcf_file in individual_vcf_files:
            proband = sample_id_from_fname(vcf_file)
            if proband not in probands_in_pool:
                continue  # Skip any vcf files that don't match up with the pool specs
            probands_found.append(proband)

            with open(vcf_file, 'r') as this_vcf:
                vcf_reader = vcf.Reader(this_vcf)
                # Add an additional filter that will be inherited by the vcf writer
                vcf_reader.filters['InPool'] = _Filter(
                    'InPool',
                    'All alleles found in the probands are also found in the pool.'
                )
                with open(proband + out_vcf_suffix, 'w') as out_handle:
                    # Create vcf writer based on the header from the input vcf
                    vcf_writer = vcf.Writer(out_handle, vcf_reader)

                    for record in vcf_reader:
                        falsepos = 'FALSE'
                        qual = record.QUAL
                        try:
                            QD = qual / record.INFO['DP']
                        except KeyError:
                            QD = 'NA'
                        # Count alleles/reads supporting this variant
                        nonref_alleles_proband, total_alleles_proband = count_nonref_alleles(
                            record.samples[0]['GT'])
                        nonref_reads_proband = count_nonref_reads(record.samples[0])
                        position = variant_position(record)

                        variants, AF_EXOMESgnomad_all = get_variants_and_info(
                            record, 'AF_EXOMESgnomad', split_vars)
                        if len(AF_EXOMESgnomad_all) != len(variants):
                            if not AF_EXOMESgnomad_all == ['NA']:
                                sys.stderr.write((
                                    'WARNING: Number of variant allelese and gnomAD '
                                    'records do not match. Writing AF_EXOMESgnomad = NA '
                                    'for all. Variants: {} AF_EXOMESgnomad {} \n'
                                ).format(variants, AF_EXOMESgnomad_all))
                            AF_EXOMESgnomad_all = ['NA'] * len(variants)

                        all_variants_in_pool = True
                        for variant, AF_EXOMESgnomad in zip(variants, AF_EXOMESgnomad_all):
                            individual_vars.add(variant)
                            variant_in_pool = variant in pool_vars
                            # If any variant is not in the pool, then set to false
                            if not variant_in_pool:
                                all_variants_in_pool = False
                            variant_in_pool = R_bool(variant_in_pool)
                            write_row([
                                proband, variant, variant_in_pool, falsepos, QD,
                                AF_EXOMESgnomad, nonref_alleles_proband,
                                total_alleles_proband, nonref_reads_proband,
                                position
                            ])

                        # Either report variants as filtered in the VCF or skip
                        # them completely
                        if all_variants_in_pool:
                            if output_filtered:
                                record.FILTER = 'InPool'  # Set in_pool vcf filter
                                vcf_writer.write_record(record)
                        else:
                            vcf_writer.write_record(record)

        if set(probands_in_pool) != set(probands_found):
            raise ValueError(
                ('Based on --pool_specs, expecting VCFs for '
                 'the probands: {}, found VCFs for: {}. Please check that file '
                 'given for --pool_specs is correct and that all proband VCFs are '
                 'provided and named correctly.').format(sorted(probands_in_pool),
                                                         sorted(probands_found)))

        # If false positives required, go through pooled vcf again and report them
        # Can do this without looping through again?
        if report_falsepos:
            proband = 'NA'
            variant_in_pool = 'TRUE'
            falsepos = 'TRUE'
            QD = 'NA'
            AF_EXOMESgnomad = 'NA'
            nonref_alleles_proband = 'NA'
            total_alleles_proband = 'NA'
            nonref_reads_proband = 'NA'
            with open(pool_vcf_file, 'r') as this_vcf:
                for record in vcf.Reader(this_vcf):
                    # Position of THIS pool record; previously the value left
                    # over from the last proband record was written.
                    position = variant_position(record)
                    variants, AF_EXOMESgnomad_all = get_variants_and_info(
                        record, 'AF_EXOMESgnomad', split_vars)
                    for variant in variants:  # usually one, but could be multiple
                        if variant not in individual_vars:
                            write_row([
                                proband, variant, variant_in_pool, falsepos,
                                QD, AF_EXOMESgnomad, nonref_alleles_proband,
                                total_alleles_proband, nonref_reads_proband,
                                position
                            ])
def main():
    """Command-line entry point: run a chain of filters over a VCF file.

    Filter classes are collected from the ``vcf.filters`` module and,
    optionally, from a user script given as ``args.custom_filters``.  The
    remaining command line is consumed filter by filter, each filter parsing
    only its own arguments.  Records are read from ``args.input`` and written
    to the file named by ``args.output``, which is closed on exit.
    """
    # dynamically build the list of available filters
    filters = {}

    # parse command line args
    # (mainly because of custom_filters)
    parser = create_core_parser()
    (args, unknown_args) = parser.parse_known_args()

    # add filter to dictionary, extend help message
    # with help/arguments of each filter
    def addfilt(filt):
        filters[filt.name] = filt
        arg_group = parser.add_argument_group(filt.name, filt.__doc__)
        filt.customize_parser(arg_group)

    # Load predefined and local script filters
    filter_modules = [importlib.import_module('vcf.filters')]
    if args.custom_filters is not None:
        # NOTE(review): the ``imp`` module is deprecated and absent from
        # recent Python 3 releases; kept here because the file depends on it.
        filter_modules.append(imp.load_source('local_filters', args.custom_filters))
    for my_module in filter_modules:
        for name, filter_class in inspect.getmembers(my_module, inspect.isclass):
            addfilt(filter_class)

    # go through the filters on the command line
    # one by one, trying to consume only the declared arguments
    used_filters = []
    while args.rest:
        filter_name = args.rest.pop(0)
        if filter_name not in filters:
            sys.exit("%s is not a known filter (%s)" % (filter_name, str(filters.keys())))

        # create a parser only for arguments of current filter
        filt_parser = create_filt_parser(filter_name)
        filters[filter_name].customize_parser(filt_parser)
        (known_filt_args, unknown_filt_args) = filt_parser.parse_known_args(args.rest)
        if unknown_filt_args:
            sys.exit("%s has no arguments like %s" % (filter_name, unknown_filt_args))

        used_filters.append((filter_name, known_filt_args))
        # whatever this filter did not consume belongs to the next one
        args.rest = known_filt_args.rest

    # print help using the 'help' parser, so it includes
    # all possible filters and arguments
    if args.help or not used_filters or args.input is None:
        parser.print_help()
        parser.exit()

    inp = vcf.Reader(args.input)

    # build filter chain
    chain = []
    for (name, filter_args) in used_filters:
        f = filters[name](filter_args)
        chain.append(f)
        # add a filter record to the output
        short_doc = (f.__doc__ or '').split('\n')[0].lstrip()
        inp.filters[f.filter_name()] = _Filter(f.filter_name(), short_doc)

    short_circuit = not args.no_short_circuit
    drop_filtered = args.no_filtered

    # The output handle is closed (and flushed) even if a filter raises.
    with open(args.output, 'w') as out_handle:
        # output must be created after all the filter records have been added
        output = vcf.Writer(out_handle, inp, lineterminator='\n')

        # apply filters
        for record in inp:
            output_record = True
            for filt in chain:
                if filt(record) is None:
                    continue

                # save some work by skipping the rest of the code
                if drop_filtered:
                    output_record = False
                    break

                record.add_filter(filt.filter_name())
                if short_circuit:
                    break

            if output_record:
                # use PASS only if other filter names appear in the FILTER column
                # FIXME: is this good idea?
                if record.FILTER is None and not drop_filtered:
                    record.FILTER = 'PASS'
                output.write_record(record)