def score_entire_file(pcpath,vcfpath,weightpath,minac=2,maxac=2500,flag='',n_pcs=9,acfields=['AC']):
    '''
    Runs through an entire VCF and gives diversity scores for every allele in the
    specified AC range. Note that the acfields parameter is useful if you want
    to only consider variants that fall within a particular AC range *in certain
    population(s)* and the AC for these populations is specified in the INFO field,
    e.g. AC_AFR, AC_AMR, etc.

    For every alt allele whose summed INFO allele count lies in [minac, maxac],
    one tab-separated line is written to stdout: CHROM, minimal-representation
    POS/REF/ALT, the allele count observed in the genotypes, the mean distance
    returned by mean_euclid_dist, and `flag`.

    pcpath, n_pcs  -- passed to read_pcs()
    weightpath     -- passed to read_weights()
    vcfpath        -- VCF to scan; opened with gzip when the path ends in ".gz"
    acfields       -- INFO keys whose per-allele counts are summed (never mutated here)

    Raises AssertionError when acfields == ['AC'] and the INFO AC disagrees with
    the genotype-derived count. An AssertionError raised while scoring or printing
    an allele is reported on stderr and that allele is skipped.
    '''
    pcs = read_pcs(pcpath,n_pcs)
    weights = read_weights(weightpath)
    # open .vcf.gz file with gzip.open, otherwise just use open
    openfunc = gzip.open if vcfpath[-3:] == ".gz" else open
    # in theory PyVCF can accept just a path, but I found it only works with an fsock, hence the need for openfunc (above).
    # filename='ignore',compressed=False is a lousy hack to force vcf.Reader.__init__ to use the fsock and not the filename
    # __init__ func currently contains 2 lines:
    #     if filename is None and hasattr(fsock, 'name'):
    #         filename = fsock.name
    # which short-circuit my attempt to pass it an fsock; it then opens the file with open(filename,mode='rt') instead of
    # with gzip, and thus it crashes with this error:
    #     if mode[0:1] == 'r':
    #     TypeError: 'int' object is not subscriptable
    # by making the filename non-None, I prevent this; but if PyVCF is later refactored to use filename before fsock,
    # my code will break. hence I say it's a lousy hack.
    fsock = openfunc(vcfpath)
    try:
        vcf_reader = vcf.Reader(fsock,filename='ignore',compressed=False,strict_whitespace=True) # split only on tab, allow spaces in ids
        for record in vcf_reader: # iterate over every row of VCF
            # enumerate gives the position of *this* alt directly, instead of searching for it with ALT.index()
            for this_alt_allele_index, alt in enumerate(record.ALT): # index into comma-separated INFO fields
                this_alt_allele_number = this_alt_allele_index + 1 # for GT fields, need allele number: 1, 2, etc. remember REF allele is 0.
                # allele count as stated in the INFO field, summed over every requested AC field
                nominal_ac = sum(int(record.INFO[acfield][this_alt_allele_index]) for acfield in acfields)
                if nominal_ac < minac or nominal_ac > maxac:
                    continue
                samples_with_allele = []
                true_ac = 0
                for sample in record.samples:
                    if sample['GT'] is None:
                        # no-calls apparently come through as None instead of ./.
                        # if you call sample.gt_alleles on them, PyVCF tries to do None.split() and
                        # throws an Attribute Error. so just ignore these.
                        continue
                    # number of copies of this allele carried by this individual
                    copies = [int(allele) for allele in sample.gt_alleles].count(this_alt_allele_number)
                    if copies > 0:
                        samples_with_allele.append(sample.sample.replace(' ','_')) # grab sample id, and replace space with underscore
                        true_ac += copies # add this indiv's allele count to the running total
                if acfields == ['AC']:
                    # only if we are including AC from *all* individuals, we can spot check that the AC is correct.
                    assert true_ac == nominal_ac, "VCF has AC as %s, actual AC is %s.\nRecord is:\n%s"%(nominal_ac,true_ac,str(record))
                # we only care about the samples for which PCs are available, so take intersection of those two sets:
                samples_with_allele_in_pcs = list(set(samples_with_allele) & set(pcs.keys()))
                try:
                    meandist = mean_euclid_dist(samples_with_allele_in_pcs,pcs,weights,warn=False) # do not warn if samples are missing from pcs
                    minpos, minref, minalt = get_minimal_representation(record.POS,record.REF,str(alt))
                    sys.stdout.write("\t".join([record.CHROM,str(minpos),minref,minalt,str(true_ac),str(meandist),flag])+"\n")
                except AssertionError as e:
                    sys.stderr.write(str(e)+"\n")
                    continue
    finally:
        # the reader never closes the handle it was given, so release it here
        fsock.close()
def seek_out_variant(self, chrom, pos, ref, alt):
    """
    Look for a variant near `pos` and report which alt allele it is.

    The query and every alt allele of each record read are compared in minimal
    representation. Returns the 1-based allele number (position in the record's
    comma-separated ALT field, plus one) of the first match, or False when no
    record in the window matches.
    """
    pos = int(pos)
    wanted = get_minimal_representation(pos, ref, alt)
    # widen the window on both sides by the length of the longer allele
    pad = max(len(ref), len(alt))
    window = "%s:%s-%s" % (str(chrom), str(pos - pad), str(pos + pad))
    self.go_to(window)
    for record in self.read():
        record_pos = record['POS']
        record_ref = record['REF']
        # get min rep of alleles in full vcf so they correspond to alleles in sites table
        minimal_alts = []
        for record_alt in record['ALT'].split(","):
            minimal_alts.append(get_minimal_representation(record_pos, record_ref, record_alt))
        if wanted in minimal_alts:
            return minimal_alts.index(wanted) + 1
    return False
annotations = [dict(zip(vep_field_names, x.split('|'))) for x in info_field['CSQ'].split(',') if len(vep_field_names) == len(x.split('|'))] if args.lof_only: annotations = [x for x in annotations if x['LoF'] == 'HC'] if args.canonical_only: annotations = [x for x in annotations if x['CANONICAL'] == 'YES'] else: annotations = [] if args.lof_only and len(annotations) == 0: continue if args.canonical_only and len(annotations) == 0: continue if 'FORMAT' in header: format_fields_list = fields[header['FORMAT']].split(':') format_fields = dict(zip(format_fields_list, range(len(format_fields_list)))) for index, alt in enumerate(alts): # Get site data if not args.do_not_minrep and get_minimal_representation is not None: new_pos, new_ref, new_alt = get_minimal_representation(fields[header['POS']], fields[header['REF']], alt) if args.snps_only and (len(new_ref) != 1 or len(new_alt) != 1): continue output = [fields[header['CHROM']], str(new_pos), new_ref, new_alt] else: if args.snps_only and (len(fields[header['REF']]) != 1 or len(alt) != 1): continue output = [fields[header['CHROM']], fields[header['POS']], fields[header['REF']], alt] ucsc_link = raw_ucsc_link % (output[0], int(output[1]) - args.ucsc_link_window, int(output[1]) + args.ucsc_link_window) if args.add_ucsc_link: output.append(ucsc_link) if args.include_id: output.append(fields[header['ID']]) if args.original_position: output.append(fields[header['POS']]) if not args.omit_filter: output.append(fields[header['FILTER']]) # Get data from INFO field for info in desired_info: this_output = missing_string
def main(args):
    """
    Convert a VCF (plain, or gzipped when the name ends in '.gz') into a tab-delimited table.

    Attributes read from ``args``: vcf, output, info, vep_info, simplify, max_csq,
    lof_only, collapse_annotations, mysql, include_id, omit_filter, options,
    preserve_multiallelic, simplify_gtex.

    When ``args.output`` is None the output path is the input path with its last
    '.vcf' replaced by '.table'; ``args.output`` is updated in place.

    By default every alt allele gets its own row (INFO fields declared Number=A are
    split per allele, VEP annotations are filtered by ALLELE_NUM); with
    ``args.preserve_multiallelic`` each VCF line yields exactly one row.

    Progress and warnings go to stderr. Calls sys.exit(1) on unusable input or
    options and sys.exit(0) after listing the available fields for ``args.options``.
    """
    def log(message):
        # write() rather than the print statement: same bytes on stderr, and the block parses on Python 2 and 3
        sys.stderr.write(message + '\n')

    # Read parameters
    f = gzip.open(args.vcf) if args.vcf.endswith('.gz') else open(args.vcf)
    if args.output is None:
        # args.output is None in this branch, so the name has to be derived from the input path
        args.output = '.table'.join(args.vcf.rsplit('.vcf', 1))
        if args.output == args.vcf:
            log("VCF filename has no '.vcf' and no output file name was provided. Exiting.")
            sys.exit(1)
    g = gzip.open(args.output, 'w') if args.output.endswith('.gz') else open(args.output, 'w')
    desired_info = [] if args.info is None else args.info.split(',')
    desired_vep_info = [] if args.vep_info is None else args.vep_info.split(',')
    if args.simplify:
        args.max_csq = True
        args.lof_only = True
        args.collapse_annotations = True

    # backslash-N is the literal written for missing values when args.mysql is set
    missing_string = '\\N' if args.mysql else 'NA'

    header = None
    vep_field_names = None
    info_from_header = {}
    started = False

    # Column names are collected in a list and joined once, so no empty column appears
    # in the header when only VEP fields (and no INFO fields) were requested.
    output_columns = ['CHROM', 'POS', 'REF', 'ALT']
    if args.include_id: output_columns.append('ID')
    if not args.omit_filter: output_columns.append('FILTER')

    for line in f:
        line = line.strip()

        # Reading header lines to get VEP and individual arrays
        if line.startswith('#'):
            line = line.lstrip('#')
            if line.startswith('INFO=<ID='):
                try:
                    header_metadata = dict([x.split('=', 1) for x in line.split('<')[1].split('>')[0].split(',', 3) if '=' in x])
                except Exception:
                    log("Malformed header line: %s" % line)
                    sys.exit(1)
                info_from_header[header_metadata['ID']] = header_metadata
            if 'ID=CSQ' in line:
                vep_field_names = line.split('Format: ')[-1].strip('">').split('|')
                vep_info_from_header = dict(zip(vep_field_names, range(len(vep_field_names))))
            if line.startswith('CHROM'):
                header = line.split()
                header = dict(zip(header, range(len(header))))
                if args.options:
                    # all INFO declarations have been read by the time the column line is reached
                    log("######### OPTIONS FOR INFO #########")
                    for info in info_from_header:
                        log('%s\t%s' % (info, info_from_header[info]['Description']))
                    if vep_field_names is not None:
                        log("######### OPTIONS FOR VEP_INFO #########")
                        log('\n'.join(vep_field_names))
                    sys.exit(0)
            continue

        if len(desired_vep_info) > 0:
            if vep_field_names is None:
                log("VEP info requested, but VCF file does not have a VEP header line. Exiting.")
                sys.exit(1)
            if 'ALLELE_NUM' not in vep_info_from_header:
                log("VEP output does not have ALLELE_NUM which is required for extraction. Please re-run VEP with --allele_number. Exiting.")
                sys.exit(1)
        if header is None:
            log("VCF file does not have a header line (CHROM POS etc.). Exiting.")
            sys.exit(1)

        if not started:
            # First data line: reconcile the requested fields with the header, then emit the table header.
            # Allowing entries even if not found in the header line, with some caveats
            original_desired_info = copy.deepcopy(desired_info)
            desired_info = []
            any_missing = 0
            for info in original_desired_info:
                if info in info_from_header:
                    log('SUCCESS: Found %s: %s' % (info, info_from_header[info]['Description']))
                    desired_info.append(info)
                else:
                    # treat the request as a regular expression over the declared INFO ids
                    matches = 0
                    for header_record in info_from_header:
                        if re.search('^%s$' % info, header_record):
                            matches += 1
                            desired_info.append(header_record)
                            log('SUCCESS: Found %s (matching %s): %s' % (header_record, info, info_from_header[header_record]['Description']))
                    if not matches:
                        log('WARNING: Did not find %s in header.' % info)
                        any_missing += 1
                        desired_info.append(info)

            # Only allowing entries in VEP header.
            original_desired_vep_info = copy.deepcopy(desired_vep_info)
            desired_vep_info = []
            for info in original_desired_vep_info:
                if info in vep_info_from_header:
                    desired_vep_info.append(info)
                    log('SUCCESS: Found %s' % info)
                else:
                    log('WARNING: Did not find %s in VEP header. Not including from here on out.' % info)

            # Warnings/errors for missing data
            if any_missing:
                log('WARNING: At least one INFO line requested was not found. Continuing, but results may be off.')
            if len(desired_info) + len(desired_vep_info) == 0:
                log('No fields left in requested info/VEP info. Exiting.')
                sys.exit(1)
            if args.lof_only and 'LoF' not in desired_vep_info:
                log('--lof_only was used, but no LoF tag found in VEP field. Exiting.')
                sys.exit(1)

            # Ready to go.
            g.write('\t'.join(output_columns + desired_info + desired_vep_info) + '\n')
            started = True

        # Pull out annotation info from INFO and ALT fields
        fields = line.split('\t')
        # flags (entries without '=') are stored under their own name
        info_field = dict([(x.split('=', 1)) if '=' in x else (x, x) for x in re.split(r';(?=\w)', fields[header['INFO']])])

        # Only get VEP info if requested
        if len(desired_vep_info) > 0:
            if 'CSQ' in info_field:
                # if statement here is a fix for VEP's occasional introduction of a semi-colon into the CSQ.
                # Can be removed once that is completely fixed.
                annotations = [dict(zip(vep_field_names, x.split('|'))) for x in info_field['CSQ'].split(',') if len(vep_field_names) == len(x.split('|'))]
                if args.lof_only: annotations = [x for x in annotations if x['LoF'] == 'HC']
            else:
                annotations = []
            if args.lof_only and len(annotations) == 0: continue

        alts = fields[header['ALT']].split(',')
        # Default is split line into all alternate alleles
        if not args.preserve_multiallelic:
            for index, alt in enumerate(alts):
                if get_minimal_representation is not None:
                    new_pos, new_ref, new_alt = get_minimal_representation(fields[header['POS']], fields[header['REF']], alt)
                    output = [fields[header['CHROM']], str(new_pos), new_ref, new_alt]
                else:
                    output = [fields[header['CHROM']], fields[header['POS']], fields[header['REF']], alt]
                if args.include_id: output.append(fields[header['ID']])
                if not args.omit_filter: output.append(fields[header['FILTER']])
                # Get info and VEP info
                for info in desired_info:
                    if info in info_field:
                        if info in info_from_header and 'Number' in info_from_header[info] and info_from_header[info]['Number'] == 'A':
                            # one value per alt allele: keep only this allele's value
                            output.append(info_field[info].split(',')[index])
                        else:
                            output.append(info_field[info])
                    else:
                        output.append(missing_string)
                if len(desired_vep_info) > 0:
                    # Filter to this allele
                    this_alt_annotations = [x for x in annotations if int(x['ALLELE_NUM']) - 1 == index]
                    if args.lof_only and len(this_alt_annotations) == 0: continue
                    for info in desired_vep_info:
                        this_alt_vep_info = [x[info] for x in this_alt_annotations if x[info] != '']
                        # Process options
                        if args.max_csq and info == 'Consequence':
                            this_alt_vep_info = [csq_max_vep(x) for x in this_alt_vep_info]
                        if args.simplify_gtex and info == 'TissueExpression':
                            # Converting from tissue1:value1&tissue2:value2 to [tissue1, tissue2]
                            this_alt_vep_info = set([y.split(':')[0] for x in this_alt_vep_info for y in x.split('&')])
                        if args.collapse_annotations:
                            this_alt_vep_info = set(this_alt_vep_info)
                            # Collapse consequence further
                            if args.max_csq and info == 'Consequence':
                                this_alt_vep_info = [csq_max(this_alt_vep_info)]
                        annotation_output = ','.join(this_alt_vep_info)
                        if annotation_output == '': annotation_output = missing_string
                        output.append(annotation_output)
                g.write('\t'.join(output) + '\n')
        else:
            output = [fields[header['CHROM']], fields[header['POS']], fields[header['REF']], fields[header['ALT']]]
            if args.include_id: output.append(fields[header['ID']])
            if not args.omit_filter: output.append(fields[header['FILTER']])
            for info in desired_info:
                if info in info_field:
                    output.append(info_field[info])
                else:
                    output.append(missing_string)
            if len(desired_vep_info) > 0:
                for info in desired_vep_info:
                    this_vep_info = [x[info] for x in annotations]
                    # Process options
                    if args.max_csq and info == 'Consequence':
                        # NOTE(review): the per-allele branch calls csq_max_vep(x) at this step; confirm csq_max is intended here
                        this_vep_info = [csq_max(x) for x in this_vep_info]
                    if args.simplify_gtex and info == 'TissueExpression':
                        this_vep_info = set([y.split(':')[0] for x in this_vep_info for y in x.split('&')])
                    if args.collapse_annotations:
                        this_vep_info = set(this_vep_info)
                        if args.max_csq and info == 'Consequence':
                            this_vep_info = [csq_max(this_vep_info)]
                    annotation_output = ','.join(this_vep_info)
                    if annotation_output == '': annotation_output = missing_string
                    output.append(annotation_output)
            g.write('\t'.join(output) + '\n')

    # flush and release both files instead of leaving that to interpreter shutdown
    f.close()
    g.close()
def _sample_format_columns(desired_sample_info, format_fields_list, fields, header, missing_string):
    """Return one value per requested SAMPLE.FORMAT entry for the current row (missing_string when absent)."""
    columns = []
    for sample_format in desired_sample_info:
        sample, fmt = sample_format.split('.')
        this_sample_format = dict(zip(format_fields_list, fields[header[sample]].split(':')))
        # always emit a value so every row has as many columns as the header
        columns.append(this_sample_format.get(fmt, missing_string))
    return columns


def main(args):
    """
    Convert a VCF (plain, or gzipped when the name ends in '.gz') into a tab-delimited table.

    Attributes read from ``args``: vcf, output, info, vep_info, sample_info, simplify,
    max_csq, lof_only, collapse_annotations, mysql, add_ucsc_link, ucsc_link_window,
    include_id, omit_filter, options, preserve_multiallelic, simplify_gtex.

    When ``args.output`` is None the output path is the input path with its last
    '.vcf' replaced by '.table'; ``args.output`` is updated in place.

    Columns are, in order: CHROM, POS, REF, ALT, optional UCSC link, optional ID,
    optional FILTER, the requested INFO fields, the requested VEP fields and the
    requested SAMPLE.FORMAT values. By default every alt allele gets its own row;
    with ``args.preserve_multiallelic`` each VCF line yields exactly one row.

    Progress and warnings go to stderr. Calls sys.exit(1) on unusable input or
    options and sys.exit(0) after listing the available fields for ``args.options``.
    """
    def log(message):
        # write() rather than the print statement: same bytes on stderr, and the block parses on Python 2 and 3
        sys.stderr.write(message + '\n')

    # Read parameters
    f = gzip.open(args.vcf) if args.vcf.endswith('.gz') else open(args.vcf)
    if args.output is None:
        args.output = '.table'.join(args.vcf.rsplit('.vcf', 1))
        if args.output == args.vcf:
            log("VCF filename has no '.vcf' and no output file name was provided. Exiting.")
            sys.exit(1)
    g = gzip.open(args.output, 'w') if args.output.endswith('.gz') else open(args.output, 'w')
    desired_info = [] if args.info is None else args.info.split(',')
    desired_vep_info = [] if args.vep_info is None else args.vep_info.split(',')
    desired_sample_info = [] if args.sample_info is None else args.sample_info.split(',')
    if args.simplify:
        args.max_csq = True
        args.lof_only = True
        args.collapse_annotations = True

    # backslash-N is the literal written for missing values when args.mysql is set
    missing_string = '\\N' if args.mysql else 'NA'

    header = None
    vep_field_names = None
    info_from_header = {}
    started = False

    # Column names are collected in a list and joined once, so no empty column appears
    # in the header when no INFO fields were requested but VEP or sample fields were.
    output_columns = ['CHROM', 'POS', 'REF', 'ALT']
    if args.add_ucsc_link: output_columns.append('UCSC')
    if args.include_id: output_columns.append('ID')
    if not args.omit_filter: output_columns.append('FILTER')

    raw_ucsc_link = 'http://genome.ucsc.edu/cgi-bin/hgTracks?db=hg19&position=chr%s:%s-%s'

    for line in f:
        line = line.strip()

        # Reading header lines to get VEP and individual arrays
        if line.startswith('#'):
            line = line.lstrip('#')
            if line.startswith('INFO=<ID='):
                try:
                    header_metadata = dict([x.split('=', 1) for x in line.split('<')[1].split('>')[0].split(',', 3) if '=' in x])
                except Exception:
                    log("Malformed header line: %s" % line)
                    sys.exit(1)
                info_from_header[header_metadata['ID']] = header_metadata
            if 'ID=CSQ' in line:
                vep_field_names = line.split('Format: ')[-1].strip('">').split('|')
                vep_info_from_header = dict(zip(vep_field_names, range(len(vep_field_names))))
            if line.startswith('CHROM'):
                header = line.split()
                header = dict(zip(header, range(len(header))))
                if args.options:
                    # all INFO declarations have been read by the time the column line is reached
                    log("######### OPTIONS FOR INFO #########")
                    for info in info_from_header:
                        log('%s\t%s' % (info, info_from_header[info]['Description']))
                    if vep_field_names is not None:
                        log("######### OPTIONS FOR VEP_INFO #########")
                        log('\n'.join(vep_field_names))
                    sys.exit(0)
            continue

        if len(desired_vep_info) > 0:
            if vep_field_names is None:
                log("VEP info requested, but VCF file does not have a VEP header line. Exiting.")
                sys.exit(1)
            if 'ALLELE_NUM' not in vep_info_from_header:
                log("VEP output does not have ALLELE_NUM which is required for extraction. Please re-run VEP with --allele_number. Exiting.")
                sys.exit(1)
        if header is None:
            log("VCF file does not have a header line (CHROM POS etc.). Exiting.")
            sys.exit(1)

        if not started:
            # First data line: reconcile the requested fields with the header, then emit the table header.
            # Allowing entries even if not found in the header line, with some caveats
            original_desired_info = copy.deepcopy(desired_info)
            desired_info = []
            any_missing = 0
            for info in original_desired_info:
                if info in info_from_header:
                    log('SUCCESS: Found %s: %s' % (info, info_from_header[info]['Description']))
                    desired_info.append(info)
                else:
                    # treat the request as a regular expression over the declared INFO ids
                    matches = 0
                    for header_record in info_from_header:
                        if re.search('^%s$' % info, header_record):
                            matches += 1
                            desired_info.append(header_record)
                            log('SUCCESS: Found %s (matching %s): %s' % (header_record, info, info_from_header[header_record]['Description']))
                    if not matches:
                        log('WARNING: Did not find %s in header.' % info)
                        any_missing += 1
                        desired_info.append(info)

            # Only allowing entries in VEP header.
            original_desired_vep_info = copy.deepcopy(desired_vep_info)
            desired_vep_info = []
            for info in original_desired_vep_info:
                if info in vep_info_from_header:
                    desired_vep_info.append(info)
                    log('SUCCESS: Found %s' % info)
                else:
                    log('WARNING: Did not find %s in VEP header. Not including from here on out.' % info)

            # Getting info from individuals
            original_desired_sample_info = copy.deepcopy(desired_sample_info)
            desired_sample_info = []
            if len(original_desired_sample_info) > 0:
                if 'FORMAT' not in header:
                    log('WARNING: Did not find FORMAT in header line, will not be extracting any SAMPLE.FORMATs')
                else:
                    for info in original_desired_sample_info:
                        sample_format = info.split('.')
                        if len(sample_format) != 2:
                            log('WARNING: %s is not a SAMPLE.FORMAT designation' % sample_format)
                        else:
                            sample = sample_format[0]
                            if sample in header:
                                log('SUCCESS: Found sample %s in header' % sample)
                                desired_sample_info.append(info)
                            else:
                                log('WARNING: Sample %s not found in header' % sample)

            # Warnings/errors for missing data
            if any_missing:
                log('WARNING: At least one INFO line requested was not found. Continuing, but results may be off.')
            if len(desired_info) + len(desired_vep_info) + len(desired_sample_info) == 0:
                log('No fields left in requested info/VEP info. Exiting.')
                sys.exit(1)
            if args.lof_only and 'LoF' not in desired_vep_info:
                log('--lof_only was used, but no LoF tag found in VEP field. Exiting.')
                sys.exit(1)

            # Ready to go.
            g.write('\t'.join(output_columns + desired_info + desired_vep_info + desired_sample_info) + '\n')
            started = True

        # Pull out annotation info from INFO and ALT fields
        fields = line.split('\t')
        # flags (entries without '=') are stored under their own name
        info_field = dict([(x.split('=', 1)) if '=' in x else (x, x) for x in re.split(r';(?=\w)', fields[header['INFO']])])

        # Only get VEP info if requested
        if len(desired_vep_info) > 0:
            if 'CSQ' in info_field:
                # if statement here is a fix for VEP's occasional introduction of a semi-colon into the CSQ.
                # Can be removed once that is completely fixed.
                annotations = [dict(zip(vep_field_names, x.split('|'))) for x in info_field['CSQ'].split(',') if len(vep_field_names) == len(x.split('|'))]
                if args.lof_only: annotations = [x for x in annotations if x['LoF'] == 'HC']
            else:
                annotations = []
            if args.lof_only and len(annotations) == 0: continue

        alts = fields[header['ALT']].split(',')
        # Sites-only VCFs have no FORMAT column; no sample fields can have been accepted in that case.
        if 'FORMAT' in header:
            format_fields_list = fields[header['FORMAT']].split(':')
        else:
            format_fields_list = []

        # Default is split line into all alternate alleles
        if not args.preserve_multiallelic:
            for index, alt in enumerate(alts):
                if get_minimal_representation is not None:
                    new_pos, new_ref, new_alt = get_minimal_representation(fields[header['POS']], fields[header['REF']], alt)
                    output = [fields[header['CHROM']], str(new_pos), new_ref, new_alt]
                else:
                    output = [fields[header['CHROM']], fields[header['POS']], fields[header['REF']], alt]
                if args.add_ucsc_link:
                    # window of args.ucsc_link_window bases on either side of the reported position
                    output.append(raw_ucsc_link % (output[0], int(output[1]) - args.ucsc_link_window, int(output[1]) + args.ucsc_link_window))
                if args.include_id: output.append(fields[header['ID']])
                if not args.omit_filter: output.append(fields[header['FILTER']])
                # Get info and VEP info
                for info in desired_info:
                    if info in info_field:
                        if info in info_from_header and 'Number' in info_from_header[info] and info_from_header[info]['Number'] == 'A':
                            # one value per alt allele: keep only this allele's value
                            output.append(info_field[info].split(',')[index])
                        else:
                            output.append(info_field[info])
                    else:
                        output.append(missing_string)
                if len(desired_vep_info) > 0:
                    # Filter to this allele
                    this_alt_annotations = [x for x in annotations if int(x['ALLELE_NUM']) - 1 == index]
                    if args.lof_only and len(this_alt_annotations) == 0: continue
                    for info in desired_vep_info:
                        this_alt_vep_info = [x[info] for x in this_alt_annotations if x[info] != '']
                        # Process options
                        if args.max_csq and info == 'Consequence':
                            this_alt_vep_info = [csq_max_vep(x) for x in this_alt_vep_info]
                        if args.simplify_gtex and info == 'TissueExpression':
                            # Converting from tissue1:value1&tissue2:value2 to [tissue1, tissue2]
                            this_alt_vep_info = set([y.split(':')[0] for x in this_alt_vep_info for y in x.split('&')])
                        if args.collapse_annotations:
                            this_alt_vep_info = set(this_alt_vep_info)
                            # Collapse consequence further
                            if args.max_csq and info == 'Consequence':
                                this_alt_vep_info = [csq_max(this_alt_vep_info)]
                        annotation_output = ','.join(this_alt_vep_info)
                        if annotation_output == '': annotation_output = missing_string
                        output.append(annotation_output)
                output.extend(_sample_format_columns(desired_sample_info, format_fields_list, fields, header, missing_string))
                g.write('\t'.join(output) + '\n')
        else:
            output = [fields[header['CHROM']], fields[header['POS']], fields[header['REF']], fields[header['ALT']]]
            if args.add_ucsc_link:
                output.append(raw_ucsc_link % (output[0], int(output[1]) - args.ucsc_link_window, int(output[1]) + args.ucsc_link_window))
            if args.include_id: output.append(fields[header['ID']])
            if not args.omit_filter: output.append(fields[header['FILTER']])
            for info in desired_info:
                if info in info_field:
                    output.append(info_field[info])
                else:
                    output.append(missing_string)
            if len(desired_vep_info) > 0:
                for info in desired_vep_info:
                    this_vep_info = [x[info] for x in annotations]
                    # Process options
                    if args.max_csq and info == 'Consequence':
                        # NOTE(review): the per-allele branch calls csq_max_vep(x) at this step; confirm csq_max is intended here
                        this_vep_info = [csq_max(x) for x in this_vep_info]
                    if args.simplify_gtex and info == 'TissueExpression':
                        this_vep_info = set([y.split(':')[0] for x in this_vep_info for y in x.split('&')])
                    if args.collapse_annotations:
                        this_vep_info = set(this_vep_info)
                        if args.max_csq and info == 'Consequence':
                            this_vep_info = [csq_max(this_vep_info)]
                    annotation_output = ','.join(this_vep_info)
                    if annotation_output == '': annotation_output = missing_string
                    output.append(annotation_output)
            output.extend(_sample_format_columns(desired_sample_info, format_fields_list, fields, header, missing_string))
            g.write('\t'.join(output) + '\n')

    # flush and release both files instead of leaving that to interpreter shutdown
    f.close()
    g.close()
def score_entire_file(pcpath, vcfpath, weightpath, minac=2, maxac=2500, flag='', n_pcs=9, acfields=['AC']):
    '''
    Runs through an entire VCF and gives diversity scores for every allele in the
    specified AC range. Note that the acfields parameter is useful if you want
    to only consider variants that fall within a particular AC range *in certain
    population(s)* and the AC for these populations is specified in the INFO field,
    e.g. AC_AFR, AC_AMR, etc.

    For every alt allele whose summed INFO allele count lies in [minac, maxac],
    one tab-separated line is written to stdout: CHROM, minimal-representation
    POS/REF/ALT, the allele count observed in the genotypes, the mean distance
    returned by mean_euclid_dist, and `flag`.

    pcpath, n_pcs  -- passed to read_pcs()
    weightpath     -- passed to read_weights()
    vcfpath        -- VCF to scan; opened with gzip when the path ends in ".gz"
    acfields       -- INFO keys whose per-allele counts are summed (never mutated here)

    Raises AssertionError when acfields == ['AC'] and the INFO AC disagrees with
    the genotype-derived count. An AssertionError raised while scoring or printing
    an allele is reported on stderr and that allele is skipped.
    '''
    pcs = read_pcs(pcpath, n_pcs)
    weights = read_weights(weightpath)
    # open .vcf.gz file with gzip.open, otherwise just use open
    openfunc = gzip.open if vcfpath[-3:] == ".gz" else open
    # in theory PyVCF can accept just a path, but I found it only works with an fsock, hence the need for openfunc (above).
    # filename='ignore',compressed=False is a lousy hack to force vcf.Reader.__init__ to use the fsock and not the filename
    # __init__ func currently contains 2 lines:
    #     if filename is None and hasattr(fsock, 'name'):
    #         filename = fsock.name
    # which short-circuit my attempt to pass it an fsock; it then opens the file with open(filename,mode='rt') instead of
    # with gzip, and thus it crashes with this error:
    #     if mode[0:1] == 'r':
    #     TypeError: 'int' object is not subscriptable
    # by making the filename non-None, I prevent this; but if PyVCF is later refactored to use filename before fsock,
    # my code will break. hence I say it's a lousy hack.
    fsock = openfunc(vcfpath)
    try:
        vcf_reader = vcf.Reader(
            fsock,
            filename='ignore',
            compressed=False,
            strict_whitespace=True)  # split only on tab, allow spaces in ids
        for record in vcf_reader:  # iterate over every row of VCF
            # enumerate gives the position of *this* alt directly, instead of searching for it with ALT.index()
            for this_alt_allele_index, alt in enumerate(record.ALT):  # index into comma-separated INFO fields
                # for GT fields, need allele number: 1, 2, etc. remember REF allele is 0.
                this_alt_allele_number = this_alt_allele_index + 1
                # allele count as stated in the INFO field, summed over every requested AC field
                nominal_ac = sum(
                    int(record.INFO[acfield][this_alt_allele_index])
                    for acfield in acfields)
                if nominal_ac < minac or nominal_ac > maxac:
                    continue
                samples_with_allele = []
                true_ac = 0
                for sample in record.samples:
                    if sample['GT'] is None:
                        # no-calls apparently come through as None instead of ./.
                        # if you call sample.gt_alleles on them, PyVCF tries to do None.split() and
                        # throws an Attribute Error. so just ignore these.
                        continue
                    # number of copies of this allele carried by this individual
                    copies = [int(allele) for allele in sample.gt_alleles].count(this_alt_allele_number)
                    if copies > 0:
                        # grab sample id, and replace space with underscore
                        samples_with_allele.append(sample.sample.replace(' ', '_'))
                        # add this indiv's allele count to the running total
                        true_ac += copies
                if acfields == ['AC']:
                    # only if we are including AC from *all* individuals, we can spot check that the AC is correct.
                    assert true_ac == nominal_ac, "VCF has AC as %s, actual AC is %s.\nRecord is:\n%s" % (
                        nominal_ac, true_ac, str(record))
                # we only care about the samples for which PCs are available, so take intersection of those two sets:
                samples_with_allele_in_pcs = list(set(samples_with_allele) & set(pcs.keys()))
                try:
                    meandist = mean_euclid_dist(
                        samples_with_allele_in_pcs, pcs, weights,
                        warn=False)  # do not warn if samples are missing from pcs
                    minpos, minref, minalt = get_minimal_representation(
                        record.POS, record.REF, str(alt))
                    sys.stdout.write("\t".join([
                        record.CHROM, str(minpos), minref, minalt,
                        str(true_ac), str(meandist), flag
                    ]) + "\n")
                except AssertionError as e:
                    sys.stderr.write(str(e) + "\n")
                    continue
    finally:
        # the reader never closes the handle it was given, so release it here
        fsock.close()
def find_var_indivs(refvcf, reftable, chr, pos, ref, alt, find_indivs): # dictionary to hold info on people with the variant allele variant_indivs = {} # convert input variants to minimal representation pos, ref, alt = mr.get_minimal_representation(pos, ref, alt) print "##Minimal representation of your search: ", print pos, ref, alt # use tabix to grab 100 bp on either side of putative variant lines = get_vcf_lines(refvcf, 100, chr, pos) # get the #CHROM line from the gzipped VCF column_names = get_vcf_colnames(refvcf) # now search the lines for the variant of interest match_found = False # default is you haven't found a matching variant for line in lines: cols = line.split("\t") if len(cols ) <= 9: # skip any extra non-VCF lines that appear in output continue vchr, vpos, vid, vref, valt, vqual, vfilter, vinfo, vformat = cols[:9] vpos = int(vpos) # must cast to into to match incoming pos variable valt_alleles = valt.split(",") for valt_allele in valt_alleles: vpos_mr, vref_mr, valt_allele_mr = mr.get_minimal_representation( vpos, vref, valt_allele) # check if we've found a match if vchr == chr and vpos_mr == pos and vref_mr == ref and valt_allele_mr == alt: match_found = True # output the variant info as called in the reference VCF print "##Relevant line from VCF: ", print '\t'.join(cols[:9]) if find_indivs: allele_no = valt_alleles.index( valt_allele) + 1 # first alt allele is 1 (ref is 0) format_fields = vformat.split(":") gt_idx = format_fields.index( "GT") # in what order does genotype appear for column_no in range(9, len(cols)): call = cols[column_no] call_fields = call.split(":") genotype = call_fields[gt_idx] alleles = re.split( "/|\|", genotype ) # split on / or | to support UG or HC calls. 
# check if this individual has the allele in question if str(allele_no) in alleles: sample_name = column_names[column_no] call_info = call # store this person and their call in the dict variant_indivs[sample_name] = call_info break # stop looking for more matching alleles if match_found: break # stop looking for more matching sites if not match_found: print "##No matches found." else: # at this point we have printed the variant call, and stored the info # on each individual with the variant. # now if possible we also want to look up what study they're from. # the reftable parameter is optional, so we'll check if the table exists if find_indivs and reftable is not None and os.path.isfile(reftable): print "#SAMPLE\tPROJECT\tCALL" for sample_name, call_info in variant_indivs.iteritems(): project_name = get_project_name(reftable, sample_name) if project_name is None: project_name = "" print "%s\t%s\t%s" % (sample_name, project_name, call_info) elif find_indivs: print "#SAMPLE\tCALL" for sample_name, call_info in variant_indivs.iteritems(): print "%s\t%s" % (sample_name, call_info)
def find_var_indivs(refvcf,reftable,chr,pos,ref,alt,find_indivs): # dictionary to hold info on people with the variant allele variant_indivs = {} # convert input variants to minimal representation pos, ref, alt = mr.get_minimal_representation(pos,ref,alt) print "##Minimal representation of your search: ", print pos, ref, alt # use tabix to grab 100 bp on either side of putative variant lines = get_vcf_lines(refvcf,100,chr,pos) # get the #CHROM line from the gzipped VCF column_names = get_vcf_colnames(refvcf) # now search the lines for the variant of interest match_found = False # default is you haven't found a matching variant for line in lines: cols = line.split("\t") if len(cols) <= 9: # skip any extra non-VCF lines that appear in output continue vchr, vpos, vid, vref, valt, vqual, vfilter, vinfo, vformat = cols[:9] vpos = int(vpos) # must cast to into to match incoming pos variable valt_alleles = valt.split(",") for valt_allele in valt_alleles: vpos_mr, vref_mr, valt_allele_mr = mr.get_minimal_representation(vpos, vref, valt_allele) # check if we've found a match if vchr == chr and vpos_mr == pos and vref_mr == ref and valt_allele_mr == alt: match_found = True # output the variant info as called in the reference VCF print "##Relevant line from VCF: ", print '\t'.join(cols[:9]) if find_indivs: allele_no = valt_alleles.index(valt_allele) + 1 # first alt allele is 1 (ref is 0) format_fields = vformat.split(":") gt_idx = format_fields.index("GT") # in what order does genotype appear for column_no in range(9,len(cols)): call = cols[column_no] call_fields = call.split(":") genotype = call_fields[gt_idx] alleles = re.split("/|\|",genotype) # split on / or | to support UG or HC calls. 
# check if this individual has the allele in question if str(allele_no) in alleles: sample_name = column_names[column_no] call_info = call # store this person and their call in the dict variant_indivs[sample_name] = call_info break # stop looking for more matching alleles if match_found: break # stop looking for more matching sites if not match_found: print "##No matches found." else: # at this point we have printed the variant call, and stored the info # on each individual with the variant. # now if possible we also want to look up what study they're from. # the reftable parameter is optional, so we'll check if the table exists if find_indivs and reftable is not None and os.path.isfile(reftable): print "#SAMPLE\tPROJECT\tCALL" for sample_name, call_info in variant_indivs.iteritems(): project_name = get_project_name(reftable,sample_name) if project_name is None: project_name = "" print "%s\t%s\t%s" % (sample_name, project_name, call_info) elif find_indivs: print "#SAMPLE\tCALL" for sample_name, call_info in variant_indivs.iteritems(): print "%s\t%s" % (sample_name, call_info)
def test_simple_snv(self):
    """A single-base substitution must come back from get_minimal_representation unchanged."""
    variant = (1001, 'A', 'T')
    self.assertEqual(get_minimal_representation(*variant), variant)
def test_multiallelic_indels(self):
    """Check get_minimal_representation on a series of padded substitutions and indels."""
    # (pos, ref, alt) as given  ->  expected (pos, ref, alt)
    cases = [
        ((1001, 'CTCC', 'CCCC'), (1002, 'T', 'C')),
        ((1001, 'CTCC', 'CCC'), (1001, 'CT', 'C')),
        ((1001, 'CTCC', 'CTC'), (1002, 'TC', 'T')),
        ((1001, 'CTAG', 'CTG'), (1002, 'TA', 'T')),
        ((1001, 'CTCC', 'CTACC'), (1002, 'T', 'TA')),
        ((1001, 'TCAGCAGCAG', 'TCAGCAG'), (1001, 'TCAG', 'T')),
        ((1001, 'CTT', 'CTTT'), (1001, 'C', 'CT')),
        ((1001, 'CTT', 'C'), (1001, 'CTT', 'C')),
        ((1001, 'CTT', 'CT'), (1001, 'CT', 'C')),
        ((1001, 'AAAATATATATAT', 'A'), (1001, 'AAAATATATATAT', 'A')),
        ((1001, 'AAAATATATATAT', 'AATAT'), (1001, 'AAAATATAT', 'A')),
        ((1001, 'ACACACACAC', 'AACAC'), (1001, 'ACACAC', 'A')),
    ]
    for (pos, ref, alt), expected in cases:
        self.assertEqual(get_minimal_representation(pos, ref, alt), expected)
def test_simple_snv(self):
    """A single-base substitution must come back from get_minimal_representation unchanged."""
    variant = (1000, 'A', 'T')
    self.assertEqual(get_minimal_representation(*variant), variant)
default=sys.stdout) parser.add_argument('-i', '--inf', nargs='?', type=argparse.FileType('rb'), default=sys.stdin) args = parser.parse_args() # if there is a header line, just print it right back out if args.has_header: header = args.inf.readline() args.outf.write(header) # for other lines, process the POS, REF and ALT fields to minimal representation for line in args.inf.readlines(): cols = line.strip().split('\t') pos, ref, alt = [ cols[i] for i in [args.pos - 1, args.ref - 1, args.alt - 1] ] newpos, newref, newalt = get_minimal_representation(int(pos), ref, alt) cols[args.pos - 1] = newpos cols[args.ref - 1] = newref cols[args.alt - 1] = newalt args.outf.write("\t".join(map(str, cols)) + "\n") if args.inf is not sys.stdin: inf.close() if args.outf is not sys.stdout: outf.close()