def score_entire_file(pcpath,vcfpath,weightpath,minac=2,maxac=2500,flag='',n_pcs=9,acfields=['AC']):
    '''
    Runs through an entire VCF and gives diversity scores for every allele in the specified AC range.
    Note that the acfields parameter is useful if you want to only consider variants that fall within
    a particular AC range *in certain population(s)* and the AC for these populations is specified in
    the INFO field, e.g. AC_AFR, AC_AMR, etc.

    Parameters:
      pcpath     -- path handed to read_pcs() together with n_pcs
      vcfpath    -- path of the VCF; opened with gzip.open when it ends in ".gz", else with open
      weightpath -- path handed to read_weights()
      minac      -- alleles whose summed INFO allele count is below this are skipped
      maxac      -- alleles whose summed INFO allele count is above this are skipped
      flag       -- string appended verbatim as the last output column
      n_pcs      -- passed through to read_pcs()
      acfields   -- INFO keys whose per-allele counts are summed into the nominal AC
                    (the default list is only read, never mutated)

    Output: one tab-separated line per retained allele on stdout:
      CHROM, minimal-representation POS, REF, ALT, genotype-derived AC, mean distance, flag.

    Raises AssertionError when acfields is just AC and the AC counted from the genotypes
    disagrees with the INFO field. An AssertionError raised while scoring or minimising
    an allele is reported on stderr and that allele is skipped.
    '''
    pcs = read_pcs(pcpath,n_pcs)
    weights = read_weights(weightpath)
    # open .vcf.gz file with gzip.open, otherwise just use open
    openfunc = gzip.open if vcfpath[-3:] == ".gz" else open
    # in theory PyVCF can accept just a path, but I found it only works with an fsock, hence the need for openfunc (above).
    # filename='ignore',compressed=False is a lousy hack to force vcf.Reader.__init__ to use the fsock and not the filename
    # __init__ func currently contains 2 lines:
    # if filename is None and hasattr(fsock, 'name'):
    #            filename = fsock.name
    # which short-circuit my attempt to pass it an fsock; it then opens the file with open(filename,mode='rt') instead of
    # with gzip, and thus it crashes with this error:
    # if mode[0:1] == 'r':
    #     TypeError: 'int' object is not subscriptable
    # by making the filename non-None, I prevent this; but if PyVCF is later refactored to use filename before fsock,
    # my code will break. hence I say it's a lousy hack.
    vcf_handle = openfunc(vcfpath)
    try:
        vcf_reader = vcf.Reader(vcf_handle,filename='ignore',compressed=False,strict_whitespace=True) # split only on tab, allow spaces in ids
        # the genotype cross-check is only meaningful when AC covers *all* individuals
        check_ac = list(acfields) == ['AC']
        for record in vcf_reader: # iterate over every row of VCF
            # enumerate gives the position of *this* alt directly, instead of searching for it again
            for this_alt_allele_index, alt in enumerate(record.ALT):
                # for GT fields, need allele number: 1, 2, etc. remember REF allele is 0.
                this_alt_allele_number = this_alt_allele_index + 1
                # allele count as stated in the INFO field, summed over every requested AC field
                nominal_ac = sum(int(record.INFO[acfield][this_alt_allele_index]) for acfield in acfields)
                if nominal_ac < minac or nominal_ac > maxac:
                    continue
                samples_with_allele = []
                true_ac = 0
                for sample in record.samples:
                    if sample['GT'] is None: # no-calls apparently come through as None instead of ./.
                        # if you call sample.gt_alleles on them, PyVCF tries to do None.split() and
                        # throws an Attribute Error. so just ignore these.
                        continue
                    # number of copies of this allele carried by this individual
                    copies = [int(allele) for allele in sample.gt_alleles].count(this_alt_allele_number)
                    if copies:
                        samples_with_allele.append(sample.sample.replace(' ','_')) # grab sample id, and replace space with underscore
                        true_ac += copies
                if check_ac and true_ac != nominal_ac:
                    # raised explicitly (rather than via `assert`) so the check survives python -O
                    raise AssertionError("VCF has AC as %s, actual AC is %s.\nRecord is:\n%s"%(nominal_ac,true_ac,str(record)))
                # we only care about the samples for which PCs are available, so take intersection of those two sets:
                samples_with_allele_in_pcs = list(set(samples_with_allele) & set(pcs.keys()))
                try:
                    meandist = mean_euclid_dist(samples_with_allele_in_pcs,pcs,weights,warn=False) # do not warn if samples are missing from pcs
                    minpos, minref, minalt = get_minimal_representation(record.POS,record.REF,str(alt))
                    sys.stdout.write("\t".join([record.CHROM,str(minpos),minref,minalt,str(true_ac),str(meandist),flag])+"\n")
                except AssertionError as e:
                    sys.stderr.write(str(e)+"\n")
                    continue
    finally:
        # always release the file handle, even when a record raises
        vcf_handle.close()
Example #2
0
	def seek_out_variant(self, chrom, pos, ref, alt):
		'''
		Look for the variant (chrom, pos, ref, alt) among the records around pos.

		The query is passed through get_minimal_representation() and compared
		with the result of the same call for every ALT allele of each record
		that self.read() yields after self.go_to() has been pointed at the
		window pos +/- max(len(ref), len(alt)).

		Returns the 1-based number of the first matching ALT allele, or False
		when no record in the window matches.
		'''
		pos = int(pos)
		margin = max(len(ref), len(alt))
		wanted = get_minimal_representation(pos, ref, alt)
		window = (str(chrom), str(pos - margin), str(pos + margin))
		self.go_to("%s:%s-%s" % window)
		for row in self.read():
			row_pos = row['POS']
			row_ref = row['REF']
			row_alts = row['ALT'].split(",")
			# get min rep of alleles in full vcf so they correspond to alleles in sites table
			minimal = [get_minimal_representation(row_pos, row_ref, row_alt) for row_alt in row_alts]
			if wanted in minimal:
				# allele numbers are 1-based
				return minimal.index(wanted) + 1
		return False
Example #3
0
                    annotations = [dict(zip(vep_field_names, x.split('|'))) for x in info_field['CSQ'].split(',') if len(vep_field_names) == len(x.split('|'))]
                    if args.lof_only: annotations = [x for x in annotations if x['LoF'] == 'HC']
                    if args.canonical_only: annotations = [x for x in annotations if x['CANONICAL'] == 'YES']
                else:
                    annotations = []
                if args.lof_only and len(annotations) == 0: continue
                if args.canonical_only and len(annotations) == 0: continue

            if 'FORMAT' in header:
                format_fields_list = fields[header['FORMAT']].split(':')
                format_fields = dict(zip(format_fields_list, range(len(format_fields_list))))

            for index, alt in enumerate(alts):
                # Get site data
                if not args.do_not_minrep and get_minimal_representation is not None:
                    new_pos, new_ref, new_alt = get_minimal_representation(fields[header['POS']], fields[header['REF']], alt)
                    if args.snps_only and (len(new_ref) != 1 or len(new_alt) != 1): continue
                    output = [fields[header['CHROM']], str(new_pos), new_ref, new_alt]
                else:
                    if args.snps_only and (len(fields[header['REF']]) != 1 or len(alt) != 1): continue
                    output = [fields[header['CHROM']], fields[header['POS']], fields[header['REF']], alt]

                ucsc_link = raw_ucsc_link % (output[0], int(output[1]) - args.ucsc_link_window, int(output[1]) + args.ucsc_link_window)
                if args.add_ucsc_link: output.append(ucsc_link)
                if args.include_id: output.append(fields[header['ID']])
                if args.original_position: output.append(fields[header['POS']])
                if not args.omit_filter: output.append(fields[header['FILTER']])

                # Get data from INFO field
                for info in desired_info:
                    this_output = missing_string
Example #4
0
def main(args):
    """Flatten a VCF (plain or .gz) into a tab-delimited table.

    ``args`` is an argparse-style namespace. Attributes read here:
    vcf, output, info, vep_info, simplify, max_csq, lof_only,
    collapse_annotations, mysql, include_id, omit_filter, options,
    preserve_multiallelic and simplify_gtex.

    Output columns are CHROM, POS, REF, ALT, optionally ID and FILTER, then
    the requested INFO keys, then the requested VEP (CSQ) fields. Unless
    ``args.preserve_multiallelic`` is set, one row is written per ALT allele.
    Missing values are written as ``NA`` (or ``\\N`` when ``args.mysql`` is set).

    When ``args.output`` is None it is derived from ``args.vcf`` by replacing
    the last ``.vcf`` with ``.table`` and stored back on ``args``.

    Progress and warnings go to stderr. The function calls ``sys.exit(1)`` on
    unusable input and ``sys.exit(0)`` after listing fields for ``args.options``.
    """
    def log(message):
        sys.stderr.write(message + '\n')

    # Read parameters
    # The default output name must come from the *input* path; args.output is
    # None on this branch and has no rsplit().
    if args.output is None:
        args.output = '.table'.join(args.vcf.rsplit('.vcf', 1))
    if args.output == args.vcf:
        log("VCF filename has no '.vcf' and no output file name was provided. Exiting.")
        sys.exit(1)

    desired_info = [] if args.info is None else args.info.split(',')
    desired_vep_info = [] if args.vep_info is None else args.vep_info.split(',')
    if args.simplify:
        args.max_csq = True
        args.lof_only = True
        args.collapse_annotations = True

    # Backslash + N is what MySQL's LOAD DATA reads as NULL.
    missing_string = '\\N' if args.mysql else 'NA'

    header = None
    vep_field_names = None
    vep_info_from_header = {}
    info_from_header = {}
    started = False

    # Collected as a list and joined once, so an empty INFO selection cannot
    # leave a stray empty column between FILTER and the VEP columns.
    header_columns = ['CHROM', 'POS', 'REF', 'ALT']
    if args.include_id: header_columns.append('ID')
    if not args.omit_filter: header_columns.append('FILTER')

    f = gzip.open(args.vcf) if args.vcf.endswith('.gz') else open(args.vcf)
    g = None
    try:
        g = gzip.open(args.output, 'w') if args.output.endswith('.gz') else open(args.output, 'w')

        for line in f:
            line = line.strip()
            if not line:
                # blank lines carry no record
                continue

            # Reading header lines to get VEP and individual arrays
            if line.startswith('#'):
                line = line.lstrip('#')
                if line.startswith('INFO=<ID='):
                    try:
                        header_metadata = dict([x.split('=', 1) for x in line.split('<')[1].split('>')[0].split(',', 3) if '=' in x])
                    except Exception:
                        log("Malformed header line: %s" % line)
                        sys.exit(1)
                    info_from_header[header_metadata['ID']] = header_metadata
                if 'ID=CSQ' in line:
                    vep_field_names = line.split('Format: ')[-1].strip('">').split('|')
                    vep_info_from_header = dict(zip(vep_field_names, range(len(vep_field_names))))
                if line.startswith('CHROM'):
                    header = line.split()
                    header = dict(zip(header, range(len(header))))
                    if args.options:
                        log("######### OPTIONS FOR INFO #########")
                        for info in info_from_header:
                            log('%s\t%s' % (info, info_from_header[info]['Description']))
                        if vep_field_names is not None:
                            log("######### OPTIONS FOR VEP_INFO #########")
                            log('\n'.join(vep_field_names))
                        sys.exit(0)
                continue

            if len(desired_vep_info) > 0:
                if vep_field_names is None:
                    log("VEP info requested, but VCF file does not have a VEP header line. Exiting.")
                    sys.exit(1)
                if 'ALLELE_NUM' not in vep_info_from_header:
                    log("VEP output does not have ALLELE_NUM which is required for extraction. Please re-run VEP with --allele_number. Exiting.")
                    sys.exit(1)
            if header is None:
                log("VCF file does not have a header line (CHROM POS etc.). Exiting.")
                sys.exit(1)

            if not started:
                # Allowing entries even if not found in the header line, with some caveats
                original_desired_info = copy.deepcopy(desired_info)
                desired_info = []
                any_missing = 0
                for info in original_desired_info:
                    if info in info_from_header:
                        log('SUCCESS: Found %s: %s' % (info, info_from_header[info]['Description']))
                        desired_info.append(info)
                    else:
                        # Not a literal key: treat the request as a regex over the header IDs
                        matches = 0
                        for header_record in info_from_header:
                            if re.search('^%s$' % info, header_record):
                                matches += 1
                                desired_info.append(header_record)
                                log('SUCCESS: Found %s (matching %s): %s' % (header_record, info, info_from_header[header_record]['Description']))
                        if not matches:
                            log('WARNING: Did not find %s in header.' % info)
                            any_missing += 1
                            desired_info.append(info)

                # Only allowing entries in VEP header.
                original_desired_vep_info = copy.deepcopy(desired_vep_info)
                desired_vep_info = []
                for info in original_desired_vep_info:
                    if info in vep_info_from_header:
                        desired_vep_info.append(info)
                        log('SUCCESS: Found %s' % info)
                    else:
                        log('WARNING: Did not find %s in VEP header. Not including from here on out.' % info)

                # Warnings/errors for missing data
                if any_missing: log('WARNING: At least one INFO line requested was not found. Continuing, but results may be off.')
                if len(desired_info) + len(desired_vep_info) == 0:
                    log('No fields left in requested info/VEP info. Exiting.')
                    sys.exit(1)
                if args.lof_only and 'LoF' not in desired_vep_info:
                    log('--lof_only was used, but no LoF tag found in VEP field. Exiting.')
                    sys.exit(1)

                # Ready to go.
                g.write('\t'.join(header_columns + desired_info + desired_vep_info) + '\n')
                started = True

            # Pull out annotation info from INFO and ALT fields
            fields = line.split('\t')
            # Flags (no '=') map to themselves; the lookahead keeps a ';' that is
            # not followed by a word character inside the current value.
            info_field = dict([(x.split('=', 1)) if '=' in x else (x, x) for x in re.split(r';(?=\w)', fields[header['INFO']])])

            # Only get VEP info if requested
            if len(desired_vep_info) > 0:
                if 'CSQ' in info_field:
                    # if statement here is a fix for VEP's occasional introduction of a semi-colon into the CSQ.
                    # Can be removed once that is completely fixed.
                    annotations = [dict(zip(vep_field_names, x.split('|'))) for x in info_field['CSQ'].split(',') if len(vep_field_names) == len(x.split('|'))]
                    if args.lof_only: annotations = [x for x in annotations if x['LoF'] == 'HC']
                else:
                    annotations = []
                if args.lof_only and len(annotations) == 0: continue
            alts = fields[header['ALT']].split(',')

            # Default is split line into all alternate alleles
            if not args.preserve_multiallelic:
                for index, alt in enumerate(alts):
                    if get_minimal_representation is not None:
                        new_pos, new_ref, new_alt = get_minimal_representation(fields[header['POS']], fields[header['REF']], alt)
                        output = [fields[header['CHROM']], str(new_pos), new_ref, new_alt]
                    else:
                        output = [fields[header['CHROM']], fields[header['POS']], fields[header['REF']], alt]
                    if args.include_id: output.append(fields[header['ID']])
                    if not args.omit_filter: output.append(fields[header['FILTER']])

                    # Get info and VEP info
                    for info in desired_info:
                        if info in info_field:
                            # Number=A fields carry one value per ALT allele: pick this allele's
                            if info in info_from_header and 'Number' in info_from_header[info] and info_from_header[info]['Number'] == 'A':
                                output.append(info_field[info].split(',')[index])
                            else:
                                output.append(info_field[info])
                        else:
                            output.append(missing_string)
                    if len(desired_vep_info) > 0:
                        # Filter to this allele
                        this_alt_annotations = [x for x in annotations if int(x['ALLELE_NUM']) - 1 == index]
                        if args.lof_only and len(this_alt_annotations) == 0: continue
                        for info in desired_vep_info:
                            this_alt_vep_info = [x[info] for x in this_alt_annotations if x[info] != '']

                            # Process options
                            if args.max_csq and info == 'Consequence': this_alt_vep_info = [csq_max_vep(x) for x in this_alt_vep_info]
                            if args.simplify_gtex and info == 'TissueExpression':
                                # Converting from tissue1:value1&tissue2:value2 to [tissue1, tissue2]
                                this_alt_vep_info = set([y.split(':')[0] for x in this_alt_vep_info for y in x.split('&')])
                            if args.collapse_annotations:
                                this_alt_vep_info = set(this_alt_vep_info)
                                # Collapse consequence further
                                if args.max_csq and info == 'Consequence': this_alt_vep_info = [csq_max(this_alt_vep_info)]

                            annotation_output = ','.join(this_alt_vep_info)
                            if annotation_output == '': annotation_output = missing_string
                            output.append(annotation_output)
                    g.write('\t'.join(output) + '\n')
            else:
                output = [fields[header['CHROM']], fields[header['POS']], fields[header['REF']], fields[header['ALT']]]
                if args.include_id: output.append(fields[header['ID']])
                if not args.omit_filter: output.append(fields[header['FILTER']])
                for info in desired_info:
                    if info in info_field:
                        output.append(info_field[info])
                    else:
                        output.append(missing_string)
                if len(desired_vep_info) > 0:
                    for info in desired_vep_info:
                        this_vep_info = [x[info] for x in annotations]

                        # Process options
                        # NOTE(review): the per-allele branch above uses csq_max_vep() for
                        # this per-annotation step and drops empty values; this branch uses
                        # csq_max() and keeps them. Confirm which is intended.
                        if args.max_csq and info == 'Consequence': this_vep_info = [csq_max(x) for x in this_vep_info]
                        if args.simplify_gtex and info == 'TissueExpression':
                            this_vep_info = set([y.split(':')[0] for x in this_vep_info for y in x.split('&')])
                        if args.collapse_annotations:
                            this_vep_info = set(this_vep_info)
                            if args.max_csq and info == 'Consequence': this_vep_info = [csq_max(this_vep_info)]

                        annotation_output = ','.join(this_vep_info)
                        if annotation_output == '': annotation_output = missing_string
                        output.append(annotation_output)
                g.write('\t'.join(output) + '\n')
    finally:
        # Runs on sys.exit() too, so the table is flushed and both handles released.
        if g is not None:
            g.close()
        f.close()
Example #5
0
def main(args):
    """Flatten a VCF (plain or .gz) into a tab-delimited table.

    ``args`` is an argparse-style namespace. Attributes read here:
    vcf, output, info, vep_info, sample_info, simplify, max_csq, lof_only,
    collapse_annotations, mysql, add_ucsc_link, ucsc_link_window, include_id,
    omit_filter, options, preserve_multiallelic and simplify_gtex.

    Output columns are CHROM, POS, REF, ALT, optionally UCSC, ID and FILTER,
    then the requested INFO keys, the requested VEP (CSQ) fields, and the
    requested ``SAMPLE.FORMAT`` genotype fields. Unless
    ``args.preserve_multiallelic`` is set, one row is written per ALT allele.
    Missing values are written as ``NA`` (or ``\\N`` when ``args.mysql`` is set).

    When ``args.output`` is None it is derived from ``args.vcf`` by replacing
    the last ``.vcf`` with ``.table`` and stored back on ``args``.

    Progress and warnings go to stderr. The function calls ``sys.exit(1)`` on
    unusable input and ``sys.exit(0)`` after listing fields for ``args.options``.
    """
    def log(message):
        sys.stderr.write(message + '\n')

    # Read parameters
    if args.output is None:
        args.output = '.table'.join(args.vcf.rsplit('.vcf', 1))
    if args.output == args.vcf:
        log("VCF filename has no '.vcf' and no output file name was provided. Exiting.")
        sys.exit(1)

    desired_info = [] if args.info is None else args.info.split(',')
    desired_vep_info = [] if args.vep_info is None else args.vep_info.split(',')
    desired_sample_info = [] if args.sample_info is None else args.sample_info.split(',')
    if args.simplify:
        args.max_csq = True
        args.lof_only = True
        args.collapse_annotations = True

    # Backslash + N is what MySQL's LOAD DATA reads as NULL.
    missing_string = '\\N' if args.mysql else 'NA'

    header = None
    vep_field_names = None
    vep_info_from_header = {}
    info_from_header = {}
    started = False
    # (sample, format-key) pairs, parallel to the validated desired_sample_info
    sample_keys = []

    # Collected as a list and joined once, so an empty INFO selection cannot
    # leave a stray empty column before the VEP / sample columns.
    header_columns = ['CHROM', 'POS', 'REF', 'ALT']
    if args.add_ucsc_link: header_columns.append('UCSC')
    if args.include_id: header_columns.append('ID')
    if not args.omit_filter: header_columns.append('FILTER')

    raw_ucsc_link = 'http://genome.ucsc.edu/cgi-bin/hgTracks?db=hg19&position=chr%s:%s-%s'

    f = gzip.open(args.vcf) if args.vcf.endswith('.gz') else open(args.vcf)
    g = None
    try:
        g = gzip.open(args.output, 'w') if args.output.endswith('.gz') else open(args.output, 'w')

        for line in f:
            line = line.strip()
            if not line:
                # blank lines carry no record
                continue

            # Reading header lines to get VEP and individual arrays
            if line.startswith('#'):
                line = line.lstrip('#')
                if line.startswith('INFO=<ID='):
                    try:
                        header_metadata = dict([x.split('=', 1) for x in line.split('<')[1].split('>')[0].split(',', 3) if '=' in x])
                    except Exception:
                        log("Malformed header line: %s" % line)
                        sys.exit(1)
                    info_from_header[header_metadata['ID']] = header_metadata
                if 'ID=CSQ' in line:
                    vep_field_names = line.split('Format: ')[-1].strip('">').split('|')
                    vep_info_from_header = dict(zip(vep_field_names, range(len(vep_field_names))))
                if line.startswith('CHROM'):
                    header = line.split()
                    header = dict(zip(header, range(len(header))))
                    if args.options:
                        log("######### OPTIONS FOR INFO #########")
                        for info in info_from_header:
                            log('%s\t%s' % (info, info_from_header[info]['Description']))
                        if vep_field_names is not None:
                            log("######### OPTIONS FOR VEP_INFO #########")
                            log('\n'.join(vep_field_names))
                        sys.exit(0)
                continue

            if len(desired_vep_info) > 0:
                if vep_field_names is None:
                    log("VEP info requested, but VCF file does not have a VEP header line. Exiting.")
                    sys.exit(1)
                if 'ALLELE_NUM' not in vep_info_from_header:
                    log("VEP output does not have ALLELE_NUM which is required for extraction. Please re-run VEP with --allele_number. Exiting.")
                    sys.exit(1)

            if header is None:
                log("VCF file does not have a header line (CHROM POS etc.). Exiting.")
                sys.exit(1)

            if not started:
                # Allowing entries even if not found in the header line, with some caveats
                original_desired_info = copy.deepcopy(desired_info)
                desired_info = []
                any_missing = 0
                for info in original_desired_info:
                    if info in info_from_header:
                        log('SUCCESS: Found %s: %s' % (info, info_from_header[info]['Description']))
                        desired_info.append(info)
                    else:
                        # Not a literal key: treat the request as a regex over the header IDs
                        matches = 0
                        for header_record in info_from_header:
                            if re.search('^%s$' % info, header_record):
                                matches += 1
                                desired_info.append(header_record)
                                log('SUCCESS: Found %s (matching %s): %s' % (header_record, info, info_from_header[header_record]['Description']))
                        if not matches:
                            log('WARNING: Did not find %s in header.' % info)
                            any_missing += 1
                            desired_info.append(info)

                # Only allowing entries in VEP header.
                original_desired_vep_info = copy.deepcopy(desired_vep_info)
                desired_vep_info = []
                for info in original_desired_vep_info:
                    if info in vep_info_from_header:
                        desired_vep_info.append(info)
                        log('SUCCESS: Found %s' % info)
                    else:
                        log('WARNING: Did not find %s in VEP header. Not including from here on out.' % info)

                # Getting info from individuals
                original_desired_sample_info = copy.deepcopy(desired_sample_info)
                desired_sample_info = []
                if len(original_desired_sample_info) > 0:
                    if 'FORMAT' not in header:
                        log('WARNING: Did not find FORMAT in header line, will not be extracting any SAMPLE.FORMATs')
                    else:
                        for info in original_desired_sample_info:
                            sample_format = info.split('.')
                            if len(sample_format) != 2:
                                # report the request as typed, not the split list
                                log('WARNING: %s is not a SAMPLE.FORMAT designation' % info)
                            else:
                                sample, fmt = sample_format
                                if sample in header:
                                    log('SUCCESS: Found sample %s in header' % sample)
                                    desired_sample_info.append(info)
                                    sample_keys.append((sample, fmt))
                                else:
                                    log('WARNING: Sample %s not found in header' % sample)

                # Warnings/errors for missing data
                if any_missing:
                    log('WARNING: At least one INFO line requested was not found. Continuing, but results may be off.')
                if len(desired_info) + len(desired_vep_info) + len(desired_sample_info) == 0:
                    log('No fields left in requested info/VEP info. Exiting.')
                    sys.exit(1)
                if args.lof_only and 'LoF' not in desired_vep_info:
                    log('--lof_only was used, but no LoF tag found in VEP field. Exiting.')
                    sys.exit(1)

                # Ready to go.
                g.write('\t'.join(header_columns + desired_info + desired_vep_info + desired_sample_info) + '\n')
                started = True

            # Pull out annotation info from INFO and ALT fields
            fields = line.split('\t')
            # Flags (no '=') map to themselves; the lookahead keeps a ';' that is
            # not followed by a word character inside the current value.
            info_field = dict([(x.split('=', 1)) if '=' in x else (x, x) for x in re.split(r';(?=\w)', fields[header['INFO']])])

            # Only get VEP info if requested
            if len(desired_vep_info) > 0:
                if 'CSQ' in info_field:
                    # if statement here is a fix for VEP's occasional introduction of a semi-colon into the CSQ.
                    # Can be removed once that is completely fixed.
                    annotations = [dict(zip(vep_field_names, x.split('|'))) for x in info_field['CSQ'].split(',') if len(vep_field_names) == len(x.split('|'))]
                    if args.lof_only:
                        annotations = [x for x in annotations if x['LoF'] == 'HC']
                else:
                    annotations = []
                if args.lof_only and len(annotations) == 0: continue

            alts = fields[header['ALT']].split(',')
            # Sites-only VCFs have no FORMAT column; sample_keys is empty in
            # that case, so the empty list is never consulted.
            if 'FORMAT' in header:
                format_fields_list = fields[header['FORMAT']].split(':')
            else:
                format_fields_list = []

            # Default is split line into all alternate alleles
            if not args.preserve_multiallelic:
                for index, alt in enumerate(alts):
                    if get_minimal_representation is not None:
                        new_pos, new_ref, new_alt = get_minimal_representation(fields[header['POS']], fields[header['REF']], alt)
                        output = [fields[header['CHROM']], str(new_pos), new_ref, new_alt]
                    else:
                        output = [fields[header['CHROM']], fields[header['POS']], fields[header['REF']], alt]

                    if args.add_ucsc_link:
                        output.append(raw_ucsc_link % (output[0], int(output[1]) - args.ucsc_link_window, int(output[1]) + args.ucsc_link_window))
                    if args.include_id: output.append(fields[header['ID']])
                    if not args.omit_filter: output.append(fields[header['FILTER']])

                    # Get info and VEP info
                    for info in desired_info:
                        if info in info_field:
                            # Number=A fields carry one value per ALT allele: pick this allele's
                            if info in info_from_header and 'Number' in info_from_header[info] and info_from_header[info]['Number'] == 'A':
                                output.append(info_field[info].split(',')[index])
                            else:
                                output.append(info_field[info])
                        else:
                            output.append(missing_string)
                    if len(desired_vep_info) > 0:
                        # Filter to this allele
                        this_alt_annotations = [x for x in annotations if int(x['ALLELE_NUM']) - 1 == index]
                        if args.lof_only and len(this_alt_annotations) == 0:
                            continue
                        for info in desired_vep_info:
                            this_alt_vep_info = [x[info] for x in this_alt_annotations if x[info] != '']

                            # Process options
                            if args.max_csq and info == 'Consequence':
                                this_alt_vep_info = [csq_max_vep(x) for x in this_alt_vep_info]
                            if args.simplify_gtex and info == 'TissueExpression':
                                # Converting from tissue1:value1&tissue2:value2 to [tissue1, tissue2]
                                this_alt_vep_info = set([y.split(':')[0] for x in this_alt_vep_info for y in x.split('&')])
                            if args.collapse_annotations:
                                this_alt_vep_info = set(this_alt_vep_info)
                                # Collapse consequence further
                                if args.max_csq and info == 'Consequence':
                                    this_alt_vep_info = [csq_max(this_alt_vep_info)]

                            annotation_output = ','.join(this_alt_vep_info)
                            if annotation_output == '':
                                annotation_output = missing_string
                            output.append(annotation_output)

                    # Every requested sample column gets a cell, so rows stay
                    # aligned with the header even when this record's FORMAT
                    # lacks the key.
                    for sample, fmt in sample_keys:
                        this_sample_format = dict(zip(format_fields_list, fields[header[sample]].split(':')))
                        output.append(this_sample_format.get(fmt, missing_string))

                    g.write('\t'.join(output) + '\n')
            else:
                output = [fields[header['CHROM']], fields[header['POS']], fields[header['REF']], fields[header['ALT']]]
                if args.add_ucsc_link:
                    output.append(raw_ucsc_link % (output[0], int(output[1]) - args.ucsc_link_window, int(output[1]) + args.ucsc_link_window))
                if args.include_id: output.append(fields[header['ID']])
                if not args.omit_filter: output.append(fields[header['FILTER']])
                for info in desired_info:
                    if info in info_field:
                        output.append(info_field[info])
                    else:
                        output.append(missing_string)
                if len(desired_vep_info) > 0:
                    for info in desired_vep_info:
                        this_vep_info = [x[info] for x in annotations]

                        # Process options
                        # NOTE(review): the per-allele branch above uses csq_max_vep() for
                        # this per-annotation step and drops empty values; this branch uses
                        # csq_max() and keeps them. Confirm which is intended.
                        if args.max_csq and info == 'Consequence':
                            this_vep_info = [csq_max(x) for x in this_vep_info]
                        if args.simplify_gtex and info == 'TissueExpression':
                            this_vep_info = set([y.split(':')[0] for x in this_vep_info for y in x.split('&')])
                        if args.collapse_annotations:
                            this_vep_info = set(this_vep_info)
                            if args.max_csq and info == 'Consequence':
                                this_vep_info = [csq_max(this_vep_info)]

                        annotation_output = ','.join(this_vep_info)
                        if annotation_output == '':
                            annotation_output = missing_string
                        output.append(annotation_output)

                for sample, fmt in sample_keys:
                    this_sample_format = dict(zip(format_fields_list, fields[header[sample]].split(':')))
                    output.append(this_sample_format.get(fmt, missing_string))

                g.write('\t'.join(output) + '\n')
    finally:
        # Runs on sys.exit() too, so the table is flushed and both handles released.
        if g is not None:
            g.close()
        f.close()
def score_entire_file(pcpath,
                      vcfpath,
                      weightpath,
                      minac=2,
                      maxac=2500,
                      flag='',
                      n_pcs=9,
                      acfields=['AC']):
    '''
    Runs through an entire VCF and prints a diversity score for every alt allele in the specified AC range.

    For each qualifying allele one tab-separated line is written to stdout:
    CHROM, minimal-representation POS, REF, ALT, observed allele count, mean distance, flag.

    pcpath:     passed to read_pcs() together with n_pcs; the result is used as a mapping keyed by sample id.
    vcfpath:    VCF to scan; a path ending in ".gz" is opened with gzip.open, anything else with open.
    weightpath: passed to read_weights(); the result is handed to mean_euclid_dist().
    minac/maxac: inclusive bounds on the summed INFO allele count; alleles outside are skipped.
    flag:       free text appended as the last output column.
    acfields:   INFO keys whose per-allele counts are summed. This is useful if you want to only
                consider variants that fall within a particular AC range *in certain population(s)*
                and the AC for these populations is specified in the INFO field, e.g. AC_AFR, AC_AMR.
                The default list is only read, never mutated.

    Raises AssertionError if acfields == ['AC'] and the INFO AC disagrees with the genotypes.
    AssertionErrors raised while scoring an allele are reported on stderr and that allele is skipped.
    '''
    pcs = read_pcs(pcpath, n_pcs)
    weights = read_weights(weightpath)
    # open .vcf.gz file with gzip.open, otherwise just use open
    openfunc = gzip.open if vcfpath.endswith(".gz") else open
    # the set of samples that have PCs does not change per allele, so build it once
    samples_with_pcs = set(pcs.keys())
    # in theory PyVCF can accept just a path, but I found it only works with an fsock, hence the need for openfunc (above).
    # filename='ignore',compressed=False is a lousy hack to force vcf.Reader.__init__ to use the fsock and not the filename
    # __init__ func currently contains 2 lines:
    # if filename is None and hasattr(fsock, 'name'):
    #            filename = fsock.name
    # which short-circuit my attempt to pass it an fsock; it then opens the file with open(filename,mode='rt') instead of
    # with gzip, and thus it crashes with this error:
    # if mode[0:1] == 'r':
    #     TypeError: 'int' object is not subscriptable
    # by making the filename non-None, I prevent this; but if PyVCF is later refactored to use filename before fsock,
    # my code will break. hence I say it's a lousy hack.
    vcf_file = openfunc(vcfpath)
    try:  # make sure the handle is closed even if scoring blows up half way through
        vcf_reader = vcf.Reader(
            vcf_file,
            filename='ignore',
            compressed=False,
            strict_whitespace=True)  # split only on tab, allow spaces in ids
        for record in vcf_reader:  # iterate over every row of VCF
            # enumerate gives the position of *this* alt directly (no linear search, and
            # no confusion if the same alt string is listed twice)
            for this_alt_allele_index, alt in enumerate(record.ALT):
                # for GT fields, need allele number: 1, 2, etc. remember REF allele is 0.
                this_alt_allele_number = this_alt_allele_index + 1
                # allele count as stated in the INFO field, summed over every requested AC field
                nominal_ac = sum(
                    int(record.INFO[acfield][this_alt_allele_index])
                    for acfield in acfields)
                if nominal_ac < minac or nominal_ac > maxac:
                    continue
                samples_with_allele = []
                true_ac = 0
                for sample in record.samples:
                    if sample['GT'] is None:
                        # no-calls apparently come through as None instead of ./.
                        # if you call sample.gt_alleles on them, PyVCF tries to do None.split() and
                        # throws an Attribute Error. so just ignore these.
                        continue
                    # convert once, then count how many copies this individual carries
                    copies = [int(a) for a in sample.gt_alleles
                              ].count(this_alt_allele_number)
                    if copies:  # this sample has this allele
                        # grab sample id, and replace space with underscore
                        samples_with_allele.append(
                            sample.sample.replace(' ', '_'))
                        true_ac += copies  # add this indiv's allele count to the running total
                # only if we are including AC from *all* individuals, we can spot check that the AC is correct.
                # raised explicitly (not via `assert`) so the check survives python -O
                if acfields == ['AC'] and true_ac != nominal_ac:
                    raise AssertionError(
                        "VCF has AC as %s, actual AC is %s.\nRecord is:\n%s" %
                        (nominal_ac, true_ac, str(record)))
                # we only care about the samples for which PCs are available, so take intersection of those two sets:
                samples_with_allele_in_pcs = list(
                    set(samples_with_allele) & samples_with_pcs)
                try:
                    meandist = mean_euclid_dist(
                        samples_with_allele_in_pcs, pcs, weights,
                        warn=False)  # do not warn if samples are missing from pcs
                    minpos, minref, minalt = get_minimal_representation(
                        record.POS, record.REF, str(alt))
                    # single-argument print(...) behaves the same under Python 2 and 3
                    print("\t".join([
                        record.CHROM,
                        str(minpos), minref, minalt,
                        str(true_ac),
                        str(meandist), flag
                    ]))
                except AssertionError as e:
                    # str(e) is the assertion message; e.message only exists on Python 2
                    sys.stderr.write(str(e) + "\n")
                    continue
    finally:
        vcf_file.close()
Example #7
0
def find_var_indivs(refvcf, reftable, chr, pos, ref, alt, find_indivs):
    """
    Look up one variant in a reference VCF and report it on stdout.

    The query (pos, ref, alt) and every alt allele of the candidate VCF lines
    are reduced with mr.get_minimal_representation() before being compared,
    together with the chromosome. The first match wins: its first nine VCF
    columns are printed and the search stops.

    refvcf:      passed to get_vcf_lines() and get_vcf_colnames().
    reftable:    optional (may be None); when it names an existing file each
                 carrier is looked up with get_project_name() and a PROJECT
                 column is added to the report.
    chr:         chromosome, compared as-is against the VCF's first column.
    pos/ref/alt: the variant to search for.
    find_indivs: when true, also list every sample whose genotype contains the
                 matching allele, with that sample's raw call string.

    Returns None; all results are printed.
    """
    # dictionary to hold info on people with the variant allele
    variant_indivs = {}
    # convert input variants to minimal representation
    pos, ref, alt = mr.get_minimal_representation(pos, ref, alt)
    # (two spaces after the colon reproduce the output of the original
    # Python 2 `print "...: ",` followed by `print pos, ref, alt`)
    print("##Minimal representation of your search:  %s %s %s" %
          (pos, ref, alt))
    # use tabix to grab 100 bp on either side of putative variant
    lines = get_vcf_lines(refvcf, 100, chr, pos)
    # get the #CHROM line from the gzipped VCF
    column_names = get_vcf_colnames(refvcf)
    # split on / or | to support UG or HC calls; compiled once, raw string so
    # that "\|" is not treated as a (invalid) string escape
    gt_separator = re.compile(r"/|\|")
    # now search the lines for the variant of interest
    match_found = False  # default is you haven't found a matching variant
    for line in lines:
        # NOTE(review): defensive - if get_vcf_lines() ever hands back
        # newline-terminated lines, the newline would otherwise stick to the
        # last sample's call and hide its genotype. No-op for clean lines.
        cols = line.rstrip("\r\n").split("\t")
        # skip any extra non-VCF lines that appear in output: we need the 9
        # fixed columns plus at least one sample column
        if len(cols) <= 9:
            continue
        vchr, vpos, vid, vref, valt, vqual, vfilter, vinfo, vformat = cols[:9]
        vpos = int(vpos)  # must cast to int to match incoming pos variable
        valt_alleles = valt.split(",")
        # first alt allele is 1 (ref is 0)
        for allele_no, valt_allele in enumerate(valt_alleles, 1):
            vpos_mr, vref_mr, valt_allele_mr = mr.get_minimal_representation(
                vpos, vref, valt_allele)
            # check if we've found a match
            if not (vchr == chr and vpos_mr == pos and vref_mr == ref
                    and valt_allele_mr == alt):
                continue
            match_found = True
            # output the variant info as called in the reference VCF
            print("##Relevant line from VCF:  " + '\t'.join(cols[:9]))
            if find_indivs:
                wanted = str(allele_no)
                # in what order does genotype appear
                gt_idx = vformat.split(":").index("GT")
                for column_no in range(9, len(cols)):
                    call = cols[column_no]
                    genotype = call.split(":")[gt_idx]
                    # check if this individual has the allele in question
                    if wanted in gt_separator.split(genotype):
                        # store this person and their call in the dict
                        variant_indivs[column_names[column_no]] = call
            break  # stop looking for more matching alleles
        if match_found:
            break  # stop looking for more matching sites
    if not match_found:
        print("##No matches found.")
    elif find_indivs:
        # at this point we have printed the variant call, and stored the info
        # on each individual with the variant.
        # now if possible we also want to look up what study they're from.
        # the reftable parameter is optional, so we'll check if the table exists
        if reftable is not None and os.path.isfile(reftable):
            print("#SAMPLE\tPROJECT\tCALL")
            for sample_name, call_info in variant_indivs.items():
                project_name = get_project_name(reftable, sample_name)
                if project_name is None:
                    project_name = ""
                print("%s\t%s\t%s" % (sample_name, project_name, call_info))
        else:
            print("#SAMPLE\tCALL")
            for sample_name, call_info in variant_indivs.items():
                print("%s\t%s" % (sample_name, call_info))
def find_var_indivs(refvcf, reftable, chr, pos, ref, alt, find_indivs):
    """
    Search a reference VCF for a single variant and print what was found.

    Both the query and each alt allele read from the VCF go through
    mr.get_minimal_representation() so that differently-padded spellings of
    the same variant compare equal. Only the first matching allele of the
    first matching line is reported.

    refvcf:      handed to get_vcf_lines() / get_vcf_colnames().
    reftable:    optional; if not None and an existing file, get_project_name()
                 is used to add a PROJECT column for every carrier.
    chr, pos, ref, alt: the variant being looked up.
    find_indivs: if true, samples whose GT contains the allele are listed
                 along with their unparsed call field.

    Everything is written to stdout; the function returns None.
    """
    # dictionary to hold info on people with the variant allele
    variant_indivs = {}
    # convert input variants to minimal representation
    pos, ref, alt = mr.get_minimal_representation(pos, ref, alt)
    # the doubled space matches what the Python 2 print-with-trailing-comma
    # pair used to emit, so downstream parsers see identical text
    print("##Minimal representation of your search:  %s %s %s" %
          (pos, ref, alt))
    # use tabix to grab 100 bp on either side of putative variant
    lines = get_vcf_lines(refvcf, 100, chr, pos)
    # get the #CHROM line from the gzipped VCF
    column_names = get_vcf_colnames(refvcf)
    # genotype alleles are separated by / or | (supports UG or HC calls).
    # raw string: "\|" in a normal literal is an invalid escape sequence
    split_genotype = re.compile(r"/|\|").split

    match_found = False  # default is you haven't found a matching variant
    for line in lines:
        # NOTE(review): rstrip is a safeguard in case get_vcf_lines() keeps
        # line endings; a stray "\n" on the last call would break the GT test
        cols = line.rstrip("\r\n").split("\t")
        if len(cols) <= 9:  # skip any extra non-VCF lines that appear in output
            continue
        vchr, vpos, vid, vref, valt, vqual, vfilter, vinfo, vformat = cols[:9]
        vpos = int(vpos)  # must cast to int to match incoming pos variable
        valt_alleles = valt.split(",")
        for allele_no, valt_allele in enumerate(valt_alleles, 1):  # ref is 0
            candidate = mr.get_minimal_representation(vpos, vref, valt_allele)
            vpos_mr, vref_mr, valt_allele_mr = candidate
            # check if we've found a match
            if vchr == chr and vpos_mr == pos and vref_mr == ref and valt_allele_mr == alt:
                match_found = True
                # output the variant info as called in the reference VCF
                print("##Relevant line from VCF:  " + '\t'.join(cols[:9]))
                if find_indivs:
                    allele_label = str(allele_no)
                    format_fields = vformat.split(":")
                    gt_idx = format_fields.index("GT")  # in what order does genotype appear
                    for column_no in range(9, len(cols)):
                        call = cols[column_no]
                        alleles = split_genotype(call.split(":")[gt_idx])
                        # check if this individual has the allele in question
                        if allele_label in alleles:
                            # store this person and their call in the dict
                            variant_indivs[column_names[column_no]] = call
                break  # stop looking for more matching alleles
        if match_found:
            break  # stop looking for more matching sites

    if not match_found:
        print("##No matches found.")
        return
    if not find_indivs:
        return
    # at this point we have printed the variant call, and stored the info
    # on each individual with the variant.
    # now if possible we also want to look up what study they're from.
    # the reftable parameter is optional, so we'll check if the table exists
    if reftable is not None and os.path.isfile(reftable):
        print("#SAMPLE\tPROJECT\tCALL")
        for sample_name, call_info in variant_indivs.items():
            project_name = get_project_name(reftable, sample_name)
            if project_name is None:
                project_name = ""
            print("%s\t%s\t%s" % (sample_name, project_name, call_info))
    else:
        print("#SAMPLE\tCALL")
        for sample_name, call_info in variant_indivs.items():
            print("%s\t%s" % (sample_name, call_info))
Example #9
0
 def test_simple_snv(self):
     """A single-base substitution at 1001 is expected back unchanged."""
     variant = (1001, 'A', 'T')
     self.assertEqual(get_minimal_representation(*variant), variant)
Example #10
0
 def test_multiallelic_indels(self):
     """Check get_minimal_representation against a table of indel cases.

     Each entry pairs the (pos, ref, alt) passed in with the
     (pos, ref, alt) expected back; cases are checked in order.
     """
     cases = [
         ((1001, 'CTCC', 'CCCC'), (1002, 'T', 'C')),
         ((1001, 'CTCC', 'CCC'), (1001, 'CT', 'C')),
         ((1001, 'CTCC', 'CTC'), (1002, 'TC', 'T')),
         ((1001, 'CTAG', 'CTG'), (1002, 'TA', 'T')),
         ((1001, 'CTCC', 'CTACC'), (1002, 'T', 'TA')),
         ((1001, 'TCAGCAGCAG', 'TCAGCAG'), (1001, 'TCAG', 'T')),
         ((1001, 'CTT', 'CTTT'), (1001, 'C', 'CT')),
         ((1001, 'CTT', 'C'), (1001, 'CTT', 'C')),
         ((1001, 'CTT', 'CT'), (1001, 'CT', 'C')),
         ((1001, 'AAAATATATATAT', 'A'), (1001, 'AAAATATATATAT', 'A')),
         ((1001, 'AAAATATATATAT', 'AATAT'), (1001, 'AAAATATAT', 'A')),
         ((1001, 'ACACACACAC', 'AACAC'), (1001, 'ACACAC', 'A')),
     ]
     for given, expected in cases:
         self.assertEqual(get_minimal_representation(*given), expected)
	def test_simple_snv(self):
		"""A single-base substitution at 1000 is expected back unchanged."""
		snv = (1000, 'A', 'T')
		self.assertEqual(get_minimal_representation(*snv), snv)
 def test_multiallelic_indels(self):
     """Run a series of indel inputs through get_minimal_representation.

     Every call to check() names the input position/ref/alt followed by
     the (pos, ref, alt) tuple the function is expected to return.
     """

     def check(pos, ref, alt, expected):
         self.assertEqual(get_minimal_representation(pos, ref, alt), expected)

     # inputs sharing the 'CTCC' / 'CTAG' reference
     check(1001, 'CTCC', 'CCCC', (1002, 'T', 'C'))
     check(1001, 'CTCC', 'CCC', (1001, 'CT', 'C'))
     check(1001, 'CTCC', 'CTC', (1002, 'TC', 'T'))
     check(1001, 'CTAG', 'CTG', (1002, 'TA', 'T'))
     check(1001, 'CTCC', 'CTACC', (1002, 'T', 'TA'))
     # repeat-containing references
     check(1001, 'TCAGCAGCAG', 'TCAGCAG', (1001, 'TCAG', 'T'))
     check(1001, 'CTT', 'CTTT', (1001, 'C', 'CT'))
     check(1001, 'CTT', 'C', (1001, 'CTT', 'C'))
     check(1001, 'CTT', 'CT', (1001, 'CT', 'C'))
     check(1001, 'AAAATATATATAT', 'A', (1001, 'AAAATATATATAT', 'A'))
     check(1001, 'AAAATATATATAT', 'AATAT', (1001, 'AAAATATAT', 'A'))
     check(1001, 'ACACACACAC', 'AACAC', (1001, 'ACACAC', 'A'))
                    default=sys.stdout)
parser.add_argument('-i',
                    '--inf',
                    nargs='?',
                    type=argparse.FileType('rb'),
                    default=sys.stdin)
args = parser.parse_args()

# if there is a header line, just print it right back out
if args.has_header:
    header = args.inf.readline()
    args.outf.write(header)

# args.pos / args.ref / args.alt are used with "- 1" below, i.e. they are
# treated as 1-based column numbers; convert to list indices once.
pos_idx, ref_idx, alt_idx = args.pos - 1, args.ref - 1, args.alt - 1

# for other lines, process the POS, REF and ALT fields to minimal representation.
# Iterate the file directly so the whole input is never held in memory at once.
for line in args.inf:
    cols = line.strip().split('\t')
    pos, ref, alt = cols[pos_idx], cols[ref_idx], cols[alt_idx]
    newpos, newref, newalt = get_minimal_representation(int(pos), ref, alt)
    cols[pos_idx] = newpos
    cols[ref_idx] = newref
    cols[alt_idx] = newalt
    # newpos is an int, hence the map(str, ...) before joining
    args.outf.write("\t".join(map(str, cols)) + "\n")

# Close only handles that argparse opened for us, never the process's own
# stdin/stdout. (The handles live on `args`; bare `inf`/`outf` do not exist.)
if args.inf is not sys.stdin:
    args.inf.close()

if args.outf is not sys.stdout:
    args.outf.close()