Example #1
def variant_discard_by_position(vcf_path, pos_path):
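    # Print only those VCF lines whose chromosome:position is not on the
    # blacklist. Chromosome names are compared without the 'chr' prefix.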
    info('Reading list of blacklisted positions...')
    pos_file = zopen(pos_path)
    blacklist = []
    for line in pos_file:
        cols = line.rstrip().split('\t')
        if len(cols) < 2: continue
        chr = cols[0][3:] if cols[0].startswith('chr') else cols[0]
        blacklist.append(chr + ':' + cols[1])
    blacklist = set(blacklist)

    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('##'): break

    headers = line.rstrip().split('\t')
    sample_col = headers.index('ESP6500' if 'ESP6500' in
                               headers else 'ALT') + 1

    sys.stdout.write(line)
    for line in vcf_file:
        cols = line.rstrip().split('\t')
        chr = cols[0][3:] if cols[0].startswith('chr') else cols[0]
        if not chr + ':' + cols[1] in blacklist:
            sys.stdout.write(line)
Example #2
def variant_allele_fractions(vcf_path, pos_path):
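    # For each position listed in the SNP file, print per-sample allele
    # fractions (alt reads / total reads), assuming sample columns are
    # colon-separated as genotype:alt_reads:total_reads.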
    snps = set()
    for line in zopen(pos_path):
        pos = ':'.join(line[:-1].split('\t')[0:2])
        if not pos.startswith('chr'): pos = 'chr' + pos
        snps.add(pos)

    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('#'): break

    headers = line[:-1].split('\t')
    sample_col = headers.index('ESP6500' if 'ESP6500' in
                               headers else 'ALT') + 1

    sys.stdout.write(line)
    for line in vcf_file:
        cols = line[:-1].split('\t')
        if not ':'.join(cols[0:2]) in snps: continue

        reads = [gt.split(':')[1:3] for gt in cols[sample_col:]]
        sys.stdout.write('\t'.join(cols[:sample_col]))
        for r in reads:
            alt, total = float(r[0]), int(r[1])
            sys.stdout.write('\tNaN' if total == 0 else '\t%.2f' %
                             (alt / total))
        sys.stdout.write('\n')
Example #3
def variant_heterozygous_bases(vcf_path, kgenomes_path):
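    # Print variants located at known SNP positions where at least one sample
    # is heterozygous (genotype symbol index 2) with at least 15 reads.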
    is_snp = bytearray(int(300e6))
    for line in zopen(kgenomes_path):
        pos = int(line[:-1].split('\t')[1])
        is_snp[pos] = True

    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('#'): break

    headers = line[:-1].split('\t')
    sample_col = headers.index('ESP6500' if 'ESP6500' in
                               headers else 'ALT') + 1

    print('\t'.join(headers[0:4]))
    for line in vcf_file:
        cols = line[:-1].split('\t')
        if not is_snp[int(cols[1])]: continue

        gt_cols = cols[sample_col:]
        genotypes = [gt_symbols.index(gt[:gt.find(':')]) for gt in gt_cols]
        total_reads = [float(gt.split(':')[2]) for gt in gt_cols]
        if not any(g == 2 and r >= 15 for g, r in zip(genotypes, total_reads)):
            continue

        print('\t'.join(cols[0:4]))
Example #4
def swiss_annotate(input_path, bed_path):
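    # Annotate each "chr:pos" locus in the first column with features located
    # within 50 kb, sorted by distance. Lines without coordinates are passed
    # through unchanged.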
    features = []
    for line in zopen(bed_path):
        c = line.rstrip().split('\t')
        features.append((c[0], c[5], (int(c[1]), int(c[2])), c[3]))

    for line in zopen(input_path):
        t = line.rstrip().split('\t')
        chr, pos = None, None

        m = re.match(r'(chr.+):(\d+)', t[0])
        if m:
            chr = m.group(1)
            pos = int(m.group(2))

        if chr is None:
            sys.stdout.write(line)
            continue

        nearby = []
        for f in features:
            if f[0] != chr: continue
            dist = distance(pos, f[2])
            if dist < 50000:
                nearby.append((re.sub(r' \(ENSG.*?\)', '', f[3]), dist))

        nearby.sort(key=lambda x: x[1])
        sys.stdout.write(line[:-1])
        sys.stdout.write('\t')
        print(','.join(['%s (%d)' % f for f in nearby]))
Example #5
def sam_reads_raw(bam_path, out_prefix):
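    # Stream reads out of "samtools bam2fq": properly paired mates go to the
    # _1/_2 outputs, everything else (including orphans) to the unpaired
    # output. Unmatched mates are buffered in memory until their pair is seen.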
    out_1 = zopen('%s_1.reads.gz' % out_prefix, 'w')
    out_2 = zopen('%s_2.reads.gz' % out_prefix, 'w')
    out = zopen('%s.reads.gz' % out_prefix, 'w')

    reads_1 = {}
    reads_2 = {}

    # The "samtools bam2fq" command does not output supplementary or
    # secondary alignments. Each read only has one primary alignment.
    options = '-n' if has_mate_suffixes(bam_path) else ''
    bam2fq = shell_stdout('samtools bam2fq %s %s' % (options, bam_path))
    for line in bam2fq:
        if line[0] != '@': error('Invalid bam2fq output.')
        line = line[:-1]
        if line.endswith('/1'):
            segname = line[1:-2]
            mate = reads_2.pop(segname, None)
            if mate:
                out_1.write(next(bam2fq))
                out_2.write('%s\n' % mate)
            else:
                reads_1[segname] = next(bam2fq)[:-1]
        elif line.endswith('/2'):
            segname = line[1:-2]
            mate = reads_1.pop(segname, None)
            if mate:
                out_1.write('%s\n' % mate)
                out_2.write(next(bam2fq))
            else:
                reads_2[segname] = next(bam2fq)[:-1]
        else:
            out.write('%s\n' % next(bam2fq)[:-1])

        # Skip per-base qualities. They can start with '@'.
        next(bam2fq)
        next(bam2fq)

    info('Found %d orphan first mates.' % len(reads_1))
    for read_id in list(reads_1)[:5]:
        info('- Example: %s' % read_id)

    info('Found %d orphan second mates.' % len(reads_2))
    for read_id in list(reads_2)[:5]:
        info('- Example: %s' % read_id)

    if len(reads_1) > 0:
        for read in reads_1.values():
            out.write('%s\n' % read)

    if len(reads_2) > 0:
        for read in reads_2.values():
            out.write('%s\n' % read)

    out_1.close()
    out_2.close()
    out.close()
Example #6
def swiss_split_wig(wig_path, out_prefix, no_header=False):
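    # Split a WIG file into one gzipped file per chromosome, keyed on the
    # chrom= attribute of each fixedStep/variableStep declaration.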
    chrom_re = re.compile(r'chrom=(\w+)')
    chr = ''
    for line in zopen(wig_path):
        if line.startswith(('fixedS', 'variableS')):
            if chr: out.close()
            chr = chrom_re.search(line).group(1)
            out = zopen('%s_%s.wig.gz' % (out_prefix, chr), 'w')
            if not no_header: out.write(line)
            continue

        if chr: out.write(line)
Example #7
def sam_reads(bam_path, out_prefix):
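    # Convert alignments back into FASTQ. SAM FLAG bit 0x40 marks first mates
    # and 0x80 second mates; unmatched mates are buffered until their pair is
    # seen, and leftovers are written to the unpaired output.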
    fastq_1 = zopen('%s_1.fq.gz' % out_prefix, 'w')
    fastq_2 = zopen('%s_2.fq.gz' % out_prefix, 'w')
    fastq = zopen('%s.fq.gz' % out_prefix, 'w')

    reads_1 = {}
    reads_2 = {}

    # FIXME: We assume that each read only has one alignment in the BAM file.
    for al in read_sam(bam_path):
        flags = int(al[1])
        if flags & 0x40:
            rname = al[0][:-2] if al[0].endswith('/1') else al[0]
            mate = reads_2.pop(rname, None)
            if mate:
                fastq_1.write('@%s/1\n%s\n+\n%s\n' % (rname, al[9], al[10]))
                fastq_2.write('@%s/2\n%s\n+\n%s\n' % (rname, mate[0], mate[1]))
            else:
                reads_1[rname] = (al[9], al[10])
        elif flags & 0x80:
            rname = al[0][:-2] if al[0].endswith('/2') else al[0]
            mate = reads_1.pop(rname, None)
            if mate:
                fastq_1.write('@%s/1\n%s\n+\n%s\n' % (rname, mate[0], mate[1]))
                fastq_2.write('@%s/2\n%s\n+\n%s\n' % (rname, al[9], al[10]))
            else:
                reads_2[rname] = (al[9], al[10])
        else:
            fastq.write('@%s\n%s\n+\n%s\n' % (al[0], al[9], al[10]))

    info('Found %d orphan first mates.' % len(reads_1))
    for read_id in list(reads_1)[:5]:
        info('- Example: %s' % read_id)

    info('Found %d orphan second mates.' % len(reads_2))
    for read_id in list(reads_2)[:5]:
        info('- Example: %s' % read_id)

    if len(reads_1) > 0:
        for rname, read in reads_1.items():
            fastq.write('@%s\n%s\n+\n%s\n' % (rname, read[0], read[1]))

    if len(reads_2) > 0:
        for rname, read in reads_2.items():
            fastq.write('@%s\n%s\n+\n%s\n' % (rname, read[0], read[1]))

    fastq_1.close()
    fastq_2.close()
    fastq.close()
Example #8
def variant_annotate(vcf_path,
                     genome='~/tools/annovar-2016-02-01/humandb/hg38'):
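    # Run ANNOVAR's table_annovar.pl (refGene, cosmic70, 1000g2014oct_all,
    # exac03) on the reformatted VCF, then rewrite the multianno output with
    # the column headers listed below.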
    format_annovar(vcf_path, 'anno_tmp.vcf')
    humandb_dir, genome_version = os.path.split(genome)
    shell('table_annovar.pl anno_tmp.vcf %s -buildver %s --remove --otherinfo '
          '--outfile annotated -operation g,f,f,f '
          '-protocol refGene,cosmic70,1000g2014oct_all,exac03' %
          (humandb_dir, genome_version))

    anno = open('annotated.%s_multianno.txt' % genome_version)
    out = zopen('annotated.vcf.gz', 'w')
    next(anno)
    line = next(anno)
    headers = [
        'CHROM', 'POSITION', 'REF', 'ALT', 'FUNCTION', 'GENE',
        'EXONIC_FUNCTION', 'AA_CHANGE', 'COSMIC', '1000G', 'EXAC'
    ]
    headers += line.rstrip('\n').split('\t')[20:]
    out.write('\t'.join(headers) + '\n')
    for line in anno:
        c = line.rstrip('\n').split('\t')
        out.write('\t'.join(c[0:2] + c[3:7] + c[8:13] + c[20:]))
        out.write('\n')
    out.close()

    os.remove('anno_tmp.vcf')
    os.remove('annotated.%s_multianno.txt' % genome_version)
    if num_lines('annotated.invalid_input') <= 1:
        os.remove('annotated.invalid_input')
    if num_lines('annotated.refGene.invalid_input') <= 1:
        os.remove('annotated.refGene.invalid_input')
Example #9
def ensembl_transcript_bed(gtf_path):
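    # Collect the exons of every transcript from the GTF file, then print one
    # BED line per transcript spanning its first to last exon, with the name
    # field formatted as gene_id:gene_name:transcript_id.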
    tx_id_to_gene = {}
    tx_exons = {}

    gtf_file = zopen(gtf_path)
    for line in gtf_file:
        if line.startswith('#'): continue
        c = line.rstrip('\n').split('\t')
        if not c[0] in human_chr: continue
        if not c[1] in accepted_gene_types: continue
        if c[2] != 'exon': continue

        chr, start, end, strand = c[0], int(c[3]), int(c[4]), c[6]
        if not chr.startswith('chr'): chr = 'chr' + chr

        tx_id = re.search(r'transcript_id "(.+?)"', line).group(1)
        gene_id = re.search(r'gene_id "(.+?)"', line).group(1)
        gene_name = re.search(r'gene_name "(.+?)"', line).group(1)

        exons = tx_exons.setdefault(tx_id, [])
        exons.append((chr, strand, start, end))
        tx_id_to_gene[tx_id] = (gene_id, gene_name)

    for tx_id, exons in tx_exons.items():
        start, end = min(ex[2] for ex in exons), max(ex[3] for ex in exons)
        print('%s\t%d\t%d\t%s:%s:%s\t\t%s' %
              (exons[0][0], start - 1, end, tx_id_to_gene[tx_id][0],
               tx_id_to_gene[tx_id][1], tx_id, exons[0][1]))
Example #10
def partition(samples_path, num_partitions):
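	# Split the sample list into num_partitions roughly equal batches, keeping
	# samples that share a TCGA patient ID in the same batch.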
	samples = [line.strip() for line in zopen(samples_path)]
	part_size = float(len(samples)) / num_partitions
	partition_ends = [int((p+1) * part_size) for p in range(num_partitions)]
	print(partition_ends)

	patient_ids = []
	num_without_pid = 0
	for s in samples:
		m = re.search('TCGA-..-....', s)
		if not m: num_without_pid += 1
		patient_ids.append(m.group(0) if m else 'zzz' + s)

	if num_without_pid:
		info('WARNING: %d sample names did not contain a TCGA patient ID.' % num_without_pid)

	samples, patient_ids = zip(*sorted(zip(samples, patient_ids),
		key=lambda x: x[1]))

	partitions = []
	for p in range(num_partitions):
		first = sum(len(p) for p in partitions)
		last = partition_ends[p] - 1
		part = [s for s in samples[first:last+1]]
		while last + 1 < len(samples) and \
			patient_ids[last+1] == patient_ids[last]:
			part.append(samples[last+1])
			last += 1
		partitions.append(part)

	for idx, part in enumerate(partitions):
		out = open('batch_%d.txt' % (idx+1), 'w')
		for s in part: out.write('%s\n' % s)
		out.close()
Example #11
def read_fixed_wig(wig_path):
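	# Parse a fixedStep WIG file into per-chromosome tracks (start, step, span,
	# values). Values are read into a preallocated array that is trimmed once
	# the track ends.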
	tracks = {}
	chr = ''
	for line in zopen(wig_path):
		if line.startswith('fixedStep'):
			if chr:
				track.values = values[0:N]	 # Remove preallocated space
			track = Object()
			values = np.zeros(1000000)
			m = re.search(r'chrom=(\w+)', line)
			chr = m.group(1)
			m = re.search(r'start=(\d+)', line)
			track.start = int(m.group(1))
			m = re.search(r'step=(\d+)', line)
			track.step = int(m.group(1))
			m = re.search(r'span=(\d+)', line)
			track.span = int(m.group(1)) if m else -1
			N = 0
			tracks[chr] = track
			continue
		
		if chr:
			values[N] = float(line)
			N += 1
	
	if chr: track.values = values[0:N]	 # Remove preallocated space
	return tracks
Example #12
def coverage_cds(bam_path, gtf_path):
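	# Build a per-chromosome boolean map of coding (CDS) bases from the GTF,
	# then histogram per-base coverage from "bedtools genomecov -d" over the
	# coding positions only.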
	
	chr_sizes = ref_sequence_sizes(bam_path)
	
	info('Constructing a map of coding regions...')
	coding = {}
	for chr, size in chr_sizes.items():
		coding[chr] = [False] * size
	for line in zopen(gtf_path):
		if line.startswith('#'): continue
		cols = line.split('\t')
		if cols[2] != 'CDS': continue
		if len(cols[0]) > 5: continue   # Ignore chromosomes other than chrXX
		if not cols[0] in coding: continue
		start, end = int(cols[3]), int(cols[4])
		coding[cols[0]][start-1:end] = [True] * (end - start + 1)
		
	info('Calculating a coverage histogram...')
	coverage_hist = [0] * 200
	chr = ''
	pos = 0
	for line in shell_stdout('bedtools genomecov -d -split -ibam %s' % bam_path):
		cols = line.split('\t')
		if cols[0] != chr:
			chr = cols[0]
			cds = coding[chr]
			pos = int(cols[1])-2
			info('%s...' % chr)
		pos += 1
		if cds[pos]:
			coverage_hist[min(int(cols[2]), len(coverage_hist)-1)] += 1
			
	print('Coverage histogram:')
	print('===================')
	for cov in range(0, len(coverage_hist)):
		print('%d: %d' % (cov, coverage_hist[cov]))
Example #13
def variant_filter(vcf_path, nonsynonymous, no_1000g):
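    # Keep only variants that pass the requested filters: protein-altering
    # exonic function and/or an empty 1000 Genomes frequency column.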
    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('#'): break
    sys.stdout.write(line)

    headers = line[:-1].split('\t')

    if nonsynonymous and not 'EXONIC_FUNCTION' in headers:
        error('Cannot find exonic function column.')
    if no_1000g and not '1000G' in headers:
        error('Cannot find 1000 Genomes column.')

    sample_col = headers.index('ESP6500' if 'ESP6500' in
                               headers else 'ALT') + 1
    col_1000g = headers.index('1000G') if no_1000g else None
    col_exonic_func = headers.index('EXONIC_FUNCTION') if nonsynonymous else None

    for line in vcf_file:
        cols = line[:-1].split('\t')

        if nonsynonymous:
            if not cols[col_exonic_func].startswith(
                ('nonsynonymous', 'frameshift', 'stopgain', 'stoploss',
                 'nonframeshift')):
                continue

        if no_1000g:
            if cols[col_1000g]: continue

        sys.stdout.write(line)
Example #14
def discard_if_in_controls(vcf_path, control_samples, threshold):
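    # Discard variants with a genotype call (symbol index above 1) in at least
    # `threshold` control samples. Controls are identified by matching sample
    # headers against the given regular expressions.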
    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('##'): break

    headers = line.rstrip().split('\t')
    sample_col = headers.index('ESP6500' if 'ESP6500' in
                               headers else 'ALT') + 1
    control = [
        any(re.search(rx, s) for rx in control_samples)
        for s in headers[sample_col:]
    ]
    if not any(control): error('No control samples found.')

    info('Using these %d control samples:' % sum(control))
    for s, c in zip(headers[sample_col:], control):
        if c: info('- %s' % s)

    sys.stdout.write(line)
    for line in vcf_file:
        cols = line.rstrip('\n').split('\t')[sample_col:]
        genotypes = [gt_symbols.index(c[:c.find(':')]) for c in cols]
        if sum(c and gt > 1 for c, gt in zip(control, genotypes)) >= threshold:
            continue
        sys.stdout.write(line)
Example #15
def swiss_igv(tsv_path, data_col, one_based=True):
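    # Convert a tab-delimited file with chromosome and position columns into
    # an IGV-style CHROMOSOME/START/END/FEATURE track, converting one-based
    # positions to zero-based starts when requested.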
    tsv_file = zopen(tsv_path)
    headers = next(tsv_file)[:-1].split('\t')

    chrom_col = [
        i for i, h in enumerate(headers[:data_col])
        if re.match('chrom', h, re.I)
    ]
    if len(chrom_col) != 1: error('Cannot find chromosome column.')
    chrom_col = chrom_col[0]

    pos_col = [
        i for i, h in enumerate(headers[:data_col])
        if re.match('pos', h, re.I)
    ]
    if len(pos_col) != 1: error('Cannot find position column.')
    pos_col = pos_col[0]

    print('CHROMOSOME\tSTART\tEND\tFEATURE\t' + '\t'.join(headers[data_col:]))
    for line in tsv_file:
        tokens = line[:-1].split('\t')
        chr = tokens[chrom_col]
        pos = int(tokens[pos_col])
        if one_based: pos -= 1
        sys.stdout.write('%s\t%d\t%d\t-\t' % (chr, pos, pos + 1))
        print('\t'.join(tokens[data_col:]))
Example #16
def summarize(exon_expr_path):
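	# Merge exon-level expression rows that share a name (column 4), summing
	# expression per sample and tracking each merged feature's span and total
	# exon length.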
	file = zopen(exon_expr_path)
	header = next(file)
	samples = header.rstrip('\n').split('\t')[4:]
	S = len(samples)
	
	features = {}
	for line in file:
		cols = line.rstrip('\n').split('\t')
		if len(cols) < 4 + S: continue
		chr = cols[0]
		start = int(cols[1]) + 1
		end = int(cols[2])
		expr = [float(x) for x in cols[4:]]
		
		f = features.setdefault(cols[3], MergedFeature(S))
		f.expr = [a + b for a, b in zip(f.expr, expr)]
		f.total_len += end - start + 1
		f.chromosome = chr
		if f.start == -1 or f.start > start: f.start = start
		if f.end == -1 or f.end < end: f.end = end
	
	print('CHROM\tSTART\tEND\tNAME\tLENGTH\t' + '\t'.join(samples))
	for name, f in features.items():
		print('%s\t%d\t%d\t%s\t%d\t%s' % (f.chromosome, f.start, f.end,
			name, f.total_len, '\t'.join(str(e) for e in f.expr)))
Example #17
def fasta_remove_adapters(fasta_path, adapter):
    # Convert the adapter into a regular expression
    if len(adapter) < 5: error('Adapter sequence is too short.')
    adapter_re = adapter[:5]
    for base in adapter[5:]:
        adapter_re += '(?:' + base
    adapter_re += (len(adapter) - 5) * ')?'
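    # The pattern matches the first five adapter bases exactly, followed by
    # any prefix of the remaining bases, e.g. ACGTA(?:C(?:G)?)? for ACGTACG.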
    info('Adapter regular expression: %s' % adapter_re)
    adapter_re = re.compile(adapter_re)

    fasta = zopen(fasta_path)
    for line in fasta:
        if line[0] == '#':
            sys.stdout.write(line)
        elif line[0] == '>':
            sys.stdout.write(line)
            seq = next(fasta)[:-1]
            m = adapter_re.search(seq)
            if m: seq = seq[:m.start()]
            print(seq)
        elif line[0] == '@':
            sys.stdout.write(line)
            seq = next(fasta)[:-1]
            m = adapter_re.search(seq)
            trim_len = m.start() if m else len(seq)
            print(seq[:trim_len])
            sys.stdout.write(next(fasta))
            print(next(fasta)[:trim_len])
Example #18
def variant_conservation(vcf_path):
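    # Output a variableStep WIG track giving, for each fully genotyped
    # variant, the fraction of samples sharing the majority (ALT vs non-ALT)
    # call, rescaled to the range [0, 1].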
    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('##'): break

    headers = line.rstrip().split('\t')
    sample_col = headers.index('ALT') + 1
    samples = headers[sample_col:]
    chr = None

    Sd2 = math.ceil(len(samples) / 2.0)

    for line in vcf_file:
        cols = line.rstrip().split('\t')
        if cols[0] != chr:
            chr = cols[0]
            print('variableStep chrom=%s' % chr)

        genotypes = [
            gt_symbols.index(gt[:gt.find(':')]) for gt in cols[sample_col:]
        ]
        if any(g == 0 for g in genotypes): continue

        is_alt = [g >= 2 for g in genotypes]
        conserved = max(sum(is_alt), len(is_alt) - sum(is_alt))
        conserved = (conserved - Sd2) / (len(samples) - Sd2)  # -> [0,1]
        print('%s\t%.2f' % (cols[1], conserved))
Example #19
def fasta_rna_to_dna(fasta_path):
    fasta = zopen(fasta_path)
    for line in fasta:
        if line[0] in '#>@+':
            sys.stdout.write(line)
        else:
            sys.stdout.write(line.upper().replace('U', 'T'))
    fasta.close()
Example #20
def fasta_c2t(fasta_path):
    fasta = zopen(fasta_path)
    for line in fasta:
        sys.stdout.write(line)
        if line[0] in '>@':
            line = next(fasta)  # Read sequence
            sys.stdout.write(line.upper().replace('C', 'T'))

    fasta.close()
Example #21
def coverage_grid(genome_path, winsize, step):
	for line in zopen(genome_path):
		if not line.strip(): continue
		c = line.rstrip('\n').split('\t')
		chr, chr_len = c[0], int(c[1])
		start = 1
		while start + winsize < chr_len:
			print('%s\t%d\t%d' % (chr, start - 1, start + winsize - 1))
			start += step
Example #22
def fasta_flatten(fasta_path, output_dir):
    fasta = zopen(fasta_path)
    flat_file = None
    for line in fasta:
        if line[0] == '>':
            if flat_file: flat_file.close()
            flat_file = open(output_dir + '/' + line[1:].strip() + '.seq', 'w')
        else:
            flat_file.write(line.strip())
    if flat_file: flat_file.close()
Example #23
def fasta_trim(fasta_path, trim_len):
    fasta = zopen(fasta_path)
    for line in fasta:
        if line[0] in '@+>':
            sys.stdout.write(line)
            line = next(fasta)
            if len(line) - 1 > trim_len:
                print(line[:trim_len])
            else:
                sys.stdout.write(line)
Example #24
def ensembl_to_hugo(expr_path, gtf_path):
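	# Prefix Ensembl gene IDs in the NAME column with their gene symbol, using
	# gene_id -> gene_name pairs parsed from the GTF file.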
	translations = {}
	for line in zopen(gtf_path):
		m = re.search('gene_id "(.*?)";.*; gene_name "(.*?)"', line)
		if not m: continue
		translations[m.group(1)] = m.group(2)
	
	file = zopen(expr_path)
	header = next(file)
	headers = header[:-1].split('\t')
	feature_col = headers.index('NAME')
	
	sys.stdout.write(header)
	for line in file:
		cols = line[:-1].split('\t')
		translated = translations.get(cols[feature_col])
		if translated:
			cols[feature_col] = translated + ':' + cols[feature_col]
		print('\t'.join(cols))
Example #25
def ensembl_cleanup(gtf_path):
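    # Keep only features on the primary chromosomes, drop nonsense-mediated
    # decay transcripts, and add a 'chr' prefix to chromosome names.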
    valid_chr = set([
        '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13',
        '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y', 'MT'
    ])
    for line in zopen(gtf_path):
        chr = line[:line.find('\t')]
        if chr in valid_chr and not 'nonsense_mediated' in line:
            sys.stdout.write('chr')
            sys.stdout.write(line)
Example #26
def filter_distance(sv_path, min_distance):
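    # Discard intrachromosomal rearrangements whose breakpoints lie closer
    # than min_distance; header lines and interchromosomal events are kept.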
    for line in zopen(sv_path):
        if not line.startswith('chr'):
            sys.stdout.write(line)
            continue

        tokens = line[:-1].split('\t')

        if tokens[0] != tokens[5] or abs(int(tokens[2]) -
                                         int(tokens[7])) >= min_distance:
            sys.stdout.write(line)
Example #27
def variant_heterozygous_concordance(vcf_path, kgenomes_path, test_rx, ref_rx):
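    # Among known SNP positions where the reference sample is heterozygous
    # (genotype symbol index 2), report how often the test sample is also
    # called heterozygous.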
    is_snp = np.zeros(300 * 1000 * 1000, np.bool_)
    for line in zopen(kgenomes_path):
        pos = int(line[:-1].split('\t')[1])
        is_snp[pos] = True

    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('#'): break

    headers = line[:-1].split('\t')
    sample_col = headers.index('ESP6500' if 'ESP6500' in
                               headers else 'ALT') + 1

    test_col = [
        i for i, h in enumerate(headers)
        if re.search(test_rx, h) and i >= sample_col
    ]
    ref_col = [
        i for i, h in enumerate(headers)
        if re.search(ref_rx, h) and i >= sample_col
    ]
    if len(test_col) != 1: error('Test sample not found.')
    if len(ref_col) != 1: error('Reference sample not found.')
    test_col, ref_col = test_col[0], ref_col[0]

    total_hetz_in_ref = 0
    total_concordant = 0
    for line in vcf_file:
        cols = line[:-1].split('\t')
        if not is_snp[int(cols[1])]: continue

        test = cols[test_col]
        test_gt = gt_symbols.index(test[:test.find(':')])
        ref = cols[ref_col]
        ref_gt = gt_symbols.index(ref[:ref.find(':')])

        if ref_gt == 2:
            total_hetz_in_ref += 1
            total_concordant += (test_gt == 2)

    print('Concordance was %.1f%% (%d / %d).' %
          (float(total_concordant) / total_hetz_in_ref * 100, total_concordant,
           total_hetz_in_ref))
Example #28
def fasta_split(fasta_path, tag_length, anchors_5p_path, anchors_3p_path):
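    # Split each read into 5' and 3' anchors of tag_length bases, written as
    # FASTA entries named <read number>_<read sequence>/1 and /2. Reads
    # shorter than 2 * tag_length bases are skipped.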
    fasta = zopen(fasta_path, 'r')
    tags_5p = zopen(anchors_5p_path, 'w')
    tags_3p = zopen(anchors_3p_path, 'w')

    R = 0

    for line in fasta:
        if line[0] == '+':
            next(fasta)
            continue
        if line[0] in '>@#': continue

        R += 1
        if len(line) <= 2 * tag_length: continue
        tags_5p.write('>%d_%s/1\n%s\n' % (R, line[:-1], line[0:tag_length]))
        tags_3p.write('>%d_%s/2\n%s\n' %
                      (R, line[:-1], line[-tag_length - 1:-1]))

    tags_5p.close()
    tags_3p.close()
Example #29
def variant_statistics(vcf_path):
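    # Tally mutation counts (genotype symbol index above 1) per sample, per
    # chromosome and, when a NEARBY_GENES column is present, per gene.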
    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('#'): break

    headers = line.rstrip('\n').split('\t')
    sample_col = headers.index('ESP6500' if 'ESP6500' in
                               headers else 'ALT') + 1
    samples = headers[sample_col:]

    nearby_gene_col = headers.index('NEARBY_GENES') \
     if 'NEARBY_GENES' in headers else None

    mutations_per_sample = np.zeros(len(samples))
    mutations_per_chr = defaultdict(lambda: np.zeros(len(samples)))
    mutations_per_gene = defaultdict(lambda: np.zeros(len(samples)))

    for line in vcf_file:
        cols = line[:-1].split('\t')
        gtypes = [gt.split(':')[0] for gt in cols[sample_col:]]
        gtypes = np.array([gt_symbols.index(gt) for gt in gtypes])
        mutations_per_sample += (gtypes > 1)
        mutations_per_chr[cols[0]] += (gtypes > 1)

        if nearby_gene_col is not None:
            for nearby in cols[nearby_gene_col].split(','):
                mutations_per_gene[nearby] += (gtypes > 1)

    print('Sample mutation counts:')
    for s, sample_name in enumerate(samples):
        print('%s: %d' % (sample_name, mutations_per_sample[s]))

    print('Mutations per chromosome:')
    chrs = natural_sorted(mutations_per_chr.keys())
    print('SAMPLE\t%s' % '\t'.join(chrs))
    for s, sample_name in enumerate(samples):
        total = sum(mutations_per_chr[chr][s] for chr in chrs)
        if total == 0: continue
        sys.stdout.write(sample_name)
        for chr in chrs:
            sys.stdout.write('\t%d (%.1f)' %
                             (mutations_per_chr[chr][s],
                              float(mutations_per_chr[chr][s]) / total * 100))
        sys.stdout.write('\n')

    print('Top mutated genes:')
    top_genes = sorted(mutations_per_gene.items(),
                       key=lambda x: sum(x[1] > 0),
                       reverse=True)
    for top in top_genes[0:100]:
        mut_samples = sum(top[1] > 0)
        if mut_samples < 2: continue
        print('%s\t%d samples' % (top[0], mut_samples))
Example #30
def annotate_variants(sv_path, bed_path):
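    # Annotate both breakpoints of each rearrangement with features located
    # within 100 kb, sorted by distance.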
    features = []
    bed_file = zopen(bed_path)
    for line in bed_file:
        c = line.rstrip().split('\t')
        features.append((c[0], c[5], (int(c[1]), int(c[2])), c[3]))

    print(sv_file_header)
    sv_file = zopen(sv_path)
    for line in sv_file:
        if not line.startswith('chr'): continue

        tokens = line[:-1].split('\t')
        chr_1 = tokens[0]
        strand_1 = tokens[1]
        pos_1 = int(tokens[2])
        chr_2 = tokens[5]
        strand_2 = tokens[6]
        pos_2 = int(tokens[7])

        nearby_features_1 = [(re.sub(r' \(ENSG.*?\)', '',
                                     f[3]), distance_to_gene(pos_1, f[2]))
                             for f in features if f[0] == chr_1]
        nearby_features_2 = [(re.sub(r' \(ENSG.*?\)', '',
                                     f[3]), distance_to_gene(pos_2, f[2]))
                             for f in features if f[0] == chr_2]

        nearby_features_1 = [f for f in nearby_features_1 if f[1] < 100000]
        nearby_features_2 = [f for f in nearby_features_2 if f[1] < 100000]

        nearby_features_1.sort(key=lambda x: x[1])
        nearby_features_2.sort(key=lambda x: x[1])

        tokens[3] = ', '.join(['%s (%d)' % f for f in nearby_features_1])
        tokens[8] = ', '.join(['%s (%d)' % f for f in nearby_features_2])

        print('%s' % '\t'.join(tokens))

    sv_file.close()