Example 1
def variant_annotate(vcf_path,
                     genome='~/tools/annovar-2016-02-01/humandb/hg38'):
    format_annovar(vcf_path, 'anno_tmp.vcf')
    humandb_dir, genome_version = os.path.split(genome)
    shell('table_annovar.pl anno_tmp.vcf %s -buildver %s --remove --otherinfo '
          '--outfile annotated -operation g,f,f,f '
          '-protocol refGene,cosmic70,1000g2014oct_all,exac03' %
          (humandb_dir, genome_version))

    anno = open('annotated.%s_multianno.txt' % genome_version)
    out = zopen('annotated.vcf.gz', 'w')
    next(anno)  # Skip the first header line.
    line = next(anno)
    headers = [
        'CHROM', 'POSITION', 'REF', 'ALT', 'FUNCTION', 'GENE',
        'EXONIC_FUNCTION', 'AA_CHANGE', 'COSMIC', '1000G', 'EXAC'
    ]
    headers += line.rstrip('\n').split('\t')[20:]
    out.write('\t'.join(headers) + '\n')
    for line in anno:
        c = line.rstrip('\n').split('\t')
        out.write('\t'.join(c[0:2] + c[3:7] + c[8:13] + c[20:]))
        out.write('\n')
    out.close()

    os.remove('anno_tmp.vcf')
    os.remove('annotated.%s_multianno.txt' % genome_version)
    if num_lines('annotated.invalid_input') <= 1:
        os.remove('annotated.invalid_input')
    if num_lines('annotated.refGene.invalid_input') <= 1:
        os.remove('annotated.refGene.invalid_input')
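These snippets assume the usual standard-library imports (os, re, sys, gzip, datetime) plus a handful of small local helpers that are not shown. Minimal sketches of three of them follow; these are hypothetical reconstructions assuming a bash-backed shell wrapper and gzip-aware file handling, not the original implementations.

import gzip
import subprocess

def shell(command, stdout=None, stderr=None):
    # Run a command through bash; the examples depend on bash features
    # such as <( ... ) process substitution and glob expansion.
    subprocess.check_call(command, shell=True, executable='/bin/bash',
                          stdout=stdout, stderr=stderr)

def zopen(path, mode='r'):
    # Open a file, transparently handling gzip compression by suffix.
    return gzip.open(path, mode + 't') if path.endswith('.gz') \
        else open(path, mode)

def num_lines(path):
    # Number of lines in a (possibly gzipped) text file.
    with zopen(path) as f:
        return sum(1 for _ in f)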
Example 2
def sam_count(bam_path,
              bed_path,
              genome_path='~/organisms/homo_sapiens/hg19.chrom.sizes'):
    bed_cols = len(next(open(bed_path)).split('\t'))
    shell(
        'bedtools coverage -split -sorted -counts -g %s -a %s -b %s | cut -f%d'
        % (genome_path, bed_path, bam_path, bed_cols + 1))
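A hypothetical invocation (file names are placeholders): bedtools appends the read count as an extra column after the BED fields, and the cut strips everything else, so this prints one count per interval. With -sorted, the BAM and BED must follow the chromosome order of the .chrom.sizes file.

sam_count('sample.bam', 'exons.bed',
          genome_path='~/organisms/homo_sapiens/hg19.chrom.sizes')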
Example 3
def swiss_download_sra(sra_study):
    if not sra_study.startswith('SRP'):
        error('SRA study identifier must begin with "SRP".')

    shell('/data/csb/tools/ncftp-3.2.5/bin/ncftpget -R -v '
          'ftp-trace.ncbi.nlm.nih.gov ./ '
          '/sra/sra-instant/reads/ByStudy/sra/SRP/%s/%s' %
          (sra_study[:6], sra_study))
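error() above, like the info() calls in later examples, is assumed to be a thin logging helper; plausible sketches:

import sys

def info(message):
    # Report progress on stderr.
    sys.stderr.write(message + '\n')

def error(message):
    # Report a fatal problem on stderr and abort.
    sys.stderr.write('ERROR: ' + message + '\n')
    sys.exit(1)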
Example 4
def lftp_mirror(rule, dry_run=False):
	cmds = open('.lftp_script', 'w')
	cmds.write('open -u %s,%s sftp://%s\n' % (
		rule.username, rule.password, rule.dst_host))
	cmds.write('mirror -P3 -Rae %s %s %s\n' % (
		'--dry-run' if dry_run else '-v', rule.src_dir, rule.dst_dir))
	cmds.close()

	if dry_run:
		userpass = rule.username + ':' + rule.password + '@'
		host = rule.dst_host

		out = shell_stdout('lftp -f .lftp_script')
		for line in out:
			if line.startswith('chmod'): continue
			if line.startswith('mkdir'): continue

			m = re.match('get -O sftp://(.+) file:/.+/(.+)', line)
			if m:
				dst = m.group(1)
				if dst.startswith(userpass): dst = dst[len(userpass):]
				if dst.startswith(host): dst = dst[len(host):]
				print('ADD %s/%s' % (dst, m.group(2)))
				continue

			m = re.match('get -e -O sftp://(.+) file:/.+/(.+)', line)
			if m:
				dst = m.group(1)
				if dst.startswith(userpass): dst = dst[len(userpass):]
				if dst.startswith(host): dst = dst[len(host):]
				print('UPDATE %s/%s' % (dst, m.group(2)))
				continue

			m = re.match('rm .*sftp://(.+)', line)
			if m:
				dst = m.group(1)
				if dst.startswith(userpass): dst = dst[len(userpass):]
				if dst.startswith(host): dst = dst[len(host):]
				print('DELETE %s' % dst)
				continue

			sys.stdout.write(line)
	else:
		shell('lftp -f .lftp_script')

	os.remove('.lftp_script')
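shell_stdout() presumably streams a command's output one line at a time; a minimal sketch consistent with the shell() sketch above:

import subprocess

def shell_stdout(command):
    # Run a command through bash and yield its stdout line by line.
    proc = subprocess.Popen(command, shell=True, executable='/bin/bash',
                            stdout=subprocess.PIPE, universal_newlines=True)
    for line in proc.stdout:
        yield line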
Example 5
def sam_compact(bam_path):
    shell('samtools view -H %s' % bam_path)
    short_id = {}
    id_counter = 1
    for al in read_sam(bam_path, 'D'):
        rid = al[0]
        if rid[-2] == '/': rid = rid[:-2]  # Strip /1 and /2 mate suffixes.
        new_id = short_id.get(rid)
        if new_id:
            # Second mate of a pair: reuse the ID assigned to its mate.
            del short_id[rid]
        else:
            # Unseen read name: assign the next integer as its short ID.
            new_id = str(id_counter)
            id_counter += 1
            short_id[rid] = new_id
        al[0] = new_id
        al[10] = '*'   # Discard per-base qualities.
        sys.stdout.write('\t'.join(al))
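read_sam() is not shown here. Below is a sketch that streams alignments as SAM field lists via samtools view; the trailing newline is assumed to stay attached to the last field, which is why sam_compact() writes no explicit newline. The original's mode string (the 'D' argument) is codebase-specific and not reproduced.

def read_sam(bam_path, mode=''):
    # Yield each alignment as a list of SAM fields; the last field keeps
    # its trailing newline.
    for line in shell_stdout('samtools view %s' % bam_path):
        yield line.split('\t')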
Example 6
def parallel_worker(log_dir):
	with open('%s/tasks' % log_dir) as f:
		command = next(f).strip()
		targets = [target.strip() for target in f]
		
	for target in targets:
		out = open_exclusive('%s/%s.out' % (log_dir, sanitize_path(target)))
		if not out: continue
		cmd_with_target = 'export x=%s; %s' % (target, command)
		out.write('%s\n%s\n' % (cmd_with_target, '-'*80))
		out.flush()
		start_time = datetime.datetime.now()
		shell(cmd_with_target, stdout=out, stderr=out)
		end_time = datetime.datetime.now()
		out.write('%s\nJOB FINISHED. ELAPSED TIME WAS %s.\n' %
			('-'*80, end_time - start_time))
		out.close()
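open_exclusive() is what lets several workers share one task list without a lock server: the worker that creates a target's log file first claims that target. A plausible sketch using O_EXCL (sanitize_path() is assumed to turn the target string into a safe filename):

import os

def open_exclusive(path):
    # Atomically create the file; if another worker already created it,
    # os.open() raises EEXIST and we return None so the task is skipped.
    try:
        fd = os.open(path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
    except OSError:
        return None
    return os.fdopen(fd, 'w')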
Example 7
def cghub_download(samples):
    for sample in samples:
        # Don't redownload files that are already present.
        existing = {}
        for root, dirnames, filenames in os.walk('.'):
            for f in filenames:
                path = os.path.join(root, f)
                existing[f] = os.stat(path).st_size

        filename = sample.files[0]
        filesize = sample.filesizes[0]

        if filename in existing and existing[filename] == filesize:
            info('%s has already been downloaded...' % filename)
            continue

        info('Downloading %s...' % filename)
        shell('gtdownload -v -d %s -c ~/tools/genetorrent*/cghub_2016.key' %
              sample.analysis_data_uri)
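cghub_download() only requires objects exposing files, filesizes and analysis_data_uri; a hypothetical stand-in for illustration (all values below are placeholders):

class Sample(object):
    def __init__(self, files, filesizes, analysis_data_uri):
        self.files = files                          # e.g. ['sample.bam']
        self.filesizes = filesizes                  # sizes in bytes
        self.analysis_data_uri = analysis_data_uri

cghub_download([Sample(['sample.bam'], [1234567890],
                       'https://cghub.example.org/analysis/0001')])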
Example 8
def sam_unaligned_reads(bam_path):
    # The "samtools bam2fq" command does not output supplementary or
    # secondary alignments. Each read has max 1 primary alignment.
    options = '-n' if has_mate_suffixes(bam_path) else ''
    if has_base_qualities(bam_path):
        shell('samtools view -u -f 0x4 -F 0x900 %s | samtools bam2fq %s -' %
              (bam_path, options))
    else:
        bam2fq = shell_stdout(
            'samtools view -u -f 0x4 -F 0x900 %s | samtools bam2fq %s -' %
            (bam_path, options))
        for line in bam2fq:
            if line[0] != '@': error('Invalid bam2fq output.')
            sys.stdout.write('>')
            sys.stdout.write(line[1:])
            sys.stdout.write(next(bam2fq))

            # Skip per-base qualities. They can start with '@'.
            next(bam2fq)
            next(bam2fq)
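The two BAM-introspection helpers are assumed to peek at the first alignment, reusing the shell_stdout() sketch from earlier; heuristic sketches:

def has_mate_suffixes(bam_path):
    # Heuristic: does the first read name end in a /1 or /2 mate suffix?
    for line in shell_stdout('samtools view %s' % bam_path):
        return line.split('\t')[0][-2:] in ('/1', '/2')
    return False

def has_base_qualities(bam_path):
    # Heuristic: a QUAL field of '*' means base qualities are absent.
    for line in shell_stdout('samtools view %s' % bam_path):
        return line.split('\t')[10] != '*'
    return False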
Example 9
def fasta_from_sra(sra_path):
    shell('~/tools/sratoolkit*/fastq-dump --split-3 --gzip %s' % sra_path)
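With --split-3, fastq-dump writes the two mates of paired-end runs into separate _1/_2 files (plus a third file for any unpaired reads), and --gzip compresses the output. A hypothetical call:

# Produces SRR000001_1.fastq.gz and SRR000001_2.fastq.gz for paired data.
fasta_from_sra('SRR000001.sra')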
Example 10
def visualize_splicing(genes, fastq_prefix, out_prefix):
	genome_path = '/data/csb/organisms/homo_sapiens/hg19_flat'
	bed_path = '/data/csb/organisms/homo_sapiens/ensembl_68/exons.bed'
	genes = genes.replace(' ', '').split(',')
	min_anchor = 15
	read_len = 90
	trim = read_len - min_anchor
	
	chromosomes = read_flat_seq(genome_path)
	
	donors = []
	acceptors = []
	exons = []
	for line in zopen(bed_path):
		cols = line[:-1].split('\t')
		if cols[3] in genes:
			chr = cols[0] if cols[0].startswith('chr') else 'chr'+cols[0]
			chr_seq = chromosomes[chr]
			pos = (int(cols[1])+1, int(cols[2]))
			if cols[5] == '+':
				acceptors.append((chr, '+', pos[0],
					chr_seq[pos[0]-1:pos[0]-1+trim]))
				donors.append((chr, '+', pos[1], chr_seq[pos[1]-trim:pos[1]]))
			elif cols[5] == '-':
				acceptors.append((chr, '-', pos[1],
					revcomplement(chr_seq[pos[1]-trim:pos[1]])))
				donors.append((chr, '-', pos[0],
					revcomplement(chr_seq[pos[0]-1:pos[0]-1+trim])))
			exons.append(pos)
				
	# Remove duplicate acceptors and donors.
	acceptors = list(set(acceptors))
	donors = list(set(donors))
	exons = list(set(exons))
	
	# Calculate the contiguous genomic sequence
	chr = acceptors[0][0]
	if any(a[0] != chr for a in acceptors):
		error('Genes must be in the same chromosome!')
	
	genome_window = (min(a[2] for a in acceptors)-2000,
		max(a[2] for a in acceptors)+2000)
	#contig = chromosomes[chr][genome_window[0]:genome_window[1]]
	
	# Calculate junction sequences
	class Junction(object):
		def __init__(self, name, seq):
			self.name = name
			self.sequence = seq
			self.reads = 0
			self.ratio = 0
	
	junctions = defaultdict(list)   # Group junctions by donor
	for left in donors:
		for right in acceptors:
			name = '%d[%s]_%d[%s]' % (left[2], left[1], right[2], right[1])
			junctions[left].append(Junction(name, left[3] + right[3]))
	print('Generated %d junctions.' % (len(donors) * len(acceptors)))
	
	# Build Bowtie index
	index_fasta_path = '%s_ref.fa' % out_prefix
	index = open(index_fasta_path, 'w')
	#index.write('>contig\n%s\n' % contig)
	for donor in junctions:
		for junc in junctions[donor]:
			index.write('>%s\n%s\n' % (junc.name, junc.sequence))
	index.close()
	shell('/data/csb/tools/bowtie-0.12.9/bowtie-build -q %s %s_index' %
		(index_fasta_path, out_prefix))
	
	# Align reads against junctions and tally junction read counts.
	shell('bowtie -v1 -B1 -p8 %s_index <(gunzip -c %s_1.fq.gz %s_2.fq.gz) '
		'> %s.bowtie' % (out_prefix, fastq_prefix, fastq_prefix, out_prefix))
	junction_by_name = {}
	for donor in junctions:
		for j in junctions[donor]: junction_by_name[j.name] = j
	for line in open('%s.bowtie' % out_prefix):
		cols = line[:-1].split('\t')
		if '_' not in cols[2]: continue
		junction_by_name[cols[2]].reads += 1
	
	# Calculate junction power relative to all outgoing links from donor
	for donor in junctions:
		total = sum(j.reads for j in junctions[donor])
		if total <= 0: continue
		for j in junctions[donor]:
			j.ratio = float(j.reads) / total
			if j.reads > 0:
				print('%s: %.1f%% (%d)' % (j.name, j.ratio*100, j.reads))
		
	# Check which exons actually participate in the mature transcripts
	active_edges = []
	for donor in junctions:
		for j in junctions[donor]:
			if j.ratio < 0.05: continue
			active_edges += [int(x[:-3]) for x in j.name.split('_')]
	
	exons = [[ex[0], ex[1], False] for ex in exons]
	ties = []
	for edge in set(active_edges):
		matches = [ex for ex in exons if edge in ex]
		if len(matches) == 1: matches[0][2] = True  # Unique match, mark active
		if len(matches) > 1: ties.append(matches)
	for tie in ties:
		if not any(ex[2] for ex in tie):
			for ex in tie: ex[2] = True   # If still tied, mark all tied active
	
	# Print exon map
	from svgfig import Rect, Frame, Poly
	rects = [Rect(ex[0], 1, ex[1], 2, stroke='none',
		fill='whitesmoke', stroke_linejoin='miter')
		for ex in exons if not ex[2]]
	rects += [Rect(ex[0], 1, ex[1], 2, stroke='none',
		fill='black', stroke_linejoin='miter') for ex in exons if ex[2]]
	lines = []
	for donor in junctions:
		for j in junctions[donor]:
			start, end = [int(x[:-3]) for x in j.name.split('_')]
			lines.append(Poly([(start,2), ((start+end)/2,3), (end,2)],
				stroke_opacity=j.ratio))
		
	Frame(genome_window[0], genome_window[1], 0, 10, *(rects+lines),
		width=500).SVG().save('%s.svg' % out_prefix)
	
	shell('rm %s_index.* %s_ref.fa' % (out_prefix, out_prefix))
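revcomplement() and read_flat_seq() come from the surrounding codebase; the former is simple enough to sketch (assuming upper-case DNA):

def revcomplement(seq):
    # Reverse complement of a DNA sequence; unknown bases map to N.
    complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}
    return ''.join(complement.get(base, 'N') for base in reversed(seq))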
Example 11
def detect_specific(bam_path, donors_path, acceptors_path, genome_path,
                    out_prefix, all_reads):

    read_len = sam.read_length(bam_path)
    info('Using read length %d bp...' % read_len)

    flank_len = read_len - 10
    chromosomes = read_fasta(genome_path)

    donor_exons = regions_from_bed(donors_path)
    donors = []
    for ex in donor_exons:
        chr = ex[0] if ex[0].startswith('chr') else 'chr' + ex[0]
        chr_seq = chromosomes[chr]
        if ex[1] == '+':
            donors.append((chr, '+', ex[3], chr_seq[ex[3] - flank_len:ex[3]]))
        elif ex[1] == '-':
            donors.append(
                (chr, '-', ex[2],
                 revcomplement(chr_seq[ex[2] - 1:ex[2] - 1 + flank_len])))

    acceptor_exons = regions_from_bed(acceptors_path)
    acceptors = []
    for ex in acceptor_exons:
        chr = ex[0] if ex[0].startswith('chr') else 'chr' + ex[0]
        chr_seq = chromosomes[chr]
        if ex[1] == '+':
            acceptors.append(
                (chr, '+', ex[2], chr_seq[ex[2] - 1:ex[2] - 1 + flank_len]))
        elif ex[1] == '-':
            acceptors.append((chr, '-', ex[3],
                              revcomplement(chr_seq[ex[3] - flank_len:ex[3]])))

    del chromosomes  # Release 3 GB of memory
    gc.collect()

    # Remove duplicate acceptors and donors.
    acceptors = list(set(acceptors))
    donors = list(set(donors))

    # Calculate junction sequences
    junctions = {}
    for left in donors:
        for right in acceptors:
            name = '%s:%s:%d_%s:%s:%d' % (left[:3] + right[:3])
            junctions[name] = Object(sequence=left[3] + right[3], reads=[])
    info('Generated %d junctions.' % len(junctions))

    # Build Bowtie index
    info('Constructing junction FASTA file...')
    index_fasta_path = out_prefix + '_ref.fa'
    index = open(index_fasta_path, 'w')
    for name, junction in junctions.items():
        index.write('>%s\n%s\n' % (name, junction.sequence))
    index.close()
    info('Constructing Bowtie index...')
    shell('bowtie-build -q %s %s_index' % (index_fasta_path, out_prefix))

    # Align reads against junctions and tally junction read counts.
    if all_reads:
        info('Aligning all reads against index...')
        reads_command = 'sam reads %s' % bam_path
    else:
        info('Aligning unaligned reads against index...')
        reads_command = 'sam unaligned reads %s' % bam_path

    for line in shell_stdout('bowtie -f -v1 -B1 %s_index <(%s)' %
                             (out_prefix, reads_command)):
        cols = line.rstrip().split('\t')
        junctions[cols[2]].reads.append(cols[4])

    shell('rm %s_index.* %s_ref.fa' % (out_prefix, out_prefix))

    out_file = open(out_prefix + '.tsv', 'w')
    out_file.write('5\' breakpoint\t3\' breakpoint\tNum reads\tSequences\n')
    for name, j in junctions.items():
        if not j.reads: continue
        flanks = name.split('_')
        out_file.write('%s\t%s\t%d\t' % (flanks[0], flanks[1], len(j.reads)))
        #out_file.write(';'.join(j.reads))
        out_file.write('\n')
    out_file.close()
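Object(...) above is presumably a simple attribute bag; a minimal sketch:

class Object(object):
    # Object(sequence='ACGT', reads=[]) yields an instance with .sequence
    # and .reads attributes.
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)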
Example 12
def detect_rearrangements(sam_path,
                          genome_path,
                          out_prefix,
                          anchor_len,
                          min_mapq,
                          orientation,
                          max_frag_len,
                          discard_duplicates='both-ends'):

    if not os.path.exists(sam_path):
        error('File %s does not exist.' % sam_path)

    if discard_duplicates not in ('no', 'both-ends', 'one-end'):
        error('Invalid duplicate discard method: %s' % discard_duplicates)

    detect_discordant_pairs(sam_path,
                            out_prefix,
                            max_frag_len=max_frag_len,
                            min_mapq=min_mapq,
                            orientation=orientation)

    # Execute split read analysis if the user has specified an anchor length.
    if anchor_len > 0:
        detect_discordant_reads(sam_path, genome_path, out_prefix, anchor_len)

    info('Sorting discordant pairs by chromosomal position...')
    sort_inputs = '<(gunzip -c %s.discordant_pairs.tsv.gz)' % out_prefix
    if anchor_len > 0:
        sort_inputs += ' <(gunzip -c %s.discordant_reads.tsv.gz)' % out_prefix

    sort_tmp_dir = os.path.dirname(out_prefix)
    if not sort_tmp_dir: sort_tmp_dir = './'

    shell('sort -k1,1 -k3,3n -T %s %s | gzip -c > %s.sorted_pairs.tsv.gz' %
          (sort_tmp_dir, sort_inputs, out_prefix))

    def report_rearrangement(out, r):
        if discard_duplicates == 'both-ends':
            discard_duplicates_both_ends(r)
        elif discard_duplicates == 'one-end':
            discard_duplicates_one_end(r)
        if len(r.reads) < 2: return 0
        out.write('%s\t%s\t%d\t\t\t%s\t%s\t%d\t\t\t%d\t%d\t%s\n' %
                  (r.chr, r.strand, r.pos, r.mchr, r.mstrand, r.mpos,
                   sum(read[2] is None for read in r.reads),
                   sum(read[2] is not None for read in r.reads),
                   ';'.join(read[2] for read in r.reads
                            if read[2] is not None)))
        return 1

    info('Identifying rearrangements based on clusters of discordant reads...')

    out = open('%s.sv' % out_prefix, 'w')
    out.write(sv_file_header + '\n')

    N = 0
    rearrangements = []
    for line in zopen('%s.sorted_pairs.tsv.gz' % out_prefix):
        al = line[:-1].split('\t')

        chr = al[0]
        strand = al[1]
        pos = int(al[2])
        mchr = al[3]
        mstrand = al[4]
        mpos = int(al[5])
        seq = None if al[6] == '-' else al[6]

        # Rearrangements that are too far need not be considered in the future
        reachable = []
        for r in rearrangements:
            if pos - r.pos > max_frag_len:
                N += report_rearrangement(out, r)
            else:
                reachable.append(r)
        rearrangements = reachable

        # Check if we already have a rearrangement that matches the new pair.
        # We don't check the distance for the first mate because we already
        # know from above the rearrangements near it.
        matches = [
            r for r in rearrangements
            if abs(mpos - r.mpos) <= max_frag_len and chr == r.chr
            and mchr == r.mchr and strand == r.strand and mstrand == r.mstrand
        ]

        read = (pos, mpos, seq)
        if matches:
            for match in matches:
                match.reads.append(read)

        else:
            # No suitable rearrangements, create a new one.
            rearrangements.append(
                Rearrangement(chr, strand, pos, mchr, mstrand, mpos, read))

    for r in rearrangements:
        N += report_rearrangement(out, r)

    info('Found %d rearrangements with at least 2 reads of evidence.' % N)
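Rearrangement objects only need the fields that the clustering loop and report_rearrangement() touch; a plausible sketch:

class Rearrangement(object):
    # One cluster of discordant evidence: an anchor locus, its mate locus,
    # and the supporting reads as (pos, mpos, seq) tuples.
    def __init__(self, chr, strand, pos, mchr, mstrand, mpos, read):
        self.chr, self.strand, self.pos = chr, strand, pos
        self.mchr, self.mstrand, self.mpos = mchr, mstrand, mpos
        self.reads = [read]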