Beispiel #1
0
def read_sam(sam_path, mode='', min_quality=0):
    """Stream alignments from ``samtools view`` as lists of tab-split fields.

    Every character present in ``mode`` contributes SAM flag bits that must
    be set (passed with ``-f``) and/or must be clear (passed with ``-F``):

        a  aligned                    A  both mates aligned
        C  concordant read pairs      u  unaligned
        1  first mates                2  second mates
        +  plus strand only           -  minus strand only
        D  keep flagged duplicates (they are excluded unless 'D' is given)

    Secondary and supplementary alignments are always excluded. A positive
    ``min_quality`` is forwarded to samtools as ``-q``.
    """
    # (mode character, bits that must be set, bits that must be clear)
    selectors = (
        ('a', 0x0, 0x4),
        ('A', 0x1, 0xc),
        ('C', 0x3, 0xc),
        ('u', 0x4, 0x0),
        ('1', 0x40, 0x0),
        ('2', 0x80, 0x0),
        ('+', 0x0, 0x10),
        ('-', 0x10, 0x0),
    )

    required = 0x0
    excluded = 0x900  # Secondary and supplementary alignments
    for symbol, set_bits, clear_bits in selectors:
        if symbol in mode:
            required |= set_bits
            excluded |= clear_bits
    if 'D' not in mode:
        excluded |= 0x400  # Flagged duplicates

    view_options = '-f 0x%x -F 0x%x ' % (required, excluded)
    if min_quality > 0:
        view_options += '-q%d ' % min_quality

    command = 'samtools view %s %s' % (view_options, sam_path)
    for record in shell_stdout(command):
        yield record.split('\t')
Beispiel #2
0
def read_sam(sam_path, mode="", min_quality=0):
    """Yield each ``samtools view`` output line as a list of fields.

    The lines produced by ``shell_stdout`` are decoded as UTF-8 and split on
    tabs. ``mode`` is scanned for selector characters, each of which adds
    SAM flag bits to the ``-f`` (must be set) or ``-F`` (must be clear)
    filter: ``a`` aligned, ``A`` both mates aligned, ``C`` concordant pairs,
    ``u`` unaligned, ``1``/``2`` first/second mates, ``+``/``-`` strand.
    Flagged duplicates are filtered out unless ``D`` is present, and
    secondary and supplementary alignments are always filtered out.
    ``min_quality`` is passed on as ``-q`` when it is greater than zero.
    """
    # Bits that must be set for each selector character.
    must_be_set = {
        "A": 0x1,
        "C": 0x3,
        "u": 0x4,
        "1": 0x40,
        "2": 0x80,
        "-": 0x10,
    }
    # Bits that must be clear for each selector character.
    must_be_clear = {
        "a": 0x4,
        "A": 0xC,
        "C": 0xC,
        "+": 0x10,
    }

    include_mask = 0x0
    for selector, bits in must_be_set.items():
        if selector in mode:
            include_mask |= bits

    exclude_mask = 0x900  # Secondary and supplementary alignments
    for selector, bits in must_be_clear.items():
        if selector in mode:
            exclude_mask |= bits
    if "D" not in mode:
        exclude_mask |= 0x400  # Flagged duplicates

    pieces = ["-f 0x%x -F 0x%x " % (include_mask, exclude_mask)]
    if min_quality > 0:
        pieces.append("-q%d " % min_quality)
    view_options = "".join(pieces)

    for raw in shell_stdout("samtools view %s %s" % (view_options, sam_path)):
        yield raw.decode("utf8").split("\t")
Beispiel #3
0
def coverage_cds(bam_path, gtf_path):
	"""Print a histogram of per-base coverage over annotated coding positions.

	A per-chromosome mask of coding positions is built from the ``CDS``
	records of the GTF file at ``gtf_path``. The per-base depth reported by
	``bedtools genomecov -d -split`` for ``bam_path`` is then tallied for
	every position that falls inside the mask. The histogram has 200 bins;
	the last bin collects every depth of 199 or more.

	Chromosome sizes come from ``ref_sequence_sizes(bam_path)``. GTF records
	on chromosomes whose name is longer than five characters, or which are
	absent from the BAM header, are ignored.
	"""
	chr_sizes = ref_sequence_sizes(bam_path)

	info('Constructing a map of coding regions...')
	# One byte per base (0 = non-coding, 1 = coding). A zero-filled bytearray
	# is far more compact than a list of bools for chromosome-sized masks.
	coding = {}
	for name, size in chr_sizes.items():
		coding[name] = bytearray(size)
	for line in zopen(gtf_path):
		if line.startswith('#'): continue
		cols = line.split('\t')
		if cols[2] != 'CDS': continue
		if len(cols[0]) > 5: continue   # Ignore chromosomes other than chrXX
		mask = coding.get(cols[0])
		if mask is None: continue
		# GTF coordinates are used as 1-based and inclusive. Clamp the range
		# to the chromosome so the slice assignment can never resize the mask.
		start = max(int(cols[3]) - 1, 0)
		end = min(int(cols[4]), len(mask))
		if start < end:
			mask[start:end] = b'\x01' * (end - start)

	info('Calculating a coverage histogram...')
	coverage_hist = [0] * 200
	last_bin = len(coverage_hist) - 1
	current_chr = ''
	pos = 0
	for line in shell_stdout('bedtools genomecov -d -split -ibam %s' % bam_path):
		cols = line.split('\t')
		if cols[0] != current_chr:
			current_chr = cols[0]
			cds = coding[current_chr]
			# The increment below turns this into the 0-based index of the
			# first reported position.
			pos = int(cols[1]) - 2
			info('%s...' % current_chr)
		# Positions of one chromosome are expected on consecutive lines.
		pos += 1
		if cds[pos]:
			coverage_hist[min(int(cols[2]), last_bin)] += 1

	print('Coverage histogram:')
	print('===================')
	for cov, count in enumerate(coverage_hist):
		print('%d: %d' % (cov, count))
Beispiel #4
0
def ref_sequence_sizes(sam_path):
    """Return a dict mapping reference sequence names to their lengths.

    Names and lengths are parsed from the header printed by
    ``samtools view -H``. A header line is recognized only when it starts
    with ``@SQ`` immediately followed by the ``SN`` and ``LN`` fields, and
    the sequence name consists solely of word characters; other lines are
    skipped. Lines from ``shell_stdout`` are decoded as UTF-8 first.
    """
    # Raw string: "\w" and "\d" are invalid escapes in a normal string
    # literal. Compiled once instead of on every header line.
    sq_line = re.compile(r"@SQ\tSN:(\w+)\tLN:(\d+)")
    chr_sizes = {}
    for line in shell_stdout("samtools view -H %s" % sam_path):
        m = sq_line.match(line.decode("utf8"))
        if m:
            chr_sizes[m.group(1)] = int(m.group(2))
    return chr_sizes
Beispiel #5
0
def ref_sequence_sizes(sam_path):
    """Return a dict mapping reference sequence names to their lengths.

    Names and lengths are parsed from the header printed by
    ``samtools view -H``. A header line is recognized only when it starts
    with ``@SQ`` immediately followed by the ``SN`` and ``LN`` fields, and
    the sequence name consists solely of word characters; other lines are
    skipped.
    """
    # Raw string: '\w' and '\d' are invalid escapes in a normal string
    # literal. Compiled once instead of on every header line.
    sq_line = re.compile(r'@SQ\tSN:(\w+)\tLN:(\d+)')
    chr_sizes = {}
    for line in shell_stdout('samtools view -H %s' % sam_path):
        m = sq_line.match(line)
        if m:
            chr_sizes[m.group(1)] = int(m.group(2))
    return chr_sizes
Beispiel #6
0
def sam_reads_raw(bam_path, out_prefix):
    """Write the read sequences of a BAM file into three gzipped files.

    The output of ``samtools bam2fq`` is consumed four lines at a time
    (name, sequence, and two lines that are skipped). Sequences of reads
    whose name ends in ``/1`` or ``/2`` are held back until the mate with
    the same name shows up; the pair is then written on matching lines of
    ``<out_prefix>_1.reads.gz`` and ``<out_prefix>_2.reads.gz``. Reads
    without a mate suffix, and mates that stay unpaired (orphans), go to
    ``<out_prefix>.reads.gz``.
    """
    out_1 = zopen('%s_1.reads.gz' % out_prefix, 'w')
    out_2 = zopen('%s_2.reads.gz' % out_prefix, 'w')
    out = zopen('%s.reads.gz' % out_prefix, 'w')

    try:
        # Sequences waiting for their mate, keyed by name without suffix.
        reads_1 = {}
        reads_2 = {}

        # The "samtools bam2fq" command does not output supplementary or
        # secondary alignments. Each read only has one primary alignment.
        options = '-n' if has_mate_suffixes(bam_path) else ''
        bam2fq = shell_stdout('samtools bam2fq %s %s' % (options, bam_path))
        for line in bam2fq:
            if line[0] != '@': error('Invalid bam2fq output.')
            line = line[:-1]
            if line.endswith('/1'):
                segname = line[1:-2]
                # Compare against None so that a mate with an empty sequence
                # is still paired instead of being silently dropped.
                mate = reads_2.pop(segname, None)
                if mate is not None:
                    out_1.write(next(bam2fq))
                    out_2.write('%s\n' % mate)
                else:
                    reads_1[segname] = next(bam2fq)[:-1]
            elif line.endswith('/2'):
                segname = line[1:-2]
                mate = reads_1.pop(segname, None)
                if mate is not None:
                    out_1.write('%s\n' % mate)
                    out_2.write(next(bam2fq))
                else:
                    reads_2[segname] = next(bam2fq)[:-1]
            else:
                out.write('%s\n' % next(bam2fq)[:-1])

            # Skip per-base qualities. They can start with '@'.
            next(bam2fq)
            next(bam2fq)

        # list(d)[:5] and d.values() work on both Python 2 and Python 3,
        # unlike d.keys()[:5] and d.itervalues().
        info('Found %d orphan first mates.' % len(reads_1))
        for read_id in list(reads_1)[:5]:
            info('- Example: %s' % read_id)

        info('Found %d orphan second mates.' % len(reads_2))
        for read_id in list(reads_2)[:5]:
            info('- Example: %s' % read_id)

        for read in reads_1.values():
            out.write('%s\n' % read)

        for read in reads_2.values():
            out.write('%s\n' % read)
    finally:
        # Close the outputs even if parsing fails part-way through.
        out_1.close()
        out_2.close()
        out.close()
Beispiel #7
0
def sam_statistics(bam_paths):
    """Print a tab-separated table of alignment statistics, one row per BAM.

    For every path in ``bam_paths`` the output of ``samtools flagstat`` is
    scanned for the total, mapped, properly paired, "with itself and mate
    mapped" and duplicate counts, and ``samtools view -c <bam> chrM`` gives
    the number of reads aligned to mitochondrial DNA. Each flagstat count is
    the sum of the two numbers reported on its line. A statistic that is not
    found in the output is printed as -1. Percentages are relative to the
    total read count and are reported as 0.0 when the total is zero.
    """
    print(
        'SAMPLE\tTOTAL READS\tALIGNED READS\tALIGNED READS WITH ALIGNED MATE\tALIGNED READS WITH CONCORDANT MATE\tMITOCHONDRIAL\tDUPLICATES'
    )

    # Compiled once; every pattern captures the two numbers of a flagstat
    # line ("<n> + <m> <label>").
    patterns = {
        'total': re.compile(r'(\d+) \+ (\d+) in total'),
        'duplicates': re.compile(r'(\d+) \+ (\d+) duplicates'),
        'aligned': re.compile(r'(\d+) \+ (\d+) mapped'),
        'concordant_mate': re.compile(r'(\d+) \+ (\d+) properly paired'),
        'aligned_mate': re.compile(
            r'(\d+) \+ (\d+) with itself and mate mapped'),
    }

    def percent(count, total):
        # An empty BAM file has a total of zero; avoid ZeroDivisionError.
        return float(count) / total * 100 if total else 0.0

    for bam_path in bam_paths:
        counts = dict.fromkeys(patterns, -1)
        for line in shell_stdout('samtools flagstat %s' % bam_path):
            for key, pattern in patterns.items():
                m = pattern.search(line)
                if m:
                    counts[key] = int(m.group(1)) + int(m.group(2))

        # Count the number of reads aligned to mitochondrial DNA
        mitochondrial = -1
        for line in shell_stdout('samtools view -c %s chrM' % bam_path):
            mitochondrial = int(line)
            break

        total = counts['total']
        print(
            '%s\t%d\t%d (%.1f%%)\t%d (%.1f%%)\t%d (%.1f%%)\t%d (%.1f%%)\t%d' %
            (re.sub(r'\.bam$', '', bam_path), total,
             counts['aligned'], percent(counts['aligned'], total),
             counts['aligned_mate'], percent(counts['aligned_mate'], total),
             counts['concordant_mate'],
             percent(counts['concordant_mate'], total),
             mitochondrial, percent(mitochondrial, total),
             counts['duplicates']))
Beispiel #8
0
def sam_merge_counts(bed_path, count_paths):
    """Print a BED file and per-sample count files side by side.

    A header line is printed first: ``CHROMOSOME``, ``START``, ``END``,
    ``FEATURE`` when the first BED line has at least four columns, empty
    names for any further BED columns, and then one column per count file,
    named after its path with ``.tsv`` removed. The body is the output of
    ``paste <bed_path> <count_paths...>``, copied to stdout unchanged.
    """
    samples = [p.replace('.tsv', '') for p in count_paths]

    # Only the first line is needed, to learn how many columns the BED file
    # has; the context manager makes sure the handle is closed again.
    with open(bed_path) as bed_file:
        cols = next(bed_file).rstrip('\n').split('\t')

    header = ['CHROMOSOME', 'START', 'END']
    if len(cols) >= 4: header.append('FEATURE')
    # Pad with unnamed columns up to the width of the BED file.
    header += [''] * (len(cols) - len(header))
    header += samples
    print('\t'.join(header))

    for line in shell_stdout('paste %s %s' %
                             (bed_path, ' '.join(count_paths))):
        sys.stdout.write(line)
Beispiel #9
0
def simple_pileup(bam_paths, genome_path, kgenomes_path, min_mapq=10, min_alt_alleles=3,
	region=None):
	"""Run ``samtools mpileup`` piped into the ``spileup`` helper.

	Returns whatever ``shell_stdout`` returns for the assembled command.
	``kgenomes_path`` is passed to mpileup with ``-l`` and ``genome_path``
	with ``-f``. If ``region`` is given it is passed with ``-l`` when it
	ends in ``.bed`` and with ``-r`` otherwise. ``min_alt_alleles`` and
	``min_mapq`` are handed to ``spileup``, which is looked up in the
	``compiled`` directory next to this file.
	"""
	region_option = ''
	if region:
		selector = '-l' if region.endswith('.bed') else '-r'
		region_option = '%s %s' % (selector, region)

	script_dir = os.path.dirname(os.path.realpath(__file__))
	helper_dir = script_dir + '/compiled'
	bam_list = ' '.join(bam_paths)

	# samtools mpileup will automatically ignore alignments flagged as
	# duplicates
	cmd = 'samtools mpileup -d 100000 -A -x -R -sB %s -q0 -l %s -f %s %s | %s/spileup %d %d' % (
		region_option, kgenomes_path, genome_path, bam_list, helper_dir,
		min_alt_alleles, min_mapq)
	return shell_stdout(cmd)
Beispiel #10
0
	def lftp_mirror(rule, dry_run=False):
		"""Mirror ``rule.src_dir`` to ``rule.dst_dir`` on ``rule.dst_host`` with lftp.

		A temporary lftp script named ``.lftp_script`` is written to the
		current directory, executed, and removed again. With ``dry_run``
		the mirror is run with ``--dry-run`` and the planned actions are
		printed as ``ADD``, ``UPDATE`` and ``DELETE`` lines; ``chmod`` and
		``mkdir`` lines are dropped and anything unrecognized is echoed
		unchanged. Otherwise the mirror is executed verbosely.
		"""
		script_path = '.lftp_script'
		userpass = rule.username + ':' + rule.password + '@'
		host = rule.dst_host

		def remote_path(dst):
			# Drop the "user:password@" and host prefixes from a remote URL.
			if dst.startswith(userpass): dst = dst[len(userpass):]
			if dst.startswith(host): dst = dst[len(host):]
			return dst

		add_rx = re.compile('get -O sftp://(.+) file:/.+/(.+)')
		update_rx = re.compile('get -e -O sftp://(.+) file:/.+/(.+)')
		delete_rx = re.compile('rm .*sftp://(.+)')

		try:
			with open(script_path, 'w') as cmds:
				cmds.write('open -u %s,%s sftp://%s\n' % (
					rule.username, rule.password, rule.dst_host))
				cmds.write('mirror -P3 -Rae %s %s %s\n' % (
					'--dry-run' if dry_run else '-v', rule.src_dir, rule.dst_dir))

			if not dry_run:
				shell('lftp -f %s' % script_path)
			else:
				for line in shell_stdout('lftp -f %s' % script_path):
					if line.startswith('chmod'): continue
					if line.startswith('mkdir'): continue

					m = add_rx.match(line)
					if m:
						print('ADD %s/%s' % (remote_path(m.group(1)), m.group(2)))
						continue

					m = update_rx.match(line)
					if m:
						print('UPDATE %s/%s' % (remote_path(m.group(1)), m.group(2)))
						continue

					m = delete_rx.match(line)
					if m:
						print('DELETE %s' % remote_path(m.group(1)))
						continue

					sys.stdout.write(line)
		finally:
			# The script contains the password in clear text, so it must not
			# be left behind when lftp fails.
			if os.path.exists(script_path):
				os.remove(script_path)
Beispiel #11
0
def simple_pileup(bam_paths,
                  genome_path,
                  min_mapq=10,
                  min_alt_alleles=3,
                  region=None):
    """Run ``samtools mpileup`` piped into the ``spileup`` helper.

    Returns whatever ``shell_stdout`` returns for the assembled command.
    ``genome_path`` is passed to mpileup with ``-f``. If ``region`` is given
    it is passed with ``-l`` when it ends in ``.bed`` and with ``-r``
    otherwise. ``min_alt_alleles`` and ``min_mapq`` are handed to
    ``spileup``, which is looked up in the ``compiled`` directory next to
    this file.
    """
    region_option = ''
    if region:
        selector = '-l' if region.endswith('.bed') else '-r'
        region_option = '%s %s' % (selector, region)

    script_dir = os.path.dirname(os.path.realpath(__file__))
    helper_dir = script_dir + '/compiled'
    bam_list = ' '.join(bam_paths)

    # samtools mpileup will automatically ignore alignments flagged as
    # duplicates
    cmd = 'samtools mpileup -d 1000000 -A -x -R -sB %s -q0 -f %s %s | %s/spileup %d %d' % (
        region_option, genome_path, bam_list, helper_dir, min_alt_alleles,
        min_mapq)
    return shell_stdout(cmd)
Beispiel #12
0
def sam_unaligned_reads(bam_path):
    """Output the unaligned reads of a BAM file.

    Reads with flag 0x4 set (and 0x900 clear) are selected with
    ``samtools view`` and converted with ``samtools bam2fq``. When
    ``has_base_qualities(bam_path)`` is true the pipeline is simply run
    through ``shell``. Otherwise its output is consumed four lines at a
    time and rewritten to stdout as a ``>``-prefixed name line followed by
    the sequence line; the remaining two lines of each record are dropped.
    """
    # The "samtools bam2fq" command does not output supplementary or
    # secondary alignments. Each read has max 1 primary alignment.
    name_option = '-n' if has_mate_suffixes(bam_path) else ''
    pipeline = ('samtools view -u -f 0x4 -F 0x900 %s | samtools bam2fq %s -' %
                (bam_path, name_option))

    if has_base_qualities(bam_path):
        shell(pipeline)
        return

    records = shell_stdout(pipeline)
    for header in records:
        if header[0] != '@':
            error('Invalid bam2fq output.')
        sys.stdout.write('>')
        sys.stdout.write(header[1:])
        sequence = next(records)
        sys.stdout.write(sequence)

        # Skip per-base qualities. They can start with '@'.
        for _ in range(2):
            next(records)
Beispiel #13
0
def detect_discordant_pairs(sam_path, out_prefix, max_frag_len, min_mapq,
                            orientation):
    """Find discordant mate pairs and write them to a gzipped TSV file.

    Runs ``sam discordant pairs`` on ``sam_path``, sorts its output by the
    first column, and treats a line whose read name equals the name of the
    remembered previous line as that read's mate. Every such pair is written
    to ``<out_prefix>.discordant_pairs.tsv.gz`` as one line with the columns
    ``chr_1, strand_1, pos_1, chr_2, strand_2, pos_2, -``.

    Args:
        sam_path: Alignment file passed to ``sam discordant pairs``.
        out_prefix: Prefix of the output file. Its directory (or ``./`` when
            the prefix has none) is used as temporary directory by ``sort``.
        max_frag_len: Passed through to ``sam discordant pairs``.
        min_mapq: Passed through as ``--min-mapq``.
        orientation: Read pair orientation; one of ``'fr'``, ``'rf'`` or
            ``'ff'``. Any other value is reported through ``error()``.
    """

    out = zopen(out_prefix + '.discordant_pairs.tsv.gz', 'w')
    N = 0  # Number of pairs written

    sort_tmp_dir = os.path.dirname(out_prefix)
    if not sort_tmp_dir: sort_tmp_dir = './'

    # Go through all the first mates and look for discordant pairs.
    info('Searching for discordant read pairs...')
    # Fields of the most recent line that started a new read name.
    prev = ['']
    for line in shell_stdout(
            'sam discordant pairs --min-mapq=%d %s %d | sort -k1,1 -T %s' %
        (min_mapq, sam_path, max_frag_len, sort_tmp_dir)):

        al = line.split('\t')
        # NOTE(review): column index 9 is read below for both mates, which
        # needs at least 10 columns; a line with exactly 9 columns passes
        # this check -- confirm whether the threshold should be 10.
        if len(al) < 9: continue

        # Discard spliced and clipped reads.
        # FIXME: Add support for spliced RNA-seq reads.
        if 'N' in al[5] or 'S' in al[5]: continue

        if al[0].endswith('/1') or al[0].endswith('/2'):
            al[0] = al[0][:-2]  # Remove /1 or /2 suffix

        # A new read name: remember this line and wait for its mate.
        if al[0] != prev[0]:
            prev = al
            continue

        # Strand of this read comes from bit 0x10 of its own flags; the
        # strand of the mate from bit 0x20 of the same flags. Chromosome,
        # position and read length of the mate come from the remembered line.
        flags = int(al[1])
        chr = al[2]
        mchr = prev[2]
        strand = '-' if flags & 0x10 else '+'
        mstrand = '-' if flags & 0x20 else '+'
        pos = int(al[3])
        mpos = int(prev[3])
        rlen = len(al[9])
        mrlen = len(prev[9])

        if not chr.startswith('chr'): chr = 'chr' + chr
        if not mchr.startswith('chr'): mchr = 'chr' + mchr

        if chr == 'chrM' or mchr == 'chrM': continue  # Discard mitochondrial

        if orientation == 'fr':
            # Reorient pairs so that the first mate is always upstream.
            if chr > mchr or (chr == mchr and pos > mpos):
                chr, mchr = mchr, chr
                pos, mpos = mpos, pos
                rlen, mrlen = mrlen, rlen
                strand, mstrand = mstrand, strand

            # Convert to forward-forward orientation (flip second mate).
            mstrand = '-' if mstrand == '+' else '+'

        elif orientation == 'rf':
            # Reorient pairs so that the first mate is always upstream.
            if chr > mchr or (chr == mchr and pos > mpos):
                chr, mchr = mchr, chr
                pos, mpos = mpos, pos
                rlen, mrlen = mrlen, rlen
                strand, mstrand = mstrand, strand

            # Convert to forward-forward orientation (flip first mate).
            strand = '-' if strand == '+' else '+'

        elif orientation == 'ff':
            # Reorient pairs so that the first mate is always upstream.
            # If mates are swapped, both mates must be reversed.
            if chr > mchr or (chr == mchr and pos > mpos):
                chr, mchr = mchr, chr
                pos, mpos = mpos, pos
                rlen, mrlen = mrlen, rlen
                strand, mstrand = '+' if mstrand == '-' else '-', \
                 '+' if strand == '-' else '-'

        else:
            error('Unsupported read orientation detected.')

        # Make positions represent read starts.
        if strand == '-': pos += rlen - 1
        if mstrand == '-': mpos += mrlen - 1

        # Each discordant mate pair is written as seven tab-separated
        # columns (chr_1, strand_1, pos_1, chr_2, strand_2, pos_2, '-').
        # The '-' in the last column signifies that this is a mate pair.
        # Positions are 1-based and represent read starts.
        out.write('%s\t%s\t%d\t%s\t%s\t%d\t-\n' %
                  (chr, strand, pos, mchr, mstrand, mpos))
        N += 1

    out.close()
    info('Found %d discordant mate pairs.' % N)
Beispiel #14
0
def detect_specific(bam_path, donors_path, acceptors_path, genome_path,
                    out_prefix, all_reads):
    """Count reads that align across junctions between donor and acceptor exons.

    A junction sequence is generated for every combination of a donor and
    an acceptor by concatenating their flanking genome sequences. The
    junctions are written to ``<out_prefix>_ref.fa`` and indexed with
    ``bowtie-build``, then reads from ``bam_path`` are aligned against them
    with ``bowtie``. Junctions with at least one aligned read are written
    to ``<out_prefix>.tsv``.

    Args:
        bam_path: BAM file that provides the read length and the reads.
        donors_path: BED file read with ``regions_from_bed`` (donor side).
        acceptors_path: BED file read with ``regions_from_bed`` (acceptor
            side).
        genome_path: Genome FASTA file read with ``read_fasta``.
        out_prefix: Prefix for the temporary index files and the output.
        all_reads: If true, ``sam reads`` supplies the reads for alignment;
            otherwise ``sam unaligned reads`` does.
    """

    read_len = sam.read_length(bam_path)
    info('Using read length %d bp...' % read_len)

    # Length of the sequence taken from each side of a junction.
    flank_len = read_len - 10
    chromosomes = read_fasta(genome_path)

    # NOTE(review): each region is indexed as ex[0] = chromosome,
    # ex[1] = strand, and ex[2] / ex[3] presumably 1-based start / end --
    # confirm against regions_from_bed().
    donor_exons = regions_from_bed(donors_path)
    donors = []
    for ex in donor_exons:
        chr = ex[0] if ex[0].startswith('chr') else 'chr' + ex[0]
        chr_seq = chromosomes[chr]
        if ex[1] == '+':
            # The flank_len bases ending at ex[3].
            donors.append((chr, '+', ex[3], chr_seq[ex[3] - flank_len:ex[3]]))
        elif ex[1] == '-':
            # The flank_len bases starting at ex[2], reverse-complemented.
            donors.append(
                (chr, '-', ex[2],
                 revcomplement(chr_seq[ex[2] - 1:ex[2] - 1 + flank_len])))

    # Acceptors mirror the donors: the opposite end of the region is used.
    acceptor_exons = regions_from_bed(acceptors_path)
    acceptors = []
    for ex in acceptor_exons:
        chr = ex[0] if ex[0].startswith('chr') else 'chr' + ex[0]
        chr_seq = chromosomes[chr]
        if ex[1] == '+':
            acceptors.append(
                (chr, '+', ex[2], chr_seq[ex[2] - 1:ex[2] - 1 + flank_len]))
        elif ex[1] == '-':
            acceptors.append((chr, '-', ex[3],
                              revcomplement(chr_seq[ex[3] - flank_len:ex[3]])))

    del chromosomes  # Release 3 GB of memory
    gc.collect()

    # Remove duplicate acceptors and donors.
    acceptors = list(set(acceptors))
    donors = list(set(donors))

    # Calculate junction sequences
    # Junctions are keyed by "<chr>:<strand>:<pos>_<chr>:<strand>:<pos>".
    junctions = {}
    for left in donors:
        for right in acceptors:
            name = '%s:%s:%d_%s:%s:%d' % (left[:3] + right[:3])
            junctions[name] = Object(sequence=left[3] + right[3], reads=[])
    info('Generated %d junctions.' % len(junctions))

    # Build Bowtie index
    info('Constructing junction FASTA file...')
    index_fasta_path = out_prefix + '_ref.fa'
    index = open(index_fasta_path, 'w')
    # NOTE(review): dict.iteritems() exists only on Python 2.
    for name, junction in junctions.iteritems():
        index.write('>%s\n%s\n' % (name, junction.sequence))
    index.close()
    info('Constructing Bowtie index...')
    shell('bowtie-build -q %s %s_index' % (index_fasta_path, out_prefix))

    # Align reads against junctions and tally junction read counts.
    if all_reads:
        info('Aligning all reads against index...')
        reads_command = 'sam reads %s' % bam_path
    else:
        info('Aligning unaligned reads against index...')
        reads_command = 'sam unaligned reads %s' % bam_path

    # NOTE(review): "<(...)" is process substitution; presumably
    # shell_stdout runs the command in a shell that supports it -- verify.
    for line in shell_stdout('bowtie -f -v1 -B1 %s_index <(%s)' %
                             (out_prefix, reads_command)):
        cols = line.rstrip().split('\t')
        # Column 2 is used as the junction name, column 4 is stored as the
        # read.
        junctions[cols[2]].reads.append(cols[4])

    # Remove the temporary index and junction FASTA files.
    shell('rm %s_index.* %s_ref.fa' % (out_prefix, out_prefix))

    out_file = open(out_prefix + '.tsv', 'w')
    out_file.write('5\' breakpoint\t3\' breakpoint\tNum reads\tSequences\n')
    for name, j in junctions.iteritems():
        if not j.reads: continue
        flanks = name.split('_')
        # The "Sequences" column is currently left empty (see the disabled
        # line below).
        out_file.write('%s\t%s\t%d\t' % (flanks[0], flanks[1], len(j.reads)))
        #out_file.write(';'.join(j.reads))
        out_file.write('\n')
    out_file.close()
Beispiel #15
0
def detect_discordant_reads(sam_path, genome_path, out_prefix, anchor_len):
    """Find unaligned reads whose two end anchors align discordantly.

    Unaligned reads of ``sam_path`` are extracted with ``samtools fasta``,
    split by ``fasta split interleaved`` using ``anchor_len``, and aligned
    with ``bowtie``. Two consecutive alignments with the same read name are
    treated as the two anchors of one read. For each accepted anchor pair
    the breakpoint position with the fewest mismatches against the
    reference is located and the pair is written to
    ``<out_prefix>.discordant_reads.tsv.gz`` with the columns
    ``chr_1, strand_1, pos_1, chr_2, strand_2, pos_2, sequence``.

    Args:
        sam_path: Alignment file providing the unaligned reads.
        genome_path: Passed to ``bowtie`` as the index and to
            ``read_flat_seq`` to load the chromosome sequences.
        out_prefix: Prefix of the output file.
        anchor_len: Anchor length in bases.
    """

    out = zopen(out_prefix + '.discordant_reads.tsv.gz', 'w')
    N = 0  # Number of anchor pairs written

    info('Splitting unaligned reads into %d bp anchors and aligning against '
         'the genome...' % anchor_len)

    # IMPORTANT: Only one thread can be used, otherwise alignment order is not
    # guaranteed and the loop below will fail.
    anchor_alignments = shell_stdout(
        'samtools fasta -f 0x4 %s | fasta split interleaved - %d | '
        'bowtie -f -p1 -v0 -m1 -B1 --suppress 5,6,7,8 %s -' %
        (sam_path, anchor_len, genome_path))

    # Make every chromosome name start with "chr".
    chromosomes = read_flat_seq(genome_path)
    for chr in list(chromosomes.keys()):
        if not chr.startswith('chr'):
            chromosomes['chr' + chr] = chromosomes.pop(chr)

    # Fields of the most recent alignment that started a new read name.
    prev = ['']
    for line in anchor_alignments:
        # Columns used below: 0 = read name, 1 = strand, 2 = chromosome,
        # 3 = position.
        al = line.split('\t')
        if al[0][-2] == '/': al[0] = al[0][:-2]

        # A new read name: remember it and wait for the second anchor.
        if al[0] != prev[0]:
            prev = al
            continue

        chr = prev[2]
        mchr = al[2]
        strand = prev[1]
        mstrand = al[1]
        pos = int(prev[3])
        mpos = int(al[3])
        # The read sequence is taken from the read name: everything after
        # the first underscore (presumably put there by "fasta split
        # interleaved" -- TODO confirm).
        seq = prev[0][prev[0].find('_') + 1:]
        full_len = len(seq)

        if not chr.startswith('chr'): chr = 'chr' + chr
        if not mchr.startswith('chr'): mchr = 'chr' + mchr

        # Ignore anchor pairs where the anchors are too close.
        if chr == mchr and abs(pos - mpos) < full_len - anchor_len + 10:
            continue

        # Ignore rearrangements involving mitochondrial DNA.
        if 'M' in chr or 'M' in mchr: continue

        # Reorient the pairs so the first anchor is always upstream.
        # If mates are swapped, both mates must be reverse-complemented.
        if chr > mchr or (chr == mchr and pos > mpos):
            chr, mchr = mchr, chr
            pos, mpos = mpos, pos
            strand, mstrand = '+' if mstrand == '-' else '-', \
             '+' if strand == '-' else '-'
            seq = revcomplement(seq)

        # Extract the flanking sequences from the chromosome sequences.
        # The range calculations are a bit complex. It's easier to understand
        # them if you first add one to all indices to convert to 1-based
        # genomic coordinates ("pos" and "mpos" are 1-based).
        if strand == '+':
            left_grch = chromosomes[chr][pos - 1:pos + full_len - 1]
        else:
            left_grch = revcomplement(
                chromosomes[chr][pos + anchor_len - full_len - 1:pos +
                                 anchor_len - 1])

        if mstrand == '+':
            right_grch = chromosomes[mchr][mpos + anchor_len - full_len -
                                           1:mpos + anchor_len - 1]
        else:
            right_grch = revcomplement(chromosomes[mchr][mpos - 1:mpos +
                                                         full_len - 1])

        # If the read is at the very edge of a chromosome, ignore it.
        if len(left_grch) < full_len or len(right_grch) < full_len:
            continue

        # Make sure that reference sequences are in uppercase
        left_grch = left_grch.upper()
        right_grch = right_grch.upper()

        #print('-------------------')
        #print([chr, strand, pos, mchr, mstrand, mpos])
        #print(seq)
        #print(left_grch)
        #print(right_grch)

        # Check that the read sequence is not too homologous on either side
        # of the breakpoint.
        # left_match: fraction of the last anchor_len read bases that equal
        # the left reference; right_match: fraction of the first anchor_len
        # read bases that equal the right reference.
        left_match = float(
            sum([
                seq[i] == left_grch[i]
                for i in range(full_len - anchor_len, full_len)
            ])) / anchor_len
        right_match = float(
            sum([seq[i] == right_grch[i]
                 for i in range(anchor_len)])) / anchor_len

        max_homology = 0.7
        if left_match >= max_homology or right_match >= max_homology: continue

        # Identify the breakpoint location that minimizes the number of
        # nucleotide mismatches between the read and the breakpoint flanks.
        potential_breakpoints = range(anchor_len, full_len - anchor_len + 1)
        mismatches = [0] * len(potential_breakpoints)
        for k, br in enumerate(potential_breakpoints):
            grch_chimera = left_grch[:br] + right_grch[br:]
            mismatches[k] = sum(
                [seq[i] != grch_chimera[i] for i in range(full_len)])

        # The best breakpoint placement cannot have more than N mismatches.
        # (The filter itself is currently disabled.)
        least_mismatches = min(mismatches)
        #if least_mismatches > 2: continue

        # "br" represent the number of nucleotides in the read
        # before the breakpoint, counting from the 5' end of the read.
        # If there is microhomology, we pick the first breakpoint.
        br = potential_breakpoints[mismatches.index(least_mismatches)]

        # Now that we know the exact fusion breakpoint, we mark mismatches
        # with a lower case nucleotide and augment the read
        # sequence with a | symbol to denote the junction.
        grch_chimera = left_grch[:br] + right_grch[br:]
        seq = ''.join([
            nuc if grch_chimera[k] == nuc else nuc.lower()
            for k, nuc in enumerate(seq)
        ])
        seq = seq[:br] + '|' + seq[br:]

        # Make positions represent read starts.
        if strand == '-': pos += anchor_len - 1
        if mstrand == '-': mpos += anchor_len - 1

        # Each discordant anchor pair is represented as a 7-tuple
        # (chr_1, strand_1, pos_1, chr_2, strand_2, pos_2, sequence).
        # Positions are 1-based and represent read starts.
        out.write('%s\t%s\t%d\t%s\t%s\t%d\t%s\n' %
                  (chr, strand, pos, mchr, mstrand, mpos, seq))
        N += 1

    info('Found %d discordant anchor pairs.' % N)
    out.close()
Beispiel #16
0
        info('Downloading %s...' % filename)
        shell('gtdownload -v -d %s -c ~/tools/genetorrent*/cghub_2016.key' %
              sample.analysis_data_uri)


if __name__ == '__main__':
    args = docopt.docopt(__doc__)
    predicates = [
        'disease_abbr=' + args['<cancer>'].upper(),
        'library_strategy=' + args['<library_type>']
    ]
    if args['--genome']:
        predicates.append('refassem_short_name=' + args['--genome'])

    output = shell_stdout('cgquery "%s"' % '&'.join(predicates))
    samples = cghub_parse(output)

    # Filter the samples if the user has provided a whitelist
    if args['--filename']:
        rx = args['--filename']
        samples = [s for s in samples if re.search(rx, s.files[0])]

    # Filter the samples if the user has provided a filename whitelist
    if args['--filename-in']:
        whitelist = [line.strip() for line in open(args['--filename-in'])]
        samples = [s for s in samples if s.files[0] in whitelist]

    # Filter the samples if the user has provided a filename blacklist
    if args['--filename-not-in']:
        blacklist = [line.strip() for line in open(args['--filename-not-in'])]