Esempio n. 1
0
def partition(samples_path, num_partitions):
	"""Split a list of sample names into batch files, one patient per batch.

	Sample names are read one per line from samples_path and sorted by the
	TCGA patient ID found in each name. They are then divided into
	num_partitions roughly equal batches. A batch is extended past its
	nominal end so that all samples of one patient land in the same batch.
	Batch k is written to batch_<k>.txt (1-based) in the current directory,
	one sample name per line.
	"""
	sample_file = zopen(samples_path)
	try:
		samples = [line.strip() for line in sample_file]
	finally:
		sample_file.close()

	# Integer arithmetic keeps the last end exactly at len(samples); the
	# float product can round down by one and silently drop a sample.
	partition_ends = [(p + 1) * len(samples) // num_partitions
		for p in range(num_partitions)]
	print(partition_ends)

	patient_ids = []
	num_without_pid = 0
	for s in samples:
		m = re.search('TCGA-..-....', s)
		if not m: num_without_pid += 1
		# Samples without a patient ID get a unique key that sorts last.
		patient_ids.append(m.group(0) if m else 'zzz' + s)

	if num_without_pid:
		info('WARNING: %d sample names did not contain a TCGA patient ID.' % num_without_pid)

	# Unpacking zip(*...) fails on an empty list, so split the pairs by hand.
	pairs = sorted(zip(samples, patient_ids), key=lambda x: x[1])
	samples = [pair[0] for pair in pairs]
	patient_ids = [pair[1] for pair in pairs]

	partitions = []
	first = 0
	for p in range(num_partitions):
		# A previous batch may already have grown past this nominal end.
		# Never step backwards, or samples would be emitted twice.
		end = max(partition_ends[p], first)
		while 0 < end < len(samples) and \
			patient_ids[end] == patient_ids[end - 1]:
			end += 1
		partitions.append(samples[first:end])
		first = end

	for idx, part in enumerate(partitions):
		with open('batch_%d.txt' % (idx + 1), 'w') as out:
			for s in part: out.write('%s\n' % s)
Esempio n. 2
0
def variant_discard_by_position(vcf_path, pos_path):
    """Print a VCF file without the variants at blacklisted positions.

    pos_path is a tab-delimited file with a chromosome name in the first
    column and a position in the second. Lines with fewer than two columns
    are ignored. A leading 'chr' is stripped from chromosome names on both
    sides before comparing. Meta lines starting with '##' are not printed;
    the first other line is printed as the header. Nothing is printed if
    the VCF has no such line.
    """
    def position_key(cols):
        chrom = cols[0][3:] if cols[0].startswith('chr') else cols[0]
        return chrom + ':' + cols[1]

    info('Reading list of blacklisted positions...')
    blacklist = set()
    pos_file = zopen(pos_path)
    try:
        for line in pos_file:
            cols = line.rstrip().split('\t')
            if len(cols) < 2: continue
            blacklist.add(position_key(cols))
    finally:
        pos_file.close()

    vcf_file = zopen(vcf_path)
    try:
        header = None
        for line in vcf_file:
            if not line.startswith('##'):
                header = line
                break
        if header is None: return  # Empty file or meta lines only

        sys.stdout.write(header)
        for line in vcf_file:
            cols = line.rstrip().split('\t')
            # Lines too short to carry a position cannot be blacklisted.
            if len(cols) >= 2 and position_key(cols) in blacklist:
                continue
            sys.stdout.write(line)
    finally:
        vcf_file.close()
Esempio n. 3
0
def discard_if_in_controls(vcf_path, control_samples, threshold):
    """Print a VCF without variants seen in too many control samples.

    control_samples is a list of regular expressions; a sample column is a
    control if any of them matches its header name. A variant is dropped
    when at least `threshold` control samples have a genotype whose index
    in gt_symbols is greater than 1. Sample columns start after the
    'ESP6500' column if present, otherwise after 'ALT'.
    """
    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('##'): break

    headers = line.rstrip().split('\t')
    sample_col = headers.index('ESP6500' if 'ESP6500' in
                               headers else 'ALT') + 1
    sample_names = headers[sample_col:]
    control = [
        any(re.search(rx, s) for rx in control_samples)
        for s in sample_names
    ]
    if not any(control): error('No control samples found.')

    info('Using these %d control samples:' % sum(control))
    for s, is_control in zip(sample_names, control):
        if is_control: info('- %s' % s)

    sys.stdout.write(line)
    for line in vcf_file:
        calls = line.rstrip('\n').split('\t')[sample_col:]
        # The genotype is the text before the first ':'. split() also copes
        # with a call that has no ':' at all, where find() would return -1
        # and the slice would silently drop the last character.
        genotypes = [gt_symbols.index(c.split(':', 1)[0]) for c in calls]
        num_controls_with_alt = sum(
            1 for is_control, gt in zip(control, genotypes)
            if is_control and gt > 1)
        if num_controls_with_alt >= threshold:
            continue
        sys.stdout.write(line)
Esempio n. 4
0
def fasta_remove_adapters(fasta_path, adapter):
    """Print a FASTA/FASTQ file with 3' adapter sequences trimmed off.

    A read is cut at the first place where the first five adapter bases
    occur; whatever follows is removed. For FASTQ records (header starting
    with '@') the quality line is cut to the same length. Lines starting
    with '#' are passed through unchanged. Records are assumed to have one
    sequence line (and one quality line) each.
    """
    if len(adapter) < 5: error('Adapter sequence is too short.')

    # Build e.g. ABCDE(?:F(?:G)?)? for adapter ABCDEFG. The first five bases
    # are mandatory, the rest are matched as far as they are present.
    pattern = re.escape(adapter[:5])
    for base in adapter[5:]:
        pattern += '(?:' + re.escape(base)
    pattern += (len(adapter) - 5) * ')?'
    adapter_re = re.compile(pattern)

    # Log the pattern text, not the repr of the compiled object.
    info('Adapter regular expression: %s' % pattern)

    fasta = zopen(fasta_path)
    for line in fasta:
        if line[0] == '#':
            sys.stdout.write(line)
        elif line[0] == '>':
            sys.stdout.write(line)
            # rstrip() instead of [:-1] so that a final line without a
            # newline does not lose its last base.
            seq = next(fasta).rstrip('\n')
            m = adapter_re.search(seq)
            if m: seq = seq[:m.start()]
            print(seq)
        elif line[0] == '@':
            sys.stdout.write(line)
            seq = next(fasta).rstrip('\n')
            m = adapter_re.search(seq)
            trim_len = m.start() if m else len(seq)
            print(seq[:trim_len])
            sys.stdout.write(next(fasta))  # The '+' separator line
            print(next(fasta).rstrip('\n')[:trim_len])
Esempio n. 5
0
def parallel(command, job_name, max_workers, cpus, memory, partition,
	time_limit):
	"""Run a command once per target, locally or through SLURM.

	Targets are whitespace-delimited items read from standard input. If
	standard input is a terminal, the command is run once with a single
	empty target and must not reference $x. The command and the targets are
	written to a "tasks" file in a new log directory under ~/.jobs. Then up
	to max_workers workers are started: "parallel worker" subprocesses if
	partition is 'local', otherwise sbatch submissions rendered from
	sbatch_template.

	memory is the required memory in GB per job. time_limit is multiplied
	by 60 before it is passed to the sbatch template.
	"""

	# Allow splitting the command string onto multiple lines.
	command = command.replace('\n', ' ')

	if sys.stdin.isatty():
		# If the user did not provide any input, just run the command once.
		# The command must not contain $x.
		if '$x' in command or '${x' in command:
			error('Command contains $x but no targets provided.')
		targets = ['']
	else:
		# Parse whitespace-delimited target items from standard input.
		# split() without arguments also handles tabs and never yields
		# empty items for blank lines or repeated spaces.
		targets = []
		for line in sys.stdin:
			targets += line.split()

		if not targets: error('Command requires targets but none provided.')

	# Find repeated targets in a single pass.
	seen = set()
	duplicated = set()
	for target in targets:
		if target in seen:
			duplicated.add(target)
		else:
			seen.add(target)
	if duplicated:
		error('Target list contains multiple instances of the following targets:\n' + '\n'.join(sorted(duplicated)))

	if max_workers > len(targets): max_workers = len(targets)

	if partition != 'local':
		info('Distributing %d %s named "%s" on %s partition '
			'(with %d %s and %d GB of memory per job).' % (
			len(targets), 'jobs' if len(targets) != 1 else 'job',
			job_name, partition, cpus, 'CPUs' if cpus != 1 else 'CPU', memory))
	else:
		info('Starting %d %s named "%s" on local machine.' % (
			len(targets), 'jobs' if len(targets) != 1 else 'job', job_name))

	log_dir = os.path.expanduser('~/.jobs/%s_%s' % (job_name,
		datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S')))
	os.makedirs(log_dir)

	# First line is the command, followed by one target per line.
	with open('%s/tasks' % log_dir, 'w') as f:
		f.write('%s\n' % command)
		for target in targets: f.write('%s\n' % target)

	if partition == 'local':
		worker_cmd = ['parallel', 'worker', log_dir]
		workers = [subprocess.Popen(worker_cmd) for w in range(max_workers)]
		for w in workers: w.wait()
	else:
		# Run the job steps on a SLURM cluster using sbatch.
		# Required memory is given in GB per job step. Convert to MB per CPU.
		# int() makes the value a whole number on Python 2 as well, where
		# round() returns a float.
		mem_per_cpu = int(round(float(memory) / cpus * 1000))
		sbatch_script = sbatch_template % (partition, job_name, cpus,
			mem_per_cpu, 60 * time_limit, log_dir, log_dir, log_dir)
		workers = [subprocess.Popen(['sbatch', '-Q'], stdin=subprocess.PIPE)
			for p in range(max_workers)]
		for w in workers:
			w.stdin.write(sbatch_script.encode('utf-8'))
			w.stdin.close()
		for w in workers: w.wait()
Esempio n. 6
0
def cghub_list(samples):
    """Print one tab-separated line per sample and log summary totals.

    Each line holds the sample's first file name, legacy sample ID,
    reference genome and center. The sample count and the summed size of
    each sample's first file (in GB) are reported through info().
    """
    row_format = '%s\t%s\t%s\t%s'
    for sample in samples:
        fields = (sample.files[0], sample.legacy_sample_id,
                  sample.ref_genome, sample.center)
        print(row_format % fields)

    info('Found a total of %d samples.' % len(samples))

    total_bytes = sum(sample.filesizes[0] for sample in samples)
    info('Total filesize: %.1f GB.' % (total_bytes / 1e9))
Esempio n. 7
0
def smallrna_parse_mirbase(mirbase_gff_path):
    """Print the mature microRNA loci of a miRBase GFF file in BED format.

    Only rows whose third column is 'miRNA' are used. The output columns
    are chromosome, start (converted to 0-based), end, and the value of the
    Name attribute in the ninth column. Comment lines, blank lines and rows
    with fewer than nine columns are skipped.
    """
    info('Printing mature microRNA loci in BED format.')

    mirna_name_re = re.compile(r';Name=([^\s;]+)')

    with open(mirbase_gff_path) as gff_file:
        for line in gff_file:
            if line[0] == '#': continue
            tokens = line.rstrip('\n').split('\t')
            # Blank or truncated rows have no feature type or attributes.
            if len(tokens) < 9: continue
            if tokens[2] != 'miRNA': continue

            m = mirna_name_re.search(tokens[8])
            if m is None:
                info('WARNING: No Name attribute found in: %s' % tokens[8])
                continue

            print('%s\t%d\t%d\t%s' % (tokens[0], int(tokens[3]) - 1,
                                      int(tokens[4]), m.group(1)))
Esempio n. 8
0
def sam_pileup(region, bam_paths, min_al_quality=0):
    """Print a per-sample pileup summary for each position of a region.

    region is 'chr:pos' or 'chr:start-end'. For every position and every
    existing BAM file, "samtools mpileup" is run and one line is printed
    with the sample name, the sorted pileup bases and any indels. Missing
    BAM files are reported through info() and skipped. Nothing is done if
    no BAM file exists.
    """

    # Check the file paths here to ensure a nicer error message if files are
    # missing.
    for path in bam_paths:
        if not os.path.isfile(path):
            info('WARNING: File %s was not found.' % path)
    bam_paths = [path for path in bam_paths if os.path.isfile(path)]
    if not bam_paths: return

    chrom, span = region.replace(' ', '').split(':')
    span = [int(x) for x in span.split('-')]
    if len(span) == 1: span *= 2
    start, end = span[0], span[1]

    # Optional indel marker (base, sign, length) followed by a run of bases.
    indel_rx = re.compile(r'(\w[+-]\d+)?(\w+)(?![+-])')

    with open(os.devnull, 'a') as dev_null:
        for pos in range(start, end + 1):
            if start != end: print('Pileup for %s:%d:' % (chrom, pos))

            for bam in bam_paths:
                # An argument list avoids shell quoting problems with
                # unusual paths. universal_newlines yields text rather than
                # bytes on Python 3.
                line = subprocess.check_output(
                    ['samtools', 'mpileup', '-A', '-B',
                     '-q%d' % min_al_quality,
                     '-r', '%s:%d-%d' % (chrom, pos, pos), bam],
                    stderr=dev_null,
                    universal_newlines=True)
                sample_name = re.sub(r'(.*/)?(.*)\.bam', r'\2', bam)
                if not line:
                    print('%s\t' % sample_name)
                    continue

                tokens = line.rstrip('\n').split('\t')
                # Drop read start markers (with their mapping quality
                # character), read end markers and reference skips.
                bases = re.sub(r'\^.', '', tokens[4]).upper()
                bases = re.sub(r'[$<>]', '', bases)

                # Parse the pileup string for indels. The indel sequence
                # is the first N letters after the length, and is removed
                # from the plain bases.
                indel_tokens = indel_rx.findall(bases)
                bases = ''.join([
                    m[1][int(m[0][2:]):] if m[0] else m[1]
                    for m in indel_tokens
                ])
                indels = [
                    m[0][:2] + m[1][:int(m[0][2:])] for m in indel_tokens
                    if m[0]
                ]

                bases = ''.join(sorted(bases))
                if bases: bases += ' '
                print('%s\t%s%s' % (sample_name, bases, ' '.join(indels)))
Esempio n. 9
0
def swiss_link(tsv_path):
    """Create symbolic links listed in a two-column tab-delimited file.

    Each line holds a source path and a destination path. Lines that do not
    have exactly two columns are ignored. A link is skipped, with a message
    through info(), if the source does not exist or if anything (including
    a dangling link) already exists at the destination.
    """
    # The 'U' mode flag was removed in Python 3.11. Strip both line ending
    # styles by hand instead.
    with open(tsv_path) as tsv_file:
        for line in tsv_file:
            tokens = line.rstrip('\r\n').split('\t')
            if len(tokens) != 2: continue

            source, dest = tokens
            if not os.path.exists(source):
                info('Source file %s does not exist.' % source)
                continue
            if os.path.lexists(dest):
                info('Destination file %s exists. Will not overwrite.' % dest)
                continue

            os.symlink(source, dest)
Esempio n. 10
0
def sam_reads_raw(bam_path, out_prefix):
    """Extract raw read sequences from a BAM file, one sequence per line.

    Paired reads (names ending in '/1' and '/2') go to
    <out_prefix>_1.reads.gz and <out_prefix>_2.reads.gz in matching order.
    Unpaired reads, and mates whose partner never shows up, go to
    <out_prefix>.reads.gz. Read names and qualities are not written.
    """
    out_1 = zopen('%s_1.reads.gz' % out_prefix, 'w')
    out_2 = zopen('%s_2.reads.gz' % out_prefix, 'w')
    out = zopen('%s.reads.gz' % out_prefix, 'w')

    # Sequences of mates still waiting for their partner, keyed by name.
    reads_1 = {}
    reads_2 = {}

    try:
        # The "samtools bam2fq" command does not output supplementary or
        # secondary alignments. Each read only has one primary alignment.
        options = '-n' if has_mate_suffixes(bam_path) else ''
        bam2fq = shell_stdout('samtools bam2fq %s %s' % (options, bam_path))
        for line in bam2fq:
            if line[0] != '@': error('Invalid bam2fq output.')
            line = line[:-1]
            if line.endswith('/1'):
                segname = line[1:-2]
                mate = reads_2.pop(segname, None)
                if mate is not None:
                    out_1.write(next(bam2fq))
                    out_2.write('%s\n' % mate)
                else:
                    reads_1[segname] = next(bam2fq)[:-1]
            elif line.endswith('/2'):
                segname = line[1:-2]
                mate = reads_1.pop(segname, None)
                if mate is not None:
                    out_1.write('%s\n' % mate)
                    out_2.write(next(bam2fq))
                else:
                    reads_2[segname] = next(bam2fq)[:-1]
            else:
                out.write('%s\n' % next(bam2fq)[:-1])

            # Skip per-base qualities. They can start with '@'.
            next(bam2fq)
            next(bam2fq)

        # list(d)[:5] works on Python 2 and 3; d.keys()[:5] only on Python 2.
        info('Found %d orphan first mates.' % len(reads_1))
        for read_id in list(reads_1)[:5]:
            info('- Example: %s' % read_id)

        info('Found %d orphan second mates.' % len(reads_2))
        for read_id in list(reads_2)[:5]:
            info('- Example: %s' % read_id)

        for read in reads_1.values():
            out.write('%s\n' % read)
        for read in reads_2.values():
            out.write('%s\n' % read)
    finally:
        out_1.close()
        out_2.close()
        out.close()
Esempio n. 11
0
def samples_by_patient(samples_path):
	"""Print the sample names of a file grouped by TCGA patient ID.

	Sample names are read one per line from samples_path. Names that do not
	contain a TCGA patient ID are left out of the listing; their number is
	reported through info().
	"""
	sample_file = zopen(samples_path)
	try:
		samples = [line.strip() for line in sample_file]
	finally:
		sample_file.close()

	patients = {}
	num_without_pid = 0
	for s in samples:
		m = re.search('TCGA-..-....', s)
		if not m:
			num_without_pid += 1
			continue
		patients.setdefault(m.group(0), []).append(s)

	if num_without_pid:
		info('WARNING: %d sample names did not contain a TCGA patient ID.' % num_without_pid)

	# items() exists on both Python 2 and 3; iteritems() only on Python 2.
	for patient, psamples in patients.items():
		print('Patient %s (%d samples):' % (patient, len(psamples)))
		for sample in psamples: print('- %s' % sample)
Esempio n. 12
0
def smallrna_expression(read_paths, srna_reference_path):
    """Print a table of read counts per small RNA sequence and sample.

    Reads from each FASTA/FASTQ file in read_paths are counted by exact
    sequence. A sequence is reported if it equals a reference small RNA
    from srna_reference_path, or one of its variants (last base removed, or
    one extra base appended). Other sequences are reported only if at least
    two samples have min_other_reads or more reads of them. The output has
    one row per sequence and one count column per read file.
    """

    S = len(read_paths)
    min_other_reads = 100

    # Read the FASTA file containing small RNA reference sequences and
    # build a lookup that also includes potential isoforms and variants of
    # these small RNA sequences.
    info('Constructing database of reference small RNA sequences...')
    seq_names = {}
    for name, seq in read_fasta(srna_reference_path).items():
        name = re.sub(' MIMAT.*', '', name)
        seq = seq.upper().replace('U', 'T')
        seq_names[seq] = name
        seq_names[seq[:-1]] = name + '-1'
        for base in 'ACGT':
            seq_names[seq + base] = name + '+' + base

    info('Counting reads aligning to small RNA sequences...')
    counts = defaultdict(lambda: [0] * S)
    for s, read_path in enumerate(read_paths):
        fasta = zopen(read_path)
        try:
            for line in fasta:
                if not line or line[0] == '#': continue
                if line[0] in '>@':
                    seq = next(fasta).rstrip('\n')
                    counts[seq][s] += 1
                    if line[0] == '@':
                        # FASTQ: skip the '+' line and the qualities.
                        # Qualities can start with '@' or '>' and would
                        # otherwise be mistaken for a header.
                        next(fasta, None)
                        next(fasta, None)
        finally:
            fasta.close()

    # count is a plain list, so the samples passing the threshold must be
    # counted one by one.
    counts = {
        seq: count
        for seq, count in counts.items()
        if seq in seq_names or
        sum(1 for c in count if c >= min_other_reads) >= 2
    }

    print('NAME\tSEQUENCE\t%s' % '\t'.join(read_paths))
    # Unnamed sequences have an empty name and therefore sort first.
    for seq in sorted(counts, key=lambda x: seq_names.get(x, '')):
        sys.stdout.write('%s\t%s' % (seq_names.get(seq, ''), seq))
        for x in counts[seq]:
            sys.stdout.write('\t%d' % x)
        sys.stdout.write('\n')
Esempio n. 13
0
def cghub_download(samples):
    """Download the first file of each sample with gtdownload.

    Before every download the current directory tree is scanned. A sample
    is skipped if a file with the same name and size is already present
    anywhere below the current directory.
    """
    for sample in samples:
        # Map file name -> size for everything below the current directory,
        # so that files which are already present are not downloaded again.
        sizes_on_disk = {}
        for dirpath, _subdirs, names in os.walk('.'):
            for name in names:
                sizes_on_disk[name] = os.stat(
                    os.path.join(dirpath, name)).st_size

        filename = sample.files[0]
        filesize = sample.filesizes[0]

        already_present = (filename in sizes_on_disk
                           and sizes_on_disk[filename] == filesize)
        if already_present:
            info('%s has already been downloaded...' % filename)
        else:
            info('Downloading %s...' % filename)
            shell(
                'gtdownload -v -d %s -c ~/tools/genetorrent*/cghub_2016.key'
                % sample.analysis_data_uri)
Esempio n. 14
0
def coverage_cds(bam_path, gtf_path):
	"""Print a histogram of read coverage over coding (CDS) positions.

	CDS intervals are taken from the GTF file at gtf_path, restricted to
	the reference sequences of the BAM file with names of at most five
	characters. Per-base coverage comes from "bedtools genomecov -d". The
	histogram has 200 bins; the last bin collects all coverages of 199 or
	more.
	"""

	chr_sizes = ref_sequence_sizes(bam_path)

	info('Constructing a map of coding regions...')
	# One byte per base (0 = non-coding, 1 = coding). A bytearray is far
	# smaller than a list of bools and supports slice assignment.
	coding = {}
	for chrom, size in chr_sizes.items():
		coding[chrom] = bytearray(size)
	for line in zopen(gtf_path):
		if line.startswith('#'): continue
		cols = line.split('\t')
		if len(cols) < 5 or cols[2] != 'CDS': continue
		if len(cols[0]) > 5: continue   # Ignore chromosomes other than chrXX
		mask = coding.get(cols[0])
		if mask is None: continue
		# GTF intervals are 1-based and inclusive. Clamp to the sequence so
		# that the slice assignment can never change the mask length.
		start = max(int(cols[3]) - 1, 0)
		end = min(int(cols[4]), len(mask))
		if end > start:
			mask[start:end] = b'\x01' * (end - start)

	info('Calculating a coverage histogram...')
	coverage_hist = [0] * 200
	chrom = ''
	pos = 0
	for line in shell_stdout('bedtools genomecov -d -split -ibam %s' % bam_path):
		cols = line.split('\t')
		if cols[0] != chrom:
			chrom = cols[0]
			cds = coding[chrom]
			# Positions are 1-based; pos is incremented below before use.
			pos = int(cols[1]) - 2
			info('%s...' % chrom)
		pos += 1
		if cds[pos]:
			coverage_hist[min(int(cols[2]), len(coverage_hist) - 1)] += 1

	print('Coverage histogram:')
	print('===================')
	for cov, count in enumerate(coverage_hist):
		print('%d: %d' % (cov, count))
Esempio n. 15
0
def sam_reads(bam_path, out_prefix):
    """Extract the reads of a BAM file into gzipped FASTQ files.

    Alignments flagged as first mate (0x40) or second mate (0x80) are
    paired by read name and written in matching order to
    <out_prefix>_1.fq.gz and <out_prefix>_2.fq.gz. Unpaired reads, and mates
    whose partner never shows up, are written to <out_prefix>.fq.gz.
    """
    fastq_1 = zopen('%s_1.fq.gz' % out_prefix, 'w')
    fastq_2 = zopen('%s_2.fq.gz' % out_prefix, 'w')
    fastq = zopen('%s.fq.gz' % out_prefix, 'w')

    # (sequence, qualities) of mates still waiting for their partner.
    reads_1 = {}
    reads_2 = {}

    record = '@%s\n%s\n+\n%s\n'

    try:
        # FIXME: We assume that each read only has one alignment in the BAM
        # file.
        for al in read_sam(bam_path):
            flags = int(al[1])
            if flags & 0x40:
                rname = al[0][:-2] if al[0].endswith('/1') else al[0]
                mate = reads_2.pop(rname, None)
                if mate is not None:
                    fastq_1.write(record % (rname + '/1', al[9], al[10]))
                    fastq_2.write(record % (rname + '/2', mate[0], mate[1]))
                else:
                    reads_1[rname] = (al[9], al[10])
            elif flags & 0x80:
                rname = al[0][:-2] if al[0].endswith('/2') else al[0]
                mate = reads_1.pop(rname, None)
                if mate is not None:
                    fastq_1.write(record % (rname + '/1', mate[0], mate[1]))
                    fastq_2.write(record % (rname + '/2', al[9], al[10]))
                else:
                    reads_2[rname] = (al[9], al[10])
            else:
                fastq.write(record % (al[0], al[9], al[10]))

        # list(d)[:5] and items() work on Python 2 and 3; keys()[:5] and
        # iteritems() only on Python 2.
        info('Found %d orphan first mates.' % len(reads_1))
        for read_id in list(reads_1)[:5]:
            info('- Example: %s' % read_id)

        info('Found %d orphan second mates.' % len(reads_2))
        for read_id in list(reads_2)[:5]:
            info('- Example: %s' % read_id)

        for orphans in (reads_1, reads_2):
            for rname, read in orphans.items():
                fastq.write(record % (rname, read[0], read[1]))
    finally:
        fastq_1.close()
        fastq_2.close()
        fastq.close()
Esempio n. 16
0
def somatic(vcf_path, sample_pairs):
    """Print the variants of a VCF that are somatic in at least one pair.

    sample_pairs is a list of "test,control" sample name strings. A variant
    is kept if, for any pair, the test sample's genotype index in
    gt_symbols is 2 or higher while the control sample's index is exactly
    1. Sample columns start after the 'ESP6500' column if present,
    otherwise after 'ALT'.
    """
    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('##'): break

    headers = line.rstrip().split('\t')
    sample_col = headers.index('ESP6500' if 'ESP6500' in
                               headers else 'ALT') + 1
    samples = headers[sample_col:]

    # Convert sample pair names into index 2-tuples.
    sample_pairs = [pair.split(',') for pair in sample_pairs]
    malformed = [pair for pair in sample_pairs if len(pair) != 2]
    if malformed:
        info(malformed)
        error('Test and control samples must be in "test,control" format.')
    for test, ctrl in sample_pairs:
        if test not in samples:
            error('Test sample %s was not found in VCF file.' % test)
        if ctrl not in samples:
            error('Control sample %s was not found in VCF file.' % ctrl)
    sample_pairs = [(samples.index(test), samples.index(ctrl))
                    for test, ctrl in sample_pairs]

    sys.stdout.write(line)

    for line in vcf_file:
        gt_cols = line.rstrip('\n').split('\t')[sample_col:]

        # The genotype is the text before the first ':'. split() also copes
        # with a call that has no ':' at all, where find() would return -1
        # and the slice would silently drop the last character.
        genotypes = [gt_symbols.index(g.split(':', 1)[0]) for g in gt_cols]

        is_somatic = any(
            genotypes[test] >= 2 and genotypes[ctrl] == 1
            for test, ctrl in sample_pairs)
        if is_somatic:
            sys.stdout.write(line)
Esempio n. 17
0
def variant_merge(vcf_paths):
    """Merge several VCF files into one with all sample columns side by side.

    All files must share the same leading (non-sample) header columns. The
    data lines of all files are tagged with the index of their file and
    sorted with an external "sort", so that calls for the same variant
    (same first four columns) end up next to each other. Samples for which
    a file has no line for a variant get the placeholder call ':0:0'.
    """
    sort_in, sort_out = shell_stdinout('sort -k2,2 -k3,3n -k4,4 -k5,5')
    cons_headers = []  # Consensus headers
    vcf_samples = []  # Sample names of each VCF
    gtype_col = 4  # Index of the first sample column
    for vcf_index, vcf_path in enumerate(vcf_paths):
        info('Merging VCF file %s...' % vcf_path)
        vcf = zopen(vcf_path)
        line = ''
        for line in vcf:
            if not line.startswith('#'): break
        if not line: error('VCF file %s is empty.' % vcf_path)
        headers = line.rstrip('\n').split('\t')
        gtype_col = (4 if 'ESP6500' not in headers else
                     headers.index('ESP6500') + 1)
        if not cons_headers: cons_headers = headers[:gtype_col]
        if cons_headers != headers[:gtype_col]: error('Header mismatch!')
        vcf_samples.append(headers[gtype_col:])
        # Prefix each line with the file index so that the calls can be
        # routed to the right columns after sorting.
        for line in vcf:
            sort_in.write('%d\t%s' % (vcf_index, line))
    sort_in.close()

    all_samples = [s for samples in vcf_samples for s in samples]
    print('\t'.join(cons_headers + all_samples))
    vcf_sample_counts = [len(samples) for samples in vcf_samples]
    S = sum(vcf_sample_counts)
    # Offset of each file's first sample within the merged sample columns.
    vcf_sample_col = [
        sum(vcf_sample_counts[0:k]) for k in range(len(vcf_samples))
    ]

    info('Merged VCF will contain:')
    info('- %d header columns' % len(cons_headers))
    for samples, path in zip(vcf_samples, vcf_paths):
        info('- %d columns from %s' % (len(samples), path))

    prev = None
    calls = [':0:0'] * S
    for line in sort_out:
        cols = line.rstrip('\n').split('\t')
        vcf_index = int(cols[0])
        call_col = vcf_sample_col[vcf_index]
        # Compare only the four variant-identifying columns. prev may hold
        # extra annotation columns, and comparing it in full against
        # cols[1:5] would never match.
        if prev is None or prev[:4] != cols[1:5]:
            if prev is not None:
                print('\t'.join(prev + calls))
            prev = cols[1:gtype_col + 1]
            calls = [':0:0'] * S
        calls[call_col:call_col + vcf_sample_counts[vcf_index]] = \
            cols[gtype_col + 1:]

    # Handle the last variant, if there was any input at all.
    if prev is not None:
        print('\t'.join(prev + calls))
Esempio n. 18
0
def ensembl_gene_bed(gtf_path):
    """Print one BED line per gene, spanning all of the gene's exons.

    Only 'exon' rows are used, and only those whose first column is in
    human_chr and whose second column is in accepted_gene_types. Each
    output line holds the chromosome (with a 'chr' prefix), the 0-based
    start, the end, "gene_name (gene_id)", an empty score column and the
    strand. A message is logged if the exons of a gene disagree on
    chromosome or strand.
    """
    gene_id_re = re.compile(r'gene_id "(.+?)"')
    gene_name_re = re.compile(r'gene_name "(.+?)"')

    gene_id_to_name = {}
    gene_exons = {}
    num_without_id = 0

    gtf_file = zopen(gtf_path)
    try:
        for line in gtf_file:
            if line.startswith('#'): continue
            c = line.rstrip('\n').split('\t')
            if c[0] not in human_chr: continue
            if c[1] not in accepted_gene_types: continue
            if c[2] != 'exon': continue

            chrom, start, end, strand = c[0], int(c[3]), int(c[4]), c[6]
            if not chrom.startswith('chr'): chrom = 'chr' + chrom

            m = gene_id_re.search(line)
            if m is None:
                # Without a gene ID the exon cannot be assigned to a gene.
                num_without_id += 1
                continue
            gene_id = m.group(1)

            # Fall back to the gene ID if the row carries no gene name.
            m = gene_name_re.search(line)
            gene_name = m.group(1) if m else gene_id

            gene_exons.setdefault(gene_id, []).append(
                (chrom, strand, start, end))
            gene_id_to_name[gene_id] = gene_name
    finally:
        gtf_file.close()

    if num_without_id:
        info('WARNING: %d exons without a gene_id were ignored.' %
             num_without_id)

    # items() exists on both Python 2 and 3; iteritems() only on Python 2.
    for gene_id, exons in gene_exons.items():
        chrom, strand = exons[0][0], exons[0][1]
        if not all(exon[0] == chrom for exon in exons):
            info('Chromosome confusion detected.')
        if not all(exon[1] == strand for exon in exons):
            info('Strand confusion detected.')

        start = min(ex[2] for ex in exons)
        end = max(ex[3] for ex in exons)
        print('%s\t%d\t%d\t%s (%s)\t\t%s' %
              (chrom, start - 1, end, gene_id_to_name[gene_id], gene_id,
               strand))
Esempio n. 19
0
def _fasta_repair_next_record(handle, mate_suffix):
    """Return (header, seq, qual) of the next record, or None at end of file.

    Lines that do not start with '>' or '@' are skipped. mate_suffix ('/1'
    or '/2') is removed from the header. qual is None for FASTA records.
    """
    while True:
        line = handle.readline()
        if line == '': return None
        if line[0] not in '>@': continue

        header = line[:-1].replace(mate_suffix, '')
        seq = handle.readline()[:-1]
        qual = None
        if header[0] == '@':
            # Advance to the '+' separator; stop quietly at end of file.
            while line and line[0] != '+':
                line = handle.readline()
            qual = handle.readline()[:-1]
        return header, seq, qual


def _fasta_repair_place(record, mate, own_orphans, mate_orphans, out_1,
                        out_2):
    """Write the record with its waiting partner, or store it as an orphan."""
    header, seq, qual = record

    # Check that there are as many quality values as nucleotides. FASTA
    # records have no qualities and always pass.
    if qual is not None and len(seq) != len(qual):
        info('Read %s/%d discarded due to corrupted qualities.' %
             (header, mate))
        return

    partner = mate_orphans.pop(header, None)
    if partner is None:
        own_orphans[header] = (seq, qual)
        return

    first, second = (((seq, qual), partner) if mate == 1 else
                     (partner, (seq, qual)))
    if first[1] and second[1]:
        out_1.write('%s/1\n%s\n+\n%s\n' % (header, first[0], first[1]))
        out_2.write('%s/2\n%s\n+\n%s\n' % (header, second[0], second[1]))
    else:
        out_1.write('%s/1\n%s\n' % (header, first[0]))
        out_2.write('%s/2\n%s\n' % (header, second[0]))


def fasta_repair(fasta_1_path, fasta_2_path, out_1_path, out_2_path):
    """Re-pair the reads of two FASTA/FASTQ files that have gone out of sync.

    The two inputs are read alternately, one record at a time. A read is
    written as soon as its mate (same header after removing '/1' or '/2')
    has been seen in the other file, so both outputs list the pairs in the
    same order. Reads without a mate are not written. FASTQ reads whose
    quality string length differs from the sequence length are discarded.
    """
    fasta_1 = zopen(fasta_1_path)
    fasta_2 = zopen(fasta_2_path)

    out_1 = zopen(out_1_path, 'w')
    out_2 = zopen(out_2_path, 'w')

    # Reads still waiting for their mate: header -> (seq, qual).
    orphans_1 = {}
    orphans_2 = {}

    try:
        while fasta_1 is not None or fasta_2 is not None:
            if fasta_1 is not None:
                record = _fasta_repair_next_record(fasta_1, '/1')
                if record is None:
                    fasta_1.close()
                    fasta_1 = None
                else:
                    _fasta_repair_place(record, 1, orphans_1, orphans_2,
                                        out_1, out_2)

            if fasta_2 is not None:
                record = _fasta_repair_next_record(fasta_2, '/2')
                if record is None:
                    fasta_2.close()
                    fasta_2 = None
                else:
                    _fasta_repair_place(record, 2, orphans_2, orphans_1,
                                        out_1, out_2)
    finally:
        out_1.close()
        out_2.close()
Esempio n. 20
0
def _fasta_check_next_record(handle, mate_suffix):
    """Return (header, seq, qual) of the next record, or None at end of file.

    Lines that do not start with '>' or '@' are skipped. mate_suffix ('/1'
    or '/2') is removed from the header. qual is None for FASTA records.
    """
    while True:
        line = handle.readline()
        if line == '': return None
        if line[0] not in '>@': continue

        header = line[:-1].replace(mate_suffix, '')
        seq = handle.readline()[:-1]
        qual = None
        if header[0] == '@':
            # Advance to the '+' separator; stop quietly at end of file.
            while line and line[0] != '+':
                line = handle.readline()
            qual = handle.readline()[:-1]
        return header, seq, qual


def fasta_check(fasta_1_path, fasta_2_path, out_1_path, out_2_path):
    """Separate the valid read pairs of two FASTA/FASTQ files from bad ones.

    The inputs are read in lockstep, one record from each. A pair is bad if
    the two headers differ (after removing '/1' and '/2') or if either read
    has a quality string whose length differs from its sequence. Good pairs
    are written to out_1_path and out_2_path, bad pairs to the same paths
    prefixed with 'bad.'. Processing stops with a message if one file ends
    before the other.
    """
    fasta_1 = zopen(fasta_1_path)
    fasta_2 = zopen(fasta_2_path)

    out_1 = zopen(out_1_path, 'w')
    out_2 = zopen(out_2_path, 'w')

    bad_out_1 = zopen('bad.' + out_1_path, 'w')
    bad_out_2 = zopen('bad.' + out_2_path, 'w')

    try:
        while True:
            record_1 = _fasta_check_next_record(fasta_1, '/1')
            record_2 = _fasta_check_next_record(fasta_2, '/2')

            if record_1 is None and record_2 is None: break
            if record_1 is None or record_2 is None:
                info('File terminated abruptly.')
                break

            header_1, seq_1, qual_1 = record_1
            header_2, seq_2, qual_2 = record_2

            discard = (
                header_1 != header_2 or
                (qual_1 is not None and len(seq_1) != len(qual_1)) or
                (qual_2 is not None and len(seq_2) != len(qual_2)))

            dest_1, dest_2 = ((bad_out_1, bad_out_2) if discard else
                              (out_1, out_2))
            if qual_1 and qual_2:
                dest_1.write('%s/1\n%s\n+\n%s\n' % (header_1, seq_1, qual_1))
                dest_2.write('%s/2\n%s\n+\n%s\n' % (header_2, seq_2, qual_2))
            else:
                dest_1.write('%s/1\n%s\n' % (header_1, seq_1))
                dest_2.write('%s/2\n%s\n' % (header_2, seq_2))
    finally:
        fasta_1.close()
        fasta_2.close()

        out_1.close()
        out_2.close()

        bad_out_1.close()
        bad_out_2.close()
Esempio n. 21
0
def calculate_BAF(bam_paths, genome_path, kgenomes_path, options):
	"""Print B-allele fractions of a tumor at the normal's heterozygous sites.

	bam_paths must list the tumor BAM first and the matched normal BAM
	second. For every allele reported by simple_pileup(), genotypes are
	called with call_genotypes(). A site is used if the normal sample is
	called heterozygous and has at least 15 reads in total. For each such
	site one line is printed with chromosome, position, reference allele,
	alternate allele, and the tumor's alternate read count divided by its
	total read count ('NaN' if the tumor has no reads).

	options supplies region, ignore_mapq, min_mapq, keep_all and
	min_hetz_reads, and is also passed on to call_genotypes().
	"""
	# Genotype codes index the symbol list ['', '0/0', '0/1', '1/1'].
	heterozygous = 2
	min_normal_reads = 15

	if not os.path.exists(genome_path):
		error('Could not find genome FASTA file %s.' % genome_path)

	# Sample 0 is read as the tumor and sample 1 as the normal below.
	if len(bam_paths) < 2:
		error('A tumor BAM file and a normal BAM file are required.')

	if options.region:
		for bam_path in bam_paths:
			if not os.path.exists(bam_path + '.bai'):
				error('No index found for BAM file %s.' % bam_path)

	samples = [os.path.basename(p).replace('.bam', '') for p in bam_paths]

	print('CHROM\tPOSITION\tREF\tALT\t%s' % samples[0])

	ignore_mapq = [False] * len(samples)
	if options.ignore_mapq:
		for s, sample in enumerate(samples):
			if re.search(options.ignore_mapq, sample) is not None:
				ignore_mapq[s] = True
				info('Ignoring mapping quality for sample %s.' % sample)

	for line in simple_pileup(bam_paths, genome_path, kgenomes_path,
		min_mapq=options.min_mapq,
		min_alt_alleles=(0 if options.keep_all else options.min_hetz_reads),
		region=options.region):

		if isinstance(line, bytes):
			line = line.decode('utf8')

		tokens = line[:-1].split('\t')
		if len(tokens) < 3: error('Invalid spileup line:\n%s' % line)
		if tokens[2] == 'N': continue
		pileups = [p.split(' ') for p in tokens[3:]]

		total_reads = [0] * len(samples)
		allele_reads = defaultdict(lambda: [0] * len(samples))

		# Each pileup is a flat list of (allele, count, second count)
		# triplets. The second count is only added for samples whose
		# mapping quality is ignored.
		for s, pileup in enumerate(pileups):
			if len(pileup) < 3: continue
			for a in range(0, len(pileup), 3):
				count = int(pileup[a+1]) + \
					(int(pileup[a+2]) if ignore_mapq[s] else 0)
				total_reads[s] += count
				if pileup[a] != '.': allele_reads[pileup[a]][s] = count

		# Call genotypes for each allele.
		for alt, reads in allele_reads.items():
			genotypes = call_genotypes(reads, total_reads, options)

			# Keep only sites where the normal sample is heterozygous and
			# sufficiently covered.
			if genotypes[1] != heterozygous: continue
			if total_reads[1] < min_normal_reads: continue

			# Reformat indels in VCF4 format
			ref = tokens[2]
			if len(alt) >= 2:
				if alt[1] == '+':    # Insertion
					alt = (ref if alt[0] == '.' else alt[0]) + alt[2:]
				elif alt[1] == '-':  # Deletion
					ref += alt[2:]
					alt = (ref[0] if alt[0] == '.' else alt[0])

			# B-allele fraction of the tumor sample.
			tumor_alt, tumor_total = float(reads[0]), total_reads[0]
			sys.stdout.write('\t'.join([tokens[0], tokens[1], ref, alt.upper()]))
			sys.stdout.write('\tNaN' if tumor_total == 0
				else '\t%.2f' % (tumor_alt / tumor_total))
			sys.stdout.write('\n')
Esempio n. 22
0
def calculate_BAF(bam_paths, genome_path, kgenomes_path, options):
    """Write tumor B-allele fractions at heterozygous normal sites to stdout.

    The first BAM file is treated as the tumor sample and the second one as
    the matched normal. Every alternate allele that call_genotypes() reports
    as heterozygous in the normal sample, at a site where the normal has at
    least 15 reads in total, produces one tab-separated row: chromosome,
    position, reference allele, alternate allele and the fraction of tumor
    reads carrying the alternate allele (two decimals, or ``NaN`` when the
    tumor has no reads at the site).

    Args:
        bam_paths: BAM file paths; index 0 is the tumor, index 1 the normal.
        genome_path: Reference genome FASTA path; must exist.
        kgenomes_path: Passed through unchanged to simple_pileup().
        options: Object providing ``region``, ``ignore_mapq``, ``min_mapq``,
            ``keep_all`` and ``min_hetz_reads``; it is also handed to
            call_genotypes().
    """
    # Genotype codes follow the table ['', '0/0', '0/1', '1/1'], so code 2
    # denotes a heterozygous (0/1) call.
    heterozygous = 2
    # Minimum total read depth required in the normal sample.
    min_normal_depth = 15

    if not os.path.exists(genome_path):
        error('Could not find genome FASTA file %s.' % genome_path)

    # Both samples are addressed by index below; fail with a clear message
    # instead of an IndexError deep inside the pileup loop.
    if len(bam_paths) < 2:
        error('Expected a tumor and a normal BAM file, got %d file(s).' %
              len(bam_paths))

    if options.region:
        for bam_path in bam_paths:
            if not os.path.exists(bam_path + '.bai'):
                error('No index found for BAM file %s.' % bam_path)

    samples = [os.path.basename(p).replace('.bam', '') for p in bam_paths]

    # Only the tumor sample gets a value column.
    print('CHROM\tPOSITION\tREF\tALT\t%s' % samples[0])

    ignore_mapq = [False] * len(samples)
    if options.ignore_mapq:
        for s, sample in enumerate(samples):
            if re.search(options.ignore_mapq, sample) is not None:
                ignore_mapq[s] = True
                info('Ignoring mapping quality for sample %s.' % sample)

    for line in simple_pileup(bam_paths,
                              genome_path,
                              kgenomes_path,
                              min_mapq=options.min_mapq,
                              min_alt_alleles=(0 if options.keep_all else
                                               options.min_hetz_reads),
                              region=options.region):

        if isinstance(line, bytes):
            line = line.decode('utf8')

        tokens = line[:-1].split('\t')
        if len(tokens) < 3: error('Invalid spileup line:\n%s' % line)
        if tokens[2] == 'N': continue

        # One column per sample, each a space-separated run of
        # (allele, count, second count) triplets.
        pileups = [p.split(' ') for p in tokens[3:]]

        total_reads = [0] * len(samples)
        allele_reads = defaultdict(lambda: [0] * len(samples))

        for s, pileup in enumerate(pileups):
            if len(pileup) < 3: continue
            for a in range(0, len(pileup), 3):
                # The second count is only included for samples whose
                # mapping quality is ignored (presumably the reads that
                # failed the MAPQ filter -- TODO confirm in simple_pileup).
                count = int(pileup[a + 1]) + \
                    (int(pileup[a + 2]) if ignore_mapq[s] else 0)
                total_reads[s] += count
                # '.' is the reference allele; only alternates are tracked.
                if pileup[a] != '.': allele_reads[pileup[a]][s] = count

        # Call genotypes for each allele.
        for alt, reads in allele_reads.items():
            genotypes = call_genotypes(reads, total_reads, options)

            # Keep only sites that are heterozygous and sufficiently
            # covered in the normal sample (index 1).
            if genotypes[1] != heterozygous: continue
            if total_reads[1] < min_normal_depth: continue

            # Reformat indels in VCF4 format
            ref = tokens[2]
            if len(alt) >= 2:
                if alt[1] == '+':  # Insertion
                    alt = (ref if alt[0] == '.' else alt[0]) + alt[2:]
                elif alt[1] == '-':  # Deletion
                    ref += alt[2:]
                    alt = (ref[0] if alt[0] == '.' else alt[0])

            # B-allele fraction of the tumor sample (index 0).
            tumor_total = total_reads[0]
            if tumor_total == 0:
                baf = 'NaN'
            else:
                baf = '%.2f' % (float(reads[0]) / tumor_total)

            sys.stdout.write('\t'.join(
                [tokens[0], tokens[1], ref, alt.upper(), baf]) + '\n')
Esempio n. 23
0
def variant_call(bam_paths, genome_path, options):
    """Call genotypes for every alternate allele and print them as a table.

    A header row (CHROM, POSITION, REF, ALT, then one column per sample) is
    printed first. Each following row describes one alternate allele at one
    site, with one ``genotype:allele_reads:total_reads`` field per sample.
    Unless ``options.keep_all`` is set, alleles for which every sample has a
    genotype code below 2 are omitted.

    Args:
        bam_paths: BAM file paths, one per sample; the sample name is the
            file's basename with '.bam' removed.
        genome_path: Reference genome FASTA path; must exist.
        options: Object providing ``region``, ``ignore_mapq``, ``min_mapq``,
            ``keep_all`` and ``min_hetz_reads``; it is also handed to
            call_genotypes().

    Note:
        ``gt_symbols`` is not defined in this function and must be available
        at module level.
    """
    if not os.path.exists(genome_path):
        error('Could not find genome FASTA file %s.' % genome_path)

    if options.region:
        for bam_path in bam_paths:
            if not os.path.exists(bam_path + '.bai'):
                error('No index found for BAM file %s.' % bam_path)

    samples = [os.path.basename(p).replace('.bam', '') for p in bam_paths]
    print('CHROM\tPOSITION\tREF\tALT\t%s' % '\t'.join(samples))

    ignore_mapq = [False] * len(samples)
    if options.ignore_mapq:
        for s, sample in enumerate(samples):
            if re.search(options.ignore_mapq, sample) is not None:
                ignore_mapq[s] = True
                info('Ignoring mapping quality for sample %s.' % sample)

    for line in simple_pileup(bam_paths,
                              genome_path,
                              min_mapq=options.min_mapq,
                              min_alt_alleles=(0 if options.keep_all else
                                               options.min_hetz_reads),
                              region=options.region):

        # The pileup may arrive as bytes under Python 3; splitting bytes on
        # a str separator would raise a TypeError.
        if isinstance(line, bytes):
            line = line.decode('utf8')

        tokens = line[:-1].split('\t')
        if len(tokens) < 3: error('Invalid spileup line:\n%s' % line)
        if tokens[2] == 'N': continue

        # One column per sample, each a space-separated run of
        # (allele, count, second count) triplets.
        pileups = [p.split(' ') for p in tokens[3:]]

        total_reads = [0] * len(samples)
        allele_reads = defaultdict(lambda: [0] * len(samples))

        for s, pileup in enumerate(pileups):
            if len(pileup) < 3: continue
            for a in range(0, len(pileup), 3):
                # The second count is only included for samples whose
                # mapping quality is ignored.
                count = int(pileup[a + 1]) + \
                    (int(pileup[a + 2]) if ignore_mapq[s] else 0)
                total_reads[s] += count
                # '.' is the reference allele; only alternates are tracked.
                if pileup[a] != '.': allele_reads[pileup[a]][s] = count

        # Call genotypes for each allele. dict.items() works on both
        # Python 2 and 3, whereas iteritems() only exists on Python 2.
        for alt, reads in allele_reads.items():
            genotypes = call_genotypes(reads, total_reads, options)
            if not options.keep_all and all(gt < 2 for gt in genotypes):
                continue

            gtypes = ('%s:%d:%d' % (gt_symbols[g], reads[s], total_reads[s])
                      for s, g in enumerate(genotypes))

            # Reformat indels in VCF4 format
            ref = tokens[2]
            if len(alt) >= 2:
                if alt[1] == '+':  # Insertion
                    alt = (ref if alt[0] == '.' else alt[0]) + alt[2:]
                elif alt[1] == '-':  # Deletion
                    ref += alt[2:]
                    alt = (ref[0] if alt[0] == '.' else alt[0])

            print('%s\t%s\t%s\t%s\t%s' %
                  (tokens[0], tokens[1], ref, alt.upper(), '\t'.join(gtypes)))
Esempio n. 24
0
def detect_discordant_reads(sam_path, genome_path, out_prefix, anchor_len):
    """Find split reads whose two anchors align to discordant loci.

    Unaligned reads from ``sam_path`` are split into ``anchor_len`` bp
    anchors, which are aligned against ``genome_path`` with Bowtie. For each
    pair of consecutive anchor alignments sharing a read name, the reference
    flanks are extracted, the breakpoint position with the fewest mismatches
    is chosen, and one line is written to
    ``<out_prefix>.discordant_reads.tsv.gz``:

        chr_1, strand_1, pos_1, chr_2, strand_2, pos_2, sequence

    Positions are 1-based read starts. In the sequence, bases that mismatch
    the reference are lower-cased and ``|`` marks the junction.

    Args:
        sam_path: Alignment file handed to ``samtools fasta -f 0x4``.
        genome_path: Genome used both as the Bowtie index argument and as
            the input of read_flat_seq().
        out_prefix: Prefix of the output file.
        anchor_len: Length in bp of the anchors taken from each read.
    """
    out = zopen(out_prefix + '.discordant_reads.tsv.gz', 'w')
    try:
        N = 0

        info('Splitting unaligned reads into %d bp anchors and aligning '
             'against the genome...' % anchor_len)

        # IMPORTANT: Only one thread can be used, otherwise alignment order
        # is not guaranteed and the loop below will fail.
        anchor_alignments = shell_stdout(
            'samtools fasta -f 0x4 %s | fasta split interleaved - %d | '
            'bowtie -f -p1 -v0 -m1 -B1 --suppress 5,6,7,8 %s -' %
            (sam_path, anchor_len, genome_path))

        # Normalize all chromosome names to carry a 'chr' prefix.
        chromosomes = read_flat_seq(genome_path)
        for name in list(chromosomes.keys()):
            if not name.startswith('chr'):
                chromosomes['chr' + name] = chromosomes.pop(name)

        # Anchors matching the opposite flank at this rate or higher are
        # considered too homologous to support a breakpoint.
        max_homology = 0.7

        prev = ['']
        for line in anchor_alignments:
            al = line.split('\t')
            # Strip a trailing '/1' or '/2'; the slice avoids an IndexError
            # on names shorter than two characters.
            if al[0][-2:-1] == '/': al[0] = al[0][:-2]

            # Two consecutive alignments with the same read name form an
            # anchor pair; anything else starts a new candidate.
            if al[0] != prev[0]:
                prev = al
                continue

            chrom = prev[2]
            mchrom = al[2]
            strand = prev[1]
            mstrand = al[1]
            pos = int(prev[3])
            mpos = int(al[3])
            # The full read sequence follows the first '_' in the read name.
            seq = prev[0][prev[0].find('_') + 1:]
            full_len = len(seq)

            if not chrom.startswith('chr'): chrom = 'chr' + chrom
            if not mchrom.startswith('chr'): mchrom = 'chr' + mchrom

            # Ignore anchor pairs where the anchors are too close.
            if chrom == mchrom and \
                    abs(pos - mpos) < full_len - anchor_len + 10:
                continue

            # Ignore rearrangements involving mitochondrial DNA.
            if 'M' in chrom or 'M' in mchrom: continue

            # Reorient the pairs so the first anchor is always upstream.
            # If mates are swapped, both mates must be reverse-complemented.
            if chrom > mchrom or (chrom == mchrom and pos > mpos):
                chrom, mchrom = mchrom, chrom
                pos, mpos = mpos, pos
                strand, mstrand = '+' if mstrand == '-' else '-', \
                    '+' if strand == '-' else '-'
                seq = revcomplement(seq)

            # Extract the flanking sequences from the chromosome sequences.
            # The range calculations are a bit complex. It's easier to
            # understand them if you first add one to all indices to convert
            # to 1-based genomic coordinates ("pos" and "mpos" are 1-based).
            if strand == '+':
                left_grch = chromosomes[chrom][pos - 1:pos + full_len - 1]
            else:
                left_grch = revcomplement(
                    chromosomes[chrom][pos + anchor_len - full_len - 1:
                                       pos + anchor_len - 1])

            if mstrand == '+':
                right_grch = chromosomes[mchrom][
                    mpos + anchor_len - full_len - 1:mpos + anchor_len - 1]
            else:
                right_grch = revcomplement(
                    chromosomes[mchrom][mpos - 1:mpos + full_len - 1])

            # If the read is at the very edge of a chromosome, ignore it.
            if len(left_grch) < full_len or len(right_grch) < full_len:
                continue

            # Make sure that reference sequences are in uppercase
            left_grch = left_grch.upper()
            right_grch = right_grch.upper()

            # Check that the read sequence is not too homologous on either
            # side of the breakpoint.
            left_match = float(
                sum([seq[i] == left_grch[i]
                     for i in range(full_len - anchor_len, full_len)
                     ])) / anchor_len
            right_match = float(
                sum([seq[i] == right_grch[i]
                     for i in range(anchor_len)])) / anchor_len

            if left_match >= max_homology or right_match >= max_homology:
                continue

            # A read shorter than two anchors leaves no room for a
            # breakpoint between them; skip it rather than fail on an empty
            # candidate list below.
            if full_len < 2 * anchor_len: continue

            # Identify the breakpoint location that minimizes the number of
            # nucleotide mismatches between the read and the breakpoint
            # flanks.
            potential_breakpoints = range(anchor_len,
                                          full_len - anchor_len + 1)
            mismatches = [0] * len(potential_breakpoints)
            for k, br in enumerate(potential_breakpoints):
                grch_chimera = left_grch[:br] + right_grch[br:]
                mismatches[k] = sum(
                    [seq[i] != grch_chimera[i] for i in range(full_len)])

            least_mismatches = min(mismatches)

            # "br" represent the number of nucleotides in the read
            # before the breakpoint, counting from the 5' end of the read.
            # If there is microhomology, we pick the first breakpoint.
            br = potential_breakpoints[mismatches.index(least_mismatches)]

            # Now that we know the exact fusion breakpoint, we mark
            # mismatches with a lower case nucleotide and augment the read
            # sequence with a | symbol to denote the junction.
            grch_chimera = left_grch[:br] + right_grch[br:]
            seq = ''.join([
                nuc if grch_chimera[k] == nuc else nuc.lower()
                for k, nuc in enumerate(seq)
            ])
            seq = seq[:br] + '|' + seq[br:]

            # Make positions represent read starts.
            if strand == '-': pos += anchor_len - 1
            if mstrand == '-': mpos += anchor_len - 1

            # Each discordant anchor pair is represented as a 7-tuple
            # (chr_1, strand_1, pos_1, chr_2, strand_2, pos_2, sequence).
            # Positions are 1-based and represent read starts.
            out.write('%s\t%s\t%d\t%s\t%s\t%d\t%s\n' %
                      (chrom, strand, pos, mchrom, mstrand, mpos, seq))
            N += 1

        info('Found %d discordant anchor pairs.' % N)
    finally:
        # Always release the output handle, even if alignment parsing fails.
        out.close()
Esempio n. 25
0
def detect_rearrangements(sam_path,
                          genome_path,
                          out_prefix,
                          anchor_len,
                          min_mapq,
                          orientation,
                          max_frag_len,
                          discard_duplicates='both-ends'):
    """Cluster discordant pairs and split reads into rearrangement calls.

    Steps:
      1. detect_discordant_pairs() writes discordant mate pairs.
      2. If ``anchor_len`` is positive, detect_discordant_reads() writes
         discordant split reads.
      3. Both files are merged and sorted by chromosome and position into
         ``<out_prefix>.sorted_pairs.tsv.gz``.
      4. Sorted entries are grouped into rearrangements; every
         rearrangement that still has at least two reads after duplicate
         removal is written to ``<out_prefix>.sv``.

    Args:
        sam_path: Input alignment file; must exist.
        genome_path: Genome passed to detect_discordant_reads().
        out_prefix: Prefix for all intermediate and output files.
        anchor_len: Anchor length for split-read analysis; 0 disables it.
        min_mapq: Forwarded to detect_discordant_pairs().
        orientation: Forwarded to detect_discordant_pairs().
        max_frag_len: Maximum distance between read positions for them to
            be grouped into the same rearrangement.
        discard_duplicates: 'no', 'both-ends' or 'one-end'.
    """
    if not os.path.exists(sam_path):
        error('File %s does not exist.' % sam_path)

    if discard_duplicates not in ('no', 'both-ends', 'one-end'):
        error('Invalid duplicate discard method: %s' % discard_duplicates)

    detect_discordant_pairs(sam_path,
                            out_prefix,
                            max_frag_len=max_frag_len,
                            min_mapq=min_mapq,
                            orientation=orientation)

    # Execute split read analysis if the user has specified an anchor length.
    if anchor_len > 0:
        detect_discordant_reads(sam_path, genome_path, out_prefix, anchor_len)

    info('Sorting discordant pairs by chromosomal position...')
    sort_inputs = '<(gunzip -c %s.discordant_pairs.tsv.gz)' % out_prefix
    if anchor_len > 0:
        sort_inputs += ' <(gunzip -c %s.discordant_reads.tsv.gz)' % out_prefix

    sort_tmp_dir = os.path.dirname(out_prefix)
    if not sort_tmp_dir: sort_tmp_dir = './'

    shell('sort -k1,1 -k3,3n -T %s %s | gzip -c > %s.sorted_pairs.tsv.gz' %
          (sort_tmp_dir, sort_inputs, out_prefix))

    def report_rearrangement(out, r):
        """Write rearrangement r if it has >= 2 reads; return 1 if written."""
        if discard_duplicates == 'both-ends':
            discard_duplicates_both_ends(r)
        elif discard_duplicates == 'one-end':
            discard_duplicates_one_end(r)
        if len(r.reads) < 2: return 0

        # Reads without a sequence are mate pairs, the rest are split reads.
        seqs = [read[2] for read in r.reads if read[2] is not None]
        out.write('%s\t%s\t%d\t\t\t%s\t%s\t%d\t\t\t%d\t%d\t%s\n' %
                  (r.chr, r.strand, r.pos, r.mchr, r.mstrand, r.mpos,
                   len(r.reads) - len(seqs), len(seqs), ';'.join(seqs)))
        return 1

    info('Identifying rearrangements based on clusters of discordant reads...')

    N = 0
    rearrangements = []
    # The context manager guarantees the report is flushed and closed, also
    # when parsing fails part-way through.
    with open('%s.sv' % out_prefix, 'w') as out:
        out.write(sv_file_header + '\n')

        for line in zopen('%s.sorted_pairs.tsv.gz' % out_prefix):
            al = line[:-1].split('\t')

            chrom = al[0]
            strand = al[1]
            pos = int(al[2])
            mchrom = al[3]
            mstrand = al[4]
            mpos = int(al[5])
            # Mate pairs carry '-' in the sequence column.
            seq = None if al[6] == '-' else al[6]

            # Rearrangements that are too far need not be considered in the
            # future.
            # NOTE(review): the distance test ignores the chromosome, so
            # entries from an earlier chromosome may stay in the list until
            # the end; they can never match again because of the chromosome
            # test below.
            reachable = []
            for r in rearrangements:
                if pos - r.pos > max_frag_len:
                    N += report_rearrangement(out, r)
                else:
                    reachable.append(r)
            rearrangements = reachable

            # Check if we already have a rearrangement that matches the new
            # pair. We don't check the distance for the first mate because
            # we already know from above the rearrangements near it.
            matches = [
                r for r in rearrangements
                if abs(mpos - r.mpos) <= max_frag_len and chrom == r.chr
                and mchrom == r.mchr and strand == r.strand
                and mstrand == r.mstrand
            ]

            read = (pos, mpos, seq)
            if matches:
                for match in matches:
                    match.reads.append(read)
            else:
                # No suitable rearrangements, create a new one.
                rearrangements.append(
                    Rearrangement(chrom, strand, pos, mchrom, mstrand, mpos,
                                  read))

        # Flush whatever is still pending at the end of the input.
        for r in rearrangements:
            N += report_rearrangement(out, r)

    info('Found %d rearrangements with at least 2 reads of evidence.' % N)
Esempio n. 26
0
def detect_specific(bam_path, donors_path, acceptors_path, genome_path,
                    out_prefix, all_reads):
    """Count reads that span junctions between given donor/acceptor exons.

    A junction sequence is built for every donor/acceptor combination by
    joining ``read_len - 10`` bp of donor flank with the same amount of
    acceptor flank. The junctions are indexed with bowtie-build, reads are
    aligned against the index, and every junction with at least one aligned
    read is listed in ``<out_prefix>.tsv``.

    Args:
        bam_path: BAM file providing the reads (and the read length).
        donors_path: BED file of donor exons.
        acceptors_path: BED file of acceptor exons.
        genome_path: Genome FASTA read with read_fasta().
        out_prefix: Prefix for temporary index files and the output table.
        all_reads: If true, align all reads; otherwise only unaligned reads.
    """
    read_len = sam.read_length(bam_path)
    info('Using read length %d bp...' % read_len)

    flank_len = read_len - 10
    chromosomes = read_fasta(genome_path)

    # Each region is indexed as (chromosome, strand, position, position);
    # presumably indices 2 and 3 are start and end -- TODO confirm against
    # regions_from_bed().
    donor_exons = regions_from_bed(donors_path)
    donors = []
    for ex in donor_exons:
        chrom = ex[0] if ex[0].startswith('chr') else 'chr' + ex[0]
        chr_seq = chromosomes[chrom]
        if ex[1] == '+':
            donors.append(
                (chrom, '+', ex[3], chr_seq[ex[3] - flank_len:ex[3]]))
        elif ex[1] == '-':
            donors.append(
                (chrom, '-', ex[2],
                 revcomplement(chr_seq[ex[2] - 1:ex[2] - 1 + flank_len])))

    acceptor_exons = regions_from_bed(acceptors_path)
    acceptors = []
    for ex in acceptor_exons:
        chrom = ex[0] if ex[0].startswith('chr') else 'chr' + ex[0]
        chr_seq = chromosomes[chrom]
        if ex[1] == '+':
            acceptors.append(
                (chrom, '+', ex[2],
                 chr_seq[ex[2] - 1:ex[2] - 1 + flank_len]))
        elif ex[1] == '-':
            acceptors.append(
                (chrom, '-', ex[3],
                 revcomplement(chr_seq[ex[3] - flank_len:ex[3]])))

    del chromosomes  # Release 3 GB of memory
    gc.collect()

    # Remove duplicate acceptors and donors.
    acceptors = list(set(acceptors))
    donors = list(set(donors))

    # Calculate junction sequences. The two breakpoint labels are kept
    # alongside the combined name, because chromosome names may themselves
    # contain '_' and the name therefore cannot be split back reliably.
    junctions = {}
    junction_flanks = {}
    for left in donors:
        donor_label = '%s:%s:%d' % left[:3]
        for right in acceptors:
            acceptor_label = '%s:%s:%d' % right[:3]
            name = donor_label + '_' + acceptor_label
            junctions[name] = Object(sequence=left[3] + right[3], reads=[])
            junction_flanks[name] = (donor_label, acceptor_label)
    info('Generated %d junctions.' % len(junctions))

    # Build Bowtie index
    info('Constructing junction FASTA file...')
    index_fasta_path = out_prefix + '_ref.fa'
    with open(index_fasta_path, 'w') as index:
        for name, junction in junctions.items():
            index.write('>%s\n%s\n' % (name, junction.sequence))
    info('Constructing Bowtie index...')
    shell('bowtie-build -q %s %s_index' % (index_fasta_path, out_prefix))

    # Align reads against junctions and tally junction read counts.
    if all_reads:
        info('Aligning all reads against index...')
        reads_command = 'sam reads %s' % bam_path
    else:
        info('Aligning unaligned reads against index...')
        reads_command = 'sam unaligned reads %s' % bam_path

    for line in shell_stdout('bowtie -f -v1 -B1 %s_index <(%s)' %
                             (out_prefix, reads_command)):
        cols = line.rstrip().split('\t')
        # Column 3 is the junction name, column 5 the read sequence.
        junctions[cols[2]].reads.append(cols[4])

    shell('rm %s_index.* %s_ref.fa' % (out_prefix, out_prefix))

    with open(out_prefix + '.tsv', 'w') as out_file:
        out_file.write(
            '5\' breakpoint\t3\' breakpoint\tNum reads\tSequences\n')
        for name, j in junctions.items():
            if not j.reads: continue
            donor_label, acceptor_label = junction_flanks[name]
            # The sequences column is intentionally left empty.
            out_file.write('%s\t%s\t%d\t\n' %
                           (donor_label, acceptor_label, len(j.reads)))
Esempio n. 27
0
def detect_discordant_pairs(sam_path, out_prefix, max_frag_len, min_mapq,
                            orientation):
    """Write discordant mate pairs to <out_prefix>.discordant_pairs.tsv.gz.

    The output of ``sam discordant pairs`` is sorted by read name so that
    the two mates of a pair are adjacent. Each pair is reoriented so the
    upstream mate comes first, converted to forward-forward orientation,
    and written as one tab-separated line:

        chr_1, strand_1, pos_1, chr_2, strand_2, pos_2, -

    The trailing ``-`` marks the entry as a mate pair (no read sequence).
    Positions are 1-based and represent read starts.

    Args:
        sam_path: Alignment file handed to ``sam discordant pairs``.
        out_prefix: Prefix of the output file; its directory is also used
            as the temporary directory of ``sort``.
        max_frag_len: Forwarded to ``sam discordant pairs``.
        min_mapq: Forwarded to ``sam discordant pairs`` as ``--min-mapq``.
        orientation: Library orientation: 'fr', 'rf' or 'ff'.
    """
    out = zopen(out_prefix + '.discordant_pairs.tsv.gz', 'w')
    try:
        N = 0

        sort_tmp_dir = os.path.dirname(out_prefix)
        if not sort_tmp_dir: sort_tmp_dir = './'

        # Go through all the first mates and look for discordant pairs.
        info('Searching for discordant read pairs...')
        prev = ['']
        for line in shell_stdout(
                'sam discordant pairs --min-mapq=%d %s %d | '
                'sort -k1,1 -T %s' %
                (min_mapq, sam_path, max_frag_len, sort_tmp_dir)):

            # Drop the newline so it can never be counted as part of the
            # read sequence when that is the last column.
            al = line.rstrip('\n').split('\t')
            # Column 10 (index 9, the read sequence) is used below, so at
            # least ten columns are required.
            if len(al) < 10: continue

            # Discard spliced and clipped reads.
            # FIXME: Add support for spliced RNA-seq reads.
            if 'N' in al[5] or 'S' in al[5]: continue

            if al[0].endswith('/1') or al[0].endswith('/2'):
                al[0] = al[0][:-2]  # Remove /1 or /2 suffix

            # Two consecutive alignments with the same read name are mates.
            if al[0] != prev[0]:
                prev = al
                continue

            flags = int(al[1])
            chrom = al[2]
            mchrom = prev[2]
            # 0x10: this read is on the reverse strand;
            # 0x20: its mate is on the reverse strand.
            strand = '-' if flags & 0x10 else '+'
            mstrand = '-' if flags & 0x20 else '+'
            pos = int(al[3])
            mpos = int(prev[3])
            rlen = len(al[9])
            mrlen = len(prev[9])

            if not chrom.startswith('chr'): chrom = 'chr' + chrom
            if not mchrom.startswith('chr'): mchrom = 'chr' + mchrom

            # Discard mitochondrial
            if chrom == 'chrM' or mchrom == 'chrM': continue

            if orientation in ('fr', 'rf', 'ff'):
                # Reorient pairs so that the first mate is always upstream.
                if chrom > mchrom or (chrom == mchrom and pos > mpos):
                    chrom, mchrom = mchrom, chrom
                    pos, mpos = mpos, pos
                    rlen, mrlen = mrlen, rlen
                    if orientation == 'ff':
                        # If mates are swapped, both mates must be reversed.
                        strand, mstrand = '+' if mstrand == '-' else '-', \
                            '+' if strand == '-' else '-'
                    else:
                        strand, mstrand = mstrand, strand

                # Convert to forward-forward orientation.
                if orientation == 'fr':
                    # Flip second mate.
                    mstrand = '-' if mstrand == '+' else '+'
                elif orientation == 'rf':
                    # Flip first mate.
                    strand = '-' if strand == '+' else '+'
            else:
                error('Unsupported read orientation detected.')

            # Make positions represent read starts.
            if strand == '-': pos += rlen - 1
            if mstrand == '-': mpos += mrlen - 1

            # Each discordant mate pair is represented as a 7-tuple
            # (chr_1, strand_1, pos_1, chr_2, strand_2, pos_2, None).
            # The None at the end signifies that this is a mate pair.
            # Positions are 1-based and represent read starts.
            out.write('%s\t%s\t%d\t%s\t%s\t%d\t-\n' %
                      (chrom, strand, pos, mchrom, mstrand, mpos))
            N += 1
    finally:
        # Always release the output handle, even if parsing fails.
        out.close()

    info('Found %d discordant mate pairs.' % N)