def analyze_reads(fqs, paired, reads_file, keep_files):
    '''Analyzes read length of the sample; required by Kallisto for
    single-end samples.

    The length of every sequence line (line 2 of each 4-line FASTQ record)
    is written to reads_file through a zcat/awk pipeline, then summarised.

    Args:
        fqs: FASTQ path(s); exactly two entries when paired is true,
            otherwise only the first entry is used.
        paired: whether the sample is paired-end.
        reads_file: scratch file that receives one read length per line.
        keep_files: forwarded to remove_files together with reads_file.

    Returns:
        Tuple (num, avg, std): number of reads measured (both mates are
        counted for paired samples), mean read length and standard
        deviation, the latter two rounded to 4 decimals.
    '''
    awk = "| awk '{if(NR%4==2) print length($1)}'"

    log.info('[alignment] Analyzing read length')

    if paired:
        fq1, fq2 = fqs
        inputs = [fq1, fq2]
    else:
        inputs = [fqs[0]]

    # The first file truncates reads_file ('>'), the mate file is appended
    # ('>>').
    # NOTE(review): the redirection tokens imply run_command executes the
    # joined command through a shell -- confirm against run_command.
    for fq, redirect in zip(inputs, ('>', '>>')):
        run_command(['zcat <', fq, awk, redirect, reads_file])

    # genfromtxt returns a 0-d array when the file holds a single value,
    # and len() of a 0-d array raises TypeError; force at least 1-d.
    read_lengths = np.atleast_1d(np.genfromtxt(reads_file))

    num = len(read_lengths)
    avg = round(np.mean(read_lengths), 4)
    std = round(np.std(read_lengths), 4)

    remove_files([reads_file], keep_files)

    return num, avg, std
def process_counts(count_file, eq_file, gene_list, allele_idx, allele_lengths,
                   keep_files):
    '''Processes pseudoalignment output, returning compatibility classes.

    Args:
        count_file: tab-separated file of "<class>\\t<count>" lines.
        eq_file: tab-separated file of "<class>\\t<idx,idx,...>" lines.
        gene_list: not used by this function.
        allele_idx: mapping from a reference index (string, as read from
            eq_file) to the alleles at that index.
        allele_lengths: not used by this function.
        keep_files: forwarded to remove_files with both input files.

    Returns:
        Tuple (eq_idx, allele_eq, align_stats) where
        eq_idx maps gene -> list of (indices, count) for classes whose
        alleles all belong to that single gene and whose count is positive;
        allele_eq maps index -> set of positions of its classes within the
        per-gene lists of eq_idx;
        align_stats is [count_unique, count_multi, class_unique,
        class_multi].
    '''
    log.info('[alignment] processing pseudoalignment')

    # Read the count of every compatibility class
    with open(count_file, 'r') as handle:
        count_rows = handle.read().splitlines()
    counts = {}
    for row in count_rows:
        label, value = row.split('\t')
        counts[label] = float(value)

    # Read the reference indices that make up every compatibility class
    with open(eq_file, 'r') as handle:
        class_rows = handle.read().splitlines()
    classes = {}
    for row in class_rows:
        label, members = row.split('\t')
        classes[label] = members.split(',')

    # Group classes by gene and tally single-gene vs. other classes
    eq_idx = defaultdict(list)
    count_unique = count_multi = 0
    class_unique = class_multi = 0

    for label, members in classes.items():
        # Skip classes that contain an index without any alleles. The list
        # is built in full so that every index is looked up in allele_idx.
        without_alleles = [idx for idx in members if not allele_idx[idx]]
        if without_alleles:
            continue

        genes = set()
        for idx in members:
            for allele in allele_idx[idx]:
                genes.add(get_gene(allele))

        count = counts[label]

        # Anything that is not a positive-count, single-gene class is
        # tallied in the "multi" statistics.
        if len(genes) != 1 or not count > 0:
            count_multi += count
            class_multi += 1
            continue

        (gene,) = genes
        eq_idx[gene].append((members, count))
        count_unique += count
        class_unique += 1

    # Alleles mapping to their respective compatibility classes; the class
    # id is its position within the owning gene's list.
    allele_eq = defaultdict(set)
    for gene_classes in eq_idx.values():
        for position, (members, _) in enumerate(gene_classes):
            for idx in members:
                allele_eq[idx].add(position)

    remove_files([count_file, eq_file], keep_files)

    align_stats = [count_unique, count_multi, class_unique, class_multi]

    return eq_idx, allele_eq, align_stats
def pseudoalign(fqs, sample, paired, reference, outdir, temp, threads,
                keep_files, partial=False):
    '''Calls Kallisto to pseudoalign reads.

    Args:
        fqs: FASTQ path(s); the first two are used when paired is true,
            otherwise only the first.
        sample: sample name, used as a prefix for files created under temp.
        paired: whether the sample is paired-end.
        reference: Kallisto index passed to "-i".
        outdir: not used by this function.
        temp: temporary directory prefix (concatenated directly with
            sample, so it is expected to end with a path separator).
        threads: thread count passed to "-t".
        keep_files: forwarded to analyze_reads and remove_files.
        partial: not used by this function.

    Returns:
        Tuple (count_file, eq_file, num, avg, std): paths of the renamed
        Kallisto outputs followed by the read-length statistics from
        analyze_reads (std replaced by .00001 when it was 0).
    '''
    prefix = temp + sample

    # Get read length stats
    num, avg, std = analyze_reads(fqs, paired, prefix + '.reads.txt',
                                  keep_files)

    # Kallisto fails if std used for single-end is 0
    if std == 0:
        std = .00001

    temp2 = check_path(prefix)

    command = ['kallisto pseudo -i', reference, '-t', threads, '-o', temp2]
    if paired:
        command += [fqs[0], fqs[1]]
    else:
        command += ['--single -l', str(avg), '-s', str(std), fqs[0]]

    run_command(command, '[alignment] Pseudoaligning with Kallisto: ')

    # Move and rename Kallisto output
    count_file = prefix + '.counts.tsv'
    eq_file = prefix + '.eq.tsv'
    moved = []
    for produced, target in (('pseudoalignments.tsv', count_file),
                             ('pseudoalignments.ec', eq_file)):
        source = temp2 + produced
        moved.append(source)
        run_command(['mv', source, target])

    run_command(['rm -rf', temp2])
    remove_files(moved, keep_files)

    return count_file, eq_file, num, avg, std
def extract_reads(bam, outdir, paired, unmapped, alts, temp, threads,
                  keep_files):
    '''Extracts reads from chromosome 6 and alts/decoys if applicable.

    Reads are collected into a SAM file under temp, converted to BAM,
    name-sorted, converted to FASTQ in outdir and compressed with pigz.

    Args:
        bam: path of the input BAM file; its basename without extension is
            used as the sample name.
        outdir: output directory prefix for the extracted FASTQ file(s)
            (concatenated directly with the sample name).
        paired: whether the sample is paired-end.
        unmapped: whether unmapped reads are extracted as well.
        alts: names of alt/decoy contigs; each one found in the BAM header
            is extracted in addition to chromosome 6.
        temp: temporary directory prefix for intermediate files.
        threads: thread count as a string (it is concatenated to "-@").
        keep_files: forwarded to remove_files with the intermediate files.
    '''
    log.info(f'[extract] Extracting reads from {bam}')

    file_list = []
    sample = os.path.splitext(os.path.basename(bam))[0]

    # Index bam
    index_bam(bam)

    hla_filtered = ''.join([temp, sample, '.hla.sam'])
    file_list.append(hla_filtered)
    hla_filtered_bam = ''.join([temp, sample, '.hla.bam'])
    file_list.append(hla_filtered_bam)

    # SAM flag filters: mapped reads are proper pairs (-f 2) or simply not
    # unmapped (-F 4); unmapped reads have both mates unmapped (-f 12) or
    # the read itself unmapped (-f 4).
    mapped_flag = '-f 2' if paired else '-F 4'
    unmapped_flag = '-f 12' if paired else '-f 4'

    # Get bam header to check for chromosome nomenclature
    output = run_command(['samtools', 'view', '-@'+threads, '-H', bam])
    header = output.stdout.decode('utf-8')
    chrom = 'chr6' if 'SN:chr' in header else '6'

    # Extract BAM header (log label kept identical to the next step)
    message = '[extract] Extracting chromosome 6: '
    command = ['samtools', 'view', '-H', '-@'+threads,
               bam, '-o', hla_filtered]
    run_command(command, message)

    # Extract reads mapped to chromosome 6
    message = '[extract] Extracting chromosome 6: '
    command = ['samtools', 'view', '-@'+threads, mapped_flag,
               bam, chrom, '>>', hla_filtered]
    run_command(command, message)

    # Extract unmapped reads. No region is given: unmapped reads normally
    # carry no coordinates, so restricting the query to chromosome 6 would
    # exclude them.
    if unmapped:
        message = '[extract] Extracting unmapped reads: '
        command = ['samtools', 'view', '-@'+threads, unmapped_flag,
                   bam, '>>', hla_filtered]
        run_command(command, message)

    # Check for alts in header and extract reads if present
    for alt in alts:
        if alt in header:
            command = ['samtools', 'view', '-@'+threads, mapped_flag,
                       bam, alt+':', '>>', hla_filtered]
            run_command(command)

    # Convert SAM to BAM
    message = '[extract] Converting SAM to BAM: '
    command = ['samtools', 'view', '-Sb', '-@'+threads,
               hla_filtered, '>', hla_filtered_bam]
    run_command(command, message)

    # Sort BAM by read name
    hla_sorted = ''.join([temp, sample, '.hla.sorted.bam'])
    file_list.append(hla_sorted)
    message = '[extract] Sorting bam: '
    command = ['samtools', 'sort', '-n', '-@'+threads,
               hla_filtered_bam, '-o', hla_sorted]
    run_command(command, message)

    # Convert BAM to FASTQ and compress
    message = '[extract] Converting bam to fastq: '
    command = ['bedtools', 'bamtofastq', '-i', hla_sorted]
    if paired:
        fq1 = ''.join([outdir, sample, '.extracted.1.fq'])
        fq2 = ''.join([outdir, sample, '.extracted.2.fq'])
        command.extend(['-fq', fq1, '-fq2', fq2])
        run_command(command, message)

        run_command(['pigz', '-f', '-p', threads, '-S', '.gz', fq1])
        run_command(['pigz', '-f', '-p', threads, '-S', '.gz', fq2])
    else:
        fq = ''.join([outdir, sample, '.extracted.fq'])
        command.extend(['-fq', fq])
        run_command(command, message)
        run_command(['pigz', '-f', '-p', threads, '-S', '.gz', fq])

    remove_files(file_list, keep_files)