Example #1
0
def analyze_reads(fqs, paired, reads_file, keep_files):
    '''Analyzes read lengths of the input FASTQ file(s), required by Kallisto.

    The length of every sequence line (second line of each four-line FASTQ
    record) is written to ``reads_file`` via zcat and awk, then loaded to
    compute summary statistics.

    Args:
        fqs: FASTQ paths; exactly two when ``paired`` is true, otherwise
            only the first entry is used.
        paired: whether the input is paired-end. When true, the lengths of
            both mate files are pooled into the same statistics.
        reads_file: path of the intermediate file receiving one read length
            per line.
        keep_files: forwarded unchanged to ``remove_files`` together with
            ``reads_file``.

    Returns:
        Tuple ``(num, avg, std)``: number of read lengths collected, and
        their mean and standard deviation, each rounded to 4 decimals.
    '''
    awk = "| awk '{if(NR%4==2) print length($1)}'"

    log.info('[alignment] Analyzing read length')

    # The first file truncates reads_file ('>'); the mate file, if any,
    # is appended ('>>') so both contribute to the statistics.
    if paired:
        fq1, fq2 = fqs
        targets = [(fq1, '>'), (fq2, '>>')]
    else:
        targets = [(fqs[0], '>')]

    for fq, redirect in targets:
        run_command(['zcat <', fq, awk, redirect, reads_file])

    # genfromtxt yields a 0-d array when the file holds a single value,
    # which has no len(); force at least one dimension.
    read_lengths = np.atleast_1d(np.genfromtxt(reads_file))

    num = len(read_lengths)
    avg = round(np.mean(read_lengths), 4)
    std = round(np.std(read_lengths), 4)

    remove_files([reads_file], keep_files)

    return num, avg, std
Example #2
0
def process_counts(count_file, eq_file, gene_list, allele_idx, allele_lengths,
                   keep_files):
    '''Processes pseudoalignment output, returning compatibility classes.

    Args:
        count_file: tab-separated file of ``class id<TAB>count`` lines.
        eq_file: tab-separated file of ``class id<TAB>idx,idx,...`` lines.
        gene_list: not used by this function.
        allele_idx: mapping from a transcript index (string, as read from
            ``eq_file``) to the alleles it represents.
        allele_lengths: not used by this function.
        keep_files: forwarded to ``remove_files`` with both input files.

    Returns:
        Tuple ``(eq_idx, allele_eq, align_stats)`` where ``eq_idx`` maps a
        gene to a list of ``(indices, count)`` for classes hitting only that
        gene, ``allele_eq`` maps each index to the positions (within its
        gene's list) of the classes containing it, and ``align_stats`` is
        ``[count_unique, count_multi, class_unique, class_multi]``.
    '''
    log.info('[alignment] processing pseudoalignment')

    # Read count observed for each compatibility class
    with open(count_file, 'r') as handle:
        counts = {
            eq_id: float(n_reads)
            for eq_id, n_reads in (
                row.split('\t') for row in handle.read().splitlines())
        }

    # Transcript indices belonging to each compatibility class
    with open(eq_file, 'r') as handle:
        members_by_eq = {
            eq_id: joined.split(',')
            for eq_id, joined in (
                row.split('\t') for row in handle.read().splitlines())
        }

    eq_idx = defaultdict(list)
    unique_reads = 0
    multi_reads = 0
    unique_classes = 0
    multi_classes = 0

    for eq_id, members in members_by_eq.items():
        # Skip classes containing any index without associated alleles
        without_alleles = [idx for idx in members if not allele_idx[idx]]
        if without_alleles:
            continue

        gene_names = {
            get_gene(allele)
            for idx in members
            for allele in allele_idx[idx]
        }
        n_reads = counts[eq_id]

        # Anything spanning several genes, or without positive support,
        # is tallied as multi-mapping and not kept
        if len(gene_names) != 1 or not n_reads > 0:
            multi_reads += n_reads
            multi_classes += 1
            continue

        (gene,) = gene_names
        eq_idx[gene].append((members, n_reads))
        unique_reads += n_reads
        unique_classes += 1

    # Alleles mapping to their respective compatibility classes; the class
    # identifier here is its position within the gene's list
    allele_eq = defaultdict(set)
    for gene_classes in eq_idx.values():
        for position, (members, _) in enumerate(gene_classes):
            for idx in members:
                allele_eq[idx].add(position)

    remove_files([count_file, eq_file], keep_files)

    return eq_idx, allele_eq, [
        unique_reads, multi_reads, unique_classes, multi_classes,
    ]
Example #3
0
def pseudoalign(fqs,
                sample,
                paired,
                reference,
                outdir,
                temp,
                threads,
                keep_files,
                partial=False):
    '''Calls Kallisto to pseudoalign reads.

    Args:
        fqs: FASTQ paths; two are used when ``paired``, otherwise the first.
        sample: sample name used as the prefix of every generated file.
        paired: whether the reads are paired-end.
        reference: Kallisto index passed to ``-i``.
        outdir: not used by this function.
        temp: directory prefix (concatenated directly with ``sample``).
        threads: value passed to Kallisto's ``-t``.
        keep_files: forwarded to ``analyze_reads`` and ``remove_files``.
        partial: not used by this function.

    Returns:
        Tuple ``(count_file, eq_file, num, avg, std)``: the renamed Kallisto
        outputs followed by the read statistics from ``analyze_reads``
        (``std`` replaced by 0.00001 when it was 0).
    '''
    prefix = temp + sample

    # Get read length stats
    num, avg, std = analyze_reads(fqs, paired, prefix + '.reads.txt',
                                  keep_files)

    # Kallisto fails if std used for single-end is 0
    if std == 0:
        std = .00001

    temp2 = check_path(prefix)
    command = ['kallisto pseudo -i', reference, '-t', threads, '-o', temp2]

    if not paired:
        command.extend(['--single -l', str(avg), '-s', str(std), fqs[0]])
    else:
        command.extend([fqs[0], fqs[1]])

    run_command(command, '[alignment] Pseudoaligning with Kallisto: ')

    # Move and rename Kallisto output
    originals = []
    renamed = []
    for kallisto_name, suffix in (('pseudoalignments.tsv', '.counts.tsv'),
                                  ('pseudoalignments.ec', '.eq.tsv')):
        source = temp2 + kallisto_name
        target = prefix + suffix
        originals.append(source)
        run_command(['mv', source, target])
        renamed.append(target)

    run_command(['rm -rf', temp2])
    remove_files(originals, keep_files)

    count_file, eq_file = renamed
    return count_file, eq_file, num, avg, std
Example #4
0
def extract_reads(bam, outdir, paired, unmapped, alts,
                  temp, threads, keep_files):
    '''Extracts reads from chromosome 6 and alts/decoys if applicable.

    Reads are collected into an intermediate SAM file, converted to BAM,
    name-sorted, converted to FASTQ with bedtools and compressed with pigz.

    Args:
        bam: path to the input BAM; its basename without extension is used
            as the sample name.
        outdir: directory prefix for the resulting FASTQ file(s).
        paired: whether reads are paired-end. Selects properly paired reads
            (``-f 2``) instead of mapped reads (``-F 4``) and produces two
            FASTQ files instead of one.
        unmapped: when true, unmapped reads are also extracted.
        alts: contig names; each one found in the BAM header is extracted
            as an additional region.
        temp: directory prefix for intermediate files.
        threads: thread count as a string (it is concatenated to ``-@``).
        keep_files: forwarded to ``remove_files`` with the intermediates.

    Returns:
        None. Output is ``<outdir><sample>.extracted[.1|.2].fq.gz``.
    '''
    log.info(f'[extract] Extracting reads from {bam}')

    file_list = []
    sample = os.path.splitext(os.path.basename(bam))[0]

    # Index bam
    index_bam(bam)

    hla_filtered = ''.join([temp, sample, '.hla.sam'])
    file_list.append(hla_filtered)
    hla_filtered_bam = ''.join([temp, sample, '.hla.bam'])
    file_list.append(hla_filtered_bam)

    # Get bam header to check for chromosome nomenclature
    output = run_command(['samtools', 'view', '-@'+threads, '-H', bam])
    header = output.stdout.decode('utf-8')
    chrom = 'chr6' if 'SN:chr' in header else '6'

    # Flag filter selecting aligned reads, shared by every region query
    mapped_filter = '-f 2' if paired else '-F 4'

    # Extract BAM header
    message = '[extract] Extracting BAM header: '
    command = ['samtools', 'view', '-H', '-@'+threads]
    command.extend([bam, '-o', hla_filtered])
    run_command(command, message)

    # Extract reads mapped to chromosome 6
    message = '[extract] Extracting chromosome 6: '
    command = ['samtools', 'view', '-@'+threads, mapped_filter]
    command.extend([bam, chrom, '>>', hla_filtered])
    run_command(command, message)

    # Extract unmapped reads. No region is given: reads selected by these
    # flags have no reference position, so a chromosome 6 region query
    # would not return them.
    if unmapped:
        message = '[extract] Extracting unmapped reads: '
        command = ['samtools', 'view', '-@'+threads]
        command.append('-f 12' if paired else '-f 4')
        command.extend([bam, '>>', hla_filtered])
        run_command(command, message)

    # Check for alts in header and extract reads if present
    for alt in alts:
        if alt in header:
            command = ['samtools', 'view', '-@'+threads, mapped_filter]
            command.extend([bam, alt+':', '>>', hla_filtered])
            run_command(command)

    # Convert SAM to BAM
    message = '[extract] Converting SAM to BAM: '
    command = ['samtools', 'view', '-Sb', '-@'+threads,
               hla_filtered, '>', hla_filtered_bam]
    run_command(command, message)

    # Sort BAM by read name (-n)
    hla_sorted = ''.join([temp, sample, '.hla.sorted.bam'])
    file_list.append(hla_sorted)
    message = '[extract] Sorting bam: '
    command = ['samtools', 'sort', '-n', '-@'+threads,
               hla_filtered_bam, '-o', hla_sorted]
    run_command(command, message)

    # Convert BAM to FASTQ and compress
    message = '[extract] Converting bam to fastq: '
    command = ['bedtools', 'bamtofastq', '-i', hla_sorted]
    if paired:
        fq1 = ''.join([outdir, sample, '.extracted.1.fq'])
        fq2 = ''.join([outdir, sample, '.extracted.2.fq'])
        command.extend(['-fq', fq1, '-fq2', fq2])
        fastqs = [fq1, fq2]
    else:
        fq = ''.join([outdir, sample, '.extracted.fq'])
        command.extend(['-fq', fq])
        fastqs = [fq]

    run_command(command, message)
    for fastq in fastqs:
        run_command(['pigz', '-f', '-p', threads, '-S', '.gz', fastq])

    remove_files(file_list, keep_files)