Exemple #1
0
def index_bam(bam):
    '''Makes sure a .bai index sits next to the BAM, building one if needed.

    Runs `samtools index` only when `<bam>.bai` is not an existing file, then
    exits the program if the index is still missing afterwards.
    '''
    index_path = bam + '.bai'

    # Skip samtools entirely when an index is already present
    if not isfile(index_path):
        run_command(['samtools', 'index', bam], '[extract] indexing bam: ')

    # Re-check so a failed indexing attempt is caught immediately
    if isfile(index_path):
        return

    sys.exit('[extract] Error: unable to index bam file.')
Exemple #2
0
def checkout_version(commithash):
    '''Switches the local IMGTHLA git clone to the given commithash.

    When the path held in hla_dat is not an existing file, the repository is
    cloned first via fetch_hla_dat().
    '''
    # Nothing on disk yet: clone before attempting the checkout
    database_missing = not isfile(hla_dat)
    if database_missing:
        fetch_hla_dat()

    run_command(['git', '-C', IMGTHLA, 'checkout', commithash],
                '[reference] checking out IMGT/HLA:')
Exemple #3
0
def fetch_hla_dat():
    '''Replaces any existing IMGTHLA directory with a fresh git clone.'''
    # Remove a previous copy first so the clone target does not already exist
    stale_copy = isdir(IMGTHLA)
    if stale_copy:
        run_command(['rm', '-rf', IMGTHLA])

    run_command(['git', 'clone', IMGTHLA_git, IMGTHLA],
                '[reference] cloning IMGT/HLA database:')
Exemple #4
0
def write_reference(sequences, info, fasta, idx, database, type):
    '''Writes the reference FASTA, pickles its metadata and builds the index.

    Three outputs are produced, in this order:
      - fasta:    sequences written in FASTA format through SeqIO
      - database: pickle of [commithash, info], where commithash is whatever
                  hla_dat_version() returns
      - idx:      Kallisto index built from the FASTA

    The type argument only labels the log message of the indexing step.
    '''
    with open(fasta, 'w') as fasta_handle:
        SeqIO.write(sequences, fasta_handle, 'fasta')

    # Store the database version alongside the reference information
    payload = [hla_dat_version(), info]
    with open(database, 'wb') as database_handle:
        pickle.dump(payload, database_handle)

    message = ''.join(['[reference] indexing ', type,
                       ' reference with Kallisto:'])
    run_command(['kallisto', 'index', '-i', idx, fasta], message)
Exemple #5
0
def hla_dat_version(print_version=False):
    '''Returns the decoded output of `git rev-parse HEAD` for the IMGTHLA clone.

    The text is returned exactly as git printed it (no stripping). When
    print_version is true it is also written to the log.
    '''
    rev_parse = ['git', '-C', IMGTHLA, 'rev-parse HEAD']
    commit = run_command(rev_parse).stdout.decode()

    if print_version:
        log.info(commit)

    return commit
Exemple #6
0
def analyze_reads(fqs, paired, reads_file, keep_files):
    '''Analyzes read length for single-end sampled, required by Kallisto.

    The length of every read in the gzipped FASTQ input is written to
    reads_file (one value per line), loaded back with numpy and summarised.

    Arguments:
        fqs        -- FASTQ paths; exactly two when paired, otherwise only
                      the first entry is used
        paired     -- whether fqs holds a pair of mate files
        reads_file -- scratch file receiving the read lengths
        keep_files -- forwarded to remove_files() together with reads_file

    Returns:
        (num, avg, std): number of reads, and mean / standard deviation of
        read length rounded to 4 decimals.

    NOTE(review): the command pieces contain a pipe and shell redirections,
    so run_command presumably joins them and runs them through a shell --
    confirm against its definition.
    '''
    awk = "| awk '{if(NR%4==2) print length($1)}'"

    log.info('[alignment] Analyzing read length')
    if paired:
        fq1, fq2 = fqs
        sources = [fq1, fq2]
    else:
        sources = [fqs[0]]

    # The first file truncates reads_file ('>'), the mate file appends ('>>')
    for redirect, fq in zip(['>', '>>'], sources):
        run_command(['zcat <', fq, awk, redirect, reads_file])

    # genfromtxt collapses a one-line file into a 0-d array, on which len()
    # raises TypeError; force a 1-d array so a single read is handled too
    read_lengths = np.atleast_1d(np.genfromtxt(reads_file))

    num = len(read_lengths)
    avg = round(np.mean(read_lengths), 4)
    std = round(np.std(read_lengths), 4)

    remove_files([reads_file], keep_files)

    return num, avg, std
Exemple #7
0
def pseudoalign(fqs,
                sample,
                paired,
                reference,
                outdir,
                temp,
                threads,
                keep_files,
                partial=False):
    '''Runs `kallisto pseudo` on the reads and collects its output files.

    Read length statistics come from analyze_reads(). Kallisto writes into a
    per-sample directory under temp; its two pseudoalignment files are then
    moved to `<temp><sample>.counts.tsv` and `<temp><sample>.eq.tsv` and the
    directory is deleted.

    Returns:
        (count_file, eq_file, num, avg, std)
    '''
    # Read length statistics
    reads_file = temp + sample + '.reads.txt'
    num, avg, std = analyze_reads(fqs, paired, reads_file, keep_files)

    # Kallisto fails if std used for single-end is 0
    if std == 0:
        std = .00001

    temp2 = check_path(temp + sample)

    if paired:
        read_args = [fqs[0], fqs[1]]
    else:
        read_args = ['--single -l', str(avg), '-s', str(std), fqs[0]]

    command = ['kallisto pseudo -i', reference, '-t', threads, '-o', temp2]
    command = command + read_args
    run_command(command, '[alignment] Pseudoaligning with Kallisto: ')

    # Move and rename Kallisto output
    renames = (('pseudoalignments.tsv', '.counts.tsv'),
               ('pseudoalignments.ec', '.eq.tsv'))
    file_list = []
    moved = []
    for kallisto_name, suffix in renames:
        source = temp2 + kallisto_name
        target = temp + sample + suffix
        file_list.append(source)
        run_command(['mv', source, target])
        moved.append(target)
    count_file, eq_file = moved

    run_command(['rm -rf', temp2])
    remove_files(file_list, keep_files)

    return count_file, eq_file, num, avg, std
Exemple #8
0
def extract_reads(bam, outdir, paired, unmapped, alts, 
                    temp, threads, keep_files):
    '''Extracts reads from chromosome 6 and alts/decoys if applicable.

    Reads are collected into an intermediate SAM file, converted to BAM,
    sorted by read name, converted to FASTQ and compressed with pigz.

    Arguments:
        bam        -- input BAM; it is indexed first if no .bai is found
        outdir     -- prefix for the FASTQ outputs (joined to the sample
                      name without a separator)
        paired     -- selects the paired-end filter flags and two FASTQ files
        unmapped   -- also extract unmapped reads
        alts       -- contig names extracted when they occur in the header
        temp       -- prefix for intermediate files (joined as-is)
        threads    -- thread count as a string; it is concatenated to '-@'
        keep_files -- forwarded to remove_files() with the intermediates

    Returns nothing; the outputs are `<outdir><sample>.extracted[.1|.2].fq`,
    which pigz is asked to compress with the `.gz` suffix.
    '''
    log.info(f'[extract] Extracting reads from {bam}')

    sample = os.path.splitext(os.path.basename(bam))[0]

    # Index bam
    index_bam(bam)

    hla_filtered = ''.join([temp, sample, '.hla.sam'])
    hla_filtered_bam = ''.join([temp, sample, '.hla.bam'])
    hla_sorted = ''.join([temp, sample, '.hla.sorted.bam'])
    file_list = [hla_filtered, hla_filtered_bam, hla_sorted]

    # Paired: properly paired reads (-f 2); single: any mapped read (-F 4)
    mapped_flag = '-f 2' if paired else '-F 4'
    # Paired: read and mate both unmapped (-f 12); single: read unmapped (-f 4)
    unmapped_flag = '-f 12' if paired else '-f 4'

    # Get bam header to check for chromosome nomenclature
    output = run_command(['samtools', 'view', '-@'+threads, '-H', bam])
    header = output.stdout.decode('utf-8')
    chrom = 'chr6' if 'SN:chr' in header else '6'

    # Start the SAM file with the BAM header; later steps append to it
    message = '[extract] Extracting BAM header: '
    command = ['samtools', 'view', '-H', '-@'+threads,
               bam, '-o', hla_filtered]
    run_command(command, message)

    # Extract reads mapped to chromosome 6
    message = '[extract] Extracting chromosome 6: '
    command = ['samtools', 'view', '-@'+threads, mapped_flag,
               bam, chrom, '>>', hla_filtered]
    run_command(command, message)

    # Extract unmapped reads. No region is given: unplaced unmapped reads
    # carry no reference name, so limiting the query to chromosome 6 would
    # silently drop them.
    if unmapped:
        message = '[extract] Extracting unmapped reads: '
        command = ['samtools', 'view', '-@'+threads, unmapped_flag,
                   bam, '>>', hla_filtered]
        run_command(command, message)

    # Check for alts in header and extract reads if present
    for alt in alts:
        if alt in header:
            command = ['samtools', 'view', '-@'+threads, mapped_flag,
                       bam, alt+':', '>>', hla_filtered]
            run_command(command)

    # Convert SAM to BAM
    message = '[extract] Converting SAM to BAM: '
    command = ['samtools', 'view', '-Sb', '-@'+threads,
               hla_filtered, '>', hla_filtered_bam]
    run_command(command, message)

    # Sort BAM by read name (-n)
    message = '[extract] Sorting bam: '
    command = ['samtools', 'sort', '-n', '-@'+threads,
               hla_filtered_bam, '-o', hla_sorted]
    run_command(command, message)

    # Convert BAM to FASTQ and compress
    message = '[extract] Converting bam to fastq: '
    command = ['bedtools', 'bamtofastq', '-i', hla_sorted]
    if paired:
        fq1 = ''.join([outdir, sample, '.extracted.1.fq'])
        fq2 = ''.join([outdir, sample, '.extracted.2.fq'])
        command.extend(['-fq', fq1, '-fq2', fq2])
        fastqs = [fq1, fq2]
    else:
        fq = ''.join([outdir, sample, '.extracted.fq'])
        command.extend(['-fq', fq])
        fastqs = [fq]
    run_command(command, message)

    for fastq in fastqs:
        run_command(['pigz', '-f', '-p', threads, '-S', '.gz', fastq])

    remove_files(file_list, keep_files)