def initial_abundances(eqs, lengths, population):
    '''Divides raw read counts between alleles for the first EM iteration.

    eqs is an iterable of (alleles, count) compatibility classes, where
    alleles holds allele indices. lengths is accepted for signature
    compatibility but is not read here. When population is set and at least
    one allele of a class has a prior, the class count is split in
    proportion to those priors; otherwise it is split equally.

    Reads allele_idx, prior, process_allele and counts_to_abundances from
    the surrounding scope.

    Returns (abundances, undivided_counts): the result of
    counts_to_abundances on the divided counts, and for each index the sum
    of the counts of every class it appears in.
    '''
    counts = defaultdict(float)
    undivided_counts = defaultdict(float)

    # Divides counts equally between alleles in the same compatibility class
    def divide_equally(alleles, count):
        n_alleles = len(alleles)
        for allele in alleles:
            counts[allele] += count / n_alleles

    # Divides counts proportionally to allele frequency
    def divide_prior(alleles, count, allele_prob):
        total_prob = sum(allele_prob.values())

        # Every recorded prior is zero: a proportional split is undefined
        # (division by zero), so treat the alleles as equally likely
        if total_prob == 0:
            divide_equally(alleles, count)
            return

        for allele in alleles:
            counts[allele] += count * (allele_prob[allele] / total_prob)

    for alleles, count in eqs:
        allele_prior = defaultdict(float)
        for idx in alleles:
            undivided_counts[idx] += count

            # Priors are looked up by the 2-field name of the first allele
            allele = process_allele(allele_idx[idx][0], 2)
            if population and allele in prior:
                allele_prior[idx] = prior[allele][population]

        if population and allele_prior:
            divide_prior(alleles, count, allele_prior)
        else:
            divide_equally(alleles, count)

    return counts_to_abundances(counts), undivided_counts
def convert_allele(allele, resolution):
    '''Checks nomenclature of input allele and returns converted allele.

    allele is an allele name; a trailing 'P' or 'G' marks a P-group or
    G-group allele. resolution is either 'g-group', 'p-group' or an integer
    number of fields.

    Reads the module-level args.force flag and the p_group / g_group
    lookups, which are indexed by the number of colon-separated fields in
    the input allele.

    Returns the converted allele, or the input unchanged when no conversion
    applies. Exits via sys.exit when a grouped allele is converted to
    g-group (P-group input) or to more than one field without args.force.
    '''
    n_fields = len(allele.split(':'))
    suffix = allele[-1]
    to_fields = isinstance(resolution, int)

    # Input: P-group allele
    if suffix == 'P':
        if resolution == 'g-group':
            sys.exit('[convert] Error: p-group cannot be converted '
                     'to g-group.')

        # Output: 1-field allele unless forced
        elif to_fields:
            if resolution > 1 and not args.force:
                sys.exit('[convert] Error: p-group cannot be '
                         'converted to %.0f fields.' % resolution)
            allele = process_allele(allele[:-1], resolution)

    # Input: G-group allele
    elif suffix == 'G':

        # Output: 1-field allele unless forced
        if to_fields:
            if resolution > 1 and not args.force:
                # The original message was missing the space before "to"
                sys.exit('[convert] Error: g-group cannot be converted '
                         'to %.0f fields.' % resolution)
            allele = process_allele(allele[:-1], resolution)

        # Output: P-group allele
        elif resolution == 'p-group':
            stem = allele[:-1]
            if stem in p_group[n_fields]:
                allele = p_group[n_fields][stem]
            else:
                # Retry the lookup with one field fewer
                reduced = process_allele(stem, n_fields - 1)
                if reduced in p_group[n_fields]:
                    allele = p_group[n_fields][reduced]

    # Input: ungrouped allele
    # Output: G-group allele
    elif resolution == 'g-group':
        if allele in g_group[n_fields]:
            allele = g_group[n_fields][allele]
        elif suffix != 'N':
            allele = process_allele(allele, 3)

    # Input: ungrouped allele
    # Output: P-group allele
    elif resolution == 'p-group':
        if allele in p_group[n_fields]:
            allele = p_group[n_fields][allele]

    # Input: ungrouped allele
    # Output: reduced resolution, ungrouped allele
    elif to_fields:
        allele = process_allele(allele, resolution)

    return allele
def filter_eqs(complete_genotypes, allele_idx, eq_idx, partial_alleles):
    '''Filters compatibility classes if they contain partial alleles
    or at least one predicted complete allele.

    complete_genotypes maps gene -> predicted alleles, allele_idx maps
    index -> allele names, eq_idx maps exon group -> gene -> list of
    (indices, count) classes, and partial_alleles is a collection of
    partial allele names.

    Returns (filtered_eqs, allele_eq). filtered_eqs has the same nesting as
    eq_idx, restricted to the genes in complete_genotypes, with each class
    reduced to its wanted indices (classes left empty are dropped).
    allele_eq maps exon group -> 3-field allele name -> set of positions of
    the filtered classes that contain it.
    '''
    # Built once: every partial allele plus every predicted allele
    wanted_alleles = set(partial_alleles)
    for predicted in complete_genotypes.values():
        wanted_alleles.update(predicted)

    # Indices whose allele names include at least one wanted allele
    wanted_indices = {
        index
        for index, alleles in allele_idx.items()
        if alleles and not wanted_alleles.isdisjoint(alleles)
    }

    # For each compatibility class, only keep wanted indices
    filtered_eqs = dict()
    for group, eq_list in eq_idx.items():
        filtered_eqs[group] = dict()
        for gene in complete_genotypes:
            if gene not in eq_list:
                continue

            filtered = []
            for indices, count in eq_list[gene]:
                kept = set(indices) & wanted_indices
                if kept:
                    filtered.append([kept, count])
            filtered_eqs[group][gene] = filtered

    # Find which filtered classes each allele belongs to; positions are
    # relative to the gene's own filtered list
    allele_eq = {group: defaultdict(set) for group in filtered_eqs}
    for group, eq_list in filtered_eqs.items():
        for gene_classes in eq_list.values():
            for position, (indices, _) in enumerate(gene_classes):
                for idx in indices:
                    for allele in allele_idx[idx]:
                        name = process_allele(allele, 3)
                        allele_eq[group][name].add(position)

    return filtered_eqs, allele_eq
def genotype_gene(gene, gene_count, eqs, lengths, allele_idx, population,
                  prior, tolerance, max_iterations, drop_iterations,
                  drop_threshold, zygosity_threshold):
    '''Runs transcript quantification, then genotype prediction, for a gene.

    The population is cleared for any gene outside A, B, C, DRB1, DQB1 and
    DQA1 before it is handed on. The abundance table and the predicted
    genotype are written to the log. allele_eq is read from module scope
    and passed to predict_genotype.

    Returns (em_results, genotype), where em_results is a list of
    [index, alleles, abundance] rows.
    '''
    if gene not in {'A', 'B', 'C', 'DRB1', 'DQB1', 'DQA1'}:
        population = None

    abundances = expectation_maximization(eqs, lengths, allele_idx,
                                          population, prior, tolerance,
                                          max_iterations, drop_iterations,
                                          drop_threshold)

    em_results = []
    for idx, abundance in abundances.items():
        em_results.append([idx, allele_idx[idx], abundance])

    log.info('\n[genotype] Top alleles by abundance:')
    log.info('\t\t{: <20} {: >9}'.format('allele', 'abundance'))

    # Most abundant first
    ranked = sorted(em_results, key=lambda row: row[2], reverse=True)
    for row in ranked:
        name = process_allele(row[1][0], 3)
        log.info('\t\t{: <20} {: >8.2f}%'.format(name, row[2] * 100))

    prediction = predict_genotype(eqs, allele_idx, allele_eq, em_results,
                                  gene_count, population, prior,
                                  zygosity_threshold)
    genotype, pair_count = prediction

    header = '\n[genotype] Most likely genotype explaining {:.0f} reads:'
    log.info(header.format(pair_count))
    for allele in genotype:
        log.info(f'\t\t{allele}')

    return em_results, genotype
def process_hla_dat():
    '''Processes IMGTHLA database, returning HLA sequences, exon locations,
    lists of complete and partial alleles and possible exon combinations.

    Reads the file named by the module-level hla_dat.

    Returns a tuple of:
        complete_alleles   set of alleles not flagged as partial
        partial_alleles    set of partial alleles whose 2-field name is not
                           shared with any complete allele
        gene_set           set of genes with at least one stored sequence
        sequences          allele -> upper-case sequence
        utrs               allele -> {'utr5' / 'utr3': [start, stop]}
        exons              allele -> {exon number: [start, stop]}
        final_exon_length  gene -> [last exon number, result of get_mode
                           over the complete alleles' lengths of that exon]

    Starts are stored with 1 subtracted from the value in the file.
    '''
    sequences = dict()
    utrs = defaultdict(dict)
    exons = defaultdict(dict)
    gene_exons = defaultdict(set)
    gene_set = set()
    complete_alleles = set()
    complete_2fields = set()
    partial_alleles = set()

    # Parser state for the record currently being read
    sequence = partial = exon = False

    # Raw strings: '\s' in a plain literal is an invalid escape sequence
    whitespace = re.compile(r'\s+')
    utr_feature = re.compile(r'\sUTR\s')

    # Returns [start, stop] from the third whitespace-separated field of a
    # feature line, written as "<start>..<stop>"
    def parse_coordinates(feature_line):
        span = whitespace.split(feature_line)[2].split('..')
        return [int(span[0]) - 1, int(span[1])]

    with open(hla_dat, 'r') as file:
        lines = file.read().splitlines()

    for line in lines:
        is_feature = line.startswith('FT')

        # Denotes end of sequence, add allele to database
        if line.startswith('//'):
            if sequence and allele in exons:
                sequences[allele] = seq
                # number is the last exon number parsed for this record
                gene_exons[gene].add(number)
                gene_set.add(gene)

                if not partial:
                    complete_alleles.add(allele)
                    complete_2fields.add(process_allele(allele, 2))
                else:
                    partial_alleles.add(allele)
            partial = False

        # Denotes partial alleles
        elif is_feature and 'partial' in line:
            partial = True

        # Allele name and gene
        elif is_feature and 'allele="HLA-' in line:
            cleaned = line.replace('"', '').replace('\n', '')
            allele = cleaned.split('HLA-')[1]
            gene = get_gene(allele)
            exon = sequence = False
            seq = ''

        # Exon coordinates
        elif is_feature and 'exon' in line:
            exon_coord = parse_coordinates(line)
            exon = True

        # Exon number on following line
        elif exon:
            number = line.split('"')[1]
            exons[allele][number] = exon_coord
            exon = False

        # UTRs: one seen before any exon of the allele is stored as the
        # 5' UTR, otherwise as the 3' UTR
        elif is_feature and utr_feature.search(line):
            utr_coord = parse_coordinates(line)
            if allele not in exons:
                utrs[allele]['utr5'] = utr_coord
            else:
                utrs[allele]['utr3'] = utr_coord

        # Start of sequence
        elif line.startswith('SQ'):
            sequence = True

        # Sequence line; the last token is dropped
        # (presumably a position counter - TODO confirm)
        elif sequence and line.startswith(' '):
            seq += ''.join(line.split()[:-1]).upper()

    # select only 2-field partial alleles
    partial_alleles = {
        allele for allele in partial_alleles
        if process_allele(allele, 2) not in complete_2fields
    }

    # get most common final exon length to truncate stop-loss alleles
    final_exon_length = defaultdict(list)
    for allele in complete_alleles:
        gene = get_gene(allele)
        last_exon = max(gene_exons[gene])

        if last_exon not in exons[allele]:
            continue

        start, stop = exons[allele][last_exon]
        final_exon_length[gene].append(stop - start)

    # Replace each list of lengths with [last exon, get_mode of lengths];
    # only existing keys are reassigned, so iterating the dict is safe
    for gene, lengths in final_exon_length.items():
        final_exon_length[gene] = [max(gene_exons[gene]), get_mode(lengths)]

    return (complete_alleles, partial_alleles, gene_set, sequences, utrs,
            exons, final_exon_length)
def type_partial(eqs, gene, partial_exons, complete_genotype,
                 partial_alleles, population, prior, tolerance,
                 max_iterations, drop_iterations, drop_threshold,
                 zygosity_threshold):
    '''Types partial alleles.

    eqs maps exon group -> gene -> list of (indices, count) classes,
    partial_exons maps allele -> exon number -> data, complete_genotype is
    the pair of alleles already predicted for the gene and partial_alleles
    is a set of partial allele names. zygosity_threshold is accepted but
    not used here. allele_eq, allele_idx and lengths are read from module
    scope.

    For every exon group in which a partial allele is supported, the pair
    of alleles explaining the largest share of reads is chosen (ties are
    broken with population priors when available). Returns the best pair
    over all groups as a tuple, or complete_genotype unchanged when no
    group yields a pair or no reads align to the binding region.
    '''
    # Return summed count of the given class positions for this gene
    def class_count(group, positions):
        classes = eqs[group][gene]
        return sum([classes[idx][1] for idx in positions])

    # Return count of a single allele
    def get_single_count(group, a):
        return class_count(group, allele_eq[group][a])

    # Return count of a pair of alleles
    def get_pair_count(group, a1, a2):
        return class_count(group,
                           allele_eq[group][a1] | allele_eq[group][a2])

    if gene not in {'A', 'B', 'C', 'DRB1', 'DQB1', 'DQA1'}:
        population = None

    # Set binding region by class
    if gene.startswith('D'):
        binding_region = "['2']"
    else:
        binding_region = "['2', '3']"

    if gene not in eqs[binding_region]:
        log.info(f'[genotype] No reads aligned to HLA-{gene} binding region')
        return complete_genotype

    # Get group of possible partial alleles by performing transcript
    # quantification on the binding region exons
    results = expectation_maximization(eqs[binding_region][gene], lengths,
                                       allele_idx, population, prior,
                                       tolerance, max_iterations,
                                       drop_iterations, drop_threshold)

    # Map partial alleles to their possible exon combinations
    exon_groups = defaultdict(set)
    predicted = set(complete_genotype)
    for idx in results:
        alleles = {process_allele(allele, 3) for allele in allele_idx[idx]}
        for allele in (alleles - predicted) & partial_alleles:
            available = str(sorted(partial_exons[allele].keys()))
            for group in eqs.keys():
                # Group keys are stringified exon lists; strip the brackets
                # and look for the exon run inside the allele's exon list
                if group[1:-1] in available:
                    exon_groups[group].add(allele)

    # Compare pairs of partial alleles and predicted alleles
    overall = []
    for group in sorted(exon_groups, key=len):
        # Skip just exon 2 for class I
        if not gene.startswith('D') and group == "['2']":
            continue

        # Only look at partial alleles that have a different sequence for
        # this combination of exons than the complete alleles
        first, second = complete_genotype
        possible_alleles = {
            allele for allele in exon_groups[group]
            if allele_eq[group][allele] != allele_eq[group][first]
            and allele_eq[group][allele] != allele_eq[group][second]
        }
        if not possible_alleles:
            continue

        # Filter partial alleles that have only a few more reads
        # than the complete alleles
        min_count = min(get_single_count(group, first),
                        get_single_count(group, second))
        possible_alleles = {
            allele for allele in possible_alleles
            if get_single_count(group, allele) > 10 + min_count
        }

        total_count = sum([count for _, count in eqs[group][gene]])

        # Get percent explained reads by complete genotype
        pair_count = get_pair_count(group, first, second)
        explained_percent = round(pair_count / total_count, 8)
        explained_reads = {(first, second): explained_percent}

        # Only consider pairs with partial alleles if they explain
        # a greater percentage of reads. Alleles are sorted so that pair
        # orientation and tie order do not depend on set iteration order.
        candidates = sorted(predicted | possible_alleles)
        for a1, a2 in combinations(candidates, 2):
            fraction = get_pair_count(group, a1, a2) / total_count
            if fraction > explained_percent:
                explained_reads[(a1, a2)] = round(fraction, 8)

        top_perc = max(explained_reads.values())
        tied = [pair for pair, value in explained_reads.items()
                if value == top_perc]

        # If the top percentage of explained reads is shared by more than
        # one pair, use priors to break the ties
        best_pair = None
        if population and len(tied) > 1:
            pair_prior = dict()
            for a1, a2 in tied:
                name1 = process_allele(a1, 2)
                name2 = process_allele(a2, 2)
                if name1 not in prior or name2 not in prior:
                    continue
                pair_prior[(a1, a2)] = (prior[name1][population]
                                        * prior[name2][population])
            if pair_prior:
                best_pair = max(pair_prior, key=pair_prior.get)

        # Without usable priors keep the first tied pair
        if best_pair is None:
            best_pair = tied[0]

        label = re.sub(r"['\[\]]", '', group)
        log.info('\t\texons {: <22}\t{: <28}\t{:.2f}%'.format(
            label, ', '.join(best_pair), top_perc * 100))
        overall.append([best_pair, top_perc])

    if overall:
        best = max(overall, key=lambda x: (x[1], x[0][0], x[0][1]))
        return best[0]

    return complete_genotype
def predict_genotype(eqs, allele_idx, allele_eq, em_results, gene_count,
                     population, prior, zygosity_threshold):
    '''Predicts most likely genotype using scoring based on proportion of
    explained reads, tie-breaking with allele priors.

    eqs is a sequence of (indices, count) compatibility classes, allele_idx
    maps index -> allele names, allele_eq maps index -> set of positions in
    eqs, and em_results is a list of [index, alleles, abundance] rows.

    Returns (genotype, pair_count): genotype is a list with one allele
    (homozygous call, or only one allele quantified) or two alleles, given
    as 3-field names; pair_count is the number of reads explained by the
    selected allele or pair.
    '''
    # Returns the compatibility classes of an index or tuple of indices
    def get_eqs(a):
        if isinstance(a, tuple):
            return set.union(*[allele_eq[idx] for idx in a])
        return allele_eq[a]

    # Returns number of reads in a set of compatibility classes
    def count_reads(observed_eqs):
        return sum([eqs[idx][1] for idx in observed_eqs])

    # Returns number of reads explained by a pair of alleles
    def get_pair_count(a1, a2):
        return count_reads(get_eqs(a1) | get_eqs(a2))

    # Returns non-shared counts for a pair of alleles
    def get_nonshared_count(a1, a2):
        a1_eqs = get_eqs(a1)
        a2_eqs = get_eqs(a2)
        return count_reads(a1_eqs - a2_eqs), count_reads(a2_eqs - a1_eqs)

    # Only one allele was quantified: report it on its own
    if len(em_results) <= 1:
        idx, alleles, _ = em_results[0]
        return [process_allele(alleles[0], 3)], count_reads(allele_eq[idx])

    # Group indices that share the same 2-field allele
    grouped = defaultdict(set)
    for idx, alleles, _ in em_results:
        grouped[process_allele(alleles[0], 2)].add(idx)
    grouped_indices = [tuple(indices) for indices in grouped.values()]

    explained_reads = dict()
    if len(grouped_indices) > 1:
        for a1, a2 in combinations(grouped_indices, 2):
            explained_reads[(a1, a2)] = get_pair_count(a1, a2) / gene_count
    else:
        # A single 2-field group: pair its two lowest indices
        a1, a2 = sorted(grouped_indices[0])[:2]
        explained_reads[((a1, ), (a2, ))] = (get_pair_count(a1, a2)
                                             / gene_count)

    # Print information
    log.info('\n[genotype] Pairs by % explained reads:')
    log.info('\t\t{: <28} {: >7}\t'.format('allele pair', 'explained'))
    for (a1, a2), count in sorted(explained_reads.items(),
                                  key=lambda x: x[1], reverse=True):
        alleles = ', '.join([
            process_allele(allele_idx[a1[0]][0], 3),
            process_allele(allele_idx[a2[0]][0], 3)
        ])
        log.info('\t\t{: <28} {: >9.2f}%\t'.format(alleles, count * 100))

    max_count = max(explained_reads.values())
    top_by_reads = [pair for pair, count in explained_reads.items()
                    if count == max_count]

    # If more than one pair has the same number of explained reads
    # use allele frequency priors to break the tie
    pair_prior = dict()
    if len(top_by_reads) > 1 and population:
        for a1, a2 in top_by_reads:
            allele1 = process_allele(allele_idx[a1[0]][0], 2)
            allele2 = process_allele(allele_idx[a2[0]][0], 2)
            if allele1 not in prior or allele2 not in prior:
                continue
            pair_prior[(a1, a2)] = (prior[allele1][population]
                                    * prior[allele2][population])

    if pair_prior:
        max_prior = max(pair_prior.values())
        a1, a2 = min(pair for pair, value in pair_prior.items()
                     if value >= max_prior)
    else:
        # No tie, no population, or no tied pair has priors: keep the
        # first top pair instead of calling max() on an empty sequence
        a1, a2 = top_by_reads[0]

    pair_count = get_pair_count(a1, a2)
    a1_count, a2_count = get_nonshared_count(a1, a2)

    a1 = process_allele(allele_idx[min(a1)][0], 3)
    a2 = process_allele(allele_idx[min(a2)][0], 3)

    # Zygosity check based on nonshared counts
    log.info('\n[genotype] Checking zygosity')
    if a1_count == a2_count == 0:
        log.info('[genotype] Unable to distinguish '
                 'between minor and major alleles')
        genotype = [a1, a2]

    elif a1_count == 0 or a2_count == 0:
        # A single allele is returned here, i.e. a homozygous call; the
        # message previously said "heterozygous"
        log.info('[genotype] Likely homozygous: minor allele has no '
                 'nonshared reads')
        genotype = [a2] if a1_count == 0 else [a1]

    else:
        ratio = min(a1_count / a2_count, a2_count / a1_count)
        if ratio < zygosity_threshold:
            log.info('[genotype] Likely homozygous: minor/major '
                     'nonshared count {:.2f}'.format(ratio))
            genotype = [a1] if a1_count > a2_count else [a2]
        else:
            log.info('[genotype] Likely heterozygous: minor/major '
                     'nonshared count {:.2f}'.format(ratio))
            genotype = [a1, a2]

    return genotype, pair_count
def expectation_maximization(eqs, lengths, allele_idx, population, prior,
                             tolerance, max_iterations, drop_iterations,
                             drop_threshold):
    '''Quantifies allele transcript abundance. Based on the methods used in
    HISAT-genotype (http://dx.doi.org/10.1101/266197).

    eqs is an iterable of (alleles, count) compatibility classes holding
    allele indices, lengths maps index -> length and allele_idx maps
    index -> allele names. When population is set, prior[allele][population]
    (allele given as a 2-field name) weights the initial split of counts.
    Iteration stops once the change between iterations falls below
    tolerance or max_iterations is reached; alleles whose abundance is
    below drop_threshold times the largest abundance are dropped from
    iteration drop_iterations onward, or on convergence.

    Returns a mapping of index -> abundance for the remaining alleles.
    '''

    # Divides raw counts between alleles for the first iteration
    # of transcript quantification
    def initial_abundances(eqs, lengths, population):

        # Divides counts equally between alleles in the same
        # compatibility class
        def divide_equally(alleles, count):
            n_alleles = len(alleles)
            for allele in alleles:
                counts[allele] += count / n_alleles

        # Divides counts proportionally to allele frequency
        def divide_prior(alleles, count, allele_prob):
            total_prob = sum(allele_prob.values())

            # Every recorded prior is zero: a proportional split would
            # divide by zero, so treat the alleles as equally likely
            if total_prob == 0:
                divide_equally(alleles, count)
                return

            for allele in alleles:
                counts[allele] += count * (allele_prob[allele] / total_prob)

        counts = defaultdict(float)
        undivided_counts = defaultdict(float)

        for alleles, count in eqs:
            allele_prior = defaultdict(float)
            for idx in alleles:
                undivided_counts[idx] += count

                allele = process_allele(allele_idx[idx][0], 2)
                if population and allele in prior:
                    allele_prior[idx] = prior[allele][population]

            if population and allele_prior:
                divide_prior(alleles, count, allele_prior)
                continue

            divide_equally(alleles, count)

        return counts_to_abundances(counts), undivided_counts

    # Normalizes counts by allele length and convert to abundances
    def counts_to_abundances(counts):
        # A defaultdict is returned on purpose: the main loop looks up
        # alleles that may be missing from a later step and needs 0.0
        abundances = defaultdict(float)

        for allele, count in counts.items():
            length = lengths[allele]
            abundances[allele] = count / length

        total_abundance = sum(abundances.values())

        for allele, abundance in abundances.items():
            abundances[allele] = abundance / total_abundance

        return abundances

    # Redistribute counts between alleles in the same compatibility
    # class based on their overall abundance
    def update_abundances(eqs, abundances):
        counts = defaultdict(float)

        for alleles, count in eqs:
            alleles = [allele for allele in alleles if allele in abundances]

            total_abundance = sum([abundances[allele] for allele in alleles])

            if total_abundance == 0:
                continue

            for allele in alleles:
                counts[allele] += count * (abundances[allele]
                                           / total_abundance)

        return counts_to_abundances(counts)

    # Drop low support alleles after a specified number of iterations if
    # their abundance is less than a specified proportion of the greatest
    # abundance
    def drop_alleles(eqs, abundances, drop_iterations, drop_threshold,
                     iterations, converged):
        if iterations == 1:
            abundances = {
                allele: abundance
                for allele, abundance in abundances.items()
                if abundance > 0.0
            }

        elif iterations >= drop_iterations or converged:
            threshold = drop_threshold * max(abundances.values())
            abundances = {
                allele: abundance
                for allele, abundance in abundances.items()
                if abundance >= threshold
            }

        return abundances, eqs

    # Compute square root of sum of squares
    def SRSS(theta):
        square_sum = 0.0
        for i in theta:
            square_sum += i**2
        return math.sqrt(square_sum)

    # Check if sum difference between two iterations is below tolerance
    def check_convergence(theta0, theta_prime):
        diff = [theta_prime[allele] - theta0[allele] for allele in theta0]
        residual_error = SRSS(diff)
        return residual_error < tolerance

    converged = False
    iterations = 1

    theta0, undivided_counts = initial_abundances(eqs, lengths, population)

    log.info('[genotype] Top 10 alleles by undivided read count:')
    log.info('\t\t{: <20} {: >10}\t'.format('allele', 'read count'))
    for idx, count in sorted(undivided_counts.items(),
                             key=lambda x: x[1], reverse=True)[:10]:
        log.info('\t\t{: <20} {: >10.0f}\t'.format(
            process_allele(allele_idx[idx][0], 3), count))

    log.info('\n[genotype] Quantifying allele transcript abundance')

    # SQUAREM - accelerated EM
    # R. Varadhan & C. Roland (doi: 10.1111/j.1467-9469.2007.00585.x)
    # Used by HISAT-genotype, originally used by Sailfish
    while iterations < max_iterations and not converged:
        # Get next two steps
        theta1 = update_abundances(eqs, theta0)
        theta2 = update_abundances(eqs, theta1)

        theta_prime = defaultdict(float)
        r = dict()
        v = dict()

        # Compute r and v
        for allele in theta1:
            r[allele] = theta1[allele] - theta0[allele]
            v[allele] = (theta2[allele] - theta1[allele]) - r[allele]

        srss_r = SRSS(r.values())
        srss_v = SRSS(v.values())

        if srss_v != 0:
            # Compute step length
            alpha = -(srss_r / srss_v)
            for allele in r:
                value = theta0[allele] \
                    - 2*alpha*r[allele] \
                    + (alpha**2)*v[allele]
                theta_prime[allele] = value

            step_min = min(theta_prime.values())
            step_max = max(theta_prime.values())

            # Adjust step rather than kicking out alleles with a
            # negative result
            if step_min < 0:
                theta_prime = {
                    allele: (value - step_min) / (step_max - step_min)
                    for allele, value in theta_prime.items()
                }
                total = sum(theta_prime.values())
                theta_prime = {
                    allele: value / total
                    for allele, value in theta_prime.items()
                }

            # Update abundances with given the new proportions
            theta_prime = update_abundances(eqs, theta_prime)

        else:
            theta_prime = theta1

        converged = check_convergence(theta0, theta_prime)

        theta0, eqs = drop_alleles(eqs, theta_prime, drop_iterations,
                                   drop_threshold, iterations, converged)

        iterations += 1

    log.info(f'[genotype] EM converged after {iterations} iterations')

    return theta0