def process_counts(count_file, eq_file, gene_list, allele_idx, allele_lengths, keep_files):
    '''Parses pseudoalignment counts and compatibility classes.

    Reads the tab-separated count file (class id, count) and class file
    (class id, comma-separated indices). A class is skipped entirely when
    any of its indices has an empty allele_idx entry. Remaining classes are
    "unique" when all their alleles share one get_gene() value and the count
    is positive, otherwise "multi".

    Returns a tuple (eq_idx, allele_eq, align_stats):
        eq_idx       gene -> list of (indices, count) for its unique classes
        allele_eq    index -> set of positions of those classes within the
                     gene's list in eq_idx
        align_stats  [count_unique, count_multi, class_unique, class_multi]

    gene_list and allele_lengths are accepted but not used here. Both input
    files are handed to remove_files() together with keep_files.
    '''
    log.info('[alignment] processing pseudoalignment')

    def read_lines(path):
        with open(path, 'r') as handle:
            return handle.read().splitlines()

    # Class id -> count
    counts = {
        eq: float(value)
        for eq, value in (row.split('\t') for row in read_lines(count_file))
    }

    # Class id -> list of indices
    eqs = {
        eq: members.split(',')
        for eq, members in (row.split('\t') for row in read_lines(eq_file))
    }

    eq_idx = defaultdict(list)
    count_unique = count_multi = 0
    class_unique = class_multi = 0

    for eq, indices in eqs.items():
        # Every index is looked up (no short-circuit) before deciding to skip
        known = [bool(allele_idx[idx]) for idx in indices]
        if not all(known):
            continue

        genes = {get_gene(allele) for idx in indices for allele in allele_idx[idx]}
        count = counts[eq]

        if len(genes) != 1 or not count > 0:
            count_multi += count
            class_multi += 1
            continue

        (gene,) = genes
        eq_idx[gene].append((indices, count))
        count_unique += count
        class_unique += 1

    # Indices mapping to the positions of their classes within each gene
    allele_eq = defaultdict(set)
    for gene_classes in eq_idx.values():
        for position, (indices, _) in enumerate(gene_classes):
            for idx in indices:
                allele_eq[idx].add(position)

    remove_files([count_file, eq_file], keep_files)

    align_stats = [count_unique, count_multi, class_unique, class_multi]
    return eq_idx, allele_eq, align_stats
def build_complete(allele):
    '''Builds the concatenated exon sequence of an allele and records it.

    The allele's exon slices are joined in sorted exon-key order, an
    over-long final exon is trimmed from the end of the joined sequence,
    and the result is registered in cDNA (sequence -> alleles) and
    gene_length (gene -> list of sequence lengths). Each UTR slice of the
    allele, if any, is added to other.

    Reads exons, sequences, utrs and final_exon_length and updates cDNA,
    gene_length and other, none of which are defined in this function.
    '''
    gene = get_gene(allele)

    # NOTE(review): exon keys are compared as-is. If they are strings (as
    # process_hla_dat stores them), '10' sorts before '2'; that matches how
    # process_hla_dat picks the final exon, so the two must change together.
    allele_exons = sorted(exons[allele].items())
    seq = ''.join(sequences[allele][start:stop]
                  for _, (start, stop) in allele_exons)

    exon, exon_length = final_exon_length[gene]
    if exon in exons[allele]:
        start, stop = exons[allele][exon]
        # Same cut as the original slice seq[:exon_length - (stop - start) + 1],
        # which removes (excess - 1) trailing characters. When the exon is
        # exactly one base too long that slice end is 0 and seq[:0] would
        # discard the whole sequence, so nothing is trimmed in that case.
        # NOTE(review): the "+ 1" offset is kept from the original; confirm
        # whether the full excess should be removed instead.
        trim = (stop - start) - exon_length - 1
        if trim > 0:
            seq = seq[:-trim]

    cDNA[seq].add(allele)
    gene_length[gene].append(len(seq))

    if allele in utrs:
        for (start, stop) in utrs[allele].values():
            other.add(sequences[allele][start:stop])
def process_hla_dat():
    '''Processes IMGTHLA database, returning HLA sequences, exon locations,
    lists of complete and partial alleles and final exon lengths.

    Reads the file named by the module-level hla_dat and walks it line by
    line; a record ends at a line starting with '//'. A record is stored
    only if its sequence section was reached and at least one exon was
    parsed for its allele.

    Returns a tuple:
        complete_alleles   set of alleles with no 'partial' FT line
        partial_alleles    set of partial alleles whose
                           process_allele(allele, 2) value is not shared
                           with any complete allele
        gene_set           set of get_gene() values of stored alleles
        sequences          allele -> upper-cased sequence string
        utrs               allele -> {'utr5' / 'utr3': [start, stop]}
        exons              allele -> {exon number (str): [start, stop]}
        final_exon_length  gene -> [exon number, get_mode() of that exon's
                           length across complete alleles]

    Coordinates are stored with start decremented by one and stop as
    written (presumably 1-based inclusive positions turned into a Python
    slice - confirm against the database format).
    '''
    def parse_coords(line):
        # The third whitespace-separated field holds "start..stop".
        span = re.split(r'\s+', line)[2].split('..')
        return [int(span[0]) - 1, int(span[1])]

    sequences = dict()
    utrs = defaultdict(dict)
    exons = defaultdict(dict)
    gene_exons = defaultdict(set)

    in_sequence = is_partial = expect_number = False

    gene_set = set()
    complete_alleles = set()
    complete_2fields = set()
    partial_alleles = set()

    with open(hla_dat, 'r') as file:
        lines = file.read().splitlines()

    for line in lines:
        # Denotes end of sequence, add allele to database
        if line.startswith('//'):
            if in_sequence and allele in exons:
                sequences[allele] = seq
                # number is the last exon number parsed for this record
                gene_exons[gene].add(number)
                gene_set.add(gene)

                if not is_partial:
                    complete_alleles.add(allele)
                    complete_2fields.add(process_allele(allele, 2))
                else:
                    partial_alleles.add(allele)
            is_partial = False

        # Denotes partial alleles
        elif line.startswith('FT') and 'partial' in line:
            is_partial = True

        # Allele name and gene
        elif line.startswith('FT') and re.search(r'allele\="HLA-', line):
            allele = re.split('HLA-', re.sub('["\n]', '', line))[1]
            gene = get_gene(allele)
            expect_number = in_sequence = False
            seq = ''

        # Exon coordinates
        elif line.startswith('FT') and re.search('exon', line):
            exon_coord = parse_coords(line)
            expect_number = True

        # Exon number on following line
        elif expect_number:
            number = re.split('"', line)[1]
            exons[allele][number] = exon_coord
            expect_number = False

        # UTRs: one seen before any exon of the allele is stored as utr5,
        # otherwise as utr3
        elif line.startswith('FT') and re.search(r'\sUTR\s', line):
            utr_coord = parse_coords(line)
            if allele not in exons:
                utrs[allele]['utr5'] = utr_coord
            else:
                utrs[allele]['utr3'] = utr_coord

        # Start of sequence
        elif line.startswith('SQ'):
            in_sequence = True

        # Sequence line: the last whitespace-separated token is dropped
        # (presumably a running position counter - confirm)
        elif in_sequence and line.startswith(' '):
            seq += ''.join(line.split()[:-1]).upper()

    # select only 2-field partial alleles
    partial_alleles = {allele for allele in partial_alleles
                       if process_allele(allele, 2) not in complete_2fields}

    # get most common final exon length to truncate stop-loss alleles
    # NOTE(review): exon numbers are strings, so sorted(...)[-1] is the
    # lexicographic maximum ('9' > '10'); build_complete orders exons the
    # same way, so change both together if that is not intended.
    observed_lengths = defaultdict(list)
    for allele in complete_alleles:
        gene = get_gene(allele)
        last_exon = sorted(gene_exons[gene])[-1]

        if last_exon not in exons[allele]:
            continue

        start, stop = exons[allele][last_exon]
        observed_lengths[gene].append(stop - start)

    final_exon_length = defaultdict(list)
    for gene, lengths in observed_lengths.items():
        last_exon = sorted(gene_exons[gene])[-1]
        final_exon_length[gene] = [last_exon, get_mode(lengths)]

    return (complete_alleles, partial_alleles, gene_set, sequences, utrs,
            exons, final_exon_length)