Ejemplo n.º 1
0
def process_counts(count_file, eq_file, gene_list, allele_idx, allele_lengths,
                   keep_files):
    '''Reads pseudoalignment output and groups compatibility classes by gene.

    count_file is read as tab-separated "class<TAB>count" rows and eq_file
    as tab-separated "class<TAB>idx,idx,..." rows. allele_idx is looked up
    with each idx and must yield the alleles for that index. gene_list and
    allele_lengths are accepted but not used in this function.

    Returns (eq_idx, allele_eq, align_stats):
        eq_idx      - get_gene() value -> list of (indices, count), one entry
                      per class whose alleles share a single get_gene() value
                      and whose count is positive
        allele_eq   - idx -> set of positions of its classes inside the
                      per-gene lists of eq_idx
        align_stats - [count_unique, count_multi, class_unique, class_multi]

    Before returning, both input files are passed to remove_files together
    with keep_files.
    '''
    log.info('[alignment] processing pseudoalignment')

    # Observed count of every compatibility class
    with open(count_file, 'r') as handle:
        count_rows = [row.split('\t') for row in handle.read().splitlines()]
    counts = {eq_id: float(value) for eq_id, value in count_rows}

    # Indices belonging to every compatibility class
    with open(eq_file, 'r') as handle:
        class_rows = [row.split('\t') for row in handle.read().splitlines()]
    classes = {eq_id: members.split(',') for eq_id, members in class_rows}

    eq_idx = defaultdict(list)
    unique_count = multi_count = 0
    unique_classes = multi_classes = 0

    for eq_id, members in classes.items():
        # Look up every index first; a class with any index that has no
        # alleles is dropped and counted in neither tally.
        allele_groups = [allele_idx[idx] for idx in members]
        if not all(allele_groups):
            continue

        gene_names = {get_gene(allele)
                      for group in allele_groups for allele in group}
        count = counts[eq_id]

        # Everything that is not a positive-count, single-gene class is
        # tallied as multi.
        if len(gene_names) != 1 or not count > 0:
            multi_count += count
            multi_classes += 1
            continue

        (gene,) = gene_names
        eq_idx[gene].append((members, count))
        unique_count += count
        unique_classes += 1

    # Map each index to the positions of its classes; positions restart at
    # zero for every gene because they index into that gene's own list.
    allele_eq = defaultdict(set)
    for gene_classes in eq_idx.values():
        for position, (members, _) in enumerate(gene_classes):
            for idx in members:
                allele_eq[idx].add(position)

    remove_files([count_file, eq_file], keep_files)

    return eq_idx, allele_eq, [unique_count, multi_count,
                               unique_classes, multi_classes]
Ejemplo n.º 2
0
 def build_complete(allele):
     '''Builds the exon-only sequence of an allele and records it.

     Relies on names from the enclosing scope that are not visible in this
     block: get_gene, exons, sequences, final_exon_length, cDNA, gene_length,
     utrs and other.

     Side effects (no return value):
         cDNA[seq]         - allele is added under its concatenated sequence
         gene_length[gene] - the sequence length is appended
         other             - every UTR subsequence of the allele is added
     '''
     gene = get_gene(allele)

     # Concatenate the exon subsequences in sorted exon-number order.
     seq = ''.join(sequences[allele][start:stop]
                   for _, (start, stop) in sorted(exons[allele].items()))

     # Trim an over-long final exon back to the expected length for the gene.
     # NOTE(review): assumes the gene's final exon is the last piece of the
     # concatenation above - confirm against how final_exon_length is built.
     exon, exon_length = final_exon_length[gene]
     if exon in exons[allele]:
         start, stop = exons[allele][exon]
         overhang = (stop - start) - exon_length
         if overhang > 0:
             # Remove exactly the surplus bases. The previous slice end,
             # exon_length - (stop - start) + 1, kept one base too many and
             # became seq[:0] (an empty sequence) for a one-base overhang.
             seq = seq[:len(seq) - overhang]

     cDNA[seq].add(allele)
     gene_length[gene].append(len(seq))

     # Membership test first so a defaultdict-style utrs gains no new key.
     if allele in utrs:
         for start, stop in utrs[allele].values():
             other.add(sequences[allele][start:stop])
Ejemplo n.º 3
0
def _feature_span(line):
    '''Returns [start, stop] from the "a..b" location field of an FT line.

    The location is the third whitespace-separated field. start is shifted
    down by one and stop is kept as written, so the pair can be used directly
    as a Python slice.
    '''
    bounds = re.split(r'\s+', line)[2].split('..')
    return [int(bounds[0]) - 1, int(bounds[1])]


def process_hla_dat():
    '''Parses the file named by the module-level hla_dat into allele tables.

    A record is stored when its terminating '//' line is reached, and only
    if a sequence section was seen and at least one exon was parsed for the
    allele.

    Returns a tuple of:
        complete_alleles  - set of stored alleles whose record had no FT line
                            containing 'partial'
        partial_alleles   - set of stored partial alleles whose
                            process_allele(allele, 2) value matches no
                            complete allele
        gene_set          - set of get_gene() values of stored alleles
        sequences         - allele -> upper-cased sequence string
        utrs              - allele -> {'utr5' / 'utr3': [start, stop]}
        exons             - allele -> {exon number (str): [start, stop]}
        final_exon_length - gene -> [exon, length]: the highest-sorting last
                            exon number recorded for the gene and get_mode()
                            of that exon's lengths over the complete alleles
                            that have it
    '''

    sequences = dict()
    utrs = defaultdict(dict)
    exons = defaultdict(dict)
    gene_exons = defaultdict(set)

    # Parser state: inside sequence section / record flagged partial /
    # previous line was an exon location (its number is on the next line).
    sequence = partial = exon = False

    gene_set = set()
    complete_alleles = set()
    complete_2fields = set()
    partial_alleles = set()

    with open(hla_dat, 'r') as file:
        lines = file.read().splitlines()

    # The order of the branches below matters: each line is handled by the
    # first matching branch only.
    for line in lines:
        # Denotes end of sequence, add allele to database
        if line.startswith('//'):
            if sequence and allele in exons:
                sequences[allele] = seq
                # number still holds the last exon number parsed for this
                # allele, so gene_exons collects each allele's last exon.
                gene_exons[gene].add(number)
                gene_set.add(gene)

                if not partial:
                    complete_alleles.add(allele)
                    complete_2fields.add(process_allele(allele, 2))
                else:
                    partial_alleles.add(allele)
            partial = False

        # Denotes partial alleles
        elif line.startswith('FT') and 'partial' in line:
            partial = True

        # Allele name and gene
        elif line.startswith('FT') and re.search(r'allele\="HLA-', line):
            allele = re.split('HLA-', re.sub('["\n]', '', line))[1]
            gene = get_gene(allele)

            # A new allele starts: reset per-record state
            exon = sequence = False
            seq = ''

        # Exon coordinates
        elif line.startswith('FT') and re.search('exon', line):
            exon_coord = _feature_span(line)
            exon = True

        # Exon number on following line
        elif exon:
            number = re.split('"', line)[1]
            exons[allele][number] = exon_coord
            exon = False

        # UTRs
        elif line.startswith('FT') and re.search(r'\sUTR\s', line):
            utr_coord = _feature_span(line)

            # A UTR seen before any exon of the allele is stored as the
            # 5' UTR, any later one as the 3' UTR.
            if allele not in exons:
                utrs[allele]['utr5'] = utr_coord
            else:
                utrs[allele]['utr3'] = utr_coord

        # Start of sequence
        elif line.startswith('SQ'):
            sequence = True

        # Sequence line: drop the last whitespace-separated field and join
        # the remaining chunks
        elif sequence and line.startswith(' '):
            seq += ''.join(line.split()[:-1]).upper()

    # select only 2-field partial alleles
    partial_alleles = {allele for allele in partial_alleles
                       if process_allele(allele, 2) not in complete_2fields}

    # get most common final exon length to truncate stop-loss alleles
    final_exon_length = defaultdict(list)
    for allele in complete_alleles:
        gene = get_gene(allele)
        exon = sorted(gene_exons[gene])[-1]

        if exon not in exons[allele]:
            continue

        start, stop = exons[allele][exon]
        final_exon_length[gene].append(stop - start)

    # Replace each list of lengths by [exon, most common length]; only
    # existing keys are reassigned, so iterating over items() stays valid.
    for gene, lengths in final_exon_length.items():
        exon = sorted(gene_exons[gene])[-1]
        length = get_mode(lengths)
        final_exon_length[gene] = [exon, length]

    return (complete_alleles, partial_alleles, gene_set, sequences, utrs,
            exons, final_exon_length)