def test_is_sequence_broken():
    """Check `is_sequence_broken` for an agreeing and a disagreeing reference.

    The stub protein only carries the two attributes set below
    (`refseq` and `sequence`).
    """
    protein_stub = namedtuple('Protein', 'refseq, sequence')
    protein = protein_stub(refseq='NM_0001', sequence='MEAVPKKKKKK')

    # the sequence starts with 'M', so reference 'M' at position 1
    # is expected to produce a falsy result
    assert not bioinf.is_sequence_broken(protein, 1, 'M', 'A')

    # the second residue is 'E', so reference 'M' at position 2 is expected
    # to be reported as a tuple describing the mismatch
    expected_report = ('NM_0001', 'E', 'M', '2', 'A')
    assert bioinf.is_sequence_broken(protein, 2, 'M', 'A') == expected_report
def preparse_mutations(self, line: List[str]):
    """Yield preparsed mutations from one line of an Annovar annotation file.

    The line has to be split already by the correct separator (usually
    the tabulator character). Mutations are read from the 10th field:
    only the first semicolon-separated impact-list is used, and that list
    is split by commas.

    Using just the first impact-list is safe, because the redundancy of
    the semicolon-separated impact-lists is guaranteed in the data by the
    check_semicolon_separated_data_redundancy test from the `test_data.py`
    script. For more explanation, check issue #43 on GitHub.

    Mutations of proteins missing from `self.proteins` are skipped.
    Mutations for which `is_sequence_broken` returns a truthy value are
    not yielded; the returned value is appended to
    `self.broken_seq[refseq]` instead.

    Yields:
        (pos, protein, alt, ref, is_ptm_related) tuples, where
        is_ptm_related is the result of
        `protein.has_sites_in_range(pos - 7, pos + 7)`.
    """
    first_impact_list = line[9].split(';')[0]

    for impact in first_impact_list.split(','):
        fields = impact.split(':')
        refseq = fields[1]

        try:
            protein = self.proteins[refseq]
        except KeyError:
            # the mutation affects a protein which is not in our dataset
            continue

        ref, pos, alt = decode_mutation(fields[4])

        broken_report = is_sequence_broken(protein, pos, ref, alt)
        if broken_report:
            self.broken_seq[refseq].append(broken_report)
        else:
            is_ptm_related = protein.has_sites_in_range(pos - 7, pos + 7)
            yield pos, protein, alt, ref, is_ptm_related
def import_genome_proteome_mappings(
    proteins,
    mappings_dir='data/200616/all_variants/playground',
    mappings_file_pattern='annot_*.txt.gz',
    bdb_dir=''
):
    """Populate `bdb` with genome -> proteome mappings read from gz files.

    Each input line is expected to have five tab-separated fields:
    chromosome, position, reference, alternative and a comma-separated
    list of colon-separated destinations (name, refseq, exon, cDNA
    mutation, protein mutation). Lines or destinations which cannot be
    unpacked or decoded are printed and skipped.

    `bdb` is reset and reopened before the import. It is opened at the path
    configured under BDB_DNA_TO_PROTEIN_PATH, or at a file with the same
    name inside `bdb_dir` when `bdb_dir` is given.

    Args:
        proteins: mapping of refseq identifiers to proteins; destinations
            with a refseq absent from it are skipped.
        mappings_dir: passed on to `read_from_gz_files`.
        mappings_file_pattern: passed on to `read_from_gz_files`.
        bdb_dir: optional replacement directory for the database file.

    Returns:
        A defaultdict mapping refseq to the list of truthy values returned
        by `is_sequence_broken` for that refseq.
    """
    print('Importing mappings:')

    known_chromosomes = get_human_chromosomes()
    broken_seq = defaultdict(list)

    bdb.reset()
    bdb.close()

    configured_path = current_app.config['BDB_DNA_TO_PROTEIN_PATH']
    if bdb_dir:
        db_path = bdb_dir + '/' + basename(configured_path)
    else:
        db_path = configured_path

    bdb.open(db_path, cache_size=20480 * 8 ** 4)

    for line in read_from_gz_files(mappings_dir, mappings_file_pattern):
        columns = line.rstrip().split('\t')
        try:
            chrom, pos, ref, alt, prot = columns
        except ValueError as error:
            print(error, line)
            continue

        assert chrom.startswith('chr')
        chrom = chrom[3:]
        assert chrom in known_chromosomes

        ref = ref.rstrip()

        # new Coding Sequence Variants to be added to those already
        # mapped from given `snv` (Single Nucleotide Variation)
        for dest in prot.split(','):
            if not dest:
                continue

            parts = dest.split(':')
            try:
                name, refseq, exon, cdna_mut, prot_mut = parts
            except ValueError as error:
                print(error, line)
                continue

            # name and refseq are redundant with respect to one another
            assert refseq.startswith('NM_')

            assert exon.startswith('exon')
            exon = exon[4:]

            assert cdna_mut.startswith('c')
            try:
                cdna_ref, cdna_pos, cdna_alt = decode_mutation(cdna_mut)
            except ValueError as error:
                print(error, line)
                continue

            try:
                strand = determine_strand(ref, cdna_ref, alt, cdna_alt)
            except DataInconsistencyError as error:
                print(error, line)
                continue

            assert prot_mut.startswith('p')

            # It would be possible to check here whether the reference
            # nucleotide is consistent with the reference amino acid. For
            # example, cytosine in the reference implies that there
            # shouldn't be a methionine, glutamic acid, lysine nor arginine.
            # The same applies to the alternative nuc/aa and their
            # combinations (having references (nuc, aa): (G, K) and alt
            # nuc C defines that the alt aa has to be Asparagine (N) - no
            # other is valid).
            # Note: it could be used to compress the data in memory too
            aa_ref, aa_pos, aa_alt = decode_mutation(prot_mut)

            try:
                # try to get it from cache (`proteins` dictionary)
                protein = proteins[refseq]
            except KeyError:
                continue

            # the amino acid position has to agree with the codon
            # which contains the cDNA position
            assert aa_pos == (int(cdna_pos) - 1) // 3 + 1

            broken_report = is_sequence_broken(protein, aa_pos, aa_ref, aa_alt)
            if broken_report:
                broken_seq[refseq].append(broken_report)
                continue

            is_ptm_related = protein.has_sites_in_range(aa_pos - 7, aa_pos + 7)

            snv_key = make_snv_key(chrom, pos, cdna_ref, cdna_alt)

            # add new item, emulating set update
            encoded_item = encode_csv(
                strand, aa_ref, aa_alt, cdna_pos,
                exon, protein.id, is_ptm_related
            )
            bdb.add(snv_key, encoded_item)

    return broken_seq
def import_aminoacid_mutation_refseq_mappings(
    proteins,
    mappings_dir='data/200616/all_variants/playground',
    mappings_file_pattern='annot_*.txt.gz',
    bdb_dir=''
):
    """Populate `bdb_refseq` with "gene + amino acid mutation" -> refseq mappings.

    Each input line is expected to have five tab-separated fields, the
    last one being a comma-separated list of colon-separated destinations
    (name, refseq, exon, cDNA mutation, protein mutation). Lines and
    destinations which cannot be unpacked are printed and skipped.

    `bdb_refseq` is reset and reopened before the import. It is opened at
    the path configured under BDB_GENE_TO_ISOFORM_PATH, or at a file with
    the same name inside `bdb_dir` when `bdb_dir` is given.

    Args:
        proteins: mapping of refseq identifiers to proteins; destinations
            with a refseq absent from it are skipped.
        mappings_dir: passed on to `read_from_gz_files`.
        mappings_file_pattern: passed on to `read_from_gz_files`.
        bdb_dir: optional replacement directory for the database file.
    """
    print('Importing mappings:')

    chromosomes = get_human_chromosomes()

    bdb_refseq.reset()
    bdb_refseq.close()

    # Use the configured path as-is unless a replacement directory was
    # requested; stripping the directory unconditionally would open the
    # database relative to the current working directory.
    path = current_app.config['BDB_GENE_TO_ISOFORM_PATH']
    if bdb_dir:
        path = bdb_dir + '/' + basename(path)

    bdb_refseq.open(path)

    for line in read_from_gz_files(mappings_dir, mappings_file_pattern):
        try:
            chrom, pos, ref, alt, prot = line.rstrip().split('\t')
        except ValueError:
            print('Import error: not enough values for "tab" split')
            print(line)
            continue

        assert chrom.startswith('chr')
        chrom = chrom[3:]

        assert chrom in chromosomes

        # empty destinations (e.g. from a trailing comma) are ignored
        for dest in filter(bool, prot.split(',')):
            try:
                name, refseq, exon, cdna_mut, prot_mut = dest.split(':')
            except ValueError:
                print('Import error: not enough values for ":" split')
                print(line, dest)
                continue

            assert refseq.startswith('NM_')

            assert cdna_mut.startswith('c')
            cdna_ref, cdna_pos, cdna_alt = decode_mutation(cdna_mut)

            assert prot_mut.startswith('p')
            aa_ref, aa_pos, aa_alt = decode_mutation(prot_mut)

            try:
                # try to get it from cache (`proteins` dictionary)
                protein = proteins[refseq]
            except KeyError:
                continue

            # the amino acid position has to agree with the codon
            # which contains the cDNA position
            assert aa_pos == (int(cdna_pos) - 1) // 3 + 1

            broken_sequence_tuple = is_sequence_broken(protein, aa_pos, aa_ref, aa_alt)

            # mutations inconsistent with the protein sequence are not mapped
            if broken_sequence_tuple:
                continue

            key = protein.gene.name + ' ' + aa_ref + str(aa_pos) + aa_alt
            bdb_refseq.add(key, refseq)
def import_aminoacid_mutation_refseq_mappings(
    proteins: Dict[str, Protein],
    mappings_dir='data/200616/all_variants/playground',
    mappings_file_pattern='annot_*.txt.gz',
    bdb_dir=''
):
    """Populate `bdb_refseq` with "gene + amino acid mutation" -> protein id mappings.

    Each input line is expected to have five tab-separated fields, the
    last one being a comma-separated list of colon-separated destinations
    (name, refseq, exon, cDNA mutation, protein mutation). Malformed lines
    and destinations are printed and skipped; any other error raised while
    processing a single destination is printed and the import carries on.

    `bdb_refseq` is reset and reopened before the import. It is opened at
    the path configured under BDB_GENE_TO_ISOFORM_PATH, or at a file with
    the same name inside `bdb_dir` when `bdb_dir` is given. Values are
    added within `bdb_refseq.cached_session(overwrite_db_values=True)`,
    and `bdb_refseq.flush_cache` is handed to `read_from_gz_files` as the
    `after_batch` callback.

    Args:
        proteins: mapping of refseq identifiers to proteins; destinations
            with a refseq absent from it are skipped.
        mappings_dir: passed on to `read_from_gz_files`.
        mappings_file_pattern: passed on to `read_from_gz_files`.
        bdb_dir: optional replacement directory for the database file.
    """
    print('Importing mappings:')

    chromosomes = get_human_chromosomes()

    bdb_refseq.reset()
    bdb_refseq.close()

    path = current_app.config['BDB_GENE_TO_ISOFORM_PATH']
    if bdb_dir:
        path = bdb_dir + '/' + basename(path)

    bdb_refseq.open(path, cache_size=20480 * 8 * 8 * 8 * 8)

    # gene names are looked up once, ahead of the import loop
    genes = {protein: protein.gene.name for protein in proteins.values()}

    with bdb_refseq.cached_session(overwrite_db_values=True):
        for line in read_from_gz_files(
            mappings_dir, mappings_file_pattern,
            after_batch=bdb_refseq.flush_cache
        ):
            try:
                chrom, pos, ref, alt, prot = line.rstrip().split('\t')
            except ValueError:
                print('Import error: not enough values for "tab" split')
                print(line)
                continue

            assert chrom.startswith('chr')
            chrom = chrom[3:]

            assert chrom in chromosomes

            # empty destinations (e.g. from a trailing comma) are ignored
            for dest in filter(bool, prot.split(',')):
                try:
                    name, refseq, exon, cdna_mut, prot_mut = dest.split(':')
                except ValueError:
                    print('Import error: not enough values for ":" split')
                    print(line)
                    print(dest)
                    continue

                # An explicit check instead of try/assert/except: asserts
                # are stripped when Python runs with -O, which would let
                # unexpected identifiers through without any report.
                if not refseq.startswith('NM_'):
                    print('Import error: refseq does not start with NM_:')
                    print(line)
                    print(refseq)
                    continue

                try:
                    # assertion failures in this section are reported by
                    # the handler below rather than aborting the import
                    assert cdna_mut.startswith('c')
                    cdna_ref, cdna_pos, cdna_alt = decode_mutation(cdna_mut)

                    assert prot_mut.startswith('p')
                    aa_ref, aa_pos, aa_alt = decode_mutation(prot_mut)

                    try:
                        # try to get it from cache (`proteins` dictionary)
                        protein = proteins[refseq]
                    except KeyError:
                        continue

                    # the amino acid position has to agree with the codon
                    # which contains the cDNA position
                    assert aa_pos == (int(cdna_pos) - 1) // 3 + 1

                    broken_sequence_tuple = is_sequence_broken(
                        protein, aa_pos, aa_ref, aa_alt
                    )

                    # mutations inconsistent with the sequence are not mapped
                    if broken_sequence_tuple:
                        continue

                    bdb_refseq.cached_add_integer(
                        genes[protein] + ' ' + aa_ref + str(aa_pos) + aa_alt,
                        protein.id
                    )
                except Exception as e:
                    # deliberate best-effort: report and go on with the
                    # next destination
                    print('Import error:')
                    print(e)