Exemple #1
0
def test_encode_csv():
    """Check `encode_csv` output against known-good encoded strings."""
    # Each key is the positional argument tuple passed to `encode_csv`:
    # (strand, ref, alt, cdna_pos, exon, protein_id, is_ptm);
    # each value is the exact string the encoder is expected to return.
    expected_by_arguments = {
        ('+', 'R', 'H', 204, 'exon1', 123, False): '+RH0cc:exon1:7b',
        ('-', 'R', 'H', 204, 'exon1', 123, True): '-RH1cc:exon1:7b',
    }
    for arguments, expected in expected_by_arguments.items():
        assert genomic_mappings.encode_csv(*arguments) == expected
Exemple #2
0
def import_genome_proteome_mappings(
        proteins,
        mappings_dir='data/200616/all_variants/playground',
        mappings_file_pattern='annot_*.txt.gz',
        bdb_dir=''):
    """Rebuild the DNA -> protein mappings database from annotation files.

    The `bdb` database is reset, closed and re-opened, and then filled
    with one encoded item per (genomic variant, protein mutation) pair
    read from the gzipped annotation files.

    Each input line is expected to hold five tab-separated fields:
    `chrom, pos, ref, alt, prot`, where `prot` is a comma-separated list
    of `name:refseq:exon:cdna_mut:prot_mut` destinations.

    Lines or destinations which cannot be parsed (wrong number of fields,
    undecodable cDNA/protein mutation, inconsistent strand) are printed
    and skipped; destinations whose refseq is absent from `proteins` are
    skipped silently.

    Args:
        proteins: mapping of refseq identifier (e.g. 'NM_...') to
            a protein object; the object has to provide `id` and
            `has_sites_in_range(start, end)`.
        mappings_dir: directory passed to `read_from_gz_files`.
        mappings_file_pattern: file name pattern passed to
            `read_from_gz_files`.
        bdb_dir: optional directory for the database file; when given,
            the file name from `BDB_DNA_TO_PROTEIN_PATH` is kept but
            the file is placed in `bdb_dir`.

    Returns:
        A `defaultdict(list)` mapping refseq identifiers to the list of
        truthy results returned by `is_sequence_broken` for mutations
        of that refseq; such mutations are not added to the database.

    Raises:
        AssertionError: if a successfully split record violates
            the expected format (chromosome without 'chr' prefix or
            not among known chromosomes, refseq not starting with 'NM_',
            etc.) or if the amino acid position does not agree with
            the cDNA position. Not raised when asserts are disabled.
    """
    print('Importing mappings:')

    chromosomes = get_human_chromosomes()
    broken_seq = defaultdict(list)

    # start from an empty database
    bdb.reset()
    bdb.close()

    path = current_app.config['BDB_DNA_TO_PROTEIN_PATH']
    if bdb_dir:
        # keep the configured file name, change only the directory
        path = bdb_dir + '/' + basename(path)

    # NOTE(review): the cache size is hard-coded; its unit is defined by
    # `bdb.open`, which is not visible from this function - confirm.
    bdb.open(path, cache_size=20480 * 8 * 8 * 8 * 8)

    for line in read_from_gz_files(mappings_dir, mappings_file_pattern):
        try:
            chrom, pos, ref, alt, prot = line.rstrip().split('\t')
        except ValueError as e:
            print(e, line)
            continue

        assert chrom.startswith('chr')
        chrom = chrom[3:]

        assert chrom in chromosomes
        ref = ref.rstrip()

        # new Coding Sequence Variants to be added to those already
        # mapped from given `snv` (Single Nucleotide Variation)

        # `filter(bool, ...)` skips empty entries (e.g. trailing comma)
        for dest in filter(bool, prot.split(',')):
            try:
                name, refseq, exon, cdna_mut, prot_mut = dest.split(':')
            except ValueError as e:
                print(e, line)
                continue
            assert refseq.startswith('NM_')
            # name and refseq are redundant with respect one to another

            assert exon.startswith('exon')
            exon = exon[4:]

            assert cdna_mut.startswith('c')
            try:
                cdna_ref, cdna_pos, cdna_alt = decode_mutation(cdna_mut)
            except ValueError as e:
                print(e, line)
                continue

            try:
                strand = determine_strand(ref, cdna_ref, alt, cdna_alt)
            except DataInconsistencyError as e:
                print(e, line)
                continue

            assert prot_mut.startswith('p')
            # we can check here if a given reference nuc is consistent
            # with the reference amino acid. For example cytosine in
            # reference implies that there shouldn't be a methionine,
            # glutamic acid, lysine nor arginine. The same applies to
            # alternative nuc/aa and their combinations (having
            # references (nuc, aa): (G, K) and alt nuc C defines that
            # the alt aa has to be Asparagine (N) - no other is valid).
            # Note: it could be used to compress the data in memory too
            try:
                aa_ref, aa_pos, aa_alt = decode_mutation(prot_mut)
            except ValueError as e:
                # a malformed protein mutation is reported and skipped,
                # just like a malformed cDNA mutation above, instead of
                # aborting the import with a half-filled database
                print(e, line)
                continue

            try:
                # try to get it from cache (`proteins` dictionary)
                protein = proteins[refseq]
            except KeyError:
                continue

            # each codon (three cDNA positions) encodes one amino acid
            assert aa_pos == (int(cdna_pos) - 1) // 3 + 1

            broken_sequence_tuple = is_sequence_broken(protein, aa_pos, aa_ref,
                                                       aa_alt)

            if broken_sequence_tuple:
                # report the mismatch to the caller, do not store it
                broken_seq[refseq].append(broken_sequence_tuple)
                continue

            # is there any site within 7 positions of the mutation?
            is_ptm_related = protein.has_sites_in_range(aa_pos - 7, aa_pos + 7)

            snv = make_snv_key(chrom, pos, cdna_ref, cdna_alt)

            # add new item, emulating set update
            item = encode_csv(strand, aa_ref, aa_alt, cdna_pos, exon,
                              protein.id, is_ptm_related)

            bdb.add(snv, item)

    return broken_seq