Example #1
0
    def parse(self, path):
        thousand_genomes_mutations = []
        duplicates = 0
        skipped = 0

        maf_keys = (
            'AF',
            'EAS_AF',
            'AMR_AF',
            'AFR_AF',
            'EUR_AF',
            'SAS_AF',
        )

        for line in read_from_gz_files(dirname(path),
                                       basename(path),
                                       skip_header=False):
            line = line.rstrip().split('\t')

            metadata = line[20].split(';')

            maf_data = make_metadata_ordered_dict(
                maf_keys, metadata, self.find_af_subfield_number(line))

            # ignore mutations with frequency equal to zero
            if maf_data['AF'] == '0':
                skipped += 1
                continue

            values = list(maf_data.values())

            for mutation_id in self.preparse_mutations(line):

                duplicated = self.look_after_duplicates(
                    mutation_id, thousand_genomes_mutations, values)
                if duplicated:
                    duplicates += 1
                    continue

                self.protect_from_duplicates(mutation_id,
                                             thousand_genomes_mutations)

                thousand_genomes_mutations.append((
                    mutation_id,
                    # Python 3.5 makes it easy:
                    # **values, but is not available
                    values[0],
                    values[1],
                    values[2],
                    values[3],
                    values[4],
                    values[5],
                ))

        print('%s duplicates found' % duplicates)
        print('%s zero-frequency mutations skipped' % skipped)

        return thousand_genomes_mutations
 def iterate_lines(self, path):
     for line in read_from_gz_files(dirname(path),
                                    basename(path),
                                    skip_header=False):
         yield line.rstrip().split('\t')
Example #3
0
def import_genome_proteome_mappings(
        proteins,
        mappings_dir='data/200616/all_variants/playground',
        mappings_file_pattern='annot_*.txt.gz',
        bdb_dir=''):
    print('Importing mappings:')

    chromosomes = get_human_chromosomes()
    broken_seq = defaultdict(list)

    bdb.reset()
    bdb.close()

    path = current_app.config['BDB_DNA_TO_PROTEIN_PATH']
    if bdb_dir:
        path = bdb_dir + '/' + basename(path)

    bdb.open(path, cache_size=20480 * 8 * 8 * 8 * 8)

    for line in read_from_gz_files(mappings_dir, mappings_file_pattern):
        try:
            chrom, pos, ref, alt, prot = line.rstrip().split('\t')
        except ValueError as e:
            print(e, line)
            continue

        assert chrom.startswith('chr')
        chrom = chrom[3:]

        assert chrom in chromosomes
        ref = ref.rstrip()

        # new Coding Sequence Variants to be added to those already
        # mapped from given `snv` (Single Nucleotide Variation)

        for dest in filter(bool, prot.split(',')):
            try:
                name, refseq, exon, cdna_mut, prot_mut = dest.split(':')
            except ValueError as e:
                print(e, line)
                continue
            assert refseq.startswith('NM_')
            # refseq = int(refseq[3:])
            # name and refseq are redundant with respect one to another

            assert exon.startswith('exon')
            exon = exon[4:]

            assert cdna_mut.startswith('c')
            try:
                cdna_ref, cdna_pos, cdna_alt = decode_mutation(cdna_mut)
            except ValueError as e:
                print(e, line)
                continue

            try:
                strand = determine_strand(ref, cdna_ref, alt, cdna_alt)
            except DataInconsistencyError as e:
                print(e, line)
                continue

            assert prot_mut.startswith('p')
            # we can check here if a given reference nuc is consistent
            # with the reference amino acid. For example cytosine in
            # reference implies that there should't be a methionine,
            # glutamic acid, lysine nor arginine. The same applies to
            # alternative nuc/aa and their combinations (having
            # references (nuc, aa): (G, K) and alt nuc C defines that
            # the alt aa has to be Asparagine (N) - no other is valid).
            # Note: it could be used to compress the data in memory too
            aa_ref, aa_pos, aa_alt = decode_mutation(prot_mut)

            try:
                # try to get it from cache (`proteins` dictionary)
                protein = proteins[refseq]
            except KeyError:
                continue

            assert aa_pos == (int(cdna_pos) - 1) // 3 + 1

            broken_sequence_tuple = is_sequence_broken(protein, aa_pos, aa_ref,
                                                       aa_alt)

            if broken_sequence_tuple:
                broken_seq[refseq].append(broken_sequence_tuple)
                continue

            is_ptm_related = protein.has_sites_in_range(aa_pos - 7, aa_pos + 7)

            snv = make_snv_key(chrom, pos, cdna_ref, cdna_alt)

            # add new item, emulating set update
            item = encode_csv(strand, aa_ref, aa_alt, cdna_pos, exon,
                              protein.id, is_ptm_related)

            bdb.add(snv, item)

    return broken_seq
Example #4
0
def import_aminoacid_mutation_refseq_mappings(
        proteins,
        mappings_dir='data/200616/all_variants/playground',
        mappings_file_pattern='annot_*.txt.gz',
        bdb_dir=''):
    print('Importing mappings:')

    chromosomes = get_human_chromosomes()

    bdb_refseq.reset()
    bdb_refseq.close()
    if bdb_dir:
        bdb_dir += '/'
    bdb_refseq.open(bdb_dir +
                    basename(current_app.config['BDB_GENE_TO_ISOFORM_PATH']))

    for line in read_from_gz_files(mappings_dir, mappings_file_pattern):
        try:
            chrom, pos, ref, alt, prot = line.rstrip().split('\t')
        except ValueError:
            print('Import error: not enough values for "tab" split')
            print(line)
            continue

        assert chrom.startswith('chr')
        chrom = chrom[3:]

        assert chrom in chromosomes

        for dest in filter(bool, prot.split(',')):
            try:
                name, refseq, exon, cdna_mut, prot_mut = dest.split(':')
            except ValueError:
                print('Import error: not enough values for ":" split')
                print(line, dest)
                continue

            assert refseq.startswith('NM_')

            assert cdna_mut.startswith('c')
            cdna_ref, cdna_pos, cdna_alt = decode_mutation(cdna_mut)

            assert prot_mut.startswith('p')

            aa_ref, aa_pos, aa_alt = decode_mutation(prot_mut)

            try:
                # try to get it from cache (`proteins` dictionary)
                protein = proteins[refseq]
            except KeyError:
                continue

            assert aa_pos == (int(cdna_pos) - 1) // 3 + 1

            broken_sequence_tuple = is_sequence_broken(protein, aa_pos, aa_ref,
                                                       aa_alt)

            if broken_sequence_tuple:
                continue

            key = protein.gene.name + ' ' + aa_ref + str(aa_pos) + aa_alt
            bdb_refseq.add(key, refseq)
Example #5
0
def import_aminoacid_mutation_refseq_mappings(
        proteins: Dict[str, Protein],
        mappings_dir='data/200616/all_variants/playground',
        mappings_file_pattern='annot_*.txt.gz',
        bdb_dir=''):
    print('Importing mappings:')

    chromosomes = get_human_chromosomes()

    bdb_refseq.reset()
    bdb_refseq.close()

    path = current_app.config['BDB_GENE_TO_ISOFORM_PATH']

    if bdb_dir:
        path = bdb_dir + '/' + basename(path)

    bdb_refseq.open(path, cache_size=20480 * 8 * 8 * 8 * 8)

    genes = {protein: protein.gene.name for protein in proteins.values()}

    with bdb_refseq.cached_session(overwrite_db_values=True):
        for line in read_from_gz_files(mappings_dir,
                                       mappings_file_pattern,
                                       after_batch=bdb_refseq.flush_cache):
            try:
                chrom, pos, ref, alt, prot = line.rstrip().split('\t')
            except ValueError:
                print('Import error: not enough values for "tab" split')
                print(line)
                continue

            assert chrom.startswith('chr')
            chrom = chrom[3:]

            assert chrom in chromosomes

            for dest in filter(bool, prot.split(',')):
                try:
                    name, refseq, exon, cdna_mut, prot_mut = dest.split(':')
                except ValueError:
                    print('Import error: not enough values for ":" split')
                    print(line)
                    print(dest)
                    continue

                try:
                    assert refseq.startswith('NM_')
                except AssertionError:
                    print(f'Import error: refseq does not start with NM_:')
                    print(line)
                    print(refseq)
                    continue

                try:
                    assert cdna_mut.startswith('c')
                    cdna_ref, cdna_pos, cdna_alt = decode_mutation(cdna_mut)

                    assert prot_mut.startswith('p')

                    aa_ref, aa_pos, aa_alt = decode_mutation(prot_mut)

                    try:
                        # try to get it from cache (`proteins` dictionary)
                        protein = proteins[refseq]
                    except KeyError:
                        continue

                    assert aa_pos == (int(cdna_pos) - 1) // 3 + 1

                    broken_sequence_tuple = is_sequence_broken(
                        protein, aa_pos, aa_ref, aa_alt)

                    if broken_sequence_tuple:
                        continue

                    bdb_refseq.cached_add_integer(
                        genes[protein] + ' ' + aa_ref + str(aa_pos) + aa_alt,
                        protein.id)
                except Exception as e:
                    print(f'Import error:')
                    print(e)