Example #1
    def test_edge_cases_mapping(self):

        gene_t = Gene(
            name='T',
            isoforms=[
                #                                 123456789
                Protein(refseq='NM_01', sequence='AXAXAYAYA'),
                # C-terminal part was trimmed
                Protein(refseq='NM_02', sequence='AXAXA'),
                # N-terminal part was trimmed
                Protein(refseq='NM_03', sequence='AYAYA'),
            ])
        db.session.add(gene_t)
        db.session.commit()

        mapper = SiteMapper(create_key_model_dict(Protein, 'refseq'),
                            lambda s: f'{s.position}{s.residue}')

        # all sites are defined in NM_01; the idea is to test mapping
        # of sites that sit at the very edges of the sequence
        sites = DataFrame.from_dict(data={
            'site at N-terminus edge': ('T', 'NM_01', 1, '^AX', 'A', 2),
            'site at C-terminus edge': ('T', 'NM_01', 9, 'YA$', 'A', 2),
        },
                                    orient='index')

        sites.columns = [
            'gene', 'refseq', 'position', 'sequence', 'residue',
            'left_sequence_offset'
        ]

        mapped_sites = mapper.map_sites_by_sequence(sites)

        assert len(mapped_sites) == 4
Example #2
def kinase_classification(path='data/regphos_kinome_scraped_clean.txt'):

    known_kinases = create_key_model_dict(Kinase, 'name', True)
    known_groups = create_key_model_dict(KinaseGroup, 'name', True)

    new_groups = []

    print('Loading protein kinase groups:')

    header = [
        'No.', 'Kinase', 'Group', 'Family', 'Subfamily', 'Gene.Symbol',
        'gene.clean', 'Description', 'group.clean'
    ]

    def parser(line):

        # note that the subfamily is often absent
        group, family, subfamily = line[2:5]

        # the 'gene.clean' column [6] better matches the kinase names
        # used in all the other data files
        kinase_name = line[6]

        # 'group.clean' is not atomic and is redundant with respect to
        # family and subfamily. This check ensures that, in case of a change,
        # the maintainer will be able to spot the inconsistency easily
        clean = family + '_' + subfamily if subfamily else family
        assert line[8] == clean

        if kinase_name.lower() not in known_kinases:
            kinase = Kinase(name=kinase_name,
                            protein=get_preferred_gene_isoform(kinase_name))
            known_kinases[kinase_name.lower()] = kinase

        # the 'family' here corresponds to 'group' in all the other files
        if family.lower() not in known_groups:
            group = KinaseGroup(name=family)
            known_groups[family.lower()] = group
            new_groups.append(group)

        known_groups[family.lower()].kinases.append(
            known_kinases[kinase_name.lower()])

    parse_tsv_file(path, parser, header)

    return new_groups
Example #3
    def test_mapping(self):

        gene_a = Gene(
            name='A',
            isoforms=[
                # the full isoform of gene A
                Protein(refseq='NM_01', sequence='AAAAAAAAAXAA'),
                # a trimmed isoform of gene A
                Protein(refseq='NM_02', sequence='AAAXAA'),
            ])
        gene_b = Gene(name='B',
                      isoforms=[
                          Protein(refseq='NM_03', sequence='BBBBBBBBBYBB'),
                          Protein(refseq='NM_04', sequence='BBBYBB'),
                      ])
        db.session.add_all([gene_a, gene_b])

        # whoops, NM_03 has been accidentally removed (!)
        db.session.delete(Protein.query.filter_by(refseq='NM_03').one())
        db.session.commit()

        mapper = SiteMapper(create_key_model_dict(Protein, 'refseq'),
                            lambda s: f'{s.position}{s.residue}')

        sites = DataFrame.from_dict(data={
            'good site A': ('A', 'NM_01', 10, 'AXA', 'X', 1),
            'lost isoform': ('B', 'NM_03', 10, 'BYB', 'Y', 1)
        },
                                    orient='index')

        sites.columns = [
            'gene', 'refseq', 'position', 'sequence', 'residue',
            'left_sequence_offset'
        ]

        mapped_sites = mapper.map_sites_by_sequence(sites)

        sites_by_isoform = group_by_isoform(mapped_sites)

        # one from NM_01 (defined), one from NM_02 (mapped), one from NM_04 (mapped)
        assert len(mapped_sites) == 3
        assert set(sites_by_isoform) == {'NM_01', 'NM_02', 'NM_04'}

        assert sites_by_isoform['NM_01'].residue == sites_by_isoform[
            'NM_02'].residue == 'X'
        assert sites_by_isoform['NM_01'].position == 10
        assert sites_by_isoform['NM_02'].position == 4

        assert sites_by_isoform['NM_04'].residue == 'Y'
        assert sites_by_isoform['NM_04'].position == 4

        # will the mapping to NM_02 still work if we remove the 'gene' column?
        sites.drop(columns=['gene'], inplace=True)
        mapped_sites = mapper.map_sites_by_sequence(sites)
        sites_by_isoform = group_by_isoform(mapped_sites)

        assert len(mapped_sites) == 2
        assert set(sites_by_isoform) == {'NM_01', 'NM_02'}
Example #4
def pathways(path='data/hsapiens.pathways.NAME.gmt'):
    """Loads pathways from given '.gmt' file.

    New genes may be created and should automatically be added
    to the session with pathways as those have a relationship.
    """
    known_genes = create_key_model_dict(Gene, 'name', lowercase=True)

    pathways = []
    new_genes = []

    def parser(data):
        """Parse GTM file with pathway descriptions.

        Args:
            data: a list of subsequent columns from a single line of a GMT file

                For example::

                    ['CORUM:5419', 'HTR1A-GPR26 complex', 'GPR26', 'HTR1A']

        """
        gene_set_name = data[0]
        # Entry description can be empty
        entry_description = data[1].strip()

        entry_gene_names = [name.strip() for name in data[2:]]

        pathway_genes = []

        for gene_name in entry_gene_names:
            name_lower = gene_name.lower()
            if name_lower in known_genes:
                gene = known_genes[name_lower]
            else:
                gene = Gene(name=gene_name)
                known_genes[name_lower] = gene
                new_genes.append(gene)

            pathway_genes.append(gene)

        pathway = Pathway(description=entry_description, genes=pathway_genes)

        if gene_set_name.startswith('GO'):
            pathway.gene_ontology = int(gene_set_name[3:])
        elif gene_set_name.startswith('REAC'):
            pathway.reactome = int(gene_set_name[5:])
        else:
            raise Exception('Unknown gene set name: "%s"' % gene_set_name)

        pathways.append(pathway)

    parse_tsv_file(path, parser)

    print(len(new_genes), 'new genes created')

    return pathways
Example #5
    def proteins(self):
        """Allows for lazy fetching of proteins.refseq -> protein

        as not all uses of importer require proteins in place.
        """
        if self._proteins:
            return self._proteins

        return create_key_model_dict(Protein,
                                     'refseq',
                                     options=load_only('refseq', 'sequence',
                                                       'id'))
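The options=load_only(...) argument above passes SQLAlchemy loader options through to the underlying query, so only the listed columns are fetched eagerly. A minimal standalone illustration of the same idea, using the string-based load_only signature seen throughout these examples (SQLAlchemy 1.x style) and independent of the project's own helpers:

from sqlalchemy.orm import load_only

# fetch Protein rows but populate only refseq, sequence and id;
# all other columns stay deferred until first attribute access
lightweight_proteins = Protein.query.options(
    load_only('refseq', 'sequence', 'id')
).all()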
Example #6
    def __init__(self):

        print(f'Preparing {self.source_name} sites importer...')

        self.issues_counter = Counter()
        # caching proteins and kinases allows for much faster
        # import later on, though it takes some time to cache
        self.known_kinases = create_key_model_dict(Kinase,
                                                   'name',
                                                   lowercase=True)
        self.known_groups = create_key_model_dict(KinaseGroup,
                                                  'name',
                                                  lowercase=True)
        self.known_sites = create_key_model_dict(
            Site, ['protein_id', 'position', 'residue'],
            options=(joinedload(Site.sources).joinedload('*')))
        self.proteins = create_key_model_dict(
            Protein,
            'refseq',
            options=(load_only('refseq', 'sequence', 'id').joinedload(
                Protein.gene).joinedload(Gene.isoforms).load_only('refseq')))

        # create site types
        site_type_objects = [
            get_or_create(SiteType, name=name) for name in set(self.site_types)
        ]

        self.novel_site_types = [
            site_type for site_type, new in site_type_objects if new
        ]
        self.site_types_map = {
            site_type.name: site_type
            for site_type, new in site_type_objects
        }

        self.source, _ = get_or_create(SiteSource, name=self.source_name)

        print(f'{self.source_name} importer ready.')
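The site type block above unpacks (object, created) tuples returned by get_or_create. That helper is not shown in these examples; a minimal sketch of the conventional pattern it presumably follows, reusing the db.session seen in the other examples (the project's actual helper may differ):

def get_or_create(model, **filters):
    # return (instance, True) if a new object had to be created,
    # or (instance, False) if a matching row already existed
    instance = model.query.filter_by(**filters).first()
    if instance:
        return instance, False
    instance = model(**filters)
    db.session.add(instance)
    return instance, True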
Example #7
def domains_types(path='data/interpro.xml.gz'):
    from xml.etree import ElementTree
    import gzip

    print('Loading extended InterPro annotations:')

    domains = create_key_model_dict(InterproDomain, 'accession')

    with gzip.open(path) as interpro_file:
        tree = ElementTree.parse(interpro_file)

    entries = tree.getroot().findall('interpro')

    for entry in tqdm(entries):
        try:
            domain = domains[entry.get('id')]
        except KeyError:
            continue
        domain.type = entry.get('type')
Example #8
def kinase_mappings(path='data/curated_kinase_IDs.txt'):
    """Create kinases from `kinase_name gene_name` mappings.

    For each kinase, the preferred isoform of the given gene will be used.

    If the given kinase is already in the database and has an isoform
    associated, the association will be superseded by the new one.

    Returns:
        list of created kinases
    """
    known_kinases = create_key_model_dict(Kinase, 'name')

    new_kinases = []

    def parser(line):
        kinase_name, gene_name = line
        protein = get_preferred_gene_isoform(gene_name)

        if not protein:
            print('No isoform for %s kinase mapped to %s gene!' %
                  (kinase_name, gene_name))
            return

        if kinase_name in known_kinases:
            kinase = known_kinases[kinase_name]
            if kinase.protein and kinase.protein != protein:

                print('Overriding kinase-protein association for '
                      '%s kinase. Old isoform: %s; new isoform: %s.' %
                      (kinase_name, kinase.protein.refseq, protein.refseq))
            kinase.protein = protein

        else:
            new_kinases.append(Kinase(name=kinase_name, protein=protein))

    parse_tsv_file(path, parser)

    return new_kinases
Example #9
    def __init__(self, proteins, repr_site):
        self.proteins = proteins
        self.repr_site = repr_site
        self.genes = create_key_model_dict(Gene, 'name')
        self.has_gene_names = None
        self.already_warned = None
Example #10
def domains(path='data/biomart_protein_domains_20072016.txt'):
    proteins = get_proteins()

    print('Loading domains:')

    interpro_domains = create_key_model_dict(InterproDomain, 'accession')
    new_domains = []

    skipped = 0
    wrong_length = 0
    not_matching_chrom = []

    header = [
        'Ensembl Gene ID', 'Ensembl Transcript ID', 'Ensembl Protein ID',
        'Chromosome Name', 'Gene Start (bp)', 'Gene End (bp)',
        'RefSeq mRNA [e.g. NM_001195597]', 'Interpro ID',
        'Interpro Short Description', 'Interpro Description', 'Interpro end',
        'Interpro start'
    ]

    def parser(line):

        nonlocal skipped, wrong_length, not_matching_chrom

        try:
            protein = proteins[line[6]]  # by refseq
        except KeyError:
            skipped += 1
            return

        # If there is no data about the domains, skip this record
        if len(line) == 7:
            return

        try:
            assert len(line) == 12
        except AssertionError:
            print(line, len(line))

        # the start coordinate should be lower than the end coordinate
        assert int(line[11]) < int(line[10])

        accession = line[7]

        # according to:
        # http://www.ncbi.nlm.nih.gov/pmc/articles/PMC29841/#__sec2title
        assert accession.startswith('IPR')

        start, end = int(line[11]), int(line[10])

        # TODO: the assertion fails for some domains: what to do?
        # assert end <= protein.length
        if end > protein.length:
            wrong_length += 1

        if line[3] != protein.gene.chrom:
            skipped += 1
            not_matching_chrom.append(line)
            return

        if accession not in interpro_domains:

            interpro = InterproDomain(
                accession=line[7],  # Interpro Accession
                short_description=line[8],  # Interpro Short Description
                description=line[9],  # Interpro Description
            )

            interpro_domains[accession] = interpro

        interpro = interpro_domains[accession]

        similar_domains = [
            # select similar domain occurrences with criteria being:
            domain for domain in protein.domains
            # - the same interpro id
            if domain.interpro == interpro and
            # - at least 75% of common coverage for shorter occurrence of domain
            ((min(domain.end, end) - max(domain.start, start)) /
             min(len(domain), end - start) > 0.75)
        ]

        if similar_domains:
            try:
                assert len(similar_domains) == 1
            except AssertionError:
                print(similar_domains)
            domain = similar_domains[0]

            domain.start = min(domain.start, start)
            domain.end = max(domain.end, end)
        else:

            domain = Domain(interpro=interpro,
                            protein=protein,
                            start=start,
                            end=end)
            new_domains.append(domain)

    parse_tsv_file(path, parser, header)

    print('Domains loaded,', skipped, 'proteins skipped.',
          'Domains exceeding proteins length:', wrong_length,
          'Domains skipped due to not matching chromosomes:',
          len(not_matching_chrom))
    return new_domains
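To make the 75% coverage criterion used above for merging domain occurrences concrete, a small worked example with purely hypothetical coordinates:

# two hypothetical occurrences of the same InterPro domain on one protein
existing_start, existing_end = 10, 110   # existing occurrence, length 100
new_start, new_end = 50, 130             # incoming occurrence, length 80

overlap = min(existing_end, new_end) - max(existing_start, new_start)  # 60
shorter = min(existing_end - existing_start, new_end - new_start)      # 80

# 60 / 80 == 0.75 is not greater than 0.75, so these two occurrences
# would be kept as separate Domain records rather than merged
print(overlap / shorter > 0.75)  # False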
Example #11
def proteins_and_genes(path='data/protein_data.tsv'):
    """Create proteins and genes based on data in a given file.

    If protein/gene already exists it will be skipped.

    Returns:
        list of created (new) proteins
    """
    # TODO where does the tsv file come from?
    print('Creating proteins and genes:')

    genes = create_key_model_dict(Gene, 'name', lowercase=True)
    known_proteins = get_proteins()

    proteins = {}

    coordinates_to_save = [('txStart', 'tx_start'), ('txEnd', 'tx_end'),
                           ('cdsStart', 'cds_start'), ('cdsEnd', 'cds_end')]

    allowed_strands = ['+', '-']

    # a list storing refseq ids which occur at least twice in the file
    with_duplicates = []
    potentially_empty_genes = set()

    header = [
        'bin', 'name', 'chrom', 'strand', 'txStart', 'txEnd', 'cdsStart',
        'cdsEnd', 'exonCount', 'exonStarts', 'exonEnds', 'score', 'name2',
        'cdsStartStat', 'cdsEndStat', 'exonFrames'
    ]

    columns = tuple(header.index(x[0]) for x in coordinates_to_save)
    coordinates_names = [x[1] for x in coordinates_to_save]

    def parser(line):

        # use name2 (fourth column from the end)
        name = line[-4]

        strand = line[3]
        assert strand in allowed_strands

        gene_data = {
            'name': name,
            'chrom': line[2][3:],  # remove chr prefix
            'strand': strand == '+'  # True for the forward strand
        }

        if name.lower() not in genes:
            gene = Gene(**gene_data)
            genes[name.lower()] = gene
        else:
            gene = genes[name.lower()]
            for key, value in gene_data.items():
                previous = getattr(gene, key)
                if previous != value:
                    print(
                        f'Replacing {gene} {key} with {value} (previously: {previous})'
                    )
                    setattr(gene, key, value)

        # load protein
        refseq = line[1]

        # if the protein is already in the database, no action is required
        if refseq in known_proteins:
            return

        # do not allow duplicates
        if refseq in proteins:

            with_duplicates.append(refseq)
            potentially_empty_genes.add(gene)
            """
            if gene.chrom in ('X', 'Y'):
                # close an eye for pseudoautosomal regions
                print(
                    'Skipping duplicated entry (probably belonging',
                    'to pseudoautosomal region) with refseq:', refseq
                )
            else:
                # warn about other duplicated records
                print(
                    'Skipping duplicated entry with refseq:', refseq
                )
            """
            return

        # from this point on, duplicates must not occur
        assert refseq not in proteins

        protein_data = {'refseq': refseq, 'gene': gene}

        coordinates = zip(
            coordinates_names,
            [int(value) for i, value in enumerate(line) if i in columns])
        protein_data.update(coordinates)

        proteins[refseq] = Protein(**protein_data)

    parse_tsv_file(path, parser, header)

    cnt = sum(map(lambda g: len(g.isoforms) == 1, potentially_empty_genes))
    print('Genes with duplicated entries that have only a single isoform:', cnt)
    print('Duplicated rows detected:', len(with_duplicates))
    return proteins.values()
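All of the examples above revolve around create_key_model_dict, which is not itself shown on this page. A minimal sketch of what such a helper might look like, inferred only from the call sites (single or composite key attributes, optional lowercasing, optional SQLAlchemy loader options); the project's actual implementation may differ:

def create_key_model_dict(model, key, lowercase=False, options=None):
    """Build a dict mapping key value(s) -> instance for all rows of `model`."""
    query = model.query
    if options is not None:
        query = query.options(options)

    def make_key(instance):
        if isinstance(key, str):
            value = getattr(instance, key)
            return value.lower() if lowercase and value else value
        # a list of attribute names yields a composite (tuple) key
        return tuple(getattr(instance, attribute) for attribute in key)

    return {make_key(instance): instance for instance in query}

Under this sketch, create_key_model_dict(Kinase, 'name', True) keys the dictionary by lowercased kinase names, which matches lookups such as known_kinases[kinase_name.lower()] in the examples above.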