def make_df(self, tax_id = False):
    """
    Build the ``df`` attribute: a ``pandas.DataFrame`` with one row for
    each item obtained by iterating over this object.

    Every row comes from the item's ``get_line()`` method. The identifier
    and residue columns are converted to compact dtypes (``category``,
    and ``int32`` for ``residue_offset``), while ``sources`` and
    ``references`` keep the dtype pandas infers. Two gene symbol columns,
    translated from the ``enzyme`` and ``substrate`` columns, are placed
    right after those two columns.

    :param bool tax_id:
        If true, append an ``ncbi_tax_id`` column that repeats
        ``self.ncbi_tax_id`` on every row.
    """

    def genesymbols_of(uniprots):
        # Identifiers for which the mapping gives a falsy result are
        # represented by an empty string.
        return pd.Series([
            mapping.map_name0(
                uniprot,
                id_type = 'uniprot',
                target_id_type = 'genesymbol',
                ncbi_tax_id = self.ncbi_tax_id,
            ) or ''
            for uniprot in uniprots
        ])

    base_columns = [
        'enzyme',
        'substrate',
        'isoforms',
        'residue_type',
        'residue_offset',
        'modification',
        'sources',
        'references',
    ]
    dtypes = {
        'enzyme': 'category',
        'substrate': 'category',
        'isoforms': 'category',
        'residue_type': 'category',
        'residue_offset': 'int32',
        'modification': 'category',
    }

    records = [dm.get_line() for dm in self]
    self.df = pd.DataFrame(records, columns = base_columns).astype(dtypes)

    self.df['enzyme_genesymbol'] = genesymbols_of(self.df.enzyme)
    self.df['substrate_genesymbol'] = genesymbols_of(self.df.substrate)

    # The gene symbols go directly after the enzyme and substrate columns.
    column_order = (
        base_columns[:2] +
        ['enzyme_genesymbol', 'substrate_genesymbol'] +
        base_columns[2:]
    )
    self.df = self.df.loc[:, column_order]

    if tax_id:
        self.df['ncbi_tax_id'] = [self.ncbi_tax_id] * self.df.shape[0]
def build_gene(self):
    """
    Fill ``self.gene`` with ``CellPhoneDBGene`` records.

    For every entity in ``self._entities`` (or, when the entity has a
    ``components`` attribute, for every one of its components) one record
    is created per Ensembl identifier returned by the ID mapping. The
    gene symbol is used both as ``gene_name`` and ``hgnc_symbol``.
    Components without any Ensembl identifier yield no record.
    """

    self.gene = set()

    for entity in self._entities:

        # Entities carrying components (complexes) are expanded to their
        # members; it is not certain this is required, but it is harmless.
        members = getattr(entity, 'components', (entity,))

        for member in members:

            symbol = mapping.map_name0(member, 'uniprot', 'genesymbol')

            self.gene.update(
                CellPhoneDBGene(
                    gene_name=symbol,
                    uniprot=member,
                    hgnc_symbol=symbol,
                    ensembl=ensembl_id,
                )
                for ensembl_id in
                mapping.map_name(member, 'uniprot', 'ensembl')
            )
def stoichiometry_str_genesymbols(self):
    """
    Return the components as a semicolon separated string of gene symbols.

    Components are visited in the sort order of their keys in
    ``self.components``, and each one is repeated as many times as its
    associated count. A component whose gene symbol lookup gives a falsy
    result is represented by its original key.
    """

    ordered = sorted(
        iteritems(self.components),
        key=lambda item: item[0],
    )

    labels = []

    for uniprot, count in ordered:

        label = (
            mapping.map_name0(uniprot, 'uniprot', 'genesymbol') or
            uniprot
        )
        # Repeat the label once for every copy of the component.
        labels.extend((label,) * count)

    return ';'.join(labels)
def get_id_name(entity):
    """
    Return an ``(identifier, name)`` pair for an entity.

    The identifier is the string form of the entity. If it contains
    ``COMPLEX`` the identifier itself serves as the name; otherwise the
    name is the result of translating the identifier from ``uniprot`` to
    ``uniprot-entry``.
    """

    identifier = str(entity)

    if 'COMPLEX' in identifier:
        return identifier, identifier

    return identifier, mapping.map_name0(
        identifier,
        'uniprot',
        'uniprot-entry',
    )
def build_protein(self):
    """
    Fill ``self.cpdb_protein`` with ``CellPhoneDBProtein`` records.

    One record is created for every entity in ``self._entities`` or,
    when the entity has a ``components`` attribute, for every one of its
    components. The boolean fields tell whether the corresponding label
    is among the classes reported by ``self.intercell`` for the
    component; ``integrin`` tells whether the component is contained in
    the ``Integrins`` annotation. All description and tag fields are
    left as empty strings.
    """

    integrin_annot = annot.db.annots['Integrins']
    self.cpdb_protein = set()

    for entity in self._entities:

        # Entities carrying components (complexes) are expanded to their
        # members; it is not certain this is required, but it is harmless.
        members = getattr(entity, 'components', (entity,))

        for member in members:

            member_classes = self.intercell.classes_by_entity(member)
            uniprot_id = str(member)
            entry_name = mapping.map_name0(
                member,
                'uniprot',
                'uniprot-entry',
            )

            record = CellPhoneDBProtein(
                uniprot = uniprot_id,
                protein_name = entry_name,
                transmembrane = 'transmembrane' in member_classes,
                peripheral = 'cell_surface' in member_classes,
                secreted = 'secreted' in member_classes,
                secreted_desc = '',
                secreted_highlight = '',
                receptor = 'receptor' in member_classes,
                receptor_desc = '',
                integrin = member in integrin_annot,
                other = '',
                other_desc = '',
                tags = '',
                tags_reason = '',
                tags_description = '',
            )

            self.cpdb_protein.add(record)
def genesymbols(self):
    """
    Return the sorted list of gene symbols of the components.

    Each key of ``self.components`` is translated from ``uniprot`` to
    ``genesymbol``; a key whose lookup gives a falsy result is kept as
    it is.
    """

    symbols = [
        mapping.map_name0(uniprot, 'uniprot', 'genesymbol') or uniprot
        for uniprot in self.components
    ]
    symbols.sort()

    return symbols