def compress(taxid=9606, target='../gpickle'):
    """
    Re-save every pickled protein of one taxon through ``ProteinCore.gdump``.

    Every entry found in ``<global_settings.pickle_folder>/taxid<taxid>`` is
    loaded with ``ProteinCore().load`` and dumped into ``target`` under the
    same base name with a ``.gzp`` extension (presumably a gzipped pickle --
    the format is decided by ``gdump``, not here).

    :param taxid: taxon id selecting the ``taxid<taxid>`` source folder.
    :param target: output folder; created, with missing parents, if absent.
    :return: None
    """
    # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: no
    # check-then-create race and no failure when the parent is missing.
    os.makedirs(target, exist_ok=True)
    source = os.path.join(global_settings.pickle_folder, f'taxid{taxid}')
    for pf in os.listdir(source):
        p = ProteinCore().load(file=os.path.join(source, pf))
        p.gdump(file=os.path.join(target, os.path.splitext(pf)[0] + '.gzp'))
def compress(
        taxid=9606,
        target='/home/matteo/Coding/Michelanglo/MichelaNGLo-human-protein-data/gpickle'
):
    """
    Re-save every pickled protein of one taxon through ``ProteinCore.gdump``.

    Every entry found in ``<global_settings.pickle_folder>/taxid<taxid>`` is
    loaded with ``ProteinCore().load`` and dumped into ``target`` under the
    same base name with a ``.gzp`` extension.

    :param taxid: taxon id selecting the ``taxid<taxid>`` source folder.
    :param target: output folder; created, with missing parents, if absent.
        NOTE(review): the default is a machine-specific absolute path --
        callers on other machines should pass ``target`` explicitly.
    :return: None
    """
    # A single makedirs(exist_ok=True) call instead of exists() + mkdir():
    # it cannot race with another process creating the folder and it also
    # builds intermediate folders.
    os.makedirs(target, exist_ok=True)
    source = os.path.join(global_settings.pickle_folder, f'taxid{taxid}')
    for pf in os.listdir(source):
        p = ProteinCore().load(file=os.path.join(source, pf))
        p.gdump(file=os.path.join(target, os.path.splitext(pf)[0] + '.gzp'))
def inspect_offsets(uniprot):
    """
    Dump offset-related details of every structure of a human protein to stdout.

    For each entry of ``.pdbs`` on the loaded protein this prints, in order,
    its ``code``, its ``chain_definitions`` and whatever ``_get_sifts()``
    returns.

    :param uniprot: UniProt accession handed to ``ProteinCore`` (taxid 9606).
    :return: None
    """
    print('***************** inspect_offsets *******************************')
    protein = ProteinCore(taxid='9606', uniprot=uniprot).load()
    for structure in protein.pdbs:
        print(structure.code)
        print(structure.chain_definitions)
        # fetched only after the two prints above, as in the original order
        sifts = structure._get_sifts()
        print(sifts)
def random_view(request):
    """
    Pick a random human protein that has a structure and a random mutation in it.

    A gene name is drawn from ``human``; its protein is loaded and one model is
    drawn from ``protein.pdbs`` or, failing that, ``protein.swissmodel``.
    Proteins with neither are skipped and the draw is repeated. A residue
    number is then drawn from the model's ``x``..``y`` range and paired with a
    replacement residue.

    :param request: the incoming request (not read by this view).
    :return: dict with keys ``name``, ``uniprot``, ``taxid``, ``species`` and
        ``mutation`` (formatted as ``p.<from><position><to>``).
    """
    while True:
        name = random.choice(list(human.keys()))
        uniprot = human[name]
        protein = ProteinCore(taxid=9606, uniprot=uniprot).load()
        if protein.pdbs:
            pdb = random.choice(protein.pdbs)
        elif protein.swissmodel:
            pdb = random.choice(protein.swissmodel)
        else:
            continue
        i = random.randint(pdb.x, pdb.y)
        try:
            if i < 1:
                # sequence[i - 1] would silently wrap around to the end of
                # the sequence for i <= 0 and yield a nonsensical mutation.
                raise IndexError(f'residue number {i} is not a valid 1-based position')
            from_resn = protein.sequence[i - 1]
            # The replacement must differ from the original residue and must
            # not be a stop ('*'), otherwise the "mutation" is a no-op or a
            # truncation. sorted() keeps the draw reproducible under a seed.
            to_resn = random.choice(
                sorted(set(Mutation.aa_list) - {'*', from_resn}))
            return {
                'name': name,
                'uniprot': uniprot,
                'taxid': '9606',
                'species': 'human',
                'mutation': f'p.{from_resn}{i}{to_resn}'
            }
        except IndexError:
            log.error(
                f'Impossible... pdb.x out of bounds in unicode for gene {uniprot}'
            )
            continue
def random_view(self):
    """
    Return a random human protein/mutation pair for which a model exists.

    Keeps drawing gene names from ``human`` until the loaded protein has
    either ``pdbs`` or ``swissmodel`` entries, then draws a residue number
    within the chosen model's ``x``..``y`` range and a replacement residue.

    :return: dict with keys ``name``, ``uniprot``, ``taxid``, ``species`` and
        ``mutation`` (formatted as ``p.<from><position><to>``).
    """
    while True:
        name = random.choice(list(human.keys()))
        uniprot = human[name]
        protein = ProteinCore(taxid=9606, uniprot=uniprot).load()
        # experimental structures take precedence over SwissModel ones
        models = protein.pdbs or protein.swissmodel
        if not models:
            continue
        pdb = random.choice(models)
        try:
            position = random.randint(pdb.x, pdb.y)
            wild_type = protein.sequence[position - 1]
            # the replacement may be neither the original residue nor a stop (*)
            allowed = list(set(Mutation.aa_list) - {'*', wild_type})
            to_resn = random.choice(allowed)
        except IndexError:
            log.error(
                f'Impossible... pdb.x out of bounds in unicode for gene {uniprot}'
            )
            continue
        return dict(
            name=name,
            uniprot=uniprot,
            taxid='9606',
            species='human',
            mutation=f'p.{wild_type}{position}{to_resn}',
        )
def get_transcript(request):
    """
    Map a mutation given against a transcript onto the matching UniProt entry.

    Reads ``enst`` and ``mutation`` from ``request.params`` and builds an
    ``ENSTMapper`` for the transcript. If the mapper reports a full match the
    mutation is returned untouched; otherwise the human protein is loaded and
    the mutation goes through ``mapper.convert`` together with its sequence.

    :param request: object exposing ``params['enst']`` and ``params['mutation']``.
    :return: dict with keys ``uniprot`` and ``mutation``.
    """
    params = request.params
    enst = params['enst']
    mutation = params['mutation']
    mapper = ENSTMapper(enst)
    # identical sequences: nothing to convert
    if mapper.is_full_match():
        return dict(uniprot=mapper.uniprot, mutation=mutation)
    protein = ProteinCore(uniprot=mapper.uniprot, taxid=9606).load()
    return dict(
        uniprot=mapper.uniprot,
        mutation=mapper.convert(protein.sequence, mutation),
    )
def touch_offsets(taxid=9606):
    """
    Print a tally of how the SIFTS rows of a taxon's structures are numbered.

    Every pickled protein in ``<global_settings.pickle_folder>/taxid<taxid>``
    is loaded; for each of its ``rcsb`` structures every row returned by
    ``_get_sifts()`` is given one label:

    * ``off-start`` / ``off-unstart``: non-zero offset, with the PDB boundary
      equal / not equal to the matching ``RES_*`` boundary;
    * ``no-off-start`` / ``no-off-unstart``: zero offset, with ``SP_BEG``
      equal / not equal to 1;
    * ``RES1`` / ``RESn``: no usable PDB boundary and ``SP_BEG`` is not 1.

    The distinct labels of a structure are joined with ``+`` and the counts of
    these combinations are printed, most common first.

    Side effects: sets ``global_settings.verbose`` to ``False`` and rewrites
    the six numeric fields of each SIFTS row in place as ``int`` or ``None``.

    :param taxid: taxon id selecting the ``taxid<taxid>`` source folder.
    :return: None
    """
    overview = []
    global_settings.verbose = False
    source = os.path.join(global_settings.pickle_folder, f'taxid{taxid}')
    numeric_keys = ('PDB_BEG', 'PDB_END', 'RES_END', 'RES_BEG', 'SP_BEG', 'SP_END')
    # str.isdigit() does not like negatives, hence a regex (Python ints are signed).
    number = re.compile(r'(-?\d+)')
    for pf in os.listdir(source):
        p = ProteinCore().load(file=os.path.join(source, pf))
        for s in p.pdbs:
            if s.type != 'rcsb':
                continue
            v = []
            for detail in s._get_sifts():
                ## clean rows
                for k in numeric_keys:
                    value = detail[k]
                    # Test the *value*, not the key name: a real None must
                    # not reach the regex, and an int needs no parsing.
                    if value is None or isinstance(value, int):
                        continue
                    found = number.search(value)
                    # 'None' and any other digit-free string become None
                    detail[k] = int(found.group(1)) if found else None
                ## classify the row
                if detail['PDB_BEG'] is not None:  ##nice.
                    offset = detail['SP_BEG'] - detail['PDB_BEG']
                    if offset and detail['PDB_BEG'] == detail['RES_BEG']:
                        v.append('off-start')
                    elif offset:
                        v.append('off-unstart')
                    elif detail['SP_BEG'] != 1:
                        v.append('no-off-unstart')
                    else:
                        v.append('no-off-start')
                elif detail['PDB_END'] is not None:
                    # no start given: back-calculate it from the end and the span
                    offset = detail['SP_BEG'] - (
                        detail['PDB_END'] - (detail['SP_END'] - detail['SP_BEG']))
                    if offset and detail['PDB_END'] == detail['RES_END']:
                        v.append('off-start')
                    elif offset:
                        v.append('off-unstart')
                    elif detail['SP_BEG'] != 1:
                        v.append('no-off-unstart')
                    else:
                        v.append('no-off-start')
                elif detail['SP_BEG'] == 1:
                    v.append('no-off-start')
                elif detail['RES_BEG'] == 1:
                    # This is problematic. This means that there are unresolved
                    # residues at the N & C termini. This can go either way.
                    v.append('RES1')
                else:
                    v.append('RESn')
            overview.append('+'.join(sorted(set(v))))
    print(Counter(overview).most_common())
def fix_offsets(file):
    """
    Recompute the chain offsets of every ``rcsb`` structure in a pickled protein and save it.

    For each ``rcsb`` structure the SIFTS rows from ``_get_sifts()`` are
    cleaned (the six numeric fields become ``int`` or ``None``) and an offset
    is derived per row:

    1. ``SP_BEG - PDB_BEG`` when ``PDB_BEG`` is known;
    2. otherwise the same value back-calculated from ``PDB_END`` and the
       UniProt span;
    3. otherwise, when ``SP_BEG`` is set, ``get_offset_from_PDB`` against the
       sequence of the row's ``SP_PRIMARY`` entry (0 if that fails);
    4. otherwise 0.

    ``chain_definitions`` and ``offsets`` of the structure are rebuilt from
    these rows. ``offset`` is set from the definition matching ``chain``
    unless ``chain`` is ``'*'``. The protein is dumped back if it has any
    ``pdbs``.

    :param file: full path of the pickled protein to load.
    :return: newline-joined report, one ``code<TAB>chain<TAB>uniprot<TAB>offset``
        line per SIFTS row processed.
    """
    p = ProteinCore().load(file=file)
    numeric_keys = ('PDB_BEG', 'PDB_END', 'RES_END', 'RES_BEG', 'SP_BEG', 'SP_END')
    # str.isdigit() does not like negatives, hence a regex (Python ints are signed).
    number = re.compile(r'(-?\d+)')
    lines = []
    for s in p.pdbs:
        if s.type != 'rcsb':
            continue
        # fetched before the attributes below are reset, as in the original order
        details = s._get_sifts()
        s.chain_definitions = []
        s.offsets = {}
        for detail in details:
            ## clean rows
            for k in numeric_keys:
                value = detail[k]
                # Test the *value*, not the key name: a real None must not
                # reach the regex, and an int needs no parsing.
                if value is None or isinstance(value, int):
                    continue
                found = number.search(value)
                # 'None' and any other digit-free string become None
                detail[k] = int(found.group(1)) if found else None
            ## get offset
            if detail['PDB_BEG'] is not None:  ##nice.
                offset = detail['SP_BEG'] - detail['PDB_BEG']
            elif detail['PDB_END'] is not None:
                offset = detail['SP_BEG'] - (
                    detail['PDB_END'] - (detail['SP_END'] - detail['SP_BEG']))
            elif detail['SP_BEG']:
                try:
                    if detail['SP_PRIMARY'] == p.uniprot:
                        seq = p.sequence
                    else:
                        # the chain belongs to a different protein: use its sequence
                        seq = ProteinCore(
                            uniprot=detail['SP_PRIMARY']).load().sequence
                    offset = s.get_offset_from_PDB(detail, seq)
                except BaseException:
                    # Deliberately broad: Pymol subclasses BaseException.
                    offset = 0
            else:
                offset = 0
            detail['offset'] = offset
            lines.append(
                f"{s.code}\t{detail['CHAIN']}\t{detail['SP_PRIMARY']}\t{offset}"
            )
            s.chain_definitions.append({
                'chain': detail['CHAIN'],
                'uniprot': detail['SP_PRIMARY'],
                'x': detail["SP_BEG"],
                'y': detail["SP_END"],
                'offset': offset,
                'range': f'{detail["SP_BEG"]}-{detail["SP_END"]}',
                'name': None,
                'description': None
            })
            s.offsets[detail['CHAIN']] = offset
        try:
            if s.chain != '*':
                detail = next(
                    filter(lambda x: s.chain == x['chain'],
                           s.chain_definitions))
                s.offset = detail['offset']
        except Exception:
            # Best effort: with no definition for s.chain (StopIteration) or
            # no usable chain attribute, s.offset is left as it was.
            pass
    if p.pdbs:
        p.dump()
    return '\n'.join(lines)
def describe(uniprot):
    """
    Pretty-print the ``asdict()`` dump of a human protein entry.

    :param uniprot: UniProt accession handed to ``ProteinCore`` (taxid 9606),
        e.g. P62873 for gnb1 or P62879 for gnb2.
    :return: None; output goes to stdout.
    """
    print('***************** DESCRIPTION *******************************')
    protein = ProteinCore(taxid='9606', uniprot=uniprot).load()
    description = protein.asdict()
    pprint(description)
def get_data(uniprot: str) -> ProteinCore:
    """
    Load the human (taxid 9606) protein entry for a UniProt accession.

    :param uniprot: UniProt accession of the protein.
    :return: whatever ``ProteinCore.load()`` returns for that entry.
    """
    protein = ProteinCore(taxid=9606, uniprot=uniprot)
    return protein.load()
# ---------------------------------------------------------------------------
# Ad-hoc debugging scratchpad. Earlier experiments are kept commented out.
# ---------------------------------------------------------------------------
# p = ProteinCore(taxid='9606', uniprot='P01112').load()
# p = ProteinCore(taxid='3562', uniprot='Q8GT36').load()
# print(sum(p.properties['kd'])/len(p))
# print(sum(p.properties['Flex']) / len(p))
# from create import message
# download_swissmodel()
# all_swiss(fx=hotfix_swiss)
# message('Reswissed!')
# fix_offsets('../protein-data/pickle/taxid9606/Q13586.p')
# print('**************************************')

# Current experiment: list the feature tracks of one human protein.
taxid = 9606
gene = 'P04637'  # TP53
# describe(gene)
p = ProteinCore(taxid=taxid, uniprot=gene).load()  # other handy accessions: gnb1 P62873, gnb2 P62879
print(p.features.keys())
print(p.features['PSP_modified_residues'])

# Stop here so that nothing below this point runs. ``exit()`` is injected by
# the ``site`` module for interactive sessions and is missing under
# ``python -S``; raising SystemExit ends the script without relying on it.
raise SystemExit

# #pprint(p.__dict__)
# t = [s for s in p.pdbs if s.code.lower() == '1a1u'][0]
# print(str(t))
# print(t.offset)
# print(t.chain_definitions)
# sifts = t._get_sifts()
# print(sifts)
# print(t.get_offset_from_PDB(sifts[0], p.sequence))
# p.parse_uniprot()
# p.parse_swissmodel()