def reparse_gene(name):
    """Re-fetch and print the Uniprot-derived sequence for a human gene.

    Looks the gene up in the taxid9606 name→uniprot JSON index, re-parses
    the Uniprot entry and prints the resulting sequence.

    :param name: gene name as it appears in ``taxid9606-names2uniprot.json``
    :return: None (prints the sequence as a side effect)
    :raises KeyError: if *name* is not in the index
    """
    index_path = os.path.join(global_settings.dictionary_folder,
                              'taxid9606-names2uniprot.json')
    # context manager instead of a bare open() so the handle is closed promptly
    with open(index_path) as fh:
        human = json.load(fh)
    target = human[name]
    p = ProteinGatherer(uniprot=target)
    p.parse_uniprot()
    print(p.sequence)
def add_swissmodel(taxid=9606):
    """Ensure every pickled protein of a taxon has a sequence and swissmodel data.

    Walks the ``.p`` pickles under ``pickle_folder/taxid{taxid}``; proteins
    that fail to load or have an empty sequence are fully re-parsed (serial
    mode), otherwise only the swissmodel data is (re)parsed and the pickle
    re-dumped.

    :param taxid: NCBI taxon identifier (default 9606, human)
    :return: None (rewrites pickles as a side effect)
    """

    def fix(p):
        # Full serial re-parse; the assert makes an empty result fail loudly
        # so the caller's except-block can log it.
        global_settings.verbose = True
        p.parse_all(mode='serial')
        assert len(p.sequence) > 0, 'Darn. Sequence is zero AA long'
        p.dump()
        global_settings.verbose = False

    print(
        f'************************ {taxid} *************************************'
        )
    path = os.path.join(global_settings.pickle_folder, f'taxid{taxid}')
    for pf in os.listdir(path):
        if os.path.splitext(pf)[1] != '.p':
            continue  # skip non-pickle files
        try:
            p = ProteinGatherer().load(file=os.path.join(path, pf))
        except Exception:
            # was a bare ``except:`` — that would also trap KeyboardInterrupt.
            # Unloadable pickle: rebuild the protein from scratch by uniprot id.
            p = ProteinGatherer(uniprot=pf.replace('.p', ''))
            fix(p)
        if len(p.sequence) == 0:
            try:
                fix(p)
            except Exception:
                traceback.print_exc(file=sys.stdout)
        else:
            p.parse_swissmodel()
            p.dump()
def mini_gene_data():
    """Summarise every protein in the human name index into ``map.json``.

    For each unique uniprot id in ``data/human_prot_namedex.json`` the
    Uniprot entry is parsed and a compact record (name, uniprot id, length,
    selected feature categories, diseases) is written out.

    :return: None (writes ``map.json`` as a side effect)
    """
    # NOTE: the original also built a hard-coded DOCK1-11 gene list and a
    # PrettyPrinter, but neither was ever used — both removed as dead code.
    with open('data/human_prot_namedex.json') as fh:
        namedex = json.load(fh)
    # feature categories worth surfacing in the summary
    wanted = ('active site', 'modified residue', 'topological domain',
              'domain', 'region of interest', 'transmembrane region')
    data = {}
    for uni in set(namedex.values()):
        g = ProteinGatherer(uniprot=uni).parse_uniprot()
        data[g.gene_name] = {'name': g.gene_name,
                             'uniprot': g.uniprot,
                             'len': len(g),
                             'domains': {k: g.features[k]
                                         for k in wanted if k in g.features},
                             'disease': g.diseases}
        #print(g.gene_name,g.uniprot,len(g))
    # context manager so the output handle is flushed and closed
    with open('map.json', 'w') as fh:
        json.dump(data, fh)
def hotfix_swiss(taxid=9606):
    """Rewrite stored swissmodel URLs to the query format the repository serves.

    The database stores the data differently to what the data in the indices say!
    https://swissmodel.expasy.org/repository/uniprot/P31946.pdb?from=1&to=232&template=2bq0&provider=pdb
    https://swissmodel.expasy.org/repository/uniprot/P31946.pdb?sort=seqsim&provider=pdb&template=2bq0&range=1-232

    Proteins that fail to load or have an empty sequence are fully re-parsed
    instead; everything is re-dumped.

    :param taxid: NCBI taxon identifier (default 9606, human)
    :return: None (rewrites pickles as a side effect)
    """

    def fix(p):
        # Full serial re-parse; the assert makes an empty result fail loudly.
        global_settings.verbose = True
        p.parse_all(mode='serial')
        assert len(p.sequence) > 0, 'Darn. Sequence is zero AA long'
        p.dump()
        global_settings.verbose = False

    print(
        f'************************ {taxid} *************************************'
        )
    path = os.path.join(global_settings.pickle_folder, f'taxid{taxid}')
    for pf in os.listdir(path):
        if os.path.splitext(pf)[1] != '.p':
            continue  # skip non-pickle files
        try:
            p = ProteinGatherer().load(file=os.path.join(path, pf))
        except Exception:
            # was a bare ``except:`` — that would also trap KeyboardInterrupt.
            p = ProteinGatherer(uniprot=pf.replace('.p', ''))
            fix(p)
        if len(p.sequence) == 0:
            try:
                fix(p)
            except Exception:
                traceback.print_exc(file=sys.stdout)
        else:
            # from=…&to=…  →  sort=seqsim&range=…-…  (see docstring links)
            for model in p.swissmodel:
                model.url = re.sub(r'from\=(\d+)&to\=(\d+)',
                                   r'sort=seqsim&range=\1-\2', model.url)
            p.dump()
def how_many_empty(taxid=9606):
    """Count pickled proteins with an empty sequence and print the tally.

    Loads every pickle under ``pickle_folder/taxid{taxid}``, prints each
    empty-sequence protein, and finally prints ``full empty`` counts.

    :param taxid: NCBI taxon identifier (default 9606, human)
    :return: None (prints counts as a side effect)
    """
    # (removed an unused ``from collections import Counter``)
    global_settings.verbose = False
    empty = 0
    full = 0
    path = os.path.join(global_settings.pickle_folder, f'taxid{taxid}')
    for pf in os.listdir(path):
        p = ProteinGatherer().load(file=os.path.join(path, pf))
        if len(p.sequence) == 0:
            print(p)
            empty += 1
        else:
            full += 1
    print(full, empty)
def iterate_taxon(taxid=9606):
    """
    This is an ad hoc fix to fix humans or similar.
    For full deployment use ProteomeParser.

    Reloads every pickle under ``pickle_folder/taxid{taxid}``, resets and
    re-parses gnomAD data, refreshes PTMs and parameters, and re-dumps.

    :param taxid: NCBI taxon identifier (default 9606, human)
    :return: None (rewrites pickles as a side effect)
    """
    path = os.path.join(global_settings.pickle_folder, f'taxid{taxid}')
    for pf in os.listdir(path):
        try:
            protein = ProteinGatherer().load(file=os.path.join(path, pf))
            protein.gnomAD = []
            protein.parse_gnomAD()
            protein.get_PTM()
            protein.compute_params()
            protein.dump()
            #michelanglo_protein.get_offsets().parse_gnomAD().compute_params()
            #michelanglo_protein.dump()
        except Exception:
            # Deliberately best-effort — one bad pickle must not abort the run —
            # but log the failure instead of the original silent ``except: pass``
            # (a bare except would even swallow KeyboardInterrupt).
            traceback.print_exc(file=sys.stdout)
def fix_empty(taxid=9606):
    """Re-parse every pickled protein whose sequence is empty; report a tally.

    For each empty protein the full parse pipeline (uniprot, swissmodel,
    params, gnomAD, PTM) is attempted; failures are logged and counted as
    glitchy. Prints ``Fine / Fixed / Glitchy`` counts at the end.

    :param taxid: NCBI taxon identifier (default 9606, human)
    :return: None (rewrites pickles and prints counts as side effects)
    """
    # (removed an unused ``from collections import Counter``)
    global_settings.verbose = False
    glitchy = 0
    fine = 0
    fixed = 0
    path = os.path.join(global_settings.pickle_folder, f'taxid{taxid}')
    for pf in os.listdir(path):
        p = ProteinGatherer().load(file=os.path.join(path, pf))
        if len(p.sequence) == 0:
            print('****************************************')
            print(f'Attempting to fix {p.gene_name}')
            try:
                global_settings.verbose = True
                p.parse_uniprot()
                p.parse_swissmodel()
                p.compute_params()
                p.parse_gnomAD()
                p.get_PTM()
                # the assert routes a still-empty sequence into the glitchy count
                assert len(p.sequence) > 0, 'Darn. Sequence is zero AA long'
                p.dump()
                fixed += 1
                global_settings.verbose = False
            except Exception:
                traceback.print_exc(file=sys.stdout)
                glitchy += 1
        else:
            fine += 1
    print('****************************************')
    print(f'Fine: {fine:,}, Fixed {fixed:,}, Glitchy: {glitchy:,}')
#### workspace! if 1 == 1: #os.mkdir(os.path.join(ProteinCore.settings.temp_folder, 'PDB')) #describe('P01112') #analyse('P62873') #how_many_empty() #fix_empty() #compress() #parse_uniprot( #inspect_offsets('P01133') #touch_offsets() #fix_all_offsets() #all_swiss() fix_all_offsets() elif 1 == 9: p = ProteinGatherer(taxid='9606', uniprot='P62873').load() print(p.gnomAD) print(p.parse_gnomAD()) print(p.gnomAD) print(p.features['PSP_modified_residues']) from michelanglo_protein.generate.split_phosphosite import Phoshosite #ph = Phoshosite().split().write('phosphosite') p = ProteinGatherer(taxid='9606', uniprot='P62879').load() print(':B and (' + ' or '.join([str(m.x) for m in p.gnomAD if m.homozygous]) + ')') print([m for m in p.gnomAD if m.homozygous]) print(' '.join([str(m.description.split()[0]) for m in p.gnomAD]) + ')') print([m for m in p.gnomAD if m.homozygous]) elif 1 == 0: iterate_taxon('9606')
# NOTE(review): the statement below is the TAIL of a call whose opening line
# lies outside this chunk (presumably a gnomAD splitter taking a name index
# and an output folder) — left byte-identical; confirm against the full file.
namedexfile=os.path.join(global_settings.dictionary_folder,
    'taxid9606-names2uniprot.json'), folder=os.path.join(global_settings.
    temp_folder, 'gnomAD')).split()
if __name__ == '__main__':
    # Entry point: rebuild the protein dataset end-to-end
    # (phosphosite → uniprot → gnomAD → per-protein refresh).
    global_settings.verbose = True #False
    global_settings.error_tolerant = True
    global_settings.startup(data_folder='../protein-data')
    global_settings.retrieve_references(ask=False, refresh=False)
    ## Phosphosite
    #Phosphosite().split().write()
    message('Phosphosite split')
    ## Uniprot
    UniprotMasterReader(first_n_protein=0)
    message('Uniprot split')
    # gnomAD data needs to be split up after that the dictionaries are made.
    # _gnomad()
    message('gnomAD split')
    taxid = 9606 # that's humans
    path = os.path.join(global_settings.pickle_folder, f'taxid{taxid}')
    for pf in os.listdir(path):
        try:
            protein = ProteinGatherer().load(file=os.path.join(path, pf))
            protein.gnomAD = []
            protein.parse_gnomAD()
            protein.get_PTM()
            protein.dump()
        except:
            # NOTE(review): bare except silently skips any protein that fails;
            # presumably deliberate best-effort — confirm, and consider logging.
            pass
    message('Done.')
# print(t.chain_definitions) # sifts = t._get_sifts() # print(sifts) # print(t.get_offset_from_PDB(sifts[0], p.sequence)) # p.parse_uniprot() # p.parse_swissmodel() # p.compute_params() # p.parse_gnomAD() # p.get_PTM() # p.dump() # print('**************************************') # pprint(p.asdict()) elif 1 == 9: p = ProteinGatherer(taxid=9606, uniprot='P62873').load() print(p.gnomAD) print(p.parse_gnomAD()) print(p.gnomAD) print(p.features['PSP_modified_residues']) from michelanglo_protein.generate.split_phosphosite import Phoshosite # ph = Phoshosite().split().write('phosphosite') p = ProteinGatherer(taxid=9606, uniprot='P62879').load() print(':B and (' + ' or '.join([str(m.x) for m in p.gnomAD if m.homozygous]) + ')') print([m for m in p.gnomAD if m.homozygous]) print(' '.join([str(m.description.split()[0]) for m in p.gnomAD]) + ')') print([m for m in p.gnomAD if m.homozygous]) elif 1 == 0: