def reparse_gene(name):
    """
    Re-parse the UniProt entry of a human gene and print its sequence.

    ``name`` is looked up in ``taxid9606-names2uniprot.json`` (inside
    ``global_settings.dictionary_folder``) and the value found is passed to
    ``ProteinGatherer(uniprot=...)``.

    :param name: key of the names-to-uniprot dictionary.
    :raises KeyError: if ``name`` is not a key of that dictionary.
    :return: None (the sequence is printed).
    """
    namedex_path = os.path.join(global_settings.dictionary_folder,
                                'taxid9606-names2uniprot.json')
    # Context manager so the handle is closed as soon as the JSON is read.
    with open(namedex_path) as fh:
        human = json.load(fh)
    target = human[name]
    p = ProteinGatherer(uniprot=target)
    p.parse_uniprot()
    print(p.sequence)
def add_swissmodel(taxid=9606):
    """
    Run ``parse_swissmodel`` on every pickled protein of a taxon.

    Every ``.p`` file in ``<global_settings.pickle_folder>/taxid<taxid>`` is
    loaded. A protein whose pickle cannot be loaded, or whose sequence is
    empty, is re-parsed from scratch with ``parse_all``; any other protein
    gets ``parse_swissmodel`` followed by ``dump``.

    :param taxid: taxon id used to build the folder name ``taxid<taxid>``.
    :return: None
    """
    def fix(p):
        """Re-parse ``p`` completely (serial mode, verbose) and dump it."""
        global_settings.verbose = True
        try:
            p.parse_all(mode='serial')
            assert len(p.sequence) > 0, 'Darn. Sequence is zero AA long'
            p.dump()
        finally:
            # Switch verbosity back off even when the re-parse fails, so one
            # bad entry does not leave the rest of the run verbose.
            global_settings.verbose = False

    print(
        f'************************ {taxid} *************************************'
    )
    path = os.path.join(global_settings.pickle_folder, f'taxid{taxid}')
    for pf in os.listdir(path):
        stem, ext = os.path.splitext(pf)
        if ext != '.p':
            continue
        try:
            p = ProteinGatherer().load(file=os.path.join(path, pf))
        except Exception:
            # Unreadable pickle: rebuild from the file name (minus the
            # extension). Only ``Exception`` is caught so Ctrl-C still stops
            # the run. A failure of this rebuild propagates to the caller.
            p = ProteinGatherer(uniprot=stem)
            fix(p)
        if len(p.sequence) == 0:
            try:
                fix(p)
            except Exception:
                # Best effort: report the problem and move to the next file.
                traceback.print_exc(file=sys.stdout)
        else:
            p.parse_swissmodel()
            p.dump()
def mini_gene_data():
    """
    Write ``map.json``, a per-gene summary of the human protein namedex.

    Every distinct value of ``data/human_prot_namedex.json`` is handed to
    ``ProteinGatherer(uniprot=...)`` and parsed; the result is summarised
    (name, uniprot, length, a selection of features, diseases) under its gene
    name. Both file paths are relative to the current working directory.

    :return: None
    """
    # NOTE(review): this list is never read below: every entry of the namedex
    # is processed. Presumably it was meant as a filter; confirm.
    genes = '''DOCK180
    DOCK2
    DOCK3
    DOCK4
    DOCK5
    DOCK6
    DOCK7
    DOCK8
    DOCK9
    DOCK10
    DOCK11
    '''.split()

    # Feature categories copied into the summary when the protein has them.
    wanted_features = ('active site', 'modified residue',
                       'topological domain', 'domain', 'region of interest',
                       'transmembrane region')
    data = {}
    with open('data/human_prot_namedex.json') as fh:
        namedex = json.load(fh)
    for uni in set(namedex.values()):
        g = ProteinGatherer(uniprot=uni).parse_uniprot()
        data[g.gene_name] = {
            'name': g.gene_name,
            'uniprot': g.uniprot,
            'len': len(g),
            'domains': {
                k: g.features[k]
                for k in wanted_features if k in g.features
            },
            'disease': g.diseases
        }
    # Context manager so the output is flushed and closed deterministically.
    with open('map.json', 'w') as fh:
        json.dump(data, fh)
# Example #4
# 0
def hotfix_swiss(taxid=9606):
    """
    Rewrite the SwissModel URLs stored in every pickled protein of a taxon.

    Every ``.p`` file in ``<global_settings.pickle_folder>/taxid<taxid>`` is
    loaded. A protein whose pickle cannot be loaded, or whose sequence is
    empty, is re-parsed from scratch with ``parse_all``; for any other protein
    the ``from=<a>&to=<b>`` part of each model URL is replaced by
    ``sort=seqsim&range=<a>-<b>`` and the protein is dumped.

    :param taxid: taxon id used to build the folder name ``taxid<taxid>``.
    :return: None
    """
    # The database stores the data differently to what the data in the indices say!
    # https://swissmodel.expasy.org/repository/uniprot/P31946.pdb?from=1&to=232&template=2bq0&provider=pdb
    # https://swissmodel.expasy.org/repository/uniprot/P31946.pdb?sort=seqsim&provider=pdb&template=2bq0&range=1-232
    def fix(p):
        """Re-parse ``p`` completely (serial mode, verbose) and dump it."""
        global_settings.verbose = True
        try:
            p.parse_all(mode='serial')
            assert len(p.sequence) > 0, 'Darn. Sequence is zero AA long'
            p.dump()
        finally:
            # Switch verbosity back off even when the re-parse fails, so one
            # bad entry does not leave the rest of the run verbose.
            global_settings.verbose = False

    print(
        f'************************ {taxid} *************************************'
    )
    path = os.path.join(global_settings.pickle_folder, f'taxid{taxid}')
    for pf in os.listdir(path):
        stem, ext = os.path.splitext(pf)
        if ext != '.p':
            continue
        try:
            p = ProteinGatherer().load(file=os.path.join(path, pf))
        except Exception:
            # Unreadable pickle: rebuild from the file name (minus the
            # extension). Only ``Exception`` is caught so Ctrl-C still stops
            # the run. A failure of this rebuild propagates to the caller.
            p = ProteinGatherer(uniprot=stem)
            fix(p)
        if len(p.sequence) == 0:
            try:
                fix(p)
            except Exception:
                # Best effort: report the problem and move to the next file.
                traceback.print_exc(file=sys.stdout)
        else:
            for model in p.swissmodel:
                model.url = re.sub(r'from\=(\d+)&to\=(\d+)',
                                   r'sort=seqsim&range=\1-\2', model.url)
            p.dump()
def how_many_empty(taxid=9606):
    """
    Count the pickled proteins of a taxon with and without a sequence.

    Each protein with an empty sequence is printed; the last line printed is
    the number of proteins with a sequence followed by the number without.
    Sets ``global_settings.verbose`` to False.

    :param taxid: taxon id used to build the folder name ``taxid<taxid>``.
    :return: None
    """
    global_settings.verbose = False
    empty = 0
    full = 0
    path = os.path.join(global_settings.pickle_folder, f'taxid{taxid}')
    for pf in os.listdir(path):
        # Only ``.p`` files are protein pickles (same filter as the other
        # maintenance functions); a stray file must not abort the count.
        if os.path.splitext(pf)[1] != '.p':
            continue
        p = ProteinGatherer().load(file=os.path.join(path, pf))
        if len(p.sequence) == 0:
            print(p)
            empty += 1
        else:
            full += 1
    print(full, empty)
def iterate_taxon(taxid=9606):
    """
    Redo the gnomAD, PTM and parameter steps for every pickled protein of a taxon.

    This is an ad hoc fix to fix humans or similar. For full deployment use
    ProteomeParser.

    Each file in ``<global_settings.pickle_folder>/taxid<taxid>`` is loaded,
    its ``gnomAD`` list is emptied and re-parsed, ``get_PTM`` and
    ``compute_params`` are run, and the protein is dumped. A file that fails
    at any step is reported on stdout and skipped.

    :param taxid: taxon id used to build the folder name ``taxid<taxid>``.
    :return: None
    """
    # Local imports: this function did not previously need these modules.
    import sys
    import traceback

    path = os.path.join(global_settings.pickle_folder, f'taxid{taxid}')
    for pf in os.listdir(path):
        try:
            protein = ProteinGatherer().load(file=os.path.join(path, pf))
            protein.gnomAD = []
            protein.parse_gnomAD()
            protein.get_PTM()
            protein.compute_params()
            protein.dump()
        except Exception:
            # Still best effort, but no longer silent, and Ctrl-C is no
            # longer swallowed (KeyboardInterrupt is not an ``Exception``).
            traceback.print_exc(file=sys.stdout)
def fix_empty(taxid=9606):
    """
    Try to re-parse every pickled protein of a taxon that has an empty sequence.

    Proteins with a sequence are counted as fine. The others go through
    ``parse_uniprot``, ``parse_swissmodel``, ``compute_params``,
    ``parse_gnomAD`` and ``get_PTM`` and are dumped if a sequence was
    obtained (fixed); if any step raises, the traceback is printed and the
    protein is counted as glitchy. A summary line is printed at the end.

    :param taxid: taxon id used to build the folder name ``taxid<taxid>``.
    :return: None
    """
    global_settings.verbose = False
    glitchy = 0
    fine = 0
    fixed = 0
    path = os.path.join(global_settings.pickle_folder, f'taxid{taxid}')
    for pf in os.listdir(path):
        # Only ``.p`` files are protein pickles (same filter as the other
        # maintenance functions); a stray file must not abort the run.
        if os.path.splitext(pf)[1] != '.p':
            continue
        p = ProteinGatherer().load(file=os.path.join(path, pf))
        if len(p.sequence) > 0:
            fine += 1
            continue
        print('****************************************')
        print(f'Attempting to fix {p.gene_name}')
        global_settings.verbose = True
        try:
            p.parse_uniprot()
            p.parse_swissmodel()
            p.compute_params()
            p.parse_gnomAD()
            p.get_PTM()
            assert len(p.sequence) > 0, 'Darn. Sequence is zero AA long'
            p.dump()
            fixed += 1
        except Exception:
            traceback.print_exc(file=sys.stdout)
            glitchy += 1
        finally:
            # Verbosity is only wanted while fixing; reset it on failure too,
            # otherwise every later protein is handled verbosely.
            global_settings.verbose = False
    print('****************************************')
    print(f'Fine: {fine:,}, Fixed {fixed:,}, Glitchy: {glitchy:,}')
#### workspace!
# Scratch area that runs at module level. ``1 == 1`` is always true, so only
# the first branch executes; the ``elif`` branches (``1 == 9``, ``1 == 0``)
# can never run and are parked experiments.
if 1 == 1:
    # Earlier experiments, left commented out:
    #os.mkdir(os.path.join(ProteinCore.settings.temp_folder, 'PDB'))
    #describe('P01112')
    #analyse('P62873')
    #how_many_empty()
    #fix_empty()
    #compress()
    #parse_uniprot(
    #inspect_offsets('P01133')
    #touch_offsets()
    #fix_all_offsets()
    #all_swiss()
    # NOTE(review): fix_all_offsets is not defined in the visible part of the
    # file; confirm it exists before running.
    fix_all_offsets()
elif 1 == 9:
    # Inspect the gnomAD data of P62873 before and after re-parsing it.
    p = ProteinGatherer(taxid='9606', uniprot='P62873').load()
    print(p.gnomAD)
    print(p.parse_gnomAD())
    print(p.gnomAD)
    print(p.features['PSP_modified_residues'])
    # NOTE(review): "Phoshosite" looks like a typo of "Phosphosite"; check the
    # class name in split_phosphosite. The import is unused in this branch.
    from michelanglo_protein.generate.split_phosphosite import Phoshosite
    #ph = Phoshosite().split().write('phosphosite')
    p = ProteinGatherer(taxid='9606', uniprot='P62879').load()
    # Builds a ":B and (x1 or x2 ...)" string from the homozygous gnomAD entries.
    print(':B and (' +
          ' or '.join([str(m.x) for m in p.gnomAD if m.homozygous]) + ')')

    print([m for m in p.gnomAD if m.homozygous])
    # First word of each gnomAD description, space separated (the trailing
    # ')' has no matching '(' in this string).
    print(' '.join([str(m.description.split()[0]) for m in p.gnomAD]) + ')')
    print([m for m in p.gnomAD if m.homozygous])
elif 1 == 0:
    iterate_taxon('9606')
    # NOTE(review): the next three lines are the tail of a call whose opening
    # (callee and first arguments) is missing, so this block does not parse as
    # it stands. Restore the original statement or delete the fragment.
           namedexfile=os.path.join(global_settings.dictionary_folder,
                                    'taxid9606-names2uniprot.json'),
           folder=os.path.join(global_settings.temp_folder, 'gnomAD')).split()


if __name__ == '__main__':
    # Script entry point: configure the settings, read UniProt, then redo the
    # gnomAD and PTM steps for every pickled human protein.
    # Local imports: this block did not previously need these modules.
    import sys
    import traceback

    global_settings.verbose = True  #False
    global_settings.error_tolerant = True
    global_settings.startup(data_folder='../protein-data')
    global_settings.retrieve_references(ask=False, refresh=False)
    ## Phosphosite
    # NOTE: the split itself is commented out; only the message is emitted.
    #Phosphosite().split().write()
    message('Phosphosite split')
    ## Uniprot
    UniprotMasterReader(first_n_protein=0)
    message('Uniprot split')
    # gnomAD data needs to be split up after that the dictionaries are made.
    # NOTE: as above, the split is commented out; only the message is emitted.
    # _gnomad()
    message('gnomAD split')
    taxid = 9606  # that's humans
    path = os.path.join(global_settings.pickle_folder, f'taxid{taxid}')
    for pf in os.listdir(path):
        try:
            protein = ProteinGatherer().load(file=os.path.join(path, pf))
            protein.gnomAD = []
            protein.parse_gnomAD()
            protein.get_PTM()
            protein.dump()
        except Exception:
            # Still best effort, but no longer silent, and Ctrl-C is no
            # longer swallowed (KeyboardInterrupt is not an ``Exception``).
            traceback.print_exc(file=sys.stdout)
    message('Done.')
# Example #10
# 0
    # print(t.chain_definitions)
    # sifts = t._get_sifts()
    # print(sifts)
    # print(t.get_offset_from_PDB(sifts[0], p.sequence))

    # p.parse_uniprot()
    # p.parse_swissmodel()
    # p.compute_params()
    # p.parse_gnomAD()
    # p.get_PTM()
    # p.dump()
    # print('**************************************')
    # pprint(p.asdict())

elif 1 == 9:
    p = ProteinGatherer(taxid=9606, uniprot='P62873').load()
    print(p.gnomAD)
    print(p.parse_gnomAD())
    print(p.gnomAD)
    print(p.features['PSP_modified_residues'])
    from michelanglo_protein.generate.split_phosphosite import Phoshosite

    # ph = Phoshosite().split().write('phosphosite')
    p = ProteinGatherer(taxid=9606, uniprot='P62879').load()
    print(':B and (' +
          ' or '.join([str(m.x) for m in p.gnomAD if m.homozygous]) + ')')

    print([m for m in p.gnomAD if m.homozygous])
    print(' '.join([str(m.description.split()[0]) for m in p.gnomAD]) + ')')
    print([m for m in p.gnomAD if m.homozygous])
elif 1 == 0: