def compress(taxid=9606, target='../gpickle'):
    """
    Re-save every pickled protein of one taxon into ``target`` as ``.gzp`` files.

    Each file found in ``<global_settings.pickle_folder>/taxid<taxid>`` is
    loaded with ``ProteinCore().load`` and written back out with the loaded
    object's ``gdump`` method, keeping the base name and swapping the
    extension for ``.gzp``.

    :param taxid: taxon id that names the source subfolder.
    :param target: output folder; created, together with any missing
        parents, when it does not exist yet.
    :return: None
    """
    # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: no
    # check-then-create race, and a missing parent folder is not an error.
    os.makedirs(target, exist_ok=True)
    source = os.path.join(global_settings.pickle_folder, f'taxid{taxid}')
    for pf in os.listdir(source):
        stem = os.path.splitext(pf)[0]
        protein = ProteinCore().load(file=os.path.join(source, pf))
        protein.gdump(file=os.path.join(target, stem + '.gzp'))
Exemple #2
0
def compress(
    taxid=9606,
    target='/home/matteo/Coding/Michelanglo/MichelaNGLo-human-protein-data/gpickle'
):
    """
    Convert the pickles of one taxon into ``.gzp`` files inside ``target``.

    Every entry of ``<global_settings.pickle_folder>/taxid<taxid>`` is loaded
    via ``ProteinCore().load`` and re-saved through ``gdump`` under the same
    base name with a ``.gzp`` extension.

    :param taxid: taxon id that names the source subfolder.
    :param target: destination folder (the default is a machine-specific
        absolute path, so callers elsewhere will normally pass their own);
        it is created with any missing parents if absent.
    :return: None
    """
    # A single makedirs call is race-free and also builds missing parents,
    # which the former exists()/mkdir() pair could not do.
    os.makedirs(target, exist_ok=True)
    source = os.path.join(global_settings.pickle_folder, f'taxid{taxid}')
    for pf in os.listdir(source):
        out_name = os.path.splitext(pf)[0] + '.gzp'
        protein = ProteinCore().load(file=os.path.join(source, pf))
        protein.gdump(file=os.path.join(target, out_name))
def inspect_offsets(uniprot):
    """
    Print the code, chain definitions and SIFTS rows of every PDB entry
    attached to one human (taxid 9606) protein.

    :param uniprot: UniProt accession handed to ``ProteinCore``.
    :return: None
    """
    print('***************** inspect_offsets *******************************')
    protein = ProteinCore(taxid='9606', uniprot=uniprot).load()
    for structure in protein.pdbs:
        code = structure.code
        print(code)
        chains = structure.chain_definitions
        print(chains)
        sifts_rows = structure._get_sifts()
        print(sifts_rows)
Exemple #4
0
def random_view(request):
    """
    Pick a random human protein that has a structure and make up a random
    substitution at a position covered by one of its structures.

    Genes are drawn from ``human`` until one has either ``pdbs`` or
    ``swissmodel`` entries; a position is then drawn between ``x`` and ``y``
    of a randomly chosen structure (presumably its first and last covered
    residue -- confirm against the structure class).

    :param request: not used by the body.
    :return: dict with keys ``name``, ``uniprot``, ``taxid``, ``species``
        and ``mutation`` (formatted ``p.<from><position><to>``).
    """
    names = list(human.keys())  # built once instead of on every retry
    while True:
        name = random.choice(names)
        uniprot = human[name]
        protein = ProteinCore(taxid=9606, uniprot=uniprot).load()
        if protein.pdbs:
            pdb = random.choice(protein.pdbs)
        elif protein.swissmodel:
            pdb = random.choice(protein.swissmodel)
        else:
            continue  # nothing to anchor a position on: draw another gene
        i = random.randint(pdb.x, pdb.y)
        try:
            from_resn = protein.sequence[i - 1]
        except IndexError:
            log.error(
                f'Impossible... pdb.x out of bounds in unicode for gene {uniprot}'
            )
            continue
        # The replacement must be a real change: neither the residue that is
        # already there nor the stop symbol.
        candidates = [
            aa for aa in Mutation.aa_list if aa not in ('*', from_resn)
        ]
        to_resn = random.choice(candidates)
        return {
            'name': name,
            'uniprot': uniprot,
            'taxid': '9606',
            'species': 'human',
            'mutation': f'p.{from_resn}{i}{to_resn}'
        }
 def random_view(self):
     """
     Return a random human gene together with a random substitution that
     lies within the ``x``..``y`` span of one of its structures.

     Genes without ``pdbs`` or ``swissmodel`` entries are skipped, as are
     draws whose position falls outside the protein sequence.

     :return: dict with keys ``name``, ``uniprot``, ``taxid``, ``species``
         and ``mutation``.
     """
     while True:
         name = random.choice(list(human.keys()))
         uniprot = human[name]
         protein = ProteinCore(taxid=9606, uniprot=uniprot).load()
         # experimental structures take precedence over models
         structures = protein.pdbs or protein.swissmodel
         if not structures:
             continue
         pdb = random.choice(structures)
         try:
             i = random.randint(pdb.x, pdb.y)
             wild_type = protein.sequence[i - 1]
             # the new residue may be neither the original one nor *
             allowed = set(Mutation.aa_list).difference({'*', wild_type})
             to_resn = random.choice(list(allowed))
         except IndexError:
             log.error(
                 f'Impossible... pdb.x out of bounds in unicode for gene {uniprot}'
             )
             continue
         return {
             'name': name,
             'uniprot': uniprot,
             'taxid': '9606',
             'species': 'human',
             'mutation': f'p.{wild_type}{i}{to_resn}'
         }
def get_transcript(request):
    """
    Resolve the UniProt accession for a transcript id and express the
    mutation against that protein.

    Reads ``enst`` and ``mutation`` from ``request.params``. If the
    ``ENSTMapper`` reports a full match the mutation is returned unchanged;
    otherwise it is passed through ``mapper.convert`` together with the
    sequence of the mapped human protein.

    :param request: object exposing a ``params`` mapping.
    :return: dict with keys ``uniprot`` and ``mutation``.
    """
    params = request.params
    enst = params['enst']
    mutation = params['mutation']
    mapper = ENSTMapper(enst)
    if not mapper.is_full_match():
        # transcript and protein differ: let the mapper convert the mutation
        protein = ProteinCore(uniprot=mapper.uniprot, taxid=9606).load()
        return {
            'uniprot': mapper.uniprot,
            'mutation': mapper.convert(protein.sequence, mutation)
        }
    return {'uniprot': mapper.uniprot, 'mutation': mutation}
def _offset_category(offset, anchored, sp_beg):
    """Label one SIFTS row for the ``touch_offsets`` tally.

    :param offset: computed numbering offset (0 means none).
    :param anchored: whether the PDB boundary used equals the matching RES boundary.
    :param sp_beg: the row's ``SP_BEG`` value.
    :return: one of 'off-start', 'off-unstart', 'no-off-start', 'no-off-unstart'.
    """
    if offset:
        return 'off-start' if anchored else 'off-unstart'
    return 'no-off-start' if sp_beg == 1 else 'no-off-unstart'


def touch_offsets(taxid=9606):
    """
    Survey how the SIFTS rows of all RCSB structures of a taxon are offset
    and print a tally of the combinations seen per structure.

    For every pickle in ``<global_settings.pickle_folder>/taxid<taxid>`` and
    every structure of type ``rcsb``, each SIFTS row is cleaned (numeric
    fields coerced to ``int`` or ``None``, in place) and given a label; the
    distinct labels of a structure are joined with ``+`` and counted.

    Side effect: sets ``global_settings.verbose`` to ``False``.

    :param taxid: taxon id that names the pickle subfolder.
    :return: None (the tally is printed).
    """
    overview = []
    global_settings.verbose = False
    source = os.path.join(global_settings.pickle_folder, f'taxid{taxid}')
    numeric_keys = ('PDB_BEG', 'PDB_END', 'RES_END', 'RES_BEG', 'SP_BEG',
                    'SP_END')
    # str().isdigit() does not like negatives, hence a regex; raw string so
    # that \d is not treated as an (invalid) string escape.
    signed_int = re.compile(r'(-?\d+)')
    for pf in os.listdir(source):
        p = ProteinCore().load(file=os.path.join(source, pf))
        for s in p.pdbs:
            if s.type != 'rcsb':
                continue
            v = []
            for detail in s._get_sifts():
                ## clean rows
                for k in numeric_keys:
                    value = detail[k]
                    # Test the value, not the key name: a None value used to
                    # fall through to the regex and raise TypeError.
                    if value is None or value == 'None':
                        detail[k] = None
                    elif not isinstance(value, int):
                        found = signed_int.search(value)
                        # py int is signed, so a leading minus is fine
                        detail[k] = int(found.group(1)) if found else None
                ## classify offset
                if detail['PDB_BEG'] is not None:  ##nice.
                    offset = detail['SP_BEG'] - detail['PDB_BEG']
                    v.append(
                        _offset_category(
                            offset,
                            detail['PDB_BEG'] == detail['RES_BEG'],
                            detail['SP_BEG']))
                elif detail['PDB_END'] is not None:
                    # derive the start from the end and the UniProt span
                    offset = detail['SP_BEG'] - (
                        detail['PDB_END'] -
                        (detail['SP_END'] - detail['SP_BEG']))
                    v.append(
                        _offset_category(
                            offset,
                            detail['PDB_END'] == detail['RES_END'],
                            detail['SP_BEG']))
                elif detail['SP_BEG'] == 1:
                    v.append('no-off-start')
                elif detail['RES_BEG'] == 1:
                    # This is problematic. This means that there are unresolved residues at the N & C termini.
                    # This can go either way.
                    v.append('RES1')
                else:
                    v.append('RESn')
            overview.append('+'.join(sorted(set(v))))
    print(Counter(overview).most_common())
def fix_offsets(file):
    """
    Recompute the chain definitions and offsets of every RCSB structure in a
    pickled protein and save the protein again.

    For each structure of type ``rcsb`` the SIFTS rows are cleaned (numeric
    fields coerced to ``int`` or ``None``, in place), an offset is derived
    per row, and ``chain_definitions`` / ``offsets`` are rebuilt from
    scratch. When the structure names a specific chain (not ``*``), its
    ``offset`` attribute is set from the matching chain definition. The
    protein is dumped only if it has any ``pdbs``.

    :param file: fullpath of the pickle to load.
    :return: report with one line per chain, tab separated:
        structure code, chain, UniProt accession, offset.
    """
    p = ProteinCore().load(file=file)
    lines = []
    numeric_keys = ('PDB_BEG', 'PDB_END', 'RES_END', 'RES_BEG', 'SP_BEG',
                    'SP_END')
    # str().isdigit() does not like negatives, hence a regex; raw string so
    # that \d is not treated as an (invalid) string escape.
    signed_int = re.compile(r'(-?\d+)')
    for s in p.pdbs:
        if s.type != 'rcsb':
            continue
        details = s._get_sifts()
        s.chain_definitions = []
        s.offsets = {}
        for detail in details:
            ## clean rows
            for k in numeric_keys:
                value = detail[k]
                # Test the value, not the key name: a None value used to
                # fall through to the regex and raise TypeError.
                if value is None or value == 'None':
                    detail[k] = None
                elif not isinstance(value, int):
                    found = signed_int.search(value)
                    # py int is signed, so a leading minus is fine
                    detail[k] = int(found.group(1)) if found else None
            ## get offset
            if detail['PDB_BEG'] is not None:  ##nice.
                offset = detail['SP_BEG'] - detail['PDB_BEG']
            elif detail['PDB_END'] is not None:
                # derive the start from the end and the UniProt span
                offset = detail['SP_BEG'] - (
                    detail['PDB_END'] - (detail['SP_END'] - detail['SP_BEG']))
            elif detail['SP_BEG']:
                try:
                    if detail['SP_PRIMARY'] == p.uniprot:
                        seq = p.sequence
                    else:
                        # chain belongs to a different protein: use its sequence
                        seq = ProteinCore(
                            uniprot=detail['SP_PRIMARY']).load().sequence
                    offset = s.get_offset_from_PDB(detail, seq)
                except (KeyboardInterrupt, SystemExit):
                    # never swallow a user interrupt or interpreter exit
                    raise
                except BaseException:  ## Pymol subclasses BaseException.
                    offset = 0
            else:
                offset = 0
            detail['offset'] = offset
            lines.append(
                f"{s.code}\t{detail['CHAIN']}\t{detail['SP_PRIMARY']}\t{offset}"
            )
            s.chain_definitions.append({
                'chain': detail['CHAIN'],
                'uniprot': detail['SP_PRIMARY'],
                'x': detail["SP_BEG"],
                'y': detail["SP_END"],
                'offset': offset,
                'range': f'{detail["SP_BEG"]}-{detail["SP_END"]}',
                'name': None,
                'description': None
            })
            s.offsets[detail['CHAIN']] = offset
        # Best effort: a structure without a chain attribute, or whose chain
        # has no SIFTS row, simply keeps whatever offset it already had.
        chain = getattr(s, 'chain', '*')
        if chain != '*':
            chosen = next(
                (d for d in s.chain_definitions if d['chain'] == chain), None)
            if chosen is not None:
                s.offset = chosen['offset']

    if p.pdbs:
        p.dump()
    return '\n'.join(lines)
def describe(uniprot):
    """
    Pretty-print the ``asdict()`` form of a human (taxid 9606) protein
    under a banner line.

    :param uniprot: UniProt accession, e.g. P62873 (gnb1) or P62879 (gnb2).
    :return: None
    """
    print('***************** DESCRIPTION *******************************')
    protein = ProteinCore(taxid='9606', uniprot=uniprot).load()
    as_mapping = protein.asdict()
    pprint(as_mapping)
Exemple #10
0
def get_data(uniprot: str) -> ProteinCore:
    """
    Build a ``ProteinCore`` for a human (taxid 9606) accession and return
    the result of calling ``load()`` on it.

    :param uniprot: UniProt accession of the protein.
    :return: whatever ``ProteinCore.load`` returns (annotated as ``ProteinCore``).
    """
    protein = ProteinCore(taxid=9606, uniprot=uniprot)
    return protein.load()
Exemple #11
0
    # Ad-hoc scratch area (the enclosing block header is not visible here).
    # Only the uncommented lines run: load one protein, print its feature
    # keys and one feature list, then exit. Commented lines are earlier
    # experiments that were left in place.
    # p = ProteinCore(taxid='9606', uniprot='P01112').load()
    # p = ProteinCore(taxid='3562', uniprot='Q8GT36').load()
    # print(sum(p.properties['kd'])/len(p))
    # print(sum(p.properties['Flex']) / len(p))
    # from create import message
    # download_swissmodel()
    # all_swiss(fx=hotfix_swiss)
    # message('Reswissed!')

    # fix_offsets('../protein-data/pickle/taxid9606/Q13586.p')

    # print('**************************************')
    taxid = 9606
    gene = 'P04637'  # TP53
    # describe(gene)
    p = ProteinCore(taxid=taxid,
                    uniprot=gene).load()  # gnb1 P62873 gnb2 P62879
    # show which feature groups exist, then the 'PSP_modified_residues' one
    print(p.features.keys())
    print(p.features['PSP_modified_residues'])

    # stop here; everything below in this view is disabled scratch code
    exit()
    # #pprint(p.__dict__)
    # t = [s for s in p.pdbs if s.code.lower() == '1a1u'][0]
    # print(str(t))
    # print(t.offset)
    # print(t.chain_definitions)
    # sifts = t._get_sifts()
    # print(sifts)
    # print(t.get_offset_from_PDB(sifts[0], p.sequence))

    # p.parse_uniprot()
    # p.parse_swissmodel()