Beispiel #1
0
def generateSomeSpeciesLists(ftmp, fnontmp, species='MYCPN'):
    """Collect accessions with and without keyword KW-0812 and save both lists.

    Two queries run against DataOperation('uniprot', 'uniprot_sprot'):

    * the "tmp" list: entries whose keyword id is 'KW-0812' and whose
      ``_id`` matches the regex ``species``;
    * the "nontmp" list: entries that have a ``keyword`` field, whose
      keyword id is not 'KW-0812', and whose ``_id`` matches 'HUMAN'.

    :param ftmp: destination handed to saveList for the tmp accessions
    :param fnontmp: destination handed to saveList for the nontmp accessions
    :param species: regex applied to ``_id`` in the tmp query only
    :return: tuple ``(tmplist, nontmplist)``
    """

    def primary_accession(doc):
        # 'accession' may be a single value or a list; keep the first one.
        if isinstance(doc['accession'], list):
            return doc['accession'][0]
        return doc['accession']

    do = DataOperation('uniprot', 'uniprot_sprot')
    fields = {'_id': False, 'accession': True}

    tmp_query = {
        "keyword.@id": 'KW-0812',
        '_id': {'$regex': species},
    }
    tmplist = []
    for doc in do.QueryObj(tmp_query, projection=fields):
        tmplist.append(primary_accession(doc))

    # NOTE(review): this pattern is fixed to 'HUMAN' and ignores `species`
    # -- confirm that is intended.
    nontmp_query = {
        "keyword.@id": {'$ne': 'KW-0812'},
        'keyword': {'$exists': True},
        '_id': {'$regex': 'HUMAN'},
    }
    nontmplist = []
    for doc in do.QueryObj(nontmp_query, projection=fields):
        nontmplist.append(primary_accession(doc))

    print('query %d tmp and %d nontmp' % (len(tmplist), len(nontmplist)))
    saveList(tmplist, ftmp)
    saveList(nontmplist, fnontmp)
    return tmplist, nontmplist
Beispiel #2
0
def save(aclist, dbnam, fnotSave=None):
    """Copy proteins from 'seqtmppi_positive' into the collection ``dbnam``.

    Every accession is looked up with queryProtein in
    DataOperation('uniprot', 'seqtmppi_positive') and upserted into
    DataOperation('uniprot', dbnam) keyed on ``{'accession': ac}``.
    An accession whose upsert reports ``matched_count != 0`` is reported as
    "not saved" and collected.

    :param aclist: iterable of accessions, e.g. ['P06685', 'P06686']
    :param dbnam: name of the target collection, e.g. 'seqtmppi_test1'
    :param fnotSave: optional destination passed to saveList for the
        accessions reported as not saved
    :return: list of accessions whose upsert matched an existing document
        (accessions that could not be found at all are only printed)
    """
    do_all = DataOperation('uniprot', 'seqtmppi_positive')
    do_new = DataOperation('uniprot', dbnam)
    notsaveList = []
    for ac in aclist:
        protein = queryProtein(ac, do_all)
        if protein is None:
            # Nothing to copy for this accession.
            print('not save protein %s' % ac)
            continue
        result = do_new.UpSertOne({'accession': ac}, protein)
        if result.matched_count != 0:
            print('not save %s' % ac)
            notsaveList.append(ac)
    print('not save %d' % len(notsaveList))
    if fnotSave:
        saveList(notsaveList, fnotSave)
    return notsaveList
Beispiel #3
0
def generateCoQLists(fcoQ):
    """Save and return the accessions of all entries with keyword KW-0830.

    :param fcoQ: destination handed to saveList
    :return: list with one accession per matching entry (the first one when
        the entry stores a list of accessions)
    """
    do = DataOperation('uniprot', 'uniprot_sprot')
    fields = {'_id': True, 'accession': True}
    coQlist = []
    for doc in do.QueryObj({"keyword.@id": 'KW-0830'}, projection=fields):
        if isinstance(doc['accession'], list):
            coQlist.append(doc['accession'][0])
        else:
            coQlist.append(doc['accession'])
    print('query %d coQ' % (len(coQlist)))
    saveList(coQlist, fcoQ)
    return coQlist
Beispiel #4
0
def generateCoQInfo(finfo):
    """Save and return (accession, _id suffix) pairs for keyword KW-0830.

    The second element of each pair is the part of ``_id`` after the first
    underscore (``_id.split('_')[1]``).

    :param finfo: destination handed to saveList
    :return: list of ``(accession, suffix)`` tuples
    """
    do = DataOperation('uniprot', 'uniprot_sprot')
    fields = {'_id': True, 'accession': True}
    coQinfo = []
    for doc in do.QueryObj({"keyword.@id": 'KW-0830'}, projection=fields):
        if isinstance(doc['accession'], list):
            # Several accessions stored for the entry: keep the first.
            pair = (doc['accession'][0], doc['_id'].split('_')[1])
        else:
            pair = (doc['accession'], doc['_id'].split('_')[1])
        coQinfo.append(pair)
    print('query %d coQ info' % (len(coQinfo)))
    saveList(coQinfo, finfo)
    return coQinfo
Beispiel #5
0
def findKeyProtein(fin, fout, keyword):
    """Filter an accession list down to entries carrying ``keyword``.

    :param fin: file read with readIDlist, yielding accessions
    :param fout: destination handed to saveList for the matching accessions
    :param keyword: keyword id to require, e.g. 'KW-0297'
    :return: None
    """
    candidates = readIDlist(fin)
    matches = []
    do = DataOperation('uniprot', 'uniprot_sprot')
    only_id = {'_id': True}
    for accession in candidates:
        criteria = {'accession': accession, "keyword.@id": keyword}
        # An accession is recorded once per document the query returns.
        for hit in do.QueryObj(criteria, projection=only_id):
            matches.append(accession)
            print(hit)
    saveList(matches, fout)
Beispiel #6
0
def getALlGprotein(fout):
    """Save every entry whose protein name looks like a G protein.

    An entry matches when its recommended or alternative full name matches
    the regex 'G protein ' or 'Guanine nucleotide-binding protein'. For each
    match a tuple ``(_id, names)`` is printed and collected, where ``names``
    is the list produced by handleSubcelluarLeaf over the entry's
    ``protein`` sub-document.

    :param fout: destination handed to saveList
    :return: None
    """
    name_fields = (
        'protein.recommendedName.fullName',
        'protein.alternativeName.fullName',
    )
    patterns = ('G protein ', 'Guanine nucleotide-binding protein')

    do = DataOperation('uniprot', 'uniprot_sprot')
    projection = {'_id': True}
    projection.update((field, True) for field in name_fields)
    # Any name field matching any pattern qualifies the entry.
    dic = {
        '$or': [{field: {'$regex': pattern}}
                for field in name_fields
                for pattern in patterns]
    }

    GPCRlist = []
    for r in do.QueryObj(dic, projection=projection):
        # Walk the names once and reuse the result for storing and printing.
        names = list(handleSubcelluarLeaf(
            r['protein'],
            keys=['alternativeName', 'recommendedName', 'fullName']))
        entry = (r['_id'], names)
        GPCRlist.append(entry)
        print(entry)
    saveList(GPCRlist, fout)
Beispiel #7
0
def getallProtein():
    """Yield every identifier stored in the 'UNIPROID' field of the target table.

    All records of DataOperation(db_name, table_target) are read; records
    with an empty 'UNIPROID' are skipped, the others are split with
    multiSplit and each piece is yielded with surrounding whitespace removed.

    :return: generator of stripped identifier strings
    """
    do = DataOperation(db_name, table_target)
    fields = {'_id': True, 'UNIPROID': True}
    for record in do.GetALL(projection=fields, limit=0):
        if record['UNIPROID'] != '':
            for piece in multiSplit(record['UNIPROID']):
                yield piece.strip()
Beispiel #8
0
def funcPfam(x):
    """Fill positions 1 and 2 of ``x`` with the top Pfam hit for ``x[0]``.

    :param x: indexable, mutable row whose first item is an accession
        (e.g. 'P03372'); it is modified in place
    :return: the same ``x`` object
    """
    uniprot = DataOperation('uniprot', 'uniprot_sprot')
    top_hit = queryPfam(x[0], uniprot, tophit=True)
    for offset in (0, 1):
        # result[0] -> x[1], result[1] -> x[2]
        x[offset + 1] = top_hit[offset]
    return x
Beispiel #9
0
def getPairInfo_TMP_nonTMP(fin, fout, sep='\t', checkTMP=True, keepOne=False):
    """Write tab-separated protein details for every pair listed in ``fin``.

    Each pair read through getPairs is printed and passed to getTaN; pairs
    for which getTaN returns None are skipped. For the others, the values of
    ensomblePortein(result[0]) followed by the values of
    ensomblePortein(result[1]) are written on one line. Every value is
    followed by a tab (so each line ends with a tab before the newline) and
    the file is flushed after each line.

    :param fin: pair file without a title line, one pair per line, e.g.
        ``Q7BCK4<TAB>B6JN06``
    :param fout: output path. Per the original notes each protein
        contributes the fields ['accession', 'name', 'length', 'noX',
        'inlenRange', 'subcellularLocations', 'seq'], e.g. (sequences
        abbreviated, <TAB> marks a tab character)
        ``Q7BCK4<TAB>ICSA_SHIFL<TAB>1102<TAB>True<TAB>True<TAB>['Periplasm', ...]<TAB>MNQIHKFF...<TAB>B6JN06<TAB>G6PI_HELP2<TAB>545<TAB>True<TAB>True<TAB>['Cytoplasm']<TAB>MLTQLKTY...``
    :param sep: column separator of ``fin``, forwarded to getPairs
    :param checkTMP: forwarded to getTaN
    :param keepOne: forwarded to getTaN
    :return: None

    Example::

        fin = 'file/_1pair.txt'
        fout = 'file/_2pair_info.txt'
        getPairInfo_TMP_nonTMP(fin, fout)
    """
    do = DataOperation('uniprot', 'uniprot_sprot')
    with open(fout, 'w') as fo:
        for pa, pb in getPairs(fin, sep=sep, title=False):
            print('%s\t%s' % (pa, pb))
            result = getTaN(pa, pb, do, checkTMP=checkTMP, keepOne=keepOne)
            if result is None:
                continue
            tmp = ensomblePortein(result[0])
            nontmp = ensomblePortein(result[1])
            # Keep the historical layout: a tab after every value,
            # including the last one.
            cells = [str(v) + '\t' for v in tmp.values()]
            cells.extend(str(v) + '\t' for v in nontmp.values())
            fo.write(''.join(cells) + '\n')
            fo.flush()
Beispiel #10
0
def getSingleInfo(fin, fout, fin_type='single', col=(0, 1)):
    """Write one tab-separated info line per protein found in ``fin``.

    :param fin: input file; either a list of accessions
        (``fin_type='single'``, read with readIDlist) or a headerless
        table from which the columns ``col`` are taken and flattened into a
        set of accessions (``fin_type='pair'``)
    :param fout: output path; for every accepted protein the values of
        ensomblePortein are written, each followed by a tab
    :param fin_type: 'single' or 'pair'
    :param col: column labels to read when ``fin_type='pair'``
    :return: None
    :raises ValueError: if ``fin_type`` is neither 'single' nor 'pair'
    """
    # Validate before touching the database or truncating the output file.
    if fin_type not in ('pair', 'single'):
        raise ValueError(
            "fin_type must be 'single' or 'pair', got %r" % (fin_type,))

    if fin_type == 'pair':
        # list(col): a list selects several columns; a tuple would be
        # treated as one key.
        df = pd.read_table(fin, header=None)[list(col)]
        dat = df.to_numpy().reshape(1, -1)
        proteins = set(dat[0])
    else:
        proteins = readIDlist(fin)

    do = DataOperation('uniprot', 'uniprot_sprot')
    projection = {
        '_id': True,
        'sequence.@length': True,
        'sequence.#text': True,
        'keyword.@id': True,
        'comment.subcellularLocation.location': True
    }
    prod = Protein()
    with open(fout, 'w') as fo:
        for AC in proteins:
            pro = queryProtein(AC, do, projection=projection)
            if not pro:
                # Accession not present in the collection: nothing to write.
                continue
            pro['accession'] = AC
            # Skip sequences rejected by checkProtein (presumably a length
            # window of 50..2000 and a residue check -- confirm in
            # ProteinDealer).
            if not prod.checkProtein(
                    pro['sequence']['#text'], 50, 2000, uncomm=True):
                continue
            proinfo = ensomblePortein(pro)
            fo.write(''.join(str(v) + '\t' for v in proinfo.values()) + '\n')
            fo.flush()
Beispiel #11
0
def getallProtein(fout):
    """Save one accession for every record of the 'DrugKB'/'protein' collection.

    :param fout: destination handed to saveList
    :return: None
    """
    do = DataOperation('DrugKB', 'protein')
    fields = {'_id': True, 'accession': True}
    proteinlist = []
    # A record may hold several accessions; only the first is kept.
    for record in do.GetALL(projection=fields, limit=0):
        if isinstance(record['accession'], list):
            proteinlist.append(record['accession'][0])
        else:
            proteinlist.append(record['accession'])
    saveList(proteinlist, fout)
Beispiel #12
0
def getPairTag(fin, f1pairWithTag_info, sep='\t'):
    """Write protein details plus a pair tag for every pair listed in ``fin``.

    Each pair is printed and passed to tagPair; pairs for which tagPair
    returns None are skipped. The output line holds the values of
    ensomblePortein(result[0]) and ensomblePortein(result[1]), each followed
    by a tab, and finally ``result[2]`` (the tag).

    Tag values, as listed in the original notes:
        0 -- TMP_SP
        1 -- TMP_TMP
        2 -- SP_SP

    :param fin: pair file without a title line
    :param f1pairWithTag_info: output path
    :param sep: column separator of ``fin``, forwarded to getPairs
    :return: None
    """
    do = DataOperation('uniprot', 'uniprot_sprot')
    with open(f1pairWithTag_info, 'w') as fo:
        for pa, pb in getPairs(fin, sep=sep, title=False):
            print('%s\t%s' % (pa, pb))
            result = tagPair(pa, pb, do)
            if result is None:
                continue
            proA = ensomblePortein(result[0])
            proB = ensomblePortein(result[1])
            cells = [str(v) + '\t' for v in proA.values()]
            cells.extend(str(v) + '\t' for v in proB.values())
            # The tag is the last column and is not followed by a tab.
            fo.write(''.join(cells) + str(result[2]) + '\n')
            fo.flush()
Beispiel #13
0
def getTmp(fin, fout):
    """Write the accessions from ``fin`` that checkTaN flags, one per line.

    :param fin: protein list, read with readIDlist
    :param fout: output path for the flagged ("TMP") accessions
    :return: None
    """
    do = DataOperation('uniprot', 'uniprot_sprot')
    projection = {'_id': True, 'keyword.@id': True}
    aclist = readIDlist(fin)
    with open(fout, 'w') as fo:
        for AC in aclist:
            qa = do.QueryOne('accession', AC, projection=projection)
            if qa is None:
                continue
            # Entries without any keyword cannot be flagged; guard the
            # lookup so they are skipped instead of raising KeyError.
            pa_TMP_flag = 'keyword' in qa and checkTaN(qa['keyword'])
            if pa_TMP_flag:
                print('%s TMP' % AC)
                fo.write('%s\n' % AC)
                fo.flush()
Beispiel #14
0
 def test_queryProtein(self):
     """Query accession P34397 and run ensomblePortein on the last entry returned."""
     do = DataOperation('uniprot', 'uniprot_sprot')
     AC = 'P34397'
     protein = None
     # One accession may map to several protein sequences.
     fields = dict.fromkeys([
         '_id',
         'sequence.@length',
         'sequence.#text',
         'keyword.@id',
         'comment.subcellularLocation.location',
     ], True)
     matches = do.Query('accession', AC, projection=fields)
     for seen, doc in enumerate(matches, start=1):
         if seen > 1:
             # The accession matched more than one protein.
             # (original note: save this list)
             print('%s is more than one entry' % AC)
         protein = doc
     protein['accession'] = AC
     ensomblePortein(protein)
Beispiel #15
0
def findGProtein(fin, fout):
    """Filter an accession list down to entries named like a G protein.

    An accession is kept when its entry's recommended or alternative full
    name matches the regex 'G protein ' or
    'Guanine nucleotide-binding protein'.

    :param fin: file read with readIDlist, yielding accessions
    :param fout: destination handed to saveList for the matching accessions
    :return: None
    """
    name_fields = ("protein.recommendedName.fullName",
                   "protein.alternativeName.fullName")
    patterns = ('G protein ', 'Guanine nucleotide-binding protein')

    tmplist = readIDlist(fin)
    GPCRlist = []
    do = DataOperation('uniprot', 'uniprot_sprot')
    only_id = {'_id': True}
    found = 0
    for ac in tmplist:
        # Same accession AND (any name field matches any pattern).
        criteria = {
            'accession': ac,
            '$or': [{field: {'$regex': pattern}}
                    for field in name_fields
                    for pattern in patterns],
        }
        for hit in do.QueryObj(criteria, projection=only_id):
            found += 1
            GPCRlist.append(ac)
            print(found, hit)
    saveList(GPCRlist, fout)
Beispiel #16
0
def generateCriterLists(ftmp, fnontmp):
    """Collect located entries with and without keyword KW-0812 and save both lists.

    Both queries require the field
    'comment.subcellularLocation.location.#text' to exist. The "tmp" list
    holds entries whose keyword id is 'KW-0812'; the "nontmp" list holds
    entries that have a ``keyword`` field whose id is not 'KW-0812'.

    :param ftmp: destination handed to saveList for the tmp accessions
    :param fnontmp: destination handed to saveList for the nontmp accessions
    :return: tuple ``(tmplist, nontmplist)``
    """

    def primary_accession(doc):
        # 'accession' may be a single value or a list; keep the first one.
        if isinstance(doc['accession'], list):
            return doc['accession'][0]
        return doc['accession']

    do = DataOperation('uniprot', 'uniprot_sprot')
    fields = {'_id': False, 'accession': True}
    location_key = 'comment.subcellularLocation.location.#text'

    tmp_query = {
        "keyword.@id": 'KW-0812',
        location_key: {'$exists': True},
    }
    tmplist = []
    for doc in do.QueryObj(tmp_query, projection=fields):
        tmplist.append(primary_accession(doc))

    nontmp_query = {
        "keyword.@id": {'$ne': 'KW-0812'},
        'keyword': {'$exists': True},
        location_key: {'$exists': True},
    }
    nontmplist = []
    for doc in do.QueryObj(nontmp_query, projection=fields):
        nontmplist.append(primary_accession(doc))

    print('query %d tmp and %d nontmp' % (len(tmplist), len(nontmplist)))
    saveList(tmplist, ftmp)
    saveList(nontmplist, fnontmp)
    return tmplist, nontmplist
Beispiel #17
0
def getFasta(fin_all_protein, fout):
    """Write the sequences of the listed proteins to ``fout`` in FASTA format.

    :param fin_all_protein: file read with readIDlist, yielding accessions
    :param fout: output FASTA path; each record uses the accession as id
        and an empty description
    :return: None
    """
    from Bio import SeqIO
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord

    proteins = readIDlist(fin_all_protein)
    do = DataOperation('uniprot', 'uniprot_sprot')
    projection = {'sequence.#text': True}

    records = []
    for AC in proteins:
        pro = queryProtein(AC, do, projection=projection)
        if not pro:
            # Accession not found: leave it out of the FASTA file.
            continue
        records.append(
            SeqRecord(Seq(pro['sequence']['#text']), id=AC, description=''))

    # Open the output once, after all queries succeeded, and hand the open
    # handle to SeqIO instead of letting it reopen the same path.
    with open(fout, 'w') as fo:
        SeqIO.write(records, fo, 'fasta')
Beispiel #18
0
from DatabaseOperation2 import DataOperation
from ProteinDealer import Protein
from dao import queryProtein, ensomblePortein


import time

if __name__ == '__main__':
    # Scratch driver: look up the Pfam cross references of one accession,
    # then assemble the summary info of another, and report the run time.
    print('start', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
    start = time.time()

    do = DataOperation('uniprot', 'uniprot_sprot')

    pa = 'Q14118'
    pfam_projection = {'dbReference.text': True, '_id': False}
    dic = {'accession': pa, 'dbReference.@type': 'pfam'}
    # The keyword is `projection`; QueryObj results are iterated document
    # by document rather than indexed like a single dict.
    qa = do.QueryObj(dic, projection=pfam_projection)
    dbReference = [q['dbReference'] for q in qa]

    projection = {'_id': True, 'sequence.@length': True, 'sequence.#text': True, 'keyword.@id': True,
                  'comment.subcellularLocation.location': True}
    AC = 'P82432'
    pro = queryProtein(AC, do, projection=projection)
    pro['accession'] = AC
    proinfo = ensomblePortein(pro)

    print('stop', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
    print('time', time.time() - start)
Beispiel #19
0
def proteinPfam(fin, fout, tophit=True, item='Pfam'):
    """Apply funcPfam to every row of a headerless CSV and save the result.

    :param fin: input CSV path, read without a header row
    :param fout: output path; written tab-separated without header or index
    :param tophit: forwarded to funcPfam
    :param item: forwarded to funcPfam
    :return: None
    """
    do = DataOperation('uniprot', 'uniprot_sprot')

    def annotate(row):
        # One shared DataOperation serves every row.
        return funcPfam(row, do, tophit=tophit, item=item)

    table = pd.DataFrame(pd.read_csv(fin, header=None))
    annotated = table.apply(annotate, axis=1)
    annotated.to_csv(fout, sep='\t', header=None, index=None)
Beispiel #20
0
    for q in qa:
        dbReference = q['dbReference']
        # {'@type': 'Pfam', '@id': 'PF18424', 'property': [{'@type': 'entry name', '@value': 'a_DG1_N2'}
        #  {'@type': 'Pfam', '@id': 'PF12743', 'property': [{'@type': 'entry name', '@value': 'ESR1_C'}, {'@type': 'match status', '@value': '1'}]}
        result = getPfam(dbReference)
        return result if not tophit else result[0]


def addProteinPfam():
    """Placeholder with no behaviour; always returns None."""
    return None


def funcPfam(x):
    """Return ``x`` unchanged (identity stub)."""
    unchanged = x
    return unchanged


if __name__ == '__main__':
    # Scratch driver: fetch one protein and assemble its summary info.
    do = DataOperation('uniprot', 'uniprot_sprot')
    projection = dict.fromkeys([
        '_id',
        'sequence.@length',
        'sequence.#text',
        'keyword.@id',
        'comment.subcellularLocation.location',
    ], True)
    AC = 'Q13206'
    pro = queryProtein(AC, do, projection=projection)
    pro['accession'] = AC
    proinfo = ensomblePortein(pro)