Example #1
0
def save(aclist, dbnam, fnotSave=None):
    """Copy the proteins named in ``aclist`` into the ``dbnam`` collection.

    Each accession is looked up with ``queryProtein`` through the
    ``DataOperation('uniprot', 'seqtmppi_positive')`` handle and the returned
    document is upserted through ``DataOperation('uniprot', dbnam)`` using the
    filter ``{'accession': ac}``.

    :param aclist: iterable of accessions, e.g. ``['P06685', 'P06686']``
    :param dbnam: name handed to the target ``DataOperation``,
        e.g. ``'seqtmppi_test1'``
    :param fnotSave: optional path; when truthy, the returned list is also
        written there with ``saveList``
    :return: accessions whose upsert reported ``matched_count != 0``.
        Accessions for which ``queryProtein`` returned ``None`` are only
        printed; they are NOT part of this list.
    """
    do_all = DataOperation('uniprot', 'seqtmppi_positive')
    do_new = DataOperation('uniprot', dbnam)
    notsaveList = []
    for ac in aclist:
        protein = queryProtein(ac, do_all)
        if protein is None:
            # Nothing to copy for this accession.
            print('not save protein %s' % ac)
            continue
        result = do_new.UpSertOne({'accession': ac}, protein)
        # NOTE(review): presumably a pymongo UpdateResult, where a non-zero
        # matched_count means the filter hit an existing document - confirm.
        if result.matched_count != 0:
            print('not save %s' % ac)
            notsaveList.append(ac)
    print('not save %d' % len(notsaveList))
    if fnotSave:
        saveList(notsaveList, fnotSave)
    return notsaveList
Example #2
0
def pairToproteinList(fin,fout=''):
    """
    Get the list of distinct proteins that occur in a protein-pair file.

    Every non-empty line of ``fin`` must hold at least two tab-separated
    columns; the first two columns are the pair. Proteins are returned in
    order of first appearance.

    :param fin: path of the tab-separated pair file
    :param fout: optional path; when truthy the list is written there with
        ``saveList``
    :return: list of unique protein identifiers
    :raises IndexError: if a non-empty line has fewer than two columns
    """
    proteinlist = []
    seen = set()  # O(1) membership test; proteinlist keeps the order
    with open(fin, 'r') as fi:
        for line in fi:
            # Strip only the line terminator. Blindly dropping the last
            # character truncates the ID when the final line has no newline.
            line = line.rstrip('\r\n')
            if not line:
                continue  # tolerate blank lines
            pair = line.split('\t')
            for protein in (pair[0], pair[1]):
                if protein not in seen:
                    seen.add(protein)
                    proteinlist.append(protein)
    if fout:
        saveList(proteinlist, fout)
    print('pairToproteinList end')
    return proteinlist
Example #3
0
def generateSomeSpeciesLists(ftmp, fnontmp, species='MYCPN'):
    """Build the KW-0812 / non-KW-0812 accession lists for one species.

    Both queries run against ``DataOperation('uniprot', 'uniprot_sprot')`` and
    restrict ``_id`` with ``{'$regex': species}``.

    * "tmp" list: documents whose ``keyword.@id`` equals ``'KW-0812'``
    * "nontmp" list: documents that have a ``keyword`` field and whose
      ``keyword.@id`` is not ``'KW-0812'``

    NOTE(review): the nontmp query previously hard-coded ``'HUMAN'`` and
    ignored ``species``; it now uses ``species`` like the tmp query.

    :param ftmp: path the tmp list is written to with ``saveList``
    :param fnontmp: path the nontmp list is written to with ``saveList``
    :param species: regular expression matched against ``_id``
        (presumably the species suffix of the entry name - confirm)
    :return: ``(tmplist, nontmplist)``
    """

    def first_accession(doc):
        # 'accession' may be a single value or a list; keep the first one.
        accession = doc['accession']
        return accession[0] if isinstance(accession, list) else accession

    do = DataOperation('uniprot', 'uniprot_sprot')
    projection = {'_id': False, 'accession': True}

    docs = do.QueryObj(
        {
            "keyword.@id": 'KW-0812',
            '_id': {'$regex': species},
        },
        projection=projection)
    tmplist = [first_accession(x) for x in docs]

    docs = do.QueryObj(
        {
            "keyword.@id": {'$ne': 'KW-0812'},
            'keyword': {'$exists': True},
            '_id': {'$regex': species},
        },
        projection=projection)
    nontmplist = [first_accession(x) for x in docs]

    print('query %d tmp and %d nontmp' % (len(tmplist), len(nontmplist)))
    saveList(tmplist, ftmp)
    saveList(nontmplist, fnontmp)
    return tmplist, nontmplist
Example #4
0
def queryPathway_Gene(foutPathway_Gene,
                      fpathway=None,
                      fpathwayInfo=None,
                      hsa='hsa',
                      start_idx=65):
    '''
    Append the genes of every pathway to ``foutPathway_Gene``.

    The pathway ids are read from ``fpathway`` with ``readIDlist`` when that
    file exists; otherwise they are fetched with ``queryAllPathway``. For each
    pathway the tuples yielded by ``extractGeneFromPathway`` are written with
    ``saveList(..., file_mode='a')``, one call per pathway.

    :param foutPathway_Gene: output path; each saved row is the tuple
        0     hsa00010  pathway id
        1         3101  gene id
        2          HK3  gene symbol
        3     [K00844]  KO id
        4    [2.7.1.1]  EC number
    :param fpathway: optional path of a pathway-id list; also forwarded to
        ``queryAllPathway`` when the file does not exist
    :param fpathwayInfo: forwarded to ``queryAllPathway``
    :param hsa: forwarded to ``queryAllPathway``
    :param start_idx: pathways with an index below this value are skipped.
        The default 65 preserves the previously hard-coded offset; pass 0 to
        process every pathway.
    :return: None
    '''
    # os.access(None, ...) raises TypeError, so guard the default fpathway.
    if fpathway is not None and os.access(fpathway, os.F_OK):
        repair_pathways = readIDlist(fpathway)
    else:
        repair_pathways = queryAllPathway(fpathway=fpathway,
                                          fpathwayInfo=fpathwayInfo,
                                          hsa=hsa)
    for idx, pathway in enumerate(repair_pathways):
        print(idx, end='.')  # progress indicator, printed for skipped ones too
        if idx < start_idx:
            continue
        geneID_geneName_KO_EC = [
            (pathway, gene_id, gene_symbol, KO, EC)
            for gene_id, gene_symbol, KO, EC in extractGeneFromPathway(pathway)
        ]
        saveList(geneID_geneName_KO_EC, foutPathway_Gene, file_mode='a')
Example #5
0
def queryKid_pathway(fTmpKEGGCount, outListFile):
    """Look up pathways for every row of a table and append them to a file.

    ``fTmpKEGGCount`` is read with ``pd.read_table(..., header=None)``. For
    each row, column 0 is passed to ``queryPathwayByKid``; column 1 is only
    echoed in the progress message. A result equal to ``[]`` is skipped;
    anything else is appended to ``outListFile`` via
    ``saveList(..., file_mode='a')``.

    :param fTmpKEGGCount: path of the header-less input table
    :param outListFile: path the results are appended to
    :return: None
    """
    table = pd.read_table(fTmpKEGGCount, header=None)
    for row_idx, row in table.iterrows():
        print(row_idx, 'query for ', row[0], row[1])
        pathways = queryPathwayByKid(row[0])
        if pathways == []:
            # nothing found for this id
            continue
        saveList(pathways, outListFile, file_mode='a')
Example #6
0
def queryAllPathway(fpathway=None, fpathwayInfo=None, hsa='hsa'):
    """Fetch the pathway list returned by ``REST.kegg_list("pathway", hsa)``.

    The response is parsed line by line as ``entry<TAB>description``. A
    leading ``prefix:`` on the entry (e.g. ``path:hsa00010``) is removed when
    present; entries without a prefix are kept unchanged. Blank lines are
    ignored, and a line without a tab gets an empty description.

    :param fpathway: optional path; when truthy the entry ids are written
        there with ``saveList``
    :param fpathwayInfo: optional path; when truthy the
        ``(entry, description)`` tuples are written there with ``saveList``
    :param hsa: organism code passed to ``REST.kegg_list``
    :return: list of pathway entry ids
    """
    listing = REST.kegg_list("pathway", hsa).read()
    repair_pathways = []
    repair_pathways_info = []
    for line in listing.rstrip().split("\n"):
        if not line.strip():
            # An empty response yields a single empty "line".
            continue
        entry, _, description = line.partition("\t")
        # Tolerate both 'path:hsa00010' and a bare 'hsa00010'.
        entry = entry.split(':', 1)[-1]
        repair_pathways.append(entry)
        repair_pathways_info.append((entry, description))
    if fpathway:
        saveList(repair_pathways, fpathway)
    if fpathwayInfo:
        saveList(repair_pathways_info, fpathwayInfo)
    return repair_pathways
Example #7
0
def findKeyProtein(fin, fout, keyword):
    """Keep the accessions from ``fin`` that carry a given keyword id.

    Each accession read with ``readIDlist(fin)`` is queried against
    ``DataOperation('uniprot', 'uniprot_sprot')`` with the filter
    ``{'accession': ac, 'keyword.@id': keyword}``. The accession is recorded
    once per returned document (so it repeats if several documents match),
    and every returned document is printed.

    :param fin: path of the accession list
    :param fout: path the matching accessions are written to with ``saveList``
    :param keyword: keyword id to match, e.g. ``'KW-0297'``
    :return: None
    """
    candidates = readIDlist(fin)
    matched = []
    do = DataOperation('uniprot', 'uniprot_sprot')
    id_only = {'_id': True}
    for ac in candidates:
        cursor = do.QueryObj({'accession': ac, "keyword.@id": keyword},
                             projection=id_only)
        for doc in cursor:
            matched.append(ac)
            print(doc)
    saveList(matched, fout)
Example #8
0
def getALlGprotein(fout):
    """Save every protein whose full name looks like a G protein.

    Queries ``DataOperation('uniprot', 'uniprot_sprot')`` for documents whose
    ``protein.recommendedName.fullName`` or
    ``protein.alternativeName.fullName`` matches the regex ``'G protein '`` or
    ``'Guanine nucleotide-binding protein'``. For each hit a tuple
    ``(_id, names)`` is printed and collected, where ``names`` is the list
    produced by ``handleSubcelluarLeaf`` over the ``protein`` sub-document.

    :param fout: path the collected tuples are written to with ``saveList``
    :return: None
    """
    name_fields = (
        'protein.recommendedName.fullName',
        'protein.alternativeName.fullName',
    )
    patterns = ('G protein ', 'Guanine nucleotide-binding protein')

    GPCRlist = []
    do = DataOperation('uniprot', 'uniprot_sprot')
    projection = {'_id': True}
    projection.update((field, True) for field in name_fields)
    # Equivalent Mongo shell filter:
    # {$or: [{<field>: {$regex: <pattern>}}, ...]} for every field/pattern pair
    dic = {
        '$or': [{field: {'$regex': pattern}}
                for field in name_fields
                for pattern in patterns]
    }
    result = do.QueryObj(dic, projection=projection)
    for r in result:
        # Build the entry once and reuse it for both the list and the log.
        entry = (r['_id'],
                 list(handleSubcelluarLeaf(
                     r['protein'],
                     keys=['alternativeName', 'recommendedName', 'fullName'])))
        GPCRlist.append(entry)
        print(entry)
    saveList(GPCRlist, fout)
Example #9
0
def getallProtein(fout=None):
    '''
    Collect one accession per document returned by
    ``DataOperation('DrugKB', 'protein').GetALL``.

    When a document's ``accession`` field is a list, only its first element
    is kept; otherwise the value is used as is.

    :param fout: optional path; when truthy the list is written there with
        ``saveList``
    :return: list of accessions, one per document
    '''
    do = DataOperation('DrugKB', 'protein')
    projection = {'_id': True, 'accession': True}
    # NOTE(review): limit=0 presumably means "no limit" - confirm in GetALL.
    docs = do.GetALL(projection=projection, limit=0)
    proteinlist = [
        x['accession'][0]
        if isinstance(x['accession'], list) else x['accession'] for x in docs
    ]
    if fout:
        saveList(proteinlist, fout)
    return proteinlist
Example #10
0
def findGProtein(fin, fout):
    """Keep the accessions from ``fin`` whose full name looks like a G protein.

    Each accession read with ``readIDlist(fin)`` is queried against
    ``DataOperation('uniprot', 'uniprot_sprot')``. A document matches when its
    ``accession`` equals the id and its recommended or alternative full name
    matches the regex ``'G protein '`` or
    ``'Guanine nucleotide-binding protein'``. The accession is recorded once
    per returned document, and a running hit number is printed with each
    document.

    :param fin: path of the accession list
    :param fout: path the matching accessions are written to with ``saveList``
    :return: None
    """
    name_fields = ("protein.recommendedName.fullName",
                   "protein.alternativeName.fullName")
    patterns = ('G protein ', 'Guanine nucleotide-binding protein')

    candidates = readIDlist(fin)
    hits = []
    do = DataOperation('uniprot', 'uniprot_sprot')
    id_only = {'_id': True}
    for ac in candidates:
        # A fresh filter is built for every accession.
        name_clauses = [{field: {'$regex': pattern}}
                        for field in name_fields
                        for pattern in patterns]
        query = {'accession': ac, '$or': name_clauses}
        for doc in do.QueryObj(query, projection=id_only):
            hits.append(ac)
            # len(hits) is the running number of matches so far
            print(len(hits), doc)
    saveList(hits, fout)
Example #11
0
def generateCriterLists(ftmp, fnontmp):
    """Build the KW-0812 / non-KW-0812 accession lists and save both.

    Both queries run against ``DataOperation('uniprot', 'uniprot_sprot')`` and
    require ``comment.subcellularLocation.location.#text`` to exist.

    * "tmp" list: documents whose ``keyword.@id`` equals ``'KW-0812'``
    * "nontmp" list: documents that have a ``keyword`` field and whose
      ``keyword.@id`` is not ``'KW-0812'``

    For a document whose ``accession`` is a list only the first element is
    kept.

    :param ftmp: path the tmp list is written to with ``saveList``
    :param fnontmp: path the nontmp list is written to with ``saveList``
    :return: ``(tmplist, nontmplist)``
    """

    def first_accession(doc):
        accession = doc['accession']
        if isinstance(accession, list):
            return accession[0]
        return accession

    location_key = 'comment.subcellularLocation.location.#text'
    do = DataOperation('uniprot', 'uniprot_sprot')
    projection = {'_id': False, 'accession': True}

    tmp_query = {
        "keyword.@id": 'KW-0812',
        location_key: {'$exists': True},
    }
    tmplist = [
        first_accession(doc)
        for doc in do.QueryObj(tmp_query, projection=projection)
    ]

    nontmp_query = {
        "keyword.@id": {'$ne': 'KW-0812'},
        'keyword': {'$exists': True},
        location_key: {'$exists': True},
    }
    nontmplist = [
        first_accession(doc)
        for doc in do.QueryObj(nontmp_query, projection=projection)
    ]

    print('query %d tmp and %d nontmp' % (len(tmplist), len(nontmplist)))
    saveList(tmplist, ftmp)
    saveList(nontmplist, fnontmp)
    return tmplist, nontmplist
Example #12
0
def writeProtins(fout):
    """Write the protein accession list to ``fout``.

    Delegates to ``getallProtein(fout)``, which queries the accessions and
    saves them to ``fout`` with ``saveList``.

    NOTE(review): the previous body called ``getallProtein()`` without the
    ``fout`` argument, accumulated a flattened ``proteins`` list that was
    never used, and then queried a second time. If a different, no-argument
    ``getallProtein`` was intended, revisit this.

    :param fout: path the accession list is written to
    :return: None
    """
    getallProtein(fout)