Beispiel #1
0
def insertSimcomp():
    dirpath = '/home/tor/robotics/prj/csipb-jamu-prj/dataset/kegg/kegg_20161010/simcomp'

    # SELECT com_id FROM `compound` where com_kegg_id!=''
    q = 'SELECT com_id,com_kegg_id FROM `compound` where com_kegg_id!=""'
    resp = util.mysqlCommit(db, cursor, q, True)

    kegg2ComIdMap = {}
    for i in resp:
        kegg2ComIdMap[i[1]] = i[0]

    #
    for filename in os.listdir(dirpath):
        if filename.endswith(".sim"):
            keggId = filename.split('_')[1].strip('.sim').strip()
            fpath = os.path.join(dirpath, filename)

            simcomp = []
            with open(fpath) as infile:
                for line in infile:
                    words = line.split('=')
                    words = [i.strip() for i in words]
                    keggId2 = words[0]
                    if keggId2 in kegg2ComIdMap.keys():
                        comId = kegg2ComIdMap[keggId2]
                        score = words[1]
                        simcomp.append(comId + ':' + keggId2 + '=' + score)

            if len(simcomp) != 0:
                simcompStr = '\n'.join(simcomp)
                simcompStr = '"' + simcompStr + '"'
                qf = 'UPDATE compound SET com_simcomp=' + simcompStr
                qr = ' WHERE com_kegg_id=' + '"' + keggId + '"'
                q = qf + qr
                util.mysqlCommit(db, cursor, q)
Beispiel #2
0
def main():
    # TODO get comId and proId from DB
    maxComId = 1000 #21994
    comList = range(1,maxComId+1)
    comList = ['COM'+str(i).zfill(8) for i in comList]

    maxProId = 3334
    proList = range(1,maxProId+1)
    proList = ['PRO'+str(i).zfill(8) for i in proList]

    #
    blm = BLM()

    #
    nCom = len(comList)
    nPro = len(proList)
    for i,c in enumerate(comList):
        for j,p in enumerate(proList):
            msg1 = 'search/predict for c= '+c+' ('+str(i+1)+'/'+str(nCom)+') '
            msg2 = ' vs p= '+p+' ('+str(j+1)+'/'+str(nPro)+') '
            msg = msg1+msg2
            print msg

            qf = 'SELECT source,timestamp FROM compound_vs_protein WHERE'
            qr = ' com_id='+'"'+c+'"'+' AND pro_id='+'"'+p+'"'
            q = qf+qr
            resp = util.mysqlCommit(db,cursor,q)

            needToPredict = True
            if resp!=None:
                src,stamp = resp
                if 'predictor' not in src:# then it is a fact
                    needToPredict = False

            if needToPredict:
                print 'predicting ...'
                # (re)predict
                # weight here is the confidence level that the link exists
                weight, srcs = blm.predict(c,p)

                # insert
                if weight > 0.0:
                    print 'inserting the prediction ...'
                    insertVals = [c,p,str(weight),srcs]
                    insertVals = ['"'+j+'"' for j in insertVals]
                    qf = '''INSERT INTO compound_vs_protein (com_id,pro_id,
                                                             weight,source) 
                            VALUES ('''
                    qm = ','.join(insertVals)
                    qr = ')'
                    q = qf+qm+qr
                    util.mysqlCommit(db,cursor,q)
def insertDrugVsProtein(drugProteinDict):
    idx = 0
    log = []
    logFpath = outDir + '/insertDrugVsProtein.log'
    src = 'drugbank.ca'
    weight = '1.0'
    for i, v in drugProteinDict.iteritems():
        idx += 1

        qf = 'SELECT com_id FROM compound WHERE com_drugbank_id ='
        qm = '"' + i + '"'
        qr = ''
        sql = qf + qm + qr
        comIdR = util.mysqlCommit(db, cursor, sql)

        pList = list(set(v['uniprotTargets']))
        for p in pList:
            msg = 'inserting ' + i + ' vs ' + p + ' idx= ' + str(
                idx) + ' of ' + str(len(drugProteinDict))
            print msg

            qf = 'SELECT pro_id FROM protein WHERE pro_uniprot_id ='
            qm = '"' + p + '"'
            qr = ''
            sql = qf + qm + qr
            proIdR = util.mysqlCommit(db, cursor, sql)

            if comIdR != None and proIdR != None:
                proId = proIdR[0]
                comId = comIdR[0]

                insertVals = [comId, proId, weight, src]
                insertVals = ['"' + j + '"' for j in insertVals]

                qf = '''INSERT INTO compound_vs_protein (com_id,pro_id,
                                                         weight,source) 
                        VALUES ('''
                qm = ','.join(insertVals)
                qr = ')'
                sql = qf + qm + qr
                util.mysqlCommit(db, cursor, sql)
            else:
                msg = 'FAIL: ' + msg
                print msg
                log.append(msg)

    with open(logFpath, 'w') as f:
        for l in log:
            f.write(str(l) + '\n')
Beispiel #4
0
def insertComFromDrugbank(comIdx):
    drugData = None
    fpath = '/home/tor/robotics/prj/csipb-jamu-prj/dataset/drugbank/drugbank_20161002/drugbank_drug_data_2016-10-05_10:16:42.860649.pkl'
    with open(fpath, 'rb') as handle:
        drugData = pickle.load(handle)

    for i, v in drugData.iteritems():
        if len(v['uniprotTargets']) == 0:
            continue

        comIdx += 1
        print 'inserting', i, 'comIdx=', str(comIdx), 'of at most', str(
            len(drugData))

        comId = str(comIdx)
        comId = comId.zfill(8)
        comId = 'COM' + comId
        comDrugbankId = i
        na = ''

        insertVals = []
        insertVals.append(comId)
        insertVals.append(comDrugbankId)

        insertKeys = [
            'CAS number', 'pubchemCid', 'InChI Key', 'SMILES',
            'com_knapsack_id', 'com_kegg_id', 'com_simcomp'
        ]
        for k in insertKeys:
            if k in v.keys():
                insertVals.append(v[k])
            else:
                insertVals.append(na)

        insertVals = ['' if len(i) == 0 else i for i in insertVals]
        insertVals = ['' if 'not-available' in i else i for i in insertVals]
        insertVals = ['"' + iv + '"' for iv in insertVals]
        assert len(insertVals) == 9

        qf = '''INSERT INTO compound (com_id, com_drugbank_id,
                                      com_cas_id, com_pubchem_id, com_inchikey,
                                      com_smiles, com_knapsack_id, com_kegg_id, com_simcomp)
             VALUES ('''
        qm = ','.join(insertVals)
        qr = ')'
        sql = qf + qm + qr
        util.mysqlCommit(db, cursor, sql)

    return comIdx
Beispiel #5
0
def insertComFromKnapsack(comIdx):
    plantCompoundDict = None
    fpath = '/home/tor/robotics/prj/csipb-jamu-prj/dataset/knapsack/20161003/knapsack_jsp_plant_vs_compound_2016-10-04_16:34:06.468234.pkl'
    with open(fpath, 'rb') as handle:
        plantCompoundDict = pickle.load(handle)

    compoundDict = {}
    for comList in plantCompoundDict.values():
        for kId, cas, name, form in comList:
            compoundDict[kId] = cas

    idx = 0
    nCom = len(compoundDict)
    matchList = []
    for k, cas in compoundDict.iteritems():
        idx += 1
        print 'insert/updating', k, 'idx=', str(idx), 'of', nCom

        if util.mysqlExist(db,
                           cursor,
                           table='compound',
                           where='com_cas_id=' + '"' + cas + '"'):
            # print 'match'
            matchList.append(cas)

            qf = 'UPDATE compound SET com_knapsack_id='
            qm = '"' + k + '"'
            qr = 'WHERE com_cas_id=' + '"' + cas + '"'
            q = qf + qm + qr
            util.mysqlCommit(db, cursor, q)
        else:
            comIdx += 1
            comIdStr = 'COM' + str(comIdx).zfill(8)

            insertVals = [comIdStr, cas, k]
            insertVals = ['"' + i + '"' for i in insertVals]

            qf = 'INSERT INTO compound (com_id,com_cas_id,com_knapsack_id) VALUES ('
            qm = ','.join(insertVals)
            qr = ')'
            q = qf + qm + qr
            util.mysqlCommit(db, cursor, q)

    fpath = outDir + '/knapsack_compound_match_with_drugbank.lst'
    with open(fpath, 'w') as f:
        for m in matchList:
            f.write(str(m) + '\n')

    return comIdx
Beispiel #6
0
def insertPlants(plantList):
    nPlant = len(plantList)
    for idx, p in enumerate(plantList):
        plaId = str(idx+1)
        plaId = plaId.zfill(8)
        plaId = '"'+'PLA'+plaId+'"'
        print 'inserting ', plaId, 'of', str(nPlant)

        plaName = '"'+p+'"'

        qf = 'INSERT INTO plant (pla_id,pla_name) VALUES ('
        qm = plaId+','+plaName
        qr = ')'
        q = qf+qm+qr
        util.mysqlCommit(q)
Beispiel #7
0
def insertProteinVsDisease(proteinDiseaseDict):
    src = 'uniprot.org'
    n = len(proteinDiseaseDict)
    log = []
    logFpath = dirPath + '/insertProteinVsDisease.log'
    idx = 0
    for p, v in proteinDiseaseDict.iteritems():
        idx += 1

        qf = 'SELECT pro_id FROM protein WHERE pro_uniprot_abbrv ='
        qm = '"' + p + '"'
        qr = ''
        q = qf + qm + qr
        proIdR = util.mysqlCommit(db, cursor, q)

        for d in v['disease']:
            msg = 'inserting ' + p + ' vs ' + d[1] + 'idx= ' + str(
                idx) + ' of ' + str(n)
            print msg

            qf = 'SELECT dis_id FROM disease WHERE dis_omim_id ='
            qm = '"' + d[2] + '"'
            qr = ''
            q = qf + qm + qr
            disIdR = util.mysqlCommit(db, cursor, q)

            if proIdR != None and disIdR != None:
                proId = proIdR[0]
                disId = disIdR[0]

                insertVals = [proId, disId, src]
                insertVals = ['"' + i + '"' for i in insertVals]

                qf = 'INSERT INTO protein_vs_disease (pro_id,dis_id,pro_vs_dis_source) VALUES ('
                qm = ','.join(insertVals)
                qr = ')'
                q = qf + qm + qr

                util.mysqlCommit(db, cursor, q)
            else:
                log.append('FAIL: ' + msg)

    with open(logFpath, 'w') as f:
        for i in log:
            f.write(str(i) + '\n')
Beispiel #8
0
def insertPlantVsCompound(plantCompoundDict):
    pc = plantCompoundDict
    src = 'knapsack.kanaya.naist.jp'
    log = []; logFpath = outDir+'/insertPlantVsCompound.log'

    n = len(pc); idx = 0; 
    for p,v in pc.iteritems():
        idx += 1

        qf = 'SELECT pla_id FROM plant WHERE pla_name ='
        qm = '"'+p+'"'
        qr = ''
        q = qf+qm+qr
        plaIdR = util.mysqlCommit(db,cursor,q); 

        comList = list( set([c[0] for c in v]) )
        for c in comList:
            msg = 'inserting '+ p+ ' vs '+ c+ ' idx= '+ str(idx)+ ' of '+ str(n)
            print msg

            qf = 'SELECT com_id FROM compound WHERE com_knapsack_id ='
            qm = '"' + c + '"'
            qr = ''
            q = qf+qm+qr
            comIdR = util.mysqlCommit(db,cursor,q); 

            if plaIdR!=None and comIdR!=None:
                plaId = plaIdR[0]
                comId = comIdR[0]

                insertVals = [plaId,comId,src]
                insertVals = ['"'+i+'"' for i in insertVals]

                qf = 'INSERT INTO plant_vs_compound (pla_id,com_id,source) VALUES ('
                qm = ','.join(insertVals)
                qr = ')'
                q = qf+qm+qr
                util.mysqlCommit(db,cursor,q)
            else:
                log.append('FAIL: '+msg)

    with open(logFpath,'w') as f:
        for i in log:
            f.write(str(i)+'\n')
Beispiel #9
0
def insertComFromKegg(comIdx):
    comDataDpath = '/home/tor/robotics/prj/csipb-jamu-prj/dataset/kegg/kegg_20161010/keggCom_20161010_1-100K'
    drugDataFpath = '/home/tor/robotics/prj/csipb-jamu-prj/dataset/kegg/kegg_20161010/keggdrug_data_2016-10-11_16:58:04.683546.pkl'

    # Load Kegg compound data
    data = {}
    for filename in os.listdir(comDataDpath):
        if filename.endswith(".pkl"):
            fpath = os.path.join(comDataDpath, filename)
            dPerFile = {}
            with open(fpath, 'rb') as handle:
                dPerFile = pickle.load(handle)

            for k, v in dPerFile.iteritems():
                if len(v) != 0:
                    data[k] = v

    sortedK = data.keys()
    sortedK.sort()
    fpath = outDir + '/keggComData_validComId.lst'
    with open(fpath, 'w') as f:
        for k in sortedK:
            f.write(str(k) + '\n')

    # Load Kegg drug data, to infer their drugbank equivalent
    drugData = None
    with open(drugDataFpath, 'rb') as handle:
        drugData = pickle.load(handle)

    # Update or Insert
    insertList = []

    n = len(data)
    idx = 0
    for keggId, d in data.iteritems():
        idx += 1
        print 'insert/update keggId=', keggId, 'idx=', idx, 'of', n

        knapsackId = ''
        if 'knapsackId' in d.keys():
            knapsackId = d['knapsackId']

        drugbankId = ''
        if 'keggDrugId' in d.keys():
            _ = drugData[d['keggDrugId']]['drugbankId']
            if len(_) != 0:
                drugbankId = _

        casId = ''
        if 'casId' in d.keys():
            casId = d['casId']

        insert = False

        if knapsackId != '':
            # check if exist
            if util.mysqlExist(db,
                               cursor,
                               table='compound',
                               where='com_knapsack_id=' + '"' + knapsackId +
                               '"'):
                # update based on knapsackID
                qf = 'UPDATE compound '
                qm = 'SET ' + 'com_kegg_id=' + '"' + keggId + '"'
                if casId != '':
                    qm = qm + ',' + ' com_cas_id=' + '"' + casId + '"'
                if drugbankId != '':
                    qm = qm + ',' + ' com_drugbank_id=' + '"' + drugbankId + '"'
                qr = ' WHERE com_knapsack_id=' + '"' + knapsackId + '"'
                q = qf + qm + qr
                resp = util.mysqlCommit(db, cursor, q)
            else:
                insert = True

        if drugbankId != '':
            if util.mysqlExist(db,
                               cursor,
                               table='compound',
                               where='com_drugbank_id=' + '"' + drugbankId +
                               '"'):
                # update based on drugbankId
                qf = 'UPDATE  compound '
                qm = 'SET ' + 'com_kegg_id=' + '"' + keggId + '"'
                if casId != '':
                    qm = qm + ',' + ' com_cas_id=' + '"' + casId + '"'
                if knapsackId != '':
                    qm = qm + ',' + ' com_knapsack_id=' + '"' + knapsackId + '"'
                qr = ' WHERE com_drugbank_id=' + '"' + drugbankId + '"'
                q = qf + qm + qr
                resp = util.mysqlCommit(db, cursor, q)
            else:
                insert = True

        if insert:
            comIdx += 1
            comIdStr = 'COM' + str(comIdx).zfill(8)

            insertVals = [comIdStr, casId, drugbankId, knapsackId, keggId]
            insertVals = ['"' + i + '"' for i in insertVals]

            qf = 'INSERT INTO compound (com_id,com_cas_id,com_drugbank_id,com_knapsack_id,com_kegg_id)'
            qr = ' VALUES (' + ','.join(insertVals) + ')'
            q = qf + qr
            resp = util.mysqlCommit(db, cursor, q)

            insertList.append(q)

    insertListFpath = outDir + '/insertion_from_keggComData.lst'
    with open(insertListFpath, 'w') as f:
        for l in insertList:
            f.write(str(l) + '\n')

    return comIdx