def getFDADrug():
    """Compare the side-effect names found in polyDrugADR.txt and CADER.txt.

    Side-effect names are read from the first ``|``-separated field of each
    ``polyDrugADR.txt`` line and from the second ``$``-separated field of
    each ``CADER.txt`` line; inside those fields names are comma-separated.
    Every occurrence is passed to ``utils.add_dict_counter``.

    Prints two lines:

    1. the number of distinct names collected from each source;
    2. for the names whose count reaches the per-source threshold: the size
       of each filtered set, how many names appear in only one of the sets,
       and those exclusive counts divided by the respective set size.

    A ratio prints as ``nan`` when its filtered set is empty instead of
    raising ``ZeroDivisionError``.
    """
    MIN_T = 5  # threshold applied to polyDrugADR counts
    MIN_T_CADER = 60  # threshold applied to CADER counts

    se1 = dict()
    with open("%s/polyDrugADR.txt" % params.DATA_DIR) as fin:
        for line in fin:
            for se in line.strip().split("|")[0].split(","):
                utils.add_dict_counter(se1, se)

    se2 = dict()
    with open("%s/CADER.txt" % params.CAD_OUT) as fin:
        for line in fin:
            for se in line.strip().split("$")[1].split(","):
                utils.add_dict_counter(se2, se)

    kvs1 = utils.sort_dict(se1)
    kvs2 = utils.sort_dict(se2)
    print(len(kvs1), len(kvs2))

    k1 = {k for k, v in kvs1 if v >= MIN_T}
    k2 = {k for k, v in kvs2 if v >= MIN_T_CADER}

    # Names that pass the threshold in one source but not in the other.
    n1 = len(k1 - k2)
    n2 = len(k2 - k1)

    nan = float("nan")
    print(
        len(k1),
        len(k2),
        n1,
        n2,
        n1 / len(k1) if k1 else nan,
        n2 / len(k2) if k2 else nan,
    )
def exportValidSEs(nSize=9210):
    """Write the side-effect counts that are not excluded to ValidSes.txt.

    The exclusion list is read from ``InValidSEs.txt`` under
    ``params.FADER_OUT``: a line starting with ``#`` contributes the rest of
    the line as a forbidden substring, any other non-blank line is a name to
    exclude exactly. Blank lines are ignored (they used to raise
    ``IndexError``).

    Args:
        nSize: value interpolated into the ``FSECount_<nSize>_0`` object name
            that is loaded with ``utils.load_obj``.
    """

    def loadException(path="%s/InValidSEs.txt" % params.FADER_OUT):
        """Return ``(exact names to skip, substrings to skip)`` from *path*."""
        invalidSes = set()
        invalidTokens = list()
        with open(path) as fin:
            for line in fin:
                line = line.strip()
                if not line:
                    continue
                if line[0] == '#':
                    invalidTokens.append(line[1:])
                else:
                    invalidSes.add(line)
        return invalidSes, invalidTokens

    invalidSes, invalidTokens = loadException()

    # Load the input before opening the output so that a failed load does not
    # leave a freshly truncated ValidSes.txt behind.
    d = utils.load_obj("%s/FSECount_%s_0" % (params.FADER_OUT, nSize))
    kvs = utils.sort_dict(d)

    with open("%s/ValidSes.txt" % params.FADER_OUT, "w") as fout:
        for k, v in kvs:
            if k in invalidSes:
                continue
            if any(token in k for token in invalidTokens):
                continue
            fout.write("%s\t%s\n" % (k, v))
def stats2(nSize=0):
    """Dump the drug-combination counts to a text file and plot them.

    Loads ``FDrugCombCount_<nSize>`` with ``utils.load_obj``, orders it with
    ``utils.sort_dict`` and writes one ``<comma-joined key>$<value>`` line
    per entry to ``FDrugCombSort_<nSize>``. The sum of all values is printed,
    and the reversed list is handed to ``plotLib.plotCul2``.

    Args:
        nSize: value interpolated into the input and output file names.
    """
    print("Loading...")
    drugComb = utils.load_obj("%s/FDrugCombCount_%s" % (params.FADER_OUT, nSize))
    print("Sorting..")
    kvs = utils.sort_dict(drugComb)

    cc = 0
    # ``with`` guarantees the handle is released even if a write fails.
    with open("%s/FDrugCombSort_%s" % (params.FADER_OUT, nSize), "w") as fout:
        print("Saving...")
        for k, v in kvs:
            cc += v
            fout.write("%s$%s\n" % (",".join(k), v))
    print("Total: %s cases" % cc)

    from plotLib import plotCul2

    plotCul2(
        kvs[::-1],
        200,
        1,
        "SelectedCombDrugCutOff",
        xLabel="ThreshHold: Freq >=",
        yLabel="Number of Combs",
    )
def exportSeCount(nSize=9210):
    """Write the ``FSECount_<nSize>_0`` object as tab-separated text.

    The object is loaded with ``utils.load_obj``, ordered by
    ``utils.sort_dict`` and written as ``<key>\\t<value>`` lines to
    ``FSECountSorted_<nSize>_0`` under ``params.FADER_OUT``.

    Args:
        nSize: value interpolated into the input and output file names.
    """
    d = utils.load_obj("%s/FSECount_%s_0" % (params.FADER_OUT, nSize))
    kvs = utils.sort_dict(d)
    # ``with`` guarantees the handle is released even if a write fails.
    with open("%s/FSECountSorted_%s_0" % (params.FADER_OUT, nSize), "w") as fout:
        for k, v in kvs:
            fout.write("%s\t%s\n" % (k, v))
def plotSeCount():
    """Plot the stored JADER side-effect counts with ``plotCul``.

    Loads the ``JADERSeCountFX`` object from ``params.JADER_OUT``, orders it
    with ``utils.sort_dict`` and passes the reversed list to ``plotCul``
    under the figure name ``JADERSEFreq``.
    """
    source = "%s/JADERSeCountFX" % params.JADER_OUT
    ranked = utils.sort_dict(utils.load_obj(source))

    from dataProcessing.plotLib import plotCul2, plotCul, plotHistD

    reversed_ranked = ranked[::-1]
    plotCul(
        reversed_ranked,
        50,
        1,
        "JADERSEFreq",
        xLabel="Thresholds of SE Frequency",
        yLabel="Num. SEs",
    )
def finalStats():
    """Aggregate the DrugFreq2 counts by mapped drug name, then save and plot.

    ``FinalMap.txt`` and ``FinalMapH.txt`` are read as ``key||value`` lines.
    Each ``DrugFreq2.txt`` line is ``<count>\\t<name>``. If the name is a key
    of ``FinalMap`` its count is added under the mapped value; otherwise, if
    it is a key of ``FinalMapH``, the count is added under the name itself;
    otherwise the line is ignored.

    The aggregated counts, ordered by ``utils.sort_dict``, are written to
    ``FinalDrugFreq.txt`` as ``<count>\\t<name>`` (the count is formatted
    with ``%.6s``, i.e. cut to at most six characters) and plotted with
    ``plotLib.plotCul``.
    """

    def loadMap(path):
        """Read ``key||value`` lines from *path* into a dict."""
        mapping = dict()
        with open(path) as fin:
            for line in fin:
                parts = line.strip().split("||")
                mapping[parts[0]] = parts[1]
        return mapping

    dMap = loadMap("%s/finalMap/FinalMap.txt" % params.OUTPUT_DIR)
    dMapH = loadMap("%s/finalMap/FinalMapH.txt" % params.OUTPUT_DIR)

    dFreq = dict()
    with open("%s/Tmp/DrugFreq2.txt" % params.OUTPUT_DIR) as fin:
        for line in fin:
            parts = line.strip().split("\t")
            drugJader = parts[1]
            c = int(parts[0])
            dDrugBank = utils.get_dict(dMap, drugJader, -1)
            d2 = utils.get_dict(dMapH, drugJader, -1)
            if dDrugBank != -1:
                utils.add_dict_counter(dFreq, dDrugBank, c)
            elif d2 != -1:
                utils.add_dict_counter(dFreq, drugJader, c)

    kvs = utils.sort_dict(dFreq)
    with open("%s/FinalDrugFreq.txt" % params.OUTPUT_DIR, "w") as fout:
        for k, v in kvs:
            fout.write("%.6s\t%s\n" % (v, k))

    from plotLib import plotHistD, plotCul

    # NOTE(review): both calls use the same figure name, so the second one
    # presumably replaces the first - kept as-is, confirm which is wanted.
    plotCul(kvs[::-1], 50, 2, "SelectedDrugCutOff",
            xLabel="ThreshHold: Freq >=", yLabel="Number of Drugs")
    plotCul(kvs[::-1], 20, 1, "SelectedDrugCutOff",
            xLabel="ThreshHold: Freq >=", yLabel="Number of Drugs")
def exportSubG2():
    """Split JADER.txt into one file per combination size, expanding pairs.

    Each input line is ``<drugs>$<side effects>`` with comma-separated items.
    The line is rewritten with its drug names sorted into ``SUB/G<n>``, where
    ``n`` is the number of drugs. Records with 3 to 20 drugs additionally
    contribute every unordered drug pair (with the record's full side-effect
    list) to ``SUB/G2``.

    For every combination size the side effects are counted with
    ``utils.add_dict_counter``; the keys of each per-size counter, in the
    order returned by ``utils.sort_dict``, are saved as a
    ``{size: [side effect, ...]}`` object named ``SUB/drugSize2CommonSEs``.
    """
    from itertools import combinations

    foutDict = dict()  # combination size -> open output handle
    dlen2SeCount = dict()  # combination size -> side-effect counter

    def groupFile(n):
        """Return the output handle for size *n*, opening it on first use."""
        fO = foutDict.get(n)
        if fO is None:
            fO = open("%s/SUB/G%s" % (params.JADER_OUT, n), "w")
            foutDict[n] = fO
        return fO

    nA = 0
    print("Reading...")
    try:
        with open("%s/JADER.txt" % params.JADER_OUT) as fin:
            for line in fin:
                nA += 1
                print("\r%s" % nA, end="")
                parts = line.strip().split("$")
                ses = parts[1]
                drugs = sorted(parts[0].split(","))
                nD = len(drugs)

                groupFile(nD).write("%s$%s\n" % (",".join(drugs), ses))

                if 2 < nD <= 20:
                    f2 = groupFile(2)
                    # combinations() on the sorted list yields the same
                    # (i < j) pairs, in the same order, as a nested index loop.
                    for d1, d2 in combinations(drugs, 2):
                        f2.write("%s,%s$%s\n" % (d1, d2, ses))

                len2SeCount = utils.get_insert_key_dict(dlen2SeCount, nD, dict())
                for se in ses.split(","):
                    utils.add_dict_counter(len2SeCount, se)
    finally:
        # Release every per-size file even when parsing fails half-way.
        for fO in foutDict.values():
            fO.close()

    size2Ses = dict()
    for k, v in dlen2SeCount.items():
        size2Ses[k] = [kk for kk, _ in utils.sort_dict(v)]
    utils.save_obj(size2Ses, "%s/SUB/drugSize2CommonSEs" % params.JADER_OUT)
def exportDrugCom2Side():
    """Summarise JADER.txt per drug combination and save side-effect counts.

    Each input line is ``<drugs>$<side effects>`` with comma-separated items.
    Per distinct drug-combination string the function counts how many lines
    carry it and how often each side effect occurs with it, then writes

        ``<combination>:<line count>$<number of side effects>$<se>:<count>,...``

    to ``JADER2AllSeList.txt``, in the order given by ``utils.sort_dict``.

    It also saves the global side-effect counter as ``JADERSeCountFX`` and
    the ``{combination: number of drugs}`` table as ``DrugCombLength``, and
    prints the number of distinct drugs and distinct side effects.
    """
    dDrugComb2Se = dict()
    dDrugCombCount = dict()
    dDrugCom2Lenght = dict()
    drugCont = dict()
    seCount = dict()

    with open("%s/JADER.txt" % params.JADER_OUT) as fin:
        for line in fin:
            parts = line.strip().split("$")
            drugCom = parts[0]
            drugs = drugCom.split(",")
            ses = parts[1].split(",")

            dDrugCom2Lenght[drugCom] = len(drugs)
            utils.add_dict_counter(dDrugCombCount, drugCom, 1)
            for drug in drugs:
                utils.add_dict_counter(drugCont, drug, 1)

            sesComb = utils.get_insert_key_dict(dDrugComb2Se, drugCom, dict())
            for se in ses:
                utils.add_dict_counter(sesComb, se, 1)
                utils.add_dict_counter(seCount, se)

    with open("%s/JADER2AllSeList.txt" % params.JADER_OUT, "w") as fout:
        for k, v in utils.sort_dict(dDrugCombCount):
            sString = [
                "%s:%s" % (se, count)
                for se, count in utils.sort_dict(dDrugComb2Se[k])
            ]
            fout.write("%s:%s$%s$%s\n" % (k, v, len(sString), ",".join(sString)))

    utils.save_obj(seCount, "%s/JADERSeCountFX" % params.JADER_OUT)
    utils.save_obj(dDrugCom2Lenght, "%s/DrugCombLength" % params.JADER_OUT)
    print(len(drugCont), len(seCount))
def plot3X():
    """Save a 3-D line plot of combination length vs. count vs. reports.

    * x - combination length, ``1 .. maxLength``;
    * y - number of entries of the ``FDrugCombLength`` object having that
      length;
    * z - per length, the sum of the integer that follows the first ``:`` in
      the leading ``$``-field of each ``FDrug2AllSeList.txt`` line.

    The figure is written to ``3DDrugCombLengthReport.png`` under
    ``params.FIG_DIR``.
    """
    import numpy as np
    import matplotlib.pyplot as plt

    dLength = utils.load_obj("%s/FDrugCombLength" % params.FADER_OUT)
    dCount = dict()
    for _, v in utils.sort_dict(dLength):
        utils.add_dict_counter(dCount, v)

    dLength2NReports = dict()
    with open("%s/FDrug2AllSeList.txt" % params.FADER_OUT) as fin:
        for line in fin:
            parts = line.strip().split("$")[0].split(":")
            c = int(parts[1])
            drugCombLength = len(parts[0].split(","))
            utils.add_dict_counter(dLength2NReports, drugCombLength, c)

    # Size the axes from both sources: a length that only occurs in the list
    # file would otherwise index past the end of ``z``.
    maxLength = max(set(dCount) | set(dLength2NReports))
    x = list(range(1, maxLength + 1))

    y = np.zeros(maxLength)
    for k, v in dCount.items():
        y[k - 1] = v

    z = np.zeros(maxLength)
    for k, v in dLength2NReports.items():
        z[k - 1] = v

    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')
    ax.plot(x, y, z, marker='>')
    ax.set_xlabel('DrugComb Length')
    ax.set_ylabel('DrugComb Count')
    ax.set_zlabel('NReport')
    # The original referenced ``plt.tight_layout`` without calling it.
    plt.tight_layout()
    plt.savefig("%s/3DDrugCombLengthReport.png" % params.FIG_DIR)
def exportPair():
    """Count drugs, indications, side effects and drug pairs in CADER.txt.

    Each input line is ``$``-separated; fields 1, 2 and 3 hold the
    comma-separated drugs, indications and side effects of one record.
    Every item is counted with ``utils.add_dict_counter``. Records with 2 to
    20 drugs also contribute each unordered pair of their sorted drug names,
    keyed as ``"<d1>,<d2>"``.

    The four counters are ordered with ``utils.sort_dict`` and written with
    ``writeSortedDictC`` to ``<PREF>ADrugs.txt``, ``<PREF>AInd.txt``,
    ``<PREF>ASe.txt`` and ``<PREF>Pairs.txt`` under ``OUT_DIR``. The number
    of distinct pairs is printed.
    """
    from itertools import combinations

    validDrugs = dict()
    validPairs = dict()
    validIndicates = dict()
    validSes = dict()

    with open("%s/CADER.txt" % OUT_DIR) as fin:
        for line in fin:
            parts = line.strip().split("$")
            drugs = parts[1].split(",")
            indications = parts[2]
            ses = parts[3]

            for drug in drugs:
                utils.add_dict_counter(validDrugs, drug)
            for ind in indications.split(","):
                utils.add_dict_counter(validIndicates, ind)
            for se in ses.split(","):
                utils.add_dict_counter(validSes, se)

            if 2 <= len(drugs) <= 20:
                # Sorting first makes the pair key independent of the order
                # in which the drugs were listed in the record.
                for d1, d2 in combinations(sorted(drugs), 2):
                    utils.add_dict_counter(validPairs, "%s,%s" % (d1, d2))

    cDrug = utils.sort_dict(validDrugs)
    cInd = utils.sort_dict(validIndicates)
    cSe = utils.sort_dict(validSes)
    cPair = utils.sort_dict(validPairs)
    print(len(cPair))

    writeSortedDictC(cDrug, "%s/%sADrugs.txt" % (OUT_DIR, PREF))
    writeSortedDictC(cInd, "%s/%sAInd.txt" % (OUT_DIR, PREF))
    writeSortedDictC(cSe, "%s/%sASe.txt" % (OUT_DIR, PREF))
    writeSortedDictC(cPair, "%s/%sPairs.txt" % (OUT_DIR, PREF))
def stats1(nSize=0):
    """Dump the drug-name counts to ``FDrugNamesSort_<nSize>``.

    Loads ``FDrugNameCount_<nSize>`` with ``utils.load_obj``, orders it with
    ``utils.sort_dict`` and writes ``<name>$<count>`` lines. Keys whose
    length is 0 or 1 are skipped.

    Args:
        nSize: value interpolated into the input and output file names.
    """
    print("Loading...")
    drugComb = utils.load_obj("%s/FDrugNameCount_%s" % (params.FADER_OUT, nSize))
    print("Sorting..")
    kvs = utils.sort_dict(drugComb)
    # ``with`` guarantees the handle is released even if a write fails.
    with open("%s/FDrugNamesSort_%s" % (params.FADER_OUT, nSize), "w") as fout:
        print("Saving...")
        for k, v in kvs:
            if len(k) <= 1:
                continue
            fout.write("%s$%s\n" % (k, v))
def statsCommonSes():
    """Save the CADER side effects whose count is above 20.

    Side effects are the comma-separated items of the last ``$``-field of
    every ``CADER.txt`` line. The names with a count greater than 20, in the
    order returned by ``utils.sort_dict``, are saved as a list with
    ``utils.save_obj`` under the name ``SeTopList.txt``.
    """
    dSeCout = dict()
    with open("%s/CADER.txt" % (params.CAD_OUT)) as fin:
        for line in fin:
            for se in line.strip().split("$")[-1].split(","):
                utils.add_dict_counter(dSeCout, se)

    ks = [k for k, v in utils.sort_dict(dSeCout) if v > 20]
    utils.save_obj(ks, "%s/SeTopList.txt" % params.CAD_OUT)
def exportSub():
    """Split FDrug2SeList_19814.txt into one file per combination size.

    Each input line is ``<drugs>$<side effects>`` with comma-separated items.
    The line is rewritten with its drug names sorted into ``SUB/<n>``, where
    ``n`` is the number of drugs.

    For every combination size the side effects are counted with
    ``utils.add_dict_counter``; the keys of each per-size counter, in the
    order returned by ``utils.sort_dict``, are saved as a
    ``{size: [side effect, ...]}`` object named ``SUB/drugSize2CommonSEs``.
    """
    foutDict = dict()  # combination size -> open output handle
    dlen2SeCount = dict()  # combination size -> side-effect counter
    nA = 0
    print("Reading...")
    try:
        with open("%s/FDrug2SeList_19814.txt" % params.FADER_OUT) as fin:
            for line in fin:
                nA += 1
                print("\r%s" % nA, end="")
                parts = line.strip().split("$")
                ses = parts[1]
                drugs = parts[0].split(",")
                nD = len(drugs)

                fO = foutDict.get(nD)
                if fO is None:
                    fO = open("%s/SUB/%s" % (params.FADER_OUT, nD), "w")
                    foutDict[nD] = fO
                fO.write("%s$%s\n" % (",".join(sorted(drugs)), ses))

                len2SeCount = utils.get_insert_key_dict(dlen2SeCount, nD, dict())
                for se in ses.split(","):
                    utils.add_dict_counter(len2SeCount, se)
    finally:
        # Release every per-size file even when parsing fails half-way.
        for fO in foutDict.values():
            fO.close()

    size2Ses = dict()
    for k, v in dlen2SeCount.items():
        size2Ses[k] = [kk for kk, _ in utils.sort_dict(v)]
    utils.save_obj(size2Ses, "%s/SUB/drugSize2CommonSEs" % params.FADER_OUT)
def checkDupR():
    """Count how often each leading id occurs in ReportDrug1.txt.

    The id is the first ``$``-separated field of every line. Prints
    ``Total:  <nError> <number of lines>`` and writes ``<id>\\t<count>``
    lines, ordered by ``utils.sort_dict``, to ``S1.txt`` under
    ``params.CAD_OUT``.
    """
    dCout = dict()
    # Nothing in this function ever increments nError; it is kept only so the
    # printed summary keeps its existing shape.
    nError = 0
    cc = 0
    with open("%s/ReportDrug1.txt" % params.CAD_OUT) as fin:
        for line in fin:
            idx = line.strip().split("$")[0]
            utils.add_dict_counter(dCout, idx)
            cc += 1
    print("Total: ", nError, cc)

    kvs = utils.sort_dict(dCout)
    with open("%s/S1.txt" % params.CAD_OUT, "w") as fout:
        for k, v in kvs:
            fout.write("%s\t%s\n" % (k, v))
def exportCanSaltFreq():
    """List the frequent second fields of MatchingDrug2.txt.

    Counts the second ``||``-separated field of every line of
    ``rawMatching/MatchingDrug2.txt`` and writes each value whose count is
    greater than 2, one per line and in the order returned by
    ``utils.sort_dict``, to ``rawMatching/CandSaltFreq.txt``.
    """
    wordFreqs = dict()
    with open("%s/rawMatching/MatchingDrug2.txt" % params.OUTPUT_DIR) as fin:
        for line in fin:
            parts = line.strip().split("||")
            utils.add_dict_counter(wordFreqs, parts[1])

    kvs = utils.sort_dict(wordFreqs)
    with open("%s/rawMatching/CandSaltFreq.txt" % params.OUTPUT_DIR, "w") as fout:
        for k, v in kvs:
            if v <= 2:
                continue
            fout.write("%s\n" % (k))
def plotDrugCombLength():
    """Scatter-plot how many drug combinations exist for each length.

    Reads the ``DrugCombLength`` object from ``params.JADER_OUT``, counts the
    entries per stored length and saves the scatter plot as
    ``JADERDrugLength.png`` under ``params.FIG_DIR``.
    """
    length_by_comb = utils.load_obj("%s/DrugCombLength" % params.JADER_OUT)

    combs_per_length = dict()
    for _, length in utils.sort_dict(length_by_comb):
        utils.add_dict_counter(combs_per_length, length)

    longest = max(combs_per_length.keys())
    xs = list(range(1, longest + 1))

    import numpy as np

    # Position i holds the number of combinations of length i + 1.
    ys = np.zeros(longest)
    for length, n_combs in combs_per_length.items():
        ys[length - 1] = n_combs

    import matplotlib.pyplot as plt

    plt.scatter(xs, ys)
    plt.xlabel("DrugComb length")
    plt.ylabel("Num DrugComb")
    plt.tight_layout()
    target = "%s/%s.png" % (params.FIG_DIR, "JADERDrugLength")
    plt.savefig(target)
# Reorder the rows (presumably random.shuffle, in place - confirm against the
# import), then slice off the training rows and the evaluation rows.
shuffle(only_values)
known = only_values[:14500]
test_records = only_values[14500:15400]

# Column 0 of every row is the value to predict; the remaining columns are
# the features fed to the regressor.
target = [row[0] for row in known]
train = [row[1:] for row in known]
answer = [row[0] for row in test_records]
problem = [row[1:] for row in test_records]

rfRegressor = RandomForestRegressor(n_estimators=100)
rfRegressor.fit(train, target)
prediction = rfRegressor.predict(problem)

errors = get_abs_errors(prediction, answer)
print(errors)
print(numpy.mean([abs(err) for err in errors]))

feature_importances = get_combined_feature_importances(
    feature_names[1:], rfRegressor.feature_importances_
)
# NOTE(review): formatted_feature_importances is computed but the list that
# gets sorted and printed is the unformatted one - confirm which is intended.
formatted_feature_importances = freq_dict_to_percent_dict(feature_importances)
sorted_feature_importances = sort_dict(feature_importances)
for x in sorted_feature_importances:
    print(x)