コード例 #1
0
ファイル: checkSEOv.py プロジェクト: anhnda/FDAPolyADR
def getFDADrug():
    """Compare the side-effect (SE) vocabularies of two report files.

    Counts every comma-separated SE found in the first ``|``-separated field
    of ``polyDrugADR.txt`` (under ``params.DATA_DIR``) and in the second
    ``$``-separated field of ``CADER.txt`` (under ``params.CAD_OUT``).  An SE
    is kept when its count reaches the per-file threshold (5 and 60
    respectively).

    Prints the number of distinct SEs per file, then the size of each
    filtered set, how many SEs are exclusive to each set, and those exclusive
    counts as a fraction of the set size (0.0 when a set is empty).

    Returns:
        None.  Results are printed only.
    """
    MIN_T = 5  # minimum count for an SE from polyDrugADR.txt
    MIN_T_CADER = 60  # minimum count for an SE from CADER.txt

    def _countSes(path, sep, col):
        # Tally each SE listed in column `col` of every `sep`-separated line.
        counter = dict()
        # The context manager closes the file even if a line is malformed.
        with open(path) as fin:
            for line in fin:
                parts = line.strip().split(sep)
                for se in parts[col].split(","):
                    utils.add_dict_counter(counter, se)
        return counter

    def _ratio(part, whole):
        # Guard against ZeroDivisionError when no SE reaches the threshold.
        return part / whole if whole else 0.0

    se1 = _countSes("%s/polyDrugADR.txt" % params.DATA_DIR, "|", 0)
    se2 = _countSes("%s/CADER.txt" % params.CAD_OUT, "$", 1)

    kvs1 = utils.sort_dict(se1)
    kvs2 = utils.sort_dict(se2)
    print(len(kvs1), len(kvs2))

    k1 = {k for k, v in kvs1 if v >= MIN_T}
    k2 = {k for k, v in kvs2 if v >= MIN_T_CADER}

    # SEs that are frequent in one source but not in the other.
    n1 = len(k1 - k2)
    n2 = len(k2 - k1)
    print(
        len(k1),
        len(k2),
        n1,
        n2,
        _ratio(n1, len(k1)),
        _ratio(n2, len(k2)),
    )
コード例 #2
0
def exportValidSEs(nSize=9210):
    """Write the SE counts that are not excluded to ``ValidSes.txt``.

    Loads the count object ``FSECount_<nSize>_0`` from ``params.FADER_OUT``,
    drops every SE that is listed verbatim in ``InValidSEs.txt`` or that
    contains one of the substring tokens declared there, and writes the
    remaining ``SE<TAB>count`` pairs in the order given by
    ``utils.sort_dict``.

    Args:
        nSize: Size tag embedded in the name of the count object to load.
    """
    def loadException(path="%s/InValidSEs.txt" % params.FADER_OUT):
        # A line starting with '#' declares a substring token (the text after
        # the '#'); any other non-blank line is an exact SE name to exclude.
        invalidSes = set()
        invalidTokens = list()
        with open(path) as fin:
            for line in fin:
                line = line.strip()
                if not line:
                    # Blank lines previously crashed on line[0].
                    continue
                if line[0] == '#':
                    invalidTokens.append(line[1:])
                else:
                    invalidSes.add(line)
        return invalidSes, invalidTokens

    invalidSes, invalidTokens = loadException()

    # Load the input before opening the output so that a failed load does not
    # leave a truncated ValidSes.txt behind.
    d = utils.load_obj("%s/FSECount_%s_0" % (params.FADER_OUT, nSize))
    kvs = utils.sort_dict(d)

    with open("%s/ValidSes.txt" % params.FADER_OUT, "w") as fout:
        for k, v in kvs:
            if k in invalidSes:
                continue
            if any(token in k for token in invalidTokens):
                continue
            fout.write("%s\t%s\n" % (k, v))
コード例 #3
0
def stats2(nSize=0):
    """Export sorted drug-combination counts and plot them.

    Loads ``FDrugCombCount_<nSize>`` from ``params.FADER_OUT``, writes one
    ``drug1,drug2,...$count`` line per combination to
    ``FDrugCombSort_<nSize>`` in the order given by ``utils.sort_dict``,
    prints the sum of all counts and passes the reversed list to
    ``plotLib.plotCul2``.

    Args:
        nSize: Size tag embedded in the input and output file names.
    """
    print("Loading...")
    drugComb = utils.load_obj("%s/FDrugCombCount_%s" %
                              (params.FADER_OUT, nSize))

    print("Sorting..")
    kvs = utils.sort_dict(drugComb)

    cc = 0
    # The context manager closes the output even if a write fails.
    with open("%s/FDrugCombSort_%s" % (params.FADER_OUT, nSize), "w") as fout:
        print("Saving...")
        for k, v in kvs:
            cc += v
            # Each key is joined with ","; presumably a sequence of drug
            # names -- confirm against the producer of FDrugCombCount.
            fout.write("%s$%s\n" % (",".join(k), v))
    print("Total: %s cases" % cc)

    from plotLib import plotCul2

    plotCul2(kvs[::-1],
             200,
             1,
             "SelectedCombDrugCutOff",
             xLabel="ThreshHold: Freq >=",
             yLabel="Number of Combs")
コード例 #4
0
def exportSeCount(nSize=9210):
    """Dump the SE count object as a sorted, tab-separated text file.

    Loads ``FSECount_<nSize>_0`` from ``params.FADER_OUT`` and writes one
    ``SE<TAB>count`` line per entry to ``FSECountSorted_<nSize>_0`` in the
    order given by ``utils.sort_dict``.

    Args:
        nSize: Size tag embedded in the input and output file names.
    """
    d = utils.load_obj("%s/FSECount_%s_0" % (params.FADER_OUT, nSize))
    kvs = utils.sort_dict(d)
    # The context manager closes the output even if a write fails.
    with open("%s/FSECountSorted_%s_0" % (params.FADER_OUT, nSize), "w") as fout:
        for k, v in kvs:
            fout.write("%s\t%s\n" % (k, v))
コード例 #5
0
def plotSeCount():
    """Plot the JADER side-effect frequency data.

    Loads the ``JADERSeCountFX`` object from ``params.JADER_OUT``, orders it
    with ``utils.sort_dict`` and hands the reversed sequence to ``plotCul``
    under the figure name ``JADERSEFreq``.
    """
    countPath = "%s/JADERSeCountFX" % params.JADER_OUT
    sortedPairs = utils.sort_dict(utils.load_obj(countPath))

    from dataProcessing.plotLib import plotCul2, plotCul, plotHistD

    reversedPairs = sortedPairs[::-1]
    plotCul(
        reversedPairs,
        50,
        1,
        "JADERSEFreq",
        xLabel="Thresholds of SE Frequency",
        yLabel="Num. SEs",
    )
コード例 #6
0
ファイル: drugMatching.py プロジェクト: anhnda/FDAPolyADR
def finalStats():
    """Aggregate drug frequencies through the final name maps and plot them.

    Reads the ``key||value`` maps ``finalMap/FinalMap.txt`` and
    ``finalMap/FinalMapH.txt`` and the ``count<TAB>name`` file
    ``Tmp/DrugFreq2.txt`` (all under ``params.OUTPUT_DIR``).  A name found in
    the first map adds its count to the mapped name; otherwise a name found
    in the second map adds its count to the name itself; names in neither
    map are ignored.  The totals are written as ``count<TAB>name`` lines to
    ``FinalDrugFreq.txt`` and plotted twice with ``plotLib.plotCul``.
    """
    def _loadMap(path):
        # Parse "key||value" lines into a dict.
        mapping = dict()
        with open(path) as fin:
            for line in fin:
                parts = line.strip().split("||")
                mapping[parts[0]] = parts[1]
        return mapping

    dMap = _loadMap("%s/finalMap/FinalMap.txt" % params.OUTPUT_DIR)
    dMapH = _loadMap("%s/finalMap/FinalMapH.txt" % params.OUTPUT_DIR)

    dFreq = dict()
    # The context manager closes the frequency file, which was never closed
    # before.
    with open("%s/Tmp/DrugFreq2.txt" % params.OUTPUT_DIR) as fin:
        for line in fin:
            parts = line.strip().split("\t")
            drugJader = parts[1]
            c = int(parts[0])
            dDrugBank = utils.get_dict(dMap, drugJader, -1)
            d2 = utils.get_dict(dMapH, drugJader, -1)
            if dDrugBank != -1:
                utils.add_dict_counter(dFreq, dDrugBank, c)
            elif d2 != -1:
                utils.add_dict_counter(dFreq, drugJader, c)

    kvs = utils.sort_dict(dFreq)
    with open("%s/FinalDrugFreq.txt" % params.OUTPUT_DIR, "w") as fout:
        for k, v in kvs:
            # "%s" instead of "%.6s": the latter cut counts with more than
            # six digits down to their first six characters.
            fout.write("%s\t%s\n" % (v, k))

    from plotLib import plotHistD, plotCul

    # NOTE(review): both calls use the name "SelectedDrugCutOff"; the second
    # presumably overwrites the output of the first -- confirm with plotCul.
    plotCul(kvs[::-1],
            50,
            2,
            "SelectedDrugCutOff",
            xLabel="ThreshHold: Freq >=",
            yLabel="Number of Drugs")
    plotCul(kvs[::-1],
            20,
            1,
            "SelectedDrugCutOff",
            xLabel="ThreshHold: Freq >=",
            yLabel="Number of Drugs")
コード例 #7
0
def exportSubG2():
    """Split JADER.txt by drug-combination size and collect SEs per size.

    Each ``drugs$ses`` line of ``JADER.txt`` (under ``params.JADER_OUT``) is
    rewritten with its drug names sorted into ``SUB/G<n>``, where ``n`` is
    the number of drugs.  For combinations of 3 to 20 drugs, every pair of
    drugs is additionally written with the same SE list to ``SUB/G2``.
    Finally, for each size, the SEs ordered by ``utils.sort_dict`` are saved
    as the object ``SUB/drugSize2CommonSEs``.
    """
    from itertools import combinations

    foutDict = dict()
    dlen2SeCount = dict()
    nA = 0
    print("Reading...")

    def _outFile(size):
        # Lazily open (and cache) the output file for combinations of `size`.
        if size not in foutDict:
            foutDict[size] = open("%s/SUB/G%s" % (params.JADER_OUT, size), "w")
        return foutDict[size]

    try:
        with open("%s/JADER.txt" % params.JADER_OUT) as fin:
            for line in fin:
                nA += 1
                # Progress counter rewritten in place on one console line.
                print("\r%s" % nA, end="")
                parts = line.strip().split("$")
                drugCmb = parts[0]
                ses = parts[1]
                drugs = sorted(drugCmb.split(","))
                nD = len(drugs)

                _outFile(nD).write("%s$%s\n" % (",".join(drugs), ses))
                if 2 < nD <= 20:
                    f2 = _outFile(2)
                    # combinations() on the sorted list yields the same
                    # (i < j) pairs, in the same order, as a nested index loop.
                    for d1, d2 in combinations(drugs, 2):
                        f2.write("%s,%s$%s\n" % (d1, d2, ses))

                len2SeCount = utils.get_insert_key_dict(dlen2SeCount, nD, dict())
                for se in ses.split(","):
                    utils.add_dict_counter(len2SeCount, se)
    finally:
        # Close every per-size output, including when a line is malformed.
        for fO in foutDict.values():
            fO.close()

    d2 = dict()
    for k, v in dlen2SeCount.items():
        d2[k] = [kk for kk, _ in utils.sort_dict(v)]
    utils.save_obj(d2, "%s/SUB/drugSize2CommonSEs" % params.JADER_OUT)
コード例 #8
0
def exportDrugCom2Side():
    """Export, per drug combination, how often each SE was reported.

    Reads ``drugs$ses`` lines from ``JADER.txt`` (under
    ``params.JADER_OUT``) and writes one line per combination to
    ``JADER2AllSeList.txt`` in the form
    ``combination:count$numDistinctSEs$se1:count1,se2:count2,...``, with
    both levels ordered by ``utils.sort_dict``.  Also saves the overall SE
    counts as ``JADERSeCountFX`` and the number of drugs per combination as
    ``DrugCombLength``, and prints the number of distinct drugs and SEs.
    """
    dDrugComb2Se = dict()
    dDrugCombCount = dict()
    dDrugCom2Lenght = dict()
    drugCont = dict()
    seCount = dict()

    # The context manager closes the input, which was never closed before.
    with open("%s/JADER.txt" % params.JADER_OUT) as fin:
        for line in fin:
            parts = line.strip().split("$")
            drugCom = parts[0]
            drugs = drugCom.split(",")
            dDrugCom2Lenght[drugCom] = len(drugs)

            ses = parts[1].split(",")
            utils.add_dict_counter(dDrugCombCount, drugCom, 1)
            for drug in drugs:
                utils.add_dict_counter(drugCont, drug, 1)
            sesComb = utils.get_insert_key_dict(dDrugComb2Se, drugCom, dict())
            for se in ses:
                utils.add_dict_counter(sesComb, se, 1)
                utils.add_dict_counter(seCount, se)

    # Open the output only once the input has been read, so a failure while
    # reading does not leave an empty JADER2AllSeList.txt behind.
    with open("%s/JADER2AllSeList.txt" % params.JADER_OUT, "w") as fout:
        for k, v in utils.sort_dict(dDrugCombCount):
            sString = [
                "%s:%s" % (se, count)
                for se, count in utils.sort_dict(dDrugComb2Se[k])
            ]
            fout.write("%s:%s$%s$%s\n" % (k, v, len(sString), ",".join(sString)))

    utils.save_obj(seCount, "%s/JADERSeCountFX" % params.JADER_OUT)
    utils.save_obj(dDrugCom2Lenght, "%s/DrugCombLength" % params.JADER_OUT)
    print(len(drugCont), len(seCount))
コード例 #9
0
def plot3X():
    """Plot combination length vs. number of combinations vs. report count.

    For every drug-combination length ``n`` (x axis) the figure shows how
    many combinations in the ``FDrugCombLength`` object have that length
    (y axis) and the summed counts of the combinations of that length listed
    in ``FDrug2AllSeList.txt`` (z axis).  Both inputs are read from
    ``params.FADER_OUT``; the figure is saved as
    ``3DDrugCombLengthReport.png`` under ``params.FIG_DIR``.

    Raises:
        ValueError: If the length object is empty (``max`` of no keys).
        IndexError: If the text file contains a combination longer than the
            longest one in the length object.
    """
    import numpy as np
    import matplotlib.pyplot as plt

    dLength = utils.load_obj("%s/FDrugCombLength" % params.FADER_OUT)

    # Number of combinations per combination length.
    dCount = dict()
    for _, v in utils.sort_dict(dLength):
        utils.add_dict_counter(dCount, v)

    maxLength = max(dCount.keys())
    x = list(range(1, maxLength + 1))

    y = np.zeros(maxLength)
    for k, v in dCount.items():
        y[k - 1] = v  # length n is stored at index n - 1

    # Summed counts per combination length; each line starts with
    # "drug1,drug2,...:count$".
    dLength2NReports = dict()
    # The context manager closes the input, which was never closed before.
    with open("%s/FDrug2AllSeList.txt" % params.FADER_OUT) as fin:
        for line in fin:
            head = line.strip().split("$")[0]
            parts = head.split(":")
            c = int(parts[1])
            drugCombLenght = len(parts[0].split(","))
            utils.add_dict_counter(dLength2NReports, drugCombLenght, c)

    z = np.zeros(maxLength)
    for k, v in dLength2NReports.items():
        z[k - 1] = v

    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')

    ax.plot(x, y, z, marker='>')

    ax.set_xlabel('DrugComb Length')
    ax.set_ylabel('DrugComb Count')
    ax.set_zlabel('NReport')
    # Previously `plt.tight_layout` without parentheses, which never ran.
    plt.tight_layout()
    plt.savefig("%s/3DDrugCombLengthReport.png" % params.FIG_DIR)
コード例 #10
0
ファイル: exportStatsX.py プロジェクト: anhnda/FDAPolyADR
def exportPair():
    """Count drugs, indications, SEs and drug pairs found in CADER.txt.

    Each line of ``CADER.txt`` (under ``OUT_DIR``) is split on ``$``; fields
    1, 2 and 3 hold the comma-separated drugs, indications and SEs.  Every
    item is counted, and for reports with 2 to 20 drugs every pair of the
    sorted drug names is counted as ``"d1,d2"``.  The four counters are
    ordered with ``utils.sort_dict`` and written through
    ``writeSortedDictC`` to ``<PREF>ADrugs.txt``, ``<PREF>AInd.txt``,
    ``<PREF>ASe.txt`` and ``<PREF>Pairs.txt``; the number of distinct pairs
    is printed.
    """
    from itertools import combinations

    validDrugs = dict()
    validPairs = dict()
    validIndicates = dict()
    validSes = dict()

    # The context manager closes the input, which was never closed before.
    with open("%s/CADER.txt" % OUT_DIR) as fin:
        for line in fin:
            parts = line.strip().split("$")
            drugComb = parts[1]
            indications = parts[2]
            ses = parts[3]
            drugs = drugComb.split(",")

            for drug in drugs:
                utils.add_dict_counter(validDrugs, drug)
            for ind in indications.split(","):
                utils.add_dict_counter(validIndicates, ind)
            for se in ses.split(","):
                utils.add_dict_counter(validSes, se)

            if 2 <= len(drugs) <= 20:
                # Sorting first gives every pair one canonical spelling.
                for d1, d2 in combinations(sorted(drugs), 2):
                    utils.add_dict_counter(validPairs, "%s,%s" % (d1, d2))

    cDrug = utils.sort_dict(validDrugs)
    cInd = utils.sort_dict(validIndicates)
    cSe = utils.sort_dict(validSes)
    cPair = utils.sort_dict(validPairs)
    print(len(cPair))
    writeSortedDictC(cDrug, "%s/%sADrugs.txt" % (OUT_DIR, PREF))
    writeSortedDictC(cInd, "%s/%sAInd.txt" % (OUT_DIR, PREF))
    writeSortedDictC(cSe, "%s/%sASe.txt" % (OUT_DIR, PREF))
    writeSortedDictC(cPair, "%s/%sPairs.txt" % (OUT_DIR, PREF))
コード例 #11
0
def stats1(nSize=0):
    """Export the sorted drug-name counts as ``name$count`` lines.

    Loads ``FDrugNameCount_<nSize>`` from ``params.FADER_OUT`` and writes its
    entries, in the order given by ``utils.sort_dict``, to
    ``FDrugNamesSort_<nSize>``.  Keys of length 0 or 1 are skipped.

    Args:
        nSize: Size tag embedded in the input and output file names.
    """
    print("Loading...")
    drugComb = utils.load_obj("%s/FDrugNameCount_%s" % (params.FADER_OUT, nSize))
    print("Sorting..")
    kvs = utils.sort_dict(drugComb)

    # The context manager closes the output even if a write fails.
    with open("%s/FDrugNamesSort_%s" % (params.FADER_OUT, nSize), "w") as fout:
        print("Saving...")
        for k, v in kvs:
            if len(k) <= 1:
                continue
            fout.write("%s$%s\n" % (k, v))
コード例 #12
0
ファイル: subStats3.py プロジェクト: anhnda/FDAPolyADR
def statsCommonSes():
    """Save the SEs that occur more than 20 times in CADER.txt.

    The last ``$``-separated field of every line of ``CADER.txt`` (under
    ``params.CAD_OUT``) is read as a comma-separated SE list.  SEs whose
    count exceeds 20 are saved, in the order given by ``utils.sort_dict``,
    as the object ``SeTopList.txt``.
    """
    dSeCout = dict()
    # The context manager closes the input, which was never closed before.
    with open("%s/CADER.txt" % (params.CAD_OUT)) as fin:
        for line in fin:
            parts = line.strip().split("$")
            for se in parts[-1].split(","):
                utils.add_dict_counter(dSeCout, se)

    ks = [k for k, v in utils.sort_dict(dSeCout) if v > 20]
    utils.save_obj(ks, "%s/SeTopList.txt" % params.CAD_OUT)
コード例 #13
0
ファイル: subStats.py プロジェクト: anhnda/FDAPolyADR
def exportSub():
    """Split FDrug2SeList_19814.txt by drug-combination size.

    Each ``drugs$ses`` line of the input (under ``params.FADER_OUT``) is
    rewritten with its drug names sorted into ``SUB/<n>``, where ``n`` is
    the number of drugs.  For each size, the SEs ordered by
    ``utils.sort_dict`` are then saved as the object
    ``SUB/drugSize2CommonSEs``.
    """
    foutDict = dict()
    dlen2SeCount = dict()
    nA = 0
    print("Reading...")

    try:
        with open("%s/FDrug2SeList_19814.txt" % params.FADER_OUT) as fin:
            for line in fin:
                nA += 1
                # Progress counter rewritten in place on one console line.
                print("\r%s" % nA, end="")
                parts = line.strip().split("$")
                drugCmb = parts[0]
                ses = parts[1]
                drugs = drugCmb.split(",")
                nD = len(drugs)
                sortNames = ",".join(sorted(drugs))

                # Lazily open one output file per combination size.
                if nD not in foutDict:
                    foutDict[nD] = open("%s/SUB/%s" % (params.FADER_OUT, nD), "w")
                foutDict[nD].write("%s$%s\n" % (sortNames, ses))

                len2SeCount = utils.get_insert_key_dict(dlen2SeCount, nD, dict())
                for se in ses.split(","):
                    utils.add_dict_counter(len2SeCount, se)
    finally:
        # Close every per-size output, including when a line is malformed.
        for fO in foutDict.values():
            fO.close()

    d2 = dict()
    for k, v in dlen2SeCount.items():
        d2[k] = [kk for kk, _ in utils.sort_dict(v)]
    utils.save_obj(d2, "%s/SUB/drugSize2CommonSEs" % params.FADER_OUT)
コード例 #14
0
def checkDupR():
    """Count how often each report id occurs in ReportDrug1.txt.

    The first ``$``-separated field of every line of ``ReportDrug1.txt``
    (under ``params.CAD_OUT``) is counted.  The totals are printed and the
    ``id<TAB>count`` pairs are written to ``S1.txt`` in the order given by
    ``utils.sort_dict``.
    """
    dCout = dict()
    nError = 0  # never incremented; kept so the printed summary is unchanged
    cc = 0
    # The context manager closes the input, which was never closed before.
    with open("%s/ReportDrug1.txt" % params.CAD_OUT) as fin:
        for line in fin:
            idx = line.strip().split("$")[0]
            utils.add_dict_counter(dCout, idx)
            cc += 1

    print("Total: ", nError, cc)
    kvs = utils.sort_dict(dCout)
    with open("%s/S1.txt" % params.CAD_OUT, "w") as fout:
        for k, v in kvs:
            fout.write("%s\t%s\n" % (k, v))
コード例 #15
0
ファイル: drugMatching.py プロジェクト: anhnda/FDAPolyADR
def exportCanSaltFreq():
    """Export the frequent second-field values of MatchingDrug2.txt.

    Counts the second ``||``-separated field of every line of
    ``rawMatching/MatchingDrug2.txt`` (under ``params.OUTPUT_DIR``) and
    writes each value that occurs more than twice, one per line and in the
    order given by ``utils.sort_dict``, to ``rawMatching/CandSaltFreq.txt``.
    """
    wordFreqs = dict()
    # The context manager closes the input, which was never closed before.
    with open("%s/rawMatching/MatchingDrug2.txt" % params.OUTPUT_DIR) as fin:
        for line in fin:
            parts = line.strip().split("||")
            utils.add_dict_counter(wordFreqs, parts[1])
    kvs = utils.sort_dict(wordFreqs)

    with open("%s/rawMatching/CandSaltFreq.txt" % params.OUTPUT_DIR, "w") as fout:
        for k, v in kvs:
            if v <= 2:
                continue
            fout.write("%s\n" % (k))
コード例 #16
0
def plotDrugCombLength():
    """Scatter-plot how many drug combinations exist for each length.

    Reads the ``DrugCombLength`` object from ``params.JADER_OUT``, counts
    the combinations per length and saves the scatter plot as
    ``JADERDrugLength.png`` under ``params.FIG_DIR``.
    """
    lengthByComb = utils.load_obj("%s/DrugCombLength" % params.JADER_OUT)

    # Histogram: combination length -> number of combinations.
    dCount = dict()
    for _, combLength in utils.sort_dict(lengthByComb):
        utils.add_dict_counter(dCount, combLength)

    longest = max(dCount.keys())
    xs = list(range(1, longest + 1))
    import numpy as np

    ys = np.zeros(longest)
    for combLength, howMany in dCount.items():
        # Length n is stored at index n - 1.
        ys[combLength - 1] = howMany

    import matplotlib.pyplot as plt
    plt.scatter(xs, ys)
    plt.xlabel("DrugComb length")
    plt.ylabel("Num DrugComb")
    plt.tight_layout()
    figurePath = "%s/%s.png" % (params.FIG_DIR, "JADERDrugLength")
    plt.savefig(figurePath)
コード例 #17
0
# Script fragment: fit a random-forest regressor on a shuffled split of
# `only_values` and print its errors and feature importances.
# NOTE(review): `shuffle` is presumably random.shuffle (in place) -- confirm.
shuffle(only_values)

# First 14500 records are used for fitting, the next 900 for evaluation.
known = only_values[:14500]
test_records = only_values[14500:15400]

# In every record, element 0 is the regression target and the remaining
# elements are the input features.
problem = [x[1:] for x in test_records]
answer = [x[0] for x in test_records]

target = [x[0] for x in known]
train = [x[1:] for x in known]

rfRegressor = RandomForestRegressor(n_estimators=100)
rfRegressor.fit(train, target)

prediction = rfRegressor.predict(problem)

errors = get_abs_errors(prediction, answer)

# Print the individual errors, then the mean of their absolute values.
print(errors)
print(numpy.mean([abs(error) for error in errors]))

# feature_names[0] is skipped to line up with the feature columns x[1:].
feature_importances = get_combined_feature_importances(feature_names[1:], rfRegressor.feature_importances_)

# NOTE(review): this result is not used in the lines below; the sort and the
# printout operate on the raw `feature_importances` -- confirm that is intended.
formatted_feature_importances = freq_dict_to_percent_dict(feature_importances)

sorted_feature_importances = sort_dict(feature_importances)

for x in sorted_feature_importances:
    print(x)