Beispiel #1
0
def getFirst24HrsDataValuesIndividually(hadm_id, itemids, mapping=None):
    """
    Runs an SQL Query to return featues that that returns features for top 100
    most frequent itemids of both chartevents and labevents (might overlap)
    HOWEVER, only for one hadm_id

    In addition uses the preprocessing csv stolen from mimic3 benchmark to deal with
    most common problems in clinical data
    :param itemids variable to use for counts of features
    :param hadm_id the admission id to run query and retrieve data for
    :param mapping a DataFrame with columns itemid and variable to translate from former to latter
            mapping is used to deal with multiple itemids that are essentially the same concept
    :return a Dataframe with the data
    """
    query = "WITH timeranges as (SELECT hadm_id, admittime, admittime + interval '24 hour' as endtime FROM admissions WHERE hadm_id = " + str(hadm_id) + "), \n"\
        + "topLabEvents as ( SELECT hadm_id, label, labevents.itemid, charttime, value, valuenum, valueuom FROM labevents  LEFT JOIN d_labitems on d_labitems.itemid = labevents.itemid WHERE labevents.itemid in  \n" \
        + commonDB.convertListToSQL(itemids) \
        + "\n AND hadm_id = " + str(hadm_id) + " AND charttime BETWEEN (SELECT admittime FROM timeranges) AND (SELECT endtime FROM timeranges)\n" \
        + "), topChartEvents as (SELECT hadm_id, label, chartevents.itemid, charttime, value, valuenum, valueuom FROM chartevents  LEFT JOIN d_items on d_items.itemid = chartevents.itemid WHERE chartevents.itemid in \n" \
        + commonDB.convertListToSQL(itemids) \
        + "\n AND hadm_id = " + str(hadm_id) + " AND charttime BETWEEN (SELECT admittime FROM timeranges) AND (SELECT endtime FROM timeranges) \n" \
        + " ) SELECT * FROM topLabEvents UNION SELECT * FROM topChartEvents ORDER BY charttime"
    conn = commonDB.getConnection()
    dataToReturn = pd.read_sql(query, conn)
    #default variable name is itemid if we cannot find the correct translation in the mapping dataframe we pass in
    if mapping is not None:
        mapping = mapping[["itemid", "variable"]] #when we merge, we want to discard miscellaneous columns for clarity TODO: do we need this line?
        dataToReturn = dataToReturn.merge(mapping, left_on=['itemid'], right_on=['itemid'], how='left')
        # dataToReturn.loc[:, "variable"].fillna(dataToReturn['itemid']) TODO: fix this?
    else:
        dataToReturn["variable"] = dataToReturn["itemid"]
    return dataToReturn
Beispiel #2
0
def countFeatures(subject_ids=None,
                  hadm_ids=None,
                  path="data/sql/perAdmissionCount.sql",
                  mapping=None):
    """
    This file goes and executes queries to count the most common features in 24 hour ranges
    as well as labels, itemids, and average occurrences of feature in each admission for the
    10,282 matched subset.
    :param path where sql query is stored
    :param subject_ids subjectIDS to restrict the count to; if None, then include all
    :param hadm_ids hadm_ids to restrict feature count to; if None, then include all
    :param mapping a DataFrame with columns itemid_x and itemid_y to deal with multiple mapping to same feature
    :return dataframe with the raw count data of features per admission
    """
    conn = commonDB.getConnection()

    with open(path, "r") as f:
        query = f.read()
    if subject_ids is None:
        query.replace("<INSERT IDS HERE>", "")
    else:
        query = query.replace(
            "<INSERT IDS HERE>",
            "AND subject_id in " + commonDB.convertListToSQL(subject_ids))
    if hadm_ids is None:
        query = query.replace("<INSERT labevents hadm_ids HERE>", "")
        query = query.replace("<INSERT chartevents hadm_ids HERE>", "")
    else:
        query = query.replace(
            "<INSERT hadm_ids HERE>",
            "AND timeranges.hadm_id in" + commonDB.convertListToSQL(hadm_ids))
        query = query.replace(
            "<INSERT labevents hadm_ids HERE>",
            "AND labevents.hadm_id in" + commonDB.convertListToSQL(hadm_ids))
        query = query.replace(
            "<INSERT chartevents hadm_ids HERE>",
            "AND chartevents.hadm_id in" + commonDB.convertListToSQL(hadm_ids))
    events = pd.read_sql(query, conn)
    # This was added in when we realized some different itemids map to same concept
    if (mapping is not None):
        toDrop = []
        for i in range(0, events.shape[1]):
            if events["itemid"][i] in set(mapping["itemid_x"]):
                ind = mapping[mapping["itemid_x"] == events["itemid"][i]].index
                itemid_y = mapping["itemid_y"][ind]
                countAdmissions = events["countAdmissionsPresent"][ind]
                perAdmissions = events["avgPerAdmission"][ind]
                countTotal = countAdmissions * perAdmissions
                # converts stats calculated during sql query to usable form here when using the mapping
                # (avg itemid_x * count itemid_x + avg itemid_y * count itemid_y) / total_counts
                events["avgPerAdmission"][events["itemid"] == itemid_y] = (events["avgPerAdmission"][events["itemid"] == itemid_y]
                                                                           * events["countAdmissionsPresent"][events["itemid"] == itemid_y]
                                                                           + countTotal) \
                    / (countAdmissions + events["countAdmissionsPresent"][events["itemid"] == itemid_y])
                events["countAdmissionsPresent"][events["itemid"] ==
                                                 itemid_y] += countTotal
                toDrop.append(ind)
        events.drop(toDrop, axis=1)
    return events
Beispiel #3
0
def getCategorizations(hadm_ids = None):
    """
    This is a function to run the Angus.sql query, which is responsible for categorizing patients
    :param ids the hadm_ids to look at when doing the query; if None, run it on all
    :return the angusData
    """
    conn = commonDB.getConnection()
    with open("data/sql/angus.sql") as f:
        query = f.read()
    if hadm_ids is None:
        query = query.replace("<INSERT IDS HERE>", "")
    else:
        query = query.replace("<INSERT IDS HERE>", "WHERE hadm_id IN " + commonDB.convertListToSQL(hadm_ids))

    angusData = pd.read_sql(query, conn)
    angusData.set_index(["hadm_id"], inplace=True)
    return angusData
Beispiel #4
0
def getCategorizationsBySubjectID(catDF = None):
    """
    This is a function to return the categorizations by subject instead of hadm_id
    in other words, if a subject had a sepsis diagnosis ever, he/she will be classfied as such
    :param optional categorization DataFrame to use that uses hadm_id based categorization
    :return the angus categorization
    """
    if catDF == None:
        conn = commonDB.getConnection()
        with open("data/sql/angus.sql") as f:
            query = f.read()
            query = query.replace("<INSERT IDS HERE>", "")
        catDF = pd.read_sql(query, conn)
        catDF.set_index(["hadm_id"], inplace=True)
    bySubjectCat = []
    for subject_id in catDF["subject_id"].unique():
        onlySubject = catDF[catDF["subject_id"] == subject_id]
        if onlySubject.apply(lambda row: row["angus"] == 1, axis = 1).any():
            bySubjectCat.append(pd.DataFrame({"subject_id": [subject_id], "angus": [1]}))
        else:
            bySubjectCat.append(pd.DataFrame({"subject_id": [subject_id], "angus": [0]}))
    bySubject = pd.concat(bySubjectCat)
    return bySubject.set_index("subject_id")